obi export: added options to export to metabaR compatible format

This commit is contained in:
mercierc
2021-11-11 15:24:12 +13:00
parent 5a8b9dca5d
commit f6353fbf28
5 changed files with 127 additions and 7 deletions

View File

@ -297,6 +297,29 @@ def __addExportOutputOption(optionManager):
const=b'tabular',
help="Output file is in tabular format")
group.add_argument('--metabaR-output',
action="store_const", dest="obi:outputformat",
default=None,
const=b'metabaR',
help="Export the files needed by the obifiles_to_metabarlist function of the metabaR package")
group.add_argument('--metabaR-prefix',
action="store", dest="obi:metabarprefix",
type=str,
help="Prefix for the files when using --metabaR-output option")
group.add_argument('--metabaR-ngsfilter',
action="store", dest="obi:metabarngsfilter",
type=str,
default=None,
help="URI to the ngsfilter view when using --metabaR-output option (if not provided, it is not exported)")
group.add_argument('--metabaR-samples',
action="store", dest="obi:metabarsamples",
type=str,
default=None,
help="URI to the sample metadata view when using --metabaR-output option (if not provided, it is built as just a list of the sample names)")
group.add_argument('--only-keys',
action="append", dest="obi:only_keys",
type=str,

View File

@ -6,6 +6,9 @@ from obitools3.apps.config import logger
from obitools3.dms import DMS
from obitools3.dms.obiseq import Nuc_Seq
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN
from obitools3.writers.tab import TabWriter
from obitools3.format.tab import TabFormat
from obitools3.utils cimport tobytes, tostr
from obitools3.apps.optiongroups import addMinimalInputOption, \
addExportOutputOption, \
@ -76,6 +79,13 @@ def run(config):
else:
pb = ProgressBar(withoutskip - skip, config)
if config['obi']['outputformat'] == b'metabaR':
# Check prefix
if "metabarprefix" not in config["obi"]:
raise Exception("Prefix needed when exporting for metabaR (--metabaR-prefix option)")
else:
metabaRprefix = config["obi"]["metabarprefix"]
i=0
for seq in iview :
PyErr_CheckSignals()
@ -91,6 +101,81 @@ def run(config):
pb(i, force=True)
print("", file=sys.stderr)
if config['obi']['outputformat'] == b'metabaR':
# Export ngsfilter file if view provided
if 'metabarngsfilter' in config['obi']:
ngsfilter_input = open_uri(config['obi']['metabarngsfilter'])
if ngsfilter_input is None:
raise Exception("Could not read ngsfilter view for metabaR output")
ngsfilter_view = ngsfilter_input[1]
ngsfilter_output = open(config['obi']['metabarprefix']+'.ngsfilter', 'w')
for line in ngsfilter_view:
line_to_print = b""
line_to_print += line[b'experiment']
line_to_print += b"\t"
line_to_print += line[b'sample']
line_to_print += b"\t"
line_to_print += line[b'forward_tag']
line_to_print += b":"
line_to_print += line[b'reverse_tag']
line_to_print += b"\t"
line_to_print += line[b'forward_primer']
line_to_print += b"\t"
line_to_print += line[b'reverse_primer']
line_to_print += b"\t"
line_to_print += line[b'additional_info']
print(tostr(line_to_print), file=ngsfilter_output)
if ngsfilter_input[0] != input[0]:
ngsfilter_input[0].close()
ngsfilter_output.close()
# Export sample metadata
samples_output = open(config['obi']['metabarprefix']+'_samples.csv', 'w')
# Export sample metadata file if view provided
if 'metabarsamples' in config['obi']:
samples_input = open_uri(config['obi']['metabarsamples'])
if samples_input is None:
raise Exception("Could not read sample view for metabaR output")
samples_view = samples_input[1]
# Export with tab formatter
TabWriter(TabFormat(header=True, sep='\t',),
samples_output,
header=True)
if samples_input[0] != input[0]:
samples_input[0].close()
# Else export just sample names from main view
else:
sample_list = []
if 'MERGED_sample' in iview:
sample_list = iview['MERGED_sample'].keys()
elif 'sample' not in iview:
for seq in iview:
sample = seq['sample']
if sample not in sample_list:
sample_list.append(sample)
else:
logger("warning", "Can not read sample list from main view for metabaR sample list export")
print("sample_id", file=samples_output)
for sample in sample_list:
line_to_print = b""
line_to_print += sample
line_to_print += b"\t"
print(tostr(line_to_print), file=samples_output)
samples_output.close()
# TODO save command in input dms?
if not BrokenPipeError and not IOError:

View File

@ -6,4 +6,6 @@ cdef class TabFormat:
cdef bytes NAString
cdef set tags
cdef bytes sep
cdef bint NAIntTo0
cdef bint NAIntTo0
cdef bint metabaR
cdef bint ngsfilter

View File

@ -10,13 +10,15 @@ import sys
cdef class TabFormat:
def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True):
def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True, metabaR=False, ngsfilter=False):
self.tags = set(tags)
self.header = header
self.first_line = True
self.NAString = NAString
self.sep = sep
self.NAIntTo0 = NAIntTo0
self.metabaR = metabaR
self.ngsfilter = ngsfilter
@cython.boundscheck(False)
def __call__(self, object data):
@ -34,13 +36,21 @@ cdef class TabFormat:
if self.header and self.first_line:
for k in ktags:
if k in tags:
if self.metabaR:
if k == b'NUC_SEQ':
ktoprint = b'sequence'
else:
ktoprint = k.lower()
ktoprint = ktoprint.replace(b'merged_', b'')
else:
ktoprint = k
if isinstance(data.view[k], Column_multi_elts):
keys = data.view[k].keys()
keys.sort()
for k2 in keys:
line.append(tobytes(k)+b':'+tobytes(k2))
line.append(tobytes(ktoprint)+b':'+tobytes(k2))
else:
line.append(tobytes(k))
line.append(tobytes(ktoprint))
r = self.sep.join(value for value in line)
r += b'\n'
line = []

View File

@ -48,13 +48,13 @@ def ngsfilterIterator(lineiterator,
all_lines.insert(0, firstline)
# Insert header for column names
column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"]
column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer",b"additional_info"]
header = out_sep.join(column_names)
new_lines.append(header)
for line in all_lines:
split_line = line.split()
split_line = line.split(maxsplit=5)
tags = split_line.pop(2)
tags = tags.split(b":")
for t_idx in range(len(tags)):
@ -64,7 +64,7 @@ def ngsfilterIterator(lineiterator,
tags.append(tags[0])
split_line.insert(2, tags[0])
split_line.insert(3, tags[1])
new_lines.append(out_sep.join(split_line[0:6]))
new_lines.append(out_sep.join(split_line[0:7]))
return tabIterator(iter(new_lines),
header = True,