obi export: added options to export to a metabaR-compatible format

2021-11-11 15:24:12 +13:00
parent 5a8b9dca5d
commit f6353fbf28
5 changed files with 127 additions and 7 deletions
--- a/python/obitools3/apps/optiongroups/init.py
+++ b/python/obitools3/apps/optiongroups/init.py
@ -297,6 +297,29 @@ def __addExportOutputOption(optionManager):
                     const=b'tabular',
                     help="Output file is in tabular format")
    group.add_argument('--metabaR-output',
                     action="store_const", dest="obi:outputformat",
                     default=None,
                     const=b'metabaR',
                     help="Export the files needed by the obifiles_to_metabarlist function of the metabaR package")
    group.add_argument('--metabaR-prefix',
                     action="store", dest="obi:metabarprefix",
                     type=str,
                     help="Prefix for the files when using --metabaR-output option")
    group.add_argument('--metabaR-ngsfilter',
                     action="store", dest="obi:metabarngsfilter",
                     type=str,
                     default=None,
                     help="URI to the ngsfilter view when using --metabaR-output option (if not provided, it is not exported)")
    group.add_argument('--metabaR-samples',
                     action="store", dest="obi:metabarsamples",
                     type=str,
                     default=None,
                     help="URI to the sample metadata view when using --metabaR-output option (if not provided, it is built as just a list of the sample names)")
    group.add_argument('--only-keys',
                       action="append", dest="obi:only_keys",
                       type=str,
--- a/python/obitools3/commands/export.pyx
+++ b/python/obitools3/commands/export.pyx
@ -6,6 +6,9 @@ from obitools3.apps.config import logger
 from obitools3.dms import DMS
 from obitools3.dms.obiseq import Nuc_Seq
 from obitools3.dms.capi.obiview cimport QUALITY_COLUMN
 from obitools3.writers.tab import TabWriter
 from obitools3.format.tab import TabFormat
 from obitools3.utils cimport tobytes, tostr
 from obitools3.apps.optiongroups import addMinimalInputOption, \
                                        addExportOutputOption, \
@ -76,6 +79,13 @@ def run(config):
    else:
        pb = ProgressBar(withoutskip - skip, config)
    if config['obi']['outputformat'] == b'metabaR':
        # Check prefix
        if "metabarprefix" not in config["obi"]:
            raise Exception("Prefix needed when exporting for metabaR (--metabaR-prefix option)")
        else:
            metabaRprefix = config["obi"]["metabarprefix"]
    i=0
    for seq in iview :
        PyErr_CheckSignals()
@ -91,6 +101,81 @@ def run(config):
        pb(i, force=True)
        print("", file=sys.stderr)
    if config['obi']['outputformat'] == b'metabaR':
        # Export ngsfilter file if view provided
        if 'metabarngsfilter' in config['obi']:
            ngsfilter_input = open_uri(config['obi']['metabarngsfilter'])
            if ngsfilter_input is None:
                raise Exception("Could not read ngsfilter view for metabaR output")
            ngsfilter_view = ngsfilter_input[1]
            ngsfilter_output = open(config['obi']['metabarprefix']+'.ngsfilter', 'w')
            for line in ngsfilter_view:
                line_to_print = b""                
                line_to_print += line[b'experiment']
                line_to_print += b"\t"
                line_to_print += line[b'sample']
                line_to_print += b"\t"
                line_to_print += line[b'forward_tag']
                line_to_print += b":"
                line_to_print += line[b'reverse_tag']
                line_to_print += b"\t"
                line_to_print += line[b'forward_primer']
                line_to_print += b"\t"
                line_to_print += line[b'reverse_primer']
                line_to_print += b"\t"
                line_to_print += line[b'additional_info']
                print(tostr(line_to_print), file=ngsfilter_output)
            if ngsfilter_input[0] != input[0]:
                ngsfilter_input[0].close()
            ngsfilter_output.close()
        # Export sample metadata
        samples_output = open(config['obi']['metabarprefix']+'_samples.csv', 'w')
        # Export sample metadata file if view provided
        if 'metabarsamples' in config['obi']:
            samples_input = open_uri(config['obi']['metabarsamples'])
            if samples_input is None:
                raise Exception("Could not read sample view for metabaR output")
            samples_view = samples_input[1]
            # Export with tab formatter
            TabWriter(TabFormat(header=True, sep='\t',), 
                      samples_output,
                      header=True)
            if samples_input[0] != input[0]:
                samples_input[0].close()
        # Else export just sample names from main view
        else:
            sample_list = []
            if 'MERGED_sample' in iview:
                sample_list = iview['MERGED_sample'].keys()
            elif 'sample' not in iview:
                for seq in iview:
                    sample = seq['sample']
                    if sample not in sample_list:
                        sample_list.append(sample)
            else:
                logger("warning", "Can not read sample list from main view for metabaR sample list export")            
            print("sample_id", file=samples_output)
            for sample in sample_list:
                line_to_print = b""                
                line_to_print += sample
                line_to_print += b"\t"
                print(tostr(line_to_print), file=samples_output)
        samples_output.close()
    # TODO save command in input dms?
    if not BrokenPipeError and not IOError:
--- a/python/obitools3/format/tab.pxd
+++ b/python/obitools3/format/tab.pxd
@ -6,4 +6,6 @@ cdef class TabFormat:
    cdef bytes NAString
    cdef set   tags
    cdef bytes sep
-    cdef bint NAIntTo0
+    cdef bint NAIntTo0
    cdef bint metabaR
    cdef bint ngsfilter
--- a/python/obitools3/format/tab.pyx
+++ b/python/obitools3/format/tab.pyx
@ -10,13 +10,15 @@ import sys
 cdef class TabFormat:
-    def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True):
+    def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True, metabaR=False, ngsfilter=False):
        self.tags = set(tags)
        self.header = header
        self.first_line = True
        self.NAString = NAString
        self.sep = sep
        self.NAIntTo0 = NAIntTo0
        self.metabaR = metabaR
        self.ngsfilter = ngsfilter
    @cython.boundscheck(False)    
    def __call__(self, object data):
@ -34,13 +36,21 @@ cdef class TabFormat:
        if self.header and self.first_line:
            for k in ktags:
                if k in tags:
                    if self.metabaR:
                        if k == b'NUC_SEQ':
                            ktoprint = b'sequence'
                        else:
                            ktoprint = k.lower()
                        ktoprint = ktoprint.replace(b'merged_', b'')
                    else:
                        ktoprint = k
                    if isinstance(data.view[k], Column_multi_elts):
                        keys = data.view[k].keys()
                        keys.sort()
                        for k2 in keys:
-                            line.append(tobytes(k)+b':'+tobytes(k2))
+                            line.append(tobytes(ktoprint)+b':'+tobytes(k2))
                    else:
-                        line.append(tobytes(k))
+                        line.append(tobytes(ktoprint))
            r = self.sep.join(value for value in line)
            r += b'\n'
            line = []
--- a/python/obitools3/parsers/ngsfilter.pyx
+++ b/python/obitools3/parsers/ngsfilter.pyx
@ -48,13 +48,13 @@ def ngsfilterIterator(lineiterator,
        all_lines.insert(0, firstline)
    # Insert header for column names
-    column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"]
+    column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer",b"additional_info"]
    header = out_sep.join(column_names)
    new_lines.append(header)
    for line in all_lines:
-        split_line = line.split()
+        split_line = line.split(maxsplit=5)
        tags = split_line.pop(2)
        tags = tags.split(b":")
        for t_idx in range(len(tags)):
@ -64,7 +64,7 @@ def ngsfilterIterator(lineiterator,
            tags.append(tags[0])
        split_line.insert(2, tags[0])
        split_line.insert(3, tags[1])
-        new_lines.append(out_sep.join(split_line[0:6]))
+        new_lines.append(out_sep.join(split_line[0:7]))
    return tabIterator(iter(new_lines),
                       header = True,