From 99c1cd60d68086bda49dad832c5b65d0887805c9 Mon Sep 17 00:00:00 2001
From: mercierc
Date: Thu, 22 Jul 2021 09:23:18 +1200
Subject: [PATCH] export: now exports header for tabular files by default and
 added option to only export specific columns

---
Review notes (ignored by `git am`, not part of the commit message):

* The line structure of this patch was reconstructed from a copy whose
  newlines and indentation had been collapsed.  Leading whitespace on
  context lines may not match the target tree byte-for-byte; use
  `git apply --ignore-whitespace` if a hunk is rejected.
* Changes relative to the original submission:
  - TabFormat.__call__ keeps the column order provided by the data and
    uses the requested tags only as a membership filter (iterating a set
    gave an arbitrary column order).
  - TabFormat.__init__ no longer uses a mutable default argument.
  - The `only_keys` URI qualifier is passed as a list, as required by the
    `list tags` parameter.
  - Closing parenthesis added to the `--no-header` help text.
* The `index` lines are unchanged from the original submission, so the
  post-image hashes no longer match the modified files.

 .../obitools3/apps/optiongroups/__init__.py   | 14 ++++++++++----
 python/obitools3/format/tab.pxd               |  2 +-
 python/obitools3/format/tab.pyx               | 18 +++++++++++++-----
 python/obitools3/uri/decode.pyx               | 20 ++++++++++++++++----
 4 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/python/obitools3/apps/optiongroups/__init__.py b/python/obitools3/apps/optiongroups/__init__.py
index ca4f9bb..79ccbe0 100755
--- a/python/obitools3/apps/optiongroups/__init__.py
+++ b/python/obitools3/apps/optiongroups/__init__.py
@@ -137,10 +137,10 @@ def __addImportInputOption(optionManager):
 def __addTabularOption(optionManager):
     group = optionManager.add_argument_group("Input and output format options for tabular files")
 
-    group.add_argument('--header',
-                       action="store_true", dest="obi:header",
-                       default=False,
-                       help="First line of tabular file contains column names")
+    group.add_argument('--no-header',
+                       action="store_false", dest="obi:header",
+                       default=True,
+                       help="Don't print the header (first line with column names)")
 
     group.add_argument('--sep',
                        action="store", dest="obi:sep",
@@ -297,6 +297,12 @@ def __addExportOutputOption(optionManager):
                        const=b'tabular',
                        help="Output file is in tabular format")
 
+    group.add_argument('--only-keys',
+                       action="append", dest="obi:only_keys",
+                       type=str,
+                       default=[],
+                       help="Only export the given keys (columns).")
+
     group.add_argument('--print-na',
                        action="store_true", dest="obi:printna",
                        default=False,
diff --git a/python/obitools3/format/tab.pxd b/python/obitools3/format/tab.pxd
index 47cb859..9574fd2 100755
--- a/python/obitools3/format/tab.pxd
+++ b/python/obitools3/format/tab.pxd
@@ -4,6 +4,6 @@ cdef class TabFormat:
     cdef bint header
     cdef bint first_line
     cdef bytes NAString
-    cdef list tags
+    cdef set tags
     cdef bytes sep
     cdef bint NAIntTo0
\ No newline at end of file
diff --git a/python/obitools3/format/tab.pyx b/python/obitools3/format/tab.pyx
index 93af246..cabdcd2 100755
--- a/python/obitools3/format/tab.pyx
+++ b/python/obitools3/format/tab.pyx
@@ -10,7 +10,8 @@ import sys
 
 cdef class TabFormat:
 
-    def __init__(self, header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True):
+    def __init__(self, list tags=None, header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True):
+        self.tags = set(tags) if tags else set()
         self.header = header
         self.first_line = True
         self.NAString = NAString
@@ -20,25 +21,32 @@ cdef class TabFormat:
     @cython.boundscheck(False)
     def __call__(self, object data):
 
+        cdef list tags = [key for key in data]
+        cdef list ktags
+
         line = []
 
-        if self.first_line:
-            self.tags = [k for k in data.keys()]
+        # Export the columns in the order the data provides them: a set has no
+        # defined order, so self.tags is only used as a membership filter.
+        if self.tags:
+            ktags = [k for k in tags if k in self.tags]
+        else:
+            ktags = tags
 
         if self.header and self.first_line:
-            for k in self.tags:
+            for k in ktags:
                 if isinstance(data.view[k], Column_multi_elts):
                     keys = data.view[k].keys()
                     keys.sort()
                     for k2 in keys:
                         line.append(tobytes(k)+b':'+tobytes(k2))
                 else:
                     line.append(tobytes(k))
             r = self.sep.join(value for value in line)
             r += b'\n'
             line = []
 
-        for k in self.tags:
+        for k in ktags:
             value = data[k]
             if isinstance(data.view[k], Column_multi_elts):
                 keys = data.view[k].keys()
diff --git a/python/obitools3/uri/decode.pyx b/python/obitools3/uri/decode.pyx
index c828369..5720479 100644
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@@ -476,6 +476,18 @@ def open_uri(uri,
         except KeyError:
             commentchar=b'#'
 
+    if b"only_keys" in qualifiers:
+        only_keys=list(qualifiers[b"only_keys"])  # formatters expect a list; assumes each qualifier maps to a sequence of values - TODO confirm
+    else:
+        try:
+            only_keys_str=config["obi"]["only_keys"]
+            only_keys=[]
+            for key in only_keys_str:
+                only_keys.append(tobytes(key))
+        except KeyError:
+            only_keys=[]
+
+
     if format is not None:
         if seqtype==b"nuc":
             objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
@@ -486,7 +498,7 @@ def open_uri(uri,
                                             only=only,
                                             nastring=nastring)
                 else:
-                    iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring),
+                    iseq = FastaNucWriter(FastaFormat(tags=only_keys, printNAKeys=printna, NAString=nastring),
                                           file,
                                           skip=skip,
                                           only=only)
@@ -499,7 +511,7 @@ def open_uri(uri,
                                          noquality=noquality,
                                          nastring=nastring)
                 else:
-                    iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring),
+                    iseq = FastqWriter(FastqFormat(tags=only_keys, printNAKeys=printna, NAString=nastring),
                                        file,
                                        skip=skip,
                                        only=only)
@@ -535,7 +547,7 @@ def open_uri(uri,
                                    skip = skip,
                                    only = only)
             else:
-                iseq = TabWriter(TabFormat(header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0),
+                iseq = TabWriter(TabFormat(tags=only_keys, header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0),
                                  file,
                                  skip=skip,
                                  only=only,
@@ -571,7 +583,7 @@ def open_uri(uri,
                                  commentchar)
         else: # default export is in fasta? or tab? TODO
             objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
-            iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring),
+            iseq = FastaNucWriter(FastaFormat(tags=only_keys, printNAKeys=printna, NAString=nastring),
                                   file,
                                   skip=skip,
                                   only=only)