export: now exports the header for tabular files by default and adds an option

to only export specific columns
This commit is contained in:
mercierc
2021-07-22 09:23:18 +12:00
parent ce7ae4ac55
commit 99c1cd60d6
4 changed files with 67 additions and 41 deletions

View File

@ -137,10 +137,10 @@ def __addImportInputOption(optionManager):
def __addTabularOption(optionManager): def __addTabularOption(optionManager):
group = optionManager.add_argument_group("Input and output format options for tabular files") group = optionManager.add_argument_group("Input and output format options for tabular files")
group.add_argument('--header', group.add_argument('--no-header',
action="store_true", dest="obi:header", action="store_false", dest="obi:header",
default=False, default=True,
help="First line of tabular file contains column names") help="Don't print the header (first line with column names")
group.add_argument('--sep', group.add_argument('--sep',
action="store", dest="obi:sep", action="store", dest="obi:sep",
@ -297,6 +297,12 @@ def __addExportOutputOption(optionManager):
const=b'tabular', const=b'tabular',
help="Output file is in tabular format") help="Output file is in tabular format")
group.add_argument('--only-keys',
action="append", dest="obi:only_keys",
type=str,
default=[],
help="Only export the given keys (columns).")
group.add_argument('--print-na', group.add_argument('--print-na',
action="store_true", dest="obi:printna", action="store_true", dest="obi:printna",
default=False, default=False,

View File

@ -4,6 +4,6 @@ cdef class TabFormat:
cdef bint header cdef bint header
cdef bint first_line cdef bint first_line
cdef bytes NAString cdef bytes NAString
cdef list tags cdef set tags
cdef bytes sep cdef bytes sep
cdef bint NAIntTo0 cdef bint NAIntTo0

View File

@ -10,7 +10,8 @@ import sys
cdef class TabFormat: cdef class TabFormat:
def __init__(self, header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True): def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True):
self.tags = set(tags)
self.header = header self.header = header
self.first_line = True self.first_line = True
self.NAString = NAString self.NAString = NAString
@ -20,46 +21,53 @@ cdef class TabFormat:
@cython.boundscheck(False) @cython.boundscheck(False)
def __call__(self, object data): def __call__(self, object data):
cdef set ktags
cdef list tags = [key for key in data]
line = [] line = []
if self.first_line: if self.tags is not None and self.tags:
self.tags = [k for k in data.keys()] ktags = self.tags
else:
ktags = set(tags)
if self.header and self.first_line: if self.header and self.first_line:
for k in self.tags: for k in ktags:
if isinstance(data.view[k], Column_multi_elts): if k in tags:
keys = data.view[k].keys() if isinstance(data.view[k], Column_multi_elts):
keys.sort() keys = data.view[k].keys()
for k2 in keys: keys.sort()
line.append(tobytes(k)+b':'+tobytes(k2)) for k2 in keys:
else: line.append(tobytes(k)+b':'+tobytes(k2))
line.append(tobytes(k)) else:
line.append(tobytes(k))
r = self.sep.join(value for value in line) r = self.sep.join(value for value in line)
r += b'\n' r += b'\n'
line = [] line = []
for k in self.tags: for k in ktags:
value = data[k] if k in tags:
if isinstance(data.view[k], Column_multi_elts): value = data[k]
keys = data.view[k].keys() if isinstance(data.view[k], Column_multi_elts):
keys.sort() keys = data.view[k].keys()
if value is None: # all keys at None keys.sort()
for k2 in keys: # TODO could be much more efficient if value is None: # all keys at None
line.append(self.NAString) for k2 in keys: # TODO could be much more efficient
else: line.append(self.NAString)
for k2 in keys: # TODO could be much more efficient else:
if value[k2] is not None: for k2 in keys: # TODO could be much more efficient
line.append(str2bytes(str(bytes2str_object(value[k2])))) # genius programming if value[k2] is not None:
else: line.append(str2bytes(str(bytes2str_object(value[k2])))) # genius programming
if self.NAIntTo0 and isinstance(data.view[k], Column_multi_elts_int):
line.append(b"0")
else: else:
line.append(self.NAString) if self.NAIntTo0 and isinstance(data.view[k], Column_multi_elts_int):
else: line.append(b"0")
if value is not None or (self.NAIntTo0 and isinstance(data.view[k], Column_int)): else:
line.append(str2bytes(str(bytes2str_object(value)))) line.append(self.NAString)
else: else:
line.append(self.NAString) if value is not None or (self.NAIntTo0 and isinstance(data.view[k], Column_int)):
line.append(str2bytes(str(bytes2str_object(value))))
else:
line.append(self.NAString)
if self.header and self.first_line: if self.header and self.first_line:
r += self.sep.join(value for value in line) r += self.sep.join(value for value in line)

View File

@ -476,6 +476,18 @@ def open_uri(uri,
except KeyError: except KeyError:
commentchar=b'#' commentchar=b'#'
if b"only_keys" in qualifiers:
only_keys=qualifiers[b"only_keys"][0] # not sure that works but no one ever uses qualifiers
else:
try:
only_keys_str=config["obi"]["only_keys"]
only_keys=[]
for key in only_keys_str:
only_keys.append(tobytes(key))
except KeyError:
only_keys=[]
if format is not None: if format is not None:
if seqtype==b"nuc": if seqtype==b"nuc":
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
@ -486,7 +498,7 @@ def open_uri(uri,
only=only, only=only,
nastring=nastring) nastring=nastring)
else: else:
iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), iseq = FastaNucWriter(FastaFormat(tags=only_keys, printNAKeys=printna, NAString=nastring),
file, file,
skip=skip, skip=skip,
only=only) only=only)
@ -499,7 +511,7 @@ def open_uri(uri,
noquality=noquality, noquality=noquality,
nastring=nastring) nastring=nastring)
else: else:
iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring), iseq = FastqWriter(FastqFormat(tags=only_keys, printNAKeys=printna, NAString=nastring),
file, file,
skip=skip, skip=skip,
only=only) only=only)
@ -535,7 +547,7 @@ def open_uri(uri,
skip = skip, skip = skip,
only = only) only = only)
else: else:
iseq = TabWriter(TabFormat(header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0), iseq = TabWriter(TabFormat(tags=only_keys, header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0),
file, file,
skip=skip, skip=skip,
only=only, only=only,
@ -571,7 +583,7 @@ def open_uri(uri,
commentchar) commentchar)
else: # default export is in fasta? or tab? TODO else: # default export is in fasta? or tab? TODO
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), iseq = FastaNucWriter(FastaFormat(tags=only_keys, printNAKeys=printna, NAString=nastring),
file, file,
skip=skip, skip=skip,
only=only) only=only)