Implements #108: export missing integer values as 0 for tables by default
This commit is contained in:
@ -177,6 +177,16 @@ def __addTabularInputOption(optionManager):
|
||||
help="Lines starting by this char are considered as comment")
|
||||
|
||||
|
||||
def __addTabularOutputOption(optionManager):
    '''
    Register the command line options specific to tabular output.

    Creates the "Output format options for tabular files" argument group,
    calls `__addTabularOption` on the same option manager and adds the
    `--na-int-stay-na` flag to the new group.

    The flag is declared with the `store_false` action on the
    `obi:na_int_to_0` destination: argparse therefore gives the destination
    a default of `True` (NA integers are exported as 0) and sets it to
    `False` when the flag is present (NA integers stay NA).

    @param optionManager: object exposing `add_argument_group`
                          (presumably an `argparse.ArgumentParser` -- TODO confirm)
    '''
    group = optionManager.add_argument_group("Output format options for tabular files")

    # Presumably registers the options shared by every tabular format
    # (helper body not visible here -- verify against its definition).
    __addTabularOption(optionManager)

    group.add_argument('--na-int-stay-na',
                       action="store_false", dest="obi:na_int_to_0",
                       help="NA (Not available) integer values should be exported as NA in tabular output (default: they are converted to 0 for tabular output).")  # TODO
|
||||
|
||||
|
||||
def __addTaxdumpInputOption(optionManager): # TODO maybe not the best way to do it
|
||||
group = optionManager.add_argument_group("Input format options for taxdump")
|
||||
|
||||
@ -210,6 +220,10 @@ def addTabularInputOption(optionManager):
|
||||
__addTabularInputOption(optionManager)
|
||||
|
||||
|
||||
def addTabularOutputOption(optionManager):
    '''
    Public wrapper around the module-private `__addTabularOutputOption`.

    Forwards `optionManager` unchanged to the private helper and returns
    nothing.

    @param optionManager: option manager handed over to the private helper
    '''
    __addTabularOutputOption(optionManager)
|
||||
|
||||
|
||||
def addTaxonomyOption(optionManager):
    '''
    Public wrapper around the module-private `__addTaxonomyOption`.

    Forwards `optionManager` unchanged to the private helper and returns
    nothing.

    @param optionManager: option manager handed over to the private helper
    '''
    __addTaxonomyOption(optionManager)
|
||||
|
||||
@ -222,6 +236,7 @@ def addAllInputOption(optionManager):
|
||||
__addInputOption(optionManager)
|
||||
__addImportInputOption(optionManager)
|
||||
__addTabularInputOption(optionManager)
|
||||
__addTabularOutputOption(optionManager)
|
||||
__addTaxonomyOption(optionManager)
|
||||
__addTaxdumpInputOption(optionManager)
|
||||
|
||||
@ -314,14 +329,14 @@ def addTabularOutputOption(optionManager):
|
||||
|
||||
def addExportOutputOption(optionManager):
    '''
    Register the export output options together with the tabular output ones.

    @param optionManager: option manager handed over to the private helpers
    '''
    __addExportOutputOption(optionManager)
    # `__addTabularOutputOption` already calls `__addTabularOption` itself, so
    # the shared helper must not be called a second time here: registering
    # the same option strings twice is rejected by argparse's default
    # conflict handler (assuming the helper adds arguments -- its body is
    # not visible from here).
    __addTabularOutputOption(optionManager)
|
||||
|
||||
|
||||
def addAllOutputOption(optionManager):
    '''
    Register every output-related option group on `optionManager`.

    Calls, in order, the private helpers for the generic output options,
    the DMS output options, the export output options and the tabular
    output options.

    @param optionManager: option manager handed over to the private helpers
    '''
    __addOutputOption(optionManager)
    __addDMSOutputOption(optionManager)
    __addExportOutputOption(optionManager)
    # `__addTabularOutputOption` already calls `__addTabularOption` itself, so
    # the shared helper must not be called a second time here: registering
    # the same option strings twice is rejected by argparse's default
    # conflict handler (assuming the helper adds arguments -- its body is
    # not visible from here).
    __addTabularOutputOption(optionManager)
|
||||
|
||||
|
||||
def addNoProgressBarOption(optionManager):
|
||||
|
@ -7,11 +7,12 @@ from obitools3.utils cimport bytes2str
|
||||
|
||||
cdef class FastaFormat:
|
||||
|
||||
def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
|
||||
def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA", bint NAIntTo0=False):
|
||||
self.headerFormatter = HeaderFormat("fasta",
|
||||
tags=tags,
|
||||
printNAKeys=printNAKeys,
|
||||
NAString=NAString)
|
||||
NAString=NAString,
|
||||
NAIntTo0=NAIntTo0)
|
||||
|
||||
@cython.boundscheck(False)
|
||||
def __call__(self, object data):
|
||||
|
@ -8,11 +8,12 @@ from obitools3.utils cimport bytes2str, str2bytes, tobytes
|
||||
# TODO quality offset option?
|
||||
cdef class FastqFormat:
|
||||
|
||||
def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
|
||||
def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA", bint NAIntTo0=False):
|
||||
self.headerFormatter = HeaderFormat("fastq",
|
||||
tags=tags,
|
||||
printNAKeys=printNAKeys,
|
||||
NAString=NAString)
|
||||
NAString=NAString,
|
||||
NAIntTo0=NAIntTo0)
|
||||
|
||||
@cython.boundscheck(False)
|
||||
def __call__(self, object data):
|
||||
|
@ -4,5 +4,6 @@ cdef class HeaderFormat:
|
||||
cdef set tags
|
||||
cdef bint printNAKeys
|
||||
cdef bytes NAString
|
||||
cdef bint NAIntTo0
|
||||
cdef size_t headerBufferLength
|
||||
|
@ -8,13 +8,14 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
|
||||
|
||||
from obitools3.utils cimport str2bytes, bytes2str_object
|
||||
from obitools3.dms.column.column cimport Column_line
|
||||
from obitools3.dms.column.typed_column.int cimport Column_int, Column_multi_elts_int
|
||||
|
||||
|
||||
cdef class HeaderFormat:
|
||||
|
||||
SPECIAL_KEYS = [NUC_SEQUENCE_COLUMN, ID_COLUMN, DEFINITION_COLUMN, QUALITY_COLUMN]
|
||||
|
||||
def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
|
||||
def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False, bytes NAString=b"NA", bint NAIntTo0=False):
|
||||
'''
|
||||
@param format:
|
||||
@type format: `str`
|
||||
@ -32,6 +33,7 @@ cdef class HeaderFormat:
|
||||
self.tags = set(tags)
|
||||
self.printNAKeys = printNAKeys
|
||||
self.NAString = NAString
|
||||
self.NAIntTo0 = NAIntTo0
|
||||
|
||||
if format=="fasta":
|
||||
self.start=b">"
|
||||
@ -57,12 +59,20 @@ cdef class HeaderFormat:
|
||||
if k in tags:
|
||||
value = data[k]
|
||||
if value is None or (isinstance(value, Column_line) and value.is_NA()):
|
||||
if self.printNAKeys:
|
||||
if isinstance(data.view[k], Column_int) and self.NAIntTo0: # people want missing int values to be 0
|
||||
value = b'0'
|
||||
elif self.printNAKeys:
|
||||
value = self.NAString
|
||||
else:
|
||||
value = None
|
||||
else:
|
||||
if type(value) == Column_line:
|
||||
if isinstance(data.view[k], Column_multi_elts_int) and self.NAIntTo0:
|
||||
value = dict(value)
|
||||
for key in data.view[k].keys():
|
||||
if key not in value or value[key]:
|
||||
value[key] = 0
|
||||
else:
|
||||
value = value.bytes()
|
||||
else:
|
||||
if type(value) == tuple:
|
||||
|
@ -6,3 +6,4 @@ cdef class TabFormat:
|
||||
cdef bytes NAString
|
||||
cdef list tags
|
||||
cdef bytes sep
|
||||
cdef bint NAIntTo0
|
@ -4,16 +4,18 @@ cimport cython
|
||||
from obitools3.dms.view.view cimport Line
|
||||
from obitools3.utils cimport bytes2str_object, str2bytes, tobytes
|
||||
from obitools3.dms.column.column cimport Column_line, Column_multi_elts
|
||||
from obitools3.dms.column.typed_column.int cimport Column_int, Column_multi_elts_int
|
||||
|
||||
import sys
|
||||
|
||||
cdef class TabFormat:
|
||||
|
||||
def __init__(self, header=True, bytes NAString=b"NA", bytes sep=b"\t"):
|
||||
def __init__(self, header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True):
|
||||
self.header = header
|
||||
self.first_line = True
|
||||
self.NAString = NAString
|
||||
self.sep = sep
|
||||
self.NAIntTo0 = NAIntTo0
|
||||
|
||||
@cython.boundscheck(False)
|
||||
def __call__(self, object data):
|
||||
@ -48,10 +50,13 @@ cdef class TabFormat:
|
||||
for k2 in keys: # TODO could be much more efficient
|
||||
if value[k2] is not None:
|
||||
line.append(str2bytes(str(bytes2str_object(value[k2])))) # genius programming
|
||||
else:
|
||||
if self.NAIntTo0 and isinstance(data.view[k], Column_multi_elts_int):
|
||||
line.append(b"0")
|
||||
else:
|
||||
line.append(self.NAString)
|
||||
else:
|
||||
if value is not None:
|
||||
if value is not None or (self.NAIntTo0 and isinstance(data.view[k], Column_int)):
|
||||
line.append(str2bytes(str(bytes2str_object(value))))
|
||||
else:
|
||||
line.append(self.NAString)
|
||||
|
@ -428,6 +428,20 @@ def open_uri(uri,
|
||||
except KeyError:
|
||||
nastring=b'NA'
|
||||
|
||||
if b"na_int_to_0" in qualifiers:
|
||||
try:
|
||||
na_int_to_0=eval(qualifiers[b"na_int_to_0"][0])
|
||||
except Exception as e:
|
||||
raise MalformedURIException("Malformed 'NA_int_to_0' argument in URI")
|
||||
else:
|
||||
try:
|
||||
na_int_to_0=config["obi"]["na_int_to_0"]
|
||||
except KeyError:
|
||||
if format==b"tabular":
|
||||
na_int_to_0=True
|
||||
else:
|
||||
na_int_to_0=False
|
||||
|
||||
if b"stripwhite" in qualifiers:
|
||||
try:
|
||||
stripwhite=eval(qualifiers[b"stripwhite"][0])
|
||||
@ -521,7 +535,7 @@ def open_uri(uri,
|
||||
skip = skip,
|
||||
only = only)
|
||||
else:
|
||||
iseq = TabWriter(TabFormat(header=header, NAString=nastring, sep=sep),
|
||||
iseq = TabWriter(TabFormat(header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0),
|
||||
file,
|
||||
skip=skip,
|
||||
only=only,
|
||||
|
Reference in New Issue
Block a user