diff --git a/python/obitools3/apps/optiongroups/__init__.py b/python/obitools3/apps/optiongroups/__init__.py index a907466..ca4f9bb 100755 --- a/python/obitools3/apps/optiongroups/__init__.py +++ b/python/obitools3/apps/optiongroups/__init__.py @@ -177,6 +177,16 @@ def __addTabularInputOption(optionManager): help="Lines starting by this char are considered as comment") +def __addTabularOutputOption(optionManager): + group = optionManager.add_argument_group("Output format options for tabular files") + + __addTabularOption(optionManager) + + group.add_argument('--na-int-stay-na', + action="store_false", dest="obi:na_int_to_0", + help="NA (Non available) integer values should be exported as NA in tabular output (default: they are converted to 0 for tabular output).") # TODO + + def __addTaxdumpInputOption(optionManager): # TODO maybe not the best way to do it group = optionManager.add_argument_group("Input format options for taxdump") @@ -210,6 +220,10 @@ def addTabularInputOption(optionManager): __addTabularInputOption(optionManager) +def addTabularOutputOption(optionManager): + __addTabularOutputOption(optionManager) + + def addTaxonomyOption(optionManager): __addTaxonomyOption(optionManager) @@ -222,6 +236,7 @@ def addAllInputOption(optionManager): __addInputOption(optionManager) __addImportInputOption(optionManager) __addTabularInputOption(optionManager) + __addTabularOutputOption(optionManager) __addTaxonomyOption(optionManager) __addTaxdumpInputOption(optionManager) @@ -314,14 +329,14 @@ def addTabularOutputOption(optionManager): def addExportOutputOption(optionManager): __addExportOutputOption(optionManager) - __addTabularOption(optionManager) + __addTabularOutputOption(optionManager) def addAllOutputOption(optionManager): __addOutputOption(optionManager) __addDMSOutputOption(optionManager) __addExportOutputOption(optionManager) - __addTabularOption(optionManager) + __addTabularOutputOption(optionManager) def addNoProgressBarOption(optionManager): diff --git 
a/python/obitools3/format/fasta.pyx b/python/obitools3/format/fasta.pyx index 54c13fa..4d98a48 100755 --- a/python/obitools3/format/fasta.pyx +++ b/python/obitools3/format/fasta.pyx @@ -7,11 +7,12 @@ from obitools3.utils cimport bytes2str cdef class FastaFormat: - def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"): + def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA", bint NAIntTo0=False): self.headerFormatter = HeaderFormat("fasta", tags=tags, printNAKeys=printNAKeys, - NAString=NAString) + NAString=NAString, + NAIntTo0=NAIntTo0) @cython.boundscheck(False) def __call__(self, object data): diff --git a/python/obitools3/format/fastq.pyx b/python/obitools3/format/fastq.pyx index 9aea233..099b716 100755 --- a/python/obitools3/format/fastq.pyx +++ b/python/obitools3/format/fastq.pyx @@ -8,11 +8,12 @@ from obitools3.utils cimport bytes2str, str2bytes, tobytes # TODO quality offset option? cdef class FastqFormat: - def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"): + def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA", bint NAIntTo0=False): self.headerFormatter = HeaderFormat("fastq", tags=tags, printNAKeys=printNAKeys, - NAString=NAString) + NAString=NAString, + NAIntTo0=NAIntTo0) @cython.boundscheck(False) def __call__(self, object data): diff --git a/python/obitools3/format/header.pxd b/python/obitools3/format/header.pxd index 7a627bf..c1d2886 100755 --- a/python/obitools3/format/header.pxd +++ b/python/obitools3/format/header.pxd @@ -4,5 +4,6 @@ cdef class HeaderFormat: cdef set tags cdef bint printNAKeys cdef bytes NAString + cdef bint NAIntTo0 cdef size_t headerBufferLength \ No newline at end of file diff --git a/python/obitools3/format/header.pyx b/python/obitools3/format/header.pyx index edea724..4ac2c29 100755 --- a/python/obitools3/format/header.pyx +++ b/python/obitools3/format/header.pyx @@ -8,13 +8,14 @@ from obitools3.dms.capi.obiview cimport 
NUC_SEQUENCE_COLUMN, \ from obitools3.utils cimport str2bytes, bytes2str_object from obitools3.dms.column.column cimport Column_line +from obitools3.dms.column.typed_column.int cimport Column_int, Column_multi_elts_int cdef class HeaderFormat: SPECIAL_KEYS = [NUC_SEQUENCE_COLUMN, ID_COLUMN, DEFINITION_COLUMN, QUALITY_COLUMN] - def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False, bytes NAString=b"NA"): + def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False, bytes NAString=b"NA", bint NAIntTo0=False): ''' @param format: @type format: `str` @@ -32,6 +33,7 @@ cdef class HeaderFormat: self.tags = set(tags) self.printNAKeys = printNAKeys self.NAString = NAString + self.NAIntTo0 = NAIntTo0 if format=="fasta": self.start=b">" @@ -57,17 +59,25 @@ cdef class HeaderFormat: if k in tags: value = data[k] if value is None or (isinstance(value, Column_line) and value.is_NA()): - if self.printNAKeys: + if self.NAIntTo0 and isinstance(data.view[k], Column_int): # people want missing int values to be 0; flag tested first so the column lookup is skipped when the option is off + value = b'0' + elif self.printNAKeys: value = self.NAString else: value = None else: if type(value) == Column_line: - value = value.bytes() + if self.NAIntTo0 and isinstance(data.view[k], Column_multi_elts_int): + value = dict(value) + for key in data.view[k].keys(): + if key not in value or value[key] is None: # fill only absent/NA elements; real non-zero values must be kept + value[key] = 0 + else: + value = value.bytes() else: if type(value) == tuple: value=list(value) - value = str2bytes(str(bytes2str_object(value))) # genius programming + value = str2bytes(str(bytes2str_object(value))) # genius programming if value is not None: lines.append(k + b"=" + value + b";") diff --git a/python/obitools3/format/tab.pxd b/python/obitools3/format/tab.pxd index a434faa..47cb859 100755 --- a/python/obitools3/format/tab.pxd +++ b/python/obitools3/format/tab.pxd @@ -5,4 +5,5 @@ cdef class TabFormat: cdef bint first_line cdef bytes NAString cdef list tags - cdef bytes sep \ No newline at end of file + cdef bytes sep + 
cdef bint NAIntTo0 \ No newline at end of file diff --git a/python/obitools3/format/tab.pyx b/python/obitools3/format/tab.pyx index 1f2c0e1..93af246 100755 --- a/python/obitools3/format/tab.pyx +++ b/python/obitools3/format/tab.pyx @@ -4,16 +4,18 @@ cimport cython from obitools3.dms.view.view cimport Line from obitools3.utils cimport bytes2str_object, str2bytes, tobytes from obitools3.dms.column.column cimport Column_line, Column_multi_elts +from obitools3.dms.column.typed_column.int cimport Column_int, Column_multi_elts_int import sys cdef class TabFormat: - def __init__(self, header=True, bytes NAString=b"NA", bytes sep=b"\t"): + def __init__(self, header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True): self.header = header self.first_line = True self.NAString = NAString self.sep = sep + self.NAIntTo0 = NAIntTo0 @cython.boundscheck(False) def __call__(self, object data): @@ -49,9 +51,14 @@ cdef class TabFormat: if value[k2] is not None: line.append(str2bytes(str(bytes2str_object(value[k2])))) # genius programming else: - line.append(self.NAString) + if self.NAIntTo0 and isinstance(data.view[k], Column_multi_elts_int): + line.append(b"0") + else: + line.append(self.NAString) else: if value is not None: line.append(str2bytes(str(bytes2str_object(value)))) + elif self.NAIntTo0 and isinstance(data.view[k], Column_int): # NA in an int column: emit a literal 0 instead of stringifying None + line.append(b"0") else: line.append(self.NAString) diff --git a/python/obitools3/uri/decode.pyx b/python/obitools3/uri/decode.pyx index d88c187..c828369 100644 --- a/python/obitools3/uri/decode.pyx +++ b/python/obitools3/uri/decode.pyx @@ -427,7 +427,21 @@ def open_uri(uri, nastring=tobytes(config["obi"][nakey]) except KeyError: nastring=b'NA' - + + if b"na_int_to_0" in qualifiers: + try: + na_int_to_0=eval(qualifiers[b"na_int_to_0"][0]) # NOTE(review): eval() on URI-supplied text; a strict True/False parse would be safer + except Exception as e: + raise MalformedURIException("Malformed 'na_int_to_0' argument in URI") + else: + try: + na_int_to_0=config["obi"]["na_int_to_0"] + except KeyError: + if 
format==b"tabular": + na_int_to_0=True + else: + na_int_to_0=False + if b"stripwhite" in qualifiers: try: stripwhite=eval(qualifiers[b"stripwhite"][0]) @@ -521,7 +535,7 @@ def open_uri(uri, skip = skip, only = only) else: - iseq = TabWriter(TabFormat(header=header, NAString=nastring, sep=sep), + iseq = TabWriter(TabFormat(header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0), file, skip=skip, only=only,