From 99c1cd60d68086bda49dad832c5b65d0887805c9 Mon Sep 17 00:00:00 2001
From: mercierc <celine.mercier@scionresearch.com>
Date: Thu, 22 Jul 2021 09:23:18 +1200
Subject: [PATCH] export: now exports header for tabular files by default and
 added option to only export specific columns

---
 .../obitools3/apps/optiongroups/__init__.py   | 14 ++--
 python/obitools3/format/tab.pxd               |  2 +-
 python/obitools3/format/tab.pyx               | 72 ++++++++++---------
 python/obitools3/uri/decode.pyx               | 20 ++++--
 4 files changed, 67 insertions(+), 41 deletions(-)

diff --git a/python/obitools3/apps/optiongroups/__init__.py b/python/obitools3/apps/optiongroups/__init__.py
index ca4f9bb..79ccbe0 100755
--- a/python/obitools3/apps/optiongroups/__init__.py
+++ b/python/obitools3/apps/optiongroups/__init__.py
@@ -137,10 +137,10 @@ def __addImportInputOption(optionManager):
 def __addTabularOption(optionManager):
     group = optionManager.add_argument_group("Input and output format options for tabular files")
 
-    group.add_argument('--header',
-                     action="store_true", dest="obi:header",
-                     default=False,
-                     help="First line of tabular file contains column names")
+    group.add_argument('--no-header',
+                     action="store_false", dest="obi:header",
+                     default=True,
+                     help="Don't print the header (first line with column names)")
 
     group.add_argument('--sep',
                      action="store", dest="obi:sep",
@@ -297,6 +297,12 @@ def __addExportOutputOption(optionManager):
                      const=b'tabular',
                      help="Output file is in tabular format")
 
+    group.add_argument('--only-keys',
+                       action="append", dest="obi:only_keys",
+                       type=str,
+                       default=[],
+                       help="Only export the given keys (columns).")
+
     group.add_argument('--print-na',
                      action="store_true", dest="obi:printna",
                      default=False,
diff --git a/python/obitools3/format/tab.pxd b/python/obitools3/format/tab.pxd
index 47cb859..9574fd2 100755
--- a/python/obitools3/format/tab.pxd
+++ b/python/obitools3/format/tab.pxd
@@ -4,6 +4,6 @@ cdef class TabFormat:
     cdef bint header
     cdef bint first_line
     cdef bytes NAString
-    cdef list tags
+    cdef set   tags
     cdef bytes sep
     cdef bint NAIntTo0
\ No newline at end of file
diff --git a/python/obitools3/format/tab.pyx b/python/obitools3/format/tab.pyx
index 93af246..cabdcd2 100755
--- a/python/obitools3/format/tab.pyx
+++ b/python/obitools3/format/tab.pyx
@@ -10,7 +10,8 @@ import sys
 
 cdef class TabFormat:
     
-    def __init__(self, header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True):
+    def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True):
+        self.tags = set(tags)
         self.header = header
         self.first_line = True
         self.NAString = NAString
@@ -20,46 +21,53 @@ cdef class TabFormat:
     @cython.boundscheck(False)    
     def __call__(self, object data):
         
+        cdef list ktags  # keys to export, kept in the record's own key order
+        cdef list tags = [key for key in data]
+        
         line = []
         
-        if self.first_line:
-            self.tags = [k for k in data.keys()]
-        
+        if self.tags is not None and self.tags:
+            ktags = [key for key in tags if key in self.tags]  # set is used for membership only
+        else:
+            ktags = tags
+                
         if self.header and self.first_line:
-            for k in self.tags:
-                if isinstance(data.view[k], Column_multi_elts):
-                    keys = data.view[k].keys()
-                    keys.sort()
-                    for k2 in keys:
-                        line.append(tobytes(k)+b':'+tobytes(k2))
-                else:
-                    line.append(tobytes(k))
+            for k in ktags:
+                if k in tags:
+                    if isinstance(data.view[k], Column_multi_elts):
+                        keys = data.view[k].keys()
+                        keys.sort()
+                        for k2 in keys:
+                            line.append(tobytes(k)+b':'+tobytes(k2))
+                    else:
+                        line.append(tobytes(k))
             r = self.sep.join(value for value in line)
             r += b'\n'
             line = []
                     
-        for k in self.tags:
-            value = data[k]
-            if isinstance(data.view[k], Column_multi_elts):
-                keys = data.view[k].keys()
-                keys.sort()
-                if value is None:  # all keys at None
-                    for k2 in keys: # TODO could be much more efficient
-                        line.append(self.NAString)
-                else:
-                    for k2 in keys: # TODO could be much more efficient
-                        if value[k2] is not None:
-                            line.append(str2bytes(str(bytes2str_object(value[k2]))))  # genius programming
-                        else:
-                            if self.NAIntTo0 and isinstance(data.view[k], Column_multi_elts_int):
-                                line.append(b"0")
+        for k in ktags:
+            if k in tags:
+                value = data[k]
+                if isinstance(data.view[k], Column_multi_elts):
+                    keys = data.view[k].keys()
+                    keys.sort()
+                    if value is None:  # all keys at None
+                        for k2 in keys: # TODO could be much more efficient
+                            line.append(self.NAString)
+                    else:
+                        for k2 in keys: # TODO could be much more efficient
+                            if value[k2] is not None:
+                                line.append(str2bytes(str(bytes2str_object(value[k2]))))  # genius programming
                             else:
-                                line.append(self.NAString)
-            else:
-                if value is not None or (self.NAIntTo0 and isinstance(data.view[k], Column_int)):
-                    line.append(str2bytes(str(bytes2str_object(value))))
+                                if self.NAIntTo0 and isinstance(data.view[k], Column_multi_elts_int):
+                                    line.append(b"0")
+                                else:
+                                    line.append(self.NAString)
                 else:
-                    line.append(self.NAString)
+                    if value is not None or (self.NAIntTo0 and isinstance(data.view[k], Column_int)):
+                        line.append(str2bytes(str(bytes2str_object(value))))
+                    else:
+                        line.append(self.NAString)
                   	      	
         if self.header and self.first_line:
             r += self.sep.join(value for value in line)
diff --git a/python/obitools3/uri/decode.pyx b/python/obitools3/uri/decode.pyx
index c828369..5720479 100644
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@@ -476,6 +476,18 @@ def open_uri(uri,
             except KeyError:
                 commentchar=b'#'
 
+        # Keys (columns) to export; an empty list means "no restriction".
+        if b"only_keys" in qualifiers:
+            # The formatters take a list of keys, so keep every qualifier value
+            # (assumes the qualifier holds a sequence of keys -- TODO confirm).
+            only_keys=[tobytes(key) for key in qualifiers[b"only_keys"]]
+        else:
+            try:
+                only_keys=[tobytes(key) for key in config["obi"]["only_keys"]]
+            except KeyError:
+                only_keys=[]
+
+
         if format is not None:
             if seqtype==b"nuc":
                 objclass = Nuc_Seq    # Nuc_Seq_Stored? TODO
@@ -486,7 +498,7 @@ def open_uri(uri,
                                                 only=only,
                                                 nastring=nastring)
                     else:
-                        iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), 
+                        iseq = FastaNucWriter(FastaFormat(tags=only_keys, printNAKeys=printna, NAString=nastring), 
                                               file,
                                               skip=skip,
                                               only=only)
@@ -499,7 +511,7 @@ def open_uri(uri,
                                              noquality=noquality,
                                              nastring=nastring)
                     else:
-                        iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring), 
+                        iseq = FastqWriter(FastqFormat(tags=only_keys, printNAKeys=printna, NAString=nastring), 
                                            file,
                                            skip=skip,
                                            only=only)
@@ -535,7 +547,7 @@ def open_uri(uri,
                                        skip = skip,
                                        only = only)
                 else:
-                    iseq = TabWriter(TabFormat(header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0), 
+                    iseq = TabWriter(TabFormat(tags=only_keys, header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0), 
                                                file,
                                                skip=skip,
                                                only=only,
@@ -571,7 +583,7 @@ def open_uri(uri,
                                                               commentchar)
             else:    # default export is in fasta? or tab? TODO
                 objclass = Nuc_Seq   # Nuc_Seq_Stored? TODO
-                iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), 
+                iseq = FastaNucWriter(FastaFormat(tags=only_keys, printNAKeys=printna, NAString=nastring), 
                                       file,
                                       skip=skip,
                                       only=only)