Switch to version 3.0.1b11

export: now exports header for tabular files by default and added option
to only export specific columns
2021-07-22 09:25:39 +12:00 · 2021-07-22 09:23:18 +12:00 · 2021-07-21 15:23:04 +12:00 · 2021-07-21 15:22:08 +12:00 · 2021-07-21 15:20:44 +12:00 · 2021-07-21 15:19:24 +12:00
16 changed files with 161 additions and 62 deletions
--- a/python/obitools3/apps/optiongroups/init.py
+++ b/python/obitools3/apps/optiongroups/init.py
@@ -137,10 +137,10 @@ def __addImportInputOption(optionManager):
 def __addTabularOption(optionManager):
    group = optionManager.add_argument_group("Input and output format options for tabular files")

-    group.add_argument('--header',
-                     action="store_true", dest="obi:header",
-                     default=False,
-                     help="First line of tabular file contains column names")
+    group.add_argument('--no-header',
+                     action="store_false", dest="obi:header",
+                     default=True,
+                     help="Don't print the header (first line with column names")

    group.add_argument('--sep',
                     action="store", dest="obi:sep",
@@ -177,6 +177,16 @@ def __addTabularInputOption(optionManager):
                     help="Lines starting by this char are considered as comment")


+def __addTabularOutputOption(optionManager):
+    group = optionManager.add_argument_group("Output format options for tabular files")
+    
+    __addTabularOption(optionManager)
+
+    group.add_argument('--na-int-stay-na',
+                     action="store_false", dest="obi:na_int_to_0",
+                     help="NA (Non available) integer values should be exported as NA in tabular output (default: they are converted to 0 for tabular output).") # TODO
+
+
 def __addTaxdumpInputOption(optionManager):  # TODO maybe not the best way to do it
    group = optionManager.add_argument_group("Input format options for taxdump")

@@ -210,6 +220,10 @@ def addTabularInputOption(optionManager):
    __addTabularInputOption(optionManager)


+def addTabularOutputOption(optionManager):
+    __addTabularOutputOption(optionManager)
+
+
 def addTaxonomyOption(optionManager):
    __addTaxonomyOption(optionManager)

@@ -222,6 +236,7 @@ def addAllInputOption(optionManager):
    __addInputOption(optionManager)
    __addImportInputOption(optionManager)
    __addTabularInputOption(optionManager)
+    __addTabularOutputOption(optionManager)
    __addTaxonomyOption(optionManager)
    __addTaxdumpInputOption(optionManager)    

@@ -282,6 +297,12 @@ def __addExportOutputOption(optionManager):
                     const=b'tabular',
                     help="Output file is in tabular format")

+    group.add_argument('--only-keys',
+                       action="append", dest="obi:only_keys",
+                       type=str,
+                       default=[],
+                       help="Only export the given keys (columns).")
+
    group.add_argument('--print-na',
                     action="store_true", dest="obi:printna",
                     default=False,
@@ -314,14 +335,14 @@ def addTabularOutputOption(optionManager):

 def addExportOutputOption(optionManager):
    __addExportOutputOption(optionManager)
-    __addTabularOption(optionManager)
+    __addTabularOutputOption(optionManager)


 def addAllOutputOption(optionManager):
    __addOutputOption(optionManager)
    __addDMSOutputOption(optionManager)
    __addExportOutputOption(optionManager)
-    __addTabularOption(optionManager)
+    __addTabularOutputOption(optionManager)


 def addNoProgressBarOption(optionManager):
--- a/python/obitools3/commands/cat.pyx
+++ b/python/obitools3/commands/cat.pyx
@@ -134,7 +134,11 @@ def run(config):
                rep = repr(entry)
                output_0.write(str2bytes(rep)+b"\n")
            else:
-                o_view[i] = entry
+                try:
+                    o_view[i] = entry
+                except:
+                    print("\nError with entry:", repr(entry))
+                    print(repr(o_view))
            i+=1
        v.close()

--- a/python/obitools3/commands/stats.pyx
+++ b/python/obitools3/commands/stats.pyx
@@ -285,8 +285,8 @@ def run(config):
            print((("%%%df" % lvarp[m]) % varp[m][c])+"\t", end="")
        for m in config['stats']['sd']:
            print((("%%%df" % lsigma[m]) % sigma[m][c])+"\t", end="")
-        print("%7d" %catcount[c], end="")
-        print("%9d" %totcount[c])
+        print("%d" %catcount[c]+"\t", end="")
+        print("%d" %totcount[c]+"\t")

    input[0].close(force=True)
    
--- a/python/obitools3/dms/column/column.pyx
+++ b/python/obitools3/dms/column/column.pyx
@@ -7,7 +7,8 @@ __OBIDMS_COLUMN_CLASS__ = {}
 from ..capi.obitypes cimport name_data_type, \
                             obitype_t, \
                             obiversion_t, \
-                             OBI_QUAL
+                             OBI_QUAL, \
+                             OBI_STR

 from ..capi.obidms cimport obi_import_column

@@ -128,6 +129,10 @@ cdef class Column(OBIWrapper) :
        else:
            elements_names_p = NULL
        
+        if column_name_b == b"SAMPLE" or column_name_b == b"sample":
+        # force str type
+            data_type = OBI_STR
+        
        if data_type == OBI_QUAL:
            if associated_column_name_b == b"":
                if column_name == QUALITY_COLUMN:
--- a/python/obitools3/dms/column/typed_column/str.pyx
+++ b/python/obitools3/dms/column/typed_column/str.pyx
@@ -74,6 +74,9 @@ cdef class Column_str(Column_idx):
        if value is None :
            value_b = <char*>OBIStr_NA
        else :
+            if self.name == b'sample' or self.name == b'SAMPLE':
+                if type(value) == int:
+                    value = str(value) # force sample ids to be str
            value_bytes = tobytes(value)
            value_b = <char*>value_bytes

@@ -137,6 +140,9 @@ cdef class Column_multi_elts_str(Column_multi_elts_idx):
        if value is None :
            value_b = <char*>OBIStr_NA
        else :
+            if self.name == b'sample' or self.name == b'SAMPLE':
+                if type(value) == int:
+                    value = str(value) # force sample ids to be str
            value_bytes = tobytes(value)
            value_b = <char*>value_bytes
                
@@ -206,6 +212,9 @@ cdef class Column_tuples_str(Column_idx):
            i = 0
            for elt in value :
                if elt is not None and elt != '':
+                    if self.name == b'sample' or self.name == b'SAMPLE':
+                        if type(elt) == int:
+                            elt = str(elt) # force sample ids to be str
                    elt_b = tobytes(elt)
                    strcpy(array+i, <char*>elt_b)
                    i = i + len(elt_b) + 1
--- a/python/obitools3/dms/taxo/taxo.pyx
+++ b/python/obitools3/dms/taxo/taxo.pyx
@@ -1,5 +1,7 @@
 #cython: language_level=3

+import sys
+
 from obitools3.utils cimport str2bytes, bytes2str, tobytes, tostr
 from ..capi.obidms cimport OBIDMS_p, obi_dms_get_full_path
                          
@@ -34,7 +36,7 @@ cdef class Taxonomy(OBIWrapper) :
        return <OBIDMS_taxonomy_p>(self._pointer)        

    cdef fill_name_dict(self):
-        print("Indexing taxon names...")
+        print("Indexing taxon names...", file=sys.stderr)
        
        cdef OBIDMS_taxonomy_p pointer = self.pointer()
        cdef ecotx_t*     taxon_p
@@ -146,7 +148,9 @@ cdef class Taxonomy(OBIWrapper) :
        taxo._ranks = []
        for r in range((<OBIDMS_taxonomy_p>pointer).ranks.count) :
            taxo._ranks.append(obi_taxo_rank_index_to_label(r, (<OBIDMS_taxonomy_p>pointer).ranks))
-                
+        
+        print('Read %d taxa' % len(taxo), file=sys.stderr)
+        
        return taxo

    
@@ -304,6 +308,11 @@ cdef class Taxonomy(OBIWrapper) :
    def name(self):
        return self._name
    
+    # ranks property getter
+    @property
+    def ranks(self):
+        return self._ranks
+
    
    def parental_tree_iterator(self, int taxid):
        """
--- a/python/obitools3/format/fasta.pyx
+++ b/python/obitools3/format/fasta.pyx
@@ -7,11 +7,12 @@ from obitools3.utils cimport bytes2str

 cdef class FastaFormat:
 	
-	def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
+	def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA", bint NAIntTo0=False):
 		self.headerFormatter = HeaderFormat("fasta",
 										    tags=tags,
 										    printNAKeys=printNAKeys,
-										    NAString=NAString)
+										    NAString=NAString,
+										    NAIntTo0=NAIntTo0)
 		
 	@cython.boundscheck(False)	
 	def __call__(self, object data):
--- a/python/obitools3/format/fastq.pyx
+++ b/python/obitools3/format/fastq.pyx
@@ -8,11 +8,12 @@ from obitools3.utils cimport bytes2str, str2bytes, tobytes
 # TODO quality offset option?
 cdef class FastqFormat:
 	
-	def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
+	def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA", bint NAIntTo0=False):
 		self.headerFormatter = HeaderFormat("fastq",
 										    tags=tags,
 										    printNAKeys=printNAKeys,
-										    NAString=NAString)
+										    NAString=NAString,
+                                            NAIntTo0=NAIntTo0)
 		
 	@cython.boundscheck(False)	
 	def __call__(self, object data):
--- a/python/obitools3/format/header.pxd
+++ b/python/obitools3/format/header.pxd
@@ -4,5 +4,6 @@ cdef class HeaderFormat:
    cdef set    tags
    cdef bint   printNAKeys
    cdef bytes  NAString
+    cdef bint   NAIntTo0
    cdef size_t headerBufferLength
    
--- a/python/obitools3/format/header.pyx
+++ b/python/obitools3/format/header.pyx
@@ -8,13 +8,14 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \

 from obitools3.utils cimport str2bytes, bytes2str_object
 from obitools3.dms.column.column cimport Column_line
+from obitools3.dms.column.typed_column.int cimport Column_int, Column_multi_elts_int


 cdef class HeaderFormat:
    
    SPECIAL_KEYS = [NUC_SEQUENCE_COLUMN, ID_COLUMN, DEFINITION_COLUMN, QUALITY_COLUMN]
    
-    def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False, bytes NAString=b"NA"):
+    def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False, bytes NAString=b"NA", bint NAIntTo0=False):
        '''
            @param format: 
            @type  format: `str`
@@ -32,6 +33,7 @@ cdef class HeaderFormat:
        self.tags           = set(tags)
        self.printNAKeys    = printNAKeys
        self.NAString       = NAString
+        self.NAIntTo0       = NAIntTo0
        
        if format=="fasta":
            self.start=b">"
@@ -57,17 +59,25 @@ cdef class HeaderFormat:
            if k in tags:
                value = data[k]
                if value is None or (isinstance(value, Column_line) and value.is_NA()):
-                    if self.printNAKeys:
+                    if isinstance(data.view[k], Column_int) and self.NAIntTo0: # people want missing int values to be 0
+                        value = b'0'
+                    elif self.printNAKeys:
                        value = self.NAString
                    else:
                        value = None
                else:
                    if type(value) == Column_line:
-                        value = value.bytes()
+                        if isinstance(data.view[k], Column_multi_elts_int) and self.NAIntTo0:
+                            value = dict(value)
+                            for key in data.view[k].keys():
+                                if key not in value or value[key]:
+                                    value[key] = 0
+                        else:
+                            value = value.bytes()
                    else:
                        if type(value) == tuple:
                            value=list(value)
-                        value = str2bytes(str(bytes2str_object(value))) # genius programming
+                    value = str2bytes(str(bytes2str_object(value))) # genius programming
                if value is not None:
                    lines.append(k + b"=" + value + b";")   
                
--- a/python/obitools3/format/tab.pxd
+++ b/python/obitools3/format/tab.pxd
@@ -4,5 +4,6 @@ cdef class TabFormat:
    cdef bint header
    cdef bint first_line
    cdef bytes NAString
-    cdef list tags
-    cdef bytes sep
+    cdef set   tags
+    cdef bytes sep
+    cdef bint NAIntTo0
--- a/python/obitools3/format/tab.pyx
+++ b/python/obitools3/format/tab.pyx
@@ -4,57 +4,70 @@ cimport cython
 from obitools3.dms.view.view cimport Line
 from obitools3.utils cimport bytes2str_object, str2bytes, tobytes
 from obitools3.dms.column.column cimport Column_line, Column_multi_elts
+from obitools3.dms.column.typed_column.int cimport Column_int, Column_multi_elts_int

 import sys

 cdef class TabFormat:
    
-    def __init__(self, header=True, bytes NAString=b"NA", bytes sep=b"\t"):
+    def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True):
+        self.tags = set(tags)
        self.header = header
        self.first_line = True
        self.NAString = NAString
        self.sep = sep
+        self.NAIntTo0 = NAIntTo0
        
    @cython.boundscheck(False)    
    def __call__(self, object data):
        
+        cdef set ktags
+        cdef list tags = [key for key in data]
+        
        line = []
        
-        if self.first_line:
-            self.tags = [k for k in data.keys()]
-        
+        if self.tags is not None and self.tags:
+            ktags = self.tags
+        else:
+            ktags = set(tags)            
+                
        if self.header and self.first_line:
-            for k in self.tags:
-                if isinstance(data.view[k], Column_multi_elts):
-                    keys = data.view[k].keys()
-                    keys.sort()
-                    for k2 in keys:
-                        line.append(tobytes(k)+b':'+tobytes(k2))
-                else:
-                    line.append(tobytes(k))
+            for k in ktags:
+                if k in tags:
+                    if isinstance(data.view[k], Column_multi_elts):
+                        keys = data.view[k].keys()
+                        keys.sort()
+                        for k2 in keys:
+                            line.append(tobytes(k)+b':'+tobytes(k2))
+                    else:
+                        line.append(tobytes(k))
            r = self.sep.join(value for value in line)
            r += b'\n'
            line = []
                    
-        for k in self.tags:
-            value = data[k]
-            if isinstance(data.view[k], Column_multi_elts):
-                keys = data.view[k].keys()
-                keys.sort()
-                if value is None:  # all keys at None
-                    for k2 in keys: # TODO could be much more efficient
-                        line.append(self.NAString)
-                else:
-                    for k2 in keys: # TODO could be much more efficient
-                        if value[k2] is not None:
-                            line.append(str2bytes(str(bytes2str_object(value[k2]))))  # genius programming
-                        else:
+        for k in ktags:
+            if k in tags:
+                value = data[k]
+                if isinstance(data.view[k], Column_multi_elts):
+                    keys = data.view[k].keys()
+                    keys.sort()
+                    if value is None:  # all keys at None
+                        for k2 in keys: # TODO could be much more efficient
                            line.append(self.NAString)
-            else:
-                if value is not None:
-                    line.append(str2bytes(str(bytes2str_object(value))))
+                    else:
+                        for k2 in keys: # TODO could be much more efficient
+                            if value[k2] is not None:
+                                line.append(str2bytes(str(bytes2str_object(value[k2]))))  # genius programming
+                            else:
+                                if self.NAIntTo0 and isinstance(data.view[k], Column_multi_elts_int):
+                                    line.append(b"0")
+                                else:
+                                    line.append(self.NAString)
                else:
-                    line.append(self.NAString)
+                    if value is not None or (self.NAIntTo0 and isinstance(data.view[k], Column_int)):
+                        line.append(str2bytes(str(bytes2str_object(value))))
+                    else:
+                        line.append(self.NAString)
                  	      	
        if self.header and self.first_line:
            r += self.sep.join(value for value in line)
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@@ -427,7 +427,21 @@ def open_uri(uri,
                nastring=tobytes(config["obi"][nakey])
            except KeyError:
                nastring=b'NA'
-                
+
+        if b"na_int_to_0" in qualifiers:
+            try:
+                na_int_to_0=eval(qualifiers[b"na_int_to_0"][0])
+            except Exception as e:
+                raise MalformedURIException("Malformed 'NA_int_to_0' argument in URI")
+        else:
+            try:
+                na_int_to_0=config["obi"]["na_int_to_0"]
+            except KeyError:
+                if format==b"tabular":
+                    na_int_to_0=True
+                else:
+                    na_int_to_0=False
+                       
        if b"stripwhite" in qualifiers:
            try:
                stripwhite=eval(qualifiers[b"stripwhite"][0])
@@ -462,6 +476,18 @@ def open_uri(uri,
            except KeyError:
                commentchar=b'#'

+        if b"only_keys" in qualifiers:
+            only_keys=qualifiers[b"only_keys"][0] # not sure that works but no one ever uses qualifiers
+        else:
+            try:
+                only_keys_str=config["obi"]["only_keys"]
+                only_keys=[]
+                for key in only_keys_str:
+                    only_keys.append(tobytes(key))
+            except KeyError:
+                only_keys=[]
+
+
        if format is not None:
            if seqtype==b"nuc":
                objclass = Nuc_Seq    # Nuc_Seq_Stored? TODO
@@ -472,7 +498,7 @@ def open_uri(uri,
                                                only=only,
                                                nastring=nastring)
                    else:
-                        iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), 
+                        iseq = FastaNucWriter(FastaFormat(tags=only_keys, printNAKeys=printna, NAString=nastring), 
                                              file,
                                              skip=skip,
                                              only=only)
@@ -485,7 +511,7 @@ def open_uri(uri,
                                             noquality=noquality,
                                             nastring=nastring)
                    else:
-                        iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring), 
+                        iseq = FastqWriter(FastqFormat(tags=only_keys, printNAKeys=printna, NAString=nastring), 
                                           file,
                                           skip=skip,
                                           only=only)
@@ -521,7 +547,7 @@ def open_uri(uri,
                                       skip = skip,
                                       only = only)
                else:
-                    iseq = TabWriter(TabFormat(header=header, NAString=nastring, sep=sep), 
+                    iseq = TabWriter(TabFormat(tags=only_keys, header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0), 
                                               file,
                                               skip=skip,
                                               only=only,
@@ -557,7 +583,7 @@ def open_uri(uri,
                                                              commentchar)
            else:    # default export is in fasta? or tab? TODO
                objclass = Nuc_Seq   # Nuc_Seq_Stored? TODO
-                iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), 
+                iseq = FastaNucWriter(FastaFormat(tags=only_keys, printNAKeys=printna, NAString=nastring), 
                                      file,
                                      skip=skip,
                                      only=only)
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '1b10'
+serial= '1b11'

 version ="%d.%d.%s" % (major,minor,serial)
--- a/python/obitools3/writers/tab.pyx
+++ b/python/obitools3/writers/tab.pyx
@@ -20,8 +20,6 @@ cdef class TabWriter:
            self.only = -1
        else:
            self.only = int(only)
-            if header:
-                self.only += 1

        self.formatter = formatter
        self.output = output_object
--- a/src/obidms_taxonomy.c
+++ b/src/obidms_taxonomy.c
@@ -873,7 +873,7 @@ static ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* loc
 	taxa_index->buffer_size = taxa_index->count;

 	taxa_index->max_taxid = 0;
-	printf("Reading %d taxa...\n", count_taxa);
+	fprintf(stderr, "Reading %d taxa...\n", count_taxa);
 	for (i=0; i<count_taxa; i++)
 	{
 		readnext_ecotaxon(f_taxa, &(taxa_index->taxon[i]));
@@ -886,9 +886,9 @@ static ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* loc
 	}

 	if (count_local_taxa > 0)
-		printf("Reading %d local taxa...\n", count_local_taxa);
+		fprintf(stderr, "Reading %d local taxa...\n", count_local_taxa);
 	else
-		printf("No local taxa\n");
+		fprintf(stderr, "No local taxa\n");

 	count_taxa = taxa_index->count;
Author	SHA1	Message	Date
mercierc	82d2642000	Switch to version 3.0.1b11	2021-07-22 09:25:39 +12:00
mercierc	99c1cd60d6	export: now exports header for tabular files by default and added option to only export specific columns	2021-07-22 09:23:18 +12:00
mercierc	ce7ae4ac55	export: fixed 'only' option printing one too many if printing header	2021-07-21 15:23:04 +12:00
mercierc	0b4283bb58	cat: improved error handling	2021-07-21 15:22:08 +12:00
mercierc	747f3efbb2	Improved taxonomy reading information display	2021-07-21 15:20:44 +12:00
mercierc	6c1a3aff47	Fixed the handling of sample names that are numbers (forcing conversion)	2021-07-21 15:19:24 +12:00
mercierc	e2932b05f2	Implements #108 export integer missing values as 0 for tables by default	2021-07-21 14:41:54 +12:00
mercierc	32345b9ec4	Addresses #111	2021-07-19 15:55:25 +12:00