Compare commits

...

17 Commits

Author SHA1 Message Date
f43856b712 switch to version 3.0.1b12 2021-09-08 10:56:55 +12:00
9e0c319806 Cython: fixed rewriting of column when rewriting a 1 element dict column 2021-09-08 10:54:23 +12:00
58b42cd977 C: views: now correctly parses view names containing '.' when cleaning unfinished views. Closes #115 2021-09-08 10:52:42 +12:00
34de90bce6 ngsfilter: checks better if there is an associated sequencing quality 2021-09-08 10:30:11 +12:00
4be9f36f99 stats: fixed the computation of variance when it is equal to 0 2021-08-05 11:32:16 +12:00
f10e78ba3c C: fixed the printing of view informations from a DMS (fixes #114) 2021-08-05 11:31:24 +12:00
88c8463ed7 Cython: taxonomy: improved logging 2021-08-05 11:29:20 +12:00
89168271ef ecopcr: now accepting taxonomy from a different DMS than the reference sequences 2021-08-05 11:28:57 +12:00
82d2642000 Switch to version 3.0.1b11 2021-07-22 09:25:39 +12:00
99c1cd60d6 export: now exports header for tabular files by default and added option to only export specific columns 2021-07-22 09:23:18 +12:00
ce7ae4ac55 export: fixed 'only' option printing one too many if printing header 2021-07-21 15:23:04 +12:00
0b4283bb58 cat: improved error handling 2021-07-21 15:22:08 +12:00
747f3efbb2 Improved taxonomy reading information display 2021-07-21 15:20:44 +12:00
6c1a3aff47 Fixed the handling of sample names that are numbers (forcing conversion) 2021-07-21 15:19:24 +12:00
e2932b05f2 Implements #108 export integer missing values as 0 for tables by default 2021-07-21 14:41:54 +12:00
32345b9ec4 Addresses #111 2021-07-19 15:55:25 +12:00
9334cf6cc6 import: improved genbank parser and switch to version 3.0.1.b10 2021-06-17 08:42:01 +12:00
25 changed files with 207 additions and 77 deletions

View File

@ -137,10 +137,10 @@ def __addImportInputOption(optionManager):
def __addTabularOption(optionManager): def __addTabularOption(optionManager):
group = optionManager.add_argument_group("Input and output format options for tabular files") group = optionManager.add_argument_group("Input and output format options for tabular files")
group.add_argument('--header', group.add_argument('--no-header',
action="store_true", dest="obi:header", action="store_false", dest="obi:header",
default=False, default=True,
help="First line of tabular file contains column names") help="Don't print the header (first line with column names")
group.add_argument('--sep', group.add_argument('--sep',
action="store", dest="obi:sep", action="store", dest="obi:sep",
@ -177,6 +177,16 @@ def __addTabularInputOption(optionManager):
help="Lines starting by this char are considered as comment") help="Lines starting by this char are considered as comment")
def __addTabularOutputOption(optionManager):
group = optionManager.add_argument_group("Output format options for tabular files")
__addTabularOption(optionManager)
group.add_argument('--na-int-stay-na',
action="store_false", dest="obi:na_int_to_0",
help="NA (Non available) integer values should be exported as NA in tabular output (default: they are converted to 0 for tabular output).") # TODO
def __addTaxdumpInputOption(optionManager): # TODO maybe not the best way to do it def __addTaxdumpInputOption(optionManager): # TODO maybe not the best way to do it
group = optionManager.add_argument_group("Input format options for taxdump") group = optionManager.add_argument_group("Input format options for taxdump")
@ -210,6 +220,10 @@ def addTabularInputOption(optionManager):
__addTabularInputOption(optionManager) __addTabularInputOption(optionManager)
def addTabularOutputOption(optionManager):
__addTabularOutputOption(optionManager)
def addTaxonomyOption(optionManager): def addTaxonomyOption(optionManager):
__addTaxonomyOption(optionManager) __addTaxonomyOption(optionManager)
@ -222,6 +236,7 @@ def addAllInputOption(optionManager):
__addInputOption(optionManager) __addInputOption(optionManager)
__addImportInputOption(optionManager) __addImportInputOption(optionManager)
__addTabularInputOption(optionManager) __addTabularInputOption(optionManager)
__addTabularOutputOption(optionManager)
__addTaxonomyOption(optionManager) __addTaxonomyOption(optionManager)
__addTaxdumpInputOption(optionManager) __addTaxdumpInputOption(optionManager)
@ -282,6 +297,12 @@ def __addExportOutputOption(optionManager):
const=b'tabular', const=b'tabular',
help="Output file is in tabular format") help="Output file is in tabular format")
group.add_argument('--only-keys',
action="append", dest="obi:only_keys",
type=str,
default=[],
help="Only export the given keys (columns).")
group.add_argument('--print-na', group.add_argument('--print-na',
action="store_true", dest="obi:printna", action="store_true", dest="obi:printna",
default=False, default=False,
@ -314,14 +335,14 @@ def addTabularOutputOption(optionManager):
def addExportOutputOption(optionManager): def addExportOutputOption(optionManager):
__addExportOutputOption(optionManager) __addExportOutputOption(optionManager)
__addTabularOption(optionManager) __addTabularOutputOption(optionManager)
def addAllOutputOption(optionManager): def addAllOutputOption(optionManager):
__addOutputOption(optionManager) __addOutputOption(optionManager)
__addDMSOutputOption(optionManager) __addDMSOutputOption(optionManager)
__addExportOutputOption(optionManager) __addExportOutputOption(optionManager)
__addTabularOption(optionManager) __addTabularOutputOption(optionManager)
def addNoProgressBarOption(optionManager): def addNoProgressBarOption(optionManager):

View File

@ -134,7 +134,11 @@ def run(config):
rep = repr(entry) rep = repr(entry)
output_0.write(str2bytes(rep)+b"\n") output_0.write(str2bytes(rep)+b"\n")
else: else:
o_view[i] = entry try:
o_view[i] = entry
except:
print("\nError with entry:", repr(entry))
print(repr(o_view))
i+=1 i+=1
v.close() v.close()

View File

@ -175,6 +175,14 @@ def run(config):
o_dms_name = output[0].name o_dms_name = output[0].name
o_view_name = output[1] o_view_name = output[1]
# Open the taxonomy DMS
taxdms = open_uri(config['obi']['taxoURI'],
dms_only=True)
if taxdms is None:
raise Exception("Could not open taxonomy DMS")
tax_dms = taxdms[0]
tax_dms_name = taxdms[0].name
# Read taxonomy name # Read taxonomy name
taxonomy_name = config['obi']['taxoURI'].split("/")[-1] # Robust in theory taxonomy_name = config['obi']['taxoURI'].split("/")[-1] # Robust in theory
@ -197,7 +205,8 @@ def run(config):
# TODO: primers in comments? # TODO: primers in comments?
if obi_ecopcr(i_dms.name_with_full_path, tobytes(i_view_name), tobytes(taxonomy_name), \ if obi_ecopcr(i_dms.name_with_full_path, tobytes(i_view_name),
tax_dms.name_with_full_path, tobytes(taxonomy_name), \
o_dms.name_with_full_path, tobytes(o_view_name), comments, \ o_dms.name_with_full_path, tobytes(o_view_name), comments, \
tobytes(config['ecopcr']['primer1']), tobytes(config['ecopcr']['primer2']), \ tobytes(config['ecopcr']['primer1']), tobytes(config['ecopcr']['primer2']), \
config['ecopcr']['error'], \ config['ecopcr']['error'], \

2
python/obitools3/commands/ngsfilter.pyx Normal file → Executable file
View File

@ -271,7 +271,7 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
sequences[0][REVERSE_QUALITY_COLUMN] = sequences[1].quality # used by alignpairedend tool sequences[0][REVERSE_QUALITY_COLUMN] = sequences[1].quality # used by alignpairedend tool
for seq in sequences: for seq in sequences:
if hasattr(seq, "quality_array"): if hasattr(seq, "quality_array") and seq.quality_array is not None:
q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array),0)/len(seq.quality_array)*10 q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array),0)/len(seq.quality_array)*10
seq[b'avg_quality']=q seq[b'avg_quality']=q
q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array[0:10]),0) q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array[0:10]),0)

View File

@ -119,9 +119,12 @@ def mean(values, options):
def variance(v): def variance(v):
if len(v)==1: if len(v)==1:
return 0 return 0
s = reduce(lambda x,y:(x[0]+y,x[1]+y**2),v,(0.,0.)) s = reduce(lambda x,y:(x[0]+y,x[1]+y**2),v,(0.,0.))
return s[1]/(len(v)-1) - s[0]**2/len(v)/(len(v)-1) var = round(s[1]/(len(v)-1) - s[0]**2/len(v)/(len(v)-1), 5) # round to go around shady python rounding stuff when var is actually 0
if var == -0.0: # then fix -0 to +0 if was rounded to -0
var = 0.0
return var
def varpop(values, options): def varpop(values, options):
@ -285,8 +288,8 @@ def run(config):
print((("%%%df" % lvarp[m]) % varp[m][c])+"\t", end="") print((("%%%df" % lvarp[m]) % varp[m][c])+"\t", end="")
for m in config['stats']['sd']: for m in config['stats']['sd']:
print((("%%%df" % lsigma[m]) % sigma[m][c])+"\t", end="") print((("%%%df" % lsigma[m]) % sigma[m][c])+"\t", end="")
print("%7d" %catcount[c], end="") print("%d" %catcount[c]+"\t", end="")
print("%9d" %totcount[c]) print("%d" %totcount[c]+"\t")
input[0].close(force=True) input[0].close(force=True)

View File

@ -8,6 +8,7 @@ cdef extern from "obi_ecopcr.h" nogil:
int obi_ecopcr(const char* input_dms_name, int obi_ecopcr(const char* input_dms_name,
const char* i_view_name, const char* i_view_name,
const char* tax_dms_name,
const char* taxonomy_name, const char* taxonomy_name,
const char* output_dms_name, const char* output_dms_name,
const char* o_view_name, const char* o_view_name,

View File

@ -7,7 +7,8 @@ __OBIDMS_COLUMN_CLASS__ = {}
from ..capi.obitypes cimport name_data_type, \ from ..capi.obitypes cimport name_data_type, \
obitype_t, \ obitype_t, \
obiversion_t, \ obiversion_t, \
OBI_QUAL OBI_QUAL, \
OBI_STR
from ..capi.obidms cimport obi_import_column from ..capi.obidms cimport obi_import_column
@ -128,6 +129,10 @@ cdef class Column(OBIWrapper) :
else: else:
elements_names_p = NULL elements_names_p = NULL
if column_name_b == b"SAMPLE" or column_name_b == b"sample":
# force str type
data_type = OBI_STR
if data_type == OBI_QUAL: if data_type == OBI_QUAL:
if associated_column_name_b == b"": if associated_column_name_b == b"":
if column_name == QUALITY_COLUMN: if column_name == QUALITY_COLUMN:

View File

@ -74,6 +74,9 @@ cdef class Column_str(Column_idx):
if value is None : if value is None :
value_b = <char*>OBIStr_NA value_b = <char*>OBIStr_NA
else : else :
if self.name == b'sample' or self.name == b'SAMPLE':
if type(value) == int:
value = str(value) # force sample ids to be str
value_bytes = tobytes(value) value_bytes = tobytes(value)
value_b = <char*>value_bytes value_b = <char*>value_bytes
@ -137,6 +140,9 @@ cdef class Column_multi_elts_str(Column_multi_elts_idx):
if value is None : if value is None :
value_b = <char*>OBIStr_NA value_b = <char*>OBIStr_NA
else : else :
if self.name == b'sample' or self.name == b'SAMPLE':
if type(value) == int:
value = str(value) # force sample ids to be str
value_bytes = tobytes(value) value_bytes = tobytes(value)
value_b = <char*>value_bytes value_b = <char*>value_bytes
@ -206,6 +212,9 @@ cdef class Column_tuples_str(Column_idx):
i = 0 i = 0
for elt in value : for elt in value :
if elt is not None and elt != '': if elt is not None and elt != '':
if self.name == b'sample' or self.name == b'SAMPLE':
if type(elt) == int:
elt = str(elt) # force sample ids to be str
elt_b = tobytes(elt) elt_b = tobytes(elt)
strcpy(array+i, <char*>elt_b) strcpy(array+i, <char*>elt_b)
i = i + len(elt_b) + 1 i = i + len(elt_b) + 1

View File

@ -1,5 +1,7 @@
#cython: language_level=3 #cython: language_level=3
import sys
from obitools3.utils cimport str2bytes, bytes2str, tobytes, tostr from obitools3.utils cimport str2bytes, bytes2str, tobytes, tostr
from ..capi.obidms cimport OBIDMS_p, obi_dms_get_full_path from ..capi.obidms cimport OBIDMS_p, obi_dms_get_full_path
@ -34,7 +36,7 @@ cdef class Taxonomy(OBIWrapper) :
return <OBIDMS_taxonomy_p>(self._pointer) return <OBIDMS_taxonomy_p>(self._pointer)
cdef fill_name_dict(self): cdef fill_name_dict(self):
print("Indexing taxon names...") print("Indexing taxon names...", file=sys.stderr)
cdef OBIDMS_taxonomy_p pointer = self.pointer() cdef OBIDMS_taxonomy_p pointer = self.pointer()
cdef ecotx_t* taxon_p cdef ecotx_t* taxon_p
@ -91,6 +93,8 @@ cdef class Taxonomy(OBIWrapper) :
raise RuntimeError("Error : Cannot read taxonomy %s" raise RuntimeError("Error : Cannot read taxonomy %s"
% tostr(name)) % tostr(name))
print("Taxonomy read", file=sys.stderr)
taxo = OBIWrapper.new_wrapper(Taxonomy, pointer) taxo = OBIWrapper.new_wrapper(Taxonomy, pointer)
dms.register(taxo) dms.register(taxo)
@ -146,7 +150,9 @@ cdef class Taxonomy(OBIWrapper) :
taxo._ranks = [] taxo._ranks = []
for r in range((<OBIDMS_taxonomy_p>pointer).ranks.count) : for r in range((<OBIDMS_taxonomy_p>pointer).ranks.count) :
taxo._ranks.append(obi_taxo_rank_index_to_label(r, (<OBIDMS_taxonomy_p>pointer).ranks)) taxo._ranks.append(obi_taxo_rank_index_to_label(r, (<OBIDMS_taxonomy_p>pointer).ranks))
print('Read %d taxa' % len(taxo), file=sys.stderr)
return taxo return taxo
@ -304,6 +310,11 @@ cdef class Taxonomy(OBIWrapper) :
def name(self): def name(self):
return self._name return self._name
# ranks property getter
@property
def ranks(self):
return self._ranks
def parental_tree_iterator(self, int taxid): def parental_tree_iterator(self, int taxid):
""" """

View File

@ -345,7 +345,7 @@ cdef class View(OBIWrapper) :
nb_elements_per_line=new_nb_elements_per_line, elements_names=new_elements_names, nb_elements_per_line=new_nb_elements_per_line, elements_names=new_elements_names,
dict_column=(new_nb_elements_per_line>1), comments=old_column.comments, alias=column_name_b+tobytes('___new___')) dict_column=(new_nb_elements_per_line>1), comments=old_column.comments, alias=column_name_b+tobytes('___new___'))
switch_to_dict = old_column.nb_elements_per_line == 1 and new_nb_elements_per_line > 1 switch_to_dict = not old_column.dict_column and new_nb_elements_per_line > 1
ori_key = old_column._elements_names[0] ori_key = old_column._elements_names[0]
for i in range(length) : for i in range(length) :

View File

@ -7,11 +7,12 @@ from obitools3.utils cimport bytes2str
cdef class FastaFormat: cdef class FastaFormat:
def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"): def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA", bint NAIntTo0=False):
self.headerFormatter = HeaderFormat("fasta", self.headerFormatter = HeaderFormat("fasta",
tags=tags, tags=tags,
printNAKeys=printNAKeys, printNAKeys=printNAKeys,
NAString=NAString) NAString=NAString,
NAIntTo0=NAIntTo0)
@cython.boundscheck(False) @cython.boundscheck(False)
def __call__(self, object data): def __call__(self, object data):

View File

@ -8,11 +8,12 @@ from obitools3.utils cimport bytes2str, str2bytes, tobytes
# TODO quality offset option? # TODO quality offset option?
cdef class FastqFormat: cdef class FastqFormat:
def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA"): def __init__(self, list tags=[], bint printNAKeys=False, bytes NAString=b"NA", bint NAIntTo0=False):
self.headerFormatter = HeaderFormat("fastq", self.headerFormatter = HeaderFormat("fastq",
tags=tags, tags=tags,
printNAKeys=printNAKeys, printNAKeys=printNAKeys,
NAString=NAString) NAString=NAString,
NAIntTo0=NAIntTo0)
@cython.boundscheck(False) @cython.boundscheck(False)
def __call__(self, object data): def __call__(self, object data):

View File

@ -4,5 +4,6 @@ cdef class HeaderFormat:
cdef set tags cdef set tags
cdef bint printNAKeys cdef bint printNAKeys
cdef bytes NAString cdef bytes NAString
cdef bint NAIntTo0
cdef size_t headerBufferLength cdef size_t headerBufferLength

View File

@ -8,13 +8,14 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
from obitools3.utils cimport str2bytes, bytes2str_object from obitools3.utils cimport str2bytes, bytes2str_object
from obitools3.dms.column.column cimport Column_line from obitools3.dms.column.column cimport Column_line
from obitools3.dms.column.typed_column.int cimport Column_int, Column_multi_elts_int
cdef class HeaderFormat: cdef class HeaderFormat:
SPECIAL_KEYS = [NUC_SEQUENCE_COLUMN, ID_COLUMN, DEFINITION_COLUMN, QUALITY_COLUMN] SPECIAL_KEYS = [NUC_SEQUENCE_COLUMN, ID_COLUMN, DEFINITION_COLUMN, QUALITY_COLUMN]
def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False, bytes NAString=b"NA"): def __init__(self, str format="fasta", list tags=[], bint printNAKeys=False, bytes NAString=b"NA", bint NAIntTo0=False):
''' '''
@param format: @param format:
@type format: `str` @type format: `str`
@ -32,6 +33,7 @@ cdef class HeaderFormat:
self.tags = set(tags) self.tags = set(tags)
self.printNAKeys = printNAKeys self.printNAKeys = printNAKeys
self.NAString = NAString self.NAString = NAString
self.NAIntTo0 = NAIntTo0
if format=="fasta": if format=="fasta":
self.start=b">" self.start=b">"
@ -57,17 +59,25 @@ cdef class HeaderFormat:
if k in tags: if k in tags:
value = data[k] value = data[k]
if value is None or (isinstance(value, Column_line) and value.is_NA()): if value is None or (isinstance(value, Column_line) and value.is_NA()):
if self.printNAKeys: if isinstance(data.view[k], Column_int) and self.NAIntTo0: # people want missing int values to be 0
value = b'0'
elif self.printNAKeys:
value = self.NAString value = self.NAString
else: else:
value = None value = None
else: else:
if type(value) == Column_line: if type(value) == Column_line:
value = value.bytes() if isinstance(data.view[k], Column_multi_elts_int) and self.NAIntTo0:
value = dict(value)
for key in data.view[k].keys():
if key not in value or value[key]:
value[key] = 0
else:
value = value.bytes()
else: else:
if type(value) == tuple: if type(value) == tuple:
value=list(value) value=list(value)
value = str2bytes(str(bytes2str_object(value))) # genius programming value = str2bytes(str(bytes2str_object(value))) # genius programming
if value is not None: if value is not None:
lines.append(k + b"=" + value + b";") lines.append(k + b"=" + value + b";")

View File

@ -4,5 +4,6 @@ cdef class TabFormat:
cdef bint header cdef bint header
cdef bint first_line cdef bint first_line
cdef bytes NAString cdef bytes NAString
cdef list tags cdef set tags
cdef bytes sep cdef bytes sep
cdef bint NAIntTo0

View File

@ -4,57 +4,70 @@ cimport cython
from obitools3.dms.view.view cimport Line from obitools3.dms.view.view cimport Line
from obitools3.utils cimport bytes2str_object, str2bytes, tobytes from obitools3.utils cimport bytes2str_object, str2bytes, tobytes
from obitools3.dms.column.column cimport Column_line, Column_multi_elts from obitools3.dms.column.column cimport Column_line, Column_multi_elts
from obitools3.dms.column.typed_column.int cimport Column_int, Column_multi_elts_int
import sys import sys
cdef class TabFormat: cdef class TabFormat:
def __init__(self, header=True, bytes NAString=b"NA", bytes sep=b"\t"): def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True):
self.tags = set(tags)
self.header = header self.header = header
self.first_line = True self.first_line = True
self.NAString = NAString self.NAString = NAString
self.sep = sep self.sep = sep
self.NAIntTo0 = NAIntTo0
@cython.boundscheck(False) @cython.boundscheck(False)
def __call__(self, object data): def __call__(self, object data):
cdef set ktags
cdef list tags = [key for key in data]
line = [] line = []
if self.first_line: if self.tags is not None and self.tags:
self.tags = [k for k in data.keys()] ktags = self.tags
else:
ktags = set(tags)
if self.header and self.first_line: if self.header and self.first_line:
for k in self.tags: for k in ktags:
if isinstance(data.view[k], Column_multi_elts): if k in tags:
keys = data.view[k].keys() if isinstance(data.view[k], Column_multi_elts):
keys.sort() keys = data.view[k].keys()
for k2 in keys: keys.sort()
line.append(tobytes(k)+b':'+tobytes(k2)) for k2 in keys:
else: line.append(tobytes(k)+b':'+tobytes(k2))
line.append(tobytes(k)) else:
line.append(tobytes(k))
r = self.sep.join(value for value in line) r = self.sep.join(value for value in line)
r += b'\n' r += b'\n'
line = [] line = []
for k in self.tags: for k in ktags:
value = data[k] if k in tags:
if isinstance(data.view[k], Column_multi_elts): value = data[k]
keys = data.view[k].keys() if isinstance(data.view[k], Column_multi_elts):
keys.sort() keys = data.view[k].keys()
if value is None: # all keys at None keys.sort()
for k2 in keys: # TODO could be much more efficient if value is None: # all keys at None
line.append(self.NAString) for k2 in keys: # TODO could be much more efficient
else:
for k2 in keys: # TODO could be much more efficient
if value[k2] is not None:
line.append(str2bytes(str(bytes2str_object(value[k2])))) # genius programming
else:
line.append(self.NAString) line.append(self.NAString)
else: else:
if value is not None: for k2 in keys: # TODO could be much more efficient
line.append(str2bytes(str(bytes2str_object(value)))) if value[k2] is not None:
line.append(str2bytes(str(bytes2str_object(value[k2])))) # genius programming
else:
if self.NAIntTo0 and isinstance(data.view[k], Column_multi_elts_int):
line.append(b"0")
else:
line.append(self.NAString)
else: else:
line.append(self.NAString) if value is not None or (self.NAIntTo0 and isinstance(data.view[k], Column_int)):
line.append(str2bytes(str(bytes2str_object(value))))
else:
line.append(self.NAString)
if self.header and self.first_line: if self.header and self.first_line:
r += self.sep.join(value for value in line) r += self.sep.join(value for value in line)

View File

@ -25,7 +25,7 @@ from libc.string cimport strcpy, strlen
_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN )',re.DOTALL + re.M) _featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN )',re.DOTALL + re.M)
_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M) _headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
_seqMatcher = re.compile(b'ORIGIN .+(?=//\n)', re.DOTALL + re.M) _seqMatcher = re.compile(b'^ORIGIN .+(?=//\n)', re.DOTALL + re.M)
_cleanSeq1 = re.compile(b'ORIGIN.+\n') _cleanSeq1 = re.compile(b'ORIGIN.+\n')
_cleanSeq2 = re.compile(b'[ \n0-9]+') _cleanSeq2 = re.compile(b'[ \n0-9]+')
_acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M) _acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M)

View File

@ -427,7 +427,21 @@ def open_uri(uri,
nastring=tobytes(config["obi"][nakey]) nastring=tobytes(config["obi"][nakey])
except KeyError: except KeyError:
nastring=b'NA' nastring=b'NA'
if b"na_int_to_0" in qualifiers:
try:
na_int_to_0=eval(qualifiers[b"na_int_to_0"][0])
except Exception as e:
raise MalformedURIException("Malformed 'NA_int_to_0' argument in URI")
else:
try:
na_int_to_0=config["obi"]["na_int_to_0"]
except KeyError:
if format==b"tabular":
na_int_to_0=True
else:
na_int_to_0=False
if b"stripwhite" in qualifiers: if b"stripwhite" in qualifiers:
try: try:
stripwhite=eval(qualifiers[b"stripwhite"][0]) stripwhite=eval(qualifiers[b"stripwhite"][0])
@ -462,6 +476,18 @@ def open_uri(uri,
except KeyError: except KeyError:
commentchar=b'#' commentchar=b'#'
if b"only_keys" in qualifiers:
only_keys=qualifiers[b"only_keys"][0] # not sure that works but no one ever uses qualifiers
else:
try:
only_keys_str=config["obi"]["only_keys"]
only_keys=[]
for key in only_keys_str:
only_keys.append(tobytes(key))
except KeyError:
only_keys=[]
if format is not None: if format is not None:
if seqtype==b"nuc": if seqtype==b"nuc":
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
@ -472,7 +498,7 @@ def open_uri(uri,
only=only, only=only,
nastring=nastring) nastring=nastring)
else: else:
iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), iseq = FastaNucWriter(FastaFormat(tags=only_keys, printNAKeys=printna, NAString=nastring),
file, file,
skip=skip, skip=skip,
only=only) only=only)
@ -485,7 +511,7 @@ def open_uri(uri,
noquality=noquality, noquality=noquality,
nastring=nastring) nastring=nastring)
else: else:
iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring), iseq = FastqWriter(FastqFormat(tags=only_keys, printNAKeys=printna, NAString=nastring),
file, file,
skip=skip, skip=skip,
only=only) only=only)
@ -521,7 +547,7 @@ def open_uri(uri,
skip = skip, skip = skip,
only = only) only = only)
else: else:
iseq = TabWriter(TabFormat(header=header, NAString=nastring, sep=sep), iseq = TabWriter(TabFormat(tags=only_keys, header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0),
file, file,
skip=skip, skip=skip,
only=only, only=only,
@ -557,7 +583,7 @@ def open_uri(uri,
commentchar) commentchar)
else: # default export is in fasta? or tab? TODO else: # default export is in fasta? or tab? TODO
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), iseq = FastaNucWriter(FastaFormat(tags=only_keys, printNAKeys=printna, NAString=nastring),
file, file,
skip=skip, skip=skip,
only=only) only=only)

View File

@ -1,5 +1,5 @@
major = 3 major = 3
minor = 0 minor = 0
serial= '1b9' serial= '1b12'
version ="%d.%d.%s" % (major,minor,serial) version ="%d.%d.%s" % (major,minor,serial)

View File

@ -20,8 +20,6 @@ cdef class TabWriter:
self.only = -1 self.only = -1
else: else:
self.only = int(only) self.only = int(only)
if header:
self.only += 1
self.formatter = formatter self.formatter = formatter
self.output = output_object self.output = output_object

View File

@ -645,7 +645,8 @@ static int print_seq(Obiview_p i_view, Obiview_p o_view,
int obi_ecopcr(const char* i_dms_name, int obi_ecopcr(const char* i_dms_name,
const char* i_view_name, const char* i_view_name,
const char* taxonomy_name, // TODO discuss that input dms assumed const char* tax_dms_name,
const char* taxonomy_name,
const char* o_dms_name, const char* o_dms_name,
const char* o_view_name, const char* o_view_name,
const char* o_view_comments, const char* o_view_comments,
@ -678,6 +679,7 @@ int obi_ecopcr(const char* i_dms_name,
OBIDMS_p i_dms = NULL; OBIDMS_p i_dms = NULL;
OBIDMS_p o_dms = NULL; OBIDMS_p o_dms = NULL;
OBIDMS_p tax_dms = NULL;
OBIDMS_taxonomy_p taxonomy = NULL; OBIDMS_taxonomy_p taxonomy = NULL;
Obiview_p i_view = NULL; Obiview_p i_view = NULL;
Obiview_p o_view = NULL; Obiview_p o_view = NULL;
@ -965,8 +967,16 @@ int obi_ecopcr(const char* i_dms_name,
return -1; return -1;
} }
// Open taxonomy DMS
tax_dms = obi_open_dms(tax_dms_name, false);
if (tax_dms == NULL)
{
obidebug(1, "\nError opening the taxonomy DMS");
return -1;
}
// Open the taxonomy // Open the taxonomy
taxonomy = obi_read_taxonomy(i_dms, taxonomy_name, false); taxonomy = obi_read_taxonomy(tax_dms, taxonomy_name, false);
if (taxonomy == NULL) if (taxonomy == NULL)
{ {
obidebug(1, "\nError opening the taxonomy"); obidebug(1, "\nError opening the taxonomy");

View File

@ -77,7 +77,8 @@
* *
* @param i_dms_name The path to the input DMS. * @param i_dms_name The path to the input DMS.
* @param i_view_name The name of the input view. * @param i_view_name The name of the input view.
* @param taxonomy_name The name of the taxonomy in the input DMS. * @param tax_dms_name The path to the DMS containing the taxonomy.
* @param taxonomy_name The name of the taxonomy.
* @param o_dms_name The path to the output DMS. * @param o_dms_name The path to the output DMS.
* @param o_view_name The name of the output view. * @param o_view_name The name of the output view.
* @param o_view_comments The comments to associate with the output view. * @param o_view_comments The comments to associate with the output view.
@ -106,6 +107,7 @@
*/ */
int obi_ecopcr(const char* i_dms_name, int obi_ecopcr(const char* i_dms_name,
const char* i_view_name, const char* i_view_name,
const char* tax_dms_name,
const char* taxonomy_name, const char* taxonomy_name,
const char* o_dms_name, const char* o_dms_name,
const char* o_view_name, const char* o_view_name,

View File

@ -1417,7 +1417,7 @@ char* obi_dms_formatted_infos(OBIDMS_p dms, bool detailed)
char* view_name = NULL; char* view_name = NULL;
char* tax_name = NULL; char* tax_name = NULL;
char* all_tax_dir_path = NULL; char* all_tax_dir_path = NULL;
int i; int i, last_dot_pos;
struct dirent* dp; struct dirent* dp;
Obiview_p view; Obiview_p view;
@ -1439,17 +1439,21 @@ char* obi_dms_formatted_infos(OBIDMS_p dms, bool detailed)
if ((dp->d_name)[0] == '.') if ((dp->d_name)[0] == '.')
continue; continue;
i=0; i=0;
while ((dp->d_name)[i] != '.') while (i < strlen(dp->d_name))
{
if ((dp->d_name)[i] == '.')
last_dot_pos = i;
i++; i++;
view_name = (char*) malloc((i+1) * sizeof(char)); }
view_name = (char*) malloc((last_dot_pos+1) * sizeof(char));
if (view_name == NULL) if (view_name == NULL)
{ {
obi_set_errno(OBI_MALLOC_ERROR); obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for a view name when getting formatted DMS infos: file %s", dp->d_name); obidebug(1, "\nError allocating memory for a view name when getting formatted DMS infos: file %s", dp->d_name);
return NULL; return NULL;
} }
strncpy(view_name, dp->d_name, i); strncpy(view_name, dp->d_name, last_dot_pos);
view_name[i] = '\0'; view_name[last_dot_pos] = '\0';
view = obi_open_view(dms, view_name); view = obi_open_view(dms, view_name);
if (view == NULL) if (view == NULL)
{ {

View File

@ -873,7 +873,7 @@ static ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* loc
taxa_index->buffer_size = taxa_index->count; taxa_index->buffer_size = taxa_index->count;
taxa_index->max_taxid = 0; taxa_index->max_taxid = 0;
printf("Reading %d taxa...\n", count_taxa); fprintf(stderr, "Reading %d taxa...\n", count_taxa);
for (i=0; i<count_taxa; i++) for (i=0; i<count_taxa; i++)
{ {
readnext_ecotaxon(f_taxa, &(taxa_index->taxon[i])); readnext_ecotaxon(f_taxa, &(taxa_index->taxon[i]));
@ -886,9 +886,9 @@ static ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* loc
} }
if (count_local_taxa > 0) if (count_local_taxa > 0)
printf("Reading %d local taxa...\n", count_local_taxa); fprintf(stderr, "Reading %d local taxa...\n", count_local_taxa);
else else
printf("No local taxa\n"); fprintf(stderr, "No local taxa\n");
count_taxa = taxa_index->count; count_taxa = taxa_index->count;

View File

@ -2910,7 +2910,7 @@ int obi_clean_unfinished_views(OBIDMS_p dms)
if ((dp->d_name)[0] == '.') if ((dp->d_name)[0] == '.')
continue; continue;
i=0; i=0;
while ((dp->d_name)[i] != '.') while (strncmp((dp->d_name)+i, ".obiview", 8))
i++; i++;
relative_path = (char*) malloc(strlen(VIEW_DIR_NAME) + strlen(dp->d_name) + 2); relative_path = (char*) malloc(strlen(VIEW_DIR_NAME) + strlen(dp->d_name) + 2);
strcpy(relative_path, VIEW_DIR_NAME); strcpy(relative_path, VIEW_DIR_NAME);