Compare commits

..

13 Commits

13 changed files with 279 additions and 26 deletions

View File

@ -297,6 +297,29 @@ def __addExportOutputOption(optionManager):
const=b'tabular', const=b'tabular',
help="Output file is in tabular format") help="Output file is in tabular format")
group.add_argument('--metabaR-output',
action="store_const", dest="obi:outputformat",
default=None,
const=b'metabaR',
help="Export the files needed by the obifiles_to_metabarlist function of the metabaR package")
group.add_argument('--metabaR-prefix',
action="store", dest="obi:metabarprefix",
type=str,
help="Prefix for the files when using --metabaR-output option")
group.add_argument('--metabaR-ngsfilter',
action="store", dest="obi:metabarngsfilter",
type=str,
default=None,
help="URI to the ngsfilter view when using --metabaR-output option (if not provided, it is not exported)")
group.add_argument('--metabaR-samples',
action="store", dest="obi:metabarsamples",
type=str,
default=None,
help="URI to the sample metadata view when using --metabaR-output option (if not provided, it is built as just a list of the sample names)")
group.add_argument('--only-keys', group.add_argument('--only-keys',
action="append", dest="obi:only_keys", action="append", dest="obi:only_keys",
type=str, type=str,

View File

@ -6,6 +6,9 @@ from obitools3.apps.config import logger
from obitools3.dms import DMS from obitools3.dms import DMS
from obitools3.dms.obiseq import Nuc_Seq from obitools3.dms.obiseq import Nuc_Seq
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN from obitools3.dms.capi.obiview cimport QUALITY_COLUMN
from obitools3.writers.tab import TabWriter
from obitools3.format.tab import TabFormat
from obitools3.utils cimport tobytes, tostr
from obitools3.apps.optiongroups import addMinimalInputOption, \ from obitools3.apps.optiongroups import addMinimalInputOption, \
addExportOutputOption, \ addExportOutputOption, \
@ -76,6 +79,13 @@ def run(config):
else: else:
pb = ProgressBar(withoutskip - skip, config) pb = ProgressBar(withoutskip - skip, config)
if config['obi']['outputformat'] == b'metabaR':
# Check prefix
if "metabarprefix" not in config["obi"]:
raise Exception("Prefix needed when exporting for metabaR (--metabaR-prefix option)")
else:
metabaRprefix = config["obi"]["metabarprefix"]
i=0 i=0
for seq in iview : for seq in iview :
PyErr_CheckSignals() PyErr_CheckSignals()
@ -91,6 +101,81 @@ def run(config):
pb(i, force=True) pb(i, force=True)
print("", file=sys.stderr) print("", file=sys.stderr)
if config['obi']['outputformat'] == b'metabaR':
# Export ngsfilter file if view provided
if 'metabarngsfilter' in config['obi']:
ngsfilter_input = open_uri(config['obi']['metabarngsfilter'])
if ngsfilter_input is None:
raise Exception("Could not read ngsfilter view for metabaR output")
ngsfilter_view = ngsfilter_input[1]
ngsfilter_output = open(config['obi']['metabarprefix']+'.ngsfilter', 'w')
for line in ngsfilter_view:
line_to_print = b""
line_to_print += line[b'experiment']
line_to_print += b"\t"
line_to_print += line[b'sample']
line_to_print += b"\t"
line_to_print += line[b'forward_tag']
line_to_print += b":"
line_to_print += line[b'reverse_tag']
line_to_print += b"\t"
line_to_print += line[b'forward_primer']
line_to_print += b"\t"
line_to_print += line[b'reverse_primer']
line_to_print += b"\t"
line_to_print += line[b'additional_info']
print(tostr(line_to_print), file=ngsfilter_output)
if ngsfilter_input[0] != input[0]:
ngsfilter_input[0].close()
ngsfilter_output.close()
# Export sample metadata
samples_output = open(config['obi']['metabarprefix']+'_samples.csv', 'w')
# Export sample metadata file if view provided
if 'metabarsamples' in config['obi']:
samples_input = open_uri(config['obi']['metabarsamples'])
if samples_input is None:
raise Exception("Could not read sample view for metabaR output")
samples_view = samples_input[1]
# Export with tab formatter
TabWriter(TabFormat(header=True, sep='\t',),
samples_output,
header=True)
if samples_input[0] != input[0]:
samples_input[0].close()
# Else export just sample names from main view
else:
sample_list = []
if 'MERGED_sample' in iview:
sample_list = iview['MERGED_sample'].keys()
elif 'sample' not in iview:
for seq in iview:
sample = seq['sample']
if sample not in sample_list:
sample_list.append(sample)
else:
logger("warning", "Can not read sample list from main view for metabaR sample list export")
print("sample_id", file=samples_output)
for sample in sample_list:
line_to_print = b""
line_to_print += sample
line_to_print += b"\t"
print(tostr(line_to_print), file=samples_output)
samples_output.close()
# TODO save command in input dms? # TODO save command in input dms?
if not BrokenPipeError and not IOError: if not BrokenPipeError and not IOError:

View File

@ -91,7 +91,7 @@ def addOptions(parser):
metavar="<ATTRIBUTE_NAME>", metavar="<ATTRIBUTE_NAME>",
help="Select records with the attribute <ATTRIBUTE_NAME> " help="Select records with the attribute <ATTRIBUTE_NAME> "
"defined (not set to NA value). " "defined (not set to NA value). "
"Several -a options can be used on the same " "Several -A options can be used on the same "
"command line.") "command line.")
group.add_argument("-L", "--lmax", group.add_argument("-L", "--lmax",

View File

@ -0,0 +1,105 @@
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms import DMS
from obitools3.dms.view.view cimport View, Line_selection
from obitools3.uri.decode import open_uri
from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyOption, addMinimalOutputOption, addNoProgressBarOption
from obitools3.dms.view import RollbackException
from obitools3.apps.config import logger
from obitools3.utils cimport tobytes
import sys
from cpython.exc cimport PyErr_CheckSignals
__title__="Split"
def addOptions(parser):
addMinimalInputOption(parser)
addNoProgressBarOption(parser)
group=parser.add_argument_group("obi split specific options")
group.add_argument('-p','--prefix',
action="store", dest="split:prefix",
metavar="<PREFIX>",
help="Prefix added to each subview name (included undefined)")
group.add_argument('-t','--tag-name',
action="store", dest="split:tagname",
metavar="<TAG_NAME>",
help="Attribute/tag used to split the input")
group.add_argument('-u','--undefined',
action="store", dest="split:undefined",
default=b'UNDEFINED',
metavar="<VIEW_NAME>",
help="Name of the view where undefined sequenced are stored (will be PREFIX_VIEW_NAME)")
def run(config):
DMS.obi_atexit()
logger("info", "obi split")
# Open the input
input = open_uri(config["obi"]["inputURI"])
if input is None:
raise Exception("Could not read input view")
i_dms = input[0]
i_view = input[1]
# Initialize the progress bar
if config['obi']['noprogressbar'] == False:
pb = ProgressBar(len(i_view), config)
else:
pb = None
tag_to_split = config["split"]["tagname"]
undefined = tobytes(config["split"]["undefined"])
selections = {}
# Go through input view and split
for i in range(len(i_view)):
PyErr_CheckSignals()
if pb is not None:
pb(i)
line = i_view[i]
if tag_to_split not in line or line[tag_to_split] is None or len(line[tag_to_split])==0:
value = undefined
else:
value = line[tag_to_split]
if value not in selections:
selections[value] = Line_selection(i_view)
selections[value].append(i)
if pb is not None:
pb(len(i_view), force=True)
print("", file=sys.stderr)
# Create output views with the line selection
try:
for cat in selections:
o_view_name = config["split"]["prefix"].encode()+cat
o_view = selections[cat].materialize(o_view_name)
# Save command config in View and DMS comments
command_line = " ".join(sys.argv[1:])
input_dms_name=[input[0].name]
input_view_name=[input[1].name]
if 'taxoURI' in config['obi'] and config['obi']['taxoURI'] is not None:
input_dms_name.append(config['obi']['taxoURI'].split("/")[-3])
input_view_name.append("taxonomy/"+config['obi']['taxoURI'].split("/")[-1])
o_view.write_config(config, "split", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
o_view.close()
except Exception, e:
raise RollbackException("obi split error, rollbacking view: "+str(e), o_view)
i_dms.record_command_line(command_line)
i_dms.close(force=True)
logger("info", "Done.")

View File

@ -799,7 +799,8 @@ cdef class Line :
def keys(self): def keys(self):
return self._view.keys() cdef bytes key
return [key for key in self._view.keys()]
def __contains__(self, object column_name): def __contains__(self, object column_name):

View File

@ -6,4 +6,6 @@ cdef class TabFormat:
cdef bytes NAString cdef bytes NAString
cdef set tags cdef set tags
cdef bytes sep cdef bytes sep
cdef bint NAIntTo0 cdef bint NAIntTo0
cdef bint metabaR
cdef bint ngsfilter

View File

@ -10,13 +10,15 @@ import sys
cdef class TabFormat: cdef class TabFormat:
def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True): def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True, metabaR=False, ngsfilter=False):
self.tags = set(tags) self.tags = set(tags)
self.header = header self.header = header
self.first_line = True self.first_line = True
self.NAString = NAString self.NAString = NAString
self.sep = sep self.sep = sep
self.NAIntTo0 = NAIntTo0 self.NAIntTo0 = NAIntTo0
self.metabaR = metabaR
self.ngsfilter = ngsfilter
@cython.boundscheck(False) @cython.boundscheck(False)
def __call__(self, object data): def __call__(self, object data):
@ -34,13 +36,21 @@ cdef class TabFormat:
if self.header and self.first_line: if self.header and self.first_line:
for k in ktags: for k in ktags:
if k in tags: if k in tags:
if self.metabaR:
if k == b'NUC_SEQ':
ktoprint = b'sequence'
else:
ktoprint = k.lower()
ktoprint = ktoprint.replace(b'merged_', b'')
else:
ktoprint = k
if isinstance(data.view[k], Column_multi_elts): if isinstance(data.view[k], Column_multi_elts):
keys = data.view[k].keys() keys = data.view[k].keys()
keys.sort() keys.sort()
for k2 in keys: for k2 in keys:
line.append(tobytes(k)+b':'+tobytes(k2)) line.append(tobytes(ktoprint)+b':'+tobytes(k2))
else: else:
line.append(tobytes(k)) line.append(tobytes(ktoprint))
r = self.sep.join(value for value in line) r = self.sep.join(value for value in line)
r += b'\n' r += b'\n'
line = [] line = []

View File

@ -22,11 +22,11 @@ from libc.stdlib cimport free, malloc, realloc
from libc.string cimport strcpy, strlen from libc.string cimport strcpy, strlen
_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN )',re.DOTALL + re.M) _featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN(\s*))',re.DOTALL + re.M)
_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M) _headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
_seqMatcher = re.compile(b'^ORIGIN .+(?=//\n)', re.DOTALL + re.M) _seqMatcher = re.compile(b'^ORIGIN.+(?=//\n)', re.DOTALL + re.M)
_cleanSeq1 = re.compile(b'ORIGIN.+\n') _cleanSeq1 = re.compile(b'ORIGIN(\s*)\n')
_cleanSeq2 = re.compile(b'[ \n0-9]+') _cleanSeq2 = re.compile(b'[ \n0-9]+')
_acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M) _acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M)
_deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M) _deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M)
@ -155,10 +155,10 @@ def genbankIterator_file(lineiterator,
yield seq yield seq
read+=1 read+=1
# Last sequence # Last sequence if not empty lines
seq = genbankParser(entry) if entry.strip():
seq = genbankParser(entry)
yield seq yield seq
free(entry) free(entry)

View File

@ -48,13 +48,13 @@ def ngsfilterIterator(lineiterator,
all_lines.insert(0, firstline) all_lines.insert(0, firstline)
# Insert header for column names # Insert header for column names
column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"] column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer",b"additional_info"]
header = out_sep.join(column_names) header = out_sep.join(column_names)
new_lines.append(header) new_lines.append(header)
for line in all_lines: for line in all_lines:
split_line = line.split() split_line = line.split(maxsplit=5)
tags = split_line.pop(2) tags = split_line.pop(2)
tags = tags.split(b":") tags = tags.split(b":")
for t_idx in range(len(tags)): for t_idx in range(len(tags)):
@ -64,7 +64,7 @@ def ngsfilterIterator(lineiterator,
tags.append(tags[0]) tags.append(tags[0])
split_line.insert(2, tags[0]) split_line.insert(2, tags[0])
split_line.insert(3, tags[1]) split_line.insert(3, tags[1])
new_lines.append(out_sep.join(split_line[0:6])) new_lines.append(out_sep.join(split_line[0:7]))
return tabIterator(iter(new_lines), return tabIterator(iter(new_lines),
header = True, header = True,

View File

@ -173,7 +173,10 @@ def open_uri(uri,
type newviewtype=View, type newviewtype=View,
dms_only=False, dms_only=False,
force_file=False): force_file=False):
if type(uri) == str and not uri.isascii():
raise Exception("Paths must be ASCII characters only")
cdef bytes urib = tobytes(uri) cdef bytes urib = tobytes(uri)
cdef bytes scheme cdef bytes scheme
cdef tuple dms cdef tuple dms
@ -277,7 +280,12 @@ def open_uri(uri,
iseq = urib iseq = urib
objclass = bytes objclass = bytes
else: # TODO update uopen to be able to write? else: # TODO update uopen to be able to write?
if not urip.path or urip.path == b'-': if config['obi']['outputformat'] == b'metabaR':
if 'metabarprefix' not in config['obi']:
raise Exception("Prefix needed when exporting for metabaR (--metabaR-prefix option)")
else:
file = open(config['obi']['metabarprefix']+'.tab', 'wb')
elif not urip.path or urip.path == b'-':
file = sys.stdout.buffer file = sys.stdout.buffer
else: else:
file = open(urip.path, 'wb') file = open(urip.path, 'wb')
@ -299,11 +307,11 @@ def open_uri(uri,
format=config["obi"][formatkey] format=config["obi"][formatkey]
except KeyError: except KeyError:
format=None format=None
if b'seqtype' in qualifiers: if b'seqtype' in qualifiers:
seqtype=qualifiers[b'seqtype'][0] seqtype=qualifiers[b'seqtype'][0]
else: else:
if format == b"ngsfilter" or format == b"tabular": # TODO discuss if format == b"ngsfilter" or format == b"tabular" or format == b"metabaR": # TODO discuss
seqtype=None seqtype=None
else: else:
try: try:
@ -437,7 +445,7 @@ def open_uri(uri,
try: try:
na_int_to_0=config["obi"]["na_int_to_0"] na_int_to_0=config["obi"]["na_int_to_0"]
except KeyError: except KeyError:
if format==b"tabular": if format==b"tabular" or format==b"metabaR":
na_int_to_0=True na_int_to_0=True
else: else:
na_int_to_0=False na_int_to_0=False
@ -487,6 +495,13 @@ def open_uri(uri,
except KeyError: except KeyError:
only_keys=[] only_keys=[]
if b"metabaR_prefix" in qualifiers:
metabaR_prefix = tobytes(qualifiers[b"metabaR_prefix"][0][0])
else:
try:
metabaR_prefix = tobytes(config["obi"]["metabarprefix"])
except KeyError:
metabaR_prefix=None
if format is not None: if format is not None:
if seqtype==b"nuc": if seqtype==b"nuc":
@ -552,6 +567,16 @@ def open_uri(uri,
skip=skip, skip=skip,
only=only, only=only,
header=header) header=header)
elif format==b"metabaR":
objclass = dict
if input:
raise NotImplementedError('Input data file format not implemented')
else:
iseq = TabWriter(TabFormat(tags=only_keys, header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0, metabaR=True),
file,
skip=skip,
only=only,
header=header)
elif format==b"ngsfilter": elif format==b"ngsfilter":
objclass = dict objclass = dict
if input: if input:
@ -565,7 +590,7 @@ def open_uri(uri,
skip = skip, skip = skip,
only = only) only = only)
else: else:
raise NotImplementedError('Output sequence file format not implemented') raise NotImplementedError('Output data file format not implemented')
else: else:
if input: if input:
iseq, objclass, format = entryIteratorFactory(file, iseq, objclass, format = entryIteratorFactory(file,

View File

@ -264,7 +264,7 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value) :
if new_value == None or new_type==list or new_type==dict or new_type==tuple: if new_value == None or new_type==list or new_type==dict or new_type==tuple:
return obitype return obitype
# TODO BOOL vers INT/FLOAT # TODO BOOL to INT/FLOAT
if new_type == str or new_type == bytes : if new_type == str or new_type == bytes :
if obitype == OBI_SEQ and is_a_DNA_seq(tobytes(new_value)) : if obitype == OBI_SEQ and is_a_DNA_seq(tobytes(new_value)) :
pass pass

View File

@ -1,5 +1,5 @@
major = 3 major = 3
minor = 0 minor = 0
serial= '1b12' serial= '1b15'
version ="%d.%d.%s" % (major,minor,serial) version ="%d.%d.%s" % (major,minor,serial)

View File

@ -77,6 +77,7 @@ static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_
{ {
ecotx_t* taxon = NULL; ecotx_t* taxon = NULL;
ecotx_t* lca = NULL; ecotx_t* lca = NULL;
ecotx_t* lca1 = NULL;
int32_t taxid; int32_t taxid;
index_t taxid_idx; index_t taxid_idx;
int64_t taxid_str_idx; int64_t taxid_str_idx;
@ -108,10 +109,11 @@ static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_
else else
{ {
// Compute LCA // Compute LCA
lca1 = lca;
lca = obi_taxo_get_lca(taxon, lca); lca = obi_taxo_get_lca(taxon, lca);
if (lca == NULL) if (lca == NULL)
{ {
obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database"); obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database, %d %d", taxid, lca1->taxid);
return NULL; return NULL;
} }
} }
@ -185,7 +187,7 @@ int build_reference_db(const char* dms_name,
matrix_view_name = strcpy(matrix_view_name, o_view_name); matrix_view_name = strcpy(matrix_view_name, o_view_name);
strcat(matrix_view_name, "_matrix"); strcat(matrix_view_name, "_matrix");
fprintf(stderr, "Aligning queries with reference database...\n"); fprintf(stderr, "Aligning sequences...\n");
if (obi_lcs_align_one_column(dms_name, if (obi_lcs_align_one_column(dms_name,
refs_view_name, refs_view_name,
"", "",