Compare commits

...

26 Commits

Author SHA1 Message Date
55ada80500 import: made ngsfilter file parsing more resilient and switching to
version 3.0.1b19
2022-07-15 16:02:21 +12:00
ef9d9674b0 obi import: added SINTAX format import and switch to version 3.0.1b18 2022-05-17 09:36:33 +12:00
4f39bb2418 switch to version 3.0.1b17 2022-05-03 10:55:36 +12:00
0a2b8adb50 import: added import of UNITE fasta format 2022-05-03 10:54:41 +12:00
f9b99a9397 annotate: fixed a bug where a column type could be wrongly guessed and
switch to version 3.0.1b16
2022-03-30 16:32:07 +13:00
ce2833c04b switch to version v3.0.1b15 2022-02-25 17:48:44 +13:00
f64b3da30b split command 2022-02-25 17:44:18 +13:00
388b3e0410 removed a trace 2021-11-11 15:53:27 +13:00
c9db990b83 switch to version 3.0.1b14 2021-11-11 15:28:00 +13:00
3f253feb5e Cython: View: fixed keys method to get list of view keys 2021-11-11 15:27:32 +13:00
85d2bab607 small fix 2021-11-11 15:26:48 +13:00
53b3d81137 small fixes and improvements 2021-11-11 15:26:09 +13:00
f6353fbf28 obi export: added options to export to metabaR compatible format 2021-11-11 15:24:12 +13:00
5a8b9dca5d goes with previous commit 2021-11-11 15:12:04 +13:00
8bd6d6c8e9 Python: URI decoding: now properly checking that paths can be encoded in
ASCII (issue #89)
2021-11-02 11:17:59 +13:00
405e6ef420 Python: URI decoding: added metabaR output 2021-11-02 11:16:29 +13:00
fedacfafe7 switch to version 3.0.1b13 2021-09-13 11:46:17 +12:00
2d66e0e965 python: genbank parser: better handling of white spaces 2021-09-13 11:44:38 +12:00
f43856b712 switch to version 3.0.1b12 2021-09-08 10:56:55 +12:00
9e0c319806 Cython: fixed rewriting of column when rewriting a 1 element dict column 2021-09-08 10:54:23 +12:00
58b42cd977 C: views: now correctly parses view names containing '.' when cleaning
unfinished views. Closes #115
2021-09-08 10:52:42 +12:00
34de90bce6 ngsfilter: checks better if there is an associated sequencing quality 2021-09-08 10:30:11 +12:00
4be9f36f99 stats: fixed the computation of variance when it is equal to 0 2021-08-05 11:32:16 +12:00
f10e78ba3c C: fixed the printing of view informations from a DMS (fixes #114) 2021-08-05 11:31:24 +12:00
88c8463ed7 Cython: taxonomy: improved logging 2021-08-05 11:29:20 +12:00
89168271ef ecopcr: now accepting taxonomy from a different DMS than the reference
sequences
2021-08-05 11:28:57 +12:00
24 changed files with 404 additions and 67 deletions

View File

@ -57,6 +57,18 @@ def __addImportInputOption(optionManager):
const=b'rdp', const=b'rdp',
help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.") help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--unite-input',
action="store_const", dest="obi:inputformat",
default=None,
const=b'unite',
help="Input file is in UNITE fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--sintax-input',
action="store_const", dest="obi:inputformat",
default=None,
const=b'sintax',
help="Input file is in SINTAX fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--embl-input', group.add_argument('--embl-input',
action="store_const", dest="obi:inputformat", action="store_const", dest="obi:inputformat",
default=None, default=None,
@ -297,6 +309,29 @@ def __addExportOutputOption(optionManager):
const=b'tabular', const=b'tabular',
help="Output file is in tabular format") help="Output file is in tabular format")
group.add_argument('--metabaR-output',
action="store_const", dest="obi:outputformat",
default=None,
const=b'metabaR',
help="Export the files needed by the obifiles_to_metabarlist function of the metabaR package")
group.add_argument('--metabaR-prefix',
action="store", dest="obi:metabarprefix",
type=str,
help="Prefix for the files when using --metabaR-output option")
group.add_argument('--metabaR-ngsfilter',
action="store", dest="obi:metabarngsfilter",
type=str,
default=None,
help="URI to the ngsfilter view when using --metabaR-output option (if not provided, it is not exported)")
group.add_argument('--metabaR-samples',
action="store", dest="obi:metabarsamples",
type=str,
default=None,
help="URI to the sample metadata view when using --metabaR-output option (if not provided, it is built as just a list of the sample names)")
group.add_argument('--only-keys', group.add_argument('--only-keys',
action="append", dest="obi:only_keys", action="append", dest="obi:only_keys",
type=str, type=str,

View File

@ -16,6 +16,8 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
QUALITY_COLUMN, \ QUALITY_COLUMN, \
COUNT_COLUMN, \ COUNT_COLUMN, \
TAXID_COLUMN TAXID_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_STR
from obitools3.dms.column.column cimport Column
import time import time
import math import math
@ -187,6 +189,8 @@ def sequenceTaggerGenerator(config, taxo=None):
else: else:
scn=None scn=None
seq[rank]=rtaxid seq[rank]=rtaxid
if "%s_name"%rank not in seq.view:
Column.new_column(seq.view, "%s_name"%rank, OBI_STR)
seq["%s_name"%rank]=scn seq["%s_name"%rank]=scn
if add_rank: if add_rank:

View File

@ -175,6 +175,14 @@ def run(config):
o_dms_name = output[0].name o_dms_name = output[0].name
o_view_name = output[1] o_view_name = output[1]
# Open the taxonomy DMS
taxdms = open_uri(config['obi']['taxoURI'],
dms_only=True)
if taxdms is None:
raise Exception("Could not open taxonomy DMS")
tax_dms = taxdms[0]
tax_dms_name = taxdms[0].name
# Read taxonomy name # Read taxonomy name
taxonomy_name = config['obi']['taxoURI'].split("/")[-1] # Robust in theory taxonomy_name = config['obi']['taxoURI'].split("/")[-1] # Robust in theory
@ -197,7 +205,8 @@ def run(config):
# TODO: primers in comments? # TODO: primers in comments?
if obi_ecopcr(i_dms.name_with_full_path, tobytes(i_view_name), tobytes(taxonomy_name), \ if obi_ecopcr(i_dms.name_with_full_path, tobytes(i_view_name),
tax_dms.name_with_full_path, tobytes(taxonomy_name), \
o_dms.name_with_full_path, tobytes(o_view_name), comments, \ o_dms.name_with_full_path, tobytes(o_view_name), comments, \
tobytes(config['ecopcr']['primer1']), tobytes(config['ecopcr']['primer2']), \ tobytes(config['ecopcr']['primer1']), tobytes(config['ecopcr']['primer2']), \
config['ecopcr']['error'], \ config['ecopcr']['error'], \

View File

@ -6,6 +6,9 @@ from obitools3.apps.config import logger
from obitools3.dms import DMS from obitools3.dms import DMS
from obitools3.dms.obiseq import Nuc_Seq from obitools3.dms.obiseq import Nuc_Seq
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN from obitools3.dms.capi.obiview cimport QUALITY_COLUMN
from obitools3.writers.tab import TabWriter
from obitools3.format.tab import TabFormat
from obitools3.utils cimport tobytes, tostr
from obitools3.apps.optiongroups import addMinimalInputOption, \ from obitools3.apps.optiongroups import addMinimalInputOption, \
addExportOutputOption, \ addExportOutputOption, \
@ -76,6 +79,13 @@ def run(config):
else: else:
pb = ProgressBar(withoutskip - skip, config) pb = ProgressBar(withoutskip - skip, config)
if config['obi']['outputformat'] == b'metabaR':
# Check prefix
if "metabarprefix" not in config["obi"]:
raise Exception("Prefix needed when exporting for metabaR (--metabaR-prefix option)")
else:
metabaRprefix = config["obi"]["metabarprefix"]
i=0 i=0
for seq in iview : for seq in iview :
PyErr_CheckSignals() PyErr_CheckSignals()
@ -91,6 +101,81 @@ def run(config):
pb(i, force=True) pb(i, force=True)
print("", file=sys.stderr) print("", file=sys.stderr)
if config['obi']['outputformat'] == b'metabaR':
# Export ngsfilter file if view provided
if 'metabarngsfilter' in config['obi']:
ngsfilter_input = open_uri(config['obi']['metabarngsfilter'])
if ngsfilter_input is None:
raise Exception("Could not read ngsfilter view for metabaR output")
ngsfilter_view = ngsfilter_input[1]
ngsfilter_output = open(config['obi']['metabarprefix']+'.ngsfilter', 'w')
for line in ngsfilter_view:
line_to_print = b""
line_to_print += line[b'experiment']
line_to_print += b"\t"
line_to_print += line[b'sample']
line_to_print += b"\t"
line_to_print += line[b'forward_tag']
line_to_print += b":"
line_to_print += line[b'reverse_tag']
line_to_print += b"\t"
line_to_print += line[b'forward_primer']
line_to_print += b"\t"
line_to_print += line[b'reverse_primer']
line_to_print += b"\t"
line_to_print += line[b'additional_info']
print(tostr(line_to_print), file=ngsfilter_output)
if ngsfilter_input[0] != input[0]:
ngsfilter_input[0].close()
ngsfilter_output.close()
# Export sample metadata
samples_output = open(config['obi']['metabarprefix']+'_samples.csv', 'w')
# Export sample metadata file if view provided
if 'metabarsamples' in config['obi']:
samples_input = open_uri(config['obi']['metabarsamples'])
if samples_input is None:
raise Exception("Could not read sample view for metabaR output")
samples_view = samples_input[1]
# Export with tab formatter
TabWriter(TabFormat(header=True, sep='\t',),
samples_output,
header=True)
if samples_input[0] != input[0]:
samples_input[0].close()
# Else export just sample names from main view
else:
sample_list = []
if 'MERGED_sample' in iview:
sample_list = iview['MERGED_sample'].keys()
elif 'sample' not in iview:
for seq in iview:
sample = seq['sample']
if sample not in sample_list:
sample_list.append(sample)
else:
logger("warning", "Can not read sample list from main view for metabaR sample list export")
print("sample_id", file=samples_output)
for sample in sample_list:
line_to_print = b""
line_to_print += sample
line_to_print += b"\t"
print(tostr(line_to_print), file=samples_output)
samples_output.close()
# TODO save command in input dms? # TODO save command in input dms?
if not BrokenPipeError and not IOError: if not BrokenPipeError and not IOError:

View File

@ -91,7 +91,7 @@ def addOptions(parser):
metavar="<ATTRIBUTE_NAME>", metavar="<ATTRIBUTE_NAME>",
help="Select records with the attribute <ATTRIBUTE_NAME> " help="Select records with the attribute <ATTRIBUTE_NAME> "
"defined (not set to NA value). " "defined (not set to NA value). "
"Several -a options can be used on the same " "Several -A options can be used on the same "
"command line.") "command line.")
group.add_argument("-L", "--lmax", group.add_argument("-L", "--lmax",

View File

@ -2,6 +2,7 @@
import sys import sys
import os import os
import re
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.view.view cimport View from obitools3.dms.view.view cimport View
@ -108,6 +109,8 @@ def run(config):
cdef bint NUC_SEQS_view cdef bint NUC_SEQS_view
cdef bint silva cdef bint silva
cdef bint rdp cdef bint rdp
cdef bint unite
cdef bint sintax
cdef int nb_elts cdef int nb_elts
cdef object d cdef object d
cdef View view cdef View view
@ -233,16 +236,27 @@ def run(config):
def_col = view[DEFINITION_COLUMN] def_col = view[DEFINITION_COLUMN]
seq_col = view[NUC_SEQUENCE_COLUMN] seq_col = view[NUC_SEQUENCE_COLUMN]
# Prepare taxon scientific name and taxid refs if RDP or SILVA file # Prepare taxon scientific name and taxid refs if RDP/SILVA/UNITE/SINTAX formats
silva = False silva = False
rdp = False rdp = False
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp"): unite = False
sintax=False
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or \
config['obi']['inputformat'] == b"rdp" or \
config['obi']['inputformat'] == b"unite" or \
config['obi']['inputformat'] == b"sintax"):
#if taxo is None: #if taxo is None:
# raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files") # raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
silva = True if config['obi']['inputformat'] == b"silva":
rdp = True silva = True
elif config['obi']['inputformat'] == b"rdp":
rdp = True
elif config['obi']['inputformat'] == b"unite":
unite = True
elif config['obi']['inputformat'] == b"sintax":
sintax = True
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
if taxo is not None: if taxo is not None:
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT) taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
dcols = {} dcols = {}
@ -349,17 +363,39 @@ def run(config):
if get_quality: if get_quality:
qual_col[i] = entry.quality qual_col[i] = entry.quality
# Parse taxon scientific name if RDP file # Parse taxon scientific name if RDP or Silva or Unite file
if (rdp or silva) and taxo is not None: if (rdp or silva or unite or sintax):
sci_names = entry.definition.split(b";") if rdp or silva:
for sci_name in reversed(sci_names): sci_names = entry.definition.split(b";")
if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome' : sci_name_col[i] = sci_names[-1]
taxon = taxo.get_taxon_by_name(sci_name) elif unite:
if taxon is not None: sci_names = entry.id.split(b'|')[-1].split(b';')
sci_name_col[i] = taxon.name sci_name_col[i] = re.sub(b'[a-zA-Z]__', b'', sci_names[-1])
taxid_col[i] = taxon.taxid elif sintax:
#print(taxid_col[i], sci_name_col[i]) reconstructed_line = entry.id+b' '+entry.definition[:-1]
break splitted_reconstructed_line = reconstructed_line.split(b';')
taxa = splitted_reconstructed_line[1].split(b'=')[1]
taxa = splitted_reconstructed_line[1].split(b',')
sci_names = []
for t in taxa:
tf = t.split(b':')[1]
sci_names.append(tf)
sci_name_col[i] = sci_names[-1]
id_col[i] = reconstructed_line.split(b';')[0]
def_col[i] = reconstructed_line
# Fond taxid if taxonomy provided
if taxo is not None :
for sci_name in reversed(sci_names):
if unite:
sci_name = re.sub(b'[a-zA-Z]__', b'', sci_name)
if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome':
taxon = taxo.get_taxon_by_name(sci_name)
if taxon is not None:
sci_name_col[i] = taxon.name
taxid_col[i] = taxon.taxid
#print(taxid_col[i], sci_name_col[i])
break
for tag in entry : for tag in entry :

2
python/obitools3/commands/ngsfilter.pyx Normal file → Executable file
View File

@ -271,7 +271,7 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
sequences[0][REVERSE_QUALITY_COLUMN] = sequences[1].quality # used by alignpairedend tool sequences[0][REVERSE_QUALITY_COLUMN] = sequences[1].quality # used by alignpairedend tool
for seq in sequences: for seq in sequences:
if hasattr(seq, "quality_array"): if hasattr(seq, "quality_array") and seq.quality_array is not None:
q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array),0)/len(seq.quality_array)*10 q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array),0)/len(seq.quality_array)*10
seq[b'avg_quality']=q seq[b'avg_quality']=q
q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array[0:10]),0) q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array[0:10]),0)

View File

@ -0,0 +1,105 @@
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms import DMS
from obitools3.dms.view.view cimport View, Line_selection
from obitools3.uri.decode import open_uri
from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyOption, addMinimalOutputOption, addNoProgressBarOption
from obitools3.dms.view import RollbackException
from obitools3.apps.config import logger
from obitools3.utils cimport tobytes
import sys
from cpython.exc cimport PyErr_CheckSignals
__title__="Split"
def addOptions(parser):
addMinimalInputOption(parser)
addNoProgressBarOption(parser)
group=parser.add_argument_group("obi split specific options")
group.add_argument('-p','--prefix',
action="store", dest="split:prefix",
metavar="<PREFIX>",
help="Prefix added to each subview name (included undefined)")
group.add_argument('-t','--tag-name',
action="store", dest="split:tagname",
metavar="<TAG_NAME>",
help="Attribute/tag used to split the input")
group.add_argument('-u','--undefined',
action="store", dest="split:undefined",
default=b'UNDEFINED',
metavar="<VIEW_NAME>",
help="Name of the view where undefined sequenced are stored (will be PREFIX_VIEW_NAME)")
def run(config):
DMS.obi_atexit()
logger("info", "obi split")
# Open the input
input = open_uri(config["obi"]["inputURI"])
if input is None:
raise Exception("Could not read input view")
i_dms = input[0]
i_view = input[1]
# Initialize the progress bar
if config['obi']['noprogressbar'] == False:
pb = ProgressBar(len(i_view), config)
else:
pb = None
tag_to_split = config["split"]["tagname"]
undefined = tobytes(config["split"]["undefined"])
selections = {}
# Go through input view and split
for i in range(len(i_view)):
PyErr_CheckSignals()
if pb is not None:
pb(i)
line = i_view[i]
if tag_to_split not in line or line[tag_to_split] is None or len(line[tag_to_split])==0:
value = undefined
else:
value = line[tag_to_split]
if value not in selections:
selections[value] = Line_selection(i_view)
selections[value].append(i)
if pb is not None:
pb(len(i_view), force=True)
print("", file=sys.stderr)
# Create output views with the line selection
try:
for cat in selections:
o_view_name = config["split"]["prefix"].encode()+cat
o_view = selections[cat].materialize(o_view_name)
# Save command config in View and DMS comments
command_line = " ".join(sys.argv[1:])
input_dms_name=[input[0].name]
input_view_name=[input[1].name]
if 'taxoURI' in config['obi'] and config['obi']['taxoURI'] is not None:
input_dms_name.append(config['obi']['taxoURI'].split("/")[-3])
input_view_name.append("taxonomy/"+config['obi']['taxoURI'].split("/")[-1])
o_view.write_config(config, "split", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
o_view.close()
except Exception, e:
raise RollbackException("obi split error, rollbacking view: "+str(e), o_view)
i_dms.record_command_line(command_line)
i_dms.close(force=True)
logger("info", "Done.")

View File

@ -119,9 +119,12 @@ def mean(values, options):
def variance(v): def variance(v):
if len(v)==1: if len(v)==1:
return 0 return 0
s = reduce(lambda x,y:(x[0]+y,x[1]+y**2),v,(0.,0.)) s = reduce(lambda x,y:(x[0]+y,x[1]+y**2),v,(0.,0.))
return s[1]/(len(v)-1) - s[0]**2/len(v)/(len(v)-1) var = round(s[1]/(len(v)-1) - s[0]**2/len(v)/(len(v)-1), 5) # round to go around shady python rounding stuff when var is actually 0
if var == -0.0: # then fix -0 to +0 if was rounded to -0
var = 0.0
return var
def varpop(values, options): def varpop(values, options):

View File

@ -8,6 +8,7 @@ cdef extern from "obi_ecopcr.h" nogil:
int obi_ecopcr(const char* input_dms_name, int obi_ecopcr(const char* input_dms_name,
const char* i_view_name, const char* i_view_name,
const char* tax_dms_name,
const char* taxonomy_name, const char* taxonomy_name,
const char* output_dms_name, const char* output_dms_name,
const char* o_view_name, const char* o_view_name,

View File

@ -93,6 +93,8 @@ cdef class Taxonomy(OBIWrapper) :
raise RuntimeError("Error : Cannot read taxonomy %s" raise RuntimeError("Error : Cannot read taxonomy %s"
% tostr(name)) % tostr(name))
print("Taxonomy read", file=sys.stderr)
taxo = OBIWrapper.new_wrapper(Taxonomy, pointer) taxo = OBIWrapper.new_wrapper(Taxonomy, pointer)
dms.register(taxo) dms.register(taxo)

View File

@ -345,7 +345,7 @@ cdef class View(OBIWrapper) :
nb_elements_per_line=new_nb_elements_per_line, elements_names=new_elements_names, nb_elements_per_line=new_nb_elements_per_line, elements_names=new_elements_names,
dict_column=(new_nb_elements_per_line>1), comments=old_column.comments, alias=column_name_b+tobytes('___new___')) dict_column=(new_nb_elements_per_line>1), comments=old_column.comments, alias=column_name_b+tobytes('___new___'))
switch_to_dict = old_column.nb_elements_per_line == 1 and new_nb_elements_per_line > 1 switch_to_dict = not old_column.dict_column and new_nb_elements_per_line > 1
ori_key = old_column._elements_names[0] ori_key = old_column._elements_names[0]
for i in range(length) : for i in range(length) :
@ -799,7 +799,8 @@ cdef class Line :
def keys(self): def keys(self):
return self._view.keys() cdef bytes key
return [key for key in self._view.keys()]
def __contains__(self, object column_name): def __contains__(self, object column_name):

View File

@ -6,4 +6,6 @@ cdef class TabFormat:
cdef bytes NAString cdef bytes NAString
cdef set tags cdef set tags
cdef bytes sep cdef bytes sep
cdef bint NAIntTo0 cdef bint NAIntTo0
cdef bint metabaR
cdef bint ngsfilter

View File

@ -10,13 +10,15 @@ import sys
cdef class TabFormat: cdef class TabFormat:
def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True): def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True, metabaR=False, ngsfilter=False):
self.tags = set(tags) self.tags = set(tags)
self.header = header self.header = header
self.first_line = True self.first_line = True
self.NAString = NAString self.NAString = NAString
self.sep = sep self.sep = sep
self.NAIntTo0 = NAIntTo0 self.NAIntTo0 = NAIntTo0
self.metabaR = metabaR
self.ngsfilter = ngsfilter
@cython.boundscheck(False) @cython.boundscheck(False)
def __call__(self, object data): def __call__(self, object data):
@ -34,13 +36,21 @@ cdef class TabFormat:
if self.header and self.first_line: if self.header and self.first_line:
for k in ktags: for k in ktags:
if k in tags: if k in tags:
if self.metabaR:
if k == b'NUC_SEQ':
ktoprint = b'sequence'
else:
ktoprint = k.lower()
ktoprint = ktoprint.replace(b'merged_', b'')
else:
ktoprint = k
if isinstance(data.view[k], Column_multi_elts): if isinstance(data.view[k], Column_multi_elts):
keys = data.view[k].keys() keys = data.view[k].keys()
keys.sort() keys.sort()
for k2 in keys: for k2 in keys:
line.append(tobytes(k)+b':'+tobytes(k2)) line.append(tobytes(ktoprint)+b':'+tobytes(k2))
else: else:
line.append(tobytes(k)) line.append(tobytes(ktoprint))
r = self.sep.join(value for value in line) r = self.sep.join(value for value in line)
r += b'\n' r += b'\n'
line = [] line = []

View File

@ -22,11 +22,11 @@ from libc.stdlib cimport free, malloc, realloc
from libc.string cimport strcpy, strlen from libc.string cimport strcpy, strlen
_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN )',re.DOTALL + re.M) _featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN(\s*))',re.DOTALL + re.M)
_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M) _headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
_seqMatcher = re.compile(b'^ORIGIN .+(?=//\n)', re.DOTALL + re.M) _seqMatcher = re.compile(b'^ORIGIN.+(?=//\n)', re.DOTALL + re.M)
_cleanSeq1 = re.compile(b'ORIGIN.+\n') _cleanSeq1 = re.compile(b'ORIGIN(\s*)\n')
_cleanSeq2 = re.compile(b'[ \n0-9]+') _cleanSeq2 = re.compile(b'[ \n0-9]+')
_acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M) _acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M)
_deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M) _deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M)
@ -155,10 +155,10 @@ def genbankIterator_file(lineiterator,
yield seq yield seq
read+=1 read+=1
# Last sequence # Last sequence if not empty lines
seq = genbankParser(entry) if entry.strip():
seq = genbankParser(entry)
yield seq yield seq
free(entry) free(entry)

View File

@ -48,24 +48,25 @@ def ngsfilterIterator(lineiterator,
all_lines.insert(0, firstline) all_lines.insert(0, firstline)
# Insert header for column names # Insert header for column names
column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"] column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"] #,b"additional_info"]
header = out_sep.join(column_names) header = out_sep.join(column_names)
new_lines.append(header) new_lines.append(header)
for line in all_lines: for line in all_lines:
split_line = line.split() split_line = line.split(maxsplit=5)
tags = split_line.pop(2) if split_line:
tags = tags.split(b":") tags = split_line.pop(2)
for t_idx in range(len(tags)): tags = tags.split(b":")
if tags[t_idx]==b"-" or tags[t_idx]==b"None" or tags[t_idx]==b"": for t_idx in range(len(tags)):
tags[t_idx] = nastring if tags[t_idx]==b"-" or tags[t_idx]==b"None" or tags[t_idx]==b"":
if len(tags) == 1: # Forward and reverse tags are the same tags[t_idx] = nastring
tags.append(tags[0]) if len(tags) == 1: # Forward and reverse tags are the same
split_line.insert(2, tags[0]) tags.append(tags[0])
split_line.insert(3, tags[1]) split_line.insert(2, tags[0])
new_lines.append(out_sep.join(split_line[0:6])) split_line.insert(3, tags[1])
new_lines.append(out_sep.join(split_line[0:6]))
return tabIterator(iter(new_lines), return tabIterator(iter(new_lines),
header = True, header = True,
sep = out_sep, sep = out_sep,

View File

@ -173,7 +173,10 @@ def open_uri(uri,
type newviewtype=View, type newviewtype=View,
dms_only=False, dms_only=False,
force_file=False): force_file=False):
if type(uri) == str and not uri.isascii():
raise Exception("Paths must be ASCII characters only")
cdef bytes urib = tobytes(uri) cdef bytes urib = tobytes(uri)
cdef bytes scheme cdef bytes scheme
cdef tuple dms cdef tuple dms
@ -277,7 +280,12 @@ def open_uri(uri,
iseq = urib iseq = urib
objclass = bytes objclass = bytes
else: # TODO update uopen to be able to write? else: # TODO update uopen to be able to write?
if not urip.path or urip.path == b'-': if config['obi']['outputformat'] == b'metabaR':
if 'metabarprefix' not in config['obi']:
raise Exception("Prefix needed when exporting for metabaR (--metabaR-prefix option)")
else:
file = open(config['obi']['metabarprefix']+'.tab', 'wb')
elif not urip.path or urip.path == b'-':
file = sys.stdout.buffer file = sys.stdout.buffer
else: else:
file = open(urip.path, 'wb') file = open(urip.path, 'wb')
@ -299,11 +307,11 @@ def open_uri(uri,
format=config["obi"][formatkey] format=config["obi"][formatkey]
except KeyError: except KeyError:
format=None format=None
if b'seqtype' in qualifiers: if b'seqtype' in qualifiers:
seqtype=qualifiers[b'seqtype'][0] seqtype=qualifiers[b'seqtype'][0]
else: else:
if format == b"ngsfilter" or format == b"tabular": # TODO discuss if format == b"ngsfilter" or format == b"tabular" or format == b"metabaR": # TODO discuss
seqtype=None seqtype=None
else: else:
try: try:
@ -437,7 +445,7 @@ def open_uri(uri,
try: try:
na_int_to_0=config["obi"]["na_int_to_0"] na_int_to_0=config["obi"]["na_int_to_0"]
except KeyError: except KeyError:
if format==b"tabular": if format==b"tabular" or format==b"metabaR":
na_int_to_0=True na_int_to_0=True
else: else:
na_int_to_0=False na_int_to_0=False
@ -487,11 +495,18 @@ def open_uri(uri,
except KeyError: except KeyError:
only_keys=[] only_keys=[]
if b"metabaR_prefix" in qualifiers:
metabaR_prefix = tobytes(qualifiers[b"metabaR_prefix"][0][0])
else:
try:
metabaR_prefix = tobytes(config["obi"]["metabarprefix"])
except KeyError:
metabaR_prefix=None
if format is not None: if format is not None:
if seqtype==b"nuc": if seqtype==b"nuc":
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
if format==b"fasta" or format==b"silva" or format==b"rdp": if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite" or format == b"sintax":
if input: if input:
iseq = fastaNucIterator(file, iseq = fastaNucIterator(file,
skip=skip, skip=skip,
@ -552,6 +567,16 @@ def open_uri(uri,
skip=skip, skip=skip,
only=only, only=only,
header=header) header=header)
elif format==b"metabaR":
objclass = dict
if input:
raise NotImplementedError('Input data file format not implemented')
else:
iseq = TabWriter(TabFormat(tags=only_keys, header=header, NAString=nastring, sep=sep, NAIntTo0=na_int_to_0, metabaR=True),
file,
skip=skip,
only=only,
header=header)
elif format==b"ngsfilter": elif format==b"ngsfilter":
objclass = dict objclass = dict
if input: if input:
@ -565,7 +590,7 @@ def open_uri(uri,
skip = skip, skip = skip,
only = only) only = only)
else: else:
raise NotImplementedError('Output sequence file format not implemented') raise NotImplementedError('Output data file format not implemented')
else: else:
if input: if input:
iseq, objclass, format = entryIteratorFactory(file, iseq, objclass, format = entryIteratorFactory(file,

View File

@ -264,7 +264,7 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value) :
if new_value == None or new_type==list or new_type==dict or new_type==tuple: if new_value == None or new_type==list or new_type==dict or new_type==tuple:
return obitype return obitype
# TODO BOOL vers INT/FLOAT # TODO BOOL to INT/FLOAT
if new_type == str or new_type == bytes : if new_type == str or new_type == bytes :
if obitype == OBI_SEQ and is_a_DNA_seq(tobytes(new_value)) : if obitype == OBI_SEQ and is_a_DNA_seq(tobytes(new_value)) :
pass pass

View File

@ -1,5 +1,5 @@
major = 3 major = 3
minor = 0 minor = 0
serial= '1b11' serial= '1b19'
version ="%d.%d.%s" % (major,minor,serial) version ="%d.%d.%s" % (major,minor,serial)

View File

@ -77,6 +77,7 @@ static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_
{ {
ecotx_t* taxon = NULL; ecotx_t* taxon = NULL;
ecotx_t* lca = NULL; ecotx_t* lca = NULL;
ecotx_t* lca1 = NULL;
int32_t taxid; int32_t taxid;
index_t taxid_idx; index_t taxid_idx;
int64_t taxid_str_idx; int64_t taxid_str_idx;
@ -108,10 +109,11 @@ static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_
else else
{ {
// Compute LCA // Compute LCA
lca1 = lca;
lca = obi_taxo_get_lca(taxon, lca); lca = obi_taxo_get_lca(taxon, lca);
if (lca == NULL) if (lca == NULL)
{ {
obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database"); obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database, %d %d", taxid, lca1->taxid);
return NULL; return NULL;
} }
} }
@ -185,7 +187,7 @@ int build_reference_db(const char* dms_name,
matrix_view_name = strcpy(matrix_view_name, o_view_name); matrix_view_name = strcpy(matrix_view_name, o_view_name);
strcat(matrix_view_name, "_matrix"); strcat(matrix_view_name, "_matrix");
fprintf(stderr, "Aligning queries with reference database...\n"); fprintf(stderr, "Aligning sequences...\n");
if (obi_lcs_align_one_column(dms_name, if (obi_lcs_align_one_column(dms_name,
refs_view_name, refs_view_name,
"", "",

View File

@ -645,7 +645,8 @@ static int print_seq(Obiview_p i_view, Obiview_p o_view,
int obi_ecopcr(const char* i_dms_name, int obi_ecopcr(const char* i_dms_name,
const char* i_view_name, const char* i_view_name,
const char* taxonomy_name, // TODO discuss that input dms assumed const char* tax_dms_name,
const char* taxonomy_name,
const char* o_dms_name, const char* o_dms_name,
const char* o_view_name, const char* o_view_name,
const char* o_view_comments, const char* o_view_comments,
@ -678,6 +679,7 @@ int obi_ecopcr(const char* i_dms_name,
OBIDMS_p i_dms = NULL; OBIDMS_p i_dms = NULL;
OBIDMS_p o_dms = NULL; OBIDMS_p o_dms = NULL;
OBIDMS_p tax_dms = NULL;
OBIDMS_taxonomy_p taxonomy = NULL; OBIDMS_taxonomy_p taxonomy = NULL;
Obiview_p i_view = NULL; Obiview_p i_view = NULL;
Obiview_p o_view = NULL; Obiview_p o_view = NULL;
@ -965,8 +967,16 @@ int obi_ecopcr(const char* i_dms_name,
return -1; return -1;
} }
// Open taxonomy DMS
tax_dms = obi_open_dms(tax_dms_name, false);
if (tax_dms == NULL)
{
obidebug(1, "\nError opening the taxonomy DMS");
return -1;
}
// Open the taxonomy // Open the taxonomy
taxonomy = obi_read_taxonomy(i_dms, taxonomy_name, false); taxonomy = obi_read_taxonomy(tax_dms, taxonomy_name, false);
if (taxonomy == NULL) if (taxonomy == NULL)
{ {
obidebug(1, "\nError opening the taxonomy"); obidebug(1, "\nError opening the taxonomy");

View File

@ -77,7 +77,8 @@
* *
* @param i_dms_name The path to the input DMS. * @param i_dms_name The path to the input DMS.
* @param i_view_name The name of the input view. * @param i_view_name The name of the input view.
* @param taxonomy_name The name of the taxonomy in the input DMS. * @param tax_dms_name The path to the DMS containing the taxonomy.
* @param taxonomy_name The name of the taxonomy.
* @param o_dms_name The path to the output DMS. * @param o_dms_name The path to the output DMS.
* @param o_view_name The name of the output view. * @param o_view_name The name of the output view.
* @param o_view_comments The comments to associate with the output view. * @param o_view_comments The comments to associate with the output view.
@ -106,6 +107,7 @@
*/ */
int obi_ecopcr(const char* i_dms_name, int obi_ecopcr(const char* i_dms_name,
const char* i_view_name, const char* i_view_name,
const char* tax_dms_name,
const char* taxonomy_name, const char* taxonomy_name,
const char* o_dms_name, const char* o_dms_name,
const char* o_view_name, const char* o_view_name,

View File

@ -1417,7 +1417,7 @@ char* obi_dms_formatted_infos(OBIDMS_p dms, bool detailed)
char* view_name = NULL; char* view_name = NULL;
char* tax_name = NULL; char* tax_name = NULL;
char* all_tax_dir_path = NULL; char* all_tax_dir_path = NULL;
int i; int i, last_dot_pos;
struct dirent* dp; struct dirent* dp;
Obiview_p view; Obiview_p view;
@ -1439,17 +1439,21 @@ char* obi_dms_formatted_infos(OBIDMS_p dms, bool detailed)
if ((dp->d_name)[0] == '.') if ((dp->d_name)[0] == '.')
continue; continue;
i=0; i=0;
while ((dp->d_name)[i] != '.') while (i < strlen(dp->d_name))
{
if ((dp->d_name)[i] == '.')
last_dot_pos = i;
i++; i++;
view_name = (char*) malloc((i+1) * sizeof(char)); }
view_name = (char*) malloc((last_dot_pos+1) * sizeof(char));
if (view_name == NULL) if (view_name == NULL)
{ {
obi_set_errno(OBI_MALLOC_ERROR); obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for a view name when getting formatted DMS infos: file %s", dp->d_name); obidebug(1, "\nError allocating memory for a view name when getting formatted DMS infos: file %s", dp->d_name);
return NULL; return NULL;
} }
strncpy(view_name, dp->d_name, i); strncpy(view_name, dp->d_name, last_dot_pos);
view_name[i] = '\0'; view_name[last_dot_pos] = '\0';
view = obi_open_view(dms, view_name); view = obi_open_view(dms, view_name);
if (view == NULL) if (view == NULL)
{ {

View File

@ -2910,7 +2910,7 @@ int obi_clean_unfinished_views(OBIDMS_p dms)
if ((dp->d_name)[0] == '.') if ((dp->d_name)[0] == '.')
continue; continue;
i=0; i=0;
while ((dp->d_name)[i] != '.') while (strncmp((dp->d_name)+i, ".obiview", 8))
i++; i++;
relative_path = (char*) malloc(strlen(VIEW_DIR_NAME) + strlen(dp->d_name) + 2); relative_path = (char*) malloc(strlen(VIEW_DIR_NAME) + strlen(dp->d_name) + 2);
strcpy(relative_path, VIEW_DIR_NAME); strcpy(relative_path, VIEW_DIR_NAME);