Compare commits

...

15 Commits

Author SHA1 Message Date
8ec13a294c Switch to version 3.0.1b9 2021-06-01 09:21:43 +12:00
3e45c34491 import: now imports and adds taxids for SILVA and RDP files, added
import of lists, fixed skipping of errors (was not overwriting), and
fixed --no-progress-bar option
2021-06-01 09:21:07 +12:00
c2f3d90dc1 build_ref_db: set default threshold to 0.99 2021-06-01 09:11:17 +12:00
6b732d11d3 align: fixed column URI parsing 2021-06-01 09:10:21 +12:00
9eb833a0af typo fix 2021-06-01 09:09:16 +12:00
6b7b0e3bd1 cat: fixed the handling of dictionary columns 2021-06-01 09:06:13 +12:00
47691a8f58 count: added option to specify the count column 2021-06-01 09:05:14 +12:00
b908b581c8 clean: hid not implemented option 2021-06-01 09:04:22 +12:00
03c174fd7a grep: added taxonomy check 2021-05-31 17:03:39 +12:00
2156588ff6 added TODO comment 2021-05-31 17:01:57 +12:00
6ff29c6a6a Increased maximum line count to 1E12 2021-05-31 17:00:55 +12:00
51a3c68fb5 C: build_reference_db: fixed gcc warning/error 2021-05-31 16:59:17 +12:00
da91ffc2c7 URI decoding: fixed reading a taxonomy before any view 2021-05-31 16:57:20 +12:00
c884615522 obi stats: various fixes and improvements 2021-05-31 16:51:06 +12:00
cb53381863 ecotag: BEST_MATCH_TAXIDS now dereplicated (no repeated taxids in the
list) and switch to version 3.0.1b8
2021-05-10 16:02:06 +12:00
18 changed files with 230 additions and 86 deletions

View File

@ -49,7 +49,13 @@ def __addImportInputOption(optionManager):
action="store_const", dest="obi:inputformat", action="store_const", dest="obi:inputformat",
default=None, default=None,
const=b'silva', const=b'silva',
help="Input file is in SILVA fasta format") help="Input file is in SILVA fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--rdp-input',
action="store_const", dest="obi:inputformat",
default=None,
const=b'rdp',
help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--embl-input', group.add_argument('--embl-input',
action="store_const", dest="obi:inputformat", action="store_const", dest="obi:inputformat",

View File

@ -45,7 +45,7 @@ def addOptions(parser):
metavar="<TAXID_TAG>", metavar="<TAXID_TAG>",
default=b"TAXID", default=b"TAXID",
help="Name of the tag to store the found taxid " help="Name of the tag to store the found taxid "
"(default: 'TAXID'.") "(default: 'TAXID').")
group.add_argument('-n', '--taxon-name-tag', group.add_argument('-n', '--taxon-name-tag',
action="store", action="store",
@ -53,7 +53,7 @@ def addOptions(parser):
metavar="<SCIENTIFIC_NAME_TAG>", metavar="<SCIENTIFIC_NAME_TAG>",
default=b"SCIENTIFIC_NAME", default=b"SCIENTIFIC_NAME",
help="Name of the tag giving the scientific name of the taxon " help="Name of the tag giving the scientific name of the taxon "
"(default: 'SCIENTIFIC_NAME'.") "(default: 'SCIENTIFIC_NAME').")
group.add_argument('-g', '--try-genus-match', group.add_argument('-g', '--try-genus-match',
action="store_true", dest="addtaxids:try_genus_match", action="store_true", dest="addtaxids:try_genus_match",
@ -174,6 +174,7 @@ def run(config):
taxid_column[i] = taxon.taxid taxid_column[i] = taxon.taxid
found_count+=1 found_count+=1
elif try_genus: # try finding genus or other parent taxon from the first word elif try_genus: # try finding genus or other parent taxon from the first word
#print(i, o_view[i].id)
taxon_name_sp = taxon_name.split(b" ") taxon_name_sp = taxon_name.split(b" ")
taxon = taxo.get_taxon_by_name(taxon_name_sp[0], res_anc) taxon = taxo.get_taxon_by_name(taxon_name_sp[0], res_anc)
if taxon is not None: if taxon is not None:

View File

@ -158,7 +158,7 @@ def run(config):
i_view_name = i_uri.split(b"/")[0] i_view_name = i_uri.split(b"/")[0]
i_column_name = b"" i_column_name = b""
i_element_name = b"" i_element_name = b""
if len(i_uri.split(b"/")) == 2: if len(i_uri.split(b"/")) >= 2:
i_column_name = i_uri.split(b"/")[1] i_column_name = i_uri.split(b"/")[1]
if len(i_uri.split(b"/")) == 3: if len(i_uri.split(b"/")) == 3:
i_element_name = i_uri.split(b"/")[2] i_element_name = i_uri.split(b"/")[2]
@ -181,7 +181,7 @@ def run(config):
i_dms_name_2 = i_dms_2.name i_dms_name_2 = i_dms_2.name
i_uri_2 = input_2[1] i_uri_2 = input_2[1]
original_i_view_name_2 = i_uri_2.split(b"/")[0] original_i_view_name_2 = i_uri_2.split(b"/")[0]
if len(i_uri_2.split(b"/")) == 2: if len(i_uri_2.split(b"/")) >= 2:
i_column_name_2 = i_uri_2.split(b"/")[1] i_column_name_2 = i_uri_2.split(b"/")[1]
if len(i_uri_2.split(b"/")) == 3: if len(i_uri_2.split(b"/")) == 3:
i_element_name_2 = i_uri_2.split(b"/")[2] i_element_name_2 = i_uri_2.split(b"/")[2]

View File

@ -31,10 +31,9 @@ def addOptions(parser):
group.add_argument('--threshold','-t', group.add_argument('--threshold','-t',
action="store", dest="build_ref_db:threshold", action="store", dest="build_ref_db:threshold",
metavar='<THRESHOLD>', metavar='<THRESHOLD>',
default=0.0, default=0.99,
type=float, type=float,
help="Score threshold as a normalized identity, e.g. 0.95 for an identity of 95%%. Default: 0.00" help="Score threshold as a normalized identity, e.g. 0.95 for an identity of 95%%. Default: 0.99.")
" (no threshold).")
def run(config): def run(config):

View File

@ -97,7 +97,7 @@ def run(config):
Column.new_column(o_view, REVERSE_QUALITY_COLUMN, OBI_QUAL, associated_column_name=REVERSE_SEQUENCE_COLUMN, associated_column_version=o_view[REVERSE_SEQUENCE_COLUMN].version) Column.new_column(o_view, REVERSE_QUALITY_COLUMN, OBI_QUAL, associated_column_name=REVERSE_SEQUENCE_COLUMN, associated_column_version=o_view[REVERSE_SEQUENCE_COLUMN].version)
# Initialize multiple elements columns # Initialize multiple elements columns
if type(output_0)==BufferedWriter: if type(output_0)!=BufferedWriter:
dict_cols = {} dict_cols = {}
for v_uri in config["cat"]["views_to_cat"]: for v_uri in config["cat"]["views_to_cat"]:
v = open_uri(v_uri)[1] v = open_uri(v_uri)[1]

View File

@ -54,11 +54,11 @@ def addOptions(parser):
default=False, default=False,
help="Only sequences labeled as heads are kept in the output. Default: False") help="Only sequences labeled as heads are kept in the output. Default: False")
group.add_argument('--cluster-tags', '-C', # group.add_argument('--cluster-tags', '-C',
action="store_true", # action="store_true",
dest="clean:cluster-tags", # dest="clean:cluster-tags",
default=False, # default=False,
help="Adds tags for each sequence giving its cluster's head and weight for each sample.") # help="Adds tags for each sequence giving its cluster's head and weight for each sample.")
group.add_argument('--thread-count','-p', # TODO should probably be in a specific option group group.add_argument('--thread-count','-p', # TODO should probably be in a specific option group
action="store", dest="clean:thread-count", action="store", dest="clean:thread-count",
@ -142,4 +142,5 @@ def run(config):
i_dms.close(force=True) i_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -29,6 +29,12 @@ def addOptions(parser):
default=False, default=False,
help="Prints only the total count of sequence records (if a sequence has no `count` attribute, its default count is 1) (default: False).") help="Prints only the total count of sequence records (if a sequence has no `count` attribute, its default count is 1) (default: False).")
group.add_argument('-c','--count-tag',
action="store", dest="count:countcol",
default='COUNT',
type=str,
help="Name of the tag/column associated with the count information (default: COUNT).")
def run(config): def run(config):
@ -41,18 +47,20 @@ def run(config):
if input is None: if input is None:
raise Exception("Could not read input") raise Exception("Could not read input")
entries = input[1] entries = input[1]
countcol = config['count']['countcol']
count1 = len(entries) count1 = len(entries)
count2 = 0 count2 = 0
if COUNT_COLUMN in entries and ((config['count']['sequence'] == config['count']['all']) or (config['count']['all'])) : if countcol in entries and ((config['count']['sequence'] == config['count']['all']) or (config['count']['all'])) :
for e in entries: for e in entries:
PyErr_CheckSignals() PyErr_CheckSignals()
count2+=e[COUNT_COLUMN] count2+=e[countcol]
if COUNT_COLUMN in entries and (config['count']['sequence'] == config['count']['all']): if countcol in entries and (config['count']['sequence'] == config['count']['all']):
print(count1,count2) print(count1,count2)
elif COUNT_COLUMN in entries and config['count']['all']: elif countcol in entries and config['count']['all']:
print(count2) print(count2)
else: else:
print(count1) print(count1)

View File

@ -258,6 +258,13 @@ def Filter_generator(options, tax_filter, i_view):
def Taxonomy_filter_generator(taxo, options): def Taxonomy_filter_generator(taxo, options):
if (("required_ranks" in options and options["required_ranks"]) or \
("required_taxids" in options and options["required_taxids"]) or \
("ignored_taxids" in options and options["ignored_taxids"])) and \
(taxo is None):
raise RollbackException("obi grep error: can't use taxonomy options without providing a taxonomy. Rollbacking view")
if taxo is not None: if taxo is not None:
def tax_filter(seq): def tax_filter(seq):
good = True good = True

View File

@ -34,14 +34,17 @@ from obitools3.dms.capi.obidms cimport obi_import_view
from obitools3.dms.capi.obitypes cimport obitype_t, \ from obitools3.dms.capi.obitypes cimport obitype_t, \
OBI_VOID, \ OBI_VOID, \
OBI_QUAL, \ OBI_QUAL, \
OBI_STR OBI_STR, \
OBI_INT
from obitools3.dms.capi.obierrno cimport obi_errno from obitools3.dms.capi.obierrno cimport obi_errno
from obitools3.apps.optiongroups import addImportInputOption, \ from obitools3.apps.optiongroups import addImportInputOption, \
addTabularInputOption, \ addTabularInputOption, \
addTaxdumpInputOption, \ addTaxdumpInputOption, \
addMinimalOutputOption addMinimalOutputOption, \
addNoProgressBarOption, \
addTaxonomyOption
from obitools3.uri.decode import open_uri from obitools3.uri.decode import open_uri
@ -50,6 +53,7 @@ from obitools3.apps.config import logger
from cpython.exc cimport PyErr_CheckSignals from cpython.exc cimport PyErr_CheckSignals
from io import BufferedWriter from io import BufferedWriter
import ast
__title__="Import sequences from different formats into a DMS" __title__="Import sequences from different formats into a DMS"
@ -69,7 +73,9 @@ def addOptions(parser):
addImportInputOption(parser) addImportInputOption(parser)
addTabularInputOption(parser) addTabularInputOption(parser)
addTaxdumpInputOption(parser) addTaxdumpInputOption(parser)
addTaxonomyOption(parser)
addMinimalOutputOption(parser) addMinimalOutputOption(parser)
addNoProgressBarOption(parser)
group = parser.add_argument_group('obi import specific options') group = parser.add_argument_group('obi import specific options')
@ -85,6 +91,10 @@ def addOptions(parser):
help="If importing a view into another DMS, do it by importing each line, saving disk space if the original view " help="If importing a view into another DMS, do it by importing each line, saving disk space if the original view "
"has a line selection associated.") "has a line selection associated.")
# group.add_argument('--only-id',
# action="store", dest="import:onlyid",
# help="only id")
def run(config): def run(config):
cdef tuple input cdef tuple input
@ -97,6 +107,7 @@ def run(config):
cdef bint get_quality cdef bint get_quality
cdef bint NUC_SEQS_view cdef bint NUC_SEQS_view
cdef bint silva cdef bint silva
cdef bint rdp
cdef int nb_elts cdef int nb_elts
cdef object d cdef object d
cdef View view cdef View view
@ -180,6 +191,16 @@ def run(config):
logger("info", "Done.") logger("info", "Done.")
return return
# Open taxonomy if there is one
if 'taxoURI' in config['obi'] and config['obi']['taxoURI'] is not None:
taxo_uri = open_uri(config['obi']['taxoURI'])
if taxo_uri is None or taxo_uri[2] == bytes:
raise Exception("Couldn't open taxonomy")
taxo = taxo_uri[1]
else :
taxo = None
# If importing a view between two DMS and not wanting to save space if line selection in original view, use C API # If importing a view between two DMS and not wanting to save space if line selection in original view, use C API
if isinstance(input[1], View) and not config['import']['space_priority']: if isinstance(input[1], View) and not config['import']['space_priority']:
if obi_import_view(input[0].name_with_full_path, o_dms.name_with_full_path, input[1].name, tobytes((config['obi']['outputURI'].split('/'))[-1])) < 0 : if obi_import_view(input[0].name_with_full_path, o_dms.name_with_full_path, input[1].name, tobytes((config['obi']['outputURI'].split('/'))[-1])) < 0 :
@ -192,8 +213,11 @@ def run(config):
logger("info", "Done.") logger("info", "Done.")
return return
if entry_count >= 0: # Reinitialize the progress bar
if entry_count >= 0 and config['obi']['noprogressbar'] == False:
pb = ProgressBar(entry_count, config) pb = ProgressBar(entry_count, config)
else:
pb = None
NUC_SEQS_view = False NUC_SEQS_view = False
if isinstance(output[1], View) : if isinstance(output[1], View) :
@ -202,20 +226,25 @@ def run(config):
NUC_SEQS_view = True NUC_SEQS_view = True
else: else:
raise NotImplementedError() raise NotImplementedError()
# Save basic columns in variables for optimization # Save basic columns in variables for optimization
if NUC_SEQS_view : if NUC_SEQS_view :
id_col = view[ID_COLUMN] id_col = view[ID_COLUMN]
def_col = view[DEFINITION_COLUMN] def_col = view[DEFINITION_COLUMN]
seq_col = view[NUC_SEQUENCE_COLUMN] seq_col = view[NUC_SEQUENCE_COLUMN]
# Prepare taxon scientific name if SILVA file # Prepare taxon scientific name and taxid refs if RDP or SILVA file
if 'inputformat' in config['obi'] and config['obi']['inputformat'] == b"silva": silva = False
rdp = False
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp"):
#if taxo is None:
# raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
silva = True silva = True
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR) rdp = True
else: if taxo is not None:
silva = False sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
dcols = {} dcols = {}
# First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite # First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
@ -265,8 +294,13 @@ def run(config):
# Reinitialize the input # Reinitialize the input
if isinstance(input[0], CompressedFile): if isinstance(input[0], CompressedFile):
input_is_file = True input_is_file = True
if entry_count >= 0:
# Reinitialize the progress bar
if entry_count >= 0 and config['obi']['noprogressbar'] == False:
pb = ProgressBar(entry_count, config) pb = ProgressBar(entry_count, config)
else:
pb = None
try: try:
input[0].close() input[0].close()
except AttributeError: except AttributeError:
@ -275,6 +309,11 @@ def run(config):
if input is None: if input is None:
raise Exception("Could not open input URI") raise Exception("Could not open input URI")
# if 'onlyid' in config['import']:
# onlyid = tobytes(config['import']['onlyid'])
# else:
# onlyid = None
entries = input[1] entries = input[1]
i = 0 i = 0
for entry in entries : for entry in entries :
@ -292,6 +331,9 @@ def run(config):
elif not i%50000: elif not i%50000:
logger("info", "Imported %d entries", i) logger("info", "Imported %d entries", i)
# if onlyid is not None and entry.id != onlyid:
# continue
try: try:
if NUC_SEQS_view: if NUC_SEQS_view:
@ -307,11 +349,18 @@ def run(config):
if get_quality: if get_quality:
qual_col[i] = entry.quality qual_col[i] = entry.quality
# Parse taxon scientific name if SILVA file # Parse taxon scientific name if RDP file
if silva: if (rdp or silva) and taxo is not None:
sci_name = entry.definition.split(b";")[-1] sci_names = entry.definition.split(b";")
sci_name_col[i] = sci_name for sci_name in reversed(sci_names):
if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome' :
taxon = taxo.get_taxon_by_name(sci_name)
if taxon is not None:
sci_name_col[i] = taxon.name
taxid_col[i] = taxon.taxid
#print(taxid_col[i], sci_name_col[i])
break
for tag in entry : for tag in entry :
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
@ -323,27 +372,39 @@ def run(config):
tag = COUNT_COLUMN tag = COUNT_COLUMN
if tag[:7] == b"merged_": if tag[:7] == b"merged_":
tag = MERGED_PREFIX+tag[7:] tag = MERGED_PREFIX+tag[7:]
if type(value) == bytes and value[:1]==b"[" :
try:
if type(eval(value)) == list:
value = eval(value)
#print(value)
except:
pass
if tag not in dcols : if tag not in dcols :
value_type = type(value) value_type = type(value)
nb_elts = 1 nb_elts = 1
value_obitype = OBI_VOID value_obitype = OBI_VOID
dict_col = False dict_col = False
if value_type == dict or value_type == list : if value_type == dict :
nb_elts = len(value) nb_elts = len(value)
elt_names = list(value) elt_names = list(value)
if value_type == dict : dict_col = True
dict_col = True
else : else :
nb_elts = 1 nb_elts = 1
elt_names = None elt_names = None
if value_type == list :
tuples = True
else:
tuples = False
value_obitype = get_obitype(value) value_obitype = get_obitype(value)
if value_obitype != OBI_VOID : if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names, dict_column=dict_col), value_obitype) dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names, dict_column=dict_col, tuples=tuples), value_obitype)
# Fill value # Fill value
dcols[tag][0][i] = value dcols[tag][0][i] = value
@ -419,11 +480,12 @@ def run(config):
dcols[tag][0][i] = value dcols[tag][0][i] = value
except Exception as e: except Exception as e:
print("\nCould not import sequence:", entry, "(error raised:", e, ")") print("\nCould not import sequence:", repr(entry), "(error raised:", e, ")")
if 'skiperror' in config['obi'] and not config['obi']['skiperror']: if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
raise e raise e
else: else:
pass pass
i-=1 # overwrite problematic entry
i+=1 i+=1

View File

@ -154,7 +154,7 @@ def run(config):
else : else :
taxo = None taxo = None
statistics = set(config['stats']['minimum']) | set(config['stats']['maximum']) | set(config['stats']['mean']) statistics = set(config['stats']['minimum']) | set(config['stats']['maximum']) | set(config['stats']['mean']) | set(config['stats']['var']) | set(config['stats']['sd'])
total = 0 total = 0
catcount={} catcount={}
totcount={} totcount={}
@ -195,7 +195,7 @@ def run(config):
except KeyError: except KeyError:
totcount[category]=totcount.get(category,0)+1 totcount[category]=totcount.get(category,0)+1
for var in statistics: for var in statistics:
if var in line: if var in line and line[var] is not None:
v = line[var] v = line[var]
if var not in values: if var not in values:
values[var]={} values[var]={}
@ -238,14 +238,34 @@ def run(config):
else: else:
sdvar= "%s" sdvar= "%s"
hcat = "\t".join([pcat % x for x in config['stats']['categories']]) + \ hcat = ""
"\t".join([minvar % x for x in config['stats']['minimum']]) + \
"\t".join([maxvar % x for x in config['stats']['maximum']]) + \ for x in config['stats']['categories']:
"\t".join([meanvar % x for x in config['stats']['mean']]) + \ hcat += pcat % x
"\t".join([varvar % x for x in config['stats']['var']]) + \ hcat += "\t"
"\t".join([sdvar % x for x in config['stats']['sd']]) + \
"count\t" + \ for x in config['stats']['minimum']:
"total" hcat += minvar % x
hcat += "\t"
for x in config['stats']['maximum']:
hcat += maxvar % x
hcat += "\t"
for x in config['stats']['mean']:
hcat += meanvar % x
hcat += "\t"
for x in config['stats']['var']:
hcat += varvar % x
hcat += "\t"
for x in config['stats']['sd']:
hcat += sdvar % x
hcat += "\t"
hcat += "count\ttotal"
print(hcat) print(hcat)
sorted_stats = sorted(catcount.items(), key = lambda kv:(totcount[kv[0]]), reverse=True) sorted_stats = sorted(catcount.items(), key = lambda kv:(totcount[kv[0]]), reverse=True)
for i in range(len(sorted_stats)): for i in range(len(sorted_stats)):

View File

@ -77,7 +77,7 @@ cdef class View(OBIWrapper) :
@staticmethod @staticmethod
def import_view(object dms_1, object dms_2, object view_name_1, object view_name_2): def import_view(object dms_1, object dms_2, object view_name_1, object view_name_2): # TODO argument to import line by line to avoid huge AVL copy
if obi_import_view(tobytes(dms_1), tobytes(dms_2), tobytes(view_name_1), tobytes(view_name_2)) < 0 : if obi_import_view(tobytes(dms_1), tobytes(dms_2), tobytes(view_name_1), tobytes(view_name_2)) < 0 :
raise Exception("Error importing a view") raise Exception("Error importing a view")

View File

@ -192,7 +192,7 @@ def open_uri(uri,
config = getConfiguration() config = getConfiguration()
urip = urlparse(urib) urip = urlparse(urib)
if 'obi' not in config: if 'obi' not in config:
config['obi']={} config['obi']={}
@ -209,13 +209,14 @@ def open_uri(uri,
scheme = urip.scheme scheme = urip.scheme
error = None error = None
if urib != b"-" and \ if b'/taxonomy/' in urib or \
(urib != b"-" and \
(scheme==b"dms" or \ (scheme==b"dms" or \
(scheme==b"" and \ (scheme==b"" and \
(((not input) and "outputformat" not in config["obi"]) or \ (((not input) and "outputformat" not in config["obi"]) or \
(input and "inputformat" not in config["obi"])))): # TODO maybe not best way (input and "inputformat" not in config["obi"]))))): # TODO maybe not best way
if default_dms is not None and b"/" not in urip.path: # assuming view to open in default DMS (TODO discuss) if default_dms is not None and b"/" not in urip.path: # assuming view to open in default DMS (TODO discuss)
dms=(default_dms, urip.path) dms=(default_dms, urip.path)
else: else:
@ -223,7 +224,7 @@ def open_uri(uri,
if dms is None and default_dms is not None: if dms is None and default_dms is not None:
dms=(default_dms, urip.path) dms=(default_dms, urip.path)
if dms is not None: if dms is not None:
if dms_only: if dms_only:
return (dms[0], return (dms[0],
@ -248,7 +249,7 @@ def open_uri(uri,
if default_dms is None: if default_dms is None:
config["obi"]["defaultdms"]=resource[0] config["obi"]["defaultdms"]=resource[0]
return (resource[0], return (resource[0],
resource[1], resource[1],
type(resource[1]), type(resource[1]),
@ -464,7 +465,7 @@ def open_uri(uri,
if format is not None: if format is not None:
if seqtype==b"nuc": if seqtype==b"nuc":
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
if format==b"fasta" or format==b"silva": if format==b"fasta" or format==b"silva" or format==b"rdp":
if input: if input:
iseq = fastaNucIterator(file, iseq = fastaNucIterator(file,
skip=skip, skip=skip,

View File

@ -18,7 +18,7 @@ cdef object clean_empty_values_from_object(object value, exclude=*)
cdef obitype_t get_obitype_single_value(object value) cdef obitype_t get_obitype_single_value(object value)
cdef obitype_t update_obitype(obitype_t obitype, object new_value) cdef obitype_t update_obitype(obitype_t obitype, object new_value)
cdef obitype_t get_obitype_iterable_value(object value) cdef obitype_t get_obitype_iterable_value(object value, type t)
cdef obitype_t get_obitype(object value) cdef obitype_t get_obitype(object value)
cdef object __etag__(bytes x, bytes nastring=*) cdef object __etag__(bytes x, bytes nastring=*)

View File

@ -260,38 +260,51 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value) :
new_type = type(new_value) new_type = type(new_value)
if obitype == OBI_INT : #if new_type == NoneType: # doesn't work because Cython sucks
if new_type == float or new_value > OBI_INT_MAX : if new_value == None or new_type==list or new_type==dict or new_type==tuple:
return OBI_FLOAT return obitype
# TODO BOOL vers INT/FLOAT # TODO BOOL vers INT/FLOAT
elif new_type == str or new_type == bytes : if new_type == str or new_type == bytes :
if obitype == OBI_SEQ and is_a_DNA_seq(tobytes(new_value)) : if obitype == OBI_SEQ and is_a_DNA_seq(tobytes(new_value)) :
pass pass
else : else :
return OBI_STR return OBI_STR
elif obitype == OBI_INT :
if new_type == float or new_value > OBI_INT_MAX :
return OBI_FLOAT
return obitype return obitype
cdef obitype_t get_obitype_iterable_value(object value) : cdef obitype_t get_obitype_iterable_value(object value, type t) :
cdef obitype_t value_obitype cdef obitype_t value_obitype
value_obitype = OBI_VOID value_obitype = OBI_VOID
for k in value : if t == dict:
if value_obitype == OBI_VOID : for k in value :
value_obitype = get_obitype_single_value(value[k]) if value_obitype == OBI_VOID :
else : value_obitype = get_obitype_single_value(value[k])
value_obitype = update_obitype(value_obitype, value[k]) else :
value_obitype = update_obitype(value_obitype, value[k])
elif t == list or t == tuple:
for v in value :
if value_obitype == OBI_VOID :
value_obitype = get_obitype_single_value(v)
else :
value_obitype = update_obitype(value_obitype, v)
return value_obitype return value_obitype
cdef obitype_t get_obitype(object value) : cdef obitype_t get_obitype(object value) :
if type(value) == dict or type(value) == list or type(value) == tuple : t = type(value)
return get_obitype_iterable_value(value) if t == dict or t == list or t == tuple :
return get_obitype_iterable_value(value, t)
else : else :
return get_obitype_single_value(value) return get_obitype_single_value(value)

View File

@ -1,5 +1,5 @@
major = 3 major = 3
minor = 0 minor = 0
serial= '1b7' serial= '1b9'
version ="%d.%d.%s" % (major,minor,serial) version ="%d.%d.%s" % (major,minor,serial)

View File

@ -863,7 +863,8 @@ int build_reference_db(const char* dms_name,
fprintf(stderr,"\rDone : 100 %% \n"); fprintf(stderr,"\rDone : 100 %% \n");
// Add information about the threshold used to build the DB // Add information about the threshold used to build the DB
snprintf(threshold_str, 5, "%f", threshold); #define snprintf_nowarn(...) (snprintf(__VA_ARGS__) < 0 ? abort() : (void)0)
snprintf_nowarn(threshold_str, 5, "%f", threshold);
new_comments = obi_add_comment((o_view->infos)->comments, DB_THRESHOLD_KEY_IN_COMMENTS, threshold_str); new_comments = obi_add_comment((o_view->infos)->comments, DB_THRESHOLD_KEY_IN_COMMENTS, threshold_str);
if (new_comments == NULL) if (new_comments == NULL)

View File

@ -233,7 +233,7 @@ int obi_ecotag(const char* dms_name,
// Write result (max score, threshold, LCA assigned, list of the ids of the best matches) // Write result (max score, threshold, LCA assigned, list of the ids of the best matches)
index_t i, j, k; index_t i, j, k, t;
ecotx_t* lca; ecotx_t* lca;
ecotx_t* lca_in_array; ecotx_t* lca_in_array;
ecotx_t* best_match; ecotx_t* best_match;
@ -259,16 +259,20 @@ int obi_ecotag(const char* dms_name,
int32_t* best_match_taxids; int32_t* best_match_taxids;
int32_t* best_match_taxids_to_store; int32_t* best_match_taxids_to_store;
int best_match_count; int best_match_count;
int best_match_taxid_count;
int buffer_size; int buffer_size;
int best_match_ids_buffer_size; int best_match_ids_buffer_size;
index_t best_match_idx; index_t best_match_idx;
int32_t lca_array_length; int32_t lca_array_length;
int32_t lca_taxid; int32_t lca_taxid;
int32_t taxid_best_match; int32_t taxid_best_match;
int32_t taxid;
int32_t taxid_to_store;
bool assigned; bool assigned;
const char* lca_name; const char* lca_name;
const char* id; const char* id;
int id_len; int id_len;
bool already_in;
OBIDMS_p dms = NULL; OBIDMS_p dms = NULL;
OBIDMS_p ref_dms = NULL; OBIDMS_p ref_dms = NULL;
@ -488,10 +492,11 @@ int obi_ecotag(const char* dms_name,
for (i=0; i < query_count; i++) for (i=0; i < query_count; i++)
{ {
if (i%1000 == 0) if (i%10 == 0)
fprintf(stderr,"\rDone : %f %% ", (i / (float) query_count)*100); fprintf(stderr,"\rDone : %f %% ", (i / (float) query_count)*100);
best_match_count = 0; best_match_count = 0;
best_match_taxid_count = 0;
best_match_ids_length = 0; best_match_ids_length = 0;
threshold = ecotag_threshold; threshold = ecotag_threshold;
best_score = 0.0; best_score = 0.0;
@ -543,6 +548,7 @@ int obi_ecotag(const char* dms_name,
// Reset the array with that match // Reset the array with that match
best_match_ids_length = 0; best_match_ids_length = 0;
best_match_count = 0; best_match_count = 0;
best_match_taxid_count = 0;
} }
// Store in best match array // Store in best match array
@ -585,8 +591,27 @@ int obi_ecotag(const char* dms_name,
// Save match // Save match
best_match_array[best_match_count] = j; best_match_array[best_match_count] = j;
best_match_taxids[best_match_count] = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, j, 0);
best_match_count++; best_match_count++;
// Save best match taxid only if not already in array
taxid_to_store = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, j, 0);
already_in = false;
for (t=0; t<best_match_taxid_count; t++)
{
taxid = best_match_taxids[t];
//fprintf(stderr, "\ntaxid %d, taxid_to_store %d\n", taxid, taxid_to_store);
if (taxid == taxid_to_store)
{
already_in = true;
break;
}
}
if (! already_in)
{
best_match_taxids[best_match_taxid_count] = taxid_to_store;
best_match_taxid_count++;
}
strcpy(best_match_ids+best_match_ids_length, id); strcpy(best_match_ids+best_match_ids_length, id);
best_match_ids_length = best_match_ids_length + id_len + 1; best_match_ids_length = best_match_ids_length + id_len + 1;
} }
@ -693,7 +718,7 @@ int obi_ecotag(const char* dms_name,
assigned_name_column, lca_name, assigned_name_column, lca_name,
assigned_status_column, assigned, assigned_status_column, assigned,
best_match_ids_column, best_match_ids_to_store, best_match_ids_length, best_match_ids_column, best_match_ids_to_store, best_match_ids_length,
best_match_taxids_column, best_match_taxids_to_store, best_match_count, best_match_taxids_column, best_match_taxids_to_store, best_match_taxid_count,
score_column, best_score score_column, best_score
) < 0) ) < 0)
return -1; return -1;

View File

@ -36,7 +36,7 @@
*/ */
#define COLUMN_GROWTH_FACTOR (2) /**< The growth factor when a column is enlarged. #define COLUMN_GROWTH_FACTOR (2) /**< The growth factor when a column is enlarged.
*/ */
#define MAXIMUM_LINE_COUNT (1000000000) /**< The maximum line count for the data of a column (1E9). //TODO #define MAXIMUM_LINE_COUNT (1000000000000) /**< The maximum line count for the data of a column (1E12). //TODO
*/ */
#define COMMENTS_MAX_LENGTH (4096) /**< The maximum length for comments. #define COMMENTS_MAX_LENGTH (4096) /**< The maximum length for comments.
*/ */