Compare commits
80 Commits
v3.0.0-bet
...
v3.0.0b25
Author | SHA1 | Date | |
---|---|---|---|
aa01236cae | |||
49b8810a76 | |||
7a39df54c0 | |||
09e483b0d6 | |||
14a2579173 | |||
36a8aaa92e | |||
a17eb445c2 | |||
e4a32788c2 | |||
2442cc80bf | |||
aa836b2ace | |||
8776ce22e6 | |||
4aa772c405 | |||
b0b96ac37a | |||
687e42ad22 | |||
5fbbb6d304 | |||
359a9fe237 | |||
f9b6851f75 | |||
29a2652bbf | |||
2a2c233936 | |||
faf8ea9d86 | |||
ffe2485e94 | |||
6094ce2bbc | |||
a7dcf16c06 | |||
f13f8f6165 | |||
b5a29ac413 | |||
efd2b9d338 | |||
ca6e3e7aad | |||
76ed8e18e5 | |||
1d17f28aec | |||
fa834e4b8b | |||
a72fea3cc9 | |||
e9a37d8a6e | |||
ef074f8455 | |||
aec5e69f2c | |||
170ef3f1ba | |||
f999946582 | |||
773b36ec37 | |||
69cb434a6c | |||
55d4f98d60 | |||
0bec2631e8 | |||
e6b6c6fa84 | |||
974528b2e6 | |||
1b346b54f9 | |||
058f2ad8b3 | |||
60bfd3ae8d | |||
67bdee105a | |||
0f745e0113 | |||
da8de52ba4 | |||
4d36538c6e | |||
8d0b17d87d | |||
343999a627 | |||
e9a40630e9 | |||
8dbcd3025a | |||
4cf635d001 | |||
b7e7cc232a | |||
b6ab792ceb | |||
ddea5a2964 | |||
30852ab7d5 | |||
4d0299904e | |||
eef5156d95 | |||
e62c991bbc | |||
1218eed7fd | |||
cd9cea8c97 | |||
98cfb70d73 | |||
b9f68c76c8 | |||
0b98371688 | |||
f0d152fcbd | |||
8019dee68e | |||
0b4a234671 | |||
d32cfdcce5 | |||
219c0d6fdc | |||
dc9f897917 | |||
bb72682f7d | |||
52920c3c71 | |||
18c22cecf9 | |||
1bfb96023c | |||
c67d668989 | |||
db0ac37d41 | |||
d0c21ecd39 | |||
53212168a2 |
@ -6,7 +6,7 @@ recursive-include doc/sphinx/source *.txt *.rst *.py
|
||||
recursive-include doc/sphinx/sphinxext *.py
|
||||
include doc/sphinx/Makefile
|
||||
include doc/sphinx/Doxyfile
|
||||
include README.txt
|
||||
include README.md
|
||||
include requirements.txt
|
||||
include scripts/obi
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
#/usr/bin/env bash
|
||||
|
||||
_obi_comp ()
|
||||
{
|
@ -266,9 +266,9 @@ def run(config):
|
||||
# If the input and the output DMS are different, delete the temporary result view in the input DMS
|
||||
if i_dms != o_dms:
|
||||
View.delete_view(i_dms, o_view_name)
|
||||
o_dms.close()
|
||||
o_dms.close(force=True)
|
||||
|
||||
i_dms.close()
|
||||
i_dms.close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
@ -247,10 +247,10 @@ def run(config):
|
||||
#print("\n\nOutput view:\n````````````", file=sys.stderr)
|
||||
#print(repr(view), file=sys.stderr)
|
||||
|
||||
input[0].close()
|
||||
input[0].close(force=True)
|
||||
if two_views:
|
||||
rinput[0].close()
|
||||
output[0].close()
|
||||
rinput[0].close(force=True)
|
||||
output[0].close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
@ -13,7 +13,8 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
|
||||
ID_COLUMN, \
|
||||
DEFINITION_COLUMN, \
|
||||
QUALITY_COLUMN, \
|
||||
COUNT_COLUMN
|
||||
COUNT_COLUMN, \
|
||||
TAXID_COLUMN
|
||||
|
||||
import time
|
||||
import math
|
||||
@ -175,8 +176,8 @@ def sequenceTaggerGenerator(config, taxo=None):
|
||||
counter[0]+=1
|
||||
|
||||
for rank in annoteRank:
|
||||
if 'taxid' in seq:
|
||||
taxid = seq['taxid']
|
||||
if TAXID_COLUMN in seq:
|
||||
taxid = seq[TAXID_COLUMN]
|
||||
if taxid is not None:
|
||||
rtaxid = taxo.get_taxon_at_rank(taxid, rank)
|
||||
if rtaxid is not None:
|
||||
@ -190,58 +191,50 @@ def sequenceTaggerGenerator(config, taxo=None):
|
||||
seq['seq_rank']=counter[0]
|
||||
|
||||
for i,v in toSet:
|
||||
#try:
|
||||
if taxo is not None:
|
||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
else:
|
||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
val = eval(v, environ, seq)
|
||||
#except Exception,e: # TODO discuss usefulness of this
|
||||
# if options.onlyValid:
|
||||
# raise e
|
||||
# val = v
|
||||
try:
|
||||
if taxo is not None:
|
||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
else:
|
||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
val = eval(v, environ, seq)
|
||||
except Exception: # set string if not a valid expression
|
||||
val = v
|
||||
seq[i]=val
|
||||
|
||||
if length:
|
||||
seq['seq_length']=len(seq)
|
||||
|
||||
if newId is not None:
|
||||
# try:
|
||||
if taxo is not None:
|
||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
else:
|
||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
val = eval(newId, environ, seq)
|
||||
# except Exception,e:
|
||||
# if options.onlyValid:
|
||||
# raise e
|
||||
# val = newId
|
||||
try:
|
||||
if taxo is not None:
|
||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
else:
|
||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
val = eval(newId, environ, seq)
|
||||
except Exception: # set string if not a valid expression
|
||||
val = newId
|
||||
seq.id=val
|
||||
|
||||
if newDef is not None:
|
||||
# try:
|
||||
if taxo is not None:
|
||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
else:
|
||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
val = eval(newDef, environ, seq)
|
||||
# except Exception,e:
|
||||
# if options.onlyValid:
|
||||
# raise e
|
||||
# val = newDef
|
||||
try:
|
||||
if taxo is not None:
|
||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
else:
|
||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
val = eval(newDef, environ, seq)
|
||||
except Exception: # set string if not a valid expression
|
||||
val = newDef
|
||||
seq.definition=val
|
||||
#
|
||||
|
||||
if newSeq is not None:
|
||||
# try:
|
||||
if taxo is not None:
|
||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
else:
|
||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
val = eval(newSeq, environ, seq)
|
||||
# except Exception,e:
|
||||
# if options.onlyValid:
|
||||
# raise e
|
||||
# val = newSeq
|
||||
try:
|
||||
if taxo is not None:
|
||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
else:
|
||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
val = eval(newSeq, environ, seq)
|
||||
except Exception: # set string if not a valid expression
|
||||
val = newSeq
|
||||
seq.seq=val
|
||||
if 'seq_length' in seq:
|
||||
seq['seq_length']=len(seq)
|
||||
@ -251,15 +244,14 @@ def sequenceTaggerGenerator(config, taxo=None):
|
||||
seq.view.delete_column(QUALITY_COLUMN)
|
||||
|
||||
if run is not None:
|
||||
# try:
|
||||
if taxo is not None:
|
||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
else:
|
||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
eval(run, environ, seq)
|
||||
# except Exception,e:
|
||||
# if options.onlyValid:
|
||||
# raise e
|
||||
try:
|
||||
if taxo is not None:
|
||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
else:
|
||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||
eval(run, environ, seq)
|
||||
except Exception,e:
|
||||
raise e
|
||||
|
||||
return sequenceTagger
|
||||
|
||||
@ -379,7 +371,7 @@ def run(config):
|
||||
# If the input and the output DMS are different, delete the temporary imported view used to create the final view
|
||||
if i_dms != o_dms:
|
||||
View.delete_view(o_dms, imported_view_name)
|
||||
o_dms.close()
|
||||
i_dms.close()
|
||||
o_dms.close(force=True)
|
||||
i_dms.close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
@ -97,9 +97,9 @@ def run(config):
|
||||
# If the input and the output DMS are different, delete the temporary result view in the input DMS
|
||||
if i_dms != o_dms:
|
||||
View.delete_view(i_dms, o_view_name)
|
||||
o_dms.close()
|
||||
o_dms.close(force=True)
|
||||
|
||||
i_dms.close()
|
||||
i_dms.close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
||||
|
@ -86,7 +86,24 @@ def run(config):
|
||||
if not remove_rev_qual:
|
||||
Column.new_column(o_view, REVERSE_SEQUENCE_COLUMN, OBI_SEQ)
|
||||
Column.new_column(o_view, REVERSE_QUALITY_COLUMN, OBI_QUAL, associated_column_name=REVERSE_SEQUENCE_COLUMN, associated_column_version=o_view[REVERSE_SEQUENCE_COLUMN].version)
|
||||
|
||||
|
||||
# Initialize multiple elements columns
|
||||
dict_cols = {}
|
||||
for v in iview_list:
|
||||
for coln in v.keys():
|
||||
if v[coln].nb_elements_per_line > 1:
|
||||
if coln not in dict_cols:
|
||||
dict_cols[coln] = {}
|
||||
dict_cols[coln]['eltnames'] = set(v[coln].elements_names)
|
||||
dict_cols[coln]['nbelts'] = v[coln].nb_elements_per_line
|
||||
dict_cols[coln]['obitype'] = v[coln].data_type_int
|
||||
else:
|
||||
dict_cols[coln]['eltnames'] = set(v[coln].elements_names + list(dict_cols[coln]['eltnames']))
|
||||
dict_cols[coln]['nbelts'] = len(dict_cols[coln]['eltnames'])
|
||||
for coln in dict_cols:
|
||||
Column.new_column(o_view, coln, dict_cols[coln]['obitype'],
|
||||
nb_elements_per_line=dict_cols[coln]['nbelts'], elements_names=list(dict_cols[coln]['eltnames']))
|
||||
|
||||
# Initialize the progress bar
|
||||
pb = ProgressBar(total_len, config, seconde=5)
|
||||
|
||||
@ -116,7 +133,7 @@ def run(config):
|
||||
#print(repr(view), file=sys.stderr)
|
||||
|
||||
for d in idms_list:
|
||||
d.close()
|
||||
o_dms.close()
|
||||
d.close(force=True)
|
||||
o_dms.close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
@ -124,8 +124,8 @@ def run(config):
|
||||
# If the input and the output DMS are different, delete the temporary result view in the input DMS
|
||||
if i_dms != o_dms:
|
||||
View.delete_view(i_dms, o_view_name)
|
||||
o_dms.close()
|
||||
o_dms.close(force=True)
|
||||
|
||||
i_dms.close()
|
||||
i_dms.close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
@ -56,3 +56,5 @@ def run(config):
|
||||
print(count2)
|
||||
else:
|
||||
print(count1)
|
||||
|
||||
input[0].close(force=True)
|
||||
|
@ -35,13 +35,15 @@ def addOptions(parser):
|
||||
action="store", dest="ecopcr:primer1",
|
||||
metavar='<PRIMER>',
|
||||
type=str,
|
||||
help="Forward primer.")
|
||||
required=True,
|
||||
help="Forward primer, length must be less than or equal to 32")
|
||||
|
||||
group.add_argument('--primer2', '-R',
|
||||
action="store", dest="ecopcr:primer2",
|
||||
metavar='<PRIMER>',
|
||||
type=str,
|
||||
help="Reverse primer.")
|
||||
required=True,
|
||||
help="Reverse primer, length must be less than or equal to 32")
|
||||
|
||||
group.add_argument('--error', '-e',
|
||||
action="store", dest="ecopcr:error",
|
||||
@ -203,6 +205,7 @@ def run(config):
|
||||
#print("\n\nOutput view:\n````````````", file=sys.stderr)
|
||||
#print(repr(o_dms[o_view_name]), file=sys.stderr)
|
||||
|
||||
o_dms.close()
|
||||
i_dms.close(force=True)
|
||||
o_dms.close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
@ -64,10 +64,10 @@ def run(config):
|
||||
ref_view_name = ref[1]
|
||||
|
||||
# Check that the threshold demanded is greater than or equal to the threshold used to build the reference database
|
||||
if config['ecotag']['threshold'] < eval(i_dms[ref_view_name].comments["ref_db_threshold"]) :
|
||||
if config['ecotag']['threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
|
||||
print("Error: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).",
|
||||
config['ecotag']['threshold'], i_dms[ref_view_name].comments["ref_db_threshold"])
|
||||
|
||||
config['ecotag']['threshold'], ref_dms[ref_view_name].comments["ref_db_threshold"])
|
||||
|
||||
# Open the output: only the DMS
|
||||
output = open_uri(config['obi']['outputURI'],
|
||||
input=False,
|
||||
@ -107,8 +107,8 @@ def run(config):
|
||||
comments = View.print_config(config, "ecotag", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
|
||||
|
||||
if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
|
||||
tobytes(ref_dms_name), tobytes(ref_view_name), \
|
||||
tobytes(taxo_dms_name), tobytes(taxonomy_name), \
|
||||
ref_dms.name_with_full_path, tobytes(ref_view_name), \
|
||||
taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
|
||||
tobytes(o_view_name), comments,
|
||||
config['ecotag']['threshold']) < 0:
|
||||
raise Exception("Error running ecotag")
|
||||
@ -126,9 +126,11 @@ def run(config):
|
||||
# If the input and the output DMS are different, delete the temporary result view in the input DMS
|
||||
if i_dms != o_dms:
|
||||
View.delete_view(i_dms, o_view_name)
|
||||
o_dms.close()
|
||||
o_dms.close(force=True)
|
||||
|
||||
i_dms.close()
|
||||
taxo_dms.close(force=True)
|
||||
ref_dms.close(force=True)
|
||||
i_dms.close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
||||
|
@ -59,13 +59,23 @@ def run(config):
|
||||
# Check that the input view has the type NUC_SEQS if needed # TODO discuss, maybe bool property
|
||||
if (output[2] == Nuc_Seq) and (iview.type != b"NUC_SEQS_VIEW") : # Nuc_Seq_Stored? TODO
|
||||
raise Exception("Error: the view to export in fasta or fastq format is not a NUC_SEQS view")
|
||||
|
||||
|
||||
if config['obi']['only'] is not None:
|
||||
withoutskip = min(input[4], config['obi']['only'])
|
||||
else:
|
||||
withoutskip = input[4]
|
||||
|
||||
if config['obi']['skip'] is not None:
|
||||
skip = min(input[4], config['obi']['skip'])
|
||||
else:
|
||||
skip = 0
|
||||
|
||||
# Initialize the progress bar
|
||||
if config['obi']['noprogressbar']:
|
||||
pb = None
|
||||
else:
|
||||
pb = ProgressBar(len(iview), config, seconde=5)
|
||||
|
||||
pb = ProgressBar(withoutskip - skip, config, seconde=5)
|
||||
|
||||
i=0
|
||||
for seq in iview :
|
||||
PyErr_CheckSignals()
|
||||
@ -86,7 +96,7 @@ def run(config):
|
||||
if not BrokenPipeError and not IOError:
|
||||
output_object.close()
|
||||
iview.close()
|
||||
input[0].close()
|
||||
input[0].close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
||||
|
@ -161,8 +161,7 @@ def obi_eval(compiled_expr, loc_env, line):
|
||||
return obi_eval_result
|
||||
|
||||
|
||||
def Filter_generator(options, tax_filter):
|
||||
#taxfilter = taxonomyFilterGenerator(options)
|
||||
def Filter_generator(options, tax_filter, i_view):
|
||||
|
||||
# Initialize conditions
|
||||
predicates = None
|
||||
@ -171,6 +170,9 @@ def Filter_generator(options, tax_filter):
|
||||
attributes = None
|
||||
if "attributes" in options and len(options["attributes"]) > 0:
|
||||
attributes = options["attributes"]
|
||||
for attribute in attributes:
|
||||
if attribute not in i_view:
|
||||
return None
|
||||
lmax = None
|
||||
if "lmax" in options:
|
||||
lmax = options["lmax"]
|
||||
@ -196,6 +198,8 @@ def Filter_generator(options, tax_filter):
|
||||
if "attribute_patterns" in options and len(options["attribute_patterns"]) > 0:
|
||||
for p in options["attribute_patterns"]:
|
||||
attribute, pattern = p.split(":", 1)
|
||||
if attribute not in i_view:
|
||||
return None
|
||||
attribute_patterns[tobytes(attribute)] = re.compile(tobytes(pattern))
|
||||
|
||||
def filter(line, loc_env):
|
||||
@ -324,21 +328,29 @@ def run(config):
|
||||
|
||||
# Apply filter
|
||||
tax_filter = Taxonomy_filter_generator(taxo, config["grep"])
|
||||
filter = Filter_generator(config["grep"], tax_filter)
|
||||
filter = Filter_generator(config["grep"], tax_filter, i_view)
|
||||
selection = Line_selection(i_view)
|
||||
for i in range(len(i_view)):
|
||||
PyErr_CheckSignals()
|
||||
pb(i)
|
||||
line = i_view[i]
|
||||
|
||||
loc_env = {"sequence": line, "line": line, "taxonomy": taxo, "obi_eval_result": False}
|
||||
|
||||
good = filter(line, loc_env)
|
||||
|
||||
if good :
|
||||
selection.append(i)
|
||||
|
||||
pb(i, force=True)
|
||||
|
||||
if filter is None and config["grep"]["invert_selection"]: # all sequences are selected: filter is None if no line will be selected because some columns don't exist
|
||||
for i in range(len(i_view)):
|
||||
PyErr_CheckSignals()
|
||||
pb(i)
|
||||
selection.append(i)
|
||||
|
||||
elif filter is not None : # filter is None if no line will be selected because some columns don't exist
|
||||
for i in range(len(i_view)):
|
||||
PyErr_CheckSignals()
|
||||
pb(i)
|
||||
line = i_view[i]
|
||||
|
||||
loc_env = {"sequence": line, "line": line, "taxonomy": taxo, "obi_eval_result": False}
|
||||
|
||||
good = filter(line, loc_env)
|
||||
|
||||
if good :
|
||||
selection.append(i)
|
||||
|
||||
pb(len(i_view), force=True)
|
||||
print("", file=sys.stderr)
|
||||
|
||||
# Create output view with the line selection
|
||||
@ -370,7 +382,7 @@ def run(config):
|
||||
# If the input and the output DMS are different, delete the temporary imported view used to create the final view
|
||||
if i_dms != o_dms:
|
||||
View.delete_view(i_dms, o_view_name)
|
||||
o_dms.close()
|
||||
i_dms.close()
|
||||
o_dms.close(force=True)
|
||||
i_dms.close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
@ -103,7 +103,7 @@ def run(config):
|
||||
# If the input and the output DMS are different, delete the temporary imported view used to create the final view
|
||||
if i_dms != o_dms:
|
||||
View.delete_view(i_dms, o_view_name)
|
||||
o_dms.close()
|
||||
i_dms.close()
|
||||
o_dms.close(force=True)
|
||||
i_dms.close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
@ -54,4 +54,5 @@ def run(config):
|
||||
print(bytes2str(entries.ascii_history))
|
||||
else:
|
||||
raise Exception("ASCII history only available for views")
|
||||
|
||||
|
||||
input[0].close(force=True)
|
||||
|
@ -11,6 +11,7 @@ from obitools3.dms.column.column cimport Column
|
||||
from obitools3.dms.obiseq cimport Nuc_Seq
|
||||
from obitools3.dms import DMS
|
||||
from obitools3.dms.taxo.taxo cimport Taxonomy
|
||||
from obitools3.files.uncompress cimport CompressedFile
|
||||
|
||||
|
||||
from obitools3.utils cimport tobytes, \
|
||||
@ -24,7 +25,8 @@ from obitools3.dms.capi.obiview cimport VIEW_TYPE_NUC_SEQS, \
|
||||
DEFINITION_COLUMN, \
|
||||
QUALITY_COLUMN, \
|
||||
COUNT_COLUMN, \
|
||||
TAXID_COLUMN
|
||||
TAXID_COLUMN, \
|
||||
MERGED_PREFIX
|
||||
|
||||
from obitools3.dms.capi.obidms cimport obi_import_view
|
||||
|
||||
@ -65,6 +67,14 @@ def addOptions(parser):
|
||||
addTaxdumpInputOption(parser)
|
||||
addMinimalOutputOption(parser)
|
||||
|
||||
group = parser.add_argument_group('obi import specific options')
|
||||
|
||||
group.add_argument('--preread',
|
||||
action="store_true", dest="import:preread",
|
||||
default=False,
|
||||
help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
|
||||
"a much faster import. This option is not recommended and will slow down the import in any other case.")
|
||||
|
||||
|
||||
def run(config):
|
||||
|
||||
@ -154,7 +164,7 @@ def run(config):
|
||||
taxo.write(taxo_name)
|
||||
taxo.close()
|
||||
o_dms.record_command_line(" ".join(sys.argv[1:]))
|
||||
o_dms.close()
|
||||
o_dms.close(force=True)
|
||||
logger("info", "Done.")
|
||||
return
|
||||
|
||||
@ -169,8 +179,6 @@ def run(config):
|
||||
|
||||
if entry_count >= 0:
|
||||
pb = ProgressBar(entry_count, config, seconde=5)
|
||||
|
||||
entries = input[1]
|
||||
|
||||
NUC_SEQS_view = False
|
||||
if isinstance(output[1], View) :
|
||||
@ -188,6 +196,63 @@ def run(config):
|
||||
|
||||
dcols = {}
|
||||
|
||||
# First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
|
||||
if config['import']['preread']:
|
||||
logger("info", "First readthrough...")
|
||||
entries = input[1]
|
||||
i = 0
|
||||
dict_dict = {}
|
||||
for entry in entries:
|
||||
PyErr_CheckSignals()
|
||||
|
||||
if entry is None: # error or exception handled at lower level, not raised because Python generators can't resume after any exception is raised
|
||||
if config['obi']['skiperror']:
|
||||
i-=1
|
||||
continue
|
||||
else:
|
||||
raise Exception("obi import error in first readthrough")
|
||||
|
||||
if pb is not None:
|
||||
pb(i)
|
||||
elif not i%50000:
|
||||
logger("info", "Read %d entries", i)
|
||||
|
||||
for tag in entry :
|
||||
newtag = tag
|
||||
if tag[:7] == b"merged_":
|
||||
newtag = MERGED_PREFIX+tag[7:]
|
||||
if type(entry[tag]) == dict :
|
||||
if tag in dict_dict:
|
||||
dict_dict[newtag][0].update(entry[tag].keys())
|
||||
else:
|
||||
dict_dict[newtag] = [set(entry[tag].keys()), get_obitype(entry[tag])]
|
||||
i+=1
|
||||
|
||||
if pb is not None:
|
||||
pb(i, force=True)
|
||||
print("", file=sys.stderr)
|
||||
|
||||
for tag in dict_dict:
|
||||
dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
|
||||
nb_elements_per_line=len(dict_dict[tag][0]), \
|
||||
elements_names=list(dict_dict[tag][0])), \
|
||||
dict_dict[tag][1])
|
||||
|
||||
|
||||
# Reinitialize the input
|
||||
if isinstance(input[0], CompressedFile):
|
||||
input_is_file = True
|
||||
if entry_count >= 0:
|
||||
pb = ProgressBar(entry_count, config, seconde=5)
|
||||
try:
|
||||
input[0].close()
|
||||
except AttributeError:
|
||||
pass
|
||||
input = open_uri(config['obi']['inputURI'], force_file=input_is_file)
|
||||
if input is None:
|
||||
raise Exception("Could not open input URI")
|
||||
|
||||
entries = input[1]
|
||||
i = 0
|
||||
for entry in entries :
|
||||
|
||||
@ -195,7 +260,6 @@ def run(config):
|
||||
|
||||
if entry is None: # error or exception handled at lower level, not raised because Python generators can't resume after any exception is raised
|
||||
if config['obi']['skiperror']:
|
||||
i-=1
|
||||
continue
|
||||
else:
|
||||
raise RollbackException("obi import error, rollbacking view", view)
|
||||
@ -204,115 +268,134 @@ def run(config):
|
||||
pb(i)
|
||||
elif not i%50000:
|
||||
logger("info", "Imported %d entries", i)
|
||||
|
||||
if NUC_SEQS_view:
|
||||
id_col[i] = entry.id
|
||||
def_col[i] = entry.definition
|
||||
seq_col[i] = entry.seq
|
||||
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
|
||||
if i == 0:
|
||||
get_quality = QUALITY_COLUMN in entry
|
||||
|
||||
try:
|
||||
|
||||
if NUC_SEQS_view:
|
||||
id_col[i] = entry.id
|
||||
def_col[i] = entry.definition
|
||||
seq_col[i] = entry.seq
|
||||
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
|
||||
if i == 0:
|
||||
get_quality = QUALITY_COLUMN in entry
|
||||
if get_quality:
|
||||
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
|
||||
qual_col = view[QUALITY_COLUMN]
|
||||
if get_quality:
|
||||
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
|
||||
qual_col = view[QUALITY_COLUMN]
|
||||
if get_quality:
|
||||
qual_col[i] = entry.quality
|
||||
|
||||
for tag in entry :
|
||||
|
||||
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
|
||||
|
||||
value = entry[tag]
|
||||
if tag == b"taxid":
|
||||
tag = TAXID_COLUMN
|
||||
if tag == b"count":
|
||||
tag = COUNT_COLUMN
|
||||
|
||||
if tag not in dcols :
|
||||
|
||||
value_type = type(value)
|
||||
nb_elts = 1
|
||||
value_obitype = OBI_VOID
|
||||
|
||||
if value_type == dict or value_type == list :
|
||||
nb_elts = len(value)
|
||||
elt_names = list(value)
|
||||
else :
|
||||
nb_elts = 1
|
||||
elt_names = None
|
||||
|
||||
value_obitype = get_obitype(value)
|
||||
|
||||
if value_obitype != OBI_VOID :
|
||||
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
|
||||
|
||||
# Fill value
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
# TODO else log error?
|
||||
|
||||
else :
|
||||
|
||||
rewrite = False
|
||||
|
||||
# Check type adequation
|
||||
old_type = dcols[tag][1]
|
||||
new_type = OBI_VOID
|
||||
new_type = update_obitype(old_type, value)
|
||||
if old_type != new_type :
|
||||
rewrite = True
|
||||
|
||||
try:
|
||||
# Fill value
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
except IndexError :
|
||||
|
||||
qual_col[i] = entry.quality
|
||||
|
||||
for tag in entry :
|
||||
|
||||
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
|
||||
|
||||
value = entry[tag]
|
||||
if tag == b"taxid":
|
||||
tag = TAXID_COLUMN
|
||||
if tag == b"count":
|
||||
tag = COUNT_COLUMN
|
||||
if tag[:7] == b"merged_":
|
||||
tag = MERGED_PREFIX+tag[7:]
|
||||
|
||||
if tag not in dcols :
|
||||
|
||||
value_type = type(value)
|
||||
old_column = dcols[tag][0]
|
||||
old_nb_elements_per_line = old_column.nb_elements_per_line
|
||||
new_nb_elements_per_line = 0
|
||||
old_elements_names = old_column.elements_names
|
||||
new_elements_names = None
|
||||
nb_elts = 1
|
||||
value_obitype = OBI_VOID
|
||||
|
||||
if value_type == dict or value_type == list :
|
||||
nb_elts = len(value)
|
||||
elt_names = list(value)
|
||||
else :
|
||||
nb_elts = 1
|
||||
elt_names = None
|
||||
|
||||
value_obitype = get_obitype(value)
|
||||
|
||||
if value_obitype != OBI_VOID :
|
||||
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
|
||||
|
||||
# Fill value
|
||||
if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
|
||||
value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
# TODO else log error?
|
||||
|
||||
#####################################################################
|
||||
|
||||
# Check the length and keys of column lines if needed
|
||||
if value_type == dict : # Check dictionary keys
|
||||
for k in value :
|
||||
if k not in old_elements_names :
|
||||
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
|
||||
rewrite = True
|
||||
break
|
||||
|
||||
elif value_type == list or value_type == tuple : # Check vector length
|
||||
if old_nb_elements_per_line < len(value) :
|
||||
new_nb_elements_per_line = len(value)
|
||||
rewrite = True
|
||||
|
||||
#####################################################################
|
||||
|
||||
if rewrite :
|
||||
if new_nb_elements_per_line == 0 and new_elements_names is not None :
|
||||
new_nb_elements_per_line = len(new_elements_names)
|
||||
|
||||
# Reset obierrno
|
||||
obi_errno = 0
|
||||
|
||||
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
|
||||
new_data_type=new_type,
|
||||
new_nb_elements_per_line=new_nb_elements_per_line,
|
||||
new_elements_names=new_elements_names,
|
||||
rewrite_last_line=False),
|
||||
new_type)
|
||||
|
||||
# Update the dictionary:
|
||||
for t in dcols :
|
||||
dcols[t] = (view[t], dcols[t][1])
|
||||
|
||||
else :
|
||||
|
||||
rewrite = False
|
||||
|
||||
# Check type adequation
|
||||
old_type = dcols[tag][1]
|
||||
new_type = OBI_VOID
|
||||
new_type = update_obitype(old_type, value)
|
||||
if old_type != new_type :
|
||||
rewrite = True
|
||||
|
||||
try:
|
||||
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
|
||||
if type(value) == dict and \
|
||||
dcols[tag][0].nb_elements_per_line == 1 \
|
||||
and set(dcols[tag][0].elements_names) != set(value.keys()) :
|
||||
raise IndexError # trigger column rewrite
|
||||
|
||||
# Fill value
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
|
||||
except IndexError :
|
||||
|
||||
value_type = type(value)
|
||||
old_column = dcols[tag][0]
|
||||
old_nb_elements_per_line = old_column.nb_elements_per_line
|
||||
new_nb_elements_per_line = 0
|
||||
old_elements_names = old_column.elements_names
|
||||
new_elements_names = None
|
||||
|
||||
#####################################################################
|
||||
|
||||
# Check the length and keys of column lines if needed
|
||||
if value_type == dict : # Check dictionary keys
|
||||
for k in value :
|
||||
if k not in old_elements_names :
|
||||
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
|
||||
rewrite = True
|
||||
break
|
||||
|
||||
elif value_type == list or value_type == tuple : # Check vector length
|
||||
if old_nb_elements_per_line < len(value) :
|
||||
new_nb_elements_per_line = len(value)
|
||||
rewrite = True
|
||||
|
||||
#####################################################################
|
||||
|
||||
if rewrite :
|
||||
if new_nb_elements_per_line == 0 and new_elements_names is not None :
|
||||
new_nb_elements_per_line = len(new_elements_names)
|
||||
|
||||
# Reset obierrno
|
||||
obi_errno = 0
|
||||
|
||||
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
|
||||
new_data_type=new_type,
|
||||
new_nb_elements_per_line=new_nb_elements_per_line,
|
||||
new_elements_names=new_elements_names,
|
||||
rewrite_last_line=False),
|
||||
new_type)
|
||||
|
||||
# Update the dictionary:
|
||||
for t in dcols :
|
||||
dcols[t] = (view[t], dcols[t][1])
|
||||
|
||||
# Fill value
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
except Exception as e:
|
||||
print("\nCould not import sequence id:", entry.id, "(error raised:", e, ")")
|
||||
if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
|
||||
raise e
|
||||
else:
|
||||
pass
|
||||
|
||||
i+=1
|
||||
|
||||
if pb is not None:
|
||||
@ -333,7 +416,7 @@ def run(config):
|
||||
except AttributeError:
|
||||
pass
|
||||
try:
|
||||
output[0].close()
|
||||
output[0].close(force=True)
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
|
@ -46,5 +46,5 @@ def run(config):
|
||||
process.wait()
|
||||
|
||||
iview.close()
|
||||
input[0].close()
|
||||
input[0].close(force=True)
|
||||
|
||||
|
@ -34,8 +34,10 @@ def run(config):
|
||||
if input[2] == DMS and not config['ls']['longformat']:
|
||||
dms = input[0]
|
||||
l = []
|
||||
for view in input[0]:
|
||||
l.append(tostr(view) + "\t(Date created: " + str(bytes2str_object(dms[view].comments["Date created"]))+")")
|
||||
for viewname in input[0]:
|
||||
view = dms[viewname]
|
||||
l.append(tostr(viewname) + "\t(Date created: " + str(bytes2str_object(view.comments["Date created"]))+")")
|
||||
view.close()
|
||||
l.sort()
|
||||
for v in l:
|
||||
print(v)
|
||||
@ -51,4 +53,5 @@ def run(config):
|
||||
if config['ls']['longformat'] and len(input[1].comments) > 0:
|
||||
print("\n### Comments:")
|
||||
print(str(input[1].comments))
|
||||
|
||||
|
||||
input[0].close(force=True)
|
||||
|
@ -42,7 +42,9 @@ def addOptions(parser):
|
||||
metavar="<URI>",
|
||||
type=str,
|
||||
default=None,
|
||||
help="URI to the view containing the samples definition (with tags, primers, sample names,...)")
|
||||
required=True,
|
||||
help="URI to the view containing the samples definition (with tags, primers, sample names,...).\n"
|
||||
"\nWarning: primer lengths must be less than or equal to 32")
|
||||
|
||||
group.add_argument('-R', '--reverse-reads',
|
||||
action="store", dest="ngsfilter:reverse",
|
||||
@ -172,6 +174,13 @@ cdef read_info_view(info_view, max_errors=2, verbose=False, not_aligned=False):
|
||||
primer_list = []
|
||||
i=0
|
||||
for p in info_view:
|
||||
|
||||
# Check primer length: should not be longer than 32, the max allowed by the apat lib
|
||||
if len(p[b'forward_primer']) > 32:
|
||||
raise RollbackException("Error: primers can not be longer than 32bp, rollbacking views")
|
||||
if len(p[b'reverse_primer']) > 32:
|
||||
raise RollbackException("Error: primers can not be longer than 32bp, rollbacking views")
|
||||
|
||||
forward=Primer(p[b'forward_primer'],
|
||||
len(p[b'forward_tag']) if (b'forward_tag' in p and p[b'forward_tag']!=None) else None,
|
||||
True,
|
||||
@ -497,7 +506,7 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
|
||||
sample=None
|
||||
|
||||
if sample is None:
|
||||
sequences[0][b'error']=b"No tags found"
|
||||
sequences[0][b'error']=b"No sample with that tag combination"
|
||||
return False, sequences[0]
|
||||
|
||||
sequences[0].update(sample)
|
||||
@ -594,7 +603,13 @@ def run(config):
|
||||
pb = ProgressBar(entries_len, config, seconde=5)
|
||||
|
||||
# Check and store primers and tags
|
||||
infos, primer_list = read_info_view(info_view, max_errors=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option
|
||||
try:
|
||||
infos, primer_list = read_info_view(info_view, max_errors=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option
|
||||
except RollbackException, e:
|
||||
if unidentified is not None:
|
||||
raise RollbackException("obi ngsfilter error, rollbacking views: "+str(e), o_view, unidentified)
|
||||
else:
|
||||
raise RollbackException("obi ngsfilter error, rollbacking view: "+str(e), o_view)
|
||||
|
||||
aligner = Primer_search(primer_list, config['ngsfilter']['error'])
|
||||
|
||||
@ -652,11 +667,11 @@ def run(config):
|
||||
#print("\n\nOutput view:\n````````````", file=sys.stderr)
|
||||
#print(repr(o_view), file=sys.stderr)
|
||||
|
||||
input[0].close()
|
||||
output[0].close()
|
||||
info_input[0].close()
|
||||
input[0].close(force=True)
|
||||
output[0].close(force=True)
|
||||
info_input[0].close(force=True)
|
||||
if unidentified is not None:
|
||||
unidentified_input[0].close()
|
||||
unidentified_input[0].close(force=True)
|
||||
aligner.free()
|
||||
|
||||
logger("info", "Done.")
|
||||
|
@ -141,7 +141,7 @@ def run(config):
|
||||
# If the input and the output DMS are different, delete the temporary imported view used to create the final view
|
||||
if i_dms != o_dms:
|
||||
View.delete_view(i_dms, o_view_name)
|
||||
o_dms.close()
|
||||
i_dms.close()
|
||||
o_dms.close(force=True)
|
||||
i_dms.close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
@ -251,7 +251,7 @@ def run(config):
|
||||
for i in range(len(sorted_stats)):
|
||||
c = sorted_stats[i][0]
|
||||
for v in c:
|
||||
if v is not None:
|
||||
if type(v) == bytes:
|
||||
print(pcat % tostr(v)+"\t", end="")
|
||||
else:
|
||||
print(pcat % str(v)+"\t", end="")
|
||||
@ -268,6 +268,6 @@ def run(config):
|
||||
print("%7d" %catcount[c], end="")
|
||||
print("%9d" %totcount[c])
|
||||
|
||||
input[0].close()
|
||||
input[0].close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
@ -106,7 +106,7 @@ def run(config):
|
||||
# If the input and the output DMS are different, delete the temporary imported view used to create the final view
|
||||
if i_dms != o_dms:
|
||||
View.delete_view(i_dms, o_view_name)
|
||||
o_dms.close()
|
||||
i_dms.close()
|
||||
o_dms.close(force=True)
|
||||
i_dms.close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
@ -529,7 +529,7 @@ def run(config):
|
||||
test_taxo(config, infos)
|
||||
|
||||
infos['view'].close()
|
||||
infos['dms'].close()
|
||||
infos['dms'].close(force=True)
|
||||
shutil.rmtree(config['obi']['defaultdms']+'.obidms', ignore_errors=True)
|
||||
|
||||
print("Done.")
|
||||
|
@ -5,5 +5,5 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
|
||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||
|
||||
|
||||
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy)
|
||||
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*, int max_elts=*)
|
||||
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy, dict config)
|
||||
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, dict config, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*, int max_elts=*)
|
||||
|
@ -56,7 +56,7 @@ def addOptions(parser):
|
||||
"(option can be used several times).")
|
||||
|
||||
|
||||
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy, dict config) :
|
||||
|
||||
cdef int taxid
|
||||
cdef Nuc_Seq_Stored seq
|
||||
@ -69,7 +69,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
cdef object gn_sn
|
||||
cdef object fa_sn
|
||||
|
||||
# Create columns
|
||||
# Create columns and save them for efficiency
|
||||
if b"species" in o_view and o_view[b"species"].data_type_int != OBI_INT :
|
||||
o_view.delete_column(b"species")
|
||||
if b"species" not in o_view:
|
||||
@ -77,6 +77,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
b"species",
|
||||
OBI_INT
|
||||
)
|
||||
species_column = o_view[b"species"]
|
||||
|
||||
if b"genus" in o_view and o_view[b"genus"].data_type_int != OBI_INT :
|
||||
o_view.delete_column(b"genus")
|
||||
@ -85,6 +86,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
b"genus",
|
||||
OBI_INT
|
||||
)
|
||||
genus_column = o_view[b"genus"]
|
||||
|
||||
if b"family" in o_view and o_view[b"family"].data_type_int != OBI_INT :
|
||||
o_view.delete_column(b"family")
|
||||
@ -93,6 +95,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
b"family",
|
||||
OBI_INT
|
||||
)
|
||||
family_column = o_view[b"family"]
|
||||
|
||||
if b"species_name" in o_view and o_view[b"species_name"].data_type_int != OBI_STR :
|
||||
o_view.delete_column(b"species_name")
|
||||
@ -101,6 +104,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
b"species_name",
|
||||
OBI_STR
|
||||
)
|
||||
species_name_column = o_view[b"species_name"]
|
||||
|
||||
if b"genus_name" in o_view and o_view[b"genus_name"].data_type_int != OBI_STR :
|
||||
o_view.delete_column(b"genus_name")
|
||||
@ -109,6 +113,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
b"genus_name",
|
||||
OBI_STR
|
||||
)
|
||||
genus_name_column = o_view[b"genus_name"]
|
||||
|
||||
if b"family_name" in o_view and o_view[b"family_name"].data_type_int != OBI_STR :
|
||||
o_view.delete_column(b"family_name")
|
||||
@ -117,6 +122,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
b"family_name",
|
||||
OBI_STR
|
||||
)
|
||||
family_name_column = o_view[b"family_name"]
|
||||
|
||||
if b"rank" in o_view and o_view[b"rank"].data_type_int != OBI_STR :
|
||||
o_view.delete_column(b"rank")
|
||||
@ -125,6 +131,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
b"rank",
|
||||
OBI_STR
|
||||
)
|
||||
rank_column = o_view[b"rank"]
|
||||
|
||||
if b"scientific_name" in o_view and o_view[b"scientific_name"].data_type_int != OBI_STR :
|
||||
o_view.delete_column(b"scientific_name")
|
||||
@ -133,9 +140,15 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
b"scientific_name",
|
||||
OBI_STR
|
||||
)
|
||||
|
||||
for seq in o_view:
|
||||
PyErr_CheckSignals()
|
||||
scientific_name_column = o_view[b"scientific_name"]
|
||||
|
||||
# Initialize the progress bar
|
||||
pb = ProgressBar(len(o_view), config, seconde=5)
|
||||
|
||||
i=0
|
||||
for seq in o_view:
|
||||
PyErr_CheckSignals()
|
||||
pb(i)
|
||||
if MERGED_TAXID_COLUMN in seq :
|
||||
m_taxids = []
|
||||
m_taxids_dict = seq[MERGED_TAXID_COLUMN]
|
||||
@ -165,20 +178,23 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
||||
else:
|
||||
fa_sn = None
|
||||
tfa = None
|
||||
|
||||
seq[b"species"] = tsp
|
||||
seq[b"genus"] = tgn
|
||||
seq[b"family"] = tfa
|
||||
|
||||
seq[b"species_name"] = sp_sn
|
||||
seq[b"genus_name"] = gn_sn
|
||||
seq[b"family_name"] = fa_sn
|
||||
|
||||
seq[b"rank"] = taxonomy.get_rank(taxid)
|
||||
seq[b"scientific_name"] = taxonomy.get_scientific_name(taxid)
|
||||
species_column[i] = tsp
|
||||
genus_column[i] = tgn
|
||||
family_column[i] = tfa
|
||||
|
||||
species_name_column[i] = sp_sn
|
||||
genus_name_column[i] = gn_sn
|
||||
family_name_column[i] = fa_sn
|
||||
|
||||
rank_column[i] = taxonomy.get_rank(taxid)
|
||||
scientific_name_column[i] = taxonomy.get_scientific_name(taxid)
|
||||
i+=1
|
||||
|
||||
pb(len(o_view), force=True)
|
||||
|
||||
|
||||
|
||||
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) :
|
||||
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, dict config, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) :
|
||||
|
||||
cdef int i
|
||||
cdef int k
|
||||
@ -187,6 +203,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
cdef int u_idx
|
||||
cdef int i_idx
|
||||
cdef int i_count
|
||||
cdef int o_count
|
||||
cdef str key_str
|
||||
cdef bytes key
|
||||
cdef bytes mkey
|
||||
@ -209,7 +226,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
cdef Nuc_Seq_Stored i_seq
|
||||
cdef Nuc_Seq_Stored o_seq
|
||||
cdef Nuc_Seq_Stored u_seq
|
||||
cdef Column i_col
|
||||
cdef Column i_seq_col
|
||||
cdef Column i_id_col
|
||||
cdef Column i_taxid_col
|
||||
@ -217,6 +233,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
cdef Column o_id_col
|
||||
cdef Column o_taxid_dist_col
|
||||
cdef Column o_merged_col
|
||||
cdef Column o_count_col
|
||||
cdef Column i_count_col
|
||||
cdef Column_line i_mcol
|
||||
cdef object taxid_dist_dict
|
||||
cdef object iter_view
|
||||
@ -252,7 +270,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
mergedKeys_m = []
|
||||
for k in range(k_count):
|
||||
mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k])
|
||||
|
||||
|
||||
# Check that not trying to remerge without total count information
|
||||
for key in mergedKeys_m:
|
||||
if key in view and COUNT_COLUMN not in view:
|
||||
raise Exception("\n>>>>\nError: trying to re-merge tags without total count tag. Run obi annotate to add the count tag from the relevant merged tag, i.e.: \nobi annotate --set-tag COUNT:'sum([value for key,value in sequence['MERGED_sample'].items()])' dms/input dms/output\n")
|
||||
|
||||
if categories is None:
|
||||
categories = []
|
||||
|
||||
@ -320,7 +343,11 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
for k in range(k_count):
|
||||
key = mergedKeys[k]
|
||||
merged_col_name = mergedKeys_m[k]
|
||||
i_col = view[key]
|
||||
|
||||
if merged_col_name in view:
|
||||
i_col = view[merged_col_name]
|
||||
else:
|
||||
i_col = view[key]
|
||||
|
||||
if merged_infos[merged_col_name]['nb_elts'] > max_elts:
|
||||
str_merged_cols.append(merged_col_name)
|
||||
@ -374,23 +401,30 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
alias=MERGED_COLUMN
|
||||
)
|
||||
|
||||
# Keep columns that are going to be used a lot in variables
|
||||
# Keep columns in variables for efficiency
|
||||
o_id_col = o_view[ID_COLUMN]
|
||||
if TAXID_DIST_COLUMN in o_view:
|
||||
o_taxid_dist_col = o_view[TAXID_DIST_COLUMN]
|
||||
if MERGED_COLUMN in o_view:
|
||||
o_merged_col = o_view[MERGED_COLUMN]
|
||||
if COUNT_COLUMN not in o_view:
|
||||
Column.new_column(o_view,
|
||||
COUNT_COLUMN,
|
||||
OBI_INT)
|
||||
o_count_col = o_view[COUNT_COLUMN]
|
||||
if COUNT_COLUMN in view:
|
||||
i_count_col = view[COUNT_COLUMN]
|
||||
|
||||
pb(len(view), force=True)
|
||||
print("")
|
||||
logger("info", "Second browsing through the input")
|
||||
# Initialize the progress bar
|
||||
pb = ProgressBar(len(uniques), seconde=5)
|
||||
pb = ProgressBar(len(view), seconde=5)
|
||||
o_idx = 0
|
||||
total_treated = 0
|
||||
|
||||
for unique_id in uniques :
|
||||
PyErr_CheckSignals()
|
||||
pb(o_idx)
|
||||
|
||||
merged_sequences = uniques[unique_id]
|
||||
|
||||
@ -407,7 +441,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
merged_list = list(set(merged_list)) # deduplicate the list
|
||||
o_merged_col[o_idx] = merged_list
|
||||
|
||||
o_seq[COUNT_COLUMN] = 0
|
||||
o_count = 0
|
||||
|
||||
if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None:
|
||||
taxid_dist_dict = i_taxid_dist_col[u_idx]
|
||||
@ -419,16 +453,17 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
merged_dict[mkey] = {}
|
||||
|
||||
for i_idx in merged_sequences:
|
||||
|
||||
pb(total_treated)
|
||||
|
||||
i_id = i_id_col[i_idx]
|
||||
i_seq = view[i_idx]
|
||||
|
||||
if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
|
||||
if COUNT_COLUMN not in i_seq or i_count_col[i_idx] is None:
|
||||
i_count = 1
|
||||
else:
|
||||
i_count = i_seq[COUNT_COLUMN]
|
||||
i_count = i_count_col[i_idx]
|
||||
|
||||
o_seq[COUNT_COLUMN] += i_count
|
||||
o_count += i_count
|
||||
|
||||
for k in range(k_count):
|
||||
|
||||
@ -463,44 +498,54 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
||||
mcol[key2] = i_mcol[key2]
|
||||
else:
|
||||
mcol[key2] = mcol[key2] + i_mcol[key2]
|
||||
|
||||
# Write taxid_dist
|
||||
if mergeIds and TAXID_COLUMN in mergedKeys:
|
||||
if TAXID_DIST_COLUMN in str_merged_cols:
|
||||
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
|
||||
else:
|
||||
o_taxid_dist_col[o_idx] = taxid_dist_dict
|
||||
|
||||
# Write merged dicts
|
||||
for mkey in merged_dict:
|
||||
if mkey in str_merged_cols:
|
||||
mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
|
||||
else:
|
||||
mkey_cols[mkey][o_idx] = merged_dict[mkey]
|
||||
# Sets NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools
|
||||
#for key in mkey_cols[mkey][o_idx]:
|
||||
# if mkey_cols[mkey][o_idx][key] is None:
|
||||
# mkey_cols[mkey][o_idx][key] = 0
|
||||
|
||||
|
||||
for key in i_seq.keys():
|
||||
# Delete informations that differ between the merged sequences
|
||||
# TODO make special columns list?
|
||||
# TODO make special columns list? // could be more efficient
|
||||
if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \
|
||||
and key not in merged_dict :
|
||||
o_seq[key] = None
|
||||
|
||||
total_treated += 1
|
||||
|
||||
# Write merged dicts
|
||||
for mkey in merged_dict:
|
||||
if mkey in str_merged_cols:
|
||||
mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
|
||||
else:
|
||||
mkey_cols[mkey][o_idx] = merged_dict[mkey]
|
||||
# Sets NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools
|
||||
#for key in mkey_cols[mkey][o_idx]:
|
||||
# if mkey_cols[mkey][o_idx][key] is None:
|
||||
# mkey_cols[mkey][o_idx][key] = 0
|
||||
|
||||
# Write taxid_dist
|
||||
if mergeIds and TAXID_COLUMN in mergedKeys:
|
||||
if TAXID_DIST_COLUMN in str_merged_cols:
|
||||
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
|
||||
else:
|
||||
o_taxid_dist_col[o_idx] = taxid_dist_dict
|
||||
|
||||
o_count_col[o_idx] = o_count
|
||||
o_idx += 1
|
||||
|
||||
pb(len(view), force=True)
|
||||
|
||||
# Deletes quality columns if there is one because the matching between sequence and quality will be broken (quality set to NA when sequence not)
|
||||
if QUALITY_COLUMN in view:
|
||||
o_view.delete_column(QUALITY_COLUMN)
|
||||
if REVERSE_QUALITY_COLUMN in view:
|
||||
o_view.delete_column(REVERSE_QUALITY_COLUMN)
|
||||
|
||||
# Delete old columns that are now merged
|
||||
for k in range(k_count):
|
||||
if mergedKeys[k] in o_view:
|
||||
o_view.delete_column(mergedKeys[k])
|
||||
|
||||
if taxonomy is not None:
|
||||
print("") # TODO because in the middle of progress bar. Better solution?
|
||||
logger("info", "Merging taxonomy classification")
|
||||
merge_taxonomy_classification(o_view, taxonomy)
|
||||
merge_taxonomy_classification(o_view, taxonomy, config)
|
||||
|
||||
|
||||
|
||||
@ -546,12 +591,12 @@ def run(config):
|
||||
# Initialize the progress bar
|
||||
pb = ProgressBar(len(entries), config, seconde=5)
|
||||
|
||||
try:
|
||||
uniq_sequences(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])
|
||||
except Exception, e:
|
||||
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
|
||||
if len(entries) > 0:
|
||||
try:
|
||||
uniq_sequences(entries, o_view, pb, config, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])
|
||||
except Exception, e:
|
||||
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
|
||||
|
||||
pb(len(entries), force=True)
|
||||
print("", file=sys.stderr)
|
||||
|
||||
# Save command config in View and DMS comments
|
||||
@ -567,8 +612,8 @@ def run(config):
|
||||
#print("\n\nOutput view:\n````````````", file=sys.stderr)
|
||||
#print(repr(o_view), file=sys.stderr)
|
||||
|
||||
input[0].close()
|
||||
output[0].close()
|
||||
input[0].close(force=True)
|
||||
output[0].close(force=True)
|
||||
|
||||
logger("info", "Done.")
|
||||
|
||||
|
@ -63,6 +63,8 @@ cdef extern from "obidmscolumn.h" nogil:
|
||||
|
||||
char* obi_get_elements_names(OBIDMS_column_p column)
|
||||
|
||||
char* obi_column_formatted_infos(OBIDMS_column_p column)
|
||||
|
||||
index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name)
|
||||
|
||||
int obi_column_write_comments(OBIDMS_column_p column, const char* comments)
|
||||
|
@ -102,7 +102,7 @@ cdef extern from "obiview.h" nogil:
|
||||
const_char_p comments,
|
||||
bint create)
|
||||
|
||||
int obi_view_delete_column(Obiview_p view, const_char_p column_name)
|
||||
int obi_view_delete_column(Obiview_p view, const_char_p column_name, bint delete_file)
|
||||
|
||||
OBIDMS_column_p obi_view_get_column(Obiview_p view, const_char_p column_name)
|
||||
|
||||
|
@ -14,6 +14,7 @@ from ..capi.obidms cimport obi_import_column
|
||||
from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \
|
||||
obi_close_column, \
|
||||
obi_get_elements_names, \
|
||||
obi_column_formatted_infos, \
|
||||
obi_column_write_comments
|
||||
|
||||
from ..capi.obiutils cimport obi_format_date
|
||||
@ -38,7 +39,7 @@ from obitools3.utils cimport tobytes, \
|
||||
|
||||
from obitools3.dms.column import typed_column
|
||||
|
||||
from libc.stdlib cimport free
|
||||
from libc.stdlib cimport free
|
||||
|
||||
import importlib
|
||||
import inspect
|
||||
@ -288,9 +289,15 @@ cdef class Column(OBIWrapper) :
|
||||
@OBIWrapper.checkIsActive
|
||||
def __repr__(self) :
|
||||
cdef bytes s
|
||||
#cdef char* s_b
|
||||
#cdef str s_str
|
||||
#s_b = obi_column_formatted_infos(self.pointer())
|
||||
#s_str = bytes2str(s_b)
|
||||
#free(s_b)
|
||||
s = self._alias + b", data type: " + self.data_type
|
||||
#return s_str
|
||||
return bytes2str(s)
|
||||
|
||||
|
||||
|
||||
def close(self): # TODO discuss, can't be called bc then bug when closing view that tries to close it in C
|
||||
|
||||
|
@ -94,16 +94,16 @@ cdef class DMS(OBIWrapper):
|
||||
return dms
|
||||
|
||||
|
||||
def close(self) :
|
||||
def close(self, force=False) :
|
||||
'''
|
||||
Closes the DMS instance and free the associated memory
|
||||
Closes the DMS instance and free the associated memory (no counter, closing is final)
|
||||
|
||||
The `close` method is automatically called by the object destructor.
|
||||
'''
|
||||
cdef OBIDMS_p pointer = self.pointer()
|
||||
if self.active() :
|
||||
OBIWrapper.close(self)
|
||||
if (obi_close_dms(pointer, False)) < 0 :
|
||||
if (obi_close_dms(pointer, force=force)) < 0 :
|
||||
raise Exception("Problem closing an OBIDMS")
|
||||
|
||||
|
||||
@ -227,7 +227,9 @@ cdef class DMS(OBIWrapper):
|
||||
cdef str s
|
||||
s=""
|
||||
for view_name in self.keys():
|
||||
s = s + repr(self.get_view(view_name)) + "\n"
|
||||
view = self.get_view(view_name)
|
||||
s = s + repr(view) + "\n"
|
||||
view.close()
|
||||
return s
|
||||
|
||||
|
||||
@ -254,12 +256,13 @@ cdef class DMS(OBIWrapper):
|
||||
# bash command history property getter
|
||||
@property
|
||||
def bash_history(self):
|
||||
s = b"#!/bin/bash\n\n"
|
||||
#s = b"#!${bash}/bin/bash\n\n"
|
||||
s = b""
|
||||
first = True
|
||||
for command in self.command_line_history:
|
||||
s+=b"#"
|
||||
s+=command[b"time"]
|
||||
s+=b"\n"
|
||||
s+=b"\nobi "
|
||||
s+=command[b"command"]
|
||||
s+=b"\n"
|
||||
return s
|
||||
|
@ -22,7 +22,8 @@ cdef class View(OBIWrapper):
|
||||
cdef inline Obiview_p pointer(self)
|
||||
|
||||
cpdef delete_column(self,
|
||||
object column_name)
|
||||
object column_name,
|
||||
bint delete_file=*)
|
||||
|
||||
cpdef rename_column(self,
|
||||
object current_name,
|
||||
|
@ -227,7 +227,8 @@ cdef class View(OBIWrapper) :
|
||||
|
||||
|
||||
cpdef delete_column(self,
|
||||
object column_name) :
|
||||
object column_name,
|
||||
bint delete_file=False) :
|
||||
|
||||
cdef bytes column_name_b = tobytes(column_name)
|
||||
|
||||
@ -239,7 +240,7 @@ cdef class View(OBIWrapper) :
|
||||
col.close()
|
||||
|
||||
# Remove the column from the view which closes the C structure
|
||||
if obi_view_delete_column(self.pointer(), column_name_b) < 0 :
|
||||
if obi_view_delete_column(self.pointer(), column_name_b, delete_file) < 0 :
|
||||
raise RollbackException("Problem deleting column %s from a view",
|
||||
bytes2str(column_name_b), self)
|
||||
|
||||
@ -297,11 +298,17 @@ cdef class View(OBIWrapper) :
|
||||
nb_elements_per_line=new_nb_elements_per_line, elements_names=new_elements_names,
|
||||
comments=old_column.comments, alias=column_name_b+tobytes('___new___'))
|
||||
|
||||
switch_to_dict = old_column.nb_elements_per_line == 1 and new_nb_elements_per_line > 1
|
||||
ori_key = old_column._elements_names[0]
|
||||
|
||||
for i in range(length) :
|
||||
new_column[i] = old_column[i]
|
||||
if switch_to_dict :
|
||||
new_column[i] = {ori_key: old_column[i]}
|
||||
else:
|
||||
new_column[i] = old_column[i]
|
||||
|
||||
# Remove old column from view
|
||||
self.delete_column(column_name_b)
|
||||
self.delete_column(column_name_b, delete_file=True)
|
||||
|
||||
# Rename new
|
||||
new_column.name = column_name_b
|
||||
@ -519,11 +526,12 @@ cdef class View(OBIWrapper) :
|
||||
# bash command history property getter
|
||||
@property
|
||||
def bash_history(self):
|
||||
s = b"#!/bin/bash\n\n"
|
||||
s = b""
|
||||
first = True
|
||||
for level in self.view_history:
|
||||
command_list = [level[input][b"command_line"] for input in level.keys()]
|
||||
for command in command_list:
|
||||
s+=b"obi "
|
||||
s+=command
|
||||
s+=b"\n"
|
||||
return s
|
||||
|
@ -188,7 +188,7 @@ def buildConsensus(ali, seq, ref_tags=None):
|
||||
seq[b'shift']=ali.shift
|
||||
else:
|
||||
if len(ali[0])>999: # TODO why?
|
||||
raise AssertionError,"Too long alignemnt"
|
||||
raise AssertionError,"Too long alignment"
|
||||
|
||||
ic=IterOnConsensus(ali)
|
||||
|
||||
@ -250,11 +250,21 @@ def buildJoinedSequence(ali, reverse, seq, forward=None):
|
||||
quality.extend(reverse.quality)
|
||||
seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality)
|
||||
seq[b"score"]=ali.score
|
||||
seq[b"ali_direction"]=ali.direction
|
||||
if len(ali.direction) > 0:
|
||||
seq[b"ali_direction"]=ali.direction
|
||||
else:
|
||||
seq[b"ali_direction"]=None
|
||||
seq[b"mode"]=b"joined"
|
||||
seq[b"pairedend_limit"]=len(forward)
|
||||
seq[b"pairedend_limit"]=len(forward)
|
||||
seq[b"ali_length"] = ali.consensus_len
|
||||
if ali.consensus_len > 0:
|
||||
seq[b"score_norm"]=float(ali.score)/ali.consensus_len
|
||||
else:
|
||||
seq[b"score_norm"]=0.0
|
||||
|
||||
for tag in forward:
|
||||
if tag != REVERSE_SEQUENCE_COLUMN and tag != REVERSE_QUALITY_COLUMN:
|
||||
if tag != REVERSE_SEQUENCE_COLUMN and tag != REVERSE_QUALITY_COLUMN and \
|
||||
tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN:
|
||||
seq[tag] = forward[tag]
|
||||
return seq
|
||||
|
||||
|
@ -156,6 +156,9 @@ def emblIterator_file(lineiterator,
|
||||
yield seq
|
||||
read+=1
|
||||
|
||||
# Last sequence
|
||||
seq = emblParser(entry)
|
||||
|
||||
yield seq
|
||||
|
||||
free(entry)
|
||||
@ -174,7 +177,7 @@ def emblIterator_dir(dir_path,
|
||||
for filename in files:
|
||||
if read==only:
|
||||
return
|
||||
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
|
||||
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
|
||||
f = uopen(filename)
|
||||
if only is not None:
|
||||
only_f = only-read
|
||||
|
@ -104,6 +104,7 @@ def fastaNucIterator(lineiterator,
|
||||
cdef bytes sequence
|
||||
cdef int skipped, ionly, read
|
||||
cdef Nuc_Seq seq
|
||||
cdef bint stop
|
||||
|
||||
if only is None:
|
||||
ionly = -1
|
||||
@ -130,7 +131,8 @@ def fastaNucIterator(lineiterator,
|
||||
else:
|
||||
line = firstline
|
||||
|
||||
while True:
|
||||
stop=False
|
||||
while not stop:
|
||||
|
||||
if ionly >= 0 and read >= ionly:
|
||||
break
|
||||
@ -153,7 +155,7 @@ def fastaNucIterator(lineiterator,
|
||||
s.append(line[0:-1])
|
||||
line = next(iterator)
|
||||
except StopIteration:
|
||||
pass
|
||||
stop=True
|
||||
|
||||
sequence = b"".join(s)
|
||||
|
||||
|
@ -25,8 +25,9 @@ from libc.string cimport strcpy, strlen
|
||||
_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)
|
||||
|
||||
_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
|
||||
_seqMatcher = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M)
|
||||
_cleanSeq = re.compile(b'[ \n0-9]+')
|
||||
_seqMatcher = re.compile(b'ORIGIN.+(?=//\n)', re.DOTALL + re.M)
|
||||
_cleanSeq1 = re.compile(b'ORIGIN.+\n')
|
||||
_cleanSeq2 = re.compile(b'[ \n0-9]+')
|
||||
_acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M)
|
||||
_deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M)
|
||||
_cleanDe = re.compile(b'\n *')
|
||||
@ -42,7 +43,8 @@ def genbankParser(bytes text):
|
||||
ft = _featureMatcher.search(text).group()
|
||||
|
||||
s = _seqMatcher.search(text).group()
|
||||
s = _cleanSeq.sub(b'', s).upper()
|
||||
s = _cleanSeq1.sub(b'', s)
|
||||
s = _cleanSeq2.sub(b'', s)
|
||||
|
||||
acs = _acMatcher.search(text).group()
|
||||
acs = acs.split()
|
||||
@ -51,23 +53,23 @@ def genbankParser(bytes text):
|
||||
|
||||
de = _deMatcher.search(header).group()
|
||||
de = _cleanDe.sub(b' ',de).strip().strip(b'.')
|
||||
|
||||
|
||||
tags = {}
|
||||
extractTaxon(ft, tags)
|
||||
|
||||
seq = Nuc_Seq(ac,
|
||||
s,
|
||||
definition=de,
|
||||
quality=None,
|
||||
offset=-1,
|
||||
tags=tags)
|
||||
|
||||
except Exception as e:
|
||||
print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
|
||||
# Do not raise any Exception if you need the possibility to resume the generator
|
||||
# (Python generators can't resume after any exception is raised)
|
||||
return None
|
||||
|
||||
tags = {}
|
||||
extractTaxon(ft, tags)
|
||||
|
||||
seq = Nuc_Seq(ac,
|
||||
s,
|
||||
definition=de,
|
||||
quality=None,
|
||||
offset=-1,
|
||||
tags=tags)
|
||||
|
||||
|
||||
return seq
|
||||
|
||||
|
||||
@ -153,6 +155,9 @@ def genbankIterator_file(lineiterator,
|
||||
yield seq
|
||||
read+=1
|
||||
|
||||
# Last sequence
|
||||
seq = genbankParser(entry)
|
||||
|
||||
yield seq
|
||||
|
||||
free(entry)
|
||||
@ -168,10 +173,12 @@ def genbankIterator_dir(dir_path,
|
||||
read = 0
|
||||
read_files = 0
|
||||
files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))]
|
||||
files.extend([filename for filename in glob.glob(os.path.join(path, b'*.seq*'))]) # new genbank extension
|
||||
files = list(set(files))
|
||||
for filename in files:
|
||||
if read==only:
|
||||
return
|
||||
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
|
||||
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
|
||||
f = uopen(filename)
|
||||
if only is not None:
|
||||
only_f = only-read
|
||||
|
7
python/obitools3/uri/decode.pyx
Executable file → Normal file
7
python/obitools3/uri/decode.pyx
Executable file → Normal file
@ -171,7 +171,8 @@ Reads an URI and returns a tuple containing:
|
||||
def open_uri(uri,
|
||||
bint input=True,
|
||||
type newviewtype=View,
|
||||
dms_only=False):
|
||||
dms_only=False,
|
||||
force_file=False):
|
||||
|
||||
cdef bytes urib = tobytes(uri)
|
||||
cdef bytes scheme
|
||||
@ -195,9 +196,9 @@ def open_uri(uri,
|
||||
if 'obi' not in config:
|
||||
config['obi']={}
|
||||
|
||||
try:
|
||||
if not force_file and "defaultdms" in config["obi"]:
|
||||
default_dms=config["obi"]["defaultdms"]
|
||||
except KeyError:
|
||||
else:
|
||||
default_dms=None
|
||||
|
||||
try:
|
||||
|
@ -72,7 +72,7 @@ cpdef int count_entries(file, bytes format):
|
||||
return -1
|
||||
mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
|
||||
total_count += len(re.findall(sep, mmapped_file))
|
||||
if format != b"ngsfilter" and format != b"tabular":
|
||||
if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank":
|
||||
total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n)
|
||||
|
||||
except:
|
||||
|
@ -1,5 +1,5 @@
|
||||
major = 3
|
||||
minor = 0
|
||||
serial= '0-beta6'
|
||||
serial= '0b25'
|
||||
|
||||
version ="%d.%02d.%s" % (major,minor,serial)
|
||||
version ="%d.%d.%s" % (major,minor,serial)
|
||||
|
5
requirements.txt
Executable file
5
requirements.txt
Executable file
@ -0,0 +1,5 @@
|
||||
--extra-index-url https://pypi.python.org/simple/
|
||||
Cython>=0.24
|
||||
Sphinx>=1.2.0
|
||||
ipython>=3.0.0
|
||||
breathe>=4.0.0
|
26
setup.py
26
setup.py
@ -5,8 +5,9 @@ import re
|
||||
import subprocess
|
||||
|
||||
from distutils import log
|
||||
from distutils.core import setup
|
||||
|
||||
#from distutils.core import setup
|
||||
from setuptools import setup # to work with pip
|
||||
|
||||
from distutils.core import Extension
|
||||
from distutils.sysconfig import get_python_lib
|
||||
|
||||
@ -26,10 +27,11 @@ class Distribution(ori_Distribution):
|
||||
|
||||
ori_Distribution.__init__(self, attrs)
|
||||
|
||||
self.global_options.insert(0,('cobitools3', None, "intall location of the C library"
|
||||
self.global_options.insert(0,('cobitools3', None, "install location of the C library"
|
||||
))
|
||||
|
||||
from distutils.command.build import build as build_ori
|
||||
from setuptools.command.bdist_egg import bdist_egg as bdist_egg_ori
|
||||
from distutils.core import Command
|
||||
|
||||
|
||||
@ -70,6 +72,12 @@ class build(build_ori):
|
||||
build_ori.run(self)
|
||||
|
||||
|
||||
class bdist_egg(bdist_egg_ori):
|
||||
def run(self):
|
||||
self.run_command('build_clib')
|
||||
bdist_egg_ori.run(self)
|
||||
|
||||
|
||||
sys.path.append(os.path.abspath("python"))
|
||||
|
||||
|
||||
@ -88,9 +96,10 @@ PACKAGE = "OBITools3"
|
||||
VERSION = version
|
||||
AUTHOR = 'Celine Mercier'
|
||||
EMAIL = 'celine.mercier@metabarcoding.org'
|
||||
URL = "http://metabarcoding.org/obitools3"
|
||||
URL = "https://metabarcoding.org/obitools3"
|
||||
PLATFORMS = "posix"
|
||||
LICENSE = "CeCILL-V2"
|
||||
DESCRIPTION = "Tools and library for DNA metabarcoding",
|
||||
DESCRIPTION = "A package for the management of analyses and data in DNA metabarcoding."
|
||||
PYTHONMIN = '3.5'
|
||||
|
||||
SRC = 'python'
|
||||
@ -147,17 +156,24 @@ classifiers=['Development Status :: 4 - Beta',
|
||||
'Topic :: Utilities',
|
||||
]
|
||||
|
||||
with open("README.md", "r") as fh:
|
||||
long_description = fh.read()
|
||||
|
||||
setup(name=PACKAGE,
|
||||
description=DESCRIPTION,
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
classifiers=classifiers,
|
||||
version=VERSION,
|
||||
author=AUTHOR,
|
||||
author_email=EMAIL,
|
||||
platforms=PLATFORMS,
|
||||
license=LICENSE,
|
||||
url=URL,
|
||||
ext_modules=xx,
|
||||
distclass=Distribution,
|
||||
cmdclass={'build': build,
|
||||
'bdist_egg': bdist_egg,
|
||||
'build_clib': build_clib},
|
||||
cobitools3=get_python_lib(),
|
||||
packages = findPackage('python'),
|
||||
|
@ -157,7 +157,7 @@ int build_reference_db(const char* dms_name,
|
||||
ecotx_t* lca_2 = NULL;
|
||||
ecotx_t* lca = NULL;
|
||||
index_t idx1, idx2;
|
||||
index_t i, j, k;
|
||||
index_t i, j, k, count;
|
||||
int32_t taxid_array_length;
|
||||
int32_t score_array_length;
|
||||
int32_t taxid_array_writable_length;
|
||||
@ -185,6 +185,7 @@ int build_reference_db(const char* dms_name,
|
||||
matrix_view_name = strcpy(matrix_view_name, o_view_name);
|
||||
strcat(matrix_view_name, "_matrix");
|
||||
|
||||
fprintf(stderr, "Aligning queries with reference database...\n");
|
||||
if (obi_lcs_align_one_column(dms_name,
|
||||
refs_view_name,
|
||||
"",
|
||||
@ -320,13 +321,19 @@ int build_reference_db(const char* dms_name,
|
||||
return -1;
|
||||
}
|
||||
|
||||
count = (matrix_with_lca_view->infos)->line_count;
|
||||
fprintf(stderr, "Computing LCAs...\n");
|
||||
|
||||
// Compute all the LCAs
|
||||
// For each pair
|
||||
for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++)
|
||||
for (i=0; i<count; i++)
|
||||
{
|
||||
if (! keep_running)
|
||||
return -1;
|
||||
|
||||
if (i%1000 == 0)
|
||||
fprintf(stderr,"\rDone : %f %% ", (i / (float) count)*100);
|
||||
|
||||
// Read all taxids associated with the first sequence and compute their LCA
|
||||
// Read line index
|
||||
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
|
||||
@ -363,6 +370,7 @@ int build_reference_db(const char* dms_name,
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
fprintf(stderr,"\rDone : 100 %% \n");
|
||||
|
||||
// Clone refs view, add 2 arrays columns for lca and score, compute and write them
|
||||
|
||||
@ -442,13 +450,18 @@ int build_reference_db(const char* dms_name,
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "Building LCA arrays...\n");
|
||||
|
||||
// For each sequence, look for all its alignments in the matrix, and for each different LCA taxid/score, order them and write them
|
||||
// Going through matrix once, filling refs arrays on the go for efficiency
|
||||
for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++)
|
||||
for (i=0; i<count; i++)
|
||||
{
|
||||
if (! keep_running)
|
||||
return -1;
|
||||
|
||||
if (i%1000 == 0)
|
||||
fprintf(stderr,"\rDone : %f %% ", (i / (float) count)*100);
|
||||
|
||||
// Read ref line indexes
|
||||
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
|
||||
idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0);
|
||||
@ -464,6 +477,8 @@ int build_reference_db(const char* dms_name,
|
||||
// Read alignment score
|
||||
score = obi_get_float_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_score_column, i, 0);
|
||||
|
||||
//fprintf(stderr, "\n\ntaxid_lca=%d, score=%f, idx1=%d, idx2=%d", taxid_lca, score, idx1, idx2);
|
||||
|
||||
///////////////// Compute for first sequence \\\\\\\\\\\\\\\\\\\\\\\ (TODO function)
|
||||
|
||||
// Read arrays
|
||||
@ -480,9 +495,11 @@ int build_reference_db(const char* dms_name,
|
||||
// return -1;
|
||||
// }
|
||||
|
||||
//fprintf(stderr, "\n1st sequence");
|
||||
// If empty, add values
|
||||
if (taxid_array_length == 0)
|
||||
{
|
||||
//fprintf(stderr, "\nEmpty, add value");
|
||||
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
||||
@ -496,6 +513,8 @@ int build_reference_db(const char* dms_name,
|
||||
}
|
||||
else
|
||||
{
|
||||
//fprintf(stderr, "\nNot empty");
|
||||
|
||||
j = 0;
|
||||
modified = false;
|
||||
while (j < taxid_array_length)
|
||||
@ -509,6 +528,9 @@ int build_reference_db(const char* dms_name,
|
||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||
modified = true;
|
||||
|
||||
//fprintf(stderr, "\nSame LCA, replace %d and %f with %d and %f", lca_taxid_array_writable[j],
|
||||
// score_array_writable[j], taxid_lca, score);
|
||||
|
||||
// Better score for the same LCA, replace this LCA/score pair
|
||||
lca_taxid_array_writable[j] = taxid_lca;
|
||||
score_array_writable[j] = score;
|
||||
@ -535,6 +557,8 @@ int build_reference_db(const char* dms_name,
|
||||
{
|
||||
if (score > score_array[j])
|
||||
{
|
||||
//fprintf(stderr, "\nInsert new");
|
||||
|
||||
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||
modified = true;
|
||||
@ -579,10 +603,15 @@ int build_reference_db(const char* dms_name,
|
||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||
modified = true;
|
||||
|
||||
//fprintf(stderr, "\nAppend at the end");
|
||||
|
||||
// Append LCA
|
||||
lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca;
|
||||
score_array_writable[score_array_writable_length] = score;
|
||||
|
||||
taxid_array_writable_length++;
|
||||
score_array_writable_length++;
|
||||
|
||||
// Remove the previous (children) LCAs from the array if their score is equal or lower
|
||||
while ((j>0) && (score_array_writable[j-1] <= score))
|
||||
{
|
||||
@ -603,6 +632,13 @@ int build_reference_db(const char* dms_name,
|
||||
// Write new arrays
|
||||
if (modified)
|
||||
{
|
||||
// fprintf(stderr, "\n\nnew array:");
|
||||
// for (k=0;k<taxid_array_writable_length;k++)
|
||||
// {
|
||||
// lca = obi_taxo_get_taxon_with_taxid(tax, lca_taxid_array_writable[k]);
|
||||
// fprintf(stderr, "\nLCA=%d, %s, score=%f", lca_taxid_array_writable[k], lca->name, score_array_writable[k]);
|
||||
// }
|
||||
|
||||
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, lca_taxid_array_writable, (uint8_t) (obi_sizeof(OBI_INT) * 8), taxid_array_writable_length) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
||||
@ -632,9 +668,13 @@ int build_reference_db(const char* dms_name,
|
||||
// return -1;
|
||||
// }
|
||||
|
||||
//fprintf(stderr, "\n2nd sequence");
|
||||
|
||||
// If empty, add values
|
||||
if (taxid_array_length == 0)
|
||||
{
|
||||
//fprintf(stderr, "\nEmpty, add value");
|
||||
|
||||
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx2, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
||||
@ -648,6 +688,8 @@ int build_reference_db(const char* dms_name,
|
||||
}
|
||||
else
|
||||
{
|
||||
//fprintf(stderr, "\nNot empty");
|
||||
|
||||
j = 0;
|
||||
modified = false;
|
||||
while (j < taxid_array_length)
|
||||
@ -661,6 +703,9 @@ int build_reference_db(const char* dms_name,
|
||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||
modified = true;
|
||||
|
||||
//fprintf(stderr, "\nSame LCA, replace %d and %f with %d and %f", lca_taxid_array_writable[j],
|
||||
// score_array_writable[j], taxid_lca, score);
|
||||
|
||||
// Better score for the same LCA, replace this LCA/score pair
|
||||
lca_taxid_array_writable[j] = taxid_lca;
|
||||
score_array_writable[j] = score;
|
||||
@ -687,6 +732,8 @@ int build_reference_db(const char* dms_name,
|
||||
{
|
||||
if (score > score_array[j])
|
||||
{
|
||||
//fprintf(stderr, "\nInsert new");
|
||||
|
||||
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||
modified = true;
|
||||
@ -727,6 +774,8 @@ int build_reference_db(const char* dms_name,
|
||||
|
||||
if (j == taxid_array_length) // same or parent LCA not found, need to be appended at the end
|
||||
{
|
||||
//fprintf(stderr, "\nAppend at the end");
|
||||
|
||||
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||
modified = true;
|
||||
@ -735,6 +784,9 @@ int build_reference_db(const char* dms_name,
|
||||
lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca;
|
||||
score_array_writable[score_array_writable_length] = score;
|
||||
|
||||
taxid_array_writable_length++;
|
||||
score_array_writable_length++;
|
||||
|
||||
// Remove the previous (children) LCAs from the array if their score is equal or lower
|
||||
while ((j>0) && (score_array_writable[j-1] <= score))
|
||||
{
|
||||
@ -769,11 +821,17 @@ int build_reference_db(const char* dms_name,
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(stderr,"\rDone : 100 %% \n");
|
||||
|
||||
fprintf(stderr, "Writing results...\n");
|
||||
count = (o_view->infos)->line_count;
|
||||
// Fill empty LCA informations (because filling from potentially sparse alignment matrix) with the sequence taxid
|
||||
score=1.0; // technically getting LCA of identical sequences
|
||||
for (i=0; i<(o_view->infos)->line_count; i++)
|
||||
for (i=0; i<count; i++)
|
||||
{
|
||||
if (i%1000 == 0)
|
||||
fprintf(stderr,"\rDone : %f %% ", (i / (float) count)*100);
|
||||
|
||||
obi_get_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_array_length);
|
||||
if (taxid_array_length == 0) // no LCA set
|
||||
{
|
||||
@ -799,6 +857,7 @@ int build_reference_db(const char* dms_name,
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(stderr,"\rDone : 100 %% \n");
|
||||
|
||||
// Add information about the threshold used to build the DB
|
||||
snprintf(threshold_str, 5, "%f", threshold);
|
||||
@ -858,7 +917,6 @@ int build_reference_db(const char* dms_name,
|
||||
free(matrix_view_name);
|
||||
free(matrix_with_lca_view_name);
|
||||
|
||||
fprintf(stderr,"\rDone : 100 %% \n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -413,7 +413,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
return NULL;
|
||||
}
|
||||
|
||||
score = max_common_kmers + kmer_size - 1; // aka the number of nucleotides in the longest stretch of kmers perfectly matching
|
||||
if (max_common_kmers > 0)
|
||||
score = max_common_kmers + kmer_size - 1; // aka the number of nucleotides in the longest stretch of kmers perfectly matching
|
||||
else
|
||||
score = 0;
|
||||
abs_shift = abs(best_shift);
|
||||
|
||||
// Save result in Obi_ali structure
|
||||
@ -423,10 +426,15 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
ali->shift = abs_shift;
|
||||
ali->consensus_seq = NULL;
|
||||
ali->consensus_qual = NULL;
|
||||
if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
|
||||
strcpy(ali->direction, "left");
|
||||
if (score == 0)
|
||||
ali->direction[0] = '\0';
|
||||
else
|
||||
strcpy(ali->direction, "right");
|
||||
{
|
||||
if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
|
||||
strcpy(ali->direction, "left");
|
||||
else
|
||||
strcpy(ali->direction, "right");
|
||||
}
|
||||
|
||||
// Build the consensus sequence if asked
|
||||
if (build_consensus)
|
||||
|
@ -409,8 +409,7 @@ int obi_clean(const char* dms_name,
|
||||
stop = true;
|
||||
}
|
||||
|
||||
#pragma omp parallel default(none) \
|
||||
shared(thread_count, seq_count, blob_array, complete_sample_count_array, alignment_result_array, \
|
||||
#pragma omp parallel shared(thread_count, seq_count, blob_array, complete_sample_count_array, alignment_result_array, \
|
||||
stop, blob1, i, obi_errno, keep_running, stderr, max_ratio, iseq_column, i_view, \
|
||||
similarity_mode, reference, normalize, threshold, ktable, status_column, o_view, sample_count)
|
||||
{
|
||||
|
48
src/obi_ecopcr.c
Executable file → Normal file
48
src/obi_ecopcr.c
Executable file → Normal file
@ -105,7 +105,8 @@ static int create_output_columns(Obiview_p o_view, bool kingdom_mode);
|
||||
* @param o_temp1_column A pointer on the output column for the temperature for the first primer.
|
||||
* @param o_temp2_column A pointer on the output column for the temperature for the second primer.
|
||||
*
|
||||
* @retval 0 if the operation was successfully completed.
|
||||
* @retval 0 if the sequence was skipped (taxid not found, warning printed).
|
||||
* @retval 1 if the sequence was successfully printed to the output.
|
||||
* @retval -1 if an error occurred.
|
||||
*
|
||||
* @since July 2018
|
||||
@ -366,6 +367,17 @@ static int print_seq(Obiview_p i_view, Obiview_p o_view,
|
||||
|
||||
// TODO add check for primer longer than MAX_PAT_LEN (32)
|
||||
|
||||
// Get sequence id
|
||||
seq_id = obi_get_str_with_elt_idx_and_col_p_in_view(i_view, i_id_column, i_idx, 0);
|
||||
|
||||
// Get the taxon structure
|
||||
main_taxon = obi_taxo_get_taxon_with_taxid(taxonomy, taxid);
|
||||
if (main_taxon == NULL)
|
||||
{
|
||||
obidebug(1, "\nWarning: error reading the taxonomic information of a sequence. Seq id: %s, taxid: %d. Probably deprecated taxid. Skipping this sequence.", seq_id, taxid);
|
||||
return 0;
|
||||
}
|
||||
|
||||
ldelta = (pos1 <= keep_nucleotides)?pos1:keep_nucleotides;
|
||||
rdelta = ((pos2+keep_nucleotides)>=seq_len)?seq_len-pos2:keep_nucleotides;
|
||||
|
||||
@ -431,16 +443,7 @@ static int print_seq(Obiview_p i_view, Obiview_p o_view,
|
||||
if (isnan(tm2))
|
||||
tm2 = OBIFloat_NA;
|
||||
|
||||
// Get the taxon structure
|
||||
main_taxon = obi_taxo_get_taxon_with_taxid(taxonomy, taxid);
|
||||
if (main_taxon == NULL)
|
||||
{
|
||||
obidebug(1, "\nError reading the taxonomic information of a sequence");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Write sequence id
|
||||
seq_id = obi_get_str_with_elt_idx_and_col_p_in_view(i_view, i_id_column, i_idx, 0);
|
||||
if (obi_set_str_with_elt_idx_and_col_p_in_view(o_view, o_id_column, o_idx, 0, seq_id) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing the sequence id");
|
||||
@ -629,7 +632,7 @@ static int print_seq(Obiview_p i_view, Obiview_p o_view,
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@ -698,6 +701,7 @@ int obi_ecopcr(const char* i_dms_name,
|
||||
|
||||
obiint_t taxid;
|
||||
char* sequence;
|
||||
int printed;
|
||||
|
||||
SeqPtr apatseq=NULL;
|
||||
int32_t o1Hits;
|
||||
@ -1057,14 +1061,14 @@ int obi_ecopcr(const char* i_dms_name,
|
||||
length = 0;
|
||||
if (posj > posi)
|
||||
length = posj - posi - o1->patlen - o2->patlen;
|
||||
if (posj < posi)
|
||||
else if (circular > 0)
|
||||
length = posj + apatseq->seqlen - posi - o1->patlen - o2->patlen;
|
||||
if ((length>0) && // For when primers touch or overlap
|
||||
(!min_len || (length >= min_len)) &&
|
||||
(!max_len || (length <= max_len)))
|
||||
{
|
||||
// Print the found amplicon
|
||||
if (print_seq(i_view, o_view,
|
||||
printed = print_seq(i_view, o_view,
|
||||
i_idx, o_idx,
|
||||
taxonomy,
|
||||
sequence,
|
||||
@ -1090,12 +1094,14 @@ int obi_ecopcr(const char* i_dms_name,
|
||||
o_strand_column,
|
||||
o_primer1_column, o_primer2_column,
|
||||
o_error1_column, o_error2_column,
|
||||
o_temp1_column, o_temp2_column) < 0)
|
||||
o_temp1_column, o_temp2_column);
|
||||
if (printed < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing the ecopcr result");
|
||||
return -1;
|
||||
}
|
||||
o_idx++;
|
||||
else if (printed > 0)
|
||||
o_idx++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1145,14 +1151,14 @@ int obi_ecopcr(const char* i_dms_name,
|
||||
length = 0;
|
||||
if (posj > posi)
|
||||
length = posj - posi + 1 - o2->patlen - o1->patlen; /* - o1->patlen : deleted by <EC> (prior to the OBITools3) */
|
||||
if (posj < posi)
|
||||
else if (circular > 0)
|
||||
length = posj + apatseq->seqlen - posi - o1->patlen - o2->patlen;
|
||||
if ((length>0) && // For when primers touch or overlap
|
||||
(!min_len || (length >= min_len)) &&
|
||||
(!max_len || (length <= max_len)))
|
||||
{
|
||||
// Print the found amplicon
|
||||
if (print_seq(i_view, o_view,
|
||||
printed = print_seq(i_view, o_view,
|
||||
i_idx, o_idx,
|
||||
taxonomy,
|
||||
sequence,
|
||||
@ -1178,12 +1184,14 @@ int obi_ecopcr(const char* i_dms_name,
|
||||
o_strand_column,
|
||||
o_primer1_column, o_primer2_column,
|
||||
o_error1_column, o_error2_column,
|
||||
o_temp1_column, o_temp2_column) < 0)
|
||||
o_temp1_column, o_temp2_column);
|
||||
if (printed < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing the ecopcr result");
|
||||
return -1;
|
||||
}
|
||||
o_idx++;
|
||||
else if (printed > 0)
|
||||
o_idx++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1224,7 +1232,7 @@ int obi_ecopcr(const char* i_dms_name,
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(stderr,"\rDone : 100 %% ");
|
||||
fprintf(stderr,"\rDone : 100 %% \n");
|
||||
return 0;
|
||||
|
||||
return 0;
|
||||
|
@ -81,8 +81,8 @@
|
||||
* @param o_dms_name The path to the output DMS.
|
||||
* @param o_view_name The name of the output view.
|
||||
* @param o_view_comments The comments to associate with the output view.
|
||||
* @param primer1 The first primer.
|
||||
* @param primer2 The second primer.
|
||||
* @param primer1 The first primer, length must be less than or equal to 32 (because of apat lib limitation).
|
||||
* @param primer2 The second primer, length must be less than or equal to 32 (because of apat lib limitation).
|
||||
* @param error_max The maximum number of errors allowed per primer for amplification.
|
||||
* @param min_len The minimum length of an amplicon.
|
||||
* @param max_len The maximum length of an amplicon.
|
||||
|
@ -71,9 +71,12 @@ static int create_output_columns(Obiview_p o_view);
|
||||
* @param name The assigned scientific name.
|
||||
* @param assigned_status_column A pointer on the column where the assigned status should be written.
|
||||
* @param assigned The assigned status (whether the sequence was assigned to a taxon or not).
|
||||
* @param best_match_column A pointer on the column where the list of ids of the best matches should be written.
|
||||
* @param best_match_ids_column A pointer on the column where the list of ids of the best matches should be written.
|
||||
* @param best_match_ids The list of ids of the best matches as an array of the concatenated ids separated by '\0'.
|
||||
* @param best_match_ids_length The total length of the array of ids of best matches.
|
||||
* @param best_match_taxids_column A pointer on the column where the list of taxids of the best matches should be written.
|
||||
* @param best_match_taxids The list of taxids of the best matches as an array of the taxids.
|
||||
* @param best_match_taxids_length The length of the array of taxids of best matches.
|
||||
* @param score_column A pointer on the column where the score should be written.
|
||||
* @param score The similarity score of the sequence with its best match(es).
|
||||
*
|
||||
@ -87,7 +90,8 @@ int print_assignment_result(Obiview_p output_view, index_t line,
|
||||
OBIDMS_column_p assigned_taxid_column, int32_t taxid,
|
||||
OBIDMS_column_p assigned_name_column, const char* name,
|
||||
OBIDMS_column_p assigned_status_column, bool assigned,
|
||||
OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length,
|
||||
OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length,
|
||||
OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length,
|
||||
OBIDMS_column_p score_column, double score);
|
||||
|
||||
|
||||
@ -100,37 +104,44 @@ int print_assignment_result(Obiview_p output_view, index_t line,
|
||||
static int create_output_columns(Obiview_p o_view)
|
||||
{
|
||||
// Score column
|
||||
if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_SCORE_COLUMN_NAME, true) < 0)
|
||||
if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the score in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Assigned taxid column
|
||||
if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_TAXID_COLUMN_NAME, true) < 0)
|
||||
if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the assigned taxid in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Assigned scientific name column
|
||||
if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_NAME_COLUMN_NAME, true) < 0)
|
||||
if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the assigned scientific name in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Assignement status column
|
||||
if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_STATUS_COLUMN_NAME, true) < 0)
|
||||
if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the assignment status in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Column for array of best match ids
|
||||
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, true) < 0)
|
||||
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the array of ids of the best match in ecotag");
|
||||
obidebug(1, "\nError creating the column for the array of ids of best matches in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Column for array of best match taxids
|
||||
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the array of taxids of best matches in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
@ -142,7 +153,8 @@ int print_assignment_result(Obiview_p output_view, index_t line,
|
||||
OBIDMS_column_p assigned_taxid_column, int32_t taxid,
|
||||
OBIDMS_column_p assigned_name_column, const char* name,
|
||||
OBIDMS_column_p assigned_status_column, bool assigned,
|
||||
OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length,
|
||||
OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length,
|
||||
OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length,
|
||||
OBIDMS_column_p score_column, double score)
|
||||
{
|
||||
// Write the assigned taxid
|
||||
@ -167,9 +179,16 @@ int print_assignment_result(Obiview_p output_view, index_t line,
|
||||
}
|
||||
|
||||
// Write the best match ids
|
||||
if (obi_set_array_with_col_p_in_view(output_view, best_match_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0)
|
||||
if (obi_set_array_with_col_p_in_view(output_view, best_match_ids_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing a assignment status in a column when writing ecotag results");
|
||||
obidebug(1, "\nError writing the array of best match ids in a column when writing ecotag results");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Write the best match taxids
|
||||
if (obi_set_array_with_col_p_in_view(output_view, best_match_taxids_column, line, best_match_taxids, (uint8_t)(sizeof(OBI_INT)*8), best_match_taxids_length) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing the array of best match taxids in a column when writing ecotag results");
|
||||
return -1;
|
||||
}
|
||||
|
||||
@ -235,6 +254,8 @@ int obi_ecotag(const char* dms_name,
|
||||
char* best_match_ids;
|
||||
char* best_match_ids_to_store;
|
||||
int32_t best_match_ids_length;
|
||||
int32_t* best_match_taxids;
|
||||
int32_t* best_match_taxids_to_store;
|
||||
int best_match_count;
|
||||
int buffer_size;
|
||||
int best_match_ids_buffer_size;
|
||||
@ -263,7 +284,8 @@ int obi_ecotag(const char* dms_name,
|
||||
OBIDMS_column_p assigned_taxid_column = NULL;
|
||||
OBIDMS_column_p assigned_name_column = NULL;
|
||||
OBIDMS_column_p assigned_status_column = NULL;
|
||||
OBIDMS_column_p best_match_column = NULL;
|
||||
OBIDMS_column_p best_match_ids_column = NULL;
|
||||
OBIDMS_column_p best_match_taxids_column = NULL;
|
||||
OBIDMS_column_p lca_taxid_a_column = NULL;
|
||||
OBIDMS_column_p score_a_column = NULL;
|
||||
OBIDMS_column_p ref_taxid_column = NULL;
|
||||
@ -396,7 +418,8 @@ int obi_ecotag(const char* dms_name,
|
||||
assigned_taxid_column = obi_view_get_column(output_view, ECOTAG_TAXID_COLUMN_NAME);
|
||||
assigned_name_column = obi_view_get_column(output_view, ECOTAG_NAME_COLUMN_NAME);
|
||||
assigned_status_column = obi_view_get_column(output_view, ECOTAG_STATUS_COLUMN_NAME);
|
||||
best_match_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME);
|
||||
best_match_ids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME);
|
||||
best_match_taxids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME);
|
||||
score_column = obi_view_get_column(output_view, ECOTAG_SCORE_COLUMN_NAME);
|
||||
|
||||
// Open the used reference columns
|
||||
@ -453,9 +476,17 @@ int obi_ecotag(const char* dms_name,
|
||||
return -1;
|
||||
}
|
||||
|
||||
best_match_taxids = (int32_t*) malloc(buffer_size* sizeof(int32_t));
|
||||
if (best_match_taxids == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating memory for the best match taxid array in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (i=0; i < query_count; i++)
|
||||
{
|
||||
if (i%100 == 0)
|
||||
if (i%1000 == 0)
|
||||
fprintf(stderr,"\rDone : %f %% ", (i / (float) query_count)*100);
|
||||
|
||||
best_match_count = 0;
|
||||
@ -514,7 +545,7 @@ int obi_ecotag(const char* dms_name,
|
||||
|
||||
// Store in best match array
|
||||
|
||||
// Grow match array if needed
|
||||
// Grow match and taxid array if needed
|
||||
if (best_match_count == buffer_size)
|
||||
{
|
||||
buffer_size = buffer_size*2;
|
||||
@ -525,6 +556,13 @@ int obi_ecotag(const char* dms_name,
|
||||
obidebug(1, "\nError reallocating match array when assigning");
|
||||
return -1;
|
||||
}
|
||||
best_match_taxids = (int32_t*) realloc(best_match_taxids, buffer_size*sizeof(int32_t));
|
||||
if (best_match_taxids == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError reallocating match taxids array when assigning");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
id = obi_get_str_with_elt_idx_and_col_p_in_view(ref_view, ref_id_column, j, 0);
|
||||
@ -545,6 +583,7 @@ int obi_ecotag(const char* dms_name,
|
||||
|
||||
// Save match
|
||||
best_match_array[best_match_count] = j;
|
||||
best_match_taxids[best_match_count] = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, j, 0);
|
||||
best_match_count++;
|
||||
strcpy(best_match_ids+best_match_ids_length, id);
|
||||
best_match_ids_length = best_match_ids_length + id_len + 1;
|
||||
@ -562,7 +601,7 @@ int obi_ecotag(const char* dms_name,
|
||||
score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);
|
||||
|
||||
k = 0;
|
||||
while ((k < lca_array_length) && (score_array[k] >= ecotag_threshold))
|
||||
while ((k < lca_array_length) && (score_array[k] >= best_score))
|
||||
k++;
|
||||
|
||||
if (k>0)
|
||||
@ -570,12 +609,12 @@ int obi_ecotag(const char* dms_name,
|
||||
lca_array = obi_get_array_with_col_p_in_view(ref_view, lca_taxid_a_column, best_match_idx, &lca_array_length);
|
||||
if (j>0)
|
||||
{
|
||||
lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid);
|
||||
if (lca == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment");
|
||||
return -1;
|
||||
}
|
||||
// lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid);
|
||||
// if (lca == NULL)
|
||||
// {
|
||||
// obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment");
|
||||
// return -1;
|
||||
// }
|
||||
lca_in_array = obi_taxo_get_taxon_with_taxid(taxonomy, lca_array[k-1]);
|
||||
if (lca_in_array == NULL)
|
||||
{
|
||||
@ -629,6 +668,7 @@ int obi_ecotag(const char* dms_name,
|
||||
else
|
||||
lca_name = lca->name;
|
||||
best_match_ids_to_store = best_match_ids;
|
||||
best_match_taxids_to_store = best_match_taxids;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -636,6 +676,7 @@ int obi_ecotag(const char* dms_name,
|
||||
lca_name = OBIStr_NA;
|
||||
lca_taxid = OBIInt_NA;
|
||||
best_match_ids_to_store = OBITuple_NA;
|
||||
best_match_taxids_to_store = OBITuple_NA;
|
||||
score = OBIFloat_NA;
|
||||
}
|
||||
|
||||
@ -644,7 +685,8 @@ int obi_ecotag(const char* dms_name,
|
||||
assigned_taxid_column, lca_taxid,
|
||||
assigned_name_column, lca_name,
|
||||
assigned_status_column, assigned,
|
||||
best_match_column, best_match_ids_to_store, best_match_ids_length,
|
||||
best_match_ids_column, best_match_ids_to_store, best_match_ids_length,
|
||||
best_match_taxids_column, best_match_taxids_to_store, best_match_count,
|
||||
score_column, best_score
|
||||
) < 0)
|
||||
return -1;
|
||||
@ -652,6 +694,7 @@ int obi_ecotag(const char* dms_name,
|
||||
|
||||
free(best_match_array);
|
||||
free(best_match_ids);
|
||||
free(best_match_taxids);
|
||||
|
||||
obi_close_taxonomy(taxonomy);
|
||||
obi_save_and_close_view(query_view);
|
||||
|
@ -23,7 +23,8 @@
|
||||
#define ECOTAG_TAXID_COLUMN_NAME "TAXID"
|
||||
#define ECOTAG_NAME_COLUMN_NAME "SCIENTIFIC_NAME"
|
||||
#define ECOTAG_STATUS_COLUMN_NAME "ID_STATUS"
|
||||
#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH"
|
||||
#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH_IDS"
|
||||
#define ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME "BEST_MATCH_TAXIDS"
|
||||
#define ECOTAG_SCORE_COLUMN_NAME "BEST_IDENTITY"
|
||||
|
||||
|
||||
|
@ -648,7 +648,7 @@ int truncate_avl_data_to_size_used(OBIDMS_avl_data_p avl_data) // TODO is it nec
|
||||
new_data_size = ((index_t) multiple) * getpagesize();
|
||||
|
||||
// Check that it is actually greater than the current size of the file, otherwise no need to truncate
|
||||
if ((avl_data->header)->data_size_max == new_data_size)
|
||||
if ((avl_data->header)->data_size_max >= new_data_size)
|
||||
return 0;
|
||||
|
||||
// Get the file descriptor
|
||||
@ -667,7 +667,7 @@ int truncate_avl_data_to_size_used(OBIDMS_avl_data_p avl_data) // TODO is it nec
|
||||
if (ftruncate(file_descriptor, file_size) < 0)
|
||||
{
|
||||
obi_set_errno(OBI_AVL_ERROR);
|
||||
obidebug(1, "\nError truncating an AVL data file");
|
||||
obidebug(1, "\nError truncating an AVL data file, old data size = %lld, new data size = %lld", (avl_data->header)->data_size_max, new_data_size);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -1496,7 +1496,7 @@ obiversion_t obi_import_column(const char* dms_path_1, const char* dms_path_2, c
|
||||
memcpy(column_2->data, column_1->data, header_1->data_size);
|
||||
|
||||
// Copy the AVL files if there are some (overwriting the automatically created files)
|
||||
if ((header_1->returned_data_type == OBI_STR) || (header_1->returned_data_type == OBI_SEQ) || (header_1->returned_data_type == OBI_QUAL))
|
||||
if ((header_1->tuples) || ((header_1->returned_data_type == OBI_STR) || (header_1->returned_data_type == OBI_SEQ) || (header_1->returned_data_type == OBI_QUAL)))
|
||||
{
|
||||
avl_name_1 = (char*) malloc((strlen(header_1->indexer_name) + 1) * sizeof(char));
|
||||
if (avl_name_1 == NULL)
|
||||
|
@ -2376,9 +2376,10 @@ int read_merged_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t* delnode
|
||||
// and the deleted taxids with no current reference. An element of the list is composed of the taxid, and the index
|
||||
// of the taxon in the taxa structure, or -1 for deleted taxids.
|
||||
// Creating the merged list requires to merge the 3 ordered lists into one.
|
||||
while (((nT < (tax->taxa)->count) && ((tax->taxa)->taxon[nT].taxid < old_taxid)) || ((nD >= 0) && (delnodes[nD] < old_taxid)))
|
||||
while (((nT < (tax->taxa)->count) && ((tax->taxa)->taxon[nT].taxid < old_taxid)) ||
|
||||
((nD >= 0) && (delnodes[nD] < old_taxid)))
|
||||
{
|
||||
if ((tax->taxa)->taxon[nT].taxid < delnodes[nD])
|
||||
if ((nT < (tax->taxa)->count) && (tax->taxa)->taxon[nT].taxid < delnodes[nD])
|
||||
{ // Add element from taxa list
|
||||
// Enlarge structure if needed
|
||||
if (n == buffer_size)
|
||||
@ -2401,7 +2402,7 @@ int read_merged_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t* delnode
|
||||
nT++;
|
||||
n++;
|
||||
}
|
||||
else if (delnodes[nD] < (tax->taxa)->taxon[nT].taxid)
|
||||
else
|
||||
{ // Add element from deleted taxids list
|
||||
// Enlarge structure if needed
|
||||
if (n == buffer_size)
|
||||
@ -3036,12 +3037,12 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
|
||||
|
||||
strcpy(tax->tax_name, taxonomy_name);
|
||||
|
||||
buffer_size = 2048;
|
||||
|
||||
taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
|
||||
if (taxonomy_path == NULL)
|
||||
return NULL;
|
||||
|
||||
buffer_size = strlen(taxonomy_path) + strlen(taxonomy_name) + 6;
|
||||
|
||||
// Read ranks
|
||||
ranks_file_name = (char*) malloc(buffer_size*sizeof(char));
|
||||
if (ranks_file_name == NULL)
|
||||
|
@ -1350,6 +1350,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
|
||||
}
|
||||
strncpy(header->indexer_name, final_indexer_name, INDEXER_MAX_NAME);
|
||||
}
|
||||
else
|
||||
new_column->indexer = NULL;
|
||||
|
||||
// Fill the data with NA values
|
||||
obi_ini_to_NA_values(new_column, 0, nb_lines);
|
||||
@ -1558,6 +1560,8 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms,
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else
|
||||
column->indexer = NULL;
|
||||
|
||||
if (close(column_file_descriptor) < 0)
|
||||
{
|
||||
@ -1693,8 +1697,8 @@ int obi_close_column(OBIDMS_column_p column)
|
||||
if (obi_dms_unlist_column(column->dms, column) < 0)
|
||||
ret_val = -1;
|
||||
|
||||
// If the data type is OBI_STR, OBI_SEQ or OBI_QUAL, the associated indexer is closed
|
||||
if (((column->header)->returned_data_type == OBI_STR) || ((column->header)->returned_data_type == OBI_SEQ) || ((column->header)->returned_data_type == OBI_QUAL))
|
||||
// If it's a tuple column or the data type is OBI_STR, OBI_SEQ or OBI_QUAL, the associated indexer is closed
|
||||
if ((column->indexer) != NULL)
|
||||
if (obi_close_indexer(column->indexer) < 0)
|
||||
ret_val = -1;
|
||||
|
||||
@ -1973,7 +1977,11 @@ int obi_enlarge_column(OBIDMS_column_p column)
|
||||
|
||||
// Calculate the new file size
|
||||
old_line_count = (column->header)->line_count;
|
||||
new_line_count = old_line_count * COLUMN_GROWTH_FACTOR;
|
||||
new_line_count = ceil((double) old_line_count * (double) COLUMN_GROWTH_FACTOR);
|
||||
if (new_line_count > old_line_count+100000)
|
||||
new_line_count = old_line_count+100000;
|
||||
else if (new_line_count < old_line_count+1000)
|
||||
new_line_count = old_line_count+1000;
|
||||
|
||||
if (new_line_count > MAXIMUM_LINE_COUNT)
|
||||
{
|
||||
@ -2381,6 +2389,54 @@ char* obi_get_elements_names(OBIDMS_column_p column)
|
||||
}
|
||||
|
||||
|
||||
char* obi_get_formatted_elements_names(OBIDMS_column_p column)
|
||||
{
|
||||
char* elements_names;
|
||||
int i, j;
|
||||
int elt_idx;
|
||||
int len;
|
||||
|
||||
elements_names = (char*) malloc(((column->header)->elements_names_length + (column->header)->nb_elements_per_line) * sizeof(char));
|
||||
if (elements_names == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating memory for elements names");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
j = 0;
|
||||
for (i=0; i < (column->header)->nb_elements_per_line; i++)
|
||||
{
|
||||
elt_idx = ((column->header)->elements_names_idx)[i];
|
||||
len = strlen(((column->header)->elements_names)+elt_idx);
|
||||
memcpy(elements_names+j, ((column->header)->elements_names)+elt_idx, len*sizeof(char));
|
||||
j = j + len;
|
||||
elements_names[j] = ';';
|
||||
j++;
|
||||
elements_names[j] = ' ';
|
||||
j++;
|
||||
}
|
||||
|
||||
elements_names[j - 1] = '\0';
|
||||
|
||||
return elements_names;
|
||||
}
|
||||
|
||||
|
||||
char* obi_column_formatted_infos(OBIDMS_column_p column)
|
||||
{
|
||||
char* column_infos;
|
||||
char* elt_names;
|
||||
|
||||
column_infos = malloc(1024 * sizeof(char));
|
||||
|
||||
elt_names = obi_get_formatted_elements_names(column);
|
||||
|
||||
|
||||
free(elt_names);
|
||||
return column_infos;
|
||||
}
|
||||
|
||||
|
||||
int obi_column_prepare_to_set_value(OBIDMS_column_p column, index_t line_nb, index_t elt_idx)
|
||||
{
|
||||
|
@ -505,6 +505,14 @@ index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const cha
|
||||
char* obi_get_elements_names(OBIDMS_column_p column);
|
||||
|
||||
|
||||
// TODO
|
||||
//char* obi_get_formatted_elements_names(OBIDMS_column_p column);
|
||||
|
||||
|
||||
// TODO
|
||||
//char* obi_column_formatted_infos(OBIDMS_column_p column);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Prepares a column to set a value.
|
||||
*
|
||||
|
@ -1037,8 +1037,9 @@ static int finish_view(Obiview_p view)
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Add count column if it's a NUC_SEQ_VIEW with no count column // TODO discuss
|
||||
if ((!strcmp((view->infos)->view_type, VIEW_TYPE_NUC_SEQS)) && (!obi_view_column_exists(view, COUNT_COLUMN)))
|
||||
// Add count column if it's a NUC_SEQ_VIEW with no count column (and there's no MERGED_sample column) // TODO discuss
|
||||
if ((!strcmp((view->infos)->view_type, VIEW_TYPE_NUC_SEQS)) && (!obi_view_column_exists(view, COUNT_COLUMN))
|
||||
&& (!obi_view_column_exists(view, "MERGED_sample"))) // TODO should eventually compute from merged samples?
|
||||
{
|
||||
if (obi_create_auto_count_column(view) < 0)
|
||||
{
|
||||
@ -1407,7 +1408,7 @@ static char* view_check_qual_match_seqs(Obiview_p view)
|
||||
// Test that the lengths of the quality and the sequence are equal
|
||||
if ((size_t)qual_len != strlen(seq))
|
||||
{
|
||||
obidebug(1, "\nError checking the predicate for view %s: The sequences and sequence quality arrays match.", (view->infos)->name);
|
||||
obidebug(1, "\nError checking the predicate for view %s: The sequences and sequence quality arrays match (index %lld, seq=%s, quality length = %d).", (view->infos)->name, j, seq, qual_len);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@ -2380,11 +2381,12 @@ int obi_view_add_column(Obiview_p view,
|
||||
}
|
||||
|
||||
|
||||
int obi_view_delete_column(Obiview_p view, const char* column_name)
|
||||
int obi_view_delete_column(Obiview_p view, const char* column_name, bool delete_file)
|
||||
{
|
||||
int i;
|
||||
bool found;
|
||||
OBIDMS_column_p column;
|
||||
char* col_to_delete_path;
|
||||
|
||||
// Check that the view is not read-only
|
||||
if (view->read_only)
|
||||
@ -2406,8 +2408,31 @@ int obi_view_delete_column(Obiview_p view, const char* column_name)
|
||||
obidebug(1, "\nError getting a column from the linked list of column pointers of a view when deleting a column from a view");
|
||||
return -1;
|
||||
}
|
||||
// Keep column path if need to delete the file
|
||||
if (delete_file)
|
||||
{
|
||||
col_to_delete_path = obi_column_full_path(view->dms, column->header->name, column->header->version);
|
||||
if (col_to_delete_path == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a column file path when deleting a column");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
obi_close_column(column);
|
||||
|
||||
// Delete file if needed
|
||||
if (delete_file)
|
||||
{
|
||||
if (remove(col_to_delete_path) < 0)
|
||||
{
|
||||
obi_set_errno(OBICOL_UNKNOWN_ERROR);
|
||||
obidebug(1, "\nError deleting a column file when deleting unfinished columns: file %s", col_to_delete_path);
|
||||
return -1;
|
||||
}
|
||||
free(col_to_delete_path);
|
||||
}
|
||||
|
||||
view->columns = ll_delete(view->columns, i);
|
||||
// TODO how do we check for error? NULL can be empty list
|
||||
found = true;
|
||||
@ -3047,7 +3072,7 @@ int obi_create_auto_id_column(Obiview_p view, const char* prefix)
|
||||
// Delete old ID column if it exists
|
||||
if (obi_view_get_column(view, ID_COLUMN) != NULL)
|
||||
{
|
||||
if (obi_view_delete_column(view, ID_COLUMN) < 0)
|
||||
if (obi_view_delete_column(view, ID_COLUMN, false) < 0)
|
||||
{
|
||||
obidebug(1, "Error deleting an ID column to replace it in a view");
|
||||
return -1;
|
||||
|
@ -440,6 +440,7 @@ int obi_view_add_column(Obiview_p view,
|
||||
*
|
||||
* @param view A pointer on the view.
|
||||
* @param column_name The name of the column that should be deleted from the view.
|
||||
* @param delete_file Whether the column file should be deleted. Use carefully re: dependencies.
|
||||
*
|
||||
* @returns A value indicating the success of the operation.
|
||||
* @retval 0 if the operation was successfully completed.
|
||||
@ -448,7 +449,7 @@ int obi_view_add_column(Obiview_p view,
|
||||
* @since February 2016
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
*/
|
||||
int obi_view_delete_column(Obiview_p view, const char* column_name);
|
||||
int obi_view_delete_column(Obiview_p view, const char* column_name, bool delete_file);
|
||||
|
||||
|
||||
/**
|
||||
|
@ -686,6 +686,9 @@ int calculateSizeToAllocate(int maxLen, int LCSmin)
|
||||
size *= 3;
|
||||
size += 16;
|
||||
|
||||
size += 10; // band-aid for memory bug I don't understand (triggered on specific db on ubuntu)
|
||||
// bug might have to do with the way different systems behave when aligning the address in obi_get_memory_aligned_on_16
|
||||
|
||||
return(size*sizeof(int16_t));
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user