Compare commits

...

64 Commits

Author SHA1 Message Date
e4a32788c2 Switch to version 3.0.0b24 2020-06-09 14:36:58 +02:00
2442cc80bf Cython: View: fixed bash history display 2020-06-09 14:36:37 +02:00
aa836b2ace uniq: improved progress bar of second browsing 2020-06-09 14:36:02 +02:00
8776ce22e6 C: fixed a bug where indexers referring to tuples of certain types were
not properly closed and imported
2020-06-09 14:34:43 +02:00
4aa772c405 ecotag: Added list of taxids for all best matches (closes #80) 2020-06-09 14:33:14 +02:00
b0b96ac37a version 3.0.0b23a 2020-06-05 16:10:24 +02:00
687e42ad22 C: kmer alignment: fixed a bug where scores of 0 were at
(0+kmer_length-1) (and now setting alignment direction to None if score
is 0
2020-06-05 16:09:33 +02:00
5fbbb6d304 alignpairedend: fixed a bug when rebuilding joined (unaligned) sequences
where only the forward sequence was kept
2020-06-05 16:06:43 +02:00
359a9fe237 Switch to version 3.0.0b23 2020-06-04 15:35:03 +02:00
f9b6851f75 Python: correctly flagged some mandatory options as required 2020-06-04 15:34:24 +02:00
29a2652bbf Fixed installation on Ubuntu without pip 2020-06-04 15:06:35 +02:00
2a2c233936 obi import: fixed a bug when skipping an entry 2020-05-29 21:19:42 +02:00
faf8ea9d86 Switch to version 3.0.0b21 2020-05-28 20:42:09 +02:00
ffe2485e94 Genbank parser: now reading ORIGIN lines with comments without
triggering error
2020-05-28 20:41:34 +02:00
6094ce2bbc obi import: skip on error more robust 2020-05-28 20:40:36 +02:00
a7dcf16c06 Minor changes for pip release 2020-05-20 15:59:04 +02:00
f13f8f6165 obi import: minor doc/display improvements 2020-05-20 11:46:29 +02:00
b5a29ac413 Switch to version 3.0.0b19 2020-05-20 10:29:36 +02:00
efd2b9d338 Cleaner installation 2020-05-20 10:29:12 +02:00
ca6e3e7aad obi import: fixed to work with seq genbank extension 2020-05-20 10:28:14 +02:00
76ed8e18e5 Switch to version 3.0.0b18 with version formatting that fits setuptools 2020-05-18 17:08:55 +02:00
1d17f28aec setup: now using setuptools instead of distutils to work with pip 2020-05-18 17:08:09 +02:00
fa834e4b8b obi import: small bug fix 2020-05-18 17:06:58 +02:00
a72fea3cc9 Python: fasta parser: fixed a bug stopping the program when the last
line contained a single nucleotide
2020-05-12 11:24:12 +02:00
e9a37d8a6e Switch to version 3.0.0-beta16 2020-05-07 17:09:26 +02:00
ef074f8455 typo 2020-05-07 17:08:59 +02:00
aec5e69f2c C, views: no more automatic COUNT column if MERGED_sample column exists 2020-05-07 17:08:07 +02:00
170ef3f1ba Views: added obi prefix to commands in bash history 2020-05-07 17:07:01 +02:00
f999946582 obi uniq: fixed the remerging of already merged informations, and
efficiency improvements
2020-05-07 17:05:54 +02:00
773b36ec37 obi import: fixed the import of old obitools files with premerged
informations, and other minor improvements
2020-05-07 17:03:04 +02:00
69cb434a6c version 3.0.0-beta15c 2020-04-29 14:25:33 +02:00
55d4f98d60 obi annotate: fixed annotation at ranks 2020-04-29 14:24:40 +02:00
0bec2631e8 ecotag: fixed a bug where all the full DMS path weren't properly sent to
the C layer
2020-04-29 10:35:55 +02:00
e6b6c6fa84 AVLs: Made an error message more informative 2020-04-29 10:14:04 +02:00
974528b2e6 build_ref_db: fixed bug erasing some of the higher LCAs (i.e. lowest
similarities)
2020-04-28 15:56:06 +02:00
1b346b54f9 ecotag: better specificity by now correctly looking for similarities
within refs above best score instead of ecotag threshold
2020-04-28 15:10:07 +02:00
058f2ad8b3 ecopcr: fixed a bug where sequences were considered circular (generating
false positives)
2020-04-27 14:44:35 +02:00
60bfd3ae8d obi annotate: now defaults to setting str if expression is not valid 2020-04-24 11:35:20 +02:00
67bdee105a C: build_ref_db: added progress display for each step 2020-04-18 14:24:08 +02:00
0f745e0113 C: Columns: optimizing column file growth 2020-04-18 13:55:47 +02:00
da8de52ba4 export: fixed progress bar bug 2020-04-17 15:09:10 +02:00
4d36538c6e C: SSE lcs alignment: band-aid for memory bug I don't understand
(triggered on specific db on ubuntu)
2020-04-17 15:07:52 +02:00
8d0b17d87d Switch to version 3.0.0-beta14 2020-04-15 17:47:26 +02:00
343999a627 Taxonomy: fixed a critical memory bug when building the list of merged
taxids
2020-04-15 17:46:13 +02:00
e9a40630e9 C: Columns: rounding column growth to ceil to avoid looping on small
values
2020-04-13 19:02:10 +02:00
8dbcd3025a C: Columns: reduced column growth factor from 2 to 1.3 to avoid errno28 2020-04-13 14:47:56 +02:00
4cf635d001 Switch to version 3.0.0-beta13 2020-04-12 17:42:58 +02:00
b7e7cc232a Made completion script cleaner 2020-04-12 17:41:59 +02:00
b6ab792ceb C: made error message more detailed when checking that sequences and
qualities match
2020-04-12 17:40:24 +02:00
ddea5a2964 obi import: fixed inconsequential error when precomputing number of
entries in some formats
2020-04-12 17:38:42 +02:00
30852ab7d5 View bash history: removed useless shebang 2020-04-12 17:36:04 +02:00
4d0299904e all commands (almost): cleaner DMS closing at the end 2020-04-12 17:31:58 +02:00
eef5156d95 obi stats: fixed error when printing bool keys 2020-04-12 17:12:04 +02:00
e62c991bbc goes with previous commit 2020-04-10 11:22:26 +02:00
1218eed7fd ecopcr: now printing a warning instead of interrupting with an error
when a taxid is not found
2020-04-10 11:22:04 +02:00
cd9cea8c97 obi import: fixed critical bug where the last entry of embl and genbank
files was not imported
2020-04-09 19:26:27 +02:00
98cfb70d73 ecopcr: made some errors more informative 2020-04-09 09:15:28 +02:00
b9f68c76c8 ecopcr: added warnings and check of primer length (related to #75) 2020-04-05 18:40:56 +02:00
0b98371688 ngsfilter: added warning about primer length in -h (#75) 2020-04-05 18:39:20 +02:00
f0d152fcbd ngsfilter: now checking primer length (fixes #75) 2020-04-05 18:29:10 +02:00
8019dee68e ecotag: now closing all DMS properly 2020-04-05 13:20:49 +02:00
0b4a234671 Swich to version 3.0.0-beta11 2020-02-12 14:23:42 +01:00
d32cfdcce5 ecotag: fixed the generated column comments formatting that would
generate errors
2020-02-12 14:23:17 +01:00
219c0d6fdc obi cat: Fixed the handling when concatenating views with dictionaries
having different key sets
2020-02-12 14:21:39 +01:00
50 changed files with 722 additions and 384 deletions

View File

@ -6,7 +6,7 @@ recursive-include doc/sphinx/source *.txt *.rst *.py
recursive-include doc/sphinx/sphinxext *.py recursive-include doc/sphinx/sphinxext *.py
include doc/sphinx/Makefile include doc/sphinx/Makefile
include doc/sphinx/Doxyfile include doc/sphinx/Doxyfile
include README.txt include README.md
include requirements.txt include requirements.txt
include scripts/obi include scripts/obi

View File

@ -1,4 +1,3 @@
#/usr/bin/env bash
_obi_comp () _obi_comp ()
{ {

View File

@ -266,9 +266,9 @@ def run(config):
# If the input and the output DMS are different, delete the temporary result view in the input DMS # If the input and the output DMS are different, delete the temporary result view in the input DMS
if i_dms != o_dms: if i_dms != o_dms:
View.delete_view(i_dms, o_view_name) View.delete_view(i_dms, o_view_name)
o_dms.close() o_dms.close(force=True)
i_dms.close() i_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -247,10 +247,10 @@ def run(config):
#print("\n\nOutput view:\n````````````", file=sys.stderr) #print("\n\nOutput view:\n````````````", file=sys.stderr)
#print(repr(view), file=sys.stderr) #print(repr(view), file=sys.stderr)
input[0].close() input[0].close(force=True)
if two_views: if two_views:
rinput[0].close() rinput[0].close(force=True)
output[0].close() output[0].close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -13,7 +13,8 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
ID_COLUMN, \ ID_COLUMN, \
DEFINITION_COLUMN, \ DEFINITION_COLUMN, \
QUALITY_COLUMN, \ QUALITY_COLUMN, \
COUNT_COLUMN COUNT_COLUMN, \
TAXID_COLUMN
import time import time
import math import math
@ -175,8 +176,8 @@ def sequenceTaggerGenerator(config, taxo=None):
counter[0]+=1 counter[0]+=1
for rank in annoteRank: for rank in annoteRank:
if 'taxid' in seq: if TAXID_COLUMN in seq:
taxid = seq['taxid'] taxid = seq[TAXID_COLUMN]
if taxid is not None: if taxid is not None:
rtaxid = taxo.get_taxon_at_rank(taxid, rank) rtaxid = taxo.get_taxon_at_rank(taxid, rank)
if rtaxid is not None: if rtaxid is not None:
@ -190,58 +191,50 @@ def sequenceTaggerGenerator(config, taxo=None):
seq['seq_rank']=counter[0] seq['seq_rank']=counter[0]
for i,v in toSet: for i,v in toSet:
#try: try:
if taxo is not None: if taxo is not None:
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math} environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
else: else:
environ = {'sequence':seq, 'counter':counter[0], 'math':math} environ = {'sequence':seq, 'counter':counter[0], 'math':math}
val = eval(v, environ, seq) val = eval(v, environ, seq)
#except Exception,e: # TODO discuss usefulness of this except Exception: # set string if not a valid expression
# if options.onlyValid: val = v
# raise e
# val = v
seq[i]=val seq[i]=val
if length: if length:
seq['seq_length']=len(seq) seq['seq_length']=len(seq)
if newId is not None: if newId is not None:
# try: try:
if taxo is not None: if taxo is not None:
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math} environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
else: else:
environ = {'sequence':seq, 'counter':counter[0], 'math':math} environ = {'sequence':seq, 'counter':counter[0], 'math':math}
val = eval(newId, environ, seq) val = eval(newId, environ, seq)
# except Exception,e: except Exception: # set string if not a valid expression
# if options.onlyValid: val = newId
# raise e
# val = newId
seq.id=val seq.id=val
if newDef is not None: if newDef is not None:
# try: try:
if taxo is not None: if taxo is not None:
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math} environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
else: else:
environ = {'sequence':seq, 'counter':counter[0], 'math':math} environ = {'sequence':seq, 'counter':counter[0], 'math':math}
val = eval(newDef, environ, seq) val = eval(newDef, environ, seq)
# except Exception,e: except Exception: # set string if not a valid expression
# if options.onlyValid: val = newDef
# raise e
# val = newDef
seq.definition=val seq.definition=val
#
if newSeq is not None: if newSeq is not None:
# try: try:
if taxo is not None: if taxo is not None:
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math} environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
else: else:
environ = {'sequence':seq, 'counter':counter[0], 'math':math} environ = {'sequence':seq, 'counter':counter[0], 'math':math}
val = eval(newSeq, environ, seq) val = eval(newSeq, environ, seq)
# except Exception,e: except Exception: # set string if not a valid expression
# if options.onlyValid: val = newSeq
# raise e
# val = newSeq
seq.seq=val seq.seq=val
if 'seq_length' in seq: if 'seq_length' in seq:
seq['seq_length']=len(seq) seq['seq_length']=len(seq)
@ -251,15 +244,14 @@ def sequenceTaggerGenerator(config, taxo=None):
seq.view.delete_column(QUALITY_COLUMN) seq.view.delete_column(QUALITY_COLUMN)
if run is not None: if run is not None:
# try: try:
if taxo is not None: if taxo is not None:
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math} environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
else: else:
environ = {'sequence':seq, 'counter':counter[0], 'math':math} environ = {'sequence':seq, 'counter':counter[0], 'math':math}
eval(run, environ, seq) eval(run, environ, seq)
# except Exception,e: except Exception,e:
# if options.onlyValid: raise e
# raise e
return sequenceTagger return sequenceTagger
@ -379,7 +371,7 @@ def run(config):
# If the input and the output DMS are different, delete the temporary imported view used to create the final view # If the input and the output DMS are different, delete the temporary imported view used to create the final view
if i_dms != o_dms: if i_dms != o_dms:
View.delete_view(o_dms, imported_view_name) View.delete_view(o_dms, imported_view_name)
o_dms.close() o_dms.close(force=True)
i_dms.close() i_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -97,9 +97,9 @@ def run(config):
# If the input and the output DMS are different, delete the temporary result view in the input DMS # If the input and the output DMS are different, delete the temporary result view in the input DMS
if i_dms != o_dms: if i_dms != o_dms:
View.delete_view(i_dms, o_view_name) View.delete_view(i_dms, o_view_name)
o_dms.close() o_dms.close(force=True)
i_dms.close() i_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -87,6 +87,23 @@ def run(config):
Column.new_column(o_view, REVERSE_SEQUENCE_COLUMN, OBI_SEQ) Column.new_column(o_view, REVERSE_SEQUENCE_COLUMN, OBI_SEQ)
Column.new_column(o_view, REVERSE_QUALITY_COLUMN, OBI_QUAL, associated_column_name=REVERSE_SEQUENCE_COLUMN, associated_column_version=o_view[REVERSE_SEQUENCE_COLUMN].version) Column.new_column(o_view, REVERSE_QUALITY_COLUMN, OBI_QUAL, associated_column_name=REVERSE_SEQUENCE_COLUMN, associated_column_version=o_view[REVERSE_SEQUENCE_COLUMN].version)
# Initialize multiple elements columns
dict_cols = {}
for v in iview_list:
for coln in v.keys():
if v[coln].nb_elements_per_line > 1:
if coln not in dict_cols:
dict_cols[coln] = {}
dict_cols[coln]['eltnames'] = set(v[coln].elements_names)
dict_cols[coln]['nbelts'] = v[coln].nb_elements_per_line
dict_cols[coln]['obitype'] = v[coln].data_type_int
else:
dict_cols[coln]['eltnames'] = set(v[coln].elements_names + list(dict_cols[coln]['eltnames']))
dict_cols[coln]['nbelts'] = len(dict_cols[coln]['eltnames'])
for coln in dict_cols:
Column.new_column(o_view, coln, dict_cols[coln]['obitype'],
nb_elements_per_line=dict_cols[coln]['nbelts'], elements_names=list(dict_cols[coln]['eltnames']))
# Initialize the progress bar # Initialize the progress bar
pb = ProgressBar(total_len, config, seconde=5) pb = ProgressBar(total_len, config, seconde=5)
@ -116,7 +133,7 @@ def run(config):
#print(repr(view), file=sys.stderr) #print(repr(view), file=sys.stderr)
for d in idms_list: for d in idms_list:
d.close() d.close(force=True)
o_dms.close() o_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -124,8 +124,8 @@ def run(config):
# If the input and the output DMS are different, delete the temporary result view in the input DMS # If the input and the output DMS are different, delete the temporary result view in the input DMS
if i_dms != o_dms: if i_dms != o_dms:
View.delete_view(i_dms, o_view_name) View.delete_view(i_dms, o_view_name)
o_dms.close() o_dms.close(force=True)
i_dms.close() i_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -56,3 +56,5 @@ def run(config):
print(count2) print(count2)
else: else:
print(count1) print(count1)
input[0].close(force=True)

View File

@ -35,13 +35,15 @@ def addOptions(parser):
action="store", dest="ecopcr:primer1", action="store", dest="ecopcr:primer1",
metavar='<PRIMER>', metavar='<PRIMER>',
type=str, type=str,
help="Forward primer.") required=True,
help="Forward primer, length must be less than or equal to 32")
group.add_argument('--primer2', '-R', group.add_argument('--primer2', '-R',
action="store", dest="ecopcr:primer2", action="store", dest="ecopcr:primer2",
metavar='<PRIMER>', metavar='<PRIMER>',
type=str, type=str,
help="Reverse primer.") required=True,
help="Reverse primer, length must be less than or equal to 32")
group.add_argument('--error', '-e', group.add_argument('--error', '-e',
action="store", dest="ecopcr:error", action="store", dest="ecopcr:error",
@ -203,6 +205,7 @@ def run(config):
#print("\n\nOutput view:\n````````````", file=sys.stderr) #print("\n\nOutput view:\n````````````", file=sys.stderr)
#print(repr(o_dms[o_view_name]), file=sys.stderr) #print(repr(o_dms[o_view_name]), file=sys.stderr)
o_dms.close() i_dms.close(force=True)
o_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -64,9 +64,9 @@ def run(config):
ref_view_name = ref[1] ref_view_name = ref[1]
# Check that the threshold demanded is greater than or equal to the threshold used to build the reference database # Check that the threshold demanded is greater than or equal to the threshold used to build the reference database
if config['ecotag']['threshold'] < eval(i_dms[ref_view_name].comments["ref_db_threshold"]) : if config['ecotag']['threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
print("Error: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).", print("Error: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).",
config['ecotag']['threshold'], i_dms[ref_view_name].comments["ref_db_threshold"]) config['ecotag']['threshold'], ref_dms[ref_view_name].comments["ref_db_threshold"])
# Open the output: only the DMS # Open the output: only the DMS
output = open_uri(config['obi']['outputURI'], output = open_uri(config['obi']['outputURI'],
@ -107,8 +107,8 @@ def run(config):
comments = View.print_config(config, "ecotag", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name) comments = View.print_config(config, "ecotag", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \ if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
tobytes(ref_dms_name), tobytes(ref_view_name), \ ref_dms.name_with_full_path, tobytes(ref_view_name), \
tobytes(taxo_dms_name), tobytes(taxonomy_name), \ taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
tobytes(o_view_name), comments, tobytes(o_view_name), comments,
config['ecotag']['threshold']) < 0: config['ecotag']['threshold']) < 0:
raise Exception("Error running ecotag") raise Exception("Error running ecotag")
@ -126,9 +126,11 @@ def run(config):
# If the input and the output DMS are different, delete the temporary result view in the input DMS # If the input and the output DMS are different, delete the temporary result view in the input DMS
if i_dms != o_dms: if i_dms != o_dms:
View.delete_view(i_dms, o_view_name) View.delete_view(i_dms, o_view_name)
o_dms.close() o_dms.close(force=True)
i_dms.close() taxo_dms.close(force=True)
ref_dms.close(force=True)
i_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -60,11 +60,21 @@ def run(config):
if (output[2] == Nuc_Seq) and (iview.type != b"NUC_SEQS_VIEW") : # Nuc_Seq_Stored? TODO if (output[2] == Nuc_Seq) and (iview.type != b"NUC_SEQS_VIEW") : # Nuc_Seq_Stored? TODO
raise Exception("Error: the view to export in fasta or fastq format is not a NUC_SEQS view") raise Exception("Error: the view to export in fasta or fastq format is not a NUC_SEQS view")
if config['obi']['only'] is not None:
withoutskip = min(input[4], config['obi']['only'])
else:
withoutskip = input[4]
if config['obi']['skip'] is not None:
skip = min(input[4], config['obi']['skip'])
else:
skip = 0
# Initialize the progress bar # Initialize the progress bar
if config['obi']['noprogressbar']: if config['obi']['noprogressbar']:
pb = None pb = None
else: else:
pb = ProgressBar(len(iview), config, seconde=5) pb = ProgressBar(withoutskip - skip, config, seconde=5)
i=0 i=0
for seq in iview : for seq in iview :
@ -86,7 +96,7 @@ def run(config):
if not BrokenPipeError and not IOError: if not BrokenPipeError and not IOError:
output_object.close() output_object.close()
iview.close() iview.close()
input[0].close() input[0].close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -370,7 +370,7 @@ def run(config):
# If the input and the output DMS are different, delete the temporary imported view used to create the final view # If the input and the output DMS are different, delete the temporary imported view used to create the final view
if i_dms != o_dms: if i_dms != o_dms:
View.delete_view(i_dms, o_view_name) View.delete_view(i_dms, o_view_name)
o_dms.close() o_dms.close(force=True)
i_dms.close() i_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -103,7 +103,7 @@ def run(config):
# If the input and the output DMS are different, delete the temporary imported view used to create the final view # If the input and the output DMS are different, delete the temporary imported view used to create the final view
if i_dms != o_dms: if i_dms != o_dms:
View.delete_view(i_dms, o_view_name) View.delete_view(i_dms, o_view_name)
o_dms.close() o_dms.close(force=True)
i_dms.close() i_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -55,3 +55,4 @@ def run(config):
else: else:
raise Exception("ASCII history only available for views") raise Exception("ASCII history only available for views")
input[0].close(force=True)

View File

@ -25,7 +25,8 @@ from obitools3.dms.capi.obiview cimport VIEW_TYPE_NUC_SEQS, \
DEFINITION_COLUMN, \ DEFINITION_COLUMN, \
QUALITY_COLUMN, \ QUALITY_COLUMN, \
COUNT_COLUMN, \ COUNT_COLUMN, \
TAXID_COLUMN TAXID_COLUMN, \
MERGED_PREFIX
from obitools3.dms.capi.obidms cimport obi_import_view from obitools3.dms.capi.obidms cimport obi_import_view
@ -72,7 +73,7 @@ def addOptions(parser):
action="store_true", dest="import:preread", action="store_true", dest="import:preread",
default=False, default=False,
help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for " help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
"a much faster import.") "a much faster import. This option is not recommended and will slow down the import in any other case.")
def run(config): def run(config):
@ -163,7 +164,7 @@ def run(config):
taxo.write(taxo_name) taxo.write(taxo_name)
taxo.close() taxo.close()
o_dms.record_command_line(" ".join(sys.argv[1:])) o_dms.record_command_line(" ".join(sys.argv[1:]))
o_dms.close() o_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")
return return
@ -217,11 +218,14 @@ def run(config):
logger("info", "Read %d entries", i) logger("info", "Read %d entries", i)
for tag in entry : for tag in entry :
newtag = tag
if tag[:7] == b"merged_":
newtag = MERGED_PREFIX+tag[7:]
if type(entry[tag]) == dict : if type(entry[tag]) == dict :
if tag in dict_dict: if tag in dict_dict:
dict_dict[tag][0].update(entry[tag].keys()) dict_dict[newtag][0].update(entry[tag].keys())
else: else:
dict_dict[tag] = [set(entry[tag].keys()), get_obitype(entry[tag])] dict_dict[newtag] = [set(entry[tag].keys()), get_obitype(entry[tag])]
i+=1 i+=1
if pb is not None: if pb is not None:
@ -232,7 +236,7 @@ def run(config):
dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \ dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
nb_elements_per_line=len(dict_dict[tag][0]), \ nb_elements_per_line=len(dict_dict[tag][0]), \
elements_names=list(dict_dict[tag][0])), \ elements_names=list(dict_dict[tag][0])), \
value_obitype) dict_dict[tag][1])
# Reinitialize the input # Reinitialize the input
@ -256,7 +260,6 @@ def run(config):
if entry is None: # error or exception handled at lower level, not raised because Python generators can't resume after any exception is raised if entry is None: # error or exception handled at lower level, not raised because Python generators can't resume after any exception is raised
if config['obi']['skiperror']: if config['obi']['skiperror']:
i-=1
continue continue
else: else:
raise RollbackException("obi import error, rollbacking view", view) raise RollbackException("obi import error, rollbacking view", view)
@ -266,6 +269,8 @@ def run(config):
elif not i%50000: elif not i%50000:
logger("info", "Imported %d entries", i) logger("info", "Imported %d entries", i)
try:
if NUC_SEQS_view: if NUC_SEQS_view:
id_col[i] = entry.id id_col[i] = entry.id
def_col[i] = entry.definition def_col[i] = entry.definition
@ -288,6 +293,8 @@ def run(config):
tag = TAXID_COLUMN tag = TAXID_COLUMN
if tag == b"count": if tag == b"count":
tag = COUNT_COLUMN tag = COUNT_COLUMN
if tag[:7] == b"merged_":
tag = MERGED_PREFIX+tag[7:]
if tag not in dcols : if tag not in dcols :
@ -328,8 +335,8 @@ def run(config):
try: try:
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key # Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
if type(value) == dict and \ if type(value) == dict and \
dcols[tag][0].nb_elements_per_line == 1 and len(value.keys()) == 1 \ dcols[tag][0].nb_elements_per_line == 1 \
and dcols[tag][0].elements_names[0] != list(value.keys())[0] : and set(dcols[tag][0].elements_names) != set(value.keys()) :
raise IndexError # trigger column rewrite raise IndexError # trigger column rewrite
# Fill value # Fill value
@ -382,6 +389,13 @@ def run(config):
# Fill value # Fill value
dcols[tag][0][i] = value dcols[tag][0][i] = value
except Exception as e:
print("\nCould not import sequence id:", entry.id, "(error raised:", e, ")")
if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
raise e
else:
pass
i+=1 i+=1
if pb is not None: if pb is not None:
@ -402,7 +416,7 @@ def run(config):
except AttributeError: except AttributeError:
pass pass
try: try:
output[0].close() output[0].close(force=True)
except AttributeError: except AttributeError:
pass pass

View File

@ -46,5 +46,5 @@ def run(config):
process.wait() process.wait()
iview.close() iview.close()
input[0].close() input[0].close(force=True)

View File

@ -36,6 +36,7 @@ def run(config):
l = [] l = []
for view in input[0]: for view in input[0]:
l.append(tostr(view) + "\t(Date created: " + str(bytes2str_object(dms[view].comments["Date created"]))+")") l.append(tostr(view) + "\t(Date created: " + str(bytes2str_object(dms[view].comments["Date created"]))+")")
dms[view].close()
l.sort() l.sort()
for v in l: for v in l:
print(v) print(v)
@ -52,3 +53,4 @@ def run(config):
print("\n### Comments:") print("\n### Comments:")
print(str(input[1].comments)) print(str(input[1].comments))
input[0].close(force=True)

View File

@ -42,7 +42,9 @@ def addOptions(parser):
metavar="<URI>", metavar="<URI>",
type=str, type=str,
default=None, default=None,
help="URI to the view containing the samples definition (with tags, primers, sample names,...)") required=True,
help="URI to the view containing the samples definition (with tags, primers, sample names,...).\n"
"\nWarning: primer lengths must be less than or equal to 32")
group.add_argument('-R', '--reverse-reads', group.add_argument('-R', '--reverse-reads',
action="store", dest="ngsfilter:reverse", action="store", dest="ngsfilter:reverse",
@ -172,6 +174,13 @@ cdef read_info_view(info_view, max_errors=2, verbose=False, not_aligned=False):
primer_list = [] primer_list = []
i=0 i=0
for p in info_view: for p in info_view:
# Check primer length: should not be longer than 32, the max allowed by the apat lib
if len(p[b'forward_primer']) > 32:
raise RollbackException("Error: primers can not be longer than 32bp, rollbacking views")
if len(p[b'reverse_primer']) > 32:
raise RollbackException("Error: primers can not be longer than 32bp, rollbacking views")
forward=Primer(p[b'forward_primer'], forward=Primer(p[b'forward_primer'],
len(p[b'forward_tag']) if (b'forward_tag' in p and p[b'forward_tag']!=None) else None, len(p[b'forward_tag']) if (b'forward_tag' in p and p[b'forward_tag']!=None) else None,
True, True,
@ -594,7 +603,13 @@ def run(config):
pb = ProgressBar(entries_len, config, seconde=5) pb = ProgressBar(entries_len, config, seconde=5)
# Check and store primers and tags # Check and store primers and tags
try:
infos, primer_list = read_info_view(info_view, max_errors=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option infos, primer_list = read_info_view(info_view, max_errors=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option
except RollbackException, e:
if unidentified is not None:
raise RollbackException("obi ngsfilter error, rollbacking views: "+str(e), o_view, unidentified)
else:
raise RollbackException("obi ngsfilter error, rollbacking view: "+str(e), o_view)
aligner = Primer_search(primer_list, config['ngsfilter']['error']) aligner = Primer_search(primer_list, config['ngsfilter']['error'])
@ -652,11 +667,11 @@ def run(config):
#print("\n\nOutput view:\n````````````", file=sys.stderr) #print("\n\nOutput view:\n````````````", file=sys.stderr)
#print(repr(o_view), file=sys.stderr) #print(repr(o_view), file=sys.stderr)
input[0].close() input[0].close(force=True)
output[0].close() output[0].close(force=True)
info_input[0].close() info_input[0].close(force=True)
if unidentified is not None: if unidentified is not None:
unidentified_input[0].close() unidentified_input[0].close(force=True)
aligner.free() aligner.free()
logger("info", "Done.") logger("info", "Done.")

View File

@ -141,7 +141,7 @@ def run(config):
# If the input and the output DMS are different, delete the temporary imported view used to create the final view # If the input and the output DMS are different, delete the temporary imported view used to create the final view
if i_dms != o_dms: if i_dms != o_dms:
View.delete_view(i_dms, o_view_name) View.delete_view(i_dms, o_view_name)
o_dms.close() o_dms.close(force=True)
i_dms.close() i_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -251,7 +251,7 @@ def run(config):
for i in range(len(sorted_stats)): for i in range(len(sorted_stats)):
c = sorted_stats[i][0] c = sorted_stats[i][0]
for v in c: for v in c:
if v is not None: if type(v) == bytes:
print(pcat % tostr(v)+"\t", end="") print(pcat % tostr(v)+"\t", end="")
else: else:
print(pcat % str(v)+"\t", end="") print(pcat % str(v)+"\t", end="")
@ -268,6 +268,6 @@ def run(config):
print("%7d" %catcount[c], end="") print("%7d" %catcount[c], end="")
print("%9d" %totcount[c]) print("%9d" %totcount[c])
input[0].close() input[0].close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -106,7 +106,7 @@ def run(config):
# If the input and the output DMS are different, delete the temporary imported view used to create the final view # If the input and the output DMS are different, delete the temporary imported view used to create the final view
if i_dms != o_dms: if i_dms != o_dms:
View.delete_view(i_dms, o_view_name) View.delete_view(i_dms, o_view_name)
o_dms.close() o_dms.close(force=True)
i_dms.close() i_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -529,7 +529,7 @@ def run(config):
test_taxo(config, infos) test_taxo(config, infos)
infos['view'].close() infos['view'].close()
infos['dms'].close() infos['dms'].close(force=True)
shutil.rmtree(config['obi']['defaultdms']+'.obidms', ignore_errors=True) shutil.rmtree(config['obi']['defaultdms']+'.obidms', ignore_errors=True)
print("Done.") print("Done.")

View File

@ -5,5 +5,5 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy, dict config)
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*, int max_elts=*) cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, dict config, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*, int max_elts=*)

View File

@ -56,7 +56,7 @@ def addOptions(parser):
"(option can be used several times).") "(option can be used several times).")
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) : cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy, dict config) :
cdef int taxid cdef int taxid
cdef Nuc_Seq_Stored seq cdef Nuc_Seq_Stored seq
@ -69,7 +69,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
cdef object gn_sn cdef object gn_sn
cdef object fa_sn cdef object fa_sn
# Create columns # Create columns and save them for efficiency
if b"species" in o_view and o_view[b"species"].data_type_int != OBI_INT : if b"species" in o_view and o_view[b"species"].data_type_int != OBI_INT :
o_view.delete_column(b"species") o_view.delete_column(b"species")
if b"species" not in o_view: if b"species" not in o_view:
@ -77,6 +77,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"species", b"species",
OBI_INT OBI_INT
) )
species_column = o_view[b"species"]
if b"genus" in o_view and o_view[b"genus"].data_type_int != OBI_INT : if b"genus" in o_view and o_view[b"genus"].data_type_int != OBI_INT :
o_view.delete_column(b"genus") o_view.delete_column(b"genus")
@ -85,6 +86,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"genus", b"genus",
OBI_INT OBI_INT
) )
genus_column = o_view[b"genus"]
if b"family" in o_view and o_view[b"family"].data_type_int != OBI_INT : if b"family" in o_view and o_view[b"family"].data_type_int != OBI_INT :
o_view.delete_column(b"family") o_view.delete_column(b"family")
@ -93,6 +95,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"family", b"family",
OBI_INT OBI_INT
) )
family_column = o_view[b"family"]
if b"species_name" in o_view and o_view[b"species_name"].data_type_int != OBI_STR : if b"species_name" in o_view and o_view[b"species_name"].data_type_int != OBI_STR :
o_view.delete_column(b"species_name") o_view.delete_column(b"species_name")
@ -101,6 +104,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"species_name", b"species_name",
OBI_STR OBI_STR
) )
species_name_column = o_view[b"species_name"]
if b"genus_name" in o_view and o_view[b"genus_name"].data_type_int != OBI_STR : if b"genus_name" in o_view and o_view[b"genus_name"].data_type_int != OBI_STR :
o_view.delete_column(b"genus_name") o_view.delete_column(b"genus_name")
@ -109,6 +113,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"genus_name", b"genus_name",
OBI_STR OBI_STR
) )
genus_name_column = o_view[b"genus_name"]
if b"family_name" in o_view and o_view[b"family_name"].data_type_int != OBI_STR : if b"family_name" in o_view and o_view[b"family_name"].data_type_int != OBI_STR :
o_view.delete_column(b"family_name") o_view.delete_column(b"family_name")
@ -117,6 +122,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"family_name", b"family_name",
OBI_STR OBI_STR
) )
family_name_column = o_view[b"family_name"]
if b"rank" in o_view and o_view[b"rank"].data_type_int != OBI_STR : if b"rank" in o_view and o_view[b"rank"].data_type_int != OBI_STR :
o_view.delete_column(b"rank") o_view.delete_column(b"rank")
@ -125,6 +131,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"rank", b"rank",
OBI_STR OBI_STR
) )
rank_column = o_view[b"rank"]
if b"scientific_name" in o_view and o_view[b"scientific_name"].data_type_int != OBI_STR : if b"scientific_name" in o_view and o_view[b"scientific_name"].data_type_int != OBI_STR :
o_view.delete_column(b"scientific_name") o_view.delete_column(b"scientific_name")
@ -133,9 +140,15 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"scientific_name", b"scientific_name",
OBI_STR OBI_STR
) )
scientific_name_column = o_view[b"scientific_name"]
# Initialize the progress bar
pb = ProgressBar(len(o_view), config, seconde=5)
i=0
for seq in o_view: for seq in o_view:
PyErr_CheckSignals() PyErr_CheckSignals()
pb(i)
if MERGED_TAXID_COLUMN in seq : if MERGED_TAXID_COLUMN in seq :
m_taxids = [] m_taxids = []
m_taxids_dict = seq[MERGED_TAXID_COLUMN] m_taxids_dict = seq[MERGED_TAXID_COLUMN]
@ -166,19 +179,22 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
fa_sn = None fa_sn = None
tfa = None tfa = None
seq[b"species"] = tsp species_column[i] = tsp
seq[b"genus"] = tgn genus_column[i] = tgn
seq[b"family"] = tfa family_column[i] = tfa
seq[b"species_name"] = sp_sn species_name_column[i] = sp_sn
seq[b"genus_name"] = gn_sn genus_name_column[i] = gn_sn
seq[b"family_name"] = fa_sn family_name_column[i] = fa_sn
seq[b"rank"] = taxonomy.get_rank(taxid) rank_column[i] = taxonomy.get_rank(taxid)
seq[b"scientific_name"] = taxonomy.get_scientific_name(taxid) scientific_name_column[i] = taxonomy.get_scientific_name(taxid)
i+=1
pb(len(o_view), force=True)
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) : cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, dict config, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) :
cdef int i cdef int i
cdef int k cdef int k
@ -187,6 +203,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
cdef int u_idx cdef int u_idx
cdef int i_idx cdef int i_idx
cdef int i_count cdef int i_count
cdef int o_count
cdef str key_str cdef str key_str
cdef bytes key cdef bytes key
cdef bytes mkey cdef bytes mkey
@ -209,7 +226,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
cdef Nuc_Seq_Stored i_seq cdef Nuc_Seq_Stored i_seq
cdef Nuc_Seq_Stored o_seq cdef Nuc_Seq_Stored o_seq
cdef Nuc_Seq_Stored u_seq cdef Nuc_Seq_Stored u_seq
cdef Column i_col
cdef Column i_seq_col cdef Column i_seq_col
cdef Column i_id_col cdef Column i_id_col
cdef Column i_taxid_col cdef Column i_taxid_col
@ -217,6 +233,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
cdef Column o_id_col cdef Column o_id_col
cdef Column o_taxid_dist_col cdef Column o_taxid_dist_col
cdef Column o_merged_col cdef Column o_merged_col
cdef Column o_count_col
cdef Column i_count_col
cdef Column_line i_mcol cdef Column_line i_mcol
cdef object taxid_dist_dict cdef object taxid_dist_dict
cdef object iter_view cdef object iter_view
@ -253,6 +271,11 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
for k in range(k_count): for k in range(k_count):
mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k]) mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k])
# Check that not trying to remerge without total count information
for key in mergedKeys_m:
if key in view and COUNT_COLUMN not in view:
raise Exception("\n>>>>\nError: trying to re-merge tags without total count tag. Run obi annotate to add the count tag from the relevant merged tag, i.e.: \nobi annotate --set-tag COUNT:'sum([value for key,value in sequence['MERGED_sample'].items()])' dms/input dms/output\n")
if categories is None: if categories is None:
categories = [] categories = []
@ -320,6 +343,10 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
for k in range(k_count): for k in range(k_count):
key = mergedKeys[k] key = mergedKeys[k]
merged_col_name = mergedKeys_m[k] merged_col_name = mergedKeys_m[k]
if merged_col_name in view:
i_col = view[merged_col_name]
else:
i_col = view[key] i_col = view[key]
if merged_infos[merged_col_name]['nb_elts'] > max_elts: if merged_infos[merged_col_name]['nb_elts'] > max_elts:
@ -374,23 +401,30 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
alias=MERGED_COLUMN alias=MERGED_COLUMN
) )
# Keep columns that are going to be used a lot in variables # Keep columns in variables for efficiency
o_id_col = o_view[ID_COLUMN] o_id_col = o_view[ID_COLUMN]
if TAXID_DIST_COLUMN in o_view: if TAXID_DIST_COLUMN in o_view:
o_taxid_dist_col = o_view[TAXID_DIST_COLUMN] o_taxid_dist_col = o_view[TAXID_DIST_COLUMN]
if MERGED_COLUMN in o_view: if MERGED_COLUMN in o_view:
o_merged_col = o_view[MERGED_COLUMN] o_merged_col = o_view[MERGED_COLUMN]
if COUNT_COLUMN not in o_view:
Column.new_column(o_view,
COUNT_COLUMN,
OBI_INT)
o_count_col = o_view[COUNT_COLUMN]
if COUNT_COLUMN in view:
i_count_col = view[COUNT_COLUMN]
pb(len(view), force=True) pb(len(view), force=True)
print("") print("")
logger("info", "Second browsing through the input") logger("info", "Second browsing through the input")
# Initialize the progress bar # Initialize the progress bar
pb = ProgressBar(len(uniques), seconde=5) pb = ProgressBar(len(view), seconde=5)
o_idx = 0 o_idx = 0
total_treated = 0
for unique_id in uniques : for unique_id in uniques :
PyErr_CheckSignals() PyErr_CheckSignals()
pb(o_idx)
merged_sequences = uniques[unique_id] merged_sequences = uniques[unique_id]
@ -407,7 +441,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
merged_list = list(set(merged_list)) # deduplicate the list merged_list = list(set(merged_list)) # deduplicate the list
o_merged_col[o_idx] = merged_list o_merged_col[o_idx] = merged_list
o_seq[COUNT_COLUMN] = 0 o_count = 0
if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None: if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None:
taxid_dist_dict = i_taxid_dist_col[u_idx] taxid_dist_dict = i_taxid_dist_col[u_idx]
@ -419,16 +453,17 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
merged_dict[mkey] = {} merged_dict[mkey] = {}
for i_idx in merged_sequences: for i_idx in merged_sequences:
pb(total_treated)
i_id = i_id_col[i_idx] i_id = i_id_col[i_idx]
i_seq = view[i_idx] i_seq = view[i_idx]
if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None: if COUNT_COLUMN not in i_seq or i_count_col[i_idx] is None:
i_count = 1 i_count = 1
else: else:
i_count = i_seq[COUNT_COLUMN] i_count = i_count_col[i_idx]
o_seq[COUNT_COLUMN] += i_count o_count += i_count
for k in range(k_count): for k in range(k_count):
@ -464,12 +499,14 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
else: else:
mcol[key2] = mcol[key2] + i_mcol[key2] mcol[key2] = mcol[key2] + i_mcol[key2]
# Write taxid_dist for key in i_seq.keys():
if mergeIds and TAXID_COLUMN in mergedKeys: # Delete informations that differ between the merged sequences
if TAXID_DIST_COLUMN in str_merged_cols: # TODO make special columns list? // could be more efficient
o_taxid_dist_col[o_idx] = str(taxid_dist_dict) if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \
else: and key not in merged_dict :
o_taxid_dist_col[o_idx] = taxid_dist_dict o_seq[key] = None
total_treated += 1
# Write merged dicts # Write merged dicts
for mkey in merged_dict: for mkey in merged_dict:
@ -482,25 +519,33 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
# if mkey_cols[mkey][o_idx][key] is None: # if mkey_cols[mkey][o_idx][key] is None:
# mkey_cols[mkey][o_idx][key] = 0 # mkey_cols[mkey][o_idx][key] = 0
for key in i_seq.keys(): # Write taxid_dist
# Delete informations that differ between the merged sequences if mergeIds and TAXID_COLUMN in mergedKeys:
# TODO make special columns list? if TAXID_DIST_COLUMN in str_merged_cols:
if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \ o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
and key not in merged_dict : else:
o_seq[key] = None o_taxid_dist_col[o_idx] = taxid_dist_dict
o_count_col[o_idx] = o_count
o_idx += 1 o_idx += 1
pb(len(view), force=True)
# Deletes quality columns if there is one because the matching between sequence and quality will be broken (quality set to NA when sequence not) # Deletes quality columns if there is one because the matching between sequence and quality will be broken (quality set to NA when sequence not)
if QUALITY_COLUMN in view: if QUALITY_COLUMN in view:
o_view.delete_column(QUALITY_COLUMN) o_view.delete_column(QUALITY_COLUMN)
if REVERSE_QUALITY_COLUMN in view: if REVERSE_QUALITY_COLUMN in view:
o_view.delete_column(REVERSE_QUALITY_COLUMN) o_view.delete_column(REVERSE_QUALITY_COLUMN)
# Delete old columns that are now merged
for k in range(k_count):
if mergedKeys[k] in o_view:
o_view.delete_column(mergedKeys[k])
if taxonomy is not None: if taxonomy is not None:
print("") # TODO because in the middle of progress bar. Better solution? print("") # TODO because in the middle of progress bar. Better solution?
logger("info", "Merging taxonomy classification") logger("info", "Merging taxonomy classification")
merge_taxonomy_classification(o_view, taxonomy) merge_taxonomy_classification(o_view, taxonomy, config)
@ -547,11 +592,10 @@ def run(config):
pb = ProgressBar(len(entries), config, seconde=5) pb = ProgressBar(len(entries), config, seconde=5)
try: try:
uniq_sequences(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts']) uniq_sequences(entries, o_view, pb, config, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])
except Exception, e: except Exception, e:
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view) raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
pb(len(entries), force=True)
print("", file=sys.stderr) print("", file=sys.stderr)
# Save command config in View and DMS comments # Save command config in View and DMS comments
@ -567,8 +611,8 @@ def run(config):
#print("\n\nOutput view:\n````````````", file=sys.stderr) #print("\n\nOutput view:\n````````````", file=sys.stderr)
#print(repr(o_view), file=sys.stderr) #print(repr(o_view), file=sys.stderr)
input[0].close() input[0].close(force=True)
output[0].close() output[0].close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -63,6 +63,8 @@ cdef extern from "obidmscolumn.h" nogil:
char* obi_get_elements_names(OBIDMS_column_p column) char* obi_get_elements_names(OBIDMS_column_p column)
char* obi_column_formatted_infos(OBIDMS_column_p column)
index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name) index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name)
int obi_column_write_comments(OBIDMS_column_p column, const char* comments) int obi_column_write_comments(OBIDMS_column_p column, const char* comments)

View File

@ -14,6 +14,7 @@ from ..capi.obidms cimport obi_import_column
from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \ from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \
obi_close_column, \ obi_close_column, \
obi_get_elements_names, \ obi_get_elements_names, \
obi_column_formatted_infos, \
obi_column_write_comments obi_column_write_comments
from ..capi.obiutils cimport obi_format_date from ..capi.obiutils cimport obi_format_date
@ -288,7 +289,13 @@ cdef class Column(OBIWrapper) :
@OBIWrapper.checkIsActive @OBIWrapper.checkIsActive
def __repr__(self) : def __repr__(self) :
cdef bytes s cdef bytes s
#cdef char* s_b
#cdef str s_str
#s_b = obi_column_formatted_infos(self.pointer())
#s_str = bytes2str(s_b)
#free(s_b)
s = self._alias + b", data type: " + self.data_type s = self._alias + b", data type: " + self.data_type
#return s_str
return bytes2str(s) return bytes2str(s)

View File

@ -94,16 +94,16 @@ cdef class DMS(OBIWrapper):
return dms return dms
def close(self) : def close(self, force=False) :
''' '''
Closes the DMS instance and free the associated memory Closes the DMS instance and free the associated memory (no counter, closing is final)
The `close` method is automatically called by the object destructor. The `close` method is automatically called by the object destructor.
''' '''
cdef OBIDMS_p pointer = self.pointer() cdef OBIDMS_p pointer = self.pointer()
if self.active() : if self.active() :
OBIWrapper.close(self) OBIWrapper.close(self)
if (obi_close_dms(pointer, False)) < 0 : if (obi_close_dms(pointer, force=force)) < 0 :
raise Exception("Problem closing an OBIDMS") raise Exception("Problem closing an OBIDMS")
@ -254,7 +254,8 @@ cdef class DMS(OBIWrapper):
# bash command history property getter # bash command history property getter
@property @property
def bash_history(self): def bash_history(self):
s = b"#!/bin/bash\n\n" #s = b"#!${bash}/bin/bash\n\n"
s = b""
first = True first = True
for command in self.command_line_history: for command in self.command_line_history:
s+=b"#" s+=b"#"

View File

@ -526,11 +526,12 @@ cdef class View(OBIWrapper) :
# bash command history property getter # bash command history property getter
@property @property
def bash_history(self): def bash_history(self):
s = b"#!/bin/bash\n\n" s = b""
first = True first = True
for level in self.view_history: for level in self.view_history:
command_list = [level[input][b"command_line"] for input in level.keys()] command_list = [level[input][b"command_line"] for input in level.keys()]
for command in command_list: for command in command_list:
s+=b"obi "
s+=command s+=command
s+=b"\n" s+=b"\n"
return s return s

View File

@ -188,7 +188,7 @@ def buildConsensus(ali, seq, ref_tags=None):
seq[b'shift']=ali.shift seq[b'shift']=ali.shift
else: else:
if len(ali[0])>999: # TODO why? if len(ali[0])>999: # TODO why?
raise AssertionError,"Too long alignemnt" raise AssertionError,"Too long alignment"
ic=IterOnConsensus(ali) ic=IterOnConsensus(ali)
@ -250,11 +250,21 @@ def buildJoinedSequence(ali, reverse, seq, forward=None):
quality.extend(reverse.quality) quality.extend(reverse.quality)
seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality) seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality)
seq[b"score"]=ali.score seq[b"score"]=ali.score
if len(ali.direction) > 0:
seq[b"ali_direction"]=ali.direction seq[b"ali_direction"]=ali.direction
else:
seq[b"ali_direction"]=None
seq[b"mode"]=b"joined" seq[b"mode"]=b"joined"
seq[b"pairedend_limit"]=len(forward) seq[b"pairedend_limit"]=len(forward)
seq[b"ali_length"] = ali.consensus_len
if ali.consensus_len > 0:
seq[b"score_norm"]=float(ali.score)/ali.consensus_len
else:
seq[b"score_norm"]=0.0
for tag in forward: for tag in forward:
if tag != REVERSE_SEQUENCE_COLUMN and tag != REVERSE_QUALITY_COLUMN: if tag != REVERSE_SEQUENCE_COLUMN and tag != REVERSE_QUALITY_COLUMN and \
tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN:
seq[tag] = forward[tag] seq[tag] = forward[tag]
return seq return seq

View File

@ -156,6 +156,9 @@ def emblIterator_file(lineiterator,
yield seq yield seq
read+=1 read+=1
# Last sequence
seq = emblParser(entry)
yield seq yield seq
free(entry) free(entry)
@ -174,7 +177,7 @@ def emblIterator_dir(dir_path,
for filename in files: for filename in files:
if read==only: if read==only:
return return
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files))) print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
f = uopen(filename) f = uopen(filename)
if only is not None: if only is not None:
only_f = only-read only_f = only-read

View File

@ -104,6 +104,7 @@ def fastaNucIterator(lineiterator,
cdef bytes sequence cdef bytes sequence
cdef int skipped, ionly, read cdef int skipped, ionly, read
cdef Nuc_Seq seq cdef Nuc_Seq seq
cdef bint stop
if only is None: if only is None:
ionly = -1 ionly = -1
@ -130,7 +131,8 @@ def fastaNucIterator(lineiterator,
else: else:
line = firstline line = firstline
while True: stop=False
while not stop:
if ionly >= 0 and read >= ionly: if ionly >= 0 and read >= ionly:
break break
@ -153,7 +155,7 @@ def fastaNucIterator(lineiterator,
s.append(line[0:-1]) s.append(line[0:-1])
line = next(iterator) line = next(iterator)
except StopIteration: except StopIteration:
pass stop=True
sequence = b"".join(s) sequence = b"".join(s)

View File

@ -25,8 +25,9 @@ from libc.string cimport strcpy, strlen
_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M) _featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)
_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M) _headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
_seqMatcher = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M) _seqMatcher = re.compile(b'ORIGIN.+(?=//\n)', re.DOTALL + re.M)
_cleanSeq = re.compile(b'[ \n0-9]+') _cleanSeq1 = re.compile(b'ORIGIN.+\n')
_cleanSeq2 = re.compile(b'[ \n0-9]+')
_acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M) _acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M)
_deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M) _deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M)
_cleanDe = re.compile(b'\n *') _cleanDe = re.compile(b'\n *')
@ -42,7 +43,8 @@ def genbankParser(bytes text):
ft = _featureMatcher.search(text).group() ft = _featureMatcher.search(text).group()
s = _seqMatcher.search(text).group() s = _seqMatcher.search(text).group()
s = _cleanSeq.sub(b'', s).upper() s = _cleanSeq1.sub(b'', s)
s = _cleanSeq2.sub(b'', s)
acs = _acMatcher.search(text).group() acs = _acMatcher.search(text).group()
acs = acs.split() acs = acs.split()
@ -52,12 +54,6 @@ def genbankParser(bytes text):
de = _deMatcher.search(header).group() de = _deMatcher.search(header).group()
de = _cleanDe.sub(b' ',de).strip().strip(b'.') de = _cleanDe.sub(b' ',de).strip().strip(b'.')
except Exception as e:
print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
# Do not raise any Exception if you need the possibility to resume the generator
# (Python generators can't resume after any exception is raised)
return None
tags = {} tags = {}
extractTaxon(ft, tags) extractTaxon(ft, tags)
@ -68,6 +64,12 @@ def genbankParser(bytes text):
offset=-1, offset=-1,
tags=tags) tags=tags)
except Exception as e:
print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
# Do not raise any Exception if you need the possibility to resume the generator
# (Python generators can't resume after any exception is raised)
return None
return seq return seq
@ -153,6 +155,9 @@ def genbankIterator_file(lineiterator,
yield seq yield seq
read+=1 read+=1
# Last sequence
seq = genbankParser(entry)
yield seq yield seq
free(entry) free(entry)
@ -168,10 +173,12 @@ def genbankIterator_dir(dir_path,
read = 0 read = 0
read_files = 0 read_files = 0
files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))] files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))]
files.extend([filename for filename in glob.glob(os.path.join(path, b'*.seq*'))]) # new genbank extension
files = list(set(files))
for filename in files: for filename in files:
if read==only: if read==only:
return return
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files))) print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
f = uopen(filename) f = uopen(filename)
if only is not None: if only is not None:
only_f = only-read only_f = only-read

View File

@ -72,7 +72,7 @@ cpdef int count_entries(file, bytes format):
return -1 return -1
mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
total_count += len(re.findall(sep, mmapped_file)) total_count += len(re.findall(sep, mmapped_file))
if format != b"ngsfilter" and format != b"tabular": if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank":
total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n) total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n)
except: except:

View File

@ -1,5 +1,5 @@
major = 3 major = 3
minor = 0 minor = 0
serial= '0-beta10' serial= '0b24'
version ="%d.%02d.%s" % (major,minor,serial) version ="%d.%d.%s" % (major,minor,serial)

5
requirements.txt Executable file
View File

@ -0,0 +1,5 @@
--extra-index-url https://pypi.python.org/simple/
Cython>=0.24
Sphinx>=1.2.0
ipython>=3.0.0
breathe>=4.0.0

View File

@ -5,7 +5,8 @@ import re
import subprocess import subprocess
from distutils import log from distutils import log
from distutils.core import setup #from distutils.core import setup
from setuptools import setup # to work with pip
from distutils.core import Extension from distutils.core import Extension
from distutils.sysconfig import get_python_lib from distutils.sysconfig import get_python_lib
@ -26,10 +27,11 @@ class Distribution(ori_Distribution):
ori_Distribution.__init__(self, attrs) ori_Distribution.__init__(self, attrs)
self.global_options.insert(0,('cobitools3', None, "intall location of the C library" self.global_options.insert(0,('cobitools3', None, "install location of the C library"
)) ))
from distutils.command.build import build as build_ori from distutils.command.build import build as build_ori
from setuptools.command.bdist_egg import bdist_egg as bdist_egg_ori
from distutils.core import Command from distutils.core import Command
@ -70,6 +72,12 @@ class build(build_ori):
build_ori.run(self) build_ori.run(self)
class bdist_egg(bdist_egg_ori):
def run(self):
self.run_command('build_clib')
bdist_egg_ori.run(self)
sys.path.append(os.path.abspath("python")) sys.path.append(os.path.abspath("python"))
@ -88,9 +96,10 @@ PACKAGE = "OBITools3"
VERSION = version VERSION = version
AUTHOR = 'Celine Mercier' AUTHOR = 'Celine Mercier'
EMAIL = 'celine.mercier@metabarcoding.org' EMAIL = 'celine.mercier@metabarcoding.org'
URL = "http://metabarcoding.org/obitools3" URL = "https://metabarcoding.org/obitools3"
PLATFORMS = "posix"
LICENSE = "CeCILL-V2" LICENSE = "CeCILL-V2"
DESCRIPTION = "Tools and library for DNA metabarcoding", DESCRIPTION = "A package for the management of analyses and data in DNA metabarcoding."
PYTHONMIN = '3.5' PYTHONMIN = '3.5'
SRC = 'python' SRC = 'python'
@ -147,17 +156,24 @@ classifiers=['Development Status :: 4 - Beta',
'Topic :: Utilities', 'Topic :: Utilities',
] ]
with open("README.md", "r") as fh:
long_description = fh.read()
setup(name=PACKAGE, setup(name=PACKAGE,
description=DESCRIPTION, description=DESCRIPTION,
long_description=long_description,
long_description_content_type="text/markdown",
classifiers=classifiers, classifiers=classifiers,
version=VERSION, version=VERSION,
author=AUTHOR, author=AUTHOR,
author_email=EMAIL, author_email=EMAIL,
platforms=PLATFORMS,
license=LICENSE, license=LICENSE,
url=URL, url=URL,
ext_modules=xx, ext_modules=xx,
distclass=Distribution, distclass=Distribution,
cmdclass={'build': build, cmdclass={'build': build,
'bdist_egg': bdist_egg,
'build_clib': build_clib}, 'build_clib': build_clib},
cobitools3=get_python_lib(), cobitools3=get_python_lib(),
packages = findPackage('python'), packages = findPackage('python'),

View File

@ -157,7 +157,7 @@ int build_reference_db(const char* dms_name,
ecotx_t* lca_2 = NULL; ecotx_t* lca_2 = NULL;
ecotx_t* lca = NULL; ecotx_t* lca = NULL;
index_t idx1, idx2; index_t idx1, idx2;
index_t i, j, k; index_t i, j, k, count;
int32_t taxid_array_length; int32_t taxid_array_length;
int32_t score_array_length; int32_t score_array_length;
int32_t taxid_array_writable_length; int32_t taxid_array_writable_length;
@ -185,6 +185,7 @@ int build_reference_db(const char* dms_name,
matrix_view_name = strcpy(matrix_view_name, o_view_name); matrix_view_name = strcpy(matrix_view_name, o_view_name);
strcat(matrix_view_name, "_matrix"); strcat(matrix_view_name, "_matrix");
fprintf(stderr, "Aligning queries with reference database...\n");
if (obi_lcs_align_one_column(dms_name, if (obi_lcs_align_one_column(dms_name,
refs_view_name, refs_view_name,
"", "",
@ -320,13 +321,19 @@ int build_reference_db(const char* dms_name,
return -1; return -1;
} }
count = (matrix_with_lca_view->infos)->line_count;
fprintf(stderr, "Computing LCAs...\n");
// Compute all the LCAs // Compute all the LCAs
// For each pair // For each pair
for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++) for (i=0; i<count; i++)
{ {
if (! keep_running) if (! keep_running)
return -1; return -1;
if (i%1000 == 0)
fprintf(stderr,"\rDone : %f %% ", (i / (float) count)*100);
// Read all taxids associated with the first sequence and compute their LCA // Read all taxids associated with the first sequence and compute their LCA
// Read line index // Read line index
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0); idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
@ -363,6 +370,7 @@ int build_reference_db(const char* dms_name,
return -1; return -1;
} }
} }
fprintf(stderr,"\rDone : 100 %% \n");
// Clone refs view, add 2 arrays columns for lca and score, compute and write them // Clone refs view, add 2 arrays columns for lca and score, compute and write them
@ -442,13 +450,18 @@ int build_reference_db(const char* dms_name,
return -1; return -1;
} }
fprintf(stderr, "Building LCA arrays...\n");
// For each sequence, look for all its alignments in the matrix, and for each different LCA taxid/score, order them and write them // For each sequence, look for all its alignments in the matrix, and for each different LCA taxid/score, order them and write them
// Going through matrix once, filling refs arrays on the go for efficiency // Going through matrix once, filling refs arrays on the go for efficiency
for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++) for (i=0; i<count; i++)
{ {
if (! keep_running) if (! keep_running)
return -1; return -1;
if (i%1000 == 0)
fprintf(stderr,"\rDone : %f %% ", (i / (float) count)*100);
// Read ref line indexes // Read ref line indexes
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0); idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0); idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0);
@ -464,6 +477,8 @@ int build_reference_db(const char* dms_name,
// Read alignment score // Read alignment score
score = obi_get_float_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_score_column, i, 0); score = obi_get_float_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_score_column, i, 0);
//fprintf(stderr, "\n\ntaxid_lca=%d, score=%f, idx1=%d, idx2=%d", taxid_lca, score, idx1, idx2);
///////////////// Compute for first sequence \\\\\\\\\\\\\\\\\\\\\\\ (TODO function) ///////////////// Compute for first sequence \\\\\\\\\\\\\\\\\\\\\\\ (TODO function)
// Read arrays // Read arrays
@ -480,9 +495,11 @@ int build_reference_db(const char* dms_name,
// return -1; // return -1;
// } // }
//fprintf(stderr, "\n1st sequence");
// If empty, add values // If empty, add values
if (taxid_array_length == 0) if (taxid_array_length == 0)
{ {
//fprintf(stderr, "\nEmpty, add value");
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0) if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0)
{ {
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database"); obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
@ -496,6 +513,8 @@ int build_reference_db(const char* dms_name,
} }
else else
{ {
//fprintf(stderr, "\nNot empty");
j = 0; j = 0;
modified = false; modified = false;
while (j < taxid_array_length) while (j < taxid_array_length)
@ -509,6 +528,9 @@ int build_reference_db(const char* dms_name,
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
modified = true; modified = true;
//fprintf(stderr, "\nSame LCA, replace %d and %f with %d and %f", lca_taxid_array_writable[j],
// score_array_writable[j], taxid_lca, score);
// Better score for the same LCA, replace this LCA/score pair // Better score for the same LCA, replace this LCA/score pair
lca_taxid_array_writable[j] = taxid_lca; lca_taxid_array_writable[j] = taxid_lca;
score_array_writable[j] = score; score_array_writable[j] = score;
@ -535,6 +557,8 @@ int build_reference_db(const char* dms_name,
{ {
if (score > score_array[j]) if (score > score_array[j])
{ {
//fprintf(stderr, "\nInsert new");
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
modified = true; modified = true;
@ -579,10 +603,15 @@ int build_reference_db(const char* dms_name,
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
modified = true; modified = true;
//fprintf(stderr, "\nAppend at the end");
// Append LCA // Append LCA
lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca; lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca;
score_array_writable[score_array_writable_length] = score; score_array_writable[score_array_writable_length] = score;
taxid_array_writable_length++;
score_array_writable_length++;
// Remove the previous (children) LCAs from the array if their score is equal or lower // Remove the previous (children) LCAs from the array if their score is equal or lower
while ((j>0) && (score_array_writable[j-1] <= score)) while ((j>0) && (score_array_writable[j-1] <= score))
{ {
@ -603,6 +632,13 @@ int build_reference_db(const char* dms_name,
// Write new arrays // Write new arrays
if (modified) if (modified)
{ {
// fprintf(stderr, "\n\nnew array:");
// for (k=0;k<taxid_array_writable_length;k++)
// {
// lca = obi_taxo_get_taxon_with_taxid(tax, lca_taxid_array_writable[k]);
// fprintf(stderr, "\nLCA=%d, %s, score=%f", lca_taxid_array_writable[k], lca->name, score_array_writable[k]);
// }
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, lca_taxid_array_writable, (uint8_t) (obi_sizeof(OBI_INT) * 8), taxid_array_writable_length) < 0) if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, lca_taxid_array_writable, (uint8_t) (obi_sizeof(OBI_INT) * 8), taxid_array_writable_length) < 0)
{ {
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database"); obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
@ -632,9 +668,13 @@ int build_reference_db(const char* dms_name,
// return -1; // return -1;
// } // }
//fprintf(stderr, "\n2nd sequence");
// If empty, add values // If empty, add values
if (taxid_array_length == 0) if (taxid_array_length == 0)
{ {
//fprintf(stderr, "\nEmpty, add value");
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx2, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0) if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx2, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0)
{ {
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database"); obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
@ -648,6 +688,8 @@ int build_reference_db(const char* dms_name,
} }
else else
{ {
//fprintf(stderr, "\nNot empty");
j = 0; j = 0;
modified = false; modified = false;
while (j < taxid_array_length) while (j < taxid_array_length)
@ -661,6 +703,9 @@ int build_reference_db(const char* dms_name,
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
modified = true; modified = true;
//fprintf(stderr, "\nSame LCA, replace %d and %f with %d and %f", lca_taxid_array_writable[j],
// score_array_writable[j], taxid_lca, score);
// Better score for the same LCA, replace this LCA/score pair // Better score for the same LCA, replace this LCA/score pair
lca_taxid_array_writable[j] = taxid_lca; lca_taxid_array_writable[j] = taxid_lca;
score_array_writable[j] = score; score_array_writable[j] = score;
@ -687,6 +732,8 @@ int build_reference_db(const char* dms_name,
{ {
if (score > score_array[j]) if (score > score_array[j])
{ {
//fprintf(stderr, "\nInsert new");
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
modified = true; modified = true;
@ -727,6 +774,8 @@ int build_reference_db(const char* dms_name,
if (j == taxid_array_length) // same or parent LCA not found, need to be appended at the end if (j == taxid_array_length) // same or parent LCA not found, need to be appended at the end
{ {
//fprintf(stderr, "\nAppend at the end");
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
modified = true; modified = true;
@ -735,6 +784,9 @@ int build_reference_db(const char* dms_name,
lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca; lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca;
score_array_writable[score_array_writable_length] = score; score_array_writable[score_array_writable_length] = score;
taxid_array_writable_length++;
score_array_writable_length++;
// Remove the previous (children) LCAs from the array if their score is equal or lower // Remove the previous (children) LCAs from the array if their score is equal or lower
while ((j>0) && (score_array_writable[j-1] <= score)) while ((j>0) && (score_array_writable[j-1] <= score))
{ {
@ -769,11 +821,17 @@ int build_reference_db(const char* dms_name,
} }
} }
} }
fprintf(stderr,"\rDone : 100 %% \n");
fprintf(stderr, "Writing results...\n");
count = (o_view->infos)->line_count;
// Fill empty LCA informations (because filling from potentially sparse alignment matrix) with the sequence taxid // Fill empty LCA informations (because filling from potentially sparse alignment matrix) with the sequence taxid
score=1.0; // technically getting LCA of identical sequences score=1.0; // technically getting LCA of identical sequences
for (i=0; i<(o_view->infos)->line_count; i++) for (i=0; i<count; i++)
{ {
if (i%1000 == 0)
fprintf(stderr,"\rDone : %f %% ", (i / (float) count)*100);
obi_get_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_array_length); obi_get_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_array_length);
if (taxid_array_length == 0) // no LCA set if (taxid_array_length == 0) // no LCA set
{ {
@ -799,6 +857,7 @@ int build_reference_db(const char* dms_name,
} }
} }
} }
fprintf(stderr,"\rDone : 100 %% \n");
// Add information about the threshold used to build the DB // Add information about the threshold used to build the DB
snprintf(threshold_str, 5, "%f", threshold); snprintf(threshold_str, 5, "%f", threshold);
@ -858,7 +917,6 @@ int build_reference_db(const char* dms_name,
free(matrix_view_name); free(matrix_view_name);
free(matrix_with_lca_view_name); free(matrix_with_lca_view_name);
fprintf(stderr,"\rDone : 100 %% \n");
return 0; return 0;
} }

View File

@ -413,7 +413,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
return NULL; return NULL;
} }
if (max_common_kmers > 0)
score = max_common_kmers + kmer_size - 1; // aka the number of nucleotides in the longest stretch of kmers perfectly matching score = max_common_kmers + kmer_size - 1; // aka the number of nucleotides in the longest stretch of kmers perfectly matching
else
score = 0;
abs_shift = abs(best_shift); abs_shift = abs(best_shift);
// Save result in Obi_ali structure // Save result in Obi_ali structure
@ -423,10 +426,15 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
ali->shift = abs_shift; ali->shift = abs_shift;
ali->consensus_seq = NULL; ali->consensus_seq = NULL;
ali->consensus_qual = NULL; ali->consensus_qual = NULL;
if (score == 0)
ali->direction[0] = '\0';
else
{
if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs)) if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
strcpy(ali->direction, "left"); strcpy(ali->direction, "left");
else else
strcpy(ali->direction, "right"); strcpy(ali->direction, "right");
}
// Build the consensus sequence if asked // Build the consensus sequence if asked
if (build_consensus) if (build_consensus)

44
src/obi_ecopcr.c Executable file → Normal file
View File

@ -105,7 +105,8 @@ static int create_output_columns(Obiview_p o_view, bool kingdom_mode);
* @param o_temp1_column A pointer on the output column for the temperature for the first primer. * @param o_temp1_column A pointer on the output column for the temperature for the first primer.
* @param o_temp2_column A pointer on the output column for the temperature for the second primer. * @param o_temp2_column A pointer on the output column for the temperature for the second primer.
* *
* @retval 0 if the operation was successfully completed. * @retval 0 if the sequence was skipped (taxid not found, warning printed).
* @retval 1 if the sequence was successfully printed to the output.
* @retval -1 if an error occurred. * @retval -1 if an error occurred.
* *
* @since July 2018 * @since July 2018
@ -366,6 +367,17 @@ static int print_seq(Obiview_p i_view, Obiview_p o_view,
// TODO add check for primer longer than MAX_PAT_LEN (32) // TODO add check for primer longer than MAX_PAT_LEN (32)
// Get sequence id
seq_id = obi_get_str_with_elt_idx_and_col_p_in_view(i_view, i_id_column, i_idx, 0);
// Get the taxon structure
main_taxon = obi_taxo_get_taxon_with_taxid(taxonomy, taxid);
if (main_taxon == NULL)
{
obidebug(1, "\nWarning: error reading the taxonomic information of a sequence. Seq id: %s, taxid: %d. Probably deprecated taxid. Skipping this sequence.", seq_id, taxid);
return 0;
}
ldelta = (pos1 <= keep_nucleotides)?pos1:keep_nucleotides; ldelta = (pos1 <= keep_nucleotides)?pos1:keep_nucleotides;
rdelta = ((pos2+keep_nucleotides)>=seq_len)?seq_len-pos2:keep_nucleotides; rdelta = ((pos2+keep_nucleotides)>=seq_len)?seq_len-pos2:keep_nucleotides;
@ -431,16 +443,7 @@ static int print_seq(Obiview_p i_view, Obiview_p o_view,
if (isnan(tm2)) if (isnan(tm2))
tm2 = OBIFloat_NA; tm2 = OBIFloat_NA;
// Get the taxon structure
main_taxon = obi_taxo_get_taxon_with_taxid(taxonomy, taxid);
if (main_taxon == NULL)
{
obidebug(1, "\nError reading the taxonomic information of a sequence");
return -1;
}
// Write sequence id // Write sequence id
seq_id = obi_get_str_with_elt_idx_and_col_p_in_view(i_view, i_id_column, i_idx, 0);
if (obi_set_str_with_elt_idx_and_col_p_in_view(o_view, o_id_column, o_idx, 0, seq_id) < 0) if (obi_set_str_with_elt_idx_and_col_p_in_view(o_view, o_id_column, o_idx, 0, seq_id) < 0)
{ {
obidebug(1, "\nError writing the sequence id"); obidebug(1, "\nError writing the sequence id");
@ -629,7 +632,7 @@ static int print_seq(Obiview_p i_view, Obiview_p o_view,
return -1; return -1;
} }
return 0; return 1;
} }
@ -698,6 +701,7 @@ int obi_ecopcr(const char* i_dms_name,
obiint_t taxid; obiint_t taxid;
char* sequence; char* sequence;
int printed;
SeqPtr apatseq=NULL; SeqPtr apatseq=NULL;
int32_t o1Hits; int32_t o1Hits;
@ -1057,14 +1061,14 @@ int obi_ecopcr(const char* i_dms_name,
length = 0; length = 0;
if (posj > posi) if (posj > posi)
length = posj - posi - o1->patlen - o2->patlen; length = posj - posi - o1->patlen - o2->patlen;
if (posj < posi) else if (circular > 0)
length = posj + apatseq->seqlen - posi - o1->patlen - o2->patlen; length = posj + apatseq->seqlen - posi - o1->patlen - o2->patlen;
if ((length>0) && // For when primers touch or overlap if ((length>0) && // For when primers touch or overlap
(!min_len || (length >= min_len)) && (!min_len || (length >= min_len)) &&
(!max_len || (length <= max_len))) (!max_len || (length <= max_len)))
{ {
// Print the found amplicon // Print the found amplicon
if (print_seq(i_view, o_view, printed = print_seq(i_view, o_view,
i_idx, o_idx, i_idx, o_idx,
taxonomy, taxonomy,
sequence, sequence,
@ -1090,11 +1094,13 @@ int obi_ecopcr(const char* i_dms_name,
o_strand_column, o_strand_column,
o_primer1_column, o_primer2_column, o_primer1_column, o_primer2_column,
o_error1_column, o_error2_column, o_error1_column, o_error2_column,
o_temp1_column, o_temp2_column) < 0) o_temp1_column, o_temp2_column);
if (printed < 0)
{ {
obidebug(1, "\nError writing the ecopcr result"); obidebug(1, "\nError writing the ecopcr result");
return -1; return -1;
} }
else if (printed > 0)
o_idx++; o_idx++;
} }
} }
@ -1145,14 +1151,14 @@ int obi_ecopcr(const char* i_dms_name,
length = 0; length = 0;
if (posj > posi) if (posj > posi)
length = posj - posi + 1 - o2->patlen - o1->patlen; /* - o1->patlen : deleted by <EC> (prior to the OBITools3) */ length = posj - posi + 1 - o2->patlen - o1->patlen; /* - o1->patlen : deleted by <EC> (prior to the OBITools3) */
if (posj < posi) else if (circular > 0)
length = posj + apatseq->seqlen - posi - o1->patlen - o2->patlen; length = posj + apatseq->seqlen - posi - o1->patlen - o2->patlen;
if ((length>0) && // For when primers touch or overlap if ((length>0) && // For when primers touch or overlap
(!min_len || (length >= min_len)) && (!min_len || (length >= min_len)) &&
(!max_len || (length <= max_len))) (!max_len || (length <= max_len)))
{ {
// Print the found amplicon // Print the found amplicon
if (print_seq(i_view, o_view, printed = print_seq(i_view, o_view,
i_idx, o_idx, i_idx, o_idx,
taxonomy, taxonomy,
sequence, sequence,
@ -1178,11 +1184,13 @@ int obi_ecopcr(const char* i_dms_name,
o_strand_column, o_strand_column,
o_primer1_column, o_primer2_column, o_primer1_column, o_primer2_column,
o_error1_column, o_error2_column, o_error1_column, o_error2_column,
o_temp1_column, o_temp2_column) < 0) o_temp1_column, o_temp2_column);
if (printed < 0)
{ {
obidebug(1, "\nError writing the ecopcr result"); obidebug(1, "\nError writing the ecopcr result");
return -1; return -1;
} }
else if (printed > 0)
o_idx++; o_idx++;
} }
} }
@ -1224,7 +1232,7 @@ int obi_ecopcr(const char* i_dms_name,
return -1; return -1;
} }
fprintf(stderr,"\rDone : 100 %% "); fprintf(stderr,"\rDone : 100 %% \n");
return 0; return 0;
return 0; return 0;

View File

@ -81,8 +81,8 @@
* @param o_dms_name The path to the output DMS. * @param o_dms_name The path to the output DMS.
* @param o_view_name The name of the output view. * @param o_view_name The name of the output view.
* @param o_view_comments The comments to associate with the output view. * @param o_view_comments The comments to associate with the output view.
* @param primer1 The first primer. * @param primer1 The first primer, length must be less than or equal to 32 (because of apat lib limitation).
* @param primer2 The second primer. * @param primer2 The second primer, length must be less than or equal to 32 (because of apat lib limitation).
* @param error_max The maximum number of errors allowed per primer for amplification. * @param error_max The maximum number of errors allowed per primer for amplification.
* @param min_len The minimum length of an amplicon. * @param min_len The minimum length of an amplicon.
* @param max_len The maximum length of an amplicon. * @param max_len The maximum length of an amplicon.

View File

@ -71,9 +71,12 @@ static int create_output_columns(Obiview_p o_view);
* @param name The assigned scientific name. * @param name The assigned scientific name.
* @param assigned_status_column A pointer on the column where the assigned status should be written. * @param assigned_status_column A pointer on the column where the assigned status should be written.
* @param assigned The assigned status (whether the sequence was assigned to a taxon or not). * @param assigned The assigned status (whether the sequence was assigned to a taxon or not).
* @param best_match_column A pointer on the column where the list of ids of the best matches should be written. * @param best_match_ids_column A pointer on the column where the list of ids of the best matches should be written.
* @param best_match_ids The list of ids of the best matches as an array of the concatenated ids separated by '\0'. * @param best_match_ids The list of ids of the best matches as an array of the concatenated ids separated by '\0'.
* @param best_match_ids_length The total length of the array of ids of best matches. * @param best_match_ids_length The total length of the array of ids of best matches.
* @param best_match_taxids_column A pointer on the column where the list of taxids of the best matches should be written.
* @param best_match_taxids The list of taxids of the best matches as an array of the taxids.
* @param best_match_taxids_length The length of the array of taxids of best matches.
* @param score_column A pointer on the column where the score should be written. * @param score_column A pointer on the column where the score should be written.
* @param score The similarity score of the sequence with its best match(es). * @param score The similarity score of the sequence with its best match(es).
* *
@ -87,7 +90,8 @@ int print_assignment_result(Obiview_p output_view, index_t line,
OBIDMS_column_p assigned_taxid_column, int32_t taxid, OBIDMS_column_p assigned_taxid_column, int32_t taxid,
OBIDMS_column_p assigned_name_column, const char* name, OBIDMS_column_p assigned_name_column, const char* name,
OBIDMS_column_p assigned_status_column, bool assigned, OBIDMS_column_p assigned_status_column, bool assigned,
OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length, OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length,
OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length,
OBIDMS_column_p score_column, double score); OBIDMS_column_p score_column, double score);
@ -100,37 +104,44 @@ int print_assignment_result(Obiview_p output_view, index_t line,
static int create_output_columns(Obiview_p o_view) static int create_output_columns(Obiview_p o_view)
{ {
// Score column // Score column
if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_SCORE_COLUMN_NAME, true) < 0) if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
{ {
obidebug(1, "\nError creating the column for the score in ecotag"); obidebug(1, "\nError creating the column for the score in ecotag");
return -1; return -1;
} }
// Assigned taxid column // Assigned taxid column
if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_TAXID_COLUMN_NAME, true) < 0) if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
{ {
obidebug(1, "\nError creating the column for the assigned taxid in ecotag"); obidebug(1, "\nError creating the column for the assigned taxid in ecotag");
return -1; return -1;
} }
// Assigned scientific name column // Assigned scientific name column
if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_NAME_COLUMN_NAME, true) < 0) if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
{ {
obidebug(1, "\nError creating the column for the assigned scientific name in ecotag"); obidebug(1, "\nError creating the column for the assigned scientific name in ecotag");
return -1; return -1;
} }
// Assignement status column // Assignement status column
if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_STATUS_COLUMN_NAME, true) < 0) if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
{ {
obidebug(1, "\nError creating the column for the assignment status in ecotag"); obidebug(1, "\nError creating the column for the assignment status in ecotag");
return -1; return -1;
} }
// Column for array of best match ids // Column for array of best match ids
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, true) < 0) if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
{ {
obidebug(1, "\nError creating the column for the array of ids of the best match in ecotag"); obidebug(1, "\nError creating the column for the array of ids of best matches in ecotag");
return -1;
}
// Column for array of best match taxids
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
{
obidebug(1, "\nError creating the column for the array of taxids of best matches in ecotag");
return -1; return -1;
} }
@ -142,7 +153,8 @@ int print_assignment_result(Obiview_p output_view, index_t line,
OBIDMS_column_p assigned_taxid_column, int32_t taxid, OBIDMS_column_p assigned_taxid_column, int32_t taxid,
OBIDMS_column_p assigned_name_column, const char* name, OBIDMS_column_p assigned_name_column, const char* name,
OBIDMS_column_p assigned_status_column, bool assigned, OBIDMS_column_p assigned_status_column, bool assigned,
OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length, OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length,
OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length,
OBIDMS_column_p score_column, double score) OBIDMS_column_p score_column, double score)
{ {
// Write the assigned taxid // Write the assigned taxid
@ -167,9 +179,16 @@ int print_assignment_result(Obiview_p output_view, index_t line,
} }
// Write the best match ids // Write the best match ids
if (obi_set_array_with_col_p_in_view(output_view, best_match_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0) if (obi_set_array_with_col_p_in_view(output_view, best_match_ids_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0)
{ {
obidebug(1, "\nError writing a assignment status in a column when writing ecotag results"); obidebug(1, "\nError writing the array of best match ids in a column when writing ecotag results");
return -1;
}
// Write the best match taxids
if (obi_set_array_with_col_p_in_view(output_view, best_match_taxids_column, line, best_match_taxids, (uint8_t)(sizeof(OBI_INT)*8), best_match_taxids_length) < 0)
{
obidebug(1, "\nError writing the array of best match taxids in a column when writing ecotag results");
return -1; return -1;
} }
@ -235,6 +254,8 @@ int obi_ecotag(const char* dms_name,
char* best_match_ids; char* best_match_ids;
char* best_match_ids_to_store; char* best_match_ids_to_store;
int32_t best_match_ids_length; int32_t best_match_ids_length;
int32_t* best_match_taxids;
int32_t* best_match_taxids_to_store;
int best_match_count; int best_match_count;
int buffer_size; int buffer_size;
int best_match_ids_buffer_size; int best_match_ids_buffer_size;
@ -263,7 +284,8 @@ int obi_ecotag(const char* dms_name,
OBIDMS_column_p assigned_taxid_column = NULL; OBIDMS_column_p assigned_taxid_column = NULL;
OBIDMS_column_p assigned_name_column = NULL; OBIDMS_column_p assigned_name_column = NULL;
OBIDMS_column_p assigned_status_column = NULL; OBIDMS_column_p assigned_status_column = NULL;
OBIDMS_column_p best_match_column = NULL; OBIDMS_column_p best_match_ids_column = NULL;
OBIDMS_column_p best_match_taxids_column = NULL;
OBIDMS_column_p lca_taxid_a_column = NULL; OBIDMS_column_p lca_taxid_a_column = NULL;
OBIDMS_column_p score_a_column = NULL; OBIDMS_column_p score_a_column = NULL;
OBIDMS_column_p ref_taxid_column = NULL; OBIDMS_column_p ref_taxid_column = NULL;
@ -396,7 +418,8 @@ int obi_ecotag(const char* dms_name,
assigned_taxid_column = obi_view_get_column(output_view, ECOTAG_TAXID_COLUMN_NAME); assigned_taxid_column = obi_view_get_column(output_view, ECOTAG_TAXID_COLUMN_NAME);
assigned_name_column = obi_view_get_column(output_view, ECOTAG_NAME_COLUMN_NAME); assigned_name_column = obi_view_get_column(output_view, ECOTAG_NAME_COLUMN_NAME);
assigned_status_column = obi_view_get_column(output_view, ECOTAG_STATUS_COLUMN_NAME); assigned_status_column = obi_view_get_column(output_view, ECOTAG_STATUS_COLUMN_NAME);
best_match_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME); best_match_ids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME);
best_match_taxids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME);
score_column = obi_view_get_column(output_view, ECOTAG_SCORE_COLUMN_NAME); score_column = obi_view_get_column(output_view, ECOTAG_SCORE_COLUMN_NAME);
// Open the used reference columns // Open the used reference columns
@ -453,9 +476,17 @@ int obi_ecotag(const char* dms_name,
return -1; return -1;
} }
best_match_taxids = (int32_t*) malloc(buffer_size* sizeof(int32_t));
if (best_match_taxids == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for the best match taxid array in ecotag");
return -1;
}
for (i=0; i < query_count; i++) for (i=0; i < query_count; i++)
{ {
if (i%100 == 0) if (i%1000 == 0)
fprintf(stderr,"\rDone : %f %% ", (i / (float) query_count)*100); fprintf(stderr,"\rDone : %f %% ", (i / (float) query_count)*100);
best_match_count = 0; best_match_count = 0;
@ -514,7 +545,7 @@ int obi_ecotag(const char* dms_name,
// Store in best match array // Store in best match array
// Grow match array if needed // Grow match and taxid array if needed
if (best_match_count == buffer_size) if (best_match_count == buffer_size)
{ {
buffer_size = buffer_size*2; buffer_size = buffer_size*2;
@ -525,6 +556,13 @@ int obi_ecotag(const char* dms_name,
obidebug(1, "\nError reallocating match array when assigning"); obidebug(1, "\nError reallocating match array when assigning");
return -1; return -1;
} }
best_match_taxids = (int32_t*) realloc(best_match_taxids, buffer_size*sizeof(int32_t));
if (best_match_taxids == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating match taxids array when assigning");
return -1;
}
} }
id = obi_get_str_with_elt_idx_and_col_p_in_view(ref_view, ref_id_column, j, 0); id = obi_get_str_with_elt_idx_and_col_p_in_view(ref_view, ref_id_column, j, 0);
@ -545,6 +583,7 @@ int obi_ecotag(const char* dms_name,
// Save match // Save match
best_match_array[best_match_count] = j; best_match_array[best_match_count] = j;
best_match_taxids[best_match_count] = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, j, 0);
best_match_count++; best_match_count++;
strcpy(best_match_ids+best_match_ids_length, id); strcpy(best_match_ids+best_match_ids_length, id);
best_match_ids_length = best_match_ids_length + id_len + 1; best_match_ids_length = best_match_ids_length + id_len + 1;
@ -562,7 +601,7 @@ int obi_ecotag(const char* dms_name,
score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length); score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);
k = 0; k = 0;
while ((k < lca_array_length) && (score_array[k] >= ecotag_threshold)) while ((k < lca_array_length) && (score_array[k] >= best_score))
k++; k++;
if (k>0) if (k>0)
@ -570,12 +609,12 @@ int obi_ecotag(const char* dms_name,
lca_array = obi_get_array_with_col_p_in_view(ref_view, lca_taxid_a_column, best_match_idx, &lca_array_length); lca_array = obi_get_array_with_col_p_in_view(ref_view, lca_taxid_a_column, best_match_idx, &lca_array_length);
if (j>0) if (j>0)
{ {
lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid); // lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid);
if (lca == NULL) // if (lca == NULL)
{ // {
obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment"); // obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment");
return -1; // return -1;
} // }
lca_in_array = obi_taxo_get_taxon_with_taxid(taxonomy, lca_array[k-1]); lca_in_array = obi_taxo_get_taxon_with_taxid(taxonomy, lca_array[k-1]);
if (lca_in_array == NULL) if (lca_in_array == NULL)
{ {
@ -629,6 +668,7 @@ int obi_ecotag(const char* dms_name,
else else
lca_name = lca->name; lca_name = lca->name;
best_match_ids_to_store = best_match_ids; best_match_ids_to_store = best_match_ids;
best_match_taxids_to_store = best_match_taxids;
} }
else else
{ {
@ -636,6 +676,7 @@ int obi_ecotag(const char* dms_name,
lca_name = OBIStr_NA; lca_name = OBIStr_NA;
lca_taxid = OBIInt_NA; lca_taxid = OBIInt_NA;
best_match_ids_to_store = OBITuple_NA; best_match_ids_to_store = OBITuple_NA;
best_match_taxids_to_store = OBITuple_NA;
score = OBIFloat_NA; score = OBIFloat_NA;
} }
@ -644,7 +685,8 @@ int obi_ecotag(const char* dms_name,
assigned_taxid_column, lca_taxid, assigned_taxid_column, lca_taxid,
assigned_name_column, lca_name, assigned_name_column, lca_name,
assigned_status_column, assigned, assigned_status_column, assigned,
best_match_column, best_match_ids_to_store, best_match_ids_length, best_match_ids_column, best_match_ids_to_store, best_match_ids_length,
best_match_taxids_column, best_match_taxids_to_store, best_match_count,
score_column, best_score score_column, best_score
) < 0) ) < 0)
return -1; return -1;
@ -652,6 +694,7 @@ int obi_ecotag(const char* dms_name,
free(best_match_array); free(best_match_array);
free(best_match_ids); free(best_match_ids);
free(best_match_taxids);
obi_close_taxonomy(taxonomy); obi_close_taxonomy(taxonomy);
obi_save_and_close_view(query_view); obi_save_and_close_view(query_view);

View File

@ -23,7 +23,8 @@
#define ECOTAG_TAXID_COLUMN_NAME "TAXID" #define ECOTAG_TAXID_COLUMN_NAME "TAXID"
#define ECOTAG_NAME_COLUMN_NAME "SCIENTIFIC_NAME" #define ECOTAG_NAME_COLUMN_NAME "SCIENTIFIC_NAME"
#define ECOTAG_STATUS_COLUMN_NAME "ID_STATUS" #define ECOTAG_STATUS_COLUMN_NAME "ID_STATUS"
#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH" #define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH_IDS"
#define ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME "BEST_MATCH_TAXIDS"
#define ECOTAG_SCORE_COLUMN_NAME "BEST_IDENTITY" #define ECOTAG_SCORE_COLUMN_NAME "BEST_IDENTITY"

View File

@ -648,7 +648,7 @@ int truncate_avl_data_to_size_used(OBIDMS_avl_data_p avl_data) // TODO is it nec
new_data_size = ((index_t) multiple) * getpagesize(); new_data_size = ((index_t) multiple) * getpagesize();
// Check that it is actually greater than the current size of the file, otherwise no need to truncate // Check that it is actually greater than the current size of the file, otherwise no need to truncate
if ((avl_data->header)->data_size_max == new_data_size) if ((avl_data->header)->data_size_max >= new_data_size)
return 0; return 0;
// Get the file descriptor // Get the file descriptor
@ -667,7 +667,7 @@ int truncate_avl_data_to_size_used(OBIDMS_avl_data_p avl_data) // TODO is it nec
if (ftruncate(file_descriptor, file_size) < 0) if (ftruncate(file_descriptor, file_size) < 0)
{ {
obi_set_errno(OBI_AVL_ERROR); obi_set_errno(OBI_AVL_ERROR);
obidebug(1, "\nError truncating an AVL data file"); obidebug(1, "\nError truncating an AVL data file, old data size = %lld, new data size = %lld", (avl_data->header)->data_size_max, new_data_size);
return -1; return -1;
} }

View File

@ -1496,7 +1496,7 @@ obiversion_t obi_import_column(const char* dms_path_1, const char* dms_path_2, c
memcpy(column_2->data, column_1->data, header_1->data_size); memcpy(column_2->data, column_1->data, header_1->data_size);
// Copy the AVL files if there are some (overwriting the automatically created files) // Copy the AVL files if there are some (overwriting the automatically created files)
if ((header_1->returned_data_type == OBI_STR) || (header_1->returned_data_type == OBI_SEQ) || (header_1->returned_data_type == OBI_QUAL)) if ((header_1->tuples) || ((header_1->returned_data_type == OBI_STR) || (header_1->returned_data_type == OBI_SEQ) || (header_1->returned_data_type == OBI_QUAL)))
{ {
avl_name_1 = (char*) malloc((strlen(header_1->indexer_name) + 1) * sizeof(char)); avl_name_1 = (char*) malloc((strlen(header_1->indexer_name) + 1) * sizeof(char));
if (avl_name_1 == NULL) if (avl_name_1 == NULL)

View File

@ -2376,9 +2376,10 @@ int read_merged_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t* delnode
// and the deleted taxids with no current reference. An element of the list is composed of the taxid, and the index // and the deleted taxids with no current reference. An element of the list is composed of the taxid, and the index
// of the taxon in the taxa structure, or -1 for deleted taxids. // of the taxon in the taxa structure, or -1 for deleted taxids.
// Creating the merged list requires to merge the 3 ordered lists into one. // Creating the merged list requires to merge the 3 ordered lists into one.
while (((nT < (tax->taxa)->count) && ((tax->taxa)->taxon[nT].taxid < old_taxid)) || ((nD >= 0) && (delnodes[nD] < old_taxid))) while (((nT < (tax->taxa)->count) && ((tax->taxa)->taxon[nT].taxid < old_taxid)) ||
((nD >= 0) && (delnodes[nD] < old_taxid)))
{ {
if ((tax->taxa)->taxon[nT].taxid < delnodes[nD]) if ((nT < (tax->taxa)->count) && (tax->taxa)->taxon[nT].taxid < delnodes[nD])
{ // Add element from taxa list { // Add element from taxa list
// Enlarge structure if needed // Enlarge structure if needed
if (n == buffer_size) if (n == buffer_size)
@ -2401,7 +2402,7 @@ int read_merged_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t* delnode
nT++; nT++;
n++; n++;
} }
else if (delnodes[nD] < (tax->taxa)->taxon[nT].taxid) else
{ // Add element from deleted taxids list { // Add element from deleted taxids list
// Enlarge structure if needed // Enlarge structure if needed
if (n == buffer_size) if (n == buffer_size)
@ -3036,12 +3037,12 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
strcpy(tax->tax_name, taxonomy_name); strcpy(tax->tax_name, taxonomy_name);
buffer_size = 2048;
taxonomy_path = get_taxonomy_path(dms, taxonomy_name); taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
if (taxonomy_path == NULL) if (taxonomy_path == NULL)
return NULL; return NULL;
buffer_size = strlen(taxonomy_path) + strlen(taxonomy_name) + 6;
// Read ranks // Read ranks
ranks_file_name = (char*) malloc(buffer_size*sizeof(char)); ranks_file_name = (char*) malloc(buffer_size*sizeof(char));
if (ranks_file_name == NULL) if (ranks_file_name == NULL)

View File

@ -1693,8 +1693,8 @@ int obi_close_column(OBIDMS_column_p column)
if (obi_dms_unlist_column(column->dms, column) < 0) if (obi_dms_unlist_column(column->dms, column) < 0)
ret_val = -1; ret_val = -1;
// If the data type is OBI_STR, OBI_SEQ or OBI_QUAL, the associated indexer is closed // If it's a tuple column or the data type is OBI_STR, OBI_SEQ or OBI_QUAL, the associated indexer is closed
if (((column->header)->returned_data_type == OBI_STR) || ((column->header)->returned_data_type == OBI_SEQ) || ((column->header)->returned_data_type == OBI_QUAL)) if (((column->header)->tuples) || (((column->header)->returned_data_type == OBI_STR) || ((column->header)->returned_data_type == OBI_SEQ) || ((column->header)->returned_data_type == OBI_QUAL)))
if (obi_close_indexer(column->indexer) < 0) if (obi_close_indexer(column->indexer) < 0)
ret_val = -1; ret_val = -1;
@ -1973,7 +1973,11 @@ int obi_enlarge_column(OBIDMS_column_p column)
// Calculate the new file size // Calculate the new file size
old_line_count = (column->header)->line_count; old_line_count = (column->header)->line_count;
new_line_count = old_line_count * COLUMN_GROWTH_FACTOR; new_line_count = ceil((double) old_line_count * (double) COLUMN_GROWTH_FACTOR);
if (new_line_count > old_line_count+100000)
new_line_count = old_line_count+100000;
else if (new_line_count < old_line_count+1000)
new_line_count = old_line_count+1000;
if (new_line_count > MAXIMUM_LINE_COUNT) if (new_line_count > MAXIMUM_LINE_COUNT)
{ {
@ -2381,6 +2385,54 @@ char* obi_get_elements_names(OBIDMS_column_p column)
} }
char* obi_get_formatted_elements_names(OBIDMS_column_p column)
{
char* elements_names;
int i, j;
int elt_idx;
int len;
elements_names = (char*) malloc(((column->header)->elements_names_length + (column->header)->nb_elements_per_line) * sizeof(char));
if (elements_names == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for elements names");
return NULL;
}
j = 0;
for (i=0; i < (column->header)->nb_elements_per_line; i++)
{
elt_idx = ((column->header)->elements_names_idx)[i];
len = strlen(((column->header)->elements_names)+elt_idx);
memcpy(elements_names+j, ((column->header)->elements_names)+elt_idx, len*sizeof(char));
j = j + len;
elements_names[j] = ';';
j++;
elements_names[j] = ' ';
j++;
}
elements_names[j - 1] = '\0';
return elements_names;
}
char* obi_column_formatted_infos(OBIDMS_column_p column)
{
char* column_infos;
char* elt_names;
column_infos = malloc(1024 * sizeof(char));
elt_names = obi_get_formatted_elements_names(column);
free(elt_names);
return column_infos;
}
int obi_column_prepare_to_set_value(OBIDMS_column_p column, index_t line_nb, index_t elt_idx) int obi_column_prepare_to_set_value(OBIDMS_column_p column, index_t line_nb, index_t elt_idx)
{ {

View File

@ -505,6 +505,14 @@ index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const cha
char* obi_get_elements_names(OBIDMS_column_p column); char* obi_get_elements_names(OBIDMS_column_p column);
// TODO
//char* obi_get_formatted_elements_names(OBIDMS_column_p column);
// TODO
//char* obi_column_formatted_infos(OBIDMS_column_p column);
/** /**
* @brief Prepares a column to set a value. * @brief Prepares a column to set a value.
* *

View File

@ -1037,8 +1037,9 @@ static int finish_view(Obiview_p view)
return -1; return -1;
} }
// Add count column if it's a NUC_SEQ_VIEW with no count column // TODO discuss // Add count column if it's a NUC_SEQ_VIEW with no count column (and there's no MERGED_sample column) // TODO discuss
if ((!strcmp((view->infos)->view_type, VIEW_TYPE_NUC_SEQS)) && (!obi_view_column_exists(view, COUNT_COLUMN))) if ((!strcmp((view->infos)->view_type, VIEW_TYPE_NUC_SEQS)) && (!obi_view_column_exists(view, COUNT_COLUMN))
&& (!obi_view_column_exists(view, "MERGED_sample"))) // TODO should eventually compute from merged samples?
{ {
if (obi_create_auto_count_column(view) < 0) if (obi_create_auto_count_column(view) < 0)
{ {
@ -1407,7 +1408,7 @@ static char* view_check_qual_match_seqs(Obiview_p view)
// Test that the lengths of the quality and the sequence are equal // Test that the lengths of the quality and the sequence are equal
if ((size_t)qual_len != strlen(seq)) if ((size_t)qual_len != strlen(seq))
{ {
obidebug(1, "\nError checking the predicate for view %s: The sequences and sequence quality arrays match.", (view->infos)->name); obidebug(1, "\nError checking the predicate for view %s: The sequences and sequence quality arrays match (index %lld, seq=%s, quality length = %d).", (view->infos)->name, j, seq, qual_len);
return NULL; return NULL;
} }
} }

View File

@ -686,6 +686,9 @@ int calculateSizeToAllocate(int maxLen, int LCSmin)
size *= 3; size *= 3;
size += 16; size += 16;
size += 10; // band-aid for memory bug I don't understand (triggered on specific db on ubuntu)
// bug might have to do with the way different systems behave when aligning the address in obi_get_memory_aligned_on_16
return(size*sizeof(int16_t)); return(size*sizeof(int16_t));
} }