Compare commits
27 Commits
v3.0.0-bet
...
v3.0.0b20
Author | SHA1 | Date | |
---|---|---|---|
a7dcf16c06 | |||
f13f8f6165 | |||
b5a29ac413 | |||
efd2b9d338 | |||
ca6e3e7aad | |||
76ed8e18e5 | |||
1d17f28aec | |||
fa834e4b8b | |||
a72fea3cc9 | |||
e9a37d8a6e | |||
ef074f8455 | |||
aec5e69f2c | |||
170ef3f1ba | |||
f999946582 | |||
773b36ec37 | |||
69cb434a6c | |||
55d4f98d60 | |||
0bec2631e8 | |||
e6b6c6fa84 | |||
974528b2e6 | |||
1b346b54f9 | |||
058f2ad8b3 | |||
60bfd3ae8d | |||
67bdee105a | |||
0f745e0113 | |||
da8de52ba4 | |||
4d36538c6e |
@ -6,7 +6,7 @@ recursive-include doc/sphinx/source *.txt *.rst *.py
|
|||||||
recursive-include doc/sphinx/sphinxext *.py
|
recursive-include doc/sphinx/sphinxext *.py
|
||||||
include doc/sphinx/Makefile
|
include doc/sphinx/Makefile
|
||||||
include doc/sphinx/Doxyfile
|
include doc/sphinx/Doxyfile
|
||||||
include README.txt
|
include README.md
|
||||||
include requirements.txt
|
include requirements.txt
|
||||||
include scripts/obi
|
include scripts/obi
|
||||||
|
|
||||||
|
@ -13,7 +13,8 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
|
|||||||
ID_COLUMN, \
|
ID_COLUMN, \
|
||||||
DEFINITION_COLUMN, \
|
DEFINITION_COLUMN, \
|
||||||
QUALITY_COLUMN, \
|
QUALITY_COLUMN, \
|
||||||
COUNT_COLUMN
|
COUNT_COLUMN, \
|
||||||
|
TAXID_COLUMN
|
||||||
|
|
||||||
import time
|
import time
|
||||||
import math
|
import math
|
||||||
@ -175,8 +176,8 @@ def sequenceTaggerGenerator(config, taxo=None):
|
|||||||
counter[0]+=1
|
counter[0]+=1
|
||||||
|
|
||||||
for rank in annoteRank:
|
for rank in annoteRank:
|
||||||
if 'taxid' in seq:
|
if TAXID_COLUMN in seq:
|
||||||
taxid = seq['taxid']
|
taxid = seq[TAXID_COLUMN]
|
||||||
if taxid is not None:
|
if taxid is not None:
|
||||||
rtaxid = taxo.get_taxon_at_rank(taxid, rank)
|
rtaxid = taxo.get_taxon_at_rank(taxid, rank)
|
||||||
if rtaxid is not None:
|
if rtaxid is not None:
|
||||||
@ -190,58 +191,50 @@ def sequenceTaggerGenerator(config, taxo=None):
|
|||||||
seq['seq_rank']=counter[0]
|
seq['seq_rank']=counter[0]
|
||||||
|
|
||||||
for i,v in toSet:
|
for i,v in toSet:
|
||||||
#try:
|
try:
|
||||||
if taxo is not None:
|
if taxo is not None:
|
||||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||||
else:
|
else:
|
||||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||||
val = eval(v, environ, seq)
|
val = eval(v, environ, seq)
|
||||||
#except Exception,e: # TODO discuss usefulness of this
|
except Exception: # set string if not a valid expression
|
||||||
# if options.onlyValid:
|
val = v
|
||||||
# raise e
|
|
||||||
# val = v
|
|
||||||
seq[i]=val
|
seq[i]=val
|
||||||
|
|
||||||
if length:
|
if length:
|
||||||
seq['seq_length']=len(seq)
|
seq['seq_length']=len(seq)
|
||||||
|
|
||||||
if newId is not None:
|
if newId is not None:
|
||||||
# try:
|
try:
|
||||||
if taxo is not None:
|
if taxo is not None:
|
||||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||||
else:
|
else:
|
||||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||||
val = eval(newId, environ, seq)
|
val = eval(newId, environ, seq)
|
||||||
# except Exception,e:
|
except Exception: # set string if not a valid expression
|
||||||
# if options.onlyValid:
|
val = newId
|
||||||
# raise e
|
|
||||||
# val = newId
|
|
||||||
seq.id=val
|
seq.id=val
|
||||||
|
|
||||||
if newDef is not None:
|
if newDef is not None:
|
||||||
# try:
|
try:
|
||||||
if taxo is not None:
|
if taxo is not None:
|
||||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||||
else:
|
else:
|
||||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||||
val = eval(newDef, environ, seq)
|
val = eval(newDef, environ, seq)
|
||||||
# except Exception,e:
|
except Exception: # set string if not a valid expression
|
||||||
# if options.onlyValid:
|
val = newDef
|
||||||
# raise e
|
|
||||||
# val = newDef
|
|
||||||
seq.definition=val
|
seq.definition=val
|
||||||
#
|
|
||||||
if newSeq is not None:
|
if newSeq is not None:
|
||||||
# try:
|
try:
|
||||||
if taxo is not None:
|
if taxo is not None:
|
||||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||||
else:
|
else:
|
||||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||||
val = eval(newSeq, environ, seq)
|
val = eval(newSeq, environ, seq)
|
||||||
# except Exception,e:
|
except Exception: # set string if not a valid expression
|
||||||
# if options.onlyValid:
|
val = newSeq
|
||||||
# raise e
|
|
||||||
# val = newSeq
|
|
||||||
seq.seq=val
|
seq.seq=val
|
||||||
if 'seq_length' in seq:
|
if 'seq_length' in seq:
|
||||||
seq['seq_length']=len(seq)
|
seq['seq_length']=len(seq)
|
||||||
@ -251,15 +244,14 @@ def sequenceTaggerGenerator(config, taxo=None):
|
|||||||
seq.view.delete_column(QUALITY_COLUMN)
|
seq.view.delete_column(QUALITY_COLUMN)
|
||||||
|
|
||||||
if run is not None:
|
if run is not None:
|
||||||
# try:
|
try:
|
||||||
if taxo is not None:
|
if taxo is not None:
|
||||||
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math}
|
||||||
else:
|
else:
|
||||||
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
environ = {'sequence':seq, 'counter':counter[0], 'math':math}
|
||||||
eval(run, environ, seq)
|
eval(run, environ, seq)
|
||||||
# except Exception,e:
|
except Exception,e:
|
||||||
# if options.onlyValid:
|
raise e
|
||||||
# raise e
|
|
||||||
|
|
||||||
return sequenceTagger
|
return sequenceTagger
|
||||||
|
|
||||||
|
@ -107,8 +107,8 @@ def run(config):
|
|||||||
comments = View.print_config(config, "ecotag", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
|
comments = View.print_config(config, "ecotag", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
|
||||||
|
|
||||||
if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
|
if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
|
||||||
tobytes(ref_dms_name), tobytes(ref_view_name), \
|
ref_dms.name_with_full_path, tobytes(ref_view_name), \
|
||||||
tobytes(taxo_dms_name), tobytes(taxonomy_name), \
|
taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
|
||||||
tobytes(o_view_name), comments,
|
tobytes(o_view_name), comments,
|
||||||
config['ecotag']['threshold']) < 0:
|
config['ecotag']['threshold']) < 0:
|
||||||
raise Exception("Error running ecotag")
|
raise Exception("Error running ecotag")
|
||||||
|
@ -59,13 +59,23 @@ def run(config):
|
|||||||
# Check that the input view has the type NUC_SEQS if needed # TODO discuss, maybe bool property
|
# Check that the input view has the type NUC_SEQS if needed # TODO discuss, maybe bool property
|
||||||
if (output[2] == Nuc_Seq) and (iview.type != b"NUC_SEQS_VIEW") : # Nuc_Seq_Stored? TODO
|
if (output[2] == Nuc_Seq) and (iview.type != b"NUC_SEQS_VIEW") : # Nuc_Seq_Stored? TODO
|
||||||
raise Exception("Error: the view to export in fasta or fastq format is not a NUC_SEQS view")
|
raise Exception("Error: the view to export in fasta or fastq format is not a NUC_SEQS view")
|
||||||
|
|
||||||
|
if config['obi']['only'] is not None:
|
||||||
|
withoutskip = min(input[4], config['obi']['only'])
|
||||||
|
else:
|
||||||
|
withoutskip = input[4]
|
||||||
|
|
||||||
|
if config['obi']['skip'] is not None:
|
||||||
|
skip = min(input[4], config['obi']['skip'])
|
||||||
|
else:
|
||||||
|
skip = 0
|
||||||
|
|
||||||
# Initialize the progress bar
|
# Initialize the progress bar
|
||||||
if config['obi']['noprogressbar']:
|
if config['obi']['noprogressbar']:
|
||||||
pb = None
|
pb = None
|
||||||
else:
|
else:
|
||||||
pb = ProgressBar(len(iview), config, seconde=5)
|
pb = ProgressBar(withoutskip - skip, config, seconde=5)
|
||||||
|
|
||||||
i=0
|
i=0
|
||||||
for seq in iview :
|
for seq in iview :
|
||||||
PyErr_CheckSignals()
|
PyErr_CheckSignals()
|
||||||
|
@ -25,7 +25,8 @@ from obitools3.dms.capi.obiview cimport VIEW_TYPE_NUC_SEQS, \
|
|||||||
DEFINITION_COLUMN, \
|
DEFINITION_COLUMN, \
|
||||||
QUALITY_COLUMN, \
|
QUALITY_COLUMN, \
|
||||||
COUNT_COLUMN, \
|
COUNT_COLUMN, \
|
||||||
TAXID_COLUMN
|
TAXID_COLUMN, \
|
||||||
|
MERGED_PREFIX
|
||||||
|
|
||||||
from obitools3.dms.capi.obidms cimport obi_import_view
|
from obitools3.dms.capi.obidms cimport obi_import_view
|
||||||
|
|
||||||
@ -72,7 +73,7 @@ def addOptions(parser):
|
|||||||
action="store_true", dest="import:preread",
|
action="store_true", dest="import:preread",
|
||||||
default=False,
|
default=False,
|
||||||
help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
|
help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
|
||||||
"a much faster import.")
|
"a much faster import. This option is not recommended and will slow down the import in any other case.")
|
||||||
|
|
||||||
|
|
||||||
def run(config):
|
def run(config):
|
||||||
@ -163,7 +164,7 @@ def run(config):
|
|||||||
taxo.write(taxo_name)
|
taxo.write(taxo_name)
|
||||||
taxo.close()
|
taxo.close()
|
||||||
o_dms.record_command_line(" ".join(sys.argv[1:]))
|
o_dms.record_command_line(" ".join(sys.argv[1:]))
|
||||||
o_dms.close()
|
o_dms.close(force=True)
|
||||||
logger("info", "Done.")
|
logger("info", "Done.")
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -217,11 +218,14 @@ def run(config):
|
|||||||
logger("info", "Read %d entries", i)
|
logger("info", "Read %d entries", i)
|
||||||
|
|
||||||
for tag in entry :
|
for tag in entry :
|
||||||
|
newtag = tag
|
||||||
|
if tag[:7] == b"merged_":
|
||||||
|
newtag = MERGED_PREFIX+tag[7:]
|
||||||
if type(entry[tag]) == dict :
|
if type(entry[tag]) == dict :
|
||||||
if tag in dict_dict:
|
if tag in dict_dict:
|
||||||
dict_dict[tag][0].update(entry[tag].keys())
|
dict_dict[newtag][0].update(entry[tag].keys())
|
||||||
else:
|
else:
|
||||||
dict_dict[tag] = [set(entry[tag].keys()), get_obitype(entry[tag])]
|
dict_dict[newtag] = [set(entry[tag].keys()), get_obitype(entry[tag])]
|
||||||
i+=1
|
i+=1
|
||||||
|
|
||||||
if pb is not None:
|
if pb is not None:
|
||||||
@ -232,7 +236,7 @@ def run(config):
|
|||||||
dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
|
dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
|
||||||
nb_elements_per_line=len(dict_dict[tag][0]), \
|
nb_elements_per_line=len(dict_dict[tag][0]), \
|
||||||
elements_names=list(dict_dict[tag][0])), \
|
elements_names=list(dict_dict[tag][0])), \
|
||||||
value_obitype)
|
dict_dict[tag][1])
|
||||||
|
|
||||||
|
|
||||||
# Reinitialize the input
|
# Reinitialize the input
|
||||||
@ -288,6 +292,8 @@ def run(config):
|
|||||||
tag = TAXID_COLUMN
|
tag = TAXID_COLUMN
|
||||||
if tag == b"count":
|
if tag == b"count":
|
||||||
tag = COUNT_COLUMN
|
tag = COUNT_COLUMN
|
||||||
|
if tag[:7] == b"merged_":
|
||||||
|
tag = MERGED_PREFIX+tag[7:]
|
||||||
|
|
||||||
if tag not in dcols :
|
if tag not in dcols :
|
||||||
|
|
||||||
@ -328,8 +334,8 @@ def run(config):
|
|||||||
try:
|
try:
|
||||||
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
|
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
|
||||||
if type(value) == dict and \
|
if type(value) == dict and \
|
||||||
dcols[tag][0].nb_elements_per_line == 1 and len(value.keys()) == 1 \
|
dcols[tag][0].nb_elements_per_line == 1 \
|
||||||
and dcols[tag][0].elements_names[0] != list(value.keys())[0] :
|
and set(dcols[tag][0].elements_names) != set(value.keys()) :
|
||||||
raise IndexError # trigger column rewrite
|
raise IndexError # trigger column rewrite
|
||||||
|
|
||||||
# Fill value
|
# Fill value
|
||||||
@ -402,7 +408,7 @@ def run(config):
|
|||||||
except AttributeError:
|
except AttributeError:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
output[0].close()
|
output[0].close(force=True)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -42,8 +42,8 @@ def addOptions(parser):
|
|||||||
metavar="<URI>",
|
metavar="<URI>",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help="URI to the view containing the samples definition (with tags, primers, sample names,...)"
|
help="URI to the view containing the samples definition (with tags, primers, sample names,...).\n"
|
||||||
"Warning: primer lengths must be less than or equal to 32")
|
"\nWarning: primer lengths must be less than or equal to 32")
|
||||||
|
|
||||||
group.add_argument('-R', '--reverse-reads',
|
group.add_argument('-R', '--reverse-reads',
|
||||||
action="store", dest="ngsfilter:reverse",
|
action="store", dest="ngsfilter:reverse",
|
||||||
|
@ -5,5 +5,5 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
|
|||||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||||
|
|
||||||
|
|
||||||
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy)
|
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy, dict config)
|
||||||
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*, int max_elts=*)
|
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, dict config, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*, int max_elts=*)
|
||||||
|
@ -56,7 +56,7 @@ def addOptions(parser):
|
|||||||
"(option can be used several times).")
|
"(option can be used several times).")
|
||||||
|
|
||||||
|
|
||||||
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy, dict config) :
|
||||||
|
|
||||||
cdef int taxid
|
cdef int taxid
|
||||||
cdef Nuc_Seq_Stored seq
|
cdef Nuc_Seq_Stored seq
|
||||||
@ -69,7 +69,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
|||||||
cdef object gn_sn
|
cdef object gn_sn
|
||||||
cdef object fa_sn
|
cdef object fa_sn
|
||||||
|
|
||||||
# Create columns
|
# Create columns and save them for efficiency
|
||||||
if b"species" in o_view and o_view[b"species"].data_type_int != OBI_INT :
|
if b"species" in o_view and o_view[b"species"].data_type_int != OBI_INT :
|
||||||
o_view.delete_column(b"species")
|
o_view.delete_column(b"species")
|
||||||
if b"species" not in o_view:
|
if b"species" not in o_view:
|
||||||
@ -77,6 +77,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
|||||||
b"species",
|
b"species",
|
||||||
OBI_INT
|
OBI_INT
|
||||||
)
|
)
|
||||||
|
species_column = o_view[b"species"]
|
||||||
|
|
||||||
if b"genus" in o_view and o_view[b"genus"].data_type_int != OBI_INT :
|
if b"genus" in o_view and o_view[b"genus"].data_type_int != OBI_INT :
|
||||||
o_view.delete_column(b"genus")
|
o_view.delete_column(b"genus")
|
||||||
@ -85,6 +86,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
|||||||
b"genus",
|
b"genus",
|
||||||
OBI_INT
|
OBI_INT
|
||||||
)
|
)
|
||||||
|
genus_column = o_view[b"genus"]
|
||||||
|
|
||||||
if b"family" in o_view and o_view[b"family"].data_type_int != OBI_INT :
|
if b"family" in o_view and o_view[b"family"].data_type_int != OBI_INT :
|
||||||
o_view.delete_column(b"family")
|
o_view.delete_column(b"family")
|
||||||
@ -93,6 +95,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
|||||||
b"family",
|
b"family",
|
||||||
OBI_INT
|
OBI_INT
|
||||||
)
|
)
|
||||||
|
family_column = o_view[b"family"]
|
||||||
|
|
||||||
if b"species_name" in o_view and o_view[b"species_name"].data_type_int != OBI_STR :
|
if b"species_name" in o_view and o_view[b"species_name"].data_type_int != OBI_STR :
|
||||||
o_view.delete_column(b"species_name")
|
o_view.delete_column(b"species_name")
|
||||||
@ -101,6 +104,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
|||||||
b"species_name",
|
b"species_name",
|
||||||
OBI_STR
|
OBI_STR
|
||||||
)
|
)
|
||||||
|
species_name_column = o_view[b"species_name"]
|
||||||
|
|
||||||
if b"genus_name" in o_view and o_view[b"genus_name"].data_type_int != OBI_STR :
|
if b"genus_name" in o_view and o_view[b"genus_name"].data_type_int != OBI_STR :
|
||||||
o_view.delete_column(b"genus_name")
|
o_view.delete_column(b"genus_name")
|
||||||
@ -109,6 +113,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
|||||||
b"genus_name",
|
b"genus_name",
|
||||||
OBI_STR
|
OBI_STR
|
||||||
)
|
)
|
||||||
|
genus_name_column = o_view[b"genus_name"]
|
||||||
|
|
||||||
if b"family_name" in o_view and o_view[b"family_name"].data_type_int != OBI_STR :
|
if b"family_name" in o_view and o_view[b"family_name"].data_type_int != OBI_STR :
|
||||||
o_view.delete_column(b"family_name")
|
o_view.delete_column(b"family_name")
|
||||||
@ -117,6 +122,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
|||||||
b"family_name",
|
b"family_name",
|
||||||
OBI_STR
|
OBI_STR
|
||||||
)
|
)
|
||||||
|
family_name_column = o_view[b"family_name"]
|
||||||
|
|
||||||
if b"rank" in o_view and o_view[b"rank"].data_type_int != OBI_STR :
|
if b"rank" in o_view and o_view[b"rank"].data_type_int != OBI_STR :
|
||||||
o_view.delete_column(b"rank")
|
o_view.delete_column(b"rank")
|
||||||
@ -125,6 +131,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
|||||||
b"rank",
|
b"rank",
|
||||||
OBI_STR
|
OBI_STR
|
||||||
)
|
)
|
||||||
|
rank_column = o_view[b"rank"]
|
||||||
|
|
||||||
if b"scientific_name" in o_view and o_view[b"scientific_name"].data_type_int != OBI_STR :
|
if b"scientific_name" in o_view and o_view[b"scientific_name"].data_type_int != OBI_STR :
|
||||||
o_view.delete_column(b"scientific_name")
|
o_view.delete_column(b"scientific_name")
|
||||||
@ -133,9 +140,15 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
|||||||
b"scientific_name",
|
b"scientific_name",
|
||||||
OBI_STR
|
OBI_STR
|
||||||
)
|
)
|
||||||
|
scientific_name_column = o_view[b"scientific_name"]
|
||||||
for seq in o_view:
|
|
||||||
PyErr_CheckSignals()
|
# Initialize the progress bar
|
||||||
|
pb = ProgressBar(len(o_view), config, seconde=5)
|
||||||
|
|
||||||
|
i=0
|
||||||
|
for seq in o_view:
|
||||||
|
PyErr_CheckSignals()
|
||||||
|
pb(i)
|
||||||
if MERGED_TAXID_COLUMN in seq :
|
if MERGED_TAXID_COLUMN in seq :
|
||||||
m_taxids = []
|
m_taxids = []
|
||||||
m_taxids_dict = seq[MERGED_TAXID_COLUMN]
|
m_taxids_dict = seq[MERGED_TAXID_COLUMN]
|
||||||
@ -165,20 +178,23 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
|
|||||||
else:
|
else:
|
||||||
fa_sn = None
|
fa_sn = None
|
||||||
tfa = None
|
tfa = None
|
||||||
|
|
||||||
seq[b"species"] = tsp
|
|
||||||
seq[b"genus"] = tgn
|
|
||||||
seq[b"family"] = tfa
|
|
||||||
|
|
||||||
seq[b"species_name"] = sp_sn
|
|
||||||
seq[b"genus_name"] = gn_sn
|
|
||||||
seq[b"family_name"] = fa_sn
|
|
||||||
|
|
||||||
seq[b"rank"] = taxonomy.get_rank(taxid)
|
species_column[i] = tsp
|
||||||
seq[b"scientific_name"] = taxonomy.get_scientific_name(taxid)
|
genus_column[i] = tgn
|
||||||
|
family_column[i] = tfa
|
||||||
|
|
||||||
|
species_name_column[i] = sp_sn
|
||||||
|
genus_name_column[i] = gn_sn
|
||||||
|
family_name_column[i] = fa_sn
|
||||||
|
|
||||||
|
rank_column[i] = taxonomy.get_rank(taxid)
|
||||||
|
scientific_name_column[i] = taxonomy.get_scientific_name(taxid)
|
||||||
|
i+=1
|
||||||
|
|
||||||
|
pb(len(o_view), force=True)
|
||||||
|
|
||||||
|
|
||||||
|
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, dict config, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) :
|
||||||
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) :
|
|
||||||
|
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef int k
|
cdef int k
|
||||||
@ -187,6 +203,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
cdef int u_idx
|
cdef int u_idx
|
||||||
cdef int i_idx
|
cdef int i_idx
|
||||||
cdef int i_count
|
cdef int i_count
|
||||||
|
cdef int o_count
|
||||||
cdef str key_str
|
cdef str key_str
|
||||||
cdef bytes key
|
cdef bytes key
|
||||||
cdef bytes mkey
|
cdef bytes mkey
|
||||||
@ -209,7 +226,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
cdef Nuc_Seq_Stored i_seq
|
cdef Nuc_Seq_Stored i_seq
|
||||||
cdef Nuc_Seq_Stored o_seq
|
cdef Nuc_Seq_Stored o_seq
|
||||||
cdef Nuc_Seq_Stored u_seq
|
cdef Nuc_Seq_Stored u_seq
|
||||||
cdef Column i_col
|
|
||||||
cdef Column i_seq_col
|
cdef Column i_seq_col
|
||||||
cdef Column i_id_col
|
cdef Column i_id_col
|
||||||
cdef Column i_taxid_col
|
cdef Column i_taxid_col
|
||||||
@ -217,6 +233,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
cdef Column o_id_col
|
cdef Column o_id_col
|
||||||
cdef Column o_taxid_dist_col
|
cdef Column o_taxid_dist_col
|
||||||
cdef Column o_merged_col
|
cdef Column o_merged_col
|
||||||
|
cdef Column o_count_col
|
||||||
|
cdef Column i_count_col
|
||||||
cdef Column_line i_mcol
|
cdef Column_line i_mcol
|
||||||
cdef object taxid_dist_dict
|
cdef object taxid_dist_dict
|
||||||
cdef object iter_view
|
cdef object iter_view
|
||||||
@ -252,7 +270,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
mergedKeys_m = []
|
mergedKeys_m = []
|
||||||
for k in range(k_count):
|
for k in range(k_count):
|
||||||
mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k])
|
mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k])
|
||||||
|
|
||||||
|
# Check that not trying to remerge without total count information
|
||||||
|
for key in mergedKeys_m:
|
||||||
|
if key in view and COUNT_COLUMN not in view:
|
||||||
|
raise Exception("\n>>>>\nError: trying to re-merge tags without total count tag. Run obi annotate to add the count tag from the relevant merged tag, i.e.: \nobi annotate --set-tag COUNT:'sum([value for key,value in sequence['MERGED_sample'].items()])' dms/input dms/output\n")
|
||||||
|
|
||||||
if categories is None:
|
if categories is None:
|
||||||
categories = []
|
categories = []
|
||||||
|
|
||||||
@ -320,7 +343,11 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
for k in range(k_count):
|
for k in range(k_count):
|
||||||
key = mergedKeys[k]
|
key = mergedKeys[k]
|
||||||
merged_col_name = mergedKeys_m[k]
|
merged_col_name = mergedKeys_m[k]
|
||||||
i_col = view[key]
|
|
||||||
|
if merged_col_name in view:
|
||||||
|
i_col = view[merged_col_name]
|
||||||
|
else:
|
||||||
|
i_col = view[key]
|
||||||
|
|
||||||
if merged_infos[merged_col_name]['nb_elts'] > max_elts:
|
if merged_infos[merged_col_name]['nb_elts'] > max_elts:
|
||||||
str_merged_cols.append(merged_col_name)
|
str_merged_cols.append(merged_col_name)
|
||||||
@ -374,12 +401,19 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
alias=MERGED_COLUMN
|
alias=MERGED_COLUMN
|
||||||
)
|
)
|
||||||
|
|
||||||
# Keep columns that are going to be used a lot in variables
|
# Keep columns in variables for efficiency
|
||||||
o_id_col = o_view[ID_COLUMN]
|
o_id_col = o_view[ID_COLUMN]
|
||||||
if TAXID_DIST_COLUMN in o_view:
|
if TAXID_DIST_COLUMN in o_view:
|
||||||
o_taxid_dist_col = o_view[TAXID_DIST_COLUMN]
|
o_taxid_dist_col = o_view[TAXID_DIST_COLUMN]
|
||||||
if MERGED_COLUMN in o_view:
|
if MERGED_COLUMN in o_view:
|
||||||
o_merged_col = o_view[MERGED_COLUMN]
|
o_merged_col = o_view[MERGED_COLUMN]
|
||||||
|
if COUNT_COLUMN not in o_view:
|
||||||
|
Column.new_column(o_view,
|
||||||
|
COUNT_COLUMN,
|
||||||
|
OBI_INT)
|
||||||
|
o_count_col = o_view[COUNT_COLUMN]
|
||||||
|
if COUNT_COLUMN in view:
|
||||||
|
i_count_col = view[COUNT_COLUMN]
|
||||||
|
|
||||||
pb(len(view), force=True)
|
pb(len(view), force=True)
|
||||||
print("")
|
print("")
|
||||||
@ -407,7 +441,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
merged_list = list(set(merged_list)) # deduplicate the list
|
merged_list = list(set(merged_list)) # deduplicate the list
|
||||||
o_merged_col[o_idx] = merged_list
|
o_merged_col[o_idx] = merged_list
|
||||||
|
|
||||||
o_seq[COUNT_COLUMN] = 0
|
o_count = 0
|
||||||
|
|
||||||
if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None:
|
if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None:
|
||||||
taxid_dist_dict = i_taxid_dist_col[u_idx]
|
taxid_dist_dict = i_taxid_dist_col[u_idx]
|
||||||
@ -423,12 +457,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
i_id = i_id_col[i_idx]
|
i_id = i_id_col[i_idx]
|
||||||
i_seq = view[i_idx]
|
i_seq = view[i_idx]
|
||||||
|
|
||||||
if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
|
if COUNT_COLUMN not in i_seq or i_count_col[i_idx] is None:
|
||||||
i_count = 1
|
i_count = 1
|
||||||
else:
|
else:
|
||||||
i_count = i_seq[COUNT_COLUMN]
|
i_count = i_count_col[i_idx]
|
||||||
|
|
||||||
o_seq[COUNT_COLUMN] += i_count
|
o_count += i_count
|
||||||
|
|
||||||
for k in range(k_count):
|
for k in range(k_count):
|
||||||
|
|
||||||
@ -463,44 +497,52 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
|
|||||||
mcol[key2] = i_mcol[key2]
|
mcol[key2] = i_mcol[key2]
|
||||||
else:
|
else:
|
||||||
mcol[key2] = mcol[key2] + i_mcol[key2]
|
mcol[key2] = mcol[key2] + i_mcol[key2]
|
||||||
|
|
||||||
# Write taxid_dist
|
|
||||||
if mergeIds and TAXID_COLUMN in mergedKeys:
|
|
||||||
if TAXID_DIST_COLUMN in str_merged_cols:
|
|
||||||
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
|
|
||||||
else:
|
|
||||||
o_taxid_dist_col[o_idx] = taxid_dist_dict
|
|
||||||
|
|
||||||
# Write merged dicts
|
|
||||||
for mkey in merged_dict:
|
|
||||||
if mkey in str_merged_cols:
|
|
||||||
mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
|
|
||||||
else:
|
|
||||||
mkey_cols[mkey][o_idx] = merged_dict[mkey]
|
|
||||||
# Sets NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools
|
|
||||||
#for key in mkey_cols[mkey][o_idx]:
|
|
||||||
# if mkey_cols[mkey][o_idx][key] is None:
|
|
||||||
# mkey_cols[mkey][o_idx][key] = 0
|
|
||||||
|
|
||||||
for key in i_seq.keys():
|
for key in i_seq.keys():
|
||||||
# Delete informations that differ between the merged sequences
|
# Delete informations that differ between the merged sequences
|
||||||
# TODO make special columns list?
|
# TODO make special columns list? // could be more efficient
|
||||||
if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \
|
if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \
|
||||||
and key not in merged_dict :
|
and key not in merged_dict :
|
||||||
o_seq[key] = None
|
o_seq[key] = None
|
||||||
|
|
||||||
|
# Write merged dicts
|
||||||
|
for mkey in merged_dict:
|
||||||
|
if mkey in str_merged_cols:
|
||||||
|
mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
|
||||||
|
else:
|
||||||
|
mkey_cols[mkey][o_idx] = merged_dict[mkey]
|
||||||
|
# Sets NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools
|
||||||
|
#for key in mkey_cols[mkey][o_idx]:
|
||||||
|
# if mkey_cols[mkey][o_idx][key] is None:
|
||||||
|
# mkey_cols[mkey][o_idx][key] = 0
|
||||||
|
|
||||||
|
# Write taxid_dist
|
||||||
|
if mergeIds and TAXID_COLUMN in mergedKeys:
|
||||||
|
if TAXID_DIST_COLUMN in str_merged_cols:
|
||||||
|
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
|
||||||
|
else:
|
||||||
|
o_taxid_dist_col[o_idx] = taxid_dist_dict
|
||||||
|
|
||||||
|
o_count_col[o_idx] = o_count
|
||||||
o_idx += 1
|
o_idx += 1
|
||||||
|
|
||||||
|
pb(len(uniques), force=True)
|
||||||
|
|
||||||
# Deletes quality columns if there is one because the matching between sequence and quality will be broken (quality set to NA when sequence not)
|
# Deletes quality columns if there is one because the matching between sequence and quality will be broken (quality set to NA when sequence not)
|
||||||
if QUALITY_COLUMN in view:
|
if QUALITY_COLUMN in view:
|
||||||
o_view.delete_column(QUALITY_COLUMN)
|
o_view.delete_column(QUALITY_COLUMN)
|
||||||
if REVERSE_QUALITY_COLUMN in view:
|
if REVERSE_QUALITY_COLUMN in view:
|
||||||
o_view.delete_column(REVERSE_QUALITY_COLUMN)
|
o_view.delete_column(REVERSE_QUALITY_COLUMN)
|
||||||
|
|
||||||
|
# Delete old columns that are now merged
|
||||||
|
for k in range(k_count):
|
||||||
|
if mergedKeys[k] in o_view:
|
||||||
|
o_view.delete_column(mergedKeys[k])
|
||||||
|
|
||||||
if taxonomy is not None:
|
if taxonomy is not None:
|
||||||
print("") # TODO because in the middle of progress bar. Better solution?
|
print("") # TODO because in the middle of progress bar. Better solution?
|
||||||
logger("info", "Merging taxonomy classification")
|
logger("info", "Merging taxonomy classification")
|
||||||
merge_taxonomy_classification(o_view, taxonomy)
|
merge_taxonomy_classification(o_view, taxonomy, config)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -547,11 +589,10 @@ def run(config):
|
|||||||
pb = ProgressBar(len(entries), config, seconde=5)
|
pb = ProgressBar(len(entries), config, seconde=5)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
uniq_sequences(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])
|
uniq_sequences(entries, o_view, pb, config, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
|
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
|
||||||
|
|
||||||
pb(len(entries), force=True)
|
|
||||||
print("", file=sys.stderr)
|
print("", file=sys.stderr)
|
||||||
|
|
||||||
# Save command config in View and DMS comments
|
# Save command config in View and DMS comments
|
||||||
@ -567,8 +608,8 @@ def run(config):
|
|||||||
#print("\n\nOutput view:\n````````````", file=sys.stderr)
|
#print("\n\nOutput view:\n````````````", file=sys.stderr)
|
||||||
#print(repr(o_view), file=sys.stderr)
|
#print(repr(o_view), file=sys.stderr)
|
||||||
|
|
||||||
input[0].close()
|
input[0].close(force=True)
|
||||||
output[0].close()
|
output[0].close(force=True)
|
||||||
|
|
||||||
logger("info", "Done.")
|
logger("info", "Done.")
|
||||||
|
|
||||||
|
@ -63,6 +63,8 @@ cdef extern from "obidmscolumn.h" nogil:
|
|||||||
|
|
||||||
char* obi_get_elements_names(OBIDMS_column_p column)
|
char* obi_get_elements_names(OBIDMS_column_p column)
|
||||||
|
|
||||||
|
char* obi_column_formatted_infos(OBIDMS_column_p column)
|
||||||
|
|
||||||
index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name)
|
index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name)
|
||||||
|
|
||||||
int obi_column_write_comments(OBIDMS_column_p column, const char* comments)
|
int obi_column_write_comments(OBIDMS_column_p column, const char* comments)
|
||||||
|
@ -14,6 +14,7 @@ from ..capi.obidms cimport obi_import_column
|
|||||||
from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \
|
from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \
|
||||||
obi_close_column, \
|
obi_close_column, \
|
||||||
obi_get_elements_names, \
|
obi_get_elements_names, \
|
||||||
|
obi_column_formatted_infos, \
|
||||||
obi_column_write_comments
|
obi_column_write_comments
|
||||||
|
|
||||||
from ..capi.obiutils cimport obi_format_date
|
from ..capi.obiutils cimport obi_format_date
|
||||||
@ -38,7 +39,7 @@ from obitools3.utils cimport tobytes, \
|
|||||||
|
|
||||||
from obitools3.dms.column import typed_column
|
from obitools3.dms.column import typed_column
|
||||||
|
|
||||||
from libc.stdlib cimport free
|
from libc.stdlib cimport free
|
||||||
|
|
||||||
import importlib
|
import importlib
|
||||||
import inspect
|
import inspect
|
||||||
@ -288,9 +289,15 @@ cdef class Column(OBIWrapper) :
|
|||||||
@OBIWrapper.checkIsActive
|
@OBIWrapper.checkIsActive
|
||||||
def __repr__(self) :
|
def __repr__(self) :
|
||||||
cdef bytes s
|
cdef bytes s
|
||||||
|
#cdef char* s_b
|
||||||
|
#cdef str s_str
|
||||||
|
#s_b = obi_column_formatted_infos(self.pointer())
|
||||||
|
#s_str = bytes2str(s_b)
|
||||||
|
#free(s_b)
|
||||||
s = self._alias + b", data type: " + self.data_type
|
s = self._alias + b", data type: " + self.data_type
|
||||||
|
#return s_str
|
||||||
return bytes2str(s)
|
return bytes2str(s)
|
||||||
|
|
||||||
|
|
||||||
def close(self): # TODO discuss, can't be called bc then bug when closing view that tries to close it in C
|
def close(self): # TODO discuss, can't be called bc then bug when closing view that tries to close it in C
|
||||||
|
|
||||||
|
@ -531,8 +531,8 @@ cdef class View(OBIWrapper) :
|
|||||||
for level in self.view_history:
|
for level in self.view_history:
|
||||||
command_list = [level[input][b"command_line"] for input in level.keys()]
|
command_list = [level[input][b"command_line"] for input in level.keys()]
|
||||||
for command in command_list:
|
for command in command_list:
|
||||||
|
s+=b"obi "
|
||||||
s+=command
|
s+=command
|
||||||
s+=b"\n"
|
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
@ -177,7 +177,7 @@ def emblIterator_dir(dir_path,
|
|||||||
for filename in files:
|
for filename in files:
|
||||||
if read==only:
|
if read==only:
|
||||||
return
|
return
|
||||||
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
|
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
|
||||||
f = uopen(filename)
|
f = uopen(filename)
|
||||||
if only is not None:
|
if only is not None:
|
||||||
only_f = only-read
|
only_f = only-read
|
||||||
|
@ -104,6 +104,7 @@ def fastaNucIterator(lineiterator,
|
|||||||
cdef bytes sequence
|
cdef bytes sequence
|
||||||
cdef int skipped, ionly, read
|
cdef int skipped, ionly, read
|
||||||
cdef Nuc_Seq seq
|
cdef Nuc_Seq seq
|
||||||
|
cdef bint stop
|
||||||
|
|
||||||
if only is None:
|
if only is None:
|
||||||
ionly = -1
|
ionly = -1
|
||||||
@ -130,7 +131,8 @@ def fastaNucIterator(lineiterator,
|
|||||||
else:
|
else:
|
||||||
line = firstline
|
line = firstline
|
||||||
|
|
||||||
while True:
|
stop=False
|
||||||
|
while not stop:
|
||||||
|
|
||||||
if ionly >= 0 and read >= ionly:
|
if ionly >= 0 and read >= ionly:
|
||||||
break
|
break
|
||||||
@ -153,7 +155,7 @@ def fastaNucIterator(lineiterator,
|
|||||||
s.append(line[0:-1])
|
s.append(line[0:-1])
|
||||||
line = next(iterator)
|
line = next(iterator)
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
pass
|
stop=True
|
||||||
|
|
||||||
sequence = b"".join(s)
|
sequence = b"".join(s)
|
||||||
|
|
||||||
|
@ -171,10 +171,12 @@ def genbankIterator_dir(dir_path,
|
|||||||
read = 0
|
read = 0
|
||||||
read_files = 0
|
read_files = 0
|
||||||
files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))]
|
files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))]
|
||||||
|
files.extend([filename for filename in glob.glob(os.path.join(path, b'*.seq*'))]) # new genbank extension
|
||||||
|
files = list(set(files))
|
||||||
for filename in files:
|
for filename in files:
|
||||||
if read==only:
|
if read==only:
|
||||||
return
|
return
|
||||||
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
|
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
|
||||||
f = uopen(filename)
|
f = uopen(filename)
|
||||||
if only is not None:
|
if only is not None:
|
||||||
only_f = only-read
|
only_f = only-read
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
major = 3
|
major = 3
|
||||||
minor = 0
|
minor = 0
|
||||||
serial= '0-beta14'
|
serial= '0b20'
|
||||||
|
|
||||||
version ="%d.%02d.%s" % (major,minor,serial)
|
version ="%d.%d.%s" % (major,minor,serial)
|
||||||
|
5
requirements.txt
Executable file
5
requirements.txt
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
--extra-index-url https://pypi.python.org/simple/
|
||||||
|
Cython>=0.24
|
||||||
|
Sphinx>=1.2.0
|
||||||
|
ipython>=3.0.0
|
||||||
|
breathe>=4.0.0
|
16
setup.py
16
setup.py
@ -5,8 +5,9 @@ import re
|
|||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
from distutils import log
|
from distutils import log
|
||||||
from distutils.core import setup
|
#from distutils.core import setup
|
||||||
|
from setuptools import setup # to work with pip
|
||||||
|
|
||||||
from distutils.core import Extension
|
from distutils.core import Extension
|
||||||
from distutils.sysconfig import get_python_lib
|
from distutils.sysconfig import get_python_lib
|
||||||
|
|
||||||
@ -88,9 +89,10 @@ PACKAGE = "OBITools3"
|
|||||||
VERSION = version
|
VERSION = version
|
||||||
AUTHOR = 'Celine Mercier'
|
AUTHOR = 'Celine Mercier'
|
||||||
EMAIL = 'celine.mercier@metabarcoding.org'
|
EMAIL = 'celine.mercier@metabarcoding.org'
|
||||||
URL = "http://metabarcoding.org/obitools3"
|
URL = "https://metabarcoding.org/obitools3"
|
||||||
|
PLATFORMS = "posix"
|
||||||
LICENSE = "CeCILL-V2"
|
LICENSE = "CeCILL-V2"
|
||||||
DESCRIPTION = "Tools and library for DNA metabarcoding",
|
DESCRIPTION = "A package for the management of analyses and data in DNA metabarcoding."
|
||||||
PYTHONMIN = '3.5'
|
PYTHONMIN = '3.5'
|
||||||
|
|
||||||
SRC = 'python'
|
SRC = 'python'
|
||||||
@ -147,12 +149,18 @@ classifiers=['Development Status :: 4 - Beta',
|
|||||||
'Topic :: Utilities',
|
'Topic :: Utilities',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
with open("README.md", "r") as fh:
|
||||||
|
long_description = fh.read()
|
||||||
|
|
||||||
setup(name=PACKAGE,
|
setup(name=PACKAGE,
|
||||||
description=DESCRIPTION,
|
description=DESCRIPTION,
|
||||||
|
long_description=long_description,
|
||||||
|
long_description_content_type="text/markdown",
|
||||||
classifiers=classifiers,
|
classifiers=classifiers,
|
||||||
version=VERSION,
|
version=VERSION,
|
||||||
author=AUTHOR,
|
author=AUTHOR,
|
||||||
author_email=EMAIL,
|
author_email=EMAIL,
|
||||||
|
platforms=PLATFORMS,
|
||||||
license=LICENSE,
|
license=LICENSE,
|
||||||
url=URL,
|
url=URL,
|
||||||
ext_modules=xx,
|
ext_modules=xx,
|
||||||
|
@ -157,7 +157,7 @@ int build_reference_db(const char* dms_name,
|
|||||||
ecotx_t* lca_2 = NULL;
|
ecotx_t* lca_2 = NULL;
|
||||||
ecotx_t* lca = NULL;
|
ecotx_t* lca = NULL;
|
||||||
index_t idx1, idx2;
|
index_t idx1, idx2;
|
||||||
index_t i, j, k;
|
index_t i, j, k, count;
|
||||||
int32_t taxid_array_length;
|
int32_t taxid_array_length;
|
||||||
int32_t score_array_length;
|
int32_t score_array_length;
|
||||||
int32_t taxid_array_writable_length;
|
int32_t taxid_array_writable_length;
|
||||||
@ -185,6 +185,7 @@ int build_reference_db(const char* dms_name,
|
|||||||
matrix_view_name = strcpy(matrix_view_name, o_view_name);
|
matrix_view_name = strcpy(matrix_view_name, o_view_name);
|
||||||
strcat(matrix_view_name, "_matrix");
|
strcat(matrix_view_name, "_matrix");
|
||||||
|
|
||||||
|
fprintf(stderr, "Aligning queries with reference database...\n");
|
||||||
if (obi_lcs_align_one_column(dms_name,
|
if (obi_lcs_align_one_column(dms_name,
|
||||||
refs_view_name,
|
refs_view_name,
|
||||||
"",
|
"",
|
||||||
@ -320,13 +321,19 @@ int build_reference_db(const char* dms_name,
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
count = (matrix_with_lca_view->infos)->line_count;
|
||||||
|
fprintf(stderr, "Computing LCAs...\n");
|
||||||
|
|
||||||
// Compute all the LCAs
|
// Compute all the LCAs
|
||||||
// For each pair
|
// For each pair
|
||||||
for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++)
|
for (i=0; i<count; i++)
|
||||||
{
|
{
|
||||||
if (! keep_running)
|
if (! keep_running)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
if (i%1000 == 0)
|
||||||
|
fprintf(stderr,"\rDone : %f %% ", (i / (float) count)*100);
|
||||||
|
|
||||||
// Read all taxids associated with the first sequence and compute their LCA
|
// Read all taxids associated with the first sequence and compute their LCA
|
||||||
// Read line index
|
// Read line index
|
||||||
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
|
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
|
||||||
@ -363,6 +370,7 @@ int build_reference_db(const char* dms_name,
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
fprintf(stderr,"\rDone : 100 %% \n");
|
||||||
|
|
||||||
// Clone refs view, add 2 arrays columns for lca and score, compute and write them
|
// Clone refs view, add 2 arrays columns for lca and score, compute and write them
|
||||||
|
|
||||||
@ -442,13 +450,18 @@ int build_reference_db(const char* dms_name,
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "Building LCA arrays...\n");
|
||||||
|
|
||||||
// For each sequence, look for all its alignments in the matrix, and for each different LCA taxid/score, order them and write them
|
// For each sequence, look for all its alignments in the matrix, and for each different LCA taxid/score, order them and write them
|
||||||
// Going through matrix once, filling refs arrays on the go for efficiency
|
// Going through matrix once, filling refs arrays on the go for efficiency
|
||||||
for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++)
|
for (i=0; i<count; i++)
|
||||||
{
|
{
|
||||||
if (! keep_running)
|
if (! keep_running)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
if (i%1000 == 0)
|
||||||
|
fprintf(stderr,"\rDone : %f %% ", (i / (float) count)*100);
|
||||||
|
|
||||||
// Read ref line indexes
|
// Read ref line indexes
|
||||||
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
|
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
|
||||||
idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0);
|
idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0);
|
||||||
@ -464,6 +477,8 @@ int build_reference_db(const char* dms_name,
|
|||||||
// Read alignment score
|
// Read alignment score
|
||||||
score = obi_get_float_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_score_column, i, 0);
|
score = obi_get_float_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_score_column, i, 0);
|
||||||
|
|
||||||
|
//fprintf(stderr, "\n\ntaxid_lca=%d, score=%f, idx1=%d, idx2=%d", taxid_lca, score, idx1, idx2);
|
||||||
|
|
||||||
///////////////// Compute for first sequence \\\\\\\\\\\\\\\\\\\\\\\ (TODO function)
|
///////////////// Compute for first sequence \\\\\\\\\\\\\\\\\\\\\\\ (TODO function)
|
||||||
|
|
||||||
// Read arrays
|
// Read arrays
|
||||||
@ -480,9 +495,11 @@ int build_reference_db(const char* dms_name,
|
|||||||
// return -1;
|
// return -1;
|
||||||
// }
|
// }
|
||||||
|
|
||||||
|
//fprintf(stderr, "\n1st sequence");
|
||||||
// If empty, add values
|
// If empty, add values
|
||||||
if (taxid_array_length == 0)
|
if (taxid_array_length == 0)
|
||||||
{
|
{
|
||||||
|
//fprintf(stderr, "\nEmpty, add value");
|
||||||
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0)
|
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
||||||
@ -496,6 +513,8 @@ int build_reference_db(const char* dms_name,
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
//fprintf(stderr, "\nNot empty");
|
||||||
|
|
||||||
j = 0;
|
j = 0;
|
||||||
modified = false;
|
modified = false;
|
||||||
while (j < taxid_array_length)
|
while (j < taxid_array_length)
|
||||||
@ -509,6 +528,9 @@ int build_reference_db(const char* dms_name,
|
|||||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||||
modified = true;
|
modified = true;
|
||||||
|
|
||||||
|
//fprintf(stderr, "\nSame LCA, replace %d and %f with %d and %f", lca_taxid_array_writable[j],
|
||||||
|
// score_array_writable[j], taxid_lca, score);
|
||||||
|
|
||||||
// Better score for the same LCA, replace this LCA/score pair
|
// Better score for the same LCA, replace this LCA/score pair
|
||||||
lca_taxid_array_writable[j] = taxid_lca;
|
lca_taxid_array_writable[j] = taxid_lca;
|
||||||
score_array_writable[j] = score;
|
score_array_writable[j] = score;
|
||||||
@ -535,6 +557,8 @@ int build_reference_db(const char* dms_name,
|
|||||||
{
|
{
|
||||||
if (score > score_array[j])
|
if (score > score_array[j])
|
||||||
{
|
{
|
||||||
|
//fprintf(stderr, "\nInsert new");
|
||||||
|
|
||||||
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
||||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||||
modified = true;
|
modified = true;
|
||||||
@ -579,10 +603,15 @@ int build_reference_db(const char* dms_name,
|
|||||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||||
modified = true;
|
modified = true;
|
||||||
|
|
||||||
|
//fprintf(stderr, "\nAppend at the end");
|
||||||
|
|
||||||
// Append LCA
|
// Append LCA
|
||||||
lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca;
|
lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca;
|
||||||
score_array_writable[score_array_writable_length] = score;
|
score_array_writable[score_array_writable_length] = score;
|
||||||
|
|
||||||
|
taxid_array_writable_length++;
|
||||||
|
score_array_writable_length++;
|
||||||
|
|
||||||
// Remove the previous (children) LCAs from the array if their score is equal or lower
|
// Remove the previous (children) LCAs from the array if their score is equal or lower
|
||||||
while ((j>0) && (score_array_writable[j-1] <= score))
|
while ((j>0) && (score_array_writable[j-1] <= score))
|
||||||
{
|
{
|
||||||
@ -603,6 +632,13 @@ int build_reference_db(const char* dms_name,
|
|||||||
// Write new arrays
|
// Write new arrays
|
||||||
if (modified)
|
if (modified)
|
||||||
{
|
{
|
||||||
|
// fprintf(stderr, "\n\nnew array:");
|
||||||
|
// for (k=0;k<taxid_array_writable_length;k++)
|
||||||
|
// {
|
||||||
|
// lca = obi_taxo_get_taxon_with_taxid(tax, lca_taxid_array_writable[k]);
|
||||||
|
// fprintf(stderr, "\nLCA=%d, %s, score=%f", lca_taxid_array_writable[k], lca->name, score_array_writable[k]);
|
||||||
|
// }
|
||||||
|
|
||||||
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, lca_taxid_array_writable, (uint8_t) (obi_sizeof(OBI_INT) * 8), taxid_array_writable_length) < 0)
|
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, lca_taxid_array_writable, (uint8_t) (obi_sizeof(OBI_INT) * 8), taxid_array_writable_length) < 0)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
||||||
@ -632,9 +668,13 @@ int build_reference_db(const char* dms_name,
|
|||||||
// return -1;
|
// return -1;
|
||||||
// }
|
// }
|
||||||
|
|
||||||
|
//fprintf(stderr, "\n2nd sequence");
|
||||||
|
|
||||||
// If empty, add values
|
// If empty, add values
|
||||||
if (taxid_array_length == 0)
|
if (taxid_array_length == 0)
|
||||||
{
|
{
|
||||||
|
//fprintf(stderr, "\nEmpty, add value");
|
||||||
|
|
||||||
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx2, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0)
|
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx2, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
||||||
@ -648,6 +688,8 @@ int build_reference_db(const char* dms_name,
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
//fprintf(stderr, "\nNot empty");
|
||||||
|
|
||||||
j = 0;
|
j = 0;
|
||||||
modified = false;
|
modified = false;
|
||||||
while (j < taxid_array_length)
|
while (j < taxid_array_length)
|
||||||
@ -661,6 +703,9 @@ int build_reference_db(const char* dms_name,
|
|||||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||||
modified = true;
|
modified = true;
|
||||||
|
|
||||||
|
//fprintf(stderr, "\nSame LCA, replace %d and %f with %d and %f", lca_taxid_array_writable[j],
|
||||||
|
// score_array_writable[j], taxid_lca, score);
|
||||||
|
|
||||||
// Better score for the same LCA, replace this LCA/score pair
|
// Better score for the same LCA, replace this LCA/score pair
|
||||||
lca_taxid_array_writable[j] = taxid_lca;
|
lca_taxid_array_writable[j] = taxid_lca;
|
||||||
score_array_writable[j] = score;
|
score_array_writable[j] = score;
|
||||||
@ -687,6 +732,8 @@ int build_reference_db(const char* dms_name,
|
|||||||
{
|
{
|
||||||
if (score > score_array[j])
|
if (score > score_array[j])
|
||||||
{
|
{
|
||||||
|
//fprintf(stderr, "\nInsert new");
|
||||||
|
|
||||||
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
||||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||||
modified = true;
|
modified = true;
|
||||||
@ -727,6 +774,8 @@ int build_reference_db(const char* dms_name,
|
|||||||
|
|
||||||
if (j == taxid_array_length) // same or parent LCA not found, need to be appended at the end
|
if (j == taxid_array_length) // same or parent LCA not found, need to be appended at the end
|
||||||
{
|
{
|
||||||
|
//fprintf(stderr, "\nAppend at the end");
|
||||||
|
|
||||||
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
||||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||||
modified = true;
|
modified = true;
|
||||||
@ -735,6 +784,9 @@ int build_reference_db(const char* dms_name,
|
|||||||
lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca;
|
lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca;
|
||||||
score_array_writable[score_array_writable_length] = score;
|
score_array_writable[score_array_writable_length] = score;
|
||||||
|
|
||||||
|
taxid_array_writable_length++;
|
||||||
|
score_array_writable_length++;
|
||||||
|
|
||||||
// Remove the previous (children) LCAs from the array if their score is equal or lower
|
// Remove the previous (children) LCAs from the array if their score is equal or lower
|
||||||
while ((j>0) && (score_array_writable[j-1] <= score))
|
while ((j>0) && (score_array_writable[j-1] <= score))
|
||||||
{
|
{
|
||||||
@ -769,11 +821,17 @@ int build_reference_db(const char* dms_name,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
fprintf(stderr,"\rDone : 100 %% \n");
|
||||||
|
|
||||||
|
fprintf(stderr, "Writing results...\n");
|
||||||
|
count = (o_view->infos)->line_count;
|
||||||
// Fill empty LCA informations (because filling from potentially sparse alignment matrix) with the sequence taxid
|
// Fill empty LCA informations (because filling from potentially sparse alignment matrix) with the sequence taxid
|
||||||
score=1.0; // technically getting LCA of identical sequences
|
score=1.0; // technically getting LCA of identical sequences
|
||||||
for (i=0; i<(o_view->infos)->line_count; i++)
|
for (i=0; i<count; i++)
|
||||||
{
|
{
|
||||||
|
if (i%1000 == 0)
|
||||||
|
fprintf(stderr,"\rDone : %f %% ", (i / (float) count)*100);
|
||||||
|
|
||||||
obi_get_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_array_length);
|
obi_get_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_array_length);
|
||||||
if (taxid_array_length == 0) // no LCA set
|
if (taxid_array_length == 0) // no LCA set
|
||||||
{
|
{
|
||||||
@ -799,6 +857,7 @@ int build_reference_db(const char* dms_name,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
fprintf(stderr,"\rDone : 100 %% \n");
|
||||||
|
|
||||||
// Add information about the threshold used to build the DB
|
// Add information about the threshold used to build the DB
|
||||||
snprintf(threshold_str, 5, "%f", threshold);
|
snprintf(threshold_str, 5, "%f", threshold);
|
||||||
@ -858,7 +917,6 @@ int build_reference_db(const char* dms_name,
|
|||||||
free(matrix_view_name);
|
free(matrix_view_name);
|
||||||
free(matrix_with_lca_view_name);
|
free(matrix_with_lca_view_name);
|
||||||
|
|
||||||
fprintf(stderr,"\rDone : 100 %% \n");
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1061,7 +1061,7 @@ int obi_ecopcr(const char* i_dms_name,
|
|||||||
length = 0;
|
length = 0;
|
||||||
if (posj > posi)
|
if (posj > posi)
|
||||||
length = posj - posi - o1->patlen - o2->patlen;
|
length = posj - posi - o1->patlen - o2->patlen;
|
||||||
if (posj < posi)
|
else if (circular > 0)
|
||||||
length = posj + apatseq->seqlen - posi - o1->patlen - o2->patlen;
|
length = posj + apatseq->seqlen - posi - o1->patlen - o2->patlen;
|
||||||
if ((length>0) && // For when primers touch or overlap
|
if ((length>0) && // For when primers touch or overlap
|
||||||
(!min_len || (length >= min_len)) &&
|
(!min_len || (length >= min_len)) &&
|
||||||
@ -1151,7 +1151,7 @@ int obi_ecopcr(const char* i_dms_name,
|
|||||||
length = 0;
|
length = 0;
|
||||||
if (posj > posi)
|
if (posj > posi)
|
||||||
length = posj - posi + 1 - o2->patlen - o1->patlen; /* - o1->patlen : deleted by <EC> (prior to the OBITools3) */
|
length = posj - posi + 1 - o2->patlen - o1->patlen; /* - o1->patlen : deleted by <EC> (prior to the OBITools3) */
|
||||||
if (posj < posi)
|
else if (circular > 0)
|
||||||
length = posj + apatseq->seqlen - posi - o1->patlen - o2->patlen;
|
length = posj + apatseq->seqlen - posi - o1->patlen - o2->patlen;
|
||||||
if ((length>0) && // For when primers touch or overlap
|
if ((length>0) && // For when primers touch or overlap
|
||||||
(!min_len || (length >= min_len)) &&
|
(!min_len || (length >= min_len)) &&
|
||||||
@ -1232,7 +1232,7 @@ int obi_ecopcr(const char* i_dms_name,
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr,"\rDone : 100 %% ");
|
fprintf(stderr,"\rDone : 100 %% \n");
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -455,7 +455,7 @@ int obi_ecotag(const char* dms_name,
|
|||||||
|
|
||||||
for (i=0; i < query_count; i++)
|
for (i=0; i < query_count; i++)
|
||||||
{
|
{
|
||||||
if (i%100 == 0)
|
if (i%1000 == 0)
|
||||||
fprintf(stderr,"\rDone : %f %% ", (i / (float) query_count)*100);
|
fprintf(stderr,"\rDone : %f %% ", (i / (float) query_count)*100);
|
||||||
|
|
||||||
best_match_count = 0;
|
best_match_count = 0;
|
||||||
@ -562,7 +562,7 @@ int obi_ecotag(const char* dms_name,
|
|||||||
score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);
|
score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);
|
||||||
|
|
||||||
k = 0;
|
k = 0;
|
||||||
while ((k < lca_array_length) && (score_array[k] >= ecotag_threshold))
|
while ((k < lca_array_length) && (score_array[k] >= best_score))
|
||||||
k++;
|
k++;
|
||||||
|
|
||||||
if (k>0)
|
if (k>0)
|
||||||
@ -570,12 +570,12 @@ int obi_ecotag(const char* dms_name,
|
|||||||
lca_array = obi_get_array_with_col_p_in_view(ref_view, lca_taxid_a_column, best_match_idx, &lca_array_length);
|
lca_array = obi_get_array_with_col_p_in_view(ref_view, lca_taxid_a_column, best_match_idx, &lca_array_length);
|
||||||
if (j>0)
|
if (j>0)
|
||||||
{
|
{
|
||||||
lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid);
|
// lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid);
|
||||||
if (lca == NULL)
|
// if (lca == NULL)
|
||||||
{
|
// {
|
||||||
obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment");
|
// obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment");
|
||||||
return -1;
|
// return -1;
|
||||||
}
|
// }
|
||||||
lca_in_array = obi_taxo_get_taxon_with_taxid(taxonomy, lca_array[k-1]);
|
lca_in_array = obi_taxo_get_taxon_with_taxid(taxonomy, lca_array[k-1]);
|
||||||
if (lca_in_array == NULL)
|
if (lca_in_array == NULL)
|
||||||
{
|
{
|
||||||
|
@ -648,7 +648,7 @@ int truncate_avl_data_to_size_used(OBIDMS_avl_data_p avl_data) // TODO is it nec
|
|||||||
new_data_size = ((index_t) multiple) * getpagesize();
|
new_data_size = ((index_t) multiple) * getpagesize();
|
||||||
|
|
||||||
// Check that it is actually greater than the current size of the file, otherwise no need to truncate
|
// Check that it is actually greater than the current size of the file, otherwise no need to truncate
|
||||||
if ((avl_data->header)->data_size_max == new_data_size)
|
if ((avl_data->header)->data_size_max >= new_data_size)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
// Get the file descriptor
|
// Get the file descriptor
|
||||||
@ -667,7 +667,7 @@ int truncate_avl_data_to_size_used(OBIDMS_avl_data_p avl_data) // TODO is it nec
|
|||||||
if (ftruncate(file_descriptor, file_size) < 0)
|
if (ftruncate(file_descriptor, file_size) < 0)
|
||||||
{
|
{
|
||||||
obi_set_errno(OBI_AVL_ERROR);
|
obi_set_errno(OBI_AVL_ERROR);
|
||||||
obidebug(1, "\nError truncating an AVL data file");
|
obidebug(1, "\nError truncating an AVL data file, old data size = %lld, new data size = %lld", (avl_data->header)->data_size_max, new_data_size);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1974,6 +1974,10 @@ int obi_enlarge_column(OBIDMS_column_p column)
|
|||||||
// Calculate the new file size
|
// Calculate the new file size
|
||||||
old_line_count = (column->header)->line_count;
|
old_line_count = (column->header)->line_count;
|
||||||
new_line_count = ceil((double) old_line_count * (double) COLUMN_GROWTH_FACTOR);
|
new_line_count = ceil((double) old_line_count * (double) COLUMN_GROWTH_FACTOR);
|
||||||
|
if (new_line_count > old_line_count+100000)
|
||||||
|
new_line_count = old_line_count+100000;
|
||||||
|
else if (new_line_count < old_line_count+1000)
|
||||||
|
new_line_count = old_line_count+1000;
|
||||||
|
|
||||||
if (new_line_count > MAXIMUM_LINE_COUNT)
|
if (new_line_count > MAXIMUM_LINE_COUNT)
|
||||||
{
|
{
|
||||||
|
@ -34,7 +34,7 @@
|
|||||||
#define NB_ELTS_MAX_IF_DEFAULT_NAME (1000000) /**< The maximum number of elements per line if the default element names
|
#define NB_ELTS_MAX_IF_DEFAULT_NAME (1000000) /**< The maximum number of elements per line if the default element names
|
||||||
* are used ("0\01\02\0...\0n"), considering ELEMENTS_NAMES_MAX. // TODO not up to date
|
* are used ("0\01\02\0...\0n"), considering ELEMENTS_NAMES_MAX. // TODO not up to date
|
||||||
*/
|
*/
|
||||||
#define COLUMN_GROWTH_FACTOR (1.3) /**< The growth factor when a column is enlarged.
|
#define COLUMN_GROWTH_FACTOR (2) /**< The growth factor when a column is enlarged.
|
||||||
*/
|
*/
|
||||||
#define MAXIMUM_LINE_COUNT (1000000000) /**< The maximum line count for the data of a column (1E9). //TODO
|
#define MAXIMUM_LINE_COUNT (1000000000) /**< The maximum line count for the data of a column (1E9). //TODO
|
||||||
*/
|
*/
|
||||||
|
@ -1037,8 +1037,9 @@ static int finish_view(Obiview_p view)
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add count column if it's a NUC_SEQ_VIEW with no count column // TODO discuss
|
// Add count column if it's a NUC_SEQ_VIEW with no count column (and there's no MERGED_sample column) // TODO discuss
|
||||||
if ((!strcmp((view->infos)->view_type, VIEW_TYPE_NUC_SEQS)) && (!obi_view_column_exists(view, COUNT_COLUMN)))
|
if ((!strcmp((view->infos)->view_type, VIEW_TYPE_NUC_SEQS)) && (!obi_view_column_exists(view, COUNT_COLUMN))
|
||||||
|
&& (!obi_view_column_exists(view, "MERGED_sample"))) // TODO should eventually compute from merged samples?
|
||||||
{
|
{
|
||||||
if (obi_create_auto_count_column(view) < 0)
|
if (obi_create_auto_count_column(view) < 0)
|
||||||
{
|
{
|
||||||
|
@ -686,6 +686,9 @@ int calculateSizeToAllocate(int maxLen, int LCSmin)
|
|||||||
size *= 3;
|
size *= 3;
|
||||||
size += 16;
|
size += 16;
|
||||||
|
|
||||||
|
size += 10; // band-aid for memory bug I don't understand (triggered on specific db on ubuntu)
|
||||||
|
// bug might have to do with the way different systems behave when aligning the address in obi_get_memory_aligned_on_16
|
||||||
|
|
||||||
return(size*sizeof(int16_t));
|
return(size*sizeof(int16_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user