Compare commits
22 Commits
v3.0.0-bet
...
v3.0.0b21

Author | SHA1 | Date
---|---|---
 | faf8ea9d86 | 
 | ffe2485e94 | 
 | 6094ce2bbc | 
 | a7dcf16c06 | 
 | f13f8f6165 | 
 | b5a29ac413 | 
 | efd2b9d338 | 
 | ca6e3e7aad | 
 | 76ed8e18e5 | 
 | 1d17f28aec | 
 | fa834e4b8b | 
 | a72fea3cc9 | 
 | e9a37d8a6e | 
 | ef074f8455 | 
 | aec5e69f2c | 
 | 170ef3f1ba | 
 | f999946582 | 
 | 773b36ec37 | 
 | 69cb434a6c | 
 | 55d4f98d60 | 
 | 0bec2631e8 | 
 | e6b6c6fa84 | 
@@ -6,7 +6,7 @@ recursive-include doc/sphinx/source *.txt *.rst *.py
recursive-include doc/sphinx/sphinxext *.py
include doc/sphinx/Makefile
include doc/sphinx/Doxyfile
include README.txt
include README.md
include requirements.txt
include scripts/obi
@@ -13,7 +13,8 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
ID_COLUMN, \
DEFINITION_COLUMN, \
QUALITY_COLUMN, \
COUNT_COLUMN
COUNT_COLUMN, \
TAXID_COLUMN

import time
import math

@@ -175,8 +176,8 @@ def sequenceTaggerGenerator(config, taxo=None):
counter[0]+=1

for rank in annoteRank:
if 'taxid' in seq:
taxid = seq['taxid']
if TAXID_COLUMN in seq:
taxid = seq[TAXID_COLUMN]
if taxid is not None:
rtaxid = taxo.get_taxon_at_rank(taxid, rank)
if rtaxid is not None:
@@ -107,8 +107,8 @@ def run(config):
comments = View.print_config(config, "ecotag", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)

if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
tobytes(ref_dms_name), tobytes(ref_view_name), \
tobytes(taxo_dms_name), tobytes(taxonomy_name), \
ref_dms.name_with_full_path, tobytes(ref_view_name), \
taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
tobytes(o_view_name), comments,
config['ecotag']['threshold']) < 0:
raise Exception("Error running ecotag")
@@ -25,7 +25,8 @@ from obitools3.dms.capi.obiview cimport VIEW_TYPE_NUC_SEQS, \
DEFINITION_COLUMN, \
QUALITY_COLUMN, \
COUNT_COLUMN, \
TAXID_COLUMN
TAXID_COLUMN, \
MERGED_PREFIX

from obitools3.dms.capi.obidms cimport obi_import_view

@@ -72,7 +73,7 @@ def addOptions(parser):
action="store_true", dest="import:preread",
default=False,
help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
"a much faster import.")
"a much faster import. This option is not recommended and will slow down the import in any other case.")


def run(config):

@@ -163,7 +164,7 @@ def run(config):
taxo.write(taxo_name)
taxo.close()
o_dms.record_command_line(" ".join(sys.argv[1:]))
o_dms.close()
o_dms.close(force=True)
logger("info", "Done.")
return

@@ -217,11 +218,14 @@ def run(config):
logger("info", "Read %d entries", i)

for tag in entry :
newtag = tag
if tag[:7] == b"merged_":
newtag = MERGED_PREFIX+tag[7:]
if type(entry[tag]) == dict :
if tag in dict_dict:
dict_dict[tag][0].update(entry[tag].keys())
dict_dict[newtag][0].update(entry[tag].keys())
else:
dict_dict[tag] = [set(entry[tag].keys()), get_obitype(entry[tag])]
dict_dict[newtag] = [set(entry[tag].keys()), get_obitype(entry[tag])]
i+=1

if pb is not None:
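The pre-read pass shown above only collects, for each dict-valued tag, the union of its keys and an OBI type, so the column can be created once at its full width; legacy b"merged_" tags are renamed on the fly. A rough pure-Python sketch of that idea, using plain dicts instead of OBITools3 entries (the MERGED_PREFIX value below is an assumption, mirroring the constant imported in the diff):

```python
MERGED_PREFIX = b"MERGED_"   # assumed value of the imported constant

def collect_dict_keys(entries):
    dict_keys = {}
    for entry in entries:
        for tag, value in entry.items():
            # rename legacy b"merged_xxx" tags to b"MERGED_xxx", as in the new code
            newtag = MERGED_PREFIX + tag[7:] if tag[:7] == b"merged_" else tag
            if isinstance(value, dict):
                dict_keys.setdefault(newtag, set()).update(value.keys())
    return dict_keys

print(collect_dict_keys([{b"merged_sample": {b"s1": 2}},
                         {b"merged_sample": {b"s2": 5}}]))
# {b'MERGED_sample': {b's1', b's2'}}  (set order may vary)
```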
@@ -232,7 +236,7 @@ def run(config):
dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
nb_elements_per_line=len(dict_dict[tag][0]), \
elements_names=list(dict_dict[tag][0])), \
value_obitype)
dict_dict[tag][1])


# Reinitialize the input
@@ -265,123 +269,134 @@ def run(config):
pb(i)
elif not i%50000:
logger("info", "Imported %d entries", i)

if NUC_SEQS_view:
id_col[i] = entry.id
def_col[i] = entry.definition
seq_col[i] = entry.seq
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
if i == 0:
get_quality = QUALITY_COLUMN in entry

try:

if NUC_SEQS_view:
id_col[i] = entry.id
def_col[i] = entry.definition
seq_col[i] = entry.seq
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
if i == 0:
get_quality = QUALITY_COLUMN in entry
if get_quality:
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
qual_col = view[QUALITY_COLUMN]
if get_quality:
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
qual_col = view[QUALITY_COLUMN]
if get_quality:
qual_col[i] = entry.quality

for tag in entry :

if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty

value = entry[tag]
if tag == b"taxid":
tag = TAXID_COLUMN
if tag == b"count":
tag = COUNT_COLUMN

if tag not in dcols :

value_type = type(value)
nb_elts = 1
value_obitype = OBI_VOID

if value_type == dict or value_type == list :
nb_elts = len(value)
elt_names = list(value)
else :
nb_elts = 1
elt_names = None

value_obitype = get_obitype(value)

if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)

# Fill value
if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column
dcols[tag][0][i] = value

# TODO else log error?

else :

rewrite = False

# Check type adequation
old_type = dcols[tag][1]
new_type = OBI_VOID
new_type = update_obitype(old_type, value)
if old_type != new_type :
rewrite = True

try:
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
if type(value) == dict and \
dcols[tag][0].nb_elements_per_line == 1 and len(value.keys()) == 1 \
and dcols[tag][0].elements_names[0] != list(value.keys())[0] :
raise IndexError # trigger column rewrite
qual_col[i] = entry.quality

for tag in entry :

if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty

value = entry[tag]
if tag == b"taxid":
tag = TAXID_COLUMN
if tag == b"count":
tag = COUNT_COLUMN
if tag[:7] == b"merged_":
tag = MERGED_PREFIX+tag[7:]

# Fill value
dcols[tag][0][i] = value

except IndexError :

if tag not in dcols :

value_type = type(value)
old_column = dcols[tag][0]
old_nb_elements_per_line = old_column.nb_elements_per_line
new_nb_elements_per_line = 0
old_elements_names = old_column.elements_names
new_elements_names = None
nb_elts = 1
value_obitype = OBI_VOID

if value_type == dict or value_type == list :
nb_elts = len(value)
elt_names = list(value)
else :
nb_elts = 1
elt_names = None

value_obitype = get_obitype(value)

if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)

# Fill value
if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column
dcols[tag][0][i] = value

# TODO else log error?

#####################################################################

# Check the length and keys of column lines if needed
if value_type == dict : # Check dictionary keys
for k in value :
if k not in old_elements_names :
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
rewrite = True
break

elif value_type == list or value_type == tuple : # Check vector length
if old_nb_elements_per_line < len(value) :
new_nb_elements_per_line = len(value)
rewrite = True

#####################################################################

if rewrite :
if new_nb_elements_per_line == 0 and new_elements_names is not None :
new_nb_elements_per_line = len(new_elements_names)

# Reset obierrno
obi_errno = 0

dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
new_data_type=new_type,
new_nb_elements_per_line=new_nb_elements_per_line,
new_elements_names=new_elements_names,
rewrite_last_line=False),
new_type)

# Update the dictionary:
for t in dcols :
dcols[t] = (view[t], dcols[t][1])

else :

rewrite = False

# Check type adequation
old_type = dcols[tag][1]
new_type = OBI_VOID
new_type = update_obitype(old_type, value)
if old_type != new_type :
rewrite = True

try:
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
if type(value) == dict and \
dcols[tag][0].nb_elements_per_line == 1 \
and set(dcols[tag][0].elements_names) != set(value.keys()) :
raise IndexError # trigger column rewrite

# Fill value
dcols[tag][0][i] = value

except IndexError :

value_type = type(value)
old_column = dcols[tag][0]
old_nb_elements_per_line = old_column.nb_elements_per_line
new_nb_elements_per_line = 0
old_elements_names = old_column.elements_names
new_elements_names = None

#####################################################################

# Check the length and keys of column lines if needed
if value_type == dict : # Check dictionary keys
for k in value :
if k not in old_elements_names :
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
rewrite = True
break

elif value_type == list or value_type == tuple : # Check vector length
if old_nb_elements_per_line < len(value) :
new_nb_elements_per_line = len(value)
rewrite = True

#####################################################################

if rewrite :
if new_nb_elements_per_line == 0 and new_elements_names is not None :
new_nb_elements_per_line = len(new_elements_names)

# Reset obierrno
obi_errno = 0

dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
new_data_type=new_type,
new_nb_elements_per_line=new_nb_elements_per_line,
new_elements_names=new_elements_names,
rewrite_last_line=False),
new_type)

# Update the dictionary:
for t in dcols :
dcols[t] = (view[t], dcols[t][1])

# Fill value
dcols[tag][0][i] = value

except Exception as e:
print("\nCould not import sequence id:", entry.id, "(error raised:", e, ")")
if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
raise e
else:
pass

i+=1

if pb is not None:
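The try/except IndexError structure above is a grow-on-demand strategy: each value is written optimistically, and only when it no longer fits the existing column (a new dict key, a longer vector, or an incompatible type) is the column rewritten with wider attributes and the write retried. A much simplified sketch of the same pattern with ordinary Python lists, not OBIDMS columns:

```python
# Simplified sketch: a "column" that widens itself when a value does not fit,
# playing the role of the IndexError branch in the import loop above.
class GrowingColumn:
    def __init__(self):
        self.width = 1      # elements per line
        self.rows = []

    def append(self, value):
        values = list(value) if isinstance(value, (list, tuple)) else [value]
        if len(values) > self.width:
            # analogue of rewriting the column with more elements per line
            self.width = len(values)
        self.rows.append(values + [None] * (self.width - len(values)))

col = GrowingColumn()
col.append(7)
col.append([1, 2, 3])      # triggers the "rewrite" to 3 elements per line
print(col.width, col.rows) # 3 [[7], [1, 2, 3]]
```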
@@ -402,7 +417,7 @@ def run(config):
except AttributeError:
pass
try:
output[0].close()
output[0].close(force=True)
except AttributeError:
pass
@@ -42,8 +42,8 @@ def addOptions(parser):
metavar="<URI>",
type=str,
default=None,
help="URI to the view containing the samples definition (with tags, primers, sample names,...)"
"Warning: primer lengths must be less than or equal to 32")
help="URI to the view containing the samples definition (with tags, primers, sample names,...).\n"
"\nWarning: primer lengths must be less than or equal to 32")

group.add_argument('-R', '--reverse-reads',
action="store", dest="ngsfilter:reverse",
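The reworded help text keeps the same constraint: primer lengths must be at most 32. A tiny illustrative check of that limit (the primer sequences below are made-up examples, not taken from the diff):

```python
# Toy validation mirroring the warning in the help text above.
primers = [b"GGGCAATCCTGAGCCAA", b"CCATTGAGTCTCTGCACCTATC"]  # example primers only
for p in primers:
    if len(p) > 32:
        raise ValueError("primer %r is longer than 32 nucleotides" % p)
print("all primers are <= 32 nt")
```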
@@ -5,5 +5,5 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS


cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy)
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*, int max_elts=*)
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy, dict config)
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, dict config, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*, int max_elts=*)
@@ -56,7 +56,7 @@ def addOptions(parser):
"(option can be used several times).")


cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy, dict config) :

cdef int taxid
cdef Nuc_Seq_Stored seq

@@ -69,7 +69,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
cdef object gn_sn
cdef object fa_sn

# Create columns
# Create columns and save them for efficiency
if b"species" in o_view and o_view[b"species"].data_type_int != OBI_INT :
o_view.delete_column(b"species")
if b"species" not in o_view:

@@ -77,6 +77,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"species",
OBI_INT
)
species_column = o_view[b"species"]

if b"genus" in o_view and o_view[b"genus"].data_type_int != OBI_INT :
o_view.delete_column(b"genus")

@@ -85,6 +86,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"genus",
OBI_INT
)
genus_column = o_view[b"genus"]

if b"family" in o_view and o_view[b"family"].data_type_int != OBI_INT :
o_view.delete_column(b"family")

@@ -93,6 +95,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"family",
OBI_INT
)
family_column = o_view[b"family"]

if b"species_name" in o_view and o_view[b"species_name"].data_type_int != OBI_STR :
o_view.delete_column(b"species_name")

@@ -101,6 +104,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"species_name",
OBI_STR
)
species_name_column = o_view[b"species_name"]

if b"genus_name" in o_view and o_view[b"genus_name"].data_type_int != OBI_STR :
o_view.delete_column(b"genus_name")

@@ -109,6 +113,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"genus_name",
OBI_STR
)
genus_name_column = o_view[b"genus_name"]

if b"family_name" in o_view and o_view[b"family_name"].data_type_int != OBI_STR :
o_view.delete_column(b"family_name")

@@ -117,6 +122,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"family_name",
OBI_STR
)
family_name_column = o_view[b"family_name"]

if b"rank" in o_view and o_view[b"rank"].data_type_int != OBI_STR :
o_view.delete_column(b"rank")

@@ -125,6 +131,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"rank",
OBI_STR
)
rank_column = o_view[b"rank"]

if b"scientific_name" in o_view and o_view[b"scientific_name"].data_type_int != OBI_STR :
o_view.delete_column(b"scientific_name")

@@ -133,9 +140,15 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"scientific_name",
OBI_STR
)

for seq in o_view:
PyErr_CheckSignals()
scientific_name_column = o_view[b"scientific_name"]

# Initialize the progress bar
pb = ProgressBar(len(o_view), config, seconde=5)

i=0
for seq in o_view:
PyErr_CheckSignals()
pb(i)
if MERGED_TAXID_COLUMN in seq :
m_taxids = []
m_taxids_dict = seq[MERGED_TAXID_COLUMN]
@@ -165,20 +178,23 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
else:
fa_sn = None
tfa = None

seq[b"species"] = tsp
seq[b"genus"] = tgn
seq[b"family"] = tfa

seq[b"species_name"] = sp_sn
seq[b"genus_name"] = gn_sn
seq[b"family_name"] = fa_sn

seq[b"rank"] = taxonomy.get_rank(taxid)
seq[b"scientific_name"] = taxonomy.get_scientific_name(taxid)
species_column[i] = tsp
genus_column[i] = tgn
family_column[i] = tfa

species_name_column[i] = sp_sn
genus_name_column[i] = gn_sn
family_name_column[i] = fa_sn

rank_column[i] = taxonomy.get_rank(taxid)
scientific_name_column[i] = taxonomy.get_scientific_name(taxid)
i+=1

pb(len(o_view), force=True)



cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) :
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, dict config, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) :

cdef int i
cdef int k
@@ -187,6 +203,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
cdef int u_idx
cdef int i_idx
cdef int i_count
cdef int o_count
cdef str key_str
cdef bytes key
cdef bytes mkey

@@ -209,7 +226,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
cdef Nuc_Seq_Stored i_seq
cdef Nuc_Seq_Stored o_seq
cdef Nuc_Seq_Stored u_seq
cdef Column i_col
cdef Column i_seq_col
cdef Column i_id_col
cdef Column i_taxid_col

@@ -217,6 +233,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
cdef Column o_id_col
cdef Column o_taxid_dist_col
cdef Column o_merged_col
cdef Column o_count_col
cdef Column i_count_col
cdef Column_line i_mcol
cdef object taxid_dist_dict
cdef object iter_view
@@ -252,7 +270,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
mergedKeys_m = []
for k in range(k_count):
mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k])


# Check that not trying to remerge without total count information
for key in mergedKeys_m:
if key in view and COUNT_COLUMN not in view:
raise Exception("\n>>>>\nError: trying to re-merge tags without total count tag. Run obi annotate to add the count tag from the relevant merged tag, i.e.: \nobi annotate --set-tag COUNT:'sum([value for key,value in sequence['MERGED_sample'].items()])' dms/input dms/output\n")

if categories is None:
categories = []
@@ -320,7 +343,11 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
for k in range(k_count):
key = mergedKeys[k]
merged_col_name = mergedKeys_m[k]
i_col = view[key]

if merged_col_name in view:
i_col = view[merged_col_name]
else:
i_col = view[key]

if merged_infos[merged_col_name]['nb_elts'] > max_elts:
str_merged_cols.append(merged_col_name)
@@ -374,12 +401,19 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
alias=MERGED_COLUMN
)

# Keep columns that are going to be used a lot in variables
# Keep columns in variables for efficiency
o_id_col = o_view[ID_COLUMN]
if TAXID_DIST_COLUMN in o_view:
o_taxid_dist_col = o_view[TAXID_DIST_COLUMN]
if MERGED_COLUMN in o_view:
o_merged_col = o_view[MERGED_COLUMN]
if COUNT_COLUMN not in o_view:
Column.new_column(o_view,
COUNT_COLUMN,
OBI_INT)
o_count_col = o_view[COUNT_COLUMN]
if COUNT_COLUMN in view:
i_count_col = view[COUNT_COLUMN]

pb(len(view), force=True)
print("")
@@ -407,7 +441,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
merged_list = list(set(merged_list)) # deduplicate the list
o_merged_col[o_idx] = merged_list

o_seq[COUNT_COLUMN] = 0
o_count = 0

if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None:
taxid_dist_dict = i_taxid_dist_col[u_idx]
@@ -423,12 +457,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
i_id = i_id_col[i_idx]
i_seq = view[i_idx]

if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None:
if COUNT_COLUMN not in i_seq or i_count_col[i_idx] is None:
i_count = 1
else:
i_count = i_seq[COUNT_COLUMN]
i_count = i_count_col[i_idx]

o_seq[COUNT_COLUMN] += i_count
o_count += i_count

for k in range(k_count):
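The count handling above reads the input count through the cached i_count_col, treats a missing or None count as 1, and accumulates into o_count instead of writing the output column on every iteration. A minimal pure-Python sketch of that accumulation, assuming plain dicts rather than OBITools3 views:

```python
# Sketch of the count merging: identical sequences are merged and their counts
# summed, with a missing or None count treated as a single read.
def merged_counts(entries):
    totals = {}
    for entry in entries:
        count = entry.get("count") or 1
        totals[entry["seq"]] = totals.get(entry["seq"], 0) + count
    return totals

print(merged_counts([{"seq": "ACGT", "count": 3},
                     {"seq": "ACGT"},
                     {"seq": "GGTA", "count": None}]))
# {'ACGT': 4, 'GGTA': 1}
```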
@@ -463,44 +497,52 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
mcol[key2] = i_mcol[key2]
else:
mcol[key2] = mcol[key2] + i_mcol[key2]

# Write taxid_dist
if mergeIds and TAXID_COLUMN in mergedKeys:
if TAXID_DIST_COLUMN in str_merged_cols:
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
else:
o_taxid_dist_col[o_idx] = taxid_dist_dict

# Write merged dicts
for mkey in merged_dict:
if mkey in str_merged_cols:
mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
else:
mkey_cols[mkey][o_idx] = merged_dict[mkey]
# Sets NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools
#for key in mkey_cols[mkey][o_idx]:
#    if mkey_cols[mkey][o_idx][key] is None:
#        mkey_cols[mkey][o_idx][key] = 0


for key in i_seq.keys():
# Delete informations that differ between the merged sequences
# TODO make special columns list?
# TODO make special columns list? // could be more efficient
if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \
and key not in merged_dict :
o_seq[key] = None

# Write merged dicts
for mkey in merged_dict:
if mkey in str_merged_cols:
mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
else:
mkey_cols[mkey][o_idx] = merged_dict[mkey]
# Sets NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools
#for key in mkey_cols[mkey][o_idx]:
#    if mkey_cols[mkey][o_idx][key] is None:
#        mkey_cols[mkey][o_idx][key] = 0

# Write taxid_dist
if mergeIds and TAXID_COLUMN in mergedKeys:
if TAXID_DIST_COLUMN in str_merged_cols:
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
else:
o_taxid_dist_col[o_idx] = taxid_dist_dict

o_count_col[o_idx] = o_count
o_idx += 1

pb(len(uniques), force=True)

# Deletes quality columns if there is one because the matching between sequence and quality will be broken (quality set to NA when sequence not)
if QUALITY_COLUMN in view:
o_view.delete_column(QUALITY_COLUMN)
if REVERSE_QUALITY_COLUMN in view:
o_view.delete_column(REVERSE_QUALITY_COLUMN)

# Delete old columns that are now merged
for k in range(k_count):
if mergedKeys[k] in o_view:
o_view.delete_column(mergedKeys[k])

if taxonomy is not None:
print("") # TODO because in the middle of progress bar. Better solution?
logger("info", "Merging taxonomy classification")
merge_taxonomy_classification(o_view, taxonomy)
merge_taxonomy_classification(o_view, taxonomy, config)
@@ -547,11 +589,10 @@ def run(config):
pb = ProgressBar(len(entries), config, seconde=5)

try:
uniq_sequences(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])
uniq_sequences(entries, o_view, pb, config, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])
except Exception, e:
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)

pb(len(entries), force=True)
print("", file=sys.stderr)

# Save command config in View and DMS comments
@@ -567,8 +608,8 @@ def run(config):
#print("\n\nOutput view:\n````````````", file=sys.stderr)
#print(repr(o_view), file=sys.stderr)

input[0].close()
output[0].close()
input[0].close(force=True)
output[0].close(force=True)

logger("info", "Done.")
@@ -63,6 +63,8 @@ cdef extern from "obidmscolumn.h" nogil:

char* obi_get_elements_names(OBIDMS_column_p column)

char* obi_column_formatted_infos(OBIDMS_column_p column)

index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name)

int obi_column_write_comments(OBIDMS_column_p column, const char* comments)
@@ -14,6 +14,7 @@ from ..capi.obidms cimport obi_import_column
from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \
obi_close_column, \
obi_get_elements_names, \
obi_column_formatted_infos, \
obi_column_write_comments

from ..capi.obiutils cimport obi_format_date

@@ -38,7 +39,7 @@ from obitools3.utils cimport tobytes, \

from obitools3.dms.column import typed_column

from libc.stdlib cimport free
from libc.stdlib cimport free

import importlib
import inspect
@@ -288,9 +289,15 @@ cdef class Column(OBIWrapper) :
@OBIWrapper.checkIsActive
def __repr__(self) :
cdef bytes s
#cdef char* s_b
#cdef str s_str
#s_b = obi_column_formatted_infos(self.pointer())
#s_str = bytes2str(s_b)
#free(s_b)
s = self._alias + b", data type: " + self.data_type
#return s_str
return bytes2str(s)



def close(self): # TODO discuss, can't be called bc then bug when closing view that tries to close it in C
@@ -531,8 +531,8 @@ cdef class View(OBIWrapper) :
for level in self.view_history:
command_list = [level[input][b"command_line"] for input in level.keys()]
for command in command_list:
s+=b"obi "
s+=command
s+=b"\n"
return s
@@ -177,7 +177,7 @@ def emblIterator_dir(dir_path,
for filename in files:
if read==only:
return
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
f = uopen(filename)
if only is not None:
only_f = only-read
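The only change in this hunk is the progress message: read_files starts at zero, so printing read_files+1 makes the counter one-based ("1/N" for the first file instead of "0/N"). A toy illustration, using enumerate rather than the iterator's own counter:

```python
files = ["a.dat", "b.dat"]
for read_files, name in enumerate(files):
    print("Parsing file %s (%d/%d)" % (name, read_files + 1, len(files)))
# Parsing file a.dat (1/2)
# Parsing file b.dat (2/2)
```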
@@ -104,6 +104,7 @@ def fastaNucIterator(lineiterator,
cdef bytes sequence
cdef int skipped, ionly, read
cdef Nuc_Seq seq
cdef bint stop

if only is None:
ionly = -1

@@ -130,7 +131,8 @@ def fastaNucIterator(lineiterator,
else:
line = firstline

while True:
stop=False
while not stop:

if ionly >= 0 and read >= ionly:
break

@@ -153,7 +155,7 @@ def fastaNucIterator(lineiterator,
s.append(line[0:-1])
line = next(iterator)
except StopIteration:
pass
stop=True

sequence = b"".join(s)
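The stop flag introduced above records that the line iterator is exhausted, so the loop can finish the record it is building and then exit cleanly instead of iterating again. A minimal sketch of the same pattern (not the obitools3 implementation) on a plain list of FASTA lines:

```python
# Minimal sketch of the stop-flag pattern: when the input runs out, the current
# record is still emitted and the outer loop terminates.
def fasta_records(lines):
    it = iter(lines)
    line = next(it)              # assumes the input starts with a '>' header line
    stop = False
    while not stop:
        header = line[1:].strip()
        chunks = []
        try:
            line = next(it)
            while not line.startswith(">"):
                chunks.append(line.strip())
                line = next(it)
        except StopIteration:
            stop = True          # input exhausted: yield the last record, then stop
        yield header, "".join(chunks)

print(list(fasta_records([">a", "ACGT", "TT", ">b", "GGG"])))
# [('a', 'ACGTTT'), ('b', 'GGG')]
```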
@@ -25,8 +25,9 @@ from libc.string cimport strcpy, strlen
_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)

_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
_seqMatcher = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M)
_cleanSeq = re.compile(b'[ \n0-9]+')
_seqMatcher = re.compile(b'ORIGIN.+(?=//\n)', re.DOTALL + re.M)
_cleanSeq1 = re.compile(b'ORIGIN.+\n')
_cleanSeq2 = re.compile(b'[ \n0-9]+')
_acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M)
_deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M)
_cleanDe = re.compile(b'\n *')

@@ -42,7 +43,8 @@ def genbankParser(bytes text):
ft = _featureMatcher.search(text).group()

s = _seqMatcher.search(text).group()
s = _cleanSeq.sub(b'', s).upper()
s = _cleanSeq1.sub(b'', s)
s = _cleanSeq2.sub(b'', s)

acs = _acMatcher.search(text).group()
acs = acs.split()
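With the new expressions, the ORIGIN keyword is captured by _seqMatcher and then stripped in two steps: _cleanSeq1 drops the ORIGIN line itself and _cleanSeq2 removes spaces, digits and newlines from the sequence block. A small self-contained demonstration on a made-up GenBank fragment, with the three regexes copied from the diff:

```python
import re

_seqMatcher = re.compile(b'ORIGIN.+(?=//\n)', re.DOTALL + re.M)
_cleanSeq1 = re.compile(b'ORIGIN.+\n')
_cleanSeq2 = re.compile(b'[ \n0-9]+')

# Made-up fragment of a GenBank record (illustration only).
record = (b"ORIGIN      \n"
          b"        1 acgtacgtac gtacgtacgt\n"
          b"       21 ttttaaaacc\n"
          b"//\n")

s = _seqMatcher.search(record).group()
s = _cleanSeq1.sub(b'', s)   # remove the "ORIGIN ..." line
s = _cleanSeq2.sub(b'', s)   # strip coordinates, spaces and newlines
print(s)                     # b'acgtacgtacgtacgtacgtttttaaaacc'
```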
@@ -51,23 +53,23 @@ def genbankParser(bytes text):

de = _deMatcher.search(header).group()
de = _cleanDe.sub(b' ',de).strip().strip(b'.')


tags = {}
extractTaxon(ft, tags)

seq = Nuc_Seq(ac,
s,
definition=de,
quality=None,
offset=-1,
tags=tags)

except Exception as e:
print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
# Do not raise any Exception if you need the possibility to resume the generator
# (Python generators can't resume after any exception is raised)
return None

tags = {}
extractTaxon(ft, tags)

seq = Nuc_Seq(ac,
s,
definition=de,
quality=None,
offset=-1,
tags=tags)


return seq
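The comment kept above is worth spelling out: genbankParser returns None on failure instead of raising, because the surrounding reader is a generator, and a Python generator that lets an exception escape is finished for good. A short, self-contained demonstration of that behaviour:

```python
def records():
    for i in range(3):
        if i == 1:
            raise ValueError("broken record")   # escapes the generator
        yield i

g = records()
print(next(g))               # 0
try:
    next(g)                  # raises ValueError
except ValueError:
    pass
print(next(g, "exhausted"))  # "exhausted": the generator cannot be resumed
```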
@@ -171,10 +173,12 @@ def genbankIterator_dir(dir_path,
read = 0
read_files = 0
files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))]
files.extend([filename for filename in glob.glob(os.path.join(path, b'*.seq*'))]) # new genbank extension
files = list(set(files))
for filename in files:
if read==only:
return
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
f = uopen(filename)
if only is not None:
only_f = only-read
@@ -1,5 +1,5 @@
major = 3
minor = 0
serial= '0-beta15'
serial= '0b21'

version ="%d.%02d.%s" % (major,minor,serial)
version ="%d.%d.%s" % (major,minor,serial)
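The serial string switches to a setuptools-friendly pre-release form ('0b21' instead of '0-beta15'), and the format string drops the zero padding on the minor number. The difference in the rendered version string:

```python
major, minor, serial = 3, 0, '0b21'
print("%d.%02d.%s" % (major, minor, serial))  # 3.00.0b21  (old format string)
print("%d.%d.%s" % (major, minor, serial))    # 3.0.0b21   (new format string)
```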
requirements.txt | 5 | Executable file

@@ -0,0 +1,5 @@
--extra-index-url https://pypi.python.org/simple/
Cython>=0.24
Sphinx>=1.2.0
ipython>=3.0.0
breathe>=4.0.0
setup.py | 16

@@ -5,8 +5,9 @@ import re
import subprocess

from distutils import log
from distutils.core import setup

#from distutils.core import setup
from setuptools import setup # to work with pip

from distutils.core import Extension
from distutils.sysconfig import get_python_lib

@@ -88,9 +89,10 @@ PACKAGE = "OBITools3"
VERSION = version
AUTHOR = 'Celine Mercier'
EMAIL = 'celine.mercier@metabarcoding.org'
URL = "http://metabarcoding.org/obitools3"
URL = "https://metabarcoding.org/obitools3"
PLATFORMS = "posix"
LICENSE = "CeCILL-V2"
DESCRIPTION = "Tools and library for DNA metabarcoding",
DESCRIPTION = "A package for the management of analyses and data in DNA metabarcoding."
PYTHONMIN = '3.5'

SRC = 'python'

@@ -147,12 +149,18 @@ classifiers=['Development Status :: 4 - Beta',
'Topic :: Utilities',
]

with open("README.md", "r") as fh:
long_description = fh.read()

setup(name=PACKAGE,
description=DESCRIPTION,
long_description=long_description,
long_description_content_type="text/markdown",
classifiers=classifiers,
version=VERSION,
author=AUTHOR,
author_email=EMAIL,
platforms=PLATFORMS,
license=LICENSE,
url=URL,
ext_modules=xx,
@@ -648,7 +648,7 @@ int truncate_avl_data_to_size_used(OBIDMS_avl_data_p avl_data) // TODO is it nec
new_data_size = ((index_t) multiple) * getpagesize();

// Check that it is actually greater than the current size of the file, otherwise no need to truncate
if ((avl_data->header)->data_size_max == new_data_size)
if ((avl_data->header)->data_size_max >= new_data_size)
return 0;

// Get the file descriptor

@@ -667,7 +667,7 @@ int truncate_avl_data_to_size_used(OBIDMS_avl_data_p avl_data) // TODO is it nec
if (ftruncate(file_descriptor, file_size) < 0)
{
obi_set_errno(OBI_AVL_ERROR);
obidebug(1, "\nError truncating an AVL data file");
obidebug(1, "\nError truncating an AVL data file, old data size = %lld, new data size = %lld", (avl_data->header)->data_size_max, new_data_size);
return -1;
}
@@ -1037,8 +1037,9 @@ static int finish_view(Obiview_p view)
return -1;
}

// Add count column if it's a NUC_SEQ_VIEW with no count column // TODO discuss
if ((!strcmp((view->infos)->view_type, VIEW_TYPE_NUC_SEQS)) && (!obi_view_column_exists(view, COUNT_COLUMN)))
// Add count column if it's a NUC_SEQ_VIEW with no count column (and there's no MERGED_sample column) // TODO discuss
if ((!strcmp((view->infos)->view_type, VIEW_TYPE_NUC_SEQS)) && (!obi_view_column_exists(view, COUNT_COLUMN))
&& (!obi_view_column_exists(view, "MERGED_sample"))) // TODO should eventually compute from merged samples?
{
if (obi_create_auto_count_column(view) < 0)
{