Compare commits

...

19 Commits

Author SHA1 Message Date
a7dcf16c06 Minor changes for pip release 2020-05-20 15:59:04 +02:00
f13f8f6165 obi import: minor doc/display improvements 2020-05-20 11:46:29 +02:00
b5a29ac413 Switch to version 3.0.0b19 2020-05-20 10:29:36 +02:00
efd2b9d338 Cleaner installation 2020-05-20 10:29:12 +02:00
ca6e3e7aad obi import: fixed to work with seq genbank extension 2020-05-20 10:28:14 +02:00
76ed8e18e5 Switch to version 3.0.0b18 with version formatting that fits setuptools 2020-05-18 17:08:55 +02:00
1d17f28aec setup: now using setuptools instead of distutils to work with pip 2020-05-18 17:08:09 +02:00
fa834e4b8b obi import: small bug fix 2020-05-18 17:06:58 +02:00
a72fea3cc9 Python: fasta parser: fixed a bug stopping the program when the last
line contained a single nucleotide
2020-05-12 11:24:12 +02:00
e9a37d8a6e Switch to version 3.0.0-beta16 2020-05-07 17:09:26 +02:00
ef074f8455 typo 2020-05-07 17:08:59 +02:00
aec5e69f2c C, views: no more automatic COUNT column if MERGED_sample column exists 2020-05-07 17:08:07 +02:00
170ef3f1ba Views: added obi prefix to commands in bash history 2020-05-07 17:07:01 +02:00
f999946582 obi uniq: fixed the remerging of already merged informations, and
efficiency improvements
2020-05-07 17:05:54 +02:00
773b36ec37 obi import: fixed the import of old obitools files with premerged
informations, and other minor improvements
2020-05-07 17:03:04 +02:00
69cb434a6c version 3.0.0-beta15c 2020-04-29 14:25:33 +02:00
55d4f98d60 obi annotate: fixed annotation at ranks 2020-04-29 14:24:40 +02:00
0bec2631e8 ecotag: fixed a bug where all the full DMS path weren't properly sent to
the C layer
2020-04-29 10:35:55 +02:00
e6b6c6fa84 AVLs: Made an error message more informative 2020-04-29 10:14:04 +02:00
18 changed files with 161 additions and 86 deletions

View File

@ -6,7 +6,7 @@ recursive-include doc/sphinx/source *.txt *.rst *.py
recursive-include doc/sphinx/sphinxext *.py recursive-include doc/sphinx/sphinxext *.py
include doc/sphinx/Makefile include doc/sphinx/Makefile
include doc/sphinx/Doxyfile include doc/sphinx/Doxyfile
include README.txt include README.md
include requirements.txt include requirements.txt
include scripts/obi include scripts/obi

View File

@ -13,7 +13,8 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
ID_COLUMN, \ ID_COLUMN, \
DEFINITION_COLUMN, \ DEFINITION_COLUMN, \
QUALITY_COLUMN, \ QUALITY_COLUMN, \
COUNT_COLUMN COUNT_COLUMN, \
TAXID_COLUMN
import time import time
import math import math
@ -175,8 +176,8 @@ def sequenceTaggerGenerator(config, taxo=None):
counter[0]+=1 counter[0]+=1
for rank in annoteRank: for rank in annoteRank:
if 'taxid' in seq: if TAXID_COLUMN in seq:
taxid = seq['taxid'] taxid = seq[TAXID_COLUMN]
if taxid is not None: if taxid is not None:
rtaxid = taxo.get_taxon_at_rank(taxid, rank) rtaxid = taxo.get_taxon_at_rank(taxid, rank)
if rtaxid is not None: if rtaxid is not None:

View File

@ -107,8 +107,8 @@ def run(config):
comments = View.print_config(config, "ecotag", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name) comments = View.print_config(config, "ecotag", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \ if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
tobytes(ref_dms_name), tobytes(ref_view_name), \ ref_dms.name_with_full_path, tobytes(ref_view_name), \
tobytes(taxo_dms_name), tobytes(taxonomy_name), \ taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
tobytes(o_view_name), comments, tobytes(o_view_name), comments,
config['ecotag']['threshold']) < 0: config['ecotag']['threshold']) < 0:
raise Exception("Error running ecotag") raise Exception("Error running ecotag")

View File

@ -25,7 +25,8 @@ from obitools3.dms.capi.obiview cimport VIEW_TYPE_NUC_SEQS, \
DEFINITION_COLUMN, \ DEFINITION_COLUMN, \
QUALITY_COLUMN, \ QUALITY_COLUMN, \
COUNT_COLUMN, \ COUNT_COLUMN, \
TAXID_COLUMN TAXID_COLUMN, \
MERGED_PREFIX
from obitools3.dms.capi.obidms cimport obi_import_view from obitools3.dms.capi.obidms cimport obi_import_view
@ -72,7 +73,7 @@ def addOptions(parser):
action="store_true", dest="import:preread", action="store_true", dest="import:preread",
default=False, default=False,
help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for " help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
"a much faster import.") "a much faster import. This option is not recommended and will slow down the import in any other case.")
def run(config): def run(config):
@ -163,7 +164,7 @@ def run(config):
taxo.write(taxo_name) taxo.write(taxo_name)
taxo.close() taxo.close()
o_dms.record_command_line(" ".join(sys.argv[1:])) o_dms.record_command_line(" ".join(sys.argv[1:]))
o_dms.close() o_dms.close(force=True)
logger("info", "Done.") logger("info", "Done.")
return return
@ -217,11 +218,14 @@ def run(config):
logger("info", "Read %d entries", i) logger("info", "Read %d entries", i)
for tag in entry : for tag in entry :
newtag = tag
if tag[:7] == b"merged_":
newtag = MERGED_PREFIX+tag[7:]
if type(entry[tag]) == dict : if type(entry[tag]) == dict :
if tag in dict_dict: if tag in dict_dict:
dict_dict[tag][0].update(entry[tag].keys()) dict_dict[newtag][0].update(entry[tag].keys())
else: else:
dict_dict[tag] = [set(entry[tag].keys()), get_obitype(entry[tag])] dict_dict[newtag] = [set(entry[tag].keys()), get_obitype(entry[tag])]
i+=1 i+=1
if pb is not None: if pb is not None:
@ -232,7 +236,7 @@ def run(config):
dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \ dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
nb_elements_per_line=len(dict_dict[tag][0]), \ nb_elements_per_line=len(dict_dict[tag][0]), \
elements_names=list(dict_dict[tag][0])), \ elements_names=list(dict_dict[tag][0])), \
value_obitype) dict_dict[tag][1])
# Reinitialize the input # Reinitialize the input
@ -288,6 +292,8 @@ def run(config):
tag = TAXID_COLUMN tag = TAXID_COLUMN
if tag == b"count": if tag == b"count":
tag = COUNT_COLUMN tag = COUNT_COLUMN
if tag[:7] == b"merged_":
tag = MERGED_PREFIX+tag[7:]
if tag not in dcols : if tag not in dcols :
@ -328,8 +334,8 @@ def run(config):
try: try:
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key # Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
if type(value) == dict and \ if type(value) == dict and \
dcols[tag][0].nb_elements_per_line == 1 and len(value.keys()) == 1 \ dcols[tag][0].nb_elements_per_line == 1 \
and dcols[tag][0].elements_names[0] != list(value.keys())[0] : and set(dcols[tag][0].elements_names) != set(value.keys()) :
raise IndexError # trigger column rewrite raise IndexError # trigger column rewrite
# Fill value # Fill value
@ -402,7 +408,7 @@ def run(config):
except AttributeError: except AttributeError:
pass pass
try: try:
output[0].close() output[0].close(force=True)
except AttributeError: except AttributeError:
pass pass

View File

@ -42,8 +42,8 @@ def addOptions(parser):
metavar="<URI>", metavar="<URI>",
type=str, type=str,
default=None, default=None,
help="URI to the view containing the samples definition (with tags, primers, sample names,...)" help="URI to the view containing the samples definition (with tags, primers, sample names,...).\n"
"Warning: primer lengths must be less than or equal to 32") "\nWarning: primer lengths must be less than or equal to 32")
group.add_argument('-R', '--reverse-reads', group.add_argument('-R', '--reverse-reads',
action="store", dest="ngsfilter:reverse", action="store", dest="ngsfilter:reverse",

View File

@ -5,5 +5,5 @@ from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy, dict config)
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*, int max_elts=*) cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, dict config, list mergedKeys_list=*, Taxonomy taxonomy=*, bint mergeIds=*, list categories=*, int max_elts=*)

View File

@ -56,7 +56,7 @@ def addOptions(parser):
"(option can be used several times).") "(option can be used several times).")
cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) : cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy, dict config) :
cdef int taxid cdef int taxid
cdef Nuc_Seq_Stored seq cdef Nuc_Seq_Stored seq
@ -69,7 +69,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
cdef object gn_sn cdef object gn_sn
cdef object fa_sn cdef object fa_sn
# Create columns # Create columns and save them for efficiency
if b"species" in o_view and o_view[b"species"].data_type_int != OBI_INT : if b"species" in o_view and o_view[b"species"].data_type_int != OBI_INT :
o_view.delete_column(b"species") o_view.delete_column(b"species")
if b"species" not in o_view: if b"species" not in o_view:
@ -77,6 +77,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"species", b"species",
OBI_INT OBI_INT
) )
species_column = o_view[b"species"]
if b"genus" in o_view and o_view[b"genus"].data_type_int != OBI_INT : if b"genus" in o_view and o_view[b"genus"].data_type_int != OBI_INT :
o_view.delete_column(b"genus") o_view.delete_column(b"genus")
@ -85,6 +86,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"genus", b"genus",
OBI_INT OBI_INT
) )
genus_column = o_view[b"genus"]
if b"family" in o_view and o_view[b"family"].data_type_int != OBI_INT : if b"family" in o_view and o_view[b"family"].data_type_int != OBI_INT :
o_view.delete_column(b"family") o_view.delete_column(b"family")
@ -93,6 +95,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"family", b"family",
OBI_INT OBI_INT
) )
family_column = o_view[b"family"]
if b"species_name" in o_view and o_view[b"species_name"].data_type_int != OBI_STR : if b"species_name" in o_view and o_view[b"species_name"].data_type_int != OBI_STR :
o_view.delete_column(b"species_name") o_view.delete_column(b"species_name")
@ -101,6 +104,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"species_name", b"species_name",
OBI_STR OBI_STR
) )
species_name_column = o_view[b"species_name"]
if b"genus_name" in o_view and o_view[b"genus_name"].data_type_int != OBI_STR : if b"genus_name" in o_view and o_view[b"genus_name"].data_type_int != OBI_STR :
o_view.delete_column(b"genus_name") o_view.delete_column(b"genus_name")
@ -109,6 +113,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"genus_name", b"genus_name",
OBI_STR OBI_STR
) )
genus_name_column = o_view[b"genus_name"]
if b"family_name" in o_view and o_view[b"family_name"].data_type_int != OBI_STR : if b"family_name" in o_view and o_view[b"family_name"].data_type_int != OBI_STR :
o_view.delete_column(b"family_name") o_view.delete_column(b"family_name")
@ -117,6 +122,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"family_name", b"family_name",
OBI_STR OBI_STR
) )
family_name_column = o_view[b"family_name"]
if b"rank" in o_view and o_view[b"rank"].data_type_int != OBI_STR : if b"rank" in o_view and o_view[b"rank"].data_type_int != OBI_STR :
o_view.delete_column(b"rank") o_view.delete_column(b"rank")
@ -125,6 +131,7 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"rank", b"rank",
OBI_STR OBI_STR
) )
rank_column = o_view[b"rank"]
if b"scientific_name" in o_view and o_view[b"scientific_name"].data_type_int != OBI_STR : if b"scientific_name" in o_view and o_view[b"scientific_name"].data_type_int != OBI_STR :
o_view.delete_column(b"scientific_name") o_view.delete_column(b"scientific_name")
@ -133,9 +140,15 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
b"scientific_name", b"scientific_name",
OBI_STR OBI_STR
) )
scientific_name_column = o_view[b"scientific_name"]
# Initialize the progress bar
pb = ProgressBar(len(o_view), config, seconde=5)
i=0
for seq in o_view: for seq in o_view:
PyErr_CheckSignals() PyErr_CheckSignals()
pb(i)
if MERGED_TAXID_COLUMN in seq : if MERGED_TAXID_COLUMN in seq :
m_taxids = [] m_taxids = []
m_taxids_dict = seq[MERGED_TAXID_COLUMN] m_taxids_dict = seq[MERGED_TAXID_COLUMN]
@ -166,19 +179,22 @@ cdef merge_taxonomy_classification(View_NUC_SEQS o_view, Taxonomy taxonomy) :
fa_sn = None fa_sn = None
tfa = None tfa = None
seq[b"species"] = tsp species_column[i] = tsp
seq[b"genus"] = tgn genus_column[i] = tgn
seq[b"family"] = tfa family_column[i] = tfa
seq[b"species_name"] = sp_sn species_name_column[i] = sp_sn
seq[b"genus_name"] = gn_sn genus_name_column[i] = gn_sn
seq[b"family_name"] = fa_sn family_name_column[i] = fa_sn
seq[b"rank"] = taxonomy.get_rank(taxid) rank_column[i] = taxonomy.get_rank(taxid)
seq[b"scientific_name"] = taxonomy.get_scientific_name(taxid) scientific_name_column[i] = taxonomy.get_scientific_name(taxid)
i+=1
pb(len(o_view), force=True)
cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) : cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, dict config, list mergedKeys_list=None, Taxonomy taxonomy=None, bint mergeIds=False, list categories=None, int max_elts=1000000) :
cdef int i cdef int i
cdef int k cdef int k
@ -187,6 +203,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
cdef int u_idx cdef int u_idx
cdef int i_idx cdef int i_idx
cdef int i_count cdef int i_count
cdef int o_count
cdef str key_str cdef str key_str
cdef bytes key cdef bytes key
cdef bytes mkey cdef bytes mkey
@ -209,7 +226,6 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
cdef Nuc_Seq_Stored i_seq cdef Nuc_Seq_Stored i_seq
cdef Nuc_Seq_Stored o_seq cdef Nuc_Seq_Stored o_seq
cdef Nuc_Seq_Stored u_seq cdef Nuc_Seq_Stored u_seq
cdef Column i_col
cdef Column i_seq_col cdef Column i_seq_col
cdef Column i_id_col cdef Column i_id_col
cdef Column i_taxid_col cdef Column i_taxid_col
@ -217,6 +233,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
cdef Column o_id_col cdef Column o_id_col
cdef Column o_taxid_dist_col cdef Column o_taxid_dist_col
cdef Column o_merged_col cdef Column o_merged_col
cdef Column o_count_col
cdef Column i_count_col
cdef Column_line i_mcol cdef Column_line i_mcol
cdef object taxid_dist_dict cdef object taxid_dist_dict
cdef object iter_view cdef object iter_view
@ -253,6 +271,11 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
for k in range(k_count): for k in range(k_count):
mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k]) mergedKeys_m.append(MERGED_PREFIX + mergedKeys[k])
# Check that not trying to remerge without total count information
for key in mergedKeys_m:
if key in view and COUNT_COLUMN not in view:
raise Exception("\n>>>>\nError: trying to re-merge tags without total count tag. Run obi annotate to add the count tag from the relevant merged tag, i.e.: \nobi annotate --set-tag COUNT:'sum([value for key,value in sequence['MERGED_sample'].items()])' dms/input dms/output\n")
if categories is None: if categories is None:
categories = [] categories = []
@ -320,7 +343,11 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
for k in range(k_count): for k in range(k_count):
key = mergedKeys[k] key = mergedKeys[k]
merged_col_name = mergedKeys_m[k] merged_col_name = mergedKeys_m[k]
i_col = view[key]
if merged_col_name in view:
i_col = view[merged_col_name]
else:
i_col = view[key]
if merged_infos[merged_col_name]['nb_elts'] > max_elts: if merged_infos[merged_col_name]['nb_elts'] > max_elts:
str_merged_cols.append(merged_col_name) str_merged_cols.append(merged_col_name)
@ -374,12 +401,19 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
alias=MERGED_COLUMN alias=MERGED_COLUMN
) )
# Keep columns that are going to be used a lot in variables # Keep columns in variables for efficiency
o_id_col = o_view[ID_COLUMN] o_id_col = o_view[ID_COLUMN]
if TAXID_DIST_COLUMN in o_view: if TAXID_DIST_COLUMN in o_view:
o_taxid_dist_col = o_view[TAXID_DIST_COLUMN] o_taxid_dist_col = o_view[TAXID_DIST_COLUMN]
if MERGED_COLUMN in o_view: if MERGED_COLUMN in o_view:
o_merged_col = o_view[MERGED_COLUMN] o_merged_col = o_view[MERGED_COLUMN]
if COUNT_COLUMN not in o_view:
Column.new_column(o_view,
COUNT_COLUMN,
OBI_INT)
o_count_col = o_view[COUNT_COLUMN]
if COUNT_COLUMN in view:
i_count_col = view[COUNT_COLUMN]
pb(len(view), force=True) pb(len(view), force=True)
print("") print("")
@ -407,7 +441,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
merged_list = list(set(merged_list)) # deduplicate the list merged_list = list(set(merged_list)) # deduplicate the list
o_merged_col[o_idx] = merged_list o_merged_col[o_idx] = merged_list
o_seq[COUNT_COLUMN] = 0 o_count = 0
if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None: if TAXID_DIST_COLUMN in u_seq and i_taxid_dist_col[u_idx] is not None:
taxid_dist_dict = i_taxid_dist_col[u_idx] taxid_dist_dict = i_taxid_dist_col[u_idx]
@ -423,12 +457,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
i_id = i_id_col[i_idx] i_id = i_id_col[i_idx]
i_seq = view[i_idx] i_seq = view[i_idx]
if COUNT_COLUMN not in i_seq or i_seq[COUNT_COLUMN] is None: if COUNT_COLUMN not in i_seq or i_count_col[i_idx] is None:
i_count = 1 i_count = 1
else: else:
i_count = i_seq[COUNT_COLUMN] i_count = i_count_col[i_idx]
o_seq[COUNT_COLUMN] += i_count o_count += i_count
for k in range(k_count): for k in range(k_count):
@ -464,43 +498,51 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
else: else:
mcol[key2] = mcol[key2] + i_mcol[key2] mcol[key2] = mcol[key2] + i_mcol[key2]
# Write taxid_dist
if mergeIds and TAXID_COLUMN in mergedKeys:
if TAXID_DIST_COLUMN in str_merged_cols:
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
else:
o_taxid_dist_col[o_idx] = taxid_dist_dict
# Write merged dicts
for mkey in merged_dict:
if mkey in str_merged_cols:
mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
else:
mkey_cols[mkey][o_idx] = merged_dict[mkey]
# Sets NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools
#for key in mkey_cols[mkey][o_idx]:
# if mkey_cols[mkey][o_idx][key] is None:
# mkey_cols[mkey][o_idx][key] = 0
for key in i_seq.keys(): for key in i_seq.keys():
# Delete informations that differ between the merged sequences # Delete informations that differ between the merged sequences
# TODO make special columns list? # TODO make special columns list? // could be more efficient
if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \ if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \
and key not in merged_dict : and key not in merged_dict :
o_seq[key] = None o_seq[key] = None
# Write merged dicts
for mkey in merged_dict:
if mkey in str_merged_cols:
mkey_cols[mkey][o_idx] = str(merged_dict[mkey])
else:
mkey_cols[mkey][o_idx] = merged_dict[mkey]
# Sets NA values to 0 # TODO discuss, for now keep as None and test for None instead of testing for 0 in tools
#for key in mkey_cols[mkey][o_idx]:
# if mkey_cols[mkey][o_idx][key] is None:
# mkey_cols[mkey][o_idx][key] = 0
# Write taxid_dist
if mergeIds and TAXID_COLUMN in mergedKeys:
if TAXID_DIST_COLUMN in str_merged_cols:
o_taxid_dist_col[o_idx] = str(taxid_dist_dict)
else:
o_taxid_dist_col[o_idx] = taxid_dist_dict
o_count_col[o_idx] = o_count
o_idx += 1 o_idx += 1
pb(len(uniques), force=True)
# Deletes quality columns if there is one because the matching between sequence and quality will be broken (quality set to NA when sequence not) # Deletes quality columns if there is one because the matching between sequence and quality will be broken (quality set to NA when sequence not)
if QUALITY_COLUMN in view: if QUALITY_COLUMN in view:
o_view.delete_column(QUALITY_COLUMN) o_view.delete_column(QUALITY_COLUMN)
if REVERSE_QUALITY_COLUMN in view: if REVERSE_QUALITY_COLUMN in view:
o_view.delete_column(REVERSE_QUALITY_COLUMN) o_view.delete_column(REVERSE_QUALITY_COLUMN)
# Delete old columns that are now merged
for k in range(k_count):
if mergedKeys[k] in o_view:
o_view.delete_column(mergedKeys[k])
if taxonomy is not None: if taxonomy is not None:
print("") # TODO because in the middle of progress bar. Better solution? print("") # TODO because in the middle of progress bar. Better solution?
logger("info", "Merging taxonomy classification") logger("info", "Merging taxonomy classification")
merge_taxonomy_classification(o_view, taxonomy) merge_taxonomy_classification(o_view, taxonomy, config)
@ -547,11 +589,10 @@ def run(config):
pb = ProgressBar(len(entries), config, seconde=5) pb = ProgressBar(len(entries), config, seconde=5)
try: try:
uniq_sequences(entries, o_view, pb, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts']) uniq_sequences(entries, o_view, pb, config, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])
except Exception, e: except Exception, e:
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view) raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
pb(len(entries), force=True)
print("", file=sys.stderr) print("", file=sys.stderr)
# Save command config in View and DMS comments # Save command config in View and DMS comments
@ -567,8 +608,8 @@ def run(config):
#print("\n\nOutput view:\n````````````", file=sys.stderr) #print("\n\nOutput view:\n````````````", file=sys.stderr)
#print(repr(o_view), file=sys.stderr) #print(repr(o_view), file=sys.stderr)
input[0].close() input[0].close(force=True)
output[0].close() output[0].close(force=True)
logger("info", "Done.") logger("info", "Done.")

View File

@ -63,6 +63,8 @@ cdef extern from "obidmscolumn.h" nogil:
char* obi_get_elements_names(OBIDMS_column_p column) char* obi_get_elements_names(OBIDMS_column_p column)
char* obi_column_formatted_infos(OBIDMS_column_p column)
index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name) index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name)
int obi_column_write_comments(OBIDMS_column_p column, const char* comments) int obi_column_write_comments(OBIDMS_column_p column, const char* comments)

View File

@ -14,6 +14,7 @@ from ..capi.obidms cimport obi_import_column
from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \ from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \
obi_close_column, \ obi_close_column, \
obi_get_elements_names, \ obi_get_elements_names, \
obi_column_formatted_infos, \
obi_column_write_comments obi_column_write_comments
from ..capi.obiutils cimport obi_format_date from ..capi.obiutils cimport obi_format_date
@ -38,7 +39,7 @@ from obitools3.utils cimport tobytes, \
from obitools3.dms.column import typed_column from obitools3.dms.column import typed_column
from libc.stdlib cimport free from libc.stdlib cimport free
import importlib import importlib
import inspect import inspect
@ -288,7 +289,13 @@ cdef class Column(OBIWrapper) :
@OBIWrapper.checkIsActive @OBIWrapper.checkIsActive
def __repr__(self) : def __repr__(self) :
cdef bytes s cdef bytes s
#cdef char* s_b
#cdef str s_str
#s_b = obi_column_formatted_infos(self.pointer())
#s_str = bytes2str(s_b)
#free(s_b)
s = self._alias + b", data type: " + self.data_type s = self._alias + b", data type: " + self.data_type
#return s_str
return bytes2str(s) return bytes2str(s)

View File

@ -531,8 +531,8 @@ cdef class View(OBIWrapper) :
for level in self.view_history: for level in self.view_history:
command_list = [level[input][b"command_line"] for input in level.keys()] command_list = [level[input][b"command_line"] for input in level.keys()]
for command in command_list: for command in command_list:
s+=b"obi "
s+=command s+=command
s+=b"\n"
return s return s

View File

@ -177,7 +177,7 @@ def emblIterator_dir(dir_path,
for filename in files: for filename in files:
if read==only: if read==only:
return return
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files))) print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
f = uopen(filename) f = uopen(filename)
if only is not None: if only is not None:
only_f = only-read only_f = only-read

View File

@ -104,6 +104,7 @@ def fastaNucIterator(lineiterator,
cdef bytes sequence cdef bytes sequence
cdef int skipped, ionly, read cdef int skipped, ionly, read
cdef Nuc_Seq seq cdef Nuc_Seq seq
cdef bint stop
if only is None: if only is None:
ionly = -1 ionly = -1
@ -130,7 +131,8 @@ def fastaNucIterator(lineiterator,
else: else:
line = firstline line = firstline
while True: stop=False
while not stop:
if ionly >= 0 and read >= ionly: if ionly >= 0 and read >= ionly:
break break
@ -153,7 +155,7 @@ def fastaNucIterator(lineiterator,
s.append(line[0:-1]) s.append(line[0:-1])
line = next(iterator) line = next(iterator)
except StopIteration: except StopIteration:
pass stop=True
sequence = b"".join(s) sequence = b"".join(s)

View File

@ -171,10 +171,12 @@ def genbankIterator_dir(dir_path,
read = 0 read = 0
read_files = 0 read_files = 0
files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))] files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))]
files.extend([filename for filename in glob.glob(os.path.join(path, b'*.seq*'))]) # new genbank extension
files = list(set(files))
for filename in files: for filename in files:
if read==only: if read==only:
return return
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files))) print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
f = uopen(filename) f = uopen(filename)
if only is not None: if only is not None:
only_f = only-read only_f = only-read

View File

@ -1,5 +1,5 @@
major = 3 major = 3
minor = 0 minor = 0
serial= '0-beta15' serial= '0b20'
version ="%d.%02d.%s" % (major,minor,serial) version ="%d.%d.%s" % (major,minor,serial)

5
requirements.txt Executable file
View File

@ -0,0 +1,5 @@
--extra-index-url https://pypi.python.org/simple/
Cython>=0.24
Sphinx>=1.2.0
ipython>=3.0.0
breathe>=4.0.0

View File

@ -5,7 +5,8 @@ import re
import subprocess import subprocess
from distutils import log from distutils import log
from distutils.core import setup #from distutils.core import setup
from setuptools import setup # to work with pip
from distutils.core import Extension from distutils.core import Extension
from distutils.sysconfig import get_python_lib from distutils.sysconfig import get_python_lib
@ -88,9 +89,10 @@ PACKAGE = "OBITools3"
VERSION = version VERSION = version
AUTHOR = 'Celine Mercier' AUTHOR = 'Celine Mercier'
EMAIL = 'celine.mercier@metabarcoding.org' EMAIL = 'celine.mercier@metabarcoding.org'
URL = "http://metabarcoding.org/obitools3" URL = "https://metabarcoding.org/obitools3"
PLATFORMS = "posix"
LICENSE = "CeCILL-V2" LICENSE = "CeCILL-V2"
DESCRIPTION = "Tools and library for DNA metabarcoding", DESCRIPTION = "A package for the management of analyses and data in DNA metabarcoding."
PYTHONMIN = '3.5' PYTHONMIN = '3.5'
SRC = 'python' SRC = 'python'
@ -147,12 +149,18 @@ classifiers=['Development Status :: 4 - Beta',
'Topic :: Utilities', 'Topic :: Utilities',
] ]
with open("README.md", "r") as fh:
long_description = fh.read()
setup(name=PACKAGE, setup(name=PACKAGE,
description=DESCRIPTION, description=DESCRIPTION,
long_description=long_description,
long_description_content_type="text/markdown",
classifiers=classifiers, classifiers=classifiers,
version=VERSION, version=VERSION,
author=AUTHOR, author=AUTHOR,
author_email=EMAIL, author_email=EMAIL,
platforms=PLATFORMS,
license=LICENSE, license=LICENSE,
url=URL, url=URL,
ext_modules=xx, ext_modules=xx,

View File

@ -648,7 +648,7 @@ int truncate_avl_data_to_size_used(OBIDMS_avl_data_p avl_data) // TODO is it nec
new_data_size = ((index_t) multiple) * getpagesize(); new_data_size = ((index_t) multiple) * getpagesize();
// Check that it is actually greater than the current size of the file, otherwise no need to truncate // Check that it is actually greater than the current size of the file, otherwise no need to truncate
if ((avl_data->header)->data_size_max == new_data_size) if ((avl_data->header)->data_size_max >= new_data_size)
return 0; return 0;
// Get the file descriptor // Get the file descriptor
@ -667,7 +667,7 @@ int truncate_avl_data_to_size_used(OBIDMS_avl_data_p avl_data) // TODO is it nec
if (ftruncate(file_descriptor, file_size) < 0) if (ftruncate(file_descriptor, file_size) < 0)
{ {
obi_set_errno(OBI_AVL_ERROR); obi_set_errno(OBI_AVL_ERROR);
obidebug(1, "\nError truncating an AVL data file"); obidebug(1, "\nError truncating an AVL data file, old data size = %lld, new data size = %lld", (avl_data->header)->data_size_max, new_data_size);
return -1; return -1;
} }

View File

@ -1037,8 +1037,9 @@ static int finish_view(Obiview_p view)
return -1; return -1;
} }
// Add count column if it's a NUC_SEQ_VIEW with no count column // TODO discuss // Add count column if it's a NUC_SEQ_VIEW with no count column (and there's no MERGED_sample column) // TODO discuss
if ((!strcmp((view->infos)->view_type, VIEW_TYPE_NUC_SEQS)) && (!obi_view_column_exists(view, COUNT_COLUMN))) if ((!strcmp((view->infos)->view_type, VIEW_TYPE_NUC_SEQS)) && (!obi_view_column_exists(view, COUNT_COLUMN))
&& (!obi_view_column_exists(view, "MERGED_sample"))) // TODO should eventually compute from merged samples?
{ {
if (obi_create_auto_count_column(view) < 0) if (obi_create_auto_count_column(view) < 0)
{ {