Compare commits
30 Commits
v3.0.0-bet
...
v3.0.0b25
Author | SHA1 | Date | |
---|---|---|---|
aa01236cae | |||
49b8810a76 | |||
7a39df54c0 | |||
09e483b0d6 | |||
14a2579173 | |||
36a8aaa92e | |||
a17eb445c2 | |||
e4a32788c2 | |||
2442cc80bf | |||
aa836b2ace | |||
8776ce22e6 | |||
4aa772c405 | |||
b0b96ac37a | |||
687e42ad22 | |||
5fbbb6d304 | |||
359a9fe237 | |||
f9b6851f75 | |||
29a2652bbf | |||
2a2c233936 | |||
faf8ea9d86 | |||
ffe2485e94 | |||
6094ce2bbc | |||
a7dcf16c06 | |||
f13f8f6165 | |||
b5a29ac413 | |||
efd2b9d338 | |||
ca6e3e7aad | |||
76ed8e18e5 | |||
1d17f28aec | |||
fa834e4b8b |
@ -6,7 +6,7 @@ recursive-include doc/sphinx/source *.txt *.rst *.py
|
||||
recursive-include doc/sphinx/sphinxext *.py
|
||||
include doc/sphinx/Makefile
|
||||
include doc/sphinx/Doxyfile
|
||||
include README.txt
|
||||
include README.md
|
||||
include requirements.txt
|
||||
include scripts/obi
|
||||
|
||||
|
@ -35,12 +35,14 @@ def addOptions(parser):
|
||||
action="store", dest="ecopcr:primer1",
|
||||
metavar='<PRIMER>',
|
||||
type=str,
|
||||
required=True,
|
||||
help="Forward primer, length must be less than or equal to 32")
|
||||
|
||||
group.add_argument('--primer2', '-R',
|
||||
action="store", dest="ecopcr:primer2",
|
||||
metavar='<PRIMER>',
|
||||
type=str,
|
||||
required=True,
|
||||
help="Reverse primer, length must be less than or equal to 32")
|
||||
|
||||
group.add_argument('--error', '-e',
|
||||
|
@ -161,8 +161,7 @@ def obi_eval(compiled_expr, loc_env, line):
|
||||
return obi_eval_result
|
||||
|
||||
|
||||
def Filter_generator(options, tax_filter):
|
||||
#taxfilter = taxonomyFilterGenerator(options)
|
||||
def Filter_generator(options, tax_filter, i_view):
|
||||
|
||||
# Initialize conditions
|
||||
predicates = None
|
||||
@ -171,6 +170,9 @@ def Filter_generator(options, tax_filter):
|
||||
attributes = None
|
||||
if "attributes" in options and len(options["attributes"]) > 0:
|
||||
attributes = options["attributes"]
|
||||
for attribute in attributes:
|
||||
if attribute not in i_view:
|
||||
return None
|
||||
lmax = None
|
||||
if "lmax" in options:
|
||||
lmax = options["lmax"]
|
||||
@ -196,6 +198,8 @@ def Filter_generator(options, tax_filter):
|
||||
if "attribute_patterns" in options and len(options["attribute_patterns"]) > 0:
|
||||
for p in options["attribute_patterns"]:
|
||||
attribute, pattern = p.split(":", 1)
|
||||
if attribute not in i_view:
|
||||
return None
|
||||
attribute_patterns[tobytes(attribute)] = re.compile(tobytes(pattern))
|
||||
|
||||
def filter(line, loc_env):
|
||||
@ -324,21 +328,29 @@ def run(config):
|
||||
|
||||
# Apply filter
|
||||
tax_filter = Taxonomy_filter_generator(taxo, config["grep"])
|
||||
filter = Filter_generator(config["grep"], tax_filter)
|
||||
filter = Filter_generator(config["grep"], tax_filter, i_view)
|
||||
selection = Line_selection(i_view)
|
||||
for i in range(len(i_view)):
|
||||
PyErr_CheckSignals()
|
||||
pb(i)
|
||||
line = i_view[i]
|
||||
|
||||
loc_env = {"sequence": line, "line": line, "taxonomy": taxo, "obi_eval_result": False}
|
||||
|
||||
good = filter(line, loc_env)
|
||||
|
||||
if good :
|
||||
selection.append(i)
|
||||
|
||||
pb(i, force=True)
|
||||
|
||||
if filter is None and config["grep"]["invert_selection"]: # all sequences are selected: filter is None if no line will be selected because some columns don't exist
|
||||
for i in range(len(i_view)):
|
||||
PyErr_CheckSignals()
|
||||
pb(i)
|
||||
selection.append(i)
|
||||
|
||||
elif filter is not None : # filter is None if no line will be selected because some columns don't exist
|
||||
for i in range(len(i_view)):
|
||||
PyErr_CheckSignals()
|
||||
pb(i)
|
||||
line = i_view[i]
|
||||
|
||||
loc_env = {"sequence": line, "line": line, "taxonomy": taxo, "obi_eval_result": False}
|
||||
|
||||
good = filter(line, loc_env)
|
||||
|
||||
if good :
|
||||
selection.append(i)
|
||||
|
||||
pb(len(i_view), force=True)
|
||||
print("", file=sys.stderr)
|
||||
|
||||
# Create output view with the line selection
|
||||
|
@ -73,7 +73,7 @@ def addOptions(parser):
|
||||
action="store_true", dest="import:preread",
|
||||
default=False,
|
||||
help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
|
||||
"a much faster import.")
|
||||
"a much faster import. This option is not recommended and will slow down the import in any other case.")
|
||||
|
||||
|
||||
def run(config):
|
||||
@ -236,7 +236,7 @@ def run(config):
|
||||
dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
|
||||
nb_elements_per_line=len(dict_dict[tag][0]), \
|
||||
elements_names=list(dict_dict[tag][0])), \
|
||||
value_obitype)
|
||||
dict_dict[tag][1])
|
||||
|
||||
|
||||
# Reinitialize the input
|
||||
@ -260,7 +260,6 @@ def run(config):
|
||||
|
||||
if entry is None: # error or exception handled at lower level, not raised because Python generators can't resume after any exception is raised
|
||||
if config['obi']['skiperror']:
|
||||
i-=1
|
||||
continue
|
||||
else:
|
||||
raise RollbackException("obi import error, rollbacking view", view)
|
||||
@ -269,125 +268,134 @@ def run(config):
|
||||
pb(i)
|
||||
elif not i%50000:
|
||||
logger("info", "Imported %d entries", i)
|
||||
|
||||
if NUC_SEQS_view:
|
||||
id_col[i] = entry.id
|
||||
def_col[i] = entry.definition
|
||||
seq_col[i] = entry.seq
|
||||
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
|
||||
if i == 0:
|
||||
get_quality = QUALITY_COLUMN in entry
|
||||
|
||||
try:
|
||||
|
||||
if NUC_SEQS_view:
|
||||
id_col[i] = entry.id
|
||||
def_col[i] = entry.definition
|
||||
seq_col[i] = entry.seq
|
||||
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
|
||||
if i == 0:
|
||||
get_quality = QUALITY_COLUMN in entry
|
||||
if get_quality:
|
||||
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
|
||||
qual_col = view[QUALITY_COLUMN]
|
||||
if get_quality:
|
||||
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
|
||||
qual_col = view[QUALITY_COLUMN]
|
||||
if get_quality:
|
||||
qual_col[i] = entry.quality
|
||||
|
||||
for tag in entry :
|
||||
|
||||
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
|
||||
|
||||
value = entry[tag]
|
||||
if tag == b"taxid":
|
||||
tag = TAXID_COLUMN
|
||||
if tag == b"count":
|
||||
tag = COUNT_COLUMN
|
||||
if tag[:7] == b"merged_":
|
||||
tag = MERGED_PREFIX+tag[7:]
|
||||
|
||||
if tag not in dcols :
|
||||
|
||||
value_type = type(value)
|
||||
nb_elts = 1
|
||||
value_obitype = OBI_VOID
|
||||
|
||||
if value_type == dict or value_type == list :
|
||||
nb_elts = len(value)
|
||||
elt_names = list(value)
|
||||
else :
|
||||
nb_elts = 1
|
||||
elt_names = None
|
||||
|
||||
value_obitype = get_obitype(value)
|
||||
|
||||
if value_obitype != OBI_VOID :
|
||||
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
|
||||
|
||||
# Fill value
|
||||
if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
|
||||
value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
# TODO else log error?
|
||||
|
||||
else :
|
||||
|
||||
rewrite = False
|
||||
|
||||
# Check type adequation
|
||||
old_type = dcols[tag][1]
|
||||
new_type = OBI_VOID
|
||||
new_type = update_obitype(old_type, value)
|
||||
if old_type != new_type :
|
||||
rewrite = True
|
||||
|
||||
try:
|
||||
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
|
||||
if type(value) == dict and \
|
||||
dcols[tag][0].nb_elements_per_line == 1 \
|
||||
and set(dcols[tag][0].elements_names) != set(value.keys()) :
|
||||
raise IndexError # trigger column rewrite
|
||||
qual_col[i] = entry.quality
|
||||
|
||||
for tag in entry :
|
||||
|
||||
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
|
||||
|
||||
value = entry[tag]
|
||||
if tag == b"taxid":
|
||||
tag = TAXID_COLUMN
|
||||
if tag == b"count":
|
||||
tag = COUNT_COLUMN
|
||||
if tag[:7] == b"merged_":
|
||||
tag = MERGED_PREFIX+tag[7:]
|
||||
|
||||
# Fill value
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
except IndexError :
|
||||
|
||||
if tag not in dcols :
|
||||
|
||||
value_type = type(value)
|
||||
old_column = dcols[tag][0]
|
||||
old_nb_elements_per_line = old_column.nb_elements_per_line
|
||||
new_nb_elements_per_line = 0
|
||||
old_elements_names = old_column.elements_names
|
||||
new_elements_names = None
|
||||
nb_elts = 1
|
||||
value_obitype = OBI_VOID
|
||||
|
||||
if value_type == dict or value_type == list :
|
||||
nb_elts = len(value)
|
||||
elt_names = list(value)
|
||||
else :
|
||||
nb_elts = 1
|
||||
elt_names = None
|
||||
|
||||
value_obitype = get_obitype(value)
|
||||
|
||||
if value_obitype != OBI_VOID :
|
||||
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
|
||||
|
||||
# Fill value
|
||||
if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
|
||||
value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
# TODO else log error?
|
||||
|
||||
#####################################################################
|
||||
|
||||
# Check the length and keys of column lines if needed
|
||||
if value_type == dict : # Check dictionary keys
|
||||
for k in value :
|
||||
if k not in old_elements_names :
|
||||
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
|
||||
rewrite = True
|
||||
break
|
||||
|
||||
elif value_type == list or value_type == tuple : # Check vector length
|
||||
if old_nb_elements_per_line < len(value) :
|
||||
new_nb_elements_per_line = len(value)
|
||||
rewrite = True
|
||||
|
||||
#####################################################################
|
||||
|
||||
if rewrite :
|
||||
if new_nb_elements_per_line == 0 and new_elements_names is not None :
|
||||
new_nb_elements_per_line = len(new_elements_names)
|
||||
|
||||
# Reset obierrno
|
||||
obi_errno = 0
|
||||
|
||||
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
|
||||
new_data_type=new_type,
|
||||
new_nb_elements_per_line=new_nb_elements_per_line,
|
||||
new_elements_names=new_elements_names,
|
||||
rewrite_last_line=False),
|
||||
new_type)
|
||||
|
||||
# Update the dictionary:
|
||||
for t in dcols :
|
||||
dcols[t] = (view[t], dcols[t][1])
|
||||
|
||||
else :
|
||||
|
||||
rewrite = False
|
||||
|
||||
# Check type adequation
|
||||
old_type = dcols[tag][1]
|
||||
new_type = OBI_VOID
|
||||
new_type = update_obitype(old_type, value)
|
||||
if old_type != new_type :
|
||||
rewrite = True
|
||||
|
||||
try:
|
||||
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
|
||||
if type(value) == dict and \
|
||||
dcols[tag][0].nb_elements_per_line == 1 \
|
||||
and set(dcols[tag][0].elements_names) != set(value.keys()) :
|
||||
raise IndexError # trigger column rewrite
|
||||
|
||||
# Fill value
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
|
||||
except IndexError :
|
||||
|
||||
value_type = type(value)
|
||||
old_column = dcols[tag][0]
|
||||
old_nb_elements_per_line = old_column.nb_elements_per_line
|
||||
new_nb_elements_per_line = 0
|
||||
old_elements_names = old_column.elements_names
|
||||
new_elements_names = None
|
||||
|
||||
#####################################################################
|
||||
|
||||
# Check the length and keys of column lines if needed
|
||||
if value_type == dict : # Check dictionary keys
|
||||
for k in value :
|
||||
if k not in old_elements_names :
|
||||
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
|
||||
rewrite = True
|
||||
break
|
||||
|
||||
elif value_type == list or value_type == tuple : # Check vector length
|
||||
if old_nb_elements_per_line < len(value) :
|
||||
new_nb_elements_per_line = len(value)
|
||||
rewrite = True
|
||||
|
||||
#####################################################################
|
||||
|
||||
if rewrite :
|
||||
if new_nb_elements_per_line == 0 and new_elements_names is not None :
|
||||
new_nb_elements_per_line = len(new_elements_names)
|
||||
|
||||
# Reset obierrno
|
||||
obi_errno = 0
|
||||
|
||||
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
|
||||
new_data_type=new_type,
|
||||
new_nb_elements_per_line=new_nb_elements_per_line,
|
||||
new_elements_names=new_elements_names,
|
||||
rewrite_last_line=False),
|
||||
new_type)
|
||||
|
||||
# Update the dictionary:
|
||||
for t in dcols :
|
||||
dcols[t] = (view[t], dcols[t][1])
|
||||
|
||||
# Fill value
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
except Exception as e:
|
||||
print("\nCould not import sequence id:", entry.id, "(error raised:", e, ")")
|
||||
if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
|
||||
raise e
|
||||
else:
|
||||
pass
|
||||
|
||||
i+=1
|
||||
|
||||
if pb is not None:
|
||||
|
@ -34,9 +34,10 @@ def run(config):
|
||||
if input[2] == DMS and not config['ls']['longformat']:
|
||||
dms = input[0]
|
||||
l = []
|
||||
for view in input[0]:
|
||||
l.append(tostr(view) + "\t(Date created: " + str(bytes2str_object(dms[view].comments["Date created"]))+")")
|
||||
dms[view].close()
|
||||
for viewname in input[0]:
|
||||
view = dms[viewname]
|
||||
l.append(tostr(viewname) + "\t(Date created: " + str(bytes2str_object(view.comments["Date created"]))+")")
|
||||
view.close()
|
||||
l.sort()
|
||||
for v in l:
|
||||
print(v)
|
||||
|
@ -42,6 +42,7 @@ def addOptions(parser):
|
||||
metavar="<URI>",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="URI to the view containing the samples definition (with tags, primers, sample names,...).\n"
|
||||
"\nWarning: primer lengths must be less than or equal to 32")
|
||||
|
||||
@ -505,7 +506,7 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
|
||||
sample=None
|
||||
|
||||
if sample is None:
|
||||
sequences[0][b'error']=b"No tags found"
|
||||
sequences[0][b'error']=b"No sample with that tag combination"
|
||||
return False, sequences[0]
|
||||
|
||||
sequences[0].update(sample)
|
||||
|
@ -419,12 +419,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
|
||||
print("")
|
||||
logger("info", "Second browsing through the input")
|
||||
# Initialize the progress bar
|
||||
pb = ProgressBar(len(uniques), seconde=5)
|
||||
pb = ProgressBar(len(view), seconde=5)
|
||||
o_idx = 0
|
||||
total_treated = 0
|
||||
|
||||
for unique_id in uniques :
|
||||
PyErr_CheckSignals()
|
||||
pb(o_idx)
|
||||
|
||||
merged_sequences = uniques[unique_id]
|
||||
|
||||
@ -453,7 +453,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
|
||||
merged_dict[mkey] = {}
|
||||
|
||||
for i_idx in merged_sequences:
|
||||
|
||||
pb(total_treated)
|
||||
|
||||
i_id = i_id_col[i_idx]
|
||||
i_seq = view[i_idx]
|
||||
|
||||
@ -504,7 +505,9 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
|
||||
if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \
|
||||
and key not in merged_dict :
|
||||
o_seq[key] = None
|
||||
|
||||
|
||||
total_treated += 1
|
||||
|
||||
# Write merged dicts
|
||||
for mkey in merged_dict:
|
||||
if mkey in str_merged_cols:
|
||||
@ -526,7 +529,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
|
||||
o_count_col[o_idx] = o_count
|
||||
o_idx += 1
|
||||
|
||||
pb(len(uniques), force=True)
|
||||
pb(len(view), force=True)
|
||||
|
||||
# Deletes quality columns if there is one because the matching between sequence and quality will be broken (quality set to NA when sequence not)
|
||||
if QUALITY_COLUMN in view:
|
||||
@ -588,10 +591,11 @@ def run(config):
|
||||
# Initialize the progress bar
|
||||
pb = ProgressBar(len(entries), config, seconde=5)
|
||||
|
||||
try:
|
||||
uniq_sequences(entries, o_view, pb, config, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])
|
||||
except Exception, e:
|
||||
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
|
||||
if len(entries) > 0:
|
||||
try:
|
||||
uniq_sequences(entries, o_view, pb, config, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])
|
||||
except Exception, e:
|
||||
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
|
||||
|
||||
print("", file=sys.stderr)
|
||||
|
||||
|
@ -63,6 +63,8 @@ cdef extern from "obidmscolumn.h" nogil:
|
||||
|
||||
char* obi_get_elements_names(OBIDMS_column_p column)
|
||||
|
||||
char* obi_column_formatted_infos(OBIDMS_column_p column)
|
||||
|
||||
index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name)
|
||||
|
||||
int obi_column_write_comments(OBIDMS_column_p column, const char* comments)
|
||||
|
@ -14,6 +14,7 @@ from ..capi.obidms cimport obi_import_column
|
||||
from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \
|
||||
obi_close_column, \
|
||||
obi_get_elements_names, \
|
||||
obi_column_formatted_infos, \
|
||||
obi_column_write_comments
|
||||
|
||||
from ..capi.obiutils cimport obi_format_date
|
||||
@ -38,7 +39,7 @@ from obitools3.utils cimport tobytes, \
|
||||
|
||||
from obitools3.dms.column import typed_column
|
||||
|
||||
from libc.stdlib cimport free
|
||||
from libc.stdlib cimport free
|
||||
|
||||
import importlib
|
||||
import inspect
|
||||
@ -288,9 +289,15 @@ cdef class Column(OBIWrapper) :
|
||||
@OBIWrapper.checkIsActive
|
||||
def __repr__(self) :
|
||||
cdef bytes s
|
||||
#cdef char* s_b
|
||||
#cdef str s_str
|
||||
#s_b = obi_column_formatted_infos(self.pointer())
|
||||
#s_str = bytes2str(s_b)
|
||||
#free(s_b)
|
||||
s = self._alias + b", data type: " + self.data_type
|
||||
#return s_str
|
||||
return bytes2str(s)
|
||||
|
||||
|
||||
|
||||
def close(self): # TODO discuss, can't be called bc then bug when closing view that tries to close it in C
|
||||
|
||||
|
@ -227,7 +227,9 @@ cdef class DMS(OBIWrapper):
|
||||
cdef str s
|
||||
s=""
|
||||
for view_name in self.keys():
|
||||
s = s + repr(self.get_view(view_name)) + "\n"
|
||||
view = self.get_view(view_name)
|
||||
s = s + repr(view) + "\n"
|
||||
view.close()
|
||||
return s
|
||||
|
||||
|
||||
|
@ -533,6 +533,7 @@ cdef class View(OBIWrapper) :
|
||||
for command in command_list:
|
||||
s+=b"obi "
|
||||
s+=command
|
||||
s+=b"\n"
|
||||
return s
|
||||
|
||||
|
||||
|
@ -188,7 +188,7 @@ def buildConsensus(ali, seq, ref_tags=None):
|
||||
seq[b'shift']=ali.shift
|
||||
else:
|
||||
if len(ali[0])>999: # TODO why?
|
||||
raise AssertionError,"Too long alignemnt"
|
||||
raise AssertionError,"Too long alignment"
|
||||
|
||||
ic=IterOnConsensus(ali)
|
||||
|
||||
@ -250,11 +250,21 @@ def buildJoinedSequence(ali, reverse, seq, forward=None):
|
||||
quality.extend(reverse.quality)
|
||||
seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality)
|
||||
seq[b"score"]=ali.score
|
||||
seq[b"ali_direction"]=ali.direction
|
||||
if len(ali.direction) > 0:
|
||||
seq[b"ali_direction"]=ali.direction
|
||||
else:
|
||||
seq[b"ali_direction"]=None
|
||||
seq[b"mode"]=b"joined"
|
||||
seq[b"pairedend_limit"]=len(forward)
|
||||
seq[b"pairedend_limit"]=len(forward)
|
||||
seq[b"ali_length"] = ali.consensus_len
|
||||
if ali.consensus_len > 0:
|
||||
seq[b"score_norm"]=float(ali.score)/ali.consensus_len
|
||||
else:
|
||||
seq[b"score_norm"]=0.0
|
||||
|
||||
for tag in forward:
|
||||
if tag != REVERSE_SEQUENCE_COLUMN and tag != REVERSE_QUALITY_COLUMN:
|
||||
if tag != REVERSE_SEQUENCE_COLUMN and tag != REVERSE_QUALITY_COLUMN and \
|
||||
tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN:
|
||||
seq[tag] = forward[tag]
|
||||
return seq
|
||||
|
||||
|
@ -177,7 +177,7 @@ def emblIterator_dir(dir_path,
|
||||
for filename in files:
|
||||
if read==only:
|
||||
return
|
||||
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
|
||||
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
|
||||
f = uopen(filename)
|
||||
if only is not None:
|
||||
only_f = only-read
|
||||
|
@ -25,8 +25,9 @@ from libc.string cimport strcpy, strlen
|
||||
_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)
|
||||
|
||||
_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
|
||||
_seqMatcher = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M)
|
||||
_cleanSeq = re.compile(b'[ \n0-9]+')
|
||||
_seqMatcher = re.compile(b'ORIGIN.+(?=//\n)', re.DOTALL + re.M)
|
||||
_cleanSeq1 = re.compile(b'ORIGIN.+\n')
|
||||
_cleanSeq2 = re.compile(b'[ \n0-9]+')
|
||||
_acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M)
|
||||
_deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M)
|
||||
_cleanDe = re.compile(b'\n *')
|
||||
@ -42,7 +43,8 @@ def genbankParser(bytes text):
|
||||
ft = _featureMatcher.search(text).group()
|
||||
|
||||
s = _seqMatcher.search(text).group()
|
||||
s = _cleanSeq.sub(b'', s).upper()
|
||||
s = _cleanSeq1.sub(b'', s)
|
||||
s = _cleanSeq2.sub(b'', s)
|
||||
|
||||
acs = _acMatcher.search(text).group()
|
||||
acs = acs.split()
|
||||
@ -51,23 +53,23 @@ def genbankParser(bytes text):
|
||||
|
||||
de = _deMatcher.search(header).group()
|
||||
de = _cleanDe.sub(b' ',de).strip().strip(b'.')
|
||||
|
||||
|
||||
tags = {}
|
||||
extractTaxon(ft, tags)
|
||||
|
||||
seq = Nuc_Seq(ac,
|
||||
s,
|
||||
definition=de,
|
||||
quality=None,
|
||||
offset=-1,
|
||||
tags=tags)
|
||||
|
||||
except Exception as e:
|
||||
print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
|
||||
# Do not raise any Exception if you need the possibility to resume the generator
|
||||
# (Python generators can't resume after any exception is raised)
|
||||
return None
|
||||
|
||||
tags = {}
|
||||
extractTaxon(ft, tags)
|
||||
|
||||
seq = Nuc_Seq(ac,
|
||||
s,
|
||||
definition=de,
|
||||
quality=None,
|
||||
offset=-1,
|
||||
tags=tags)
|
||||
|
||||
|
||||
return seq
|
||||
|
||||
|
||||
@ -171,10 +173,12 @@ def genbankIterator_dir(dir_path,
|
||||
read = 0
|
||||
read_files = 0
|
||||
files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))]
|
||||
files.extend([filename for filename in glob.glob(os.path.join(path, b'*.seq*'))]) # new genbank extension
|
||||
files = list(set(files))
|
||||
for filename in files:
|
||||
if read==only:
|
||||
return
|
||||
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
|
||||
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
|
||||
f = uopen(filename)
|
||||
if only is not None:
|
||||
only_f = only-read
|
||||
|
@ -1,5 +1,5 @@
|
||||
major = 3
|
||||
minor = 0
|
||||
serial= '0-beta17'
|
||||
serial= '0b25'
|
||||
|
||||
version ="%d.%02d.%s" % (major,minor,serial)
|
||||
version ="%d.%d.%s" % (major,minor,serial)
|
||||
|
5
requirements.txt
Executable file
5
requirements.txt
Executable file
@ -0,0 +1,5 @@
|
||||
--extra-index-url https://pypi.python.org/simple/
|
||||
Cython>=0.24
|
||||
Sphinx>=1.2.0
|
||||
ipython>=3.0.0
|
||||
breathe>=4.0.0
|
26
setup.py
26
setup.py
@ -5,8 +5,9 @@ import re
|
||||
import subprocess
|
||||
|
||||
from distutils import log
|
||||
from distutils.core import setup
|
||||
|
||||
#from distutils.core import setup
|
||||
from setuptools import setup # to work with pip
|
||||
|
||||
from distutils.core import Extension
|
||||
from distutils.sysconfig import get_python_lib
|
||||
|
||||
@ -26,10 +27,11 @@ class Distribution(ori_Distribution):
|
||||
|
||||
ori_Distribution.__init__(self, attrs)
|
||||
|
||||
self.global_options.insert(0,('cobitools3', None, "intall location of the C library"
|
||||
self.global_options.insert(0,('cobitools3', None, "install location of the C library"
|
||||
))
|
||||
|
||||
from distutils.command.build import build as build_ori
|
||||
from setuptools.command.bdist_egg import bdist_egg as bdist_egg_ori
|
||||
from distutils.core import Command
|
||||
|
||||
|
||||
@ -70,6 +72,12 @@ class build(build_ori):
|
||||
build_ori.run(self)
|
||||
|
||||
|
||||
class bdist_egg(bdist_egg_ori):
|
||||
def run(self):
|
||||
self.run_command('build_clib')
|
||||
bdist_egg_ori.run(self)
|
||||
|
||||
|
||||
sys.path.append(os.path.abspath("python"))
|
||||
|
||||
|
||||
@ -88,9 +96,10 @@ PACKAGE = "OBITools3"
|
||||
VERSION = version
|
||||
AUTHOR = 'Celine Mercier'
|
||||
EMAIL = 'celine.mercier@metabarcoding.org'
|
||||
URL = "http://metabarcoding.org/obitools3"
|
||||
URL = "https://metabarcoding.org/obitools3"
|
||||
PLATFORMS = "posix"
|
||||
LICENSE = "CeCILL-V2"
|
||||
DESCRIPTION = "Tools and library for DNA metabarcoding",
|
||||
DESCRIPTION = "A package for the management of analyses and data in DNA metabarcoding."
|
||||
PYTHONMIN = '3.5'
|
||||
|
||||
SRC = 'python'
|
||||
@ -147,17 +156,24 @@ classifiers=['Development Status :: 4 - Beta',
|
||||
'Topic :: Utilities',
|
||||
]
|
||||
|
||||
with open("README.md", "r") as fh:
|
||||
long_description = fh.read()
|
||||
|
||||
setup(name=PACKAGE,
|
||||
description=DESCRIPTION,
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
classifiers=classifiers,
|
||||
version=VERSION,
|
||||
author=AUTHOR,
|
||||
author_email=EMAIL,
|
||||
platforms=PLATFORMS,
|
||||
license=LICENSE,
|
||||
url=URL,
|
||||
ext_modules=xx,
|
||||
distclass=Distribution,
|
||||
cmdclass={'build': build,
|
||||
'bdist_egg': bdist_egg,
|
||||
'build_clib': build_clib},
|
||||
cobitools3=get_python_lib(),
|
||||
packages = findPackage('python'),
|
||||
|
@ -413,7 +413,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
return NULL;
|
||||
}
|
||||
|
||||
score = max_common_kmers + kmer_size - 1; // aka the number of nucleotides in the longest stretch of kmers perfectly matching
|
||||
if (max_common_kmers > 0)
|
||||
score = max_common_kmers + kmer_size - 1; // aka the number of nucleotides in the longest stretch of kmers perfectly matching
|
||||
else
|
||||
score = 0;
|
||||
abs_shift = abs(best_shift);
|
||||
|
||||
// Save result in Obi_ali structure
|
||||
@ -423,10 +426,15 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
ali->shift = abs_shift;
|
||||
ali->consensus_seq = NULL;
|
||||
ali->consensus_qual = NULL;
|
||||
if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
|
||||
strcpy(ali->direction, "left");
|
||||
if (score == 0)
|
||||
ali->direction[0] = '\0';
|
||||
else
|
||||
strcpy(ali->direction, "right");
|
||||
{
|
||||
if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
|
||||
strcpy(ali->direction, "left");
|
||||
else
|
||||
strcpy(ali->direction, "right");
|
||||
}
|
||||
|
||||
// Build the consensus sequence if asked
|
||||
if (build_consensus)
|
||||
|
@ -71,9 +71,12 @@ static int create_output_columns(Obiview_p o_view);
|
||||
* @param name The assigned scientific name.
|
||||
* @param assigned_status_column A pointer on the column where the assigned status should be written.
|
||||
* @param assigned The assigned status (whether the sequence was assigned to a taxon or not).
|
||||
* @param best_match_column A pointer on the column where the list of ids of the best matches should be written.
|
||||
* @param best_match_ids_column A pointer on the column where the list of ids of the best matches should be written.
|
||||
* @param best_match_ids The list of ids of the best matches as an array of the concatenated ids separated by '\0'.
|
||||
* @param best_match_ids_length The total length of the array of ids of best matches.
|
||||
* @param best_match_taxids_column A pointer on the column where the list of taxids of the best matches should be written.
|
||||
* @param best_match_taxids The list of taxids of the best matches as an array of the taxids.
|
||||
* @param best_match_taxids_length The length of the array of taxids of best matches.
|
||||
* @param score_column A pointer on the column where the score should be written.
|
||||
* @param score The similarity score of the sequence with its best match(es).
|
||||
*
|
||||
@ -87,7 +90,8 @@ int print_assignment_result(Obiview_p output_view, index_t line,
|
||||
OBIDMS_column_p assigned_taxid_column, int32_t taxid,
|
||||
OBIDMS_column_p assigned_name_column, const char* name,
|
||||
OBIDMS_column_p assigned_status_column, bool assigned,
|
||||
OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length,
|
||||
OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length,
|
||||
OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length,
|
||||
OBIDMS_column_p score_column, double score);
|
||||
|
||||
|
||||
@ -130,7 +134,14 @@ static int create_output_columns(Obiview_p o_view)
|
||||
// Column for array of best match ids
|
||||
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the array of ids of the best match in ecotag");
|
||||
obidebug(1, "\nError creating the column for the array of ids of best matches in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Column for array of best match taxids
|
||||
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the array of taxids of best matches in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
@ -142,7 +153,8 @@ int print_assignment_result(Obiview_p output_view, index_t line,
|
||||
OBIDMS_column_p assigned_taxid_column, int32_t taxid,
|
||||
OBIDMS_column_p assigned_name_column, const char* name,
|
||||
OBIDMS_column_p assigned_status_column, bool assigned,
|
||||
OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length,
|
||||
OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length,
|
||||
OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length,
|
||||
OBIDMS_column_p score_column, double score)
|
||||
{
|
||||
// Write the assigned taxid
|
||||
@ -167,9 +179,16 @@ int print_assignment_result(Obiview_p output_view, index_t line,
|
||||
}
|
||||
|
||||
// Write the best match ids
|
||||
if (obi_set_array_with_col_p_in_view(output_view, best_match_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0)
|
||||
if (obi_set_array_with_col_p_in_view(output_view, best_match_ids_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing a assignment status in a column when writing ecotag results");
|
||||
obidebug(1, "\nError writing the array of best match ids in a column when writing ecotag results");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Write the best match taxids
|
||||
if (obi_set_array_with_col_p_in_view(output_view, best_match_taxids_column, line, best_match_taxids, (uint8_t)(sizeof(OBI_INT)*8), best_match_taxids_length) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing the array of best match taxids in a column when writing ecotag results");
|
||||
return -1;
|
||||
}
|
||||
|
||||
@ -235,6 +254,8 @@ int obi_ecotag(const char* dms_name,
|
||||
char* best_match_ids;
|
||||
char* best_match_ids_to_store;
|
||||
int32_t best_match_ids_length;
|
||||
int32_t* best_match_taxids;
|
||||
int32_t* best_match_taxids_to_store;
|
||||
int best_match_count;
|
||||
int buffer_size;
|
||||
int best_match_ids_buffer_size;
|
||||
@ -263,7 +284,8 @@ int obi_ecotag(const char* dms_name,
|
||||
OBIDMS_column_p assigned_taxid_column = NULL;
|
||||
OBIDMS_column_p assigned_name_column = NULL;
|
||||
OBIDMS_column_p assigned_status_column = NULL;
|
||||
OBIDMS_column_p best_match_column = NULL;
|
||||
OBIDMS_column_p best_match_ids_column = NULL;
|
||||
OBIDMS_column_p best_match_taxids_column = NULL;
|
||||
OBIDMS_column_p lca_taxid_a_column = NULL;
|
||||
OBIDMS_column_p score_a_column = NULL;
|
||||
OBIDMS_column_p ref_taxid_column = NULL;
|
||||
@ -396,7 +418,8 @@ int obi_ecotag(const char* dms_name,
|
||||
assigned_taxid_column = obi_view_get_column(output_view, ECOTAG_TAXID_COLUMN_NAME);
|
||||
assigned_name_column = obi_view_get_column(output_view, ECOTAG_NAME_COLUMN_NAME);
|
||||
assigned_status_column = obi_view_get_column(output_view, ECOTAG_STATUS_COLUMN_NAME);
|
||||
best_match_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME);
|
||||
best_match_ids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME);
|
||||
best_match_taxids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME);
|
||||
score_column = obi_view_get_column(output_view, ECOTAG_SCORE_COLUMN_NAME);
|
||||
|
||||
// Open the used reference columns
|
||||
@ -453,6 +476,14 @@ int obi_ecotag(const char* dms_name,
|
||||
return -1;
|
||||
}
|
||||
|
||||
best_match_taxids = (int32_t*) malloc(buffer_size* sizeof(int32_t));
|
||||
if (best_match_taxids == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating memory for the best match taxid array in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (i=0; i < query_count; i++)
|
||||
{
|
||||
if (i%1000 == 0)
|
||||
@ -514,7 +545,7 @@ int obi_ecotag(const char* dms_name,
|
||||
|
||||
// Store in best match array
|
||||
|
||||
// Grow match array if needed
|
||||
// Grow match and taxid array if needed
|
||||
if (best_match_count == buffer_size)
|
||||
{
|
||||
buffer_size = buffer_size*2;
|
||||
@ -525,6 +556,13 @@ int obi_ecotag(const char* dms_name,
|
||||
obidebug(1, "\nError reallocating match array when assigning");
|
||||
return -1;
|
||||
}
|
||||
best_match_taxids = (int32_t*) realloc(best_match_taxids, buffer_size*sizeof(int32_t));
|
||||
if (best_match_taxids == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError reallocating match taxids array when assigning");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
id = obi_get_str_with_elt_idx_and_col_p_in_view(ref_view, ref_id_column, j, 0);
|
||||
@ -545,6 +583,7 @@ int obi_ecotag(const char* dms_name,
|
||||
|
||||
// Save match
|
||||
best_match_array[best_match_count] = j;
|
||||
best_match_taxids[best_match_count] = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, j, 0);
|
||||
best_match_count++;
|
||||
strcpy(best_match_ids+best_match_ids_length, id);
|
||||
best_match_ids_length = best_match_ids_length + id_len + 1;
|
||||
@ -629,6 +668,7 @@ int obi_ecotag(const char* dms_name,
|
||||
else
|
||||
lca_name = lca->name;
|
||||
best_match_ids_to_store = best_match_ids;
|
||||
best_match_taxids_to_store = best_match_taxids;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -636,6 +676,7 @@ int obi_ecotag(const char* dms_name,
|
||||
lca_name = OBIStr_NA;
|
||||
lca_taxid = OBIInt_NA;
|
||||
best_match_ids_to_store = OBITuple_NA;
|
||||
best_match_taxids_to_store = OBITuple_NA;
|
||||
score = OBIFloat_NA;
|
||||
}
|
||||
|
||||
@ -644,7 +685,8 @@ int obi_ecotag(const char* dms_name,
|
||||
assigned_taxid_column, lca_taxid,
|
||||
assigned_name_column, lca_name,
|
||||
assigned_status_column, assigned,
|
||||
best_match_column, best_match_ids_to_store, best_match_ids_length,
|
||||
best_match_ids_column, best_match_ids_to_store, best_match_ids_length,
|
||||
best_match_taxids_column, best_match_taxids_to_store, best_match_count,
|
||||
score_column, best_score
|
||||
) < 0)
|
||||
return -1;
|
||||
@ -652,6 +694,7 @@ int obi_ecotag(const char* dms_name,
|
||||
|
||||
free(best_match_array);
|
||||
free(best_match_ids);
|
||||
free(best_match_taxids);
|
||||
|
||||
obi_close_taxonomy(taxonomy);
|
||||
obi_save_and_close_view(query_view);
|
||||
|
@ -23,7 +23,8 @@
|
||||
#define ECOTAG_TAXID_COLUMN_NAME "TAXID"
|
||||
#define ECOTAG_NAME_COLUMN_NAME "SCIENTIFIC_NAME"
|
||||
#define ECOTAG_STATUS_COLUMN_NAME "ID_STATUS"
|
||||
#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH"
|
||||
#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH_IDS"
|
||||
#define ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME "BEST_MATCH_TAXIDS"
|
||||
#define ECOTAG_SCORE_COLUMN_NAME "BEST_IDENTITY"
|
||||
|
||||
|
||||
|
@ -1496,7 +1496,7 @@ obiversion_t obi_import_column(const char* dms_path_1, const char* dms_path_2, c
|
||||
memcpy(column_2->data, column_1->data, header_1->data_size);
|
||||
|
||||
// Copy the AVL files if there are some (overwriting the automatically created files)
|
||||
if ((header_1->returned_data_type == OBI_STR) || (header_1->returned_data_type == OBI_SEQ) || (header_1->returned_data_type == OBI_QUAL))
|
||||
if ((header_1->tuples) || ((header_1->returned_data_type == OBI_STR) || (header_1->returned_data_type == OBI_SEQ) || (header_1->returned_data_type == OBI_QUAL)))
|
||||
{
|
||||
avl_name_1 = (char*) malloc((strlen(header_1->indexer_name) + 1) * sizeof(char));
|
||||
if (avl_name_1 == NULL)
|
||||
|
@ -1350,6 +1350,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
|
||||
}
|
||||
strncpy(header->indexer_name, final_indexer_name, INDEXER_MAX_NAME);
|
||||
}
|
||||
else
|
||||
new_column->indexer = NULL;
|
||||
|
||||
// Fill the data with NA values
|
||||
obi_ini_to_NA_values(new_column, 0, nb_lines);
|
||||
@ -1558,6 +1560,8 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms,
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else
|
||||
column->indexer = NULL;
|
||||
|
||||
if (close(column_file_descriptor) < 0)
|
||||
{
|
||||
@ -1693,8 +1697,8 @@ int obi_close_column(OBIDMS_column_p column)
|
||||
if (obi_dms_unlist_column(column->dms, column) < 0)
|
||||
ret_val = -1;
|
||||
|
||||
// If the data type is OBI_STR, OBI_SEQ or OBI_QUAL, the associated indexer is closed
|
||||
if (((column->header)->returned_data_type == OBI_STR) || ((column->header)->returned_data_type == OBI_SEQ) || ((column->header)->returned_data_type == OBI_QUAL))
|
||||
// If it's a tuple column or the data type is OBI_STR, OBI_SEQ or OBI_QUAL, the associated indexer is closed
|
||||
if ((column->indexer) != NULL)
|
||||
if (obi_close_indexer(column->indexer) < 0)
|
||||
ret_val = -1;
|
||||
|
||||
|
Reference in New Issue
Block a user