Compare commits

..

3 Commits

Author SHA1 Message Date
faf8ea9d86 Switch to version 3.0.0b21 2020-05-28 20:42:09 +02:00
ffe2485e94 Genbank parser: now reading ORIGIN lines with comments without
triggering error
2020-05-28 20:41:34 +02:00
6094ce2bbc obi import: skip on error more robust 2020-05-28 20:40:36 +02:00
3 changed files with 140 additions and 129 deletions

View File

@ -269,125 +269,134 @@ def run(config):
pb(i)
elif not i%50000:
logger("info", "Imported %d entries", i)
if NUC_SEQS_view:
id_col[i] = entry.id
def_col[i] = entry.definition
seq_col[i] = entry.seq
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
if i == 0:
get_quality = QUALITY_COLUMN in entry
try:
if NUC_SEQS_view:
id_col[i] = entry.id
def_col[i] = entry.definition
seq_col[i] = entry.seq
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
if i == 0:
get_quality = QUALITY_COLUMN in entry
if get_quality:
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
qual_col = view[QUALITY_COLUMN]
if get_quality:
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
qual_col = view[QUALITY_COLUMN]
if get_quality:
qual_col[i] = entry.quality
for tag in entry :
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
value = entry[tag]
if tag == b"taxid":
tag = TAXID_COLUMN
if tag == b"count":
tag = COUNT_COLUMN
if tag[:7] == b"merged_":
tag = MERGED_PREFIX+tag[7:]
if tag not in dcols :
value_type = type(value)
nb_elts = 1
value_obitype = OBI_VOID
if value_type == dict or value_type == list :
nb_elts = len(value)
elt_names = list(value)
else :
nb_elts = 1
elt_names = None
value_obitype = get_obitype(value)
if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
# Fill value
if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column
dcols[tag][0][i] = value
# TODO else log error?
else :
rewrite = False
# Check type adequation
old_type = dcols[tag][1]
new_type = OBI_VOID
new_type = update_obitype(old_type, value)
if old_type != new_type :
rewrite = True
try:
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
if type(value) == dict and \
dcols[tag][0].nb_elements_per_line == 1 \
and set(dcols[tag][0].elements_names) != set(value.keys()) :
raise IndexError # trigger column rewrite
qual_col[i] = entry.quality
for tag in entry :
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
value = entry[tag]
if tag == b"taxid":
tag = TAXID_COLUMN
if tag == b"count":
tag = COUNT_COLUMN
if tag[:7] == b"merged_":
tag = MERGED_PREFIX+tag[7:]
# Fill value
dcols[tag][0][i] = value
except IndexError :
if tag not in dcols :
value_type = type(value)
old_column = dcols[tag][0]
old_nb_elements_per_line = old_column.nb_elements_per_line
new_nb_elements_per_line = 0
old_elements_names = old_column.elements_names
new_elements_names = None
nb_elts = 1
value_obitype = OBI_VOID
if value_type == dict or value_type == list :
nb_elts = len(value)
elt_names = list(value)
else :
nb_elts = 1
elt_names = None
value_obitype = get_obitype(value)
if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
# Fill value
if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column
dcols[tag][0][i] = value
# TODO else log error?
#####################################################################
# Check the length and keys of column lines if needed
if value_type == dict : # Check dictionary keys
for k in value :
if k not in old_elements_names :
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
rewrite = True
break
elif value_type == list or value_type == tuple : # Check vector length
if old_nb_elements_per_line < len(value) :
new_nb_elements_per_line = len(value)
rewrite = True
#####################################################################
if rewrite :
if new_nb_elements_per_line == 0 and new_elements_names is not None :
new_nb_elements_per_line = len(new_elements_names)
# Reset obierrno
obi_errno = 0
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
new_data_type=new_type,
new_nb_elements_per_line=new_nb_elements_per_line,
new_elements_names=new_elements_names,
rewrite_last_line=False),
new_type)
# Update the dictionary:
for t in dcols :
dcols[t] = (view[t], dcols[t][1])
else :
rewrite = False
# Check type adequation
old_type = dcols[tag][1]
new_type = OBI_VOID
new_type = update_obitype(old_type, value)
if old_type != new_type :
rewrite = True
try:
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
if type(value) == dict and \
dcols[tag][0].nb_elements_per_line == 1 \
and set(dcols[tag][0].elements_names) != set(value.keys()) :
raise IndexError # trigger column rewrite
# Fill value
dcols[tag][0][i] = value
except IndexError :
value_type = type(value)
old_column = dcols[tag][0]
old_nb_elements_per_line = old_column.nb_elements_per_line
new_nb_elements_per_line = 0
old_elements_names = old_column.elements_names
new_elements_names = None
#####################################################################
# Check the length and keys of column lines if needed
if value_type == dict : # Check dictionary keys
for k in value :
if k not in old_elements_names :
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
rewrite = True
break
elif value_type == list or value_type == tuple : # Check vector length
if old_nb_elements_per_line < len(value) :
new_nb_elements_per_line = len(value)
rewrite = True
#####################################################################
if rewrite :
if new_nb_elements_per_line == 0 and new_elements_names is not None :
new_nb_elements_per_line = len(new_elements_names)
# Reset obierrno
obi_errno = 0
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
new_data_type=new_type,
new_nb_elements_per_line=new_nb_elements_per_line,
new_elements_names=new_elements_names,
rewrite_last_line=False),
new_type)
# Update the dictionary:
for t in dcols :
dcols[t] = (view[t], dcols[t][1])
# Fill value
dcols[tag][0][i] = value
except Exception as e:
print("\nCould not import sequence id:", entry.id, "(error raised:", e, ")")
if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
raise e
else:
pass
i+=1
if pb is not None:

View File

@ -25,8 +25,9 @@ from libc.string cimport strcpy, strlen
_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)
_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
_seqMatcher = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M)
_cleanSeq = re.compile(b'[ \n0-9]+')
_seqMatcher = re.compile(b'ORIGIN.+(?=//\n)', re.DOTALL + re.M)
_cleanSeq1 = re.compile(b'ORIGIN.+\n')
_cleanSeq2 = re.compile(b'[ \n0-9]+')
_acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M)
_deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M)
_cleanDe = re.compile(b'\n *')
@ -42,7 +43,8 @@ def genbankParser(bytes text):
ft = _featureMatcher.search(text).group()
s = _seqMatcher.search(text).group()
s = _cleanSeq.sub(b'', s).upper()
s = _cleanSeq1.sub(b'', s)
s = _cleanSeq2.sub(b'', s)
acs = _acMatcher.search(text).group()
acs = acs.split()
@ -51,23 +53,23 @@ def genbankParser(bytes text):
de = _deMatcher.search(header).group()
de = _cleanDe.sub(b' ',de).strip().strip(b'.')
tags = {}
extractTaxon(ft, tags)
seq = Nuc_Seq(ac,
s,
definition=de,
quality=None,
offset=-1,
tags=tags)
except Exception as e:
print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
# Do not raise any Exception if you need the possibility to resume the generator
# (Python generators can't resume after any exception is raised)
return None
tags = {}
extractTaxon(ft, tags)
seq = Nuc_Seq(ac,
s,
definition=de,
quality=None,
offset=-1,
tags=tags)
return seq

View File

@ -1,5 +1,5 @@
major = 3
minor = 0
serial= '0b20'
serial= '0b21'
version ="%d.%d.%s" % (major,minor,serial)