Compare commits
3 Commits
Author | SHA1 | Date | |
---|---|---|---|
faf8ea9d86 | |||
ffe2485e94 | |||
6094ce2bbc |
@ -269,125 +269,134 @@ def run(config):
|
||||
pb(i)
|
||||
elif not i%50000:
|
||||
logger("info", "Imported %d entries", i)
|
||||
|
||||
if NUC_SEQS_view:
|
||||
id_col[i] = entry.id
|
||||
def_col[i] = entry.definition
|
||||
seq_col[i] = entry.seq
|
||||
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
|
||||
if i == 0:
|
||||
get_quality = QUALITY_COLUMN in entry
|
||||
|
||||
try:
|
||||
|
||||
if NUC_SEQS_view:
|
||||
id_col[i] = entry.id
|
||||
def_col[i] = entry.definition
|
||||
seq_col[i] = entry.seq
|
||||
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
|
||||
if i == 0:
|
||||
get_quality = QUALITY_COLUMN in entry
|
||||
if get_quality:
|
||||
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
|
||||
qual_col = view[QUALITY_COLUMN]
|
||||
if get_quality:
|
||||
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
|
||||
qual_col = view[QUALITY_COLUMN]
|
||||
if get_quality:
|
||||
qual_col[i] = entry.quality
|
||||
|
||||
for tag in entry :
|
||||
|
||||
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
|
||||
|
||||
value = entry[tag]
|
||||
if tag == b"taxid":
|
||||
tag = TAXID_COLUMN
|
||||
if tag == b"count":
|
||||
tag = COUNT_COLUMN
|
||||
if tag[:7] == b"merged_":
|
||||
tag = MERGED_PREFIX+tag[7:]
|
||||
|
||||
if tag not in dcols :
|
||||
|
||||
value_type = type(value)
|
||||
nb_elts = 1
|
||||
value_obitype = OBI_VOID
|
||||
|
||||
if value_type == dict or value_type == list :
|
||||
nb_elts = len(value)
|
||||
elt_names = list(value)
|
||||
else :
|
||||
nb_elts = 1
|
||||
elt_names = None
|
||||
|
||||
value_obitype = get_obitype(value)
|
||||
|
||||
if value_obitype != OBI_VOID :
|
||||
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
|
||||
|
||||
# Fill value
|
||||
if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
|
||||
value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
# TODO else log error?
|
||||
|
||||
else :
|
||||
|
||||
rewrite = False
|
||||
|
||||
# Check type adequation
|
||||
old_type = dcols[tag][1]
|
||||
new_type = OBI_VOID
|
||||
new_type = update_obitype(old_type, value)
|
||||
if old_type != new_type :
|
||||
rewrite = True
|
||||
|
||||
try:
|
||||
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
|
||||
if type(value) == dict and \
|
||||
dcols[tag][0].nb_elements_per_line == 1 \
|
||||
and set(dcols[tag][0].elements_names) != set(value.keys()) :
|
||||
raise IndexError # trigger column rewrite
|
||||
qual_col[i] = entry.quality
|
||||
|
||||
for tag in entry :
|
||||
|
||||
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
|
||||
|
||||
value = entry[tag]
|
||||
if tag == b"taxid":
|
||||
tag = TAXID_COLUMN
|
||||
if tag == b"count":
|
||||
tag = COUNT_COLUMN
|
||||
if tag[:7] == b"merged_":
|
||||
tag = MERGED_PREFIX+tag[7:]
|
||||
|
||||
# Fill value
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
except IndexError :
|
||||
|
||||
if tag not in dcols :
|
||||
|
||||
value_type = type(value)
|
||||
old_column = dcols[tag][0]
|
||||
old_nb_elements_per_line = old_column.nb_elements_per_line
|
||||
new_nb_elements_per_line = 0
|
||||
old_elements_names = old_column.elements_names
|
||||
new_elements_names = None
|
||||
nb_elts = 1
|
||||
value_obitype = OBI_VOID
|
||||
|
||||
if value_type == dict or value_type == list :
|
||||
nb_elts = len(value)
|
||||
elt_names = list(value)
|
||||
else :
|
||||
nb_elts = 1
|
||||
elt_names = None
|
||||
|
||||
value_obitype = get_obitype(value)
|
||||
|
||||
if value_obitype != OBI_VOID :
|
||||
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
|
||||
|
||||
# Fill value
|
||||
if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
|
||||
value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
# TODO else log error?
|
||||
|
||||
#####################################################################
|
||||
|
||||
# Check the length and keys of column lines if needed
|
||||
if value_type == dict : # Check dictionary keys
|
||||
for k in value :
|
||||
if k not in old_elements_names :
|
||||
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
|
||||
rewrite = True
|
||||
break
|
||||
|
||||
elif value_type == list or value_type == tuple : # Check vector length
|
||||
if old_nb_elements_per_line < len(value) :
|
||||
new_nb_elements_per_line = len(value)
|
||||
rewrite = True
|
||||
|
||||
#####################################################################
|
||||
|
||||
if rewrite :
|
||||
if new_nb_elements_per_line == 0 and new_elements_names is not None :
|
||||
new_nb_elements_per_line = len(new_elements_names)
|
||||
|
||||
# Reset obierrno
|
||||
obi_errno = 0
|
||||
|
||||
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
|
||||
new_data_type=new_type,
|
||||
new_nb_elements_per_line=new_nb_elements_per_line,
|
||||
new_elements_names=new_elements_names,
|
||||
rewrite_last_line=False),
|
||||
new_type)
|
||||
|
||||
# Update the dictionary:
|
||||
for t in dcols :
|
||||
dcols[t] = (view[t], dcols[t][1])
|
||||
|
||||
else :
|
||||
|
||||
rewrite = False
|
||||
|
||||
# Check type adequation
|
||||
old_type = dcols[tag][1]
|
||||
new_type = OBI_VOID
|
||||
new_type = update_obitype(old_type, value)
|
||||
if old_type != new_type :
|
||||
rewrite = True
|
||||
|
||||
try:
|
||||
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
|
||||
if type(value) == dict and \
|
||||
dcols[tag][0].nb_elements_per_line == 1 \
|
||||
and set(dcols[tag][0].elements_names) != set(value.keys()) :
|
||||
raise IndexError # trigger column rewrite
|
||||
|
||||
# Fill value
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
|
||||
except IndexError :
|
||||
|
||||
value_type = type(value)
|
||||
old_column = dcols[tag][0]
|
||||
old_nb_elements_per_line = old_column.nb_elements_per_line
|
||||
new_nb_elements_per_line = 0
|
||||
old_elements_names = old_column.elements_names
|
||||
new_elements_names = None
|
||||
|
||||
#####################################################################
|
||||
|
||||
# Check the length and keys of column lines if needed
|
||||
if value_type == dict : # Check dictionary keys
|
||||
for k in value :
|
||||
if k not in old_elements_names :
|
||||
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
|
||||
rewrite = True
|
||||
break
|
||||
|
||||
elif value_type == list or value_type == tuple : # Check vector length
|
||||
if old_nb_elements_per_line < len(value) :
|
||||
new_nb_elements_per_line = len(value)
|
||||
rewrite = True
|
||||
|
||||
#####################################################################
|
||||
|
||||
if rewrite :
|
||||
if new_nb_elements_per_line == 0 and new_elements_names is not None :
|
||||
new_nb_elements_per_line = len(new_elements_names)
|
||||
|
||||
# Reset obierrno
|
||||
obi_errno = 0
|
||||
|
||||
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
|
||||
new_data_type=new_type,
|
||||
new_nb_elements_per_line=new_nb_elements_per_line,
|
||||
new_elements_names=new_elements_names,
|
||||
rewrite_last_line=False),
|
||||
new_type)
|
||||
|
||||
# Update the dictionary:
|
||||
for t in dcols :
|
||||
dcols[t] = (view[t], dcols[t][1])
|
||||
|
||||
# Fill value
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
except Exception as e:
|
||||
print("\nCould not import sequence id:", entry.id, "(error raised:", e, ")")
|
||||
if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
|
||||
raise e
|
||||
else:
|
||||
pass
|
||||
|
||||
i+=1
|
||||
|
||||
if pb is not None:
|
||||
|
@ -25,8 +25,9 @@ from libc.string cimport strcpy, strlen
|
||||
_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)
|
||||
|
||||
_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
|
||||
_seqMatcher = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M)
|
||||
_cleanSeq = re.compile(b'[ \n0-9]+')
|
||||
_seqMatcher = re.compile(b'ORIGIN.+(?=//\n)', re.DOTALL + re.M)
|
||||
_cleanSeq1 = re.compile(b'ORIGIN.+\n')
|
||||
_cleanSeq2 = re.compile(b'[ \n0-9]+')
|
||||
_acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M)
|
||||
_deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M)
|
||||
_cleanDe = re.compile(b'\n *')
|
||||
@ -42,7 +43,8 @@ def genbankParser(bytes text):
|
||||
ft = _featureMatcher.search(text).group()
|
||||
|
||||
s = _seqMatcher.search(text).group()
|
||||
s = _cleanSeq.sub(b'', s).upper()
|
||||
s = _cleanSeq1.sub(b'', s)
|
||||
s = _cleanSeq2.sub(b'', s)
|
||||
|
||||
acs = _acMatcher.search(text).group()
|
||||
acs = acs.split()
|
||||
@ -51,23 +53,23 @@ def genbankParser(bytes text):
|
||||
|
||||
de = _deMatcher.search(header).group()
|
||||
de = _cleanDe.sub(b' ',de).strip().strip(b'.')
|
||||
|
||||
|
||||
tags = {}
|
||||
extractTaxon(ft, tags)
|
||||
|
||||
seq = Nuc_Seq(ac,
|
||||
s,
|
||||
definition=de,
|
||||
quality=None,
|
||||
offset=-1,
|
||||
tags=tags)
|
||||
|
||||
except Exception as e:
|
||||
print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
|
||||
# Do not raise any Exception if you need the possibility to resume the generator
|
||||
# (Python generators can't resume after any exception is raised)
|
||||
return None
|
||||
|
||||
tags = {}
|
||||
extractTaxon(ft, tags)
|
||||
|
||||
seq = Nuc_Seq(ac,
|
||||
s,
|
||||
definition=de,
|
||||
quality=None,
|
||||
offset=-1,
|
||||
tags=tags)
|
||||
|
||||
|
||||
return seq
|
||||
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
major = 3
|
||||
minor = 0
|
||||
serial= '0b20'
|
||||
serial= '0b21'
|
||||
|
||||
version ="%d.%d.%s" % (major,minor,serial)
|
||||
|
Reference in New Issue
Block a user