Genbank parser: ORIGIN lines that carry comments are now read without
triggering an error
This commit is contained in:
@ -25,8 +25,9 @@ from libc.string cimport strcpy, strlen
|
||||
_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)
|
||||
|
||||
_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
|
||||
_seqMatcher = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M)
|
||||
_cleanSeq = re.compile(b'[ \n0-9]+')
|
||||
_seqMatcher = re.compile(b'ORIGIN.+(?=//\n)', re.DOTALL + re.M)
|
||||
_cleanSeq1 = re.compile(b'ORIGIN.+\n')
|
||||
_cleanSeq2 = re.compile(b'[ \n0-9]+')
|
||||
_acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M)
|
||||
_deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M)
|
||||
_cleanDe = re.compile(b'\n *')
|
||||
@ -42,7 +43,8 @@ def genbankParser(bytes text):
|
||||
ft = _featureMatcher.search(text).group()
|
||||
|
||||
s = _seqMatcher.search(text).group()
|
||||
s = _cleanSeq.sub(b'', s).upper()
|
||||
s = _cleanSeq1.sub(b'', s)
|
||||
s = _cleanSeq2.sub(b'', s)
|
||||
|
||||
acs = _acMatcher.search(text).group()
|
||||
acs = acs.split()
|
||||
@ -52,22 +54,22 @@ def genbankParser(bytes text):
|
||||
de = _deMatcher.search(header).group()
|
||||
de = _cleanDe.sub(b' ',de).strip().strip(b'.')
|
||||
|
||||
tags = {}
|
||||
extractTaxon(ft, tags)
|
||||
|
||||
seq = Nuc_Seq(ac,
|
||||
s,
|
||||
definition=de,
|
||||
quality=None,
|
||||
offset=-1,
|
||||
tags=tags)
|
||||
|
||||
except Exception as e:
|
||||
print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
|
||||
# Do not raise any Exception if you need the possibility to resume the generator
|
||||
# (Python generators can't resume after any exception is raised)
|
||||
return None
|
||||
|
||||
tags = {}
|
||||
extractTaxon(ft, tags)
|
||||
|
||||
seq = Nuc_Seq(ac,
|
||||
s,
|
||||
definition=de,
|
||||
quality=None,
|
||||
offset=-1,
|
||||
tags=tags)
|
||||
|
||||
return seq
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user