From ffe2485e94fadaf985b3c7a3d60f56f83ceef147 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Thu, 28 May 2020 20:41:34 +0200 Subject: [PATCH] Genbank parser: now reading ORIGIN lines with comments without triggering error --- python/obitools3/parsers/genbank.pyx | 32 +++++++++++++++------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/python/obitools3/parsers/genbank.pyx b/python/obitools3/parsers/genbank.pyx index c6a70e0..60d9bd3 100755 --- a/python/obitools3/parsers/genbank.pyx +++ b/python/obitools3/parsers/genbank.pyx @@ -25,8 +25,9 @@ from libc.string cimport strcpy, strlen _featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M) _headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M) -_seqMatcher = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M) -_cleanSeq = re.compile(b'[ \n0-9]+') +_seqMatcher = re.compile(b'ORIGIN.+(?=//\n)', re.DOTALL + re.M) +_cleanSeq1 = re.compile(b'ORIGIN.+\n') +_cleanSeq2 = re.compile(b'[ \n0-9]+') _acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M) _deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M) _cleanDe = re.compile(b'\n *') @@ -42,7 +43,8 @@ def genbankParser(bytes text): ft = _featureMatcher.search(text).group() s = _seqMatcher.search(text).group() - s = _cleanSeq.sub(b'', s).upper() + s = _cleanSeq1.sub(b'', s) + s = _cleanSeq2.sub(b'', s) acs = _acMatcher.search(text).group() acs = acs.split() @@ -51,23 +53,23 @@ def genbankParser(bytes text): de = _deMatcher.search(header).group() de = _cleanDe.sub(b' ',de).strip().strip(b'.') - + + tags = {} + extractTaxon(ft, tags) + + seq = Nuc_Seq(ac, + s, + definition=de, + quality=None, + offset=-1, + tags=tags) + except Exception as e: print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")") # Do not raise any Exception if you need the possibility to resume the generator # (Python generators can't resume after any exception is raised) return None - - tags = {} - extractTaxon(ft, tags) - - seq = Nuc_Seq(ac, - s, - definition=de, - quality=None, - offset=-1, - tags=tags) - + return seq