python: genbank parser: better handling of whitespace

2021-09-13 11:44:38 +12:00
parent f43856b712
commit 2d66e0e965
1 changed files with 7 additions and 7 deletions
--- a/python/obitools3/parsers/genbank.pyx
+++ b/python/obitools3/parsers/genbank.pyx
@ -22,11 +22,11 @@ from libc.stdlib  cimport free, malloc, realloc
 from libc.string cimport strcpy, strlen


-_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN )',re.DOTALL + re.M)
+_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN(\s*))',re.DOTALL + re.M)

 _headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
-_seqMatcher    = re.compile(b'^ORIGIN .+(?=//\n)', re.DOTALL + re.M)
-_cleanSeq1     = re.compile(b'ORIGIN.+\n')
+_seqMatcher    = re.compile(b'^ORIGIN.+(?=//\n)', re.DOTALL + re.M)
+_cleanSeq1     = re.compile(b'ORIGIN(\s*)\n')
 _cleanSeq2     = re.compile(b'[ \n0-9]+')
 _acMatcher     = re.compile(b'(?<=^ACCESSION   ).+',re.M)
 _deMatcher     = re.compile(b'(?<=^DEFINITION  ).+\n( .+\n)*',re.M)
@ -155,9 +155,9 @@ def genbankIterator_file(lineiterator,
        yield seq
        read+=1
    
-    # Last sequence
+    # Last sequence, unless the remaining entry is only whitespace/blank lines
+    if entry.strip():
        seq = genbankParser(entry)
-    
        yield seq
    
    free(entry)