From 303648bd4723e87724cbbca9f76e1b50da78da4a Mon Sep 17 00:00:00 2001
From: Celine Mercier <celine.mercier@metabarcoding.org>
Date: Sat, 28 Jul 2018 17:14:10 +0200
Subject: [PATCH] Cython: embl file parser

---
 python/obitools3/parsers/embl.pxd             |   9 +
 python/obitools3/parsers/embl.pyx             | 195 ++++++++++++++++++
 .../parsers/embl_genbank_features.pxd         |   2 +
 .../parsers/embl_genbank_features.pyx         | 151 ++++++++++++++
 4 files changed, 357 insertions(+)
 create mode 100644 python/obitools3/parsers/embl.pxd
 create mode 100644 python/obitools3/parsers/embl.pyx
 create mode 100644 python/obitools3/parsers/embl_genbank_features.pxd
 create mode 100644 python/obitools3/parsers/embl_genbank_features.pyx

diff --git a/python/obitools3/parsers/embl.pxd b/python/obitools3/parsers/embl.pxd
new file mode 100644
index 0000000..8962d37
--- /dev/null
+++ b/python/obitools3/parsers/embl.pxd
@@ -0,0 +1,9 @@
+#cython: language_level=3
+
+from ..utils cimport str2bytes
+from .header cimport parseHeader
+from ..files.universalopener cimport uopen
+from ..files.linebuffer cimport LineBuffer
+
+
+    
\ No newline at end of file
diff --git a/python/obitools3/parsers/embl.pyx b/python/obitools3/parsers/embl.pyx
new file mode 100644
index 0000000..4c90fd6
--- /dev/null
+++ b/python/obitools3/parsers/embl.pyx
@@ -0,0 +1,195 @@
+#cython: language_level=3
+
+'''
+Created on June 12th 2018
+
+@author: coissac/mercier
+'''
+
+
+import types
+import re
+import sys
+import os
+import glob
+
+from obitools3.files.universalopener cimport uopen
+from obitools3.utils cimport tostr
+from obitools3.dms.obiseq cimport Nuc_Seq
+from .embl_genbank_features import extractTaxon # TODO cimport
+
+from libc.stdlib  cimport free, malloc, realloc
+from libc.string cimport strcpy, strlen
+
+
+_featureMatcher = re.compile(b'(^FT  .*\n)+', re.M)
+_cleanFT       = re.compile(b'^FT',re.M)
+
+_headerMatcher = re.compile(b'^ID.+(?=\nFH  )', re.DOTALL)
+_seqMatcher    = re.compile(b'(^    ).+(?=//\n)', re.DOTALL + re.M)
+_cleanSeq      = re.compile(b'[ \n0-9]+')
+_acMatcher     = re.compile(b'(?<=^AC   ).+',re.M)
+_deMatcher     = re.compile(b'(^DE   .+\n)+',re.M)
+_cleanDe       = re.compile(b'(^|\n)DE +')
+
+
+def emblParser(bytes text):
+    
+    cdef Nuc_Seq seq
+        
+    try:
+        header = _headerMatcher.search(text).group()
+
+        ft     = _featureMatcher.search(text).group()
+        ft     = _cleanFT.sub(b'  ',ft)
+        
+        s      = _seqMatcher.search(text).group()
+        s      = _cleanSeq.sub(b'', s).upper()
+        
+        acs    = _acMatcher.search(text).group()
+        acs    = acs.replace(b';', b' ')
+        acs    = acs.split()
+        ac     = acs[0]
+        acs    = acs[1:]
+
+        de     = _deMatcher.search(header).group()
+        de     = _cleanDe.sub(b' ', de).strip().strip(b'.')
+
+    except AttributeError,e:
+        print('=======================================================')
+        print(text)
+        print('=======================================================')
+        raise e
+        
+    tags = {}
+    extractTaxon(ft, tags)
+        
+    seq = Nuc_Seq(ac,
+                  s,
+                  definition=de,
+                  quality=None,
+                  offset=-1,
+                  tags=tags)
+    
+    return seq
+
+
+def emblIterator_file(lineiterator, 
+                      int skip=0,
+                      only=None,
+                      firstline=None,
+                      int buffersize=100000000
+                      ):
+    cdef int        lines_to_skip, ionly, read
+    cdef Nuc_Seq    seq
+    cdef char*      entry = NULL
+    cdef size_t     entry_buffer_size 
+    cdef int        entry_len
+    cdef int        line_len   
+    
+    entry_buffer_size = 2048
+    
+    entry = <char*> malloc(entry_buffer_size*sizeof(char)) 
+
+    if only is None:
+        ionly = -1
+    else:
+        ionly = int(only)
+
+    if isinstance(lineiterator, (str, bytes)):
+        lineiterator=uopen(lineiterator)        
+    if isinstance(lineiterator, LineBuffer):
+        iterator = iter(lineiterator)
+    else:
+        if hasattr(lineiterator, "readlines"):
+            iterator = iter(LineBuffer(lineiterator, buffersize))
+        elif hasattr(lineiterator, '__next__'):
+            iterator = lineiterator
+        else:
+            raise Exception("Invalid line iterator")
+
+    skipped = 0
+    read = 0
+    
+    if firstline is None:
+        line = next(iterator)
+    else:
+        line = firstline       
+        
+    DONECOUNT=0
+        
+    while True:
+        
+        if ionly >= 0 and read >= ionly:
+            break
+                
+        while skipped < skip:
+            line = next(iterator)
+            try:
+                while line[:2] != b'//':
+                    line = next(iterator)
+                line = next(iterator)
+            except StopIteration:
+                break
+            skipped += 1
+
+        try:
+            entry_len = 0
+            while line[:2] != b'//':
+                line_len = strlen(line)
+                while (entry_len + line_len) >= entry_buffer_size:
+                    entry_buffer_size*=2
+                    entry = <char*>realloc(entry, entry_buffer_size)
+                strcpy(entry+entry_len, line)
+                entry_len+=line_len
+                line = next(iterator)
+            # Add last line too because need the // flag to parse
+            line_len = strlen(line)
+            while (entry_len + line_len) >= entry_buffer_size:
+                entry_buffer_size*=2
+                entry = <char*>realloc(entry, entry_buffer_size)
+            strcpy(entry+entry_len, line)
+            line = next(iterator)
+        except StopIteration:
+            break
+                
+        seq = emblParser(entry)
+        
+        if DONECOUNT%10000 == 0:
+            print(DONECOUNT, "\n")
+        DONECOUNT+=1
+        
+        yield seq
+                
+        read+=1
+    
+    yield seq
+    
+    free(entry)
+        
+
+def emblIterator_dir(dir_path, 
+                     int skip=0,
+                     only=None,
+                     firstline=None,
+                     int buffersize=100000000
+                     ):
+    path = dir_path
+    for filename in glob.glob(os.path.join(path, b'*.dat*')):
+        f = uopen(filename)
+        for seq in emblIterator_file(f, skip=skip, only=only, buffersize=buffersize):
+            yield seq
+
+
+def emblIterator(obj, 
+                 int skip=0,
+                 only=None,
+                 firstline=None,
+                 int buffersize=100000000
+                ):
+    if type(obj) == bytes or type(obj) == str :
+        return emblIterator_dir(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize)
+    else:
+        return emblIterator_file(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize)
+
+
diff --git a/python/obitools3/parsers/embl_genbank_features.pxd b/python/obitools3/parsers/embl_genbank_features.pxd
new file mode 100644
index 0000000..1fdc434
--- /dev/null
+++ b/python/obitools3/parsers/embl_genbank_features.pxd
@@ -0,0 +1,2 @@
+#cython: language_level=3
+    
\ No newline at end of file
diff --git a/python/obitools3/parsers/embl_genbank_features.pyx b/python/obitools3/parsers/embl_genbank_features.pyx
new file mode 100644
index 0000000..985b351
--- /dev/null
+++ b/python/obitools3/parsers/embl_genbank_features.pyx
@@ -0,0 +1,151 @@
+#cython: language_level=3
+
+'''
+Created on June 12th 2018
+
+@author: coissac/mercier
+'''
+
+
+import logging
+import re
+from itertools import chain
+
+
+# TODO cython 
+# TODO import Location functions for Genbank stuff (src/obitools/location/__init__.py)
+
+        
+_featureMatcher = re.compile(b'^(FT|  )   [^ ].+\n((FT|  )    .+\n)+',re.M)
+_featureCleaner = re.compile(b'^FT',re.M)
+
+def textFeatureIterator(fttable):
+    '''
+    Iterate through a textual description of a feature table in a genbank
+    or embl format. Return at each step a text representation of each individual
+    feature composing the table.
+    
+    @param fttable:  a string corresponding to the feature table of a genbank
+                     or an embl entry
+                      
+    @type fttable: C{str}
+    
+    @return: an iterator on str
+    @rtype: iterator
+    
+    @see: L{ftParser}
+    '''
+    for m in _featureMatcher.finditer(fttable):
+        t = m.group()
+        t = _featureCleaner.sub(b'  ',t)
+        yield t
+   
+        
+_qualifierMatcher = re.compile(b'(?<=^ {21}/).+(\n {21}[^/].+)*',re.M)
+_qualifierCleanner= re.compile(b"^ +",re.M)
+        
+def qualifierIterator(qualifiers):
+    '''
+    Parse a textual description of a feature in embl or genbank format
+    as returned by the textFeatureIterator iterator and iterate through 
+    the key, value qualified defining this location.
+     
+    @param qualifiers: substring containing qualifiers
+    @type qualifiers: str
+    
+    @return: an iterator on tuple (key,value), where keys are C{str}
+    @rtype: iterator
+    '''
+    for m in _qualifierMatcher.finditer(qualifiers):
+        t = m.group()
+        t = _qualifierCleanner.sub(b'',t)
+        t = t.split(b'=',1)
+        if len(t)==1:
+            t = (t[0],None)
+        else:
+            if t[0]==b'translation':
+                value = t[1].replace(b'\n',b'')
+            else:
+                value = t[1].replace(b'\n',b' ')
+            try:
+                value = eval(value)
+            except:
+                pass
+            t = (t[0],value)
+        yield t
+    
+     
+_ftmatcher = re.compile(b'(?<=^ {5})\S+')
+_qualifiersMatcher = re.compile(b'^ +/.+',re.M+re.DOTALL)
+
+def ftParser(feature):
+    fttype = _ftmatcher.search(feature).group()
+    qualifiers=_qualifiersMatcher.search(feature)
+    if qualifiers is not None:
+        qualifiers=qualifiers.group()
+    else:
+        qualifiers=b""
+        logging.debug("Qualifiers regex not matching on \n=====\n%s\n========" % feature)
+
+    return fttype,qualifiers       
+
+
+class Feature(dict):
+    
+    def __init__(self,type):
+        self._fttype=type
+
+    def getFttype(self):
+        return self._fttype
+
+    def __str__(self):
+        return repr(self)
+    
+    def __repr__(self):
+        return str((self.ftType, dict.__repr__(self)))
+    
+    ftType = property(getFttype, None, None, "Feature type name")
+       
+
+def featureFactory(featureDescription):
+    fttype,qualifiers = ftParser(featureDescription)
+    feature = Feature(fttype)
+    feature.raw  = featureDescription
+
+    for k,v in qualifierIterator(qualifiers):
+        feature.setdefault(k,[]).append(v)
+        
+    return feature
+       
+        
+def featureIterator(featureTable,skipError=False):
+    for tft in textFeatureIterator(featureTable):
+        try:
+            feature = featureFactory(tft)
+        except AssertionError,e:
+            logging.debug("Parsing error on feature :\n===============\n%s\n===============" % tft)
+            if not skipError:
+                raise e
+            logging.debug("\t===> Error skipped")
+            continue
+            
+        yield feature
+
+        
+def extractTaxon(bytes text, dict tags):
+         
+    s = next(featureIterator(text))
+    s = [s]          
+    
+    t = set(int(v[6:]) for v in chain(*tuple(f[b'db_xref'] for f in s if b'db_xref' in f)) 
+            if  v[0:6]=='taxon:')
+    if len(t)==1 :
+        taxid=t.pop()
+        if taxid >=0:
+            tags[b'TAXID']=taxid      
+
+    t = set(chain(*tuple(f[b'organism'] for f in s if b'organism' in f))) 
+    if len(t)==1:
+        tags[b'organism']=t.pop()
+
+