Cython: embl file parser
This commit is contained in:
9
python/obitools3/parsers/embl.pxd
Normal file
9
python/obitools3/parsers/embl.pxd
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
#cython: language_level=3
|
||||||
|
|
||||||
|
from ..utils cimport str2bytes
|
||||||
|
from .header cimport parseHeader
|
||||||
|
from ..files.universalopener cimport uopen
|
||||||
|
from ..files.linebuffer cimport LineBuffer
|
||||||
|
|
||||||
|
|
||||||
|
|
195
python/obitools3/parsers/embl.pyx
Normal file
195
python/obitools3/parsers/embl.pyx
Normal file
@ -0,0 +1,195 @@
|
|||||||
|
#cython: language_level=3
|
||||||
|
|
||||||
|
'''
|
||||||
|
Created on June 12th 2018
|
||||||
|
|
||||||
|
@author: coissac/mercier
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
import types
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import glob
|
||||||
|
|
||||||
|
from obitools3.files.universalopener cimport uopen
|
||||||
|
from obitools3.utils cimport tostr
|
||||||
|
from obitools3.dms.obiseq cimport Nuc_Seq
|
||||||
|
from .embl_genbank_features import extractTaxon # TODO cimport
|
||||||
|
|
||||||
|
from libc.stdlib cimport free, malloc, realloc
|
||||||
|
from libc.string cimport strcpy, strlen
|
||||||
|
|
||||||
|
|
||||||
|
# Pre-compiled byte patterns used to slice one EMBL flat-file entry into
# its sections.
# NOTE(review): runs of spaces inside these byte literals may have been
# collapsed by the rendering of this file — verify them against the EMBL
# flat-file column layout before relying on exact spacing.

# All consecutive feature-table lines ('FT ...') of an entry.
_featureMatcher = re.compile(b'(^FT .*\n)+', re.M)
# Strips the leading 'FT' tag from feature-table lines.
_cleanFT = re.compile(b'^FT',re.M)

# Entry header: from the 'ID' line up to (not including) the 'FH ' line.
_headerMatcher = re.compile(b'^ID.+(?=\nFH )', re.DOTALL)
# Sequence block: indented lines up to (not including) the '//' terminator.
_seqMatcher = re.compile(b'(^ ).+(?=//\n)', re.DOTALL + re.M)
# Whitespace and position numbers to delete from the raw sequence block.
_cleanSeq = re.compile(b'[ \n0-9]+')
# Accession-number list on the 'AC' line.
_acMatcher = re.compile(b'(?<=^AC ).+',re.M)
# Description: all consecutive 'DE' lines.
_deMatcher = re.compile(b'(^DE .+\n)+',re.M)
# Strips the leading 'DE' tag from description lines.
_cleanDe = re.compile(b'(^|\n)DE +')
|
def emblParser(bytes text):
    """Parse one complete EMBL flat-file entry into a Nuc_Seq.

    text -- raw bytes of a single entry, up to and including the '//'
            terminator line.

    Returns a Nuc_Seq whose id is the entry's first accession number,
    carrying the cleaned upper-case sequence, the concatenated 'DE'
    description, and tags (taxid/organism) extracted from the feature
    table by extractTaxon().

    Raises AttributeError when a mandatory section of the entry cannot
    be located (the failing entry is dumped for debugging first).
    """
    cdef Nuc_Seq seq

    try:
        # Entry header: 'ID' line up to the feature table header.
        header = _headerMatcher.search(text).group()

        # Feature table, with the leading 'FT' tags blanked out so the
        # layout matches what the feature parser expects.
        ft = _featureMatcher.search(text).group()
        ft = _cleanFT.sub(b' ',ft)

        # Sequence block: drop coordinates/whitespace, normalize case.
        s = _seqMatcher.search(text).group()
        s = _cleanSeq.sub(b'', s).upper()

        # Accession numbers: the first one becomes the entry id.
        acs = _acMatcher.search(text).group()
        acs = acs.replace(b';', b' ')
        acs = acs.split()
        ac = acs[0]
        acs = acs[1:]

        # Definition: concatenated 'DE' lines, trimmed of the final dot.
        de = _deMatcher.search(header).group()
        de = _cleanDe.sub(b' ', de).strip().strip(b'.')

    except AttributeError as e:  # fixed: 'except X,e' is Python 2 syntax,
                                 # invalid under language_level=3
        print('=======================================================')
        print(text)
        print('=======================================================')
        raise e

    tags = {}
    extractTaxon(ft, tags)

    seq = Nuc_Seq(ac,
                  s,
                  definition=de,
                  quality=None,
                  offset=-1,
                  tags=tags)

    return seq
||||||
|
def emblIterator_file(lineiterator,
                      int skip=0,
                      only=None,
                      firstline=None,
                      int buffersize=100000000
                      ):
    # Generator yielding Nuc_Seq objects parsed from an EMBL flat file.
    #
    # lineiterator -- a path (str/bytes), an open file, a LineBuffer, or
    #                 any iterator of bytes lines
    # skip         -- number of leading entries to discard before yielding
    # only         -- maximum number of entries to yield (None = all)
    # firstline    -- an already-consumed first line to process first
    # buffersize   -- size handed to LineBuffer when one is created here
    cdef int lines_to_skip, ionly, read  # NOTE(review): lines_to_skip unused
    cdef Nuc_Seq seq
    cdef char* entry = NULL
    cdef size_t entry_buffer_size
    cdef int entry_len
    cdef int line_len

    # Growable C buffer into which the lines of the current entry are
    # concatenated before being handed to emblParser().
    entry_buffer_size = 2048

    entry = <char*> malloc(entry_buffer_size*sizeof(char))

    # ionly == -1 means "no limit".
    if only is None:
        ionly = -1
    else:
        ionly = int(only)

    # Normalize whatever we were given into a plain line iterator.
    if isinstance(lineiterator, (str, bytes)):
        lineiterator=uopen(lineiterator)
    if isinstance(lineiterator, LineBuffer):
        iterator = iter(lineiterator)
    else:
        if hasattr(lineiterator, "readlines"):
            iterator = iter(LineBuffer(lineiterator, buffersize))
        elif hasattr(lineiterator, '__next__'):
            iterator = lineiterator
        else:
            raise Exception("Invalid line iterator")

    skipped = 0
    read = 0

    # `line` always holds the next unconsumed line of the file.
    if firstline is None:
        line = next(iterator)
    else:
        line = firstline

    # Debug progress counter (printed every 10000 entries below).
    DONECOUNT=0

    while True:

        # Stop once the requested number of entries has been yielded.
        if ionly >= 0 and read >= ionly:
            break

        # Discard `skip` complete entries (scan to each '//' terminator).
        # NOTE(review): the first next() here drops the line currently
        # held in `line` — harmless for entry counting, but confirm.
        while skipped < skip:
            line = next(iterator)
            try:
                while line[:2] != b'//':
                    line = next(iterator)
                line = next(iterator)
            except StopIteration:
                break
            skipped += 1

        try:
            # Accumulate one entry's lines into the C buffer, doubling
            # the buffer as needed; strcpy NUL-terminates after each copy.
            entry_len = 0
            while line[:2] != b'//':
                line_len = strlen(line)
                while (entry_len + line_len) >= entry_buffer_size:
                    entry_buffer_size*=2
                    entry = <char*>realloc(entry, entry_buffer_size)
                strcpy(entry+entry_len, line)
                entry_len+=line_len
                line = next(iterator)
            # Add last line too because need the // flag to parse
            line_len = strlen(line)
            while (entry_len + line_len) >= entry_buffer_size:
                entry_buffer_size*=2
                entry = <char*>realloc(entry, entry_buffer_size)
            strcpy(entry+entry_len, line)
            line = next(iterator)
        except StopIteration:
            # NOTE(review): if EOF is hit on the lookahead next() just
            # above, the fully-accumulated final entry is never parsed;
            # the trailing `yield seq` below then re-yields the previous
            # entry instead — this looks like a bug, confirm intent.
            break

        # `entry` is converted to bytes at its terminating NUL.
        seq = emblParser(entry)

        # Debug progress output.
        if DONECOUNT%10000 == 0:
            print(DONECOUNT, "\n")
        DONECOUNT+=1

        yield seq

        read+=1

    # NOTE(review): reached after either break above; yields the last
    # parsed entry one extra time — confirm (see note on StopIteration).
    yield seq

    # NOTE(review): not in a try/finally, so the buffer leaks if the
    # generator is abandoned before exhaustion.
    free(entry)
def emblIterator_dir(dir_path,
                     int skip=0,
                     only=None,
                     firstline=None,
                     int buffersize=100000000
                     ):
    """Iterate over the sequences of every '*.dat*' EMBL file in a directory.

    Runs emblIterator_file() over each matching file, in glob order,
    yielding each Nuc_Seq in turn.  The `firstline` argument is accepted
    for signature compatibility but is not forwarded to the per-file
    iterator.
    """
    for embl_path in glob.glob(os.path.join(dir_path, b'*.dat*')):
        stream = uopen(embl_path)
        inner = emblIterator_file(stream,
                                  skip=skip,
                                  only=only,
                                  buffersize=buffersize)
        for sequence in inner:
            yield sequence
def emblIterator(obj,
                 int skip=0,
                 only=None,
                 firstline=None,
                 int buffersize=100000000
                 ):
    """Dispatch to the directory or file EMBL iterator.

    obj -- a str/bytes path (assumed to name a directory of '*.dat*'
           EMBL files) or an open stream / line iterator.

    Returns the generator produced by emblIterator_dir() or
    emblIterator_file() with all options forwarded.
    """
    # isinstance is the idiomatic (and subclass-friendly) type test.
    if isinstance(obj, (bytes, str)):
        return emblIterator_dir(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize)
    else:
        return emblIterator_file(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize)
2
python/obitools3/parsers/embl_genbank_features.pxd
Normal file
2
python/obitools3/parsers/embl_genbank_features.pxd
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#cython: language_level=3
|
||||||
|
|
151
python/obitools3/parsers/embl_genbank_features.pyx
Normal file
151
python/obitools3/parsers/embl_genbank_features.pyx
Normal file
@ -0,0 +1,151 @@
|
|||||||
|
#cython: language_level=3
|
||||||
|
|
||||||
|
'''
|
||||||
|
Created on June 12th 2018
|
||||||
|
|
||||||
|
@author: coissac/mercier
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from itertools import chain
|
||||||
|
|
||||||
|
|
||||||
|
# TODO cython
|
||||||
|
# TODO import Location functions for Genbank stuff (src/obitools/location/__init__.py)
|
||||||
|
|
||||||
|
|
||||||
|
# One feature = a key line followed by its continuation lines, in either
# EMBL ('FT' tag) or genbank (leading spaces) layout.
# NOTE(review): runs of spaces inside these byte literals may have been
# collapsed by the rendering of this file — verify against the flat-file
# column layout.
_featureMatcher = re.compile(b'^(FT| ) [^ ].+\n((FT| ) .+\n)+',re.M)
# Replaces the EMBL 'FT' tag so features look like genbank ones.
_featureCleaner = re.compile(b'^FT',re.M)
def textFeatureIterator(fttable):
    '''
    Yield the textual description of each feature of a feature table.

    Works on both genbank and embl feature tables: in each yielded
    feature the embl 'FT' tags are replaced by spaces so the two formats
    look alike downstream.

    @param fttable: a string corresponding to the feature table of a
                    genbank or an embl entry
    @type fttable: C{str}

    @return: an iterator on str
    @rtype: iterator

    @see: L{ftParser}
    '''
    for hit in _featureMatcher.finditer(fttable):
        feature_text = hit.group()
        feature_text = _featureCleaner.sub(b' ', feature_text)
        yield feature_text
# One qualifier: '/key=value' starting at column 22, plus continuation
# lines indented 21 spaces that do not start a new qualifier.
_qualifierMatcher = re.compile(b'(?<=^ {21}/).+(\n {21}[^/].+)*',re.M)
# Leading indentation to strip from qualifier text.
_qualifierCleanner= re.compile(b"^ +",re.M)
def qualifierIterator(qualifiers):
    '''
    Parse a textual description of a feature in embl or genbank format
    as returned by the textFeatureIterator iterator and iterate through
    the (key, value) qualifiers defining this feature.

    @param qualifiers: substring containing qualifiers
    @type qualifiers: str

    @return: an iterator on tuple (key,value), where keys are C{str};
             a qualifier without '=' yields (key, None)
    @rtype: iterator
    '''
    for m in _qualifierMatcher.finditer(qualifiers):
        t = m.group()
        t = _qualifierCleanner.sub(b'',t)
        t = t.split(b'=',1)
        if len(t)==1:
            # Flag-style qualifier with no value.
            t = (t[0],None)
        else:
            if t[0]==b'translation':
                # Translations are one long string: join the lines.
                value = t[1].replace(b'\n',b'')
            else:
                value = t[1].replace(b'\n',b' ')
            # SECURITY: eval() executes arbitrary code from the input
            # file.  It is kept because downstream code (extractTaxon)
            # relies on it turning quoted byte values into str/int, but
            # it must never be fed untrusted data — consider
            # ast.literal_eval instead.
            try:
                value = eval(value)
            except Exception:  # narrowed from bare 'except:' so that
                pass           # SystemExit/KeyboardInterrupt propagate
            t = (t[0],value)
        yield t
# Feature type keyword at column 6 of the (cleaned) first feature line.
# NOTE(review): no re.M flag here, so '^' anchors only at the start of
# the feature text — confirm this is intended.
_ftmatcher = re.compile(b'(?<=^ {5})\S+')
# Everything from the first '/qualifier' to the end of the feature text.
_qualifiersMatcher = re.compile(b'^ +/.+',re.M+re.DOTALL)
def ftParser(feature):
    """Split one feature text into its type keyword and qualifier text.

    Returns a (fttype, qualifiers) tuple; `qualifiers` is b"" when the
    feature carries no qualifier at all (logged at debug level).
    """
    fttype = _ftmatcher.search(feature).group()
    qualifier_match = _qualifiersMatcher.search(feature)
    if qualifier_match is None:
        qualifiers = b""
        logging.debug("Qualifiers regex not matching on \n=====\n%s\n========" % feature)
    else:
        qualifiers = qualifier_match.group()

    return fttype,qualifiers
class Feature(dict):
    """A single feature of an EMBL/genbank feature table.

    Behaves as a dict mapping qualifier keys to lists of values, and
    additionally records the feature type (e.g. b'CDS', b'source'),
    exposed through the read-only `ftType` property.
    """

    def __init__(self,type):
        # Initialize the underlying dict (qualifiers are added by the
        # caller), then record the feature type name.
        super().__init__()
        self._fttype=type

    def getFttype(self):
        """Return the feature type name (kept for API compatibility;
        prefer the `ftType` property)."""
        return self._fttype

    def __str__(self):
        return repr(self)

    def __repr__(self):
        # Render as a (type, qualifiers) pair for debugging.
        return str((self.ftType, dict.__repr__(self)))

    ftType = property(getFttype, None, None, "Feature type name")
def featureFactory(featureDescription):
    """Build a Feature from the textual description of one feature.

    Qualifier values are accumulated into lists, so repeated qualifiers
    keep every occurrence.  The raw text is kept on the `raw` attribute.
    """
    ft_type, qualifier_text = ftParser(featureDescription)
    built = Feature(ft_type)
    built.raw = featureDescription

    for key, val in qualifierIterator(qualifier_text):
        built.setdefault(key, []).append(val)

    return built
def featureIterator(featureTable,skipError=False):
    """Iterate over Feature objects parsed from a feature table.

    featureTable -- bytes of a feature table (EMBL or genbank layout)
    skipError    -- when True, features that fail to parse are logged
                    at debug level and skipped instead of raising.

    Raises AssertionError from the feature parser when skipError is
    False.
    """
    for tft in textFeatureIterator(featureTable):
        try:
            feature = featureFactory(tft)
        except AssertionError as e:  # fixed: 'except X,e' is Python 2
                                     # syntax, invalid at language_level=3
            logging.debug("Parsing error on feature :\n===============\n%s\n===============" % tft)
            if not skipError:
                raise e
            logging.debug("\t===> Error skipped")
            continue

        yield feature
def extractTaxon(bytes text, dict tags):
    # Extract the taxid and organism name from a feature table and store
    # them in `tags` under b'TAXID' and b'organism'.
    #
    # Only the FIRST feature of the table is examined (normally the
    # 'source' feature, which carries /db_xref and /organism) — the list
    # wrapper keeps the set-building code below generic over features.
    s = next(featureIterator(text))
    s = [s]

    # NOTE: qualifier values are eval()ed by qualifierIterator, so a
    # db_xref such as b'"taxon:9606"' arrives here as the *str*
    # 'taxon:9606' — hence the str literal 'taxon:' below; values whose
    # eval failed (still bytes) are silently ignored by the comparison.
    t = set(int(v[6:]) for v in chain(*tuple(f[b'db_xref'] for f in s if b'db_xref' in f))
        if v[0:6]=='taxon:')
    # Record the taxid only when it is unambiguous and non-negative.
    if len(t)==1 :
        taxid=t.pop()
        if taxid >=0:
            tags[b'TAXID']=taxid

    # Same rule for the organism name: keep it only if unambiguous.
    t = set(chain(*tuple(f[b'organism'] for f in s if b'organism' in f)))
    if len(t)==1:
        tags[b'organism']=t.pop()
|
Reference in New Issue
Block a user