Cython: embl file parser
python/obitools3/parsers/embl.pxd (Normal file, +9 lines)
@@ -0,0 +1,9 @@
#cython: language_level=3

from ..utils cimport str2bytes
from .header cimport parseHeader
from ..files.universalopener cimport uopen
from ..files.linebuffer cimport LineBuffer
python/obitools3/parsers/embl.pyx (Normal file, +195 lines)
@@ -0,0 +1,195 @@
#cython: language_level=3

'''
Created on June 12th 2018

@author: coissac/mercier
'''


import types
import re
import sys
import os
import glob

from obitools3.files.universalopener cimport uopen
from obitools3.utils cimport tostr
from obitools3.dms.obiseq cimport Nuc_Seq
from .embl_genbank_features import extractTaxon  # TODO cimport

from libc.stdlib cimport free, malloc, realloc
from libc.string cimport strcpy, strlen


_featureMatcher = re.compile(b'(^FT .*\n)+', re.M)
_cleanFT = re.compile(b'^FT', re.M)

_headerMatcher = re.compile(b'^ID.+(?=\nFH )', re.DOTALL)
_seqMatcher = re.compile(b'(^ ).+(?=//\n)', re.DOTALL + re.M)
_cleanSeq = re.compile(b'[ \n0-9]+')
_acMatcher = re.compile(b'(?<=^AC ).+', re.M)
_deMatcher = re.compile(b'(^DE .+\n)+', re.M)
_cleanDe = re.compile(b'(^|\n)DE +')

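Illustration (editor's sketch, not part of the committed file): how the accession and definition patterns above slice an entry. The two-line record below is invented for the example; real EMBL records pad the two-letter line codes to a fixed column, which these patterns still accept because they only require a single space after the code.

import re

_acMatcher = re.compile(b'(?<=^AC ).+', re.M)
_deMatcher = re.compile(b'(^DE .+\n)+', re.M)
_cleanDe = re.compile(b'(^|\n)DE +')

entry = (b'ID   AB000001; SV 1; linear; genomic DNA; STD; HUM; 180 BP.\n'
         b'AC   AB000001; X99999;\n'
         b'DE   Homo sapiens gene for a hypothetical protein,\n'
         b'DE   partial cds.\n')

acs = _acMatcher.search(entry).group().replace(b';', b' ').split()
print(acs[0], acs[1:])                              # b'AB000001' [b'X99999']

de = _deMatcher.search(entry).group()
print(_cleanDe.sub(b' ', de).strip().strip(b'.'))   # the definition joined on a single line
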
def emblParser(bytes text):

    cdef Nuc_Seq seq

    try:
        header = _headerMatcher.search(text).group()

        ft = _featureMatcher.search(text).group()
        ft = _cleanFT.sub(b' ', ft)

        s = _seqMatcher.search(text).group()
        s = _cleanSeq.sub(b'', s).upper()

        acs = _acMatcher.search(text).group()
        acs = acs.replace(b';', b' ')
        acs = acs.split()
        ac = acs[0]
        acs = acs[1:]

        de = _deMatcher.search(header).group()
        de = _cleanDe.sub(b' ', de).strip().strip(b'.')

    except AttributeError as e:
        print('=======================================================')
        print(text)
        print('=======================================================')
        raise e

    tags = {}
    extractTaxon(ft, tags)

    seq = Nuc_Seq(ac,
                  s,
                  definition=de,
                  quality=None,
                  offset=-1,
                  tags=tags)

    return seq

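A minimal usage sketch (illustration only): read one raw entry, terminated by the '//' line, from a flat file and hand it to emblParser. The file name is hypothetical and the compiled obitools3.parsers.embl extension is assumed to be importable; emblParser returns a Nuc_Seq carrying the accession, sequence, definition and the tags filled by extractTaxon.

from obitools3.parsers.embl import emblParser    # assumes the extension is built

entry = bytearray()
with open('example.dat', 'rb') as f:             # hypothetical EMBL flat file
    for line in f:
        entry += line
        if line.startswith(b'//'):               # '//' closes an EMBL entry
            break

seq = emblParser(bytes(entry))
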

def emblIterator_file(lineiterator,
                      int skip=0,
                      only=None,
                      firstline=None,
                      int buffersize=100000000
                      ):
    cdef int lines_to_skip, ionly, read
    cdef Nuc_Seq seq
    cdef char* entry = NULL
    cdef size_t entry_buffer_size
    cdef int entry_len
    cdef int line_len

    entry_buffer_size = 2048

    entry = <char*> malloc(entry_buffer_size*sizeof(char))

    if only is None:
        ionly = -1
    else:
        ionly = int(only)

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)
    if isinstance(lineiterator, LineBuffer):
        iterator = iter(lineiterator)
    else:
        if hasattr(lineiterator, "readlines"):
            iterator = iter(LineBuffer(lineiterator, buffersize))
        elif hasattr(lineiterator, '__next__'):
            iterator = lineiterator
        else:
            raise Exception("Invalid line iterator")

    skipped = 0
    read = 0

    if firstline is None:
        line = next(iterator)
    else:
        line = firstline

    DONECOUNT = 0

    while True:

        if ionly >= 0 and read >= ionly:
            break

        while skipped < skip:
            line = next(iterator)
            try:
                while line[:2] != b'//':
                    line = next(iterator)
                line = next(iterator)
            except StopIteration:
                break
            skipped += 1

        try:
            entry_len = 0
            while line[:2] != b'//':
                line_len = strlen(line)
                while (entry_len + line_len) >= entry_buffer_size:
                    entry_buffer_size *= 2
                    entry = <char*>realloc(entry, entry_buffer_size)
                strcpy(entry+entry_len, line)
                entry_len += line_len
                line = next(iterator)
            # Add the terminating line too because the parser needs the // flag
            line_len = strlen(line)
            while (entry_len + line_len) >= entry_buffer_size:
                entry_buffer_size *= 2
                entry = <char*>realloc(entry, entry_buffer_size)
            strcpy(entry+entry_len, line)
            line = next(iterator)
        except StopIteration:
            break

        seq = emblParser(entry)

        if DONECOUNT % 10000 == 0:
            print(DONECOUNT, "\n")
        DONECOUNT += 1

        yield seq

        read += 1

    # The StopIteration above fires before the last collected entry has been
    # parsed, so parse and yield it here (unless the `only` limit stopped us).
    if ionly < 0 or read < ionly:
        seq = emblParser(entry)
        yield seq

    free(entry)

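The nested `while ... entry_buffer_size *= 2` loops above grow the C buffer geometrically before each copy, so accumulating an entry of N bytes costs O(N) amortised instead of O(N^2). A standalone sketch of the same growth policy in plain Python (the lines and the small starting capacity are invented for the example):

def grow_to_fit(buf, needed):
    # Double the capacity until `needed` bytes fit, mirroring the realloc above.
    capacity = max(len(buf), 1)
    while needed >= capacity:
        capacity *= 2
    if capacity > len(buf):
        buf.extend(b'\x00' * (capacity - len(buf)))

buf = bytearray(8)
used = 0
for line in (b'ID   X00001; SV 1;\n', b'SQ   Sequence 4 BP;\n', b'//\n'):
    grow_to_fit(buf, used + len(line))
    buf[used:used + len(line)] = line
    used += len(line)
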

def emblIterator_dir(dir_path,
                     int skip=0,
                     only=None,
                     firstline=None,
                     int buffersize=100000000
                     ):
    path = dir_path
    for filename in glob.glob(os.path.join(path, b'*.dat*')):
        f = uopen(filename)
        for seq in emblIterator_file(f, skip=skip, only=only, buffersize=buffersize):
            yield seq

def emblIterator(obj,
                 int skip=0,
                 only=None,
                 firstline=None,
                 int buffersize=100000000
                 ):
    if isinstance(obj, (bytes, str)):
        return emblIterator_dir(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize)
    else:
        return emblIterator_file(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize)

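Usage sketch (illustration only): emblIterator dispatches on its first argument, so a str or bytes path is treated as a directory of .dat files while anything else is treated as an open file or line iterator. The paths below are hypothetical and the compiled module is assumed to be importable.

from obitools3.parsers.embl import emblIterator

# A directory containing EMBL flat files matching *.dat*
for seq in emblIterator(b'/data/embl'):
    pass

# An already opened flat file, keeping only the first 1000 entries
with open('/data/embl/single_file.dat', 'rb') as f:
    for seq in emblIterator(f, only=1000):
        pass
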
python/obitools3/parsers/embl_genbank_features.pxd (Normal file, +2 lines)
@@ -0,0 +1,2 @@
#cython: language_level=3
python/obitools3/parsers/embl_genbank_features.pyx (Normal file, +151 lines)
@@ -0,0 +1,151 @@
#cython: language_level=3

'''
Created on June 12th 2018

@author: coissac/mercier
'''


import logging
import re
from itertools import chain


# TODO cython
# TODO import Location functions for Genbank stuff (src/obitools/location/__init__.py)


_featureMatcher = re.compile(b'^(FT| ) [^ ].+\n((FT| ) .+\n)+', re.M)
_featureCleaner = re.compile(b'^FT', re.M)

def textFeatureIterator(fttable):
    '''
    Iterate through the textual description of a feature table in GenBank
    or EMBL format, yielding at each step the text of one individual
    feature of the table.

    @param fttable: the feature table of a GenBank or EMBL entry
    @type fttable: C{bytes}

    @return: an iterator on C{bytes}
    @rtype: iterator

    @see: L{ftParser}
    '''
    for m in _featureMatcher.finditer(fttable):
        t = m.group()
        t = _featureCleaner.sub(b' ', t)
        yield t


_qualifierMatcher = re.compile(b'(?<=^ {21}/).+(\n {21}[^/].+)*', re.M)
_qualifierCleanner = re.compile(b"^ +", re.M)


def qualifierIterator(qualifiers):
    '''
    Parse the textual description of a feature in EMBL or GenBank format,
    as yielded by textFeatureIterator, and iterate through the (key, value)
    qualifiers defining this feature.

    @param qualifiers: the substring containing the qualifiers
    @type qualifiers: C{bytes}

    @return: an iterator on tuple (key, value), where keys are C{bytes}
    @rtype: iterator
    '''
    for m in _qualifierMatcher.finditer(qualifiers):
        t = m.group()
        t = _qualifierCleanner.sub(b'', t)
        t = t.split(b'=', 1)
        if len(t) == 1:
            t = (t[0], None)
        else:
            if t[0] == b'translation':
                value = t[1].replace(b'\n', b'')
            else:
                value = t[1].replace(b'\n', b' ')
            try:
                # Quoted or numeric values are evaluated, so b'"taxon:9606"'
                # becomes the str 'taxon:9606' and b'2' becomes the int 2.
                value = eval(value)
            except:
                pass
            t = (t[0], value)
        yield t


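Illustration (editor's sketch, not part of the committed file): what the qualifier pattern extracts from a feature whose qualifiers sit in the fixed 21-column layout, and how the eval step turns quoted byte values into str. The sample block is invented and only the re module is needed:

import re

_qualifierMatcher = re.compile(b'(?<=^ {21}/).+(\n {21}[^/].+)*', re.M)

pad = b' ' * 21
qualifiers = (pad + b'/organism="Homo sapiens"\n'
              + pad + b'/db_xref="taxon:9606"\n')

for m in _qualifierMatcher.finditer(qualifiers):
    key, value = m.group().split(b'=', 1)
    print(key, eval(value))    # b'organism' Homo sapiens, then b'db_xref' taxon:9606
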
_ftmatcher = re.compile(b'(?<=^ {5})\S+')
_qualifiersMatcher = re.compile(b'^ +/.+', re.M + re.DOTALL)


def ftParser(feature):
    fttype = _ftmatcher.search(feature).group()
    qualifiers = _qualifiersMatcher.search(feature)
    if qualifiers is not None:
        qualifiers = qualifiers.group()
    else:
        qualifiers = b""
        logging.debug("Qualifiers regex not matching on \n=====\n%s\n========" % feature)

    return fttype, qualifiers


class Feature(dict):

    def __init__(self, type):
        self._fttype = type

    def getFttype(self):
        return self._fttype

    def __str__(self):
        return repr(self)

    def __repr__(self):
        return str((self.ftType, dict.__repr__(self)))

    ftType = property(getFttype, None, None, "Feature type name")


def featureFactory(featureDescription):
    fttype, qualifiers = ftParser(featureDescription)
    feature = Feature(fttype)
    feature.raw = featureDescription

    for k, v in qualifierIterator(qualifiers):
        feature.setdefault(k, []).append(v)

    return feature


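A minimal usage sketch (illustration only), assuming the compiled extension is importable as obitools3.parsers.embl_genbank_features: featureFactory turns one feature's text into a Feature dict keyed by qualifier name. The feature text is invented and uses the fixed 5-column key / 21-column qualifier layout the patterns above expect.

from obitools3.parsers.embl_genbank_features import featureFactory

pad = b' ' * 21
feature_text = (b' ' * 5 + b'source          1..180\n'
                + pad + b'/organism="Homo sapiens"\n'
                + pad + b'/db_xref="taxon:9606"\n')

f = featureFactory(feature_text)
print(f.ftType)        # b'source'
print(f[b'db_xref'])   # ['taxon:9606']
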
def featureIterator(featureTable, skipError=False):
    for tft in textFeatureIterator(featureTable):
        try:
            feature = featureFactory(tft)
        except AssertionError as e:
            logging.debug("Parsing error on feature :\n===============\n%s\n===============" % tft)
            if not skipError:
                raise e
            logging.debug("\t===> Error skipped")
            continue

        yield feature


def extractTaxon(bytes text, dict tags):

    s = next(featureIterator(text))
    s = [s]

    # Qualifier values have been passed through eval() by qualifierIterator,
    # so quoted db_xref values such as "taxon:9606" arrive here as str.
    t = set(int(v[6:]) for v in chain(*tuple(f[b'db_xref'] for f in s if b'db_xref' in f))
            if v[0:6] == 'taxon:')
    if len(t) == 1:
        taxid = t.pop()
        if taxid >= 0:
            tags[b'TAXID'] = taxid

    t = set(chain(*tuple(f[b'organism'] for f in s if b'organism' in f)))
    if len(t) == 1:
        tags[b'organism'] = t.pop()
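Illustration (editor's sketch): the chain/set construction above collects every taxon db_xref of the first feature and keeps the taxid only when it is unambiguous. The same logic on hand-built data:

from itertools import chain

features = [{b'db_xref': ['taxon:9606', 'MIM:147571'], b'organism': ['Homo sapiens']}]

t = set(int(v[6:])
        for v in chain(*tuple(f[b'db_xref'] for f in features if b'db_xref' in f))
        if v[0:6] == 'taxon:')
print(t)    # {9606}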