From 303648bd4723e87724cbbca9f76e1b50da78da4a Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Sat, 28 Jul 2018 17:14:10 +0200 Subject: [PATCH] Cython: embl file parser --- python/obitools3/parsers/embl.pxd | 9 + python/obitools3/parsers/embl.pyx | 195 ++++++++++++++++++ .../parsers/embl_genbank_features.pxd | 2 + .../parsers/embl_genbank_features.pyx | 151 ++++++++++++++ 4 files changed, 357 insertions(+) create mode 100644 python/obitools3/parsers/embl.pxd create mode 100644 python/obitools3/parsers/embl.pyx create mode 100644 python/obitools3/parsers/embl_genbank_features.pxd create mode 100644 python/obitools3/parsers/embl_genbank_features.pyx diff --git a/python/obitools3/parsers/embl.pxd b/python/obitools3/parsers/embl.pxd new file mode 100644 index 0000000..8962d37 --- /dev/null +++ b/python/obitools3/parsers/embl.pxd @@ -0,0 +1,9 @@ +#cython: language_level=3 + +from ..utils cimport str2bytes +from .header cimport parseHeader +from ..files.universalopener cimport uopen +from ..files.linebuffer cimport LineBuffer + + + \ No newline at end of file diff --git a/python/obitools3/parsers/embl.pyx b/python/obitools3/parsers/embl.pyx new file mode 100644 index 0000000..4c90fd6 --- /dev/null +++ b/python/obitools3/parsers/embl.pyx @@ -0,0 +1,195 @@ +#cython: language_level=3 + +''' +Created on June 12th 2018 + +@author: coissac/mercier +''' + + +import types +import re +import sys +import os +import glob + +from obitools3.files.universalopener cimport uopen +from obitools3.utils cimport tostr +from obitools3.dms.obiseq cimport Nuc_Seq +from .embl_genbank_features import extractTaxon # TODO cimport + +from libc.stdlib cimport free, malloc, realloc +from libc.string cimport strcpy, strlen + + +_featureMatcher = re.compile(b'(^FT .*\n)+', re.M) +_cleanFT = re.compile(b'^FT',re.M) + +_headerMatcher = re.compile(b'^ID.+(?=\nFH )', re.DOTALL) +_seqMatcher = re.compile(b'(^ ).+(?=//\n)', re.DOTALL + re.M) +_cleanSeq = re.compile(b'[ \n0-9]+') +_acMatcher = re.compile(b'(?<=^AC ).+',re.M) +_deMatcher = re.compile(b'(^DE .+\n)+',re.M) +_cleanDe = re.compile(b'(^|\n)DE +') + + +def emblParser(bytes text): + + cdef Nuc_Seq seq + + try: + header = _headerMatcher.search(text).group() + + ft = _featureMatcher.search(text).group() + ft = _cleanFT.sub(b' ',ft) + + s = _seqMatcher.search(text).group() + s = _cleanSeq.sub(b'', s).upper() + + acs = _acMatcher.search(text).group() + acs = acs.replace(b';', b' ') + acs = acs.split() + ac = acs[0] + acs = acs[1:] + + de = _deMatcher.search(header).group() + de = _cleanDe.sub(b' ', de).strip().strip(b'.') + + except AttributeError,e: + print('=======================================================') + print(text) + print('=======================================================') + raise e + + tags = {} + extractTaxon(ft, tags) + + seq = Nuc_Seq(ac, + s, + definition=de, + quality=None, + offset=-1, + tags=tags) + + return seq + + +def emblIterator_file(lineiterator, + int skip=0, + only=None, + firstline=None, + int buffersize=100000000 + ): + cdef int lines_to_skip, ionly, read + cdef Nuc_Seq seq + cdef char* entry = NULL + cdef size_t entry_buffer_size + cdef int entry_len + cdef int line_len + + entry_buffer_size = 2048 + + entry = malloc(entry_buffer_size*sizeof(char)) + + if only is None: + ionly = -1 + else: + ionly = int(only) + + if isinstance(lineiterator, (str, bytes)): + lineiterator=uopen(lineiterator) + if isinstance(lineiterator, LineBuffer): + iterator = iter(lineiterator) + else: + if hasattr(lineiterator, "readlines"): + iterator = iter(LineBuffer(lineiterator, buffersize)) + elif hasattr(lineiterator, '__next__'): + iterator = lineiterator + else: + raise Exception("Invalid line iterator") + + skipped = 0 + read = 0 + + if firstline is None: + line = next(iterator) + else: + line = firstline + + DONECOUNT=0 + + while True: + + if ionly >= 0 and read >= ionly: + break + + while skipped < skip: + line = next(iterator) + try: + while line[:2] != b'//': + line = next(iterator) + line = next(iterator) + except StopIteration: + break + skipped += 1 + + try: + entry_len = 0 + while line[:2] != b'//': + line_len = strlen(line) + while (entry_len + line_len) >= entry_buffer_size: + entry_buffer_size*=2 + entry = realloc(entry, entry_buffer_size) + strcpy(entry+entry_len, line) + entry_len+=line_len + line = next(iterator) + # Add last line too because need the // flag to parse + line_len = strlen(line) + while (entry_len + line_len) >= entry_buffer_size: + entry_buffer_size*=2 + entry = realloc(entry, entry_buffer_size) + strcpy(entry+entry_len, line) + line = next(iterator) + except StopIteration: + break + + seq = emblParser(entry) + + if DONECOUNT%10000 == 0: + print(DONECOUNT, "\n") + DONECOUNT+=1 + + yield seq + + read+=1 + + yield seq + + free(entry) + + +def emblIterator_dir(dir_path, + int skip=0, + only=None, + firstline=None, + int buffersize=100000000 + ): + path = dir_path + for filename in glob.glob(os.path.join(path, b'*.dat*')): + f = uopen(filename) + for seq in emblIterator_file(f, skip=skip, only=only, buffersize=buffersize): + yield seq + + +def emblIterator(obj, + int skip=0, + only=None, + firstline=None, + int buffersize=100000000 + ): + if type(obj) == bytes or type(obj) == str : + return emblIterator_dir(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize) + else: + return emblIterator_file(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize) + + diff --git a/python/obitools3/parsers/embl_genbank_features.pxd b/python/obitools3/parsers/embl_genbank_features.pxd new file mode 100644 index 0000000..1fdc434 --- /dev/null +++ b/python/obitools3/parsers/embl_genbank_features.pxd @@ -0,0 +1,2 @@ +#cython: language_level=3 + \ No newline at end of file diff --git a/python/obitools3/parsers/embl_genbank_features.pyx b/python/obitools3/parsers/embl_genbank_features.pyx new file mode 100644 index 0000000..985b351 --- /dev/null +++ b/python/obitools3/parsers/embl_genbank_features.pyx @@ -0,0 +1,151 @@ +#cython: language_level=3 + +''' +Created on June 12th 2018 + +@author: coissac/mercier +''' + + +import logging +import re +from itertools import chain + + +# TODO cython +# TODO import Location functions for Genbank stuff (src/obitools/location/__init__.py) + + +_featureMatcher = re.compile(b'^(FT| ) [^ ].+\n((FT| ) .+\n)+',re.M) +_featureCleaner = re.compile(b'^FT',re.M) + +def textFeatureIterator(fttable): + ''' + Iterate through a textual description of a feature table in a genbank + or embl format. Return at each step a text representation of each individual + feature composing the table. + + @param fttable: a string corresponding to the feature table of a genbank + or an embl entry + + @type fttable: C{str} + + @return: an iterator on str + @rtype: iterator + + @see: L{ftParser} + ''' + for m in _featureMatcher.finditer(fttable): + t = m.group() + t = _featureCleaner.sub(b' ',t) + yield t + + +_qualifierMatcher = re.compile(b'(?<=^ {21}/).+(\n {21}[^/].+)*',re.M) +_qualifierCleanner= re.compile(b"^ +",re.M) + +def qualifierIterator(qualifiers): + ''' + Parse a textual description of a feature in embl or genbank format + as returned by the textFeatureIterator iterator and iterate through + the key, value qualified defining this location. + + @param qualifiers: substring containing qualifiers + @type qualifiers: str + + @return: an iterator on tuple (key,value), where keys are C{str} + @rtype: iterator + ''' + for m in _qualifierMatcher.finditer(qualifiers): + t = m.group() + t = _qualifierCleanner.sub(b'',t) + t = t.split(b'=',1) + if len(t)==1: + t = (t[0],None) + else: + if t[0]==b'translation': + value = t[1].replace(b'\n',b'') + else: + value = t[1].replace(b'\n',b' ') + try: + value = eval(value) + except: + pass + t = (t[0],value) + yield t + + +_ftmatcher = re.compile(b'(?<=^ {5})\S+') +_qualifiersMatcher = re.compile(b'^ +/.+',re.M+re.DOTALL) + +def ftParser(feature): + fttype = _ftmatcher.search(feature).group() + qualifiers=_qualifiersMatcher.search(feature) + if qualifiers is not None: + qualifiers=qualifiers.group() + else: + qualifiers=b"" + logging.debug("Qualifiers regex not matching on \n=====\n%s\n========" % feature) + + return fttype,qualifiers + + +class Feature(dict): + + def __init__(self,type): + self._fttype=type + + def getFttype(self): + return self._fttype + + def __str__(self): + return repr(self) + + def __repr__(self): + return str((self.ftType, dict.__repr__(self))) + + ftType = property(getFttype, None, None, "Feature type name") + + +def featureFactory(featureDescription): + fttype,qualifiers = ftParser(featureDescription) + feature = Feature(fttype) + feature.raw = featureDescription + + for k,v in qualifierIterator(qualifiers): + feature.setdefault(k,[]).append(v) + + return feature + + +def featureIterator(featureTable,skipError=False): + for tft in textFeatureIterator(featureTable): + try: + feature = featureFactory(tft) + except AssertionError,e: + logging.debug("Parsing error on feature :\n===============\n%s\n===============" % tft) + if not skipError: + raise e + logging.debug("\t===> Error skipped") + continue + + yield feature + + +def extractTaxon(bytes text, dict tags): + + s = next(featureIterator(text)) + s = [s] + + t = set(int(v[6:]) for v in chain(*tuple(f[b'db_xref'] for f in s if b'db_xref' in f)) + if v[0:6]=='taxon:') + if len(t)==1 : + taxid=t.pop() + if taxid >=0: + tags[b'TAXID']=taxid + + t = set(chain(*tuple(f[b'organism'] for f in s if b'organism' in f))) + if len(t)==1: + tags[b'organism']=t.pop() + +