Files
ecopcr/obitools/format/genericparser/__init__.py

217 lines
7.8 KiB
Python

"""
G{packagetree format}
"""
import re
from obitools.utils import universalOpen
def genericEntryIteratorGenerator(startEntry=None,endEntry=None,
                                  head=False,tail=False,
                                  strip=False,join=True):
    '''
    Build a function transforming a text line iterator into an entry
    oriented iterator. The returned function is useful to implement
    the first stage of flat file parsing.

    When neither C{startEntry} nor C{endEntry} is given, the returned
    function only opens its argument with C{universalOpen} and returns
    the result unchanged.

    @param startEntry: a regular pattern matching the beginning of
                       an entry
    @type startEntry: C{str} or None
    @param endEntry: a regular pattern matching the end of
                     an entry. When it is None, an entry is closed by
                     the line opening the next one.
    @type endEntry: C{str} or None
    @param head: when true, collection starts at the very first line
                 instead of skipping lines until the first
                 C{startEntry} match, so text located before the first
                 entry (as in many original genbank files) is kept.
    @type head: C{bool}
    @param tail: when true and C{endEntry} is given, lines left
                 unterminated at the end of the input are returned as
                 a last entry. Without C{endEntry} the pending lines
                 are always returned.
    @type tail: C{bool}
    @param strip: when true, leading and trailing whitespaces are
                  removed from the joined entry, or from each line if
                  C{join} is false.
    @type strip: C{bool}
    @param join: when true an entry is returned as a single C{str},
                 otherwise as the list of its lines.
    @type join: C{bool}

    @return: a function taking a file (anything accepted by
             C{universalOpen}) and returning an iterator on entries
    @rtype: a callable returning an iterator on C{str} (or on lists
            of C{str} when C{join} is false)
    '''
    # Patterns are compiled once; the nested functions below read the
    # compiled objects through their closure.
    if startEntry is not None:
        startEntry = re.compile(startEntry)
    if endEntry is not None:
        endEntry = re.compile(endEntry)

    def isBeginning(line):
        # Without a start pattern, every line can open an entry.
        return startEntry is None or startEntry.match(line) is not None

    def isEnding(line):
        if endEntry is not None:
            return endEntry.match(line) is not None
        # Without an end pattern, the beginning of the next entry
        # closes the current one.
        return startEntry is not None and startEntry.match(line) is not None

    def packEntry(lines):
        # Apply the join/strip options to the lines of one entry.
        if join:
            e = ''.join(lines)
            if strip:
                e = e.strip()
        else:
            e = lines
            if strip:
                e = [x.strip() for x in e]
        return e

    def transparentIteratorEntry(file):
        file = universalOpen(file)
        return file

    def genericEntryIterator(file):
        file = universalOpen(file)
        entry = []
        try:
            # The first read is done inside the try block: on an empty
            # input StopIteration must not escape from the generator
            # body (it is turned into a RuntimeError since PEP 479).
            # The next() builtin (Python >= 2.6) replaces file.next().
            line = next(file)
            started = head or isBeginning(line)
            while 1:
                # Skip lines located outside of any entry.
                while not started:
                    line = next(file)
                    started = isBeginning(line)

                if endEntry is None:
                    # The current line opens the entry; it must be
                    # stored before looking for the next beginning,
                    # otherwise isEnding would close the entry at once.
                    entry.append(line)
                    line = next(file)

                while started:
                    if isEnding(line):
                        if endEntry is not None:
                            # The ending line belongs to the entry.
                            entry.append(line)
                        e = packEntry(entry)
                        entry = []
                        yield e
                        started = False
                        if endEntry is not None:
                            line = next(file)
                        # else: the current line opens the next entry
                        # and is examined again below.
                    else:
                        entry.append(line)
                        line = next(file)

                started = isBeginning(line)

        except StopIteration:
            # End of input: emit the pending lines if they form an
            # entry (no end pattern) or if the tail is requested.
            if entry and (endEntry is None or tail):
                yield packEntry(entry)

    if startEntry is None and endEntry is None:
        return transparentIteratorEntry

    return genericEntryIterator
class GenericParser(object):
    '''
    Generic flat file parser. The input is cut into text entries and a
    set of named parse actions is applied to each of them.

    Calling an instance on a file returns an iterator on dictionaries,
    one per entry. Each dictionary holds the entry under the
    C{'fullentry'} key and the result of each parse action under the
    name of this action.
    '''

    def __init__(self,
                 startEntry=None,
                 endEntry=None,
                 head=False,
                 tail=False,
                 strip=False,
                 **parseAction):
        """
        @param startEntry: a regular pattern matching the beginning of
                           an entry
        @type startEntry: C{str} or None
        @param endEntry: a regular pattern matching the end of
                         an entry
        @type endEntry: C{str} or None
        @param head: indicate if an header is present before
                     the first entry (as in many original genbank
                     files)
        @type head: C{bool}
        @param tail: indicate if some extra informations are present
                     after the last entry.
        @type tail: C{bool}
        @param strip: forwarded to L{genericEntryIteratorGenerator}
        @type strip: C{bool}
        @param parseAction: parse actions to register, indexed by their
                            name. Each value is either the sequence of
                            arguments following the name in a call to
                            L{addParseAction} (C{dataMatcher},
                            optionally C{dataCleaner} and C{cleanSub}),
                            or a single C{dataMatcher} (pattern string,
                            compiled pattern or callable).
        """
        self.flatiterator= genericEntryIteratorGenerator(startEntry,
                                                         endEntry,
                                                         head,
                                                         tail,
                                                         strip)
        self.action={}

        for k in parseAction:
            self.addParseAction(k,*self._actionArguments(parseAction[k]))

    @staticmethod
    def _actionArguments(spec):
        '''
        Normalize the value of a parse action keyword argument into the
        tuple of positional arguments passed to L{addParseAction}.

        A single matcher (string, compiled pattern or callable) is
        wrapped in a one element tuple: unpacking a bare string with
        C{*} would split it into characters, and a bare callable or
        compiled pattern cannot be unpacked at all.
        '''
        if (isinstance(spec, str)
            or callable(spec)
            or hasattr(spec, 'findall')):
            return (spec,)
        return tuple(spec)

    def addParseAction(self,name,dataMatcher,dataCleaner=None,cleanSub=''):
        '''
        Add a parse action to the generic parser. A parse action
        allows to extract one piece of information from an entry.
        A parse action is defined by a name and a method to extract
        this information from the full text entry.

        A parse action can be defined following two ways.

            - via regular expression patterns

            - via a dedicated function.

        In the first case, you have to indicate at least the
        dataMatcher regular pattern. This pattern should match exactly
        the data part you want to retrieve. If cleaning of extra
        characters is needed, the second pattern dataCleaner can be
        used to specify these characters.

        In the second case you must provide a callable object (function)
        that extracts and cleans data from the text entry. This function
        should return an array containing all data retrieved even if
        no data or only one data is retrieved.

        @summary: Add a parse action to the generic parser.

        @param name: name of the data extracted
        @type name: C{str}
        @param dataMatcher: a regular pattern matching the data
                            or a callable object parsing the
                            entry and returning a list of matched data
        @type dataMatcher: C{str} or C{SRE_Pattern} instance or a callable
                           object
        @param dataCleaner: a regular pattern matching part of the data
                            to suppress.
        @type dataCleaner: C{str} or C{SRE_Pattern} instance or C{None}
        @param cleanSub: string used to replace dataCleaner matches.
                         Default is an empty string
        @type cleanSub: C{str}
        '''
        if callable(dataMatcher):
            # Dedicated function: dataCleaner and cleanSub are ignored.
            self.action[name]=dataMatcher
        else :
            if isinstance(dataMatcher, str):
                dataMatcher=re.compile(dataMatcher)
            if isinstance(dataCleaner, str):
                dataCleaner=re.compile(dataCleaner)
            self.action[name]=self._buildREParser(dataMatcher,
                                                  dataCleaner,
                                                  cleanSub)

    def _buildREParser(self,dataMatcher,dataCleaner,cleanSub):
        '''
        Build the function implementing a pattern based parse action.

        The returned function takes the text of an entry and returns
        the list of all C{dataMatcher} matches (as returned by
        C{findall}). If C{dataCleaner} is not None, each of its matches
        inside a retrieved item is replaced by C{cleanSub}.
        '''
        def parser(data):
            x = dataMatcher.findall(data)
            if dataCleaner is not None:
                x = [dataCleaner.sub(cleanSub,y) for y in x]
            return x
        return parser

    def __call__(self,file):
        '''
        Iterate over the entries of a file and apply all the
        registered parse actions to each of them.

        @param file: the data source, passed as is to the entry
                     iterator built by the constructor

        @return: an iterator on parsed entries
        @rtype: an iterator on C{dict}
        '''
        for e in self.flatiterator(file):
            pe = {'fullentry':e}
            for k in self.action:
                pe[k]=self.action[k](e)
            yield pe