Parser OBO + contraintes GO - corrections de Sophie

This commit is contained in:
2008-10-05 10:43:11 +00:00
parent 4d4d53a37e
commit fa0636029a

View File

@ -0,0 +1,699 @@
from obitools.utils import skipWhiteLineIterator,multiLineWrapper
from obitools.utils import universalOpen
from obitools.format.genericparser import genericEntryIteratorGenerator
from logging import debug,warning
import re
#################################################################################
## Stanza preparation area ##
#################################################################################
class FileFormatError(Exception):
'''
An error derived from the class Exception.
'''
pass
_oboEntryIterator = genericEntryIteratorGenerator(endEntry='^ *$',
strip=True)
def stanzaIterator(inputfile):
'''
Iterator of stanza. The stanza are the basic units of OBO files.
@param inputfile: a stream of strings from an opened OBO file.
@type inputfile: a stream of strings
@return: a stream of stanza
@rtype: a stream of aggregated strings
@note: The iterator constructs stanza by aggregate strings from the
OBO file.
'''
inputfile = universalOpen(inputfile)
inputfile = multiLineWrapper(inputfile)
return _oboEntryIterator(inputfile)
#################################################################################
## Trailing Modifiers treatment area ##
#################################################################################
class TrailingModifier(dict):
'''
A class object which inherits from the class dict. Trailing modifiers can be found
at the end of TaggedValue objects when they exist.
'''
_match_brace = re.compile('(?<=\ {)[^\]]*(\}) *( !|$)')
def __init__(self,string):
## search for trailing modifiers signals
trailing_modifiers = TrailingModifier._match_brace.search(string)
## the trailing modifiers exist
if trailing_modifiers:
trailing_modifiers=trailing_modifiers.group(0).strip()
print trailing_modifiers
## creates and feeds the dictionary of trailing modifiers
dict.__init__(self,(x.strip().split('=',1) for x in trailing_modifiers.split(',')))
def trailingModifierFactory(string):
'''
Dispatcher of trailing modifiers.
@param string: a string from a TaggedValue object with a trailing modifiers signal.
@type string: string
@return: a class object
@note: The dispatcher is currently very simple. Only one case is treated by the function.
`the function returns a class object inherited from the class dict if the trailing modifiers
exist, None if they don't.
'''
trailing_modifiers = TrailingModifier(string)
if not trailing_modifiers:
trailing_modifiers=None
return trailing_modifiers
#################################################################################
## TaggedValue treatment area ##
#################################################################################
class TaggedValue(object):
'''
A couple 'tag:value' of an OBOEntry.
'''
_match_value = re.compile('(("(\\\\"|[^\"])*")|(\\\\"|[^\"]))*?( !| {|$)')
_split_comment = re.compile('^!| !')
_match_quotedString = re.compile('(?<=")(\\\\"|[^\"])*(?=")')
_match_bracket = re.compile('\[[^\]]*\]')
def __init__(self,line):
'''
Constructor of the class TaggedValue.
@param line: a line of an OBOEntry composed of a tag and a value.
@type line: string
@note: The constructor separates tags from right terms. 'value' is extracted
from right terms using a regular expression (value is at the beginning of the
string, between quotes or not). Then, 'comment' is extracted from the rest of the
string using another regular expression ('comment' is at the end of the string
after a '!'. By default, 'comment' is set to None). Finally, 'trailing_modifiers'
are extracted from the last string using another regular expression.
The tag, the value, the comment and the trailing_modifiers are saved.
'''
debug("tagValueParser : %s" % line)
## by default :
trailing_modifiers = None
comment = None
## the tag is saved. 'right' is composed of the value, the comment and the trailing modifiers
tag,rigth = line.split(':',1)
## the value is saved
value = TaggedValue._match_value.search(rigth).group(0)
debug("Extracted value : %s" % value)
## if there is a value AND a sign of a comment or trailing modifiers
if value and value[-1] in '!{':
lvalue = len(value)
## whatever it is a comment or trailing modifiers, it is saved into 'extra'
extra = rigth[lvalue-1:].strip()
## a comment is extracted
extra =TaggedValue._split_comment.split(extra,1)
## and saved if it exists
if len(extra)==2:
comment=extra[1].strip()
## trailing modifiers are extracted
extra=extra[0]
trailing_modifiers = trailingModifierFactory(extra)
## the value is cleaned of any comment or trailing modifiers signals
value = value[0:-1]
if tag=='use_term':
tag='consider'
raise DeprecationWarning,"user_term is a deprecated tag, you should instead use consider"
## recording zone
self.value =value.strip()
self.tag = tag
self.__doc__=comment
self.trailing_modifiers=trailing_modifiers
def __str__(self):
return str(self.value)
def __repr__(self):
return '''"""%s"""''' % str(self)
class NameValue(TaggedValue):
'''
A couple 'name:value' inherited from the class TaggedValue. Used to manage name tags.
'''
def __init__(self,line):
## no use of the TaggedValue constructor. The NameValue is very simple.
tag,rigth = line.split(':',1)
## recording zone
self.value = rigth.strip()
self.tag = 'name'
self.__doc__=None
self.trailing_modifiers=None
class DefValue(TaggedValue):
'''
A couple 'def:value' inherited from the class TaggedValue. Used to manage def tags.
'''
def __init__(self,line):
'''
Constructor of the class DefValue.
@param line: a line of an OBOEntry composed of a tag named 'def' and a value.
@type line: string
@note: The constructor calls the TaggedValue constructor. A regular expression
is used to extract the 'definition' from TaggedValue.value (definition is a not
quoted TaggedValue.value). A regular expression is used to extract 'dbxrefs'
from the aggedValue.value without the definition (dbxrefs are between brackets
and definition can be so). Definition is saved as the new value of the DefValue.
dbxrefs are saved.
'''
## use of the TaggedValue constructor
TaggedValue.__init__(self, line)
## definition, which is quoted, is extracted from the standard value of a TaggedValue.
definition = TaggedValue._match_quotedString.search(self.value).group(0)
## the standard value is cleaned of the definition.
cleanvalue = self.value.replace(definition,'')
cleanvalue = cleanvalue.replace(' ',' ')
## dbxrefs are searched into the rest of the standard value.
dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0)
## recording zone
self.tag = "def"
## the value of a DefValue is not the standard value but the definition.
self.value=definition
self.dbxrefs=xrefFactory(dbxrefs)
class SynonymValue(TaggedValue):
'''
A couple 'synonym:value' inherited from the class TaggedValue. Used to manage
synonym tags, exact_synonym tags, broad_synonym tags and narrow_synonym tags.
'''
_match_scope = re.compile('(?<="")[^\[]*(?=\[|$)')
def __init__(self,line):
'''
Constructor of the class SynonymValue.
@param line: a line of an OBOEntry composed of a tag named 'synonym' or
'exact_synonym' or 'broad_synonym' or 'narrow_synonym' and a value.
@type line: string
@note: SynonymValue is composed of a tag, a value, a scope, a list of types and
dbxrefs.
The constructor calls the TaggedValue constructor. A regular expression
is used to extract 'definition' from TaggedValue.value (definition is a not
quoted TaggedValue.value). Definition is saved as the new value of the class
SynonymValue.
A regular expression is used to extract 'attributes' from the rest of the
string. Attributes may contain an optional synonym scope and an optional list
of synonym types. The scope is extracted from attributes or set by default to
'RELATED'. It is saved as the scope of the class. The types are the rest of the
attributes and are saved as the list of types of the class.
For deprecated tags 'exact_synonym', 'broad_synonym' and 'narrow_synonym', tag
is set to 'synonym' and scope is set respectively to 'EXACT', 'BROAD' and 'NARROW'.
A regular expression is used to extract 'dbxrefs' from the TaggedValue.value
without the definition (dbxrefs are between brackets and definition can be so).
dbxrefs are saved.
'''
## use of the TaggedValue constructor
TaggedValue.__init__(self, line)
## definition, which is quoted, is extracted from the standard value of a TaggedValue.
definition = TaggedValue._match_quotedString.search(self.value).group(0)
## the standard value is cleaned of the definition.
cleanvalue = self.value.replace(definition,'')
cleanvalue = cleanvalue.replace(' ',' ')
## 1) attributes are searched into the rest of the standard value.
## 2) then they are stripped.
## 3) then they are split on every ' '.
## 4) finally they are ordered into a set.
attributes = set(SynonymValue._match_scope.search(cleanvalue).group(0).strip().split())
## the scopes are the junction between the attributes and a set of specific terms.
scopes = attributes & set(['RELATED','EXACT','BROAD','NARROW'])
## the types are the rest of the attributes.
types = attributes - scopes
## this is a constraint of the OBO format
assert len(scopes)< 2,"Only one synonym scope allowed"
## the scope of the SynonymValue is into scopes or set by default to RELATED
if scopes:
scope = scopes.pop()
else:
scope = 'RELATED'
## Specific rules are defined for the following tags :
if self.tag == 'exact_synonym':
raise DeprecationWarning,'exact_synonym is a deprecated tag use instead synonym tag'
self.tag = 'synonym'
scope = 'EXACT'
if self.tag == 'broad_synonym':
raise DeprecationWarning,'broad_synonym is a deprecated tag use instead synonym tag'
self.tag = 'synonym'
scope = 'BROAD'
if self.tag == 'narrow_synonym':
raise DeprecationWarning,'narrow_synonym is a deprecated tag use instead synonym tag'
self.tag = 'synonym'
scope = 'NARROW'
if self.tag == 'systematic_synonym':
#raise DeprecationWarning,'narrow_synonym is a deprecated tag use instead sysnonym tag'
self.tag = 'synonym'
scope = 'SYSTEMATIC'
## this is our own constraint. deprecated tags are not saved by this parser.
assert self.tag =='synonym',"%s synonym type is not managed" % self.tag
## dbxrefs are searched into the rest of the standard value.
dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0)
## recording zone
## the value of a SynonymValue is not the standard value but the definition.
self.value = definition
self.dbxrefs = xrefFactory(dbxrefs)
self.scope = scope
self.types = list(types)
def __eq__(self,b):
return ((self.value==b.value) and (self.dbxrefs==b.dbxrefs)
and (self.scope==b.scope) and (self.types==b.types)
and (self.__doc__==b.__doc__) and (self.tag==b.tag)
and (self.trailing_modifiers==b.trailing_modifiers))
def __hash__(self):
return (reduce(lambda x,y:x+y,(hash(z) for z in [self.__doc__,
self.value,
frozenset(self.dbxrefs),
self.scope,
frozenset(self.types),
self.tag,
self.trailing_modifiers]),0)) % (2**31)
class XrefValue(TaggedValue):
'''
A couple 'xref:value' inherited from the class TaggedValue. Used to manage
xref tags.
'''
def __init__(self,line):
## use of the TaggedValue constructor
TaggedValue.__init__(self, line)
## use the same function as the dbxrefs
self.value=xrefFactory(self.value)
if self.tag in ('xref_analog','xref_unk'):
raise DeprecationWarning,'%s is a deprecated tag use instead sysnonym tag' % self.tag
self.tag='xref'
## this is our own constraint. deprecated tags are not saved by this parser.
assert self.tag=='xref'
class RelationshipValue(TaggedValue):
'''
A couple 'xref:value' inherited from the class TaggedValue. Used to manage
xref tags.
'''
def __init__(self,line):
## use of the TaggedValue constructor
TaggedValue.__init__(self, line)
## the value is split on the first ' '.
value = self.value.split(None,1)
## succesful split !
if len(value)==2:
relationship=value[0]
term=value[1]
## unsuccesful split. The relationship is set by default to IS_A
else:
relationship='is_a'
term=value[0]
## recording zone
self.value=term
self.relationship=relationship
class NamespaceValue(TaggedValue):
def __init__(self,line):
TaggedValue.__init__(self, line)
class RemarkValue(TaggedValue):
def __init__(self,line):
TaggedValue.__init__(self, line)
label,value = self.value.split(':',1)
label = label.strip()
value = value.strip()
self.value=value
self.label=label
def taggedValueFactory(line):
'''
A function used to dispatch lines of an OBOEntry between the class TaggedValue
and its inherited classes.
@param line: a line of an OBOEntry composed of a tag and a value.
@type line: string
@return: a class object
'''
if (line[0:9]=='namespace' or
line[0:17]=='default-namespace'):
return NamespaceValue(line)
## DefValue is an inherited class of TaggedValue
elif line[0:3]=='def':
return DefValue(line)
## SynonymValue is an inherited class of TaggedValue
elif ((line[0:7]=="synonym" and line[0:14]!="synonymtypedef") or
line[0:13]=="exact_synonym" or
line[0:13]=="broad_synonym" or
line[0:14]=="narrow_synonym"):
return SynonymValue(line)
## XrefValue is an inherited class of TaggedValue
elif line[0:4]=='xref':
return XrefValue(line)
## NameValue is an inherited class of TaggedValue
elif line[0:4]=='name':
return NameValue(line)
## RelationshipValue is an inherited class of TaggedValue
elif (line[0:15]=='intersection_of' or
line[0:8] =='union_of' or
line[0:12]=='relationship'):
return RelationshipValue(line)
elif (line[0:6]=='remark'):
return RemarkValue(line)
## each line is a couple : tag / value (and some more features)
else:
return TaggedValue(line)
#################################################################################
## Xref treatment area ##
#################################################################################
class Xref(object):
'''
A xref object of an OBOentry. It may be the 'dbxrefs' of SynonymValue and
DefValue objects or the 'value' of XrefValue objects.
'''
__splitdata__ = re.compile(' +(?=["{])')
def __init__(self,ref):
if ref == '' : #
ref = None #
data = '' #
else : # Modifs JJ sinon erreur : list index out of range
data = Xref.__splitdata__.split(ref,1) #
ref = data[0] #
description=None
trailing_modifiers = None
if len(data)> 1:
extra = data[1]
description = TaggedValue._match_quotedString.search(extra)
if description is not None:
description = description.group(0)
extra.replace(description,'')
trailing_modifiers=trailingModifierFactory(extra)
self.reference=ref
self.description=description
self.trailing_modifiers=trailing_modifiers
def __eq__(self,b):
return ((self.reference==b.reference) and (self.description==b.description)
and (self.trailing_modifiers==b.trailing_modifiers))
def __hash__(self):
return (reduce(lambda x,y:x+y,(hash(z) for z in [self.reference,
self.description,
self.trailing_modifiers]),0)) % (2**31)
def xrefFactory(string):
'''
Dispatcher of xrefs.
@param string: a string (between brackets) from an inherited TaggedValue object with a dbxrefs
signal (actually, the signal can only be found into SynonymValue and DefValue
objects) or a string (without brackets) from a XrefValue object.
@type string: string
@return: a class object
@note: The dispatcher treats differently the strings between brackets (from SynonymValue and
DefValue objects) and without brackets (from XrefValue objects).
'''
string = string.strip()
if string[0]=='[':
return [Xref(x.strip()) for x in string[1:-1].split(',')]
else:
return Xref(string)
#################################################################################
## Stanza treatment area ##
#################################################################################
class OBOEntry(dict):
'''
An entry of an OBOFile. It can be a header (without a stanza name) or
a stanza (with a stanza name between brackets). It inherits from the class dict.
'''
_match_stanza_name = re.compile('(?<=^\[)[^\]]*(?=\])')
def __init__(self,stanza):
## tests if it is the header of the OBO file (returns TRUE) or not (returns FALSE)
self.isHeader = stanza[0]!='['
lines = stanza.split('\n')
## not the header : there is a [stanzaName]
if not self.isHeader:
self.stanzaName = lines[0].strip()[1:-1]
lines=lines[1:]
## whatever the stanza is.
for line in lines:
## each line is a couple : tag / value
taggedvalue = taggedValueFactory(line)
if taggedvalue.tag in self:
self[taggedvalue.tag].append(taggedvalue)
else:
self[taggedvalue.tag]=[taggedvalue]
def parseStanzaName(stanza):
sm = OBOEntry._match_stanza_name.search(stanza)
if sm:
return sm.group(0)
else:
return None
parseStanzaName=staticmethod(parseStanzaName)
class OBOTerm(OBOEntry):
'''
A stanza named 'Term'. It inherits from the class OBOEntry.
'''
def __init__(self,stanza):
## use of the OBOEntry constructor.
OBOEntry.__init__(self, stanza)
assert self.stanzaName=='Term'
assert 'id' in self and len(self['id'])==1,"An OBOTerm must have an id"
assert 'name' in self and len(self['name'])==1,"An OBOTerm must have a name"
assert 'namespace' in self and len(self['namespace'])==1, "An OBOTerm must belong to one of the cell_component, molecular_function or biological_process namespace"
assert 'def' not in self or len(self['def'])==1,"Only one definition is allowed for an OBO term"
assert 'comment' not in self or len(self['comment'])==1,"Only one comment is allowed for an OBO term"
assert 'union_of' not in self or len(self['union_of'])>=2,"Only one union relationship is allowed for an OBO term"
assert 'intersection_of' not in self or len(self['intersection_of'])>=2,"Only one intersection relationship is allowed for an OBO term"
if self._isObsolete():
assert 'is_a' not in self
assert 'relationship' not in self
assert 'inverse_of' not in self
assert 'disjoint_from' not in self
assert 'union_of' not in self
assert 'intersection_of' not in self
assert 'replaced_by' not in self or self._isObsolete()
assert 'consider' not in self or self._isObsolete()
## make-up functions.
def _getDefinition(self):
if 'def' in self:
return self['def'][0]
return None
def _getId(self):
return self['id'][0]
def _getNamespace(self):
return self['namespace'][0]
def _getName(self):
return self['name'][0]
def _getComment(self):
if 'comment' in self:
return self['comment'][0]
return None
def _getAltIds(self):
if 'alt_id' in self:
return list(set(self.get('alt_id',None)))
return None
def _getIsA(self):
if 'is_a' in self:
return list(set(self.get('is_a',None)))
return None
def _getSynonym(self):
if 'synonym' in self :
return list(set(self.get('synonym',None)))
return None
def _getSubset(self):
if self.get('subset',None) != None:
return list(set(self.get('subset',None)))
else:
return None
def _getXref(self):
if 'xref' in self:
return list(set(self.get('xref',None)))
return None
def _getRelationShip(self):
if 'relationship' in self:
return list(set(self.get('relationship',None)))
return None
def _getUnion(self):
return list(set(self.get('union_of',None)))
def _getIntersection(self):
return list(set(self.get('intersection_of',None)))
def _getDisjonction(self):
return list(set(self.get('disjoint_from',None)))
def _isObsolete(self):
return 'is_obsolete' in self and str(self['is_obsolete'][0])=='true'
def _getReplaceBy(self):
if 'replaced_by' in self:
return list(set(self.get('replaced_by',None)))
return None
def _getConsider(self):
if 'consider' in self:
return list(set(self.get('consider',None)))
return None
## automatically make-up !
definition = property(_getDefinition,None,None)
id = property(_getId,None,None)
namespace = property(_getNamespace,None,None)
name = property(_getName,None,None)
comment = property(_getComment,None,None)
alt_ids = property(_getAltIds,None,None)
is_a = property(_getIsA,None,None)
synonyms = property(_getSynonym,None,None)
subsets = property(_getSubset,None,None)
xrefs = property(_getXref,None,None)
relationship = property(_getRelationShip,None,None)
union_of = property(_getUnion,None,None)
intersection_of = property(_getIntersection,None,None)
disjoint_from = property(_getDisjonction,None,None)
is_obsolete = property(_isObsolete,None,None)
replaced_by = property(_getReplaceBy,None,None)
consider = property(_getConsider,None,None)
def OBOEntryFactory(stanza):
'''
Dispatcher of stanza.
@param stanza: a stanza composed of several lines.
@type stanza: text
@return: an C{OBOTerm} | C{OBOEntry} instance
@note: The dispatcher treats differently the stanza which are OBO "Term"
and the others.
'''
stanzaType = OBOEntry.parseStanzaName(stanza)
if stanzaType=="Term":
return OBOTerm(stanza)
else:
return OBOEntry(stanza)
def OBOEntryIterator(file):
entries = stanzaIterator(file)
for e in entries:
debug(e)
yield OBOEntryFactory(e)