Files
ecopcr/obitools/location/__init__.py

539 lines
15 KiB
Python

import obitools
import re
import array
class Location(object):
    """
    Define a location on a sequence.

    Base class of every location type. The query methods defined here
    (C{isDirect}, C{isSimple}, C{isFullLength}, C{needNucleic},
    C{shift}, C{getBegin}, C{getEnd}) all return C{None}; concrete
    subclasses override them. Subclasses must also provide C{simplify},
    C{_extractSequence}, C{_extractQuality} and C{_getglocpos}, which
    are called from this class but not defined by it.
    """

    def extractSequence(self, sequence):
        '''
        Extract subsequence corresponding to a Location.

        A C{str} input gives back whatever C{_extractSequence} returns.
        A C{BioSequence} input gives back a new C{AASequence} or
        C{NucSequence} carrying the id, definition and tags of the
        original, with a C{location} tag set to C{str(self)}.

        @param sequence: the sequence to extract from
        @type sequence: C{BioSequence} or C{str}

        @return: the extracted subsequence
        @rtype: same kind of object as C{sequence}
        '''
        assert isinstance(sequence, (obitools.BioSequence, str)), \
            "sequence must be an instance of str or BioSequence"

        if isinstance(sequence, str):
            return self._extractSequence(sequence)

        isProtein = isinstance(sequence, obitools.AASequence)
        if isProtein:
            assert not self.needNucleic(), \
                "This location can be used only with Nucleic sequences"

        extracted = self._extractSequence(str(sequence))
        if isProtein:
            seqType = obitools.AASequence
        else:
            seqType = obitools.NucSequence

        seq = seqType(sequence.id,
                      extracted,
                      sequence.definition,
                      **sequence.getTags())
        seq['location'] = str(self)

        # A copied 'length' tag would describe the full sequence,
        # so refresh it with the length of the extracted part.
        if 'length' in sequence.getTags():
            seq['length'] = len(seq)

        if hasattr(sequence, 'quality'):
            seq.quality = self._extractQuality(sequence)

        return seq

    def isDirect(self):
        '''
        Orientation of the location; C{None} in this base class.
        '''
        return None

    def isSimple(self):
        '''
        Indicate if a location is composed of a single continuous
        region or is composed by the junction of several locations
        by the C{join} operator.

        @return: C{True} if the location is composed of a single
                 continuous region.
        @rtype: bool
        '''
        return None

    def isFullLength(self):
        '''
        Completeness of the location; C{None} in this base class.
        '''
        return None

    def needNucleic(self):
        '''
        If a location contains a complement operator, it can be used
        only on a nucleic sequence.

        @return: C{True} if location contains a complement operator
        @rtype: bool
        '''
        return None

    def getGloc(self):
        '''
        Build the textual "gloc" form of the simplified location:
        C{(T,p1,p2,...)} for a direct location and C{(F,p1,p2,...)}
        for a reverse one, the positions being those returned by
        C{_getglocpos}.

        @return: the gloc string
        @rtype: str
        '''
        loc = self.simplify()
        assert loc.isDirect() is not None, "Gloc cannot be created for multi oriented location : %s" % str(loc)
        positions = ','.join([str(x) for x in loc._getglocpos()])
        return "(%s,%s)" % ({True: 'T', False: 'F'}[loc.isDirect()],
                            positions)

    def shift(self, s):
        '''
        Translate the location by C{s} positions; C{None} in this
        base class.
        '''
        return None

    def getBegin(self):
        return None

    def getEnd(self):
        return None

    def getFivePrime(self):
        return self.getBegin()

    def getThreePrime(self):
        return self.getEnd()

    # These properties are bound to the functions defined in this very
    # class body, so a subclass overriding getBegin/getEnd has to
    # redeclare the matching property as well.
    begin = property(getBegin, None, None, "beginning position of the location")
    end = property(getEnd, None, None, "ending position of the location")
    fivePrime = property(getFivePrime, None, None, "5' position of the location")
    threePrime = property(getThreePrime, None, None, "3' position of the location")

    def __abs__(self):
        '''
        Return the direct-strand version of the location: C{self} when
        it is already direct, otherwise its simplified complement.
        '''
        assert self.isDirect() is not None, "Abs operator cannot be applied on non oriented location"
        if self.isDirect():
            return self
        return ComplementLocation(self).simplify()

    def __cmp__(self, y):
        '''
        Three-way comparison: order by C{begin}; at equal C{begin} a
        direct location sorts before a non direct one.

        @return: -1, 0 or 1
        @rtype: int
        '''
        if self.begin < y.begin:
            return -1
        if self.begin > y.begin:
            return 1
        if self.isDirect() == y.isDirect():
            return 0
        if self.isDirect() and not y.isDirect():
            return -1
        return 1

    # Python 3 never calls __cmp__. The ordering operators below
    # delegate to it so that locations can still be compared and sorted
    # there; results under Python 2 are unchanged. Equality and hashing
    # are deliberately left untouched.
    def __lt__(self, y):
        return self.__cmp__(y) < 0

    def __le__(self, y):
        return self.__cmp__(y) <= 0

    def __gt__(self, y):
        return self.__cmp__(y) > 0

    def __ge__(self, y):
        return self.__cmp__(y) >= 0
class SimpleLocation(Location):
    """
    A simple location describes a continuous region of
    a sequence defined by a C{begin} and an C{end} position.

    Positions are 1-based and both ends are included. The C{before}
    and C{after} flags are printed as the C{<} and C{>} markers by
    C{__str__}.
    """

    def __init__(self, begin, end):
        '''
        Build a new C{SimpleLocation} instance. Valid
        positions are defined on M{[1,N]} with N the length
        of the sequence.

        @param begin: start position of the location
        @type begin:  int
        @param end:   end position of the location
        @type end:    int
        '''
        assert begin > 0 and end > 0
        self._begin = begin
        self._end = end
        self._before = False
        self._after = False

    def _checkBounds(self, sequence):
        '''
        Assert that both ends of the location lie inside C{sequence}.

        C{begin} may equal the sequence length: a location starting on
        the last residue is valid.
        '''
        length = len(sequence)
        assert self._begin <= length and self._end <= length, \
            "Sequence length %d is too short" % length

    def _extractSequence(self, sequence):
        '''
        Return the slice of C{sequence} covered by the location.
        '''
        self._checkBounds(sequence)
        return sequence[self._begin - 1:self._end]

    def _extractQuality(self, sequence):
        '''
        Return the slice of C{sequence.quality} covered by the
        location.
        '''
        self._checkBounds(sequence)
        return sequence.quality[self._begin - 1:self._end]

    def isDirect(self):
        return True

    def isSimple(self):
        return True

    def isFullLength(self):
        '''
        @return: C{True} when neither C{before} nor C{after} is set
        @rtype: bool
        '''
        return not (self.before or self.after)

    def simplify(self):
        '''
        Reduce a one-position region to a C{PointLocation}; any other
        region is returned unchanged.
        '''
        # NOTE(review): the before/after flags are not carried over
        # to the PointLocation.
        if self._begin == self._end:
            return PointLocation(self._begin)
        return self

    def needNucleic(self):
        return False

    def __str__(self):
        before = '<' if self.before else ''
        after = '>' if self.after else ''
        return "%s%d..%s%d" % (before, self._begin, after, self._end)

    def shift(self, s):
        '''
        Return the location translated by C{s} positions.

        @param s: offset to add to both ends
        @type s: int
        @return: C{self} when C{s} is 0, a new C{SimpleLocation}
                 otherwise
        '''
        assert (self._begin + s) > 0, "shift to large (%d)" % s
        if s == 0:
            return self
        # NOTE(review): the shifted copy starts with before/after
        # cleared; confirm whether the flags should be propagated.
        return SimpleLocation(self._begin + s, self._end + s)

    def _getglocpos(self):
        return (self.begin, self.end)

    def getGloc(self):
        '''
        Gloc string of the region, built directly from its two
        positions without going through C{simplify}.
        '''
        positions = ','.join([str(x) for x in self._getglocpos()])
        return "(%s,%s)" % ({True: 'T', False: 'F'}[self.isDirect()],
                            positions)

    def getBegin(self):
        return self._begin

    def getEnd(self):
        return self._end

    begin = property(getBegin, None, None, "beginning position of the location")
    end = property(getEnd, None, None, "ending position of the location")

    def getBefore(self):
        return self._before

    def getAfter(self):
        return self._after

    def setBefore(self, value):
        assert isinstance(value, bool)
        self._before = value

    def setAfter(self, value):
        assert isinstance(value, bool)
        self._after = value

    before = property(getBefore, setBefore, None)
    after = property(getAfter, setAfter, None)
class PointLocation(Location):
    """
    A point location describes a location on a sequence
    limited to a single position
    """

    def __init__(self, position):
        '''
        @param position: 1-based position of the point
        @type position: int
        '''
        assert position > 0
        self._pos = position

    def _extractSequence(self, sequence):
        '''
        Return the single element of C{sequence} found at the position.
        '''
        # The instance only stores _pos; there is no _end attribute.
        assert self._pos <= len(sequence), \
            "Sequence length %d is too short" % len(sequence)
        return sequence[self._pos - 1]

    def _extractQuality(self, sequence):
        '''
        Return the one-element slice of C{sequence.quality} found at
        the position.
        '''
        assert self._pos <= len(sequence), \
            "Sequence length %d is too short" % len(sequence)
        # Slice the quality values, as SimpleLocation does, so that
        # the result can be concatenated by CompositeLocation.
        return sequence.quality[self._pos - 1:self._pos]

    def isDirect(self):
        return True

    def isSimple(self):
        return True

    def isFullLength(self):
        return True

    def simplify(self):
        return self

    def needNucleic(self):
        return False

    def shift(self, s):
        '''
        Return the point translated by C{s} positions.

        @param s: offset to add to the position
        @type s: int
        @return: C{self} when C{s} is 0, a new C{PointLocation}
                 otherwise
        '''
        assert (self._pos + s) > 0, "shift to large (%d)" % s
        if s == 0:
            return self
        return PointLocation(self._pos + s)

    def _getglocpos(self):
        return (self._pos, self._pos)

    def getBegin(self):
        return self._pos

    def getEnd(self):
        return self._pos

    begin = property(getBegin, None, None, "beginning position of the location")
    end = property(getEnd, None, None, "ending position of the location")

    def __str__(self):
        return str(self._pos)
class CompositeLocation(Location):
    """
    A location made of an ordered series of sub-locations, printed
    with the C{join} operator.
    """

    def __init__(self, locations):
        '''
        @param locations: the sub-locations, in order
        @type locations: iterable of C{Location}
        '''
        self._locs = tuple(locations)

    def _extractSequence(self, sequence):
        '''
        Concatenate the parts extracted by every sub-location.
        '''
        return ''.join([x._extractSequence(sequence)
                        for x in self._locs])

    def _extractQuality(self, sequence):
        '''
        Concatenate the quality values extracted by every
        sub-location into an C{array} of doubles.
        '''
        rep = array.array('d', [])
        for x in self._locs:
            rep.extend(x._extractQuality(sequence))
        return rep

    def isDirect(self):
        '''
        @return: C{True} if every sub-location is direct, C{False} if
                 none is, C{None} for a mix of both or an empty join.
                 A sub-location whose C{isDirect()} is C{None} counts
                 as not direct.
        '''
        orientations = [z.isDirect() for z in self._locs]
        hasDirect = any(orientations)
        hasReverse = any(not o for o in orientations)

        if hasDirect and not hasReverse:
            return True
        if hasReverse and not hasDirect:
            return False
        return None

    def isSimple(self):
        return False

    def simplify(self):
        '''
        Return an equivalent, simpler location.

        A join of a single element is replaced by that element
        (simplified). A join made only of complemented parts,
        C{join(complement(a),complement(b))}, is rewritten as
        C{complement(join(b,a))}.
        '''
        if len(self._locs) == 1:
            return self._locs[0].simplify()

        rep = CompositeLocation(x.simplify() for x in self._locs)

        # Test the *simplified* parts: they are the ones whose _loc is
        # read below, and simplification may remove a complement
        # (complement(complement(x)) becomes x).
        if rep._locs and all(isinstance(z, ComplementLocation)
                             for z in rep._locs):
            rep = ComplementLocation(CompositeLocation(x._loc.simplify()
                                                       for x in rep._locs[::-1]))
        return rep

    def isFullLength(self):
        '''
        @return: C{True} if every sub-location is full length
        @rtype: bool
        '''
        return all(z.isFullLength() for z in self._locs)

    def needNucleic(self):
        '''
        @return: C{True} if at least one sub-location needs a nucleic
                 sequence
        @rtype: bool
        '''
        # The method has to be *called*: a bound method object is
        # always true.
        return any(z.needNucleic() for z in self._locs)

    def _getglocpos(self):
        '''
        Positions of all the sub-locations, flattened in one tuple.
        '''
        return tuple(pos
                     for z in self._locs
                     for pos in z._getglocpos())

    def getBegin(self):
        return min(x.getBegin() for x in self._locs)

    def getEnd(self):
        return max(x.getEnd() for x in self._locs)

    def shift(self, s):
        '''
        Return the location translated by C{s} positions.

        @param s: offset applied to every sub-location
        @type s: int
        @return: C{self} when C{s} is 0, a new C{CompositeLocation}
                 otherwise
        '''
        assert (self.getBegin() + s) > 0, "shift to large (%d)" % s
        if s == 0:
            return self
        return CompositeLocation(x.shift(s) for x in self._locs)

    begin = property(getBegin, None, None, "beginning position of the location")
    end = property(getEnd, None, None, "ending position of the location")

    def __str__(self):
        return "join(%s)" % ','.join([str(x)
                                      for x in self._locs])
class ComplementLocation(Location):
    """
    A location read on the reverse strand: the wrapped location is
    extracted, then reversed and complemented.
    """

    # Complement of each lower-case nucleotide code.
    _comp = {'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
             'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
             's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
             'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
             '-': '-'}

    def __init__(self, location):
        '''
        @param location: the location to complement
        @type location: C{Location}
        '''
        self._loc = location

    def _extractSequence(self, sequence):
        '''
        Return the reverse complement of the part selected by the
        wrapped location. Symbols are lower-cased before lookup, so the
        result is lower case; unknown symbols become C{n}.
        '''
        seq = self._loc._extractSequence(sequence)
        return ''.join([ComplementLocation._comp.get(x.lower(), 'n')
                        for x in seq[::-1]])

    def _extractQuality(self, sequence):
        '''
        Return the quality values of the wrapped location in reverse
        order, so that they line up with C{_extractSequence}.
        '''
        # Only the located part is reversed, not the quality of the
        # whole sequence.
        return self._loc._extractQuality(sequence)[::-1]

    def isDirect(self):
        return False

    def isSimple(self):
        return self._loc.isSimple()

    def isFullLength(self):
        return self._loc.isFullLength()

    def simplify(self):
        '''
        Cancel a double complement: C{complement(complement(x))}
        gives C{x} simplified. Any other location is returned as is.
        '''
        if isinstance(self._loc, ComplementLocation):
            return self._loc._loc.simplify()
        return self

    def needNucleic(self):
        return True

    def __str__(self):
        return "complement(%s)" % self._loc

    def shift(self, s):
        '''
        Return the location translated by C{s} positions.

        @param s: offset applied to the wrapped location
        @type s: int
        @return: C{self} when C{s} is 0, a new C{ComplementLocation}
                 otherwise
        '''
        assert (self.getBegin() + s) > 0, "shift to large (%d)" % s
        if s == 0:
            return self
        return ComplementLocation(self._loc.shift(s))

    def _getglocpos(self):
        return self._loc._getglocpos()

    def getBegin(self):
        return self._loc.getBegin()

    def getEnd(self):
        return self._loc.getEnd()

    # On the reverse strand the 5' end is the highest position.
    def getFivePrime(self):
        return self.getEnd()

    def getThreePrime(self):
        return self.getBegin()

    begin = property(getBegin, None, None, "beginning position of the location")
    end = property(getEnd, None, None, "ending position of the location")
    fivePrime = property(getFivePrime, None, None, "5' position of the location")
    threePrime = property(getThreePrime, None, None, "3' position of the location")
#
# Internal functions used for location parsing
#
def __sublocationIterator(text):
    '''
    Split C{text} on the commas that are not enclosed in parentheses.

    @param text: a comma separated list of location descriptions
    @type text: str

    @return: an iterator over the top-level items
    @rtype: generator of str

    @raise AssertionError: on an empty item or on unbalanced
                           parentheses
    '''
    current = []
    depth = 0
    for c in text:
        if c == ',' and depth == 0:
            # Top-level separator: the pending item is complete.
            assert current, "Misformated location : %s" % text
            yield ''.join(current)
            current = []
            continue

        if c == '(':
            depth += 1
        elif c == ')':
            depth -= 1
            # A closing parenthesis without its opening one.
            assert depth >= 0, "Misformated location : %s" % text
        current.append(c)

    assert current and depth == 0, "Misformated location : %s" % text
    yield ''.join(current)
#
# Internal functions used for location parsing
#
# Matches "<from" optionally followed by "..>to"; the '<' and '>'
# markers are optional. Raw string: the pattern contains '\.'.
__simplelocparser = re.compile(r'(?P<before><?)(?P<from>[0-9]+)(\.\.(?P<after>>?)(?P<to>[0-9]+))?')


def __locationParser(text):
    '''
    Recursively build a C{Location} from its textual description.

    C{join(...)} gives a C{CompositeLocation}, C{complement(...)} a
    C{ComplementLocation} (wrapping a C{CompositeLocation} when it
    holds several items), C{from..to} a C{SimpleLocation} and a single
    number a C{PointLocation}.

    @param text: description of the location
    @type text: str

    @return: the parsed location
    @rtype: C{Location} subclass instance

    @raise AssertionError: when C{text} is not well formed
    '''
    text = text.strip()

    if text[0:5] == 'join(':
        assert text[-1] == ')', "Misformated location : %s" % text
        return CompositeLocation(__locationParser(sl)
                                 for sl in __sublocationIterator(text[5:-1]))

    if text[0:11] == 'complement(':
        assert text[-1] == ')', "Misformated location : %s" % text
        subl = tuple(__locationParser(sl)
                     for sl in __sublocationIterator(text[11:-1]))
        if len(subl) > 1:
            subl = CompositeLocation(subl)
        else:
            subl = subl[0]
        return ComplementLocation(subl)

    data = __simplelocparser.match(text)
    assert data is not None, "Misformated location : %s" % text
    data = data.groupdict()

    if not data['to']:
        # A lone position; the '<' marker, if any, is not recorded.
        return PointLocation(int(data['from']))

    sl = SimpleLocation(int(data['from']), int(data['to']))
    sl.before = data['before'] == '<'
    sl.after = data['after'] == '>'
    return sl
def locationGenerator(locstring):
    '''
    Parse a location string as present in genbank or embl file.

    @param locstring: string description of the location in embl/gb format
    @type locstring: str

    @return: a Location instance
    @rtype: C{Location} subclass instance

    @raise AssertionError: when C{locstring} is not well formed
    '''
    return __locationParser(locstring)
# An identifier, optionally followed by ".version", located just
# before a ':'. Raw string: the pattern contains '\.'.
_matchExternalRef = re.compile(r'[A-Za-z0-9_|]+(\.[0-9]+)?(?=:)')


def extractExternalRefs(locstring):
    '''
    When a location describes external references (ex: D28156.1:1..>1292)
    separate the external reference part of the location and the location
    by itself.

    @param locstring: text representation of the location.
    @type locstring: str

    @return: a tuple with a set of strings describing accession numbers
             of the referred sequences and a C{Location} instance.
    @rtype: tuple(set,Location)
    '''
    refs = set(x.group() for x in _matchExternalRef.finditer(locstring))

    # One alternative "ref:" per reference found. With no reference
    # at all the pattern is a bare ':', so stray colons are removed.
    prefixes = ':|'.join([re.escape(x) for x in refs]) + ':'
    stripped = re.compile(prefixes).sub('', locstring)

    return refs, locationGenerator(stripped)