remove extra obitools directory from ecoPCR subversion archive
git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@418 60f365c0-8329-0410-b2a4-ec073aeeaa1d
obitools/SVGdraw.py: 1054 lines removed (file diff suppressed because it is too large)
@@ -1,711 +0,0 @@
'''
**obitools** main module
------------------------

.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>


The obitools module provides base classes for sequence manipulation.

All biological sequences must be subclasses of :py:class:`obitools.BioSequence`.
Some biological sequences are defined as transformations of other
biological sequences. For example, reverse complemented sequences
are a transformation of a :py:class:`obitools.NucSequence`. This particular
type of sequence is a subclass of :py:class:`obitools.WrappedBioSequence`.

.. inheritance-diagram:: BioSequence NucSequence AASequence WrappedBioSequence SubSequence DNAComplementSequence
   :parts: 1

'''

from weakref import ref

from obitools.utils.iterator import uniqueChain
from itertools import chain
import re

_default_raw_parser = " %s *= *([^;]*);"

try:
    from functools import partial
except:
    #
    # Added for compatibility purposes with Python < 2.5
    #
    def partial(func, *args, **keywords):
        def newfunc(*fargs, **fkeywords):
            newkeywords = keywords.copy()
            newkeywords.update(fkeywords)
            return func(*(args + fargs), **newkeywords)
        newfunc.func = func
        newfunc.args = args
        newfunc.keywords = keywords
        return newfunc


from obitools.sequenceencoder import DNAComplementEncoder
from obitools.location import Location
class WrapperSetIterator(object):
    def __init__(self, s):
        self._i = set.__iter__(s)

    def next(self):
        return self._i.next()()

    def __iter__(self):
        return self


class WrapperSet(set):
    def __iter__(self):
        return WrapperSetIterator(self)

class BioSequence(object):
    '''
    BioSequence class is the base class for biological
    sequence representation.

    It provides storage of:

        - the sequence itself,
        - an identifier,
        - a definition,

    and manages a set of complementary information on a key / value principle.

    .. warning::

        :py:class:`obitools.BioSequence` is an abstract class, this constructor
        can only be called by a subclass constructor.
    '''

    def __init__(self, id, seq, definition=None, rawinfo=None, rawparser=_default_raw_parser, **info):
        '''
        :param id: sequence identifier
        :type id: `str`

        :param seq: the sequence
        :type seq: `str`

        :param definition: sequence definition (optional)
        :type definition: `str`

        :param rawinfo: a text containing a set of key=value; patterns
        :type rawinfo: `str`

        :param rawparser: a text describing a regular pattern template
                          used to parse rawinfo
        :type rawparser: `str`

        :param info: extra named parameters can be added to associate complementary
                     data to the sequence
        '''

        assert type(self) != BioSequence, "obitools.BioSequence is an abstract class"

        self._seq = str(seq).lower()
        self._info = dict(info)
        if rawinfo is not None:
            self._rawinfo = ' ' + rawinfo
        else:
            self._rawinfo = None
        self._rawparser = rawparser
        self.definition = definition
        self.id = id
        self._hasTaxid = None

    def get_seq(self):
        return self.__seq

    def set_seq(self, value):
        if not isinstance(value, str):
            value = str(value)
        self.__seq = value
        self.__len = len(value)

    def clone(self):
        seq = type(self)(self.id,
                         str(self),
                         definition=self.definition
                         )
        seq._info = dict(self.getTags())
        seq._rawinfo = self._rawinfo
        seq._rawparser = self._rawparser
        seq._hasTaxid = self._hasTaxid
        return seq

    def getDefinition(self):
        '''
        Sequence definition getter.

        :return: the sequence definition
        :rtype: str
        '''
        return self._definition

    def setDefinition(self, value):
        '''
        Sequence definition setter.

        :param value: the new sequence definition
        :type value: C{str}
        :return: C{None}
        '''
        self._definition = value

    def getId(self):
        '''
        Sequence identifier getter.

        :return: the sequence identifier
        :rtype: C{str}
        '''
        return self._id

    def setId(self, value):
        '''
        Sequence identifier setter.

        :param value: the new sequence identifier
        :type value: C{str}
        :return: C{None}
        '''
        self._id = value

    def getStr(self):
        '''
        Return the sequence as a string.

        :return: the string representation of the sequence
        :rtype: str
        '''
        return self._seq

    def getSymbolAt(self, position):
        '''
        Return the symbol at C{position} in the sequence.

        :param position: the desired position. Positions start at 0;
                         if position is < 0 it is considered
                         to reference the end of the sequence.
        :type position: `int`

        :return: a one letter string
        :rtype: `str`
        '''
        return str(self)[position]

    def getSubSeq(self, location):
        '''
        Return a subsequence as described by C{location}.

        The C{location} parameter can be a L{obitools.location.Location} instance,
        an integer or a python C{slice} instance. If C{location}
        is an integer this method is equivalent to L{getSymbolAt}.

        :param location: the positions of the subsequence to return
        :type location: C{Location} or C{int} or C{slice}
        :return: the subsequence
        :rtype: a single character as a C{str} if C{location} is an integer,
                a L{obitools.SubSequence} instance otherwise.
        '''
        if isinstance(location, Location):
            return location.extractSequence(self)
        elif isinstance(location, int):
            return self.getSymbolAt(location)
        elif isinstance(location, slice):
            return SubSequence(self, location)

        raise TypeError, 'key must be a Location, an integer or a slice'

    def getKey(self, key):
        if key not in self._info:
            if self._rawinfo is None:
                if key == 'count':
                    return 1
                else:
                    raise KeyError, key
            p = re.compile(self._rawparser % key)
            m = p.search(self._rawinfo)
            if m is not None:
                v = m.group(1)
                self._rawinfo = ' ' + self._rawinfo[0:m.start(0)] + self._rawinfo[m.end(0):]
                try:
                    v = eval(v)
                except:
                    pass
                self._info[key] = v
            else:
                if key == 'count':
                    v = 1
                else:
                    raise KeyError, key
        else:
            v = self._info[key]
        return v

    def extractTaxon(self):
        '''
        Extract taxonomy information from the sequence header.
        This method returns None by default. It should be subclassed
        if necessary, as in L{obitools.seqdb.AnnotatedSequence}.

        :return: None
        '''
        self._hasTaxid = self.hasKey('taxid')
        return None

    def __str__(self):
        return self.getStr()

    def __getitem__(self, key):
        if isinstance(key, str):
            if key == 'taxid' and self._hasTaxid is None:
                self.extractTaxon()
            return self.getKey(key)
        else:
            return self.getSubSeq(key)

    def __setitem__(self, key, value):
        self._info[key] = value
        if key == 'taxid':
            self._hasTaxid = value is not None

    def __delitem__(self, key):
        if isinstance(key, str):
            if key in self:
                del self._info[key]
            else:
                raise KeyError, key

            if key == 'taxid':
                self._hasTaxid = False
        else:
            raise TypeError, key

    def __iter__(self):
        '''
        Iterate through the sequence symbols.
        '''
        return iter(str(self))

    def __len__(self):
        return self.__len

    def hasKey(self, key):
        rep = key in self._info

        if not rep and self._rawinfo is not None:
            p = re.compile(self._rawparser % key)
            m = p.search(self._rawinfo)
            if m is not None:
                v = m.group(1)
                self._rawinfo = ' ' + self._rawinfo[0:m.start(0)] + self._rawinfo[m.end(0):]
                try:
                    v = eval(v)
                except:
                    pass
                self._info[key] = v
                rep = True

        return rep

    def __contains__(self, key):
        '''
        Method allowing use of the C{in} operator on a C{BioSequence}.

        The C{in} operator tests if the C{key} value is defined for this
        sequence.

        :param key: the name of the checked value
        :type key: str
        :return: C{True} if the value is defined, C{False} otherwise.
        :rtype: C{bool}
        '''
        if key == 'taxid' and self._hasTaxid is None:
            self.extractTaxon()
        return self.hasKey(key)

    def rawiteritems(self):
        return self._info.iteritems()

    def iteritems(self):
        '''
        Iterate over the items of the dictionary storing the values
        associated to the sequence. It works similarly to
        the iteritems method of C{dict}.

        :return: an iterator over the (key, value) items
                 linked to a sequence
        :rtype: iterator over tuple
        :see: L{items}
        '''
        if self._rawinfo is not None:
            p = re.compile(self._rawparser % "([a-zA-Z]\w*)")
            for k, v in p.findall(self._rawinfo):
                try:
                    self._info[k] = eval(v)
                except:
                    self._info[k] = v
            self._rawinfo = None
        return self._info.iteritems()

    def items(self):
        return [x for x in self.iteritems()]

    def iterkeys(self):
        return (k for k, v in self.iteritems())

    def keys(self):
        return [x for x in self.iterkeys()]

    def getTags(self):
        self.iteritems()
        return self._info

    def getRoot(self):
        return self

    def getWrappers(self):
        if not hasattr(self, '_wrappers'):
            self._wrappers = WrapperSet()
        return self._wrappers

    def register(self, wrapper):
        self.wrappers.add(ref(wrapper, self._unregister))

    def _unregister(self, ref):
        self.wrappers.remove(ref)

    wrappers = property(getWrappers, None, None, '')

    definition = property(getDefinition, setDefinition, None, "Sequence definition")

    id = property(getId, setId, None, 'Sequence identifier')

    def _getTaxid(self):
        return self['taxid']

    def _setTaxid(self, taxid):
        self['taxid'] = taxid

    taxid = property(_getTaxid, _setTaxid, None, 'NCBI taxonomy identifier')

    _seq = property(get_seq, set_seq, None, None)

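# Illustrative sketch (not part of the original module): values given in
# `rawinfo` are parsed lazily by getKey()/hasKey(), using the
# " key = value;" pattern of _default_raw_parser, and eval'ed when possible.
#
#   >>> from obitools import NucSequence
#   >>> s = NucSequence('seq1', 'ACGTT', 'a toy sequence',
#   ...                 rawinfo='count=3; origin=soil;')
#   >>> str(s)                  # sequences are stored lower-cased
#   'acgtt'
#   >>> s['count']              # parsed from rawinfo on first access
#   3
#   >>> s['origin']             # eval() fails here, the raw string is kept
#   'soil'
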
class NucSequence(BioSequence):
    """
    :py:class:`NucSequence` specializes the :py:class:`BioSequence` class for storing DNA
    sequences.

    The constructor is identical to the :py:class:`BioSequence` constructor.
    """

    def complement(self):
        """
        :return: the reverse complemented sequence as an instance of :py:class:`DNAComplementSequence`
        :rtype: :py:class:`DNAComplementSequence`
        """
        return DNAComplementSequence(self)

    def isNucleotide(self):
        return True


class AASequence(BioSequence):
    """
    :py:class:`AASequence` specializes the :py:class:`BioSequence` class for storing protein
    sequences.

    The constructor is identical to the :py:class:`BioSequence` constructor.
    """

    def isNucleotide(self):
        return False

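# A minimal usage sketch (illustrative, assuming the obitools package is
# importable):
#   >>> s = NucSequence('seq1', 'ACGTT')
#   >>> c = s.complement()      # a DNAComplementSequence view, no copy
#   >>> str(c)                  # the reverse complement
#   'aacgt'
#   >>> c.id                    # identifiers are derived automatically
#   'seq1_CMP'
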
class WrappedBioSequence(BioSequence):
    """
    .. warning::

        :py:class:`obitools.WrappedBioSequence` is an abstract class, this constructor
        can only be called by a subclass constructor.
    """

    def __init__(self, reference, id=None, definition=None, **info):

        assert type(self) != WrappedBioSequence, "obitools.WrappedBioSequence is an abstract class"

        self._wrapped = reference
        reference.register(self)
        self._id = id
        self.definition = definition
        self._info = info

    def clone(self):
        seq = type(self)(self.wrapped,
                         id=self._id,
                         definition=self._definition
                         )
        seq._info = dict(self._info)

        return seq

    def getWrapped(self):
        return self._wrapped

    def getDefinition(self):
        d = self._definition or self.wrapped.definition
        return d

    def getId(self):
        d = self._id or self.wrapped.id
        return d

    def isNucleotide(self):
        return self.wrapped.isNucleotide()

    def iterkeys(self):
        return uniqueChain(self._info.iterkeys(),
                           self.wrapped.iterkeys())

    def rawiteritems(self):
        return chain(self._info.iteritems(),
                     (x for x in self.wrapped.rawiteritems()
                      if x[0] not in self._info))

    def iteritems(self):
        for x in self.iterkeys():
            yield (x, self[x])

    def getKey(self, key):
        if key in self._info:
            return self._info[key]
        else:
            return self.wrapped.getKey(key)

    def hasKey(self, key):
        return key in self._info or self.wrapped.hasKey(key)

    def getSymbolAt(self, position):
        return self.wrapped.getSymbolAt(self.posInWrapped(position))

    def posInWrapped(self, position, reference=None):
        if reference is None or reference is self.wrapped:
            return self._posInWrapped(position)
        else:
            return self.wrapped.posInWrapped(self._posInWrapped(position), reference)

    def getStr(self):
        return str(self.wrapped)

    def getRoot(self):
        return self.wrapped.getRoot()

    def complement(self):
        """
        The :py:meth:`complement` method of the :py:class:`WrappedBioSequence` class
        raises an :py:exc:`AttributeError` exception if it is called while the wrapped
        sequence does not correspond to a nucleic acid sequence.
        """

        if self.wrapped.isNucleotide():
            return DNAComplementSequence(self)
        raise AttributeError

    def _posInWrapped(self, position):
        return position

    definition = property(getDefinition, BioSequence.setDefinition, None)
    id = property(getId, BioSequence.setId, None)

    wrapped = property(getWrapped, None, None, "A pointer to the wrapped sequence")

    def _getWrappedRawInfo(self):
        return self.wrapped._rawinfo

    _rawinfo = property(_getWrappedRawInfo)

class SubSequence(WrappedBioSequence):
    """
    A read-only view on a slice of another :py:class:`BioSequence`.
    """

    @staticmethod
    def _sign(x):
        if x == 0:
            return 0
        elif x < 0:
            return -1
        return 1

    def __init__(self, reference,
                 location=None,
                 start=None, stop=None,
                 id=None, definition=None,
                 **info):
        WrappedBioSequence.__init__(self, reference, id=id, definition=definition, **info)

        if isinstance(location, slice):
            self._location = location
        else:
            step = 1
            if not isinstance(start, int):
                start = 0
            if not isinstance(stop, int):
                stop = len(reference)
            self._location = slice(start, stop, step)

        self._indices = self._location.indices(len(self.wrapped))
        self._xrange = xrange(*self._indices)

        self._info['cut'] = '[%d,%d,%s]' % self._indices

        if hasattr(reference, 'quality'):
            self.quality = reference.quality[self._location]

    def getId(self):
        d = self._id or ("%s_SUB" % self.wrapped.id)
        return d

    def clone(self):
        seq = WrappedBioSequence.clone(self)
        seq._location = self._location
        seq._indices = seq._location.indices(len(seq.wrapped))
        seq._xrange = xrange(*seq._indices)
        return seq

    def __len__(self):
        return len(self._xrange)

    def getStr(self):
        return ''.join([x for x in self])

    def __iter__(self):
        return (self.wrapped.getSymbolAt(x) for x in self._xrange)

    def _posInWrapped(self, position):
        return self._xrange[position]

    id = property(getId, BioSequence.setId, None)

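# Sketch: python slices on any BioSequence produce SubSequence views
# (see getSubSeq() above); the applied slice is recorded in the 'cut' tag.
#   >>> s = NucSequence('seq1', 'ACGTT')
#   >>> sub = s[1:4]
#   >>> str(sub), sub.id, sub['cut']
#   ('cgt', 'seq1_SUB', '[1,4,1]')
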
class DNAComplementSequence(WrappedBioSequence):
    """
    Class used to represent a reverse complemented DNA sequence. Instances
    of this class are usually produced by the :py:meth:`NucSequence.complement` method.
    """

    _comp = {'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
             'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
             's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
             'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
             '-': '-'}

    def __init__(self, reference,
                 id=None, definition=None, **info):
        WrappedBioSequence.__init__(self, reference, id=id, definition=definition, **info)
        assert reference.isNucleotide()
        self._info['complemented'] = True
        if hasattr(reference, 'quality'):
            self.quality = reference.quality[::-1]

    def getId(self):
        d = self._id or ("%s_CMP" % self.wrapped.id)
        return d

    def __len__(self):
        return len(self._wrapped)

    def getStr(self):
        return ''.join([x for x in self])

    def __iter__(self):
        return (self.getSymbolAt(x) for x in xrange(len(self)))

    def _posInWrapped(self, position):
        return -(position + 1)

    def getSymbolAt(self, position):
        return DNAComplementSequence._comp[self.wrapped.getSymbolAt(self.posInWrapped(position))]

    def complement(self):
        """
        The :py:meth:`complement` method of the :py:class:`DNAComplementSequence` class
        returns the wrapped sequence: the reverse complement of a reverse
        complemented sequence is the initial sequence.
        """
        return self.wrapped

    id = property(getId, BioSequence.setId, None)

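# Sketch: complementing twice hands back the wrapped sequence object
# itself, as documented in complement() above.
#   >>> s = NucSequence('seq1', 'ACGTT')
#   >>> s.complement().complement() is s
#   True
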
def _isNucSeq(text):
    acgt = 0
    notnuc = 0
    ltot = len(text) * 0.8
    for c in text.lower():
        if c in 'acgt-':
            acgt += 1
        if c not in DNAComplementEncoder._comp:
            notnuc += 1
    return notnuc == 0 and float(acgt) > ltot


def bioSeqGenerator(id, seq, definition=None, rawinfo=None, rawparser=_default_raw_parser, **info):
    """
    Generate automatically the appropriate class instance between:

        - :py:class:`NucSequence`
        - :py:class:`AASequence`

    Build a new sequence instance. A sequence is instantiated as a :py:class:`NucSequence` if the
    `seq` attribute contains more than 80% of *A*, *C*, *G*, *T* or *-* symbols
    in upper or lower case. Otherwise, the new sequence is instantiated as an
    :py:class:`AASequence`.

    :param id: sequence identifier
    :type id: `str`

    :param seq: the sequence
    :type seq: `str`

    :param definition: sequence definition (optional)
    :type definition: `str`

    :param rawinfo: a text containing a set of key=value; patterns
    :type rawinfo: `str`

    :param rawparser: a text describing a regular pattern template
                      used to parse rawinfo
    :type rawparser: `str`

    :param info: extra named parameters can be added to associate complementary
                 data to the sequence
    """
    if _isNucSeq(seq):
        return NucSequence(id, seq, definition, rawinfo, rawparser, **info)
    else:
        return AASequence(id, seq, definition, rawinfo, rawparser, **info)
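# Dispatch sketch (assuming DNAComplementEncoder._comp covers the IUPAC
# nucleotide codes): classification follows the 80% A/C/G/T/- rule of
# _isNucSeq() documented above.
#   >>> bioSeqGenerator('s1', 'ACGTACGT').__class__.__name__
#   'NucSequence'
#   >>> bioSeqGenerator('s2', 'MKVLITGA').__class__.__name__
#   'AASequence'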
Binary file not shown.
@@ -1,13 +0,0 @@

from _nws import NWS
from _upperbond import indexSequences
from _lcs import LCS, lenlcs
from _assemble import DirectAssemble, ReverseAssemble
from _qsassemble import QSolexaDirectAssemble, QSolexaReverseAssemble
from _rassemble import RightDirectAssemble as RightReverseAssemble
from _qsrassemble import QSolexaRightDirectAssemble, QSolexaRightReverseAssemble
from _freeendgap import FreeEndGap
from _freeendgapfm import FreeEndGapFullMatch
from _upperbond import isLCSReachable
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,56 +0,0 @@

'''
Created on 14 May 2009

@author: coissac
'''

from obitools import WrappedBioSequence

class HomoNucBioSeq(WrappedBioSequence):
    '''
    Wraps a nucleic sequence and exposes it with every homopolymer run
    compressed to a single symbol; the run lengths are stored under the
    'homopolymer' tag.
    '''

    def __init__(self, reference, id=None, definition=None, **info):
        '''
        Constructor
        '''
        assert reference.isNucleotide(), "reference must be a nucleic sequence"
        WrappedBioSequence.__init__(self, reference, id=id, definition=definition, **info)
        self.__cleanHomopolymer()

    def __cleanHomopolymer(self):
        s = []
        c = []
        old = None
        nc = 0
        for n in self._wrapped:
            if old is not None and n != old:
                s.append(old)
                c.append(nc)
                nc = 0
            old = n
            nc += 1
        # flush the last homopolymer run
        if old is not None:
            s.append(old)
            c.append(nc)
        self._cached = ''.join(s)
        self['homopolymer'] = c
        # cumulated run lengths, used to map compressed positions
        # back onto the wrapped sequence
        self._cumulative = []
        sum = 0
        for nc in c:
            sum += nc
            self._cumulative.append(sum)

    def __len__(self):
        return len(self._cached)

    def getStr(self):
        return self._cached

    def __iter__(self):
        return iter(self._cached)

    def _posInWrapped(self, position):
        return self._cumulative[position]
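# Compression sketch: 'aaacggt' collapses to 'acgt' while the run lengths
# are kept in the 'homopolymer' tag.
#   >>> from obitools import NucSequence
#   >>> h = HomoNucBioSeq(NucSequence('s1', 'aaacggt'))
#   >>> str(h), h['homopolymer']
#   ('acgt', [3, 1, 2, 1])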
@@ -1,46 +0,0 @@

import os
import re

from obitools.fasta import formatFasta

class SsearchParser(object):

    _matchQuery = re.compile("^Query:.+\n.+?>+([^ ]+)", re.MULTILINE)
    _matchLQuery = re.compile("^Query:.+\n.+?(\d+)(?= nt\n)", re.MULTILINE)
    _matchProp = re.compile("^The best scores are:.*\n(.+?)>>>", re.DOTALL + re.MULTILINE)

    def __init__(self, file):
        if isinstance(file, str):
            file = open(file, 'rU')
        self.data = file.read()
        self.query = SsearchParser._matchQuery.search(self.data).group(1)
        self.queryLength = int(SsearchParser._matchLQuery.search(self.data).group(1))
        props = SsearchParser._matchProp.search(self.data)
        if props:
            props = props.group(0).split('\n')[1:-2]
            self.props = []
            for line in props:
                subject, tab = line.split('\t')
                tab = tab.split()
                ssp = subject.split()
                ac = ssp[0]
                dbl = int(ssp[-5][:-1])
                ident = float(tab[0])
                matchlen = abs(int(tab[5]) - int(tab[4])) + 1
                self.props.append({"ac": ac,
                                   "identity": ident,
                                   "subjectlength": dbl,
                                   'matchlength': matchlen})

def run(seq, database, program='fasta35', opts=''):
    ssearchin, ssearchout, ssearcherr = os.popen3("%s %s %s" % (program, opts, database))
    print >>ssearchin, formatFasta(seq)
    ssearchin.close()
    result = SsearchParser(ssearchout)

    return seq, result

def ssearchIterator(sequenceIterator, database, program='ssearch35', opts=''):
    for seq in sequenceIterator:
        yield run(seq, database, program, opts)
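# Hypothetical usage sketch: fastaIterator and the two file names below
# are assumptions, not part of this module.
#   >>> from obitools.fasta import fastaIterator
#   >>> for seq, res in ssearchIterator(fastaIterator('queries.fasta'),
#   ...                                 'db.fasta'):
#   ...     print seq.id, [p['ac'] for p in res.props]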
@@ -1,175 +0,0 @@

from obitools import BioSequence
from obitools import WrappedBioSequence
from copy import deepcopy

class GappedPositionException(Exception):
    pass

class AlignedSequence(WrappedBioSequence):

    def __init__(self, reference,
                 id=None, definition=None, **info):
        WrappedBioSequence.__init__(self, reference, id=id, definition=definition, **info)
        self._length = len(reference)
        self._gaps = [[self._length, 0]]

    def clone(self):
        seq = WrappedBioSequence.clone(self)
        seq._gaps = deepcopy(self._gaps)
        seq._length = reduce(lambda x, y: x + y, (z[0] + z[1] for z in self._gaps), 0)
        return seq

    def setGaps(self, value):
        '''
        Set the gap vector of an AlignedSequence.

        A gap vector describes the gap positions on a sequence.
        It is a list of couples. The first member of each couple is a count
        of sequence letters, the second one is the length of the gap that follows them.

        @param value: a list of length 2 lists describing gap positions
        @type value: list of couples
        '''
        assert isinstance(value, list), 'Gap vector must be a list'
        assert reduce(lambda x, y: x and y,
                      (isinstance(z, list) and len(z) == 2 for z in value),
                      True), "Value must be a list of length 2 lists"

        lseq = reduce(lambda x, y: x + y, (z[0] for z in value), 0)
        assert lseq == len(self.wrapped), "Gap vector incompatible with the sequence"
        self._gaps = value
        self._length = reduce(lambda x, y: x + y, (z[0] + z[1] for z in value), 0)

    def getGaps(self):
        return tuple(self._gaps)
    gaps = property(getGaps, setGaps, None, "Gap vector")

    def _getIndice(self, pos):
        i = 0
        cpos = 0
        for s, g in self._gaps:
            cpos += s
            if cpos > pos:
                return i, pos - cpos + s
            cpos += g
            if cpos > pos:
                return i, -pos + cpos - g - 1
            i += 1
        raise IndexError

    def getId(self):
        d = self._id or ("%s_ALN" % self.wrapped.id)
        return d

    def __len__(self):
        return self._length

    def getStr(self):
        return ''.join([x for x in self])

    def __iter__(self):
        def isymb():
            cpos = 0
            for s, g in self._gaps:
                for x in xrange(s):
                    yield self.wrapped[cpos + x]
                for x in xrange(g):
                    yield '-'
                cpos += s
        return isymb()

    def _posInWrapped(self, position):
        i, s = self._getIndice(position)
        if s < 0:
            raise GappedPositionException
        value = self._gaps
        p = reduce(lambda x, y: x + y, (z[0] for z in value[:i]), 0) + s
        return p

    def getSymbolAt(self, position):
        try:
            return self.wrapped.getSymbolAt(self.posInWrapped(position))
        except GappedPositionException:
            return '-'

    def insertGap(self, position, count=1):
        if position == self._length:
            idx = len(self._gaps) - 1
            p = -1
        else:
            idx, p = self._getIndice(position)

        if p >= 0:
            self._gaps.insert(idx, [p, count])
            self._gaps[idx + 1][0] -= p
        else:
            self._gaps[idx][1] += count
        self._length = reduce(lambda x, y: x + y, (z[0] + z[1] for z in self._gaps), 0)

    id = property(getId, BioSequence.setId, None, "Sequence identifier")

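# Illustrative sketch of the gap vector encoding (not part of the original
# file): on the 5-letter sequence 'acgta', the vector [[2, 1], [3, 0]]
# reads "2 letters, a 1-symbol gap, then 3 letters".
#   >>> from obitools import NucSequence
#   >>> a = AlignedSequence(NucSequence('s1', 'acgta'))
#   >>> a.gaps = [[2, 1], [3, 0]]
#   >>> str(a)
#   'ac-gta'
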
class Alignment(list):

    def _assertData(self, data):
        assert isinstance(data, BioSequence), 'You can only add a BioSequence to an alignment'
        if hasattr(self, '_alignlen'):
            assert self._alignlen == len(data), 'All aligned sequences must have the same length'
        else:
            self._alignlen = len(data)
        return data

    def clone(self):
        ali = Alignment(x.clone() for x in self)
        return ali

    def append(self, data):
        data = self._assertData(data)
        list.append(self, data)

    def __setitem__(self, index, data):
        data = self._assertData(data)
        list.__setitem__(self, index, data)

    def getSite(self, key):
        if isinstance(key, int):
            return [x[key] for x in self]

    def insertGap(self, position, count=1):
        for s in self:
            s.insertGap(position, count)

    def isFullGapSite(self, key):
        return reduce(lambda x, y: x and y, (z == '-' for z in self.getSite(key)), True)

    def isGappedSite(self, key):
        return '-' in self.getSite(key)

    def __str__(self):
        l = len(self[0])
        rep = ""
        idmax = max(len(x.id) for x in self) + 2
        template = "%%-%ds %%-60s" % idmax
        for p in xrange(0, l, 60):
            for s in self:
                rep += (template % (s.id, s[p:p + 60])).strip() + '\n'
            rep += "\n"
        return rep


def alignmentReader(file, sequenceIterator):
    seqs = sequenceIterator(file)
    alignement = Alignment()
    for seq in seqs:
        alignement.append(seq)
    return alignement


def columnIterator(alignment):
    lali = len(alignment[0])
    for p in xrange(lali):
        c = [x[p] for x in alignment]
        yield c
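# Usage sketch (the sequences are toy assumptions): insertGap() on an
# Alignment opens the same gap column in every row at once.
#   >>> from obitools import NucSequence
#   >>> ali = Alignment()
#   >>> ali.append(AlignedSequence(NucSequence('s1', 'acgta')))
#   >>> ali.append(AlignedSequence(NucSequence('s2', 'aggta')))
#   >>> ali.insertGap(2, 1)
#   >>> ali.getSite(2)
#   ['-', '-']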
@@ -1,47 +0,0 @@

from obitools.format.genericparser import GenericParser
from obitools.utils import universalOpen
from obitools.fasta import parseFastaDescription
from obitools import NucSequence

import sys

_contigIterator = GenericParser('^CO ')

_contigIterator.addParseAction('AF', '\nAF +(\S+) +([UC]) +(-?[0-9]+)')
_contigIterator.addParseAction('RD', '\nRD +(\S+) +([0-9]+) +([0-9]+) +([0-9]+) *\n([A-Za-z\n*]+?)\n\n')
_contigIterator.addParseAction('DS', '\nDS +(.+)')
_contigIterator.addParseAction('CO', '^CO (\S+)')

def contigIterator(file):
    file = universalOpen(file)
    for entry in _contigIterator(file):
        contig = []
        for rd, ds, af in map(None, entry['RD'], entry['DS'], entry['AF']):
            id = rd[0]
            shift = int(af[2])
            if shift < 0:
                print >> sys.stderr, "Sequence %s in contig %s has a negative padding value %d : skipped" % (id, entry['CO'][0], shift)
                #continue

            definition, info = parseFastaDescription(ds)
            info['shift'] = shift
            seq = rd[4].replace('\n', '').replace('*', '-').strip()
            contig.append(NucSequence(id, seq, definition, **info))

        maxlen = max(len(x) + x['shift'] for x in contig)
        minshift = min(x['shift'] for x in contig)
        rep = []

        for s in contig:
            info = s.getTags()
            info['shift'] -= minshift - 1
            head = '-' * (info['shift'] - 1)

            tail = (maxlen + minshift - len(s) - info['shift'] - 1)
            info['tail'] = tail
            newseq = NucSequence(s.id, head + str(s) + '-' * tail, s.definition, **info)
            rep.append(newseq)

        yield entry['CO'][0], rep
@@ -1,7 +0,0 @@

'''
@author: merciece

Creates the tree representing the coverage of two primers from an ecoPCR output file and an ecoPCR database.
'''
@@ -1,62 +0,0 @@

#!/usr/local/bin/python
'''
Created on 24 Nov. 2011

@author: merciece
'''


def main(amplifiedSeqs, seqsFromDB, keptRanks, errors, tax):
    '''
    The error threshold is set to 3.
    '''

    listtaxabygroupinDB = {}

    for seq in seqsFromDB:
        taxid = seq['taxid']
        p = [a for a in tax.parentalTreeIterator(taxid)]
        for a in p:
            if a != p[0]:
                if a[1] in keptRanks:
                    group = a[0]
                    if group in listtaxabygroupinDB and taxid not in listtaxabygroupinDB[group]:
                        listtaxabygroupinDB[group].add(taxid)
                    elif group not in listtaxabygroupinDB:
                        listtaxabygroupinDB[group] = set([taxid])

    taxabygroup = dict((x, len(listtaxabygroupinDB[x])) for x in listtaxabygroupinDB)

    listamplifiedtaxabygroup = {}

    for seq in amplifiedSeqs:
        if errors[seq.id][2] <= 3:
            taxid = seq['taxid']
            p = [a for a in tax.parentalTreeIterator(taxid)]
            for a in p:
                if a != p[0]:
                    if a[1] in keptRanks:
                        group = a[0]
                        if group in listamplifiedtaxabygroup and taxid not in listamplifiedtaxabygroup[group]:
                            listamplifiedtaxabygroup[group].add(taxid)
                        elif group not in listamplifiedtaxabygroup:
                            listamplifiedtaxabygroup[group] = set([taxid])

    amplifiedtaxabygroup = dict((x, len(listamplifiedtaxabygroup[x])) for x in listamplifiedtaxabygroup)

    BcValues = {}

    groups = [g for g in taxabygroup.keys()]

    for g in groups:
        if g in amplifiedtaxabygroup:
            BcValues[g] = float(amplifiedtaxabygroup[g]) / taxabygroup[g] * 100
            BcValues[g] = round(BcValues[g], 2)
        else:
            BcValues[g] = 0.0

    return BcValues
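# Computation sketch: for each taxonomic group g whose rank is kept,
#   Bc(g) = 100 * |taxa of g amplified with total errors <= 3|
#               / |taxa of g present in the database|
# e.g. 42 amplified taxa out of 56 database taxa gives Bc(g) = 75.0.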
@@ -1,72 +0,0 @@

#!/usr/local/bin/python
'''
Created on 24 Nov. 2011

@author: merciece
'''

import sys


def main(amplifiedSeqs, seqsFromDB, keptRanks, tax):

    BcValues = {}

    #speciesid = tax.findRankByName('species')
    #subspeciesid = tax.findRankByName('subspecies')

    listtaxonbygroup = {}

    for seq in seqsFromDB:
        taxid = seq['taxid']
        p = [a for a in tax.parentalTreeIterator(taxid)]
        for a in p:
            if a != p[0]:
                if a[1] in keptRanks:
                    group = a
                    if group in listtaxonbygroup:
                        listtaxonbygroup[group].add(taxid)
                    else:
                        listtaxonbygroup[group] = set([taxid])

    #stats = dict((x, len(listtaxonbygroup[x])) for x in listtaxonbygroup)

    print >>sys.stderr, listtaxonbygroup

    listtaxonbygroup = {}

    for seq in amplifiedSeqs:
        taxid = seq['taxid']
        p = [a for a in tax.parentalTreeIterator(taxid)]
        for a in p:
            if a != p[0]:
                if a[1] in keptRanks:
                    group = a
                    if group in listtaxonbygroup:
                        listtaxonbygroup[group].add(taxid)
                    else:
                        listtaxonbygroup[group] = set([taxid])

    print >>sys.stderr, listtaxonbygroup

    return BcValues

#    dbstats = dict((x, len(listtaxonbygroup[x])) for x in listtaxonbygroup)
#
#    ranks = [r for r in keptRanks]
#    ranks.sort()
#
#    print '%-20s\t%10s\t%10s\t%7s' % ('rank', 'ecopcr', 'db', 'percent')
#
#    print >>sys.stderr, stats
#    print >>sys.stderr, dbstats
#    print >>sys.stderr, ranks
#
#    for r in ranks:
#        if r in dbstats and dbstats[r]:
#            print '%-20s\t%10d\t%10d\t%8.2f' % (r, dbstats[r], stats[r], float(dbstats[r]) / stats[r] * 100)
@@ -1,108 +0,0 @@

#!/usr/local/bin/python
'''
Created on 25 Nov. 2011

@author: merciece
'''

from obitools.graph.rootedtree import nexusFormat


figtree = """\
begin figtree;
    set appearance.backgroundColorAttribute="User Selection";
    set appearance.backgroundColour=#-1;
    set appearance.branchColorAttribute="bc";
    set appearance.branchLineWidth=2.0;
    set appearance.foregroundColour=#-16777216;
    set appearance.selectionColour=#-2144520576;
    set branchLabels.colorAttribute="User Selection";
    set branchLabels.displayAttribute="errors";
    set branchLabels.fontName="sansserif";
    set branchLabels.fontSize=10;
    set branchLabels.fontStyle=0;
    set branchLabels.isShown=true;
    set branchLabels.significantDigits=4;
    set layout.expansion=2000;
    set layout.layoutType="RECTILINEAR";
    set layout.zoom=0;
    set nodeBars.barWidth=4.0;
    set nodeLabels.colorAttribute="User Selection";
    set nodeLabels.displayAttribute="label";
    set nodeLabels.fontName="sansserif";
    set nodeLabels.fontSize=10;
    set nodeLabels.fontStyle=0;
    set nodeLabels.isShown=true;
    set nodeLabels.significantDigits=4;
    set polarLayout.alignTipLabels=false;
    set polarLayout.angularRange=0;
    set polarLayout.rootAngle=0;
    set polarLayout.rootLength=100;
    set polarLayout.showRoot=true;
    set radialLayout.spread=0.0;
    set rectilinearLayout.alignTipLabels=false;
    set rectilinearLayout.curvature=0;
    set rectilinearLayout.rootLength=100;
    set scale.offsetAge=0.0;
    set scale.rootAge=1.0;
    set scale.scaleFactor=1.0;
    set scale.scaleRoot=false;
    set scaleAxis.automaticScale=true;
    set scaleAxis.fontSize=8.0;
    set scaleAxis.isShown=false;
    set scaleAxis.lineWidth=2.0;
    set scaleAxis.majorTicks=1.0;
    set scaleAxis.origin=0.0;
    set scaleAxis.reverseAxis=false;
    set scaleAxis.showGrid=true;
    set scaleAxis.significantDigits=4;
    set scaleBar.automaticScale=true;
    set scaleBar.fontSize=10.0;
    set scaleBar.isShown=true;
    set scaleBar.lineWidth=1.0;
    set scaleBar.scaleRange=0.0;
    set scaleBar.significantDigits=4;
    set tipLabels.colorAttribute="User Selection";
    set tipLabels.displayAttribute="Names";
    set tipLabels.fontName="sansserif";
    set tipLabels.fontSize=10;
    set tipLabels.fontStyle=0;
    set tipLabels.isShown=true;
    set tipLabels.significantDigits=4;
    set trees.order=false;
    set trees.orderType="increasing";
    set trees.rooting=false;
    set trees.rootingType="User Selection";
    set trees.transform=false;
    set trees.transformType="cladogram";
end;
"""


def cartoonRankGenerator(rank):
    def cartoon(node):
        return 'rank' in node and node['rank'] == rank

    return cartoon


def collapseBcGenerator(Bclimit):
    def collapse(node):
        return 'bc' in node and node['bc'] <= Bclimit
    return collapse


def label(node):
    if 'bc' in node:
        return "(%+3.1f) %s" % (node['bc'], node['name'])
    else:
        return " %s" % node['name']


def main(coverageTree):
    print nexusFormat(coverageTree,
                      label=label,
                      blocks=figtree,
                      cartoon=cartoonRankGenerator('family'))
                      #collapse=collapseBcGenerator(70))
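# Quick sketch: cartoonRankGenerator() returns a predicate closed over the
# chosen rank; a plain dict stands in for a tree node here (assumption).
#   >>> isFamily = cartoonRankGenerator('family')
#   >>> isFamily({'rank': 'family'}), isFamily({'rank': 'genus'})
#   (True, False)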
@@ -1,56 +0,0 @@

#!/usr/local/bin/python
'''
Created on 24 Nov. 2011

@author: merciece
'''


def main(seqs, keptRanks, tax):
    errorsBySeq = getErrorsOnLeaves(seqs)
    errorsByTaxon = propagateErrors(errorsBySeq, keptRanks, tax)
    return errorsBySeq, errorsByTaxon


def getErrorsOnLeaves(seqs):
    errors = {}
    for s in seqs:
        taxid = s['taxid']
        forErrs = s['forward_error']
        revErrs = s['reverse_error']
        total = forErrs + revErrs
        seqNb = 1
        errors[s.id] = [forErrs, revErrs, total, seqNb, taxid]
    return errors


def propagateErrors(errorsOnLeaves, keptRanks, tax):
    allErrors = {}
    for seq in errorsOnLeaves:
        taxid = errorsOnLeaves[seq][4]
        p = [a for a in tax.parentalTreeIterator(taxid)]
        for a in p:
            if a[1] in keptRanks:
                group = a[0]
                if group in allErrors:
                    allErrors[group][0] += errorsOnLeaves[seq][0]
                    allErrors[group][1] += errorsOnLeaves[seq][1]
                    allErrors[group][2] += errorsOnLeaves[seq][2]
                    allErrors[group][3] += 1
                else:
                    # copy the leaf record so the in-place averaging below
                    # does not mutate it nor leak across groups
                    allErrors[group] = list(errorsOnLeaves[seq])

    for group in allErrors:
        allErrors[group][0] /= float(allErrors[group][3])
        allErrors[group][1] /= float(allErrors[group][3])
        allErrors[group][2] /= float(allErrors[group][3])

        allErrors[group][0] = round(allErrors[group][0], 2)
        allErrors[group][1] = round(allErrors[group][1], 2)
        allErrors[group][2] = round(allErrors[group][2], 2)

    return allErrors
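# Averaging sketch with a minimal taxonomy stub (Tax is an assumption,
# not part of this module): two leaves of the same genus, with error
# totals 2 and 4, average to 3.0 for that group. Each record reads
# [forward_errors, reverse_errors, total, sequence_count, taxid].
#   >>> class Tax(object):
#   ...     def parentalTreeIterator(self, taxid):
#   ...         return iter([(taxid, 'species'), (1, 'genus')])
#   >>> leaves = {'s1': [1, 1, 2, 1, 10], 's2': [2, 2, 4, 1, 10]}
#   >>> propagateErrors(leaves, set(['genus']), Tax())[1]
#   [1.5, 1.5, 3.0, 2, 10]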
@@ -1,69 +0,0 @@

#!/usr/local/bin/python
'''
Created on 23 Nov. 2011

@author: merciece
'''

from obitools.ecopcr import sequence
from obitools.ecopcr import taxonomy


def main(entries, options):
    filteredDataFromDB = ecoPCRDatabaseReader(options)
    filteredData = ecoPCRFileReader(entries, filteredDataFromDB)
    return filteredDataFromDB, filteredData


def ecoPCRDatabaseReader(options):

    tax = taxonomy.EcoTaxonomyDB(options.taxonomy)
    seqs = sequence.EcoPCRDBSequenceIterator(options.taxonomy, taxonomy=tax)

    norankid = tax.findRankByName('no rank')
    speciesid = tax.findRankByName('species')
    genusid = tax.findRankByName('genus')
    familyid = tax.findRankByName('family')

    minrankseq = set([speciesid, genusid, familyid])

    usedrankid = {}

    ingroup = {}
    outgroup = {}

    for s in seqs:
        if 'taxid' in s:
            taxid = s['taxid']
            allrank = set()
            for p in tax.parentalTreeIterator(taxid):
                if p[1] != norankid:
                    allrank.add(p[1])
            if len(minrankseq & allrank) == 3:
                for r in allrank:
                    usedrankid[r] = usedrankid.get(r, 0) + 1

                if tax.isAncestor(options.ingroup, taxid):
                    ingroup[s.id] = s
                else:
                    outgroup[s.id] = s

    keptranks = set(r for r in usedrankid
                    if float(usedrankid[r]) / float(len(ingroup)) > options.rankthresold)

    return {'ingroup': ingroup,
            'outgroup': outgroup,
            'ranks': keptranks,
            'taxonomy': tax
            }


def ecoPCRFileReader(entries, filteredDataFromDB):
    filteredData = []
    for s in entries:
        if 'taxid' in s:
            seqId = s.id
            if seqId in filteredDataFromDB['ingroup']:
                filteredData.append(s)
    return filteredData
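# Filter sketch: a sequence is kept only when its non-"no rank" lineage
# carries all three of the species, genus and family ranks; a rank is then
# kept when it annotates more than options.rankthresold of the ingroup,
# e.g. with rankthresold=0.8 a rank seen on 900 of 1000 ingroup sequences
# (0.9) is kept, while one seen on 700 (0.7) is dropped.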
@@ -1,42 +0,0 @@

#!/usr/local/bin/python
'''
Created on 25 Nov. 2011

@author: merciece
'''

from obitools.graph.rootedtree import RootedTree


def main(BcValues, errors, tax):

    tree = RootedTree()
    tset = set(BcValues)

    for taxon in BcValues:
        if taxon in errors:
            forErr = errors[taxon][0]
            revErr = errors[taxon][1]
            totErr = errors[taxon][2]
        else:
            forErr = -1.0
            revErr = -1.0
            totErr = -1.0

        tree.addNode(taxon, rank=tax.getRank(taxon),
                     name=tax.getScientificName(taxon),
                     bc=BcValues[taxon],
                     errors=str(forErr) + ' ' + str(revErr),
                     totError=totErr
                     )

    for taxon in BcValues:
        piter = tax.parentalTreeIterator(taxon)
        taxon = piter.next()
        for parent in piter:
            if taxon[0] in tset and parent[0] in BcValues:
                tset.remove(taxon[0])
                tree.addEdge(parent[0], taxon[0])
            taxon = parent

    return tree
@@ -1,207 +0,0 @@

from os import popen2
from itertools import imap, count

from obitools.table import iTableIterator, TableRow, Table, SelectionIterator
from obitools.utils import ColumnFile
from obitools.location import SimpleLocation
from obitools.fasta import formatFasta
import sys

class Blast(object):
    '''
    Run blast.
    '''

    def __init__(self, mode, db, program='blastall', **options):
        self._mode = mode
        self._db = db
        self._program = program
        self._options = options

    def getMode(self):
        return self._mode

    def getDb(self):
        return self._db

    def getProgram(self):
        return self._program

    def _blastcmd(self):
        tmp = """%(program)s \\
                 -p %(mode)s \\
                 -d %(db)s \\
                 -m 8 \\
                 %(options)s \\
              """
        options = ' '.join(['-%s %s' % (x[0], str(x[1]))
                            for x in self._options.iteritems()])
        data = {
            'program': self.program,
            'db': self.db,
            'mode': self.mode,
            'options': options
        }

        return tmp % data

    def __call__(self, sequence):
        '''
        Run blast with one sequence object.

        @param sequence: the query sequence
        @type sequence: L{obitools.BioSequence}
        '''
        cmd = self._blastcmd()

        (blast_in, blast_out) = popen2(cmd)

        print >>blast_in, formatFasta(sequence)
        blast_in.close()

        blast = BlastResultIterator(blast_out)

        return blast

    mode = property(getMode, None, None, "Blast mode")

    db = property(getDb, None, None, "Blast database")

    program = property(getProgram, None, None, "Blast program")

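# Illustrative sketch (mode, db and the option below are assumed values):
# with Blast('blastn', 'nt', e=0.001), _blastcmd() renders roughly
#
#   blastall \
#       -p blastn \
#       -d nt \
#       -m 8 \
#       -e 0.001 \
#
# i.e. tabular (-m 8) output, which BlastResultIterator parses below.
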
class NetBlast(Blast):
    '''
    Run blast on the NCBI servers.
    '''

    def __init__(self, mode, db, **options):
        '''
        @param mode: the blast program flavour to run
        @param db: the reference database name
        '''
        Blast.__init__(self, mode, db, 'blastcl3', **options)


class BlastResultIterator(iTableIterator):

    def __init__(self, blastoutput, query=None):
        '''
        @param blastoutput: the tabular (-m 8) blast output
        @type blastoutput: an open file-like object
        '''
        self._blast = ColumnFile(blastoutput,
                                 strip=True,
                                 skip="#",
                                 sep="\t",
                                 types=self.types
                                 )
        self._query = query
        self._hindex = dict((k, i) for i, k in imap(None, count(), self._getHeaders()))

    def _getHeaders(self):
        return ('Query id', 'Subject id',
                '% identity', 'alignment length',
                'mismatches', 'gap openings',
                'q. start', 'q. end',
                's. start', 's. end',
                'e-value', 'bit score')

    def _getTypes(self):
        return (str, str,
                float, int,
                int, int,
                int, int,
                int, int,
                float, float)

    def _getRowFactory(self):
        return BlastMatch

    def _getSubrowFactory(self):
        return TableRow

    def _getQuery(self):
        return self._query

    headers = property(_getHeaders, None, None)
    types = property(_getTypes, None, None)
    rowFactory = property(_getRowFactory, None, None)
    subrowFactory = property(_getSubrowFactory, None, None)
    query = property(_getQuery, None, None)

    def next(self):
        '''
        Return the next blast match as a row object.
        '''
        value = self._blast.next()
        return self.rowFactory(self, value)


class BlastResult(Table):
    '''
    Results of a blast run.
    '''

class BlastMatch(TableRow):
    '''
    Blast high scoring pair between two sequences.
    '''

    def getQueryLocation(self):
        l = SimpleLocation(self[6], self[7])
        return l

    def getSubjectLocation(self):
        l = SimpleLocation(self[8], self[9])
        return l

    def getSubjectSequence(self, database):
        return database[self[1]]

    def queryCov(self, query=None):
        '''
        Compute the coverage of the match on the query sequence.

        @param query: the query sequence. Default is None.
                      In this case the query sequence associated
                      to this blast result is used.
        @type query: L{obitools.BioSequence}

        @return: coverage fraction
        @rtype: float
        '''
        if query is None:
            query = self.table.query
        assert query is not None
        return float(self[7] - self[6] + 1) / float(len(query))

    def __getitem__(self, key):
        if key == 'query coverage' and self.table.query is not None:
            return self.queryCov()
        else:
            return TableRow.__getitem__(self, key)

class BlastCovMinFilter(SelectionIterator):

    def __init__(self, blastiterator, covmin, query=None, **conditions):
        if query is None:
            query = blastiterator.table.query
        assert query is not None
        SelectionIterator.__init__(self, blastiterator, **conditions)
        self._query = query
        self._covmin = covmin

    def _covMinPredicat(self, row):
        return row.queryCov(self._query) >= self._covmin

    def _checkCondition(self, row):
        return self._covMinPredicat(row) and SelectionIterator._checkCondition(self, row)
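# Coverage sketch: a hit spanning q. start=11 .. q. end=60 on a 100 nt
# query gives queryCov = (60 - 11 + 1) / 100 = 0.5, and a
# BlastCovMinFilter with covmin=0.5 would keep exactly such rows or better.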
@@ -1,376 +0,0 @@

# -*- coding: latin1 -*-

from obitools import SVGdraw
import math

class Map(object):
    """
    Map represents an instance of a physical genetic map.
    Such a map is defined by the length of the sequence
    associated with it.

    A map is associated with a number of levels (Level),
    themselves divided into sub-levels (SubLevel).
    The sub-levels contain the features.
    """
    def __init__(self, name, seqlength, scale=1):
        """
        Constructor of a new map.

        *Param*:

            name
                name of the map

            seqlength
                length of the sequence associated with the map

            scale
                scale of the map, indicating how many pixels
                correspond to one map unit
        """
        self.name = name
        self.seqlength = seqlength
        self.scale = scale
        self.levels = {}
        self.basicHSize = 10

    def __str__(self):
        return '<%s>' % self.name

    def __getitem__(self, level):
        """
        Return level *level* of the map,
        creating it if it does not exist.
        """
        if not isinstance(level, int):
            raise TypeError('level must be a non-zero integer value')
        elif level == 0:
            raise AssertionError('Level cannot be set to 0')
        try:
            return self.levels[level]
        except KeyError:
            self.levels[level] = Level(level, self)
            return self.levels[level]

    def getBasicHSize(self):
        """
        Return the base height of a map element,
        expressed in pixels.
        """
        return self.basicHSize

    def getScale(self):
        """
        Return the scale of the map as the number of pixels per
        physical map unit.
        """
        return self.scale

    def getNegativeBase(self):
        return reduce(lambda x, y: x - y, [self.levels[z].getHeight()
                                           for z in self.levels
                                           if z < 0], self.getHeight())

    def getPositiveBase(self):
        return self.getNegativeBase() - 3 * self.getBasicHSize()

    def getHeight(self):
        return reduce(lambda x, y: x + y, [z.getHeight() for z in self.levels.values()], 0) \
               + 4 * self.getBasicHSize()

    def toXML(self, file=None, begin=0, end=None):
        dessin = SVGdraw.drawing()
        if end == None:
            end = self.seqlength
        hauteur = self.getHeight()
        largeur = (end - begin + 1) * self.scale
        svg = SVGdraw.svg((begin * self.scale, 0, largeur, hauteur),
                          '%fpx' % (self.seqlength * self.scale),
                          '%dpx' % hauteur)

        centre = self.getPositiveBase() + (1 + 1 / 4) * self.getBasicHSize()
        svg.addElement(SVGdraw.rect(0, centre, self.seqlength * self.scale, self.getBasicHSize() / 2))
        for e in self.levels.values():
            svg.addElement(e.getElement())
        dessin.setSVG(svg)
        return dessin.toXml(file)

class Feature(object):
|
|
||||||
pass
|
|
||||||
|
|
||||||
class Level(object):
|
|
||||||
|
|
||||||
def __init__(self,level,map):
|
|
||||||
if not isinstance(map,Map):
|
|
||||||
raise AssertionError('map is not an instance of class Map')
|
|
||||||
if level in map.levels:
|
|
||||||
raise AssertionError('Level %d already define for map %s' % (level,map))
|
|
||||||
else:
|
|
||||||
map.levels[level] = self
|
|
||||||
self.map = map
|
|
||||||
self.level = level
|
|
||||||
self.sublevels = {}
|
|
||||||
|
|
||||||
def __getitem__(self,sublevel):
|
|
||||||
"""
|
|
||||||
retourne le niveau *sublevel* du niveau en
|
|
||||||
le creant s'il n'existe pas
|
|
||||||
"""
|
|
||||||
if not isinstance(sublevel,int):
|
|
||||||
raise TypeError('sublevel must be a positive integer value')
|
|
||||||
elif sublevel<0:
|
|
||||||
raise AssertionError('Level cannot be negative')
|
|
||||||
try:
|
|
||||||
return self.sublevels[sublevel]
|
|
||||||
except KeyError:
|
|
||||||
self.sublevels[sublevel] = SubLevel(sublevel,self)
|
|
||||||
return self.sublevels[sublevel]
|
|
||||||
|
|
||||||
def getBase(self):
|
|
||||||
if self.level < 0:
|
|
||||||
base = self.map.getNegativeBase()
|
|
||||||
base += reduce(lambda x,y:x+y,[self.map.levels[z].getHeight()
|
|
||||||
for z in self.map.levels
|
|
||||||
if z <0 and z >= self.level],0)
|
|
||||||
return base
|
|
||||||
else:
|
|
||||||
base = self.map.getPositiveBase()
|
|
||||||
base -= reduce(lambda x,y:x+y,[self.map.levels[z].getHeight()
|
|
||||||
for z in self.map.levels
|
|
||||||
if z >0 and z < self.level],0)
|
|
||||||
return base
|
|
||||||
|
|
||||||
def getElement(self):
|
|
||||||
objet = SVGdraw.group('level%d' % self.level)
|
|
||||||
for e in self.sublevels.values():
|
|
||||||
objet.addElement(e.getElement())
|
|
||||||
return objet
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def getHeight(self):
|
|
||||||
return reduce(lambda x,y:x+y,[z.getHeight() for z in self.sublevels.values()],0) \
|
|
||||||
+ 2 * self.map.getBasicHSize()
|
|
||||||
|
|
||||||
class SubLevel(object):
|
|
||||||
|
|
||||||
def __init__(self,sublevel,level):
|
|
||||||
if not isinstance(level,Level):
|
|
||||||
raise AssertionError('level is not an instance of class Level')
|
|
||||||
if level in level.sublevels:
|
|
||||||
raise AssertionError('Sublevel %d already define for level %s' % (sublevel,level))
|
|
||||||
else:
|
|
||||||
level.sublevels[sublevel] = self
|
|
||||||
self.level = level
|
|
||||||
self.sublevel = sublevel
|
|
||||||
self.features = {}
|
|
||||||
|
|
||||||
def getHeight(self):
|
|
||||||
return max([x.getHeight() for x in self.features.values()]+[0]) + 4 * self.level.map.getBasicHSize()
|
|
||||||
|
|
||||||
def getBase(self):
|
|
||||||
base = self.level.getBase()
|
|
||||||
if self.level.level < 0:
|
|
||||||
base -= self.level.getHeight() - 2 * self.level.map.getBasicHSize()
|
|
||||||
base += reduce(lambda x,y:x+y,[self.level.sublevels[z].getHeight()
|
|
||||||
for z in self.level.sublevels
|
|
||||||
if z <= self.sublevel],0)
|
|
||||||
base -= 2* self.level.map.getBasicHSize()
|
|
||||||
else:
|
|
||||||
base -= reduce(lambda x,y:x+y,[self.level.sublevels[z].getHeight()
|
|
||||||
for z in self.level.sublevels
|
|
||||||
if z < self.sublevel],0)
|
|
||||||
base -= self.level.map.getBasicHSize()
|
|
||||||
return base
|
|
||||||
|
|
||||||
def getElement(self):
|
|
||||||
base = self.getBase()
|
|
||||||
objet = SVGdraw.group('sublevel%d' % self.sublevel)
|
|
||||||
for e in self.features.values():
|
|
||||||
objet.addElement(e.getElement(base))
|
|
||||||
return objet
|
|
||||||
|
|
||||||
def add(self,feature):
|
|
||||||
if not isinstance(feature,Feature):
|
|
||||||
raise TypeError('feature must be an instance oof Feature')
|
|
||||||
if feature.name in self.features:
|
|
||||||
raise AssertionError('A feature with the same name (%s) have already be insert in this sublevel'
|
|
||||||
% feature.name)
|
|
||||||
self.features[feature.name]=feature
|
|
||||||
feature.sublevel=self
|
|
||||||
|
|
||||||
class SimpleFeature(Feature):
|
|
||||||
|
|
||||||
def __init__(self,name,begin,end,visiblename=False,color=0):
|
|
||||||
self.begin = begin
|
|
||||||
self.end = end
|
|
||||||
self.name = name
|
|
||||||
self.color = color
|
|
||||||
self.sublevel = None
|
|
||||||
self.visiblename=visiblename
|
|
||||||
|
|
||||||
def getHeight(self):
|
|
||||||
if not self.sublevel:
|
|
||||||
raise AssertionError('Not affected Simple feature')
|
|
||||||
if self.visiblename:
|
|
||||||
return self.sublevel.level.map.getBasicHSize() * 2
|
|
||||||
else:
|
|
||||||
return self.sublevel.level.map.getBasicHSize()
|
|
||||||
|
|
||||||
def getElement(self,base):
|
|
||||||
scale = self.sublevel.level.map.getScale()
|
|
||||||
y = base - self.sublevel.level.map.getBasicHSize()
|
|
||||||
x = self.begin * scale
|
|
||||||
width = (self.end - self.begin + 1) * scale
|
|
||||||
heigh = self.sublevel.level.map.getBasicHSize()
|
|
||||||
|
|
||||||
objet = SVGdraw.rect(x,y,width,heigh,stroke=self.color)
|
|
||||||
objet.addElement(SVGdraw.description(self.name))
|
|
||||||
|
|
||||||
return objet
|
|
||||||
|
|
||||||
class BoxFeature(SimpleFeature):
|
|
||||||
|
|
||||||
def getHeight(self):
|
|
||||||
if not self.sublevel:
|
|
||||||
raise AssertionError('Not affected Box feature')
|
|
||||||
if self.visiblename:
|
|
||||||
return self.sublevel.level.map.getBasicHSize() * 4
|
|
||||||
else:
|
|
||||||
return self.sublevel.level.map.getBasicHSize() * 3
|
|
||||||
|
|
||||||
def getElement(self,base):
|
|
||||||
scale = self.sublevel.level.map.getScale()
|
|
||||||
y = base - self.sublevel.level.map.getBasicHSize() * 2
|
|
||||||
x = self.begin * scale
|
|
||||||
width = (self.end - self.begin + 1) * scale
|
|
||||||
height = self.sublevel.level.map.getBasicHSize() * 3
|
|
||||||
|
|
||||||
objet = SVGdraw.rect(x,y,width,height,stroke=self.color,fill="none")
|
|
||||||
objet.addElement(SVGdraw.description(self.name))
|
|
||||||
|
|
||||||
return objet
|
|
||||||
|
|
||||||
class MultiPartFeature(Feature):
|
|
||||||
|
|
||||||
def __init__(self,name,*args,**kargs):
|
|
||||||
self.limits = args
|
|
||||||
self.name = name
|
|
||||||
try:
|
|
||||||
self.color = kargs['color']
|
|
||||||
except KeyError:
|
|
||||||
self.color = "black"
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.visiblename=kargs['visiblename']
|
|
||||||
except KeyError:
|
|
||||||
self.visiblename=None
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.flatlink=kargs['flatlink']
|
|
||||||
except KeyError:
|
|
||||||
self.flatlink=False
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.roundlink=kargs['roundlink']
|
|
||||||
except KeyError:
|
|
||||||
self.roundlink=False
|
|
||||||
|
|
||||||
self.sublevel = None
|
|
||||||
|
|
||||||
|
|
||||||
def getHeight(self):
|
|
||||||
if not self.sublevel:
|
|
||||||
raise AssertionError('Not affected Simple feature')
|
|
||||||
if self.visiblename:
|
|
||||||
return self.sublevel.level.map.getBasicHSize() * 3
|
|
||||||
else:
|
|
||||||
return self.sublevel.level.map.getBasicHSize() * 2
|
|
||||||
|
|
||||||
def getElement(self,base):
|
|
||||||
scale = self.sublevel.level.map.getScale()
|
|
||||||
|
|
||||||
y = base - self.sublevel.level.map.getBasicHSize()
|
|
||||||
height = self.sublevel.level.map.getBasicHSize()
|
|
||||||
objet = SVGdraw.group(self.name)
|
|
||||||
for (debut,fin) in self.limits:
|
|
||||||
x = debut * scale
|
|
||||||
width = (fin - debut + 1) * scale
|
|
||||||
part = SVGdraw.rect(x,y,width,height,fill=self.color)
|
|
||||||
objet.addElement(part)
|
|
||||||
|
|
||||||
debut = self.limits[0][1]
|
|
||||||
for (fin,next) in self.limits[1:]:
|
|
||||||
debut*=scale
|
|
||||||
fin*=scale
|
|
||||||
path = SVGdraw.pathdata(debut,y + height / 2)
|
|
||||||
delta = height / 2
|
|
||||||
if self.roundlink:
|
|
||||||
path.qbezier((debut+fin)/2, y - delta,fin,y + height / 2)
|
|
||||||
else:
|
|
||||||
if self.flatlink:
|
|
||||||
delta = - height / 2
|
|
||||||
path.line((debut+fin)/2, y - delta)
|
|
||||||
path.line(fin,y + height / 2)
|
|
||||||
path = SVGdraw.path(path,fill="none",stroke=self.color)
|
|
||||||
objet.addElement(path)
|
|
||||||
debut = next
|
|
||||||
|
|
||||||
objet.addElement(SVGdraw.description(self.name))
|
|
||||||
|
|
||||||
return objet
|
|
||||||
|
|
||||||
class TagFeature(Feature):
|
|
||||||
|
|
||||||
def __init__(self,name,begin,length,ratio,visiblename=False,color=0):
|
|
||||||
self.begin = begin
|
|
||||||
self.length = length
|
|
||||||
self.ratio = ratio
|
|
||||||
self.name = name
|
|
||||||
self.color = color
|
|
||||||
self.sublevel = None
|
|
||||||
self.visiblename=visiblename
|
|
||||||
|
|
||||||
def getHeight(self):
|
|
||||||
if not self.sublevel:
|
|
||||||
raise AssertionError('Not affected Tag feature')
|
|
||||||
|
|
||||||
return self.sublevel.level.map.getBasicHSize()*11
|
|
||||||
|
|
||||||
def getElement(self,base):
|
|
||||||
scale = self.sublevel.level.map.getScale()
|
|
||||||
height = math.floor(max(1,self.sublevel.level.map.getBasicHSize()* 10 * self.ratio))
|
|
||||||
y = base + self.sublevel.level.map.getBasicHSize() - height
|
|
||||||
x = self.begin * scale
|
|
||||||
width = self.length * scale
|
|
||||||
objet = SVGdraw.rect(x,y,width,height,stroke=self.color)
|
|
||||||
objet.addElement(SVGdraw.description(self.name))
|
|
||||||
|
|
||||||
return objet
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
carte = Map('essai',20000,scale=0.5)
|
|
||||||
carte[-1][0].add(SimpleFeature('toto',100,300))
|
|
||||||
carte[1][0].add(SimpleFeature('toto',100,300))
|
|
||||||
carte[1][1].add(SimpleFeature('toto',200,1000))
|
|
||||||
|
|
||||||
carte[1][0].add(MultiPartFeature('bout',(1400,1450),(1470,1550),(1650,1800),color='red',flatlink=True))
|
|
||||||
carte[1][0].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='red',flatlink=True))
|
|
||||||
carte[-1][1].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='green'))
|
|
||||||
carte[-1][2].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='purple',roundlink=True))
|
|
||||||
|
|
||||||
carte[-1][1].add(BoxFeature('tutu',390,810,color='purple'))
|
|
||||||
carte[1][0].add(BoxFeature('tutu',390,810,color='red'))
|
|
||||||
carte[2][0].add(TagFeature('t1',1400,20,0.8))
|
|
||||||
carte[2][0].add(TagFeature('t2',1600,20,0.2))
|
|
||||||
carte.basicHSize=6
|
|
||||||
print carte.toXML('truc.svg',begin=0,end=1000)
|
|
||||||
print carte.toXML('truc2.svg',begin=460,end=2000)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,29 +0,0 @@
class DistanceMatrix(object):

    def __init__(self,alignment):
        '''
        DistanceMatrix constructor.

        @param alignment: alignment used to compute the distance matrix
        @type alignment: obitools.align.Alignment
        '''
        # note: the historic attribute name 'aligment' (sic) is kept,
        # as it is referenced throughout the code base
        self.aligment = alignment
        # lazily filled lower-triangular matrix of pairwise distances
        self.matrix = [[None] * (x+1) for x in xrange(len(alignment))]

    def evaluateDist(self,x,y):
        raise NotImplementedError

    def __getitem__(self,key):
        assert isinstance(key,(tuple,list)) and len(key)==2, \
            'key must be a tuple or a list of two integers'
        x,y = key
        if y < x:
            x,y = y,x
        rep = self.matrix[y][x]
        if rep is None:
            rep = self.evaluateDist(x,y)
            self.matrix[y][x] = rep

        return rep

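DistanceMatrix is an abstract, lazily evaluated matrix: __getitem__ normalizes any (x,y) key to the lower triangle and memoizes the value returned by evaluateDist(). A minimal sketch of the intended subclassing pattern (HammingMatrix and the toy alignment are hypothetical):

    class HammingMatrix(DistanceMatrix):
        def evaluateDist(self,x,y):
            s1 = self.aligment[x]
            s2 = self.aligment[y]
            return sum(a!=b for a,b in zip(s1,s2))

    m = HammingMatrix(['acgt','acct','acgg'])   # any indexable alignment works
    print m[(0,2)]                              # computed once...
    print m[(2,0)]                              # ...then served from the cache
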
@ -1,77 +0,0 @@
'''
Module dedicated to computing observed divergences from
an alignment. No distance correction is applied at all.
'''

from itertools import imap

from obitools.distances import DistanceMatrix

class PairewiseGapRemoval(DistanceMatrix):
    '''
    Observed divergence matrix from an alignment.
    Gaps are removed from the alignment on a pairwise
    sequence basis.
    '''

    def evaluateDist(self,x,y):
        '''
        Compute the observed divergence between two sequences
        of an alignment.

        @attention: For performance purposes this method should
                    not be used directly; use instead the __getitem__
                    method from DistanceMatrix.

        @see: L{__getitem__}

        @param x: number of the first sequence in the alignment
        @type x: int
        @param y: number of the second sequence in the alignment
        @type y: int
        '''

        seq1 = self.aligment[x]
        seq2 = self.aligment[y]

        diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
                          (z[0]!=z[1] for z in imap(None,seq1,seq2)
                           if '-' not in z),(0,0))
        return float(diff)/tot


class Pairewise(DistanceMatrix):
    '''
    Observed divergence matrix from an alignment.
    Gaps are kept in the alignment.
    '''

    def evaluateDist(self,x,y):
        '''
        Compute the observed divergence between two sequences
        of an alignment.

        @attention: For performance purposes this method should
                    not be used directly; use instead the __getitem__
                    method from DistanceMatrix.

        @see: L{__getitem__}

        @param x: number of the first sequence in the alignment
        @type x: int
        @param y: number of the second sequence in the alignment
        @type y: int
        '''

        seq1 = self.aligment[x]
        seq2 = self.aligment[y]

        diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
                          (z[0]!=z[1] for z in imap(None,seq1,seq2)),
                          (0,0))
        return float(diff)/tot

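A minimal usage sketch of the two matrices above, on a toy alignment of plain strings (an indexable alignment object works the same way):

    ali = ['acgt-acgta',
           'acctaacgta',
           'a-ctaacgtt']

    print Pairewise(ali)[(0,1)]             # prints 0.2     : gapped columns count
    print PairewiseGapRemoval(ali)[(0,1)]   # prints 0.111...: gapped columns are skipped
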
@ -1,35 +0,0 @@
import sys

from itertools import imap,count

def writePhylipMatrix(matrix):
    # PHYLIP names are limited to 10 characters: truncate them, then make
    # duplicated truncated names unique with a numeric suffix.
    names = [x.id for x in matrix.aligment]
    pnames= [x[:10] for x in names]
    unicity={}
    redundent=[]
    for n in pnames:
        unicity[n]=unicity.get(n,0)+1
        redundent.append(unicity[n])

    for i,n,r in imap(None,count(),pnames,redundent):
        alternate = n
        if r > 1:
            while alternate in pnames:
                lcut = 9 - len(str(r))
                alternate = n[:lcut]+ '_%d' % r
                r+=1
        pnames[i]='%-10s' % alternate

    # lower-triangle PHYLIP layout: sequence count, then one row per
    # sequence with its distances to all previous sequences
    firstline = '%5d' % len(matrix.aligment)
    rep = [firstline]
    for i,n in imap(None,count(),pnames):
        line = [n]
        for j in xrange(i):
            line.append('%5.4f' % matrix[(j,i)])
        rep.append(' '.join(line))
    return '\n'.join(rep)

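A minimal sketch of the layout produced by writePhylipMatrix(), reusing the Pairewise matrix from the module above (AlignedSeq is a hypothetical helper giving each toy sequence the `id` attribute the writer expects):

    class AlignedSeq(str):
        pass

    def _seq(name,letters):
        s = AlignedSeq(letters)
        s.id = name
        return s

    ali = [_seq('sequenceA','acgtacgt'),
           _seq('sequenceB','acctacgt'),
           _seq('sequenceA','acgtaact')]    # duplicated name on purpose

    print writePhylipMatrix(Pairewise(ali))
    #     3
    # sequenceA
    # sequenceB  0.1250
    # sequence_2 0.2500 0.3750    <- second 'sequenceA' renamed to stay unique
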
@ -1,25 +0,0 @@
import sys

from itertools import imap,count

def writeRMatrix(matrix):
    names = [x.id for x in matrix.aligment]
    lmax = max(max(len(x) for x in names),5)
    lali = len(matrix.aligment)

    nformat = '%%-%ds' % lmax
    dformat = '%%%d.4f' % lmax

    pnames=[nformat % x for x in names]

    rep = [' '.join(pnames)]

    for i in xrange(lali):
        line=[]
        for j in xrange(lali):
            line.append('%5.4f' % matrix[(j,i)])
        rep.append(' '.join(line))
    return '\n'.join(rep)

@ -1,100 +0,0 @@
_A=[0]
_C=[1]
_G=[2]
_T=[3]
_R= _A + _G
_Y= _C + _T
_M= _C + _A
_K= _T + _G
_W= _T + _A
_S= _C + _G
_B= _C + _G + _T
_D= _A + _G + _T
_H= _A + _C + _T
_V= _A + _C + _G
_N= _A + _C + _G + _T

_dnahash={'a':_A,
          'c':_C,
          'g':_G,
          't':_T,
          'r':_R,
          'y':_Y,
          'm':_M,
          'k':_K,
          'w':_W,
          's':_S,
          'b':_B,
          'd':_D,
          'h':_H,
          'v':_V,
          'n':_N,
          }

def hashCodeIterator(sequence,wsize,degeneratemax=0,offset=0):
    # Slide a window of wsize letters over the sequence, encoding each
    # letter on 2 bits. Degenerate IUPAC letters expand the current set
    # of alternative hash codes, up to degeneratemax degenerate letters
    # per window.
    errors = 0
    emask = [0] * wsize
    epointer = 0
    size = 0
    position = offset
    hashs = set([0])
    hashmask = 0
    for i in xrange(wsize):
        hashmask <<= 2
        hashmask +=3

    for l in sequence:
        l = l.lower()
        hl = _dnahash[l]

        if emask[epointer]:
            errors-=1
            emask[epointer]=0

        if len(hl) > 1:
            errors +=1
            emask[epointer]=1

        epointer+=1
        epointer%=wsize

        if errors > degeneratemax:
            hl=set([hl[0]])

        hashs=set((((hc<<2) | cl) & hashmask)
                  for hc in hashs
                  for cl in hl)

        if size < wsize:
            size+=1

        if size==wsize:
            if errors <= degeneratemax:
                yield (position,hashs,errors)
            position+=1

def hashSequence(sequence,wsize,degeneratemax=0,offset=0,hashs=None):
    if hashs is None:
        hashs=[[] for x in xrange(4**wsize)]

    for pos,keys,errors in hashCodeIterator(sequence, wsize, degeneratemax, offset):
        for k in keys:
            hashs[k].append(pos)

    return hashs

def hashSequences(sequences,wsize,maxpos,degeneratemax=0):
    hashs=None
    offsets=[]
    offset=0
    for s in sequences:
        offsets.append(offset)
        # keep the table returned by hashSequence: on the first call it
        # allocates the table (the original code dropped this return value,
        # so every sequence was hashed into a fresh, discarded table)
        hashs = hashSequence(s,wsize,degeneratemax=degeneratemax,offset=offset,hashs=hashs)
        offset+=len(s)

    return hashs,offsets

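A minimal sketch: enumerate the 4-mer hash codes of a short sequence. Each nucleotide occupies 2 bits of the code, and the degenerate IUPAC letter 'r' (a or g) doubles the set of codes of every window covering it:

    for position,codes,degenerated in hashCodeIterator('acgtrca',4,degeneratemax=1):
        print position,sorted(codes),degenerated
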
@ -1,32 +0,0 @@
'''
Created on 25 sept. 2010

@author: coissac
'''
from obitools import NucSequence

def referenceDBIterator(options):

    cursor = options.ecobarcodedb.cursor()

    cursor.execute("select id from databases.database where name='%s'" % options.database)
    options.dbid = cursor.fetchone()[0]

    cursor.execute('''
          select s.accession,r.id,r.taxid,r.sequence
          from databases.database d,
               databases.reference r,
               databases.relatedsequences s
          where r.database = d.id
            and s.reference= r.id
            and s.mainac
            and d.name = '%s'
          ''' % options.database
          )

    for ac,id,taxid,sequence in cursor:
        s = NucSequence(ac,sequence)
        s['taxid']=taxid
        s['refdbid']=id
        yield s

@ -1,50 +0,0 @@
'''
Created on 25 sept. 2010

@author: coissac
'''

def alreadyIdentified(seqid,options):
    cursor = options.ecobarcodedb.cursor()
    cursor.execute('''
          select count(*)
          from ecotag.identification
          where sequence=%s
            and database=%s
          ''',(int(seqid),int(options.dbid)))

    return int(cursor.fetchone()[0]) > 0

def storeIdentification(seqid,
                        idstatus,taxid,
                        matches,
                        options
                        ):

    cursor = options.ecobarcodedb.cursor()

    if not options.updatedb:
        cursor.execute('''
             delete from ecotag.identification where sequence=%s and database=%s
             ''',(int(seqid),int(options.dbid)))

    cursor.execute('''
         insert into ecotag.identification (sequence,database,idstatus,taxid)
         values (%s,%s,%s,%s)
         returning id
         ''' , (int(seqid),int(options.dbid),idstatus,int(taxid)))

    idid = cursor.fetchone()[0]

    for seq,identity in matches.iteritems():
        cursor.execute('''
            insert into ecotag.evidence (identification,reference,identity)
            values (%s,
                    %s,
                    %s)
            ''',(idid,seq,identity))

    cursor.close()

    options.ecobarcodedb.commit()

@ -1,64 +0,0 @@
'''
Created on 23 sept. 2010

@author: coissac
'''
import psycopg2

from obitools.ecobarcode.taxonomy import EcoTaxonomyDB

def addEcoBarcodeDBOption(optionManager):
    optionManager.add_option('--dbname',
                             action="store", dest="ecobarcodedb",
                             type='str',
                             default=None,
                             help="Specify the name of the ecobarcode database")

    optionManager.add_option('--server',
                             action="store", dest="dbserver",
                             type='str',
                             default="localhost",
                             help="Specify the address of the ecobarcode database server")

    optionManager.add_option('--user',
                             action="store", dest="dbuser",
                             type='str',
                             default='postgres',
                             help="Specify the user of the ecobarcode database")

    optionManager.add_option('--port',
                             action="store", dest="dbport",
                             type='int',
                             default=5432,
                             help="Specify the port of the ecobarcode database")

    optionManager.add_option('--passwd',
                             action="store", dest="dbpasswd",
                             type='str',
                             default='',
                             help="Specify the passwd of the ecobarcode database")

    optionManager.add_option('--primer',
                             action="store", dest="primer",
                             type='str',
                             default=None,
                             help="Specify the primer used for amplification")


def ecobarcodeDatabaseConnection(options):
    if options.ecobarcodedb is not None:
        connection = psycopg2.connect(database=options.ecobarcodedb,
                                      user=options.dbuser,
                                      password=options.dbpasswd,
                                      host=options.dbserver,
                                      port=options.dbport)
        options.dbname=options.ecobarcodedb
    else:
        connection=None
    if connection is not None:
        options.ecobarcodedb=connection
        taxonomy = EcoTaxonomyDB(connection)
    else:
        taxonomy=None
    return taxonomy

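A minimal wiring sketch, assuming an optparse parser (the connection parameters are examples; ecobarcodeDatabaseConnection() opens a PostgreSQL connection and returns the associated taxonomy, or None when --dbname is not given):

    from optparse import OptionParser

    parser = OptionParser()
    addEcoBarcodeDBOption(parser)
    options,args = parser.parse_args(['--dbname','ecobarcodes',
                                      '--user','postgres'])
    taxonomy = ecobarcodeDatabaseConnection(options)
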
@ -1,38 +0,0 @@
'''
Created on 25 sept. 2010

@author: coissac
'''

from obitools import NucSequence
from obitools.utils import progressBar
from obitools.ecobarcode.ecotag import alreadyIdentified

import sys

def sequenceIterator(options):
    cursor = options.ecobarcodedb.cursor()

    cursor.execute('''
        select s.id,sum(o.count),s.sequence
        from rawdata.sequence s,
             rawdata.occurrences o
        where o.sequence= s.id
          and s.primers = '%s'
        group by s.id,s.sequence
        ''' % options.primer
        )

    nbseq = cursor.rowcount
    progressBar(1, nbseq, True, head=options.dbname)
    for id,count,sequence in cursor:
        progressBar(cursor.rownumber+1, nbseq, head=options.dbname)
        if not options.updatedb or not alreadyIdentified(id,options):
            s = NucSequence(id,sequence)
            s['count']=count
            print >>sys.stderr,' +', cursor.rownumber+1,
            yield s
        else:
            print >>sys.stderr,' @', cursor.rownumber+1,

    print >>sys.stderr

@ -1,120 +0,0 @@
'''
Created on 24 sept. 2010

@author: coissac
'''

from obitools.ecopcr.taxonomy import TaxonomyDump
from obitools.ecopcr.taxonomy import Taxonomy
import sys

class EcoTaxonomyDB(TaxonomyDump):

    def __init__(self,dbconnect):
        self._dbconnect=dbconnect

        print >> sys.stderr,"Reading ecobarcode taxonomy database..."

        self._readNodeTable()
        print >> sys.stderr," ok"

        print >>sys.stderr,"Adding scientific name..."

        self._name=[]
        for taxid,name,classname in self._nameIterator():
            self._name.append((name,classname,self._index[taxid]))
            if classname == 'scientific name':
                self._taxonomy[self._index[taxid]].append(name)

        print >>sys.stderr,"Adding taxid alias..."
        for taxid,current in self._mergedNodeIterator():
            self._index[taxid]=self._index[current]

        print >>sys.stderr,"Adding deleted taxid..."
        for taxid in self._deletedNodeIterator():
            self._index[taxid]=None


        Taxonomy.__init__(self)

    #####
    #
    # Iterator functions
    #
    #####

    def _readNodeTable(self):

        cursor = self._dbconnect.cursor()

        cursor.execute("""
              select taxid,rank,parent
              from ncbitaxonomy.nodes
              """)

        print >>sys.stderr,"Reading taxonomy nodes..."
        taxonomy=[list(n) for n in cursor]

        print >>sys.stderr,"Listing all taxonomy ranks..."
        ranks =list(set(x[1] for x in taxonomy))
        ranks.sort()
        rankidx = dict(map(None,ranks,xrange(len(ranks))))

        print >>sys.stderr,"Sorting taxons..."
        taxonomy.sort(TaxonomyDump._taxonCmp)

        self._taxonomy=taxonomy

        print >>sys.stderr,"Indexing taxonomy..."
        index = {}
        for t in self._taxonomy:
            index[t[0]]=self._bsearchTaxon(t[0])

        print >>sys.stderr,"Indexing parent and rank..."
        for t in self._taxonomy:
            t[1]=rankidx[t[1]]
            t[2]=index[t[2]]

        self._ranks=ranks
        self._index=index

        cursor.close()

    def _nameIterator(self):
        cursor = self._dbconnect.cursor()

        cursor.execute("""
              select taxid,name,nameclass
              from ncbitaxonomy.names
              """)

        for taxid,name,nameclass in cursor:
            yield taxid,name,nameclass

        cursor.close()

    def _mergedNodeIterator(self):
        cursor = self._dbconnect.cursor()

        cursor.execute("""
              select oldtaxid,newtaxid
              from ncbitaxonomy.merged
              """)

        for oldtaxid,newtaxid in cursor:
            yield oldtaxid,newtaxid

        cursor.close()

    def _deletedNodeIterator(self):
        cursor = self._dbconnect.cursor()

        cursor.execute("""
              select taxid
              from ncbitaxonomy.delnodes
              """)

        for taxid in cursor:
            yield taxid[0]

        cursor.close()

@ -1,69 +0,0 @@
from obitools import utils
from obitools import NucSequence
from obitools.utils import universalOpen, universalTell, fileSize, progressBar
import struct
import sys


class EcoPCRFile(utils.ColumnFile):
    def __init__(self,stream):
        utils.ColumnFile.__init__(self,
                                  stream, '|', True,
                                  (str,int,int,
                                   str,int,str,
                                   int,str,int,
                                   str,int,str,
                                   str,str,int,float,
                                   str,int,float,
                                   int,
                                   str,str), "#")


    def next(self):
        data = utils.ColumnFile.next(self)
        seq = NucSequence(data[0],data[20],data[21])
        seq['seq_length_ori']=data[1]
        seq['taxid']=data[2]
        seq['rank']=data[3]
        seq['species']=data[4]
        seq['species_sn']=data[5]
        seq['genus']=data[6]
        seq['genus_sn']=data[7]
        seq['family']=data[8]
        seq['family_sn']=data[9]
        seq['strand']=data[12]
        seq['forward_primer']=data[13]
        seq['forward_error']=data[14]
        seq['forward_tm']=data[15]
        seq['reverse_primer']=data[16]
        seq['reverse_error']=data[17]
        seq['reverse_tm']=data[18]

        return seq


class EcoPCRDBFile(object):

    def _ecoRecordIterator(self,file):
        file = universalOpen(file)
        (recordCount,) = struct.unpack('> I',file.read(4))
        self._recover=False

        if recordCount:
            for i in xrange(recordCount):
                (recordSize,)=struct.unpack('>I',file.read(4))
                record = file.read(recordSize)
                yield record
        else:
            print >> sys.stderr,"\n\n WARNING : EcoPCRDB reading set into recover data mode\n"
            self._recover=True
            ok=True
            while ok:
                try:
                    (recordSize,)=struct.unpack('>I',file.read(4))
                    record = file.read(recordSize)
                    yield record
                except:
                    ok=False

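A minimal reading sketch, assuming ColumnFile implements the iterator protocol ('results.ecopcr' is a hypothetical ecoPCR output file):

    for seq in EcoPCRFile(open('results.ecopcr')):
        print seq.id,len(seq),seq['taxid'],seq['forward_error'],seq['reverse_error']
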
@ -1,104 +0,0 @@
import struct

class EcoPCRDBAnnotationWriter(object):
    '''
    Class used to write annotation descriptions in EcoPCRDB format.

    EcoPCRDBAnnotationWriter is often called through the EcoPCRDBSequenceWriter class.

    @see: L{ecopcr.sequence.EcoPCRDBSequenceWriter}
    '''

    def __init__(self,dbname,id,fileidx=1,type=('CDS',),definition=None):
        '''
        class constructor

        @param dbname: name of the ecoPCR database
        @type dbname: C{str}
        @param id: name of the qualifier used as feature id
        @type id: C{str}
        @param fileidx: index used to build the annotation file name
        @type fileidx: C{int}
        @param type: feature types to store
        @type type: C{list} or C{tuple}
        @param definition: name of the qualifier used as feature definition
        @type definition: C{str}
        '''
        self._type = type
        self._definition = definition
        self._id = id
        self._filename="%s_%03d.adx" % (dbname,fileidx)
        self._file = open(self._filename,'wb')
        self._sequenceIdx=0

        ftname ="%s.fdx" % (dbname)
        ft = open(ftname,'wb')

        self._fttypeidx=dict(map(None,type,xrange(len(type))))

        ft.write(struct.pack('> I',len(type)))

        for t in type:
            ft.write(self._ecoFtTypePacker(t))

        ft.close()

        self._annotationCount=0
        self._file.write(struct.pack('> I',self._annotationCount))


    def _ecoFtTypePacker(self,type):
        totalSize = len(type)
        packed = struct.pack('> I %ds' % totalSize,totalSize,type)

        assert len(packed) == totalSize+4, "error in feature type packing"

        return packed

    def _ecoAnnotationPacker(self,feature,seqidx):
        begin = feature.begin-1
        end = feature.end
        type = self._fttypeidx[feature.ftType]
        strand = feature.isDirect()
        id = feature[self._id][0]
        if self._definition in feature:
            definition = feature[self._definition][0]
        else:
            definition = ''

        assert strand is not None,"Only strand defined features can be stored"

        deflength = len(definition)

        # totalSize excludes the 4 bytes of the size field itself
        totalSize = 4 + 4 + 4 + 4 + 4 + 20 + 4 + deflength

        packed = struct.pack('> I I I I I I 20s I %ds' % (deflength),
                             totalSize,
                             seqidx,
                             begin,
                             end,
                             type,
                             int(strand),
                             id,
                             deflength,
                             definition)

        assert len(packed) == totalSize+4, "error in annotation packing"

        return packed


    def put(self,sequence,seqidx=None):
        if seqidx is None:
            seqidx = self._sequenceIdx
            self._sequenceIdx+=1
        for feature in sequence.getFeatureTable():
            if feature.ftType in self._type:
                self._annotationCount+=1
                self._file.write(self._ecoAnnotationPacker(feature,seqidx))

    def __del__(self):
        self._file.seek(0,0)
        self._file.write(struct.pack('> I',self._annotationCount))
        self._file.close()

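For reference, a record written by _ecoAnnotationPacker() can be decoded symmetrically. A minimal sketch (unpackAnnotation is a hypothetical helper; `record` is a string as yielded by EcoPCRDBFile._ecoRecordIterator, i.e. with the 4-byte size prefix already consumed):

    import struct

    def unpackAnnotation(record):
        # five big-endian uints (seqidx, begin, end, type, strand),
        # the 20-byte feature id, the definition length, the definition
        deflength = len(record) - 44
        (seqidx,begin,end,ftype,strand,
         ftid,_dl,definition) = struct.unpack('> I I I I I 20s I %ds' % deflength, record)
        return seqidx,begin,end,ftype,bool(strand),ftid.strip('\x00'),definition
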
@ -1,129 +0,0 @@
'''
Created on 13 fevr. 2011

@author: coissac
'''

from obitools.ecopcr.taxonomy import Taxonomy, EcoTaxonomyDB, TaxonomyDump, ecoTaxonomyWriter

try:
    from obitools.ecobarcode.options import addEcoBarcodeDBOption,ecobarcodeDatabaseConnection
except ImportError:
    def addEcoBarcodeDBOption(optionmanager):
        pass
    def ecobarcodeDatabaseConnection(options):
        return None

def addTaxonomyDBOptions(optionManager):
    addEcoBarcodeDBOption(optionManager)
    optionManager.add_option('-d','--database',
                             action="store", dest="taxonomy",
                             metavar="<FILENAME>",
                             type="string",
                             help="ecoPCR taxonomy Database "
                                  "name")
    optionManager.add_option('-t','--taxonomy-dump',
                             action="store", dest="taxdump",
                             metavar="<FILENAME>",
                             type="string",
                             help="NCBI Taxonomy dump repository "
                                  "name")


def addTaxonomyFilterOptions(optionManager):
    addTaxonomyDBOptions(optionManager)
    optionManager.add_option('--require-rank',
                             action="append",
                             dest='requiredRank',
                             metavar="<RANK_NAME>",
                             type="string",
                             default=[],
                             help="select sequences whose taxid tag has "
                                  "a parent of rank <RANK_NAME>")

    optionManager.add_option('-r','--required',
                             action="append",
                             dest='required',
                             metavar="<TAXID>",
                             type="int",
                             default=[],
                             help="required taxid")

    optionManager.add_option('-i','--ignore',
                             action="append",
                             dest='ignored',
                             metavar="<TAXID>",
                             type="int",
                             default=[],
                             help="ignored taxid")

def loadTaxonomyDatabase(options):
    if isinstance(options.taxonomy, Taxonomy):
        return options.taxonomy
    taxonomy = ecobarcodeDatabaseConnection(options)
    if (taxonomy is not None or
        options.taxonomy is not None or
        options.taxdump is not None):
        if options.taxdump is not None:
            taxonomy = TaxonomyDump(options.taxdump)
        if taxonomy is not None and isinstance(options.taxonomy, str):
            ecoTaxonomyWriter(options.taxonomy,taxonomy)
            options.ecodb=options.taxonomy
        if isinstance(options.taxonomy, Taxonomy):
            taxonomy = options.taxonomy
        if taxonomy is None and isinstance(options.taxonomy, str):
            taxonomy = EcoTaxonomyDB(options.taxonomy)
            options.ecodb=options.taxonomy
    options.taxonomy=taxonomy
    return options.taxonomy

def taxonomyFilterGenerator(options):
    loadTaxonomyDatabase(options)
    if options.taxonomy is not None:
        taxonomy=options.taxonomy
        def taxonomyFilter(seq):
            def annotateAtRank(seq,rank):
                if 'taxid' in seq and seq['taxid'] is not None:
                    rtaxid= taxonomy.getTaxonAtRank(seq['taxid'],rank)
                    return rtaxid
                return None
            good = True
            if 'taxid' in seq:
                taxid = seq['taxid']
                if options.requiredRank:
                    taxonatrank = reduce(lambda x,y: x and y,
                                         (annotateAtRank(seq,rank) is not None
                                          for rank in options.requiredRank),True)
                    good = good and taxonatrank
                if options.required:
                    good = good and reduce(lambda x,y: x or y,
                                           (taxonomy.isAncestor(r,taxid) for r in options.required),
                                           False)
                if options.ignored:
                    good = good and not reduce(lambda x,y: x or y,
                                               (taxonomy.isAncestor(r,taxid) for r in options.ignored),
                                               False)

            return good


    else:
        def taxonomyFilter(seq):
            return True

    return taxonomyFilter

def taxonomyFilterIteratorGenerator(options):
    taxonomyFilter = taxonomyFilterGenerator(options)

    def filterIterator(seqiterator):
        for seq in seqiterator:
            if taxonomyFilter(seq):
                yield seq

    return filterIterator

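A minimal filtering sketch, assuming an optparse parser ('mytaxo' is a hypothetical ecoPCR taxonomy prefix; 33090 is the NCBI taxid of Viridiplantae; `sequences` stands for any iterator of taxid-annotated sequences):

    from optparse import OptionParser

    parser = OptionParser()
    addTaxonomyFilterOptions(parser)
    options,args = parser.parse_args(['-d','mytaxo',
                                      '--require-rank','species',
                                      '-r','33090'])

    onlyPlants = taxonomyFilterIteratorGenerator(options)
    for seq in onlyPlants(sequences):
        pass        # only sequences passing all taxonomic filters reach this point
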
@ -1,133 +0,0 @@
from obitools import NucSequence
from obitools.ecopcr import EcoPCRDBFile
from obitools.ecopcr.taxonomy import EcoTaxonomyDB, ecoTaxonomyWriter
from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
from obitools.utils import universalOpen
from glob import glob
import struct
import gzip
import sys


class EcoPCRDBSequenceIterator(EcoPCRDBFile):
    '''
    Build an iterator over the sequences included in a sequence database
    formatted for ecoPCR
    '''

    def __init__(self,path,taxonomy=None):
        '''
        ecoPCR data iterator constructor

        @param path: path to the ecoPCR database including the database prefix name
        @type path: C{str}
        @param taxonomy: a taxonomy can be given to the reader to decode the taxonomic data
                         associated with the sequences. If no taxonomy is provided, it will
                         be read from the same path before the sequence database files.
        @type taxonomy: L{obitools.ecopcr.taxonomy.Taxonomy}
        '''
        self._path = path

        if taxonomy is not None:
            self._taxonomy=taxonomy
        else:
            self._taxonomy=EcoTaxonomyDB(path)

        self._seqfilesFiles = glob('%s_???.sdx' % self._path)
        self._seqfilesFiles.sort()

    def __ecoSequenceIterator(self,file):
        for record in self._ecoRecordIterator(file):
            lrecord = len(record)
            lnames = lrecord - (4*4+20)
            (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
            seqid=seqid.strip('\x00')
            de = string[:deflength]
            seq = gzip.zlib.decompress(string[deflength:])
            bioseq = NucSequence(seqid,seq,de,taxidx=taxid,taxid=self._taxonomy._taxonomy[taxid][0])
            yield bioseq

    def __iter__(self):
        for seqfile in self._seqfilesFiles:
            for seq in self.__ecoSequenceIterator(seqfile):
                yield seq

class EcoPCRDBSequenceWriter(object):

    def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None,append=False):
        self._taxonomy=taxonomy
        self._filename="%s_%03d.sdx" % (dbname,fileidx)
        if append:
            mode ='r+b'
            f = universalOpen(self._filename)
            (recordCount,) = struct.unpack('> I',f.read(4))
            self._sequenceCount=recordCount
            del f
            self._file = open(self._filename,mode)
            self._file.seek(0,0)
            self._file.write(struct.pack('> I',0))
            self._file.seek(0,2)
        else:
            self._sequenceCount=0
            mode = 'wb'
            self._file = open(self._filename,mode)
            self._file.write(struct.pack('> I',self._sequenceCount))

        if self._taxonomy is not None:
            print >> sys.stderr,"Writing the taxonomy file...",
            ecoTaxonomyWriter(dbname,self._taxonomy)
            print >> sys.stderr,"Ok"

        if type is not None:
            assert ftid is not None,"You must specify an id attribute for features"
            self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
        else:
            self._annotation = None

    def _ecoSeqPacker(self,seq):

        compactseq = gzip.zlib.compress(str(seq).upper(),9)
        cptseqlength = len(compactseq)
        delength = len(seq.definition)

        totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength

        if self._taxonomy is None or 'taxid' not in seq:
            taxon=-1
        else:
            taxon=self._taxonomy.findIndex(seq['taxid'])

        try:
            packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength),
                                 totalSize,
                                 taxon,
                                 seq.id,
                                 delength,
                                 len(seq),
                                 cptseqlength,
                                 seq.definition,
                                 compactseq)
        except struct.error as e:
            print >>sys.stderr,"\n\n============\n\nError on sequence : %s\n\n" % seq.id
            raise e

        assert len(packed) == totalSize+4, "error in sequence packing"

        return packed


    def put(self,sequence):
        if self._taxonomy is not None:
            if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'):
                sequence.extractTaxon()
        self._file.write(self._ecoSeqPacker(sequence))
        if self._annotation is not None:
            self._annotation.put(sequence, self._sequenceCount)
        self._sequenceCount+=1

    def __del__(self):
        self._file.seek(0,0)
        self._file.write(struct.pack('> I',self._sequenceCount))
        self._file.close()

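A minimal copy sketch: read an ecoPCR sequence database and rewrite the long sequences into a new one ('mydb' and 'subset' are hypothetical database prefixes; the reader's private _taxonomy attribute is reused so both databases share the same taxonomy):

    db = EcoPCRDBSequenceIterator('mydb')
    out = EcoPCRDBSequenceWriter('subset',taxonomy=db._taxonomy)

    for seq in db:
        if len(seq) >= 100:
            out.put(seq)
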
@ -1,630 +0,0 @@
import struct
import sys

from itertools import count,imap

from obitools.ecopcr import EcoPCRDBFile
from obitools.utils import universalOpen
from obitools.utils import ColumnFile

class Taxonomy(object):
    def __init__(self):
        '''
        The base taxonomy constructor. It finalizes the rank and
        name indexes; the actual taxonomy data must have been read
        beforehand by a subclass constructor.
        '''

        self._ranks.append('obi')

        self._speciesidx = self._ranks.index('species')
        self._genusidx = self._ranks.index('genus')
        self._familyidx = self._ranks.index('family')
        self._orderidx = self._ranks.index('order')
        self._nameidx=dict((x[0],x[2]) for x in self._name)
        self._nameidx.update(dict((x[0],x[2]) for x in self._preferedName))
        self._preferedidx=dict((x[2],x[1]) for x in self._preferedName)

        self._bigestTaxid = max(x[0] for x in self._taxonomy)


    def findTaxonByIdx(self,idx):
        if idx is None:
            return None
        return self._taxonomy[idx]

    def findIndex(self,taxid):
        if taxid is None:
            return None
        return self._index[taxid]

    def findTaxonByTaxid(self,taxid):
        return self.findTaxonByIdx(self.findIndex(taxid))

    def findTaxonByName(self,name):
        return self._taxonomy[self._nameidx[name]]

    def findRankByName(self,rank):
        try:
            return self._ranks.index(rank)
        except ValueError:
            return None

    def __contains__(self,taxid):
        return self.findTaxonByTaxid(taxid) is not None


    #####
    #
    # PUBLIC METHODS
    #
    #####


    def subTreeIterator(self, taxid):
        "Return the subtree rooted at the given taxonomic id."
        idx = self.findTaxonByTaxid(taxid)
        yield self._taxonomy[idx]
        for t in self._taxonomy:
            if t[2] == idx:
                for subt in self.subTreeIterator(t[0]):
                    yield subt

    def parentalTreeIterator(self, taxid):
        """
        Return the parental lineage of the given taxonomic id,
        from the taxon itself up to the root.
        """
        taxon=self.findTaxonByTaxid(taxid)
        if taxon is not None:
            while taxon[2]!= 0:
                yield taxon
                taxon = self._taxonomy[taxon[2]]
            yield self._taxonomy[0]
        else:
            raise StopIteration

    def isAncestor(self,parent,taxid):
        return parent in [x[0] for x in self.parentalTreeIterator(taxid)]

    def lastCommonTaxon(self,*taxids):
        if not taxids:
            return None
        if len(taxids)==1:
            return taxids[0]

        if len(taxids)==2:
            t1 = [x[0] for x in self.parentalTreeIterator(taxids[0])]
            t2 = [x[0] for x in self.parentalTreeIterator(taxids[1])]
            t1.reverse()
            t2.reverse()

            count = min(len(t1),len(t2))
            i=0
            while(i < count and t1[i]==t2[i]):
                i+=1
            i-=1

            return t1[i]

        ancetre = taxids[0]
        for taxon in taxids[1:]:
            ancetre = self.lastCommonTaxon(ancetre,taxon)

        return ancetre

    def betterCommonTaxon(self,error=1,*taxids):
        lca = self.lastCommonTaxon(*taxids)
        idx = self._index[lca]
        sublca = [t[0] for t in self._taxonomy if t[2]==idx]
        return sublca


    def getPreferedName(self,taxid):
        idx = self.findIndex(taxid)
        return self._preferedidx.get(idx,self._taxonomy[idx][3])


    def getScientificName(self,taxid):
        return self.findTaxonByTaxid(taxid)[3]

    def getRankId(self,taxid):
        return self.findTaxonByTaxid(taxid)[1]

    def getRank(self,taxid):
        return self._ranks[self.getRankId(taxid)]

    def getTaxonAtRank(self,taxid,rankid):
        if isinstance(rankid, str):
            rankid=self._ranks.index(rankid)
        try:
            return [x[0] for x in self.parentalTreeIterator(taxid)
                    if x[1]==rankid][0]
        except IndexError:
            return None

    def getSpecies(self,taxid):
        return self.getTaxonAtRank(taxid, self._speciesidx)

    def getGenus(self,taxid):
        return self.getTaxonAtRank(taxid, self._genusidx)

    def getFamily(self,taxid):
        return self.getTaxonAtRank(taxid, self._familyidx)

    def getOrder(self,taxid):
        return self.getTaxonAtRank(taxid, self._orderidx)

    def rankIterator(self):
        for x in imap(None,self._ranks,xrange(len(self._ranks))):
            yield x

    def groupTaxa(self,taxa,groupname):
        t=[self.findTaxonByTaxid(x) for x in taxa]
        a=set(x[2] for x in t)
        assert len(a)==1,"All taxa must have the same parent"
        newtaxid=max([2999999]+[x[0] for x in self._taxonomy if x[0]>=3000000 and x[0]<4000000])+1
        newidx=len(self._taxonomy)
        if 'GROUP' not in self._ranks:
            self._ranks.append('GROUP')
        rankid=self._ranks.index('GROUP')
        self._taxonomy.append((newtaxid,rankid,a.pop(),groupname))
        for x in t:
            x[2]=newidx

    def addLocalTaxon(self,name,rank,parent,minimaltaxid=10000000):
        newtaxid = minimaltaxid if (self._bigestTaxid < minimaltaxid) else self._bigestTaxid+1

        rankid=self.findRankByName(rank)
        parentidx = self.findIndex(int(parent))
        tx = (newtaxid,rankid,parentidx,name,'local')
        self._taxonomy.append(tx)
        newidx=len(self._taxonomy)-1
        self._name.append((name,'scientific name',newidx))
        self._nameidx[name]=newidx
        self._index[newtaxid]=newidx

        self._bigestTaxid=newtaxid

        return newtaxid

    def removeLocalTaxon(self,taxid):
        raise NotImplementedError
        txidx = self.findIndex(taxid)
        taxon = self.findTaxonByIdx(txidx)

        assert txidx >= self._localtaxon,"Only local taxa can be deleted"

        for t in self._taxonomy:
            if t[2] == txidx:
                self.removeLocalTaxon(t[0])

        return taxon

    def addPreferedName(self,taxid,name):
        idx = self.findIndex(taxid)
        self._preferedName.append((name,'obi',idx))
        self._preferedidx[idx]=name
        return taxid

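lastCommonTaxon() reverses both parental lineages so that they read root-first, then walks them in parallel and stops at the first disagreement; the last shared taxid is the lowest common ancestor, and the variadic form folds this pairwise operation over all given taxids. A minimal usage sketch (the database prefix and taxids are examples):

    taxonomy = EcoTaxonomyDB('mytaxo')
    lca = taxonomy.lastCommonTaxon(9606,10090)      # e.g. human and mouse
    print taxonomy.getScientificName(lca),taxonomy.getRank(lca)
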
class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
|
|
||||||
'''
|
|
||||||
A taxonomy database class
|
|
||||||
'''
|
|
||||||
|
|
||||||
|
|
||||||
def __init__(self,path):
|
|
||||||
'''
|
|
||||||
        The taxonomy database constructor

        @param path: path to the ecoPCR database including the database prefix name
        @type path: C{str}
        '''
        self._path = path
        self._taxonFile = "%s.tdx" % self._path
        self._localTaxonFile = "%s.ldx" % self._path
        self._ranksFile = "%s.rdx" % self._path
        self._namesFile = "%s.ndx" % self._path
        self._preferedNamesFile = "%s.pdx" % self._path
        self._aliasFile = "%s.adx" % self._path

        print >> sys.stderr,"Reading binary taxonomy database...",

        self.__readNodeTable()

        print >> sys.stderr," ok"

        Taxonomy.__init__(self)


    #####
    #
    # Iterator functions
    #
    #####

    def __ecoNameIterator(self,file):
        for record in self._ecoRecordIterator(file):
            lrecord = len(record)
            lnames = lrecord - 16
            (isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record)
            name=names[:namelength]
            classname=names[namelength:]
            yield (name,classname,indextaxid)


    def __ecoTaxonomicIterator(self):
        for record in self._ecoRecordIterator(self._taxonFile):
            lrecord = len(record)
            lnames = lrecord - 16
            (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
            yield (taxid,rankid,parentidx,name,'ncbi')

        try :
            lt=0
            for record in self._ecoRecordIterator(self._localTaxonFile):
                lrecord = len(record)
                lnames = lrecord - 16
                (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
                lt+=1
                yield (taxid,rankid,parentidx,name,'local')
            print >> sys.stderr, " [INFO : Local taxon file found] : %d added taxa" % lt
        except:
            print >> sys.stderr, " [INFO : Local taxon file not found] "

    def __ecoRankIterator(self):
        for record in self._ecoRecordIterator(self._ranksFile):
            yield record

    def __ecoAliasIterator(self):
        for record in self._ecoRecordIterator(self._aliasFile):
            (taxid,index) = struct.unpack('> I i',record)
            yield taxid,index

    #####
    #
    # Indexes
    #
    #####

    def __ecoNameIndex(self):
        indexName = [x for x in self.__ecoNameIterator(self._namesFile)]
        return indexName

    def __ecoRankIndex(self):
        rank = [r for r in self.__ecoRankIterator()]
        return rank

    def __ecoTaxonomyIndex(self):
        taxonomy = []

        try :
            index = dict(self.__ecoAliasIterator())
            print >> sys.stderr, " [INFO : Taxon alias file found] "
            buildIndex=False
        except:
            print >> sys.stderr, " [INFO : Taxon alias file not found] "
            index={}
            i = 0
            buildIndex=True

        localtaxon=0
        i=0
        for x in self.__ecoTaxonomicIterator():
            taxonomy.append(x)
            if x[4]=='ncbi':
                # NCBI taxa come first, so they fill the first localtaxon slots
                localtaxon+=1

            if buildIndex or x[4]!='ncbi':
                index[x[0]] = i
            i+=1


        print >> sys.stderr,"Taxonomical tree read",
        return taxonomy, index,localtaxon

    def __readNodeTable(self):
        self._taxonomy, self._index, self._localtaxon= self.__ecoTaxonomyIndex()
        self._ranks = self.__ecoRankIndex()
        self._name = self.__ecoNameIndex()

        # Add local taxon names to the name index
        i=self._localtaxon
        for t in self._taxonomy[self._localtaxon:]:
            self._name.append((t[3],'scientific name',i))
            i+=1

        try :
            self._preferedName = [(x[0],'obi',x[2])
                                  for x in self.__ecoNameIterator(self._preferedNamesFile)]
            print >> sys.stderr, " [INFO : Preferred taxon name file found] : %d added taxa" % len(self._preferedName)
        except:
            print >> sys.stderr, " [INFO : Preferred taxon name file not found]"
            self._preferedName = []
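The iterators above delegate the actual I/O to _ecoRecordIterator, which lives elsewhere in the package. As a rough sketch of the contract they assume, a big-endian uint32 record count at the head of each file, then every record prefixed by its own byte length (exactly what the packers of ecoTaxonomyWriter below emit), a hypothetical stand-in reader could look like this:

import struct

def _sketchRecordIterator(path):
    # hypothetical stand-in for _ecoRecordIterator, for illustration only
    f = open(path, 'rb')
    (count,) = struct.unpack('> I', f.read(4))      # record count written first
    for _ in xrange(count):
        (size,) = struct.unpack('> I', f.read(4))   # byte length of the next record
        yield f.read(size)                          # payload, without its size prefix
    f.close()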
class TaxonomyDump(Taxonomy):

    def __init__(self,taxdir):

        self._path=taxdir
        self._readNodeTable('%s/nodes.dmp' % taxdir)

        print >>sys.stderr,"Adding scientific name..."

        self._name=[]
        for taxid,name,classname in self._nameIterator('%s/names.dmp' % taxdir):
            self._name.append((name,classname,self._index[taxid]))
            if classname == 'scientific name':
                self._taxonomy[self._index[taxid]].extend([name,'ncbi'])

        print >>sys.stderr,"Adding taxid alias..."
        for taxid,current in self._mergedNodeIterator('%s/merged.dmp' % taxdir):
            self._index[taxid]=self._index[current]

        print >>sys.stderr,"Adding deleted taxid..."
        for taxid in self._deletedNodeIterator('%s/delnodes.dmp' % taxdir):
            self._index[taxid]=None

        self._nameidx=dict((x[0],x[2]) for x in self._name)


    def _taxonCmp(t1,t2):
        if t1[0] < t2[0]:
            return -1
        elif t1[0] > t2[0]:
            return +1
        return 0

    _taxonCmp=staticmethod(_taxonCmp)

    def _bsearchTaxon(self,taxid):
        taxCount = len(self._taxonomy)
        begin = 0
        end   = taxCount
        oldcheck=taxCount
        check = (begin + end) / 2   # integer midpoint of the search window
        while check != oldcheck and self._taxonomy[check][0]!=taxid :
            if self._taxonomy[check][0] < taxid:
                begin=check
            else:
                end=check
            oldcheck=check
            check = (begin + end) / 2


        if self._taxonomy[check][0]==taxid:
            return check
        else:
            return None



    def _readNodeTable(self,file):

        file = universalOpen(file)

        nodes = ColumnFile(file,
                           sep='|',
                           types=(int,int,str,
                                  str,str,bool,
                                  int,bool,int,
                                  bool,bool,bool,str))
        print >>sys.stderr,"Reading taxonomy dump file..."
        # (taxid,rank,parent)
        taxonomy=[[n[0],n[2],n[1]] for n in nodes]
        print >>sys.stderr,"List all taxonomy rank..."
        ranks =list(set(x[1] for x in taxonomy))
        ranks.sort()
        rankidx = dict(map(None,ranks,xrange(len(ranks))))

        print >>sys.stderr,"Sorting taxons..."
        taxonomy.sort(TaxonomyDump._taxonCmp)

        self._taxonomy=taxonomy
        self._localtaxon=len(taxonomy)

        print >>sys.stderr,"Indexing taxonomy..."
        index = {}
        for t in self._taxonomy:
            index[t[0]]=self._bsearchTaxon(t[0])

        print >>sys.stderr,"Indexing parent and rank..."
        for t in self._taxonomy:
            t[1]=rankidx[t[1]]
            t[2]=index[t[2]]

        self._ranks=ranks
        self._index=index
        self._preferedName = []

    def _nameIterator(self,file):
        file = universalOpen(file)
        names = ColumnFile(file,
                           sep='|',
                           types=(int,str,
                                  str,str))
        for taxid,name,unique,classname,white in names:
            yield taxid,name,classname

    def _mergedNodeIterator(self,file):
        file = universalOpen(file)
        merged = ColumnFile(file,
                           sep='|',
                           types=(int,int,str))
        for taxid,current,white in merged:
            yield taxid,current

    def _deletedNodeIterator(self,file):
        file = universalOpen(file)
        deleted = ColumnFile(file,
                           sep='|',
                           types=(int,str))
        for taxid,white in deleted:
            yield taxid
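TaxonomyDump and the binary writer below are the two halves of a conversion pipeline: parse the NCBI taxdump flat files, then serialize them as the .tdx/.rdx/.ndx/.adx files that the binary reader above expects. A hypothetical round trip (both paths are placeholders):

taxdump = TaxonomyDump('/data/ncbi/taxdump')     # directory holding nodes.dmp, names.dmp, ...
ecoTaxonomyWriter('/data/ecopcr/mydb', taxdump)  # writes mydb.tdx, mydb.rdx, mydb.ndx, mydb.adx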
#####
#
#
# Binary writer
#
#
#####

def ecoTaxonomyWriter(prefix, taxonomy,onlyLocal=False):

    def ecoTaxPacker(tx):

        namelength = len(tx[3])

        totalSize = 4 + 4 + 4 + 4 + namelength

        packed = struct.pack('> I I I I I %ds' % namelength,
                             totalSize,
                             tx[0],
                             tx[1],
                             tx[2],
                             namelength,
                             tx[3])

        return packed

    def ecoRankPacker(rank):

        namelength = len(rank)

        packed = struct.pack('> I %ds' % namelength,
                             namelength,
                             rank)

        return packed

    def ecoAliasPacker(taxid,index):

        totalSize = 4 + 4
        try:
            packed = struct.pack('> I I i',
                                 totalSize,
                                 taxid,
                                 index)
        except struct.error,e:
            print >>sys.stderr,(totalSize,taxid,index)
            print >>sys.stderr,"Total size : %d taxid : %d index : %d" % (totalSize,taxid,index)
            raise e

        return packed

    def ecoNamePacker(name):

        namelength = len(name[0])
        classlength= len(name[1])
        totalSize = namelength + classlength + 4 + 4 + 4 + 4

        packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength),
                             totalSize,
                             int(name[1]=='scientific name'),
                             namelength,
                             classlength,
                             name[2],
                             name[0],
                             name[1])

        return packed


    def ecoTaxWriter(file,taxonomy):
        output = open(file,'wb')
        nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]=='ncbi'),0)

        output.write(struct.pack('> I',nbtaxon))

        for tx in taxonomy:
            if tx[4]=='ncbi':
                output.write(ecoTaxPacker(tx))

        output.close()
        return nbtaxon < len(taxonomy)

    def ecoLocalTaxWriter(file,taxonomy):
        nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]!='ncbi'),0)

        if nbtaxon:
            output = open(file,'wb')

            output.write(struct.pack('> I',nbtaxon))

            for tx in taxonomy:
                if tx[4]!='ncbi':
                    output.write(ecoTaxPacker(tx))

            output.close()


    def ecoRankWriter(file,ranks):
        output = open(file,'wb')
        output.write(struct.pack('> I',len(ranks)))

        for rank in ranks:
            output.write(ecoRankPacker(rank))

        output.close()

    def ecoAliasWriter(file,index):
        output = open(file,'wb')
        output.write(struct.pack('> I',len(index)))

        for taxid in index:
            i=index[taxid]
            if i is None:
                i=-1
            output.write(ecoAliasPacker(taxid, i))

        output.close()

    def nameCmp(n1,n2):
        name1=n1[0].upper()
        name2=n2[0].upper()
        if name1 < name2:
            return -1
        elif name1 > name2:
            return 1
        return 0


    def ecoNameWriter(file,names):
        output = open(file,'wb')
        output.write(struct.pack('> I',len(names)))

        names.sort(nameCmp)

        for name in names:
            output.write(ecoNamePacker(name))

        output.close()

    def ecoPreferedNameWriter(file,names):
        output = open(file,'wb')
        output.write(struct.pack('> I',len(names)))
        for name in names:
            output.write(ecoNamePacker(name))

        output.close()

    localtaxon=True
    if not onlyLocal:
        ecoRankWriter('%s.rdx' % prefix, taxonomy._ranks)
        localtaxon = ecoTaxWriter('%s.tdx' % prefix, taxonomy._taxonomy)
        ecoNameWriter('%s.ndx' % prefix, [x for x in taxonomy._name if x[2] < taxonomy._localtaxon])
        ecoAliasWriter('%s.adx' % prefix, taxonomy._index)
    if localtaxon:
        ecoLocalTaxWriter('%s.ldx' % prefix, taxonomy._taxonomy)
    if taxonomy._preferedName:
        ecoNameWriter('%s.pdx' % prefix, taxonomy._preferedName)
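A quick round trip showing that the '> I I I I I %ds' layout written by ecoTaxPacker is what __ecoTaxonomicIterator unpacks once the leading size field has been consumed (the values here are invented):

import struct

packed = struct.pack('> I I I I I %ds' % 5, 21, 9606, 7, 42, 5, 'Homo!')
record = packed[4:]   # the record iterator strips the totalSize prefix
fields = struct.unpack('> I I I I %ds' % (len(record) - 16), record)
# fields == (9606, 7, 42, 5, 'Homo!') : taxid, rank index, parent index, name length, name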
@@ -1,2 +0,0 @@
class EcoTagResult(dict):
    pass
@@ -1,150 +0,0 @@
from itertools import imap
from obitools import utils

from obitools.ecotag import EcoTagResult

class EcoTagFileIterator(utils.ColumnFile):

    @staticmethod
    def taxid(x):
        x = int(x)
        if x < 0:
            return None
        else:
            return x

    @staticmethod
    def scientificName(x):
        if x=='--':
            return None
        else:
            return x

    @staticmethod
    def value(x):
        if x=='--':
            return None
        else:
            return float(x)

    @staticmethod
    def count(x):
        if x=='--':
            return None
        else:
            return int(x)


    def __init__(self,stream):
        utils.ColumnFile.__init__(self,
                                  stream, '\t', True,
                                  (str,str,str,
                                   EcoTagFileIterator.value,
                                   EcoTagFileIterator.value,
                                   EcoTagFileIterator.value,
                                   EcoTagFileIterator.count,
                                   EcoTagFileIterator.count,
                                   EcoTagFileIterator.taxid,
                                   EcoTagFileIterator.scientificName,
                                   str,
                                   EcoTagFileIterator.taxid,
                                   EcoTagFileIterator.scientificName,
                                   EcoTagFileIterator.taxid,
                                   EcoTagFileIterator.scientificName,
                                   EcoTagFileIterator.taxid,
                                   EcoTagFileIterator.scientificName,
                                   str
                                   ))
        self._memory=None

    _colname = ['identification',
                'seqid',
                'best_match_ac',
                'max_identity',
                'min_identity',
                'theorical_min_identity',
                'count',
                'match_count',
                'taxid',
                'scientific_name',
                'rank',
                'order_taxid',
                'order_sn',
                'family_taxid',
                'family_sn',
                'genus_taxid',
                'genus_sn',
                'species_taxid',
                'species_sn',
                'sequence']

    def next(self):
        if self._memory is not None:
            data=self._memory
            self._memory=None
        else:
            data = utils.ColumnFile.next(self)
        data = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(data)],data))

        if data['identification']=='ID':
            data.cd=[]
            try:
                nextone = utils.ColumnFile.next(self)
                nextone = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(nextone)],nextone))
            except StopIteration:
                nextone = None
            while nextone is not None and nextone['identification']=='CD':
                data.cd.append(nextone)
                try:
                    nextone = utils.ColumnFile.next(self)
                    nextone = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(nextone)],nextone))
                except StopIteration:
                    nextone = None
            self._memory=nextone

        return data

def ecoTagIdentifiedFilter(ecoTagIterator):
    for x in ecoTagIterator:
        if x['identification']=='ID':
            yield x


class EcoTagAbstractIterator(utils.ColumnFile):

    _colname = ['scientific_name',
                'taxid',
                'rank',
                'count',
                'max_identity',
                'min_identity']


    @staticmethod
    def taxid(x):
        x = int(x)
        if x < 0:
            return None
        else:
            return x

    def __init__(self,stream):
        utils.ColumnFile.__init__(self,
                                  stream, '\t', True,
                                  (str,
                                   EcoTagFileIterator.taxid,
                                   str,
                                   int,
                                   float,float,float))

    def next(self):
        data = utils.ColumnFile.next(self)
        data = dict(imap(None,EcoTagAbstractIterator._colname,data))

        return data

def ecoTagAbstractFilter(ecoTagAbsIterator):
    for x in ecoTagAbsIterator:
        if x['taxid'] is not None:
            yield x
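A hypothetical read loop over an ecoTag result file, combining the iterator with the filter above (the file name is a placeholder):

results = EcoTagFileIterator(open('sample.ecotag'))
for hit in ecoTagIdentifiedFilter(results):
    # each 'ID' record carries its companion 'CD' records in hit.cd
    print hit['seqid'], hit['scientific_name'], len(hit.cd)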
@@ -1,54 +0,0 @@
import time
from urllib2 import urlopen
import shelve
from threading import Lock
import sys

class EUtils(object):
    '''

    '''

    _last_request=0
    _interval=3

    def __init__(self):
        self._lock = Lock()

    def wait(self):
        now=time.time()
        delta = now - EUtils._last_request
        while delta < EUtils._interval:
            time.sleep(EUtils._interval - delta)   # sleep only the remaining delay
            now=time.time()
            delta = now - EUtils._last_request

    def _sendRequest(self,url):
        self.wait()
        EUtils._last_request=time.time()
        t = EUtils._last_request
        print >>sys.stderr,"Sending request to NCBI @ %f" % t
        data = urlopen(url).read()
        print >>sys.stderr,"Data read from NCBI @ %f (%f)" % (t,time.time()-t)
        return data

    def setInterval(self,seconde):
        EUtils._interval=seconde


class EFetch(EUtils):
    '''

    '''
    def __init__(self,db,tool='OBITools',
                 retmode='text',rettype="native",
                 server='eutils.ncbi.nlm.nih.gov'):
        EUtils.__init__(self)
        self._url = "http://%s/entrez/eutils/efetch.fcgi?db=%s&tool=%s&retmode=%s&rettype=%s"
        self._url = self._url % (server,db,tool,retmode,rettype)


    def get(self,**args):
        key = "&".join(['%s=%s' % x for x in args.items()])
        return self._sendRequest(self._url +"&" + key)
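A sketch of the intended use of EFetch; the database name and accession are illustrative, and the call performs a real HTTP request throttled by EUtils.wait():

efetch = EFetch('nucleotide', rettype='fasta')
fasta  = efetch.get(id='AJ242600.1')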
@@ -1,56 +0,0 @@
"""
|
|
||||||
implement fastn/fastp sililarity search algorithm for BioSequence.
|
|
||||||
"""
|
|
||||||
|
|
||||||
class Fast(object):
|
|
||||||
|
|
||||||
def __init__(self,seq,kup=2):
|
|
||||||
'''
|
|
||||||
@param seq: sequence to hash
|
|
||||||
@type seq: BioSequence
|
|
||||||
@param kup: word size used for hashing process
|
|
||||||
@type kup: int
|
|
||||||
'''
|
|
||||||
hash={}
|
|
||||||
seq = str(seq)
|
|
||||||
for word,pos in ((seq[i:i+kup].upper(),i) for i in xrange(len(seq)-kup)):
|
|
||||||
if word in hash:
|
|
||||||
hash[word].append(pos)
|
|
||||||
else:
|
|
||||||
hash[word]=[pos]
|
|
||||||
|
|
||||||
self._kup = kup
|
|
||||||
self._hash= hash
|
|
||||||
self._seq = seq
|
|
||||||
|
|
||||||
def __call__(self,seq):
|
|
||||||
'''
|
|
||||||
Align one sequence with the fast hash table.
|
|
||||||
|
|
||||||
@param seq: the sequence to align
|
|
||||||
@type seq: BioSequence
|
|
||||||
|
|
||||||
@return: where smax is the
|
|
||||||
score of the largest diagonal and pmax the
|
|
||||||
associated shift
|
|
||||||
@rtype: a int tuple (smax,pmax)
|
|
||||||
'''
|
|
||||||
histo={}
|
|
||||||
seq = str(seq).upper()
|
|
||||||
hash= self._hash
|
|
||||||
kup = self._kup
|
|
||||||
|
|
||||||
for word,pos in ((seq[i:i+kup],i) for i in xrange(len(seq)-kup)):
|
|
||||||
matchedpos = hash.get(word,[])
|
|
||||||
for p in matchedpos:
|
|
||||||
delta = pos - p
|
|
||||||
histo[delta]=histo.get(delta,0) + 1
|
|
||||||
smax = max(histo.values())
|
|
||||||
pmax = [x for x in histo if histo[x]==smax]
|
|
||||||
return smax,pmax
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return len(self._seq)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
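A small illustration of the diagonal-counting lookup; plain strings are enough here, since the constructor only calls str() on its argument:

reference = Fast('acgtacgtacgt', kup=4)
smax, pmax = reference('ttacgtacg')
# smax is the match count on the best diagonal, pmax the list of shifts reaching it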
@@ -1,384 +0,0 @@
"""
|
|
||||||
fasta module provides functions to read and write sequences in fasta format.
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
#from obitools.format.genericparser import fastGenericEntryIteratorGenerator
|
|
||||||
from obitools.format.genericparser import genericEntryIteratorGenerator
|
|
||||||
from obitools import bioSeqGenerator,BioSequence,AASequence,NucSequence
|
|
||||||
from obitools import _default_raw_parser
|
|
||||||
|
|
||||||
#from obitools.alignment import alignmentReader
|
|
||||||
#from obitools.utils import universalOpen
|
|
||||||
|
|
||||||
import re
|
|
||||||
from obitools.ecopcr.options import loadTaxonomyDatabase
|
|
||||||
from obitools.format import SequenceFileIterator
|
|
||||||
|
|
||||||
#from _fasta import parseFastaDescription,fastaParser
|
|
||||||
#from _fasta import _fastaJoinSeq
|
|
||||||
#from _fasta import _parseFastaTag
|
|
||||||
|
|
||||||
|
|
||||||
#fastaEntryIterator=fastGenericEntryIteratorGenerator(startEntry='>')
|
|
||||||
fastaEntryIterator=genericEntryIteratorGenerator(startEntry='>')
|
|
||||||
rawFastaEntryIterator=genericEntryIteratorGenerator(startEntry='\s*>')
|
|
||||||
|
|
||||||
def _fastaJoinSeq(seqarray):
|
|
||||||
return ''.join([x.strip() for x in seqarray])
|
|
||||||
|
|
||||||
|
|
||||||
def parseFastaDescription(ds,tagparser):
|
|
||||||
|
|
||||||
m = tagparser.search(' '+ds)
|
|
||||||
if m is not None:
|
|
||||||
info=m.group(0)
|
|
||||||
definition = ds[m.end(0):].strip()
|
|
||||||
else:
|
|
||||||
info=None
|
|
||||||
definition=ds
|
|
||||||
|
|
||||||
return definition,info
|
|
||||||
|
|
||||||
def fastaParser(seq,bioseqfactory,tagparser,rawparser,joinseq=_fastaJoinSeq):
|
|
||||||
'''
|
|
||||||
Parse a fasta record.
|
|
||||||
|
|
||||||
@attention: internal purpose function
|
|
||||||
|
|
||||||
@param seq: a sequence object containing all lines corresponding
|
|
||||||
to one fasta sequence
|
|
||||||
@type seq: C{list} or C{tuple} of C{str}
|
|
||||||
|
|
||||||
@param bioseqfactory: a callable object return a BioSequence
|
|
||||||
instance.
|
|
||||||
@type bioseqfactory: a callable object
|
|
||||||
|
|
||||||
@param tagparser: a compiled regular expression usable
|
|
||||||
to identify key, value couples from
|
|
||||||
title line.
|
|
||||||
@type tagparser: regex instance
|
|
||||||
|
|
||||||
@return: a C{BioSequence} instance
|
|
||||||
'''
|
|
||||||
seq = seq.split('\n')
|
|
||||||
title = seq[0].strip()[1:].split(None,1)
|
|
||||||
id=title[0]
|
|
||||||
if len(title) == 2:
|
|
||||||
definition,info=parseFastaDescription(title[1], tagparser)
|
|
||||||
else:
|
|
||||||
info= None
|
|
||||||
definition=None
|
|
||||||
|
|
||||||
seq=joinseq(seq[1:])
|
|
||||||
return bioseqfactory(id, seq, definition,info,rawparser)
|
|
||||||
|
|
||||||
|
|
||||||
def fastaNucParser(seq,tagparser=_default_raw_parser,joinseq=_fastaJoinSeq):
|
|
||||||
return fastaParser(seq,NucSequence,tagparser=tagparser,joinseq=_fastaJoinSeq)
|
|
||||||
|
|
||||||
def fastaAAParser(seq,tagparser=_default_raw_parser,joinseq=_fastaJoinSeq):
|
|
||||||
return fastaParser(seq,AASequence,tagparser=tagparser,joinseq=_fastaJoinSeq)
|
|
||||||
|
|
||||||
def fastaIterator(file,bioseqfactory=bioSeqGenerator,
|
|
||||||
tagparser=_default_raw_parser,
|
|
||||||
joinseq=_fastaJoinSeq):
|
|
||||||
'''
|
|
||||||
iterate through a fasta file sequence by sequence.
|
|
||||||
Returned sequences by this iterator will be BioSequence
|
|
||||||
instances
|
|
||||||
|
|
||||||
@param file: a line iterator containing fasta data or a filename
|
|
||||||
@type file: an iterable object or str
|
|
||||||
@param bioseqfactory: a callable object return a BioSequence
|
|
||||||
instance.
|
|
||||||
@type bioseqfactory: a callable object
|
|
||||||
|
|
||||||
@param tagparser: a compiled regular expression usable
|
|
||||||
to identify key, value couples from
|
|
||||||
title line.
|
|
||||||
@type tagparser: regex instance
|
|
||||||
|
|
||||||
@return: an iterator on C{BioSequence} instance
|
|
||||||
|
|
||||||
@see: L{fastaNucIterator}
|
|
||||||
@see: L{fastaAAIterator}
|
|
||||||
|
|
||||||
>>> from obitools.format.sequence.fasta import fastaIterator
|
|
||||||
>>> f = fastaIterator('monfichier')
|
|
||||||
>>> s = f.next()
|
|
||||||
>>> print s
|
|
||||||
gctagctagcatgctagcatgcta
|
|
||||||
>>>
|
|
||||||
'''
|
|
||||||
rawparser=tagparser
|
|
||||||
allparser = tagparser % '[a-zA-Z][a-zA-Z0-9_]*'
|
|
||||||
tagparser = re.compile('( *%s)+' % allparser)
|
|
||||||
|
|
||||||
for entry in fastaEntryIterator(file):
|
|
||||||
yield fastaParser(entry,bioseqfactory,tagparser,rawparser,joinseq)
|
|
||||||
|
|
||||||
def rawFastaIterator(file,bioseqfactory=bioSeqGenerator,
|
|
||||||
tagparser=_default_raw_parser,
|
|
||||||
joinseq=_fastaJoinSeq):
|
|
||||||
|
|
||||||
rawparser=tagparser
|
|
||||||
allparser = tagparser % '[a-zA-Z][a-zA-Z0-9_]*'
|
|
||||||
tagparser = re.compile('( *%s)+' % allparser)
|
|
||||||
|
|
||||||
for entry in rawFastaEntryIterator(file):
|
|
||||||
entry=entry.strip()
|
|
||||||
yield fastaParser(entry,bioseqfactory,tagparser,rawparser,joinseq)
|
|
||||||
|
|
||||||
def fastaNucIterator(file,tagparser=_default_raw_parser):
|
|
||||||
'''
|
|
||||||
iterate through a fasta file sequence by sequence.
|
|
||||||
Returned sequences by this iterator will be NucSequence
|
|
||||||
instances
|
|
||||||
|
|
||||||
@param file: a line iterator containint fasta data
|
|
||||||
@type file: an iterable object
|
|
||||||
|
|
||||||
@param tagparser: a compiled regular expression usable
|
|
||||||
to identify key, value couples from
|
|
||||||
title line.
|
|
||||||
@type tagparser: regex instance
|
|
||||||
|
|
||||||
@return: an iterator on C{NucBioSequence} instance
|
|
||||||
@rtype: a generator object
|
|
||||||
|
|
||||||
@see: L{fastaIterator}
|
|
||||||
@see: L{fastaAAIterator}
|
|
||||||
'''
|
|
||||||
return fastaIterator(file, NucSequence,tagparser)
|
|
||||||
|
|
||||||
def fastaAAIterator(file,tagparser=_default_raw_parser):
|
|
||||||
'''
|
|
||||||
iterate through a fasta file sequence by sequence.
|
|
||||||
Returned sequences by this iterator will be AASequence
|
|
||||||
instances
|
|
||||||
|
|
||||||
@param file: a line iterator containing fasta data
|
|
||||||
@type file: an iterable object
|
|
||||||
|
|
||||||
@param tagparser: a compiled regular expression usable
|
|
||||||
to identify key, value couples from
|
|
||||||
title line.
|
|
||||||
@type tagparser: regex instance
|
|
||||||
|
|
||||||
@return: an iterator on C{AABioSequence} instance
|
|
||||||
|
|
||||||
@see: L{fastaIterator}
|
|
||||||
@see: L{fastaNucIterator}
|
|
||||||
'''
|
|
||||||
return fastaIterator(file, AASequence,tagparser)
|
|
||||||
|
|
||||||
def formatFasta(data,gbmode=False,upper=False,restrict=None):
|
|
||||||
'''
|
|
||||||
Convert a seqence or a set of sequences in a
|
|
||||||
string following the fasta format
|
|
||||||
|
|
||||||
@param data: sequence or a set of sequences
|
|
||||||
@type data: BioSequence instance or an iterable object
|
|
||||||
on BioSequence instances
|
|
||||||
|
|
||||||
@param gbmode: if set to C{True} identifier part of the title
|
|
||||||
line follows recommendation from nbci to allow
|
|
||||||
sequence indexing with the blast formatdb command.
|
|
||||||
@type gbmode: bool
|
|
||||||
|
|
||||||
@param restrict: a set of key name that will be print in the formated
|
|
||||||
output. If restrict is set to C{None} (default) then
|
|
||||||
all keys are formated.
|
|
||||||
@type restrict: any iterable value or None
|
|
||||||
|
|
||||||
@return: a fasta formated string
|
|
||||||
@rtype: str
|
|
||||||
'''
|
|
||||||
if isinstance(data, BioSequence):
|
|
||||||
data = [data]
|
|
||||||
|
|
||||||
if restrict is not None and not isinstance(restrict, set):
|
|
||||||
restrict = set(restrict)
|
|
||||||
|
|
||||||
rep = []
|
|
||||||
for sequence in data:
|
|
||||||
seq = str(sequence)
|
|
||||||
if sequence.definition is None:
|
|
||||||
definition=''
|
|
||||||
else:
|
|
||||||
definition=sequence.definition
|
|
||||||
if upper:
|
|
||||||
frgseq = '\n'.join([seq[x:x+60].upper() for x in xrange(0,len(seq),60)])
|
|
||||||
else:
|
|
||||||
frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)])
|
|
||||||
info='; '.join(['%s=%s' % x
|
|
||||||
for x in sequence.rawiteritems()
|
|
||||||
if restrict is None or x[0] in restrict])
|
|
||||||
if info:
|
|
||||||
info=info+';'
|
|
||||||
if sequence._rawinfo is not None and sequence._rawinfo:
|
|
||||||
info+=" " + sequence._rawinfo.strip()
|
|
||||||
|
|
||||||
id = sequence.id
|
|
||||||
if gbmode:
|
|
||||||
if 'gi' in sequence:
|
|
||||||
id = "gi|%s|%s" % (sequence['gi'],id)
|
|
||||||
else:
|
|
||||||
id = "lcl|%s|" % (id)
|
|
||||||
title='>%s %s %s' %(id,info,definition)
|
|
||||||
rep.append("%s\n%s" % (title,frgseq))
|
|
||||||
return '\n'.join(rep)
|
|
||||||
|
|
||||||
def formatSAPFastaGenerator(options):
|
|
||||||
loadTaxonomyDatabase(options)
|
|
||||||
|
|
||||||
taxonomy=None
|
|
||||||
if options.taxonomy is not None:
|
|
||||||
taxonomy=options.taxonomy
|
|
||||||
|
|
||||||
assert taxonomy is not None,"SAP formating require indication of a taxonomy database"
|
|
||||||
|
|
||||||
ranks = ('superkingdom', 'kingdom', 'subkingdom', 'superphylum',
|
|
||||||
'phylum', 'subphylum', 'superclass', 'class', 'subclass',
|
|
||||||
'infraclass', 'superorder', 'order', 'suborder', 'infraorder',
|
|
||||||
'parvorder', 'superfamily', 'family', 'subfamily', 'supertribe', 'tribe',
|
|
||||||
'subtribe', 'supergenus', 'genus', 'subgenus', 'species group',
|
|
||||||
'species subgroup', 'species', 'subspecies')
|
|
||||||
|
|
||||||
trank=set(taxonomy._ranks)
|
|
||||||
ranks = [taxonomy._ranks.index(x) for x in ranks if x in trank]
|
|
||||||
|
|
||||||
strict= options.strictsap
|
|
||||||
|
|
||||||
def formatSAPFasta(data,gbmode=False,upper=False,restrict=None):
|
|
||||||
'''
|
|
||||||
Convert a seqence or a set of sequences in a
|
|
||||||
string following the fasta format as recommended for the SAP
|
|
||||||
software
|
|
||||||
|
|
||||||
http://ib.berkeley.edu/labs/slatkin/munch/StatisticalAssignmentPackage.html
|
|
||||||
|
|
||||||
@param data: sequence or a set of sequences
|
|
||||||
@type data: BioSequence instance or an iterable object
|
|
||||||
on BioSequence instances
|
|
||||||
|
|
||||||
@param gbmode: if set to C{True} identifier part of the title
|
|
||||||
line follows recommendation from nbci to allow
|
|
||||||
sequence indexing with the blast formatdb command.
|
|
||||||
@type gbmode: bool
|
|
||||||
|
|
||||||
@param restrict: a set of key name that will be print in the formated
|
|
||||||
output. If restrict is set to C{None} (default) then
|
|
||||||
all keys are formated.
|
|
||||||
@type restrict: any iterable value or None
|
|
||||||
|
|
||||||
@return: a fasta formated string
|
|
||||||
@rtype: str
|
|
||||||
'''
|
|
||||||
if isinstance(data, BioSequence):
|
|
||||||
data = [data]
|
|
||||||
|
|
||||||
if restrict is not None and not isinstance(restrict, set):
|
|
||||||
restrict = set(restrict)
|
|
||||||
|
|
||||||
rep = []
|
|
||||||
for sequence in data:
|
|
||||||
seq = str(sequence)
|
|
||||||
|
|
||||||
if upper:
|
|
||||||
frgseq = '\n'.join([seq[x:x+60].upper() for x in xrange(0,len(seq),60)])
|
|
||||||
else:
|
|
||||||
frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)])
|
|
||||||
|
|
||||||
try:
|
|
||||||
taxid = sequence["taxid"]
|
|
||||||
except KeyError:
|
|
||||||
if strict:
|
|
||||||
raise AssertionError('All sequence must have a taxid')
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
|
|
||||||
definition=' ;'
|
|
||||||
|
|
||||||
for r in ranks:
|
|
||||||
taxon = taxonomy.getTaxonAtRank(taxid,r)
|
|
||||||
if taxon is not None:
|
|
||||||
definition+=' %s: %s,' % (taxonomy._ranks[r],taxonomy.getPreferedName(taxon))
|
|
||||||
|
|
||||||
definition='%s ; %s' % (definition[0:-1],taxonomy.getPreferedName(taxid))
|
|
||||||
|
|
||||||
id = sequence.id
|
|
||||||
if gbmode:
|
|
||||||
if 'gi' in sequence:
|
|
||||||
id = "gi|%s|%s" % (sequence['gi'],id)
|
|
||||||
else:
|
|
||||||
id = "lcl|%s|" % (id)
|
|
||||||
title='>%s%s' %(id,definition)
|
|
||||||
rep.append("%s\n%s" % (title,frgseq))
|
|
||||||
return '\n'.join(rep)
|
|
||||||
|
|
||||||
return formatSAPFasta
|
|
||||||
|
|
||||||
class FastaIterator(SequenceFileIterator):
|
|
||||||
|
|
||||||
|
|
||||||
entryIterator = genericEntryIteratorGenerator(startEntry='>')
|
|
||||||
classmethod(entryIterator)
|
|
||||||
|
|
||||||
def __init__(self,inputfile,bioseqfactory=bioSeqGenerator,
|
|
||||||
tagparser=_default_raw_parser,
|
|
||||||
joinseq=_fastaJoinSeq):
|
|
||||||
|
|
||||||
SequenceFileIterator.__init__(self, inputfile, bioseqfactory)
|
|
||||||
|
|
||||||
self.__file = FastaIterator.entryIterator(self._inputfile)
|
|
||||||
|
|
||||||
self._tagparser = tagparser
|
|
||||||
self._joinseq = joinseq
|
|
||||||
|
|
||||||
def get_tagparser(self):
|
|
||||||
return self.__tagparser
|
|
||||||
|
|
||||||
|
|
||||||
def set_tagparser(self, value):
|
|
||||||
self._rawparser = value
|
|
||||||
allparser = value % '[a-zA-Z][a-zA-Z0-9_]*'
|
|
||||||
self.__tagparser = re.compile('( *%s)+' % allparser)
|
|
||||||
|
|
||||||
def _parseFastaDescription(self,ds):
|
|
||||||
|
|
||||||
m = self._tagparser.search(' '+ds)
|
|
||||||
if m is not None:
|
|
||||||
info=m.group(0)
|
|
||||||
definition = ds[m.end(0):].strip()
|
|
||||||
else:
|
|
||||||
info=None
|
|
||||||
definition=ds
|
|
||||||
|
|
||||||
return definition,info
|
|
||||||
|
|
||||||
|
|
||||||
def _parser(self):
|
|
||||||
'''
|
|
||||||
Parse a fasta record.
|
|
||||||
|
|
||||||
@attention: internal purpose function
|
|
||||||
|
|
||||||
@return: a C{BioSequence} instance
|
|
||||||
'''
|
|
||||||
seq = self._seq.split('\n')
|
|
||||||
title = seq[0].strip()[1:].split(None,1)
|
|
||||||
id=title[0]
|
|
||||||
if len(title) == 2:
|
|
||||||
definition,info=self._parseFastaDescription(title[1])
|
|
||||||
else:
|
|
||||||
info= None
|
|
||||||
definition=None
|
|
||||||
|
|
||||||
seq=self._joinseq(seq[1:])
|
|
||||||
|
|
||||||
return self._bioseqfactory(id, seq, definition,info,self._rawparser)
|
|
||||||
|
|
||||||
_tagparser = property(get_tagparser, set_tagparser, None, "_tagparser's docstring")
|
|
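For reference, this is what the tagparser built inside fastaIterator matches on a title line, using the default raw parser pattern (the title string is invented):

import re
from obitools import _default_raw_parser

allparser = _default_raw_parser % '[a-zA-Z][a-zA-Z0-9_]*'
tagparser = re.compile('( *%s)+' % allparser)
m = tagparser.search(' count=3; taxid=9606; Homo sapiens mitochondrion')
# m.group(0) covers the leading 'key=value;' run; the remainder becomes the definition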
Binary file not shown.
@@ -1,190 +0,0 @@
'''
Created on 29 August 2009

@author: coissac
'''

from obitools import BioSequence
from obitools import _default_raw_parser
from obitools.format.genericparser import genericEntryIteratorGenerator
from obitools import bioSeqGenerator,AASequence,NucSequence
from obitools.fasta import parseFastaDescription
from _fastq import fastqQualitySangerDecoder,fastqQualitySolexaDecoder
from _fastq import qualityToSangerError,qualityToSolexaError
from _fastq import errorToSangerFastQStr
from _fastq import formatFastq
from _fastq import fastqParserGenetator
from obitools.utils import universalOpen

import re

fastqEntryIterator=genericEntryIteratorGenerator(startEntry='^@',endEntry="^\+",strip=True,join=False)

#def fastqParserGenetator(fastqvariant='sanger',bioseqfactory=NucSequence,tagparser=_parseFastaTag):
#
#    qualityDecoder,errorDecoder = {'sanger'   : (fastqQualitySangerDecoder,qualityToSangerError),
#                                   'solexa'   : (fastqQualitySolexaDecoder,qualityToSolexaError),
#                                   'illumina' : (fastqQualitySolexaDecoder,qualityToSangerError)}[fastqvariant]
#
#    def fastqParser(seq):
#        '''
#        Parse a fasta record.
#
#        @attention: internal purpose function
#
#        @param seq: a sequence object containing all lines corresponding
#                    to one fasta sequence
#        @type seq: C{list} or C{tuple} of C{str}
#
#        @param bioseqfactory: a callable object return a BioSequence
#                              instance.
#        @type bioseqfactory: a callable object
#
#        @param tagparser: a compiled regular expression usable
#                          to identify key, value couples from
#                          title line.
#        @type tagparser: regex instance
#
#        @return: a C{BioSequence} instance
#        '''
#
#        title = seq[0][1:].split(None,1)
#        id=title[0]
#        if len(title) == 2:
#            definition,info=parseFastaDescription(title[1], tagparser)
#        else:
#            info= {}
#            definition=None
#
#        quality=errorDecoder(qualityDecoder(seq[3]))
#
#        seq=seq[1]
#
#        seq = bioseqfactory(id, seq, definition,False,**info)
#        seq.quality = quality
#
#        return seq
#
#    return fastqParser


def fastqIterator(file,fastqvariant='sanger',bioseqfactory=NucSequence,tagparser=_default_raw_parser):
    '''
    iterate through a fastq file sequence by sequence.
    Sequences returned by this iterator will be BioSequence
    instances

    @param file: a line iterator containing fastq data or a filename
    @type file: an iterable object or str
    @param bioseqfactory: a callable object returning a BioSequence
                          instance.
    @type bioseqfactory: a callable object

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @return: an iterator on C{BioSequence} instance

    @see: L{fastaNucIterator}
    @see: L{fastaAAIterator}

    '''
    fastqParser=fastqParserGenetator(fastqvariant, bioseqfactory, tagparser)
    file = universalOpen(file)
    for entry in fastqEntryIterator(file):
        title=entry[0]
        seq="".join(entry[1:-1])
        quality=''
        lenseq=len(seq)
        while (len(quality) < lenseq):
            quality+=file.next().strip()

        yield fastqParser([title,seq,'+',quality])

def fastqSangerIterator(file,tagparser=_default_raw_parser):
    '''
    iterate through a fastq file sequence by sequence.
    Sequences returned by this iterator will be NucSequence
    instances

    @param file: a line iterator containing fastq data
    @type file: an iterable object

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @return: an iterator on C{NucBioSequence} instance

    @see: L{fastqIterator}
    @see: L{fastqAAIterator}
    '''
    return fastqIterator(file,'sanger',NucSequence,tagparser)

def fastqSolexaIterator(file,tagparser=_default_raw_parser):
    '''
    iterate through a fastq file sequence by sequence.
    Sequences returned by this iterator will be NucSequence
    instances

    @param file: a line iterator containing fastq data
    @type file: an iterable object

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @return: an iterator on C{NucBioSequence} instance

    @see: L{fastqIterator}
    @see: L{fastqAAIterator}
    '''
    return fastqIterator(file,'solexa',NucSequence,tagparser)

def fastqIlluminaIterator(file,tagparser=_default_raw_parser):
    '''
    iterate through a fastq file sequence by sequence.
    Sequences returned by this iterator will be NucSequence
    instances

    @param file: a line iterator containing fastq data
    @type file: an iterable object

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @return: an iterator on C{NucBioSequence} instance

    @see: L{fastqIterator}
    @see: L{fastqAAIterator}
    '''
    return fastqIterator(file,'illumina',NucSequence,tagparser)

def fastqAAIterator(file,tagparser=_default_raw_parser):
    '''
    iterate through a fastq file sequence by sequence.
    Sequences returned by this iterator will be AASequence
    instances

    @param file: a line iterator containing fastq data
    @type file: an iterable object

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @return: an iterator on C{AABioSequence} instance

    @see: L{fastqIterator}
    @see: L{fastqNucIterator}
    '''
    return fastqIterator(file,'sanger',AASequence,tagparser)
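A hypothetical read loop over a Sanger-encoded fastq file; the file name is a placeholder, and the example assumes the compiled parser attaches the decoded values as seq.quality, as the commented-out Python version above did:

for seq in fastqSangerIterator('reads.fastq'):
    print seq.id, len(seq), seq.quality[:5]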
Binary file not shown.
@@ -1,2 +0,0 @@

fnaTag=' %s *= *([^\s]+)'
@@ -1,8 +0,0 @@
from obitools.fasta import fastaNucIterator
from obitools.fnaqual import fnaTag

def fnaFastaIterator(file):

    x = fastaNucIterator(file, fnaTag)

    return x
@@ -1,137 +0,0 @@
"""
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
from obitools import _default_raw_parser
|
|
||||||
from obitools.fasta import fastaIterator
|
|
||||||
from obitools.fnaqual import fnaTag
|
|
||||||
from obitools.location import Location
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
class QualitySequence(list):
|
|
||||||
|
|
||||||
def __init__(self,id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
|
|
||||||
'''
|
|
||||||
|
|
||||||
@param id:
|
|
||||||
@param seq:
|
|
||||||
@param definition:
|
|
||||||
'''
|
|
||||||
list.__init__(self,seq)
|
|
||||||
self._info = info
|
|
||||||
self.definition=definition
|
|
||||||
self.id=id
|
|
||||||
self._rawinfo=' ' + rawinfo
|
|
||||||
self._rawparser=rawparser
|
|
||||||
|
|
||||||
def getDefinition(self):
|
|
||||||
'''
|
|
||||||
Sequence definition getter
|
|
||||||
|
|
||||||
@return: the sequence definition
|
|
||||||
@rtype: str
|
|
||||||
|
|
||||||
'''
|
|
||||||
return self._definition
|
|
||||||
|
|
||||||
def setDefinition(self, value):
|
|
||||||
self._definition = value
|
|
||||||
|
|
||||||
def getId(self):
|
|
||||||
return self._id
|
|
||||||
|
|
||||||
def setId(self, value):
|
|
||||||
self._id = value
|
|
||||||
|
|
||||||
def getKey(self,key):
|
|
||||||
if key not in self._info:
|
|
||||||
p = re.compile(self._rawparser % key)
|
|
||||||
m = p.search(self._rawinfo)
|
|
||||||
if m is not None:
|
|
||||||
v=m.group(1)
|
|
||||||
self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
|
|
||||||
try:
|
|
||||||
v = eval(v)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
self._info[key]=v
|
|
||||||
else:
|
|
||||||
raise KeyError,key
|
|
||||||
else:
|
|
||||||
v=self._info[key]
|
|
||||||
return v
|
|
||||||
|
|
||||||
def __getitem__(self,key):
|
|
||||||
if isinstance(key,Location):
|
|
||||||
return key.extractSequence(self)
|
|
||||||
elif isinstance(key, str):
|
|
||||||
return self._getKey(key)
|
|
||||||
elif isinstance(key, int):
|
|
||||||
return list.__getitem__(self,key)
|
|
||||||
elif isinstance(key, slice):
|
|
||||||
subseq=list.__getitem__(self,key)
|
|
||||||
info = dict(self._info)
|
|
||||||
if key.start is not None:
|
|
||||||
start = key.start +1
|
|
||||||
else:
|
|
||||||
start = 1
|
|
||||||
if key.stop is not None:
|
|
||||||
stop = key.stop+1
|
|
||||||
else:
|
|
||||||
stop = len(self)
|
|
||||||
if key.step is not None:
|
|
||||||
step = key.step
|
|
||||||
else:
|
|
||||||
step = 1
|
|
||||||
|
|
||||||
info['cut']='[%d,%d,%s]' % (start,stop,step)
|
|
||||||
return QualitySequence(self.id, subseq, self.definition,self._rawinfo,self._rawparser,**info)
|
|
||||||
|
|
||||||
raise TypeError,'key must be an integer, a str or a slice'
|
|
||||||
|
|
||||||
def __setitem__(self,key,value):
|
|
||||||
self._info[key]=value
|
|
||||||
|
|
||||||
def __delitem__(self,key):
|
|
||||||
if isinstance(key, str):
|
|
||||||
del self._info[key]
|
|
||||||
else:
|
|
||||||
raise TypeError,key
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return list.__iter__(self)
|
|
||||||
|
|
||||||
def __contains__(self,key):
|
|
||||||
return key in self._info
|
|
||||||
|
|
||||||
def getTags(self):
|
|
||||||
return self._info
|
|
||||||
|
|
||||||
def complement(self):
|
|
||||||
'''
|
|
||||||
|
|
||||||
'''
|
|
||||||
cseq = self[::-1]
|
|
||||||
rep = QualitySequence(self.id,cseq,self.definition,self._rawinfo,self._rawparser,**self._info)
|
|
||||||
rep._info['complemented']=not rep._info.get('complemented',False)
|
|
||||||
return rep
|
|
||||||
|
|
||||||
|
|
||||||
definition = property(getDefinition, setDefinition, None, "Sequence Definition")
|
|
||||||
|
|
||||||
id = property(getId, setId, None, 'Sequence identifier')
|
|
||||||
|
|
||||||
|
|
||||||
def _qualityJoinSeq(seqarray):
|
|
||||||
text = ' '.join([x.strip() for x in seqarray])
|
|
||||||
return [int(x) for x in text.split()]
|
|
||||||
|
|
||||||
def qualityIterator(file):
|
|
||||||
for q in fastaIterator(file, QualitySequence, fnaTag, _qualityJoinSeq):
|
|
||||||
yield q
|
|
||||||
|
|
||||||
|
|
||||||
|
|
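Slicing a QualitySequence keeps the per-position values and records the cut coordinates in the tags:

q = QualitySequence('read1', [40, 38, 37, 12, 9, 30])
sub = q[1:4]
# list(sub) == [38, 37, 12] and sub['cut'] == '[2,5,1]'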
@@ -1,28 +0,0 @@
from obitools import bioSeqGenerator
from obitools.utils import universalOpen


class SequenceFileIterator:

    def __init__(self,inputfile,bioseqfactory=bioSeqGenerator):
        self._inputfile = universalOpen(inputfile)
        self._bioseqfactory = bioseqfactory

    def get_inputfile(self):
        return self.__file


    def get_bioseqfactory(self):
        return self.__bioseqfactory

    def next(self):
        entry = self._inputfile.next()
        return self._parse(entry)

    def __iter__(self):
        return self

    _inputfile = property(get_inputfile, None, None, "_file's docstring")
    _bioseqfactory = property(get_bioseqfactory, None, None, "_bioseqfactory's docstring")
Binary file not shown.
@@ -1,217 +0,0 @@
"""
|
|
||||||
G{packagetree format}
|
|
||||||
"""
|
|
||||||
import re
|
|
||||||
|
|
||||||
from obitools.utils import universalOpen
|
|
||||||
|
|
||||||
def genericEntryIteratorGenerator(startEntry=None,endEntry=None,
|
|
||||||
head=False,tail=False,
|
|
||||||
strip=False,join=True):
|
|
||||||
'''
|
|
||||||
Transfome a text line iterator to an entry oriented iterator.
|
|
||||||
|
|
||||||
This iterator converted is useful to implement first stage
|
|
||||||
of flat file parsing.
|
|
||||||
|
|
||||||
@param startEntry: a regular pattern matching the beginning of
|
|
||||||
an entry
|
|
||||||
@type startEntry: C{str} or None
|
|
||||||
@param endEntry: a regular pattern matching the end of
|
|
||||||
an entry
|
|
||||||
@type endEntry: C{str} or None
|
|
||||||
@param head: indicate if an header is present before
|
|
||||||
the first entry (as in many original genbank
|
|
||||||
files)
|
|
||||||
@type head: C{bool}
|
|
||||||
@param tail: indicate if some extra informations are present
|
|
||||||
after the last entry.
|
|
||||||
@type tail: C{bool}
|
|
||||||
|
|
||||||
@return: an iterator on entries in text format
|
|
||||||
@rtype: an iterator on C{str}
|
|
||||||
'''
|
|
||||||
|
|
||||||
def isBeginning(line):
|
|
||||||
return startEntry is None or startEntry.match(line) is not None
|
|
||||||
|
|
||||||
def isEnding(line):
|
|
||||||
return ((endEntry is not None and endEntry.match(line) is not None) or
|
|
||||||
(endEntry is None and startEntry is not None and startEntry.match(line) is not None))
|
|
||||||
|
|
||||||
def transparentIteratorEntry(file):
|
|
||||||
file = universalOpen(file)
|
|
||||||
return file
|
|
||||||
|
|
||||||
def genericEntryIterator(file):
|
|
||||||
file = universalOpen(file)
|
|
||||||
entry = []
|
|
||||||
line = file.next()
|
|
||||||
started = head or isBeginning(line)
|
|
||||||
|
|
||||||
try:
|
|
||||||
while 1:
|
|
||||||
while not started:
|
|
||||||
line = file.next()
|
|
||||||
started = isBeginning(line)
|
|
||||||
|
|
||||||
if endEntry is None:
|
|
||||||
entry.append(line)
|
|
||||||
line = file.next()
|
|
||||||
|
|
||||||
while started:
|
|
||||||
end = isEnding(line)
|
|
||||||
if end:
|
|
||||||
if endEntry is not None:
|
|
||||||
entry.append(line)
|
|
||||||
if join:
|
|
||||||
e = ''.join(entry)
|
|
||||||
if strip:
|
|
||||||
e=e.strip()
|
|
||||||
else:
|
|
||||||
e=entry
|
|
||||||
if strip:
|
|
||||||
e=[x.strip() for x in e]
|
|
||||||
entry=[]
|
|
||||||
yield e
|
|
||||||
started=False
|
|
||||||
if endEntry is not None:
|
|
||||||
line = file.next()
|
|
||||||
else:
|
|
||||||
entry.append(line)
|
|
||||||
line = file.next()
|
|
||||||
|
|
||||||
started = isBeginning(line)
|
|
||||||
|
|
||||||
except StopIteration:
|
|
||||||
if entry and (endEntry is None or tail):
|
|
||||||
if join:
|
|
||||||
e = ''.join(entry)
|
|
||||||
if strip:
|
|
||||||
e=e.strip()
|
|
||||||
else:
|
|
||||||
e=entry
|
|
||||||
if strip:
|
|
||||||
e=[x.strip() for x in e]
|
|
||||||
yield e
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if startEntry is not None:
|
|
||||||
startEntry = re.compile(startEntry)
|
|
||||||
if endEntry is not None:
|
|
||||||
endEntry = re.compile(endEntry)
|
|
||||||
|
|
||||||
if startEntry is None and endEntry is None:
|
|
||||||
return transparentIteratorEntry
|
|
||||||
|
|
||||||
return genericEntryIterator
|
|
||||||
|
|
||||||
|
|
||||||
class GenericParser(object):
|
|
||||||
|
|
||||||
def __init__(self,
|
|
||||||
startEntry=None,
|
|
||||||
endEntry=None,
|
|
||||||
head=False,
|
|
||||||
tail=False,
|
|
||||||
strip=False,
|
|
||||||
**parseAction):
|
|
||||||
"""
|
|
||||||
@param startEntry: a regular pattern matching the beginning of
|
|
||||||
an entry
|
|
||||||
@type startEntry: C{str} or None
|
|
||||||
@param endEntry: a regular pattern matching the end of
|
|
||||||
an entry
|
|
||||||
@type endEntry: C{str} or None
|
|
||||||
@param head: indicate if an header is present before
|
|
||||||
the first entry (as in many original genbank
|
|
||||||
files)
|
|
||||||
@type head: C{bool}
|
|
||||||
@param tail: indicate if some extra informations are present
|
|
||||||
after the last entry.
|
|
||||||
@type tail: C{bool}
|
|
||||||
|
|
||||||
@param parseAction:
|
|
||||||
|
|
||||||
"""
|
|
||||||
self.flatiterator= genericEntryIteratorGenerator(startEntry,
|
|
||||||
endEntry,
|
|
||||||
head,
|
|
||||||
tail,
|
|
||||||
strip)
|
|
||||||
|
|
||||||
self.action={}
|
|
||||||
|
|
||||||
for k in parseAction:
|
|
||||||
self.addParseAction(k,*parseAction[k])
|
|
||||||
|
|
||||||
def addParseAction(self,name,dataMatcher,dataCleaner=None,cleanSub=''):
|
|
||||||
'''
|
|
||||||
Add a parse action to the generic parser. A parse action
|
|
||||||
allows to extract one information from an entry. A parse
|
|
||||||
action is defined by a name and a method to extract this
|
|
||||||
information from the full text entry.
|
|
||||||
|
|
||||||
A parse action can be defined following two ways.
|
|
||||||
|
|
||||||
- via regular expression patterns
|
|
||||||
|
|
||||||
- via dedicated function.
|
|
||||||
|
|
||||||
In the first case, you have to indicate at least the
|
|
||||||
dataMatcher regular pattern. This pattern should match exactly
|
|
||||||
the data part you want to retrieve. If cleanning of extra
|
|
||||||
characters is needed. The second pattern dataCLeanner can be
|
|
||||||
used to specifyed these characters.
|
|
||||||
|
|
||||||
In the second case you must provide a callable object (function)
|
|
||||||
that extract and clean data from the text entry. This function
|
|
||||||
should return an array containing all data retrevied even if
|
|
||||||
no data or only one data is retrevied.
|
|
||||||
|
|
||||||
@summary: Add a parse action to the generic parser.
|
|
||||||
|
|
||||||
@param name: name of the data extracted
|
|
||||||
@type name: C{str}
|
|
||||||
@param dataMatcher: a regular pattern matching the data
|
|
||||||
or a callable object parsing the
|
|
||||||
entry and returning a list of marched data
|
|
||||||
@type dataMatcher: C{str} or C{SRE_Pattern} instance or a callable
|
|
||||||
object
|
|
||||||
@param dataCleaner: a regular pattern matching part of the data
|
|
||||||
to suppress.
|
|
||||||
@type dataCleaner: C{str} or C{SRE_Pattern} instance or C{None}
|
|
||||||
@param cleanSub: string used to replace dataCleaner matches.
|
|
||||||
Default is an empty string
|
|
||||||
@type cleanSub: C{str}
|
|
||||||
|
|
||||||
'''
|
|
||||||
if callable(dataMatcher):
|
|
||||||
self.action[name]=dataMatcher
|
|
||||||
else :
|
|
||||||
if isinstance(dataMatcher, str):
|
|
||||||
dataMatcher=re.compile(dataMatcher)
|
|
||||||
if isinstance(dataCleaner, str):
|
|
||||||
dataCleaner=re.compile(dataCleaner)
|
|
||||||
self.action[name]=self._buildREParser(dataMatcher,
|
|
||||||
dataCleaner,
|
|
||||||
cleanSub)
|
|
||||||
|
|
||||||
def _buildREParser(self,dataMatcher,dataCleaner,cleanSub):
|
|
||||||
def parser(data):
|
|
||||||
x = dataMatcher.findall(data)
|
|
||||||
if dataCleaner is not None:
|
|
||||||
x = [dataCleaner.sub(cleanSub,y) for y in x]
|
|
||||||
return x
|
|
||||||
return parser
|
|
||||||
|
|
||||||
def __call__(self,file):
|
|
||||||
for e in self.flatiterator(file):
|
|
||||||
pe = {'fullentry':e}
|
|
||||||
for k in self.action:
|
|
||||||
pe[k]=self.action[k](e)
|
|
||||||
yield pe
|
|
||||||
|
|
||||||
|
|
||||||
|
|
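A sketch of the two-stage parsing this module supports, on an invented EMBL-like flat file; the pattern and file name are illustrative, and the inline (?m) flag makes '^' match at every line inside an entry:

parser = GenericParser(startEntry='^ID',
                       AC=('(?m)^AC +(\S+);',))
for entry in parser('flatfile.dat'):
    print entry['AC'], len(entry['fullentry'])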
@@ -1,274 +0,0 @@
__docformat__ = 'restructuredtext'
|
|
||||||
|
|
||||||
import re
|
|
||||||
import string
|
|
||||||
import textwrap
|
|
||||||
|
|
||||||
|
|
||||||
from obitools.obo.go.parser import GOEntryIterator
|
|
||||||
from obitools.obo.go.parser import GOTerm
|
|
||||||
from obitools.obo.go.parser import GOEntry
|
|
||||||
|
|
||||||
"""
|
|
||||||
go_obo.py : gene_ontology_edit.obo file parser:
|
|
||||||
----------------------------------------------------
|
|
||||||
|
|
||||||
- OBOFile class: open a flat file and return an entry.
|
|
||||||
|
|
||||||
"""
|
|
||||||
class OBOFile(object):
|
|
||||||
"""
|
|
||||||
Iterator over all entries of an OBO file
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self,_path):
|
|
||||||
self.file = GOEntryIterator(_path)
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return self
|
|
||||||
|
|
||||||
def next(self):
|
|
||||||
fiche = self.file.next()
|
|
||||||
|
|
||||||
if isinstance(fiche, GOTerm):
|
|
||||||
self.isaterm=True
|
|
||||||
return Term(fiche)
|
|
||||||
elif isinstance(fiche, GOEntry):
|
|
||||||
self.isaterm=False
|
|
||||||
return Entry(fiche)
|
|
||||||
else:
|
|
||||||
self.isaterm=False
|
|
||||||
return Header(fiche)
|
|
||||||
|
|
||||||
|
|
||||||
############# tout le reste doit descendre a l'etage obitools/ogo/go/parser.py ##########
|
|
||||||
|
|
||||||
# define an XRef into a go_obo.py script in the microbi pylib
|
|
||||||
class Xref(object):
|
|
||||||
"""
|
|
||||||
Class Xref
|
|
||||||
Xref.db Xref database
|
|
||||||
Xref.id Xref identifier
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self,description):
|
|
||||||
data = description.split(':')
|
|
||||||
self.db = data[0].strip()
|
|
||||||
self.id = data[1].strip()
|
|
||||||
|
|
||||||
# define a RelatedTerm into a go_obo.py script in the microbi pylib
|
|
||||||
class RelatedTerm(object):
|
|
||||||
"""
|
|
||||||
Class RelatedTerm
|
|
||||||
RelatedTerm.relation RelatedTerm relation
|
|
||||||
RelatedTerm.related_term RelatedTerm GO identifier
|
|
||||||
RelatedTerm.comment all terms have 0 or 1 comment
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self,relation,value,comment):
|
|
||||||
self.relation = relation
|
|
||||||
self.related_term = value.strip('GO:')
|
|
||||||
self.comment = comment
|
|
||||||
|
|
||||||
|
|
||||||
# define into a go_obo.py script in the microbi pylib
|
|
||||||
#class Term(object):
|
|
||||||
# """
|
|
||||||
# class representing an OBO term (entry).
|
|
||||||
# """
|
|
||||||
#
|
|
||||||
# def __init__(self):
|
|
||||||
# raise RuntimeError('biodb.go_obo is an abstract class')
|
|
||||||
#
|
|
||||||
# def __checkEntry__(self):
|
|
||||||
# minimum=(hasattr(self,'goid') )
|
|
||||||
# if not minimum:
|
|
||||||
# raise AssertionError('Misconstructed GO Term instance %s' % [x for x in dir(self) if x[0]!='_'])
|
|
||||||
|
|
||||||
class Term(object):
    """
    Class Term
    representing a GO term.
    """

    def __init__(self,data=None):
        """
        """
        self.data=data
        self.isaterm = True

        if data:
            self.__filtreGoid__()
            self.__filtreName__()
            self.__filtreComment__()
            self.__filtreSynonyms__()
            self.__filtreDef__()
            self.__filtreParents__()
            self.__filtreRelationships__()
            self.__filtreRelation__()
            self.__filtreObsolete__()
            self.__filtreAltIds__()
            self.__filtreXRefs__()
            self.__filtreSubsets__()

            # check if all required attributes were valued
            self.__checkEntry__()

    def __checkEntry__(self):
        minimum=(hasattr(self,'goid'))
        if not minimum:
            raise AssertionError('Misconstructed GO Term instance %s' % [x for x in dir(self) if x[0]!='_'])

    def __filtreGoid__(self):
        """
        Extract GO id.
        """
        self.goid = self.data.id.value.strip('GO:')

    def __filtreName__(self):
        """
        Extract GO name.
        """
        self.name = self.data.name.value

    def __filtreSynonyms__(self):
        """
        Extract GO synonym(s).
        """
        self.list_synonyms = {}
        if self.data.synonyms:
            for y in self.data.synonyms:
                self.list_synonyms[y.value] = y.scope

    def __filtreComment__(self):
        """
        manage None comments
        """
        if self.data.comment != None:
            self.comment = self.data.comment.value
        else:
            self.comment = ""

    def __filtreDef__(self):
        """
        Extract GO definition.
        """
        if self.data.definition != None:
            self.definition = self.data.definition.value
        else:
            self.definition = ""

    def __filtreParents__(self):
        """
        To make the is_a hierarchy
        """
        if self.data.is_a != None:
            self.is_a = set([isa.value.strip('GO:') for isa in self.data.is_a])
        else:
            self.is_a = set()

    def __filtreRelation__(self):
        """
        To make the part_of hierarchy
        """
        self.part_of = set()
        self.regulates = set()
        self.negatively_regulates = set()
        self.positively_regulates = set()

        if self.data.relationship != None:
            for rel in self.data.relationship:
                if rel.relationship == "part_of":
                    self.part_of.add(rel.value.strip('GO:'))
                elif rel.relationship == "regulates":
                    self.regulates.add(rel.value.strip('GO:'))
                elif rel.relationship == "negatively_regulates":
                    self.negatively_regulates.add(rel.value.strip('GO:'))
                elif rel.relationship == "positively_regulates":
                    self.positively_regulates.add(rel.value.strip('GO:'))

    def __filtreRelationships__(self):
        """
        Relation list with other GO Terms (is_a, part_of or some regulates relation)
        """
        self.related_term =[]
        if self.data.relationship != None:
            for x in self.data.relationship:
                self.related_term.append(RelatedTerm(x.relationship,x.value,x.__doc__))
                #self.related_term.append(RelatedTerm(x.relationship,x.value,x.comment))
        if self.data.is_a != None:
            for x in self.data.is_a:
                self.related_term.append(RelatedTerm('is_a',x.value,x.__doc__))
                #self.related_term.append(RelatedTerm('is_a',x.value,x.comment))

    def __filtreObsolete__(self):
        """
        to each obsolete term corresponds a set of GO identifiers,
        so that this GO term can be considered as other GO terms
        """
        self.considers = set()
        self.replaces = set()
        self.is_obsolete = self.data.is_obsolete
        if self.data.is_obsolete:
            if self.data.consider:
                self.considers = set([considered.value.strip('GO:') for considered in self.data.consider])
            if self.data.replaced_by:
                self.replaces = set([replaced.value.strip('GO:') for replaced in self.data.replaced_by])

    def __filtreAltIds__(self):
        """
        alternate id(s) for this term (= alias in the geneontology schema model!)
        """
        if self.data.alt_ids:
            self.alt_ids = set([x.value.strip('GO:') for x in self.data.alt_ids])
        else:
            self.alt_ids = set()

    def __filtreXRefs__(self):
        """
        cross references to other databases
        """
        self.xrefs = set()
        if self.data.xrefs:
            self.xrefs = set([Xref(x.value.reference) for x in self.data.xrefs])

    def __filtreSubsets__(self):
        """
        subset label to make smaller sets of GO Terms
        """
        self.subsets = set()
        if self.data.subsets:
            self.subsets = set([x.value for x in self.data.subsets])

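
# Usage sketch (illustrative, not from the original file): Term expects a parsed
# OBO stanza object exposing attributes such as id, name, comment, is_a,
# relationship, is_obsolete... The stub below is hypothetical and only shows the
# attribute protocol the __filtre*__ methods rely on.
#
#     class _Value(object):
#         def __init__(self, value):
#             self.value = value
#
#     class _Stanza(object):
#         def __init__(self):
#             self.id = _Value('GO:0008150')
#             self.name = _Value('biological_process')
#             self.comment = None
#             self.synonyms = None
#             self.definition = None
#             self.is_a = None
#             self.relationship = None
#             self.is_obsolete = False
#             self.consider = None
#             self.replaced_by = None
#             self.alt_ids = None
#             self.xrefs = None
#             self.subsets = None
#
#     term = Term(_Stanza())
#     term.goid    # -> '0008150'
#     term.name    # -> 'biological_process'
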
class Entry(object):
    """
    a Stanza entry, like [Typedef] for example
    """
    def __init__(self,data=None):
        self.data=data
        self.isaterm=False
        self.isanentry=True


class Header(object):
    """
    class representing a GO header.
    """

    def __init__(self,data=None):
        """
        """
        self.data=data
        self.isaterm = False

@ -1,284 +0,0 @@
'''
Created on 13 Oct. 2009

@author: coissac
'''

from obitools.format.sequence.embl import emblIterator
from obitools.format.sequence.genbank import genbankIterator
from obitools.format.sequence.fnaqual import fnaFastaIterator
from obitools.format.sequence.fasta import fastaAAIterator,fastaNucIterator,fastaIterator
from obitools.format.sequence.fastq import fastqIlluminaIterator,fastqSolexaIterator
from obitools.fastq import fastqSangerIterator
from obitools.fnaqual.quality import qualityIterator
from obitools.fasta import formatFasta, rawFastaIterator,\
                           formatSAPFastaGenerator
from obitools.fastq import formatFastq

from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
from obitools.ecopcr.options import loadTaxonomyDatabase

#from obitools.format._format import printOutput

from array import array
from itertools import chain
import sys

import re
from obitools.ecopcr import EcoPCRFile

def addInputFormatOption(optionManager):
#    optionManager.add_option('--rank',
#                             action="store_true", dest='addrank',
#                             default=False,
#                             help="add a rank attribute to the sequence "
#                                  "indicating the sequence position in the input data")
    optionManager.add_option('--genbank',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='genbank',
                             help="input file is in genbank format")
    optionManager.add_option('--embl',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='embl',
                             help="input file is in embl format")

    optionManager.add_option('--fasta',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='fasta',
                             help="input file is in fasta nucleic format (including obitools fasta extensions)")

    optionManager.add_option('--ecopcr',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='ecopcr',
                             help="input file is in ecoPCR output format")

    optionManager.add_option('--raw-fasta',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='rawfasta',
                             help="input file is in fasta format (but more tolerant to format variant)")

    optionManager.add_option('--fna',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='fna',
                             help="input file is in fasta nucleic format produced by 454 sequencer pipeline")

    optionManager.add_option('--qual',
                             action="store", dest="withqualfile",
                             type='str',
                             default=None,
                             help="Specify the name of a quality file produced by 454 sequencer pipeline")

    optionManager.add_option('--sanger',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='sanger',
                             help="input file is in sanger fastq nucleic format (standard fastq)")

    optionManager.add_option('--solexa',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='solexa',
                             help="input file is in fastq nucleic format produced by solexa sequencer")

    optionManager.add_option('--illumina',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='illumina',
                             help="input file is in fastq nucleic format produced by old solexa sequencer")

    optionManager.add_option('--nuc',
                             action="store_const", dest="moltype",
                             default=None,
                             const='nuc',
                             help="input file is nucleic sequences")
    optionManager.add_option('--prot',
                             action="store_const", dest="moltype",
                             default=None,
                             const='pep',
                             help="input file is protein sequences")


def addOutputFormatOption(optionManager):
    optionManager.add_option('--fastq-output',
                             action="store_const", dest="output",
                             default=None,
                             const=formatFastq,
                             help="output sequences in sanger fastq format")
    optionManager.add_option('--fasta-output',
                             action="store_const", dest="output",
                             default=None,
                             const=formatFasta,
                             help="output sequences in obitools fasta format")
    optionManager.add_option('--sap-output',
                             action="store_const", dest="output",
                             default=None,
                             const=formatSAPFastaGenerator,
                             help="output sequences in sap fasta format")
    optionManager.add_option('--strict-sap',
                             action='store_true',dest='strictsap',
                             default=False,
                             help="Print sequences in upper case (default is lower case)")
    optionManager.add_option('--ecopcr-output',
                             action="store", dest="ecopcroutput",
                             default=None,
                             help="output sequences in obitools ecopcr format")
    optionManager.add_option('--uppercase',
                             action='store_true',dest='uppercase',
                             default=False,
                             help="Print sequences in upper case (default is lower case)")


def addInOutputOption(optionManager):
    addInputFormatOption(optionManager)
    addOutputFormatOption(optionManager)

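
# Usage sketch (illustrative, not from the original file): any object exposing an
# optparse-style add_option() method can serve as the optionManager; an
# optparse.OptionParser instance is a natural fit for the signature used above.
#
#     from optparse import OptionParser
#
#     parser = OptionParser()
#     addInOutputOption(parser)
#     options, args = parser.parse_args(['--fasta', '--nuc'])
#     options.seqinformat   # -> 'fasta'
#     options.moltype       # -> 'nuc'
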
def autoEntriesIterator(options):
    options.outputFormater=formatFasta
    options.outputFormat="fasta"

    ecopcr_pattern = re.compile('^[^ ]+ +| +[0-9]+ +| + [0-9]+ + | +')

    def annotatedIterator(formatIterator):
        options.outputFormater=formatFasta
        options.outputFormat="fasta"
        def iterator(lineiterator):
            for s in formatIterator(lineiterator):
                s.extractTaxon()
                yield s

        return iterator

    def withQualIterator(qualityfile):
        options.outputFormater=formatFastq
        options.outputFormat="fastq"
        def iterator(lineiterator):
            for s in fnaFastaIterator(lineiterator):
                q = qualityfile.next()
                quality = array('d',(10.**(-x/10.) for x in q))
                s.quality=quality
                yield s

        return iterator

    def autoSequenceIterator(lineiterator):
        options.outputFormater=formatFasta
        options.outputFormat="fasta"
        first = lineiterator.next()
        if first[0]==">":
            if options.withqualfile is not None:
                qualfile=qualityIterator(options.withqualfile)
                reader=withQualIterator(qualfile)
                options.outputFormater=formatFastq
                options.outputFormat="fastq"
            elif options.moltype=='nuc':
                reader=fastaNucIterator
            elif options.moltype=='pep':
                reader=fastaAAIterator
            else:
                reader=fastaIterator
        elif first[0]=='@':
            reader=fastqSangerIterator
            options.outputFormater=formatFastq
            options.outputFormat="fastq"
        elif first[0:3]=='ID ':
            reader=emblIterator
        elif first[0:6]=='LOCUS ':
            reader=genbankIterator
        elif first[0]=="#" or ecopcr_pattern.search(first):
            reader=EcoPCRFile
        else:
            raise AssertionError,'file is not in fasta, fastq, embl, genbank or ecoPCR format'

        input = reader(chain([first],lineiterator))

        return input

    if options.seqinformat is None:
        reader = autoSequenceIterator
    else:
        if options.seqinformat=='fasta':
            if options.moltype=='nuc':
                reader=fastaNucIterator
            elif options.moltype=='pep':
                reader=fastaAAIterator
            else:
                reader=fastaIterator
        elif options.seqinformat=='rawfasta':
            reader=annotatedIterator(rawFastaIterator)
        elif options.seqinformat=='genbank':
            reader=annotatedIterator(genbankIterator)
        elif options.seqinformat=='embl':
            reader=annotatedIterator(emblIterator)
        elif options.seqinformat=='fna':
            reader=fnaFastaIterator
        elif options.seqinformat=='sanger':
            options.outputFormater=formatFastq
            options.outputFormat="fastq"
            reader=fastqSangerIterator
        elif options.seqinformat=='solexa':
            options.outputFormater=formatFastq
            options.outputFormat="fastq"
            reader=fastqSolexaIterator
        elif options.seqinformat=='illumina':
            options.outputFormater=formatFastq
            options.outputFormat="fastq"
            reader=fastqIlluminaIterator
        elif options.seqinformat=='ecopcr':
            reader=EcoPCRFile

        if options.seqinformat=='fna' and options.withqualfile is not None:
            qualfile=qualityIterator(options.withqualfile)
            reader=withQualIterator(qualfile)
            options.outputFormater=formatFastq
            options.outputFormat="fastq"

#    if options.addrank:
#        reader = withRankIterator(reader)
    return reader

def sequenceWriterGenerator(options,output=sys.stdout):
    class SequenceWriter:
        def __init__(self,options,file=sys.stdout):
            self._format=None
            self._file=file
            self._upper=options.uppercase
        def put(self,seq):
            if self._format is None:
                self._format=formatFasta
                if options.output is not None:
                    self._format=options.output
                    if self._format is formatSAPFastaGenerator:
                        self._format=formatSAPFastaGenerator(options)
                elif options.outputFormater is not None:
                    self._format=options.outputFormater
            s = self._format(seq,upper=self._upper)
            try:
                self._file.write(s)
                self._file.write("\n")
            except IOError:
                sys.exit(0)

    if options.ecopcroutput is not None:
        taxo = loadTaxonomyDatabase(options)
        writer=EcoPCRDBSequenceWriter(options.ecopcroutput,taxonomy=taxo)
    else:
        writer=SequenceWriter(options,output)

    def sequenceWriter(sequence):
        writer.put(sequence)

    return sequenceWriter

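
# Usage sketch (illustrative, not from the original file; the input file name
# comes from the command line): a typical read/write loop built from the
# helpers above.
#
#     from optparse import OptionParser
#
#     parser = OptionParser()
#     addInOutputOption(parser)
#     options, args = parser.parse_args()
#
#     entries = autoEntriesIterator(options)
#     writer = sequenceWriterGenerator(options)
#     for seq in entries(open(args[0])):
#         writer(seq)
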
@ -1,24 +0,0 @@
from obitools.fasta import fastaIterator
from obitools.fastq import fastqSangerIterator
from obitools.seqdb.embl.parser import emblIterator
from obitools.seqdb.genbank.parser import genbankIterator
from itertools import chain
from obitools.utils import universalOpen

def autoSequenceIterator(file):
    lineiterator = universalOpen(file)
    first = lineiterator.next()
    if first[0]==">":
        reader=fastaIterator
    elif first[0]=='@':
        reader=fastqSangerIterator
    elif first[0:3]=='ID ':
        reader=emblIterator
    elif first[0:6]=='LOCUS ':
        reader=genbankIterator
    else:
        raise AssertionError,'file is not in fasta, fastq, embl, or genbank format'

    input = reader(chain([first],lineiterator))

    return input
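
# Usage sketch (illustrative, not from the original file; the file name is
# hypothetical): the reader is picked from the first character or keyword of
# the file ('>' fasta, '@' fastq, 'ID ' embl, 'LOCUS ' genbank).
#
#     for seq in autoSequenceIterator('reads.fastq'):
#         print seq.id      # seq is a BioSequence produced by the chosen parser
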
@ -1,2 +0,0 @@
from obitools.seqdb.embl.parser import emblIterator,emblParser

@ -1,4 +0,0 @@
from obitools.fasta import fastaIterator,fastaParser
from obitools.fasta import fastaAAIterator,fastaAAParser
from obitools.fasta import fastaNucIterator,fastaNucParser
from obitools.fasta import formatFasta
@ -1,13 +0,0 @@
'''
Created on 15 Jan. 2010

@author: coissac
'''

from obitools.fastq import fastqIterator,fastqParserGenetator
from obitools.fastq import fastqSangerIterator,fastqSolexaIterator, \
                           fastqIlluminaIterator
from obitools.fastq import fastqAAIterator
from obitools.fastq import formatFastq

@ -1,8 +0,0 @@
'''
Created on 12 Oct. 2009

@author: coissac
'''

from obitools.fnaqual.fasta import fnaFastaIterator
from obitools.fnaqual.quality import qualityIterator
@ -1,4 +0,0 @@
from obitools.seqdb.genbank.parser import genpepIterator,genpepParser
from obitools.seqdb.genbank.parser import genbankIterator,genbankParser

@ -1,5 +0,0 @@
from obitools.tagmatcher.parser import tagMatcherParser
from obitools.tagmatcher.parser import TagMatcherIterator
from obitools.tagmatcher.parser import formatTagMatcher

tagMatcherIterator=TagMatcherIterator
@ -1,33 +0,0 @@
from itertools import imap
from obitools import utils

class GoAFileIterator(utils.ColumnFile):
    def __init__(self,stream):
        utils.ColumnFile.__init__(self,
                                  stream, '\t', True,
                                  (str,))

    _colname = ['database',
                'ac',
                'symbol',
                'qualifier',
                'goid',
                'origin',
                'evidence',
                'evidnce_origine',
                'namespace',
                'db_object_name',
                'gene',
                'object_type',
                'taxid',
                'date',
                'assigned_by']

    def next(self):
        data = utils.ColumnFile.next(self)
        # pair each column name with its value (missing trailing columns map to None)
        data = dict(imap(None,GoAFileIterator._colname,data))

        return data
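
# Usage sketch (illustrative, not from the original file; the file name is
# hypothetical): each iteration yields one GOA annotation line as a
# column-name -> value dict, assuming ColumnFile's iterator protocol.
#
#     annotations = GoAFileIterator(open('gene_association.goa'))
#     for row in annotations:
#         print row['ac'], row['goid']
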
@ -1,962 +0,0 @@
'''
**obitools.graph** for representing graph structure in obitools
---------------------------------------------------------------

.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>


This module offers classes to manipulate graphs, mainly through the
:py:class:`obitools.graph.Graph` class.

.. inheritance-diagram:: Graph DiGraph UndirectedGraph
    :parts: 2

'''

import sys

from obitools.utils import progressBar

class Indexer(dict):
    '''
    Allows to manage the conversion between arbitrary hashable python
    values and unique integer keys
    '''

    def __init__(self):

        self.__max=0
        self.__reverse=[]

    def getLabel(self,index):
        '''
        Return the python value associated to an integer index.

        :param index: an index value
        :type index: int

        :raises: IndexError if the index is not used in this
                 Indexer instance
        '''
        return self.__reverse[index]

    def getIndex(self,key,strict=False):
        '''
        Return the index associated to a **key** in the indexer. Two
        modes are available :

            - strict mode :

                if the key is not known by the :py:class:`Indexer` instance
                a :py:exc:`KeyError` exception is raised.

            - non strict mode :

                in this mode if the requested **key** is absent, it is added to
                the :py:class:`Indexer` instance and the new index is returned

        :param key: the requested key
        :type key: a hashable python value

        :param strict: selects the lookup mode
        :type strict: bool

        :return: the index corresponding to the key
        :rtype: int

        :raises: - :py:exc:`KeyError` in strict mode if the key is absent
                   from the :py:class:`Indexer` instance

                 - :py:exc:`TypeError` if key is not a hashable value.
        '''
        if dict.__contains__(self,key):
            return dict.__getitem__(self,key)
        elif strict:
            raise KeyError,key
        else:
            value = self.__max
            self[key]= value
            self.__reverse.append(key)
            self.__max+=1
            return value

    def __getitem__(self,key):
        '''
        Implement the [] operator to emulate the standard dictionary
        behaviour on :py:class:`Indexer` and return the integer key
        associated to a python value.

        Actually this method calls the :py:meth:`getIndex` method in
        non strict mode so it only raises a :py:exc:`TypeError`
        if key is not a hashable value.

        :param key: the value to index
        :type key: a hashable python value

        :return: a unique integer value associated to the key
        :rtype: int

        :raises: :py:exc:`TypeError` if **key** is not a hashable value.

        '''
        return self.getIndex(key)

    def __eq__(self,index):
        '''
        Implement the equality operator **==** for comparing two :py:class:`Indexer` instances.
        Two :py:class:`Indexer` instances are equal only if they are physically
        the same instance

        :param index: the second Indexer
        :type index: an :py:class:`Indexer` instance

        :return: True if the two :py:class:`Indexer` instances are the same
        :rtype: bool
        '''
        # identity comparison, as documented above
        return id(self)==id(index)

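
# Usage sketch (illustrative, not from the original file): the [] operator
# registers unseen labels on the fly; getIndex(strict=True) only looks up
# existing ones.
#
#     idx = Indexer()
#     i = idx['node-A']                     # first use: a new index is allocated
#     idx.getLabel(i)                       # -> 'node-A'
#     idx.getIndex('node-A', strict=True)   # -> same index, no insertion
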
class Graph(object):
    '''
    Class used to represent directed or undirected graph.

    .. warning::

        Only one edge can connect two nodes in a given direction.

    .. warning::

        Specifying nodes through their index speeds up your code, but as no check
        is done on index values, it may result in inconsistency. So prefer the
        use of node labels to specify a node.

    '''
    def __init__(self,label='G',directed=False,indexer=None,nodes=None,edges=None):
        '''
        :param label: Graph name, set to 'G' by default
        :type label: str

        :param directed: True for directed graph, set to False by default
        :type directed: boolean

        :param indexer: node label indexer. This allows to define several graphs
                        sharing the same indexer (see : :py:meth:`newEmpty`)
        :type indexer: :py:class:`Indexer`

        :param nodes: set of nodes to add to the graph
        :type nodes: iterable value

        :param edges: set of edges to add to the graph
        :type edges: iterable value
        '''

        self._directed=directed
        if indexer is None:
            indexer = Indexer()
        self._index = indexer
        self._node = {}
        self._node_attrs = {}
        self._edge_attrs = {}
        self._label=label

    def newEmpty(self):
        """
        Build a new empty graph using the same :py:class:`Indexer` instance.
        This allows two graphs to share their vertices through their indices.
        """
        n = Graph(self._label+"_compact",self._directed,self._index)

        return n

    def addNode(self,node=None,index=None,**data):
        '''
        Add a new node or update an existing one.

        :param node: the new node label or the label of an existing node
                     for updating it.
        :type node: a hashable python value

        :param index: the index of an existing node for updating it.
        :type index: int

        :return: the index of the node
        :rtype: int

        :raises: :py:exc:`IndexError` if index is not **None** and
                 corresponds to an unused index in this graph.
        '''
        if index is None:
            index = self._index[node]

            if index not in self._node:
                self._node[index]=set()
        else:
            if index not in self._node:
                raise IndexError,"This index is not used in this graph"

        if data:
            if index in self._node_attrs:
                self._node_attrs[index].update(data)
            else:
                self._node_attrs[index]=dict(data)

        return index

    def __contains__(self,node):
        try:
            index = self._index.getIndex(node,strict=True)
            r = index in self._node
        except KeyError:
            r=False
        return r

    def getNode(self,node=None,index=None):
        """
        :param node: a node label.
        :type node: a hashable python value

        :param index: the index of an existing node.
        :type index: int

        .. note:: Index values take precedence over node labels.

        :return: the looked for node
        :rtype: :py:class:`Node`

        :raises: :py:exc:`IndexError` if the specified node label
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        """
        if index is None:
            index = self._index.getIndex(node, True)
        return Node(index,self)

    def getBestNode(self,estimator):
        '''
        Select the node maximizing the estimator function

        :param estimator: the function to maximize
        :type estimator: a function returning a numerical value and accepting one
                         argument of type :py:class:`Node`

        :return: the best node
        :rtype: :py:class:`Node`
        '''

        bestScore=0
        best=None
        for n in self:
            score = estimator(n)
            if best is None or score > bestScore:
                bestScore = score
                best=n
        return best

    def delNode(self,node=None,index=None):
        """
        Delete a node from a graph and all associated edges.

        :param node: a node label.
        :type node: a hashable python value

        :param index: the index of an existing node.
        :type index: int

        .. note:: Index values take precedence over node labels.

        :raises: :py:exc:`IndexError` if the specified node label
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        """
        if index is None:
            index = self._index[node]

        for n in self._node:
            if n!=index:
                e = self._node[n]
                if index in e:
                    if (n,index) in self._edge_attrs:
                        del self._edge_attrs[(n,index)]
                    e.remove(index)

        e = self._node[index]

        for n in e:
            if (index,n) in self._edge_attrs:
                del self._edge_attrs[(index,n)]

        del self._node[index]
        if index in self._node_attrs:
            del self._node_attrs[index]

    def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data):
        '''
        Create a new edge in the graph between the two specified nodes.

        .. note:: Nodes can be specified using their label or their index in the graph.
                  If both values are indicated the index is used.

        :param node1: The first vertex label
        :type node1: a hashable python value
        :param node2: The second vertex label
        :type node2: a hashable python value
        :param index1: The first vertex index
        :type index1: int
        :param index2: The second vertex index
        :type index2: int

        :raises: :py:exc:`IndexError` if one of the two specified node labels
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        '''

        index1=self.addNode(node1, index1)
        index2=self.addNode(node2, index2)

        self._node[index1].add(index2)

        if not self._directed:
            self._node[index2].add(index1)

        if data:
            if (index1,index2) not in self._edge_attrs:
                data =dict(data)
                self._edge_attrs[(index1,index2)]=data
                if not self._directed:
                    self._edge_attrs[(index2,index1)]=data
            else:
                self._edge_attrs[(index1,index2)].update(data)

        return (index1,index2)

    def getEdge(self,node1=None,node2=None,index1=None,index2=None):
        '''
        Extract the :py:class:`Edge` instance linking two nodes of the graph.

        .. note:: Nodes can be specified using their label or their index in the graph.
                  If both values are indicated the index is used.

        :param node1: The first vertex label
        :type node1: a hashable python value
        :param node2: The second vertex label
        :type node2: a hashable python value
        :param index1: The first vertex index
        :type index1: int
        :param index2: The second vertex index
        :type index2: int

        :raises: :py:exc:`IndexError` if one of the two specified node labels
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        '''
        node1=self.getNode(node1, index1)
        node2=self.getNode(node2, index2)
        return Edge(node1,node2)

    def delEdge(self,node1=None,node2=None,index1=None,index2=None):
        """
        Delete the edge linking node 1 to node 2.

        .. note:: Nodes can be specified using their label or their index in the graph.
                  If both values are indicated the index is used.

        :param node1: The first vertex label
        :type node1: a hashable python value
        :param node2: The second vertex label
        :type node2: a hashable python value
        :param index1: The first vertex index
        :type index1: int
        :param index2: The second vertex index
        :type index2: int

        :raises: :py:exc:`IndexError` if one of the two specified node labels
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        """
        if index1 is None:
            index1 = self._index[node1]
        if index2 is None:
            index2 = self._index[node2]
        if index1 in self._node and index2 in self._node[index1]:
            self._node[index1].remove(index2)
            # edge attributes live in _edge_attrs
            if (index1,index2) in self._edge_attrs:
                del self._edge_attrs[(index1,index2)]
            if not self._directed:
                self._node[index2].remove(index1)
                if (index2,index1) in self._edge_attrs:
                    del self._edge_attrs[(index2,index1)]

    def edgeIterator(self,predicate=None):
        """
        Iterate through a set of selected edges.

        :param predicate: a function allowing edge selection. The default value
                          is **None** and indicates that all edges are selected.
        :type predicate: a function returning a boolean value
                         and accepting one argument of class :py:class:`Edge`

        :return: an iterator over the selected edges
        :rtype: iterator over :py:class:`Edge` instances

        .. seealso::
            function :py:func:`selectEdgeAttributeFactory` for simple predicates.

        """
        for n1 in self._node:
            for n2 in self._node[n1]:
                if self._directed or n1 <= n2:
                    e = self.getEdge(index1=n1, index2=n2)
                    if predicate is None or predicate(e):
                        yield e

    def nodeIterator(self,predicate=None):
        """
        Iterate through a set of selected vertices.

        :param predicate: a function allowing node selection. The default value
                          is **None** and indicates that all nodes are selected.
        :type predicate: a function returning a boolean value
                         and accepting one argument of class :py:class:`Node`

        :return: an iterator over the selected nodes.
        :rtype: iterator over :py:class:`Node` instances

        """
        for n in self._node:
            node = self.getNode(index=n)
            if predicate is None or predicate(node):
                yield node

    def nodeIndexIterator(self,predicate=None):
        """
        Iterate through the indexes of a set of selected vertices.

        :param predicate: a function allowing node selection. The default value
                          is **None** and indicates that all nodes are selected.
        :type predicate: a function returning a boolean value
                         and accepting one argument of class :py:class:`Node`

        :return: an iterator over the selected node indices.
        :rtype: iterator over `int`

        """
        for n in self._node:
            node = self.getNode(index=n)
            if predicate is None or predicate(node):
                yield n

    def neighbourIndexSet(self,node=None,index=None):
        if index is None:
            index=self.getNode(node).index
        return self._node[index]

    def edgeCount(self):
        n = reduce(lambda x,y:x+y, (len(z) for z in self._node.itervalues()),0)
        if not self._directed:
            n=n/2
        return n

    def subgraph(self,nodes,name='G'):
        sub = Graph(name,self._directed,self._index)
        if not isinstance(nodes, set):
            nodes = set(nodes)
        for n in nodes:
            sub._node[n]=nodes & self._node[n]
            if n in self._node_attrs:
                sub._node_attrs[n]=dict(self._node_attrs[n])
            for n2 in sub._node[n]:
                if not self._directed:
                    if n <= n2:
                        if (n,n2) in self._edge_attrs:
                            data=dict(self._edge_attrs[(n,n2)])
                            sub._edge_attrs[(n,n2)]=data
                            sub._edge_attrs[(n2,n)]=data
                else:
                    if (n,n2) in self._edge_attrs:
                        data=dict(self._edge_attrs[(n,n2)])
                        sub._edge_attrs[(n,n2)]=data
        return sub

    def __len__(self):
        return len(self._node)

    def __getitem__(self,key):
        return self.getNode(node=key)

    def __delitem__(self,key):
        self.delNode(node=key)

    def __iter__(self):
        return self.nodeIterator()

    def __str__(self):
        if self._directed:
            kw ='digraph'
        else:
            kw='graph'

        nodes = "\n ".join([str(x) for x in self])
        edges = "\n ".join([str(x) for x in self.edgeIterator()])

        return "%s %s {\n %s\n\n %s\n}" % (kw,self._label,nodes,edges)

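
# Usage sketch (illustrative, not from the original file): building a small
# undirected graph and walking its edges; labels are arbitrary hashable values.
#
#     g = Graph('demo')
#     g.addEdge('A', 'B', weight=1)
#     g.addEdge('B', 'C', weight=2)
#     for edge in g.edgeIterator():
#         print edge
#     len(g)           # -> 3 nodes
#     g.edgeCount()    # -> 2
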
class Node(object):
    """
    Class used for representing one node or vertex in a graph

    """
    def __init__(self,index,graph):
        '''
        .. warning::

            The :py:class:`Node` constructor is usually called through the :py:class:`Graph` methods

        :param index: Index of the node in the graph
        :type index: int
        :param graph: graph instance owning the node
        :type graph: :py:class:`obitools.graph.Graph`
        '''
        self.index = index
        self.__graph = graph

    def getGraph(self):
        '''
        return the graph owning this node.

        :rtype: :py:class:`obitools.graph.Graph`
        '''
        return self.__graph

    def getLabel(self):
        '''
        return the label associated to this node.
        '''
        return self.__graph._index.getLabel(self.index)

    def has_key(self,key):
        '''
        test if the node instance has a property named 'key'.

        :param key: the name of a property
        :type key: str

        :return: True if the node has a property named <key>
        :rtype: bool
        '''
        if self.index in self.__graph._node_attrs:
            return key in self.__graph._node_attrs[self.index]
        else:
            return False

    def neighbourIterator(self,nodePredicat=None,edgePredicat=None):
        '''
        iterate through the nodes directly connected to
        this node.

        :param nodePredicat: a function accepting one node as parameter
                             and returning **True** if this node must be
                             returned by the iterator.
        :type nodePredicat: function

        :param edgePredicat: a function accepting one edge as parameter
                             and returning True if the edge linking self and
                             the current node must be considered.
        :type edgePredicat: function

        :rtype: iterator on Node instances
        '''
        for n in self.neighbourIndexIterator(nodePredicat, edgePredicat):
            node = self.graph.getNode(index=n)
            yield node

    def neighbourIndexSet(self):
        '''
        Return the set of node indexes directly connected
        to this node.

        .. warning::

            do not change this set unless you know
            exactly what you are doing.

        @rtype: set of int
        '''
        return self.__graph._node[self.index]

    def neighbourIndexIterator(self,nodePredicat=None,edgePredicat=None):
        '''
        iterate through the node indexes directly connected to
        this node.

        :param nodePredicat: a function accepting one node as parameter
                             and returning True if this node must be
                             returned by the iterator.
        :type nodePredicat: function

        :param edgePredicat: a function accepting one edge as parameter
                             and returning True if the edge linking self and
                             the current node must be considered.
        :type edgePredicat: function

        :rtype: iterator on int
        '''
        for n in self.neighbourIndexSet():
            if nodePredicat is None or nodePredicat(self.__graph.getNode(index=n)):
                if edgePredicat is None or edgePredicat(self.__graph.getEdge(index1=self.index,index2=n)):
                    yield n

    def degree(self,nodeIndexes=None):
        '''
        return the count of edges linking this node to the
        set of nodes described by their indexes in nodeIndexes

        :param nodeIndexes: set of node indexes.
                            If set to None, all nodes of the
                            graph are taken into account.
                            Set to None by default.
        :type nodeIndexes: set of int

        :rtype: int
        '''
        if nodeIndexes is None:
            return len(self.__graph._node[self.index])
        else:
            return len(self.__graph._node[self.index] & nodeIndexes)

    def componentIndexSet(self,nodePredicat=None,edgePredicat=None):
        '''
        Return the set of node indexes in the same connected component.

        :param nodePredicat: a function accepting one node as parameter
                             and returning True if this node must be
                             returned by the iterator.
        :type nodePredicat: function

        :param edgePredicat: a function accepting one edge as parameter
                             and returning True if the edge linking self and
                             the current node must be considered.
        :type edgePredicat: function

        :rtype: set of int
        '''
        cc=set([self.index])
        added = set(x for x in self.neighbourIndexIterator(nodePredicat, edgePredicat))
        while added:
            cc |= added
            added = reduce(lambda x,y : x | y,
                           (set(z for z in self.graph.getNode(index=c).neighbourIndexIterator(nodePredicat, edgePredicat))
                            for c in added),
                           set())
            added -= cc
        return cc

    def componentIterator(self,nodePredicat=None,edgePredicat=None):
        '''
        Iterate through the nodes in the same connected
        component.

        :rtype: iterator on :py:class:`Node` instances
        '''
        for c in self.componentIndexSet(nodePredicat, edgePredicat):
            yield self.graph.getNode(index=c)

    def shortestPathIterator(self,nodes=None):
        '''
        Iterate through the shortest paths sourcing
        from this node. If nodes is not None, iterate
        only over the paths linking this node to one of the
        nodes listed in nodes

        :param nodes: set of node indexes
        :type nodes: iterable on int

        :return: an iterator on lists of int describing paths
        :rtype: iterator on list of int
        '''
        if nodes is not None:
            nodes = set(nodes)

        Q=[(self.index,-1)]

        gray = set([self.index])
        paths = {}

        while Q and (nodes is None or nodes):
            # FIFO queue: breadth-first traversal guarantees shortest paths
            u,p = Q.pop(0)
            paths[u]=p
            next = self.graph._node[u] - gray
            gray|=next
            Q.extend((x,u) for x in next)
            if nodes is None or u in nodes:
                if nodes:
                    nodes.remove(u)
                path = [u]
                while p >= 0:
                    path.append(p)
                    p = paths[p]
                path.reverse()
                yield path

    def shortestPathTo(self,node=None,index=None):
        '''
        return one of the shortest paths linking this
        node to the specified node.

        :param node: a node label or None
        :param index: a node index or None. The parameter index
                      has priority over the parameter node.
        :type index: int

        :return: the list of node indexes corresponding to the path or None
                 if no path exists.
        :rtype: list of int or None
        '''
        if index is None:
            index=self.graph.getNode(node).index
        for p in self.shortestPathIterator([index]):
            return p

    def __getitem__(self,key):
        '''
        return the value of the <key> property of this node

        :param key: the name of a property
        :type key: str
        '''
        return self.__graph._node_attrs.get(self.index,{})[key]

    def __setitem__(self,key,value):
        '''
        set the value of a node property. If the property doesn't
        already exist a new property is added to this node.

        :param key: the name of a property
        :type key: str
        :param value: the value of the property

        .. seealso::

            :py:meth:`Node.__getitem__`
        '''
        if self.index in self.__graph._node_attrs:
            data = self.__graph._node_attrs[self.index]
            data[key]=value
        else:
            self.graph._node_attrs[self.index]={key:value}

    def __len__(self):
        '''
        Count the neighbours of this node

        :rtype: int

        .. seealso::

            :py:meth:`Node.degree`
        '''
        return len(self.__graph._node[self.index])

    def __iter__(self):
        '''
        iterate through the neighbours of this node

        :rtype: iterator on :py:class:`Node` instances

        .. seealso::

            :py:meth:`Node.neighbourIterator`
        '''
        return self.neighbourIterator()

    def __contains__(self,key):
        return self.has_key(key)

    def __str__(self):

        if self.index in self.__graph._node_attrs:
            keys = " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"').replace('\n','\\n'))
                             for x in self.__graph._node_attrs[self.index].iteritems()]
                            )
        else:
            keys=''

        return '%d [label="%s" %s]' % (self.index,
                                       str(self.label).replace('"','\\"').replace('\n','\\n'),
                                       keys)

    def keys(self):
        if self.index in self.__graph._node_attrs:
            k = self.__graph._node_attrs[self.index].keys()
        else:
            k=[]
        return k

    label = property(getLabel, None, None, "Label of the node")

    graph = property(getGraph, None, None, "Graph owning this node")

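
# Usage sketch (illustrative, not from the original file): node-level queries on
# the graph 'g' built in the earlier sketch.
#
#     a = g.getNode('A')
#     a.degree()                                 # -> 1 (only 'B' is adjacent)
#     a.shortestPathTo('C')                      # -> node indices along A-B-C
#     [n.label for n in a.neighbourIterator()]   # -> ['B']
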
class Edge(object):
    """
    Class used for representing one edge of a graph

    """

    def __init__(self,node1,node2):
        '''
        .. warning::

            The :py:class:`Edge` constructor is usually called through the :py:class:`Graph` methods

        :param node1: First node linked by the edge
        :type node1: :py:class:`Node`
        :param node2: Second node linked by the edge
        :type node2: :py:class:`Node`
        '''
        self.node1 = node1
        self.node2 = node2

    def getGraph(self):
        """
        Return the :py:class:`Graph` instance owning this edge.
        """
        return self.node1.graph

    def has_key(self,key):
        '''
        test if the :py:class:`Edge` instance has a property named **key**.

        :param key: the name of a property
        :type key: str

        :return: True if the edge has a property named <key>
        :rtype: bool
        '''
        if (self.node1.index,self.node2.index) in self.graph._edge_attrs:
            return key in self.graph._edge_attrs[(self.node1.index,self.node2.index)]
        else:
            return False

    def getDirected(self):
        return self.node1.graph._directed

    def __getitem__(self,key):
        return self.graph._edge_attrs.get((self.node1.index,self.node2.index),{})[key]

    def __setitem__(self,key,value):
        e = (self.node1.index,self.node2.index)
        if e in self.graph._edge_attrs:
            data = self.graph._edge_attrs[e]
            data[key]=value
        else:
            self.graph._edge_attrs[e]={key:value}

    def __str__(self):
        e = (self.node1.index,self.node2.index)
        if e in self.graph._edge_attrs:
            keys = "[%s]" % " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"'))
                                      for x in self.graph._edge_attrs[e].iteritems()]
                                     )
        else:
            keys = ""

        if self.directed:
            link='->'
        else:
            link='--'

        return "%d %s %d %s" % (self.node1.index,link,self.node2.index,keys)

    def __contains__(self,key):
        return self.has_key(key)

    graph = property(getGraph, None, None, "Graph owning this edge")

    directed = property(getDirected, None, None, "True if the graph owning this edge is directed")

class DiGraph(Graph):
    """
    The :py:class:`DiGraph` class is a specialisation of the :py:class:`Graph` class
    dedicated to directed graph representation

    .. seealso::

        :py:class:`UndirectedGraph`

    """
    def __init__(self,label='G',indexer=None,nodes=None,edges=None):
        '''
        :param label: Graph name, set to 'G' by default
        :type label: str
        :param indexer: node label indexer
        :type indexer: Indexer instance
        :param nodes: set of nodes to add to the graph
        :type nodes: iterable value
        :param edges: set of edges to add to the graph
        :type edges: iterable value
        '''

        Graph.__init__(self, label, True, indexer, nodes, edges)

class UndirectedGraph(Graph):
    """
    The :py:class:`UndirectedGraph` class is a specialisation of the :py:class:`Graph` class
    dedicated to undirected graph representation

    .. seealso::

        :py:class:`DiGraph`

    """
    def __init__(self,label='G',indexer=None,nodes=None,edges=None):
        '''
        :param label: Graph name, set to 'G' by default
        :type label: str
        :param indexer: node label indexer
        :type indexer: Indexer instance
        :param nodes: set of nodes to add to the graph
        :type nodes: iterable value
        :param edges: set of edges to add to the graph
        :type edges: iterable value
        '''

        Graph.__init__(self, label, False, indexer, nodes, edges)

def selectEdgeAttributeFactory(attribut,value):
    """
    This function helps in building predicate functions usable for selecting edges
    in the following :py:class:`Graph` methods :

        - :py:meth:`Graph.edgeIterator`

    """
    def selectEdge(e):
        return attribut in e and e[attribut]==value
    return selectEdge
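
# Usage sketch (illustrative, not from the original file): filtering the edges
# of the graph 'g' from the earlier sketch on an attribute value.
#
#     heavy = selectEdgeAttributeFactory('weight', 2)
#     for edge in g.edgeIterator(predicate=heavy):
#         print edge     # only edges whose 'weight' attribute equals 2
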
Binary file not shown.
Binary file not shown.
@ -1,134 +0,0 @@
import time
import sys


_maxsize=0
_solution=0
_notbound=0
_sizebound=0
_lastyield=0
_maxclique=None

def cliqueIterator(graph,minsize=1,node=None,timeout=None):
    global _maxsize,_solution,_notbound,_sizebound,_lastyield
    _maxsize=0
    _solution=0
    _notbound=0
    _sizebound=0
    starttime = time.time()

    if node:
        node = graph.getNode(node)
        index = node.index
        clique= set([index])
        candidates= set(graph.neighbourIndexSet(index=index))
    else:
        clique=set()
        candidates = set(x.index for x in graph)

#    candidates = set(x for x in candidates
#                     if len(graph.neighbourIndexSet(index=x) & candidates) >= (minsize - 1))

    _lastyield=time.time()
    for c in _cliqueIterator(graph,clique,candidates,set(),minsize,start=starttime,timeout=timeout):
        yield c

def _cliqueIterator(graph,clique,candidates,notlist,minsize=0,start=None,timeout=None):
    global _maxsize,_maxclique,_solution,_notbound,_sizebound,_lastyield

    # Speed indicator
    lclique = len(clique)
    lcandidates = len(candidates)
    notmin = lcandidates
    notfix = None

    for n in notlist:
        nnc = candidates - graph.neighbourIndexSet(index=n)
        nc = len(nnc)
        if nc < notmin:
            notmin=nc
            notfix=n
            notfixneib = nnc

    if lclique > _maxsize or not _solution % 1000 :
        if start is not None:
            top = time.time()
            delta = top - start
            if delta==0:
                delta=1e-6
            speed = _solution / delta
            start = top
        else:
            speed = 0
        print >>sys.stderr,"\rCandidates : %-5d Maximum clique size : %-5d Solutions explored : %10d speed = %5.2f solutions/sec sizebound=%10d notbound=%10d " % (lcandidates,_maxsize,_solution,speed,_sizebound,_notbound),
        sys.stderr.flush()
        if lclique > _maxsize:
            _maxsize=lclique

#    print >>sys.stderr,'koukou'

    timer = time.time() - _lastyield

    if not candidates and not notlist:
        if lclique==_maxsize:
            _maxclique=set(clique)
        if lclique >= minsize:
            yield set(clique)
    if timeout is not None and timer > timeout and _maxclique is not None:
        yield _maxclique
        _maxclique=None

    else:
        while notmin and candidates and ((lclique + len(candidates)) >= minsize or (timeout is not None and timer > timeout)):
            # count explored solutions
            _solution+=1

            if notfix is None:
                nextcandidate = candidates.pop()
            else:
                nextcandidate = notfixneib.pop()
                candidates.remove(nextcandidate)

            clique.add(nextcandidate)

            neighbours = graph.neighbourIndexSet(index=nextcandidate)

            nextcandidates = candidates & neighbours
            nextnot = notlist & neighbours

            nnc = candidates - neighbours
            lnnc=len(nnc)

            for c in _cliqueIterator(graph,
                                     set(clique),
                                     nextcandidates,
                                     nextnot,
                                     minsize,
                                     start,
                                     timeout=timeout):
                yield c

            clique.remove(nextcandidate)

            notmin-=1

            if lnnc < notmin:
                notmin = lnnc
                notfix = nextcandidate
                notfixneib = nnc

            if notmin==0:
                _notbound+=1

            notlist.add(nextcandidate)
        else:
            if (lclique + len(candidates)) < minsize:
                _sizebound+=1

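
# Usage sketch (illustrative, not from the original file): enumerating cliques
# of at least 3 nodes in an obitools.graph.Graph instance 'g' assumed built
# elsewhere; each clique is yielded as a set of node indices.
#
#     for clique in cliqueIterator(g, minsize=3):
#         print sorted(clique)
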
@ -1,8 +0,0 @@

def compactGraph(graph,nodeSetIterator):
    compact = graph.newEmpty()
    for ns in nodeSetIterator(graph):
        nlabel = "\n".join([str(graph.getNode(index=x).label) for x in ns])
        compact.addNode(nlabel)
    print
    print compact
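
# Usage sketch (illustrative, not from the original file): collapsing each
# connected component of 'g' into one node, using componentIterator as the
# node-set source; the import path below matches the one used by the DAG module.
#
#     from obitools.graph.algorithms.component import componentIterator
#     compactGraph(g, componentIterator)   # prints the compacted graph
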
@ -1,82 +0,0 @@
"""
Iterate through the connected components of a graph
---------------------------------------------------

The module :py:mod:`obitools.graph.algorithms.component` provides
two functions to deal with the connected components of a graph
represented as a :py:class:`obitools.graph.Graph` instance.

The whole set of connected components of a graph is a partition of this
graph, so a node cannot belong to two distinct connected components.

Two nodes are in the same connected component if there exists a path
through the graph edges linking them.

TODO: There is certainly a bug with DirectedGraph

"""

def componentIterator(graph,nodePredicat=None,edgePredicat=None):
    '''
    Build an iterator over the connected components of a graph.
    Each connected component returned by the iterator is represented
    as a `set` of node indices.

    :param graph: the graph to partition
    :type graph: :py:class:`obitools.graph.Graph`

    :param nodePredicat: a function allowing node selection. Default value
                         is **None** and indicates that all nodes are selected.
    :type nodePredicat: a function returning a boolean value
                        and accepting one argument of class :py:class:`Node`

    :param edgePredicat: a function allowing edge selection. Default value
                         is **None** and indicates that all edges are selected.
    :type edgePredicat: a function returning a boolean value
                        and accepting one argument of class :py:class:`Edge`

    :return: an iterator over the connected component sets
    :rtype: an iterator over `set` of `int`

    .. seealso::
        the :py:meth:`obitools.graph.Graph.componentIndexSet` method
        on which this function is based.
    '''
    seen = set()
    for n in graph.nodeIterator(nodePredicat):
        if n.index not in seen:
            cc=n.componentIndexSet(nodePredicat, edgePredicat)
            yield cc
            seen |= cc

def componentCount(graph,nodePredicat=None,edgePredicat=None):
    '''
    Count the connected components in a graph.

    :param graph: the graph to partition
    :type graph: :py:class:`obitools.graph.Graph`

    :param nodePredicat: a function allowing node selection. Default value
                         is **None** and indicates that all nodes are selected.
    :type nodePredicat: a function returning a boolean value
                        and accepting one argument of class :py:class:`Node`

    :param edgePredicat: a function allowing edge selection. Default value
                         is **None** and indicates that all edges are selected.
    :type edgePredicat: a function returning a boolean value
                        and accepting one argument of class :py:class:`Edge`

    :return: the number of connected components
    :rtype: `int`

    .. seealso::
        the :py:func:`componentIterator` function
        on which this function is based.
    '''
    n=0
    for c in componentIterator(graph,nodePredicat, edgePredicat):
        n+=1
    return n
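
A hedged example of the two functions above on a graph with two connected components:

from obitools.graph import UndirectedGraph
from obitools.graph.algorithms.component import componentIterator, componentCount

g = UndirectedGraph()
g.addEdge(node1='a', node2='b')
g.addNode('c')

for cc in componentIterator(g):
    print cc              # a set of node indices, e.g. set([0, 1])
print componentCount(g)   # 2
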
Binary file not shown.
@ -1,80 +0,0 @@
from obitools.graph import DiGraph,Node
from obitools.graph.algorithms.component import componentIterator

class DAG(DiGraph):
    def __init__(self,label='G',indexer=None,nodes=None,edges=None):
        '''
        Directed acyclic graph (DAG) constructor.

        @param label: Graph name, set to 'G' by default
        @type label: str
        @param indexer: node label indexer
        @type indexer: Indexer instance
        @param nodes: set of nodes to add to the graph
        @type nodes: iterable value
        @param edges: set of edges to add to the graph
        @type edges: iterable value
        '''
        self._parents={}
        DiGraph.__init__(self, label, indexer, nodes, edges)

    def getNode(self,node=None,index=None):
        if index is None:
            index = self._index.getIndex(node, True)
        return DAGNode(index,self)

    def addEdge(self,parent=None,node=None,indexp=None,index=None,**data):
        indexp=self.addNode(parent, indexp)
        index =self.addNode(node , index)

        pindex = set(n.index
                     for n in self.getNode(index=indexp).ancestorIterator())

        assert index not in pindex,'Child node cannot be a parent node'

        DiGraph.addEdge(self,index1=indexp,index2=index,**data)

        if index in self._parents:
            self._parents[index].add(indexp)
        else:
            self._parents[index]=set([indexp])

        return (indexp,index)

    def getRoots(self):
        return [self.getNode(index=cc.pop()).getRoot()
                for cc in componentIterator(self)]


class DAGNode(Node):

    def ancestorIterator(self):
        if self.index in self.graph._parents:
            for p in self.graph._parents[self.index]:
                parent = DAGNode(p,self.graph)
                yield parent
                for pnode in parent.ancestorIterator():
                    yield pnode

    def getRoot(self):
        # a node without any ancestor is its own root
        x = self
        for x in self.ancestorIterator():
            pass
        return x

    def leavesIterator(self):
        if not self:
            yield self
        for n in self:
            for nn in n.leavesIterator():
                yield nn

    def subgraphIterator(self):
        yield self
        for n in self:
            for nn in n.subgraphIterator():
                yield nn
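
A short sketch of the DAG API above: edges are declared parent-first, `addEdge` rejects any edge that would make a node its own ancestor, and `getRoot` walks `ancestorIterator` up to a parentless node.

dag = DAG(label='demo')
dag.addEdge(parent='root', node='child')
dag.addEdge(parent='child', node='leaf')

leaf = dag.getNode(node='leaf')
print leaf.getRoot().label                          # 'root'
print [a.index for a in leaf.ancestorIterator()]    # indices of 'child', then 'root'
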
@ -1,117 +0,0 @@
from obitools.graph.dag import DAG,DAGNode

class RootedTree(DAG):

    def addEdge(self,parent=None,node=None,indexp=None,index=None,**data):
        indexp=self.addNode(parent, indexp)
        index =self.addNode(node , index)

        assert index not in self._parents or indexp in self._parents[index], \
            'Child node cannot have more than one parent node'

        return DAG.addEdge(self,indexp=indexp,index=index,**data)

    def getNode(self,node=None,index=None):
        if index is None:
            index = self._index.getIndex(node, True)
        return RootedTreeNode(index,self)


class RootedTreeNode(DAGNode):

    def subTreeSize(self):
        n=1
        for subnode in self:
            n+=subnode.subTreeSize()
        return n

    def subTreeLeaves(self):
        if not self:
            return 1
        n=0
        for subnode in self:
            n+=subnode.subTreeLeaves()
        return n


def nodeWriter(node,deep=0,label=None,distance="distance", bootstrap="bootstrap",cartoon=None,collapse=None):

    ks = node.keys()

    if label is None:
        name=node.label
    elif callable(label):
        name=label(node)
    elif isinstance(label, str) and label in node:
        name=node[label]
        ks.remove(label)
    else:
        name=''

    if distance in node:
        dist=':%6.5f' % node[distance]
        ks.remove(distance)
    else:
        dist=''

    ks = ["%s=%s" % (k,node[k]) for k in ks]

    if cartoon is not None and cartoon(node):
        ks.append("!cartoon={%d,0.0}" % node.subTreeLeaves())

    if collapse is not None and collapse(node):
        ks.append('!collapse={"collapsed",0.0}')

    if ks:
        ks="[&"+",".join(ks)+"]"
    else:
        ks=''

    nodeseparator = ',\n' + ' ' * (deep+1)

    subnodes = nodeseparator.join([nodeWriter(x, deep+1,label,distance,bootstrap,cartoon=cartoon,collapse=collapse)
                                   for x in node])
    if subnodes:
        subnodes='(\n' + ' ' * (deep+1) + subnodes + '\n' + ' ' * deep + ')'

    return '%s"%s"%s%s' % (subnodes,name,ks,dist)


def nexusFormat(tree,startnode=None,label=None,blocks="",cartoon=None,collapse=None):
    head="#NEXUS\n"

    tx = []

    for n in tree:
        if label is None:
            name=n.label
        elif callable(label):
            name=label(n)
        elif isinstance(label, str) and label in n:
            name=n[label]
        else:
            name=''

        if name:
            tx.append('"%s"' % name)

    taxa = "begin taxa;\n\tdimensions ntax=%d;\n\ttaxlabels\n\t" % len(tx)
    taxa+="\n\t".join(tx)
    taxa+="\n;\nend;\n\n"

    if startnode is not None:
        roots =[startnode]
    else:
        roots = tree.getRoots()
    trees = nodeWriter(roots[0],0,label,cartoon=cartoon,collapse=collapse)
    trees = "begin trees;\n\ttree tree_1 = [&R] "+ trees +";\nend;\n\n"
    return head+taxa+trees+"\n\n"+blocks+"\n"
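
A hedged sketch of the NEXUS writer above on a two-leaf rooted tree; `nexusFormat` emits a `#NEXUS` header, a taxa block, and a trees block with FigTree-style `[&...]` annotations:

tree = RootedTree()
tree.addEdge(parent='root', node='A')
tree.addEdge(parent='root', node='B')

print nexusFormat(tree)
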
@ -1,37 +0,0 @@
from obitools.graph import UndirectedGraph,Node
from obitools.graph.algorithms.component import componentCount


class Forest(UndirectedGraph):

    def getNode(self,node=None,index=None):
        if index is None:
            index = self._index.getIndex(node, True)
        return TreeNode(index,self)

    def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data):
        index1=self.addNode(node1, index1)
        index2=self.addNode(node2, index2)

        cc = set(n.index for n in self.getNode(index=index2).componentIterator())

        assert index1 in self._node[index2] or index1 not in cc, \
            "No more than one path is allowed between two nodes in a tree"

        UndirectedGraph.addEdge(self, index1=index1, index2=index2,**data)

        return (index1,index2)

    def isASingleTree(self):
        return componentCount(self)==1

class TreeNode(Node):

    def componentIterator(self):
        for c in self:
            yield c
            for cc in c:
                yield cc
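
A minimal sketch of the Forest invariant above: the assertion in `addEdge` forbids a second path between two nodes, so each connected component stays a tree.

f = Forest()
f.addEdge(node1='a', node2='b')
f.addEdge(node1='b', node2='c')
print f.isASingleTree()    # True: a-b-c is a single tree
f.addEdge(node1='a', node2='c')   # would close a cycle -> AssertionError
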
504
obitools/gzip.py
@ -1,504 +0,0 @@
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed.

This is a patched version of the standard Python gzip module, based on
Andrew Kuchling's minigzip.py distributed with the zlib module.

"""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import struct, sys, time
import zlib
import __builtin__

__all__ = ["GzipFile","open"]

FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

READ, WRITE = 1, 2

def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
    """
    if i < 0:
        i += 1L << 32
    return i

def LOWU32(i):
    """Return the low-order 32 bits of an int, as a non-negative int."""
    return i & 0xFFFFFFFFL

def write32(output, value):
    output.write(struct.pack("<l", value))

def write32u(output, value):
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    output.write(struct.pack("<L", value))

def read32(input):
    return struct.unpack("<l", input.read(4))[0]

def unpack32(buf):
    return struct.unpack("<l", buf)[0]

def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)

class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024 # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file. It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written. The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. The default is 9.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'): filename = fileobj.name
            else: filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            self.extrabuf = ""
            self.extrasize = 0
            self.filename = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError, "Mode " + mode + " not supported"

        self.fileobj = fileobj
        self.offset = 0
        self.inputbuf = ''
        self.last8 = ''

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(time.time()))
        self.fileobj.write('\002')
        self.fileobj.write('\377')
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_internal(self, size):
        if len(self.inputbuf) < size:
            self.inputbuf += self.fileobj.read(size-len(self.inputbuf))
        chunk = self.inputbuf[:size]
        # need to use len(chunk) below instead of size in case it's EOF.
        if len(chunk) < 8:
            self.last8 = self.last8[len(chunk):] + chunk
        else:
            self.last8 = chunk[-8:]
        self.inputbuf = self.inputbuf[size:]
        return chunk

    def _read_gzip_header(self):
        magic = self._read_internal(2)
        if len(magic) != 2:
            raise EOFError, "Reached EOF"
        if magic != '\037\213':
            raise IOError, 'Not a gzipped file'
        method = ord( self._read_internal(1) )
        if method != 8:
            raise IOError, 'Unknown compression method'
        flag = ord( self._read_internal(1) )
        # modtime = self.fileobj.read(4)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self._read_internal(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self._read_internal(1))
            xlen = xlen + 256*ord(self._read_internal(1))
            self._read_internal(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self._read_internal(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self._read_internal(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self._read_internal(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

    def read(self, size=-1):
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        if self.fileobj is None:
            raise EOFError, "Reached EOF"

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # _read_gzip_header will raise EOFError exception
            # if there are no more members to read.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self._read_internal(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError, 'Reached EOF'

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so put back unused_data and initialize last8 by reading them.
            self.inputbuf = self.decompress.unused_data + self.inputbuf
            self._read_internal(8)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values. Note that the size
        # stored is the true file size mod 2**32.
        crc32 = unpack32(self.last8[:4])
        isize = U32(unpack32(self.last8[4:]))   # may exceed 2GB
        if U32(crc32) != U32(self.crc):
            raise IOError, "CRC check failed"
        elif isize != LOWU32(self.size):
            raise IOError, "Incorrect length of data produced"

    def close(self):
        if self.mode == WRITE:
            self.fileobj.write(self.compress.flush())
            # The native zlib crc is an unsigned 32-bit integer, but
            # the Python wrapper implicitly casts that to a signed C
            # long. So, on a 32-bit box self.crc may "look negative",
            # while the same crc on a 64-bit box may "look positive".
            # To avoid irksome warnings from the `struct` module, force
            # it to look positive on all boxes.
            write32u(self.fileobj, LOWU32(self.crc))
            # self.size may exceed 2GB, or even 4GB
            write32u(self.fileobj, LOWU32(self.size))
            self.fileobj = None
        elif self.mode == READ:
            self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def __del__(self):
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            return
        self.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        return False

    def tell(self):
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset):
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
        if size < 0:
            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs) # Return resulting line

    def readlines(self, sizehint=0):
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            L.append(line)
            sizehint = sizehint - len(line)

        return L

    def writelines(self, L):
        for line in L:
            self.write(line)

    def __iter__(self):
        return self

    def next(self):
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration


def _test():
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    print "filename doesn't end in .gz:", repr(arg)
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        if g is not sys.stdout:
            g.close()
        if f is not sys.stdin:
            f.close()

if __name__ == '__main__':
    _test()
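
A hedged usage sketch: the patched module keeps the stdlib gzip API, so reading a (possibly multi-member) gzipped file is unchanged; `'data.txt.gz'` is a placeholder path.

from obitools import gzip as obigzip

f = obigzip.open('data.txt.gz', 'rb')
for line in f:
    print line,
f.close()
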
Binary file not shown.
@ -1,538 +0,0 @@
import obitools
import re
import array

class Location(object):
    """
    Define a location on a sequence.
    """

    def extractSequence(self,sequence):
        '''
        Extract the subsequence corresponding to a Location.

        @param sequence: the sequence from which the region is extracted
        @type sequence: C{BioSequence} or C{str}
        '''
        assert isinstance(sequence, (obitools.BioSequence,str)), \
            "sequence must be an instance of str or BioSequence"

        if isinstance(sequence, str):
            seq = self._extractSequence(sequence)
        else:
            if isinstance(sequence, obitools.AASequence):
                assert not self.needNucleic(), \
                    "This location can be used only with Nucleic sequences"
            seq = self._extractSequence(str(sequence))

            if isinstance(sequence, obitools.AASequence):
                st = obitools.AASequence
            else:
                st = obitools.NucSequence

            seq = st(sequence.id,
                     seq,
                     sequence.definition,
                     **sequence.getTags())
            seq['location']=str(self)

            if 'length' in sequence.getTags():
                seq['length']=len(seq)

            if hasattr(sequence, 'quality'):
                quality = self._extractQuality(sequence)
                seq.quality=quality

        return seq

    def isDirect(self):
        return None

    def isSimple(self):
        '''
        Indicate if a location is composed of a single continuous
        region or is composed by the junction of several locations
        by the C{join} operator.

        @return: C{True} if the location is composed of a single
                 continuous region.
        @rtype: bool
        '''
        return None

    def isFullLength(self):
        return None

    def needNucleic(self):
        '''
        If a location contains a complement operator, it can be used
        only on nucleic sequences.

        @return: C{True} if location contains a complement operator
        @rtype: bool
        '''
        return None

    def getGloc(self):
        loc = self.simplify()
        assert loc.isDirect() is not None,"Gloc cannot be created for multi oriented location : %s" % str(loc)
        positions = ','.join([str(x) for x in loc._getglocpos()])
        return "(%s,%s)" % ({True:'T',False:'F'}[loc.isDirect()],
                            positions)

    def shift(self,s):
        return None

    def getBegin(self):
        return None

    def getEnd(self):
        return None

    def getFivePrime(self):
        return self.getBegin()

    def getThreePrime(self):
        return self.getEnd()

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")
    fivePrime=property(getFivePrime,None,None,"5' position of the location")
    threePrime=property(getThreePrime,None,None,"3' position of the location")

    def __abs__(self):
        assert self.isDirect() is not None,"Abs operator cannot be applied on non oriented location"
        if self.isDirect():
            return self
        else:
            return ComplementLocation(self).simplify()

    def __cmp__(self,y):
        if self.begin < y.begin:
            return -1
        if self.begin > y.begin:
            return 1
        if self.isDirect() == y.isDirect():
            return 0
        if self.isDirect() and not y.isDirect():
            return -1
        return 1

class SimpleLocation(Location):
    """
    A simple location describes a continuous region of
    a sequence defined by a C{begin} and an C{end} position.
    """

    def __init__(self,begin,end):
        '''
        Build a new C{SimpleLocation} instance. Valid
        positions are defined on M{[1,N]} with N the length
        of the sequence.

        @param begin: start position of the location
        @type begin: int
        @param end: end position of the location
        @type end: int
        '''
        assert begin > 0 and end > 0

        self._begin = begin
        self._end = end
        self._before=False
        self._after=False

    def _extractSequence(self,sequence):
        assert ( self._begin < len(sequence)
                 and self._end <= len(sequence)), \
            "Sequence length %d is too short" % len(sequence)

        return sequence[self._begin-1:self._end]

    def _extractQuality(self,sequence):
        assert ( self._begin < len(sequence)
                 and self._end <= len(sequence)), \
            "Sequence length %d is too short" % len(sequence)

        return sequence.quality[self._begin-1:self._end]

    def isDirect(self):
        return True

    def isSimple(self):
        return True

    def isFullLength(self):
        return not (self.before or self.after)

    def simplify(self):
        if self._begin == self._end:
            return PointLocation(self._begin)
        else:
            return self

    def needNucleic(self):
        return False

    def __str__(self):
        before = {True:'<',False:''}[self.before]
        after = {True:'>',False:''}[self.after]
        return "%s%d..%s%d" % (before,self._begin,after,self._end)

    def shift(self,s):
        assert (self._begin + s) > 0,"shift too large (%d)" % s
        if s == 0:
            return self
        return SimpleLocation(self._begin + s, self._end + s)

    def _getglocpos(self):
        return (self.begin,self.end)

    def getGloc(self):
        positions = ','.join([str(x) for x in self._getglocpos()])
        return "(%s,%s)" % ({True:'T',False:'F'}[self.isDirect()],
                            positions)

    def getBegin(self):
        return self._begin

    def getEnd(self):
        return self._end

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")

    def getBefore(self):
        return self._before

    def getAfter(self):
        return self._after

    def setBefore(self,value):
        assert isinstance(value, bool)
        self._before=value

    def setAfter(self,value):
        assert isinstance(value, bool)
        self._after=value

    before=property(getBefore,setBefore,None)
    after=property(getAfter,setAfter,None)


class PointLocation(Location):
    """
    A point location describes a location on a sequence
    limited to a single position.
    """

    def __init__(self,position):
        assert position > 0
        self._pos=position

    def _extractSequence(self,sequence):
        assert self._pos <= len(sequence), \
            "Sequence length %d is too short" % len(sequence)

        return sequence[self._pos-1]

    def _extractQuality(self,sequence):
        assert self._pos <= len(sequence), \
            "Sequence length %d is too short" % len(sequence)

        return sequence.quality[self._pos-1:self._pos]

    def isDirect(self):
        return True

    def isSimple(self):
        return True

    def isFullLength(self):
        return True

    def simplify(self):
        return self

    def needNucleic(self):
        return False

    def shift(self,s):
        assert (self._pos + s) > 0,"shift too large (%d)" % s
        if s == 0:
            return self
        return PointLocation(self._pos + s)

    def _getglocpos(self):
        return (self._pos,self._pos)

    def getBegin(self):
        return self._pos

    def getEnd(self):
        return self._pos

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")

    def __str__(self):
        return str(self._pos)

class CompositeLocation(Location):
    """
    A composite location describes the junction of several
    sub-locations, as produced by the C{join} operator.
    """
    def __init__(self,locations):
        self._locs = tuple(locations)

    def _extractSequence(self,sequence):
        seq = ''.join([x._extractSequence(sequence)
                       for x in self._locs])
        return seq

    def _extractQuality(self,sequence):
        rep=array.array('d',[])
        for x in self._locs:
            rep.extend(x._extractQuality(sequence))
        return rep

    def isDirect(self):
        hasDirect,hasReverse = reduce(lambda x,y: (x[0] or y,x[1] or not y),
                                      (z.isDirect() for z in self._locs),(False,False))

        if hasDirect and not hasReverse:
            return True
        if hasReverse and not hasDirect:
            return False

        return None

    def isSimple(self):
        return False

    def simplify(self):
        if len(self._locs)==1:
            return self._locs[0]

        rep = CompositeLocation(x.simplify() for x in self._locs)

        if reduce(lambda x,y : x and y,
                  (isinstance(z, ComplementLocation)
                   for z in self._locs)):
            rep = ComplementLocation(CompositeLocation(x._loc.simplify()
                                                       for x in rep._locs[::-1]))

        return rep

    def isFullLength(self):
        return reduce(lambda x,y : x and y, (z.isFullLength() for z in self._locs),1)

    def needNucleic(self):
        return reduce(lambda x,y : x or y,
                      (z.needNucleic() for z in self._locs),
                      False)

    def _getglocpos(self):
        return reduce(lambda x,y : x + y,
                      (z._getglocpos() for z in self._locs))

    def getBegin(self):
        return min(x.getBegin() for x in self._locs)

    def getEnd(self):
        return max(x.getEnd() for x in self._locs)

    def shift(self,s):
        assert (self.getBegin() + s) > 0,"shift too large (%d)" % s
        if s == 0:
            return self
        return CompositeLocation(x.shift(s) for x in self._locs)

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")

    def __str__(self):
        return "join(%s)" % ','.join([str(x)
                                      for x in self._locs])

class ComplementLocation(Location):
    """
    A complement location describes a region located on the
    reverse complement strand of a nucleic sequence.
    """

    _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
           'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
           's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
           'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
           '-': '-'}

    def __init__(self,location):
        self._loc = location

    def _extractSequence(self,sequence):
        seq = self._loc._extractSequence(sequence)
        seq = ''.join([ComplementLocation._comp.get(x.lower(),'n') for x in seq[::-1]])
        return seq

    def _extractQuality(self,sequence):
        return sequence.quality[::-1]

    def isDirect(self):
        return False

    def isSimple(self):
        return self._loc.isSimple()

    def isFullLength(self):
        return self._loc.isFullLength()

    def simplify(self):
        if isinstance(self._loc, ComplementLocation):
            return self._loc._loc.simplify()
        else:
            return self

    def needNucleic(self):
        return True

    def __str__(self):
        return "complement(%s)" % self._loc

    def shift(self,s):
        assert (self.getBegin() + s) > 0,"shift too large (%d)" % s
        if s == 0:
            return self
        return ComplementLocation(self._loc.shift(s))

    def _getglocpos(self):
        return self._loc._getglocpos()

    def getBegin(self):
        return self._loc.getBegin()

    def getEnd(self):
        return self._loc.getEnd()

    def getFivePrime(self):
        return self.getEnd()

    def getThreePrime(self):
        return self.getBegin()

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")
    fivePrime=property(getFivePrime,None,None,"5' position of the location")
    threePrime=property(getThreePrime,None,None,"3' position of the location")


#
# Internal functions used for location parsing
#

def __sublocationIterator(text):
    sl = []
    plevel=0
    for c in text:
        assert plevel>=0,"Misformatted location : %s" % text
        if c == '(':
            plevel+=1
            sl.append(c)
        elif c==')':
            plevel-=1
            sl.append(c)
        elif c==',' and plevel == 0:
            assert sl,"Misformatted location : %s" % text
            yield ''.join(sl)
            sl=[]
        else:
            sl.append(c)
    assert sl and plevel==0,"Misformatted location : %s" % text
    yield ''.join(sl)


#
# Internal functions used for location parsing
#

__simplelocparser = re.compile('(?P<before><?)(?P<from>[0-9]+)(\.\.(?P<after>>?)(?P<to>[0-9]+))?')


def __locationParser(text):
    text=text.strip()
    if text[0:5]=='join(':
        assert text[-1]==')',"Misformatted location : %s" % text
        return CompositeLocation(__locationParser(sl) for sl in __sublocationIterator(text[5:-1]))
    elif text[0:11]=='complement(':
        assert text[-1]==')',"Misformatted location : %s" % text
        subl = tuple(__locationParser(sl) for sl in __sublocationIterator(text[11:-1]))
        if len(subl)>1:
            subl = CompositeLocation(subl)
        else:
            subl = subl[0]
        return ComplementLocation(subl)
    else:
        data = __simplelocparser.match(text)
        assert data is not None,"Misformatted location : %s" % text
        data = data.groupdict()
        if not data['to'] :
            sl = PointLocation(int(data['from']))
        else:
            sl = SimpleLocation(int(data['from']),int(data['to']))
        sl.before=data['before']=='<'
        sl.after=data['after']=='>'
        return sl

def locationGenerator(locstring):
    '''
    Parse a location string as present in genbank or embl files.

    @param locstring: string description of the location in embl/gb format
    @type locstring: str

    @return: a Location instance
    @rtype: C{Location} subclass instance
    '''
    return __locationParser(locstring)


_matchExternalRef = re.compile('[A-Za-z0-9_|]+(\.[0-9]+)?(?=:)')

def extractExternalRefs(locstring):
    '''
    When a location describes external references (ex: D28156.1:1..>1292),
    separate the external reference part of the location from the location
    itself.

    @param locstring: text representation of the location.
    @type locstring: str

    @return: a tuple with a set of strings describing the accession numbers
             of the referred sequences and a C{Location} instance.

    @rtype: tuple(set,Location)
    '''
    m = set(x.group() for x in _matchExternalRef.finditer(locstring))
    clean = re.compile(':|'.join([re.escape(x) for x in m])+':')
    cloc = locationGenerator(clean.sub('',locstring))

    return m,cloc
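
A hedged example of the location parser above; coordinates are 1-based and inclusive, and a complement location reverse-complements the extracted region:

loc = locationGenerator('complement(join(1..3,7..9))')
print str(loc)                              # complement(join(1..3,7..9))
print loc.needNucleic()                     # True
print loc.extractSequence('aaatttcccggg')   # 'gggttt' (reverse complement of 'aaaccc')
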
Binary file not shown.
@ -1,177 +0,0 @@
from obitools.location import Location,locationGenerator
import logging
import re


# NB: the runs of spaces in these patterns are significant (they were
# collapsed in the page rendering): 'FT' + 3 spaces matches EMBL feature
# lines, 2 + 3 spaces the 5-column GenBank layout.
_featureMatcher = re.compile('^(FT|  )   [^ ].+\n((FT|  )    .+\n)+',re.M)
_featureCleaner = re.compile('^FT',re.M)


def textFeatureIterator(fttable):
    '''
    Iterate through a textual description of a feature table in a genbank
    or embl format. Return at each step a text representation of each individual
    feature composing the table.

    @param fttable: a string corresponding to the feature table of a genbank
                    or an embl entry

    @type fttable: C{str}

    @return: an iterator on str
    @rtype: iterator

    @see: L{ftParser}
    '''
    for m in _featureMatcher.finditer(fttable):
        t = m.group()
        # replace the leading 'FT' of EMBL lines by two spaces to align
        # with the GenBank column layout
        t = _featureCleaner.sub('  ',t)
        yield t

_qualifierMatcher = re.compile('(?<=^ {21}/).+(\n {21}[^/].+)*',re.M)
_qualifierCleanner= re.compile("^ +",re.M)

def qualifierIterator(qualifiers):
    '''
    Parse a textual description of a feature in embl or genbank format
    as returned by the textFeatureIterator iterator and iterate through
    the key, value qualifiers defining this feature.

    @param qualifiers: substring containing qualifiers
    @type qualifiers: str

    @return: an iterator on tuple (key,value), where keys are C{str}
    @rtype: iterator
    '''
    for m in _qualifierMatcher.finditer(qualifiers):
        t = m.group()
        t = _qualifierCleanner.sub('',t)
        t = t.split('=',1)
        if len(t)==1:
            t = (t[0],None)
        else:
            if t[0]=='translation':
                value = t[1].replace('\n','')
            else:
                value = t[1].replace('\n',' ')
            try:
                value = eval(value)
            except:
                pass
            t = (t[0],value)
        yield t


_ftmatcher = re.compile('(?<=^ {5})\S+')
_locmatcher= re.compile('(?<=^.{21})[^/]+',re.DOTALL)
_cleanloc = re.compile('[\s\n]+')
_qualifiersMatcher = re.compile('^ +/.+',re.M+re.DOTALL)

def ftParser(feature):
    fttype = _ftmatcher.search(feature).group()
    location=_locmatcher.search(feature).group()
    location=_cleanloc.sub('',location)
    qualifiers=_qualifiersMatcher.search(feature)
    if qualifiers is not None:
        qualifiers=qualifiers.group()
    else:
        qualifiers=""
        logging.debug("Qualifiers regex not matching on \n=====\n%s\n========" % feature)

    return fttype,location,qualifiers


class Feature(dict,Location):
    def __init__(self,type,location):
        self._fttype=type
        self._loc=location

    def getFttype(self):
        return self._fttype

    def extractSequence(self,sequence,withQualifier=False):
        seq = self._loc.extractSequence(sequence)
        if withQualifier:
            seq.getInfo().update(self)
        return seq

    def isDirect(self):
        return self._loc.isDirect()

    def isSimple(self):
        return self._loc.isSimple()

    def isFullLength(self):
        return self._loc.isFullLength()

    def simplify(self):
        f = Feature(self._fttype,self._loc.simplify())
        f.update(self)
        return f

    def locStr(self):
        return str(self._loc)

    def needNucleic(self):
        return self._loc.needNucleic()

    def __str__(self):
        return repr(self)

    def __repr__(self):
        return str((self.ftType,str(self._loc),dict.__repr__(self)))

    def __cmp__(self,y):
        return self._loc.__cmp__(y)

    def _getglocpos(self):
        return self._loc._getglocpos()

    ftType = property(getFttype, None, None, "Feature type name")

    def shift(self,s):
        assert (self.getBegin() + s) > 0,"shift too large (%d)" % s
        if s == 0:
            return self
        f = Feature(self._fttype,self._loc.shift(s))
        f.update(self)
        return f

    def getBegin(self):
        return self._loc.getBegin()

    def getEnd(self):
        return self._loc.getEnd()

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")


def featureFactory(featureDescription):
    fttype,location,qualifiers = ftParser(featureDescription)
    location = locationGenerator(location)
    feature = Feature(fttype,location)
    feature.raw = featureDescription

    for k,v in qualifierIterator(qualifiers):
        feature.setdefault(k,[]).append(v)

    return feature

def featureIterator(featureTable,skipError=False):
    for tft in textFeatureIterator(featureTable):
        try:
            feature = featureFactory(tft)
        except AssertionError,e:
            logging.debug("Parsing error on feature :\n===============\n%s\n===============" % tft)
            if not skipError:
                raise e
            logging.debug("\t===> Error skipped")
            continue

        yield feature
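
A minimal sketch of the feature-table parser above on one GenBank-style feature (feature key at column 6, qualifier text starting at column 22):

fttable = (
    '     gene            212..1918\n'
    '                     /gene="xyzA"\n'
)
for feat in featureIterator(fttable):
    print feat.ftType, feat.locStr(), feat['gene']   # gene 212..1918 ['xyzA']
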
@ -1,265 +0,0 @@
|
|||||||
from obitools.ecopcr.options import addTaxonomyFilterOptions,\
|
|
||||||
loadTaxonomyDatabase
|
|
||||||
from obitools.graph import UndirectedGraph
|
|
||||||
from obitools.align import lenlcs,isLCSReachable
|
|
||||||
from obitools.graph.algorithms.component import componentIterator
|
|
||||||
from obitools.utils.bioseq import uniqSequence
|
|
||||||
from obitools.utils import progressBar
|
|
||||||
import math
|
|
||||||
import sys
|
|
||||||
from obitools.graph.rootedtree import RootedTree
|
|
||||||
|
|
||||||
def average(x):
|
|
||||||
x=list(x)
|
|
||||||
s = sum(i*j for (i,j) in x)
|
|
||||||
n = sum(i[1] for i in x)
|
|
||||||
return (float(s)/float(n),n)
|
|
||||||
|
|
||||||
def minimum(x):
|
|
||||||
x=list(x)
|
|
||||||
m = min(i[0] for i in x)
|
|
||||||
n = sum(i[1] for i in x)
|
|
||||||
return (float(m),n)
|
|
||||||
|
|
||||||
def ecoPCRReader(entries,options):
|
|
||||||
|
|
||||||
taxonomy = loadTaxonomyDatabase(options)
|
|
||||||
|
|
||||||
norankid =options.taxonomy.findRankByName('no rank')
|
|
||||||
speciesid=options.taxonomy.findRankByName('species')
|
|
||||||
genusid =options.taxonomy.findRankByName('genus')
|
|
||||||
familyid =options.taxonomy.findRankByName('family')
|
|
||||||
|
|
||||||
minrankseq = set([speciesid,genusid,familyid])
|
|
||||||
|
|
||||||
usedrankid = {}
|
|
||||||
|
|
||||||
ingroup = []
|
|
||||||
outgroup= []
|
|
||||||
|
|
||||||
for s in entries:
|
|
||||||
if 'taxid' in s :
|
|
||||||
taxid = s['taxid']
|
|
||||||
if taxid in taxonomy:
|
|
||||||
allrank = set()
|
|
||||||
for p in options.taxonomy.parentalTreeIterator(taxid):
|
|
||||||
if p[1]!=norankid:
|
|
||||||
allrank.add(p[1])
|
|
||||||
if len(minrankseq & allrank) == 3:
|
|
||||||
for r in allrank:
|
|
||||||
usedrankid[r]=usedrankid.get(r,0) + 1
|
|
||||||
|
|
||||||
if taxonomy.isAncestor(options.ingroup,taxid):
|
|
||||||
ingroup.append(s)
|
|
||||||
else:
|
|
||||||
outgroup.append(s)
|
|
||||||
|
|
||||||
keptrank = set(r for r in usedrankid
|
|
||||||
if float(usedrankid[r])/float(len(ingroup)) > options.rankthresold)
|
|
||||||
|
|
||||||
return { 'ingroup' : ingroup,
|
|
||||||
'outgroup': outgroup,
|
|
||||||
'ranks' : keptrank
|
|
||||||
}
|
|
||||||
|
|
||||||
def buildSimilarityGraph(dbseq,ranks,taxonomy,dcmax=5):
|
|
||||||
|
|
||||||
ldbseq = len(dbseq)
|
|
||||||
pos = 1
|
|
||||||
digit = int(math.ceil(math.log10(ldbseq)))
|
|
||||||
header = "Alignment : %%0%dd x %%0%dd -> %%0%dd " % (digit,digit,digit)
|
|
||||||
aligncount = ldbseq*(ldbseq+1)/2
|
|
||||||
edgecount = 0
|
|
||||||
print >>sys.stderr
|
|
||||||
|
|
||||||
progressBar(1,aligncount,True,"Alignment : %s x %s -> %s " % ('-'*digit,'-'*digit, '0'*digit))
|
|
||||||
|
|
||||||
|
|
||||||
sim = UndirectedGraph()
|
|
||||||
|
|
||||||
i=0
|
|
||||||
for s in dbseq:
|
|
||||||
taxid = s['taxid']
|
|
||||||
|
|
||||||
rtaxon = dict((rid,taxonomy.getTaxonAtRank(taxid,rid))
|
|
||||||
for rid in ranks)
|
|
||||||
|
|
||||||
sim.addNode(i, seq=s,taxid=taxid,rtaxon=rtaxon)
|
|
||||||
|
|
||||||
i+=1
|
|
||||||
|
|
||||||
# aligner = LCS()
|
|
||||||
|
|
||||||
for is1 in xrange(ldbseq):
|
|
||||||
s1 = dbseq[is1]
|
|
||||||
ls1= len(s1)
|
|
||||||
# aligner.seqA=s1
|
|
||||||
|
|
||||||
for is2 in xrange(is1+1,ldbseq):
|
|
||||||
|
|
||||||
s2=dbseq[is2]
|
|
||||||
ls2=len(s2)
|
|
||||||
|
|
||||||
lm = max(ls1,ls2)
|
|
||||||
lcsmin = lm - dcmax
|
|
||||||
|
|
||||||
if isLCSReachable(s1,s2,lcsmin):
|
|
||||||
llcs,lali=lenlcs(s1,s2)
|
|
||||||
ds1s2 = lali - llcs
|
|
||||||
|
|
||||||
if ds1s2 <= dcmax:
|
|
||||||
sim.addEdge(node1=is1, node2=is2,ds1s2=ds1s2,label=ds1s2)
|
|
||||||
edgecount+=1
|
|
||||||
|
|
||||||
progressBar(pos,aligncount,head=header % (is1,is2,edgecount))
|
|
||||||
pos+=(ldbseq-is1-1)
|
|
||||||
|
|
||||||
return sim
|
|
||||||
|
|
||||||
def buildTsr(component):
    '''
    Build, for each considered taxonomic rank, the set of taxa
    present in the connected component, with their sequence counts.

    :param component: the analyzed connected component
    :type component: :py:class:`UndirectedGraph`

    :return: a dictionary indexed by rank id; each value is a `dict` indexed
             by taxid giving the count of sequences for this taxid
    :rtype: `dict` indexed by `int`, containing `dict` of `int` indexed by `int`
    '''
    taxalist = {}
    for n in component:
        for r in n['rtaxon']:
            rtaxid = n['rtaxon'][r]
            if rtaxid is not None:
                ts = taxalist.get(r,{})
                ts[rtaxid]=ts.get(rtaxid,0)+1
                taxalist[r]=ts

    return taxalist

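# Sketch of the returned structure (taxids and counts invented):
#
#   tsr = buildTsr(component)
#   # e.g. {speciesid: {9606: 4, 9598: 1}, genusid: {9605: 5}}
#   # At the species rank this component mixes two taxa, so its sequences
#   # cannot be assigned unambiguously at the current dcmax; at the genus
#   # rank it is homogeneous.
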
def edgeDistSelector(dcmax):
    def predicate(e):
        return e['ds1s2'] <= dcmax
    return predicate

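# The returned closure filters similarity-graph edges on their stored distance:
#
#   keep = edgeDistSelector(2)
#   keep({'ds1s2': 1})   # -> True : edge kept at this confusion radius
#   keep({'ds1s2': 3})   # -> False: edge ignored
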
def distanceOfConfusion(simgraph,dcmax=5,aggregate=average):

    alltaxa = set()

    for n in simgraph:
        alltaxa|=set(n['rtaxon'].values())

    taxacount = len(alltaxa)

    result = {}

    pos = [1]
    header = "Component : %-5d Identified : %-8d "
    progressBar(1,taxacount,True,header % (0,0))

    def _idc(cc,dcmax):
        composante=[]
        for x in cc:
            composante.extend(simgraph.subgraph(c)
                              for c in componentIterator(x,
                                                         edgePredicat=edgeDistSelector(dcmax)))

        good = set()
        bad  = {}

        complexe = []

        for c in composante:
            tsr = buildTsr(c)
            newbad=False
            for r in tsr:
                if len(tsr[r]) == 1:
                    taxid = tsr[r].keys()[0]
                    good.add((taxid,tsr[r][taxid]))
                else:
                    newbad=True
                    for taxid in tsr[r]:
                        bad[taxid]=bad.get(taxid,0)+tsr[r][taxid]
            if newbad:
                complexe.append(c)

        # good = good - bad

        for taxid,weight in good:
            if taxid not in result:
                result[taxid]=[]
            result[taxid].append((dcmax+1,weight))

        progressBar(pos[0],taxacount,False,header % (len(composante),pos[0]))
        pos[0]=len(result)

        if dcmax > 0:
            dcmax-=1
            _idc(complexe,dcmax)

        else:
            for taxid in bad:
                if taxid not in result:
                    result[taxid]=[]
                result[taxid].append((0,bad[taxid]))

            progressBar(pos[0],taxacount,False,header % (len(composante),pos[0]))
            pos[0]=len(result)

    _idc([simgraph],dcmax)

    for taxid in result:
        result[taxid]=aggregate(result[taxid])
    return result

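# The recursion peels the similarity graph with a shrinking dcmax: a taxon
# whose component is homogeneous at a given dcmax receives the confusion
# distance dcmax + 1, and taxa still ambiguous at dcmax == 0 receive 0.
# A hedged sketch of the aggregated output (taxids invented):
#
#   dc = distanceOfConfusion(sim, dcmax=5, aggregate=minimum)
#   # dc maps taxid -> (confusion distance, sequence count),
#   # e.g. {9606: (3.0, 12), 9598: (0.0, 4)}
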
def propagateDc(tree,node=None,aggregate=min):
    if node is None:
        node = tree.getRoots()[0]
    dca=aggregate(n['dc'] for n in node.leavesIterator())
    node['dc']=dca
    for n in node:
        propagateDc(tree, n, aggregate)


def confusionTree(distances,ranks,taxonomy,aggregate=min,bsrank='species',dcmax=1):

    def Bs(node,rank,dcmax):
        n = len(node)
        if n:
            g = [int(x['dc']>=dcmax) for x in node.subgraphIterator() if x['rank']==bsrank]
            n = len(g)
            g = sum(g)
            if n:   # guard added: a subtree may contain no node at bsrank
                bs= float(g)/float(n)
                node['bs']=bs
                node['bs_label']="%3.2f (%d)" % (bs,n)

        for n in node:
            Bs(n,rank,dcmax)

    tree = RootedTree()
    ranks = set(ranks)
    tset = set(distances)

    for taxon in distances:
        tree.addNode(taxon, rank=taxonomy.getRank(taxon),
                     name=taxonomy.getScientificName(taxon),
                     dc=float(distances[taxon][0]),
                     n=distances[taxon][1],
                     dc_label="%4.2f (%d)" % (float(distances[taxon][0]),distances[taxon][1])
                     )

    for taxon in distances:
        piter = taxonomy.parentalTreeIterator(taxon)
        taxon = piter.next()
        for parent in piter:
            if taxon[0] in tset and parent[0] in distances:
                tset.remove(taxon[0])
                tree.addEdge(parent[0], taxon[0])
            taxon=parent

    root = tree.getRoots()[0]
    Bs(root,bsrank,dcmax)

    return tree

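# bs is the fraction of bsrank-level descendants of a node whose confusion
# distance reaches dcmax: with 3 of 4 species identifiable under a genus, the
# node would carry bs = 0.75 and the label "0.75 (4)" (values invented).
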
@@ -1,34 +0,0 @@
'''
Created on 30 oct. 2011

@author: coissac
'''

from obitools.ecopcr.options import addTaxonomyDBOptions


def addMetabarcodingOption(optionManager):

    addTaxonomyDBOptions(optionManager)

    optionManager.add_option('--dcmax',
                             action="store", dest="dc",
                             metavar="###",
                             type="int",
                             default=0,
                             help="Maximum confusion distance considered")

    optionManager.add_option('--ingroup',
                             action="store", dest="ingroup",
                             metavar="###",
                             type="int",
                             default=1,
                             help="NCBI taxid delimiting the ingroup")

    optionManager.add_option('--rank-thresold',
                             action="store", dest="rankthresold",
                             metavar="#.##",
                             type="float",
                             default=0.5,
                             help="minimum fraction of the ingroup sequences "
                                  "required for considering the rank")
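# A hedged sketch of how these options plug into the option manager defined in
# the options module at the end of this diff (the obitools.options module path
# is an assumption):
#
#   from obitools.options import getOptionManager
#   optionParser = getOptionManager([addMetabarcodingOption])
#   options, entries = optionParser()
#   # options.dc, options.ingroup and options.rankthresold are now set
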
@@ -1,28 +0,0 @@
from obitools.obischemas import kb

__connection__ = None

def initConnection(options):
    global __connection__
    param = {}
    if hasattr(options, "dbname") and options.dbname is not None:
        param["database"]=options.dbname
    if hasattr(options, "dbhost") and options.dbhost is not None:
        param["host"]=options.dbhost
    if hasattr(options, "dbuser") and options.dbuser is not None:
        param["username"]=options.dbuser
    if hasattr(options, "dbpassword") and options.dbpassword is not None:
        param["password"]=options.dbpassword

    __connection__=kb.getConnection(**param)
    __connection__.autocommit=options.autocommit

def getConnection(options=None):
    global __connection__

    if options is not None:
        initConnection(options)

    assert __connection__ is not None,"database connection is not initialized"

    return __connection__
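# A hedged usage sketch; the option attributes match the dest= values declared
# by addConnectionOptions later in this diff:
#
#   initConnection(options)      # options carries dbname, dbhost, dbuser, ...
#   db = getConnection()
#   rows = db('SELECT 1')        # Connection.__call__ delegates to a cursor
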
@@ -1,55 +0,0 @@
"""
The kb package manages access to a PostgreSQL database from Python
scripts.
"""


class Connection(object):

    def __init__(self):
        raise RuntimeError('pyROM.KB.Connection is an abstract class')

    def cursor(self):
        raise RuntimeError('pyROM.KB.Connection.cursor is an abstract function')

    def commit(self):
        raise RuntimeError('pyROM.KB.Connection.commit is an abstract function')

    def rollback(self):
        raise RuntimeError('pyROM.KB.Connection.rollback is an abstract function')

    def __call__(self,query):
        return self.cursor().execute(query)


class Cursor(object):

    def __init__(self,db):
        raise RuntimeError('pyROM.KB.Cursor is an abstract class')

    def execute(self,query):
        raise RuntimeError('pyROM.KB.Cursor.execute is an abstract function')

    __call__=execute


_current_connection = None   # Static variable used to store the connection to the KB

def getConnection(*args,**kargs):
    """
    Return a connection to the database.
    When called from the database backend, no argument is needed.
    Successive calls without arguments return the same cached connection.
    """
    global _current_connection

    if _current_connection is None or args or kargs :
        try:
            from obischemas.kb import backend
            _current_connection = backend.Connection()
        except ImportError:
            from obischemas.kb import extern
            _current_connection = extern.Connection(*args,**kargs)
    return _current_connection
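# The module-level cache means repeated calls share one connection, while any
# explicit argument forces a new one. A hedged sketch (connection parameters
# invented):
#
#   c1 = getConnection(database='obi')   # opens (or replaces) the connection
#   c2 = getConnection()                 # returns the cached object
#   assert c1 is c2
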
@@ -1,78 +0,0 @@
"""
Module : KB.extern
Author : Eric Coissac
Date   : 03/05/2004

Module wrapping the psycopg interface module to allow connection
to a postgresql database with the same interface from the
backend and from external scripts.

This module defines a class usable from external scripts.
"""


import psycopg2
import sys
from obischemas import kb

class Connection(kb.Connection):

    def __init__(self,*connectParam,**kconnectParam):
        if connectParam:
            self.connectParam={'dsn':connectParam}   # fixed: was a comparison (==), not an assignment
        else:
            self.connectParam=kconnectParam
        print self.connectParam
        self.db = psycopg2.connect(**(self.connectParam))

    def restart(self):
        ok=1
        while (ok and ok < 1000):
            try:
                self.db = psycopg2.connect(**self.connectParam)
            except:
                ok+=1
            else:
                ok=0


    def cursor(self):
        curs = Cursor(self.db)
        if hasattr(self,'autocommit') and self.autocommit:
            curs.autocommit = self.autocommit
        return curs

    def commit(self):
        self.db.commit()

    def rollback(self):
        if hasattr(self,'db'):
            self.db.rollback()

    def __del__(self):
        if hasattr(self,'db'):
            self.rollback()

class Cursor(kb.Cursor):

    def __init__(self,db):
        self.db = db
        self.curs = db.cursor()

    def execute(self,query):
        try:
            self.curs.execute(query)
            if hasattr(self,'autocommit') and self.autocommit:
                self.db.commit()
        except psycopg2.ProgrammingError,e:
            print >>sys.stderr,"===> %s" % query
            raise e
        except psycopg2.IntegrityError,e:
            print >>sys.stderr,"---> %s" % query
            raise e
        try:
            label = [x[0] for x in self.curs.description]
            return [dict(map(None,label,y))
                    for y in self.curs.fetchall()]
        except TypeError:
            return []
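# Cursor.execute returns each row as a dict keyed by column name, via the
# Python 2 idiom dict(map(None, label, y)) which zip-pads labels and values.
# A hedged sketch (table and columns invented):
#
#   db = Connection(database='obi', user='postgres')
#   for row in db("SELECT taxid, name FROM taxon LIMIT 2"):
#       print row['taxid'], row['name']
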
@@ -1,31 +0,0 @@
def addConnectionOptions(optionManager):

    optionManager.add_option('-d','--dbname',
                             action="store", dest="dbname",
                             metavar="<DB NAME>",
                             type="string",
                             help="OBISchema database name containing "
                                  "taxonomical data")

    optionManager.add_option('-H','--host',
                             action="store", dest="dbhost",
                             metavar="<DB HOST>",
                             type="string",
                             help="host hosting the OBISchema database")

    optionManager.add_option('-U','--user',
                             action="store", dest="dbuser",
                             metavar="<DB USER>",
                             type="string",
                             help="user for the OBISchema database connection")

    optionManager.add_option('-W','--password',
                             action="store", dest="dbpassword",
                             metavar="<DB PASSWORD>",
                             type="string",
                             help="password for the OBISchema database connection")

    optionManager.add_option('-A','--autocommit',
                             action="store_true",dest="autocommit",
                             default=False,
                             help="add a commit action after each query")
@@ -1,53 +0,0 @@
from obitools.obo.parser import OBOTerm
from obitools.obo.parser import OBOEntry
from obitools.obo.parser import stanzaIterator
from logging import debug

class GOEntry(OBOEntry):
    '''
    An entry of a GeneOntology .obo file. It can be a header (without a stanza name) or
    a stanza (with a stanza name between brackets). It inherits from the class dict.
    '''


class GOTerm(OBOTerm):

    '''
    A stanza named 'Term'. It inherits from the class OBOTerm.
    '''

    def __init__(self,stanza):

        ## use of the OBOTerm constructor.
        OBOTerm.__init__(self, stanza)

        assert 'namespace' in self and len(self['namespace'])==1, \
               "A GOTerm must belong to one of the cellular_component, molecular_function or biological_process namespaces"


def GOEntryFactory(stanza):
    '''
    Dispatcher of stanza.

    @param stanza: a stanza composed of several lines.
    @type stanza: text

    @return: a C{GOTerm} | C{OBOEntry} instance

    @note: The dispatcher treats the stanzas which are OBO "Term"
           differently from the others.
    '''

    stanzaType = OBOEntry.parseStanzaName(stanza)

    if stanzaType=="Term":
        return GOTerm(stanza)
    else:
        return OBOEntry(stanza)


def GOEntryIterator(file):
    entries = stanzaIterator(file)
    for e in entries:
        debug(e)
        yield GOEntryFactory(e)
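# A hedged sketch of iterating over a GeneOntology release ('go.obo' is an
# assumed local file name):
#
#   for entry in GOEntryIterator('go.obo'):
#       if isinstance(entry, GOTerm):
#           print entry.id, entry.namespace, entry.name
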
@@ -1,707 +0,0 @@
from obitools.utils import skipWhiteLineIterator,multiLineWrapper
from obitools.utils import universalOpen
from obitools.format.genericparser import genericEntryIteratorGenerator
from logging import debug,warning

import re


#################################################################################
##                       Stanza preparation area                               ##
#################################################################################


class FileFormatError(Exception):
    '''
    An error derived from the class Exception.
    '''
    pass

_oboEntryIterator = genericEntryIteratorGenerator(endEntry='^ *$',
                                                  strip=True)

def stanzaIterator(inputfile):
    '''
    Iterator over stanzas. Stanzas are the basic units of OBO files.

    @param inputfile: a stream of strings from an opened OBO file.
    @type inputfile: a stream of strings

    @return: a stream of stanzas
    @rtype: a stream of aggregated strings

    @note: The iterator builds each stanza by aggregating strings from the
           OBO file.
    '''
    inputfile = universalOpen(inputfile)
    inputfile = multiLineWrapper(inputfile)
    return _oboEntryIterator(inputfile)
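# A stanza is a blank-line separated block of an OBO file. A minimal sketch of
# the expected shape of one stanza and of the static name parser defined below:
#
#   stanza = '[Term]\nid: GO:0008150\nname: biological_process'
#   OBOEntry.parseStanzaName(stanza)   # -> 'Term'
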

#################################################################################
##                   Trailing Modifiers treatment area                         ##
#################################################################################


class TrailingModifier(dict):
    '''
    A class which inherits from the class dict. Trailing modifiers can be found
    at the end of TaggedValue objects when they exist.
    '''

    _match_brace = re.compile('(?<=\ {)[^\]]*(\}) *( !|$)')

    def __init__(self,string):

        ## search for a trailing modifiers signal
        trailing_modifiers = TrailingModifier._match_brace.search(string)

        ## the trailing modifiers exist
        if trailing_modifiers:
            trailing_modifiers=trailing_modifiers.group(0).strip()
            print trailing_modifiers
            ## creates and feeds the dictionary of trailing modifiers
            dict.__init__(self,(x.strip().split('=',1) for x in trailing_modifiers.split(',')))


def trailingModifierFactory(string):
    '''
    Dispatcher of trailing modifiers.

    @param string: a string from a TaggedValue object with a trailing modifiers signal.
    @type string: string

    @return: a class object

    @note: The dispatcher is currently very simple. Only one case is treated by the function:
           it returns an object inherited from the class dict if the trailing modifiers
           exist, None if they don't.
    '''

    trailing_modifiers = TrailingModifier(string)
    if not trailing_modifiers:
        trailing_modifiers=None
    return trailing_modifiers

#################################################################################
##                      TaggedValue treatment area                             ##
#################################################################################


class TaggedValue(object):
    '''
    A couple 'tag:value' of an OBOEntry.
    '''

    _match_value        = re.compile('(("(\\\\"|[^\"])*")|(\\\\"|[^\"]))*?( !| {|$)')
    _split_comment      = re.compile('^!| !')
    _match_quotedString = re.compile('(?<=")(\\\\"|[^\"])*(?=")')
    _match_bracket      = re.compile('\[[^\]]*\]')

    def __init__(self,line):
        '''
        Constructor of the class TaggedValue.

        @param line: a line of an OBOEntry composed of a tag and a value.
        @type line: string

        @note: The constructor separates the tag from the right-hand terms. 'value' is
        extracted from the right-hand terms using a regular expression (the value is at
        the beginning of the string, between quotes or not). Then, 'comment' is extracted
        from the rest of the string using another regular expression ('comment' is at the
        end of the string after a '!'; by default, 'comment' is set to None). Finally,
        'trailing_modifiers' are extracted from the last string using a third regular
        expression. The tag, the value, the comment and the trailing_modifiers are saved.
        '''

        debug("tagValueParser : %s" % line)

        ## by default :
        trailing_modifiers = None
        comment = None

        ## the tag is saved. 'right' is composed of the value, the comment and the trailing modifiers
        tag,right = line.split(':',1)

        ## the value is saved
        value = TaggedValue._match_value.search(right).group(0)
        debug("Extracted value : %s" % value)

        ## if there is a value AND a sign of a comment or trailing modifiers
        if value and value[-1] in '!{':
            lvalue = len(value)
            ## whatever it is, a comment or trailing modifiers, it is saved into 'extra'
            extra = right[lvalue-1:].strip()
            ## a comment is extracted
            extra =TaggedValue._split_comment.split(extra,1)
            ## and saved if it exists
            if len(extra)==2:
                comment=extra[1].strip()
            ## trailing modifiers are extracted
            extra=extra[0]
            trailing_modifiers = trailingModifierFactory(extra)
            ## the value is cleaned of any comment or trailing modifier signals
            value = value[0:-1]

        if tag=='use_term':
            tag='consider'
            raise DeprecationWarning,"use_term is a deprecated tag, you should use consider instead"

        ## recording zone
        self.value =value.strip()
        self.tag = tag
        self.__doc__=comment
        self.trailing_modifiers=trailing_modifiers

    def __str__(self):
        return str(self.value)

    def __repr__(self):
        return '''"""%s"""''' % str(self)
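# A hedged sketch of the tag/value/comment split on a typical OBO line:
#
#   tv = TaggedValue('is_a: GO:0008150 ! biological_process')
#   tv.tag       # -> 'is_a'
#   tv.value     # -> 'GO:0008150'
#   tv.__doc__   # -> 'biological_process' (the inline comment)
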

class NameValue(TaggedValue):
    '''
    A couple 'name:value' inherited from the class TaggedValue. Used to manage name tags.
    '''

    def __init__(self,line):

        ## no use of the TaggedValue constructor. The NameValue is very simple.
        tag,right = line.split(':',1)

        ## recording zone
        self.value = right.strip()
        self.tag = 'name'
        self.__doc__=None
        self.trailing_modifiers=None


class DefValue(TaggedValue):
    '''
    A couple 'def:value' inherited from the class TaggedValue. Used to manage def tags.
    '''

    def __init__(self,line):
        '''
        Constructor of the class DefValue.

        @param line: a line of an OBOEntry composed of a tag named 'def' and a value.
        @type line: string

        @note: The constructor calls the TaggedValue constructor. A regular expression
        is used to extract the 'definition' from TaggedValue.value (the definition is the
        unquoted TaggedValue.value). Another regular expression is used to extract 'dbxrefs'
        from the TaggedValue.value without the definition (dbxrefs are between brackets,
        and the definition can be so). The definition is saved as the new value of the
        DefValue. dbxrefs are saved.
        '''

        ## use of the TaggedValue constructor
        TaggedValue.__init__(self, line)

        ## the definition, which is quoted, is extracted from the standard value of a TaggedValue.
        definition = TaggedValue._match_quotedString.search(self.value).group(0)

        ## the standard value is cleaned of the definition.
        cleanvalue = self.value.replace(definition,'')
        cleanvalue = cleanvalue.replace('  ',' ')

        ## dbxrefs are searched for in the rest of the standard value.
        dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0)

        ## recording zone
        self.tag = 'def'
        ## the value of a DefValue is not the standard value but the definition.
        self.value=definition
        self.dbxrefs=xrefFactory(dbxrefs)


class SynonymValue(TaggedValue):
    '''
    A couple 'synonym:value' inherited from the class TaggedValue. Used to manage
    synonym tags, exact_synonym tags, broad_synonym tags and narrow_synonym tags.
    '''

    _match_scope = re.compile('(?<="")[^\[]*(?=\[|$)')

    def __init__(self,line):
        '''
        Constructor of the class SynonymValue.

        @param line: a line of an OBOEntry composed of a tag named 'synonym' or
        'exact_synonym' or 'broad_synonym' or 'narrow_synonym' and a value.
        @type line: string

        @note: A SynonymValue is composed of a tag, a value, a scope, a list of types and
        dbxrefs.
        The constructor calls the TaggedValue constructor. A regular expression
        is used to extract the 'definition' from TaggedValue.value (the definition is the
        unquoted TaggedValue.value). The definition is saved as the new value of the class
        SynonymValue.
        A regular expression is used to extract 'attributes' from the rest of the
        string. Attributes may contain an optional synonym scope and an optional list
        of synonym types. The scope is extracted from the attributes or set by default to
        'RELATED'. It is saved as the scope of the class. The types are the rest of the
        attributes and are saved as the list of types of the class.
        For the deprecated tags 'exact_synonym', 'broad_synonym' and 'narrow_synonym', tag
        is set to 'synonym' and scope is set respectively to 'EXACT', 'BROAD' and 'NARROW'.
        A regular expression is used to extract 'dbxrefs' from the TaggedValue.value
        without the definition (dbxrefs are between brackets, and the definition can be so).
        dbxrefs are saved.
        '''

        ## use of the TaggedValue constructor
        TaggedValue.__init__(self, line)

        ## the definition, which is quoted, is extracted from the standard value of a TaggedValue.
        definition = TaggedValue._match_quotedString.search(self.value).group(0)

        ## the standard value is cleaned of the definition.
        cleanvalue = self.value.replace(definition,'')
        cleanvalue = cleanvalue.replace('  ',' ')

        ## 1) attributes are searched for in the rest of the standard value.
        ## 2) then they are stripped.
        ## 3) then they are split on every ' '.
        ## 4) finally they are gathered into a set.
        attributes = set(SynonymValue._match_scope.search(cleanvalue).group(0).strip().split())

        ## the scopes are the intersection between the attributes and a set of specific terms.
        scopes = attributes & set(['RELATED','EXACT','BROAD','NARROW'])

        ## the types are the rest of the attributes.
        types = attributes - scopes

        ## this is a constraint of the OBO format
        assert len(scopes)< 2,"Only one synonym scope allowed"

        ## the scope of the SynonymValue is taken from scopes or set by default to RELATED
        if scopes:
            scope = scopes.pop()
        else:
            scope = 'RELATED'

        ## Specific rules are defined for the following tags :
        if self.tag == 'exact_synonym':
            raise DeprecationWarning,'exact_synonym is a deprecated tag, use the synonym tag instead'
            self.tag = 'synonym'
            scope = 'EXACT'

        if self.tag == 'broad_synonym':
            raise DeprecationWarning,'broad_synonym is a deprecated tag, use the synonym tag instead'
            self.tag = 'synonym'
            scope = 'BROAD'

        if self.tag == 'narrow_synonym':
            raise DeprecationWarning,'narrow_synonym is a deprecated tag, use the synonym tag instead'
            self.tag = 'synonym'
            scope = 'NARROW'

        if self.tag == 'systematic_synonym':
            #raise DeprecationWarning,'systematic_synonym is a deprecated tag, use the synonym tag instead'
            self.tag = 'synonym'
            scope = 'SYSTEMATIC'

        ## this is our own constraint. Deprecated tags are not saved by this parser.
        assert self.tag =='synonym',"%s synonym type is not managed" % self.tag

        ## dbxrefs are searched for in the rest of the standard value.
        dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0)

        ## recording zone
        ## the value of a SynonymValue is not the standard value but the definition.
        self.value = definition
        self.dbxrefs = xrefFactory(dbxrefs)
        self.scope = scope
        self.types = list(types)

    def __eq__(self,b):
        return ((self.value==b.value) and (self.dbxrefs==b.dbxrefs)
                and (self.scope==b.scope) and (self.types==b.types)
                and (self.__doc__==b.__doc__) and (self.tag==b.tag)
                and (self.trailing_modifiers==b.trailing_modifiers))

    def __hash__(self):
        return (reduce(lambda x,y:x+y,(hash(z) for z in [self.__doc__,
                                                         self.value,
                                                         frozenset(self.dbxrefs),
                                                         self.scope,
                                                         frozenset(self.types),
                                                         self.tag,
                                                         self.trailing_modifiers]),0)) % (2**31)


class XrefValue(TaggedValue):
    '''
    A couple 'xref:value' inherited from the class TaggedValue. Used to manage
    xref tags.
    '''

    def __init__(self,line):

        ## use of the TaggedValue constructor
        TaggedValue.__init__(self, line)

        ## use the same function as for the dbxrefs
        self.value=xrefFactory(self.value)

        if self.tag in ('xref_analog','xref_unk'):
            raise DeprecationWarning,'%s is a deprecated tag, use the xref tag instead' % self.tag
            self.tag='xref'

        ## this is our own constraint. Deprecated tags are not saved by this parser.
        assert self.tag=='xref'


class RelationshipValue(TaggedValue):
    '''
    A couple 'relationship:value' inherited from the class TaggedValue. Used to manage
    relationship tags (relationship, union_of, intersection_of).
    '''

    def __init__(self,line):

        ## use of the TaggedValue constructor
        TaggedValue.__init__(self, line)

        ## the value is split on the first ' '.
        value = self.value.split(None,1)

        ## successful split !
        if len(value)==2:
            relationship=value[0]
            term=value[1]
        ## unsuccessful split. The relationship is set by default to 'is_a'
        else:
            relationship='is_a'
            term=value[0]

        ## recording zone
        self.value=term
        self.relationship=relationship


class NamespaceValue(TaggedValue):
    def __init__(self,line):
        TaggedValue.__init__(self, line)

class RemarkValue(TaggedValue):
    def __init__(self,line):
        TaggedValue.__init__(self, line)
        label,value = self.value.split(':',1)
        label = label.strip()
        value = value.strip()
        self.value=value
        self.label=label


def taggedValueFactory(line):
    '''
    A function used to dispatch lines of an OBOEntry between the class TaggedValue
    and its inherited classes.

    @param line: a line of an OBOEntry composed of a tag and a value.
    @type line: string

    @return: a class object
    '''

    if (line[0:9]=='namespace' or
        line[0:17]=='default-namespace'):
        return NamespaceValue(line)
    ## DefValue is an inherited class of TaggedValue
    elif line[0:3]=='def':
        return DefValue(line)
    ## SynonymValue is an inherited class of TaggedValue
    elif ((line[0:7]=="synonym" and line[0:14]!="synonymtypedef") or
          line[0:13]=="exact_synonym" or
          line[0:13]=="broad_synonym" or
          line[0:14]=="narrow_synonym"):
        return SynonymValue(line)
    ## XrefValue is an inherited class of TaggedValue
    elif line[0:4]=='xref':
        return XrefValue(line)
    ## NameValue is an inherited class of TaggedValue
    elif line[0:4]=='name':
        return NameValue(line)
    ## RelationshipValue is an inherited class of TaggedValue
    elif (line[0:15]=='intersection_of' or
          line[0:8] =='union_of' or
          line[0:12]=='relationship'):
        return RelationshipValue(line)
    elif (line[0:6]=='remark'):
        return RemarkValue(line)
    ## each line is a couple : tag / value (and some more features)
    else:
        return TaggedValue(line)
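# A dispatch sketch: taggedValueFactory routes each line to the proper class
# from its tag prefix (lines invented):
#
#   taggedValueFactory('name: mitochondrion')            # -> NameValue
#   taggedValueFactory('xref: Wikipedia:Mitochondrion')  # -> XrefValue
#   taggedValueFactory('created_by: jdoe')               # -> plain TaggedValue
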

#################################################################################
##                         Xref treatment area                                 ##
#################################################################################


class Xref(object):
    '''
    A xref object of an OBOEntry. It may be the 'dbxrefs' of SynonymValue and
    DefValue objects or the 'value' of XrefValue objects.
    '''

    __splitdata__ = re.compile(' +(?=["{])')

    def __init__(self,ref):
        if ref == '' :                            # JJ's fix: without this branch,
            ref = None                            # empty refs raise
            data = ''                             # 'list index out of range'
        else :
            data = Xref.__splitdata__.split(ref,1)
            ref = data[0]
        description=None
        trailing_modifiers = None
        if len(data)> 1:
            extra = data[1]
            description = TaggedValue._match_quotedString.search(extra)
            if description is not None:
                description = description.group(0)
                extra = extra.replace(description,'')   # fixed: the cleaned string was previously discarded
            trailing_modifiers=trailingModifierFactory(extra)
        self.reference=ref
        self.description=description
        self.trailing_modifiers=trailing_modifiers

    def __eq__(self,b):
        return ((self.reference==b.reference) and (self.description==b.description)
                and (self.trailing_modifiers==b.trailing_modifiers))

    def __hash__(self):
        return (reduce(lambda x,y:x+y,(hash(z) for z in [self.reference,
                                                         self.description,
                                                         self.trailing_modifiers]),0)) % (2**31)


def xrefFactory(string):
    '''
    Dispatcher of xrefs.

    @param string: a string (between brackets) from an inherited TaggedValue object with a
    dbxrefs signal (actually, the signal can only be found in SynonymValue and DefValue
    objects) or a string (without brackets) from a XrefValue object.
    @type string: string

    @return: a class object

    @note: The dispatcher treats the strings between brackets (from SynonymValue and
    DefValue objects) differently from those without brackets (from XrefValue objects).
    '''

    string = string.strip()
    if string[0]=='[':
        return [Xref(x.strip()) for x in string[1:-1].split(',')]
    else:
        return Xref(string)
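# A hedged sketch of both dispatch branches (identifiers invented):
#
#   xrefFactory('[PMID:11483584, ISBN:0198506732]')  # -> a list of two Xref
#   xrefFactory('EC:2.7.11.1')                       # -> a single Xref
#   # each Xref exposes .reference, .description and .trailing_modifiers
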

#################################################################################
##                        Stanza treatment area                                ##
#################################################################################


class OBOEntry(dict):
    '''
    An entry of an OBOFile. It can be a header (without a stanza name) or
    a stanza (with a stanza name between brackets). It inherits from the class dict.
    '''
    _match_stanza_name = re.compile('(?<=^\[)[^\]]*(?=\])')

    def __init__(self,stanza):
        ## tests whether this is the header of the OBO file (True) or not (False)
        self.isHeader = stanza[0]!='['
        lines = stanza.split('\n')
        ## not the header : there is a [stanzaName]
        if not self.isHeader:
            self.stanzaName = lines[0].strip()[1:-1]
            lines=lines[1:]
            self["stanza"] = [stanza.strip()]

        ## whatever the stanza is.
        for line in lines:
            ## each line is a couple : tag / value
            taggedvalue = taggedValueFactory(line)
            if taggedvalue.tag in self:
                self[taggedvalue.tag].append(taggedvalue)
            else:
                self[taggedvalue.tag]=[taggedvalue]


    def parseStanzaName(stanza):
        sm = OBOEntry._match_stanza_name.search(stanza)
        if sm:
            return sm.group(0)
        else:
            return None

    parseStanzaName=staticmethod(parseStanzaName)


class OBOTerm(OBOEntry):
    '''
    A stanza named 'Term'. It inherits from the class OBOEntry.
    '''
    def __init__(self,stanza):

        ## use of the OBOEntry constructor.
        OBOEntry.__init__(self, stanza)

        assert self.stanzaName=='Term'
        assert 'stanza' in self
        assert 'id' in self and len(self['id'])==1,"An OBOTerm must have an id"
        assert 'name' in self and len(self['name'])==1,"An OBOTerm must have a name"
        assert 'namespace' not in self or len(self['namespace'])==1, "Only one namespace is allowed for an OBO term"

        assert 'def' not in self or len(self['def'])==1,"Only one definition is allowed for an OBO term"
        assert 'comment' not in self or len(self['comment'])==1,"Only one comment is allowed for an OBO term"

        assert 'union_of' not in self or len(self['union_of'])>=2,"A union_of relationship requires at least two terms"
        assert 'intersection_of' not in self or len(self['intersection_of'])>=2,"An intersection_of relationship requires at least two terms"

        if self._isObsolete():
            #assert 'is_a' not in self
            assert 'relationship' not in self
            assert 'inverse_of' not in self
            assert 'disjoint_from' not in self
            assert 'union_of' not in self
            assert 'intersection_of' not in self

        assert 'replaced_by' not in self or self._isObsolete()
        assert 'consider' not in self or self._isObsolete()

    def _getStanza(self):
        return self['stanza'][0]

    ## accessor functions.
    def _getDefinition(self):
        if 'def' in self:
            return self['def'][0]
        return None

    def _getId(self):
        return self['id'][0]

    def _getNamespace(self):
        return self['namespace'][0]

    def _getName(self):
        return self['name'][0]

    def _getComment(self):
        if 'comment' in self:
            return self['comment'][0]
        return None

    def _getAltIds(self):
        if 'alt_id' in self:
            return list(set(self.get('alt_id',None)))
        return None

    def _getIsA(self):
        if 'is_a' in self:
            return list(set(self.get('is_a',None)))
        return None

    def _getSynonym(self):
        if 'synonym' in self :
            return list(set(self.get('synonym',None)))
        return None

    def _getSubset(self):
        if self.get('subset',None) != None:
            return list(set(self.get('subset',None)))
        else:
            return None

    def _getXref(self):
        if 'xref' in self:
            return list(set(self.get('xref',None)))
        return None

    def _getRelationShip(self):
        if 'relationship' in self:
            return list(set(self.get('relationship',None)))
        return None

    def _getUnion(self):
        if 'union_of' in self:   # guard added: set(None) would raise TypeError
            return list(set(self.get('union_of',None)))
        return None

    def _getIntersection(self):
        if 'intersection_of' in self:   # guard added: set(None) would raise TypeError
            return list(set(self.get('intersection_of',None)))
        return None

    def _getDisjonction(self):
        if 'disjoint_from' in self:   # guard added: set(None) would raise TypeError
            return list(set(self.get('disjoint_from',None)))
        return None

    def _isObsolete(self):
        return 'is_obsolete' in self and str(self['is_obsolete'][0])=='true'

    def _getReplacedBy(self):
        if 'replaced_by' in self:
            return list(set(self.get('replaced_by',None)))
        return None

    def _getConsider(self):
        if 'consider' in self:
            return list(set(self.get('consider',None)))
        return None

    ## automatically generated properties !
    stanza          = property(_getStanza,None,None)
    definition      = property(_getDefinition,None,None)
    id              = property(_getId,None,None)
    namespace       = property(_getNamespace,None,None)
    name            = property(_getName,None,None)
    comment         = property(_getComment,None,None)
    alt_ids         = property(_getAltIds,None,None)
    is_a            = property(_getIsA,None,None)
    synonyms        = property(_getSynonym,None,None)
    subsets         = property(_getSubset,None,None)
    xrefs           = property(_getXref,None,None)
    relationship    = property(_getRelationShip,None,None)
    union_of        = property(_getUnion,None,None)
    intersection_of = property(_getIntersection,None,None)
    disjoint_from   = property(_getDisjonction,None,None)
    is_obsolete     = property(_isObsolete,None,None)
    replaced_by     = property(_getReplacedBy,None,None)
    consider        = property(_getConsider,None,None)


def OBOEntryFactory(stanza):
    '''
    Dispatcher of stanza.

    @param stanza: a stanza composed of several lines.
    @type stanza: text

    @return: an C{OBOTerm} | C{OBOEntry} instance

    @note: The dispatcher treats the stanzas which are OBO "Term"
           differently from the others.
    '''

    stanzaType = OBOEntry.parseStanzaName(stanza)

    if stanzaType=="Term":
        return OBOTerm(stanza)
    else:
        return OBOEntry(stanza)

def OBOEntryIterator(file):
    entries = stanzaIterator(file)
    for e in entries:
        debug(e)
        yield OBOEntryFactory(e)
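# A hedged end-to-end sketch ('gene_ontology.obo' is an assumed file name):
#
#   for entry in OBOEntryIterator('gene_ontology.obo'):
#       if isinstance(entry, OBOTerm) and not entry.is_obsolete:
#           print entry.id, entry.name
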
@@ -1,137 +0,0 @@
"""
Module providing high level functions to manage command line options.
"""
import logging
import sys

from logging import debug

from optparse import OptionParser

from obitools.utils import universalOpen
from obitools.utils import fileSize
from obitools.utils import universalTell
from obitools.utils import progressBar
from obitools.format.options import addInputFormatOption, addInOutputOption,\
                                    autoEntriesIterator
import time



def getOptionManager(optionDefinitions,entryIterator=None,progdoc=None):
    '''
    Build an option manager function that is able to parse
    the command line options of the script.

    @param optionDefinitions: list of functions, each describing a set of
                              options. Each function must accept as its
                              unique parameter an instance of OptionParser.
    @type optionDefinitions: list of functions.

    @param entryIterator: an iterator generator function returning
                          entries from the data files.

    @type entryIterator: an iterator generator function with only one
                         parameter of type file
    '''
    parser = OptionParser(progdoc)
    parser.add_option('--DEBUG',
                      action="store_true", dest="debug",
                      default=False,
                      help="Set logging in debug mode")

    parser.add_option('--no-psyco',
                      action="store_true", dest="noPsyco",
                      default=False,
                      help="Don't use psyco even if it is installed")

    parser.add_option('--without-progress-bar',
                      action="store_false", dest="progressbar",
                      default=True,
                      help="deactivate the progress bar")

    checkFormat=False
    for f in optionDefinitions:
        if f == addInputFormatOption or f == addInOutputOption:
            checkFormat=True
        f(parser)

    def commandLineAnalyzer():
        options,files = parser.parse_args()
        if options.debug:
            logging.root.setLevel(logging.DEBUG)

        if checkFormat:
            ei=autoEntriesIterator(options)
        else:
            ei=entryIterator

        i = allEntryIterator(files,ei,with_progress=options.progressbar)
        return options,i

    return commandLineAnalyzer
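# A hedged sketch of the standard script skeleton built on this manager
# (addMetabarcodingOption is the option definition shown earlier in this diff):
#
#   if __name__ == '__main__':
#       optionParser = getOptionManager([addMetabarcodingOption])
#       options, entries = optionParser()
#       for entry in entries:
#           pass   # process each parsed entry
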

_currentInputFileName=None
_currentFile = None
_currentFileSize = None

def currentInputFileName():
    return _currentInputFileName

def currentInputFile():
    return _currentFile

def currentFileSize():
    return _currentFileSize

def currentFileTell():
    return universalTell(_currentFile)

def fileWithProgressBar(file,step=100):
    try:
        size = currentFileSize()
    except:
        size = None

    def fileBar():
        pos=1
        progressBar(pos, size, True,currentInputFileName())
        for l in file:
            progressBar(currentFileTell,size,head=currentInputFileName())
            yield l
        print >>sys.stderr,''
    if size is None:
        return file
    else:
        f = fileBar()
        return f


def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102):
    global _currentFile
    global _currentInputFileName
    global _currentFileSize
    if files :
        for f in files:
            _currentInputFileName=f
            f = universalOpen(f)
            _currentFile=f
            _currentFileSize=fileSize(_currentFile)
            debug(f)
            if with_progress:
                f=fileWithProgressBar(f,step=histo_step)
            if entryIterator is None:
                for line in f:
                    yield line
            else:
                for entry in entryIterator(f):
                    yield entry
    else:
        if entryIterator is None:
            for line in sys.stdin:
                yield line
        else:
            for entry in entryIterator(sys.stdin):
                yield entry