git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@390 60f365c0-8329-0410-b2a4-ec073aeeaa1d
1054 obitools/SVGdraw.py Normal file
File diff suppressed because it is too large
711 obitools/__init__.py Normal file
@@ -0,0 +1,711 @@
'''
**obitools** main module
------------------------

.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>

The obitools module provides base classes for sequence manipulation.

All biological sequences must be subclasses of :py:class:`obitools.BioSequence`.
Some biological sequences are defined as transformations of other
biological sequences. For example, reverse complemented sequences
are a transformation of a :py:class:`obitools.NucSequence`. This particular
type of sequence is a subclass of :py:class:`obitools.WrappedBioSequence`.

.. inheritance-diagram:: BioSequence NucSequence AASequence WrappedBioSequence SubSequence DNAComplementSequence
    :parts: 1

'''

from weakref import ref

from obitools.utils.iterator import uniqueChain
from itertools import chain
import re

_default_raw_parser = " %s *= *([^;]*);"

try:
    from functools import partial
except ImportError:
    #
    # Added for compatibility with Python < 2.5
    #
    def partial(func, *args, **keywords):
        def newfunc(*fargs, **fkeywords):
            newkeywords = keywords.copy()
            newkeywords.update(fkeywords)
            return func(*(args + fargs), **newkeywords)
        newfunc.func = func
        newfunc.args = args
        newfunc.keywords = keywords
        return newfunc


from obitools.sequenceencoder import DNAComplementEncoder
from obitools.location import Location


class WrapperSetIterator(object):
    def __init__(self,s):
        self._i = set.__iter__(s)
    def next(self):
        return self._i.next()()
    def __iter__(self):
        return self


class WrapperSet(set):
    def __iter__(self):
        return WrapperSetIterator(self)


class BioSequence(object):
    '''
    BioSequence is the base class for biological sequence representation.

    It provides storage of:

    - the sequence itself,
    - an identifier,
    - a definition,

    and manages a set of complementary information on a key / value basis.

    .. warning::

        :py:class:`obitools.BioSequence` is an abstract class; this constructor
        can only be called by a subclass constructor.
    '''

    def __init__(self,id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
        '''
        :param id: sequence identifier
        :type id: `str`

        :param seq: the sequence
        :type seq: `str`

        :param definition: sequence definition (optional)
        :type definition: `str`

        :param rawinfo: a text containing a set of key=value; patterns
        :type rawinfo: `str`

        :param rawparser: a regular expression template used to parse rawinfo
        :type rawparser: `str`

        :param info: extra named parameters can be added to associate
                     complementary data to the sequence
        '''

        assert type(self)!=BioSequence,"obitools.BioSequence is an abstract class"

        self._seq=str(seq).lower()
        self._info = dict(info)
        if rawinfo is not None:
            self._rawinfo=' ' + rawinfo
        else:
            self._rawinfo=None
        self._rawparser=rawparser
        self.definition=definition
        self.id=id
        self._hasTaxid=None

    def get_seq(self):
        return self.__seq

    def set_seq(self, value):
        if not isinstance(value, str):
            value=str(value)
        self.__seq = value
        self.__len = len(value)

    def clone(self):
        seq = type(self)(self.id,
                         str(self),
                         definition=self.definition
                         )
        seq._info=dict(self.getTags())
        seq._rawinfo=self._rawinfo
        seq._rawparser=self._rawparser
        seq._hasTaxid=self._hasTaxid
        return seq

    def getDefinition(self):
        '''
        Sequence definition getter.

        :return: the sequence definition
        :rtype: str
        '''
        return self._definition

    def setDefinition(self, value):
        '''
        Sequence definition setter.

        :param value: the new sequence definition
        :type value: C{str}
        :return: C{None}
        '''
        self._definition = value

    def getId(self):
        '''
        Sequence identifier getter.

        :return: the sequence identifier
        :rtype: C{str}
        '''
        return self._id

    def setId(self, value):
        '''
        Sequence identifier setter.

        :param value: the new sequence identifier
        :type value: C{str}
        :return: C{None}
        '''
        self._id = value

    def getStr(self):
        '''
        Return the sequence as a string.

        :return: the string representation of the sequence
        :rtype: str
        '''
        return self._seq

    def getSymbolAt(self,position):
        '''
        Return the symbol at C{position} in the sequence.

        :param position: the desired position. Positions start from 0;
                         if position is < 0, it is considered to
                         reference the end of the sequence.
        :type position: `int`

        :return: a one letter string
        :rtype: `str`
        '''
        return str(self)[position]

    def getSubSeq(self,location):
        '''
        Return a subsequence as described by C{location}.

        The C{location} parameter can be a L{obitools.location.Location} instance,
        an integer or a python C{slice} instance. If C{location}
        is an integer this method is equivalent to L{getSymbolAt}.

        :param location: the positions of the subsequence to return
        :type location: C{Location} or C{int} or C{slice}
        :return: the subsequence
        :rtype: a single character as a C{str} if C{location} is an integer,
                a L{obitools.SubSequence} instance otherwise.
        '''
        if isinstance(location,Location):
            return location.extractSequence(self)
        elif isinstance(location, int):
            return self.getSymbolAt(location)
        elif isinstance(location, slice):
            return SubSequence(self,location)

        raise TypeError,'key must be a Location, an integer or a slice'

    def getKey(self,key):
        if key not in self._info:
            if self._rawinfo is None:
                if key=='count':
                    return 1
                else:
                    raise KeyError,key
            p = re.compile(self._rawparser % key)
            m = p.search(self._rawinfo)
            if m is not None:
                v=m.group(1)
                self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
                try:
                    v = eval(v)
                except:
                    pass
                self._info[key]=v
            else:
                if key=='count':
                    v=1
                else:
                    raise KeyError,key
        else:
            v=self._info[key]
        return v

    def extractTaxon(self):
        '''
        Extract taxonomy information from the sequence header.
        By default this method returns None. It should be subclassed
        if necessary, as in L{obitools.seqdb.AnnotatedSequence}.

        :return: None
        '''
        self._hasTaxid=self.hasKey('taxid')
        return None

    def __str__(self):
        return self.getStr()

    def __getitem__(self,key):
        if isinstance(key, str):
            if key=='taxid' and self._hasTaxid is None:
                self.extractTaxon()
            return self.getKey(key)
        else:
            return self.getSubSeq(key)

    def __setitem__(self,key,value):
        self._info[key]=value
        if key=='taxid':
            self._hasTaxid=value is not None

    def __delitem__(self,key):
        if isinstance(key, str):
            if key in self:
                del self._info[key]
            else:
                raise KeyError,key

            if key=='taxid':
                self._hasTaxid=False
        else:
            raise TypeError,key

    def __iter__(self):
        '''
        Iterate through the sequence symbols.
        '''
        return iter(str(self))

    def __len__(self):
        return self.__len

    def hasKey(self,key):
        rep = key in self._info

        if not rep and self._rawinfo is not None:
            p = re.compile(self._rawparser % key)
            m = p.search(self._rawinfo)
            if m is not None:
                v=m.group(1)
                self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
                try:
                    v = eval(v)
                except:
                    pass
                self._info[key]=v
                rep=True

        return rep

    def __contains__(self,key):
        '''
        Method allowing use of the C{in} operator on a C{BioSequence}.

        The C{in} operator tests whether the C{key} value is defined for this
        sequence.

        :param key: the name of the checked value
        :type key: str
        :return: C{True} if the value is defined, C{False} otherwise.
        :rtype: C{bool}
        '''
        if key=='taxid' and self._hasTaxid is None:
            self.extractTaxon()
        return self.hasKey(key)

    def rawiteritems(self):
        return self._info.iteritems()

    def iteritems(self):
        '''
        Iterate over the items of the dictionary storing the values
        associated to the sequence. It works similarly to
        the iteritems method of C{dict}.

        :return: an iterator over the (key,value) items
                 linked to a sequence
        :rtype: iterator over tuple
        :see: L{items}
        '''
        if self._rawinfo is not None:
            p = re.compile(self._rawparser % "([a-zA-Z]\w*)")
            for k,v in p.findall(self._rawinfo):
                try:
                    self._info[k]=eval(v)
                except:
                    self._info[k]=v
            self._rawinfo=None
        return self._info.iteritems()

    def items(self):
        return [x for x in self.iteritems()]

    def iterkeys(self):
        return (k for k,v in self.iteritems())

    def keys(self):
        return [x for x in self.iterkeys()]

    def getTags(self):
        self.iteritems()
        return self._info

    def getRoot(self):
        return self

    def getWrappers(self):
        if not hasattr(self, '_wrappers'):
            self._wrappers=WrapperSet()
        return self._wrappers

    def register(self,wrapper):
        self.wrappers.add(ref(wrapper,self._unregister))

    def _unregister(self,ref):
        self.wrappers.remove(ref)

    wrappers = property(getWrappers,None,None,'')

    definition = property(getDefinition, setDefinition, None, "Sequence definition")

    id = property(getId, setId, None, 'Sequence identifier')

    def _getTaxid(self):
        return self['taxid']

    def _setTaxid(self,taxid):
        self['taxid']=taxid

    taxid = property(_getTaxid,_setTaxid,None,'NCBI Taxonomy identifier')
    _seq = property(get_seq, set_seq, None, None)


class NucSequence(BioSequence):
    """
    :py:class:`NucSequence` specializes the :py:class:`BioSequence` class for storing DNA
    sequences.

    The constructor is identical to the :py:class:`BioSequence` constructor.
    """

    def complement(self):
        """
        :return: the reverse complemented sequence
        :rtype: :py:class:`DNAComplementSequence`
        """
        return DNAComplementSequence(self)

    def isNucleotide(self):
        return True


class AASequence(BioSequence):
    """
    :py:class:`AASequence` specializes the :py:class:`BioSequence` class for storing protein
    sequences.

    The constructor is identical to the :py:class:`BioSequence` constructor.
    """

    def isNucleotide(self):
        return False


class WrappedBioSequence(BioSequence):
    """
    .. warning::

        :py:class:`obitools.WrappedBioSequence` is an abstract class; this constructor
        can only be called by a subclass constructor.
    """

    def __init__(self,reference,id=None,definition=None,**info):

        assert type(self)!=WrappedBioSequence,"obitools.WrappedBioSequence is an abstract class"

        self._wrapped = reference
        reference.register(self)
        self._id=id
        self.definition=definition
        self._info=info

    def clone(self):
        seq = type(self)(self.wrapped,
                         id=self._id,
                         definition=self._definition
                         )
        seq._info=dict(self._info)

        return seq

    def getWrapped(self):
        return self._wrapped

    def getDefinition(self):
        d = self._definition or self.wrapped.definition
        return d

    def getId(self):
        d = self._id or self.wrapped.id
        return d

    def isNucleotide(self):
        return self.wrapped.isNucleotide()

    def iterkeys(self):
        return uniqueChain(self._info.iterkeys(),
                           self.wrapped.iterkeys())

    def rawiteritems(self):
        return chain(self._info.iteritems(),
                     (x for x in self.wrapped.rawiteritems()
                      if x[0] not in self._info))

    def iteritems(self):
        for x in self.iterkeys():
            yield (x,self[x])

    def getKey(self,key):
        if key in self._info:
            return self._info[key]
        else:
            return self.wrapped.getKey(key)

    def hasKey(self,key):
        return key in self._info or self.wrapped.hasKey(key)

    def getSymbolAt(self,position):
        return self.wrapped.getSymbolAt(self.posInWrapped(position))

    def posInWrapped(self,position,reference=None):
        if reference is None or reference is self.wrapped:
            return self._posInWrapped(position)
        else:
            return self.wrapped.posInWrapped(self._posInWrapped(position),reference)

    def getStr(self):
        return str(self.wrapped)

    def getRoot(self):
        return self.wrapped.getRoot()

    def complement(self):
        """
        The :py:meth:`complement` method of the :py:class:`WrappedBioSequence` class
        raises an :py:exc:`AttributeError` exception if it is called and the wrapped
        sequence does not correspond to a nucleic acid sequence.
        """
        if self.wrapped.isNucleotide():
            return DNAComplementSequence(self)
        raise AttributeError

    def _posInWrapped(self,position):
        return position

    definition = property(getDefinition,BioSequence.setDefinition, None)
    id = property(getId,BioSequence.setId, None)

    wrapped = property(getWrapped, None, None, "A pointer to the wrapped sequence")

    def _getWrappedRawInfo(self):
        return self.wrapped._rawinfo

    _rawinfo = property(_getWrappedRawInfo)


class SubSequence(WrappedBioSequence):
    """
    A view on a slice of another :py:class:`BioSequence`.
    """

    @staticmethod
    def _sign(x):
        if x == 0:
            return 0
        elif x < 0:
            return -1
        return 1

    def __init__(self,reference,
                 location=None,
                 start=None,stop=None,
                 id=None,definition=None,
                 **info):
        # forward id and definition to the wrapped constructor
        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)

        if isinstance(location, slice):
            self._location = location
        else:
            step = 1
            if not isinstance(start, int):
                start = 0
            if not isinstance(stop,int):
                stop = len(reference)
            self._location=slice(start,stop,step)

        self._indices=self._location.indices(len(self.wrapped))
        self._xrange=xrange(*self._indices)

        self._info['cut']='[%d,%d,%s]' % self._indices

        if hasattr(reference,'quality'):
            self.quality = reference.quality[self._location]

    def getId(self):
        d = self._id or ("%s_SUB" % self.wrapped.id)
        return d

    def clone(self):
        seq = WrappedBioSequence.clone(self)
        seq._location=self._location
        seq._indices=seq._location.indices(len(seq.wrapped))
        seq._xrange=xrange(*seq._indices)
        return seq

    def __len__(self):
        return len(self._xrange)

    def getStr(self):
        return ''.join([x for x in self])

    def __iter__(self):
        return (self.wrapped.getSymbolAt(x) for x in self._xrange)

    def _posInWrapped(self,position):
        return self._xrange[position]

    id = property(getId,BioSequence.setId, None)


class DNAComplementSequence(WrappedBioSequence):
    """
    Class used to represent a reverse complemented DNA sequence. Instances
    of this class are usually produced by the :py:meth:`NucSequence.complement` method.
    """

    _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
           'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
           's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
           'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
           '-': '-'}

    def __init__(self,reference,
                 id=None,definition=None,**info):
        # forward id and definition to the wrapped constructor
        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
        assert reference.isNucleotide()
        self._info['complemented']=True
        if hasattr(reference,'quality'):
            self.quality = reference.quality[::-1]

    def getId(self):
        d = self._id or ("%s_CMP" % self.wrapped.id)
        return d

    def __len__(self):
        return len(self._wrapped)

    def getStr(self):
        return ''.join([x for x in self])

    def __iter__(self):
        return (self.getSymbolAt(x) for x in xrange(len(self)))

    def _posInWrapped(self,position):
        return -(position+1)

    def getSymbolAt(self,position):
        return DNAComplementSequence._comp[self.wrapped.getSymbolAt(self.posInWrapped(position))]

    def complement(self):
        """
        The :py:meth:`complement` method of the :py:class:`DNAComplementSequence` class
        returns the wrapped sequence itself: the reverse complement of a reverse
        complemented sequence is the initial sequence.
        """
        return self.wrapped

    id = property(getId,BioSequence.setId, None)


def _isNucSeq(text):
    acgt = 0
    notnuc = 0
    ltot = len(text) * 0.8
    for c in text.lower():
        if c in 'acgt-':
            acgt+=1
        if c not in DNAComplementEncoder._comp:
            notnuc+=1
    return notnuc==0 and float(acgt) > ltot


def bioSeqGenerator(id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
    """
    Automatically select the appropriate class for a new sequence instance,
    either:

    - :py:class:`NucSequence`
    - :py:class:`AASequence`

    The sequence is instantiated as a :py:class:`NucSequence` if the
    `seq` attribute contains more than 80% of *A*, *C*, *G*, *T* or *-* symbols
    (in upper or lower case) and no symbol outside the IUPAC nucleotide alphabet.
    Otherwise, it is instantiated as an :py:class:`AASequence`.

    :param id: sequence identifier
    :type id: `str`

    :param seq: the sequence
    :type seq: `str`

    :param definition: sequence definition (optional)
    :type definition: `str`

    :param rawinfo: a text containing a set of key=value; patterns
    :type rawinfo: `str`

    :param rawparser: a regular expression template used to parse rawinfo
    :type rawparser: `str`

    :param info: extra named parameters can be added to associate
                 complementary data to the sequence
    """
    if _isNucSeq(seq):
        return NucSequence(id,seq,definition,rawinfo,rawparser,**info)
    else:
        return AASequence(id,seq,definition,rawinfo,rawparser,**info)
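To make the class relationships above concrete, a minimal usage sketch (illustrative only: it assumes a Python 2 environment where the obitools package and its compiled modules are importable; the sequence data is made up):

from obitools import bioSeqGenerator

# bioSeqGenerator picks NucSequence here: the string is more than 80% acgt
seq = bioSeqGenerator('seq1', 'ACGTACGTTGCA', definition='demo fragment',
                      rawinfo=' taxid=9606; count=3;')
print type(seq).__name__           # NucSequence
print seq['count']                 # 3, parsed lazily from rawinfo

sub = seq[2:8]                     # SubSequence view, id seq1_SUB
rc = seq.complement()              # DNAComplementSequence view, id seq1_CMP
print str(sub), str(rc)            # gtacgt tgcaacgtacgt
print rc.complement() is seq       # True: complementing twice returns the original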
BIN obitools/__init__.pyc Normal file (binary file not shown)
13 obitools/align/__init__.py Normal file
@@ -0,0 +1,13 @@
from _nws import NWS
from _upperbond import indexSequences
from _lcs import LCS,lenlcs
from _assemble import DirectAssemble, ReverseAssemble
from _qsassemble import QSolexaDirectAssemble,QSolexaReverseAssemble
from _rassemble import RightDirectAssemble as RightReverseAssemble
from _qsrassemble import QSolexaRightDirectAssemble,QSolexaRightReverseAssemble
from _freeendgap import FreeEndGap
from _freeendgapfm import FreeEndGapFullMatch
from _upperbond import isLCSReachable
BIN obitools/align/_assemble.so Executable file (binary file not shown)
BIN obitools/align/_dynamic.so Executable file (binary file not shown)
BIN obitools/align/_freeendgap.so Executable file (binary file not shown)
BIN obitools/align/_freeendgapfm.so Executable file (binary file not shown)
BIN obitools/align/_lcs.so Executable file (binary file not shown)
BIN obitools/align/_nws.so Executable file (binary file not shown)
BIN obitools/align/_profilenws.so Executable file (binary file not shown)
BIN obitools/align/_qsassemble.so Executable file (binary file not shown)
BIN obitools/align/_qsrassemble.so Executable file (binary file not shown)
BIN obitools/align/_rassemble.so Executable file (binary file not shown)
BIN obitools/align/_upperbond.so Executable file (binary file not shown)
56 obitools/align/homopolymere.py Normal file
@@ -0,0 +1,56 @@
'''
Created on 14 May 2009

@author: coissac
'''

from obitools import WrappedBioSequence


class HomoNucBioSeq(WrappedBioSequence):
    '''
    Wraps a nucleic sequence and exposes it with each homopolymer run
    compressed to a single symbol. The length of each run is stored
    in the 'homopolymer' tag.
    '''

    def __init__(self,reference,id=None,definition=None,**info):
        '''
        Constructor
        '''
        assert reference.isNucleotide(),"reference must be a nucleic sequence"
        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
        self.__cleanHomopolymer()

    def __cleanHomopolymer(self):
        s = []
        counts = []
        old=None
        nc=0
        for n in self._wrapped:
            if old is not None and n!=old:
                s.append(old)
                counts.append(nc)
                nc=0
            old=n
            nc+=1
        if old is not None:
            # flush the last homopolymer run
            s.append(old)
            counts.append(nc)
        self._cached=''.join(s)
        self['homopolymer']=counts
        # exclusive prefix sums: start index of each run in the wrapped sequence
        self._cumulative=[]
        total=0
        for c in counts:
            self._cumulative.append(total)
            total+=c

    def __len__(self):
        return len(self._cached)

    def getStr(self):
        return self._cached

    def __iter__(self):
        return iter(self._cached)

    def _posInWrapped(self,position):
        return self._cumulative[position]
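A short illustration of the homopolymer compression implemented above (hypothetical data, assuming the obitools package is importable):

from obitools import NucSequence
from obitools.align.homopolymere import HomoNucBioSeq

raw = NucSequence('s1', 'aaacggtt')
h = HomoNucBioSeq(raw)
print str(h)               # acgt
print h['homopolymer']     # [3, 1, 2, 2]
print h.getSymbolAt(2)     # g, read back through the wrapped sequence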
46 obitools/align/ssearch.py Normal file
@@ -0,0 +1,46 @@
import os
import re

from obitools.fasta import formatFasta


class SsearchParser(object):

    _matchQuery = re.compile(r"^Query:.+\n.+?>+([^ ]+)", re.MULTILINE)
    _matchLQuery = re.compile(r"^Query:.+\n.+?(\d+)(?= nt\n)", re.MULTILINE)
    _matchProp = re.compile(r"^The best scores are:.*\n(.+?)>>>", re.DOTALL | re.MULTILINE)

    def __init__(self,file):
        if isinstance(file,str):
            file = open(file,'rU')
        self.data = file.read()
        self.query= SsearchParser._matchQuery.search(self.data).group(1)
        self.queryLength= int(SsearchParser._matchLQuery.search(self.data).group(1))
        props = SsearchParser._matchProp.search(self.data)
        if props:
            props=props.group(0).split('\n')[1:-2]
            self.props=[]
            for line in props:
                subject,tab = line.split('\t')
                tab=tab.split()
                ssp = subject.split()
                ac = ssp[0]
                dbl= int(ssp[-5][:-1])
                ident = float(tab[0])
                matchlen = abs(int(tab[5]) - int(tab[4])) +1
                self.props.append({"ac"           : ac,
                                   "identity"     : ident,
                                   "subjectlength": dbl,
                                   "matchlength"  : matchlen})


def run(seq,database,program='fasta35',opts=''):
    ssearchin,ssearchout,ssearcherr = os.popen3("%s %s %s" % (program,opts,database))
    print >>ssearchin,formatFasta(seq)
    ssearchin.close()
    result = SsearchParser(ssearchout)

    return seq,result


def ssearchIterator(sequenceIterator,database,program='ssearch35',opts=''):
    for seq in sequenceIterator:
        yield run(seq,database,program,opts)
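A hedged usage sketch for the parser above (it assumes a FASTA/SSEARCH binary such as ssearch35 is on the PATH and that refdb.fasta is a formatted database; both names are illustrative):

from obitools import NucSequence
from obitools.align.ssearch import ssearchIterator

queries = [NucSequence('q1', 'acgtacgtacgtacgtacgt')]
for seq, result in ssearchIterator(queries, 'refdb.fasta'):
    # result.props is only set when the report contains a best-scores section
    for hit in getattr(result, 'props', []):
        print seq.id, hit['ac'], hit['identity'], hit['matchlength']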
175 obitools/alignment/__init__.py Normal file
@@ -0,0 +1,175 @@
from obitools import BioSequence
from obitools import WrappedBioSequence
from copy import deepcopy


class GappedPositionException(Exception):
    pass


class AlignedSequence(WrappedBioSequence):

    def __init__(self,reference,
                 id=None,definition=None,**info):
        # forward id and definition to the wrapped constructor
        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
        self._length=len(reference)
        self._gaps=[[self._length,0]]

    def clone(self):
        seq = WrappedBioSequence.clone(self)
        seq._gaps=deepcopy(self._gaps)
        seq._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0)
        return seq

    def setGaps(self, value):
        '''
        Set the gap vector of an AlignedSequence.

        The gap vector describes the gap positions on a sequence.
        It is a list of couples. The first member of each couple is a
        count of sequence letters, the second one is the length of the
        gap that follows them.

        @param value: a list of length 2 lists describing gap positions
        @type value: list of couples
        '''
        assert isinstance(value, list),'Gap vector must be a list'
        assert reduce(lambda x,y: x and y,
                      (isinstance(z, list) and len(z)==2 for z in value),
                      True),"Value must be a list of length 2 lists"

        lseq = reduce(lambda x,y:x+y, (z[0] for z in value),0)
        assert lseq==len(self.wrapped),"Gap vector incompatible with the sequence"
        self._gaps = value
        self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in value),0)

    def getGaps(self):
        return tuple(self._gaps)
    gaps = property(getGaps, setGaps, None, "Gap vector of the aligned sequence")

    def _getIndice(self,pos):
        i=0
        cpos=0
        for s,g in self._gaps:
            cpos+=s
            if cpos>pos:
                return i,pos-cpos+s
            cpos+=g
            if cpos>pos:
                return i,-pos+cpos-g-1
            i+=1
        raise IndexError

    def getId(self):
        d = self._id or ("%s_ALN" % self.wrapped.id)
        return d

    def __len__(self):
        return self._length

    def getStr(self):
        return ''.join([x for x in self])

    def __iter__(self):
        def isymb():
            cpos=0
            for s,g in self._gaps:
                for x in xrange(s):
                    yield self.wrapped[cpos+x]
                for x in xrange(g):
                    yield '-'
                cpos+=s
        return isymb()

    def _posInWrapped(self,position):
        i,s=self._getIndice(position)
        if s<0:
            raise GappedPositionException
        value=self._gaps
        p=reduce(lambda x,y:x+y, (z[0] for z in value[:i]),0)+s
        return p

    def getSymbolAt(self,position):
        try:
            return self.wrapped.getSymbolAt(self.posInWrapped(position))
        except GappedPositionException:
            return '-'

    def insertGap(self,position,count=1):
        if position==self._length:
            idx=len(self._gaps)-1
            p=-1
        else:
            idx,p = self._getIndice(position)

        if p >= 0:
            self._gaps.insert(idx, [p,count])
            self._gaps[idx+1][0]-=p
        else:
            self._gaps[idx][1]+=count
        self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0)

    id = property(getId,BioSequence.setId, None, "Sequence identifier")


class Alignment(list):

    def _assertData(self,data):
        assert isinstance(data, BioSequence),'You can only add BioSequence instances to an alignment'
        if hasattr(self, '_alignlen'):
            assert self._alignlen==len(data),'All aligned sequences must have the same length'
        else:
            self._alignlen=len(data)
        return data

    def clone(self):
        ali = Alignment(x.clone() for x in self)
        return ali

    def append(self,data):
        data = self._assertData(data)
        list.append(self,data)

    def __setitem__(self,index,data):
        data = self._assertData(data)
        list.__setitem__(self,index,data)

    def getSite(self,key):
        if isinstance(key,int):
            return [x[key] for x in self]

    def insertGap(self,position,count=1):
        for s in self:
            s.insertGap(position,count)

    def isFullGapSite(self,key):
        return reduce(lambda x,y: x and y,(z=='-' for z in self.getSite(key)),True)

    def isGappedSite(self,key):
        return '-' in self.getSite(key)

    def __str__(self):
        l = len(self[0])
        rep=""
        idmax = max(len(x.id) for x in self)+2
        template= "%%-%ds %%-60s" % idmax
        for p in xrange(0,l,60):
            for s in self:
                rep+= (template % (s.id,s[p:p+60])).strip() + '\n'
            rep+="\n"
        return rep


def alignmentReader(file,sequenceIterator):
    seqs = sequenceIterator(file)
    alignment = Alignment()
    for seq in seqs:
        alignment.append(seq)
    return alignment


def columnIterator(alignment):
    lali = len(alignment[0])
    for p in xrange(lali):
        c = [x[p] for x in alignment]
        yield c
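A small worked example of the gap vector convention used by setGaps (illustrative values):

from obitools import NucSequence
from obitools.alignment import AlignedSequence

ali = AlignedSequence(NucSequence('s1', 'acgtgt'))
# 2 letters, a gap of 1, then 4 letters with no trailing gap: ac-gtgt
ali.gaps = [[2, 1], [4, 0]]
print str(ali)             # ac-gtgt
print len(ali)             # 7
print ali.getSymbolAt(2)   # -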
47 obitools/alignment/ace.py Normal file
@@ -0,0 +1,47 @@
from obitools.format.genericparser import GenericParser
from obitools.utils import universalOpen
from obitools.fasta import parseFastaDescription
from obitools import NucSequence

import sys

_contigIterator=GenericParser('^CO ')

_contigIterator.addParseAction('AF', '\nAF +(\S+) +([UC]) +(-?[0-9]+)')
_contigIterator.addParseAction('RD', '\nRD +(\S+) +([0-9]+) +([0-9]+) +([0-9]+) *\n([A-Za-z\n*]+?)\n\n')
_contigIterator.addParseAction('DS', '\nDS +(.+)')
_contigIterator.addParseAction('CO', '^CO (\S+)')

def contigIterator(file):
    file = universalOpen(file)
    for entry in _contigIterator(file):
        contig=[]
        for rd,ds,af in map(None,entry['RD'],entry['DS'],entry['AF']):
            id = rd[0]
            shift = int(af[2])
            if shift < 0:
                print >> sys.stderr,"Sequence %s in contig %s has a negative padding value %d : skipped" % (id,entry['CO'][0],shift)
                #continue

            definition,info = parseFastaDescription(ds)
            info['shift']=shift
            seq = rd[4].replace('\n','').replace('*','-').strip()
            contig.append(NucSequence(id,seq,definition,**info))

        maxlen = max(len(x)+x['shift'] for x in contig)
        minshift=min(x['shift'] for x in contig)
        rep = []

        for s in contig:
            info = s.getTags()
            info['shift']-=minshift-1
            head = '-' * (info['shift']-1)

            tail = (maxlen + minshift - len(s) - info['shift'] - 1)
            info['tail']=tail
            newseq = NucSequence(s.id,head + str(s)+ '-' * tail,s.definition,**info)
            rep.append(newseq)

        yield entry['CO'][0],rep
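A usage sketch (assembly.ace is a hypothetical ACE assembly file, e.g. as produced by phrap):

from obitools.alignment.ace import contigIterator

for name, reads in contigIterator('assembly.ace'):
    print name, len(reads)
    for r in reads:
        print r.id, r['shift'], str(r)[:60]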
7 obitools/barcodecoverage/__init__.py Normal file
@@ -0,0 +1,7 @@
'''
@author: merciece

Creates the tree representing the coverage of two primers, from an
ecoPCR output file and an ecoPCR database.
'''
62 obitools/barcodecoverage/calcBc.py Normal file
@@ -0,0 +1,62 @@
#!/usr/local/bin/python
'''
Created on 24 Nov. 2011

@author: merciece
'''


def main(amplifiedSeqs, seqsFromDB, keptRanks, errors, tax) :
    '''
    Computes the barcode coverage (Bc) of each taxonomic group.
    The error threshold is set to 3 total primer mismatches.
    '''

    listtaxabygroupinDB = {}

    for seq in seqsFromDB :
        taxid = seq['taxid']
        p = [a for a in tax.parentalTreeIterator(taxid)]
        for a in p :
            if a != p[0] :
                if a[1] in keptRanks :
                    group = a[0]
                    if group in listtaxabygroupinDB and taxid not in listtaxabygroupinDB[group] :
                        listtaxabygroupinDB[group].add(taxid)
                    elif group not in listtaxabygroupinDB :
                        listtaxabygroupinDB[group]=set([taxid])

    taxabygroup = dict((x,len(listtaxabygroupinDB[x])) for x in listtaxabygroupinDB)

    listamplifiedtaxabygroup = {}

    for seq in amplifiedSeqs :
        if errors[seq.id][2] <= 3 :
            taxid = seq['taxid']
            p = [a for a in tax.parentalTreeIterator(taxid)]
            for a in p :
                if a != p[0] :
                    if a[1] in keptRanks :
                        group = a[0]
                        if group in listamplifiedtaxabygroup and taxid not in listamplifiedtaxabygroup[group] :
                            listamplifiedtaxabygroup[group].add(taxid)
                        elif group not in listamplifiedtaxabygroup :
                            listamplifiedtaxabygroup[group]=set([taxid])

    amplifiedtaxabygroup = dict((x,len(listamplifiedtaxabygroup[x])) for x in listamplifiedtaxabygroup)

    BcValues = {}

    groups = [g for g in taxabygroup.keys()]

    for g in groups :
        if g in amplifiedtaxabygroup :
            BcValues[g] = float(amplifiedtaxabygroup[g])/taxabygroup[g]*100
            BcValues[g] = round(BcValues[g], 2)
        else :
            BcValues[g] = 0.0

    return BcValues
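As a worked example of the formula above: if the database contains 40 taxa for a given genus and 30 of them are amplified with at most 3 total primer mismatches, the genus gets Bc = 30/40 * 100 = 75.0.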
72 obitools/barcodecoverage/calculateBc.py Normal file
@@ -0,0 +1,72 @@
#!/usr/local/bin/python
'''
Created on 24 Nov. 2011

@author: merciece
'''

import sys


def main(amplifiedSeqs, seqsFromDB, keptRanks, tax) :

    BcValues = {}

    #speciesid = tax.findRankByName('species')
    #subspeciesid = tax.findRankByName('subspecies')

    listtaxonbygroup = {}

    for seq in seqsFromDB :
        taxid = seq['taxid']
        p = [a for a in tax.parentalTreeIterator(taxid)]
        for a in p :
            if a != p[0] :
                if a[1] in keptRanks :
                    group = a
                    if group in listtaxonbygroup:
                        listtaxonbygroup[group].add(taxid)
                    else:
                        listtaxonbygroup[group]=set([taxid])

    #stats = dict((x,len(listtaxonbygroup[x])) for x in listtaxonbygroup)

    print>>sys.stderr, listtaxonbygroup

    listtaxonbygroup = {}

    for seq in amplifiedSeqs :
        taxid = seq['taxid']
        p = [a for a in tax.parentalTreeIterator(taxid)]
        for a in p :
            if a != p[0] :
                if a[1] in keptRanks :
                    group = a
                    if group in listtaxonbygroup:
                        listtaxonbygroup[group].add(taxid)
                    else:
                        listtaxonbygroup[group]=set([taxid])

    print>>sys.stderr, listtaxonbygroup

    return BcValues

# dbstats= dict((x,len(listtaxonbygroup[x])) for x in listtaxonbygroup)
#
# ranks = [r for r in keptRanks]
# ranks.sort()
#
# print '%-20s\t%10s\t%10s\t%7s' % ('rank','ecopcr','db','percent')
#
# print>>sys.stderr, stats
# print>>sys.stderr, dbstats
# print>>sys.stderr, ranks
#
# for r in ranks:
#     if r in dbstats and dbstats[r]:
#         print '%-20s\t%10d\t%10d\t%8.2f' % (r,dbstats[r],stats[r],float(dbstats[r])/stats[r]*100)
108 obitools/barcodecoverage/drawBcTree.py Normal file
@@ -0,0 +1,108 @@
#!/usr/local/bin/python
'''
Created on 25 Nov. 2011

@author: merciece
'''

from obitools.graph.rootedtree import nexusFormat


figtree="""\
begin figtree;
set appearance.backgroundColorAttribute="User Selection";
set appearance.backgroundColour=#-1;
set appearance.branchColorAttribute="bc";
set appearance.branchLineWidth=2.0;
set appearance.foregroundColour=#-16777216;
set appearance.selectionColour=#-2144520576;
set branchLabels.colorAttribute="User Selection";
set branchLabels.displayAttribute="errors";
set branchLabels.fontName="sansserif";
set branchLabels.fontSize=10;
set branchLabels.fontStyle=0;
set branchLabels.isShown=true;
set branchLabels.significantDigits=4;
set layout.expansion=2000;
set layout.layoutType="RECTILINEAR";
set layout.zoom=0;
set nodeBars.barWidth=4.0;
set nodeLabels.colorAttribute="User Selection";
set nodeLabels.displayAttribute="label";
set nodeLabels.fontName="sansserif";
set nodeLabels.fontSize=10;
set nodeLabels.fontStyle=0;
set nodeLabels.isShown=true;
set nodeLabels.significantDigits=4;
set polarLayout.alignTipLabels=false;
set polarLayout.angularRange=0;
set polarLayout.rootAngle=0;
set polarLayout.rootLength=100;
set polarLayout.showRoot=true;
set radialLayout.spread=0.0;
set rectilinearLayout.alignTipLabels=false;
set rectilinearLayout.curvature=0;
set rectilinearLayout.rootLength=100;
set scale.offsetAge=0.0;
set scale.rootAge=1.0;
set scale.scaleFactor=1.0;
set scale.scaleRoot=false;
set scaleAxis.automaticScale=true;
set scaleAxis.fontSize=8.0;
set scaleAxis.isShown=false;
set scaleAxis.lineWidth=2.0;
set scaleAxis.majorTicks=1.0;
set scaleAxis.origin=0.0;
set scaleAxis.reverseAxis=false;
set scaleAxis.showGrid=true;
set scaleAxis.significantDigits=4;
set scaleBar.automaticScale=true;
set scaleBar.fontSize=10.0;
set scaleBar.isShown=true;
set scaleBar.lineWidth=1.0;
set scaleBar.scaleRange=0.0;
set scaleBar.significantDigits=4;
set tipLabels.colorAttribute="User Selection";
set tipLabels.displayAttribute="Names";
set tipLabels.fontName="sansserif";
set tipLabels.fontSize=10;
set tipLabels.fontStyle=0;
set tipLabels.isShown=true;
set tipLabels.significantDigits=4;
set trees.order=false;
set trees.orderType="increasing";
set trees.rooting=false;
set trees.rootingType="User Selection";
set trees.transform=false;
set trees.transformType="cladogram";
end;
"""


def cartoonRankGenerator(rank):
    def cartoon(node):
        return 'rank' in node and node['rank']==rank

    return cartoon


def collapseBcGenerator(Bclimit):
    def collapse(node):
        return 'bc' in node and node['bc']<=Bclimit
    return collapse


def label(node):
    if 'bc' in node:
        return "(%+3.1f) %s" % (node['bc'],node['name'])
    else:
        return " %s" % node['name']


def main(coverageTree) :
    print nexusFormat(coverageTree,
                      label=label,
                      blocks=figtree,
                      cartoon=cartoonRankGenerator('family'))
    #collapse=collapseBcGenerator(70))
56 obitools/barcodecoverage/findErrors.py Normal file
@@ -0,0 +1,56 @@
#!/usr/local/bin/python
'''
Created on 24 Nov. 2011

@author: merciece
'''


def main(seqs, keptRanks, tax):
    errorsBySeq = getErrorsOnLeaves(seqs)
    errorsByTaxon = propagateErrors(errorsBySeq, keptRanks, tax)
    return errorsBySeq, errorsByTaxon


def getErrorsOnLeaves(seqs) :
    errors = {}
    for s in seqs :
        taxid = s['taxid']
        forErrs = s['forward_error']
        revErrs = s['reverse_error']
        total = forErrs + revErrs
        seqNb = 1
        errors[s.id] = [forErrs,revErrs,total,seqNb,taxid]
    return errors


def propagateErrors(errorsOnLeaves, keptRanks, tax) :
    allErrors = {}
    for seq in errorsOnLeaves :
        taxid = errorsOnLeaves[seq][4]
        p = [a for a in tax.parentalTreeIterator(taxid)]
        for a in p :
            if a[1] in keptRanks :
                group = a[0]
                if group in allErrors :
                    allErrors[group][0] += errorsOnLeaves[seq][0]
                    allErrors[group][1] += errorsOnLeaves[seq][1]
                    allErrors[group][2] += errorsOnLeaves[seq][2]
                    allErrors[group][3] += 1
                else :
                    # copy the leaf record: aliasing it would let the in-place
                    # updates above corrupt the per-sequence error table
                    allErrors[group] = list(errorsOnLeaves[seq])

    for group in allErrors :
        allErrors[group][0] /= float(allErrors[group][3])
        allErrors[group][1] /= float(allErrors[group][3])
        allErrors[group][2] /= float(allErrors[group][3])

        allErrors[group][0] = round(allErrors[group][0], 2)
        allErrors[group][1] = round(allErrors[group][1], 2)
        allErrors[group][2] = round(allErrors[group][2], 2)

    return allErrors
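The per-sequence error records are plain lists [forward, reverse, total, count, taxid], and propagation averages them over each taxonomic group. A minimal illustration, using a hypothetical stand-in for the taxonomy object:

from obitools.barcodecoverage.findErrors import propagateErrors

leaves = {'s1': [1, 0, 1, 1, 101],     # [forward, reverse, total, count, taxid]
          's2': [3, 1, 4, 1, 102]}

class FakeTax(object):
    # minimal stand-in: both leaves roll up to group 10 at rank 'genus'
    def parentalTreeIterator(self, taxid):
        return iter([(taxid, 'species'), (10, 'genus'), (1, 'no rank')])

print propagateErrors(leaves, set(['species', 'genus']), FakeTax())
# group 10 averages both leaves: forward 2.0, reverse 0.5, total 2.5 over 2 sequences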
69 obitools/barcodecoverage/readFiles.py Normal file
@@ -0,0 +1,69 @@
#!/usr/local/bin/python
'''
Created on 23 Nov. 2011

@author: merciece
'''

from obitools.ecopcr import sequence
from obitools.ecopcr import taxonomy


def main(entries,options):
    filteredDataFromDB = ecoPCRDatabaseReader(options)
    filteredData = ecoPCRFileReader(entries,filteredDataFromDB)
    return filteredDataFromDB,filteredData


def ecoPCRDatabaseReader(options):

    tax = taxonomy.EcoTaxonomyDB(options.taxonomy)
    seqs = sequence.EcoPCRDBSequenceIterator(options.taxonomy,taxonomy=tax)

    norankid = tax.findRankByName('no rank')
    speciesid = tax.findRankByName('species')
    genusid = tax.findRankByName('genus')
    familyid = tax.findRankByName('family')

    minrankseq = set([speciesid,genusid,familyid])

    usedrankid = {}

    ingroup = {}
    outgroup= {}

    for s in seqs :
        if 'taxid' in s :
            taxid = s['taxid']
            allrank = set()
            for p in tax.parentalTreeIterator(taxid):
                if p[1]!=norankid:
                    allrank.add(p[1])
            if len(minrankseq & allrank) == 3:
                for r in allrank:
                    usedrankid[r]=usedrankid.get(r,0) + 1

                if tax.isAncestor(options.ingroup,taxid):
                    ingroup[s.id] = s
                else:
                    outgroup[s.id] = s

    keptranks = set(r for r in usedrankid
                    if float(usedrankid[r])/float(len(ingroup)) > options.rankthresold)

    return { 'ingroup' : ingroup,
             'outgroup': outgroup,
             'ranks'   : keptranks,
             'taxonomy': tax
           }


def ecoPCRFileReader(entries,filteredDataFromDB) :
    filteredData = []
    for s in entries :
        if 'taxid' in s :
            seqId = s.id
            if seqId in filteredDataFromDB['ingroup'] :
                filteredData.append(s)
    return filteredData
42 obitools/barcodecoverage/writeBcTree.py Normal file
@@ -0,0 +1,42 @@
#!/usr/local/bin/python
'''
Created on 25 Nov. 2011

@author: merciece
'''

from obitools.graph.rootedtree import RootedTree


def main(BcValues,errors,tax) :

    tree = RootedTree()
    tset = set(BcValues)

    for taxon in BcValues:
        if taxon in errors :
            forErr = errors[taxon][0]
            revErr = errors[taxon][1]
            totErr = errors[taxon][2]
        else :
            forErr = -1.0
            revErr = -1.0
            totErr = -1.0

        tree.addNode(taxon, rank=tax.getRank(taxon),
                     name=tax.getScientificName(taxon),
                     bc = BcValues[taxon],
                     errors = str(forErr)+' '+str(revErr),
                     totError = totErr
                     )

    for taxon in BcValues:
        piter = tax.parentalTreeIterator(taxon)
        taxon = piter.next()
        for parent in piter:
            if taxon[0] in tset and parent[0] in BcValues:
                tset.remove(taxon[0])
                tree.addEdge(parent[0], taxon[0])
            taxon=parent

    return tree
207 obitools/blast/__init__.py Normal file
@@ -0,0 +1,207 @@
from os import popen2

from obitools.table import iTableIterator,TableRow,Table,SelectionIterator
from obitools.utils import ColumnFile
from obitools.location import SimpleLocation
from obitools.fasta import formatFasta


class Blast(object):
    '''
    Run blast.
    '''

    def __init__(self,mode,db,program='blastall',**options):
        self._mode = mode
        self._db = db
        self._program = program
        self._options = options

    def getMode(self):
        return self._mode

    def getDb(self):
        return self._db

    def getProgram(self):
        return self._program

    def _blastcmd(self):
        tmp = """%(program)s \\
                 -p %(mode)s \\
                 -d %(db)s \\
                 -m 8 \\
                 %(options)s \\
              """
        options = ' '.join(['-%s %s' % (x[0],str(x[1]))
                            for x in self._options.iteritems()])
        data = {
                'program' : self.program,
                'db'      : self.db,
                'mode'    : self.mode,
                'options' : options
               }

        return tmp % data

    def __call__(self,sequence):
        '''
        Run blast with one sequence object.

        @param sequence: the query sequence
        @type sequence: L{obitools.BioSequence}
        '''
        cmd = self._blastcmd()

        (blast_in,blast_out) = popen2(cmd)

        print >>blast_in,formatFasta(sequence)
        blast_in.close()

        blast = BlastResultIterator(blast_out)

        return blast

    mode = property(getMode, None, None, "Blast search mode (passed to -p)")

    db = property(getDb, None, None, "Path to the blast database (passed to -d)")

    program = property(getProgram, None, None, "Name of the blast executable")


class NetBlast(Blast):
    '''
    Run blast on NCBI servers.
    '''

    def __init__(self,mode,db,**options):
        '''
        @param mode: blast search mode (e.g. blastn)
        @param db: name of the database to search
        '''
        Blast.__init__(self, mode, db, 'blastcl3',**options)


class BlastResultIterator(iTableIterator):

    def __init__(self,blastoutput,query=None):
        '''
        @param blastoutput: an open file on a tabular (-m 8) blast report
        @type blastoutput: file
        '''
        self._blast = ColumnFile(blastoutput,
                                 strip=True,
                                 skip="#",
                                 sep="\t",
                                 types=self.types
                                 )
        self._query = query
        self._hindex = dict((k,i) for i,k in enumerate(self._getHeaders()))

    def _getHeaders(self):
        return ('Query id','Subject id',
                '% identity','alignment length',
                'mismatches', 'gap openings',
                'q. start', 'q. end',
                's. start', 's. end',
                'e-value', 'bit score')

    def _getTypes(self):
        return (str,str,
                float,int,
                int,int,
                int,int,
                int,int,
                float,float)

    def _getRowFactory(self):
        return BlastMatch

    def _getSubrowFactory(self):
        return TableRow

    def _getQuery(self):
        return self._query

    headers = property(_getHeaders,None,None)
    types = property(_getTypes,None,None)
    rowFactory = property(_getRowFactory,None,None)
    subrowFactory = property(_getSubrowFactory,None,None)
    query = property(_getQuery,None,None)

    def next(self):
        '''
        Return the next match as a row object.
        '''
        value = self._blast.next()
        return self.rowFactory(self,value)


class BlastResult(Table):
    '''
    Results of a blast run.
    '''

class BlastMatch(TableRow):
    '''
    Blast high scoring pair between two sequences.
    '''

    def getQueryLocation(self):
        l = SimpleLocation(self[6], self[7])
        return l

    def getSubjectLocation(self):
        l = SimpleLocation(self[8], self[9])
        return l

    def getSubjectSequence(self,database):
        return database[self[1]]

    def queryCov(self,query=None):
        '''
        Compute the coverage of the match on the query sequence.

        @param query: the query sequence. Default is None.
                      In this case the query sequence associated
                      to this blast result is used.
        @type query: L{obitools.BioSequence}

        @return: coverage fraction
        @rtype: float
        '''
        if query is None:
            query = self.table.query
        assert query is not None
        return float(self[7]-self[6]+1)/float(len(query))

    def __getitem__(self,key):
        if key=='query coverage' and self.table.query is not None:
            return self.queryCov()
        else:
            return TableRow.__getitem__(self,key)


class BlastCovMinFilter(SelectionIterator):

    def __init__(self,blastiterator,covmin,query=None,**conditions):
        if query is None:
            query = blastiterator.table.query
        assert query is not None
        SelectionIterator.__init__(self,blastiterator,**conditions)
        self._query = query
        self._covmin=covmin

    def _covMinPredicat(self,row):
        return row.queryCov(self._query)>=self._covmin

    def _checkCondition(self,row):
        return self._covMinPredicat(row) and SelectionIterator._checkCondition(self, row)
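An end-to-end sketch (it assumes the legacy NCBI blastall binary is installed, that refdb names a formatted database, and that the SelectionIterator base class makes BlastCovMinFilter directly iterable; all names are illustrative):

from obitools import NucSequence
from obitools.blast import Blast, BlastCovMinFilter

query = NucSequence('q1', 'acgt' * 30)
blastn = Blast('blastn', 'refdb', e=1e-10)   # extra options become '-e 1e-10'
matches = blastn(query)

# keep only matches covering at least 80% of the query
for m in BlastCovMinFilter(matches, 0.8, query=query):
    print m[1], m[2], m.queryCov(query)      # subject id, % identity, coverage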
376 obitools/carto/__init__.py Normal file
@@ -0,0 +1,376 @@
|
||||
# -*- coding: latin1 -*-
|
||||
|
||||
|
||||
|
||||
from obitools import SVGdraw
|
||||
import math
|
||||
|
||||
class Map(object):
|
||||
"""
|
||||
Map represente une instance d'une carte genetique physique.
|
||||
Une telle carte est definie par la longueur de la sequence
|
||||
qui lui est associe.
|
||||
|
||||
A une carte est associe un certain nombre de niveaux (Level)
|
||||
eux meme decoupe en sous-niveau (SubLevel)
|
||||
Les sous niveaux contiennent eux des features
|
||||
"""
|
||||
def __init__(self,name,seqlength,scale=1):
|
||||
"""
|
||||
Constructeur d'une nouvelle carte
|
||||
|
||||
*Param*:
|
||||
|
||||
name
|
||||
nom de la carte
|
||||
|
||||
seqlength
|
||||
longueur de la sequence associee a la carte
|
||||
|
||||
scale
|
||||
echelle de la carte indicant combien de pixel
|
||||
correspondent a une unite de la carte
|
||||
"""
|
||||
self.name = name
|
||||
self.seqlength = seqlength
|
||||
self.scale = scale
|
||||
self.levels = {}
|
||||
self.basicHSize = 10
|
||||
|
||||
def __str__(self):
|
||||
return '<%s>' % self.name
|
||||
|
||||
def __getitem__(self,level):
|
||||
"""
|
||||
retourne le niveau *level* de la carte et
|
||||
le cree s'il n'existe pas
|
||||
"""
|
||||
if not isinstance(level,int):
|
||||
raise TypeError('level must be an non Zero integer value')
|
||||
elif level==0:
|
||||
raise AssertionError('Level cannot be set to 0')
|
||||
try:
|
||||
return self.levels[level]
|
||||
except KeyError:
|
||||
self.levels[level] = Level(level,self)
|
||||
return self.levels[level]
|
||||
|
||||
def getBasicHSize(self):
|
||||
"""
|
||||
retourne la hauteur de base d'un element de cartographie
|
||||
exprimee en pixel
|
||||
"""
|
||||
return self.basicHSize
|
||||
|
||||
def getScale(self):
|
||||
"""
|
||||
Retourne l'echelle de la carte en nombre de pixels par
|
||||
unite physique de la carte
|
||||
"""
|
||||
return self.scale
|
||||
|
||||
|
||||
|
||||
def getNegativeBase(self):
|
||||
return reduce(lambda x,y:x-y,[self.levels[z].getHeight()
|
||||
for z in self.levels
|
||||
if z < 0],self.getHeight())
|
||||
|
||||
def getPositiveBase(self):
|
||||
return self.getNegativeBase() - 3 * self.getBasicHSize()
|
||||
|
||||
def getHeight(self):
|
||||
return reduce(lambda x,y:x+y,[z.getHeight() for z in self.levels.values()],0) \
|
||||
+ 4 * self.getBasicHSize()
|
||||
|
||||
def toXML(self,file=None,begin=0,end=None):
|
||||
dessin = SVGdraw.drawing()
|
||||
if end==None:
|
||||
end = self.seqlength
|
||||
hauteur= self.getHeight()
|
||||
largeur=(end-begin+1)*self.scale
|
||||
svg = SVGdraw.svg((begin*self.scale,0,largeur,hauteur),
|
||||
'%fpx' % (self.seqlength * self.scale),
|
||||
'%dpx' % hauteur)
|
||||
|
||||
centre = self.getPositiveBase() + (1 + 1/4) * self.getBasicHSize()
|
||||
svg.addElement(SVGdraw.rect(0,centre,self.seqlength * self.scale,self.getBasicHSize()/2))
|
||||
for e in self.levels.values():
|
||||
svg.addElement(e.getElement())
|
||||
dessin.setSVG(svg)
|
||||
return dessin.toXml(file)
|
||||
|
||||
class Feature(object):
|
||||
pass
|
||||
|
||||
class Level(object):
|
||||
|
||||
def __init__(self,level,map):
|
||||
if not isinstance(map,Map):
|
||||
raise AssertionError('map is not an instance of class Map')
|
||||
if level in map.levels:
|
||||
raise AssertionError('Level %d already define for map %s' % (level,map))
|
||||
else:
|
||||
map.levels[level] = self
|
||||
self.map = map
|
||||
self.level = level
|
||||
self.sublevels = {}
|
||||
|
||||
def __getitem__(self,sublevel):
|
||||
"""
|
||||
retourne le niveau *sublevel* du niveau en
|
||||
le creant s'il n'existe pas
|
||||
"""
|
||||
if not isinstance(sublevel,int):
|
||||
raise TypeError('sublevel must be a positive integer value')
|
||||
elif sublevel<0:
|
||||
raise AssertionError('Level cannot be negative')
|
||||
try:
|
||||
return self.sublevels[sublevel]
|
||||
except KeyError:
|
||||
self.sublevels[sublevel] = SubLevel(sublevel,self)
|
||||
return self.sublevels[sublevel]
|
||||
|
||||
def getBase(self):
|
||||
if self.level < 0:
|
||||
base = self.map.getNegativeBase()
|
||||
base += reduce(lambda x,y:x+y,[self.map.levels[z].getHeight()
|
||||
for z in self.map.levels
|
||||
if z <0 and z >= self.level],0)
|
||||
return base
|
||||
else:
|
||||
base = self.map.getPositiveBase()
|
||||
base -= reduce(lambda x,y:x+y,[self.map.levels[z].getHeight()
|
||||
for z in self.map.levels
|
||||
if z >0 and z < self.level],0)
|
||||
return base
|
||||
|
||||
def getElement(self):
|
||||
objet = SVGdraw.group('level%d' % self.level)
|
||||
for e in self.sublevels.values():
|
||||
objet.addElement(e.getElement())
|
||||
return objet
|
||||
|
||||
|
||||
|
||||
def getHeight(self):
|
||||
return reduce(lambda x,y:x+y,[z.getHeight() for z in self.sublevels.values()],0) \
|
||||
+ 2 * self.map.getBasicHSize()
|
||||
|
||||
class SubLevel(object):

    def __init__(self,sublevel,level):
        if not isinstance(level,Level):
            raise AssertionError('level is not an instance of class Level')
        if sublevel in level.sublevels:
            raise AssertionError('Sublevel %d already defined for level %s' % (sublevel,level))
        else:
            level.sublevels[sublevel] = self
        self.level = level
        self.sublevel = sublevel
        self.features = {}

    def getHeight(self):
        return max([x.getHeight() for x in self.features.values()]+[0]) + 4 * self.level.map.getBasicHSize()

    def getBase(self):
        base = self.level.getBase()
        if self.level.level < 0:
            base -= self.level.getHeight() - 2 * self.level.map.getBasicHSize()
            base += reduce(lambda x,y:x+y,[self.level.sublevels[z].getHeight()
                                           for z in self.level.sublevels
                                           if z <= self.sublevel],0)
            base -= 2 * self.level.map.getBasicHSize()
        else:
            base -= reduce(lambda x,y:x+y,[self.level.sublevels[z].getHeight()
                                           for z in self.level.sublevels
                                           if z < self.sublevel],0)
            base -= self.level.map.getBasicHSize()
        return base

    def getElement(self):
        base = self.getBase()
        objet = SVGdraw.group('sublevel%d' % self.sublevel)
        for e in self.features.values():
            objet.addElement(e.getElement(base))
        return objet

    def add(self,feature):
        if not isinstance(feature,Feature):
            raise TypeError('feature must be an instance of Feature')
        if feature.name in self.features:
            raise AssertionError('A feature with the same name (%s) has already been inserted in this sublevel'
                                 % feature.name)
        self.features[feature.name]=feature
        feature.sublevel=self

class SimpleFeature(Feature):

    def __init__(self,name,begin,end,visiblename=False,color=0):
        self.begin = begin
        self.end = end
        self.name = name
        self.color = color
        self.sublevel = None
        self.visiblename=visiblename

    def getHeight(self):
        if not self.sublevel:
            raise AssertionError('SimpleFeature not attached to a sublevel')
        if self.visiblename:
            return self.sublevel.level.map.getBasicHSize() * 2
        else:
            return self.sublevel.level.map.getBasicHSize()

    def getElement(self,base):
        scale = self.sublevel.level.map.getScale()
        y = base - self.sublevel.level.map.getBasicHSize()
        x = self.begin * scale
        width = (self.end - self.begin + 1) * scale
        height = self.sublevel.level.map.getBasicHSize()

        objet = SVGdraw.rect(x,y,width,height,stroke=self.color)
        objet.addElement(SVGdraw.description(self.name))

        return objet

class BoxFeature(SimpleFeature):

    def getHeight(self):
        if not self.sublevel:
            raise AssertionError('BoxFeature not attached to a sublevel')
        if self.visiblename:
            return self.sublevel.level.map.getBasicHSize() * 4
        else:
            return self.sublevel.level.map.getBasicHSize() * 3

    def getElement(self,base):
        scale = self.sublevel.level.map.getScale()
        y = base - self.sublevel.level.map.getBasicHSize() * 2
        x = self.begin * scale
        width = (self.end - self.begin + 1) * scale
        height = self.sublevel.level.map.getBasicHSize() * 3

        objet = SVGdraw.rect(x,y,width,height,stroke=self.color,fill="none")
        objet.addElement(SVGdraw.description(self.name))

        return objet

class MultiPartFeature(Feature):

    def __init__(self,name,*args,**kargs):
        self.limits = args
        self.name = name
        self.color = kargs.get('color',"black")
        self.visiblename = kargs.get('visiblename',None)
        self.flatlink = kargs.get('flatlink',False)
        self.roundlink = kargs.get('roundlink',False)
        self.sublevel = None


    def getHeight(self):
        if not self.sublevel:
            raise AssertionError('MultiPartFeature not attached to a sublevel')
        if self.visiblename:
            return self.sublevel.level.map.getBasicHSize() * 3
        else:
            return self.sublevel.level.map.getBasicHSize() * 2

    def getElement(self,base):
        scale = self.sublevel.level.map.getScale()

        y = base - self.sublevel.level.map.getBasicHSize()
        height = self.sublevel.level.map.getBasicHSize()
        objet = SVGdraw.group(self.name)
        for (debut,fin) in self.limits:
            x = debut * scale
            width = (fin - debut + 1) * scale
            part = SVGdraw.rect(x,y,width,height,fill=self.color)
            objet.addElement(part)

        debut = self.limits[0][1]
        for (fin,next) in self.limits[1:]:
            debut*=scale
            fin*=scale
            path = SVGdraw.pathdata(debut,y + height / 2)
            delta = height / 2
            if self.roundlink:
                path.qbezier((debut+fin)/2, y - delta,fin,y + height / 2)
            else:
                if self.flatlink:
                    delta = - height / 2
                path.line((debut+fin)/2, y - delta)
                path.line(fin,y + height / 2)
            path = SVGdraw.path(path,fill="none",stroke=self.color)
            objet.addElement(path)
            debut = next

        objet.addElement(SVGdraw.description(self.name))

        return objet

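# Link drawing above, as a sketch: between two consecutive parts the
# connector goes from the end of one box to the start of the next.
# roundlink draws a quadratic Bezier arched above the boxes, flatlink a
# straight segment at the boxes' mid-height, and the default a raised
# chevron peaking above the boxes through the midpoint.
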
class TagFeature(Feature):

    def __init__(self,name,begin,length,ratio,visiblename=False,color=0):
        self.begin = begin
        self.length = length
        self.ratio = ratio
        self.name = name
        self.color = color
        self.sublevel = None
        self.visiblename=visiblename

    def getHeight(self):
        if not self.sublevel:
            raise AssertionError('TagFeature not attached to a sublevel')

        return self.sublevel.level.map.getBasicHSize()*11

    def getElement(self,base):
        scale = self.sublevel.level.map.getScale()
        height = math.floor(max(1,self.sublevel.level.map.getBasicHSize()* 10 * self.ratio))
        y = base + self.sublevel.level.map.getBasicHSize() - height
        x = self.begin * scale
        width = self.length * scale
        objet = SVGdraw.rect(x,y,width,height,stroke=self.color)
        objet.addElement(SVGdraw.description(self.name))

        return objet

if __name__ == '__main__':
    carte = Map('essai',20000,scale=0.5)
    carte[-1][0].add(SimpleFeature('toto',100,300))
    carte[1][0].add(SimpleFeature('toto',100,300))
    carte[1][1].add(SimpleFeature('toto',200,1000))

    carte[1][0].add(MultiPartFeature('bout',(1400,1450),(1470,1550),(1650,1800),color='red',flatlink=True))
    carte[1][0].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='red',flatlink=True))
    carte[-1][1].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='green'))
    carte[-1][2].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='purple',roundlink=True))

    carte[-1][1].add(BoxFeature('tutu',390,810,color='purple'))
    carte[1][0].add(BoxFeature('tutu',390,810,color='red'))
    carte[2][0].add(TagFeature('t1',1400,20,0.8))
    carte[2][0].add(TagFeature('t2',1600,20,0.2))
    carte.basicHSize=6
    print carte.toXML('truc.svg',begin=0,end=1000)
    print carte.toXML('truc2.svg',begin=460,end=2000)

0
obitools/decorator.py
Normal file
29
obitools/distances/__init__.py
Normal file
@ -0,0 +1,29 @@
class DistanceMatrix(object):

    def __init__(self,alignment):
        '''
        DistanceMatrix constructor.

        @param alignment: alignment used to compute the distance matrix
        @type alignment: obitools.align.Alignment
        '''
        self.aligment = alignment
        self.matrix = [[None] * (x+1) for x in xrange(len(alignment))]

    def evaluateDist(self,x,y):
        raise NotImplementedError

    def __getitem__(self,key):
        assert isinstance(key,(tuple,list)) and len(key)==2, \
               'key must be a tuple or a list of two integers'
        x,y = key
        if y < x:
            x,y = y,x
        rep = self.matrix[y][x]
        if rep is None:
            rep = self.evaluateDist(x,y)
            self.matrix[y][x] = rep

        return rep
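
# Minimal usage sketch: DistanceMatrix caches each pairwise value in a lower
# triangular table, so subclasses only implement evaluateDist() and callers
# always go through __getitem__. A hypothetical subclass:
#
#     class ConstantMatrix(DistanceMatrix):
#         def evaluateDist(self,x,y):
#             return 1.0              # computed once, then served from self.matrix
#
#     m = ConstantMatrix(alignment)   # 'alignment' is any sized alignment object
#     d = m[(0,3)]                    # triggers evaluateDist(0,3) and caches it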
77
obitools/distances/observed.py
Normal file
@ -0,0 +1,77 @@
'''
Module dedicated to computing observed divergences from
an alignment. No distance correction is applied at all
'''

from itertools import imap

from obitools.distances import DistanceMatrix

class PairewiseGapRemoval(DistanceMatrix):
    '''
    Observed divergence matrix from an alignment.
    Gaps are removed from the alignment on a pairwise
    sequence basis
    '''

    def evaluateDist(self,x,y):
        '''
        Compute the observed divergence between two sequences
        of an alignment.

        @attention: For performance purposes this method should
                    not be used directly. Use instead the __getitem__
                    method from DistanceMatrix.

        @see: L{__getitem__}

        @param x: number of the first sequence in the alignment
        @type x: int
        @param y: number of the second sequence in the alignment
        @type y: int
        '''

        seq1 = self.aligment[x]
        seq2 = self.aligment[y]

        diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
                          (z[0]!=z[1] for z in imap(None,seq1,seq2)
                           if '-' not in z),(0,0))
        return float(diff)/tot


class Pairewise(DistanceMatrix):
    '''
    Observed divergence matrix from an alignment.
    Gaps are kept in the alignment
    '''

    def evaluateDist(self,x,y):
        '''
        Compute the observed divergence between two sequences
        of an alignment.

        @attention: For performance purposes this method should
                    not be used directly. Use instead the __getitem__
                    method from DistanceMatrix.

        @see: L{__getitem__}

        @param x: number of the first sequence in the alignment
        @type x: int
        @param y: number of the second sequence in the alignment
        @type y: int
        '''

        seq1 = self.aligment[x]
        seq2 = self.aligment[y]

        diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
                          (z[0]!=z[1] for z in imap(None,seq1,seq2)),
                          (0,0))
        return float(diff)/tot
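
# How the fold above counts differences, as a minimal sketch on two toy
# strings (imap(None,a,b) behaves like zip, padding with None):
#
#     from itertools import imap
#     seq1, seq2 = "acgt", "aggt"
#     diff, tot = reduce(lambda acc, isdiff: (acc[0]+isdiff, acc[1]+1),
#                        (p[0] != p[1] for p in imap(None, seq1, seq2)),
#                        (0, 0))
#     # diff == 1, tot == 4, observed divergence == 0.25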
35
obitools/distances/phylip.py
Normal file
@ -0,0 +1,35 @@
import sys

from itertools import imap,count

def writePhylipMatrix(matrix):
    names = [x.id for x in matrix.aligment]
    pnames= [x[:10] for x in names]
    unicity={}
    redundent=[]
    for n in pnames:
        unicity[n]=unicity.get(n,0)+1
        redundent.append(unicity[n])

    for i,n,r in imap(None,count(),pnames,redundent):
        alternate = n
        if r > 1:
            while alternate in pnames:
                lcut = 9 - len(str(r))
                alternate = n[:lcut]+ '_%d' % r
                r+=1
        pnames[i]='%-10s' % alternate

    firstline = '%5d' % len(matrix.aligment)
    rep = [firstline]
    for i,n in imap(None,count(),pnames):
        line = [n]
        for j in xrange(i):
            line.append('%5.4f' % matrix[(j,i)])
        rep.append(' '.join(line))
    return '\n'.join(rep)
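
# Shape of the output, as a sketch: a PHYLIP-style lower-triangular matrix,
# names padded to 10 characters and made unique by the loop above, e.g.
#
#         3
#     seqA
#     seqB       0.1000
#     seqC       0.2000 0.1500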
25
obitools/distances/r.py
Normal file
@ -0,0 +1,25 @@
import sys

from itertools import imap,count

def writeRMatrix(matrix):
    names = [x.id for x in matrix.aligment]
    lmax = max(max(len(x) for x in names),5)
    lali = len(matrix.aligment)

    nformat = '%%-%ds' % lmax
    dformat = '%%%d.4f' % lmax

    pnames=[nformat % x for x in names]

    rep = [' '.join(pnames)]

    for i in xrange(lali):
        line=[]
        for j in xrange(lali):
            line.append('%5.4f' % matrix[(j,i)])
        rep.append(' '.join(line))
    return '\n'.join(rep)
100
obitools/dnahash/__init__.py
Normal file
@ -0,0 +1,100 @@
_A=[0]
_C=[1]
_G=[2]
_T=[3]
_R= _A + _G
_Y= _C + _T
_M= _C + _A
_K= _T + _G
_W= _T + _A
_S= _C + _G
_B= _C + _G + _T
_D= _A + _G + _T
_H= _A + _C + _T
_V= _A + _C + _G
_N= _A + _C + _G + _T

_dnahash={'a':_A,
          'c':_C,
          'g':_G,
          't':_T,
          'r':_R,
          'y':_Y,
          'm':_M,
          'k':_K,
          'w':_W,
          's':_S,
          'b':_B,
          'd':_D,
          'h':_H,
          'v':_V,
          'n':_N,
         }

def hashCodeIterator(sequence,wsize,degeneratemax=0,offset=0):
    errors = 0
    emask = [0] * wsize
    epointer = 0
    size = 0
    position = offset
    hashs = set([0])
    hashmask = 0
    for i in xrange(wsize):
        hashmask <<= 2
        hashmask += 3

    for l in sequence:
        l = l.lower()
        hl = _dnahash[l]

        if emask[epointer]:
            errors-=1
            emask[epointer]=0

        if len(hl) > 1:
            errors +=1
            emask[epointer]=1

        epointer+=1
        epointer%=wsize

        if errors > degeneratemax:
            hl=set([hl[0]])

        hashs=set((((hc<<2) | cl) & hashmask)
                  for hc in hashs
                  for cl in hl)

        if size < wsize:
            size+=1

        if size==wsize:
            if errors <= degeneratemax:
                yield (position,hashs,errors)
            position+=1

def hashSequence(sequence,wsize,degeneratemax=0,offset=0,hashs=None):
    if hashs is None:
        hashs=[[] for x in xrange(4**wsize)]

    for pos,keys,errors in hashCodeIterator(sequence, wsize, degeneratemax, offset):
        for k in keys:
            hashs[k].append(pos)

    return hashs

def hashSequences(sequences,wsize,maxpos,degeneratemax=0):
    hashs=None
    offsets=[]
    offset=0
    for s in sequences:
        offsets.append(offset)
        # keep the returned table: the first call allocates it
        hashs = hashSequence(s,wsize,degeneratemax=degeneratemax,offset=offset,hashs=hashs)
        offset+=len(s)

    return hashs,offsets
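
# A minimal usage sketch of the 2-bit rolling hash above: each window of
# wsize bases maps to an integer in [0, 4**wsize); degenerate IUPAC codes
# (r, y, n, ...) expand into several codes per window, bounded by
# degeneratemax.
#
#     for position, codes, errors in hashCodeIterator("acgtacgt", 4):
#         print position, sorted(codes), errors
#     # position 0 is the window "acgt": code (0<<6)|(1<<4)|(2<<2)|3 == 27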
0
obitools/ecobarcode/__init__.py
Normal file
32
obitools/ecobarcode/databases.py
Normal file
@ -0,0 +1,32 @@
'''
Created on 25 September 2010

@author: coissac
'''
from obitools import NucSequence

def referenceDBIterator(options):

    cursor = options.ecobarcodedb.cursor()

    # let psycopg2 quote the database name instead of interpolating it
    cursor.execute("select id from databases.database where name=%s",
                   (options.database,))
    options.dbid = cursor.fetchone()[0]

    cursor.execute('''
                      select s.accession,r.id,r.taxid,r.sequence
                      from databases.database d,
                           databases.reference r,
                           databases.relatedsequences s
                      where r.database = d.id
                        and s.reference= r.id
                        and s.mainac
                        and d.name = %s
                   ''', (options.database,)
                  )

    for ac,id,taxid,sequence in cursor:
        s = NucSequence(ac,sequence)
        s['taxid']=taxid
        s['refdbid']=id
        yield s
50
obitools/ecobarcode/ecotag.py
Normal file
@ -0,0 +1,50 @@
'''
Created on 25 September 2010

@author: coissac
'''

def alreadyIdentified(seqid,options):
    cursor = options.ecobarcodedb.cursor()
    cursor.execute('''
                     select count(*)
                     from ecotag.identification
                     where sequence=%s
                       and database=%s
                   ''',(int(seqid),int(options.dbid)))

    return int(cursor.fetchone()[0]) > 0

def storeIdentification(seqid,
                        idstatus,taxid,
                        matches,
                        options
                       ):

    cursor = options.ecobarcodedb.cursor()

    if not options.updatedb:
        cursor.execute('''
            delete from ecotag.identification where sequence=%s and database=%s
                       ''',(int(seqid),int(options.dbid)))

    cursor.execute('''
        insert into ecotag.identification (sequence,database,idstatus,taxid)
        values (%s,%s,%s,%s)
        returning id
                   ''' , (int(seqid),int(options.dbid),idstatus,int(taxid)))

    idid = cursor.fetchone()[0]

    for seq,identity in matches.iteritems():
        cursor.execute('''
            insert into ecotag.evidence (identification,reference,identity)
            values (%s,
                    %s,
                    %s)
                       ''',(idid,seq,identity))

    cursor.close()

    options.ecobarcodedb.commit()
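
# Flow sketch: storeIdentification() writes one ecotag.identification row,
# then one ecotag.evidence row per matched reference. Hypothetical call:
#
#     matches = {refid1: 0.98, refid2: 0.95}   # reference id -> identity
#     storeIdentification(seqid, 'ID', 9606, matches, options)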
64
obitools/ecobarcode/options.py
Normal file
@ -0,0 +1,64 @@
'''
Created on 23 September 2010

@author: coissac
'''
import psycopg2

from obitools.ecobarcode.taxonomy import EcoTaxonomyDB

def addEcoBarcodeDBOption(optionManager):
    optionManager.add_option('--dbname',
                             action="store", dest="ecobarcodedb",
                             type='str',
                             default=None,
                             help="Specify the name of the ecobarcode database")

    optionManager.add_option('--server',
                             action="store", dest="dbserver",
                             type='str',
                             default="localhost",
                             help="Specify the address of the ecobarcode database server")

    optionManager.add_option('--user',
                             action="store", dest="dbuser",
                             type='str',
                             default='postgres',
                             help="Specify the user of the ecobarcode database")

    optionManager.add_option('--port',
                             action="store", dest="dbport",
                             type='int',
                             default=5432,
                             help="Specify the port of the ecobarcode database")

    optionManager.add_option('--passwd',
                             action="store", dest="dbpasswd",
                             type='str',
                             default='',
                             help="Specify the password of the ecobarcode database")

    optionManager.add_option('--primer',
                             action="store", dest="primer",
                             type='str',
                             default=None,
                             help="Specify the primer used for amplification")


def ecobarcodeDatabaseConnection(options):
    if options.ecobarcodedb is not None:
        connection = psycopg2.connect(database=options.ecobarcodedb,
                                      user=options.dbuser,
                                      password=options.dbpasswd,
                                      host=options.dbserver,
                                      port=options.dbport)
        options.dbname=options.ecobarcodedb
    else:
        connection=None
    if connection is not None:
        options.ecobarcodedb=connection
        taxonomy = EcoTaxonomyDB(connection)
    else:
        taxonomy=None
    return taxonomy
38
obitools/ecobarcode/rawdata.py
Normal file
@ -0,0 +1,38 @@
'''
Created on 25 September 2010

@author: coissac
'''

from obitools import NucSequence
from obitools.utils import progressBar
from obitools.ecobarcode.ecotag import alreadyIdentified

import sys

def sequenceIterator(options):
    cursor = options.ecobarcodedb.cursor()

    cursor.execute('''
                     select s.id,sum(o.count),s.sequence
                     from rawdata.sequence s,
                          rawdata.occurrences o
                     where o.sequence= s.id
                       and s.primers = %s
                     group by s.id,s.sequence
                   ''', (options.primer,)
                  )

    nbseq = cursor.rowcount
    progressBar(1, nbseq, True, head=options.dbname)
    for id,count,sequence in cursor:
        progressBar(cursor.rownumber+1, nbseq, head=options.dbname)
        if not options.updatedb or not alreadyIdentified(id,options):
            s = NucSequence(id,sequence)
            s['count']=count
            print >>sys.stderr,' +', cursor.rownumber+1,
            yield s
        else:
            print >>sys.stderr,' @', cursor.rownumber+1,

    print >>sys.stderr
120
obitools/ecobarcode/taxonomy.py
Normal file
@ -0,0 +1,120 @@
'''
Created on 24 September 2010

@author: coissac
'''

from obitools.ecopcr.taxonomy import TaxonomyDump
from obitools.ecopcr.taxonomy import Taxonomy
import sys

class EcoTaxonomyDB(TaxonomyDump) :

    def __init__(self,dbconnect):
        self._dbconnect=dbconnect

        print >> sys.stderr,"Reading ecobarcode taxonomy database..."

        self._readNodeTable()
        print >> sys.stderr," ok"

        print >>sys.stderr,"Adding scientific names..."

        self._name=[]
        for taxid,name,classname in self._nameIterator():
            self._name.append((name,classname,self._index[taxid]))
            if classname == 'scientific name':
                self._taxonomy[self._index[taxid]].append(name)

        print >>sys.stderr,"Adding taxid aliases..."
        for taxid,current in self._mergedNodeIterator():
            self._index[taxid]=self._index[current]

        print >>sys.stderr,"Adding deleted taxids..."
        for taxid in self._deletedNodeIterator():
            self._index[taxid]=None


        Taxonomy.__init__(self)

    #####
    #
    # Iterator functions
    #
    #####

    def _readNodeTable(self):

        cursor = self._dbconnect.cursor()

        cursor.execute("""
                          select taxid,rank,parent
                          from ncbitaxonomy.nodes
                       """)

        print >>sys.stderr,"Reading taxonomy nodes..."
        taxonomy=[list(n) for n in cursor]

        print >>sys.stderr,"Listing all taxonomy ranks..."
        ranks =list(set(x[1] for x in taxonomy))
        ranks.sort()
        rankidx = dict(map(None,ranks,xrange(len(ranks))))

        print >>sys.stderr,"Sorting taxa..."
        taxonomy.sort(TaxonomyDump._taxonCmp)

        self._taxonomy=taxonomy

        print >>sys.stderr,"Indexing taxonomy..."
        index = {}
        for t in self._taxonomy:
            index[t[0]]=self._bsearchTaxon(t[0])

        print >>sys.stderr,"Indexing parent and rank..."
        for t in self._taxonomy:
            t[1]=rankidx[t[1]]
            t[2]=index[t[2]]

        self._ranks=ranks
        self._index=index

        cursor.close()

    def _nameIterator(self):
        cursor = self._dbconnect.cursor()

        cursor.execute("""
                          select taxid,name,nameclass
                          from ncbitaxonomy.names
                       """)

        for taxid,name,nameclass in cursor:
            yield taxid,name,nameclass

        cursor.close()

    def _mergedNodeIterator(self):
        cursor = self._dbconnect.cursor()

        cursor.execute("""
                          select oldtaxid,newtaxid
                          from ncbitaxonomy.merged
                       """)

        for oldtaxid,newtaxid in cursor:
            yield oldtaxid,newtaxid

        cursor.close()

    def _deletedNodeIterator(self):
        cursor = self._dbconnect.cursor()

        cursor.execute("""
                          select taxid
                          from ncbitaxonomy.delnodes
                       """)

        for taxid in cursor:
            yield taxid[0]

        cursor.close()
69
obitools/ecopcr/__init__.py
Normal file
@ -0,0 +1,69 @@
from obitools import utils
from obitools import NucSequence
from obitools.utils import universalOpen, universalTell, fileSize, progressBar
import struct
import sys


class EcoPCRFile(utils.ColumnFile):
    def __init__(self,stream):
        utils.ColumnFile.__init__(self,
                                  stream, '|', True,
                                  (str,int,int,
                                   str,int,str,
                                   int,str,int,
                                   str,int,str,
                                   str,str,int,float,
                                   str,int,float,
                                   int,
                                   str,str), "#")


    def next(self):
        data = utils.ColumnFile.next(self)
        seq = NucSequence(data[0],data[20],data[21])
        seq['seq_length_ori']=data[1]
        seq['taxid']=data[2]
        seq['rank']=data[3]
        seq['species']=data[4]
        seq['species_sn']=data[5]
        seq['genus']=data[6]
        seq['genus_sn']=data[7]
        seq['family']=data[8]
        seq['family_sn']=data[9]
        seq['strand']=data[12]
        seq['forward_primer']=data[13]
        seq['forward_error']=data[14]
        seq['forward_tm']=data[15]
        seq['reverse_primer']=data[16]
        seq['reverse_error']=data[17]
        seq['reverse_tm']=data[18]

        return seq


class EcoPCRDBFile(object):

    def _ecoRecordIterator(self,file):
        file = universalOpen(file)
        (recordCount,) = struct.unpack('> I',file.read(4))
        self._recover=False

        if recordCount:
            for i in xrange(recordCount):
                (recordSize,)=struct.unpack('>I',file.read(4))
                record = file.read(recordSize)
                yield record
        else:
            print >> sys.stderr,"\n\n WARNING : EcoPCRDB reading set into recover data mode\n"
            self._recover=True
            ok=True
            while(ok):
                try:
                    (recordSize,)=struct.unpack('>I',file.read(4))
                    record = file.read(recordSize)
                    yield record
                except:
                    ok=False
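
# On-disk framing read by _ecoRecordIterator, as a sketch: a 4-byte
# big-endian record count, then for each record a 4-byte big-endian payload
# size followed by the payload itself. Writing one record by hand:
#
#     import struct
#     payload = "some packed record"
#     out = open('db_001.sdx','wb')
#     out.write(struct.pack('> I', 1))             # record count
#     out.write(struct.pack('> I', len(payload)))  # record size
#     out.write(payload)
#     out.close()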
104
obitools/ecopcr/annotation.py
Normal file
@ -0,0 +1,104 @@
import struct

class EcoPCRDBAnnotationWriter(object):
    '''
    Class used to write an annotation description in EcoPCRDB format.

    EcoPCRDBAnnotationWriter is often called through the EcoPCRDBSequenceWriter class

    @see: L{ecopcr.sequence.EcoPCRDBSequenceWriter}
    '''

    def __init__(self,dbname,id,fileidx=1,type=('CDS',),definition=None):
        '''
        class constructor

        @param dbname: name of ecoPCR database
        @type dbname: C{str}
        @param id: name of the qualifier used as feature id
        @type id: C{str}
        @param fileidx:
        @type fileidx: C{int}
        @param type:
        @type type: C{list} or C{tuple}
        @param definition:
        @type definition: C{str}
        '''
        # note: the default is a one-element tuple; a bare ('CDS') would be
        # the string 'CDS' and would be indexed character by character
        self._type = type
        self._definition = definition
        self._id = id
        self._filename="%s_%03d.adx" % (dbname,fileidx)
        self._file = open(self._filename,'wb')
        self._sequenceIdx=0


        ftname ="%s.fdx" % (dbname)
        ft = open(ftname,'wb')

        self._fttypeidx=dict(map(None,type,xrange(len(type))))

        ft.write(struct.pack('> I',len(type)))

        for t in type:
            ft.write(self._ecoFtTypePacker(t))

        ft.close()

        self._annotationCount=0
        self._file.write(struct.pack('> I',self._annotationCount))


    def _ecoFtTypePacker(self,type):
        totalSize = len(type)
        packed = struct.pack('> I %ds' % totalSize,totalSize,type)

        assert len(packed) == totalSize+4, "error in feature type packing"

        return packed

    def _ecoAnnotationPacker(self,feature,seqidx):
        begin = feature.begin-1
        end = feature.end
        type = self._fttypeidx[feature.ftType]
        strand = feature.isDirect()
        id = feature[self._id][0]
        if self._definition in feature:
            definition = feature[self._definition][0]
        else:
            definition = ''

        assert strand is not None,"Only strand defined features can be stored"

        deflength = len(definition)

        totalSize = 4 + 4 + 4 + 4 + 4 + 20 + 4 + deflength

        packed = struct.pack('> I I I I I 20s I %ds' % (deflength),
                             totalSize,
                             seqidx,
                             begin,
                             end,
                             type,
                             int(strand),
                             id,
                             deflength,
                             definition)

        assert len(packed) == totalSize+4, "error in annotation packing"

        return packed


    def put(self,sequence,seqidx=None):
        if seqidx is None:
            seqidx = self._sequenceIdx
            self._sequenceIdx+=1
        for feature in sequence.getFeatureTable():
            if feature.ftType in self._type:
                self._annotationCount+=1
                self._file.write(self._ecoAnnotationPacker(feature,seqidx))

    def __del__(self):
        self._file.seek(0,0)
        self._file.write(struct.pack('> I',self._annotationCount))
        self._file.close()
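
# Usage sketch, assuming annotated sequences exposing getFeatureTable():
#
#     w = EcoPCRDBAnnotationWriter('mydb', 'locus_tag', type=('CDS','tRNA'))
#     for seq in sequences:
#         w.put(seq)
#     del w        # __del__ rewrites the annotation count header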
129
obitools/ecopcr/options.py
Normal file
@ -0,0 +1,129 @@
'''
Created on 13 February 2011

@author: coissac
'''

from obitools.ecopcr.taxonomy import Taxonomy, EcoTaxonomyDB, TaxonomyDump, ecoTaxonomyWriter

try:
    from obitools.ecobarcode.options import addEcoBarcodeDBOption,ecobarcodeDatabaseConnection
except ImportError:
    def addEcoBarcodeDBOption(optionmanager):
        pass
    def ecobarcodeDatabaseConnection(options):
        return None

def addTaxonomyDBOptions(optionManager):
    addEcoBarcodeDBOption(optionManager)
    optionManager.add_option('-d','--database',
                             action="store", dest="taxonomy",
                             metavar="<FILENAME>",
                             type="string",
                             help="ecoPCR taxonomy Database "
                                  "name")
    optionManager.add_option('-t','--taxonomy-dump',
                             action="store", dest="taxdump",
                             metavar="<FILENAME>",
                             type="string",
                             help="NCBI Taxonomy dump repository "
                                  "name")


def addTaxonomyFilterOptions(optionManager):
    addTaxonomyDBOptions(optionManager)
    optionManager.add_option('--require-rank',
                             action="append",
                             dest='requiredRank',
                             metavar="<RANK_NAME>",
                             type="string",
                             default=[],
                             help="select sequences whose taxid tag has "
                                  "a parent of rank <RANK_NAME>")

    optionManager.add_option('-r','--required',
                             action="append",
                             dest='required',
                             metavar="<TAXID>",
                             type="int",
                             default=[],
                             help="required taxid")

    optionManager.add_option('-i','--ignore',
                             action="append",
                             dest='ignored',
                             metavar="<TAXID>",
                             type="int",
                             default=[],
                             help="ignored taxid")

def loadTaxonomyDatabase(options):
    if isinstance(options.taxonomy, Taxonomy):
        return options.taxonomy
    taxonomy = ecobarcodeDatabaseConnection(options)
    if (taxonomy is not None or
        options.taxonomy is not None or
        options.taxdump is not None):
        if options.taxdump is not None:
            taxonomy = TaxonomyDump(options.taxdump)
        if taxonomy is not None and isinstance(options.taxonomy, str):
            ecoTaxonomyWriter(options.taxonomy,taxonomy)
            options.ecodb=options.taxonomy
        if isinstance(options.taxonomy, Taxonomy):
            taxonomy = options.taxonomy
        if taxonomy is None and isinstance(options.taxonomy, str):
            taxonomy = EcoTaxonomyDB(options.taxonomy)
            options.ecodb=options.taxonomy
        options.taxonomy=taxonomy
    return options.taxonomy

def taxonomyFilterGenerator(options):
    loadTaxonomyDatabase(options)
    if options.taxonomy is not None:
        taxonomy=options.taxonomy
        def taxonomyFilter(seq):
            def annotateAtRank(seq,rank):
                if 'taxid' in seq and seq['taxid'] is not None:
                    rtaxid= taxonomy.getTaxonAtRank(seq['taxid'],rank)
                    return rtaxid
                return None
            good = True
            if 'taxid' in seq:
                taxid = seq['taxid']
                if options.requiredRank:
                    taxonatrank = reduce(lambda x,y: x and y,
                                         (annotateAtRank(seq,rank) is not None
                                          for rank in options.requiredRank),True)
                    good = good and taxonatrank
                if options.required:
                    good = good and reduce(lambda x,y: x or y,
                                           (taxonomy.isAncestor(r,taxid) for r in options.required),
                                           False)
                if options.ignored:
                    good = good and not reduce(lambda x,y: x or y,
                                               (taxonomy.isAncestor(r,taxid) for r in options.ignored),
                                               False)

            return good


    else:
        def taxonomyFilter(seq):
            return True

    return taxonomyFilter

def taxonomyFilterIteratorGenerator(options):
    taxonomyFilter = taxonomyFilterGenerator(options)

    def filterIterator(seqiterator):
        for seq in seqiterator:
            if taxonomyFilter(seq):
                yield seq

    return filterIterator
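
# A minimal usage sketch, assuming 'options' came from an optparse parser
# extended with addTaxonomyFilterOptions and 'reader' yields BioSequences:
#
#     keepSelected = taxonomyFilterIteratorGenerator(options)
#     for seq in keepSelected(reader):
#         process(seq)    # process() is a hypothetical consumer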
133
obitools/ecopcr/sequence.py
Normal file
@ -0,0 +1,133 @@
from obitools import NucSequence
from obitools.ecopcr import EcoPCRDBFile
from obitools.ecopcr.taxonomy import EcoTaxonomyDB, ecoTaxonomyWriter
from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
from obitools.utils import universalOpen
from glob import glob
import struct
import gzip
import sys


class EcoPCRDBSequenceIterator(EcoPCRDBFile):
    '''
    Build an iterator over the sequences included in a sequence database
    formatted for ecoPCR
    '''

    def __init__(self,path,taxonomy=None):
        '''
        ecoPCR data iterator constructor

        @param path: path to the ecoPCR database including the database prefix name
        @type path: C{str}
        @param taxonomy: a taxonomy can be given to the reader to decode the taxonomic data
                         associated to the sequences. If no taxonomy is furnished, it will be read
                         before the sequence database files using the same path.
        @type taxonomy: L{obitools.ecopcr.taxonomy.Taxonomy}
        '''
        self._path = path

        if taxonomy is not None:
            self._taxonomy=taxonomy
        else:
            self._taxonomy=EcoTaxonomyDB(path)

        self._seqfilesFiles = glob('%s_???.sdx' % self._path)
        self._seqfilesFiles.sort()

    def __ecoSequenceIterator(self,file):
        for record in self._ecoRecordIterator(file):
            lrecord = len(record)
            lnames = lrecord - (4*4+20)
            (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
            seqid=seqid.strip('\x00')
            de = string[:deflength]
            seq = gzip.zlib.decompress(string[deflength:])
            bioseq = NucSequence(seqid,seq,de,taxidx=taxid,taxid=self._taxonomy._taxonomy[taxid][0])
            yield bioseq

    def __iter__(self):
        for seqfile in self._seqfilesFiles:
            for seq in self.__ecoSequenceIterator(seqfile):
                yield seq

class EcoPCRDBSequenceWriter(object):

    def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None,append=False):
        self._taxonomy=taxonomy
        self._filename="%s_%03d.sdx" % (dbname,fileidx)
        if append:
            mode ='r+b'
            f = universalOpen(self._filename)
            (recordCount,) = struct.unpack('> I',f.read(4))
            self._sequenceCount=recordCount
            del f
            self._file = open(self._filename,mode)
            self._file.seek(0,0)
            self._file.write(struct.pack('> I',0))
            self._file.seek(0,2)
        else:
            self._sequenceCount=0
            mode = 'wb'
            self._file = open(self._filename,mode)
            self._file.write(struct.pack('> I',self._sequenceCount))

        if self._taxonomy is not None:
            print >> sys.stderr,"Writing the taxonomy file...",
            ecoTaxonomyWriter(dbname,self._taxonomy)
            print >> sys.stderr,"Ok"

        if type is not None:
            assert ftid is not None,"You must specify an id attribute for features"
            self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
        else:
            self._annotation = None

    def _ecoSeqPacker(self,seq):

        compactseq = gzip.zlib.compress(str(seq).upper(),9)
        cptseqlength = len(compactseq)
        delength = len(seq.definition)

        totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength

        if self._taxonomy is None or 'taxid' not in seq:
            taxon=-1
        else:
            taxon=self._taxonomy.findIndex(seq['taxid'])

        try:
            packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength),
                                 totalSize,
                                 taxon,
                                 seq.id,
                                 delength,
                                 len(seq),
                                 cptseqlength,
                                 seq.definition,
                                 compactseq)
        except struct.error as e:
            print >>sys.stderr,"\n\n============\n\nError on sequence : %s\n\n" % seq.id
            raise e

        assert len(packed) == totalSize+4, "error in sequence packing"

        return packed


    def put(self,sequence):
        if self._taxonomy is not None:
            if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'):
                sequence.extractTaxon()
        self._file.write(self._ecoSeqPacker(sequence))
        if self._annotation is not None:
            self._annotation.put(sequence, self._sequenceCount)
        self._sequenceCount+=1

    def __del__(self):
        self._file.seek(0,0)
        self._file.write(struct.pack('> I',self._sequenceCount))
        self._file.close()
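
# Round-trip sketch of the writer/reader pair above, assuming 'seqs' is an
# iterable of NucSequence objects and 'taxo' an already loaded Taxonomy:
#
#     w = EcoPCRDBSequenceWriter('mydb',taxonomy=taxo)
#     for s in seqs:
#         w.put(s)
#     del w                                  # __del__ patches the record count
#
#     for s in EcoPCRDBSequenceIterator('mydb',taxonomy=taxo):
#         print s.id, len(s)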
630
obitools/ecopcr/taxonomy.py
Normal file
@ -0,0 +1,630 @@
import struct
import sys

from itertools import count,imap

from obitools.ecopcr import EcoPCRDBFile
from obitools.utils import universalOpen
from obitools.utils import ColumnFile

class Taxonomy(object):
    def __init__(self):
        '''
        The taxonomy database constructor.

        It finalizes the rank and name indexes of a taxonomy
        already loaded by a subclass constructor.
        '''

        self._ranks.append('obi')

        self._speciesidx = self._ranks.index('species')
        self._genusidx = self._ranks.index('genus')
        self._familyidx = self._ranks.index('family')
        self._orderidx = self._ranks.index('order')
        self._nameidx=dict((x[0],x[2]) for x in self._name)
        self._nameidx.update(dict((x[0],x[2]) for x in self._preferedName))
        self._preferedidx=dict((x[2],x[1]) for x in self._preferedName)

        self._bigestTaxid = max(x[0] for x in self._taxonomy)


    def findTaxonByIdx(self,idx):
        if idx is None:
            return None
        return self._taxonomy[idx]

    def findIndex(self,taxid):
        if taxid is None:
            return None
        return self._index[taxid]

    def findTaxonByTaxid(self,taxid):
        return self.findTaxonByIdx(self.findIndex(taxid))

    def findTaxonByName(self,name):
        return self._taxonomy[self._nameidx[name]]

    def findRankByName(self,rank):
        try:
            return self._ranks.index(rank)
        except ValueError:
            return None

    def __contains__(self,taxid):
        return self.findTaxonByTaxid(taxid) is not None


    #####
    #
    # PUBLIC METHODS
    #
    #####


    def subTreeIterator(self, taxid):
        "return subtree for given taxonomic id "
        idx = self.findIndex(taxid)
        yield self._taxonomy[idx]
        for t in self._taxonomy:
            if t[2] == idx:
                for subt in self.subTreeIterator(t[0]):
                    yield subt

    def parentalTreeIterator(self, taxid):
        """
        return parental tree for given taxonomic id starting from
        the first ancestor up to the root.
        """
        taxon=self.findTaxonByTaxid(taxid)
        if taxon is not None:
            while taxon[2]!= 0:
                yield taxon
                taxon = self._taxonomy[taxon[2]]
            yield self._taxonomy[0]
        else:
            raise StopIteration

    def isAncestor(self,parent,taxid):
        return parent in [x[0] for x in self.parentalTreeIterator(taxid)]

    def lastCommonTaxon(self,*taxids):
        if not taxids:
            return None
        if len(taxids)==1:
            return taxids[0]

        if len(taxids)==2:
            t1 = [x[0] for x in self.parentalTreeIterator(taxids[0])]
            t2 = [x[0] for x in self.parentalTreeIterator(taxids[1])]
            t1.reverse()
            t2.reverse()

            count = min(len(t1),len(t2))
            i=0
            while(i < count and t1[i]==t2[i]):
                i+=1
            i-=1

            return t1[i]

        ancetre = taxids[0]
        for taxon in taxids[1:]:
            ancetre = self.lastCommonTaxon(ancetre,taxon)

        return ancetre

    def betterCommonTaxon(self,error=1,*taxids):
        lca = self.lastCommonTaxon(*taxids)
        idx = self._index[lca]
        sublca = [t[0] for t in self._taxonomy if t[2]==idx]
        return sublca


    def getPreferedName(self,taxid):
        idx = self.findIndex(taxid)
        return self._preferedidx.get(idx,self._taxonomy[idx][3])


    def getScientificName(self,taxid):
        return self.findTaxonByTaxid(taxid)[3]

    def getRankId(self,taxid):
        return self.findTaxonByTaxid(taxid)[1]

    def getRank(self,taxid):
        return self._ranks[self.getRankId(taxid)]

    def getTaxonAtRank(self,taxid,rankid):
        if isinstance(rankid, str):
            rankid=self._ranks.index(rankid)
        try:
            return [x[0] for x in self.parentalTreeIterator(taxid)
                    if x[1]==rankid][0]
        except IndexError:
            return None

    def getSpecies(self,taxid):
        return self.getTaxonAtRank(taxid, self._speciesidx)

    def getGenus(self,taxid):
        return self.getTaxonAtRank(taxid, self._genusidx)

    def getFamily(self,taxid):
        return self.getTaxonAtRank(taxid, self._familyidx)

    def getOrder(self,taxid):
        return self.getTaxonAtRank(taxid, self._orderidx)

    def rankIterator(self):
        for x in imap(None,self._ranks,xrange(len(self._ranks))):
            yield x

    def groupTaxa(self,taxa,groupname):
        t=[self.findTaxonByTaxid(x) for x in taxa]
        a=set(x[2] for x in t)
        assert len(a)==1,"All taxa must have the same parent"
        newtaxid=max([2999999]+[x[0] for x in self._taxonomy if x[0]>=3000000 and x[0]<4000000])+1
        newidx=len(self._taxonomy)
        if 'GROUP' not in self._ranks:
            self._ranks.append('GROUP')
        rankid=self._ranks.index('GROUP')
        self._taxonomy.append((newtaxid,rankid,a.pop(),groupname))
        for x in t:
            x[2]=newidx

    def addLocalTaxon(self,name,rank,parent,minimaltaxid=10000000):
        newtaxid = minimaltaxid if (self._bigestTaxid < minimaltaxid) else self._bigestTaxid+1

        rankid=self.findRankByName(rank)
        parentidx = self.findIndex(int(parent))
        tx = (newtaxid,rankid,parentidx,name,'local')
        self._taxonomy.append(tx)
        newidx=len(self._taxonomy)-1
        self._name.append((name,'scientific name',newidx))
        self._nameidx[name]=newidx
        self._index[newtaxid]=newidx

        self._bigestTaxid=newtaxid

        return newtaxid

    def removeLocalTaxon(self,taxid):
        raise NotImplementedError
        txidx = self.findIndex(taxid)
        taxon = self.findTaxonByIdx(txidx)

        assert txidx >= self._localtaxon,"Only local taxon can be deleted"

        for t in self._taxonomy:
            if t[2] == txidx:
                self.removeLocalTaxon(t[0])

        return taxon

    def addPreferedName(self,taxid,name):
        idx = self.findIndex(taxid)
        self._preferedName.append((name,'obi',idx))
        self._preferedidx[idx]=name
        return taxid

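# How lastCommonTaxon walks the tree, as a sketch: both root-to-taxon paths
# are compared from the root until they diverge; the last shared node is the
# LCA. With hypothetical paths
#
#     root -> 2759 -> 33090 -> 3193       (taxid a)
#     root -> 2759 -> 33090 -> 35493      (taxid b)
#
# the shared prefix ends at 33090, so lastCommonTaxon(a,b) returns 33090;
# more than two taxids are folded pairwise through the same routine.
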
class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
    '''
    A taxonomy database class
    '''


    def __init__(self,path):
        '''
        The taxonomy database constructor

        @param path: path to the ecoPCR database including the database prefix name
        @type path: C{str}
        '''
        self._path = path
        self._taxonFile = "%s.tdx" % self._path
        self._localTaxonFile = "%s.ldx" % self._path
        self._ranksFile = "%s.rdx" % self._path
        self._namesFile = "%s.ndx" % self._path
        self._preferedNamesFile = "%s.pdx" % self._path
        self._aliasFile = "%s.adx" % self._path

        print >> sys.stderr,"Reading binary taxonomy database...",

        self.__readNodeTable()

        print >> sys.stderr," ok"

        Taxonomy.__init__(self)


    #####
    #
    # Iterator functions
    #
    #####

    def __ecoNameIterator(self,file):
        for record in self._ecoRecordIterator(file):
            lrecord = len(record)
            lnames = lrecord - 16
            (isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record)
            name=names[:namelength]
            classname=names[namelength:]
            yield (name,classname,indextaxid)


    def __ecoTaxonomicIterator(self):
        for record in self._ecoRecordIterator(self._taxonFile):
            lrecord = len(record)
            lnames = lrecord - 16
            (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
            yield (taxid,rankid,parentidx,name,'ncbi')

        try :
            lt=0
            for record in self._ecoRecordIterator(self._localTaxonFile):
                lrecord = len(record)
                lnames = lrecord - 16
                (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
                lt+=1
                yield (taxid,rankid,parentidx,name,'local')
            print >> sys.stderr, " [INFO : Local taxon file found] : %d added taxa" % lt
        except:
            print >> sys.stderr, " [INFO : Local taxon file not found] "

    def __ecoRankIterator(self):
        for record in self._ecoRecordIterator(self._ranksFile):
            yield record

    def __ecoAliasIterator(self):
        for record in self._ecoRecordIterator(self._aliasFile):
            (taxid,index) = struct.unpack('> I i',record)
            yield taxid,index

    #####
    #
    # Indexes
    #
    #####

    def __ecoNameIndex(self):
        indexName = [x for x in self.__ecoNameIterator(self._namesFile)]
        return indexName

    def __ecoRankIndex(self):
        rank = [r for r in self.__ecoRankIterator()]
        return rank

    def __ecoTaxonomyIndex(self):
        taxonomy = []

        try :
            index = dict(self.__ecoAliasIterator())
            print >> sys.stderr, " [INFO : Taxon alias file found] "
            buildIndex=False
        except:
            print >> sys.stderr, " [INFO : Taxon alias file not found] "
            index={}
            buildIndex=True

        localtaxon=0
        i=0
        for x in self.__ecoTaxonomicIterator():
            taxonomy.append(x)
            if x[4]=='ncbi':
                localtaxon+=1

            if buildIndex or x[4]!='ncbi':
                index[x[0]] = i
            i+=1

        print >> sys.stderr,"Taxonomic tree read",
        return taxonomy, index,localtaxon

    def __readNodeTable(self):
        self._taxonomy, self._index, self._localtaxon= self.__ecoTaxonomyIndex()
        self._ranks = self.__ecoRankIndex()
        self._name = self.__ecoNameIndex()

        # Add local taxon names to the name index
        i=self._localtaxon
        for t in self._taxonomy[self._localtaxon:]:
            self._name.append((t[3],'scientific name',i))
            i+=1

        try :
            self._preferedName = [(x[0],'obi',x[2])
                                  for x in self.__ecoNameIterator(self._preferedNamesFile)]
            print >> sys.stderr, " [INFO : Preferred taxon name file found] : %d added names" % len(self._preferedName)
        except:
            print >> sys.stderr, " [INFO : Preferred taxon name file not found]"
            self._preferedName = []


class TaxonomyDump(Taxonomy):

    def __init__(self,taxdir):

        self._path=taxdir
        self._readNodeTable('%s/nodes.dmp' % taxdir)

        print >>sys.stderr,"Adding scientific names..."

        self._name=[]
        for taxid,name,classname in self._nameIterator('%s/names.dmp' % taxdir):
            self._name.append((name,classname,self._index[taxid]))
            if classname == 'scientific name':
                self._taxonomy[self._index[taxid]].extend([name,'ncbi'])

        print >>sys.stderr,"Adding taxid aliases..."
        for taxid,current in self._mergedNodeIterator('%s/merged.dmp' % taxdir):
            self._index[taxid]=self._index[current]

        print >>sys.stderr,"Adding deleted taxids..."
        for taxid in self._deletedNodeIterator('%s/delnodes.dmp' % taxdir):
            self._index[taxid]=None

        self._nameidx=dict((x[0],x[2]) for x in self._name)


    def _taxonCmp(t1,t2):
        if t1[0] < t2[0]:
            return -1
        elif t1[0] > t2[0]:
            return +1
        return 0

    _taxonCmp=staticmethod(_taxonCmp)

    def _bsearchTaxon(self,taxid):
        taxCount = len(self._taxonomy)
        begin = 0
        end = taxCount
        oldcheck=taxCount
        check = (begin + end) / 2
        while check != oldcheck and self._taxonomy[check][0]!=taxid :
            if self._taxonomy[check][0] < taxid:
                begin=check
            else:
                end=check
            oldcheck=check
            check = (begin + end) / 2

        if self._taxonomy[check][0]==taxid:
            return check
        else:
            return None


    def _readNodeTable(self,file):

        file = universalOpen(file)

        nodes = ColumnFile(file,
                           sep='|',
                           types=(int,int,str,
                                  str,str,bool,
                                  int,bool,int,
                                  bool,bool,bool,str))
        print >>sys.stderr,"Reading taxonomy dump file..."
        # (taxid,rank,parent)
        taxonomy=[[n[0],n[2],n[1]] for n in nodes]

        print >>sys.stderr,"Listing all taxonomy ranks..."
        ranks =list(set(x[1] for x in taxonomy))
        ranks.sort()
        rankidx = dict(map(None,ranks,xrange(len(ranks))))

        print >>sys.stderr,"Sorting taxa..."
        taxonomy.sort(TaxonomyDump._taxonCmp)

        self._taxonomy=taxonomy
        self._localtaxon=len(taxonomy)

        print >>sys.stderr,"Indexing taxonomy..."
        index = {}
        for t in self._taxonomy:
            index[t[0]]=self._bsearchTaxon(t[0])

        print >>sys.stderr,"Indexing parent and rank..."
        for t in self._taxonomy:
            t[1]=rankidx[t[1]]
            t[2]=index[t[2]]

        self._ranks=ranks
        self._index=index
        self._preferedName = []

    def _nameIterator(self,file):
        file = universalOpen(file)
        names = ColumnFile(file,
                           sep='|',
                           types=(int,str,
                                  str,str))
        for taxid,name,unique,classname,white in names:
            yield taxid,name,classname

    def _mergedNodeIterator(self,file):
        file = universalOpen(file)
        merged = ColumnFile(file,
                            sep='|',
                            types=(int,int,str))
        for taxid,current,white in merged:
            yield taxid,current

    def _deletedNodeIterator(self,file):
        file = universalOpen(file)
        deleted = ColumnFile(file,
                             sep='|',
                             types=(int,str))
        for taxid,white in deleted:
            yield taxid

#####
#
#
# Binary writer
#
#
#####

def ecoTaxonomyWriter(prefix, taxonomy,onlyLocal=False):

    def ecoTaxPacker(tx):

        namelength = len(tx[3])

        totalSize = 4 + 4 + 4 + 4 + namelength

        packed = struct.pack('> I I I I I %ds' % namelength,
                             totalSize,
                             tx[0],
                             tx[1],
                             tx[2],
                             namelength,
                             tx[3])

        return packed

    def ecoRankPacker(rank):

        namelength = len(rank)

        packed = struct.pack('> I %ds' % namelength,
                             namelength,
                             rank)

        return packed

    def ecoAliasPacker(taxid,index):

        totalSize = 4 + 4
        try:
            packed = struct.pack('> I I i',
                                 totalSize,
                                 taxid,
                                 index)
        except struct.error as e:
            print >>sys.stderr,(totalSize,taxid,index)
            print >>sys.stderr,"Total size : %d taxid : %d index : %d" %(totalSize,taxid,index)
            raise e

        return packed

    def ecoNamePacker(name):

        namelength = len(name[0])
        classlength= len(name[1])
        totalSize = namelength + classlength + 4 + 4 + 4 + 4

        packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength),
                             totalSize,
                             int(name[1]=='scientific name'),
                             namelength,
                             classlength,
                             name[2],
                             name[0],
                             name[1])

        return packed


    def ecoTaxWriter(file,taxonomy):
        output = open(file,'wb')
        nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]=='ncbi'),0)

        output.write(struct.pack('> I',nbtaxon))

        for tx in taxonomy:
            if tx[4]=='ncbi':
                output.write(ecoTaxPacker(tx))

        output.close()
        return nbtaxon < len(taxonomy)

    def ecoLocalTaxWriter(file,taxonomy):
        nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]!='ncbi'),0)

        if nbtaxon:
            output = open(file,'wb')

            output.write(struct.pack('> I',nbtaxon))

            for tx in taxonomy:
                if tx[4]!='ncbi':
                    output.write(ecoTaxPacker(tx))

            output.close()


    def ecoRankWriter(file,ranks):
        output = open(file,'wb')
        output.write(struct.pack('> I',len(ranks)))

        for rank in ranks:
            output.write(ecoRankPacker(rank))

        output.close()

    def ecoAliasWriter(file,index):
        output = open(file,'wb')
        output.write(struct.pack('> I',len(index)))

        for taxid in index:
            i=index[taxid]
            if i is None:
                i=-1
            output.write(ecoAliasPacker(taxid, i))

        output.close()

    def nameCmp(n1,n2):
        name1=n1[0].upper()
        name2=n2[0].upper()
        if name1 < name2:
            return -1
        elif name1 > name2:
            return 1
        return 0


    def ecoNameWriter(file,names):
        output = open(file,'wb')
        output.write(struct.pack('> I',len(names)))

        names.sort(nameCmp)

        for name in names:
            output.write(ecoNamePacker(name))

        output.close()

    def ecoPreferedNameWriter(file,names):
        output = open(file,'wb')
        output.write(struct.pack('> I',len(names)))
        for name in names:
            output.write(ecoNamePacker(name))

        output.close()

    localtaxon=True
    if not onlyLocal:
        ecoRankWriter('%s.rdx' % prefix, taxonomy._ranks)
        localtaxon = ecoTaxWriter('%s.tdx' % prefix, taxonomy._taxonomy)
        ecoNameWriter('%s.ndx' % prefix, [x for x in taxonomy._name if x[2] < taxonomy._localtaxon])
        ecoAliasWriter('%s.adx' % prefix, taxonomy._index)
    if localtaxon:
        ecoLocalTaxWriter('%s.ldx' % prefix, taxonomy._taxonomy)
    if taxonomy._preferedName:
        ecoNameWriter('%s.pdx' % prefix, taxonomy._preferedName)
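
# Typical call, as a sketch: dump an NCBI taxdump directory into the binary
# ecoPCR index files (prefix.rdx, .tdx, .ndx, .adx, plus .ldx/.pdx when
# local taxa or preferred names exist):
#
#     taxo = TaxonomyDump('/path/to/taxdump')   # hypothetical path
#     ecoTaxonomyWriter('mydb', taxo)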
2
obitools/ecotag/__init__.py
Normal file
@ -0,0 +1,2 @@
class EcoTagResult(dict):
    pass
150
obitools/ecotag/parser.py
Normal file
@ -0,0 +1,150 @@
from itertools import imap

from obitools import utils

from obitools.ecotag import EcoTagResult


class EcoTagFileIterator(utils.ColumnFile):

    @staticmethod
    def taxid(x):
        x = int(x)
        if x < 0:
            return None
        else:
            return x

    @staticmethod
    def scientificName(x):
        if x=='--':
            return None
        else:
            return x

    @staticmethod
    def value(x):
        if x=='--':
            return None
        else:
            return float(x)

    @staticmethod
    def count(x):
        if x=='--':
            return None
        else:
            return int(x)


    def __init__(self,stream):
        utils.ColumnFile.__init__(self,
                                  stream, '\t', True,
                                  (str,str,str,
                                   EcoTagFileIterator.value,
                                   EcoTagFileIterator.value,
                                   EcoTagFileIterator.value,
                                   EcoTagFileIterator.count,
                                   EcoTagFileIterator.count,
                                   EcoTagFileIterator.taxid,
                                   EcoTagFileIterator.scientificName,
                                   str,
                                   EcoTagFileIterator.taxid,
                                   EcoTagFileIterator.scientificName,
                                   EcoTagFileIterator.taxid,
                                   EcoTagFileIterator.scientificName,
                                   EcoTagFileIterator.taxid,
                                   EcoTagFileIterator.scientificName,
                                   str
                                  ))
        self._memory=None

    _colname = ['identification',
                'seqid',
                'best_match_ac',
                'max_identity',
                'min_identity',
                'theorical_min_identity',
                'count',
                'match_count',
                'taxid',
                'scientific_name',
                'rank',
                'order_taxid',
                'order_sn',
                'family_taxid',
                'family_sn',
                'genus_taxid',
                'genus_sn',
                'species_taxid',
                'species_sn',
                'sequence']

    def next(self):
        if self._memory is not None:
            data=self._memory
            self._memory=None
        else:
            data = utils.ColumnFile.next(self)
        data = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(data)],data))

        if data['identification']=='ID':
            data.cd=[]
            try:
                nextone = utils.ColumnFile.next(self)
                nextone = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(nextone)],nextone))
            except StopIteration:
                nextone = None
            while nextone is not None and nextone['identification']=='CD':
                data.cd.append(nextone)
                try:
                    nextone = utils.ColumnFile.next(self)
                    nextone = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(nextone)],nextone))
                except StopIteration:
                    nextone = None
            self._memory=nextone

        return data

def ecoTagIdentifiedFilter(ecoTagIterator):
    for x in ecoTagIterator:
        if x['identification']=='ID':
            yield x


class EcoTagAbstractIterator(utils.ColumnFile):

    _colname = ['scientific_name',
                'taxid',
                'rank',
                'count',
                'max_identity',
                'min_identity']


    @staticmethod
    def taxid(x):
        x = int(x)
        if x < 0:
            return None
        else:
            return x

    def __init__(self,stream):
        utils.ColumnFile.__init__(self,
                                  stream, '\t', True,
                                  (str,
                                   EcoTagFileIterator.taxid,
                                   str,
                                   int,
                                   float,float,float))

    def next(self):
        data = utils.ColumnFile.next(self)
        data = dict(imap(None,EcoTagAbstractIterator._colname,data))

        return data

def ecoTagAbstractFilter(ecoTagAbsIterator):
    for x in ecoTagAbsIterator:
        if x['taxid'] is not None:
            yield x
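A minimal usage sketch for this parser (assuming Python 2 and a tab-separated ecoTag result file; the file name 'result.ecotag' is hypothetical):

    from obitools.ecotag.parser import EcoTagFileIterator, ecoTagIdentifiedFilter

    # iterate only over the 'ID' records; each record is an EcoTagResult dict
    for result in ecoTagIdentifiedFilter(EcoTagFileIterator(open('result.ecotag'))):
        print result['seqid'], result['scientific_name'], result['max_identity']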
54
obitools/eutils/__init__.py
Normal file
@ -0,0 +1,54 @@
import time
from urllib2 import urlopen
import shelve
from threading import Lock
import sys

class EUtils(object):
    '''
    Base class handling the request throttling imposed by the
    NCBI E-utilities services.
    '''

    _last_request=0
    _interval=3

    def __init__(self):
        self._lock = Lock()

    def wait(self):
        now=time.time()
        delta = now - EUtils._last_request
        while delta < EUtils._interval:
            # sleep for the time remaining before the next request is allowed
            time.sleep(EUtils._interval - delta)
            now=time.time()
            delta = now - EUtils._last_request

    def _sendRequest(self,url):
        self.wait()
        EUtils._last_request=time.time()
        t = EUtils._last_request
        print >>sys.stderr,"Sending request to NCBI @ %f" % t
        data = urlopen(url).read()
        print >>sys.stderr,"Data read from NCBI @ %f (%f)" % (t,time.time()-t)
        return data

    def setInterval(self,seconde):
        EUtils._interval=seconde


class EFetch(EUtils):
    '''
    Client for the NCBI Entrez efetch service.
    '''
    def __init__(self,db,tool='OBITools',
                 retmode='text',rettype="native",
                 server='eutils.ncbi.nlm.nih.gov'):
        EUtils.__init__(self)
        self._url = "http://%s/entrez/eutils/efetch.fcgi?db=%s&tool=%s&retmode=%s&rettype=%s"
        self._url = self._url % (server,db,tool,retmode,rettype)


    def get(self,**args):
        key = "&".join(['%s=%s' % x for x in args.items()])
        return self._sendRequest(self._url +"&" + key)
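A usage sketch for the EFetch client (the database name and accession are illustrative values, not taken from the source):

    from obitools.eutils import EFetch

    # fetch one GenBank flat-file record from the 'nucleotide' database
    efetch = EFetch(db='nucleotide', rettype='gb')
    record = efetch.get(id='AB000001')
    print record[:200]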
56
obitools/fast.py
Normal file
@ -0,0 +1,56 @@
"""
implement fastn/fastp similarity search algorithm for BioSequence.
"""

class Fast(object):

    def __init__(self,seq,kup=2):
        '''
        @param seq: sequence to hash
        @type seq: BioSequence
        @param kup: word size used for hashing process
        @type kup: int
        '''
        hash={}
        seq = str(seq)
        for word,pos in ((seq[i:i+kup].upper(),i) for i in xrange(len(seq)-kup)):
            if word in hash:
                hash[word].append(pos)
            else:
                hash[word]=[pos]

        self._kup = kup
        self._hash= hash
        self._seq = seq

    def __call__(self,seq):
        '''
        Align one sequence with the fast hash table.

        @param seq: the sequence to align
        @type seq: BioSequence

        @return: a tuple (smax,pmax) where smax is the
                 score of the largest diagonal and pmax the
                 list of associated shifts
        @rtype: tuple of an int and a list of int
        '''
        histo={}
        seq = str(seq).upper()
        hash= self._hash
        kup = self._kup

        for word,pos in ((seq[i:i+kup],i) for i in xrange(len(seq)-kup)):
            matchedpos = hash.get(word,[])
            for p in matchedpos:
                delta = pos - p
                histo[delta]=histo.get(delta,0) + 1
        smax = max(histo.values())
        pmax = [x for x in histo if histo[x]==smax]
        return smax,pmax

    def __len__(self):
        return len(self._seq)
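A short sketch of how the Fast hash is meant to be used (the sequences are toy strings standing in for BioSequence instances, which str() would normally convert):

    from obitools.fast import Fast

    # hash the reference sequence with words of length 2
    reference = Fast('acgtacgtacgtgacc', kup=2)

    # score of the best diagonal and the list of matching frame shifts
    smax, pmax = reference('ttacgtacgt')
    print smax, pmax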
384
obitools/fasta/__init__.py
Normal file
@ -0,0 +1,384 @@
"""
fasta module provides functions to read and write sequences in fasta format.

"""

#from obitools.format.genericparser import fastGenericEntryIteratorGenerator
from obitools.format.genericparser import genericEntryIteratorGenerator
from obitools import bioSeqGenerator,BioSequence,AASequence,NucSequence
from obitools import _default_raw_parser

#from obitools.alignment import alignmentReader
#from obitools.utils import universalOpen

import re
from obitools.ecopcr.options import loadTaxonomyDatabase
from obitools.format import SequenceFileIterator

#from _fasta import parseFastaDescription,fastaParser
#from _fasta import _fastaJoinSeq
#from _fasta import _parseFastaTag


#fastaEntryIterator=fastGenericEntryIteratorGenerator(startEntry='>')
fastaEntryIterator=genericEntryIteratorGenerator(startEntry='>')
rawFastaEntryIterator=genericEntryIteratorGenerator(startEntry='\s*>')

def _fastaJoinSeq(seqarray):
    return ''.join([x.strip() for x in seqarray])


def parseFastaDescription(ds,tagparser):

    m = tagparser.search(' '+ds)
    if m is not None:
        info=m.group(0)
        definition = ds[m.end(0):].strip()
    else:
        info=None
        definition=ds

    return definition,info

def fastaParser(seq,bioseqfactory,tagparser,rawparser,joinseq=_fastaJoinSeq):
    '''
    Parse a fasta record.

    @attention: internal purpose function

    @param seq: the text of one fasta record, all lines included
    @type seq: C{str}

    @param bioseqfactory: a callable object returning a BioSequence
                          instance.
    @type bioseqfactory: a callable object

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from the
                      title line.
    @type tagparser: regex instance

    @return: a C{BioSequence} instance
    '''
    seq = seq.split('\n')
    title = seq[0].strip()[1:].split(None,1)
    id=title[0]
    if len(title) == 2:
        definition,info=parseFastaDescription(title[1], tagparser)
    else:
        info= None
        definition=None

    seq=joinseq(seq[1:])
    return bioseqfactory(id, seq, definition,info,rawparser)


def fastaNucParser(seq,tagparser=_default_raw_parser,joinseq=_fastaJoinSeq):
    # tagparser is the raw key=value template; compile it the same way
    # fastaIterator does before delegating to fastaParser
    rawparser=tagparser
    tagparser=re.compile('( *%s)+' % (tagparser % '[a-zA-Z][a-zA-Z0-9_]*'))
    return fastaParser(seq,NucSequence,tagparser,rawparser,joinseq)

def fastaAAParser(seq,tagparser=_default_raw_parser,joinseq=_fastaJoinSeq):
    rawparser=tagparser
    tagparser=re.compile('( *%s)+' % (tagparser % '[a-zA-Z][a-zA-Z0-9_]*'))
    return fastaParser(seq,AASequence,tagparser,rawparser,joinseq)

def fastaIterator(file,bioseqfactory=bioSeqGenerator,
                  tagparser=_default_raw_parser,
                  joinseq=_fastaJoinSeq):
    '''
    iterate through a fasta file sequence by sequence.
    Sequences returned by this iterator will be BioSequence
    instances

    @param file: a line iterator containing fasta data or a filename
    @type file: an iterable object or str
    @param bioseqfactory: a callable object returning a BioSequence
                          instance.
    @type bioseqfactory: a callable object

    @param tagparser: a raw regular expression template usable
                      to identify key, value couples from the
                      title line.
    @type tagparser: str

    @return: an iterator on C{BioSequence} instance

    @see: L{fastaNucIterator}
    @see: L{fastaAAIterator}

    >>> from obitools.format.sequence.fasta import fastaIterator
    >>> f = fastaIterator('monfichier')
    >>> s = f.next()
    >>> print s
    gctagctagcatgctagcatgcta
    >>>
    '''
    rawparser=tagparser
    allparser = tagparser % '[a-zA-Z][a-zA-Z0-9_]*'
    tagparser = re.compile('( *%s)+' % allparser)

    for entry in fastaEntryIterator(file):
        yield fastaParser(entry,bioseqfactory,tagparser,rawparser,joinseq)

def rawFastaIterator(file,bioseqfactory=bioSeqGenerator,
                     tagparser=_default_raw_parser,
                     joinseq=_fastaJoinSeq):

    rawparser=tagparser
    allparser = tagparser % '[a-zA-Z][a-zA-Z0-9_]*'
    tagparser = re.compile('( *%s)+' % allparser)

    for entry in rawFastaEntryIterator(file):
        entry=entry.strip()
        yield fastaParser(entry,bioseqfactory,tagparser,rawparser,joinseq)

def fastaNucIterator(file,tagparser=_default_raw_parser):
    '''
    iterate through a fasta file sequence by sequence.
    Sequences returned by this iterator will be NucSequence
    instances

    @param file: a line iterator containing fasta data
    @type file: an iterable object

    @param tagparser: a raw regular expression template usable
                      to identify key, value couples from the
                      title line.
    @type tagparser: str

    @return: an iterator on C{NucBioSequence} instance
    @rtype: a generator object

    @see: L{fastaIterator}
    @see: L{fastaAAIterator}
    '''
    return fastaIterator(file, NucSequence,tagparser)

def fastaAAIterator(file,tagparser=_default_raw_parser):
    '''
    iterate through a fasta file sequence by sequence.
    Sequences returned by this iterator will be AASequence
    instances

    @param file: a line iterator containing fasta data
    @type file: an iterable object

    @param tagparser: a raw regular expression template usable
                      to identify key, value couples from the
                      title line.
    @type tagparser: str

    @return: an iterator on C{AABioSequence} instance

    @see: L{fastaIterator}
    @see: L{fastaNucIterator}
    '''
    return fastaIterator(file, AASequence,tagparser)

def formatFasta(data,gbmode=False,upper=False,restrict=None):
    '''
    Convert a sequence or a set of sequences into a
    string following the fasta format

    @param data: sequence or a set of sequences
    @type data: BioSequence instance or an iterable object
                on BioSequence instances

    @param gbmode: if set to C{True} the identifier part of the title
                   line follows recommendations from NCBI to allow
                   sequence indexing with the blast formatdb command.
    @type gbmode: bool

    @param restrict: a set of key names that will be printed in the
                     formatted output. If restrict is set to C{None}
                     (default) then all keys are formatted.
    @type restrict: any iterable value or None

    @return: a fasta formatted string
    @rtype: str
    '''
    if isinstance(data, BioSequence):
        data = [data]

    if restrict is not None and not isinstance(restrict, set):
        restrict = set(restrict)

    rep = []
    for sequence in data:
        seq = str(sequence)
        if sequence.definition is None:
            definition=''
        else:
            definition=sequence.definition
        if upper:
            frgseq = '\n'.join([seq[x:x+60].upper() for x in xrange(0,len(seq),60)])
        else:
            frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)])
        info='; '.join(['%s=%s' % x
                        for x in sequence.rawiteritems()
                        if restrict is None or x[0] in restrict])
        if info:
            info=info+';'
        if sequence._rawinfo is not None and sequence._rawinfo:
            info+=" " + sequence._rawinfo.strip()

        id = sequence.id
        if gbmode:
            if 'gi' in sequence:
                id = "gi|%s|%s" % (sequence['gi'],id)
            else:
                id = "lcl|%s|" % (id)
        title='>%s %s %s' %(id,info,definition)
        rep.append("%s\n%s" % (title,frgseq))
    return '\n'.join(rep)

def formatSAPFastaGenerator(options):
    loadTaxonomyDatabase(options)

    taxonomy=None
    if options.taxonomy is not None:
        taxonomy=options.taxonomy

    assert taxonomy is not None,"SAP formatting requires a taxonomy database"

    ranks = ('superkingdom', 'kingdom', 'subkingdom', 'superphylum',
             'phylum', 'subphylum', 'superclass', 'class', 'subclass',
             'infraclass', 'superorder', 'order', 'suborder', 'infraorder',
             'parvorder', 'superfamily', 'family', 'subfamily', 'supertribe', 'tribe',
             'subtribe', 'supergenus', 'genus', 'subgenus', 'species group',
             'species subgroup', 'species', 'subspecies')

    trank=set(taxonomy._ranks)
    ranks = [taxonomy._ranks.index(x) for x in ranks if x in trank]

    strict= options.strictsap

    def formatSAPFasta(data,gbmode=False,upper=False,restrict=None):
        '''
        Convert a sequence or a set of sequences into a
        string following the fasta format as recommended for the SAP
        software

        http://ib.berkeley.edu/labs/slatkin/munch/StatisticalAssignmentPackage.html

        @param data: sequence or a set of sequences
        @type data: BioSequence instance or an iterable object
                    on BioSequence instances

        @param gbmode: if set to C{True} the identifier part of the title
                       line follows recommendations from NCBI to allow
                       sequence indexing with the blast formatdb command.
        @type gbmode: bool

        @param restrict: a set of key names that will be printed in the
                         formatted output. If restrict is set to C{None}
                         (default) then all keys are formatted.
        @type restrict: any iterable value or None

        @return: a fasta formatted string
        @rtype: str
        '''
        if isinstance(data, BioSequence):
            data = [data]

        if restrict is not None and not isinstance(restrict, set):
            restrict = set(restrict)

        rep = []
        for sequence in data:
            seq = str(sequence)

            if upper:
                frgseq = '\n'.join([seq[x:x+60].upper() for x in xrange(0,len(seq),60)])
            else:
                frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)])

            try:
                taxid = sequence["taxid"]
            except KeyError:
                if strict:
                    raise AssertionError('All sequences must have a taxid')
                else:
                    continue

            definition=' ;'

            for r in ranks:
                taxon = taxonomy.getTaxonAtRank(taxid,r)
                if taxon is not None:
                    definition+=' %s: %s,' % (taxonomy._ranks[r],taxonomy.getPreferedName(taxon))

            definition='%s ; %s' % (definition[0:-1],taxonomy.getPreferedName(taxid))

            id = sequence.id
            if gbmode:
                if 'gi' in sequence:
                    id = "gi|%s|%s" % (sequence['gi'],id)
                else:
                    id = "lcl|%s|" % (id)
            title='>%s%s' %(id,definition)
            rep.append("%s\n%s" % (title,frgseq))
        return '\n'.join(rep)

    return formatSAPFasta

class FastaIterator(SequenceFileIterator):

    entryIterator = genericEntryIteratorGenerator(startEntry='>')
    # the original bare 'classmethod(entryIterator)' call discarded its
    # result; a staticmethod is what the class-level call below needs
    entryIterator = staticmethod(entryIterator)

    def __init__(self,inputfile,bioseqfactory=bioSeqGenerator,
                 tagparser=_default_raw_parser,
                 joinseq=_fastaJoinSeq):

        SequenceFileIterator.__init__(self, inputfile, bioseqfactory)

        self.__file = FastaIterator.entryIterator(self._inputfile)

        self._tagparser = tagparser
        self._joinseq = joinseq

    def get_tagparser(self):
        return self.__tagparser


    def set_tagparser(self, value):
        self._rawparser = value
        allparser = value % '[a-zA-Z][a-zA-Z0-9_]*'
        self.__tagparser = re.compile('( *%s)+' % allparser)

    def _parseFastaDescription(self,ds):

        m = self._tagparser.search(' '+ds)
        if m is not None:
            info=m.group(0)
            definition = ds[m.end(0):].strip()
        else:
            info=None
            definition=ds

        return definition,info


    def _parser(self):
        '''
        Parse a fasta record.

        @attention: internal purpose function

        @return: a C{BioSequence} instance
        '''
        seq = self._seq.split('\n')
        title = seq[0].strip()[1:].split(None,1)
        id=title[0]
        if len(title) == 2:
            definition,info=self._parseFastaDescription(title[1])
        else:
            info= None
            definition=None

        seq=self._joinseq(seq[1:])

        return self._bioseqfactory(id, seq, definition,info,self._rawparser)

    _tagparser = property(get_tagparser, set_tagparser, None, "_tagparser's docstring")
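A round-trip sketch using the iterator and the formatter above (the file name 'seq.fasta' is hypothetical):

    from obitools.fasta import fastaNucIterator, formatFasta

    for seq in fastaNucIterator('seq.fasta'):
        # print every record back in the obitools fasta format, upper-cased
        print formatFasta(seq, upper=True)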
BIN
obitools/fasta/_fasta.so
Executable file
Binary file not shown.
190
obitools/fastq/__init__.py
Normal file
@ -0,0 +1,190 @@
'''
Created on 29 August 2009

@author: coissac
'''

from obitools import BioSequence
from obitools import _default_raw_parser
from obitools.format.genericparser import genericEntryIteratorGenerator
from obitools import bioSeqGenerator,AASequence,NucSequence
from obitools.fasta import parseFastaDescription
from _fastq import fastqQualitySangerDecoder,fastqQualitySolexaDecoder
from _fastq import qualityToSangerError,qualityToSolexaError
from _fastq import errorToSangerFastQStr
from _fastq import formatFastq
from _fastq import fastqParserGenetator
from obitools.utils import universalOpen

import re

fastqEntryIterator=genericEntryIteratorGenerator(startEntry='^@',endEntry="^\+",strip=True,join=False)

#def fastqParserGenetator(fastqvariant='sanger',bioseqfactory=NucSequence,tagparser=_parseFastaTag):
#
#    qualityDecoder,errorDecoder = {'sanger'   : (fastqQualitySangerDecoder,qualityToSangerError),
#                                   'solexa'   : (fastqQualitySolexaDecoder,qualityToSolexaError),
#                                   'illumina' : (fastqQualitySolexaDecoder,qualityToSangerError)}[fastqvariant]
#
#    def fastqParser(seq):
#        '''
#        Parse a fastq record.
#
#        @attention: internal purpose function
#
#        @param seq: a sequence object containing all lines corresponding
#                    to one fastq sequence
#        @type seq: C{list} or C{tuple} of C{str}
#
#        @param bioseqfactory: a callable object returning a BioSequence
#                              instance.
#        @type bioseqfactory: a callable object
#
#        @param tagparser: a compiled regular expression usable
#                          to identify key, value couples from
#                          title line.
#        @type tagparser: regex instance
#
#        @return: a C{BioSequence} instance
#        '''
#
#        title = seq[0][1:].split(None,1)
#        id=title[0]
#        if len(title) == 2:
#            definition,info=parseFastaDescription(title[1], tagparser)
#        else:
#            info= {}
#            definition=None
#
#        quality=errorDecoder(qualityDecoder(seq[3]))
#
#        seq=seq[1]
#
#        seq = bioseqfactory(id, seq, definition,False,**info)
#        seq.quality = quality
#
#        return seq
#
#    return fastqParser


def fastqIterator(file,fastqvariant='sanger',bioseqfactory=NucSequence,tagparser=_default_raw_parser):
    '''
    iterate through a fastq file sequence by sequence.
    Sequences returned by this iterator will be BioSequence
    instances

    @param file: a line iterator containing fastq data or a filename
    @type file: an iterable object or str
    @param bioseqfactory: a callable object returning a BioSequence
                          instance.
    @type bioseqfactory: a callable object

    @param tagparser: a raw regular expression template usable
                      to identify key, value couples from the
                      title line.
    @type tagparser: str

    @return: an iterator on C{BioSequence} instance

    @see: L{fastqSangerIterator}
    @see: L{fastqAAIterator}

    '''
    fastqParser=fastqParserGenetator(fastqvariant, bioseqfactory, tagparser)
    file = universalOpen(file)
    for entry in fastqEntryIterator(file):
        title=entry[0]
        seq="".join(entry[1:-1])
        quality=''
        lenseq=len(seq)
        while (len(quality) < lenseq):
            quality+=file.next().strip()

        yield fastqParser([title,seq,'+',quality])

def fastqSangerIterator(file,tagparser=_default_raw_parser):
    '''
    iterate through a fastq file sequence by sequence.
    Sequences returned by this iterator will be NucSequence
    instances

    @param file: a line iterator containing fastq data
    @type file: an iterable object

    @param tagparser: a raw regular expression template usable
                      to identify key, value couples from the
                      title line.
    @type tagparser: str

    @return: an iterator on C{NucBioSequence} instance

    @see: L{fastqIterator}
    @see: L{fastqAAIterator}
    '''
    return fastqIterator(file,'sanger',NucSequence,tagparser)

def fastqSolexaIterator(file,tagparser=_default_raw_parser):
    '''
    iterate through a fastq file sequence by sequence.
    Sequences returned by this iterator will be NucSequence
    instances

    @param file: a line iterator containing fastq data
    @type file: an iterable object

    @param tagparser: a raw regular expression template usable
                      to identify key, value couples from the
                      title line.
    @type tagparser: str

    @return: an iterator on C{NucBioSequence} instance

    @see: L{fastqIterator}
    @see: L{fastqAAIterator}
    '''
    return fastqIterator(file,'solexa',NucSequence,tagparser)

def fastqIlluminaIterator(file,tagparser=_default_raw_parser):
    '''
    iterate through a fastq file sequence by sequence.
    Sequences returned by this iterator will be NucSequence
    instances

    @param file: a line iterator containing fastq data
    @type file: an iterable object

    @param tagparser: a raw regular expression template usable
                      to identify key, value couples from the
                      title line.
    @type tagparser: str

    @return: an iterator on C{NucBioSequence} instance

    @see: L{fastqIterator}
    @see: L{fastqAAIterator}
    '''
    return fastqIterator(file,'illumina',NucSequence,tagparser)

def fastqAAIterator(file,tagparser=_default_raw_parser):
    '''
    iterate through a fastq file sequence by sequence.
    Sequences returned by this iterator will be AASequence
    instances

    @param file: a line iterator containing fastq data
    @type file: an iterable object

    @param tagparser: a raw regular expression template usable
                      to identify key, value couples from the
                      title line.
    @type tagparser: str

    @return: an iterator on C{AABioSequence} instance

    @see: L{fastqIterator}
    @see: L{fastqSangerIterator}
    '''
    return fastqIterator(file,'sanger',AASequence,tagparser)
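A usage sketch for the fastq readers (the file name 'reads.fastq' is hypothetical, and formatFastq is assumed to accept a single sequence as the output formatters in options.py use it):

    from obitools.fastq import fastqSangerIterator, formatFastq

    for read in fastqSangerIterator('reads.fastq'):
        # the parser attaches the decoded quality to each sequence
        print formatFastq(read)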
BIN
obitools/fastq/_fastq.so
Executable file
Binary file not shown.
2
obitools/fnaqual/__init__.py
Normal file
@ -0,0 +1,2 @@

fnaTag=' %s *= *([^\s]+)'
8
obitools/fnaqual/fasta.py
Normal file
@ -0,0 +1,8 @@
from obitools.fasta import fastaNucIterator
from obitools.fnaqual import fnaTag

def fnaFastaIterator(file):

    x = fastaNucIterator(file, fnaTag)

    return x
137
obitools/fnaqual/quality.py
Normal file
@ -0,0 +1,137 @@
"""
fnaqual.quality module provides classes to manipulate 454 quality data.
"""

from obitools import _default_raw_parser
from obitools.fasta import fastaIterator
from obitools.fnaqual import fnaTag
from obitools.location import Location

import re


class QualitySequence(list):

    def __init__(self,id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
        '''
        @param id: sequence identifier
        @param seq: an iterable of integer quality scores
        @param definition: sequence definition (optional)
        '''
        list.__init__(self,seq)
        self._info = info
        self.definition=definition
        self.id=id
        # rawinfo defaults to None; guard against concatenating None
        self._rawinfo=' ' + (rawinfo or '')
        self._rawparser=rawparser

    def getDefinition(self):
        '''
        Sequence definition getter

        @return: the sequence definition
        @rtype: str

        '''
        return self._definition

    def setDefinition(self, value):
        self._definition = value

    def getId(self):
        return self._id

    def setId(self, value):
        self._id = value

    def getKey(self,key):
        if key not in self._info:
            p = re.compile(self._rawparser % key)
            m = p.search(self._rawinfo)
            if m is not None:
                v=m.group(1)
                self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
                try:
                    v = eval(v)
                except:
                    pass
                self._info[key]=v
            else:
                raise KeyError,key
        else:
            v=self._info[key]
        return v

    def __getitem__(self,key):
        if isinstance(key,Location):
            return key.extractSequence(self)
        elif isinstance(key, str):
            # the original called the non-existent self._getKey
            return self.getKey(key)
        elif isinstance(key, int):
            return list.__getitem__(self,key)
        elif isinstance(key, slice):
            subseq=list.__getitem__(self,key)
            info = dict(self._info)
            if key.start is not None:
                start = key.start +1
            else:
                start = 1
            if key.stop is not None:
                stop = key.stop+1
            else:
                stop = len(self)
            if key.step is not None:
                step = key.step
            else:
                step = 1

            info['cut']='[%d,%d,%s]' % (start,stop,step)
            return QualitySequence(self.id, subseq, self.definition,self._rawinfo,self._rawparser,**info)

        raise TypeError,'key must be an integer, a str or a slice'

    def __setitem__(self,key,value):
        self._info[key]=value

    def __delitem__(self,key):
        if isinstance(key, str):
            del self._info[key]
        else:
            raise TypeError,key

    def __iter__(self):
        return list.__iter__(self)

    def __contains__(self,key):
        return key in self._info

    def getTags(self):
        return self._info

    def complement(self):
        '''
        Return a reversed copy of the quality scores, flagged as complemented.
        '''
        cseq = self[::-1]
        rep = QualitySequence(self.id,cseq,self.definition,self._rawinfo,self._rawparser,**self._info)
        rep._info['complemented']=not rep._info.get('complemented',False)
        return rep


    definition = property(getDefinition, setDefinition, None, "Sequence Definition")

    id = property(getId, setId, None, 'Sequence identifier')


def _qualityJoinSeq(seqarray):
    text = ' '.join([x.strip() for x in seqarray])
    return [int(x) for x in text.split()]

def qualityIterator(file):
    for q in fastaIterator(file, QualitySequence, fnaTag, _qualityJoinSeq):
        yield q
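A sketch of reading a 454 .qual file with the iterator above (the file name is hypothetical):

    from obitools.fnaqual.quality import qualityIterator

    for qual in qualityIterator('reads.qual'):
        # a QualitySequence behaves like a list of integer quality scores
        print qual.id, len(qual), max(qual)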
28
obitools/format/__init__.py
Normal file
@ -0,0 +1,28 @@
from obitools import bioSeqGenerator
from obitools.utils import universalOpen


class SequenceFileIterator:

    def __init__(self,inputfile,bioseqfactory=bioSeqGenerator):
        self._inputfile = universalOpen(inputfile)
        self._bioseqfactory = bioseqfactory

    def get_inputfile(self):
        # return the attribute set in __init__ (the original returned
        # the never-assigned self.__file)
        return self._inputfile


    def get_bioseqfactory(self):
        return self._bioseqfactory

    def next(self):
        # read the next raw entry and delegate parsing to the subclass
        entry = self._inputfile.next()
        return self._parse(entry)

    def __iter__(self):
        return self

    _inputfile = property(get_inputfile, None, None, "_inputfile's docstring")
    _bioseqfactory = property(get_bioseqfactory, None, None, "_bioseqfactory's docstring")
BIN
obitools/format/_format.so
Executable file
Binary file not shown.
217
obitools/format/genericparser/__init__.py
Normal file
@ -0,0 +1,217 @@
"""
G{packagetree format}
"""
import re

from obitools.utils import universalOpen

def genericEntryIteratorGenerator(startEntry=None,endEntry=None,
                                  head=False,tail=False,
                                  strip=False,join=True):
    '''
    Transform a text line iterator into an entry oriented iterator.

    This converted iterator is useful to implement the first stage
    of flat file parsing.

    @param startEntry: a regular pattern matching the beginning of
                       an entry
    @type startEntry: C{str} or None
    @param endEntry: a regular pattern matching the end of
                     an entry
    @type endEntry: C{str} or None
    @param head: indicates if a header is present before
                 the first entry (as in many original genbank
                 files)
    @type head: C{bool}
    @param tail: indicates if extra information is present
                 after the last entry.
    @type tail: C{bool}

    @return: an iterator on entries in text format
    @rtype: an iterator on C{str}
    '''

    def isBeginning(line):
        return startEntry is None or startEntry.match(line) is not None

    def isEnding(line):
        return ((endEntry is not None and endEntry.match(line) is not None) or
                (endEntry is None and startEntry is not None and startEntry.match(line) is not None))

    def transparentIteratorEntry(file):
        file = universalOpen(file)
        return file

    def genericEntryIterator(file):
        file = universalOpen(file)
        entry = []
        line = file.next()
        started = head or isBeginning(line)

        try:
            while 1:
                while not started:
                    line = file.next()
                    started = isBeginning(line)

                if endEntry is None:
                    entry.append(line)
                    line = file.next()

                while started:
                    end = isEnding(line)
                    if end:
                        if endEntry is not None:
                            entry.append(line)
                        if join:
                            e = ''.join(entry)
                            if strip:
                                e=e.strip()
                        else:
                            e=entry
                            if strip:
                                e=[x.strip() for x in e]
                        entry=[]
                        yield e
                        started=False
                        if endEntry is not None:
                            line = file.next()
                    else:
                        entry.append(line)
                        line = file.next()

                started = isBeginning(line)

        except StopIteration:
            if entry and (endEntry is None or tail):
                if join:
                    e = ''.join(entry)
                    if strip:
                        e=e.strip()
                else:
                    e=entry
                    if strip:
                        e=[x.strip() for x in e]
                yield e



    if startEntry is not None:
        startEntry = re.compile(startEntry)
    if endEntry is not None:
        endEntry = re.compile(endEntry)

    if startEntry is None and endEntry is None:
        return transparentIteratorEntry

    return genericEntryIterator


class GenericParser(object):

    def __init__(self,
                 startEntry=None,
                 endEntry=None,
                 head=False,
                 tail=False,
                 strip=False,
                 **parseAction):
        """
        @param startEntry: a regular pattern matching the beginning of
                           an entry
        @type startEntry: C{str} or None
        @param endEntry: a regular pattern matching the end of
                         an entry
        @type endEntry: C{str} or None
        @param head: indicates if a header is present before
                     the first entry (as in many original genbank
                     files)
        @type head: C{bool}
        @param tail: indicates if extra information is present
                     after the last entry.
        @type tail: C{bool}

        @param parseAction:

        """
        self.flatiterator= genericEntryIteratorGenerator(startEntry,
                                                         endEntry,
                                                         head,
                                                         tail,
                                                         strip)

        self.action={}

        for k in parseAction:
            self.addParseAction(k,*parseAction[k])

    def addParseAction(self,name,dataMatcher,dataCleaner=None,cleanSub=''):
        '''
        Add a parse action to the generic parser. A parse action
        allows one piece of information to be extracted from an entry.
        A parse action is defined by a name and a method to extract
        this information from the full text entry.

        A parse action can be defined in two ways:

        - via regular expression patterns

        - via a dedicated function.

        In the first case, you have to indicate at least the
        dataMatcher regular pattern. This pattern should match exactly
        the data part you want to retrieve. If cleaning of extra
        characters is needed, the second pattern dataCleaner can be
        used to specify these characters.

        In the second case you must provide a callable object (function)
        that extracts and cleans data from the text entry. This function
        should return a list containing all retrieved data, even if
        no data or only one item is retrieved.

        @summary: Add a parse action to the generic parser.

        @param name: name of the data extracted
        @type name: C{str}
        @param dataMatcher: a regular pattern matching the data
                            or a callable object parsing the
                            entry and returning a list of matched data
        @type dataMatcher: C{str} or C{SRE_Pattern} instance or a callable
                           object
        @param dataCleaner: a regular pattern matching part of the data
                            to suppress.
        @type dataCleaner: C{str} or C{SRE_Pattern} instance or C{None}
        @param cleanSub: string used to replace dataCleaner matches.
                         Default is an empty string
        @type cleanSub: C{str}

        '''
        if callable(dataMatcher):
            self.action[name]=dataMatcher
        else :
            if isinstance(dataMatcher, str):
                dataMatcher=re.compile(dataMatcher)
            if isinstance(dataCleaner, str):
                dataCleaner=re.compile(dataCleaner)
            self.action[name]=self._buildREParser(dataMatcher,
                                                  dataCleaner,
                                                  cleanSub)

    def _buildREParser(self,dataMatcher,dataCleaner,cleanSub):
        def parser(data):
            x = dataMatcher.findall(data)
            if dataCleaner is not None:
                x = [dataCleaner.sub(cleanSub,y) for y in x]
            return x
        return parser

    def __call__(self,file):
        for e in self.flatiterator(file):
            pe = {'fullentry':e}
            for k in self.action:
                pe[k]=self.action[k](e)
            yield pe
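A sketch of a parse action defined through a regular expression (the pattern targets the obitools fasta title line and the file name is illustrative only):

    from obitools.format.genericparser import GenericParser

    # one parse action named 'taxid'; findall returns the captured group
    parser = GenericParser(startEntry='>',
                           taxid=('taxid *= *([0-9]+);',))

    for entry in parser('seq.fasta'):
        # 'fullentry' always holds the raw text entry
        print entry['taxid'], len(entry['fullentry'])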
0
obitools/format/ontology/__init__.py
Normal file
274
obitools/format/ontology/go_obo.py
Normal file
@ -0,0 +1,274 @@
__docformat__ = 'restructuredtext'

import re
import string
import textwrap


from obitools.obo.go.parser import GOEntryIterator
from obitools.obo.go.parser import GOTerm
from obitools.obo.go.parser import GOEntry

"""
go_obo.py : gene_ontology_edit.obo file parser:
----------------------------------------------------

- OBOFile class: open a flat file and return an entry.

"""
class OBOFile(object):
    """
    Iterator over all entries of an OBO file
    """

    def __init__(self,_path):
        self.file = GOEntryIterator(_path)

    def __iter__(self):
        return self

    def next(self):
        fiche = self.file.next()

        if isinstance(fiche, GOTerm):
            self.isaterm=True
            return Term(fiche)
        elif isinstance(fiche, GOEntry):
            self.isaterm=False
            return Entry(fiche)
        else:
            self.isaterm=False
            return Header(fiche)


############# everything below should move down into obitools/obo/go/parser.py ##########

# define an XRef into a go_obo.py script in the microbi pylib
class Xref(object):
    """
    Class Xref
    Xref.db   Xref database
    Xref.id   Xref identifier
    """

    def __init__(self,description):
        data = description.split(':')
        self.db = data[0].strip()
        self.id = data[1].strip()

# define a RelatedTerm into a go_obo.py script in the microbi pylib
class RelatedTerm(object):
    """
    Class RelatedTerm
    RelatedTerm.relation       RelatedTerm relation
    RelatedTerm.related_term   RelatedTerm GO identifier
    RelatedTerm.comment        all terms have 0 or 1 comment
    """

    def __init__(self,relation,value,comment):
        self.relation = relation
        self.related_term = value.strip('GO:')
        self.comment = comment


# define into a go_obo.py script in the microbi pylib
#class Term(object):
#    """
#    class representing an OBO term (entry).
#    """
#
#    def __init__(self):
#        raise RuntimeError('biodb.go_obo is an abstract class')
#
#    def __checkEntry__(self):
#        minimum=(hasattr(self,'goid') )
#        if not minimum:
#            raise AssertionError('Misconstructed GO Term instance %s' % [x for x in dir(self) if x[0]!='_'])

class Term(object):
    """
    Class Term
    representing a GO term.
    """

    def __init__(self,data=None):
        """
        """
        self.data=data
        self.isaterm = True

        if data:
            self.__filtreGoid__()
            self.__filtreName__()
            self.__filtreComment__()
            self.__filtreSynonyms__()
            self.__filtreDef__()
            self.__filtreParents__()
            self.__filtreRelationships__()
            self.__filtreRelation__()
            self.__filtreObsolete__()
            self.__filtreAltIds__()
            self.__filtreXRefs__()
            self.__filtreSubsets__()

            # check if all required attributes were valued
            self.__checkEntry__()


    def __checkEntry__(self):
        minimum=(hasattr(self,'goid') )
        if not minimum:
            raise AssertionError('Misconstructed GO Term instance %s' % [x for x in dir(self) if x[0]!='_'])


    def __filtreGoid__(self):
        """
        Extract GO id.
        """
        self.goid = self.data.id.value.strip('GO:')

    def __filtreName__(self):
        """
        Extract GO name.
        """
        self.name = self.data.name.value

    def __filtreSynonyms__(self):
        """
        Extract GO synonym(s).
        """
        self.list_synonyms = {}
        if self.data.synonyms:
            for y in self.data.synonyms:
                self.list_synonyms[y.value] = y.scope


    def __filtreComment__(self):
        """
        manage None comments
        """
        if self.data.comment != None:
            self.comment = self.data.comment.value
        else:
            self.comment = ""

    def __filtreDef__(self):
        """
        Extract GO definition.
        """
        if self.data.definition != None:
            self.definition = self.data.definition.value
        else:
            self.definition = ""

    def __filtreParents__(self):
        """
        To make the is_a hierarchy
        """
        if self.data.is_a != None:
            self.is_a = set([isa.value.strip('GO:') for isa in self.data.is_a])
        else:
            self.is_a = set()

    def __filtreRelation__(self):
        """
        To make the part_of hierarchy
        """
        self.part_of = set()
        self.regulates = set()
        self.negatively_regulates = set()
        self.positively_regulates = set()

        if self.data.relationship != None:
            for rel in self.data.relationship:
                if rel.relationship == "part_of":
                    self.part_of.add(rel.value.strip('GO:'))
                elif rel.relationship == "regulates":
                    self.regulates.add(rel.value.strip('GO:'))
                elif rel.relationship == "negatively_regulates":
                    self.negatively_regulates.add(rel.value.strip('GO:'))
                elif rel.relationship == "positively_regulates":
                    self.positively_regulates.add(rel.value.strip('GO:'))


    def __filtreRelationships__(self):
        """
        Relation list with other GO Terms (is_a, part_of or some regulates relation)
        """
        self.related_term =[]
        if self.data.relationship != None:
            for x in self.data.relationship:
                self.related_term.append(RelatedTerm(x.relationship,x.value,x.__doc__))
                #self.related_term.append(RelatedTerm(x.relationship,x.value,x.comment))
        if self.data.is_a != None:
            for x in self.data.is_a:
                self.related_term.append(RelatedTerm('is_a',x.value,x.__doc__))
                #self.related_term.append(RelatedTerm('is_a',x.value,x.comment))



    def __filtreObsolete__(self):
        """
        each obsolete term corresponds to a set of GO identifiers,
        so that this GO term can be handled like any other GO term
        """
        self.considers = set()
        self.replaces = set()
        self.is_obsolete = self.data.is_obsolete
        if self.data.is_obsolete:
            if self.data.consider:
                self.considers = set([considered.value.strip('GO:') for considered in self.data.consider])
            if self.data.replaced_by:
                self.replaces = set([replaced.value.strip('GO:') for replaced in self.data.replaced_by])


    def __filtreAltIds__(self):
        """
        alternate id(s) for this term (= alias in the geneontology schema model!)
        """
        if self.data.alt_ids:
            self.alt_ids = set([x.value.strip('GO:') for x in self.data.alt_ids])
        else:
            self.alt_ids = set()

    def __filtreXRefs__(self):
        """
        cross references to other databases
        """
        self.xrefs = set()
        if self.data.xrefs:
            self.xrefs = set([Xref(x.value.reference) for x in self.data.xrefs])


    def __filtreSubsets__(self):
        """
        subset label to make smaller sets of GO Terms
        """
        self.subsets = set()
        if self.data.subsets:
            self.subsets = set([x.value for x in self.data.subsets])


class Entry(object):
    """
    a Stanza entry, like [Typedef] for example
    """
    def __init__(self,data=None):
        self.data=data
        self.isaterm=False
        self.isanentry=True


class Header(object):
    """
    class representing a GO header.
    """

    def __init__(self,data=None):
        """
        """
        self.data=data
        self.isaterm = False
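A usage sketch for the OBO reader (the ontology file name is the conventional gene_ontology_edit.obo, assumed to be present locally):

    from obitools.format.ontology.go_obo import OBOFile

    for entry in OBOFile('gene_ontology_edit.obo'):
        # Term instances carry the filtered GO attributes
        if entry.isaterm:
            print entry.goid, entry.name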
284
obitools/format/options.py
Normal file
@ -0,0 +1,284 @@
'''
Created on 13 October 2009

@author: coissac
'''

from obitools.format.sequence.embl import emblIterator
from obitools.format.sequence.genbank import genbankIterator
from obitools.format.sequence.fnaqual import fnaFastaIterator
from obitools.format.sequence.fasta import fastaAAIterator,fastaNucIterator,fastaIterator
from obitools.format.sequence.fastq import fastqIlluminaIterator,fastqSolexaIterator
from obitools.fastq import fastqSangerIterator
from obitools.fnaqual.quality import qualityIterator
from obitools.fasta import formatFasta, rawFastaIterator,\
                           formatSAPFastaGenerator
from obitools.fastq import formatFastq

from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
from obitools.ecopcr.options import loadTaxonomyDatabase

#from obitools.format._format import printOutput

from array import array
from itertools import chain
import sys

import re
from obitools.ecopcr import EcoPCRFile


def addInputFormatOption(optionManager):
#    optionManager.add_option('--rank',
#                             action="store_true", dest='addrank',
#                             default=False,
#                             help="add a rank attribute to the sequence "
#                                  "indicating the sequence position in the input data")
    optionManager.add_option('--genbank',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='genbank',
                             help="input file is in genbank format")
    optionManager.add_option('--embl',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='embl',
                             help="input file is in embl format")

    optionManager.add_option('--fasta',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='fasta',
                             help="input file is in fasta nucleic format (including obitools fasta extensions)")

    optionManager.add_option('--ecopcr',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='ecopcr',
                             help="input file is in ecoPCR format")

    optionManager.add_option('--raw-fasta',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='rawfasta',
                             help="input file is in fasta format (but more tolerant to format variants)")

    optionManager.add_option('--fna',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='fna',
                             help="input file is in fasta nucleic format produced by 454 sequencer pipeline")

    optionManager.add_option('--qual',
                             action="store", dest="withqualfile",
                             type='str',
                             default=None,
                             help="Specify the name of a quality file produced by 454 sequencer pipeline")

    optionManager.add_option('--sanger',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='sanger',
                             help="input file is in sanger fastq nucleic format (standard fastq)")

    optionManager.add_option('--solexa',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='solexa',
                             help="input file is in fastq nucleic format produced by solexa sequencer")

    optionManager.add_option('--illumina',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='illumina',
                             help="input file is in fastq nucleic format produced by old solexa sequencer")

    optionManager.add_option('--nuc',
                             action="store_const", dest="moltype",
                             default=None,
                             const='nuc',
                             help="input file is nucleic sequences")
    optionManager.add_option('--prot',
                             action="store_const", dest="moltype",
                             default=None,
                             const='pep',
                             help="input file is protein sequences")


def addOutputFormatOption(optionManager):
    optionManager.add_option('--fastq-output',
                             action="store_const", dest="output",
                             default=None,
                             const=formatFastq,
                             help="output sequences in sanger fastq format")
    optionManager.add_option('--fasta-output',
                             action="store_const", dest="output",
                             default=None,
                             const=formatFasta,
                             help="output sequences in obitools fasta format")
    optionManager.add_option('--sap-output',
                             action="store_const", dest="output",
                             default=None,
                             const=formatSAPFastaGenerator,
                             help="output sequences in sap fasta format")
    optionManager.add_option('--strict-sap',
                             action='store_true',dest='strictsap',
                             default=False,
                             help="raise an error on sequences without a taxid in sap output")
    optionManager.add_option('--ecopcr-output',
                             action="store", dest="ecopcroutput",
                             default=None,
                             help="output sequences in obitools ecopcr format")
    optionManager.add_option('--uppercase',
                             action='store_true',dest='uppercase',
                             default=False,
                             help="Print sequences in upper case (default is lower case)")



def addInOutputOption(optionManager):
    addInputFormatOption(optionManager)
    addOutputFormatOption(optionManager)




def autoEntriesIterator(options):
    options.outputFormater=formatFasta
    options.outputFormat="fasta"

    ecopcr_pattern = re.compile('^[^ ]+ +| +[0-9]+ +| + [0-9]+ + | +')

    def annotatedIterator(formatIterator):
        options.outputFormater=formatFasta
        options.outputFormat="fasta"
        def iterator(lineiterator):
            for s in formatIterator(lineiterator):
                s.extractTaxon()
                yield s

        return iterator

    def withQualIterator(qualityfile):
        options.outputFormater=formatFastq
        options.outputFormat="fastq"
        def iterator(lineiterator):
            for s in fnaFastaIterator(lineiterator):
                q = qualityfile.next()
                quality = array('d',(10.**(-x/10.) for x in q))
                s.quality=quality
                yield s

        return iterator

    def autoSequenceIterator(lineiterator):
        options.outputFormater=formatFasta
        options.outputFormat="fasta"
        first = lineiterator.next()
        if first[0]==">":
            if options.withqualfile is not None:
                qualfile=qualityIterator(options.withqualfile)
                reader=withQualIterator(qualfile)
                options.outputFormater=formatFastq
                options.outputFormat="fastq"
            elif options.moltype=='nuc':
                reader=fastaNucIterator
            elif options.moltype=='pep':
                reader=fastaAAIterator
            else:
                reader=fastaIterator
        elif first[0]=='@':
            reader=fastqSangerIterator
            options.outputFormater=formatFastq
            options.outputFormat="fastq"
        elif first[0:3]=='ID ':
            reader=emblIterator
        elif first[0:6]=='LOCUS ':
            reader=genbankIterator
        elif first[0]=="#" or ecopcr_pattern.search(first):
            reader=EcoPCRFile
        else:
            raise AssertionError,'file is not in fasta, fastq, embl, genbank or ecoPCR format'

        input = reader(chain([first],lineiterator))

        return input

    if options.seqinformat is None:
        reader = autoSequenceIterator
    else:
        if options.seqinformat=='fasta':
            if options.moltype=='nuc':
                reader=fastaNucIterator
            elif options.moltype=='pep':
                reader=fastaAAIterator
            else:
                reader=fastaIterator
        elif options.seqinformat=='rawfasta':
            reader=annotatedIterator(rawFastaIterator)
        elif options.seqinformat=='genbank':
            reader=annotatedIterator(genbankIterator)
        elif options.seqinformat=='embl':
            reader=annotatedIterator(emblIterator)
        elif options.seqinformat=='fna':
            reader=fnaFastaIterator
        elif options.seqinformat=='sanger':
            options.outputFormater=formatFastq
            options.outputFormat="fastq"
            reader=fastqSangerIterator
        elif options.seqinformat=='solexa':
            options.outputFormater=formatFastq
            options.outputFormat="fastq"
            reader=fastqSolexaIterator
        elif options.seqinformat=='illumina':
            options.outputFormater=formatFastq
            options.outputFormat="fastq"
            reader=fastqIlluminaIterator
        elif options.seqinformat=='ecopcr':
            reader=EcoPCRFile

    if options.seqinformat=='fna' and options.withqualfile is not None:
        qualfile=qualityIterator(options.withqualfile)
        reader=withQualIterator(qualfile)
        options.outputFormater=formatFastq
        options.outputFormat="fastq"

#    if options.addrank:
#        reader = withRankIterator(reader)
    return reader

def sequenceWriterGenerator(options,output=sys.stdout):
    class SequenceWriter:
        def __init__(self,options,file=sys.stdout):
            self._format=None
            self._file=file
            self._upper=options.uppercase
        def put(self,seq):
            if self._format is None:
                self._format=formatFasta
                if options.output is not None:
                    self._format=options.output
                    if self._format is formatSAPFastaGenerator:
                        self._format=formatSAPFastaGenerator(options)
                elif options.outputFormater is not None:
                    self._format=options.outputFormater
            s = self._format(seq,upper=self._upper)
            try:
                self._file.write(s)
                self._file.write("\n")
            except IOError:
                sys.exit(0)

    if options.ecopcroutput is not None:
        taxo = loadTaxonomyDatabase(options)
        writer=EcoPCRDBSequenceWriter(options.ecopcroutput,taxonomy=taxo)
    else:
        writer=SequenceWriter(options,output)

    def sequenceWriter(sequence):
        writer.put(sequence)

    return sequenceWriter
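A sketch of how the two factories above are meant to cooperate inside an obitools command-line script (a plain OptionParser stands in for the option manager a real script would use; the input on stdin is assumed):

    import sys
    from optparse import OptionParser
    from obitools.format.options import addInOutputOption, autoEntriesIterator, sequenceWriterGenerator

    optionManager = OptionParser()
    addInOutputOption(optionManager)
    options, args = optionManager.parse_args()

    # build the reader first so it can set options.outputFormater
    reader = autoEntriesIterator(options)
    writer = sequenceWriterGenerator(options)

    for seq in reader(sys.stdin):
        writer(seq)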
24
obitools/format/sequence/__init__.py
Normal file
@ -0,0 +1,24 @@
from obitools.fasta import fastaIterator
from obitools.fastq import fastqSangerIterator
from obitools.seqdb.embl.parser import emblIterator
from obitools.seqdb.genbank.parser import genbankIterator
from itertools import chain
from obitools.utils import universalOpen

def autoSequenceIterator(file):
    lineiterator = universalOpen(file)
    first = lineiterator.next()
    if first[0]==">":
        reader=fastaIterator
    elif first[0]=='@':
        reader=fastqSangerIterator
    elif first[0:3]=='ID ':
        reader=emblIterator
    elif first[0:6]=='LOCUS ':
        reader=genbankIterator
    else:
        raise AssertionError,'file is not in fasta, fastq, embl, or genbank format'

    input = reader(chain([first],lineiterator))

    return input
2
obitools/format/sequence/embl.py
Normal file
@ -0,0 +1,2 @@
from obitools.seqdb.embl.parser import emblIterator,emblParser
4
obitools/format/sequence/fasta.py
Normal file
@ -0,0 +1,4 @@
from obitools.fasta import fastaIterator,fastaParser
from obitools.fasta import fastaAAIterator,fastaAAParser
from obitools.fasta import fastaNucIterator,fastaNucParser
from obitools.fasta import formatFasta
13
obitools/format/sequence/fastq.py
Normal file
@@ -0,0 +1,13 @@
'''
Created on 15 Jan. 2010

@author: coissac
'''

from obitools.fastq import fastqIterator,fastqParserGenetator
from obitools.fastq import fastqSangerIterator,fastqSolexaIterator, \
                           fastqIlluminaIterator
from obitools.fastq import fastqAAIterator
from obitools.fastq import formatFastq

8
obitools/format/sequence/fnaqual.py
Normal file
@@ -0,0 +1,8 @@
'''
Created on 12 Oct. 2009

@author: coissac
'''

from obitools.fnaqual.fasta import fnaFastaIterator
from obitools.fnaqual.quality import qualityIterator
4
obitools/format/sequence/genbank.py
Normal file
@@ -0,0 +1,4 @@
from obitools.seqdb.genbank.parser import genpepIterator,genpepParser
from obitools.seqdb.genbank.parser import genbankIterator,genbankParser

5
obitools/format/sequence/tagmatcher.py
Normal file
@@ -0,0 +1,5 @@
from obitools.tagmatcher.parser import tagMatcherParser
from obitools.tagmatcher.parser import TagMatcherIterator
from obitools.tagmatcher.parser import formatTagMatcher

tagMatcherIterator=TagMatcherIterator
0
obitools/goa/__init__.py
Normal file
33
obitools/goa/parser.py
Normal file
@@ -0,0 +1,33 @@
from itertools import imap
from obitools import utils

class GoAFileIterator(utils.ColumnFile):
    def __init__(self,stream):
        utils.ColumnFile.__init__(self,
                                  stream, '\t', True,
                                  (str,))

    _colname = ['database',
                'ac',
                'symbol',
                'qualifier',
                'goid',
                'origin',
                'evidence',
                'evidence_origin',
                'namespace',
                'db_object_name',
                'gene',
                'object_type',
                'taxid',
                'date',
                'assigned_by']

    def next(self):
        data = utils.ColumnFile.next(self)
        data = dict(imap(None,GoAFileIterator._colname,data))

        return data

962
obitools/graph/__init__.py
Normal file
@@ -0,0 +1,962 @@
'''
**obitools.graph** for representing graph structure in obitools
----------------------------------------------------------------

.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>


This module offers classes to manipulate graphs, mainly through the
:py:class:`obitools.graph.Graph` class.

.. inheritance-diagram:: Graph DiGraph UndirectedGraph
    :parts: 2

'''

import sys

from obitools.utils import progressBar


class Indexer(dict):
    '''
    Allows to manage the conversion between an arbitrary hashable python
    value and a unique integer key
    '''

    def __init__(self):
        self.__max=0
        self.__reverse=[]

    def getLabel(self,index):
        '''
        Return the python value associated to an integer index.

        :param index: an index value
        :type index: int

        :raises: IndexError if the index is not used in this
                 Indexer instance
        '''
        return self.__reverse[index]

    def getIndex(self,key,strict=False):
        '''
        Return the index associated to a **key** in the indexer. Two
        modes are available:

            - strict mode:

                if the key is not known by the :py:class:`Indexer` instance
                a :py:exc:`KeyError` exception is raised.

            - non strict mode:

                in this mode if the requested **key** is absent, it is added to
                the :py:class:`Indexer` instance and the new index is returned

        :param key: the requested key
        :type key: a hashable python value

        :param strict: selects the lookup mode
        :type strict: bool

        :return: the index corresponding to the key
        :rtype: int

        :raises: - :py:exc:`KeyError` in strict mode if the key is absent
                   from the :py:class:`Indexer` instance

                 - :py:exc:`TypeError` if key is not a hashable value.
        '''
        if dict.__contains__(self,key):
            return dict.__getitem__(self,key)
        elif strict:
            raise KeyError,key
        else:
            value = self.__max
            self[key]= value
            self.__reverse.append(key)
            self.__max+=1
            return value

    def __getitem__(self,key):
        '''
        Implement the [] operator to emulate the standard dictionary
        behaviour on :py:class:`Indexer` and return the integer key
        associated to a python value.

        Actually this method calls the :py:meth:`getIndex` method in
        non strict mode so it only raises a :py:exc:`TypeError`
        if key is not a hashable value.

        :param key: the value to index
        :type key: a hashable python value

        :return: a unique integer value associated to the key
        :rtype: int

        :raises: :py:exc:`TypeError` if **key** is not a hashable value.

        '''
        return self.getIndex(key)

    # __eq__ (not "__equal__") is the hook python actually invokes for ==
    def __eq__(self,index):
        '''
        Implement the equality operator **==** for comparing two :py:class:`Indexer` instances.
        Two :py:class:`Indexer` instances are equal only if they are physically
        the same instance.

        :param index: the second Indexer
        :type index: an :py:class:`Indexer` instance

        :return: True if the two :py:class:`Indexer` instances are the same
        :rtype: bool
        '''
        return id(self)==id(index)

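Since Indexer derives from dict, the [] operator silently registers new labels while getIndex(strict=True) only looks up existing ones. A minimal sketch of both modes, not part of the module:

# A minimal sketch (not part of the module).
from obitools.graph import Indexer

idx = Indexer()
i = idx['node-A']                      # non strict: 'node-A' is registered as index 0
assert idx.getLabel(i) == 'node-A'
assert idx.getIndex('node-A', strict=True) == i
try:
    idx.getIndex('unknown', strict=True)
except KeyError:
    print 'strict mode refuses unknown labels'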
class Graph(object):
    '''
    Class used to represent directed or undirected graphs.

    .. warning::

        Only one edge can connect two nodes in a given direction.

    .. warning::

        Specifying nodes through their index speeds up your code but as no check
        is done on index values, it may result in inconsistencies. So prefer the
        use of node labels to specify a node.

    '''
    def __init__(self,label='G',directed=False,indexer=None,nodes=None,edges=None):
        '''
        :param label: Graph name, set to 'G' by default
        :type label: str

        :param directed: True for a directed graph, set to False by default
        :type directed: boolean

        :param indexer: node label indexer. This allows to define several graphs
                        sharing the same indexer (see :py:meth:`newEmpty`)
        :type indexer: :py:class:`Indexer`

        :param nodes: set of nodes to add to the graph
        :type nodes: iterable value

        :param edges: set of edges to add to the graph
        :type edges: iterable value
        '''

        self._directed=directed
        if indexer is None:
            indexer = Indexer()
        self._index = indexer
        self._node = {}
        self._node_attrs = {}
        self._edge_attrs = {}
        self._label=label

    def newEmpty(self):
        """
        Build a new empty graph using the same :py:class:`Indexer` instance.
        This allows two graphs to share their vertices through their indices.
        """
        n = Graph(self._label+"_compact",self._directed,self._index)

        return n

    def addNode(self,node=None,index=None,**data):
        '''
        Add a new node or update an existing one.

        :param node: the new node label or the label of an existing node
                     for updating it.
        :type node: a hashable python value

        :param index: the index of an existing node for updating it.
        :type index: int

        :return: the index of the node
        :rtype: int

        :raises: :py:exc:`IndexError` if index is not **None** and
                 corresponds to an unused index in this graph.
        '''
        if index is None:
            index = self._index[node]

            if index not in self._node:
                self._node[index]=set()
        else:
            if index not in self._node:
                raise IndexError,"This index is not used in this graph"

        if data:
            if index in self._node_attrs:
                self._node_attrs[index].update(data)
            else:
                self._node_attrs[index]=dict(data)

        return index

    def __contains__(self,node):
        try:
            index = self._index.getIndex(node,strict=True)
            r = index in self._node
        except KeyError:
            r=False
        return r

    def getNode(self,node=None,index=None):
        """
        :param node: a node label.
        :type node: a hashable python value

        :param index: the index of an existing node.
        :type index: int

        .. note:: Index values take precedence over node labels.

        :return: the looked up node
        :rtype: :py:class:`Node`

        :raises: :py:exc:`IndexError` if the specified node label
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        """
        if index is None:
            index = self._index.getIndex(node, True)
        return Node(index,self)

    def getBestNode(self,estimator):
        '''
        Select the node maximizing the estimator function

        :param estimator: the function to maximize
        :type estimator: a function returning a numerical value and accepting one
                         argument of type :py:class:`Node`

        :return: the best node
        :rtype: :py:class:`Node`
        '''

        bestScore=0
        best=None
        for n in self:
            score = estimator(n)
            if best is None or score > bestScore:
                bestScore = score
                best=n
        return best


    def delNode(self,node=None,index=None):
        """
        Delete a node from a graph and all associated edges.

        :param node: a node label.
        :type node: a hashable python value

        :param index: the index of an existing node.
        :type index: int

        .. note:: Index values take precedence over node labels.

        :raises: :py:exc:`IndexError` if the specified node label
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        """
        if index is None:
            index = self._index[node]

        for n in self._node:
            if n!=index:
                e = self._node[n]
                if index in e:
                    if (n,index) in self._edge_attrs:
                        del self._edge_attrs[(n,index)]
                    e.remove(index)

        e = self._node[index]

        for n in e:
            if (index,n) in self._edge_attrs:
                del self._edge_attrs[(index,n)]

        del self._node[index]
        if index in self._node_attrs:
            del self._node_attrs[index]


    def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data):
        '''
        Create a new edge in the graph between both the specified nodes.

        .. note:: Nodes can be specified using their label or their index in the graph;
                  if both values are indicated the index is used.

        :param node1: The first vertex label
        :type node1: a hashable python value
        :param node2: The second vertex label
        :type node2: a hashable python value
        :param index1: The first vertex index
        :type index1: int
        :param index2: The second vertex index
        :type index2: int

        :raises: :py:exc:`IndexError` if one of both the specified node labels
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        '''

        index1=self.addNode(node1, index1)
        index2=self.addNode(node2, index2)

        self._node[index1].add(index2)

        if not self._directed:
            self._node[index2].add(index1)

        if data:
            if (index1,index2) not in self._edge_attrs:
                data =dict(data)
                self._edge_attrs[(index1,index2)]=data
                if not self._directed:
                    self._edge_attrs[(index2,index1)]=data
            else:
                self._edge_attrs[(index2,index1)].update(data)

        return (index1,index2)

    def getEdge(self,node1=None,node2=None,index1=None,index2=None):
        '''
        Extract the :py:class:`Edge` instance linking two nodes of the graph.

        .. note:: Nodes can be specified using their label or their index in the graph;
                  if both values are indicated the index is used.

        :param node1: The first vertex label
        :type node1: a hashable python value
        :param node2: The second vertex label
        :type node2: a hashable python value
        :param index1: The first vertex index
        :type index1: int
        :param index2: The second vertex index
        :type index2: int

        :raises: :py:exc:`IndexError` if one of both the specified node labels
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        '''
        node1=self.getNode(node1, index1)
        node2=self.getNode(node2, index2)
        return Edge(node1,node2)

    def delEdge(self,node1=None,node2=None,index1=None,index2=None):
        """
        Delete the edge linking node 1 to node 2.

        .. note:: Nodes can be specified using their label or their index in the graph;
                  if both values are indicated the index is used.

        :param node1: The first vertex label
        :type node1: a hashable python value
        :param node2: The second vertex label
        :type node2: a hashable python value
        :param index1: The first vertex index
        :type index1: int
        :param index2: The second vertex index
        :type index2: int

        :raises: :py:exc:`IndexError` if one of both the specified node labels
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        """
        if index1 is None:
            index1 = self._index[node1]
        if index2 is None:
            index2 = self._index[node2]
        if index1 in self._node and index2 in self._node[index1]:
            self._node[index1].remove(index2)
            # edge attributes live in _edge_attrs
            if (index1,index2) in self._edge_attrs:
                del self._edge_attrs[(index1,index2)]
            if not self._directed:
                self._node[index2].remove(index1)
                if (index2,index1) in self._edge_attrs:
                    del self._edge_attrs[(index2,index1)]

    def edgeIterator(self,predicate=None):
        """
        Iterate through a set of selected edges.

        :param predicate: a function allowing edge selection. Default value
                          is **None** and indicates that all edges are selected.
        :type predicate: a function returning a boolean value
                         and accepting one argument of class :py:class:`Edge`

        :return: an iterator over the selected edges
        :rtype: iterator over :py:class:`Edge` instances

        .. seealso::
            function :py:func:`selectEdgeAttributeFactory` for simple predicates.

        """
        for n1 in self._node:
            for n2 in self._node[n1]:
                if self._directed or n1 <= n2:
                    e = self.getEdge(index1=n1, index2=n2)
                    if predicate is None or predicate(e):
                        yield e


    def nodeIterator(self,predicate=None):
        """
        Iterate through a set of selected vertices.

        :param predicate: a function allowing node selection. Default value
                          is **None** and indicates that all nodes are selected.
        :type predicate: a function returning a boolean value
                         and accepting one argument of class :py:class:`Node`

        :return: an iterator over the selected nodes.
        :rtype: iterator over :py:class:`Node` instances

        """
        for n in self._node:
            node = self.getNode(index=n)
            if predicate is None or predicate(node):
                yield node

    def nodeIndexIterator(self,predicate=None):
        """
        Iterate through the indices of a set of selected vertices.

        :param predicate: a function allowing node selection. Default value
                          is **None** and indicates that all nodes are selected.
        :type predicate: a function returning a boolean value
                         and accepting one argument of class :py:class:`Node`

        :return: an iterator over the selected node indices.
        :rtype: iterator over `int`

        """
        for n in self._node:
            node = self.getNode(index=n)
            if predicate is None or predicate(node):
                yield n

    def neighbourIndexSet(self,node=None,index=None):
        if index is None:
            index=self.getNode(node).index
        return self._node[index]

    def edgeCount(self):
        n = reduce(lambda x,y:x+y, (len(z) for z in self._node.itervalues()),0)
        if not self._directed:
            n=n/2
        return n

    def subgraph(self,nodes,name='G'):
        sub = Graph(name,self._directed,self._index)
        if not isinstance(nodes, set):
            nodes = set(nodes)
        for n in nodes:
            sub._node[n]=nodes & self._node[n]
            if n in self._node_attrs:
                sub._node_attrs[n]=dict(self._node_attrs[n])
            for n2 in sub._node[n]:
                if not self._directed:
                    if n <= n2:
                        if (n,n2) in self._edge_attrs:
                            data=dict(self._edge_attrs[(n,n2)])
                            sub._edge_attrs[(n,n2)]=data
                            sub._edge_attrs[(n2,n)]=data
                else:
                    if (n,n2) in self._edge_attrs:
                        data=dict(self._edge_attrs[(n,n2)])
                        sub._edge_attrs[(n,n2)]=data
        return sub

    def __len__(self):
        return len(self._node)

    def __getitem__(self,key):
        return self.getNode(node=key)

    def __delitem__(self,key):
        self.delNode(node=key)

    def __iter__(self):
        return self.nodeIterator()

    def __str__(self):
        if self._directed:
            kw ='digraph'
        else:
            kw='graph'

        nodes = "\n    ".join([str(x) for x in self])
        edges = "\n    ".join([str(x) for x in self.edgeIterator()])

        return "%s %s {\n    %s\n\n    %s\n}" % (kw,self._label,nodes,edges)

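Nodes and their attributes are created on the fly by addNode and addEdge, and __str__ dumps the whole graph in a DOT-like syntax. A minimal sketch, not part of the module:

# A minimal sketch (not part of the module).
from obitools.graph import Graph

g = Graph('demo')
g.addNode('a')
g.addNode('b', color='red')            # extra keywords become node attributes
g.addEdge('a', 'b', weight=3)          # missing nodes are created on the fly
print len(g)                           # 2 nodes
print g.edgeCount()                    # 1 undirected edge
print g                                # DOT-like dump built by __str__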
class Node(object):
    """
    Class used for representing one node or vertex in a graph

    """
    def __init__(self,index,graph):
        '''
        .. warning::

            The :py:class:`Node` constructor is usually called through the :py:class:`Graph` methods

        :param index: Index of the node in the graph
        :type index: int
        :param graph: graph instance owning the node
        :type graph: :py:class:`obitools.graph.Graph`
        '''
        self.index = index
        self.__graph = graph

    def getGraph(self):
        '''
        return the graph owning this node.

        :rtype: :py:class:`obitools.graph.Graph`
        '''
        return self.__graph


    def getLabel(self):
        '''
        return the label associated to this node.
        '''
        return self.__graph._index.getLabel(self.index)


    def has_key(self,key):
        '''
        test if the node instance has a property named **key**.

        :param key: the name of a property
        :type key: str

        :return: True if the node has a property named <key>
        :rtype: bool
        '''
        if self.index in self.__graph._node_attrs:
            return key in self.__graph._node_attrs[self.index]
        else:
            return False

    def neighbourIterator(self,nodePredicat=None,edgePredicat=None):
        '''
        iterate through the nodes directly connected to
        this node.

        :param nodePredicat: a function accepting one node as parameter
                             and returning **True** if this node must be
                             returned by the iterator.
        :type nodePredicat: function

        :param edgePredicat: a function accepting one edge as parameter
                             and returning True if the edge linking self and
                             the current node must be considered.
        :type edgePredicat: function

        :rtype: iterator on Node instances
        '''
        for n in self.neighbourIndexIterator(nodePredicat, edgePredicat):
            node = self.graph.getNode(index=n)
            yield node

    def neighbourIndexSet(self):
        '''
        Return the set of node indices directly connected
        to this node.

        .. warning::

            do not change this set unless you know
            exactly what you are doing.

        :rtype: set of int
        '''
        return self.__graph._node[self.index]

    def neighbourIndexIterator(self,nodePredicat=None,edgePredicat=None):
        '''
        iterate through the node indices directly connected to
        this node.

        :param nodePredicat: a function accepting one node as parameter
                             and returning True if this node must be
                             returned by the iterator.
        :type nodePredicat: function

        :param edgePredicat: a function accepting one edge as parameter
                             and returning True if the edge linking self and
                             the current node must be considered.
        :type edgePredicat: function

        :rtype: iterator on int
        '''
        for n in self.neighbourIndexSet():
            if nodePredicat is None or nodePredicat(self.__graph.getNode(index=n)):
                if edgePredicat is None or edgePredicat(self.__graph.getEdge(index1=self.index,index2=n)):
                    yield n

    def degree(self,nodeIndexes=None):
        '''
        return the count of edges linking this node to the
        set of nodes described by their indices in nodeIndexes

        :param nodeIndexes: set of node indices.
                            If set to None, all nodes of the
                            graph are taken into account.
                            Set to None by default.
        :type nodeIndexes: set of int

        :rtype: int
        '''
        if nodeIndexes is None:
            return len(self.__graph._node[self.index])
        else:
            return len(self.__graph._node[self.index] & nodeIndexes)

    def componentIndexSet(self,nodePredicat=None,edgePredicat=None):
        '''
        Return the set of node indices in the same connected component.

        :param nodePredicat: a function accepting one node as parameter
                             and returning True if this node must be
                             returned by the iterator.
        :type nodePredicat: function

        :param edgePredicat: a function accepting one edge as parameter
                             and returning True if the edge linking self and
                             the current node must be considered.
        :type edgePredicat: function

        :rtype: set of int
        '''
        cc=set([self.index])
        added = set(x for x in self.neighbourIndexIterator(nodePredicat, edgePredicat))
        while added:
            cc |= added
            added = reduce(lambda x,y : x | y,
                           (set(z for z in self.graph.getNode(index=c).neighbourIndexIterator(nodePredicat, edgePredicat))
                            for c in added),
                           set())
            added -= cc
        return cc

    def componentIterator(self,nodePredicat=None,edgePredicat=None):
        '''
        Iterate through the nodes in the same connected
        component.

        :rtype: iterator on :py:class:`Node` instances
        '''
        for c in self.componentIndexSet(nodePredicat, edgePredicat):
            yield self.graph.getNode(index=c)

    def shortestPathIterator(self,nodes=None):
        '''
        Iterate through the shortest paths sourcing
        from this node. If nodes is not None, iterates
        only over the paths linking this node to one node listed in
        nodes

        :param nodes: set of node indices
        :type nodes: iterable on int

        :return: an iterator on lists of int describing paths
        :rtype: iterator on list of int
        '''
        if nodes is not None:
            nodes = set(nodes)


        Q=[(self.index,-1)]

        gray = set([self.index])
        paths = {}

        while Q and (nodes is None or nodes):
            # FIFO pop: a breadth-first traversal is required to
            # guarantee that the yielded paths are shortest ones
            u,p = Q.pop(0)
            paths[u]=p
            next = self.graph._node[u] - gray
            gray|=next
            Q.extend((x,u) for x in next)
            if nodes is None or u in nodes:
                if nodes:
                    nodes.remove(u)
                path = [u]
                while p >= 0:
                    path.append(p)
                    p = paths[p]
                path.reverse()
                yield path

    def shortestPathTo(self,node=None,index=None):
        '''
        return one of the shortest paths linking this
        node to the specified node.

        :param node: a node label or None
        :param index: a node index or None. The parameter index
                      has priority over the parameter node.
        :type index: int

        :return: the list of node indices corresponding to the path or None
                 if no path exists.
        :rtype: list of int or None
        '''
        if index is None:
            index=self.graph.getNode(node).index
        for p in self.shortestPathIterator([index]):
            return p


    def __getitem__(self,key):
        '''
        return the value of the <key> property of this node

        :param key: the name of a property
        :type key: str
        '''
        return self.__graph._node_attrs.get(self.index,{})[key]

    def __setitem__(self,key,value):
        '''
        set the value of a node property. If the property doesn't
        already exist a new property is added to this node.

        :param key: the name of a property
        :type key: str
        :param value: the value of the property

        .. seealso::

            :py:meth:`Node.__getitem__`
        '''
        if self.index in self.__graph._node_attrs:
            data = self.__graph._node_attrs[self.index]
            data[key]=value
        else:
            self.graph._node_attrs[self.index]={key:value}

    def __len__(self):
        '''
        Count the neighbours of this node

        :rtype: int

        .. seealso::

            :py:meth:`Node.degree`
        '''
        return len(self.__graph._node[self.index])

    def __iter__(self):
        '''
        iterate through the neighbours of this node

        :rtype: iterator on :py:class:`Node` instances

        .. seealso::

            :py:meth:`Node.neighbourIterator`
        '''
        return self.neighbourIterator()

    def __contains__(self,key):
        return self.has_key(key)

    def __str__(self):

        if self.index in self.__graph._node_attrs:
            keys = " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"').replace('\n','\\n'))
                             for x in self.__graph._node_attrs[self.index].iteritems()]
                            )
        else:
            keys=''

        return '%d [label="%s" %s]' % (self.index,
                                       str(self.label).replace('"','\\"').replace('\n','\\n'),
                                       keys)

    def keys(self):
        if self.index in self.__graph._node_attrs:
            k = self.__graph._node_attrs[self.index].keys()
        else:
            k=[]
        return k

    label = property(getLabel, None, None, "Label of the node")

    graph = property(getGraph, None, None, "Graph owning this node")


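The traversal helpers above combine into one-liners for connectivity questions. A minimal sketch, not part of the module; the exact indices printed depend on insertion order:

# A minimal sketch (not part of the module).
from obitools.graph import Graph

g = Graph('demo')
for x, y in (('a','b'), ('b','c'), ('c','d'), ('x','y')):
    g.addEdge(x, y)

n = g.getNode('a')
print n.degree()                       # 1 : 'a' is only linked to 'b'
print n.componentIndexSet()            # the indices of a, b, c and d
print n.shortestPathTo('d')            # e.g. [0, 1, 2, 3]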
class Edge(object):
    """
    Class used for representing one edge of a graph

    """

    def __init__(self,node1,node2):
        '''
        .. warning::

            The :py:class:`Edge` constructor is usually called through the :py:class:`Graph` methods

        :param node1: First node linked by the edge
        :type node1: :py:class:`Node`
        :param node2: Second node linked by the edge
        :type node2: :py:class:`Node`
        '''
        self.node1 = node1
        self.node2 = node2

    def getGraph(self):
        """
        Return the :py:class:`Graph` instance owning this edge.
        """
        return self.node1.graph

    def has_key(self,key):
        '''
        test if the :py:class:`Edge` instance has a property named **key**.

        :param key: the name of a property
        :type key: str

        :return: True if the edge has a property named <key>
        :rtype: bool
        '''
        if (self.node1.index,self.node2.index) in self.graph._edge_attrs:
            return key in self.graph._edge_attrs[(self.node1.index,self.node2.index)]
        else:
            return False


    def getDirected(self):
        return self.node1.graph._directed

    def __getitem__(self,key):
        return self.graph._edge_attrs.get((self.node1.index,self.node2.index),{})[key]

    def __setitem__(self,key,value):
        e = (self.node1.index,self.node2.index)
        if e in self.graph._edge_attrs:
            data = self.graph._edge_attrs[e]
            data[key]=value
        else:
            self.graph._edge_attrs[e]={key:value}

    def __str__(self):
        e = (self.node1.index,self.node2.index)
        if e in self.graph._edge_attrs:
            keys = "[%s]" % " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"'))
                                      for x in self.graph._edge_attrs[e].iteritems()]
                                     )
        else:
            keys = ""

        if self.directed:
            link='->'
        else:
            link='--'

        return "%d %s %d %s" % (self.node1.index,link,self.node2.index,keys)

    def __contains__(self,key):
        return self.has_key(key)


    graph = property(getGraph, None, None, "Graph owning this edge")

    directed = property(getDirected, None, None, "True if the graph owning this edge is directed")


class DiGraph(Graph):
    """
    The :py:class:`DiGraph` class is a specialisation of the :py:class:`Graph` class
    dedicated to directed graph representation

    .. seealso::

        :py:class:`UndirectedGraph`

    """
    def __init__(self,label='G',indexer=None,nodes=None,edges=None):
        '''
        :param label: Graph name, set to 'G' by default
        :type label: str
        :param indexer: node label indexer
        :type indexer: Indexer instance
        :param nodes: set of nodes to add to the graph
        :type nodes: iterable value
        :param edges: set of edges to add to the graph
        :type edges: iterable value
        '''

        Graph.__init__(self, label, True, indexer, nodes, edges)

class UndirectedGraph(Graph):
    """
    The :py:class:`UndirectedGraph` class is a specialisation of the :py:class:`Graph` class
    dedicated to undirected graph representation

    .. seealso::

        :py:class:`DiGraph`

    """
    def __init__(self,label='G',indexer=None,nodes=None,edges=None):
        '''
        :param label: Graph name, set to 'G' by default
        :type label: str
        :param indexer: node label indexer
        :type indexer: Indexer instance
        :param nodes: set of nodes to add to the graph
        :type nodes: iterable value
        :param edges: set of edges to add to the graph
        :type edges: iterable value
        '''

        Graph.__init__(self, label, False, indexer, nodes, edges)



def selectEdgeAttributeFactory(attribut,value):
    """
    This function helps in building predicate functions usable for selecting edges
    in the following :py:class:`Graph` method:

        - :py:meth:`Graph.edgeIterator`

    """
    def selectEdge(e):
        return attribut in e and e[attribut]==value
    return selectEdge
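selectEdgeAttributeFactory builds exactly the kind of predicate expected by Graph.edgeIterator. A minimal sketch, not part of the module:

# A minimal sketch (not part of the module).
from obitools.graph import Graph, selectEdgeAttributeFactory

g = Graph('demo')
g.addEdge('a', 'b', kind='strong')
g.addEdge('b', 'c', kind='weak')

strong = selectEdgeAttributeFactory('kind', 'strong')
for e in g.edgeIterator(predicate=strong):
    print e                            # only the a -- b edge is printed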
BIN
obitools/graph/__init__.pyc
Normal file
Binary file not shown.
0
obitools/graph/algorithms/__init__.py
Normal file
BIN
obitools/graph/algorithms/__init__.pyc
Normal file
Binary file not shown.
134
obitools/graph/algorithms/clique.py
Normal file
@@ -0,0 +1,134 @@
import time
import sys



_maxsize=0
_solution=0
_notbound=0
_sizebound=0
_lastyield=0
_maxclique=None

def cliqueIterator(graph,minsize=1,node=None,timeout=None):
    global _maxsize,_solution,_notbound,_sizebound,_lastyield
    _maxsize=0
    _solution=0
    _notbound=0
    _sizebound=0
    starttime = time.time()

    if node:
        node = graph.getNode(node)
        index = node.index
        clique= set([index])
        candidates= set(graph.neighbourIndexSet(index=index))
    else:
        clique=set()
        candidates = set(x.index for x in graph)


#    candidates = set(x for x in candidates
#                     if len(graph.neighbourIndexSet(index=x) & candidates) >= (minsize - 1))

    _lastyield=time.time()
    for c in _cliqueIterator(graph,clique,candidates,set(),minsize,start=starttime,timeout=timeout):
        yield c




def _cliqueIterator(graph,clique,candidates,notlist,minsize=0,start=None,timeout=None):
    global _maxsize,_maxclique,_solution,_notbound,_sizebound,_lastyield

    # Speed indicator
    lclique = len(clique)
    lcandidates = len(candidates)
    notmin = lcandidates
    notfix = None

    for n in notlist:
        nnc = candidates - graph.neighbourIndexSet(index=n)
        nc = len(nnc)
        if nc < notmin:
            notmin=nc
            notfix=n
            notfixneib = nnc

    if lclique > _maxsize or not _solution % 1000 :
        if start is not None:
            top = time.time()
            delta = top - start
            if delta==0:
                delta=1e-6
            speed = _solution / delta
            start = top
        else:
            speed = 0
        print >>sys.stderr,"\rCandidates : %-5d Maximum clique size : %-5d Solutions explored : %10d speed = %5.2f solutions/sec sizebound=%10d notbound=%10d " % (lcandidates,_maxsize,_solution,speed,_sizebound,_notbound),
        sys.stderr.flush()
        if lclique > _maxsize:
            _maxsize=lclique

#        print >>sys.stderr,'koukou'

    timer = time.time() - _lastyield

    if not candidates and not notlist:
        if lclique==_maxsize:
            _maxclique=set(clique)
        if lclique >= minsize:
            yield set(clique)
        if timeout is not None and timer > timeout and _maxclique is not None:
            yield _maxclique
            _maxclique=None

    else:
        while notmin and candidates and ((lclique + len(candidates)) >= minsize or (timeout is not None and timer > timeout)):
            # count explored solutions
            _solution+=1

            if notfix is None:
                nextcandidate = candidates.pop()
            else:
                nextcandidate = notfixneib.pop()
                candidates.remove(nextcandidate)

            clique.add(nextcandidate)

            neighbours = graph.neighbourIndexSet(index=nextcandidate)

            nextcandidates = candidates & neighbours
            nextnot = notlist & neighbours

            nnc = candidates - neighbours
            lnnc=len(nnc)

            for c in _cliqueIterator(graph,
                                     set(clique),
                                     nextcandidates,
                                     nextnot,
                                     minsize,
                                     start,
                                     timeout=timeout):
                yield c


            clique.remove(nextcandidate)

            notmin-=1

            if lnnc < notmin:
                notmin = lnnc
                notfix = nextcandidate
                notfixneib = nnc

            if notmin==0:
                _notbound+=1

            notlist.add(nextcandidate)
        else:
            if (lclique + len(candidates)) < minsize:
                _sizebound+=1
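cliqueIterator is a Bron-Kerbosch style enumeration with a minimum size bound, an optional seed node, and an optional timeout after which the largest clique seen so far is yielded. A minimal sketch, not part of the module:

# A minimal sketch (not part of the module): a triangle plus a pendant
# edge; only the triangle reaches minsize=3.
from obitools.graph import UndirectedGraph
from obitools.graph.algorithms.clique import cliqueIterator

g = UndirectedGraph('demo')
for x, y in (('a','b'), ('b','c'), ('a','c'), ('c','d')):
    g.addEdge(x, y)

for clique in cliqueIterator(g, minsize=3):
    print sorted(clique)               # the indices of a, b and c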
8
obitools/graph/algorithms/compact.py
Normal file
@@ -0,0 +1,8 @@
def compactGraph(graph,nodeSetIterator):
    compact = graph.newEmpty()
    for ns in nodeSetIterator(graph):
        nlabel = "\n".join([str(graph.getNode(index=x).label) for x in ns])
        compact.addNode(nlabel)
    print
    print compact
    return compact
82
obitools/graph/algorithms/component.py
Normal file
@@ -0,0 +1,82 @@
"""
Iterate through the connected components of a graph
---------------------------------------------------

the module :py:mod:`obitools.graph.algorithms.component` provides
two functions to deal with the connected components of a graph
represented as an :py:class:`obitools.graph.Graph` instance.

The whole set of connected components of a graph is a partition of this graph,
so a node cannot belong to two distinct connected components.

Two nodes are in the same connected component if there exists a path through
the graph edges linking them.

TODO: There is certainly a bug with DirectedGraph

"""

def componentIterator(graph,nodePredicat=None,edgePredicat=None):
    '''
    Build an iterator over the connected components of a graph.
    Each connected component returned by the iterator is represented
    as a `set` of node indices.

    :param graph: the graph to partition
    :type graph: :py:class:`obitools.graph.Graph`

    :param nodePredicat: a function allowing node selection. Default value
                         is **None** and indicates that all nodes are selected.
    :type nodePredicat: a function returning a boolean value
                        and accepting one argument of class :py:class:`Node`

    :param edgePredicat: a function allowing edge selection. Default value
                         is **None** and indicates that all edges are selected.
    :type edgePredicat: a function returning a boolean value
                        and accepting one argument of class :py:class:`Edge`

    :return: an iterator over the connected component sets
    :rtype: an iterator over `set` of `int`

    .. seealso::
        the :py:meth:`obitools.graph.Node.componentIndexSet` method
        on which this function is based.
    '''
    seen = set()
    for n in graph.nodeIterator(nodePredicat):
        if n.index not in seen:
            cc=n.componentIndexSet(nodePredicat, edgePredicat)
            yield cc
            seen |= cc

def componentCount(graph,nodePredicat=None,edgePredicat=None):
    '''
    Count the connected components of a graph.

    :param graph: the graph to partition
    :type graph: :py:class:`obitools.graph.Graph`

    :param nodePredicat: a function allowing node selection. Default value
                         is **None** and indicates that all nodes are selected.
    :type nodePredicat: a function returning a boolean value
                        and accepting one argument of class :py:class:`Node`

    :param edgePredicat: a function allowing edge selection. Default value
                         is **None** and indicates that all edges are selected.
    :type edgePredicat: a function returning a boolean value
                        and accepting one argument of class :py:class:`Edge`

    :return: the number of connected components
    :rtype: int

    .. seealso::
        the :py:func:`componentIterator` function
        on which this function is based.
    '''
    n=0
    for c in componentIterator(graph,nodePredicat, edgePredicat):
        n+=1
    return n


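A minimal sketch of both functions, not part of the module: two disjoint edges give two connected components.

# A minimal sketch (not part of the module).
from obitools.graph import UndirectedGraph
from obitools.graph.algorithms.component import componentIterator,componentCount

g = UndirectedGraph('demo')
g.addEdge('a', 'b')
g.addEdge('x', 'y')

print componentCount(g)                # 2
for cc in componentIterator(g):
    print cc                           # one set of node indices per component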
BIN
obitools/graph/algorithms/component.pyc
Normal file
Binary file not shown.
80
obitools/graph/dag.py
Normal file
@@ -0,0 +1,80 @@
from obitools.graph import DiGraph,Node
from obitools.graph.algorithms.component import componentIterator

class DAG(DiGraph):
    def __init__(self,label='G',indexer=None,nodes=None,edges=None):
        '''
        Directed acyclic graph constructor.

        @param label: Graph name, set to 'G' by default
        @type label: str
        @param indexer: node label indexer
        @type indexer: Indexer instance
        @param nodes: set of nodes to add to the graph
        @type nodes: iterable value
        @param edges: set of edges to add to the graph
        @type edges: iterable value
        '''

        self._parents={}
        DiGraph.__init__(self, label, indexer, nodes, edges)

    def getNode(self,node=None,index=None):
        if index is None:
            index = self._index.getIndex(node, True)
        return DAGNode(index,self)

    def addEdge(self,parent=None,node=None,indexp=None,index=None,**data):
        indexp=self.addNode(parent, indexp)
        index =self.addNode(node  , index)

        pindex = set(n.index
                     for n in self.getNode(index=indexp).ancestorIterator())

        assert index not in pindex,'Child node cannot be a parent node'

        DiGraph.addEdge(self,index1=indexp,index2=index,**data)

        if index in self._parents:
            self._parents[index].add(indexp)
        else:
            self._parents[index]=set([indexp])


        return (indexp,index)

    def getRoots(self):
        return [self.getNode(index=cc.pop()).getRoot()
                for cc in componentIterator(self)]




class DAGNode(Node):

    def ancestorIterator(self):
        if self.index in self.graph._parents:
            for p in self.graph._parents[self.index]:
                parent = DAGNode(p,self.graph)
                yield parent
                for pnode in parent.ancestorIterator():
                    yield pnode

    def getRoot(self):
        # start from self so that a node without ancestors is its own root
        x = self
        for x in self.ancestorIterator():
            pass
        return x

    def leavesIterator(self):
        if not self:
            yield self
        for n in self:
            for nn in n.leavesIterator():
                yield nn

    def subgraphIterator(self):
        yield self
        for n in self:
            for nn in n.subgraphIterator():
                yield nn
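The DAG class keeps a reverse _parents map so that addEdge can walk the ancestors of the would-be parent and veto any edge closing a cycle. A minimal sketch, not part of the module:

# A minimal sketch (not part of the module).
from obitools.graph.dag import DAG

d = DAG('demo')
d.addEdge('root', 'left')
d.addEdge('root', 'right')
d.addEdge('left', 'leaf')

print [r.label for r in d.getRoots()]  # expected: ['root']
try:
    d.addEdge('leaf', 'root')          # 'root' is an ancestor of 'leaf'
except AssertionError:
    print 'cycle refused'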
0
obitools/graph/layout/__init__.py
Normal file
0
obitools/graph/layout/radialtree.py
Normal file
117
obitools/graph/rootedtree.py
Normal file
@@ -0,0 +1,117 @@
from obitools.graph.dag import DAG,DAGNode

class RootedTree(DAG):

    def addEdge(self,parent=None,node=None,indexp=None,index=None,**data):
        indexp=self.addNode(parent, indexp)
        index =self.addNode(node  , index)

        assert index not in self._parents or indexp in self._parents[index], \
               'Child node cannot have more than one parent node'

        return DAG.addEdge(self,indexp=indexp,index=index,**data)

    def getNode(self,node=None,index=None):
        if index is None:
            index = self._index.getIndex(node, True)
        return RootedTreeNode(index,self)



class RootedTreeNode(DAGNode):

    def subTreeSize(self):
        n=1
        for subnode in self:
            n+=subnode.subTreeSize()
        return n

    def subTreeLeaves(self):
        if not self:
            return 1
        n=0
        for subnode in self:
            n+=subnode.subTreeLeaves()
        return n


def nodeWriter(node,deep=0,label=None,distance="distance", bootstrap="bootstrap",cartoon=None,collapse=None):

    ks = node.keys()


    if label is None:
        name=node.label
    elif callable(label):
        name=label(node)
    elif isinstance(label, str) and label in node:
        name=node[label]
        ks.remove(label)
    else:
        name=''

    if distance in node:
        dist=':%6.5f' % node[distance]
        ks.remove(distance)
    else:
        dist=''

    ks = ["%s=%s" % (k,node[k]) for k in ks]

    if cartoon is not None and cartoon(node):
        ks.append("!cartoon={%d,0.0}" % node.subTreeLeaves())

    if collapse is not None and collapse(node):
        ks.append('!collapse={"collapsed",0.0}')

    if ks:
        ks="[&"+",".join(ks)+"]"
    else:
        ks=''


    nodeseparator = ',\n' + ' ' * (deep+1)

    subnodes = nodeseparator.join([nodeWriter(x, deep+1,label,distance,bootstrap,cartoon=cartoon,collapse=collapse)
                                   for x in node])
    if subnodes:
        subnodes='(\n' + ' ' * (deep+1) + subnodes + '\n' + ' ' * deep + ')'

    return '%s"%s"%s%s' % (subnodes,name,ks,dist)


def nexusFormat(tree,startnode=None,label=None,blocks="",cartoon=None,collapse=None):
    head="#NEXUS\n"

    tx = []

    for n in tree:
        if label is None:
            name=n.label
        elif callable(label):
            name=label(n)
        elif isinstance(label, str) and label in n:
            name=n[label]
        else:
            name=''

        if name:
            tx.append('"%s"' % name)

    taxa = "begin taxa;\n\tdimensions ntax=%d;\n\ttaxlabels\n\t" % len(tx)

    taxa+="\n\t".join(tx)

    taxa+="\n;\nend;\n\n"



    if startnode is not None:
        roots =[startnode]
    else:
        roots = tree.getRoots()
    trees = nodeWriter(roots[0],0,label,cartoon=cartoon,collapse=collapse)
    trees = "begin trees;\n\ttree tree_1 = [&R] "+ trees +";\nend;\n\n"
    return head+taxa+trees+"\n\n"+blocks+"\n"

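nexusFormat assembles a NEXUS document from a RootedTree: a taxa block listing the labels, then a trees block whose Newick-like string is produced by nodeWriter, including the FigTree-style !cartoon and !collapse annotations. A minimal sketch, not part of the module:

# A minimal sketch (not part of the module): serialise a three-leaf tree.
from obitools.graph.rootedtree import RootedTree,nexusFormat

t = RootedTree('demo')
t.addEdge('root', 'A')
t.addEdge('root', 'B')
t.addEdge('root', 'C')

print nexusFormat(t)                   # a "#NEXUS" document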
37
obitools/graph/tree.py
Normal file
@@ -0,0 +1,37 @@
from obitools.graph import UndirectedGraph,Node
from obitools.graph.algorithms.component import componentCount


class Forest(UndirectedGraph):


    def getNode(self,node=None,index=None):
        if index is None:
            index = self._index.getIndex(node, True)
        return TreeNode(index,self)

    def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data):
        index1=self.addNode(node1, index1)
        index2=self.addNode(node2, index2)

        cc = set(n.index for n in self.getNode(index=index2).componentIterator())

        assert index1 in self._node[index2] or index1 not in cc, \
               "No more than one path is allowed between two nodes in a tree"

        UndirectedGraph.addEdge(self, index1=index1, index2=index2,**data)

        return (index1,index2)

    def isASingleTree(self):
        return componentCount(self)==1

class TreeNode(Node):

    def componentIterator(self):
        # depth-first walk with a seen set, so that undirected edges do not
        # lead back to nodes that have already been visited
        seen = set([self.index])
        stack = [self]
        while stack:
            node = stack.pop()
            yield node
            for c in node:
                if c.index not in seen:
                    seen.add(c.index)
                    stack.append(c)

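Forest extends the cycle check to undirected trees: an edge is refused whenever its two endpoints already belong to the same connected component. A minimal sketch, not part of the module:

# A minimal sketch (not part of the module).
from obitools.graph.tree import Forest

f = Forest('demo')
f.addEdge('a', 'b')
f.addEdge('b', 'c')
print f.isASingleTree()                # True
try:
    f.addEdge('c', 'a')                # 'a' and 'c' are already connected
except AssertionError:
    print 'cycle refused'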
504
obitools/gzip.py
Normal file
@@ -0,0 +1,504 @@
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed.

This consists of a patched version of the standard gzip python
module based on Andrew Kuchling's minigzip.py distributed with the zlib module

"""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import struct, sys, time
import zlib
import __builtin__

__all__ = ["GzipFile","open"]

FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

READ, WRITE = 1, 2

def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
    """
    if i < 0:
        i += 1L << 32
    return i

def LOWU32(i):
    """Return the low-order 32 bits of an int, as a non-negative int."""
    return i & 0xFFFFFFFFL

def write32(output, value):
    output.write(struct.pack("<l", value))

def write32u(output, value):
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    output.write(struct.pack("<L", value))

def read32(input):
    return struct.unpack("<l", input.read(4))[0]

def unpack32(buf):
    return struct.unpack("<l", buf)[0]

def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)

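The patched reader keeps an internal input buffer plus the last eight bytes seen, which is what lets it verify the CRC trailer and chain several gzip members within one stream. A minimal usage sketch, assuming a file data.txt.gz exists (the file name is illustrative):

# A minimal sketch (not part of the module), assuming 'data.txt.gz' exists.
from obitools import gzip

f = gzip.open('data.txt.gz')           # read mode by default
data = f.read()                        # transparently decompressed, including
f.close()                              # multi-member archives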
class GzipFile:
|
||||
"""The GzipFile class simulates most of the methods of a file object with
|
||||
the exception of the readinto() and truncate() methods.
|
||||
|
||||
"""
|
||||
|
||||
myfileobj = None
|
||||
max_read_chunk = 10 * 1024 * 1024 # 10Mb
|
||||
|
||||
def __init__(self, filename=None, mode=None,
|
||||
compresslevel=9, fileobj=None):
|
||||
"""Constructor for the GzipFile class.
|
||||
|
||||
At least one of fileobj and filename must be given a
|
||||
non-trivial value.
|
||||
|
||||
The new class instance is based on fileobj, which can be a regular
|
||||
file, a StringIO object, or any other object which simulates a file.
|
||||
It defaults to None, in which case filename is opened to provide
|
||||
a file object.
|
||||
|
||||
When fileobj is not None, the filename argument is only used to be
|
||||
included in the gzip file header, which may includes the original
|
||||
filename of the uncompressed file. It defaults to the filename of
|
||||
fileobj, if discernible; otherwise, it defaults to the empty string,
|
||||
and in this case the original filename is not included in the header.
|
||||
|
||||
The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
|
||||
depending on whether the file will be read or written. The default
|
||||
is the mode of fileobj if discernible; otherwise, the default is 'rb'.
|
||||
Be aware that only the 'rb', 'ab', and 'wb' values should be used
|
||||
for cross-platform portability.
|
||||
|
||||
The compresslevel argument is an integer from 1 to 9 controlling the
|
||||
level of compression; 1 is fastest and produces the least compression,
|
||||
and 9 is slowest and produces the most compression. The default is 9.
|
||||
|
||||
"""
|
||||
|
||||
# guarantee the file is opened in binary mode on platforms
|
||||
# that care about that sort of thing
|
||||
if mode and 'b' not in mode:
|
||||
mode += 'b'
|
||||
if fileobj is None:
|
||||
fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
|
||||
if filename is None:
|
||||
if hasattr(fileobj, 'name'): filename = fileobj.name
|
||||
else: filename = ''
|
||||
if mode is None:
|
||||
if hasattr(fileobj, 'mode'): mode = fileobj.mode
|
||||
else: mode = 'rb'
|
||||
|
||||
if mode[0:1] == 'r':
|
||||
self.mode = READ
|
||||
# Set flag indicating start of a new member
|
||||
self._new_member = True
|
||||
self.extrabuf = ""
|
||||
self.extrasize = 0
|
||||
self.filename = filename
|
||||
# Starts small, scales exponentially
|
||||
self.min_readsize = 100
|
||||
|
||||
elif mode[0:1] == 'w' or mode[0:1] == 'a':
|
||||
self.mode = WRITE
|
||||
self._init_write(filename)
|
||||
self.compress = zlib.compressobj(compresslevel,
|
||||
zlib.DEFLATED,
|
||||
-zlib.MAX_WBITS,
|
||||
zlib.DEF_MEM_LEVEL,
|
||||
0)
|
||||
else:
|
||||
raise IOError, "Mode " + mode + " not supported"
|
||||
|
||||
self.fileobj = fileobj
|
||||
self.offset = 0
|
||||
self.inputbuf = ''
|
||||
self.last8 = ''
|
||||
|
||||
if self.mode == WRITE:
|
||||
self._write_gzip_header()
|
||||
|
||||
def __repr__(self):
|
||||
s = repr(self.fileobj)
|
||||
return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
|
||||
|
||||
def _init_write(self, filename):
|
||||
if filename[-3:] != '.gz':
|
||||
filename = filename + '.gz'
|
||||
self.filename = filename
|
||||
self.crc = zlib.crc32("")
|
||||
self.size = 0
|
||||
self.writebuf = []
|
||||
self.bufsize = 0
|
||||
|
||||
def _write_gzip_header(self):
|
||||
self.fileobj.write('\037\213') # magic header
|
||||
self.fileobj.write('\010') # compression method
|
||||
fname = self.filename[:-3]
|
||||
flags = 0
|
||||
if fname:
|
||||
flags = FNAME
|
||||
self.fileobj.write(chr(flags))
|
||||
write32u(self.fileobj, long(time.time()))
|
||||
self.fileobj.write('\002')
|
||||
self.fileobj.write('\377')
|
||||
if fname:
|
||||
self.fileobj.write(fname + '\000')
|
||||
|
||||
def _init_read(self):
|
||||
self.crc = zlib.crc32("")
|
||||
self.size = 0
|
||||
|
||||
def _read_internal(self, size):
|
||||
if len(self.inputbuf) < size:
|
||||
self.inputbuf += self.fileobj.read(size-len(self.inputbuf))
|
||||
chunk = self.inputbuf[:size]
|
||||
# need to use len(chunk) bellow instead of size in case it's EOF.
|
||||
if len(chunk) < 8:
|
||||
self.last8 = self.last8[len(chunk):] + chunk
|
||||
else:
|
||||
self.last8 = chunk[-8:]
|
||||
self.inputbuf = self.inputbuf[size:]
|
||||
return chunk
|
||||
|
||||
def _read_gzip_header(self):
|
||||
magic = self._read_internal(2)
|
||||
if len(magic) != 2:
|
||||
raise EOFError, "Reached EOF"
|
||||
if magic != '\037\213':
|
||||
raise IOError, 'Not a gzipped file'
|
||||
method = ord( self._read_internal(1) )
|
||||
if method != 8:
|
||||
raise IOError, 'Unknown compression method'
|
||||
flag = ord( self._read_internal(1) )
|
||||
# modtime = self.fileobj.read(4)
|
||||
# extraflag = self.fileobj.read(1)
|
||||
# os = self.fileobj.read(1)
|
||||
self._read_internal(6)
|
||||
|
||||
if flag & FEXTRA:
|
||||
# Read & discard the extra field, if present
|
||||
xlen = ord(self._read_internal(1))
|
||||
xlen = xlen + 256*ord(self._read_internal(1))
|
||||
self._read_internal(xlen)
|
||||
if flag & FNAME:
|
||||
# Read and discard a null-terminated string containing the filename
|
||||
while True:
|
||||
s = self._read_internal(1)
|
||||
if not s or s=='\000':
|
||||
break
|
||||
if flag & FCOMMENT:
|
||||
# Read and discard a null-terminated string containing a comment
|
||||
while True:
|
||||
s = self._read_internal(1)
|
||||
if not s or s=='\000':
|
||||
break
|
||||
if flag & FHCRC:
|
||||
self._read_internal(2) # Read & discard the 16-bit header CRC
|
||||
|
||||
|
||||
def write(self,data):
|
||||
if self.mode != WRITE:
|
||||
import errno
|
||||
raise IOError(errno.EBADF, "write() on read-only GzipFile object")
|
||||
|
||||
if self.fileobj is None:
|
||||
raise ValueError, "write() on closed GzipFile object"
|
||||
if len(data) > 0:
|
||||
self.size = self.size + len(data)
|
||||
self.crc = zlib.crc32(data, self.crc)
|
||||
self.fileobj.write( self.compress.compress(data) )
|
||||
self.offset += len(data)
|
||||
|
||||
def read(self, size=-1):
|
||||
if self.mode != READ:
|
||||
import errno
|
||||
raise IOError(errno.EBADF, "read() on write-only GzipFile object")
|
||||
|
||||
if self.extrasize <= 0 and self.fileobj is None:
|
||||
return ''
|
||||
|
||||
readsize = 1024
|
||||
if size < 0: # get the whole thing
|
||||
try:
|
||||
while True:
|
||||
self._read(readsize)
|
||||
readsize = min(self.max_read_chunk, readsize * 2)
|
||||
except EOFError:
|
||||
size = self.extrasize
|
||||
else: # just get some more of it
|
||||
try:
|
||||
while size > self.extrasize:
|
||||
self._read(readsize)
|
||||
readsize = min(self.max_read_chunk, readsize * 2)
|
||||
except EOFError:
|
||||
if size > self.extrasize:
|
||||
size = self.extrasize
|
||||
|
||||
chunk = self.extrabuf[:size]
|
||||
self.extrabuf = self.extrabuf[size:]
|
||||
self.extrasize = self.extrasize - size
|
||||
|
||||
self.offset += size
|
||||
return chunk
|
||||
|
||||
def _unread(self, buf):
|
||||
self.extrabuf = buf + self.extrabuf
|
||||
self.extrasize = len(buf) + self.extrasize
|
||||
self.offset -= len(buf)
|
||||
|
||||
def _read(self, size=1024):
|
||||
if self.fileobj is None:
|
||||
raise EOFError, "Reached EOF"
|
||||
|
||||
if self._new_member:
|
||||
# If the _new_member flag is set, we have to
|
||||
# jump to the next member, if there is one.
|
||||
#
|
||||
# _read_gzip_header will raise EOFError exception
|
||||
# if there no more members to read.
|
||||
self._init_read()
|
||||
self._read_gzip_header()
|
||||
self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
|
||||
self._new_member = False
|
||||
|
||||
# Read a chunk of data from the file
|
||||
buf = self._read_internal(size)
|
||||
|
||||
# If the EOF has been reached, flush the decompression object
|
||||
# and mark this object as finished.
|
||||
|
||||
if buf == "":
|
||||
uncompress = self.decompress.flush()
|
||||
self._read_eof()
|
||||
self._add_read_data( uncompress )
|
||||
raise EOFError, 'Reached EOF'
|
||||
|
||||
uncompress = self.decompress.decompress(buf)
|
||||
self._add_read_data( uncompress )
|
||||
|
||||
if self.decompress.unused_data != "":
|
||||
# Ending case: we've come to the end of a member in the file,
|
||||
# so put back unused_data and initialize last8 by reading them.
|
||||
self.inputbuf = self.decompress.unused_data + self.inputbuf
|
||||
self._read_internal(8)
|
||||
|
||||
# Check the CRC and file size, and set the flag so we read
|
||||
# a new member on the next call
|
||||
self._read_eof()
|
||||
self._new_member = True
|
||||
|
||||
def _add_read_data(self, data):
|
||||
self.crc = zlib.crc32(data, self.crc)
|
||||
self.extrabuf = self.extrabuf + data
|
||||
self.extrasize = self.extrasize + len(data)
|
||||
self.size = self.size + len(data)
|
||||
|
||||
def _read_eof(self):
|
||||
# We've read to the end of the file, so we have to rewind in order
|
||||
# to reread the 8 bytes containing the CRC and the file size.
|
||||
# We check that the computed CRC and size of the
|
||||
# uncompressed data matches the stored values. Note that the size
|
||||
# stored is the true file size mod 2**32.
|
||||
crc32 = unpack32(self.last8[:4])
|
||||
isize = U32(unpack32(self.last8[4:])) # may exceed 2GB
|
||||
if U32(crc32) != U32(self.crc):
|
||||
raise IOError, "CRC check failed"
|
||||
elif isize != LOWU32(self.size):
|
||||
raise IOError, "Incorrect length of data produced"
|
||||
|
||||
def close(self):
|
||||
if self.mode == WRITE:
|
||||
self.fileobj.write(self.compress.flush())
|
||||
# The native zlib crc is an unsigned 32-bit integer, but
|
||||
# the Python wrapper implicitly casts that to a signed C
|
||||
# long. So, on a 32-bit box self.crc may "look negative",
|
||||
# while the same crc on a 64-bit box may "look positive".
|
||||
# To avoid irksome warnings from the `struct` module, force
|
||||
# it to look positive on all boxes.
|
||||
write32u(self.fileobj, LOWU32(self.crc))
|
||||
# self.size may exceed 2GB, or even 4GB
|
||||
write32u(self.fileobj, LOWU32(self.size))
|
||||
self.fileobj = None
|
||||
elif self.mode == READ:
|
||||
self.fileobj = None
|
||||
if self.myfileobj:
|
||||
self.myfileobj.close()
|
||||
self.myfileobj = None
|
||||
|
||||
def __del__(self):
|
||||
try:
|
||||
if (self.myfileobj is None and
|
||||
self.fileobj is None):
|
||||
return
|
||||
except AttributeError:
|
||||
return
|
||||
self.close()
|
||||
|
||||
def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
|
||||
if self.mode == WRITE:
|
||||
# Ensure the compressor's buffer is flushed
|
||||
self.fileobj.write(self.compress.flush(zlib_mode))
|
||||
self.fileobj.flush()
|
||||
|
||||
def fileno(self):
|
||||
"""Invoke the underlying file object's fileno() method.
|
||||
|
||||
This will raise AttributeError if the underlying file object
|
||||
doesn't support fileno().
|
||||
"""
|
||||
return self.fileobj.fileno()
|
||||
|
||||
def isatty(self):
|
||||
return False
|
||||
|
||||
def tell(self):
|
||||
return self.offset
|
||||
|
||||
def rewind(self):
|
||||
'''Return the uncompressed stream file position indicator to the
|
||||
beginning of the file'''
|
||||
if self.mode != READ:
|
||||
raise IOError("Can't rewind in write mode")
|
||||
self.fileobj.seek(0)
|
||||
self._new_member = True
|
||||
self.extrabuf = ""
|
||||
self.extrasize = 0
|
||||
self.offset = 0
|
||||
|
||||
def seek(self, offset):
|
||||
if self.mode == WRITE:
|
||||
if offset < self.offset:
|
||||
raise IOError('Negative seek in write mode')
|
||||
count = offset - self.offset
|
||||
for i in range(count // 1024):
|
||||
self.write(1024 * '\0')
|
||||
self.write((count % 1024) * '\0')
|
||||
elif self.mode == READ:
|
||||
if offset < self.offset:
|
||||
# for negative seek, rewind and do positive seek
|
||||
self.rewind()
|
||||
count = offset - self.offset
|
||||
for i in range(count // 1024):
|
||||
self.read(1024)
|
||||
self.read(count % 1024)
|
||||
|
||||
def readline(self, size=-1):
|
||||
if size < 0:
|
||||
size = sys.maxint
|
||||
readsize = self.min_readsize
|
||||
else:
|
||||
readsize = size
|
||||
bufs = []
|
||||
while size != 0:
|
||||
c = self.read(readsize)
|
||||
i = c.find('\n')
|
||||
|
||||
# We set i=size to break out of the loop under two
|
||||
# conditions: 1) there's no newline, and the chunk is
|
||||
# larger than size, or 2) there is a newline, but the
|
||||
# resulting line would be longer than 'size'.
|
||||
if (size <= i) or (i == -1 and len(c) > size):
|
||||
i = size - 1
|
||||
|
||||
if i >= 0 or c == '':
|
||||
bufs.append(c[:i + 1]) # Add portion of last chunk
|
||||
self._unread(c[i + 1:]) # Push back rest of chunk
|
||||
break
|
||||
|
||||
# Append chunk to list, decrease 'size',
|
||||
bufs.append(c)
|
||||
size = size - len(c)
|
||||
readsize = min(size, readsize * 2)
|
||||
if readsize > self.min_readsize:
|
||||
self.min_readsize = min(readsize, self.min_readsize * 2, 512)
|
||||
return ''.join(bufs) # Return resulting line
|
||||
|
||||
def readlines(self, sizehint=0):
|
||||
# Negative numbers result in reading all the lines
|
||||
if sizehint <= 0:
|
||||
sizehint = sys.maxint
|
||||
L = []
|
||||
while sizehint > 0:
|
||||
line = self.readline()
|
||||
if line == "":
|
||||
break
|
||||
L.append(line)
|
||||
sizehint = sizehint - len(line)
|
||||
|
||||
return L
|
||||
|
||||
def writelines(self, L):
|
||||
for line in L:
|
||||
self.write(line)
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def next(self):
|
||||
line = self.readline()
|
||||
if line:
|
||||
return line
|
||||
else:
|
||||
raise StopIteration
|
||||
|
||||
|
||||
def _test():
|
||||
# Act like gzip; with -d, act like gunzip.
|
||||
# The input file is not deleted, however, nor are any other gzip
|
||||
# options or features supported.
|
||||
args = sys.argv[1:]
|
||||
decompress = args and args[0] == "-d"
|
||||
if decompress:
|
||||
args = args[1:]
|
||||
if not args:
|
||||
args = ["-"]
|
||||
for arg in args:
|
||||
if decompress:
|
||||
if arg == "-":
|
||||
f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
|
||||
g = sys.stdout
|
||||
else:
|
||||
if arg[-3:] != ".gz":
|
||||
print "filename doesn't end in .gz:", repr(arg)
|
||||
continue
|
||||
f = open(arg, "rb")
|
||||
g = __builtin__.open(arg[:-3], "wb")
|
||||
else:
|
||||
if arg == "-":
|
||||
f = sys.stdin
|
||||
g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
|
||||
else:
|
||||
f = __builtin__.open(arg, "rb")
|
||||
g = open(arg + ".gz", "wb")
|
||||
while True:
|
||||
chunk = f.read(1024)
|
||||
if not chunk:
|
||||
break
|
||||
g.write(chunk)
|
||||
if g is not sys.stdout:
|
||||
g.close()
|
||||
if f is not sys.stdin:
|
||||
f.close()
|
||||
|
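# Example invocations of the _test() driver below (file names are
# hypothetical):
#
#   python gzip.py data.txt        # compresses to data.txt.gz
#   python gzip.py -d data.txt.gz  # decompresses back to data.txt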
||||
if __name__ == '__main__':
|
||||
_test()
|
BIN
obitools/gzip.pyc
Normal file
BIN
obitools/gzip.pyc
Normal file
Binary file not shown.
538
obitools/location/__init__.py
Normal file
538
obitools/location/__init__.py
Normal file
@ -0,0 +1,538 @@
|
||||
import obitools
|
||||
import re
|
||||
import array
|
||||
|
||||
class Location(object):
|
||||
"""
|
||||
Define a location on a sequence.
|
||||
"""
|
||||
|
||||
def extractSequence(self,sequence):
|
||||
'''
|
||||
Extract subsequence corresponding to a Location.
|
||||
|
||||
@param sequence:
|
||||
@type sequence: C{BioSequence} or C{str}
|
||||
'''
|
||||
assert isinstance(sequence, (obitools.BioSequence,str)), \
|
||||
"sequence must be an instance of str or BioSequence"
|
||||
|
||||
if isinstance(sequence, str):
|
||||
seq = self._extractSequence(sequence)
|
||||
else:
|
||||
if isinstance(sequence, obitools.AASequence):
|
||||
assert not self.needNucleic(), \
|
||||
"This location can be used only with Nucleic sequences"
|
||||
seq = self._extractSequence(str(sequence))
|
||||
|
||||
if isinstance(sequence, obitools.AASequence):
|
||||
st = obitools.AASequence
|
||||
else:
|
||||
st = obitools.NucSequence
|
||||
|
||||
seq = st(sequence.id,
|
||||
seq,
|
||||
sequence.definition,
|
||||
**sequence.getTags())
|
||||
seq['location']=str(self)
|
||||
|
||||
if 'length' in sequence.getTags():
|
||||
seq['length']=len(seq)
|
||||
|
||||
if hasattr(sequence, 'quality'):
|
||||
quality = self._extractQuality(sequence)
|
||||
seq.quality=quality
|
||||
|
||||
return seq
|
||||
|
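# A minimal usage sketch (the sequence string and coordinates are invented
# for illustration). extractSequence() accepts either a plain str or a
# BioSequence and returns the region covered by the location:
#
#   >>> SimpleLocation(2,5).extractSequence('acgtacgt')
#   'cgta'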
||||
def isDirect(self):
|
||||
return None
|
||||
|
||||
def isSimple(self):
|
||||
'''
|
||||
Indicate if a location is composed of a single continuous
|
||||
region or is composed by the junction of several locations
|
||||
by the C{join} operator.
|
||||
|
||||
@return: C{True} if the location is composed of a single
|
||||
continuous region.
|
||||
@rtype: bool
|
||||
'''
|
||||
|
||||
return None
|
||||
|
||||
def isFullLength(self):
|
||||
return None
|
||||
|
||||
def needNucleic(self):
|
||||
'''
|
||||
If a location contains a complement operator, it can be used
|
||||
only on nucleic sequences.
|
||||
|
||||
@return: C{True} if location contains a complement operator
|
||||
@rtype: bool
|
||||
'''
|
||||
return None
|
||||
|
||||
def getGloc(self):
|
||||
loc = self.simplify()
|
||||
assert loc.isDirect() is not None,"Gloc cannot be created for a multi-oriented location : %s" % str(loc)
|
||||
positions = ','.join([str(x) for x in loc._getglocpos()])
|
||||
return "(%s,%s)" % ({True:'T',False:'F'}[loc.isDirect()],
|
||||
positions)
|
||||
|
||||
def shift(self,s):
|
||||
return None
|
||||
|
||||
def getBegin(self):
|
||||
return None
|
||||
|
||||
def getEnd(self):
|
||||
return None
|
||||
|
||||
def getFivePrime(self):
|
||||
return self.getBegin()
|
||||
|
||||
def getThreePrime(self):
|
||||
return self.getEnd()
|
||||
|
||||
begin = property(getBegin,None,None,"beginning position of the location")
|
||||
end = property(getEnd,None,None,"ending position of the location")
|
||||
fivePrime=property(getFivePrime,None,None,"5' position of the location")
|
||||
threePrime=property(getThreePrime,None,None,"3' position of the location")
|
||||
|
||||
def __abs__(self):
|
||||
assert self.isDirect() is not None,"Abs operator cannot be applied on non oriented location"
|
||||
if self.isDirect():
|
||||
return self
|
||||
else:
|
||||
return ComplementLocation(self).simplify()
|
||||
|
||||
def __cmp__(self,y):
|
||||
if self.begin < y.begin:
|
||||
return -1
|
||||
if self.begin > y.begin:
|
||||
return 1
|
||||
if self.isDirect() == y.isDirect():
|
||||
return 0
|
||||
if self.isDirect() and not y.isDirect():
|
||||
return -1
|
||||
return 1
|
||||
|
||||
class SimpleLocation(Location):
|
||||
"""
|
||||
A simple location describes a continuous region of
|
||||
a sequence defined by a C{begin} and a C{end} position.
|
||||
"""
|
||||
|
||||
def __init__(self,begin,end):
|
||||
'''
|
||||
Build a new C{SimpleLocation} instance. Valid
|
||||
positions are defined on M{[1,N]} with N the length
|
||||
of the sequence.
|
||||
|
||||
@param begin: start position of the location
|
||||
@type begin: int
|
||||
@param end: end position of the location
|
||||
@type end: int
|
||||
'''
|
||||
assert begin > 0 and end > 0
|
||||
|
||||
self._begin = begin
|
||||
self._end = end
|
||||
self._before=False
|
||||
self._after=False
|
||||
|
||||
def _extractSequence(self,sequence):
|
||||
|
||||
assert ( self._begin < len(sequence)
|
||||
and self._end <= len(sequence)), \
|
||||
"Sequence length %d is too short" % len(sequence)
|
||||
|
||||
return sequence[self._begin-1:self._end]
|
||||
|
||||
def _extractQuality(self,sequence):
|
||||
|
||||
assert ( self._begin < len(sequence)
|
||||
and self._end <= len(sequence)), \
|
||||
"Sequence length %d is too short" % len(sequence)
|
||||
|
||||
return sequence.quality[self._begin-1:self._end]
|
||||
|
||||
|
||||
def isDirect(self):
|
||||
return True
|
||||
|
||||
def isSimple(self):
|
||||
return True
|
||||
|
||||
def isFullLength(self):
|
||||
return not (self.before or self.after)
|
||||
|
||||
def simplify(self):
|
||||
if self._begin == self._end:
|
||||
return PointLocation(self._begin)
|
||||
else:
|
||||
return self
|
||||
|
||||
def needNucleic(self):
|
||||
return False
|
||||
|
||||
def __str__(self):
|
||||
before = {True:'<',False:''}[self.before]
|
||||
after = {True:'>',False:''}[self.after]
|
||||
return "%s%d..%s%d" % (before,self._begin,after,self._end)
|
||||
|
||||
def shift(self,s):
|
||||
assert (self._begin + s) > 0,"shift too large (%d)" % s
|
||||
if s == 0:
|
||||
return self
|
||||
return SimpleLocation(self._begin + s, self._end + s)
|
||||
|
||||
def _getglocpos(self):
|
||||
return (self.begin,self.end)
|
||||
|
||||
def getGloc(self):
|
||||
positions = ','.join([str(x) for x in self._getglocpos()])
|
||||
return "(%s,%s)" % ({True:'T',False:'F'}[self.isDirect()],
|
||||
positions)
|
||||
|
||||
def getBegin(self):
|
||||
return self._begin
|
||||
|
||||
def getEnd(self):
|
||||
return self._end
|
||||
|
||||
|
||||
begin = property(getBegin,None,None,"beginning position of the location")
|
||||
end = property(getEnd,None,None,"ending position of the location")
|
||||
|
||||
def getBefore(self):
|
||||
return self._before
|
||||
|
||||
def getAfter(self):
|
||||
return self._after
|
||||
|
||||
def setBefore(self,value):
|
||||
assert isinstance(value, bool)
|
||||
self._before=value
|
||||
|
||||
def setAfter(self,value):
|
||||
assert isinstance(value, bool)
|
||||
self._after=value
|
||||
|
||||
before=property(getBefore,setBefore,None)
|
||||
after=property(getAfter,setAfter,None)
|
||||
|
||||
|
||||
|
||||
|
||||
class PointLocation(Location):
|
||||
"""
|
||||
A point location describes a location on a sequence
|
||||
limited to a single position
|
||||
"""
|
||||
|
||||
def __init__(self,position):
|
||||
assert position > 0
|
||||
self._pos=position
|
||||
|
||||
def _extractSequence(self,sequence):
|
||||
|
||||
assert self._pos <= len(sequence), \
|
||||
"Sequence length %d is too short" % len(sequence)
|
||||
|
||||
return sequence[self._pos-1]
|
||||
|
||||
def _extractQuality(self,sequence):
|
||||
|
||||
assert self._pos <= len(sequence), \
|
||||
"Sequence length %d is too short" % len(sequence)
|
||||
|
||||
return sequence[self._pos-1:self._pos]
|
||||
|
||||
def isDirect(self):
|
||||
return True
|
||||
|
||||
def isSimple(self):
|
||||
return True
|
||||
|
||||
def isFullLength(self):
|
||||
return True
|
||||
|
||||
def simplify(self):
|
||||
return self
|
||||
|
||||
def needNucleic(self):
|
||||
return False
|
||||
|
||||
def shift(self,s):
|
||||
assert (self._pos + s) > 0,"shift too large (%d)" % s
|
||||
if s == 0:
|
||||
return self
|
||||
return PointLocation(self._pos + s)
|
||||
|
||||
def _getglocpos(self):
|
||||
return (self._pos,self._pos)
|
||||
|
||||
def getBegin(self):
|
||||
return self._pos
|
||||
|
||||
def getEnd(self):
|
||||
return self._pos
|
||||
|
||||
begin = property(getBegin,None,None,"beginning position of the location")
|
||||
end = property(getEnd,None,None,"ending position of the location")
|
||||
|
||||
def __str__(self):
|
||||
return str(self._pos)
|
||||
|
||||
class CompositeLocation(Location):
|
||||
"""
|
||||
"""
|
||||
def __init__(self,locations):
|
||||
self._locs = tuple(locations)
|
||||
|
||||
|
||||
def _extractSequence(self,sequence):
|
||||
seq = ''.join([x._extractSequence(sequence)
|
||||
for x in self._locs])
|
||||
return seq
|
||||
|
||||
def _extractQuality(self,sequence):
|
||||
rep=array.array('d',[])
|
||||
for x in self._locs:
|
||||
rep.extend(x._extractQuality(sequence))
|
||||
return rep
|
||||
|
||||
def isDirect(self):
|
||||
hasDirect,hasReverse = reduce(lambda x,y: (x[0] or y,x[1] or not y),
|
||||
(z.isDirect() for z in self._locs),(False,False))
|
||||
|
||||
if hasDirect and not hasReverse:
|
||||
return True
|
||||
if hasReverse and not hasDirect:
|
||||
return False
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def isSimple(self):
|
||||
return False
|
||||
|
||||
|
||||
def simplify(self):
|
||||
if len(self._locs)==1:
|
||||
return self._locs[0]
|
||||
|
||||
rep = CompositeLocation(x.simplify() for x in self._locs)
|
||||
|
||||
if reduce(lambda x,y : x and y,
|
||||
(isinstance(z, ComplementLocation)
|
||||
for z in self._locs)):
|
||||
rep = ComplementLocation(CompositeLocation(x._loc.simplify()
|
||||
for x in rep._locs[::-1]))
|
||||
|
||||
return rep
|
||||
|
||||
def isFullLength(self):
|
||||
return reduce(lambda x,y : x and y, (z.isFullLength() for z in self._locs),1)
|
||||
|
||||
def needNucleic(self):
|
||||
return reduce(lambda x,y : x or y,
|
||||
(z.needNucleic() for z in self._locs),
|
||||
False)
|
||||
|
||||
def _getglocpos(self):
|
||||
return reduce(lambda x,y : x + y,
|
||||
(z._getglocpos() for z in self._locs))
|
||||
|
||||
|
||||
def getBegin(self):
|
||||
return min(x.getBegin() for x in self._locs)
|
||||
|
||||
def getEnd(self):
|
||||
return max(x.getEnd() for x in self._locs)
|
||||
|
||||
def shift(self,s):
|
||||
assert (self.getBegin() + s) > 0,"shift too large (%d)" % s
|
||||
if s == 0:
|
||||
return self
|
||||
return CompositeLocation(x.shift(s) for x in self._locs)
|
||||
|
||||
|
||||
begin = property(getBegin,None,None,"beginning position of the location")
|
||||
end = property(getEnd,None,None,"ending position of the location")
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return "join(%s)" % ','.join([str(x)
|
||||
for x in self._locs])
|
||||
|
||||
class ComplementLocation(Location):
|
||||
"""
|
||||
"""
|
||||
|
||||
_comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
|
||||
'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
|
||||
's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
|
||||
'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
|
||||
'-': '-'}
|
||||
|
||||
def __init__(self,location):
|
||||
self._loc = location
|
||||
|
||||
def _extractSequence(self,sequence):
|
||||
seq = self._loc._extractSequence(sequence)
|
||||
seq = ''.join([ComplementLocation._comp.get(x.lower(),'n') for x in seq[::-1]])
|
||||
return seq
|
||||
|
||||
def _extractQuality(self,sequence):
|
||||
return self._loc._extractQuality(sequence)[::-1]   # reverse the scores of the extracted region
|
||||
|
||||
def isDirect(self):
|
||||
return False
|
||||
|
||||
def isSimple(self):
|
||||
return self._loc.isSimple()
|
||||
|
||||
def isFullLength(self):
|
||||
return self._loc.isFullLength()
|
||||
|
||||
def simplify(self):
|
||||
if isinstance(self._loc, ComplementLocation):
|
||||
return self._loc._loc.simplify()
|
||||
else:
|
||||
return self
|
||||
|
||||
def needNucleic(self):
|
||||
return True
|
||||
|
||||
def __str__(self):
|
||||
return "complement(%s)" % self._loc
|
||||
|
||||
def shift(self,s):
|
||||
assert (self.getBegin() + s) > 0,"shift too large (%d)" % s
|
||||
if s == 0:
|
||||
return self
|
||||
return ComplementLocation(self._loc.shift(s))
|
||||
|
||||
def _getglocpos(self):
|
||||
return self._loc._getglocpos()
|
||||
|
||||
def getBegin(self):
|
||||
return self._loc.getBegin()
|
||||
|
||||
def getEnd(self):
|
||||
return self._loc.getEnd()
|
||||
|
||||
def getFivePrime(self):
|
||||
return self.getEnd()
|
||||
|
||||
def getThreePrime(self):
|
||||
return self.getBegin()
|
||||
|
||||
|
||||
begin = property(getBegin,None,None,"beginning position of the location")
|
||||
end = property(getEnd,None,None,"ending position of the location")
|
||||
fivePrime=property(getFivePrime,None,None,"5' position of the location")
|
||||
threePrime=property(getThreePrime,None,None,"3' position of the location")
|
||||
|
||||
|
||||
#
|
||||
# Internal functions used for location parsing
|
||||
#
|
||||
|
||||
def __sublocationIterator(text):
|
||||
sl = []
|
||||
plevel=0
|
||||
for c in text:
|
||||
assert plevel>=0,"Misformatted location : %s" % text
|
||||
if c == '(':
|
||||
plevel+=1
|
||||
sl.append(c)
|
||||
elif c==')':
|
||||
plevel-=1
|
||||
sl.append(c)
|
||||
elif c==',' and plevel == 0:
|
||||
assert sl,"Misformatted location : %s" % text
|
||||
yield ''.join(sl)
|
||||
sl=[]
|
||||
else:
|
||||
sl.append(c)
|
||||
assert sl and plevel==0,"Misformatted location : %s" % text
|
||||
yield ''.join(sl)
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Internal functions used for location parsing
|
||||
#
|
||||
|
||||
__simplelocparser = re.compile('(?P<before><?)(?P<from>[0-9]+)(\.\.(?P<after>>?)(?P<to>[0-9]+))?')
|
||||
|
||||
|
||||
def __locationParser(text):
|
||||
text=text.strip()
|
||||
if text[0:5]=='join(':
|
||||
assert text[-1]==')',"Misformatted location : %s" % text
|
||||
return CompositeLocation(__locationParser(sl) for sl in __sublocationIterator(text[5:-1]))
|
||||
elif text[0:11]=='complement(':
|
||||
assert text[-1]==')',"Misformatted location : %s" % text
|
||||
subl = tuple(__locationParser(sl) for sl in __sublocationIterator(text[11:-1]))
|
||||
if len(subl)>1:
|
||||
subl = CompositeLocation(subl)
|
||||
else:
|
||||
subl = subl[0]
|
||||
return ComplementLocation(subl)
|
||||
else:
|
||||
data = __simplelocparser.match(text)
|
||||
assert data is not None,"Misformatted location : %s" % text
|
||||
data = data.groupdict()
|
||||
if not data['to'] :
|
||||
sl = PointLocation(int(data['from']))
|
||||
else:
|
||||
sl = SimpleLocation(int(data['from']),int(data['to']))
|
||||
sl.before=data['before']=='<'
|
||||
sl.after=data['after']=='>'
|
||||
return sl
|
||||
|
||||
def locationGenerator(locstring):
|
||||
'''
|
||||
Parse a location string as present in a GenBank or EMBL file.
|
||||
|
||||
@param locstring: string description of the location in embl/gb format
|
||||
@type locstring: str
|
||||
|
||||
@return: a Location instance
|
||||
@rtype: C{Location} subclass instance
|
||||
'''
|
||||
return __locationParser(locstring)
|
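# Illustrative sketch (coordinates invented): the parser understands the
# location syntax of EMBL/GenBank feature tables, including the join() and
# complement() operators:
#
#   >>> str(locationGenerator('join(12..78,134..202)'))
#   'join(12..78,134..202)'
#   >>> locationGenerator('complement(34..126)').isDirect()
#   False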
||||
|
||||
|
||||
_matchExternalRef = re.compile('[A-Za-z0-9_|]+(\.[0-9]+)?(?=:)')
|
||||
|
||||
def extractExternalRefs(locstring):
|
||||
'''
|
||||
When a location describes external references (e.g. D28156.1:1..>1292),
|
||||
separate the external reference part of the location from the location
|
||||
itself.
|
||||
|
||||
@param locstring: text representation of the location.
|
||||
@type locstring: str
|
||||
|
||||
@return: a tuple with a set of strings describing the accession numbers
|
||||
of the referred sequences and a C{Location} instance.
|
||||
|
||||
@rtype: tuple(set,Location)
|
||||
'''
|
||||
m = set(x.group() for x in _matchExternalRef.finditer(locstring))
|
||||
clean = re.compile(':|'.join([re.escape(x) for x in m])+':')
|
||||
cloc = locationGenerator(clean.sub('',locstring))
|
||||
|
||||
return m,cloc
|
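# Hedged example reusing the accession number quoted in the docstring:
#
#   >>> refs,loc = extractExternalRefs('join(D28156.1:1..>1292,35..680)')
#   >>> refs
#   set(['D28156.1'])
#   >>> str(loc)
#   'join(1..>1292,35..680)'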
||||
|
||||
|
||||
|
||||
|
||||
|
BIN
obitools/location/__init__.pyc
Normal file
BIN
obitools/location/__init__.pyc
Normal file
Binary file not shown.
177
obitools/location/feature.py
Normal file
177
obitools/location/feature.py
Normal file
@ -0,0 +1,177 @@
|
||||
from obitools.location import Location,locationGenerator
|
||||
import logging
|
||||
import re
|
||||
|
||||
|
||||
|
||||
|
||||
_featureMatcher = re.compile('^(FT| ) [^ ].+\n((FT| ) .+\n)+',re.M)
|
||||
_featureCleaner = re.compile('^FT',re.M)
|
||||
|
||||
|
||||
def textFeatureIterator(fttable):
|
||||
'''
|
||||
Iterate through a textual description of a feature table in GenBank
|
||||
or EMBL format. Return at each step a text representation of each individual
|
||||
feature composing the table.
|
||||
|
||||
@param fttable: a string corresponding to the feature table of a genbank
|
||||
or an embl entry
|
||||
|
||||
@type fttable: C{str}
|
||||
|
||||
@return: an iterator on str
|
||||
@rtype: iterator
|
||||
|
||||
@see: L{ftParser}
|
||||
'''
|
||||
for m in _featureMatcher.finditer(fttable):
|
||||
t = m.group()
|
||||
t = _featureCleaner.sub(' ',t)
|
||||
yield t
|
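# Hedged usage sketch (feature_table_text is a hypothetical string holding
# the feature table of a GenBank or EMBL entry; no attempt is made here to
# reproduce the exact flatfile columns):
#
#   for ft in textFeatureIterator(feature_table_text):
#       # each 'ft' is the raw multi-line text of one feature, with any
#       # leading 'FT' line codes replaced by spaces
#       print ft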
||||
|
||||
_qualifierMatcher = re.compile('(?<=^ {21}/).+(\n {21}[^/].+)*',re.M)
|
||||
_qualifierCleanner= re.compile("^ +",re.M)
|
||||
|
||||
def qualifierIterator(qualifiers):
|
||||
'''
|
||||
Parse a textual description of a feature in embl or genbank format
|
||||
as returned by the textFeatureIterator iterator and iterate through
|
||||
the (key, value) qualifiers defining this feature.
|
||||
|
||||
@param qualifiers: substring containing qualifiers
|
||||
@type qualifiers: str
|
||||
|
||||
@return: an iterator on tuple (key,value), where keys are C{str}
|
||||
@rtype: iterator
|
||||
'''
|
||||
for m in _qualifierMatcher.finditer(qualifiers):
|
||||
t = m.group()
|
||||
t = _qualifierCleanner.sub('',t)
|
||||
t = t.split('=',1)
|
||||
if len(t)==1:
|
||||
t = (t[0],None)
|
||||
else:
|
||||
if t[0]=='translation':
|
||||
value = t[1].replace('\n','')
|
||||
else:
|
||||
value = t[1].replace('\n',' ')
|
||||
try:
|
||||
value = eval(value)
|
||||
except:
|
||||
pass
|
||||
t = (t[0],value)
|
||||
yield t
|
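# Hedged sketch (qualifier_block is a hypothetical string): for a feature
# carrying a qualifier line '/gene="xyz"', the iterator would yield
# ('gene', 'xyz'); values that eval() cannot interpret are kept as raw
# strings, and flag-like qualifiers without '=' yield (key, None):
#
#   for key,value in qualifierIterator(qualifier_block):
#       print key,'->',repr(value)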
||||
|
||||
|
||||
_ftmatcher = re.compile('(?<=^ {5})\S+')
|
||||
_locmatcher= re.compile('(?<=^.{21})[^/]+',re.DOTALL)
|
||||
_cleanloc = re.compile('[\s\n]+')
|
||||
_qualifiersMatcher = re.compile('^ +/.+',re.M+re.DOTALL)
|
||||
|
||||
def ftParser(feature):
|
||||
fttype = _ftmatcher.search(feature).group()
|
||||
location=_locmatcher.search(feature).group()
|
||||
location=_cleanloc.sub('',location)
|
||||
qualifiers=_qualifiersMatcher.search(feature)
|
||||
if qualifiers is not None:
|
||||
qualifiers=qualifiers.group()
|
||||
else:
|
||||
qualifiers=""
|
||||
logging.debug("Qualifiers regex not matching on \n=====\n%s\n========" % feature)
|
||||
|
||||
return fttype,location,qualifiers
|
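# Hedged sketch (values invented, raw_feature_text is hypothetical):
# ftParser splits the raw text of one feature into its three components:
#
#   fttype,location,qualifiers = ftParser(raw_feature_text)
#   # fttype     e.g. 'CDS'
#   # location   e.g. 'complement(join(1..120,180..260))', whitespace removed
#   # qualifiers the raw '/key=value' block, or '' when none is present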
||||
|
||||
|
||||
class Feature(dict,Location):
|
||||
def __init__(self,type,location):
|
||||
self._fttype=type
|
||||
self._loc=location
|
||||
|
||||
def getFttype(self):
|
||||
return self._fttype
|
||||
|
||||
|
||||
def extractSequence(self,sequence,withQualifier=False):
|
||||
seq = self._loc.extractSequence(sequence)
|
||||
if withQualifier:
|
||||
seq.getInfo().update(self)
|
||||
return seq
|
||||
|
||||
def isDirect(self):
|
||||
return self._loc.isDirect()
|
||||
|
||||
def isSimple(self):
|
||||
return self._loc.isSimple()
|
||||
|
||||
def isFullLength(self):
|
||||
return self._loc.isFullLength()
|
||||
|
||||
def simplify(self):
|
||||
f = Feature(self._fttype,self._loc.simplify())
|
||||
f.update(self)
|
||||
return f
|
||||
|
||||
def locStr(self):
|
||||
return str(self._loc)
|
||||
|
||||
def needNucleic(self):
|
||||
return self._loc.needNucleic()
|
||||
|
||||
def __str__(self):
|
||||
return repr(self)
|
||||
|
||||
def __repr__(self):
|
||||
return str((self.ftType,str(self._loc),dict.__repr__(self)))
|
||||
|
||||
def __cmp__(self,y):
|
||||
return self._loc.__cmp__(y)
|
||||
|
||||
def _getglocpos(self):
|
||||
return self._loc._getglocpos()
|
||||
|
||||
ftType = property(getFttype, None, None, "Feature type name")
|
||||
|
||||
def shift(self,s):
|
||||
assert (self.getBegin() + s) > 0,"shift too large (%d)" % s
|
||||
if s == 0:
|
||||
return self
|
||||
f = Feature(self._fttype,self._loc.shift(s))
|
||||
f.update(self)
|
||||
return f
|
||||
|
||||
|
||||
def getBegin(self):
|
||||
return self._loc.getBegin()
|
||||
|
||||
def getEnd(self):
|
||||
return self._loc.getEnd()
|
||||
|
||||
begin = property(getBegin,None,None,"beginning position of the location")
|
||||
end = property(getEnd,None,None,"ending position of the location")
|
||||
|
||||
|
||||
def featureFactory(featureDescription):
|
||||
fttype,location,qualifiers = ftParser(featureDescription)
|
||||
location = locationGenerator(location)
|
||||
feature = Feature(fttype,location)
|
||||
feature.raw = featureDescription
|
||||
|
||||
for k,v in qualifierIterator(qualifiers):
|
||||
feature.setdefault(k,[]).append(v)
|
||||
|
||||
return feature
|
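# Hedged usage sketch (raw_feature_text is hypothetical): featureFactory
# chains ftParser, locationGenerator and qualifierIterator to build a
# Feature, i.e. a dict of qualifier value lists that is also a Location:
#
#   feature = featureFactory(raw_feature_text)
#   print feature.ftType, feature.begin, feature.end
#   print feature.get('gene')   # list of values of the 'gene' qualifier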
||||
|
||||
def featureIterator(featureTable,skipError=False):
|
||||
for tft in textFeatureIterator(featureTable):
|
||||
try:
|
||||
feature = featureFactory(tft)
|
||||
except AssertionError,e:
|
||||
logging.debug("Parsing error on feature :\n===============\n%s\n===============" % tft)
|
||||
if not skipError:
|
||||
raise e
|
||||
logging.debug("\t===> Error skipped")
|
||||
continue
|
||||
|
||||
yield feature
|
||||
|
265
obitools/metabarcoding/__init__.py
Normal file
265
obitools/metabarcoding/__init__.py
Normal file
@ -0,0 +1,265 @@
|
||||
from obitools.ecopcr.options import addTaxonomyFilterOptions,\
|
||||
loadTaxonomyDatabase
|
||||
from obitools.graph import UndirectedGraph
|
||||
from obitools.align import lenlcs,isLCSReachable
|
||||
from obitools.graph.algorithms.component import componentIterator
|
||||
from obitools.utils.bioseq import uniqSequence
|
||||
from obitools.utils import progressBar
|
||||
import math
|
||||
import sys
|
||||
from obitools.graph.rootedtree import RootedTree
|
||||
|
||||
def average(x):
|
||||
x=list(x)
|
||||
s = sum(i*j for (i,j) in x)
|
||||
n = sum(i[1] for i in x)
|
||||
return (float(s)/float(n),n)
|
||||
|
||||
def minimum(x):
|
||||
x=list(x)
|
||||
m = min(i[0] for i in x)
|
||||
n = sum(i[1] for i in x)
|
||||
return (float(m),n)
|
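# Both helpers aggregate (value,weight) pairs and return the total weight
# alongside the aggregate. A worked example:
#
#   >>> average([(2.0,3),(4.0,1)])   # (2*3 + 4*1) / (3+1)
#   (2.5, 4)
#   >>> minimum([(2.0,3),(4.0,1)])
#   (2.0, 4)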
||||
|
||||
def ecoPCRReader(entries,options):
|
||||
|
||||
taxonomy = loadTaxonomyDatabase(options)
|
||||
|
||||
norankid =options.taxonomy.findRankByName('no rank')
|
||||
speciesid=options.taxonomy.findRankByName('species')
|
||||
genusid =options.taxonomy.findRankByName('genus')
|
||||
familyid =options.taxonomy.findRankByName('family')
|
||||
|
||||
minrankseq = set([speciesid,genusid,familyid])
|
||||
|
||||
usedrankid = {}
|
||||
|
||||
ingroup = []
|
||||
outgroup= []
|
||||
|
||||
for s in entries:
|
||||
if 'taxid' in s :
|
||||
taxid = s['taxid']
|
||||
if taxid in taxonomy:
|
||||
allrank = set()
|
||||
for p in options.taxonomy.parentalTreeIterator(taxid):
|
||||
if p[1]!=norankid:
|
||||
allrank.add(p[1])
|
||||
if len(minrankseq & allrank) == 3:
|
||||
for r in allrank:
|
||||
usedrankid[r]=usedrankid.get(r,0) + 1
|
||||
|
||||
if taxonomy.isAncestor(options.ingroup,taxid):
|
||||
ingroup.append(s)
|
||||
else:
|
||||
outgroup.append(s)
|
||||
|
||||
keptrank = set(r for r in usedrankid
|
||||
if float(usedrankid[r])/float(len(ingroup)) > options.rankthresold)
|
||||
|
||||
return { 'ingroup' : ingroup,
|
||||
'outgroup': outgroup,
|
||||
'ranks' : keptrank
|
||||
}
|
||||
|
||||
def buildSimilarityGraph(dbseq,ranks,taxonomy,dcmax=5):
|
||||
|
||||
ldbseq = len(dbseq)
|
||||
pos = 1
|
||||
digit = int(math.ceil(math.log10(ldbseq)))
|
||||
header = "Alignment : %%0%dd x %%0%dd -> %%0%dd " % (digit,digit,digit)
|
||||
aligncount = ldbseq*(ldbseq+1)/2
|
||||
edgecount = 0
|
||||
print >>sys.stderr
|
||||
|
||||
progressBar(1,aligncount,True,"Alignment : %s x %s -> %s " % ('-'*digit,'-'*digit, '0'*digit))
|
||||
|
||||
|
||||
sim = UndirectedGraph()
|
||||
|
||||
i=0
|
||||
for s in dbseq:
|
||||
taxid = s['taxid']
|
||||
|
||||
rtaxon = dict((rid,taxonomy.getTaxonAtRank(taxid,rid))
|
||||
for rid in ranks)
|
||||
|
||||
sim.addNode(i, seq=s,taxid=taxid,rtaxon=rtaxon)
|
||||
|
||||
i+=1
|
||||
|
||||
# aligner = LCS()
|
||||
|
||||
for is1 in xrange(ldbseq):
|
||||
s1 = dbseq[is1]
|
||||
ls1= len(s1)
|
||||
# aligner.seqA=s1
|
||||
|
||||
for is2 in xrange(is1+1,ldbseq):
|
||||
|
||||
s2=dbseq[is2]
|
||||
ls2=len(s2)
|
||||
|
||||
lm = max(ls1,ls2)
|
||||
lcsmin = lm - dcmax
|
||||
|
||||
if isLCSReachable(s1,s2,lcsmin):
|
||||
llcs,lali=lenlcs(s1,s2)
|
||||
ds1s2 = lali - llcs
|
||||
|
||||
if ds1s2 <= dcmax:
|
||||
sim.addEdge(node1=is1, node2=is2,ds1s2=ds1s2,label=ds1s2)
|
||||
edgecount+=1
|
||||
|
||||
progressBar(pos,aligncount,head=header % (is1,is2,edgecount))
|
||||
pos+=(ldbseq-is1-1)
|
||||
|
||||
return sim
|
||||
|
||||
def buildTsr(component):
|
||||
'''
|
||||
Build for each considered taxonomic rank the list of taxa
|
||||
present in the connected component
|
||||
|
||||
:param component: the analyzed connected component
|
||||
:type component: :py:class:`UndirectedGraph`
|
||||
|
||||
:return: a dictionary indexed by rankid containing a `dict` indexed by taxid and containing the count of sequences for this taxid
|
||||
:rtype: `dict` indexed by `int` containing `dict` indexed by `int` and containing `int`
|
||||
|
||||
'''
|
||||
taxalist = {}
|
||||
for n in component:
|
||||
for r in n['rtaxon']:
|
||||
rtaxid = n['rtaxon'][r]
|
||||
if rtaxid is not None:
|
||||
ts = taxalist.get(r,{})
|
||||
ts[rtaxid]=ts.get(rtaxid,0)+1
|
||||
taxalist[r]=ts
|
||||
|
||||
return taxalist
|
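# Shape sketch (rank ids and taxids invented): for a component whose
# sequences span two species of a single genus, buildTsr would return
# something like:
#
#   { species_rank_id : {101: 3, 102: 2},
#     genus_rank_id   : {100: 5} }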
||||
|
||||
def edgeDistSelector(dcmax):
|
||||
def predicate(e):
|
||||
return e['ds1s2'] <= dcmax
|
||||
return predicate
|
||||
|
||||
def distanceOfConfusion(simgraph,dcmax=5,aggregate=average):
|
||||
|
||||
alltaxa = set()
|
||||
|
||||
for n in simgraph:
|
||||
alltaxa|=set(n['rtaxon'].values())
|
||||
|
||||
taxacount = len(alltaxa)
|
||||
|
||||
result = {}
|
||||
|
||||
pos = [1]
|
||||
header = "Component : %-5d Identified : %-8d "
|
||||
progressBar(1,taxacount,True,header % (0,0))
|
||||
|
||||
def _idc(cc,dcmax):
|
||||
composante=[]
|
||||
for x in cc:
|
||||
composante.extend(simgraph.subgraph(c)
|
||||
for c in componentIterator(x,
|
||||
edgePredicat=edgeDistSelector(dcmax)))
|
||||
|
||||
good = set()
|
||||
bad = {}
|
||||
|
||||
complexe = []
|
||||
|
||||
for c in composante:
|
||||
tsr = buildTsr(c)
|
||||
newbad=False
|
||||
for r in tsr:
|
||||
if len(tsr[r]) == 1:
|
||||
taxid = tsr[r].keys()[0]
|
||||
good.add((taxid,tsr[r][taxid]))
|
||||
else:
|
||||
newbad=True
|
||||
for taxid in tsr[r]:
|
||||
bad[taxid]=bad.get(taxid,0)+tsr[r][taxid]
|
||||
if newbad:
|
||||
complexe.append(c)
|
||||
|
||||
# good = good - bad
|
||||
|
||||
for taxid,weight in good:
|
||||
if taxid not in result:
|
||||
result[taxid]=[]
|
||||
result[taxid].append((dcmax+1,weight))
|
||||
|
||||
|
||||
progressBar(pos[0],taxacount,False,header % (len(composante),pos[0]))
|
||||
pos[0]=len(result)
|
||||
|
||||
if dcmax > 0:
|
||||
dcmax-=1
|
||||
_idc(complexe,dcmax)
|
||||
|
||||
else:
|
||||
for taxid in bad:
|
||||
if taxid not in result:
|
||||
result[taxid]=[]
|
||||
result[taxid].append((0,bad[taxid]))
|
||||
|
||||
progressBar(pos[0],taxacount,False,header % (len(composante),pos[0]))
|
||||
pos[0]=len(result)
|
||||
|
||||
_idc([simgraph],dcmax)
|
||||
|
||||
for taxid in result:
|
||||
result[taxid]=aggregate(result[taxid])
|
||||
return result
|
||||
|
||||
def propagateDc(tree,node=None,aggregate=min):
|
||||
if node is None:
|
||||
node = tree.getRoots()[0]
|
||||
dca=aggregate(n['dc'] for n in node.leavesIterator())
|
||||
node['dc']=dca
|
||||
for n in node:
|
||||
propagateDc(tree, n, aggregate)
|
||||
|
||||
def confusionTree(distances,ranks,taxonomy,aggregate=min,bsrank='species',dcmax=1):
|
||||
|
||||
def Bs(node,rank,dcmax):
|
||||
n = len(node)
|
||||
if n:
|
||||
g = [int(x['dc']>=dcmax) for x in node.subgraphIterator() if x['rank']==bsrank]
|
||||
n = len(g)
|
||||
g = sum(g)
|
||||
bs= float(g)/float(n)
|
||||
node['bs']=bs
|
||||
node['bs_label']="%3.2f (%d)" % (bs,n)
|
||||
|
||||
for n in node:
|
||||
Bs(n,rank,dcmax)
|
||||
|
||||
tree = RootedTree()
|
||||
ranks = set(ranks)
|
||||
tset = set(distances)
|
||||
|
||||
for taxon in distances:
|
||||
tree.addNode(taxon, rank=taxonomy.getRank(taxon),
|
||||
name=taxonomy.getScientificName(taxon),
|
||||
dc=float(distances[taxon][0]),
|
||||
n=distances[taxon][1],
|
||||
dc_label="%4.2f (%d)" % (float(distances[taxon][0]),distances[taxon][1])
|
||||
)
|
||||
|
||||
for taxon in distances:
|
||||
piter = taxonomy.parentalTreeIterator(taxon)
|
||||
taxon = piter.next()
|
||||
for parent in piter:
|
||||
if taxon[0] in tset and parent[0] in distances:
|
||||
tset.remove(taxon[0])
|
||||
tree.addEdge(parent[0], taxon[0])
|
||||
taxon=parent
|
||||
|
||||
root = tree.getRoots()[0]
|
||||
Bs(root,bsrank,dcmax)
|
||||
|
||||
return tree
|
34
obitools/metabarcoding/options.py
Normal file
34
obitools/metabarcoding/options.py
Normal file
@ -0,0 +1,34 @@
|
||||
'''
|
||||
Created on 30 Oct. 2011
|
||||
|
||||
@author: coissac
|
||||
'''
|
||||
|
||||
from obitools.ecopcr.options import addTaxonomyDBOptions
|
||||
|
||||
|
||||
def addMetabarcodingOption(optionManager):
|
||||
|
||||
addTaxonomyDBOptions(optionManager)
|
||||
|
||||
optionManager.add_option('--dcmax',
|
||||
action="store", dest="dc",
|
||||
metavar="###",
|
||||
type="int",
|
||||
default=0,
|
||||
help="Maximum confusion distance considered")
|
||||
|
||||
optionManager.add_option('--ingroup',
|
||||
action="store", dest="ingroup",
|
||||
metavar="###",
|
||||
type="int",
|
||||
default=1,
|
||||
help="ncbi taxid delimitation the in group")
|
||||
|
||||
optionManager.add_option('--rank-thresold',
|
||||
action="store", dest="rankthresold",
|
||||
metavar="#.##",
|
||||
type="float",
|
||||
default=0.5,
|
||||
help="minimum fraction of the ingroup sequences "
|
||||
"for concidering the rank")
|
28
obitools/obischemas/__init__.py
Normal file
28
obitools/obischemas/__init__.py
Normal file
@ -0,0 +1,28 @@
|
||||
from obitools.obischemas import kb
|
||||
__connection__ = None
|
||||
|
||||
def initConnection(options):
|
||||
global __connection__
|
||||
param = {}
|
||||
if hasattr(options, "dbname") and options.dbname is not None:
|
||||
param["database"]=options.dbname
|
||||
if hasattr(options, "dbhost") and options.dbhost is not None:
|
||||
param["host"]=options.dbhost
|
||||
if hasattr(options, "dbuser") and options.dbuser is not None:
|
||||
param["username"]=options.dbuser
|
||||
if hasattr(options, "dbpassword") and options.dbpassword is not None:
|
||||
param["password"]=options.dbpassword
|
||||
|
||||
__connection__=kb.getConnection(**param)
|
||||
__connection__.autocommit=options.autocommit
|
||||
|
||||
def getConnection(options=None):
|
||||
global __connection__
|
||||
|
||||
if options is not None:
|
||||
initConnection(options)
|
||||
|
||||
assert __connection__ is not None,"database connection is not initialized"
|
||||
|
||||
return __connection__
|
||||
|
55
obitools/obischemas/kb/__init__.py
Normal file
55
obitools/obischemas/kb/__init__.py
Normal file
@ -0,0 +1,55 @@
|
||||
"""
|
||||
The kb package is devoted to managing access to PostgreSQL databases from Python
|
||||
scripts.
|
||||
"""
|
||||
|
||||
|
||||
class Connection(object):
|
||||
|
||||
def __init__(self):
|
||||
raise RuntimeError('pyROM.KB.Connection is an abstract class')
|
||||
|
||||
def cursor(self):
|
||||
raise RuntimeError('pyROM.KB.Connection.cursor is an abstract function')
|
||||
|
||||
def commit(self):
|
||||
raise RuntimeError('pyROM.KB.Connection.commit is an abstract function')
|
||||
|
||||
def rollback(self):
|
||||
raise RuntimeError('pyROM.KB.Connection.rollback is an abstract function')
|
||||
|
||||
def __call__(self,query):
|
||||
return self.cursor().execute(query)
|
||||
|
||||
|
||||
class Cursor(object):
|
||||
|
||||
def __init__(self,db):
|
||||
raise RuntimeError('pyROM.KB.Cursor is an abstract class')
|
||||
|
||||
def execute(self,query):
|
||||
raise RuntimeError('pyROM.KB.Cursor.execute is an abstract function')
|
||||
|
||||
__call__=execute
|
||||
|
||||
|
||||
_current_connection = None # Static variable used to store connection to KB
|
||||
|
||||
def getConnection(*args,**kargs):
|
||||
"""
|
||||
return a connection to the database.
|
||||
When called from the database backend no arguments are needed.
|
||||
Argument-less calls return the same cached connection; passing parameters opens a new one.
|
||||
"""
|
||||
global _current_connection
|
||||
|
||||
if _current_connection is None or args or kargs:
|
||||
try:
|
||||
from obischemas.kb import backend
|
||||
_current_connection = backend.Connection()
|
||||
except ImportError:
|
||||
from obischemas.kb import extern
|
||||
_current_connection = extern.Connection(*args,**kargs)
|
||||
return _current_connection
|
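# Hedged usage sketch (connection parameters are placeholders): the first
# call, or any call with explicit arguments, opens a connection; later
# argument-less calls reuse the cached one:
#
#   db = getConnection(database='obischema', host='localhost')
#   rows = db('select 1')   # Connection.__call__ executes the query
#   db2 = getConnection()   # same cached connection object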
||||
|
||||
|
78
obitools/obischemas/kb/extern.py
Normal file
78
obitools/obischemas/kb/extern.py
Normal file
@ -0,0 +1,78 @@
|
||||
"""
|
||||
Module : KB.extern
|
||||
Author : Eric Coissac
|
||||
Date : 03/05/2004
|
||||
|
||||
Module wrapping psycopg interface module to allow connection
|
||||
to a postgresql databases with the same interface from
|
||||
backend and external script.
|
||||
|
||||
This module define a class usable from external script
|
||||
"""
|
||||
|
||||
|
||||
import psycopg2
|
||||
import sys
|
||||
from obischemas import kb
|
||||
|
||||
class Connection(kb.Connection):
|
||||
|
||||
def __init__(self,*connectParam,**kconnectParam):
|
||||
if connectParam:
|
||||
self.connectParam={'dsn':connectParam}
|
||||
else:
|
||||
self.connectParam=kconnectParam
|
||||
print self.connectParam
|
||||
self.db = psycopg2.connect(**(self.connectParam))
|
||||
|
||||
def restart(self):
|
||||
ok=1
|
||||
while (ok and ok < 1000):
|
||||
try:
|
||||
self.db = psycopg2.connect(**self.connectParam)
|
||||
except:
|
||||
ok+=1
|
||||
else:
|
||||
ok=0
|
||||
|
||||
|
||||
def cursor(self):
|
||||
curs = Cursor(self.db)
|
||||
if hasattr(self,'autocommit') and self.autocommit:
|
||||
curs.autocommit = self.autocommit
|
||||
return curs
|
||||
|
||||
def commit(self):
|
||||
self.db.commit()
|
||||
|
||||
def rollback(self):
|
||||
if hasattr(self,'db'):
|
||||
self.db.rollback()
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self,'db'):
|
||||
self.rollback()
|
||||
|
||||
class Cursor(kb.Cursor):
|
||||
|
||||
def __init__(self,db):
|
||||
self.db = db
|
||||
self.curs = db.cursor()
|
||||
|
||||
def execute(self,query):
|
||||
try:
|
||||
self.curs.execute(query)
|
||||
if hasattr(self,'autocommit') and self.autocommit:
|
||||
self.db.commit()
|
||||
except psycopg2.ProgrammingError,e:
|
||||
print >>sys.stderr,"===> %s" % query
|
||||
raise e
|
||||
except psycopg2.IntegrityError,e:
|
||||
print >>sys.stderr,"---> %s" % query
|
||||
raise e
|
||||
try:
|
||||
label = [x[0] for x in self.curs.description]
|
||||
return [dict(map(None,label,y))
|
||||
for y in self.curs.fetchall()]
|
||||
except TypeError:
|
||||
return []
|
31
obitools/obischemas/options.py
Normal file
31
obitools/obischemas/options.py
Normal file
@ -0,0 +1,31 @@
|
||||
def addConnectionOptions(optionManager):
|
||||
|
||||
optionManager.add_option('-d','--dbname',
|
||||
action="store", dest="dbname",
|
||||
metavar="<DB NAME>",
|
||||
type="string",
|
||||
help="OBISchema database name containing"
|
||||
"taxonomical data")
|
||||
|
||||
optionManager.add_option('-H','--host',
|
||||
action="store", dest="dbhost",
|
||||
metavar="<DB HOST>",
|
||||
type="string",
|
||||
help="host hosting OBISchema database")
|
||||
|
||||
optionManager.add_option('-U','--user',
|
||||
action="store", dest="dbuser",
|
||||
metavar="<DB USER>",
|
||||
type="string",
|
||||
help="user for OBISchema database connection")
|
||||
|
||||
optionManager.add_option('-W','--password',
|
||||
action="store", dest="dbpassword",
|
||||
metavar="<DB PASSWORD>",
|
||||
type="string",
|
||||
help="password for OBISchema database connection")
|
||||
|
||||
optionManager.add_option('-A','--autocommit',
|
||||
action="store_true",dest="autocommit",
|
||||
default=False,
|
||||
help="add commit action after each query")
|
0
obitools/obo/__init__.py
Normal file
0
obitools/obo/__init__.py
Normal file
0
obitools/obo/go/__init__.py
Normal file
0
obitools/obo/go/__init__.py
Normal file
53
obitools/obo/go/parser.py
Normal file
53
obitools/obo/go/parser.py
Normal file
@ -0,0 +1,53 @@
|
||||
from obitools.obo.parser import OBOTerm
|
||||
from obitools.obo.parser import OBOEntry
|
||||
from obitools.obo.parser import stanzaIterator
|
||||
from logging import debug
|
||||
|
||||
class GOEntry(OBOEntry):
|
||||
'''
|
||||
An entry of a GeneOntology .obo file. It can be a header (without a stanza name) or
|
||||
a stanza (with a stanza name between brackets). It inherits from the class dict.
|
||||
'''
|
||||
|
||||
|
||||
class GOTerm(OBOTerm):
|
||||
|
||||
'''
|
||||
A stanza named 'Term'. It inherits from the class OBOTerm.
|
||||
'''
|
||||
|
||||
def __init__(self,stanza):
|
||||
|
||||
## use of the OBOEntry constructor.
|
||||
OBOTerm.__init__(self, stanza)
|
||||
|
||||
assert 'namespace' in self and len(self['namespace'])==1, "An OBOTerm must belong to one of the cellular_component, molecular_function or biological_process namespaces"
|
||||
|
||||
|
||||
def GOEntryFactory(stanza):
|
||||
'''
|
||||
Dispatcher of stanza.
|
||||
|
||||
@param stanza: a stanza composed of several lines.
|
||||
@type stanza: text
|
||||
|
||||
@return: an C{OBOTerm} | C{OBOEntry} instance
|
||||
|
||||
@note: The dispatcher treats OBO "Term" stanzas differently
|
||||
from the other stanzas.
|
||||
'''
|
||||
|
||||
stanzaType = OBOEntry.parseStanzaName(stanza)
|
||||
|
||||
if stanzaType=="Term":
|
||||
return GOTerm(stanza)
|
||||
else:
|
||||
return OBOEntry(stanza)
|
||||
|
||||
|
||||
def GOEntryIterator(file):
|
||||
entries = stanzaIterator(file)
|
||||
for e in entries:
|
||||
debug(e)
|
||||
yield GOEntryFactory(e)
|
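# Hedged usage sketch ('gene_ontology.obo' is a placeholder path,
# handle_term() is hypothetical):
#
#   for entry in GOEntryIterator('gene_ontology.obo'):
#       if isinstance(entry, GOTerm):
#           handle_term(entry)   # one [Term] stanza per GOTerm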
||||
|
707
obitools/obo/parser.py
Normal file
707
obitools/obo/parser.py
Normal file
@ -0,0 +1,707 @@
|
||||
from obitools.utils import skipWhiteLineIterator,multiLineWrapper
|
||||
from obitools.utils import universalOpen
|
||||
from obitools.format.genericparser import genericEntryIteratorGenerator
|
||||
from logging import debug,warning
|
||||
|
||||
import re
|
||||
|
||||
|
||||
#################################################################################
|
||||
## Stanza preparation area ##
|
||||
#################################################################################
|
||||
|
||||
|
||||
class FileFormatError(Exception):
|
||||
'''
|
||||
An error derived from the class Exception.
|
||||
'''
|
||||
pass
|
||||
|
||||
_oboEntryIterator = genericEntryIteratorGenerator(endEntry='^ *$',
|
||||
strip=True)
|
||||
|
||||
def stanzaIterator(inputfile):
|
||||
'''
|
||||
Iterator over stanzas. Stanzas are the basic units of OBO files.
|
||||
|
||||
@param inputfile: a stream of strings from an opened OBO file.
|
||||
@type inputfile: a stream of strings
|
||||
|
||||
@return: a stream of stanza
|
||||
@rtype: a stream of aggregated strings
|
||||
|
||||
@note: The iterator constructs stanzas by aggregating strings from the
|
||||
OBO file.
|
||||
'''
|
||||
inputfile = universalOpen(inputfile)
|
||||
inputfile = multiLineWrapper(inputfile)
|
||||
return _oboEntryIterator(inputfile)
|
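# Hedged sketch ('example.obo' is a placeholder path): each yielded item is
# the raw text of one stanza, the header block first and then each
# bracketed stanza:
#
#   for stanza in stanzaIterator('example.obo'):
#       print stanza.split('\n',1)[0]   # e.g. '[Term]' or '[Typedef]'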
||||
|
||||
|
||||
|
||||
#################################################################################
|
||||
## Trailing Modifiers treatment area ##
|
||||
#################################################################################
|
||||
|
||||
|
||||
class TrailingModifier(dict):
|
||||
'''
|
||||
A class object which inherits from the class dict. Trailing modifiers can be found
|
||||
at the end of TaggedValue objects when they exist.
|
||||
'''
|
||||
|
||||
_match_brace = re.compile('(?<=\ {)[^\]]*(\}) *( !|$)')
|
||||
|
||||
def __init__(self,string):
|
||||
|
||||
## search for trailing modifiers signals
|
||||
trailing_modifiers = TrailingModifier._match_brace.search(string)
|
||||
|
||||
## the trailing modifiers exist
|
||||
if trailing_modifiers:
|
||||
trailing_modifiers=trailing_modifiers.group(0).strip()
|
||||
debug(trailing_modifiers)
|
||||
## creates and feeds the dictionary of trailing modifiers
|
||||
dict.__init__(self,(x.strip().split('=',1) for x in trailing_modifiers.split(',')))
|
||||
|
||||
|
||||
def trailingModifierFactory(string):
|
||||
'''
|
||||
Dispatcher of trailing modifiers.
|
||||
|
||||
@param string: a string from a TaggedValue object with a trailing modifiers signal.
|
||||
@type string: string
|
||||
|
||||
@return: a class object
|
||||
|
||||
@note: The dispatcher is currently very simple. Only one case is treated by the function.
|
||||
The function returns a class object inherited from the class dict if the trailing modifiers
|
||||
exist, None if they don't.
|
||||
'''
|
||||
|
||||
trailing_modifiers = TrailingModifier(string)
|
||||
if not trailing_modifiers:
|
||||
trailing_modifiers=None
|
||||
return trailing_modifiers
|
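# Hedged sketch (modifier text invented): for a tagged value ending in a
# '{qualifier=abnormal, editor=jdoe}' block the factory returns a
# dict-like TrailingModifier built from the 'key=value' pairs, and None
# when the line carries no trailing modifiers.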
||||
|
||||
|
||||
#################################################################################
|
||||
## TaggedValue treatment area ##
|
||||
#################################################################################
|
||||
|
||||
|
||||
class TaggedValue(object):
|
||||
'''
|
||||
A couple 'tag:value' of an OBOEntry.
|
||||
'''
|
||||
|
||||
_match_value = re.compile('(("(\\\\"|[^\"])*")|(\\\\"|[^\"]))*?( !| {|$)')
|
||||
_split_comment = re.compile('^!| !')
|
||||
_match_quotedString = re.compile('(?<=")(\\\\"|[^\"])*(?=")')
|
||||
_match_bracket = re.compile('\[[^\]]*\]')
|
||||
|
||||
def __init__(self,line):
|
||||
'''
|
||||
Constructor of the class TaggedValue.
|
||||
|
||||
@param line: a line of an OBOEntry composed of a tag and a value.
|
||||
@type line: string
|
||||
|
||||
@note: The constructor separates tags from right terms. 'value' is extracted
|
||||
from right terms using a regular expression (value is at the beginning of the
|
||||
string, between quotes or not). Then, 'comment' is extracted from the rest of the
|
||||
string using another regular expression ('comment' is at the end of the string
|
||||
after a '!'. By default, 'comment' is set to None). Finally, 'trailing_modifiers'
|
||||
are extracted from the last string using another regular expression.
|
||||
The tag, the value, the comment and the trailing_modifiers are saved.
|
||||
'''
|
||||
|
||||
debug("tagValueParser : %s" % line)
|
||||
|
||||
## by default :
|
||||
trailing_modifiers = None
|
||||
comment = None
|
||||
|
||||
## the tag is saved. 'right' is composed of the value, the comment and the trailing modifiers
|
||||
tag,right = line.split(':',1)
|
||||
|
||||
## the value is saved
|
||||
value = TaggedValue._match_value.search(right).group(0)
|
||||
debug("Extracted value : %s" % value)
|
||||
|
||||
## if there is a value AND a sign of a comment or trailing modifiers
|
||||
if value and value[-1] in '!{':
|
||||
lvalue = len(value)
|
||||
## whatever it is a comment or trailing modifiers, it is saved into 'extra'
|
||||
extra = right[lvalue-1:].strip()
|
||||
## a comment is extracted
|
||||
extra =TaggedValue._split_comment.split(extra,1)
|
||||
## and saved if it exists
|
||||
if len(extra)==2:
|
||||
comment=extra[1].strip()
|
||||
## trailing modifiers are extracted
|
||||
extra=extra[0]
|
||||
trailing_modifiers = trailingModifierFactory(extra)
|
||||
## the value is cleaned of any comment or trailing modifiers signals
|
||||
value = value[0:-1]
|
||||
|
||||
if tag=='use_term':
|
||||
tag='consider'
|
||||
raise DeprecationWarning,"user_term is a deprecated tag, you should instead use consider"
|
||||
|
||||
## recording zone
|
||||
self.value =value.strip()
|
||||
self.tag = tag
|
||||
self.__doc__=comment
|
||||
self.trailing_modifiers=trailing_modifiers
|
||||
|
||||
def __str__(self):
|
||||
return str(self.value)
|
||||
|
||||
def __repr__(self):
|
||||
return '''"""%s"""''' % str(self)
|
||||
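# Hedged parsing sketch (the line is invented):
#
#   tv = TaggedValue('created_by: jdoe ! imported from a legacy file')
#   # tv.tag     -> 'created_by'
#   # tv.value   -> 'jdoe'
#   # tv.__doc__ -> 'imported from a legacy file'
#   # tv.trailing_modifiers -> None (no '{...}' block on this line)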
|
||||
|
||||
class NameValue(TaggedValue):
|
||||
'''
|
||||
A couple 'name:value' inherited from the class TaggedValue. Used to manage name tags.
|
||||
'''
|
||||
|
||||
def __init__(self,line):
|
||||
|
||||
## no use of the TaggedValue constructor. The NameValue is very simple.
|
||||
tag,right = line.split(':',1)
|
||||
|
||||
## recording zone
|
||||
self.value = right.strip()
|
||||
self.tag = 'name'
|
||||
self.__doc__=None
|
||||
self.trailing_modifiers=None
|
||||
|
||||
|
||||
|
||||
class DefValue(TaggedValue):
|
||||
'''
|
||||
A couple 'def:value' inherited from the class TaggedValue. Used to manage def tags.
|
||||
'''
|
||||
|
||||
def __init__(self,line):
|
||||
'''
|
||||
Constructor of the class DefValue.
|
||||
|
||||
@param line: a line of an OBOEntry composed of a tag named 'def' and a value.
|
||||
@type line: string
|
||||
|
||||
@note: The constructor calls the TaggedValue constructor. A regular expression
|
||||
is used to extract the 'definition' from TaggedValue.value (definition is a not
|
||||
quoted TaggedValue.value). A regular expression is used to extract 'dbxrefs'
|
||||
from the aggedValue.value without the definition (dbxrefs are between brackets
|
||||
and definition can be so). Definition is saved as the new value of the DefValue.
|
||||
dbxrefs are saved.
|
||||
'''
|
||||
|
||||
## use of the TaggedValue constructor
|
||||
TaggedValue.__init__(self, line)
|
||||
|
||||
## definition, which is quoted, is extracted from the standard value of a TaggedValue.
|
||||
definition = TaggedValue._match_quotedString.search(self.value).group(0)
|
||||
|
||||
## the standard value is cleaned of the definition.
|
||||
cleanvalue = self.value.replace(definition,'')
|
||||
cleanvalue = cleanvalue.replace('  ',' ')   # collapse the double space left by the removal
|
||||
|
||||
## dbxrefs are searched into the rest of the standard value.
|
||||
dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0)
|
||||
|
||||
## recording zone
|
||||
self.tag = 'def'
|
||||
## the value of a DefValue is not the standard value but the definition.
|
||||
self.value=definition
|
||||
self.dbxrefs=xrefFactory(dbxrefs)
|
||||
|
||||
|
||||
class SynonymValue(TaggedValue):
|
||||
'''
|
||||
A couple 'synonym:value' inherited from the class TaggedValue. Used to manage
|
||||
synonym tags, exact_synonym tags, broad_synonym tags and narrow_synonym tags.
|
||||
'''
|
||||
|
||||
_match_scope = re.compile('(?<="")[^\[]*(?=\[|$)')
|
||||
|
||||
def __init__(self,line):
|
||||
'''
|
||||
Constructor of the class SynonymValue.
|
||||
|
||||
@param line: a line of an OBOEntry composed of a tag named 'synonym' or
|
||||
'exact_synonym' or 'broad_synonym' or 'narrow_synonym' and a value.
|
||||
@type line: string
|
||||
|
||||
@note: SynonymValue is composed of a tag, a value, a scope, a list of types and
|
||||
dbxrefs.
|
||||
The constructor calls the TaggedValue constructor. A regular expression
|
||||
is used to extract 'definition' from TaggedValue.value (the definition is the
|
||||
quoted part of TaggedValue.value). Definition is saved as the new value of the class
|
||||
SynonymValue.
|
||||
A regular expression is used to extract 'attributes' from the rest of the
|
||||
string. Attributes may contain an optional synonym scope and an optional list
|
||||
of synonym types. The scope is extracted from attributes or set by default to
|
||||
'RELATED'. It is saved as the scope of the class. The types are the rest of the
|
||||
attributes and are saved as the list of types of the class.
|
||||
For deprecated tags 'exact_synonym', 'broad_synonym' and 'narrow_synonym', tag
|
||||
is set to 'synonym' and scope is set respectively to 'EXACT', 'BROAD' and 'NARROW'.
|
||||
A regular expression is used to extract 'dbxrefs' from the TaggedValue.value
|
||||
without the definition (dbxrefs are between brackets and definition can be so).
|
||||
dbxrefs are saved.
|
||||
'''
|
||||
|
||||
## use of the TaggedValue constructor
|
||||
TaggedValue.__init__(self, line)
|
||||
|
||||
## definition, which is quoted, is extracted from the standard value of a TaggedValue.
|
||||
definition = TaggedValue._match_quotedString.search(self.value).group(0)
|
||||
|
||||
## the standard value is cleaned of the definition.
|
||||
cleanvalue = self.value.replace(definition,'')
|
||||
cleanvalue = cleanvalue.replace('  ',' ')   # collapse the double space left by the removal
|
||||
|
||||
## 1) attributes are searched into the rest of the standard value.
|
||||
## 2) then they are stripped.
|
||||
## 3) then they are split on every ' '.
|
||||
## 4) finally they are ordered into a set.
|
||||
attributes = set(SynonymValue._match_scope.search(cleanvalue).group(0).strip().split())
|
||||
|
||||
## the scopes are the intersection of the attributes and a set of specific terms.
|
||||
scopes = attributes & set(['RELATED','EXACT','BROAD','NARROW'])
|
||||
|
||||
## the types are the rest of the attributes.
|
||||
types = attributes - scopes
|
||||
|
||||
## this is a constraint of the OBO format
|
||||
assert len(scopes)< 2,"Only one synonym scope allowed"
|
||||
|
||||
## the scope of the SynonymValue is into scopes or set by default to RELATED
|
||||
if scopes:
|
||||
scope = scopes.pop()
|
||||
else:
|
||||
scope = 'RELATED'
|
||||
|
||||
## Specific rules are defined for the following tags :
|
||||
if self.tag == 'exact_synonym':
|
||||
raise DeprecationWarning,'exact_synonym is a deprecated tag, use the synonym tag instead'
|
||||
self.tag = 'synonym'
|
||||
scope = 'EXACT'
|
||||
|
||||
if self.tag == 'broad_synonym':
|
||||
raise DeprecationWarning,'broad_synonym is a deprecated tag, use the synonym tag instead'
|
||||
self.tag = 'synonym'
|
||||
scope = 'BROAD'
|
||||
|
||||
if self.tag == 'narrow_synonym':
|
||||
raise DeprecationWarning,'narrow_synonym is a deprecated tag, use the synonym tag instead'
|
||||
self.tag = 'synonym'
|
||||
scope = 'NARROW'
|
||||
|
||||
if self.tag == 'systematic_synonym':
|
||||
#raise DeprecationWarning,'systematic_synonym is a deprecated tag, use the synonym tag instead'
|
||||
self.tag = 'synonym'
|
||||
scope = 'SYSTEMATIC'
|
||||
|
||||
## this is our own constraint. deprecated tags are not saved by this parser.
|
||||
assert self.tag =='synonym',"%s synonym type is not managed" % self.tag
|
||||
|
||||
## dbxrefs are searched into the rest of the standard value.
|
||||
dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0)
|
||||
|
||||
## recording zone
|
||||
## the value of a SynonymValue is not the standard value but the definition.
|
||||
self.value = definition
|
||||
self.dbxrefs = xrefFactory(dbxrefs)
|
||||
self.scope = scope
|
||||
self.types = list(types)
|
||||
|
||||
def __eq__(self,b):
|
||||
return ((self.value==b.value) and (self.dbxrefs==b.dbxrefs)
|
||||
and (self.scope==b.scope) and (self.types==b.types)
|
||||
and (self.__doc__==b.__doc__) and (self.tag==b.tag)
|
||||
and (self.trailing_modifiers==b.trailing_modifiers))
|
||||
|
||||
def __hash__(self):
|
||||
return (reduce(lambda x,y:x+y,(hash(z) for z in [self.__doc__,
|
||||
self.value,
|
||||
frozenset(self.dbxrefs),
|
||||
self.scope,
|
||||
frozenset(self.types),
|
||||
self.tag,
|
||||
self.trailing_modifiers]),0)) % (2**31)
|
||||
|
||||
|
||||
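## Illustrative, self-contained sketch (not part of the module): the parsing
## steps documented above, applied to a typical OBO synonym value. The inline
## regular expressions are stand-ins for TaggedValue._match_quotedString,
## SynonymValue._match_scope and TaggedValue._match_bracket, which are
## defined elsewhere in this file.
def _demo_synonym_parsing():
    import re
    value = ' "cell killing" EXACT [GO:0001906]'        # value part of a synonym line
    definition = re.search(r'"[^"]*"', value).group(0)  # -> '"cell killing"'
    rest = value.replace(definition, '')
    dbxrefs = re.search(r'\[[^\]]*\]', rest).group(0)   # -> '[GO:0001906]'
    attributes = set(rest.replace(dbxrefs, '').split()) # -> set(['EXACT'])
    scopes = attributes & set(['RELATED', 'EXACT', 'BROAD', 'NARROW'])
    if scopes:
        scope = scopes.pop()                            # -> 'EXACT'
    else:
        scope = 'RELATED'
    return definition, scope, dbxrefs
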
class XrefValue(TaggedValue):
    '''
    A couple 'xref:value' inherited from the class TaggedValue. Used to manage
    xref tags.
    '''

    def __init__(self,line):

        ## use of the TaggedValue constructor
        TaggedValue.__init__(self, line)

        ## use the same factory function as for the dbxrefs
        self.value=xrefFactory(self.value)

        ## deprecated tags are normalized to 'xref'; a DeprecationWarning is
        ## emitted rather than raised, so that the normalization below can
        ## actually run.
        if self.tag in ('xref_analog','xref_unk'):
            import warnings
            warnings.warn('%s is a deprecated tag, use the xref tag instead' % self.tag,
                          DeprecationWarning)
            self.tag='xref'

        ## this is our own constraint: other xref tags are not handled by
        ## this parser.
        assert self.tag=='xref'

class RelationshipValue(TaggedValue):
    '''
    A couple 'relationship:value' inherited from the class TaggedValue. Used
    to manage relationship, intersection_of and union_of tags.
    '''

    def __init__(self,line):

        ## use of the TaggedValue constructor
        TaggedValue.__init__(self, line)

        ## the value is split at the first run of whitespace.
        value = self.value.split(None,1)

        ## successful split: the value carries an explicit relationship.
        if len(value)==2:
            relationship=value[0]
            term=value[1]
        ## unsuccessful split: the relationship is set by default to 'is_a'.
        else:
            relationship='is_a'
            term=value[0]

        ## recording zone
        self.value=term
        self.relationship=relationship

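## Illustrative sketch (not part of the module): how relationship-style
## values are split. 'part_of GO:0044464' carries an explicit relationship,
## while a bare term such as 'GO:0000001' falls back to 'is_a'.
def _demo_relationship_split():
    results = []
    for value in ('part_of GO:0044464', 'GO:0000001'):
        parts = value.split(None, 1)
        if len(parts) == 2:
            results.append((parts[0], parts[1]))    # explicit relationship
        else:
            results.append(('is_a', parts[0]))      # default relationship
    return results  # -> [('part_of', 'GO:0044464'), ('is_a', 'GO:0000001')]
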
class NamespaceValue(TaggedValue):
    def __init__(self,line):
        TaggedValue.__init__(self, line)

class RemarkValue(TaggedValue):
    def __init__(self,line):
        TaggedValue.__init__(self, line)
        ## a remark value is itself a 'label: value' couple.
        label,value = self.value.split(':',1)
        label = label.strip()
        value = value.strip()
        self.value=value
        self.label=label

def taggedValueFactory(line):
    '''
    A function used to dispatch lines of an OBOEntry between the class
    TaggedValue and its subclasses.

    @param line: a line of an OBOEntry composed of a tag and a value.
    @type line: string

    @return: an instance of TaggedValue or of one of its subclasses
    '''

    if (line[0:9]=='namespace' or
        line[0:17]=='default-namespace'):
        return NamespaceValue(line)
    ## DefValue is a subclass of TaggedValue
    elif line[0:3]=='def':
        return DefValue(line)
    ## SynonymValue is a subclass of TaggedValue
    ## (systematic_synonym is routed here too, since SynonymValue
    ##  explicitly handles that tag in its constructor)
    elif ((line[0:7]=="synonym" and line[0:14]!="synonymtypedef") or
          line[0:13]=="exact_synonym" or
          line[0:13]=="broad_synonym" or
          line[0:14]=="narrow_synonym" or
          line[0:18]=="systematic_synonym"):
        return SynonymValue(line)
    ## XrefValue is a subclass of TaggedValue
    elif line[0:4]=='xref':
        return XrefValue(line)
    ## NameValue is a subclass of TaggedValue
    elif line[0:4]=='name':
        return NameValue(line)
    ## RelationshipValue is a subclass of TaggedValue
    elif (line[0:15]=='intersection_of' or
          line[0:8] =='union_of' or
          line[0:12]=='relationship'):
        return RelationshipValue(line)
    elif (line[0:6]=='remark'):
        return RemarkValue(line)
    ## any other line is a plain tag / value couple (and some more features)
    else:
        return TaggedValue(line)

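## Illustrative sketch (not part of the module): a few stanza lines
## dispatched by taggedValueFactory. The concrete classes returned follow
## the tag prefixes tested above; the sample lines are hypothetical.
def _demo_dispatch():
    lines = ('name: mitochondrion inheritance',   # -> NameValue
             'namespace: biological_process',     # -> NamespaceValue
             'is_a: GO:0048311')                  # -> plain TaggedValue
    return [taggedValueFactory(l) for l in lines]
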

#################################################################################
##                            Xref treatment area                              ##
#################################################################################


class Xref(object):
    '''
    A xref object of an OBOEntry. It may be one of the 'dbxrefs' of a
    SynonymValue or DefValue object, or the 'value' of an XrefValue object.
    '''

    __splitdata__ = re.compile(' +(?=["{])')

    def __init__(self,ref):
        if ref == '' :
            ref = None
            data = ''
        else :                                        # JJ's fix, otherwise error: list index out of range
            data = Xref.__splitdata__.split(ref,1)
            ref = data[0]
        description=None
        trailing_modifiers = None
        if len(data)> 1:
            extra = data[1]
            description = TaggedValue._match_quotedString.search(extra)
            if description is not None:
                description = description.group(0)
                ## str.replace returns a new string: the result must be kept.
                extra = extra.replace(description,'')
            trailing_modifiers=trailingModifierFactory(extra)
        self.reference=ref
        self.description=description
        self.trailing_modifiers=trailing_modifiers

    def __eq__(self,b):
        return ((self.reference==b.reference) and (self.description==b.description)
                and (self.trailing_modifiers==b.trailing_modifiers))

    def __hash__(self):
        return (reduce(lambda x,y:x+y,(hash(z) for z in [self.reference,
                                                         self.description,
                                                         self.trailing_modifiers]),0)) % (2**31)

def xrefFactory(string):
    '''
    Dispatcher of xrefs.

    @param string: either a bracketed string taken from a TaggedValue subclass
                   carrying dbxrefs (currently only SynonymValue and DefValue
                   objects), or a bare string taken from an XrefValue object.
    @type string: string

    @return: a list of C{Xref} instances for a bracketed string, a single
             C{Xref} instance otherwise

    @note: The dispatcher treats bracketed strings (from SynonymValue and
           DefValue objects) and bare strings (from XrefValue objects)
           differently.
    '''

    string = string.strip()
    if string[0]=='[':
        return [Xref(x.strip()) for x in string[1:-1].split(',')]
    else:
        return Xref(string)

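## Illustrative sketch (not part of the module): the two input shapes
## accepted by xrefFactory. The identifiers are hypothetical.
def _demo_xref_factory():
    bracketed = xrefFactory('[GO:0000001, PMID:12345]')  # -> list of two Xref
    bare = xrefFactory('PMID:12345')                     # -> a single Xref
    return bracketed, bare
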

#################################################################################
##                           Stanza treatment area                             ##
#################################################################################

class OBOEntry(dict):
    '''
    An entry of an OBOFile. It can be the header (without a stanza name) or
    a stanza (with a stanza name between brackets). It inherits from the
    class dict.
    '''

    _match_stanza_name = re.compile('(?<=^\[)[^\]]*(?=\])')

    def __init__(self,stanza):
        ## True if this entry is the header of the OBO file, False otherwise.
        self.isHeader = stanza[0]!='['
        lines = stanza.split('\n')
        ## not the header : the first line is the [stanzaName].
        if not self.isHeader:
            self.stanzaName = lines[0].strip()[1:-1]
            lines=lines[1:]
            self["stanza"] = [stanza.strip()]

        ## whatever the entry is, each remaining line is a tag / value couple.
        for line in lines:
            taggedvalue = taggedValueFactory(line)
            if taggedvalue.tag in self:
                self[taggedvalue.tag].append(taggedvalue)
            else:
                self[taggedvalue.tag]=[taggedvalue]

    def parseStanzaName(stanza):
        sm = OBOEntry._match_stanza_name.search(stanza)
        if sm:
            return sm.group(0)
        else:
            return None

    parseStanzaName=staticmethod(parseStanzaName)

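## Illustrative sketch (not part of the module): an OBOEntry is a dict
## mapping tags to lists of TaggedValue objects, assuming TaggedValue splits
## 'tag: value' lines as described earlier in this file. The header content
## is hypothetical.
def _demo_obo_entry():
    header = OBOEntry('format-version: 1.2')
    return header.isHeader, header['format-version']  # -> (True, [TaggedValue])
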
class OBOTerm(OBOEntry):
    '''
    A stanza named 'Term'. It inherits from the class OBOEntry.
    '''
    def __init__(self,stanza):

        ## use of the OBOEntry constructor.
        OBOEntry.__init__(self, stanza)

        assert self.stanzaName=='Term'
        assert 'stanza' in self
        assert 'id' in self and len(self['id'])==1,"An OBOTerm must have an id"
        assert 'name' in self and len(self['name'])==1,"An OBOTerm must have a name"
        assert 'namespace' not in self or len(self['namespace'])==1,"Only one namespace is allowed for an OBO term"

        assert 'def' not in self or len(self['def'])==1,"Only one definition is allowed for an OBO term"
        assert 'comment' not in self or len(self['comment'])==1,"Only one comment is allowed for an OBO term"

        assert 'union_of' not in self or len(self['union_of'])>=2,"A union_of clause must reference at least two terms"
        assert 'intersection_of' not in self or len(self['intersection_of'])>=2,"An intersection_of clause must reference at least two terms"

        ## an obsolete term cannot carry relations to other terms.
        if self._isObsolete():
            #assert 'is_a' not in self
            assert 'relationship' not in self
            assert 'inverse_of' not in self
            assert 'disjoint_from' not in self
            assert 'union_of' not in self
            assert 'intersection_of' not in self

        ## replaced_by and consider are only allowed on obsolete terms.
        assert 'replaced_by' not in self or self._isObsolete()
        assert 'consider' not in self or self._isObsolete()

    def _getStanza(self):
        return self['stanza'][0]

    ## accessor helpers backing the read-only properties below.
    def _getDefinition(self):
        if 'def' in self:
            return self['def'][0]
        return None

    def _getId(self):
        return self['id'][0]

    def _getNamespace(self):
        return self['namespace'][0]

    def _getName(self):
        return self['name'][0]

    def _getComment(self):
        if 'comment' in self:
            return self['comment'][0]
        return None

    def _getAltIds(self):
        if 'alt_id' in self:
            return list(set(self['alt_id']))
        return None

    def _getIsA(self):
        if 'is_a' in self:
            return list(set(self['is_a']))
        return None

    def _getSynonym(self):
        if 'synonym' in self:
            return list(set(self['synonym']))
        return None

    def _getSubset(self):
        if 'subset' in self:
            return list(set(self['subset']))
        return None

    def _getXref(self):
        if 'xref' in self:
            return list(set(self['xref']))
        return None

    def _getRelationShip(self):
        if 'relationship' in self:
            return list(set(self['relationship']))
        return None

    ## the following three accessors are guarded like their siblings, so
    ## that a missing tag yields None instead of a TypeError on set(None).
    def _getUnion(self):
        if 'union_of' in self:
            return list(set(self['union_of']))
        return None

    def _getIntersection(self):
        if 'intersection_of' in self:
            return list(set(self['intersection_of']))
        return None

    def _getDisjonction(self):
        if 'disjoint_from' in self:
            return list(set(self['disjoint_from']))
        return None

    def _isObsolete(self):
        return 'is_obsolete' in self and str(self['is_obsolete'][0])=='true'

    def _getReplacedBy(self):
        if 'replaced_by' in self:
            return list(set(self['replaced_by']))
        return None

    def _getConsider(self):
        if 'consider' in self:
            return list(set(self['consider']))
        return None

    ## exposed as read-only properties.
    stanza          = property(_getStanza,None,None)
    definition      = property(_getDefinition,None,None)
    id              = property(_getId,None,None)
    namespace       = property(_getNamespace,None,None)
    name            = property(_getName,None,None)
    comment         = property(_getComment,None,None)
    alt_ids         = property(_getAltIds,None,None)
    is_a            = property(_getIsA,None,None)
    synonyms        = property(_getSynonym,None,None)
    subsets         = property(_getSubset,None,None)
    xrefs           = property(_getXref,None,None)
    relationship    = property(_getRelationShip,None,None)
    union_of        = property(_getUnion,None,None)
    intersection_of = property(_getIntersection,None,None)
    disjoint_from   = property(_getDisjonction,None,None)
    is_obsolete     = property(_isObsolete,None,None)
    replaced_by     = property(_getReplacedBy,None,None)
    consider        = property(_getConsider,None,None)

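## Illustrative sketch (not part of the module): a minimal Term stanza
## exposed through the read-only properties above, assuming TaggedValue
## parses 'tag: value' lines as described earlier. The identifiers are
## hypothetical.
def _demo_obo_term():
    stanza = '[Term]\nid: GO:0000001\nname: mitochondrion inheritance'
    term = OBOTerm(stanza)
    ## id is a plain TaggedValue, name a NameValue, is_obsolete defaults to False
    return term.id, term.name, term.is_obsolete
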
def OBOEntryFactory(stanza):
    '''
    Dispatcher of stanzas.

    @param stanza: a stanza composed of several lines.
    @type stanza: text

    @return: an C{OBOTerm} | C{OBOEntry} instance

    @note: The dispatcher treats the stanzas that are OBO "Term"s differently
           from the others.
    '''

    stanzaType = OBOEntry.parseStanzaName(stanza)

    if stanzaType=="Term":
        return OBOTerm(stanza)
    else:
        return OBOEntry(stanza)

def OBOEntryIterator(file):
    entries = stanzaIterator(file)
    for e in entries:
        debug(e)
        yield OBOEntryFactory(e)

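## Illustrative sketch of the top-level entry point (not part of the
## module): iterating the entries of an OBO file and keeping the Term
## stanzas. The file name is hypothetical.
def _demo_obo_iteration():
    f = open('gene_ontology.obo')    # hypothetical input file
    terms = [e for e in OBOEntryIterator(f) if isinstance(e, OBOTerm)]
    return terms
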
137
obitools/options/__init__.py
Normal file
@ -0,0 +1,137 @@

"""
Module providing high level functions to manage command line options.
"""
import logging
import sys

from logging import debug

from optparse import OptionParser

from obitools.utils import universalOpen
from obitools.utils import fileSize
from obitools.utils import universalTell
from obitools.utils import progressBar
from obitools.format.options import addInputFormatOption, addInOutputOption,\
                                    autoEntriesIterator
import time

def getOptionManager(optionDefinitions,entryIterator=None,progdoc=None):
    '''
    Build an option manager function that is able to parse
    the command line options of the script.

    @param optionDefinitions: list of functions, each describing a set of
                              options. Each function must accept as its
                              single parameter an instance of OptionParser.
    @type optionDefinitions: list of functions.

    @param entryIterator: an iterator generator function returning
                          entries from the data files.
    @type entryIterator: an iterator generator function with only one
                         parameter of type file
    '''
    parser = OptionParser(progdoc)
    parser.add_option('--DEBUG',
                      action="store_true", dest="debug",
                      default=False,
                      help="Set logging in debug mode")

    parser.add_option('--no-psyco',
                      action="store_true", dest="noPsyco",
                      default=False,
                      help="Don't use psyco even if it is installed")

    parser.add_option('--without-progress-bar',
                      action="store_false", dest="progressbar",
                      default=True,
                      help="deactivate the progress bar")

    checkFormat=False
    for f in optionDefinitions:
        if f == addInputFormatOption or f == addInOutputOption:
            checkFormat=True
        f(parser)

    def commandLineAnalyzer():
        options,files = parser.parse_args()
        if options.debug:
            logging.root.setLevel(logging.DEBUG)

        if checkFormat:
            ei=autoEntriesIterator(options)
        else:
            ei=entryIterator

        i = allEntryIterator(files,ei,with_progress=options.progressbar)
        return options,i

    return commandLineAnalyzer

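## Illustrative sketch (assumed usage, following the docstring above): a
## script builds its option manager from option-definition functions, then
## calls it to obtain the parsed options and the entry iterator. The
## '--count' option is hypothetical, for illustration only.
def _demo_option_manager():
    def addDemoOptions(parser):
        parser.add_option('-n', '--count',
                          dest='count', type='int', default=1,
                          help='hypothetical option')
    optionParser = getOptionManager([addDemoOptions])
    options, entries = optionParser()    # parses sys.argv
    return options, entries
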
_currentInputFileName=None
_currentFile = None
_currentFileSize = None

def currentInputFileName():
    return _currentInputFileName

def currentInputFile():
    return _currentFile

def currentFileSize():
    return _currentFileSize

def currentFileTell():
    return universalTell(_currentFile)

def fileWithProgressBar(file,step=100):
    try:
        size = currentFileSize()
    except:
        size = None

    def fileBar():
        pos=1
        progressBar(pos, size, True, currentInputFileName())
        for l in file:
            ## currentFileTell is passed as a callable, presumably so that
            ## progressBar can evaluate the current position lazily.
            progressBar(currentFileTell,size,head=currentInputFileName())
            yield l
        print >>sys.stderr,''

    if size is None:
        return file
    else:
        f = fileBar()
        return f


def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102):
    global _currentFile
    global _currentInputFileName
    global _currentFileSize
    if files :
        for f in files:
            _currentInputFileName=f
            f = universalOpen(f)
            _currentFile=f
            _currentFileSize=fileSize(_currentFile)
            debug(f)
            if with_progress:
                f=fileWithProgressBar(f,step=histo_step)
            if entryIterator is None:
                for line in f:
                    yield line
            else:
                for entry in entryIterator(f):
                    yield entry
    ## no file on the command line : read from the standard input.
    else:
        if entryIterator is None:
            for line in sys.stdin:
                yield line
        else:
            for entry in entryIterator(sys.stdin):
                yield entry

Some files were not shown because too many files have changed in this diff.