git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@390 60f365c0-8329-0410-b2a4-ec073aeeaa1d

This commit is contained in:
2011-12-05 12:59:34 +00:00
parent 957a59eb5d
commit 4e5d8893e5
155 changed files with 16897 additions and 0 deletions

1054
obitools/SVGdraw.py Normal file

File diff suppressed because it is too large Load Diff

711
obitools/__init__.py Normal file
View File

@ -0,0 +1,711 @@
'''
**obitools** main module
------------------------
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>
obitools module provides base class for sequence manipulation.
All biological sequences must be subclass of :py:class:`obitools.BioSequence`.
Some biological sequences are defined as transformation of other
biological sequences. For example Reversed complemented sequences
are a transformation of a :py:class:`obitools.NucSequence`. This particular
type of sequences are subclasses of the :py:class:`obitools.WrappedBioSequence`.
.. inheritance-diagram:: BioSequence NucSequence AASequence WrappedBioSequence SubSequence DNAComplementSequence
:parts: 1
'''
from weakref import ref
from obitools.utils.iterator import uniqueChain
from itertools import chain
import re
_default_raw_parser = " %s *= *([^;]*);"
try:
    from functools import partial
except ImportError:
    #
    # Pure-python fallback kept for compatibility with Python < 2.5,
    # where functools.partial does not exist.  Only an ImportError is
    # caught: the original bare `except:` would also have swallowed
    # unrelated errors such as KeyboardInterrupt.
    #
    def partial(func, *args, **keywords):
        """Return a callable behaving like *func* with *args*/*keywords* pre-bound."""
        def newfunc(*fargs, **fkeywords):
            newkeywords = keywords.copy()
            newkeywords.update(fkeywords)
            return func(*(args + fargs), **newkeywords)
        # mimic functools.partial's public attributes
        newfunc.func = func
        newfunc.args = args
        newfunc.keywords = keywords
        return newfunc
from obitools.sequenceencoder import DNAComplementEncoder
from obitools.location import Location
class WrapperSetIterator(object):
    # Iterator over a WrapperSet.  The set stores weak references
    # (weakref.ref objects) to wrapper sequences; iterating dereferences
    # each weakref by calling it and yields the referenced object itself.
    def __init__(self,s):
        self._i = set.__iter__(s)
    def next(self):
        # Python 2 iterator protocol entry point; the trailing () call
        # dereferences the weakref returned by the underlying set iterator.
        return self._i.next()()
    def __iter__(self):
        return self
class WrapperSet(set):
    # Set of weak references whose iteration yields the referenced
    # objects themselves (see WrapperSetIterator) instead of the weakrefs.
    def __iter__(self):
        return WrapperSetIterator(self)
class BioSequence(object):
    '''
    BioSequence class is the base class for biological
    sequence representation.
    It provides storage of :
        - the sequence itself,
        - an identifier,
        - a definition,
        - a set of complementary information on a key / value principle.
    .. warning::
        :py:class:`obitools.BioSequence` is an abstract class, this constructor
        can only be called by a subclass constructor.
    '''
    def __init__(self,id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
        '''
        :param id: sequence identifier
        :type id: `str`
        :param seq: the sequence
        :type seq: `str`
        :param definition: sequence definition (optional)
        :type definition: `str`
        :param rawinfo: a text containing a set of key=value; patterns
        :type rawinfo: `str`
        :param rawparser: a text describing a regular patterns template
                          used to parse rawinfo
        :type rawparser: `str`
        :param info: extra named parameters can be added to associate complementary
                     data to the sequence
        '''
        # abstract-class guard: only subclasses may be instantiated
        assert type(self)!=BioSequence,"obitools.BioSequence is an abstract class"
        # sequences are normalised to lower case on storage
        self._seq=str(seq).lower()
        self._info = dict(info)
        if rawinfo is not None:
            # the leading space lets the " key=value;" parser pattern match
            # an annotation located at the very start of the raw text
            self._rawinfo=' ' + rawinfo
        else:
            self._rawinfo=None
        self._rawparser=rawparser
        self.definition=definition
        self.id=id
        # tri-state flag: None = taxid not looked up yet, then True/False
        self._hasTaxid=None
    def get_seq(self):
        # raw access to the stored (lower-cased) sequence string
        return self.__seq
    def set_seq(self, value):
        # store the sequence string and cache its length for __len__
        if not isinstance(value, str):
            value=str(value)
        self.__seq = value
        self.__len = len(value)
    def clone(self):
        # Build an independent copy of this sequence, annotations included.
        seq = type(self)(self.id,
                         str(self),
                         definition=self.definition
                         )
        seq._info=dict(self.getTags())
        seq._rawinfo=self._rawinfo
        seq._rawparser=self._rawparser
        seq._hasTaxid=self._hasTaxid
        return seq
    def getDefinition(self):
        '''
        Sequence definition getter.
        :return: the sequence definition
        :rtype: str
        '''
        return self._definition
    def setDefinition(self, value):
        '''
        Sequence definition setter.
        :param value: the new sequence definition
        :type value: C{str}
        :return: C{None}
        '''
        self._definition = value
    def getId(self):
        '''
        Sequence identifier getter
        :return: the sequence identifier
        :rtype: C{str}
        '''
        return self._id
    def setId(self, value):
        '''
        Sequence identifier setter.
        :param value: the new sequence identifier
        :type value: C{str}
        :return: C{None}
        '''
        self._id = value
    def getStr(self):
        '''
        Return the sequence as a string
        :return: the string representation of the sequence
        :rtype: str
        '''
        return self._seq
    def getSymbolAt(self,position):
        '''
        Return the symbol at C{position} in the sequence
        :param position: the desired position. Position start from 0
                         if position is < 0 then they are considered
                         to reference the end of the sequence.
        :type position: `int`
        :return: a one letter string
        :rtype: `str`
        '''
        return str(self)[position]
    def getSubSeq(self,location):
        '''
        return a subsequence as described by C{location}.
        The C{location} parameter can be a L{obitools.location.Location} instance,
        an integer or a python C{slice} instance. If C{location}
        is an integer this method is equivalent to L{getSymbolAt}.
        :param location: the positions of the subsequence to return
        :type location: C{Location} or C{int} or C{slice}
        :return: the subsequence
        :rtype: a single character as a C{str} if C{location} is an integer,
                a L{obitools.SubSequence} instance otherwise.
        '''
        if isinstance(location,Location):
            return location.extractSequence(self)
        elif isinstance(location, int):
            return self.getSymbolAt(location)
        elif isinstance(location, slice):
            return SubSequence(self,location)
        raise TypeError,'key must be a Location, an integer or a slice'
    def getKey(self,key):
        # Lazy annotation lookup: values are first searched in the _info
        # dictionary, then parsed on demand out of the raw annotation text.
        if key not in self._info:
            if self._rawinfo is None:
                if key=='count':
                    # a sequence without a count annotation counts for one read
                    return 1
                else:
                    raise KeyError,key
            p = re.compile(self._rawparser % key)
            m = p.search(self._rawinfo)
            if m is not None:
                v=m.group(1)
                # consume the matched annotation from the raw text so it
                # is parsed only once
                self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
                try:
                    # NOTE(review): eval on annotation text converts numeric
                    # and python-literal values, but is unsafe if sequence
                    # headers come from an untrusted source
                    v = eval(v)
                except:
                    # keep the raw string when it is not a python literal
                    pass
                self._info[key]=v
            else:
                if key=='count':
                    v=1
                else:
                    raise KeyError,key
        else:
            v=self._info[key]
        return v
    def extractTaxon(self):
        '''
        Extract Taxonomy information from the sequence header.
        This method by default return None. It should be subclassed
        if necessary as in L{obitools.seqdb.AnnotatedSequence}.
        :return: None
        '''
        # hasKey may lazily parse a 'taxid' annotation out of _rawinfo
        self._hasTaxid=self.hasKey('taxid')
        return None
    def __str__(self):
        return self.getStr()
    def __getitem__(self,key):
        # string keys are annotations; other keys address the sequence itself
        if isinstance(key, str):
            if key=='taxid' and self._hasTaxid is None:
                self.extractTaxon()
            return self.getKey(key)
        else:
            return self.getSubSeq(key)
    def __setitem__(self,key,value):
        self._info[key]=value
        if key=='taxid':
            # keep the taxid cache flag in sync with the annotation
            self._hasTaxid=value is not None
    def __delitem__(self,key):
        if isinstance(key, str):
            if key in self:
                del self._info[key]
            else:
                raise KeyError,key
            if key=='taxid':
                self._hasTaxid=False
        else:
            raise TypeError,key
    def __iter__(self):
        '''
        Iterate through the sequence symbols
        '''
        return iter(str(self))
    def __len__(self):
        # length cached by set_seq
        return self.__len
    def hasKey(self,key):
        # Like getKey, but only reports existence; a successful lookup in
        # the raw annotation text caches the parsed value in _info.
        rep = key in self._info
        if not rep and self._rawinfo is not None:
            p = re.compile(self._rawparser % key)
            m = p.search(self._rawinfo)
            if m is not None:
                v=m.group(1)
                self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
                try:
                    # NOTE(review): see getKey about eval on annotation text
                    v = eval(v)
                except:
                    pass
                self._info[key]=v
                rep=True
        return rep
    def __contains__(self,key):
        '''
        methods allowing to use the C{in} operator on a C{BioSequence}.
        The C{in} operator test if the C{key} value is defined for this
        sequence.
        :param key: the name of the checked value
        :type key: str
        :return: C{True} if the value is defined, {False} otherwise.
        :rtype: C{bool}
        '''
        if key=='taxid' and self._hasTaxid is None:
            self.extractTaxon()
        return self.hasKey(key)
    def rawiteritems(self):
        # iterate only the already-parsed annotations (no rawinfo parsing)
        return self._info.iteritems()
    def iteritems(self):
        '''
        iterate other items dictionary storing the values
        associated to the sequence. It works similarly to
        the iteritems function of C{dict}.
        :return: an iterator over the items (key,value)
                 link to a sequence
        :rtype: iterator over tuple
        :see: L{items}
        '''
        # force-parse every remaining " key=value;" annotation before
        # iterating, then drop the raw text: it is fully consumed
        if self._rawinfo is not None:
            p = re.compile(self._rawparser % "([a-zA-Z]\w*)")
            for k,v in p.findall(self._rawinfo):
                try:
                    # NOTE(review): see getKey about eval on annotation text
                    self._info[k]=eval(v)
                except:
                    self._info[k]=v
            self._rawinfo=None
        return self._info.iteritems()
    def items(self):
        return [x for x in self.iteritems()]
    def iterkeys(self):
        return (k for k,v in self.iteritems())
    def keys(self):
        return [x for x in self.iterkeys()]
    def getTags(self):
        # calling iteritems forces full parsing of the raw annotations
        self.iteritems()
        return self._info
    def getRoot(self):
        # a plain BioSequence is its own root (see WrappedBioSequence)
        return self
    def getWrappers(self):
        # lazily created set of weak references to the wrapper sequences
        # built on top of this one
        if not hasattr(self, '_wrappers'):
            self._wrappers=WrapperSet()
        return self._wrappers
    def register(self,wrapper):
        # register a wrapper through a weak reference; _unregister is the
        # weakref callback removing dead references from the set
        self.wrappers.add(ref(wrapper,self._unregister))
    def _unregister(self,ref):
        self.wrappers.remove(ref)
    wrappers = property(getWrappers,None,None,'')
    definition = property(getDefinition, setDefinition, None, "Sequence Definition")
    id = property(getId, setId, None, 'Sequence identifier')
    def _getTaxid(self):
        return self['taxid']
    def _setTaxid(self,taxid):
        self['taxid']=taxid
    taxid = property(_getTaxid,_setTaxid,None,'NCBI Taxonomy identifier')
    _seq = property(get_seq, set_seq, None, None)
class NucSequence(BioSequence):
    """
    Specialization of :py:class:`BioSequence` dedicated to DNA sequences.

    The constructor is the same as the :py:class:`BioSequence` one.
    """

    def isNucleotide(self):
        """
        :return: always ``True`` for a DNA sequence
        :rtype: bool
        """
        return True

    def complement(self):
        """
        Build the reverse-complemented view of this sequence.

        :return: a :py:class:`DNAComplementSequence` wrapping this sequence
        :rtype: :py:class:`DNAComplementSequence`
        """
        return DNAComplementSequence(self)
class AASequence(BioSequence):
    """
    Specialization of :py:class:`BioSequence` dedicated to protein sequences.

    The constructor is the same as the :py:class:`BioSequence` one.
    """

    def isNucleotide(self):
        """
        :return: always ``False`` for a protein sequence
        :rtype: bool
        """
        return False
class WrappedBioSequence(BioSequence):
    """
    Base class for sequences defined as a transformation (sub-sequence,
    reverse complement, alignment view...) of another sequence.  Identifier,
    definition and annotations default to the ones of the wrapped sequence.
    .. warning::
        :py:class:`obitools.WrappedBioSequence` is an abstract class, this constructor
        can only be called by a subclass constructor.
    """
    def __init__(self,reference,id=None,definition=None,**info):
        # :param reference: the sequence wrapped by this view
        # :param id: identifier; when None the wrapped id is used
        # :param definition: definition; when None the wrapped one is used
        assert type(self)!=WrappedBioSequence,"obitools.WrappedBioSequence is an abstract class"
        self._wrapped = reference
        # register this view on the wrapped sequence (weak reference)
        reference.register(self)
        self._id=id
        self.definition=definition
        self._info=info
    def clone(self):
        # the clone wraps the same reference sequence
        seq = type(self)(self.wrapped,
                         id=self._id,
                         definition=self._definition
                         )
        seq._info=dict(self._info)
        return seq
    def getWrapped(self):
        return self._wrapped
    def getDefinition(self):
        # fall back on the wrapped definition when none was given
        d = self._definition or self.wrapped.definition
        return d
    def getId(self):
        # fall back on the wrapped identifier when none was given
        d = self._id or self.wrapped.id
        return d
    def isNucleotide(self):
        return self.wrapped.isNucleotide()
    def iterkeys(self):
        # local annotation keys first, then the wrapped ones, no duplicates
        return uniqueChain(self._info.iterkeys(),
                           self.wrapped.iterkeys())
    def rawiteritems(self):
        # local annotations shadow the wrapped ones with the same key
        return chain(self._info.iteritems(),
                     (x for x in self.wrapped.rawiteritems()
                      if x[0] not in self._info))
    def iteritems(self):
        for x in self.iterkeys():
            yield (x,self[x])
    def getKey(self,key):
        # local annotations shadow the wrapped ones
        if key in self._info:
            return self._info[key]
        else:
            return self.wrapped.getKey(key)
    def hasKey(self,key):
        return key in self._info or self.wrapped.hasKey(key)
    def getSymbolAt(self,position):
        return self.wrapped.getSymbolAt(self.posInWrapped(position))
    def posInWrapped(self,position,reference=None):
        # Translate a position in this view into a position in *reference*
        # (default: the directly wrapped sequence), composing the
        # transformations when views are nested.
        if reference is None or reference is self.wrapped:
            return self._posInWrapped(position)
        else:
            return self.wrapped.posInWrapped(self._posInWrapped(position),reference)
    def getStr(self):
        return str(self.wrapped)
    def getRoot(self):
        # follow the chain of wrappers down to the original sequence
        return self.wrapped.getRoot()
    def complement(self):
        """
        The :py:meth:`complement` method of the :py:class:`WrappedBioSequence` class
        raises an exception :py:exc:`AttributeError` if the method is called and the cut
        sequence does not corresponds to a nucleic acid sequence.
        """
        if self.wrapped.isNucleotide():
            return DNAComplementSequence(self)
        raise AttributeError
    def _posInWrapped(self,position):
        # identity transformation; subclasses override this hook
        return position
    definition = property(getDefinition,BioSequence.setDefinition, None)
    id = property(getId,BioSequence.setId, None)
    wrapped = property(getWrapped, None, None, "A pointer to the wrapped sequence")
    def _getWrappedRawInfo(self):
        # the raw annotation text always belongs to the wrapped sequence
        return self.wrapped._rawinfo
    _rawinfo = property(_getWrappedRawInfo)
class SubSequence(WrappedBioSequence):
    """
    Wrapped view exposing a region of another :py:class:`obitools.BioSequence`,
    described either by a python slice or by a (start, stop) couple.
    """
    @staticmethod
    def _sign(x):
        # Return -1, 0 or 1 according to the sign of x.
        if x == 0:
            return 0
        elif x < 0:
            return -1
        return 1
    def __init__(self,reference,
                 location=None,
                 start=None,stop=None,
                 id=None,definition=None,
                 **info):
        '''
        :param reference: the sequence to cut
        :param location: a python slice describing the cut; when it is not
                         a slice the (start, stop) couple is used instead
        :param start: first position of the cut (default 0)
        :param stop: end of the cut (default len(reference))
        :param id: identifier of the subsequence (default: <wrapped id>_SUB)
        :param definition: definition of the subsequence
        '''
        # Bug fix: forward id and definition to the base constructor;
        # they were previously discarded and replaced by literal None,
        # silently ignoring caller-supplied values.
        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
        if isinstance(location, slice):
            self._location = location
        else:
            step = 1
            if not isinstance(start, int):
                start = 0
            if not isinstance(stop,int):
                stop = len(reference)
            self._location=slice(start,stop,step)
        # normalised (start, stop, step) triple for the wrapped length
        self._indices=self._location.indices(len(self.wrapped))
        self._xrange=xrange(*self._indices)
        # record the cut boundaries as an annotation
        self._info['cut']='[%d,%d,%s]' % self._indices
        if hasattr(reference,'quality'):
            # qualities follow the same cut as the sequence
            self.quality = reference.quality[self._location]
    def getId(self):
        d = self._id or ("%s_SUB" % self.wrapped.id)
        return d
    def clone(self):
        seq = WrappedBioSequence.clone(self)
        seq._location=self._location
        seq._indices=seq._location.indices(len(seq.wrapped))
        seq._xrange=xrange(*seq._indices)
        return seq
    def __len__(self):
        return len(self._xrange)
    def getStr(self):
        return ''.join([x for x in self])
    def __iter__(self):
        return (self.wrapped.getSymbolAt(x) for x in self._xrange)
    def _posInWrapped(self,position):
        # the xrange directly maps cut positions onto wrapped positions
        return self._xrange[position]
    id = property(getId,BioSequence.setId, None)
class DNAComplementSequence(WrappedBioSequence):
    """
    Class used to represent a reverse complemented DNA sequence. Usually instances
    of this class are produced by using the :py:meth:`NucSequence.complement` method.
    """
    # IUPAC nucleotide complement table; gaps are preserved and 'u' pairs with 'a'
    _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
           'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
           's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
           'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
           '-': '-'}
    def __init__(self,reference,
                 id=None,definition=None,**info):
        '''
        :param reference: the wrapped DNA sequence
        :param id: identifier (default: <wrapped id>_CMP)
        :param definition: sequence definition
        '''
        # Bug fix: forward id and definition to the base constructor;
        # they were previously discarded and replaced by literal None,
        # silently ignoring caller-supplied values.
        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
        assert reference.isNucleotide()
        self._info['complemented']=True
        if hasattr(reference,'quality'):
            # qualities are reversed along with the sequence
            self.quality = reference.quality[::-1]
    def getId(self):
        d = self._id or ("%s_CMP" % self.wrapped.id)
        return d
    def __len__(self):
        return len(self._wrapped)
    def getStr(self):
        return ''.join([x for x in self])
    def __iter__(self):
        return (self.getSymbolAt(x) for x in xrange(len(self)))
    def _posInWrapped(self,position):
        # position i maps onto the i-th symbol from the end of the wrapped seq
        return -(position+1)
    def getSymbolAt(self,position):
        return DNAComplementSequence._comp[self.wrapped.getSymbolAt(self.posInWrapped(position))]
    def complement(self):
        """
        The :py:meth:`complement` method of the :py:class:`DNAComplementSequence` class actually
        returns the wrapped sequence. Effectively the reversed complemented sequence of a reversed
        complemented sequence is the initial sequence.
        """
        return self.wrapped
    id = property(getId,BioSequence.setId, None)
def _isNucSeq(text):
    """
    Heuristically decide whether *text* looks like a nucleic acid sequence.

    Returns ``True`` when every symbol belongs to the alphabet known by
    :py:class:`DNAComplementEncoder` and strict A/C/G/T/- symbols account
    for more than 80% of the text (case insensitive).
    """
    lowered = text.lower()
    strict = sum(1 for symbol in lowered if symbol in 'acgt-')
    foreign = sum(1 for symbol in lowered
                  if symbol not in DNAComplementEncoder._comp)
    return foreign == 0 and float(strict) > len(text) * 0.8
def bioSeqGenerator(id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
    """
    Factory building either a :py:class:`NucSequence` or an :py:class:`AASequence`.

    The new instance is a :py:class:`NucSequence` when more than 80% of the
    symbols of `seq` are *A*, *C*, *G*, *T* or *-* (upper or lower case) and
    no symbol falls outside the nucleotide alphabet; otherwise an
    :py:class:`AASequence` is returned.

    :param id: sequence identifier
    :type id: `str`
    :param seq: the sequence
    :type seq: `str`
    :param definition: sequence definition (optional)
    :type definition: `str`
    :param rawinfo: a text containing a set of key=value; patterns
    :type rawinfo: `str`
    :param rawparser: a regular expression template used to parse rawinfo
    :type rawparser: `str`
    :param info: extra named parameters attached to the sequence
    """
    if _isNucSeq(seq):
        seqClass = NucSequence
    else:
        seqClass = AASequence
    return seqClass(id,seq,definition,rawinfo,rawparser,**info)

BIN
obitools/__init__.pyc Normal file

Binary file not shown.

View File

@ -0,0 +1,13 @@
from _nws import NWS
from _upperbond import indexSequences
from _lcs import LCS,lenlcs
from _assemble import DirectAssemble, ReverseAssemble
from _qsassemble import QSolexaDirectAssemble,QSolexaReverseAssemble
from _rassemble import RightDirectAssemble as RightReverseAssemble
from _qsrassemble import QSolexaRightDirectAssemble,QSolexaRightReverseAssemble
from _freeendgap import FreeEndGap
from _freeendgapfm import FreeEndGapFullMatch
from _upperbond import isLCSReachable

BIN
obitools/align/_assemble.so Executable file

Binary file not shown.

BIN
obitools/align/_dynamic.so Executable file

Binary file not shown.

BIN
obitools/align/_freeendgap.so Executable file

Binary file not shown.

BIN
obitools/align/_freeendgapfm.so Executable file

Binary file not shown.

BIN
obitools/align/_lcs.so Executable file

Binary file not shown.

BIN
obitools/align/_nws.so Executable file

Binary file not shown.

BIN
obitools/align/_profilenws.so Executable file

Binary file not shown.

BIN
obitools/align/_qsassemble.so Executable file

Binary file not shown.

BIN
obitools/align/_qsrassemble.so Executable file

Binary file not shown.

BIN
obitools/align/_rassemble.so Executable file

Binary file not shown.

BIN
obitools/align/_upperbond.so Executable file

Binary file not shown.

View File

@ -0,0 +1,56 @@
'''
Created on 14 mai 2009
@author: coissac
'''
from obitools import WrappedBioSequence
class HomoNucBioSeq(WrappedBioSequence):
    '''
    Wrapped nucleic sequence in which homopolymers (runs of identical
    nucleotides) are collapsed to a single symbol.  The length of every
    collapsed run is stored in the 'homopolymer' annotation and a
    cumulative index maps collapsed positions back to the wrapped sequence.
    '''
    def __init__(self,reference,id=None,definition=None,**info):
        '''
        :param reference: the wrapped nucleic sequence
        :param id: identifier (default: the wrapped id)
        :param definition: sequence definition
        '''
        assert reference.isNucleotide(),"reference must be a nucleic sequence"
        # Bug fix: forward id and definition to the base constructor
        # (they were previously discarded and replaced by literal None).
        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
        self.__cleanHomopolymer()
    def __cleanHomopolymer(self):
        # Collapse runs of identical symbols, recording each run length.
        symbols = []
        counts = []
        old=None
        nc=0
        for n in self._wrapped:
            if old is not None and n!=old:
                symbols.append(old)
                counts.append(nc)
                nc=0
            old=n
            nc+=1
        if old is not None:
            # Bug fix: flush the final run, which the original loop dropped.
            symbols.append(old)
            counts.append(nc)
        self._cached=''.join(symbols)
        self['homopolymer']=counts
        # cumulative run lengths used to map collapsed positions back
        # onto the wrapped sequence
        self._cumulative=[]
        total=0
        for length in counts:   # Bug fix: iterated undefined self._count before
            total+=length
            self._cumulative.append(total)
    def __len__(self):
        return len(self._cached)
    def getStr(self):
        return self._cached
    def __iter__(self):
        return iter(self._cached)
    def _posInWrapped(self,position):
        # NOTE(review): returns the cumulative length *including* run
        # `position`, i.e. the wrapped position just after the run --
        # looks like an off-by-one; TODO confirm the intended mapping.
        return self._cumulative[position]

46
obitools/align/ssearch.py Normal file
View File

@ -0,0 +1,46 @@
import os
import re
from obitools.fasta import formatFasta
class SsearchParser(object):
    # Parser for the text report produced by the fasta/ssearch programs.
    # Exposes the query name, the query length and, when a "best scores"
    # table is present, a list of best-hit property dictionaries.

    # captures the query name on the line following "Query:"
    _matchQuery = re.compile("^Query:.+\n.+?>+([^ ]+)", re.MULTILINE)
    # captures the query length (the number preceding " nt")
    _matchLQuery = re.compile("^Query:.+\n.+?(\d+)(?= nt\n)", re.MULTILINE)
    # captures the whole "best scores" table, up to the ">>>" marker
    _matchProp = re.compile("^The best scores are:.*\n(.+?)>>>", re.DOTALL+re.MULTILINE)
    def __init__(self,file):
        # :param file: an ssearch report given either as an opened
        #              file-like object or as a file name
        if isinstance(file,str):
            file = open(file,'rU')
        self.data = file.read()
        self.query= SsearchParser._matchQuery.search(self.data).group(1)
        self.queryLength= int(SsearchParser._matchLQuery.search(self.data).group(1))
        props = SsearchParser._matchProp.search(self.data)
        # NOTE(review): self.props only exists when a best-scores table was
        # found in the report -- callers must be prepared for its absence
        if props:
            # drop the header line and the two trailing lines of the table
            props=props.group(0).split('\n')[1:-2]
            self.props=[]
            for line in props:
                # each line is "<subject description>\t<numeric columns>"
                subject,tab = line.split('\t')
                tab=tab.split()
                ssp = subject.split()
                ac = ssp[0]
                # NOTE(review): assumes the 5th field from the end holds the
                # subject length followed by one extra character (e.g. ")")
                # -- confirm against the ssearch report format
                dbl= int(ssp[-5][:-1])
                ident = float(tab[0])
                # match length deduced from the alignment boundary columns
                matchlen = abs(int(tab[5]) - int(tab[4])) +1
                self.props.append({"ac" :ac,
                                   "identity" :ident,
                                   "subjectlength":dbl,
                                   'matchlength' : matchlen})
def run(seq,database,program='fasta35',opts=''):
    # Run one similarity search of *seq* against *database* and parse the
    # report.  Python 2 only (os.popen3 and the `print >>` statement).
    # NOTE(review): the command line is built by string interpolation --
    # unsafe if program/opts/database can come from untrusted input.
    ssearchin,ssearchout,ssearcherr = os.popen3("%s %s %s" % (program,opts,database))
    # feed the query sequence to the program's standard input as FASTA
    print >>ssearchin,formatFasta(seq)
    ssearchin.close()
    result = SsearchParser(ssearchout)
    return seq,result
def ssearchIterator(sequenceIterator,database,program='ssearch35',opts=''):
    """
    Generator running one similarity search per input sequence.

    Yields the (sequence, SsearchParser) couples produced by :py:func:`run`.
    """
    for query in sequenceIterator:
        yield run(query,database,program,opts)

View File

@ -0,0 +1,175 @@
from obitools import BioSequence
from obitools import WrappedBioSequence
from copy import deepcopy
class GappedPositionException(Exception):
    # Raised when a position inside an AlignedSequence falls on a gap,
    # i.e. it has no counterpart in the wrapped (ungapped) sequence.
    pass
class AlignedSequence(WrappedBioSequence):
    """
    View of a sequence inside an alignment: the wrapped (ungapped)
    sequence plus a description of its gaps.
    """
    def __init__(self,reference,
                 id=None,definition=None,**info):
        '''
        :param reference: the ungapped sequence
        :param id: identifier (default: <wrapped id>_ALN)
        :param definition: sequence definition
        '''
        # Bug fix: forward id and definition to the base constructor;
        # they were previously discarded and replaced by literal None,
        # silently ignoring caller-supplied values.
        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
        self._length=len(reference)
        # gap vector: list of [symbol count, gap length] couples;
        # initially one couple covering the whole sequence with no gap
        self._gaps=[[self._length,0]]
    def clone(self):
        seq = WrappedBioSequence.clone(self)
        seq._gaps=deepcopy(self._gaps)
        seq._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0)
        return seq
    def setGaps(self, value):
        '''
        Set gap vector to an AlignedSequence.
        Gap vector describes the gap positions on a sequence.
        It is a list of couples. The first couple member is the count
        of sequence letters, the second one is the gap length.
        @param value: a list of length 2 list describing gap positions
        @type value: list of couple
        '''
        assert isinstance(value, list),'Gap vector must be a list'
        assert reduce(lambda x,y: x and y,
                      (isinstance(z, list) and len(z)==2 for z in value),
                      True),"Value must be a list of length 2 list"
        # the symbol counts must add up to the wrapped sequence length
        lseq = reduce(lambda x,y:x+y, (z[0] for z in value),0)
        assert lseq==len(self.wrapped),"Gap vector incompatible with the sequence"
        self._gaps = value
        self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in value),0)
    def getGaps(self):
        return tuple(self._gaps)
    gaps = property(getGaps, setGaps, None, "Gaps's Docstring")
    def _getIndice(self,pos):
        # Locate the couple of the gap vector containing gapped position
        # *pos*.  Returns (couple index, offset); the offset is >= 0 inside
        # the symbol part and < 0 inside the gap part of the couple.
        i=0
        cpos=0
        for s,g in self._gaps:
            cpos+=s
            if cpos>pos:
                return i,pos-cpos+s
            cpos+=g
            if cpos>pos:
                return i,-pos+cpos-g-1
            i+=1
        raise IndexError
    def getId(self):
        d = self._id or ("%s_ALN" % self.wrapped.id)
        return d
    def __len__(self):
        return self._length
    def getStr(self):
        return ''.join([x for x in self])
    def __iter__(self):
        # yield the wrapped symbols interleaved with the '-' runs described
        # by the gap vector
        def isymb():
            cpos=0
            for s,g in self._gaps:
                for x in xrange(s):
                    yield self.wrapped[cpos+x]
                for x in xrange(g):
                    yield '-'
                cpos+=s
        return isymb()
    def _posInWrapped(self,position):
        i,s=self._getIndice(position)
        if s<0:
            # the position falls inside a gap: no wrapped counterpart
            raise GappedPositionException
        value=self._gaps
        p=reduce(lambda x,y:x+y, (z[0] for z in value[:i]),0)+s
        return p
    def getSymbolAt(self,position):
        try:
            return self.wrapped.getSymbolAt(self.posInWrapped(position))
        except GappedPositionException:
            return '-'
    def insertGap(self,position,count=1):
        # Insert *count* gap symbols before *position*, splitting a symbol
        # run or enlarging an existing gap as required.
        if position==self._length:
            idx=len(self._gaps)-1
            p=-1
        else:
            idx,p = self._getIndice(position)
        if p >= 0:
            self._gaps.insert(idx, [p,count])
            self._gaps[idx+1][0]-=p
        else:
            self._gaps[idx][1]+=count
        self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0)
    id = property(getId,BioSequence.setId, None, "Sequence Identifier")
class Alignment(list):
    """
    List of aligned :py:class:`BioSequence` objects, all constrained to
    share the same length (checked on append / item assignment).
    """

    def _assertData(self,data):
        # Refuse anything that is not a BioSequence of the alignment length;
        # the first accepted sequence fixes that length.
        assert isinstance(data, BioSequence),'You must only add bioseq to an alignement'
        if hasattr(self, '_alignlen'):
            assert self._alignlen==len(data),'All aligned sequences must have the same length'
        else:
            self._alignlen=len(data)
        return data

    def clone(self):
        # clone every member into a brand new alignment
        return Alignment(member.clone() for member in self)

    def append(self,data):
        list.append(self,self._assertData(data))

    def __setitem__(self,index,data):
        list.__setitem__(self,index,self._assertData(data))

    def getSite(self,key):
        # column of the alignment at integer position *key*
        if isinstance(key,int):
            return [member[key] for member in self]

    def insertGap(self,position,count=1):
        # insert the same gap in every aligned sequence
        for member in self:
            member.insertGap(position,count)

    def isFullGapSite(self,key):
        # True when the column contains only gap symbols
        return all(symbol=='-' for symbol in self.getSite(key))

    def isGappedSite(self,key):
        # True when the column contains at least one gap symbol
        return '-' in self.getSite(key)

    def __str__(self):
        # pretty-print the alignment 60 columns at a time
        width = len(self[0])
        idmax = max(len(member.id) for member in self)+2
        template= "%%-%ds %%-60s" % idmax
        chunks = []
        for start in xrange(0,width,60):
            for member in self:
                chunks.append((template % (member.id,member[start:start+60])).strip() + '\n')
            chunks.append("\n")
        return ''.join(chunks)
def alignmentReader(file,sequenceIterator):
    """
    Load a whole alignment from *file*, parsed by *sequenceIterator*.

    :param file: the file to read
    :param sequenceIterator: a parser returning an iterator over the
                             aligned sequences of the file
    :return: an :py:class:`Alignment` instance
    """
    result = Alignment()
    for record in sequenceIterator(file):
        result.append(record)
    return result
def columnIterator(alignment):
    """
    Iterate over the columns of *alignment*.

    Yields, for each position, the list of the symbols observed at this
    position in every aligned sequence.
    """
    width = len(alignment[0])
    for position in xrange(width):
        yield [member[position] for member in alignment]

47
obitools/alignment/ace.py Normal file
View File

@ -0,0 +1,47 @@
from obitools.format.genericparser import GenericParser
from obitools.utils import universalOpen
from obitools.fasta import parseFastaDescription
from obitools import NucSequence
import sys
# Generic parser for ACE assembly files: a new entry starts at each "CO " line.
_contigIterator=GenericParser('^CO ')
# AF line: read name, orientation (U/C) and padded start position
_contigIterator.addParseAction('AF', '\nAF +(\S+) +([UC]) +(-?[0-9]+)')
# RD line: read name, three numeric fields, then the padded sequence itself
_contigIterator.addParseAction('RD', '\nRD +(\S+) +([0-9]+) +([0-9]+) +([0-9]+) *\n([A-Za-z\n*]+?)\n\n')
# DS line: free-text description of the read
_contigIterator.addParseAction('DS', '\nDS +(.+)')
# CO line: contig name
_contigIterator.addParseAction('CO', '^CO (\S+)')
def contigIterator(file):
    '''
    Iterate over the contigs described in an ACE assembly file.

    For each contig, yields a (contig name, list of NucSequence) couple
    where every read is padded with '-' so that all the reads of the
    contig share the same coordinate system.

    .. note:: Python 2 only (`print >>` statement, `map(None, ...)`).
    '''
    file = universalOpen(file)
    for entry in _contigIterator(file):
        contig=[]
        # map(None, ...) zips the RD, DS and AF records of the contig
        for rd,ds,af in map(None,entry['RD'],entry['DS'],entry['AF']):
            id = rd[0]
            shift = int(af[2])
            if shift < 0:
                # Bug fix: corrected typo in the user-facing message
                # ("paddng" -> "padding")
                print >> sys.stderr,"Sequence %s in contig %s has a negative padding value %d : skipped" % (id,entry['CO'][0],shift)
                #continue
            definition,info = parseFastaDescription(ds)
            info['shift']=shift
            # drop line breaks and turn pad symbols '*' into gaps '-'
            seq = rd[4].replace('\n','').replace('*','-').strip()
            contig.append(NucSequence(id,seq,definition,**info))
        maxlen = max(len(x)+x['shift'] for x in contig)
        minshift=min(x['shift'] for x in contig)
        rep = []
        for s in contig:
            info = s.getTags()
            # re-base every shift on the smallest one, then pad both ends
            info['shift']-=minshift-1
            head = '-' * (info['shift']-1)
            tail = (maxlen + minshift - len(s) - info['shift'] - 1)
            info['tail']=tail
            newseq = NucSequence(s.id,head + str(s)+ '-' * tail,s.definition,**info)
            rep.append(newseq)
        yield entry['CO'][0],rep

View File

@ -0,0 +1,7 @@
'''
@author: merciece
Creates the tree representing the coverage of 2 primers from an ecoPCR output file and an ecoPCR database.
'''

View File

@ -0,0 +1,62 @@
#!/usr/local/bin/python
'''
Created on 24 nov. 2011
@author: merciece
'''
def main(amplifiedSeqs, seqsFromDB, keptRanks, errors, tax) :
    '''
    Compute the taxonomic coverage (Bc, in percent) of a primer pair.

    For every taxonomic group of the kept ranks, the number of taxa
    amplified with at most 3 total primer mismatches (error threshold is
    set to 3) is compared to the number of taxa present in the reference
    database.

    :param amplifiedSeqs: sequences amplified during the ecoPCR run
    :param seqsFromDB: sequences of the reference database
    :param keptRanks: set of rank identifiers taken into account
    :param errors: mapping sequence id -> error record (index 2 = total)
    :param tax: taxonomy providing ``parentalTreeIterator``
    :return: mapping group -> coverage percentage rounded to 2 digits
    '''
    def taxaByGroup(sequences):
        # Collect, for every kept-rank ancestor group, the set of taxids.
        groups = {}
        for sequence in sequences:
            taxid = sequence['taxid']
            lineage = [ancestor for ancestor in tax.parentalTreeIterator(taxid)]
            for ancestor in lineage:
                # skip the taxon itself; keep only ancestors of kept ranks
                if ancestor != lineage[0] and ancestor[1] in keptRanks:
                    groups.setdefault(ancestor[0], set()).add(taxid)
        return groups

    dbGroups = taxaByGroup(seqsFromDB)
    # only sequences amplified with at most 3 mismatches are counted
    wellAmplified = (s for s in amplifiedSeqs if errors[s.id][2] <= 3)
    ampGroups = taxaByGroup(wellAmplified)

    BcValues = {}
    for group in dbGroups:
        if group in ampGroups:
            coverage = float(len(ampGroups[group])) / len(dbGroups[group]) * 100
            BcValues[group] = round(coverage, 2)
        else:
            BcValues[group] = 0.0
    return BcValues

View File

@ -0,0 +1,72 @@
#!/usr/local/bin/python
'''
Created on 24 nov. 2011
@author: merciece
'''
import sys
def main(amplifiedSeqs, seqsFromDB, keptRanks, tax) :
    # Debug / incomplete variant of the Bc computation: it only prints the
    # taxa-by-group tables on stderr and returns an empty dictionary.
    # Python 2 only (`print>>` statement).
    BcValues = {}
    #speciesid = tax.findRankByName('species')
    #subspeciesid = tax.findRankByName('subspecies')
    listtaxonbygroup = {}
    for seq in seqsFromDB :
        taxid = seq['taxid']
        # full lineage of the taxon; the first element is the taxon itself
        p = [a for a in tax.parentalTreeIterator(taxid)]
        for a in p :
            if a != p[0] :
                if a[1] in keptRanks :
                    # NOTE(review): here the whole (taxon, rank) couple is
                    # used as the group key, unlike the sibling scripts
                    # that use only the taxon -- confirm which is intended
                    group = a
                    if group in listtaxonbygroup:
                        listtaxonbygroup[group].add(taxid)
                    else:
                        listtaxonbygroup[group]=set([taxid])
    #stats = dict((x,len(listtaxonbygroup[x])) for x in listtaxonbygroup)
    print>>sys.stderr, listtaxonbygroup
    listtaxonbygroup = {}
    for seq in amplifiedSeqs :
        taxid = seq['taxid']
        p = [a for a in tax.parentalTreeIterator(taxid)]
        for a in p :
            if a != p[0] :
                if a[1] in keptRanks :
                    group = a
                    if group in listtaxonbygroup:
                        listtaxonbygroup[group].add(taxid)
                    else:
                        listtaxonbygroup[group]=set([taxid])
    print>>sys.stderr, listtaxonbygroup
    return BcValues
# dbstats= dict((x,len(listtaxonbygroup[x])) for x in listtaxonbygroup)
#
# ranks = [r for r in keptRanks]
# ranks.sort()
#
# print '%-20s\t%10s\t%10s\t%7s' % ('rank','ecopcr','db','percent')
#
# print>>sys.stderr, stats
# print>>sys.stderr, dbstats
# print>>sys.stderr, ranks
#
# for r in ranks:
# if r in dbstats and dbstats[r]:
# print '%-20s\t%10d\t%10d\t%8.2f' % (r,dbstats[r],stats[r],float(dbstats[r])/stats[r]*100)

View File

@ -0,0 +1,108 @@
#!/usr/local/bin/python
'''
Created on 25 nov. 2011
@author: merciece
'''
from obitools.graph.rootedtree import nexusFormat
# FigTree display-settings block appended verbatim to the generated NEXUS
# output (branch/node/tip label styles, layout, scale bar, ...).
figtree="""\
begin figtree;
set appearance.backgroundColorAttribute="User Selection";
set appearance.backgroundColour=#-1;
set appearance.branchColorAttribute="bc";
set appearance.branchLineWidth=2.0;
set appearance.foregroundColour=#-16777216;
set appearance.selectionColour=#-2144520576;
set branchLabels.colorAttribute="User Selection";
set branchLabels.displayAttribute="errors";
set branchLabels.fontName="sansserif";
set branchLabels.fontSize=10;
set branchLabels.fontStyle=0;
set branchLabels.isShown=true;
set branchLabels.significantDigits=4;
set layout.expansion=2000;
set layout.layoutType="RECTILINEAR";
set layout.zoom=0;
set nodeBars.barWidth=4.0;
set nodeLabels.colorAttribute="User Selection";
set nodeLabels.displayAttribute="label";
set nodeLabels.fontName="sansserif";
set nodeLabels.fontSize=10;
set nodeLabels.fontStyle=0;
set nodeLabels.isShown=true;
set nodeLabels.significantDigits=4;
set polarLayout.alignTipLabels=false;
set polarLayout.angularRange=0;
set polarLayout.rootAngle=0;
set polarLayout.rootLength=100;
set polarLayout.showRoot=true;
set radialLayout.spread=0.0;
set rectilinearLayout.alignTipLabels=false;
set rectilinearLayout.curvature=0;
set rectilinearLayout.rootLength=100;
set scale.offsetAge=0.0;
set scale.rootAge=1.0;
set scale.scaleFactor=1.0;
set scale.scaleRoot=false;
set scaleAxis.automaticScale=true;
set scaleAxis.fontSize=8.0;
set scaleAxis.isShown=false;
set scaleAxis.lineWidth=2.0;
set scaleAxis.majorTicks=1.0;
set scaleAxis.origin=0.0;
set scaleAxis.reverseAxis=false;
set scaleAxis.showGrid=true;
set scaleAxis.significantDigits=4;
set scaleBar.automaticScale=true;
set scaleBar.fontSize=10.0;
set scaleBar.isShown=true;
set scaleBar.lineWidth=1.0;
set scaleBar.scaleRange=0.0;
set scaleBar.significantDigits=4;
set tipLabels.colorAttribute="User Selection";
set tipLabels.displayAttribute="Names";
set tipLabels.fontName="sansserif";
set tipLabels.fontSize=10;
set tipLabels.fontStyle=0;
set tipLabels.isShown=true;
set tipLabels.significantDigits=4;
set trees.order=false;
set trees.orderType="increasing";
set trees.rooting=false;
set trees.rootingType="User Selection";
set trees.transform=false;
set trees.transformType="cladogram";
end;
"""
def cartoonRankGenerator(rank):
    """
    Build a node predicate selecting nodes of taxonomic rank *rank*.

    :return: a function usable as the ``cartoon`` argument of nexusFormat
    """
    def cartoon(node):
        if 'rank' not in node:
            return False
        return node['rank'] == rank
    return cartoon
def collapseBcGenerator(Bclimit):
    """
    Build a node predicate selecting nodes whose Bc value is at most *Bclimit*.

    :return: a function usable as the ``collapse`` argument of nexusFormat
    """
    def collapse(node):
        if 'bc' not in node:
            return False
        return node['bc'] <= Bclimit
    return collapse
def label(node):
    """
    Format the display label of a tree node: its name, prefixed by the
    signed Bc value when one is attached to the node.
    """
    if 'bc' not in node:
        return " %s" % node['name']
    return "(%+3.1f) %s" % (node['bc'], node['name'])
def main(coverageTree) :
    # Print the coverage tree as a NEXUS document decorated with the
    # FigTree settings defined in this module; nodes of rank 'family'
    # are drawn as cartoons.  Python 2 only (`print` statement).
    print nexusFormat(coverageTree,
                      label=label,
                      blocks=figtree,
                      cartoon=cartoonRankGenerator('family'))
    #collapse=collapseBcGenerator(70))

View File

@ -0,0 +1,56 @@
#!/usr/local/bin/python
'''
Created on 24 nov. 2011
@author: merciece
'''
def main(seqs, keptRanks, tax):
    '''
    Compute primer-mismatch statistics per sequence and per taxon.

    :param seqs: annotated ecoPCR sequences
    :param keptRanks: set of rank identifiers used for the aggregation
    :param tax: taxonomy providing ``parentalTreeIterator``
    :return: the (errors by sequence, errors by taxon) couple
    '''
    perSequence = getErrorsOnLeaves(seqs)
    perTaxon = propagateErrors(perSequence, keptRanks, tax)
    return perSequence, perTaxon
def getErrorsOnLeaves(seqs) :
    '''
    Build the per-sequence error table.

    :param seqs: iterable of sequences carrying the 'forward_error',
                 'reverse_error' and 'taxid' annotations
    :return: mapping sequence id -> [forward, reverse, total, 1, taxid]
    '''
    errors = {}
    for sequence in seqs:
        forward = sequence['forward_error']
        reverse = sequence['reverse_error']
        errors[sequence.id] = [forward,
                               reverse,
                               forward + reverse,
                               1,                     # one sequence per leaf
                               sequence['taxid']]
    return errors
def propagateErrors(errorsOnLeaves, keptRanks, tax) :
    '''
    Aggregate the per-sequence mismatch counts at every kept taxonomic rank.

    :param errorsOnLeaves: mapping built by :py:func:`getErrorsOnLeaves` :
                           sequence id -> [forward, reverse, total, count, taxid]
    :param keptRanks: set of rank identifiers on which errors are aggregated
    :param tax: taxonomy providing ``parentalTreeIterator``
    :return: mapping taxon -> [mean forward, mean reverse, mean total,
             sequence count, taxid of the first aggregated sequence],
             means rounded to 2 digits
    '''
    allErrors = {}
    for seq in errorsOnLeaves :
        taxid = errorsOnLeaves[seq][4]
        p = [a for a in tax.parentalTreeIterator(taxid)]
        for a in p :
            if a[1] in keptRanks :
                group = a[0]
                if group in allErrors :
                    allErrors[group][0] += errorsOnLeaves[seq][0]
                    allErrors[group][1] += errorsOnLeaves[seq][1]
                    allErrors[group][2] += errorsOnLeaves[seq][2]
                    allErrors[group][3] += 1
                else :
                    # Bug fix: copy the leaf record instead of aliasing it;
                    # the accumulation and averaging below would otherwise
                    # mutate the caller's errorsOnLeaves values in place.
                    allErrors[group] = list(errorsOnLeaves[seq])
    for group in allErrors :
        # average the accumulated errors over the number of sequences
        allErrors[group][0] /= float(allErrors[group][3])
        allErrors[group][1] /= float(allErrors[group][3])
        allErrors[group][2] /= float(allErrors[group][3])
        allErrors[group][0] = round(allErrors[group][0], 2)
        allErrors[group][1] = round(allErrors[group][1], 2)
        allErrors[group][2] = round(allErrors[group][2], 2)
    return allErrors

View File

@ -0,0 +1,69 @@
#!/usr/local/bin/python
'''
Created on 23 nov. 2011
@author: merciece
'''
from obitools.ecopcr import sequence
from obitools.ecopcr import taxonomy
def main(entries, options):
    """Load the reference database, then keep only the ecoPCR entries
    present in its ingroup.

    @return: (reference data dict, filtered entry list)
    """
    dbData = ecoPCRDatabaseReader(options)
    kept = ecoPCRFileReader(entries, dbData)
    return dbData, kept
def ecoPCRDatabaseReader(options):
    """Read the ecoPCR reference database and split it into ingroup and
    outgroup sequence sets.

    Only sequences whose lineage carries the species, genus AND family
    ranks are kept.  Ranks observed in a fraction of the ingroup larger
    than options.rankthresold are reported as the kept rank set.

    @return: dict with 'ingroup', 'outgroup', 'ranks' and 'taxonomy'
    """
    tax = taxonomy.EcoTaxonomyDB(options.taxonomy)
    seqs = sequence.EcoPCRDBSequenceIterator(options.taxonomy, taxonomy=tax)

    norankid = tax.findRankByName('no rank')
    requiredRanks = set([tax.findRankByName('species'),
                         tax.findRankByName('genus'),
                         tax.findRankByName('family')])

    rankUsage = {}
    ingroup = {}
    outgroup = {}

    for s in seqs:
        if 'taxid' not in s:
            continue
        taxid = s['taxid']
        lineageRanks = set(p[1] for p in tax.parentalTreeIterator(taxid)
                           if p[1] != norankid)
        # Keep only sequences annotated down to species, genus and family.
        if len(requiredRanks & lineageRanks) == 3:
            for r in lineageRanks:
                rankUsage[r] = rankUsage.get(r, 0) + 1
            if tax.isAncestor(options.ingroup, taxid):
                ingroup[s.id] = s
            else:
                outgroup[s.id] = s

    keptranks = set(r for r in rankUsage
                    if float(rankUsage[r]) / float(len(ingroup)) > options.rankthresold)

    return {'ingroup' : ingroup,
            'outgroup': outgroup,
            'ranks'   : keptranks,
            'taxonomy': tax}
def ecoPCRFileReader(entries, filteredDataFromDB):
    """Keep the ecoPCR entries whose id belongs to the reference
    ingroup (entries without a 'taxid' tag are dropped)."""
    ingroup = filteredDataFromDB['ingroup']
    return [s for s in entries if 'taxid' in s and s.id in ingroup]

View File

@ -0,0 +1,42 @@
#!/usr/local/bin/python
'''
Created on 25 nov. 2011
@author: merciece
'''
from obitools.graph.rootedtree import RootedTree
def main(BcValues,errors,tax) :
    """Build a RootedTree over the taxa in *BcValues*.

    Each node carries its rank, scientific name, Bc value and, when
    available in *errors*, the forward/reverse/total error means;
    -1.0 marks missing error data.  Edges link every taxon to its
    closest ancestor that is itself scored in BcValues.
    (Python 2 only: uses the iterator .next() method.)
    """
    tree = RootedTree()
    tset = set(BcValues)

    for taxon in BcValues:
        if taxon in errors :
            forErr = errors[taxon][0]
            revErr = errors[taxon][1]
            totErr = errors[taxon][2]
        else :
            # no error statistics were collected for this taxon
            forErr = -1.0
            revErr = -1.0
            totErr = -1.0
        tree.addNode(taxon, rank=tax.getRank(taxon),
                     name=tax.getScientificName(taxon),
                     bc = BcValues[taxon],
                     errors = str(forErr)+' '+str(revErr),
                     totError = totErr
                     )

    for taxon in BcValues:
        piter = tax.parentalTreeIterator(taxon)
        taxon = piter.next()
        # Walk up the lineage: connect each node to the first scored
        # ancestor, removing it from tset so it receives a single
        # parent edge.
        for parent in piter:
            if taxon[0] in tset and parent[0] in BcValues:
                tset.remove(taxon[0])
                tree.addEdge(parent[0], taxon[0])
            taxon=parent

    return tree

207
obitools/blast/__init__.py Normal file
View File

@ -0,0 +1,207 @@
from os import popen2
from itertools import imap,count
from obitools.table import iTableIterator,TableRow,Table,SelectionIterator
from obitools.utils import ColumnFile
from obitools.location import SimpleLocation
from obitools.fasta import formatFasta
import sys
class Blast(object):
    '''
    Run a local blast search (NCBI ``blastall``) and iterate over the
    tabular results.
    '''
    def __init__(self,mode,db,program='blastall',**options):
        # mode    : blast flavour to run (blastn, blastp, ...)
        # db      : path/name of the formatted blast database
        # program : executable name, defaults to 'blastall'
        # options : extra command-line switches (-e, -W, ...) passed as-is
        self._mode = mode
        self._db = db
        self._program = program
        self._options = options

    def getMode(self):
        return self._mode

    def getDb(self):
        return self._db

    def getProgram(self):
        return self._program

    def _blastcmd(self):
        # Build the command line; '-m 8' requests the tab-separated
        # output format that BlastResultIterator parses.
        tmp = """%(program)s \\
              -p %(mode)s \\
              -d %(db)s \\
              -m 8 \\
              %(options)s \\
              """
        options = ' '.join(['-%s %s' % (x[0],str(x[1]))
                            for x in self._options.iteritems()])
        data = {
            'program' : self.program,
            'db'      : self.db,
            'mode'    : self.mode,
            'options' : options
        }
        return tmp % data

    def __call__(self,sequence):
        '''
        Run blast with one sequence object.

        @param sequence: the query sequence
        @type sequence: L{obitools.BioSequence}
        @return: an iterator over the resulting matches
        @rtype: BlastResultIterator
        '''
        cmd = self._blastcmd()
        # popen2 is Python 2 only (deprecated): blast reads the fasta
        # query on stdin and writes tabular results on stdout.
        (blast_in,blast_out) = popen2(cmd)
        print >>blast_in,formatFasta(sequence)
        blast_in.close()
        blast = BlastResultIterator(blast_out)
        return blast

    mode = property(getMode, None, None, "Mode's Docstring")
    db = property(getDb, None, None, "Db's Docstring")
    program = property(getProgram, None, None, "Program's Docstring")
class NetBlast(Blast):
    '''
    Run blast remotely on the NCBI servers (network blast client).
    '''

    def __init__(self, mode, db, **options):
        '''
        @param mode: blast flavour to run (blastn, blastp, ...)
        @param db: NCBI database name
        '''
        # 'blastcl3' is the NCBI network-blast client executable.
        super(NetBlast, self).__init__(mode, db, 'blastcl3', **options)
class BlastResultIterator(iTableIterator):
    """Iterate over the rows of a tabular ('-m 8') blast output,
    yielding one match object (rowFactory) per high scoring pair."""

    def __init__(self,blastoutput,query=None):
        '''
        @param blastoutput: open stream on the tabular blast output
        @type blastoutput: file-like object
        @param query: the query sequence the matches refer to (optional)
        '''
        # ColumnFile splits each tab separated line and converts the
        # columns with self.types; lines starting with '#' are skipped.
        self._blast = ColumnFile(blastoutput,
                                 strip=True,
                                 skip="#",
                                 sep="\t",
                                 types=self.types
                                 )
        self._query = query
        # header name -> column index (imap(None,...) is Python 2 zip)
        self._hindex = dict((k,i) for i,k in imap(None,count(),self._getHeaders()))

    def _getHeaders(self):
        # Column names of the NCBI '-m 8' tabular format, in file order.
        return ('Query id','Subject id',
                '% identity','alignment length',
                'mismatches', 'gap openings',
                'q. start', 'q. end',
                's. start', 's. end',
                'e-value', 'bit score')

    def _getTypes(self):
        # Python type of each column, matching _getHeaders.
        return (str,str,
                float,int,
                int,int,
                int,int,
                int,int,
                float,float)

    def _getRowFactory(self):
        return BlastMatch

    def _getSubrowFactory(self):
        return TableRow

    def _getQuery(self):
        return self._query

    headers = property(_getHeaders,None,None)
    types = property(_getTypes,None,None)
    rowFactory = property(_getRowFactory,None,None)
    subrowFactory = property(_getSubrowFactory,None,None)
    query = property(_getQuery,None,None)

    def next(self):
        '''
        Return the next blast hit wrapped in rowFactory (BlastMatch).
        (Python 2 iterator protocol.)
        '''
        value = self._blast.next()
        return self.rowFactory(self,value)
class BlastResult(Table):
    '''
    Results of a blast run.

    Empty specialization of Table: rows are BlastMatch instances.
    '''
class BlastMatch(TableRow):
    '''
    One blast high scoring pair (HSP) between a query and a subject
    sequence, as a row of the tabular output.
    '''

    def getQueryLocation(self):
        """Location of the HSP on the query (columns 6-7)."""
        return SimpleLocation(self[6], self[7])

    def getSubjectLocation(self):
        """Location of the HSP on the subject (columns 8-9)."""
        return SimpleLocation(self[8], self[9])

    def getSubjectSequence(self,database):
        """Fetch the matched subject sequence from *database* by id."""
        return database[self[1]]

    def queryCov(self,query=None):
        '''
        Compute coverage of match on query sequence.

        @param query: the query sequence. Default is None.
                      In this case the query sequence associated
                      to this blast result is used.
        @type query: L{obitools.BioSequence}

        @return: coverage fraction
        @rtype: float
        '''
        if query is None:
            query = self.table.query
        assert query is not None
        matchSpan = self[7] - self[6] + 1
        return float(matchSpan) / float(len(query))

    def __getitem__(self,key):
        # 'query coverage' is a virtual column computed on demand.
        if key == 'query coverage' and self.table.query is not None:
            return self.queryCov()
        return TableRow.__getitem__(self,key)
class BlastCovMinFilter(SelectionIterator):
    """Selection iterator keeping only the blast matches whose query
    coverage reaches a minimal threshold, on top of the usual
    SelectionIterator conditions."""

    def __init__(self,blastiterator,covmin,query=None,**conditions):
        if query is None:
            query = blastiterator.table.query
        assert query is not None
        SelectionIterator.__init__(self,blastiterator,**conditions)
        self._query = query
        self._covmin = covmin

    def _covMinPredicat(self,row):
        # True when the row covers at least covmin of the query.
        return row.queryCov(self._query) >= self._covmin

    def _checkCondition(self,row):
        # Coverage test first, then the inherited conditions.
        if not self._covMinPredicat(row):
            return False
        return SelectionIterator._checkCondition(self, row)

376
obitools/carto/__init__.py Normal file
View File

@ -0,0 +1,376 @@
# -*- coding: latin1 -*-
from obitools import SVGdraw
import math
class Map(object):
    """
    Map represents a physical genetic map.

    A map is defined by the length of the sequence associated with it.
    A map owns a number of levels (Level), themselves split into
    sub-levels (SubLevel); the sub-levels hold the features.
    (Docstring translated from the original French.)
    """
    def __init__(self,name,seqlength,scale=1):
        """
        Build a new map.

        *Param*:
            name
                name of the map
            seqlength
                length of the sequence associated with the map
            scale
                scale of the map: how many pixels correspond
                to one map unit
        """
        self.name = name
        self.seqlength = seqlength
        self.scale = scale
        self.levels = {}
        # base height (pixels) of one cartographic element
        self.basicHSize = 10

    def __str__(self):
        return '<%s>' % self.name

    def __getitem__(self,level):
        """
        Return level *level* of the map, creating it if it does not
        exist yet.  Levels are non-zero integers: positive above the
        sequence axis, negative below.
        """
        if not isinstance(level,int):
            raise TypeError('level must be an non Zero integer value')
        elif level==0:
            raise AssertionError('Level cannot be set to 0')
        try:
            return self.levels[level]
        except KeyError:
            self.levels[level] = Level(level,self)
            return self.levels[level]

    def getBasicHSize(self):
        """
        Return the base height of a cartographic element, in pixels.
        """
        return self.basicHSize

    def getScale(self):
        """
        Return the scale of the map as pixels per physical map unit.
        """
        return self.scale

    def getNegativeBase(self):
        # y coordinate below which the negative levels are stacked:
        # total height minus the height of every negative level.
        return reduce(lambda x,y:x-y,[self.levels[z].getHeight()
                                      for z in self.levels
                                      if z < 0],self.getHeight())

    def getPositiveBase(self):
        return self.getNegativeBase() - 3 * self.getBasicHSize()

    def getHeight(self):
        # Sum of all level heights plus a 4-unit margin.
        return reduce(lambda x,y:x+y,[z.getHeight() for z in self.levels.values()],0) \
               + 4 * self.getBasicHSize()

    def toXML(self,file=None,begin=0,end=None):
        # Render the map (or its [begin,end] slice) as SVG; when *file*
        # is given the XML is also written to it.
        dessin = SVGdraw.drawing()
        if end==None:
            end = self.seqlength
        hauteur= self.getHeight()
        largeur=(end-begin+1)*self.scale
        svg = SVGdraw.svg((begin*self.scale,0,largeur,hauteur),
                          '%fpx' % (self.seqlength * self.scale),
                          '%dpx' % hauteur)
        # Central axis representing the sequence itself.
        # NOTE(review): under Python 2, 1/4 is integer division (== 0),
        # so (1 + 1/4) evaluates to 1 -- confirm this is intended.
        centre = self.getPositiveBase() + (1 + 1/4) * self.getBasicHSize()
        svg.addElement(SVGdraw.rect(0,centre,self.seqlength * self.scale,self.getBasicHSize()/2))
        for e in self.levels.values():
            svg.addElement(e.getElement())
        dessin.setSVG(svg)
        return dessin.toXml(file)
class Feature(object):
    # Abstract marker base class for everything that can be added to a
    # SubLevel (SubLevel.add type-checks against it).
    pass
class Level(object):
    """One horizontal drawing level of a Map.  Positive levels sit above
    the sequence axis, negative levels below; a level is split into
    sub-levels that hold the features."""

    def __init__(self,level,map):
        # level : non-zero signed index of this level
        # map   : owner Map instance (registers itself in map.levels)
        if not isinstance(map,Map):
            raise AssertionError('map is not an instance of class Map')
        if level in map.levels:
            raise AssertionError('Level %d already define for map %s' % (level,map))
        else:
            map.levels[level] = self
        self.map = map
        self.level = level
        self.sublevels = {}

    def __getitem__(self,sublevel):
        """
        Return sub-level *sublevel* of this level, creating it if it
        does not exist yet.  (Translated from the original French.)
        """
        if not isinstance(sublevel,int):
            raise TypeError('sublevel must be a positive integer value')
        elif sublevel<0:
            raise AssertionError('Level cannot be negative')
        try:
            return self.sublevels[sublevel]
        except KeyError:
            self.sublevels[sublevel] = SubLevel(sublevel,self)
            return self.sublevels[sublevel]

    def getBase(self):
        # y coordinate where this level starts, obtained by stacking
        # the heights of the levels between the axis and this one.
        if self.level < 0:
            base = self.map.getNegativeBase()
            base += reduce(lambda x,y:x+y,[self.map.levels[z].getHeight()
                                           for z in self.map.levels
                                           if z <0 and z >= self.level],0)
            return base
        else:
            base = self.map.getPositiveBase()
            base -= reduce(lambda x,y:x+y,[self.map.levels[z].getHeight()
                                           for z in self.map.levels
                                           if z >0 and z < self.level],0)
            return base

    def getElement(self):
        # SVG group gathering every sub-level of this level.
        objet = SVGdraw.group('level%d' % self.level)
        for e in self.sublevels.values():
            objet.addElement(e.getElement())
        return objet

    def getHeight(self):
        # Sum of the sub-level heights plus a 2-unit margin.
        return reduce(lambda x,y:x+y,[z.getHeight() for z in self.sublevels.values()],0) \
               + 2 * self.map.getBasicHSize()
class SubLevel(object):
    """A slice of a Level holding named features, stacked away from the
    sequence axis."""

    def __init__(self,sublevel,level):
        # sublevel : positive index of this sub-level inside *level*
        # level    : owner Level instance (registers itself in it)
        if not isinstance(level,Level):
            raise AssertionError('level is not an instance of class Level')
        # BUG FIX: the duplicate check used to test
        # 'level in level.sublevels', comparing a Level object against
        # the dict's integer keys, so it could never fire; test the
        # sublevel index instead, as the error message implies.
        if sublevel in level.sublevels:
            raise AssertionError('Sublevel %d already define for level %s' % (sublevel,level))
        else:
            level.sublevels[sublevel] = self
        self.level = level
        self.sublevel = sublevel
        self.features = {}

    def getHeight(self):
        # Tallest feature plus a 4-unit margin.
        return max([x.getHeight() for x in self.features.values()]+[0]) + 4 * self.level.map.getBasicHSize()

    def getBase(self):
        # y coordinate from which this sub-level's features are drawn,
        # stacked away from the axis in both directions.
        base = self.level.getBase()
        if self.level.level < 0:
            base -= self.level.getHeight() - 2 * self.level.map.getBasicHSize()
            base += reduce(lambda x,y:x+y,[self.level.sublevels[z].getHeight()
                                           for z in self.level.sublevels
                                           if z <= self.sublevel],0)
            base -= 2* self.level.map.getBasicHSize()
        else:
            base -= reduce(lambda x,y:x+y,[self.level.sublevels[z].getHeight()
                                           for z in self.level.sublevels
                                           if z < self.sublevel],0)
            base -= self.level.map.getBasicHSize()
        return base

    def getElement(self):
        # SVG group gathering every feature of this sub-level.
        base = self.getBase()
        objet = SVGdraw.group('sublevel%d' % self.sublevel)
        for e in self.features.values():
            objet.addElement(e.getElement(base))
        return objet

    def add(self,feature):
        """Register *feature* (a Feature instance with a unique name)
        in this sub-level and attach it back to the sub-level."""
        if not isinstance(feature,Feature):
            raise TypeError('feature must be an instance oof Feature')
        if feature.name in self.features:
            raise AssertionError('A feature with the same name (%s) have already be insert in this sublevel'
                                 % feature.name)
        self.features[feature.name]=feature
        feature.sublevel=self
class SimpleFeature(Feature):
    """A feature drawn as a single filled rectangle."""

    def __init__(self,name,begin,end,visiblename=False,color=0):
        self.begin = begin
        self.end = end
        self.name = name
        self.color = color
        self.sublevel = None        # set by SubLevel.add
        self.visiblename = visiblename

    def getHeight(self):
        """Height in pixels; doubled when the name is displayed."""
        if not self.sublevel:
            raise AssertionError('Not affected Simple feature')
        unit = self.sublevel.level.map.getBasicHSize()
        if self.visiblename:
            return unit * 2
        return unit

    def getElement(self,base):
        """Build the SVG rectangle for this feature, anchored at *base*."""
        mapobj = self.sublevel.level.map
        scale = mapobj.getScale()
        unit = mapobj.getBasicHSize()
        rect = SVGdraw.rect(self.begin * scale,
                            base - unit,
                            (self.end - self.begin + 1) * scale,
                            unit,
                            stroke=self.color)
        rect.addElement(SVGdraw.description(self.name))
        return rect
class BoxFeature(SimpleFeature):
    """A feature drawn as an unfilled outline rectangle, three basic
    height units tall."""

    def getHeight(self):
        if not self.sublevel:
            raise AssertionError('Not affected Box feature')
        unit = self.sublevel.level.map.getBasicHSize()
        if self.visiblename:
            return unit * 4
        return unit * 3

    def getElement(self,base):
        """Build the outline SVG rectangle, anchored at *base*."""
        mapobj = self.sublevel.level.map
        scale = mapobj.getScale()
        unit = mapobj.getBasicHSize()
        box = SVGdraw.rect(self.begin * scale,
                           base - unit * 2,
                           (self.end - self.begin + 1) * scale,
                           unit * 3,
                           stroke=self.color,
                           fill="none")
        box.addElement(SVGdraw.description(self.name))
        return box
class MultiPartFeature(Feature):
    # A feature made of several rectangles (e.g. exons) joined by link
    # lines drawn between consecutive parts; links can be straight
    # peaks, flat lines (flatlink) or quadratic curves (roundlink).
    def __init__(self,name,*args,**kargs):
        # args : (begin, end) pairs, one per part, in increasing order
        self.limits = args
        self.name = name
        try:
            self.color = kargs['color']
        except KeyError:
            self.color = "black"
        try:
            self.visiblename=kargs['visiblename']
        except KeyError:
            self.visiblename=None
        try:
            self.flatlink=kargs['flatlink']
        except KeyError:
            self.flatlink=False
        try:
            self.roundlink=kargs['roundlink']
        except KeyError:
            self.roundlink=False
        self.sublevel = None

    def getHeight(self):
        if not self.sublevel:
            raise AssertionError('Not affected Simple feature')
        if self.visiblename:
            return self.sublevel.level.map.getBasicHSize() * 3
        else:
            return self.sublevel.level.map.getBasicHSize() * 2

    def getElement(self,base):
        scale = self.sublevel.level.map.getScale()
        y = base - self.sublevel.level.map.getBasicHSize()
        height = self.sublevel.level.map.getBasicHSize()
        objet = SVGdraw.group(self.name)
        # one rectangle per part
        for (debut,fin) in self.limits:
            x = debut * scale
            width = (fin - debut + 1) * scale
            part = SVGdraw.rect(x,y,width,height,fill=self.color)
            objet.addElement(part)
        # Link line from the end of each part to the start of the next.
        # NOTE(review): the loop unpacks the next (begin, end) pair as
        # (fin, next) -- 'next' shadows the builtin -- and 'height / 2'
        # is integer division under Python 2.
        debut = self.limits[0][1]
        for (fin,next) in self.limits[1:]:
            debut*=scale
            fin*=scale
            path = SVGdraw.pathdata(debut,y + height / 2)
            delta = height / 2
            if self.roundlink:
                path.qbezier((debut+fin)/2, y - delta,fin,y + height / 2)
            else:
                if self.flatlink:
                    delta = - height / 2
                path.line((debut+fin)/2, y - delta)
                path.line(fin,y + height / 2)
            path = SVGdraw.path(path,fill="none",stroke=self.color)
            objet.addElement(path)
            debut = next
        objet.addElement(SVGdraw.description(self.name))
        return objet
class TagFeature(Feature):
    """A feature drawn as a vertical bar whose height encodes *ratio*
    (fraction of ten basic-height units, with a minimum of 1 pixel)."""

    def __init__(self,name,begin,length,ratio,visiblename=False,color=0):
        self.begin = begin
        self.length = length
        self.ratio = ratio
        self.name = name
        self.color = color
        self.sublevel = None        # set by SubLevel.add
        self.visiblename = visiblename

    def getHeight(self):
        if not self.sublevel:
            raise AssertionError('Not affected Tag feature')
        return self.sublevel.level.map.getBasicHSize() * 11

    def getElement(self,base):
        """Build the SVG bar for this tag, growing upward from *base*."""
        mapobj = self.sublevel.level.map
        scale = mapobj.getScale()
        unit = mapobj.getBasicHSize()
        barHeight = math.floor(max(1, unit * 10 * self.ratio))
        bar = SVGdraw.rect(self.begin * scale,
                           base + unit - barHeight,
                           self.length * scale,
                           barHeight,
                           stroke=self.color)
        bar.addElement(SVGdraw.description(self.name))
        return bar
if __name__ == '__main__':
    # Demonstration / manual test: build a small map with features on
    # both sides of the sequence axis and dump it as SVG files.
    # (Python 2 print statements.)
    carte = Map('essai',20000,scale=0.5)
    carte[-1][0].add(SimpleFeature('toto',100,300))
    carte[1][0].add(SimpleFeature('toto',100,300))
    carte[1][1].add(SimpleFeature('toto',200,1000))
    carte[1][0].add(MultiPartFeature('bout',(1400,1450),(1470,1550),(1650,1800),color='red',flatlink=True))
    carte[1][0].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='red',flatlink=True))
    carte[-1][1].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='green'))
    carte[-1][2].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='purple',roundlink=True))
    carte[-1][1].add(BoxFeature('tutu',390,810,color='purple'))
    carte[1][0].add(BoxFeature('tutu',390,810,color='red'))
    carte[2][0].add(TagFeature('t1',1400,20,0.8))
    carte[2][0].add(TagFeature('t2',1600,20,0.2))
    carte.basicHSize=6
    print carte.toXML('truc.svg',begin=0,end=1000)
    print carte.toXML('truc2.svg',begin=460,end=2000)

0
obitools/decorator.py Normal file
View File

View File

@ -0,0 +1,29 @@
class DistanceMatrix(object):
    '''
    Abstract, lazily-evaluated lower-triangular distance matrix over
    the sequences of an alignment.  Subclasses implement evaluateDist;
    values are computed on first access and cached.
    '''

    def __init__(self,alignment):
        '''
        DistanceMatrix constructor.

        @param alignment: alignment used to compute the distance matrix
        @type alignment: obitools.align.Alignment
        '''
        # NOTE: the attribute name 'aligment' (sic) is part of the
        # public interface used by the matrix writers.
        self.aligment = alignment
        # Lower-triangular storage: row y holds y+1 cells, None meaning
        # "not computed yet".
        self.matrix = [[None] * (row + 1) for row in xrange(len(alignment))]

    def evaluateDist(self,x,y):
        '''Compute the distance between sequences *x* and *y* (abstract).'''
        raise NotImplementedError

    def __getitem__(self,key):
        assert isinstance(key,(tuple,list)) and len(key)==2, \
              'key must be a tuple or a list of two integers'
        x,y = key
        # normalize to the lower triangle (x <= y)
        if y < x:
            x, y = y, x
        cached = self.matrix[y][x]
        if cached is None:
            cached = self.evaluateDist(x,y)
            self.matrix[y][x] = cached
        return cached

View File

@ -0,0 +1,77 @@
'''
Module dedicated to compute observed divergeances from
an alignment. No distance correction is applied at all
'''
from itertools import imap
from obitools.distances import DistanceMatrix
class PairewiseGapRemoval(DistanceMatrix):
    '''
    Observed divergeance matrix from an alignment.
    Gap are removed from the alignemt on a pairewise
    sequence base
    '''

    def evaluateDist(self,x,y):
        '''
        Compute the observed divergeance from two sequences
        of an aligment.

        @attention: For performance purpose this method should
                    be directly used. use instead the __getitem__
                    method from DistanceMatrix.

        @see: L{__getitem__}

        @param x: number of the fisrt sequence in the aligment
        @type x: int
        @param y: umber of the second sequence in the aligment
        @type y: int

        @return: fraction of differing positions, gapped columns excluded
        @rtype: float
        '''
        seq1 = self.aligment[x]
        seq2 = self.aligment[y]
        # imap(None, seq1, seq2) is the Python 2 zip; the reduce folds
        # (differences, total) over all columns where neither sequence
        # carries a gap.  Raises ZeroDivisionError when every column is
        # gapped -- presumably never happens on real alignments.
        diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
                          (z[0]!=z[1] for z in imap(None,seq1,seq2)
                           if '-' not in z),(0,0))
        return float(diff)/tot
class Pairewise(DistanceMatrix):
    '''
    Observed divergeance matrix from an alignment.
    Gap are kept from the alignemt
    '''

    def evaluateDist(self,x,y):
        '''
        Compute the observed divergeance from two sequences
        of an aligment.

        @attention: For performance purpose this method should
                    be directly used. use instead the __getitem__
                    method from DistanceMatrix.

        @see: L{__getitem__}

        @param x: number of the fisrt sequence in the aligment
        @type x: int
        @param y: umber of the second sequence in the aligment
        @type y: int

        @return: fraction of differing positions, gaps counted as
                 ordinary characters
        @rtype: float
        '''
        seq1 = self.aligment[x]
        seq2 = self.aligment[y]
        # Same fold as PairewiseGapRemoval but over every column,
        # including gapped ones (imap(None,...) is the Python 2 zip).
        diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
                          (z[0]!=z[1] for z in imap(None,seq1,seq2)),
                          (0,0))
        return float(diff)/tot

View File

@ -0,0 +1,35 @@
import sys
from itertools import imap,count
def writePhylipMatrix(matrix):
    """Serialize a DistanceMatrix in PHYLIP lower-triangular format.

    PHYLIP names are limited to 10 characters, so ids are truncated;
    duplicated truncated ids are disambiguated with a '_<n>' suffix.
    (Python 2 only: uses itertools.imap/count.)

    @param matrix: a DistanceMatrix exposing .aligment and [(i,j)] access
    @return: the formatted matrix as a single string
    """
    names = [x.id for x in matrix.aligment]
    pnames= [x[:10] for x in names]
    # count how many times each truncated name occurs
    unicity={}
    redundent=[]
    for n in pnames:
        unicity[n]=unicity.get(n,0)+1
        redundent.append(unicity[n])
    # imap(None,...) is the Python 2 idiom for zip over iterables
    for i,n,r in imap(None,count(),pnames,redundent):
        alternate = n
        if r > 1:
            # keep suffixing until the shortened name becomes unique
            while alternate in pnames:
                lcut = 9 - len(str(r))
                alternate = n[:lcut]+ '_%d' % r
                r+=1
        pnames[i]='%-10s' % alternate
    # PHYLIP header: number of sequences
    firstline = '%5d' % len(matrix.aligment)
    rep = [firstline]
    for i,n in imap(None,count(),pnames):
        line = [n]
        # lower triangle only: row i lists distances to sequences 0..i-1
        for j in xrange(i):
            line.append('%5.4f' % matrix[(j,i)])
        rep.append(' '.join(line))
    return '\n'.join(rep)

25
obitools/distances/r.py Normal file
View File

@ -0,0 +1,25 @@
import sys
from itertools import imap,count
def writeRMatrix(matrix):
    """Serialize a DistanceMatrix as a whitespace-separated square
    matrix readable by R (read.table), with the sequence ids as the
    header line.

    @param matrix: a distance matrix exposing .aligment (list of
                   sequences with an .id) and [(i,j)] item access
    @return: the formatted matrix as a single string
    """
    names = [x.id for x in matrix.aligment]
    # column width: widest id, but at least 5 characters
    lmax = max(max(len(x) for x in names), 5)
    lali = len(matrix.aligment)
    nformat = '%%-%ds' % lmax
    dformat = '%%%d.4f' % lmax
    pnames = [nformat % x for x in names]
    rep = [' '.join(pnames)]
    for i in range(lali):
        # BUG FIX: values were formatted with a hard-coded '%5.4f'
        # while dformat was computed and never used, so the data
        # columns drifted out of alignment with the header whenever an
        # id exceeded 5 characters.
        line = [dformat % matrix[(j, i)] for j in range(lali)]
        rep.append(' '.join(line))
    return '\n'.join(rep)

View File

@ -0,0 +1,100 @@
# 2-bit encoding of the four nucleotides (a=0, c=1, g=2, t=3),
# each wrapped in a list so ambiguous codes can be unions of them.
_A=[0]
_C=[1]
_G=[2]
_T=[3]

# IUPAC ambiguity codes expand to the list of nucleotides they match.
_R= _A + _G
_Y= _C + _T
_M= _C + _A
_K= _T + _G
_W= _T + _A
_S= _C + _G

_B= _C + _G + _T
_D= _A + _G + _T
_H= _A + _C + _T
_V= _A + _C + _G

_N= _A + _C + _G + _T

# lower-case nucleotide letter -> list of possible 2-bit codes
_dnahash={'a':_A,
          'c':_C,
          'g':_G,
          't':_T,
          'r':_R,
          'y':_Y,
          'm':_M,
          'k':_K,
          'w':_W,
          's':_S,
          'b':_B,
          'd':_D,
          'h':_H,
          'v':_V,
          'n':_N,
          }
def hashCodeIterator(sequence,wsize,degeneratemax=0,offset=0):
    """Slide a window of *wsize* nucleotides along *sequence* and, for
    each complete window, yield (position, set of 2-bit hash codes,
    degenerate-letter count).

    Ambiguous IUPAC letters expand into every matching nucleotide, so a
    window may carry several hash codes; windows with more than
    *degeneratemax* ambiguous positions are not yielded.

    @param offset: value added to the first reported position
    """
    errors = 0            # ambiguous letters in the current window
    emask = [0] * wsize   # circular mask flagging ambiguous cells
    epointer = 0          # cursor into emask
    size = 0              # letters consumed so far (caps at wsize)
    position = offset
    hashs = set([0])
    hashmask = 0
    # hashmask keeps only the 2*wsize low-order bits of the rolling hash
    for i in xrange(wsize):
        hashmask <<= 2
        hashmask +=3

    for l in sequence:
        l = l.lower()
        hl = _dnahash[l]
        # slide the ambiguity window: forget the letter leaving it
        if emask[epointer]:
            errors-=1
            emask[epointer]=0
        if len(hl) > 1:
            errors +=1
            emask[epointer]=1
        epointer+=1
        epointer%=wsize
        if errors > degeneratemax:
            # too many ambiguities: keep a single expansion to bound
            # the size of the hash-code set
            hl=set([hl[0]])
        # extend every current code with each possible encoding of l
        hashs=set((((hc<<2) | cl) & hashmask)
                  for hc in hashs
                  for cl in hl)
        if size < wsize:
            size+=1
        if size==wsize:
            if errors <= degeneratemax:
                yield (position,hashs,errors)
            position+=1
def hashSequence(sequence,wsize,degeneratemax=0,offset=0,hashs=None):
    """Index *sequence*: for every complete window of size *wsize*,
    append its (offset-shifted) position to the bucket of each matching
    hash code.

    @param hashs: optional pre-existing bucket table (list of 4**wsize
                  lists) to accumulate into; a fresh table is built
                  when None.
    @return: the bucket table
    """
    if hashs is None:
        hashs = [[] for _ in xrange(4 ** wsize)]
    for position, codes, errors in hashCodeIterator(sequence, wsize,
                                                    degeneratemax, offset):
        for code in codes:
            hashs[code].append(position)
    return hashs
def hashSequences(sequences,wsize,maxpos,degeneratemax=0):
    """Hash several sequences into one shared bucket table.

    Each sequence is indexed with an offset equal to the cumulated
    length of the sequences before it, so positions are unique across
    the whole set.

    @param maxpos: unused here; kept for interface compatibility
    @return: (bucket table, list of per-sequence offsets)
    """
    hashs=None
    offsets=[]
    offset=0
    for s in sequences:
        offsets.append(offset)
        # BUG FIX: hashSequence *returns* the table it fills -- when
        # hashs is None a fresh table is created inside the call, so
        # the result must be captured; previously it was discarded and
        # the function always returned (None, offsets).
        hashs = hashSequence(s,wsize,degeneratemax=degeneratemax,
                             offset=offset,hashs=hashs)
        offset+=len(s)
    return hashs,offsets

View File

View File

@ -0,0 +1,32 @@
'''
Created on 25 sept. 2010
@author: coissac
'''
from obitools import NucSequence
def referenceDBIterator(options):
    """Iterate over the main-accession reference sequences of the
    database named in options.database, yielding NucSequence objects
    annotated with their taxid and reference id.

    Side effect: options.dbid is set to the database primary key.
    """
    cursor = options.ecobarcodedb.cursor()
    # NOTE(review): SQL built by string interpolation -- acceptable for
    # a trusted option value, but parameterized queries would be safer.
    cursor.execute("select id from databases.database where name='%s'" % options.database)

    options.dbid = cursor.fetchone()[0]

    cursor.execute('''
        select s.accession,r.id,r.taxid,r.sequence
        from databases.database d,
             databases.reference r,
             databases.relatedsequences s
        where r.database = d.id
          and s.reference= r.id
          and s.mainac
          and d.name = '%s'
        ''' % options.database
        )

    for ac,id,taxid,sequence in cursor:
        s = NucSequence(ac,sequence)
        s['taxid']=taxid
        s['refdbid']=id
        yield s

View File

@ -0,0 +1,50 @@
'''
Created on 25 sept. 2010
@author: coissac
'''
def alreadyIdentified(seqid,options):
    """Return True when *seqid* already has at least one identification
    recorded for the current reference database (options.dbid)."""
    cursor = options.ecobarcodedb.cursor()
    cursor.execute('''
        select count(*)
        from ecotag.identification
        where sequence=%s
          and database=%s
        ''',(int(seqid),int(options.dbid)))
    count = int(cursor.fetchone()[0])
    return count > 0
def storeIdentification(seqid,
                        idstatus,taxid,
                        matches,
                        options
                        ):
    """Persist one sequence identification and its supporting evidence,
    then commit.

    @param seqid: rawdata sequence primary key
    @param idstatus: identification status label
    @param taxid: assigned taxon id
    @param matches: mapping reference sequence id -> identity score
    @param options: carries the connection (ecobarcodedb), dbid and the
                    updatedb flag
    """
    cursor = options.ecobarcodedb.cursor()

    # outside update mode, any previous identification is replaced
    if not options.updatedb:
        cursor.execute('''
            delete from ecotag.identification where sequence=%s and database=%s
            ''',(int(seqid),int(options.dbid)))

    cursor.execute('''
        insert into ecotag.identification (sequence,database,idstatus,taxid)
        values (%s,%s,%s,%s)
        returning id
        ''' , (int(seqid),int(options.dbid),idstatus,int(taxid)))

    idid = cursor.fetchone()[0]

    # one evidence row per matched reference sequence
    for seq,identity in matches.iteritems():
        cursor.execute('''
            insert into ecotag.evidence (identification,reference,identity)
            values (%s,
                    %s,
                    %s)
            ''',(idid,seq,identity))

    cursor.close()
    options.ecobarcodedb.commit()

View File

@ -0,0 +1,64 @@
'''
Created on 23 sept. 2010
@author: coissac
'''
import psycopg2
from obitools.ecobarcode.taxonomy import EcoTaxonomyDB
def addEcoBarcodeDBOption(optionManager):
    """Register the ecobarcode PostgreSQL connection options on the
    option manager (database name, server, user, port, password and
    amplification primer)."""
    specs = (('--dbname', 'ecobarcodedb', None,
              "Specify the name of the ecobarcode database"),
             ('--server', 'dbserver', "localhost",
              "Specify the adress of the ecobarcode database server"),
             ('--user', 'dbuser', 'postgres',
              "Specify the user of the ecobarcode database"),
             ('--port', 'dbport', 5432,
              "Specify the port of the ecobarcode database"),
             ('--passwd', 'dbpasswd', '',
              "Specify the passwd of the ecobarcode database"),
             ('--primer', 'primer', None,
              "Specify the primer used for amplification"),
             )
    for flag, dest, default, helptext in specs:
        optionManager.add_option(flag,
                                 action="store", dest=dest,
                                 type='str',
                                 default=default,
                                 help=helptext)
def ecobarcodeDatabaseConnection(options):
    """Open the ecobarcode PostgreSQL connection described by *options*
    and return an EcoTaxonomyDB bound to it, or None when no database
    name was supplied.

    Side effects on success: options.dbname receives the database name
    and options.ecobarcodedb is replaced by the live connection.
    """
    if options.ecobarcodedb is None:
        return None
    connection = psycopg2.connect(database=options.ecobarcodedb,
                                  user=options.dbuser,
                                  password=options.dbpasswd,
                                  host=options.dbserver,
                                  port=options.dbport)
    options.dbname = options.ecobarcodedb
    options.ecobarcodedb = connection
    return EcoTaxonomyDB(connection)

View File

@ -0,0 +1,38 @@
'''
Created on 25 sept. 2010
@author: coissac
'''
from obitools import NucSequence
from obitools.utils import progressBar
from obitools.ecobarcode.ecotag import alreadyIdentified
import sys
def sequenceIterator(options):
    """Yield the raw sequences amplified with options.primer, each
    annotated with its total occurrence count, while drawing a progress
    bar on stderr.

    In update mode (options.updatedb) sequences that already have an
    identification in the database are skipped.
    """
    cursor = options.ecobarcodedb.cursor()
    # NOTE(review): the primer is interpolated into the SQL string --
    # safe only for trusted option values.
    cursor.execute('''
        select s.id,sum(o.count),s.sequence
        from rawdata.sequence s,
             rawdata.occurrences o
        where o.sequence= s.id
          and s.primers = '%s'
        group by s.id,s.sequence
        ''' % options.primer
        )

    nbseq = cursor.rowcount
    progressBar(1, nbseq, True, head=options.dbname)
    for id,count,sequence in cursor:
        progressBar(cursor.rownumber+1, nbseq, head=options.dbname)
        if not options.updatedb or not alreadyIdentified(id,options):
            s = NucSequence(id,sequence)
            s['count']=count
            # '+' marks a yielded sequence (Python 2 print)
            print >>sys.stderr,' +', cursor.rownumber+1,
            yield s
        else:
            # '@' marks a sequence skipped because already identified
            print >>sys.stderr,' @', cursor.rownumber+1,
    print >>sys.stderr

View File

@ -0,0 +1,120 @@
'''
Created on 24 sept. 2010
@author: coissac
'''
from obitools.ecopcr.taxonomy import TaxonomyDump
from obitools.ecopcr.taxonomy import Taxonomy
import sys
class EcoTaxonomyDB(TaxonomyDump) :
    """Taxonomy loaded from the ecobarcode PostgreSQL database
    (ncbitaxonomy.* tables) instead of NCBI taxdump flat files."""

    def __init__(self,dbconnect):
        # dbconnect: open DB-API connection to the ecobarcode database
        self._dbconnect=dbconnect

        print >> sys.stderr,"Reading ecobarcode taxonomy database..."
        self._readNodeTable()
        print >> sys.stderr," ok"

        print >>sys.stderr,"Adding scientific name..."
        self._name=[]
        for taxid,name,classname in self._nameIterator():
            self._name.append((name,classname,self._index[taxid]))
            # the scientific name is appended to the taxon record itself
            if classname == 'scientific name':
                self._taxonomy[self._index[taxid]].append(name)

        print >>sys.stderr,"Adding taxid alias..."
        # merged taxids are aliased onto their current replacement
        for taxid,current in self._mergedNodeIterator():
            self._index[taxid]=self._index[current]

        print >>sys.stderr,"Adding deleted taxid..."
        for taxid in self._deletedNodeIterator():
            self._index[taxid]=None

        Taxonomy.__init__(self)

    #####
    #
    # Iterator functions
    #
    #####

    def _readNodeTable(self):
        # Load (taxid, rank, parent) triples, encode ranks as indices,
        # sort the taxa and replace taxid/parent values by positional
        # indices into the sorted table.
        cursor = self._dbconnect.cursor()
        cursor.execute("""
            select taxid,rank,parent
            from ncbitaxonomy.nodes
            """)

        print >>sys.stderr,"Reading taxonomy nodes..."
        taxonomy=[list(n) for n in cursor]

        print >>sys.stderr,"List all taxonomy rank..."
        ranks =list(set(x[1] for x in taxonomy))
        ranks.sort()
        # rank name -> rank index (map(None,...) is Python 2 zip)
        rankidx = dict(map(None,ranks,xrange(len(ranks))))

        print >>sys.stderr,"Sorting taxons..."
        taxonomy.sort(TaxonomyDump._taxonCmp)

        self._taxonomy=taxonomy

        print >>sys.stderr,"Indexing taxonomy..."
        index = {}
        for t in self._taxonomy:
            index[t[0]]=self._bsearchTaxon(t[0])

        print >>sys.stderr,"Indexing parent and rank..."
        for t in self._taxonomy:
            t[1]=rankidx[t[1]]
            t[2]=index[t[2]]

        self._ranks=ranks
        self._index=index
        cursor.close()

    def _nameIterator(self):
        # yields (taxid, name, name class) from ncbitaxonomy.names
        cursor = self._dbconnect.cursor()
        cursor.execute("""
            select taxid,name,nameclass
            from ncbitaxonomy.names
            """)
        for taxid,name,nameclass in cursor:
            yield taxid,name,nameclass
        cursor.close()

    def _mergedNodeIterator(self):
        # yields (old taxid, new taxid) from ncbitaxonomy.merged
        cursor = self._dbconnect.cursor()
        cursor.execute("""
            select oldtaxid,newtaxid
            from ncbitaxonomy.merged
            """)
        for oldtaxid,newtaxid in cursor:
            yield oldtaxid,newtaxid
        cursor.close()

    def _deletedNodeIterator(self):
        # yields deleted taxids from ncbitaxonomy.delnodes
        cursor = self._dbconnect.cursor()
        cursor.execute("""
            select taxid
            from ncbitaxonomy.delnodes
            """)
        for taxid in cursor:
            yield taxid[0]
        cursor.close()

View File

@ -0,0 +1,69 @@
from obitools import utils
from obitools import NucSequence
from obitools.utils import universalOpen, universalTell, fileSize, progressBar
import struct
import sys
class EcoPCRFile(utils.ColumnFile):
    """Parser for ecoPCR '|' separated output files: each data line is
    turned into an annotated NucSequence."""

    def __init__(self,stream):
        # Column types of the ecoPCR tabular output, in file order;
        # lines starting with '#' are comments.
        utils.ColumnFile.__init__(self,
                                  stream, '|', True,
                                  (str,int,int,
                                   str,int,str,
                                   int,str,int,
                                   str,int,str,
                                   str,str,int,float,
                                   str,int,float,
                                   int,
                                   str,str), "#")

    def next(self):
        # Python 2 iterator protocol: build a sequence from the next
        # parsed line.  Column meanings below are assumed from the
        # ecoPCR output order -- TODO confirm against the ecoPCR
        # format documentation.
        data = utils.ColumnFile.next(self)
        seq = NucSequence(data[0],data[20],data[21])
        seq['seq_length_ori']=data[1]
        seq['taxid']=data[2]
        seq['rank']=data[3]
        seq['species']=data[4]
        seq['species_sn']=data[5]
        seq['genus']=data[6]
        seq['genus_sn']=data[7]
        seq['family']=data[8]
        seq['family_sn']=data[9]
        seq['strand']=data[12]
        seq['forward_primer']=data[13]
        seq['forward_error']=data[14]
        seq['forward_tm']=data[15]
        seq['reverse_primer']=data[16]
        seq['reverse_error']=data[17]
        seq['reverse_tm']=data[18]

        return seq
class EcoPCRDBFile(object):
    """Base class providing low-level record iteration over binary
    ecoPCRDB files (big-endian, length-prefixed records)."""

    def _ecoRecordIterator(self,file):
        file = universalOpen(file)
        # header: record count as a big-endian unsigned int
        (recordCount,) = struct.unpack('> I',file.read(4))
        self._recover=False

        if recordCount:
            for i in xrange(recordCount):
                # each record: 4-byte big-endian size, then the payload
                (recordSize,)=struct.unpack('>I',file.read(4))
                record = file.read(recordSize)
                yield record
        else:
            # A zero record count means the header was never finalized
            # (e.g. interrupted write): switch to recovery mode and
            # read records until the first failure.
            print >> sys.stderr,"\n\n WARNING : EcoPCRDB readding set into recover data mode\n"
            self._recover=True
            ok=True
            while(ok):
                try:
                    (recordSize,)=struct.unpack('>I',file.read(4))
                    record = file.read(recordSize)
                    yield record
                except:
                    # NOTE(review): the bare except is deliberate (stop
                    # on the first truncated record) but it also hides
                    # unrelated errors.
                    ok=False

View File

@ -0,0 +1,104 @@
import struct
class EcoPCRDBAnnotationWriter(object):
    '''
    Class used to write Annotation description in EcoPCRDB format.

    EcoPCRDBAnnotationWriter is oftenly called through the EcoPCRDBSequenceWriter class

    @see: L{ecopcr.sequence.EcoPCRDBSequenceWriter}
    '''

    def __init__(self,dbname,id,fileidx=1,type=('CDS',),definition=None):
        '''
        class constructor

        @param dbname: name of ecoPCR database
        @type dbname: C{str}

        @param id: name of the qualifier used as feature id
        @type id: C{str}

        @param fileidx: index used in the annotation file name suffix
        @type fileidx: C{int}

        @param type: feature types to store (a sequence of type names)
        @type type: C{list} or C{tuple}

        @param definition: qualifier used as the feature definition
        @type definition: C{str}
        '''
        # BUG FIX: the default used to be ('CDS'), which is just the
        # string 'CDS' (a tuple needs the trailing comma).  Iterating
        # it registered three one-letter types 'C','D','S' in the .fdx
        # file, and put() then crashed with KeyError('CDS') on the
        # first CDS feature.
        self._type = type
        self._definition = definition
        self._id = id
        self._filename="%s_%03d.adx" % (dbname,fileidx)
        self._file = open(self._filename,'wb')
        self._sequenceIdx=0

        # write the feature-type dictionary file (.fdx)
        ftname ="%s.fdx" % (dbname)
        ft = open(ftname,'wb')
        # feature type name -> index, in declaration order
        self._fttypeidx=dict((t,i) for i,t in enumerate(type))
        ft.write(struct.pack('> I',len(type)))
        for t in type:
            ft.write(self._ecoFtTypePacker(t))
        ft.close()

        self._annotationCount=0
        # placeholder record count; rewritten with the real value by
        # __del__ when the writer is disposed of
        self._file.write(struct.pack('> I',self._annotationCount))

    def _ecoFtTypePacker(self,type):
        # pack one feature type as a <length><name> record
        totalSize = len(type)
        packed = struct.pack('> I %ds' % totalSize,totalSize,type)

        assert len(packed) == totalSize+4, "error in feature type packing"
        return packed

    def _ecoAnnotationPacker(self,feature,seqidx):
        # pack one annotation: sequence index, 0-based begin, end,
        # type index, strand flag, 20-byte id and the definition text
        begin = feature.begin-1
        end   = feature.end
        type  = self._fttypeidx[feature.ftType]
        strand= feature.isDirect()
        id    = feature[self._id][0]
        if self._definition in feature:
            definition = feature[self._definition][0]
        else:
            definition = ''

        assert strand is not None,"Only strand defined features can be stored"

        deflength = len(definition)

        totalSize = 4 + 4 + 4 + 4 + 4 + 20 + 4 + deflength

        packed = struct.pack('> I I I I I 20s I %ds' % (deflength),
                             totalSize,
                             seqidx,
                             begin,
                             end,
                             type,
                             int(strand),
                             id,
                             deflength,
                             definition)

        assert len(packed) == totalSize+4, "error in annotation packing"
        return packed

    def put(self,sequence,seqidx=None):
        """Write the annotations of *sequence* whose feature type is
        registered; sequences are numbered automatically when *seqidx*
        is omitted."""
        if seqidx is None:
            seqidx = self._sequenceIdx
            self._sequenceIdx+=1
        for feature in sequence.getFeatureTable():
            if feature.ftType in self._type:
                self._annotationCount+=1
                self._file.write(self._ecoAnnotationPacker(feature,seqidx))

    def __del__(self):
        # finalize the file: rewrite the real record count at its head
        self._file.seek(0,0)
        self._file.write(struct.pack('> I',self._annotationCount))
        self._file.close()

129
obitools/ecopcr/options.py Normal file
View File

@ -0,0 +1,129 @@
'''
Created on 13 fevr. 2011
@author: coissac
'''
from obitools.ecopcr.taxonomy import Taxonomy, EcoTaxonomyDB, TaxonomyDump, ecoTaxonomyWriter
try:
    from obitools.ecobarcode.options import addEcoBarcodeDBOption,ecobarcodeDatabaseConnection
except ImportError:
    # The ecobarcode subsystem (psycopg2 based) is optional: fall back
    # to no-op stubs so the taxonomy options still work without it.
    def addEcoBarcodeDBOption(optionmanager):
        pass
    def ecobarcodeDatabaseConnection(options):
        return None
def addTaxonomyDBOptions(optionManager):
    """Register the taxonomy-source options (ecoPCR taxonomy database
    or NCBI taxdump directory), plus the optional ecobarcode database
    options."""
    addEcoBarcodeDBOption(optionManager)
    specs = ((('-d', '--database'), 'taxonomy',
              "ecoPCR taxonomy Database name"),
             (('-t', '--taxonomy-dump'), 'taxdump',
              "NCBI Taxonomy dump repository name"),
             )
    for flags, dest, helptext in specs:
        optionManager.add_option(flags[0], flags[1],
                                 action="store", dest=dest,
                                 metavar="<FILENAME>",
                                 type="string",
                                 help=helptext)
def addTaxonomyFilterOptions(optionManager):
    """Register taxonomy filtering options on top of the DB options."""
    addTaxonomyDBOptions(optionManager)
    optionManager.add_option('--require-rank',
                             action="append",
                             type="string",
                             dest='requiredRank',
                             default=[],
                             metavar="<RANK_NAME>",
                             help="select sequence with taxid tag containing "
                                  "a parent of rank <RANK_NAME>")
    optionManager.add_option('-r', '--required',
                             action="append",
                             type="int",
                             dest='required',
                             default=[],
                             metavar="<TAXID>",
                             help="required taxid")
    optionManager.add_option('-i', '--ignore',
                             action="append",
                             type="int",
                             dest='ignored',
                             default=[],
                             metavar="<TAXID>",
                             help="ignored taxid")
def loadTaxonomyDatabase(options):
    # Resolve the taxonomy designated by the command line options and
    # cache it on ``options.taxonomy``.  Resolution order:
    #   1. an already loaded Taxonomy instance is returned as is;
    #   2. an ecobarcode database connection, when available;
    #   3. an NCBI dump directory (--taxonomy-dump), then also converted
    #      to the binary format named by --database;
    #   4. the ecoPCR binary taxonomy named by --database.
    if isinstance(options.taxonomy, Taxonomy):
        return options.taxonomy
    taxonomy = ecobarcodeDatabaseConnection(options)
    if (taxonomy is not None or
        options.taxonomy is not None or
        options.taxdump is not None):
        if options.taxdump is not None:
            taxonomy = TaxonomyDump(options.taxdump)
        if taxonomy is not None and isinstance(options.taxonomy, str):
            # dump the loaded taxonomy into the binary ecoPCR format
            ecoTaxonomyWriter(options.taxonomy,taxonomy)
            options.ecodb=options.taxonomy
        if isinstance(options.taxonomy, Taxonomy):
            taxonomy = options.taxonomy
        if taxonomy is None and isinstance(options.taxonomy, str):
            taxonomy = EcoTaxonomyDB(options.taxonomy)
            options.ecodb=options.taxonomy
    # NOTE(review): indentation reconstructed from a flattened source;
    # the final assignment is assumed to run on every path.
    options.taxonomy=taxonomy
    return options.taxonomy
def taxonomyFilterGenerator(options):
    """Build a predicate selecting sequences on taxonomy criteria.

    The returned callable maps a sequence to a truth value according to
    the --require-rank / --required / --ignore options.  When no
    taxonomy database is available every sequence is accepted.

    @param options: the parsed command line options
    @return: a one-argument predicate usable on sequences
    """
    loadTaxonomyDatabase(options)
    if options.taxonomy is not None:
        taxonomy = options.taxonomy

        # Hoisted out of the filter: the original redefined this helper
        # on every call of taxonomyFilter.
        def annotateAtRank(seq, rank):
            # Ancestor taxid of seq at `rank`, or None when unavailable.
            if 'taxid' in seq and seq['taxid'] is not None:
                return taxonomy.getTaxonAtRank(seq['taxid'], rank)
            return None

        def taxonomyFilter(seq):
            good = True
            if 'taxid' in seq:
                taxid = seq['taxid']
                if options.requiredRank:
                    # every required rank must yield an ancestor
                    # (idiomatic replacement of reduce(lambda x,y: x and y, ...))
                    good = good and all(annotateAtRank(seq, rank) is not None
                                        for rank in options.requiredRank)
                if options.required:
                    # at least one required taxon must be an ancestor
                    good = good and any(taxonomy.isAncestor(r, taxid)
                                        for r in options.required)
                if options.ignored:
                    # none of the ignored taxa may be an ancestor
                    good = good and not any(taxonomy.isAncestor(r, taxid)
                                            for r in options.ignored)
            return good
    else:
        def taxonomyFilter(seq):
            return True
    return taxonomyFilter
def taxonomyFilterIteratorGenerator(options):
    """Wrap taxonomyFilterGenerator into a sequence-iterator filter."""
    accept = taxonomyFilterGenerator(options)

    def filterIterator(seqiterator):
        # Lazily yield only the sequences accepted by the predicate.
        return (candidate for candidate in seqiterator if accept(candidate))

    return filterIterator

133
obitools/ecopcr/sequence.py Normal file
View File

@ -0,0 +1,133 @@
from obitools import NucSequence
from obitools.ecopcr import EcoPCRDBFile
from obitools.ecopcr.taxonomy import EcoTaxonomyDB, ecoTaxonomyWriter
from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
from obitools.utils import universalOpen
from glob import glob
import struct
import gzip
import sys
class EcoPCRDBSequenceIterator(EcoPCRDBFile):
    '''
    Build an iterator over the sequences include in a sequence database
    formated for ecoPCR
    '''

    def __init__(self,path,taxonomy=None):
        '''
        ecoPCR data iterator constructor

        @param path: path to the ecoPCR database including the database prefix name
        @type path: C{str}
        @param taxonomy: a taxonomy can be given to the reader to decode the taxonomic data
                         associated to the sequences. If no Taxonomy is furnish, it will be read
                         before the sequence database files using the same path.
        @type taxonomy: L{obitools.ecopcr.taxonomy.Taxonomy}
        '''
        self._path = path
        if taxonomy is not None:
            self._taxonomy=taxonomy
        else:
            # no taxonomy supplied: read the binary taxonomy files
            # sharing the same database prefix
            self._taxonomy=EcoTaxonomyDB(path)
        # sequence files are named <prefix>_NNN.sdx; sorted so the
        # iteration order is deterministic
        self._seqfilesFiles = glob('%s_???.sdx' % self._path)
        self._seqfilesFiles.sort()

    def __ecoSequenceIterator(self,file):
        # Decode each binary record of one .sdx file into a NucSequence.
        # Record layout (big endian): taxon index, 20-byte NUL padded id,
        # definition length, sequence length, compressed length, then the
        # definition followed by the zlib-compressed sequence.
        for record in self._ecoRecordIterator(file):
            lrecord = len(record)
            lnames = lrecord - (4*4+20)
            (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
            seqid=seqid.strip('\x00')      # drop the NUL padding
            de = string[:deflength]
            seq = gzip.zlib.decompress(string[deflength:])
            # taxid here is an *index* in the taxonomy table; the real
            # taxid is looked up through the taxonomy
            bioseq = NucSequence(seqid,seq,de,taxidx=taxid,taxid=self._taxonomy._taxonomy[taxid][0])
            yield bioseq

    def __iter__(self):
        # chain iteration over every sequence file of the database
        for seqfile in self._seqfilesFiles:
            for seq in self.__ecoSequenceIterator(seqfile):
                yield seq
class EcoPCRDBSequenceWriter(object):
    # Write sequences into an ecoPCR binary sequence database file
    # (<dbname>_NNN.sdx), optionally together with the taxonomy files
    # and an annotation file.

    def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None,append=False):
        # @param dbname:     database prefix path
        # @param fileidx:    index used to build the _NNN.sdx file name
        # @param taxonomy:   optional Taxonomy written alongside the sequences
        # @param ftid:       feature qualifier used as annotation id
        #                    (required when `type` is given)
        # @param type:       feature types exported as annotations
        # @param definition: feature qualifier used as annotation definition
        # @param append:     when True, extend an existing .sdx file
        self._taxonomy=taxonomy
        self._filename="%s_%03d.sdx" % (dbname,fileidx)
        if append:
            mode ='r+b'
            # read the current record count stored in the file header
            f = universalOpen(self._filename)
            (recordCount,) = struct.unpack('> I',f.read(4))
            self._sequenceCount=recordCount
            del f
            self._file = open(self._filename,mode)
            # zero the header while writing; the real count is restored
            # by __del__
            self._file.seek(0,0)
            self._file.write(struct.pack('> I',0))
            self._file.seek(0,2)
        else:
            self._sequenceCount=0
            mode = 'wb'
            self._file = open(self._filename,mode)
            # placeholder record count, fixed up in __del__
            self._file.write(struct.pack('> I',self._sequenceCount))
        if self._taxonomy is not None:
            print >> sys.stderr,"Writing the taxonomy file...",
            ecoTaxonomyWriter(dbname,self._taxonomy)
            print >> sys.stderr,"Ok"
        if type is not None:
            assert ftid is not None,"You must specify an id attribute for features"
            self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
        else:
            self._annotation = None

    def _ecoSeqPacker(self,seq):
        # Serialize one sequence record (big endian):
        # uint32 record size, int32 taxon index (-1 = none), 20-byte id,
        # uint32 definition length, uint32 sequence length,
        # uint32 compressed length, definition, zlib-compressed sequence.
        compactseq = gzip.zlib.compress(str(seq).upper(),9)
        cptseqlength = len(compactseq)
        delength = len(seq.definition)
        totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
        if self._taxonomy is None or 'taxid' not in seq:
            taxon=-1          # -1 flags "no taxon associated"
        else:
            taxon=self._taxonomy.findIndex(seq['taxid'])
        try:
            packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength),
                                 totalSize,
                                 taxon,
                                 seq.id,
                                 delength,
                                 len(seq),
                                 cptseqlength,
                                 seq.definition,
                                 compactseq)
        except struct.error as e:
            # report the offending sequence before propagating
            print >>sys.stderr,"\n\n============\n\nError on sequence : %s\n\n" % seq.id
            raise e
        assert len(packed) == totalSize+4, "error in sequence packing"
        return packed

    def put(self,sequence):
        # Write one sequence record (and its annotations when enabled).
        if self._taxonomy is not None:
            if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'):
                sequence.extractTaxon()
        self._file.write(self._ecoSeqPacker(sequence))
        if self._annotation is not None:
            self._annotation.put(sequence, self._sequenceCount)
        self._sequenceCount+=1

    def __del__(self):
        # patch the record count header and close the file
        self._file.seek(0,0)
        self._file.write(struct.pack('> I',self._sequenceCount))
        self._file.close()

630
obitools/ecopcr/taxonomy.py Normal file
View File

@ -0,0 +1,630 @@
import struct
import sys
from itertools import count,imap
from obitools.ecopcr import EcoPCRDBFile
from obitools.utils import universalOpen
from obitools.utils import ColumnFile
class Taxonomy(object):
    """Base class holding an in-memory taxonomy.

    Subclasses must set, *before* calling this constructor:
      - ``_taxonomy``     list of records ``(taxid, rankidx, parentidx, name, origin)``
      - ``_index``        dict mapping taxid -> index in ``_taxonomy``
      - ``_ranks``        list of rank names (indexed by rankidx)
      - ``_name``         list of ``(name, nameclass, index)``
      - ``_preferedName`` list of ``(name, 'obi', index)``
    """

    def __init__(self):
        '''
        The taxonomy database constructor.
        '''
        self._ranks.append('obi')
        # cache the indices of the commonly requested ranks
        self._speciesidx = self._ranks.index('species')
        self._genusidx   = self._ranks.index('genus')
        self._familyidx  = self._ranks.index('family')
        self._orderidx   = self._ranks.index('order')
        self._nameidx = dict((x[0],x[2]) for x in self._name)
        self._nameidx.update(dict((x[0],x[2]) for x in self._preferedName))
        # BUG FIX: this index must map a taxon index to the *name*
        # (x[0]); the original stored the name class (x[1]), making
        # getPreferedName return 'obi' for every prefered name --
        # addPreferedName below stores the name, confirming the intent.
        self._preferedidx = dict((x[2],x[0]) for x in self._preferedName)
        self._bigestTaxid = max(x[0] for x in self._taxonomy)

    def findTaxonByIdx(self,idx):
        """Return the taxon record stored at position ``idx`` (None safe)."""
        if idx is None:
            return None
        return self._taxonomy[idx]

    def findIndex(self,taxid):
        """Return the position of ``taxid`` in the taxonomy table.

        ``None`` passes through; an unknown taxid raises KeyError.
        """
        if taxid is None:
            return None
        return self._index[taxid]

    def findTaxonByTaxid(self,taxid):
        """Return the taxon record identified by ``taxid``."""
        return self.findTaxonByIdx(self.findIndex(taxid))

    def findTaxonByName(self,name):
        """Return the taxon record carrying ``name`` (any name class)."""
        return self._taxonomy[self._nameidx[name]]

    def findRankByName(self,rank):
        """Return the numeric id of rank ``rank`` or None when unknown."""
        try:
            return self._ranks.index(rank)
        except ValueError:
            return None

    def __contains__(self,taxid):
        # BUG FIX: an unknown taxid made findIndex raise KeyError, so
        # membership tests crashed instead of returning False.
        try:
            return self.findTaxonByTaxid(taxid) is not None
        except KeyError:
            return False

    #####
    #
    # PUBLIC METHODS
    #
    #####

    def subTreeIterator(self, taxid):
        "return subtree for given taxonomic id "
        # BUG FIX: the original used findTaxonByTaxid (a full record) as
        # a list index; the taxon *index* is required here.
        idx = self.findIndex(taxid)
        yield self._taxonomy[idx]
        for t in self._taxonomy:
            if t[2] == idx:
                for subt in self.subTreeIterator(t[0]):
                    yield subt

    def parentalTreeIterator(self, taxid):
        """
        return parental tree for given taxonomic id starting from
        first ancester to the root.
        """
        taxon=self.findTaxonByTaxid(taxid)
        if taxon is not None:
            while taxon[2]!= 0:
                yield taxon
                taxon = self._taxonomy[taxon[2]]
            yield self._taxonomy[0]
        else:
            # portable generator termination (``raise StopIteration``
            # inside a generator is an error from Python 3.7 on)
            return

    def isAncestor(self,parent,taxid):
        """True when taxid ``parent`` lies on the parental path of ``taxid``."""
        return parent in [x[0] for x in self.parentalTreeIterator(taxid)]

    def lastCommonTaxon(self,*taxids):
        """Return the taxid of the deepest common ancestor of ``taxids``."""
        if not taxids:
            return None
        if len(taxids)==1:
            return taxids[0]
        if len(taxids)==2:
            t1 = [x[0] for x in self.parentalTreeIterator(taxids[0])]
            t2 = [x[0] for x in self.parentalTreeIterator(taxids[1])]
            t1.reverse()        # root first
            t2.reverse()
            count = min(len(t1),len(t2))
            i=0
            while(i < count and t1[i]==t2[i]):
                i+=1
            i-=1
            return t1[i]
        # more than two taxids: fold pairwise
        ancetre = taxids[0]
        for taxon in taxids[1:]:
            ancetre = self.lastCommonTaxon(ancetre,taxon)
        return ancetre

    def betterCommonTaxon(self,error=1,*taxids):
        # Return the children of the last common taxon of ``taxids``.
        # NOTE(review): the ``error`` parameter is currently unused.
        lca = self.lastCommonTaxon(*taxids)
        idx = self._index[lca]
        sublca = [t[0] for t in self._taxonomy if t[2]==idx]
        return sublca

    def getPreferedName(self,taxid):
        """Return the prefered name of ``taxid``, falling back on its
        scientific name."""
        idx = self.findIndex(taxid)
        return self._preferedidx.get(idx,self._taxonomy[idx][3])

    def getScientificName(self,taxid):
        """Return the scientific name of ``taxid``."""
        return self.findTaxonByTaxid(taxid)[3]

    def getRankId(self,taxid):
        """Return the rank index of ``taxid``."""
        return self.findTaxonByTaxid(taxid)[1]

    def getRank(self,taxid):
        """Return the rank name of ``taxid``."""
        return self._ranks[self.getRankId(taxid)]

    def getTaxonAtRank(self,taxid,rankid):
        """Return the ancestor taxid of ``taxid`` at rank ``rankid``.

        ``rankid`` may be a rank name or a rank index; None when no
        ancestor carries that rank.
        """
        if isinstance(rankid, str):
            rankid=self._ranks.index(rankid)
        try:
            return [x[0] for x in self.parentalTreeIterator(taxid)
                    if x[1]==rankid][0]
        except IndexError:
            return None

    def getSpecies(self,taxid):
        return self.getTaxonAtRank(taxid, self._speciesidx)

    def getGenus(self,taxid):
        return self.getTaxonAtRank(taxid, self._genusidx)

    def getFamily(self,taxid):
        return self.getTaxonAtRank(taxid, self._familyidx)

    def getOrder(self,taxid):
        return self.getTaxonAtRank(taxid, self._orderidx)

    def rankIterator(self):
        """Iterate over ``(rankname, rankindex)`` couples."""
        # Portable replacement for the Python-2-only
        # ``imap(None, self._ranks, xrange(...))`` idiom.
        for couple in zip(self._ranks, range(len(self._ranks))):
            yield couple

    def groupTaxa(self,taxa,groupname):
        """Group sibling taxa under a new artificial 'GROUP' taxon."""
        t=[self.findTaxonByTaxid(x) for x in taxa]
        a=set(x[2] for x in t)
        assert len(a)==1,"All taxa must have the same parent"
        newtaxid=max([2999999]+[x[0] for x in self._taxonomy if x[0]>=3000000 and x[0]<4000000])+1
        newidx=len(self._taxonomy)
        if 'GROUP' not in self._ranks:
            self._ranks.append('GROUP')
        rankid=self._ranks.index('GROUP')
        self._taxonomy.append((newtaxid,rankid,a.pop(),groupname))
        # NOTE(review): reparenting assumes mutable list records (as
        # built by TaxonomyDump); it fails on tuple records.
        for x in t:
            x[2]=newidx

    def addLocalTaxon(self,name,rank,parent,minimaltaxid=10000000):
        """Add a local taxon and return its newly allocated taxid."""
        newtaxid = minimaltaxid if (self._bigestTaxid < minimaltaxid) else self._bigestTaxid+1
        rankid=self.findRankByName(rank)
        parentidx = self.findIndex(int(parent))
        tx = (newtaxid,rankid,parentidx,name,'local')
        self._taxonomy.append(tx)
        newidx=len(self._taxonomy)-1
        self._name.append((name,'scientific name',newidx))
        self._nameidx[name]=newidx
        self._index[newtaxid]=newidx
        self._bigestTaxid=newtaxid
        return newtaxid

    def removeLocalTaxon(self,taxid):
        """Not implemented; the code below is an unreachable draft."""
        # BUG FIX: ``raise NotImplemented`` raised a TypeError because
        # NotImplemented is not an exception class.
        raise NotImplementedError
        txidx = self.findIndex(taxid)
        taxon = self.findTaxonByIdx(txidx)
        assert txidx >= self._localtaxon,"Only local taxon can be deleted"
        for t in self._taxonomy:
            if t[2] == txidx:
                self.removeLocalTaxon(t[0])
        return taxon

    def addPreferedName(self,taxid,name):
        """Register ``name`` as the prefered name of ``taxid``."""
        idx = self.findIndex(taxid)
        # BUG FIX: list.append takes a single argument; the original
        # ``append(name,'obi',idx)`` raised TypeError.
        self._preferedName.append((name,'obi',idx))
        self._preferedidx[idx]=name
        return taxid
class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
    '''
    A taxonomy database class reading the ecoPCR binary taxonomy files
    (<prefix>.tdx/.ldx/.rdx/.ndx/.pdx/.adx).
    '''

    def __init__(self,path):
        '''
        The taxonomy database constructor

        @param path: path to the ecoPCR database including the database prefix name
        @type path: C{str}
        '''
        self._path = path
        self._taxonFile = "%s.tdx" % self._path
        self._localTaxonFile = "%s.ldx" % self._path
        self._ranksFile = "%s.rdx" % self._path
        self._namesFile = "%s.ndx" % self._path
        self._preferedNamesFile = "%s.pdx" % self._path
        self._aliasFile = "%s.adx" % self._path
        print >> sys.stderr,"Reading binary taxonomy database...",
        self.__readNodeTable()
        print >> sys.stderr," ok"
        Taxonomy.__init__(self)

    #####
    #
    # Iterator functions
    #
    #####

    def __ecoNameIterator(self,file):
        # Yield (name, nameclass, taxon_index) triplets from a name file.
        for record in self._ecoRecordIterator(file):
            lrecord = len(record)
            lnames = lrecord - 16
            (isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record)
            name=names[:namelength]
            classname=names[namelength:]
            yield (name,classname,indextaxid)

    def __ecoTaxonomicIterator(self):
        # Yield (taxid, rankid, parentidx, name, origin) for every ncbi
        # taxon, then for the taxa of the optional local taxon file.
        for record in self._ecoRecordIterator(self._taxonFile):
            lrecord = len(record)
            lnames = lrecord - 16
            (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
            yield (taxid,rankid,parentidx,name,'ncbi')
        try :
            lt=0
            for record in self._ecoRecordIterator(self._localTaxonFile):
                lrecord = len(record)
                lnames = lrecord - 16
                (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
                lt+=1
                yield (taxid,rankid,parentidx,name,'local')
            print >> sys.stderr, " [INFO : Local taxon file found] : %d added taxa" % lt
        except IOError:
            # BUG FIX: the original bare ``except:`` also swallowed
            # GeneratorExit and struct errors; only a missing/unreadable
            # local taxon file is expected here.
            print >> sys.stderr, " [INFO : Local taxon file not found] "

    def __ecoRankIterator(self):
        # Rank records are bare strings.
        for record in self._ecoRecordIterator(self._ranksFile):
            yield record

    def __ecoAliasIterator(self):
        # The alias file maps every known taxid (including merged ones)
        # to an index in the taxonomy table (-1 for deleted taxids).
        for record in self._ecoRecordIterator(self._aliasFile):
            (taxid,index) = struct.unpack('> I i',record)
            yield taxid,index

    #####
    #
    # Indexes
    #
    #####

    def __ecoNameIndex(self):
        indexName = [x for x in self.__ecoNameIterator(self._namesFile)]
        return indexName

    def __ecoRankIndex(self):
        rank = [r for r in self.__ecoRankIterator()]
        return rank

    def __ecoTaxonomyIndex(self):
        taxonomy = []
        try :
            index = dict(self.__ecoAliasIterator())
            print >> sys.stderr, " [INFO : Taxon alias file found] "
            buildIndex=False
        except IOError:
            # BUG FIX: narrowed from a bare ``except:``; without an
            # alias file the taxid -> index mapping is rebuilt below.
            print >> sys.stderr, " [INFO : Taxon alias file not found] "
            index={}
            buildIndex=True
        localtaxon=0
        i=0
        for x in self.__ecoTaxonomicIterator():
            taxonomy.append(x)
            if x[4]=='ncbi':
                localtaxon+=1
            if buildIndex or x[4]!='ncbi':
                index[x[0]] = i
            i+=1
        print >> sys.stderr,"Taxonomical tree read",
        return taxonomy, index, localtaxon

    def __readNodeTable(self):
        self._taxonomy, self._index, self._localtaxon = self.__ecoTaxonomyIndex()
        self._ranks = self.__ecoRankIndex()
        self._name = self.__ecoNameIndex()
        # Add local taxon name to the name index
        i=self._localtaxon
        for t in self._taxonomy[self._localtaxon:]:
            self._name.append((t[3],'scientific name',i))
            i+=1
        try :
            self._preferedName = [(x[0],'obi',x[2])
                                  for x in self.__ecoNameIterator(self._preferedNamesFile)]
            print >> sys.stderr, " [INFO : Prefered taxon name file found] : %d added taxa" % len(self._preferedName)
        except IOError:
            # the prefered-name file is optional
            print >> sys.stderr, " [INFO : Prefered taxon name file not found]"
            self._preferedName = []
class TaxonomyDump(Taxonomy):
    # Taxonomy loaded from a raw NCBI taxdump directory
    # (nodes.dmp, names.dmp, merged.dmp, delnodes.dmp).

    def __init__(self,taxdir):
        # @param taxdir: path to the NCBI taxonomy dump directory
        self._path=taxdir
        self._readNodeTable('%s/nodes.dmp' % taxdir)
        print >>sys.stderr,"Adding scientific name..."
        self._name=[]
        for taxid,name,classname in self._nameIterator('%s/names.dmp' % taxdir):
            self._name.append((name,classname,self._index[taxid]))
            if classname == 'scientific name':
                # taxon records are lists [taxid,rankidx,parentidx]; the
                # scientific name and the origin tag are appended here
                self._taxonomy[self._index[taxid]].extend([name,'ncbi'])
        print >>sys.stderr,"Adding taxid alias..."
        for taxid,current in self._mergedNodeIterator('%s/merged.dmp' % taxdir):
            # merged taxids are redirected to their current taxon index
            self._index[taxid]=self._index[current]
        print >>sys.stderr,"Adding deleted taxid..."
        for taxid in self._deletedNodeIterator('%s/delnodes.dmp' % taxdir):
            self._index[taxid]=None
        self._nameidx=dict((x[0],x[2]) for x in self._name)
        # NOTE(review): unlike EcoTaxonomyDB, Taxonomy.__init__ is not
        # called here -- confirm this is intended.

    def _taxonCmp(t1,t2):
        # Python 2 cmp-style comparator ordering taxon records by taxid.
        if t1[0] < t2[0]:
            return -1
        elif t1[0] > t2[0]:
            return +1
        return 0
    _taxonCmp=staticmethod(_taxonCmp)

    def _bsearchTaxon(self,taxid):
        # Binary search of a taxid in the taxid-sorted _taxonomy list;
        # returns its index, or None when absent.
        taxCount = len(self._taxonomy)
        begin = 0
        end = taxCount
        oldcheck=taxCount
        # NOTE(review): written ``begin + end / 2`` without parentheses;
        # harmless only because begin is 0 here -- compare with the
        # parenthesised form inside the loop.
        check = begin + end / 2
        while check != oldcheck and self._taxonomy[check][0]!=taxid :
            if self._taxonomy[check][0] < taxid:
                begin=check
            else:
                end=check
            oldcheck=check
            check = (begin + end) / 2
        if self._taxonomy[check][0]==taxid:
            return check
        else:
            return None

    def _readNodeTable(self,file):
        # Parse nodes.dmp: build _taxonomy (sorted by taxid), _ranks and
        # the taxid -> index mapping.
        file = universalOpen(file)
        nodes = ColumnFile(file,
                           sep='|',
                           types=(int,int,str,
                                  str,str,bool,
                                  int,bool,int,
                                  bool,bool,bool,str))
        print >>sys.stderr,"Reading taxonomy dump file..."
        # (taxid,rank,parent)
        taxonomy=[[n[0],n[2],n[1]] for n in nodes]
        print >>sys.stderr,"List all taxonomy rank..."
        ranks =list(set(x[1] for x in taxonomy))
        ranks.sort()
        # map(None, a, b) is the Python-2-only zip idiom:
        # builds {rankname: rankindex}
        rankidx = dict(map(None,ranks,xrange(len(ranks))))
        print >>sys.stderr,"Sorting taxons..."
        taxonomy.sort(TaxonomyDump._taxonCmp)   # Python 2 cmp-based sort
        self._taxonomy=taxonomy
        self._localtaxon=len(taxonomy)
        print >>sys.stderr,"Indexing taxonomy..."
        index = {}
        for t in self._taxonomy:
            index[t[0]]=self._bsearchTaxon(t[0])
        print >>sys.stderr,"Indexing parent and rank..."
        for t in self._taxonomy:
            # replace the rank name by its index and the parent taxid by
            # the parent's position in _taxonomy
            t[1]=rankidx[t[1]]
            t[2]=index[t[2]]
        self._ranks=ranks
        self._index=index
        self._preferedName = []

    def _nameIterator(self,file):
        # Yield (taxid, name, nameclass) from names.dmp.
        file = universalOpen(file)
        names = ColumnFile(file,
                           sep='|',
                           types=(int,str,
                                  str,str))
        for taxid,name,unique,classname,white in names:
            yield taxid,name,classname

    def _mergedNodeIterator(self,file):
        # Yield (old_taxid, current_taxid) from merged.dmp.
        file = universalOpen(file)
        merged = ColumnFile(file,
                            sep='|',
                            types=(int,int,str))
        for taxid,current,white in merged:
            yield taxid,current

    def _deletedNodeIterator(self,file):
        # Yield deleted taxids from delnodes.dmp.
        file = universalOpen(file)
        deleted = ColumnFile(file,
                             sep='|',
                             types=(int,str))
        for taxid,white in deleted:
            yield taxid
#####
#
#
# Binary writer
#
#
#####
def ecoTaxonomyWriter(prefix, taxonomy,onlyLocal=False):
    # Serialize `taxonomy` into the ecoPCR binary files sharing `prefix`:
    # .rdx (ranks), .tdx (ncbi taxa), .ndx (names), .adx (taxid aliases),
    # .ldx (local taxa) and .pdx (prefered names).  When `onlyLocal` is
    # True only the local taxon file is (re)written.

    def ecoTaxPacker(tx):
        # One taxon record: size, taxid, rank id, parent index, name.
        namelength = len(tx[3])
        totalSize = 4 + 4 + 4 + 4 + namelength
        packed = struct.pack('> I I I I I %ds' % namelength,
                             totalSize,
                             tx[0],
                             tx[1],
                             tx[2],
                             namelength,
                             tx[3])
        return packed

    def ecoRankPacker(rank):
        # One rank record: name length then name.
        namelength = len(rank)
        packed = struct.pack('> I %ds' % namelength,
                             namelength,
                             rank)
        return packed

    def ecoAliasPacker(taxid,index):
        # One alias record: size, taxid, signed taxonomy index
        # (-1 encodes a deleted taxid).
        totalSize = 4 + 4
        try:
            packed = struct.pack('> I I i',
                                 totalSize,
                                 taxid,
                                 index)
        except struct.error,e:
            # report the offending values before propagating
            print >>sys.stderr,(totalSize,taxid,index)
            print >>sys.stderr,"Total size : %d taxid : %d index : %d" %(totalSize,taxid,index)
            raise e
        return packed

    def ecoNamePacker(name):
        # One name record: size, scientific-name flag, name length,
        # class length, taxonomy index, then the name and class strings.
        namelength = len(name[0])
        classlength= len(name[1])
        totalSize = namelength + classlength + 4 + 4 + 4 + 4
        packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength),
                             totalSize,
                             int(name[1]=='scientific name'),
                             namelength,
                             classlength,
                             name[2],
                             name[0],
                             name[1])
        return packed

    def ecoTaxWriter(file,taxonomy):
        # Write all 'ncbi' taxa; return True when local taxa remain.
        output = open(file,'wb')
        nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]=='ncbi'),0)
        output.write(struct.pack('> I',nbtaxon))
        for tx in taxonomy:
            if tx[4]=='ncbi':
                output.write(ecoTaxPacker(tx))
        output.close()
        return nbtaxon < len(taxonomy)

    def ecoLocalTaxWriter(file,taxonomy):
        # Write the non-ncbi (local) taxa, if any.
        nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]!='ncbi'),0)
        if nbtaxon:
            output = open(file,'wb')
            output.write(struct.pack('> I',nbtaxon))
            for tx in taxonomy:
                if tx[4]!='ncbi':
                    output.write(ecoTaxPacker(tx))
            output.close()

    def ecoRankWriter(file,ranks):
        output = open(file,'wb')
        output.write(struct.pack('> I',len(ranks)))
        for rank in ranks:
            output.write(ecoRankPacker(rank))
        output.close()

    def ecoAliasWriter(file,index):
        output = open(file,'wb')
        output.write(struct.pack('> I',len(index)))
        for taxid in index:
            i=index[taxid]
            if i is None:
                i=-1          # deleted taxid
            output.write(ecoAliasPacker(taxid, i))
        output.close()

    def nameCmp(n1,n2):
        # Case-insensitive Python 2 cmp-style comparator for name sort.
        name1=n1[0].upper()
        name2=n2[0].upper()
        if name1 < name2:
            return -1
        elif name1 > name2:
            return 1
        return 0

    def ecoNameWriter(file,names):
        # Names are sorted (case-insensitively) before writing.
        output = open(file,'wb')
        output.write(struct.pack('> I',len(names)))
        names.sort(nameCmp)
        for name in names:
            output.write(ecoNamePacker(name))
        output.close()

    def ecoPreferedNameWriter(file,names):
        # Prefered names keep their original order (no sorting).
        # NOTE(review): this writer is defined but the .pdx file below
        # is written with ecoNameWriter instead -- confirm intended.
        output = open(file,'wb')
        output.write(struct.pack('> I',len(names)))
        for name in names:
            output.write(ecoNamePacker(name))
        output.close()

    localtaxon=True
    if not onlyLocal:
        ecoRankWriter('%s.rdx' % prefix, taxonomy._ranks)
        localtaxon = ecoTaxWriter('%s.tdx' % prefix, taxonomy._taxonomy)
        ecoNameWriter('%s.ndx' % prefix, [x for x in taxonomy._name if x[2] < taxonomy._localtaxon])
        ecoAliasWriter('%s.adx' % prefix, taxonomy._index)
    if localtaxon:
        ecoLocalTaxWriter('%s.ldx' % prefix, taxonomy._taxonomy)
    if taxonomy._preferedName:
        ecoNameWriter('%s.pdx' % prefix, taxonomy._preferedName)

View File

@ -0,0 +1,2 @@
class EcoTagResult(dict):
    """One ecotag result line, behaving as a plain mapping.

    Identification ('ID') results additionally carry a ``cd`` attribute
    listing their candidate ('CD') lines (set by the parser).
    """
    pass

150
obitools/ecotag/parser.py Normal file
View File

@ -0,0 +1,150 @@
from itertools import imap
from obitools import utils
from obitools.ecotag import EcoTagResult
class EcoTagFileIterator(utils.ColumnFile):
    # Iterator over the tab separated output of the ecotag program.
    # Each 'ID' line is returned as an EcoTagResult carrying its 'CD'
    # (candidate) lines in a ``cd`` attribute.

    @staticmethod
    def taxid(x):
        # negative taxids encode "no taxon"
        x = int(x)
        if x < 0:
            return None
        else:
            return x

    @staticmethod
    def scientificName(x):
        # '--' encodes a missing name
        if x=='--':
            return None
        else:
            return x

    @staticmethod
    def value(x):
        # '--' encodes a missing float value
        if x=='--':
            return None
        else:
            return float(x)

    @staticmethod
    def count(x):
        # '--' encodes a missing integer value
        if x=='--':
            return None
        else:
            return int(x)

    def __init__(self,stream):
        utils.ColumnFile.__init__(self,
                                  stream, '\t', True,
                                  (str,str,str,
                                   EcoTagFileIterator.value,
                                   EcoTagFileIterator.value,
                                   EcoTagFileIterator.value,
                                   EcoTagFileIterator.count,
                                   EcoTagFileIterator.count,
                                   EcoTagFileIterator.taxid,
                                   EcoTagFileIterator.scientificName,
                                   str,
                                   EcoTagFileIterator.taxid,
                                   EcoTagFileIterator.scientificName,
                                   EcoTagFileIterator.taxid,
                                   EcoTagFileIterator.scientificName,
                                   EcoTagFileIterator.taxid,
                                   EcoTagFileIterator.scientificName,
                                   str
                                   ))
        # one-record lookahead used to regroup CD lines under their ID
        self._memory=None

    _colname = ['identification',
                'seqid',
                'best_match_ac',
                'max_identity',
                'min_identity',
                'theorical_min_identity',
                'count',
                'match_count',
                'taxid',
                'scientific_name',
                'rank',
                'order_taxid',
                'order_sn',
                'family_taxid',
                'family_sn',
                'genus_taxid',
                'genus_sn',
                'species_taxid',
                'species_sn',
                'sequence']

    def next(self):
        # Return the next result.  For an 'ID' line, the following 'CD'
        # lines are consumed into its ``cd`` list and the first non-CD
        # record is kept aside in self._memory for the next call.
        # imap(None, names, values) pairs column names with values
        # (Python 2 zip idiom).
        if self._memory is not None:
            data=self._memory
            self._memory=None
        else:
            data = utils.ColumnFile.next(self)
            data = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(data)],data))
        if data['identification']=='ID':
            data.cd=[]
            try:
                nextone = utils.ColumnFile.next(self)
                nextone = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(nextone)],nextone))
            except StopIteration:
                nextone = None
            while nextone is not None and nextone['identification']=='CD':
                data.cd.append(nextone)
                try:
                    nextone = utils.ColumnFile.next(self)
                    nextone = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(nextone)],nextone))
                except StopIteration:
                    nextone = None
            self._memory=nextone
        return data
def ecoTagIdentifiedFilter(ecoTagIterator):
    """Keep only the identification ('ID') records of an ecotag stream."""
    return (record for record in ecoTagIterator
            if record['identification'] == 'ID')
class EcoTagAbstractIterator(utils.ColumnFile):
    # Iterator over ecotag "abstract" (per-taxon summary) files.

    _colname = ['scientific_name',
                'taxid',
                'rank',
                'count',
                'max_identity',
                'min_identity']

    @staticmethod
    def taxid(x):
        # negative taxids encode "no taxon"
        x = int(x)
        if x < 0:
            return None
        else:
            return x

    def __init__(self,stream):
        # NOTE(review): the column types reference
        # EcoTagFileIterator.taxid rather than the taxid staticmethod
        # defined above (both behave identically) -- confirm intended.
        utils.ColumnFile.__init__(self,
                                  stream, '\t', True,
                                  (str,
                                   EcoTagFileIterator.taxid,
                                   str,
                                   int,
                                   float,float,float))

    def next(self):
        # Pair column names with converted values (imap(None,...) is
        # the Python 2 zip-to-tuples idiom).
        data = utils.ColumnFile.next(self)
        data = dict(imap(None,EcoTagAbstractIterator._colname,data))
        return data
def ecoTagAbstractFilter(ecoTagAbsIterator):
    """Drop abstract records lacking a valid taxid."""
    return (row for row in ecoTagAbsIterator if row['taxid'] is not None)

View File

@ -0,0 +1,54 @@
import time
from urllib2 import urlopen
import shelve
from threading import Lock
import sys
class EUtils(object):
    '''
    Base class for NCBI Entrez E-utilities clients.

    Enforces the NCBI courtesy rule of at most one request every
    ``_interval`` seconds, shared across all EUtils instances through
    class attributes.
    '''
    _last_request=0
    _interval=3

    def __init__(self):
        # NOTE(review): the lock is created but never acquired, so the
        # throttling below is not thread safe.
        self._lock = Lock()

    def wait(self):
        """Block until at least ``_interval`` seconds have elapsed
        since the previous request."""
        now=time.time()
        delta = now - EUtils._last_request
        while delta < EUtils._interval:
            # BUG FIX: the original slept for the *elapsed* time
            # (``delta``) instead of the remaining part of the
            # interval, throttling requests incorrectly.
            time.sleep(EUtils._interval - delta)
            now=time.time()
            delta = now - EUtils._last_request

    def _sendRequest(self,url):
        # Throttle, record the request time, then return the raw body.
        self.wait()
        EUtils._last_request=time.time()
        t = EUtils._last_request
        print >>sys.stderr,"Sending request to NCBI @ %f" % t
        data = urlopen(url).read()
        print >>sys.stderr,"Data red from NCBI @ %f (%f)" % (t,time.time()-t)
        return data

    def setInterval(self,seconde):
        """Change the minimal delay (seconds) between two requests."""
        EUtils._interval=seconde
class EFetch(EUtils):
    '''
    Throttled client for the NCBI ``efetch`` E-utility.
    '''

    def __init__(self,db,tool='OBITools',
                 retmode='text',rettype="native",
                 server='eutils.ncbi.nlm.nih.gov'):
        EUtils.__init__(self)
        # Pre-build the base query URL; per-request parameters are
        # appended by get().
        template = "http://%s/entrez/eutils/efetch.fcgi?db=%s&tool=%s&retmode=%s&rettype=%s"
        self._url = template % (server,db,tool,retmode,rettype)

    def get(self,**args):
        """Issue an efetch request with the given CGI parameters."""
        extra = "&".join(['%s=%s' % item for item in args.items()])
        return self._sendRequest(self._url + "&" + extra)

56
obitools/fast.py Normal file
View File

@ -0,0 +1,56 @@
"""
implement fastn/fastp sililarity search algorithm for BioSequence.
"""
class Fast(object):
    """Fastn/fastp-style word-hash similarity search for BioSequence."""

    def __init__(self,seq,kup=2):
        '''
        @param seq: sequence to hash
        @type seq: BioSequence
        @param kup: word size used for hashing process
        @type kup: int
        '''
        # `words` maps each upper-cased kup-word to its positions
        # (renamed from `hash`, which shadowed the builtin).
        words={}
        seq = str(seq)
        # BUG FIX: the original range stopped at len(seq)-kup and thus
        # silently dropped the last word of the sequence; +1 includes it.
        for word,pos in ((seq[i:i+kup].upper(),i) for i in range(len(seq)-kup+1)):
            if word in words:
                words[word].append(pos)
            else:
                words[word]=[pos]
        self._kup = kup
        self._hash= words
        self._seq = seq

    def __call__(self,seq):
        '''
        Align one sequence with the fast hash table.

        @param seq: the sequence to align
        @type seq: BioSequence

        @return: where smax is the score of the largest diagonal and
                 pmax the associated shifts
        @rtype: a tuple (smax,pmax), smax an int and pmax a list
        '''
        histo={}
        seq = str(seq).upper()
        words= self._hash
        kup = self._kup
        # same off-by-one fix as in __init__: include the final word
        for word,pos in ((seq[i:i+kup],i) for i in range(len(seq)-kup+1)):
            for p in words.get(word,[]):
                delta = pos - p
                histo[delta]=histo.get(delta,0) + 1
        # BUG FIX: max() raised ValueError when the two sequences shared
        # no word; report a null score instead.
        if not histo:
            return 0,[]
        smax = max(histo.values())
        pmax = [x for x in histo if histo[x]==smax]
        return smax,pmax

    def __len__(self):
        return len(self._seq)

384
obitools/fasta/__init__.py Normal file
View File

@ -0,0 +1,384 @@
"""
fasta module provides functions to read and write sequences in fasta format.
"""
#from obitools.format.genericparser import fastGenericEntryIteratorGenerator
from obitools.format.genericparser import genericEntryIteratorGenerator
from obitools import bioSeqGenerator,BioSequence,AASequence,NucSequence
from obitools import _default_raw_parser
#from obitools.alignment import alignmentReader
#from obitools.utils import universalOpen
import re
from obitools.ecopcr.options import loadTaxonomyDatabase
from obitools.format import SequenceFileIterator
#from _fasta import parseFastaDescription,fastaParser
#from _fasta import _fastaJoinSeq
#from _fasta import _parseFastaTag
#fastaEntryIterator=fastGenericEntryIteratorGenerator(startEntry='>')
# Entry iterators splitting a fasta stream into one string per record;
# records start at the '>' title line (the "raw" variant tolerates
# leading whitespace before '>').
fastaEntryIterator=genericEntryIteratorGenerator(startEntry='>')
rawFastaEntryIterator=genericEntryIteratorGenerator(startEntry='\s*>')
def _fastaJoinSeq(seqarray):
return ''.join([x.strip() for x in seqarray])
def parseFastaDescription(ds,tagparser):
    """Split a fasta title remainder into (definition, key=value info).

    @param ds: the part of the title line following the identifier
    @param tagparser: compiled regex matching the leading run of
                      ``key=value;`` couples (applied to ' '+ds)
    @return: a (definition, info) couple; info is None when no couple
             was found
    """
    hit = tagparser.search(' ' + ds)
    if hit is None:
        return ds, None
    # keep the original slicing convention (offset into ds, not ' '+ds)
    return ds[hit.end(0):].strip(), hit.group(0)
def fastaParser(seq,bioseqfactory,tagparser,rawparser,joinseq=_fastaJoinSeq):
    '''
    Parse a fasta record.

    @attention: internal purpose function

    @param seq: the full text of one fasta entry
    @type seq: C{str}
    @param bioseqfactory: a callable object returning a BioSequence
                          instance
    @param tagparser: a compiled regular expression extracting
                      key=value couples from the title line
    @type tagparser: regex instance
    @param rawparser: raw parser template forwarded to the factory
    @param joinseq: callable merging the sequence lines into one string

    @return: a C{BioSequence} instance
    '''
    lines = seq.split('\n')
    title = lines[0].strip()[1:].split(None,1)
    id = title[0]
    if len(title) == 2:
        definition,info = parseFastaDescription(title[1], tagparser)
    else:
        definition = None
        info = None
    sequence = joinseq(lines[1:])
    return bioseqfactory(id, sequence, definition, info, rawparser)
def fastaNucParser(seq,tagparser=_default_raw_parser,joinseq=_fastaJoinSeq):
    """Parse one fasta entry as a NucSequence.

    @param seq: the full text of one fasta entry
    @param tagparser: raw tag parser template (as in fastaIterator)
    @type tagparser: str
    @param joinseq: callable merging the sequence lines
    """
    # BUG FIX: fastaParser requires both the compiled tag parser and
    # the raw template; the original call omitted the `rawparser`
    # argument (TypeError) and ignored the `joinseq` parameter.
    allparser = tagparser % '[a-zA-Z][a-zA-Z0-9_]*'
    compiledparser = re.compile('( *%s)+' % allparser)
    return fastaParser(seq,NucSequence,compiledparser,tagparser,joinseq=joinseq)
def fastaAAParser(seq,tagparser=_default_raw_parser,joinseq=_fastaJoinSeq):
    """Parse one fasta entry as an AASequence.

    @param seq: the full text of one fasta entry
    @param tagparser: raw tag parser template (as in fastaIterator)
    @type tagparser: str
    @param joinseq: callable merging the sequence lines
    """
    # BUG FIX: fastaParser requires both the compiled tag parser and
    # the raw template; the original call omitted the `rawparser`
    # argument (TypeError) and ignored the `joinseq` parameter.
    allparser = tagparser % '[a-zA-Z][a-zA-Z0-9_]*'
    compiledparser = re.compile('( *%s)+' % allparser)
    return fastaParser(seq,AASequence,compiledparser,tagparser,joinseq=joinseq)
def fastaIterator(file,bioseqfactory=bioSeqGenerator,
                  tagparser=_default_raw_parser,
                  joinseq=_fastaJoinSeq):
    '''
    Iterate through a fasta file sequence by sequence.

    Returned sequences are BioSequence instances built by
    ``bioseqfactory``.

    @param file: a line iterator containing fasta data or a filename
    @type file: an iterable object or str
    @param bioseqfactory: a callable object returning a BioSequence
                          instance
    @param tagparser: raw parser template used to extract ``key=value;``
                      couples from the title line
    @type tagparser: str

    @return: an iterator on C{BioSequence} instances

    @see: L{fastaNucIterator}
    @see: L{fastaAAIterator}
    '''
    rawparser = tagparser
    keypattern = tagparser % '[a-zA-Z][a-zA-Z0-9_]*'
    compiledparser = re.compile('( *%s)+' % keypattern)
    for entry in fastaEntryIterator(file):
        yield fastaParser(entry,bioseqfactory,compiledparser,rawparser,joinseq)
def rawFastaIterator(file,bioseqfactory=bioSeqGenerator,
                     tagparser=_default_raw_parser,
                     joinseq=_fastaJoinSeq):
    """Same as L{fastaIterator} but tolerates surrounding whitespace
    around each entry (records may start with ' >')."""
    rawparser = tagparser
    keypattern = tagparser % '[a-zA-Z][a-zA-Z0-9_]*'
    compiledparser = re.compile('( *%s)+' % keypattern)
    for entry in rawFastaEntryIterator(file):
        yield fastaParser(entry.strip(),bioseqfactory,compiledparser,rawparser,joinseq)
def fastaNucIterator(file,tagparser=_default_raw_parser):
    '''
    Iterate through a fasta file, returning every record as a
    NucSequence instance.

    @param file: a line iterator containing fasta data
    @type file: an iterable object
    @param tagparser: raw parser template used to extract ``key=value;``
                      couples from the title line
    @type tagparser: str

    @return: an iterator on C{NucBioSequence} instances
    @rtype: a generator object

    @see: L{fastaIterator}
    @see: L{fastaAAIterator}
    '''
    return fastaIterator(file, NucSequence, tagparser)
def fastaAAIterator(file,tagparser=_default_raw_parser):
    '''
    Iterate through a fasta file, returning every record as an
    AASequence instance.

    @param file: a line iterator containing fasta data
    @type file: an iterable object
    @param tagparser: raw parser template used to extract ``key=value;``
                      couples from the title line
    @type tagparser: str

    @return: an iterator on C{AABioSequence} instances

    @see: L{fastaIterator}
    @see: L{fastaNucIterator}
    '''
    return fastaIterator(file, AASequence, tagparser)
def formatFasta(data,gbmode=False,upper=False,restrict=None):
    '''
    Convert a sequence or a set of sequences into a fasta formated
    string.

    @param data: a BioSequence instance or an iterable object over
                 BioSequence instances
    @param gbmode: when C{True} the identifier part of the title line
                   follows the ncbi recommendation so the output can be
                   indexed with the blast formatdb command
    @type gbmode: bool
    @param upper: when C{True} sequences are written upper case
    @type upper: bool
    @param restrict: a set of key names printed in the formated output;
                     C{None} (default) formats all keys
    @type restrict: any iterable value or None

    @return: a fasta formated string
    @rtype: str
    '''
    if isinstance(data, BioSequence):
        data = [data]
    if restrict is not None and not isinstance(restrict, set):
        restrict = set(restrict)

    entries = []
    for sequence in data:
        raw = str(sequence)
        definition = sequence.definition
        if definition is None:
            definition = ''

        # wrap the sequence on 60-character lines
        if upper:
            chunks = [raw[i:i+60].upper() for i in range(0,len(raw),60)]
        else:
            chunks = [raw[i:i+60] for i in range(0,len(raw),60)]
        frgseq = '\n'.join(chunks)

        info = '; '.join(['%s=%s' % item
                          for item in sequence.rawiteritems()
                          if restrict is None or item[0] in restrict])
        if info:
            info = info+';'
        if sequence._rawinfo is not None and sequence._rawinfo:
            info += " " + sequence._rawinfo.strip()

        seqid = sequence.id
        if gbmode:
            if 'gi' in sequence:
                seqid = "gi|%s|%s" % (sequence['gi'],seqid)
            else:
                seqid = "lcl|%s|" % (seqid)
        title = '>%s %s %s' %(seqid,info,definition)
        entries.append("%s\n%s" % (title,frgseq))

    return '\n'.join(entries)
def formatSAPFastaGenerator(options):
    # Build a fasta formatter for the SAP software: every title line is
    # annotated with the taxonomic lineage of the sequence.
    loadTaxonomyDatabase(options)
    taxonomy=None
    if options.taxonomy is not None:
        taxonomy=options.taxonomy
    assert taxonomy is not None,"SAP formating require indication of a taxonomy database"
    # candidate ranks, restricted below to those known by the taxonomy
    ranks = ('superkingdom', 'kingdom', 'subkingdom', 'superphylum',
             'phylum', 'subphylum', 'superclass', 'class', 'subclass',
             'infraclass', 'superorder', 'order', 'suborder', 'infraorder',
             'parvorder', 'superfamily', 'family', 'subfamily', 'supertribe', 'tribe',
             'subtribe', 'supergenus', 'genus', 'subgenus', 'species group',
             'species subgroup', 'species', 'subspecies')
    trank=set(taxonomy._ranks)
    # keep the candidate ranks as rank indices
    ranks = [taxonomy._ranks.index(x) for x in ranks if x in trank]
    strict= options.strictsap

    def formatSAPFasta(data,gbmode=False,upper=False,restrict=None):
        '''
        Convert a seqence or a set of sequences in a
        string following the fasta format as recommended for the SAP
        software
        http://ib.berkeley.edu/labs/slatkin/munch/StatisticalAssignmentPackage.html

        @param data: sequence or a set of sequences
        @type data: BioSequence instance or an iterable object
                    on BioSequence instances

        @param gbmode: if set to C{True} identifier part of the title
                       line follows recommendation from nbci to allow
                       sequence indexing with the blast formatdb command.
        @type gbmode: bool

        @param restrict: a set of key name that will be print in the formated
                         output. If restrict is set to C{None} (default) then
                         all keys are formated.
        @type restrict: any iterable value or None

        @return: a fasta formated string
        @rtype: str
        '''
        if isinstance(data, BioSequence):
            data = [data]
        if restrict is not None and not isinstance(restrict, set):
            restrict = set(restrict)
        rep = []
        for sequence in data:
            seq = str(sequence)
            # wrap the sequence on 60 character lines
            if upper:
                frgseq = '\n'.join([seq[x:x+60].upper() for x in xrange(0,len(seq),60)])
            else:
                frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)])
            try:
                taxid = sequence["taxid"]
            except KeyError:
                # without a taxid the lineage cannot be built:
                # fail in strict mode, skip the sequence otherwise
                if strict:
                    raise AssertionError('All sequence must have a taxid')
                else:
                    continue
            definition=' ;'
            for r in ranks:
                taxon = taxonomy.getTaxonAtRank(taxid,r)
                if taxon is not None:
                    definition+=' %s: %s,' % (taxonomy._ranks[r],taxonomy.getPreferedName(taxon))
            # drop the trailing comma and append the taxon name itself
            definition='%s ; %s' % (definition[0:-1],taxonomy.getPreferedName(taxid))
            id = sequence.id
            if gbmode:
                if 'gi' in sequence:
                    id = "gi|%s|%s" % (sequence['gi'],id)
                else:
                    id = "lcl|%s|" % (id)
            title='>%s%s' %(id,definition)
            rep.append("%s\n%s" % (title,frgseq))
        return '\n'.join(rep)

    return formatSAPFasta
class FastaIterator(SequenceFileIterator):
    # Iterator over the entries of a fasta formatted flat file.

    # One raw text entry per '>' header line.
    entryIterator = genericEntryIteratorGenerator(startEntry='>')
    # NOTE(review): the following statement is a no-op -- its result is
    # discarded.  It was presumably meant to be
    # ``entryIterator = classmethod(entryIterator)``; and given the
    # ``FastaIterator.entryIterator(self._inputfile)`` call below,
    # ``staticmethod`` rather than ``classmethod`` looks required. Confirm.
    classmethod(entryIterator)

    def __init__(self,inputfile,bioseqfactory=bioSeqGenerator,
                 tagparser=_default_raw_parser,
                 joinseq=_fastaJoinSeq):
        SequenceFileIterator.__init__(self, inputfile, bioseqfactory)

        self.__file = FastaIterator.entryIterator(self._inputfile)

        self._tagparser = tagparser
        self._joinseq = joinseq

    def get_tagparser(self):
        # Returns the compiled regular expression built by set_tagparser.
        return self.__tagparser

    def set_tagparser(self, value):
        # *value* is a raw pattern template such as " %s *= *([^;]*);".
        # Keep the raw template and compile a pattern matching a whole run
        # of "key=value" tags.
        self._rawparser = value
        allparser = value % '[a-zA-Z][a-zA-Z0-9_]*'
        self.__tagparser = re.compile('( *%s)+' % allparser)

    def _parseFastaDescription(self,ds):
        # Split the remainder of the title line into (definition, raw
        # "key=value;" info string). The prepended space lets patterns
        # anchored on ' ' match the first tag as well.
        m = self._tagparser.search(' '+ds)
        if m is not None:
            info=m.group(0)
            definition = ds[m.end(0):].strip()
        else:
            info=None
            definition=ds

        return definition,info

    def _parser(self):
        '''
        Parse a fasta record.

        @attention: internal purpose function

        @return: a C{BioSequence} instance
        '''
        # NOTE(review): ``self._seq`` is not assigned anywhere in this
        # class; presumably the base class stores the raw entry text there
        # before calling this method -- confirm.
        seq = self._seq.split('\n')
        title = seq[0].strip()[1:].split(None,1)
        id=title[0]
        if len(title) == 2:
            definition,info=self._parseFastaDescription(title[1])
        else:
            info= None
            definition=None

        seq=self._joinseq(seq[1:])

        return self._bioseqfactory(id, seq, definition,info,self._rawparser)

    _tagparser = property(get_tagparser, set_tagparser, None, "_tagparser's docstring")

BIN
obitools/fasta/_fasta.so Executable file

Binary file not shown.

190
obitools/fastq/__init__.py Normal file
View File

@ -0,0 +1,190 @@
'''
Created on 29 aout 2009
@author: coissac
'''
from obitools import BioSequence
from obitools import _default_raw_parser
from obitools.format.genericparser import genericEntryIteratorGenerator
from obitools import bioSeqGenerator,AASequence,NucSequence
from obitools.fasta import parseFastaDescription
from _fastq import fastqQualitySangerDecoder,fastqQualitySolexaDecoder
from _fastq import qualityToSangerError,qualityToSolexaError
from _fastq import errorToSangerFastQStr
from _fastq import formatFastq
from _fastq import fastqParserGenetator
from obitools.utils import universalOpen
import re
# Entry iterator over fastq records: an entry starts on a '@' line and ends
# on the '+' separator line (included); lines are stripped and returned as a
# list (join=False). Quality lines are read separately by fastqIterator.
fastqEntryIterator=genericEntryIteratorGenerator(startEntry='^@',endEntry="^\+",strip=True,join=False)
#def fastqParserGenetator(fastqvariant='sanger',bioseqfactory=NucSequence,tagparser=_parseFastaTag):
#
# qualityDecoder,errorDecoder = {'sanger' : (fastqQualitySangerDecoder,qualityToSangerError),
# 'solexa' : (fastqQualitySolexaDecoder,qualityToSolexaError),
# 'illumina' : (fastqQualitySolexaDecoder,qualityToSangerError)}[fastqvariant]
#
# def fastqParser(seq):
# '''
# Parse a fasta record.
#
# @attention: internal purpose function
#
# @param seq: a sequence object containing all lines corresponding
# to one fasta sequence
# @type seq: C{list} or C{tuple} of C{str}
#
# @param bioseqfactory: a callable object return a BioSequence
# instance.
# @type bioseqfactory: a callable object
#
# @param tagparser: a compiled regular expression usable
# to identify key, value couples from
# title line.
# @type tagparser: regex instance
#
# @return: a C{BioSequence} instance
# '''
#
# title = seq[0][1:].split(None,1)
# id=title[0]
# if len(title) == 2:
# definition,info=parseFastaDescription(title[1], tagparser)
# else:
# info= {}
# definition=None
#
# quality=errorDecoder(qualityDecoder(seq[3]))
#
# seq=seq[1]
#
# seq = bioseqfactory(id, seq, definition,False,**info)
# seq.quality = quality
#
# return seq
#
# return fastqParser
def fastqIterator(file,fastqvariant='sanger',bioseqfactory=NucSequence,tagparser=_default_raw_parser):
    '''
    Iterate through a fastq file record by record.

    Sequences returned by this iterator are BioSequence instances built
    by *bioseqfactory*.

    @param file: a line iterator containing fastq data or a filename
    @type file: an iterable object or str

    @param fastqvariant: the quality encoding variant: 'sanger', 'solexa'
                         or 'illumina'
    @type fastqvariant: str

    @param bioseqfactory: a callable object returning a BioSequence
                          instance.
    @type bioseqfactory: a callable object

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @return: an iterator on C{BioSequence} instance

    @see: L{fastqSangerIterator}
    @see: L{fastqSolexaIterator}
    @see: L{fastqIlluminaIterator}
    '''
    recordParser = fastqParserGenetator(fastqvariant, bioseqfactory, tagparser)
    stream = universalOpen(file)
    for record in fastqEntryIterator(stream):
        header = record[0]
        sequence = "".join(record[1:-1])
        # The quality string may span several lines: keep reading until it
        # is as long as the sequence itself.
        seqlength = len(sequence)
        quality = ''
        while len(quality) < seqlength:
            quality += stream.next().strip()
        yield recordParser([header, sequence, '+', quality])
def fastqSangerIterator(file,tagparser=_default_raw_parser):
    '''
    Iterate through a Sanger encoded fastq file sequence by sequence.

    Sequences returned by this iterator are NucSequence instances.

    @param file: a line iterator containing fastq data or a filename
    @type file: an iterable object or str

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @return: an iterator on C{NucSequence} instance

    @see: L{fastqIterator}
    @see: L{fastqAAIterator}
    '''
    return fastqIterator(file,
                         fastqvariant='sanger',
                         bioseqfactory=NucSequence,
                         tagparser=tagparser)
def fastqSolexaIterator(file,tagparser=_default_raw_parser):
    '''
    Iterate through a Solexa encoded fastq file sequence by sequence.

    Sequences returned by this iterator are NucSequence instances.

    @param file: a line iterator containing fastq data or a filename
    @type file: an iterable object or str

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @return: an iterator on C{NucSequence} instance

    @see: L{fastqIterator}
    @see: L{fastqAAIterator}
    '''
    return fastqIterator(file,
                         fastqvariant='solexa',
                         bioseqfactory=NucSequence,
                         tagparser=tagparser)
def fastqIlluminaIterator(file,tagparser=_default_raw_parser):
    '''
    Iterate through an Illumina encoded fastq file sequence by sequence.

    Sequences returned by this iterator are NucSequence instances.

    @param file: a line iterator containing fastq data or a filename
    @type file: an iterable object or str

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @return: an iterator on C{NucSequence} instance

    @see: L{fastqIterator}
    @see: L{fastqAAIterator}
    '''
    return fastqIterator(file,
                         fastqvariant='illumina',
                         bioseqfactory=NucSequence,
                         tagparser=tagparser)
def fastqAAIterator(file,tagparser=_default_raw_parser):
    '''
    Iterate through a Sanger encoded fastq file of protein sequences.

    Sequences returned by this iterator are AASequence instances.

    @param file: a line iterator containing fastq data or a filename
    @type file: an iterable object or str

    @param tagparser: a compiled regular expression usable
                      to identify key, value couples from
                      title line.
    @type tagparser: regex instance

    @return: an iterator on C{AASequence} instance

    @see: L{fastqIterator}
    @see: L{fastqSangerIterator}
    '''
    return fastqIterator(file,
                         fastqvariant='sanger',
                         bioseqfactory=AASequence,
                         tagparser=tagparser)

BIN
obitools/fastq/_fastq.so Executable file

Binary file not shown.

View File

@ -0,0 +1,2 @@
# Tag parser template for 454 .fna/.qual headers: values are whitespace
# delimited rather than ';' terminated as in the default obitools parser.
fnaTag=' %s *= *([^\s]+)'

View File

@ -0,0 +1,8 @@
from obitools.fasta import fastaNucIterator
from obitools.fnaqual import fnaTag
def fnaFastaIterator(file):
    """Iterate over a 454 '.fna' fasta file using the fna tag parser."""
    return fastaNucIterator(file, fnaTag)

137
obitools/fnaqual/quality.py Normal file
View File

@ -0,0 +1,137 @@
"""
"""
from obitools import _default_raw_parser
from obitools.fasta import fastaIterator
from obitools.fnaqual import fnaTag
from obitools.location import Location
import re
class QualitySequence(list):
    '''
    Sequence of per-position quality values read from a 454 ``.qual`` file.

    Behaves as a C{list} of integer scores while mimicking the annotation
    interface of C{BioSequence}: an id, a definition and key/value tags
    lazily parsed from the raw info string.
    '''

    def __init__(self,id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
        '''
        @param id:         sequence identifier
        @param seq:        iterable of integer quality values
        @param definition: definition line associated to the record
        @param rawinfo:    raw, not yet parsed, "key=value" annotation text
        @param rawparser:  pattern template used to extract one tag from
                           the raw info text
        @param info:       already parsed annotations
        '''
        list.__init__(self,seq)
        self._info = info
        self.definition=definition
        self.id=id
        # BUG FIX: the default rawinfo=None used to crash on ' ' + None.
        if rawinfo is None:
            rawinfo = ''
        # A leading space is kept so that tag patterns, which are anchored
        # on a space, can also match the first tag of the raw info string.
        self._rawinfo=' ' + rawinfo
        self._rawparser=rawparser

    def getDefinition(self):
        '''
        Sequence definition getter.

        @return: the sequence definition
        @rtype: str
        '''
        return self._definition

    def setDefinition(self, value):
        self._definition = value

    def getId(self):
        return self._id

    def setId(self, value):
        self._id = value

    def getKey(self,key):
        # Lazily extract *key* from the raw info string on first access,
        # caching the value in self._info.
        if key not in self._info:
            p = re.compile(self._rawparser % key)
            m = p.search(self._rawinfo)
            if m is not None:
                v=m.group(1)
                # Remove the matched tag so it is never parsed twice.
                self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
                try:
                    v = eval(v)
                except:
                    # Best effort: keep the raw string when the value is
                    # not a python literal.
                    pass
                self._info[key]=v
            else:
                raise KeyError(key)
        else:
            v=self._info[key]
        return v

    def __getitem__(self,key):
        if isinstance(key,Location):
            return key.extractSequence(self)
        elif isinstance(key, str):
            # BUG FIX: used to call the non-existent ``self._getKey``.
            return self.getKey(key)
        elif isinstance(key, int):
            return list.__getitem__(self,key)
        elif isinstance(key, slice):
            subseq=list.__getitem__(self,key)
            info = dict(self._info)
            if key.start is not None:
                start = key.start +1
            else:
                start = 1
            if key.stop is not None:
                stop = key.stop+1
            else:
                stop = len(self)
            if key.step is not None:
                step = key.step
            else:
                step = 1
            # Record the extracted region (1 based) in the annotations.
            info['cut']='[%d,%d,%s]' % (start,stop,step)
            return QualitySequence(self.id, subseq, self.definition,self._rawinfo,self._rawparser,**info)
        raise TypeError('key must be an integer, a str or a slice')

    def __setitem__(self,key,value):
        self._info[key]=value

    def __delitem__(self,key):
        if isinstance(key, str):
            del self._info[key]
        else:
            raise TypeError(key)

    def __iter__(self):
        return list.__iter__(self)

    def __contains__(self,key):
        return key in self._info

    def getTags(self):
        return self._info

    def complement(self):
        '''
        Return a reversed copy of the quality values, flagged as
        complemented in its annotations.
        '''
        cseq = self[::-1]
        rep = QualitySequence(self.id,cseq,self.definition,self._rawinfo,self._rawparser,**self._info)
        rep._info['complemented']=not rep._info.get('complemented',False)
        return rep

    definition = property(getDefinition, setDefinition, None, "Sequence Definition")
    id = property(getId, setId, None, 'Sequence identifier')
def _qualityJoinSeq(seqarray):
text = ' '.join([x.strip() for x in seqarray])
return [int(x) for x in text.split()]
def qualityIterator(file):
    """Iterate over the QualitySequence records of a 454 .qual file."""
    for record in fastaIterator(file, QualitySequence, fnaTag, _qualityJoinSeq):
        yield record

View File

@ -0,0 +1,28 @@
from obitools import bioSeqGenerator
from obitools.utils import universalOpen
class SequenceFileIterator:
    # Base class for sequence flat file iterators.
    #
    # NOTE(review): this is an old-style class (no ``object`` base), so the
    # ``property`` objects declared below are not honoured as descriptors
    # in Python 2; ``self._inputfile = ...`` in __init__ simply creates an
    # instance attribute that shadows the property. Confirm intended.

    def __init__(self,inputfile,bioseqfactory=bioSeqGenerator):
        self._inputfile = universalOpen(inputfile)
        self._bioseqfactory = bioseqfactory

    def get_inputfile(self):
        # NOTE(review): ``self.__file`` (name-mangled to
        # ``_SequenceFileIterator__file``) is never assigned by this class;
        # this getter looks broken as written -- confirm.
        return self.__file

    def get_bioseqfactory(self):
        # NOTE(review): same concern -- ``__bioseqfactory`` is never
        # assigned; __init__ sets ``_bioseqfactory`` instead.
        return self.__bioseqfactory

    def next(self):
        # NOTE(review): neither ``self.inputfile`` (no underscore) nor
        # ``self._parse`` is defined anywhere visible; presumably
        # subclasses are expected to provide them -- confirm.
        entry = self.inputfile.next()
        return self._parse(entry)

    def __iter__(self):
        return self

    _inputfile = property(get_inputfile, None, None, "_file's docstring")
    _bioseqfactory = property(get_bioseqfactory, None, None, "_bioseqfactory's docstring")

BIN
obitools/format/_format.so Executable file

Binary file not shown.

View File

@ -0,0 +1,217 @@
"""
G{packagetree format}
"""
import re
from obitools.utils import universalOpen
def genericEntryIteratorGenerator(startEntry=None,endEntry=None,
                                  head=False,tail=False,
                                  strip=False,join=True):
    '''
    Transfome a text line iterator to an entry oriented iterator.

    This iterator converted is useful to implement first stage
    of flat file parsing.

    @param startEntry: a regular pattern matching the beginning of
                       an entry
    @type startEntry:  C{str} or None
    @param endEntry:   a regular pattern matching the end of
                       an entry
    @type endEntry:    C{str} or None
    @param head:       indicate if an header is present before
                       the first entry (as in many original genbank
                       files)
    @type head:        C{bool}
    @param tail:       indicate if some extra informations are present
                       after the last entry.
    @type tail:        C{bool}
    @param strip:      if C{True}, strip whitespace from the entry text
    @type strip:       C{bool}
    @param join:       if C{True}, yield each entry as one string;
                       otherwise as the list of its lines
    @type join:        C{bool}

    @return: an iterator on entries in text format
    @rtype: an iterator on C{str}
    '''

    def isBeginning(line):
        # True when no start pattern is defined or the line matches it.
        return startEntry is None or startEntry.match(line) is not None

    def isEnding(line):
        # An entry ends on an explicit end pattern, or -- when only a start
        # pattern is defined -- on the beginning of the next entry.
        return ((endEntry is not None and endEntry.match(line) is not None) or
                (endEntry is None and startEntry is not None and startEntry.match(line) is not None))

    def transparentIteratorEntry(file):
        # No delimiters at all: each line of the file is one entry.
        file = universalOpen(file)
        return file

    def genericEntryIterator(file):
        file = universalOpen(file)
        entry = []
        line = file.next()
        # Skip any leading header unless it must be kept (head=True).
        started = head or isBeginning(line)

        try:
            while 1:
                # Advance to the first line of the next entry.
                while not started:
                    line = file.next()
                    started = isBeginning(line)

                if endEntry is None:
                    entry.append(line)
                    line = file.next()

                # Accumulate lines until the end of the current entry.
                while started:
                    end = isEnding(line)
                    if end:
                        if endEntry is not None:
                            entry.append(line)
                        if join:
                            e = ''.join(entry)
                            if strip:
                                e=e.strip()
                        else:
                            e=entry
                            if strip:
                                e=[x.strip() for x in e]
                        entry=[]
                        yield e
                        started=False
                        if endEntry is not None:
                            line = file.next()
                    else:
                        entry.append(line)
                        line = file.next()

                # Without an explicit end pattern, the line that closed the
                # previous entry is the first line of the next one.
                started = isBeginning(line)

        except StopIteration:
            # Flush the last pending entry (or the tail when requested).
            if entry and (endEntry is None or tail):
                if join:
                    e = ''.join(entry)
                    if strip:
                        e=e.strip()
                else:
                    e=entry
                    if strip:
                        e=[x.strip() for x in e]
                yield e

    if startEntry is not None:
        startEntry = re.compile(startEntry)
    if endEntry is not None:
        endEntry = re.compile(endEntry)

    if startEntry is None and endEntry is None:
        return transparentIteratorEntry

    return genericEntryIterator
class GenericParser(object):
    # Configurable flat file parser: entries are produced by a generic
    # entry iterator, then each registered parse action extracts one field
    # from the entry text.

    def __init__(self,
                 startEntry=None,
                 endEntry=None,
                 head=False,
                 tail=False,
                 strip=False,
                 **parseAction):
        """
        @param startEntry: a regular pattern matching the beginning of
                           an entry
        @type startEntry:  C{str} or None
        @param endEntry:   a regular pattern matching the end of
                           an entry
        @type endEntry:    C{str} or None
        @param head:       indicate if an header is present before
                           the first entry (as in many original genbank
                           files)
        @type head:        C{bool}
        @param tail:       indicate if some extra informations are present
                           after the last entry.
        @type tail:        C{bool}
        @param parseAction: named parse actions; each value is the
                            argument tuple forwarded to L{addParseAction}
        """
        self.flatiterator= genericEntryIteratorGenerator(startEntry,
                                                         endEntry,
                                                         head,
                                                         tail,
                                                         strip)

        self.action={}

        for k in parseAction:
            self.addParseAction(k,*parseAction[k])

    def addParseAction(self,name,dataMatcher,dataCleaner=None,cleanSub=''):
        '''
        Add a parse action to the generic parser. A parse action
        allows to extract one information from an entry. A parse
        action is defined by a name and a method to extract this
        information from the full text entry.

        A parse action can be defined following two ways.

            - via regular expression patterns

            - via dedicated function.

        In the first case, you have to indicate at least the
        dataMatcher regular pattern. This pattern should match exactly
        the data part you want to retrieve. If cleaning of extra
        characters is needed, the second pattern dataCleaner can be
        used to specify these characters.

        In the second case you must provide a callable object (function)
        that extracts and cleans data from the text entry. This function
        should return a list containing all data retrieved, even if
        no data or only one data is retrieved.

        @summary: Add a parse action to the generic parser.

        @param name: name of the data extracted
        @type name: C{str}
        @param dataMatcher: a regular pattern matching the data
                            or a callable object parsing the
                            entry and returning a list of matched data
        @type dataMatcher: C{str} or C{SRE_Pattern} instance or a callable
                           object
        @param dataCleaner: a regular pattern matching part of the data
                            to suppress.
        @type dataCleaner: C{str} or C{SRE_Pattern} instance or C{None}
        @param cleanSub: string used to replace dataCleaner matches.
                         Default is an empty string
        @type cleanSub: C{str}
        '''
        if callable(dataMatcher):
            self.action[name]=dataMatcher
        else :
            if isinstance(dataMatcher, str):
                dataMatcher=re.compile(dataMatcher)
            if isinstance(dataCleaner, str):
                dataCleaner=re.compile(dataCleaner)
            self.action[name]=self._buildREParser(dataMatcher,
                                                  dataCleaner,
                                                  cleanSub)

    def _buildREParser(self,dataMatcher,dataCleaner,cleanSub):
        # Build a parser function from the matching / cleaning patterns.
        def parser(data):
            x = dataMatcher.findall(data)
            if dataCleaner is not None:
                x = [dataCleaner.sub(cleanSub,y) for y in x]
            return x
        return parser

    def __call__(self,file):
        # Iterate over entries, yielding a dict holding the raw entry
        # under 'fullentry' plus one key per registered parse action.
        for e in self.flatiterator(file):
            pe = {'fullentry':e}
            for k in self.action:
                pe[k]=self.action[k](e)
            yield pe

View File

View File

@ -0,0 +1,274 @@
__docformat__ = 'restructuredtext'
import re
import string
import textwrap
from obitools.obo.go.parser import GOEntryIterator
from obitools.obo.go.parser import GOTerm
from obitools.obo.go.parser import GOEntry
"""
go_obo.py : gene_ontology_edit.obo file parser:
----------------------------------------------------
- OBOFile class: open a flat file and return an entry.
"""
class OBOFile(object):
    """
    Iterator over all entries of an OBO file.

    Each parsed stanza is wrapped into a Term, Entry or Header object;
    the ``isaterm`` attribute reflects the kind of the last entry read.
    """
    def __init__(self, _path):
        self.file = GOEntryIterator(_path)

    def __iter__(self):
        return self

    def next(self):
        stanza = self.file.next()
        self.isaterm = isinstance(stanza, GOTerm)
        if self.isaterm:
            return Term(stanza)
        if isinstance(stanza, GOEntry):
            return Entry(stanza)
        return Header(stanza)
############# tout le reste doit descendre a l'etage obitools/ogo/go/parser.py ##########
# define an XRef into a go_obo.py script in the microbi pylib
class Xref(object):
    """
    Class Xref
        Xref.db   Xref database
        Xref.id   Xref identifier
    """
    def __init__(self,description):
        # Split on the FIRST colon only, so identifiers that themselves
        # contain ':' (e.g. URLs) are kept intact. Raises IndexError, as
        # before, when no colon is present.
        data = description.split(':', 1)
        self.db = data[0].strip()
        self.id = data[1].strip()
# define a RelatedTerm into a go_obo.py script in the microbi pylib
class RelatedTerm(object):
    """
    Class RelatedTerm
        RelatedTerm.relation        RelatedTerm relation
        RelatedTerm.related_term    RelatedTerm GO identifier
        RelatedTerm.comment         all terms have 0 or 1 comment
    """
    def __init__(self,relation,value,comment):
        self.relation = relation
        # BUG FIX: value.strip('GO:') strips any leading/trailing 'G', 'O'
        # or ':' characters (a character set, not the prefix), which would
        # corrupt identifiers beginning or ending with those characters.
        # Remove the literal 'GO:' prefix instead.
        if value.startswith('GO:'):
            self.related_term = value[3:]
        else:
            self.related_term = value.strip('GO:')
        self.comment = comment
# define into a go_obo.py script in the microbi pylib
#class Term(object):
# """
# class representing an OBO term (entry).
# """
#
# def __init__(self):
# raise RuntimeError('biodb.go_obo is an abstract class')
#
# def __checkEntry__(self):
# minimum=(hasattr(self,'goid') )
# if not minimum:
# raise AssertionError('Misconstructed GO Term instance %s' % [x for x in dir(self) if x[0]!='_'])
class Term(object):
    """
    Class Term
    representing a GO term.
    """
    def __init__(self,data=None):
        """
        @param data: a parsed GOTerm stanza, or None to build an empty term
        """
        self.data=data
        self.isaterm = True
        if data:
            self.__filtreGoid__()
            self.__filtreName__()
            self.__filtreComment__()
            self.__filtreSynonyms__()
            self.__filtreDef__()
            self.__filtreParents__()
            self.__filtreRelationships__()
            self.__filtreRelation__()
            self.__filtreObsolete__()
            self.__filtreAltIds__()
            self.__filtreXRefs__()
            self.__filtreSubsets__()
            # check if all required attributes were valued
            self.__checkEntry__()

    def __checkEntry__(self):
        # A term is considered well formed as soon as it carries a goid.
        minimum=(hasattr(self,'goid') )
        if not minimum:
            raise AssertionError('Misconstructed GO Term instance %s' % [x for x in dir(self) if x[0]!='_'])

    def __filtreGoid__(self):
        """
        Extract GO id.
        """
        # NOTE(review): str.strip('GO:') removes any leading/trailing 'G',
        # 'O' or ':' characters rather than the literal 'GO:' prefix; this
        # is harmless only because GO identifiers are purely numeric after
        # the prefix. The same idiom is used throughout this class.
        self.goid = self.data.id.value.strip('GO:')

    def __filtreName__(self):
        """
        Extract GO name.
        """
        self.name = self.data.name.value

    def __filtreSynonyms__(self):
        """
        Extract GO synonym(s), mapping each synonym text to its scope.
        """
        self.list_synonyms = {}
        if self.data.synonyms:
            for y in self.data.synonyms:
                self.list_synonyms[y.value] = y.scope

    def __filtreComment__(self):
        """
        manage None comments
        """
        if self.data.comment != None:
            self.comment = self.data.comment.value
        else:
            self.comment = ""

    def __filtreDef__(self):
        """
        Extract GO definition.
        """
        if self.data.definition != None:
            self.definition = self.data.definition.value
        else:
            self.definition = ""

    def __filtreParents__(self):
        """
        To make the is_a hierarchy
        """
        if self.data.is_a != None:
            self.is_a = set([isa.value.strip('GO:') for isa in self.data.is_a])
        else:
            self.is_a = set()

    def __filtreRelation__(self):
        """
        To make the part_of hierarchy (and the regulates relations)
        """
        self.part_of = set()
        self.regulates = set()
        self.negatively_regulates = set()
        self.positively_regulates = set()
        if self.data.relationship != None:
            for rel in self.data.relationship:
                if rel.relationship == "part_of":
                    self.part_of.add(rel.value.strip('GO:'))
                elif rel.relationship == "regulates":
                    self.regulates.add(rel.value.strip('GO:'))
                elif rel.relationship == "negatively_regulates":
                    self.negatively_regulates.add(rel.value.strip('GO:'))
                elif rel.relationship == "positively_regulates":
                    self.positively_regulates.add(rel.value.strip('GO:'))

    def __filtreRelationships__(self):
        """
        Relation list with other GO Terms (is_a, part_of or some regulates relation)
        """
        self.related_term =[]
        if self.data.relationship != None:
            for x in self.data.relationship:
                # NOTE(review): x.__doc__ is passed where a comment is
                # expected (see the commented-out alternative) -- confirm.
                self.related_term.append(RelatedTerm(x.relationship,x.value,x.__doc__))
                #self.related_term.append(RelatedTerm(x.relationship,x.value,x.comment))
        if self.data.is_a != None:
            for x in self.data.is_a:
                self.related_term.append(RelatedTerm('is_a',x.value,x.__doc__))
                #self.related_term.append(RelatedTerm('is_a',x.value,x.comment))

    def __filtreObsolete__(self):
        """
        for each obsolete terms corresponds a set of GO Identifiers
        so that this GO term is consider as others GO Terms
        """
        self.considers = set()
        self.replaces = set()
        self.is_obsolete = self.data.is_obsolete
        if self.data.is_obsolete:
            if self.data.consider:
                self.considers = set([considered.value.strip('GO:') for considered in self.data.consider])
            if self.data.replaced_by:
                self.replaces = set([replaced.value.strip('GO:') for replaced in self.data.replaced_by])

    def __filtreAltIds__(self):
        """
        alternate(s) id(s) for this term (= alias in the geneontology schema model!)
        """
        if self.data.alt_ids:
            self.alt_ids = set([x.value.strip('GO:') for x in self.data.alt_ids])
        else:
            self.alt_ids = set()

    def __filtreXRefs__(self):
        """
        cross references to other databases
        """
        self.xrefs = set()
        if self.data.xrefs:
            self.xrefs = set([Xref(x.value.reference) for x in self.data.xrefs])

    def __filtreSubsets__(self):
        """
        subset label to make smaller sets of GO Terms
        """
        self.subsets = set()
        if self.data.subsets:
            self.subsets = set([x.value for x in self.data.subsets])
class Entry(object):
    """
    A non-[Term] OBO stanza (a [Typedef] stanza, for example).
    """
    def __init__(self, data=None):
        self.data = data
        self.isaterm = False
        self.isanentry = True
class Header(object):
    """
    class representing a GO header.
    """
    def __init__(self, data=None):
        self.data = data
        self.isaterm = False

284
obitools/format/options.py Normal file
View File

@ -0,0 +1,284 @@
'''
Created on 13 oct. 2009
@author: coissac
'''
from obitools.format.sequence.embl import emblIterator
from obitools.format.sequence.genbank import genbankIterator
from obitools.format.sequence.fnaqual import fnaFastaIterator
from obitools.format.sequence.fasta import fastaAAIterator,fastaNucIterator,fastaIterator
from obitools.format.sequence.fastq import fastqIlluminaIterator,fastqSolexaIterator
from obitools.fastq import fastqSangerIterator
from obitools.fnaqual.quality import qualityIterator
from obitools.fasta import formatFasta, rawFastaIterator,\
formatSAPFastaGenerator
from obitools.fastq import formatFastq
from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
from obitools.ecopcr.options import loadTaxonomyDatabase
#from obitools.format._format import printOutput
from array import array
from itertools import chain
import sys
import re
from obitools.ecopcr import EcoPCRFile
def addInputFormatOption(optionManager):
    """
    Register on *optionManager* the command line options describing the
    format of the input sequence file.

    @param optionManager: an optparse-compatible option container
    """
#    optionManager.add_option('--rank',
#                             action="store_true", dest='addrank',
#                             default=False,
#                             help="add a rank attribute to the sequence "
#                                  "indicating the sequence position in the input data")
    optionManager.add_option('--genbank',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='genbank',
                             help="input file is in genbank format")
    optionManager.add_option('--embl',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='embl',
                             help="input file is in embl format")
    optionManager.add_option('--fasta',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='fasta',
                             help="input file is in fasta nucleic format (including obitools fasta extentions)")
    # BUG FIX: the help text was a copy-paste of the --fasta one.
    optionManager.add_option('--ecopcr',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='ecopcr',
                             help="input file is in ecoPCR format")
    optionManager.add_option('--raw-fasta',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='rawfasta',
                             help="input file is in fasta format (but more tolerant to format variant)")
    optionManager.add_option('--fna',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='fna',
                             help="input file is in fasta nucleic format produced by 454 sequencer pipeline")
    optionManager.add_option('--qual',
                             action="store", dest="withqualfile",
                             type='str',
                             default=None,
                             help="Specify the name of a quality file produced by 454 sequencer pipeline")
    optionManager.add_option('--sanger',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='sanger',
                             help="input file is in sanger fastq nucleic format (standard fastq)")
    optionManager.add_option('--solexa',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='solexa',
                             help="input file is in fastq nucleic format produced by solexa sequencer")
    optionManager.add_option('--illumina',
                             action="store_const", dest="seqinformat",
                             default=None,
                             const='illumina',
                             help="input file is in fastq nucleic format produced by old solexa sequencer")
    optionManager.add_option('--nuc',
                             action="store_const", dest="moltype",
                             default=None,
                             const='nuc',
                             help="input file is nucleic sequences")
    optionManager.add_option('--prot',
                             action="store_const", dest="moltype",
                             default=None,
                             const='pep',
                             help="input file is protein sequences")
def addOutputFormatOption(optionManager):
    """
    Register on *optionManager* the command line options describing the
    format of the output sequences.

    @param optionManager: an optparse-compatible option container
    """
    optionManager.add_option('--fastq-output',
                             action="store_const", dest="output",
                             default=None,
                             const=formatFastq,
                             help="output sequences in sanger fastq format")
    optionManager.add_option('--fasta-output',
                             action="store_const", dest="output",
                             default=None,
                             const=formatFasta,
                             help="output sequences in obitools fasta format")
    optionManager.add_option('--sap-output',
                             action="store_const", dest="output",
                             default=None,
                             const=formatSAPFastaGenerator,
                             help="output sequences in sap fasta format")
    # BUG FIX: the help text was a copy-paste of the --uppercase one; the
    # option actually makes SAP formatting fail on sequences lacking a
    # taxid instead of silently skipping them.
    optionManager.add_option('--strict-sap',
                             action='store_true',dest='strictsap',
                             default=False,
                             help="with --sap-output, stop with an error on "
                                  "sequences without a taxid instead of skipping them")
    optionManager.add_option('--ecopcr-output',
                             action="store", dest="ecopcroutput",
                             default=None,
                             help="output sequences in obitools ecopcr format")
    optionManager.add_option('--uppercase',
                             action='store_true',dest='uppercase',
                             default=False,
                             help="Print sequences in upper case (default is lower case)")
def addInOutputOption(optionManager):
    # Register both the input and the output format options.
    addInputFormatOption(optionManager)
    addOutputFormatOption(optionManager)
def autoEntriesIterator(options):
    # Select a sequence reader according to the input format options; as a
    # side effect, options.outputFormater / options.outputFormat are set
    # to the matching default output format.
    options.outputFormater=formatFasta
    options.outputFormat="fasta"

    # Crude pattern used to recognize an ecoPCR result line.
    ecopcr_pattern = re.compile('^[^ ]+ +| +[0-9]+ +| + [0-9]+ + | +')

    def annotatedIterator(formatIterator):
        # Wrap a reader so that the taxon information embedded in each
        # entry is extracted into the sequence annotations.
        options.outputFormater=formatFasta
        options.outputFormat="fasta"
        def iterator(lineiterator):
            for s in formatIterator(lineiterator):
                s.extractTaxon()
                yield s

        return iterator

    def withQualIterator(qualityfile):
        # Wrap the .fna reader so each sequence receives error
        # probabilities derived from its matching .qual record.
        options.outputFormater=formatFastq
        options.outputFormat="fastq"
        def iterator(lineiterator):
            for s in fnaFastaIterator(lineiterator):
                q = qualityfile.next()
                # Convert phred scores to error probabilities.
                quality = array('d',(10.**(-x/10.) for x in q))
                s.quality=quality
                yield s

        return iterator

    def autoSequenceIterator(lineiterator):
        # Guess the format from the first line of the input.
        options.outputFormater=formatFasta
        options.outputFormat="fasta"
        first = lineiterator.next()
        if first[0]==">":
            if options.withqualfile is not None:
                qualfile=qualityIterator(options.withqualfile)
                reader=withQualIterator(qualfile)
                options.outputFormater=formatFastq
                options.outputFormat="fastq"
            elif options.moltype=='nuc':
                reader=fastaNucIterator
            elif options.moltype=='pep':
                reader=fastaAAIterator
            else:
                reader=fastaIterator
        elif first[0]=='@':
            reader=fastqSangerIterator
            options.outputFormater=formatFastq
            options.outputFormat="fastq"
        elif first[0:3]=='ID ':
            reader=emblIterator
        elif first[0:6]=='LOCUS ':
            reader=genbankIterator
        elif first[0]=="#" or ecopcr_pattern.search(first):
            reader=EcoPCRFile
        else:
            # NOTE(review): 'fasta, fasta' is probably meant to read
            # 'fasta, fastq' ('@' entries are fastq records).
            raise AssertionError,'file is not in fasta, fasta, embl, genbank or ecoPCR format'

        # Re-inject the consumed first line into the reader's input.
        input = reader(chain([first],lineiterator))
        return input

    if options.seqinformat is None:
        reader = autoSequenceIterator
    else:
        if options.seqinformat=='fasta':
            if options.moltype=='nuc':
                reader=fastaNucIterator
            elif options.moltype=='pep':
                reader=fastaAAIterator
            else:
                reader=fastaIterator
        elif options.seqinformat=='rawfasta':
            reader=annotatedIterator(rawFastaIterator)
        elif options.seqinformat=='genbank':
            reader=annotatedIterator(genbankIterator)
        elif options.seqinformat=='embl':
            reader=annotatedIterator(emblIterator)
        elif options.seqinformat=='fna':
            reader=fnaFastaIterator
        elif options.seqinformat=='sanger':
            options.outputFormater=formatFastq
            options.outputFormat="fastq"
            reader=fastqSangerIterator
        elif options.seqinformat=='solexa':
            options.outputFormater=formatFastq
            options.outputFormat="fastq"
            reader=fastqSolexaIterator
        elif options.seqinformat=='illumina':
            options.outputFormater=formatFastq
            options.outputFormat="fastq"
            reader=fastqIlluminaIterator
        elif options.seqinformat=='ecopcr':
            reader=EcoPCRFile

        # A .fna input with an associated quality file gets the quality
        # aware wrapper regardless of the explicit --fna option.
        if options.seqinformat=='fna' and options.withqualfile is not None:
            qualfile=qualityIterator(options.withqualfile)
            reader=withQualIterator(qualfile)
            options.outputFormater=formatFastq
            options.outputFormat="fastq"

#    if options.addrank:
#        reader = withRankIterator(reader)
    return reader
def sequenceWriterGenerator(options,output=sys.stdout):
    # Build a one-argument writer function for sequences, targeting either
    # an ecoPCR database or a text stream in the selected output format.
    class SequenceWriter:
        def __init__(self,options,file=sys.stdout):
            self._format=None
            self._file=file
            self._upper=options.uppercase
        def put(self,seq):
            # The output format is resolved lazily on the first sequence.
            if self._format is None:
                self._format=formatFasta
                if options.output is not None:
                    self._format=options.output
                    if self._format is formatSAPFastaGenerator:
                        # SAP output needs a formatter bound to the taxonomy.
                        self._format=formatSAPFastaGenerator(options)
                elif options.outputFormater is not None:
                    self._format=options.outputFormater
            s = self._format(seq,upper=self._upper)
            try:
                self._file.write(s)
                self._file.write("\n")
            except IOError:
                # Downstream pipe closed (e.g. piping into head):
                # terminate quietly.
                sys.exit(0)

    if options.ecopcroutput is not None:
        taxo = loadTaxonomyDatabase(options)
        writer=EcoPCRDBSequenceWriter(options.ecopcroutput,taxonomy=taxo)
    else:
        writer=SequenceWriter(options,output)

    def sequenceWriter(sequence):
        writer.put(sequence)

    return sequenceWriter

View File

@ -0,0 +1,24 @@
from obitools.fasta import fastaIterator
from obitools.fastq import fastqSangerIterator
from obitools.seqdb.embl.parser import emblIterator
from obitools.seqdb.genbank.parser import genbankIterator
from itertools import chain
from obitools.utils import universalOpen
def autoSequenceIterator(file):
    '''
    Iterate over the entries of a sequence file, guessing its format
    (fasta, fastq, embl or genbank) from the first line.

    @param file: a file name or a line iterator
    @return: an iterator on sequence records
    @raise AssertionError: when the format cannot be recognized
    '''
    lineiterator = universalOpen(file)
    first = lineiterator.next()
    if first[0]==">":
        reader=fastaIterator
    elif first[0]=='@':
        reader=fastqSangerIterator
    elif first[0:3]=='ID ':
        reader=emblIterator
    elif first[0:6]=='LOCUS ':
        reader=genbankIterator
    else:
        # BUG FIX: the message used to read 'fasta, fasta'; '@' entries
        # are fastq records.
        raise AssertionError('file is not in fasta, fastq, embl, or genbank format')
    # Re-inject the consumed first line into the reader's input.
    input = reader(chain([first],lineiterator))
    return input

View File

@ -0,0 +1,2 @@
from obitools.seqdb.embl.parser import emblIterator,emblParser

View File

@ -0,0 +1,4 @@
from obitools.fasta import fastaIterator,fastaParser
from obitools.fasta import fastaAAIterator,fastaAAParser
from obitools.fasta import fastaNucIterator,fastaNucParser
from obitools.fasta import formatFasta

View File

@ -0,0 +1,13 @@
'''
Created on 15 janv. 2010
@author: coissac
'''
from obitools.fastq import fastqIterator,fastqParserGenetator
from obitools.fastq import fastqSangerIterator,fastqSolexaIterator, \
fastqIlluminaIterator
from obitools.fastq import fastqAAIterator
from obitools.fastq import formatFastq

View File

@ -0,0 +1,8 @@
'''
Created on 12 oct. 2009
@author: coissac
'''
from obitools.fnaqual.fasta import fnaFastaIterator
from obitools.fnaqual.quality import qualityIterator

View File

@ -0,0 +1,4 @@
from obitools.seqdb.genbank.parser import genpepIterator,genpepParser
from obitools.seqdb.genbank.parser import genbankIterator,genbankParser

View File

@ -0,0 +1,5 @@
from obitools.tagmatcher.parser import tagMatcherParser
from obitools.tagmatcher.parser import TagMatcherIterator
from obitools.tagmatcher.parser import formatTagMatcher
tagMatcherIterator=TagMatcherIterator

0
obitools/goa/__init__.py Normal file
View File

33
obitools/goa/parser.py Normal file
View File

@ -0,0 +1,33 @@
from itertools import imap
from obitools import utils
class GoAFileIterator(utils.ColumnFile):
    """
    Iterator over the records of a GOA (Gene Ontology Annotation)
    tab-separated flat file.

    Each iteration returns one annotation line as a dict keyed by the
    column names listed in ``_colname``.
    """

    def __init__(self,stream):
        # Parse *stream* as tab-separated columns; every field is kept
        # as a plain string (single str converter, recycled).
        utils.ColumnFile.__init__(self,
                                  stream, '\t', True,
                                  (str,))

    # Column names of the GOA file format, in file order.
    # NOTE(review): 'evidnce_origine' is misspelled but is a runtime dict
    # key — renaming it would break downstream consumers.
    _colname = ['database',
                'ac',
                'symbol',
                'qualifier',
                'goid',
                'origin',
                'evidence',
                'evidnce_origine',
                'namespace',
                'db_object_name',
                'gene',
                'object_type',
                'taxid',
                'date',
                'assigned_by']

    def next(self):
        data = utils.ColumnFile.next(self)
        # imap(None, a, b) pairs column names with fields like zip();
        # NOTE(review): unlike py2 map(None, ...) it stops at the shortest
        # sequence, so a short line silently drops the trailing columns —
        # confirm this is the intended behaviour.
        data = dict(imap(None,GoAFileIterator._colname,data))
        return data

962
obitools/graph/__init__.py Normal file
View File

@ -0,0 +1,962 @@
'''
**obitool.graph** for representing graph structure in obitools
--------------------------------------------------------------
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>
This module offers classes to manipulate graphs, mainly through the
:py:class:`obitools.graph.Graph` class.
.. inheritance-diagram:: Graph DiGraph UndirectedGraph
:parts: 2
'''
import sys
from obitools.utils import progressBar
class Indexer(dict):
    '''
    Manage the conversion between arbitrary hashable python
    values and unique integer keys.
    '''

    def __init__(self):
        self.__max=0        # next integer index to allocate
        self.__reverse=[]   # reverse table: index -> label

    def getLabel(self,index):
        '''
        Return the python value associated to an integer index.

        :param index: an index value
        :type index: int

        :raises: IndexError if the index is not used in this
                 Indexer instance
        '''
        return self.__reverse[index]

    def getIndex(self,key,strict=False):
        '''
        Return the index associated to a **key** in the indexer. Two
        modes are available :

            - strict mode :
                if the key is not known by the :py:class:`Indexer` instance
                a :py:exc:`KeyError` exception is raised.

            - non strict mode :
                in this mode if the requested **key** is absent, it is added to
                the :py:class:`Indexer` instance and the new index is returned

        :param key: the requested key
        :type key: a hashable python value
        :param strict: select the looking for mode
        :type strict: bool

        :return: the index corresponding to the key
        :rtype: int

        :raises: - :py:exc:`KeyError` in strict mode if key is absent
                   of the :py:class:`Indexer` instance
                 - :py:exc:`TypeError` if key is not an hashable value.
        '''
        if dict.__contains__(self,key):
            return dict.__getitem__(self,key)
        elif strict:
            # BUG FIX: was the py2-only statement form ``raise KeyError,key``;
            # the call form is valid in both Python 2 and Python 3.
            raise KeyError(key)
        else:
            # Allocate a fresh index and register the reverse mapping.
            value = self.__max
            self[key]= value
            self.__reverse.append(key)
            self.__max+=1
            return value

    def __getitem__(self,key):
        '''
        Implement the [] operator to emulate the standard dictionary
        behaviour on :py:class:`Indexer` and return the integer key
        associated to a python value.

        Actually this method calls :py:meth:`getIndex` in non strict
        mode, so it only raises :py:exc:`TypeError` if key is not an
        hashable value.

        :param key: the value to index
        :type key: an hashable python value
        :return: an unique integer value associated to the key
        :rtype: int
        '''
        return self.getIndex(key)

    def __equal__(self,index):
        '''
        Compare two :py:class:`Indexer` instances by identity.

        .. note:: ``__equal__`` is not a recognised special method name
                  (Python uses ``__eq__``), so this method is never
                  triggered by the ``==`` operator — dict equality applies
                  instead. Kept as-is for backward compatibility with any
                  code calling it explicitly.
        '''
        return id(self)==id(index)
class Graph(object):
    '''
    Class used to represent directed or undirected graph.

    .. warning::
        Only one edge can connect two nodes in a given direction.

    .. warning::
        Specifying nodes through their index speeds up your code but as no
        check is done on index value, it may result in inconsistency. So
        prefer the use of node label to specify a node.
    '''

    def __init__(self,label='G',directed=False,indexer=None,nodes=None,edges=None):
        '''
        :param label: Graph name, set to 'G' by default
        :type label: str
        :param directed: true for directed graph, set to False by default
        :type directed: boolean
        :param indexer: node label indexer. This allows to define several graphs
                        sharing the same indexer (see : :py:meth:`newEmpty`)
        :type indexer: :py:class:`Indexer`
        :param nodes: set of nodes to add to the graph
        :type nodes: iterable value
        :param edges: set of edges to add to the graph
        :type edges: iterable value

        .. note:: NOTE(review): the **nodes** and **edges** parameters are
                  currently accepted but ignored by this constructor —
                  confirm whether they should be honoured.
        '''
        self._directed=directed
        if indexer is None:
            indexer = Indexer()
        self._index = indexer
        # node index -> set of neighbour indexes (successors when directed)
        self._node = {}
        # node index -> dict of node attributes
        self._node_attrs = {}
        # (index1,index2) -> dict of edge attributes; for undirected graphs
        # both orientations reference the *same* dict object.
        self._edge_attrs = {}
        self._label=label

    def newEmpty(self):
        """
        Build a new empty graph using the same :py:class:`Indexer` instance.
        This allows two graphs to share their vertices through their indices.
        """
        n = Graph(self._label+"_compact",self._directed,self._index)
        return n

    def addNode(self,node=None,index=None,**data):
        '''
        Add a new node or update an existing one.

        :param node: the new node label or the label of an existing node
                     for updating it.
        :type node: an hashable python value
        :param index: the index of an existing node for updating it.
        :type index: int

        :return: the index of the node
        :rtype: int

        :raises: :py:exc:`IndexError` if index is not **None** and
                 corresponds to an unused index in this graph.
        '''
        if index is None:
            # Non-strict lookup: an unknown label allocates a fresh index.
            index = self._index[node]
            if index not in self._node:
                self._node[index]=set()
        else:
            if index not in self._node:
                raise IndexError("This index is not used in this graph")

        if data:
            # Merge the keyword attributes into the node attribute dict.
            if index in self._node_attrs:
                self._node_attrs[index].update(data)
            else:
                self._node_attrs[index]=dict(data)

        return index

    def __contains__(self,node):
        # Strict lookup: an unknown label is simply "not in" the graph.
        try:
            index = self._index.getIndex(node,strict=True)
            r = index in self._node
        except KeyError:
            r=False
        return r

    def getNode(self,node=None,index=None):
        """
        :param node: a node label.
        :type node: an hashable python value
        :param index: the index of an existing node.
        :type index: int

        .. note:: Index value is prevalent over node label.

        :return: the looked for node
        :rtype: :py:class:`Node`

        :raises: :py:exc:`IndexError` if the specified node label
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        """
        if index is None:
            index = self._index.getIndex(node, True)
        return Node(index,self)

    def getBestNode(self,estimator):
        '''
        Select the node maximizing the estimator function.

        :param estimator: the function to maximize
        :type estimator: a function returning a numerical value and accepting one
                         argument of type :py:class:`Node`

        :return: the best node
        :rtype: :py:class:`Node`
        '''
        bestScore=0
        best=None
        for n in self:
            score = estimator(n)
            # "best is None" ensures a node is returned even when every
            # score is <= 0.
            if best is None or score > bestScore:
                bestScore = score
                best=n
        return best

    def delNode(self,node=None,index=None):
        """
        Delete a node from a graph and all associated edges.

        :param node: a node label.
        :type node: an hashable python value
        :param index: the index of an existing node.
        :type index: int

        .. note:: Index value is prevalent over node label.

        :raises: :py:exc:`IndexError` if the specified node label
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        """
        if index is None:
            index = self._index[node]

        # Remove all incoming edges (and their attributes).
        for n in self._node:
            if n!=index:
                e = self._node[n]
                if index in e:
                    if (n,index) in self._edge_attrs:
                        del self._edge_attrs[(n,index)]
                    e.remove(index)

        # Remove attributes of outgoing edges, then the node itself.
        e = self._node[index]
        for n in e:
            if (index,n) in self._edge_attrs:
                del self._edge_attrs[(index,n)]

        del self._node[index]
        if index in self._node_attrs:
            del self._node_attrs[index]

    def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data):
        '''
        Create a new edge in the graph between both the specified nodes.

        .. note:: Nodes can be specified using their label or their index in
                  the graph; if both values are indicated the index is used.

        :param node1: The first vertex label
        :type node1: an hashable python value
        :param node2: The second vertex label
        :type node2: an hashable python value
        :param index1: The first vertex index
        :type index1: int
        :param index2: The second vertex index
        :type index2: int

        :raises: :py:exc:`IndexError` if one of both the specified node labels
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        '''
        index1=self.addNode(node1, index1)
        index2=self.addNode(node2, index2)

        self._node[index1].add(index2)
        if not self._directed:
            self._node[index2].add(index1)

        if data:
            if (index1,index2) not in self._edge_attrs:
                data =dict(data)
                self._edge_attrs[(index1,index2)]=data
                if not self._directed:
                    # Both orientations share the same attribute dict.
                    self._edge_attrs[(index2,index1)]=data
            else:
                # BUG FIX: previously updated (index2,index1), which raises
                # KeyError on an existing *directed* edge; updating
                # (index1,index2) is correct for both kinds of graph (for
                # undirected graphs both keys share one dict).
                self._edge_attrs[(index1,index2)].update(data)

        return (index1,index2)

    def getEdge(self,node1=None,node2=None,index1=None,index2=None):
        '''
        Extract the :py:class:`Edge` instance linking two nodes of the graph.

        .. note:: Nodes can be specified using their label or their index in
                  the graph; if both values are indicated the index is used.

        :param node1: The first vertex label
        :type node1: an hashable python value
        :param node2: The second vertex label
        :type node2: an hashable python value
        :param index1: The first vertex index
        :type index1: int
        :param index2: The second vertex index
        :type index2: int

        :raises: :py:exc:`IndexError` if one of both the specified node labels
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        '''
        node1=self.getNode(node1, index1)
        node2=self.getNode(node2, index2)
        return Edge(node1,node2)

    def delEdge(self,node1=None,node2=None,index1=None,index2=None):
        """
        Delete the edge linking node 1 to node 2.

        .. note:: Nodes can be specified using their label or their index in
                  the graph; if both values are indicated the index is used.

        :param node1: The first vertex label
        :type node1: an hashable python value
        :param node2: The second vertex label
        :type node2: an hashable python value
        :param index1: The first vertex index
        :type index1: int
        :param index2: The second vertex index
        :type index2: int

        :raises: :py:exc:`IndexError` if one of both the specified node labels
                 corresponds to a non-existing node.

        .. warning:: no check on index value
        """
        if index1 is None:
            index1 = self._index[node1]
        if index2 is None:
            index2 = self._index[node2]
        if index1 in self._node and index2 in self._node[index1]:
            self._node[index1].remove(index2)
            # BUG FIX: edge attributes live in _edge_attrs (see addEdge);
            # the original deleted from _node_attrs, leaving stale edge
            # attributes behind.
            if (index1,index2) in self._edge_attrs:
                del self._edge_attrs[(index1,index2)]
            if not self._directed:
                self._node[index2].remove(index1)
                if (index2,index1) in self._edge_attrs:
                    del self._edge_attrs[(index2,index1)]

    def edgeIterator(self,predicate=None):
        """
        Iterate through a set of selected edges.

        :param predicate: a function allowing edge selection. Default value
                          is **None** and indicates that all edges are selected.
        :type predicate: a function returning a boolean value
                         and accepting one argument of class :py:class:`Edge`

        :return: an iterator over selected edges
        :rtype: iterator over :py:class:`Edge` instances

        .. seealso::
            function :py:func:`selectEdgeAttributeFactory` for simple predicates.
        """
        for n1 in self._node:
            for n2 in self._node[n1]:
                # For undirected graphs each edge is stored twice; the
                # n1 <= n2 guard yields it only once.
                if self._directed or n1 <= n2:
                    e = self.getEdge(index1=n1, index2=n2)
                    if predicate is None or predicate(e):
                        yield e

    def nodeIterator(self,predicate=None):
        """
        Iterate through a set of selected vertices.

        :param predicate: a function allowing node selection. Default value
                          is **None** and indicates that all nodes are selected.
        :type predicate: a function returning a boolean value
                         and accepting one argument of class :py:class:`Node`

        :return: an iterator over selected nodes.
        :rtype: iterator over :py:class:`Node` instances
        """
        for n in self._node:
            node = self.getNode(index=n)
            if predicate is None or predicate(node):
                yield node

    def nodeIndexIterator(self,predicate=None):
        """
        Iterate through the indexes of a set of selected vertices.

        :param predicate: a function allowing node selection. Default value
                          is **None** and indicates that all nodes are selected.
        :type predicate: a function returning a boolean value
                         and accepting one argument of class :py:class:`Node`

        :return: an iterator over selected node indices.
        :rtype: iterator over `int`
        """
        for n in self._node:
            node = self.getNode(index=n)
            if predicate is None or predicate(node):
                yield n

    def neighbourIndexSet(self,node=None,index=None):
        # Return the raw neighbour-index set of a node (no copy: do not
        # mutate the result unless you know exactly what you do).
        if index is None:
            index=self.getNode(node).index
        return self._node[index]

    def edgeCount(self):
        # Total number of edges; each undirected edge is stored twice.
        # (sum/values and // keep the py2 behaviour while staying py3-safe.)
        n = sum(len(z) for z in self._node.values())
        if not self._directed:
            n = n // 2
        return n

    def subgraph(self,nodes,name='G'):
        # Build the subgraph induced by *nodes*, sharing the same indexer.
        sub = Graph(name,self._directed,self._index)
        if not isinstance(nodes, set):
            nodes = set(nodes)
        for n in nodes:
            # Keep only the neighbours that belong to the subgraph.
            sub._node[n]=nodes & self._node[n]
            if n in self._node_attrs:
                sub._node_attrs[n]=dict(self._node_attrs[n])
            for n2 in sub._node[n]:
                if not self._directed:
                    # Copy each undirected edge's attributes once (n <= n2),
                    # sharing the dict between both orientations.
                    if n <= n2:
                        if (n,n2) in self._edge_attrs:
                            data=dict(self._edge_attrs[(n,n2)])
                            sub._edge_attrs[(n,n2)]=data
                            sub._edge_attrs[(n2,n)]=data
                else:
                    if (n,n2) in self._edge_attrs:
                        data=dict(self._edge_attrs[(n,n2)])
                        sub._edge_attrs[(n,n2)]=data
        return sub

    def __len__(self):
        # Number of nodes.
        return len(self._node)

    def __getitem__(self,key):
        return self.getNode(node=key)

    def __delitem__(self,key):
        self.delNode(node=key)

    def __iter__(self):
        return self.nodeIterator()

    def __str__(self):
        # Render the graph in graphviz dot syntax.
        if self._directed:
            kw ='digraph'
        else:
            kw='graph'

        nodes = "\n    ".join([str(x) for x in self])
        edges = "\n    ".join([str(x) for x in self.edgeIterator()])

        return "%s %s {\n    %s\n\n    %s\n}" % (kw,self._label,nodes,edges)
class Node(object):
    """
    Class used for representing one node or vertex in a graph
    """

    def __init__(self,index,graph):
        '''
        .. warning::
            :py:class:`Node` constructor is usually called through the
            :py:class:`Graph` methods

        :param index: Index of the node in the graph
        :type index: int
        :param graph: graph instance owning the node
        :type graph: :py:class:`obitools.graph.Graph`
        '''
        self.index = index
        self.__graph = graph

    def getGraph(self):
        '''
        Return the graph owning this node.

        :rtype: :py:class:`obitools.graph.Graph`
        '''
        return self.__graph

    def getLabel(self):
        '''
        Return the label associated to this node.
        '''
        return self.__graph._index.getLabel(self.index)

    def has_key(self,key):
        '''
        Test whether the node instance has a property named **key**.

        :param key: the name of a property
        :type key: str

        :return: True if the node has a property named <key>
        :rtype: bool
        '''
        if self.index in self.__graph._node_attrs:
            return key in self.__graph._node_attrs[self.index]
        else:
            return False

    def neighbourIterator(self,nodePredicat=None,edgePredicat=None):
        '''
        Iterate through the nodes directly connected to
        this node.

        :param nodePredicat: a function accepting one node as parameter
                             and returning **True** if this node must be
                             returned by the iterator.
        :type nodePredicat: function
        :param edgePredicat: a function accepting one edge as parameter
                             and returning True if the edge linking self and
                             the current node must be considered.
        :type edgePredicat: function

        :rtype: iterator on Node instances
        '''
        for n in self.neighbourIndexIterator(nodePredicat, edgePredicat):
            node = self.graph.getNode(index=n)
            yield node

    def neighbourIndexSet(self):
        '''
        Return the set of node indexes directly connected
        to this node.

        .. warning::
            do not change this set unless you know
            exactly what you do.

        :rtype: set of int
        '''
        return self.__graph._node[self.index]

    def neighbourIndexIterator(self,nodePredicat=None,edgePredicat=None):
        '''
        Iterate through the node indexes directly connected to
        this node.

        :param nodePredicat: a function accepting one node as parameter
                             and returning True if this node must be
                             returned by the iterator.
        :type nodePredicat: function
        :param edgePredicat: a function accepting one edge as parameter
                             and returning True if the edge linking self and
                             the current node must be considered.
        :type edgePredicat: function

        :rtype: iterator on int
        '''
        for n in self.neighbourIndexSet():
            if nodePredicat is None or nodePredicat(self.__graph.getNode(index=n)):
                if edgePredicat is None or edgePredicat(self.__graph.getEdge(index1=self.index,index2=n)):
                    yield n

    def degree(self,nodeIndexes=None):
        '''
        Return the count of edges linking this node to the
        set of nodes described by their index in nodeIndexes.

        :param nodeIndexes: set of node indexes.
                            If set to None, all nodes of the
                            graph are taken into account.
                            Set to None by default.
        :type nodeIndexes: set of int

        :rtype: int
        '''
        if nodeIndexes is None:
            return len(self.__graph._node[self.index])
        else:
            return len(self.__graph._node[self.index] & nodeIndexes)

    def componentIndexSet(self,nodePredicat=None,edgePredicat=None):
        '''
        Return the set of node indexes in the same connected component.

        :param nodePredicat: a function accepting one node as parameter
                             and returning True if this node must be
                             considered.
        :type nodePredicat: function
        :param edgePredicat: a function accepting one edge as parameter
                             and returning True if the edge linking self and
                             the current node must be considered.
        :type edgePredicat: function

        :rtype: set of int
        '''
        cc=set([self.index])
        added = set(x for x in self.neighbourIndexIterator(nodePredicat, edgePredicat))
        # Breadth-wise expansion until no new node is reached.
        # (explicit loop replaces the former py2-only reduce() expression)
        while added:
            cc |= added
            frontier = set()
            for c in added:
                frontier |= set(self.graph.getNode(index=c).neighbourIndexIterator(nodePredicat, edgePredicat))
            added = frontier - cc
        return cc

    def componentIterator(self,nodePredicat=None,edgePredicat=None):
        '''
        Iterate through the nodes in the same connected
        component.

        :rtype: iterator on :py:class:`Node` instances
        '''
        for c in self.componentIndexSet(nodePredicat, edgePredicat):
            # BUG FIX: c is a node *index*; the original passed it as a
            # node label (getNode(c)), raising KeyError for any label
            # different from its index.
            yield self.graph.getNode(index=c)

    def shortestPathIterator(self,nodes=None):
        '''
        Iterate through the shortest paths sourcing
        from this node. If nodes is not None, iterates
        only over paths linking this node to the nodes listed in
        nodes.

        :param nodes: set of node indexes
        :type nodes: iterable on int

        :return: an iterator on lists of int describing paths
        :rtype: iterator on list of int
        '''
        if nodes is not None:
            nodes = set(nodes)

        Q=[(self.index,-1)]
        gray = set([self.index])
        paths = {}

        while Q and (nodes is None or nodes):
            # BUG FIX: the original used Q.pop() (LIFO), turning the
            # traversal into a depth-first search whose paths are not
            # shortest; FIFO order restores a genuine BFS.
            u,p = Q.pop(0)
            paths[u]=p
            fringe = self.graph._node[u] - gray
            gray|=fringe
            Q.extend((x,u) for x in fringe)
            if nodes is None or u in nodes:
                if nodes:
                    nodes.remove(u)
                # Rebuild the path by walking the predecessor table.
                path = [u]
                while p >= 0:
                    path.append(p)
                    p = paths[p]
                path.reverse()
                yield path

    def shortestPathTo(self,node=None,index=None):
        '''
        Return one of the shortest paths linking this
        node to the specified node.

        :param node: a node label or None
        :param index: a node index or None. The parameter index
                      has priority over the parameter node.
        :type index: int

        :return: list of node indexes corresponding to the path or None
                 if no path exists.
        :rtype: list of int or None
        '''
        if index is None:
            index=self.graph.getNode(node).index
        for p in self.shortestPathIterator([index]):
            return p

    def __getitem__(self,key):
        '''
        Return the value of the <key> property of this node.

        :param key: the name of a property
        :type key: str
        '''
        return self.__graph._node_attrs.get(self.index,{})[key]

    def __setitem__(self,key,value):
        '''
        Set the value of a node property. If the property doesn't
        already exist a new property is added to this node.

        :param key: the name of a property
        :type key: str
        :param value: the value of the property

        .. seealso::
            :py:meth:`Node.__getitem__`
        '''
        if self.index in self.__graph._node_attrs:
            data = self.__graph._node_attrs[self.index]
            data[key]=value
        else:
            self.__graph._node_attrs[self.index]={key:value}

    def __len__(self):
        '''
        Count the neighbours of this node.

        :rtype: int

        .. seealso::
            :py:meth:`Node.degree`
        '''
        return len(self.__graph._node[self.index])

    def __iter__(self):
        '''
        Iterate through the neighbours of this node.

        :rtype: iterator on :py:class:`Node` instances

        .. seealso::
            :py:meth:`Node.neighbourIterator`
        '''
        return self.neighbourIterator()

    def __contains__(self,key):
        return self.has_key(key)

    def __str__(self):
        # Render the node in graphviz dot syntax.
        if self.index in self.__graph._node_attrs:
            keys = " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"').replace('\n','\\n'))
                             for x in self.__graph._node_attrs[self.index].items()]
                            )
        else:
            keys=''

        return '%d [label="%s" %s]' % (self.index,
                                       str(self.label).replace('"','\\"').replace('\n','\\n'),
                                       keys)

    def keys(self):
        # Names of the properties attached to this node (a fresh list,
        # as with py2 dict.keys(), so callers may remove() from it).
        if self.index in self.__graph._node_attrs:
            k = list(self.__graph._node_attrs[self.index].keys())
        else:
            k=[]
        return k

    label = property(getLabel, None, None, "Label of the node")
    graph = property(getGraph, None, None, "Graph owning this node")
class Edge(object):
    """
    Class used for representing one edge of a graph
    """

    def __init__(self,node1,node2):
        '''
        .. warning::
            :py:class:`Edge` constructor is usually called through the
            :py:class:`Graph` methods

        :param node1: first node linked by the edge
        :type node1: :py:class:`Node`
        :param node2: second node linked by the edge
        :type node2: :py:class:`Node`
        '''
        self.node1 = node1
        self.node2 = node2

    def _key(self):
        # Key of this edge in the graph edge-attribute table.
        return (self.node1.index,self.node2.index)

    def getGraph(self):
        """
        Return the :py:class:`Graph` instance owning this edge.
        """
        return self.node1.graph

    def has_key(self,key):
        '''
        Test whether this edge carries a property named **key**.

        :param key: the name of a property
        :type key: str

        :return: True if the edge has a property named <key>
        :rtype: bool
        '''
        attrs = self.graph._edge_attrs.get(self._key())
        return attrs is not None and key in attrs

    def getDirected(self):
        # An edge is directed iff its owning graph is.
        return self.node1.graph._directed

    def __getitem__(self,key):
        return self.graph._edge_attrs.get(self._key(),{})[key]

    def __setitem__(self,key,value):
        # Create the attribute dict on first assignment.
        attrs = self.graph._edge_attrs.setdefault(self._key(),{})
        attrs[key]=value

    def __str__(self):
        # Render the edge in graphviz dot syntax.
        key = self._key()
        if key in self.graph._edge_attrs:
            attrs = self.graph._edge_attrs[key]
            keys = "[%s]" % " ".join(['%s="%s"' % (k,str(v).replace('"','\\"'))
                                      for k,v in attrs.items()])
        else:
            keys = ""

        link = '->' if self.directed else '--'

        return "%d %s %d %s" % (self.node1.index,link,self.node2.index,keys)

    def __contains__(self,key):
        return self.has_key(key)

    graph = property(getGraph, None, None, "Graph owning this edge")
    directed = property(getDirected, None, None, "True when the owning graph is directed")
class DiGraph(Graph):
    """
    :py:class:`DiGraph` is a specialisation of the :py:class:`Graph` class
    dedicated to directed graph representation.

    .. seealso::
        :py:class:`UndirectedGraph`
    """

    def __init__(self,label='G',indexer=None,nodes=None,edges=None):
        '''
        :param label: graph name, set to 'G' by default
        :type label: str
        :param indexer: node label indexer
        :type indexer: Indexer instance
        :param nodes: set of nodes to add to the graph
        :type nodes: iterable value
        :param edges: set of edges to add to the graph
        :type edges: iterable value
        '''
        # A DiGraph is simply a Graph built with directed=True.
        Graph.__init__(self, label, directed=True,
                       indexer=indexer, nodes=nodes, edges=edges)
class UndirectedGraph(Graph):
    """
    :py:class:`UndirectedGraph` is a specialisation of the :py:class:`Graph`
    class dedicated to undirected graph representation.

    .. seealso::
        :py:class:`DiGraph`
    """

    def __init__(self,label='G',indexer=None,nodes=None,edges=None):
        '''
        :param label: graph name, set to 'G' by default
        :type label: str
        :param indexer: node label indexer
        :type indexer: Indexer instance
        :param nodes: set of nodes to add to the graph
        :type nodes: iterable value
        :param edges: set of edges to add to the graph
        :type edges: iterable value
        '''
        # An UndirectedGraph is simply a Graph built with directed=False.
        Graph.__init__(self, label, directed=False,
                       indexer=indexer, nodes=nodes, edges=edges)
def selectEdgeAttributeFactory(attribut,value):
    """
    Build an edge predicate usable with :py:meth:`Graph.edgeIterator`.

    The returned function selects the edges carrying an attribute named
    *attribut* whose value equals *value*.
    """
    def selectEdge(edge):
        return attribut in edge and edge[attribut]==value
    return selectEdge

BIN
obitools/graph/__init__.pyc Normal file

Binary file not shown.

View File

Binary file not shown.

View File

@ -0,0 +1,134 @@
import time
import sys
# Module-level search statistics shared between cliqueIterator and
# _cliqueIterator (reset at the start of each cliqueIterator call).
_maxsize=0       # size of the largest clique found so far
_solution=0      # number of candidate solutions explored
_notbound=0      # number of branches cut by the "not list" bound
_sizebound=0     # number of branches cut by the minimal-size bound
_lastyield=0     # timestamp of the last yield (used for timeout handling)
_maxclique=None  # largest clique seen, yielded when the timeout triggers
def cliqueIterator(graph,minsize=1,node=None,timeout=None):
    """
    Iterate over the cliques of *graph* whose size is at least *minsize*.

    :param graph: the graph to explore
    :param minsize: minimal size of the yielded cliques (1 by default)
    :type minsize: int
    :param node: when not None, only cliques containing this node are
                 enumerated
    :param timeout: delay (in seconds) passed down to the recursive search
    """
    global _maxsize,_solution,_notbound,_sizebound,_lastyield

    # Reset the module-level search statistics for this run.
    _maxsize=0
    _solution=0
    _notbound=0
    _sizebound=0

    starttime = time.time()

    if node:
        # Seed the search with the requested node: only its neighbours
        # can extend the clique.
        seed = graph.getNode(node)
        clique = set([seed.index])
        candidates = set(graph.neighbourIndexSet(index=seed.index))
    else:
        # Empty seed: every node of the graph is a candidate.
        clique = set()
        candidates = set(x.index for x in graph)

    _lastyield = time.time()

    for c in _cliqueIterator(graph,clique,candidates,set(),minsize,
                             start=starttime,timeout=timeout):
        yield c
def _cliqueIterator(graph,clique,candidates,notlist,minsize=0,start=None,timeout=None):
    # Recursive Bron-Kerbosch-style clique enumeration with a "not list"
    # pivot bound and a minimal-size bound.  Mutates the module-level
    # counters declared above; *clique* is the clique built so far,
    # *candidates* the vertices that may still extend it, *notlist* the
    # vertices already fully explored.
    global _maxsize,_maxclique,_solution,_notbound,_sizebound,_lastyield

    # Speed indicator
    lclique = len(clique)
    lcandidates = len(candidates)

    # Pivot selection: pick the notlist vertex with the fewest
    # non-neighbours among the candidates (smallest branching factor).
    notmin = lcandidates
    notfix = None
    for n in notlist:
        nnc = candidates - graph.neighbourIndexSet(index=n)
        nc = len(nnc)
        if nc < notmin:
            notmin=nc
            notfix=n
            notfixneib = nnc

    # Progress report on stderr, refreshed every 1000 explored solutions
    # or whenever a larger clique is found.
    if lclique > _maxsize or not _solution % 1000 :
        if start is not None:
            top = time.time()
            delta = top - start
            if delta==0:
                delta=1e-6
            speed = _solution / delta
            start = top
        else:
            speed = 0
        print >>sys.stderr,"\rCandidates : %-5d Maximum clique size : %-5d Solutions explored : %10d speed = %5.2f solutions/sec sizebound=%10d notbound=%10d " % (lcandidates,_maxsize,_solution,speed,_sizebound,_notbound),
        sys.stderr.flush()
        if lclique > _maxsize:
            _maxsize=lclique

    # Time elapsed since the last yielded clique (timeout handling).
    timer = time.time() - _lastyield

    if not candidates and not notlist:
        # Maximal clique reached: nothing can extend it.
        if lclique==_maxsize:
            _maxclique=set(clique)
        if lclique >= minsize:
            yield set(clique)
        if timeout is not None and timer > timeout and _maxclique is not None:
            # Timed out: emit the best clique seen so far.
            yield _maxclique
            _maxclique=None
    else:
        # Branch on each candidate while the bounds allow a clique of at
        # least *minsize* to be completed.
        while notmin and candidates and ((lclique + len(candidates)) >= minsize or (timeout is not None and timer > timeout)):
            # count explored solution
            _solution+=1

            if notfix is None:
                nextcandidate = candidates.pop()
            else:
                # Branch on a non-neighbour of the pivot first.
                nextcandidate = notfixneib.pop()
                candidates.remove(nextcandidate)

            clique.add(nextcandidate)

            neighbours = graph.neighbourIndexSet(index=nextcandidate)
            nextcandidates = candidates & neighbours
            nextnot = notlist & neighbours
            nnc = candidates - neighbours
            lnnc=len(nnc)

            for c in _cliqueIterator(graph,
                                     set(clique),
                                     nextcandidates,
                                     nextnot,
                                     minsize,
                                     start,
                                     timeout=timeout):
                yield c

            clique.remove(nextcandidate)

            # The explored vertex joins the not list; refresh the pivot.
            notmin-=1
            if lnnc < notmin:
                notmin = lnnc
                notfix = nextcandidate
                notfixneib = nnc
            if notmin==0:
                _notbound+=1
            notlist.add(nextcandidate)
        else:
            # Loop ended on the bound test: record which bound cut it.
            if (lclique + len(candidates)) < minsize:
                _sizebound+=1

View File

@ -0,0 +1,8 @@
def compactGraph(graph,nodeSetIterator):
    # Build a compacted graph whose nodes stand for sets of nodes of
    # *graph*, as produced by *nodeSetIterator* (e.g. connected components);
    # each compact node is labelled with the newline-joined member labels.
    # NOTE(review): this function looks unfinished — it adds the compacted
    # nodes but no edges, and prints the result instead of returning it.
    compact = graph.newEmpty()
    for ns in nodeSetIterator(graph):
        nlabel = "\n".join([str(graph.getNode(index=x).label) for x in ns])
        compact.addNode(nlabel)
    print
    print compact

View File

@ -0,0 +1,82 @@
"""
Iterate through the connected components of a graph
---------------------------------------------------
the module :py:mod:`obitools.graph.algorithm.component` provides
two functions to deal with the connected component of a graph
represented as a :py:class:`obitools.graph.Graph` instance.
The whole set of connected component of a graph is a partition of this graph.
So a node cannot belongs to two distinct connected component.
Two nodes are in the same connected component if it exits a path through
the graph edges linking them.
TODO: There is certainly a bug with DirectedGraph
"""
def componentIterator(graph,nodePredicat=None,edgePredicat=None):
    '''
    Build an iterator over the connected components of a graph.
    Each connected component returned by the iterator is represented
    as a `set` of node indices.

    :param graph: the graph to partition
    :type graph: :py:class:`obitools.graph.Graph`
    :param nodePredicat: a function allowing node selection. Default value
                         is **None** and indicates that all nodes are selected.
    :type nodePredicat: a function returning a boolean value
                        and accepting one argument of class :py:class:`Node`
    :param edgePredicat: a function allowing edge selection. Default value
                         is **None** and indicates that all edges are selected.
    :type edgePredicat: a function returning a boolean value
                        and accepting one argument of class :py:class:`Edge`

    :return: an iterator over the connected component sets
    :rtype: an iterator over `set` of `int`

    .. seealso::
        the :py:meth:`obitools.graph.Graph.componentIndexSet` method
        on which this function is based.
    '''
    visited = set()
    for node in graph.nodeIterator(nodePredicat):
        # Any node already assigned to a component is skipped.
        if node.index in visited:
            continue
        component = node.componentIndexSet(nodePredicat, edgePredicat)
        yield component
        visited |= component
def componentCount(graph,nodePredicat=None,edgePredicat=None):
    '''
    Count the connected components in a graph.

    :param graph: the graph to partition
    :type graph: :py:class:`obitools.graph.Graph`
    :param nodePredicat: a function allowing node selection. Default value
                         is **None** and indicates that all nodes are selected.
    :type nodePredicat: a function returning a boolean value
                        and accepting one argument of class :py:class:`Node`
    :param edgePredicat: a function allowing edge selection. Default value
                         is **None** and indicates that all edges are selected.
    :type edgePredicat: a function returning a boolean value
                        and accepting one argument of class :py:class:`Edge`

    :return: the number of connected components
    :rtype: `int`

    .. seealso::
        the :py:func:`componentIterator` function
        on which this function is based.
    '''
    return sum(1 for _ in componentIterator(graph,nodePredicat,edgePredicat))

Binary file not shown.

80
obitools/graph/dag.py Normal file
View File

@ -0,0 +1,80 @@
from obitools.graph import DiGraph,Node
from obitools.graph.algorithms.component import componentIterator
class DAG(DiGraph):
    """
    Directed acyclic graph: a :py:class:`DiGraph` that rejects edges which
    would create a cycle and records, for each node, the set of its direct
    parents.
    """

    def __init__(self,label='G',indexer=None,nodes=None,edges=None):
        '''
        Directed acyclic graph constructor.

        @param label: Graph name, set to 'G' by default
        @type label: str
        @param indexer: node label indexer
        @type indexer: Indexer instance
        @param nodes: set of nodes to add to the graph
        @type nodes: iterable value
        @param edges: set of edges to add to the graph
        @type edges: iterable value
        '''
        # child node index -> set of direct parent node indexes
        self._parents={}
        DiGraph.__init__(self, label, indexer, nodes, edges)

    def getNode(self,node=None,index=None):
        # Same lookup as Graph.getNode, but wraps the result in a DAGNode.
        if index is None:
            index = self._index.getIndex(node, True)
        return DAGNode(index,self)

    def addEdge(self,parent=None,node=None,indexp=None,index=None,**data):
        # Link *parent* -> *node*, refusing any edge that would make the
        # child one of its parent's ancestors (i.e. close a cycle).
        indexp=self.addNode(parent, indexp)
        index =self.addNode(node , index)

        pindex = set(n.index
                     for n in self.getNode(index=indexp).ancestorIterator())

        # NOTE(review): assert-based validation disappears under python -O.
        assert index not in pindex,'Child node cannot be a parent node'

        DiGraph.addEdge(self,index1=indexp,index2=index,**data)

        # Record the parent relationship for ancestor traversals.
        if index in self._parents:
            self._parents[index].add(indexp)
        else:
            self._parents[index]=set([indexp])

        return (indexp,index)

    def getRoots(self):
        # Return one root per connected component: an arbitrary node of
        # each component is walked up to its root via DAGNode.getRoot.
        return [self.getNode(index=cc.pop()).getRoot()
                for cc in componentIterator(self)]
class DAGNode(Node):
    """
    Node of a :py:class:`DAG`.
    """

    def ancestorIterator(self):
        '''
        Iterate over all ancestors (parents, grand-parents, ...) of this
        node, depth-first.

        .. note:: NOTE(review): a node reachable through several paths is
                  yielded once per path — confirm whether deduplication
                  is wanted.
        '''
        if self.index in self.graph._parents:
            for p in self.graph._parents[self.index]:
                parent = DAGNode(p,self.graph)
                yield parent
                for pnode in parent.ancestorIterator():
                    yield pnode

    def getRoot(self):
        '''
        Return the root reached by walking up the ancestor chain
        (the node itself when it has no parent).
        '''
        # BUG FIX: the original returned the bare loop variable, which is
        # undefined when the node has no ancestor (UnboundLocalError);
        # a parentless node is its own root.
        node = self
        for node in self.ancestorIterator():
            pass
        return node

    def leavesIterator(self):
        # Yield the leaves (childless nodes) of the subtree rooted here.
        if not self:
            yield self
        for n in self:
            for nn in n.leavesIterator():
                yield nn

    def subgraphIterator(self):
        # Yield this node then, recursively, every node below it.
        yield self
        for n in self:
            for nn in n.subgraphIterator():
                yield nn

View File

View File

View File

@ -0,0 +1,117 @@
from obitools.graph.dag import DAG,DAGNode
class RootedTree(DAG):
    """
    A :py:class:`DAG` in which every node has at most one parent.
    """

    def addEdge(self,parent=None,node=None,indexp=None,index=None,**data):
        # Link *parent* to *node*, enforcing the single-parent constraint.
        indexp = self.addNode(parent, indexp)
        index = self.addNode(node, index)

        # A child may only be (re)attached to the parent it already has.
        assert index not in self._parents or indexp in self._parents[index], \
               'Child node cannot have more than one parent node'

        return DAG.addEdge(self,indexp=indexp,index=index,**data)

    def getNode(self,node=None,index=None):
        # Same lookup as DAG.getNode, but wraps the result in a
        # RootedTreeNode.
        if index is None:
            index = self._index.getIndex(node, strict=True)
        return RootedTreeNode(index,self)
class RootedTreeNode(DAGNode):
    """
    Node of a C{RootedTree}, adding subtree size and leaf counts.
    """

    def subTreeSize(self):
        '''
        Return the number of nodes of the subtree rooted at this node,
        including the node itself.
        '''
        return 1 + sum(child.subTreeSize() for child in self)

    def subTreeLeaves(self):
        '''
        Return the number of leaves of the subtree rooted at this node.
        A node without children counts as one leaf.
        '''
        if not self:
            return 1
        return sum(child.subTreeLeaves() for child in self)
def nodeWriter(node,deep=0,label=None,distance="distance", bootstrap="bootstrap",cartoon=None,collapse=None):
    '''
    Recursively render *node* and its subtree as a Newick-like string with
    FigTree-style C{[&...]} annotations.

    @param node: the (sub)tree root to render
    @param deep: current nesting depth, used only for indentation
    @param label: None to use the node label, a callable applied to the
        node, or a node-property name whose value becomes the displayed name
    @param distance: node-property name holding the branch length
    @param bootstrap: accepted and passed down recursively but never read
        (apparently unused -- kept for interface compatibility)
    @param cartoon: optional predicate; matching nodes get a !cartoon tag
    @param collapse: optional predicate; matching nodes get a !collapse tag
    @return: the textual representation of the subtree
    @rtype: str
    '''
    # NOTE: node.keys() must return a list here (Python 2 dict semantics),
    # because consumed properties are removed from it below.
    ks = node.keys()
    if label is None:
        name=node.label
    elif callable(label):
        name=label(node)
    elif isinstance(label, str) and label in node:
        name=node[label]
        # Do not repeat the property used as the display name.
        ks.remove(label)
    else:
        name=''
    if distance in node:
        dist=':%6.5f' % node[distance]
        ks.remove(distance)
    else:
        dist=''

    # Remaining properties become key=value annotations.
    ks = ["%s=%s" % (k,node[k]) for k in ks]

    if cartoon is not None and cartoon(node):
        ks.append("!cartoon={%d,0.0}" % node.subTreeLeaves())

    if collapse is not None and collapse(node):
        ks.append('!collapse={"collapsed",0.0}')

    if ks:
        ks="[&"+",".join(ks)+"]"
    else:
        ks=''

    # Children are separated by a comma and indented one space per level.
    nodeseparator = ',\n' + ' ' * (deep+1)

    subnodes = nodeseparator.join([nodeWriter(x, deep+1,label,distance,bootstrap,cartoon=cartoon,collapse=collapse)
                                   for x in node])

    if subnodes:
        subnodes='(\n' + ' ' * (deep+1) + subnodes + '\n' + ' ' * deep + ')'

    return '%s"%s"%s%s' % (subnodes,name,ks,dist)
def nexusFormat(tree,startnode=None,label=None,blocks="",cartoon=None,collapse=None):
    '''
    Serialize *tree* as a NEXUS document containing a taxa block and a
    single rooted tree rendered by L{nodeWriter}.

    @param tree: the tree to serialize (iterating it yields its nodes)
    @param startnode: optional node to use as the tree root; when None the
        first root returned by tree.getRoots() is used
    @param label: same semantics as in L{nodeWriter}
    @param blocks: extra NEXUS blocks appended verbatim at the end
    @param cartoon: optional predicate forwarded to L{nodeWriter}
    @param collapse: optional predicate forwarded to L{nodeWriter}
    @return: the NEXUS document
    @rtype: str
    '''
    head="#NEXUS\n"
    tx = []
    # Collect the displayed name of every node to build the taxa block
    # (same name-resolution rules as nodeWriter).
    for n in tree:
        if label is None:
            name=n.label
        elif callable(label):
            name=label(n)
        elif isinstance(label, str) and label in n:
            name=n[label]
        else:
            name=''
        if name:
            tx.append('"%s"' % name)

    taxa = "begin taxa;\n\tdimensions ntax=%d;\n\ttaxlabels\n\t" % len(tx)
    taxa+="\n\t".join(tx)
    taxa+="\n;\nend;\n\n"

    if startnode is not None:
        roots =[startnode]
    else:
        roots = tree.getRoots()

    # Only the first root is rendered; [&R] marks the tree as rooted.
    trees = nodeWriter(roots[0],0,label,cartoon=cartoon,collapse=collapse)
    trees = "begin trees;\n\ttree tree_1 = [&R] "+ trees +";\nend;\n\n"

    return head+taxa+trees+"\n\n"+blocks+"\n"

37
obitools/graph/tree.py Normal file
View File

@ -0,0 +1,37 @@
from obitools.graph import UndirectedGraph,Node
from obitools.graph.algorithms.component import componentCount
class Forest(UndirectedGraph):
    """
    An undirected graph restricted to forest topology: at most one path
    may exist between any two nodes.
    """

    def getNode(self,node=None,index=None):
        # Wrap the index in a TreeNode for component traversal.
        if index is None:
            index = self._index.getIndex(node, True)
        return TreeNode(index,self)

    def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data):
        '''
        Add an edge, refusing any edge that would create a second path
        (i.e. a cycle) between two nodes.

        @return: the (index1, index2) tuple
        '''
        index1=self.addNode(node1, index1)
        index2=self.addNode(node2, index2)

        # Nodes already connected to node2: adding an edge from node1 to
        # node2 is only legal if node1 is not in that component (unless
        # the edge already exists).
        cc = set(n.index for n in self.getNode(index=index2).componentIterator())

        # Fixed typo in the assertion message ("alloed" -> "allowed").
        assert index1 in self._node[index2] or index1 not in cc, \
            "No more than one path is allowed between two nodes in a tree"

        UndirectedGraph.addEdge(self, index1=index1, index2=index2,**data)
        return (index1,index2)

    def isASingleTree(self):
        # A forest is a single tree iff it has exactly one component.
        return componentCount(self)==1
class TreeNode(Node):
    """
    Node of a C{Forest} graph.
    """

    def componentIterator(self):
        '''
        Iterate over every node of the connected component containing this
        node, excluding the node itself.  Each node is yielded exactly once.

        Bug fix: the previous implementation only explored direct
        neighbours and neighbours of neighbours, so connectivity checks
        failed on trees deeper than two edges and the same node (including
        self) could be yielded several times.
        '''
        seen = set([self.index])
        stack = [self]
        while stack:
            current = stack.pop()
            for neighbour in current:
                if neighbour.index not in seen:
                    seen.add(neighbour.index)
                    yield neighbour
                    stack.append(neighbour)

504
obitools/gzip.py Normal file
View File

@ -0,0 +1,504 @@
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed.

This is a patched version of the standard gzip Python module,
based on Andrew Kuchling's minigzip.py distributed with the zlib module.
"""
# based on Andrew Kuchling's minigzip.py distributed with the zlib module
import struct, sys, time
import zlib
import __builtin__
# Public API of this module.
__all__ = ["GzipFile","open"]

# gzip member header flag bits (see RFC 1952, FLG field).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Internal markers for GzipFile.mode.
READ, WRITE = 1, 2
def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
    """
    if i >= 0:
        return i
    # Negative value: map to the equivalent two's-complement unsigned value.
    return i + (1 << 32)
def LOWU32(i):
    """Return the low-order 32 bits of an int, as a non-negative int."""
    return i & 0xFFFFFFFF
def write32(output, value):
    """Write *value* to *output* as a little-endian signed 32-bit integer."""
    packed = struct.pack("<l", value)
    output.write(packed)
def write32u(output, value):
    """Write *value* to *output* as a little-endian unsigned 32-bit integer.

    The L format writes the bit pattern correctly whether signed
    or unsigned.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
def read32(input):
    """Read four bytes from *input* and decode them as a little-endian
    signed 32-bit integer."""
    raw = input.read(4)
    return struct.unpack("<l", raw)[0]
def unpack32(buf):
    """Decode a 4-byte buffer as a little-endian signed 32-bit integer."""
    (value,) = struct.unpack("<l", buf)
    return value
def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.
    """
    return GzipFile(filename=filename, mode=mode, compresslevel=compresslevel)
class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.
    """

    # Only set when this object opened the underlying file itself (see
    # __init__); close() then closes it as well.
    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024 # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may includes the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.
        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'): filename = fileobj.name
            else: filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # extrabuf holds decompressed data not yet returned by read().
            self.extrabuf = ""
            self.extrasize = 0
            self.filename = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Raw deflate stream (negative wbits): the gzip header and
            # trailer are written explicitly by this class.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError, "Mode " + mode + " not supported"

        self.fileobj = fileobj
        self.offset = 0
        # inputbuf/last8 support the patched buffered reader: last8 always
        # holds the last 8 compressed bytes seen, i.e. the CRC32 + ISIZE
        # trailer once a member has been fully consumed.
        self.inputbuf = ''
        self.last8 = ''

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        # Reset the per-member CRC/size accumulators used while writing.
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        # Emit a minimal RFC 1952 header; only the FNAME flag is ever set.
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(time.time()))  # modification time
        self.fileobj.write('\002')                 # extra flags
        self.fileobj.write('\377')                 # OS byte (unknown)
        if fname:
            self.fileobj.write(fname + '\000')     # null-terminated name

    def _init_read(self):
        # Reset the per-member CRC/size accumulators used while reading.
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_internal(self, size):
        # Read up to `size` compressed bytes through inputbuf, keeping
        # last8 up to date with the most recent 8 bytes seen.
        if len(self.inputbuf) < size:
            self.inputbuf += self.fileobj.read(size-len(self.inputbuf))
        chunk = self.inputbuf[:size]
        # need to use len(chunk) below instead of size in case it's EOF.
        if len(chunk) < 8:
            self.last8 = self.last8[len(chunk):] + chunk
        else:
            self.last8 = chunk[-8:]
        self.inputbuf = self.inputbuf[size:]
        return chunk

    def _read_gzip_header(self):
        # Parse and discard one RFC 1952 member header; raises EOFError
        # when no further member exists.
        magic = self._read_internal(2)
        if len(magic) != 2:
            raise EOFError, "Reached EOF"

        if magic != '\037\213':
            raise IOError, 'Not a gzipped file'
        method = ord( self._read_internal(1) )
        if method != 8:
            raise IOError, 'Unknown compression method'
        flag = ord( self._read_internal(1) )
        # modtime = self.fileobj.read(4)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self._read_internal(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self._read_internal(1))
            xlen = xlen + 256*ord(self._read_internal(1))
            self._read_internal(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self._read_internal(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self._read_internal(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self._read_internal(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

    def read(self, size=-1):
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        # Serve the request from the decompressed-data buffer.
        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        # Push decompressed data back so the next read() returns it first.
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        if self.fileobj is None:
            raise EOFError, "Reached EOF"

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # _read_gzip_header will raise EOFError exception
            # if there no more members to read.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self._read_internal(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError, 'Reached EOF'

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so put back unused_data and initialize last8 by reading them.
            self.inputbuf = self.decompress.unused_data + self.inputbuf
            self._read_internal(8)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        # Account for newly decompressed data: CRC, buffer and sizes.
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        # The member trailer (CRC32 + ISIZE) is always available in last8,
        # maintained by _read_internal.  We check that the computed CRC
        # and size of the uncompressed data match the stored values.
        # Note that the size stored is the true file size mod 2**32.
        crc32 = unpack32(self.last8[:4])
        isize = U32(unpack32(self.last8[4:]))   # may exceed 2GB
        if U32(crc32) != U32(self.crc):
            raise IOError, "CRC check failed"
        elif isize != LOWU32(self.size):
            raise IOError, "Incorrect length of data produced"

    def close(self):
        if self.mode == WRITE:
            self.fileobj.write(self.compress.flush())
            # The native zlib crc is an unsigned 32-bit integer, but
            # the Python wrapper implicitly casts that to a signed C
            # long.  So, on a 32-bit box self.crc may "look negative",
            # while the same crc on a 64-bit box may "look positive".
            # To avoid irksome warnings from the `struct` module, force
            # it to look positive on all boxes.
            write32u(self.fileobj, LOWU32(self.crc))
            # self.size may exceed 2GB, or even 4GB
            write32u(self.fileobj, LOWU32(self.size))
            self.fileobj = None
        elif self.mode == READ:
            self.fileobj = None
        # Close the underlying file only if this object opened it.
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def __del__(self):
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            # __init__ failed before the attributes were set.
            return
        self.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        # A gzip stream is never an interactive terminal.
        return False

    def tell(self):
        # Position in the *uncompressed* stream.
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset):
        # Only absolute, forward-friendly seeks in the uncompressed stream:
        # writing pads with zero bytes, reading decompresses and discards.
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
        # Read one '\n'-terminated line, at most `size` bytes when size>=0.
        # min_readsize adapts to typical line length across calls.
        if size < 0:
            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs) # Return resulting line

    def readlines(self, sizehint=0):
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            L.append(line)
            sizehint = sizehint - len(line)
        return L

    def writelines(self, L):
        for line in L:
            self.write(line)

    def __iter__(self):
        return self

    def next(self):
        # Python 2 iterator protocol: yield lines until EOF.
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration
def _test():
    """Minimal command-line driver: compress each argument like gzip, or
    decompress with -d like gunzip; '-' means stdin/stdout."""
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    print "filename doesn't end in .gz:", repr(arg)
                    continue
                # module-level open(), i.e. a GzipFile reader
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        # Stream in 1Kb chunks from f to g.
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        if g is not sys.stdout:
            g.close()
        if f is not sys.stdin:
            f.close()

if __name__ == '__main__':
    _test()

BIN
obitools/gzip.pyc Normal file

Binary file not shown.

View File

@ -0,0 +1,538 @@
import obitools
import re
import array
class Location(object):
    """
    Define a location on a sequence.

    Abstract base class: concrete subclasses (C{SimpleLocation},
    C{PointLocation}, C{CompositeLocation}, C{ComplementLocation}) override
    the predicate methods below, which here mostly return C{None}.
    """

    def extractSequence(self,sequence):
        '''
        Extract subsequence corresponding to a Location.

        @param sequence: the sequence to extract from
        @type sequence: C{BioSequence} or C{str}

        @return: a C{str} when a C{str} was given, otherwise a new
            C{BioSequence} carrying the original tags plus a 'location' tag
        '''
        assert isinstance(sequence, (obitools.BioSequence,str)), \
            "sequence must be an instance of str or BioSequence"
        if isinstance(sequence, str):
            seq = self._extractSequence(sequence)
        else:
            # A location containing a complement operator only makes sense
            # on a nucleic sequence.
            if isinstance(sequence, obitools.AASequence):
                assert not self.needNucleic(), \
                    "This location can be used only with Nucleic sequences"
            seq = self._extractSequence(str(sequence))
            # Rebuild a sequence object of the same kind as the input.
            if isinstance(sequence, obitools.AASequence):
                st = obitools.AASequence
            else:
                st = obitools.NucSequence
            seq = st(sequence.id,
                     seq,
                     sequence.definition,
                     **sequence.getTags())
            seq['location']=str(self)
            if 'length' in sequence.getTags():
                seq['length']=len(seq)
            # Propagate per-base quality values when available.
            if hasattr(sequence, 'quality'):
                quality = self._extractQuality(sequence)
                seq.quality=quality
        return seq

    def isDirect(self):
        # Overridden by subclasses; None means "unknown / mixed".
        return None

    def isSimple(self):
        '''
        Indicate if a location is composed of a single continuous
        region or is composed by the junction of several locations
        by the C{join} operator.

        @return: C{True} if the location is composed of a single
                 continuous region.
        @rtype: bool
        '''
        return None

    def isFullLength(self):
        # True when the location carries no '<' / '>' partial markers.
        return None

    def needNucleic(self):
        '''
        If a location contains a complement operator, it can be use
        only on nucleic sequence.

        @return: C{True} if location contains a complement operator
        @rtype: bool
        '''
        return None

    def getGloc(self):
        # Compact single-strand representation: (T|F,pos,pos,...) where
        # T/F encodes the strand of the simplified location.
        loc = self.simplify()
        assert loc.isDirect() is not None,"Gloc cannot be created for multi oriented location : %s" % str(loc)
        positions = ','.join([str(x) for x in loc._getglocpos()])
        return "(%s,%s)" % ({True:'T',False:'F'}[loc.isDirect()],
                            positions)

    def shift(self,s):
        # Overridden by subclasses: translate the location by s positions.
        return None

    def getBegin(self):
        return None

    def getEnd(self):
        return None

    def getFivePrime(self):
        # On the direct strand the 5' side is the beginning.
        return self.getBegin()

    def getThreePrime(self):
        return self.getEnd()

    begin = property(getBegin,None,None,"beginning position of the location")
    end   = property(getEnd,None,None,"ending position of the location")
    fivePrime=property(getFivePrime,None,None,"5' position of the location")
    threePrime=property(getThreePrime,None,None,"3' position of the location")

    def __abs__(self):
        # abs(loc) returns the location projected onto the direct strand.
        assert self.isDirect() is not None,"Abs operator cannot be applied on non oriented location"
        if self.isDirect():
            return self
        else:
            return ComplementLocation(self).simplify()

    def __cmp__(self,y):
        # Python 2 ordering: by begin position, direct strand first on ties.
        if self.begin < y.begin:
            return -1
        if self.begin > y.begin:
            return 1
        if self.isDirect() == y.isDirect():
            return 0
        if self.isDirect() and not y.isDirect():
            return -1
        return 1
class SimpleLocation(Location):
    """
    A simple location is describe a continuous region of
    a sequence define by a C{begin} and a C{end} position.
    """

    def __init__(self,begin,end):
        '''
        Build a new C{SimpleLocation} instance. Valid
        position are define on M{[1,N]} with N the length
        of the sequence.

        @param begin: start position of the location
        @type begin: int
        @param end: end position of the location
        @type end: int
        '''
        assert begin > 0 and end > 0
        self._begin = begin
        self._end = end
        # before/after record the EMBL/GenBank '<' and '>' partial-location
        # markers (see __str__ and isFullLength).
        self._before=False
        self._after=False

    def _extractSequence(self,sequence):
        # NOTE(review): the first test uses '<' while the second uses '<=',
        # so a location whose begin equals len(sequence) is rejected --
        # confirm whether this asymmetry is intentional.
        assert ( self._begin < len(sequence)
                 and self._end <= len(sequence)), \
            "Sequence length %d is too short" % len(sequence)
        return sequence[self._begin-1:self._end]

    def _extractQuality(self,sequence):
        # Same bounds check as _extractSequence, applied to the quality
        # vector slice.
        assert ( self._begin < len(sequence)
                 and self._end <= len(sequence)), \
            "Sequence length %d is too short" % len(sequence)
        return sequence.quality[self._begin-1:self._end]

    def isDirect(self):
        return True

    def isSimple(self):
        return True

    def isFullLength(self):
        # Full length unless a '<' or '>' partial marker is present.
        return not (self.before or self.after)

    def simplify(self):
        # A one-base range degenerates to a PointLocation.
        if self._begin == self._end:
            return PointLocation(self._begin)
        else:
            return self

    def needNucleic(self):
        return False

    def __str__(self):
        before = {True:'<',False:''}[self.before]
        after  = {True:'>',False:''}[self.after]
        return "%s%d..%s%d" % (before,self._begin,after,self._end)

    def shift(self,s):
        # Translate the location by s positions (s may be negative but
        # must keep the begin position strictly positive).
        assert (self._begin + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        return SimpleLocation(self._begin + s, self._end + s)

    def _getglocpos(self):
        return (self.begin,self.end)

    def getGloc(self):
        positions = ','.join([str(x) for x in self._getglocpos()])
        return "(%s,%s)" % ({True:'T',False:'F'}[self.isDirect()],
                            positions)

    def getBegin(self):
        return self._begin

    def getEnd(self):
        return self._end

    begin = property(getBegin,None,None,"beginning position of the location")
    end   = property(getEnd,None,None,"ending position of the location")

    def getBefore(self):
        return self._before

    def getAfter(self):
        return self._after

    def setBefore(self,value):
        assert isinstance(value, bool)
        self._before=value

    def setAfter(self,value):
        assert isinstance(value, bool)
        self._after=value

    before=property(getBefore,setBefore,None)
    after =property(getAfter,setAfter,None)
class PointLocation(Location):
    """
    A point location describes a location on a sequence
    limited to a single position
    """

    def __init__(self,position):
        # Positions are 1-based, as in EMBL/GenBank feature tables.
        assert position > 0
        self._pos=position

    def _extractSequence(self,sequence):
        # Bug fix: this method referenced self._end, an attribute that
        # PointLocation never defines (AttributeError at runtime); the
        # single stored position is self._pos.
        assert self._pos <= len(sequence), \
            "Sequence length %d is too short" % len(sequence)
        return sequence[self._pos-1]

    def _extractQuality(self,sequence):
        # Bug fix: self._end -> self._pos, and slice the quality vector
        # (as SimpleLocation._extractQuality does) rather than the
        # sequence itself, so CompositeLocation can concatenate the
        # resulting quality values.
        assert self._pos <= len(sequence), \
            "Sequence length %d is too short" % len(sequence)
        return sequence.quality[self._pos-1:self._pos]

    def isDirect(self):
        return True

    def isSimple(self):
        return True

    def isFullLength(self):
        # A single position carries no partial-location markers.
        return True

    def simplify(self):
        return self

    def needNucleic(self):
        return False

    def shift(self,s):
        # Translate the position by s (must stay strictly positive).
        assert (self._pos + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        return PointLocation(self._pos + s)

    def _getglocpos(self):
        return (self._pos,self._pos)

    def getBegin(self):
        return self._pos

    def getEnd(self):
        return self._pos

    begin = property(getBegin,None,None,"beginning position of the location")
    end   = property(getEnd,None,None,"ending position of the location")

    def __str__(self):
        return str(self._pos)
class CompositeLocation(Location):
    """
    A location built by the C{join} operator from several sub-locations.
    """

    def __init__(self,locations):
        self._locs = tuple(locations)

    def _extractSequence(self,sequence):
        # Concatenate the text extracted by each sub-location, in order.
        seq = ''.join([x._extractSequence(sequence)
                       for x in self._locs])
        return seq

    def _extractQuality(self,sequence):
        # Concatenate the quality values of each sub-location.
        rep=array.array('d',[])
        for x in self._locs:
            rep.extend(x._extractQuality(sequence))
        return rep

    def isDirect(self):
        # True/False only when every sub-location agrees; None for a mix.
        hasDirect,hasReverse = reduce(lambda x,y: (x[0] or y,x[1] or not y),
                                      (z.isDirect() for z in self._locs),(False,False))
        if hasDirect and not hasReverse:
            return True
        if hasReverse and not hasDirect:
            return False
        return None

    def isSimple(self):
        return False

    def simplify(self):
        # A join of one element is that element; a join made only of
        # complements is rewritten as the complement of the reversed join.
        if len(self._locs)==1:
            return self._locs[0]
        rep = CompositeLocation(x.simplify() for x in self._locs)
        if reduce(lambda x,y : x and y,
                  (isinstance(z, ComplementLocation)
                   for z in self._locs)):
            rep = ComplementLocation(CompositeLocation(x._loc.simplify()
                                                       for x in rep._locs[::-1]))
        return rep

    def isFullLength(self):
        # Full length only when every sub-location is.
        return reduce(lambda x,y : x and y, (z.isFullLength() for z in self._locs),1)

    def needNucleic(self):
        # Bug fix: the generator previously yielded the *bound method*
        # z.needNucleic (always truthy) instead of calling it, so this
        # method returned a truthy value for any non-empty join.
        return reduce(lambda x,y : x or y,
                      (z.needNucleic() for z in self._locs),
                      False)

    def _getglocpos(self):
        return reduce(lambda x,y : x + y,
                      (z._getglocpos() for z in self._locs))

    def getBegin(self):
        return min(x.getBegin() for x in self._locs)

    def getEnd(self):
        return max(x.getEnd() for x in self._locs)

    def shift(self,s):
        # Translate every sub-location by s positions.
        assert (self.getBegin() + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        return CompositeLocation(x.shift(s) for x in self._locs)

    begin = property(getBegin,None,None,"beginning position of the location")
    end   = property(getEnd,None,None,"ending position of the location")

    def __str__(self):
        return "join(%s)" % ','.join([str(x)
                                      for x in self._locs])
class ComplementLocation(Location):
    """
    A location on the reverse strand: the wrapped location is extracted
    and reverse-complemented (IUPAC-aware).
    """

    # IUPAC nucleotide complement table; unknown characters map to 'n'.
    _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
           'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
           's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
           'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
           '-': '-'}

    def __init__(self,location):
        self._loc = location

    def _extractSequence(self,sequence):
        # Extract the wrapped region, then reverse-complement it.
        seq = self._loc._extractSequence(sequence)
        seq = ''.join([ComplementLocation._comp.get(x.lower(),'n') for x in seq[::-1]])
        return seq

    def _extractQuality(self,sequence):
        # Bug fix: previously returned the quality of the *whole* sequence
        # reversed, ignoring the wrapped sub-location; extract the wrapped
        # location's quality and reverse it to match the
        # reverse-complemented sequence returned by _extractSequence.
        return self._loc._extractQuality(sequence)[::-1]

    def isDirect(self):
        return False

    def isSimple(self):
        return self._loc.isSimple()

    def isFullLength(self):
        return self._loc.isFullLength()

    def simplify(self):
        # complement(complement(x)) == x
        if isinstance(self._loc, ComplementLocation):
            return self._loc._loc.simplify()
        else:
            return self

    def needNucleic(self):
        # Complementing only makes sense on nucleic sequences.
        return True

    def __str__(self):
        return "complement(%s)" % self._loc

    def shift(self,s):
        assert (self.getBegin() + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        return ComplementLocation(self._loc.shift(s))

    def _getglocpos(self):
        return self._loc._getglocpos()

    def getBegin(self):
        return self._loc.getBegin()

    def getEnd(self):
        return self._loc.getEnd()

    def getFivePrime(self):
        # On the reverse strand the 5' side is the end position.
        return self.getEnd()

    def getThreePrime(self):
        return self.getBegin()

    begin = property(getBegin,None,None,"beginning position of the location")
    end   = property(getEnd,None,None,"ending position of the location")
    fivePrime=property(getFivePrime,None,None,"5' position of the location")
    threePrime=property(getThreePrime,None,None,"3' position of the location")
#
# Internal functions used for location parsing
#

def __sublocationIterator(text):
    '''
    Split a location string on the commas that sit outside any
    parentheses, yielding one sub-location string at a time
    (e.g. 'join(1..2,3..4),7' yields 'join(1..2,3..4)' then '7').
    '''
    current = []
    depth = 0
    for char in text:
        # Unbalanced closing parenthesis seen earlier.
        assert depth>=0,"Misformated location : %s" % text
        if char==',' and depth == 0:
            # Top-level separator: emit the accumulated sub-location.
            assert current,"Misformated location : %s" % text
            yield ''.join(current)
            current = []
            continue
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
        current.append(char)
    # The final sub-location must be non-empty and all parentheses closed.
    assert current and depth==0,"Misformated location : %s" % text
    yield ''.join(current)
#
# Internal functions used for location parsing
#

# Matches a simple range '<from..>to' (partial markers optional) or a
# bare position 'from'.
__simplelocparser = re.compile('(?P<before><?)(?P<from>[0-9]+)(\.\.(?P<after>>?)(?P<to>[0-9]+))?')

def __locationParser(text):
    # Recursive-descent parser for EMBL/GenBank location strings:
    # join(...) -> CompositeLocation, complement(...) -> ComplementLocation,
    # 'a..b' -> SimpleLocation, 'a' -> PointLocation.
    text=text.strip()
    if text[0:5]=='join(':
        assert text[-1]==')',"Misformated location : %s" % text
        return CompositeLocation(__locationParser(sl) for sl in __sublocationIterator(text[5:-1]))
    elif text[0:11]=='complement(':
        assert text[-1]==')',"Misformated location : %s" % text
        subl = tuple(__locationParser(sl) for sl in __sublocationIterator(text[11:-1]))
        # Several comma-separated sub-locations inside complement() are
        # treated as an implicit join.
        if len(subl)>1:
            subl = CompositeLocation(subl)
        else:
            subl = subl[0]
        return ComplementLocation(subl)
    else:
        data = __simplelocparser.match(text)
        assert data is not None,"Misformated location : %s" % text
        data = data.groupdict()
        if not data['to'] :
            sl = PointLocation(int(data['from']))
        else:
            sl = SimpleLocation(int(data['from']),int(data['to']))
        # Record the '<' / '>' partial-location markers.
        sl.before=data['before']=='<'
        sl.after=data['after']=='>'
        return sl
def locationGenerator(locstring):
    '''
    Parse a location string as present in genbank or embl file.

    @param locstring: string description of the location in embl/gb format
    @type locstring: str

    @return: a Location instance
    @rtype: C{Location} subclass instance
    '''
    # Thin public wrapper around the module-private recursive parser.
    return __locationParser(locstring)
# Accession-like token immediately followed by ':' (e.g. 'D28156.1:').
_matchExternalRef = re.compile('[A-Za-z0-9_|]+(\.[0-9]+)?(?=:)')

def extractExternalRefs(locstring):
    '''
    When a location describe external references (ex: D28156.1:1..>1292)
    separate the external reference part of the location and the location
    by itself.

    @param locstring: text representation of the location.
    @type locstring: str

    @return: a tuple with a set of string describing accession number
             of the referred sequences and a C{Location} instance.
    @rtype: tuple(set,Location)
    '''
    m = set(x.group() for x in _matchExternalRef.finditer(locstring))
    # Bug fix: with no external reference the old code built the pattern
    # ':' and stripped *every* colon from the location string; only strip
    # the accession prefixes when at least one reference was found.
    if m:
        clean = re.compile(':|'.join([re.escape(x) for x in m])+':')
        locstring = clean.sub('',locstring)
    cloc = locationGenerator(locstring)

    return m,cloc

Binary file not shown.

View File

@ -0,0 +1,177 @@
from obitools.location import Location,locationGenerator
import logging
import re
# One feature = a first line tagged 'FT' (EMBL) or starting with spaces
# (GenBank) followed by its continuation lines.
# NOTE(review): the exact space counts in these patterns may have been
# altered by whitespace collapsing when this file was copied -- verify
# against the EMBL (21-column) / GenBank (feature key at column 6) layouts.
_featureMatcher = re.compile('^(FT| ) [^ ].+\n((FT| ) .+\n)+',re.M)
_featureCleaner = re.compile('^FT',re.M)

def textFeatureIterator(fttable):
    '''
    Iterate through a textual description of a feature table in a genbank
    or embl format. Return at each step a text representation of each individual
    feature composing the table.

    @param fttable: a string corresponding to the feature table of a genbank
                    or an embl entry
    @type fttable: C{str}

    @return: an iterator on str
    @rtype: iterator

    @see: L{ftParser}
    '''
    for m in _featureMatcher.finditer(fttable):
        t = m.group()
        # Normalize the EMBL 'FT' prefix to spaces so both formats share
        # the same downstream parsing code.
        t = _featureCleaner.sub('  ',t)
        yield t
# A qualifier starts with '/' at column 22 and may continue on following
# 21-space-indented lines that do not start a new qualifier.
_qualifierMatcher = re.compile('(?<=^ {21}/).+(\n {21}[^/].+)*',re.M)
_qualifierCleanner= re.compile("^ +",re.M)

def qualifierIterator(qualifiers):
    '''
    Parse a textual description of a feature in embl or genbank format
    as returned by the textFeatureIterator iterator and iterate through
    the key, value qualified defining this location.

    @param qualifiers: substring containing qualifiers
    @type qualifiers: str

    @return: an iterator on tuple (key,value), where keys are C{str}
    @rtype: iterator
    '''
    for m in _qualifierMatcher.finditer(qualifiers):
        t = m.group()
        t = _qualifierCleanner.sub('',t)
        t = t.split('=',1)
        if len(t)==1:
            # Valueless qualifier (e.g. /pseudo).
            t = (t[0],None)
        else:
            # /translation values keep no line breaks at all; other
            # multi-line values get their newlines replaced by spaces.
            if t[0]=='translation':
                value = t[1].replace('\n','')
            else:
                value = t[1].replace('\n',' ')
            try:
                # Convert quoted strings / numbers to Python values.
                # SECURITY NOTE(review): eval() on text read from the data
                # file -- a crafted entry could execute arbitrary code;
                # consider ast.literal_eval instead.
                value = eval(value)
            except:
                # Keep the raw string when it is not a Python literal.
                pass
            t = (t[0],value)
        yield t
# Feature key at column 6, location text from column 22 up to the first
# qualifier, qualifier block from the first '/" line onward.
_ftmatcher  = re.compile('(?<=^ {5})\S+')
_locmatcher = re.compile('(?<=^.{21})[^/]+',re.DOTALL)
_cleanloc   = re.compile('[\s\n]+')
_qualifiersMatcher = re.compile('^ +/.+',re.M+re.DOTALL)

def ftParser(feature):
    # Split one textual feature (see textFeatureIterator) into its three
    # parts: feature type, location string, raw qualifier block.
    fttype = _ftmatcher.search(feature).group()
    location=_locmatcher.search(feature).group()
    # The location may span several lines; strip all whitespace.
    location=_cleanloc.sub('',location)
    qualifiers=_qualifiersMatcher.search(feature)
    if qualifiers is not None:
        qualifiers=qualifiers.group()
    else:
        qualifiers=""
        logging.debug("Qualifiers regex not matching on \n=====\n%s\n========" % feature)

    return fttype,location,qualifiers
class Feature(dict,Location):
    """
    A feature table entry: a qualifier dictionary (key -> list of values)
    combined with the sequence C{Location} it annotates.
    """

    def __init__(self,type,location):
        self._fttype=type
        self._loc=location

    def getFttype(self):
        return self._fttype

    def extractSequence(self,sequence,withQualifier=False):
        # Delegate to the location; optionally copy the qualifiers into
        # the extracted sequence's info dictionary.
        seq = self._loc.extractSequence(sequence)
        if withQualifier:
            seq.getInfo().update(self)
        return seq

    def isDirect(self):
        return self._loc.isDirect()

    def isSimple(self):
        return self._loc.isSimple()

    def isFullLength(self):
        return self._loc.isFullLength()

    def simplify(self):
        # Simplify the location but keep type and qualifiers.
        f = Feature(self._fttype,self._loc.simplify())
        f.update(self)
        return f

    def locStr(self):
        return str(self._loc)

    def needNucleic(self):
        return self._loc.needNucleic()

    def __str__(self):
        return repr(self)

    def __repr__(self):
        return str((self.ftType,str(self._loc),dict.__repr__(self)))

    def __cmp__(self,y):
        # Order features by their location (y exposes begin/isDirect too).
        return self._loc.__cmp__(y)

    def _getglocpos(self):
        return self._loc._getglocpos()

    ftType = property(getFttype, None, None, "Feature type name")

    def shift(self,s):
        # Translate the feature's location, keeping type and qualifiers.
        assert (self.getBegin() + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        f = Feature(self._fttype,self._loc.shift(s))
        f.update(self)
        return f

    def getBegin(self):
        return self._loc.getBegin()

    def getEnd(self):
        return self._loc.getEnd()

    begin = property(getBegin,None,None,"beginning position of the location")
    end   = property(getEnd,None,None,"ending position of the location")
def featureFactory(featureDescription):
    '''
    Build a L{Feature} from one textual feature description (as yielded
    by L{textFeatureIterator}): parse type/location/qualifiers, then
    accumulate each qualifier value in a list under its key.
    '''
    fttype,location,qualifiers = ftParser(featureDescription)
    location = locationGenerator(location)
    feature = Feature(fttype,location)
    # Keep the raw text for debugging / round-tripping.
    feature.raw = featureDescription

    for k,v in qualifierIterator(qualifiers):
        # A qualifier key may occur several times; collect values in a list.
        feature.setdefault(k,[]).append(v)

    return feature
def featureIterator(featureTable,skipError=False):
    '''
    Iterate over the parsed features of a raw feature table.

    :param featureTable: raw feature-table text
    :param skipError: when True, entries that fail to parse are logged
                      at debug level and skipped instead of raising
    :raises AssertionError: the original parse error, when skipError is False
    '''
    for tft in textFeatureIterator(featureTable):
        try:
            feature = featureFactory(tft)
        except AssertionError,e:
            # parse failure: either propagate or trace-and-skip
            logging.debug("Parsing error on feature :\n===============\n%s\n===============" % tft)
            if not skipError:
                raise e
            logging.debug("\t===> Error skipped")
            continue
        yield feature

View File

@ -0,0 +1,265 @@
from obitools.ecopcr.options import addTaxonomyFilterOptions,\
loadTaxonomyDatabase
from obitools.graph import UndirectedGraph
from obitools.align import lenlcs,isLCSReachable
from obitools.graph.algorithms.component import componentIterator
from obitools.utils.bioseq import uniqSequence
from obitools.utils import progressBar
import math
import sys
from obitools.graph.rootedtree import RootedTree
def average(x):
    """
    Weighted mean of an iterable of (value, weight) pairs.

    @param x: iterable of (value, weight) couples
    @return: (weighted mean as float, total weight)
    """
    pairs = list(x)
    weighted_sum = sum(value * weight for value, weight in pairs)
    total_weight = sum(pair[1] for pair in pairs)
    return (float(weighted_sum) / float(total_weight), total_weight)
def minimum(x):
    """
    Smallest value of an iterable of (value, weight) pairs.

    @param x: iterable of (value, weight) couples
    @return: (minimum value as float, total weight)
    """
    pairs = list(x)
    smallest = min(pair[0] for pair in pairs)
    total_weight = sum(pair[1] for pair in pairs)
    return (float(smallest), total_weight)
def ecoPCRReader(entries, options):
    '''
    Load ecoPCR sequence records and split them into ingroup/outgroup.

    Only records whose taxid is known to the taxonomy and whose lineage
    carries at least the species, genus and family ranks are kept.

    @param entries: iterable of ecoPCR sequence records (dict-like,
                    optionally carrying a 'taxid' key)
    @param options: parsed command-line options; must provide a loadable
                    taxonomy plus ``ingroup`` (taxid) and ``rankthresold``
    @return: dict with keys 'ingroup' and 'outgroup' (record lists) and
             'ranks' (set of rank ids present in more than
             options.rankthresold of the ingroup sequences)
    '''
    taxonomy = loadTaxonomyDatabase(options)
    # numeric identifiers of the ranks used for filtering
    norankid = options.taxonomy.findRankByName('no rank')
    speciesid = options.taxonomy.findRankByName('species')
    genusid = options.taxonomy.findRankByName('genus')
    familyid = options.taxonomy.findRankByName('family')
    minrankseq = set([speciesid, genusid, familyid])

    # per-rank count of sequences annotated at that rank
    usedrankid = {}

    ingroup = []
    outgroup = []

    for s in entries:
        if 'taxid' in s:
            taxid = s['taxid']
            if taxid in taxonomy:
                # collect every informative (non-'no rank') rank of the lineage
                allrank = set()
                for p in options.taxonomy.parentalTreeIterator(taxid):
                    if p[1] != norankid:
                        allrank.add(p[1])
                # keep only sequences resolved at species, genus AND family
                if len(minrankseq & allrank) == 3:
                    for r in allrank:
                        usedrankid[r] = usedrankid.get(r, 0) + 1
                    # ingroup membership = descendant of options.ingroup
                    if taxonomy.isAncestor(options.ingroup, taxid):
                        ingroup.append(s)
                    else:
                        outgroup.append(s)

    # ranks represented in a sufficient fraction of the ingroup
    keptrank = set(r for r in usedrankid
                   if float(usedrankid[r]) / float(len(ingroup)) > options.rankthresold)

    return {'ingroup': ingroup,
            'outgroup': outgroup,
            'ranks': keptrank
            }
def buildSimilarityGraph(dbseq, ranks, taxonomy, dcmax=5):
    '''
    Build an undirected graph linking sequences whose edit distance,
    estimated from the length of their longest common subsequence (LCS),
    is at most dcmax.

    @param dbseq: indexable collection of sequences, each carrying a 'taxid'
    @param ranks: iterable of rank ids used to annotate every node
    @param taxonomy: taxonomy used to resolve the taxon at each rank
    @param dcmax: maximum LCS-based distance for which an edge is created
    @return: an UndirectedGraph with one node per sequence
    '''
    ldbseq = len(dbseq)
    pos = 1
    # digit width of the counters shown in the progress-bar header
    digit = int(math.ceil(math.log10(ldbseq)))
    header = "Alignment : %%0%dd x %%0%dd -> %%0%dd " % (digit, digit, digit)
    # number of pairwise comparisons (upper triangle incl. diagonal)
    aligncount = ldbseq * (ldbseq + 1) / 2
    edgecount = 0
    print >>sys.stderr
    progressBar(1, aligncount, True, "Alignment : %s x %s -> %s " % ('-' * digit, '-' * digit, '0' * digit))

    sim = UndirectedGraph()

    # one node per sequence, annotated with its taxon at every kept rank
    i = 0
    for s in dbseq:
        taxid = s['taxid']
        rtaxon = dict((rid, taxonomy.getTaxonAtRank(taxid, rid))
                      for rid in ranks)
        sim.addNode(i, seq=s, taxid=taxid, rtaxon=rtaxon)
        i += 1

    # aligner = LCS()

    # all-against-all comparison of the upper triangle
    for is1 in xrange(ldbseq):
        s1 = dbseq[is1]
        ls1 = len(s1)
        # aligner.seqA=s1
        for is2 in xrange(is1 + 1, ldbseq):
            s2 = dbseq[is2]
            ls2 = len(s2)
            lm = max(ls1, ls2)
            lcsmin = lm - dcmax
            # cheap length-based bound before running the full LCS computation
            if isLCSReachable(s1, s2, lcsmin):
                llcs, lali = lenlcs(s1, s2)
                ds1s2 = lali - llcs
                if ds1s2 <= dcmax:
                    sim.addEdge(node1=is1, node2=is2, ds1s2=ds1s2, label=ds1s2)
                    edgecount += 1
        progressBar(pos, aligncount, head=header % (is1, is2, edgecount))
        # advance by the number of comparisons done for row is1
        pos += (ldbseq - is1 - 1)

    return sim
def buildTsr(component):
    '''
    Tally, for every considered taxonomic rank, the taxa present in a
    connected component.

    :param component: the analyzed connected component
    :type component: :py:class:`UndirectedGraph`
    :return: a dictionary indexed by rankid containing a `dict` indexed
             by taxid giving the number of sequences for this taxid
    :rtype: `dict` of `dict` of `int`
    '''
    counts = {}
    for node in component:
        rtaxon = node['rtaxon']
        for rank in rtaxon:
            taxid = rtaxon[rank]
            if taxid is None:
                continue
            perrank = counts.setdefault(rank, {})
            perrank[taxid] = perrank.get(taxid, 0) + 1
    return counts
def edgeDistSelector(dcmax):
    """
    Return a predicate keeping only edges whose 'ds1s2' distance
    attribute does not exceed *dcmax*.
    """
    def _keep(edge):
        return edge['ds1s2'] <= dcmax
    return _keep
def distanceOfConfusion(simgraph, dcmax=5, aggregate=average):
    '''
    Estimate, for every taxon present in the similarity graph, the
    distance at which it stops being confused with another taxon.

    The graph is repeatedly split into connected components using only
    edges of distance <= d, with d decreasing from dcmax to 0.  A taxon
    whose component contains no other taxon at the same rank is recorded
    as identified at distance d+1; taxa still mixed at d == 0 are
    recorded with distance 0.

    @param simgraph: similarity graph built by buildSimilarityGraph
    @param dcmax: largest edge distance initially considered
    @param aggregate: reducer applied to the (distance, weight) pairs
                      collected per taxon (e.g. average or minimum)
    @return: dict mapping taxid to the aggregate() result
    '''
    alltaxa = set()
    for n in simgraph:
        alltaxa |= set(n['rtaxon'].values())
    taxacount = len(alltaxa)
    result = {}
    # one-element list: mutable cell shared with the nested function
    pos = [1]
    header = "Component : %-5d Identified : %-8d "
    progressBar(1, taxacount, True, header % (0, 0))

    def _idc(cc, dcmax):
        # split every subgraph into components linked by edges <= dcmax
        composante = []
        for x in cc:
            composante.extend(simgraph.subgraph(c)
                              for c in componentIterator(x,
                                                         edgePredicat=edgeDistSelector(dcmax)))

        good = set()      # (taxid, weight) alone at their rank in a component
        bad = {}          # taxid -> weight for taxa still mixed with others
        complexe = []     # components that need a finer (smaller-d) split

        for c in composante:
            tsr = buildTsr(c)
            newbad = False
            for r in tsr:
                if len(tsr[r]) == 1:
                    # a single taxon at this rank: unambiguous at dcmax+1
                    taxid = tsr[r].keys()[0]
                    good.add((taxid, tsr[r][taxid]))
                else:
                    newbad = True
                    for taxid in tsr[r]:
                        bad[taxid] = bad.get(taxid, 0) + tsr[r][taxid]
            if newbad:
                complexe.append(c)

        # good = good - bad

        for taxid, weight in good:
            if taxid not in result:
                result[taxid] = []
            result[taxid].append((dcmax + 1, weight))

        progressBar(pos[0], taxacount, False, header % (len(composante), pos[0]))
        pos[0] = len(result)

        if dcmax > 0:
            # refine the still-ambiguous components with a stricter distance
            dcmax -= 1
            _idc(complexe, dcmax)
        else:
            # distance 0 reached: remaining taxa stay confused
            for taxid in bad:
                if taxid not in result:
                    result[taxid] = []
                result[taxid].append((0, bad[taxid]))
            progressBar(pos[0], taxacount, False, header % (len(composante), pos[0]))
            pos[0] = len(result)

    _idc([simgraph], dcmax)

    for taxid in result:
        result[taxid] = aggregate(result[taxid])

    return result
def propagateDc(tree, node=None, aggregate=min):
    """
    Propagate leaf 'dc' values up the tree: every node receives the
    aggregate (min by default) of the 'dc' of all its leaves, and the
    same rule is applied recursively to each of its children.

    Called without a node, the propagation starts at the first root.
    """
    if node is None:
        node = tree.getRoots()[0]
    node['dc'] = aggregate(leaf['dc'] for leaf in node.leavesIterator())
    for child in node:
        propagateDc(tree, child, aggregate)
def confusionTree(distances, ranks, taxonomy, aggregate=min, bsrank='species', dcmax=1):
    '''
    Build a taxonomic tree whose nodes are annotated with confusion
    distances ('dc') and a per-node resolution score ('bs').

    @param distances: dict taxid -> (distance, weight), as produced by
                      distanceOfConfusion after aggregation
    @param ranks: taxonomic rank ids to consider
    @param taxonomy: taxonomy providing names, ranks and lineages
    @param aggregate: unused here; kept for interface compatibility
    @param bsrank: rank whose nodes are scored by Bs (default 'species')
    @param dcmax: distance threshold separating resolved from confused taxa
    @return: a RootedTree rooted at the common ancestor of all taxa
    '''
    def Bs(node, rank, dcmax):
        # fraction of bsrank-level descendants whose dc reaches dcmax
        n = len(node)
        if n:
            g = [int(x['dc'] >= dcmax) for x in node.subgraphIterator() if x['rank'] == bsrank]
            n = len(g)
            g = sum(g)
            bs = float(g) / float(n)
            node['bs'] = bs
            node['bs_label'] = "%3.2f (%d)" % (bs, n)
        for n in node:
            Bs(n, rank, dcmax)

    tree = RootedTree()
    ranks = set(ranks)
    tset = set(distances)

    # one node per taxon with its confusion-distance annotations
    for taxon in distances:
        tree.addNode(taxon, rank=taxonomy.getRank(taxon),
                     name=taxonomy.getScientificName(taxon),
                     dc=float(distances[taxon][0]),
                     n=distances[taxon][1],
                     dc_label="%4.2f (%d)" % (float(distances[taxon][0]), distances[taxon][1])
                     )

    # link every taxon to its closest ancestor also present in distances
    for taxon in distances:
        piter = taxonomy.parentalTreeIterator(taxon)
        taxon = piter.next()
        for parent in piter:
            if taxon[0] in tset and parent[0] in distances:
                tset.remove(taxon[0])
                tree.addEdge(parent[0], taxon[0])
            taxon = parent

    root = tree.getRoots()[0]
    Bs(root, bsrank, dcmax)

    return tree

View File

@ -0,0 +1,34 @@
'''
Created on 30 oct. 2011
@author: coissac
'''
from obitools.ecopcr.options import addTaxonomyDBOptions
def addMetabarcodingOption(optionManager):
    """
    Register the metabarcoding-specific command line options on
    *optionManager*, on top of the taxonomy database options.

    @param optionManager: an optparse.OptionParser-like option manager
    """
    addTaxonomyDBOptions(optionManager)

    optionManager.add_option('--dcmax',
                             action="store", dest="dc",
                             metavar="###",
                             type="int",
                             default=0,
                             help="Maximum confusion distance considered")

    optionManager.add_option('--ingroup',
                             action="store", dest="ingroup",
                             metavar="###",
                             type="int",
                             default=1,
                             # fixed garbled wording ("delimitation the in group")
                             help="ncbi taxid delimiting the ingroup")

    optionManager.add_option('--rank-thresold',
                             action="store", dest="rankthresold",
                             metavar="#.##",
                             type="float",
                             default=0.5,
                             # fixed typo ("concidering")
                             help="minimum fraction of the ingroup sequences "
                                  "for considering the rank")

View File

@ -0,0 +1,28 @@
from obitools.obischemas import kb
__connection__ = None
def initConnection(options):
    """
    (Re)open the module-wide database connection from command line options.

    Recognized option attributes (used only when present and not None):
    dbname, dbhost, dbuser, dbpassword.  options.autocommit is copied
    onto the new connection.
    """
    global __connection__

    param = {}
    # map option attributes to the keyword names expected by kb.getConnection
    option_to_key = (("dbname", "database"),
                     ("dbhost", "host"),
                     ("dbuser", "username"),
                     ("dbpassword", "password"))
    for attr, key in option_to_key:
        value = getattr(options, attr, None)
        if value is not None:
            param[key] = value

    __connection__ = kb.getConnection(**param)
    __connection__.autocommit = options.autocommit
def getConnection(options=None):
    """
    Return the shared database connection, reinitializing it first when
    *options* are supplied.

    @raise AssertionError: if no connection has ever been initialized
    """
    global __connection__

    if options is not None:
        initConnection(options)

    assert __connection__ is not None, "database connection is not initialized"
    return __connection__

View File

@ -0,0 +1,55 @@
"""
kb package is devoted to manage access to postgresql database from python
script
"""
class Connection(object):
    """
    Abstract base class of database connections.

    Concrete backends must override cursor(), commit() and rollback();
    instantiating or using this class directly raises RuntimeError.
    """

    def __init__(self):
        raise RuntimeError('pyROM.KB.Connection is an abstract class')

    def cursor(self):
        raise RuntimeError('pyROM.KB.Connection.cursor is an abstract function')

    def commit(self):
        raise RuntimeError('pyROM.KB.Connection.commit is an abstract function')

    def rollback(self):
        raise RuntimeError('pyROM.KB.Connection.rollback is an abstract function')

    def __call__(self, query):
        # shortcut: connection(query) == connection.cursor().execute(query)
        return self.cursor().execute(query)
class Cursor(object):
    """
    Abstract base class of database cursors.

    Concrete backends must override execute(); calling the cursor is an
    alias for execute().
    """

    def __init__(self, db):
        raise RuntimeError('pyROM.KB.Cursor is an abstract class')

    def execute(self, query):
        raise RuntimeError('pyROM.KB.Cursor.execute is an abstract function')

    __call__ = execute
_current_connection = None # Static variable used to store connection to KB
def getConnection(*args, **kargs):
    """
    Return a connection to the database.

    When called from the database backend no argument is needed.  The
    connection is cached in the module-level ``_current_connection``; a
    new one is created on the first call or whenever explicit connection
    arguments are supplied.
    """
    global _current_connection
    # Fixed: compare to None with 'is', not '==' (identity, not equality).
    if _current_connection is None or args or kargs:
        try:
            # inside the database backend a native connection is available
            from obischemas.kb import backend
            _current_connection = backend.Connection()
        except ImportError:
            # external script: go through the psycopg2 wrapper
            from obischemas.kb import extern
            _current_connection = extern.Connection(*args, **kargs)
    return _current_connection

View File

@ -0,0 +1,78 @@
"""
Module : KB.extern
Author : Eric Coissac
Date : 03/05/2004
Module wrapping psycopg interface module to allow connection
to a postgresql databases with the same interface from
backend and external script.
This module define a class usable from external script
"""
import psycopg2
import sys
from obischemas import kb
class Connection(kb.Connection):
    """
    Concrete connection wrapping psycopg2 for use from external scripts.
    """

    def __init__(self,*connectParam,**kconnectParam):
        """
        Store the connection parameters and open the psycopg2 connection.

        Positional arguments are interpreted as a DSN; keyword arguments
        are passed verbatim to psycopg2.connect.
        """
        if connectParam:
            # BUG FIX: was 'self.connectParam=={...}' — a no-op comparison
            # instead of an assignment, leaving the attribute undefined
            # and crashing on the connect() call below.
            # NOTE(review): connectParam is a tuple here; psycopg2 expects
            # a DSN string — confirm callers pass a single DSN argument.
            self.connectParam={'dsn':connectParam}
        else:
            self.connectParam=kconnectParam
        # removed leftover debug print of the parameters: it leaked the
        # database password to stdout
        self.db = psycopg2.connect(**(self.connectParam))

    def restart(self):
        """Try to reopen the connection, giving up after 1000 attempts."""
        ok=1
        while (ok and ok < 1000):
            try:
                self.db = psycopg2.connect(**self.connectParam)
            except:
                ok+=1
            else:
                ok=0

    def cursor(self):
        """Return a new Cursor, propagating the autocommit flag if set."""
        curs = Cursor(self.db)
        if hasattr(self,'autocommit') and self.autocommit:
            curs.autocommit = self.autocommit
        return curs

    def commit(self):
        self.db.commit()

    def rollback(self):
        if hasattr(self,'db'):
            self.db.rollback()

    def __del__(self):
        # roll back any pending transaction before the object goes away
        if hasattr(self,'db'):
            self.rollback()
class Cursor(kb.Cursor):
    '''
    Concrete cursor wrapping a psycopg2 cursor.  execute() returns the
    result set as a list of dictionaries keyed by column label.
    '''
    def __init__(self,db):
        # keep the connection too: autocommit mode must commit on it
        self.db = db
        self.curs = db.cursor()

    def execute(self,query):
        '''
        Run *query* and return its rows as a list of dicts (one per row,
        keyed by column label); statements without a result set yield [].

        @raise psycopg2.ProgrammingError: re-raised after echoing the query to stderr
        @raise psycopg2.IntegrityError: re-raised after echoing the query to stderr
        '''
        try:
            self.curs.execute(query)
            # in autocommit mode every statement is committed immediately
            if hasattr(self,'autocommit') and self.autocommit:
                self.db.commit()
        except psycopg2.ProgrammingError,e:
            print >>sys.stderr,"===> %s" % query
            raise e
        except psycopg2.IntegrityError,e:
            print >>sys.stderr,"---> %s" % query
            raise e
        try:
            # description is None for statements returning no rows,
            # which makes the iteration below raise TypeError
            label = [x[0] for x in self.curs.description]
            # Python 2 idiom: map(None, ...) zips labels with row values
            return [dict(map(None,label,y))
                    for y in self.curs.fetchall()]
        except TypeError:
            return []

View File

@ -0,0 +1,31 @@
def addConnectionOptions(optionManager):
    """
    Register the OBISchema database connection options (-d/-H/-U/-W/-A)
    on *optionManager*.

    @param optionManager: an optparse.OptionParser-like object
    """
    optionManager.add_option('-d','--dbname',
                             action="store", dest="dbname",
                             metavar="<DB NAME>",
                             type="string",
                             # fixed: missing space between the adjacent string
                             # literals produced "...containingtaxonomical data"
                             help="OBISchema database name containing "
                                  "taxonomical data")
    optionManager.add_option('-H','--host',
                             action="store", dest="dbhost",
                             metavar="<DB HOST>",
                             type="string",
                             help="host hosting OBISchema database")
    optionManager.add_option('-U','--user',
                             action="store", dest="dbuser",
                             metavar="<DB USER>",
                             type="string",
                             help="user for OBISchema database connection")
    optionManager.add_option('-W','--password',
                             action="store", dest="dbpassword",
                             metavar="<DB PASSWORD>",
                             type="string",
                             help="password for OBISchema database connection")
    optionManager.add_option('-A','--autocommit',
                             action="store_true",dest="autocommit",
                             default=False,
                             help="add commit action after each query")

0
obitools/obo/__init__.py Normal file
View File

View File

53
obitools/obo/go/parser.py Normal file
View File

@ -0,0 +1,53 @@
from obitools.obo.parser import OBOTerm
from obitools.obo.parser import OBOEntry
from obitools.obo.parser import stanzaIterator
from logging import debug
class GOEntry(OBOEntry):
    '''
    An entry of a GeneOntology .obo file: either the header (no stanza
    name) or a bracketed stanza.  Behaves like a dict, as OBOEntry does.
    '''
    pass
class GOTerm(OBOTerm):
    '''
    A GeneOntology 'Term' stanza.  Adds to OBOTerm the constraint that
    the term belongs to exactly one GO namespace.
    '''

    def __init__(self, stanza):
        # generic OBO validation happens in the OBOTerm constructor
        OBOTerm.__init__(self, stanza)
        assert 'namespace' in self and len(self['namespace'])==1, "An OBOTerm must belong to one of the cell_component, molecular_function or biological_process namespace"
def GOEntryFactory(stanza):
    '''
    Dispatcher of stanza.

    @param stanza: a stanza composed of several lines.
    @type stanza: text

    @return: a C{GOTerm} | C{OBOEntry} instance

    @note: 'Term' stanzas are promoted to GOTerm; every other stanza
    (including the header) stays a plain OBOEntry.
    '''
    if OBOEntry.parseStanzaName(stanza) == "Term":
        return GOTerm(stanza)
    return OBOEntry(stanza)
def GOEntryIterator(file):
    """Yield GOEntry/GOTerm objects parsed from an OBO stream."""
    for stanza in stanzaIterator(file):
        debug(stanza)
        yield GOEntryFactory(stanza)

707
obitools/obo/parser.py Normal file
View File

@ -0,0 +1,707 @@
from obitools.utils import skipWhiteLineIterator,multiLineWrapper
from obitools.utils import universalOpen
from obitools.format.genericparser import genericEntryIteratorGenerator
from logging import debug,warning
import re
#################################################################################
## Stanza preparation area ##
#################################################################################
class FileFormatError(Exception):
    '''
    Raised when an OBO file does not conform to the expected format.
    '''
    pass
_oboEntryIterator = genericEntryIteratorGenerator(endEntry='^ *$',
strip=True)
def stanzaIterator(inputfile):
    '''
    Iterate over the stanza of an OBO file (the basic units of the format).

    @param inputfile: a stream of strings from an opened OBO file.
    @type inputfile: a stream of strings

    @return: a stream of stanza
    @rtype: a stream of aggregated strings
    '''
    stream = universalOpen(inputfile)
    stream = multiLineWrapper(stream)
    return _oboEntryIterator(stream)
#################################################################################
## Trailing Modifiers treatment area ##
#################################################################################
class TrailingModifier(dict):
    '''
    A class object which inherits from the class dict. Trailing modifiers can be found
    at the end of TaggedValue objects when they exist.
    '''
    # matches the '{...}' block (optionally followed by a ' !' comment)
    # at the end of a tagged-value line
    _match_brace = re.compile('(?<=\ {)[^\]]*(\}) *( !|$)')

    def __init__(self,string):
        '''
        Parse the '{key=value, ...}' trailing-modifier block of *string*
        and load it into this dict; when no block is found the dict
        stays empty.
        '''
        ## search for trailing modifiers signals
        trailing_modifiers = TrailingModifier._match_brace.search(string)
        ## the trailing modifiers exist
        if trailing_modifiers:
            trailing_modifiers=trailing_modifiers.group(0).strip()
            # NOTE(review): leftover debug output (Python 2 print statement)
            print trailing_modifiers
            ## creates and feeds the dictionary of trailing modifiers
            # NOTE(review): group(0) still includes the closing brace and any
            # trailing ' !', so the last parsed value may carry '} !' — confirm
            dict.__init__(self,(x.strip().split('=',1) for x in trailing_modifiers.split(',')))
def trailingModifierFactory(string):
    '''
    Dispatcher of trailing modifiers.

    @param string: a string from a TaggedValue object with a trailing modifiers signal.
    @type string: string

    @return: a TrailingModifier instance when modifiers were parsed,
             None when the parsed dict is empty.
    '''
    modifiers = TrailingModifier(string)
    return modifiers if modifiers else None
#################################################################################
## TaggedValue treatment area ##
#################################################################################
class TaggedValue(object):
    '''
    A couple 'tag:value' of an OBOEntry.
    '''
    # value part: quoted or unquoted text up to an unescaped ' !' (comment),
    # ' {' (trailing modifiers) or the end of line
    _match_value = re.compile('(("(\\\\"|[^\"])*")|(\\\\"|[^\"]))*?( !| {|$)')
    # separates the trailing comment introduced by '!'
    _split_comment = re.compile('^!| !')
    # content of a double-quoted string (quotes excluded)
    _match_quotedString = re.compile('(?<=")(\\\\"|[^\"])*(?=")')
    # a '[...]' bracketed block (used for dbxref lists)
    _match_bracket = re.compile('\[[^\]]*\]')

    def __init__(self,line):
        '''
        Constructor of the class TaggedValue.

        @param line: a line of an OBOEntry composed of a tag and a value.
        @type line: string

        @note: The constructor separates tags from right terms. 'value' is extracted
        from right terms using a regular expression (value is at the beginning of the
        string, between quotes or not). Then, 'comment' is extracted from the rest of the
        string using another regular expression ('comment' is at the end of the string
        after a '!'. By default, 'comment' is set to None). Finally, 'trailing_modifiers'
        are extracted from the last string using another regular expression.
        The tag, the value, the comment and the trailing_modifiers are saved.
        '''
        debug("tagValueParser : %s" % line)
        ## by default :
        trailing_modifiers = None
        comment = None
        ## the tag is saved. 'rigth' is composed of the value, the comment and the trailing modifiers
        tag,rigth = line.split(':',1)
        ## the value is saved
        value = TaggedValue._match_value.search(rigth).group(0)
        debug("Extracted value : %s" % value)
        ## if there is a value AND a sign of a comment or trailing modifiers
        if value and value[-1] in '!{':
            lvalue = len(value)
            ## whatever it is a comment or trailing modifiers, it is saved into 'extra'
            extra = rigth[lvalue-1:].strip()
            ## a comment is extracted
            extra =TaggedValue._split_comment.split(extra,1)
            ## and saved if it exists
            if len(extra)==2:
                comment=extra[1].strip()
            ## trailing modifiers are extracted
            extra=extra[0]
            trailing_modifiers = trailingModifierFactory(extra)
            ## the value is cleaned of any comment or trailing modifiers signals
            value = value[0:-1]
        if tag=='use_term':
            tag='consider'
            # NOTE(review): raising here aborts __init__, so the renamed tag
            # above is never observable; warnings.warn was probably intended
            raise DeprecationWarning,"user_term is a deprecated tag, you should instead use consider"

        ## recording zone
        self.value =value.strip()
        self.tag = tag
        self.__doc__=comment
        self.trailing_modifiers=trailing_modifiers

    def __str__(self):
        return str(self.value)

    def __repr__(self):
        return '''"""%s"""''' % str(self)
class NameValue(TaggedValue):
    '''
    A couple 'name:value' inherited from the class TaggedValue. Used to manage name tags.
    '''

    def __init__(self, line):
        # A name line is trivial: everything after the first ':' is the
        # value; no comment or trailing-modifier parsing is performed,
        # so the TaggedValue constructor is deliberately bypassed.
        tag, remainder = line.split(':', 1)
        self.value = remainder.strip()
        self.tag = 'name'
        self.__doc__ = None
        self.trailing_modifiers = None
class DefValue(TaggedValue):
    '''
    A couple 'def:value' inherited from the class TaggedValue. Used to manage def tags.
    '''

    def __init__(self, line):
        '''
        Constructor of the class DefValue.

        @param line: a line of an OBOEntry composed of a tag named 'def' and a value.
        @type line: string

        @note: After generic TaggedValue parsing, the quoted definition
        text becomes the value of this object, and the bracketed dbxrefs
        found next to it are parsed and stored in self.dbxrefs.
        '''
        TaggedValue.__init__(self, line)
        # the definition is the quoted part of the generic value
        quoted = TaggedValue._match_quotedString.search(self.value).group(0)
        # drop the definition, then look for the '[...]' dbxref block
        remainder = self.value.replace(quoted, '')
        remainder = remainder.replace('  ', ' ')
        references = TaggedValue._match_bracket.search(remainder).group(0)
        # the value of a DefValue is the definition, not the generic value
        self.tag = 'def'
        self.value = quoted
        self.dbxrefs = xrefFactory(references)
class SynonymValue(TaggedValue):
    '''
    A couple 'synonym:value' inherited from the class TaggedValue. Used to manage
    synonym tags, exact_synonym tags, broad_synonym tags and narrow_synonym tags.
    '''
    # scope/type attributes: the text between the closing quote and the
    # optional '[' starting the dbxref list
    _match_scope = re.compile('(?<="")[^\[]*(?=\[|$)')

    def __init__(self,line):
        '''
        Constructor of the class SynonymValue.

        @param line: a line of an OBOEntry composed of a tag named 'synonym' or
        'exact_synonym' or 'broad_synonym' or 'narrow_synonym' and a value.
        @type line: string

        @note: SynonymValue is composed of a tag, a value, a scope, a list of types and
        dbxrefs.  The quoted definition becomes the value; the optional scope
        (RELATED by default) and synonym types are read from the attributes
        following the definition; the bracketed dbxrefs are parsed separately.
        Deprecated *_synonym tags map to tag 'synonym' with a fixed scope.
        '''
        ## use of the TaggedValue constructor
        TaggedValue.__init__(self, line)
        ## definition, which is quoted, is extracted from the standard value of a TaggedValue.
        definition = TaggedValue._match_quotedString.search(self.value).group(0)
        ## the standard value is cleaned of the definition.
        cleanvalue = self.value.replace(definition,'')
        cleanvalue = cleanvalue.replace('  ',' ')
        ## 1) attributes are searched into the rest of the standard value.
        ## 2) then they are stripped.
        ## 3) then they are split on every ' '.
        ## 4) finally they are ordered into a set.
        attributes = set(SynonymValue._match_scope.search(cleanvalue).group(0).strip().split())
        ## the scopes are the junction between the attributes and a set of specific terms.
        scopes = attributes & set(['RELATED','EXACT','BROAD','NARROW'])
        ## the types are the rest of the attributes.
        types = attributes - scopes
        ## this is a constraint of the OBO format
        assert len(scopes)< 2,"Only one synonym scope allowed"
        ## the scope of the SynonymValue is into scopes or set by default to RELATED
        if scopes:
            scope = scopes.pop()
        else:
            scope = 'RELATED'
        ## Specific rules are defined for the following tags :
        if self.tag == 'exact_synonym':
            raise DeprecationWarning,'exact_synonym is a deprecated tag use instead synonym tag'
            # NOTE(review): unreachable after the raise above — warnings.warn
            # was probably intended so the normalization below could run
            self.tag = 'synonym'
            scope = 'EXACT'
        if self.tag == 'broad_synonym':
            raise DeprecationWarning,'broad_synonym is a deprecated tag use instead synonym tag'
            # NOTE(review): unreachable after the raise above
            self.tag = 'synonym'
            scope = 'BROAD'
        if self.tag == 'narrow_synonym':
            raise DeprecationWarning,'narrow_synonym is a deprecated tag use instead synonym tag'
            # NOTE(review): unreachable after the raise above
            self.tag = 'synonym'
            scope = 'NARROW'
        if self.tag == 'systematic_synonym':
            #raise DeprecationWarning,'narrow_synonym is a deprecated tag use instead sysnonym tag'
            self.tag = 'synonym'
            scope = 'SYSTEMATIC'
        ## this is our own constraint. deprecated tags are not saved by this parser.
        assert self.tag =='synonym',"%s synonym type is not managed" % self.tag
        ## dbxrefs are searched into the rest of the standard value.
        dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0)
        ## recording zone
        ## the value of a SynonymValue is not the standard value but the definition.
        self.value = definition
        self.dbxrefs = xrefFactory(dbxrefs)
        self.scope = scope
        self.types = list(types)

    def __eq__(self,b):
        return ((self.value==b.value) and (self.dbxrefs==b.dbxrefs)
                and (self.scope==b.scope) and (self.types==b.types)
                and (self.__doc__==b.__doc__) and (self.tag==b.tag)
                and (self.trailing_modifiers==b.trailing_modifiers))

    def __hash__(self):
        # sum of member hashes folded into a 31-bit value
        # (Python 2: reduce is a builtin)
        return (reduce(lambda x,y:x+y,(hash(z) for z in [self.__doc__,
                                                         self.value,
                                                         frozenset(self.dbxrefs),
                                                         self.scope,
                                                         frozenset(self.types),
                                                         self.tag,
                                                         self.trailing_modifiers]),0)) % (2**31)
class XrefValue(TaggedValue):
    '''
    A couple 'xref:value' inherited from the class TaggedValue. Used to manage
    xref tags.
    '''
    def __init__(self,line):
        ## use of the TaggedValue constructor
        TaggedValue.__init__(self, line)
        ## use the same function as the dbxrefs
        self.value=xrefFactory(self.value)
        if self.tag in ('xref_analog','xref_unk'):
            raise DeprecationWarning,'%s is a deprecated tag use instead sysnonym tag' % self.tag
            # NOTE(review): unreachable — the raise above aborts before the
            # tag can be normalized; warnings.warn was probably intended
            self.tag='xref'
        ## this is our own constraint. deprecated tags are not saved by this parser.
        assert self.tag=='xref'
class RelationshipValue(TaggedValue):
    '''
    A tagged value describing a relationship to another term; used for
    the intersection_of, union_of and relationship tags.
    '''

    def __init__(self, line):
        TaggedValue.__init__(self, line)
        # the generic value reads "relationship_name TERM", or just
        # "TERM" in which case the relationship defaults to is_a
        parts = self.value.split(None, 1)
        if len(parts) == 2:
            relationship, term = parts
        else:
            relationship = 'is_a'
            term = parts[0]
        self.value = term
        self.relationship = relationship
class NamespaceValue(TaggedValue):
    '''A tagged value holding a namespace name; plain TaggedValue parsing.'''

    def __init__(self, line):
        TaggedValue.__init__(self, line)
class RemarkValue(TaggedValue):
    '''
    A 'remark' tagged value whose content is itself a 'label: text'
    couple; the label and the text are stored as separate attributes.
    '''

    def __init__(self, line):
        TaggedValue.__init__(self, line)
        label, text = self.value.split(':', 1)
        self.label = label.strip()
        self.value = text.strip()
def taggedValueFactory(line):
    '''
    Build the appropriate TaggedValue subclass for one 'tag: value' line
    of an OBOEntry.

    @param line: a line of an OBOEntry composed of a tag and a value.
    @type line: string

    @return: a TaggedValue (or subclass) instance
    '''
    if line.startswith('namespace') or line.startswith('default-namespace'):
        return NamespaceValue(line)
    ## DefValue is an inherited class of TaggedValue
    elif line.startswith('def'):
        return DefValue(line)
    ## SynonymValue is an inherited class of TaggedValue
    elif ((line.startswith('synonym') and not line.startswith('synonymtypedef')) or
          line.startswith('exact_synonym') or
          line.startswith('broad_synonym') or
          line.startswith('narrow_synonym')):
        return SynonymValue(line)
    ## XrefValue is an inherited class of TaggedValue
    elif line.startswith('xref'):
        return XrefValue(line)
    ## NameValue is an inherited class of TaggedValue
    elif line.startswith('name'):
        return NameValue(line)
    ## RelationshipValue is an inherited class of TaggedValue
    elif (line.startswith('intersection_of') or
          line.startswith('union_of') or
          line.startswith('relationship')):
        return RelationshipValue(line)
    elif line.startswith('remark'):
        return RemarkValue(line)
    ## every other line is a generic tag / value couple
    else:
        return TaggedValue(line)
#################################################################################
## Xref treatment area ##
#################################################################################
class Xref(object):
    '''
    A xref of an OBO entry: a reference, an optional quoted description
    and optional trailing modifiers.  Used both for the dbxrefs of
    SynonymValue/DefValue and for the value of XrefValue.
    '''
    # splits the bare reference from the '"description" {modifiers}' part
    __splitdata__ = re.compile(' +(?=["{])')

    def __init__(self, ref):
        description = None
        trailing_modifiers = None
        if ref == '':
            # empty bracket content: no reference at all
            # (Modifs JJ sinon erreur : list index out of range)
            ref = None
            pieces = []
        else:
            pieces = Xref.__splitdata__.split(ref, 1)
            ref = pieces[0]
        if len(pieces) > 1:
            extra = pieces[1]
            found = TaggedValue._match_quotedString.search(extra)
            if found is not None:
                description = found.group(0)
                extra.replace(description, '')
            trailing_modifiers = trailingModifierFactory(extra)
        self.reference = ref
        self.description = description
        self.trailing_modifiers = trailing_modifiers

    def __eq__(self, b):
        return ((self.reference == b.reference)
                and (self.description == b.description)
                and (self.trailing_modifiers == b.trailing_modifiers))

    def __hash__(self):
        # sum of the member hashes, folded into a 31-bit value
        return sum(hash(z) for z in [self.reference,
                                     self.description,
                                     self.trailing_modifiers]) % (2**31)
def xrefFactory(string):
    '''
    Dispatcher of xrefs.

    @param string: either a bracketed list '[ref1, ref2, ...]' (as found in
    the dbxrefs of SynonymValue and DefValue objects) or a single bare
    reference (as found in XrefValue values).
    @type string: string

    @return: a list of Xref for a bracketed list, a single Xref otherwise
    '''
    text = string.strip()
    if text[0] == '[':
        return [Xref(item.strip()) for item in text[1:-1].split(',')]
    return Xref(text)
#################################################################################
## Stanza treatment area ##
#################################################################################
class OBOEntry(dict):
    '''
    An entry of an OBO file: either the header (no stanza name) or a
    bracketed stanza.  Maps every tag to the list of its TaggedValue
    occurrences; for stanzas the raw text is kept under the 'stanza' key.
    '''
    # name between the square brackets opening a stanza
    _match_stanza_name = re.compile('(?<=^\[)[^\]]*(?=\])')

    def __init__(self, stanza):
        # the header is the only entry that does not open with '[name]'
        self.isHeader = stanza[0] != '['
        lines = stanza.split('\n')
        if not self.isHeader:
            self.stanzaName = lines[0].strip()[1:-1]
            lines = lines[1:]
            self["stanza"] = [stanza.strip()]
        # every remaining line is one tag/value couple
        for line in lines:
            parsed = taggedValueFactory(line)
            self.setdefault(parsed.tag, []).append(parsed)

    @staticmethod
    def parseStanzaName(stanza):
        """Return the bracketed stanza name, or None for a header."""
        found = OBOEntry._match_stanza_name.search(stanza)
        return found.group(0) if found else None
class OBOTerm(OBOEntry):
    '''
    A stanza named 'Term'. It inherits from the class OBOEntry.

    The constructor enforces the OBO cardinality constraints; the parsed
    fields are then exposed read-only through the properties below.
    '''
    def __init__(self,stanza):
        '''
        Parse and validate a [Term] stanza.

        @param stanza: the raw stanza text
        @raise AssertionError: when a cardinality constraint is violated
        '''
        ## use of the OBOEntry constructor.
        OBOEntry.__init__(self, stanza)
        assert self.stanzaName=='Term'
        assert 'stanza' in self
        assert 'id' in self and len(self['id'])==1,"An OBOTerm must have an id"
        assert 'name' in self and len(self['name'])==1,"An OBOTerm must have a name"
        assert 'namespace' not in self or len(self['namespace'])==1, "Only one namespace is allowed for an OBO term"
        assert 'def' not in self or len(self['def'])==1,"Only one definition is allowed for an OBO term"
        assert 'comment' not in self or len(self['comment'])==1,"Only one comment is allowed for an OBO term"
        # NOTE(review): union_of / intersection_of require at least two
        # operands; the assertion messages read oddly but the >=2 checks
        # match the OBO cardinality rules
        assert 'union_of' not in self or len(self['union_of'])>=2,"Only one union relationship is allowed for an OBO term"
        assert 'intersection_of' not in self or len(self['intersection_of'])>=2,"Only one intersection relationship is allowed for an OBO term"

        # an obsolete term must not carry any relationship
        if self._isObsolete():
            #assert 'is_a' not in self
            assert 'relationship' not in self
            assert 'inverse_of' not in self
            assert 'disjoint_from' not in self
            assert 'union_of' not in self
            assert 'intersection_of' not in self
        # replaced_by / consider only make sense on obsolete terms
        assert 'replaced_by' not in self or self._isObsolete()
        assert 'consider' not in self or self._isObsolete()

    def _getStanza(self):
        # raw stanza text stored by OBOEntry.__init__
        return self['stanza'][0]

    ## make-up functions.
    def _getDefinition(self):
        if 'def' in self:
            return self['def'][0]
        return None

    def _getId(self):
        return self['id'][0]

    def _getNamespace(self):
        return self['namespace'][0]

    def _getName(self):
        return self['name'][0]

    def _getComment(self):
        if 'comment' in self:
            return self['comment'][0]
        return None

    def _getAltIds(self):
        # duplicates removed; None when the tag is absent
        if 'alt_id' in self:
            return list(set(self.get('alt_id',None)))
        return None

    def _getIsA(self):
        if 'is_a' in self:
            return list(set(self.get('is_a',None)))
        return None

    def _getSynonym(self):
        if 'synonym' in self :
            return list(set(self.get('synonym',None)))
        return None

    def _getSubset(self):
        if self.get('subset',None) != None:
            return list(set(self.get('subset',None)))
        else:
            return None

    def _getXref(self):
        if 'xref' in self:
            return list(set(self.get('xref',None)))
        return None

    def _getRelationShip(self):
        if 'relationship' in self:
            return list(set(self.get('relationship',None)))
        return None

    def _getUnion(self):
        return list(set(self.get('union_of',None)))

    def _getIntersection(self):
        return list(set(self.get('intersection_of',None)))

    def _getDisjonction(self):
        return list(set(self.get('disjoint_from',None)))

    def _isObsolete(self):
        # the tag holds a TaggedValue whose str() is 'true' when obsolete
        return 'is_obsolete' in self and str(self['is_obsolete'][0])=='true'

    def _getReplacedBy(self):
        if 'replaced_by' in self:
            return list(set(self.get('replaced_by',None)))
        return None

    def _getConsider(self):
        if 'consider' in self:
            return list(set(self.get('consider',None)))
        return None

    ## automatically make-up !
    stanza = property(_getStanza,None,None)
    definition = property(_getDefinition,None,None)
    id = property(_getId,None,None)
    namespace = property(_getNamespace,None,None)
    name = property(_getName,None,None)
    comment = property(_getComment,None,None)
    alt_ids = property(_getAltIds,None,None)
    is_a = property(_getIsA,None,None)
    synonyms = property(_getSynonym,None,None)
    subsets = property(_getSubset,None,None)
    xrefs = property(_getXref,None,None)
    relationship = property(_getRelationShip,None,None)
    union_of = property(_getUnion,None,None)
    intersection_of = property(_getIntersection,None,None)
    disjoint_from = property(_getDisjonction,None,None)
    is_obsolete = property(_isObsolete,None,None)
    replaced_by = property(_getReplacedBy,None,None)
    consider = property(_getConsider,None,None)
def OBOEntryFactory(stanza):
    '''
    Dispatch a raw stanza to the appropriate entry class.

    @param stanza: a stanza composed of several lines.
    @type stanza: text

    @return: an C{OBOTerm} | C{OBOEntry} instance

    @note: "Term" stanzas are wrapped in the richer C{OBOTerm} class;
           every other stanza type falls back to the generic C{OBOEntry}.
    '''
    if OBOEntry.parseStanzaName(stanza) == "Term":
        return OBOTerm(stanza)
    return OBOEntry(stanza)
def OBOEntryIterator(file):
    '''
    Iterate over the stanzas of an OBO file, yielding one parsed
    C{OBOTerm} | C{OBOEntry} instance per stanza.
    '''
    for stanza in stanzaIterator(file):
        debug(stanza)
        yield OBOEntryFactory(stanza)

View File

@ -0,0 +1,137 @@
"""
Module providing high level functions to manage command line options.
"""
import logging
import sys
from logging import debug
from optparse import OptionParser
from obitools.utils import universalOpen
from obitools.utils import fileSize
from obitools.utils import universalTell
from obitools.utils import progressBar
from obitools.format.options import addInputFormatOption, addInOutputOption,\
autoEntriesIterator
import time
def getOptionManager(optionDefinitions,entryIterator=None,progdoc=None):
    '''
    Build an option manager function that is able to parse
    the command line options of the script.

    @param optionDefinitions: list of functions, each describing a set of
                              options. Each function must accept as its
                              unique parameter an instance of OptionParser.
    @type optionDefinitions: list of functions.

    @param entryIterator: an iterator generator function returning
                          entries from the data files.
    @type entryIterator: an iterator generator function with only one
                         parameter of type file

    @param progdoc: usage string displayed in the parser help message.
    @type progdoc: str

    @return: a function of no argument; calling it parses sys.argv and
             returns an (options, entry iterator) couple.
    '''
    parser = OptionParser(progdoc)
    parser.add_option('--DEBUG',
                      action="store_true", dest="debug",
                      default=False,
                      help="Set logging in debug mode")

    parser.add_option('--no-psyco',
                      action="store_true", dest="noPsyco",
                      default=False,
                      help="Don't use psyco even if it installed")

    parser.add_option('--without-progress-bar',
                      action="store_false", dest="progressbar",
                      default=True,
                      help="deactivate progress bar")

    # When one of the format-related option sets is requested, the entry
    # iterator must be chosen at run time from the parsed format options
    # instead of the fixed entryIterator argument.
    checkFormat=False

    for f in optionDefinitions:
        if f == addInputFormatOption or f == addInOutputOption:
            checkFormat=True
        f(parser)

    def commandLineAnalyzer():
        # Parse sys.argv, configure logging, and build the entry iterator
        # over all positional-argument files (or stdin when none given).
        options,files = parser.parse_args()
        if options.debug:
            logging.root.setLevel(logging.DEBUG)

        if checkFormat:
            ei=autoEntriesIterator(options)
        else:
            ei=entryIterator

        i = allEntryIterator(files,ei,with_progress=options.progressbar)
        return options,i

    return commandLineAnalyzer
# Module-level bookkeeping updated by allEntryIterator() so that the
# current* helper functions below can report on the file being read.
_currentInputFileName=None
_currentFile = None
_currentFileSize = None
def currentInputFileName():
    """Return the name of the input file currently being processed."""
    name = _currentInputFileName
    return name
def currentInputFile():
    """Return the file object currently being read."""
    handle = _currentFile
    return handle
def currentFileSize():
    """Return the size of the current input file, as computed when it was opened."""
    size = _currentFileSize
    return size
def currentFileTell():
    """Return the current read offset within the active input file."""
    handle = _currentFile
    return universalTell(handle)
def fileWithProgressBar(file,step=100):
    # Wrap an input file iterator so that a progress bar is drawn on stderr
    # while its lines are consumed.  When the file size cannot be obtained,
    # the file is returned unchanged and no progress is displayed.
    # NOTE(review): the `step` parameter is currently unused — confirm
    # whether it was meant to throttle progressBar refreshes.
    try:
        size = currentFileSize()
    except:
        # NOTE(review): bare except — any failure silently disables the bar.
        size = None
    def fileBar():
        # Generator yielding the lines of `file` while refreshing the bar.
        pos=1
        # Initial draw before any line is read (third positional argument
        # presumably resets the bar — TODO confirm against
        # obitools.utils.progressBar).
        progressBar(pos, size, True,currentInputFileName())
        for l in file:
            # currentFileTell is passed as a callable, not called here —
            # presumably progressBar invokes it to get the offset; verify.
            progressBar(currentFileTell,size,head=currentInputFileName())
            yield l
        # Terminate the progress line (Python 2 print syntax).
        print >>sys.stderr,''
    if size is None:
        return file
    else:
        f = fileBar()
        return f
def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102):
    """
    Iterate over every entry (or raw line) of a list of input files.

    Each file name in `files` is opened with universalOpen; the module
    globals tracking the current file are updated as a side effect so the
    current* helpers report on the file being read.  When `files` is empty,
    entries are read from standard input instead.  When `entryIterator` is
    None, raw lines are yielded; otherwise the entries it produces are.
    """
    global _currentFile
    global _currentInputFileName
    global _currentFileSize

    if not files:
        # No file arguments: fall back to standard input (no progress bar).
        if entryIterator is None:
            for line in sys.stdin:
                yield line
        else:
            for entry in entryIterator(sys.stdin):
                yield entry
        return

    for name in files:
        _currentInputFileName=name
        stream = universalOpen(name)
        _currentFile=stream
        _currentFileSize=fileSize(_currentFile)
        debug(stream)
        if with_progress:
            # Wrap the stream so a progress bar is displayed while reading.
            stream=fileWithProgressBar(stream,step=histo_step)
        if entryIterator is None:
            for line in stream:
                yield line
        else:
            for entry in entryIterator(stream):
                yield entry

Some files were not shown because too many files have changed in this diff Show More