diff --git a/obitools/SVGdraw.py b/obitools/SVGdraw.py
new file mode 100644
index 0000000..521f750
--- /dev/null
+++ b/obitools/SVGdraw.py
@@ -0,0 +1,1054 @@
+#!/usr/bin/env python
+##Copyright (c) 2002, Fedor Baart & Hans de Wit (Stichting Farmaceutische Kengetallen)
+##All rights reserved.
+##
+##Redistribution and use in source and binary forms, with or without modification,
+##are permitted provided that the following conditions are met:
+##
+##Redistributions of source code must retain the above copyright notice, this
+##list of conditions and the following disclaimer.
+##
+##Redistributions in binary form must reproduce the above copyright notice,
+##this list of conditions and the following disclaimer in the documentation and/or
+##other materials provided with the distribution.
+##
+##Neither the name of the Stichting Farmaceutische Kengetallen nor the names of
+##its contributors may be used to endorse or promote products derived from this
+##software without specific prior written permission.
+##
+##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+##AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+##IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+##DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+##FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+##DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+##SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+##CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+##OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+##OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+##Thanks to Gerald Rosennfellner for his help and useful comments.
+
+__doc__="""Use SVGdraw to generate your SVG drawings.
+
+SVGdraw uses an object model for drawing and a toXml method to create SVG graphics
+using easy-to-use classes and methods. Usually you start by creating a drawing, e.g.
+
+ d=drawing()
+ #then you create a SVG root element
+ s=svg()
+ #then you add some elements eg a circle and add it to the svg root element
+ c=circle()
+ #you can supply attributes by using named arguments.
+ c=circle(fill='red',stroke='blue')
+ #or by updating the attributes attribute:
+ c.attributes['stroke-width']=1
+ s.addElement(c)
+ #then you add the svg root element to the drawing
+ d.setSVG(s)
+    #and finally you xmlify the drawing
+ d.toXml()
+
+
+This results in the SVG source of the drawing, which consists of a circle
+on a white background. It's as easy as that ;)
+This module was created using the SVG specification of www.w3c.org and the
+O'Reilly (www.oreilly.com) Python books as information sources. An SVG viewer
+is available from www.adobe.com"""
+
+__version__="1.0"
+
+# There are two possibilities to generate svg:
+# via a dom implementation or directly using text strings.
+# The latter is way faster (and shorter in coding);
+# the former is only used for debugging svg programs
+# and maybe it will be removed altogether after a while.
+# With the following variable you indicate whether to use the dom implementation.
+# Note that PyXML is required for using the dom implementation.
+# It is also possible to use the standard minidom, but I didn't try that one.
+# Anyway the text-based approach is about 60 times faster than using the full dom implementation.
+use_dom_implementation=0
+
+
+import exceptions
+if use_dom_implementation<>0:
+ try:
+ from xml.dom import implementation
+ from xml.dom.ext import PrettyPrint
+ except:
+ raise exceptions.ImportError, "PyXML is required for using the dom implementation"
+#The implementation is used for creating the XML document.
+#The prettyprint module is used for converting the xml document object to an xml file.
+
+import sys
+assert sys.version_info[0]>=2
+if sys.version_info[1]<2:
+ True=1
+ False=0
+ file=open
+
+sys.setrecursionlimit(50)
+#The recursion limit is set conservatively so mistakes like s=svg(); s.addElement(s)
+#won't eat up too much processor time.
+
+#The following code is pasted from xml.sax.saxutils.
+#It makes it possible to run the code without the xml sax package installed.
+#To make it possible to have special characters such as &, < and > in your text elements,
+#it is necessary to escape the texts.
+def _escape(data, entities={}):
+ """Escape &, <, and > in a string of data.
+
+ You can escape other strings of data by passing a dictionary as
+ the optional entities parameter. The keys and values must all be
+ strings; each key will be replaced with its corresponding value.
+ """
+    data = data.replace("&", "&amp;")
+    data = data.replace("<", "&lt;")
+    data = data.replace(">", "&gt;")
+ for chars, entity in entities.items():
+ data = data.replace(chars, entity)
+ return data
+
+def _quoteattr(data, entities={}):
+ """Escape and quote an attribute value.
+
+ Escape &, <, and > in a string of data, then quote it for use as
+ an attribute value. The \" character will be escaped as well, if
+ necessary.
+
+ You can escape other strings of data by passing a dictionary as
+ the optional entities parameter. The keys and values must all be
+ strings; each key will be replaced with its corresponding value.
+ """
+ data = _escape(data, entities)
+ if '"' in data:
+ if "'" in data:
+            data = '"%s"' % data.replace('"', "&quot;")
+ else:
+ data = "'%s'" % data
+ else:
+ data = '"%s"' % data
+ return data
+
+
+
+def _xypointlist(a):
+ """formats a list of xy pairs"""
+ s=''
+    for e in a: #this could be done more elegantly
+ s+=str(e)[1:-1] +' '
+ return s
+
+def _viewboxlist(a):
+ """formats a tuple"""
+ s=''
+ for e in a:
+ s+=str(e)+' '
+ return s
+
+def _pointlist(a):
+ """formats a list of numbers"""
+ return str(a)[1:-1]
+
+class pathdata:
+    """class used to create a pathdata object which can be used for a path.
+    Although most methods are pretty straightforward, it might be useful to look at the SVG specification."""
+ #I didn't test the methods below.
+ def __init__(self,x=None,y=None):
+ self.path=[]
+ if x is not None and y is not None:
+ self.path.append('M '+str(x)+' '+str(y))
+ def closepath(self):
+ """ends the path"""
+ self.path.append('z')
+ def move(self,x,y):
+ """move to absolute"""
+ self.path.append('M '+str(x)+' '+str(y))
+ def relmove(self,x,y):
+ """move to relative"""
+ self.path.append('m '+str(x)+' '+str(y))
+ def line(self,x,y):
+ """line to absolute"""
+ self.path.append('L '+str(x)+' '+str(y))
+ def relline(self,x,y):
+ """line to relative"""
+ self.path.append('l '+str(x)+' '+str(y))
+ def hline(self,x):
+ """horizontal line to absolute"""
+ self.path.append('H'+str(x))
+ def relhline(self,x):
+ """horizontal line to relative"""
+ self.path.append('h'+str(x))
+ def vline(self,y):
+        """vertical line to absolute"""
+ self.path.append('V'+str(y))
+ def relvline(self,y):
+ """vertical line to relative"""
+ self.path.append('v'+str(y))
+ def bezier(self,x1,y1,x2,y2,x,y):
+        """bezier with xy1 and xy2 to xy absolute"""
+ self.path.append('C'+str(x1)+','+str(y1)+' '+str(x2)+','+str(y2)+' '+str(x)+','+str(y))
+ def relbezier(self,x1,y1,x2,y2,x,y):
+ """bezier with xy1 and xy2 to xy relative"""
+ self.path.append('c'+str(x1)+','+str(y1)+' '+str(x2)+','+str(y2)+' '+str(x)+','+str(y))
+ def smbezier(self,x2,y2,x,y):
+        """smooth bezier with xy2 to xy absolute"""
+ self.path.append('S'+str(x2)+','+str(y2)+' '+str(x)+','+str(y))
+ def relsmbezier(self,x2,y2,x,y):
+ """smooth bezier with xy2 to xy relative"""
+ self.path.append('s'+str(x2)+','+str(y2)+' '+str(x)+','+str(y))
+ def qbezier(self,x1,y1,x,y):
+        """quadratic bezier with xy1 to xy absolute"""
+ self.path.append('Q'+str(x1)+','+str(y1)+' '+str(x)+','+str(y))
+ def relqbezier(self,x1,y1,x,y):
+ """quadratic bezier with xy1 to xy relative"""
+ self.path.append('q'+str(x1)+','+str(y1)+' '+str(x)+','+str(y))
+ def smqbezier(self,x,y):
+        """smooth quadratic bezier to xy absolute"""
+ self.path.append('T'+str(x)+','+str(y))
+ def relsmqbezier(self,x,y):
+ """smooth quadratic bezier to xy relative"""
+ self.path.append('t'+str(x)+','+str(y))
+ def ellarc(self,rx,ry,xrot,laf,sf,x,y):
+        """elliptical arc with rx and ry rotating with xrot using large-arc-flag and sweep-flag to xy absolute"""
+ self.path.append('A'+str(rx)+','+str(ry)+' '+str(xrot)+' '+str(laf)+' '+str(sf)+' '+str(x)+' '+str(y))
+ def relellarc(self,rx,ry,xrot,laf,sf,x,y):
+        """elliptical arc with rx and ry rotating with xrot using large-arc-flag and sweep-flag to xy relative"""
+ self.path.append('a'+str(rx)+','+str(ry)+' '+str(xrot)+' '+str(laf)+' '+str(sf)+' '+str(x)+' '+str(y))
+ def __repr__(self):
+ return ' '.join(self.path)
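+# A minimal usage sketch (illustrative, not part of the original module): building
+# a triangle path with pathdata and wrapping it in a path element.
+#
+#   pd = pathdata(0, 0)          # starts with an absolute moveto
+#   pd.line(10, 0)               # absolute lineto
+#   pd.line(5, 10)
+#   pd.closepath()
+#   str(pd)                      # -> 'M 0 0 L 10 0 L 5 10 z'
+#   p = path(pd, fill='yellow', stroke='black')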
+
+
+
+
+class SVGelement:
+ """SVGelement(type,attributes,elements,text,namespace,**args)
+    Creates an arbitrary svg element; it is intended to be subclassed, not used on its own.
+    This element is the base of every svg element: it defines a class which resembles
+    an xml-element. The main advantage of this kind of implementation is that you don't
+    have to create a toXml method for every different graph object. Every element
+    consists of a type, attributes, optional subelements, optional text and an optional
+    namespace. Note the elements=None default combined with the
+    "if elements==None: self.elements=[]" construction in the constructor.
+    This is done because if you default to elements=[] every object has a reference
+    to the same empty list."""
+ def __init__(self,type='',attributes=None,elements=None,text='',namespace='',cdata=None,**args):
+ self.type=type
+ if attributes==None:
+ self.attributes={}
+ else:
+ self.attributes=attributes
+ if elements==None:
+ self.elements=[]
+ else:
+ self.elements=elements
+ self.text=text
+ self.namespace=namespace
+ self.cdata=cdata
+ for arg in args.keys():
+ self.attributes[arg]=args[arg]
+ def addElement(self,SVGelement):
+ """adds an element to a SVGelement
+
+ SVGelement.addElement(SVGelement)
+ """
+ self.elements.append(SVGelement)
+
+ #def toXml(self,level,f, preserveWhitespace=False):
+ def toXml(self,level,f, **kwargs):
+ preserve = kwargs.get("preserveWhitespace", False)
+ if preserve:
+ #print "PRESERVING"
+ NEWLINE = ""
+ TAB = ""
+ else:
+ #print "NOT PRESE"
+ NEWLINE = "\n"
+ TAB = "\t"
+ f.write(TAB*level)
+ f.write('<'+self.type)
+ for attkey in self.attributes.keys():
+ f.write(' '+_escape(str(attkey))+'='+_quoteattr(str(self.attributes[attkey])))
+ if self.namespace:
+ f.write(' xmlns="'+ _escape(str(self.namespace))+'" ')
+ if self.elements or self.text or self.cdata:
+ f.write('>')
+ if self.elements:
+ f.write(NEWLINE)
+ for element in self.elements:
+ element.toXml(level+1,f, preserveWhitespace=preserve)
+        if self.cdata:
+            f.write(NEWLINE+TAB*(level+1)+'<![CDATA['+str(self.cdata)+']]>'+NEWLINE)
+ if self.text:
+ if type(self.text)==type(''): #If the text is only text
+ f.write(_escape(str(self.text)))
+ else: #If the text is a spannedtext class
+ f.write(str(self.text))
+        if self.elements:
+            f.write(TAB*level+'</'+self.type+'>'+NEWLINE)
+        elif self.text:
+            f.write('</'+self.type+'>'+NEWLINE)
+        elif self.cdata:
+            f.write(TAB*level+'</'+self.type+'>'+NEWLINE)
+        else:
+            f.write('/>'+NEWLINE)
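+# Illustrative note: with this text backend an element serializes to a tag with
+# quoted attributes, e.g. circle(cx=50, cy=50, r=20, fill='red') writes something
+# like '<circle cx="50" cy="50" r="20" fill="red" />' (attribute order may vary,
+# since it follows the dictionary key order).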
+
+class tspan(SVGelement):
+ """ts=tspan(text='',**args)
+
+ a tspan element can be used for applying formatting to a textsection
+ usage:
+ ts=tspan('this text is bold')
+ ts.attributes['font-weight']='bold'
+ st=spannedtext()
+ st.addtspan(ts)
+ t=text(3,5,st)
+ """
+    def __init__(self,text=None,**args):
+        SVGelement.__init__(self,'tspan',**args)
+        if text<>None:
+            self.text=text
+    def __repr__(self):
+        s="<tspan"
+        for key,value in self.attributes.items():
+            s+=' %s="%s"' % (key,value)
+        s+='>'
+        s+=self.text
+        s+='</tspan>'
+        return s
+
+class tref(SVGelement):
+ """tr=tref(link='',**args)
+
+ a tref element can be used for referencing text by a link to its id.
+ usage:
+ tr=tref('#linktotext')
+ st=spannedtext()
+ st.addtref(tr)
+ t=text(3,5,st)
+ """
+ def __init__(self,link,**args):
+ SVGelement.__init__(self,'tref',{'xlink:href':link},**args)
+    def __repr__(self):
+        s="<tref"
+        for key,value in self.attributes.items():
+            s+=' %s="%s"' % (key,value)
+        s+='/>'
+        return s
+
+class spannedtext:
+ """st=spannedtext(textlist=[])
+
+ a spannedtext can be used for text which consists of text, tspan's and tref's
+ You can use it to add to a text element or path element. Don't add it directly
+ to a svg or a group element.
+ usage:
+
+ ts=tspan('this text is bold')
+ ts.attributes['font-weight']='bold'
+ tr=tref('#linktotext')
+ tr.attributes['fill']='red'
+ st=spannedtext()
+ st.addtspan(ts)
+ st.addtref(tr)
+ st.addtext('This text is not bold')
+ t=text(3,5,st)
+ """
+ def __init__(self,textlist=None):
+ if textlist==None:
+ self.textlist=[]
+ else:
+ self.textlist=textlist
+ def addtext(self,text=''):
+ self.textlist.append(text)
+ def addtspan(self,tspan):
+ self.textlist.append(tspan)
+ def addtref(self,tref):
+ self.textlist.append(tref)
+ def __repr__(self):
+ s=""
+ for element in self.textlist:
+ s+=str(element)
+ return s
+
+class rect(SVGelement):
+ """r=rect(width,height,x,y,fill,stroke,stroke_width,**args)
+
+    a rectangle is defined by a width, a height and an xy pair
+ """
+ def __init__(self,x=None,y=None,width=None,height=None,fill=None,stroke=None,stroke_width=None,**args):
+ if width==None or height==None:
+ if width<>None:
+ raise ValueError, 'height is required'
+ if height<>None:
+ raise ValueError, 'width is required'
+ else:
+ raise ValueError, 'both height and width are required'
+ SVGelement.__init__(self,'rect',{'width':width,'height':height},**args)
+ if x<>None:
+ self.attributes['x']=x
+ if y<>None:
+ self.attributes['y']=y
+ if fill<>None:
+ self.attributes['fill']=fill
+ if stroke<>None:
+ self.attributes['stroke']=stroke
+ if stroke_width<>None:
+ self.attributes['stroke-width']=stroke_width
+
+class ellipse(SVGelement):
+ """e=ellipse(rx,ry,x,y,fill,stroke,stroke_width,**args)
+
+    an ellipse is defined by a center and an x and y radius.
+ """
+ def __init__(self,cx=None,cy=None,rx=None,ry=None,fill=None,stroke=None,stroke_width=None,**args):
+ if rx==None or ry== None:
+ if rx<>None:
+ raise ValueError, 'rx is required'
+ if ry<>None:
+ raise ValueError, 'ry is required'
+ else:
+ raise ValueError, 'both rx and ry are required'
+ SVGelement.__init__(self,'ellipse',{'rx':rx,'ry':ry},**args)
+ if cx<>None:
+ self.attributes['cx']=cx
+ if cy<>None:
+ self.attributes['cy']=cy
+ if fill<>None:
+ self.attributes['fill']=fill
+ if stroke<>None:
+ self.attributes['stroke']=stroke
+ if stroke_width<>None:
+ self.attributes['stroke-width']=stroke_width
+
+
+class circle(SVGelement):
+ """c=circle(x,y,radius,fill,stroke,stroke_width,**args)
+
+    The circle creates a circle element using x, y and radius values
+ """
+ def __init__(self,cx=None,cy=None,r=None,fill=None,stroke=None,stroke_width=None,**args):
+ if r==None:
+ raise ValueError, 'r is required'
+ SVGelement.__init__(self,'circle',{'r':r},**args)
+ if cx<>None:
+ self.attributes['cx']=cx
+ if cy<>None:
+ self.attributes['cy']=cy
+ if fill<>None:
+ self.attributes['fill']=fill
+ if stroke<>None:
+ self.attributes['stroke']=stroke
+ if stroke_width<>None:
+ self.attributes['stroke-width']=stroke_width
+
+class point(circle):
+ """p=point(x,y,color)
+
+    A point is defined as a circle with a radius of 1. It may be more efficient to use a
+    very small rectangle if you use many points, because a circle is difficult to render.
+ """
+ def __init__(self,x,y,fill='black',**args):
+ circle.__init__(self,x,y,1,fill,**args)
+
+class line(SVGelement):
+ """l=line(x1,y1,x2,y2,stroke,stroke_width,**args)
+
+ A line is defined by a begin x,y pair and an end x,y pair
+ """
+ def __init__(self,x1=None,y1=None,x2=None,y2=None,stroke=None,stroke_width=None,**args):
+ SVGelement.__init__(self,'line',**args)
+ if x1<>None:
+ self.attributes['x1']=x1
+ if y1<>None:
+ self.attributes['y1']=y1
+ if x2<>None:
+ self.attributes['x2']=x2
+ if y2<>None:
+ self.attributes['y2']=y2
+ if stroke_width<>None:
+ self.attributes['stroke-width']=stroke_width
+ if stroke<>None:
+ self.attributes['stroke']=stroke
+
+class polyline(SVGelement):
+ """pl=polyline([[x1,y1],[x2,y2],...],fill,stroke,stroke_width,**args)
+
+ a polyline is defined by a list of xy pairs
+ """
+ def __init__(self,points,fill=None,stroke=None,stroke_width=None,**args):
+ SVGelement.__init__(self,'polyline',{'points':_xypointlist(points)},**args)
+ if fill<>None:
+ self.attributes['fill']=fill
+ if stroke_width<>None:
+ self.attributes['stroke-width']=stroke_width
+ if stroke<>None:
+ self.attributes['stroke']=stroke
+
+class polygon(SVGelement):
+    """pg=polygon([[x1,y1],[x2,y2],...],fill,stroke,stroke_width,**args)
+
+ a polygon is defined by a list of xy pairs
+ """
+ def __init__(self,points,fill=None,stroke=None,stroke_width=None,**args):
+ SVGelement.__init__(self,'polygon',{'points':_xypointlist(points)},**args)
+ if fill<>None:
+ self.attributes['fill']=fill
+ if stroke_width<>None:
+ self.attributes['stroke-width']=stroke_width
+ if stroke<>None:
+ self.attributes['stroke']=stroke
+
+class path(SVGelement):
+ """p=path(path,fill,stroke,stroke_width,**args)
+
+ a path is defined by a path object and optional width, stroke and fillcolor
+ """
+ def __init__(self,pathdata,fill=None,stroke=None,stroke_width=None,id=None,**args):
+ SVGelement.__init__(self,'path',{'d':str(pathdata)},**args)
+ if stroke<>None:
+ self.attributes['stroke']=stroke
+ if fill<>None:
+ self.attributes['fill']=fill
+ if stroke_width<>None:
+ self.attributes['stroke-width']=stroke_width
+ if id<>None:
+ self.attributes['id']=id
+
+
+class text(SVGelement):
+ """t=text(x,y,text,font_size,font_family,**args)
+
+    a text element can be used for displaying text on the screen
+ """
+ def __init__(self,x=None,y=None,text=None,font_size=None,font_family=None,text_anchor=None,**args):
+ SVGelement.__init__(self,'text',**args)
+ if x<>None:
+ self.attributes['x']=x
+ if y<>None:
+ self.attributes['y']=y
+ if font_size<>None:
+ self.attributes['font-size']=font_size
+ if font_family<>None:
+ self.attributes['font-family']=font_family
+ if text<>None:
+ self.text=text
+ if text_anchor<>None:
+ self.attributes['text-anchor']=text_anchor
+
+ def toXml(self,level,f, **kwargs):
+ preserve = self.attributes.get("xml:space", None)
+ if preserve == "preserve":
+ #print "FOO PRE"
+ SVGelement.toXml(self,level, f, preserveWhitespace=True)
+ else:
+ #print "FOO NOT"
+ SVGelement.toXml(self, level, f, preserveWhitespace=False)
+
+class textpath(SVGelement):
+ """tp=textpath(text,link,**args)
+
+ a textpath places a text on a path which is referenced by a link.
+ """
+ def __init__(self,link,text=None,**args):
+ SVGelement.__init__(self,'textPath',{'xlink:href':link},**args)
+ if text<>None:
+ self.text=text
+
+class pattern(SVGelement):
+ """p=pattern(x,y,width,height,patternUnits,**args)
+
+ A pattern is used to fill or stroke an object using a pre-defined
+ graphic object which can be replicated ("tiled") at fixed intervals
+ in x and y to cover the areas to be painted.
+ """
+ def __init__(self,x=None,y=None,width=None,height=None,patternUnits=None,**args):
+ SVGelement.__init__(self,'pattern',**args)
+ if x<>None:
+ self.attributes['x']=x
+ if y<>None:
+ self.attributes['y']=y
+ if width<>None:
+ self.attributes['width']=width
+ if height<>None:
+ self.attributes['height']=height
+ if patternUnits<>None:
+ self.attributes['patternUnits']=patternUnits
+
+class title(SVGelement):
+ """t=title(text,**args)
+
+    a title is a text element. The text is displayed in the title bar;
+    add at least one to the root svg element.
+ """
+ def __init__(self,text=None,**args):
+ SVGelement.__init__(self,'title',**args)
+ if text<>None:
+ self.text=text
+
+class description(SVGelement):
+ """d=description(text,**args)
+
+    a description can be added to any element and is used for a tooltip.
+    Add this element before adding other elements.
+ """
+ def __init__(self,text=None,**args):
+ SVGelement.__init__(self,'desc',**args)
+ if text<>None:
+ self.text=text
+
+class lineargradient(SVGelement):
+ """lg=lineargradient(x1,y1,x2,y2,id,**args)
+
+ defines a lineargradient using two xy pairs.
+    stop elements can be added to define the gradient colors.
+ """
+ def __init__(self,x1=None,y1=None,x2=None,y2=None,id=None,**args):
+ SVGelement.__init__(self,'linearGradient',**args)
+ if x1<>None:
+ self.attributes['x1']=x1
+ if y1<>None:
+ self.attributes['y1']=y1
+ if x2<>None:
+ self.attributes['x2']=x2
+ if y2<>None:
+ self.attributes['y2']=y2
+ if id<>None:
+ self.attributes['id']=id
+
+class radialgradient(SVGelement):
+ """rg=radialgradient(cx,cy,r,fx,fy,id,**args)
+
+    defines a radial gradient using an outer circle which is defined by cx, cy and r, and by using a focal point.
+    stop elements can be added to define the gradient colors.
+ """
+ def __init__(self,cx=None,cy=None,r=None,fx=None,fy=None,id=None,**args):
+ SVGelement.__init__(self,'radialGradient',**args)
+ if cx<>None:
+ self.attributes['cx']=cx
+ if cy<>None:
+ self.attributes['cy']=cy
+ if r<>None:
+ self.attributes['r']=r
+ if fx<>None:
+ self.attributes['fx']=fx
+ if fy<>None:
+ self.attributes['fy']=fy
+ if id<>None:
+ self.attributes['id']=id
+
+class stop(SVGelement):
+ """st=stop(offset,stop_color,**args)
+
+ Puts a stop color at the specified radius
+ """
+ def __init__(self,offset,stop_color=None,**args):
+ SVGelement.__init__(self,'stop',{'offset':offset},**args)
+ if stop_color<>None:
+ self.attributes['stop-color']=stop_color
+
+class style(SVGelement):
+ """st=style(type,cdata=None,**args)
+
+    Adds a CDATA element to this element for defining inline stylesheets etc.
+ """
+ def __init__(self,type,cdata=None,**args):
+ SVGelement.__init__(self,'style',{'type':type},cdata=cdata, **args)
+
+
+class image(SVGelement):
+ """im=image(url,width,height,x,y,**args)
+
+ adds an image to the drawing. Supported formats are .png, .jpg and .svg.
+ """
+ def __init__(self,url,x=None,y=None,width=None,height=None,**args):
+ if width==None or height==None:
+ if width<>None:
+ raise ValueError, 'height is required'
+ if height<>None:
+ raise ValueError, 'width is required'
+ else:
+ raise ValueError, 'both height and width are required'
+ SVGelement.__init__(self,'image',{'xlink:href':url,'width':width,'height':height},**args)
+ if x<>None:
+ self.attributes['x']=x
+ if y<>None:
+ self.attributes['y']=y
+
+class cursor(SVGelement):
+ """c=cursor(url,**args)
+
+    defines a custom cursor for an element or a drawing
+ """
+ def __init__(self,url,**args):
+ SVGelement.__init__(self,'cursor',{'xlink:href':url},**args)
+
+
+class marker(SVGelement):
+ """m=marker(id,viewbox,refX,refY,markerWidth,markerHeight,**args)
+
+    defines a marker which can be used as an endpoint for a line or other path types;
+    add an element to it which should be used as a marker.
+ """
+ def __init__(self,id=None,viewBox=None,refx=None,refy=None,markerWidth=None,markerHeight=None,**args):
+ SVGelement.__init__(self,'marker',**args)
+ if id<>None:
+ self.attributes['id']=id
+ if viewBox<>None:
+ self.attributes['viewBox']=_viewboxlist(viewBox)
+ if refx<>None:
+ self.attributes['refX']=refx
+ if refy<>None:
+ self.attributes['refY']=refy
+ if markerWidth<>None:
+ self.attributes['markerWidth']=markerWidth
+ if markerHeight<>None:
+ self.attributes['markerHeight']=markerHeight
+
+class group(SVGelement):
+ """g=group(id,**args)
+
+ a group is defined by an id and is used to contain elements
+ g.addElement(SVGelement)
+ """
+ def __init__(self,id=None,**args):
+ SVGelement.__init__(self,'g',**args)
+ if id<>None:
+ self.attributes['id']=id
+
+class symbol(SVGelement):
+ """sy=symbol(id,viewbox,**args)
+
+    defines a symbol which can be used in different places in your graph using
+    the use element. A symbol is not rendered itself, but you can use 'use' elements to
+    display it by referencing its id.
+ sy.addElement(SVGelement)
+ """
+
+ def __init__(self,id=None,viewBox=None,**args):
+ SVGelement.__init__(self,'symbol',**args)
+ if id<>None:
+ self.attributes['id']=id
+ if viewBox<>None:
+ self.attributes['viewBox']=_viewboxlist(viewBox)
+
+class defs(SVGelement):
+ """d=defs(**args)
+
+ container for defining elements
+ """
+ def __init__(self,**args):
+ SVGelement.__init__(self,'defs',**args)
+
+class switch(SVGelement):
+ """sw=switch(**args)
+
+    Elements added to a switch element are "switched" by the attributes
+ requiredFeatures, requiredExtensions and systemLanguage.
+ Refer to the SVG specification for details.
+ """
+ def __init__(self,**args):
+ SVGelement.__init__(self,'switch',**args)
+
+
+class use(SVGelement):
+ """u=use(link,x,y,width,height,**args)
+
+ references a symbol by linking to its id and its position, height and width
+ """
+ def __init__(self,link,x=None,y=None,width=None,height=None,**args):
+ SVGelement.__init__(self,'use',{'xlink:href':link},**args)
+ if x<>None:
+ self.attributes['x']=x
+ if y<>None:
+ self.attributes['y']=y
+
+ if width<>None:
+ self.attributes['width']=width
+ if height<>None:
+ self.attributes['height']=height
+
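+# A small sketch (illustrative, using only classes defined in this file): define
+# a symbol once, then instantiate it several times with use elements.
+#
+#   sy = symbol('dot', (0, 0, 10, 10))
+#   sy.addElement(circle(5, 5, 2, 'blue'))
+#   s.addElement(sy)                           # s is an svg element
+#   s.addElement(use('#dot', 20, 20, 10, 10))
+#   s.addElement(use('#dot', 40, 20, 10, 10))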
+
+class link(SVGelement):
+ """a=link(url,**args)
+
+    a link is defined by a hyperlink; add the elements which have to be linked
+ a.addElement(SVGelement)
+ """
+ def __init__(self,link='',**args):
+ SVGelement.__init__(self,'a',{'xlink:href':link},**args)
+
+class view(SVGelement):
+ """v=view(id,**args)
+
+ a view can be used to create a view with different attributes"""
+ def __init__(self,id=None,**args):
+ SVGelement.__init__(self,'view',**args)
+ if id<>None:
+ self.attributes['id']=id
+
+class script(SVGelement):
+    """sc=script(type,cdata,**args)
+
+ adds a script element which contains CDATA to the SVG drawing
+
+ """
+ def __init__(self,type,cdata=None,**args):
+ SVGelement.__init__(self,'script',{'type':type},cdata=cdata,**args)
+
+class animate(SVGelement):
+    """an=animate(attribute,fr,to,dur,**args)
+
+ animates an attribute.
+ """
+ def __init__(self,attribute,fr=None,to=None,dur=None,**args):
+ SVGelement.__init__(self,'animate',{'attributeName':attribute},**args)
+ if fr<>None:
+ self.attributes['from']=fr
+ if to<>None:
+ self.attributes['to']=to
+ if dur<>None:
+ self.attributes['dur']=dur
+
+class animateMotion(SVGelement):
+ """an=animateMotion(pathdata,dur,**args)
+
+ animates a SVGelement over the given path in dur seconds
+ """
+ def __init__(self,pathdata,dur,**args):
+ SVGelement.__init__(self,'animateMotion',**args)
+ if pathdata<>None:
+ self.attributes['path']=str(pathdata)
+ if dur<>None:
+ self.attributes['dur']=dur
+
+class animateTransform(SVGelement):
+    """antr=animateTransform(type,fr,to,dur,**args)
+
+    transforms an element from one value to another.
+ """
+ def __init__(self,type=None,fr=None,to=None,dur=None,**args):
+ SVGelement.__init__(self,'animateTransform',{'attributeName':'transform'},**args)
+ #As far as I know the attributeName is always transform
+ if type<>None:
+ self.attributes['type']=type
+ if fr<>None:
+ self.attributes['from']=fr
+ if to<>None:
+ self.attributes['to']=to
+ if dur<>None:
+ self.attributes['dur']=dur
+class animateColor(SVGelement):
+    """ac=animateColor(attribute,type,fr,to,dur,**args)
+
+    Animates the color of an element
+ """
+ def __init__(self,attribute,type=None,fr=None,to=None,dur=None,**args):
+ SVGelement.__init__(self,'animateColor',{'attributeName':attribute},**args)
+ if type<>None:
+ self.attributes['type']=type
+ if fr<>None:
+ self.attributes['from']=fr
+ if to<>None:
+ self.attributes['to']=to
+ if dur<>None:
+ self.attributes['dur']=dur
+class set(SVGelement):
+    """st=set(attribute,to,dur,**args)
+
+    sets an attribute to a value for a given duration
+ """
+ def __init__(self,attribute,to=None,dur=None,**args):
+ SVGelement.__init__(self,'set',{'attributeName':attribute},**args)
+ if to<>None:
+ self.attributes['to']=to
+ if dur<>None:
+ self.attributes['dur']=dur
+
+
+
+class svg(SVGelement):
+ """s=svg(viewbox,width,height,**args)
+
+    an svg element is the root of a drawing; add all elements to an svg element.
+    You can have different svg elements in one svg file.
+ s.addElement(SVGelement)
+
+ eg
+ d=drawing()
+ s=svg((0,0,100,100),'100%','100%')
+ c=circle(50,50,20)
+ s.addElement(c)
+ d.setSVG(s)
+ d.toXml()
+ """
+ def __init__(self,viewBox=None, width=None, height=None,**args):
+ SVGelement.__init__(self,'svg',**args)
+ if viewBox<>None:
+ self.attributes['viewBox']=_viewboxlist(viewBox)
+ if width<>None:
+ self.attributes['width']=width
+ if height<>None:
+ self.attributes['height']=height
+ self.namespace="http://www.w3.org/2000/svg"
+
+class drawing:
+ """d=drawing()
+
+    this is the actual SVG document. It needs an svg element as a root.
+    Use the setSVG method to set the svg element as the root. Use the toXml method to write the SVG
+    source to the screen or to a file.
+    d=drawing()
+    d.setSVG(svg)
+    d.toXml(optionalfilename)
+ """
+
+ def __init__(self):
+ self.svg=None
+ def setSVG(self,svg):
+ self.svg=svg
+        #Add an element to the drawing.
+ if use_dom_implementation==0:
+ def toXml(self, filename='',compress=False):
+ import cStringIO
+ xml=cStringIO.StringIO()
+            xml.write('<?xml version="1.0" encoding="UTF-8"?>\n')
+            xml.write("""<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">\n""")
+ self.svg.toXml(0,xml)
+ if not filename:
+ if compress:
+ import gzip
+ f=cStringIO.StringIO()
+ zf=gzip.GzipFile(fileobj=f,mode='wb')
+ zf.write(xml.getvalue())
+ zf.close()
+ f.seek(0)
+ return f.read()
+ else:
+ return xml.getvalue()
+ else:
+ if filename[-4:]=='svgz':
+ import gzip
+ f=gzip.GzipFile(filename=filename,mode="wb", compresslevel=9)
+ f.write(xml.getvalue())
+ f.close()
+ else:
+ f=file(filename,'w')
+ f.write(xml.getvalue())
+ f.close()
+
+ else:
+ def toXml(self,filename='',compress=False):
+ """drawing.toXml() ---->to the screen
+ drawing.toXml(filename)---->to the file
+ writes a svg drawing to the screen or to a file
+ compresses if filename ends with svgz or if compress is true
+ """
+            doctype = implementation.createDocumentType('svg',"-//W3C//DTD SVG 1.0//EN",'http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd')
+
+ global root
+            #root is defined global so it can be used by the appender. It is also possible to pass it as an argument, but
+            #that is a bit messy.
+ root=implementation.createDocument(None,None,doctype)
+ #Create the xml document.
+ global appender
+ def appender(element,elementroot):
+ """This recursive function appends elements to an element and sets the attributes
+                and type. It stops when all elements have been appended"""
+ if element.namespace:
+ e=root.createElementNS(element.namespace,element.type)
+ else:
+ e=root.createElement(element.type)
+ if element.text:
+ textnode=root.createTextNode(element.text)
+ e.appendChild(textnode)
+ for attribute in element.attributes.keys(): #in element.attributes is supported from python 2.2
+ e.setAttribute(attribute,str(element.attributes[attribute]))
+ if element.elements:
+ for el in element.elements:
+ e=appender(el,e)
+ elementroot.appendChild(e)
+ return elementroot
+ root=appender(self.svg,root)
+ if not filename:
+ import cStringIO
+ xml=cStringIO.StringIO()
+ PrettyPrint(root,xml)
+ if compress:
+ import gzip
+ f=cStringIO.StringIO()
+ zf=gzip.GzipFile(fileobj=f,mode='wb')
+ zf.write(xml.getvalue())
+ zf.close()
+ f.seek(0)
+ return f.read()
+ else:
+ return xml.getvalue()
+ else:
+ try:
+ if filename[-4:]=='svgz':
+ import gzip
+ import cStringIO
+ xml=cStringIO.StringIO()
+ PrettyPrint(root,xml)
+ f=gzip.GzipFile(filename=filename,mode='wb',compresslevel=9)
+ f.write(xml.getvalue())
+ f.close()
+ else:
+ f=open(filename,'w')
+ PrettyPrint(root,f)
+ f.close()
+ except:
+ print "Cannot write SVG file: " + filename
+ def validate(self):
+ try:
+ import xml.parsers.xmlproc.xmlval
+ except:
+ raise exceptions.ImportError,'PyXml is required for validating SVG'
+ svg=self.toXml()
+ xv=xml.parsers.xmlproc.xmlval.XMLValidator()
+ try:
+ xv.feed(svg)
+ except:
+            raise ValueError("SVG is not well formed, see messages above")
+ else:
+ print "SVG well formed"
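+# Usage sketch for the drawing class (illustrative, filenames are examples only):
+#   d = drawing()
+#   d.setSVG(s)                  # s is an svg element
+#   source = d.toXml()           # no filename: returns the SVG source as a string
+#   d.toXml('plot.svg')          # writes the SVG source to a file
+#   d.toXml('plot.svgz')         # gzip-compressed, because the name ends in 'svgz'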
+if __name__=='__main__':
+
+
+ d=drawing()
+ s=svg((0,0,100,100))
+ r=rect(-100,-100,300,300,'cyan')
+ s.addElement(r)
+
+ t=title('SVGdraw Demo')
+ s.addElement(t)
+ g=group('animations')
+ e=ellipse(0,0,5,2)
+ g.addElement(e)
+ c=circle(0,0,1,'red')
+ g.addElement(c)
+ pd=pathdata(0,-10)
+ for i in range(6):
+ pd.relsmbezier(10,5,0,10)
+ pd.relsmbezier(-10,5,0,10)
+ an=animateMotion(pd,10)
+ an.attributes['rotate']='auto-reverse'
+ an.attributes['repeatCount']="indefinite"
+ g.addElement(an)
+ s.addElement(g)
+ for i in range(20,120,20):
+ u=use('#animations',i,0)
+ s.addElement(u)
+ for i in range(0,120,20):
+ for j in range(5,105,10):
+ c=circle(i,j,1,'red','black',.5)
+ s.addElement(c)
+ d.setSVG(s)
+
+ print d.toXml()
+
diff --git a/obitools/__init__.py b/obitools/__init__.py
new file mode 100644
index 0000000..3063d78
--- /dev/null
+++ b/obitools/__init__.py
@@ -0,0 +1,711 @@
+'''
+**obitools** main module
+------------------------
+
+.. codeauthor:: Eric Coissac
+
+
+
+The obitools module provides base classes for sequence manipulation.
+
+All biological sequences must be subclasses of :py:class:`obitools.BioSequence`.
+Some biological sequences are defined as transformations of other
+biological sequences. For example, reverse complemented sequences
+are a transformation of a :py:class:`obitools.NucSequence`. This particular
+type of sequence is a subclass of :py:class:`obitools.WrappedBioSequence`.
+
+.. inheritance-diagram:: BioSequence NucSequence AASequence WrappedBioSequence SubSequence DNAComplementSequence
+ :parts: 1
+
+
+'''
+
+from weakref import ref
+
+from obitools.utils.iterator import uniqueChain
+from itertools import chain
+import re
+
+_default_raw_parser = " %s *= *([^;]*);"
+
+try:
+ from functools import partial
+except:
+ #
+ # Add for compatibility purpose with Python < 2.5
+ #
+ def partial(func, *args, **keywords):
+ def newfunc(*fargs, **fkeywords):
+ newkeywords = keywords.copy()
+ newkeywords.update(fkeywords)
+ return func(*(args + fargs), **newkeywords)
+ newfunc.func = func
+ newfunc.args = args
+ newfunc.keywords = keywords
+ return newfunc
+
+
+from obitools.sequenceencoder import DNAComplementEncoder
+from obitools.location import Location
+
+class WrapperSetIterator(object):
+ def __init__(self,s):
+ self._i = set.__iter__(s)
+ def next(self):
+ return self._i.next()()
+ def __iter__(self):
+ return self
+
+class WrapperSet(set):
+ def __iter__(self):
+ return WrapperSetIterator(self)
+
+
+class BioSequence(object):
+ '''
+ BioSequence class is the base class for biological
+ sequence representation.
+
+ It provides storage of :
+
+ - the sequence itself,
+ - an identifier,
+    - a definition,
+    - and a set of complementary information on a key / value principle.
+
+ .. warning::
+
+ :py:class:`obitools.BioSequence` is an abstract class, this constructor
+ can only be called by a subclass constructor.
+ '''
+
+ def __init__(self,id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
+ '''
+
+ :param id: sequence identifier
+ :type id: `str`
+
+ :param seq: the sequence
+ :type seq: `str`
+
+ :param definition: sequence definition (optional)
+ :type definition: `str`
+
+        :param rawinfo: a text containing a set of key=value; patterns
+        :type rawinfo: `str`
+
+        :param rawparser: a text describing a regular pattern template
+                          used to parse rawinfo
+        :type rawparser: `str`
+
+ :param info: extra named parameters can be added to associate complementary
+ data to the sequence
+
+ '''
+
+ assert type(self)!=BioSequence,"obitools.BioSequence is an abstract class"
+
+ self._seq=str(seq).lower()
+ self._info = dict(info)
+ if rawinfo is not None:
+ self._rawinfo=' ' + rawinfo
+ else:
+ self._rawinfo=None
+ self._rawparser=rawparser
+ self.definition=definition
+ self.id=id
+ self._hasTaxid=None
+
+ def get_seq(self):
+ return self.__seq
+
+
+ def set_seq(self, value):
+ if not isinstance(value, str):
+ value=str(value)
+ self.__seq = value
+ self.__len = len(value)
+
+
+ def clone(self):
+ seq = type(self)(self.id,
+ str(self),
+ definition=self.definition
+ )
+ seq._info=dict(self.getTags())
+ seq._rawinfo=self._rawinfo
+ seq._rawparser=self._rawparser
+ seq._hasTaxid=self._hasTaxid
+ return seq
+
+ def getDefinition(self):
+ '''
+ Sequence definition getter.
+
+ :return: the sequence definition
+ :rtype: str
+
+ '''
+ return self._definition
+
+ def setDefinition(self, value):
+ '''
+ Sequence definition setter.
+
+ :param value: the new sequence definition
+ :type value: C{str}
+ :return: C{None}
+ '''
+ self._definition = value
+
+ def getId(self):
+ '''
+ Sequence identifier getter
+
+ :return: the sequence identifier
+ :rtype: C{str}
+ '''
+ return self._id
+
+ def setId(self, value):
+ '''
+ Sequence identifier setter.
+
+ :param value: the new sequence identifier
+ :type value: C{str}
+ :return: C{None}
+ '''
+ self._id = value
+
+ def getStr(self):
+ '''
+ Return the sequence as a string
+
+ :return: the string representation of the sequence
+ :rtype: str
+ '''
+ return self._seq
+
+ def getSymbolAt(self,position):
+ '''
+        Return the symbol at C{position} in the sequence
+
+        :param position: the desired position. Positions start from 0;
+                         if position is < 0, it is considered
+                         to reference the end of the sequence.
+        :type position: `int`
+
+ :return: a one letter string
+ :rtype: `str`
+ '''
+ return str(self)[position]
+
+ def getSubSeq(self,location):
+ '''
+        return a subsequence as described by C{location}.
+
+        The C{location} parameter can be a L{obitools.location.Location} instance,
+        an integer or a python C{slice} instance. If C{location}
+        is an integer this method is equivalent to L{getSymbolAt}.
+
+        :param location: the positions of the subsequence to return
+        :type location: C{Location} or C{int} or C{slice}
+        :return: the subsequence
+        :rtype: a single character as a C{str} if C{location} is an integer,
+                a L{obitools.SubSequence} instance otherwise.
+
+ '''
+ if isinstance(location,Location):
+ return location.extractSequence(self)
+ elif isinstance(location, int):
+ return self.getSymbolAt(location)
+ elif isinstance(location, slice):
+ return SubSequence(self,location)
+
+ raise TypeError,'key must be a Location, an integer or a slice'
+
+ def getKey(self,key):
+ if key not in self._info:
+ if self._rawinfo is None:
+ if key=='count':
+ return 1
+ else:
+ raise KeyError,key
+ p = re.compile(self._rawparser % key)
+ m = p.search(self._rawinfo)
+ if m is not None:
+ v=m.group(1)
+ self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
+ try:
+ v = eval(v)
+ except:
+ pass
+ self._info[key]=v
+ else:
+ if key=='count':
+ v=1
+ else:
+ raise KeyError,key
+ else:
+ v=self._info[key]
+ return v
+
+ def extractTaxon(self):
+ '''
+ Extract Taxonomy information from the sequence header.
+        This method returns None by default. It should be subclassed
+        if necessary, as in L{obitools.seqdb.AnnotatedSequence}.
+
+ :return: None
+ '''
+ self._hasTaxid=self.hasKey('taxid')
+ return None
+
+ def __str__(self):
+ return self.getStr()
+
+ def __getitem__(self,key):
+ if isinstance(key, str):
+ if key=='taxid' and self._hasTaxid is None:
+ self.extractTaxon()
+ return self.getKey(key)
+ else:
+ return self.getSubSeq(key)
+
+ def __setitem__(self,key,value):
+ self._info[key]=value
+ if key=='taxid':
+ self._hasTaxid=value is not None
+
+ def __delitem__(self,key):
+ if isinstance(key, str):
+ if key in self:
+ del self._info[key]
+ else:
+ raise KeyError,key
+
+ if key=='taxid':
+ self._hasTaxid=False
+ else:
+ raise TypeError,key
+
+ def __iter__(self):
+ '''
+ Iterate through the sequence symbols
+ '''
+ return iter(str(self))
+
+ def __len__(self):
+ return self.__len
+
+ def hasKey(self,key):
+ rep = key in self._info
+
+ if not rep and self._rawinfo is not None:
+ p = re.compile(self._rawparser % key)
+ m = p.search(self._rawinfo)
+ if m is not None:
+ v=m.group(1)
+ self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
+ try:
+ v = eval(v)
+ except:
+ pass
+ self._info[key]=v
+ rep=True
+
+ return rep
+
+ def __contains__(self,key):
+ '''
+        Method allowing use of the C{in} operator on a C{BioSequence}.
+
+        The C{in} operator tests if the C{key} value is defined for this
+        sequence.
+
+        :param key: the name of the checked value
+        :type key: str
+        :return: C{True} if the value is defined, C{False} otherwise.
+ :rtype: C{bool}
+ '''
+ if key=='taxid' and self._hasTaxid is None:
+ self.extractTaxon()
+ return self.hasKey(key)
+
+ def rawiteritems(self):
+ return self._info.iteritems()
+
+ def iteritems(self):
+ '''
+        iterate over the items dictionary storing the values
+        associated with the sequence. It works similarly to
+        the iteritems method of C{dict}.
+
+        :return: an iterator over the items (key,value)
+                 linked to a sequence
+ :rtype: iterator over tuple
+ :see: L{items}
+ '''
+ if self._rawinfo is not None:
+ p = re.compile(self._rawparser % "([a-zA-Z]\w*)")
+ for k,v in p.findall(self._rawinfo):
+ try:
+ self._info[k]=eval(v)
+ except:
+ self._info[k]=v
+ self._rawinfo=None
+ return self._info.iteritems()
+
+ def items(self):
+ return [x for x in self.iteritems()]
+
+ def iterkeys(self):
+ return (k for k,v in self.iteritems())
+
+ def keys(self):
+ return [x for x in self.iterkeys()]
+
+ def getTags(self):
+ self.iteritems()
+ return self._info
+
+ def getRoot(self):
+ return self
+
+ def getWrappers(self):
+ if not hasattr(self, '_wrappers'):
+ self._wrappers=WrapperSet()
+ return self._wrappers
+
+ def register(self,wrapper):
+ self.wrappers.add(ref(wrapper,self._unregister))
+
+ def _unregister(self,ref):
+ self.wrappers.remove(ref)
+
+ wrappers = property(getWrappers,None,None,'')
+
+ definition = property(getDefinition, setDefinition, None, "Sequence Definition")
+
+ id = property(getId, setId, None, 'Sequence identifier')
+
+ def _getTaxid(self):
+ return self['taxid']
+
+ def _setTaxid(self,taxid):
+ self['taxid']=taxid
+
+ taxid = property(_getTaxid,_setTaxid,None,'NCBI Taxonomy identifier')
+ _seq = property(get_seq, set_seq, None, None)
+
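+# A minimal usage sketch (illustrative): tags given as rawinfo are parsed lazily,
+# the first time a key is requested.
+#
+#   seq = NucSequence('seq1', 'acgtacgt', 'a demo sequence',
+#                     rawinfo='count=3; taxid=9606;')
+#   seq['count']          # -> 3, extracted from rawinfo on first access
+#   len(seq)              # -> 8
+#   str(seq[2:5])         # -> 'gta', a SubSequence view on seq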
+class NucSequence(BioSequence):
+ """
+ :py:class:`NucSequence` specialize the :py:class:`BioSequence` class for storing DNA
+ sequences.
+
+ The constructor is identical to the :py:class:`BioSequence` constructor.
+ """
+
+ def complement(self):
+ """
+ :return: The reverse complemented sequence as an instance of :py:class:`DNAComplementSequence`
+ :rtype: :py:class:`DNAComplementSequence`
+ """
+ return DNAComplementSequence(self)
+
+ def isNucleotide(self):
+ return True
+
+
+class AASequence(BioSequence):
+ """
+ :py:class:`AASequence` specialize the :py:class:`BioSequence` class for storing protein
+ sequences.
+
+ The constructor is identical to the :py:class:`BioSequence` constructor.
+ """
+
+
+ def isNucleotide(self):
+ return False
+
+
+class WrappedBioSequence(BioSequence):
+ """
+ .. warning::
+
+ :py:class:`obitools.WrappedBioSequence` is an abstract class, this constructor
+ can only be called by a subclass constructor.
+ """
+
+
+ def __init__(self,reference,id=None,definition=None,**info):
+
+ assert type(self)!=WrappedBioSequence,"obitools.WrappedBioSequence is an abstract class"
+
+ self._wrapped = reference
+ reference.register(self)
+ self._id=id
+ self.definition=definition
+ self._info=info
+
+ def clone(self):
+ seq = type(self)(self.wrapped,
+ id=self._id,
+ definition=self._definition
+ )
+ seq._info=dict(self._info)
+
+ return seq
+
+ def getWrapped(self):
+ return self._wrapped
+
+ def getDefinition(self):
+ d = self._definition or self.wrapped.definition
+ return d
+
+ def getId(self):
+ d = self._id or self.wrapped.id
+ return d
+
+ def isNucleotide(self):
+ return self.wrapped.isNucleotide()
+
+
+ def iterkeys(self):
+ return uniqueChain(self._info.iterkeys(),
+ self.wrapped.iterkeys())
+
+ def rawiteritems(self):
+ return chain(self._info.iteritems(),
+ (x for x in self.wrapped.rawiteritems()
+ if x[0] not in self._info))
+
+ def iteritems(self):
+ for x in self.iterkeys():
+ yield (x,self[x])
+
+ def getKey(self,key):
+ if key in self._info:
+ return self._info[key]
+ else:
+ return self.wrapped.getKey(key)
+
+ def hasKey(self,key):
+ return key in self._info or self.wrapped.hasKey(key)
+
+ def getSymbolAt(self,position):
+ return self.wrapped.getSymbolAt(self.posInWrapped(position))
+
+ def posInWrapped(self,position,reference=None):
+ if reference is None or reference is self.wrapped:
+ return self._posInWrapped(position)
+ else:
+ return self.wrapped.posInWrapped(self._posInWrapped(position),reference)
+
+
+ def getStr(self):
+ return str(self.wrapped)
+
+ def getRoot(self):
+ return self.wrapped.getRoot()
+
+ def complement(self):
+ """
+        The :py:meth:`complement` method of the :py:class:`WrappedBioSequence` class
+        raises an :py:exc:`AttributeError` exception if the method is called and the wrapped
+        sequence does not correspond to a nucleic acid sequence.
+ """
+
+ if self.wrapped.isNucleotide():
+ return DNAComplementSequence(self)
+ raise AttributeError
+
+
+ def _posInWrapped(self,position):
+ return position
+
+
+ definition = property(getDefinition,BioSequence.setDefinition, None)
+ id = property(getId,BioSequence.setId, None)
+
+ wrapped = property(getWrapped, None, None, "A pointer to the wrapped sequence")
+
+ def _getWrappedRawInfo(self):
+ return self.wrapped._rawinfo
+
+ _rawinfo = property(_getWrappedRawInfo)
+
+
+class SubSequence(WrappedBioSequence):
+ """
+ """
+
+
+ @staticmethod
+ def _sign(x):
+ if x == 0:
+ return 0
+ elif x < 0:
+ return -1
+ return 1
+
+ def __init__(self,reference,
+ location=None,
+ start=None,stop=None,
+ id=None,definition=None,
+ **info):
+        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
+
+ if isinstance(location, slice):
+ self._location = location
+ else:
+ step = 1
+ if not isinstance(start, int):
+                start = 0
+ if not isinstance(stop,int):
+ stop = len(reference)
+ self._location=slice(start,stop,step)
+
+ self._indices=self._location.indices(len(self.wrapped))
+ self._xrange=xrange(*self._indices)
+
+ self._info['cut']='[%d,%d,%s]' % self._indices
+
+ if hasattr(reference,'quality'):
+ self.quality = reference.quality[self._location]
+
+ def getId(self):
+ d = self._id or ("%s_SUB" % self.wrapped.id)
+ return d
+
+
+ def clone(self):
+ seq = WrappedBioSequence.clone(self)
+ seq._location=self._location
+ seq._indices=seq._location.indices(len(seq.wrapped))
+ seq._xrange=xrange(*seq._indices)
+ return seq
+
+
+ def __len__(self):
+ return len(self._xrange)
+
+ def getStr(self):
+ return ''.join([x for x in self])
+
+ def __iter__(self):
+ return (self.wrapped.getSymbolAt(x) for x in self._xrange)
+
+ def _posInWrapped(self,position):
+ return self._xrange[position]
+
+
+ id = property(getId,BioSequence.setId, None)
+
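+# Illustrative behaviour of SubSequence, with seq from the sketch above:
+#   sub = seq[2:5]
+#   sub.id                # -> 'seq1_SUB'
+#   sub['cut']            # -> '[2,5,1]', the slice recorded in the tags
+#   sub.wrapped is seq    # -> True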
+
+
+class DNAComplementSequence(WrappedBioSequence):
+ """
+ Class used to represent a reverse complemented DNA sequence. Usually instances
+ of this class are produced by using the :py:meth:`NucSequence.complement` method.
+ """
+
+
+ _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
+ 'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
+ 's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
+ 'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
+ '-': '-'}
+
+
+ def __init__(self,reference,
+ id=None,definition=None,**info):
+        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
+ assert reference.isNucleotide()
+ self._info['complemented']=True
+ if hasattr(reference,'quality'):
+ self.quality = reference.quality[::-1]
+
+
+ def getId(self):
+ d = self._id or ("%s_CMP" % self.wrapped.id)
+ return d
+
+ def __len__(self):
+ return len(self._wrapped)
+
+ def getStr(self):
+ return ''.join([x for x in self])
+
+ def __iter__(self):
+ return (self.getSymbolAt(x) for x in xrange(len(self)))
+
+ def _posInWrapped(self,position):
+ return -(position+1)
+
+ def getSymbolAt(self,position):
+ return DNAComplementSequence._comp[self.wrapped.getSymbolAt(self.posInWrapped(position))]
+
+ def complement(self):
+ """
+        The :py:meth:`complement` method of the :py:class:`DNAComplementSequence` class actually
+        returns the wrapped sequence. Effectively, the reverse complement of a reverse
+        complemented sequence is the initial sequence.
+ """
+ return self.wrapped
+
+ id = property(getId,BioSequence.setId, None)
+
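+# Illustrative behaviour: complement() returns a lazy reverse-complement view,
+# and complementing that view again yields the original wrapped object.
+#   rc = NucSequence('s1', 'aacgt').complement()
+#   str(rc)                          # -> 'acgtt'
+#   rc.id                            # -> 's1_CMP'
+#   rc.complement() is rc.wrapped    # -> True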
+
+def _isNucSeq(text):
+ acgt = 0
+ notnuc = 0
+ ltot = len(text) * 0.8
+ for c in text.lower():
+ if c in 'acgt-':
+ acgt+=1
+ if c not in DNAComplementEncoder._comp:
+ notnuc+=1
+ return notnuc==0 and float(acgt) > ltot
+
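+# Illustrative examples for the heuristic above (exact results depend on the
+# symbols accepted by DNAComplementEncoder._comp):
+#   _isNucSeq('acgtacgt')    # -> True: 100% of a, c, g, t symbols
+#   _isNucSeq('MEEPQRST')    # -> False: contains non-nucleotide letters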
+
+def bioSeqGenerator(id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
+ """
+    Automatically generates the appropriate class instance among:
+
+    - :py:class:`NucSequence`
+    - :py:class:`AASequence`
+
+    Build a new sequence instance. Sequences are instantiated as :py:class:`NucSequence` if the
+    `seq` attribute contains more than 80% of *A*, *C*, *G*, *T* or *-* symbols
+    in upper or lower case. Otherwise, the new sequence instance is instantiated as
+    :py:class:`AASequence`.
+
+
+
+ :param id: sequence identifier
+ :type id: `str`
+
+ :param seq: the sequence
+ :type seq: `str`
+
+ :param definition: sequence definition (optional)
+ :type definition: `str`
+
+    :param rawinfo: a text containing a set of key=value; patterns
+    :type rawinfo: `str`
+
+    :param rawparser: a text describing a regular pattern template
+                      used to parse rawinfo
+    :type rawparser: `str`
+
+ :param info: extra named parameters can be added to associate complementary
+ data to the sequence
+ """
+ if _isNucSeq(seq):
+ return NucSequence(id,seq,definition,rawinfo,rawparser,**info)
+ else:
+ return AASequence(id,seq,definition,rawinfo,rawparser,**info)
+
diff --git a/obitools/__init__.pyc b/obitools/__init__.pyc
new file mode 100644
index 0000000..3cc2111
Binary files /dev/null and b/obitools/__init__.pyc differ
diff --git a/obitools/align/__init__.py b/obitools/align/__init__.py
new file mode 100644
index 0000000..54cca7d
--- /dev/null
+++ b/obitools/align/__init__.py
@@ -0,0 +1,13 @@
+
+
+from _nws import NWS
+from _upperbond import indexSequences
+from _lcs import LCS,lenlcs
+from _assemble import DirectAssemble, ReverseAssemble
+from _qsassemble import QSolexaDirectAssemble,QSolexaReverseAssemble
+from _rassemble import RightDirectAssemble as RightReverseAssemble
+from _qsrassemble import QSolexaRightDirectAssemble,QSolexaRightReverseAssemble
+from _freeendgap import FreeEndGap
+from _freeendgapfm import FreeEndGapFullMatch
+from _upperbond import isLCSReachable
+
diff --git a/obitools/align/_assemble.so b/obitools/align/_assemble.so
new file mode 100755
index 0000000..dbc2139
Binary files /dev/null and b/obitools/align/_assemble.so differ
diff --git a/obitools/align/_dynamic.so b/obitools/align/_dynamic.so
new file mode 100755
index 0000000..2f93d3a
Binary files /dev/null and b/obitools/align/_dynamic.so differ
diff --git a/obitools/align/_freeendgap.so b/obitools/align/_freeendgap.so
new file mode 100755
index 0000000..53cd9c0
Binary files /dev/null and b/obitools/align/_freeendgap.so differ
diff --git a/obitools/align/_freeendgapfm.so b/obitools/align/_freeendgapfm.so
new file mode 100755
index 0000000..f88c07b
Binary files /dev/null and b/obitools/align/_freeendgapfm.so differ
diff --git a/obitools/align/_lcs.so b/obitools/align/_lcs.so
new file mode 100755
index 0000000..555a2a2
Binary files /dev/null and b/obitools/align/_lcs.so differ
diff --git a/obitools/align/_nws.so b/obitools/align/_nws.so
new file mode 100755
index 0000000..af7e849
Binary files /dev/null and b/obitools/align/_nws.so differ
diff --git a/obitools/align/_profilenws.so b/obitools/align/_profilenws.so
new file mode 100755
index 0000000..baa8eda
Binary files /dev/null and b/obitools/align/_profilenws.so differ
diff --git a/obitools/align/_qsassemble.so b/obitools/align/_qsassemble.so
new file mode 100755
index 0000000..3bc83e9
Binary files /dev/null and b/obitools/align/_qsassemble.so differ
diff --git a/obitools/align/_qsrassemble.so b/obitools/align/_qsrassemble.so
new file mode 100755
index 0000000..75b98aa
Binary files /dev/null and b/obitools/align/_qsrassemble.so differ
diff --git a/obitools/align/_rassemble.so b/obitools/align/_rassemble.so
new file mode 100755
index 0000000..e2a063c
Binary files /dev/null and b/obitools/align/_rassemble.so differ
diff --git a/obitools/align/_upperbond.so b/obitools/align/_upperbond.so
new file mode 100755
index 0000000..5f2b1fe
Binary files /dev/null and b/obitools/align/_upperbond.so differ
diff --git a/obitools/align/homopolymere.py b/obitools/align/homopolymere.py
new file mode 100644
index 0000000..5efcbff
--- /dev/null
+++ b/obitools/align/homopolymere.py
@@ -0,0 +1,56 @@
+'''
+Created on 14 mai 2009
+
+@author: coissac
+'''
+
+from obitools import WrappedBioSequence
+
+class HomoNucBioSeq(WrappedBioSequence):
+ '''
+ classdocs
+ '''
+
+
+ def __init__(self,reference,id=None,definition=None,**info):
+ '''
+ Constructor
+ '''
+ assert reference.isNucleotide(),"reference must be a nucleic sequence"
+        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
+ self.__cleanHomopolymer()
+
+ def __cleanHomopolymer(self):
+ s = []
+ c = []
+ old=None
+ nc=0
+ for n in self._wrapped:
+ if old is not None and n!=old:
+ s.append(old)
+ c.append(nc)
+ nc=0
+ old=n
+            nc+=1
+        if old is not None:
+            # append the last homopolymer run
+            s.append(old)
+            c.append(nc)
+        self._cached=''.join(s)
+        self['homopolymer']=c
+        self._cumulative=[]
+        total=0
+        for count in c:
+            total+=count
+            self._cumulative.append(total)
+
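+    # Illustrative example: for a wrapped sequence 'aaacgtt' the compressed
+    # string is 'acgt', self['homopolymer'] is [3,1,1,2] and _cumulative holds
+    # the running sums [3,4,5,7].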
+ def __len__(self):
+ return len(self._cached)
+
+ def getStr(self):
+ return self._cached
+
+ def __iter__(self):
+ return iter(self._cached)
+
+ def _posInWrapped(self,position):
+ return self._cumulative[position]
+
+
+
\ No newline at end of file
diff --git a/obitools/align/ssearch.py b/obitools/align/ssearch.py
new file mode 100644
index 0000000..55a74ce
--- /dev/null
+++ b/obitools/align/ssearch.py
@@ -0,0 +1,46 @@
+import os
+import re
+
+from obitools.fasta import formatFasta
+
+class SsearchParser(object):
+
+ _matchQuery = re.compile("^Query:.+\n.+?>+([^ ]+)", re.MULTILINE)
+ _matchLQuery = re.compile("^Query:.+\n.+?(\d+)(?= nt\n)", re.MULTILINE)
+ _matchProp = re.compile("^The best scores are:.*\n(.+?)>>>", re.DOTALL+re.MULTILINE)
+ def __init__(self,file):
+ if isinstance(file,str):
+ file = open(file,'rU')
+ self.data = file.read()
+ self.query= SsearchParser._matchQuery.search(self.data).group(1)
+ self.queryLength= int(SsearchParser._matchLQuery.search(self.data).group(1))
+ props = SsearchParser._matchProp.search(self.data)
+ if props:
+ props=props.group(0).split('\n')[1:-2]
+ self.props=[]
+ for line in props:
+ subject,tab = line.split('\t')
+ tab=tab.split()
+ ssp = subject.split()
+ ac = ssp[0]
+ dbl= int(ssp[-5][:-1])
+ ident = float(tab[0])
+ matchlen = abs(int(tab[5]) - int(tab[4])) +1
+ self.props.append({"ac" :ac,
+ "identity" :ident,
+ "subjectlength":dbl,
+ 'matchlength' : matchlen})
+
+def run(seq,database,program='fasta35',opts=''):
+ ssearchin,ssearchout,ssearcherr = os.popen3("%s %s %s" % (program,opts,database))
+ print >>ssearchin,formatFasta(seq)
+ ssearchin.close()
+ result = SsearchParser(ssearchout)
+
+ return seq,result
+
+def ssearchIterator(sequenceIterator,database,program='ssearch35',opts=''):
+ for seq in sequenceIterator:
+ yield run(seq,database,program,opts)
+
+
diff --git a/obitools/alignment/__init__.py b/obitools/alignment/__init__.py
new file mode 100644
index 0000000..a89793a
--- /dev/null
+++ b/obitools/alignment/__init__.py
@@ -0,0 +1,175 @@
+from obitools import BioSequence
+from obitools import WrappedBioSequence
+from copy import deepcopy
+
+class GappedPositionException(Exception):
+ pass
+
+class AlignedSequence(WrappedBioSequence):
+
+ def __init__(self,reference,
+ id=None,definition=None,**info):
+        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
+ self._length=len(reference)
+ self._gaps=[[self._length,0]]
+
+ def clone(self):
+ seq = WrappedBioSequence.clone(self)
+ seq._gaps=deepcopy(self._gaps)
+ seq._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0)
+ return seq
+
+ def setGaps(self, value):
+ '''
+ Set gap vector to an AlignedSequence.
+
+ Gap vector describes the gap positions on a sequence.
+        Gap vector describes the gap positions on a sequence.
+        It is a list of couples. The first member of each couple is the count
+        of sequence letters, the second one is the gap length.
+        @param value: a list of length 2 lists describing gap positions
+ '''
+ assert isinstance(value, list),'Gap vector must be a list'
+ assert reduce(lambda x,y: x and y,
+ (isinstance(z, list) and len(z)==2 for z in value),
+ True),"Value must be a list of length 2 list"
+
+ lseq = reduce(lambda x,y:x+y, (z[0] for z in value),0)
+ assert lseq==len(self.wrapped),"Gap vector incompatible with the sequence"
+ self._gaps = value
+ self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in value),0)
+
+ def getGaps(self):
+ return tuple(self._gaps)
+ gaps = property(getGaps, setGaps, None, "Gaps's Docstring")
+
+ def _getIndice(self,pos):
+ i=0
+ cpos=0
+ for s,g in self._gaps:
+ cpos+=s
+ if cpos>pos:
+ return i,pos-cpos+s
+ cpos+=g
+ if cpos>pos:
+ return i,-pos+cpos-g-1
+ i+=1
+ raise IndexError
+
+ def getId(self):
+ d = self._id or ("%s_ALN" % self.wrapped.id)
+ return d
+
+ def __len__(self):
+ return self._length
+
+ def getStr(self):
+ return ''.join([x for x in self])
+
+ def __iter__(self):
+ def isymb():
+ cpos=0
+ for s,g in self._gaps:
+ for x in xrange(s):
+ yield self.wrapped[cpos+x]
+ for x in xrange(g):
+ yield '-'
+ cpos+=s
+ return isymb()
+
+ def _posInWrapped(self,position):
+ i,s=self._getIndice(position)
+ if s<0:
+ raise GappedPositionException
+ value=self._gaps
+ p=reduce(lambda x,y:x+y, (z[0] for z in value[:i]),0)+s
+ return p
+
+ def getSymbolAt(self,position):
+ try:
+ return self.wrapped.getSymbolAt(self.posInWrapped(position))
+ except GappedPositionException:
+ return '-'
+
+ def insertGap(self,position,count=1):
+ if position==self._length:
+ idx=len(self._gaps)-1
+ p=-1
+ else:
+ idx,p = self._getIndice(position)
+
+ if p >= 0:
+ self._gaps.insert(idx, [p,count])
+ self._gaps[idx+1][0]-=p
+ else:
+ self._gaps[idx][1]+=count
+ self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0)
+
+
+ id = property(getId,BioSequence.setId, None, "Sequence Identifier")
+
+
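+# Gap vector sketch (illustrative): with a wrapped sequence reading 'acgt',
+# the vector [[2, 1], [2, 0]] means two letters, one gap, then two letters,
+# so the aligned sequence iterates as 'ac-gt'.
+#
+#   aligned = AlignedSequence(seq)        # seq: a BioSequence reading 'acgt'
+#   aligned.gaps = [[2, 1], [2, 0]]
+#   print aligned.getStr()                # -> 'ac-gt'
+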
+class Alignment(list):
+
+ def _assertData(self,data):
+        assert isinstance(data, BioSequence),'You can only add BioSequence instances to an Alignment'
+ if hasattr(self, '_alignlen'):
+ assert self._alignlen==len(data),'All aligned sequences must have the same length'
+ else:
+ self._alignlen=len(data)
+ return data
+
+ def clone(self):
+ ali = Alignment(x.clone() for x in self)
+ return ali
+
+ def append(self,data):
+ data = self._assertData(data)
+ list.append(self,data)
+
+ def __setitem__(self,index,data):
+
+ data = self._assertData(data)
+ list.__setitem__(self,index,data)
+
+ def getSite(self,key):
+ if isinstance(key,int):
+ return [x[key] for x in self]
+
+ def insertGap(self,position,count=1):
+ for s in self:
+ s.insertGap(position,count)
+
+ def isFullGapSite(self,key):
+ return reduce(lambda x,y: x and y,(z=='-' for z in self.getSite(key)),True)
+
+ def isGappedSite(self,key):
+ return '-' in self.getSite(key)
+
+ def __str__(self):
+ l = len(self[0])
+ rep=""
+ idmax = max(len(x.id) for x in self)+2
+ template= "%%-%ds %%-60s" % idmax
+ for p in xrange(0,l,60):
+ for s in self:
+ rep+= (template % (s.id,s[p:p+60])).strip() + '\n'
+ rep+="\n"
+ return rep
+
+def alignmentReader(file,sequenceIterator):
+ seqs = sequenceIterator(file)
+    alignment = Alignment()
+    for seq in seqs:
+        alignment.append(seq)
+    return alignment
+
+
+
+
+
+def columnIterator(alignment):
+ lali = len(alignment[0])
+ for p in xrange(lali):
+ c = [x[p] for x in alignment]
+ yield c
\ No newline at end of file
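+# Usage sketch (illustrative): build an Alignment from already aligned
+# sequences and walk through its columns.
+#
+#   ali = Alignment()
+#   for seq in alignedSequences:          # hypothetical iterable
+#       ali.append(seq)
+#   for column in columnIterator(ali):
+#       print ''.join(column)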
diff --git a/obitools/alignment/ace.py b/obitools/alignment/ace.py
new file mode 100644
index 0000000..59cc8f6
--- /dev/null
+++ b/obitools/alignment/ace.py
@@ -0,0 +1,47 @@
+from obitools.format.genericparser import GenericParser
+from obitools.utils import universalOpen
+from obitools.fasta import parseFastaDescription
+from obitools import NucSequence
+
+
+import sys
+
+_contigIterator=GenericParser('^CO ')
+
+_contigIterator.addParseAction('AF', '\nAF +(\S+) +([UC]) +(-?[0-9]+)')
+_contigIterator.addParseAction('RD', '\nRD +(\S+) +([0-9]+) +([0-9]+) +([0-9]+) *\n([A-Za-z\n*]+?)\n\n')
+_contigIterator.addParseAction('DS', '\nDS +(.+)')
+_contigIterator.addParseAction('CO', '^CO (\S+)')
+
+def contigIterator(file):
+ file = universalOpen(file)
+ for entry in _contigIterator(file):
+ contig=[]
+ for rd,ds,af in map(None,entry['RD'],entry['DS'],entry['AF']):
+ id = rd[0]
+ shift = int(af[2])
+ if shift < 0:
+ print >> sys.stderr,"Sequence %s in contig %s has a negative paddng value %d : skipped" % (id,entry['CO'][0],shift)
+ #continue
+
+ definition,info = parseFastaDescription(ds)
+ info['shift']=shift
+ seq = rd[4].replace('\n','').replace('*','-').strip()
+ contig.append(NucSequence(id,seq,definition,**info))
+
+ maxlen = max(len(x)+x['shift'] for x in contig)
+ minshift=min(x['shift'] for x in contig)
+ rep = []
+
+ for s in contig:
+ info = s.getTags()
+ info['shift']-=minshift-1
+ head = '-' * (info['shift']-1)
+
+ tail = (maxlen + minshift - len(s) - info['shift'] - 1)
+ info['tail']=tail
+ newseq = NucSequence(s.id,head + str(s)+ '-' * tail,s.definition,**info)
+ rep.append(newseq)
+
+ yield entry['CO'][0],rep
+
\ No newline at end of file
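+# Usage sketch (illustrative): iterate over the contigs of an ACE assembly
+# file and print the padded members of each contig.
+#
+#   for name,seqs in contigIterator('assembly.ace'):
+#       print name, len(seqs)
+#       for s in seqs:
+#           print s.id, str(s)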
diff --git a/obitools/barcodecoverage/__init__.py b/obitools/barcodecoverage/__init__.py
new file mode 100644
index 0000000..09e542e
--- /dev/null
+++ b/obitools/barcodecoverage/__init__.py
@@ -0,0 +1,7 @@
+'''
+Creates the tree representing the coverage of a primer pair from an
+ecoPCR output file and an ecoPCR database.
+
+@author: merciece
+
+'''
\ No newline at end of file
diff --git a/obitools/barcodecoverage/calcBc.py b/obitools/barcodecoverage/calcBc.py
new file mode 100644
index 0000000..13b0401
--- /dev/null
+++ b/obitools/barcodecoverage/calcBc.py
@@ -0,0 +1,62 @@
+#!/usr/local/bin/python
+'''
+Created on 24 nov. 2011
+
+@author: merciece
+'''
+
+
+def main(amplifiedSeqs, seqsFromDB, keptRanks, errors, tax) :
+    '''
+    Computes the barcode coverage (Bc) of each taxonomic group.
+    Sequences are counted as amplified when their total number of primer
+    mismatches is at most 3 (the error threshold).
+    '''
+
+ listtaxabygroupinDB = {}
+
+ for seq in seqsFromDB :
+ taxid = seq['taxid']
+ p = [a for a in tax.parentalTreeIterator(taxid)]
+ for a in p :
+ if a != p[0] :
+ if a[1] in keptRanks :
+ group = a[0]
+ if group in listtaxabygroupinDB and taxid not in listtaxabygroupinDB[group] :
+ listtaxabygroupinDB[group].add(taxid)
+ elif group not in listtaxabygroupinDB :
+ listtaxabygroupinDB[group]=set([taxid])
+
+ taxabygroup = dict((x,len(listtaxabygroupinDB[x])) for x in listtaxabygroupinDB)
+
+ listamplifiedtaxabygroup = {}
+
+ for seq in amplifiedSeqs :
+ if errors[seq.id][2] <= 3 :
+ taxid = seq['taxid']
+ p = [a for a in tax.parentalTreeIterator(taxid)]
+ for a in p :
+ if a != p[0] :
+ if a[1] in keptRanks :
+ group = a[0]
+ if group in listamplifiedtaxabygroup and taxid not in listamplifiedtaxabygroup[group] :
+ listamplifiedtaxabygroup[group].add(taxid)
+ elif group not in listamplifiedtaxabygroup :
+ listamplifiedtaxabygroup[group]=set([taxid])
+
+ amplifiedtaxabygroup = dict((x,len(listamplifiedtaxabygroup[x])) for x in listamplifiedtaxabygroup)
+
+ BcValues = {}
+
+ groups = [g for g in taxabygroup.keys()]
+
+ for g in groups :
+ if g in amplifiedtaxabygroup :
+ BcValues[g] = float(amplifiedtaxabygroup[g])/taxabygroup[g]*100
+ BcValues[g] = round(BcValues[g], 2)
+ else :
+ BcValues[g] = 0.0
+
+ return BcValues
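+# Worked example (illustrative): if a genus groups 10 taxa in the database
+# and 7 of them are amplified with at most 3 total primer mismatches, its
+# barcode coverage is Bc = 7/10*100 = 70.0.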
+
+
+
+
diff --git a/obitools/barcodecoverage/calculateBc.py b/obitools/barcodecoverage/calculateBc.py
new file mode 100644
index 0000000..c5edb8a
--- /dev/null
+++ b/obitools/barcodecoverage/calculateBc.py
@@ -0,0 +1,72 @@
+#!/usr/local/bin/python
+'''
+Created on 24 nov. 2011
+
+@author: merciece
+'''
+
+import sys
+
+
+def main(amplifiedSeqs, seqsFromDB, keptRanks, tax) :
+
+ BcValues = {}
+
+ #speciesid = tax.findRankByName('species')
+ #subspeciesid = tax.findRankByName('subspecies')
+
+ listtaxonbygroup = {}
+
+ for seq in seqsFromDB :
+ taxid = seq['taxid']
+ p = [a for a in tax.parentalTreeIterator(taxid)]
+ for a in p :
+ if a != p[0] :
+ if a[1] in keptRanks :
+ group = a
+ if group in listtaxonbygroup:
+ listtaxonbygroup[group].add(taxid)
+ else:
+ listtaxonbygroup[group]=set([taxid])
+
+ #stats = dict((x,len(listtaxonbygroup[x])) for x in listtaxonbygroup)
+
+ print>>sys.stderr, listtaxonbygroup
+
+ listtaxonbygroup = {}
+
+ for seq in amplifiedSeqs :
+ taxid = seq['taxid']
+ p = [a for a in tax.parentalTreeIterator(taxid)]
+ for a in p :
+ if a != p[0] :
+ if a[1] in keptRanks :
+ group = a
+ if group in listtaxonbygroup:
+ listtaxonbygroup[group].add(taxid)
+ else:
+ listtaxonbygroup[group]=set([taxid])
+
+ print>>sys.stderr, listtaxonbygroup
+
+ return BcValues
+
+# dbstats= dict((x,len(listtaxonbygroup[x])) for x in listtaxonbygroup)
+#
+# ranks = [r for r in keptRanks]
+# ranks.sort()
+#
+# print '%-20s\t%10s\t%10s\t%7s' % ('rank','ecopcr','db','percent')
+#
+# print>>sys.stderr, stats
+# print>>sys.stderr, dbstats
+# print>>sys.stderr, ranks
+#
+# for r in ranks:
+# if r in dbstats and dbstats[r]:
+# print '%-20s\t%10d\t%10d\t%8.2f' % (r,dbstats[r],stats[r],float(dbstats[r])/stats[r]*100)
+
+
+
+
+
diff --git a/obitools/barcodecoverage/drawBcTree.py b/obitools/barcodecoverage/drawBcTree.py
new file mode 100644
index 0000000..9b1e215
--- /dev/null
+++ b/obitools/barcodecoverage/drawBcTree.py
@@ -0,0 +1,108 @@
+#!/usr/local/bin/python
+'''
+Created on 25 nov. 2011
+
+@author: merciece
+'''
+
+from obitools.graph.rootedtree import nexusFormat
+
+
+figtree="""\
+begin figtree;
+ set appearance.backgroundColorAttribute="User Selection";
+ set appearance.backgroundColour=#-1;
+ set appearance.branchColorAttribute="bc";
+ set appearance.branchLineWidth=2.0;
+ set appearance.foregroundColour=#-16777216;
+ set appearance.selectionColour=#-2144520576;
+ set branchLabels.colorAttribute="User Selection";
+ set branchLabels.displayAttribute="errors";
+ set branchLabels.fontName="sansserif";
+ set branchLabels.fontSize=10;
+ set branchLabels.fontStyle=0;
+ set branchLabels.isShown=true;
+ set branchLabels.significantDigits=4;
+ set layout.expansion=2000;
+ set layout.layoutType="RECTILINEAR";
+ set layout.zoom=0;
+ set nodeBars.barWidth=4.0;
+ set nodeLabels.colorAttribute="User Selection";
+ set nodeLabels.displayAttribute="label";
+ set nodeLabels.fontName="sansserif";
+ set nodeLabels.fontSize=10;
+ set nodeLabels.fontStyle=0;
+ set nodeLabels.isShown=true;
+ set nodeLabels.significantDigits=4;
+ set polarLayout.alignTipLabels=false;
+ set polarLayout.angularRange=0;
+ set polarLayout.rootAngle=0;
+ set polarLayout.rootLength=100;
+ set polarLayout.showRoot=true;
+ set radialLayout.spread=0.0;
+ set rectilinearLayout.alignTipLabels=false;
+ set rectilinearLayout.curvature=0;
+ set rectilinearLayout.rootLength=100;
+ set scale.offsetAge=0.0;
+ set scale.rootAge=1.0;
+ set scale.scaleFactor=1.0;
+ set scale.scaleRoot=false;
+ set scaleAxis.automaticScale=true;
+ set scaleAxis.fontSize=8.0;
+ set scaleAxis.isShown=false;
+ set scaleAxis.lineWidth=2.0;
+ set scaleAxis.majorTicks=1.0;
+ set scaleAxis.origin=0.0;
+ set scaleAxis.reverseAxis=false;
+ set scaleAxis.showGrid=true;
+ set scaleAxis.significantDigits=4;
+ set scaleBar.automaticScale=true;
+ set scaleBar.fontSize=10.0;
+ set scaleBar.isShown=true;
+ set scaleBar.lineWidth=1.0;
+ set scaleBar.scaleRange=0.0;
+ set scaleBar.significantDigits=4;
+ set tipLabels.colorAttribute="User Selection";
+ set tipLabels.displayAttribute="Names";
+ set tipLabels.fontName="sansserif";
+ set tipLabels.fontSize=10;
+ set tipLabels.fontStyle=0;
+ set tipLabels.isShown=true;
+ set tipLabels.significantDigits=4;
+ set trees.order=false;
+ set trees.orderType="increasing";
+ set trees.rooting=false;
+ set trees.rootingType="User Selection";
+ set trees.transform=false;
+ set trees.transformType="cladogram";
+end;
+"""
+
+
+def cartoonRankGenerator(rank):
+ def cartoon(node):
+ return 'rank' in node and node['rank']==rank
+
+ return cartoon
+
+
+def collapseBcGenerator(Bclimit):
+ def collapse(node):
+ return 'bc' in node and node['bc']<=Bclimit
+ return collapse
+
+
+def label(node):
+ if 'bc' in node:
+ return "(%+3.1f) %s" % (node['bc'],node['name'])
+ else:
+ return " %s" % node['name']
+
+
+def main(coverageTree) :
+ print nexusFormat(coverageTree,
+ label=label,
+ blocks=figtree,
+ cartoon=cartoonRankGenerator('family'))
+ #collapse=collapseBcGenerator(70))
+
diff --git a/obitools/barcodecoverage/findErrors.py b/obitools/barcodecoverage/findErrors.py
new file mode 100644
index 0000000..dae20a0
--- /dev/null
+++ b/obitools/barcodecoverage/findErrors.py
@@ -0,0 +1,56 @@
+#!/usr/local/bin/python
+'''
+Created on 24 nov. 2011
+
+@author: merciece
+'''
+
+
+def main(seqs, keptRanks, tax):
+ errorsBySeq = getErrorsOnLeaves(seqs)
+ errorsByTaxon = propagateErrors(errorsBySeq, keptRanks, tax)
+ return errorsBySeq, errorsByTaxon
+
+
+def getErrorsOnLeaves(seqs) :
+ errors = {}
+ for s in seqs :
+ taxid = s['taxid']
+ forErrs = s['forward_error']
+ revErrs = s['reverse_error']
+ total = forErrs + revErrs
+ seqNb = 1
+ errors[s.id] = [forErrs,revErrs,total,seqNb,taxid]
+ return errors
+
+
+def propagateErrors(errorsOnLeaves, keptRanks, tax) :
+ allErrors = {}
+ for seq in errorsOnLeaves :
+ taxid = errorsOnLeaves[seq][4]
+ p = [a for a in tax.parentalTreeIterator(taxid)]
+ for a in p :
+ if a[1] in keptRanks :
+ group = a[0]
+ if group in allErrors :
+ allErrors[group][0] += errorsOnLeaves[seq][0]
+ allErrors[group][1] += errorsOnLeaves[seq][1]
+ allErrors[group][2] += errorsOnLeaves[seq][2]
+ allErrors[group][3] += 1
+ else :
+                    # copy the record: its error fields are averaged in place
+                    # below and the per-sequence entry must stay intact
+                    allErrors[group] = list(errorsOnLeaves[seq])
+
+ for group in allErrors :
+ allErrors[group][0] /= float(allErrors[group][3])
+ allErrors[group][1] /= float(allErrors[group][3])
+ allErrors[group][2] /= float(allErrors[group][3])
+
+ allErrors[group][0] = round(allErrors[group][0], 2)
+ allErrors[group][1] = round(allErrors[group][1], 2)
+ allErrors[group][2] = round(allErrors[group][2], 2)
+
+ return allErrors
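+# Record layout note (derived from the functions above): each value of the
+# returned dictionaries is a list
+#   [forward_errors, reverse_errors, total_errors, sequence_count, taxid]
+# where the three error fields are averaged over sequence_count once the
+# errors have been propagated to the parent taxa.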
+
+
+
+
diff --git a/obitools/barcodecoverage/readFiles.py b/obitools/barcodecoverage/readFiles.py
new file mode 100644
index 0000000..b03e72a
--- /dev/null
+++ b/obitools/barcodecoverage/readFiles.py
@@ -0,0 +1,69 @@
+#!/usr/local/bin/python
+'''
+Created on 23 nov. 2011
+
+@author: merciece
+'''
+
+from obitools.ecopcr import sequence
+from obitools.ecopcr import taxonomy
+
+
+def main(entries,options):
+ filteredDataFromDB = ecoPCRDatabaseReader(options)
+ filteredData = ecoPCRFileReader(entries,filteredDataFromDB)
+ return filteredDataFromDB,filteredData
+
+
+def ecoPCRDatabaseReader(options):
+
+ tax = taxonomy.EcoTaxonomyDB(options.taxonomy)
+ seqs = sequence.EcoPCRDBSequenceIterator(options.taxonomy,taxonomy=tax)
+
+ norankid = tax.findRankByName('no rank')
+ speciesid = tax.findRankByName('species')
+ genusid = tax.findRankByName('genus')
+ familyid = tax.findRankByName('family')
+
+ minrankseq = set([speciesid,genusid,familyid])
+
+ usedrankid = {}
+
+ ingroup = {}
+ outgroup= {}
+
+ for s in seqs :
+ if 'taxid' in s :
+ taxid = s['taxid']
+ allrank = set()
+ for p in tax.parentalTreeIterator(taxid):
+ if p[1]!=norankid:
+ allrank.add(p[1])
+ if len(minrankseq & allrank) == 3:
+ for r in allrank:
+ usedrankid[r]=usedrankid.get(r,0) + 1
+
+ if tax.isAncestor(options.ingroup,taxid):
+ ingroup[s.id] = s
+ else:
+ outgroup[s.id] = s
+
+ keptranks = set(r for r in usedrankid
+ if float(usedrankid[r])/float(len(ingroup)) > options.rankthresold)
+
+ return { 'ingroup' : ingroup,
+ 'outgroup': outgroup,
+ 'ranks' : keptranks,
+ 'taxonomy': tax
+ }
+
+
+def ecoPCRFileReader(entries,filteredDataFromDB) :
+ filteredData = []
+ for s in entries :
+ if 'taxid' in s :
+ seqId = s.id
+ if seqId in filteredDataFromDB['ingroup'] :
+ filteredData.append(s)
+ return filteredData
+
diff --git a/obitools/barcodecoverage/writeBcTree.py b/obitools/barcodecoverage/writeBcTree.py
new file mode 100644
index 0000000..7c8243e
--- /dev/null
+++ b/obitools/barcodecoverage/writeBcTree.py
@@ -0,0 +1,42 @@
+#!/usr/local/bin/python
+'''
+Created on 25 nov. 2011
+
+@author: merciece
+'''
+
+from obitools.graph.rootedtree import RootedTree
+
+
+def main(BcValues,errors,tax) :
+
+ tree = RootedTree()
+ tset = set(BcValues)
+
+ for taxon in BcValues:
+ if taxon in errors :
+ forErr = errors[taxon][0]
+ revErr = errors[taxon][1]
+ totErr = errors[taxon][2]
+ else :
+ forErr = -1.0
+ revErr = -1.0
+ totErr = -1.0
+
+ tree.addNode(taxon, rank=tax.getRank(taxon),
+ name=tax.getScientificName(taxon),
+ bc = BcValues[taxon],
+ errors = str(forErr)+' '+str(revErr),
+ totError = totErr
+ )
+
+ for taxon in BcValues:
+ piter = tax.parentalTreeIterator(taxon)
+ taxon = piter.next()
+ for parent in piter:
+ if taxon[0] in tset and parent[0] in BcValues:
+ tset.remove(taxon[0])
+ tree.addEdge(parent[0], taxon[0])
+ taxon=parent
+
+ return tree
diff --git a/obitools/blast/__init__.py b/obitools/blast/__init__.py
new file mode 100644
index 0000000..11b5274
--- /dev/null
+++ b/obitools/blast/__init__.py
@@ -0,0 +1,207 @@
+from os import popen2
+from itertools import imap,count
+
+from obitools.table import iTableIterator,TableRow,Table,SelectionIterator
+from obitools.utils import ColumnFile
+from obitools.location import SimpleLocation
+from obitools.fasta import formatFasta
+import sys
+
+class Blast(object):
+ '''
+ Run blast
+ '''
+
+ def __init__(self,mode,db,program='blastall',**options):
+ self._mode = mode
+ self._db = db
+ self._program = program
+ self._options = options
+
+ def getMode(self):
+ return self._mode
+
+
+ def getDb(self):
+ return self._db
+
+
+ def getProgram(self):
+ return self._program
+
+ def _blastcmd(self):
+ tmp = """%(program)s \\
+ -p %(mode)s \\
+ -d %(db)s \\
+ -m 8 \\
+ %(options)s \\
+ """
+ options = ' '.join(['-%s %s' % (x[0],str(x[1]))
+ for x in self._options.iteritems()])
+ data = {
+ 'program' : self.program,
+ 'db' : self.db,
+ 'mode' : self.mode,
+ 'options' : options
+ }
+
+ return tmp % data
+
+ def __call__(self,sequence):
+        '''
+        Run blast with one sequence object
+        @param sequence: the query sequence
+        @type sequence: L{obitools.BioSequence}
+        '''
+ cmd = self._blastcmd()
+
+ (blast_in,blast_out) = popen2(cmd)
+
+ print >>blast_in,formatFasta(sequence)
+ blast_in.close()
+
+ blast = BlastResultIterator(blast_out)
+
+ return blast
+
+    mode = property(getMode, None, None, "blast search mode (e.g. blastn, blastp)")
+
+    db = property(getDb, None, None, "blast database name")
+
+    program = property(getProgram, None, None, "program used to run blast")
+
+
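+# Usage sketch (illustrative; assumes blastall and a formatted database
+# named 'refdb' are available locally):
+#
+#   blast = Blast('blastn','refdb',e=1e-6)
+#   for match in blast(seq):
+#       print match[1], match['% identity'], match.queryCov(seq)
+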
+class NetBlast(Blast):
+ '''
+ Run blast on ncbi servers
+ '''
+
+ def __init__(self,mode,db,**options):
+        '''
+        @param mode: blast flavour to run (blastn, blastp...)
+        @param db: name of the database on the NCBI servers
+        '''
+ Blast.__init__(self, mode, db, 'blastcl3',**options)
+
+
+class BlastResultIterator(iTableIterator):
+
+ def __init__(self,blastoutput,query=None):
+        '''
+        @param blastoutput: file like object over the tabular (-m 8) blast output
+        @type blastoutput: file
+        '''
+ self._blast = ColumnFile(blastoutput,
+ strip=True,
+ skip="#",
+ sep="\t",
+ types=self.types
+ )
+ self._query = query
+ self._hindex = dict((k,i) for i,k in imap(None,count(),self._getHeaders()))
+
+ def _getHeaders(self):
+ return ('Query id','Subject id',
+ '% identity','alignment length',
+ 'mismatches', 'gap openings',
+ 'q. start', 'q. end',
+ 's. start', 's. end',
+ 'e-value', 'bit score')
+
+ def _getTypes(self):
+ return (str,str,
+ float,int,
+ int,int,
+ int,int,
+ int,int,
+ float,float)
+
+ def _getRowFactory(self):
+ return BlastMatch
+
+ def _getSubrowFactory(self):
+ return TableRow
+
+ def _getQuery(self):
+ return self._query
+
+
+ headers = property(_getHeaders,None,None)
+ types = property(_getTypes,None,None)
+ rowFactory = property(_getRowFactory,None,None)
+ subrowFactory = property(_getSubrowFactory,None,None)
+ query = property(_getQuery,None,None)
+
+ def next(self):
+        '''
+        Return the next blast hit as a BlastMatch row.
+        '''
+ value = self._blast.next()
+ return self.rowFactory(self,value)
+
+
+
+class BlastResult(Table):
+ '''
+ Results of a blast run
+ '''
+
+class BlastMatch(TableRow):
+ '''
+ Blast high scoring pair between two sequences
+ '''
+
+ def getQueryLocation(self):
+ l = SimpleLocation(self[6], self[7])
+ return l
+
+ def getSubjectLocation(self):
+ l = SimpleLocation(self[8], self[9])
+ return l
+
+ def getSubjectSequence(self,database):
+ return database[self[1]]
+
+ def queryCov(self,query=None):
+ '''
+ Compute coverage of match on query sequence.
+
+ @param query: the query sequence. Default is None.
+ In this case the query sequence associated
+ to this blast result is used.
+ @type query: L{obitools.BioSequence}
+
+ @return: coverage fraction
+ @rtype: float
+ '''
+ if query is None:
+ query = self.table.query
+ assert query is not None
+ return float(self[7]-self[6]+1)/float(len(query))
+
+ def __getitem__(self,key):
+ if key=='query coverage' and self.table.query is not None:
+ return self.queryCov()
+ else:
+ return TableRow.__getitem__(self,key)
+
+class BlastCovMinFilter(SelectionIterator):
+
+ def __init__(self,blastiterator,covmin,query=None,**conditions):
+ if query is None:
+ query = blastiterator.table.query
+ assert query is not None
+ SelectionIterator.__init__(self,blastiterator,**conditions)
+ self._query = query
+ self._covmin=covmin
+
+ def _covMinPredicat(self,row):
+ return row.queryCov(self._query)>=self._covmin
+
+ def _checkCondition(self,row):
+ return self._covMinPredicat(row) and SelectionIterator._checkCondition(self, row)
+
+
+
\ No newline at end of file
diff --git a/obitools/carto/__init__.py b/obitools/carto/__init__.py
new file mode 100644
index 0000000..b7ac176
--- /dev/null
+++ b/obitools/carto/__init__.py
@@ -0,0 +1,376 @@
+# -*- coding: latin1 -*-
+
+
+
+from obitools import SVGdraw
+import math
+
+class Map(object):
+ """
+ Map represente une instance d'une carte genetique physique.
+ Une telle carte est definie par la longueur de la sequence
+ qui lui est associe.
+
+ A une carte est associe un certain nombre de niveaux (Level)
+ eux meme decoupe en sous-niveau (SubLevel)
+ Les sous niveaux contiennent eux des features
+ """
+ def __init__(self,name,seqlength,scale=1):
+ """
+ Constructeur d'une nouvelle carte
+
+ *Param*:
+
+ name
+ nom de la carte
+
+ seqlength
+ longueur de la sequence associee a la carte
+
+ scale
+ echelle de la carte indicant combien de pixel
+ correspondent a une unite de la carte
+ """
+ self.name = name
+ self.seqlength = seqlength
+ self.scale = scale
+ self.levels = {}
+ self.basicHSize = 10
+
+ def __str__(self):
+ return '<%s>' % self.name
+
+ def __getitem__(self,level):
+ """
+ retourne le niveau *level* de la carte et
+ le cree s'il n'existe pas
+ """
+ if not isinstance(level,int):
+ raise TypeError('level must be an non Zero integer value')
+ elif level==0:
+ raise AssertionError('Level cannot be set to 0')
+ try:
+ return self.levels[level]
+ except KeyError:
+ self.levels[level] = Level(level,self)
+ return self.levels[level]
+
+ def getBasicHSize(self):
+ """
+ retourne la hauteur de base d'un element de cartographie
+ exprimee en pixel
+ """
+ return self.basicHSize
+
+ def getScale(self):
+ """
+ Retourne l'echelle de la carte en nombre de pixels par
+ unite physique de la carte
+ """
+ return self.scale
+
+
+
+ def getNegativeBase(self):
+ return reduce(lambda x,y:x-y,[self.levels[z].getHeight()
+ for z in self.levels
+ if z < 0],self.getHeight())
+
+ def getPositiveBase(self):
+ return self.getNegativeBase() - 3 * self.getBasicHSize()
+
+ def getHeight(self):
+ return reduce(lambda x,y:x+y,[z.getHeight() for z in self.levels.values()],0) \
+ + 4 * self.getBasicHSize()
+
+ def toXML(self,file=None,begin=0,end=None):
+ dessin = SVGdraw.drawing()
+        if end is None:
+ end = self.seqlength
+ hauteur= self.getHeight()
+ largeur=(end-begin+1)*self.scale
+ svg = SVGdraw.svg((begin*self.scale,0,largeur,hauteur),
+ '%fpx' % (self.seqlength * self.scale),
+ '%dpx' % hauteur)
+
+ centre = self.getPositiveBase() + (1 + 1/4) * self.getBasicHSize()
+ svg.addElement(SVGdraw.rect(0,centre,self.seqlength * self.scale,self.getBasicHSize()/2))
+ for e in self.levels.values():
+ svg.addElement(e.getElement())
+ dessin.setSVG(svg)
+ return dessin.toXml(file)
+
+class Feature(object):
+ pass
+
+class Level(object):
+
+ def __init__(self,level,map):
+ if not isinstance(map,Map):
+ raise AssertionError('map is not an instance of class Map')
+ if level in map.levels:
+            raise AssertionError('Level %d already defined for map %s' % (level,map))
+ else:
+ map.levels[level] = self
+ self.map = map
+ self.level = level
+ self.sublevels = {}
+
+ def __getitem__(self,sublevel):
+ """
+ retourne le niveau *sublevel* du niveau en
+ le creant s'il n'existe pas
+ """
+ if not isinstance(sublevel,int):
+ raise TypeError('sublevel must be a positive integer value')
+ elif sublevel<0:
+            raise AssertionError('Sublevel cannot be negative')
+ try:
+ return self.sublevels[sublevel]
+ except KeyError:
+ self.sublevels[sublevel] = SubLevel(sublevel,self)
+ return self.sublevels[sublevel]
+
+ def getBase(self):
+ if self.level < 0:
+ base = self.map.getNegativeBase()
+ base += reduce(lambda x,y:x+y,[self.map.levels[z].getHeight()
+ for z in self.map.levels
+ if z <0 and z >= self.level],0)
+ return base
+ else:
+ base = self.map.getPositiveBase()
+ base -= reduce(lambda x,y:x+y,[self.map.levels[z].getHeight()
+ for z in self.map.levels
+ if z >0 and z < self.level],0)
+ return base
+
+ def getElement(self):
+ objet = SVGdraw.group('level%d' % self.level)
+ for e in self.sublevels.values():
+ objet.addElement(e.getElement())
+ return objet
+
+
+
+ def getHeight(self):
+ return reduce(lambda x,y:x+y,[z.getHeight() for z in self.sublevels.values()],0) \
+ + 2 * self.map.getBasicHSize()
+
+class SubLevel(object):
+
+ def __init__(self,sublevel,level):
+ if not isinstance(level,Level):
+ raise AssertionError('level is not an instance of class Level')
+        if sublevel in level.sublevels:
+            raise AssertionError('Sublevel %d already defined for level %s' % (sublevel,level))
+ else:
+ level.sublevels[sublevel] = self
+ self.level = level
+ self.sublevel = sublevel
+ self.features = {}
+
+ def getHeight(self):
+ return max([x.getHeight() for x in self.features.values()]+[0]) + 4 * self.level.map.getBasicHSize()
+
+ def getBase(self):
+ base = self.level.getBase()
+ if self.level.level < 0:
+ base -= self.level.getHeight() - 2 * self.level.map.getBasicHSize()
+ base += reduce(lambda x,y:x+y,[self.level.sublevels[z].getHeight()
+ for z in self.level.sublevels
+ if z <= self.sublevel],0)
+ base -= 2* self.level.map.getBasicHSize()
+ else:
+ base -= reduce(lambda x,y:x+y,[self.level.sublevels[z].getHeight()
+ for z in self.level.sublevels
+ if z < self.sublevel],0)
+ base -= self.level.map.getBasicHSize()
+ return base
+
+ def getElement(self):
+ base = self.getBase()
+ objet = SVGdraw.group('sublevel%d' % self.sublevel)
+ for e in self.features.values():
+ objet.addElement(e.getElement(base))
+ return objet
+
+ def add(self,feature):
+ if not isinstance(feature,Feature):
+            raise TypeError('feature must be an instance of Feature')
+ if feature.name in self.features:
+            raise AssertionError('A feature with the same name (%s) has already been inserted in this sublevel'
+                                 % feature.name)
+ self.features[feature.name]=feature
+ feature.sublevel=self
+
+class SimpleFeature(Feature):
+
+ def __init__(self,name,begin,end,visiblename=False,color=0):
+ self.begin = begin
+ self.end = end
+ self.name = name
+ self.color = color
+ self.sublevel = None
+ self.visiblename=visiblename
+
+ def getHeight(self):
+ if not self.sublevel:
+            raise AssertionError('SimpleFeature not attached to a sublevel')
+ if self.visiblename:
+ return self.sublevel.level.map.getBasicHSize() * 2
+ else:
+ return self.sublevel.level.map.getBasicHSize()
+
+ def getElement(self,base):
+ scale = self.sublevel.level.map.getScale()
+ y = base - self.sublevel.level.map.getBasicHSize()
+ x = self.begin * scale
+ width = (self.end - self.begin + 1) * scale
+        height = self.sublevel.level.map.getBasicHSize()
+        
+        objet = SVGdraw.rect(x,y,width,height,stroke=self.color)
+ objet.addElement(SVGdraw.description(self.name))
+
+ return objet
+
+class BoxFeature(SimpleFeature):
+
+ def getHeight(self):
+ if not self.sublevel:
+            raise AssertionError('BoxFeature not attached to a sublevel')
+ if self.visiblename:
+ return self.sublevel.level.map.getBasicHSize() * 4
+ else:
+ return self.sublevel.level.map.getBasicHSize() * 3
+
+ def getElement(self,base):
+ scale = self.sublevel.level.map.getScale()
+ y = base - self.sublevel.level.map.getBasicHSize() * 2
+ x = self.begin * scale
+ width = (self.end - self.begin + 1) * scale
+ height = self.sublevel.level.map.getBasicHSize() * 3
+
+ objet = SVGdraw.rect(x,y,width,height,stroke=self.color,fill="none")
+ objet.addElement(SVGdraw.description(self.name))
+
+ return objet
+
+class MultiPartFeature(Feature):
+
+ def __init__(self,name,*args,**kargs):
+ self.limits = args
+ self.name = name
+ try:
+ self.color = kargs['color']
+ except KeyError:
+ self.color = "black"
+
+ try:
+ self.visiblename=kargs['visiblename']
+ except KeyError:
+ self.visiblename=None
+
+ try:
+ self.flatlink=kargs['flatlink']
+ except KeyError:
+ self.flatlink=False
+
+ try:
+ self.roundlink=kargs['roundlink']
+ except KeyError:
+ self.roundlink=False
+
+ self.sublevel = None
+
+
+ def getHeight(self):
+ if not self.sublevel:
+            raise AssertionError('MultiPartFeature not attached to a sublevel')
+ if self.visiblename:
+ return self.sublevel.level.map.getBasicHSize() * 3
+ else:
+ return self.sublevel.level.map.getBasicHSize() * 2
+
+ def getElement(self,base):
+ scale = self.sublevel.level.map.getScale()
+
+ y = base - self.sublevel.level.map.getBasicHSize()
+ height = self.sublevel.level.map.getBasicHSize()
+ objet = SVGdraw.group(self.name)
+ for (debut,fin) in self.limits:
+ x = debut * scale
+ width = (fin - debut + 1) * scale
+ part = SVGdraw.rect(x,y,width,height,fill=self.color)
+ objet.addElement(part)
+
+ debut = self.limits[0][1]
+ for (fin,next) in self.limits[1:]:
+ debut*=scale
+ fin*=scale
+ path = SVGdraw.pathdata(debut,y + height / 2)
+ delta = height / 2
+ if self.roundlink:
+ path.qbezier((debut+fin)/2, y - delta,fin,y + height / 2)
+ else:
+ if self.flatlink:
+ delta = - height / 2
+ path.line((debut+fin)/2, y - delta)
+ path.line(fin,y + height / 2)
+ path = SVGdraw.path(path,fill="none",stroke=self.color)
+ objet.addElement(path)
+ debut = next
+
+ objet.addElement(SVGdraw.description(self.name))
+
+ return objet
+
+class TagFeature(Feature):
+
+ def __init__(self,name,begin,length,ratio,visiblename=False,color=0):
+ self.begin = begin
+ self.length = length
+ self.ratio = ratio
+ self.name = name
+ self.color = color
+ self.sublevel = None
+ self.visiblename=visiblename
+
+ def getHeight(self):
+ if not self.sublevel:
+            raise AssertionError('TagFeature not attached to a sublevel')
+
+ return self.sublevel.level.map.getBasicHSize()*11
+
+ def getElement(self,base):
+ scale = self.sublevel.level.map.getScale()
+ height = math.floor(max(1,self.sublevel.level.map.getBasicHSize()* 10 * self.ratio))
+ y = base + self.sublevel.level.map.getBasicHSize() - height
+ x = self.begin * scale
+ width = self.length * scale
+ objet = SVGdraw.rect(x,y,width,height,stroke=self.color)
+ objet.addElement(SVGdraw.description(self.name))
+
+ return objet
+
+if __name__ == '__main__':
+ carte = Map('essai',20000,scale=0.5)
+ carte[-1][0].add(SimpleFeature('toto',100,300))
+ carte[1][0].add(SimpleFeature('toto',100,300))
+ carte[1][1].add(SimpleFeature('toto',200,1000))
+
+ carte[1][0].add(MultiPartFeature('bout',(1400,1450),(1470,1550),(1650,1800),color='red',flatlink=True))
+ carte[1][0].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='red',flatlink=True))
+ carte[-1][1].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='green'))
+ carte[-1][2].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='purple',roundlink=True))
+
+ carte[-1][1].add(BoxFeature('tutu',390,810,color='purple'))
+ carte[1][0].add(BoxFeature('tutu',390,810,color='red'))
+ carte[2][0].add(TagFeature('t1',1400,20,0.8))
+ carte[2][0].add(TagFeature('t2',1600,20,0.2))
+ carte.basicHSize=6
+ print carte.toXML('truc.svg',begin=0,end=1000)
+ print carte.toXML('truc2.svg',begin=460,end=2000)
+
+
+
diff --git a/obitools/decorator.py b/obitools/decorator.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/distances/__init__.py b/obitools/distances/__init__.py
new file mode 100644
index 0000000..1542fa9
--- /dev/null
+++ b/obitools/distances/__init__.py
@@ -0,0 +1,29 @@
+class DistanceMatrix(object):
+
+ def __init__(self,alignment):
+ '''
+ DistanceMatrix constructor.
+
+        @param alignment: alignment used to compute the distance matrix
+        @type alignment: obitools.alignment.Alignment
+ '''
+ self.aligment = alignment
+ self.matrix = [[None] * (x+1) for x in xrange(len(alignment))]
+
+ def evaluateDist(self,x,y):
+ raise NotImplementedError
+
+ def __getitem__(self,key):
+ assert isinstance(key,(tuple,list)) and len(key)==2, \
+ 'key must be a tuple or a list of two integers'
+ x,y = key
+ if y < x:
+ z=x
+ x=y
+ y=z
+ rep = self.matrix[y][x]
+ if rep is None:
+ rep = self.evaluateDist(x,y)
+ self.matrix[y][x] = rep
+
+ return rep
\ No newline at end of file
diff --git a/obitools/distances/observed.py b/obitools/distances/observed.py
new file mode 100644
index 0000000..8828d92
--- /dev/null
+++ b/obitools/distances/observed.py
@@ -0,0 +1,77 @@
+'''
+Module dedicated to computing observed divergences from
+an alignment. No distance correction is applied.
+'''
+
+from itertools import imap
+
+from obitools.distances import DistanceMatrix
+
+class PairewiseGapRemoval(DistanceMatrix):
+    '''
+    Observed divergence matrix from an alignment.
+    Gaps are removed from the alignment on a pairwise
+    sequence basis.
+    '''
+
+ def evaluateDist(self,x,y):
+        '''
+        Compute the observed divergence between two sequences
+        of an alignment.
+        
+        @attention: For performance purposes this method should
+                    not be used directly. Use instead the __getitem__
+                    method from DistanceMatrix, which caches results.
+                    
+        @see: L{__getitem__}
+        
+        @param x: index of the first sequence in the alignment
+        @type x: int
+        @param y: index of the second sequence in the alignment
+        @type y: int
+        '''
+
+ seq1 = self.aligment[x]
+ seq2 = self.aligment[y]
+
+ diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
+ (z[0]!=z[1] for z in imap(None,seq1,seq2)
+ if '-' not in z),(0,0))
+ return float(diff)/tot
+
+
+class Pairewise(DistanceMatrix):
+    '''
+    Observed divergence matrix from an alignment.
+    Gaps are kept in the alignment.
+    '''
+
+ def evaluateDist(self,x,y):
+        '''
+        Compute the observed divergence between two sequences
+        of an alignment.
+        
+        @attention: For performance purposes this method should
+                    not be used directly. Use instead the __getitem__
+                    method from DistanceMatrix, which caches results.
+                    
+        @see: L{__getitem__}
+        
+        @param x: index of the first sequence in the alignment
+        @type x: int
+        @param y: index of the second sequence in the alignment
+        @type y: int
+        '''
+
+ seq1 = self.aligment[x]
+ seq2 = self.aligment[y]
+
+ diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
+ (z[0]!=z[1] for z in imap(None,seq1,seq2)),
+ (0,0))
+ return float(diff)/tot
+
\ No newline at end of file
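+# Usage sketch (illustrative): compute pairwise observed divergences on an
+# obitools.alignment.Alignment instance 'ali'.
+#
+#   matrix = PairewiseGapRemoval(ali)
+#   print matrix[(0,1)]       # divergence between the first two sequences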
diff --git a/obitools/distances/phylip.py b/obitools/distances/phylip.py
new file mode 100644
index 0000000..e2043fa
--- /dev/null
+++ b/obitools/distances/phylip.py
@@ -0,0 +1,35 @@
+import sys
+
+from itertools import imap,count
+
+def writePhylipMatrix(matrix):
+ names = [x.id for x in matrix.aligment]
+ pnames= [x[:10] for x in names]
+ unicity={}
+ redundent=[]
+ for n in pnames:
+ unicity[n]=unicity.get(n,0)+1
+ redundent.append(unicity[n])
+
+ for i,n,r in imap(None,count(),pnames,redundent):
+ alternate = n
+ if r > 1:
+ while alternate in pnames:
+ lcut = 9 - len(str(r))
+ alternate = n[:lcut]+ '_%d' % r
+ r+=1
+ pnames[i]='%-10s' % alternate
+
+ firstline = '%5d' % len(matrix.aligment)
+ rep = [firstline]
+ for i,n in imap(None,count(),pnames):
+ line = [n]
+ for j in xrange(i):
+ line.append('%5.4f' % matrix[(j,i)])
+ rep.append(' '.join(line))
+ return '\n'.join(rep)
+
+
+
+
+
\ No newline at end of file
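+# Usage sketch (illustrative): dump a distance matrix in PHYLIP format.
+#
+#   from obitools.distances.observed import Pairewise
+#   matrix = Pairewise(ali)   # ali: an obitools.alignment.Alignment
+#   print writePhylipMatrix(matrix)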
diff --git a/obitools/distances/r.py b/obitools/distances/r.py
new file mode 100644
index 0000000..f674a4c
--- /dev/null
+++ b/obitools/distances/r.py
@@ -0,0 +1,25 @@
+import sys
+
+from itertools import imap,count
+
+def writeRMatrix(matrix):
+ names = [x.id for x in matrix.aligment]
+ lmax = max(max(len(x) for x in names),5)
+ lali = len(matrix.aligment)
+
+ nformat = '%%-%ds' % lmax
+ dformat = '%%%d.4f' % lmax
+
+ pnames=[nformat % x for x in names]
+
+ rep = [' '.join(pnames)]
+
+ for i in xrange(lali):
+ line=[]
+ for j in xrange(lali):
+ line.append('%5.4f' % matrix[(j,i)])
+ rep.append(' '.join(line))
+ return '\n'.join(rep)
+
+
+
\ No newline at end of file
diff --git a/obitools/dnahash/__init__.py b/obitools/dnahash/__init__.py
new file mode 100644
index 0000000..ca02e35
--- /dev/null
+++ b/obitools/dnahash/__init__.py
@@ -0,0 +1,100 @@
+_A=[0]
+_C=[1]
+_G=[2]
+_T=[3]
+_R= _A + _G
+_Y= _C + _T
+_M= _C + _A
+_K= _T + _G
+_W= _T + _A
+_S= _C + _G
+_B= _C + _G + _T
+_D= _A + _G + _T
+_H= _A + _C + _T
+_V= _A + _C + _G
+_N= _A + _C + _G + _T
+
+_dnahash={'a':_A,
+ 'c':_C,
+ 'g':_G,
+ 't':_T,
+ 'r':_R,
+ 'y':_Y,
+ 'm':_M,
+ 'k':_K,
+ 'w':_W,
+ 's':_S,
+ 'b':_B,
+ 'd':_D,
+ 'h':_H,
+ 'v':_V,
+ 'n':_N,
+ }
+
+def hashCodeIterator(sequence,wsize,degeneratemax=0,offset=0):
+ errors = 0
+ emask = [0] * wsize
+ epointer = 0
+ size = 0
+ position = offset
+ hashs = set([0])
+ hashmask = 0
+ for i in xrange(wsize):
+ hashmask <<= 2
+ hashmask +=3
+
+ for l in sequence:
+ l = l.lower()
+ hl = _dnahash[l]
+
+ if emask[epointer]:
+ errors-=1
+ emask[epointer]=0
+
+ if len(hl) > 1:
+ errors +=1
+ emask[epointer]=1
+
+ epointer+=1
+ epointer%=wsize
+
+ if errors > degeneratemax:
+ hl=set([hl[0]])
+
+ hashs=set((((hc<<2) | cl) & hashmask)
+ for hc in hashs
+ for cl in hl)
+
+ if size < wsize:
+ size+=1
+
+ if size==wsize:
+ if errors <= degeneratemax:
+ yield (position,hashs,errors)
+ position+=1
+
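+# Usage sketch (illustrative): enumerate the 2-bit encoded words of length 4
+# found in a short sequence, allowing one degenerate position per word.
+#
+#   for pos,codes,errors in hashCodeIterator('acgtrc',4,degeneratemax=1):
+#       print pos, sorted(codes), errors
+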
+def hashSequence(sequence,wsize,degeneratemax=0,offset=0,hashs=None):
+ if hashs is None:
+ hashs=[[] for x in xrange(4**wsize)]
+
+ for pos,keys,errors in hashCodeIterator(sequence, wsize, degeneratemax, offset):
+ for k in keys:
+ hashs[k].append(pos)
+
+ return hashs
+
+def hashSequences(sequences,wsize,maxpos,degeneratemax=0):
+ hashs=None
+ offsets=[]
+ offset=0
+ for s in sequences:
+ offsets.append(offset)
+        # keep the returned table: the first call allocates it and the
+        # following calls fill it in place
+        hashs=hashSequence(s,wsize,degeneratemax=degeneratemax,offset=offset,hashs=hashs)
+ offset+=len(s)
+
+ return hashs,offsets
+
+
+
+
+
\ No newline at end of file
diff --git a/obitools/ecobarcode/__init__.py b/obitools/ecobarcode/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/ecobarcode/databases.py b/obitools/ecobarcode/databases.py
new file mode 100644
index 0000000..70d2319
--- /dev/null
+++ b/obitools/ecobarcode/databases.py
@@ -0,0 +1,32 @@
+'''
+Created on 25 sept. 2010
+
+@author: coissac
+'''
+from obitools import NucSequence
+
+def referenceDBIterator(options):
+
+ cursor = options.ecobarcodedb.cursor()
+
+ cursor.execute("select id from databases.database where name='%s'" % options.database)
+ options.dbid = cursor.fetchone()[0]
+
+ cursor.execute('''
+ select s.accession,r.id,r.taxid,r.sequence
+ from databases.database d,
+ databases.reference r,
+ databases.relatedsequences s
+ where r.database = d.id
+ and s.reference= r.id
+ and s.mainac
+ and d.name = '%s'
+ ''' % options.database
+ )
+
+ for ac,id,taxid,sequence in cursor:
+ s = NucSequence(ac,sequence)
+ s['taxid']=taxid
+ s['refdbid']=id
+ yield s
+
\ No newline at end of file
diff --git a/obitools/ecobarcode/ecotag.py b/obitools/ecobarcode/ecotag.py
new file mode 100644
index 0000000..2ebd3fb
--- /dev/null
+++ b/obitools/ecobarcode/ecotag.py
@@ -0,0 +1,50 @@
+'''
+Created on 25 sept. 2010
+
+@author: coissac
+'''
+
+def alreadyIdentified(seqid,options):
+ cursor = options.ecobarcodedb.cursor()
+ cursor.execute('''
+ select count(*)
+ from ecotag.identification
+ where sequence=%s
+ and database=%s
+ ''',(int(seqid),int(options.dbid)))
+
+    return int(cursor.fetchone()[0]) > 0
+
+def storeIdentification(seqid,
+ idstatus,taxid,
+ matches,
+ options
+ ):
+
+ cursor = options.ecobarcodedb.cursor()
+
+ if not options.updatedb:
+ cursor.execute('''
+ delete from ecotag.identification where sequence=%s and database=%s
+ ''',(int(seqid),int(options.dbid)))
+
+ cursor.execute('''
+ insert into ecotag.identification (sequence,database,idstatus,taxid)
+ values (%s,%s,%s,%s)
+ returning id
+ ''' , (int(seqid),int(options.dbid),idstatus,int(taxid)))
+
+ idid = cursor.fetchone()[0]
+
+ for seq,identity in matches.iteritems():
+ cursor.execute('''
+ insert into ecotag.evidence (identification,reference,identity)
+ values (%s,
+ %s,
+ %s)
+ ''',(idid,seq,identity))
+
+
+ cursor.close()
+
+ options.ecobarcodedb.commit()
diff --git a/obitools/ecobarcode/options.py b/obitools/ecobarcode/options.py
new file mode 100644
index 0000000..6086423
--- /dev/null
+++ b/obitools/ecobarcode/options.py
@@ -0,0 +1,64 @@
+'''
+Created on 23 sept. 2010
+
+@author: coissac
+'''
+import psycopg2
+
+from obitools.ecobarcode.taxonomy import EcoTaxonomyDB
+
+def addEcoBarcodeDBOption(optionManager):
+ optionManager.add_option('--dbname',
+ action="store", dest="ecobarcodedb",
+ type='str',
+ default=None,
+ help="Specify the name of the ecobarcode database")
+
+ optionManager.add_option('--server',
+ action="store", dest="dbserver",
+ type='str',
+ default="localhost",
+ help="Specify the adress of the ecobarcode database server")
+
+ optionManager.add_option('--user',
+ action="store", dest="dbuser",
+ type='str',
+ default='postgres',
+ help="Specify the user of the ecobarcode database")
+
+ optionManager.add_option('--port',
+ action="store", dest="dbport",
+                             type='int',
+ default=5432,
+ help="Specify the port of the ecobarcode database")
+
+ optionManager.add_option('--passwd',
+ action="store", dest="dbpasswd",
+ type='str',
+ default='',
+ help="Specify the passwd of the ecobarcode database")
+
+ optionManager.add_option('--primer',
+ action="store", dest="primer",
+ type='str',
+ default=None,
+ help="Specify the primer used for amplification")
+
+
+def ecobarcodeDatabaseConnection(options):
+ if options.ecobarcodedb is not None:
+ connection = psycopg2.connect(database=options.ecobarcodedb,
+ user=options.dbuser,
+ password=options.dbpasswd,
+ host=options.dbserver,
+ port=options.dbport)
+ options.dbname=options.ecobarcodedb
+ else:
+ connection=None
+ if connection is not None:
+ options.ecobarcodedb=connection
+ taxonomy = EcoTaxonomyDB(connection)
+ else:
+ taxonomy=None
+ return taxonomy
+
diff --git a/obitools/ecobarcode/rawdata.py b/obitools/ecobarcode/rawdata.py
new file mode 100644
index 0000000..a5f58cf
--- /dev/null
+++ b/obitools/ecobarcode/rawdata.py
@@ -0,0 +1,38 @@
+'''
+Created on 25 sept. 2010
+
+@author: coissac
+'''
+
+from obitools import NucSequence
+from obitools.utils import progressBar
+from obitools.ecobarcode.ecotag import alreadyIdentified
+
+import sys
+
+def sequenceIterator(options):
+ cursor = options.ecobarcodedb.cursor()
+
+ cursor.execute('''
+ select s.id,sum(o.count),s.sequence
+ from rawdata.sequence s,
+ rawdata.occurrences o
+ where o.sequence= s.id
+ and s.primers = '%s'
+ group by s.id,s.sequence
+ ''' % options.primer
+ )
+
+ nbseq = cursor.rowcount
+ progressBar(1, nbseq, True, head=options.dbname)
+ for id,count,sequence in cursor:
+ progressBar(cursor.rownumber+1, nbseq, head=options.dbname)
+ if not options.updatedb or not alreadyIdentified(id,options):
+ s = NucSequence(id,sequence)
+ s['count']=count
+ print >>sys.stderr,' +', cursor.rownumber+1,
+ yield s
+ else:
+ print >>sys.stderr,' @', cursor.rownumber+1,
+
+ print >>sys.stderr
diff --git a/obitools/ecobarcode/taxonomy.py b/obitools/ecobarcode/taxonomy.py
new file mode 100644
index 0000000..c7d0185
--- /dev/null
+++ b/obitools/ecobarcode/taxonomy.py
@@ -0,0 +1,120 @@
+'''
+Created on 24 sept. 2010
+
+@author: coissac
+'''
+
+from obitools.ecopcr.taxonomy import TaxonomyDump
+from obitools.ecopcr.taxonomy import Taxonomy
+import sys
+
+class EcoTaxonomyDB(TaxonomyDump) :
+
+ def __init__(self,dbconnect):
+ self._dbconnect=dbconnect
+
+ print >> sys.stderr,"Reading ecobarcode taxonomy database..."
+
+ self._readNodeTable()
+ print >> sys.stderr," ok"
+
+ print >>sys.stderr,"Adding scientific name..."
+
+ self._name=[]
+ for taxid,name,classname in self._nameIterator():
+ self._name.append((name,classname,self._index[taxid]))
+ if classname == 'scientific name':
+ self._taxonomy[self._index[taxid]].append(name)
+
+ print >>sys.stderr,"Adding taxid alias..."
+ for taxid,current in self._mergedNodeIterator():
+ self._index[taxid]=self._index[current]
+
+ print >>sys.stderr,"Adding deleted taxid..."
+ for taxid in self._deletedNodeIterator():
+ self._index[taxid]=None
+
+
+ Taxonomy.__init__(self)
+
+ #####
+ #
+ # Iterator functions
+ #
+ #####
+
+ def _readNodeTable(self):
+
+ cursor = self._dbconnect.cursor()
+
+ cursor.execute("""
+ select taxid,rank,parent
+ from ncbitaxonomy.nodes
+ """)
+
+ print >>sys.stderr,"Reading taxonomy nodes..."
+ taxonomy=[list(n) for n in cursor]
+
+ print >>sys.stderr,"List all taxonomy rank..."
+ ranks =list(set(x[1] for x in taxonomy))
+ ranks.sort()
+ rankidx = dict(map(None,ranks,xrange(len(ranks))))
+
+ print >>sys.stderr,"Sorting taxons..."
+ taxonomy.sort(TaxonomyDump._taxonCmp)
+
+ self._taxonomy=taxonomy
+
+ print >>sys.stderr,"Indexing taxonomy..."
+ index = {}
+ for t in self._taxonomy:
+ index[t[0]]=self._bsearchTaxon(t[0])
+
+ print >>sys.stderr,"Indexing parent and rank..."
+ for t in self._taxonomy:
+ t[1]=rankidx[t[1]]
+ t[2]=index[t[2]]
+
+ self._ranks=ranks
+ self._index=index
+
+ cursor.close()
+
+ def _nameIterator(self):
+ cursor = self._dbconnect.cursor()
+
+ cursor.execute("""
+ select taxid,name,nameclass
+ from ncbitaxonomy.names
+ """)
+
+ for taxid,name,nameclass in cursor:
+ yield taxid,name,nameclass
+
+ cursor.close()
+
+ def _mergedNodeIterator(self):
+ cursor = self._dbconnect.cursor()
+
+ cursor.execute("""
+ select oldtaxid,newtaxid
+ from ncbitaxonomy.merged
+ """)
+
+ for oldtaxid,newtaxid in cursor:
+ yield oldtaxid,newtaxid
+
+ cursor.close()
+
+ def _deletedNodeIterator(self):
+ cursor = self._dbconnect.cursor()
+
+ cursor.execute("""
+ select taxid
+ from ncbitaxonomy.delnodes
+ """)
+
+ for taxid in cursor:
+ yield taxid[0]
+
+ cursor.close()
diff --git a/obitools/ecopcr/__init__.py b/obitools/ecopcr/__init__.py
new file mode 100644
index 0000000..10a90e5
--- /dev/null
+++ b/obitools/ecopcr/__init__.py
@@ -0,0 +1,69 @@
+from obitools import utils
+from obitools import NucSequence
+from obitools.utils import universalOpen, universalTell, fileSize, progressBar
+import struct
+import sys
+
+
+class EcoPCRFile(utils.ColumnFile):
+ def __init__(self,stream):
+ utils.ColumnFile.__init__(self,
+ stream, '|', True,
+ (str,int,int,
+ str,int,str,
+ int,str,int,
+ str,int,str,
+ str,str,int,float,
+ str,int,float,
+ int,
+ str,str), "#")
+
+
+ def next(self):
+ data = utils.ColumnFile.next(self)
+ seq = NucSequence(data[0],data[20],data[21])
+ seq['seq_length_ori']=data[1]
+ seq['taxid']=data[2]
+ seq['rank']=data[3]
+ seq['species']=data[4]
+ seq['species_sn']=data[5]
+ seq['genus']=data[6]
+ seq['genus_sn']=data[7]
+ seq['family']=data[8]
+ seq['family_sn']=data[9]
+ seq['strand']=data[12]
+ seq['forward_primer']=data[13]
+ seq['forward_error']=data[14]
+ seq['forward_tm']=data[15]
+ seq['reverse_primer']=data[16]
+ seq['reverse_error']=data[17]
+ seq['reverse_tm']=data[18]
+
+ return seq
+
+
+
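+# Usage sketch (illustrative): parse an ecoPCR result file and report the
+# amplified sequences with their primer mismatch counts.
+#
+#   for seq in EcoPCRFile(open('result.ecopcr')):
+#       print seq.id, seq['forward_error'], seq['reverse_error']
+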
+class EcoPCRDBFile(object):
+
+ def _ecoRecordIterator(self,file):
+ file = universalOpen(file)
+ (recordCount,) = struct.unpack('> I',file.read(4))
+ self._recover=False
+
+ if recordCount:
+ for i in xrange(recordCount):
+ (recordSize,)=struct.unpack('>I',file.read(4))
+ record = file.read(recordSize)
+ yield record
+ else:
+ print >> sys.stderr,"\n\n WARNING : EcoPCRDB readding set into recover data mode\n"
+ self._recover=True
+ ok=True
+ while(ok):
+ try:
+ (recordSize,)=struct.unpack('>I',file.read(4))
+ record = file.read(recordSize)
+ yield record
+                except struct.error:
+                    # truncated read: no more recoverable records
+                    ok=False
+
\ No newline at end of file
diff --git a/obitools/ecopcr/annotation.py b/obitools/ecopcr/annotation.py
new file mode 100644
index 0000000..7c76fb2
--- /dev/null
+++ b/obitools/ecopcr/annotation.py
@@ -0,0 +1,104 @@
+import struct
+
+class EcoPCRDBAnnotationWriter(object):
+ '''
+ Class used to write Annotation description in EcoPCRDB format.
+
+ EcoPCRDBAnnotationWriter is oftenly called through the EcoPCRDBSequenceWriter class
+
+ @see: L{ecopcr.sequence.EcoPCRDBSequenceWriter}
+ '''
+
+    def __init__(self,dbname,id,fileidx=1,type=('CDS',),definition=None):
+ '''
+ class constructor
+
+ @param dbname: name of ecoPCR database
+ @type dbname: C{str}
+ @param id: name of the qualifier used as feature id
+ @type id: C{str}
+        @param fileidx: index used to number the annotation file on disk
+        @type fileidx: C{int}
+        @param type: feature types stored in the annotation file
+        @type type: C{list} or C{tuple}
+        @param definition: name of the qualifier used as feature definition
+        @type definition: C{str}
+ '''
+ self._type = type
+ self._definition = definition
+ self._id = id
+ self._filename="%s_%03d.adx" % (dbname,fileidx)
+ self._file = open(self._filename,'wb')
+ self._sequenceIdx=0
+
+
+ ftname ="%s.fdx" % (dbname)
+ ft = open(ftname,'wb')
+
+ self._fttypeidx=dict(map(None,type,xrange(len(type))))
+
+ ft.write(struct.pack('> I',len(type)))
+
+ for t in type:
+ ft.write(self._ecoFtTypePacker(t))
+
+ ft.close()
+
+ self._annotationCount=0
+ self._file.write(struct.pack('> I',self._annotationCount))
+
+
+ def _ecoFtTypePacker(self,type):
+ totalSize = len(type)
+ packed = struct.pack('> I %ds' % totalSize,totalSize,type)
+
+ assert len(packed) == totalSize+4, "error in feature type packing"
+
+ return packed
+
+ def _ecoAnnotationPacker(self,feature,seqidx):
+ begin = feature.begin-1
+ end = feature.end
+ type = self._fttypeidx[feature.ftType]
+ strand = feature.isDirect()
+ id = feature[self._id][0]
+ if self._definition in feature:
+ definition = feature[self._definition][0]
+ else:
+ definition = ''
+
+ assert strand is not None,"Only strand defined features can be stored"
+
+ deflength = len(definition)
+
+ totalSize = 4 + 4 + 4 + 4 + 4 + 20 + 4 + deflength
+
+ packed = struct.pack('> I I I I I 20s I %ds' % (deflength),
+ totalSize,
+ seqidx,
+ begin,
+ end,
+ type,
+ int(strand),
+ id,
+ deflength,
+ definition)
+
+ assert len(packed) == totalSize+4, "error in annotation packing"
+
+ return packed
+
+
+ def put(self,sequence,seqidx=None):
+ if seqidx is None:
+ seqidx = self._sequenceIdx
+ self._sequenceIdx+=1
+ for feature in sequence.getFeatureTable():
+ if feature.ftType in self._type:
+ self._annotationCount+=1
+ self._file.write(self._ecoAnnotationPacker(feature,seqidx))
+
+ def __del__(self):
+ self._file.seek(0,0)
+ self._file.write(struct.pack('> I',self._annotationCount))
+ self._file.close()
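+
+# Usage sketch (illustrative): the writer is normally driven through
+# EcoPCRDBSequenceWriter, but it can also be used on its own.
+#
+#   w = EcoPCRDBAnnotationWriter('mydb','gene',type=('CDS','rRNA'))
+#   w.put(annotatedSequence)   # hypothetical sequence with a feature table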
diff --git a/obitools/ecopcr/options.py b/obitools/ecopcr/options.py
new file mode 100644
index 0000000..03663cd
--- /dev/null
+++ b/obitools/ecopcr/options.py
@@ -0,0 +1,129 @@
+'''
+Created on 13 fevr. 2011
+
+@author: coissac
+'''
+
+from obitools.ecopcr.taxonomy import Taxonomy, EcoTaxonomyDB, TaxonomyDump, ecoTaxonomyWriter
+
+try:
+ from obitools.ecobarcode.options import addEcoBarcodeDBOption,ecobarcodeDatabaseConnection
+except ImportError:
+ def addEcoBarcodeDBOption(optionmanager):
+ pass
+ def ecobarcodeDatabaseConnection(options):
+ return None
+
+def addTaxonomyDBOptions(optionManager):
+ addEcoBarcodeDBOption(optionManager)
+ optionManager.add_option('-d','--database',
+ action="store", dest="taxonomy",
+ metavar="",
+ type="string",
+ help="ecoPCR taxonomy Database "
+ "name")
+ optionManager.add_option('-t','--taxonomy-dump',
+ action="store", dest="taxdump",
+ metavar="",
+ type="string",
+ help="NCBI Taxonomy dump repository "
+ "name")
+
+
+def addTaxonomyFilterOptions(optionManager):
+ addTaxonomyDBOptions(optionManager)
+ optionManager.add_option('--require-rank',
+ action="append",
+ dest='requiredRank',
+ metavar="",
+ type="string",
+ default=[],
+ help="select sequence with taxid tag containing "
+ "a parent of rank ")
+
+ optionManager.add_option('-r','--required',
+ action="append",
+ dest='required',
+ metavar="",
+ type="int",
+ default=[],
+ help="required taxid")
+
+ optionManager.add_option('-i','--ignore',
+ action="append",
+ dest='ignored',
+ metavar="",
+ type="int",
+ default=[],
+ help="ignored taxid")
+
+def loadTaxonomyDatabase(options):
+ if isinstance(options.taxonomy, Taxonomy):
+ return options.taxonomy
+ taxonomy = ecobarcodeDatabaseConnection(options)
+ if (taxonomy is not None or
+ options.taxonomy is not None or
+ options.taxdump is not None):
+ if options.taxdump is not None:
+ taxonomy = TaxonomyDump(options.taxdump)
+ if taxonomy is not None and isinstance(options.taxonomy, str):
+ ecoTaxonomyWriter(options.taxonomy,taxonomy)
+ options.ecodb=options.taxonomy
+ if isinstance(options.taxonomy, Taxonomy):
+ taxonomy = options.taxonomy
+ if taxonomy is None and isinstance(options.taxonomy, str):
+ taxonomy = EcoTaxonomyDB(options.taxonomy)
+ options.ecodb=options.taxonomy
+ options.taxonomy=taxonomy
+ return options.taxonomy
+
+def taxonomyFilterGenerator(options):
+ loadTaxonomyDatabase(options)
+ if options.taxonomy is not None:
+ taxonomy=options.taxonomy
+ def taxonomyFilter(seq):
+ def annotateAtRank(seq,rank):
+ if 'taxid' in seq and seq['taxid'] is not None:
+ rtaxid= taxonomy.getTaxonAtRank(seq['taxid'],rank)
+ return rtaxid
+ return None
+ good = True
+ if 'taxid' in seq:
+ taxid = seq['taxid']
+# print taxid,
+ if options.requiredRank:
+ taxonatrank = reduce(lambda x,y: x and y,
+ (annotateAtRank(seq,rank) is not None
+ for rank in options.requiredRank),True)
+ good = good and taxonatrank
+# print >>sys.stderr, " Has rank : ",good,
+ if options.required:
+ good = good and reduce(lambda x,y: x or y,
+ (taxonomy.isAncestor(r,taxid) for r in options.required),
+ False)
+# print " Required : ",good,
+ if options.ignored:
+ good = good and not reduce(lambda x,y: x or y,
+ (taxonomy.isAncestor(r,taxid) for r in options.ignored),
+ False)
+# print " Ignored : ",good,
+# print " Global : ",good
+
+ return good
+
+
+ else:
+ def taxonomyFilter(seq):
+ return True
+
+ return taxonomyFilter
+
+def taxonomyFilterIteratorGenerator(options):
+ taxonomyFilter = taxonomyFilterGenerator(options)
+
+ def filterIterator(seqiterator):
+ for seq in seqiterator:
+ if taxonomyFilter(seq):
+ yield seq
+
+ return filterIterator
\ No newline at end of file
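+# Usage sketch (illustrative): keep only the sequences accepted by the
+# taxonomy filter built from the parsed command line options.
+#
+#   onlyGood = taxonomyFilterIteratorGenerator(options)
+#   for seq in onlyGood(sequenceIterator):
+#       print seq.id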
diff --git a/obitools/ecopcr/sequence.py b/obitools/ecopcr/sequence.py
new file mode 100644
index 0000000..1465e69
--- /dev/null
+++ b/obitools/ecopcr/sequence.py
@@ -0,0 +1,133 @@
+from obitools import NucSequence
+from obitools.ecopcr import EcoPCRDBFile
+from obitools.ecopcr.taxonomy import EcoTaxonomyDB, ecoTaxonomyWriter
+from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
+from obitools.utils import universalOpen
+from glob import glob
+import struct
+import gzip
+import sys
+
+
+class EcoPCRDBSequenceIterator(EcoPCRDBFile):
+ '''
+    Build an iterator over the sequences included in a sequence database
+    formatted for ecoPCR.
+ '''
+
+ def __init__(self,path,taxonomy=None):
+ '''
+ ecoPCR data iterator constructor
+
+ @param path: path to the ecoPCR database including the database prefix name
+ @type path: C{str}
+        @param taxonomy: a taxonomy can be given to the reader to decode the taxonomic data
+                         associated with the sequences. If no taxonomy is furnished, it is read 
+                         from the same path before opening the sequence database files.
+ @type taxonomy: L{obitools.ecopcr.taxonomy.Taxonomy}
+ '''
+ self._path = path
+
+ if taxonomy is not None:
+ self._taxonomy=taxonomy
+ else:
+ self._taxonomy=EcoTaxonomyDB(path)
+
+ self._seqfilesFiles = glob('%s_???.sdx' % self._path)
+ self._seqfilesFiles.sort()
+
+ def __ecoSequenceIterator(self,file):
+ for record in self._ecoRecordIterator(file):
+ lrecord = len(record)
+ lnames = lrecord - (4*4+20)
+ (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
+ seqid=seqid.strip('\x00')
+ de = string[:deflength]
+ seq = gzip.zlib.decompress(string[deflength:])
+ bioseq = NucSequence(seqid,seq,de,taxidx=taxid,taxid=self._taxonomy._taxonomy[taxid][0])
+ yield bioseq
+
+ def __iter__(self):
+ for seqfile in self._seqfilesFiles:
+ for seq in self.__ecoSequenceIterator(seqfile):
+ yield seq
+
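+# Usage sketch (illustrative): read every sequence of an ecoPCR formatted
+# database whose files share the prefix 'mydb'.
+#
+#   for seq in EcoPCRDBSequenceIterator('mydb'):
+#       print seq.id, seq['taxid'], len(seq)
+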
+class EcoPCRDBSequenceWriter(object):
+
+ def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None,append=False):
+ self._taxonomy=taxonomy
+ self._filename="%s_%03d.sdx" % (dbname,fileidx)
+ if append:
+ mode ='r+b'
+ f = universalOpen(self._filename)
+ (recordCount,) = struct.unpack('> I',f.read(4))
+ self._sequenceCount=recordCount
+ del f
+ self._file = open(self._filename,mode)
+ self._file.seek(0,0)
+ self._file.write(struct.pack('> I',0))
+ self._file.seek(0,2)
+ else:
+ self._sequenceCount=0
+ mode = 'wb'
+ self._file = open(self._filename,mode)
+ self._file.write(struct.pack('> I',self._sequenceCount))
+
+ if self._taxonomy is not None:
+ print >> sys.stderr,"Writing the taxonomy file...",
+ ecoTaxonomyWriter(dbname,self._taxonomy)
+ print >> sys.stderr,"Ok"
+
+ if type is not None:
+ assert ftid is not None,"You must specify an id attribute for features"
+ self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
+ else:
+ self._annotation = None
+
+ def _ecoSeqPacker(self,seq):
+
+ compactseq = gzip.zlib.compress(str(seq).upper(),9)
+ cptseqlength = len(compactseq)
+ delength = len(seq.definition)
+
+ totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
+
+ if self._taxonomy is None or 'taxid' not in seq:
+ taxon=-1
+ else:
+ taxon=self._taxonomy.findIndex(seq['taxid'])
+
+ try:
+ packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength),
+ totalSize,
+ taxon,
+ seq.id,
+ delength,
+ len(seq),
+ cptseqlength,
+ seq.definition,
+ compactseq)
+ except struct.error as e:
+ print >>sys.stderr,"\n\n============\n\nError on sequence : %s\n\n" % seq.id
+ raise e
+
+ assert len(packed) == totalSize+4, "error in sequence packing"
+
+ return packed
+
+
+ def put(self,sequence):
+ if self._taxonomy is not None:
+ if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'):
+ sequence.extractTaxon()
+ self._file.write(self._ecoSeqPacker(sequence))
+ if self._annotation is not None:
+ self._annotation.put(sequence, self._sequenceCount)
+ self._sequenceCount+=1
+
+ def __del__(self):
+ self._file.seek(0,0)
+ self._file.write(struct.pack('> I',self._sequenceCount))
+ self._file.close()
+
+
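+# Example (sketch): writing sequences into an ecoPCR database;
+# 'mydb' and 'seqs' (an iterable of NucSequence) are hypothetical.
+#
+#   w = EcoPCRDBSequenceWriter('mydb')
+#   for s in seqs:
+#       w.put(s)
+#   del w     # the destructor rewrites the record count header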
diff --git a/obitools/ecopcr/taxonomy.py b/obitools/ecopcr/taxonomy.py
new file mode 100644
index 0000000..bb2ec4e
--- /dev/null
+++ b/obitools/ecopcr/taxonomy.py
@@ -0,0 +1,630 @@
+import struct
+import sys
+
+from itertools import count,imap
+
+from obitools.ecopcr import EcoPCRDBFile
+from obitools.utils import universalOpen
+from obitools.utils import ColumnFile
+
+class Taxonomy(object):
+ def __init__(self):
+ '''
+        The base taxonomy constructor.
+
+        It assumes the subclass constructor has already loaded the raw taxonomic
+        data (C{_taxonomy}, C{_index}, C{_ranks}, C{_name} and C{_preferedName})
+        and only builds the derived indexes.
+ '''
+
+ self._ranks.append('obi')
+
+ self._speciesidx = self._ranks.index('species')
+ self._genusidx = self._ranks.index('genus')
+ self._familyidx = self._ranks.index('family')
+ self._orderidx = self._ranks.index('order')
+ self._nameidx=dict((x[0],x[2]) for x in self._name)
+ self._nameidx.update(dict((x[0],x[2]) for x in self._preferedName))
+ self._preferedidx=dict((x[2],x[1]) for x in self._preferedName)
+
+ self._bigestTaxid = max(x[0] for x in self._taxonomy)
+
+
+ def findTaxonByIdx(self,idx):
+ if idx is None:
+ return None
+ return self._taxonomy[idx]
+
+ def findIndex(self,taxid):
+ if taxid is None:
+ return None
+ return self._index[taxid]
+
+ def findTaxonByTaxid(self,taxid):
+ return self.findTaxonByIdx(self.findIndex(taxid))
+
+ def findTaxonByName(self,name):
+ return self._taxonomy[self._nameidx[name]]
+
+ def findRankByName(self,rank):
+ try:
+ return self._ranks.index(rank)
+ except ValueError:
+ return None
+
+ def __contains__(self,taxid):
+ return self.findTaxonByTaxid(taxid) is not None
+
+
+
+
+ #####
+ #
+ # PUBLIC METHODS
+ #
+ #####
+
+
+ def subTreeIterator(self, taxid):
+ "return subtree for given taxonomic id "
+ idx = self.findTaxonByTaxid(taxid)
+ yield self._taxonomy[idx]
+ for t in self._taxonomy:
+ if t[2] == idx:
+ for subt in self.subTreeIterator(t[0]):
+ yield subt
+
+ def parentalTreeIterator(self, taxid):
+ """
+        return the parental tree for the given taxonomic id, starting from
+        the first ancestor up to the root.
+ """
+ taxon=self.findTaxonByTaxid(taxid)
+ if taxon is not None:
+ while taxon[2]!= 0:
+ yield taxon
+ taxon = self._taxonomy[taxon[2]]
+ yield self._taxonomy[0]
+ else:
+ raise StopIteration
+
+ def isAncestor(self,parent,taxid):
+ return parent in [x[0] for x in self.parentalTreeIterator(taxid)]
+
+ def lastCommonTaxon(self,*taxids):
+ if not taxids:
+ return None
+ if len(taxids)==1:
+ return taxids[0]
+
+ if len(taxids)==2:
+ t1 = [x[0] for x in self.parentalTreeIterator(taxids[0])]
+ t2 = [x[0] for x in self.parentalTreeIterator(taxids[1])]
+ t1.reverse()
+ t2.reverse()
+
+ count = min(len(t1),len(t2))
+ i=0
+ while(i < count and t1[i]==t2[i]):
+ i+=1
+ i-=1
+
+ return t1[i]
+
+ ancetre = taxids[0]
+ for taxon in taxids[1:]:
+ ancetre = self.lastCommonTaxon(ancetre,taxon)
+
+ return ancetre
+
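+    # Example (sketch): last common ancestor of a set of taxids, using the
+    # hypothetical taxids 9606 (Homo sapiens) and 10090 (Mus musculus):
+    #
+    #   lca = taxonomy.lastCommonTaxon(9606,10090)
+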
+ def betterCommonTaxon(self,error=1,*taxids):
+ lca = self.lastCommonTaxon(*taxids)
+ idx = self._index[lca]
+ sublca = [t[0] for t in self._taxonomy if t[2]==idx]
+ return sublca
+
+
+ def getPreferedName(self,taxid):
+ idx = self.findIndex(taxid)
+ return self._preferedidx.get(idx,self._taxonomy[idx][3])
+
+
+ def getScientificName(self,taxid):
+ return self.findTaxonByTaxid(taxid)[3]
+
+ def getRankId(self,taxid):
+ return self.findTaxonByTaxid(taxid)[1]
+
+ def getRank(self,taxid):
+ return self._ranks[self.getRankId(taxid)]
+
+ def getTaxonAtRank(self,taxid,rankid):
+ if isinstance(rankid, str):
+ rankid=self._ranks.index(rankid)
+ try:
+ return [x[0] for x in self.parentalTreeIterator(taxid)
+ if x[1]==rankid][0]
+ except IndexError:
+ return None
+
+ def getSpecies(self,taxid):
+ return self.getTaxonAtRank(taxid, self._speciesidx)
+
+ def getGenus(self,taxid):
+ return self.getTaxonAtRank(taxid, self._genusidx)
+
+ def getFamily(self,taxid):
+ return self.getTaxonAtRank(taxid, self._familyidx)
+
+ def getOrder(self,taxid):
+ return self.getTaxonAtRank(taxid, self._orderidx)
+
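+    # Example (sketch): climbing the tree to a given rank, with the
+    # hypothetical taxid 9606:
+    #
+    #   taxonomy.getSpecies(9606)                # taxid at rank 'species'
+    #   taxonomy.getTaxonAtRank(9606,'family')   # same mechanism, explicit rank
+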
+ def rankIterator(self):
+ for x in imap(None,self._ranks,xrange(len(self._ranks))):
+ yield x
+
+ def groupTaxa(self,taxa,groupname):
+ t=[self.findTaxonByTaxid(x) for x in taxa]
+ a=set(x[2] for x in t)
+ assert len(a)==1,"All taxa must have the same parent"
+ newtaxid=max([2999999]+[x[0] for x in self._taxonomy if x[0]>=3000000 and x[0]<4000000])+1
+ newidx=len(self._taxonomy)
+ if 'GROUP' not in self._ranks:
+ self._ranks.append('GROUP')
+ rankid=self._ranks.index('GROUP')
+ self._taxonomy.append((newtaxid,rankid,a.pop(),groupname))
+ for x in t:
+ x[2]=newidx
+
+ def addLocalTaxon(self,name,rank,parent,minimaltaxid=10000000):
+ newtaxid = minimaltaxid if (self._bigestTaxid < minimaltaxid) else self._bigestTaxid+1
+
+ rankid=self.findRankByName(rank)
+ parentidx = self.findIndex(int(parent))
+ tx = (newtaxid,rankid,parentidx,name,'local')
+ self._taxonomy.append(tx)
+ newidx=len(self._taxonomy)-1
+ self._name.append((name,'scientific name',newidx))
+ self._nameidx[name]=newidx
+ self._index[newtaxid]=newidx
+
+ self._bigestTaxid=newtaxid
+
+ return newtaxid
+
+ def removeLocalTaxon(self,taxid):
+        # the removal logic below is unfinished; keep the method disabled
+        raise NotImplementedError
+ txidx = self.findIndex(taxid)
+ taxon = self.findTaxonByIdx(txidx)
+
+ assert txidx >= self._localtaxon,"Only local taxon can be deleted"
+
+ for t in self._taxonomy:
+ if t[2] == txidx:
+ self.removeLocalTaxon(t[0])
+
+
+
+
+ return taxon
+
+ def addPreferedName(self,taxid,name):
+ idx = self.findIndex(taxid)
+        self._preferedName.append((name,'obi',idx))
+ self._preferedidx[idx]=name
+ return taxid
+
+class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
+ '''
+ A taxonomy database class
+ '''
+
+
+ def __init__(self,path):
+ '''
+ The taxonomy database constructor
+
+ @param path: path to the ecoPCR database including the database prefix name
+ @type path: C{str}
+ '''
+ self._path = path
+ self._taxonFile = "%s.tdx" % self._path
+ self._localTaxonFile = "%s.ldx" % self._path
+ self._ranksFile = "%s.rdx" % self._path
+ self._namesFile = "%s.ndx" % self._path
+ self._preferedNamesFile = "%s.pdx" % self._path
+ self._aliasFile = "%s.adx" % self._path
+
+ print >> sys.stderr,"Reading binary taxonomy database...",
+
+ self.__readNodeTable()
+
+ print >> sys.stderr," ok"
+
+ Taxonomy.__init__(self)
+
+
+ #####
+ #
+ # Iterator functions
+ #
+ #####
+
+ def __ecoNameIterator(self,file):
+ for record in self._ecoRecordIterator(file):
+ lrecord = len(record)
+ lnames = lrecord - 16
+ (isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record)
+ name=names[:namelength]
+ classname=names[namelength:]
+ yield (name,classname,indextaxid)
+
+
+ def __ecoTaxonomicIterator(self):
+ for record in self._ecoRecordIterator(self._taxonFile):
+ lrecord = len(record)
+ lnames = lrecord - 16
+ (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
+ yield (taxid,rankid,parentidx,name,'ncbi')
+
+ try :
+ lt=0
+ for record in self._ecoRecordIterator(self._localTaxonFile):
+ lrecord = len(record)
+ lnames = lrecord - 16
+ (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
+ lt+=1
+ yield (taxid,rankid,parentidx,name,'local')
+ print >> sys.stderr, " [INFO : Local taxon file found] : %d added taxa" % lt
+ except:
+ print >> sys.stderr, " [INFO : Local taxon file not found] "
+
+ def __ecoRankIterator(self):
+ for record in self._ecoRecordIterator(self._ranksFile):
+ yield record
+
+ def __ecoAliasIterator(self):
+ for record in self._ecoRecordIterator(self._aliasFile):
+ (taxid,index) = struct.unpack('> I i',record)
+ yield taxid,index
+
+ #####
+ #
+ # Indexes
+ #
+ #####
+
+ def __ecoNameIndex(self):
+ indexName = [x for x in self.__ecoNameIterator(self._namesFile)]
+ return indexName
+
+ def __ecoRankIndex(self):
+ rank = [r for r in self.__ecoRankIterator()]
+ return rank
+
+ def __ecoTaxonomyIndex(self):
+ taxonomy = []
+
+ try :
+ index = dict(self.__ecoAliasIterator())
+ print >> sys.stderr, " [INFO : Taxon alias file found] "
+ buildIndex=False
+ except:
+ print >> sys.stderr, " [INFO : Taxon alias file not found] "
+ index={}
+            i = 0
+ buildIndex=True
+
+ localtaxon=0
+ i=0
+ for x in self.__ecoTaxonomicIterator():
+ taxonomy.append(x)
+ if x[4]=='ncbi':
+ localtaxon+=1
+
+ if buildIndex or x[4]!='ncbi':
+ index[x[0]] = i
+ i+=1
+
+
+ print >> sys.stderr,"Taxonomical tree read",
+ return taxonomy, index,localtaxon
+
+ def __readNodeTable(self):
+ self._taxonomy, self._index, self._localtaxon= self.__ecoTaxonomyIndex()
+ self._ranks = self.__ecoRankIndex()
+ self._name = self.__ecoNameIndex()
+
+        # Add local taxon names to the name index
+ i=self._localtaxon
+ for t in self._taxonomy[self._localtaxon:]:
+ self._name.append((t[3],'scientific name',i))
+ i+=1
+
+ try :
+ self._preferedName = [(x[0],'obi',x[2])
+ for x in self.__ecoNameIterator(self._preferedNamesFile)]
+ print >> sys.stderr, " [INFO : Prefered taxon name file found] : %d added taxa" % len(self._preferedName)
+ except:
+ print >> sys.stderr, " [INFO : Prefered taxon name file not found]"
+ self._preferedName = []
+
+
+
+
+class TaxonomyDump(Taxonomy):
+
+ def __init__(self,taxdir):
+
+ self._path=taxdir
+ self._readNodeTable('%s/nodes.dmp' % taxdir)
+
+ print >>sys.stderr,"Adding scientific name..."
+
+ self._name=[]
+ for taxid,name,classname in self._nameIterator('%s/names.dmp' % taxdir):
+ self._name.append((name,classname,self._index[taxid]))
+ if classname == 'scientific name':
+ self._taxonomy[self._index[taxid]].extend([name,'ncbi'])
+
+ print >>sys.stderr,"Adding taxid alias..."
+ for taxid,current in self._mergedNodeIterator('%s/merged.dmp' % taxdir):
+ self._index[taxid]=self._index[current]
+
+ print >>sys.stderr,"Adding deleted taxid..."
+ for taxid in self._deletedNodeIterator('%s/delnodes.dmp' % taxdir):
+ self._index[taxid]=None
+
+ self._nameidx=dict((x[0],x[2]) for x in self._name)
+
+
+ def _taxonCmp(t1,t2):
+ if t1[0] < t2[0]:
+ return -1
+ elif t1[0] > t2[0]:
+ return +1
+ return 0
+
+ _taxonCmp=staticmethod(_taxonCmp)
+
+ def _bsearchTaxon(self,taxid):
+ taxCount = len(self._taxonomy)
+ begin = 0
+ end = taxCount
+ oldcheck=taxCount
+        check = (begin + end) / 2
+ while check != oldcheck and self._taxonomy[check][0]!=taxid :
+ if self._taxonomy[check][0] < taxid:
+ begin=check
+ else:
+ end=check
+ oldcheck=check
+ check = (begin + end) / 2
+
+
+ if self._taxonomy[check][0]==taxid:
+ return check
+ else:
+ return None
+
+
+
+ def _readNodeTable(self,file):
+
+ file = universalOpen(file)
+
+ nodes = ColumnFile(file,
+ sep='|',
+ types=(int,int,str,
+ str,str,bool,
+ int,bool,int,
+ bool,bool,bool,str))
+ print >>sys.stderr,"Reading taxonomy dump file..."
+ # (taxid,rank,parent)
+ taxonomy=[[n[0],n[2],n[1]] for n in nodes]
+ print >>sys.stderr,"List all taxonomy rank..."
+ ranks =list(set(x[1] for x in taxonomy))
+ ranks.sort()
+ rankidx = dict(map(None,ranks,xrange(len(ranks))))
+
+ print >>sys.stderr,"Sorting taxons..."
+ taxonomy.sort(TaxonomyDump._taxonCmp)
+
+ self._taxonomy=taxonomy
+ self._localtaxon=len(taxonomy)
+
+ print >>sys.stderr,"Indexing taxonomy..."
+ index = {}
+ for t in self._taxonomy:
+ index[t[0]]=self._bsearchTaxon(t[0])
+
+ print >>sys.stderr,"Indexing parent and rank..."
+ for t in self._taxonomy:
+ t[1]=rankidx[t[1]]
+ t[2]=index[t[2]]
+
+ self._ranks=ranks
+ self._index=index
+ self._preferedName = []
+
+ def _nameIterator(self,file):
+ file = universalOpen(file)
+ names = ColumnFile(file,
+ sep='|',
+ types=(int,str,
+ str,str))
+ for taxid,name,unique,classname,white in names:
+ yield taxid,name,classname
+
+ def _mergedNodeIterator(self,file):
+ file = universalOpen(file)
+ merged = ColumnFile(file,
+ sep='|',
+ types=(int,int,str))
+ for taxid,current,white in merged:
+ yield taxid,current
+
+ def _deletedNodeIterator(self,file):
+ file = universalOpen(file)
+ deleted = ColumnFile(file,
+ sep='|',
+ types=(int,str))
+ for taxid,white in deleted:
+ yield taxid
+
+#####
+#
+#
+# Binary writer
+#
+#
+#####
+
+def ecoTaxonomyWriter(prefix, taxonomy,onlyLocal=False):
+
+ def ecoTaxPacker(tx):
+
+ namelength = len(tx[3])
+
+ totalSize = 4 + 4 + 4 + 4 + namelength
+
+ packed = struct.pack('> I I I I I %ds' % namelength,
+ totalSize,
+ tx[0],
+ tx[1],
+ tx[2],
+ namelength,
+ tx[3])
+
+ return packed
+
+ def ecoRankPacker(rank):
+
+ namelength = len(rank)
+
+ packed = struct.pack('> I %ds' % namelength,
+ namelength,
+ rank)
+
+ return packed
+
+ def ecoAliasPacker(taxid,index):
+
+ totalSize = 4 + 4
+ try:
+ packed = struct.pack('> I I i',
+ totalSize,
+ taxid,
+ index)
+ except struct.error,e:
+ print >>sys.stderr,(totalSize,taxid,index)
+ print >>sys.stderr,"Total size : %d taxid : %d index : %d" %(totalSize,taxid,index)
+ raise e
+
+ return packed
+
+ def ecoNamePacker(name):
+
+ namelength = len(name[0])
+ classlength= len(name[1])
+ totalSize = namelength + classlength + 4 + 4 + 4 + 4
+
+ packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength),
+ totalSize,
+ int(name[1]=='scientific name'),
+ namelength,
+ classlength,
+ name[2],
+ name[0],
+ name[1])
+
+ return packed
+
+
+ def ecoTaxWriter(file,taxonomy):
+ output = open(file,'wb')
+        nbtaxon = sum(1 for t in taxonomy if t[4]=='ncbi')
+
+ output.write(struct.pack('> I',nbtaxon))
+
+ for tx in taxonomy:
+ if tx[4]=='ncbi':
+ output.write(ecoTaxPacker(tx))
+
+ output.close()
+ return nbtaxon < len(taxonomy)
+
+ def ecoLocalTaxWriter(file,taxonomy):
+        nbtaxon = sum(1 for t in taxonomy if t[4]!='ncbi')
+
+ if nbtaxon:
+ output = open(file,'wb')
+
+ output.write(struct.pack('> I',nbtaxon))
+
+ for tx in taxonomy:
+ if tx[4]!='ncbi':
+ output.write(ecoTaxPacker(tx))
+
+ output.close()
+
+
+ def ecoRankWriter(file,ranks):
+ output = open(file,'wb')
+ output.write(struct.pack('> I',len(ranks)))
+
+ for rank in ranks:
+ output.write(ecoRankPacker(rank))
+
+ output.close()
+
+ def ecoAliasWriter(file,index):
+ output = open(file,'wb')
+ output.write(struct.pack('> I',len(index)))
+
+ for taxid in index:
+ i=index[taxid]
+ if i is None:
+ i=-1
+ output.write(ecoAliasPacker(taxid, i))
+
+ output.close()
+
+ def nameCmp(n1,n2):
+ name1=n1[0].upper()
+ name2=n2[0].upper()
+ if name1 < name2:
+ return -1
+ elif name1 > name2:
+ return 1
+ return 0
+
+
+ def ecoNameWriter(file,names):
+ output = open(file,'wb')
+ output.write(struct.pack('> I',len(names)))
+
+ names.sort(nameCmp)
+
+ for name in names:
+ output.write(ecoNamePacker(name))
+
+ output.close()
+
+ def ecoPreferedNameWriter(file,names):
+ output = open(file,'wb')
+ output.write(struct.pack('> I',len(names)))
+ for name in names:
+ output.write(ecoNamePacker(name))
+
+ output.close()
+
+ localtaxon=True
+ if not onlyLocal:
+ ecoRankWriter('%s.rdx' % prefix, taxonomy._ranks)
+ localtaxon = ecoTaxWriter('%s.tdx' % prefix, taxonomy._taxonomy)
+ ecoNameWriter('%s.ndx' % prefix, [x for x in taxonomy._name if x[2] < taxonomy._localtaxon])
+ ecoAliasWriter('%s.adx' % prefix, taxonomy._index)
+ if localtaxon:
+ ecoLocalTaxWriter('%s.ldx' % prefix, taxonomy._taxonomy)
+ if taxonomy._preferedName:
+ ecoNameWriter('%s.pdx' % prefix, taxonomy._preferedName)
diff --git a/obitools/ecotag/__init__.py b/obitools/ecotag/__init__.py
new file mode 100644
index 0000000..26c94d3
--- /dev/null
+++ b/obitools/ecotag/__init__.py
@@ -0,0 +1,2 @@
+class EcoTagResult(dict):
+ pass
\ No newline at end of file
diff --git a/obitools/ecotag/parser.py b/obitools/ecotag/parser.py
new file mode 100644
index 0000000..f431e34
--- /dev/null
+++ b/obitools/ecotag/parser.py
@@ -0,0 +1,150 @@
+from itertools import imap
+from obitools import utils
+
+from obitools.ecotag import EcoTagResult
+
+class EcoTagFileIterator(utils.ColumnFile):
+
+ @staticmethod
+ def taxid(x):
+ x = int(x)
+ if x < 0:
+ return None
+ else:
+ return x
+
+ @staticmethod
+ def scientificName(x):
+ if x=='--':
+ return None
+ else:
+ return x
+
+ @staticmethod
+ def value(x):
+ if x=='--':
+ return None
+ else:
+ return float(x)
+
+ @staticmethod
+ def count(x):
+ if x=='--':
+ return None
+ else:
+ return int(x)
+
+
+ def __init__(self,stream):
+ utils.ColumnFile.__init__(self,
+ stream, '\t', True,
+ (str,str,str,
+ EcoTagFileIterator.value,
+ EcoTagFileIterator.value,
+ EcoTagFileIterator.value,
+ EcoTagFileIterator.count,
+ EcoTagFileIterator.count,
+ EcoTagFileIterator.taxid,
+ EcoTagFileIterator.scientificName,
+ str,
+ EcoTagFileIterator.taxid,
+ EcoTagFileIterator.scientificName,
+ EcoTagFileIterator.taxid,
+ EcoTagFileIterator.scientificName,
+ EcoTagFileIterator.taxid,
+ EcoTagFileIterator.scientificName,
+ str
+ ))
+ self._memory=None
+
+ _colname = ['identification',
+ 'seqid',
+ 'best_match_ac',
+ 'max_identity',
+ 'min_identity',
+ 'theorical_min_identity',
+ 'count',
+ 'match_count',
+ 'taxid',
+ 'scientific_name',
+ 'rank',
+ 'order_taxid',
+ 'order_sn',
+ 'family_taxid',
+ 'family_sn',
+ 'genus_taxid',
+ 'genus_sn',
+ 'species_taxid',
+ 'species_sn',
+ 'sequence']
+
+ def next(self):
+ if self._memory is not None:
+ data=self._memory
+ self._memory=None
+ else:
+ data = utils.ColumnFile.next(self)
+ data = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(data)],data))
+
+ if data['identification']=='ID':
+ data.cd=[]
+ try:
+ nextone = utils.ColumnFile.next(self)
+ nextone = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(nextone)],nextone))
+ except StopIteration:
+ nextone = None
+ while nextone is not None and nextone['identification']=='CD':
+ data.cd.append(nextone)
+ try:
+ nextone = utils.ColumnFile.next(self)
+ nextone = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(nextone)],nextone))
+ except StopIteration:
+ nextone = None
+ self._memory=nextone
+
+ return data
+
+def ecoTagIdentifiedFilter(ecoTagIterator):
+ for x in ecoTagIterator:
+ if x['identification']=='ID':
+ yield x
+
+
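+# Example (sketch): keeping only identified records from an ecoTag result
+# file; 'ecotag.txt' is a hypothetical file name.
+#
+#   for rec in ecoTagIdentifiedFilter(EcoTagFileIterator(open('ecotag.txt'))):
+#       print rec['seqid'], rec['scientific_name']
+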
+class EcoTagAbstractIterator(utils.ColumnFile):
+
+ _colname = ['scientific_name',
+ 'taxid',
+ 'rank',
+ 'count',
+ 'max_identity',
+ 'min_identity']
+
+
+ @staticmethod
+ def taxid(x):
+ x = int(x)
+ if x < 0:
+ return None
+ else:
+ return x
+
+ def __init__(self,stream):
+ utils.ColumnFile.__init__(self,
+ stream, '\t', True,
+ (str,
+ EcoTagFileIterator.taxid,
+ str,
+ int,
+ float,float,float))
+
+ def next(self):
+ data = utils.ColumnFile.next(self)
+ data = dict(imap(None,EcoTagAbstractIterator._colname,data))
+
+ return data
+
+def ecoTagAbstractFilter(ecoTagAbsIterator):
+ for x in ecoTagAbsIterator:
+ if x['taxid'] is not None:
+ yield x
+
\ No newline at end of file
diff --git a/obitools/eutils/__init__.py b/obitools/eutils/__init__.py
new file mode 100644
index 0000000..1e7d3b2
--- /dev/null
+++ b/obitools/eutils/__init__.py
@@ -0,0 +1,54 @@
+import time
+from urllib2 import urlopen
+import shelve
+from threading import Lock
+import sys
+
+class EUtils(object):
+ '''
+    Base class for NCBI Entrez E-utilities requests: it throttles queries so
+    that at least C{_interval} seconds separate two consecutive requests.
+ '''
+
+ _last_request=0
+ _interval=3
+
+ def __init__(self):
+ self._lock = Lock()
+
+ def wait(self):
+ now=time.time()
+ delta = now - EUtils._last_request
+ while delta < EUtils._interval:
+            # sleep only for the time remaining before the next allowed request
+            time.sleep(EUtils._interval - delta)
+ now=time.time()
+ delta = now - EUtils._last_request
+
+ def _sendRequest(self,url):
+ self.wait()
+ EUtils._last_request=time.time()
+ t = EUtils._last_request
+ print >>sys.stderr,"Sending request to NCBI @ %f" % t
+ data = urlopen(url).read()
+ print >>sys.stderr,"Data red from NCBI @ %f (%f)" % (t,time.time()-t)
+ return data
+
+    def setInterval(self,seconds):
+        EUtils._interval=seconds
+
+
+class EFetch(EUtils):
+ '''
+    Wrapper building and sending requests to the NCBI efetch E-utility.
+ '''
+ def __init__(self,db,tool='OBITools',
+ retmode='text',rettype="native",
+ server='eutils.ncbi.nlm.nih.gov'):
+ EUtils.__init__(self)
+ self._url = "http://%s/entrez/eutils/efetch.fcgi?db=%s&tool=%s&retmode=%s&rettype=%s"
+ self._url = self._url % (server,db,tool,retmode,rettype)
+
+
+ def get(self,**args):
+ key = "&".join(['%s=%s' % x for x in args.items()])
+ return self._sendRequest(self._url +"&" + key)
+
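+# Example (sketch): fetching a record through EFetch; the database name and
+# accession below are illustrative values only.
+#
+#   ef = EFetch(db='nucleotide',rettype='fasta')
+#   print ef.get(id='NC_001807')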
diff --git a/obitools/fast.py b/obitools/fast.py
new file mode 100644
index 0000000..760f493
--- /dev/null
+++ b/obitools/fast.py
@@ -0,0 +1,56 @@
+"""
+ implement the fastn/fastp similarity search algorithm for BioSequence.
+"""
+
+class Fast(object):
+
+ def __init__(self,seq,kup=2):
+ '''
+ @param seq: sequence to hash
+ @type seq: BioSequence
+ @param kup: word size used for hashing process
+ @type kup: int
+ '''
+ hash={}
+ seq = str(seq)
+        for word,pos in ((seq[i:i+kup].upper(),i) for i in xrange(len(seq)-kup+1)):
+ if word in hash:
+ hash[word].append(pos)
+ else:
+ hash[word]=[pos]
+
+ self._kup = kup
+ self._hash= hash
+ self._seq = seq
+
+ def __call__(self,seq):
+ '''
+ Align one sequence with the fast hash table.
+
+ @param seq: the sequence to align
+ @type seq: BioSequence
+
+        @return: a tuple (smax,pmax) where smax is the score of the
+                 largest diagonal and pmax the list of the associated
+                 shifts
+        @rtype: tuple (int, list of int)
+ '''
+ histo={}
+ seq = str(seq).upper()
+ hash= self._hash
+ kup = self._kup
+
+        for word,pos in ((seq[i:i+kup],i) for i in xrange(len(seq)-kup+1)):
+ matchedpos = hash.get(word,[])
+ for p in matchedpos:
+ delta = pos - p
+ histo[delta]=histo.get(delta,0) + 1
+        if not histo:
+            # no word shared between the two sequences
+            return 0,[]
+        smax = max(histo.values())
+ pmax = [x for x in histo if histo[x]==smax]
+ return smax,pmax
+
+ def __len__(self):
+ return len(self._seq)
+
+
+
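+# Example (sketch): locating the best diagonal between two sequences;
+# the plain strings below stand in for BioSequence instances.
+#
+#   index = Fast('acgtacgtacgt',kup=2)
+#   smax,pmax = index('gtacgt')
+#   # smax: score of the best diagonal, pmax: the candidate shifts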
diff --git a/obitools/fasta/__init__.py b/obitools/fasta/__init__.py
new file mode 100644
index 0000000..d5b90c5
--- /dev/null
+++ b/obitools/fasta/__init__.py
@@ -0,0 +1,384 @@
+"""
+fasta module provides functions to read and write sequences in fasta format.
+
+
+"""
+
+#from obitools.format.genericparser import fastGenericEntryIteratorGenerator
+from obitools.format.genericparser import genericEntryIteratorGenerator
+from obitools import bioSeqGenerator,BioSequence,AASequence,NucSequence
+from obitools import _default_raw_parser
+
+#from obitools.alignment import alignmentReader
+#from obitools.utils import universalOpen
+
+import re
+from obitools.ecopcr.options import loadTaxonomyDatabase
+from obitools.format import SequenceFileIterator
+
+#from _fasta import parseFastaDescription,fastaParser
+#from _fasta import _fastaJoinSeq
+#from _fasta import _parseFastaTag
+
+
+#fastaEntryIterator=fastGenericEntryIteratorGenerator(startEntry='>')
+fastaEntryIterator=genericEntryIteratorGenerator(startEntry='>')
+rawFastaEntryIterator=genericEntryIteratorGenerator(startEntry='\s*>')
+
+def _fastaJoinSeq(seqarray):
+ return ''.join([x.strip() for x in seqarray])
+
+
+def parseFastaDescription(ds,tagparser):
+
+ m = tagparser.search(' '+ds)
+ if m is not None:
+ info=m.group(0)
+ definition = ds[m.end(0):].strip()
+ else:
+ info=None
+ definition=ds
+
+ return definition,info
+
+def fastaParser(seq,bioseqfactory,tagparser,rawparser,joinseq=_fastaJoinSeq):
+ '''
+ Parse a fasta record.
+
+ @attention: internal purpose function
+
+ @param seq: a sequence object containing all lines corresponding
+ to one fasta sequence
+ @type seq: C{list} or C{tuple} of C{str}
+
+ @param bioseqfactory: a callable object return a BioSequence
+ instance.
+ @type bioseqfactory: a callable object
+
+ @param tagparser: a compiled regular expression usable
+ to identify key, value couples from
+ title line.
+ @type tagparser: regex instance
+
+ @return: a C{BioSequence} instance
+ '''
+ seq = seq.split('\n')
+ title = seq[0].strip()[1:].split(None,1)
+ id=title[0]
+ if len(title) == 2:
+ definition,info=parseFastaDescription(title[1], tagparser)
+ else:
+ info= None
+ definition=None
+
+ seq=joinseq(seq[1:])
+ return bioseqfactory(id, seq, definition,info,rawparser)
+
+
+def fastaNucParser(seq,tagparser=_default_raw_parser,joinseq=_fastaJoinSeq):
+    # compile the raw tag pattern, as fastaIterator does, before delegating
+    allparser = tagparser % '[a-zA-Z][a-zA-Z0-9_]*'
+    return fastaParser(seq,NucSequence,re.compile('( *%s)+' % allparser),tagparser,joinseq)
+
+def fastaAAParser(seq,tagparser=_default_raw_parser,joinseq=_fastaJoinSeq):
+    allparser = tagparser % '[a-zA-Z][a-zA-Z0-9_]*'
+    return fastaParser(seq,AASequence,re.compile('( *%s)+' % allparser),tagparser,joinseq)
+
+def fastaIterator(file,bioseqfactory=bioSeqGenerator,
+ tagparser=_default_raw_parser,
+ joinseq=_fastaJoinSeq):
+ '''
+ iterate through a fasta file sequence by sequence.
+ Returned sequences by this iterator will be BioSequence
+ instances
+
+ @param file: a line iterator containing fasta data or a filename
+ @type file: an iterable object or str
+ @param bioseqfactory: a callable object return a BioSequence
+ instance.
+ @type bioseqfactory: a callable object
+
+ @param tagparser: a compiled regular expression usable
+ to identify key, value couples from
+ title line.
+ @type tagparser: regex instance
+
+ @return: an iterator on C{BioSequence} instance
+
+ @see: L{fastaNucIterator}
+ @see: L{fastaAAIterator}
+
+ >>> from obitools.format.sequence.fasta import fastaIterator
+ >>> f = fastaIterator('monfichier')
+ >>> s = f.next()
+ >>> print s
+ gctagctagcatgctagcatgcta
+ >>>
+ '''
+ rawparser=tagparser
+ allparser = tagparser % '[a-zA-Z][a-zA-Z0-9_]*'
+ tagparser = re.compile('( *%s)+' % allparser)
+
+ for entry in fastaEntryIterator(file):
+ yield fastaParser(entry,bioseqfactory,tagparser,rawparser,joinseq)
+
+def rawFastaIterator(file,bioseqfactory=bioSeqGenerator,
+ tagparser=_default_raw_parser,
+ joinseq=_fastaJoinSeq):
+
+ rawparser=tagparser
+ allparser = tagparser % '[a-zA-Z][a-zA-Z0-9_]*'
+ tagparser = re.compile('( *%s)+' % allparser)
+
+ for entry in rawFastaEntryIterator(file):
+ entry=entry.strip()
+ yield fastaParser(entry,bioseqfactory,tagparser,rawparser,joinseq)
+
+def fastaNucIterator(file,tagparser=_default_raw_parser):
+ '''
+ iterate through a fasta file sequence by sequence.
+ Returned sequences by this iterator will be NucSequence
+ instances
+
+    @param file: a line iterator containing fasta data
+ @type file: an iterable object
+
+ @param tagparser: a compiled regular expression usable
+ to identify key, value couples from
+ title line.
+ @type tagparser: regex instance
+
+ @return: an iterator on C{NucBioSequence} instance
+ @rtype: a generator object
+
+ @see: L{fastaIterator}
+ @see: L{fastaAAIterator}
+ '''
+ return fastaIterator(file, NucSequence,tagparser)
+
+def fastaAAIterator(file,tagparser=_default_raw_parser):
+ '''
+ iterate through a fasta file sequence by sequence.
+ Returned sequences by this iterator will be AASequence
+ instances
+
+ @param file: a line iterator containing fasta data
+ @type file: an iterable object
+
+ @param tagparser: a compiled regular expression usable
+ to identify key, value couples from
+ title line.
+ @type tagparser: regex instance
+
+ @return: an iterator on C{AABioSequence} instance
+
+ @see: L{fastaIterator}
+ @see: L{fastaNucIterator}
+ '''
+ return fastaIterator(file, AASequence,tagparser)
+
+def formatFasta(data,gbmode=False,upper=False,restrict=None):
+ '''
+    Convert a sequence or a set of sequences into a
+    string following the fasta format
+
+    @param data: sequence or a set of sequences
+    @type data: BioSequence instance or an iterable object
+                on BioSequence instances
+
+    @param gbmode: if set to C{True} the identifier part of the title
+                   line follows NCBI recommendations, allowing
+                   sequence indexing with the blast formatdb command.
+    @type gbmode: bool
+
+    @param upper: if set to C{True} the sequence is written in upper case
+    @type upper: bool
+
+    @param restrict: a set of key names that will be printed in the formatted
+                     output. If restrict is set to C{None} (default) then
+                     all keys are formatted.
+    @type restrict: any iterable value or None
+
+    @return: a fasta formatted string
+    @rtype: str
+ '''
+ if isinstance(data, BioSequence):
+ data = [data]
+
+ if restrict is not None and not isinstance(restrict, set):
+ restrict = set(restrict)
+
+ rep = []
+ for sequence in data:
+ seq = str(sequence)
+ if sequence.definition is None:
+ definition=''
+ else:
+ definition=sequence.definition
+ if upper:
+ frgseq = '\n'.join([seq[x:x+60].upper() for x in xrange(0,len(seq),60)])
+ else:
+ frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)])
+ info='; '.join(['%s=%s' % x
+ for x in sequence.rawiteritems()
+ if restrict is None or x[0] in restrict])
+ if info:
+ info=info+';'
+ if sequence._rawinfo is not None and sequence._rawinfo:
+ info+=" " + sequence._rawinfo.strip()
+
+ id = sequence.id
+ if gbmode:
+ if 'gi' in sequence:
+ id = "gi|%s|%s" % (sequence['gi'],id)
+ else:
+ id = "lcl|%s|" % (id)
+ title='>%s %s %s' %(id,info,definition)
+ rep.append("%s\n%s" % (title,frgseq))
+ return '\n'.join(rep)
+
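+# Example (sketch): round-tripping a fasta file through the obitools
+# reader and writer; 'seqs.fasta' is a hypothetical file name.
+#
+#   for seq in fastaIterator('seqs.fasta'):
+#       print formatFasta(seq)
+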
+def formatSAPFastaGenerator(options):
+ loadTaxonomyDatabase(options)
+
+ taxonomy=None
+ if options.taxonomy is not None:
+ taxonomy=options.taxonomy
+
+ assert taxonomy is not None,"SAP formating require indication of a taxonomy database"
+
+ ranks = ('superkingdom', 'kingdom', 'subkingdom', 'superphylum',
+ 'phylum', 'subphylum', 'superclass', 'class', 'subclass',
+ 'infraclass', 'superorder', 'order', 'suborder', 'infraorder',
+ 'parvorder', 'superfamily', 'family', 'subfamily', 'supertribe', 'tribe',
+ 'subtribe', 'supergenus', 'genus', 'subgenus', 'species group',
+ 'species subgroup', 'species', 'subspecies')
+
+ trank=set(taxonomy._ranks)
+ ranks = [taxonomy._ranks.index(x) for x in ranks if x in trank]
+
+ strict= options.strictsap
+
+ def formatSAPFasta(data,gbmode=False,upper=False,restrict=None):
+ '''
+        Convert a sequence or a set of sequences into a
+        string following the fasta format as recommended for the SAP
+        software
+
+        http://ib.berkeley.edu/labs/slatkin/munch/StatisticalAssignmentPackage.html
+
+        @param data: sequence or a set of sequences
+        @type data: BioSequence instance or an iterable object
+                    on BioSequence instances
+
+        @param gbmode: if set to C{True} the identifier part of the title
+                       line follows NCBI recommendations, allowing
+                       sequence indexing with the blast formatdb command.
+        @type gbmode: bool
+
+        @param restrict: a set of key names that will be printed in the formatted
+                         output. If restrict is set to C{None} (default) then
+                         all keys are formatted.
+        @type restrict: any iterable value or None
+
+        @return: a fasta formatted string
+        @rtype: str
+ '''
+ if isinstance(data, BioSequence):
+ data = [data]
+
+ if restrict is not None and not isinstance(restrict, set):
+ restrict = set(restrict)
+
+ rep = []
+ for sequence in data:
+ seq = str(sequence)
+
+ if upper:
+ frgseq = '\n'.join([seq[x:x+60].upper() for x in xrange(0,len(seq),60)])
+ else:
+ frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)])
+
+ try:
+ taxid = sequence["taxid"]
+ except KeyError:
+ if strict:
+                    raise AssertionError('All sequences must have a taxid')
+ else:
+ continue
+
+ definition=' ;'
+
+ for r in ranks:
+ taxon = taxonomy.getTaxonAtRank(taxid,r)
+ if taxon is not None:
+ definition+=' %s: %s,' % (taxonomy._ranks[r],taxonomy.getPreferedName(taxon))
+
+ definition='%s ; %s' % (definition[0:-1],taxonomy.getPreferedName(taxid))
+
+ id = sequence.id
+ if gbmode:
+ if 'gi' in sequence:
+ id = "gi|%s|%s" % (sequence['gi'],id)
+ else:
+ id = "lcl|%s|" % (id)
+ title='>%s%s' %(id,definition)
+ rep.append("%s\n%s" % (title,frgseq))
+ return '\n'.join(rep)
+
+ return formatSAPFasta
+
+class FastaIterator(SequenceFileIterator):
+
+
+    # staticmethod, so the generated function is not turned into an unbound method
+    entryIterator = staticmethod(genericEntryIteratorGenerator(startEntry='>'))
+
+ def __init__(self,inputfile,bioseqfactory=bioSeqGenerator,
+ tagparser=_default_raw_parser,
+ joinseq=_fastaJoinSeq):
+
+ SequenceFileIterator.__init__(self, inputfile, bioseqfactory)
+
+        self.__file = FastaIterator.entryIterator(self._inputfile)
+
+        self._tagparser = tagparser
+        self._joinseq = joinseq
+
+    def next(self):
+        # pull the next full fasta entry rather than a single line
+        entry = self.__file.next()
+        return self._parser(entry)
+
+ def get_tagparser(self):
+ return self.__tagparser
+
+
+ def set_tagparser(self, value):
+ self._rawparser = value
+ allparser = value % '[a-zA-Z][a-zA-Z0-9_]*'
+ self.__tagparser = re.compile('( *%s)+' % allparser)
+
+ def _parseFastaDescription(self,ds):
+
+ m = self._tagparser.search(' '+ds)
+ if m is not None:
+ info=m.group(0)
+ definition = ds[m.end(0):].strip()
+ else:
+ info=None
+ definition=ds
+
+ return definition,info
+
+
+    def _parser(self,entry):
+ '''
+ Parse a fasta record.
+
+ @attention: internal purpose function
+
+ @return: a C{BioSequence} instance
+ '''
+        seq = entry.split('\n')
+ title = seq[0].strip()[1:].split(None,1)
+ id=title[0]
+ if len(title) == 2:
+ definition,info=self._parseFastaDescription(title[1])
+ else:
+ info= None
+ definition=None
+
+ seq=self._joinseq(seq[1:])
+
+ return self._bioseqfactory(id, seq, definition,info,self._rawparser)
+
+    _tagparser = property(get_tagparser, set_tagparser, None, "compiled tag parser regular expression")
diff --git a/obitools/fasta/_fasta.so b/obitools/fasta/_fasta.so
new file mode 100755
index 0000000..de300ce
Binary files /dev/null and b/obitools/fasta/_fasta.so differ
diff --git a/obitools/fastq/__init__.py b/obitools/fastq/__init__.py
new file mode 100644
index 0000000..1cf3535
--- /dev/null
+++ b/obitools/fastq/__init__.py
@@ -0,0 +1,190 @@
+'''
+Created on 29 August 2009
+
+@author: coissac
+'''
+
+from obitools import BioSequence
+from obitools import _default_raw_parser
+from obitools.format.genericparser import genericEntryIteratorGenerator
+from obitools import bioSeqGenerator,AASequence,NucSequence
+from obitools.fasta import parseFastaDescription
+from _fastq import fastqQualitySangerDecoder,fastqQualitySolexaDecoder
+from _fastq import qualityToSangerError,qualityToSolexaError
+from _fastq import errorToSangerFastQStr
+from _fastq import formatFastq
+from _fastq import fastqParserGenetator
+from obitools.utils import universalOpen
+
+import re
+
+fastqEntryIterator=genericEntryIteratorGenerator(startEntry='^@',endEntry="^\+",strip=True,join=False)
+
+#def fastqParserGenetator(fastqvariant='sanger',bioseqfactory=NucSequence,tagparser=_parseFastaTag):
+#
+# qualityDecoder,errorDecoder = {'sanger' : (fastqQualitySangerDecoder,qualityToSangerError),
+# 'solexa' : (fastqQualitySolexaDecoder,qualityToSolexaError),
+# 'illumina' : (fastqQualitySolexaDecoder,qualityToSangerError)}[fastqvariant]
+#
+# def fastqParser(seq):
+# '''
+# Parse a fasta record.
+#
+# @attention: internal purpose function
+#
+# @param seq: a sequence object containing all lines corresponding
+# to one fasta sequence
+# @type seq: C{list} or C{tuple} of C{str}
+#
+# @param bioseqfactory: a callable object return a BioSequence
+# instance.
+# @type bioseqfactory: a callable object
+#
+# @param tagparser: a compiled regular expression usable
+# to identify key, value couples from
+# title line.
+# @type tagparser: regex instance
+#
+# @return: a C{BioSequence} instance
+# '''
+#
+# title = seq[0][1:].split(None,1)
+# id=title[0]
+# if len(title) == 2:
+# definition,info=parseFastaDescription(title[1], tagparser)
+# else:
+# info= {}
+# definition=None
+#
+# quality=errorDecoder(qualityDecoder(seq[3]))
+#
+# seq=seq[1]
+#
+# seq = bioseqfactory(id, seq, definition,False,**info)
+# seq.quality = quality
+#
+# return seq
+#
+# return fastqParser
+
+
+def fastqIterator(file,fastqvariant='sanger',bioseqfactory=NucSequence,tagparser=_default_raw_parser):
+ '''
+    iterate through a fastq file sequence by sequence.
+ Returned sequences by this iterator will be BioSequence
+ instances
+
+ @param file: a line iterator containing fasta data or a filename
+ @type file: an iterable object or str
+ @param bioseqfactory: a callable object return a BioSequence
+ instance.
+ @type bioseqfactory: a callable object
+
+ @param tagparser: a compiled regular expression usable
+ to identify key, value couples from
+ title line.
+ @type tagparser: regex instance
+
+ @return: an iterator on C{BioSequence} instance
+
+ @see: L{fastaNucIterator}
+ @see: L{fastaAAIterator}
+
+ '''
+ fastqParser=fastqParserGenetator(fastqvariant, bioseqfactory, tagparser)
+ file = universalOpen(file)
+ for entry in fastqEntryIterator(file):
+ title=entry[0]
+ seq="".join(entry[1:-1])
+ quality=''
+ lenseq=len(seq)
+ while (len(quality) < lenseq):
+ quality+=file.next().strip()
+
+ yield fastqParser([title,seq,'+',quality])
+
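+# Example (sketch): reading a sanger fastq file; 'reads.fastq' is a
+# hypothetical file name, and the quality attribute is assumed to carry
+# the decoded per-base error probabilities.
+#
+#   for seq in fastqSangerIterator('reads.fastq'):
+#       print seq.id, seq.quality
+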
+def fastqSangerIterator(file,tagparser=_default_raw_parser):
+ '''
+ iterate through a fastq file sequence by sequence.
+ Returned sequences by this iterator will be NucSequence
+ instances
+
+    @param file: a line iterator containing fastq data
+ @type file: an iterable object
+
+ @param tagparser: a compiled regular expression usable
+ to identify key, value couples from
+ title line.
+ @type tagparser: regex instance
+
+ @return: an iterator on C{NucBioSequence} instance
+
+ @see: L{fastqIterator}
+ @see: L{fastqAAIterator}
+ '''
+ return fastqIterator(file,'sanger',NucSequence,tagparser)
+
+def fastqSolexaIterator(file,tagparser=_default_raw_parser):
+ '''
+ iterate through a fastq file sequence by sequence.
+ Returned sequences by this iterator will be NucSequence
+ instances
+
+    @param file: a line iterator containing fastq data
+ @type file: an iterable object
+
+ @param tagparser: a compiled regular expression usable
+ to identify key, value couples from
+ title line.
+ @type tagparser: regex instance
+
+ @return: an iterator on C{NucBioSequence} instance
+
+ @see: L{fastqIterator}
+ @see: L{fastqAAIterator}
+ '''
+ return fastqIterator(file,'solexa',NucSequence,tagparser)
+
+def fastqIlluminaIterator(file,tagparser=_default_raw_parser):
+ '''
+ iterate through a fastq file sequence by sequence.
+ Returned sequences by this iterator will be NucSequence
+ instances
+
+    @param file: a line iterator containing fastq data
+ @type file: an iterable object
+
+ @param tagparser: a compiled regular expression usable
+ to identify key, value couples from
+ title line.
+ @type tagparser: regex instance
+
+ @return: an iterator on C{NucBioSequence} instance
+
+ @see: L{fastqIterator}
+ @see: L{fastqAAIterator}
+ '''
+ return fastqIterator(file,'illumina',NucSequence,tagparser)
+
+def fastqAAIterator(file,tagparser=_default_raw_parser):
+ '''
+ iterate through a fastq file sequence by sequence.
+ Returned sequences by this iterator will be AASequence
+ instances
+
+    @param file: a line iterator containing fastq data
+ @type file: an iterable object
+
+ @param tagparser: a compiled regular expression usable
+ to identify key, value couples from
+ title line.
+ @type tagparser: regex instance
+
+ @return: an iterator on C{AABioSequence} instance
+
+ @see: L{fastqIterator}
+ @see: L{fastqNucIterator}
+ '''
+ return fastqIterator(file,'sanger',AASequence,tagparser)
+
+
diff --git a/obitools/fastq/_fastq.so b/obitools/fastq/_fastq.so
new file mode 100755
index 0000000..4e3b942
Binary files /dev/null and b/obitools/fastq/_fastq.so differ
diff --git a/obitools/fnaqual/__init__.py b/obitools/fnaqual/__init__.py
new file mode 100644
index 0000000..384eb96
--- /dev/null
+++ b/obitools/fnaqual/__init__.py
@@ -0,0 +1,2 @@
+
+fnaTag=' %s *= *([^\s]+)'
diff --git a/obitools/fnaqual/fasta.py b/obitools/fnaqual/fasta.py
new file mode 100644
index 0000000..102a13e
--- /dev/null
+++ b/obitools/fnaqual/fasta.py
@@ -0,0 +1,8 @@
+from obitools.fasta import fastaNucIterator
+from obitools.fnaqual import fnaTag
+
+def fnaFastaIterator(file):
+
+ x = fastaNucIterator(file, fnaTag)
+
+ return x
\ No newline at end of file
diff --git a/obitools/fnaqual/quality.py b/obitools/fnaqual/quality.py
new file mode 100644
index 0000000..092f610
--- /dev/null
+++ b/obitools/fnaqual/quality.py
@@ -0,0 +1,137 @@
+"""
+Quality sequence support for the 454 fna/qual file pair format.
+"""
+
+from obitools import _default_raw_parser
+from obitools.fasta import fastaIterator
+from obitools.fnaqual import fnaTag
+from obitools.location import Location
+
+import re
+
+
+class QualitySequence(list):
+
+ def __init__(self,id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
+ '''
+
+        @param id: sequence identifier
+        @param seq: an iterable of C{int} quality scores
+        @param definition: sequence definition line
+ '''
+ list.__init__(self,seq)
+ self._info = info
+ self.definition=definition
+ self.id=id
+        # rawinfo may be None when no extra attributes are attached
+        self._rawinfo=' ' + (rawinfo or '')
+ self._rawparser=rawparser
+
+ def getDefinition(self):
+ '''
+ Sequence definition getter
+
+ @return: the sequence definition
+ @rtype: str
+
+ '''
+ return self._definition
+
+ def setDefinition(self, value):
+ self._definition = value
+
+ def getId(self):
+ return self._id
+
+ def setId(self, value):
+ self._id = value
+
+ def getKey(self,key):
+ if key not in self._info:
+ p = re.compile(self._rawparser % key)
+ m = p.search(self._rawinfo)
+ if m is not None:
+ v=m.group(1)
+ self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
+ try:
+ v = eval(v)
+ except:
+ pass
+ self._info[key]=v
+ else:
+ raise KeyError,key
+ else:
+ v=self._info[key]
+ return v
+
+ def __getitem__(self,key):
+ if isinstance(key,Location):
+ return key.extractSequence(self)
+ elif isinstance(key, str):
+            return self.getKey(key)
+ elif isinstance(key, int):
+ return list.__getitem__(self,key)
+ elif isinstance(key, slice):
+ subseq=list.__getitem__(self,key)
+ info = dict(self._info)
+ if key.start is not None:
+ start = key.start +1
+ else:
+ start = 1
+ if key.stop is not None:
+ stop = key.stop+1
+ else:
+ stop = len(self)
+ if key.step is not None:
+ step = key.step
+ else:
+ step = 1
+
+ info['cut']='[%d,%d,%s]' % (start,stop,step)
+ return QualitySequence(self.id, subseq, self.definition,self._rawinfo,self._rawparser,**info)
+
+ raise TypeError,'key must be an integer, a str or a slice'
+
+ def __setitem__(self,key,value):
+ self._info[key]=value
+
+ def __delitem__(self,key):
+ if isinstance(key, str):
+ del self._info[key]
+ else:
+ raise TypeError,key
+
+ def __iter__(self):
+ return list.__iter__(self)
+
+ def __contains__(self,key):
+ return key in self._info
+
+ def getTags(self):
+ return self._info
+
+ def complement(self):
+ '''
+
+ '''
+ cseq = self[::-1]
+ rep = QualitySequence(self.id,cseq,self.definition,self._rawinfo,self._rawparser,**self._info)
+ rep._info['complemented']=not rep._info.get('complemented',False)
+ return rep
+
+
+ definition = property(getDefinition, setDefinition, None, "Sequence Definition")
+
+ id = property(getId, setId, None, 'Sequence identifier')
+
+
+def _qualityJoinSeq(seqarray):
+ text = ' '.join([x.strip() for x in seqarray])
+ return [int(x) for x in text.split()]
+
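+# Example: _qualityJoinSeq([' 40 40 38\n',' 39 12\n']) yields [40, 40, 38, 39, 12]
+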
+def qualityIterator(file):
+ for q in fastaIterator(file, QualitySequence, fnaTag, _qualityJoinSeq):
+ yield q
+
+
+
\ No newline at end of file
diff --git a/obitools/format/__init__.py b/obitools/format/__init__.py
new file mode 100644
index 0000000..a680505
--- /dev/null
+++ b/obitools/format/__init__.py
@@ -0,0 +1,28 @@
+from obitools import bioSeqGenerator
+from obitools.utils import universalOpen
+
+
+class SequenceFileIterator(object):
+
+    def __init__(self,inputfile,bioseqfactory=bioSeqGenerator):
+        self.__inputfile = universalOpen(inputfile)
+        self.__bioseqfactory = bioseqfactory
+
+    def get_inputfile(self):
+        return self.__inputfile
+
+    def get_bioseqfactory(self):
+        return self.__bioseqfactory
+
+    def next(self):
+        # subclasses provide a _parser method building a BioSequence
+        entry = self._inputfile.next()
+        return self._parser(entry)
+
+    def __iter__(self):
+        return self
+
+    _inputfile = property(get_inputfile, None, None, "input file opened through universalOpen")
+    _bioseqfactory = property(get_bioseqfactory, None, None, "factory building BioSequence instances")
+
+
\ No newline at end of file
diff --git a/obitools/format/_format.so b/obitools/format/_format.so
new file mode 100755
index 0000000..92e460d
Binary files /dev/null and b/obitools/format/_format.so differ
diff --git a/obitools/format/genericparser/__init__.py b/obitools/format/genericparser/__init__.py
new file mode 100644
index 0000000..fecc72f
--- /dev/null
+++ b/obitools/format/genericparser/__init__.py
@@ -0,0 +1,217 @@
+"""
+G{packagetree format}
+"""
+import re
+
+from obitools.utils import universalOpen
+
+def genericEntryIteratorGenerator(startEntry=None,endEntry=None,
+ head=False,tail=False,
+ strip=False,join=True):
+ '''
+    Transform a text line iterator into an entry oriented iterator.
+
+    The converted iterator is useful to implement the first stage
+    of flat file parsing.
+
+ @param startEntry: a regular pattern matching the beginning of
+ an entry
+ @type startEntry: C{str} or None
+ @param endEntry: a regular pattern matching the end of
+ an entry
+ @type endEntry: C{str} or None
+    @param head: indicate if a header is present before
+ the first entry (as in many original genbank
+ files)
+ @type head: C{bool}
+    @param tail: indicate if some extra information is present
+ after the last entry.
+ @type tail: C{bool}
+
+ @return: an iterator on entries in text format
+ @rtype: an iterator on C{str}
+ '''
+
+ def isBeginning(line):
+ return startEntry is None or startEntry.match(line) is not None
+
+ def isEnding(line):
+ return ((endEntry is not None and endEntry.match(line) is not None) or
+ (endEntry is None and startEntry is not None and startEntry.match(line) is not None))
+
+ def transparentIteratorEntry(file):
+ file = universalOpen(file)
+ return file
+
+ def genericEntryIterator(file):
+ file = universalOpen(file)
+ entry = []
+ line = file.next()
+ started = head or isBeginning(line)
+
+ try:
+ while 1:
+ while not started:
+ line = file.next()
+ started = isBeginning(line)
+
+ if endEntry is None:
+ entry.append(line)
+ line = file.next()
+
+ while started:
+ end = isEnding(line)
+ if end:
+ if endEntry is not None:
+ entry.append(line)
+ if join:
+ e = ''.join(entry)
+ if strip:
+ e=e.strip()
+ else:
+ e=entry
+ if strip:
+ e=[x.strip() for x in e]
+ entry=[]
+ yield e
+ started=False
+ if endEntry is not None:
+ line = file.next()
+ else:
+ entry.append(line)
+ line = file.next()
+
+ started = isBeginning(line)
+
+ except StopIteration:
+ if entry and (endEntry is None or tail):
+ if join:
+ e = ''.join(entry)
+ if strip:
+ e=e.strip()
+ else:
+ e=entry
+ if strip:
+ e=[x.strip() for x in e]
+ yield e
+
+
+
+ if startEntry is not None:
+ startEntry = re.compile(startEntry)
+ if endEntry is not None:
+ endEntry = re.compile(endEntry)
+
+ if startEntry is None and endEntry is None:
+ return transparentIteratorEntry
+
+ return genericEntryIterator
+
+
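+# Example (sketch): building an entry iterator over fasta-like records;
+# 'db.fasta' is a hypothetical file name.
+#
+#   fastaEntries = genericEntryIteratorGenerator(startEntry='>')
+#   for entry in fastaEntries('db.fasta'):
+#       print entry.split('\n',1)[0]      # title line of each record
+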
+class GenericParser(object):
+
+ def __init__(self,
+ startEntry=None,
+ endEntry=None,
+ head=False,
+ tail=False,
+ strip=False,
+ **parseAction):
+ """
+ @param startEntry: a regular pattern matching the beginning of
+ an entry
+ @type startEntry: C{str} or None
+ @param endEntry: a regular pattern matching the end of
+ an entry
+ @type endEntry: C{str} or None
+        @param head: indicate if a header is present before
+ the first entry (as in many original genbank
+ files)
+ @type head: C{bool}
+        @param tail: indicate if some extra information is present
+ after the last entry.
+ @type tail: C{bool}
+
+ @param parseAction:
+
+ """
+ self.flatiterator= genericEntryIteratorGenerator(startEntry,
+ endEntry,
+ head,
+ tail,
+ strip)
+
+ self.action={}
+
+ for k in parseAction:
+ self.addParseAction(k,*parseAction[k])
+
+ def addParseAction(self,name,dataMatcher,dataCleaner=None,cleanSub=''):
+ '''
+ Add a parse action to the generic parser. A parse action
+ allows to extract one information from an entry. A parse
+ action is defined by a name and a method to extract this
+ information from the full text entry.
+
+ A parse action can be defined following two ways.
+
+ - via regular expression patterns
+
+ - via dedicated function.
+
+ In the first case, you have to indicate at least the
+ dataMatcher regular pattern. This pattern should match exactly
+        the data part you want to retrieve. If cleaning of extra
+        characters is needed, the second pattern dataCleaner can be
+        used to specify these characters.
+
+        In the second case you must provide a callable object (function)
+        that extracts and cleans data from the text entry. This function
+        should return a list containing all retrieved data, even if
+        no data or only one datum is retrieved.
+
+ @summary: Add a parse action to the generic parser.
+
+ @param name: name of the data extracted
+ @type name: C{str}
+ @param dataMatcher: a regular pattern matching the data
+ or a callable object parsing the
+                            entry and returning a list of matched data
+ @type dataMatcher: C{str} or C{SRE_Pattern} instance or a callable
+ object
+ @param dataCleaner: a regular pattern matching part of the data
+ to suppress.
+ @type dataCleaner: C{str} or C{SRE_Pattern} instance or C{None}
+ @param cleanSub: string used to replace dataCleaner matches.
+ Default is an empty string
+ @type cleanSub: C{str}
+
+ '''
+ if callable(dataMatcher):
+ self.action[name]=dataMatcher
+ else :
+ if isinstance(dataMatcher, str):
+ dataMatcher=re.compile(dataMatcher)
+ if isinstance(dataCleaner, str):
+ dataCleaner=re.compile(dataCleaner)
+ self.action[name]=self._buildREParser(dataMatcher,
+ dataCleaner,
+ cleanSub)
+
+ def _buildREParser(self,dataMatcher,dataCleaner,cleanSub):
+ def parser(data):
+ x = dataMatcher.findall(data)
+ if dataCleaner is not None:
+ x = [dataCleaner.sub(cleanSub,y) for y in x]
+ return x
+ return parser
+
+ def __call__(self,file):
+ for e in self.flatiterator(file):
+ pe = {'fullentry':e}
+ for k in self.action:
+ pe[k]=self.action[k](e)
+ yield pe
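+
+# Example (sketch): extracting data from fasta-like entries with parse
+# actions; the pattern and the file name are illustrative only.
+#
+#   parser = GenericParser(startEntry='>',
+#                          accession=('>(\S+)',))
+#   for entry in parser('db.fasta'):
+#       print entry['accession']       # list of matched accessions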
+
+
+
\ No newline at end of file
diff --git a/obitools/format/ontology/__init__.py b/obitools/format/ontology/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/format/ontology/go_obo.py b/obitools/format/ontology/go_obo.py
new file mode 100644
index 0000000..cd1d87e
--- /dev/null
+++ b/obitools/format/ontology/go_obo.py
@@ -0,0 +1,274 @@
+__docformat__ = 'restructuredtext'
+
+import re
+import string
+import textwrap
+
+
+from obitools.obo.go.parser import GOEntryIterator
+from obitools.obo.go.parser import GOTerm
+from obitools.obo.go.parser import GOEntry
+
+"""
+go_obo.py : gene_ontology_edit.obo file parser:
+----------------------------------------------------
+
+- OBOFile class: open a flat file and return an entry.
+
+"""
+class OBOFile(object):
+ """
+ Iterator over all entries of an OBO file
+ """
+
+ def __init__(self,_path):
+ self.file = GOEntryIterator(_path)
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ fiche = self.file.next()
+
+ if isinstance(fiche, GOTerm):
+ self.isaterm=True
+ return Term(fiche)
+ elif isinstance(fiche, GOEntry):
+ self.isaterm=False
+ return Entry(fiche)
+ else:
+ self.isaterm=False
+ return Header(fiche)
+
+
+############# everything below should move down into obitools/obo/go/parser.py ##########
+
+# define an XRef into a go_obo.py script in the microbi pylib
+class Xref(object):
+ """
+ Class Xref
+ Xref.db Xref database
+ Xref.id Xref identifier
+ """
+
+ def __init__(self,description):
+ data = description.split(':')
+ self.db = data[0].strip()
+ self.id = data[1].strip()
+
+# define a RelatedTerm into a go_obo.py script in the microbi pylib
+class RelatedTerm(object):
+ """
+ Class RelatedTerm
+ RelatedTerm.relation RelatedTerm relation
+ RelatedTerm.related_term RelatedTerm GO identifier
+ RelatedTerm.comment all terms have 0 or 1 comment
+ """
+
+ def __init__(self,relation,value,comment):
+ self.relation = relation
+ self.related_term = value.strip('GO:')
+ self.comment = comment
+
+
+# define into a go_obo.py script in the microbi pylib
+#class Term(object):
+# """
+# class representing an OBO term (entry).
+# """
+#
+# def __init__(self):
+# raise RuntimeError('biodb.go_obo is an abstract class')
+#
+# def __checkEntry__(self):
+# minimum=(hasattr(self,'goid') )
+# if not minimum:
+# raise AssertionError('Misconstructed GO Term instance %s' % [x for x in dir(self) if x[0]!='_'])
+
+class Term(object):
+ """
+ Class Term
+ representing a GO term.
+ """
+
+ def __init__(self,data=None):
+ """
+ """
+ self.data=data
+ self.isaterm = True
+
+ if data:
+ self.__filtreGoid__()
+ self.__filtreName__()
+ self.__filtreComment__()
+ self.__filtreSynonyms__()
+ self.__filtreDef__()
+ self.__filtreParents__()
+ self.__filtreRelationships__()
+ self.__filtreRelation__()
+ self.__filtreObsolete__()
+ self.__filtreAltIds__()
+ self.__filtreXRefs__()
+ self.__filtreSubsets__()
+
+ # check if all required attributes were valued
+ self.__checkEntry__()
+
+
+ def __checkEntry__(self):
+ minimum=(hasattr(self,'goid') )
+ if not minimum:
+ raise AssertionError('Misconstructed GO Term instance %s' % [x for x in dir(self) if x[0]!='_'])
+
+
+ def __filtreGoid__(self):
+ """
+ Extract GO id.
+ """
+ self.goid = self.data.id.value.strip('GO:')
+
+ def __filtreName__(self):
+ """
+ Extract GO name.
+ """
+ self.name = self.data.name.value
+
+ def __filtreSynonyms__(self):
+ """
+ Extract GO synonym(s).
+ """
+ self.list_synonyms = {}
+ if self.data.synonyms:
+ for y in self.data.synonyms:
+ self.list_synonyms[y.value] = y.scope
+
+
+ def __filtreComment__(self):
+ """
+ manage None comments
+ """
+ if self.data.comment != None:
+ self.comment = self.data.comment.value
+ else:
+ self.comment = ""
+
+ def __filtreDef__(self):
+ """
+ Extract GO definition.
+ """
+ if self.data.definition != None:
+ self.definition = self.data.definition.value
+ else:
+ self.definition = ""
+
+ def __filtreParents__(self):
+ """
+ To make the is_a hierarchy
+ """
+ if self.data.is_a != None:
+ self.is_a = set([isa.value.strip('GO:') for isa in self.data.is_a])
+ else:
+ self.is_a = set()
+
+ def __filtreRelation__(self):
+ """
+ To make the part_of hierarchy
+ """
+ self.part_of = set()
+ self.regulates = set()
+ self.negatively_regulates = set()
+ self.positively_regulates = set()
+
+ if self.data.relationship != None:
+ for rel in self.data.relationship:
+ if rel.relationship == "part_of":
+ self.part_of.add(rel.value.strip('GO:'))
+ elif rel.relationship == "regulates":
+ self.regulates.add(rel.value.strip('GO:'))
+ elif rel.relationship == "negatively_regulates":
+ self.negatively_regulates.add(rel.value.strip('GO:'))
+ elif rel.relationship == "positively_regulates":
+ self.positively_regulates.add(rel.value.strip('GO:'))
+
+
+ def __filtreRelationships__(self):
+ """
+ Relation list with other GO Terms (is_a, part_of or some regulates relation)
+ """
+ self.related_term =[]
+ if self.data.relationship != None:
+ for x in self.data.relationship:
+ self.related_term.append(RelatedTerm(x.relationship,x.value,x.__doc__))
+ #self.related_term.append(RelatedTerm(x.relationship,x.value,x.comment))
+ if self.data.is_a != None:
+ for x in self.data.is_a:
+ self.related_term.append(RelatedTerm('is_a',x.value,x.__doc__))
+ #self.related_term.append(RelatedTerm('is_a',x.value,x.comment))
+
+
+
+ def __filtreObsolete__(self):
+ """
+        each obsolete term corresponds to a set of GO identifiers
+        that can be considered in place of this GO term
+ """
+ self.considers = set()
+ self.replaces = set()
+ self.is_obsolete = self.data.is_obsolete
+ if self.data.is_obsolete:
+ if self.data.consider:
+ self.considers = set([considered.value.strip('GO:') for considered in self.data.consider])
+ if self.data.replaced_by:
+ self.replaces = set([replaced.value.strip('GO:') for replaced in self.data.replaced_by])
+
+
+ def __filtreAltIds__(self):
+ """
+ alternate(s) id(s) for this term (= alias in the geneontology schema model!)
+ """
+ if self.data.alt_ids:
+ self.alt_ids = set([x.value.strip('GO:') for x in self.data.alt_ids])
+ else:
+ self.alt_ids = set()
+
+ def __filtreXRefs__(self):
+ """
+ cross references to other databases
+ """
+ self.xrefs = set()
+ if self.data.xrefs:
+ self.xrefs = set([Xref(x.value.reference) for x in self.data.xrefs])
+
+
+ def __filtreSubsets__(self):
+ """
+ subset label to make smaller sets of GO Terms
+ """
+ self.subsets = set()
+ if self.data.subsets:
+ self.subsets = set([x.value for x in self.data.subsets])
+
+
+class Entry(object):
+ """
+ a Stanza entry, like [Typedef] for example
+ """
+ def __init__(self,data=None):
+ self.data=data
+ self.isaterm=False
+ self.isanentry=True
+
+
+class Header(object):
+ """
+ class representing a GO header.
+ """
+
+ def __init__(self,data=None):
+ """
+ """
+ self.data=data
+ self.isaterm = False
+
+
+
diff --git a/obitools/format/options.py b/obitools/format/options.py
new file mode 100644
index 0000000..c42a23f
--- /dev/null
+++ b/obitools/format/options.py
@@ -0,0 +1,284 @@
+'''
+Created on 13 October 2009
+
+@author: coissac
+'''
+
+from obitools.format.sequence.embl import emblIterator
+from obitools.format.sequence.genbank import genbankIterator
+from obitools.format.sequence.fnaqual import fnaFastaIterator
+from obitools.format.sequence.fasta import fastaAAIterator,fastaNucIterator,fastaIterator
+from obitools.format.sequence.fastq import fastqIlluminaIterator,fastqSolexaIterator
+from obitools.fastq import fastqSangerIterator
+from obitools.fnaqual.quality import qualityIterator
+from obitools.fasta import formatFasta, rawFastaIterator,\
+ formatSAPFastaGenerator
+from obitools.fastq import formatFastq
+
+from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
+from obitools.ecopcr.options import loadTaxonomyDatabase
+
+#from obitools.format._format import printOutput
+
+from array import array
+from itertools import chain
+import sys
+
+import re
+from obitools.ecopcr import EcoPCRFile
+
+
+def addInputFormatOption(optionManager):
+# optionManager.add_option('--rank',
+# action="store_true", dest='addrank',
+# default=False,
+# help="add a rank attribute to the sequence "
+# "indicating the sequence position in the input data")
+ optionManager.add_option('--genbank',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='genbank',
+ help="input file is in genbank format")
+ optionManager.add_option('--embl',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='embl',
+ help="input file is in embl format")
+
+ optionManager.add_option('--fasta',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='fasta',
+ help="input file is in fasta nucleic format (including obitools fasta extentions)")
+
+ optionManager.add_option('--ecopcr',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='ecopcr',
+ help="input file is in fasta nucleic format (including obitools fasta extentions)")
+
+ optionManager.add_option('--raw-fasta',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='rawfasta',
+ help="input file is in fasta format (but more tolerant to format variant)")
+
+ optionManager.add_option('--fna',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='fna',
+ help="input file is in fasta nucleic format produced by 454 sequencer pipeline")
+
+ optionManager.add_option('--qual',
+ action="store", dest="withqualfile",
+ type='str',
+ default=None,
+ help="Specify the name of a quality file produced by 454 sequencer pipeline")
+
+ optionManager.add_option('--sanger',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='sanger',
+ help="input file is in sanger fastq nucleic format (standard fastq)")
+
+ optionManager.add_option('--solexa',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='solexa',
+ help="input file is in fastq nucleic format produced by solexa sequencer")
+
+ optionManager.add_option('--illumina',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='illumina',
+ help="input file is in fastq nucleic format produced by old solexa sequencer")
+
+ optionManager.add_option('--nuc',
+ action="store_const", dest="moltype",
+ default=None,
+ const='nuc',
+ help="input file is nucleic sequences")
+ optionManager.add_option('--prot',
+ action="store_const", dest="moltype",
+ default=None,
+ const='pep',
+ help="input file is protein sequences")
+
+
+def addOutputFormatOption(optionManager):
+ optionManager.add_option('--fastq-output',
+ action="store_const", dest="output",
+ default=None,
+ const=formatFastq,
+ help="output sequences in sanger fastq format")
+ optionManager.add_option('--fasta-output',
+ action="store_const", dest="output",
+ default=None,
+ const=formatFasta,
+ help="output sequences in obitools fasta format")
+ optionManager.add_option('--sap-output',
+ action="store_const", dest="output",
+ default=None,
+ const=formatSAPFastaGenerator,
+ help="output sequences in sap fasta format")
+ optionManager.add_option('--strict-sap',
+ action='store_true',dest='strictsap',
+ default=False,
+ help="Print sequences in upper case (defualt is lower case)")
+ optionManager.add_option('--ecopcr-output',
+ action="store", dest="ecopcroutput",
+ default=None,
+ help="output sequences in obitools ecopcr format")
+ optionManager.add_option('--uppercase',
+ action='store_true',dest='uppercase',
+ default=False,
+ help="Print sequences in upper case (defualt is lower case)")
+
+
+
+def addInOutputOption(optionManager):
+ addInputFormatOption(optionManager)
+ addOutputFormatOption(optionManager)
+
+
+
+
+
+def autoEntriesIterator(options):
+ options.outputFormater=formatFasta
+ options.outputFormat="fasta"
+
+ ecopcr_pattern = re.compile('^[^ ]+ +| +[0-9]+ +| + [0-9]+ + | +')
+
+ def annotatedIterator(formatIterator):
+ options.outputFormater=formatFasta
+ options.outputFormat="fasta"
+ def iterator(lineiterator):
+ for s in formatIterator(lineiterator):
+ s.extractTaxon()
+ yield s
+
+ return iterator
+
+ def withQualIterator(qualityfile):
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+ def iterator(lineiterator):
+ for s in fnaFastaIterator(lineiterator):
+ q = qualityfile.next()
+ quality = array('d',(10.**(-x/10.) for x in q))
+ s.quality=quality
+ yield s
+
+ return iterator
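+
+    # Note: withQualIterator converts each Phred quality score Q read from
+    # the .qual file into an error probability p = 10.**(-Q/10.); for
+    # instance Q=20 gives p=0.01 and Q=30 gives p=0.001.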
+
+ def autoSequenceIterator(lineiterator):
+ options.outputFormater=formatFasta
+ options.outputFormat="fasta"
+ first = lineiterator.next()
+ if first[0]==">":
+ if options.withqualfile is not None:
+ qualfile=qualityIterator(options.withqualfile)
+ reader=withQualIterator(qualfile)
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+ elif options.moltype=='nuc':
+ reader=fastaNucIterator
+ elif options.moltype=='pep':
+ reader=fastaAAIterator
+ else:
+ reader=fastaIterator
+ elif first[0]=='@':
+ reader=fastqSangerIterator
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+ elif first[0:3]=='ID ':
+ reader=emblIterator
+ elif first[0:6]=='LOCUS ':
+ reader=genbankIterator
+ elif first[0]=="#" or ecopcr_pattern.search(first):
+ reader=EcoPCRFile
+ else:
+            raise AssertionError,'file is not in fasta, fastq, embl, genbank or ecoPCR format'
+
+ input = reader(chain([first],lineiterator))
+
+ return input
+
+ if options.seqinformat is None:
+ reader = autoSequenceIterator
+ else:
+ if options.seqinformat=='fasta':
+ if options.moltype=='nuc':
+ reader=fastaNucIterator
+ elif options.moltype=='pep':
+ reader=fastaAAIterator
+ else:
+ reader=fastaIterator
+ elif options.seqinformat=='rawfasta':
+ reader=annotatedIterator(rawFastaIterator)
+ elif options.seqinformat=='genbank':
+ reader=annotatedIterator(genbankIterator)
+ elif options.seqinformat=='embl':
+ reader=annotatedIterator(emblIterator)
+ elif options.seqinformat=='fna':
+ reader=fnaFastaIterator
+ elif options.seqinformat=='sanger':
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+ reader=fastqSangerIterator
+ elif options.seqinformat=='solexa':
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+ reader=fastqSolexaIterator
+ elif options.seqinformat=='illumina':
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+ reader=fastqIlluminaIterator
+ elif options.seqinformat=='ecopcr':
+ reader=EcoPCRFile
+
+ if options.seqinformat=='fna' and options.withqualfile is not None:
+ qualfile=qualityIterator(options.withqualfile)
+ reader=withQualIterator(qualfile)
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+
+# if options.addrank:
+# reader = withRankIterator(reader)
+ return reader
+
+def sequenceWriterGenerator(options,output=sys.stdout):
+ class SequenceWriter:
+ def __init__(self,options,file=sys.stdout):
+ self._format=None
+ self._file=file
+ self._upper=options.uppercase
+ def put(self,seq):
+ if self._format is None:
+ self._format=formatFasta
+ if options.output is not None:
+ self._format=options.output
+ if self._format is formatSAPFastaGenerator:
+ self._format=formatSAPFastaGenerator(options)
+ elif options.outputFormater is not None:
+ self._format=options.outputFormater
+ s = self._format(seq,upper=self._upper)
+ try:
+ self._file.write(s)
+ self._file.write("\n")
+ except IOError:
+ sys.exit(0)
+
+ if options.ecopcroutput is not None:
+ taxo = loadTaxonomyDatabase(options)
+ writer=EcoPCRDBSequenceWriter(options.ecopcroutput,taxonomy=taxo)
+ else:
+ writer=SequenceWriter(options,output)
+
+ def sequenceWriter(sequence):
+ writer.put(sequence)
+
+ return sequenceWriter
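+
+# Usage sketch, assuming an optparse-style `options` object populated through
+# addInOutputOption (the file name below is hypothetical):
+#
+#    reader = autoEntriesIterator(options)
+#    writer = sequenceWriterGenerator(options)
+#    for seq in reader(open('input.fastq')):
+#        writer(seq)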
+
+
\ No newline at end of file
diff --git a/obitools/format/sequence/__init__.py b/obitools/format/sequence/__init__.py
new file mode 100644
index 0000000..3918761
--- /dev/null
+++ b/obitools/format/sequence/__init__.py
@@ -0,0 +1,24 @@
+from obitools.fasta import fastaIterator
+from obitools.fastq import fastqSangerIterator
+from obitools.seqdb.embl.parser import emblIterator
+from obitools.seqdb.genbank.parser import genbankIterator
+from itertools import chain
+from obitools.utils import universalOpen
+
+def autoSequenceIterator(file):
+ lineiterator = universalOpen(file)
+ first = lineiterator.next()
+ if first[0]==">":
+ reader=fastaIterator
+ elif first[0]=='@':
+ reader=fastqSangerIterator
+ elif first[0:3]=='ID ':
+ reader=emblIterator
+ elif first[0:6]=='LOCUS ':
+ reader=genbankIterator
+ else:
+        raise AssertionError,'file is not in fasta, fastq, embl, or genbank format'
+
+ input = reader(chain([first],lineiterator))
+
+ return input
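+
+# Usage sketch (the file name below is hypothetical): the sequence format is
+# guessed from the first line of the file.
+#
+#    seqs = autoSequenceIterator('reads.fasta')
+#    first = seqs.next()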
diff --git a/obitools/format/sequence/embl.py b/obitools/format/sequence/embl.py
new file mode 100644
index 0000000..f59f14a
--- /dev/null
+++ b/obitools/format/sequence/embl.py
@@ -0,0 +1,2 @@
+from obitools.seqdb.embl.parser import emblIterator,emblParser
+
diff --git a/obitools/format/sequence/fasta.py b/obitools/format/sequence/fasta.py
new file mode 100644
index 0000000..1d7bd49
--- /dev/null
+++ b/obitools/format/sequence/fasta.py
@@ -0,0 +1,4 @@
+from obitools.fasta import fastaIterator,fastaParser
+from obitools.fasta import fastaAAIterator,fastaAAParser
+from obitools.fasta import fastaNucIterator,fastaNucParser
+from obitools.fasta import formatFasta
diff --git a/obitools/format/sequence/fastq.py b/obitools/format/sequence/fastq.py
new file mode 100644
index 0000000..54fdf89
--- /dev/null
+++ b/obitools/format/sequence/fastq.py
@@ -0,0 +1,13 @@
+'''
+Created on 15 Jan. 2010
+
+@author: coissac
+'''
+
+from obitools.fastq import fastqIterator,fastqParserGenetator
+from obitools.fastq import fastqSangerIterator,fastqSolexaIterator, \
+ fastqIlluminaIterator
+from obitools.fastq import fastqAAIterator
+from obitools.fastq import formatFastq
+
+
diff --git a/obitools/format/sequence/fnaqual.py b/obitools/format/sequence/fnaqual.py
new file mode 100644
index 0000000..ab69916
--- /dev/null
+++ b/obitools/format/sequence/fnaqual.py
@@ -0,0 +1,8 @@
+'''
+Created on 12 Oct. 2009
+
+@author: coissac
+'''
+
+from obitools.fnaqual.fasta import fnaFastaIterator
+from obitools.fnaqual.quality import qualityIterator
diff --git a/obitools/format/sequence/genbank.py b/obitools/format/sequence/genbank.py
new file mode 100644
index 0000000..8524b6f
--- /dev/null
+++ b/obitools/format/sequence/genbank.py
@@ -0,0 +1,4 @@
+from obitools.seqdb.genbank.parser import genpepIterator,genpepParser
+from obitools.seqdb.genbank.parser import genbankIterator,genbankParser
+
+
diff --git a/obitools/format/sequence/tagmatcher.py b/obitools/format/sequence/tagmatcher.py
new file mode 100644
index 0000000..60ad8d8
--- /dev/null
+++ b/obitools/format/sequence/tagmatcher.py
@@ -0,0 +1,5 @@
+from obitools.tagmatcher.parser import tagMatcherParser
+from obitools.tagmatcher.parser import TagMatcherIterator
+from obitools.tagmatcher.parser import formatTagMatcher
+
+tagMatcherIterator=TagMatcherIterator
diff --git a/obitools/goa/__init__.py b/obitools/goa/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/goa/parser.py b/obitools/goa/parser.py
new file mode 100644
index 0000000..8ffd1e3
--- /dev/null
+++ b/obitools/goa/parser.py
@@ -0,0 +1,33 @@
+from itertools import imap
+from obitools import utils
+
+class GoAFileIterator(utils.ColumnFile):
+ def __init__(self,stream):
+ utils.ColumnFile.__init__(self,
+ stream, '\t', True,
+ (str,))
+
+ _colname = ['database',
+ 'ac',
+ 'symbol',
+ 'qualifier',
+ 'goid',
+ 'origin',
+ 'evidence',
+ 'evidnce_origine',
+ 'namespace',
+ 'db_object_name',
+ 'gene',
+ 'object_type',
+ 'taxid',
+ 'date',
+ 'assigned_by']
+
+ def next(self):
+ data = utils.ColumnFile.next(self)
+ data = dict(imap(None,GoAFileIterator._colname,data))
+
+ return data
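+
+# Usage sketch (the file name is hypothetical): each iteration yields one GOA
+# annotation line as a dict keyed by the column names listed above.
+#
+#    for annotation in GoAFileIterator(open('gene_association.goa')):
+#        print annotation['ac'], annotation['goid']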
+
+
+
diff --git a/obitools/graph/__init__.py b/obitools/graph/__init__.py
new file mode 100644
index 0000000..fbc5253
--- /dev/null
+++ b/obitools/graph/__init__.py
@@ -0,0 +1,962 @@
+'''
+**obitools.graph** for representing graph structures in obitools
+-----------------------------------------------------------------
+
+.. codeauthor:: Eric Coissac
+
+
+This module offers classes to manipulate graphs, mainly through the
+:py:class:`obitools.graph.Graph` class.
+
+.. inheritance-diagram:: Graph DiGraph UndirectedGraph
+ :parts: 2
+
+'''
+
+import sys
+
+
+from obitools.utils import progressBar
+
+
+class Indexer(dict):
+ '''
+    Manage the conversion between arbitrary hashable python
+    values and unique integer keys.
+ '''
+
+ def __init__(self):
+
+ self.__max=0
+ self.__reverse=[]
+
+ def getLabel(self,index):
+ '''
+ Return the python value associated to an integer index.
+
+ :param index: an index value
+ :type index: int
+
+ :raises: IndexError if the index is not used in this
+ Indexer instance
+ '''
+ return self.__reverse[index]
+
+ def getIndex(self,key,strict=False):
+ '''
+ Return the index associated to a **key** in the indexer. Two
+ modes are available :
+
+ - strict mode :
+
+ if the key is not known by the :py:class:`Indexer` instance
+ a :py:exc:`KeyError` exception is raised.
+
+ - non strict mode :
+
+      in this mode if the requested **key** is absent, it is added to
+ the :py:class:`Indexer` instance and the new index is returned
+
+ :param key: the requested key
+ :type key: a hashable python value
+
+    :param strict: select the lookup mode
+ :type strict: bool
+
+ :return: the index corresponding to the key
+ :rtype: int
+
+    :raises: - :py:exc:`KeyError` in strict mode if **key** is absent
+               from the :py:class:`Indexer` instance
+
+             - :py:exc:`TypeError` if key is not a hashable value.
+ '''
+ if dict.__contains__(self,key):
+ return dict.__getitem__(self,key)
+ elif strict:
+ raise KeyError,key
+ else:
+ value = self.__max
+ self[key]= value
+ self.__reverse.append(key)
+ self.__max+=1
+ return value
+
+ def __getitem__(self,key):
+ '''
+        Implement the [] operator to emulate the standard dictionary
+ behaviour on :py:class:`Indexer` and returns the integer key
+ associated to a python value.
+
+        Actually this method calls the :py:meth:`getIndex` method in
+ non strict mode so it only raises an :py:exc:`TypeError`
+        if key is not a hashable value.
+
+ :param key: the value to index
+        :type key: a hashable python value
+
+ :return: an unique integer value associated to the key
+ :rtype: int
+
+        :raises: :py:exc:`TypeError` if **key** is not a hashable value.
+
+ '''
+ return self.getIndex(key)
+
+    def __eq__(self,index):
+        '''
+        Implement the equal operator **==** for comparing two :py:class:`Indexer` instances.
+        Two :py:class:`Indexer` instances are equal only if they are physically
+        the same instance.
+
+        :param index: the second Indexer
+        :type index: an :py:class:`Indexer` instance
+
+        :return: True if the two :py:class:`Indexer` instances are the same
+ :rtype: bool
+ '''
+ return id(self)==id(index)
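+
+    # A minimal usage sketch (labels are arbitrary hashable values):
+    #
+    #    idx = Indexer()
+    #    idx['A']                        # unknown key: index 0 is allocated
+    #    idx.getLabel(0)                 # -> 'A'
+    #    idx.getIndex('B',strict=True)   # raises KeyError in strict mode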
+
+
+class Graph(object):
+ '''
+ Class used to represent directed or undirected graph.
+
+ .. warning::
+
+ Only one edge can connect two nodes in a given direction.
+
+ .. warning::
+
+        Specifying nodes through their index speeds up your code, but as no
+        check is done on index values, it may result in inconsistency. So
+        prefer the use of node labels to specify a node.
+
+
+ '''
+ def __init__(self,label='G',directed=False,indexer=None,nodes=None,edges=None):
+ '''
+ :param label: Graph name, set to 'G' by default
+ :type label: str
+
+        :param directed: true for directed graph, set to False by default
+ :type directed: boolean
+
+        :param indexer: node label indexer. This allows several graphs to
+                        share the same indexer (see : :py:meth:`newEmpty`)
+ :type indexer: :py:class:`Indexer`
+
+ :param nodes: set of nodes to add to the graph
+ :type nodes: iterable value
+
+ :param edges: set of edges to add to the graph
+ :type edges: iterable value
+ '''
+
+ self._directed=directed
+ if indexer is None:
+ indexer = Indexer()
+ self._index = indexer
+ self._node = {}
+ self._node_attrs = {}
+ self._edge_attrs = {}
+ self._label=label
+
+ def newEmpty(self):
+ """
+ Build a new empty graph using the same :py:class:`Indexer` instance.
+        This allows two graphs to share their vertices through their indices.
+ """
+ n = Graph(self._label+"_compact",self._directed,self._index)
+
+ return n
+
+ def addNode(self,node=None,index=None,**data):
+ '''
+ Add a new node or update an existing one.
+
+ :param node: the new node label or the label of an existing node
+ for updating it.
+        :type node: a hashable python value
+
+ :param index: the index of an existing node for updating it.
+ :type index: int
+
+ :return: the index of the node
+ :rtype: int
+
+        :raises: :py:exc:`IndexError` if index is not **None** and
+                 does not correspond to an index used in this graph.
+ '''
+        if index is None:
+            index = self._index[node]
+            if index not in self._node:
+                self._node[index]=set()
+        else:
+            if index not in self._node:
+                raise IndexError,"This index is not used in this graph"
+
+ if data:
+ if index in self._node_attrs:
+ self._node_attrs[index].update(data)
+ else:
+ self._node_attrs[index]=dict(data)
+
+ return index
+
+ def __contains__(self,node):
+ try:
+ index = self._index.getIndex(node,strict=True)
+ r = index in self._node
+ except KeyError:
+ r=False
+ return r
+
+ def getNode(self,node=None,index=None):
+ """
+ :param node: a node label.
+        :type node: a hashable python value
+
+ :param index: the index of an existing node.
+ :type index: int
+
+        .. note:: The index value takes precedence over the node label.
+
+ :return: the looked for node
+ :rtype: :py:class:`Node`
+
+        :raises: :py:exc:`IndexError` if the specified node label
+                 corresponds to a non-existing node.
+
+ .. warning:: no check on index value
+ """
+ if index is None:
+ index = self._index.getIndex(node, True)
+ return Node(index,self)
+
+ def getBestNode(self,estimator):
+ '''
+ Select the node maximizing the estimator function
+
+ :param estimator: the function to maximize
+ :type estimator: a function returning a numerical value and accepting one
+ argument of type :py:class:`Node`
+
+ :return: the best node
+        :rtype: :py:class:`Node`
+ '''
+
+ bestScore=0
+ best=None
+ for n in self:
+ score = estimator(n)
+ if best is None or score > bestScore:
+ bestScore = score
+ best=n
+ return best
+
+
+ def delNode(self,node=None,index=None):
+ """
+ Delete a node from a graph and all associated edges.
+
+ :param node: a node label.
+        :type node: a hashable python value
+
+ :param index: the index of an existing node.
+ :type index: int
+
+        .. note:: The index value takes precedence over the node label.
+
+        :raises: :py:exc:`IndexError` if the specified node label
+                 corresponds to a non-existing node.
+
+ .. warning:: no check on index value
+ """
+ if index is None:
+ index = self._index[node]
+
+ for n in self._node:
+ if n!=index:
+ e = self._node[n]
+ if index in e:
+ if (n,index) in self._edge_attrs:
+ del self._edge_attrs[(n,index)]
+ e.remove(index)
+
+ e = self._node[index]
+
+ for n in e:
+ if (index,n) in self._edge_attrs:
+ del self._edge_attrs[(index,n)]
+
+ del self._node[index]
+ if index in self._node_attrs:
+ del self._node_attrs[index]
+
+
+ def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data):
+ '''
+ Create a new edge in the graph between both the specified nodes.
+
+ .. note:: Nodes can be specified using their label or their index in the graph
+ if both values are indicated the index is used.
+
+ :param node1: The first vertex label
+        :type node1: a hashable python value
+        :param node2: The second vertex label
+        :type node2: a hashable python value
+ :param index1: The first vertex index
+ :type index1: int
+ :param index2: The second vertex index
+ :type index2: int
+
+        :raises: :py:exc:`IndexError` if one of the two specified node labels
+                 corresponds to a non-existing node.
+
+
+ .. warning:: no check on index value
+ '''
+
+ index1=self.addNode(node1, index1)
+ index2=self.addNode(node2, index2)
+
+ self._node[index1].add(index2)
+
+ if not self._directed:
+ self._node[index2].add(index1)
+
+ if data:
+ if (index1,index2) not in self._edge_attrs:
+ data =dict(data)
+ self._edge_attrs[(index1,index2)]=data
+ if not self._directed:
+ self._edge_attrs[(index2,index1)]=data
+ else:
+ self._edge_attrs[(index2,index1)].update(data)
+
+ return (index1,index2)
+
+ def getEdge(self,node1=None,node2=None,index1=None,index2=None):
+ '''
+ Extract the :py:class:`Edge` instance linking two nodes of the graph.
+
+ .. note:: Nodes can be specified using their label or their index in the graph
+ if both values are indicated the index is used.
+
+ :param node1: The first vertex label
+        :type node1: a hashable python value
+        :param node2: The second vertex label
+        :type node2: a hashable python value
+ :param index1: The first vertex index
+ :type index1: int
+ :param index2: The second vertex index
+ :type index2: int
+
+        :raises: :py:exc:`IndexError` if one of the two specified node labels
+                 corresponds to a non-existing node.
+
+
+ .. warning:: no check on index value
+ '''
+ node1=self.getNode(node1, index1)
+ node2=self.getNode(node2, index2)
+ return Edge(node1,node2)
+
+ def delEdge(self,node1=None,node2=None,index1=None,index2=None):
+ """
+ Delete the edge linking node 1 to node 2.
+
+ .. note:: Nodes can be specified using their label or their index in the graph
+ if both values are indicated the index is used.
+
+
+ :param node1: The first vertex label
+        :type node1: a hashable python value
+        :param node2: The second vertex label
+        :type node2: a hashable python value
+ :param index1: The first vertex index
+ :type index1: int
+ :param index2: The second vertex index
+ :type index2: int
+
+        :raises: :py:exc:`IndexError` if one of the two specified node labels
+                 corresponds to a non-existing node.
+
+
+ .. warning:: no check on index value
+ """
+ if index1 is None:
+ index1 = self._index[node1]
+ if index2 is None:
+ index2 = self._index[node2]
+ if index1 in self._node and index2 in self._node[index1]:
+ self._node[index1].remove(index2)
+            if (index1,index2) in self._edge_attrs:
+                del self._edge_attrs[(index1,index2)]
+            if not self._directed:
+                self._node[index2].remove(index1)
+                if (index2,index1) in self._edge_attrs:
+                    del self._edge_attrs[(index2,index1)]
+
+ def edgeIterator(self,predicate=None):
+ """
+        Iterate through a set of selected edges.
+
+        :param predicate: a function allowing edge selection. Default value
+                          is **None** and indicates that all edges are selected.
+ :type predicate: a function returning a boolean value
+ and accepting one argument of class :py:class:`Edge`
+
+ :return: an iterator over selected edge
+ :rtype: interator over :py:class:`Edge` instances
+
+ .. seealso::
+ function :py:func:`selectEdgeAttributeFactory` for simple predicate.
+
+ """
+ for n1 in self._node:
+ for n2 in self._node[n1]:
+ if self._directed or n1 <= n2:
+ e = self.getEdge(index1=n1, index2=n2)
+ if predicate is None or predicate(e):
+ yield e
+
+
+ def nodeIterator(self,predicate=None):
+ """
+ Iterate through a set of selected vertices.
+
+        :param predicate: a function allowing node selection. Default value
+                          is **None** and indicates that all nodes are selected.
+ :type predicate: a function returning a boolean value
+ and accepting one argument of class :py:class:`Node`
+
+ :return: an iterator over selected nodes.
+ :rtype: interator over :py:class:`Node` instances
+
+ """
+ for n in self._node:
+ node = self.getNode(index=n)
+ if predicate is None or predicate(node):
+ yield node
+
+ def nodeIndexIterator(self,predicate=None):
+ """
+ Iterate through the indexes of a set of selected vertices.
+
+        :param predicate: a function allowing node selection. Default value
+                          is **None** and indicates that all nodes are selected.
+ :type predicate: a function returning a boolean value
+ and accepting one argument of class :py:class:`Node`
+
+ :return: an iterator over selected node indices.
+ :rtype: interator over `int`
+
+ """
+ for n in self._node:
+ node = self.getNode(index=n)
+ if predicate is None or predicate(node):
+ yield n
+
+ def neighbourIndexSet(self,node=None,index=None):
+ if index is None:
+ index=self.getNode(node).index
+ return self._node[index]
+
+ def edgeCount(self):
+ n = reduce(lambda x,y:x+y, (len(z) for z in self._node.itervalues()),0)
+ if not self._directed:
+ n=n/2
+ return n
+
+ def subgraph(self,nodes,name='G'):
+ sub = Graph(name,self._directed,self._index)
+ if not isinstance(nodes, set):
+ nodes = set(nodes)
+ for n in nodes:
+ sub._node[n]=nodes & self._node[n]
+ if n in self._node_attrs:
+ sub._node_attrs[n]=dict(self._node_attrs[n])
+ for n2 in sub._node[n]:
+ if not self._directed:
+ if n <= n2:
+ if (n,n2) in self._edge_attrs:
+ data=dict(self._edge_attrs[(n,n2)])
+ sub._edge_attrs[(n,n2)]=data
+ sub._edge_attrs[(n2,n)]=data
+ else:
+ if (n,n2) in self._edge_attrs:
+ data=dict(self._edge_attrs[(n,n2)])
+ sub._edge_attrs[(n,n2)]=data
+ return sub
+
+ def __len__(self):
+ return len(self._node)
+
+ def __getitem__(self,key):
+ return self.getNode(node=key)
+
+ def __delitem__(self,key):
+ self.delNode(node=key)
+
+ def __iter__(self):
+ return self.nodeIterator()
+
+ def __str__(self):
+ if self._directed:
+ kw ='digraph'
+ else:
+ kw='graph'
+
+ nodes = "\n ".join([str(x) for x in self])
+ edges = "\n ".join([str(x) for x in self.edgeIterator()])
+
+ return "%s %s {\n %s\n\n %s\n}" % (kw,self._label,nodes,edges)
+
+class Node(object):
+ """
+ Class used for representing one node or vertex in a graph
+
+ """
+ def __init__(self,index,graph):
+ '''
+ .. warning::
+
+            :py:class:`Node` constructor is usually called through the :py:class:`Graph` methods
+
+ :param index: Index of the node in the graph
+ :type index: int
+ :param graph: graph instance owning the node
+ :type graph: :py:class:`obitools.graph.Graph`
+ '''
+ self.index = index
+ self.__graph = graph
+
+ def getGraph(self):
+ '''
+ return graph owning this node.
+
+ :rtype: :py:class:`obitools.graph.Graph`
+ '''
+ return self.__graph
+
+
+ def getLabel(self):
+ '''
+ return label associated to this node.
+ '''
+ return self.__graph._index.getLabel(self.index)
+
+
+ def has_key(self,key):
+ '''
+        Test if the node instance has a property named **key**.
+
+ :param key: the name of a property
+ :type key: str
+
+        :return: True if the node has a property named **key**
+ :rtype: bool
+ '''
+ if self.index in self.__graph._node_attrs:
+ return key in self.__graph._node_attrs[self.index]
+ else:
+ return False
+
+ def neighbourIterator(self,nodePredicat=None,edgePredicat=None):
+ '''
+ iterate through the nodes directly connected to
+ this node.
+
+ :param nodePredicat: a function accepting one node as parameter
+ and returning **True** if this node must be
+ returned by the iterator.
+ :type nodePredicat: function
+
+ :param edgePredicat: a function accepting one edge as parameter
+ and returning True if the edge linking self and
+ the current must be considered.
+ :type edgePredicat: function
+
+
+ :rtype: iterator on Node instances
+ '''
+ for n in self.neighbourIndexIterator(nodePredicat, edgePredicat):
+ node = self.graph.getNode(index=n)
+ yield node
+
+ def neighbourIndexSet(self):
+ '''
+        Return the set of node indexes directly connected
+        to this node.
+
+        .. warning::
+
+            do not change this set unless you know
+            exactly what you are doing.
+
+        :rtype: set of int
+ '''
+ return self.__graph._node[self.index]
+
+ def neighbourIndexIterator(self,nodePredicat=None,edgePredicat=None):
+ '''
+ iterate through the node indexes directly connected to
+ this node.
+
+ :param nodePredicat: a function accepting one node as parameter
+ and returning True if this node must be
+ returned by the iterator.
+ :type nodePredicat: function
+
+ :param edgePredicat: a function accepting one edge as parameter
+ and returning True if the edge linking self and
+ the current must be considered.
+ :type edgePredicat: function
+
+ :rtype: iterator on int
+ '''
+ for n in self.neighbourIndexSet():
+ if nodePredicat is None or nodePredicat(self.__graph.getNode(index=n)):
+ if edgePredicat is None or edgePredicat(self.__graph.getEdge(index1=self.index,index2=n)):
+ yield n
+
+ def degree(self,nodeIndexes=None):
+ '''
+        Return the count of edges linking this node to the
+        set of nodes described by their indices in nodeIndexes.
+
+        :param nodeIndexes: set of node indexes.
+                            If set to None, all nodes of the
+                            graph are taken into account.
+                            Set to None by default.
+ :type nodeIndexes: set of int
+
+ :rtype: int
+ '''
+ if nodeIndexes is None:
+ return len(self.__graph._node[self.index])
+ else:
+ return len(self.__graph._node[self.index] & nodeIndexes)
+
+ def componentIndexSet(self,nodePredicat=None,edgePredicat=None):
+ '''
+ Return the set of node index in the same connected component.
+
+ :param nodePredicat: a function accepting one node as parameter
+ and returning True if this node must be
+ returned by the iterator.
+ :type nodePredicat: function
+
+ :param edgePredicat: a function accepting one edge as parameter
+ and returning True if the edge linking self and
+ the current must be considered.
+ :type edgePredicat: function
+
+
+ :rtype: set of int
+ '''
+ cc=set([self.index])
+ added = set(x for x in self.neighbourIndexIterator(nodePredicat, edgePredicat))
+ while added:
+ cc |= added
+ added = reduce(lambda x,y : x | y,
+ (set(z for z in self.graph.getNode(index=c).neighbourIndexIterator(nodePredicat, edgePredicat))
+ for c in added),
+ set())
+ added -= cc
+ return cc
+
+ def componentIterator(self,nodePredicat=None,edgePredicat=None):
+ '''
+ Iterate through the nodes in the same connected
+ component.
+
+ :rtype: iterator on :py:class:`Node` instance
+ '''
+ for c in self.componentIndexSet(nodePredicat, edgePredicat):
+            yield self.graph.getNode(index=c)
+
+ def shortestPathIterator(self,nodes=None):
+ '''
+        Iterate through the shortest paths sourcing
+        from this node. If nodes is not None, iterate
+        only over the paths linking this node to one of the nodes listed in
+        nodes.
+
+ :param nodes: set of node index
+ :type nodes: iterable on int
+
+ :return: an iterator on list of int describing path
+ :rtype: iterator on list of int
+ '''
+ if nodes is not None:
+ nodes = set(nodes)
+
+
+ Q=[(self.index,-1)]
+
+ gray = set([self.index])
+ paths = {}
+
+ while Q and (nodes is None or nodes):
+            u,p = Q.pop(0)     # FIFO order ensures a breadth-first traversal
+ paths[u]=p
+ next = self.graph._node[u] - gray
+ gray|=next
+ Q.extend((x,u) for x in next)
+ if nodes is None or u in nodes:
+ if nodes:
+ nodes.remove(u)
+ path = [u]
+ while p >= 0:
+ path.append(p)
+ p = paths[p]
+ path.reverse()
+ yield path
+
+ def shortestPathTo(self,node=None,index=None):
+ '''
+        Return one of the shortest paths linking this
+        node to the specified node.
+
+ :param node: a node label or None
+        :param index: a node index or None. The parameter index
+                      takes priority over the parameter node.
+ :type index: int
+
+ :return: list of node index corresponding to the path or None
+ if no path exists.
+ :rtype: list of int or None
+ '''
+ if index is None:
+ index=self.graph.getNode(node).index
+ for p in self.shortestPathIterator([index]):
+ return p
+
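+    # Usage sketch (graph g and its labels are hypothetical): the returned
+    # path is a list of node indices, here mapped back to labels through the
+    # graph indexer.
+    #
+    #    path = g['A'].shortestPathTo('C')
+    #    [g.getNode(index=i).label for i in path]   # e.g. ['A', 'B', 'C']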
+
+ def __getitem__(self,key):
+ '''
+ return the value of the property of this node
+
+ :param key: the name of a property
+ :type key: str
+ '''
+ return self.__graph._node_attrs.get(self.index,{})[key]
+
+ def __setitem__(self,key,value):
+ '''
+        Set the value of a node property. If the property doesn't
+        already exist, a new property is added to this node.
+
+ :param key: the name of a property
+ :type key: str
+ :param value: the value of the property
+
+ .. seealso::
+
+ :py:meth:`Node.__getitem__`
+ '''
+ if self.index in self.__graph._node_attrs:
+ data = self.__graph._node_attrs[self.index]
+ data[key]=value
+ else:
+ self.graph._node_attrs[self.index]={key:value}
+
+ def __len__(self):
+ '''
+        Count the neighbours of this node
+
+ :rtype: int
+
+ .. seealso::
+
+ :py:meth:`Node.degree`
+ '''
+ return len(self.__graph._node[self.index])
+
+ def __iter__(self):
+ '''
+        Iterate through the neighbours of this node
+
+        :rtype: iterator on :py:class:`Node` instances
+
+ .. seealso::
+
+ :py:meth:`Node.neighbourIterator`
+ '''
+ return self.neighbourIterator()
+
+ def __contains__(self,key):
+ return self.has_key(key)
+
+ def __str__(self):
+
+ if self.index in self.__graph._node_attrs:
+ keys = " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"').replace('\n','\\n'))
+ for x in self.__graph._node_attrs[self.index].iteritems()]
+ )
+ else:
+ keys=''
+
+ return '%d [label="%s" %s]' % (self.index,
+ str(self.label).replace('"','\\"').replace('\n','\\n'),
+ keys)
+
+ def keys(self):
+ if self.index in self.__graph._node_attrs:
+ k = self.__graph._node_attrs[self.index].keys()
+ else:
+ k=[]
+ return k
+
+ label = property(getLabel, None, None, "Label of the node")
+
+ graph = property(getGraph, None, None, "Graph owning this node")
+
+
+
+class Edge(object):
+ """
+ Class used for representing one edge of a graph
+
+ """
+
+ def __init__(self,node1,node2):
+ '''
+ .. warning::
+
+            :py:class:`Edge` constructor is usually called through the :py:class:`Graph` methods
+
+        :param node1: First node linked by the edge
+        :type node1: :py:class:`Node`
+        :param node2: Second node linked by the edge
+        :type node2: :py:class:`Node`
+ '''
+ self.node1 = node1
+ self.node2 = node2
+
+ def getGraph(self):
+ """
+ Return the :py:class:`Graph` instance owning this edge.
+ """
+ return self.node1.graph
+
+ def has_key(self,key):
+ '''
+        Test if the :py:class:`Edge` instance has a property named **key**.
+
+ :param key: the name of a property
+ :type key: str
+
+        :return: True if the edge has a property named **key**
+ :rtype: bool
+ '''
+ if (self.node1.index,self.node2.index) in self.graph._edge_attrs:
+ return key in self.graph._edge_attrs[(self.node1.index,self.node2.index)]
+ else:
+ return False
+
+
+ def getDirected(self):
+ return self.node1.graph._directed
+
+ def __getitem__(self,key):
+ return self.graph._edge_attrs.get((self.node1.index,self.node2.index),{})[key]
+
+ def __setitem__(self,key,value):
+ e = (self.node1.index,self.node2.index)
+ if e in self.graph._edge_attrs:
+ data = self.graph._edge_attrs[e]
+ data[key]=value
+ else:
+ self.graph._edge_attrs[e]={key:value}
+
+ def __str__(self):
+ e = (self.node1.index,self.node2.index)
+ if e in self.graph._edge_attrs:
+ keys = "[%s]" % " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"'))
+ for x in self.graph._edge_attrs[e].iteritems()]
+ )
+ else:
+ keys = ""
+
+ if self.directed:
+ link='->'
+ else:
+ link='--'
+
+ return "%d %s %d %s" % (self.node1.index,link,self.node2.index,keys)
+
+ def __contains__(self,key):
+ return self.has_key(key)
+
+
+ graph = property(getGraph, None, None, "Graph owning this edge")
+
+    directed = property(getDirected, None, None, "True if the graph owning this edge is directed")
+
+
+class DiGraph(Graph):
+ """
+    The :py:class:`DiGraph` class is a specialisation of the :py:class:`Graph` class
+    dedicated to directed graph representation.
+
+ .. seealso::
+
+ :py:class:`UndirectedGraph`
+
+ """
+ def __init__(self,label='G',indexer=None,nodes=None,edges=None):
+ '''
+ :param label: Graph name, set to 'G' by default
+ :type label: str
+ :param indexer: node label indexer
+ :type indexer: Indexer instance
+ :param nodes: set of nodes to add to the graph
+ :type nodes: iterable value
+ :param edges: set of edges to add to the graph
+ :type edges: iterable value
+ '''
+
+ Graph.__init__(self, label, True, indexer, nodes, edges)
+
+class UndirectedGraph(Graph):
+ """
+    The :py:class:`UndirectedGraph` class is a specialisation of the :py:class:`Graph` class
+    dedicated to undirected graph representation.
+
+ .. seealso::
+
+ :py:class:`DiGraph`
+
+ """
+ def __init__(self,label='G',indexer=None,nodes=None,edges=None):
+ '''
+ :param label: Graph name, set to 'G' by default
+ :type label: str
+ :param indexer: node label indexer
+ :type indexer: Indexer instance
+ :param nodes: set of nodes to add to the graph
+ :type nodes: iterable value
+ :param edges: set of edges to add to the graph
+ :type edges: iterable value
+ '''
+
+ Graph.__init__(self, label, False, indexer, nodes, edges)
+
+
+
+def selectEdgeAttributeFactory(attribut,value):
+ """
+    This function helps in building predicate functions usable for selecting edges
+    in the following :py:class:`Graph` methods:
+
+ - :py:meth:`Graph.edgeIterator`
+
+ """
+ def selectEdge(e):
+ return attribut in e and e[attribut]==value
+ return selectEdge
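+
+# Usage sketch (graph g, attribute name and value are hypothetical): iterate
+# only over the edges whose 'color' attribute equals 'red'.
+#
+#    red = selectEdgeAttributeFactory('color','red')
+#    for e in g.edgeIterator(predicate=red):
+#        print e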
diff --git a/obitools/graph/__init__.pyc b/obitools/graph/__init__.pyc
new file mode 100644
index 0000000..397e5c0
Binary files /dev/null and b/obitools/graph/__init__.pyc differ
diff --git a/obitools/graph/algorithms/__init__.py b/obitools/graph/algorithms/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/graph/algorithms/__init__.pyc b/obitools/graph/algorithms/__init__.pyc
new file mode 100644
index 0000000..1f2edcc
Binary files /dev/null and b/obitools/graph/algorithms/__init__.pyc differ
diff --git a/obitools/graph/algorithms/clique.py b/obitools/graph/algorithms/clique.py
new file mode 100644
index 0000000..2007c1a
--- /dev/null
+++ b/obitools/graph/algorithms/clique.py
@@ -0,0 +1,134 @@
+import time
+import sys
+
+
+
+_maxsize=0
+_solution=0
+_notbound=0
+_sizebound=0
+_lastyield=0
+_maxclique=None
+
+def cliqueIterator(graph,minsize=1,node=None,timeout=None):
+ global _maxsize,_solution,_notbound,_sizebound,_lastyield
+ _maxsize=0
+ _solution=0
+ _notbound=0
+ _sizebound=0
+ starttime = time.time()
+
+ if node:
+ node = graph.getNode(node)
+ index = node.index
+ clique= set([index])
+ candidates= set(graph.neighbourIndexSet(index=index))
+ else:
+ clique=set()
+ candidates = set(x.index for x in graph)
+
+
+# candidates = set(x for x in candidates
+# if len(graph.neighbourIndexSet(index=x) & candidates) >= (minsize - 1))
+
+ _lastyield=time.time()
+ for c in _cliqueIterator(graph,clique,candidates,set(),minsize,start=starttime,timeout=timeout):
+ yield c
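+
+# Usage sketch (graph g and its labels are hypothetical): enumerate all
+# cliques of at least three nodes; each clique is yielded as a set of node
+# indices.
+#
+#    for clique in cliqueIterator(g, minsize=3):
+#        print [g.getNode(index=i).label for i in clique]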
+
+
+
+
+
+def _cliqueIterator(graph,clique,candidates,notlist,minsize=0,start=None,timeout=None):
+ global _maxsize,_maxclique,_solution,_notbound,_sizebound,_lastyield
+
+ # Speed indicator
+ lclique = len(clique)
+ lcandidates = len(candidates)
+ notmin = lcandidates
+ notfix = None
+
+ for n in notlist:
+ nnc = candidates - graph.neighbourIndexSet(index=n)
+ nc = len(nnc)
+ if nc < notmin:
+ notmin=nc
+ notfix=n
+ notfixneib = nnc
+
+ if lclique > _maxsize or not _solution % 1000 :
+ if start is not None:
+ top = time.time()
+ delta = top - start
+ if delta==0:
+ delta=1e-6
+ speed = _solution / delta
+ start = top
+ else:
+ speed = 0
+ print >>sys.stderr,"\rCandidates : %-5d Maximum clique size : %-5d Solutions explored : %10d speed = %5.2f solutions/sec sizebound=%10d notbound=%10d " % (lcandidates,_maxsize,_solution,speed,_sizebound,_notbound),
+ sys.stderr.flush()
+ if lclique > _maxsize:
+ _maxsize=lclique
+
+# print >>sys.stderr,'koukou'
+
+ timer = time.time() - _lastyield
+
+ if not candidates and not notlist:
+ if lclique==_maxsize:
+ _maxclique=set(clique)
+ if lclique >= minsize:
+ yield set(clique)
+ if timeout is not None and timer > timeout and _maxclique is not None:
+ yield _maxclique
+ _maxclique=None
+
+ else:
+ while notmin and candidates and ((lclique + len(candidates)) >= minsize or (timeout is not None and timer > timeout)):
+ # count explored solution
+ _solution+=1
+
+ if notfix is None:
+ nextcandidate = candidates.pop()
+ else:
+ nextcandidate = notfixneib.pop()
+ candidates.remove(nextcandidate)
+
+ clique.add(nextcandidate)
+
+ neighbours = graph.neighbourIndexSet(index=nextcandidate)
+
+ nextcandidates = candidates & neighbours
+ nextnot = notlist & neighbours
+
+ nnc = candidates - neighbours
+ lnnc=len(nnc)
+
+ for c in _cliqueIterator(graph,
+ set(clique),
+ nextcandidates,
+ nextnot,
+ minsize,
+ start,
+ timeout=timeout):
+ yield c
+
+
+ clique.remove(nextcandidate)
+
+ notmin-=1
+
+ if lnnc < notmin:
+ notmin = lnnc
+ notfix = nextcandidate
+ notfixneib = nnc
+
+ if notmin==0:
+ _notbound+=1
+
+ notlist.add(nextcandidate)
+ else:
+ if (lclique + len(candidates)) < minsize:
+ _sizebound+=1
+
diff --git a/obitools/graph/algorithms/compact.py b/obitools/graph/algorithms/compact.py
new file mode 100644
index 0000000..8065a93
--- /dev/null
+++ b/obitools/graph/algorithms/compact.py
@@ -0,0 +1,8 @@
+
+def compactGraph(graph,nodeSetIterator):
+ compact = graph.newEmpty()
+ for ns in nodeSetIterator(graph):
+ nlabel = "\n".join([str(graph.getNode(index=x).label) for x in ns])
+ compact.addNode(nlabel)
+ print
+ print compact
diff --git a/obitools/graph/algorithms/component.py b/obitools/graph/algorithms/component.py
new file mode 100644
index 0000000..a17c8dd
--- /dev/null
+++ b/obitools/graph/algorithms/component.py
@@ -0,0 +1,82 @@
+"""
+Iterate through the connected components of a graph
+---------------------------------------------------
+
+The module :py:mod:`obitools.graph.algorithms.component` provides
+two functions to deal with the connected components of a graph
+represented as a :py:class:`obitools.graph.Graph` instance.
+
+The whole set of connected components of a graph is a partition of this graph,
+so a node cannot belong to two distinct connected components.
+
+Two nodes are in the same connected component if there exists a path through
+the graph edges linking them.
+
+TODO: There is certainly a bug with DirectedGraph
+
+"""
+
+def componentIterator(graph,nodePredicat=None,edgePredicat=None):
+ '''
+    Build an iterator over the connected components of a graph.
+    Each connected component returned by the iterator is represented
+    as a `set` of node indices.
+
+    :param graph: the graph to partition
+    :type graph: :py:class:`obitools.graph.Graph`
+
+    :param nodePredicat: a function allowing node selection. Default value
+                         is **None** and indicates that all nodes are selected.
+    :type nodePredicat: a function returning a boolean value
+                        and accepting one argument of class :py:class:`Node`
+
+    :param edgePredicat: a function allowing edge selection. Default value
+                         is **None** and indicates that all edges are selected.
+    :type edgePredicat: a function returning a boolean value
+                        and accepting one argument of class :py:class:`Edge`
+
+ :return: an iterator over the connected component set
+ :rtype: an iterator over `set` of `int`
+
+ .. seealso::
+ the :py:meth:`obitools.graph.Graph.componentIndexSet` method
+        on which this function is based.
+ '''
+ seen = set()
+ for n in graph.nodeIterator(nodePredicat):
+ if n.index not in seen:
+ cc=n.componentIndexSet(nodePredicat, edgePredicat)
+ yield cc
+ seen |= cc
+
+def componentCount(graph,nodePredicat=None,edgePredicat=None):
+ '''
+    Count the connected components in a graph.
+
+    :param graph: the graph to partition
+    :type graph: :py:class:`obitools.graph.Graph`
+
+    :param nodePredicat: a function allowing node selection. Default value
+                         is **None** and indicates that all nodes are selected.
+    :type nodePredicat: a function returning a boolean value
+                        and accepting one argument of class :py:class:`Node`
+
+    :param edgePredicat: a function allowing edge selection. Default value
+                         is **None** and indicates that all edges are selected.
+    :type edgePredicat: a function returning a boolean value
+                        and accepting one argument of class :py:class:`Edge`
+
+    :return: the number of connected components
+    :rtype: int
+
+    .. seealso::
+        the :py:func:`componentIterator` function
+        on which this function is based.
+ '''
+ n=0
+ for c in componentIterator(graph,nodePredicat, edgePredicat):
+ n+=1
+ return n
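+
+# Usage sketch (graph g is hypothetical):
+#
+#    ncc = componentCount(g)
+#    components = list(componentIterator(g))   # each item is a set of int
+#    assert len(components) == ncc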
+
+
+
\ No newline at end of file
diff --git a/obitools/graph/algorithms/component.pyc b/obitools/graph/algorithms/component.pyc
new file mode 100644
index 0000000..a3b6298
Binary files /dev/null and b/obitools/graph/algorithms/component.pyc differ
diff --git a/obitools/graph/dag.py b/obitools/graph/dag.py
new file mode 100644
index 0000000..f9a7a96
--- /dev/null
+++ b/obitools/graph/dag.py
@@ -0,0 +1,80 @@
+from obitools.graph import DiGraph,Node
+from obitools.graph.algorithms.component import componentIterator
+
+class DAG(DiGraph):
+ def __init__(self,label='G',indexer=None,nodes=None,edges=None):
+ '''
+ Directed Graph constructor.
+
+ @param label: Graph name, set to 'G' by default
+ @type label: str
+ @param indexer: node label indexer
+ @type indexer: Indexer instance
+ @param nodes: set of nodes to add to the graph
+ @type nodes: iterable value
+ @param edges: set of edges to add to the graph
+ @type edges: iterable value
+ '''
+
+ self._parents={}
+ DiGraph.__init__(self, label, indexer, nodes, edges)
+
+ def getNode(self,node=None,index=None):
+ if index is None:
+ index = self._index.getIndex(node, True)
+ return DAGNode(index,self)
+
+ def addEdge(self,parent=None,node=None,indexp=None,index=None,**data):
+ indexp=self.addNode(parent, indexp)
+ index =self.addNode(node , index)
+
+ pindex = set(n.index
+ for n in self.getNode(index=indexp).ancestorIterator())
+
+ assert index not in pindex,'Child node cannot be a parent node'
+
+ DiGraph.addEdge(self,index1=indexp,index2=index,**data)
+
+ if index in self._parents:
+ self._parents[index].add(indexp)
+ else:
+ self._parents[index]=set([indexp])
+
+
+ return (indexp,index)
+
+ def getRoots(self):
+ return [self.getNode(index=cc.pop()).getRoot()
+ for cc in componentIterator(self)]
+
+
+
+
+class DAGNode(Node):
+
+ def ancestorIterator(self):
+ if self.index in self.graph._parents:
+ for p in self.graph._parents[self.index]:
+ parent = DAGNode(p,self.graph)
+ yield parent
+ for pnode in parent.ancestorIterator():
+ yield pnode
+
+ def getRoot(self):
+        x = self
+        for x in self.ancestorIterator():
+            pass
+        return x
+
+ def leavesIterator(self):
+ if not self:
+ yield self
+ for n in self:
+ for nn in n.leavesIterator():
+ yield nn
+
+ def subgraphIterator(self):
+ yield self
+ for n in self:
+ for nn in n.subgraphIterator():
+ yield nn
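+
+# Usage sketch (labels are hypothetical): edges go from parent to child, and
+# an edge closing a cycle triggers the assertion in DAG.addEdge.
+#
+#    d = DAG('demo')
+#    d.addEdge('root','a')      # -> (0, 1)
+#    d.addEdge('a','b')         # -> (1, 2)
+#    [r.label for r in d.getRoots()]   # -> ['root']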
+
diff --git a/obitools/graph/layout/__init__.py b/obitools/graph/layout/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/graph/layout/radialtree.py b/obitools/graph/layout/radialtree.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/graph/rootedtree.py b/obitools/graph/rootedtree.py
new file mode 100644
index 0000000..803316d
--- /dev/null
+++ b/obitools/graph/rootedtree.py
@@ -0,0 +1,117 @@
+from obitools.graph.dag import DAG,DAGNode
+
+class RootedTree(DAG):
+
+ def addEdge(self,parent=None,node=None,indexp=None,index=None,**data):
+ indexp=self.addNode(parent, indexp)
+ index =self.addNode(node , index)
+
+ assert index not in self._parents or indexp in self._parents[index], \
+ 'Child node cannot have more than one parent node'
+
+ return DAG.addEdge(self,indexp=indexp,index=index,**data)
+
+ def getNode(self,node=None,index=None):
+ if index is None:
+ index = self._index.getIndex(node, True)
+ return RootedTreeNode(index,self)
+
+
+
+class RootedTreeNode(DAGNode):
+
+ def subTreeSize(self):
+ n=1
+ for subnode in self:
+ n+=subnode.subTreeSize()
+ return n
+
+ def subTreeLeaves(self):
+ if not self:
+ return 1
+ n=0
+ for subnode in self:
+ n+=subnode.subTreeLeaves()
+ return n
+
+
+def nodeWriter(node,deep=0,label=None,distance="distance", bootstrap="bootstrap",cartoon=None,collapse=None):
+
+ ks = node.keys()
+
+
+ if label is None:
+ name=node.label
+ elif callable(label):
+ name=label(node)
+ elif isinstance(label, str) and label in node:
+ name=node[label]
+ ks.remove(label)
+ else:
+ name=''
+
+ if distance in node:
+ dist=':%6.5f' % node[distance]
+ ks.remove(distance)
+ else:
+ dist=''
+
+ ks = ["%s=%s" % (k,node[k]) for k in ks]
+
+ if cartoon is not None and cartoon(node):
+ ks.append("!cartoon={%d,0.0}" % node.subTreeLeaves())
+
+ if collapse is not None and collapse(node):
+ ks.append('!collapse={"collapsed",0.0}')
+
+ if ks:
+ ks="[&"+",".join(ks)+"]"
+ else:
+ ks=''
+
+
+ nodeseparator = ',\n' + ' ' * (deep+1)
+
+ subnodes = nodeseparator.join([nodeWriter(x, deep+1,label,distance,bootstrap,cartoon=cartoon,collapse=collapse)
+ for x in node])
+ if subnodes:
+ subnodes='(\n' + ' ' * (deep+1) + subnodes + '\n' + ' ' * deep + ')'
+
+ return '%s"%s"%s%s' % (subnodes,name,ks,dist)
+
+
+def nexusFormat(tree,startnode=None,label=None,blocks="",cartoon=None,collapse=None):
+ head="#NEXUS\n"
+
+ tx = []
+
+ for n in tree:
+ if label is None:
+ name=n.label
+ elif callable(label):
+ name=label(n)
+ elif isinstance(label, str) and label in n:
+ name=n[label]
+ else:
+ name=''
+
+ if name:
+ tx.append('"%s"' % name)
+
+ taxa = "begin taxa;\n\tdimensions ntax=%d;\n\ttaxlabels\n\t" % len(tx)
+
+ taxa+="\n\t".join(tx)
+
+ taxa+="\n;\nend;\n\n"
+
+
+
+ if startnode is not None:
+ roots =[startnode]
+ else:
+ roots = tree.getRoots()
+ trees = nodeWriter(roots[0],0,label,cartoon=cartoon,collapse=collapse)
+ trees = "begin trees;\n\ttree tree_1 = [&R] "+ trees +";\nend;\n\n"
+ return head+taxa+trees+"\n\n"+blocks+"\n"
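+
+# Usage sketch (labels are hypothetical): serialise a small RootedTree into a
+# NEXUS string; the !cartoon / !collapse annotations produced by nodeWriter
+# are understood by tree viewers such as FigTree.
+#
+#    t = RootedTree()
+#    t.addEdge('root','leaf1')    # -> (0, 1)
+#    t.addEdge('root','leaf2')    # -> (0, 2)
+#    print nexusFormat(t)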
+
+
\ No newline at end of file
diff --git a/obitools/graph/tree.py b/obitools/graph/tree.py
new file mode 100644
index 0000000..940ee44
--- /dev/null
+++ b/obitools/graph/tree.py
@@ -0,0 +1,37 @@
+from obitools.graph import UndirectedGraph,Node
+from obitools.graph.algorithms.component import componentCount
+
+
+class Forest(UndirectedGraph):
+
+
+ def getNode(self,node=None,index=None):
+ if index is None:
+ index = self._index.getIndex(node, True)
+ return TreeNode(index,self)
+
+ def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data):
+ index1=self.addNode(node1, index1)
+ index2=self.addNode(node2, index2)
+
+ cc = set(n.index for n in self.getNode(index=index2).componentIterator())
+
+ assert index1 in self._node[index2] or index1 not in cc, \
+ "No more than one path is alloed between two nodes in a tree"
+
+ UndirectedGraph.addEdge(self, index1=index1, index2=index2,**data)
+
+ return (index1,index2)
+
+ def isASingleTree(self):
+ return componentCount(self)==1
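+
+    # Usage sketch (labels are hypothetical): a Forest refuses any edge that
+    # would close a cycle, and isASingleTree() tells whether all nodes belong
+    # to a single connected component.
+    #
+    #    f = Forest()
+    #    f.addEdge('a','b')       # -> (0, 1)
+    #    f.addEdge('b','c')       # -> (1, 2)
+    #    f.isASingleTree()        # -> True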
+
+class TreeNode(Node):
+
+    def componentIterator(self):
+        # walk the whole connected component, visiting each node once
+        seen = set([self.index])
+        stack = [self]
+        while stack:
+            for c in stack.pop():
+                if c.index not in seen:
+                    seen.add(c.index)
+                    stack.append(c)
+                    yield c
+
+
\ No newline at end of file
diff --git a/obitools/gzip.py b/obitools/gzip.py
new file mode 100644
index 0000000..841641a
--- /dev/null
+++ b/obitools/gzip.py
@@ -0,0 +1,504 @@
+"""Functions that read and write gzipped files.
+
+The user of the file doesn't have to worry about the compression,
+but random access is not allowed.
+
+This consists of a patched version of the standard gzip python
+module, based on Andrew Kuchling's minigzip.py distributed with the zlib module.
+
+"""
+
+# based on Andrew Kuchling's minigzip.py distributed with the zlib module
+
+import struct, sys, time
+import zlib
+import __builtin__
+
+__all__ = ["GzipFile","open"]
+
+FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
+
+READ, WRITE = 1, 2
+
+def U32(i):
+ """Return i as an unsigned integer, assuming it fits in 32 bits.
+
+ If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
+ """
+ if i < 0:
+ i += 1L << 32
+ return i
+
+def LOWU32(i):
+ """Return the low-order 32 bits of an int, as a non-negative int."""
+ return i & 0xFFFFFFFFL
+
+def write32(output, value):
+    output.write(struct.pack("<l", value))
+
+def write32u(output, value):
+    # The L format writes the bit pattern correctly whether signed
+    # or unsigned.
+    output.write(struct.pack("<L", value))
+
+def unpack32(buf):
+    # Decode a 32-bit little-endian integer from a 4-byte string (used to
+    # reread the CRC and size fields kept in self.last8).
+    return struct.unpack("<l", buf)[0]
+
+def open(filename, mode="rb", compresslevel=9):
+    """Shorthand for GzipFile(filename, mode, compresslevel).
+
+    The filename argument is required; mode defaults to 'rb'
+    and compresslevel defaults to 9.
+    """
+    return GzipFile(filename, mode, compresslevel)
+
+class GzipFile:
+    """The GzipFile class simulates most of the methods of a file object with
+    the exception of the readinto() and truncate() methods.
+    """
+
+    myfileobj = None
+    max_read_chunk = 10 * 1024 * 1024 # 10Mb
+
+    def __init__(self, filename=None, mode=None,
+                 compresslevel=9, fileobj=None):
+        # guarantee the file is opened in binary mode on platforms
+        # that care about that sort of thing
+        if mode and 'b' not in mode:
+            mode += 'b'
+        if fileobj is None:
+            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
+        if filename is None:
+            if hasattr(fileobj, 'name'): filename = fileobj.name
+            else: filename = ''
+        if mode is None:
+            if hasattr(fileobj, 'mode'): mode = fileobj.mode
+            else: mode = 'rb'
+
+        if mode[0:1] == 'r':
+            self.mode = READ
+            # Set flag indicating start of a new member
+            self._new_member = True
+            self.extrabuf = ""
+            self.extrasize = 0
+            self.filename = filename
+            # buffers used by the patched reader (_read_internal): pending
+            # raw input and the last 8 bytes read (trailing CRC and size)
+            self.inputbuf = ""
+            self.last8 = ""
+            # Starts small, scales exponentially
+            self.min_readsize = 100
+
+        elif mode[0:1] == 'w' or mode[0:1] == 'a':
+            self.mode = WRITE
+            self._init_write(filename)
+            self.compress = zlib.compressobj(compresslevel,
+                                             zlib.DEFLATED,
+                                             -zlib.MAX_WBITS,
+                                             zlib.DEF_MEM_LEVEL,
+                                             0)
+        else:
+            raise IOError, "Mode " + mode + " not supported"
+
+        self.fileobj = fileobj
+        self.offset = 0
+
+        if self.mode == WRITE:
+            self._write_gzip_header()
+
+    def __repr__(self):
+        s = repr(self.fileobj)
+        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
+
+ def _init_write(self, filename):
+ if filename[-3:] != '.gz':
+ filename = filename + '.gz'
+ self.filename = filename
+ self.crc = zlib.crc32("")
+ self.size = 0
+ self.writebuf = []
+ self.bufsize = 0
+
+ def _write_gzip_header(self):
+ self.fileobj.write('\037\213') # magic header
+ self.fileobj.write('\010') # compression method
+ fname = self.filename[:-3]
+ flags = 0
+ if fname:
+ flags = FNAME
+ self.fileobj.write(chr(flags))
+ write32u(self.fileobj, long(time.time()))
+ self.fileobj.write('\002')
+ self.fileobj.write('\377')
+ if fname:
+ self.fileobj.write(fname + '\000')
+
+ def _init_read(self):
+ self.crc = zlib.crc32("")
+ self.size = 0
+
+ def _read_internal(self, size):
+ if len(self.inputbuf) < size:
+ self.inputbuf += self.fileobj.read(size-len(self.inputbuf))
+ chunk = self.inputbuf[:size]
+        # need to use len(chunk) below instead of size in case it's EOF.
+ if len(chunk) < 8:
+ self.last8 = self.last8[len(chunk):] + chunk
+ else:
+ self.last8 = chunk[-8:]
+ self.inputbuf = self.inputbuf[size:]
+ return chunk
+
+ def _read_gzip_header(self):
+ magic = self._read_internal(2)
+ if len(magic) != 2:
+ raise EOFError, "Reached EOF"
+ if magic != '\037\213':
+ raise IOError, 'Not a gzipped file'
+ method = ord( self._read_internal(1) )
+ if method != 8:
+ raise IOError, 'Unknown compression method'
+ flag = ord( self._read_internal(1) )
+ # modtime = self.fileobj.read(4)
+ # extraflag = self.fileobj.read(1)
+ # os = self.fileobj.read(1)
+ self._read_internal(6)
+
+ if flag & FEXTRA:
+ # Read & discard the extra field, if present
+ xlen = ord(self._read_internal(1))
+ xlen = xlen + 256*ord(self._read_internal(1))
+ self._read_internal(xlen)
+ if flag & FNAME:
+ # Read and discard a null-terminated string containing the filename
+ while True:
+ s = self._read_internal(1)
+ if not s or s=='\000':
+ break
+ if flag & FCOMMENT:
+ # Read and discard a null-terminated string containing a comment
+ while True:
+ s = self._read_internal(1)
+ if not s or s=='\000':
+ break
+ if flag & FHCRC:
+ self._read_internal(2) # Read & discard the 16-bit header CRC
+
+
+ def write(self,data):
+ if self.mode != WRITE:
+ import errno
+ raise IOError(errno.EBADF, "write() on read-only GzipFile object")
+
+ if self.fileobj is None:
+ raise ValueError, "write() on closed GzipFile object"
+ if len(data) > 0:
+ self.size = self.size + len(data)
+ self.crc = zlib.crc32(data, self.crc)
+ self.fileobj.write( self.compress.compress(data) )
+ self.offset += len(data)
+
+ def read(self, size=-1):
+ if self.mode != READ:
+ import errno
+ raise IOError(errno.EBADF, "read() on write-only GzipFile object")
+
+ if self.extrasize <= 0 and self.fileobj is None:
+ return ''
+
+ readsize = 1024
+ if size < 0: # get the whole thing
+ try:
+ while True:
+ self._read(readsize)
+ readsize = min(self.max_read_chunk, readsize * 2)
+ except EOFError:
+ size = self.extrasize
+ else: # just get some more of it
+ try:
+ while size > self.extrasize:
+ self._read(readsize)
+ readsize = min(self.max_read_chunk, readsize * 2)
+ except EOFError:
+ if size > self.extrasize:
+ size = self.extrasize
+
+ chunk = self.extrabuf[:size]
+ self.extrabuf = self.extrabuf[size:]
+ self.extrasize = self.extrasize - size
+
+ self.offset += size
+ return chunk
+
+ def _unread(self, buf):
+ self.extrabuf = buf + self.extrabuf
+ self.extrasize = len(buf) + self.extrasize
+ self.offset -= len(buf)
+
+ def _read(self, size=1024):
+ if self.fileobj is None:
+ raise EOFError, "Reached EOF"
+
+ if self._new_member:
+ # If the _new_member flag is set, we have to
+ # jump to the next member, if there is one.
+ #
+ # _read_gzip_header will raise EOFError exception
+ # if there no more members to read.
+ self._init_read()
+ self._read_gzip_header()
+ self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
+ self._new_member = False
+
+ # Read a chunk of data from the file
+ buf = self._read_internal(size)
+
+ # If the EOF has been reached, flush the decompression object
+ # and mark this object as finished.
+
+ if buf == "":
+ uncompress = self.decompress.flush()
+ self._read_eof()
+ self._add_read_data( uncompress )
+ raise EOFError, 'Reached EOF'
+
+ uncompress = self.decompress.decompress(buf)
+ self._add_read_data( uncompress )
+
+ if self.decompress.unused_data != "":
+ # Ending case: we've come to the end of a member in the file,
+ # so put back unused_data and initialize last8 by reading them.
+ self.inputbuf = self.decompress.unused_data + self.inputbuf
+ self._read_internal(8)
+
+ # Check the CRC and file size, and set the flag so we read
+ # a new member on the next call
+ self._read_eof()
+ self._new_member = True
+
+ def _add_read_data(self, data):
+ self.crc = zlib.crc32(data, self.crc)
+ self.extrabuf = self.extrabuf + data
+ self.extrasize = self.extrasize + len(data)
+ self.size = self.size + len(data)
+
+ def _read_eof(self):
+ # We've read to the end of the file, so we have to rewind in order
+ # to reread the 8 bytes containing the CRC and the file size.
+        # We check that the computed CRC and size of the
+ # uncompressed data matches the stored values. Note that the size
+ # stored is the true file size mod 2**32.
+ crc32 = unpack32(self.last8[:4])
+ isize = U32(unpack32(self.last8[4:])) # may exceed 2GB
+ if U32(crc32) != U32(self.crc):
+ raise IOError, "CRC check failed"
+ elif isize != LOWU32(self.size):
+ raise IOError, "Incorrect length of data produced"
+
+ def close(self):
+ if self.mode == WRITE:
+ self.fileobj.write(self.compress.flush())
+ # The native zlib crc is an unsigned 32-bit integer, but
+ # the Python wrapper implicitly casts that to a signed C
+ # long. So, on a 32-bit box self.crc may "look negative",
+ # while the same crc on a 64-bit box may "look positive".
+ # To avoid irksome warnings from the `struct` module, force
+ # it to look positive on all boxes.
+ write32u(self.fileobj, LOWU32(self.crc))
+ # self.size may exceed 2GB, or even 4GB
+ write32u(self.fileobj, LOWU32(self.size))
+ self.fileobj = None
+ elif self.mode == READ:
+ self.fileobj = None
+ if self.myfileobj:
+ self.myfileobj.close()
+ self.myfileobj = None
+
+ def __del__(self):
+ try:
+ if (self.myfileobj is None and
+ self.fileobj is None):
+ return
+ except AttributeError:
+ return
+ self.close()
+
+ def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
+ if self.mode == WRITE:
+ # Ensure the compressor's buffer is flushed
+ self.fileobj.write(self.compress.flush(zlib_mode))
+ self.fileobj.flush()
+
+ def fileno(self):
+ """Invoke the underlying file object's fileno() method.
+
+ This will raise AttributeError if the underlying file object
+ doesn't support fileno().
+ """
+ return self.fileobj.fileno()
+
+ def isatty(self):
+ return False
+
+ def tell(self):
+ return self.offset
+
+ def rewind(self):
+ '''Return the uncompressed stream file position indicator to the
+ beginning of the file'''
+ if self.mode != READ:
+ raise IOError("Can't rewind in write mode")
+ self.fileobj.seek(0)
+ self._new_member = True
+ self.extrabuf = ""
+ self.extrasize = 0
+ self.offset = 0
+
+ def seek(self, offset):
+ if self.mode == WRITE:
+ if offset < self.offset:
+ raise IOError('Negative seek in write mode')
+ count = offset - self.offset
+ for i in range(count // 1024):
+ self.write(1024 * '\0')
+ self.write((count % 1024) * '\0')
+ elif self.mode == READ:
+ if offset < self.offset:
+ # for negative seek, rewind and do positive seek
+ self.rewind()
+ count = offset - self.offset
+ for i in range(count // 1024):
+ self.read(1024)
+ self.read(count % 1024)
+
+ def readline(self, size=-1):
+ if size < 0:
+ size = sys.maxint
+ readsize = self.min_readsize
+ else:
+ readsize = size
+ bufs = []
+ while size != 0:
+ c = self.read(readsize)
+ i = c.find('\n')
+
+ # We set i=size to break out of the loop under two
+ # conditions: 1) there's no newline, and the chunk is
+ # larger than size, or 2) there is a newline, but the
+ # resulting line would be longer than 'size'.
+ if (size <= i) or (i == -1 and len(c) > size):
+ i = size - 1
+
+ if i >= 0 or c == '':
+ bufs.append(c[:i + 1]) # Add portion of last chunk
+ self._unread(c[i + 1:]) # Push back rest of chunk
+ break
+
+ # Append chunk to list, decrease 'size',
+ bufs.append(c)
+ size = size - len(c)
+ readsize = min(size, readsize * 2)
+ if readsize > self.min_readsize:
+ self.min_readsize = min(readsize, self.min_readsize * 2, 512)
+ return ''.join(bufs) # Return resulting line
+
+ def readlines(self, sizehint=0):
+ # Negative numbers result in reading all the lines
+ if sizehint <= 0:
+ sizehint = sys.maxint
+ L = []
+ while sizehint > 0:
+ line = self.readline()
+ if line == "":
+ break
+ L.append(line)
+ sizehint = sizehint - len(line)
+
+ return L
+
+ def writelines(self, L):
+ for line in L:
+ self.write(line)
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ line = self.readline()
+ if line:
+ return line
+ else:
+ raise StopIteration
+
+
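+# A minimal usage sketch (this class mirrors the standard library gzip
+# API; 'example.gz' is a hypothetical path):
+#
+#   >>> g = GzipFile('example.gz', 'wb')
+#   >>> g.write('Hello world\n')
+#   >>> g.close()
+#   >>> GzipFile('example.gz', 'rb').read()
+#   'Hello world\n'
+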
+def _test():
+ # Act like gzip; with -d, act like gunzip.
+ # The input file is not deleted, however, nor are any other gzip
+ # options or features supported.
+ args = sys.argv[1:]
+ decompress = args and args[0] == "-d"
+ if decompress:
+ args = args[1:]
+ if not args:
+ args = ["-"]
+ for arg in args:
+ if decompress:
+ if arg == "-":
+ f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
+ g = sys.stdout
+ else:
+ if arg[-3:] != ".gz":
+ print "filename doesn't end in .gz:", repr(arg)
+ continue
+ f = open(arg, "rb")
+ g = __builtin__.open(arg[:-3], "wb")
+ else:
+ if arg == "-":
+ f = sys.stdin
+ g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
+ else:
+ f = __builtin__.open(arg, "rb")
+ g = open(arg + ".gz", "wb")
+ while True:
+ chunk = f.read(1024)
+ if not chunk:
+ break
+ g.write(chunk)
+ if g is not sys.stdout:
+ g.close()
+ if f is not sys.stdin:
+ f.close()
+
+if __name__ == '__main__':
+ _test()
diff --git a/obitools/gzip.pyc b/obitools/gzip.pyc
new file mode 100644
index 0000000..9c44a43
Binary files /dev/null and b/obitools/gzip.pyc differ
diff --git a/obitools/location/__init__.py b/obitools/location/__init__.py
new file mode 100644
index 0000000..b5463b0
--- /dev/null
+++ b/obitools/location/__init__.py
@@ -0,0 +1,538 @@
+import obitools
+import re
+import array
+
+class Location(object):
+ """
+ Define a location on a sequence.
+ """
+
+ def extractSequence(self,sequence):
+ '''
+ Extract subsequence corresponding to a Location.
+
+        @param sequence: the sequence from which the subsequence is extracted
+ @type sequence: C{BioSequence} or C{str}
+ '''
+ assert isinstance(sequence, (obitools.BioSequence,str)), \
+ "sequence must be an instance of str or BioSequence"
+
+ if isinstance(sequence, str):
+ seq = self._extractSequence(sequence)
+ else:
+ if isinstance(sequence, obitools.AASequence):
+ assert not self.needNucleic(), \
+ "This location can be used only with Nucleic sequences"
+ seq = self._extractSequence(str(sequence))
+
+ if isinstance(sequence, obitools.AASequence):
+ st = obitools.AASequence
+ else:
+ st = obitools.NucSequence
+
+ seq = st(sequence.id,
+ seq,
+ sequence.definition,
+ **sequence.getTags())
+ seq['location']=str(self)
+
+ if 'length' in sequence.getTags():
+ seq['length']=len(seq)
+
+ if hasattr(sequence, 'quality'):
+ quality = self._extractQuality(sequence)
+ seq.quality=quality
+
+ return seq
+
+ def isDirect(self):
+ return None
+
+ def isSimple(self):
+ '''
+        Indicate whether a location is composed of a single continuous
+        region or of several regions combined by the C{join}
+        operator.
+
+ @return: C{True} if the location is composed of a single
+ continuous region.
+ @rtype: bool
+ '''
+
+ return None
+
+ def isFullLength(self):
+ return None
+
+ def needNucleic(self):
+ '''
+        If a location contains a complement operator, it can be used
+        only on nucleic sequences.
+
+ @return: C{True} if location contains a complement operator
+ @rtype: bool
+ '''
+ return None
+
+ def getGloc(self):
+ loc = self.simplify()
+ assert loc.isDirect() is not None,"Gloc cannot be created for multi oriented location : %s" % str(loc)
+ positions = ','.join([str(x) for x in loc._getglocpos()])
+ return "(%s,%s)" % ({True:'T',False:'F'}[loc.isDirect()],
+ positions)
+
+ def shift(self,s):
+ return None
+
+ def getBegin(self):
+ return None
+
+ def getEnd(self):
+ return None
+
+ def getFivePrime(self):
+ return self.getBegin()
+
+ def getThreePrime(self):
+ return self.getEnd()
+
+ begin = property(getBegin,None,None,"beginning position of the location")
+ end = property(getEnd,None,None,"ending position of the location")
+ fivePrime=property(getFivePrime,None,None,"5' position of the location")
+ threePrime=property(getThreePrime,None,None,"3' position of the location")
+
+ def __abs__(self):
+ assert self.isDirect() is not None,"Abs operator cannot be applied on non oriented location"
+ if self.isDirect():
+ return self
+ else:
+ return ComplementLocation(self).simplify()
+
+ def __cmp__(self,y):
+ if self.begin < y.begin:
+ return -1
+ if self.begin > y.begin:
+ return 1
+ if self.isDirect() == y.isDirect():
+ return 0
+ if self.isDirect() and not y.isDirect():
+ return -1
+ return 1
+
+class SimpleLocation(Location):
+ """
+    A simple location describes a continuous region of
+    a sequence defined by a C{begin} and an C{end} position.
+ """
+
+ def __init__(self,begin,end):
+ '''
+        Build a new C{SimpleLocation} instance. Valid
+        positions are defined on M{[1,N]}, where N is the length
+        of the sequence.
+
+ @param begin: start position of the location
+ @type begin: int
+ @param end: end position of the location
+ @type end: int
+ '''
+ assert begin > 0 and end > 0
+
+ self._begin = begin
+ self._end = end
+ self._before=False
+ self._after=False
+
+ def _extractSequence(self,sequence):
+
+ assert ( self._begin < len(sequence)
+ and self._end <= len(sequence)), \
+ "Sequence length %d is too short" % len(sequence)
+
+ return sequence[self._begin-1:self._end]
+
+ def _extractQuality(self,sequence):
+
+ assert ( self._begin < len(sequence)
+ and self._end <= len(sequence)), \
+ "Sequence length %d is too short" % len(sequence)
+
+ return sequence.quality[self._begin-1:self._end]
+
+
+ def isDirect(self):
+ return True
+
+ def isSimple(self):
+ return True
+
+ def isFullLength(self):
+ return not (self.before or self.after)
+
+ def simplify(self):
+ if self._begin == self._end:
+ return PointLocation(self._begin)
+ else:
+ return self
+
+ def needNucleic(self):
+ return False
+
+ def __str__(self):
+ before = {True:'<',False:''}[self.before]
+ after = {True:'>',False:''}[self.after]
+ return "%s%d..%s%d" % (before,self._begin,after,self._end)
+
+ def shift(self,s):
+ assert (self._begin + s) > 0,"shift to large (%d)" % s
+ if s == 0:
+ return self
+ return SimpleLocation(self._begin + s, self._end + s)
+
+ def _getglocpos(self):
+ return (self.begin,self.end)
+
+ def getGloc(self):
+ positions = ','.join([str(x) for x in self._getglocpos()])
+ return "(%s,%s)" % ({True:'T',False:'F'}[self.isDirect()],
+ positions)
+
+ def getBegin(self):
+ return self._begin
+
+ def getEnd(self):
+ return self._end
+
+
+ begin = property(getBegin,None,None,"beginning position of the location")
+ end = property(getEnd,None,None,"ending position of the location")
+
+ def getBefore(self):
+ return self._before
+
+ def getAfter(self):
+ return self._after
+
+ def setBefore(self,value):
+ assert isinstance(value, bool)
+ self._before=value
+
+ def setAfter(self,value):
+ assert isinstance(value, bool)
+ self._after=value
+
+ before=property(getBefore,setBefore,None)
+ after=property(getAfter,setAfter,None)
+
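+# A minimal sketch of SimpleLocation (positions are 1-based and inclusive):
+#
+#   >>> str(SimpleLocation(2, 4))
+#   '2..4'
+#   >>> SimpleLocation(2, 4).extractSequence('gattaca')
+#   'att'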
+
+
+
+class PointLocation(Location):
+ """
+ A point location describes a location on a sequence
+ limited to a single position
+ """
+
+ def __init__(self,position):
+ assert position > 0
+ self._pos=position
+
+ def _extractSequence(self,sequence):
+
+        assert self._pos <= len(sequence), \
+ "Sequence length %d is too short" % len(sequence)
+
+ return sequence[self._pos-1]
+
+ def _extractQuality(self,sequence):
+
+        assert self._pos <= len(sequence), \
+ "Sequence length %d is too short" % len(sequence)
+
+ return sequence[self._pos-1:self._pos]
+
+ def isDirect(self):
+ return True
+
+ def isSimple(self):
+ return True
+
+ def isFullLength(self):
+ return True
+
+ def simplify(self):
+ return self
+
+ def needNucleic(self):
+ return False
+
+ def shift(self,s):
+ assert (self._pos + s) > 0,"shift to large (%d)" % s
+ if s == 0:
+ return self
+ return PointLocation(self._pos + s)
+
+ def _getglocpos(self):
+ return (self._pos,self._pos)
+
+ def getBegin(self):
+ return self._pos
+
+ def getEnd(self):
+ return self._pos
+
+ begin = property(getBegin,None,None,"beginning position of the location")
+ end = property(getEnd,None,None,"ending position of the location")
+
+ def __str__(self):
+ return str(self._pos)
+
+class CompositeLocation(Location):
+ """
+ """
+ def __init__(self,locations):
+ self._locs = tuple(locations)
+
+
+ def _extractSequence(self,sequence):
+ seq = ''.join([x._extractSequence(sequence)
+ for x in self._locs])
+ return seq
+
+ def _extractQuality(self,sequence):
+ rep=array.array('d',[])
+ for x in self._locs:
+ rep.extend(x._extractQuality(sequence))
+ return rep
+
+ def isDirect(self):
+ hasDirect,hasReverse = reduce(lambda x,y: (x[0] or y,x[1] or not y),
+ (z.isDirect() for z in self._locs),(False,False))
+
+ if hasDirect and not hasReverse:
+ return True
+ if hasReverse and not hasDirect:
+ return False
+
+ return None
+
+
+ def isSimple(self):
+ return False
+
+
+ def simplify(self):
+ if len(self._locs)==1:
+ return self._locs[0]
+
+ rep = CompositeLocation(x.simplify() for x in self._locs)
+
+ if reduce(lambda x,y : x and y,
+ (isinstance(z, ComplementLocation)
+ for z in self._locs)):
+ rep = ComplementLocation(CompositeLocation(x._loc.simplify()
+ for x in rep._locs[::-1]))
+
+ return rep
+
+ def isFullLength(self):
+ return reduce(lambda x,y : x and y, (z.isFullLength() for z in self._locs),1)
+
+ def needNucleic(self):
+ return reduce(lambda x,y : x or y,
+                      (z.needNucleic() for z in self._locs),
+ False)
+
+ def _getglocpos(self):
+ return reduce(lambda x,y : x + y,
+ (z._getglocpos() for z in self._locs))
+
+
+ def getBegin(self):
+ return min(x.getBegin() for x in self._locs)
+
+ def getEnd(self):
+ return max(x.getEnd() for x in self._locs)
+
+ def shift(self,s):
+ assert (self.getBegin() + s) > 0,"shift to large (%d)" % s
+ if s == 0:
+ return self
+ return CompositeLocation(x.shift(s) for x in self._locs)
+
+
+ begin = property(getBegin,None,None,"beginning position of the location")
+ end = property(getEnd,None,None,"ending position of the location")
+
+
+ def __str__(self):
+ return "join(%s)" % ','.join([str(x)
+ for x in self._locs])
+
+class ComplementLocation(Location):
+ """
+ """
+
+ _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
+ 'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
+ 's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
+ 'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
+ '-': '-'}
+
+ def __init__(self,location):
+ self._loc = location
+
+ def _extractSequence(self,sequence):
+ seq = self._loc._extractSequence(sequence)
+ seq = ''.join([ComplementLocation._comp.get(x.lower(),'n') for x in seq[::-1]])
+ return seq
+
+ def _extractQuality(self,sequence):
+        return self._loc._extractQuality(sequence)[::-1]
+
+ def isDirect(self):
+ return False
+
+ def isSimple(self):
+ return self._loc.isSimple()
+
+ def isFullLength(self):
+ return self._loc.isFullLength()
+
+ def simplify(self):
+ if isinstance(self._loc, ComplementLocation):
+ return self._loc._loc.simplify()
+ else:
+ return self
+
+ def needNucleic(self):
+ return True
+
+ def __str__(self):
+ return "complement(%s)" % self._loc
+
+ def shift(self,s):
+ assert (self.getBegin() + s) > 0,"shift to large (%d)" % s
+ if s == 0:
+ return self
+ return ComplementLocation(self._loc.shift(s))
+
+ def _getglocpos(self):
+ return self._loc._getglocpos()
+
+ def getBegin(self):
+ return self._loc.getBegin()
+
+ def getEnd(self):
+ return self._loc.getEnd()
+
+ def getFivePrime(self):
+ return self.getEnd()
+
+ def getThreePrime(self):
+ return self.getBegin()
+
+
+ begin = property(getBegin,None,None,"beginning position of the location")
+ end = property(getEnd,None,None,"ending position of the location")
+ fivePrime=property(getFivePrime,None,None,"5' potisition of the location")
+ threePrime=property(getThreePrime,None,None,"3' potisition of the location")
+
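+# A sketch: a complement location extracts the reverse-complemented fragment:
+#
+#   >>> ComplementLocation(SimpleLocation(1, 3)).extractSequence('gattaca')
+#   'atc'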
+
+ #
+ # Internal functions used for location parsing
+ #
+
+def __sublocationIterator(text):
+ sl = []
+ plevel=0
+ for c in text:
+ assert plevel>=0,"Misformated location : %s" % text
+ if c == '(':
+ plevel+=1
+ sl.append(c)
+ elif c==')':
+ plevel-=1
+ sl.append(c)
+ elif c==',' and plevel == 0:
+ assert sl,"Misformated location : %s" % text
+ yield ''.join(sl)
+ sl=[]
+ else:
+ sl.append(c)
+ assert sl and plevel==0,"Misformated location : %s" % text
+ yield ''.join(sl)
+
+
+
+ #
+ # Internal functions used for location parsing
+ #
+
+__simplelocparser = re.compile('(?P<before><?)(?P<from>[0-9]+)(\.\.(?P<after>>?)(?P<to>[0-9]+))?')
+
+
+def __locationParser(text):
+ text=text.strip()
+ if text[0:5]=='join(':
+ assert text[-1]==')',"Misformated location : %s" % text
+ return CompositeLocation(__locationParser(sl) for sl in __sublocationIterator(text[5:-1]))
+ elif text[0:11]=='complement(':
+ assert text[-1]==')',"Misformated location : %s" % text
+ subl = tuple(__locationParser(sl) for sl in __sublocationIterator(text[11:-1]))
+ if len(subl)>1:
+ subl = CompositeLocation(subl)
+ else:
+ subl = subl[0]
+ return ComplementLocation(subl)
+ else:
+ data = __simplelocparser.match(text)
+ assert data is not None,"Misformated location : %s" % text
+ data = data.groupdict()
+ if not data['to'] :
+ sl = PointLocation(int(data['from']))
+ else:
+ sl = SimpleLocation(int(data['from']),int(data['to']))
+ sl.before=data['before']=='<'
+ sl.after=data['after']=='>'
+ return sl
+
+def locationGenerator(locstring):
+ '''
+ Parse a location string as present in genbank or embl file.
+
+ @param locstring: string description of the location in embl/gb format
+ @type locstring: str
+
+ @return: a Location instance
+ @rtype: C{Location} subclass instance
+ '''
+ return __locationParser(locstring)
+
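+# A sketch of the parser on a composite EMBL/GenBank location:
+#
+#   >>> loc = locationGenerator('join(12..78,134..202)')
+#   >>> str(loc)
+#   'join(12..78,134..202)'
+#   >>> loc.begin, loc.end
+#   (12, 202)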
+
+_matchExternalRef = re.compile('[A-Za-z0-9_|]+(\.[0-9]+)?(?=:)')
+
+def extractExternalRefs(locstring):
+ '''
+    When a location describes external references (e.g. D28156.1:1..>1292),
+    separate the external reference part of the location from the location
+    itself.
+
+ @param locstring: text representation of the location.
+ @type locstring: str
+
+ @return: a tuple with a set of string describing accession number
+ of the referred sequences and a C{Location} instance.
+
+ @rtype: tuple(set,Location)
+ '''
+ m = set(x.group() for x in _matchExternalRef.finditer(locstring))
+ clean = re.compile(':|'.join([re.escape(x) for x in m])+':')
+ cloc = locationGenerator(clean.sub('',locstring))
+
+ return m,cloc
+
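+# A sketch, reusing the docstring example:
+#
+#   >>> refs, loc = extractExternalRefs('D28156.1:1..>1292')
+#   >>> refs, str(loc)
+#   (set(['D28156.1']), '1..>1292')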
+
+
+
+
diff --git a/obitools/location/__init__.pyc b/obitools/location/__init__.pyc
new file mode 100644
index 0000000..545f024
Binary files /dev/null and b/obitools/location/__init__.pyc differ
diff --git a/obitools/location/feature.py b/obitools/location/feature.py
new file mode 100644
index 0000000..89a183f
--- /dev/null
+++ b/obitools/location/feature.py
@@ -0,0 +1,177 @@
+from obitools.location import Location,locationGenerator
+import logging
+import re
+
+
+
+
+_featureMatcher = re.compile('^(FT| ) [^ ].+\n((FT| ) .+\n)+',re.M)
+_featureCleaner = re.compile('^FT',re.M)
+
+
+def textFeatureIterator(fttable):
+ '''
+    Iterate through the textual description of a feature table in genbank
+    or embl format. At each step, return the text representation of one
+    individual feature composing the table.
+
+ @param fttable: a string corresponding to the feature table of a genbank
+ or an embl entry
+
+ @type fttable: C{str}
+
+ @return: an iterator on str
+ @rtype: iterator
+
+ @see: L{ftParser}
+ '''
+ for m in _featureMatcher.finditer(fttable):
+ t = m.group()
+ t = _featureCleaner.sub(' ',t)
+ yield t
+
+_qualifierMatcher = re.compile('(?<=^ {21}/).+(\n {21}[^/].+)*',re.M)
+_qualifierCleanner= re.compile("^ +",re.M)
+
+def qualifierIterator(qualifiers):
+ '''
+    Parse the textual description of a feature in embl or genbank format,
+    as returned by the textFeatureIterator iterator, and iterate through
+    the (key, value) qualifiers defining this feature.
+
+ @param qualifiers: substring containing qualifiers
+ @type qualifiers: str
+
+ @return: an iterator on tuple (key,value), where keys are C{str}
+ @rtype: iterator
+ '''
+ for m in _qualifierMatcher.finditer(qualifiers):
+ t = m.group()
+ t = _qualifierCleanner.sub('',t)
+ t = t.split('=',1)
+ if len(t)==1:
+ t = (t[0],None)
+ else:
+ if t[0]=='translation':
+ value = t[1].replace('\n','')
+ else:
+ value = t[1].replace('\n',' ')
+ try:
+ value = eval(value)
+ except:
+ pass
+ t = (t[0],value)
+ yield t
+
+
+_ftmatcher = re.compile('(?<=^ {5})\S+')
+_locmatcher= re.compile('(?<=^.{21})[^/]+',re.DOTALL)
+_cleanloc = re.compile('[\s\n]+')
+_qualifiersMatcher = re.compile('^ +/.+',re.M+re.DOTALL)
+
+def ftParser(feature):
+ fttype = _ftmatcher.search(feature).group()
+ location=_locmatcher.search(feature).group()
+ location=_cleanloc.sub('',location)
+ qualifiers=_qualifiersMatcher.search(feature)
+ if qualifiers is not None:
+ qualifiers=qualifiers.group()
+ else:
+ qualifiers=""
+ logging.debug("Qualifiers regex not matching on \n=====\n%s\n========" % feature)
+
+ return fttype,location,qualifiers
+
+
+class Feature(dict,Location):
+ def __init__(self,type,location):
+ self._fttype=type
+ self._loc=location
+
+ def getFttype(self):
+ return self._fttype
+
+
+ def extractSequence(self,sequence,withQualifier=False):
+ seq = self._loc.extractSequence(sequence)
+ if withQualifier:
+ seq.getInfo().update(self)
+ return seq
+
+ def isDirect(self):
+ return self._loc.isDirect()
+
+ def isSimple(self):
+ return self._loc.isSimple()
+
+ def isFullLength(self):
+ return self._loc.isFullLength()
+
+ def simplify(self):
+ f = Feature(self._fttype,self._loc.simplify())
+ f.update(self)
+ return f
+
+ def locStr(self):
+ return str(self._loc)
+
+ def needNucleic(self):
+ return self._loc.needNucleic()
+
+ def __str__(self):
+ return repr(self)
+
+ def __repr__(self):
+ return str((self.ftType,str(self._loc),dict.__repr__(self)))
+
+ def __cmp__(self,y):
+ return self._loc.__cmp__(y)
+
+ def _getglocpos(self):
+ return self._loc._getglocpos()
+
+ ftType = property(getFttype, None, None, "Feature type name")
+
+ def shift(self,s):
+ assert (self.getBegin() + s) > 0,"shift to large (%d)" % s
+ if s == 0:
+ return self
+ f = Feature(self._fttype,self._loc.shift(s))
+ f.update(self)
+ return f
+
+
+ def getBegin(self):
+ return self._loc.getBegin()
+
+ def getEnd(self):
+ return self._loc.getEnd()
+
+ begin = property(getBegin,None,None,"beginning position of the location")
+ end = property(getEnd,None,None,"ending position of the location")
+
+
+def featureFactory(featureDescription):
+ fttype,location,qualifiers = ftParser(featureDescription)
+ location = locationGenerator(location)
+ feature = Feature(fttype,location)
+ feature.raw = featureDescription
+
+ for k,v in qualifierIterator(qualifiers):
+ feature.setdefault(k,[]).append(v)
+
+ return feature
+
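+# A minimal sketch on a hypothetical feature text (the feature key starts
+# at column 6 and the location/qualifiers at column 22, as assumed by the
+# regular expressions above):
+#
+#   >>> txt = ('     CDS             complement(12..78)\n'
+#   ...        '                     /gene="xyz"\n')
+#   >>> ft = featureFactory(txt)
+#   >>> ft.ftType, ft.locStr(), ft['gene']
+#   ('CDS', 'complement(12..78)', ['xyz'])
+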
+def featureIterator(featureTable,skipError=False):
+ for tft in textFeatureIterator(featureTable):
+ try:
+ feature = featureFactory(tft)
+ except AssertionError,e:
+ logging.debug("Parsing error on feature :\n===============\n%s\n===============" % tft)
+ if not skipError:
+ raise e
+ logging.debug("\t===> Error skipped")
+ continue
+
+ yield feature
+
\ No newline at end of file
diff --git a/obitools/metabarcoding/__init__.py b/obitools/metabarcoding/__init__.py
new file mode 100644
index 0000000..3b29b17
--- /dev/null
+++ b/obitools/metabarcoding/__init__.py
@@ -0,0 +1,265 @@
+from obitools.ecopcr.options import addTaxonomyFilterOptions,\
+ loadTaxonomyDatabase
+from obitools.graph import UndirectedGraph
+from obitools.align import lenlcs,isLCSReachable
+from obitools.graph.algorithms.component import componentIterator
+from obitools.utils.bioseq import uniqSequence
+from obitools.utils import progressBar
+import math
+import sys
+from obitools.graph.rootedtree import RootedTree
+
+def average(x):
+ x=list(x)
+ s = sum(i*j for (i,j) in x)
+ n = sum(i[1] for i in x)
+ return (float(s)/float(n),n)
+
+def minimum(x):
+ x=list(x)
+ m = min(i[0] for i in x)
+ n = sum(i[1] for i in x)
+ return (float(m),n)
+
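+# A sketch: both helpers aggregate (value, count) pairs and also return the
+# total count:
+#
+#   >>> average([(2, 1), (4, 3)])
+#   (3.5, 4)
+#   >>> minimum([(2, 1), (4, 3)])
+#   (2.0, 4)
+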
+def ecoPCRReader(entries,options):
+
+ taxonomy = loadTaxonomyDatabase(options)
+
+ norankid =options.taxonomy.findRankByName('no rank')
+ speciesid=options.taxonomy.findRankByName('species')
+ genusid =options.taxonomy.findRankByName('genus')
+ familyid =options.taxonomy.findRankByName('family')
+
+ minrankseq = set([speciesid,genusid,familyid])
+
+ usedrankid = {}
+
+ ingroup = []
+ outgroup= []
+
+ for s in entries:
+ if 'taxid' in s :
+ taxid = s['taxid']
+ if taxid in taxonomy:
+ allrank = set()
+ for p in options.taxonomy.parentalTreeIterator(taxid):
+ if p[1]!=norankid:
+ allrank.add(p[1])
+ if len(minrankseq & allrank) == 3:
+ for r in allrank:
+ usedrankid[r]=usedrankid.get(r,0) + 1
+
+ if taxonomy.isAncestor(options.ingroup,taxid):
+ ingroup.append(s)
+ else:
+ outgroup.append(s)
+
+ keptrank = set(r for r in usedrankid
+ if float(usedrankid[r])/float(len(ingroup)) > options.rankthresold)
+
+ return { 'ingroup' : ingroup,
+ 'outgroup': outgroup,
+ 'ranks' : keptrank
+ }
+
+def buildSimilarityGraph(dbseq,ranks,taxonomy,dcmax=5):
+
+ ldbseq = len(dbseq)
+ pos = 1
+ digit = int(math.ceil(math.log10(ldbseq)))
+ header = "Alignment : %%0%dd x %%0%dd -> %%0%dd " % (digit,digit,digit)
+    aligncount = ldbseq*(ldbseq-1)/2
+ edgecount = 0
+ print >>sys.stderr
+
+ progressBar(1,aligncount,True,"Alignment : %s x %s -> %s " % ('-'*digit,'-'*digit, '0'*digit))
+
+
+ sim = UndirectedGraph()
+
+ i=0
+ for s in dbseq:
+ taxid = s['taxid']
+
+ rtaxon = dict((rid,taxonomy.getTaxonAtRank(taxid,rid))
+ for rid in ranks)
+
+ sim.addNode(i, seq=s,taxid=taxid,rtaxon=rtaxon)
+
+ i+=1
+
+# aligner = LCS()
+
+ for is1 in xrange(ldbseq):
+ s1 = dbseq[is1]
+ ls1= len(s1)
+# aligner.seqA=s1
+
+ for is2 in xrange(is1+1,ldbseq):
+
+ s2=dbseq[is2]
+ ls2=len(s2)
+
+ lm = max(ls1,ls2)
+ lcsmin = lm - dcmax
+
+ if isLCSReachable(s1,s2,lcsmin):
+ llcs,lali=lenlcs(s1,s2)
+ ds1s2 = lali - llcs
+
+ if ds1s2 <= dcmax:
+ sim.addEdge(node1=is1, node2=is2,ds1s2=ds1s2,label=ds1s2)
+ edgecount+=1
+
+ progressBar(pos,aligncount,head=header % (is1,is2,edgecount))
+ pos+=(ldbseq-is1-1)
+
+ return sim
+
+def buildTsr(component):
+ '''
+    Build, for each considered taxonomic rank, the list of taxa
+    present in the connected component.
+
+ :param component: the analyzed connected component
+ :type component: :py:class:`UndirectedGraph`
+
+    :return: a dictionary indexed by rank id, containing a `dict` indexed by taxid and giving the count of sequences for this taxid
+    :rtype: `dict` indexed by `int`, containing `dict` indexed by `int` and containing `int`
+
+ '''
+ taxalist = {}
+ for n in component:
+ for r in n['rtaxon']:
+ rtaxid = n['rtaxon'][r]
+ if rtaxid is not None:
+ ts = taxalist.get(r,{})
+ ts[rtaxid]=ts.get(rtaxid,0)+1
+ taxalist[r]=ts
+
+ return taxalist
+
+def edgeDistSelector(dcmax):
+ def predicate(e):
+ return e['ds1s2'] <= dcmax
+ return predicate
+
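+# A sketch: the returned predicate filters edges on their 'ds1s2' distance,
+# as used with componentIterator in distanceOfConfusion below:
+#
+#   >>> keep = edgeDistSelector(2)
+#   >>> keep({'ds1s2': 1}), keep({'ds1s2': 3})
+#   (True, False)
+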
+def distanceOfConfusion(simgraph,dcmax=5,aggregate=average):
+
+ alltaxa = set()
+
+ for n in simgraph:
+ alltaxa|=set(n['rtaxon'].values())
+
+ taxacount = len(alltaxa)
+
+ result = {}
+
+ pos = [1]
+ header = "Component : %-5d Identified : %-8d "
+ progressBar(1,taxacount,True,header % (0,0))
+
+ def _idc(cc,dcmax):
+ composante=[]
+ for x in cc:
+ composante.extend(simgraph.subgraph(c)
+ for c in componentIterator(x,
+ edgePredicat=edgeDistSelector(dcmax)))
+
+ good = set()
+ bad = {}
+
+ complexe = []
+
+ for c in composante:
+ tsr = buildTsr(c)
+ newbad=False
+ for r in tsr:
+ if len(tsr[r]) == 1:
+ taxid = tsr[r].keys()[0]
+ good.add((taxid,tsr[r][taxid]))
+ else:
+ newbad=True
+ for taxid in tsr[r]:
+ bad[taxid]=bad.get(taxid,0)+tsr[r][taxid]
+ if newbad:
+ complexe.append(c)
+
+# good = good - bad
+
+ for taxid,weight in good:
+ if taxid not in result:
+ result[taxid]=[]
+ result[taxid].append((dcmax+1,weight))
+
+
+ progressBar(pos[0],taxacount,False,header % (len(composante),pos[0]))
+ pos[0]=len(result)
+
+ if dcmax > 0:
+ dcmax-=1
+ _idc(complexe,dcmax)
+
+ else:
+ for taxid in bad:
+ if taxid not in result:
+ result[taxid]=[]
+ result[taxid].append((0,bad[taxid]))
+
+ progressBar(pos[0],taxacount,False,header % (len(composante),pos[0]))
+ pos[0]=len(result)
+
+ _idc([simgraph],dcmax)
+
+ for taxid in result:
+ result[taxid]=aggregate(result[taxid])
+ return result
+
+def propagateDc(tree,node=None,aggregate=min):
+ if node is None:
+ node = tree.getRoots()[0]
+ dca=aggregate(n['dc'] for n in node.leavesIterator())
+ node['dc']=dca
+ for n in node:
+ propagateDc(tree, n, aggregate)
+
+def confusionTree(distances,ranks,taxonomy,aggregate=min,bsrank='species',dcmax=1):
+
+ def Bs(node,rank,dcmax):
+ n = len(node)
+ if n:
+ g = [int(x['dc']>=dcmax) for x in node.subgraphIterator() if x['rank']==bsrank]
+ n = len(g)
+ g = sum(g)
+ bs= float(g)/float(n)
+ node['bs']=bs
+ node['bs_label']="%3.2f (%d)" % (bs,n)
+
+ for n in node:
+ Bs(n,rank,dcmax)
+
+ tree = RootedTree()
+ ranks = set(ranks)
+ tset = set(distances)
+
+ for taxon in distances:
+ tree.addNode(taxon, rank=taxonomy.getRank(taxon),
+ name=taxonomy.getScientificName(taxon),
+ dc=float(distances[taxon][0]),
+ n=distances[taxon][1],
+ dc_label="%4.2f (%d)" % (float(distances[taxon][0]),distances[taxon][1])
+ )
+
+ for taxon in distances:
+ piter = taxonomy.parentalTreeIterator(taxon)
+ taxon = piter.next()
+ for parent in piter:
+ if taxon[0] in tset and parent[0] in distances:
+ tset.remove(taxon[0])
+ tree.addEdge(parent[0], taxon[0])
+ taxon=parent
+
+ root = tree.getRoots()[0]
+ Bs(root,bsrank,dcmax)
+
+ return tree
diff --git a/obitools/metabarcoding/options.py b/obitools/metabarcoding/options.py
new file mode 100644
index 0000000..08ff423
--- /dev/null
+++ b/obitools/metabarcoding/options.py
@@ -0,0 +1,34 @@
+'''
+Created on 30 oct. 2011
+
+@author: coissac
+'''
+
+from obitools.ecopcr.options import addTaxonomyDBOptions
+
+
+def addMetabarcodingOption(optionManager):
+
+ addTaxonomyDBOptions(optionManager)
+
+ optionManager.add_option('--dcmax',
+ action="store", dest="dc",
+ metavar="###",
+ type="int",
+ default=0,
+ help="Maximum confusion distance considered")
+
+ optionManager.add_option('--ingroup',
+ action="store", dest="ingroup",
+ metavar="###",
+ type="int",
+ default=1,
+ help="ncbi taxid delimitation the in group")
+
+ optionManager.add_option('--rank-thresold',
+ action="store", dest="rankthresold",
+ metavar="#.##",
+ type="float",
+ default=0.5,
+ help="minimum fraction of the ingroup sequences "
+ "for concidering the rank")
diff --git a/obitools/obischemas/__init__.py b/obitools/obischemas/__init__.py
new file mode 100644
index 0000000..6bcafde
--- /dev/null
+++ b/obitools/obischemas/__init__.py
@@ -0,0 +1,28 @@
+from obitools.obischemas import kb
+__connection__ = None
+
+def initConnection(options):
+ global __connection__
+ param = {}
+ if hasattr(options, "dbname") and options.dbname is not None:
+ param["database"]=options.dbname
+ if hasattr(options, "dbhost") and options.dbhost is not None:
+ param["host"]=options.dbhost
+ if hasattr(options, "dbuser") and options.dbuser is not None:
+ param["username"]=options.dbuser
+ if hasattr(options, "dbpassword") and options.dbpassword is not None:
+ param["password"]=options.dbpassword
+
+ __connection__=kb.getConnection(**param)
+ __connection__.autocommit=options.autocommit
+
+def getConnection(options=None):
+ global __connection__
+
+ if options is not None:
+ initConnection(options)
+
+ assert __connection__ is not None,"database connection is not initialized"
+
+ return __connection__
+
\ No newline at end of file
diff --git a/obitools/obischemas/kb/__init__.py b/obitools/obischemas/kb/__init__.py
new file mode 100644
index 0000000..7d35dcb
--- /dev/null
+++ b/obitools/obischemas/kb/__init__.py
@@ -0,0 +1,55 @@
+"""
+ The kb package manages access to a PostgreSQL database from Python
+ scripts.
+"""
+
+
+class Connection(object):
+
+ def __init__(self):
+ raise RuntimeError('pyROM.KB.Connection is an abstract class')
+
+ def cursor(self):
+ raise RuntimeError('pyROM.KB.Connection.cursor is an abstract function')
+
+ def commit(self):
+ raise RuntimeError('pyROM.KB.Connection.commit is an abstract function')
+
+ def rollback(self):
+ raise RuntimeError('pyROM.KB.Connection.rollback is an abstract function')
+
+ def __call__(self,query):
+ return self.cursor().execute(query)
+
+
+class Cursor(object):
+
+ def __init__(self,db):
+ raise RuntimeError('pyROM.KB.Cursor is an abstract class')
+
+ def execute(self,query):
+ raise RuntimeError('pyROM.KB.Cursor.execute is an abstract function')
+
+ __call__=execute
+
+
+_current_connection = None # Static variable used to store connection to KB
+
+def getConnection(*args,**kargs):
+ """
+    Return a connection to the database.
+    When called from the database backend, no arguments are needed.
+    Repeated calls return the same cached connection unless new
+    connection parameters are supplied.
+ """
+ global _current_connection
+
+    if _current_connection is None or args or kargs:
+ try:
+            from obitools.obischemas.kb import backend
+ _current_connection = backend.Connection()
+ except ImportError:
+            from obitools.obischemas.kb import extern
+ _current_connection = extern.Connection(*args,**kargs)
+ return _current_connection
+
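+# A usage sketch from an external script (a reachable PostgreSQL server is
+# assumed; connection parameters are hypothetical and passed through to the
+# underlying driver):
+#
+#   >>> db = getConnection(database='obischema', host='localhost')
+#   >>> db('SELECT 1 AS one')
+#   [{'one': 1}]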
+
diff --git a/obitools/obischemas/kb/extern.py b/obitools/obischemas/kb/extern.py
new file mode 100644
index 0000000..ce2ff84
--- /dev/null
+++ b/obitools/obischemas/kb/extern.py
@@ -0,0 +1,78 @@
+"""
+Module : KB.extern
+Author : Eric Coissac
+Date : 03/05/2004
+
+Module wrapping the psycopg interface module to allow connection
+to a postgresql database with the same interface from the
+backend and from external scripts.
+
+This module defines a class usable from external scripts.
+"""
+
+
+import psycopg2
+import sys
+from obitools.obischemas import kb
+
+class Connection(kb.Connection):
+
+ def __init__(self,*connectParam,**kconnectParam):
+ if connectParam:
+            self.connectParam={'dsn':connectParam}
+ else:
+ self.connectParam=kconnectParam
+ print self.connectParam
+ self.db = psycopg2.connect(**(self.connectParam))
+
+ def restart(self):
+ ok=1
+ while (ok and ok < 1000):
+ try:
+ self.db = psycopg2.connect(**self.connectParam)
+ except:
+ ok+=1
+ else:
+ ok=0
+
+
+ def cursor(self):
+ curs = Cursor(self.db)
+ if hasattr(self,'autocommit') and self.autocommit:
+ curs.autocommit = self.autocommit
+ return curs
+
+ def commit(self):
+ self.db.commit()
+
+ def rollback(self):
+ if hasattr(self,'db'):
+ self.db.rollback()
+
+ def __del__(self):
+ if hasattr(self,'db'):
+ self.rollback()
+
+class Cursor(kb.Cursor):
+
+ def __init__(self,db):
+ self.db = db
+ self.curs = db.cursor()
+
+ def execute(self,query):
+ try:
+ self.curs.execute(query)
+ if hasattr(self,'autocommit') and self.autocommit:
+ self.db.commit()
+ except psycopg2.ProgrammingError,e:
+ print >>sys.stderr,"===> %s" % query
+ raise e
+ except psycopg2.IntegrityError,e:
+ print >>sys.stderr,"---> %s" % query
+ raise e
+ try:
+ label = [x[0] for x in self.curs.description]
+ return [dict(map(None,label,y))
+ for y in self.curs.fetchall()]
+ except TypeError:
+ return []
diff --git a/obitools/obischemas/options.py b/obitools/obischemas/options.py
new file mode 100644
index 0000000..66f5138
--- /dev/null
+++ b/obitools/obischemas/options.py
@@ -0,0 +1,31 @@
+def addConnectionOptions(optionManager):
+
+ optionManager.add_option('-d','--dbname',
+ action="store", dest="dbname",
+ metavar="",
+ type="string",
+ help="OBISchema database name containing"
+ "taxonomical data")
+
+ optionManager.add_option('-H','--host',
+ action="store", dest="dbhost",
+ metavar="",
+ type="string",
+ help="host hosting OBISchema database")
+
+ optionManager.add_option('-U','--user',
+ action="store", dest="dbuser",
+ metavar="",
+ type="string",
+ help="user for OBISchema database connection")
+
+ optionManager.add_option('-W','--password',
+ action="store", dest="dbpassword",
+ metavar="",
+ type="string",
+ help="password for OBISchema database connection")
+
+ optionManager.add_option('-A','--autocommit',
+ action="store_true",dest="autocommit",
+ default=False,
+ help="add commit action after each query")
\ No newline at end of file
diff --git a/obitools/obo/__init__.py b/obitools/obo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/obo/go/__init__.py b/obitools/obo/go/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/obo/go/parser.py b/obitools/obo/go/parser.py
new file mode 100644
index 0000000..6902974
--- /dev/null
+++ b/obitools/obo/go/parser.py
@@ -0,0 +1,53 @@
+from obitools.obo.parser import OBOTerm
+from obitools.obo.parser import OBOEntry
+from obitools.obo.parser import stanzaIterator
+from logging import debug
+
+class GOEntry(OBOEntry):
+ '''
+ An entry of a GeneOntology .obo file. It can be a header (without a stanza name) or
+ a stanza (with a stanza name between brackets). It inherits from the class dict.
+ '''
+
+
+class GOTerm(OBOTerm):
+
+ '''
+ A stanza named 'Term'. It inherits from the class OBOTerm.
+ '''
+
+ def __init__(self,stanza):
+
+        ## use of the OBOTerm constructor.
+ OBOTerm.__init__(self, stanza)
+
+        assert 'namespace' in self and len(self['namespace'])==1, "An OBOTerm must belong to one of the cellular_component, molecular_function or biological_process namespaces"
+
+
+def GOEntryFactory(stanza):
+ '''
+ Dispatcher of stanza.
+
+ @param stanza: a stanza composed of several lines.
+ @type stanza: text
+
+ @return: an C{OBOTerm} | C{OBOEntry} instance
+
+    @note: The dispatcher treats OBO "Term" stanzas differently from
+    the others.
+ '''
+
+ stanzaType = OBOEntry.parseStanzaName(stanza)
+
+ if stanzaType=="Term":
+ return GOTerm(stanza)
+ else:
+ return OBOEntry(stanza)
+
+
+def GOEntryIterator(file):
+ entries = stanzaIterator(file)
+ for e in entries:
+ debug(e)
+ yield GOEntryFactory(e)
+
diff --git a/obitools/obo/parser.py b/obitools/obo/parser.py
new file mode 100644
index 0000000..f6f05f3
--- /dev/null
+++ b/obitools/obo/parser.py
@@ -0,0 +1,707 @@
+from obitools.utils import skipWhiteLineIterator,multiLineWrapper
+from obitools.utils import universalOpen
+from obitools.format.genericparser import genericEntryIteratorGenerator
+from logging import debug,warning
+
+import re
+
+
+#################################################################################
+## Stanza preparation area ##
+#################################################################################
+
+
+class FileFormatError(Exception):
+ '''
+ An error derived from the class Exception.
+ '''
+ pass
+
+_oboEntryIterator = genericEntryIteratorGenerator(endEntry='^ *$',
+ strip=True)
+
+def stanzaIterator(inputfile):
+ '''
+    Iterator of stanzas. Stanzas are the basic units of OBO files.
+
+ @param inputfile: a stream of strings from an opened OBO file.
+ @type inputfile: a stream of strings
+
+ @return: a stream of stanza
+ @rtype: a stream of aggregated strings
+
+    @note: The iterator constructs stanzas by aggregating strings from the
+    OBO file.
+ '''
+ inputfile = universalOpen(inputfile)
+ inputfile = multiLineWrapper(inputfile)
+ return _oboEntryIterator(inputfile)
+
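+# A usage sketch (the .obo file path is hypothetical; universalOpen also
+# accepts an already opened stream):
+#
+#   >>> for stanza in stanzaIterator('gene_ontology.obo'):
+#   ...     print OBOEntry.parseStanzaName(stanza)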
+
+
+#################################################################################
+## Trailing Modifiers treatment area ##
+#################################################################################
+
+
+class TrailingModifier(dict):
+ '''
+ A class object which inherits from the class dict. Trailing modifiers can be found
+ at the end of TaggedValue objects when they exist.
+ '''
+
+ _match_brace = re.compile('(?<=\ {)[^\]]*(\}) *( !|$)')
+
+ def __init__(self,string):
+
+ ## search for trailing modifiers signals
+ trailing_modifiers = TrailingModifier._match_brace.search(string)
+
+ ## the trailing modifiers exist
+ if trailing_modifiers:
+ trailing_modifiers=trailing_modifiers.group(0).strip()
+            debug(trailing_modifiers)
+ ## creates and feeds the dictionary of trailing modifiers
+ dict.__init__(self,(x.strip().split('=',1) for x in trailing_modifiers.split(',')))
+
+
+def trailingModifierFactory(string):
+ '''
+ Dispatcher of trailing modifiers.
+
+ @param string: a string from a TaggedValue object with a trailing modifiers signal.
+ @type string: string
+
+ @return: a class object
+
+ @note: The dispatcher is currently very simple. Only one case is treated by the function.
+    The function returns a class object inherited from the class dict if the trailing modifiers
+ exist, None if they don't.
+ '''
+
+ trailing_modifiers = TrailingModifier(string)
+ if not trailing_modifiers:
+ trailing_modifiers=None
+ return trailing_modifiers
+
+
+#################################################################################
+## TaggedValue treatment area ##
+#################################################################################
+
+
+class TaggedValue(object):
+ '''
+ A couple 'tag:value' of an OBOEntry.
+ '''
+
+ _match_value = re.compile('(("(\\\\"|[^\"])*")|(\\\\"|[^\"]))*?( !| {|$)')
+ _split_comment = re.compile('^!| !')
+ _match_quotedString = re.compile('(?<=")(\\\\"|[^\"])*(?=")')
+ _match_bracket = re.compile('\[[^\]]*\]')
+
+ def __init__(self,line):
+ '''
+ Constructor of the class TaggedValue.
+
+ @param line: a line of an OBOEntry composed of a tag and a value.
+ @type line: string
+
+ @note: The constructor separates tags from right terms. 'value' is extracted
+ from right terms using a regular expression (value is at the beginning of the
+ string, between quotes or not). Then, 'comment' is extracted from the rest of the
+ string using another regular expression ('comment' is at the end of the string
+ after a '!'. By default, 'comment' is set to None). Finally, 'trailing_modifiers'
+ are extracted from the last string using another regular expression.
+ The tag, the value, the comment and the trailing_modifiers are saved.
+ '''
+
+ debug("tagValueParser : %s" % line)
+
+ ## by default :
+ trailing_modifiers = None
+ comment = None
+
+ ## the tag is saved. 'right' is composed of the value, the comment and the trailing modifiers
+        tag,right = line.split(':',1)
+
+ ## the value is saved
+        value = TaggedValue._match_value.search(right).group(0)
+ debug("Extracted value : %s" % value)
+
+ ## if there is a value AND a sign of a comment or trailing modifiers
+ if value and value[-1] in '!{':
+ lvalue = len(value)
+ ## whatever it is a comment or trailing modifiers, it is saved into 'extra'
+            extra = right[lvalue-1:].strip()
+ ## a comment is extracted
+ extra =TaggedValue._split_comment.split(extra,1)
+ ## and saved if it exists
+ if len(extra)==2:
+ comment=extra[1].strip()
+ ## trailing modifiers are extracted
+ extra=extra[0]
+ trailing_modifiers = trailingModifierFactory(extra)
+ ## the value is cleaned of any comment or trailing modifiers signals
+ value = value[0:-1]
+
+ if tag=='use_term':
+ tag='consider'
+ raise DeprecationWarning,"user_term is a deprecated tag, you should instead use consider"
+
+ ## recording zone
+ self.value =value.strip()
+ self.tag = tag
+ self.__doc__=comment
+ self.trailing_modifiers=trailing_modifiers
+
+ def __str__(self):
+ return str(self.value)
+
+ def __repr__(self):
+ return '''"""%s"""''' % str(self)
+
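+# A sketch: the parser splits a raw line into tag, value, and an optional
+# comment (kept as __doc__):
+#
+#   >>> tv = TaggedValue('created_by: jane ! curator')
+#   >>> tv.tag, tv.value, tv.__doc__
+#   ('created_by', 'jane', 'curator')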
+
+class NameValue(TaggedValue):
+ '''
+ A couple 'name:value' inherited from the class TaggedValue. Used to manage name tags.
+ '''
+
+ def __init__(self,line):
+
+ ## no use of the TaggedValue constructor. The NameValue is very simple.
+        tag,right = line.split(':',1)
+
+ ## recording zone
+        self.value = right.strip()
+ self.tag = 'name'
+ self.__doc__=None
+ self.trailing_modifiers=None
+
+
+
+class DefValue(TaggedValue):
+ '''
+ A couple 'def:value' inherited from the class TaggedValue. Used to manage def tags.
+ '''
+
+ def __init__(self,line):
+ '''
+ Constructor of the class DefValue.
+
+ @param line: a line of an OBOEntry composed of a tag named 'def' and a value.
+ @type line: string
+
+ @note: The constructor calls the TaggedValue constructor. A regular expression
+        is used to extract the 'definition' from TaggedValue.value (the definition is the
+        quoted part of TaggedValue.value). A regular expression is used to extract 'dbxrefs'
+        from the TaggedValue.value without the definition (dbxrefs are between brackets,
+        and the definition can be so too). Definition is saved as the new value of the DefValue.
+ dbxrefs are saved.
+ '''
+
+ ## use of the TaggedValue constructor
+ TaggedValue.__init__(self, line)
+
+ ## definition, which is quoted, is extracted from the standard value of a TaggedValue.
+ definition = TaggedValue._match_quotedString.search(self.value).group(0)
+
+ ## the standard value is cleaned of the definition.
+ cleanvalue = self.value.replace(definition,'')
+        cleanvalue = cleanvalue.replace('  ',' ')
+
+ ## dbxrefs are searched into the rest of the standard value.
+ dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0)
+
+ ## recording zone
+ self.tag = 'def'
+ ## the value of a DefValue is not the standard value but the definition.
+ self.value=definition
+ self.dbxrefs=xrefFactory(dbxrefs)
+
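+# A sketch: the quoted definition becomes the value; bracketed dbxrefs are
+# parsed into Xref objects:
+#
+#   >>> d = DefValue('def: "A definition." [GOC:go_curators]')
+#   >>> str(d), d.dbxrefs[0].reference
+#   ('A definition.', 'GOC:go_curators')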
+
+class SynonymValue(TaggedValue):
+ '''
+ A couple 'synonym:value' inherited from the class TaggedValue. Used to manage
+ synonym tags, exact_synonym tags, broad_synonym tags and narrow_synonym tags.
+ '''
+
+ _match_scope = re.compile('(?<="")[^\[]*(?=\[|$)')
+
+ def __init__(self,line):
+ '''
+ Constructor of the class SynonymValue.
+
+ @param line: a line of an OBOEntry composed of a tag named 'synonym' or
+ 'exact_synonym' or 'broad_synonym' or 'narrow_synonym' and a value.
+ @type line: string
+
+ @note: SynonymValue is composed of a tag, a value, a scope, a list of types and
+ dbxrefs.
+ The constructor calls the TaggedValue constructor. A regular expression
+        is used to extract 'definition' from TaggedValue.value (the definition is the
+        quoted part of TaggedValue.value). Definition is saved as the new value of the class
+ SynonymValue.
+ A regular expression is used to extract 'attributes' from the rest of the
+ string. Attributes may contain an optional synonym scope and an optional list
+ of synonym types. The scope is extracted from attributes or set by default to
+ 'RELATED'. It is saved as the scope of the class. The types are the rest of the
+ attributes and are saved as the list of types of the class.
+ For deprecated tags 'exact_synonym', 'broad_synonym' and 'narrow_synonym', tag
+ is set to 'synonym' and scope is set respectively to 'EXACT', 'BROAD' and 'NARROW'.
+ A regular expression is used to extract 'dbxrefs' from the TaggedValue.value
+ without the definition (dbxrefs are between brackets and definition can be so).
+ dbxrefs are saved.
+ '''
+
+ ## use of the TaggedValue constructor
+ TaggedValue.__init__(self, line)
+
+ ## definition, which is quoted, is extracted from the standard value of a TaggedValue.
+ definition = TaggedValue._match_quotedString.search(self.value).group(0)
+
+ ## the standard value is cleaned of the definition.
+ cleanvalue = self.value.replace(definition,'')
+        cleanvalue = cleanvalue.replace('  ',' ')
+
+ ## 1) attributes are searched into the rest of the standard value.
+ ## 2) then they are stripped.
+ ## 3) then they are split on every ' '.
+ ## 4) finally they are ordered into a set.
+ attributes = set(SynonymValue._match_scope.search(cleanvalue).group(0).strip().split())
+
+ ## the scopes are the junction between the attributes and a set of specific terms.
+ scopes = attributes & set(['RELATED','EXACT','BROAD','NARROW'])
+
+ ## the types are the rest of the attributes.
+ types = attributes - scopes
+
+ ## this is a constraint of the OBO format
+ assert len(scopes)< 2,"Only one synonym scope allowed"
+
+ ## the scope of the SynonymValue is into scopes or set by default to RELATED
+ if scopes:
+ scope = scopes.pop()
+ else:
+ scope = 'RELATED'
+
+ ## Specific rules are defined for the following tags :
+ if self.tag == 'exact_synonym':
+            raise DeprecationWarning,'exact_synonym is a deprecated tag, use the synonym tag instead'
+ self.tag = 'synonym'
+ scope = 'EXACT'
+
+ if self.tag == 'broad_synonym':
+            raise DeprecationWarning,'broad_synonym is a deprecated tag, use the synonym tag instead'
+ self.tag = 'synonym'
+ scope = 'BROAD'
+
+ if self.tag == 'narrow_synonym':
+            raise DeprecationWarning,'narrow_synonym is a deprecated tag, use the synonym tag instead'
+ self.tag = 'synonym'
+ scope = 'NARROW'
+
+ if self.tag == 'systematic_synonym':
+            #raise DeprecationWarning,'systematic_synonym is a deprecated tag, use the synonym tag instead'
+ self.tag = 'synonym'
+ scope = 'SYSTEMATIC'
+
+ ## this is our own constraint. deprecated tags are not saved by this parser.
+ assert self.tag =='synonym',"%s synonym type is not managed" % self.tag
+
+ ## dbxrefs are searched into the rest of the standard value.
+ dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0)
+
+ ## recording zone
+ ## the value of a SynonymValue is not the standard value but the definition.
+ self.value = definition
+ self.dbxrefs = xrefFactory(dbxrefs)
+ self.scope = scope
+ self.types = list(types)
+
+ def __eq__(self,b):
+ return ((self.value==b.value) and (self.dbxrefs==b.dbxrefs)
+ and (self.scope==b.scope) and (self.types==b.types)
+ and (self.__doc__==b.__doc__) and (self.tag==b.tag)
+ and (self.trailing_modifiers==b.trailing_modifiers))
+
+ def __hash__(self):
+ return (reduce(lambda x,y:x+y,(hash(z) for z in [self.__doc__,
+ self.value,
+ frozenset(self.dbxrefs),
+ self.scope,
+ frozenset(self.types),
+ self.tag,
+ self.trailing_modifiers]),0)) % (2**31)
+
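+# A sketch: an explicit scope is recognised among the attributes:
+#
+#   >>> s = SynonymValue('synonym: "crista" EXACT [GOC:mah]')
+#   >>> str(s), s.scope
+#   ('crista', 'EXACT')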
+
+class XrefValue(TaggedValue):
+ '''
+ A couple 'xref:value' inherited from the class TaggedValue. Used to manage
+ xref tags.
+ '''
+
+ def __init__(self,line):
+
+ ## use of the TaggedValue constructor
+ TaggedValue.__init__(self, line)
+
+ ## use the same function as the dbxrefs
+ self.value=xrefFactory(self.value)
+
+ if self.tag in ('xref_analog','xref_unk'):
+            raise DeprecationWarning,'%s is a deprecated tag, use the xref tag instead' % self.tag
+ self.tag='xref'
+
+ ## this is our own constraint. deprecated tags are not saved by this parser.
+ assert self.tag=='xref'
+
+
+class RelationshipValue(TaggedValue):
+ '''
+    A couple 'relationship:value' inherited from the class TaggedValue. Used to manage
+    relationship, intersection_of and union_of tags.
+ '''
+
+ def __init__(self,line):
+
+ ## use of the TaggedValue constructor
+ TaggedValue.__init__(self, line)
+
+ ## the value is split on the first ' '.
+ value = self.value.split(None,1)
+
+        ## successful split!
+ if len(value)==2:
+ relationship=value[0]
+ term=value[1]
+        ## unsuccessful split: the relationship is set by default to 'is_a'
+ else:
+ relationship='is_a'
+ term=value[0]
+
+ ## recording zone
+ self.value=term
+ self.relationship=relationship
+
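+# A sketch: the value splits into a relationship type and a term id:
+#
+#   >>> r = RelationshipValue('relationship: part_of GO:0005575')
+#   >>> r.relationship, r.value
+#   ('part_of', 'GO:0005575')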
+
+class NamespaceValue(TaggedValue):
+ def __init__(self,line):
+ TaggedValue.__init__(self, line)
+
+class RemarkValue(TaggedValue):
+ def __init__(self,line):
+ TaggedValue.__init__(self, line)
+ label,value = self.value.split(':',1)
+ label = label.strip()
+ value = value.strip()
+ self.value=value
+ self.label=label
+
+
+def taggedValueFactory(line):
+ '''
+ A function used to dispatch lines of an OBOEntry between the class TaggedValue
+ and its inherited classes.
+
+ @param line: a line of an OBOEntry composed of a tag and a value.
+ @type line: string
+
+ @return: a class object
+ '''
+
+ if (line[0:9]=='namespace' or
+ line[0:17]=='default-namespace'):
+ return NamespaceValue(line)
+ ## DefValue is an inherited class of TaggedValue
+ elif line[0:3]=='def':
+ return DefValue(line)
+ ## SynonymValue is an inherited class of TaggedValue
+ elif ((line[0:7]=="synonym" and line[0:14]!="synonymtypedef") or
+ line[0:13]=="exact_synonym" or
+ line[0:13]=="broad_synonym" or
+ line[0:14]=="narrow_synonym"):
+ return SynonymValue(line)
+ ## XrefValue is an inherited class of TaggedValue
+ elif line[0:4]=='xref':
+ return XrefValue(line)
+ ## NameValue is an inherited class of TaggedValue
+ elif line[0:4]=='name':
+ return NameValue(line)
+ ## RelationshipValue is an inherited class of TaggedValue
+ elif (line[0:15]=='intersection_of' or
+ line[0:8] =='union_of' or
+ line[0:12]=='relationship'):
+ return RelationshipValue(line)
+ elif (line[0:6]=='remark'):
+ return RemarkValue(line)
+ ## each line is a couple : tag / value (and some more features)
+ else:
+ return TaggedValue(line)
+
+
+#################################################################################
+## Xref treatment area ##
+#################################################################################
+
+
+
+class Xref(object):
+ '''
+ A xref object of an OBOentry. It may be the 'dbxrefs' of SynonymValue and
+ DefValue objects or the 'value' of XrefValue objects.
+ '''
+
+ __splitdata__ = re.compile(' +(?=["{])')
+
+ def __init__(self,ref):
+        if ref == '':
+            ref = None
+            data = ''
+        else: # fix (JJ): otherwise a 'list index out of range' error is raised
+            data = Xref.__splitdata__.split(ref,1)
+            ref = data[0]
+ description=None
+ trailing_modifiers = None
+ if len(data)> 1:
+ extra = data[1]
+ description = TaggedValue._match_quotedString.search(extra)
+ if description is not None:
+ description = description.group(0)
+                    extra = extra.replace(description,'')
+ trailing_modifiers=trailingModifierFactory(extra)
+ self.reference=ref
+ self.description=description
+ self.trailing_modifiers=trailing_modifiers
+
+ def __eq__(self,b):
+ return ((self.reference==b.reference) and (self.description==b.description)
+ and (self.trailing_modifiers==b.trailing_modifiers))
+
+ def __hash__(self):
+ return (reduce(lambda x,y:x+y,(hash(z) for z in [self.reference,
+ self.description,
+ self.trailing_modifiers]),0)) % (2**31)
+
+
+def xrefFactory(string):
+ '''
+ Dispatcher of xrefs.
+
+ @param string: a string (between brackets) from an inherited TaggedValue object with a dbxrefs
+                   signal (actually, the signal can only be found in SynonymValue and DefValue
+ objects) or a string (without brackets) from a XrefValue object.
+ @type string: string
+
+ @return: a class object
+
+ @note: The dispatcher treats differently the strings between brackets (from SynonymValue and
+ DefValue objects) and without brackets (from XrefValue objects).
+ '''
+
+ string = string.strip()
+ if string[0]=='[':
+ return [Xref(x.strip()) for x in string[1:-1].split(',')]
+ else:
+ return Xref(string)
+
+
+#################################################################################
+## Stanza treatment area ##
+#################################################################################
+
+
+class OBOEntry(dict):
+ '''
+ An entry of an OBOFile. It can be a header (without a stanza name) or
+ a stanza (with a stanza name between brackets). It inherits from the class dict.
+ '''
+ _match_stanza_name = re.compile('(?<=^\[)[^\]]*(?=\])')
+
+ def __init__(self,stanza):
+ ## tests if it is the header of the OBO file (returns TRUE) or not (returns FALSE)
+ self.isHeader = stanza[0]!='['
+ lines = stanza.split('\n')
+ ## not the header : there is a [stanzaName]
+ if not self.isHeader:
+ self.stanzaName = lines[0].strip()[1:-1]
+ lines=lines[1:]
+ self["stanza"] = [stanza.strip()]
+
+ ## whatever the stanza is.
+ for line in lines:
+ ## each line is a couple : tag / value
+ taggedvalue = taggedValueFactory(line)
+ if taggedvalue.tag in self:
+ self[taggedvalue.tag].append(taggedvalue)
+ else:
+ self[taggedvalue.tag]=[taggedvalue]
+
+
+ def parseStanzaName(stanza):
+ sm = OBOEntry._match_stanza_name.search(stanza)
+ if sm:
+ return sm.group(0)
+ else:
+ return None
+
+ parseStanzaName=staticmethod(parseStanzaName)
+
+
+
+class OBOTerm(OBOEntry):
+ '''
+ A stanza named 'Term'. It inherits from the class OBOEntry.
+ '''
+ def __init__(self,stanza):
+
+ ## use of the OBOEntry constructor.
+ OBOEntry.__init__(self, stanza)
+
+ assert self.stanzaName=='Term'
+ assert 'stanza' in self
+ assert 'id' in self and len(self['id'])==1,"An OBOTerm must have an id"
+ assert 'name' in self and len(self['name'])==1,"An OBOTerm must have a name"
+ assert 'namespace' not in self or len(self['namespace'])==1, "Only one namespace is allowed for an OBO term"
+
+ assert 'def' not in self or len(self['def'])==1,"Only one definition is allowed for an OBO term"
+ assert 'comment' not in self or len(self['comment'])==1,"Only one comment is allowed for an OBO term"
+
+        assert 'union_of' not in self or len(self['union_of'])>=2,"A union_of relationship must involve at least two terms"
+        assert 'intersection_of' not in self or len(self['intersection_of'])>=2,"An intersection_of relationship must involve at least two terms"
+
+ if self._isObsolete():
+ #assert 'is_a' not in self
+ assert 'relationship' not in self
+ assert 'inverse_of' not in self
+ assert 'disjoint_from' not in self
+ assert 'union_of' not in self
+ assert 'intersection_of' not in self
+
+ assert 'replaced_by' not in self or self._isObsolete()
+ assert 'consider' not in self or self._isObsolete()
+
+ def _getStanza(self):
+ return self['stanza'][0]
+
+ ## make-up functions.
+ def _getDefinition(self):
+ if 'def' in self:
+ return self['def'][0]
+ return None
+
+ def _getId(self):
+ return self['id'][0]
+
+ def _getNamespace(self):
+ return self['namespace'][0]
+
+ def _getName(self):
+ return self['name'][0]
+
+ def _getComment(self):
+ if 'comment' in self:
+ return self['comment'][0]
+ return None
+
+ def _getAltIds(self):
+ if 'alt_id' in self:
+ return list(set(self.get('alt_id',None)))
+ return None
+
+ def _getIsA(self):
+ if 'is_a' in self:
+ return list(set(self.get('is_a',None)))
+ return None
+
+ def _getSynonym(self):
+ if 'synonym' in self :
+ return list(set(self.get('synonym',None)))
+ return None
+
+ def _getSubset(self):
+        if self.get('subset',None) is not None:
+ return list(set(self.get('subset',None)))
+ else:
+ return None
+
+ def _getXref(self):
+ if 'xref' in self:
+ return list(set(self.get('xref',None)))
+ return None
+
+ def _getRelationShip(self):
+ if 'relationship' in self:
+ return list(set(self.get('relationship',None)))
+ return None
+
+    def _getUnion(self):
+        if 'union_of' in self:
+            return list(set(self.get('union_of',None)))
+        return None
+
+    def _getIntersection(self):
+        if 'intersection_of' in self:
+            return list(set(self.get('intersection_of',None)))
+        return None
+
+    def _getDisjonction(self):
+        if 'disjoint_from' in self:
+            return list(set(self.get('disjoint_from',None)))
+        return None
+
+ def _isObsolete(self):
+ return 'is_obsolete' in self and str(self['is_obsolete'][0])=='true'
+
+ def _getReplacedBy(self):
+ if 'replaced_by' in self:
+ return list(set(self.get('replaced_by',None)))
+ return None
+
+ def _getConsider(self):
+ if 'consider' in self:
+ return list(set(self.get('consider',None)))
+ return None
+
+ ## automatically make-up !
+ stanza = property(_getStanza,None,None)
+ definition = property(_getDefinition,None,None)
+ id = property(_getId,None,None)
+ namespace = property(_getNamespace,None,None)
+ name = property(_getName,None,None)
+ comment = property(_getComment,None,None)
+ alt_ids = property(_getAltIds,None,None)
+ is_a = property(_getIsA,None,None)
+ synonyms = property(_getSynonym,None,None)
+ subsets = property(_getSubset,None,None)
+ xrefs = property(_getXref,None,None)
+ relationship = property(_getRelationShip,None,None)
+ union_of = property(_getUnion,None,None)
+ intersection_of = property(_getIntersection,None,None)
+ disjoint_from = property(_getDisjonction,None,None)
+ is_obsolete = property(_isObsolete,None,None)
+ replaced_by = property(_getReplacedBy,None,None)
+ consider = property(_getConsider,None,None)
+
+
+def OBOEntryFactory(stanza):
+    '''
+    Dispatcher of stanzas.
+
+    @param stanza: a stanza composed of several lines.
+    @type stanza: text
+
+    @return: an C{OBOTerm} | C{OBOEntry} instance
+
+    @note: The dispatcher treats OBO "Term" stanzas differently
+           from the other stanza types.
+    '''
+
+ stanzaType = OBOEntry.parseStanzaName(stanza)
+
+ if stanzaType=="Term":
+ return OBOTerm(stanza)
+ else:
+ return OBOEntry(stanza)
+
+def OBOEntryIterator(file):
+ entries = stanzaIterator(file)
+ for e in entries:
+ debug(e)
+ yield OBOEntryFactory(e)
+
+
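+if __name__ == '__main__':
+    # Hedged usage sketch (not part of the original module), assuming
+    # stanzaIterator accepts a file name like the other obitools readers:
+    # print the id and name of every Term stanza of an OBO file.
+    import sys
+    for entry in OBOEntryIterator(sys.argv[1]):
+        if isinstance(entry, OBOTerm):
+            print entry.id, entry.name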
\ No newline at end of file
diff --git a/obitools/options/__init__.py b/obitools/options/__init__.py
new file mode 100644
index 0000000..d6793d6
--- /dev/null
+++ b/obitools/options/__init__.py
@@ -0,0 +1,137 @@
+"""
+ Module providing high level functions to manage command line options.
+"""
+import logging
+import sys
+
+from logging import debug
+
+from optparse import OptionParser
+
+from obitools.utils import universalOpen
+from obitools.utils import fileSize
+from obitools.utils import universalTell
+from obitools.utils import progressBar
+from obitools.format.options import addInputFormatOption, addInOutputOption,\
+ autoEntriesIterator
+import time
+
+
+
+def getOptionManager(optionDefinitions,entryIterator=None,progdoc=None):
+    '''
+    Build an option manager function able to parse
+    the command line options of the script.
+
+    @param optionDefinitions: list of functions, each describing a set of
+                              options. Each function must accept as its
+                              unique parameter an instance of OptionParser.
+    @type optionDefinitions: list of functions.
+
+    @param entryIterator: an iterator generator function returning
+                          entries from the data files.
+
+    @type entryIterator: an iterator generator function with only one
+                         parameter of type file
+    '''
+ parser = OptionParser(progdoc)
+ parser.add_option('--DEBUG',
+ action="store_true", dest="debug",
+ default=False,
+ help="Set logging in debug mode")
+
+ parser.add_option('--no-psyco',
+ action="store_true", dest="noPsyco",
+ default=False,
+ help="Don't use psyco even if it installed")
+
+ parser.add_option('--without-progress-bar',
+ action="store_false", dest="progressbar",
+ default=True,
+ help="desactivate progress bar")
+
+ checkFormat=False
+ for f in optionDefinitions:
+ if f == addInputFormatOption or f == addInOutputOption:
+ checkFormat=True
+ f(parser)
+
+ def commandLineAnalyzer():
+ options,files = parser.parse_args()
+ if options.debug:
+ logging.root.setLevel(logging.DEBUG)
+
+ if checkFormat:
+ ei=autoEntriesIterator(options)
+ else:
+ ei=entryIterator
+
+ i = allEntryIterator(files,ei,with_progress=options.progressbar)
+ return options,i
+
+ return commandLineAnalyzer
+
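+# Example (hedged sketch, not part of the original module): a script
+# typically combines option definition functions with this manager:
+#
+#     from obitools.format.options import addInOutputOption
+#
+#     optionParser = getOptionManager([addInOutputOption])
+#     (options, entries) = optionParser()
+#     for entry in entries:
+#         ...           # process each sequence entry
+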
+_currentInputFileName=None
+_currentFile = None
+_currentFileSize = None
+
+def currentInputFileName():
+ return _currentInputFileName
+
+def currentInputFile():
+ return _currentFile
+
+def currentFileSize():
+ return _currentFileSize
+
+def currentFileTell():
+ return universalTell(_currentFile)
+
+def fileWithProgressBar(file,step=100):
+ try:
+ size = currentFileSize()
+ except:
+ size = None
+
+ def fileBar():
+ pos=1
+ progressBar(pos, size, True,currentInputFileName())
+ for l in file:
+ progressBar(currentFileTell,size,head=currentInputFileName())
+ yield l
+ print >>sys.stderr,''
+ if size is None:
+ return file
+ else:
+ f = fileBar()
+ return f
+
+
+def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102):
+ global _currentFile
+ global _currentInputFileName
+ global _currentFileSize
+ if files :
+ for f in files:
+ _currentInputFileName=f
+ f = universalOpen(f)
+ _currentFile=f
+ _currentFileSize=fileSize(_currentFile)
+ debug(f)
+ if with_progress:
+ f=fileWithProgressBar(f,step=histo_step)
+ if entryIterator is None:
+ for line in f:
+ yield line
+ else:
+ for entry in entryIterator(f):
+ yield entry
+ else:
+ if entryIterator is None:
+ for line in sys.stdin:
+ yield line
+ else:
+ for entry in entryIterator(sys.stdin):
+ yield entry
+
+
\ No newline at end of file
diff --git a/obitools/options/bioseqcutter.py b/obitools/options/bioseqcutter.py
new file mode 100644
index 0000000..77189af
--- /dev/null
+++ b/obitools/options/bioseqcutter.py
@@ -0,0 +1,85 @@
+from logging import debug
+
+def _beginOptionCallback(options,opt,value,parser):
+ def beginCutPosition(seq):
+ debug("begin = %s" % value )
+        if hasattr(parser.values, 'taxonomy') and parser.values.taxonomy is not None:
+            environ = {'taxonomy' : parser.values.taxonomy,'sequence':seq}
+ else:
+ environ = {'sequence':seq}
+
+ return eval(value,environ,seq) - 1
+
+ parser.values.beginCutPosition=beginCutPosition
+
+def _endOptionCallback(options,opt,value,parser):
+ def endCutPosition(seq):
+        if hasattr(parser.values, 'taxonomy') and parser.values.taxonomy is not None:
+            environ = {'taxonomy' : parser.values.taxonomy,'sequence':seq}
+ else:
+ environ = {'sequence':seq}
+
+ return eval(value,environ,seq)
+
+ parser.values.endCutPosition=endCutPosition
+
+
+
+
+def addSequenceCuttingOptions(optionManager):
+
+    optionManager.add_option('-b','--begin',
+                             action="callback", callback=_beginOptionCallback,
+                             metavar="<PYTHON_EXPRESSION>",
+                             type="string",
+                             help="python expression to be evaluated in the "
+                                  "sequence context. The attribute name can be "
+                                  "used in the expression as a variable name. "
+                                  "An extra variable named 'sequence' refers "
+                                  "to the sequence object itself. ")
+
+    optionManager.add_option('-e','--end',
+                             action="callback", callback=_endOptionCallback,
+                             metavar="<PYTHON_EXPRESSION>",
+                             type="string",
+                             help="python expression to be evaluated in the "
+                                  "sequence context. The attribute name can be "
+                                  "used in the expression as a variable name. "
+                                  "An extra variable named 'sequence' refers "
+                                  "to the sequence object itself. ")
+
+
+def cutterGenerator(options):
+
+ def sequenceCutter(seq):
+
+ lseq = len(seq)
+
+ if hasattr(options, 'beginCutPosition'):
+ begin = int(options.beginCutPosition(seq))
+ else:
+ begin = 0
+
+ if hasattr(options, 'endCutPosition'):
+ end = int(options.endCutPosition(seq))
+ else:
+ end = lseq
+
+ if begin > 0 or end < lseq:
+ seq = seq[begin:end]
+ seq['subsequence']="%d..%d" % (begin+1,end)
+
+ return seq
+
+ return sequenceCutter
+
+def cutterIteratorGenerator(options):
+ _cutter = cutterGenerator(options)
+
+ def sequenceCutterIterator(seqIterator):
+ for seq in seqIterator:
+ yield _cutter(seq)
+
+ return sequenceCutterIterator
+
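+# Example (hedged sketch, not part of the original module): with
+# "-b 3 -e 10" on the command line, the callbacks above attach
+# beginCutPosition/endCutPosition functions to the parsed options, and
+# cutterGenerator(options) then maps each sequence to seq[2:10],
+# annotated with seq['subsequence'] = "3..10".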
+
diff --git a/obitools/options/bioseqedittag.py b/obitools/options/bioseqedittag.py
new file mode 100644
index 0000000..6eb1c36
--- /dev/null
+++ b/obitools/options/bioseqedittag.py
@@ -0,0 +1,237 @@
+import sys
+from obitools.options.taxonomyfilter import loadTaxonomyDatabase
+def addSequenceEditTagOptions(optionManager):
+
+ optionManager.add_option('--rank',
+ action="store_true", dest='addrank',
+ default=False,
+ help="add a rank attribute to the sequence "
+ "indicating the sequence position in the input data")
+
+ optionManager.add_option('-R','--rename-tag',
+ action="append",
+ dest='renameTags',
+ metavar="",
+ type="string",
+ default=[],
+ help="change tag name from OLD_NAME to NEW_NAME")
+
+ optionManager.add_option('--delete-tag',
+ action="append",
+ dest='deleteTags',
+ metavar="",
+ type="string",
+ default=[],
+ help="delete tag TAG_NAME")
+
+ optionManager.add_option('-S','--set-tag',
+ action="append",
+ dest='setTags',
+ metavar="",
+ type="string",
+ default=[],
+ help="Add a new tag named TAG_NAME with "
+ "a value computed from PYTHON_EXPRESSION")
+
+ optionManager.add_option('--set-identifier',
+ action="store",
+ dest='setIdentifier',
+ metavar="",
+ type="string",
+ default=None,
+ help="Set sequence identifier with "
+ "a value computed from PYTHON_EXPRESSION")
+
+ optionManager.add_option('--set-sequence',
+ action="store",
+ dest='setSequence',
+ metavar="",
+ type="string",
+ default=None,
+ help="Change the sequence itself with "
+ "a value computed from PYTHON_EXPRESSION")
+
+ optionManager.add_option('-T','--set-definition',
+ action="store",
+ dest='setDefinition',
+ metavar="",
+ type="string",
+ default=None,
+ help="Set sequence definition with "
+ "a value computed from PYTHON_EXPRESSION")
+
+ optionManager.add_option('-O','--only-valid-python',
+ action="store_true",
+ dest='onlyValid',
+ default=False,
+ help="only valid python expressions are allowed")
+
+ optionManager.add_option('-C','--clear',
+ action="store_true",
+ dest='clear',
+ default=False,
+ help="clear all tags associated to the sequences")
+
+ optionManager.add_option('-k','--keep',
+ action='append',
+ dest='keep',
+ default=[],
+ type="string",
+ help="only keep this tag")
+
+ optionManager.add_option('--length',
+ action="store_true",
+ dest='length',
+ default=False,
+ help="add seqLength tag with sequence length")
+
+ optionManager.add_option('--with-taxon-at-rank',
+ action='append',
+ dest='taxonrank',
+ default=[],
+ type="string",
+ help="add taxonomy annotation at a speciefied rank level")
+
+ optionManager.add_option('-m','--mcl',
+ action="store", dest="mcl",
+ metavar="",
+ type="string",
+ default=None,
+ help="split following mcl graph clustering partition")
+
+
+def readMCLFile(file):
+ partition=1
+ parts = {}
+ for l in file:
+ for seq in l.strip().split():
+ parts[seq]=partition
+ partition+=1
+ return parts
+
+
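+# Example (hedged sketch): an MCL dump file holds one cluster per line as
+# whitespace-separated sequence identifiers; for a file containing
+#
+#     seqA seqB
+#     seqC
+#
+# readMCLFile returns {'seqA': 1, 'seqB': 1, 'seqC': 2}.
+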
+
+
+def sequenceTaggerGenerator(options):
+ toDelete = options.deleteTags[:]
+ toRename = [x.split(':',1) for x in options.renameTags if len(x.split(':',1))==2]
+ toSet = [x.split(':',1) for x in options.setTags if len(x.split(':',1))==2]
+ newId = options.setIdentifier
+ newDef = options.setDefinition
+ newSeq = options.setSequence
+ clear = options.clear
+ keep = set(options.keep)
+ length = options.length
+ counter = [0]
+ loadTaxonomyDatabase(options)
+ if options.taxonomy is not None:
+ annoteRank=options.taxonrank
+ else:
+ annoteRank=[]
+
+ if options.mcl is not None:
+ parts = readMCLFile(open(options.mcl))
+ else:
+ parts = False
+
+ def sequenceTagger(seq):
+
+ if counter[0]>=0:
+ counter[0]+=1
+
+ if clear or keep:
+ ks = seq.keys()
+ for k in ks:
+ if k not in keep:
+ del seq[k]
+ else:
+ for i in toDelete:
+ if i in seq:
+ del seq[i]
+ for o,n in toRename:
+ if o in seq:
+ seq[n]=seq[o]
+ del seq[o]
+
+ for rank in annoteRank:
+ if 'taxid' in seq:
+ taxid = seq['taxid']
+ if taxid is not None:
+ rtaxid = options.taxonomy.getTaxonAtRank(taxid,rank)
+ if rtaxid is not None:
+ scn = options.taxonomy.getScientificName(rtaxid)
+ else:
+ scn=None
+ seq[rank]=rtaxid
+ seq["%s_name"%rank]=scn
+
+ if parts and seq.id in parts:
+ seq['cluster']=parts[seq.id]
+
+ if options.addrank:
+ seq['rank']=counter[0]
+
+ for i,v in toSet:
+ try:
+ if options.taxonomy is not None:
+ environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0]}
+ else:
+ environ = {'sequence':seq, 'counter':counter[0]}
+
+ val = eval(v,environ,seq)
+ except Exception,e:
+ if options.onlyValid:
+ raise e
+ val = v
+ seq[i]=val
+
+ if length:
+ seq['seqLength']=len(seq)
+
+ if newId is not None:
+ try:
+ if options.taxonomy is not None:
+ environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0]}
+ else:
+ environ = {'sequence':seq, 'counter':counter[0]}
+
+ val = eval(newId,environ,seq)
+ except Exception,e:
+ if options.onlyValid:
+ raise e
+ val = newId
+ seq.id=val
+ if newDef is not None:
+ try:
+ if options.taxonomy is not None:
+ environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0]}
+ else:
+ environ = {'sequence':seq, 'counter':counter[0]}
+
+ val = eval(newDef,environ,seq)
+ except Exception,e:
+ if options.onlyValid:
+ raise e
+ val = newDef
+ seq.definition=val
+
+ if newSeq is not None:
+ try:
+ if options.taxonomy is not None:
+ environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0]}
+ else:
+ environ = {'sequence':seq, 'counter':counter[0]}
+
+ val = eval(newSeq,environ,seq)
+ except Exception,e:
+ if options.onlyValid:
+ raise e
+ val = newSeq
+ if hasattr(seq, '_seq'):
+ seq._seq=str(val).lower()
+ if 'seqLength' in seq:
+ seq['seqLength']=len(seq)
+
+ return seq
+
+ return sequenceTagger
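+
+# Example (hedged sketch, not part of the original module): in a script
+# built with getOptionManager, the generated tagger implements command
+# lines such as
+#
+#     myscript --length -S 'short:len(sequence) < 100' -R 'taxid:old_taxid' input.fasta
+#
+# which adds a seqLength tag, evaluates the python expression into a new
+# 'short' tag, and renames the 'taxid' tag to 'old_taxid'.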
\ No newline at end of file
diff --git a/obitools/options/bioseqfilter.py b/obitools/options/bioseqfilter.py
new file mode 100644
index 0000000..d52c9b5
--- /dev/null
+++ b/obitools/options/bioseqfilter.py
@@ -0,0 +1,179 @@
+import re
+
+from obitools.options.taxonomyfilter import addTaxonomyFilterOptions
+from obitools.options.taxonomyfilter import taxonomyFilterGenerator
+
+def _sequenceOptionCallback(options,opt,value,parser):
+ parser.values.sequencePattern = re.compile(value,re.I)
+
+def _definitionOptionCallback(options,opt,value,parser):
+    parser.values.definitionPattern = re.compile(value)
+
+def _identifierOptionCallback(options,opt,value,parser):
+ parser.values.identifierPattern = re.compile(value)
+
+def _attributeOptionCallback(options,opt,value,parser):
+    if not hasattr(parser.values, 'attributePatterns'):
+        parser.values.attributePatterns={}
+ attribute,pattern=value.split(':',1)
+ parser.values.attributePatterns[attribute]=re.compile(pattern)
+
+def _predicatOptionCallback(options,opt,value,parser):
+    if not hasattr(parser.values, 'predicats'):
+        parser.values.predicats=[]
+ parser.values.predicats.append(value)
+
+
+def addSequenceFilteringOptions(optionManager):
+
+ optionManager.add_option('-s','--sequence',
+ action="callback", callback=_sequenceOptionCallback,
+ metavar="",
+ type="string",
+ help="regular expression pattern used to select "
+ "the sequence. The pattern is case insensitive")
+
+    optionManager.add_option('-D','--definition',
+                             action="callback", callback=_definitionOptionCallback,
+                             type="string",
+                             metavar="<REGULAR_PATTERN>",
+                             help="regular expression pattern matched against "
+                                  "the definition of the sequence. "
+                                  "The pattern is case sensitive")
+
+ optionManager.add_option('-I','--identifier',
+ action="callback", callback=_identifierOptionCallback,
+ type="string",
+ metavar="",
+ help="regular expression pattern matched against "
+ "the identifier of the sequence. "
+ "The pattern is case sensitive")
+
+    optionManager.add_option('-a','--attribute',
+                             action="callback", callback=_attributeOptionCallback,
+                             type="string",
+                             metavar="<ATTRIBUTE_NAME>:<REGULAR_PATTERN>",
+                             help="regular expression pattern matched against "
+                                  "the attributes of the sequence. "
+                                  "The value of this option is of the form: "
+                                  "attribute_name:regular_pattern. "
+                                  "The pattern is case sensitive. "
+                                  "Several -a options can be used on the same "
+                                  "command line.")
+
+ optionManager.add_option('-A','--has-attribute',
+ action="append",
+ type="string",
+ dest="has_attribute",
+ default=[],
+ metavar="",
+ help="select sequence with attribute "
+ "defined")
+
+    optionManager.add_option('-p','--predicat',
+                             action="append", dest="predicats",
+                             metavar="<PYTHON_EXPRESSION>",
+                             help="python boolean expression to be evaluated in the "
+                                  "sequence context. The attribute name can be "
+                                  "used in the expression as a variable name. "
+                                  "An extra variable named 'sequence' refers "
+                                  "to the sequence object itself. "
+                                  "Several -p options can be used on the same "
+                                  "command line.")
+
+ optionManager.add_option('-L','--lmax',
+ action='store',
+ metavar="<##>",
+ type="int",dest="lmax",
+ help="keep sequences shorter than lmax")
+
+ optionManager.add_option('-l','--lmin',
+ action='store',
+ metavar="<##>",
+ type="int",dest="lmin",
+ help="keep sequences longer than lmin")
+
+ optionManager.add_option('-v','--inverse-match',
+ action='store_true',
+ default=False,
+ dest="invertedFilter",
+ help="revert the sequence selection "
+ "[default : %default]")
+
+ addTaxonomyFilterOptions(optionManager)
+
+
+
+
+
+def filterGenerator(options):
+ taxfilter = taxonomyFilterGenerator(options)
+
+ def sequenceFilter(seq):
+ good = True
+
+ if hasattr(options, 'sequencePattern'):
+ good = bool(options.sequencePattern.search(str(seq)))
+
+ if good and hasattr(options, 'identifierPattern'):
+ good = bool(options.identifierPattern.search(seq.id))
+
+ if good and hasattr(options, 'definitionPattern'):
+ good = bool(options.definitionPattern.search(seq.definition))
+
+ if good :
+ good = reduce(lambda x,y:x and y,
+ (k in seq for k in options.has_attribute),
+ True)
+
+ if good and hasattr(options, 'attributePatterns'):
+ good = (reduce(lambda x,y : x and y,
+ (bool(options.attributePatterns[p].search(str(seq[p])))
+ for p in options.attributePatterns
+ if p in seq),True)
+ and
+ reduce(lambda x,y : x and y,
+ (bool(p in seq)
+ for p in options.attributePatterns),True)
+ )
+
+ if good and hasattr(options, 'predicats') and options.predicats is not None:
+ if options.taxonomy is not None:
+ e = {'taxonomy' : options.taxonomy,'sequence':seq}
+ else:
+ e = {'sequence':seq}
+
+ good = (reduce(lambda x,y: x and y,
+ (bool(eval(p,e,seq))
+ for p in options.predicats),True)
+ )
+
+ if good and hasattr(options, 'lmin') and options.lmin is not None:
+ good = len(seq) >= options.lmin
+
+ if good and hasattr(options, 'lmax') and options.lmax is not None:
+ good = len(seq) <= options.lmax
+
+ if good:
+ good = taxfilter(seq)
+
+ if hasattr(options, 'invertedFilter') and options.invertedFilter:
+ good=not good
+
+
+ return good
+
+ return sequenceFilter
+
+def sequenceFilterIteratorGenerator(options):
+ filter = filterGenerator(options)
+
+ def sequenceFilterIterator(seqIterator):
+ for seq in seqIterator:
+ if filter(seq):
+ yield seq
+
+ return sequenceFilterIterator
+
+
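+# Example (hedged sketch, not part of the original module): typical
+# command lines these options enable in a script built with
+# getOptionManager:
+#
+#     myscript -l 50 -L 500 input.fasta            # keep 50 <= length <= 500
+#     myscript -a 'organism:^Homo' -v input.fasta  # reject matching entries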
+
\ No newline at end of file
diff --git a/obitools/options/taxonomyfilter.py b/obitools/options/taxonomyfilter.py
new file mode 100644
index 0000000..5526c79
--- /dev/null
+++ b/obitools/options/taxonomyfilter.py
@@ -0,0 +1,6 @@
+from obitools.ecopcr.options import addTaxonomyDBOptions, \
+ addTaxonomyFilterOptions, \
+ loadTaxonomyDatabase, \
+ taxonomyFilterGenerator, \
+ taxonomyFilterIteratorGenerator
+
diff --git a/obitools/parallel/__init__.py b/obitools/parallel/__init__.py
new file mode 100644
index 0000000..2aa1b07
--- /dev/null
+++ b/obitools/parallel/__init__.py
@@ -0,0 +1,99 @@
+import threading
+
+class TaskPool(object):
+
+ def __init__(self,iterable,function,count=2):
+ self.pool = []
+ self.queue= []
+ self.plock= threading.Lock()
+ self.qlock= threading.Lock()
+ self.function=function
+ self.event=threading.Event()
+ self.iterable=iterable
+ for i in xrange(count):
+ Task(self)
+
+ def register(self,task):
+ self.plock.acquire()
+ self.pool.append(task)
+ self.plock.release()
+ self.ready(task)
+
+ def unregister(self,task):
+ task.thread.join()
+ self.plock.acquire()
+ self.pool.remove(task)
+ self.plock.release()
+
+
+ def ready(self,task):
+ self.qlock.acquire()
+ self.queue.append(task)
+ self.qlock.release()
+ self.event.set()
+
+ def __iter__(self):
+ for data in self.iterable:
+ while not self.queue:
+ self.event.wait()
+ self.event.clear()
+ self.qlock.acquire()
+ task=self.queue.pop(0)
+ self.qlock.release()
+ if hasattr(task, 'rep'):
+ yield task.rep
+ #print "send ",data
+ if isinstance(data,dict):
+ task.submit(**data)
+ else:
+ task.submit(*data)
+
+ while self.pool:
+ self.pool[0].finish()
+ while self.queue:
+ self.event.clear()
+ self.qlock.acquire()
+ task=self.queue.pop(0)
+ self.qlock.release()
+ if hasattr(task, 'rep'):
+ yield task.rep
+
+
+
+
+
+class Task(object):
+ def __init__(self,pool):
+ self.pool = pool
+ self.lock = threading.Lock()
+ self.dataOk = threading.Event()
+ self.repOk = threading.Event()
+ self.args = None
+ self.kwargs=None
+ self.stop=False
+ self.thread = threading.Thread(target=self)
+ self.thread.start()
+ self.pool.register(self)
+
+ def __call__(self):
+ self.dataOk.wait()
+ while(not self.stop):
+ self.lock.acquire()
+ self.dataOk.clear()
+ self.rep=self.pool.function(*self.args,**self.kwargs)
+ self.pool.ready(self)
+ self.lock.release()
+ self.dataOk.wait()
+
+ def submit(self,*args,**kwargs):
+ self.args=args
+ self.kwargs=kwargs
+ self.dataOk.set()
+
+ def finish(self):
+ self.lock.acquire()
+ self.stop=True
+ self.dataOk.set()
+ self.pool.unregister(self)
+
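+if __name__ == '__main__':
+    # Hedged usage sketch (not part of the original module): run a
+    # function over an iterable with two worker threads. Each item of the
+    # iterable is a tuple of positional arguments; results are yielded in
+    # completion order, not submission order.
+    def _square(x):
+        return x * x
+
+    pool = TaskPool(((i,) for i in xrange(10)), _square, count=2)
+    for result in pool:
+        print result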
+
diff --git a/obitools/parallel/jobqueue.py b/obitools/parallel/jobqueue.py
new file mode 100644
index 0000000..9df4804
--- /dev/null
+++ b/obitools/parallel/jobqueue.py
@@ -0,0 +1,183 @@
+import threading
+from logging import warning,info
+from time import sleep,time
+
+from obitools.parallel import TaskPool
+
+
+class JobPool(dict):
+ '''
+    JobPool manages a job queue. The jobs run in a limited
+    number of threads.
+ '''
+
+ def __init__(self,count,precision=0.01):
+ '''
+
+        @param count: number of threads dedicated to this JobPool
+        @type count: int
+        @param precision: delay between two checks for new jobs (in seconds)
+        @type precision: float
+ '''
+ self._iterator = JobIterator(self)
+ self._taskPool = TaskPool(self._iterator,
+ self._runJob,
+ count)
+ self._precision=precision
+ self._toRun=set()
+ self._runnerThread = threading.Thread(target=self._runner)
+ self._runnerThread.start()
+ self._finalyzed=False
+
+ def _runner(self):
+ for rep in self._taskPool:
+            info('Job %d finished' % id(rep))
+ info('All jobs in %d JobPool finished' % id(self))
+
+ def _jobIterator(self):
+ return self._iterator
+
+ def _runJob(self,job):
+ job.started= time()
+ info('Job %d started' % id(job))
+ job.result = job()
+ job.ended = time()
+ job.finished=True
+ return job
+
+ def submit(self,job,priority=1.0,userid=None):
+ '''
+ Submit a new job to the JobPool.
+
+        @param job: the newly submitted job
+ @type job: Job instance
+ @param priority: priority level of this job (higher is better)
+ @type priority: float
+ @param userid: a user identifier (Default is None)
+
+ @return: job identifier
+ @rtype: int
+ '''
+
+ assert not self._finalyzed,\
+ "This jobPool does not accept new job"
+ if job.submitted is not None:
+ warning('Job %d was already submitted' % id(job))
+ return id(job)
+
+ job.submitted = time()
+ job.priority = priority
+ job.userid = userid
+ i=id(job)
+        job.id=i
+ self[i]=job
+ self._toRun.add(job)
+
+ info('Job %d submitted' % i)
+
+ return i
+
+ def finalyze(self):
+ '''
+        Indicate to the JobPool that no new jobs will
+ be submitted.
+ '''
+ self._iterator.finalyze()
+ self._finalyzed=True
+
+ def __del__(self):
+ self.finalyze()
+
+
+class JobIterator(object):
+ def __init__(self,pool):
+ self._pool = pool
+ self._finalyze=False
+ self._nextLock=threading.Lock()
+
+
+ def __iter__(self):
+ return self
+
+ def finalyze(self):
+ '''
+        Indicate to the JobIterator that no new jobs will
+ be submitted.
+ '''
+ self._finalyze=True
+
+
+ def next(self):
+ '''
+
+ @return: the next job to run
+ @rtype: Job instance
+ '''
+ self._nextLock.acquire()
+ while self._pool._toRun or not self._finalyze:
+ rep = None
+ maxScore=0
+ for k in self._pool._toRun:
+ s = k.runScore()
+ if s > maxScore:
+ maxScore=s
+ rep=k
+ if rep is not None:
+ self._pool._toRun.remove(rep)
+ self._nextLock.release()
+ return (rep,)
+ sleep(self._pool._precision)
+ self._nextLock.release()
+ info('No more jobs in %d JobPool' % id(self._pool))
+ raise StopIteration
+
+
+
+class Job(object):
+
+ def __init__(self,pool=None,function=None,*args,**kwargs):
+ '''
+ Create a new job
+
+        @param pool: the JobPool used to run the job. May be None to avoid
+                     executing the job immediately.
+        @type pool: JobPool instance
+
+        @param function: the function to run for the job
+        @type function: callable object
+
+        @param args: positional parameters for the function call
+        @param kwargs: named parameters for the function call
+
+ @precondition: function cannot be None
+ '''
+ assert function is not None
+ self._args=args
+ self._kwargs = kwargs
+ self._function = function
+ self.running = False
+ self.finished= False
+ self.submitted = None
+ self.priority = None
+ self.userid = None
+
+ if pool is not None:
+ pool.submit(self)
+
+ def runScore(self):
+ '''
+        @return: the score used to schedule jobs in the queue
+ @rtype: C{float}
+ '''
+
+ return (time() - self.submitted) * self.priority
+
+ def __call__(self):
+ return self._function(*self._args,**self._kwargs)
+
+
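+if __name__ == '__main__':
+    # Hedged usage sketch (not part of the original module): submit a few
+    # jobs to a two-thread pool, close the pool, wait for its (private)
+    # runner thread, then read the result stored on each job.
+    def _work(n):
+        sleep(0.01)
+        return n * n
+
+    pool = JobPool(2)
+    jobs = [Job(pool, _work, n) for n in xrange(5)]
+    pool.finalyze()
+    pool._runnerThread.join()
+    for job in jobs:
+        print job.result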
+
+
+
+
+
\ No newline at end of file
diff --git a/obitools/phylogeny/__init__.py b/obitools/phylogeny/__init__.py
new file mode 100644
index 0000000..8eb1587
--- /dev/null
+++ b/obitools/phylogeny/__init__.py
@@ -0,0 +1,119 @@
+
+from obitools.graph.tree import Forest,TreeNode
+from obitools.graph import Edge
+
+
+
+class PhylogenicTree(Forest):
+
+ def __init__(self,label='G',indexer=None,nodes=None,edges=None):
+ Forest.__init__(self, label, indexer, nodes, edges)
+ self.root=None
+ self.comment=None
+
+ def addNode(self,node=None,index=None,**data):
+ if node is None and index is None:
+ node = '__%d' % (len(self._node)+1)
+
+ return Forest.addNode(self, node, index, **data)
+
+ def getNode(self,node=None,index=None):
+ if index is None:
+ index = self._index.getIndex(node, True)
+ return PhylogenicNode(index,self)
+
+ def getEdge(self,node1=None,node2=None,index1=None,index2=None):
+        '''
+        Return the edge linking node1 and node2.
+
+        @param node1: a node label or None
+        @param node2: a node label or None
+        @param index1: a node index or None. The parameter index1
+                       has priority over the parameter node1.
+        @type index1: int
+        @param index2: a node index or None. The parameter index2
+                       has priority over the parameter node2.
+        @type index2: int
+        '''
+ node1=self.getNode(node1, index1)
+ node2=self.getNode(node2, index2)
+ return PhylogenicEdge(node1,node2)
+
+
+
+class PhylogenicNode(TreeNode):
+
+ def getLabel(self):
+ label = TreeNode.getLabel(self)
+ if label[0:2]=='__':
+ return None
+ else:
+ return label
+
+ def __str__(self):
+
+ if self.index in self.graph._node_attrs:
+ keys = " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"'))
+ for x in self.graph._node_attrs[self.index].iteritems()]
+ )
+ else:
+ keys=''
+
+ if self.label is None:
+ label=''
+ shape='point'
+ else:
+ label=self.label
+ shape='box'
+
+ return '%d [label="%s" shape="%s" %s]' % (self.index,str(label).replace('"','\\"'),shape,keys)
+
+ def distanceTo(self,node=None,index=None):
+ '''
+        Compute the branch length between this node and another node.
+        If distances are not specified for this tree, None is returned.
+
+ @param node: a node label or None
+ @param index: a node index or None. the parameter index
+ has a priority on the parameter node.
+ @type index: int
+
+        @return: the evolutionary distance between the two nodes
+ @rtype: int, float or None
+ '''
+ path = self.shortestPathTo(node, index)
+
+ start = path.pop(0)
+ dist=0
+ for dest in path:
+ edge = self.graph.getEdge(index1=start,index2=dest)
+ if 'distance' in edge:
+ dist+=edge['distance']
+ else:
+ return None
+ start=dest
+
+ return dist
+
+ label = property(getLabel, None, None, "Label of the node")
+
+class PhylogenicEdge(Edge):
+
+ def __str__(self):
+ e = (self.node1.index,self.node2.index)
+ if e in self.graph._edge_attrs:
+ keys = "[%s]" % " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"'))
+ for x in self.graph._edge_attrs[e].iteritems()
+ if x[0] not in ('distance','bootstrap')]
+ )
+ else:
+ keys = ""
+
+
+
+ if self.directed:
+ link='->'
+ else:
+ link='--'
+
+ return "%d %s %d %s" % (self.node1.index,link,self.node2.index,keys)
+
diff --git a/obitools/phylogeny/newick.py b/obitools/phylogeny/newick.py
new file mode 100644
index 0000000..cf0330c
--- /dev/null
+++ b/obitools/phylogeny/newick.py
@@ -0,0 +1,123 @@
+import re
+import sys
+
+from obitools.utils import universalOpen
+from obitools.phylogeny import PhylogenicTree
+
+def subNodeIterator(data):
+ level=0
+ start = 1
+ if data[0]=='(':
+ for i in xrange(1,len(data)):
+ c=data[i]
+ if c=='(':
+ level+=1
+ elif c==')':
+ level-=1
+ if c==',' and not level:
+ yield data[start:i]
+ start = i+1
+ yield data[start:i]
+ else:
+ yield data
+
+
+_nodeParser=re.compile('\s*(?P<subnodes>\(.*\))?(?P<name>[^ :]+)? *(?P<bootstrap>[0-9.]+)?(:(?P<distance>-?[0-9.]+))?')
+
+def nodeParser(data):
+ parsedNode = _nodeParser.match(data).groupdict(0)
+ if not parsedNode['name']:
+ parsedNode['name']=None
+
+ if not parsedNode['bootstrap']:
+ parsedNode['bootstrap']=None
+ else:
+ parsedNode['bootstrap']=float(parsedNode['bootstrap'])
+
+ if not parsedNode['distance']:
+ parsedNode['distance']=None
+ else:
+ parsedNode['distance']=float(parsedNode['distance'])
+
+ if not parsedNode['subnodes']:
+ parsedNode['subnodes']=None
+
+ return parsedNode
+
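+# Example (hedged sketch, assuming the group names reconstructed above):
+#
+#     nodeParser('(A:0.1,B:0.2)root 95:0.05')
+#
+# returns {'subnodes': '(A:0.1,B:0.2)', 'name': 'root',
+#          'bootstrap': 95.0, 'distance': 0.05}
+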
+_cleanTreeData=re.compile('\s+')
+
+def treeParser(data,tree=None,parent=None):
+ if tree is None:
+ tree = PhylogenicTree()
+ data = _cleanTreeData.sub(' ',data).strip()
+
+ parsedNode = nodeParser(data)
+
+ if parent is not None:
+ son,parent = tree.addEdge(node1=parsedNode['name'],
+ index2=parent,
+ distance=parsedNode['distance'],
+ bootstrap=parsedNode['bootstrap'])
+ else:
+        son = tree.addNode(node=parsedNode['name'])
+ tree.root=son
+
+
+
+ if parsedNode['subnodes']:
+ for subnode in subNodeIterator(parsedNode['subnodes']):
+ treeParser(subnode,tree,son)
+
+ return tree
+
+_treecomment=re.compile('\[.*\]')
+
+def treeIterator(file):
+ file = universalOpen(file)
+ data = file.read()
+
+ comment = _treecomment.findall(data)
+ data=_treecomment.sub('',data).strip()
+
+ if comment:
+ comment=comment[0]
+ else:
+ comment=None
+ for tree in data.split(';'):
+ t = treeParser(tree)
+ if comment:
+ t.comment=comment
+ yield t
+
+def nodeWriter(tree,node,deep=0):
+ name = node._name
+ if name is None:
+ name=''
+
+ distance=node._dist
+ if distance is None:
+ distance=''
+ else:
+ distance = ':%6.5f' % distance
+
+ bootstrap=node._bootstrap
+ if bootstrap is None:
+ bootstrap=''
+ else:
+ bootstrap=' %d' % int(bootstrap)
+
+ nodeseparator = ',\n' + ' ' * (deep+1)
+
+ subnodes = nodeseparator.join([nodeWriter(tree, x, deep+1)
+ for x in tree.childNodeIterator(node)])
+ if subnodes:
+ subnodes='(\n' + ' ' * (deep+1) + subnodes + '\n' + ' ' * deep + ')'
+
+ return '%s%s%s%s' % (subnodes,name,bootstrap,distance)
+
+def treeWriter(tree,startnode=None):
+ if startnode is not None:
+ root=startnode
+ else:
+ root = tree.getRoot()
+ return nodeWriter(tree,root)+';'
diff --git a/obitools/profile/__init__.py b/obitools/profile/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/profile/_profile.so b/obitools/profile/_profile.so
new file mode 100755
index 0000000..7f52483
Binary files /dev/null and b/obitools/profile/_profile.so differ
diff --git a/obitools/sample.py b/obitools/sample.py
new file mode 100644
index 0000000..4894c94
--- /dev/null
+++ b/obitools/sample.py
@@ -0,0 +1,76 @@
+'''
+Created on 31 oct. 2009
+
+@author: coissac
+'''
+from random import shuffle, randrange
+
+def lookfor(x,cumsum):
+ lmax=len(cumsum)
+ lmin=0
+
+    assert x < cumsum[-1],"x must be smaller than the cumulative sum"
+
+ while((lmax - lmin) > 0):
+
+ i=(lmax+lmin)/2
+ #print i,lmin,lmax
+        if (x<cumsum[i] and (i==0 or x>cumsum[i-1])):
+ #print "return 1 :",i,cumsum[i-1],"<",x,"<",cumsum[i]
+ return i
+ elif cumsum[i]==x:
+ while cumsum[i]==x:
+ i+=1
+ #print "return 2 :",i,cumsum[i],"<",x,"<",cumsum[i+1]
+ return i
+        elif x<cumsum[i]:
+            lmax=i
+        else:
+            lmin=i+1
+
+def weigthedSample(events,size):
+    entries = [k for k in events.iterkeys() if events[k]>0]
+ shuffle(entries)
+ cumul=[]
+ s=0
+ for e in entries:
+ s+=events[e]
+ cumul.append(s)
+
+ #print cumul
+ result={}
+
+ for t in xrange(size):
+ e=lookfor(randrange(s), cumul)
+ k=entries[e]
+ result[k]=result.get(k,0)+1
+
+ return result
+
+def weigthedSampleWithoutReplacement(events,size):
+ entries = [k for k in events.iterkeys() if events[k]>0]
+ shuffle(entries)
+ cumul=[]
+ s=0
+ for e in entries:
+ s+=events[e]
+ cumul.append(s)
+
+ #print cumul
+ result={}
+
+ for t in xrange(size):
+ # print s,cumul,
+ e=lookfor(randrange(s), cumul)
+ # print e
+ k=entries[e]
+ for x in xrange(e,len(cumul)):
+ cumul[x]-=1
+ s-=1
+ result[k]=result.get(k,0)+1
+
+ return result
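+
+if __name__ == '__main__':
+    # Hedged usage sketch (not part of the original module; the function
+    # names keep the module's spelling): draw five items from a weighted
+    # set of events, with and without replacement.
+    events = {'a': 10, 'b': 5, 'c': 1}
+    print weigthedSample(events, 5)
+    print weigthedSampleWithoutReplacement(events, 5)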
\ No newline at end of file
diff --git a/obitools/seqdb/__init__.py b/obitools/seqdb/__init__.py
new file mode 100644
index 0000000..274cbad
--- /dev/null
+++ b/obitools/seqdb/__init__.py
@@ -0,0 +1,88 @@
+from obitools import NucSequence,AASequence
+from obitools.format.genericparser import genericEntryIteratorGenerator
+from obitools.location.feature import featureIterator
+
+from itertools import chain
+
+class AnnotatedSequence(object):
+
+ def __init__(self,header,featureTable,secondaryAcs):
+ self._header = header
+ self._featureTableText = featureTable
+ self._featureTable=None
+ self._secondaryAcs=secondaryAcs
+ self._hasTaxid=None
+
+ def getHeader(self):
+ return self._header
+
+
+ def getFeatureTable(self,skipError=False):
+ if self._featureTable is None:
+ self._featureTable = [x for x in featureIterator(self._featureTableText,skipError)]
+ return self._featureTable
+
+
+ def getSecondaryAcs(self):
+ return self._secondaryAcs
+
+ def extractTaxon(self):
+ if self._hasTaxid is None:
+
+ if self._featureTable is not None:
+ s = [f for f in self._featureTable if f.ftType=='source']
+ else:
+ s = featureIterator(self._featureTableText).next()
+ if s.ftType=='source':
+ s = [s]
+ else:
+ s = [f for f in self.featureTable if f.ftType=='source']
+
+ t =set(int(v[6:]) for v in chain(*tuple(f['db_xref'] for f in s if 'db_xref' in f))
+ if v[0:6]=='taxon:')
+
+ self._hasTaxid=False
+
+ if len(t)==1 :
+ taxid=t.pop()
+ if taxid >=0:
+ self['taxid']=taxid
+ self._hasTaxid=True
+
+
+ t =set(chain(*tuple(f['organism'] for f in s if 'organism' in f)))
+
+ if len(t)==1:
+ self['organism']=t.pop()
+
+
+    header = property(getHeader, None, None, "raw text header of the entry")
+
+    featureTable = property(getFeatureTable, None, None, "parsed feature table of the entry")
+
+    secondaryAcs = property(getSecondaryAcs, None, None, "list of secondary accession numbers")
+
+class AnnotatedNucSequence(AnnotatedSequence,NucSequence):
+ '''
+    Nucleic sequence annotated with a header and a feature table.
+ '''
+ def __init__(self,id,seq,de,header,featureTable,secondaryAcs,**info):
+ NucSequence.__init__(self, id, seq, de,**info)
+ AnnotatedSequence.__init__(self, header, featureTable, secondaryAcs)
+
+
+class AnnotatedAASequence(AnnotatedSequence,AASequence):
+ '''
+    Peptidic sequence annotated with a header and a feature table.
+ '''
+ def __init__(self,id,seq,de,header,featureTable,secondaryAcs,**info):
+ AASequence.__init__(self, id, seq, de,**info)
+ AnnotatedSequence.__init__(self, header, featureTable, secondaryAcs)
+
+
+
+nucEntryIterator=genericEntryIteratorGenerator(endEntry='^//')
+aaEntryIterator=genericEntryIteratorGenerator(endEntry='^//')
+
+
+
diff --git a/obitools/seqdb/blastdb/__init__.py b/obitools/seqdb/blastdb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/seqdb/dnaparser.py b/obitools/seqdb/dnaparser.py
new file mode 100644
index 0000000..85b82a2
--- /dev/null
+++ b/obitools/seqdb/dnaparser.py
@@ -0,0 +1,16 @@
+from obitools.format.sequence import embl,fasta,genbank
+
+class UnknownFormatError(Exception):
+ pass
+
+def whichParser(seq):
+ if seq[0]=='>':
+ return fasta.fastaNucParser
+ if seq[0:2]=='ID':
+ return embl.emblParser
+ if seq[0:5]=='LOCUS':
+ return genbank.genbankParser
+ raise UnknownFormatError,"Unknown nucleic format"
+
+def nucleicParser(seq):
+ return whichParser(seq)(seq)
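+
+# Example (hedged sketch): format detection keys on the first characters
+# of the record text ('>' for FASTA, 'ID' for EMBL, 'LOCUS' for GenBank),
+# so nucleicParser(open('entry.dat').read()) either dispatches to the
+# right parser or raises UnknownFormatError.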
diff --git a/obitools/seqdb/embl/__init__.py b/obitools/seqdb/embl/__init__.py
new file mode 100644
index 0000000..94f9efc
--- /dev/null
+++ b/obitools/seqdb/embl/__init__.py
@@ -0,0 +1,13 @@
+from obitools.seqdb import AnnotatedNucSequence, AnnotatedAASequence
+from obitools.location import locationGenerator,extractExternalRefs
+
+
+
+class EmblSequence(AnnotatedNucSequence):
+ '''
+    Class used to represent a nucleic sequence from the EMBL database.
+ '''
+
+
+
+
diff --git a/obitools/seqdb/embl/parser.py b/obitools/seqdb/embl/parser.py
new file mode 100644
index 0000000..2e3624f
--- /dev/null
+++ b/obitools/seqdb/embl/parser.py
@@ -0,0 +1,50 @@
+import re
+import sys
+
+from obitools.seqdb import embl
+from obitools.seqdb import nucEntryIterator
+
+_featureMatcher = re.compile('(^FT .*\n)+', re.M)
+_cleanFT = re.compile('^FT',re.M)
+
+_headerMatcher = re.compile('^ID.+(?=\nFH )', re.DOTALL)
+_seqMatcher = re.compile('(^ ).+(?=//\n)', re.DOTALL + re.M)
+_cleanSeq = re.compile('[ \n0-9]+')
+_acMatcher = re.compile('(?<=^AC ).+',re.M)
+_deMatcher = re.compile('(^DE .+\n)+',re.M)
+_cleanDe = re.compile('(^|\n)DE +')
+
+def __emblparser(text):
+ try:
+ header = _headerMatcher.search(text).group()
+
+ ft = _featureMatcher.search(text).group()
+ ft = _cleanFT.sub(' ',ft)
+
+ seq = _seqMatcher.search(text).group()
+ seq = _cleanSeq.sub('',seq).upper()
+
+ acs = _acMatcher.search(text).group()
+ acs = acs.split()
+ ac = acs[0]
+ acs = acs[1:]
+
+ de = _deMatcher.search(header).group()
+ de = _cleanDe.sub(' ',de).strip().strip('.')
+ except AttributeError,e:
+ print >>sys.stderr,'======================================================='
+ print >>sys.stderr,text
+ print >>sys.stderr,'======================================================='
+ raise e
+
+ return (ac,seq,de,header,ft,acs)
+
+def emblParser(text):
+ return embl.EmblSequence(*__emblparser(text))
+
+
+def emblIterator(file):
+ for e in nucEntryIterator(file):
+ yield emblParser(e)
+
+
\ No newline at end of file
diff --git a/obitools/seqdb/genbank/__init__.py b/obitools/seqdb/genbank/__init__.py
new file mode 100644
index 0000000..fb5b622
--- /dev/null
+++ b/obitools/seqdb/genbank/__init__.py
@@ -0,0 +1,84 @@
+from obitools.seqdb import AnnotatedNucSequence, AnnotatedAASequence
+from obitools.location import locationGenerator,extractExternalRefs
+
+
+
+class GbSequence(AnnotatedNucSequence):
+ '''
+    Class used to represent a nucleic sequence from GenBank.
+ '''
+
+
+class GpepSequence(AnnotatedAASequence):
+ '''
+    Class used to represent a peptidic sequence from GenPept.
+ '''
+
+ def __init__(self,id,seq,de,header,featureTable,secondaryAcs,**info):
+ AnnotatedAASequence.__init__(self,id, seq, de, header, featureTable, secondaryAcs,**info)
+ self.__hasNucRef=None
+
+ def __getGeneRef(self):
+ if self.__hasNucRef is None:
+ self.__hasNucRef=False
+ cds = [x for x in self.featureTable
+ if x.ftType=='CDS'
+ and 'coded_by' in x]
+
+ if cds:
+ source = cds[0]['coded_by'][0]
+ if 'transl_table' in cds[0]:
+ tt = cds[0]['transl_table'][0]
+ else:
+ tt=None
+ ac,loc = extractExternalRefs(source)
+
+ if len(ac)==1:
+ ac = ac.pop()
+ self.__hasNucRef=True
+ self.__nucRef = (ac,loc,tt)
+
+
+
+ def geneAvailable(self):
+ '''
+        Predicate indicating whether a reference to the nucleic sequence
+        encoding this protein is available in the feature table.
+
+ @return: True if gene description is available
+ @rtype: bool
+ '''
+ self.__getGeneRef()
+ return self.__hasNucRef is not None and self.__hasNucRef
+
+
+ def getCDS(self,database):
+ '''
+ Return the nucleic sequence coding for this protein if
+ data are available.
+
+ @param database: a database object where looking for the sequence
+ @type database: a C{dict} like object
+
+        @return: a NucBioseq instance corresponding to the CDS
+ @rtype: NucBioSeq
+
+ @raise AssertionError: if no gene references are available
+ @see: L{geneAvailable}
+
+ '''
+
+ assert self.geneAvailable(), \
+ "No information available to retreive gene sequence"
+
+ ac,loc,tt = self.__nucRef
+ seq = database[ac]
+ seq.extractTaxon()
+ gene = seq[loc]
+ if tt is not None:
+ gene['transl_table']=tt
+ return gene
+
+
+
+
diff --git a/obitools/seqdb/genbank/ncbi.py b/obitools/seqdb/genbank/ncbi.py
new file mode 100644
index 0000000..40ddf91
--- /dev/null
+++ b/obitools/seqdb/genbank/ncbi.py
@@ -0,0 +1,79 @@
+from urllib2 import urlopen
+import sys
+import re
+
+import cStringIO
+
+from obitools.eutils import EFetch
+from parser import genbankParser,genpepParser
+from parser import genbankIterator,genpepIterator
+
+from obitools.utils import CachedDB
+
+
+class NCBIGenbank(EFetch):
+ def __init__(self):
+ EFetch.__init__(self,db='nucleotide',
+ rettype='gbwithparts')
+
+ def __getitem__(self,ac):
+ if isinstance(ac,str):
+ text = self.get(id=ac)
+ seq = genbankParser(text)
+ return seq
+ else:
+ query = ','.join([x for x in ac])
+ data = cStringIO.StringIO(self.get(id=query))
+ return genbankIterator(data)
+
+
+
+
+class NCBIGenpep(EFetch):
+ def __init__(self):
+ EFetch.__init__(self,db='protein',
+ rettype='gbwithparts')
+
+ def __getitem__(self,ac):
+ if isinstance(ac,str):
+ text = self.get(id=ac)
+ seq = genpepParser(text)
+ return seq
+ else:
+ query = ','.join([x for x in ac])
+ data = cStringIO.StringIO(self.get(id=query))
+ return genpepIterator(data)
+
+class NCBIAccession(EFetch):
+
+ _matchACS = re.compile(' +accession +"([^"]+)"')
+
+ def __init__(self):
+ EFetch.__init__(self,db='nucleotide',
+ rettype='seqid')
+
+ def __getitem__(self,ac):
+ if isinstance(ac,str):
+ text = self.get(id=ac)
+ rep = NCBIAccession._matchACS.search(text).group(1)
+ return rep
+ else:
+ query = ','.join([x for x in ac])
+ text = self.get(id=query)
+ rep = (ac.group(1) for ac in NCBIAccession._matchACS.finditer(text))
+ return rep
+
+def Genbank(cache=None):
+ gb = NCBIGenbank()
+ if cache is not None:
+ gb = CachedDB(cache, gb)
+ return gb
+
+
+def Genpep(cache=None):
+ gp = NCBIGenpep()
+ if cache is not None:
+ gp = CachedDB(cache, gp)
+ return gp
+
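+if __name__ == '__main__':
+    # Hedged usage sketch (not part of the original module): fetch one
+    # nucleotide record from NCBI. Requires network access; the accession
+    # number is only an illustrative example.
+    gb = Genbank()
+    seq = gb['AJ417695']
+    print seq.id, len(seq)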
+
diff --git a/obitools/seqdb/genbank/parser.py b/obitools/seqdb/genbank/parser.py
new file mode 100644
index 0000000..b52fe59
--- /dev/null
+++ b/obitools/seqdb/genbank/parser.py
@@ -0,0 +1,53 @@
+import re
+import sys
+
+import obitools.seqdb.genbank as gb
+from obitools.seqdb import nucEntryIterator,aaEntryIterator
+
+_featureMatcher = re.compile('^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)
+
+_headerMatcher = re.compile('^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
+_seqMatcher = re.compile('(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M)
+_cleanSeq = re.compile('[ \n0-9]+')
+_acMatcher = re.compile('(?<=^ACCESSION ).+',re.M)
+_deMatcher = re.compile('(?<=^DEFINITION ).+\n( .+\n)*',re.M)
+_cleanDe = re.compile('\n *')
+
+def __gbparser(text):
+ try:
+ header = _headerMatcher.search(text).group()
+ ft = _featureMatcher.search(text).group()
+ seq = _seqMatcher.search(text).group()
+ seq = _cleanSeq.sub('',seq).upper()
+ acs = _acMatcher.search(text).group()
+ acs = acs.split()
+ ac = acs[0]
+ acs = acs[1:]
+ de = _deMatcher.search(header).group()
+ de = _cleanDe.sub(' ',de).strip().strip('.')
+ except AttributeError,e:
+ print >>sys.stderr,'======================================================='
+ print >>sys.stderr,text
+ print >>sys.stderr,'======================================================='
+ raise e
+
+ return (ac,seq,de,header,ft,acs)
+
+def genbankParser(text):
+ return gb.GbSequence(*__gbparser(text))
+
+
+def genbankIterator(file):
+ for e in nucEntryIterator(file):
+ yield genbankParser(e)
+
+
+def genpepParser(text):
+ return gb.GpepSequence(*__gbparser(text))
+
+
+def genpepIterator(file):
+ for e in aaEntryIterator(file):
+ yield genpepParser(e)
+
+
\ No newline at end of file
diff --git a/obitools/sequenceencoder/__init__.py b/obitools/sequenceencoder/__init__.py
new file mode 100644
index 0000000..89a8a59
--- /dev/null
+++ b/obitools/sequenceencoder/__init__.py
@@ -0,0 +1,73 @@
+from obitools import location
+
+class SequenceEncoder(object):
+ pass
+
+class DNAComplementEncoder(SequenceEncoder):
+ _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
+ 'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
+ 's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
+ 'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
+ '-': '-'}
+
+ _info={'complemented':True}
+
+ @staticmethod
+ def _encode(seq,position=slice(None, None, -1)):
+ cseq = [DNAComplementEncoder._comp.get(x.lower(),'n') for x in seq[position]]
+ return ''.join(cseq)
+
+ @staticmethod
+ def _check(seq):
+ assert seq.isNucleotide()
+
+ @staticmethod
+ def _convertpos(position):
+ if isinstance(position, int):
+ return -(position+1)
+ elif isinstance(position, slice):
+ return slice(-(position.stop+1),
+ -(position.start+1),
+ -position.step)
+ elif isinstance(position, location.Location):
+ return location.ComplementLocation(position).simplify()
+
+ raise TypeError,"position must be an int, slice or Location instance"
+
+ @staticmethod
+ def complement(seq):
+ return seq
+
+class SeqFragmentEncoder(SequenceEncoder):
+ def __init__(self,begin,end):
+ assert begin < end and begin >=0
+ self._limits = slice(begin,end)
+ self._info = {'cut' : [begin,end,1]}
+ self._len = end - begin + 1
+
+ def _check(self,seq):
+ lseq = len(seq)
+ assert self._limits.stop <= lseq
+
+ def _encode(self,seq,position=None):
+ return str(seq)[self._limits]
+
+ def _convertpos(self,position):
+ if isinstance(position, int):
+ if position < -self._len or position >= self._len:
+ raise IndexError,position
+ if position >=0:
+ return self._limits.start + position
+ else:
+ return self._limits.stop + position + 1
+ elif isinstance(position, slice):
+ return slice(-(position.stop+1),
+ -(position.start+1),
+ -position.step)
+ elif isinstance(position, location.Location):
+ return location.ComplementLocation(position).simplify()
+
+ raise TypeError,"position must be an int, slice or Location instance"
+
+
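+if __name__ == '__main__':
+    # Hedged usage sketch (not part of the original module): the
+    # low-level _encode helper reverse-complements a raw string (the
+    # encoder is normally applied through sequence objects).
+    print DNAComplementEncoder._encode('aacgt')   # prints 'acgtt'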
+
\ No newline at end of file
diff --git a/obitools/sequenceencoder/__init__.pyc b/obitools/sequenceencoder/__init__.pyc
new file mode 100644
index 0000000..463f84f
Binary files /dev/null and b/obitools/sequenceencoder/__init__.pyc differ
diff --git a/obitools/solexa/__init__.py b/obitools/solexa/__init__.py
new file mode 100644
index 0000000..60e35f8
--- /dev/null
+++ b/obitools/solexa/__init__.py
@@ -0,0 +1,45 @@
+from obitools import utils
+from obitools import NucSequence
+from obitools.dnahash import hashCodeIterator
+
+
+class SolexaSequence(NucSequence):
+ def __init__(self,id,seq,definition=None,quality=None,**info):
+ NucSequence.__init__(self, id, seq, definition,**info)
+ self._quality=quality
+ self._hash=None
+
+ def getQuality(self):
+ if isinstance(self._quality, str):
+ self._quality=[int(x) for x in self._quality.split()]
+ return self._quality
+
+
+ def __hash__(self):
+ if self._hash is None:
+ self._hash = hashCodeIterator(str(self), len(str(self)), 16, 0).next()[1].pop()
+ return self._hash
+
+class SolexaFile(utils.ColumnFile):
+ def __init__(self,stream):
+ utils.ColumnFile.__init__(self,
+ stream, ':', True,
+ (str,
+ int,int,int,int,
+ str,
+ str), "#")
+
+
+ def next(self):
+ data = utils.ColumnFile.next(self)
+ seq = SolexaSequence('%d_%d_%d_%d'%(data[1],data[2],data[3],data[4]),
+ data[5],
+ quality=data[6])
+ seq['machine']=data[0]
+ seq['channel']=data[1]
+ seq['tile']=data[2]
+ seq['pos_x']=data[3]
+ seq['pos_y']=data[4]
+
+ #assert len(seq['quality'])==len(seq),"Error in file format"
+ return seq
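+
+# Example (hedged sketch): one colon-separated export line such as
+#
+#   HWI-EAS85:1:2:345:678:ACGTACGT:40 40 38 37 36 35 34 33
+#
+# yields a SolexaSequence with id '1_2_345_678', the machine, channel,
+# tile and position stored as tags, and qualities parsed lazily by
+# getQuality().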
diff --git a/obitools/statistics/__init__.py b/obitools/statistics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/statistics/hypergeometric.py b/obitools/statistics/hypergeometric.py
new file mode 100644
index 0000000..9a9b812
--- /dev/null
+++ b/obitools/statistics/hypergeometric.py
@@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+"""
+    Statistical computation module.
+
+    The `statistics` module contains functions computing the
+    probabilities associated with the hypergeometric and cumulative
+    hypergeometric distributions, together with a correction method
+    for multiple testing.
+
+"""
+
+from decimal import *
+
+getcontext().prec = 28
+
+
+def _hyper0(N,n,r):
+ """
+    Internal function computing term 0 of the hypergeometric distribution.
+
+    The computation follows the method described in
+
+    Trong Wu, An accurate computation of the hypergeometric distribution function,
+    ACM Trans. Math. Softw. 19 (1993), no. 1, 33–43.
+
+    Parameters:
+
+    - `N` : the population size
+    - `n` : the number of marked elements
+    - `r` : the sample size
+
+    Returns a *Decimal* giving the probability of drawing 0 marked
+    elements out of *n* from a population of size *N* when sampling
+    *r* elements
+ """
+
+ #
+    # the numerator is:
+    # [N - r + 1 - n; N - n + 1[
+    #
+    # the denominator:
+    # [N - r + 1; N + 1[
+    #
+    # with X = N - r + 1
+    # and Y = N + 1
+    #
+    # Numerator -> [ X - n; Y - n [
+    # Denominator -> [ X ; Y [
+    #
+    # we can therefore simplify to
+    #
+    # Numerator -> [X - n; X [
+    # Denominator -> [Y - n; Y [
+
+ numerateur = xrange(N - r + 1 - n, N - r + 1)
+ denominateur= xrange(N + 1 - n, N + 1)
+#
+# original version
+#
+# m = N - n
+# numerateur = set(range(m-r+1,m+1))
+# denominateur = set(range(N-r+1,N+1))
+# simplification = numerateur & denominateur
+# numerateur -= simplification
+# denominateur -= simplification
+# numerateur = list(numerateur)
+# denominateur=list(denominateur)
+# numerateur.sort()
+# denominateur.sort()
+
+
+ p = reduce(lambda x,y:x*y,map(lambda i,j:Decimal(i)/Decimal(j),numerateur,denominateur))
+ return p
+
+
+def hypergeometric(x,N,n,r):
+ """
+    Compute term *x* of a hypergeometric distribution.
+
+    The computation follows the method described in
+
+    Trong Wu, An accurate computation of the hypergeometric distribution function,
+    ACM Trans. Math. Softw. 19 (1993), no. 1, 33–43.
+
+    Parameters:
+
+    - `x` : the expected number of marked elements
+    - `N` : the population size
+    - `n` : the number of marked elements
+    - `r` : the sample size
+
+    Returns a *Decimal* giving the probability of drawing *x* marked
+    elements out of *n* from a population of size *N* when sampling
+    *r* elements
+ """
+ if n < r:
+ s = n
+ n = r
+ r = s
+ assert x>=0 and x <= r,"x out of limits"
+ if x > 0 :
+ return hypergeometric(x-1,N,n,r) * (n - x + 1)/x * (r - x + 1)/(N-n-r+x)
+ else:
+ return _hyper0(N,n,r)
+
+def chypergeometric(xmin,xmax,N,n,r):
+ """
+    Compute the cumulative hypergeometric probability between *xmin* and *xmax*.
+
+    The computation follows the method described in
+
+    Trong Wu, An accurate computation of the hypergeometric distribution function,
+    ACM Trans. Math. Softw. 19 (1993), no. 1, 33–43.
+
+    Parameters:
+
+    - `xmin` : the minimum expected number of marked elements
+    - `xmax` : the maximum expected number of marked elements
+    - `N` : the population size
+    - `n` : the number of marked elements
+    - `r` : the sample size
+
+    Returns a *Decimal* giving the probability of drawing between *xmin*
+    and *xmax* marked elements out of *n* from a population of size *N*
+    when sampling *r* elements
+ """
+ if n < r:
+ s = n
+ n = r
+ r = s
+ assert xmin>=0 and xmin <= r and xmax>=0 and xmax <= r and xmin <=xmax,"x out of limits"
+ hg = hypergeometric(xmin,N,n,r)
+ rep = hg
+ for x in xrange(xmin+1,xmax+1):
+ hg = hg * (n - x + 1)/x * (r - x + 1)/(N-n-r+x)
+ rep+=hg
+ return rep
+
+def multipleTest(globalPvalue,testList):
+ """
+    Correction for multiple testing.
+
+    Selects, among a set of tests, the largest subset such that
+    the global risk stays below a given p-value.
+
+    Parameters:
+
+    - `globalPvalue` : global risk accepted over the whole set of tests
+    - `testList` : an iterable over a set of tests.
+        Each test is a list or tuple whose last element
+        is the p-value associated with the test
+
+    Returns a list containing the subset of tests selected from
+    `testList`
+ """
+ testList=list(testList)
+ testList.sort(lambda x,y:cmp(x[-1],y[-1]))
+ h0=1.0-globalPvalue
+ p=1.0
+ rep = []
+ for t in testList:
+ p*=1.0-t[-1]
+ if p > h0:
+ rep.append(t)
+ return rep
+
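+if __name__ == '__main__':
+    # Hedged usage sketch (not part of the original module): probability
+    # of drawing exactly 2, then at most 2, marked elements when sampling
+    # 10 elements from a population of 100 containing 20 marked ones,
+    # followed by a multiple-test correction at a 5% global risk.
+    print hypergeometric(2, 100, 20, 10)
+    print chypergeometric(0, 2, 100, 20, 10)
+    tests = [('t1', 0.001), ('t2', 0.01), ('t3', 0.2)]
+    print multipleTest(0.05, tests)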
\ No newline at end of file
diff --git a/obitools/statistics/noncentralhypergeo.py b/obitools/statistics/noncentralhypergeo.py
new file mode 100644
index 0000000..e6a96ce
--- /dev/null
+++ b/obitools/statistics/noncentralhypergeo.py
@@ -0,0 +1,208 @@
+from decimal import *
+from math import log
+
+#from obitools.utils import moduleInDevelopment
+
+#moduleInDevelopment(__name__)
+
+# from : http://www.programmish.com/?p=25
+
+def dec_log(self, base=10):
+ cur_prec = getcontext().prec
+ getcontext().prec += 2
+ baseDec = Decimal(10)
+ retValue = self
+
+ if isinstance(base, Decimal):
+ baseDec = base
+ elif isinstance(base, float):
+ baseDec = Decimal("%f" % (base))
+ else:
+ baseDec = Decimal(base)
+
+ integer_part = Decimal(0)
+ while retValue < 1:
+ integer_part = integer_part - 1
+ retValue = retValue * baseDec
+ while retValue >= baseDec:
+ integer_part = integer_part + 1
+ retValue = retValue / baseDec
+
+ retValue = retValue ** 10
+ decimal_frac = Decimal(0)
+ partial_part = Decimal(1)
+ while cur_prec > 0:
+ partial_part = partial_part / Decimal(10)
+ digit = Decimal(0)
+ while retValue >= baseDec:
+ digit += 1
+ retValue = retValue / baseDec
+ decimal_frac = decimal_frac + digit * partial_part
+ retValue = retValue ** 10
+ cur_prec -= 1
+ getcontext().prec -= 2
+
+ return integer_part + decimal_frac
+
+class Interval(object):
+ def __init__(self,begin,end,facteur=1):
+ self._begin = begin
+ self._end = end
+ self._facteur=facteur
+
+ def __str__(self):
+ return '[%d,%d] ^ %d' % (self._begin,self._end,self._facteur)
+
+ def __repr__(self):
+ return 'Interval(%d,%d,%d)' % (self._begin,self._end,self._facteur)
+
+ def begin(self):
+ return (self._begin,self._facteur,True)
+
+ def end(self):
+ return (self._end,-self._facteur,False)
+
+
+def cmpb(i1,i2):
+ x= cmp(i1[0],i2[0])
+ if x==0:
+ x = cmp(i2[2],i1[2])
+ return x
+
+class Product(object):
+ def __init__(self,i=None):
+ if i is not None:
+ self.prod=[i]
+ else:
+ self.prod=[]
+ self._simplify()
+
+ def _simplify(self):
+ bornes=[]
+ prod =[]
+
+ if self.prod:
+
+ for i in self.prod:
+ bornes.append(i.begin())
+ bornes.append(i.end())
+ bornes.sort(cmpb)
+
+
+ j=0
+ r=len(bornes)
+ for i in xrange(1,len(bornes)):
+ if bornes[i][0]==bornes[j][0] and bornes[i][2]==bornes[j][2]:
+ bornes[j]=(bornes[j][0],bornes[j][1]+bornes[i][1],bornes[i][2])
+ r-=1
+ else:
+ j+=1
+ bornes[j]=bornes[i]
+
+ bornes=bornes[0:r]
+
+ facteur=0
+ close=1
+
+ for b,level,open in bornes:
+ if not open:
+ close=0
+ else:
+ close=1
+ if facteur:
+ prod.append(Interval(debut,b-close,facteur))
+ debut=b+1-close
+ facteur+=level
+
+ self.prod=prod
+
+
+
+
+ def __mul__(self,p):
+ res = Product()
+ res.prod=list(self.prod)
+ res.prod.extend(p.prod)
+ res._simplify()
+ return res
+
+ def __div__(self,p):
+ np = Product()
+ np.prod = [Interval(x._begin,x._end,-x._facteur) for x in p.prod]
+ return self * np
+
+ def __str__(self):
+ return str(self.prod)
+
+ def log(self):
+ p=Decimal(0)
+ for k in self.prod:
+ p+= Decimal(k._facteur) * reduce(lambda x,y:x+dec_log(Decimal(y),Decimal(10)),xrange(k._begin,k._end+1),Decimal(0))
+ return p
+
+ def product(self):
+ p=Decimal(1)
+ for k in self.prod:
+ p*= reduce(lambda x,y:x*Decimal(y),xrange(k._begin,k._end+1),Decimal(1)) ** Decimal(k._facteur)
+ return p
+
+ def __call__(self,log=True):
+ if log:
+ return self.log()
+ else:
+ return self.product()
+
+
+def fact(n):
+ return Product(Interval(1,n))
+
+def cnp(n,p):
+ return fact(n)/fact(p)/fact(n-p)
+
+def hypergeometic(x,n,M,N):
+ '''
+
+    @param x: value of the random variable
+    @type x: int
+    @param n: sample size
+    @type n: int
+    @param M: number of marked balls
+    @type M: int
+    @param N: total number of balls in the urn
+    @type N: int
+
+ p(x)= cnp(M,x) * cnp(N-M,n-x) / cnp(N,n)
+ '''
+ return cnp(M,x) * cnp(N-M,n-x) / cnp(N,n)
+
+def nchypergeometique(x,n,M,N,r):
+ '''
+
+    @param x: value of the random variable
+    @type x: int
+    @param n: sample size
+    @type n: int
+    @param M: number of marked balls
+    @type M: int
+    @param N: total number of balls in the urn
+    @type N: int
+    @param r: odds ratio
+    @type r: float
+
+ p(x)= cnp(M,x) * cnp(N-M,n-x) / cnp(N,n)
+ '''
+
+ xmin = max(0,n-N+M)
+ xmax = min(n,M)
+ lr = dec_log(r)
+ xlr = x * lr
+ num = cnp(M,x) * cnp(N-M,n-x)
+ den = [cnp(M,y) * cnp(N-M,n-y) / num for y in xrange(xmin,xmax+1)]
+ fden = [lr * y - xlr for y in xrange(xmin,xmax+1)]
+
+ inverse=reduce(lambda x,y : x+y,
+ map(lambda i,j: i(False) * 10**j ,den,fden))
+ return 1/inverse
+
+
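+if __name__ == '__main__':
+    # Hedged usage sketch (not part of the original module): a binomial
+    # coefficient kept as a product of integer intervals and evaluated
+    # exactly, then the central hypergeometric probability p(x=2) for a
+    # sample of 10 with 20 marked balls out of 100.
+    print cnp(10, 3)(log=False)                     # -> 120
+    print hypergeometic(2, 10, 20, 100)(log=False)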
+
\ No newline at end of file
diff --git a/obitools/svg.py b/obitools/svg.py
new file mode 100644
index 0000000..c42e3ef
--- /dev/null
+++ b/obitools/svg.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+"""\
+SVG.py - Construct/display SVG scenes.
+
+The following code is a lightweight wrapper around SVG files. The metaphor
+is to construct a scene, add objects to it, and then write it to a file
+to display it.
+
+This program uses ImageMagick to display the SVG files. ImageMagick also
+does a remarkable job of converting SVG files into other formats.
+"""
+
+import os
+display_prog = 'display' # Command to execute to display images.
+
+class Scene:
+ def __init__(self,name="svg",height=400,width=400):
+ self.name = name
+ self.items = []
+ self.height = height
+ self.width = width
+ return
+
+ def add(self,item): self.items.append(item)
+
+    def strarray(self):
+        var = ["<?xml version=\"1.0\"?>\n",
+               "<svg height=\"%d\" width=\"%d\" >\n" % (self.height,self.width),
+               " <g style=\"fill-opacity:1.0; stroke:black;\n",
+               "  stroke-width:1;\">\n"]
+        for item in self.items: var += item.strarray()
+        var += [" </g>\n</svg>\n"]
+        return var
+
+ def write_svg(self,filename=None):
+ if filename:
+ self.svgname = filename
+ else:
+ self.svgname = self.name + ".svg"
+ file = open(self.svgname,'w')
+ file.writelines(self.strarray())
+ file.close()
+ return
+
+ def display(self,prog=display_prog):
+ os.system("%s %s" % (prog,self.svgname))
+ return
+
+
+class Line:
+ def __init__(self,start,end):
+ self.start = start #xy tuple
+ self.end = end #xy tuple
+ return
+
+ def strarray(self):
+ return [" \n" %\
+ (self.start[0],self.start[1],self.end[0],self.end[1])]
+
+
+class Circle:
+ def __init__(self,center,radius,color):
+ self.center = center #xy tuple
+        self.radius = radius #radius in svg units
+ self.color = color #rgb tuple in range(0,256)
+ return
+
+ def strarray(self):
+ return [" \n" % colorstr(self.color)]
+
+class Rectangle:
+ def __init__(self,origin,height,width,color):
+ self.origin = origin
+ self.height = height
+ self.width = width
+ self.color = color
+ return
+
+ def strarray(self):
+ return [" \n" %\
+ (self.width,colorstr(self.color))]
+
+class Text:
+ def __init__(self,origin,text,size=24):
+ self.origin = origin
+ self.text = text
+ self.size = size
+ return
+
+ def strarray(self):
+ return [" \n" %\
+ (self.origin[0],self.origin[1],self.size),
+ " %s\n" % self.text,
+ " \n"]
+
+
+def colorstr(rgb): return "#%x%x%x" % (rgb[0]/16,rgb[1]/16,rgb[2]/16)
+
+def test():
+ scene = Scene('test')
+ scene.add(Rectangle((100,100),200,200,(0,255,255)))
+ scene.add(Line((200,200),(200,300)))
+ scene.add(Line((200,200),(300,200)))
+ scene.add(Line((200,200),(100,200)))
+ scene.add(Line((200,200),(200,100)))
+ scene.add(Circle((200,200),30,(0,0,255)))
+ scene.add(Circle((200,300),30,(0,255,0)))
+ scene.add(Circle((300,200),30,(255,0,0)))
+ scene.add(Circle((100,200),30,(255,255,0)))
+ scene.add(Circle((200,100),30,(255,0,255)))
+ scene.add(Text((50,50),"Testing SVG"))
+ scene.write_svg()
+ scene.display()
+ return
+
+if __name__ == '__main__': test()
diff --git a/obitools/table/__init__.py b/obitools/table/__init__.py
new file mode 100644
index 0000000..41e00bd
--- /dev/null
+++ b/obitools/table/__init__.py
@@ -0,0 +1,633 @@
+'''
+Tables are lists of rows sharing the same column model. The module
+also provides relational-style iterators (projection, selection,
+union) over them.
+'''
+
+from itertools import imap,count,chain
+from logging import info
+
+class Table(list):
+ """
+ Tables are list of rows of the same model
+ """
+ def __init__(self, headers=None,
+ types=None,
+ colcount=None,
+ rowFactory=None,
+ subrowFactory=None):
+        '''
+
+        @param headers: the list of column headers.
+
+                        If this parameter is C{None}, the C{colcount}
+                        parameter must be set.
+
+        @type headers: C{list}, C{tuple} or an iterable object
+
+        @param types: the list of data types associated to each column.
+
+                      If this parameter is specified, its length must
+                      be equal to the C{headers} length or to
+                      C{colcount}.
+
+        @type types: C{list}, C{tuple} or an iterable object
+
+        @param colcount: number of columns in the created table.
+
+                         If the C{headers} parameter is not C{None},
+                         this parameter is ignored.
+
+        @type colcount: int
+        '''
+
+ assert headers is not None or colcount is not None,\
+            'headers or colcount parameter must not be None'
+
+ if headers is None:
+ headers = tuple('Col_%d' % x for x in xrange(colcount))
+
+ self.headers = headers
+ self.types = types
+ self.colcount= len(self.headers)
+
+ if rowFactory is None:
+ self.rowFactory=TableRow
+ else:
+ self.rowFactory=rowFactory
+
+ if subrowFactory is None:
+ self.subrowFactory=TableRow
+ else:
+            self.subrowFactory=subrowFactory
+
+
+ self.likedTo=set()
+
+
+
+ def isCompatible(self,data):
+ assert isinstance(data,(Table,TableRow))
+ return (self.colcount == data.colcount and
+ (id(self.types)==id(data.types) or
+ self.types==data.types
+ )
+ )
+
+ def __setitem__ (self,key,value):
+        '''
+        @param key: index or slice of the row(s) to set
+        @type key: C{int} or C{slice}
+        @param value: the new row value(s)
+        @type value: C{TableRow} or any iterable acceptable
+                     by the row factory
+        '''
+
+        if isinstance(key,int):
+ if not isinstance(value, TableRow):
+ value = self.rowFactory(self,value)
+ else:
+ assert self.isCompatible(value)
+ list.__setitem__(self,key,value.row)
+
+ elif isinstance(key,slice):
+            indices = xrange(*key.indices(len(self)))
+ for i,d in imap(None,indices,value):
+ self[i]=d
+
+ else:
+ raise TypeError, "Key must be an int or slice value"
+
+ def __getitem__(self,key):
+ '''
+        This method behaves differently depending on the
+        data type of C{key}.
+
+ @param key: description of the table part to return
+ @type key: C{int} or C{slice}
+
+ @return: return a TableRow (if key is C{int})
+ or a subpart of the table (if key is C{slice}).
+ '''
+ if isinstance(key,int):
+ return self.rowFactory(self,
+ list.__getitem__(self,key))
+
+ if isinstance(key,slice):
+ newtable=Table(self.headers,self.types)
+            indices = xrange(*key.indices(len(self)))
+ for i in indices:
+ list.append(newtable,list.__getitem__(self,i))
+ self.likedTo.add(newtable)
+ return newtable
+
+ raise TypeError
+
+
+ def __getslice__(self,x,y):
+ return self.__getitem__(slice(x,y))
+
+ def __iter__(self):
+ return TableIterator(self)
+
+ def __hash__(self):
+ return id(self)
+
+ def __add__(self,itable):
+ return concatTables(self,itable)
+
+ def _setTypes(self,types):
+        if types is not None and not isinstance(types,tuple):
+ types = tuple(x for x in types)
+
+ assert types is None or len(types)==len(self._headers)
+
+ self._types = types
+
+ if types is not None:
+ for row in self:
+ row.castRow()
+
+ def _getTypes(self):
+ return self._types
+
+ types = property(_getTypes,_setTypes)
+
+ def _getHeaders(self):
+ return self._headers
+
+ def _setHeaders(self,headers):
+ if not isinstance(headers, tuple):
+ headers = tuple(x for x in headers)
+
+ self._hindex = dict((k,i) for i,k in imap(None,count(),headers))
+ self._headers=headers
+ self.colcount=len(headers)
+
+ headers=property(_getHeaders,_setHeaders)
+
+ def append(self,value):
+ if not isinstance(value, TableRow):
+ value = self.rowFactory(self,value)
+ else:
+ assert self.isCompatible(value)
+ list.append(self,value.row)
+
+
+
+class _Row(list):
+ def __init__(self,data,size):
+ if data is None:
+ list.__init__(self,(None for x in xrange(size)))
+ else:
+ list.__init__(self,data)
+ assert len(self)==size, \
+ "Size of data is not correct (%d instead of %d)" % (len(self),size)
+
+    def append(self,value):
+        raise NotImplementedError, \
+              "Rows cannot change size"
+
+    def pop(self,key=None):
+        raise NotImplementedError, \
+              "Rows cannot change size"
+
+    def extend(self,values):
+        raise NotImplementedError, \
+              "Rows cannot change size"
+
+
+
+
+class TableRow(object):
+ '''
+
+ '''
+ def __init__(self, table,
+ data=None,
+ ):
+
+ self.table = table
+
+ if isinstance(data,_Row):
+            self.row=data
+ else:
+ data = self._castRow(data)
+ self.row=_Row(data,self._colcount)
+
+ def getType(self):
+ return self.table.types
+
+ def getHeaders(self):
+ return self.table.headers
+
+ def getHIndex(self):
+ return self.table._hindex
+
+ def getColCount(self):
+ return self.table.colcount
+
+ types = property(getType,None,None,
+ "List of types associated to this row")
+ headers= property(getHeaders,None,None,
+ "List of headers associated to this row")
+
+    _hindex= property(getHIndex,None,None)
+    _colcount = property(getColCount,None,None)
+    colcount = property(getColCount,None,None,
+                        "Number of columns in this row")
+
+ def _castValue(t,x):
+ '''
+ Cast a value to a specified type, with exception of
+ C{None} values that are returned without cast.
+
+ @param t: the destination type
+ @type t: C{type}
+ @param x: the value to cast
+
+ @return: the casted value or C{None}
+
+ '''
+ if x is None or t is None:
+ return x
+ else:
+ return t(x)
+
+ _castValue=staticmethod(_castValue)
+
+ def _castRow(self,data):
+
+ if not isinstance(data, (list,dict)):
+ data=[x for x in data]
+
+ if isinstance(data,list):
+            assert len(data)==self._colcount, \
+                   'data has an incorrect length'
+ if self.types is not None:
+ data=[TableRow._castValue(t, x)
+ for t,x in imap(None,self.types,data)]
+
+ elif isinstance(data,dict):
+            lvalue = [None] * len(self.headers)
+
+ for k,v in data.items():
+ try:
+ hindex = self._hindex[k]
+ if self.types is not None:
+ lvalue[hindex]=TableRow._castValue(self.types[hindex], v)
+ else:
+ lvalue[hindex]=v
+ except KeyError:
+ info('%s is not a table column' % k)
+
+ data=lvalue
+ else:
+ raise TypeError
+
+ return data
+
+ def __getitem__(self,key):
+        '''
+        @param key: index, slice or column name of the value(s) to return
+        @type key: C{int}, C{slice} or C{str}
+        '''
+
+ if isinstance(key,(int,slice)):
+ return self.row[key]
+
+ if isinstance(key,str):
+ i = self._hindex[key]
+ return self.row[i]
+
+ raise TypeError, "Key must be an int, slice or str value"
+
+ def __setitem__(self,key,value):
+        '''
+        @param key: index, slice or column name of the value(s) to set
+        @type key: C{int}, C{slice} or C{str}
+        @param value: the new value(s); cast to the column type if one
+                      is declared
+        '''
+
+        if isinstance(key,str):
+            key = self._hindex[key]
+
+        if isinstance(key,int):
+            if self.types is not None:
+                value = TableRow._castValue(self.types[key], value)
+            self.row[key]=value
+
+        elif isinstance(key,slice):
+            indices = xrange(*key.indices(len(self.row)))
+            for i,v in imap(None,indices,value):
+                self[i]=v
+        else:
+            raise TypeError, "Key must be an int, slice or str value"
+
+
+
+ def __iter__(self):
+        '''
+        Iterate over the values of the row.
+        '''
+ return iter(self.row)
+
+    def append(self,value):
+        raise NotImplementedError, \
+              "Rows cannot change size"
+
+    def pop(self,key=None):
+        raise NotImplementedError, \
+              "Rows cannot change size"
+
+    def extend(self,values):
+        raise NotImplementedError, \
+              "Rows cannot change size"
+
+ def __len__(self):
+ return self._colcount
+
+ def __repr__(self):
+ return repr(self.row)
+
+ def __str__(self):
+ return str(self.row)
+
+ def castRow(self):
+ self.row = _Row(self._castRow(self.row),len(self.row))
+
+
+class iTableIterator(object):
+
+    def _getHeaders(self):
+        raise NotImplementedError
+
+    def _getTypes(self):
+        raise NotImplementedError
+
+    def _getRowFactory(self):
+        raise NotImplementedError
+
+    def _getSubrowFactory(self):
+        raise NotImplementedError
+
+    def _getColcount(self):
+        return len(self._getHeaders())
+
+ def __iter__(self):
+ return self
+
+ headers = property(_getHeaders,None,None)
+ types = property(_getTypes,None,None)
+ rowFactory = property(_getRowFactory,None,None)
+ subrowFactory = property(_getSubrowFactory,None,None)
+ colcount = property(_getColcount,None,None)
+
+ def columnIndex(self,name):
+ if isinstance(name,str):
+ return self._reference.headers.index(name)
+ elif isinstance(name,int):
+ lh = len(self._reference.headers)
+ if name < lh and name >=0:
+ return name
+            elif name < 0 and name >= -lh:
+                return lh + name
+ raise IndexError
+ raise TypeError
+
+    def next(self):
+        raise NotImplementedError
+
+
+class TableIterator(iTableIterator):
+
+ def __init__(self,table):
+ if not isinstance(table,Table):
+ raise TypeError
+
+ self._reftable=table
+ self._i=0
+
+ def _getHeaders(self):
+ return self._reftable.headers
+
+ def _getTypes(self):
+ return self._reftable.types
+
+ def _getRowFactory(self):
+ return self._reftable.rowFactory
+
+ def _getSubrowFactory(self):
+ return self._reftable.subrowFactory
+
+ def columnIndex(self,name):
+ if isinstance(name,str):
+ return self._reftable._hindex[name]
+ elif isinstance(name,int):
+ lh = len(self._reftable._headers)
+ if name < lh and name >=0:
+ return name
+            elif name < 0 and name >= -lh:
+                return lh + name
+ raise IndexError
+ raise TypeError
+
+
+    def rewind(self):
+        self._i=0
+
+ def next(self):
+ if self._i < len(self._reftable):
+ rep=self._reftable[self._i]
+ self._i+=1
+ return rep
+ else:
+ raise StopIteration
+
+ headers = property(_getHeaders,None,None)
+ types = property(_getTypes,None,None)
+ rowFactory = property(_getRowFactory,None,None)
+ subrowFactory = property(_getSubrowFactory,None,None)
+
+
+class ProjectionIterator(iTableIterator):
+
+ def __init__(self,tableiterator,*cols):
+ self._reference = iter(tableiterator)
+
+ assert isinstance(self._reference, iTableIterator)
+
+ self._selected = tuple(self._reference.columnIndex(x)
+ for x in cols)
+ self._headers = tuple(self._reference.headers[x]
+ for x in self._selected)
+
+ if self._reference.types is not None:
+ self._types= tuple(self._reference.types[x]
+ for x in self._selected)
+ else:
+ self._types=None
+
+ def _getRowFactory(self):
+ return self._reference.subrowFactory
+
+ def _getSubrowFactory(self):
+ return self._reference.subrowFactory
+
+ def _getHeaders(self):
+ return self._headers
+
+ def _getTypes(self):
+ return self._types
+
+ headers = property(_getHeaders,None,None)
+ types = property(_getTypes,None,None)
+ rowFactory = property(_getRowFactory,None,None)
+ subrowFactory = property(_getSubrowFactory,None,None)
+
+ def next(self):
+ value = self._reference.next()
+ value = (value[x] for x in self._selected)
+ return self.rowFactory(self,value)
+
+class SelectionIterator(iTableIterator):
+ def __init__(self,tableiterator,**conditions):
+ self._reference = iter(tableiterator)
+
+ assert isinstance(self._reference, iTableIterator)
+
+ self._conditions=dict((self._reference.columnIndex(i),c)
+ for i,c in conditions.iteritems())
+
+ def _checkCondition(self,row):
+ return reduce(lambda x,y : x and y,
+ (bool(self._conditions[i](row[i]))
+ for i in self._conditions),
+ True)
+
+ def _getRowFactory(self):
+ return self._reference.rowFactory
+
+ def _getSubrowFactory(self):
+ return self._reference.subrowFactory
+
+ def _getHeaders(self):
+ return self._reference.headers
+
+ def _getTypes(self):
+ return self._reference.types
+
+ def next(self):
+ row = self._reference.next()
+ while not self._checkCondition(row):
+ row = self._reference.next()
+ return row
+
+
+ headers = property(_getHeaders,None,None)
+ types = property(_getTypes,None,None)
+ rowFactory = property(_getRowFactory,None,None)
+ subrowFactory = property(_getSubrowFactory,None,None)
+
+
+class UnionIterator(iTableIterator):
+ def __init__(self,*itables):
+ self._itables=[iter(x) for x in itables]
+ self._types = self._itables[0].types
+ self._headers = self._itables[0].headers
+
+ assert reduce(lambda x,y: x and y,
+ ( isinstance(z,iTableIterator)
+ and len(z.headers)==len(self._headers)
+ for z in self._itables),
+ True)
+
+ self._iterator = chain(*self._itables)
+
+ def _getRowFactory(self):
+ return self._itables[0].rowFactory
+
+ def _getSubrowFactory(self):
+ return self._itables[0].subrowFactory
+
+ def _getHeaders(self):
+ return self._headers
+
+ def _getTypes(self):
+ return self._types
+
+ def next(self):
+ value = self._iterator.next()
+ return self.rowFactory(self,value.row)
+
+ headers = property(_getHeaders,None,None)
+ types = property(_getTypes,None,None)
+ rowFactory = property(_getRowFactory,None,None)
+ subrowFactory = property(_getSubrowFactory,None,None)
+
+
+
+def tableFactory(tableiterator):
+ tableiterator = iter(tableiterator)
+ assert isinstance(tableiterator, iTableIterator)
+
+    newtable = Table(tableiterator.headers,
+                     tableiterator.types,
+                     rowFactory=tableiterator.rowFactory,
+                     subrowFactory=tableiterator.subrowFactory)
+
+ for r in tableiterator:
+ newtable.append(r)
+
+ return newtable
+
+def projectTable(tableiterator,*cols):
+ return tableFactory(ProjectionIterator(tableiterator,*cols))
+
+def subTable(tableiterator,**conditions):
+ return tableFactory(SelectionIterator(tableiterator,**conditions))
+
+def concatTables(*itables):
+    '''
+    Concatenate several tables.
+
+    Concatenation is done using the L{UnionIterator}.
+
+    @type itables: iTableIterator or Table
+
+    @return: a new Table
+    @rtype: C{Table}
+
+    @see: L{UnionIterator}
+    '''
+ return tableFactory(UnionIterator(*itables))
+
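+# A hedged end-to-end sketch of the table helpers defined above:
+#
+#   t = Table(headers=('name','count'), types=(str,int))
+#   t.append(['a', '3'])                        # '3' is cast to int 3
+#   t.append({'name': 'b', 'count': 7})         # dict rows map by header
+#   names = projectTable(t, 'name')             # one-column Table
+#   big   = subTable(t, count=lambda x: x > 2)  # rows where count > 2
+#   both  = concatTables(t, t)                  # four-row Table
+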
+class TableIteratorAsDict(object):
+
+ def __init__(self,tableiterator):
+ self._reference = iter(tableiterator)
+
+ assert isinstance(self._reference, iTableIterator)
+
+ self._headers = self._reference.headers
+ self._types = self._reference.types
+ if self._types is not None:
+ self._types = dict((n,t)
+ for n,t in imap(None,self._headers,self._types))
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ value = self._reference.next()
+ return dict((n,t)
+ for n,t in imap(None,self._headers,value))
+
+ def _getHeaders(self):
+ return self._headers
+
+ def _getTypes(self):
+ return self._types
+
+ headers = property(_getHeaders,None,None)
+ types = property(_getTypes,None,None)
+
\ No newline at end of file
diff --git a/obitools/table/csv.py b/obitools/table/csv.py
new file mode 100644
index 0000000..1d9a73d
--- /dev/null
+++ b/obitools/table/csv.py
@@ -0,0 +1,52 @@
+"""
+obitools.table.csv module provides an iterator adapter
+allowing to parse csv (comma separatted value) file
+"""
+
+import re
+
+def csvIterator(lineIterator,sep=','):
+    '''
+    Allows easy parsing of a CSV file. This function
+    converts an iterator over the lines of a CSV text
+    file into an iterator over lists of values; each
+    list holds the values present on one line.
+
+    @param lineIterator: iterator on text lines
+    @type lineIterator: iterator
+    @param sep: one-letter string used as separator;
+                blank characters and the double quote
+                are not allowed as separators
+    @type sep: string
+    @return: an iterator on lists of values
+    @rtype: iterator
+    '''
+ assert len(sep)==1 and not sep.isspace() and sep!='"'
+ valueMatcher=re.compile('\s*((")(([^"]|"")*)"|([^%s]*?))\s*(%s|$)' % (sep,sep))
+ def iterator():
+ for l in lineIterator:
+ yield _csvParse(l,valueMatcher)
+ return iterator()
+
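+# Hedged example: parsing quoted and unquoted fields.
+#
+#   lines = ['a,b,"c,1"', '1,2,3']
+#   for row in csvIterator(iter(lines)):
+#       print row
+#   # -> ['a', 'b', 'c,1'] then ['1', '2', '3']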
+
+def _csvParse(line,valueMatcher):
+ data=[]
+ i = iter(valueMatcher.findall(line))
+ m = i.next()
+ if m[0]:
+ while m[-1]!='':
+ if m[1]=='"':
+ data.append(m[2].replace('""','"'))
+ else:
+ data.append(m[0])
+ m=i.next()
+ if m[1]=='"':
+ data.append(m[2].replace('""','"'))
+ else:
+ data.append(m[0])
+ return data
+
+
+
+
+
\ No newline at end of file
diff --git a/obitools/tagmatcher/__init__.py b/obitools/tagmatcher/__init__.py
new file mode 100644
index 0000000..880ead0
--- /dev/null
+++ b/obitools/tagmatcher/__init__.py
@@ -0,0 +1,35 @@
+from obitools import NucSequence
+from obitools.location import locationGenerator,extractExternalRefs
+
+
+
+class TagMatcherSequence(NucSequence):
+ '''
+ Class used to represent a nucleic sequence issued mapped
+ on a genome by the tagMatcher software.
+ '''
+
+ def __init__(self,seq,cd,locs,dm,rm):
+ NucSequence.__init__(self, seq, seq)
+ self['locations']=locs
+ self['conditions']=cd
+ self['dm']=dm
+ self['rm']=rm
+ self['tm']=dm+rm
+
+ def eminEmaxFilter(self,emin=None,emax=None):
+ result = [x for x in self['locations']
+ if (emin is None or x['error'] >=emin)
+ and (emax is None or x['error'] <=emax)]
+ self['locations']=result
+ dm=0
+ rm=0
+ for x in result:
+ if x.isDirect():
+ dm+=1
+ else:
+ rm+=1
+ self['dm']=dm
+ self['rm']=rm
+ self['tm']=dm+rm
+ return self
diff --git a/obitools/tagmatcher/options.py b/obitools/tagmatcher/options.py
new file mode 100644
index 0000000..45673ce
--- /dev/null
+++ b/obitools/tagmatcher/options.py
@@ -0,0 +1,14 @@
+def addTagMatcherErrorOptions(optionManager):
+ optionManager.add_option('-E','--emax',
+ action='store',
+ metavar="<##>",
+ type="int",dest="emax",
+ default=None,
+                             help="keep matches with no more than emax errors")
+
+ optionManager.add_option('-e','--emin',
+ action='store',
+ metavar="<##>",
+ type="int",dest="emin",
+ default=0,
+                             help="keep matches with at least emin errors")
diff --git a/obitools/tagmatcher/parser.py b/obitools/tagmatcher/parser.py
new file mode 100644
index 0000000..a843e66
--- /dev/null
+++ b/obitools/tagmatcher/parser.py
@@ -0,0 +1,89 @@
+import re
+import sys
+
+from obitools import tagmatcher
+from obitools.seqdb import nucEntryIterator
+from obitools.location.feature import Feature
+from obitools.location import locationGenerator
+
+_seqMatcher = re.compile('(?<=TG )[acgtrymkwsbdhvnACGTRYMKWSBDHVN]+')
+_cdMatcher = re.compile('(?<=CD ) *([^:]+?) +: +([0-9]+)')
+_loMatcher = re.compile('(?<=LO ) *([ACGTRYMKWSBDHVN]+) +([^ ]+) +([^ ]+) +\(([0-9]+)\)')
+_dmMatcher = re.compile('(?<=DM )[0-9]+')
+_rmMatcher = re.compile('(?<=RM )[0-9]+')
+
+
+def __tagmatcherparser(text):
+ try:
+ seq = _seqMatcher.search(text).group()
+ cd = dict((x[0],int(x[1])) for x in _cdMatcher.findall(text))
+ locs = []
+
+ for (match,ac,loc,err) in _loMatcher.findall(text):
+ feat = Feature('location', locationGenerator(loc))
+ feat['error']=int(err)
+ feat['match']=match
+ feat['contig']=ac
+ locs.append(feat)
+
+ dm = int(_dmMatcher.search(text).group())
+ rm = int(_rmMatcher.search(text).group())
+
+ except AttributeError,e:
+ print >>sys.stderr,'======================================================='
+ print >>sys.stderr,text
+ print >>sys.stderr,'======================================================='
+ raise e
+
+ return (seq,cd,locs,dm,rm)
+
+def tagMatcherParser(text):
+ return tagmatcher.TagMatcherSequence(*__tagmatcherparser(text))
+
+
+class TagMatcherIterator(object):
+ _cdheadparser = re.compile('condition [0-9]+ : (.+)')
+
+ def __init__(self,file):
+ self._ni = nucEntryIterator(file)
+ self.header=self._ni.next()
+ self.conditions=TagMatcherIterator._cdheadparser.findall(self.header)
+
+ def next(self):
+ return tagMatcherParser(self._ni.next())
+
+ def __iter__(self):
+ return self
+
+def formatTagMatcher(tmseq,reader=None):
+ if isinstance(tmseq, TagMatcherIterator):
+ return tmseq.header
+
+ assert isinstance(tmseq,tagmatcher.TagMatcherSequence),'Only TagMatcherSequence can be used'
+ lo = '\n'.join(['LO %s %s %s (%d)' % (l['match'],l['contig'],l.locStr(),l['error'])
+ for l in tmseq['locations']])
+ if reader is not None:
+ cd = '\n'.join(['CD %s : %d' % (x,tmseq['conditions'][x])
+ for x in reader.conditions])
+ else:
+ cd = '\n'.join(['CD %s : %d' % (x,tmseq['conditions'][x])
+ for x in tmseq['conditions']])
+
+ tg = 'TG %s' % str(tmseq)
+
+ e=[tg]
+ if cd:
+ e.append(cd)
+ if lo:
+ e.append(lo)
+
+ tm = 'TM %d' % tmseq['tm']
+ dm = 'DM %d' % tmseq['dm']
+ rm = 'RM %d' % tmseq['rm']
+
+ e.extend((tm,dm,rm,'//'))
+
+ return '\n'.join(e)
+
+
+
diff --git a/obitools/thermo/__init__.py b/obitools/thermo/__init__.py
new file mode 100644
index 0000000..492dbb9
--- /dev/null
+++ b/obitools/thermo/__init__.py
@@ -0,0 +1,597 @@
+from math import log
+from array import array
+from copy import deepcopy
+
+bpencoder={'A':1,'C':2,'G':3,'T':4,
+ 'a':1,'c':2,'g':3,'t':4,
+ '-':0
+ }
+
+rvencoder={'A':4,'C':3,'G':2,'T':1,
+ 'a':4,'c':3,'g':2,'t':1,
+ '-':0
+ }
+
+R = 1.987
+SALT_METHOD_SANTALUCIA = 1
+SALT_METHOD_OWCZARZY = 2
+DEF_CONC_PRIMERS = 8.e-7
+DEF_CONC_SEQUENCES = 0.
+DEF_SALT = 0.05
+forbidden_entropy = 0.
+forbidden_enthalpy = 1.e18
+
+__dH = [[[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]],
+ [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]],
+ [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]],
+ [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]],
+ [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]],
+ [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]]
+ ]
+__dS = [[[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]],
+ [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]],
+ [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]],
+ [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]],
+ [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]],
+ [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]],
+ [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]]
+ ]
+
+def initParams(c1, c2, kp, sm, nparm=None):
+    global forbidden_entropy
+    global dH,dS
+
+    if nparm is None:
+        nparm = {}
+
+    dH=deepcopy(__dH)
+    dS=deepcopy(__dS)
+
+ nparm['Ct1'] = c1;
+ nparm['Ct2'] = c2;
+ nparm['kplus'] = kp;
+ maxCT = 1;
+
+ if(nparm['Ct2'] > nparm['Ct1']):
+ maxCT = 2
+
+ if(nparm['Ct1'] == nparm['Ct2']):
+ ctFactor = nparm['Ct1']/2
+ elif (maxCT == 1):
+ ctFactor = nparm['Ct1']-nparm['Ct2']/2
+ else:
+ ctFactor = nparm['Ct2']-nparm['Ct1']/2
+
+ nparm['rlogc'] = R * log(ctFactor)
+ forbidden_entropy = nparm['rlogc']
+ nparm['kfac'] = 0.368 * log(nparm['kplus'])
+ nparm['saltMethod'] = sm
+
+
+ # Set all X-/Y-, -X/Y- and X-/-Y so, that TM will be VERY small!
+ for x in xrange(1,5):
+ for y in xrange(1,5):
+ dH[0][x][y][0]=forbidden_enthalpy;
+ dS[0][x][y][0]=forbidden_entropy;
+ dH[x][0][0][y]=forbidden_enthalpy;
+ dS[x][0][0][y]=forbidden_entropy;
+ dH[x][0][y][0]=forbidden_enthalpy;
+ dS[x][0][y][0]=forbidden_entropy;
+ # forbid X-/Y$ and X$/Y- etc., i.e. terminal must not be paired with gap!
+ dH[x][5][y][0]=forbidden_enthalpy;
+ dS[x][5][y][0]=forbidden_entropy;
+ dH[x][0][y][5]=forbidden_enthalpy;
+ dS[x][0][y][5]=forbidden_entropy;
+ dH[5][x][0][y]=forbidden_enthalpy;
+ dS[5][x][0][y]=forbidden_entropy;
+ dH[0][x][5][y]=forbidden_enthalpy;
+ dS[0][x][5][y]=forbidden_entropy;
+
+ #forbid X$/-Y etc.
+ dH[x][5][0][y]=forbidden_enthalpy;
+ dS[x][5][0][y]=forbidden_entropy;
+ dH[x][0][5][y]=forbidden_enthalpy;
+ dS[x][0][5][y]=forbidden_entropy;
+ dH[5][x][y][0]=forbidden_enthalpy;
+ dS[5][x][y][0]=forbidden_entropy;
+ dH[0][x][y][5]=forbidden_enthalpy;
+ dS[0][x][y][5]=forbidden_entropy;
+
+
+
+ #also, forbid x-/-- and --/x-, i.e. no two inner gaps paired
+ dH[x][0][0][0]=forbidden_enthalpy;
+ dS[x][0][0][0]=forbidden_entropy;
+ dH[0][0][x][0]=forbidden_enthalpy;
+ dS[0][0][x][0]=forbidden_entropy;
+ # x-/-$
+ dH[x][0][0][5]=forbidden_enthalpy;
+ dS[x][0][0][5]=forbidden_entropy;
+ dH[5][0][0][x]=forbidden_enthalpy;
+ dS[5][0][0][x]=forbidden_entropy;
+ dH[0][5][x][0]=forbidden_enthalpy;
+        dS[0][5][x][0]=forbidden_entropy;
+ dH[0][x][5][0]=forbidden_enthalpy;
+ dS[0][x][5][0]=forbidden_entropy;
+
+ # forbid --/--
+ dH[0][0][0][0]=forbidden_enthalpy;
+ dS[0][0][0][0]=forbidden_entropy;
+
+ dH[5][0][0][0]=forbidden_enthalpy;
+ dS[5][0][0][0]=forbidden_entropy;
+ dH[0][0][5][0]=forbidden_enthalpy;
+ dS[0][0][5][0]=forbidden_entropy;
+ dH[0][5][5][0]=forbidden_enthalpy;
+ dS[0][5][5][0]=forbidden_entropy;
+
+ # Interior loops (double Mismatches)
+ iloop_entropy=-0.97
+ iloop_enthalpy=0.0
+
+ for x in xrange(1,5):
+ for y in xrange(1,5):
+ for a in xrange(1,5):
+ for b in xrange(1,5):
+ # AT and CG pair, and as A=1, C=2, G=3, T=4 this means
+ # we have Watson-Crick pairs if (x+a==5) and (y+b)==5.
+ if ( not ((x+a==5) or (y+b==5))):
+ # No watson-crick-pair, i.e. double mismatch!
+ # set enthalpy/entropy to loop expansion!
+ dH[x][y][a][b] = iloop_enthalpy;
+ dS[x][y][a][b] = iloop_entropy;
+
+
+ # xy/-- and --/xy (Bulge Loops of size > 1)
+ bloop_entropy=-1.3
+ bloop_enthalpy=0.0
+
+ for x in xrange(1,5):
+ for y in xrange(1,5):
+ dH[x][y][0][0] = bloop_enthalpy;
+ dS[x][y][0][0] = bloop_entropy;
+ dH[0][0][x][y] = bloop_enthalpy;
+ dS[0][0][x][y] = bloop_entropy;
+
+
+ # x-/ya abd xa/y- as well as -x/ay and ax/-y
+ # bulge opening and closing parameters with
+ # adjacent matches / mismatches
+ # obulge_mism and cbulge_mism chosen so high to avoid
+ # AAAAAAAAA
+ # T--G----T
+ # being better than
+ # AAAAAAAAA
+ # TG------T
+ obulge_match_H =-2.66e3
+ obulge_match_S =-14.22
+ cbulge_match_H =-2.66e3
+ cbulge_match_S =-14.22
+ obulge_mism_H = 0.0
+ obulge_mism_S = -6.45
+ cbulge_mism_H = 0.0
+ cbulge_mism_S =-6.45
+
+ for x in xrange(1,5):
+ for y in xrange(1,5):
+ for a in xrange(1,5):
+ if (x+y==5): # other base pair matches!
+
+ dH[x][0][y][a]=obulge_match_H; # bulge opening
+ dS[x][0][y][a]=obulge_match_S;
+ dH[x][a][y][0]=obulge_match_H;
+ dS[x][a][y][0]=obulge_match_S;
+ dH[0][x][a][y]=cbulge_match_H; # bulge closing
+ dS[0][x][a][y]=cbulge_match_S;
+ dH[a][x][0][y]=cbulge_match_H;
+ dS[a][x][0][y]=cbulge_match_S;
+ else:
+ # mismatch in other base pair!
+ dH[x][0][y][a]=obulge_mism_H; # bulge opening
+ dS[x][0][y][a]=obulge_mism_S;
+ dH[x][a][y][0]=obulge_mism_H;
+ dS[x][a][y][0]=obulge_mism_S;
+ dH[0][x][a][y]=cbulge_mism_H; # bulge closing
+ dS[0][x][a][y]=cbulge_mism_S;
+ dH[a][x][0][y]=cbulge_mism_H;
+ dS[a][x][0][y]=cbulge_mism_S;
+
+
+
+ # Watson-Crick pairs (note that only ten are unique, as obviously
+ # 5'-AG-3'/3'-TC-5' = 5'-CT-3'/3'-GA-5' etc.
+ dH[1][1][4][4]=-7.6e3; dS[1][1][4][4]=-21.3 # AA/TT 04
+ dH[1][2][4][3]=-8.4e3; dS[1][2][4][3]=-22.4 # AC/TG adapted GT/CA
+ dH[1][3][4][2]=-7.8e3; dS[1][3][4][2]=-21.0 # AG/TC adapted CT/GA
+ dH[1][4][4][1]=-7.2e3; dS[1][4][4][1]=-20.4 # AT/TA 04
+ dH[2][1][3][4]=-8.5e3; dS[2][1][3][4]=-22.7 # CA/GT 04
+ dH[2][2][3][3]=-8.0e3; dS[2][2][3][3]=-19.9 # CC/GG adapted GG/CC
+ dH[2][3][3][2]=-10.6e3; dS[2][3][3][2]=-27.2 # CG/GC 04
+ dH[2][4][3][1]=-7.8e3; dS[2][4][3][1]=-21.0 # CT/GA 04
+ dH[3][1][2][4]=-8.2e3; dS[3][1][2][4]=-22.2 # GA/CT 04
+ dH[3][2][2][3]=-9.8e3; dS[3][2][2][3]=-24.4 # GC/CG 04
+ dH[3][3][2][2]=-8.0e3; dS[3][3][2][2]=-19.9 # GG/CC 04
+ dH[3][4][2][1]=-8.4e3; dS[3][4][2][1]=-22.4 # GT/CA 04
+ dH[4][1][1][4]=-7.2e3; dS[4][1][1][4]=-21.3 # TA/AT 04
+ dH[4][2][1][3]=-8.2e3; dS[4][2][1][3]=-22.2 # TC/AG adapted GA/CT
+ dH[4][3][1][2]=-8.5e3; dS[4][3][1][2]=-22.7 # TG/AC adapted CA/GT
+ dH[4][4][1][1]=-7.6e3; dS[4][4][1][1]=-21.3 # TT/AA adapted AA/TT
+
+ # A-C Mismatches (Values for pH 7.0)
+ dH[1][1][2][4]=7.6e3; dS[1][1][2][4]=20.2 # AA/CT
+ dH[1][1][4][2]=2.3e3; dS[1][1][4][2]=4.6 # AA/TC
+ dH[1][2][2][3]=-0.7e3; dS[1][2][2][3]=-3.8 # AC/CG
+ dH[1][2][4][1]=5.3e3; dS[1][2][4][1]=14.6 # AC/TA
+ dH[1][3][2][2]=0.6e3; dS[1][3][2][2]=-0.6 # AG/CC
+ dH[1][4][2][1]=5.3e3; dS[1][4][2][1]=14.6 # AT/CA
+ dH[2][1][1][4]=3.4e3; dS[2][1][1][4]=8.0 # CA/AT
+ dH[2][1][3][2]=1.9e3; dS[2][1][3][2]=3.7 # CA/GC
+ dH[2][2][1][3]=5.2e3; dS[2][2][1][3]=14.2 # CC/AG
+ dH[2][2][3][1]=0.6e3; dS[2][2][3][1]=-0.6 # CC/GA
+ dH[2][3][1][2]=1.9e3; dS[2][3][1][2]=3.7 # CG/AC
+ dH[2][4][1][1]=2.3e3; dS[2][4][1][1]=4.6 # CT/AA
+ dH[3][1][2][2]=5.2e3; dS[3][1][2][2]=14.2 # GA/CC
+ dH[3][2][2][1]=-0.7e3; dS[3][2][2][1]=-3.8 # GC/CA
+ dH[4][1][1][2]=3.4e3; dS[4][1][1][2]=8.0 # TA/AC
+ dH[4][2][1][1]=7.6e3; dS[4][2][1][1]=20.2 # TC/AA
+
+ # C-T Mismatches
+ dH[1][2][4][4]=0.7e3; dS[1][2][4][4]=0.2 # AC/TT
+ dH[1][4][4][2]=-1.2e3; dS[1][4][4][2]=-6.2 # AT/TC
+ dH[2][1][4][4]=1.0e3; dS[2][1][4][4]=0.7 # CA/TT
+ dH[2][2][3][4]=-0.8e3; dS[2][2][3][4]=-4.5 # CC/GT
+ dH[2][2][4][3]=5.2e3; dS[2][2][4][3]=13.5 # CC/TG
+ dH[2][3][4][2]=-1.5e3; dS[2][3][4][2]=-6.1 # CG/TC
+ dH[2][4][3][2]=-1.5e3; dS[2][4][3][2]=-6.1 # CT/GC
+ dH[2][4][4][1]=-1.2e3; dS[2][4][4][1]=-6.2 # CT/TA
+ dH[3][2][2][4]=2.3e3; dS[3][2][2][4]=5.4 # GC/CT
+ dH[3][4][2][2]=5.2e3; dS[3][4][2][2]=13.5 # GT/CC
+ dH[4][1][2][4]=1.2e3; dS[4][1][2][4]=0.7 # TA/CT
+ dH[4][2][2][3]=2.3e3; dS[4][2][2][3]=5.4 # TC/CG
+ dH[4][2][1][4]=1.2e3; dS[4][2][1][4]=0.7 # TC/AT
+ dH[4][3][2][2]=-0.8e3; dS[4][3][2][2]=-4.5 # TG/CC
+ dH[4][4][2][1]=0.7e3; dS[4][4][2][1]=0.2 # TT/CA
+ dH[4][4][1][2]=1.0e3; dS[4][4][1][2]=0.7 # TT/AC
+
+ # G-A Mismatches
+ dH[1][1][3][4]=3.0e3; dS[1][1][3][4]=7.4 # AA/GT
+ dH[1][1][4][3]=-0.6e3; dS[1][1][4][3]=-2.3 # AA/TG
+ dH[1][2][3][3]=0.5e3; dS[1][2][3][3]=3.2 # AC/GG
+ dH[1][3][3][2]=-4.0e3; dS[1][3][3][2]=-13.2 # AG/GC
+ dH[1][3][4][1]=-0.7e3; dS[1][3][4][1]=-2.3 # AG/TA
+ dH[1][4][3][1]=-0.7e3; dS[1][4][3][1]=-2.3 # AT/GA
+ dH[2][1][3][3]=-0.7e3; dS[2][1][3][3]=-2.3 # CA/GG
+ dH[2][3][3][1]=-4.0e3; dS[2][3][3][1]=-13.2 # CG/GA
+ dH[3][1][1][4]=0.7e3; dS[3][1][1][4]=0.7 # GA/AT
+ dH[3][1][2][3]=-0.6e3; dS[3][1][2][3]=-1.0 # GA/CG
+ dH[3][2][1][3]=-0.6e3; dS[3][2][1][3]=-1.0 # GC/AG
+ dH[3][3][1][2]=-0.7e3; dS[3][3][1][2]=-2.3 # GG/AC
+ dH[3][3][2][1]=0.5e3; dS[3][3][2][1]=3.2 # GG/CA
+ dH[3][4][1][1]=-0.6e3; dS[3][4][1][1]=-2.3 # GT/AA
+ dH[4][1][1][3]=0.7e3; dS[4][1][1][3]=0.7 # TA/AG
+ dH[4][3][1][1]=3.0e3; dS[4][3][1][1]=7.4 # TG/AA
+
+ # G-T Mismatches
+ dH[1][3][4][4]=1.0e3; dS[1][3][4][4]=0.9 # AG/TT
+ dH[1][4][4][3]=-2.5e3; dS[1][4][4][3]=-8.3 # AT/TG
+ dH[2][3][3][4]=-4.1e3; dS[2][3][3][4]=-11.7 # CG/GT
+ dH[2][4][3][3]=-2.8e3; dS[2][4][3][3]=-8.0 # CT/GG
+ dH[3][1][4][4]=-1.3e3; dS[3][1][4][4]=-5.3 # GA/TT
+ dH[3][2][4][3]=-4.4e3; dS[3][2][4][3]=-12.3 # GC/TG
+ dH[3][3][2][4]=3.3e3; dS[3][3][2][4]=10.4 # GG/CT
+ dH[3][3][4][2]=-2.8e3; dS[3][3][4][2]=-8.0 # GG/TC
+# dH[3][3][4][4]=5.8e3; dS[3][3][4][4]=16.3 # GG/TT
+ dH[3][4][2][3]=-4.4e3; dS[3][4][2][3]=-12.3 # GT/CG
+ dH[3][4][4][1]=-2.5e3; dS[3][4][4][1]=-8.3 # GT/TA
+# dH[3][4][4][3]=4.1e3; dS[3][4][4][3]=9.5 # GT/TG
+ dH[4][1][3][4]=-0.1e3; dS[4][1][3][4]=-1.7 # TA/GT
+ dH[4][2][3][3]=3.3e3; dS[4][2][3][3]=10.4 # TC/GG
+ dH[4][3][1][4]=-0.1e3; dS[4][3][1][4]=-1.7 # TG/AT
+ dH[4][3][3][2]=-4.1e3; dS[4][3][3][2]=-11.7 # TG/GC
+# dH[4][3][3][4]=-1.4e3; dS[4][3][3][4]=-6.2 # TG/GT
+ dH[4][4][1][3]=-1.3e3; dS[4][4][1][3]=-5.3 # TT/AG
+ dH[4][4][3][1]=1.0e3; dS[4][4][3][1]=0.9 # TT/GA
+# dH[4][4][3][3]=5.8e3; dS[4][4][3][3]=16.3 # TT/GG
+
+ # A-A Mismatches
+ dH[1][1][1][4]=4.7e3; dS[1][1][1][4]=12.9 # AA/AT
+ dH[1][1][4][1]=1.2e3; dS[1][1][4][1]=1.7 # AA/TA
+ dH[1][2][1][3]=-2.9e3; dS[1][2][1][3]=-9.8 # AC/AG
+ dH[1][3][1][2]=-0.9e3; dS[1][3][1][2]=-4.2 # AG/AC
+ dH[1][4][1][1]=1.2e3; dS[1][4][1][1]=1.7 # AT/AA
+ dH[2][1][3][1]=-0.9e3; dS[2][1][3][1]=-4.2 # CA/GA
+ dH[3][1][2][1]=-2.9e3; dS[3][1][2][1]=-9.8 # GA/CA
+ dH[4][1][1][1]=4.7e3; dS[4][1][1][1]=12.9 # TA/AA
+
+ # C-C Mismatches
+ dH[1][2][4][2]=0.0e3; dS[1][2][4][2]=-4.4 # AC/TC
+ dH[2][1][2][4]=6.1e3; dS[2][1][2][4]=16.4 # CA/CT
+ dH[2][2][2][3]=3.6e3; dS[2][2][2][3]=8.9 # CC/CG
+ dH[2][2][3][2]=-1.5e3; dS[2][2][3][2]=-7.2 # CC/GC
+ dH[2][3][2][2]=-1.5e3; dS[2][3][2][2]=-7.2 # CG/CC
+ dH[2][4][2][1]=0.0e3; dS[2][4][2][1]=-4.4 # CT/CA
+ dH[3][2][2][2]=3.6e3; dS[3][2][2][2]=8.9 # GC/CC
+ dH[4][2][1][2]=6.1e3; dS[4][2][1][2]=16.4 # TC/AC
+
+ # G-G Mismatches
+ dH[1][3][4][3]=-3.1e3; dS[1][3][4][3]=-9.5 # AG/TG
+ dH[2][3][3][3]=-4.9e3; dS[2][3][3][3]=-15.3 # CG/GG
+ dH[3][1][3][4]=1.6e3; dS[3][1][3][4]=3.6 # GA/GT
+ dH[3][2][3][3]=-6.0e3; dS[3][2][3][3]=-15.8 # GC/GG
+ dH[3][3][2][3]=-6.0e3; dS[3][3][2][3]=-15.8 # GG/CG
+ dH[3][3][3][2]=-4.9e3; dS[3][3][3][2]=-15.3 # GG/GC
+ dH[3][4][3][1]=-3.1e3; dS[3][4][3][1]=-9.5 # GT/GA
+ dH[4][3][1][3]=1.6e3; dS[4][3][1][3]=3.6 # TG/AG
+
+ # T-T Mismatches
+ dH[1][4][4][4]=-2.7e3; dS[1][4][4][4]=-10.8 # AT/TT
+ dH[2][4][3][4]=-5.0e3; dS[2][4][3][4]=-15.8 # CT/GT
+ dH[3][4][2][4]=-2.2e3; dS[3][4][2][4]=-8.4 # GT/CT
+ dH[4][1][4][4]=0.2e3; dS[4][1][4][4]=-1.5 # TA/TT
+ dH[4][2][4][3]=-2.2e3; dS[4][2][4][3]=-8.4 # TC/TG
+ dH[4][3][4][2]=-5.0e3; dS[4][3][4][2]=-15.8 # TG/TC
+ dH[4][4][1][4]=0.2e3; dS[4][4][1][4]=-1.5 # TT/AT
+ dH[4][4][4][1]=-2.7e3; dS[4][4][4][1]=-10.8 # TT/TA
+
+    # Dangling Ends
+ dH[5][1][1][4]=-0.7e3; dS[5][1][1][4]=-0.8 # $A/AT
+ dH[5][1][2][4]=4.4e3; dS[5][1][2][4]=14.9 # $A/CT
+ dH[5][1][3][4]=-1.6e3; dS[5][1][3][4]=-3.6 # $A/GT
+ dH[5][1][4][4]=2.9e3; dS[5][1][4][4]=10.4 # $A/TT
+ dH[5][2][1][3]=-2.1e3; dS[5][2][1][3]=-3.9 # $C/AG
+ dH[5][2][2][3]=-0.2e3; dS[5][2][2][3]=-0.1 # $C/CG
+ dH[5][2][3][3]=-3.9e3; dS[5][2][3][3]=-11.2 # $C/GG
+ dH[5][2][4][3]=-4.4e3; dS[5][2][4][3]=-13.1 # $C/TG
+ dH[5][3][1][2]=-5.9e3; dS[5][3][1][2]=-16.5 # $G/AC
+ dH[5][3][2][2]=-2.6e3; dS[5][3][2][2]=-7.4 # $G/CC
+ dH[5][3][3][2]=-3.2e3; dS[5][3][3][2]=-10.4 # $G/GC
+ dH[5][3][4][2]=-5.2e3; dS[5][3][4][2]=-15.0 # $G/TC
+ dH[5][4][1][1]=-0.5e3; dS[5][4][1][1]=-1.1 # $T/AA
+ dH[5][4][2][1]=4.7e3; dS[5][4][2][1]=14.2 # $T/CA
+ dH[5][4][3][1]=-4.1e3; dS[5][4][3][1]=-13.1 # $T/GA
+ dH[5][4][4][1]=-3.8e3; dS[5][4][4][1]=-12.6 # $T/TA
+ dH[1][5][4][1]=-2.9e3; dS[1][5][4][1]=-7.6 # A$/TA
+ dH[1][5][4][2]=-4.1e3; dS[1][5][4][2]=-13.0 # A$/TC
+ dH[1][5][4][3]=-4.2e3; dS[1][5][4][3]=-15.0 # A$/TG
+ dH[1][5][4][4]=-0.2e3; dS[1][5][4][4]=-0.5 # A$/TT
+ dH[1][1][5][4]=0.2e3; dS[1][1][5][4]=2.3 # AA/$T
+ dH[1][1][4][5]=-0.5e3; dS[1][1][4][5]=-1.1 # AA/T$
+ dH[1][2][5][3]=-6.3e3; dS[1][2][5][3]=-17.1 # AC/$G
+ dH[1][2][4][5]=4.7e3; dS[1][2][4][5]=14.2 # AC/T$
+ dH[1][3][5][2]=-3.7e3; dS[1][3][5][2]=-10.0 # AG/$C
+ dH[1][3][4][5]=-4.1e3; dS[1][3][4][5]=-13.1 # AG/T$
+ dH[1][4][5][1]=-2.9e3; dS[1][4][5][1]=-7.6 # AT/$A
+ dH[1][4][4][5]=-3.8e3; dS[1][4][4][5]=-12.6 # AT/T$
+ dH[2][5][3][1]=-3.7e3; dS[2][5][3][1]=-10.0 # C$/GA
+ dH[2][5][3][2]=-4.0e3; dS[2][5][3][2]=-11.9 # C$/GC
+ dH[2][5][3][3]=-3.9e3; dS[2][5][3][3]=-10.9 # C$/GG
+ dH[2][5][3][4]=-4.9e3; dS[2][5][3][4]=-13.8 # C$/GT
+ dH[2][1][5][4]=0.6e3; dS[2][1][5][4]=3.3 # CA/$T
+ dH[2][1][3][5]=-5.9e3; dS[2][1][3][5]=-16.5 # CA/G$
+ dH[2][2][5][3]=-4.4e3; dS[2][2][5][3]=-12.6 # CC/$G
+ dH[2][2][3][5]=-2.6e3; dS[2][2][3][5]=-7.4 # CC/G$
+ dH[2][3][5][2]=-4.0e3; dS[2][3][5][2]=-11.9 # CG/$C
+ dH[2][3][3][5]=-3.2e3; dS[2][3][3][5]=-10.4 # CG/G$
+ dH[2][4][5][1]=-4.1e3; dS[2][4][5][1]=-13.0 # CT/$A
+ dH[2][4][3][5]=-5.2e3; dS[2][4][3][5]=-15.0 # CT/G$
+ dH[3][5][2][1]=-6.3e3; dS[3][5][2][1]=-17.1 # G$/CA
+ dH[3][5][2][2]=-4.4e3; dS[3][5][2][2]=-12.6 # G$/CC
+ dH[3][5][2][3]=-5.1e3; dS[3][5][2][3]=-14.0 # G$/CG
+ dH[3][5][2][4]=-4.0e3; dS[3][5][2][4]=-10.9 # G$/CT
+ dH[3][1][5][4]=-1.1e3; dS[3][1][5][4]=-1.6 # GA/$T
+ dH[3][1][2][5]=-2.1e3; dS[3][1][2][5]=-3.9 # GA/C$
+ dH[3][2][5][3]=-5.1e3; dS[3][2][5][3]=-14.0 # GC/$G
+ dH[3][2][2][5]=-0.2e3; dS[3][2][2][5]=-0.1 # GC/C$
+ dH[3][3][5][2]=-3.9e3; dS[3][3][5][2]=-10.9 # GG/$C
+ dH[3][3][2][5]=-3.9e3; dS[3][3][2][5]=-11.2 # GG/C$
+ dH[3][4][5][1]=-4.2e3; dS[3][4][5][1]=-15.0 # GT/$A
+ dH[3][4][2][5]=-4.4e3; dS[3][4][2][5]=-13.1 # GT/C$
+ dH[4][5][1][1]=0.2e3; dS[4][5][1][1]=2.3 # T$/AA
+ dH[4][5][1][2]=0.6e3; dS[4][5][1][2]=3.3 # T$/AC
+ dH[4][5][1][3]=-1.1e3; dS[4][5][1][3]=-1.6 # T$/AG
+ dH[4][5][1][4]=-6.9e3; dS[4][5][1][4]=-20.0 # T$/AT
+ dH[4][1][5][4]=-6.9e3; dS[4][1][5][4]=-20.0 # TA/$T
+ dH[4][1][1][5]=-0.7e3; dS[4][1][1][5]=-0.7 # TA/A$
+ dH[4][2][5][3]=-4.0e3; dS[4][2][5][3]=-10.9 # TC/$G
+ dH[4][2][1][5]=4.4e3; dS[4][2][1][5]=14.9 # TC/A$
+ dH[4][3][5][2]=-4.9e3; dS[4][3][5][2]=-13.8 # TG/$C
+ dH[4][3][1][5]=-1.6e3; dS[4][3][1][5]=-3.6 # TG/A$
+ dH[4][4][5][1]=-0.2e3; dS[4][4][5][1]=-0.5 # TT/$A
+ dH[4][4][1][5]=2.9e3; dS[4][4][1][5]=10.4 # TT/A$
+
+
+ nparm['dH']=dH
+ nparm['dS']=dS
+
+ return nparm
+
+
+defaultParm=initParams(DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,DEF_SALT, SALT_METHOD_SANTALUCIA)
+
+def seqencoder(seq):
+ return [bpencoder[x] for x in seq]
+
+def getInitialEntropy(nparm=defaultParm):
+ return -5.9+nparm['rlogc']
+
+def getEnthalpy(x0, x1, y0, y1,nparm=defaultParm):
+ return nparm['dH'][x0][x1][y0][y1]
+
+def GetEntropy(x0, x1, y0, y1,nparm=defaultParm):
+
+ nx0=x0
+ nx1=x1
+ ny0=y0
+ ny1=y1
+ dH=nparm['dH']
+ dS=nparm['dS']
+ answer = dS[nx0][nx1][ny0][ny1]
+
+ if (nparm['saltMethod'] == SALT_METHOD_SANTALUCIA):
+ if(nx0!=5 and 1<= nx1 and nx1<=4):
+ answer += 0.5*nparm['kfac']
+
+ if(ny1!=5 and 1<= ny0 and ny0<=4):
+ answer += 0.5*nparm['kfac']
+
+ if (nparm['saltMethod'] == SALT_METHOD_OWCZARZY):
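+        # note: the Owczarzy salt correction expects nparm['gcContent']
+        # to have been set by the caller; initParams does not define it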
+ logk = log(nparm['kplus']);
+ answer += dH[nx0][nx1][ny0][ny1]*((4.29 * nparm['gcContent']-3.95)* 1e-5 * logk + 0.0000094*logk**2);
+
+ return answer;
+
+def CalcTM(entropy,enthalpy):
+ tm = 0
+ if (enthalpy>=forbidden_enthalpy) :
+ return 0;
+
+ if (entropy<0) :
+ tm = enthalpy/entropy
+ if (tm<0):
+ return 0;
+
+ return tm;
+
+
+
+
+def countGCContent(seq):
+ count = 0;
+ for k in seq :
+ if k in 'cgGC':
+ count+=1;
+ return count;
+
+
+def cleanSeq(inseq, length=0):
+    '''
+    Keep only A/C/G/T characters (upper or lower case, or their 0-3
+    byte encodings) from C{inseq} and return them upper-cased. A
+    non-zero C{length} limits how much of C{inseq} is read.
+    '''
+    seqlen = length if length else len(inseq)
+    mapping = {'a':'A', '\0':'A', 'A':'A',
+               'c':'C', '\1':'C', 'C':'C',
+               'g':'G', '\2':'G', 'G':'G',
+               't':'T', '\3':'T', 'T':'T'}
+    return ''.join(mapping[c] for c in inseq[:seqlen] if c in mapping)
+
+def calcSelfTM(seq,nparm=defaultParm):
+ dH=nparm['dH']
+ dS=nparm['dS']
+ length=len(seq)
+
+ thedH = 0;
+ thedS = -5.9+nparm['rlogc']
+ for i in xrange(1,length):
+ c1 = rvencoder[seq[i-1]];
+ c2 = rvencoder[seq[i]];
+ c3 = bpencoder[seq[i-1]];
+ c4 = bpencoder[seq[i]];
+
+ thedH += dH[c3][c4][c1][c2];
+ thedS += GetEntropy(c3, c4, c1, c2, nparm)
+
+ mtemp = CalcTM(thedS,thedH);
+# print thedH,thedS,nparm['rlogc']
+ return mtemp-273.15;
+
+
+def calcTMTwoSeq(seq1,seq2,nparm=defaultParm):
+
+ thedH = 0;
+ thedS = -5.9+nparm['rlogc']
+ dH=nparm['dH']
+ dS=nparm['dS']
+ length=len(seq1)
+
+ for i in xrange(1,length):
+ c1 = rvencoder[seq2[i-1]]
+ c2 = rvencoder[seq2[i]]
+ c3 = bpencoder[seq1[i-1]]
+ c4 = bpencoder[seq1[i]]
+
+ thedH += dH[c3][c4][c1][c2]
+ thedS += GetEntropy(c3, c4, c1, c2, nparm)
+
+ mtemp = CalcTM(thedS,thedH);
+# print thedH,thedS,nparm['rlogc']
+
+ return mtemp-273.15;
+
+
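+# A hedged usage sketch of the melting-temperature helpers above, using
+# the default parameter set (SantaLucia salt correction). In
+# calcTMTwoSeq the second strand is given in the same alignment
+# direction as the first, so identical strings model a perfect duplex.
+#
+#   print calcSelfTM('ACGTACGTACGTACGTAC')     # Tm in Celsius
+#   print calcTMTwoSeq('ACGTACGT', 'ACGAACGT') # one mismatch at position 4
+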
diff --git a/obitools/tools/__init__.py b/obitools/tools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/tools/_solexapairend.so b/obitools/tools/_solexapairend.so
new file mode 100755
index 0000000..2d9e075
Binary files /dev/null and b/obitools/tools/_solexapairend.so differ
diff --git a/obitools/tools/solexapairend.py b/obitools/tools/solexapairend.py
new file mode 100644
index 0000000..609f533
--- /dev/null
+++ b/obitools/tools/solexapairend.py
@@ -0,0 +1,51 @@
+'''
+Created on 17 mai 2010
+
+@author: coissac
+'''
+
+from obitools.alignment import columnIterator
+
+
+def iterOnAligment(ali):
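+    '''
+    Iterate over the columns of a pair-end alignment, yielding
+    (nuc0, score0, nuc1, score1) tuples. Scores are read from the
+    wrapped read qualities: a gap inside a read scores the mean of the
+    two flanking qualities, while gaps outside the aligned part of a
+    read get a constant score.
+    '''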
+ pos0=0
+ pos1=len(ali[1].wrapped)-1
+ begin0=False
+ end0=False
+ begin1=False
+ end1=False
+ for nuc0,nuc1 in columnIterator(ali):
+ if nuc0=='-':
+ if begin0:
+ if not end0:
+ score0 = ( ali[0].wrapped.quality[pos0-1]
+ +ali[0].wrapped.quality[pos0]
+ )/2
+ else:
+ score0 = 1.
+ else:
+ score0 = 0.
+ else:
+ begin0=True
+ score0 = ali[0].wrapped.quality[pos0]
+ pos0+=1
+ end0= pos0==len(ali[0].wrapped)
+
+ if nuc1=='-':
+ if begin1:
+ if not end1:
+ score1 = ( ali[1].wrapped.wrapped.quality[pos1]
+ +ali[1].wrapped.wrapped.quality[pos1+1]
+ )/2
+ else:
+ score1 = 0.
+ else:
+ score1 = 1.
+ else:
+ begin1=True
+ score1 = ali[1].wrapped.wrapped.quality[pos1]
+ pos1-=1
+ end1=pos1<0
+
+ result = (nuc0,score0,nuc1,score1)
+ yield result
diff --git a/obitools/tree/__init__.py b/obitools/tree/__init__.py
new file mode 100644
index 0000000..facb5ff
--- /dev/null
+++ b/obitools/tree/__init__.py
@@ -0,0 +1,116 @@
+import re
+
+
+class Tree(set):
+ def registerNode(self,node):
+ assert isinstance(node, TreeNode)
+ self.add(node)
+
+ def childNodeIterator(self,node):
+ assert isinstance(node, TreeNode)
+ return (x for x in self if x._parent==node)
+
+ def subTreeSize(self,node):
+ n=1
+ for subnode in self.childNodeIterator(node):
+ n+=self.subTreeSize(subnode)
+ return n
+
+ def getRoot(self):
+ roots = [x for x in self if x._parent is None]
+        assert len(roots)==1,'Tree cannot have several root nodes'
+ return roots[0]
+
+ def ancestorNodeIterator(self,node):
+ assert isinstance(node, TreeNode)
+ while node._parent is not None:
+ yield node
+ node=node._parent
+ yield node
+
+ def terminalNodeIterator(self):
+ return (x for x in self if x._isterminal)
+
+ def commonAncestor(self,node1,node2):
+ anc1 = set(x for x in self.ancestorNodeIterator(node1))
+ rep = [x for x in self.ancestorNodeIterator(node2)
+ if x in anc1]
+ assert len(rep)>=1
+ return rep[0]
+
+ def getDist(self,node1,node2):
+ ca = self.commonAncestor(node1, node2)
+ dist = 0
+ while node1 != ca:
+ dist+=node1._dist
+ node1=node1._parent
+ while node2 != ca:
+ dist+=node2._dist
+ node2=node2._parent
+ return dist
+
+ def farestNodes(self):
+ dmax=0
+ n1=None
+ n2=None
+ for node1 in self.terminalNodeIterator():
+ for node2 in self.terminalNodeIterator():
+ d = self.getDist(node1, node2)
+ if d > dmax:
+ dmax = d
+ n1=node1
+ n2=node2
+        return n1,n2,dmax
+
+ def setRoot(self,node,dist):
+ assert node in self
+ assert node._parent and node._dist > dist
+
+ newroot = TreeNode(self)
+ parent = node._parent
+ node._parent = newroot
+ compdist = node._dist - dist
+ node._dist=dist
+ node = parent
+
+ while node:
+ parent = node._parent
+ if parent:
+ dist = node._dist
+
+ node._parent = newroot
+ node._dist = compdist
+
+ newroot = node
+ node = parent
+
+ if node:
+ compdist=dist
+
+ for child in self.childNodeIterator(newroot):
+ child._parent = newroot._parent
+ child._dist += newroot._dist
+
+ self.remove(newroot)
+
+
+class TreeNode(object):
+ def __init__(self,tree,name=None,dist=None,bootstrap=None,**info):
+ self._parent=None
+ self._name=name
+ self._dist=dist
+ self._bootstrap=bootstrap
+ self._info=info
+ tree.registerNode(self)
+ self._isterminal=True
+
+
+ def linkToParent(self,parent):
+ assert isinstance(parent, TreeNode) or parent is None
+ self._parent=parent
+ if parent is not None:
+ parent._isterminal=False
+
+
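+# A hedged sketch: build a three-node tree by hand and query it.
+#
+#   t = Tree()
+#   root = TreeNode(t, 'root')
+#   a = TreeNode(t, 'A', dist=1.0)
+#   a.linkToParent(root)
+#   b = TreeNode(t, 'B', dist=2.0)
+#   b.linkToParent(root)
+#   t.subTreeSize(root)   # 3
+#   t.getDist(a, b)       # 3.0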
+
+
diff --git a/obitools/tree/dot.py b/obitools/tree/dot.py
new file mode 100644
index 0000000..a21c4a1
--- /dev/null
+++ b/obitools/tree/dot.py
@@ -0,0 +1,18 @@
+
+from obitools.utils import universalOpen
+from obitools.tree import Tree,TreeNode
+
+def nodeWriter(tree,node,nodes):
+ data=[]
+ if node._parent:
+ data.append('%d -> %d ' % (nodes[node],nodes[node._parent]))
+ return "\n".join(data)
+
+
+def treeWriter(tree):
+ nodes=dict(map(None,tree,xrange(len(tree))))
+ code=[]
+ for node in tree:
+ code.append(nodeWriter(tree,node,nodes))
+ code = "\n".join(code)
+ return 'digraph tree { node [shape=point]\n%s\n};' % code
\ No newline at end of file
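+
+# Hedged example: render any obitools Tree as a Graphviz digraph of
+# points, one edge per parent link (treeParser comes from the newick
+# module defined alongside this one).
+#
+#   from obitools.tree.newick import treeParser
+#   print treeWriter(treeParser('((A:1,B:1):1,C:2)'))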
diff --git a/obitools/tree/layout.py b/obitools/tree/layout.py
new file mode 100644
index 0000000..a39ba77
--- /dev/null
+++ b/obitools/tree/layout.py
@@ -0,0 +1,103 @@
+
+class NodeLayout(dict):
+ '''
+ Layout data associated to a tree node.
+ '''
+ pass
+
+class TreeLayout(dict):
+    '''
+    Description of a phylogenetic tree layout.
+    '''
+ def addNode(self,node):
+ self[node]=NodeLayout()
+
+ def setAttribute(self,node,key,value):
+ self[node][key]=value
+
+ def hasAttribute(self,node,key):
+ return key in self[node]
+
+ def getAttribute(self,node,key,default=None):
+ return self[node].get(key,default)
+
+    def setNodesColor(self,color,predicate=True):
+        '''
+        Set the 'color' attribute of the nodes matched by C{predicate}.
+
+        @param color: a color value, or a callable returning one
+                      when applied to a node
+        @param predicate: a boolean, or a callable returning one
+                          when applied to a node
+        '''
+        for node in self:
+            if callable(predicate):
+                change = predicate(node)
+            else:
+                change = predicate
+
+            if change:
+                if callable(color):
+                    c = color(node)
+                else:
+                    c = color
+                self.setAttribute(node, 'color', c)
+
+ def setCircular(self,iscircularpredicat):
+ for node in self:
+ if callable(iscircularpredicat):
+ change = iscircularpredicat(node)
+ else:
+ change = iscircularpredicat
+
+ if change:
+ self.setAttribute(node, 'shape', 'circle')
+ else:
+ self.setAttribute(node, 'shape', 'square')
+
+    def setRadius(self,radius,predicate=True):
+        for node in self:
+            if callable(predicate):
+                change = predicate(node)
+            else:
+                change = predicate
+
+            if change:
+                if callable(radius):
+                    r = radius(node)
+                else:
+                    r = radius
+                self.setAttribute(node, 'radius', r)
+
+def predicatGeneratorIsInfoEqual(info,value):
+ def isInfoEqual(node):
+ data = node._info
+ return data is not None and info in data and data[info]==value
+
+ return isInfoEqual
+
+def isTerminalNode(node):
+ return node._isterminal
+
+def constantColorGenerator(color):
+ def colorMaker(node):
+ return color
+
+ return colorMaker
+
+
+def notPredicatGenerator(predicate):
+    def notpred(x):
+        return not predicate(x)
+    return notpred
+
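+# A hedged sketch: color terminal nodes red and draw them as circles
+# ('tree' stands for any obitools.tree.Tree instance).
+#
+#   layout = TreeLayout()
+#   for node in tree:
+#       layout.addNode(node)
+#   layout.setCircular(isTerminalNode)
+#   layout.setNodesColor((255,0,0), predicate=isTerminalNode)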
+
+
+
+
\ No newline at end of file
diff --git a/obitools/tree/newick.py b/obitools/tree/newick.py
new file mode 100644
index 0000000..c69d0d3
--- /dev/null
+++ b/obitools/tree/newick.py
@@ -0,0 +1,117 @@
+import re
+import sys
+
+from obitools.utils import universalOpen
+from obitools.tree import Tree,TreeNode
+
+def subNodeIterator(data):
+ level=0
+ start = 1
+ if data[0]=='(':
+ for i in xrange(1,len(data)):
+ c=data[i]
+ if c=='(':
+ level+=1
+ elif c==')':
+ level-=1
+ if c==',' and not level:
+ yield data[start:i]
+ start = i+1
+ yield data[start:i]
+ else:
+ yield data
+
+
+_nodeParser=re.compile('\s*(?P<subnodes>\(.*\))?(?P<name>[^ :]+)? *(?P<bootstrap>[0-9.]+)?(:(?P<distance>-?[0-9.]+))?')
+
+def nodeParser(data):
+ parsedNode = _nodeParser.match(data).groupdict(0)
+ if not parsedNode['name']:
+ parsedNode['name']=None
+
+ if not parsedNode['bootstrap']:
+ parsedNode['bootstrap']=None
+ else:
+ parsedNode['bootstrap']=float(parsedNode['bootstrap'])
+
+ if not parsedNode['distance']:
+ parsedNode['distance']=None
+ else:
+ parsedNode['distance']=float(parsedNode['distance'])
+
+ if not parsedNode['subnodes']:
+ parsedNode['subnodes']=None
+
+ return parsedNode
+
+_cleanTreeData=re.compile('\s+')
+
+def treeParser(data,tree=None,parent=None):
+ if tree is None:
+ tree = Tree()
+ data = _cleanTreeData.sub(' ',data).strip()
+
+ parsedNode = nodeParser(data)
+ node = TreeNode(tree,
+ parsedNode['name'],
+ parsedNode['distance'],
+ parsedNode['bootstrap'])
+
+ node.linkToParent(parent)
+
+ if parsedNode['subnodes']:
+ for subnode in subNodeIterator(parsedNode['subnodes']):
+ treeParser(subnode,tree,node)
+ return tree
+
+_treecomment=re.compile('\[.*\]')
+
+def treeIterator(file):
+ file = universalOpen(file)
+ data = file.read()
+
+ comment = _treecomment.findall(data)
+ data=_treecomment.sub('',data).strip()
+
+ if comment:
+ comment=comment[0]
+ else:
+ comment=None
+    for tree in data.split(';'):
+        if not tree.strip():
+            continue
+        t = treeParser(tree)
+        if comment:
+            t.comment=comment
+        yield t
+
+def nodeWriter(tree,node,deep=0):
+ name = node._name
+ if name is None:
+ name=''
+
+ distance=node._dist
+ if distance is None:
+ distance=''
+ else:
+ distance = ':%6.5f' % distance
+
+ bootstrap=node._bootstrap
+ if bootstrap is None:
+ bootstrap=''
+ else:
+ bootstrap=' %d' % int(bootstrap)
+
+ nodeseparator = ',\n' + ' ' * (deep+1)
+
+ subnodes = nodeseparator.join([nodeWriter(tree, x, deep+1)
+ for x in tree.childNodeIterator(node)])
+ if subnodes:
+ subnodes='(\n' + ' ' * (deep+1) + subnodes + '\n' + ' ' * deep + ')'
+
+ return '%s%s%s%s' % (subnodes,name,bootstrap,distance)
+
+def treeWriter(tree,startnode=None):
+ if startnode is not None:
+ root=startnode
+ else:
+ root = tree.getRoot()
+ return nodeWriter(tree,root)+';'
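+
+if __name__ == '__main__':
+    # A minimal round-trip sketch (the tree string is a made-up example):
+    # parse one newick tree and serialise it back with treeWriter.
+    t = treeParser('((a:0.10000,b:0.20000) 75:0.05000,c:0.30000)')
+    print treeWriter(t)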
diff --git a/obitools/tree/svg.py b/obitools/tree/svg.py
new file mode 100644
index 0000000..ff51a8c
--- /dev/null
+++ b/obitools/tree/svg.py
@@ -0,0 +1,70 @@
+import math
+
+from obitools.svg import Scene,Circle,Line,Rectangle,Text
+from obitools.tree import Tree
+
+def displayTreeLayout(layout,width=400,height=400,radius=3,scale=1.0):
+ '''
+    Convert a tree layout object into an SVG document.
+
+ @param layout: the tree layout object
+ @type layout: obitools.tree.layout.TreeLayout
+ @param width: svg document width
+ @type width: int
+ @param height: svg document height
+ @type height: int
+ @param radius: default radius of node in svg unit (default 3)
+ @type radius: int
+ @param scale: scale factor applied to the svg coordinates (default 1.0)
+ @type scale: float
+
+ @return: str containing svg code
+ '''
+ xmin = min(layout.getAttribute(n,'x') for n in layout)
+ xmax = max(layout.getAttribute(n,'x') for n in layout)
+ ymin = min(layout.getAttribute(n,'y') for n in layout)
+ ymax = max(layout.getAttribute(n,'y') for n in layout)
+
+ dx = xmax - xmin
+ dy = ymax - ymin
+
+ xscale = width * 0.95 / dx * scale
+ yscale = height * 0.95 / dy * scale
+
+ def X(x):
+ return (x - xmin ) * xscale + width * 0.025
+
+ def Y(y):
+ return (y - ymin ) * yscale + height * 0.025
+
+ scene = Scene('unrooted', height, width)
+
+ for n in layout:
+ if n._parent is not None:
+ parent = n._parent
+ xf = layout.getAttribute(n,'x')
+ yf = layout.getAttribute(n,'y')
+ xp = layout.getAttribute(parent,'x')
+ yp = layout.getAttribute(parent,'y')
+ scene.add(Line((X(xf),Y(yf)),(X(xp),Y(yp))))
+
+ for n in layout:
+ xf = layout.getAttribute(n,'x')
+ yf = layout.getAttribute(n,'y')
+ cf = layout.getAttribute(n,'color')
+ sf = layout.getAttribute(n,'shape')
+ if layout.hasAttribute(n,'radius'):
+ rf=layout.getAttribute(n,'radius')
+ else:
+ rf=radius
+
+ if sf=='circle':
+ scene.add(Circle((X(xf),Y(yf)),rf,cf))
+ else:
+ scene.add(Rectangle((X(xf)-rf,Y(yf)-rf),2*rf,2*rf,cf))
+
+
+ return ''.join(scene.strarray())
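+
+if __name__ == '__main__':
+    # A usage sketch relying on the obitools.tree.newick and
+    # obitools.tree.unrooted modules of this package (the tree string is
+    # a made-up example): compute an unrooted layout for a small tree and
+    # print the resulting svg code.
+    from obitools.tree.newick import treeParser
+    from obitools.tree.unrooted import treeLayout
+    tree = treeParser('((a:0.1,b:0.2):0.05,c:0.3)')
+    print displayTreeLayout(treeLayout(tree))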
+
+
+
\ No newline at end of file
diff --git a/obitools/tree/unrooted.py b/obitools/tree/unrooted.py
new file mode 100644
index 0000000..9a9f3e6
--- /dev/null
+++ b/obitools/tree/unrooted.py
@@ -0,0 +1,33 @@
+from obitools.tree.layout import TreeLayout
+import math
+
+def subtreeLayout(tree,node,layout,start,end,x,y,default):
+ nbotu = tree.subTreeSize(node)
+ delta = (end-start)/(nbotu+1)
+
+ layout.addNode(node)
+ layout.setAttribute(node,'x',x)
+ layout.setAttribute(node,'y',y)
+ layout.setAttribute(node,'color',(255,0,0))
+ layout.setAttribute(node,'shape','circle')
+
+ for subnode in tree.childNodeIterator(node):
+ snbotu = tree.subTreeSize(subnode)
+ end = start + snbotu * delta
+ med = start + snbotu * delta /2
+ r = subnode._dist
+ if r is None or r <=0:
+ r=default
+ subx=math.cos(med) * r + x
+ suby=math.sin(med) * r + y
+ subtreeLayout(tree, subnode, layout, start, end, subx, suby, default)
+ start=end
+
+ return layout
+
+def treeLayout(tree):
+ layout = TreeLayout()
+ root = tree.getRoot()
+ dmin = min(n._dist for n in tree if n._dist is not None and n._dist > 0)
+ return subtreeLayout(tree,root,layout,0,2*math.pi,0,0,dmin / 100)
+
\ No newline at end of file
diff --git a/obitools/unit/__init__.py b/obitools/unit/__init__.py
new file mode 100644
index 0000000..d02c812
--- /dev/null
+++ b/obitools/unit/__init__.py
@@ -0,0 +1,8 @@
+import unittest
+
+from obitools import tests_group as obitools_tests_group
+
+tests_group=obitools_tests_group
+
+
+
diff --git a/obitools/unit/obitools/__init__.py b/obitools/unit/obitools/__init__.py
new file mode 100644
index 0000000..ab1bcec
--- /dev/null
+++ b/obitools/unit/obitools/__init__.py
@@ -0,0 +1,89 @@
+import unittest
+
+import obitools
+
+class BioseqTest(unittest.TestCase):
+
+ sequenceId = 'id1'
+ sequenceDefinition = 'sequence definition'
+ sequenceQualifier = {'extra':3}
+
+ def setUp(self):
+ self.bioseq = self.bioseqClass(self.sequenceId,
+ self.sequenceString,
+ self.sequenceDefinition,
+ **self.sequenceQualifier)
+
+ title = self.__doc__.strip()
+ underline = "=" * len(title)
+
+ #print "%s\n%s" % (title,underline)
+
+ def tearDown(self):
+ pass
+ #print "\n"
+
+ def testIdAttribute(self):
+ '''
+ test if id attribute exists
+ '''
+ self.failUnless(hasattr(self.bioseq, 'id'), 'id missing attribute')
+
+ def testIdValue(self):
+ '''
+ test if id attribute value is 'id1'
+ '''
+        self.failUnlessEqual(self.bioseq.id, 'id1',
+                             'identifier is not set to the expected value')
+
+ def testDefinitionAttribute(self):
+ '''
+ test if definition attribute exists
+ '''
+ self.failUnless(hasattr(self.bioseq, 'definition'), 'definition missing attribute')
+
+ def testSequenceIsLowerCase(self):
+ '''
+ test if sequence is stored as lower case letter
+ '''
+ self.failUnlessEqual(str(self.bioseq),
+ str(self.bioseq).lower(),
+ "Sequence is not stored as lower case string")
+
+ def testSequenceQualifier(self):
+ '''
+ test if the extra qualifier is present and its value is three.
+ '''
+ self.failUnlessEqual(self.bioseq['extra'],
+ 3,
+ "Sequence qualifier cannot be successfully retrieve")
+
+ def testCreateSequenceQualifier(self):
+ self.bioseq['testqualifier']='ok'
+ self.failUnlessEqual(self.bioseq['testqualifier'],
+ 'ok',
+ "Sequence qualifier cannot be successfully created")
+
+
+
+class NucBioseqTest(BioseqTest):
+ '''
+ Test obitools.NucSequence class
+ '''
+
+ bioseqClass = obitools.NucSequence
+ sequenceString = 'AACGT' * 5
+
+
+class AABioseqTest(BioseqTest):
+ '''
+ Test obitools.AASequence class
+ '''
+
+ bioseqClass = obitools.AASequence
+ sequenceString = 'MLKCVT' * 5
+
+
+
+
+tests_group = [NucBioseqTest,AABioseqTest]
\ No newline at end of file
diff --git a/obitools/utils/__init__.py b/obitools/utils/__init__.py
new file mode 100644
index 0000000..fd7076f
--- /dev/null
+++ b/obitools/utils/__init__.py
@@ -0,0 +1,324 @@
+import sys
+
+import time
+import re
+import shelve
+
+from threading import Lock
+from logging import warning
+import urllib2
+
+from obitools.gzip import GzipFile
+from obitools.zipfile import ZipFile
+import os.path
+
+
+class FileFormatError(Exception):
+ pass
+
+
+
+def universalOpen(file,*options):
+    '''
+    Open a file, gzipped or not.
+
+    If file is a C{str} instance, it is considered
+    as a file name. In this case the C{.gz} suffix
+    is tested to eventually open it as a gzipped file.
+
+    If file is another kind of object, it is assumed
+    that this object follows the C{file} interface
+    and it is returned as is.
+
+    @param file: the file to open
+    @type file: C{str} or a file like object
+
+    @return: an iterator on text lines.
+    '''
+ if isinstance(file,str):
+ if urllib2.urlparse.urlparse(file)[0]=='':
+ rep = open(file,*options)
+ else:
+ rep = urllib2.urlopen(file,timeout=15)
+
+ if file[-3:] == '.gz':
+ rep = GzipFile(fileobj=rep)
+ if file[-4:] == '.zip':
+ zip = ZipFile(file=rep)
+ data = zip.infolist()
+            assert len(data)==1,'Only a zipped file containing a single file can be opened'
+ name = data[0].filename
+ rep = zip.open(name)
+ else:
+ rep = file
+ return rep
+
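+# Example (a sketch; the file names below are hypothetical):
+#
+#   lines = universalOpen('sequences.fasta')                     # plain file
+#   lines = universalOpen('sequences.fasta.gz')                  # gzipped file
+#   lines = universalOpen('http://example.org/sequences.fasta')  # URL
+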
+def universalTell(file):
+ '''
+    Return the position in the file even if
+    it is a gzipped one.
+
+ @param file: the file to check
+ @type file: a C{file} like instance
+
+ @return: position in the file
+ @rtype: C{int}
+ '''
+ if isinstance(file, GzipFile):
+ file=file.myfileobj
+ return file.tell()
+
+def fileSize(file):
+ '''
+    Return the file size even if it is a
+    gzipped one.
+
+ @param file: the file to check
+ @type file: a C{file} like instance
+
+ @return: the size of the file
+ @rtype: C{int}
+ '''
+ if isinstance(file, GzipFile):
+ file=file.myfileobj
+ pos = file.tell()
+ file.seek(0,2)
+ length = file.tell()
+ file.seek(pos,0)
+ return length
+
+def progressBar(pos,maxi,reset=False,head='',delta=[],step=[1,0,0]):
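+    # Note: the mutable default arguments delta and step are used on
+    # purpose; they persist between calls and store the timing state of
+    # the progress bar. Pass reset=True to start a new bar.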
+ if reset:
+ del delta[:]
+ if not delta:
+ delta.append(time.time())
+ delta.append(time.time())
+ assert maxi>0
+
+ step[1]+=1
+ if step[1] % step[0] == 0:
+ step[1]=1
+ newtime = time.time()
+ d = newtime-delta[1]
+ if d < 0.2:
+ step[0]*=2
+ elif d > 0.4 and step[0]>1:
+ step[0]/=2
+
+ delta[1]=newtime
+ elapsed = delta[1]-delta[0]
+
+ if callable(pos):
+ pos=pos()
+ percent = float(pos)/maxi * 100
+ remain = time.gmtime(elapsed / percent * (100-percent))
+ days = remain.tm_yday - 1
+ hour = remain.tm_hour
+ minu = remain.tm_min
+ sec = remain.tm_sec
+ if days:
+ remain = "%d days %02d:%02d:%02d" % (days,hour,minu,sec)
+ else:
+ remain = "%02d:%02d:%02d" % (hour,minu,sec)
+ bar = '#' * int(percent/2)
+ step[2]=(step[2]+1) % 4
+ bar+= '|/-\\'[step[2]]
+ bar+= ' ' * (50 - int(percent/2))
+        sys.stderr.write('\r%s %5.1f %% |%s| remain : %s' %(head,percent,bar,remain))
+ else:
+ step[1]+=1
+
+def endLessIterator(endedlist):
+ for x in endedlist:
+ yield x
+ while(1):
+ yield endedlist[-1]
+
+
+def multiLineWrapper(lineiterator):
+    '''
+    Aggregator of strings.
+
+    @param lineiterator: a stream of strings from an opened OBO file.
+    @type lineiterator: a stream of strings.
+
+    @return: an aggregated stanza.
+    @rtype: an iterator on str
+
+    @note: The aggregator joins the lines of an opened OBO file.
+           A line ending with a backslash is continued by the next
+           physical line.
+    '''
+
+ for line in lineiterator:
+ rep = [line]
+ while len(line)>=2 and line[-2]=='\\':
+ rep[-1]=rep[-1][0:-2]
+ try:
+ line = lineiterator.next()
+ except StopIteration:
+ raise FileFormatError
+ rep.append(line)
+ yield ''.join(rep)
+
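+# Example (a sketch): a physical line ending with a backslash is joined
+# with the next one into a single logical line.
+#
+#   lines = iter(['name: a \\\n', 'very long value\n'])
+#   list(multiLineWrapper(lines))  # -> ['name: a very long value\n']
+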
+
+def skipWhiteLineIterator(lineiterator):
+ '''
+ Curator of stanza.
+
+ @param lineiterator: a stream of strings from an opened OBO file.
+ @type lineiterator: a stream of strings.
+
+ @return: a stream of strings without blank strings.
+ @rtype: a stream strings
+
+ @note: The curator skip white lines of the current stanza.
+ '''
+
+ for line in lineiterator:
+ cleanline = line.strip()
+ if cleanline:
+ yield line
+ else:
+ print 'skipped'
+
+
+class ColumnFile(object):
+
+ def __init__(self,stream,sep=None,strip=True,
+ types=None,skip=None,head=None,
+ extra=None,
+ extraformat='([a-zA-Z]\w*) *= *([^;]+);'):
+ self._stream = universalOpen(stream)
+ self._delimiter=sep
+ self._strip=strip
+ self._extra=extra
+ self._extraformat = re.compile(extraformat)
+
+ if types:
+ self._types=[x for x in types]
+ for i in xrange(len(self._types)):
+ if self._types[i] is bool:
+ self._types[i]=ColumnFile.str2bool
+ else:
+ self._types=None
+
+ self._skip = skip
+ if skip is not None:
+ self._lskip= len(skip)
+ else:
+ self._lskip= 0
+ self._head=head
+
+ def str2bool(x):
+ return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
+
+ str2bool = staticmethod(str2bool)
+
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+
+ def cast(txt,type):
+ try:
+ v = type(txt)
+ except:
+ v=None
+ return v
+ ligne = self._stream.next()
+ if self._skip is not None:
+ while ligne[0:self._lskip]==self._skip:
+ ligne = self._stream.next()
+ if self._extra is not None:
+ try:
+ (ligne,extra) = ligne.rsplit(self._extra,1)
+ extra = dict(self._extraformat.findall(extra))
+ except ValueError:
+ extra=None
+ else:
+ extra = None
+ data = ligne.split(self._delimiter)
+ if self._strip or self._types:
+ data = [x.strip() for x in data]
+ if self._types:
+ it = endLessIterator(self._types)
+ data = [cast(*x) for x in ((y,it.next()) for y in data)]
+ if self._head is not None:
+ data=dict(map(None, self._head,data))
+ if extra is not None:
+ data['__extra__']=extra
+ else:
+ if extra is not None:
+ data.append(extra)
+ return data
+
+ def tell(self):
+ return universalTell(self._stream)
+
+
+class CachedDB(object):
+
+ def __init__(self,cachefile,masterdb):
+ self._cache = shelve.open(cachefile,'c')
+ self._db = masterdb
+ self._lock=Lock()
+
+ def _cacheSeq(self,seq):
+ self._lock.acquire()
+ self._cache[seq.id]=seq
+ self._lock.release()
+ return seq
+
+ def __getitem__(self,ac):
+ if isinstance(ac,str):
+ self._lock.acquire()
+ if ac in self._cache:
+# print >>sys.stderr,"Use cache for %s" % ac
+ data = self._cache[ac]
+ self._lock.release()
+
+ else:
+ self._lock.release()
+ data = self._db[ac]
+ self._cacheSeq(data)
+ return data
+ else:
+ self._lock.acquire()
+ acs = [[x,self._cache.get(x,None)] for x in ac]
+ self._lock.release()
+ newacs = [ac for ac,cached in acs if cached is None]
+ if newacs:
+ newseqs = self._db[newacs]
+ else:
+ newseqs = iter([])
+ for r in acs:
+ if r[1] is None:
+ r[1]=self._cacheSeq(newseqs.next())
+# else:
+# print >>sys.stderr,"Use cache for %s" % r[0]
+ return (x[1] for x in acs)
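+
+# Example (a sketch; `db` stands for a hypothetical dict-like sequence
+# database whose values carry an `id` attribute):
+#
+#   cached = CachedDB('/tmp/seqcache', db)
+#   seq = cached['AB000001']   # fetched from db and stored in the shelve
+#   seq = cached['AB000001']   # served from the cache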
+
+
+def moduleInDevelopment(name):
+    warning('This module %s is under development : use it with caution' % name)
+
+
+def deprecatedScript(newscript):
+ current = sys.argv[0]
+ print >>sys.stderr," "
+ print >>sys.stderr," "
+ print >>sys.stderr," "
+ print >>sys.stderr,"#########################################################"
+ print >>sys.stderr,"# #"
+ print >>sys.stderr," W A R N I N G :"
+ print >>sys.stderr," %s is a deprecated script " % os.path.split(current)[1]
+ print >>sys.stderr," it will disappear in the next obitools version"
+ print >>sys.stderr," "
+ print >>sys.stderr," The new corresponding command is %s " % newscript
+ print >>sys.stderr,"# #"
+ print >>sys.stderr,"#########################################################"
+ print >>sys.stderr," "
+ print >>sys.stderr," "
+ print >>sys.stderr," "
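+
+
+if __name__ == '__main__':
+    # A minimal sketch of ColumnFile on an in-memory stream; any object
+    # following the file interface can be used instead of a file name.
+    from cStringIO import StringIO
+    rows = ColumnFile(StringIO('a\t1\t0.5\nb\t2\t0.25\n'),
+                      sep='\t',
+                      types=(str, int, float),
+                      head=('name', 'count', 'score'))
+    for row in rows:
+        print row['name'], row['count'], row['score']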
diff --git a/obitools/utils/__init__.pyc b/obitools/utils/__init__.pyc
new file mode 100644
index 0000000..99512dc
Binary files /dev/null and b/obitools/utils/__init__.pyc differ
diff --git a/obitools/utils/bioseq.py b/obitools/utils/bioseq.py
new file mode 100644
index 0000000..71337c7
--- /dev/null
+++ b/obitools/utils/bioseq.py
@@ -0,0 +1,232 @@
+def mergeTaxonomyClassification(uniqSeq,taxonomy):
+ for seq in uniqSeq:
+ if seq['merged_taxid']:
+ seq['taxid']=taxonomy.lastCommonTaxon(*seq['merged_taxid'].keys())
+ tsp = taxonomy.getSpecies(seq['taxid'])
+ tgn = taxonomy.getGenus(seq['taxid'])
+ tfa = taxonomy.getFamily(seq['taxid'])
+
+ if tsp is not None:
+ sp_sn = taxonomy.getScientificName(tsp)
+ else:
+ sp_sn="###"
+ tsp=-1
+
+ if tgn is not None:
+ gn_sn = taxonomy.getScientificName(tgn)
+ else:
+ gn_sn="###"
+ tgn=-1
+
+ if tfa is not None:
+ fa_sn = taxonomy.getScientificName(tfa)
+ else:
+ fa_sn="###"
+ tfa=-1
+
+ seq['species']=tsp
+ seq['genus']=tgn
+ seq['family']=tfa
+
+ seq['species_sn']=sp_sn
+ seq['genus_sn']=gn_sn
+ seq['family_sn']=fa_sn
+
+ seq['rank']=taxonomy.getRank(seq['taxid'])
+        seq['scientific_name']=taxonomy.getScientificName(seq['taxid'])
+
+def uniqSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,categories=None):
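+    '''
+    Merge identical sequences, i.e. sequences with the same text and the
+    same values for the C{categories} keys, into a single record. The
+    C{count} attribute of the resulting sequence is the sum of the counts
+    of the merged ones, and for each key listed in C{mergedKey} the
+    distribution of its values is accumulated in a C{merged_<key>}
+    qualifier.
+    '''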
+ uniques={}
+ uniqSeq=[]
+
+ if categories is None:
+ categories=[]
+
+ if mergedKey is not None:
+ mergedKey=set(mergedKey)
+ else:
+ mergedKey=set()
+
+ if taxonomy is not None:
+ mergedKey.add('taxid')
+
+ for seq in seqIterator:
+ s = tuple(seq[x] for x in categories) + (str(seq),)
+ if s in uniques:
+ s = uniques[s]
+ if 'count' in seq:
+ s['count']+=seq['count']
+ else:
+ s['count']+=1
+# if taxonomy is not None and 'taxid' in seq:
+# s['merged_taxid'][seq['taxid']]=
+ for key in mergedKey:
+ if key=='taxid' and mergeIds:
+ if 'taxid_dist' in seq:
+ s["taxid_dist"].update(seq["taxid_dist"])
+ if 'taxid' in seq:
+ s["taxid_dist"][seq.id]=seq['taxid']
+
+ mkey = "merged_%s" % key
+ if key in seq:
+ s[mkey][seq[key]]=s[mkey].get(seq[key],0)+1
+ if mkey in seq:
+ for skey in seq[mkey]:
+                        if skey in s[mkey]:
+                            s[mkey][skey]=s[mkey].get(skey,0)+seq[mkey][skey]
+ else:
+ s[mkey][skey]=seq[mkey][skey]
+
+ for key in seq.iterkeys():
+                # Properly merge the merged_* attributes if they exist
+ if key in s and s[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged':
+ del(s[key])
+
+
+ if mergeIds:
+ s['merged'].append(seq.id)
+ else:
+ uniques[s]=seq
+ for key in mergedKey:
+ if key=='taxid' and mergeIds:
+ if 'taxid_dist' not in seq:
+ seq["taxid_dist"]={}
+ if 'taxid' in seq:
+ seq["taxid_dist"][seq.id]=seq['taxid']
+ mkey = "merged_%s" % key
+ if mkey not in seq:
+ seq[mkey]={}
+ if key in seq:
+ seq[mkey][seq[key]]=seq[mkey].get(seq[key],0)+1
+ del(seq[key])
+
+ if 'count' not in seq:
+ seq['count']=1
+ if mergeIds:
+ seq['merged']=[seq.id]
+ uniqSeq.append(seq)
+
+ if taxonomy is not None:
+ mergeTaxonomyClassification(uniqSeq, taxonomy)
+
+
+
+ return uniqSeq
+
+def uniqPrefixSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,categories=None):
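+    '''
+    Variant of C{uniqSequence}: a sequence is also merged into a longer
+    one when it is an exact prefix of that longer sequence.
+    '''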
+
+ if categories is None:
+ categories=[]
+
+ def cmpseq(s1,s2):
+ return cmp(str(s1),str(s2))
+
+ if mergedKey is not None:
+ mergedKey=set(mergedKey)
+ else:
+ mergedKey=set()
+
+ if taxonomy is not None:
+ mergedKey.add('taxid')
+
+ sequences=list(seqIterator)
+
+ if not sequences:
+ return []
+
+ sequences.sort(cmpseq)
+
+
+ old=sequences.pop()
+ uniqSeq=[old]
+ if 'count' not in old:
+ old['count']=1
+ for key in mergedKey:
+ mkey = "merged_%s" % key
+ if mkey not in old:
+ old[mkey]={}
+ if key in old:
+ old[mkey][old[key]]=old[mkey].get(old[key],0)+1
+ if mergeIds:
+ old['merged']=[old.id]
+
+
+ while(sequences):
+ seq=sequences.pop()
+ lseq=len(seq)
+ pold = str(old)[0:lseq]
+ if pold==str(seq):
+
+ if 'count' in seq:
+ old['count']+=seq['count']
+ else:
+ old['count']+=1
+
+ for key in mergedKey:
+ mkey = "merged_%s" % key
+ if key in seq:
+ old[mkey][seq[key]]=old[mkey].get(seq[key],0)+1
+ if mkey in seq:
+ for skey in seq[mkey]:
+                        if skey in old[mkey]:
+                            old[mkey][skey]=old[mkey].get(skey,0)+seq[mkey][skey]
+ else:
+ old[mkey][skey]=seq[mkey][skey]
+
+ for key in seq.iterkeys():
+                if key in old and old[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged':
+ del(old[key])
+
+
+ if mergeIds:
+ old['merged'].append(seq.id)
+ else:
+ old=seq
+
+ for key in mergedKey:
+ mkey = "merged_%s" % key
+ if mkey not in seq:
+ seq[mkey]={}
+ if key in seq:
+ seq[mkey][seq[key]]=seq[mkey].get(seq[key],0)+1
+ del(seq[key])
+
+ if 'count' not in seq:
+ seq['count']=1
+ if mergeIds:
+ seq['merged']=[seq.id]
+ uniqSeq.append(seq)
+
+ if taxonomy is not None:
+ mergeTaxonomyClassification(uniqSeq, taxonomy)
+
+ return uniqSeq
+
+
+
+
+def _cmpOnKeyGenerator(key,reverse=False):
+ def compare(x,y):
+ try:
+ c1 = x[key]
+ except KeyError:
+ c1=None
+
+ try:
+ c2 = y[key]
+ except KeyError:
+ c2=None
+
+ if reverse:
+ s=c1
+ c1=c2
+ c2=s
+ return cmp(c1,c2)
+
+ return compare
+
+def sortSequence(seqIterator,key,reverse=False):
+ seqs = list(seqIterator)
+ seqs.sort(_cmpOnKeyGenerator(key, reverse))
+ return seqs
+
\ No newline at end of file
diff --git a/obitools/utils/crc64.py b/obitools/utils/crc64.py
new file mode 100644
index 0000000..537391e
--- /dev/null
+++ b/obitools/utils/crc64.py
@@ -0,0 +1,53 @@
+#
+# Code obtained from :
+# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/259177/index_txt
+#
+
+# Initialisation
+# 32 first bits of generator polynomial for CRC64
+# the 32 lower bits are assumed to be zero
+
+POLY64REVh = 0xd8000000L
+CRCTableh = [0] * 256
+CRCTablel = [0] * 256
+isInitialized = False
+
+def CRC64(aString):
+ global isInitialized
+ crcl = 0
+ crch = 0
+ if (isInitialized is not True):
+ isInitialized = True
+ for i in xrange(256):
+ partl = i
+ parth = 0L
+ for j in xrange(8):
+ rflag = partl & 1L
+ partl >>= 1L
+ if (parth & 1):
+ partl |= (1L << 31L)
+ parth >>= 1L
+ if rflag:
+ parth ^= POLY64REVh
+                CRCTableh[i] = parth
+                CRCTablel[i] = partl
+
+ for item in aString:
+ shr = 0L
+ shr = (crch & 0xFF) << 24
+ temp1h = crch >> 8L
+ temp1l = (crcl >> 8L) | shr
+ tableindex = (crcl ^ ord(item)) & 0xFF
+
+ crch = temp1h ^ CRCTableh[tableindex]
+ crcl = temp1l ^ CRCTablel[tableindex]
+ return (crch, crcl)
+
+def CRC64digest(aString):
+ return "%08X%08X" % (CRC64(aString))
+
+if __name__ == '__main__':
+ assert CRC64("IHATEMATH") == (3822890454, 2600578513)
+ assert CRC64digest("IHATEMATH") == "E3DCADD69B01ADD1"
+ print 'CRC64: dumb test successful'
+
diff --git a/obitools/utils/iterator.py b/obitools/utils/iterator.py
new file mode 100644
index 0000000..f53537f
--- /dev/null
+++ b/obitools/utils/iterator.py
@@ -0,0 +1,8 @@
+from itertools import chain
+
+def uniqueChain(*args):
+    seen = set()
+    for x in chain(*args):
+        if x not in seen:
+            seen.add(x)
+            yield x
\ No newline at end of file
diff --git a/obitools/utils/iterator.pyc b/obitools/utils/iterator.pyc
new file mode 100644
index 0000000..88d415e
Binary files /dev/null and b/obitools/utils/iterator.pyc differ
diff --git a/obitools/word/__init__.py b/obitools/word/__init__.py
new file mode 100644
index 0000000..c1a4b6b
--- /dev/null
+++ b/obitools/word/__init__.py
@@ -0,0 +1,72 @@
+from itertools import imap
+from _binary import *
+
+def wordCount(liste):
+ count = {}
+
+ for e in liste:
+ count[e]=count.get(e,0) + 1
+
+ return count
+
+
+def wordIterator(sequence,lword,step=1,endIncluded=False,circular=False):
+
+ assert not (endIncluded and circular), \
+ "endIncluded and circular cannot not be set to True at the same time"
+
+ L = len(sequence)
+ sequence = str(sequence)
+ if circular:
+ sequence += sequence[0:lword]
+ pmax=L
+ elif endIncluded:
+ pmax=L
+ else:
+ pmax = L - lword + 1
+
+ pos = xrange(0,pmax,step)
+
+ for x in pos:
+ yield encodeWord(sequence[x:x+lword])
+
+
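+# Example (a sketch; words are returned encoded by _binary.encodeWord and
+# can be turned back into strings with _binary.decodeWord):
+#
+#   for w in wordIterator('acgtacgt', 4, step=2):
+#       print decodeWord(w, 4)
+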
+
+def wordSelector(words,accept=None,reject=None):
+    '''
+    Filter over a DNA word iterator.
+
+    @param words: an iterable object over a list of DNA words
+    @type words: an iterator
+    @param accept: a list of predicates. Each predicate is a function
+                   accepting one str parameter and returning a boolean
+                   value.
+    @type accept: list
+    @param reject: a list of predicates. Each predicate is a function
+                   accepting one str parameter and returning a boolean
+                   value.
+    @type reject: list
+
+    @return: an iterator on DNA words (str)
+    @rtype: iterator
+    '''
+ if accept is None:
+ accept=[]
+ if reject is None:
+ reject=[]
+ for w in words:
+# print [bool(p(w)) for p in accept]
+ accepted = reduce(lambda x,y: bool(x) and bool(y),
+ (p(w) for p in accept),
+ True)
+# print [(p.__name__,bool(p(w))) for p in reject]
+ rejected = reduce(lambda x,y:bool(x) or bool(y),
+ (p(w) for p in reject),
+ False)
+# print decodeWord(w,5),accepted,rejected,
+ if accepted and not rejected:
+# print " conserved"
+ yield w
+# else:
+# print
+
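+
+if __name__ == '__main__':
+    # A small sketch with plain python strings and lambda predicates; the
+    # real pipeline feeds encoded words and predicates built by
+    # obitools.word.predicate.
+    words = ['aacgt', 'aaaaa', 'acgtc']
+    noPolyA = lambda w: 'aaaa' not in w
+    print list(wordSelector(words, accept=[noPolyA]))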
diff --git a/obitools/word/_binary.so b/obitools/word/_binary.so
new file mode 100755
index 0000000..1780762
Binary files /dev/null and b/obitools/word/_binary.so differ
diff --git a/obitools/word/options.py b/obitools/word/options.py
new file mode 100644
index 0000000..ff44e57
--- /dev/null
+++ b/obitools/word/options.py
@@ -0,0 +1,116 @@
+from obitools.word import wordSelector
+from obitools.word import allDNAWordIterator,encodeWord
+from obitools.word import predicate
+
+
+
+
+def _acceptedOptionCallback(options,opt,value,parser):
+ if not hasattr(parser.values, 'acceptedOligo'):
+ parser.values.acceptedOligo=[]
+ parser.values.acceptedOligo.append(predicate.predicateMatchPattern(value,))
+
+def _rejectedOptionCallback(options,opt,value,parser):
+ if not hasattr(parser.values, 'rejectedOligo'):
+ parser.values.rejectedOligo=[]
+ parser.values.rejectedOligo.append(predicate.predicateMatchPattern(value))
+
+
+
+def addOligoOptions(optionManager):
+
+ optionManager.add_option('-L','--oligo-list',
+ action="store", dest="oligoList",
+ metavar="",
+ type="str",
+ help="filename containing a list of oligonucleotide")
+
+
+ optionManager.add_option('-s','--oligo-size',
+ action="store", dest="oligoSize",
+ metavar="<###>",
+ type="int",
+ help="Size of oligonucleotide to generate")
+
+ optionManager.add_option('-f','--family-size',
+ action="store", dest="familySize",
+ metavar="<###>",
+ type="int",
+ help="Size of oligonucleotide family to generate")
+
+ optionManager.add_option('-d','--distance',
+ action="store", dest="oligoDist",
+ metavar="<###>",
+ type="int",
+ default=1,
+ help="minimal distance between two oligonucleotides")
+
+ optionManager.add_option('-g','--gc-max',
+ action="store", dest="gcMax",
+ metavar="<###>",
+ type="int",
+ default=0,
+ help="maximum count of G or C nucleotide acceptable in a word")
+
+ optionManager.add_option('-a','--accepted',
+ action="append",dest="acceptedPattern",
+ metavar="",
+ default=[],
+ type="str",
+ help="pattern of accepted oligonucleotide")
+
+ optionManager.add_option('-r','--rejected',
+ action="append",dest="rejectedPattern",
+ metavar="",
+ default=[],
+ type="str",
+ help="pattern of rejected oligonucleotide")
+
+ optionManager.add_option('-p','--homopolymer',
+ action="store", dest="homopolymere",
+ metavar="<###>",
+ type="int",
+ default=0,
+ help="reject oligo with homopolymer longer than.")
+
+ optionManager.add_option('-P','--homopolymer-min',
+ action="store", dest="homopolymere_min",
+ metavar="<###>",
+ type="int",
+ default=0,
+ help="accept only oligo with homopolymer longer or equal to.")
+
+def dnaWordIterator(options):
+
+ assert options.oligoSize is not None or options.oligoList is not None,"option -s or --oligo-size must be specified"
+ assert options.familySize is not None,"option -f or --family-size must be specified"
+ assert options.oligoDist is not None,"option -d or --distance must be specified"
+
+ if options.oligoList is not None:
+ words = (encodeWord(x.strip().lower()) for x in open(options.oligoList))
+ else:
+ words = allDNAWordIterator(options.oligoSize)
+ #seed = 'a' * options.oligoSize
+ options.acceptedOligo=[]
+ for p in options.acceptedPattern:
+ assert len(p)==options.oligoSize,"Accept pattern with bad lenth : %s" % p
+ options.acceptedOligo.append(predicate.predicateMatchPattern(p, options.oligoSize))
+
+ options.rejectedOligo=[]
+ for p in options.rejectedPattern:
+ assert len(p)==options.oligoSize,"Reject pattern with bad lenth : %s" % p
+ options.rejectedOligo.append(predicate.predicateMatchPattern(p, options.oligoSize))
+
+
+ #options.acceptedOligo.append(predicat.distMinGenerator(seed, options.oligoDist))
+
+ if options.homopolymere:
+ options.rejectedOligo.append(predicate.predicateHomoPolymerLarger(options.homopolymere, options.oligoSize))
+
+ if options.homopolymere_min:
+ options.acceptedOligo.append(predicate.predicateHomoPolymerLarger(options.homopolymere_min-1, options.oligoSize))
+
+ if options.gcMax:
+ options.rejectedOligo.append(predicate.predicateGCUpperBond(options.gcMax, options.oligoSize))
+
+ return wordSelector(words, options.acceptedOligo, options.rejectedOligo)
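+
+# Example (a sketch; `parser` stands for any object exposing the
+# optparse-like add_option interface used above):
+#
+#   addOligoOptions(parser)
+#   (options, args) = parser.parse_args()
+#   for word in dnaWordIterator(options):
+#       process(word)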
diff --git a/obitools/word/predicate.py b/obitools/word/predicate.py
new file mode 100644
index 0000000..082b80f
--- /dev/null
+++ b/obitools/word/predicate.py
@@ -0,0 +1,41 @@
+#@PydevCodeAnalysisIgnore
+'''
+Created on 14 oct. 2009
+
+@author: coissac
+'''
+
+from _binary import wordDist, \
+ homoMax, \
+ countCG, \
+ matchPattern, \
+ encodePattern
+
+def predicateWordDistMin(word,dmin,size):
+ def predicate(w):
+ return wordDist(word, w) >= dmin
+ return predicate
+
+def predicateHomoPolymerLarger(count,size):
+ def predicate(w):
+ return homoMax(w, size) > count
+ return predicate
+
+def predicateHomoPolymerSmaller(count,size):
+ def predicate(w):
+ return homoMax(w, size) < count
+ return predicate
+
+def predicateGCUpperBond(count,size):
+ def predicate(w):
+ return countCG(w, size) > count
+ return predicate
+
+def predicateMatchPattern(pattern,size):
+ pattern=encodePattern(pattern)
+ def predicate(w):
+ return matchPattern(w, pattern)
+ return predicate
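+
+# Example (a sketch): build predicates for 8-mers and use them with
+# obitools.word.wordSelector, as obitools.word.options does.
+#
+#   accept = [predicateHomoPolymerSmaller(5, 8)]
+#   reject = [predicateGCUpperBond(4, 8)]
+#   good = wordSelector(words, accept, reject)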
+
+
+
diff --git a/obitools/zipfile.py b/obitools/zipfile.py
new file mode 100644
index 0000000..41e4bcb
--- /dev/null
+++ b/obitools/zipfile.py
@@ -0,0 +1,1282 @@
+"""
+Read and write ZIP files.
+"""
+import struct, os, time, sys, shutil
+import binascii, cStringIO
+
+try:
+ import zlib # We may need its compression method
+ crc32 = zlib.crc32
+except ImportError:
+ zlib = None
+ crc32 = binascii.crc32
+
+__all__ = ["BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", "is_zipfile",
+ "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile" ]
+
+class BadZipfile(Exception):
+ pass
+
+
+class LargeZipFile(Exception):
+ """
+ Raised when writing a zipfile, the zipfile requires ZIP64 extensions
+ and those extensions are disabled.
+ """
+
+error = BadZipfile # The exception raised by this module
+
+ZIP64_LIMIT= (1 << 31) - 1
+
+# constants for Zip file compression methods
+ZIP_STORED = 0
+ZIP_DEFLATED = 8
+# Other ZIP compression methods not supported
+
+# Here are some struct module formats for reading headers
+structEndArchive = "<4s4H2LH" # 9 items, end of archive, 22 bytes
+stringEndArchive = "PK\005\006" # magic number for end of archive record
+structCentralDir = "<4s4B4HLLL5HLL"# 19 items, central directory, 46 bytes
+stringCentralDir = "PK\001\002" # magic number for central directory
+structFileHeader = "<4s2B4HLLL2H" # 12 items, file header record, 30 bytes
+stringFileHeader = "PK\003\004" # magic number for file header
+structEndArchive64Locator = "<4sLQL" # 4 items, locate Zip64 header, 20 bytes
+stringEndArchive64Locator = "PK\x06\x07" # magic token for locator header
+structEndArchive64 = "<4sQHHLLQQQQ" # 10 items, end of archive (Zip64), 56 bytes
+stringEndArchive64 = "PK\x06\x06" # magic token for Zip64 header
+
+
+# indexes of entries in the central directory structure
+_CD_SIGNATURE = 0
+_CD_CREATE_VERSION = 1
+_CD_CREATE_SYSTEM = 2
+_CD_EXTRACT_VERSION = 3
+_CD_EXTRACT_SYSTEM = 4 # is this meaningful?
+_CD_FLAG_BITS = 5
+_CD_COMPRESS_TYPE = 6
+_CD_TIME = 7
+_CD_DATE = 8
+_CD_CRC = 9
+_CD_COMPRESSED_SIZE = 10
+_CD_UNCOMPRESSED_SIZE = 11
+_CD_FILENAME_LENGTH = 12
+_CD_EXTRA_FIELD_LENGTH = 13
+_CD_COMMENT_LENGTH = 14
+_CD_DISK_NUMBER_START = 15
+_CD_INTERNAL_FILE_ATTRIBUTES = 16
+_CD_EXTERNAL_FILE_ATTRIBUTES = 17
+_CD_LOCAL_HEADER_OFFSET = 18
+
+# indexes of entries in the local file header structure
+_FH_SIGNATURE = 0
+_FH_EXTRACT_VERSION = 1
+_FH_EXTRACT_SYSTEM = 2 # is this meaningful?
+_FH_GENERAL_PURPOSE_FLAG_BITS = 3
+_FH_COMPRESSION_METHOD = 4
+_FH_LAST_MOD_TIME = 5
+_FH_LAST_MOD_DATE = 6
+_FH_CRC = 7
+_FH_COMPRESSED_SIZE = 8
+_FH_UNCOMPRESSED_SIZE = 9
+_FH_FILENAME_LENGTH = 10
+_FH_EXTRA_FIELD_LENGTH = 11
+
+def is_zipfile(filename):
+ """Quickly see if file is a ZIP file by checking the magic number."""
+ try:
+ fpin = open(filename, "rb")
+ endrec = _EndRecData(fpin)
+ fpin.close()
+ if endrec:
+ return True # file has correct magic number
+ except IOError:
+ pass
+ return False
+
+def _EndRecData64(fpin, offset, endrec):
+ """
+ Read the ZIP64 end-of-archive records and use that to update endrec
+ """
+ locatorSize = struct.calcsize(structEndArchive64Locator)
+ fpin.seek(offset - locatorSize, 2)
+ data = fpin.read(locatorSize)
+ sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data)
+ if sig != stringEndArchive64Locator:
+ return endrec
+
+ if diskno != 0 or disks != 1:
+ raise BadZipfile("zipfiles that span multiple disks are not supported")
+
+ # Assume no 'zip64 extensible data'
+ endArchiveSize = struct.calcsize(structEndArchive64)
+ fpin.seek(offset - locatorSize - endArchiveSize, 2)
+ data = fpin.read(endArchiveSize)
+ sig, sz, create_version, read_version, disk_num, disk_dir, \
+ dircount, dircount2, dirsize, diroffset = \
+ struct.unpack(structEndArchive64, data)
+ if sig != stringEndArchive64:
+ return endrec
+
+ # Update the original endrec using data from the ZIP64 record
+ endrec[1] = disk_num
+ endrec[2] = disk_dir
+ endrec[3] = dircount
+ endrec[4] = dircount2
+ endrec[5] = dirsize
+ endrec[6] = diroffset
+ return endrec
+
+
+def _EndRecData(fpin):
+ """Return data from the "End of Central Directory" record, or None.
+
+ The data is a list of the nine items in the ZIP "End of central dir"
+ record followed by a tenth item, the file seek offset of this record."""
+ fpin.seek(-22, 2) # Assume no archive comment.
+ filesize = fpin.tell() + 22 # Get file size
+ data = fpin.read()
+ if data[0:4] == stringEndArchive and data[-2:] == "\000\000":
+ endrec = struct.unpack(structEndArchive, data)
+ endrec = list(endrec)
+ endrec.append("") # Append the archive comment
+ endrec.append(filesize - 22) # Append the record start offset
+ if endrec[-4] == 0xffffffff:
+ return _EndRecData64(fpin, -22, endrec)
+ return endrec
+ # Search the last END_BLOCK bytes of the file for the record signature.
+ # The comment is appended to the ZIP file and has a 16 bit length.
+ # So the comment may be up to 64K long. We limit the search for the
+ # signature to a few Kbytes at the end of the file for efficiency.
+ # also, the signature must not appear in the comment.
+ END_BLOCK = min(filesize, 1024 * 4)
+ fpin.seek(filesize - END_BLOCK, 0)
+ data = fpin.read()
+ start = data.rfind(stringEndArchive)
+ if start >= 0: # Correct signature string was found
+ endrec = struct.unpack(structEndArchive, data[start:start+22])
+ endrec = list(endrec)
+ comment = data[start+22:]
+ if endrec[7] == len(comment): # Comment length checks out
+ # Append the archive comment and start offset
+ endrec.append(comment)
+ endrec.append(filesize - END_BLOCK + start)
+ if endrec[-4] == 0xffffffff:
+ return _EndRecData64(fpin, - END_BLOCK + start, endrec)
+ return endrec
+ return # Error, return None
+
+
+class ZipInfo (object):
+ """Class with attributes describing each file in the ZIP archive."""
+
+ __slots__ = (
+ 'orig_filename',
+ 'filename',
+ 'date_time',
+ 'compress_type',
+ 'comment',
+ 'extra',
+ 'create_system',
+ 'create_version',
+ 'extract_version',
+ 'reserved',
+ 'flag_bits',
+ 'volume',
+ 'internal_attr',
+ 'external_attr',
+ 'header_offset',
+ 'CRC',
+ 'compress_size',
+ 'file_size',
+ '_raw_time',
+ )
+
+ def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
+ self.orig_filename = filename # Original file name in archive
+
+ # Terminate the file name at the first null byte. Null bytes in file
+ # names are used as tricks by viruses in archives.
+ null_byte = filename.find(chr(0))
+ if null_byte >= 0:
+ filename = filename[0:null_byte]
+ # This is used to ensure paths in generated ZIP files always use
+ # forward slashes as the directory separator, as required by the
+ # ZIP format specification.
+ if os.sep != "/" and os.sep in filename:
+ filename = filename.replace(os.sep, "/")
+
+ self.filename = filename # Normalized file name
+ self.date_time = date_time # year, month, day, hour, min, sec
+ # Standard values:
+ self.compress_type = ZIP_STORED # Type of compression for the file
+ self.comment = "" # Comment for each file
+ self.extra = "" # ZIP extra data
+ if sys.platform == 'win32':
+ self.create_system = 0 # System which created ZIP archive
+ else:
+ # Assume everything else is unix-y
+ self.create_system = 3 # System which created ZIP archive
+ self.create_version = 20 # Version which created ZIP archive
+ self.extract_version = 20 # Version needed to extract archive
+ self.reserved = 0 # Must be zero
+ self.flag_bits = 0 # ZIP flag bits
+ self.volume = 0 # Volume number of file header
+ self.internal_attr = 0 # Internal attributes
+ self.external_attr = 0 # External file attributes
+ # Other attributes are set by class ZipFile:
+ # header_offset Byte offset to the file header
+ # CRC CRC-32 of the uncompressed file
+ # compress_size Size of the compressed file
+ # file_size Size of the uncompressed file
+
+ def FileHeader(self):
+ """Return the per-file header as a string."""
+ dt = self.date_time
+ dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
+ dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
+ if self.flag_bits & 0x08:
+ # Set these to zero because we write them after the file data
+ CRC = compress_size = file_size = 0
+ else:
+ CRC = self.CRC
+ compress_size = self.compress_size
+ file_size = self.file_size
+
+ extra = self.extra
+
+ if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT:
+ # File is larger than what fits into a 4 byte integer,
+ # fall back to the ZIP64 extension
+            fmt = '<HHQQ'
+            extra = extra + struct.pack(fmt,
+                    1, struct.calcsize(fmt)-4, file_size, compress_size)
+            file_size = 0xffffffff   # -1
+            compress_size = 0xffffffff   # -1
+            self.extract_version = max(45, self.extract_version)
+            self.create_version = max(45, self.extract_version)
+
+        header = struct.pack(structFileHeader, stringFileHeader,
+                 self.extract_version, self.reserved, self.flag_bits,
+                 self.compress_type, dostime, dosdate, CRC,
+                 compress_size, file_size,
+                 len(self.filename), len(extra))
+        return header + self.filename + extra
+
+    def _decodeExtra(self):
+        # Try to decode the extra field.
+        extra = self.extra
+        unpack = struct.unpack
+        while extra:
+            tp, ln = unpack('<HH', extra[:4])
+            if tp == 1:
+                if ln >= 24:
+                    counts = unpack('<QQQ', extra[4:28])
+                elif ln == 16:
+                    counts = unpack('<QQ', extra[4:20])
+                elif ln == 8:
+                    counts = unpack('<Q', extra[4:12])
+                elif ln == 0:
+                    counts = ()
+                else:
+                    raise RuntimeError, "Corrupt extra field %s"%(ln,)
+
+                idx = 0
+
+                # ZIP64 extension (large files and/or large archives)
+                if self.file_size == 0xffffffffL:
+                    self.file_size = counts[idx]
+                    idx += 1
+
+                if self.compress_size == 0xffffffffL:
+                    self.compress_size = counts[idx]
+                    idx += 1
+
+                if self.header_offset == 0xffffffffL:
+                    self.header_offset = counts[idx]
+                    idx += 1
+
+            extra = extra[ln+4:]
+
+
+class _ZipDecrypter:
+    """Class to handle decryption of files stored within a ZIP archive.
+
+    ZIP supports a password-based form of encryption. Even though known
+    plaintext attacks have been found against it, it is still useful
+    to be able to get data out of such a file.
+
+    Usage:
+        zd = _ZipDecrypter(mypwd)
+        plain_char = zd(cypher_char)
+        plain_text = map(zd, cypher_text)
+    """
+
+    def _GenerateCRCTable():
+        """Generate a CRC-32 table.
+
+        ZIP encryption uses the CRC32 one-byte primitive for scrambling
+        some internal keys. We noticed that a direct implementation is
+        faster than relying on binascii.crc32().
+        """
+        poly = 0xedb88320
+        table = [0] * 256
+        for i in range(256):
+            crc = i
+            for j in range(8):
+                if crc & 1:
+                    crc = ((crc >> 1) & 0x7FFFFFFF) ^ poly
+                else:
+                    crc = ((crc >> 1) & 0x7FFFFFFF)
+            table[i] = crc
+        return table
+ crctable = _GenerateCRCTable()
+
+ def _crc32(self, ch, crc):
+ """Compute the CRC32 primitive on one byte."""
+ return ((crc >> 8) & 0xffffff) ^ self.crctable[(crc ^ ord(ch)) & 0xff]
+
+ def __init__(self, pwd):
+ self.key0 = 305419896
+ self.key1 = 591751049
+ self.key2 = 878082192
+ for p in pwd:
+ self._UpdateKeys(p)
+
+ def _UpdateKeys(self, c):
+ self.key0 = self._crc32(c, self.key0)
+ self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295
+ self.key1 = (self.key1 * 134775813 + 1) & 4294967295
+ self.key2 = self._crc32(chr((self.key1 >> 24) & 255), self.key2)
+
+ def __call__(self, c):
+ """Decrypt a single character."""
+ c = ord(c)
+ k = self.key2 | 2
+ c = c ^ (((k * (k^1)) >> 8) & 255)
+ c = chr(c)
+ self._UpdateKeys(c)
+ return c
+
+class ZipExtFile:
+ """File-like object for reading an archive member.
+ Is returned by ZipFile.open().
+ """
+
+ def __init__(self, fileobj, zipinfo, decrypt=None):
+ self.fileobj = fileobj
+ self.decrypter = decrypt
+ self.bytes_read = 0L
+ self.rawbuffer = ''
+ self.readbuffer = ''
+ self.linebuffer = ''
+ self.eof = False
+ self.univ_newlines = False
+ self.nlSeps = ("\n", )
+ self.lastdiscard = ''
+
+ self.compress_type = zipinfo.compress_type
+ self.compress_size = zipinfo.compress_size
+
+ self.closed = False
+ self.mode = "r"
+ self.name = zipinfo.filename
+
+ # read from compressed files in 64k blocks
+ self.compreadsize = 64*1024
+ if self.compress_type == ZIP_DEFLATED:
+ self.dc = zlib.decompressobj(-15)
+
+ def set_univ_newlines(self, univ_newlines):
+ self.univ_newlines = univ_newlines
+
+ # pick line separator char(s) based on universal newlines flag
+ self.nlSeps = ("\n", )
+ if self.univ_newlines:
+ self.nlSeps = ("\r\n", "\r", "\n")
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ nextline = self.readline()
+ if not nextline:
+ raise StopIteration()
+
+ return nextline
+
+ def close(self):
+ self.closed = True
+
+ def _checkfornewline(self):
+ nl, nllen = -1, -1
+ if self.linebuffer:
+ # ugly check for cases where half of an \r\n pair was
+ # read on the last pass, and the \r was discarded. In this
+ # case we just throw away the \n at the start of the buffer.
+ if (self.lastdiscard, self.linebuffer[0]) == ('\r','\n'):
+ self.linebuffer = self.linebuffer[1:]
+
+ for sep in self.nlSeps:
+ nl = self.linebuffer.find(sep)
+ if nl >= 0:
+ nllen = len(sep)
+ return nl, nllen
+
+ return nl, nllen
+
+ def readline(self, size = -1):
+ """Read a line with approx. size. If size is negative,
+ read a whole line.
+ """
+ if size < 0:
+ size = sys.maxint
+ elif size == 0:
+ return ''
+
+ # check for a newline already in buffer
+ nl, nllen = self._checkfornewline()
+
+ if nl >= 0:
+ # the next line was already in the buffer
+ nl = min(nl, size)
+ else:
+ # no line break in buffer - try to read more
+ size -= len(self.linebuffer)
+ while nl < 0 and size > 0:
+ buf = self.read(min(size, 100))
+ if not buf:
+ break
+ self.linebuffer += buf
+ size -= len(buf)
+
+ # check for a newline in buffer
+ nl, nllen = self._checkfornewline()
+
+ # we either ran out of bytes in the file, or
+ # met the specified size limit without finding a newline,
+ # so return current buffer
+ if nl < 0:
+ s = self.linebuffer
+ self.linebuffer = ''
+ return s
+
+ buf = self.linebuffer[:nl]
+ self.lastdiscard = self.linebuffer[nl:nl + nllen]
+ self.linebuffer = self.linebuffer[nl + nllen:]
+
+ # line is always returned with \n as newline char (except possibly
+ # for a final incomplete line in the file, which is handled above).
+ return buf + "\n"
+
+ def readlines(self, sizehint = -1):
+ """Return a list with all (following) lines. The sizehint parameter
+ is ignored in this implementation.
+ """
+ result = []
+ while True:
+ line = self.readline()
+ if not line: break
+ result.append(line)
+ return result
+
+ def read(self, size = None):
+ # act like file() obj and return empty string if size is 0
+ if size == 0:
+ return ''
+
+ # determine read size
+ bytesToRead = self.compress_size - self.bytes_read
+
+ # adjust read size for encrypted files since the first 12 bytes
+ # are for the encryption/password information
+ if self.decrypter is not None:
+ bytesToRead -= 12
+
+ if size is not None and size >= 0:
+ if self.compress_type == ZIP_STORED:
+ lr = len(self.readbuffer)
+ bytesToRead = min(bytesToRead, size - lr)
+ elif self.compress_type == ZIP_DEFLATED:
+ if len(self.readbuffer) > size:
+ # the user has requested fewer bytes than we've already
+ # pulled through the decompressor; don't read any more
+ bytesToRead = 0
+ else:
+ # user will use up the buffer, so read some more
+ lr = len(self.rawbuffer)
+ bytesToRead = min(bytesToRead, self.compreadsize - lr)
+
+ # avoid reading past end of file contents
+ if bytesToRead + self.bytes_read > self.compress_size:
+ bytesToRead = self.compress_size - self.bytes_read
+
+ # try to read from file (if necessary)
+ if bytesToRead > 0:
+ bytes = self.fileobj.read(bytesToRead)
+ self.bytes_read += len(bytes)
+ self.rawbuffer += bytes
+
+ # handle contents of raw buffer
+ if self.rawbuffer:
+ newdata = self.rawbuffer
+ self.rawbuffer = ''
+
+ # decrypt new data if we were given an object to handle that
+ if newdata and self.decrypter is not None:
+ newdata = ''.join(map(self.decrypter, newdata))
+
+ # decompress newly read data if necessary
+ if newdata and self.compress_type == ZIP_DEFLATED:
+ newdata = self.dc.decompress(newdata)
+ self.rawbuffer = self.dc.unconsumed_tail
+ if self.eof and len(self.rawbuffer) == 0:
+ # we're out of raw bytes (both from the file and
+ # the local buffer); flush just to make sure the
+ # decompressor is done
+ newdata += self.dc.flush()
+ # prevent decompressor from being used again
+ self.dc = None
+
+ self.readbuffer += newdata
+
+
+ # return what the user asked for
+ if size is None or len(self.readbuffer) <= size:
+ bytes = self.readbuffer
+ self.readbuffer = ''
+ else:
+ bytes = self.readbuffer[:size]
+ self.readbuffer = self.readbuffer[size:]
+
+ return bytes
+
+
+class ZipFile:
+ """ Class with methods to open, read, write, close, list zip files.
+
+ z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=True)
+
+ @var file: Either the path to the file, or a file-like object.
+ If it is a path, the file will be opened and closed by ZipFile.
+ @var mode: The mode can be either read "r", write "w" or append "a".
+ @var compression: ZIP_STORED (no compression) or ZIP_DEFLATED (requires zlib).
+ @var allowZip64: if True ZipFile will create files with ZIP64 extensions when
+ needed, otherwise it will raise an exception when this would
+ be necessary.
+
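+    Typical use (a sketch; the archive name is hypothetical)::
+
+        z = ZipFile('archive.zip')
+        print z.namelist()
+        data = z.read(z.namelist()[0])
+        z.close()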
+ """
+
+ fp = None # Set here since __del__ checks it
+
+ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=False):
+ """Open the ZIP file with mode read "r", write "w" or append "a"."""
+ if mode not in ("r", "w", "a"):
+ raise RuntimeError('ZipFile() requires mode "r", "w", or "a"')
+
+ if compression == ZIP_STORED:
+ pass
+ elif compression == ZIP_DEFLATED:
+ if not zlib:
+ raise RuntimeError,\
+ "Compression requires the (missing) zlib module"
+ else:
+ raise RuntimeError, "That compression method is not supported"
+
+ self._allowZip64 = allowZip64
+ self._didModify = False
+ self.debug = 0 # Level of printing: 0 through 3
+ self.NameToInfo = {} # Find file info given name
+ self.filelist = [] # List of ZipInfo instances for archive
+ self.compression = compression # Method of compression
+ self.mode = key = mode.replace('b', '')[0]
+ self.pwd = None
+
+ # Check if we were passed a file-like object
+ if isinstance(file, basestring):
+ self._filePassed = 0
+ self.filename = file
+ modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'}
+ try:
+ self.fp = open(file, modeDict[mode])
+ except IOError:
+ if mode == 'a':
+ mode = key = 'w'
+ self.fp = open(file, modeDict[mode])
+ else:
+ raise
+ else:
+ self._filePassed = 1
+ self.fp = file
+ self.filename = getattr(file, 'name', None)
+
+ if key == 'r':
+ self._GetContents()
+ elif key == 'w':
+ pass
+ elif key == 'a':
+ try: # See if file is a zip file
+ self._RealGetContents()
+ # seek to start of directory and overwrite
+ self.fp.seek(self.start_dir, 0)
+ except BadZipfile: # file is not a zip file, just append
+ self.fp.seek(0, 2)
+ else:
+ if not self._filePassed:
+ self.fp.close()
+ self.fp = None
+ raise RuntimeError, 'Mode must be "r", "w" or "a"'
+
+ def _GetContents(self):
+ """Read the directory, making sure we close the file if the format
+ is bad."""
+ try:
+ self._RealGetContents()
+ except BadZipfile:
+ if not self._filePassed:
+ self.fp.close()
+ self.fp = None
+ raise
+
+ def _RealGetContents(self):
+ """Read in the table of contents for the ZIP file."""
+ fp = self.fp
+ endrec = _EndRecData(fp)
+ if not endrec:
+ raise BadZipfile, "File is not a zip file"
+ if self.debug > 1:
+ print endrec
+ size_cd = endrec[5] # bytes in central directory
+ offset_cd = endrec[6] # offset of central directory
+ self.comment = endrec[8] # archive comment
+ # endrec[9] is the offset of the "End of Central Dir" record
+ if endrec[9] > ZIP64_LIMIT:
+ x = endrec[9] - size_cd - 56 - 20
+ else:
+ x = endrec[9] - size_cd
+ # "concat" is zero, unless zip was concatenated to another file
+ concat = x - offset_cd
+ if self.debug > 2:
+ print "given, inferred, offset", offset_cd, x, concat
+ # self.start_dir: Position of start of central directory
+ self.start_dir = offset_cd + concat
+ fp.seek(self.start_dir, 0)
+ data = fp.read(size_cd)
+ fp = cStringIO.StringIO(data)
+ total = 0
+ while total < size_cd:
+ centdir = fp.read(46)
+ total = total + 46
+ if centdir[0:4] != stringCentralDir:
+ raise BadZipfile, "Bad magic number for central directory"
+ centdir = struct.unpack(structCentralDir, centdir)
+ if self.debug > 2:
+ print centdir
+ filename = fp.read(centdir[_CD_FILENAME_LENGTH])
+ # Create ZipInfo instance to store file information
+ x = ZipInfo(filename)
+ x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
+ x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
+ total = (total + centdir[_CD_FILENAME_LENGTH]
+ + centdir[_CD_EXTRA_FIELD_LENGTH]
+ + centdir[_CD_COMMENT_LENGTH])
+ x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
+ (x.create_version, x.create_system, x.extract_version, x.reserved,
+ x.flag_bits, x.compress_type, t, d,
+ x.CRC, x.compress_size, x.file_size) = centdir[1:12]
+ x.volume, x.internal_attr, x.external_attr = centdir[15:18]
+ # Convert date/time code to (year, month, day, hour, min, sec)
+ x._raw_time = t
+ x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
+ t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
+
+ x._decodeExtra()
+ x.header_offset = x.header_offset + concat
+ self.filelist.append(x)
+ self.NameToInfo[x.filename] = x
+ if self.debug > 2:
+ print "total", total
+
+
+ def namelist(self):
+ """Return a list of file names in the archive."""
+ l = []
+ for data in self.filelist:
+ l.append(data.filename)
+ return l
+
+ def infolist(self):
+ """Return a list of class ZipInfo instances for files in the
+ archive."""
+ return self.filelist
+
+ def printdir(self):
+ """Print a table of contents for the zip file."""
+ print "%-46s %19s %12s" % ("File Name", "Modified ", "Size")
+ for zinfo in self.filelist:
+ date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6]
+ print "%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size)
+
+ def testzip(self):
+ """Read all the files and check the CRC."""
+ for zinfo in self.filelist:
+ try:
+ self.read(zinfo.filename) # Check CRC-32
+ except BadZipfile:
+ return zinfo.filename
+
+
+ def getinfo(self, name):
+ """Return the instance of ZipInfo given 'name'."""
+ info = self.NameToInfo.get(name)
+ if info is None:
+ raise KeyError(
+ 'There is no item named %r in the archive' % name)
+
+ return info
+
+ def setpassword(self, pwd):
+ """Set default password for encrypted files."""
+ self.pwd = pwd
+
+ def read(self, name, pwd=None):
+ """Return file bytes (as a string) for name."""
+ return self.open(name, "r", pwd).read()
+
+ def open(self, name, mode="r", pwd=None):
+ """Return file-like object for 'name'."""
+ if mode not in ("r", "U", "rU"):
+ raise RuntimeError, 'open() requires mode "r", "U", or "rU"'
+ if not self.fp:
+ raise RuntimeError, \
+ "Attempt to read ZIP archive that was already closed"
+
+ # Only open a new file for instances where we were not
+ # given a file object in the constructor
+ if self._filePassed:
+ zef_file = self.fp
+ else:
+ zef_file = open(self.filename, 'rb')
+
+ # Get info object for name
+ zinfo = self.getinfo(name)
+
+ filepos = zef_file.tell()
+
+ zef_file.seek(zinfo.header_offset, 0)
+
+ # Skip the file header:
+ fheader = zef_file.read(30)
+ if fheader[0:4] != stringFileHeader:
+ raise BadZipfile, "Bad magic number for file header"
+
+ fheader = struct.unpack(structFileHeader, fheader)
+ fname = zef_file.read(fheader[_FH_FILENAME_LENGTH])
+ if fheader[_FH_EXTRA_FIELD_LENGTH]:
+ zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH])
+
+ if fname != zinfo.orig_filename:
+ raise BadZipfile, \
+ 'File name in directory "%s" and header "%s" differ.' % (
+ zinfo.orig_filename, fname)
+
+ # check for encrypted flag & handle password
+ is_encrypted = zinfo.flag_bits & 0x1
+ zd = None
+ if is_encrypted:
+ if not pwd:
+ pwd = self.pwd
+ if not pwd:
+ raise RuntimeError, "File %s is encrypted, " \
+ "password required for extraction" % name
+
+ zd = _ZipDecrypter(pwd)
+ # The first 12 bytes in the cypher stream is an encryption header
+ # used to strengthen the algorithm. The first 11 bytes are
+ # completely random, while the 12th contains the MSB of the CRC,
+ # or the MSB of the file time depending on the header type
+ # and is used to check the correctness of the password.
+ bytes = zef_file.read(12)
+ h = map(zd, bytes[0:12])
+ if zinfo.flag_bits & 0x8:
+ # compare against the file type from extended local headers
+ check_byte = (zinfo._raw_time >> 8) & 0xff
+ else:
+ # compare against the CRC otherwise
+ check_byte = (zinfo.CRC >> 24) & 0xff
+ if ord(h[11]) != check_byte:
+ raise RuntimeError("Bad password for file", name)
+
+ # build and return a ZipExtFile
+ if zd is None:
+ zef = ZipExtFile(zef_file, zinfo)
+ else:
+ zef = ZipExtFile(zef_file, zinfo, zd)
+
+ # set universal newlines on ZipExtFile if necessary
+ if "U" in mode:
+ zef.set_univ_newlines(True)
+ return zef
+
+ def extract(self, member, path=None, pwd=None):
+ """Extract a member from the archive to the current working directory,
+ using its full name. Its file information is extracted as accurately
+ as possible. `member' may be a filename or a ZipInfo object. You can
+ specify a different directory using `path'.
+ """
+ if not isinstance(member, ZipInfo):
+ member = self.getinfo(member)
+
+ if path is None:
+ path = os.getcwd()
+
+ return self._extract_member(member, path, pwd)
+
+ def extractall(self, path=None, members=None, pwd=None):
+ """Extract all members from the archive to the current working
+ directory. `path' specifies a different directory to extract to.
+ `members' is optional and must be a subset of the list returned
+ by namelist().
+ """
+ if members is None:
+ members = self.namelist()
+
+ for zipinfo in members:
+ self.extract(zipinfo, path, pwd)
+
+ def _extract_member(self, member, targetpath, pwd):
+ """Extract the ZipInfo object 'member' to a physical
+ file on the path targetpath.
+ """
+ # build the destination pathname, replacing
+ # forward slashes to platform specific separators.
+ if targetpath[-1:] == "/":
+ targetpath = targetpath[:-1]
+
+ # don't include leading "/" from file name if present
+ if os.path.isabs(member.filename):
+ targetpath = os.path.join(targetpath, member.filename[1:])
+ else:
+ targetpath = os.path.join(targetpath, member.filename)
+
+ targetpath = os.path.normpath(targetpath)
+
+ # Create all upper directories if necessary.
+ upperdirs = os.path.dirname(targetpath)
+ if upperdirs and not os.path.exists(upperdirs):
+ os.makedirs(upperdirs)
+
+ source = self.open(member.filename, pwd=pwd)
+ target = file(targetpath, "wb")
+ shutil.copyfileobj(source, target)
+ source.close()
+ target.close()
+
+ return targetpath
+
+ def _writecheck(self, zinfo):
+ """Check for errors before writing a file to the archive."""
+ if zinfo.filename in self.NameToInfo:
+ if self.debug: # Warning for duplicate names
+ print "Duplicate name:", zinfo.filename
+ if self.mode not in ("w", "a"):
+ raise RuntimeError, 'write() requires mode "w" or "a"'
+ if not self.fp:
+ raise RuntimeError, \
+ "Attempt to write ZIP archive that was already closed"
+ if zinfo.compress_type == ZIP_DEFLATED and not zlib:
+ raise RuntimeError, \
+ "Compression requires the (missing) zlib module"
+ if zinfo.compress_type not in (ZIP_STORED, ZIP_DEFLATED):
+ raise RuntimeError, \
+ "That compression method is not supported"
+ if zinfo.file_size > ZIP64_LIMIT:
+ if not self._allowZip64:
+ raise LargeZipFile("Filesize would require ZIP64 extensions")
+ if zinfo.header_offset > ZIP64_LIMIT:
+ if not self._allowZip64:
+ raise LargeZipFile("Zipfile size would require ZIP64 extensions")
+
+ def write(self, filename, arcname=None, compress_type=None):
+ """Put the bytes from filename into the archive under the name
+ arcname."""
+ if not self.fp:
+ raise RuntimeError(
+ "Attempt to write to ZIP archive that was already closed")
+
+ st = os.stat(filename)
+ mtime = time.localtime(st.st_mtime)
+ date_time = mtime[0:6]
+ # Create ZipInfo instance to store file information
+ if arcname is None:
+ arcname = filename
+ arcname = os.path.normpath(os.path.splitdrive(arcname)[1])
+ while arcname[0] in (os.sep, os.altsep):
+ arcname = arcname[1:]
+ zinfo = ZipInfo(arcname, date_time)
+ zinfo.external_attr = (st[0] & 0xFFFF) << 16L # Unix attributes
+ if compress_type is None:
+ zinfo.compress_type = self.compression
+ else:
+ zinfo.compress_type = compress_type
+
+ zinfo.file_size = st.st_size
+ zinfo.flag_bits = 0x00
+ zinfo.header_offset = self.fp.tell() # Start of header bytes
+
+ self._writecheck(zinfo)
+ self._didModify = True
+ fp = open(filename, "rb")
+ # Must overwrite CRC and sizes with correct data later
+ zinfo.CRC = CRC = 0
+ zinfo.compress_size = compress_size = 0
+ zinfo.file_size = file_size = 0
+ self.fp.write(zinfo.FileHeader())
+ if zinfo.compress_type == ZIP_DEFLATED:
+ cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
+ zlib.DEFLATED, -15)
+ else:
+ cmpr = None
+ while 1:
+ buf = fp.read(1024 * 8)
+ if not buf:
+ break
+ file_size = file_size + len(buf)
+ CRC = crc32(buf, CRC) & 0xffffffff
+ if cmpr:
+ buf = cmpr.compress(buf)
+ compress_size = compress_size + len(buf)
+ self.fp.write(buf)
+ fp.close()
+ if cmpr:
+ buf = cmpr.flush()
+ compress_size = compress_size + len(buf)
+ self.fp.write(buf)
+ zinfo.compress_size = compress_size
+ else:
+ zinfo.compress_size = file_size
+ zinfo.CRC = CRC
+ zinfo.file_size = file_size
+ # Seek backwards and write CRC and file sizes
+ position = self.fp.tell() # Preserve current position in file
+ self.fp.seek(zinfo.header_offset + 14, 0)
+ self.fp.write(struct.pack(" ZIP64_LIMIT \
+ or zinfo.compress_size > ZIP64_LIMIT:
+ extra.append(zinfo.file_size)
+ extra.append(zinfo.compress_size)
+ file_size = 0xffffffff #-1
+ compress_size = 0xffffffff #-1
+ else:
+ file_size = zinfo.file_size
+ compress_size = zinfo.compress_size
+
+ if zinfo.header_offset > ZIP64_LIMIT:
+ extra.append(zinfo.header_offset)
+ header_offset = 0xffffffffL # -1 32 bit
+ else:
+ header_offset = zinfo.header_offset
+
+ extra_data = zinfo.extra
+ if extra:
+ # Append a ZIP64 field to the extra's
+                    extra_data = struct.pack(
+                            '<HH' + 'Q'*len(extra),
+                            1, 8*len(extra), *extra) + extra_data
+
+                    extract_version = max(45, zinfo.extract_version)
+                    create_version = max(45, zinfo.create_version)
+                else:
+                    extract_version = zinfo.extract_version
+                    create_version = zinfo.create_version
+
+                try:
+                    centdir = struct.pack(structCentralDir,
+                     stringCentralDir, create_version,
+                     zinfo.create_system, extract_version, zinfo.reserved,
+                     zinfo.flag_bits, zinfo.compress_type, dostime, dosdate,
+                     zinfo.CRC, compress_size, file_size,
+                     len(zinfo.filename), len(extra_data), len(zinfo.comment),
+                     0, zinfo.internal_attr, zinfo.external_attr,
+                     header_offset)
+                except DeprecationWarning:
+                    print >>sys.stderr, (structCentralDir,
+ stringCentralDir, create_version,
+ zinfo.create_system, extract_version, zinfo.reserved,
+ zinfo.flag_bits, zinfo.compress_type, dostime, dosdate,
+ zinfo.CRC, compress_size, file_size,
+ len(zinfo.filename), len(extra_data), len(zinfo.comment),
+ 0, zinfo.internal_attr, zinfo.external_attr,
+ header_offset)
+ raise
+ self.fp.write(centdir)
+ self.fp.write(zinfo.filename)
+ self.fp.write(extra_data)
+ self.fp.write(zinfo.comment)
+
+ pos2 = self.fp.tell()
+ # Write end-of-zip-archive record
+ if pos1 > ZIP64_LIMIT:
+ # Need to write the ZIP64 end-of-archive records
+ zip64endrec = struct.pack(
+ structEndArchive64, stringEndArchive64,
+ 44, 45, 45, 0, 0, count, count, pos2 - pos1, pos1)
+ self.fp.write(zip64endrec)
+
+ zip64locrec = struct.pack(
+ structEndArchive64Locator,
+ stringEndArchive64Locator, 0, pos2, 1)
+ self.fp.write(zip64locrec)
+
+ endrec = struct.pack(structEndArchive, stringEndArchive,
+ 0, 0, count, count, pos2 - pos1, 0xffffffffL, 0)
+ self.fp.write(endrec)
+
+ else:
+ endrec = struct.pack(structEndArchive, stringEndArchive,
+ 0, 0, count, count, pos2 - pos1, pos1, 0)
+ self.fp.write(endrec)
+ self.fp.flush()
+ if not self._filePassed:
+ self.fp.close()
+ self.fp = None
+
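+# A minimal usage sketch for ZipFile (illustration only, not part of the
+# original module; file names are hypothetical):
+#
+#     zf = ZipFile("example.zip", "w", ZIP_DEFLATED)
+#     zf.write("data/sample.txt", "sample.txt")  # store under an arcname
+#     zf.writestr("notes.txt", "hello")          # add a member from a string
+#     zf.close()  # writes the central directory and end-of-archive records
+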
+
+class PyZipFile(ZipFile):
+ """Class to create ZIP archives with Python library files and packages."""
+
+ def writepy(self, pathname, basename = ""):
+ """Add all files from "pathname" to the ZIP archive.
+
+        If pathname is a package directory, search it and all package
+        subdirectories recursively for *.py files and add those modules
+        to the archive. If pathname is a plain directory, add the *.py
+        files it contains at the top level. Otherwise, pathname must be
+        a single Python *.py file, and that module is added. Modules are
+        always stored as module.pyo or module.pyc; module.py is compiled
+        to module.pyc first if necessary.
+ """
+ dir, name = os.path.split(pathname)
+ if os.path.isdir(pathname):
+ initname = os.path.join(pathname, "__init__.py")
+ if os.path.isfile(initname):
+ # This is a package directory, add it
+ if basename:
+ basename = "%s/%s" % (basename, name)
+ else:
+ basename = name
+ if self.debug:
+ print "Adding package in", pathname, "as", basename
+ fname, arcname = self._get_codename(initname[0:-3], basename)
+ if self.debug:
+ print "Adding", arcname
+ self.write(fname, arcname)
+ dirlist = os.listdir(pathname)
+ dirlist.remove("__init__.py")
+ # Add all *.py files and package subdirectories
+ for filename in dirlist:
+ path = os.path.join(pathname, filename)
+ root, ext = os.path.splitext(filename)
+ if os.path.isdir(path):
+ if os.path.isfile(os.path.join(path, "__init__.py")):
+ # This is a package directory, add it
+ self.writepy(path, basename) # Recursive call
+ elif ext == ".py":
+ fname, arcname = self._get_codename(path[0:-3],
+ basename)
+ if self.debug:
+ print "Adding", arcname
+ self.write(fname, arcname)
+ else:
+ # This is NOT a package directory, add its files at top level
+ if self.debug:
+ print "Adding files from directory", pathname
+ for filename in os.listdir(pathname):
+ path = os.path.join(pathname, filename)
+ root, ext = os.path.splitext(filename)
+ if ext == ".py":
+ fname, arcname = self._get_codename(path[0:-3],
+ basename)
+ if self.debug:
+ print "Adding", arcname
+ self.write(fname, arcname)
+ else:
+ if pathname[-3:] != ".py":
+ raise RuntimeError, \
+ 'Files added with writepy() must end with ".py"'
+ fname, arcname = self._get_codename(pathname[0:-3], basename)
+ if self.debug:
+ print "Adding file", arcname
+ self.write(fname, arcname)
+
+ def _get_codename(self, pathname, basename):
+ """Return (filename, archivename) for the path.
+
+ Given a module name path, return the correct file path and
+ archive name, compiling if necessary. For example, given
+ /python/lib/string, return (/python/lib/string.pyc, string).
+ """
+ file_py = pathname + ".py"
+ file_pyc = pathname + ".pyc"
+ file_pyo = pathname + ".pyo"
+ if os.path.isfile(file_pyo) and \
+ os.stat(file_pyo).st_mtime >= os.stat(file_py).st_mtime:
+ fname = file_pyo # Use .pyo file
+ elif not os.path.isfile(file_pyc) or \
+ os.stat(file_pyc).st_mtime < os.stat(file_py).st_mtime:
+ import py_compile
+ if self.debug:
+ print "Compiling", file_py
+ try:
+ py_compile.compile(file_py, file_pyc, None, True)
+            except py_compile.PyCompileError, err:
+ print err.msg
+ fname = file_pyc
+ else:
+ fname = file_pyc
+ archivename = os.path.split(fname)[1]
+ if basename:
+ archivename = "%s/%s" % (basename, archivename)
+ return (fname, archivename)
+
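+# Hedged sketch of how PyZipFile is meant to be used ("mypkg" is a
+# hypothetical package directory containing an __init__.py):
+#
+#     zf = PyZipFile("mypkg.zip", "w")
+#     zf.debug = 1         # print module names as they are added
+#     zf.writepy("mypkg")  # adds mypkg/*.pyc, compiling .py files if needed
+#     zf.close()
+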
+
+def main(args = None):
+ import textwrap
+ USAGE=textwrap.dedent("""\
+ Usage:
+ zipfile.py -l zipfile.zip # Show listing of a zipfile
+ zipfile.py -t zipfile.zip # Test if a zipfile is valid
+ zipfile.py -e zipfile.zip target # Extract zipfile into target dir
+ zipfile.py -c zipfile.zip src ... # Create zipfile from sources
+ """)
+ if args is None:
+ args = sys.argv[1:]
+
+ if not args or args[0] not in ('-l', '-c', '-e', '-t'):
+ print USAGE
+ sys.exit(1)
+
+ if args[0] == '-l':
+ if len(args) != 2:
+ print USAGE
+ sys.exit(1)
+ zf = ZipFile(args[1], 'r')
+ zf.printdir()
+ zf.close()
+
+ elif args[0] == '-t':
+ if len(args) != 2:
+ print USAGE
+ sys.exit(1)
+ zf = ZipFile(args[1], 'r')
+ zf.testzip()
+ print "Done testing"
+
+ elif args[0] == '-e':
+ if len(args) != 3:
+ print USAGE
+ sys.exit(1)
+
+ zf = ZipFile(args[1], 'r')
+ out = args[2]
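+        # Note: member names are joined to the target directory as-is, so
+        # absolute names or names containing ".." can escape it.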
+ for path in zf.namelist():
+ if path.startswith('./'):
+ tgt = os.path.join(out, path[2:])
+ else:
+ tgt = os.path.join(out, path)
+
+ tgtdir = os.path.dirname(tgt)
+ if not os.path.exists(tgtdir):
+ os.makedirs(tgtdir)
+ fp = open(tgt, 'wb')
+ fp.write(zf.read(path))
+ fp.close()
+ zf.close()
+
+ elif args[0] == '-c':
+ if len(args) < 3:
+ print USAGE
+ sys.exit(1)
+
+ def addToZip(zf, path, zippath):
+ if os.path.isfile(path):
+ zf.write(path, zippath, ZIP_DEFLATED)
+ elif os.path.isdir(path):
+ for nm in os.listdir(path):
+ addToZip(zf,
+ os.path.join(path, nm), os.path.join(zippath, nm))
+ # else: ignore
+
+ zf = ZipFile(args[1], 'w', allowZip64=True)
+ for src in args[2:]:
+ addToZip(zf, src, os.path.basename(src))
+
+ zf.close()
+
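+# Example invocations matching the USAGE text above (paths are hypothetical):
+#
+#     python zipfile.py -c backup.zip src README.txt
+#     python zipfile.py -l backup.zip
+#     python zipfile.py -e backup.zip restored
+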
+if __name__ == "__main__":
+ main()
diff --git a/obitools/zipfile.pyc b/obitools/zipfile.pyc
new file mode 100644
index 0000000..35dace0
Binary files /dev/null and b/obitools/zipfile.pyc differ