diff --git a/obitools/SVGdraw.py b/obitools/SVGdraw.py
new file mode 100644
index 0000000..521f750
--- /dev/null
+++ b/obitools/SVGdraw.py
@@ -0,0 +1,1054 @@
+#!/usr/bin/env python
+##Copyright (c) 2002, Fedor Baart & Hans de Wit (Stichting Farmaceutische Kengetallen)
+##All rights reserved.
+##
+##Redistribution and use in source and binary forms, with or without modification,
+##are permitted provided that the following conditions are met:
+##
+##Redistributions of source code must retain the above copyright notice, this
+##list of conditions and the following disclaimer.
+##
+##Redistributions in binary form must reproduce the above copyright notice,
+##this list of conditions and the following disclaimer in the documentation and/or
+##other materials provided with the distribution.
+##
+##Neither the name of the Stichting Farmaceutische Kengetallen nor the names of
+##its contributors may be used to endorse or promote products derived from this
+##software without specific prior written permission.
+##
+##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+##AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+##IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+##DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+##FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+##DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+##SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+##CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+##OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+##OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+##Thanks to Gerald Rosennfellner for his help and useful comments.
+
+__doc__="""Use SVGdraw to generate your SVG drawings.
+
+SVGdraw uses an object model drawing and a toXml method to create SVG graphics
+using easy to use classes and methods. Usually you start by creating a drawing, e.g.
+
+    d=drawing()
+    #then you create a SVG root element
+    s=svg()
+    #then you add some elements, e.g. a circle, and add it to the svg root element
+    c=circle()
+    #you can supply attributes by using named arguments.
+    c=circle(fill='red',stroke='blue')
+    #or by updating the attributes attribute:
+    c.attributes['stroke-width']=1
+    s.addElement(c)
+    #then you add the svg root element to the drawing
+    d.setSVG(s)
+    #and finally you xmlify the drawing
+    d.toXml()
+
+
+This results in the SVG source of the drawing, which consists of a circle
+on a white background. It's as easy as that ;)
+This module was created using the SVG specification of www.w3c.org and the
+O'Reilly (www.oreilly.com) python books as information sources. An SVG viewer
+is available from www.adobe.com"""
+
+__version__="1.0"
+
+# there are two possibilities to generate svg:
+# via a dom implementation and directly using text strings
+# the latter is way faster (and shorter in coding)
+# the former is only used in debugging svg programs
+# maybe it will be removed altogether after a while
+# with the following variable you indicate whether to use the dom implementation
+# Note that PyXML is required for using the dom implementation.
+# It is also possible to use the standard minidom. But I didn't try that one.
+# Anyway the text based approach is about 60 times faster than using the full dom implementation.
+use_dom_implementation=0
+
+
+import exceptions
+if use_dom_implementation<>0:
+    try:
+        from xml.dom import implementation
+        from xml.dom.ext import PrettyPrint
+    except:
+        raise exceptions.ImportError, "PyXML is required for using the dom implementation"
+#The implementation is used for creating the XML document.
+#The prettyprint module is used for converting the xml document object to an xml file
+
+import sys
+assert sys.version_info[0]>=2
+if sys.version_info[1]<2:
+    True=1
+    False=0
+    file=open
+
+sys.setrecursionlimit(50)
+#The recursion limit is set conservative so mistakes like s=svg() s.addElement(s)
+#won't eat up too much processor time.
+
+#the following code is pasted from xml.sax.saxutils
+#it makes it possible to run the code without the xml sax package installed
+#To make it possible to have <, > and & in your text elements, it is necessary to escape the texts
+def _escape(data, entities={}):
+    """Escape &, <, and > in a string of data.
+
+    You can escape other strings of data by passing a dictionary as
+    the optional entities parameter.  The keys and values must all be
+    strings; each key will be replaced with its corresponding value.
+    """
+    data = data.replace("&", "&amp;")
+    data = data.replace("<", "&lt;")
+    data = data.replace(">", "&gt;")
+    for chars, entity in entities.items():
+        data = data.replace(chars, entity)
+    return data
+
+def _quoteattr(data, entities={}):
+    """Escape and quote an attribute value.
+
+    Escape &, <, and > in a string of data, then quote it for use as
+    an attribute value.  The \" character will be escaped as well, if
+    necessary.
+
+    You can escape other strings of data by passing a dictionary as
+    the optional entities parameter.  The keys and values must all be
+    strings; each key will be replaced with its corresponding value.
+    """
+    data = _escape(data, entities)
+    if '"' in data:
+        if "'" in data:
+            data = '"%s"' % data.replace('"', "&quot;")
+        else:
+            data = "'%s'" % data
+    else:
+        data = '"%s"' % data
+    return data
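+
+#A quick sanity sketch of what these helpers return (the values shown are
+#assumptions derived from the replacement rules above, not part of the original module):
+#  _escape('a<b&c')       -> 'a&lt;b&amp;c'
+#  _quoteattr('say "hi"') -> "'say \"hi\"'"   (single quotes chosen because the value contains ")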
+
+
+def _xypointlist(a):
+    """formats a list of xy pairs"""
+    s=''
+    for e in a: #this could be done more elegantly
+        s+=str(e)[1:-1] +' '
+    return s
+
+def _viewboxlist(a):
+    """formats a tuple"""
+    s=''
+    for e in a:
+        s+=str(e)+' '
+    return s
+
+def _pointlist(a):
+    """formats a list of numbers"""
+    return str(a)[1:-1]
+
+class pathdata:
+    """class used to create a pathdata object which can be used in a path.
+    although most methods are pretty straightforward, it might be useful to look at the SVG specification."""
+    #I didn't test the methods below.
+    def __init__(self,x=None,y=None):
+        self.path=[]
+        if x is not None and y is not None:
+            self.path.append('M '+str(x)+' '+str(y))
+    def closepath(self):
+        """ends the path"""
+        self.path.append('z')
+    def move(self,x,y):
+        """move to absolute"""
+        self.path.append('M '+str(x)+' '+str(y))
+    def relmove(self,x,y):
+        """move to relative"""
+        self.path.append('m '+str(x)+' '+str(y))
+    def line(self,x,y):
+        """line to absolute"""
+        self.path.append('L '+str(x)+' '+str(y))
+    def relline(self,x,y):
+        """line to relative"""
+        self.path.append('l '+str(x)+' '+str(y))
+    def hline(self,x):
+        """horizontal line to absolute"""
+        self.path.append('H'+str(x))
+    def relhline(self,x):
+        """horizontal line to relative"""
+        self.path.append('h'+str(x))
+    def vline(self,y):
+        """vertical line to absolute"""
+        self.path.append('V'+str(y))
+    def relvline(self,y):
+        """vertical line to relative"""
+        self.path.append('v'+str(y))
+    def bezier(self,x1,y1,x2,y2,x,y):
+        """bezier with xy1 and xy2 to xy absolute"""
+        self.path.append('C'+str(x1)+','+str(y1)+' '+str(x2)+','+str(y2)+' '+str(x)+','+str(y))
+    def relbezier(self,x1,y1,x2,y2,x,y):
+        """bezier with xy1 and xy2 to xy relative"""
+        self.path.append('c'+str(x1)+','+str(y1)+' '+str(x2)+','+str(y2)+' '+str(x)+','+str(y))
+    def smbezier(self,x2,y2,x,y):
+        """smooth bezier with xy2 to xy absolute"""
+        self.path.append('S'+str(x2)+','+str(y2)+' '+str(x)+','+str(y))
+    def relsmbezier(self,x2,y2,x,y):
+        """smooth bezier with xy2 to xy relative"""
+        self.path.append('s'+str(x2)+','+str(y2)+' '+str(x)+','+str(y))
+    def qbezier(self,x1,y1,x,y):
+        """quadratic bezier with xy1 to xy absolute"""
+        self.path.append('Q'+str(x1)+','+str(y1)+' '+str(x)+','+str(y))
+    def relqbezier(self,x1,y1,x,y):
+        """quadratic bezier with xy1 to xy relative"""
+        self.path.append('q'+str(x1)+','+str(y1)+' '+str(x)+','+str(y))
+    def smqbezier(self,x,y):
+        """smooth quadratic bezier to xy absolute"""
+        self.path.append('T'+str(x)+','+str(y))
+    def relsmqbezier(self,x,y):
+        """smooth quadratic bezier to xy relative"""
+        self.path.append('t'+str(x)+','+str(y))
+    def ellarc(self,rx,ry,xrot,laf,sf,x,y):
+        """elliptical arc with rx and ry rotating with xrot using large-arc-flag and sweep-flag to xy absolute"""
+        self.path.append('A'+str(rx)+','+str(ry)+' '+str(xrot)+' '+str(laf)+' '+str(sf)+' '+str(x)+' '+str(y))
+    def relellarc(self,rx,ry,xrot,laf,sf,x,y):
+        """elliptical arc with rx and ry rotating with xrot using large-arc-flag and sweep-flag to xy relative"""
+        self.path.append('a'+str(rx)+','+str(ry)+' '+str(xrot)+' '+str(laf)+' '+str(sf)+' '+str(x)+' '+str(y))
+    def __repr__(self):
+        return ' '.join(self.path)
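+
+#A minimal usage sketch for pathdata (the spacing of the output string simply
+#follows the string building in the methods above; the path class is defined further below):
+#  pd=pathdata(0,0)          #starts with a moveto
+#  pd.line(10,0)
+#  pd.bezier(12,0,14,2,14,4)
+#  pd.closepath()
+#  str(pd)                   -> 'M 0 0 L 10 0 C12,0 14,2 14,4 z'
+#  p=path(pd,fill='none',stroke='black')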
+
+
+class SVGelement:
+    """SVGelement(type,attributes,elements,text,namespace,**args)
+    Creates an arbitrary svg element and is intended to be subclassed, not used on its own.
+    This element is the base of every svg element; it defines a class which resembles
+    an xml-element. The main advantage of this kind of implementation is that you don't
+    have to create a toXml method for every different graph object. Every element
+    consists of a type, attributes, optional subelements, optional text and an optional
+    namespace. Note the elements==None, if elements == None: self.elements=[] construction.
+    This is done because if you default to elements=[] every object has a reference
+    to the same empty list."""
+    def __init__(self,type='',attributes=None,elements=None,text='',namespace='',cdata=None,**args):
+        self.type=type
+        if attributes==None:
+            self.attributes={}
+        else:
+            self.attributes=attributes
+        if elements==None:
+            self.elements=[]
+        else:
+            self.elements=elements
+        self.text=text
+        self.namespace=namespace
+        self.cdata=cdata
+        for arg in args.keys():
+            self.attributes[arg]=args[arg]
+    def addElement(self,SVGelement):
+        """adds an element to a SVGelement
+
+        SVGelement.addElement(SVGelement)
+        """
+        self.elements.append(SVGelement)
+
+    #def toXml(self,level,f, preserveWhitespace=False):
+    def toXml(self,level,f, **kwargs):
+        preserve = kwargs.get("preserveWhitespace", False)
+        if preserve:
+            NEWLINE = ""
+            TAB = ""
+        else:
+            NEWLINE = "\n"
+            TAB = "\t"
+        f.write(TAB*level)
+        f.write('<'+self.type)
+        for attkey in self.attributes.keys():
+            f.write(' '+_escape(str(attkey))+'='+_quoteattr(str(self.attributes[attkey])))
+        if self.namespace:
+            f.write(' xmlns="'+ _escape(str(self.namespace))+'" ')
+        if self.elements or self.text or self.cdata:
+            f.write('>')
+        if self.elements:
+            f.write(NEWLINE)
+        for element in self.elements:
+            element.toXml(level+1,f, preserveWhitespace=preserve)
+        if self.cdata:
+            f.write(NEWLINE+TAB*(level+1)+'<![CDATA['+str(self.cdata)+']]>'+NEWLINE)
+        if self.text:
+            if type(self.text)==type(''): #If the text is only text
+                f.write(_escape(str(self.text)))
+            else:                         #If the text is a spannedtext class
+                f.write(str(self.text))
+        if self.elements:
+            f.write(TAB*level+'</'+self.type+'>'+NEWLINE)
+        elif self.text:
+            f.write('</'+self.type+'>'+NEWLINE)
+        elif self.cdata:
+            f.write(TAB*level+'</'+self.type+'>'+NEWLINE)
+        else:
+            f.write('/>'+NEWLINE)
+
+class tspan(SVGelement):
+    """ts=tspan(text='',**args)
+
+    a tspan element can be used for applying formatting to a text section
+    usage:
+    ts=tspan('this text is bold')
+    ts.attributes['font-weight']='bold'
+    st=spannedtext()
+    st.addtspan(ts)
+    t=text(3,5,st)
+    """
+    def __init__(self,text=None,**args):
+        SVGelement.__init__(self,'tspan',**args)
+        if text<>None:
+            self.text=text
+    def __repr__(self):
+        s="<tspan"
+        for key,value in self.attributes.items():
+            s+= ' %s="%s"' % (key,value)
+        s+='>'
+        s+=self.text
+        s+='</tspan>'
+        return s
+
+class tref(SVGelement):
+    """tr=tref(link,**args)
+
+    a tref element can be used for referencing text by a link to its id.
+    usage:
+    tr=tref('#linktotext')
+    st=spannedtext()
+    st.addtref(tr)
+    t=text(3,5,st)
+    """
+    def __init__(self,link,**args):
+        SVGelement.__init__(self,'tref',{'xlink:href':link},**args)
+    def __repr__(self):
+        s="<tref"
+        for key,value in self.attributes.items():
+            s+= ' %s="%s"' % (key,value)
+        s+='/>'
+        return s
+
+class spannedtext:
+    """st=spannedtext(textlist=[])
+
+    a spannedtext can be used for text which consists of text, tspans and trefs.
+    You can use it to add to a text element or textpath element. Don't add it to a
+    group or an svg element.
+    usage:
+
+    ts=tspan('this text is bold')
+    ts.attributes['font-weight']='bold'
+    tr=tref('#linktotext')
+    tr.attributes['fill']='red'
+    st=spannedtext()
+    st.addtspan(ts)
+    st.addtref(tr)
+    st.addtext('This text is not bold')
+    t=text(3,5,st)
+    """
+    def __init__(self,textlist=None):
+        if textlist==None:
+            self.textlist=[]
+        else:
+            self.textlist=textlist
+    def addtext(self,text=''):
+        self.textlist.append(text)
+    def addtspan(self,tspan):
+        self.textlist.append(tspan)
+    def addtref(self,tref):
+        self.textlist.append(tref)
+    def __repr__(self):
+        s=""
+        for element in self.textlist:
+            s+=str(element)
+        return s
+
+class rect(SVGelement):
+    """r=rect(x,y,width,height,fill,stroke,stroke_width,**args)
+
+    a rectangle is defined by a width and height and an xy pair
+    """
+    def __init__(self,x=None,y=None,width=None,height=None,fill=None,stroke=None,stroke_width=None,**args):
+        if width==None or height==None:
+            if width<>None:
+                raise ValueError, 'height is required'
+            if height<>None:
+                raise ValueError, 'width is required'
+            else:
+                raise ValueError, 'both height and width are required'
+        SVGelement.__init__(self,'rect',{'width':width,'height':height},**args)
+        if x<>None:
+            self.attributes['x']=x
+        if y<>None:
+            self.attributes['y']=y
+        if fill<>None:
+            self.attributes['fill']=fill
+        if stroke<>None:
+            self.attributes['stroke']=stroke
+        if stroke_width<>None:
+            self.attributes['stroke-width']=stroke_width
+ """ + def __init__(self,cx=None,cy=None,rx=None,ry=None,fill=None,stroke=None,stroke_width=None,**args): + if rx==None or ry== None: + if rx<>None: + raise ValueError, 'rx is required' + if ry<>None: + raise ValueError, 'ry is required' + else: + raise ValueError, 'both rx and ry are required' + SVGelement.__init__(self,'ellipse',{'rx':rx,'ry':ry},**args) + if cx<>None: + self.attributes['cx']=cx + if cy<>None: + self.attributes['cy']=cy + if fill<>None: + self.attributes['fill']=fill + if stroke<>None: + self.attributes['stroke']=stroke + if stroke_width<>None: + self.attributes['stroke-width']=stroke_width + + +class circle(SVGelement): + """c=circle(x,y,radius,fill,stroke,stroke_width,**args) + + The circle creates an element using a x, y and radius values eg + """ + def __init__(self,cx=None,cy=None,r=None,fill=None,stroke=None,stroke_width=None,**args): + if r==None: + raise ValueError, 'r is required' + SVGelement.__init__(self,'circle',{'r':r},**args) + if cx<>None: + self.attributes['cx']=cx + if cy<>None: + self.attributes['cy']=cy + if fill<>None: + self.attributes['fill']=fill + if stroke<>None: + self.attributes['stroke']=stroke + if stroke_width<>None: + self.attributes['stroke-width']=stroke_width + +class point(circle): + """p=point(x,y,color) + + A point is defined as a circle with a size 1 radius. It may be more efficient to use a + very small rectangle if you use many points because a circle is difficult to render. + """ + def __init__(self,x,y,fill='black',**args): + circle.__init__(self,x,y,1,fill,**args) + +class line(SVGelement): + """l=line(x1,y1,x2,y2,stroke,stroke_width,**args) + + A line is defined by a begin x,y pair and an end x,y pair + """ + def __init__(self,x1=None,y1=None,x2=None,y2=None,stroke=None,stroke_width=None,**args): + SVGelement.__init__(self,'line',**args) + if x1<>None: + self.attributes['x1']=x1 + if y1<>None: + self.attributes['y1']=y1 + if x2<>None: + self.attributes['x2']=x2 + if y2<>None: + self.attributes['y2']=y2 + if stroke_width<>None: + self.attributes['stroke-width']=stroke_width + if stroke<>None: + self.attributes['stroke']=stroke + +class polyline(SVGelement): + """pl=polyline([[x1,y1],[x2,y2],...],fill,stroke,stroke_width,**args) + + a polyline is defined by a list of xy pairs + """ + def __init__(self,points,fill=None,stroke=None,stroke_width=None,**args): + SVGelement.__init__(self,'polyline',{'points':_xypointlist(points)},**args) + if fill<>None: + self.attributes['fill']=fill + if stroke_width<>None: + self.attributes['stroke-width']=stroke_width + if stroke<>None: + self.attributes['stroke']=stroke + +class polygon(SVGelement): + """pl=polyline([[x1,y1],[x2,y2],...],fill,stroke,stroke_width,**args) + + a polygon is defined by a list of xy pairs + """ + def __init__(self,points,fill=None,stroke=None,stroke_width=None,**args): + SVGelement.__init__(self,'polygon',{'points':_xypointlist(points)},**args) + if fill<>None: + self.attributes['fill']=fill + if stroke_width<>None: + self.attributes['stroke-width']=stroke_width + if stroke<>None: + self.attributes['stroke']=stroke + +class path(SVGelement): + """p=path(path,fill,stroke,stroke_width,**args) + + a path is defined by a path object and optional width, stroke and fillcolor + """ + def __init__(self,pathdata,fill=None,stroke=None,stroke_width=None,id=None,**args): + SVGelement.__init__(self,'path',{'d':str(pathdata)},**args) + if stroke<>None: + self.attributes['stroke']=stroke + if fill<>None: + self.attributes['fill']=fill + if stroke_width<>None: + 
+
+
+class text(SVGelement):
+    """t=text(x,y,text,font_size,font_family,**args)
+
+    a text element can be used for displaying text on the screen
+    """
+    def __init__(self,x=None,y=None,text=None,font_size=None,font_family=None,text_anchor=None,**args):
+        SVGelement.__init__(self,'text',**args)
+        if x<>None:
+            self.attributes['x']=x
+        if y<>None:
+            self.attributes['y']=y
+        if font_size<>None:
+            self.attributes['font-size']=font_size
+        if font_family<>None:
+            self.attributes['font-family']=font_family
+        if text<>None:
+            self.text=text
+        if text_anchor<>None:
+            self.attributes['text-anchor']=text_anchor
+
+    def toXml(self,level,f, **kwargs):
+        preserve = self.attributes.get("xml:space", None)
+        if preserve == "preserve":
+            SVGelement.toXml(self,level, f, preserveWhitespace=True)
+        else:
+            SVGelement.toXml(self, level, f, preserveWhitespace=False)
+
+class textpath(SVGelement):
+    """tp=textpath(link,text,**args)
+
+    a textpath places a text on a path which is referenced by a link.
+    """
+    def __init__(self,link,text=None,**args):
+        SVGelement.__init__(self,'textPath',{'xlink:href':link},**args)
+        if text<>None:
+            self.text=text
+
+class pattern(SVGelement):
+    """p=pattern(x,y,width,height,patternUnits,**args)
+
+    A pattern is used to fill or stroke an object using a pre-defined
+    graphic object which can be replicated ("tiled") at fixed intervals
+    in x and y to cover the areas to be painted.
+    """
+    def __init__(self,x=None,y=None,width=None,height=None,patternUnits=None,**args):
+        SVGelement.__init__(self,'pattern',**args)
+        if x<>None:
+            self.attributes['x']=x
+        if y<>None:
+            self.attributes['y']=y
+        if width<>None:
+            self.attributes['width']=width
+        if height<>None:
+            self.attributes['height']=height
+        if patternUnits<>None:
+            self.attributes['patternUnits']=patternUnits
+
+class title(SVGelement):
+    """t=title(text,**args)
+
+    a title is a text element. The text is displayed in the title bar;
+    add at least one to the root svg element
+    """
+    def __init__(self,text=None,**args):
+        SVGelement.__init__(self,'title',**args)
+        if text<>None:
+            self.text=text
+
+class description(SVGelement):
+    """d=description(text,**args)
+
+    a description can be added to any element and is used for a tooltip.
+    Add this element before adding other elements.
+    """
+    def __init__(self,text=None,**args):
+        SVGelement.__init__(self,'desc',**args)
+        if text<>None:
+            self.text=text
+
+class lineargradient(SVGelement):
+    """lg=lineargradient(x1,y1,x2,y2,id,**args)
+
+    defines a linear gradient using two xy pairs.
+    stop elements can be added to define the gradient colors.
+    """
+    def __init__(self,x1=None,y1=None,x2=None,y2=None,id=None,**args):
+        SVGelement.__init__(self,'linearGradient',**args)
+        if x1<>None:
+            self.attributes['x1']=x1
+        if y1<>None:
+            self.attributes['y1']=y1
+        if x2<>None:
+            self.attributes['x2']=x2
+        if y2<>None:
+            self.attributes['y2']=y2
+        if id<>None:
+            self.attributes['id']=id
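+
+#A small sketch of how a gradient is meant to be used (the stop class is
+#defined below; 'grad1' is a made-up id):
+#  lg=lineargradient(0,0,100,0,id='grad1')
+#  lg.addElement(stop(0,stop_color='yellow'))
+#  lg.addElement(stop(1,stop_color='red'))
+#  r=rect(0,0,100,50,fill='url(#grad1)')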
+ """ + def __init__(self,cx=None,cy=None,r=None,fx=None,fy=None,id=None,**args): + SVGelement.__init__(self,'radialGradient',**args) + if cx<>None: + self.attributes['cx']=cx + if cy<>None: + self.attributes['cy']=cy + if r<>None: + self.attributes['r']=r + if fx<>None: + self.attributes['fx']=fx + if fy<>None: + self.attributes['fy']=fy + if id<>None: + self.attributes['id']=id + +class stop(SVGelement): + """st=stop(offset,stop_color,**args) + + Puts a stop color at the specified radius + """ + def __init__(self,offset,stop_color=None,**args): + SVGelement.__init__(self,'stop',{'offset':offset},**args) + if stop_color<>None: + self.attributes['stop-color']=stop_color + +class style(SVGelement): + """st=style(type,cdata=None,**args) + + Add a CDATA element to this element for defing in line stylesheets etc.. + """ + def __init__(self,type,cdata=None,**args): + SVGelement.__init__(self,'style',{'type':type},cdata=cdata, **args) + + +class image(SVGelement): + """im=image(url,width,height,x,y,**args) + + adds an image to the drawing. Supported formats are .png, .jpg and .svg. + """ + def __init__(self,url,x=None,y=None,width=None,height=None,**args): + if width==None or height==None: + if width<>None: + raise ValueError, 'height is required' + if height<>None: + raise ValueError, 'width is required' + else: + raise ValueError, 'both height and width are required' + SVGelement.__init__(self,'image',{'xlink:href':url,'width':width,'height':height},**args) + if x<>None: + self.attributes['x']=x + if y<>None: + self.attributes['y']=y + +class cursor(SVGelement): + """c=cursor(url,**args) + + defines a custom cursor for a element or a drawing + """ + def __init__(self,url,**args): + SVGelement.__init__(self,'cursor',{'xlink:href':url},**args) + + +class marker(SVGelement): + """m=marker(id,viewbox,refX,refY,markerWidth,markerHeight,**args) + + defines a marker which can be used as an endpoint for a line or other pathtypes + add an element to it which should be used as a marker. + """ + def __init__(self,id=None,viewBox=None,refx=None,refy=None,markerWidth=None,markerHeight=None,**args): + SVGelement.__init__(self,'marker',**args) + if id<>None: + self.attributes['id']=id + if viewBox<>None: + self.attributes['viewBox']=_viewboxlist(viewBox) + if refx<>None: + self.attributes['refX']=refx + if refy<>None: + self.attributes['refY']=refy + if markerWidth<>None: + self.attributes['markerWidth']=markerWidth + if markerHeight<>None: + self.attributes['markerHeight']=markerHeight + +class group(SVGelement): + """g=group(id,**args) + + a group is defined by an id and is used to contain elements + g.addElement(SVGelement) + """ + def __init__(self,id=None,**args): + SVGelement.__init__(self,'g',**args) + if id<>None: + self.attributes['id']=id + +class symbol(SVGelement): + """sy=symbol(id,viewbox,**args) + + defines a symbol which can be used on different places in your graph using + the use element. A symbol is not rendered but you can use 'use' elements to + display it by referencing its id. 
+
+class symbol(SVGelement):
+    """sy=symbol(id,viewbox,**args)
+
+    defines a symbol which can be used at different places in your graph using
+    the use element. A symbol is not rendered, but you can use 'use' elements to
+    display it by referencing its id.
+    sy.addElement(SVGelement)
+    """
+
+    def __init__(self,id=None,viewBox=None,**args):
+        SVGelement.__init__(self,'symbol',**args)
+        if id<>None:
+            self.attributes['id']=id
+        if viewBox<>None:
+            self.attributes['viewBox']=_viewboxlist(viewBox)
+
+class defs(SVGelement):
+    """d=defs(**args)
+
+    container for defining elements
+    """
+    def __init__(self,**args):
+        SVGelement.__init__(self,'defs',**args)
+
+class switch(SVGelement):
+    """sw=switch(**args)
+
+    Elements added to a switch element are "switched" by the attributes
+    requiredFeatures, requiredExtensions and systemLanguage.
+    Refer to the SVG specification for details.
+    """
+    def __init__(self,**args):
+        SVGelement.__init__(self,'switch',**args)
+
+
+class use(SVGelement):
+    """u=use(link,x,y,width,height,**args)
+
+    references a symbol by linking to its id; x, y, width and height set its position and size
+    """
+    def __init__(self,link,x=None,y=None,width=None,height=None,**args):
+        SVGelement.__init__(self,'use',{'xlink:href':link},**args)
+        if x<>None:
+            self.attributes['x']=x
+        if y<>None:
+            self.attributes['y']=y
+
+        if width<>None:
+            self.attributes['width']=width
+        if height<>None:
+            self.attributes['height']=height
+
+
+class link(SVGelement):
+    """a=link(url,**args)
+
+    a link is defined by a url. Add the elements which have to be linked:
+    a.addElement(SVGelement)
+    """
+    def __init__(self,link='',**args):
+        SVGelement.__init__(self,'a',{'xlink:href':link},**args)
+
+class view(SVGelement):
+    """v=view(id,**args)
+
+    a view can be used to create a view with different attributes"""
+    def __init__(self,id=None,**args):
+        SVGelement.__init__(self,'view',**args)
+        if id<>None:
+            self.attributes['id']=id
+
+class script(SVGelement):
+    """sc=script(type,cdata,**args)
+
+    adds a script element which contains CDATA to the SVG drawing
+
+    """
+    def __init__(self,type,cdata=None,**args):
+        SVGelement.__init__(self,'script',{'type':type},cdata=cdata,**args)
+
+class animate(SVGelement):
+    """an=animate(attribute,fr,to,dur,**args)
+
+    animates an attribute.
+    """
+    def __init__(self,attribute,fr=None,to=None,dur=None,**args):
+        SVGelement.__init__(self,'animate',{'attributeName':attribute},**args)
+        if fr<>None:
+            self.attributes['from']=fr
+        if to<>None:
+            self.attributes['to']=to
+        if dur<>None:
+            self.attributes['dur']=dur
+
+class animateMotion(SVGelement):
+    """an=animateMotion(pathdata,dur,**args)
+
+    animates a SVGelement over the given path in dur seconds
+    """
+    def __init__(self,pathdata,dur,**args):
+        SVGelement.__init__(self,'animateMotion',**args)
+        if pathdata<>None:
+            self.attributes['path']=str(pathdata)
+        if dur<>None:
+            self.attributes['dur']=dur
+ """ + def __init__(self,type=None,fr=None,to=None,dur=None,**args): + SVGelement.__init__(self,'animateTransform',{'attributeName':'transform'},**args) + #As far as I know the attributeName is always transform + if type<>None: + self.attributes['type']=type + if fr<>None: + self.attributes['from']=fr + if to<>None: + self.attributes['to']=to + if dur<>None: + self.attributes['dur']=dur +class animateColor(SVGelement): + """ac=animateColor(attribute,type,from,to,dur,**args) + + Animates the color of a element + """ + def __init__(self,attribute,type=None,fr=None,to=None,dur=None,**args): + SVGelement.__init__(self,'animateColor',{'attributeName':attribute},**args) + if type<>None: + self.attributes['type']=type + if fr<>None: + self.attributes['from']=fr + if to<>None: + self.attributes['to']=to + if dur<>None: + self.attributes['dur']=dur +class set(SVGelement): + """st=set(attribute,to,during,**args) + + sets an attribute to a value for a + """ + def __init__(self,attribute,to=None,dur=None,**args): + SVGelement.__init__(self,'set',{'attributeName':attribute},**args) + if to<>None: + self.attributes['to']=to + if dur<>None: + self.attributes['dur']=dur + + + +class svg(SVGelement): + """s=svg(viewbox,width,height,**args) + + a svg or element is the root of a drawing add all elements to a svg element. + You can have different svg elements in one svg file + s.addElement(SVGelement) + + eg + d=drawing() + s=svg((0,0,100,100),'100%','100%') + c=circle(50,50,20) + s.addElement(c) + d.setSVG(s) + d.toXml() + """ + def __init__(self,viewBox=None, width=None, height=None,**args): + SVGelement.__init__(self,'svg',**args) + if viewBox<>None: + self.attributes['viewBox']=_viewboxlist(viewBox) + if width<>None: + self.attributes['width']=width + if height<>None: + self.attributes['height']=height + self.namespace="http://www.w3.org/2000/svg" + +class drawing: + """d=drawing() + + this is the actual SVG document. It needs a svg element as a root. + Use the addSVG method to set the svg to the root. Use the toXml method to write the SVG + source to the screen or to a file + d=drawing() + d.addSVG(svg) + d.toXml(optionalfilename) + """ + + def __init__(self): + self.svg=None + def setSVG(self,svg): + self.svg=svg + #Voeg een element toe aan de grafiek toe. + if use_dom_implementation==0: + def toXml(self, filename='',compress=False): + import cStringIO + xml=cStringIO.StringIO() + xml.write('\n') + xml.write("""]>\n""") + self.svg.toXml(0,xml) + if not filename: + if compress: + import gzip + f=cStringIO.StringIO() + zf=gzip.GzipFile(fileobj=f,mode='wb') + zf.write(xml.getvalue()) + zf.close() + f.seek(0) + return f.read() + else: + return xml.getvalue() + else: + if filename[-4:]=='svgz': + import gzip + f=gzip.GzipFile(filename=filename,mode="wb", compresslevel=9) + f.write(xml.getvalue()) + f.close() + else: + f=file(filename,'w') + f.write(xml.getvalue()) + f.close() + + else: + def toXml(self,filename='',compress=False): + """drawing.toXml() ---->to the screen + drawing.toXml(filename)---->to the file + writes a svg drawing to the screen or to a file + compresses if filename ends with svgz or if compress is true + """ + doctype = implementation.createDocumentType('svg',"-//W3C//DTD SVG 1.0//EN""",'http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd ') + + global root + #root is defined global so it can be used by the appender. Its also possible to use it as an arugument but + #that is a bit messy. + root=implementation.createDocument(None,None,doctype) + #Create the xml document. 
+            global appender
+            def appender(element,elementroot):
+                """This recursive function appends elements to an element and sets the attributes
+                and type. It stops when all elements have been appended"""
+                if element.namespace:
+                    e=root.createElementNS(element.namespace,element.type)
+                else:
+                    e=root.createElement(element.type)
+                if element.text:
+                    textnode=root.createTextNode(element.text)
+                    e.appendChild(textnode)
+                for attribute in element.attributes.keys(): #in element.attributes is supported from python 2.2
+                    e.setAttribute(attribute,str(element.attributes[attribute]))
+                if element.elements:
+                    for el in element.elements:
+                        e=appender(el,e)
+                elementroot.appendChild(e)
+                return elementroot
+            root=appender(self.svg,root)
+            if not filename:
+                import cStringIO
+                xml=cStringIO.StringIO()
+                PrettyPrint(root,xml)
+                if compress:
+                    import gzip
+                    f=cStringIO.StringIO()
+                    zf=gzip.GzipFile(fileobj=f,mode='wb')
+                    zf.write(xml.getvalue())
+                    zf.close()
+                    f.seek(0)
+                    return f.read()
+                else:
+                    return xml.getvalue()
+            else:
+                try:
+                    if filename[-4:]=='svgz':
+                        import gzip
+                        import cStringIO
+                        xml=cStringIO.StringIO()
+                        PrettyPrint(root,xml)
+                        f=gzip.GzipFile(filename=filename,mode='wb',compresslevel=9)
+                        f.write(xml.getvalue())
+                        f.close()
+                    else:
+                        f=open(filename,'w')
+                        PrettyPrint(root,f)
+                        f.close()
+                except:
+                    print "Cannot write SVG file: " + filename
+    def validate(self):
+        try:
+            import xml.parsers.xmlproc.xmlval
+        except:
+            raise exceptions.ImportError,'PyXml is required for validating SVG'
+        svg=self.toXml()
+        xv=xml.parsers.xmlproc.xmlval.XMLValidator()
+        try:
+            xv.feed(svg)
+        except:
+            raise Exception("SVG is not well formed, see messages above")
+        else:
+            print "SVG well formed"
+if __name__=='__main__':
+
+
+    d=drawing()
+    s=svg((0,0,100,100))
+    r=rect(-100,-100,300,300,'cyan')
+    s.addElement(r)
+
+    t=title('SVGdraw Demo')
+    s.addElement(t)
+    g=group('animations')
+    e=ellipse(0,0,5,2)
+    g.addElement(e)
+    c=circle(0,0,1,'red')
+    g.addElement(c)
+    pd=pathdata(0,-10)
+    for i in range(6):
+        pd.relsmbezier(10,5,0,10)
+        pd.relsmbezier(-10,5,0,10)
+    an=animateMotion(pd,10)
+    an.attributes['rotate']='auto-reverse'
+    an.attributes['repeatCount']="indefinite"
+    g.addElement(an)
+    s.addElement(g)
+    for i in range(20,120,20):
+        u=use('#animations',i,0)
+        s.addElement(u)
+    for i in range(0,120,20):
+        for j in range(5,105,10):
+            c=circle(i,j,1,'red','black',.5)
+            s.addElement(c)
+    d.setSVG(s)
+
+    print d.toXml()
+
diff --git a/obitools/__init__.py b/obitools/__init__.py
new file mode 100644
index 0000000..3063d78
--- /dev/null
+++ b/obitools/__init__.py
@@ -0,0 +1,711 @@
+'''
+**obitools** main module
+------------------------
+
+.. codeauthor:: Eric Coissac
+
+
+
+The obitools module provides base classes for sequence manipulation.
+
+All biological sequences must be subclasses of :py:class:`obitools.BioSequence`.
+Some biological sequences are defined as transformations of other
+biological sequences. For example, reverse complemented sequences
+are a transformation of a :py:class:`obitools.NucSequence`. This particular
+type of sequence is a subclass of :py:class:`obitools.WrappedBioSequence`.
+
+.. inheritance-diagram:: BioSequence NucSequence AASequence WrappedBioSequence SubSequence DNAComplementSequence
+    :parts: 1
+
+
+'''
+
+from weakref import ref
+
+from obitools.utils.iterator import uniqueChain
+from itertools import chain
+import re
+
+_default_raw_parser = " %s *= *([^;]*);"
+
+try:
+    from functools import partial
+except:
+    #
+    # Added for compatibility purposes with Python < 2.5
+    #
+    def partial(func, *args, **keywords):
+        def newfunc(*fargs, **fkeywords):
+            newkeywords = keywords.copy()
+            newkeywords.update(fkeywords)
+            return func(*(args + fargs), **newkeywords)
+        newfunc.func = func
+        newfunc.args = args
+        newfunc.keywords = keywords
+        return newfunc
+
+
+from obitools.sequenceencoder import DNAComplementEncoder
+from obitools.location import Location
+
+class WrapperSetIterator(object):
+    def __init__(self,s):
+        self._i = set.__iter__(s)
+    def next(self):
+        return self._i.next()()
+    def __iter__(self):
+        return self
+
+class WrapperSet(set):
+    def __iter__(self):
+        return WrapperSetIterator(self)
+
+
+class BioSequence(object):
+    '''
+    BioSequence class is the base class for biological
+    sequence representation.
+
+    It provides storage of:
+
+        - the sequence itself,
+        - an identifier,
+        - a definition,
+        - and a set of complementary information on a key / value principle.
+
+    .. warning::
+
+        :py:class:`obitools.BioSequence` is an abstract class, this constructor
+        can only be called by a subclass constructor.
+    '''
+
+    def __init__(self,id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
+        '''
+
+        :param id: sequence identifier
+        :type id: `str`
+
+        :param seq: the sequence
+        :type seq: `str`
+
+        :param definition: sequence definition (optional)
+        :type definition: `str`
+
+        :param rawinfo: a text containing a set of key=value; patterns
+        :type rawinfo: `str`
+
+        :param rawparser: a text describing a regular pattern template
+                          used to parse rawinfo
+        :type rawparser: `str`
+
+        :param info: extra named parameters can be added to associate complementary
+                     data to the sequence
+
+        '''
+
+        assert type(self)!=BioSequence,"obitools.BioSequence is an abstract class"
+
+        self._seq=str(seq).lower()
+        self._info = dict(info)
+        if rawinfo is not None:
+            self._rawinfo=' ' + rawinfo
+        else:
+            self._rawinfo=None
+        self._rawparser=rawparser
+        self.definition=definition
+        self.id=id
+        self._hasTaxid=None
+
+    def get_seq(self):
+        return self.__seq
+
+
+    def set_seq(self, value):
+        if not isinstance(value, str):
+            value=str(value)
+        self.__seq = value
+        self.__len = len(value)
+
+
+    def clone(self):
+        seq = type(self)(self.id,
+                         str(self),
+                         definition=self.definition
+                         )
+        seq._info=dict(self.getTags())
+        seq._rawinfo=self._rawinfo
+        seq._rawparser=self._rawparser
+        seq._hasTaxid=self._hasTaxid
+        return seq
+
+    def getDefinition(self):
+        '''
+        Sequence definition getter.
+
+        :return: the sequence definition
+        :rtype: str
+
+        '''
+        return self._definition
+
+    def setDefinition(self, value):
+        '''
+        Sequence definition setter.
+
+        :param value: the new sequence definition
+        :type value: C{str}
+        :return: C{None}
+        '''
+        self._definition = value
+
+    def getId(self):
+        '''
+        Sequence identifier getter
+
+        :return: the sequence identifier
+        :rtype: C{str}
+        '''
+        return self._id
+    def setId(self, value):
+        '''
+        Sequence identifier setter.
+
+        :param value: the new sequence identifier
+        :type value: C{str}
+        :return: C{None}
+        '''
+        self._id = value
+
+    def getStr(self):
+        '''
+        Return the sequence as a string
+
+        :return: the string representation of the sequence
+        :rtype: str
+        '''
+        return self._seq
+
+    def getSymbolAt(self,position):
+        '''
+        Return the symbol at C{position} in the sequence
+
+        :param position: the desired position. Positions start from 0;
+                         if position is < 0 it is considered
+                         to reference the end of the sequence.
+        :type position: `int`
+
+        :return: a one letter string
+        :rtype: `str`
+        '''
+        return str(self)[position]
+
+    def getSubSeq(self,location):
+        '''
+        return a subsequence as described by C{location}.
+
+        The C{location} parameter can be a L{obitools.location.Location} instance,
+        an integer or a python C{slice} instance. If C{location}
+        is an integer this method is equivalent to L{getSymbolAt}.
+
+        :param location: the positions of the subsequence to return
+        :type location: C{Location} or C{int} or C{slice}
+        :return: the subsequence
+        :rtype: a single character as a C{str} if C{location} is an integer,
+                a L{obitools.SubSequence} instance otherwise.
+
+        '''
+        if isinstance(location,Location):
+            return location.extractSequence(self)
+        elif isinstance(location, int):
+            return self.getSymbolAt(location)
+        elif isinstance(location, slice):
+            return SubSequence(self,location)
+
+        raise TypeError,'key must be a Location, an integer or a slice'
+
+    def getKey(self,key):
+        if key not in self._info:
+            if self._rawinfo is None:
+                if key=='count':
+                    return 1
+                else:
+                    raise KeyError,key
+            p = re.compile(self._rawparser % key)
+            m = p.search(self._rawinfo)
+            if m is not None:
+                v=m.group(1)
+                self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
+                try:
+                    v = eval(v)
+                except:
+                    pass
+                self._info[key]=v
+            else:
+                if key=='count':
+                    v=1
+                else:
+                    raise KeyError,key
+        else:
+            v=self._info[key]
+        return v
+
+    def extractTaxon(self):
+        '''
+        Extract taxonomy information from the sequence header.
+        This method returns None by default. It should be subclassed
+        if necessary, as in L{obitools.seqdb.AnnotatedSequence}.
+
+        :return: None
+        '''
+        self._hasTaxid=self.hasKey('taxid')
+        return None
+
+    def __str__(self):
+        return self.getStr()
+
+    def __getitem__(self,key):
+        if isinstance(key, str):
+            if key=='taxid' and self._hasTaxid is None:
+                self.extractTaxon()
+            return self.getKey(key)
+        else:
+            return self.getSubSeq(key)
+
+    def __setitem__(self,key,value):
+        self._info[key]=value
+        if key=='taxid':
+            self._hasTaxid=value is not None
+
+    def __delitem__(self,key):
+        if isinstance(key, str):
+            if key in self:
+                del self._info[key]
+            else:
+                raise KeyError,key
+
+            if key=='taxid':
+                self._hasTaxid=False
+        else:
+            raise TypeError,key
+
+    def __iter__(self):
+        '''
+        Iterate through the sequence symbols
+        '''
+        return iter(str(self))
+
+    def __len__(self):
+        return self.__len
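+
+    # A short sketch of the resulting indexing behaviour (assuming a
+    # NucSequence instance s built from the string 'acgtacgt'):
+    #   s[0]       -> 'a'            (int key   -> getSymbolAt)
+    #   s[2:5]     -> SubSequence    (slice key -> getSubSeq)
+    #   s['count'] -> 1              (str key   -> getKey; 'count' defaults to 1)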
+    def hasKey(self,key):
+        rep = key in self._info
+
+        if not rep and self._rawinfo is not None:
+            p = re.compile(self._rawparser % key)
+            m = p.search(self._rawinfo)
+            if m is not None:
+                v=m.group(1)
+                self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
+                try:
+                    v = eval(v)
+                except:
+                    pass
+                self._info[key]=v
+                rep=True
+
+        return rep
+
+    def __contains__(self,key):
+        '''
+        method allowing the use of the C{in} operator on a C{BioSequence}.
+
+        The C{in} operator tests if the C{key} value is defined for this
+        sequence.
+
+        :param key: the name of the checked value
+        :type key: str
+        :return: C{True} if the value is defined, C{False} otherwise.
+        :rtype: C{bool}
+        '''
+        if key=='taxid' and self._hasTaxid is None:
+            self.extractTaxon()
+        return self.hasKey(key)
+
+    def rawiteritems(self):
+        return self._info.iteritems()
+
+    def iteritems(self):
+        '''
+        iterates over the items dictionary storing the values
+        associated to the sequence. It works similarly to
+        the iteritems method of C{dict}.
+
+        :return: an iterator over the (key,value) items
+                 linked to a sequence
+        :rtype: iterator over tuple
+        :see: L{items}
+        '''
+        if self._rawinfo is not None:
+            p = re.compile(self._rawparser % "([a-zA-Z]\w*)")
+            for k,v in p.findall(self._rawinfo):
+                try:
+                    self._info[k]=eval(v)
+                except:
+                    self._info[k]=v
+            self._rawinfo=None
+        return self._info.iteritems()
+
+    def items(self):
+        return [x for x in self.iteritems()]
+
+    def iterkeys(self):
+        return (k for k,v in self.iteritems())
+
+    def keys(self):
+        return [x for x in self.iterkeys()]
+
+    def getTags(self):
+        self.iteritems()
+        return self._info
+
+    def getRoot(self):
+        return self
+
+    def getWrappers(self):
+        if not hasattr(self, '_wrappers'):
+            self._wrappers=WrapperSet()
+        return self._wrappers
+
+    def register(self,wrapper):
+        self.wrappers.add(ref(wrapper,self._unregister))
+
+    def _unregister(self,ref):
+        self.wrappers.remove(ref)
+
+    wrappers = property(getWrappers,None,None,'')
+
+    definition = property(getDefinition, setDefinition, None, "Sequence Definition")
+
+    id = property(getId, setId, None, 'Sequence identifier')
+
+    def _getTaxid(self):
+        return self['taxid']
+
+    def _setTaxid(self,taxid):
+        self['taxid']=taxid
+
+    taxid = property(_getTaxid,_setTaxid,None,'NCBI Taxonomy identifier')
+    _seq = property(get_seq, set_seq, None, None)
+
+class NucSequence(BioSequence):
+    """
+    :py:class:`NucSequence` specializes the :py:class:`BioSequence` class for storing DNA
+    sequences.
+
+    The constructor is identical to the :py:class:`BioSequence` constructor.
+    """
+
+    def complement(self):
+        """
+        :return: The reverse complemented sequence as an instance of :py:class:`DNAComplementSequence`
+        :rtype: :py:class:`DNAComplementSequence`
+        """
+        return DNAComplementSequence(self)
+
+    def isNucleotide(self):
+        return True
+
+
+class AASequence(BioSequence):
+    """
+    :py:class:`AASequence` specializes the :py:class:`BioSequence` class for storing protein
+    sequences.
+
+    The constructor is identical to the :py:class:`BioSequence` constructor.
+    """
+
+
+    def isNucleotide(self):
+        return False
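+
+# A minimal sketch of the lazy rawinfo parsing (the values below are made up):
+#   s = NucSequence('seq1','ACGTACGT','a test sequence',
+#                   rawinfo='count=5; taxid=9606;')
+#   s['count'] -> 5      (extracted on demand from rawinfo by the rawparser pattern)
+#   s['taxid'] -> 9606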
+
+class WrappedBioSequence(BioSequence):
+    """
+    .. warning::
+
+        :py:class:`obitools.WrappedBioSequence` is an abstract class, this constructor
+        can only be called by a subclass constructor.
+    """
+
+
+    def __init__(self,reference,id=None,definition=None,**info):
+
+        assert type(self)!=WrappedBioSequence,"obitools.WrappedBioSequence is an abstract class"
+
+        self._wrapped = reference
+        reference.register(self)
+        self._id=id
+        self.definition=definition
+        self._info=info
+
+    def clone(self):
+        seq = type(self)(self.wrapped,
+                         id=self._id,
+                         definition=self._definition
+                         )
+        seq._info=dict(self._info)
+
+        return seq
+
+    def getWrapped(self):
+        return self._wrapped
+
+    def getDefinition(self):
+        d = self._definition or self.wrapped.definition
+        return d
+
+    def getId(self):
+        d = self._id or self.wrapped.id
+        return d
+
+    def isNucleotide(self):
+        return self.wrapped.isNucleotide()
+
+
+    def iterkeys(self):
+        return uniqueChain(self._info.iterkeys(),
+                           self.wrapped.iterkeys())
+
+    def rawiteritems(self):
+        return chain(self._info.iteritems(),
+                     (x for x in self.wrapped.rawiteritems()
+                      if x[0] not in self._info))
+
+    def iteritems(self):
+        for x in self.iterkeys():
+            yield (x,self[x])
+
+    def getKey(self,key):
+        if key in self._info:
+            return self._info[key]
+        else:
+            return self.wrapped.getKey(key)
+
+    def hasKey(self,key):
+        return key in self._info or self.wrapped.hasKey(key)
+
+    def getSymbolAt(self,position):
+        return self.wrapped.getSymbolAt(self.posInWrapped(position))
+
+    def posInWrapped(self,position,reference=None):
+        if reference is None or reference is self.wrapped:
+            return self._posInWrapped(position)
+        else:
+            return self.wrapped.posInWrapped(self._posInWrapped(position),reference)
+
+
+    def getStr(self):
+        return str(self.wrapped)
+
+    def getRoot(self):
+        return self.wrapped.getRoot()
+    def complement(self):
+        """
+        The :py:meth:`complement` method of the :py:class:`WrappedBioSequence` class
+        raises an :py:exc:`AttributeError` exception if the method is called and the wrapped
+        sequence does not correspond to a nucleic acid sequence.
+        """
+
+        if self.wrapped.isNucleotide():
+            return DNAComplementSequence(self)
+        raise AttributeError
+
+
+    def _posInWrapped(self,position):
+        return position
+
+
+    definition = property(getDefinition,BioSequence.setDefinition, None)
+    id = property(getId,BioSequence.setId, None)
+
+    wrapped = property(getWrapped, None, None, "A pointer to the wrapped sequence")
+
+    def _getWrappedRawInfo(self):
+        return self.wrapped._rawinfo
+
+    _rawinfo = property(_getWrappedRawInfo)
+
+
+class SubSequence(WrappedBioSequence):
+    """
+    A view on a slice of another :py:class:`BioSequence`.
+    """
+
+
+    @staticmethod
+    def _sign(x):
+        if x == 0:
+            return 0
+        elif x < 0:
+            return -1
+        return 1
+
+    def __init__(self,reference,
+                 location=None,
+                 start=None,stop=None,
+                 id=None,definition=None,
+                 **info):
+        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
+
+        if isinstance(location, slice):
+            self._location = location
+        else:
+            step = 1
+            if not isinstance(start, int):
+                start = 0
+            if not isinstance(stop,int):
+                stop = len(reference)
+            self._location=slice(start,stop,step)
+
+        self._indices=self._location.indices(len(self.wrapped))
+        self._xrange=xrange(*self._indices)
+
+        self._info['cut']='[%d,%d,%s]' % self._indices
+
+        if hasattr(reference,'quality'):
+            self.quality = reference.quality[self._location]
+
+    def getId(self):
+        d = self._id or ("%s_SUB" % self.wrapped.id)
+        return d
+
+
+    def clone(self):
+        seq = WrappedBioSequence.clone(self)
+        seq._location=self._location
+        seq._indices=seq._location.indices(len(seq.wrapped))
+        seq._xrange=xrange(*seq._indices)
+        return seq
+
+
+    def __len__(self):
+        return len(self._xrange)
+
+    def getStr(self):
+        return ''.join([x for x in self])
+
+    def __iter__(self):
+        return (self.wrapped.getSymbolAt(x) for x in self._xrange)
+
+    def _posInWrapped(self,position):
+        return self._xrange[position]
+
+
+    id = property(getId,BioSequence.setId, None)
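+
+# A short slicing sketch (python slice semantics; the ids are made up):
+#   s  = NucSequence('seq1','acgtacgt')
+#   ss = s[2:5]          # a SubSequence wrapping s
+#   str(ss)   -> 'gta'
+#   ss.id     -> 'seq1_SUB'
+#   ss['cut'] -> '[2,5,1]'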
+
+
+class DNAComplementSequence(WrappedBioSequence):
+    """
+    Class used to represent the reverse complement of a DNA sequence. Usually instances
+    of this class are produced by using the :py:meth:`NucSequence.complement` method.
+    """
+
+
+    _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
+           'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
+           's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
+           'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
+           '-': '-'}
+
+
+    def __init__(self,reference,
+                 id=None,definition=None,**info):
+        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
+        assert reference.isNucleotide()
+        self._info['complemented']=True
+        if hasattr(reference,'quality'):
+            self.quality = reference.quality[::-1]
+
+
+    def getId(self):
+        d = self._id or ("%s_CMP" % self.wrapped.id)
+        return d
+
+    def __len__(self):
+        return len(self._wrapped)
+
+    def getStr(self):
+        return ''.join([x for x in self])
+
+    def __iter__(self):
+        return (self.getSymbolAt(x) for x in xrange(len(self)))
+
+    def _posInWrapped(self,position):
+        return -(position+1)
+
+    def getSymbolAt(self,position):
+        return DNAComplementSequence._comp[self.wrapped.getSymbolAt(self.posInWrapped(position))]
+
+    def complement(self):
+        """
+        The :py:meth:`complement` method of the :py:class:`DNAComplementSequence` class actually
+        returns the wrapped sequence. Effectively, the reverse complement of a reverse
+        complemented sequence is the initial sequence.
+        """
+        return self.wrapped
+
+    id = property(getId,BioSequence.setId, None)
+
+
+def _isNucSeq(text):
+    acgt   = 0
+    notnuc = 0
+    ltot   = len(text) * 0.8
+    for c in text.lower():
+        if c in 'acgt-':
+            acgt+=1
+        if c not in DNAComplementEncoder._comp:
+            notnuc+=1
+    return notnuc==0 and float(acgt) > ltot
+
+
+def bioSeqGenerator(id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
+    """
+    Automagically generates the right sequence class instance among:
+
+        - :py:class:`NucSequence`
+        - :py:class:`AASequence`
+
+    Builds a new sequence instance. The sequence is instantiated as a :py:class:`NucSequence` if the
+    `seq` attribute contains more than 80% of *A*, *C*, *G*, *T* or *-* symbols
+    in upper or lower case. Otherwise, the new sequence is instantiated as an
+    :py:class:`AASequence`.
+
+
+
+    :param id: sequence identifier
+    :type id: `str`
+
+    :param seq: the sequence
+    :type seq: `str`
+
+    :param definition: sequence definition (optional)
+    :type definition: `str`
+
+    :param rawinfo: a text containing a set of key=value; patterns
+    :type rawinfo: `str`
+
+    :param rawparser: a text describing a regular pattern template
+                      used to parse rawinfo
+    :type rawparser: `str`
+
+    :param info: extra named parameters can be added to associate complementary
+                 data to the sequence
+    """
+    if _isNucSeq(seq):
+        return NucSequence(id,seq,definition,rawinfo,rawparser,**info)
+    else:
+        return AASequence(id,seq,definition,rawinfo,rawparser,**info)
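+
+# A small sketch of the automatic dispatch and of reverse complementation:
+#   s  = bioSeqGenerator('id1','aaacgt')   # >80% acgt -> NucSequence
+#   rc = s.complement()                    # a DNAComplementSequence view
+#   str(rc)              -> 'acgttt'
+#   rc.id                -> 'id1_CMP'
+#   rc.complement() is s -> True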
diff --git a/obitools/__init__.pyc b/obitools/__init__.pyc
new file mode 100644
index 0000000..3cc2111
Binary files /dev/null and b/obitools/__init__.pyc differ
diff --git a/obitools/align/__init__.py b/obitools/align/__init__.py
new file mode 100644
index 0000000..54cca7d
--- /dev/null
+++ b/obitools/align/__init__.py
@@ -0,0 +1,13 @@
+
+
+from _nws import NWS
+from _upperbond import indexSequences
+from _lcs import LCS,lenlcs
+from _assemble import DirectAssemble, ReverseAssemble
+from _qsassemble import QSolexaDirectAssemble,QSolexaReverseAssemble
+from _rassemble import RightDirectAssemble as RightReverseAssemble
+from _qsrassemble import QSolexaRightDirectAssemble,QSolexaRightReverseAssemble
+from _freeendgap import FreeEndGap
+from _freeendgapfm import FreeEndGapFullMatch
+from _upperbond import isLCSReachable
+
diff --git a/obitools/align/_assemble.so b/obitools/align/_assemble.so
new file mode 100755
index 0000000..dbc2139
Binary files /dev/null and b/obitools/align/_assemble.so differ
diff --git a/obitools/align/_dynamic.so b/obitools/align/_dynamic.so
new file mode 100755
index 0000000..2f93d3a
Binary files /dev/null and b/obitools/align/_dynamic.so differ
diff --git a/obitools/align/_freeendgap.so b/obitools/align/_freeendgap.so
new file mode 100755
index 0000000..53cd9c0
Binary files /dev/null and b/obitools/align/_freeendgap.so differ
diff --git a/obitools/align/_freeendgapfm.so b/obitools/align/_freeendgapfm.so
new file mode 100755
index 0000000..f88c07b
Binary files /dev/null and b/obitools/align/_freeendgapfm.so differ
diff --git a/obitools/align/_lcs.so b/obitools/align/_lcs.so
new file mode 100755
index 0000000..555a2a2
Binary files /dev/null and b/obitools/align/_lcs.so differ
diff --git a/obitools/align/_nws.so b/obitools/align/_nws.so
new file mode 100755
index 0000000..af7e849
Binary files /dev/null and b/obitools/align/_nws.so differ
diff --git a/obitools/align/_profilenws.so b/obitools/align/_profilenws.so
new file mode 100755
index 0000000..baa8eda
Binary files /dev/null and b/obitools/align/_profilenws.so differ
diff --git a/obitools/align/_qsassemble.so b/obitools/align/_qsassemble.so
new file mode 100755
index 0000000..3bc83e9
Binary files /dev/null and b/obitools/align/_qsassemble.so differ
diff --git a/obitools/align/_qsrassemble.so b/obitools/align/_qsrassemble.so
new file mode 100755
index 0000000..75b98aa
Binary files /dev/null and b/obitools/align/_qsrassemble.so differ
diff --git a/obitools/align/_rassemble.so b/obitools/align/_rassemble.so
new file mode 100755
index 0000000..e2a063c
Binary files /dev/null and b/obitools/align/_rassemble.so differ
diff --git a/obitools/align/_upperbond.so b/obitools/align/_upperbond.so
new file mode 100755
index 0000000..5f2b1fe
Binary files /dev/null and b/obitools/align/_upperbond.so differ
diff --git a/obitools/align/homopolymere.py b/obitools/align/homopolymere.py
new file mode 100644
index 0000000..5efcbff
--- /dev/null
+++ b/obitools/align/homopolymere.py
@@ -0,0 +1,56 @@
+'''
+Created on 14 mai 2009
+
+@author: coissac
+'''
+
+from obitools import WrappedBioSequence
+
+class HomoNucBioSeq(WrappedBioSequence):
+    '''
+    Wraps a nucleic sequence and collapses each homopolymer run into a single
+    symbol; the run lengths are kept in the 'homopolymer' tag.
+    '''
+
+
+    def __init__(self,reference,id=None,definition=None,**info):
+        '''
+        Constructor
+        '''
+        assert reference.isNucleotide(),"reference must be a nucleic sequence"
+        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
+        self.__cleanHomopolymer()
+
+    def __cleanHomopolymer(self):
+        s = []
+        c = []
+        old=None
+        nc=0
+        for n in self._wrapped:
+            if old is not None and n!=old:
+                s.append(old)
+                c.append(nc)
+                nc=0
+            old=n
+            nc+=1
+        if old is not None:         #flush the last homopolymer run
+            s.append(old)
+            c.append(nc)
+        self._cached=''.join(s)
+        self['homopolymer']=c
+        self._cumulative=[]
+        sum=0
+        for nc in c:                #start offset of each run in the wrapped sequence
+            self._cumulative.append(sum)
+            sum+=nc
+
+    def __len__(self):
+        return len(self._cached)
+
+    def getStr(self):
+        return self._cached
+
+    def __iter__(self):
+        return iter(self._cached)
+
+    def _posInWrapped(self,position):
+        return self._cumulative[position]
+
+
+    
\ No newline at end of file
diff --git a/obitools/align/ssearch.py b/obitools/align/ssearch.py
new file mode 100644
index 0000000..55a74ce
--- /dev/null
+++ b/obitools/align/ssearch.py
@@ -0,0 +1,46 @@
+import os
+import re
+
+from obitools.fasta import formatFasta
+
+class SsearchParser(object):
+
+    _matchQuery  = re.compile("^Query:.+\n.+?>+([^ ]+)", re.MULTILINE)
+    _matchLQuery = re.compile("^Query:.+\n.+?(\d+)(?= nt\n)", re.MULTILINE)
+    _matchProp   = re.compile("^The best scores are:.*\n(.+?)>>>", re.DOTALL+re.MULTILINE)
+    def __init__(self,file):
+        if isinstance(file,str):
+            file = open(file,'rU')
+        self.data = file.read()
+        self.query= SsearchParser._matchQuery.search(self.data).group(1)
+        self.queryLength= int(SsearchParser._matchLQuery.search(self.data).group(1))
+        props = SsearchParser._matchProp.search(self.data)
+        if props:
+            props=props.group(0).split('\n')[1:-2]
+            self.props=[]
+            for line in props:
+                subject,tab = line.split('\t')
+                tab=tab.split()
+                ssp = subject.split()
+                ac  = ssp[0]
+                dbl = int(ssp[-5][:-1])
+                ident = float(tab[0])
+                matchlen = abs(int(tab[5]) - int(tab[4])) +1
+                self.props.append({"ac"            :ac,
+                                   "identity"      :ident,
+                                   "subjectlength" :dbl,
+                                   'matchlength'   :matchlen})
+
+def run(seq,database,program='fasta35',opts=''):
+    ssearchin,ssearchout,ssearcherr = os.popen3("%s %s %s" % (program,opts,database))
+    print >>ssearchin,formatFasta(seq)
+    ssearchin.close()
+    result = SsearchParser(ssearchout)
+
+    return seq,result
+
+def ssearchIterator(sequenceIterator,database,program='ssearch35',opts=''):
+    for seq in sequenceIterator:
+        yield run(seq,database,program,opts)
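+
+# A hedged usage sketch: 'db.fasta' and the fasta35/ssearch35 binaries are
+# assumptions (any FASTA-formatted database and a FASTA-suite executable on
+# the PATH should do):
+#   seq,result = run(myseq,'db.fasta',program='fasta35')
+#   result.query, result.queryLength, result.props
+#   for seq,res in ssearchIterator(seqIterator,'db.fasta'):
+#       ...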
diff --git a/obitools/alignment/__init__.py b/obitools/alignment/__init__.py
new file mode 100644
index 0000000..a89793a
--- /dev/null
+++ b/obitools/alignment/__init__.py
@@ -0,0 +1,175 @@
+from obitools import BioSequence
+from obitools import WrappedBioSequence
+from copy import deepcopy
+
+class GappedPositionException(Exception):
+    pass
+
+class AlignedSequence(WrappedBioSequence):
+
+    def __init__(self,reference,
+                 id=None,definition=None,**info):
+        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
+        self._length=len(reference)
+        self._gaps=[[self._length,0]]
+
+    def clone(self):
+        seq = WrappedBioSequence.clone(self)
+        seq._gaps=deepcopy(self._gaps)
+        seq._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0)
+        return seq
+
+    def setGaps(self, value):
+        '''
+        Set the gap vector of an AlignedSequence.
+
+        The gap vector describes the gap positions on a sequence.
+        It is a list of couples. The first member of each couple is a count
+        of sequence letters, the second one is the length of the gap following them.
+        @param value: a list of length 2 lists describing gap positions
+        @type value: list of couples
+        '''
+        assert isinstance(value, list),'Gap vector must be a list'
+        assert reduce(lambda x,y: x and y,
+                      (isinstance(z, list) and len(z)==2 for z in value),
+                      True),"Value must be a list of length 2 lists"
+
+        lseq = reduce(lambda x,y:x+y, (z[0] for z in value),0)
+        assert lseq==len(self.wrapped),"Gap vector incompatible with the sequence"
+        self._gaps = value
+        self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in value),0)
+
+    def getGaps(self):
+        return tuple(self._gaps)
+    gaps = property(getGaps, setGaps, None, "the gap vector of the aligned sequence")
+
+    def _getIndice(self,pos):
+        i=0
+        cpos=0
+        for s,g in self._gaps:
+            cpos+=s
+            if cpos>pos:
+                return i,pos-cpos+s
+            cpos+=g
+            if cpos>pos:
+                return i,-pos+cpos-g-1
+            i+=1
+        raise IndexError
+
+    def getId(self):
+        d = self._id or ("%s_ALN" % self.wrapped.id)
+        return d
+
+    def __len__(self):
+        return self._length
+
+    def getStr(self):
+        return ''.join([x for x in self])
+
+    def __iter__(self):
+        def isymb():
+            cpos=0
+            for s,g in self._gaps:
+                for x in xrange(s):
+                    yield self.wrapped[cpos+x]
+                for x in xrange(g):
+                    yield '-'
+                cpos+=s
+        return isymb()
+
+    def _posInWrapped(self,position):
+        i,s=self._getIndice(position)
+        if s<0:
+            raise GappedPositionException
+        value=self._gaps
+        p=reduce(lambda x,y:x+y, (z[0] for z in value[:i]),0)+s
+        return p
+
+    def getSymbolAt(self,position):
+        try:
+            return self.wrapped.getSymbolAt(self.posInWrapped(position))
+        except GappedPositionException:
+            return '-'
+
+    def insertGap(self,position,count=1):
+        if position==self._length:
+            idx=len(self._gaps)-1
+            p=-1
+        else:
+            idx,p = self._getIndice(position)
+
+        if p >= 0:
+            self._gaps.insert(idx, [p,count])
+            self._gaps[idx+1][0]-=p
+        else:
+            self._gaps[idx][1]+=count
+        self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0)
+
+
+    id = property(getId,BioSequence.setId, None, "Sequence Identifier")
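+
+# A minimal sketch of the gap vector (couples of [letter count, gap length]):
+#   s = AlignedSequence(NucSequence('id1','acgt'))
+#   s.setGaps([[2,2],[2,0]])   # 'ac', two gaps, then 'gt'
+#   str(s)           -> 'ac--gt'
+#   s.getSymbolAt(2) -> '-'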
+ def getSite(self,key): + if isinstance(key,int): + return [x[key] for x in self] + + def insertGap(self,position,count=1): + for s in self: + s.insertGap(position,count) + + def isFullGapSite(self,key): + return reduce(lambda x,y: x and y,(z=='-' for z in self.getSite(key)),True) + + def isGappedSite(self,key): + return '-' in self.getSite(key) + + def __str__(self): + l = len(self[0]) + rep="" + idmax = max(len(x.id) for x in self)+2 + template= "%%-%ds %%-60s" % idmax + for p in xrange(0,l,60): + for s in self: + rep+= (template % (s.id,s[p:p+60])).strip() + '\n' + rep+="\n" + return rep + +def alignmentReader(file,sequenceIterator): + seqs = sequenceIterator(file) + alignement = Alignment() + for seq in seqs: + alignement.append(seq) + return alignement + + + + + +def columnIterator(alignment): + lali = len(alignment[0]) + for p in xrange(lali): + c = [x[p] for x in alignment] + yield c \ No newline at end of file diff --git a/obitools/alignment/ace.py b/obitools/alignment/ace.py new file mode 100644 index 0000000..59cc8f6 --- /dev/null +++ b/obitools/alignment/ace.py @@ -0,0 +1,47 @@ +from obitools.format.genericparser import GenericParser +from obitools.utils import universalOpen +from obitools.fasta import parseFastaDescription +from obitools import NucSequence + + +import sys + +_contigIterator=GenericParser('^CO ') + +_contigIterator.addParseAction('AF', '\nAF +(\S+) +([UC]) +(-?[0-9]+)') +_contigIterator.addParseAction('RD', '\nRD +(\S+) +([0-9]+) +([0-9]+) +([0-9]+) *\n([A-Za-z\n*]+?)\n\n') +_contigIterator.addParseAction('DS', '\nDS +(.+)') +_contigIterator.addParseAction('CO', '^CO (\S+)') + +def contigIterator(file): + file = universalOpen(file) + for entry in _contigIterator(file): + contig=[] + for rd,ds,af in map(None,entry['RD'],entry['DS'],entry['AF']): + id = rd[0] + shift = int(af[2]) + if shift < 0: + print >> sys.stderr,"Sequence %s in contig %s has a negative paddng value %d : skipped" % (id,entry['CO'][0],shift) + #continue + + definition,info = parseFastaDescription(ds) + info['shift']=shift + seq = rd[4].replace('\n','').replace('*','-').strip() + contig.append(NucSequence(id,seq,definition,**info)) + + maxlen = max(len(x)+x['shift'] for x in contig) + minshift=min(x['shift'] for x in contig) + rep = [] + + for s in contig: + info = s.getTags() + info['shift']-=minshift-1 + head = '-' * (info['shift']-1) + + tail = (maxlen + minshift - len(s) - info['shift'] - 1) + info['tail']=tail + newseq = NucSequence(s.id,head + str(s)+ '-' * tail,s.definition,**info) + rep.append(newseq) + + yield entry['CO'][0],rep + \ No newline at end of file diff --git a/obitools/barcodecoverage/__init__.py b/obitools/barcodecoverage/__init__.py new file mode 100644 index 0000000..09e542e --- /dev/null +++ b/obitools/barcodecoverage/__init__.py @@ -0,0 +1,7 @@ +''' + +@author: merciece +Creates the tree representing the coverage of 2 primers from an ecoPCR output file and an ecoPCR database. + + +''' \ No newline at end of file diff --git a/obitools/barcodecoverage/calcBc.py b/obitools/barcodecoverage/calcBc.py new file mode 100644 index 0000000..13b0401 --- /dev/null +++ b/obitools/barcodecoverage/calcBc.py @@ -0,0 +1,62 @@ +#!/usr/local/bin/python +''' +Created on 24 nov. 
2011 + +@author: merciece +''' + + +def main(amplifiedSeqs, seqsFromDB, keptRanks, errors, tax) : + ''' + error threshold is set to 3 + ''' + + listtaxabygroupinDB = {} + + for seq in seqsFromDB : + taxid = seq['taxid'] + p = [a for a in tax.parentalTreeIterator(taxid)] + for a in p : + if a != p[0] : + if a[1] in keptRanks : + group = a[0] + if group in listtaxabygroupinDB and taxid not in listtaxabygroupinDB[group] : + listtaxabygroupinDB[group].add(taxid) + elif group not in listtaxabygroupinDB : + listtaxabygroupinDB[group]=set([taxid]) + + taxabygroup = dict((x,len(listtaxabygroupinDB[x])) for x in listtaxabygroupinDB) + + listamplifiedtaxabygroup = {} + + for seq in amplifiedSeqs : + if errors[seq.id][2] <= 3 : + taxid = seq['taxid'] + p = [a for a in tax.parentalTreeIterator(taxid)] + for a in p : + if a != p[0] : + if a[1] in keptRanks : + group = a[0] + if group in listamplifiedtaxabygroup and taxid not in listamplifiedtaxabygroup[group] : + listamplifiedtaxabygroup[group].add(taxid) + elif group not in listamplifiedtaxabygroup : + listamplifiedtaxabygroup[group]=set([taxid]) + + amplifiedtaxabygroup = dict((x,len(listamplifiedtaxabygroup[x])) for x in listamplifiedtaxabygroup) + + BcValues = {} + + groups = [g for g in taxabygroup.keys()] + + for g in groups : + if g in amplifiedtaxabygroup : + BcValues[g] = float(amplifiedtaxabygroup[g])/taxabygroup[g]*100 + BcValues[g] = round(BcValues[g], 2) + else : + BcValues[g] = 0.0 + + return BcValues + + + + diff --git a/obitools/barcodecoverage/calculateBc.py b/obitools/barcodecoverage/calculateBc.py new file mode 100644 index 0000000..c5edb8a --- /dev/null +++ b/obitools/barcodecoverage/calculateBc.py @@ -0,0 +1,72 @@ +#!/usr/local/bin/python +''' +Created on 24 nov. 2011 + +@author: merciece +''' + +import sys + + +def main(amplifiedSeqs, seqsFromDB, keptRanks, tax) : + + BcValues = {} + + #speciesid = tax.findRankByName('species') + #subspeciesid = tax.findRankByName('subspecies') + + listtaxonbygroup = {} + + for seq in seqsFromDB : + taxid = seq['taxid'] + p = [a for a in tax.parentalTreeIterator(taxid)] + for a in p : + if a != p[0] : + if a[1] in keptRanks : + group = a + if group in listtaxonbygroup: + listtaxonbygroup[group].add(taxid) + else: + listtaxonbygroup[group]=set([taxid]) + + #stats = dict((x,len(listtaxonbygroup[x])) for x in listtaxonbygroup) + + print>>sys.stderr, listtaxonbygroup + + listtaxonbygroup = {} + + for seq in amplifiedSeqs : + taxid = seq['taxid'] + p = [a for a in tax.parentalTreeIterator(taxid)] + for a in p : + if a != p[0] : + if a[1] in keptRanks : + group = a + if group in listtaxonbygroup: + listtaxonbygroup[group].add(taxid) + else: + listtaxonbygroup[group]=set([taxid]) + + print>>sys.stderr, listtaxonbygroup + + return BcValues + +# dbstats= dict((x,len(listtaxonbygroup[x])) for x in listtaxonbygroup) +# +# ranks = [r for r in keptRanks] +# ranks.sort() +# +# print '%-20s\t%10s\t%10s\t%7s' % ('rank','ecopcr','db','percent') +# +# print>>sys.stderr, stats +# print>>sys.stderr, dbstats +# print>>sys.stderr, ranks +# +# for r in ranks: +# if r in dbstats and dbstats[r]: +# print '%-20s\t%10d\t%10d\t%8.2f' % (r,dbstats[r],stats[r],float(dbstats[r])/stats[r]*100) + + + + + diff --git a/obitools/barcodecoverage/drawBcTree.py b/obitools/barcodecoverage/drawBcTree.py new file mode 100644 index 0000000..9b1e215 --- /dev/null +++ b/obitools/barcodecoverage/drawBcTree.py @@ -0,0 +1,108 @@ +#!/usr/local/bin/python +''' +Created on 25 nov. 
2011 + +@author: merciece +''' + +from obitools.graph.rootedtree import nexusFormat + + +figtree="""\ +begin figtree; + set appearance.backgroundColorAttribute="User Selection"; + set appearance.backgroundColour=#-1; + set appearance.branchColorAttribute="bc"; + set appearance.branchLineWidth=2.0; + set appearance.foregroundColour=#-16777216; + set appearance.selectionColour=#-2144520576; + set branchLabels.colorAttribute="User Selection"; + set branchLabels.displayAttribute="errors"; + set branchLabels.fontName="sansserif"; + set branchLabels.fontSize=10; + set branchLabels.fontStyle=0; + set branchLabels.isShown=true; + set branchLabels.significantDigits=4; + set layout.expansion=2000; + set layout.layoutType="RECTILINEAR"; + set layout.zoom=0; + set nodeBars.barWidth=4.0; + set nodeLabels.colorAttribute="User Selection"; + set nodeLabels.displayAttribute="label"; + set nodeLabels.fontName="sansserif"; + set nodeLabels.fontSize=10; + set nodeLabels.fontStyle=0; + set nodeLabels.isShown=true; + set nodeLabels.significantDigits=4; + set polarLayout.alignTipLabels=false; + set polarLayout.angularRange=0; + set polarLayout.rootAngle=0; + set polarLayout.rootLength=100; + set polarLayout.showRoot=true; + set radialLayout.spread=0.0; + set rectilinearLayout.alignTipLabels=false; + set rectilinearLayout.curvature=0; + set rectilinearLayout.rootLength=100; + set scale.offsetAge=0.0; + set scale.rootAge=1.0; + set scale.scaleFactor=1.0; + set scale.scaleRoot=false; + set scaleAxis.automaticScale=true; + set scaleAxis.fontSize=8.0; + set scaleAxis.isShown=false; + set scaleAxis.lineWidth=2.0; + set scaleAxis.majorTicks=1.0; + set scaleAxis.origin=0.0; + set scaleAxis.reverseAxis=false; + set scaleAxis.showGrid=true; + set scaleAxis.significantDigits=4; + set scaleBar.automaticScale=true; + set scaleBar.fontSize=10.0; + set scaleBar.isShown=true; + set scaleBar.lineWidth=1.0; + set scaleBar.scaleRange=0.0; + set scaleBar.significantDigits=4; + set tipLabels.colorAttribute="User Selection"; + set tipLabels.displayAttribute="Names"; + set tipLabels.fontName="sansserif"; + set tipLabels.fontSize=10; + set tipLabels.fontStyle=0; + set tipLabels.isShown=true; + set tipLabels.significantDigits=4; + set trees.order=false; + set trees.orderType="increasing"; + set trees.rooting=false; + set trees.rootingType="User Selection"; + set trees.transform=false; + set trees.transformType="cladogram"; +end; +""" + + +def cartoonRankGenerator(rank): + def cartoon(node): + return 'rank' in node and node['rank']==rank + + return cartoon + + +def collapseBcGenerator(Bclimit): + def collapse(node): + return 'bc' in node and node['bc']<=Bclimit + return collapse + + +def label(node): + if 'bc' in node: + return "(%+3.1f) %s" % (node['bc'],node['name']) + else: + return " %s" % node['name'] + + +def main(coverageTree) : + print nexusFormat(coverageTree, + label=label, + blocks=figtree, + cartoon=cartoonRankGenerator('family')) + #collapse=collapseBcGenerator(70)) + diff --git a/obitools/barcodecoverage/findErrors.py b/obitools/barcodecoverage/findErrors.py new file mode 100644 index 0000000..dae20a0 --- /dev/null +++ b/obitools/barcodecoverage/findErrors.py @@ -0,0 +1,56 @@ +#!/usr/local/bin/python +''' +Created on 24 nov. 
2011
+
+@author: merciece
+'''
+
+
+def main(seqs, keptRanks, tax):
+    errorsBySeq = getErrorsOnLeaves(seqs)
+    errorsByTaxon = propagateErrors(errorsBySeq, keptRanks, tax)
+    return errorsBySeq, errorsByTaxon
+
+
+def getErrorsOnLeaves(seqs) :
+    errors = {}
+    for s in seqs :
+        taxid = s['taxid']
+        forErrs = s['forward_error']
+        revErrs = s['reverse_error']
+        total = forErrs + revErrs
+        seqNb = 1
+        errors[s.id] = [forErrs,revErrs,total,seqNb,taxid]
+    return errors
+
+
+def propagateErrors(errorsOnLeaves, keptRanks, tax) :
+    allErrors = {}
+    for seq in errorsOnLeaves :
+        taxid = errorsOnLeaves[seq][4]
+        p = [a for a in tax.parentalTreeIterator(taxid)]
+        for a in p :
+            if a[1] in keptRanks :
+                group = a[0]
+                if group in allErrors :
+                    allErrors[group][0] += errorsOnLeaves[seq][0]
+                    allErrors[group][1] += errorsOnLeaves[seq][1]
+                    allErrors[group][2] += errorsOnLeaves[seq][2]
+                    allErrors[group][3] += 1
+                else :
+                    # store a copy: keeping a reference to the leaf record
+                    # would corrupt the per-sequence errors during the
+                    # averaging step below
+                    allErrors[group] = list(errorsOnLeaves[seq])
+
+    for group in allErrors :
+        allErrors[group][0] /= float(allErrors[group][3])
+        allErrors[group][1] /= float(allErrors[group][3])
+        allErrors[group][2] /= float(allErrors[group][3])
+
+        allErrors[group][0] = round(allErrors[group][0], 2)
+        allErrors[group][1] = round(allErrors[group][1], 2)
+        allErrors[group][2] = round(allErrors[group][2], 2)
+
+    return allErrors
+
+
+
+
diff --git a/obitools/barcodecoverage/readFiles.py b/obitools/barcodecoverage/readFiles.py
new file mode 100644
index 0000000..b03e72a
--- /dev/null
+++ b/obitools/barcodecoverage/readFiles.py
@@ -0,0 +1,69 @@
+#!/usr/local/bin/python
+'''
+Created on 23 nov. 2011
+
+@author: merciece
+'''
+
+from obitools.ecopcr import sequence
+from obitools.ecopcr import taxonomy
+
+
+def main(entries,options):
+    filteredDataFromDB = ecoPCRDatabaseReader(options)
+    filteredData = ecoPCRFileReader(entries,filteredDataFromDB)
+    return filteredDataFromDB,filteredData
+
+
+def ecoPCRDatabaseReader(options):
+
+    tax = taxonomy.EcoTaxonomyDB(options.taxonomy)
+    seqs = sequence.EcoPCRDBSequenceIterator(options.taxonomy,taxonomy=tax)
+
+    norankid  = tax.findRankByName('no rank')
+    speciesid = tax.findRankByName('species')
+    genusid   = tax.findRankByName('genus')
+    familyid  = tax.findRankByName('family')
+
+    minrankseq = set([speciesid,genusid,familyid])
+
+    usedrankid = {}
+
+    ingroup = {}
+    outgroup= {}
+
+    for s in seqs :
+        if 'taxid' in s :
+            taxid = s['taxid']
+            allrank = set()
+            for p in tax.parentalTreeIterator(taxid):
+                if p[1]!=norankid:
+                    allrank.add(p[1])
+            if len(minrankseq & allrank) == 3:
+                for r in allrank:
+                    usedrankid[r]=usedrankid.get(r,0) + 1
+
+                if tax.isAncestor(options.ingroup,taxid):
+                    ingroup[s.id] = s
+                else:
+                    outgroup[s.id] = s
+
+    keptranks = set(r for r in usedrankid
+                    if float(usedrankid[r])/float(len(ingroup)) > options.rankthresold)
+
+    return { 'ingroup' : ingroup,
+             'outgroup': outgroup,
+             'ranks'   : keptranks,
+             'taxonomy': tax
+           }
+
+
+def ecoPCRFileReader(entries,filteredDataFromDB) :
+    filteredData = []
+    for s in entries :
+        if 'taxid' in s :
+            seqId = s.id
+            if seqId in filteredDataFromDB['ingroup'] :
+                filteredData.append(s)
+    return filteredData
+
diff --git a/obitools/barcodecoverage/writeBcTree.py b/obitools/barcodecoverage/writeBcTree.py
new file mode 100644
index 0000000..7c8243e
--- /dev/null
+++ b/obitools/barcodecoverage/writeBcTree.py
@@ -0,0 +1,42 @@
+#!/usr/local/bin/python
+'''
+Created on 25 nov.
2011 + +@author: merciece +''' + +from obitools.graph.rootedtree import RootedTree + + +def main(BcValues,errors,tax) : + + tree = RootedTree() + tset = set(BcValues) + + for taxon in BcValues: + if taxon in errors : + forErr = errors[taxon][0] + revErr = errors[taxon][1] + totErr = errors[taxon][2] + else : + forErr = -1.0 + revErr = -1.0 + totErr = -1.0 + + tree.addNode(taxon, rank=tax.getRank(taxon), + name=tax.getScientificName(taxon), + bc = BcValues[taxon], + errors = str(forErr)+' '+str(revErr), + totError = totErr + ) + + for taxon in BcValues: + piter = tax.parentalTreeIterator(taxon) + taxon = piter.next() + for parent in piter: + if taxon[0] in tset and parent[0] in BcValues: + tset.remove(taxon[0]) + tree.addEdge(parent[0], taxon[0]) + taxon=parent + + return tree diff --git a/obitools/blast/__init__.py b/obitools/blast/__init__.py new file mode 100644 index 0000000..11b5274 --- /dev/null +++ b/obitools/blast/__init__.py @@ -0,0 +1,207 @@ +from os import popen2 +from itertools import imap,count + +from obitools.table import iTableIterator,TableRow,Table,SelectionIterator +from obitools.utils import ColumnFile +from obitools.location import SimpleLocation +from obitools.fasta import formatFasta +import sys + +class Blast(object): + ''' + Run blast + ''' + + def __init__(self,mode,db,program='blastall',**options): + self._mode = mode + self._db = db + self._program = program + self._options = options + + def getMode(self): + return self._mode + + + def getDb(self): + return self._db + + + def getProgram(self): + return self._program + + def _blastcmd(self): + tmp = """%(program)s \\ + -p %(mode)s \\ + -d %(db)s \\ + -m 8 \\ + %(options)s \\ + """ + options = ' '.join(['-%s %s' % (x[0],str(x[1])) + for x in self._options.iteritems()]) + data = { + 'program' : self.program, + 'db' : self.db, + 'mode' : self.mode, + 'options' : options + } + + return tmp % data + + def __call__(self,sequence): + ''' + Run blast with one sequence object + @param sequence: + @type sequence: + ''' + cmd = self._blastcmd() + + (blast_in,blast_out) = popen2(cmd) + + print >>blast_in,formatFasta(sequence) + blast_in.close() + + blast = BlastResultIterator(blast_out) + + return blast + + mode = property(getMode, None, None, "Mode's Docstring") + + db = property(getDb, None, None, "Db's Docstring") + + program = property(getProgram, None, None, "Program's Docstring") + + +class NetBlast(Blast): + ''' + Run blast on ncbi servers + ''' + + def __init__(self,mode,db,**options): + ''' + + @param mode: + @param db: + ''' + Blast.__init__(self, mode, db, 'blastcl3',**options) + + +class BlastResultIterator(iTableIterator): + + def __init__(self,blastoutput,query=None): + ''' + + @param blastoutput: + @type blastoutput: + ''' + self._blast = ColumnFile(blastoutput, + strip=True, + skip="#", + sep="\t", + types=self.types + ) + self._query = query + self._hindex = dict((k,i) for i,k in imap(None,count(),self._getHeaders())) + + def _getHeaders(self): + return ('Query id','Subject id', + '% identity','alignment length', + 'mismatches', 'gap openings', + 'q. start', 'q. end', + 's. start', 's. 
end',
+                'e-value', 'bit score')
+
+    def _getTypes(self):
+        return (str,str,
+                float,int,
+                int,int,
+                int,int,
+                int,int,
+                float,float)
+
+    def _getRowFactory(self):
+        return BlastMatch
+
+    def _getSubrowFactory(self):
+        return TableRow
+
+    def _getQuery(self):
+        return self._query
+
+
+    headers = property(_getHeaders,None,None)
+    types = property(_getTypes,None,None)
+    rowFactory = property(_getRowFactory,None,None)
+    subrowFactory = property(_getSubrowFactory,None,None)
+    query = property(_getQuery,None,None)
+
+    def next(self):
+        '''
+        Returns the next row of the blast result set.
+        '''
+        value = self._blast.next()
+        return self.rowFactory(self,value)
+
+
+
+class BlastResult(Table):
+    '''
+    Results of a blast run
+    '''
+
+class BlastMatch(TableRow):
+    '''
+    Blast high scoring pair between two sequences
+    '''
+
+    def getQueryLocation(self):
+        l = SimpleLocation(self[6], self[7])
+        return l
+
+    def getSubjectLocation(self):
+        l = SimpleLocation(self[8], self[9])
+        return l
+
+    def getSubjectSequence(self,database):
+        return database[self[1]]
+
+    def queryCov(self,query=None):
+        '''
+        Compute coverage of match on query sequence.
+
+        @param query: the query sequence. Default is None.
+                      In this case the query sequence associated
+                      with this blast result is used.
+        @type query: L{obitools.BioSequence}
+
+        @return: coverage fraction
+        @rtype: float
+        '''
+        if query is None:
+            query = self.table.query
+        assert query is not None
+        return float(self[7]-self[6]+1)/float(len(query))
+
+    def __getitem__(self,key):
+        if key=='query coverage' and self.table.query is not None:
+            return self.queryCov()
+        else:
+            return TableRow.__getitem__(self,key)
+
+class BlastCovMinFilter(SelectionIterator):
+
+    def __init__(self,blastiterator,covmin,query=None,**conditions):
+        if query is None:
+            query = blastiterator.table.query
+        assert query is not None
+        SelectionIterator.__init__(self,blastiterator,**conditions)
+        self._query = query
+        self._covmin=covmin
+
+    def _covMinPredicat(self,row):
+        return row.queryCov(self._query)>=self._covmin
+
+    def _checkCondition(self,row):
+        return self._covMinPredicat(row) and SelectionIterator._checkCondition(self, row)
+
+
\ No newline at end of file
diff --git a/obitools/carto/__init__.py b/obitools/carto/__init__.py
new file mode 100644
index 0000000..b7ac176
--- /dev/null
+++ b/obitools/carto/__init__.py
@@ -0,0 +1,376 @@
+# -*- coding: latin1 -*-
+
+
+
+from obitools import SVGdraw
+import math
+
+class Map(object):
+    """
+    Map represents an instance of a physical genetic map.
+    Such a map is defined by the length of the sequence
+    associated with it.
+
+    A map holds a number of levels (Level), themselves
+    split into sublevels (SubLevel). The sublevels contain
+    the features.
+    """
+    def __init__(self,name,seqlength,scale=1):
+        """
+        Constructor of a new map
+
+        *Param*:
+
+            name
+                name of the map
+
+            seqlength
+                length of the sequence associated with the map
+
+            scale
+                scale of the map, indicating how many pixels
+                correspond to one map unit
+        """
+        self.name = name
+        self.seqlength = seqlength
+        self.scale = scale
+        self.levels = {}
+        self.basicHSize = 10
+
+    def __str__(self):
+        return '<%s>' % self.name
+
+    def __getitem__(self,level):
+        """
+        returns the level *level* of the map,
+        creating it if it does not exist
+        """
+        if not isinstance(level,int):
+            raise TypeError('level must be a non-zero integer value')
+        elif level==0:
+            raise AssertionError('Level cannot be set to 0')
+        try:
+            return self.levels[level]
+        except KeyError:
+            self.levels[level] = Level(level,self)
+            return self.levels[level]
+
+    def getBasicHSize(self):
+        """
+        returns the base height of a map element,
+        expressed in pixels
+        """
+        return self.basicHSize
+
+    def getScale(self):
+        """
+        Returns the scale of the map, as the number of pixels
+        per physical unit of the map
+        """
+        return self.scale
+
+
+
+    def getNegativeBase(self):
+        return reduce(lambda x,y:x-y,[self.levels[z].getHeight()
+                                      for z in self.levels
+                                      if z < 0],self.getHeight())
+
+    def getPositiveBase(self):
+        return self.getNegativeBase() - 3 * self.getBasicHSize()
+
+    def getHeight(self):
+        return reduce(lambda x,y:x+y,[z.getHeight() for z in self.levels.values()],0) \
+               + 4 * self.getBasicHSize()
+
+    def toXML(self,file=None,begin=0,end=None):
+        dessin = SVGdraw.drawing()
+        if end==None:
+            end = self.seqlength
+        hauteur= self.getHeight()
+        largeur=(end-begin+1)*self.scale
+        svg    = SVGdraw.svg((begin*self.scale,0,largeur,hauteur),
+                             '%fpx' % (self.seqlength * self.scale),
+                             '%dpx' % hauteur)
+
+        centre = self.getPositiveBase() + (1 + 1/4) * self.getBasicHSize()
+        svg.addElement(SVGdraw.rect(0,centre,self.seqlength * self.scale,self.getBasicHSize()/2))
+        for e in self.levels.values():
+            svg.addElement(e.getElement())
+        dessin.setSVG(svg)
+        return dessin.toXml(file)
+
+class Feature(object):
+    pass
+
+class Level(object):
+
+    def __init__(self,level,map):
+        if not isinstance(map,Map):
+            raise AssertionError('map is not an instance of class Map')
+        if level in map.levels:
+            raise AssertionError('Level %d already defined for map %s' % (level,map))
+        else:
+            map.levels[level] = self
+        self.map = map
+        self.level = level
+        self.sublevels = {}
+
+    def __getitem__(self,sublevel):
+        """
+        returns the sublevel *sublevel* of this level,
+        creating it if it does not exist
+        """
+        if not isinstance(sublevel,int):
+            raise TypeError('sublevel must be a positive integer value')
+        elif sublevel<0:
+            raise AssertionError('Level cannot be negative')
+        try:
+            return self.sublevels[sublevel]
+        except KeyError:
+            self.sublevels[sublevel] = SubLevel(sublevel,self)
+            return self.sublevels[sublevel]
+
+    def getBase(self):
+        if self.level < 0:
+            base = self.map.getNegativeBase()
+            base += reduce(lambda x,y:x+y,[self.map.levels[z].getHeight()
+                                           for z in self.map.levels
+                                           if z <0 and z >= self.level],0)
+            return base
+        else:
+            base = self.map.getPositiveBase()
+            base -= reduce(lambda x,y:x+y,[self.map.levels[z].getHeight()
+                                           for z in self.map.levels
+                                           if z >0 and z < self.level],0)
+            return base
+
+    def getElement(self):
+        objet = SVGdraw.group('level%d' % self.level)
+        for e in self.sublevels.values():
+            objet.addElement(e.getElement())
+        return objet
+
+
+
+    def getHeight(self):
+        return reduce(lambda x,y:x+y,[z.getHeight() for z in self.sublevels.values()],0) \
+               + 2 * self.map.getBasicHSize()
+
+class SubLevel(object):
+
+    def __init__(self,sublevel,level):
+        if not isinstance(level,Level):
+            raise AssertionError('level is not an instance of class Level')
+        if sublevel in level.sublevels:
+            raise AssertionError('Sublevel %d already defined for level %s' % (sublevel,level))
+        else:
+            level.sublevels[sublevel] = self
+        self.level = level
+        self.sublevel = sublevel
+        self.features = {}
+
+    def getHeight(self):
+        return max([x.getHeight() for x in self.features.values()]+[0]) + 4 * self.level.map.getBasicHSize()
+
+    def getBase(self):
+        base = self.level.getBase()
+        if self.level.level < 0:
+            base -= self.level.getHeight() - 2 * self.level.map.getBasicHSize()
+            base += reduce(lambda x,y:x+y,[self.level.sublevels[z].getHeight()
+                                           for z in self.level.sublevels
+                                           if z <= self.sublevel],0)
+            base -= 2* self.level.map.getBasicHSize()
+        else:
+            base -= reduce(lambda x,y:x+y,[self.level.sublevels[z].getHeight()
+                                           for z in self.level.sublevels
+                                           if z < self.sublevel],0)
+            base -= self.level.map.getBasicHSize()
+        return base
+
+    def getElement(self):
+        base  = self.getBase()
+        objet = SVGdraw.group('sublevel%d' % self.sublevel)
+        for e in self.features.values():
+            objet.addElement(e.getElement(base))
+        return objet
+
+    def add(self,feature):
+        if not isinstance(feature,Feature):
+            raise TypeError('feature must be an instance of Feature')
+        if feature.name in self.features:
+            raise AssertionError('A feature with the same name (%s) has already been inserted in this sublevel'
+                                 % feature.name)
+        self.features[feature.name]=feature
+        feature.sublevel=self
+
+class SimpleFeature(Feature):
+
+    def __init__(self,name,begin,end,visiblename=False,color=0):
+        self.begin = begin
+        self.end = end
+        self.name = name
+        self.color = color
+        self.sublevel = None
+        self.visiblename=visiblename
+
+    def getHeight(self):
+        if not self.sublevel:
+            raise AssertionError('SimpleFeature not attached to a sublevel')
+        if self.visiblename:
+            return self.sublevel.level.map.getBasicHSize() * 2
+        else:
+            return self.sublevel.level.map.getBasicHSize()
+
+    def getElement(self,base):
+        scale  = self.sublevel.level.map.getScale()
+        y      = base - self.sublevel.level.map.getBasicHSize()
+        x      = self.begin * scale
+        width  = (self.end - self.begin + 1) * scale
+        height = self.sublevel.level.map.getBasicHSize()
+
+        objet = SVGdraw.rect(x,y,width,height,stroke=self.color)
+        objet.addElement(SVGdraw.description(self.name))
+
+        return objet
+
+class BoxFeature(SimpleFeature):
+
+    def getHeight(self):
+        if not self.sublevel:
+            raise AssertionError('BoxFeature not attached to a sublevel')
+        if self.visiblename:
+            return self.sublevel.level.map.getBasicHSize() * 4
+        else:
+            return self.sublevel.level.map.getBasicHSize() * 3
+
+    def getElement(self,base):
+        scale  = self.sublevel.level.map.getScale()
+        y      = base - self.sublevel.level.map.getBasicHSize() * 2
+        x      = self.begin * scale
+        width  = (self.end - self.begin + 1) * scale
+        height = self.sublevel.level.map.getBasicHSize() * 3
+
+        objet = SVGdraw.rect(x,y,width,height,stroke=self.color,fill="none")
+        objet.addElement(SVGdraw.description(self.name))
+
+        return objet
+
+class MultiPartFeature(Feature):
+
+    def __init__(self,name,*args,**kargs):
+        self.limits = args
+        self.name = name
+        try:
+            self.color = kargs['color']
+        except KeyError:
+            self.color = "black"
+
+        try:
self.visiblename=kargs['visiblename'] + except KeyError: + self.visiblename=None + + try: + self.flatlink=kargs['flatlink'] + except KeyError: + self.flatlink=False + + try: + self.roundlink=kargs['roundlink'] + except KeyError: + self.roundlink=False + + self.sublevel = None + + + def getHeight(self): + if not self.sublevel: + raise AssertionError('Not affected Simple feature') + if self.visiblename: + return self.sublevel.level.map.getBasicHSize() * 3 + else: + return self.sublevel.level.map.getBasicHSize() * 2 + + def getElement(self,base): + scale = self.sublevel.level.map.getScale() + + y = base - self.sublevel.level.map.getBasicHSize() + height = self.sublevel.level.map.getBasicHSize() + objet = SVGdraw.group(self.name) + for (debut,fin) in self.limits: + x = debut * scale + width = (fin - debut + 1) * scale + part = SVGdraw.rect(x,y,width,height,fill=self.color) + objet.addElement(part) + + debut = self.limits[0][1] + for (fin,next) in self.limits[1:]: + debut*=scale + fin*=scale + path = SVGdraw.pathdata(debut,y + height / 2) + delta = height / 2 + if self.roundlink: + path.qbezier((debut+fin)/2, y - delta,fin,y + height / 2) + else: + if self.flatlink: + delta = - height / 2 + path.line((debut+fin)/2, y - delta) + path.line(fin,y + height / 2) + path = SVGdraw.path(path,fill="none",stroke=self.color) + objet.addElement(path) + debut = next + + objet.addElement(SVGdraw.description(self.name)) + + return objet + +class TagFeature(Feature): + + def __init__(self,name,begin,length,ratio,visiblename=False,color=0): + self.begin = begin + self.length = length + self.ratio = ratio + self.name = name + self.color = color + self.sublevel = None + self.visiblename=visiblename + + def getHeight(self): + if not self.sublevel: + raise AssertionError('Not affected Tag feature') + + return self.sublevel.level.map.getBasicHSize()*11 + + def getElement(self,base): + scale = self.sublevel.level.map.getScale() + height = math.floor(max(1,self.sublevel.level.map.getBasicHSize()* 10 * self.ratio)) + y = base + self.sublevel.level.map.getBasicHSize() - height + x = self.begin * scale + width = self.length * scale + objet = SVGdraw.rect(x,y,width,height,stroke=self.color) + objet.addElement(SVGdraw.description(self.name)) + + return objet + +if __name__ == '__main__': + carte = Map('essai',20000,scale=0.5) + carte[-1][0].add(SimpleFeature('toto',100,300)) + carte[1][0].add(SimpleFeature('toto',100,300)) + carte[1][1].add(SimpleFeature('toto',200,1000)) + + carte[1][0].add(MultiPartFeature('bout',(1400,1450),(1470,1550),(1650,1800),color='red',flatlink=True)) + carte[1][0].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='red',flatlink=True)) + carte[-1][1].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='green')) + carte[-1][2].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='purple',roundlink=True)) + + carte[-1][1].add(BoxFeature('tutu',390,810,color='purple')) + carte[1][0].add(BoxFeature('tutu',390,810,color='red')) + carte[2][0].add(TagFeature('t1',1400,20,0.8)) + carte[2][0].add(TagFeature('t2',1600,20,0.2)) + carte.basicHSize=6 + print carte.toXML('truc.svg',begin=0,end=1000) + print carte.toXML('truc2.svg',begin=460,end=2000) + + + diff --git a/obitools/decorator.py b/obitools/decorator.py new file mode 100644 index 0000000..e69de29 diff --git a/obitools/distances/__init__.py b/obitools/distances/__init__.py new file mode 100644 index 0000000..1542fa9 --- /dev/null +++ b/obitools/distances/__init__.py @@ -0,0 +1,29 @@ +class DistanceMatrix(object): 
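+    # Distances are computed lazily and cached in a lower-triangular
+    # matrix: __getitem__ below orders its two indices so that x <= y,
+    # then fills self.matrix[y][x] on first access by calling
+    # evaluateDist(), which concrete subclasses must implement.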
+
+    def __init__(self,alignment):
+        '''
+        DistanceMatrix constructor.
+
+        @param alignment: alignment used to compute the distance matrix
+        @type alignment: obitools.align.Alignment
+        '''
+        self.aligment = alignment
+        self.matrix = [[None] * (x+1) for x in xrange(len(alignment))]
+
+    def evaluateDist(self,x,y):
+        raise NotImplementedError
+
+    def __getitem__(self,key):
+        assert isinstance(key,(tuple,list)) and len(key)==2, \
+               'key must be a tuple or a list of two integers'
+        x,y = key
+        if y < x:
+            z=x
+            x=y
+            y=z
+        rep = self.matrix[y][x]
+        if rep is None:
+            rep = self.evaluateDist(x,y)
+            self.matrix[y][x] = rep
+
+        return rep
\ No newline at end of file
diff --git a/obitools/distances/observed.py b/obitools/distances/observed.py
new file mode 100644
index 0000000..8828d92
--- /dev/null
+++ b/obitools/distances/observed.py
@@ -0,0 +1,77 @@
+'''
+Module dedicated to computing observed divergences from
+an alignment. No distance correction is applied at all.
+'''
+
+from itertools import imap
+
+from obitools.distances import DistanceMatrix
+
+class PairewiseGapRemoval(DistanceMatrix):
+    '''
+    Observed divergence matrix from an alignment.
+    Gaps are removed from the alignment on a pairwise
+    sequence basis.
+    '''
+
+    def evaluateDist(self,x,y):
+        '''
+        Compute the observed divergence between two sequences
+        of an alignment.
+
+        @attention: For performance purposes this method should
+                    not be used directly. Use instead the __getitem__
+                    method from DistanceMatrix.
+ + @see: L{__getitem__} + + @param x: number of the fisrt sequence in the aligment + @type x: int + @param y: umber of the second sequence in the aligment + @type y: int + + + ''' + + seq1 = self.aligment[x] + seq2 = self.aligment[y] + + diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1), + (z[0]!=z[1] for z in imap(None,seq1,seq2)), + (0,0)) + return float(diff)/tot + \ No newline at end of file diff --git a/obitools/distances/phylip.py b/obitools/distances/phylip.py new file mode 100644 index 0000000..e2043fa --- /dev/null +++ b/obitools/distances/phylip.py @@ -0,0 +1,35 @@ +import sys + +from itertools import imap,count + +def writePhylipMatrix(matrix): + names = [x.id for x in matrix.aligment] + pnames= [x[:10] for x in names] + unicity={} + redundent=[] + for n in pnames: + unicity[n]=unicity.get(n,0)+1 + redundent.append(unicity[n]) + + for i,n,r in imap(None,count(),pnames,redundent): + alternate = n + if r > 1: + while alternate in pnames: + lcut = 9 - len(str(r)) + alternate = n[:lcut]+ '_%d' % r + r+=1 + pnames[i]='%-10s' % alternate + + firstline = '%5d' % len(matrix.aligment) + rep = [firstline] + for i,n in imap(None,count(),pnames): + line = [n] + for j in xrange(i): + line.append('%5.4f' % matrix[(j,i)]) + rep.append(' '.join(line)) + return '\n'.join(rep) + + + + + \ No newline at end of file diff --git a/obitools/distances/r.py b/obitools/distances/r.py new file mode 100644 index 0000000..f674a4c --- /dev/null +++ b/obitools/distances/r.py @@ -0,0 +1,25 @@ +import sys + +from itertools import imap,count + +def writeRMatrix(matrix): + names = [x.id for x in matrix.aligment] + lmax = max(max(len(x) for x in names),5) + lali = len(matrix.aligment) + + nformat = '%%-%ds' % lmax + dformat = '%%%d.4f' % lmax + + pnames=[nformat % x for x in names] + + rep = [' '.join(pnames)] + + for i in xrange(lali): + line=[] + for j in xrange(lali): + line.append('%5.4f' % matrix[(j,i)]) + rep.append(' '.join(line)) + return '\n'.join(rep) + + + \ No newline at end of file diff --git a/obitools/dnahash/__init__.py b/obitools/dnahash/__init__.py new file mode 100644 index 0000000..ca02e35 --- /dev/null +++ b/obitools/dnahash/__init__.py @@ -0,0 +1,100 @@ +_A=[0] +_C=[1] +_G=[2] +_T=[3] +_R= _A + _G +_Y= _C + _T +_M= _C + _A +_K= _T + _G +_W= _T + _A +_S= _C + _G +_B= _C + _G + _T +_D= _A + _G + _T +_H= _A + _C + _T +_V= _A + _C + _G +_N= _A + _C + _G + _T + +_dnahash={'a':_A, + 'c':_C, + 'g':_G, + 't':_T, + 'r':_R, + 'y':_Y, + 'm':_M, + 'k':_K, + 'w':_W, + 's':_S, + 'b':_B, + 'd':_D, + 'h':_H, + 'v':_V, + 'n':_N, + } + +def hashCodeIterator(sequence,wsize,degeneratemax=0,offset=0): + errors = 0 + emask = [0] * wsize + epointer = 0 + size = 0 + position = offset + hashs = set([0]) + hashmask = 0 + for i in xrange(wsize): + hashmask <<= 2 + hashmask +=3 + + for l in sequence: + l = l.lower() + hl = _dnahash[l] + + if emask[epointer]: + errors-=1 + emask[epointer]=0 + + if len(hl) > 1: + errors +=1 + emask[epointer]=1 + + epointer+=1 + epointer%=wsize + + if errors > degeneratemax: + hl=set([hl[0]]) + + hashs=set((((hc<<2) | cl) & hashmask) + for hc in hashs + for cl in hl) + + if size < wsize: + size+=1 + + if size==wsize: + if errors <= degeneratemax: + yield (position,hashs,errors) + position+=1 + +def hashSequence(sequence,wsize,degeneratemax=0,offset=0,hashs=None): + if hashs is None: + hashs=[[] for x in xrange(4**wsize)] + + for pos,keys,errors in hashCodeIterator(sequence, wsize, degeneratemax, offset): + for k in keys: + hashs[k].append(pos) + + return hashs + +def 
hashSequences(sequences,wsize,maxpos,degeneratemax=0): + hashs=None + offsets=[] + offset=0 + for s in sequences: + offsets.append(offset) + hashSequence(s,wsize,degeneratemax=degeneratemax,offset=offset,hashs=hashs) + offset+=len(s) + + return hashs,offsets + + + + + \ No newline at end of file diff --git a/obitools/ecobarcode/__init__.py b/obitools/ecobarcode/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/obitools/ecobarcode/databases.py b/obitools/ecobarcode/databases.py new file mode 100644 index 0000000..70d2319 --- /dev/null +++ b/obitools/ecobarcode/databases.py @@ -0,0 +1,32 @@ +''' +Created on 25 sept. 2010 + +@author: coissac +''' +from obitools import NucSequence + +def referenceDBIterator(options): + + cursor = options.ecobarcodedb.cursor() + + cursor.execute("select id from databases.database where name='%s'" % options.database) + options.dbid = cursor.fetchone()[0] + + cursor.execute(''' + select s.accession,r.id,r.taxid,r.sequence + from databases.database d, + databases.reference r, + databases.relatedsequences s + where r.database = d.id + and s.reference= r.id + and s.mainac + and d.name = '%s' + ''' % options.database + ) + + for ac,id,taxid,sequence in cursor: + s = NucSequence(ac,sequence) + s['taxid']=taxid + s['refdbid']=id + yield s + \ No newline at end of file diff --git a/obitools/ecobarcode/ecotag.py b/obitools/ecobarcode/ecotag.py new file mode 100644 index 0000000..2ebd3fb --- /dev/null +++ b/obitools/ecobarcode/ecotag.py @@ -0,0 +1,50 @@ +''' +Created on 25 sept. 2010 + +@author: coissac +''' + +def alreadyIdentified(seqid,options): + cursor = options.ecobarcodedb.cursor() + cursor.execute(''' + select count(*) + from ecotag.identification + where sequence=%s + and database=%s + ''',(int(seqid),int(options.dbid))) + + return int(cursor.fetchone()[0]) > 0; + +def storeIdentification(seqid, + idstatus,taxid, + matches, + options + ): + + cursor = options.ecobarcodedb.cursor() + + if not options.updatedb: + cursor.execute(''' + delete from ecotag.identification where sequence=%s and database=%s + ''',(int(seqid),int(options.dbid))) + + cursor.execute(''' + insert into ecotag.identification (sequence,database,idstatus,taxid) + values (%s,%s,%s,%s) + returning id + ''' , (int(seqid),int(options.dbid),idstatus,int(taxid))) + + idid = cursor.fetchone()[0] + + for seq,identity in matches.iteritems(): + cursor.execute(''' + insert into ecotag.evidence (identification,reference,identity) + values (%s, + %s, + %s) + ''',(idid,seq,identity)) + + + cursor.close() + + options.ecobarcodedb.commit() diff --git a/obitools/ecobarcode/options.py b/obitools/ecobarcode/options.py new file mode 100644 index 0000000..6086423 --- /dev/null +++ b/obitools/ecobarcode/options.py @@ -0,0 +1,64 @@ +''' +Created on 23 sept. 
2010 + +@author: coissac +''' +import psycopg2 + +from obitools.ecobarcode.taxonomy import EcoTaxonomyDB + +def addEcoBarcodeDBOption(optionManager): + optionManager.add_option('--dbname', + action="store", dest="ecobarcodedb", + type='str', + default=None, + help="Specify the name of the ecobarcode database") + + optionManager.add_option('--server', + action="store", dest="dbserver", + type='str', + default="localhost", + help="Specify the adress of the ecobarcode database server") + + optionManager.add_option('--user', + action="store", dest="dbuser", + type='str', + default='postgres', + help="Specify the user of the ecobarcode database") + + optionManager.add_option('--port', + action="store", dest="dbport", + type='str', + default=5432, + help="Specify the port of the ecobarcode database") + + optionManager.add_option('--passwd', + action="store", dest="dbpasswd", + type='str', + default='', + help="Specify the passwd of the ecobarcode database") + + optionManager.add_option('--primer', + action="store", dest="primer", + type='str', + default=None, + help="Specify the primer used for amplification") + + +def ecobarcodeDatabaseConnection(options): + if options.ecobarcodedb is not None: + connection = psycopg2.connect(database=options.ecobarcodedb, + user=options.dbuser, + password=options.dbpasswd, + host=options.dbserver, + port=options.dbport) + options.dbname=options.ecobarcodedb + else: + connection=None + if connection is not None: + options.ecobarcodedb=connection + taxonomy = EcoTaxonomyDB(connection) + else: + taxonomy=None + return taxonomy + diff --git a/obitools/ecobarcode/rawdata.py b/obitools/ecobarcode/rawdata.py new file mode 100644 index 0000000..a5f58cf --- /dev/null +++ b/obitools/ecobarcode/rawdata.py @@ -0,0 +1,38 @@ +''' +Created on 25 sept. 2010 + +@author: coissac +''' + +from obitools import NucSequence +from obitools.utils import progressBar +from obitools.ecobarcode.ecotag import alreadyIdentified + +import sys + +def sequenceIterator(options): + cursor = options.ecobarcodedb.cursor() + + cursor.execute(''' + select s.id,sum(o.count),s.sequence + from rawdata.sequence s, + rawdata.occurrences o + where o.sequence= s.id + and s.primers = '%s' + group by s.id,s.sequence + ''' % options.primer + ) + + nbseq = cursor.rowcount + progressBar(1, nbseq, True, head=options.dbname) + for id,count,sequence in cursor: + progressBar(cursor.rownumber+1, nbseq, head=options.dbname) + if not options.updatedb or not alreadyIdentified(id,options): + s = NucSequence(id,sequence) + s['count']=count + print >>sys.stderr,' +', cursor.rownumber+1, + yield s + else: + print >>sys.stderr,' @', cursor.rownumber+1, + + print >>sys.stderr diff --git a/obitools/ecobarcode/taxonomy.py b/obitools/ecobarcode/taxonomy.py new file mode 100644 index 0000000..c7d0185 --- /dev/null +++ b/obitools/ecobarcode/taxonomy.py @@ -0,0 +1,120 @@ +''' +Created on 24 sept. 2010 + +@author: coissac +''' + +from obitools.ecopcr.taxonomy import TaxonomyDump +from obitools.ecopcr.taxonomy import Taxonomy +import sys + +class EcoTaxonomyDB(TaxonomyDump) : + + def __init__(self,dbconnect): + self._dbconnect=dbconnect + + print >> sys.stderr,"Reading ecobarcode taxonomy database..." + + self._readNodeTable() + print >> sys.stderr," ok" + + print >>sys.stderr,"Adding scientific name..." 
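+        # The loop below builds the name index: _name stores
+        # (name, nameclass, taxon index) triples, and each scientific
+        # name is also appended to its taxonomy record, so that
+        # record[3] is the scientific name of the taxon.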
+ + self._name=[] + for taxid,name,classname in self._nameIterator(): + self._name.append((name,classname,self._index[taxid])) + if classname == 'scientific name': + self._taxonomy[self._index[taxid]].append(name) + + print >>sys.stderr,"Adding taxid alias..." + for taxid,current in self._mergedNodeIterator(): + self._index[taxid]=self._index[current] + + print >>sys.stderr,"Adding deleted taxid..." + for taxid in self._deletedNodeIterator(): + self._index[taxid]=None + + + Taxonomy.__init__(self) + + ##### + # + # Iterator functions + # + ##### + + def _readNodeTable(self): + + cursor = self._dbconnect.cursor() + + cursor.execute(""" + select taxid,rank,parent + from ncbitaxonomy.nodes + """) + + print >>sys.stderr,"Reading taxonomy nodes..." + taxonomy=[list(n) for n in cursor] + + print >>sys.stderr,"List all taxonomy rank..." + ranks =list(set(x[1] for x in taxonomy)) + ranks.sort() + rankidx = dict(map(None,ranks,xrange(len(ranks)))) + + print >>sys.stderr,"Sorting taxons..." + taxonomy.sort(TaxonomyDump._taxonCmp) + + self._taxonomy=taxonomy + + print >>sys.stderr,"Indexing taxonomy..." + index = {} + for t in self._taxonomy: + index[t[0]]=self._bsearchTaxon(t[0]) + + print >>sys.stderr,"Indexing parent and rank..." + for t in self._taxonomy: + t[1]=rankidx[t[1]] + t[2]=index[t[2]] + + self._ranks=ranks + self._index=index + + cursor.close() + + def _nameIterator(self): + cursor = self._dbconnect.cursor() + + cursor.execute(""" + select taxid,name,nameclass + from ncbitaxonomy.names + """) + + for taxid,name,nameclass in cursor: + yield taxid,name,nameclass + + cursor.close() + + def _mergedNodeIterator(self): + cursor = self._dbconnect.cursor() + + cursor.execute(""" + select oldtaxid,newtaxid + from ncbitaxonomy.merged + """) + + for oldtaxid,newtaxid in cursor: + yield oldtaxid,newtaxid + + cursor.close() + + def _deletedNodeIterator(self): + cursor = self._dbconnect.cursor() + + cursor.execute(""" + select taxid + from ncbitaxonomy.delnodes + """) + + for taxid in cursor: + yield taxid[0] + + cursor.close() diff --git a/obitools/ecopcr/__init__.py b/obitools/ecopcr/__init__.py new file mode 100644 index 0000000..10a90e5 --- /dev/null +++ b/obitools/ecopcr/__init__.py @@ -0,0 +1,69 @@ +from obitools import utils +from obitools import NucSequence +from obitools.utils import universalOpen, universalTell, fileSize, progressBar +import struct +import sys + + +class EcoPCRFile(utils.ColumnFile): + def __init__(self,stream): + utils.ColumnFile.__init__(self, + stream, '|', True, + (str,int,int, + str,int,str, + int,str,int, + str,int,str, + str,str,int,float, + str,int,float, + int, + str,str), "#") + + + def next(self): + data = utils.ColumnFile.next(self) + seq = NucSequence(data[0],data[20],data[21]) + seq['seq_length_ori']=data[1] + seq['taxid']=data[2] + seq['rank']=data[3] + seq['species']=data[4] + seq['species_sn']=data[5] + seq['genus']=data[6] + seq['genus_sn']=data[7] + seq['family']=data[8] + seq['family_sn']=data[9] + seq['strand']=data[12] + seq['forward_primer']=data[13] + seq['forward_error']=data[14] + seq['forward_tm']=data[15] + seq['reverse_primer']=data[16] + seq['reverse_error']=data[17] + seq['reverse_tm']=data[18] + + return seq + + + +class EcoPCRDBFile(object): + + def _ecoRecordIterator(self,file): + file = universalOpen(file) + (recordCount,) = struct.unpack('> I',file.read(4)) + self._recover=False + + if recordCount: + for i in xrange(recordCount): + (recordSize,)=struct.unpack('>I',file.read(4)) + record = file.read(recordSize) + yield record + else: + 
print >> sys.stderr,"\n\n WARNING : EcoPCRDB readding set into recover data mode\n" + self._recover=True + ok=True + while(ok): + try: + (recordSize,)=struct.unpack('>I',file.read(4)) + record = file.read(recordSize) + yield record + except: + ok=False + \ No newline at end of file diff --git a/obitools/ecopcr/annotation.py b/obitools/ecopcr/annotation.py new file mode 100644 index 0000000..7c76fb2 --- /dev/null +++ b/obitools/ecopcr/annotation.py @@ -0,0 +1,104 @@ +import struct + +class EcoPCRDBAnnotationWriter(object): + ''' + Class used to write Annotation description in EcoPCRDB format. + + EcoPCRDBAnnotationWriter is oftenly called through the EcoPCRDBSequenceWriter class + + @see: L{ecopcr.sequence.EcoPCRDBSequenceWriter} + ''' + + def __init__(self,dbname,id,fileidx=1,type=('CDS'),definition=None): + ''' + class constructor + + @param dbname: name of ecoPCR database + @type dbname: C{str} + @param id: name of the qualifier used as feature id + @type id: C{str} + @param fileidx: + @type fileidx: C{int} + @param type: + @type type: C{list} or C{tuple} + @param definition: + @type definition: C{str} + ''' + self._type = type + self._definition = definition + self._id = id + self._filename="%s_%03d.adx" % (dbname,fileidx) + self._file = open(self._filename,'wb') + self._sequenceIdx=0 + + + ftname ="%s.fdx" % (dbname) + ft = open(ftname,'wb') + + self._fttypeidx=dict(map(None,type,xrange(len(type)))) + + ft.write(struct.pack('> I',len(type))) + + for t in type: + ft.write(self._ecoFtTypePacker(t)) + + ft.close() + + self._annotationCount=0 + self._file.write(struct.pack('> I',self._annotationCount)) + + + def _ecoFtTypePacker(self,type): + totalSize = len(type) + packed = struct.pack('> I %ds' % totalSize,totalSize,type) + + assert len(packed) == totalSize+4, "error in feature type packing" + + return packed + + def _ecoAnnotationPacker(self,feature,seqidx): + begin = feature.begin-1 + end = feature.end + type = self._fttypeidx[feature.ftType] + strand = feature.isDirect() + id = feature[self._id][0] + if self._definition in feature: + definition = feature[self._definition][0] + else: + definition = '' + + assert strand is not None,"Only strand defined features can be stored" + + deflength = len(definition) + + totalSize = 4 + 4 + 4 + 4 + 4 + 20 + 4 + deflength + + packed = struct.pack('> I I I I I 20s I %ds' % (deflength), + totalSize, + seqidx, + begin, + end, + type, + int(strand), + id, + deflength, + definition) + + assert len(packed) == totalSize+4, "error in annotation packing" + + return packed + + + def put(self,sequence,seqidx=None): + if seqidx is None: + seqidx = self._sequenceIdx + self._sequenceIdx+=1 + for feature in sequence.getFeatureTable(): + if feature.ftType in self._type: + self._annotationCount+=1 + self._file.write(self._ecoAnnotationPacker(feature,seqidx)) + + def __del__(self): + self._file.seek(0,0) + self._file.write(struct.pack('> I',self._annotationCount)) + self._file.close() diff --git a/obitools/ecopcr/options.py b/obitools/ecopcr/options.py new file mode 100644 index 0000000..03663cd --- /dev/null +++ b/obitools/ecopcr/options.py @@ -0,0 +1,129 @@ +''' +Created on 13 fevr. 
2011 + +@author: coissac +''' + +from obitools.ecopcr.taxonomy import Taxonomy, EcoTaxonomyDB, TaxonomyDump, ecoTaxonomyWriter + +try: + from obitools.ecobarcode.options import addEcoBarcodeDBOption,ecobarcodeDatabaseConnection +except ImportError: + def addEcoBarcodeDBOption(optionmanager): + pass + def ecobarcodeDatabaseConnection(options): + return None + +def addTaxonomyDBOptions(optionManager): + addEcoBarcodeDBOption(optionManager) + optionManager.add_option('-d','--database', + action="store", dest="taxonomy", + metavar="", + type="string", + help="ecoPCR taxonomy Database " + "name") + optionManager.add_option('-t','--taxonomy-dump', + action="store", dest="taxdump", + metavar="", + type="string", + help="NCBI Taxonomy dump repository " + "name") + + +def addTaxonomyFilterOptions(optionManager): + addTaxonomyDBOptions(optionManager) + optionManager.add_option('--require-rank', + action="append", + dest='requiredRank', + metavar="", + type="string", + default=[], + help="select sequence with taxid tag containing " + "a parent of rank ") + + optionManager.add_option('-r','--required', + action="append", + dest='required', + metavar="", + type="int", + default=[], + help="required taxid") + + optionManager.add_option('-i','--ignore', + action="append", + dest='ignored', + metavar="", + type="int", + default=[], + help="ignored taxid") + +def loadTaxonomyDatabase(options): + if isinstance(options.taxonomy, Taxonomy): + return options.taxonomy + taxonomy = ecobarcodeDatabaseConnection(options) + if (taxonomy is not None or + options.taxonomy is not None or + options.taxdump is not None): + if options.taxdump is not None: + taxonomy = TaxonomyDump(options.taxdump) + if taxonomy is not None and isinstance(options.taxonomy, str): + ecoTaxonomyWriter(options.taxonomy,taxonomy) + options.ecodb=options.taxonomy + if isinstance(options.taxonomy, Taxonomy): + taxonomy = options.taxonomy + if taxonomy is None and isinstance(options.taxonomy, str): + taxonomy = EcoTaxonomyDB(options.taxonomy) + options.ecodb=options.taxonomy + options.taxonomy=taxonomy + return options.taxonomy + +def taxonomyFilterGenerator(options): + loadTaxonomyDatabase(options) + if options.taxonomy is not None: + taxonomy=options.taxonomy + def taxonomyFilter(seq): + def annotateAtRank(seq,rank): + if 'taxid' in seq and seq['taxid'] is not None: + rtaxid= taxonomy.getTaxonAtRank(seq['taxid'],rank) + return rtaxid + return None + good = True + if 'taxid' in seq: + taxid = seq['taxid'] +# print taxid, + if options.requiredRank: + taxonatrank = reduce(lambda x,y: x and y, + (annotateAtRank(seq,rank) is not None + for rank in options.requiredRank),True) + good = good and taxonatrank +# print >>sys.stderr, " Has rank : ",good, + if options.required: + good = good and reduce(lambda x,y: x or y, + (taxonomy.isAncestor(r,taxid) for r in options.required), + False) +# print " Required : ",good, + if options.ignored: + good = good and not reduce(lambda x,y: x or y, + (taxonomy.isAncestor(r,taxid) for r in options.ignored), + False) +# print " Ignored : ",good, +# print " Global : ",good + + return good + + + else: + def taxonomyFilter(seq): + return True + + return taxonomyFilter + +def taxonomyFilterIteratorGenerator(options): + taxonomyFilter = taxonomyFilterGenerator(options) + + def filterIterator(seqiterator): + for seq in seqiterator: + if taxonomyFilter(seq): + yield seq + + return filterIterator \ No newline at end of file diff --git a/obitools/ecopcr/sequence.py b/obitools/ecopcr/sequence.py new file mode 100644 index 
0000000..1465e69 --- /dev/null +++ b/obitools/ecopcr/sequence.py @@ -0,0 +1,133 @@ +from obitools import NucSequence +from obitools.ecopcr import EcoPCRDBFile +from obitools.ecopcr.taxonomy import EcoTaxonomyDB, ecoTaxonomyWriter +from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter +from obitools.utils import universalOpen +from glob import glob +import struct +import gzip +import sys + + +class EcoPCRDBSequenceIterator(EcoPCRDBFile): + ''' + Build an iterator over the sequences include in a sequence database + formated for ecoPCR + ''' + + def __init__(self,path,taxonomy=None): + ''' + ecoPCR data iterator constructor + + @param path: path to the ecoPCR database including the database prefix name + @type path: C{str} + @param taxonomy: a taxonomy can be given to the reader to decode the taxonomic data + associated to the sequences. If no Taxonomy is furnish, it will be read + before the sequence database files using the same path. + @type taxonomy: L{obitools.ecopcr.taxonomy.Taxonomy} + ''' + self._path = path + + if taxonomy is not None: + self._taxonomy=taxonomy + else: + self._taxonomy=EcoTaxonomyDB(path) + + self._seqfilesFiles = glob('%s_???.sdx' % self._path) + self._seqfilesFiles.sort() + + def __ecoSequenceIterator(self,file): + for record in self._ecoRecordIterator(file): + lrecord = len(record) + lnames = lrecord - (4*4+20) + (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record) + seqid=seqid.strip('\x00') + de = string[:deflength] + seq = gzip.zlib.decompress(string[deflength:]) + bioseq = NucSequence(seqid,seq,de,taxidx=taxid,taxid=self._taxonomy._taxonomy[taxid][0]) + yield bioseq + + def __iter__(self): + for seqfile in self._seqfilesFiles: + for seq in self.__ecoSequenceIterator(seqfile): + yield seq + +class EcoPCRDBSequenceWriter(object): + + def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None,append=False): + self._taxonomy=taxonomy + self._filename="%s_%03d.sdx" % (dbname,fileidx) + if append: + mode ='r+b' + f = universalOpen(self._filename) + (recordCount,) = struct.unpack('> I',f.read(4)) + self._sequenceCount=recordCount + del f + self._file = open(self._filename,mode) + self._file.seek(0,0) + self._file.write(struct.pack('> I',0)) + self._file.seek(0,2) + else: + self._sequenceCount=0 + mode = 'wb' + self._file = open(self._filename,mode) + self._file.write(struct.pack('> I',self._sequenceCount)) + + if self._taxonomy is not None: + print >> sys.stderr,"Writing the taxonomy file...", + ecoTaxonomyWriter(dbname,self._taxonomy) + print >> sys.stderr,"Ok" + + if type is not None: + assert ftid is not None,"You must specify an id attribute for features" + self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition) + else: + self._annotation = None + + def _ecoSeqPacker(self,seq): + + compactseq = gzip.zlib.compress(str(seq).upper(),9) + cptseqlength = len(compactseq) + delength = len(seq.definition) + + totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength + + if self._taxonomy is None or 'taxid' not in seq: + taxon=-1 + else: + taxon=self._taxonomy.findIndex(seq['taxid']) + + try: + packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength), + totalSize, + taxon, + seq.id, + delength, + len(seq), + cptseqlength, + seq.definition, + compactseq) + except struct.error as e: + print >>sys.stderr,"\n\n============\n\nError on sequence : %s\n\n" % seq.id + raise e + + assert len(packed) == totalSize+4, "error in sequence packing" + + 
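+        # Layout of the record packed above: total size, taxon index
+        # (-1 when no taxonomy or taxid is available), 20-byte sequence
+        # id, definition length, sequence length, compressed length,
+        # definition text, then the zlib-compressed sequence.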
return packed + + + def put(self,sequence): + if self._taxonomy is not None: + if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'): + sequence.extractTaxon() + self._file.write(self._ecoSeqPacker(sequence)) + if self._annotation is not None: + self._annotation.put(sequence, self._sequenceCount) + self._sequenceCount+=1 + + def __del__(self): + self._file.seek(0,0) + self._file.write(struct.pack('> I',self._sequenceCount)) + self._file.close() + + diff --git a/obitools/ecopcr/taxonomy.py b/obitools/ecopcr/taxonomy.py new file mode 100644 index 0000000..bb2ec4e --- /dev/null +++ b/obitools/ecopcr/taxonomy.py @@ -0,0 +1,630 @@ +import struct +import sys + +from itertools import count,imap + +from obitools.ecopcr import EcoPCRDBFile +from obitools.utils import universalOpen +from obitools.utils import ColumnFile + +class Taxonomy(object): + def __init__(self): + ''' + The taxonomy database constructor + + @param path: path to the ecoPCR database including the database prefix name + @type path: C{str} + ''' + + self._ranks.append('obi') + + self._speciesidx = self._ranks.index('species') + self._genusidx = self._ranks.index('genus') + self._familyidx = self._ranks.index('family') + self._orderidx = self._ranks.index('order') + self._nameidx=dict((x[0],x[2]) for x in self._name) + self._nameidx.update(dict((x[0],x[2]) for x in self._preferedName)) + self._preferedidx=dict((x[2],x[1]) for x in self._preferedName) + + self._bigestTaxid = max(x[0] for x in self._taxonomy) + + + def findTaxonByIdx(self,idx): + if idx is None: + return None + return self._taxonomy[idx] + + def findIndex(self,taxid): + if taxid is None: + return None + return self._index[taxid] + + def findTaxonByTaxid(self,taxid): + return self.findTaxonByIdx(self.findIndex(taxid)) + + def findTaxonByName(self,name): + return self._taxonomy[self._nameidx[name]] + + def findRankByName(self,rank): + try: + return self._ranks.index(rank) + except ValueError: + return None + + def __contains__(self,taxid): + return self.findTaxonByTaxid(taxid) is not None + + + + + ##### + # + # PUBLIC METHODS + # + ##### + + + def subTreeIterator(self, taxid): + "return subtree for given taxonomic id " + idx = self.findTaxonByTaxid(taxid) + yield self._taxonomy[idx] + for t in self._taxonomy: + if t[2] == idx: + for subt in self.subTreeIterator(t[0]): + yield subt + + def parentalTreeIterator(self, taxid): + """ + return parental tree for given taxonomic id starting from + first ancester to the root. 
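+
+        Example (a sketch; the taxid is hypothetical and must be
+        present in the loaded taxonomy):
+
+            for taxon in tax.parentalTreeIterator(9606):
+                print taxon[0], taxon[3]  # taxid and scientific name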
+ """ + taxon=self.findTaxonByTaxid(taxid) + if taxon is not None: + while taxon[2]!= 0: + yield taxon + taxon = self._taxonomy[taxon[2]] + yield self._taxonomy[0] + else: + raise StopIteration + + def isAncestor(self,parent,taxid): + return parent in [x[0] for x in self.parentalTreeIterator(taxid)] + + def lastCommonTaxon(self,*taxids): + if not taxids: + return None + if len(taxids)==1: + return taxids[0] + + if len(taxids)==2: + t1 = [x[0] for x in self.parentalTreeIterator(taxids[0])] + t2 = [x[0] for x in self.parentalTreeIterator(taxids[1])] + t1.reverse() + t2.reverse() + + count = min(len(t1),len(t2)) + i=0 + while(i < count and t1[i]==t2[i]): + i+=1 + i-=1 + + return t1[i] + + ancetre = taxids[0] + for taxon in taxids[1:]: + ancetre = self.lastCommonTaxon(ancetre,taxon) + + return ancetre + + def betterCommonTaxon(self,error=1,*taxids): + lca = self.lastCommonTaxon(*taxids) + idx = self._index[lca] + sublca = [t[0] for t in self._taxonomy if t[2]==idx] + return sublca + + + def getPreferedName(self,taxid): + idx = self.findIndex(taxid) + return self._preferedidx.get(idx,self._taxonomy[idx][3]) + + + def getScientificName(self,taxid): + return self.findTaxonByTaxid(taxid)[3] + + def getRankId(self,taxid): + return self.findTaxonByTaxid(taxid)[1] + + def getRank(self,taxid): + return self._ranks[self.getRankId(taxid)] + + def getTaxonAtRank(self,taxid,rankid): + if isinstance(rankid, str): + rankid=self._ranks.index(rankid) + try: + return [x[0] for x in self.parentalTreeIterator(taxid) + if x[1]==rankid][0] + except IndexError: + return None + + def getSpecies(self,taxid): + return self.getTaxonAtRank(taxid, self._speciesidx) + + def getGenus(self,taxid): + return self.getTaxonAtRank(taxid, self._genusidx) + + def getFamily(self,taxid): + return self.getTaxonAtRank(taxid, self._familyidx) + + def getOrder(self,taxid): + return self.getTaxonAtRank(taxid, self._orderidx) + + def rankIterator(self): + for x in imap(None,self._ranks,xrange(len(self._ranks))): + yield x + + def groupTaxa(self,taxa,groupname): + t=[self.findTaxonByTaxid(x) for x in taxa] + a=set(x[2] for x in t) + assert len(a)==1,"All taxa must have the same parent" + newtaxid=max([2999999]+[x[0] for x in self._taxonomy if x[0]>=3000000 and x[0]<4000000])+1 + newidx=len(self._taxonomy) + if 'GROUP' not in self._ranks: + self._ranks.append('GROUP') + rankid=self._ranks.index('GROUP') + self._taxonomy.append((newtaxid,rankid,a.pop(),groupname)) + for x in t: + x[2]=newidx + + def addLocalTaxon(self,name,rank,parent,minimaltaxid=10000000): + newtaxid = minimaltaxid if (self._bigestTaxid < minimaltaxid) else self._bigestTaxid+1 + + rankid=self.findRankByName(rank) + parentidx = self.findIndex(int(parent)) + tx = (newtaxid,rankid,parentidx,name,'local') + self._taxonomy.append(tx) + newidx=len(self._taxonomy)-1 + self._name.append((name,'scientific name',newidx)) + self._nameidx[name]=newidx + self._index[newtaxid]=newidx + + self._bigestTaxid=newtaxid + + return newtaxid + + def removeLocalTaxon(self,taxid): + raise NotImplemented + txidx = self.findIndex(taxid) + taxon = self.findTaxonByIdx(txidx) + + assert txidx >= self._localtaxon,"Only local taxon can be deleted" + + for t in self._taxonomy: + if t[2] == txidx: + self.removeLocalTaxon(t[0]) + + + + + return taxon + + def addPreferedName(self,taxid,name): + idx = self.findIndex(taxid) + self._preferedName.append(name,'obi',idx) + self._preferedidx[idx]=name + return taxid + +class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile): + ''' + A taxonomy database class + ''' + + + def 
__init__(self,path): + ''' + The taxonomy database constructor + + @param path: path to the ecoPCR database including the database prefix name + @type path: C{str} + ''' + self._path = path + self._taxonFile = "%s.tdx" % self._path + self._localTaxonFile = "%s.ldx" % self._path + self._ranksFile = "%s.rdx" % self._path + self._namesFile = "%s.ndx" % self._path + self._preferedNamesFile = "%s.pdx" % self._path + self._aliasFile = "%s.adx" % self._path + + print >> sys.stderr,"Reading binary taxonomy database...", + + self.__readNodeTable() + + print >> sys.stderr," ok" + + Taxonomy.__init__(self) + + + ##### + # + # Iterator functions + # + ##### + + def __ecoNameIterator(self,file): + for record in self._ecoRecordIterator(file): + lrecord = len(record) + lnames = lrecord - 16 + (isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record) + name=names[:namelength] + classname=names[namelength:] + yield (name,classname,indextaxid) + + + def __ecoTaxonomicIterator(self): + for record in self._ecoRecordIterator(self._taxonFile): + lrecord = len(record) + lnames = lrecord - 16 + (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record) + yield (taxid,rankid,parentidx,name,'ncbi') + + try : + lt=0 + for record in self._ecoRecordIterator(self._localTaxonFile): + lrecord = len(record) + lnames = lrecord - 16 + (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record) + lt+=1 + yield (taxid,rankid,parentidx,name,'local') + print >> sys.stderr, " [INFO : Local taxon file found] : %d added taxa" % lt + except: + print >> sys.stderr, " [INFO : Local taxon file not found] " + + def __ecoRankIterator(self): + for record in self._ecoRecordIterator(self._ranksFile): + yield record + + def __ecoAliasIterator(self): + for record in self._ecoRecordIterator(self._aliasFile): + (taxid,index) = struct.unpack('> I i',record) + yield taxid,index + + ##### + # + # Indexes + # + ##### + + def __ecoNameIndex(self): + indexName = [x for x in self.__ecoNameIterator(self._namesFile)] + return indexName + + def __ecoRankIndex(self): + rank = [r for r in self.__ecoRankIterator()] + return rank + + def __ecoTaxonomyIndex(self): + taxonomy = [] + + try : + index = dict(self.__ecoAliasIterator()) + print >> sys.stderr, " [INFO : Taxon alias file found] " + buildIndex=False + except: + print >> sys.stderr, " [INFO : Taxon alias file not found] " + index={} + i = 0; + buildIndex=True + + localtaxon=0 + i=0 + for x in self.__ecoTaxonomicIterator(): + taxonomy.append(x) + if x[4]=='ncbi': + localtaxon+=1 + + if buildIndex or x[4]!='ncbi': + index[x[0]] = i + i+=1 + + + print >> sys.stderr,"Taxonomical tree read", + return taxonomy, index,localtaxon + + def __readNodeTable(self): + self._taxonomy, self._index, self._localtaxon= self.__ecoTaxonomyIndex() + self._ranks = self.__ecoRankIndex() + self._name = self.__ecoNameIndex() + + # Add local taxon tame to the name index + i=self._localtaxon + for t in self._taxonomy[self._localtaxon:]: + self._name.append((t[3],'scientific name',i)) + i+=1 + + try : + self._preferedName = [(x[0],'obi',x[2]) + for x in self.__ecoNameIterator(self._preferedNamesFile)] + print >> sys.stderr, " [INFO : Prefered taxon name file found] : %d added taxa" % len(self._preferedName) + except: + print >> sys.stderr, " [INFO : Prefered taxon name file not found]" + self._preferedName = [] + + + + +class TaxonomyDump(Taxonomy): + + def __init__(self,taxdir): + + self._path=taxdir + 
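# The binary taxonomy files read by EcoTaxonomyDB above all share one record
# shape: four big-endian uint32 header fields followed by a variable-length
# payload, sliced with the lengths carried in the header. A self-contained
# decode of a name record as __ecoNameIterator expects it (values are
# illustrative):

import struct

record = struct.pack('> I I I I 10s', 1, 7, 3, 42, 'Pan0123sci')
lnames = len(record) - 16
(is_scientific, namelength, classlength, taxon_row,
 names) = struct.unpack('> I I I I %ds' % lnames, record)
assert (names[:namelength], names[namelength:], taxon_row) == ('Pan0123', 'sci', 42)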
self._readNodeTable('%s/nodes.dmp' % taxdir) + + print >>sys.stderr,"Adding scientific name..." + + self._name=[] + for taxid,name,classname in self._nameIterator('%s/names.dmp' % taxdir): + self._name.append((name,classname,self._index[taxid])) + if classname == 'scientific name': + self._taxonomy[self._index[taxid]].extend([name,'ncbi']) + + print >>sys.stderr,"Adding taxid alias..." + for taxid,current in self._mergedNodeIterator('%s/merged.dmp' % taxdir): + self._index[taxid]=self._index[current] + + print >>sys.stderr,"Adding deleted taxid..." + for taxid in self._deletedNodeIterator('%s/delnodes.dmp' % taxdir): + self._index[taxid]=None + + self._nameidx=dict((x[0],x[2]) for x in self._name) + + + def _taxonCmp(t1,t2): + if t1[0] < t2[0]: + return -1 + elif t1[0] > t2[0]: + return +1 + return 0 + + _taxonCmp=staticmethod(_taxonCmp) + + def _bsearchTaxon(self,taxid): + taxCount = len(self._taxonomy) + begin = 0 + end = taxCount + oldcheck=taxCount + check = begin + end / 2 + while check != oldcheck and self._taxonomy[check][0]!=taxid : + if self._taxonomy[check][0] < taxid: + begin=check + else: + end=check + oldcheck=check + check = (begin + end) / 2 + + + if self._taxonomy[check][0]==taxid: + return check + else: + return None + + + + def _readNodeTable(self,file): + + file = universalOpen(file) + + nodes = ColumnFile(file, + sep='|', + types=(int,int,str, + str,str,bool, + int,bool,int, + bool,bool,bool,str)) + print >>sys.stderr,"Reading taxonomy dump file..." + # (taxid,rank,parent) + taxonomy=[[n[0],n[2],n[1]] for n in nodes] + print >>sys.stderr,"List all taxonomy rank..." + ranks =list(set(x[1] for x in taxonomy)) + ranks.sort() + rankidx = dict(map(None,ranks,xrange(len(ranks)))) + + print >>sys.stderr,"Sorting taxons..." + taxonomy.sort(TaxonomyDump._taxonCmp) + + self._taxonomy=taxonomy + self._localtaxon=len(taxonomy) + + print >>sys.stderr,"Indexing taxonomy..." + index = {} + for t in self._taxonomy: + index[t[0]]=self._bsearchTaxon(t[0]) + + print >>sys.stderr,"Indexing parent and rank..." 
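# _bsearchTaxon above re-runs a binary search per row just after the table was
# sorted, so the same taxid -> row map can be built in a single pass; note also
# that its first midpoint, 'begin + end / 2', only equals '(begin + end) / 2'
# because begin is still 0 at that point. One-pass sketch over toy rows:

rows = [[1, 'no rank', 1], [2, 'superkingdom', 131567], [6, 'genus', 335928]]
index = dict((r[0], i) for i, r in enumerate(rows))
assert index[6] == 2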
+ for t in self._taxonomy: + t[1]=rankidx[t[1]] + t[2]=index[t[2]] + + self._ranks=ranks + self._index=index + self._preferedName = [] + + def _nameIterator(self,file): + file = universalOpen(file) + names = ColumnFile(file, + sep='|', + types=(int,str, + str,str)) + for taxid,name,unique,classname,white in names: + yield taxid,name,classname + + def _mergedNodeIterator(self,file): + file = universalOpen(file) + merged = ColumnFile(file, + sep='|', + types=(int,int,str)) + for taxid,current,white in merged: + yield taxid,current + + def _deletedNodeIterator(self,file): + file = universalOpen(file) + deleted = ColumnFile(file, + sep='|', + types=(int,str)) + for taxid,white in deleted: + yield taxid + +##### +# +# +# Binary writer +# +# +##### + +def ecoTaxonomyWriter(prefix, taxonomy,onlyLocal=False): + + def ecoTaxPacker(tx): + + namelength = len(tx[3]) + + totalSize = 4 + 4 + 4 + 4 + namelength + + packed = struct.pack('> I I I I I %ds' % namelength, + totalSize, + tx[0], + tx[1], + tx[2], + namelength, + tx[3]) + + return packed + + def ecoRankPacker(rank): + + namelength = len(rank) + + packed = struct.pack('> I %ds' % namelength, + namelength, + rank) + + return packed + + def ecoAliasPacker(taxid,index): + + totalSize = 4 + 4 + try: + packed = struct.pack('> I I i', + totalSize, + taxid, + index) + except struct.error,e: + print >>sys.stderr,(totalSize,taxid,index) + print >>sys.stderr,"Total size : %d taxid : %d index : %d" %(totalSize,taxid,index) + raise e + + return packed + + def ecoNamePacker(name): + + namelength = len(name[0]) + classlength= len(name[1]) + totalSize = namelength + classlength + 4 + 4 + 4 + 4 + + packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength), + totalSize, + int(name[1]=='scientific name'), + namelength, + classlength, + name[2], + name[0], + name[1]) + + return packed + + + def ecoTaxWriter(file,taxonomy): + output = open(file,'wb') + nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]=='ncbi'),0) + + output.write(struct.pack('> I',nbtaxon)) + + for tx in taxonomy: + if tx[4]=='ncbi': + output.write(ecoTaxPacker(tx)) + + output.close() + return nbtaxon < len(taxonomy) + + def ecoLocalTaxWriter(file,taxonomy): + nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]!='ncbi'),0) + + if nbtaxon: + output = open(file,'wb') + + output.write(struct.pack('> I',nbtaxon)) + + for tx in taxonomy: + if tx[4]!='ncbi': + output.write(ecoTaxPacker(tx)) + + output.close() + + + def ecoRankWriter(file,ranks): + output = open(file,'wb') + output.write(struct.pack('> I',len(ranks))) + + for rank in ranks: + output.write(ecoRankPacker(rank)) + + output.close() + + def ecoAliasWriter(file,index): + output = open(file,'wb') + output.write(struct.pack('> I',len(index))) + + for taxid in index: + i=index[taxid] + if i is None: + i=-1 + output.write(ecoAliasPacker(taxid, i)) + + output.close() + + def nameCmp(n1,n2): + name1=n1[0].upper() + name2=n2[0].upper() + if name1 < name2: + return -1 + elif name1 > name2: + return 1 + return 0 + + + def ecoNameWriter(file,names): + output = open(file,'wb') + output.write(struct.pack('> I',len(names))) + + names.sort(nameCmp) + + for name in names: + output.write(ecoNamePacker(name)) + + output.close() + + def ecoPreferedNameWriter(file,names): + output = open(file,'wb') + output.write(struct.pack('> I',len(names))) + for name in names: + output.write(ecoNamePacker(name)) + + output.close() + + localtaxon=True + if not onlyLocal: + ecoRankWriter('%s.rdx' % prefix, taxonomy._ranks) + localtaxon = 
ecoTaxWriter('%s.tdx' % prefix, taxonomy._taxonomy) + ecoNameWriter('%s.ndx' % prefix, [x for x in taxonomy._name if x[2] < taxonomy._localtaxon]) + ecoAliasWriter('%s.adx' % prefix, taxonomy._index) + if localtaxon: + ecoLocalTaxWriter('%s.ldx' % prefix, taxonomy._taxonomy) + if taxonomy._preferedName: + ecoNameWriter('%s.pdx' % prefix, taxonomy._preferedName) diff --git a/obitools/ecotag/__init__.py b/obitools/ecotag/__init__.py new file mode 100644 index 0000000..26c94d3 --- /dev/null +++ b/obitools/ecotag/__init__.py @@ -0,0 +1,2 @@ +class EcoTagResult(dict): + pass \ No newline at end of file diff --git a/obitools/ecotag/parser.py b/obitools/ecotag/parser.py new file mode 100644 index 0000000..f431e34 --- /dev/null +++ b/obitools/ecotag/parser.py @@ -0,0 +1,150 @@ +from itertools import imap +from obitools import utils + +from obitools.ecotag import EcoTagResult + +class EcoTagFileIterator(utils.ColumnFile): + + @staticmethod + def taxid(x): + x = int(x) + if x < 0: + return None + else: + return x + + @staticmethod + def scientificName(x): + if x=='--': + return None + else: + return x + + @staticmethod + def value(x): + if x=='--': + return None + else: + return float(x) + + @staticmethod + def count(x): + if x=='--': + return None + else: + return int(x) + + + def __init__(self,stream): + utils.ColumnFile.__init__(self, + stream, '\t', True, + (str,str,str, + EcoTagFileIterator.value, + EcoTagFileIterator.value, + EcoTagFileIterator.value, + EcoTagFileIterator.count, + EcoTagFileIterator.count, + EcoTagFileIterator.taxid, + EcoTagFileIterator.scientificName, + str, + EcoTagFileIterator.taxid, + EcoTagFileIterator.scientificName, + EcoTagFileIterator.taxid, + EcoTagFileIterator.scientificName, + EcoTagFileIterator.taxid, + EcoTagFileIterator.scientificName, + str + )) + self._memory=None + + _colname = ['identification', + 'seqid', + 'best_match_ac', + 'max_identity', + 'min_identity', + 'theorical_min_identity', + 'count', + 'match_count', + 'taxid', + 'scientific_name', + 'rank', + 'order_taxid', + 'order_sn', + 'family_taxid', + 'family_sn', + 'genus_taxid', + 'genus_sn', + 'species_taxid', + 'species_sn', + 'sequence'] + + def next(self): + if self._memory is not None: + data=self._memory + self._memory=None + else: + data = utils.ColumnFile.next(self) + data = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(data)],data)) + + if data['identification']=='ID': + data.cd=[] + try: + nextone = utils.ColumnFile.next(self) + nextone = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(nextone)],nextone)) + except StopIteration: + nextone = None + while nextone is not None and nextone['identification']=='CD': + data.cd.append(nextone) + try: + nextone = utils.ColumnFile.next(self) + nextone = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(nextone)],nextone)) + except StopIteration: + nextone = None + self._memory=nextone + + return data + +def ecoTagIdentifiedFilter(ecoTagIterator): + for x in ecoTagIterator: + if x['identification']=='ID': + yield x + + +class EcoTagAbstractIterator(utils.ColumnFile): + + _colname = ['scientific_name', + 'taxid', + 'rank', + 'count', + 'max_identity', + 'min_identity'] + + + @staticmethod + def taxid(x): + x = int(x) + if x < 0: + return None + else: + return x + + def __init__(self,stream): + utils.ColumnFile.__init__(self, + stream, '\t', True, + (str, + EcoTagFileIterator.taxid, + str, + int, + float,float,float)) + + def next(self): + data = utils.ColumnFile.next(self) + data = 
dict(imap(None,EcoTagAbstractIterator._colname,data))
+        
+        return data
+    
+def ecoTagAbstractFilter(ecoTagAbsIterator):
+    for x in ecoTagAbsIterator:
+        if x['taxid'] is not None:
+            yield x
+    
\ No newline at end of file
diff --git a/obitools/eutils/__init__.py b/obitools/eutils/__init__.py
new file mode 100644
index 0000000..1e7d3b2
--- /dev/null
+++ b/obitools/eutils/__init__.py
@@ -0,0 +1,54 @@
+import time
+from urllib2 import urlopen
+import shelve
+from threading import Lock
+import sys
+
+class EUtils(object):
+    '''
+    Base class enforcing the NCBI EUtils request rate limit.
+    '''
+    
+    _last_request=0
+    _interval=3
+    
+    def __init__(self):
+        self._lock = Lock()
+        
+    def wait(self):
+        now=time.time()
+        delta = now - EUtils._last_request
+        while delta < EUtils._interval:
+            time.sleep(EUtils._interval - delta)
+            now=time.time()
+            delta = now - EUtils._last_request
+        
+    def _sendRequest(self,url):
+        self.wait()
+        EUtils._last_request=time.time()
+        t = EUtils._last_request
+        print >>sys.stderr,"Sending request to NCBI @ %f" % t
+        data = urlopen(url).read()
+        print >>sys.stderr,"Data read from NCBI @ %f (%f)" % (t,time.time()-t)
+        return data
+    
+    def setInterval(self,seconds):
+        EUtils._interval=seconds
+        
+
+class EFetch(EUtils):
+    '''
+    Simple wrapper around the NCBI EFetch service.
+    '''
+    def __init__(self,db,tool='OBITools',
+                 retmode='text',rettype="native",
+                 server='eutils.ncbi.nlm.nih.gov'):
+        EUtils.__init__(self)
+        self._url = "http://%s/entrez/eutils/efetch.fcgi?db=%s&tool=%s&retmode=%s&rettype=%s"
+        self._url = self._url % (server,db,tool,retmode,rettype)
+        
+        
+    def get(self,**args):
+        key = "&".join(['%s=%s' % x for x in args.items()])
+        return self._sendRequest(self._url +"&" + key)
+        
diff --git a/obitools/fast.py b/obitools/fast.py
new file mode 100644
index 0000000..760f493
--- /dev/null
+++ b/obitools/fast.py
@@ -0,0 +1,56 @@
+"""
+    Implement the fastn/fastp similarity search algorithm for BioSequence.
+"""
+
+class Fast(object):
+    
+    def __init__(self,seq,kup=2):
+        '''
+        @param seq: sequence to hash
+        @type seq: BioSequence
+        @param kup: word size used for the hashing process
+        @type kup: int
+        '''
+        hash={}
+        seq = str(seq)
+        for word,pos in ((seq[i:i+kup].upper(),i) for i in xrange(len(seq)-kup)):
+            if word in hash:
+                hash[word].append(pos)
+            else:
+                hash[word]=[pos]
+                
+        self._kup = kup
+        self._hash= hash
+        self._seq = seq
+        
+    def __call__(self,seq):
+        '''
+        Align one sequence with the fast hash table.
+        
+        @param seq: the sequence to align
+        @type seq: BioSequence
+        
+        @return: a tuple (smax,pmax) where smax is the
+                 score of the largest diagonal and pmax the
+                 list of associated shifts
+        @rtype: tuple
+        '''
+        histo={}
+        seq = str(seq).upper()
+        hash= self._hash
+        kup = self._kup
+        
+        for word,pos in ((seq[i:i+kup],i) for i in xrange(len(seq)-kup)):
+            matchedpos = hash.get(word,[])
+            for p in matchedpos:
+                delta = pos - p
+                histo[delta]=histo.get(delta,0) + 1
+        smax = max(histo.values())
+        pmax = [x for x in histo if histo[x]==smax]
+        return smax,pmax
+    
+    def __len__(self):
+        return len(self._seq)
+    
+
+
diff --git a/obitools/fasta/__init__.py b/obitools/fasta/__init__.py
new file mode 100644
index 0000000..d5b90c5
--- /dev/null
+++ b/obitools/fasta/__init__.py
@@ -0,0 +1,384 @@
+"""
+fasta module provides functions to read and write sequences in fasta format.
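# obitools/fast.py above scores the best diagonal of the fastn word-match
# dot-plot: each shared k-word at query position i and hashed position j votes
# for diagonal i - j, and the best-supported diagonal wins. A standalone
# sketch of the same counting (k = 2, mirroring the xrange(len - kup) bound
# above, which skips the final word):

def best_diagonal(query, subject, kup=2):
    words = {}
    for j in xrange(len(subject) - kup):
        words.setdefault(subject[j:j + kup].upper(), []).append(j)
    histo = {}
    for i in xrange(len(query) - kup):
        for j in words.get(query[i:i + kup].upper(), ()):
            histo[i - j] = histo.get(i - j, 0) + 1
    smax = max(histo.values())
    return smax, [d for d in histo if histo[d] == smax]

assert best_diagonal('ACGTACGT', 'TTACGTACGTTT') == (6, [-2])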
+ + +""" + +#from obitools.format.genericparser import fastGenericEntryIteratorGenerator +from obitools.format.genericparser import genericEntryIteratorGenerator +from obitools import bioSeqGenerator,BioSequence,AASequence,NucSequence +from obitools import _default_raw_parser + +#from obitools.alignment import alignmentReader +#from obitools.utils import universalOpen + +import re +from obitools.ecopcr.options import loadTaxonomyDatabase +from obitools.format import SequenceFileIterator + +#from _fasta import parseFastaDescription,fastaParser +#from _fasta import _fastaJoinSeq +#from _fasta import _parseFastaTag + + +#fastaEntryIterator=fastGenericEntryIteratorGenerator(startEntry='>') +fastaEntryIterator=genericEntryIteratorGenerator(startEntry='>') +rawFastaEntryIterator=genericEntryIteratorGenerator(startEntry='\s*>') + +def _fastaJoinSeq(seqarray): + return ''.join([x.strip() for x in seqarray]) + + +def parseFastaDescription(ds,tagparser): + + m = tagparser.search(' '+ds) + if m is not None: + info=m.group(0) + definition = ds[m.end(0):].strip() + else: + info=None + definition=ds + + return definition,info + +def fastaParser(seq,bioseqfactory,tagparser,rawparser,joinseq=_fastaJoinSeq): + ''' + Parse a fasta record. + + @attention: internal purpose function + + @param seq: a sequence object containing all lines corresponding + to one fasta sequence + @type seq: C{list} or C{tuple} of C{str} + + @param bioseqfactory: a callable object return a BioSequence + instance. + @type bioseqfactory: a callable object + + @param tagparser: a compiled regular expression usable + to identify key, value couples from + title line. + @type tagparser: regex instance + + @return: a C{BioSequence} instance + ''' + seq = seq.split('\n') + title = seq[0].strip()[1:].split(None,1) + id=title[0] + if len(title) == 2: + definition,info=parseFastaDescription(title[1], tagparser) + else: + info= None + definition=None + + seq=joinseq(seq[1:]) + return bioseqfactory(id, seq, definition,info,rawparser) + + +def fastaNucParser(seq,tagparser=_default_raw_parser,joinseq=_fastaJoinSeq): + return fastaParser(seq,NucSequence,tagparser=tagparser,joinseq=_fastaJoinSeq) + +def fastaAAParser(seq,tagparser=_default_raw_parser,joinseq=_fastaJoinSeq): + return fastaParser(seq,AASequence,tagparser=tagparser,joinseq=_fastaJoinSeq) + +def fastaIterator(file,bioseqfactory=bioSeqGenerator, + tagparser=_default_raw_parser, + joinseq=_fastaJoinSeq): + ''' + iterate through a fasta file sequence by sequence. + Returned sequences by this iterator will be BioSequence + instances + + @param file: a line iterator containing fasta data or a filename + @type file: an iterable object or str + @param bioseqfactory: a callable object return a BioSequence + instance. + @type bioseqfactory: a callable object + + @param tagparser: a compiled regular expression usable + to identify key, value couples from + title line. 
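# parseFastaDescription above prepends a space and searches for the longest
# leading run of 'key=value;' tags; whatever follows the match is the free
# text definition. A sketch with an explicit tag pattern (the real one is
# built from _default_raw_parser, which is defined outside this hunk):

import re

tagparser = re.compile('( *[a-zA-Z][a-zA-Z0-9_]* *= *[^;]+;)+')

def parse_description(ds):
    m = tagparser.search(' ' + ds)
    if m is not None:
        return ds[m.end(0):].strip(), m.group(0).strip()
    return ds, None

definition, info = parse_description('count=3; taxid=9606; Homo sapiens mRNA')
assert definition == 'Homo sapiens mRNA'
assert info == 'count=3; taxid=9606;'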
+ @type tagparser: regex instance + + @return: an iterator on C{BioSequence} instance + + @see: L{fastaNucIterator} + @see: L{fastaAAIterator} + + >>> from obitools.format.sequence.fasta import fastaIterator + >>> f = fastaIterator('monfichier') + >>> s = f.next() + >>> print s + gctagctagcatgctagcatgcta + >>> + ''' + rawparser=tagparser + allparser = tagparser % '[a-zA-Z][a-zA-Z0-9_]*' + tagparser = re.compile('( *%s)+' % allparser) + + for entry in fastaEntryIterator(file): + yield fastaParser(entry,bioseqfactory,tagparser,rawparser,joinseq) + +def rawFastaIterator(file,bioseqfactory=bioSeqGenerator, + tagparser=_default_raw_parser, + joinseq=_fastaJoinSeq): + + rawparser=tagparser + allparser = tagparser % '[a-zA-Z][a-zA-Z0-9_]*' + tagparser = re.compile('( *%s)+' % allparser) + + for entry in rawFastaEntryIterator(file): + entry=entry.strip() + yield fastaParser(entry,bioseqfactory,tagparser,rawparser,joinseq) + +def fastaNucIterator(file,tagparser=_default_raw_parser): + ''' + iterate through a fasta file sequence by sequence. + Returned sequences by this iterator will be NucSequence + instances + + @param file: a line iterator containint fasta data + @type file: an iterable object + + @param tagparser: a compiled regular expression usable + to identify key, value couples from + title line. + @type tagparser: regex instance + + @return: an iterator on C{NucBioSequence} instance + @rtype: a generator object + + @see: L{fastaIterator} + @see: L{fastaAAIterator} + ''' + return fastaIterator(file, NucSequence,tagparser) + +def fastaAAIterator(file,tagparser=_default_raw_parser): + ''' + iterate through a fasta file sequence by sequence. + Returned sequences by this iterator will be AASequence + instances + + @param file: a line iterator containing fasta data + @type file: an iterable object + + @param tagparser: a compiled regular expression usable + to identify key, value couples from + title line. + @type tagparser: regex instance + + @return: an iterator on C{AABioSequence} instance + + @see: L{fastaIterator} + @see: L{fastaNucIterator} + ''' + return fastaIterator(file, AASequence,tagparser) + +def formatFasta(data,gbmode=False,upper=False,restrict=None): + ''' + Convert a seqence or a set of sequences in a + string following the fasta format + + @param data: sequence or a set of sequences + @type data: BioSequence instance or an iterable object + on BioSequence instances + + @param gbmode: if set to C{True} identifier part of the title + line follows recommendation from nbci to allow + sequence indexing with the blast formatdb command. + @type gbmode: bool + + @param restrict: a set of key name that will be print in the formated + output. If restrict is set to C{None} (default) then + all keys are formated. 
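# formatFasta below folds sequences at 60 columns with a plain slice
# comprehension; the same idiom in isolation:

seq = 'acgt' * 40                                   # 160 bases
folded = '\n'.join(seq[x:x + 60] for x in xrange(0, len(seq), 60))
assert [len(line) for line in folded.split('\n')] == [60, 60, 40]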
+ @type restrict: any iterable value or None + + @return: a fasta formated string + @rtype: str + ''' + if isinstance(data, BioSequence): + data = [data] + + if restrict is not None and not isinstance(restrict, set): + restrict = set(restrict) + + rep = [] + for sequence in data: + seq = str(sequence) + if sequence.definition is None: + definition='' + else: + definition=sequence.definition + if upper: + frgseq = '\n'.join([seq[x:x+60].upper() for x in xrange(0,len(seq),60)]) + else: + frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)]) + info='; '.join(['%s=%s' % x + for x in sequence.rawiteritems() + if restrict is None or x[0] in restrict]) + if info: + info=info+';' + if sequence._rawinfo is not None and sequence._rawinfo: + info+=" " + sequence._rawinfo.strip() + + id = sequence.id + if gbmode: + if 'gi' in sequence: + id = "gi|%s|%s" % (sequence['gi'],id) + else: + id = "lcl|%s|" % (id) + title='>%s %s %s' %(id,info,definition) + rep.append("%s\n%s" % (title,frgseq)) + return '\n'.join(rep) + +def formatSAPFastaGenerator(options): + loadTaxonomyDatabase(options) + + taxonomy=None + if options.taxonomy is not None: + taxonomy=options.taxonomy + + assert taxonomy is not None,"SAP formating require indication of a taxonomy database" + + ranks = ('superkingdom', 'kingdom', 'subkingdom', 'superphylum', + 'phylum', 'subphylum', 'superclass', 'class', 'subclass', + 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', + 'parvorder', 'superfamily', 'family', 'subfamily', 'supertribe', 'tribe', + 'subtribe', 'supergenus', 'genus', 'subgenus', 'species group', + 'species subgroup', 'species', 'subspecies') + + trank=set(taxonomy._ranks) + ranks = [taxonomy._ranks.index(x) for x in ranks if x in trank] + + strict= options.strictsap + + def formatSAPFasta(data,gbmode=False,upper=False,restrict=None): + ''' + Convert a seqence or a set of sequences in a + string following the fasta format as recommended for the SAP + software + + http://ib.berkeley.edu/labs/slatkin/munch/StatisticalAssignmentPackage.html + + @param data: sequence or a set of sequences + @type data: BioSequence instance or an iterable object + on BioSequence instances + + @param gbmode: if set to C{True} identifier part of the title + line follows recommendation from nbci to allow + sequence indexing with the blast formatdb command. + @type gbmode: bool + + @param restrict: a set of key name that will be print in the formated + output. If restrict is set to C{None} (default) then + all keys are formated. 
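# formatSAPFasta below assembles a title of the shape
# '>id ; family: Hominidae, genus: Homo ; Homo sapiens'. The definition
# string alone, built here from a hypothetical rank -> name list standing in
# for the taxonomy.getTaxonAtRank() lookups:

lineage = [('family', 'Hominidae'), ('genus', 'Homo')]
definition = ' ;'
for rank, name in lineage:
    definition += ' %s: %s,' % (rank, name)
definition = '%s ; %s' % (definition[0:-1], 'Homo sapiens')
assert definition == ' ; family: Hominidae, genus: Homo ; Homo sapiens'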
+ @type restrict: any iterable value or None + + @return: a fasta formated string + @rtype: str + ''' + if isinstance(data, BioSequence): + data = [data] + + if restrict is not None and not isinstance(restrict, set): + restrict = set(restrict) + + rep = [] + for sequence in data: + seq = str(sequence) + + if upper: + frgseq = '\n'.join([seq[x:x+60].upper() for x in xrange(0,len(seq),60)]) + else: + frgseq = '\n'.join([seq[x:x+60] for x in xrange(0,len(seq),60)]) + + try: + taxid = sequence["taxid"] + except KeyError: + if strict: + raise AssertionError('All sequence must have a taxid') + else: + continue + + definition=' ;' + + for r in ranks: + taxon = taxonomy.getTaxonAtRank(taxid,r) + if taxon is not None: + definition+=' %s: %s,' % (taxonomy._ranks[r],taxonomy.getPreferedName(taxon)) + + definition='%s ; %s' % (definition[0:-1],taxonomy.getPreferedName(taxid)) + + id = sequence.id + if gbmode: + if 'gi' in sequence: + id = "gi|%s|%s" % (sequence['gi'],id) + else: + id = "lcl|%s|" % (id) + title='>%s%s' %(id,definition) + rep.append("%s\n%s" % (title,frgseq)) + return '\n'.join(rep) + + return formatSAPFasta + +class FastaIterator(SequenceFileIterator): + + + entryIterator = genericEntryIteratorGenerator(startEntry='>') + classmethod(entryIterator) + + def __init__(self,inputfile,bioseqfactory=bioSeqGenerator, + tagparser=_default_raw_parser, + joinseq=_fastaJoinSeq): + + SequenceFileIterator.__init__(self, inputfile, bioseqfactory) + + self.__file = FastaIterator.entryIterator(self._inputfile) + + self._tagparser = tagparser + self._joinseq = joinseq + + def get_tagparser(self): + return self.__tagparser + + + def set_tagparser(self, value): + self._rawparser = value + allparser = value % '[a-zA-Z][a-zA-Z0-9_]*' + self.__tagparser = re.compile('( *%s)+' % allparser) + + def _parseFastaDescription(self,ds): + + m = self._tagparser.search(' '+ds) + if m is not None: + info=m.group(0) + definition = ds[m.end(0):].strip() + else: + info=None + definition=ds + + return definition,info + + + def _parser(self): + ''' + Parse a fasta record. 
+ + @attention: internal purpose function + + @return: a C{BioSequence} instance + ''' + seq = self._seq.split('\n') + title = seq[0].strip()[1:].split(None,1) + id=title[0] + if len(title) == 2: + definition,info=self._parseFastaDescription(title[1]) + else: + info= None + definition=None + + seq=self._joinseq(seq[1:]) + + return self._bioseqfactory(id, seq, definition,info,self._rawparser) + + _tagparser = property(get_tagparser, set_tagparser, None, "_tagparser's docstring") diff --git a/obitools/fasta/_fasta.so b/obitools/fasta/_fasta.so new file mode 100755 index 0000000..de300ce Binary files /dev/null and b/obitools/fasta/_fasta.so differ diff --git a/obitools/fastq/__init__.py b/obitools/fastq/__init__.py new file mode 100644 index 0000000..1cf3535 --- /dev/null +++ b/obitools/fastq/__init__.py @@ -0,0 +1,190 @@ +''' +Created on 29 aout 2009 + +@author: coissac +''' + +from obitools import BioSequence +from obitools import _default_raw_parser +from obitools.format.genericparser import genericEntryIteratorGenerator +from obitools import bioSeqGenerator,AASequence,NucSequence +from obitools.fasta import parseFastaDescription +from _fastq import fastqQualitySangerDecoder,fastqQualitySolexaDecoder +from _fastq import qualityToSangerError,qualityToSolexaError +from _fastq import errorToSangerFastQStr +from _fastq import formatFastq +from _fastq import fastqParserGenetator +from obitools.utils import universalOpen + +import re + +fastqEntryIterator=genericEntryIteratorGenerator(startEntry='^@',endEntry="^\+",strip=True,join=False) + +#def fastqParserGenetator(fastqvariant='sanger',bioseqfactory=NucSequence,tagparser=_parseFastaTag): +# +# qualityDecoder,errorDecoder = {'sanger' : (fastqQualitySangerDecoder,qualityToSangerError), +# 'solexa' : (fastqQualitySolexaDecoder,qualityToSolexaError), +# 'illumina' : (fastqQualitySolexaDecoder,qualityToSangerError)}[fastqvariant] +# +# def fastqParser(seq): +# ''' +# Parse a fasta record. +# +# @attention: internal purpose function +# +# @param seq: a sequence object containing all lines corresponding +# to one fasta sequence +# @type seq: C{list} or C{tuple} of C{str} +# +# @param bioseqfactory: a callable object return a BioSequence +# instance. +# @type bioseqfactory: a callable object +# +# @param tagparser: a compiled regular expression usable +# to identify key, value couples from +# title line. +# @type tagparser: regex instance +# +# @return: a C{BioSequence} instance +# ''' +# +# title = seq[0][1:].split(None,1) +# id=title[0] +# if len(title) == 2: +# definition,info=parseFastaDescription(title[1], tagparser) +# else: +# info= {} +# definition=None +# +# quality=errorDecoder(qualityDecoder(seq[3])) +# +# seq=seq[1] +# +# seq = bioseqfactory(id, seq, definition,False,**info) +# seq.quality = quality +# +# return seq +# +# return fastqParser + + +def fastqIterator(file,fastqvariant='sanger',bioseqfactory=NucSequence,tagparser=_default_raw_parser): + ''' + iterate through a fasta file sequence by sequence. + Returned sequences by this iterator will be BioSequence + instances + + @param file: a line iterator containing fasta data or a filename + @type file: an iterable object or str + @param bioseqfactory: a callable object return a BioSequence + instance. + @type bioseqfactory: a callable object + + @param tagparser: a compiled regular expression usable + to identify key, value couples from + title line. 
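# The quality decoders imported above live in the compiled _fastq module,
# shipped as a binary (_fastq.so) in this patch, so they cannot be reviewed
# here. For reference, a pure-Python equivalent of the Sanger branch
# (ASCII offset 33, error probability 10**(-Q/10)):

def sanger_errors(qualstring):
    return [10.0 ** (-(ord(c) - 33) / 10.0) for c in qualstring]

assert sanger_errors('I!')[0] == 10.0 ** -4.0        # 'I' encodes Q40
assert sanger_errors('I!')[1] == 1.0                 # '!' encodes Q0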
+ @type tagparser: regex instance + + @return: an iterator on C{BioSequence} instance + + @see: L{fastaNucIterator} + @see: L{fastaAAIterator} + + ''' + fastqParser=fastqParserGenetator(fastqvariant, bioseqfactory, tagparser) + file = universalOpen(file) + for entry in fastqEntryIterator(file): + title=entry[0] + seq="".join(entry[1:-1]) + quality='' + lenseq=len(seq) + while (len(quality) < lenseq): + quality+=file.next().strip() + + yield fastqParser([title,seq,'+',quality]) + +def fastqSangerIterator(file,tagparser=_default_raw_parser): + ''' + iterate through a fastq file sequence by sequence. + Returned sequences by this iterator will be NucSequence + instances + + @param file: a line iterator containint fasta data + @type file: an iterable object + + @param tagparser: a compiled regular expression usable + to identify key, value couples from + title line. + @type tagparser: regex instance + + @return: an iterator on C{NucBioSequence} instance + + @see: L{fastqIterator} + @see: L{fastqAAIterator} + ''' + return fastqIterator(file,'sanger',NucSequence,tagparser) + +def fastqSolexaIterator(file,tagparser=_default_raw_parser): + ''' + iterate through a fastq file sequence by sequence. + Returned sequences by this iterator will be NucSequence + instances + + @param file: a line iterator containint fasta data + @type file: an iterable object + + @param tagparser: a compiled regular expression usable + to identify key, value couples from + title line. + @type tagparser: regex instance + + @return: an iterator on C{NucBioSequence} instance + + @see: L{fastqIterator} + @see: L{fastqAAIterator} + ''' + return fastqIterator(file,'solexa',NucSequence,tagparser) + +def fastqIlluminaIterator(file,tagparser=_default_raw_parser): + ''' + iterate through a fastq file sequence by sequence. + Returned sequences by this iterator will be NucSequence + instances + + @param file: a line iterator containint fasta data + @type file: an iterable object + + @param tagparser: a compiled regular expression usable + to identify key, value couples from + title line. + @type tagparser: regex instance + + @return: an iterator on C{NucBioSequence} instance + + @see: L{fastqIterator} + @see: L{fastqAAIterator} + ''' + return fastqIterator(file,'illumina',NucSequence,tagparser) + +def fastqAAIterator(file,tagparser=_default_raw_parser): + ''' + iterate through a fastq file sequence by sequence. + Returned sequences by this iterator will be AASequence + instances + + @param file: a line iterator containing fasta data + @type file: an iterable object + + @param tagparser: a compiled regular expression usable + to identify key, value couples from + title line. 
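# The three iterators above differ only in the (offset, transform) pair handed
# to fastqParserGenetator: sanger reads Q = ord(c) - 33 with p = 10**(-Q/10);
# solexa reads Q = ord(c) - 64 with p = 1/(10**(Q/10) + 1); illumina reads the
# solexa offset but applies the sanger transform. Assuming those conventional
# encodings, a one-function sketch:

def error_probability(char, variant='sanger'):
    if variant == 'sanger':
        return 10.0 ** (-(ord(char) - 33) / 10.0)
    q = ord(char) - 64
    if variant == 'solexa':
        return 1.0 / (10.0 ** (q / 10.0) + 1.0)
    return 10.0 ** (-q / 10.0)                       # illumina 1.3+

assert error_probability('@', 'solexa') == 0.5       # '@' is solexa Q0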
+ @type tagparser: regex instance + + @return: an iterator on C{AABioSequence} instance + + @see: L{fastqIterator} + @see: L{fastqNucIterator} + ''' + return fastqIterator(file,'sanger',AASequence,tagparser) + + diff --git a/obitools/fastq/_fastq.so b/obitools/fastq/_fastq.so new file mode 100755 index 0000000..4e3b942 Binary files /dev/null and b/obitools/fastq/_fastq.so differ diff --git a/obitools/fnaqual/__init__.py b/obitools/fnaqual/__init__.py new file mode 100644 index 0000000..384eb96 --- /dev/null +++ b/obitools/fnaqual/__init__.py @@ -0,0 +1,2 @@ + +fnaTag=' %s *= *([^\s]+)' diff --git a/obitools/fnaqual/fasta.py b/obitools/fnaqual/fasta.py new file mode 100644 index 0000000..102a13e --- /dev/null +++ b/obitools/fnaqual/fasta.py @@ -0,0 +1,8 @@ +from obitools.fasta import fastaNucIterator +from obitools.fnaqual import fnaTag + +def fnaFastaIterator(file): + + x = fastaNucIterator(file, fnaTag) + + return x \ No newline at end of file diff --git a/obitools/fnaqual/quality.py b/obitools/fnaqual/quality.py new file mode 100644 index 0000000..092f610 --- /dev/null +++ b/obitools/fnaqual/quality.py @@ -0,0 +1,137 @@ +""" + + +""" + +from obitools import _default_raw_parser +from obitools.fasta import fastaIterator +from obitools.fnaqual import fnaTag +from obitools.location import Location + +import re + + +class QualitySequence(list): + + def __init__(self,id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info): + ''' + + @param id: + @param seq: + @param definition: + ''' + list.__init__(self,seq) + self._info = info + self.definition=definition + self.id=id + self._rawinfo=' ' + rawinfo + self._rawparser=rawparser + + def getDefinition(self): + ''' + Sequence definition getter + + @return: the sequence definition + @rtype: str + + ''' + return self._definition + + def setDefinition(self, value): + self._definition = value + + def getId(self): + return self._id + + def setId(self, value): + self._id = value + + def getKey(self,key): + if key not in self._info: + p = re.compile(self._rawparser % key) + m = p.search(self._rawinfo) + if m is not None: + v=m.group(1) + self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):] + try: + v = eval(v) + except: + pass + self._info[key]=v + else: + raise KeyError,key + else: + v=self._info[key] + return v + + def __getitem__(self,key): + if isinstance(key,Location): + return key.extractSequence(self) + elif isinstance(key, str): + return self._getKey(key) + elif isinstance(key, int): + return list.__getitem__(self,key) + elif isinstance(key, slice): + subseq=list.__getitem__(self,key) + info = dict(self._info) + if key.start is not None: + start = key.start +1 + else: + start = 1 + if key.stop is not None: + stop = key.stop+1 + else: + stop = len(self) + if key.step is not None: + step = key.step + else: + step = 1 + + info['cut']='[%d,%d,%s]' % (start,stop,step) + return QualitySequence(self.id, subseq, self.definition,self._rawinfo,self._rawparser,**info) + + raise TypeError,'key must be an integer, a str or a slice' + + def __setitem__(self,key,value): + self._info[key]=value + + def __delitem__(self,key): + if isinstance(key, str): + del self._info[key] + else: + raise TypeError,key + + def __iter__(self): + return list.__iter__(self) + + def __contains__(self,key): + return key in self._info + + def getTags(self): + return self._info + + def complement(self): + ''' + + ''' + cseq = self[::-1] + rep = QualitySequence(self.id,cseq,self.definition,self._rawinfo,self._rawparser,**self._info) + 
rep._info['complemented']=not rep._info.get('complemented',False) + return rep + + + definition = property(getDefinition, setDefinition, None, "Sequence Definition") + + id = property(getId, setId, None, 'Sequence identifier') + + +def _qualityJoinSeq(seqarray): + text = ' '.join([x.strip() for x in seqarray]) + return [int(x) for x in text.split()] + +def qualityIterator(file): + for q in fastaIterator(file, QualitySequence, fnaTag, _qualityJoinSeq): + yield q + + + \ No newline at end of file diff --git a/obitools/format/__init__.py b/obitools/format/__init__.py new file mode 100644 index 0000000..a680505 --- /dev/null +++ b/obitools/format/__init__.py @@ -0,0 +1,28 @@ +from obitools import bioSeqGenerator +from obitools.utils import universalOpen + + +class SequenceFileIterator: + + def __init__(self,inputfile,bioseqfactory=bioSeqGenerator): + self._inputfile = universalOpen(inputfile) + self._bioseqfactory = bioseqfactory + + def get_inputfile(self): + return self.__file + + + def get_bioseqfactory(self): + return self.__bioseqfactory + + def next(self): + entry = self.inputfile.next() + return self._parse(entry) + + def __iter__(self): + return self + + _inputfile = property(get_inputfile, None, None, "_file's docstring") + _bioseqfactory = property(get_bioseqfactory, None, None, "_bioseqfactory's docstring") + + \ No newline at end of file diff --git a/obitools/format/_format.so b/obitools/format/_format.so new file mode 100755 index 0000000..92e460d Binary files /dev/null and b/obitools/format/_format.so differ diff --git a/obitools/format/genericparser/__init__.py b/obitools/format/genericparser/__init__.py new file mode 100644 index 0000000..fecc72f --- /dev/null +++ b/obitools/format/genericparser/__init__.py @@ -0,0 +1,217 @@ +""" +G{packagetree format} +""" +import re + +from obitools.utils import universalOpen + +def genericEntryIteratorGenerator(startEntry=None,endEntry=None, + head=False,tail=False, + strip=False,join=True): + ''' + Transfome a text line iterator to an entry oriented iterator. + + This iterator converted is useful to implement first stage + of flat file parsing. + + @param startEntry: a regular pattern matching the beginning of + an entry + @type startEntry: C{str} or None + @param endEntry: a regular pattern matching the end of + an entry + @type endEntry: C{str} or None + @param head: indicate if an header is present before + the first entry (as in many original genbank + files) + @type head: C{bool} + @param tail: indicate if some extra informations are present + after the last entry. 
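# _qualityJoinSeq above parses 454 .qual records, whose scores are
# whitespace-separated integers folded over several lines:

def quality_join(lines):
    return [int(x) for x in ' '.join(l.strip() for l in lines).split()]

assert quality_join(['40 40 39\n', '12 7\n']) == [40, 40, 39, 12, 7]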
+ @type tail: C{bool} + + @return: an iterator on entries in text format + @rtype: an iterator on C{str} + ''' + + def isBeginning(line): + return startEntry is None or startEntry.match(line) is not None + + def isEnding(line): + return ((endEntry is not None and endEntry.match(line) is not None) or + (endEntry is None and startEntry is not None and startEntry.match(line) is not None)) + + def transparentIteratorEntry(file): + file = universalOpen(file) + return file + + def genericEntryIterator(file): + file = universalOpen(file) + entry = [] + line = file.next() + started = head or isBeginning(line) + + try: + while 1: + while not started: + line = file.next() + started = isBeginning(line) + + if endEntry is None: + entry.append(line) + line = file.next() + + while started: + end = isEnding(line) + if end: + if endEntry is not None: + entry.append(line) + if join: + e = ''.join(entry) + if strip: + e=e.strip() + else: + e=entry + if strip: + e=[x.strip() for x in e] + entry=[] + yield e + started=False + if endEntry is not None: + line = file.next() + else: + entry.append(line) + line = file.next() + + started = isBeginning(line) + + except StopIteration: + if entry and (endEntry is None or tail): + if join: + e = ''.join(entry) + if strip: + e=e.strip() + else: + e=entry + if strip: + e=[x.strip() for x in e] + yield e + + + + if startEntry is not None: + startEntry = re.compile(startEntry) + if endEntry is not None: + endEntry = re.compile(endEntry) + + if startEntry is None and endEntry is None: + return transparentIteratorEntry + + return genericEntryIterator + + +class GenericParser(object): + + def __init__(self, + startEntry=None, + endEntry=None, + head=False, + tail=False, + strip=False, + **parseAction): + """ + @param startEntry: a regular pattern matching the beginning of + an entry + @type startEntry: C{str} or None + @param endEntry: a regular pattern matching the end of + an entry + @type endEntry: C{str} or None + @param head: indicate if an header is present before + the first entry (as in many original genbank + files) + @type head: C{bool} + @param tail: indicate if some extra informations are present + after the last entry. + @type tail: C{bool} + + @param parseAction: + + """ + self.flatiterator= genericEntryIteratorGenerator(startEntry, + endEntry, + head, + tail, + strip) + + self.action={} + + for k in parseAction: + self.addParseAction(k,*parseAction[k]) + + def addParseAction(self,name,dataMatcher,dataCleaner=None,cleanSub=''): + ''' + Add a parse action to the generic parser. A parse action + allows to extract one information from an entry. A parse + action is defined by a name and a method to extract this + information from the full text entry. + + A parse action can be defined following two ways. + + - via regular expression patterns + + - via dedicated function. + + In the first case, you have to indicate at least the + dataMatcher regular pattern. This pattern should match exactly + the data part you want to retrieve. If cleanning of extra + characters is needed. The second pattern dataCLeanner can be + used to specifyed these characters. + + In the second case you must provide a callable object (function) + that extract and clean data from the text entry. This function + should return an array containing all data retrevied even if + no data or only one data is retrevied. + + @summary: Add a parse action to the generic parser. 
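# genericEntryIteratorGenerator above is the first stage of every flat-file
# parser in the package. A usage sketch on an in-memory stream, assuming
# universalOpen passes file-like objects through unchanged:

from cStringIO import StringIO
from obitools.format.genericparser import genericEntryIteratorGenerator

entry_iterator = genericEntryIteratorGenerator(startEntry='>')
entries = list(entry_iterator(StringIO('>a\nACGT\n>b\nGGCC\n')))
assert entries == ['>a\nACGT\n', '>b\nGGCC\n']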
+ + @param name: name of the data extracted + @type name: C{str} + @param dataMatcher: a regular pattern matching the data + or a callable object parsing the + entry and returning a list of marched data + @type dataMatcher: C{str} or C{SRE_Pattern} instance or a callable + object + @param dataCleaner: a regular pattern matching part of the data + to suppress. + @type dataCleaner: C{str} or C{SRE_Pattern} instance or C{None} + @param cleanSub: string used to replace dataCleaner matches. + Default is an empty string + @type cleanSub: C{str} + + ''' + if callable(dataMatcher): + self.action[name]=dataMatcher + else : + if isinstance(dataMatcher, str): + dataMatcher=re.compile(dataMatcher) + if isinstance(dataCleaner, str): + dataCleaner=re.compile(dataCleaner) + self.action[name]=self._buildREParser(dataMatcher, + dataCleaner, + cleanSub) + + def _buildREParser(self,dataMatcher,dataCleaner,cleanSub): + def parser(data): + x = dataMatcher.findall(data) + if dataCleaner is not None: + x = [dataCleaner.sub(cleanSub,y) for y in x] + return x + return parser + + def __call__(self,file): + for e in self.flatiterator(file): + pe = {'fullentry':e} + for k in self.action: + pe[k]=self.action[k](e) + yield pe + + + \ No newline at end of file diff --git a/obitools/format/ontology/__init__.py b/obitools/format/ontology/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/obitools/format/ontology/go_obo.py b/obitools/format/ontology/go_obo.py new file mode 100644 index 0000000..cd1d87e --- /dev/null +++ b/obitools/format/ontology/go_obo.py @@ -0,0 +1,274 @@ +__docformat__ = 'restructuredtext' + +import re +import string +import textwrap + + +from obitools.obo.go.parser import GOEntryIterator +from obitools.obo.go.parser import GOTerm +from obitools.obo.go.parser import GOEntry + +""" +go_obo.py : gene_ontology_edit.obo file parser: +---------------------------------------------------- + +- OBOFile class: open a flat file and return an entry. + +""" +class OBOFile(object): + """ + Iterator over all entries of an OBO file + """ + + def __init__(self,_path): + self.file = GOEntryIterator(_path) + + def __iter__(self): + return self + + def next(self): + fiche = self.file.next() + + if isinstance(fiche, GOTerm): + self.isaterm=True + return Term(fiche) + elif isinstance(fiche, GOEntry): + self.isaterm=False + return Entry(fiche) + else: + self.isaterm=False + return Header(fiche) + + +############# tout le reste doit descendre a l'etage obitools/ogo/go/parser.py ########## + +# define an XRef into a go_obo.py script in the microbi pylib +class Xref(object): + """ + Class Xref + Xref.db Xref database + Xref.id Xref identifier + """ + + def __init__(self,description): + data = description.split(':') + self.db = data[0].strip() + self.id = data[1].strip() + +# define a RelatedTerm into a go_obo.py script in the microbi pylib +class RelatedTerm(object): + """ + Class RelatedTerm + RelatedTerm.relation RelatedTerm relation + RelatedTerm.related_term RelatedTerm GO identifier + RelatedTerm.comment all terms have 0 or 1 comment + """ + + def __init__(self,relation,value,comment): + self.relation = relation + self.related_term = value.strip('GO:') + self.comment = comment + + +# define into a go_obo.py script in the microbi pylib +#class Term(object): +# """ +# class representing an OBO term (entry). 
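# GenericParser above couples that entry iterator with named extractors; note
# each keyword argument must be a tuple, since the constructor calls
# addParseAction(k, *parseAction[k]). A usage sketch once the patch is applied
# (same universalOpen assumption as above):

from cStringIO import StringIO
from obitools.format.genericparser import GenericParser

parser = GenericParser(startEntry='^ID', ID=('^ID +([^ ;]+)',))
entries = list(parser(StringIO('ID   AB000001; SV 1;\nSQ  acgt\n')))
assert entries[0]['ID'] == ['AB000001']
assert entries[0]['fullentry'].startswith('ID')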
+# """ +# +# def __init__(self): +# raise RuntimeError('biodb.go_obo is an abstract class') +# +# def __checkEntry__(self): +# minimum=(hasattr(self,'goid') ) +# if not minimum: +# raise AssertionError('Misconstructed GO Term instance %s' % [x for x in dir(self) if x[0]!='_']) + +class Term(object): + """ + Class Term + representing a GO term. + """ + + def __init__(self,data=None): + """ + """ + self.data=data + self.isaterm = True + + if data: + self.__filtreGoid__() + self.__filtreName__() + self.__filtreComment__() + self.__filtreSynonyms__() + self.__filtreDef__() + self.__filtreParents__() + self.__filtreRelationships__() + self.__filtreRelation__() + self.__filtreObsolete__() + self.__filtreAltIds__() + self.__filtreXRefs__() + self.__filtreSubsets__() + + # check if all required attributes were valued + self.__checkEntry__() + + + def __checkEntry__(self): + minimum=(hasattr(self,'goid') ) + if not minimum: + raise AssertionError('Misconstructed GO Term instance %s' % [x for x in dir(self) if x[0]!='_']) + + + def __filtreGoid__(self): + """ + Extract GO id. + """ + self.goid = self.data.id.value.strip('GO:') + + def __filtreName__(self): + """ + Extract GO name. + """ + self.name = self.data.name.value + + def __filtreSynonyms__(self): + """ + Extract GO synonym(s). + """ + self.list_synonyms = {} + if self.data.synonyms: + for y in self.data.synonyms: + self.list_synonyms[y.value] = y.scope + + + def __filtreComment__(self): + """ + manage None comments + """ + if self.data.comment != None: + self.comment = self.data.comment.value + else: + self.comment = "" + + def __filtreDef__(self): + """ + Extract GO definition. + """ + if self.data.definition != None: + self.definition = self.data.definition.value + else: + self.definition = "" + + def __filtreParents__(self): + """ + To make the is_a hierarchy + """ + if self.data.is_a != None: + self.is_a = set([isa.value.strip('GO:') for isa in self.data.is_a]) + else: + self.is_a = set() + + def __filtreRelation__(self): + """ + To make the part_of hierarchy + """ + self.part_of = set() + self.regulates = set() + self.negatively_regulates = set() + self.positively_regulates = set() + + if self.data.relationship != None: + for rel in self.data.relationship: + if rel.relationship == "part_of": + self.part_of.add(rel.value.strip('GO:')) + elif rel.relationship == "regulates": + self.regulates.add(rel.value.strip('GO:')) + elif rel.relationship == "negatively_regulates": + self.negatively_regulates.add(rel.value.strip('GO:')) + elif rel.relationship == "positively_regulates": + self.positively_regulates.add(rel.value.strip('GO:')) + + + def __filtreRelationships__(self): + """ + Relation list with other GO Terms (is_a, part_of or some regulates relation) + """ + self.related_term =[] + if self.data.relationship != None: + for x in self.data.relationship: + self.related_term.append(RelatedTerm(x.relationship,x.value,x.__doc__)) + #self.related_term.append(RelatedTerm(x.relationship,x.value,x.comment)) + if self.data.is_a != None: + for x in self.data.is_a: + self.related_term.append(RelatedTerm('is_a',x.value,x.__doc__)) + #self.related_term.append(RelatedTerm('is_a',x.value,x.comment)) + + + + def __filtreObsolete__(self): + """ + for each obsolete terms corresponds a set of GO Identifiers + so that this GO term is consider as others GO Terms + """ + self.considers = set() + self.replaces = set() + self.is_obsolete = self.data.is_obsolete + if self.data.is_obsolete: + if self.data.consider: + self.considers = 
set([considered.value.strip('GO:') for considered in self.data.consider]) + if self.data.replaced_by: + self.replaces = set([replaced.value.strip('GO:') for replaced in self.data.replaced_by]) + + + def __filtreAltIds__(self): + """ + alternate(s) id(s) for this term (= alias in the geneontology schema model!) + """ + if self.data.alt_ids: + self.alt_ids = set([x.value.strip('GO:') for x in self.data.alt_ids]) + else: + self.alt_ids = set() + + def __filtreXRefs__(self): + """ + cross references to other databases + """ + self.xrefs = set() + if self.data.xrefs: + self.xrefs = set([Xref(x.value.reference) for x in self.data.xrefs]) + + + def __filtreSubsets__(self): + """ + subset label to make smaller sets of GO Terms + """ + self.subsets = set() + if self.data.subsets: + self.subsets = set([x.value for x in self.data.subsets]) + + +class Entry(object): + """ + a Stanza entry, like [Typedef] for example + """ + def __init__(self,data=None): + self.data=data + self.isaterm=False + self.isanentry=True + + +class Header(object): + """ + class representing a GO header. + """ + + def __init__(self,data=None): + """ + """ + self.data=data + self.isaterm = False + + + diff --git a/obitools/format/options.py b/obitools/format/options.py new file mode 100644 index 0000000..c42a23f --- /dev/null +++ b/obitools/format/options.py @@ -0,0 +1,284 @@ +''' +Created on 13 oct. 2009 + +@author: coissac +''' + +from obitools.format.sequence.embl import emblIterator +from obitools.format.sequence.genbank import genbankIterator +from obitools.format.sequence.fnaqual import fnaFastaIterator +from obitools.format.sequence.fasta import fastaAAIterator,fastaNucIterator,fastaIterator +from obitools.format.sequence.fastq import fastqIlluminaIterator,fastqSolexaIterator +from obitools.fastq import fastqSangerIterator +from obitools.fnaqual.quality import qualityIterator +from obitools.fasta import formatFasta, rawFastaIterator,\ + formatSAPFastaGenerator +from obitools.fastq import formatFastq + +from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter +from obitools.ecopcr.options import loadTaxonomyDatabase + +#from obitools.format._format import printOutput + +from array import array +from itertools import chain +import sys + +import re +from obitools.ecopcr import EcoPCRFile + + +def addInputFormatOption(optionManager): +# optionManager.add_option('--rank', +# action="store_true", dest='addrank', +# default=False, +# help="add a rank attribute to the sequence " +# "indicating the sequence position in the input data") + optionManager.add_option('--genbank', + action="store_const", dest="seqinformat", + default=None, + const='genbank', + help="input file is in genbank format") + optionManager.add_option('--embl', + action="store_const", dest="seqinformat", + default=None, + const='embl', + help="input file is in embl format") + + optionManager.add_option('--fasta', + action="store_const", dest="seqinformat", + default=None, + const='fasta', + help="input file is in fasta nucleic format (including obitools fasta extentions)") + + optionManager.add_option('--ecopcr', + action="store_const", dest="seqinformat", + default=None, + const='ecopcr', + help="input file is in fasta nucleic format (including obitools fasta extentions)") + + optionManager.add_option('--raw-fasta', + action="store_const", dest="seqinformat", + default=None, + const='rawfasta', + help="input file is in fasta format (but more tolerant to format variant)") + + optionManager.add_option('--fna', + action="store_const", dest="seqinformat", + 
default=None, + const='fna', + help="input file is in fasta nucleic format produced by 454 sequencer pipeline") + + optionManager.add_option('--qual', + action="store", dest="withqualfile", + type='str', + default=None, + help="Specify the name of a quality file produced by 454 sequencer pipeline") + + optionManager.add_option('--sanger', + action="store_const", dest="seqinformat", + default=None, + const='sanger', + help="input file is in sanger fastq nucleic format (standard fastq)") + + optionManager.add_option('--solexa', + action="store_const", dest="seqinformat", + default=None, + const='solexa', + help="input file is in fastq nucleic format produced by solexa sequencer") + + optionManager.add_option('--illumina', + action="store_const", dest="seqinformat", + default=None, + const='illumina', + help="input file is in fastq nucleic format produced by old solexa sequencer") + + optionManager.add_option('--nuc', + action="store_const", dest="moltype", + default=None, + const='nuc', + help="input file is nucleic sequences") + optionManager.add_option('--prot', + action="store_const", dest="moltype", + default=None, + const='pep', + help="input file is protein sequences") + + +def addOutputFormatOption(optionManager): + optionManager.add_option('--fastq-output', + action="store_const", dest="output", + default=None, + const=formatFastq, + help="output sequences in sanger fastq format") + optionManager.add_option('--fasta-output', + action="store_const", dest="output", + default=None, + const=formatFasta, + help="output sequences in obitools fasta format") + optionManager.add_option('--sap-output', + action="store_const", dest="output", + default=None, + const=formatSAPFastaGenerator, + help="output sequences in sap fasta format") + optionManager.add_option('--strict-sap', + action='store_true',dest='strictsap', + default=False, + help="Print sequences in upper case (defualt is lower case)") + optionManager.add_option('--ecopcr-output', + action="store", dest="ecopcroutput", + default=None, + help="output sequences in obitools ecopcr format") + optionManager.add_option('--uppercase', + action='store_true',dest='uppercase', + default=False, + help="Print sequences in upper case (defualt is lower case)") + + + +def addInOutputOption(optionManager): + addInputFormatOption(optionManager) + addOutputFormatOption(optionManager) + + + + + +def autoEntriesIterator(options): + options.outputFormater=formatFasta + options.outputFormat="fasta" + + ecopcr_pattern = re.compile('^[^ ]+ +| +[0-9]+ +| + [0-9]+ + | +') + + def annotatedIterator(formatIterator): + options.outputFormater=formatFasta + options.outputFormat="fasta" + def iterator(lineiterator): + for s in formatIterator(lineiterator): + s.extractTaxon() + yield s + + return iterator + + def withQualIterator(qualityfile): + options.outputFormater=formatFastq + options.outputFormat="fastq" + def iterator(lineiterator): + for s in fnaFastaIterator(lineiterator): + q = qualityfile.next() + quality = array('d',(10.**(-x/10.) 
for x in q)) + s.quality=quality + yield s + + return iterator + + def autoSequenceIterator(lineiterator): + options.outputFormater=formatFasta + options.outputFormat="fasta" + first = lineiterator.next() + if first[0]==">": + if options.withqualfile is not None: + qualfile=qualityIterator(options.withqualfile) + reader=withQualIterator(qualfile) + options.outputFormater=formatFastq + options.outputFormat="fastq" + elif options.moltype=='nuc': + reader=fastaNucIterator + elif options.moltype=='pep': + reader=fastaAAIterator + else: + reader=fastaIterator + elif first[0]=='@': + reader=fastqSangerIterator + options.outputFormater=formatFastq + options.outputFormat="fastq" + elif first[0:3]=='ID ': + reader=emblIterator + elif first[0:6]=='LOCUS ': + reader=genbankIterator + elif first[0]=="#" or ecopcr_pattern.search(first): + reader=EcoPCRFile + else: + raise AssertionError,'file is not in fasta, fasta, embl, genbank or ecoPCR format' + + input = reader(chain([first],lineiterator)) + + return input + + if options.seqinformat is None: + reader = autoSequenceIterator + else: + if options.seqinformat=='fasta': + if options.moltype=='nuc': + reader=fastaNucIterator + elif options.moltype=='pep': + reader=fastaAAIterator + else: + reader=fastaIterator + elif options.seqinformat=='rawfasta': + reader=annotatedIterator(rawFastaIterator) + elif options.seqinformat=='genbank': + reader=annotatedIterator(genbankIterator) + elif options.seqinformat=='embl': + reader=annotatedIterator(emblIterator) + elif options.seqinformat=='fna': + reader=fnaFastaIterator + elif options.seqinformat=='sanger': + options.outputFormater=formatFastq + options.outputFormat="fastq" + reader=fastqSangerIterator + elif options.seqinformat=='solexa': + options.outputFormater=formatFastq + options.outputFormat="fastq" + reader=fastqSolexaIterator + elif options.seqinformat=='illumina': + options.outputFormater=formatFastq + options.outputFormat="fastq" + reader=fastqIlluminaIterator + elif options.seqinformat=='ecopcr': + reader=EcoPCRFile + + if options.seqinformat=='fna' and options.withqualfile is not None: + qualfile=qualityIterator(options.withqualfile) + reader=withQualIterator(qualfile) + options.outputFormater=formatFastq + options.outputFormat="fastq" + +# if options.addrank: +# reader = withRankIterator(reader) + return reader + +def sequenceWriterGenerator(options,output=sys.stdout): + class SequenceWriter: + def __init__(self,options,file=sys.stdout): + self._format=None + self._file=file + self._upper=options.uppercase + def put(self,seq): + if self._format is None: + self._format=formatFasta + if options.output is not None: + self._format=options.output + if self._format is formatSAPFastaGenerator: + self._format=formatSAPFastaGenerator(options) + elif options.outputFormater is not None: + self._format=options.outputFormater + s = self._format(seq,upper=self._upper) + try: + self._file.write(s) + self._file.write("\n") + except IOError: + sys.exit(0) + + if options.ecopcroutput is not None: + taxo = loadTaxonomyDatabase(options) + writer=EcoPCRDBSequenceWriter(options.ecopcroutput,taxonomy=taxo) + else: + writer=SequenceWriter(options,output) + + def sequenceWriter(sequence): + writer.put(sequence) + + return sequenceWriter + + \ No newline at end of file diff --git a/obitools/format/sequence/__init__.py b/obitools/format/sequence/__init__.py new file mode 100644 index 0000000..3918761 --- /dev/null +++ b/obitools/format/sequence/__init__.py @@ -0,0 +1,24 @@ +from obitools.fasta import fastaIterator +from 
obitools.fastq import fastqSangerIterator +from obitools.seqdb.embl.parser import emblIterator +from obitools.seqdb.genbank.parser import genbankIterator +from itertools import chain +from obitools.utils import universalOpen + +def autoSequenceIterator(file): + lineiterator = universalOpen(file) + first = lineiterator.next() + if first[0]==">": + reader=fastaIterator + elif first[0]=='@': + reader=fastqSangerIterator + elif first[0:3]=='ID ': + reader=emblIterator + elif first[0:6]=='LOCUS ': + reader=genbankIterator + else: + raise AssertionError,'file is not in fasta, fasta, embl, or genbank format' + + input = reader(chain([first],lineiterator)) + + return input diff --git a/obitools/format/sequence/embl.py b/obitools/format/sequence/embl.py new file mode 100644 index 0000000..f59f14a --- /dev/null +++ b/obitools/format/sequence/embl.py @@ -0,0 +1,2 @@ +from obitools.seqdb.embl.parser import emblIterator,emblParser + diff --git a/obitools/format/sequence/fasta.py b/obitools/format/sequence/fasta.py new file mode 100644 index 0000000..1d7bd49 --- /dev/null +++ b/obitools/format/sequence/fasta.py @@ -0,0 +1,4 @@ +from obitools.fasta import fastaIterator,fastaParser +from obitools.fasta import fastaAAIterator,fastaAAParser +from obitools.fasta import fastaNucIterator,fastaNucParser +from obitools.fasta import formatFasta diff --git a/obitools/format/sequence/fastq.py b/obitools/format/sequence/fastq.py new file mode 100644 index 0000000..54fdf89 --- /dev/null +++ b/obitools/format/sequence/fastq.py @@ -0,0 +1,13 @@ +''' +Created on 15 janv. 2010 + +@author: coissac +''' + +from obitools.fastq import fastqIterator,fastqParserGenetator +from obitools.fastq import fastqSangerIterator,fastqSolexaIterator, \ + fastqIlluminaIterator +from obitools.fastq import fastqAAIterator +from obitools.fastq import formatFastq + + diff --git a/obitools/format/sequence/fnaqual.py b/obitools/format/sequence/fnaqual.py new file mode 100644 index 0000000..ab69916 --- /dev/null +++ b/obitools/format/sequence/fnaqual.py @@ -0,0 +1,8 @@ +''' +Created on 12 oct. 
2009 + +@author: coissac +''' + +from obitools.fnaqual.fasta import fnaFastaIterator +from obitools.fnaqual.quality import qualityIterator diff --git a/obitools/format/sequence/genbank.py b/obitools/format/sequence/genbank.py new file mode 100644 index 0000000..8524b6f --- /dev/null +++ b/obitools/format/sequence/genbank.py @@ -0,0 +1,4 @@ +from obitools.seqdb.genbank.parser import genpepIterator,genpepParser +from obitools.seqdb.genbank.parser import genbankIterator,genbankParser + + diff --git a/obitools/format/sequence/tagmatcher.py b/obitools/format/sequence/tagmatcher.py new file mode 100644 index 0000000..60ad8d8 --- /dev/null +++ b/obitools/format/sequence/tagmatcher.py @@ -0,0 +1,5 @@ +from obitools.tagmatcher.parser import tagMatcherParser +from obitools.tagmatcher.parser import TagMatcherIterator +from obitools.tagmatcher.parser import formatTagMatcher + +tagMatcherIterator=TagMatcherIterator diff --git a/obitools/goa/__init__.py b/obitools/goa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/obitools/goa/parser.py b/obitools/goa/parser.py new file mode 100644 index 0000000..8ffd1e3 --- /dev/null +++ b/obitools/goa/parser.py @@ -0,0 +1,33 @@ +from itertools import imap +from obitools import utils + +class GoAFileIterator(utils.ColumnFile): + def __init__(self,stream): + utils.ColumnFile.__init__(self, + stream, '\t', True, + (str,)) + + _colname = ['database', + 'ac', + 'symbol', + 'qualifier', + 'goid', + 'origin', + 'evidence', + 'evidnce_origine', + 'namespace', + 'db_object_name', + 'gene', + 'object_type', + 'taxid', + 'date', + 'assigned_by'] + + def next(self): + data = utils.ColumnFile.next(self) + data = dict(imap(None,GoAFileIterator._colname,data)) + + return data + + + diff --git a/obitools/graph/__init__.py b/obitools/graph/__init__.py new file mode 100644 index 0000000..fbc5253 --- /dev/null +++ b/obitools/graph/__init__.py @@ -0,0 +1,962 @@ +''' +**obitool.graph** for representing graph structure in obitools +-------------------------------------------------------------- + +.. codeauthor:: Eric Coissac + + +This module offert classes to manipulate graphs, mainly trough the +:py:class:`obitools.graph.Graph` class. + +.. inheritance-diagram:: Graph DiGraph UndirectedGraph + :parts: 2 + +''' + +import sys + + +from obitools.utils import progressBar + + +class Indexer(dict): + ''' + Allow to manage convertion between an arbitrarly hashable python + value and an unique integer key + ''' + + def __init__(self): + + self.__max=0 + self.__reverse=[] + + def getLabel(self,index): + ''' + Return the python value associated to an integer index. + + :param index: an index value + :type index: int + + :raises: IndexError if the index is not used in this + Indexer instance + ''' + return self.__reverse[index] + + def getIndex(self,key,strict=False): + ''' + Return the index associated to a **key** in the indexer. Two + modes are available : + + - strict mode : + + if the key is not known by the :py:class:`Indexer` instance + a :py:exc:`KeyError` exception is raised. 
+ + - non strict mode : + + in this mode if the requested *key** is absent, it is added to + the :py:class:`Indexer` instance and the new index is returned + + :param key: the requested key + :type key: a hashable python value + + :param strict: select the looking for mode + :type strict: bool + + :return: the index corresponding to the key + :rtype: int + + :raises: - :py:exc:`KeyError` in strict mode is key is absent + of the :py:class:`Indexer` instance + + - :py:exc:`TypeError` if key is not an hashable value. + ''' + if dict.__contains__(self,key): + return dict.__getitem__(self,key) + elif strict: + raise KeyError,key + else: + value = self.__max + self[key]= value + self.__reverse.append(key) + self.__max+=1 + return value + + def __getitem__(self,key): + ''' + Implement the [] operateor to emulate the standard dictionnary + behaviour on :py:class:`Indexer` and returns the integer key + associated to a python value. + + Actually this method call the:py:meth:`getIndex` method in + non strict mode so it only raises an :py:exc:`TypeError` + if key is not an hashable value. + + :param key: the value to index + :type key: an hashable python value + + :return: an unique integer value associated to the key + :rtype: int + + :raises: :py:exc:`TypeError` if **key** is not an hashable value. + + ''' + return self.getIndex(key) + + def __equal__(self,index): + ''' + Implement equal operator **==** for comparing two :py:class:`Indexer` instances. + Two :py:class:`Indexer` instances are equals only if they are physically + the same instance + + :param index: the second Indexer + :type index: an :py:class:`Indexer` instance + + :return: True is the two :py:class:`Indexer` instances are the same + :rtype: bool + ''' + return id(self)==id(index) + + +class Graph(object): + ''' + Class used to represent directed or undirected graph. + + .. warning:: + + Only one edge can connect two nodes in a given direction. + + .. warning:: + + Specifying nodes through their index seepud your code but as no check + is done on index value, it may result in inconsistency. So prefer the + use of node label to specify a node. + + + ''' + def __init__(self,label='G',directed=False,indexer=None,nodes=None,edges=None): + ''' + :param label: Graph name, set to 'G' by default + :type label: str + + :param directed: true for directed graph, set to False by defalt + :type directed: boolean + + :param indexer: node label indexer. This allows to define several graphs + sharing the same indexer (see : :py:meth:`newEmpty`) + :type indexer: :py:class:`Indexer` + + :param nodes: set of nodes to add to the graph + :type nodes: iterable value + + :param edges: set of edges to add to the graph + :type edges: iterable value + ''' + + self._directed=directed + if indexer is None: + indexer = Indexer() + self._index = indexer + self._node = {} + self._node_attrs = {} + self._edge_attrs = {} + self._label=label + + def newEmpty(self): + """ + Build a new empty graph using the same :py:class:`Indexer` instance. + This allows two graph for sharing their vertices through their indices. + """ + n = Graph(self._label+"_compact",self._directed,self._index) + + return n + + def addNode(self,node=None,index=None,**data): + ''' + Add a new node or update an existing one. + + :param node: the new node label or the label of an existing node + for updating it. + :type node: an hashable python value + + :param index: the index of an existing node for updating it. 
+ :type index: int + + :return: the index of the node + :rtype: int + + :raises: :py:exc:`IndexError` is index is not **None** and + corresponds to a not used index in this graph. + ''' + if index is None: + index = self._index[node] + + if index not in self._node: + self._node[index]=set() + else: + if index not in self._node: + raise IndexError,"This index is not used in this graph" + + if data: + if index in self._node_attrs: + self._node_attrs[index].update(data) + else: + self._node_attrs[index]=dict(data) + + return index + + def __contains__(self,node): + try: + index = self._index.getIndex(node,strict=True) + r = index in self._node + except KeyError: + r=False + return r + + def getNode(self,node=None,index=None): + """ + :param node: a node label. + :type node: an hashable python value + + :param index: the index of an existing node. + :type index: int + + .. note:: Index value are prevalent over node label. + + :return: the looked for node + :rtype: :py:class:`Node` + + :raises: :py:exc:`IndexError` if specified node lablel + corresponds to a non-existing node. + + .. warning:: no check on index value + """ + if index is None: + index = self._index.getIndex(node, True) + return Node(index,self) + + def getBestNode(self,estimator): + ''' + Select the node maximizing the estimator function + + :param estimator: the function to maximize + :type estimator: a function returning a numerical value and accepting one + argument of type :py:class:`Node` + + :return: the best node + :rtype: py:class:`Node` + ''' + + bestScore=0 + best=None + for n in self: + score = estimator(n) + if best is None or score > bestScore: + bestScore = score + best=n + return best + + + def delNode(self,node=None,index=None): + """ + Delete a node from a graph and all associated edges. + + :param node: a node label. + :type node: an hashable python value + + :param index: the index of an existing node. + :type index: int + + .. note:: Index value are prevalent over node label. + + :raises: :py:exc:`IndexError` if specified node lablel + corresponds to a non-existing node. + + .. warning:: no check on index value + """ + if index is None: + index = self._index[node] + + for n in self._node: + if n!=index: + e = self._node[n] + if index in e: + if (n,index) in self._edge_attrs: + del self._edge_attrs[(n,index)] + e.remove(index) + + e = self._node[index] + + for n in e: + if (index,n) in self._edge_attrs: + del self._edge_attrs[(index,n)] + + del self._node[index] + if index in self._node_attrs: + del self._node_attrs[index] + + + def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data): + ''' + Create a new edge in the graph between both the specified nodes. + + .. note:: Nodes can be specified using their label or their index in the graph + if both values are indicated the index is used. + + :param node1: The first vertex label + :type node1: an hashable python value + :param node2: The second vertex label + :type node2: an hashable python value + :param index1: The first vertex index + :type index1: int + :param index2: The second vertex index + :type index2: int + + :raises: :py:exc:`IndexError` if one of both the specified node lablel + corresponds to a non-existing node. + + + .. 
warning:: no check on index value + ''' + + index1=self.addNode(node1, index1) + index2=self.addNode(node2, index2) + + self._node[index1].add(index2) + + if not self._directed: + self._node[index2].add(index1) + + if data: + if (index1,index2) not in self._edge_attrs: + data =dict(data) + self._edge_attrs[(index1,index2)]=data + if not self._directed: + self._edge_attrs[(index2,index1)]=data + else: + self._edge_attrs[(index2,index1)].update(data) + + return (index1,index2) + + def getEdge(self,node1=None,node2=None,index1=None,index2=None): + ''' + Extract the :py:class:`Edge` instance linking two nodes of the graph. + + .. note:: Nodes can be specified using their label or their index in the graph + if both values are indicated the index is used. + + :param node1: The first vertex label + :type node1: an hashable python value + :param node2: The second vertex label + :type node2: an hashable python value + :param index1: The first vertex index + :type index1: int + :param index2: The second vertex index + :type index2: int + + :raises: :py:exc:`IndexError` if one of both the specified node lablel + corresponds to a non-existing node. + + + .. warning:: no check on index value + ''' + node1=self.getNode(node1, index1) + node2=self.getNode(node2, index2) + return Edge(node1,node2) + + def delEdge(self,node1=None,node2=None,index1=None,index2=None): + """ + Delete the edge linking node 1 to node 2. + + .. note:: Nodes can be specified using their label or their index in the graph + if both values are indicated the index is used. + + + :param node1: The first vertex label + :type node1: an hashable python value + :param node2: The second vertex label + :type node2: an hashable python value + :param index1: The first vertex index + :type index1: int + :param index2: The second vertex index + :type index2: int + + :raises: :py:exc:`IndexError` if one of both the specified node lablel + corresponds to a non-existing node. + + + .. warning:: no check on index value + """ + if index1 is None: + index1 = self._index[node1] + if index2 is None: + index2 = self._index[node2] + if index1 in self._node and index2 in self._node[index1]: + self._node[index1].remove(index2) + if (index1,index2) in self._node_attrs: + del self._node_attrs[(index1,index2)] + if not self._directed: + self._node[index2].remove(index1) + if (index2,index1) in self._node_attrs: + del self._node_attrs[(index2,index1)] + + def edgeIterator(self,predicate=None): + """ + Iterate through a set of selected vertices. + + :param predicate: a function allowing node selection. Default value + is **None** and indicate that all nodes are selected. + :type predicate: a function returning a boolean value + and accepting one argument of class :py:class:`Edge` + + :return: an iterator over selected edge + :rtype: interator over :py:class:`Edge` instances + + .. seealso:: + function :py:func:`selectEdgeAttributeFactory` for simple predicate. + + """ + for n1 in self._node: + for n2 in self._node[n1]: + if self._directed or n1 <= n2: + e = self.getEdge(index1=n1, index2=n2) + if predicate is None or predicate(e): + yield e + + + def nodeIterator(self,predicate=None): + """ + Iterate through a set of selected vertices. + + :param predicate: a function allowing edge selection. Default value + is **None** and indicate that all edges are selected. + :type predicate: a function returning a boolean value + and accepting one argument of class :py:class:`Node` + + :return: an iterator over selected nodes. 
+ :rtype: interator over :py:class:`Node` instances + + """ + for n in self._node: + node = self.getNode(index=n) + if predicate is None or predicate(node): + yield node + + def nodeIndexIterator(self,predicate=None): + """ + Iterate through the indexes of a set of selected vertices. + + :param predicate: a function allowing edge selection. Default value + is **None** and indicate that all edges are selected. + :type predicate: a function returning a boolean value + and accepting one argument of class :py:class:`Node` + + :return: an iterator over selected node indices. + :rtype: interator over `int` + + """ + for n in self._node: + node = self.getNode(index=n) + if predicate is None or predicate(node): + yield n + + def neighbourIndexSet(self,node=None,index=None): + if index is None: + index=self.getNode(node).index + return self._node[index] + + def edgeCount(self): + n = reduce(lambda x,y:x+y, (len(z) for z in self._node.itervalues()),0) + if not self._directed: + n=n/2 + return n + + def subgraph(self,nodes,name='G'): + sub = Graph(name,self._directed,self._index) + if not isinstance(nodes, set): + nodes = set(nodes) + for n in nodes: + sub._node[n]=nodes & self._node[n] + if n in self._node_attrs: + sub._node_attrs[n]=dict(self._node_attrs[n]) + for n2 in sub._node[n]: + if not self._directed: + if n <= n2: + if (n,n2) in self._edge_attrs: + data=dict(self._edge_attrs[(n,n2)]) + sub._edge_attrs[(n,n2)]=data + sub._edge_attrs[(n2,n)]=data + else: + if (n,n2) in self._edge_attrs: + data=dict(self._edge_attrs[(n,n2)]) + sub._edge_attrs[(n,n2)]=data + return sub + + def __len__(self): + return len(self._node) + + def __getitem__(self,key): + return self.getNode(node=key) + + def __delitem__(self,key): + self.delNode(node=key) + + def __iter__(self): + return self.nodeIterator() + + def __str__(self): + if self._directed: + kw ='digraph' + else: + kw='graph' + + nodes = "\n ".join([str(x) for x in self]) + edges = "\n ".join([str(x) for x in self.edgeIterator()]) + + return "%s %s {\n %s\n\n %s\n}" % (kw,self._label,nodes,edges) + +class Node(object): + """ + Class used for representing one node or vertex in a graph + + """ + def __init__(self,index,graph): + ''' + .. warning:: + + :py:class:`Node` constructor is usualy called through the :py:class:`Graph` methods + + :param index: Index of the node in the graph + :type index: int + :param graph: graph instance owning the node + :type graph: :py:class:`obitools.graph.Graph` + ''' + self.index = index + self.__graph = graph + + def getGraph(self): + ''' + return graph owning this node. + + :rtype: :py:class:`obitools.graph.Graph` + ''' + return self.__graph + + + def getLabel(self): + ''' + return label associated to this node. + ''' + return self.__graph._index.getLabel(self.index) + + + def has_key(self,key): + ''' + test is the node instance has a property named 'key'. + + :param key: the name of a property + :type key: str + + :return: True if the nade has a property named + :rtype: bool + ''' + if self.index in self.__graph._node_attrs: + return key in self.__graph._node_attrs[self.index] + else: + return False + + def neighbourIterator(self,nodePredicat=None,edgePredicat=None): + ''' + iterate through the nodes directly connected to + this node. + + :param nodePredicat: a function accepting one node as parameter + and returning **True** if this node must be + returned by the iterator. 
+ :type nodePredicat: function + + :param edgePredicat: a function accepting one edge as parameter + and returning True if the edge linking self and + the current must be considered. + :type edgePredicat: function + + + :rtype: iterator on Node instances + ''' + for n in self.neighbourIndexIterator(nodePredicat, edgePredicat): + node = self.graph.getNode(index=n) + yield node + + def neighbourIndexSet(self): + ''' + Return a set of node indexes directely connected + to this node. + + .. warning:: + + do not change this set unless you know + exactly what you do. + + @rtype: set of int + ''' + return self.__graph._node[self.index] + + def neighbourIndexIterator(self,nodePredicat=None,edgePredicat=None): + ''' + iterate through the node indexes directly connected to + this node. + + :param nodePredicat: a function accepting one node as parameter + and returning True if this node must be + returned by the iterator. + :type nodePredicat: function + + :param edgePredicat: a function accepting one edge as parameter + and returning True if the edge linking self and + the current must be considered. + :type edgePredicat: function + + :rtype: iterator on int + ''' + for n in self.neighbourIndexSet(): + if nodePredicat is None or nodePredicat(self.__graph.getNode(index=n)): + if edgePredicat is None or edgePredicat(self.__graph.getEdge(index1=self.index,index2=n)): + yield n + + def degree(self,nodeIndexes=None): + ''' + return count of edges linking this node to the + set of nodes describes by their index in nodeIndexes + + :param nodeIndexes: set of node indexes. + if set to None, all nodes of the + graph are take into account. + Set to None by default. + :type nodeIndexes: set of int + + :rtype: int + ''' + if nodeIndexes is None: + return len(self.__graph._node[self.index]) + else: + return len(self.__graph._node[self.index] & nodeIndexes) + + def componentIndexSet(self,nodePredicat=None,edgePredicat=None): + ''' + Return the set of node index in the same connected component. + + :param nodePredicat: a function accepting one node as parameter + and returning True if this node must be + returned by the iterator. + :type nodePredicat: function + + :param edgePredicat: a function accepting one edge as parameter + and returning True if the edge linking self and + the current must be considered. + :type edgePredicat: function + + + :rtype: set of int + ''' + cc=set([self.index]) + added = set(x for x in self.neighbourIndexIterator(nodePredicat, edgePredicat)) + while added: + cc |= added + added = reduce(lambda x,y : x | y, + (set(z for z in self.graph.getNode(index=c).neighbourIndexIterator(nodePredicat, edgePredicat)) + for c in added), + set()) + added -= cc + return cc + + def componentIterator(self,nodePredicat=None,edgePredicat=None): + ''' + Iterate through the nodes in the same connected + component. + + :rtype: iterator on :py:class:`Node` instance + ''' + for c in self.componentIndexSet(nodePredicat, edgePredicat): + yield self.graph.getNode(c) + + def shortestPathIterator(self,nodes=None): + ''' + Iterate through the shortest path sourcing + from this node. 
if nodes is not None, iterates + only path linkink this node to one node listed in + nodes + + :param nodes: set of node index + :type nodes: iterable on int + + :return: an iterator on list of int describing path + :rtype: iterator on list of int + ''' + if nodes is not None: + nodes = set(nodes) + + + Q=[(self.index,-1)] + + gray = set([self.index]) + paths = {} + + while Q and (nodes is None or nodes): + u,p = Q.pop() + paths[u]=p + next = self.graph._node[u] - gray + gray|=next + Q.extend((x,u) for x in next) + if nodes is None or u in nodes: + if nodes: + nodes.remove(u) + path = [u] + while p >= 0: + path.append(p) + p = paths[p] + path.reverse() + yield path + + def shortestPathTo(self,node=None,index=None): + ''' + return one of the shortest path linking this + node to specified node. + + :param node: a node label or None + :param index: a node index or None. the parameter index + has a priority on the parameter node. + :type index: int + + :return: list of node index corresponding to the path or None + if no path exists. + :rtype: list of int or None + ''' + if index is None: + index=self.graph.getNode(node).index + for p in self.shortestPathIterator([index]): + return p + + + def __getitem__(self,key): + ''' + return the value of the property of this node + + :param key: the name of a property + :type key: str + ''' + return self.__graph._node_attrs.get(self.index,{})[key] + + def __setitem__(self,key,value): + ''' + set the value of a node property. In the property doesn't + already exist a new property is added to this node. + + :param key: the name of a property + :type key: str + :param value: the value of the property + + .. seealso:: + + :py:meth:`Node.__getitem__` + ''' + if self.index in self.__graph._node_attrs: + data = self.__graph._node_attrs[self.index] + data[key]=value + else: + self.graph._node_attrs[self.index]={key:value} + + def __len__(self): + ''' + Count neighbour of this node + + :rtype: int + + .. seealso:: + + :py:meth:`Node.degree` + ''' + return len(self.__graph._node[self.index]) + + def __iter__(self): + ''' + iterate through neighbour of this node + + :rtype: iterator in :py:class:`Node` instances + + .. seealso:: + + :py:meth:`Node.neighbourIterator` + ''' + return self.neighbourIterator() + + def __contains__(self,key): + return self.has_key(key) + + def __str__(self): + + if self.index in self.__graph._node_attrs: + keys = " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"').replace('\n','\\n')) + for x in self.__graph._node_attrs[self.index].iteritems()] + ) + else: + keys='' + + return '%d [label="%s" %s]' % (self.index, + str(self.label).replace('"','\\"').replace('\n','\\n'), + keys) + + def keys(self): + if self.index in self.__graph._node_attrs: + k = self.__graph._node_attrs[self.index].keys() + else: + k=[] + return k + + label = property(getLabel, None, None, "Label of the node") + + graph = property(getGraph, None, None, "Graph owning this node") + + + +class Edge(object): + """ + Class used for representing one edge of a graph + + """ + + def __init__(self,node1,node2): + ''' + .. warning:: + + :py:class:`Edge` constructor is usualy called through the :py:class:`Graph` methods + + :param node1: First node likend by the edge + :type node1: :py:class:`Node` + :param node2: Seconde node likend by the edge + :type node2: :py:class:`Node` + ''' + self.node1 = node1 + self.node2 = node2 + + def getGraph(self): + """ + Return the :py:class:`Graph` instance owning this edge. 
+ """ + return self.node1.graph + + def has_key(self,key): + ''' + test is the :py:class:`Edge` instance has a property named **key**. + + :param key: the name of a property + :type key: str + + :return: True if the edge has a property named + :rtype: bool + ''' + if (self.node1.index,self.node2.index) in self.graph._edge_attrs: + return key in self.graph._edge_attrs[(self.node1.index,self.node2.index)] + else: + return False + + + def getDirected(self): + return self.node1.graph._directed + + def __getitem__(self,key): + return self.graph._edge_attrs.get((self.node1.index,self.node2.index),{})[key] + + def __setitem__(self,key,value): + e = (self.node1.index,self.node2.index) + if e in self.graph._edge_attrs: + data = self.graph._edge_attrs[e] + data[key]=value + else: + self.graph._edge_attrs[e]={key:value} + + def __str__(self): + e = (self.node1.index,self.node2.index) + if e in self.graph._edge_attrs: + keys = "[%s]" % " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"')) + for x in self.graph._edge_attrs[e].iteritems()] + ) + else: + keys = "" + + if self.directed: + link='->' + else: + link='--' + + return "%d %s %d %s" % (self.node1.index,link,self.node2.index,keys) + + def __contains__(self,key): + return self.has_key(key) + + + graph = property(getGraph, None, None, "Graph owning this edge") + + directed = property(getDirected, None, None, "Directed's Docstring") + + +class DiGraph(Graph): + """ + :py:class:`DiGraph class`is a specialisation of the :py:class:`Graph` class + dedicated to directed graph representation + + .. seealso:: + + :py:class:`UndirectedGraph` + + """ + def __init__(self,label='G',indexer=None,nodes=None,edges=None): + ''' + :param label: Graph name, set to 'G' by default + :type label: str + :param indexer: node label indexer + :type indexer: Indexer instance + :param nodes: set of nodes to add to the graph + :type nodes: iterable value + :param edges: set of edges to add to the graph + :type edges: iterable value + ''' + + Graph.__init__(self, label, True, indexer, nodes, edges) + +class UndirectedGraph(Graph): + """ + :py:class:`UndirectGraph class`is a specialisation of the :py:class:`Graph` class + dedicated to undirected graph representation + + .. 
seealso:: + + :py:class:`DiGraph` + + """ + def __init__(self,label='G',indexer=None,nodes=None,edges=None): + ''' + :param label: Graph name, set to 'G' by default + :type label: str + :param indexer: node label indexer + :type indexer: Indexer instance + :param nodes: set of nodes to add to the graph + :type nodes: iterable value + :param edges: set of edges to add to the graph + :type edges: iterable value + ''' + + Graph.__init__(self, label, False, indexer, nodes, edges) + + + +def selectEdgeAttributeFactory(attribut,value): + """ + This function help in building predicat function usable for selecting edge + in the folowing :py:class:`Graph` methods : + + - :py:meth:`Graph.edgeIterator` + + """ + def selectEdge(e): + return attribut in e and e[attribut]==value + return selectEdge diff --git a/obitools/graph/__init__.pyc b/obitools/graph/__init__.pyc new file mode 100644 index 0000000..397e5c0 Binary files /dev/null and b/obitools/graph/__init__.pyc differ diff --git a/obitools/graph/algorithms/__init__.py b/obitools/graph/algorithms/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/obitools/graph/algorithms/__init__.pyc b/obitools/graph/algorithms/__init__.pyc new file mode 100644 index 0000000..1f2edcc Binary files /dev/null and b/obitools/graph/algorithms/__init__.pyc differ diff --git a/obitools/graph/algorithms/clique.py b/obitools/graph/algorithms/clique.py new file mode 100644 index 0000000..2007c1a --- /dev/null +++ b/obitools/graph/algorithms/clique.py @@ -0,0 +1,134 @@ +import time +import sys + + + +_maxsize=0 +_solution=0 +_notbound=0 +_sizebound=0 +_lastyield=0 +_maxclique=None + +def cliqueIterator(graph,minsize=1,node=None,timeout=None): + global _maxsize,_solution,_notbound,_sizebound,_lastyield + _maxsize=0 + _solution=0 + _notbound=0 + _sizebound=0 + starttime = time.time() + + if node: + node = graph.getNode(node) + index = node.index + clique= set([index]) + candidates= set(graph.neighbourIndexSet(index=index)) + else: + clique=set() + candidates = set(x.index for x in graph) + + +# candidates = set(x for x in candidates +# if len(graph.neighbourIndexSet(index=x) & candidates) >= (minsize - 1)) + + _lastyield=time.time() + for c in _cliqueIterator(graph,clique,candidates,set(),minsize,start=starttime,timeout=timeout): + yield c + + + + + +def _cliqueIterator(graph,clique,candidates,notlist,minsize=0,start=None,timeout=None): + global _maxsize,_maxclique,_solution,_notbound,_sizebound,_lastyield + + # Speed indicator + lclique = len(clique) + lcandidates = len(candidates) + notmin = lcandidates + notfix = None + + for n in notlist: + nnc = candidates - graph.neighbourIndexSet(index=n) + nc = len(nnc) + if nc < notmin: + notmin=nc + notfix=n + notfixneib = nnc + + if lclique > _maxsize or not _solution % 1000 : + if start is not None: + top = time.time() + delta = top - start + if delta==0: + delta=1e-6 + speed = _solution / delta + start = top + else: + speed = 0 + print >>sys.stderr,"\rCandidates : %-5d Maximum clique size : %-5d Solutions explored : %10d speed = %5.2f solutions/sec sizebound=%10d notbound=%10d " % (lcandidates,_maxsize,_solution,speed,_sizebound,_notbound), + sys.stderr.flush() + if lclique > _maxsize: + _maxsize=lclique + +# print >>sys.stderr,'koukou' + + timer = time.time() - _lastyield + + if not candidates and not notlist: + if lclique==_maxsize: + _maxclique=set(clique) + if lclique >= minsize: + yield set(clique) + if timeout is not None and timer > timeout and _maxclique is not None: + yield _maxclique + _maxclique=None + 
+ else: + while notmin and candidates and ((lclique + len(candidates)) >= minsize or (timeout is not None and timer > timeout)): + # count explored solution + _solution+=1 + + if notfix is None: + nextcandidate = candidates.pop() + else: + nextcandidate = notfixneib.pop() + candidates.remove(nextcandidate) + + clique.add(nextcandidate) + + neighbours = graph.neighbourIndexSet(index=nextcandidate) + + nextcandidates = candidates & neighbours + nextnot = notlist & neighbours + + nnc = candidates - neighbours + lnnc=len(nnc) + + for c in _cliqueIterator(graph, + set(clique), + nextcandidates, + nextnot, + minsize, + start, + timeout=timeout): + yield c + + + clique.remove(nextcandidate) + + notmin-=1 + + if lnnc < notmin: + notmin = lnnc + notfix = nextcandidate + notfixneib = nnc + + if notmin==0: + _notbound+=1 + + notlist.add(nextcandidate) + else: + if (lclique + len(candidates)) < minsize: + _sizebound+=1 + diff --git a/obitools/graph/algorithms/compact.py b/obitools/graph/algorithms/compact.py new file mode 100644 index 0000000..8065a93 --- /dev/null +++ b/obitools/graph/algorithms/compact.py @@ -0,0 +1,8 @@ + +def compactGraph(graph,nodeSetIterator): + compact = graph.newEmpty() + for ns in nodeSetIterator(graph): + nlabel = "\n".join([str(graph.getNode(index=x).label) for x in ns]) + compact.addNode(nlabel) + print + print compact diff --git a/obitools/graph/algorithms/component.py b/obitools/graph/algorithms/component.py new file mode 100644 index 0000000..a17c8dd --- /dev/null +++ b/obitools/graph/algorithms/component.py @@ -0,0 +1,82 @@ +""" +Iterate through the connected components of a graph +--------------------------------------------------- + +the module :py:mod:`obitools.graph.algorithm.component` provides +two functions to deal with the connected component of a graph +represented as a :py:class:`obitools.graph.Graph` instance. + +The whole set of connected component of a graph is a partition of this graph. +So a node cannot belongs to two distinct connected component. + +Two nodes are in the same connected component if it exits a path through +the graph edges linking them. + +TODO: THere is certainly a bug with DirectedGraph + +""" + +def componentIterator(graph,nodePredicat=None,edgePredicat=None): + ''' + Build an iterator over the connected component of a graph. + Each connected component returned by the iterator is represented + as a `set` of node indices. + + :param graph: the graph to partitionne + :type graph: :py:class:`obitools.graph.Graph` + + :param predicate: a function allowing edge selection. Default value + is **None** and indicate that all edges are selected. + :type predicate: a function returning a boolean value + and accepting one argument of class :py:class:`Node` + + :param predicate: a function allowing node selection. Default value + is **None** and indicate that all nodes are selected. + :type predicate: a function returning a boolean value + and accepting one argument of class :py:class:`Edge` + + :return: an iterator over the connected component set + :rtype: an iterator over `set` of `int` + + .. seealso:: + the :py:meth:`obitools.graph.Graph.componentIndexSet` method + on which is based this function. + ''' + seen = set() + for n in graph.nodeIterator(nodePredicat): + if n.index not in seen: + cc=n.componentIndexSet(nodePredicat, edgePredicat) + yield cc + seen |= cc + +def componentCount(graph,nodePredicat=None,edgePredicat=None): + ''' + Count the connected componnent in a graph. 
+ + :param graph: the graph to partitionne + :type graph: :py:class:`obitools.graph.Graph` + + :param predicate: a function allowing edge selection. Default value + is **None** and indicate that all edges are selected. + :type predicate: a function returning a boolean value + and accepting one argument of class :py:class:`Node` + + :param predicate: a function allowing node selection. Default value + is **None** and indicate that all nodes are selected. + :type predicate: a function returning a boolean value + and accepting one argument of class :py:class:`Edge` + + :return: an iterator over the connected component set + :rtype: an iterator over `set` of `int` + + .. seealso:: + the :py:func:`componentIterator` function + on which is based this function. + ''' + n=0 + for c in componentIterator(graph,nodePredicat, edgePredicat): + n+=1 + return n + + + \ No newline at end of file diff --git a/obitools/graph/algorithms/component.pyc b/obitools/graph/algorithms/component.pyc new file mode 100644 index 0000000..a3b6298 Binary files /dev/null and b/obitools/graph/algorithms/component.pyc differ diff --git a/obitools/graph/dag.py b/obitools/graph/dag.py new file mode 100644 index 0000000..f9a7a96 --- /dev/null +++ b/obitools/graph/dag.py @@ -0,0 +1,80 @@ +from obitools.graph import DiGraph,Node +from obitools.graph.algorithms.component import componentIterator + +class DAG(DiGraph): + def __init__(self,label='G',indexer=None,nodes=None,edges=None): + ''' + Directed Graph constructor. + + @param label: Graph name, set to 'G' by default + @type label: str + @param indexer: node label indexer + @type indexer: Indexer instance + @param nodes: set of nodes to add to the graph + @type nodes: iterable value + @param edges: set of edges to add to the graph + @type edges: iterable value + ''' + + self._parents={} + DiGraph.__init__(self, label, indexer, nodes, edges) + + def getNode(self,node=None,index=None): + if index is None: + index = self._index.getIndex(node, True) + return DAGNode(index,self) + + def addEdge(self,parent=None,node=None,indexp=None,index=None,**data): + indexp=self.addNode(parent, indexp) + index =self.addNode(node , index) + + pindex = set(n.index + for n in self.getNode(index=indexp).ancestorIterator()) + + assert index not in pindex,'Child node cannot be a parent node' + + DiGraph.addEdge(self,index1=indexp,index2=index,**data) + + if index in self._parents: + self._parents[index].add(indexp) + else: + self._parents[index]=set([indexp]) + + + return (indexp,index) + + def getRoots(self): + return [self.getNode(index=cc.pop()).getRoot() + for cc in componentIterator(self)] + + + + +class DAGNode(Node): + + def ancestorIterator(self): + if self.index in self.graph._parents: + for p in self.graph._parents[self.index]: + parent = DAGNode(p,self.graph) + yield parent + for pnode in parent.ancestorIterator(): + yield pnode + + def getRoot(self): + for x in self.ancestorIterator(): + pass + return x + + def leavesIterator(self): + if not self: + yield self + for n in self: + for nn in n.leavesIterator(): + yield nn + + def subgraphIterator(self): + yield self + for n in self: + for nn in n.subgraphIterator(): + yield nn + diff --git a/obitools/graph/layout/__init__.py b/obitools/graph/layout/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/obitools/graph/layout/radialtree.py b/obitools/graph/layout/radialtree.py new file mode 100644 index 0000000..e69de29 diff --git a/obitools/graph/rootedtree.py b/obitools/graph/rootedtree.py new file mode 100644 index 
0000000..803316d --- /dev/null +++ b/obitools/graph/rootedtree.py @@ -0,0 +1,117 @@ +from obitools.graph.dag import DAG,DAGNode + +class RootedTree(DAG): + + def addEdge(self,parent=None,node=None,indexp=None,index=None,**data): + indexp=self.addNode(parent, indexp) + index =self.addNode(node , index) + + assert index not in self._parents or indexp in self._parents[index], \ + 'Child node cannot have more than one parent node' + + return DAG.addEdge(self,indexp=indexp,index=index,**data) + + def getNode(self,node=None,index=None): + if index is None: + index = self._index.getIndex(node, True) + return RootedTreeNode(index,self) + + + +class RootedTreeNode(DAGNode): + + def subTreeSize(self): + n=1 + for subnode in self: + n+=subnode.subTreeSize() + return n + + def subTreeLeaves(self): + if not self: + return 1 + n=0 + for subnode in self: + n+=subnode.subTreeLeaves() + return n + + +def nodeWriter(node,deep=0,label=None,distance="distance", bootstrap="bootstrap",cartoon=None,collapse=None): + + ks = node.keys() + + + if label is None: + name=node.label + elif callable(label): + name=label(node) + elif isinstance(label, str) and label in node: + name=node[label] + ks.remove(label) + else: + name='' + + if distance in node: + dist=':%6.5f' % node[distance] + ks.remove(distance) + else: + dist='' + + ks = ["%s=%s" % (k,node[k]) for k in ks] + + if cartoon is not None and cartoon(node): + ks.append("!cartoon={%d,0.0}" % node.subTreeLeaves()) + + if collapse is not None and collapse(node): + ks.append('!collapse={"collapsed",0.0}') + + if ks: + ks="[&"+",".join(ks)+"]" + else: + ks='' + + + nodeseparator = ',\n' + ' ' * (deep+1) + + subnodes = nodeseparator.join([nodeWriter(x, deep+1,label,distance,bootstrap,cartoon=cartoon,collapse=collapse) + for x in node]) + if subnodes: + subnodes='(\n' + ' ' * (deep+1) + subnodes + '\n' + ' ' * deep + ')' + + return '%s"%s"%s%s' % (subnodes,name,ks,dist) + + +def nexusFormat(tree,startnode=None,label=None,blocks="",cartoon=None,collapse=None): + head="#NEXUS\n" + + tx = [] + + for n in tree: + if label is None: + name=n.label + elif callable(label): + name=label(n) + elif isinstance(label, str) and label in n: + name=n[label] + else: + name='' + + if name: + tx.append('"%s"' % name) + + taxa = "begin taxa;\n\tdimensions ntax=%d;\n\ttaxlabels\n\t" % len(tx) + + taxa+="\n\t".join(tx) + + taxa+="\n;\nend;\n\n" + + + + if startnode is not None: + roots =[startnode] + else: + roots = tree.getRoots() + trees = nodeWriter(roots[0],0,label,cartoon=cartoon,collapse=collapse) + trees = "begin trees;\n\ttree tree_1 = [&R] "+ trees +";\nend;\n\n" + return head+taxa+trees+"\n\n"+blocks+"\n" + + \ No newline at end of file diff --git a/obitools/graph/tree.py b/obitools/graph/tree.py new file mode 100644 index 0000000..940ee44 --- /dev/null +++ b/obitools/graph/tree.py @@ -0,0 +1,37 @@ +from obitools.graph import UndirectedGraph,Node +from obitools.graph.algorithms.component import componentCount + + +class Forest(UndirectedGraph): + + + def getNode(self,node=None,index=None): + if index is None: + index = self._index.getIndex(node, True) + return TreeNode(index,self) + + def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data): + index1=self.addNode(node1, index1) + index2=self.addNode(node2, index2) + + cc = set(n.index for n in self.getNode(index=index2).componentIterator()) + + assert index1 in self._node[index2] or index1 not in cc, \ + "No more than one path is alloed between two nodes in a tree" + + UndirectedGraph.addEdge(self, index1=index1, 
index2=index2,**data) + + return (index1,index2) + + def isASingleTree(self): + return componentCount(self)==1 + +class TreeNode(Node): + + def componentIterator(self): + for c in self: + yield c + for cc in c: + yield cc + + \ No newline at end of file diff --git a/obitools/gzip.py b/obitools/gzip.py new file mode 100644 index 0000000..841641a --- /dev/null +++ b/obitools/gzip.py @@ -0,0 +1,504 @@ +"""Functions that read and write gzipped files. + +The user of the file doesn't have to worry about the compression, +but random access is not allowed. + +This consisted on a patched version of of standard gzip python +module based on Andrew Kuchling's minigzip.py distributed with the zlib module + +""" + +# based on Andrew Kuchling's minigzip.py distributed with the zlib module + +import struct, sys, time +import zlib +import __builtin__ + +__all__ = ["GzipFile","open"] + +FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16 + +READ, WRITE = 1, 2 + +def U32(i): + """Return i as an unsigned integer, assuming it fits in 32 bits. + + If it's >= 2GB when viewed as a 32-bit unsigned int, return a long. + """ + if i < 0: + i += 1L << 32 + return i + +def LOWU32(i): + """Return the low-order 32 bits of an int, as a non-negative int.""" + return i & 0xFFFFFFFFL + +def write32(output, value): + output.write(struct.pack("' + + def _init_write(self, filename): + if filename[-3:] != '.gz': + filename = filename + '.gz' + self.filename = filename + self.crc = zlib.crc32("") + self.size = 0 + self.writebuf = [] + self.bufsize = 0 + + def _write_gzip_header(self): + self.fileobj.write('\037\213') # magic header + self.fileobj.write('\010') # compression method + fname = self.filename[:-3] + flags = 0 + if fname: + flags = FNAME + self.fileobj.write(chr(flags)) + write32u(self.fileobj, long(time.time())) + self.fileobj.write('\002') + self.fileobj.write('\377') + if fname: + self.fileobj.write(fname + '\000') + + def _init_read(self): + self.crc = zlib.crc32("") + self.size = 0 + + def _read_internal(self, size): + if len(self.inputbuf) < size: + self.inputbuf += self.fileobj.read(size-len(self.inputbuf)) + chunk = self.inputbuf[:size] + # need to use len(chunk) bellow instead of size in case it's EOF. 
+ if len(chunk) < 8: + self.last8 = self.last8[len(chunk):] + chunk + else: + self.last8 = chunk[-8:] + self.inputbuf = self.inputbuf[size:] + return chunk + + def _read_gzip_header(self): + magic = self._read_internal(2) + if len(magic) != 2: + raise EOFError, "Reached EOF" + if magic != '\037\213': + raise IOError, 'Not a gzipped file' + method = ord( self._read_internal(1) ) + if method != 8: + raise IOError, 'Unknown compression method' + flag = ord( self._read_internal(1) ) + # modtime = self.fileobj.read(4) + # extraflag = self.fileobj.read(1) + # os = self.fileobj.read(1) + self._read_internal(6) + + if flag & FEXTRA: + # Read & discard the extra field, if present + xlen = ord(self._read_internal(1)) + xlen = xlen + 256*ord(self._read_internal(1)) + self._read_internal(xlen) + if flag & FNAME: + # Read and discard a null-terminated string containing the filename + while True: + s = self._read_internal(1) + if not s or s=='\000': + break + if flag & FCOMMENT: + # Read and discard a null-terminated string containing a comment + while True: + s = self._read_internal(1) + if not s or s=='\000': + break + if flag & FHCRC: + self._read_internal(2) # Read & discard the 16-bit header CRC + + + def write(self,data): + if self.mode != WRITE: + import errno + raise IOError(errno.EBADF, "write() on read-only GzipFile object") + + if self.fileobj is None: + raise ValueError, "write() on closed GzipFile object" + if len(data) > 0: + self.size = self.size + len(data) + self.crc = zlib.crc32(data, self.crc) + self.fileobj.write( self.compress.compress(data) ) + self.offset += len(data) + + def read(self, size=-1): + if self.mode != READ: + import errno + raise IOError(errno.EBADF, "read() on write-only GzipFile object") + + if self.extrasize <= 0 and self.fileobj is None: + return '' + + readsize = 1024 + if size < 0: # get the whole thing + try: + while True: + self._read(readsize) + readsize = min(self.max_read_chunk, readsize * 2) + except EOFError: + size = self.extrasize + else: # just get some more of it + try: + while size > self.extrasize: + self._read(readsize) + readsize = min(self.max_read_chunk, readsize * 2) + except EOFError: + if size > self.extrasize: + size = self.extrasize + + chunk = self.extrabuf[:size] + self.extrabuf = self.extrabuf[size:] + self.extrasize = self.extrasize - size + + self.offset += size + return chunk + + def _unread(self, buf): + self.extrabuf = buf + self.extrabuf + self.extrasize = len(buf) + self.extrasize + self.offset -= len(buf) + + def _read(self, size=1024): + if self.fileobj is None: + raise EOFError, "Reached EOF" + + if self._new_member: + # If the _new_member flag is set, we have to + # jump to the next member, if there is one. + # + # _read_gzip_header will raise EOFError exception + # if there no more members to read. + self._init_read() + self._read_gzip_header() + self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) + self._new_member = False + + # Read a chunk of data from the file + buf = self._read_internal(size) + + # If the EOF has been reached, flush the decompression object + # and mark this object as finished. + + if buf == "": + uncompress = self.decompress.flush() + self._read_eof() + self._add_read_data( uncompress ) + raise EOFError, 'Reached EOF' + + uncompress = self.decompress.decompress(buf) + self._add_read_data( uncompress ) + + if self.decompress.unused_data != "": + # Ending case: we've come to the end of a member in the file, + # so put back unused_data and initialize last8 by reading them. 
+ self.inputbuf = self.decompress.unused_data + self.inputbuf + self._read_internal(8) + + # Check the CRC and file size, and set the flag so we read + # a new member on the next call + self._read_eof() + self._new_member = True + + def _add_read_data(self, data): + self.crc = zlib.crc32(data, self.crc) + self.extrabuf = self.extrabuf + data + self.extrasize = self.extrasize + len(data) + self.size = self.size + len(data) + + def _read_eof(self): + # We've read to the end of the file, so we have to rewind in order + # to reread the 8 bytes containing the CRC and the file size. + # We check the that the computed CRC and size of the + # uncompressed data matches the stored values. Note that the size + # stored is the true file size mod 2**32. + crc32 = unpack32(self.last8[:4]) + isize = U32(unpack32(self.last8[4:])) # may exceed 2GB + if U32(crc32) != U32(self.crc): + raise IOError, "CRC check failed" + elif isize != LOWU32(self.size): + raise IOError, "Incorrect length of data produced" + + def close(self): + if self.mode == WRITE: + self.fileobj.write(self.compress.flush()) + # The native zlib crc is an unsigned 32-bit integer, but + # the Python wrapper implicitly casts that to a signed C + # long. So, on a 32-bit box self.crc may "look negative", + # while the same crc on a 64-bit box may "look positive". + # To avoid irksome warnings from the `struct` module, force + # it to look positive on all boxes. + write32u(self.fileobj, LOWU32(self.crc)) + # self.size may exceed 2GB, or even 4GB + write32u(self.fileobj, LOWU32(self.size)) + self.fileobj = None + elif self.mode == READ: + self.fileobj = None + if self.myfileobj: + self.myfileobj.close() + self.myfileobj = None + + def __del__(self): + try: + if (self.myfileobj is None and + self.fileobj is None): + return + except AttributeError: + return + self.close() + + def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): + if self.mode == WRITE: + # Ensure the compressor's buffer is flushed + self.fileobj.write(self.compress.flush(zlib_mode)) + self.fileobj.flush() + + def fileno(self): + """Invoke the underlying file object's fileno() method. + + This will raise AttributeError if the underlying file object + doesn't support fileno(). + """ + return self.fileobj.fileno() + + def isatty(self): + return False + + def tell(self): + return self.offset + + def rewind(self): + '''Return the uncompressed stream file position indicator to the + beginning of the file''' + if self.mode != READ: + raise IOError("Can't rewind in write mode") + self.fileobj.seek(0) + self._new_member = True + self.extrabuf = "" + self.extrasize = 0 + self.offset = 0 + + def seek(self, offset): + if self.mode == WRITE: + if offset < self.offset: + raise IOError('Negative seek in write mode') + count = offset - self.offset + for i in range(count // 1024): + self.write(1024 * '\0') + self.write((count % 1024) * '\0') + elif self.mode == READ: + if offset < self.offset: + # for negative seek, rewind and do positive seek + self.rewind() + count = offset - self.offset + for i in range(count // 1024): + self.read(1024) + self.read(count % 1024) + + def readline(self, size=-1): + if size < 0: + size = sys.maxint + readsize = self.min_readsize + else: + readsize = size + bufs = [] + while size != 0: + c = self.read(readsize) + i = c.find('\n') + + # We set i=size to break out of the loop under two + # conditions: 1) there's no newline, and the chunk is + # larger than size, or 2) there is a newline, but the + # resulting line would be longer than 'size'. 
+ if (size <= i) or (i == -1 and len(c) > size): + i = size - 1 + + if i >= 0 or c == '': + bufs.append(c[:i + 1]) # Add portion of last chunk + self._unread(c[i + 1:]) # Push back rest of chunk + break + + # Append chunk to list, decrease 'size', + bufs.append(c) + size = size - len(c) + readsize = min(size, readsize * 2) + if readsize > self.min_readsize: + self.min_readsize = min(readsize, self.min_readsize * 2, 512) + return ''.join(bufs) # Return resulting line + + def readlines(self, sizehint=0): + # Negative numbers result in reading all the lines + if sizehint <= 0: + sizehint = sys.maxint + L = [] + while sizehint > 0: + line = self.readline() + if line == "": + break + L.append(line) + sizehint = sizehint - len(line) + + return L + + def writelines(self, L): + for line in L: + self.write(line) + + def __iter__(self): + return self + + def next(self): + line = self.readline() + if line: + return line + else: + raise StopIteration + + +def _test(): + # Act like gzip; with -d, act like gunzip. + # The input file is not deleted, however, nor are any other gzip + # options or features supported. + args = sys.argv[1:] + decompress = args and args[0] == "-d" + if decompress: + args = args[1:] + if not args: + args = ["-"] + for arg in args: + if decompress: + if arg == "-": + f = GzipFile(filename="", mode="rb", fileobj=sys.stdin) + g = sys.stdout + else: + if arg[-3:] != ".gz": + print "filename doesn't end in .gz:", repr(arg) + continue + f = open(arg, "rb") + g = __builtin__.open(arg[:-3], "wb") + else: + if arg == "-": + f = sys.stdin + g = GzipFile(filename="", mode="wb", fileobj=sys.stdout) + else: + f = __builtin__.open(arg, "rb") + g = open(arg + ".gz", "wb") + while True: + chunk = f.read(1024) + if not chunk: + break + g.write(chunk) + if g is not sys.stdout: + g.close() + if f is not sys.stdin: + f.close() + +if __name__ == '__main__': + _test() diff --git a/obitools/gzip.pyc b/obitools/gzip.pyc new file mode 100644 index 0000000..9c44a43 Binary files /dev/null and b/obitools/gzip.pyc differ diff --git a/obitools/location/__init__.py b/obitools/location/__init__.py new file mode 100644 index 0000000..b5463b0 --- /dev/null +++ b/obitools/location/__init__.py @@ -0,0 +1,538 @@ +import obitools +import re +import array + +class Location(object): + """ + Define a location on a sequence. + """ + + def extractSequence(self,sequence): + ''' + Extract subsequence corresponding to a Location. + + @param sequence: + @type sequence: C{BioSequence} or C{str} + ''' + assert isinstance(sequence, (obitools.BioSequence,str)), \ + "sequence must be an instance of str or BioSequence" + + if isinstance(sequence, str): + seq = self._extractSequence(sequence) + else: + if isinstance(sequence, obitools.AASequence): + assert not self.needNucleic(), \ + "This location can be used only with Nucleic sequences" + seq = self._extractSequence(str(sequence)) + + if isinstance(sequence, obitools.AASequence): + st = obitools.AASequence + else: + st = obitools.NucSequence + + seq = st(sequence.id, + seq, + sequence.definition, + **sequence.getTags()) + seq['location']=str(self) + + if 'length' in sequence.getTags(): + seq['length']=len(seq) + + if hasattr(sequence, 'quality'): + quality = self._extractQuality(sequence) + seq.quality=quality + + return seq + + def isDirect(self): + return None + + def isSimple(self): + ''' + Indicate if a location is composed of a single continuous + region or is composed by the junction of several locations + by the C{join} operator. 
+
+        @return: C{True} if the location is composed of a single
+                 continuous region.
+        @rtype: bool
+        '''
+        
+        return None
+    
+    def isFullLength(self):
+        return None
+    
+    def needNucleic(self):
+        '''
+        If a location contains a complement operator, it can be used
+        only on nucleic sequences.
+        
+        @return: C{True} if the location contains a complement operator
+        @rtype: bool
+        '''
+        return None
+    
+    def getGloc(self):
+        loc = self.simplify()
+        assert loc.isDirect() is not None, \
+            "Gloc cannot be created for a multi-oriented location : %s" % str(loc)
+        positions = ','.join([str(x) for x in loc._getglocpos()])
+        return "(%s,%s)" % ({True:'T',False:'F'}[loc.isDirect()],
+                            positions)
+    
+    def shift(self,s):
+        return None
+    
+    def getBegin(self):
+        return None
+    
+    def getEnd(self):
+        return None
+    
+    def getFivePrime(self):
+        return self.getBegin()
+    
+    def getThreePrime(self):
+        return self.getEnd()
+    
+    begin = property(getBegin,None,None,"beginning position of the location")
+    end = property(getEnd,None,None,"ending position of the location")
+    fivePrime=property(getFivePrime,None,None,"5' position of the location")
+    threePrime=property(getThreePrime,None,None,"3' position of the location")
+    
+    def __abs__(self):
+        assert self.isDirect() is not None, \
+            "Abs operator cannot be applied to a non-oriented location"
+        if self.isDirect():
+            return self
+        else:
+            return ComplementLocation(self).simplify()
+        
+    def __cmp__(self,y):
+        if self.begin < y.begin:
+            return -1
+        if self.begin > y.begin:
+            return 1
+        if self.isDirect() == y.isDirect():
+            return 0
+        if self.isDirect() and not y.isDirect():
+            return -1
+        return 1
+    
+class SimpleLocation(Location):
+    """
+    A simple location describes a continuous region of
+    a sequence defined by a C{begin} and an C{end} position.
+    """
+    
+    def __init__(self,begin,end):
+        '''
+        Build a new C{SimpleLocation} instance. Valid
+        positions are defined on M{[1,N]}, with N the length
+        of the sequence.
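+        
+        A minimal usage sketch (with a plain C{str} as sequence;
+        positions are 1-based and the range is inclusive):
+        
+            loc = SimpleLocation(3,6)
+            loc.extractSequence('aacgttaa')   # returns 'cgtt'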
+
+        @param begin: start position of the location
+        @type begin: int
+        @param end: end position of the location
+        @type end: int
+        '''
+        assert begin > 0 and end > 0
+        
+        self._begin = begin
+        self._end = end
+        self._before=False
+        self._after=False
+        
+    def _extractSequence(self,sequence):
+        
+        assert (    self._begin < len(sequence)
+                and self._end <= len(sequence)), \
+             "Sequence length %d is too short" % len(sequence)
+             
+        return sequence[self._begin-1:self._end]
+    
+    def _extractQuality(self,sequence):
+        
+        assert (    self._begin < len(sequence)
+                and self._end <= len(sequence)), \
+             "Sequence length %d is too short" % len(sequence)
+             
+        return sequence.quality[self._begin-1:self._end]
+        
+        
+    def isDirect(self):
+        return True
+    
+    def isSimple(self):
+        return True
+    
+    def isFullLength(self):
+        return not (self.before or self.after)
+    
+    def simplify(self):
+        if self._begin == self._end:
+            return PointLocation(self._begin)
+        else:
+            return self
+        
+    def needNucleic(self):
+        return False
+       
+    def __str__(self):
+        before = {True:'<',False:''}[self.before]
+        after = {True:'>',False:''}[self.after]
+        return "%s%d..%s%d" % (before,self._begin,after,self._end)
+    
+    def shift(self,s):
+        assert (self._begin + s) > 0,"shift too large (%d)" % s
+        if s == 0:
+            return self
+        return SimpleLocation(self._begin + s, self._end + s)
+    
+    def _getglocpos(self):
+        return (self.begin,self.end)
+        
+    def getGloc(self):
+        positions = ','.join([str(x) for x in self._getglocpos()])
+        return "(%s,%s)" % ({True:'T',False:'F'}[self.isDirect()],
+                            positions)
+    
+    def getBegin(self):
+        return self._begin
+    
+    def getEnd(self):
+        return self._end
+    
+    
+    begin = property(getBegin,None,None,"beginning position of the location")
+    end = property(getEnd,None,None,"ending position of the location")
+    
+    def getBefore(self):
+        return self._before
+    
+    def getAfter(self):
+        return self._after
+    
+    def setBefore(self,value):
+        assert isinstance(value, bool)
+        self._before=value
+        
+    def setAfter(self,value):
+        assert isinstance(value, bool)
+        self._after=value
+        
+    before=property(getBefore,setBefore,None)
+    after=property(getAfter,setAfter,None)
+    
+    
+    
+    
+class PointLocation(Location):
+    """
+    A point location describes a location on a sequence
+    limited to a single position.
+    """
+    
+    def __init__(self,position):
+        assert position > 0
+        self._pos=position
+        
+    def _extractSequence(self,sequence):
+        
+        assert self._pos <= len(sequence), \
+             "Sequence length %d is too short" % len(sequence)
+             
+        return sequence[self._pos-1]
+    
+    def _extractQuality(self,sequence):
+        
+        assert self._pos <= len(sequence), \
+             "Sequence length %d is too short" % len(sequence)
+             
+        return sequence.quality[self._pos-1:self._pos]
+    
+    def isDirect(self):
+        return True
+    
+    def isSimple(self):
+        return True
+    
+    def isFullLength(self):
+        return True
+    
+    def simplify(self):
+        return self
+        
+    def needNucleic(self):
+        return False
+    
+    def shift(self,s):
+        assert (self._pos + s) > 0,"shift too large (%d)" % s
+        if s == 0:
+            return self
+        return PointLocation(self._pos + s)
+    
+    def _getglocpos(self):
+        return (self._pos,self._pos)
+        
+    def getBegin(self):
+        return self._pos
+    
+    def getEnd(self):
+        return self._pos
+    
+    begin = property(getBegin,None,None,"beginning position of the location")
+    end = property(getEnd,None,None,"ending position of the location")
+    
+    def __str__(self):
+        return str(self._pos)
+    
+class CompositeLocation(Location):
+    """
+    A location made of several sub-locations joined by the C{join}
+    operator.
+    """
+    def __init__(self,locations):
+        self._locs = tuple(locations)
+        
+        
+    def _extractSequence(self,sequence):
+        seq = 
''.join([x._extractSequence(sequence) + for x in self._locs]) + return seq + + def _extractQuality(self,sequence): + rep=array.array('d',[]) + for x in self._locs: + rep.extend(x._extractQuality(sequence)) + return rep + + def isDirect(self): + hasDirect,hasReverse = reduce(lambda x,y: (x[0] or y,x[1] or not y), + (z.isDirect() for z in self._locs),(False,False)) + + if hasDirect and not hasReverse: + return True + if hasReverse and not hasDirect: + return False + + return None + + + def isSimple(self): + return False + + + def simplify(self): + if len(self._locs)==1: + return self._locs[0] + + rep = CompositeLocation(x.simplify() for x in self._locs) + + if reduce(lambda x,y : x and y, + (isinstance(z, ComplementLocation) + for z in self._locs)): + rep = ComplementLocation(CompositeLocation(x._loc.simplify() + for x in rep._locs[::-1])) + + return rep + + def isFullLength(self): + return reduce(lambda x,y : x and y, (z.isFullLength() for z in self._locs),1) + + def needNucleic(self): + return reduce(lambda x,y : x or y, + (z.needNucleic for z in self._locs), + False) + + def _getglocpos(self): + return reduce(lambda x,y : x + y, + (z._getglocpos() for z in self._locs)) + + + def getBegin(self): + return min(x.getBegin() for x in self._locs) + + def getEnd(self): + return max(x.getEnd() for x in self._locs) + + def shift(self,s): + assert (self.getBegin() + s) > 0,"shift to large (%d)" % s + if s == 0: + return self + return CompositeLocation(x.shift(s) for x in self._locs) + + + begin = property(getBegin,None,None,"beginning position of the location") + end = property(getEnd,None,None,"ending position of the location") + + + def __str__(self): + return "join(%s)" % ','.join([str(x) + for x in self._locs]) + +class ComplementLocation(Location): + """ + """ + + _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a', + 'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k', + 's': 's', 'w': 'w', 'b': 'v', 'd': 'h', + 'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a', + '-': '-'} + + def __init__(self,location): + self._loc = location + + def _extractSequence(self,sequence): + seq = self._loc._extractSequence(sequence) + seq = ''.join([ComplementLocation._comp.get(x.lower(),'n') for x in seq[::-1]]) + return seq + + def _extractQuality(self,sequence): + return sequence.quality[::-1] + + def isDirect(self): + return False + + def isSimple(self): + return self._loc.isSimple() + + def isFullLength(self): + return self._loc.isFullLength() + + def simplify(self): + if isinstance(self._loc, ComplementLocation): + return self._loc._loc.simplify() + else: + return self + + def needNucleic(self): + return True + + def __str__(self): + return "complement(%s)" % self._loc + + def shift(self,s): + assert (self.getBegin() + s) > 0,"shift to large (%d)" % s + if s == 0: + return self + return ComplementLocation(self._loc.shift(s)) + + def _getglocpos(self): + return self._loc._getglocpos() + + def getBegin(self): + return self._loc.getBegin() + + def getEnd(self): + return self._loc.getEnd() + + def getFivePrime(self): + return self.getEnd() + + def getThreePrime(self): + return self.getBegin() + + + begin = property(getBegin,None,None,"beginning position of the location") + end = property(getEnd,None,None,"ending position of the location") + fivePrime=property(getFivePrime,None,None,"5' potisition of the location") + threePrime=property(getThreePrime,None,None,"3' potisition of the location") + + + # + # Internal functions used for location parsing + # + +def __sublocationIterator(text): + sl = [] + plevel=0 + for c in text: + assert 
plevel>=0,"Misformatted location : %s" % text
+        if c == '(':
+            plevel+=1
+            sl.append(c)
+        elif c==')':
+            plevel-=1
+            sl.append(c)
+        elif c==',' and plevel == 0:
+            assert sl,"Misformatted location : %s" % text
+            yield ''.join(sl)
+            sl=[]
+        else:
+            sl.append(c)
+    assert sl and plevel==0,"Misformatted location : %s" % text
+    yield ''.join(sl)
+    
+    
+    
+    #
+    # Internal functions used for location parsing
+    #
+
+__simplelocparser = re.compile('(?P<before><?)(?P<from>[0-9]+)(\.\.(?P<after>>?)(?P<to>[0-9]+))?')
+
+
+def __locationParser(text):
+    text=text.strip()
+    if text[0:5]=='join(':
+        assert text[-1]==')',"Misformatted location : %s" % text
+        return CompositeLocation(__locationParser(sl) for sl in __sublocationIterator(text[5:-1]))
+    elif text[0:11]=='complement(':
+        assert text[-1]==')',"Misformatted location : %s" % text
+        subl = tuple(__locationParser(sl) for sl in __sublocationIterator(text[11:-1]))
+        if len(subl)>1:
+            subl = CompositeLocation(subl)
+        else:
+            subl = subl[0]
+        return ComplementLocation(subl)
+    else:
+        data = __simplelocparser.match(text)
+        assert data is not None,"Misformatted location : %s" % text
+        data = data.groupdict()
+        if not data['to'] :
+            sl = PointLocation(int(data['from']))
+        else:
+            sl = SimpleLocation(int(data['from']),int(data['to']))
+        sl.before=data['before']=='<'
+        sl.after=data['after']=='>'
+        return sl
+
+def locationGenerator(locstring):
+    '''
+    Parse a location string as found in a genbank or embl file.
+    
+    @param locstring: string description of the location in embl/gb format
+    @type locstring: str
+    
+    @return: a Location instance
+    @rtype: C{Location} subclass instance
+    '''
+    return __locationParser(locstring)
+
+
+_matchExternalRef = re.compile('[A-Za-z0-9_|]+(\.[0-9]+)?(?=:)')
+
+def extractExternalRefs(locstring):
+    '''
+    When a location describes external references (ex: D28156.1:1..>1292),
+    separate the external reference part of the location from the location
+    itself.
+    
+    @param locstring: text representation of the location.
+    @type locstring: str
+    
+    @return: a tuple with a set of strings describing the accession numbers
+             of the referred sequences and a C{Location} instance.
+    
+    @rtype: tuple(set,Location)
+    '''
+    m = set(x.group() for x in _matchExternalRef.finditer(locstring))
+    clean = re.compile(':|'.join([re.escape(x) for x in m])+':')
+    cloc = locationGenerator(clean.sub('',locstring))
+    
+    return m,cloc
+    
+    
+    
+    
diff --git a/obitools/location/__init__.pyc b/obitools/location/__init__.pyc
new file mode 100644
index 0000000..545f024
Binary files /dev/null and b/obitools/location/__init__.pyc differ
diff --git a/obitools/location/feature.py b/obitools/location/feature.py
new file mode 100644
index 0000000..89a183f
--- /dev/null
+++ b/obitools/location/feature.py
@@ -0,0 +1,177 @@
+from obitools.location import Location,locationGenerator
+import logging
+import re
+
+
+
+
+_featureMatcher = re.compile('^(FT|  )   [^ ].+\n((FT|  )    .+\n)+',re.M)
+_featureCleaner = re.compile('^FT',re.M)
+
+
+def textFeatureIterator(fttable):
+    '''
+    Iterate through a textual description of a feature table in genbank
+    or embl format. Return at each step a text representation of each
+    individual feature composing the table.
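+    
+    For example, the two EMBL-style lines:
+    
+        FT   CDS             1..9
+        FT                   /gene="abc"
+    
+    are yielded as a single string in which the leading 'FT' codes have
+    been replaced by blanks (preserving the column layout).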
+ + @param fttable: a string corresponding to the feature table of a genbank + or an embl entry + + @type fttable: C{str} + + @return: an iterator on str + @rtype: iterator + + @see: L{ftParser} + ''' + for m in _featureMatcher.finditer(fttable): + t = m.group() + t = _featureCleaner.sub(' ',t) + yield t + +_qualifierMatcher = re.compile('(?<=^ {21}/).+(\n {21}[^/].+)*',re.M) +_qualifierCleanner= re.compile("^ +",re.M) + +def qualifierIterator(qualifiers): + ''' + Parse a textual description of a feature in embl or genbank format + as returned by the textFeatureIterator iterator and iterate through + the key, value qualified defining this location. + + @param qualifiers: substring containing qualifiers + @type qualifiers: str + + @return: an iterator on tuple (key,value), where keys are C{str} + @rtype: iterator + ''' + for m in _qualifierMatcher.finditer(qualifiers): + t = m.group() + t = _qualifierCleanner.sub('',t) + t = t.split('=',1) + if len(t)==1: + t = (t[0],None) + else: + if t[0]=='translation': + value = t[1].replace('\n','') + else: + value = t[1].replace('\n',' ') + try: + value = eval(value) + except: + pass + t = (t[0],value) + yield t + + +_ftmatcher = re.compile('(?<=^ {5})\S+') +_locmatcher= re.compile('(?<=^.{21})[^/]+',re.DOTALL) +_cleanloc = re.compile('[\s\n]+') +_qualifiersMatcher = re.compile('^ +/.+',re.M+re.DOTALL) + +def ftParser(feature): + fttype = _ftmatcher.search(feature).group() + location=_locmatcher.search(feature).group() + location=_cleanloc.sub('',location) + qualifiers=_qualifiersMatcher.search(feature) + if qualifiers is not None: + qualifiers=qualifiers.group() + else: + qualifiers="" + logging.debug("Qualifiers regex not matching on \n=====\n%s\n========" % feature) + + return fttype,location,qualifiers + + +class Feature(dict,Location): + def __init__(self,type,location): + self._fttype=type + self._loc=location + + def getFttype(self): + return self._fttype + + + def extractSequence(self,sequence,withQualifier=False): + seq = self._loc.extractSequence(sequence) + if withQualifier: + seq.getInfo().update(self) + return seq + + def isDirect(self): + return self._loc.isDirect() + + def isSimple(self): + return self._loc.isSimple() + + def isFullLength(self): + return self._loc.isFullLength() + + def simplify(self): + f = Feature(self._fttype,self._loc.simplify()) + f.update(self) + return f + + def locStr(self): + return str(self._loc) + + def needNucleic(self): + return self._loc.needNucleic() + + def __str__(self): + return repr(self) + + def __repr__(self): + return str((self.ftType,str(self._loc),dict.__repr__(self))) + + def __cmp__(self,y): + return self._loc.__cmp__(y) + + def _getglocpos(self): + return self._loc._getglocpos() + + ftType = property(getFttype, None, None, "Feature type name") + + def shift(self,s): + assert (self.getBegin() + s) > 0,"shift to large (%d)" % s + if s == 0: + return self + f = Feature(self._fttype,self._loc.shift(s)) + f.update(self) + return f + + + def getBegin(self): + return self._loc.getBegin() + + def getEnd(self): + return self._loc.getEnd() + + begin = property(getBegin,None,None,"beginning position of the location") + end = property(getEnd,None,None,"ending position of the location") + + +def featureFactory(featureDescription): + fttype,location,qualifiers = ftParser(featureDescription) + location = locationGenerator(location) + feature = Feature(fttype,location) + feature.raw = featureDescription + + for k,v in qualifierIterator(qualifiers): + feature.setdefault(k,[]).append(v) + + return feature + 
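+# A minimal usage sketch of featureFactory (hypothetical input, written
+# in the cleaned form yielded by textFeatureIterator: feature key at
+# column 6, location and qualifiers starting at column 22):
+#
+#     ft = featureFactory('     CDS             1..9\n'
+#                         '                     /gene="abc"\n')
+#     ft.ftType       # 'CDS'
+#     ft.locStr()     # '1..9'
+#     ft['gene']      # ['abc']
+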
+def featureIterator(featureTable,skipError=False): + for tft in textFeatureIterator(featureTable): + try: + feature = featureFactory(tft) + except AssertionError,e: + logging.debug("Parsing error on feature :\n===============\n%s\n===============" % tft) + if not skipError: + raise e + logging.debug("\t===> Error skipped") + continue + + yield feature + \ No newline at end of file diff --git a/obitools/metabarcoding/__init__.py b/obitools/metabarcoding/__init__.py new file mode 100644 index 0000000..3b29b17 --- /dev/null +++ b/obitools/metabarcoding/__init__.py @@ -0,0 +1,265 @@ +from obitools.ecopcr.options import addTaxonomyFilterOptions,\ + loadTaxonomyDatabase +from obitools.graph import UndirectedGraph +from obitools.align import lenlcs,isLCSReachable +from obitools.graph.algorithms.component import componentIterator +from obitools.utils.bioseq import uniqSequence +from obitools.utils import progressBar +import math +import sys +from obitools.graph.rootedtree import RootedTree + +def average(x): + x=list(x) + s = sum(i*j for (i,j) in x) + n = sum(i[1] for i in x) + return (float(s)/float(n),n) + +def minimum(x): + x=list(x) + m = min(i[0] for i in x) + n = sum(i[1] for i in x) + return (float(m),n) + +def ecoPCRReader(entries,options): + + taxonomy = loadTaxonomyDatabase(options) + + norankid =options.taxonomy.findRankByName('no rank') + speciesid=options.taxonomy.findRankByName('species') + genusid =options.taxonomy.findRankByName('genus') + familyid =options.taxonomy.findRankByName('family') + + minrankseq = set([speciesid,genusid,familyid]) + + usedrankid = {} + + ingroup = [] + outgroup= [] + + for s in entries: + if 'taxid' in s : + taxid = s['taxid'] + if taxid in taxonomy: + allrank = set() + for p in options.taxonomy.parentalTreeIterator(taxid): + if p[1]!=norankid: + allrank.add(p[1]) + if len(minrankseq & allrank) == 3: + for r in allrank: + usedrankid[r]=usedrankid.get(r,0) + 1 + + if taxonomy.isAncestor(options.ingroup,taxid): + ingroup.append(s) + else: + outgroup.append(s) + + keptrank = set(r for r in usedrankid + if float(usedrankid[r])/float(len(ingroup)) > options.rankthresold) + + return { 'ingroup' : ingroup, + 'outgroup': outgroup, + 'ranks' : keptrank + } + +def buildSimilarityGraph(dbseq,ranks,taxonomy,dcmax=5): + + ldbseq = len(dbseq) + pos = 1 + digit = int(math.ceil(math.log10(ldbseq))) + header = "Alignment : %%0%dd x %%0%dd -> %%0%dd " % (digit,digit,digit) + aligncount = ldbseq*(ldbseq+1)/2 + edgecount = 0 + print >>sys.stderr + + progressBar(1,aligncount,True,"Alignment : %s x %s -> %s " % ('-'*digit,'-'*digit, '0'*digit)) + + + sim = UndirectedGraph() + + i=0 + for s in dbseq: + taxid = s['taxid'] + + rtaxon = dict((rid,taxonomy.getTaxonAtRank(taxid,rid)) + for rid in ranks) + + sim.addNode(i, seq=s,taxid=taxid,rtaxon=rtaxon) + + i+=1 + +# aligner = LCS() + + for is1 in xrange(ldbseq): + s1 = dbseq[is1] + ls1= len(s1) +# aligner.seqA=s1 + + for is2 in xrange(is1+1,ldbseq): + + s2=dbseq[is2] + ls2=len(s2) + + lm = max(ls1,ls2) + lcsmin = lm - dcmax + + if isLCSReachable(s1,s2,lcsmin): + llcs,lali=lenlcs(s1,s2) + ds1s2 = lali - llcs + + if ds1s2 <= dcmax: + sim.addEdge(node1=is1, node2=is2,ds1s2=ds1s2,label=ds1s2) + edgecount+=1 + + progressBar(pos,aligncount,head=header % (is1,is2,edgecount)) + pos+=(ldbseq-is1-1) + + return sim + +def buildTsr(component): + ''' + Build for each consider taxonomic rank the list of taxa + present in the connected component + + :param component: the analyzed connected component + :type component: 
:py:class:`UndirectedGraph` + + :return: a dictionary indexed by rankid containing a `dict` indexed by taxid and containing count of sequences for this taxid + :rtype: `dict` indexed by `int` containing `dict` indexed by `int` and containing of `int` + + ''' + taxalist = {} + for n in component: + for r in n['rtaxon']: + rtaxid = n['rtaxon'][r] + if rtaxid is not None: + ts = taxalist.get(r,{}) + ts[rtaxid]=ts.get(rtaxid,0)+1 + taxalist[r]=ts + + return taxalist + +def edgeDistSelector(dcmax): + def predicate(e): + return e['ds1s2'] <= dcmax + return predicate + +def distanceOfConfusion(simgraph,dcmax=5,aggregate=average): + + alltaxa = set() + + for n in simgraph: + alltaxa|=set(n['rtaxon'].values()) + + taxacount = len(alltaxa) + + result = {} + + pos = [1] + header = "Component : %-5d Identified : %-8d " + progressBar(1,taxacount,True,header % (0,0)) + + def _idc(cc,dcmax): + composante=[] + for x in cc: + composante.extend(simgraph.subgraph(c) + for c in componentIterator(x, + edgePredicat=edgeDistSelector(dcmax))) + + good = set() + bad = {} + + complexe = [] + + for c in composante: + tsr = buildTsr(c) + newbad=False + for r in tsr: + if len(tsr[r]) == 1: + taxid = tsr[r].keys()[0] + good.add((taxid,tsr[r][taxid])) + else: + newbad=True + for taxid in tsr[r]: + bad[taxid]=bad.get(taxid,0)+tsr[r][taxid] + if newbad: + complexe.append(c) + +# good = good - bad + + for taxid,weight in good: + if taxid not in result: + result[taxid]=[] + result[taxid].append((dcmax+1,weight)) + + + progressBar(pos[0],taxacount,False,header % (len(composante),pos[0])) + pos[0]=len(result) + + if dcmax > 0: + dcmax-=1 + _idc(complexe,dcmax) + + else: + for taxid in bad: + if taxid not in result: + result[taxid]=[] + result[taxid].append((0,bad[taxid])) + + progressBar(pos[0],taxacount,False,header % (len(composante),pos[0])) + pos[0]=len(result) + + _idc([simgraph],dcmax) + + for taxid in result: + result[taxid]=aggregate(result[taxid]) + return result + +def propagateDc(tree,node=None,aggregate=min): + if node is None: + node = tree.getRoots()[0] + dca=aggregate(n['dc'] for n in node.leavesIterator()) + node['dc']=dca + for n in node: + propagateDc(tree, n, aggregate) + +def confusionTree(distances,ranks,taxonomy,aggregate=min,bsrank='species',dcmax=1): + + def Bs(node,rank,dcmax): + n = len(node) + if n: + g = [int(x['dc']>=dcmax) for x in node.subgraphIterator() if x['rank']==bsrank] + n = len(g) + g = sum(g) + bs= float(g)/float(n) + node['bs']=bs + node['bs_label']="%3.2f (%d)" % (bs,n) + + for n in node: + Bs(n,rank,dcmax) + + tree = RootedTree() + ranks = set(ranks) + tset = set(distances) + + for taxon in distances: + tree.addNode(taxon, rank=taxonomy.getRank(taxon), + name=taxonomy.getScientificName(taxon), + dc=float(distances[taxon][0]), + n=distances[taxon][1], + dc_label="%4.2f (%d)" % (float(distances[taxon][0]),distances[taxon][1]) + ) + + for taxon in distances: + piter = taxonomy.parentalTreeIterator(taxon) + taxon = piter.next() + for parent in piter: + if taxon[0] in tset and parent[0] in distances: + tset.remove(taxon[0]) + tree.addEdge(parent[0], taxon[0]) + taxon=parent + + root = tree.getRoots()[0] + Bs(root,bsrank,dcmax) + + return tree diff --git a/obitools/metabarcoding/options.py b/obitools/metabarcoding/options.py new file mode 100644 index 0000000..08ff423 --- /dev/null +++ b/obitools/metabarcoding/options.py @@ -0,0 +1,34 @@ +''' +Created on 30 oct. 
2011
+
+@author: coissac
+'''
+
+from obitools.ecopcr.options import addTaxonomyDBOptions
+
+
+def addMetabarcodingOption(optionManager):
+    
+    addTaxonomyDBOptions(optionManager)
+
+    optionManager.add_option('--dcmax',
+                             action="store", dest="dc",
+                             metavar="###",
+                             type="int",
+                             default=0,
+                             help="Maximum confusion distance considered")
+
+    optionManager.add_option('--ingroup',
+                             action="store", dest="ingroup",
+                             metavar="###",
+                             type="int",
+                             default=1,
+                             help="NCBI taxid delimiting the ingroup")
+
+    optionManager.add_option('--rank-thresold',
+                             action="store", dest="rankthresold",
+                             metavar="#.##",
+                             type="float",
+                             default=0.5,
+                             help="minimum fraction of the ingroup sequences "
+                                  "required for considering the rank")
diff --git a/obitools/obischemas/__init__.py b/obitools/obischemas/__init__.py
new file mode 100644
index 0000000..6bcafde
--- /dev/null
+++ b/obitools/obischemas/__init__.py
@@ -0,0 +1,28 @@
+from obitools.obischemas import kb
+__connection__ = None
+
+def initConnection(options):
+    global __connection__
+    param = {}
+    if hasattr(options, "dbname") and options.dbname is not None:
+        param["database"]=options.dbname
+    if hasattr(options, "dbhost") and options.dbhost is not None:
+        param["host"]=options.dbhost
+    if hasattr(options, "dbuser") and options.dbuser is not None:
+        param["user"]=options.dbuser
+    if hasattr(options, "dbpassword") and options.dbpassword is not None:
+        param["password"]=options.dbpassword
+        
+    __connection__=kb.getConnection(**param)
+    __connection__.autocommit=options.autocommit
+    
+def getConnection(options=None):
+    global __connection__
+    
+    if options is not None:
+        initConnection(options)
+        
+    assert __connection__ is not None,"database connection is not initialized"
+    
+    return __connection__
+    
\ No newline at end of file
diff --git a/obitools/obischemas/kb/__init__.py b/obitools/obischemas/kb/__init__.py
new file mode 100644
index 0000000..7d35dcb
--- /dev/null
+++ b/obitools/obischemas/kb/__init__.py
@@ -0,0 +1,55 @@
+"""
+    The kb package manages access to a PostgreSQL database from Python
+    scripts.
+"""
+
+
+class Connection(object):
+    
+    def __init__(self):
+        raise RuntimeError('pyROM.KB.Connection is an abstract class')
+    
+    def cursor(self):
+        raise RuntimeError('pyROM.KB.Connection.cursor is an abstract function')
+    
+    def commit(self):
+        raise RuntimeError('pyROM.KB.Connection.commit is an abstract function')
+    
+    def rollback(self):
+        raise RuntimeError('pyROM.KB.Connection.rollback is an abstract function')
+    
+    def __call__(self,query):
+        return self.cursor().execute(query)
+    
+    
+class Cursor(object):
+    
+    def __init__(self,db):
+        raise RuntimeError('pyROM.KB.Cursor is an abstract class')
+    
+    def execute(self,query):
+        raise RuntimeError('pyROM.KB.Cursor.execute is an abstract function')
+    
+    __call__=execute
+    
+    
+_current_connection = None   # Static variable used to store connection to KB
+
+def getConnection(*args,**kargs):
+    """
+    Return a connection to the database.
+    When called from the database backend, no arguments are needed.
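+    
+    A minimal usage sketch (hypothetical parameters; with the external
+    backend they are forwarded to psycopg2.connect):
+    
+        db = getConnection(database='obischema', host='localhost')
+        rows = db('SELECT ...')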
+
+    All connections returned by this function share the same underlying
+    connection object until new connection parameters are supplied.
+    """
+    global _current_connection
+    
+    if _current_connection is None or args or kargs :
+        try:
+            from obitools.obischemas.kb import backend
+            _current_connection = backend.Connection()
+        except ImportError:
+            from obitools.obischemas.kb import extern
+            _current_connection = extern.Connection(*args,**kargs)
+    return _current_connection
+    
+    
diff --git a/obitools/obischemas/kb/extern.py b/obitools/obischemas/kb/extern.py
new file mode 100644
index 0000000..ce2ff84
--- /dev/null
+++ b/obitools/obischemas/kb/extern.py
@@ -0,0 +1,78 @@
+"""
+Module : KB.extern
+Author : Eric Coissac
+Date   : 03/05/2004
+
+Module wrapping the psycopg interface module to allow connecting
+to a PostgreSQL database with the same interface from the backend
+and from external scripts.
+
+This module defines a class usable from external scripts.
+"""
+
+
+import psycopg2
+import sys
+from obitools.obischemas import kb
+
+class Connection(kb.Connection):
+    
+    def __init__(self,*connectParam,**kconnectParam):
+        if connectParam:
+            self.connectParam={'dsn':connectParam}
+        else:
+            self.connectParam=kconnectParam
+        print self.connectParam
+        self.db = psycopg2.connect(**(self.connectParam))
+        
+    def restart(self):
+        ok=1
+        while (ok and ok < 1000):
+            try:
+                self.db = psycopg2.connect(**self.connectParam)
+            except:
+                ok+=1
+            else:
+                ok=0
+                
+        
+    def cursor(self):
+        curs = Cursor(self.db)
+        if hasattr(self,'autocommit') and self.autocommit:
+            curs.autocommit = self.autocommit
+        return curs
+    
+    def commit(self):
+        self.db.commit()
+        
+    def rollback(self):
+        if hasattr(self,'db'):
+            self.db.rollback()
+        
+    def __del__(self):
+        if hasattr(self,'db'):
+            self.rollback()
+        
+class Cursor(kb.Cursor):
+    
+    def __init__(self,db):
+        self.db = db
+        self.curs = db.cursor()
+        
+    def execute(self,query):
+        try:
+            self.curs.execute(query)
+            if hasattr(self,'autocommit') and self.autocommit:
+                self.db.commit()
+        except psycopg2.ProgrammingError,e:
+            print >>sys.stderr,"===> %s" % query
+            raise e
+        except psycopg2.IntegrityError,e:
+            print >>sys.stderr,"---> %s" % query
+            raise e
+        try:
+            label = [x[0] for x in self.curs.description]
+            return [dict(map(None,label,y))
+                    for y in self.curs.fetchall()]
+        except TypeError:
+            return []
diff --git a/obitools/obischemas/options.py b/obitools/obischemas/options.py
new file mode 100644
index 0000000..66f5138
--- /dev/null
+++ b/obitools/obischemas/options.py
@@ -0,0 +1,31 @@
+def addConnectionOptions(optionManager):
+    
+    optionManager.add_option('-d','--dbname',
+                             action="store", dest="dbname",
+                             metavar="",
+                             type="string",
+                             help="OBISchema database name containing "
+                                  "taxonomical data")
+
+    optionManager.add_option('-H','--host',
+                             action="store", dest="dbhost",
+                             metavar="",
+                             type="string",
+                             help="host hosting OBISchema database")
+
+    optionManager.add_option('-U','--user',
+                             action="store", dest="dbuser",
+                             metavar="",
+                             type="string",
+                             help="user for OBISchema database connection")
+
+    optionManager.add_option('-W','--password',
+                             action="store", dest="dbpassword",
+                             metavar="",
+                             type="string",
+                             help="password for OBISchema database connection")
+
+    optionManager.add_option('-A','--autocommit',
+                             action="store_true",dest="autocommit",
+                             default=False,
+                             help="add commit action after each query")
\ No newline at end of file
diff --git a/obitools/obo/__init__.py b/obitools/obo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/obo/go/__init__.py b/obitools/obo/go/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/obo/go/parser.py 
b/obitools/obo/go/parser.py new file mode 100644 index 0000000..6902974 --- /dev/null +++ b/obitools/obo/go/parser.py @@ -0,0 +1,53 @@ +from obitools.obo.parser import OBOTerm +from obitools.obo.parser import OBOEntry +from obitools.obo.parser import stanzaIterator +from logging import debug + +class GOEntry(OBOEntry): + ''' + An entry of a GeneOntology .obo file. It can be a header (without a stanza name) or + a stanza (with a stanza name between brackets). It inherits from the class dict. + ''' + + +class GOTerm(OBOTerm): + + ''' + A stanza named 'Term'. It inherits from the class OBOTerm. + ''' + + def __init__(self,stanza): + + ## use of the OBOEntry constructor. + OBOTerm.__init__(self, stanza) + + assert 'namespace' in self and len(self['namespace'])==1, "An OBOTerm must belong to one of the cell_component, molecular_function or biological_process namespace" + + +def GOEntryFactory(stanza): + ''' + Dispatcher of stanza. + + @param stanza: a stanza composed of several lines. + @type stanza: text + + @return: an C{OBOTerm} | C{OBOEntry} instance + + @note: The dispatcher treats differently the stanza which are OBO "Term" + and the others. + ''' + + stanzaType = OBOEntry.parseStanzaName(stanza) + + if stanzaType=="Term": + return GOTerm(stanza) + else: + return OBOEntry(stanza) + + +def GOEntryIterator(file): + entries = stanzaIterator(file) + for e in entries: + debug(e) + yield GOEntryFactory(e) + diff --git a/obitools/obo/parser.py b/obitools/obo/parser.py new file mode 100644 index 0000000..f6f05f3 --- /dev/null +++ b/obitools/obo/parser.py @@ -0,0 +1,707 @@ +from obitools.utils import skipWhiteLineIterator,multiLineWrapper +from obitools.utils import universalOpen +from obitools.format.genericparser import genericEntryIteratorGenerator +from logging import debug,warning + +import re + + +################################################################################# +## Stanza preparation area ## +################################################################################# + + +class FileFormatError(Exception): + ''' + An error derived from the class Exception. + ''' + pass + +_oboEntryIterator = genericEntryIteratorGenerator(endEntry='^ *$', + strip=True) + +def stanzaIterator(inputfile): + ''' + Iterator of stanza. The stanza are the basic units of OBO files. + + @param inputfile: a stream of strings from an opened OBO file. + @type inputfile: a stream of strings + + @return: a stream of stanza + @rtype: a stream of aggregated strings + + @note: The iterator constructs stanza by aggregate strings from the + OBO file. + ''' + inputfile = universalOpen(inputfile) + inputfile = multiLineWrapper(inputfile) + return _oboEntryIterator(inputfile) + + + +################################################################################# +## Trailing Modifiers treatment area ## +################################################################################# + + +class TrailingModifier(dict): + ''' + A class object which inherits from the class dict. Trailing modifiers can be found + at the end of TaggedValue objects when they exist. 
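+    
+    In an OBO file they take the form of a comma-separated list of
+    key=value pairs between braces at the end of a line, e.g.
+    (hypothetical):
+    
+        xref: ABC:123 {source="xyz"} ! some comment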
+ ''' + + _match_brace = re.compile('(?<=\ {)[^\]]*(\}) *( !|$)') + + def __init__(self,string): + + ## search for trailing modifiers signals + trailing_modifiers = TrailingModifier._match_brace.search(string) + + ## the trailing modifiers exist + if trailing_modifiers: + trailing_modifiers=trailing_modifiers.group(0).strip() + print trailing_modifiers + ## creates and feeds the dictionary of trailing modifiers + dict.__init__(self,(x.strip().split('=',1) for x in trailing_modifiers.split(','))) + + +def trailingModifierFactory(string): + ''' + Dispatcher of trailing modifiers. + + @param string: a string from a TaggedValue object with a trailing modifiers signal. + @type string: string + + @return: a class object + + @note: The dispatcher is currently very simple. Only one case is treated by the function. + `the function returns a class object inherited from the class dict if the trailing modifiers + exist, None if they don't. + ''' + + trailing_modifiers = TrailingModifier(string) + if not trailing_modifiers: + trailing_modifiers=None + return trailing_modifiers + + +################################################################################# +## TaggedValue treatment area ## +################################################################################# + + +class TaggedValue(object): + ''' + A couple 'tag:value' of an OBOEntry. + ''' + + _match_value = re.compile('(("(\\\\"|[^\"])*")|(\\\\"|[^\"]))*?( !| {|$)') + _split_comment = re.compile('^!| !') + _match_quotedString = re.compile('(?<=")(\\\\"|[^\"])*(?=")') + _match_bracket = re.compile('\[[^\]]*\]') + + def __init__(self,line): + ''' + Constructor of the class TaggedValue. + + @param line: a line of an OBOEntry composed of a tag and a value. + @type line: string + + @note: The constructor separates tags from right terms. 'value' is extracted + from right terms using a regular expression (value is at the beginning of the + string, between quotes or not). Then, 'comment' is extracted from the rest of the + string using another regular expression ('comment' is at the end of the string + after a '!'. By default, 'comment' is set to None). Finally, 'trailing_modifiers' + are extracted from the last string using another regular expression. + The tag, the value, the comment and the trailing_modifiers are saved. + ''' + + debug("tagValueParser : %s" % line) + + ## by default : + trailing_modifiers = None + comment = None + + ## the tag is saved. 
'right' is composed of the value, the comment and the trailing modifiers + tag,rigth = line.split(':',1) + + ## the value is saved + value = TaggedValue._match_value.search(rigth).group(0) + debug("Extracted value : %s" % value) + + ## if there is a value AND a sign of a comment or trailing modifiers + if value and value[-1] in '!{': + lvalue = len(value) + ## whatever it is a comment or trailing modifiers, it is saved into 'extra' + extra = rigth[lvalue-1:].strip() + ## a comment is extracted + extra =TaggedValue._split_comment.split(extra,1) + ## and saved if it exists + if len(extra)==2: + comment=extra[1].strip() + ## trailing modifiers are extracted + extra=extra[0] + trailing_modifiers = trailingModifierFactory(extra) + ## the value is cleaned of any comment or trailing modifiers signals + value = value[0:-1] + + if tag=='use_term': + tag='consider' + raise DeprecationWarning,"user_term is a deprecated tag, you should instead use consider" + + ## recording zone + self.value =value.strip() + self.tag = tag + self.__doc__=comment + self.trailing_modifiers=trailing_modifiers + + def __str__(self): + return str(self.value) + + def __repr__(self): + return '''"""%s"""''' % str(self) + + +class NameValue(TaggedValue): + ''' + A couple 'name:value' inherited from the class TaggedValue. Used to manage name tags. + ''' + + def __init__(self,line): + + ## no use of the TaggedValue constructor. The NameValue is very simple. + tag,rigth = line.split(':',1) + + ## recording zone + self.value = rigth.strip() + self.tag = 'name' + self.__doc__=None + self.trailing_modifiers=None + + + +class DefValue(TaggedValue): + ''' + A couple 'def:value' inherited from the class TaggedValue. Used to manage def tags. + ''' + + def __init__(self,line): + ''' + Constructor of the class DefValue. + + @param line: a line of an OBOEntry composed of a tag named 'def' and a value. + @type line: string + + @note: The constructor calls the TaggedValue constructor. A regular expression + is used to extract the 'definition' from TaggedValue.value (definition is a not + quoted TaggedValue.value). A regular expression is used to extract 'dbxrefs' + from the aggedValue.value without the definition (dbxrefs are between brackets + and definition can be so). Definition is saved as the new value of the DefValue. + dbxrefs are saved. + ''' + + ## use of the TaggedValue constructor + TaggedValue.__init__(self, line) + + ## definition, which is quoted, is extracted from the standard value of a TaggedValue. + definition = TaggedValue._match_quotedString.search(self.value).group(0) + + ## the standard value is cleaned of the definition. + cleanvalue = self.value.replace(definition,'') + cleanvalue = cleanvalue.replace(' ',' ') + + ## dbxrefs are searched into the rest of the standard value. + dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0) + + ## recording zone + self.tag = 'def' + ## the value of a DefValue is not the standard value but the definition. + self.value=definition + self.dbxrefs=xrefFactory(dbxrefs) + + +class SynonymValue(TaggedValue): + ''' + A couple 'synonym:value' inherited from the class TaggedValue. Used to manage + synonym tags, exact_synonym tags, broad_synonym tags and narrow_synonym tags. + ''' + + _match_scope = re.compile('(?<="")[^\[]*(?=\[|$)') + + def __init__(self,line): + ''' + Constructor of the class SynonymValue. + + @param line: a line of an OBOEntry composed of a tag named 'synonym' or + 'exact_synonym' or 'broad_synonym' or 'narrow_synonym' and a value. 
+ @type line: string + + @note: SynonymValue is composed of a tag, a value, a scope, a list of types and + dbxrefs. + The constructor calls the TaggedValue constructor. A regular expression + is used to extract 'definition' from TaggedValue.value (definition is a not + quoted TaggedValue.value). Definition is saved as the new value of the class + SynonymValue. + A regular expression is used to extract 'attributes' from the rest of the + string. Attributes may contain an optional synonym scope and an optional list + of synonym types. The scope is extracted from attributes or set by default to + 'RELATED'. It is saved as the scope of the class. The types are the rest of the + attributes and are saved as the list of types of the class. + For deprecated tags 'exact_synonym', 'broad_synonym' and 'narrow_synonym', tag + is set to 'synonym' and scope is set respectively to 'EXACT', 'BROAD' and 'NARROW'. + A regular expression is used to extract 'dbxrefs' from the TaggedValue.value + without the definition (dbxrefs are between brackets and definition can be so). + dbxrefs are saved. + ''' + + ## use of the TaggedValue constructor + TaggedValue.__init__(self, line) + + ## definition, which is quoted, is extracted from the standard value of a TaggedValue. + definition = TaggedValue._match_quotedString.search(self.value).group(0) + + ## the standard value is cleaned of the definition. + cleanvalue = self.value.replace(definition,'') + cleanvalue = cleanvalue.replace(' ',' ') + + ## 1) attributes are searched into the rest of the standard value. + ## 2) then they are stripped. + ## 3) then they are split on every ' '. + ## 4) finally they are ordered into a set. + attributes = set(SynonymValue._match_scope.search(cleanvalue).group(0).strip().split()) + + ## the scopes are the junction between the attributes and a set of specific terms. + scopes = attributes & set(['RELATED','EXACT','BROAD','NARROW']) + + ## the types are the rest of the attributes. + types = attributes - scopes + + ## this is a constraint of the OBO format + assert len(scopes)< 2,"Only one synonym scope allowed" + + ## the scope of the SynonymValue is into scopes or set by default to RELATED + if scopes: + scope = scopes.pop() + else: + scope = 'RELATED' + + ## Specific rules are defined for the following tags : + if self.tag == 'exact_synonym': + raise DeprecationWarning,'exact_synonym is a deprecated tag use instead synonym tag' + self.tag = 'synonym' + scope = 'EXACT' + + if self.tag == 'broad_synonym': + raise DeprecationWarning,'broad_synonym is a deprecated tag use instead synonym tag' + self.tag = 'synonym' + scope = 'BROAD' + + if self.tag == 'narrow_synonym': + raise DeprecationWarning,'narrow_synonym is a deprecated tag use instead synonym tag' + self.tag = 'synonym' + scope = 'NARROW' + + if self.tag == 'systematic_synonym': + #raise DeprecationWarning,'narrow_synonym is a deprecated tag use instead sysnonym tag' + self.tag = 'synonym' + scope = 'SYSTEMATIC' + + ## this is our own constraint. deprecated tags are not saved by this parser. + assert self.tag =='synonym',"%s synonym type is not managed" % self.tag + + ## dbxrefs are searched into the rest of the standard value. + dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0) + + ## recording zone + ## the value of a SynonymValue is not the standard value but the definition. 
+ self.value = definition + self.dbxrefs = xrefFactory(dbxrefs) + self.scope = scope + self.types = list(types) + + def __eq__(self,b): + return ((self.value==b.value) and (self.dbxrefs==b.dbxrefs) + and (self.scope==b.scope) and (self.types==b.types) + and (self.__doc__==b.__doc__) and (self.tag==b.tag) + and (self.trailing_modifiers==b.trailing_modifiers)) + + def __hash__(self): + return (reduce(lambda x,y:x+y,(hash(z) for z in [self.__doc__, + self.value, + frozenset(self.dbxrefs), + self.scope, + frozenset(self.types), + self.tag, + self.trailing_modifiers]),0)) % (2**31) + + +class XrefValue(TaggedValue): + ''' + A couple 'xref:value' inherited from the class TaggedValue. Used to manage + xref tags. + ''' + + def __init__(self,line): + + ## use of the TaggedValue constructor + TaggedValue.__init__(self, line) + + ## use the same function as the dbxrefs + self.value=xrefFactory(self.value) + + if self.tag in ('xref_analog','xref_unk'): + raise DeprecationWarning,'%s is a deprecated tag use instead sysnonym tag' % self.tag + self.tag='xref' + + ## this is our own constraint. deprecated tags are not saved by this parser. + assert self.tag=='xref' + + +class RelationshipValue(TaggedValue): + ''' + A couple 'xref:value' inherited from the class TaggedValue. Used to manage + xref tags. + ''' + + def __init__(self,line): + + ## use of the TaggedValue constructor + TaggedValue.__init__(self, line) + + ## the value is split on the first ' '. + value = self.value.split(None,1) + + ## succesful split ! + if len(value)==2: + relationship=value[0] + term=value[1] + ## unsuccesful split. The relationship is set by default to IS_A + else: + relationship='is_a' + term=value[0] + + ## recording zone + self.value=term + self.relationship=relationship + + +class NamespaceValue(TaggedValue): + def __init__(self,line): + TaggedValue.__init__(self, line) + +class RemarkValue(TaggedValue): + def __init__(self,line): + TaggedValue.__init__(self, line) + label,value = self.value.split(':',1) + label = label.strip() + value = value.strip() + self.value=value + self.label=label + + +def taggedValueFactory(line): + ''' + A function used to dispatch lines of an OBOEntry between the class TaggedValue + and its inherited classes. + + @param line: a line of an OBOEntry composed of a tag and a value. 
+ @type line: string + + @return: a class object + ''' + + if (line[0:9]=='namespace' or + line[0:17]=='default-namespace'): + return NamespaceValue(line) + ## DefValue is an inherited class of TaggedValue + elif line[0:3]=='def': + return DefValue(line) + ## SynonymValue is an inherited class of TaggedValue + elif ((line[0:7]=="synonym" and line[0:14]!="synonymtypedef") or + line[0:13]=="exact_synonym" or + line[0:13]=="broad_synonym" or + line[0:14]=="narrow_synonym"): + return SynonymValue(line) + ## XrefValue is an inherited class of TaggedValue + elif line[0:4]=='xref': + return XrefValue(line) + ## NameValue is an inherited class of TaggedValue + elif line[0:4]=='name': + return NameValue(line) + ## RelationshipValue is an inherited class of TaggedValue + elif (line[0:15]=='intersection_of' or + line[0:8] =='union_of' or + line[0:12]=='relationship'): + return RelationshipValue(line) + elif (line[0:6]=='remark'): + return RemarkValue(line) + ## each line is a couple : tag / value (and some more features) + else: + return TaggedValue(line) + + +################################################################################# +## Xref treatment area ## +################################################################################# + + + +class Xref(object): + ''' + A xref object of an OBOentry. It may be the 'dbxrefs' of SynonymValue and + DefValue objects or the 'value' of XrefValue objects. + ''' + + __splitdata__ = re.compile(' +(?=["{])') + + def __init__(self,ref): + if ref == '' : # + ref = None # + data = '' # + else : # Modifs JJ sinon erreur : list index out of range + data = Xref.__splitdata__.split(ref,1) # + ref = data[0] # + description=None + trailing_modifiers = None + if len(data)> 1: + extra = data[1] + description = TaggedValue._match_quotedString.search(extra) + if description is not None: + description = description.group(0) + extra.replace(description,'') + trailing_modifiers=trailingModifierFactory(extra) + self.reference=ref + self.description=description + self.trailing_modifiers=trailing_modifiers + + def __eq__(self,b): + return ((self.reference==b.reference) and (self.description==b.description) + and (self.trailing_modifiers==b.trailing_modifiers)) + + def __hash__(self): + return (reduce(lambda x,y:x+y,(hash(z) for z in [self.reference, + self.description, + self.trailing_modifiers]),0)) % (2**31) + + +def xrefFactory(string): + ''' + Dispatcher of xrefs. + + @param string: a string (between brackets) from an inherited TaggedValue object with a dbxrefs + signal (actually, the signal can only be found into SynonymValue and DefValue + objects) or a string (without brackets) from a XrefValue object. + @type string: string + + @return: a class object + + @note: The dispatcher treats differently the strings between brackets (from SynonymValue and + DefValue objects) and without brackets (from XrefValue objects). + ''' + + string = string.strip() + if string[0]=='[': + return [Xref(x.strip()) for x in string[1:-1].split(',')] + else: + return Xref(string) + + +################################################################################# +## Stanza treatment area ## +################################################################################# + + +class OBOEntry(dict): + ''' + An entry of an OBOFile. It can be a header (without a stanza name) or + a stanza (with a stanza name between brackets). It inherits from the class dict. 
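+    
+    A typical stanza, as passed to the constructor:
+    
+        [Term]
+        id: GO:0000001
+        name: mitochondrion inheritance
+        namespace: biological_process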
+ ''' + _match_stanza_name = re.compile('(?<=^\[)[^\]]*(?=\])') + + def __init__(self,stanza): + ## tests if it is the header of the OBO file (returns TRUE) or not (returns FALSE) + self.isHeader = stanza[0]!='[' + lines = stanza.split('\n') + ## not the header : there is a [stanzaName] + if not self.isHeader: + self.stanzaName = lines[0].strip()[1:-1] + lines=lines[1:] + self["stanza"] = [stanza.strip()] + + ## whatever the stanza is. + for line in lines: + ## each line is a couple : tag / value + taggedvalue = taggedValueFactory(line) + if taggedvalue.tag in self: + self[taggedvalue.tag].append(taggedvalue) + else: + self[taggedvalue.tag]=[taggedvalue] + + + def parseStanzaName(stanza): + sm = OBOEntry._match_stanza_name.search(stanza) + if sm: + return sm.group(0) + else: + return None + + parseStanzaName=staticmethod(parseStanzaName) + + + +class OBOTerm(OBOEntry): + ''' + A stanza named 'Term'. It inherits from the class OBOEntry. + ''' + def __init__(self,stanza): + + ## use of the OBOEntry constructor. + OBOEntry.__init__(self, stanza) + + assert self.stanzaName=='Term' + assert 'stanza' in self + assert 'id' in self and len(self['id'])==1,"An OBOTerm must have an id" + assert 'name' in self and len(self['name'])==1,"An OBOTerm must have a name" + assert 'namespace' not in self or len(self['namespace'])==1, "Only one namespace is allowed for an OBO term" + + assert 'def' not in self or len(self['def'])==1,"Only one definition is allowed for an OBO term" + assert 'comment' not in self or len(self['comment'])==1,"Only one comment is allowed for an OBO term" + + assert 'union_of' not in self or len(self['union_of'])>=2,"Only one union relationship is allowed for an OBO term" + assert 'intersection_of' not in self or len(self['intersection_of'])>=2,"Only one intersection relationship is allowed for an OBO term" + + if self._isObsolete(): + #assert 'is_a' not in self + assert 'relationship' not in self + assert 'inverse_of' not in self + assert 'disjoint_from' not in self + assert 'union_of' not in self + assert 'intersection_of' not in self + + assert 'replaced_by' not in self or self._isObsolete() + assert 'consider' not in self or self._isObsolete() + + def _getStanza(self): + return self['stanza'][0] + + ## make-up functions. 
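+    ## Accessors backing the read-only properties declared at the end of
+    ## the class; optional tags yield None when they are absent.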
+ def _getDefinition(self): + if 'def' in self: + return self['def'][0] + return None + + def _getId(self): + return self['id'][0] + + def _getNamespace(self): + return self['namespace'][0] + + def _getName(self): + return self['name'][0] + + def _getComment(self): + if 'comment' in self: + return self['comment'][0] + return None + + def _getAltIds(self): + if 'alt_id' in self: + return list(set(self.get('alt_id',None))) + return None + + def _getIsA(self): + if 'is_a' in self: + return list(set(self.get('is_a',None))) + return None + + def _getSynonym(self): + if 'synonym' in self : + return list(set(self.get('synonym',None))) + return None + + def _getSubset(self): + if self.get('subset',None) != None: + return list(set(self.get('subset',None))) + else: + return None + + def _getXref(self): + if 'xref' in self: + return list(set(self.get('xref',None))) + return None + + def _getRelationShip(self): + if 'relationship' in self: + return list(set(self.get('relationship',None))) + return None + + def _getUnion(self): + return list(set(self.get('union_of',None))) + + def _getIntersection(self): + return list(set(self.get('intersection_of',None))) + + def _getDisjonction(self): + return list(set(self.get('disjoint_from',None))) + + def _isObsolete(self): + return 'is_obsolete' in self and str(self['is_obsolete'][0])=='true' + + def _getReplacedBy(self): + if 'replaced_by' in self: + return list(set(self.get('replaced_by',None))) + return None + + def _getConsider(self): + if 'consider' in self: + return list(set(self.get('consider',None))) + return None + + ## automatically make-up ! + stanza = property(_getStanza,None,None) + definition = property(_getDefinition,None,None) + id = property(_getId,None,None) + namespace = property(_getNamespace,None,None) + name = property(_getName,None,None) + comment = property(_getComment,None,None) + alt_ids = property(_getAltIds,None,None) + is_a = property(_getIsA,None,None) + synonyms = property(_getSynonym,None,None) + subsets = property(_getSubset,None,None) + xrefs = property(_getXref,None,None) + relationship = property(_getRelationShip,None,None) + union_of = property(_getUnion,None,None) + intersection_of = property(_getIntersection,None,None) + disjoint_from = property(_getDisjonction,None,None) + is_obsolete = property(_isObsolete,None,None) + replaced_by = property(_getReplacedBy,None,None) + consider = property(_getConsider,None,None) + + +def OBOEntryFactory(stanza): + ''' + Dispatcher of stanza. + + @param stanza: a stanza composed of several lines. + @type stanza: text + + @return: an C{OBOTerm} | C{OBOEntry} instance + + @note: The dispatcher treats differently the stanza which are OBO "Term" + and the others. + ''' + + stanzaType = OBOEntry.parseStanzaName(stanza) + + if stanzaType=="Term": + return OBOTerm(stanza) + else: + return OBOEntry(stanza) + +def OBOEntryIterator(file): + entries = stanzaIterator(file) + for e in entries: + debug(e) + yield OBOEntryFactory(e) + + \ No newline at end of file diff --git a/obitools/options/__init__.py b/obitools/options/__init__.py new file mode 100644 index 0000000..d6793d6 --- /dev/null +++ b/obitools/options/__init__.py @@ -0,0 +1,137 @@ +""" + Module providing high level functions to manage command line options. 
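+
+    A typical script skeleton (a sketch; the option-definition list and
+    the entry iterator depend on the calling script):
+
+        parser = getOptionManager([addInOutputOption])
+        options,entries = parser()
+        for entry in entries:
+            ...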
+""" +import logging +import sys + +from logging import debug + +from optparse import OptionParser + +from obitools.utils import universalOpen +from obitools.utils import fileSize +from obitools.utils import universalTell +from obitools.utils import progressBar +from obitools.format.options import addInputFormatOption, addInOutputOption,\ + autoEntriesIterator +import time + + + +def getOptionManager(optionDefinitions,entryIterator=None,progdoc=None): + ''' + Build an option manager fonction. that is able to parse + command line options of the script. + + @param optionDefinitions: list of function describing a set of + options. Each function must allows as + unique parametter an instance of OptionParser. + @type optionDefinitions: list of functions. + + @param entryIterator: an iterator generator function returning + entries from the data files. + + @type entryIterator: an iterator generator function with only one + parametter of type file + ''' + parser = OptionParser(progdoc) + parser.add_option('--DEBUG', + action="store_true", dest="debug", + default=False, + help="Set logging in debug mode") + + parser.add_option('--no-psyco', + action="store_true", dest="noPsyco", + default=False, + help="Don't use psyco even if it installed") + + parser.add_option('--without-progress-bar', + action="store_false", dest="progressbar", + default=True, + help="desactivate progress bar") + + checkFormat=False + for f in optionDefinitions: + if f == addInputFormatOption or f == addInOutputOption: + checkFormat=True + f(parser) + + def commandLineAnalyzer(): + options,files = parser.parse_args() + if options.debug: + logging.root.setLevel(logging.DEBUG) + + if checkFormat: + ei=autoEntriesIterator(options) + else: + ei=entryIterator + + i = allEntryIterator(files,ei,with_progress=options.progressbar) + return options,i + + return commandLineAnalyzer + +_currentInputFileName=None +_currentFile = None +_currentFileSize = None + +def currentInputFileName(): + return _currentInputFileName + +def currentInputFile(): + return _currentFile + +def currentFileSize(): + return _currentFileSize + +def currentFileTell(): + return universalTell(_currentFile) + +def fileWithProgressBar(file,step=100): + try: + size = currentFileSize() + except: + size = None + + def fileBar(): + pos=1 + progressBar(pos, size, True,currentInputFileName()) + for l in file: + progressBar(currentFileTell,size,head=currentInputFileName()) + yield l + print >>sys.stderr,'' + if size is None: + return file + else: + f = fileBar() + return f + + +def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102): + global _currentFile + global _currentInputFileName + global _currentFileSize + if files : + for f in files: + _currentInputFileName=f + f = universalOpen(f) + _currentFile=f + _currentFileSize=fileSize(_currentFile) + debug(f) + if with_progress: + f=fileWithProgressBar(f,step=histo_step) + if entryIterator is None: + for line in f: + yield line + else: + for entry in entryIterator(f): + yield entry + else: + if entryIterator is None: + for line in sys.stdin: + yield line + else: + for entry in entryIterator(sys.stdin): + yield entry + + \ No newline at end of file diff --git a/obitools/options/bioseqcutter.py b/obitools/options/bioseqcutter.py new file mode 100644 index 0000000..77189af --- /dev/null +++ b/obitools/options/bioseqcutter.py @@ -0,0 +1,85 @@ +from logging import debug + +def _beginOptionCallback(options,opt,value,parser): + def beginCutPosition(seq): + debug("begin = %s" % value ) + if hasattr(options, 
'taxonomy') and options.taxonomy is not None: + environ = {'taxonomy' : options.taxonomy,'sequence':seq} + else: + environ = {'sequence':seq} + + return eval(value,environ,seq) - 1 + + parser.values.beginCutPosition=beginCutPosition + +def _endOptionCallback(options,opt,value,parser): + def endCutPosition(seq): + if hasattr(options, 'taxonomy') and options.taxonomy is not None: + environ = {'taxonomy' : options.taxonomy,'sequence':seq} + else: + environ = {'sequence':seq} + + return eval(value,environ,seq) + + parser.values.endCutPosition=endCutPosition + + + + +def addSequenceCuttingOptions(optionManager): + + optionManager.add_option('-b','--begin', + action="callback", callback=_beginOptionCallback, + metavar="", + type="string", + help="python expression to be evaluated in the " + "sequence context. The attribute name can be " + "used in the expression as variable name. " + "An extra variable named 'sequence' refers " + "to the sequence object itself. ") + + optionManager.add_option('-e','--end', + action="callback", callback=_endOptionCallback, + metavar="", + type="string", + help="python expression to be evaluated in the " + "sequence context. The attribute name can be " + "used in the expression as variable name ." + "An extra variable named 'sequence' refers" + "to the sequence object itself. ") + + +def cutterGenerator(options): + + def sequenceCutter(seq): + + lseq = len(seq) + + if hasattr(options, 'beginCutPosition'): + begin = int(options.beginCutPosition(seq)) + else: + begin = 0 + + if hasattr(options, 'endCutPosition'): + end = int(options.endCutPosition(seq)) + else: + end = lseq + + if begin > 0 or end < lseq: + seq = seq[begin:end] + seq['subsequence']="%d..%d" % (begin+1,end) + + return seq + + return sequenceCutter + +def cutterIteratorGenerator(options): + _cutter = cutterGenerator(options) + + def sequenceCutterIterator(seqIterator): + for seq in seqIterator: + yield _cutter(seq) + + return sequenceCutterIterator + + diff --git a/obitools/options/bioseqedittag.py b/obitools/options/bioseqedittag.py new file mode 100644 index 0000000..6eb1c36 --- /dev/null +++ b/obitools/options/bioseqedittag.py @@ -0,0 +1,237 @@ +import sys +from obitools.options.taxonomyfilter import loadTaxonomyDatabase +def addSequenceEditTagOptions(optionManager): + + optionManager.add_option('--rank', + action="store_true", dest='addrank', + default=False, + help="add a rank attribute to the sequence " + "indicating the sequence position in the input data") + + optionManager.add_option('-R','--rename-tag', + action="append", + dest='renameTags', + metavar="", + type="string", + default=[], + help="change tag name from OLD_NAME to NEW_NAME") + + optionManager.add_option('--delete-tag', + action="append", + dest='deleteTags', + metavar="", + type="string", + default=[], + help="delete tag TAG_NAME") + + optionManager.add_option('-S','--set-tag', + action="append", + dest='setTags', + metavar="", + type="string", + default=[], + help="Add a new tag named TAG_NAME with " + "a value computed from PYTHON_EXPRESSION") + + optionManager.add_option('--set-identifier', + action="store", + dest='setIdentifier', + metavar="", + type="string", + default=None, + help="Set sequence identifier with " + "a value computed from PYTHON_EXPRESSION") + + optionManager.add_option('--set-sequence', + action="store", + dest='setSequence', + metavar="", + type="string", + default=None, + help="Change the sequence itself with " + "a value computed from PYTHON_EXPRESSION") + + 
optionManager.add_option('-T','--set-definition', + action="store", + dest='setDefinition', + metavar="", + type="string", + default=None, + help="Set sequence definition with " + "a value computed from PYTHON_EXPRESSION") + + optionManager.add_option('-O','--only-valid-python', + action="store_true", + dest='onlyValid', + default=False, + help="only valid python expressions are allowed") + + optionManager.add_option('-C','--clear', + action="store_true", + dest='clear', + default=False, + help="clear all tags associated to the sequences") + + optionManager.add_option('-k','--keep', + action='append', + dest='keep', + default=[], + type="string", + help="only keep this tag") + + optionManager.add_option('--length', + action="store_true", + dest='length', + default=False, + help="add seqLength tag with sequence length") + + optionManager.add_option('--with-taxon-at-rank', + action='append', + dest='taxonrank', + default=[], + type="string", + help="add taxonomy annotation at a speciefied rank level") + + optionManager.add_option('-m','--mcl', + action="store", dest="mcl", + metavar="", + type="string", + default=None, + help="split following mcl graph clustering partition") + + +def readMCLFile(file): + partition=1 + parts = {} + for l in file: + for seq in l.strip().split(): + parts[seq]=partition + partition+=1 + return parts + + + + +def sequenceTaggerGenerator(options): + toDelete = options.deleteTags[:] + toRename = [x.split(':',1) for x in options.renameTags if len(x.split(':',1))==2] + toSet = [x.split(':',1) for x in options.setTags if len(x.split(':',1))==2] + newId = options.setIdentifier + newDef = options.setDefinition + newSeq = options.setSequence + clear = options.clear + keep = set(options.keep) + length = options.length + counter = [0] + loadTaxonomyDatabase(options) + if options.taxonomy is not None: + annoteRank=options.taxonrank + else: + annoteRank=[] + + if options.mcl is not None: + parts = readMCLFile(open(options.mcl)) + else: + parts = False + + def sequenceTagger(seq): + + if counter[0]>=0: + counter[0]+=1 + + if clear or keep: + ks = seq.keys() + for k in ks: + if k not in keep: + del seq[k] + else: + for i in toDelete: + if i in seq: + del seq[i] + for o,n in toRename: + if o in seq: + seq[n]=seq[o] + del seq[o] + + for rank in annoteRank: + if 'taxid' in seq: + taxid = seq['taxid'] + if taxid is not None: + rtaxid = options.taxonomy.getTaxonAtRank(taxid,rank) + if rtaxid is not None: + scn = options.taxonomy.getScientificName(rtaxid) + else: + scn=None + seq[rank]=rtaxid + seq["%s_name"%rank]=scn + + if parts and seq.id in parts: + seq['cluster']=parts[seq.id] + + if options.addrank: + seq['rank']=counter[0] + + for i,v in toSet: + try: + if options.taxonomy is not None: + environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0]} + else: + environ = {'sequence':seq, 'counter':counter[0]} + + val = eval(v,environ,seq) + except Exception,e: + if options.onlyValid: + raise e + val = v + seq[i]=val + + if length: + seq['seqLength']=len(seq) + + if newId is not None: + try: + if options.taxonomy is not None: + environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0]} + else: + environ = {'sequence':seq, 'counter':counter[0]} + + val = eval(newId,environ,seq) + except Exception,e: + if options.onlyValid: + raise e + val = newId + seq.id=val + if newDef is not None: + try: + if options.taxonomy is not None: + environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0]} + else: + environ = {'sequence':seq, 
'counter':counter[0]} + + val = eval(newDef,environ,seq) + except Exception,e: + if options.onlyValid: + raise e + val = newDef + seq.definition=val + + if newSeq is not None: + try: + if options.taxonomy is not None: + environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0]} + else: + environ = {'sequence':seq, 'counter':counter[0]} + + val = eval(newSeq,environ,seq) + except Exception,e: + if options.onlyValid: + raise e + val = newSeq + if hasattr(seq, '_seq'): + seq._seq=str(val).lower() + if 'seqLength' in seq: + seq['seqLength']=len(seq) + + return seq + + return sequenceTagger \ No newline at end of file diff --git a/obitools/options/bioseqfilter.py b/obitools/options/bioseqfilter.py new file mode 100644 index 0000000..d52c9b5 --- /dev/null +++ b/obitools/options/bioseqfilter.py @@ -0,0 +1,179 @@ +import re + +from obitools.options.taxonomyfilter import addTaxonomyFilterOptions +from obitools.options.taxonomyfilter import taxonomyFilterGenerator + +def _sequenceOptionCallback(options,opt,value,parser): + parser.values.sequencePattern = re.compile(value,re.I) + +def _defintionOptionCallback(options,opt,value,parser): + parser.values.definitionPattern = re.compile(value) + +def _identifierOptionCallback(options,opt,value,parser): + parser.values.identifierPattern = re.compile(value) + +def _attributeOptionCallback(options,opt,value,parser): + if not hasattr(options, 'attributePatterns'): + parser.values.attributePatterns={} + attribute,pattern=value.split(':',1) + parser.values.attributePatterns[attribute]=re.compile(pattern) + +def _predicatOptionCallback(options,opt,value,parser): + if not hasattr(options, 'predicats'): + options.predicats=[] + parser.values.predicats.append(value) + + +def addSequenceFilteringOptions(optionManager): + + optionManager.add_option('-s','--sequence', + action="callback", callback=_sequenceOptionCallback, + metavar="", + type="string", + help="regular expression pattern used to select " + "the sequence. The pattern is case insensitive") + + optionManager.add_option('-D','--definition', + action="callback", callback=_defintionOptionCallback, + type="string", + metavar="", + help="regular expression pattern matched against " + "the definition of the sequence. " + "The pattern is case sensitive") + + optionManager.add_option('-I','--identifier', + action="callback", callback=_identifierOptionCallback, + type="string", + metavar="", + help="regular expression pattern matched against " + "the identifier of the sequence. " + "The pattern is case sensitive") + + optionManager.add_option('-a','--attribute', + action="callback", callback=_attributeOptionCallback, + type="string", + metavar=":", + help="regular expression pattern matched against " + "the attributes of the sequence. " + "the value of this atribute is of the form : " + "attribute_name:regular_pattern. " + "The pattern is case sensitive." + "Several -a option can be used on the same " + "commande line.") + + optionManager.add_option('-A','--has-attribute', + action="append", + type="string", + dest="has_attribute", + default=[], + metavar="", + help="select sequence with attribute " + "defined") + + optionManager.add_option('-p','--predicat', + action="append", dest="predicats", + metavar="", + help="python boolean expression to be evaluated in the " + "sequence context. The attribute name can be " + "used in the expression as variable name ." + "An extra variable named 'sequence' refers" + "to the sequence object itself. 
" + "Several -p option can be used on the same " + "commande line.") + + optionManager.add_option('-L','--lmax', + action='store', + metavar="<##>", + type="int",dest="lmax", + help="keep sequences shorter than lmax") + + optionManager.add_option('-l','--lmin', + action='store', + metavar="<##>", + type="int",dest="lmin", + help="keep sequences longer than lmin") + + optionManager.add_option('-v','--inverse-match', + action='store_true', + default=False, + dest="invertedFilter", + help="revert the sequence selection " + "[default : %default]") + + addTaxonomyFilterOptions(optionManager) + + + + + +def filterGenerator(options): + taxfilter = taxonomyFilterGenerator(options) + + def sequenceFilter(seq): + good = True + + if hasattr(options, 'sequencePattern'): + good = bool(options.sequencePattern.search(str(seq))) + + if good and hasattr(options, 'identifierPattern'): + good = bool(options.identifierPattern.search(seq.id)) + + if good and hasattr(options, 'definitionPattern'): + good = bool(options.definitionPattern.search(seq.definition)) + + if good : + good = reduce(lambda x,y:x and y, + (k in seq for k in options.has_attribute), + True) + + if good and hasattr(options, 'attributePatterns'): + good = (reduce(lambda x,y : x and y, + (bool(options.attributePatterns[p].search(str(seq[p]))) + for p in options.attributePatterns + if p in seq),True) + and + reduce(lambda x,y : x and y, + (bool(p in seq) + for p in options.attributePatterns),True) + ) + + if good and hasattr(options, 'predicats') and options.predicats is not None: + if options.taxonomy is not None: + e = {'taxonomy' : options.taxonomy,'sequence':seq} + else: + e = {'sequence':seq} + + good = (reduce(lambda x,y: x and y, + (bool(eval(p,e,seq)) + for p in options.predicats),True) + ) + + if good and hasattr(options, 'lmin') and options.lmin is not None: + good = len(seq) >= options.lmin + + if good and hasattr(options, 'lmax') and options.lmax is not None: + good = len(seq) <= options.lmax + + if good: + good = taxfilter(seq) + + if hasattr(options, 'invertedFilter') and options.invertedFilter: + good=not good + + + return good + + return sequenceFilter + +def sequenceFilterIteratorGenerator(options): + filter = filterGenerator(options) + + def sequenceFilterIterator(seqIterator): + for seq in seqIterator: + if filter(seq): + yield seq + + return sequenceFilterIterator + + + \ No newline at end of file diff --git a/obitools/options/taxonomyfilter.py b/obitools/options/taxonomyfilter.py new file mode 100644 index 0000000..5526c79 --- /dev/null +++ b/obitools/options/taxonomyfilter.py @@ -0,0 +1,6 @@ +from obitools.ecopcr.options import addTaxonomyDBOptions, \ + addTaxonomyFilterOptions, \ + loadTaxonomyDatabase, \ + taxonomyFilterGenerator, \ + taxonomyFilterIteratorGenerator + diff --git a/obitools/parallel/__init__.py b/obitools/parallel/__init__.py new file mode 100644 index 0000000..2aa1b07 --- /dev/null +++ b/obitools/parallel/__init__.py @@ -0,0 +1,99 @@ +import threading + +class TaskPool(object): + + def __init__(self,iterable,function,count=2): + self.pool = [] + self.queue= [] + self.plock= threading.Lock() + self.qlock= threading.Lock() + self.function=function + self.event=threading.Event() + self.iterable=iterable + for i in xrange(count): + Task(self) + + def register(self,task): + self.plock.acquire() + self.pool.append(task) + self.plock.release() + self.ready(task) + + def unregister(self,task): + task.thread.join() + self.plock.acquire() + self.pool.remove(task) + self.plock.release() + + + def ready(self,task): 
+ self.qlock.acquire() + self.queue.append(task) + self.qlock.release() + self.event.set() + + def __iter__(self): + for data in self.iterable: + while not self.queue: + self.event.wait() + self.event.clear() + self.qlock.acquire() + task=self.queue.pop(0) + self.qlock.release() + if hasattr(task, 'rep'): + yield task.rep + #print "send ",data + if isinstance(data,dict): + task.submit(**data) + else: + task.submit(*data) + + while self.pool: + self.pool[0].finish() + while self.queue: + self.event.clear() + self.qlock.acquire() + task=self.queue.pop(0) + self.qlock.release() + if hasattr(task, 'rep'): + yield task.rep + + + + + +class Task(object): + def __init__(self,pool): + self.pool = pool + self.lock = threading.Lock() + self.dataOk = threading.Event() + self.repOk = threading.Event() + self.args = None + self.kwargs=None + self.stop=False + self.thread = threading.Thread(target=self) + self.thread.start() + self.pool.register(self) + + def __call__(self): + self.dataOk.wait() + while(not self.stop): + self.lock.acquire() + self.dataOk.clear() + self.rep=self.pool.function(*self.args,**self.kwargs) + self.pool.ready(self) + self.lock.release() + self.dataOk.wait() + + def submit(self,*args,**kwargs): + self.args=args + self.kwargs=kwargs + self.dataOk.set() + + def finish(self): + self.lock.acquire() + self.stop=True + self.dataOk.set() + self.pool.unregister(self) + + diff --git a/obitools/parallel/jobqueue.py b/obitools/parallel/jobqueue.py new file mode 100644 index 0000000..9df4804 --- /dev/null +++ b/obitools/parallel/jobqueue.py @@ -0,0 +1,183 @@ +import threading +from logging import warning,info +from time import sleep,time + +from obitools.parallel import TaskPool + + +class JobPool(dict): + ''' + JobPool is dedicated to manage a job queue. These jobs + will run in a limited number of thread. + ''' + + def __init__(self,count,precision=0.01): + ''' + + @param count: number of thread dedicated to this JobPool + @type count: int + @param precision: delay between two check for new job (in second) + @type precision: float + ''' + self._iterator = JobIterator(self) + self._taskPool = TaskPool(self._iterator, + self._runJob, + count) + self._precision=precision + self._toRun=set() + self._runnerThread = threading.Thread(target=self._runner) + self._runnerThread.start() + self._finalyzed=False + + def _runner(self): + for rep in self._taskPool: + info('Job %d finnished' % id(rep)) + info('All jobs in %d JobPool finished' % id(self)) + + def _jobIterator(self): + return self._iterator + + def _runJob(self,job): + job.started= time() + info('Job %d started' % id(job)) + job.result = job() + job.ended = time() + job.finished=True + return job + + def submit(self,job,priority=1.0,userid=None): + ''' + Submit a new job to the JobPool. + + @param job: the new submited job + @type job: Job instance + @param priority: priority level of this job (higher is better) + @type priority: float + @param userid: a user identifier (Default is None) + + @return: job identifier + @rtype: int + ''' + + assert not self._finalyzed,\ + "This jobPool does not accept new job" + if job.submitted is not None: + warning('Job %d was already submitted' % id(job)) + return id(job) + + job.submitted = time() + job.priority = priority + job.userid = userid + i=id(job) + job.id=id + self[i]=job + self._toRun.add(job) + + info('Job %d submitted' % i) + + return i + + def finalyze(self): + ''' + Indicate to the JobPool, that no new jobs will + be submitted. 
+ ''' + self._iterator.finalyze() + self._finalyzed=True + + def __del__(self): + self.finalyze() + + +class JobIterator(object): + def __init__(self,pool): + self._pool = pool + self._finalyze=False + self._nextLock=threading.Lock() + + + def __iter__(self): + return self + + def finalyze(self): + ''' + Indicate to the JobIterator, that no new jobs will + be submitted. + ''' + self._finalyze=True + + + def next(self): + ''' + + @return: the next job to run + @rtype: Job instance + ''' + self._nextLock.acquire() + while self._pool._toRun or not self._finalyze: + rep = None + maxScore=0 + for k in self._pool._toRun: + s = k.runScore() + if s > maxScore: + maxScore=s + rep=k + if rep is not None: + self._pool._toRun.remove(rep) + self._nextLock.release() + return (rep,) + sleep(self._pool._precision) + self._nextLock.release() + info('No more jobs in %d JobPool' % id(self._pool)) + raise StopIteration + + + +class Job(object): + + def __init__(self,pool=None,function=None,*args,**kwargs): + ''' + Create a new job + + @param pool: the jobpool used to run job. Can be None to not + execute the job immediately. + @type pool: JobPool instance + + @param function: the function to run for the job + @type function: callable object + + @param args: parametters for function call + @param kwargs: named parametters for function call + + @precondition: function cannot be None + ''' + assert function is not None + self._args=args + self._kwargs = kwargs + self._function = function + self.running = False + self.finished= False + self.submitted = None + self.priority = None + self.userid = None + + if pool is not None: + pool.submit(self) + + def runScore(self): + ''' + @return: the score used to ordonnance job in the queue + @rtype: C{float} + ''' + + return (time() - self.submitted) * self.priority + + def __call__(self): + return self._function(*self._args,**self._kwargs) + + + + + + + \ No newline at end of file diff --git a/obitools/phylogeny/__init__.py b/obitools/phylogeny/__init__.py new file mode 100644 index 0000000..8eb1587 --- /dev/null +++ b/obitools/phylogeny/__init__.py @@ -0,0 +1,119 @@ + +from obitools.graph.tree import Forest,TreeNode +from obitools.graph import Edge + + + +class PhylogenicTree(Forest): + + def __init__(self,label='G',indexer=None,nodes=None,edges=None): + Forest.__init__(self, label, indexer, nodes, edges) + self.root=None + self.comment=None + + def addNode(self,node=None,index=None,**data): + if node is None and index is None: + node = '__%d' % (len(self._node)+1) + + return Forest.addNode(self, node, index, **data) + + def getNode(self,node=None,index=None): + if index is None: + index = self._index.getIndex(node, True) + return PhylogenicNode(index,self) + + def getEdge(self,node1=None,node2=None,index1=None,index2=None): + ''' + + @param node1: + @type node1: + @param node2: + @type node2: + @param index1: + @type index1: + @param index2: + @type index2: + ''' + node1=self.getNode(node1, index1) + node2=self.getNode(node2, index2) + return PhylogenicEdge(node1,node2) + + + +class PhylogenicNode(TreeNode): + + def getLabel(self): + label = TreeNode.getLabel(self) + if label[0:2]=='__': + return None + else: + return label + + def __str__(self): + + if self.index in self.graph._node_attrs: + keys = " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"')) + for x in self.graph._node_attrs[self.index].iteritems()] + ) + else: + keys='' + + if self.label is None: + label='' + shape='point' + else: + label=self.label + shape='box' + + return '%d [label="%s" shape="%s" 
%s]' % (self.index,str(label).replace('"','\\"'),shape,keys)
+
+    def distanceTo(self,node=None,index=None):
+        '''
+        Compute the branch length between the two nodes.
+        If distances are not specified for this tree, None is returned.
+
+        @param node: a node label or None
+        @param index: a node index or None. The parameter index
+                      has priority over the parameter node.
+        @type index: int
+
+        @return: the evolutive distance between the two nodes
+        @rtype: int, float or None
+        '''
+        path = self.shortestPathTo(node, index)
+
+        start = path.pop(0)
+        dist=0
+        for dest in path:
+            edge = self.graph.getEdge(index1=start,index2=dest)
+            if 'distance' in edge:
+                dist+=edge['distance']
+            else:
+                return None
+            start=dest
+
+        return dist
+
+    label = property(getLabel, None, None, "Label of the node")
+
+class PhylogenicEdge(Edge):
+
+    def __str__(self):
+        e = (self.node1.index,self.node2.index)
+        if e in self.graph._edge_attrs:
+            keys = "[%s]" % " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"'))
+                                      for x in self.graph._edge_attrs[e].iteritems()
+                                      if x[0] not in ('distance','bootstrap')]
+                                    )
+        else:
+            keys = ""
+
+
+
+        if self.directed:
+            link='->'
+        else:
+            link='--'
+
+        return "%d %s %d %s" % (self.node1.index,link,self.node2.index,keys)
+
diff --git a/obitools/phylogeny/newick.py b/obitools/phylogeny/newick.py
new file mode 100644
index 0000000..cf0330c
--- /dev/null
+++ b/obitools/phylogeny/newick.py
@@ -0,0 +1,123 @@
+import re
+import sys
+
+from obitools.utils import universalOpen
+from obitools.phylogeny import PhylogenicTree
+
+def subNodeIterator(data):
+    level=0
+    start = 1
+    if data[0]=='(':
+        for i in xrange(1,len(data)):
+            c=data[i]
+            if c=='(':
+                level+=1
+            elif c==')':
+                level-=1
+            if c==',' and not level:
+                yield data[start:i]
+                start = i+1
+        yield data[start:i]
+    else:
+        yield data
+
+
+_nodeParser=re.compile('\s*(?P<subnodes>\(.*\))?(?P<name>[^ :]+)? *(?P<bootstrap>[0-9.]+)?(:(?P<distance>-?[0-9.]+))?')
+
+def nodeParser(data):
+    parsedNode = _nodeParser.match(data).groupdict(0)
+    if not parsedNode['name']:
+        parsedNode['name']=None
+
+    if not parsedNode['bootstrap']:
+        parsedNode['bootstrap']=None
+    else:
+        parsedNode['bootstrap']=float(parsedNode['bootstrap'])
+
+    if not parsedNode['distance']:
+        parsedNode['distance']=None
+    else:
+        parsedNode['distance']=float(parsedNode['distance'])
+
+    if not parsedNode['subnodes']:
+        parsedNode['subnodes']=None
+
+    return parsedNode
+
+_cleanTreeData=re.compile('\s+')
+
+def treeParser(data,tree=None,parent=None):
+    if tree is None:
+        tree = PhylogenicTree()
+        data = _cleanTreeData.sub(' ',data).strip()
+
+    parsedNode = nodeParser(data)
+
+    if parent is not None:
+        son,parent = tree.addEdge(node1=parsedNode['name'],
+                                  index2=parent,
+                                  distance=parsedNode['distance'],
+                                  bootstrap=parsedNode['bootstrap'])
+    else:
+        son = tree.addNode(node1=parsedNode['name'])
+        tree.root=son
+
+
+
+    if parsedNode['subnodes']:
+        for subnode in subNodeIterator(parsedNode['subnodes']):
+            treeParser(subnode,tree,son)
+
+    return tree
+
+_treecomment=re.compile('\[.*\]')
+
+def treeIterator(file):
+    file = universalOpen(file)
+    data = file.read()
+
+    comment = _treecomment.findall(data)
+    data=_treecomment.sub('',data).strip()
+
+    if comment:
+        comment=comment[0]
+    else:
+        comment=None
+    for tree in data.split(';'):
+        t = treeParser(tree)
+        if comment:
+            t.comment=comment
+        yield t
+
+def nodeWriter(tree,node,deep=0):
+    name = node._name
+    if name is None:
+        name=''
+
+    distance=node._dist
+    if distance is None:
+        distance=''
+    else:
+        distance = ':%6.5f' % distance
+
+    bootstrap=node._bootstrap
+    if bootstrap is None:
+        bootstrap=''
+    else:
+        bootstrap=' %d' % int(bootstrap)
+
+    nodeseparator = ',\n' + '  ' * (deep+1)
+
+    subnodes = nodeseparator.join([nodeWriter(tree, x, deep+1)
+                                   for x in tree.childNodeIterator(node)])
+    if subnodes:
+        subnodes='(\n' + '  ' * (deep+1) + subnodes + '\n' + '  ' * deep + ')'
+
+    return '%s%s%s%s' % (subnodes,name,bootstrap,distance)
+
+def treeWriter(tree,startnode=None):
+    if startnode is not None:
+        root=startnode
+    else:
+        root = tree.getRoot()
+    return nodeWriter(tree,root)+';'
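+
+# Minimal usage sketch (illustrative only; "tree.nwk" is a hypothetical
+# Newick file):
+#
+#    from obitools.phylogeny.newick import treeIterator, treeWriter
+#
+#    for tree in treeIterator('tree.nwk'):
+#        print treeWriter(tree)
+#
+# treeIterator yields one PhylogenicTree per ';'-separated record and
+# treeWriter serializes a tree back to a Newick string.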
diff --git a/obitools/profile/__init__.py b/obitools/profile/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/profile/_profile.so b/obitools/profile/_profile.so
new file mode 100755
index 0000000..7f52483
Binary files /dev/null and b/obitools/profile/_profile.so differ
diff --git a/obitools/sample.py b/obitools/sample.py
new file mode 100644
index 0000000..4894c94
--- /dev/null
+++ b/obitools/sample.py
@@ -0,0 +1,76 @@
+'''
+Created on 31 oct. 2009
+
+@author: coissac
+'''
+from random import shuffle, randrange
+
+def lookfor(x,cumsum):
+    lmax=len(cumsum)
+    lmin=0
+
+    assert x < cumsum[-1],"x must be smaller than the cumulative sum"
+
+    while((lmax - lmin) > 0):
+
+        i=(lmax+lmin)/2
+        #print i,lmin,lmax
+        if ((x<cumsum[i]) and (x>cumsum[i-1])):
+            #print "return 1 :",i,cumsum[i-1],"<",x,"<",cumsum[i]
+            return i
+        elif cumsum[i]==x:
+            while cumsum[i]==x:
+                i+=1
+            #print "return 2 :",i,cumsum[i],"<",x,"<",cumsum[i+1]
+            return i
+        elif x<cumsum[i]:
+            lmax=i
+        else:
+            lmin=i+1
+    return lmin
+
+def weigthedSample(events,size):
+    entries = [k for k in events.iterkeys() if events[k]>0]
+    shuffle(entries)
+    cumul=[]
+    s=0
+    for e in entries:
+        s+=events[e]
+        cumul.append(s)
+
+    #print cumul
+    result={}
+
+    for t in xrange(size):
+        e=lookfor(randrange(s), cumul)
+        k=entries[e]
+        result[k]=result.get(k,0)+1
+
+    return result
+
+def weigthedSampleWithoutReplacement(events,size):
+    entries = [k for k in events.iterkeys() if events[k]>0]
+    shuffle(entries)
+    cumul=[]
+    s=0
+    for e in entries:
+        s+=events[e]
+        cumul.append(s)
+
+    #print cumul
+    result={}
+
+    for t in xrange(size):
+        # print s,cumul,
+        e=lookfor(randrange(s), cumul)
+        # print e
+        k=entries[e]
+        for x in xrange(e,len(cumul)):
+            cumul[x]-=1
+        s-=1
+        result[k]=result.get(k,0)+1
+
+    return result
\ No newline at end of file
diff --git a/obitools/seqdb/__init__.py b/obitools/seqdb/__init__.py
new file mode 100644
index 0000000..274cbad
--- /dev/null
+++ b/obitools/seqdb/__init__.py
@@ -0,0 +1,88 @@
+from obitools import NucSequence,AASequence
+from obitools.format.genericparser import genericEntryIteratorGenerator
+from obitools.location.feature import featureIterator
+
+from itertools import chain
+
+class AnnotatedSequence(object):
+
+    def __init__(self,header,featureTable,secondaryAcs):
+        self._header = header
+        self._featureTableText = featureTable
+        self._featureTable=None
+        self._secondaryAcs=secondaryAcs
+        self._hasTaxid=None
+
+    def getHeader(self):
+        return self._header
+
+
+    def getFeatureTable(self,skipError=False):
+        if self._featureTable is None:
+            self._featureTable = [x for x in featureIterator(self._featureTableText,skipError)]
+        return self._featureTable
+
+
+    def getSecondaryAcs(self):
+        return self._secondaryAcs
+
+    def extractTaxon(self):
+        if self._hasTaxid is None:
+
+            if self._featureTable is not None:
+                s = [f for f in self._featureTable if f.ftType=='source']
+            else:
+                s = featureIterator(self._featureTableText).next()
+                if s.ftType=='source':
+                    s = [s]
+                else:
+                    s = [f for f in self.featureTable if f.ftType=='source']
+
+            t =set(int(v[6:]) for v in chain(*tuple(f['db_xref'] for f in s if 'db_xref' in f))
+                   if v[0:6]=='taxon:')
+
+            self._hasTaxid=False
+
+            if len(t)==1 :
+                taxid=t.pop()
+                if taxid >=0:
+                    self['taxid']=taxid
+                    self._hasTaxid=True
+
+
+            t =set(chain(*tuple(f['organism'] for f in s if 'organism' in f)))
+
+            if len(t)==1:
+                self['organism']=t.pop()
+
+
+    header = property(getHeader, None, None, "Raw header of the database entry")
+
+    featureTable = property(getFeatureTable, None, None, "Feature table of the entry")
+
+    secondaryAcs = property(getSecondaryAcs, None, None, "List of secondary accession numbers")
+
+class AnnotatedNucSequence(AnnotatedSequence,NucSequence):
+    '''
+    Nucleic sequence read from an annotated database entry.
+    '''
+    def __init__(self,id,seq,de,header,featureTable,secondaryAcs,**info):
+        NucSequence.__init__(self, id, seq, de,**info)
+        AnnotatedSequence.__init__(self, header, featureTable, secondaryAcs)
+
+
+class AnnotatedAASequence(AnnotatedSequence,AASequence):
+    '''
+    Peptidic sequence read from an annotated database entry.
+    '''
+    def __init__(self,id,seq,de,header,featureTable,secondaryAcs,**info):
+        AASequence.__init__(self, id, seq, de,**info)
+        AnnotatedSequence.__init__(self, header, featureTable, secondaryAcs)
+
+
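+# Minimal usage sketch (illustrative; 'entry' stands for any annotated
+# sequence built by one of the seqdb parsers):
+#
+#    entry.extractTaxon()
+#    print entry['taxid'], entry['organism']
+#
+# extractTaxon() scans the 'source' features once and caches the result
+# (the attributes are set only when a single taxon is found), so calling
+# it twice does not re-parse the feature table.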
+ +nucEntryIterator=genericEntryIteratorGenerator(endEntry='^//') +aaEntryIterator=genericEntryIteratorGenerator(endEntry='^//') + + + diff --git a/obitools/seqdb/blastdb/__init__.py b/obitools/seqdb/blastdb/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/obitools/seqdb/dnaparser.py b/obitools/seqdb/dnaparser.py new file mode 100644 index 0000000..85b82a2 --- /dev/null +++ b/obitools/seqdb/dnaparser.py @@ -0,0 +1,16 @@ +from obitools.format.sequence import embl,fasta,genbank + +class UnknownFormatError(Exception): + pass + +def whichParser(seq): + if seq[0]=='>': + return fasta.fastaNucParser + if seq[0:2]=='ID': + return embl.emblParser + if seq[0:5]=='LOCUS': + return genbank.genbankParser + raise UnknownFormatError,"Unknown nucleic format" + +def nucleicParser(seq): + return whichParser(seq)(seq) diff --git a/obitools/seqdb/embl/__init__.py b/obitools/seqdb/embl/__init__.py new file mode 100644 index 0000000..94f9efc --- /dev/null +++ b/obitools/seqdb/embl/__init__.py @@ -0,0 +1,13 @@ +from obitools.seqdb import AnnotatedNucSequence, AnnotatedAASequence +from obitools.location import locationGenerator,extractExternalRefs + + + +class EmblSequence(AnnotatedNucSequence): + ''' + Class used to represent a nucleic sequence issued from EMBL. + ''' + + + + diff --git a/obitools/seqdb/embl/parser.py b/obitools/seqdb/embl/parser.py new file mode 100644 index 0000000..2e3624f --- /dev/null +++ b/obitools/seqdb/embl/parser.py @@ -0,0 +1,50 @@ +import re +import sys + +from obitools.seqdb import embl +from obitools.seqdb import nucEntryIterator + +_featureMatcher = re.compile('(^FT .*\n)+', re.M) +_cleanFT = re.compile('^FT',re.M) + +_headerMatcher = re.compile('^ID.+(?=\nFH )', re.DOTALL) +_seqMatcher = re.compile('(^ ).+(?=//\n)', re.DOTALL + re.M) +_cleanSeq = re.compile('[ \n0-9]+') +_acMatcher = re.compile('(?<=^AC ).+',re.M) +_deMatcher = re.compile('(^DE .+\n)+',re.M) +_cleanDe = re.compile('(^|\n)DE +') + +def __emblparser(text): + try: + header = _headerMatcher.search(text).group() + + ft = _featureMatcher.search(text).group() + ft = _cleanFT.sub(' ',ft) + + seq = _seqMatcher.search(text).group() + seq = _cleanSeq.sub('',seq).upper() + + acs = _acMatcher.search(text).group() + acs = acs.split() + ac = acs[0] + acs = acs[1:] + + de = _deMatcher.search(header).group() + de = _cleanDe.sub(' ',de).strip().strip('.') + except AttributeError,e: + print >>sys.stderr,'=======================================================' + print >>sys.stderr,text + print >>sys.stderr,'=======================================================' + raise e + + return (ac,seq,de,header,ft,acs) + +def emblParser(text): + return embl.EmblSequence(*__emblparser(text)) + + +def emblIterator(file): + for e in nucEntryIterator(file): + yield emblParser(e) + + \ No newline at end of file diff --git a/obitools/seqdb/genbank/__init__.py b/obitools/seqdb/genbank/__init__.py new file mode 100644 index 0000000..fb5b622 --- /dev/null +++ b/obitools/seqdb/genbank/__init__.py @@ -0,0 +1,84 @@ +from obitools.seqdb import AnnotatedNucSequence, AnnotatedAASequence +from obitools.location import locationGenerator,extractExternalRefs + + + +class GbSequence(AnnotatedNucSequence): + ''' + Class used to represent a nucleic sequence issued from Genbank. + ''' + + +class GpepSequence(AnnotatedAASequence): + ''' + Class used to represent a peptidic sequence issued from Genpep. 
+ ''' + + def __init__(self,id,seq,de,header,featureTable,secondaryAcs,**info): + AnnotatedAASequence.__init__(self,id, seq, de, header, featureTable, secondaryAcs,**info) + self.__hasNucRef=None + + def __getGeneRef(self): + if self.__hasNucRef is None: + self.__hasNucRef=False + cds = [x for x in self.featureTable + if x.ftType=='CDS' + and 'coded_by' in x] + + if cds: + source = cds[0]['coded_by'][0] + if 'transl_table' in cds[0]: + tt = cds[0]['transl_table'][0] + else: + tt=None + ac,loc = extractExternalRefs(source) + + if len(ac)==1: + ac = ac.pop() + self.__hasNucRef=True + self.__nucRef = (ac,loc,tt) + + + + def geneAvailable(self): + ''' + Predicat indicating if reference to the nucleic sequence encoding + this protein is available in feature table. + + @return: True if gene description is available + @rtype: bool + ''' + self.__getGeneRef() + return self.__hasNucRef is not None and self.__hasNucRef + + + def getCDS(self,database): + ''' + Return the nucleic sequence coding for this protein if + data are available. + + @param database: a database object where looking for the sequence + @type database: a C{dict} like object + + @return: a NucBioseq instance carreponding to the CDS + @rtype: NucBioSeq + + @raise AssertionError: if no gene references are available + @see: L{geneAvailable} + + ''' + + assert self.geneAvailable(), \ + "No information available to retreive gene sequence" + + ac,loc,tt = self.__nucRef + seq = database[ac] + seq.extractTaxon() + gene = seq[loc] + if tt is not None: + gene['transl_table']=tt + return gene + + + + diff --git a/obitools/seqdb/genbank/ncbi.py b/obitools/seqdb/genbank/ncbi.py new file mode 100644 index 0000000..40ddf91 --- /dev/null +++ b/obitools/seqdb/genbank/ncbi.py @@ -0,0 +1,79 @@ +from urllib2 import urlopen +import sys +import re + +import cStringIO + +from obitools.eutils import EFetch +from parser import genbankParser,genpepParser +from parser import genbankIterator,genpepIterator + +from obitools.utils import CachedDB + + +class NCBIGenbank(EFetch): + def __init__(self): + EFetch.__init__(self,db='nucleotide', + rettype='gbwithparts') + + def __getitem__(self,ac): + if isinstance(ac,str): + text = self.get(id=ac) + seq = genbankParser(text) + return seq + else: + query = ','.join([x for x in ac]) + data = cStringIO.StringIO(self.get(id=query)) + return genbankIterator(data) + + + + +class NCBIGenpep(EFetch): + def __init__(self): + EFetch.__init__(self,db='protein', + rettype='gbwithparts') + + def __getitem__(self,ac): + if isinstance(ac,str): + text = self.get(id=ac) + seq = genpepParser(text) + return seq + else: + query = ','.join([x for x in ac]) + data = cStringIO.StringIO(self.get(id=query)) + return genpepIterator(data) + +class NCBIAccession(EFetch): + + _matchACS = re.compile(' +accession +"([^"]+)"') + + def __init__(self): + EFetch.__init__(self,db='nucleotide', + rettype='seqid') + + def __getitem__(self,ac): + if isinstance(ac,str): + text = self.get(id=ac) + rep = NCBIAccession._matchACS.search(text).group(1) + return rep + else: + query = ','.join([x for x in ac]) + text = self.get(id=query) + rep = (ac.group(1) for ac in NCBIAccession._matchACS.finditer(text)) + return rep + +def Genbank(cache=None): + gb = NCBIGenbank() + if cache is not None: + gb = CachedDB(cache, gb) + return gb + + +def Genpep(cache=None): + gp = NCBIGenpep() + if cache is not None: + gp = CachedDB(cache, gp) + return gp + + diff --git a/obitools/seqdb/genbank/parser.py b/obitools/seqdb/genbank/parser.py new file mode 100644 index 
0000000..b52fe59 --- /dev/null +++ b/obitools/seqdb/genbank/parser.py @@ -0,0 +1,53 @@ +import re +import sys + +import obitools.seqdb.genbank as gb +from obitools.seqdb import nucEntryIterator,aaEntryIterator + +_featureMatcher = re.compile('^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M) + +_headerMatcher = re.compile('^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M) +_seqMatcher = re.compile('(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M) +_cleanSeq = re.compile('[ \n0-9]+') +_acMatcher = re.compile('(?<=^ACCESSION ).+',re.M) +_deMatcher = re.compile('(?<=^DEFINITION ).+\n( .+\n)*',re.M) +_cleanDe = re.compile('\n *') + +def __gbparser(text): + try: + header = _headerMatcher.search(text).group() + ft = _featureMatcher.search(text).group() + seq = _seqMatcher.search(text).group() + seq = _cleanSeq.sub('',seq).upper() + acs = _acMatcher.search(text).group() + acs = acs.split() + ac = acs[0] + acs = acs[1:] + de = _deMatcher.search(header).group() + de = _cleanDe.sub(' ',de).strip().strip('.') + except AttributeError,e: + print >>sys.stderr,'=======================================================' + print >>sys.stderr,text + print >>sys.stderr,'=======================================================' + raise e + + return (ac,seq,de,header,ft,acs) + +def genbankParser(text): + return gb.GbSequence(*__gbparser(text)) + + +def genbankIterator(file): + for e in nucEntryIterator(file): + yield genbankParser(e) + + +def genpepParser(text): + return gb.GpepSequence(*__gbparser(text)) + + +def genpepIterator(file): + for e in aaEntryIterator(file): + yield genpepParser(e) + + \ No newline at end of file diff --git a/obitools/sequenceencoder/__init__.py b/obitools/sequenceencoder/__init__.py new file mode 100644 index 0000000..89a8a59 --- /dev/null +++ b/obitools/sequenceencoder/__init__.py @@ -0,0 +1,73 @@ +from obitools import location + +class SequenceEncoder(object): + pass + +class DNAComplementEncoder(SequenceEncoder): + _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a', + 'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k', + 's': 's', 'w': 'w', 'b': 'v', 'd': 'h', + 'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a', + '-': '-'} + + _info={'complemented':True} + + @staticmethod + def _encode(seq,position=slice(None, None, -1)): + cseq = [DNAComplementEncoder._comp.get(x.lower(),'n') for x in seq[position]] + return ''.join(cseq) + + @staticmethod + def _check(seq): + assert seq.isNucleotide() + + @staticmethod + def _convertpos(position): + if isinstance(position, int): + return -(position+1) + elif isinstance(position, slice): + return slice(-(position.stop+1), + -(position.start+1), + -position.step) + elif isinstance(position, location.Location): + return location.ComplementLocation(position).simplify() + + raise TypeError,"position must be an int, slice or Location instance" + + @staticmethod + def complement(seq): + return seq + +class SeqFragmentEncoder(SequenceEncoder): + def __init__(self,begin,end): + assert begin < end and begin >=0 + self._limits = slice(begin,end) + self._info = {'cut' : [begin,end,1]} + self._len = end - begin + 1 + + def _check(self,seq): + lseq = len(seq) + assert self._limits.stop <= lseq + + def _encode(self,seq,position=None): + return str(seq)[self._limits] + + def _convertpos(self,position): + if isinstance(position, int): + if position < -self._len or position >= self._len: + raise IndexError,position + if position >=0: + return self._limits.start + position + else: + return self._limits.stop + position + 1 + elif isinstance(position, slice): + return slice(-(position.stop+1), + 
-(position.start+1),
+                         -position.step)
+        elif isinstance(position, location.Location):
+            return location.ComplementLocation(position).simplify()
+
+        raise TypeError,"position must be an int, slice or Location instance"
+
+
+
\ No newline at end of file
diff --git a/obitools/sequenceencoder/__init__.pyc b/obitools/sequenceencoder/__init__.pyc
new file mode 100644
index 0000000..463f84f
Binary files /dev/null and b/obitools/sequenceencoder/__init__.pyc differ
diff --git a/obitools/solexa/__init__.py b/obitools/solexa/__init__.py
new file mode 100644
index 0000000..60e35f8
--- /dev/null
+++ b/obitools/solexa/__init__.py
@@ -0,0 +1,45 @@
+from obitools import utils
+from obitools import NucSequence
+from obitools.dnahash import hashCodeIterator
+
+
+class SolexaSequence(NucSequence):
+    def __init__(self,id,seq,definition=None,quality=None,**info):
+        NucSequence.__init__(self, id, seq, definition,**info)
+        self._quality=quality
+        self._hash=None
+
+    def getQuality(self):
+        if isinstance(self._quality, str):
+            self._quality=[int(x) for x in self._quality.split()]
+        return self._quality
+
+
+    def __hash__(self):
+        if self._hash is None:
+            self._hash = hashCodeIterator(str(self), len(str(self)), 16, 0).next()[1].pop()
+        return self._hash
+
+class SolexaFile(utils.ColumnFile):
+    def __init__(self,stream):
+        utils.ColumnFile.__init__(self,
+                                  stream, ':', True,
+                                  (str,
+                                   int,int,int,int,
+                                   str,
+                                   str), "#")
+
+
+    def next(self):
+        data = utils.ColumnFile.next(self)
+        seq = SolexaSequence('%d_%d_%d_%d'%(data[1],data[2],data[3],data[4]),
+                             data[5],
+                             quality=data[6])
+        seq['machine']=data[0]
+        seq['channel']=data[1]
+        seq['tile']=data[2]
+        seq['pos_x']=data[3]
+        seq['pos_y']=data[4]
+
+        #assert len(seq['quality'])==len(seq),"Error in file format"
+        return seq
diff --git a/obitools/statistics/__init__.py b/obitools/statistics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/obitools/statistics/hypergeometric.py b/obitools/statistics/hypergeometric.py
new file mode 100644
index 0000000..9a9b812
--- /dev/null
+++ b/obitools/statistics/hypergeometric.py
@@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+"""
+    Statistical computation module.
+
+    The `statistics` module contains functions computing the
+    probabilities associated with the hypergeometric and cumulative
+    hypergeometric distributions, together with a correction method
+    for multiple testing.
+
+"""
+
+from decimal import *
+
+getcontext().prec = 28
+
+
+def _hyper0(N,n,r):
+    """
+    Internal function computing term 0 of the hypergeometric distribution.
+
+    The computation follows the method described in
+
+    Trong Wu, An accurate computation of the hypergeometric distribution function,
+    ACM Trans. Math. Softw. 19 (1993), no. 1, 33–43.
+
+    Parameters:
+
+    - `N` : the population size
+    - `n` : the number of marked elements
+    - `r` : the sample size
+
+    Returns a *float* giving the probability of drawing 0 marked
+    elements out of *n* in a population of size *N* when sampling
+    *r* elements.
+    """
+
+    #
+    # the numerator ranges over:
+    #        [N - r + 1 - n; N - n + 1[
+    #
+    # the denominator over:
+    #        [N - r + 1; N + 1]
+    #
+    # with X = N - r + 1
+    # and  Y = N + 1
+    #
+    #     Numerator   -> [ X - n; Y - n [
+    #     Denominator -> [ X ; Y [
+    #
+    # so this can be simplified to:
+    #
+    #     Numerator   -> [X - n; X [
+    #     Denominator -> [Y - n; Y [
+
+    numerateur  = xrange(N - r + 1 - n, N - r + 1)
+    denominateur= xrange(N + 1 - n, N + 1)
+#
+# original version
+#
+#    m = N - n
+#    numerateur = set(range(m-r+1,m+1))
+#    denominateur = set(range(N-r+1,N+1))
+#    simplification = numerateur & denominateur
+#    numerateur -= simplification
+#    denominateur -= simplification
+#    numerateur = list(numerateur)
+#    denominateur=list(denominateur)
+#    numerateur.sort()
+#    denominateur.sort()
+
+
+    p = reduce(lambda x,y:x*y,map(lambda i,j:Decimal(i)/Decimal(j),numerateur,denominateur))
+    return p
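+
+# The public functions below derive every other term from term 0 through
+# the classical recurrence (this is exactly what the code computes):
+#
+#    p(x) = p(x-1) * (n - x + 1)/x * (r - x + 1)/(N - n - r + x)
+#
+# so no factorial is ever evaluated explicitly.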
+
+
+def hypergeometric(x,N,n,r):
+    """
+    Compute term *x* of a hypergeometric distribution.
+
+    The computation follows the method described in
+
+    Trong Wu, An accurate computation of the hypergeometric distribution function,
+    ACM Trans. Math. Softw. 19 (1993), no. 1, 33–43.
+
+    Parameters:
+
+    - `x` : the expected number of marked elements
+    - `N` : the population size
+    - `n` : the number of marked elements
+    - `r` : the sample size
+
+    Returns a *float* giving the probability of drawing *x* marked
+    elements out of *n* in a population of size *N* when sampling
+    *r* elements.
+    """
+    if n < r:
+        s = n
+        n = r
+        r = s
+    assert x>=0 and x <= r,"x out of limits"
+    if x > 0 :
+        return hypergeometric(x-1,N,n,r) * (n - x + 1)/x * (r - x + 1)/(N-n-r+x)
+    else:
+        return _hyper0(N,n,r)
+
+def chypergeometric(xmin,xmax,N,n,r):
+    """
+    Compute the cumulative hypergeometric distribution between terms *xmin* and *xmax*.
+
+    The computation follows the method described in
+
+    Trong Wu, An accurate computation of the hypergeometric distribution function,
+    ACM Trans. Math. Softw. 19 (1993), no. 1, 33–43.
+
+    Parameters:
+
+    - `xmin` : the minimum expected number of marked elements
+    - `xmax` : the maximum expected number of marked elements
+    - `N` : the population size
+    - `n` : the number of marked elements
+    - `r` : the sample size
+
+    Returns a *float* giving the probability of drawing between
+    *xmin* and *xmax* marked elements out of *n* in a population
+    of size *N* when sampling *r* elements.
+    """
+    if n < r:
+        s = n
+        n = r
+        r = s
+    assert xmin>=0 and xmin <= r and xmax>=0 and xmax <= r and xmin <=xmax,"x out of limits"
+    hg = hypergeometric(xmin,N,n,r)
+    rep = hg
+    for x in xrange(xmin+1,xmax+1):
+        hg = hg * (n - x + 1)/x * (r - x + 1)/(N-n-r+x)
+        rep+=hg
+    return rep
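+
+# Minimal usage sketch (illustrative values): probability of drawing at
+# least 4 marked elements when sampling 20 from a population of 100
+# containing 10 marked elements.
+#
+#    from obitools.statistics.hypergeometric import chypergeometric
+#    pvalue = chypergeometric(4,10,100,10,20)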
+
+def multipleTest(globalPvalue,testList):
+    """
+    Correction for multiple testing.
+
+    Selects, from a set of tests, the largest subset such that the
+    global risk stays below a given p-value.
+
+    Parameters:
+
+    - `globalPvalue` : global risk accepted over the whole set of tests
+    - `testList` : an iterable over a set of tests.
+                   Each test is a list or a tuple whose last element
+                   is the p-value associated with the test
+
+    Returns a list containing the subset of tests selected from
+    `testList`
+    """
+    testList=list(testList)
+    testList.sort(lambda x,y:cmp(x[-1],y[-1]))
+    h0=1.0-globalPvalue
+    p=1.0
+    rep = []
+    for t in testList:
+        p*=1.0-t[-1]
+        if p > h0:
+            rep.append(t)
+    return rep
+
\ No newline at end of file
diff --git a/obitools/statistics/noncentralhypergeo.py b/obitools/statistics/noncentralhypergeo.py
new file mode 100644
index 0000000..e6a96ce
--- /dev/null
+++ b/obitools/statistics/noncentralhypergeo.py
@@ -0,0 +1,208 @@
+from decimal import *
+from math import log
+
+#from obitools.utils import moduleInDevelopment
+
+#moduleInDevelopment(__name__)
+
+# from : http://www.programmish.com/?p=25
+
+def dec_log(self, base=10):
+    cur_prec = getcontext().prec
+    getcontext().prec += 2
+    baseDec = Decimal(10)
+    retValue = self
+
+    if isinstance(base, Decimal):
+        baseDec = base
+    elif isinstance(base, float):
+        baseDec = Decimal("%f" % (base))
+    else:
+        baseDec = Decimal(base)
+
+    integer_part = Decimal(0)
+    while retValue < 1:
+        integer_part = integer_part - 1
+        retValue = retValue * baseDec
+    while retValue >= baseDec:
+        integer_part = integer_part + 1
+        retValue = retValue / baseDec
+
+    retValue = retValue ** 10
+    decimal_frac = Decimal(0)
+    partial_part = Decimal(1)
+    while cur_prec > 0:
+        partial_part = partial_part / Decimal(10)
+        digit = Decimal(0)
+        while retValue >= baseDec:
+            digit += 1
+            retValue = retValue / baseDec
+        decimal_frac = decimal_frac + digit * partial_part
+        retValue = retValue ** 10
+        cur_prec -= 1
+    getcontext().prec -= 2
+
+    return integer_part + decimal_frac
+
+class Interval(object):
+    def __init__(self,begin,end,facteur=1):
+        self._begin = begin
+        self._end = end
+        self._facteur=facteur
+
+    def __str__(self):
+        return '[%d,%d] ^ %d' % (self._begin,self._end,self._facteur)
+
+    def __repr__(self):
+        return 'Interval(%d,%d,%d)' % (self._begin,self._end,self._facteur)
+
+    def begin(self):
+        return (self._begin,self._facteur,True)
+
+    def end(self):
+        return (self._end,-self._facteur,False)
+
+
+def cmpb(i1,i2):
+    x= cmp(i1[0],i2[0])
+    if x==0:
+        x = cmp(i2[2],i1[2])
+    return x
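+
+# The Product class below keeps factorial ratios symbolic: a Product is a
+# list of integer Intervals raised to signed exponents ('facteur'), and
+# __mul__/__div__ only merge interval bounds. A sketch (illustrative,
+# using fact() and cnp() defined below):
+#
+#    cnp(6,2)            # fact(6)/fact(2)/fact(4), kept as intervals
+#    cnp(6,2).product()  # -> Decimal 15, evaluated after simplification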
+class Product(object):
+    def __init__(self,i=None):
+        if i is not None:
+            self.prod=[i]
+        else:
+            self.prod=[]
+        self._simplify()
+
+    def _simplify(self):
+        bornes=[]
+        prod =[]
+
+        if self.prod:
+
+            for i in self.prod:
+                bornes.append(i.begin())
+                bornes.append(i.end())
+            bornes.sort(cmpb)
+
+
+            j=0
+            r=len(bornes)
+            for i in xrange(1,len(bornes)):
+                if bornes[i][0]==bornes[j][0] and bornes[i][2]==bornes[j][2]:
+                    bornes[j]=(bornes[j][0],bornes[j][1]+bornes[i][1],bornes[i][2])
+                    r-=1
+                else:
+                    j+=1
+                    bornes[j]=bornes[i]
+
+            bornes=bornes[0:r]
+
+            facteur=0
+            close=1
+
+            for b,level,open in bornes:
+                if not open:
+                    close=0
+                else:
+                    close=1
+                if facteur:
+                    prod.append(Interval(debut,b-close,facteur))
+                debut=b+1-close
+                facteur+=level
+
+        self.prod=prod
+
+
+
+
+    def __mul__(self,p):
+        res = Product()
+        res.prod=list(self.prod)
+        res.prod.extend(p.prod)
+        res._simplify()
+        return res
+
+    def __div__(self,p):
+        np = Product()
+        np.prod = [Interval(x._begin,x._end,-x._facteur) for x in p.prod]
+        return self * np
+
+    def __str__(self):
+        return str(self.prod)
+
+    def log(self):
+        p=Decimal(0)
+        for k in self.prod:
+            p+= Decimal(k._facteur) * reduce(lambda x,y:x+dec_log(Decimal(y),Decimal(10)),xrange(k._begin,k._end+1),Decimal(0))
+        return p
+
+    def product(self):
+        p=Decimal(1)
+        for k in self.prod:
+            p*= reduce(lambda x,y:x*Decimal(y),xrange(k._begin,k._end+1),Decimal(1)) ** Decimal(k._facteur)
+        return p
+
+    def __call__(self,log=True):
+        if log:
+            return self.log()
+        else:
+            return self.product()
+
+
+def fact(n):
+    return Product(Interval(1,n))
+
+def cnp(n,p):
+    return fact(n)/fact(p)/fact(n-p)
+
+def hypergeometic(x,n,M,N):
+    '''
+
+    @param x: value of the random variable
+    @type x: int
+    @param n: sample size
+    @type n: int
+    @param M: number of marked (winning) elements
+    @type M: int
+    @param N: total number of elements in the urn
+    @type N: int
+
+    p(x)= cnp(M,x) * cnp(N-M,n-x) / cnp(N,n)
+    '''
+    return cnp(M,x) * cnp(N-M,n-x) / cnp(N,n)
+
+def nchypergeometique(x,n,M,N,r):
+    '''
+
+    @param x: value of the random variable
+    @type x: int
+    @param n: sample size
+    @type n: int
+    @param M: number of marked (winning) elements
+    @type M: int
+    @param N: total number of elements in the urn
+    @type N: int
+    @param r: odds ratio
+    @type r: float
+
+    p(x)= cnp(M,x) * cnp(N-M,n-x) / cnp(N,n)
+    '''
+
+    xmin = max(0,n-N+M)
+    xmax = min(n,M)
+    lr = dec_log(r)
+    xlr = x * lr
+    num = cnp(M,x) * cnp(N-M,n-x)
+    den = [cnp(M,y) * cnp(N-M,n-y) / num for y in xrange(xmin,xmax+1)]
+    fden = [lr * y - xlr for y in xrange(xmin,xmax+1)]
+
+    inverse=reduce(lambda x,y : x+y,
+                   map(lambda i,j: i(False) * 10**j ,den,fden))
+    return 1/inverse
+
+
\ No newline at end of file
diff --git a/obitools/svg.py b/obitools/svg.py
new file mode 100644
index 0000000..c42e3ef
--- /dev/null
+++ b/obitools/svg.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+"""\
+SVG.py - Construct/display SVG scenes.
+
+The following code is a lightweight wrapper around SVG files. The metaphor
+is to construct a scene, add objects to it, and then write it to a file
+to display it.
+
+This program uses ImageMagick to display the SVG files. ImageMagick also
+does a remarkable job of converting SVG files into other formats.
+"""
+
+import os
+display_prog = 'display' # Command to execute to display images.
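+
+# Minimal usage sketch (illustrative; mirrors the test() function at the
+# bottom of this file):
+#
+#    scene = Scene('demo')
+#    scene.add(Circle((200,200),30,(0,0,255)))
+#    scene.add(Text((50,50),"Hello SVG"))
+#    scene.write_svg()   # writes demo.svg
+#    scene.display()     # opens it with ImageMagick's 'display'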
+
+class Scene:
+    def __init__(self,name="svg",height=400,width=400):
+        self.name = name
+        self.items = []
+        self.height = height
+        self.width = width
+        return
+
+    def add(self,item): self.items.append(item)
+
+    def strarray(self):
+        var = ["<?xml version=\"1.0\"?>\n",
+               "<svg height=\"%d\" width=\"%d\" >\n" % (self.height,self.width),
+               " <g style=\"fill-opacity:1.0; stroke:black; stroke-width:1;\">\n"]
+        for item in self.items: var += item.strarray()
+        var += [" </g>\n</svg>\n"]
+        return var
+
+    def write_svg(self,filename=None):
+        if filename:
+            self.svgname = filename
+        else:
+            self.svgname = self.name + ".svg"
+        file = open(self.svgname,'w')
+        file.writelines(self.strarray())
+        file.close()
+        return
+
+    def display(self,prog=display_prog):
+        os.system("%s %s" % (prog,self.svgname))
+        return
+
+
+class Line:
+    def __init__(self,start,end):
+        self.start = start   #xy tuple
+        self.end = end       #xy tuple
+        return
+
+    def strarray(self):
+        return ["  <line x1=\"%d\" y1=\"%d\" x2=\"%d\" y2=\"%d\" />\n" %\
+                (self.start[0],self.start[1],self.end[0],self.end[1])]
+
+
+class Circle:
+    def __init__(self,center,radius,color):
+        self.center = center #xy tuple
+        self.radius = radius #radius in pixels
+        self.color = color   #rgb tuple in range(0,256)
+        return
+
+    def strarray(self):
+        return ["  <circle cx=\"%d\" cy=\"%d\" r=\"%d\"\n" %\
+                (self.center[0],self.center[1],self.radius),
+                "    style=\"fill:%s;\" />\n" % colorstr(self.color)]
+
+class Rectangle:
+    def __init__(self,origin,height,width,color):
+        self.origin = origin
+        self.height = height
+        self.width = width
+        self.color = color
+        return
+
+    def strarray(self):
+        return ["  <rect x=\"%d\" y=\"%d\" height=\"%d\"\n" %\
+                (self.origin[0],self.origin[1],self.height),
+                "    width=\"%d\" style=\"fill:%s;\" />\n" %\
+                (self.width,colorstr(self.color))]
+
+class Text:
+    def __init__(self,origin,text,size=24):
+        self.origin = origin
+        self.text = text
+        self.size = size
+        return
+
+    def strarray(self):
+        return ["  <text x=\"%d\" y=\"%d\" font-size=\"%d\">\n" %\
+                (self.origin[0],self.origin[1],self.size),
+                "   %s\n" % self.text,
+                "  </text>\n"]
+
+
+def colorstr(rgb): return "#%x%x%x" % (rgb[0]/16,rgb[1]/16,rgb[2]/16)
+
+def test():
+    scene = Scene('test')
+    scene.add(Rectangle((100,100),200,200,(0,255,255)))
+    scene.add(Line((200,200),(200,300)))
+    scene.add(Line((200,200),(300,200)))
+    scene.add(Line((200,200),(100,200)))
+    scene.add(Line((200,200),(200,100)))
+    scene.add(Circle((200,200),30,(0,0,255)))
+    scene.add(Circle((200,300),30,(0,255,0)))
+    scene.add(Circle((300,200),30,(255,0,0)))
+    scene.add(Circle((100,200),30,(255,255,0)))
+    scene.add(Circle((200,100),30,(255,0,255)))
+    scene.add(Text((50,50),"Testing SVG"))
+    scene.write_svg()
+    scene.display()
+    return
+
+if __name__ == '__main__': test()
diff --git a/obitools/table/__init__.py b/obitools/table/__init__.py
new file mode 100644
index 0000000..41e00bd
--- /dev/null
+++ b/obitools/table/__init__.py
@@ -0,0 +1,633 @@
+'''
+Simple in-memory tables: lists of rows sharing a common column model.
+'''
+
+from itertools import imap,count,chain
+from logging import info
+
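+# Minimal usage sketch (illustrative values):
+#
+#    t = Table(headers=('id','count'),types=(str,int))
+#    t.append(['seq1','3'])     # '3' is cast to int through the row factory
+#    row = t[0]
+#    print row['count']         # columns are reachable by name or by index
+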
+class Table(list):
+    """
+    Tables are lists of rows sharing the same column model
+    """
+    def __init__(self, headers=None,
+                      types=None,
+                      colcount=None,
+                      rowFactory=None,
+                      subrowFactory=None):
+        '''
+
+        @param headers: the list of column headers.
+
+                        If this parameter is C{None}, the C{colcount}
+                        parameter must be set.
+
+        @type headers: C{list}, C{tuple} or an iterable object
+
+        @param types: the list of data types associated to each column.
+
+                      If this parameter is specified its length must be
+                      equal to the C{headers} length or to C{colcount}.
+
+        @type types: C{list}, C{tuple} or an iterable object
+
+        @param colcount: number of columns in the created table.
+
+                         If the C{headers} parameter is not C{None} this
+                         parameter is ignored
+
+        @type colcount: int
+        '''
+
+        assert headers is not None or colcount is not None,\
+               'headers or colcount parameter must be a not None value'
+
+        if headers is None:
+            headers = tuple('Col_%d' % x for x in xrange(colcount))
+
+        self.headers = headers
+        self.types = types
+        self.colcount= len(self.headers)
+
+        if rowFactory is None:
+            self.rowFactory=TableRow
+        else:
+            self.rowFactory=rowFactory
+
+        if subrowFactory is None:
+            self.subrowFactory=TableRow
+        else:
+            self.subrowFactory=rowFactory
+
+
+        self.likedTo=set()
+
+
+
+    def isCompatible(self,data):
+        assert isinstance(data,(Table,TableRow))
+        return (self.colcount == data.colcount and
+                (id(self.types)==id(data.types) or
+                 self.types==data.types
+                )
+               )
+
+    def __setitem__ (self,key,value):
+        '''
+
+        @param key:
+        @type key: C{int}, C{slice} or C{str}
+        @param value:
+        @type value:
+        '''
+
+        if isinstance(key,int):
+            if not isinstance(value, TableRow):
+                value = self.rowFactory(self,value)
+            else:
+                assert self.isCompatible(value)
+            list.__setitem__(self,key,value.row)
+
+        elif isinstance(key,slice):
+            indices = xrange(key.indices(len(self)))
+            for i,d in imap(None,indices,value):
+                self[i]=d
+
+        else:
+            raise TypeError, "Key must be an int or slice value"
+
+    def __getitem__(self,key):
+        '''
+        This function behaves differently depending
+        on the data type of C{key}.
+
+        @param key: description of the table part to return
+        @type key: C{int} or C{slice}
+
+        @return: a TableRow (if key is C{int})
+                 or a subpart of the table (if key is C{slice}).
+        '''
+        if isinstance(key,int):
+            return self.rowFactory(self,
+                                   list.__getitem__(self,key))
+
+        if isinstance(key,slice):
+            newtable=Table(self.headers,self.types)
+            indices = xrange(key.indices(len(self)))
+            for i in indices:
+                list.append(newtable,list.__getitem__(self,i))
+            self.likedTo.add(newtable)
+            return newtable
+
+        raise TypeError
+
+
+    def __getslice__(self,x,y):
+        return self.__getitem__(slice(x,y))
+
+    def __iter__(self):
+        return TableIterator(self)
+
+    def __hash__(self):
+        return id(self)
+
+    def __add__(self,itable):
+        return concatTables(self,itable)
+
+    def _setTypes(self,types):
+        if types is not None and not isinstance(types,tuple):
+            types = tuple(x for x in types)
+
+        assert types is None or len(types)==len(self._headers)
+
+        self._types = types
+
+        if types is not None:
+            for row in self:
+                row.castRow()
+
+    def _getTypes(self):
+        return self._types
+
+    types = property(_getTypes,_setTypes)
+
+    def _getHeaders(self):
+        return self._headers
+
+    def _setHeaders(self,headers):
+        if not isinstance(headers, tuple):
+            headers = tuple(x for x in headers)
+
+        self._hindex = dict((k,i) for i,k in imap(None,count(),headers))
+        self._headers=headers
+        self.colcount=len(headers)
+
+    headers=property(_getHeaders,_setHeaders)
+
+    def append(self,value):
+        if not isinstance(value, TableRow):
+            value = self.rowFactory(self,value)
+        else:
+            assert self.isCompatible(value)
+        list.append(self,value.row)
+
+
+
+class _Row(list):
+    def __init__(self,data,size):
+        if data is None:
+            list.__init__(self,(None for x in xrange(size)))
+        else:
+            list.__init__(self,data)
+        assert len(self)==size, \
+               "Size of data is not correct (%d instead of %d)" % (len(self),size)
+
+    def append(self,value):
+        raise NotImplementedError, \
+              "Rows cannot change of size"
+
+    def pop(self,key=None):
+        raise NotImplementedError, \
+              "Rows cannot change of size"
+
+    def extend(self,values):
+        raise NotImplementedError, \
+              "Rows cannot change of size"
+
+
+
+class TableRow(object):
+    '''
+    A row of a L{Table}. Values can be accessed by column index
+    or by column name.
+    '''
+    def __init__(self, table,
+                       data=None,
+                 ):
+
+        self.table = table
+
+        if isinstance(data,_Row):
+            self.row=data
+        else:
+            data = self._castRow(data)
+            self.row=_Row(data,self._colcount)
+
+    def getType(self):
+        return self.table.types
+
+    def getHeaders(self):
+        return self.table.headers
+
+    def getHIndex(self):
+        return self.table._hindex
+
+    def getColCount(self):
+        return self.table.colcount
+
+    types = property(getType,None,None,
+                     "List of types associated to this row")
+    headers= property(getHeaders,None,None,
+                      "List of headers associated to this row")
+
+    _hindex= property(getHIndex,None,None)
+    _colcount = property(getColCount,None,None)
+
+    def _castValue(t,x):
+        '''
+        Cast a value to a specified type, with the exception of
+        C{None} values that are returned without cast.
+
+        @param t: the destination type
+        @type t: C{type}
+        @param x: the value to cast
+
+        @return: the casted value or C{None}
+
+        '''
+        if x is None or t is None:
+            return x
+        else:
+            return t(x)
+
+    _castValue=staticmethod(_castValue)
+
+    def _castRow(self,data):
+
+        if not isinstance(data, (list,dict)):
+            data=[x for x in data]
+
+        if isinstance(data,list):
+            assert len(data)==self._colcount, \
+                   'values has not the expected length'
+            if self.types is not None:
+                data=[TableRow._castValue(t, x)
+                      for t,x in imap(None,self.types,data)]
+
+        elif isinstance(data,dict):
+            lvalue = [None] * len(self.headers)
+
+            for k,v in data.items():
+                try:
+                    hindex = self._hindex[k]
+                    if self.types is not None:
+                        lvalue[hindex]=TableRow._castValue(self.types[hindex], v)
+                    else:
+                        lvalue[hindex]=v
+                except KeyError:
+                    info('%s is not a table column' % k)
+
+            data=lvalue
+        else:
+            raise TypeError
+
+        return data
+
+    def __getitem__(self,key):
+        '''
+
+        @param key:
+        @type key:
+        '''
+
+        if isinstance(key,(int,slice)):
+            return self.row[key]
+
+        if isinstance(key,str):
+            i = self._hindex[key]
+            return self.row[i]
+
+        raise TypeError, "Key must be an int, slice or str value"
+
+    def __setitem__(self,key,value):
+        '''
+
+        @param key:
+        @type key:
+        @param value:
+        @type value:
+        '''
+
+        if isinstance(key,str):
+            key = self._hindex[key]
+
+        if isinstance(key,int):
+            if self.types is not None:
+                value = TableRow._castValue(self.types[key], value)
+            self.row[key]=value
+
+        elif isinstance(key,slice):
+            indices = xrange(key.indices(len(self.row)))
+            for i,v in imap(None,indices,value):
+                self[i]=v
+        else:
+            raise TypeError, "Key must be an int, slice or str value"
+
+
+
+    def __iter__(self):
+        '''
+
+        '''
+        return iter(self.row)
+
+    def append(self,value):
+        raise NotImplementedError, \
+              "Rows cannot change of size"
+
+    def pop(self,key=None):
+        raise NotImplementedError, \
+              "Rows cannot change of size"
+
+    def extend(self,values):
+        raise NotImplementedError, \
+              "Rows cannot change of size"
+
+    def __len__(self):
+        return self._colcount
+
+    def __repr__(self):
+        return repr(self.row)
+
+    def __str__(self):
+        return str(self.row)
+
+    def castRow(self):
+        self.row = _Row(self._castRow(self.row),len(self.row))
+
+
+class iTableIterator(object):
+
+    def _getHeaders(self):
+        raise NotImplementedError
+
+    def _getTypes(self):
+        raise NotImplementedError
+
+    def _getRowFactory(self):
+        raise NotImplementedError
+
+    def _getSubrowFactory(self):
+        raise NotImplementedError
+
+    def _getColcount(self):
+        return len(self._getTypes())
+
+    def __iter__(self):
+        return self
+
+    headers = property(_getHeaders,None,None)
+    types = property(_getTypes,None,None)
+    rowFactory = property(_getRowFactory,None,None)
+    subrowFactory = property(_getSubrowFactory,None,None)
+    colcount = property(_getColcount,None,None)
+
+    def columnIndex(self,name):
+        if isinstance(name,str):
+            return self._reference.headers.index(name)
+        elif isinstance(name,int):
+            lh = len(self._reference.headers)
+            if name < lh and name >=0:
+                return name
+            elif name < 0 and name >= -lh:
+                return lh + name
+            raise IndexError
+        raise TypeError
+
+    def next(self):
+        raise NotImplementedError
+
+
+class TableIterator(iTableIterator):
+
+    def __init__(self,table):
+        if not isinstance(table,Table):
+            raise TypeError
+
+        self._reftable=table
+        self._i=0
+
+    def _getHeaders(self):
+        return self._reftable.headers
+
+    def _getTypes(self):
+        return self._reftable.types
+
+    def _getRowFactory(self):
+        return self._reftable.rowFactory
+
+    def _getSubrowFactory(self):
+        return self._reftable.subrowFactory
+
+    def columnIndex(self,name):
+        if isinstance(name,str):
+            return self._reftable._hindex[name]
+        elif isinstance(name,int):
+            lh = len(self._reftable._headers)
+            if name < lh and name >=0:
+                return name
+            elif name < 0 and name >= -lh:
+                return lh + name
+            raise IndexError
+        raise TypeError
+
+
+    def rewind(self):
+        self._i=0
+
+    def next(self):
+        if self._i < len(self._reftable):
+            rep=self._reftable[self._i]
+            self._i+=1
+            return rep
+        else:
+            raise StopIteration
+
+    headers = property(_getHeaders,None,None)
+    types = property(_getTypes,None,None)
+    rowFactory = 
property(_getRowFactory,None,None) + subrowFactory = property(_getSubrowFactory,None,None) + colcount = property(_getColcount,None,None) + + def columnIndex(self,name): + if isinstance(name,str): + return self._reference.headers.index(name) + elif isinstance(name,int): + lh = len(self._reference.headers) + if name < lh and name >=0: + return name + elif name < 0 and name >= -lh: + return lh - name + raise IndexError + raise TypeError + + def next(self): + raise NotImplemented + + +class TableIterator(iTableIterator): + + def __init__(self,table): + if not isinstance(table,Table): + raise TypeError + + self._reftable=table + self._i=0 + + def _getHeaders(self): + return self._reftable.headers + + def _getTypes(self): + return self._reftable.types + + def _getRowFactory(self): + return self._reftable.rowFactory + + def _getSubrowFactory(self): + return self._reftable.subrowFactory + + def columnIndex(self,name): + if isinstance(name,str): + return self._reftable._hindex[name] + elif isinstance(name,int): + lh = len(self._reftable._headers) + if name < lh and name >=0: + return name + elif name < 0 and name >= -lh: + return lh - name + raise IndexError + raise TypeError + + + def rewind(self): + i=0 + + def next(self): + if self._i < len(self._reftable): + rep=self._reftable[self._i] + self._i+=1 + return rep + else: + raise StopIteration + + headers = property(_getHeaders,None,None) + types = property(_getTypes,None,None) + rowFactory = property(_getRowFactory,None,None) + subrowFactory = property(_getSubrowFactory,None,None) + + +class ProjectionIterator(iTableIterator): + + def __init__(self,tableiterator,*cols): + self._reference = iter(tableiterator) + + assert isinstance(self._reference, iTableIterator) + + self._selected = tuple(self._reference.columnIndex(x) + for x in cols) + self._headers = tuple(self._reference.headers[x] + for x in self._selected) + + if self._reference.types is not None: + self._types= tuple(self._reference.types[x] + for x in self._selected) + else: + self._types=None + + def _getRowFactory(self): + return self._reference.subrowFactory + + def _getSubrowFactory(self): + return self._reference.subrowFactory + + def _getHeaders(self): + return self._headers + + def _getTypes(self): + return self._types + + headers = property(_getHeaders,None,None) + types = property(_getTypes,None,None) + rowFactory = property(_getRowFactory,None,None) + subrowFactory = property(_getSubrowFactory,None,None) + + def next(self): + value = self._reference.next() + value = (value[x] for x in self._selected) + return self.rowFactory(self,value) + +class SelectionIterator(iTableIterator): + def __init__(self,tableiterator,**conditions): + self._reference = iter(tableiterator) + + assert isinstance(self._reference, iTableIterator) + + self._conditions=dict((self._reference.columnIndex(i),c) + for i,c in conditions.iteritems()) + + def _checkCondition(self,row): + return reduce(lambda x,y : x and y, + (bool(self._conditions[i](row[i])) + for i in self._conditions), + True) + + def _getRowFactory(self): + return self._reference.rowFactory + + def _getSubrowFactory(self): + return self._reference.subrowFactory + + def _getHeaders(self): + return self._reference.headers + + def _getTypes(self): + return self._reference.types + + def next(self): + row = self._reference.next() + while not self._checkCondition(row): + row = self._reference.next() + return row + + + headers = property(_getHeaders,None,None) + types = property(_getTypes,None,None) + rowFactory = property(_getRowFactory,None,None) 
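+    # A usage sketch (illustrative only; it assumes an already populated
+    # Table `t` with an integer column named 'count'. SelectionIterator
+    # keeps the rows whose values satisfy every keyword condition):
+    #
+    #    keep = SelectionIterator(iter(t), count=lambda x: x > 10)
+    #    for row in keep:
+    #        print row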
+    subrowFactory = property(_getSubrowFactory,None,None)
+
+
+class UnionIterator(iTableIterator):
+    def __init__(self,*itables):
+        self._itables=[iter(x) for x in itables]
+        self._types = self._itables[0].types
+        self._headers = self._itables[0].headers
+
+        assert reduce(lambda x,y: x and y,
+                      ( isinstance(z,iTableIterator)
+                        and len(z.headers)==len(self._headers)
+                        for z in self._itables),
+                      True)
+
+        self._iterator = chain(*self._itables)
+
+    def _getRowFactory(self):
+        return self._itables[0].rowFactory
+
+    def _getSubrowFactory(self):
+        return self._itables[0].subrowFactory
+
+    def _getHeaders(self):
+        return self._headers
+
+    def _getTypes(self):
+        return self._types
+
+    def next(self):
+        value = self._iterator.next()
+        return self.rowFactory(self,value.row)
+
+    headers = property(_getHeaders,None,None)
+    types = property(_getTypes,None,None)
+    rowFactory = property(_getRowFactory,None,None)
+    subrowFactory = property(_getSubrowFactory,None,None)
+
+
+
+def tableFactory(tableiterator):
+    tableiterator = iter(tableiterator)
+    assert isinstance(tableiterator, iTableIterator)
+
+    newtable = Table(tableiterator.headers,
+                     tableiterator.types,
+                     tableiterator.rowFactory,
+                     tableiterator.subrowFactory)
+
+    for r in tableiterator:
+        newtable.append(r)
+
+    return newtable
+
+def projectTable(tableiterator,*cols):
+    return tableFactory(ProjectionIterator(tableiterator,*cols))
+
+def subTable(tableiterator,**conditions):
+    return tableFactory(SelectionIterator(tableiterator,**conditions))
+
+def concatTables(*itables):
+    '''
+    Concatenate several tables.
+
+    Concatenation is done using the L{UnionIterator}.
+
+    @type itables: iTableIterator or Table
+
+    @return: a new Table
+    @rtype: C{Table}
+
+    @see: L{UnionIterator}
+    '''
+    return tableFactory(UnionIterator(*itables))
+
+class TableIteratorAsDict(object):
+
+    def __init__(self,tableiterator):
+        self._reference = iter(tableiterator)
+
+        assert isinstance(self._reference, iTableIterator)
+
+        self._headers = self._reference.headers
+        self._types = self._reference.types
+        if self._types is not None:
+            self._types = dict((n,t)
+                               for n,t in imap(None,self._headers,self._types))
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        value = self._reference.next()
+        return dict((n,v)
+                    for n,v in imap(None,self._headers,value))
+
+    def _getHeaders(self):
+        return self._headers
+
+    def _getTypes(self):
+        return self._types
+
+    headers = property(_getHeaders,None,None)
+    types = property(_getTypes,None,None)
+
\ No newline at end of file
diff --git a/obitools/table/csv.py b/obitools/table/csv.py
new file mode 100644
index 0000000..1d9a73d
--- /dev/null
+++ b/obitools/table/csv.py
@@ -0,0 +1,52 @@
+"""
+obitools.table.csv module provides an iterator adapter
+allowing to parse csv (comma separated values) files
+"""
+
+import re
+
+def csvIterator(lineIterator,sep=','):
+    '''
+    Allows easy parsing of a csv file. This function
+    converts an iterator over the lines of a csv text
+    file into an iterator over lists of values. Each
+    list corresponds to the values present on one line.
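+
+    A minimal usage sketch (the file name is purely illustrative)::
+
+        for fields in csvIterator(open('table.csv')):
+            print fields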
+ + @param lineIterator: iterator on text lines + @type lineIterator: iterator + @param sep: string of one letter used as separator + blank charactere or " is not allowed as + separator + @type sep: string + @return: an iterator on data list + @rtype: iterator + ''' + assert len(sep)==1 and not sep.isspace() and sep!='"' + valueMatcher=re.compile('\s*((")(([^"]|"")*)"|([^%s]*?))\s*(%s|$)' % (sep,sep)) + def iterator(): + for l in lineIterator: + yield _csvParse(l,valueMatcher) + return iterator() + + +def _csvParse(line,valueMatcher): + data=[] + i = iter(valueMatcher.findall(line)) + m = i.next() + if m[0]: + while m[-1]!='': + if m[1]=='"': + data.append(m[2].replace('""','"')) + else: + data.append(m[0]) + m=i.next() + if m[1]=='"': + data.append(m[2].replace('""','"')) + else: + data.append(m[0]) + return data + + + + + \ No newline at end of file diff --git a/obitools/tagmatcher/__init__.py b/obitools/tagmatcher/__init__.py new file mode 100644 index 0000000..880ead0 --- /dev/null +++ b/obitools/tagmatcher/__init__.py @@ -0,0 +1,35 @@ +from obitools import NucSequence +from obitools.location import locationGenerator,extractExternalRefs + + + +class TagMatcherSequence(NucSequence): + ''' + Class used to represent a nucleic sequence issued mapped + on a genome by the tagMatcher software. + ''' + + def __init__(self,seq,cd,locs,dm,rm): + NucSequence.__init__(self, seq, seq) + self['locations']=locs + self['conditions']=cd + self['dm']=dm + self['rm']=rm + self['tm']=dm+rm + + def eminEmaxFilter(self,emin=None,emax=None): + result = [x for x in self['locations'] + if (emin is None or x['error'] >=emin) + and (emax is None or x['error'] <=emax)] + self['locations']=result + dm=0 + rm=0 + for x in result: + if x.isDirect(): + dm+=1 + else: + rm+=1 + self['dm']=dm + self['rm']=rm + self['tm']=dm+rm + return self diff --git a/obitools/tagmatcher/options.py b/obitools/tagmatcher/options.py new file mode 100644 index 0000000..45673ce --- /dev/null +++ b/obitools/tagmatcher/options.py @@ -0,0 +1,14 @@ +def addTagMatcherErrorOptions(optionManager): + optionManager.add_option('-E','--emax', + action='store', + metavar="<##>", + type="int",dest="emax", + default=None, + help="keep match with no more than emax errors") + + optionManager.add_option('-e','--emin', + action='store', + metavar="<##>", + type="int",dest="emin", + default=0, + help="keep match with at least emin errors") diff --git a/obitools/tagmatcher/parser.py b/obitools/tagmatcher/parser.py new file mode 100644 index 0000000..a843e66 --- /dev/null +++ b/obitools/tagmatcher/parser.py @@ -0,0 +1,89 @@ +import re +import sys + +from obitools import tagmatcher +from obitools.seqdb import nucEntryIterator +from obitools.location.feature import Feature +from obitools.location import locationGenerator + +_seqMatcher = re.compile('(?<=TG )[acgtrymkwsbdhvnACGTRYMKWSBDHVN]+') +_cdMatcher = re.compile('(?<=CD ) *([^:]+?) 
+: +([0-9]+)') +_loMatcher = re.compile('(?<=LO ) *([ACGTRYMKWSBDHVN]+) +([^ ]+) +([^ ]+) +\(([0-9]+)\)') +_dmMatcher = re.compile('(?<=DM )[0-9]+') +_rmMatcher = re.compile('(?<=RM )[0-9]+') + + +def __tagmatcherparser(text): + try: + seq = _seqMatcher.search(text).group() + cd = dict((x[0],int(x[1])) for x in _cdMatcher.findall(text)) + locs = [] + + for (match,ac,loc,err) in _loMatcher.findall(text): + feat = Feature('location', locationGenerator(loc)) + feat['error']=int(err) + feat['match']=match + feat['contig']=ac + locs.append(feat) + + dm = int(_dmMatcher.search(text).group()) + rm = int(_rmMatcher.search(text).group()) + + except AttributeError,e: + print >>sys.stderr,'=======================================================' + print >>sys.stderr,text + print >>sys.stderr,'=======================================================' + raise e + + return (seq,cd,locs,dm,rm) + +def tagMatcherParser(text): + return tagmatcher.TagMatcherSequence(*__tagmatcherparser(text)) + + +class TagMatcherIterator(object): + _cdheadparser = re.compile('condition [0-9]+ : (.+)') + + def __init__(self,file): + self._ni = nucEntryIterator(file) + self.header=self._ni.next() + self.conditions=TagMatcherIterator._cdheadparser.findall(self.header) + + def next(self): + return tagMatcherParser(self._ni.next()) + + def __iter__(self): + return self + +def formatTagMatcher(tmseq,reader=None): + if isinstance(tmseq, TagMatcherIterator): + return tmseq.header + + assert isinstance(tmseq,tagmatcher.TagMatcherSequence),'Only TagMatcherSequence can be used' + lo = '\n'.join(['LO %s %s %s (%d)' % (l['match'],l['contig'],l.locStr(),l['error']) + for l in tmseq['locations']]) + if reader is not None: + cd = '\n'.join(['CD %s : %d' % (x,tmseq['conditions'][x]) + for x in reader.conditions]) + else: + cd = '\n'.join(['CD %s : %d' % (x,tmseq['conditions'][x]) + for x in tmseq['conditions']]) + + tg = 'TG %s' % str(tmseq) + + e=[tg] + if cd: + e.append(cd) + if lo: + e.append(lo) + + tm = 'TM %d' % tmseq['tm'] + dm = 'DM %d' % tmseq['dm'] + rm = 'RM %d' % tmseq['rm'] + + e.extend((tm,dm,rm,'//')) + + return '\n'.join(e) + + + diff --git a/obitools/thermo/__init__.py b/obitools/thermo/__init__.py new file mode 100644 index 0000000..492dbb9 --- /dev/null +++ b/obitools/thermo/__init__.py @@ -0,0 +1,597 @@ +from math import log +from array import array +from copy import deepcopy + +bpencoder={'A':1,'C':2,'G':3,'T':4, + 'a':1,'c':2,'g':3,'t':4, + '-':0 + } + +rvencoder={'A':4,'C':3,'G':2,'T':1, + 'a':4,'c':3,'g':2,'t':1, + '-':0 + } + +R = 1.987 +SALT_METHOD_SANTALUCIA = 1 +SALT_METHOD_OWCZARZY = 2 +DEF_CONC_PRIMERS = 8.e-7 +DEF_CONC_SEQUENCES = 0. +DEF_SALT = 0.05 +forbidden_entropy = 0. 
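+# Usage sketch (the sequences below are illustrative only; melting
+# temperatures are returned in degrees Celsius by the functions
+# defined at the end of this module):
+#
+#    from obitools.thermo import calcSelfTM, calcTMTwoSeq
+#    tm  = calcSelfTM('ACGTACGTACGTACGT')
+#    tm2 = calcTMTwoSeq('ACGTACGTACGTACGT',
+#                       'ACGTACGTACGTACGT')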
+forbidden_enthalpy = 1.e18 + +__dH = [[[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]], + [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]], + [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]], + [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]], + [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + 
[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]], + [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]] + ] +__dS = [[[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]], + [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]], + [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]], + [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + 
[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]], + [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]], + [[[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]], + [[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.],[0.,0.,0.,0.,0.,0.]]] + ] + +def initParams(c1, c2, kp, sm,nparm={}): + global forbidden_entropy + global dH,dS + + dH=deepcopy(__dH) + dS=deepcopy(__dS) + + nparm['Ct1'] = c1; + nparm['Ct2'] = c2; + nparm['kplus'] = kp; + maxCT = 1; + + if(nparm['Ct2'] > nparm['Ct1']): + maxCT = 2 + + if(nparm['Ct1'] == nparm['Ct2']): + ctFactor = nparm['Ct1']/2 + elif (maxCT == 1): + ctFactor = nparm['Ct1']-nparm['Ct2']/2 + else: + ctFactor = nparm['Ct2']-nparm['Ct1']/2 + + nparm['rlogc'] = R * log(ctFactor) + forbidden_entropy = nparm['rlogc'] + nparm['kfac'] = 0.368 * log(nparm['kplus']) + nparm['saltMethod'] = sm + + + # Set all X-/Y-, -X/Y- and X-/-Y so, that TM will be VERY small! + for x in xrange(1,5): + for y in xrange(1,5): + dH[0][x][y][0]=forbidden_enthalpy; + dS[0][x][y][0]=forbidden_entropy; + dH[x][0][0][y]=forbidden_enthalpy; + dS[x][0][0][y]=forbidden_entropy; + dH[x][0][y][0]=forbidden_enthalpy; + dS[x][0][y][0]=forbidden_entropy; + # forbid X-/Y$ and X$/Y- etc., i.e. terminal must not be paired with gap! + dH[x][5][y][0]=forbidden_enthalpy; + dS[x][5][y][0]=forbidden_entropy; + dH[x][0][y][5]=forbidden_enthalpy; + dS[x][0][y][5]=forbidden_entropy; + dH[5][x][0][y]=forbidden_enthalpy; + dS[5][x][0][y]=forbidden_entropy; + dH[0][x][5][y]=forbidden_enthalpy; + dS[0][x][5][y]=forbidden_entropy; + + #forbid X$/-Y etc. 
+ dH[x][5][0][y]=forbidden_enthalpy; + dS[x][5][0][y]=forbidden_entropy; + dH[x][0][5][y]=forbidden_enthalpy; + dS[x][0][5][y]=forbidden_entropy; + dH[5][x][y][0]=forbidden_enthalpy; + dS[5][x][y][0]=forbidden_entropy; + dH[0][x][y][5]=forbidden_enthalpy; + dS[0][x][y][5]=forbidden_entropy; + + + + #also, forbid x-/-- and --/x-, i.e. no two inner gaps paired + dH[x][0][0][0]=forbidden_enthalpy; + dS[x][0][0][0]=forbidden_entropy; + dH[0][0][x][0]=forbidden_enthalpy; + dS[0][0][x][0]=forbidden_entropy; + # x-/-$ + dH[x][0][0][5]=forbidden_enthalpy; + dS[x][0][0][5]=forbidden_entropy; + dH[5][0][0][x]=forbidden_enthalpy; + dS[5][0][0][x]=forbidden_entropy; + dH[0][5][x][0]=forbidden_enthalpy; + dS[x][0][0][5]=forbidden_entropy; + dH[0][x][5][0]=forbidden_enthalpy; + dS[0][x][5][0]=forbidden_entropy; + + # forbid --/-- + dH[0][0][0][0]=forbidden_enthalpy; + dS[0][0][0][0]=forbidden_entropy; + + dH[5][0][0][0]=forbidden_enthalpy; + dS[5][0][0][0]=forbidden_entropy; + dH[0][0][5][0]=forbidden_enthalpy; + dS[0][0][5][0]=forbidden_entropy; + dH[0][5][5][0]=forbidden_enthalpy; + dS[0][5][5][0]=forbidden_entropy; + + # Interior loops (double Mismatches) + iloop_entropy=-0.97 + iloop_enthalpy=0.0 + + for x in xrange(1,5): + for y in xrange(1,5): + for a in xrange(1,5): + for b in xrange(1,5): + # AT and CG pair, and as A=1, C=2, G=3, T=4 this means + # we have Watson-Crick pairs if (x+a==5) and (y+b)==5. + if ( not ((x+a==5) or (y+b==5))): + # No watson-crick-pair, i.e. double mismatch! + # set enthalpy/entropy to loop expansion! + dH[x][y][a][b] = iloop_enthalpy; + dS[x][y][a][b] = iloop_entropy; + + + # xy/-- and --/xy (Bulge Loops of size > 1) + bloop_entropy=-1.3 + bloop_enthalpy=0.0 + + for x in xrange(1,5): + for y in xrange(1,5): + dH[x][y][0][0] = bloop_enthalpy; + dS[x][y][0][0] = bloop_entropy; + dH[0][0][x][y] = bloop_enthalpy; + dS[0][0][x][y] = bloop_entropy; + + + # x-/ya abd xa/y- as well as -x/ay and ax/-y + # bulge opening and closing parameters with + # adjacent matches / mismatches + # obulge_mism and cbulge_mism chosen so high to avoid + # AAAAAAAAA + # T--G----T + # being better than + # AAAAAAAAA + # TG------T + obulge_match_H =-2.66e3 + obulge_match_S =-14.22 + cbulge_match_H =-2.66e3 + cbulge_match_S =-14.22 + obulge_mism_H = 0.0 + obulge_mism_S = -6.45 + cbulge_mism_H = 0.0 + cbulge_mism_S =-6.45 + + for x in xrange(1,5): + for y in xrange(1,5): + for a in xrange(1,5): + if (x+y==5): # other base pair matches! + + dH[x][0][y][a]=obulge_match_H; # bulge opening + dS[x][0][y][a]=obulge_match_S; + dH[x][a][y][0]=obulge_match_H; + dS[x][a][y][0]=obulge_match_S; + dH[0][x][a][y]=cbulge_match_H; # bulge closing + dS[0][x][a][y]=cbulge_match_S; + dH[a][x][0][y]=cbulge_match_H; + dS[a][x][0][y]=cbulge_match_S; + else: + # mismatch in other base pair! + dH[x][0][y][a]=obulge_mism_H; # bulge opening + dS[x][0][y][a]=obulge_mism_S; + dH[x][a][y][0]=obulge_mism_H; + dS[x][a][y][0]=obulge_mism_S; + dH[0][x][a][y]=cbulge_mism_H; # bulge closing + dS[0][x][a][y]=cbulge_mism_S; + dH[a][x][0][y]=cbulge_mism_H; + dS[a][x][0][y]=cbulge_mism_S; + + + + # Watson-Crick pairs (note that only ten are unique, as obviously + # 5'-AG-3'/3'-TC-5' = 5'-CT-3'/3'-GA-5' etc. 
+ dH[1][1][4][4]=-7.6e3; dS[1][1][4][4]=-21.3 # AA/TT 04 + dH[1][2][4][3]=-8.4e3; dS[1][2][4][3]=-22.4 # AC/TG adapted GT/CA + dH[1][3][4][2]=-7.8e3; dS[1][3][4][2]=-21.0 # AG/TC adapted CT/GA + dH[1][4][4][1]=-7.2e3; dS[1][4][4][1]=-20.4 # AT/TA 04 + dH[2][1][3][4]=-8.5e3; dS[2][1][3][4]=-22.7 # CA/GT 04 + dH[2][2][3][3]=-8.0e3; dS[2][2][3][3]=-19.9 # CC/GG adapted GG/CC + dH[2][3][3][2]=-10.6e3; dS[2][3][3][2]=-27.2 # CG/GC 04 + dH[2][4][3][1]=-7.8e3; dS[2][4][3][1]=-21.0 # CT/GA 04 + dH[3][1][2][4]=-8.2e3; dS[3][1][2][4]=-22.2 # GA/CT 04 + dH[3][2][2][3]=-9.8e3; dS[3][2][2][3]=-24.4 # GC/CG 04 + dH[3][3][2][2]=-8.0e3; dS[3][3][2][2]=-19.9 # GG/CC 04 + dH[3][4][2][1]=-8.4e3; dS[3][4][2][1]=-22.4 # GT/CA 04 + dH[4][1][1][4]=-7.2e3; dS[4][1][1][4]=-21.3 # TA/AT 04 + dH[4][2][1][3]=-8.2e3; dS[4][2][1][3]=-22.2 # TC/AG adapted GA/CT + dH[4][3][1][2]=-8.5e3; dS[4][3][1][2]=-22.7 # TG/AC adapted CA/GT + dH[4][4][1][1]=-7.6e3; dS[4][4][1][1]=-21.3 # TT/AA adapted AA/TT + + # A-C Mismatches (Values for pH 7.0) + dH[1][1][2][4]=7.6e3; dS[1][1][2][4]=20.2 # AA/CT + dH[1][1][4][2]=2.3e3; dS[1][1][4][2]=4.6 # AA/TC + dH[1][2][2][3]=-0.7e3; dS[1][2][2][3]=-3.8 # AC/CG + dH[1][2][4][1]=5.3e3; dS[1][2][4][1]=14.6 # AC/TA + dH[1][3][2][2]=0.6e3; dS[1][3][2][2]=-0.6 # AG/CC + dH[1][4][2][1]=5.3e3; dS[1][4][2][1]=14.6 # AT/CA + dH[2][1][1][4]=3.4e3; dS[2][1][1][4]=8.0 # CA/AT + dH[2][1][3][2]=1.9e3; dS[2][1][3][2]=3.7 # CA/GC + dH[2][2][1][3]=5.2e3; dS[2][2][1][3]=14.2 # CC/AG + dH[2][2][3][1]=0.6e3; dS[2][2][3][1]=-0.6 # CC/GA + dH[2][3][1][2]=1.9e3; dS[2][3][1][2]=3.7 # CG/AC + dH[2][4][1][1]=2.3e3; dS[2][4][1][1]=4.6 # CT/AA + dH[3][1][2][2]=5.2e3; dS[3][1][2][2]=14.2 # GA/CC + dH[3][2][2][1]=-0.7e3; dS[3][2][2][1]=-3.8 # GC/CA + dH[4][1][1][2]=3.4e3; dS[4][1][1][2]=8.0 # TA/AC + dH[4][2][1][1]=7.6e3; dS[4][2][1][1]=20.2 # TC/AA + + # C-T Mismatches + dH[1][2][4][4]=0.7e3; dS[1][2][4][4]=0.2 # AC/TT + dH[1][4][4][2]=-1.2e3; dS[1][4][4][2]=-6.2 # AT/TC + dH[2][1][4][4]=1.0e3; dS[2][1][4][4]=0.7 # CA/TT + dH[2][2][3][4]=-0.8e3; dS[2][2][3][4]=-4.5 # CC/GT + dH[2][2][4][3]=5.2e3; dS[2][2][4][3]=13.5 # CC/TG + dH[2][3][4][2]=-1.5e3; dS[2][3][4][2]=-6.1 # CG/TC + dH[2][4][3][2]=-1.5e3; dS[2][4][3][2]=-6.1 # CT/GC + dH[2][4][4][1]=-1.2e3; dS[2][4][4][1]=-6.2 # CT/TA + dH[3][2][2][4]=2.3e3; dS[3][2][2][4]=5.4 # GC/CT + dH[3][4][2][2]=5.2e3; dS[3][4][2][2]=13.5 # GT/CC + dH[4][1][2][4]=1.2e3; dS[4][1][2][4]=0.7 # TA/CT + dH[4][2][2][3]=2.3e3; dS[4][2][2][3]=5.4 # TC/CG + dH[4][2][1][4]=1.2e3; dS[4][2][1][4]=0.7 # TC/AT + dH[4][3][2][2]=-0.8e3; dS[4][3][2][2]=-4.5 # TG/CC + dH[4][4][2][1]=0.7e3; dS[4][4][2][1]=0.2 # TT/CA + dH[4][4][1][2]=1.0e3; dS[4][4][1][2]=0.7 # TT/AC + + # G-A Mismatches + dH[1][1][3][4]=3.0e3; dS[1][1][3][4]=7.4 # AA/GT + dH[1][1][4][3]=-0.6e3; dS[1][1][4][3]=-2.3 # AA/TG + dH[1][2][3][3]=0.5e3; dS[1][2][3][3]=3.2 # AC/GG + dH[1][3][3][2]=-4.0e3; dS[1][3][3][2]=-13.2 # AG/GC + dH[1][3][4][1]=-0.7e3; dS[1][3][4][1]=-2.3 # AG/TA + dH[1][4][3][1]=-0.7e3; dS[1][4][3][1]=-2.3 # AT/GA + dH[2][1][3][3]=-0.7e3; dS[2][1][3][3]=-2.3 # CA/GG + dH[2][3][3][1]=-4.0e3; dS[2][3][3][1]=-13.2 # CG/GA + dH[3][1][1][4]=0.7e3; dS[3][1][1][4]=0.7 # GA/AT + dH[3][1][2][3]=-0.6e3; dS[3][1][2][3]=-1.0 # GA/CG + dH[3][2][1][3]=-0.6e3; dS[3][2][1][3]=-1.0 # GC/AG + dH[3][3][1][2]=-0.7e3; dS[3][3][1][2]=-2.3 # GG/AC + dH[3][3][2][1]=0.5e3; dS[3][3][2][1]=3.2 # GG/CA + dH[3][4][1][1]=-0.6e3; dS[3][4][1][1]=-2.3 # GT/AA + dH[4][1][1][3]=0.7e3; dS[4][1][1][3]=0.7 # TA/AG + dH[4][3][1][1]=3.0e3; dS[4][3][1][1]=7.4 # 
TG/AA + + # G-T Mismatches + dH[1][3][4][4]=1.0e3; dS[1][3][4][4]=0.9 # AG/TT + dH[1][4][4][3]=-2.5e3; dS[1][4][4][3]=-8.3 # AT/TG + dH[2][3][3][4]=-4.1e3; dS[2][3][3][4]=-11.7 # CG/GT + dH[2][4][3][3]=-2.8e3; dS[2][4][3][3]=-8.0 # CT/GG + dH[3][1][4][4]=-1.3e3; dS[3][1][4][4]=-5.3 # GA/TT + dH[3][2][4][3]=-4.4e3; dS[3][2][4][3]=-12.3 # GC/TG + dH[3][3][2][4]=3.3e3; dS[3][3][2][4]=10.4 # GG/CT + dH[3][3][4][2]=-2.8e3; dS[3][3][4][2]=-8.0 # GG/TC +# dH[3][3][4][4]=5.8e3; dS[3][3][4][4]=16.3 # GG/TT + dH[3][4][2][3]=-4.4e3; dS[3][4][2][3]=-12.3 # GT/CG + dH[3][4][4][1]=-2.5e3; dS[3][4][4][1]=-8.3 # GT/TA +# dH[3][4][4][3]=4.1e3; dS[3][4][4][3]=9.5 # GT/TG + dH[4][1][3][4]=-0.1e3; dS[4][1][3][4]=-1.7 # TA/GT + dH[4][2][3][3]=3.3e3; dS[4][2][3][3]=10.4 # TC/GG + dH[4][3][1][4]=-0.1e3; dS[4][3][1][4]=-1.7 # TG/AT + dH[4][3][3][2]=-4.1e3; dS[4][3][3][2]=-11.7 # TG/GC +# dH[4][3][3][4]=-1.4e3; dS[4][3][3][4]=-6.2 # TG/GT + dH[4][4][1][3]=-1.3e3; dS[4][4][1][3]=-5.3 # TT/AG + dH[4][4][3][1]=1.0e3; dS[4][4][3][1]=0.9 # TT/GA +# dH[4][4][3][3]=5.8e3; dS[4][4][3][3]=16.3 # TT/GG + + # A-A Mismatches + dH[1][1][1][4]=4.7e3; dS[1][1][1][4]=12.9 # AA/AT + dH[1][1][4][1]=1.2e3; dS[1][1][4][1]=1.7 # AA/TA + dH[1][2][1][3]=-2.9e3; dS[1][2][1][3]=-9.8 # AC/AG + dH[1][3][1][2]=-0.9e3; dS[1][3][1][2]=-4.2 # AG/AC + dH[1][4][1][1]=1.2e3; dS[1][4][1][1]=1.7 # AT/AA + dH[2][1][3][1]=-0.9e3; dS[2][1][3][1]=-4.2 # CA/GA + dH[3][1][2][1]=-2.9e3; dS[3][1][2][1]=-9.8 # GA/CA + dH[4][1][1][1]=4.7e3; dS[4][1][1][1]=12.9 # TA/AA + + # C-C Mismatches + dH[1][2][4][2]=0.0e3; dS[1][2][4][2]=-4.4 # AC/TC + dH[2][1][2][4]=6.1e3; dS[2][1][2][4]=16.4 # CA/CT + dH[2][2][2][3]=3.6e3; dS[2][2][2][3]=8.9 # CC/CG + dH[2][2][3][2]=-1.5e3; dS[2][2][3][2]=-7.2 # CC/GC + dH[2][3][2][2]=-1.5e3; dS[2][3][2][2]=-7.2 # CG/CC + dH[2][4][2][1]=0.0e3; dS[2][4][2][1]=-4.4 # CT/CA + dH[3][2][2][2]=3.6e3; dS[3][2][2][2]=8.9 # GC/CC + dH[4][2][1][2]=6.1e3; dS[4][2][1][2]=16.4 # TC/AC + + # G-G Mismatches + dH[1][3][4][3]=-3.1e3; dS[1][3][4][3]=-9.5 # AG/TG + dH[2][3][3][3]=-4.9e3; dS[2][3][3][3]=-15.3 # CG/GG + dH[3][1][3][4]=1.6e3; dS[3][1][3][4]=3.6 # GA/GT + dH[3][2][3][3]=-6.0e3; dS[3][2][3][3]=-15.8 # GC/GG + dH[3][3][2][3]=-6.0e3; dS[3][3][2][3]=-15.8 # GG/CG + dH[3][3][3][2]=-4.9e3; dS[3][3][3][2]=-15.3 # GG/GC + dH[3][4][3][1]=-3.1e3; dS[3][4][3][1]=-9.5 # GT/GA + dH[4][3][1][3]=1.6e3; dS[4][3][1][3]=3.6 # TG/AG + + # T-T Mismatches + dH[1][4][4][4]=-2.7e3; dS[1][4][4][4]=-10.8 # AT/TT + dH[2][4][3][4]=-5.0e3; dS[2][4][3][4]=-15.8 # CT/GT + dH[3][4][2][4]=-2.2e3; dS[3][4][2][4]=-8.4 # GT/CT + dH[4][1][4][4]=0.2e3; dS[4][1][4][4]=-1.5 # TA/TT + dH[4][2][4][3]=-2.2e3; dS[4][2][4][3]=-8.4 # TC/TG + dH[4][3][4][2]=-5.0e3; dS[4][3][4][2]=-15.8 # TG/TC + dH[4][4][1][4]=0.2e3; dS[4][4][1][4]=-1.5 # TT/AT + dH[4][4][4][1]=-2.7e3; dS[4][4][4][1]=-10.8 # TT/TA + + # Dangling Eds + dH[5][1][1][4]=-0.7e3; dS[5][1][1][4]=-0.8 # $A/AT + dH[5][1][2][4]=4.4e3; dS[5][1][2][4]=14.9 # $A/CT + dH[5][1][3][4]=-1.6e3; dS[5][1][3][4]=-3.6 # $A/GT + dH[5][1][4][4]=2.9e3; dS[5][1][4][4]=10.4 # $A/TT + dH[5][2][1][3]=-2.1e3; dS[5][2][1][3]=-3.9 # $C/AG + dH[5][2][2][3]=-0.2e3; dS[5][2][2][3]=-0.1 # $C/CG + dH[5][2][3][3]=-3.9e3; dS[5][2][3][3]=-11.2 # $C/GG + dH[5][2][4][3]=-4.4e3; dS[5][2][4][3]=-13.1 # $C/TG + dH[5][3][1][2]=-5.9e3; dS[5][3][1][2]=-16.5 # $G/AC + dH[5][3][2][2]=-2.6e3; dS[5][3][2][2]=-7.4 # $G/CC + dH[5][3][3][2]=-3.2e3; dS[5][3][3][2]=-10.4 # $G/GC + dH[5][3][4][2]=-5.2e3; dS[5][3][4][2]=-15.0 # $G/TC + dH[5][4][1][1]=-0.5e3; 
dS[5][4][1][1]=-1.1 # $T/AA + dH[5][4][2][1]=4.7e3; dS[5][4][2][1]=14.2 # $T/CA + dH[5][4][3][1]=-4.1e3; dS[5][4][3][1]=-13.1 # $T/GA + dH[5][4][4][1]=-3.8e3; dS[5][4][4][1]=-12.6 # $T/TA + dH[1][5][4][1]=-2.9e3; dS[1][5][4][1]=-7.6 # A$/TA + dH[1][5][4][2]=-4.1e3; dS[1][5][4][2]=-13.0 # A$/TC + dH[1][5][4][3]=-4.2e3; dS[1][5][4][3]=-15.0 # A$/TG + dH[1][5][4][4]=-0.2e3; dS[1][5][4][4]=-0.5 # A$/TT + dH[1][1][5][4]=0.2e3; dS[1][1][5][4]=2.3 # AA/$T + dH[1][1][4][5]=-0.5e3; dS[1][1][4][5]=-1.1 # AA/T$ + dH[1][2][5][3]=-6.3e3; dS[1][2][5][3]=-17.1 # AC/$G + dH[1][2][4][5]=4.7e3; dS[1][2][4][5]=14.2 # AC/T$ + dH[1][3][5][2]=-3.7e3; dS[1][3][5][2]=-10.0 # AG/$C + dH[1][3][4][5]=-4.1e3; dS[1][3][4][5]=-13.1 # AG/T$ + dH[1][4][5][1]=-2.9e3; dS[1][4][5][1]=-7.6 # AT/$A + dH[1][4][4][5]=-3.8e3; dS[1][4][4][5]=-12.6 # AT/T$ + dH[2][5][3][1]=-3.7e3; dS[2][5][3][1]=-10.0 # C$/GA + dH[2][5][3][2]=-4.0e3; dS[2][5][3][2]=-11.9 # C$/GC + dH[2][5][3][3]=-3.9e3; dS[2][5][3][3]=-10.9 # C$/GG + dH[2][5][3][4]=-4.9e3; dS[2][5][3][4]=-13.8 # C$/GT + dH[2][1][5][4]=0.6e3; dS[2][1][5][4]=3.3 # CA/$T + dH[2][1][3][5]=-5.9e3; dS[2][1][3][5]=-16.5 # CA/G$ + dH[2][2][5][3]=-4.4e3; dS[2][2][5][3]=-12.6 # CC/$G + dH[2][2][3][5]=-2.6e3; dS[2][2][3][5]=-7.4 # CC/G$ + dH[2][3][5][2]=-4.0e3; dS[2][3][5][2]=-11.9 # CG/$C + dH[2][3][3][5]=-3.2e3; dS[2][3][3][5]=-10.4 # CG/G$ + dH[2][4][5][1]=-4.1e3; dS[2][4][5][1]=-13.0 # CT/$A + dH[2][4][3][5]=-5.2e3; dS[2][4][3][5]=-15.0 # CT/G$ + dH[3][5][2][1]=-6.3e3; dS[3][5][2][1]=-17.1 # G$/CA + dH[3][5][2][2]=-4.4e3; dS[3][5][2][2]=-12.6 # G$/CC + dH[3][5][2][3]=-5.1e3; dS[3][5][2][3]=-14.0 # G$/CG + dH[3][5][2][4]=-4.0e3; dS[3][5][2][4]=-10.9 # G$/CT + dH[3][1][5][4]=-1.1e3; dS[3][1][5][4]=-1.6 # GA/$T + dH[3][1][2][5]=-2.1e3; dS[3][1][2][5]=-3.9 # GA/C$ + dH[3][2][5][3]=-5.1e3; dS[3][2][5][3]=-14.0 # GC/$G + dH[3][2][2][5]=-0.2e3; dS[3][2][2][5]=-0.1 # GC/C$ + dH[3][3][5][2]=-3.9e3; dS[3][3][5][2]=-10.9 # GG/$C + dH[3][3][2][5]=-3.9e3; dS[3][3][2][5]=-11.2 # GG/C$ + dH[3][4][5][1]=-4.2e3; dS[3][4][5][1]=-15.0 # GT/$A + dH[3][4][2][5]=-4.4e3; dS[3][4][2][5]=-13.1 # GT/C$ + dH[4][5][1][1]=0.2e3; dS[4][5][1][1]=2.3 # T$/AA + dH[4][5][1][2]=0.6e3; dS[4][5][1][2]=3.3 # T$/AC + dH[4][5][1][3]=-1.1e3; dS[4][5][1][3]=-1.6 # T$/AG + dH[4][5][1][4]=-6.9e3; dS[4][5][1][4]=-20.0 # T$/AT + dH[4][1][5][4]=-6.9e3; dS[4][1][5][4]=-20.0 # TA/$T + dH[4][1][1][5]=-0.7e3; dS[4][1][1][5]=-0.7 # TA/A$ + dH[4][2][5][3]=-4.0e3; dS[4][2][5][3]=-10.9 # TC/$G + dH[4][2][1][5]=4.4e3; dS[4][2][1][5]=14.9 # TC/A$ + dH[4][3][5][2]=-4.9e3; dS[4][3][5][2]=-13.8 # TG/$C + dH[4][3][1][5]=-1.6e3; dS[4][3][1][5]=-3.6 # TG/A$ + dH[4][4][5][1]=-0.2e3; dS[4][4][5][1]=-0.5 # TT/$A + dH[4][4][1][5]=2.9e3; dS[4][4][1][5]=10.4 # TT/A$ + + + nparm['dH']=dH + nparm['dS']=dS + + return nparm + + +defaultParm=initParams(DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,DEF_SALT, SALT_METHOD_SANTALUCIA) + +def seqencoder(seq): + return [bpencoder[x] for x in seq] + +def getInitialEntropy(nparm=defaultParm): + return -5.9+nparm['rlogc'] + +def getEnthalpy(x0, x1, y0, y1,nparm=defaultParm): + return nparm['dH'][x0][x1][y0][y1] + +def GetEntropy(x0, x1, y0, y1,nparm=defaultParm): + + nx0=x0 + nx1=x1 + ny0=y0 + ny1=y1 + dH=nparm['dH'] + dS=nparm['dS'] + answer = dS[nx0][nx1][ny0][ny1] + + if (nparm['saltMethod'] == SALT_METHOD_SANTALUCIA): + if(nx0!=5 and 1<= nx1 and nx1<=4): + answer += 0.5*nparm['kfac'] + + if(ny1!=5 and 1<= ny0 and ny0<=4): + answer += 0.5*nparm['kfac'] + + if (nparm['saltMethod'] == SALT_METHOD_OWCZARZY): + logk = 
log(nparm['kplus']); + answer += dH[nx0][nx1][ny0][ny1]*((4.29 * nparm['gcContent']-3.95)* 1e-5 * logk + 0.0000094*logk**2); + + return answer; + +def CalcTM(entropy,enthalpy): + tm = 0 + if (enthalpy>=forbidden_enthalpy) : + return 0; + + if (entropy<0) : + tm = enthalpy/entropy + if (tm<0): + return 0; + + return tm; + + + + +def countGCContent(seq): + count = 0; + for k in seq : + if k in 'cgGC': + count+=1; + return count; + + +#def cleanSeq (inseq,outseq,length): +# +# seqlen = len(inseq) +# if (len != 0) +# seqlen = length; +# +# j=0 +# for i in xrange(seqlen): +# { +# switch (inseq[i]) +# { +# case 'a': +# case '\0': +# case 'A': +# outseq[j++] = 'A'; break; +# case 'c': +# case '\1': +# case 'C': +# outseq[j++] = 'C'; break; +# case 'g': +# case '\2': +# case 'G': +# outseq[j++] = 'G'; break; +# case 't': +# case '\3': +# case 'T': +# outseq[j++] = 'T'; break; +# } +# } +# outseq[j] = '\0'; +#} + +def calcSelfTM(seq,nparm=defaultParm): + dH=nparm['dH'] + dS=nparm['dS'] + length=len(seq) + + thedH = 0; + thedS = -5.9+nparm['rlogc'] + for i in xrange(1,length): + c1 = rvencoder[seq[i-1]]; + c2 = rvencoder[seq[i]]; + c3 = bpencoder[seq[i-1]]; + c4 = bpencoder[seq[i]]; + + thedH += dH[c3][c4][c1][c2]; + thedS += GetEntropy(c3, c4, c1, c2, nparm) + + mtemp = CalcTM(thedS,thedH); +# print thedH,thedS,nparm['rlogc'] + return mtemp-273.15; + + +def calcTMTwoSeq(seq1,seq2,nparm=defaultParm): + + thedH = 0; + thedS = -5.9+nparm['rlogc'] + dH=nparm['dH'] + dS=nparm['dS'] + length=len(seq1) + + for i in xrange(1,length): + c1 = rvencoder[seq2[i-1]] + c2 = rvencoder[seq2[i]] + c3 = bpencoder[seq1[i-1]] + c4 = bpencoder[seq1[i]] + + thedH += dH[c3][c4][c1][c2] + thedS += GetEntropy(c3, c4, c1, c2, nparm) + + mtemp = CalcTM(thedS,thedH); +# print thedH,thedS,nparm['rlogc'] + + return mtemp-273.15; + + diff --git a/obitools/tools/__init__.py b/obitools/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/obitools/tools/_solexapairend.so b/obitools/tools/_solexapairend.so new file mode 100755 index 0000000..2d9e075 Binary files /dev/null and b/obitools/tools/_solexapairend.so differ diff --git a/obitools/tools/solexapairend.py b/obitools/tools/solexapairend.py new file mode 100644 index 0000000..609f533 --- /dev/null +++ b/obitools/tools/solexapairend.py @@ -0,0 +1,51 @@ +''' +Created on 17 mai 2010 + +@author: coissac +''' + +from obitools.alignment import columnIterator + + +def iterOnAligment(ali): + pos0=0 + pos1=len(ali[1].wrapped)-1 + begin0=False + end0=False + begin1=False + end1=False + for nuc0,nuc1 in columnIterator(ali): + if nuc0=='-': + if begin0: + if not end0: + score0 = ( ali[0].wrapped.quality[pos0-1] + +ali[0].wrapped.quality[pos0] + )/2 + else: + score0 = 1. + else: + score0 = 0. + else: + begin0=True + score0 = ali[0].wrapped.quality[pos0] + pos0+=1 + end0= pos0==len(ali[0].wrapped) + + if nuc1=='-': + if begin1: + if not end1: + score1 = ( ali[1].wrapped.wrapped.quality[pos1] + +ali[1].wrapped.wrapped.quality[pos1+1] + )/2 + else: + score1 = 0. + else: + score1 = 1. 
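+            # The boundary scores for this second read (1. before the
+            # first aligned base, 0. past the last one) mirror those of
+            # read 0, since pos1 starts at the last position of the
+            # read and is decremented as the alignment is traversed.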
+ else: + begin1=True + score1 = ali[1].wrapped.wrapped.quality[pos1] + pos1-=1 + end1=pos1<0 + + result = (nuc0,score0,nuc1,score1) + yield result diff --git a/obitools/tree/__init__.py b/obitools/tree/__init__.py new file mode 100644 index 0000000..facb5ff --- /dev/null +++ b/obitools/tree/__init__.py @@ -0,0 +1,116 @@ +import re + + +class Tree(set): + def registerNode(self,node): + assert isinstance(node, TreeNode) + self.add(node) + + def childNodeIterator(self,node): + assert isinstance(node, TreeNode) + return (x for x in self if x._parent==node) + + def subTreeSize(self,node): + n=1 + for subnode in self.childNodeIterator(node): + n+=self.subTreeSize(subnode) + return n + + def getRoot(self): + roots = [x for x in self if x._parent is None] + assert len(roots)==1,'Tree cannot have several root node' + return roots[0] + + def ancestorNodeIterator(self,node): + assert isinstance(node, TreeNode) + while node._parent is not None: + yield node + node=node._parent + yield node + + def terminalNodeIterator(self): + return (x for x in self if x._isterminal) + + def commonAncestor(self,node1,node2): + anc1 = set(x for x in self.ancestorNodeIterator(node1)) + rep = [x for x in self.ancestorNodeIterator(node2) + if x in anc1] + assert len(rep)>=1 + return rep[0] + + def getDist(self,node1,node2): + ca = self.commonAncestor(node1, node2) + dist = 0 + while node1 != ca: + dist+=node1._dist + node1=node1._parent + while node2 != ca: + dist+=node2._dist + node2=node2._parent + return dist + + def farestNodes(self): + dmax=0 + n1=None + n2=None + for node1 in self.terminalNodeIterator(): + for node2 in self.terminalNodeIterator(): + d = self.getDist(node1, node2) + if d > dmax: + dmax = d + n1=node1 + n2=node2 + return node1,node2,dmax + + def setRoot(self,node,dist): + assert node in self + assert node._parent and node._dist > dist + + newroot = TreeNode(self) + parent = node._parent + node._parent = newroot + compdist = node._dist - dist + node._dist=dist + node = parent + + while node: + parent = node._parent + if parent: + dist = node._dist + + node._parent = newroot + node._dist = compdist + + newroot = node + node = parent + + if node: + compdist=dist + + for child in self.childNodeIterator(newroot): + child._parent = newroot._parent + child._dist += newroot._dist + + self.remove(newroot) + + +class TreeNode(object): + def __init__(self,tree,name=None,dist=None,bootstrap=None,**info): + self._parent=None + self._name=name + self._dist=dist + self._bootstrap=bootstrap + self._info=info + tree.registerNode(self) + self._isterminal=True + + + def linkToParent(self,parent): + assert isinstance(parent, TreeNode) or parent is None + self._parent=parent + if parent is not None: + parent._isterminal=False + + + + diff --git a/obitools/tree/dot.py b/obitools/tree/dot.py new file mode 100644 index 0000000..a21c4a1 --- /dev/null +++ b/obitools/tree/dot.py @@ -0,0 +1,18 @@ + +from obitools.utils import universalOpen +from obitools.tree import Tree,TreeNode + +def nodeWriter(tree,node,nodes): + data=[] + if node._parent: + data.append('%d -> %d ' % (nodes[node],nodes[node._parent])) + return "\n".join(data) + + +def treeWriter(tree): + nodes=dict(map(None,tree,xrange(len(tree)))) + code=[] + for node in tree: + code.append(nodeWriter(tree,node,nodes)) + code = "\n".join(code) + return 'digraph tree { node [shape=point]\n%s\n};' % code \ No newline at end of file diff --git a/obitools/tree/layout.py b/obitools/tree/layout.py new file mode 100644 index 0000000..a39ba77 --- /dev/null +++ 
b/obitools/tree/layout.py
@@ -0,0 +1,103 @@
+
+class NodeLayout(dict):
+    '''
+    Layout data associated to a tree node.
+    '''
+    pass
+
+class TreeLayout(dict):
+    '''
+    Description of a phylogenetic tree layout
+    '''
+    def addNode(self,node):
+        self[node]=NodeLayout()
+
+    def setAttribute(self,node,key,value):
+        self[node][key]=value
+
+    def hasAttribute(self,node,key):
+        return key in self[node]
+
+    def getAttribute(self,node,key,default=None):
+        return self[node].get(key,default)
+
+    def setNodesColor(self,color,predicate=True):
+        '''
+        @param color: the color to set, or a callable returning
+                      a color for a given node
+        @param predicate: a boolean, or a callable returning a boolean
+                          for a given node, selecting the nodes to color
+        '''
+        for node in self:
+            if callable(predicate):
+                change = predicate(node)
+            else:
+                change = predicate
+
+            if change:
+                if callable(color):
+                    c = color(node)
+                else:
+                    c = color
+                self.setAttribute(node, 'color', c)
+
+    def setCircular(self,iscircularpredicate):
+        for node in self:
+            if callable(iscircularpredicate):
+                change = iscircularpredicate(node)
+            else:
+                change = iscircularpredicate
+
+            if change:
+                self.setAttribute(node, 'shape', 'circle')
+            else:
+                self.setAttribute(node, 'shape', 'square')
+
+    def setRadius(self,radius,predicate=True):
+        for node in self:
+            if callable(predicate):
+                change = predicate(node)
+            else:
+                change = predicate
+
+            if change:
+                if callable(radius):
+                    r = radius(node)
+                else:
+                    r = radius
+                self.setAttribute(node, 'radius', r)
+
+def predicatGeneratorIsInfoEqual(info,value):
+    def isInfoEqual(node):
+        data = node._info
+        return data is not None and info in data and data[info]==value
+
+    return isInfoEqual
+
+def isTerminalNode(node):
+    return node._isterminal
+
+def constantColorGenerator(color):
+    def colorMaker(node):
+        return color
+
+    return colorMaker
+
+def notPredicatGenerator(predicate):
+    def notpred(x):
+        return not predicate(x)
+    return notpred
+
+
\ No newline at end of file
diff --git a/obitools/tree/newick.py b/obitools/tree/newick.py
new file mode 100644
index 0000000..c69d0d3
--- /dev/null
+++ b/obitools/tree/newick.py
@@ -0,0 +1,117 @@
+import re
+import sys
+
+from obitools.utils import universalOpen
+from obitools.tree import Tree,TreeNode
+
+def subNodeIterator(data):
+    level=0
+    start = 1
+    if data[0]=='(':
+        for i in xrange(1,len(data)):
+            c=data[i]
+            if c=='(':
+                level+=1
+            elif c==')':
+                level-=1
+            if c==',' and not level:
+                yield data[start:i]
+                start = i+1
+        yield data[start:i]
+    else:
+        yield data
+
+
+_nodeParser=re.compile('\s*(?P<subnodes>\(.*\))?(?P<name>[^ :]+)? *(?P<bootstrap>[0-9.]+)?(:(?P<distance>-?[0-9.]+))?')
+
+def nodeParser(data):
+    parsedNode = _nodeParser.match(data).groupdict(0)
+    if not parsedNode['name']:
+        parsedNode['name']=None
+
+    if not parsedNode['bootstrap']:
+        parsedNode['bootstrap']=None
+    else:
+        parsedNode['bootstrap']=float(parsedNode['bootstrap'])
+
+    if not parsedNode['distance']:
+        parsedNode['distance']=None
+    else:
+        parsedNode['distance']=float(parsedNode['distance'])
+
+    if not parsedNode['subnodes']:
+        parsedNode['subnodes']=None
+
+    return parsedNode
+
+_cleanTreeData=re.compile('\s+')
+
+def treeParser(data,tree=None,parent=None):
+    if tree is None:
+        tree = Tree()
+    data = _cleanTreeData.sub(' ',data).strip()
+
+    parsedNode = nodeParser(data)
+    node = TreeNode(tree,
+                    parsedNode['name'],
+                    parsedNode['distance'],
+                    parsedNode['bootstrap'])
+
+    node.linkToParent(parent)
+
+    if parsedNode['subnodes']:
+        for subnode in subNodeIterator(parsedNode['subnodes']):
+            treeParser(subnode,tree,node)
+    return tree
+
+_treecomment=re.compile('\[.*\]')
+
+def treeIterator(file):
+    file = universalOpen(file)
+    data = file.read()
+
+    comment = _treecomment.findall(data)
+    data=_treecomment.sub('',data).strip()
+
+    if comment:
+        comment=comment[0]
+    else:
+        comment=None
+    for tree in data.split(';'):
+        t = treeParser(tree)
+        if comment:
+            t.comment=comment
+        yield t
+
+def nodeWriter(tree,node,deep=0):
+    name = node._name
+    if name is None:
+        name=''
+
+    distance=node._dist
+    if distance is None:
+        distance=''
+    else:
+        distance = ':%6.5f' % distance
+
+    bootstrap=node._bootstrap
+    if bootstrap is None:
+        bootstrap=''
+    else:
+        bootstrap=' %d' % int(bootstrap)
+
+    nodeseparator = ',\n' + ' ' * (deep+1)
+
+    subnodes = nodeseparator.join([nodeWriter(tree, x, deep+1)
+                                   for x in tree.childNodeIterator(node)])
+    if subnodes:
+        subnodes='(\n' + ' ' * (deep+1) + subnodes + '\n' + ' ' * deep + ')'
+
+    return '%s%s%s%s' % (subnodes,name,bootstrap,distance)
+
+def treeWriter(tree,startnode=None):
+    if startnode is not None:
+        root=startnode
+    else:
+        root = tree.getRoot()
+    return nodeWriter(tree,root)+';'
diff --git a/obitools/tree/svg.py b/obitools/tree/svg.py
new file mode 100644
index 0000000..ff51a8c
--- /dev/null
+++ b/obitools/tree/svg.py
@@ -0,0 +1,70 @@
+import math
+
+from obitools.svg import Scene,Circle,Line,Rectangle,Text
+from obitools.tree import Tree
+
+def displayTreeLayout(layout,width=400,height=400,radius=3,scale=1.0):
+    '''
+    Convert a tree layout object into SVG code.
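+
+    A minimal call sketch (assumes a layout built beforehand, e.g. with
+    C{obitools.tree.unrooted.treeLayout}; the output file name is
+    illustrative)::
+
+        svgcode = displayTreeLayout(treeLayout(tree), width=600, height=600)
+        open('tree.svg','w').write(svgcode)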
+ + @param layout: the tree layout object + @type layout: obitools.tree.layout.TreeLayout + @param width: svg document width + @type width: int + @param height: svg document height + @type height: int + @param radius: default radius of node in svg unit (default 3) + @type radius: int + @param scale: scale factor applied to the svg coordinates (default 1.0) + @type scale: float + + @return: str containing svg code + ''' + xmin = min(layout.getAttribute(n,'x') for n in layout) + xmax = max(layout.getAttribute(n,'x') for n in layout) + ymin = min(layout.getAttribute(n,'y') for n in layout) + ymax = max(layout.getAttribute(n,'y') for n in layout) + + dx = xmax - xmin + dy = ymax - ymin + + xscale = width * 0.95 / dx * scale + yscale = height * 0.95 / dy * scale + + def X(x): + return (x - xmin ) * xscale + width * 0.025 + + def Y(y): + return (y - ymin ) * yscale + height * 0.025 + + scene = Scene('unrooted', height, width) + + for n in layout: + if n._parent is not None: + parent = n._parent + xf = layout.getAttribute(n,'x') + yf = layout.getAttribute(n,'y') + xp = layout.getAttribute(parent,'x') + yp = layout.getAttribute(parent,'y') + scene.add(Line((X(xf),Y(yf)),(X(xp),Y(yp)))) + + for n in layout: + xf = layout.getAttribute(n,'x') + yf = layout.getAttribute(n,'y') + cf = layout.getAttribute(n,'color') + sf = layout.getAttribute(n,'shape') + if layout.hasAttribute(n,'radius'): + rf=layout.getAttribute(n,'radius') + else: + rf=radius + + if sf=='circle': + scene.add(Circle((X(xf),Y(yf)),rf,cf)) + else: + scene.add(Rectangle((X(xf)-rf,Y(yf)-rf),2*rf,2*rf,cf)) + + + return ''.join(scene.strarray()) + + + \ No newline at end of file diff --git a/obitools/tree/unrooted.py b/obitools/tree/unrooted.py new file mode 100644 index 0000000..9a9f3e6 --- /dev/null +++ b/obitools/tree/unrooted.py @@ -0,0 +1,33 @@ +from obitools.tree.layout import TreeLayout +import math + +def subtreeLayout(tree,node,layout,start,end,x,y,default): + nbotu = tree.subTreeSize(node) + delta = (end-start)/(nbotu+1) + + layout.addNode(node) + layout.setAttribute(node,'x',x) + layout.setAttribute(node,'y',y) + layout.setAttribute(node,'color',(255,0,0)) + layout.setAttribute(node,'shape','circle') + + for subnode in tree.childNodeIterator(node): + snbotu = tree.subTreeSize(subnode) + end = start + snbotu * delta + med = start + snbotu * delta /2 + r = subnode._dist + if r is None or r <=0: + r=default + subx=math.cos(med) * r + x + suby=math.sin(med) * r + y + subtreeLayout(tree, subnode, layout, start, end, subx, suby, default) + start=end + + return layout + +def treeLayout(tree): + layout = TreeLayout() + root = tree.getRoot() + dmin = min(n._dist for n in tree if n._dist is not None and n._dist > 0) + return subtreeLayout(tree,root,layout,0,2*math.pi,0,0,dmin / 100) + \ No newline at end of file diff --git a/obitools/unit/__init__.py b/obitools/unit/__init__.py new file mode 100644 index 0000000..d02c812 --- /dev/null +++ b/obitools/unit/__init__.py @@ -0,0 +1,8 @@ +import unittest + +from obitools import tests_group as obitools_tests_group + +tests_group=obitools_tests_group + + + diff --git a/obitools/unit/obitools/__init__.py b/obitools/unit/obitools/__init__.py new file mode 100644 index 0000000..ab1bcec --- /dev/null +++ b/obitools/unit/obitools/__init__.py @@ -0,0 +1,89 @@ +import unittest + +import obitools + +class BioseqTest(unittest.TestCase): + + sequenceId = 'id1' + sequenceDefinition = 'sequence definition' + sequenceQualifier = {'extra':3} + + def setUp(self): + self.bioseq = 
self.bioseqClass(self.sequenceId, + self.sequenceString, + self.sequenceDefinition, + **self.sequenceQualifier) + + title = self.__doc__.strip() + underline = "=" * len(title) + + #print "%s\n%s" % (title,underline) + + def tearDown(self): + pass + #print "\n" + + def testIdAttribute(self): + ''' + test if id attribute exists + ''' + self.failUnless(hasattr(self.bioseq, 'id'), 'id missing attribute') + + def testIdValue(self): + ''' + test if id attribute value is 'id1' + ''' + self.failUnlessEqual(self.bioseq.id, 'id1', + 'identifier is created with good value') + + def testDefinitionAttribute(self): + ''' + test if definition attribute exists + ''' + self.failUnless(hasattr(self.bioseq, 'definition'), 'definition missing attribute') + + def testSequenceIsLowerCase(self): + ''' + test if sequence is stored as lower case letter + ''' + self.failUnlessEqual(str(self.bioseq), + str(self.bioseq).lower(), + "Sequence is not stored as lower case string") + + def testSequenceQualifier(self): + ''' + test if the extra qualifier is present and its value is three. + ''' + self.failUnlessEqual(self.bioseq['extra'], + 3, + "Sequence qualifier cannot be successfully retrieve") + + def testCreateSequenceQualifier(self): + self.bioseq['testqualifier']='ok' + self.failUnlessEqual(self.bioseq['testqualifier'], + 'ok', + "Sequence qualifier cannot be successfully created") + + + +class NucBioseqTest(BioseqTest): + ''' + Test obitools.NucSequence class + ''' + + bioseqClass = obitools.NucSequence + sequenceString = 'AACGT' * 5 + + +class AABioseqTest(BioseqTest): + ''' + Test obitools.AASequence class + ''' + + bioseqClass = obitools.AASequence + sequenceString = 'MLKCVT' * 5 + + + + +tests_group = [NucBioseqTest,AABioseqTest] \ No newline at end of file diff --git a/obitools/utils/__init__.py b/obitools/utils/__init__.py new file mode 100644 index 0000000..fd7076f --- /dev/null +++ b/obitools/utils/__init__.py @@ -0,0 +1,324 @@ +import sys + +import time +import re +import shelve + +from threading import Lock +from logging import warning +import urllib2 + +from obitools.gzip import GzipFile +from obitools.zipfile import ZipFile +import os.path + + +class FileFormatError(Exception): + pass + + + +def universalOpen(file,*options): + ''' + Open a file gziped or not. + + If file is a C{str} instance, file is + concidered as a file name. In this case + the C{.gz} suffixe is tested to eventually + open it a a gziped file. + + If file is an other kind of object, it is assumed + that this object follow the C{file} interface + and it is return as is. + + @param file: the file to open + @type file: C{str} or a file like object + + @return: an iterator on text lines. + ''' + if isinstance(file,str): + if urllib2.urlparse.urlparse(file)[0]=='': + rep = open(file,*options) + else: + rep = urllib2.urlopen(file,timeout=15) + + if file[-3:] == '.gz': + rep = GzipFile(fileobj=rep) + if file[-4:] == '.zip': + zip = ZipFile(file=rep) + data = zip.infolist() + assert len(data)==1,'Only zipped file containning a single file can be open' + name = data[0].filename + rep = zip.open(name) + else: + rep = file + return rep + +def universalTell(file): + ''' + Return the position in the file even if + it is a gziped one. + + @param file: the file to check + @type file: a C{file} like instance + + @return: position in the file + @rtype: C{int} + ''' + if isinstance(file, GzipFile): + file=file.myfileobj + return file.tell() + +def fileSize(file): + ''' + Return the file size even if it is a + gziped one. 
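+
+    A sketch of the intended use together with L{universalTell} and
+    L{progressBar} (the file name is illustrative)::
+
+        f = universalOpen('sequences.fasta.gz')
+        size = fileSize(f)
+        for line in f:
+            progressBar(universalTell(f), size)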
+ + @param file: the file to check + @type file: a C{file} like instance + + @return: the size of the file + @rtype: C{int} + ''' + if isinstance(file, GzipFile): + file=file.myfileobj + pos = file.tell() + file.seek(0,2) + length = file.tell() + file.seek(pos,0) + return length + +def progressBar(pos,maxi,reset=False,head='',delta=[],step=[1,0,0]): + if reset: + del delta[:] + if not delta: + delta.append(time.time()) + delta.append(time.time()) + assert maxi>0 + + step[1]+=1 + if step[1] % step[0] == 0: + step[1]=1 + newtime = time.time() + d = newtime-delta[1] + if d < 0.2: + step[0]*=2 + elif d > 0.4 and step[0]>1: + step[0]/=2 + + delta[1]=newtime + elapsed = delta[1]-delta[0] + + if callable(pos): + pos=pos() + percent = float(pos)/maxi * 100 + remain = time.gmtime(elapsed / percent * (100-percent)) + days = remain.tm_yday - 1 + hour = remain.tm_hour + minu = remain.tm_min + sec = remain.tm_sec + if days: + remain = "%d days %02d:%02d:%02d" % (days,hour,minu,sec) + else: + remain = "%02d:%02d:%02d" % (hour,minu,sec) + bar = '#' * int(percent/2) + step[2]=(step[2]+1) % 4 + bar+= '|/-\\'[step[2]] + bar+= ' ' * (50 - int(percent/2)) + sys.stderr.write('\r%s %5.1f %% |%s] remain : %s' %(head,percent,bar,remain)) + else: + step[1]+=1 + +def endLessIterator(endedlist): + for x in endedlist: + yield x + while(1): + yield endedlist[-1] + + +def multiLineWrapper(lineiterator): + ''' + Aggregator of strings. + + @param lineiterator: a stream of strings from an opened OBO file. + @type lineiterator: a stream of strings. + + @return: an aggregated stanza. + @rtype: an iterotor on str + + @note: The aggregator aggregates strings from an opened OBO file. + When the length of a string is < 2, the current stanza is over. + ''' + + for line in lineiterator: + rep = [line] + while len(line)>=2 and line[-2]=='\\': + rep[-1]=rep[-1][0:-2] + try: + line = lineiterator.next() + except StopIteration: + raise FileFormatError + rep.append(line) + yield ''.join(rep) + + +def skipWhiteLineIterator(lineiterator): + ''' + Curator of stanza. + + @param lineiterator: a stream of strings from an opened OBO file. + @type lineiterator: a stream of strings. + + @return: a stream of strings without blank strings. + @rtype: a stream strings + + @note: The curator skip white lines of the current stanza. 
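+
+    Example (the input path and the handler are illustrative)::
+
+        for line in skipWhiteLineIterator(open('ontology.obo')):
+            handleLine(line)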
+class ColumnFile(object):
+
+    def __init__(self,stream,sep=None,strip=True,
+                 types=None,skip=None,head=None,
+                 extra=None,
+                 extraformat='([a-zA-Z]\w*) *= *([^;]+);'):
+        self._stream = universalOpen(stream)
+        self._delimiter=sep
+        self._strip=strip
+        self._extra=extra
+        self._extraformat = re.compile(extraformat)
+
+        if types:
+            self._types=[x for x in types]
+            for i in xrange(len(self._types)):
+                if self._types[i] is bool:
+                    self._types[i]=ColumnFile.str2bool
+        else:
+            self._types=None
+
+        self._skip = skip
+        if skip is not None:
+            self._lskip= len(skip)
+        else:
+            self._lskip= 0
+        self._head=head
+
+    def str2bool(x):
+        return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
+
+    str2bool = staticmethod(str2bool)
+
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+
+        def cast(txt,type):
+            try:
+                v = type(txt)
+            except:
+                v=None
+            return v
+
+        ligne = self._stream.next()
+        if self._skip is not None:
+            while ligne[0:self._lskip]==self._skip:
+                ligne = self._stream.next()
+        if self._extra is not None:
+            try:
+                (ligne,extra) = ligne.rsplit(self._extra,1)
+                extra = dict(self._extraformat.findall(extra))
+            except ValueError:
+                extra=None
+        else:
+            extra = None
+        data = ligne.split(self._delimiter)
+        if self._strip or self._types:
+            data = [x.strip() for x in data]
+        if self._types:
+            it = endLessIterator(self._types)
+            data = [cast(*x) for x in ((y,it.next()) for y in data)]
+        if self._head is not None:
+            data=dict(map(None, self._head,data))
+            if extra is not None:
+                data['__extra__']=extra
+        else:
+            if extra is not None:
+                data.append(extra)
+        return data
+
+    def tell(self):
+        return universalTell(self._stream)
+
+
+class CachedDB(object):
+
+    def __init__(self,cachefile,masterdb):
+        self._cache = shelve.open(cachefile,'c')
+        self._db = masterdb
+        self._lock=Lock()
+
+    def _cacheSeq(self,seq):
+        self._lock.acquire()
+        self._cache[seq.id]=seq
+        self._lock.release()
+        return seq
+
+    def __getitem__(self,ac):
+        if isinstance(ac,str):
+            self._lock.acquire()
+            if ac in self._cache:
+                data = self._cache[ac]
+                self._lock.release()
+            else:
+                self._lock.release()
+                data = self._db[ac]
+                self._cacheSeq(data)
+            return data
+        else:
+            self._lock.acquire()
+            acs = [[x,self._cache.get(x,None)] for x in ac]
+            self._lock.release()
+            newacs = [ac for ac,cached in acs if cached is None]
+            if newacs:
+                newseqs = self._db[newacs]
+            else:
+                newseqs = iter([])
+            for r in acs:
+                if r[1] is None:
+                    r[1]=self._cacheSeq(newseqs.next())
+            return (x[1] for x in acs)
+
+
+def moduleInDevelopment(name):
+    warning('This module %s is under development : use it with caution' % name)
+
+
+def deprecatedScript(newscript):
+    current = sys.argv[0]
+    print >>sys.stderr," "
+    print >>sys.stderr," "
+    print >>sys.stderr," "
+    print >>sys.stderr,"#########################################################"
+    print >>sys.stderr,"#                                                       #"
+    print >>sys.stderr,"    W A R N I N G :"
+    print >>sys.stderr,"    %s is a deprecated script " % os.path.split(current)[1]
+    print >>sys.stderr,"    it will disappear in the next obitools version"
+    print >>sys.stderr," "
+    print >>sys.stderr,"    The new corresponding command is %s " % newscript
+    print >>sys.stderr,"#                                                       #"
+    print >>sys.stderr,"#########################################################"
+    print >>sys.stderr," "
+    print >>sys.stderr," "
+    print >>sys.stderr," "
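A minimal sketch of the ColumnFile class above, on hypothetical in-memory data: bool columns go through str2bool(), skip='#' drops comment lines, and head turns every row into a dict:

    from cStringIO import StringIO
    from obitools.utils import ColumnFile

    raw = StringIO("#id\tcount\tkept\nseq1\t12\tTrue\nseq2\t7\tFalse\n")
    table = ColumnFile(raw, sep='\t', types=(str, int, bool),
                       skip='#', head=('id', 'count', 'kept'))
    for row in table:
        # e.g. {'id': 'seq1', 'count': 12, 'kept': True}
        assert isinstance(row['count'], int)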
" diff --git a/obitools/utils/__init__.pyc b/obitools/utils/__init__.pyc new file mode 100644 index 0000000..99512dc Binary files /dev/null and b/obitools/utils/__init__.pyc differ diff --git a/obitools/utils/bioseq.py b/obitools/utils/bioseq.py new file mode 100644 index 0000000..71337c7 --- /dev/null +++ b/obitools/utils/bioseq.py @@ -0,0 +1,232 @@ +def mergeTaxonomyClassification(uniqSeq,taxonomy): + for seq in uniqSeq: + if seq['merged_taxid']: + seq['taxid']=taxonomy.lastCommonTaxon(*seq['merged_taxid'].keys()) + tsp = taxonomy.getSpecies(seq['taxid']) + tgn = taxonomy.getGenus(seq['taxid']) + tfa = taxonomy.getFamily(seq['taxid']) + + if tsp is not None: + sp_sn = taxonomy.getScientificName(tsp) + else: + sp_sn="###" + tsp=-1 + + if tgn is not None: + gn_sn = taxonomy.getScientificName(tgn) + else: + gn_sn="###" + tgn=-1 + + if tfa is not None: + fa_sn = taxonomy.getScientificName(tfa) + else: + fa_sn="###" + tfa=-1 + + seq['species']=tsp + seq['genus']=tgn + seq['family']=tfa + + seq['species_sn']=sp_sn + seq['genus_sn']=gn_sn + seq['family_sn']=fa_sn + + seq['rank']=taxonomy.getRank(seq['taxid']) + seq['scientific_name']=fa_sn = taxonomy.getScientificName(seq['taxid']) + +def uniqSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,categories=None): + uniques={} + uniqSeq=[] + + if categories is None: + categories=[] + + if mergedKey is not None: + mergedKey=set(mergedKey) + else: + mergedKey=set() + + if taxonomy is not None: + mergedKey.add('taxid') + + for seq in seqIterator: + s = tuple(seq[x] for x in categories) + (str(seq),) + if s in uniques: + s = uniques[s] + if 'count' in seq: + s['count']+=seq['count'] + else: + s['count']+=1 +# if taxonomy is not None and 'taxid' in seq: +# s['merged_taxid'][seq['taxid']]= + for key in mergedKey: + if key=='taxid' and mergeIds: + if 'taxid_dist' in seq: + s["taxid_dist"].update(seq["taxid_dist"]) + if 'taxid' in seq: + s["taxid_dist"][seq.id]=seq['taxid'] + + mkey = "merged_%s" % key + if key in seq: + s[mkey][seq[key]]=s[mkey].get(seq[key],0)+1 + if mkey in seq: + for skey in seq[mkey]: + if skey in s: + s[mkey][skey]=s[mkey].get(seq[skey],0)+seq[mkey][skey] + else: + s[mkey][skey]=seq[mkey][skey] + + for key in seq.iterkeys(): + # Merger proprement l'attribut merged s'il exist + if key in s and s[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged': + del(s[key]) + + + if mergeIds: + s['merged'].append(seq.id) + else: + uniques[s]=seq + for key in mergedKey: + if key=='taxid' and mergeIds: + if 'taxid_dist' not in seq: + seq["taxid_dist"]={} + if 'taxid' in seq: + seq["taxid_dist"][seq.id]=seq['taxid'] + mkey = "merged_%s" % key + if mkey not in seq: + seq[mkey]={} + if key in seq: + seq[mkey][seq[key]]=seq[mkey].get(seq[key],0)+1 + del(seq[key]) + + if 'count' not in seq: + seq['count']=1 + if mergeIds: + seq['merged']=[seq.id] + uniqSeq.append(seq) + + if taxonomy is not None: + mergeTaxonomyClassification(uniqSeq, taxonomy) + + + + return uniqSeq + +def uniqPrefixSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,categories=None): + + if categories is None: + categories=[] + + def cmpseq(s1,s2): + return cmp(str(s1),str(s2)) + + if mergedKey is not None: + mergedKey=set(mergedKey) + else: + mergedKey=set() + + if taxonomy is not None: + mergedKey.add('taxid') + + sequences=list(seqIterator) + + if not sequences: + return [] + + sequences.sort(cmpseq) + + + old=sequences.pop() + uniqSeq=[old] + if 'count' not in old: + old['count']=1 + for key in mergedKey: + mkey = "merged_%s" % key + 
+def uniqPrefixSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,categories=None):
+
+    if categories is None:
+        categories=[]
+
+    def cmpseq(s1,s2):
+        return cmp(str(s1),str(s2))
+
+    if mergedKey is not None:
+        mergedKey=set(mergedKey)
+    else:
+        mergedKey=set()
+
+    if taxonomy is not None:
+        mergedKey.add('taxid')
+
+    sequences=list(seqIterator)
+
+    if not sequences:
+        return []
+
+    sequences.sort(cmpseq)
+
+    old=sequences.pop()
+    uniqSeq=[old]
+    if 'count' not in old:
+        old['count']=1
+    for key in mergedKey:
+        mkey = "merged_%s" % key
+        if mkey not in old:
+            old[mkey]={}
+        if key in old:
+            old[mkey][old[key]]=old[mkey].get(old[key],0)+1
+    if mergeIds:
+        old['merged']=[old.id]
+
+
+    while(sequences):
+        seq=sequences.pop()
+        lseq=len(seq)
+        pold = str(old)[0:lseq]
+        if pold==str(seq):
+
+            if 'count' in seq:
+                old['count']+=seq['count']
+            else:
+                old['count']+=1
+
+            for key in mergedKey:
+                mkey = "merged_%s" % key
+                if key in seq:
+                    old[mkey][seq[key]]=old[mkey].get(seq[key],0)+1
+                if mkey in seq:
+                    for skey in seq[mkey]:
+                        old[mkey][skey]=old[mkey].get(skey,0)+seq[mkey][skey]
+
+            for key in seq.iterkeys():
+                if key in old and old[key]!=seq[key]:
+                    del(old[key])
+
+            if mergeIds:
+                old['merged'].append(seq.id)
+        else:
+            old=seq
+
+            for key in mergedKey:
+                mkey = "merged_%s" % key
+                if mkey not in seq:
+                    seq[mkey]={}
+                if key in seq:
+                    seq[mkey][seq[key]]=seq[mkey].get(seq[key],0)+1
+                    del(seq[key])
+
+            if 'count' not in seq:
+                seq['count']=1
+            if mergeIds:
+                seq['merged']=[seq.id]
+            uniqSeq.append(seq)
+
+    if taxonomy is not None:
+        mergeTaxonomyClassification(uniqSeq, taxonomy)
+
+    return uniqSeq
+
+
+
+
+def _cmpOnKeyGenerator(key,reverse=False):
+    def compare(x,y):
+        try:
+            c1 = x[key]
+        except KeyError:
+            c1=None
+
+        try:
+            c2 = y[key]
+        except KeyError:
+            c2=None
+
+        if reverse:
+            s=c1
+            c1=c2
+            c2=s
+        return cmp(c1,c2)
+
+    return compare
+
+def sortSequence(seqIterator,key,reverse=False):
+    seqs = list(seqIterator)
+    seqs.sort(_cmpOnKeyGenerator(key, reverse))
+    return seqs
+
\ No newline at end of file
diff --git a/obitools/utils/crc64.py b/obitools/utils/crc64.py
new file mode 100644
index 0000000..537391e
--- /dev/null
+++ b/obitools/utils/crc64.py
@@ -0,0 +1,53 @@
+#
+# Code obtained from :
+# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/259177/index_txt
+#
+
+# Initialisation
+# 32 first bits of the generator polynomial for CRC64;
+# the 32 lower bits are assumed to be zero
+
+POLY64REVh = 0xd8000000L
+CRCTableh = [0] * 256
+CRCTablel = [0] * 256
+isInitialized = False
+
+def CRC64(aString):
+    global isInitialized
+    crcl = 0
+    crch = 0
+    if (isInitialized is not True):
+        isInitialized = True
+        for i in xrange(256):
+            partl = i
+            parth = 0L
+            for j in xrange(8):
+                rflag = partl & 1L
+                partl >>= 1L
+                if (parth & 1):
+                    partl |= (1L << 31L)
+                parth >>= 1L
+                if rflag:
+                    parth ^= POLY64REVh
+            CRCTableh[i] = parth
+            CRCTablel[i] = partl
+
+    for item in aString:
+        shr = (crch & 0xFF) << 24
+        temp1h = crch >> 8L
+        temp1l = (crcl >> 8L) | shr
+        tableindex = (crcl ^ ord(item)) & 0xFF
+
+        crch = temp1h ^ CRCTableh[tableindex]
+        crcl = temp1l ^ CRCTablel[tableindex]
+    return (crch, crcl)
+
+def CRC64digest(aString):
+    return "%08X%08X" % (CRC64(aString))
+
+if __name__ == '__main__':
+    assert CRC64("IHATEMATH") == (3822890454, 2600578513)
+    assert CRC64digest("IHATEMATH") == "E3DCADD69B01ADD1"
+    print 'CRC64: dumb test successful'
+
diff --git a/obitools/utils/iterator.py b/obitools/utils/iterator.py
new file mode 100644
index 0000000..f53537f
--- /dev/null
+++ b/obitools/utils/iterator.py
@@ -0,0 +1,8 @@
+from itertools import chain
+
+def uniqueChain(*args):
+    seen = set()
+    for x in chain(*args):
+        if x not in seen:
+            seen.add(x)
+            yield x
\ No newline at end of file
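uniqueChain() above is itertools.chain() plus first-seen deduplication; items must be hashable. A one-line illustration (editorial):

    from obitools.utils.iterator import uniqueChain

    assert list(uniqueChain([1, 2], (2, 3), [1, 4])) == [1, 2, 3, 4]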
diff --git a/obitools/utils/iterator.pyc b/obitools/utils/iterator.pyc
new file mode 100644
index 0000000..88d415e
Binary files /dev/null and b/obitools/utils/iterator.pyc differ
diff --git a/obitools/word/__init__.py b/obitools/word/__init__.py
new file mode 100644
index 0000000..c1a4b6b
--- /dev/null
+++ b/obitools/word/__init__.py
@@ -0,0 +1,72 @@
+from itertools import imap
+from _binary import *
+
+def wordCount(liste):
+    count = {}
+
+    for e in liste:
+        count[e]=count.get(e,0) + 1
+
+    return count
+
+
+def wordIterator(sequence,lword,step=1,endIncluded=False,circular=False):
+
+    assert not (endIncluded and circular), \
+        "endIncluded and circular cannot both be set to True at the same time"
+
+    L = len(sequence)
+    sequence = str(sequence)
+    if circular:
+        sequence += sequence[0:lword]
+        pmax=L
+    elif endIncluded:
+        pmax=L
+    else:
+        pmax = L - lword + 1
+
+    pos = xrange(0,pmax,step)
+
+    for x in pos:
+        yield encodeWord(sequence[x:x+lword])
+
+
+
+def wordSelector(words,accept=None,reject=None):
+    '''
+    Filter over a DNA word iterator.
+
+    @param words: an iterator over DNA words
+    @type words: an iterator
+    @param accept: a list of predicates. Each predicate is a function
+                   accepting one str parameter and returning a boolean
+                   value.
+    @type accept: list
+    @param reject: a list of predicates. Each predicate is a function
+                   accepting one str parameter and returning a boolean
+                   value.
+    @type reject: list
+
+    @return: an iterator on DNA words (str)
+    @rtype: iterator
+    '''
+    if accept is None:
+        accept=[]
+    if reject is None:
+        reject=[]
+    for w in words:
+        accepted = reduce(lambda x,y: bool(x) and bool(y),
+                          (p(w) for p in accept),
+                          True)
+        rejected = reduce(lambda x,y: bool(x) or bool(y),
+                          (p(w) for p in reject),
+                          False)
+        if accepted and not rejected:
+            yield w
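The two generators above compose into a filtered k-mer stream. A sketch (editorial) that assumes encodeWord()/decodeWord() from the _binary C extension convert between DNA strings and integer word codes, with decodeWord(word, size) returning the word as a string; the predicate itself is hypothetical:

    from obitools.word import wordIterator, wordSelector, decodeWord

    def rejectCGStart(w):
        # hypothetical reject predicate: drop words beginning with 'cg'
        return decodeWord(w, 4).startswith('cg')

    words = wordIterator('acgtacgtcgat', 4, step=1)
    kept = [decodeWord(w, 4)
            for w in wordSelector(words, reject=[rejectCGStart])]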
in a word") + + optionManager.add_option('-a','--accepted', + action="append",dest="acceptedPattern", + metavar="", + default=[], + type="str", + help="pattern of accepted oligonucleotide") + + optionManager.add_option('-r','--rejected', + action="append",dest="rejectedPattern", + metavar="", + default=[], + type="str", + help="pattern of rejected oligonucleotide") + + optionManager.add_option('-p','--homopolymer', + action="store", dest="homopolymere", + metavar="<###>", + type="int", + default=0, + help="reject oligo with homopolymer longer than.") + + optionManager.add_option('-P','--homopolymer-min', + action="store", dest="homopolymere_min", + metavar="<###>", + type="int", + default=0, + help="accept only oligo with homopolymer longer or equal to.") + +def dnaWordIterator(options): + + assert options.oligoSize is not None or options.oligoList is not None,"option -s or --oligo-size must be specified" + assert options.familySize is not None,"option -f or --family-size must be specified" + assert options.oligoDist is not None,"option -d or --distance must be specified" + + if options.oligoList is not None: + words = (encodeWord(x.strip().lower()) for x in open(options.oligoList)) + else: + words = allDNAWordIterator(options.oligoSize) + #seed = 'a' * options.oligoSize + options.acceptedOligo=[] + for p in options.acceptedPattern: + assert len(p)==options.oligoSize,"Accept pattern with bad lenth : %s" % p + options.acceptedOligo.append(predicate.predicateMatchPattern(p, options.oligoSize)) + + options.rejectedOligo=[] + for p in options.rejectedPattern: + assert len(p)==options.oligoSize,"Reject pattern with bad lenth : %s" % p + options.rejectedOligo.append(predicate.predicateMatchPattern(p, options.oligoSize)) + + + #options.acceptedOligo.append(predicat.distMinGenerator(seed, options.oligoDist)) + + if options.homopolymere: + options.rejectedOligo.append(predicate.predicateHomoPolymerLarger(options.homopolymere, options.oligoSize)) + + if options.homopolymere_min: + options.acceptedOligo.append(predicate.predicateHomoPolymerLarger(options.homopolymere_min-1, options.oligoSize)) + + if options.gcMax: + options.rejectedOligo.append(predicate.predicateGCUpperBond(options.gcMax, options.oligoSize)) + + return wordSelector(words, options.acceptedOligo, options.rejectedOligo) diff --git a/obitools/word/predicate.py b/obitools/word/predicate.py new file mode 100644 index 0000000..082b80f --- /dev/null +++ b/obitools/word/predicate.py @@ -0,0 +1,41 @@ +#@PydevCodeAnalysisIgnore +''' +Created on 14 oct. 2009 + +@author: coissac +''' + +from _binary import wordDist, \ + homoMax, \ + countCG, \ + matchPattern, \ + encodePattern + +def predicateWordDistMin(word,dmin,size): + def predicate(w): + return wordDist(word, w) >= dmin + return predicate + +def predicateHomoPolymerLarger(count,size): + def predicate(w): + return homoMax(w, size) > count + return predicate + +def predicateHomoPolymerSmaller(count,size): + def predicate(w): + return homoMax(w, size) < count + return predicate + +def predicateGCUpperBond(count,size): + def predicate(w): + return countCG(w, size) > count + return predicate + +def predicateMatchPattern(pattern,size): + pattern=encodePattern(pattern) + def predicate(w): + return matchPattern(w, pattern) + return predicate + + + diff --git a/obitools/zipfile.py b/obitools/zipfile.py new file mode 100644 index 0000000..41e4bcb --- /dev/null +++ b/obitools/zipfile.py @@ -0,0 +1,1282 @@ +""" +Read and write ZIP files. 
+""" +import struct, os, time, sys, shutil +import binascii, cStringIO + +try: + import zlib # We may need its compression method + crc32 = zlib.crc32 +except ImportError: + zlib = None + crc32 = binascii.crc32 + +__all__ = ["BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", "is_zipfile", + "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile" ] + +class BadZipfile(Exception): + pass + + +class LargeZipFile(Exception): + """ + Raised when writing a zipfile, the zipfile requires ZIP64 extensions + and those extensions are disabled. + """ + +error = BadZipfile # The exception raised by this module + +ZIP64_LIMIT= (1 << 31) - 1 + +# constants for Zip file compression methods +ZIP_STORED = 0 +ZIP_DEFLATED = 8 +# Other ZIP compression methods not supported + +# Here are some struct module formats for reading headers +structEndArchive = "<4s4H2LH" # 9 items, end of archive, 22 bytes +stringEndArchive = "PK\005\006" # magic number for end of archive record +structCentralDir = "<4s4B4HLLL5HLL"# 19 items, central directory, 46 bytes +stringCentralDir = "PK\001\002" # magic number for central directory +structFileHeader = "<4s2B4HLLL2H" # 12 items, file header record, 30 bytes +stringFileHeader = "PK\003\004" # magic number for file header +structEndArchive64Locator = "<4sLQL" # 4 items, locate Zip64 header, 20 bytes +stringEndArchive64Locator = "PK\x06\x07" # magic token for locator header +structEndArchive64 = "<4sQHHLLQQQQ" # 10 items, end of archive (Zip64), 56 bytes +stringEndArchive64 = "PK\x06\x06" # magic token for Zip64 header + + +# indexes of entries in the central directory structure +_CD_SIGNATURE = 0 +_CD_CREATE_VERSION = 1 +_CD_CREATE_SYSTEM = 2 +_CD_EXTRACT_VERSION = 3 +_CD_EXTRACT_SYSTEM = 4 # is this meaningful? +_CD_FLAG_BITS = 5 +_CD_COMPRESS_TYPE = 6 +_CD_TIME = 7 +_CD_DATE = 8 +_CD_CRC = 9 +_CD_COMPRESSED_SIZE = 10 +_CD_UNCOMPRESSED_SIZE = 11 +_CD_FILENAME_LENGTH = 12 +_CD_EXTRA_FIELD_LENGTH = 13 +_CD_COMMENT_LENGTH = 14 +_CD_DISK_NUMBER_START = 15 +_CD_INTERNAL_FILE_ATTRIBUTES = 16 +_CD_EXTERNAL_FILE_ATTRIBUTES = 17 +_CD_LOCAL_HEADER_OFFSET = 18 + +# indexes of entries in the local file header structure +_FH_SIGNATURE = 0 +_FH_EXTRACT_VERSION = 1 +_FH_EXTRACT_SYSTEM = 2 # is this meaningful? 
+_FH_GENERAL_PURPOSE_FLAG_BITS = 3 +_FH_COMPRESSION_METHOD = 4 +_FH_LAST_MOD_TIME = 5 +_FH_LAST_MOD_DATE = 6 +_FH_CRC = 7 +_FH_COMPRESSED_SIZE = 8 +_FH_UNCOMPRESSED_SIZE = 9 +_FH_FILENAME_LENGTH = 10 +_FH_EXTRA_FIELD_LENGTH = 11 + +def is_zipfile(filename): + """Quickly see if file is a ZIP file by checking the magic number.""" + try: + fpin = open(filename, "rb") + endrec = _EndRecData(fpin) + fpin.close() + if endrec: + return True # file has correct magic number + except IOError: + pass + return False + +def _EndRecData64(fpin, offset, endrec): + """ + Read the ZIP64 end-of-archive records and use that to update endrec + """ + locatorSize = struct.calcsize(structEndArchive64Locator) + fpin.seek(offset - locatorSize, 2) + data = fpin.read(locatorSize) + sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) + if sig != stringEndArchive64Locator: + return endrec + + if diskno != 0 or disks != 1: + raise BadZipfile("zipfiles that span multiple disks are not supported") + + # Assume no 'zip64 extensible data' + endArchiveSize = struct.calcsize(structEndArchive64) + fpin.seek(offset - locatorSize - endArchiveSize, 2) + data = fpin.read(endArchiveSize) + sig, sz, create_version, read_version, disk_num, disk_dir, \ + dircount, dircount2, dirsize, diroffset = \ + struct.unpack(structEndArchive64, data) + if sig != stringEndArchive64: + return endrec + + # Update the original endrec using data from the ZIP64 record + endrec[1] = disk_num + endrec[2] = disk_dir + endrec[3] = dircount + endrec[4] = dircount2 + endrec[5] = dirsize + endrec[6] = diroffset + return endrec + + +def _EndRecData(fpin): + """Return data from the "End of Central Directory" record, or None. + + The data is a list of the nine items in the ZIP "End of central dir" + record followed by a tenth item, the file seek offset of this record.""" + fpin.seek(-22, 2) # Assume no archive comment. + filesize = fpin.tell() + 22 # Get file size + data = fpin.read() + if data[0:4] == stringEndArchive and data[-2:] == "\000\000": + endrec = struct.unpack(structEndArchive, data) + endrec = list(endrec) + endrec.append("") # Append the archive comment + endrec.append(filesize - 22) # Append the record start offset + if endrec[-4] == 0xffffffff: + return _EndRecData64(fpin, -22, endrec) + return endrec + # Search the last END_BLOCK bytes of the file for the record signature. + # The comment is appended to the ZIP file and has a 16 bit length. + # So the comment may be up to 64K long. We limit the search for the + # signature to a few Kbytes at the end of the file for efficiency. + # also, the signature must not appear in the comment. 
+    END_BLOCK = min(filesize, 1024 * 4)
+    fpin.seek(filesize - END_BLOCK, 0)
+    data = fpin.read()
+    start = data.rfind(stringEndArchive)
+    if start >= 0:     # Correct signature string was found
+        endrec = struct.unpack(structEndArchive, data[start:start+22])
+        endrec = list(endrec)
+        comment = data[start+22:]
+        if endrec[7] == len(comment):     # Comment length checks out
+            # Append the archive comment and start offset
+            endrec.append(comment)
+            endrec.append(filesize - END_BLOCK + start)
+            if endrec[-4] == 0xffffffff:
+                return _EndRecData64(fpin, - END_BLOCK + start, endrec)
+            return endrec
+    return      # Error, return None
+
+
+class ZipInfo (object):
+    """Class with attributes describing each file in the ZIP archive."""
+
+    __slots__ = (
+            'orig_filename',
+            'filename',
+            'date_time',
+            'compress_type',
+            'comment',
+            'extra',
+            'create_system',
+            'create_version',
+            'extract_version',
+            'reserved',
+            'flag_bits',
+            'volume',
+            'internal_attr',
+            'external_attr',
+            'header_offset',
+            'CRC',
+            'compress_size',
+            'file_size',
+            '_raw_time',
+        )
+
+    def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
+        self.orig_filename = filename   # Original file name in archive
+
+        # Terminate the file name at the first null byte.  Null bytes in file
+        # names are used as tricks by viruses in archives.
+        null_byte = filename.find(chr(0))
+        if null_byte >= 0:
+            filename = filename[0:null_byte]
+        # This is used to ensure paths in generated ZIP files always use
+        # forward slashes as the directory separator, as required by the
+        # ZIP format specification.
+        if os.sep != "/" and os.sep in filename:
+            filename = filename.replace(os.sep, "/")
+
+        self.filename = filename        # Normalized file name
+        self.date_time = date_time      # year, month, day, hour, min, sec
+        # Standard values:
+        self.compress_type = ZIP_STORED # Type of compression for the file
+        self.comment = ""               # Comment for each file
+        self.extra = ""                 # ZIP extra data
+        if sys.platform == 'win32':
+            self.create_system = 0      # System which created ZIP archive
+        else:
+            # Assume everything else is unix-y
+            self.create_system = 3      # System which created ZIP archive
+        self.create_version = 20        # Version which created ZIP archive
+        self.extract_version = 20       # Version needed to extract archive
+        self.reserved = 0               # Must be zero
+        self.flag_bits = 0              # ZIP flag bits
+        self.volume = 0                 # Volume number of file header
+        self.internal_attr = 0          # Internal attributes
+        self.external_attr = 0          # External file attributes
+        # Other attributes are set by class ZipFile:
+        # header_offset         Byte offset to the file header
+        # CRC                   CRC-32 of the uncompressed file
+        # compress_size         Size of the compressed file
+        # file_size             Size of the uncompressed file
+
+    def FileHeader(self):
+        """Return the per-file header as a string."""
+        dt = self.date_time
+        dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
+        dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
+        if self.flag_bits & 0x08:
+            # Set these to zero because we write them after the file data
+            CRC = compress_size = file_size = 0
+        else:
+            CRC = self.CRC
+            compress_size = self.compress_size
+            file_size = self.file_size
+
+        extra = self.extra
+
+        if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT:
+            # File is larger than what fits into a 4 byte integer,
+            # fall back to the ZIP64 extension
+            fmt = '<HHQQ'
+            extra = extra + struct.pack(fmt,
+                    1, struct.calcsize(fmt)-4, file_size, compress_size)
+            file_size = 0xffffffff # -1
+            compress_size = 0xffffffff # -1
+            self.extract_version = max(45, self.extract_version)
+            self.create_version = max(45, self.extract_version)
+
+        header = struct.pack(structFileHeader, stringFileHeader,
+                 self.extract_version, self.reserved, self.flag_bits,
+                 self.compress_type, dostime, dosdate, CRC,
+                 compress_size, file_size,
+                 len(self.filename), len(extra))
+        return header + self.filename + extra
+
+    def _decodeExtra(self):
+        # Try to decode the extra field.
+        extra = self.extra
+        unpack = struct.unpack
+        while extra:
+            tp, ln = unpack('<HH', extra[:4])
+            if tp == 1:
+                if ln >= 24:
+                    counts = unpack('<QQQ', extra[4:28])
+                elif ln == 16:
+                    counts = unpack('<QQ', extra[4:20])
+                elif ln == 8:
+                    counts = unpack('<Q', extra[4:12])
+                elif ln == 0:
+                    counts = ()
+                else:
+                    raise RuntimeError, "Corrupt extra field %s"%(ln,)
+
+                idx = 0
+
+                # ZIP64 extension (large files and/or large archives)
+                if self.file_size == 0xFFFFFFFFL:
+                    self.file_size = counts[idx]
+                    idx += 1
+
+                if self.compress_size == 0xFFFFFFFFL:
+                    self.compress_size = counts[idx]
+                    idx += 1
+
+                if self.header_offset == 0xFFFFFFFFL:
+                    self.header_offset = counts[idx]
+                    idx += 1
+
+            extra = extra[ln+4:]
+
+
+class _ZipDecrypter:
+    """Class to handle decryption of files stored within a ZIP archive.
+
+    ZIP supports a password-based form of encryption.  Even though known
+    plaintext attacks have been found against it, it is still useful
+    to be able to get data out of such a file.
+
+    Usage:
+        zd = _ZipDecrypter(mypwd)
+        plain_char = zd(cypher_char)
+        plain_text = map(zd, cypher_text)
+    """
+
+    def _GenerateCRCTable():
+        """Generate a CRC-32 table.
+
+        ZIP encryption uses the CRC32 one-byte primitive for scrambling some
+        internal keys. We noticed that a direct implementation is faster than
+        relying on binascii.crc32().
+        """
+        poly = 0xedb88320
+        table = [0] * 256
+        for i in range(256):
+            crc = i
+            for j in range(8):
+                if crc & 1:
+                    crc = ((crc >> 1) & 0x7FFFFFFF) ^ poly
+                else:
+                    crc = ((crc >> 1) & 0x7FFFFFFF)
+            table[i] = crc
+        return table
+    crctable = _GenerateCRCTable()
+
+    def _crc32(self, ch, crc):
+        """Compute the CRC32 primitive on one byte."""
+ return ((crc >> 8) & 0xffffff) ^ self.crctable[(crc ^ ord(ch)) & 0xff] + + def __init__(self, pwd): + self.key0 = 305419896 + self.key1 = 591751049 + self.key2 = 878082192 + for p in pwd: + self._UpdateKeys(p) + + def _UpdateKeys(self, c): + self.key0 = self._crc32(c, self.key0) + self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295 + self.key1 = (self.key1 * 134775813 + 1) & 4294967295 + self.key2 = self._crc32(chr((self.key1 >> 24) & 255), self.key2) + + def __call__(self, c): + """Decrypt a single character.""" + c = ord(c) + k = self.key2 | 2 + c = c ^ (((k * (k^1)) >> 8) & 255) + c = chr(c) + self._UpdateKeys(c) + return c + +class ZipExtFile: + """File-like object for reading an archive member. + Is returned by ZipFile.open(). + """ + + def __init__(self, fileobj, zipinfo, decrypt=None): + self.fileobj = fileobj + self.decrypter = decrypt + self.bytes_read = 0L + self.rawbuffer = '' + self.readbuffer = '' + self.linebuffer = '' + self.eof = False + self.univ_newlines = False + self.nlSeps = ("\n", ) + self.lastdiscard = '' + + self.compress_type = zipinfo.compress_type + self.compress_size = zipinfo.compress_size + + self.closed = False + self.mode = "r" + self.name = zipinfo.filename + + # read from compressed files in 64k blocks + self.compreadsize = 64*1024 + if self.compress_type == ZIP_DEFLATED: + self.dc = zlib.decompressobj(-15) + + def set_univ_newlines(self, univ_newlines): + self.univ_newlines = univ_newlines + + # pick line separator char(s) based on universal newlines flag + self.nlSeps = ("\n", ) + if self.univ_newlines: + self.nlSeps = ("\r\n", "\r", "\n") + + def __iter__(self): + return self + + def next(self): + nextline = self.readline() + if not nextline: + raise StopIteration() + + return nextline + + def close(self): + self.closed = True + + def _checkfornewline(self): + nl, nllen = -1, -1 + if self.linebuffer: + # ugly check for cases where half of an \r\n pair was + # read on the last pass, and the \r was discarded. In this + # case we just throw away the \n at the start of the buffer. + if (self.lastdiscard, self.linebuffer[0]) == ('\r','\n'): + self.linebuffer = self.linebuffer[1:] + + for sep in self.nlSeps: + nl = self.linebuffer.find(sep) + if nl >= 0: + nllen = len(sep) + return nl, nllen + + return nl, nllen + + def readline(self, size = -1): + """Read a line with approx. size. If size is negative, + read a whole line. + """ + if size < 0: + size = sys.maxint + elif size == 0: + return '' + + # check for a newline already in buffer + nl, nllen = self._checkfornewline() + + if nl >= 0: + # the next line was already in the buffer + nl = min(nl, size) + else: + # no line break in buffer - try to read more + size -= len(self.linebuffer) + while nl < 0 and size > 0: + buf = self.read(min(size, 100)) + if not buf: + break + self.linebuffer += buf + size -= len(buf) + + # check for a newline in buffer + nl, nllen = self._checkfornewline() + + # we either ran out of bytes in the file, or + # met the specified size limit without finding a newline, + # so return current buffer + if nl < 0: + s = self.linebuffer + self.linebuffer = '' + return s + + buf = self.linebuffer[:nl] + self.lastdiscard = self.linebuffer[nl:nl + nllen] + self.linebuffer = self.linebuffer[nl + nllen:] + + # line is always returned with \n as newline char (except possibly + # for a final incomplete line in the file, which is handled above). + return buf + "\n" + + def readlines(self, sizehint = -1): + """Return a list with all (following) lines. 
The sizehint parameter + is ignored in this implementation. + """ + result = [] + while True: + line = self.readline() + if not line: break + result.append(line) + return result + + def read(self, size = None): + # act like file() obj and return empty string if size is 0 + if size == 0: + return '' + + # determine read size + bytesToRead = self.compress_size - self.bytes_read + + # adjust read size for encrypted files since the first 12 bytes + # are for the encryption/password information + if self.decrypter is not None: + bytesToRead -= 12 + + if size is not None and size >= 0: + if self.compress_type == ZIP_STORED: + lr = len(self.readbuffer) + bytesToRead = min(bytesToRead, size - lr) + elif self.compress_type == ZIP_DEFLATED: + if len(self.readbuffer) > size: + # the user has requested fewer bytes than we've already + # pulled through the decompressor; don't read any more + bytesToRead = 0 + else: + # user will use up the buffer, so read some more + lr = len(self.rawbuffer) + bytesToRead = min(bytesToRead, self.compreadsize - lr) + + # avoid reading past end of file contents + if bytesToRead + self.bytes_read > self.compress_size: + bytesToRead = self.compress_size - self.bytes_read + + # try to read from file (if necessary) + if bytesToRead > 0: + bytes = self.fileobj.read(bytesToRead) + self.bytes_read += len(bytes) + self.rawbuffer += bytes + + # handle contents of raw buffer + if self.rawbuffer: + newdata = self.rawbuffer + self.rawbuffer = '' + + # decrypt new data if we were given an object to handle that + if newdata and self.decrypter is not None: + newdata = ''.join(map(self.decrypter, newdata)) + + # decompress newly read data if necessary + if newdata and self.compress_type == ZIP_DEFLATED: + newdata = self.dc.decompress(newdata) + self.rawbuffer = self.dc.unconsumed_tail + if self.eof and len(self.rawbuffer) == 0: + # we're out of raw bytes (both from the file and + # the local buffer); flush just to make sure the + # decompressor is done + newdata += self.dc.flush() + # prevent decompressor from being used again + self.dc = None + + self.readbuffer += newdata + + + # return what the user asked for + if size is None or len(self.readbuffer) <= size: + bytes = self.readbuffer + self.readbuffer = '' + else: + bytes = self.readbuffer[:size] + self.readbuffer = self.readbuffer[size:] + + return bytes + + +class ZipFile: + """ Class with methods to open, read, write, close, list zip files. + + z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=True) + + @var file: Either the path to the file, or a file-like object. + If it is a path, the file will be opened and closed by ZipFile. + @var mode: The mode can be either read "r", write "w" or append "a". + @var compression: ZIP_STORED (no compression) or ZIP_DEFLATED (requires zlib). + @var allowZip64: if True ZipFile will create files with ZIP64 extensions when + needed, otherwise it will raise an exception when this would + be necessary. 
+ + """ + + fp = None # Set here since __del__ checks it + + def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=False): + """Open the ZIP file with mode read "r", write "w" or append "a".""" + if mode not in ("r", "w", "a"): + raise RuntimeError('ZipFile() requires mode "r", "w", or "a"') + + if compression == ZIP_STORED: + pass + elif compression == ZIP_DEFLATED: + if not zlib: + raise RuntimeError,\ + "Compression requires the (missing) zlib module" + else: + raise RuntimeError, "That compression method is not supported" + + self._allowZip64 = allowZip64 + self._didModify = False + self.debug = 0 # Level of printing: 0 through 3 + self.NameToInfo = {} # Find file info given name + self.filelist = [] # List of ZipInfo instances for archive + self.compression = compression # Method of compression + self.mode = key = mode.replace('b', '')[0] + self.pwd = None + + # Check if we were passed a file-like object + if isinstance(file, basestring): + self._filePassed = 0 + self.filename = file + modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'} + try: + self.fp = open(file, modeDict[mode]) + except IOError: + if mode == 'a': + mode = key = 'w' + self.fp = open(file, modeDict[mode]) + else: + raise + else: + self._filePassed = 1 + self.fp = file + self.filename = getattr(file, 'name', None) + + if key == 'r': + self._GetContents() + elif key == 'w': + pass + elif key == 'a': + try: # See if file is a zip file + self._RealGetContents() + # seek to start of directory and overwrite + self.fp.seek(self.start_dir, 0) + except BadZipfile: # file is not a zip file, just append + self.fp.seek(0, 2) + else: + if not self._filePassed: + self.fp.close() + self.fp = None + raise RuntimeError, 'Mode must be "r", "w" or "a"' + + def _GetContents(self): + """Read the directory, making sure we close the file if the format + is bad.""" + try: + self._RealGetContents() + except BadZipfile: + if not self._filePassed: + self.fp.close() + self.fp = None + raise + + def _RealGetContents(self): + """Read in the table of contents for the ZIP file.""" + fp = self.fp + endrec = _EndRecData(fp) + if not endrec: + raise BadZipfile, "File is not a zip file" + if self.debug > 1: + print endrec + size_cd = endrec[5] # bytes in central directory + offset_cd = endrec[6] # offset of central directory + self.comment = endrec[8] # archive comment + # endrec[9] is the offset of the "End of Central Dir" record + if endrec[9] > ZIP64_LIMIT: + x = endrec[9] - size_cd - 56 - 20 + else: + x = endrec[9] - size_cd + # "concat" is zero, unless zip was concatenated to another file + concat = x - offset_cd + if self.debug > 2: + print "given, inferred, offset", offset_cd, x, concat + # self.start_dir: Position of start of central directory + self.start_dir = offset_cd + concat + fp.seek(self.start_dir, 0) + data = fp.read(size_cd) + fp = cStringIO.StringIO(data) + total = 0 + while total < size_cd: + centdir = fp.read(46) + total = total + 46 + if centdir[0:4] != stringCentralDir: + raise BadZipfile, "Bad magic number for central directory" + centdir = struct.unpack(structCentralDir, centdir) + if self.debug > 2: + print centdir + filename = fp.read(centdir[_CD_FILENAME_LENGTH]) + # Create ZipInfo instance to store file information + x = ZipInfo(filename) + x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) + x.comment = fp.read(centdir[_CD_COMMENT_LENGTH]) + total = (total + centdir[_CD_FILENAME_LENGTH] + + centdir[_CD_EXTRA_FIELD_LENGTH] + + centdir[_CD_COMMENT_LENGTH]) + x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] + 
(x.create_version, x.create_system, x.extract_version, x.reserved, + x.flag_bits, x.compress_type, t, d, + x.CRC, x.compress_size, x.file_size) = centdir[1:12] + x.volume, x.internal_attr, x.external_attr = centdir[15:18] + # Convert date/time code to (year, month, day, hour, min, sec) + x._raw_time = t + x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, + t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) + + x._decodeExtra() + x.header_offset = x.header_offset + concat + self.filelist.append(x) + self.NameToInfo[x.filename] = x + if self.debug > 2: + print "total", total + + + def namelist(self): + """Return a list of file names in the archive.""" + l = [] + for data in self.filelist: + l.append(data.filename) + return l + + def infolist(self): + """Return a list of class ZipInfo instances for files in the + archive.""" + return self.filelist + + def printdir(self): + """Print a table of contents for the zip file.""" + print "%-46s %19s %12s" % ("File Name", "Modified ", "Size") + for zinfo in self.filelist: + date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6] + print "%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size) + + def testzip(self): + """Read all the files and check the CRC.""" + for zinfo in self.filelist: + try: + self.read(zinfo.filename) # Check CRC-32 + except BadZipfile: + return zinfo.filename + + + def getinfo(self, name): + """Return the instance of ZipInfo given 'name'.""" + info = self.NameToInfo.get(name) + if info is None: + raise KeyError( + 'There is no item named %r in the archive' % name) + + return info + + def setpassword(self, pwd): + """Set default password for encrypted files.""" + self.pwd = pwd + + def read(self, name, pwd=None): + """Return file bytes (as a string) for name.""" + return self.open(name, "r", pwd).read() + + def open(self, name, mode="r", pwd=None): + """Return file-like object for 'name'.""" + if mode not in ("r", "U", "rU"): + raise RuntimeError, 'open() requires mode "r", "U", or "rU"' + if not self.fp: + raise RuntimeError, \ + "Attempt to read ZIP archive that was already closed" + + # Only open a new file for instances where we were not + # given a file object in the constructor + if self._filePassed: + zef_file = self.fp + else: + zef_file = open(self.filename, 'rb') + + # Get info object for name + zinfo = self.getinfo(name) + + filepos = zef_file.tell() + + zef_file.seek(zinfo.header_offset, 0) + + # Skip the file header: + fheader = zef_file.read(30) + if fheader[0:4] != stringFileHeader: + raise BadZipfile, "Bad magic number for file header" + + fheader = struct.unpack(structFileHeader, fheader) + fname = zef_file.read(fheader[_FH_FILENAME_LENGTH]) + if fheader[_FH_EXTRA_FIELD_LENGTH]: + zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) + + if fname != zinfo.orig_filename: + raise BadZipfile, \ + 'File name in directory "%s" and header "%s" differ.' % ( + zinfo.orig_filename, fname) + + # check for encrypted flag & handle password + is_encrypted = zinfo.flag_bits & 0x1 + zd = None + if is_encrypted: + if not pwd: + pwd = self.pwd + if not pwd: + raise RuntimeError, "File %s is encrypted, " \ + "password required for extraction" % name + + zd = _ZipDecrypter(pwd) + # The first 12 bytes in the cypher stream is an encryption header + # used to strengthen the algorithm. The first 11 bytes are + # completely random, while the 12th contains the MSB of the CRC, + # or the MSB of the file time depending on the header type + # and is used to check the correctness of the password. 
+ bytes = zef_file.read(12) + h = map(zd, bytes[0:12]) + if zinfo.flag_bits & 0x8: + # compare against the file type from extended local headers + check_byte = (zinfo._raw_time >> 8) & 0xff + else: + # compare against the CRC otherwise + check_byte = (zinfo.CRC >> 24) & 0xff + if ord(h[11]) != check_byte: + raise RuntimeError("Bad password for file", name) + + # build and return a ZipExtFile + if zd is None: + zef = ZipExtFile(zef_file, zinfo) + else: + zef = ZipExtFile(zef_file, zinfo, zd) + + # set universal newlines on ZipExtFile if necessary + if "U" in mode: + zef.set_univ_newlines(True) + return zef + + def extract(self, member, path=None, pwd=None): + """Extract a member from the archive to the current working directory, + using its full name. Its file information is extracted as accurately + as possible. `member' may be a filename or a ZipInfo object. You can + specify a different directory using `path'. + """ + if not isinstance(member, ZipInfo): + member = self.getinfo(member) + + if path is None: + path = os.getcwd() + + return self._extract_member(member, path, pwd) + + def extractall(self, path=None, members=None, pwd=None): + """Extract all members from the archive to the current working + directory. `path' specifies a different directory to extract to. + `members' is optional and must be a subset of the list returned + by namelist(). + """ + if members is None: + members = self.namelist() + + for zipinfo in members: + self.extract(zipinfo, path, pwd) + + def _extract_member(self, member, targetpath, pwd): + """Extract the ZipInfo object 'member' to a physical + file on the path targetpath. + """ + # build the destination pathname, replacing + # forward slashes to platform specific separators. + if targetpath[-1:] == "/": + targetpath = targetpath[:-1] + + # don't include leading "/" from file name if present + if os.path.isabs(member.filename): + targetpath = os.path.join(targetpath, member.filename[1:]) + else: + targetpath = os.path.join(targetpath, member.filename) + + targetpath = os.path.normpath(targetpath) + + # Create all upper directories if necessary. 
+        upperdirs = os.path.dirname(targetpath)
+        if upperdirs and not os.path.exists(upperdirs):
+            os.makedirs(upperdirs)
+
+        source = self.open(member.filename, pwd=pwd)
+        target = file(targetpath, "wb")
+        shutil.copyfileobj(source, target)
+        source.close()
+        target.close()
+
+        return targetpath
+
+    def _writecheck(self, zinfo):
+        """Check for errors before writing a file to the archive."""
+        if zinfo.filename in self.NameToInfo:
+            if self.debug:      # Warning for duplicate names
+                print "Duplicate name:", zinfo.filename
+        if self.mode not in ("w", "a"):
+            raise RuntimeError, 'write() requires mode "w" or "a"'
+        if not self.fp:
+            raise RuntimeError, \
+                  "Attempt to write ZIP archive that was already closed"
+        if zinfo.compress_type == ZIP_DEFLATED and not zlib:
+            raise RuntimeError, \
+                  "Compression requires the (missing) zlib module"
+        if zinfo.compress_type not in (ZIP_STORED, ZIP_DEFLATED):
+            raise RuntimeError, \
+                  "That compression method is not supported"
+        if zinfo.file_size > ZIP64_LIMIT:
+            if not self._allowZip64:
+                raise LargeZipFile("Filesize would require ZIP64 extensions")
+        if zinfo.header_offset > ZIP64_LIMIT:
+            if not self._allowZip64:
+                raise LargeZipFile("Zipfile size would require ZIP64 extensions")
+
+    def write(self, filename, arcname=None, compress_type=None):
+        """Put the bytes from filename into the archive under the name
+        arcname."""
+        if not self.fp:
+            raise RuntimeError(
+                  "Attempt to write to ZIP archive that was already closed")
+
+        st = os.stat(filename)
+        mtime = time.localtime(st.st_mtime)
+        date_time = mtime[0:6]
+        # Create ZipInfo instance to store file information
+        if arcname is None:
+            arcname = filename
+        arcname = os.path.normpath(os.path.splitdrive(arcname)[1])
+        while arcname[0] in (os.sep, os.altsep):
+            arcname = arcname[1:]
+        zinfo = ZipInfo(arcname, date_time)
+        zinfo.external_attr = (st[0] & 0xFFFF) << 16L      # Unix attributes
+        if compress_type is None:
+            zinfo.compress_type = self.compression
+        else:
+            zinfo.compress_type = compress_type
+
+        zinfo.file_size = st.st_size
+        zinfo.flag_bits = 0x00
+        zinfo.header_offset = self.fp.tell()    # Start of header bytes
+
+        self._writecheck(zinfo)
+        self._didModify = True
+        fp = open(filename, "rb")
+        # Must overwrite CRC and sizes with correct data later
+        zinfo.CRC = CRC = 0
+        zinfo.compress_size = compress_size = 0
+        zinfo.file_size = file_size = 0
+        self.fp.write(zinfo.FileHeader())
+        if zinfo.compress_type == ZIP_DEFLATED:
+            cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
+                 zlib.DEFLATED, -15)
+        else:
+            cmpr = None
+        while 1:
+            buf = fp.read(1024 * 8)
+            if not buf:
+                break
+            file_size = file_size + len(buf)
+            CRC = crc32(buf, CRC) & 0xffffffff
+            if cmpr:
+                buf = cmpr.compress(buf)
+                compress_size = compress_size + len(buf)
+            self.fp.write(buf)
+        fp.close()
+        if cmpr:
+            buf = cmpr.flush()
+            compress_size = compress_size + len(buf)
+            self.fp.write(buf)
+            zinfo.compress_size = compress_size
+        else:
+            zinfo.compress_size = file_size
+        zinfo.CRC = CRC
+        zinfo.file_size = file_size
+        # Seek backwards and write CRC and file sizes
+        position = self.fp.tell()       # Preserve current position in file
+        self.fp.seek(zinfo.header_offset + 14, 0)
+        self.fp.write(struct.pack("<lLL", zinfo.CRC, zinfo.compress_size,
+              zinfo.file_size))
+        self.fp.seek(position, 0)
+        self.filelist.append(zinfo)
+        self.NameToInfo[zinfo.filename] = zinfo
+
+    def writestr(self, zinfo_or_arcname, bytes):
+        """Write a file into the archive.  The contents is the string
+        'bytes'.  'zinfo_or_arcname' is either a ZipInfo instance or
+        the name of the file in the archive."""
+        if not isinstance(zinfo_or_arcname, ZipInfo):
+            zinfo = ZipInfo(filename=zinfo_or_arcname,
+                            date_time=time.localtime(time.time())[:6])
+            zinfo.compress_type = self.compression
+        else:
+            zinfo = zinfo_or_arcname
+
+        if not self.fp:
+            raise RuntimeError(
+                  "Attempt to write to ZIP archive that was already closed")
+
+        zinfo.file_size = len(bytes)            # Uncompressed size
+        zinfo.header_offset = self.fp.tell()    # Start of header bytes
+        self._writecheck(zinfo)
+        self._didModify = True
+        zinfo.CRC = crc32(bytes) & 0xffffffff   # CRC-32 checksum
+        if zinfo.compress_type == ZIP_DEFLATED:
+            co = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
+                 zlib.DEFLATED, -15)
+            bytes = co.compress(bytes) + co.flush()
+            zinfo.compress_size = len(bytes)    # Compressed size
+        else:
+            zinfo.compress_size = zinfo.file_size
+        self.fp.write(zinfo.FileHeader())
+        self.fp.write(bytes)
+        self.fp.flush()
+        if zinfo.flag_bits & 0x08:
+            # Write CRC and file sizes after the file data
+            self.fp.write(struct.pack("<lLL", zinfo.CRC, zinfo.compress_size,
+                  zinfo.file_size))
+        self.filelist.append(zinfo)
+        self.NameToInfo[zinfo.filename] = zinfo
+
+    def __del__(self):
+        """Call the "close()" method in case the user forgot."""
+        self.close()
+
+    def close(self):
+        """Close the file, and for mode "w" and "a" write the ending
+        records."""
+        if self.fp is None:
+            return
+
+        if self.mode in ("w", "a") and self._didModify: # write ending records
+            count = 0
+            pos1 = self.fp.tell()
+            for zinfo in self.filelist:         # write central directory
+                count = count + 1
+                dt = zinfo.date_time
+                dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
+                dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
+                extra = []
+                if zinfo.file_size > ZIP64_LIMIT \
+                        or zinfo.compress_size > ZIP64_LIMIT:
+                    extra.append(zinfo.file_size)
+                    extra.append(zinfo.compress_size)
+                    file_size = 0xffffffff   #-1
+                    compress_size = 0xffffffff  #-1
+                else:
+                    file_size = zinfo.file_size
+                    compress_size = zinfo.compress_size
+
+                if zinfo.header_offset > ZIP64_LIMIT:
+                    extra.append(zinfo.header_offset)
+                    header_offset = 0xffffffffL  # -1 32 bit
+                else:
+                    header_offset = zinfo.header_offset
+
+                extra_data = zinfo.extra
+                if extra:
+                    # Append a ZIP64 field to the extra's
+                    extra_data = struct.pack(
+                            '<hh' + 'q'*len(extra),
+                            1, 8*len(extra), *extra) + extra_data
+
+                    extract_version = max(45, zinfo.extract_version)
+                    create_version = max(45, zinfo.create_version)
+                else:
+                    extract_version = zinfo.extract_version
+                    create_version = zinfo.create_version
+
+                try:
+                    centdir = struct.pack(structCentralDir,
+                     stringCentralDir, create_version,
+                     zinfo.create_system, extract_version, zinfo.reserved,
+                     zinfo.flag_bits, zinfo.compress_type, dostime, dosdate,
+                     zinfo.CRC, compress_size, file_size,
+                     len(zinfo.filename), len(extra_data), len(zinfo.comment),
+                     0, zinfo.internal_attr, zinfo.external_attr,
+                     header_offset)
+                except DeprecationWarning:
+                    print >>sys.stderr, (structCentralDir,
+                     stringCentralDir, create_version,
+                     zinfo.create_system, extract_version, zinfo.reserved,
+                     zinfo.flag_bits, zinfo.compress_type, dostime, dosdate,
+                     zinfo.CRC, compress_size, file_size,
+                     len(zinfo.filename), len(extra_data), len(zinfo.comment),
+                     0, zinfo.internal_attr, zinfo.external_attr,
+                     header_offset)
+                    raise
+                self.fp.write(centdir)
+                self.fp.write(zinfo.filename)
+                self.fp.write(extra_data)
+                self.fp.write(zinfo.comment)
+
+            pos2 = self.fp.tell()
+            # Write end-of-zip-archive record
+            if pos1 > ZIP64_LIMIT:
+                # Need to write the ZIP64 end-of-archive records
+                zip64endrec = struct.pack(
+                        structEndArchive64, stringEndArchive64,
+                        44, 45, 45, 0, 0, count, count, pos2 - pos1, pos1)
+                self.fp.write(zip64endrec)
+
+                zip64locrec = struct.pack(
+                        structEndArchive64Locator,
+                        stringEndArchive64Locator, 0, pos2, 1)
+                self.fp.write(zip64locrec)
+
+                endrec = struct.pack(structEndArchive, stringEndArchive,
+                            0, 0, count, count, pos2 - pos1, 0xffffffffL, 0)
+                self.fp.write(endrec)
+
+            else:
+                endrec = struct.pack(structEndArchive, stringEndArchive,
+                         0, 0, count, count, pos2 - pos1, pos1, 0)
+                self.fp.write(endrec)
+            self.fp.flush()
+        if not self._filePassed:
+            self.fp.close()
+        self.fp = None
+
+
+class PyZipFile(ZipFile):
+    """Class to create ZIP archives with Python library files and packages."""
+
+    def writepy(self, pathname, basename = ""):
+        """Add all files from "pathname" to the ZIP archive.
+
+        If pathname is a package directory, search the directory and
+        all package subdirectories recursively for all *.py and enter
+        the modules into the archive.  If pathname is a plain
+        directory, listdir *.py and enter all modules.  Else, pathname
+        must be a Python *.py file and the module will be put into the
+        archive.  Added modules are always module.pyo or module.pyc.
+        This method will compile the module.py into module.pyc if
+        necessary.
+ """ + dir, name = os.path.split(pathname) + if os.path.isdir(pathname): + initname = os.path.join(pathname, "__init__.py") + if os.path.isfile(initname): + # This is a package directory, add it + if basename: + basename = "%s/%s" % (basename, name) + else: + basename = name + if self.debug: + print "Adding package in", pathname, "as", basename + fname, arcname = self._get_codename(initname[0:-3], basename) + if self.debug: + print "Adding", arcname + self.write(fname, arcname) + dirlist = os.listdir(pathname) + dirlist.remove("__init__.py") + # Add all *.py files and package subdirectories + for filename in dirlist: + path = os.path.join(pathname, filename) + root, ext = os.path.splitext(filename) + if os.path.isdir(path): + if os.path.isfile(os.path.join(path, "__init__.py")): + # This is a package directory, add it + self.writepy(path, basename) # Recursive call + elif ext == ".py": + fname, arcname = self._get_codename(path[0:-3], + basename) + if self.debug: + print "Adding", arcname + self.write(fname, arcname) + else: + # This is NOT a package directory, add its files at top level + if self.debug: + print "Adding files from directory", pathname + for filename in os.listdir(pathname): + path = os.path.join(pathname, filename) + root, ext = os.path.splitext(filename) + if ext == ".py": + fname, arcname = self._get_codename(path[0:-3], + basename) + if self.debug: + print "Adding", arcname + self.write(fname, arcname) + else: + if pathname[-3:] != ".py": + raise RuntimeError, \ + 'Files added with writepy() must end with ".py"' + fname, arcname = self._get_codename(pathname[0:-3], basename) + if self.debug: + print "Adding file", arcname + self.write(fname, arcname) + + def _get_codename(self, pathname, basename): + """Return (filename, archivename) for the path. + + Given a module name path, return the correct file path and + archive name, compiling if necessary. For example, given + /python/lib/string, return (/python/lib/string.pyc, string). + """ + file_py = pathname + ".py" + file_pyc = pathname + ".pyc" + file_pyo = pathname + ".pyo" + if os.path.isfile(file_pyo) and \ + os.stat(file_pyo).st_mtime >= os.stat(file_py).st_mtime: + fname = file_pyo # Use .pyo file + elif not os.path.isfile(file_pyc) or \ + os.stat(file_pyc).st_mtime < os.stat(file_py).st_mtime: + import py_compile + if self.debug: + print "Compiling", file_py + try: + py_compile.compile(file_py, file_pyc, None, True) + except py_compile.PyCompileError,err: + print err.msg + fname = file_pyc + else: + fname = file_pyc + archivename = os.path.split(fname)[1] + if basename: + archivename = "%s/%s" % (basename, archivename) + return (fname, archivename) + + +def main(args = None): + import textwrap + USAGE=textwrap.dedent("""\ + Usage: + zipfile.py -l zipfile.zip # Show listing of a zipfile + zipfile.py -t zipfile.zip # Test if a zipfile is valid + zipfile.py -e zipfile.zip target # Extract zipfile into target dir + zipfile.py -c zipfile.zip src ... 
# Create zipfile from sources + """) + if args is None: + args = sys.argv[1:] + + if not args or args[0] not in ('-l', '-c', '-e', '-t'): + print USAGE + sys.exit(1) + + if args[0] == '-l': + if len(args) != 2: + print USAGE + sys.exit(1) + zf = ZipFile(args[1], 'r') + zf.printdir() + zf.close() + + elif args[0] == '-t': + if len(args) != 2: + print USAGE + sys.exit(1) + zf = ZipFile(args[1], 'r') + zf.testzip() + print "Done testing" + + elif args[0] == '-e': + if len(args) != 3: + print USAGE + sys.exit(1) + + zf = ZipFile(args[1], 'r') + out = args[2] + for path in zf.namelist(): + if path.startswith('./'): + tgt = os.path.join(out, path[2:]) + else: + tgt = os.path.join(out, path) + + tgtdir = os.path.dirname(tgt) + if not os.path.exists(tgtdir): + os.makedirs(tgtdir) + fp = open(tgt, 'wb') + fp.write(zf.read(path)) + fp.close() + zf.close() + + elif args[0] == '-c': + if len(args) < 3: + print USAGE + sys.exit(1) + + def addToZip(zf, path, zippath): + if os.path.isfile(path): + zf.write(path, zippath, ZIP_DEFLATED) + elif os.path.isdir(path): + for nm in os.listdir(path): + addToZip(zf, + os.path.join(path, nm), os.path.join(zippath, nm)) + # else: ignore + + zf = ZipFile(args[1], 'w', allowZip64=True) + for src in args[2:]: + addToZip(zf, src, os.path.basename(src)) + + zf.close() + +if __name__ == "__main__": + main() diff --git a/obitools/zipfile.pyc b/obitools/zipfile.pyc new file mode 100644 index 0000000..35dace0 Binary files /dev/null and b/obitools/zipfile.pyc differ
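For orientation, a minimal sketch of the reading API defined by this module (editorial; the archive name is hypothetical):

    from obitools.zipfile import ZipFile, is_zipfile

    if is_zipfile('data.zip'):
        zf = ZipFile('data.zip', 'r')
        for name in zf.namelist():
            member = zf.open(name, 'rU')    # 'U' enables universal newlines
            first_line = member.readline()
        zf.close()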