git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/branches/refactoring@83 60f365c0-8329-0410-b2a4-ec073aeeaa1d

2007-06-29 07:14:31 +00:00
parent 2e7e0f625f
commit 212c1fbb7f
1 changed files with 786 additions and 0 deletions
--- a/tools/ecoSort.py
+++ b/tools/ecoSort.py
@ -0,0 +1,786 @@
+#!/usr/bin/env python
+
+import struct
+import sys
+import os
+import gzip
+import re
+
+from reportlab.graphics.shapes import *
+from reportlab.graphics.charts.barcharts import VerticalBarChart
+from reportlab.graphics.charts.piecharts import Pie
+from reportlab.graphics import renderPDF
+
+#####
+#
+# Generic file function
+#
+#####
+
+class Filter(object):
+    """
+       This object provides different iterators and method :
+        * findTaxonByTaxid
+        * subTreeIterator
+        * parentalTreeIterator
+        * ecoPCRResultIterator
+        * rankFilter
+        * lastCommonTaxon
+        
+        see each method for more informations
+    """
+    
+    def __init__(self,path):
+        self._path = path
+        self._taxonFile =  "%s.tdx" % self._path
+        self._ranksFile =  "%s.rdx" % self._path
+        self._namesFile =  "%s.ndx" % self._path
+        self._taxonomy, self._index, self._ranks, self._name = self.__readNodeTable()
+
+
+    def __universalOpen(self,file):
+        if isinstance(file,str):
+            if file[-3:] == '.gz':
+                rep = gzip.open(file)
+            else:
+                rep = open(file)
+        else:
+            rep = file
+        return rep
+
+    def __universalTell(self,file):
+        if isinstance(file, gzip.GzipFile):
+            file=file.myfileobj
+        return file.tell()
+    
+    def __fileSize(self,file):
+        if isinstance(file, gzip.GzipFile):
+            file=file.myfileobj
+        pos = file.tell()
+        file.seek(0,2)
+        length = file.tell()
+        file.seek(pos,0)
+        return length
+    
+    def __progressBar(self,pos,max,reset=False,delta=[]):
+        if reset:
+            del delta[:]
+        if not delta:
+            delta.append(time.time())
+            delta.append(time.time())
+    
+        delta[1]=time.time()
+        elapsed = delta[1]-delta[0]
+        percent = float(pos)/max * 100
+        remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent)))
+        bar = '#' * int(percent/2)
+        bar+= '|/-\\-'[pos % 5]
+        bar+= ' ' * (50 - int(percent/2))
+        sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain))
+
+
+
+
+    #####
+    #
+    # Iterator functions
+    #
+    #####
+    
+    
+    
+    def __ecoRecordIterator(self,file):
+        file = self.__universalOpen(file)
+        (recordCount,) = struct.unpack('> I',file.read(4))
+    
+        for i in xrange(recordCount):
+            (recordSize,)=struct.unpack('>I',file.read(4))
+            record = file.read(recordSize)
+            yield record
+    
+               
+    def __ecoNameIterator(self):
+        for record in self.__ecoRecordIterator(self._namesFile):
+            lrecord = len(record)
+            lnames  = lrecord - 16
+            (isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record)
+            name=names[:namelength]
+            classname=names[namelength:]
+            yield (name,classname,indextaxid)
+    
+    
+    def __ecoTaxonomicIterator(self):
+        for record in self.__ecoRecordIterator(self._taxonFile):
+            lrecord = len(record)
+            lnames  = lrecord - 16
+            (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
+            yield  (taxid,rankid,parentidx,name)
+    
+    
+    def __ecoSequenceIterator(self,file):
+        for record in self.__ecoRecordIterator(file):
+            lrecord = len(record)
+            lnames  = lrecord - (4*4+20)
+            (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
+            de = string[:deflength]
+            seq = gzip.zlib.decompress(string[deflength:])
+            yield  (taxid,seqid,deflength,seqlength,cptseqlength,de,seq)
+    
+            
+    def __ecoRankIterator(self):
+        for record in self.__ecoRecordIterator(self._ranksFile):
+            yield  record
+    
+    
+    #####
+    #
+    # Indexes
+    #
+    #####
+    
+    def __ecoNameIndex(self):
+        indexName = [x for x in self.__ecoNameIterator()]
+        return indexName
+
+    def __ecoRankIndex(self):
+        rank = [r for r in self.__ecoRankIterator()]
+        return rank
+
+    def __ecoTaxonomyIndex(self):
+        taxonomy = []
+        index = {}
+        i = 0;
+        for x in self.__ecoTaxonomicIterator():
+            taxonomy.append(x)
+            index[x[0]] = i 
+            i = i + 1
+        return taxonomy, index
+
+    def __readNodeTable(self):
+        taxonomy, index = self.__ecoTaxonomyIndex()
+        ranks = self.__ecoRankIndex()
+        name = self.__ecoNameIndex()
+        return taxonomy,index,ranks,name
+
+
+    #####
+    #
+    # PUBLIC METHODS
+    #
+    #####
+
+    def findTaxonByTaxid(self,taxid):
+        """
+            Returns a list containing [taxid,rankid,parent_index,nameLength,name]
+            It takes one argument : a taxonomic id
+        """
+        return self._taxonomy[self._index[taxid]]
+
+
+    def subTreeIterator(self, taxid):
+        """
+            Returns subtree for given taxid from first child
+            to last child. It takes one argument : a taxonomic id
+        """
+        idx = self._index[taxid]
+        yield self._taxonomy[idx]
+        for t in self._taxonomy:
+            if t[2] == idx:
+                for subt in self.subTreeIterator(t[0]):
+                    yield subt
+    
+    
+    def parentalTreeIterator(self, taxid):
+        """
+           return parental tree for given taxonomic id starting from
+           first ancester to the root.
+        """
+        taxon=self.findTaxonByTaxid(taxid)
+        while taxon[2]!= 0: 
+            yield taxon
+            taxon = self._taxonomy[taxon[2]]
+        yield self._taxonomy[0]    
+    
+    
+    def ecoPCRResultIterator(self, file):
+        """
+           iteration on ecoPCR result file"
+           It returns a dictionnary
+        """
+        file = self.__universalOpen(file)
+        data = ColumnFile(file,
+                          sep='|',
+                          types=(str,int,int,
+                                 str,int,str,
+                                 int,str,int,
+                                 str,int,str,
+                                 str,str,int,
+                                 str,int,int,
+                                 str,str),skip='#')
+     
+        
+        for ac, sq_len, taxid,\
+            rank, sp_taxid, species,\
+            ge_taxid, genus, fa_taxid,\
+            family, sk_taxid, s_kgdom,\
+            strand, oligo_1, error_1,\
+            oligo_2, error_2, amp_len,\
+            sq_des, definition in data:
+            
+            yield {'ac':ac, 'sq_len':sq_len, 'taxid':taxid,
+                   'rank':rank, 'sp_taxid':sp_taxid, 'species':species,
+                   'ge_taxid':ge_taxid, 'genus':genus, 'fa_taxid':fa_taxid,
+                   'family':family, 'sk_taxid':sk_taxid, 's_kgdom':s_kgdom,
+                   'strand':strand, 'oligo_1':oligo_1, 'error_1':error_1,
+                   'oligo_2':oligo_2, 'error_2':error_2, 'amp_len':amp_len,
+                   'sq_des':sq_des, 'definition':definition}
+    
+    def rankFilter(self,rankid,filter):
+        """
+            boolean telling whether rankid match filter
+            takes 2 arguments : 
+            1- rankid
+            2- filter (i.e genus)
+        """
+        return self._ranks[rankid] == filter
+
+
+    def lastCommonTaxon(self,taxid_1, taxid_2):
+        """
+            returns a last common parent for two given taxon.
+            It starts from the root and goes down the tree
+            until their parents diverge.
+            It takes 2 arguments :
+            1- taxid 1
+            2- taxid 2
+        """
+        t1 = [x[0] for x in self.parentalTreeIterator(taxid_1)]
+        t2 = [x[0] for x in self.parentalTreeIterator(taxid_2)]
+        t1.reverse()
+        t2.reverse()
+        count = t1 < t2 and len(t1) or len(t2)
+        for i in range(count):
+            if t1[i] != t2[i]:
+               return t1[i-1]
+        return t1[count-1]
+       
+    
+    
+    
+
+class ColumnFile(object):
+    """
+        cut an ecoPCR file into a list
+    """
+    def __init__(self,stream,sep=None,strip=True,types=None,skip=None):
+        if isinstance(stream,str):
+            self._stream = open(stream)
+        elif hasattr(stream,'next'):
+            self._stream = stream
+        else:
+            raise ValueError,'stream must be string or an iterator'
+        self._delimiter=sep
+        self._strip=strip
+        if types:
+            self._types=[x for x in types]
+            for i in xrange(len(self._types)):
+                if self._types[i] is bool:
+                    self._types[i]=ColumnFile.str2bool
+        else:
+            self._types=None
+        self._skip = skip
+        self._oligo = {}
+            
+    def str2bool(x):
+        return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
+                    
+    str2bool = staticmethod(str2bool)
+            
+        
+    def __iter__(self):
+        return self
+    
+    def next(self):
+        ligne = self._stream.next()
+        while ligne[0] == self._skip:
+            ligne = self._stream.next()
+        data = ligne.split(self._delimiter)
+        if self._strip or self._types:
+            data = [x.strip() for x in data]
+        if self._types:
+            it = self.endLessIterator(self._types)
+            data = [x[1](x[0]) for x in ((y,it.next()) for y in data)]
+        return data
+    
+    def endLessIterator(self,endedlist):
+        for x in endedlist:
+            yield x
+        while(1):
+            yield endedlist[-1]
+        
+    def getOligo(self,line):
+        if line[2:8] == 'direct':
+            r = re.compile('(?<=direct  strand oligo1 : )[A-Z]+(?= *)')
+            self._oligo['o1'] = r.findall(line)
+        if line[2:9] == 'reverse':
+            r = re.compile('(?<=reverse strand oligo2 : )[A-Z]+(?= *)')
+            self._oligo['o2'] = r.findall(line) 
+        return None
+            
+            
+
+
+###########
+#
+# DATA STRUCTURE
+#
+###########
+
+
+class Table(list):
+    """
+        Data object inheriting from list
+    """
+    def __init__(self, headers, types):
+        list.__init__(self)
+        self.headers = headers
+        self.types = types
+
+
+    def __setitem__ (self,key,value):
+        """
+            method overloaded to check data types
+        """
+        for e in range(len(value)):
+            value[e] = self.types[e](value[e])
+        list.__setitem__(self,key,value)
+
+
+
+###########
+#
+# GRAPH FUNCTIONS
+#
+###########
+
+
+def _makeHistogram(data,label,title,path):
+    
+    drawing = Drawing(350, 450)
+    
+    drawing.add(String(35,430,title,fontSize=10))
+    
+    bc = VerticalBarChart()
+    bc.x = 40
+    bc.y = 100
+    bc.height = 300
+    bc.width = 300
+    bc.bars.strokeWidth = 1
+    bc.barSpacing = 2
+    bc.barLabelFormat = '%d'
+    bc.barLabels.dy = -5
+    bc.barLabels.fontSize = 9 - (len(data[0])/8)
+    bc.data = data
+       
+    bc.valueAxis.valueMin = min(data[0])*0.7
+    bc.valueAxis.valueMax = max(data[0])
+    bc.valueAxis.valueStep = (max(data[0]) - min(data[0])) / 10
+    bc.valueAxis.strokeWidth = 1
+    
+    bc.categoryAxis.labels.boxAnchor = 'e'
+    bc.categoryAxis.labels.dx = -40
+    bc.categoryAxis.labels.dy = -50
+    bc.categoryAxis.labels.angle = 45
+    bc.categoryAxis.labels.width = 10
+    bc.categoryAxis.labels.height = 4
+    bc.categoryAxis.categoryNames = label
+    bc.categoryAxis.strokeWidth = 1
+    bc.categoryAxis.labels.fontSize = 9
+
+    
+    drawing.add(bc)
+        
+    renderPDF.drawToFile(drawing, path )
+    
+def _makePie(data, label, title, path):
+    
+    drawing = Drawing(400,400)
+    drawing.add(String(50,50,title,fontSize=10))
+    
+    pc = Pie()
+    pc.x = 100
+    pc.y = 100
+    pc.data = data
+    pc.labels = label
+    pc.labels.angle = 45
+    
+    drawing.add(pc)
+    
+    renderPDF.drawToFile(drawing,path)
+    
+    
+###########
+#
+# PARSE FUNCTIONS
+#
+###########
+
+def _parseOligoResult(filter,file,strand):
+    seq = {}
+    
+    if strand == 'direct':
+        key = 'oligo_1'
+    elif strand == 'reverse':
+        key = 'oligo_2'
+        
+    for s in filter.ecoPCRResultIterator(file):
+        o = s[key]
+        taxid = s['taxid']
+        if not seq.has_key(o):
+            seq[o] = [1,taxid]
+        else:
+            seq[o][0] = seq[o][0] + 1
+            seq[o][1] = filter.lastCommonTaxon(seq[o][1],taxid)
+    return seq
+
+    
+def _parseTaxonomyResult(table):
+    tax = {}
+    for l in table:
+        taxid = l[2]
+        scName = l[3]
+        count = l[1]
+        if not tax.has_key(taxid):
+            tax[taxid] = [1,scName,count]
+        else:
+            tax[taxid][0] = tax[taxid][0] + 1
+            tax[taxid][2] = tax[taxid][2] + count
+    return tax
+
+
+def _sortTable(e1,e2):
+    e1 = e1[1]
+    e2 = e2[1]
+    if e1 < e2:
+        return 1
+    if e1 > e2:
+        return -1
+    return 0 
+
+
+def _countOligoMismatch(o1,o2):
+    """
+        define mismatch between two oligonucleotids
+        return number of mismatch
+    """ 
+    mmatch = 0
+    if len(o1) < len(o2):
+        mmatch = int(len(o2) - len(o1))
+    for i in range(len(o1)):
+        try:
+            if o1[i] != o2[i]:
+                mmatch = mmatch + 1
+        except:
+            mmatch = mmatch + 1    
+
+    return mmatch
+
+###########
+#
+# TOOLS FUNCTIONS
+#
+###########
+
+def customSort(table,x,y):
+    """
+        
+    """
+    x = x-1
+    y = y-1
+    h = (table.headers[x],table.headers[y])
+    t = (table.types[x],table.types[y])
+    cTable = Table(h,t)
+    
+    tmp = {}
+    
+    for l in table:
+        if tmp.has_key(l[x]):
+            tmp[l[x]] = tmp[l[x]] + l[y]
+        else:
+            tmp[l[x]] = l[y]
+    
+    for k,v in tmp.items():
+        cTable.append([k,v])
+    
+    return cTable
+    
+
+def countColumnOccurrence(table,x):
+    x = x-1
+    h = (table.headers[x],"count")
+    t = (table.types[x],int)
+    cTable = Table(h,t)
+    
+    tmp = {}
+    
+    for l in table:
+        if tmp.has_key(l[x]):
+            tmp[l[x]] = tmp[l[x]] + 1
+        else:
+            tmp[l[x]] = 1
+    
+    for k,v in tmp.items():
+        cTable.append([k,v])
+    
+    return cTable
+
+
+def buildSpecificityTable(table):
+    header =  ("mismatch","taxon","count")
+    type = (int,str,int)
+    speTable = Table(header,type)
+    
+    tmp = {}
+    for l in table:
+        if not tmp.has_key(l[5]):
+            tmp[l[5]] = {}
+        if not tmp[l[5]].has_key(l[3]):
+            tmp[l[5]][l[3]] = l[1]
+        else:
+            tmp[l[5]][l[3]] = tmp[l[5]][l[3]] + l[1]
+    
+    for mismatch in tmp.items():
+        for taxon,count in mismatch[1].items():
+            speTable.append([mismatch[0],taxon,count])     
+
+    return speTable
+   
+
+def buildOligoTable(table, file, filter, oligoRef, strand='direct'):
+    """
+        Fills and sorts a Table object with ecoPCR result file
+        
+        Takes 4 arguments
+        1- Table object
+        2- ecoPCR result file path
+        3- Filter Object
+        4- the oligo used in ecoPCR
+        5- the oligo type : direct or reverse
+        
+    """
+    seq = _parseOligoResult(filter, file, strand)
+    
+    i = 0
+    for oligo, info in seq.items():
+        table.append(0)
+        count, lctTaxid = info[0], info[1]
+        scName = filter.findTaxonByTaxid(info[1])[3]
+        rank = filter._ranks[filter.findTaxonByTaxid(info[1])[1]]
+        mismatch = _countOligoMismatch(oligoRef,oligo)
+        table[i]=[oligo,count,lctTaxid,scName,rank,mismatch]
+        i = i + 1
+        
+    table.sort(sortTable)
+
+ 
+def buildTaxonomicTable(table):
+    """
+        Fill a Table object with a taxonomic synthesis
+    """
+    taxHeaders = ("scName","numOfOligo","numOfAmpl","taxid")
+    taxTypes = (str, int, int, int)
+    taxTable = Table(taxHeaders, taxTypes)
+    
+    tax = _parseTaxonomyResult(table) 
+  
+    i = 0
+    for taxid, info in tax.items():
+        taxTable.append(0)
+        numOfOligo, scName, numOfAmpl = info[0], info[1], info[2]
+        taxTable[i]=[scName,numOfOligo,numOfAmpl,taxid]
+        i = i + 1 
+
+    taxTable.sort(sortTable)
+
+    return taxTable
+
+def _parseSequenceResult(filter,file,id):
+    sequences = {}
+    idIndex = {}
+    
+    if id == 'family':
+        key = 'fa_taxid'
+    elif id == 'genus':
+        key = 'ge_taxid'
+    else:
+        key = 'taxid'
+    
+    for s in filter.ecoPCRResultIterator(file):
+        seq = s['sq_des']
+        id = s[key]
+        if not idIndex.has_key(id):
+            idIndex[id] = []
+        if not sequences.has_key(seq):
+            sequences[seq] = [id]
+        else:
+            sequences[seq].append(id)
+    return sequences, idIndex
+
+
+def _sameValuesInList(array):
+    for i in range(len(array)-1):
+        if array[i] != array[i+1]:
+            return False
+    return True
+
+
+def buildSequenceTable(table,file,filter):
+    
+    sequences, idIndex = _parseSequenceResult(filter,file,'species')
+    
+    for s,id in sequences.items():
+        if len(id) == 1 or _sameValuesInList(id):
+            idIndex[id[0]].append(1)
+        else:
+            idIndex[id[0]].append(0)
+    
+   
+    for id,values in idIndex.items():
+        c = values.count(1)
+        if  c == 0:
+            idIndex[id] = -1
+        else:
+            idIndex[id] = len(values) == c and 1 or 0
+            
+    identified = {}
+    non_identified = {}
+    ambiguous = {}
+    for sequence...
+
+
+    
+
+###########
+#
+# OUTPUT FUNCTIONS
+#
+###########
+
+def printTable(table):
+    """
+        Displays the content a of Table object
+        Take 1 arguments
+        1- Table object
+    """
+
+    format = ("%20s | " * len(table.headers))[:-3]
+    print format % tuple([str(e) for e in table.headers ]) +"\n" + "-"*23*len(table.headers)
+    for l in table:
+        print format % tuple([str(e) for e in l ])
+    print "# %d results" % len(table)
+    
+        
+def printColumn(table,n):
+    """
+       Displays a column of a Table object
+       Takes 2 arguments
+       1- Table object
+       2- Number of the Column to display
+    """
+    n = n - 1
+    print "\t%s\n" % table.headers[n]
+    for l in table:
+        print "%s" % l[n]
+    
+        
+def saveAsCSV(table,path):
+    """
+        Creates a csv file from a Table object
+        Takes 2 arguments
+        1- Table object
+        2- path of the file-to-be    
+    """
+    file = open(path,'w')
+    file.write(','.join([str(e) for e in table.headers ]) + "\n")
+    for l in table:
+        file.write(','.join([str(e) for e in l ]) + "\n")
+    file.close()
+
+
+def grepTable(table,col,pattern):
+    """
+        Filters a Table object with regular expression
+        Takes 3 arguments :
+        1- Table object
+        2- number of column to match with
+        3- regular expression pattern
+        
+        Returns a Table object
+    """
+    col = col -1
+    p = re.compile(pattern, re.IGNORECASE)
+    out = Table(table.headers,table.types)
+    for l in table:
+        if p.search(l[col]):
+            out.append(l)
+    return out
+
+def drawGraph(table,x,y,path='graph.pdf',title='',treshold=10,graphType=1):
+    """
+        Creates an histogram as pdf file
+        Takes 5 arguments :
+        1- a Table object
+        2- number of column of Table object for y axis
+        3- number of column of Table object for x axis
+        4- path of the pdf-to-be
+        5- a title for the graph (optional)
+        6- the x first highest results (10 by default)
+        7- the graph type : 1 for histogram, 2 for pie
+    """
+    count, label = [], []
+    x = x-1
+    y = y-1
+    for l in table:
+        count.append(l[y])
+        label.append(str(l[x]))
+    if graphType == 1:
+        _makeHistogram([tuple(count[:treshold])], label[:treshold], title, path)
+    elif graphType == 2:
+        _makePie(count[:treshold], label[:treshold], title, path)
+
+
+
+###################### 
+
+ 
+def init():
+    file = "/Users/bessiere/Documents/workspace/ecoPCR/src/toto.tmp"
+    oligo = {'o1':'ATGTTTAAAA','o2':'ATGGGGGTATTG'}
+     
+    filter = Filter("/ecoPCRDB/gbmam")
+    
+#    headers = ("oligo", "Num", "LCT Taxid", "Sc Name", "Rank", "distance")
+#    types = (str, int, int, str, str, int)
+#    
+#    o1Table = Table(headers, types)
+#    o2Table = Table(headers, types)
+#    
+#    buildOligoTable(o1Table, file, filter, oligo['o1'], 'direct')
+#    buildOligoTable(o2Table, file, filter, oligo['o2'], 'reverse')
+#
+# 
+#    taxTable = buildTaxonomicTable(o1Table)
+#    speTable = buildSpecificityTable(o1Table)
+#   
+   
+    seqHeaders = ("sequence","taxid")
+    seqTypes = (str,list) 
+    seqTable = Table(seqHeaders,seqTypes)
+    
+    return buildSequenceTable(seqTable, file, filter)
+    
+    
+
+    return seqTable
+
+