#!/usr/bin/env python import struct import sys import os import gzip import re from reportlab.graphics.shapes import * from reportlab.graphics.charts.barcharts import VerticalBarChart from reportlab.graphics.charts.piecharts import Pie from reportlab.graphics import renderPDF ##### # # Generic file function # ##### class Filter(object): """ This object provides different iterators and method : * findTaxonByTaxid * subTreeIterator * parentalTreeIterator * ecoPCRResultIterator * rankFilter * lastCommonTaxon see each method for more informations """ def __init__(self,path): self._path = path self._taxonFile = "%s.tdx" % self._path self._ranksFile = "%s.rdx" % self._path self._namesFile = "%s.ndx" % self._path self._taxonomy, self._index, self._ranks, self._name = self.__readNodeTable() def __universalOpen(self,file): if isinstance(file,str): if file[-3:] == '.gz': rep = gzip.open(file) else: rep = open(file) else: rep = file return rep def __universalTell(self,file): if isinstance(file, gzip.GzipFile): file=file.myfileobj return file.tell() def __fileSize(self,file): if isinstance(file, gzip.GzipFile): file=file.myfileobj pos = file.tell() file.seek(0,2) length = file.tell() file.seek(pos,0) return length def __progressBar(self,pos,max,reset=False,delta=[]): if reset: del delta[:] if not delta: delta.append(time.time()) delta.append(time.time()) delta[1]=time.time() elapsed = delta[1]-delta[0] percent = float(pos)/max * 100 remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent))) bar = '#' * int(percent/2) bar+= '|/-\\-'[pos % 5] bar+= ' ' * (50 - int(percent/2)) sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain)) ##### # # Iterator functions # ##### def __ecoRecordIterator(self,file): file = self.__universalOpen(file) (recordCount,) = struct.unpack('> I',file.read(4)) for i in xrange(recordCount): (recordSize,)=struct.unpack('>I',file.read(4)) record = file.read(recordSize) yield record def __ecoNameIterator(self): for record in self.__ecoRecordIterator(self._namesFile): lrecord = len(record) lnames = lrecord - 16 (isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record) name=names[:namelength] classname=names[namelength:] yield (name,classname,indextaxid) def __ecoTaxonomicIterator(self): for record in self.__ecoRecordIterator(self._taxonFile): lrecord = len(record) lnames = lrecord - 16 (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record) yield (taxid,rankid,parentidx,name) def __ecoSequenceIterator(self,file): for record in self.__ecoRecordIterator(file): lrecord = len(record) lnames = lrecord - (4*4+20) (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record) de = string[:deflength] seq = gzip.zlib.decompress(string[deflength:]) yield (taxid,seqid,deflength,seqlength,cptseqlength,de,seq) def __ecoRankIterator(self): for record in self.__ecoRecordIterator(self._ranksFile): yield record ##### # # Indexes # ##### def __ecoNameIndex(self): indexName = [x for x in self.__ecoNameIterator()] return indexName def __ecoRankIndex(self): rank = [r for r in self.__ecoRankIterator()] return rank def __ecoTaxonomyIndex(self): taxonomy = [] index = {} i = 0; for x in self.__ecoTaxonomicIterator(): taxonomy.append(x) index[x[0]] = i i = i + 1 return taxonomy, index def __readNodeTable(self): taxonomy, index = self.__ecoTaxonomyIndex() ranks = self.__ecoRankIndex() name = self.__ecoNameIndex() return taxonomy,index,ranks,name ##### # # PUBLIC METHODS # ##### def findTaxonByTaxid(self,taxid): """ Returns a list containing [taxid,rankid,parent_index,nameLength,name] It takes one argument : a taxonomic id """ return self._taxonomy[self._index[taxid]] def subTreeIterator(self, taxid): """ Returns subtree for given taxid from first child to last child. It takes one argument : a taxonomic id """ idx = self._index[taxid] yield self._taxonomy[idx] for t in self._taxonomy: if t[2] == idx: for subt in self.subTreeIterator(t[0]): yield subt def parentalTreeIterator(self, taxid): """ return parental tree for given taxonomic id starting from first ancester to the root. """ taxon=self.findTaxonByTaxid(taxid) while taxon[2]!= 0: yield taxon taxon = self._taxonomy[taxon[2]] yield self._taxonomy[0] def ecoPCRResultIterator(self, file): """ iteration on ecoPCR result file" It returns a dictionnary """ file = self.__universalOpen(file) data = ColumnFile(file, sep='|', types=(str,int,int, str,int,str, int,str,int, str,int,str, str,str,int, str,int,int, str,str),skip='#') for ac, sq_len, taxid,\ rank, sp_taxid, species,\ ge_taxid, genus, fa_taxid,\ family, sk_taxid, s_kgdom,\ strand, oligo_1, error_1,\ oligo_2, error_2, amp_len,\ sq_des, definition in data: yield {'ac':ac, 'sq_len':sq_len, 'taxid':taxid, 'rank':rank, 'sp_taxid':sp_taxid, 'species':species, 'ge_taxid':ge_taxid, 'genus':genus, 'fa_taxid':fa_taxid, 'family':family, 'sk_taxid':sk_taxid, 's_kgdom':s_kgdom, 'strand':strand, 'oligo_1':oligo_1, 'error_1':error_1, 'oligo_2':oligo_2, 'error_2':error_2, 'amp_len':amp_len, 'sq_des':sq_des, 'definition':definition} def rankFilter(self,rankid,filter): """ boolean telling whether rankid match filter takes 2 arguments : 1- rankid 2- filter (i.e genus) """ return self._ranks[rankid] == filter def lastCommonTaxon(self,taxid_1, taxid_2): """ returns a last common parent for two given taxon. It starts from the root and goes down the tree until their parents diverge. It takes 2 arguments : 1- taxid 1 2- taxid 2 """ t1 = [x[0] for x in self.parentalTreeIterator(taxid_1)] t2 = [x[0] for x in self.parentalTreeIterator(taxid_2)] t1.reverse() t2.reverse() count = t1 < t2 and len(t1) or len(t2) for i in range(count): if t1[i] != t2[i]: return t1[i-1] return t1[count-1] class ColumnFile(object): """ cut an ecoPCR file into a list """ def __init__(self,stream,sep=None,strip=True,types=None,skip=None): if isinstance(stream,str): self._stream = open(stream) elif hasattr(stream,'next'): self._stream = stream else: raise ValueError,'stream must be string or an iterator' self._delimiter=sep self._strip=strip if types: self._types=[x for x in types] for i in xrange(len(self._types)): if self._types[i] is bool: self._types[i]=ColumnFile.str2bool else: self._types=None self._skip = skip self._oligo = {} def str2bool(x): return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False})) str2bool = staticmethod(str2bool) def __iter__(self): return self def next(self): ligne = self._stream.next() while ligne[0] == self._skip: ligne = self._stream.next() data = ligne.split(self._delimiter) if self._strip or self._types: data = [x.strip() for x in data] if self._types: it = self.endLessIterator(self._types) data = [x[1](x[0]) for x in ((y,it.next()) for y in data)] return data def endLessIterator(self,endedlist): for x in endedlist: yield x while(1): yield endedlist[-1] def getOligo(self,line): if line[2:8] == 'direct': r = re.compile('(?<=direct strand oligo1 : )[A-Z]+(?= *)') self._oligo['o1'] = r.findall(line) if line[2:9] == 'reverse': r = re.compile('(?<=reverse strand oligo2 : )[A-Z]+(?= *)') self._oligo['o2'] = r.findall(line) return None ########### # # DATA STRUCTURE # ########### class Table(list): """ Data object inheriting from list """ def __init__(self, headers, types): list.__init__(self) self.headers = headers self.types = types def __setitem__ (self,key,value): """ method overloaded to check data types """ for e in range(len(value)): value[e] = self.types[e](value[e]) list.__setitem__(self,key,value) ########### # # GRAPH FUNCTIONS # ########### def _makeHistogram(data,label,title,path): drawing = Drawing(350, 450) drawing.add(String(35,430,title,fontSize=10)) bc = VerticalBarChart() bc.x = 40 bc.y = 100 bc.height = 300 bc.width = 300 bc.bars.strokeWidth = 1 bc.barSpacing = 2 bc.barLabelFormat = '%d' bc.barLabels.dy = -5 bc.barLabels.fontSize = 9 - (len(data[0])/8) bc.data = data bc.valueAxis.valueMin = min(data[0])*0.7 bc.valueAxis.valueMax = max(data[0]) bc.valueAxis.valueStep = (max(data[0]) - min(data[0])) / 10 bc.valueAxis.strokeWidth = 1 bc.categoryAxis.labels.boxAnchor = 'e' bc.categoryAxis.labels.dx = -40 bc.categoryAxis.labels.dy = -50 bc.categoryAxis.labels.angle = 45 bc.categoryAxis.labels.width = 10 bc.categoryAxis.labels.height = 4 bc.categoryAxis.categoryNames = label bc.categoryAxis.strokeWidth = 1 bc.categoryAxis.labels.fontSize = 9 drawing.add(bc) renderPDF.drawToFile(drawing, path ) def _makePie(data, label, title, path): drawing = Drawing(400,400) drawing.add(String(50,50,title,fontSize=10)) pc = Pie() pc.x = 100 pc.y = 100 pc.data = data pc.labels = label pc.labels.angle = 45 drawing.add(pc) renderPDF.drawToFile(drawing,path) ########### # # PARSE FUNCTIONS # ########### def _parseOligoResult(filter,file,strand): seq = {} if strand == 'direct': key = 'oligo_1' elif strand == 'reverse': key = 'oligo_2' for s in filter.ecoPCRResultIterator(file): o = s[key] taxid = s['taxid'] if not seq.has_key(o): seq[o] = [1,taxid] else: seq[o][0] = seq[o][0] + 1 seq[o][1] = filter.lastCommonTaxon(seq[o][1],taxid) return seq def _parseTaxonomyResult(table): tax = {} for l in table: taxid = l[2] scName = l[3] count = l[1] if not tax.has_key(taxid): tax[taxid] = [1,scName,count] else: tax[taxid][0] = tax[taxid][0] + 1 tax[taxid][2] = tax[taxid][2] + count return tax def _sortTable(e1,e2): e1 = e1[1] e2 = e2[1] if e1 < e2: return 1 if e1 > e2: return -1 return 0 def _countOligoMismatch(o1,o2): """ define mismatch between two oligonucleotids return number of mismatch """ mmatch = 0 if len(o1) < len(o2): mmatch = int(len(o2) - len(o1)) for i in range(len(o1)): try: if o1[i] != o2[i]: mmatch = mmatch + 1 except: mmatch = mmatch + 1 return mmatch ########### # # TOOLS FUNCTIONS # ########### def customSort(table,x,y): """ """ x = x-1 y = y-1 h = (table.headers[x],table.headers[y]) t = (table.types[x],table.types[y]) cTable = Table(h,t) tmp = {} for l in table: if tmp.has_key(l[x]): tmp[l[x]] = tmp[l[x]] + l[y] else: tmp[l[x]] = l[y] for k,v in tmp.items(): cTable.append([k,v]) return cTable def countColumnOccurrence(table,x): x = x-1 h = (table.headers[x],"count") t = (table.types[x],int) cTable = Table(h,t) tmp = {} for l in table: if tmp.has_key(l[x]): tmp[l[x]] = tmp[l[x]] + 1 else: tmp[l[x]] = 1 for k,v in tmp.items(): cTable.append([k,v]) return cTable def buildSpecificityTable(table): header = ("mismatch","taxon","count") type = (int,str,int) speTable = Table(header,type) tmp = {} for l in table: if not tmp.has_key(l[5]): tmp[l[5]] = {} if not tmp[l[5]].has_key(l[3]): tmp[l[5]][l[3]] = l[1] else: tmp[l[5]][l[3]] = tmp[l[5]][l[3]] + l[1] for mismatch in tmp.items(): for taxon,count in mismatch[1].items(): speTable.append([mismatch[0],taxon,count]) return speTable def buildOligoTable(table, file, filter, oligoRef, strand='direct'): """ Fills and sorts a Table object with ecoPCR result file Takes 4 arguments 1- Table object 2- ecoPCR result file path 3- Filter Object 4- the oligo used in ecoPCR 5- the oligo type : direct or reverse """ seq = _parseOligoResult(filter, file, strand) i = 0 for oligo, info in seq.items(): table.append(0) count, lctTaxid = info[0], info[1] scName = filter.findTaxonByTaxid(info[1])[3] rank = filter._ranks[filter.findTaxonByTaxid(info[1])[1]] mismatch = _countOligoMismatch(oligoRef,oligo) table[i]=[oligo,count,lctTaxid,scName,rank,mismatch] i = i + 1 table.sort(sortTable) def buildTaxonomicTable(table): """ Fill a Table object with a taxonomic synthesis """ taxHeaders = ("scName","numOfOligo","numOfAmpl","taxid") taxTypes = (str, int, int, int) taxTable = Table(taxHeaders, taxTypes) tax = _parseTaxonomyResult(table) i = 0 for taxid, info in tax.items(): taxTable.append(0) numOfOligo, scName, numOfAmpl = info[0], info[1], info[2] taxTable[i]=[scName,numOfOligo,numOfAmpl,taxid] i = i + 1 taxTable.sort(sortTable) return taxTable def _parseSequenceResult(filter,file,id): sequences = {} idIndex = {} if id == 'family': key = 'fa_taxid' elif id == 'genus': key = 'ge_taxid' else: key = 'taxid' for s in filter.ecoPCRResultIterator(file): seq = s['sq_des'] id = s[key] if not idIndex.has_key(id): idIndex[id] = [] if not sequences.has_key(seq): sequences[seq] = [id] else: sequences[seq].append(id) return sequences, idIndex def _sameValuesInList(array): for i in range(len(array)-1): if array[i] != array[i+1]: return False return True def buildSequenceTable(table,file,filter): sequences, idIndex = _parseSequenceResult(filter,file,'species') for s,id in sequences.items(): if len(id) == 1 or _sameValuesInList(id): idIndex[id[0]].append(1) else: idIndex[id[0]].append(0) for id,values in idIndex.items(): c = values.count(1) if c == 0: idIndex[id] = -1 else: idIndex[id] = len(values) == c and 1 or 0 identified = {} non_identified = {} ambiguous = {} for sequence... ########### # # OUTPUT FUNCTIONS # ########### def printTable(table): """ Displays the content a of Table object Take 1 arguments 1- Table object """ format = ("%20s | " * len(table.headers))[:-3] print format % tuple([str(e) for e in table.headers ]) +"\n" + "-"*23*len(table.headers) for l in table: print format % tuple([str(e) for e in l ]) print "# %d results" % len(table) def printColumn(table,n): """ Displays a column of a Table object Takes 2 arguments 1- Table object 2- Number of the Column to display """ n = n - 1 print "\t%s\n" % table.headers[n] for l in table: print "%s" % l[n] def saveAsCSV(table,path): """ Creates a csv file from a Table object Takes 2 arguments 1- Table object 2- path of the file-to-be """ file = open(path,'w') file.write(','.join([str(e) for e in table.headers ]) + "\n") for l in table: file.write(','.join([str(e) for e in l ]) + "\n") file.close() def grepTable(table,col,pattern): """ Filters a Table object with regular expression Takes 3 arguments : 1- Table object 2- number of column to match with 3- regular expression pattern Returns a Table object """ col = col -1 p = re.compile(pattern, re.IGNORECASE) out = Table(table.headers,table.types) for l in table: if p.search(l[col]): out.append(l) return out def drawGraph(table,x,y,path='graph.pdf',title='',treshold=10,graphType=1): """ Creates an histogram as pdf file Takes 5 arguments : 1- a Table object 2- number of column of Table object for y axis 3- number of column of Table object for x axis 4- path of the pdf-to-be 5- a title for the graph (optional) 6- the x first highest results (10 by default) 7- the graph type : 1 for histogram, 2 for pie """ count, label = [], [] x = x-1 y = y-1 for l in table: count.append(l[y]) label.append(str(l[x])) if graphType == 1: _makeHistogram([tuple(count[:treshold])], label[:treshold], title, path) elif graphType == 2: _makePie(count[:treshold], label[:treshold], title, path) ###################### def init(): file = "/Users/bessiere/Documents/workspace/ecoPCR/src/toto.tmp" oligo = {'o1':'ATGTTTAAAA','o2':'ATGGGGGTATTG'} filter = Filter("/ecoPCRDB/gbmam") # headers = ("oligo", "Num", "LCT Taxid", "Sc Name", "Rank", "distance") # types = (str, int, int, str, str, int) # # o1Table = Table(headers, types) # o2Table = Table(headers, types) # # buildOligoTable(o1Table, file, filter, oligo['o1'], 'direct') # buildOligoTable(o2Table, file, filter, oligo['o2'], 'reverse') # # # taxTable = buildTaxonomicTable(o1Table) # speTable = buildSpecificityTable(o1Table) # seqHeaders = ("sequence","taxid") seqTypes = (str,list) seqTable = Table(seqHeaders,seqTypes) return buildSequenceTable(seqTable, file, filter) return seqTable