diff --git a/tools/ecoSort.py b/tools/ecoSort.py new file mode 100755 index 0000000..a519375 --- /dev/null +++ b/tools/ecoSort.py @@ -0,0 +1,786 @@ +#!/usr/bin/env python + +import struct +import sys +import os +import gzip +import re + +from reportlab.graphics.shapes import * +from reportlab.graphics.charts.barcharts import VerticalBarChart +from reportlab.graphics.charts.piecharts import Pie +from reportlab.graphics import renderPDF + +##### +# +# Generic file function +# +##### + +class Filter(object): + """ + This object provides different iterators and method : + * findTaxonByTaxid + * subTreeIterator + * parentalTreeIterator + * ecoPCRResultIterator + * rankFilter + * lastCommonTaxon + + see each method for more informations + """ + + def __init__(self,path): + self._path = path + self._taxonFile = "%s.tdx" % self._path + self._ranksFile = "%s.rdx" % self._path + self._namesFile = "%s.ndx" % self._path + self._taxonomy, self._index, self._ranks, self._name = self.__readNodeTable() + + + def __universalOpen(self,file): + if isinstance(file,str): + if file[-3:] == '.gz': + rep = gzip.open(file) + else: + rep = open(file) + else: + rep = file + return rep + + def __universalTell(self,file): + if isinstance(file, gzip.GzipFile): + file=file.myfileobj + return file.tell() + + def __fileSize(self,file): + if isinstance(file, gzip.GzipFile): + file=file.myfileobj + pos = file.tell() + file.seek(0,2) + length = file.tell() + file.seek(pos,0) + return length + + def __progressBar(self,pos,max,reset=False,delta=[]): + if reset: + del delta[:] + if not delta: + delta.append(time.time()) + delta.append(time.time()) + + delta[1]=time.time() + elapsed = delta[1]-delta[0] + percent = float(pos)/max * 100 + remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent))) + bar = '#' * int(percent/2) + bar+= '|/-\\-'[pos % 5] + bar+= ' ' * (50 - int(percent/2)) + sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain)) + + + + + ##### + # + # Iterator functions + # + ##### + + + + def __ecoRecordIterator(self,file): + file = self.__universalOpen(file) + (recordCount,) = struct.unpack('> I',file.read(4)) + + for i in xrange(recordCount): + (recordSize,)=struct.unpack('>I',file.read(4)) + record = file.read(recordSize) + yield record + + + def __ecoNameIterator(self): + for record in self.__ecoRecordIterator(self._namesFile): + lrecord = len(record) + lnames = lrecord - 16 + (isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record) + name=names[:namelength] + classname=names[namelength:] + yield (name,classname,indextaxid) + + + def __ecoTaxonomicIterator(self): + for record in self.__ecoRecordIterator(self._taxonFile): + lrecord = len(record) + lnames = lrecord - 16 + (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record) + yield (taxid,rankid,parentidx,name) + + + def __ecoSequenceIterator(self,file): + for record in self.__ecoRecordIterator(file): + lrecord = len(record) + lnames = lrecord - (4*4+20) + (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record) + de = string[:deflength] + seq = gzip.zlib.decompress(string[deflength:]) + yield (taxid,seqid,deflength,seqlength,cptseqlength,de,seq) + + + def __ecoRankIterator(self): + for record in self.__ecoRecordIterator(self._ranksFile): + yield record + + + ##### + # + # Indexes + # + ##### + + def __ecoNameIndex(self): + indexName = [x for x in self.__ecoNameIterator()] + return indexName + + def __ecoRankIndex(self): + rank = [r for r in self.__ecoRankIterator()] + return rank + + def __ecoTaxonomyIndex(self): + taxonomy = [] + index = {} + i = 0; + for x in self.__ecoTaxonomicIterator(): + taxonomy.append(x) + index[x[0]] = i + i = i + 1 + return taxonomy, index + + def __readNodeTable(self): + taxonomy, index = self.__ecoTaxonomyIndex() + ranks = self.__ecoRankIndex() + name = self.__ecoNameIndex() + return taxonomy,index,ranks,name + + + ##### + # + # PUBLIC METHODS + # + ##### + + def findTaxonByTaxid(self,taxid): + """ + Returns a list containing [taxid,rankid,parent_index,nameLength,name] + It takes one argument : a taxonomic id + """ + return self._taxonomy[self._index[taxid]] + + + def subTreeIterator(self, taxid): + """ + Returns subtree for given taxid from first child + to last child. It takes one argument : a taxonomic id + """ + idx = self._index[taxid] + yield self._taxonomy[idx] + for t in self._taxonomy: + if t[2] == idx: + for subt in self.subTreeIterator(t[0]): + yield subt + + + def parentalTreeIterator(self, taxid): + """ + return parental tree for given taxonomic id starting from + first ancester to the root. + """ + taxon=self.findTaxonByTaxid(taxid) + while taxon[2]!= 0: + yield taxon + taxon = self._taxonomy[taxon[2]] + yield self._taxonomy[0] + + + def ecoPCRResultIterator(self, file): + """ + iteration on ecoPCR result file" + It returns a dictionnary + """ + file = self.__universalOpen(file) + data = ColumnFile(file, + sep='|', + types=(str,int,int, + str,int,str, + int,str,int, + str,int,str, + str,str,int, + str,int,int, + str,str),skip='#') + + + for ac, sq_len, taxid,\ + rank, sp_taxid, species,\ + ge_taxid, genus, fa_taxid,\ + family, sk_taxid, s_kgdom,\ + strand, oligo_1, error_1,\ + oligo_2, error_2, amp_len,\ + sq_des, definition in data: + + yield {'ac':ac, 'sq_len':sq_len, 'taxid':taxid, + 'rank':rank, 'sp_taxid':sp_taxid, 'species':species, + 'ge_taxid':ge_taxid, 'genus':genus, 'fa_taxid':fa_taxid, + 'family':family, 'sk_taxid':sk_taxid, 's_kgdom':s_kgdom, + 'strand':strand, 'oligo_1':oligo_1, 'error_1':error_1, + 'oligo_2':oligo_2, 'error_2':error_2, 'amp_len':amp_len, + 'sq_des':sq_des, 'definition':definition} + + def rankFilter(self,rankid,filter): + """ + boolean telling whether rankid match filter + takes 2 arguments : + 1- rankid + 2- filter (i.e genus) + """ + return self._ranks[rankid] == filter + + + def lastCommonTaxon(self,taxid_1, taxid_2): + """ + returns a last common parent for two given taxon. + It starts from the root and goes down the tree + until their parents diverge. + It takes 2 arguments : + 1- taxid 1 + 2- taxid 2 + """ + t1 = [x[0] for x in self.parentalTreeIterator(taxid_1)] + t2 = [x[0] for x in self.parentalTreeIterator(taxid_2)] + t1.reverse() + t2.reverse() + count = t1 < t2 and len(t1) or len(t2) + for i in range(count): + if t1[i] != t2[i]: + return t1[i-1] + return t1[count-1] + + + + + +class ColumnFile(object): + """ + cut an ecoPCR file into a list + """ + def __init__(self,stream,sep=None,strip=True,types=None,skip=None): + if isinstance(stream,str): + self._stream = open(stream) + elif hasattr(stream,'next'): + self._stream = stream + else: + raise ValueError,'stream must be string or an iterator' + self._delimiter=sep + self._strip=strip + if types: + self._types=[x for x in types] + for i in xrange(len(self._types)): + if self._types[i] is bool: + self._types[i]=ColumnFile.str2bool + else: + self._types=None + self._skip = skip + self._oligo = {} + + def str2bool(x): + return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False})) + + str2bool = staticmethod(str2bool) + + + def __iter__(self): + return self + + def next(self): + ligne = self._stream.next() + while ligne[0] == self._skip: + ligne = self._stream.next() + data = ligne.split(self._delimiter) + if self._strip or self._types: + data = [x.strip() for x in data] + if self._types: + it = self.endLessIterator(self._types) + data = [x[1](x[0]) for x in ((y,it.next()) for y in data)] + return data + + def endLessIterator(self,endedlist): + for x in endedlist: + yield x + while(1): + yield endedlist[-1] + + def getOligo(self,line): + if line[2:8] == 'direct': + r = re.compile('(?<=direct strand oligo1 : )[A-Z]+(?= *)') + self._oligo['o1'] = r.findall(line) + if line[2:9] == 'reverse': + r = re.compile('(?<=reverse strand oligo2 : )[A-Z]+(?= *)') + self._oligo['o2'] = r.findall(line) + return None + + + + +########### +# +# DATA STRUCTURE +# +########### + + +class Table(list): + """ + Data object inheriting from list + """ + def __init__(self, headers, types): + list.__init__(self) + self.headers = headers + self.types = types + + + def __setitem__ (self,key,value): + """ + method overloaded to check data types + """ + for e in range(len(value)): + value[e] = self.types[e](value[e]) + list.__setitem__(self,key,value) + + + +########### +# +# GRAPH FUNCTIONS +# +########### + + +def _makeHistogram(data,label,title,path): + + drawing = Drawing(350, 450) + + drawing.add(String(35,430,title,fontSize=10)) + + bc = VerticalBarChart() + bc.x = 40 + bc.y = 100 + bc.height = 300 + bc.width = 300 + bc.bars.strokeWidth = 1 + bc.barSpacing = 2 + bc.barLabelFormat = '%d' + bc.barLabels.dy = -5 + bc.barLabels.fontSize = 9 - (len(data[0])/8) + bc.data = data + + bc.valueAxis.valueMin = min(data[0])*0.7 + bc.valueAxis.valueMax = max(data[0]) + bc.valueAxis.valueStep = (max(data[0]) - min(data[0])) / 10 + bc.valueAxis.strokeWidth = 1 + + bc.categoryAxis.labels.boxAnchor = 'e' + bc.categoryAxis.labels.dx = -40 + bc.categoryAxis.labels.dy = -50 + bc.categoryAxis.labels.angle = 45 + bc.categoryAxis.labels.width = 10 + bc.categoryAxis.labels.height = 4 + bc.categoryAxis.categoryNames = label + bc.categoryAxis.strokeWidth = 1 + bc.categoryAxis.labels.fontSize = 9 + + + drawing.add(bc) + + renderPDF.drawToFile(drawing, path ) + +def _makePie(data, label, title, path): + + drawing = Drawing(400,400) + drawing.add(String(50,50,title,fontSize=10)) + + pc = Pie() + pc.x = 100 + pc.y = 100 + pc.data = data + pc.labels = label + pc.labels.angle = 45 + + drawing.add(pc) + + renderPDF.drawToFile(drawing,path) + + +########### +# +# PARSE FUNCTIONS +# +########### + +def _parseOligoResult(filter,file,strand): + seq = {} + + if strand == 'direct': + key = 'oligo_1' + elif strand == 'reverse': + key = 'oligo_2' + + for s in filter.ecoPCRResultIterator(file): + o = s[key] + taxid = s['taxid'] + if not seq.has_key(o): + seq[o] = [1,taxid] + else: + seq[o][0] = seq[o][0] + 1 + seq[o][1] = filter.lastCommonTaxon(seq[o][1],taxid) + return seq + + +def _parseTaxonomyResult(table): + tax = {} + for l in table: + taxid = l[2] + scName = l[3] + count = l[1] + if not tax.has_key(taxid): + tax[taxid] = [1,scName,count] + else: + tax[taxid][0] = tax[taxid][0] + 1 + tax[taxid][2] = tax[taxid][2] + count + return tax + + +def _sortTable(e1,e2): + e1 = e1[1] + e2 = e2[1] + if e1 < e2: + return 1 + if e1 > e2: + return -1 + return 0 + + +def _countOligoMismatch(o1,o2): + """ + define mismatch between two oligonucleotids + return number of mismatch + """ + mmatch = 0 + if len(o1) < len(o2): + mmatch = int(len(o2) - len(o1)) + for i in range(len(o1)): + try: + if o1[i] != o2[i]: + mmatch = mmatch + 1 + except: + mmatch = mmatch + 1 + + return mmatch + +########### +# +# TOOLS FUNCTIONS +# +########### + +def customSort(table,x,y): + """ + + """ + x = x-1 + y = y-1 + h = (table.headers[x],table.headers[y]) + t = (table.types[x],table.types[y]) + cTable = Table(h,t) + + tmp = {} + + for l in table: + if tmp.has_key(l[x]): + tmp[l[x]] = tmp[l[x]] + l[y] + else: + tmp[l[x]] = l[y] + + for k,v in tmp.items(): + cTable.append([k,v]) + + return cTable + + +def countColumnOccurrence(table,x): + x = x-1 + h = (table.headers[x],"count") + t = (table.types[x],int) + cTable = Table(h,t) + + tmp = {} + + for l in table: + if tmp.has_key(l[x]): + tmp[l[x]] = tmp[l[x]] + 1 + else: + tmp[l[x]] = 1 + + for k,v in tmp.items(): + cTable.append([k,v]) + + return cTable + + +def buildSpecificityTable(table): + header = ("mismatch","taxon","count") + type = (int,str,int) + speTable = Table(header,type) + + tmp = {} + for l in table: + if not tmp.has_key(l[5]): + tmp[l[5]] = {} + if not tmp[l[5]].has_key(l[3]): + tmp[l[5]][l[3]] = l[1] + else: + tmp[l[5]][l[3]] = tmp[l[5]][l[3]] + l[1] + + for mismatch in tmp.items(): + for taxon,count in mismatch[1].items(): + speTable.append([mismatch[0],taxon,count]) + + return speTable + + +def buildOligoTable(table, file, filter, oligoRef, strand='direct'): + """ + Fills and sorts a Table object with ecoPCR result file + + Takes 4 arguments + 1- Table object + 2- ecoPCR result file path + 3- Filter Object + 4- the oligo used in ecoPCR + 5- the oligo type : direct or reverse + + """ + seq = _parseOligoResult(filter, file, strand) + + i = 0 + for oligo, info in seq.items(): + table.append(0) + count, lctTaxid = info[0], info[1] + scName = filter.findTaxonByTaxid(info[1])[3] + rank = filter._ranks[filter.findTaxonByTaxid(info[1])[1]] + mismatch = _countOligoMismatch(oligoRef,oligo) + table[i]=[oligo,count,lctTaxid,scName,rank,mismatch] + i = i + 1 + + table.sort(sortTable) + + +def buildTaxonomicTable(table): + """ + Fill a Table object with a taxonomic synthesis + """ + taxHeaders = ("scName","numOfOligo","numOfAmpl","taxid") + taxTypes = (str, int, int, int) + taxTable = Table(taxHeaders, taxTypes) + + tax = _parseTaxonomyResult(table) + + i = 0 + for taxid, info in tax.items(): + taxTable.append(0) + numOfOligo, scName, numOfAmpl = info[0], info[1], info[2] + taxTable[i]=[scName,numOfOligo,numOfAmpl,taxid] + i = i + 1 + + taxTable.sort(sortTable) + + return taxTable + +def _parseSequenceResult(filter,file,id): + sequences = {} + idIndex = {} + + if id == 'family': + key = 'fa_taxid' + elif id == 'genus': + key = 'ge_taxid' + else: + key = 'taxid' + + for s in filter.ecoPCRResultIterator(file): + seq = s['sq_des'] + id = s[key] + if not idIndex.has_key(id): + idIndex[id] = [] + if not sequences.has_key(seq): + sequences[seq] = [id] + else: + sequences[seq].append(id) + return sequences, idIndex + + +def _sameValuesInList(array): + for i in range(len(array)-1): + if array[i] != array[i+1]: + return False + return True + + +def buildSequenceTable(table,file,filter): + + sequences, idIndex = _parseSequenceResult(filter,file,'species') + + for s,id in sequences.items(): + if len(id) == 1 or _sameValuesInList(id): + idIndex[id[0]].append(1) + else: + idIndex[id[0]].append(0) + + + for id,values in idIndex.items(): + c = values.count(1) + if c == 0: + idIndex[id] = -1 + else: + idIndex[id] = len(values) == c and 1 or 0 + + identified = {} + non_identified = {} + ambiguous = {} + for sequence... + + + + +########### +# +# OUTPUT FUNCTIONS +# +########### + +def printTable(table): + """ + Displays the content a of Table object + Take 1 arguments + 1- Table object + """ + + format = ("%20s | " * len(table.headers))[:-3] + print format % tuple([str(e) for e in table.headers ]) +"\n" + "-"*23*len(table.headers) + for l in table: + print format % tuple([str(e) for e in l ]) + print "# %d results" % len(table) + + +def printColumn(table,n): + """ + Displays a column of a Table object + Takes 2 arguments + 1- Table object + 2- Number of the Column to display + """ + n = n - 1 + print "\t%s\n" % table.headers[n] + for l in table: + print "%s" % l[n] + + +def saveAsCSV(table,path): + """ + Creates a csv file from a Table object + Takes 2 arguments + 1- Table object + 2- path of the file-to-be + """ + file = open(path,'w') + file.write(','.join([str(e) for e in table.headers ]) + "\n") + for l in table: + file.write(','.join([str(e) for e in l ]) + "\n") + file.close() + + +def grepTable(table,col,pattern): + """ + Filters a Table object with regular expression + Takes 3 arguments : + 1- Table object + 2- number of column to match with + 3- regular expression pattern + + Returns a Table object + """ + col = col -1 + p = re.compile(pattern, re.IGNORECASE) + out = Table(table.headers,table.types) + for l in table: + if p.search(l[col]): + out.append(l) + return out + +def drawGraph(table,x,y,path='graph.pdf',title='',treshold=10,graphType=1): + """ + Creates an histogram as pdf file + Takes 5 arguments : + 1- a Table object + 2- number of column of Table object for y axis + 3- number of column of Table object for x axis + 4- path of the pdf-to-be + 5- a title for the graph (optional) + 6- the x first highest results (10 by default) + 7- the graph type : 1 for histogram, 2 for pie + """ + count, label = [], [] + x = x-1 + y = y-1 + for l in table: + count.append(l[y]) + label.append(str(l[x])) + if graphType == 1: + _makeHistogram([tuple(count[:treshold])], label[:treshold], title, path) + elif graphType == 2: + _makePie(count[:treshold], label[:treshold], title, path) + + + +###################### + + +def init(): + file = "/Users/bessiere/Documents/workspace/ecoPCR/src/toto.tmp" + oligo = {'o1':'ATGTTTAAAA','o2':'ATGGGGGTATTG'} + + filter = Filter("/ecoPCRDB/gbmam") + +# headers = ("oligo", "Num", "LCT Taxid", "Sc Name", "Rank", "distance") +# types = (str, int, int, str, str, int) +# +# o1Table = Table(headers, types) +# o2Table = Table(headers, types) +# +# buildOligoTable(o1Table, file, filter, oligo['o1'], 'direct') +# buildOligoTable(o2Table, file, filter, oligo['o2'], 'reverse') +# +# +# taxTable = buildTaxonomicTable(o1Table) +# speTable = buildSpecificityTable(o1Table) +# + + seqHeaders = ("sequence","taxid") + seqTypes = (str,list) + seqTable = Table(seqHeaders,seqTypes) + + return buildSequenceTable(seqTable, file, filter) + + + + return seqTable + +