diff --git a/tools/ecoPCRFilter.py b/tools/ecoPCRFilter.py new file mode 100755 index 0000000..9bf0a51 --- /dev/null +++ b/tools/ecoPCRFilter.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python + +import struct +import sys +import os +import gzip + + +##### +# +# Generic file function +# +##### + +class Filter(object): + + + def __init__(self,path): + self._path = path + self._taxonFile = "%s.tdx" % self._path + self._ranksFile = "%s.rdx" % self._path + self._namesFile = "%s.ndx" % self._path + self._taxonomy, self._index, self._ranks, self._name = self.__readNodeTable() + + + def __universalOpen(self,file): + if isinstance(file,str): + if file[-3:] == '.gz': + rep = gzip.open(file) + else: + rep = open(file) + else: + rep = file + return rep + + def __universalTell(self,file): + if isinstance(file, gzip.GzipFile): + file=file.myfileobj + return file.tell() + + def __fileSize(self,file): + if isinstance(file, gzip.GzipFile): + file=file.myfileobj + pos = file.tell() + file.seek(0,2) + length = file.tell() + file.seek(pos,0) + return length + + def __progressBar(self,pos,max,reset=False,delta=[]): + if reset: + del delta[:] + if not delta: + delta.append(time.time()) + delta.append(time.time()) + + delta[1]=time.time() + elapsed = delta[1]-delta[0] + percent = float(pos)/max * 100 + remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent))) + bar = '#' * int(percent/2) + bar+= '|/-\\-'[pos % 5] + bar+= ' ' * (50 - int(percent/2)) + sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain)) + + + + + ##### + # + # Iterator functions + # + ##### + + + + def __ecoRecordIterator(self,file): + file = self.__universalOpen(file) + (recordCount,) = struct.unpack('> I',file.read(4)) + + for i in xrange(recordCount): + (recordSize,)=struct.unpack('>I',file.read(4)) + record = file.read(recordSize) + yield record + + + def __ecoNameIterator(self): + for record in self.__ecoRecordIterator(self._namesFile): + lrecord = len(record) + lnames = lrecord - 16 + (isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record) + name=names[:namelength] + classname=names[namelength:] + yield (name,classname,indextaxid) + + + def __ecoTaxonomicIterator(self): + for record in self.__ecoRecordIterator(self._taxonFile): + lrecord = len(record) + lnames = lrecord - 16 + (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record) + yield (taxid,rankid,parentidx,name) + + + def __ecoSequenceIterator(self,file): + for record in self.__ecoRecordIterator(file): + lrecord = len(record) + lnames = lrecord - (4*4+20) + (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record) + de = string[:deflength] + seq = gzip.zlib.decompress(string[deflength:]) + yield (taxid,seqid,deflength,seqlength,cptseqlength,de,seq) + + + def __ecoRankIterator(self): + for record in self.__ecoRecordIterator(self._ranksFile): + yield record + + + ##### + # + # Indexes + # + ##### + + def __ecoNameIndex(self): + indexName = [x for x in self.__ecoNameIterator()] + return indexName + + def __ecoRankIndex(self): + rank = [r for r in self.__ecoRankIterator()] + return rank + + def __ecoTaxonomyIndex(self): + taxonomy = [] + index = {} + i = 0; + for x in self.__ecoTaxonomicIterator(): + taxonomy.append(x) + index[x[0]] = i + i = i + 1 + return taxonomy, index + + def __readNodeTable(self): + taxonomy, index = self.__ecoTaxonomyIndex() + ranks = self.__ecoRankIndex() + name = self.__ecoNameIndex() + return taxonomy,index,ranks,name + + + def findTaxonByTaxid(self,taxid): + return self._taxonomy[self._index[taxid]] + + + + ##### + # + # PUBLIC METHODS + # + ##### + + + def subTreeIterator(self, taxid): + "return subtree for given taxonomic id " + idx = self._index[taxid] + yield self._taxonomy[idx] + for t in self._taxonomy: + if t[2] == idx: + for subt in self.subTreeIterator(t[0]): + yield subt + + + def parentalTreeIterator(self, taxid): + """ + return parental tree for given taxonomic id starting from + first ancester to the root. + """ + taxon=self.findTaxonByTaxid(taxid) + while taxon[2]!= 0: + yield taxon + taxon = self._taxonomy[taxon[2]] + yield self._taxonomy[0] + + + def ecoPCRResultIterator(self, file): + "iteration on ecoPCR result file" + file = self.__universalOpen(file) + data = ColumnFile(file, + sep='|', + types=(str,int,int, + str,int,str, + int,str,int, + str,int,str, + str,str,int, + str,int,int, + str,str),skip='#') + + for ac, sq_len, taxid,\ + rank, sp_taxid, species,\ + ge_taxid, genus, fa_taxid,\ + family, sk_taxid, s_kgdom,\ + strand, oligo_1, error_1,\ + oligo_2, error_2, amp_len,\ + sq_des, definition in data: + + yield {'ac':ac, 'sq_len':sq_len, 'taxid':taxid, + 'rank':rank, 'sp_taxid':sp_taxid, 'species':species, + 'ge_taxid':ge_taxid, 'genus':genus, 'fa_taxid':fa_taxid, + 'family':family, 'sk_taxid':sk_taxid, 's_kgdom':s_kgdom, + 'strand':strand, 'oligo_1':oligo_1, 'error_1':error_1, + 'oligo_2':oligo_2, 'error_2':error_2, 'amp_len':amp_len, + 'sq_des':sq_des, 'definition':definition} + + def rankFilter(self,rankid,filter): + return self._ranks[rankid] == filter + + + def lastCommonTaxon(self,taxid_1, taxid_2): + t1 = [x[0] for x in self.parentalTreeIterator(taxid_1)] + t2 = [x[0] for x in self.parentalTreeIterator(taxid_2)] + t1.reverse() + t2.reverse() + count = t1 < t2 and len(t1) or len(t2) + for i in range(count): + if t1[i] != t2[i]: + return t1[i-1] + + + + +class ColumnFile(object): + + def __init__(self,stream,sep=None,strip=True,types=None,skip=None): + if isinstance(stream,str): + self._stream = open(stream) + elif hasattr(stream,'next'): + self._stream = stream + else: + raise ValueError,'stream must be string or an iterator' + self._delimiter=sep + self._strip=strip + if types: + self._types=[x for x in types] + for i in xrange(len(self._types)): + if self._types[i] is bool: + self._types[i]=ColumnFile.str2bool + else: + self._types=None + self._skip = skip + + def str2bool(x): + return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False})) + + str2bool = staticmethod(str2bool) + + + def __iter__(self): + return self + + def next(self): + ligne = self._stream.next() + while ligne[0] == self._skip: + ligne = self._stream.next() + data = ligne.split(self._delimiter) + if self._strip or self._types: + data = [x.strip() for x in data] + if self._types: + it = self.endLessIterator(self._types) + data = [x[1](x[0]) for x in ((y,it.next()) for y in data)] + return data + + def endLessIterator(self,endedlist): + for x in endedlist: + yield x + while(1): + yield endedlist[-1] + + +class Table(list): + + def __init__(self, headers, types): + list.__init__(self) + self.headers = headers + self.types = types + self.lines = [] + + def printTable(self): + for h in self.headers: + print "\t%s\t|" % h, + print "\n" + for l in self.lines: + for c in l: + print "\t%s\t|" % c + print "\n" + + def getColumn(self,n): + print "\t%s\n" % self.header[n] + for i in range(len(self.lines)): + print "\t%s\n" % i[n] + + + + diff --git a/tools/ecoPCRFormat.py b/tools/ecoPCRFormat.py index 2d86065..3acb26c 100755 --- a/tools/ecoPCRFormat.py +++ b/tools/ecoPCRFormat.py @@ -68,8 +68,7 @@ def endLessIterator(endedlist): yield x while(1): yield endedlist[-1] - - + class ColumnFile(object): def __init__(self,stream,sep=None,strip=True,types=None): @@ -135,7 +134,7 @@ def bsearchTaxon(taxonomy,taxid): else: return None - + def readNodeTable(file): @@ -149,7 +148,6 @@ def readNodeTable(file): bool,bool,bool,str)) print >>sys.stderr,"Reading taxonomy dump file..." taxonomy=[[n[0],n[2],n[1]] for n in nodes] - print >>sys.stderr,"List all taxonomy rank..." ranks =list(set(x[1] for x in taxonomy)) ranks.sort() @@ -171,15 +169,14 @@ def readNodeTable(file): return taxonomy,ranks,index -def scientificNameIterator(file): +def nameIterator(file): file = universalOpen(file) names = ColumnFile(file, sep='|', types=(int,str, str,str)) for taxid,name,unique,classname,white in names: - if classname == 'scientific name': - yield taxid,name + yield taxid,name,classname def mergedNodeIterator(file): file = universalOpen(file) @@ -201,8 +198,12 @@ def readTaxonomyDump(taxdir): taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir) print >>sys.stderr,"Adding scientific name..." - for taxid,name in scientificNameIterator('%s/names.dmp' % taxdir): - taxonomy[index[taxid]].append(name) + + alternativeName=[] + for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir): + alternativeName.append((name,classname,index[taxid])) + if classname == 'scientific name': + taxonomy[index[taxid]].append(name) print >>sys.stderr,"Adding taxid alias..." for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir): @@ -212,7 +213,7 @@ def readTaxonomyDump(taxdir): for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir): index[taxid]=None - return taxonomy,ranks,index + return taxonomy,ranks,alternativeName,index ##### @@ -267,28 +268,52 @@ def genbankEntryParser(entry): Tx = None return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq} -_fastaParseID = re.compile('(?<=^>)[^ ]+') -_fastaParseDE = re.compile('(?<=^>).+',) -_fastaParseSQ = re.compile('^[^>].+',re.MULTILINE+re.DOTALL) -_fastaParseTX = re.compile('(?<=[[Tt]axon:) *[0-9]+ *(?=])') - -def fastaEntryParser(entry): - Id = _fastaParseID.findall(entry)[0] - De = _fastaParseDE.findall(entry)[0].split(None,1)[1:] - if not De: - De='' - else: - De=De[0] - Sq = cleanSeq(_fastaParseSQ.findall(entry)[0].upper()) +###################### + +_cleanDef = re.compile('[\nDE]') + +def cleanDef(definition): + return _cleanDef.sub('',definition) + +_emblParseID = re.compile('(?<=^ID {3})[^ ]+(?=;)',re.MULTILINE) +_emblParseDE = re.compile('(?<=^DE {3}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL) +_emblParseSQ = re.compile('(?<=^ ).+?(?=^//$)',re.MULTILINE+re.DOTALL) +_emblParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")') + +def emblEntryParser(entry): + Id = _emblParseID.findall(entry)[0] + De = ' '.join(cleanDef(_emblParseDE.findall(entry)[0]).split()) + Sq = cleanSeq(_emblParseSQ.findall(entry)[0].upper()) try: - Tx = int(_fastaParseTX.findall(entry)[0]) + Tx = int(_emblParseTX.findall(entry)[0]) except IndexError: Tx = None - return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq} + + +###################### + +def parseFasta(seq): + title = seq[0].strip()[1:].split(None,1) + id=title[0] + if len(title) == 2: + field = title[1].split('; ') + else: + field=[] + info = dict(x.split('=') for x in field if '=' in x) + definition = ' '.join([x for x in field if '=' not in x]) + seq=(''.join([x.strip() for x in seq[1:]])).upper() + return id,seq,definition,info + + +def fastaEntryParser(entry): + id,seq,definition,info = parseFasta(entry) + Tx = info.get('taxid',None) + if Tx is not None: + Tx=int(Tx) + return {'id':id,'taxid':Tx,'definition':definition,'sequence':seq} - def sequenceIteratorFactory(entryParser,entryIterator): def sequenceIterator(file): for entry in entryIterator(file): @@ -381,6 +406,22 @@ def ecoRankPacker(rank): return packed +def ecoNamePacker(name): + + namelength = len(name[0]) + classlength= len(name[1]) + totalSize = namelength + classlength + 4 + 4 + 4 + 4 + + packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength), + totalSize, + int(name[1]=='scientific name'), + namelength, + classlength, + name[2], + name[0], + name[1]) + + return packed def ecoSeqWriter(file,input,taxindex,parser): output = open(file,'wb') @@ -438,18 +479,40 @@ def ecoRankWriter(file,ranks): output.write(ecoRankPacker(rank)) output.close() + +def nameCmp(n1,n2): + name1=n1[0].upper() + name2=n2[0].upper() + if name1 < name2: + return -1 + elif name1 > name2: + return 1 + return 0 + + +def ecoNameWriter(file,names): + output = open(file,'wb') + output.write(struct.pack('> I',len(names))) + + names.sort(nameCmp) + + for name in names: + output.write(ecoNamePacker(name)) + + output.close() def ecoDBWriter(prefix,taxonomy,seqFileNames,parser): ecoRankWriter('%s.rdx' % prefix, taxonomy[1]) ecoTaxWriter('%s.tdx' % prefix, taxonomy[0]) - + ecoNameWriter('%s.ndx' % prefix, taxonomy[2]) + filecount = 0 for filename in seqFileNames: filecount+=1 sk=ecoSeqWriter('%s_%03d.sdx' % (prefix,filecount), filename, - taxonomy[2], + taxonomy[3], parser) if sk: print >>sys.stderr,"Skipped entry :" @@ -464,37 +527,58 @@ def ecoParseOptions(arguments): } o,filenames = getopt.getopt(arguments, - 'ht:n:gf', + 'ht:n:gfe', ['help', 'taxonomy=', 'name=', 'genbank', - 'fasta']) + 'fasta', + 'embl']) for name,value in o: if name in ('-h','--help'): - pass + printHelp() + exit() elif name in ('-t','--taxonomy'): opt['taxdir']=value elif name in ('-n','--name'): opt['prefix']=value elif name in ('-g','--genbank'): opt['parser']=sequenceIteratorFactory(genbankEntryParser, - entryIterator - ) + entryIterator) + elif name in ('-f','--fasta'): opt['parser']=sequenceIteratorFactory(fastaEntryParser, fastaEntryIterator) + + elif name in ('-e','--embl'): + opt['parser']=sequenceIteratorFactory(emblEntryParser, + entryIterator) else: raise ValueError,'Unknown option %s' % name return opt,filenames +def printHelp(): + print "-----------------------------------" + print " ecoPCRFormat.py" + print "-----------------------------------" + print "ecoPCRFormat.py [option] " + print "-----------------------------------" + print "-e --embl :[E]mbl format" + print "-f --fasta :[F]asta format" + print "-g --genbank :[G]enbank format" + print "-h --help :[H]elp - print this help" + print "-n --name :[N]ame of the new database created" + print "-t --taxonomy :[T]axonomy - path to the taxonomy database" + print " :bcp-like dump from GenBank taxonomy database." + print "-----------------------------------" + if __name__ == '__main__': opt,filenames = ecoParseOptions(sys.argv[1:]) taxonomy = readTaxonomyDump(opt['taxdir']) - + ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser']) diff --git a/tools/ecoSort.py b/tools/ecoSort.py new file mode 100755 index 0000000..c7d6ec3 --- /dev/null +++ b/tools/ecoSort.py @@ -0,0 +1,811 @@ +#!/usr/bin/env python + +import struct +import sys +import os +import gzip +import re +import string + +from reportlab.graphics.shapes import * +from reportlab.graphics.charts.barcharts import VerticalBarChart +from reportlab.graphics.charts.piecharts import Pie +from reportlab.lib.styles import getSampleStyleSheet +from reportlab.lib.units import cm +from reportlab.lib import colors +from reportlab.platypus import * + + +##### +# +# Generic file function +# +##### + +class Filter(object): + """ + This object provides different iterators and method : + * findTaxonByTaxid + * subTreeIterator + * parentalTreeIterator + * ecoPCRResultIterator + * rankFilter + * lastCommonTaxon + + see each method for more informations + """ + + def __init__(self,path): + self._path = path + self._taxonFile = "%s.tdx" % self._path + self._ranksFile = "%s.rdx" % self._path + self._namesFile = "%s.ndx" % self._path + self._taxonomy, self._index, self._ranks, self._name = self.__readNodeTable() + + + def __universalOpen(self,file): + if isinstance(file,str): + if file[-3:] == '.gz': + rep = gzip.open(file) + else: + rep = open(file) + else: + rep = file + return rep + + def __universalTell(self,file): + if isinstance(file, gzip.GzipFile): + file=file.myfileobj + return file.tell() + + def __fileSize(self,file): + if isinstance(file, gzip.GzipFile): + file=file.myfileobj + pos = file.tell() + file.seek(0,2) + length = file.tell() + file.seek(pos,0) + return length + + def __progressBar(self,pos,max,reset=False,delta=[]): + if reset: + del delta[:] + if not delta: + delta.append(time.time()) + delta.append(time.time()) + + delta[1]=time.time() + elapsed = delta[1]-delta[0] + percent = float(pos)/max * 100 + remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent))) + bar = '#' * int(percent/2) + bar+= '|/-\\-'[pos % 5] + bar+= ' ' * (50 - int(percent/2)) + sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain)) + + + + + ##### + # + # Iterator functions + # + ##### + + + + def __ecoRecordIterator(self,file): + file = self.__universalOpen(file) + (recordCount,) = struct.unpack('> I',file.read(4)) + + for i in xrange(recordCount): + (recordSize,)=struct.unpack('>I',file.read(4)) + record = file.read(recordSize) + yield record + + + def __ecoNameIterator(self): + for record in self.__ecoRecordIterator(self._namesFile): + lrecord = len(record) + lnames = lrecord - 16 + (isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record) + name=names[:namelength] + classname=names[namelength:] + yield (name,classname,indextaxid) + + + def __ecoTaxonomicIterator(self): + for record in self.__ecoRecordIterator(self._taxonFile): + lrecord = len(record) + lnames = lrecord - 16 + (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record) + yield (taxid,rankid,parentidx,name) + + + def __ecoSequenceIterator(self,file): + for record in self.__ecoRecordIterator(file): + lrecord = len(record) + lnames = lrecord - (4*4+20) + (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record) + de = string[:deflength] + seq = gzip.zlib.decompress(string[deflength:]) + yield (taxid,seqid,deflength,seqlength,cptseqlength,de,seq) + + + def __ecoRankIterator(self): + for record in self.__ecoRecordIterator(self._ranksFile): + yield record + + + ##### + # + # Indexes + # + ##### + + def __ecoNameIndex(self): + indexName = [x for x in self.__ecoNameIterator()] + return indexName + + def __ecoRankIndex(self): + rank = [r for r in self.__ecoRankIterator()] + return rank + + def __ecoTaxonomyIndex(self): + taxonomy = [] + index = {} + i = 0; + for x in self.__ecoTaxonomicIterator(): + taxonomy.append(x) + index[x[0]] = i + i = i + 1 + return taxonomy, index + + def __readNodeTable(self): + taxonomy, index = self.__ecoTaxonomyIndex() + ranks = self.__ecoRankIndex() + name = self.__ecoNameIndex() + return taxonomy,index,ranks,name + + + ##### + # + # PUBLIC METHODS + # + ##### + + def findTaxonByTaxid(self,taxid): + """ + Returns a list containing [taxid,rankid,parent_index,nameLength,name] + It takes one argument : a taxonomic id + """ + return self._taxonomy[self._index[taxid]] + + + def subTreeIterator(self, taxid): + """ + Returns subtree for given taxid from first child + to last child. It takes one argument : a taxonomic id + """ + idx = self._index[taxid] + yield self._taxonomy[idx] + for t in self._taxonomy: + if t[2] == idx: + for subt in self.subTreeIterator(t[0]): + yield subt + + + def parentalTreeIterator(self, taxid): + """ + return parental tree for given taxonomic id starting from + first ancester to the root. + """ + taxon=self.findTaxonByTaxid(taxid) + while taxon[2]!= 0: + yield taxon + taxon = self._taxonomy[taxon[2]] + yield self._taxonomy[0] + + + def ecoPCRResultIterator(self, file): + """ + iteration on ecoPCR result file" + It returns a dictionnary + """ + file = self.__universalOpen(file) + data = ColumnFile(file, + sep='|', + types=(str,int,int, + str,int,str, + int,str,int, + str,int,str, + str,str,int, + str,int,int, + str,str),skip='#') + + + for ac, sq_len, taxid,\ + rank, sp_taxid, species,\ + ge_taxid, genus, fa_taxid,\ + family, sk_taxid, s_kgdom,\ + strand, oligo_1, error_1,\ + oligo_2, error_2, amp_len,\ + sq_des, definition in data: + + yield {'ac':ac, 'sq_len':sq_len, 'taxid':taxid, + 'rank':rank, 'sp_taxid':sp_taxid, 'species':species, + 'ge_taxid':ge_taxid, 'genus':genus, 'fa_taxid':fa_taxid, + 'family':family, 'sk_taxid':sk_taxid, 's_kgdom':s_kgdom, + 'strand':strand, 'oligo_1':oligo_1, 'error_1':error_1, + 'oligo_2':oligo_2, 'error_2':error_2, 'amp_len':amp_len, + 'sq_des':sq_des, 'definition':definition} + + def rankFilter(self,rankid,filter): + """ + boolean telling whether rankid match filter + takes 2 arguments : + 1- rankid + 2- filter (i.e genus) + """ + return self._ranks[rankid] == filter + + + def lastCommonTaxon(self,taxid_1, taxid_2): + """ + returns a last common parent for two given taxon. + It starts from the root and goes down the tree + until their parents diverge. + It takes 2 arguments : + 1- taxid 1 + 2- taxid 2 + """ + t1 = [x[0] for x in self.parentalTreeIterator(taxid_1)] + t2 = [x[0] for x in self.parentalTreeIterator(taxid_2)] + t1.reverse() + t2.reverse() + count = t1 < t2 and len(t1) or len(t2) + for i in range(count): + if t1[i] != t2[i]: + return t1[i-1] + return t1[count-1] + + + + + +class ColumnFile(object): + """ + cut an ecoPCR file into a list + """ + def __init__(self,stream,sep=None,strip=True,types=None,skip=None): + if isinstance(stream,str): + self._stream = open(stream) + elif hasattr(stream,'next'): + self._stream = stream + else: + raise ValueError,'stream must be string or an iterator' + self._delimiter=sep + self._strip=strip + if types: + self._types=[x for x in types] + for i in xrange(len(self._types)): + if self._types[i] is bool: + self._types[i]=ColumnFile.str2bool + else: + self._types=None + self._skip = skip + self._oligo = {} + + def str2bool(x): + return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False})) + + str2bool = staticmethod(str2bool) + + + def __iter__(self): + return self + + def next(self): + ligne = self._stream.next() + while ligne[0] == self._skip: + ligne = self._stream.next() + data = ligne.split(self._delimiter) + if self._strip or self._types: + data = [x.strip() for x in data] + if self._types: + it = self.endLessIterator(self._types) + data = [x[1](x[0]) for x in ((y,it.next()) for y in data)] + return data + + def endLessIterator(self,endedlist): + for x in endedlist: + yield x + while(1): + yield endedlist[-1] + + def getOligo(self,line): + if line[2:8] == 'direct': + r = re.compile('(?<=direct strand oligo1 : )[A-Z]+(?= *)') + self._oligo['o1'] = r.findall(line) + if line[2:9] == 'reverse': + r = re.compile('(?<=reverse strand oligo2 : )[A-Z]+(?= *)') + self._oligo['o2'] = r.findall(line) + return None + + + + +########### +# +# DATA STRUCTURE +# +########### + + +class ecoTable(list): + """ + Data object inheriting from list + """ + def __init__(self, headers, types): + list.__init__(self) + self.headers = headers + self.types = types + + + def __setitem__ (self,key,value): + """ + method overloaded to check data types + """ + for e in range(len(value)): + value[e] = self.types[e](value[e]) + list.__setitem__(self,key,value) + + def __getitem__(self,index): + newtable = ecoTable(self.headers,self.types) + if isinstance(index,slice): + newtable.extend(list.__getitem__(self,index)) + else: + newtable.append(list.__getitem__(self,index)) + + return newtable + + def getColumns(self,columnList): + newhead = [self.headers[x] for x in columnList] + newtype = [self.types[x] for x in columnList] + newtable = ecoTable(newhead,newtype) + for line in self: + newtable.append([line[x] for x in columnList]) + + return newtable + + +########### +# +# PARSE FUNCTIONS +# +########### + +def _parseOligoResult(filter,file,strand): + seq = {} + + if strand == 'direct': + key = 'oligo_1' + elif strand == 'reverse': + key = 'oligo_2' + + for s in filter.ecoPCRResultIterator(file): + o = s[key] + taxid = s['taxid'] + if not seq.has_key(o): + seq[o] = [1,taxid] + else: + seq[o][0] = seq[o][0] + 1 + seq[o][1] = filter.lastCommonTaxon(seq[o][1],taxid) + return seq + + +def _parseTaxonomyResult(table): + tax = {} + for l in table: + taxid = l[2] + scName = l[3] + count = l[1] + if not tax.has_key(taxid): + tax[taxid] = [1,scName,count] + else: + tax[taxid][0] = tax[taxid][0] + 1 + tax[taxid][2] = tax[taxid][2] + count + return tax + + +def _sortTable(e1,e2): + e1 = e1[1] + e2 = e2[1] + if e1 < e2: + return 1 + if e1 > e2: + return -1 + return 0 + + +def _countOligoMismatch(o1,o2): + """ + define mismatch between two oligonucleotids + return number of mismatch + """ + mmatch = 0 + if len(o1) < len(o2): + mmatch = int(len(o2) - len(o1)) + for i in range(len(o1)): + try: + if o1[i] != o2[i]: + mmatch = mmatch + 1 + except: + mmatch = mmatch + 1 + + return mmatch + +########### +# +# TOOLS FUNCTIONS +# +########### + +def customSort(table,x,y): + """ + + """ + x = x-1 + y = y-1 + h = (table.headers[x],table.headers[y]) + t = (table.types[x],table.types[y]) + cTable = ecoTable(h,t) + + tmp = {} + + for l in table: + if tmp.has_key(l[x]): + tmp[l[x]] = tmp[l[x]] + l[y] + else: + tmp[l[x]] = l[y] + + for k,v in tmp.items(): + cTable.append([k,v]) + + return cTable + + +def countColumnOccurrence(table,x): + x = x-1 + h = (table.headers[x],"count") + t = (table.types[x],int) + cTable = Table(h,t) + + tmp = {} + + for l in table: + if tmp.has_key(l[x]): + tmp[l[x]] = tmp[l[x]] + 1 + else: + tmp[l[x]] = 1 + + for k,v in tmp.items(): + cTable.append([k,v]) + + return cTable + + +def buildSpecificityTable(table): + header = ("mismatch","taxon","count") + type = (int,str,int) + speTable = ecoTable(header,type) + + tmp = {} + for l in table: + if not tmp.has_key(l[5]): + tmp[l[5]] = {} + if not tmp[l[5]].has_key(l[3]): + tmp[l[5]][l[3]] = l[1] + else: + tmp[l[5]][l[3]] = tmp[l[5]][l[3]] + l[1] + + for mismatch in tmp.items(): + for taxon,count in mismatch[1].items(): + speTable.append([mismatch[0],taxon,count]) + + return speTable + + +def buildOligoTable(table, file, filter, oligoRef, strand='direct'): + """ + Fills and sorts a Table object with ecoPCR result file + + Takes 4 arguments + 1- Table object + 2- ecoPCR result file path + 3- Filter Object + 4- the oligo used in ecoPCR + 5- the oligo type : direct or reverse + + """ + seq = _parseOligoResult(filter, file, strand) + + i = 0 + for oligo, info in seq.items(): + table.append(0) + count, lctTaxid = info[0], info[1] + scName = filter.findTaxonByTaxid(info[1])[3] + rank = filter._ranks[filter.findTaxonByTaxid(info[1])[1]] + mismatch = _countOligoMismatch(oligoRef,oligo) + table[i]=[oligo,count,lctTaxid,scName,rank,mismatch] + i = i + 1 + + table.sort(_sortTable) + + +def buildTaxonomicTable(table): + """ + Fill a Table object with a taxonomic synthesis + """ + taxHeaders = ("scName","numOfOligo","numOfAmpl","taxid") + taxTypes = (str, int, int, int) + taxTable = ecoTable(taxHeaders, taxTypes) + + tax = _parseTaxonomyResult(table) + + i = 0 + for taxid, info in tax.items(): + taxTable.append(0) + numOfOligo, scName, numOfAmpl = info[0], info[1], info[2] + taxTable[i]=[scName,numOfOligo,numOfAmpl,taxid] + i = i + 1 + + taxTable.sort(_sortTable) + + return taxTable + +def _parseSequenceResult(filter,file,id): + sequences = {} + idIndex = {} + + if id == 'family': + key = 'fa_taxid' + elif id == 'genus': + key = 'ge_taxid' + else: + key = 'taxid' + + for s in filter.ecoPCRResultIterator(file): + seq = s['sq_des'] + id = s[key] + if not idIndex.has_key(id): + idIndex[id] = [] + if not sequences.has_key(seq): + sequences[seq] = [id] + else: + sequences[seq].append(id) + return sequences, idIndex + + +def _sameValuesInList(array): + for i in range(len(array)-1): + if array[i] != array[i+1]: + return False + return True + + +def _sortSequences(file,filter): + + sequences, idIndex = _parseSequenceResult(filter,file,'species') + + for s,id in sequences.items(): + if len(id) == 1 or _sameValuesInList(id): + idIndex[id[0]].append(1) + else: + for e in id: + idIndex[e].append(0) + + + for id,values in idIndex.items(): + idIndex[id] = float(values.count(1)) / float(len(values)) * 100 + + + identified = {} + non_identified = {} + ambiguous = {} + + return sequences, idIndex + +def getIntraSpeciesDiversity(table,file,filter): + + intraDiv = {} + + seq, idIndex = _sortSequences(file,filter) + + for id,percent in idIndex.items(): + if percent == 100: + intraDiv[id] = [0,[]] + for seq,idList in sequences.items(): + if id in idList: + intraDiv[id][0] = intraDiv[id][0] + 1 + intraDiv[id][1].append(seq) + + for id, values in intraDiv.items(): + table.append(id,values[0],values[1]) + + + +########### +# +# OUTPUT FUNCTIONS +# +########### + +def printTable(table): + """ + Displays the content a of Table object + Take 1 arguments + 1- Table object + """ + + format = ("%20s | " * len(table.headers))[:-3] + print format % tuple([str(e) for e in table.headers ]) +"\n" + "-"*23*len(table.headers) + for l in table: + print format % tuple([str(e) for e in l ]) + print "# %d results" % len(table) + + +def saveAsCSV(table,path): + """ + Creates a csv file from a Table object + Takes 2 arguments + 1- Table object + 2- path of the file-to-be + """ + file = open(path,'w') + file.write(','.join([str(e) for e in table.headers ]) + "\n") + for l in table: + file.write(','.join([str(e) for e in l ]) + "\n") + file.close() + + +def grepTable(table,col,pattern): + """ + Filters a Table object with regular expression + Takes 3 arguments : + 1- Table object + 2- number of column to match with + 3- regular expression pattern + + Returns a Table object + """ + col = col -1 + p = re.compile(pattern, re.IGNORECASE) + out = ecoTable(table.headers,table.types) + for l in table: + if p.search(l[col]): + out.append(l) + return out + + +########### +# +# GRAPH FUNCTIONS +# +########### + +class EcoGraph(object): + + def __init__(self): + self._styles = getSampleStyleSheet() + + self._element = [] + self._element.append(self._box(Paragraph("EcoPCR report", self._styles['Title']))) + self._element.append(Spacer(0, 0.5 * cm)) + + def _box(self,flt, center=True): + box_style = [('BOX', (0, 0), (-1, -1), 0.5, colors.lightgrey)] + if center: + box_style += [('ALIGN', (0, 0), (-1, -1), 'CENTER')] + return Table([[flt]], style=box_style) + + def _addChart(self,chart,title): + drawing = Drawing(300, 250) + drawing.add(chart) + self._element.append(self._box(Paragraph(title, self._styles['Normal']))) + self._element.append(self._box(drawing)) + self._element.append(Spacer(0, 0.5 * cm)) + + def _formatData(self,table): + data, label = [],[] + for i in range(len(table)): + label.append(table[i][0]) + data.append(table[i][1]) + return data, label + + def makePie(self, table, title): + data, label = self._formatData(table) + pie = Pie() + pie.x = 100 + pie.y = 100 + pie.data = data + pie.labels = label + self._addChart(pie, title) + + def makeHistogram(self, table, title): + data, label = self._formatData(table) + data = [tuple(data)] + + histo = VerticalBarChart() + histo.x = 10 + histo.y = 70 + histo.height = 150 + histo.width = 300 + histo.bars.strokeWidth = 1 + histo.barSpacing = 1 + histo.barLabels.dy = 5 + histo.barLabelFormat = '%d' + histo.barLabels.fontSize = 9 - (len(data[0])/10) + histo.data = data + + histo.categoryAxis.labels.boxAnchor = 'e' + histo.categoryAxis.labels.textAnchor = 'start' + histo.categoryAxis.labels.dx = -40 + histo.categoryAxis.labels.dy = -50 + histo.categoryAxis.labels.angle = 45 + histo.categoryAxis.labels.width = 10 + histo.categoryAxis.labels.height = 4 + histo.categoryAxis.categoryNames = label + histo.categoryAxis.strokeWidth = 1 + histo.categoryAxis.labels.fontSize = 8 + + histo.valueAxis.valueMin = min(data[0])*0.7 + histo.valueAxis.valueMax = max(data[0]) + step = (max(data[0]) - min(data[0])) / 10 + histo.valueAxis.valueStep = step > 1 and step or 1 + + self._addChart(histo, title) + + def makeReport(self,path): + doc = SimpleDocTemplate(path) + doc.build(self._element) + + +###################### + + +def init(): + file = "/Users/bessiere/Documents/workspace/ecoPCR/src/toto.tmp" + oligo = {'o1':'ATGTTTAAAA','o2':'ATGGGGGTATTG'} + + filter = Filter("/ecoPCRDB/gbmam") + + headers = ("oligo", "Num", "LCT Taxid", "Sc Name", "Rank", "distance") + types = (str, int, int, str, str, int) + + o1Table = ecoTable(headers, types) + o2Table = ecoTable(headers, types) + + buildOligoTable(o1Table, file, filter, oligo['o1'], 'direct') + buildOligoTable(o2Table, file, filter, oligo['o2'], 'reverse') + + + taxTable = buildTaxonomicTable(o1Table) + speTable = buildSpecificityTable(o1Table) + + return o1Table, o2Table, taxTable + + + +def start(): + file = "/Users/bessiere/Documents/workspace/ecoPCR/src/toto.tmp" + filter = Filter("/ecoPCRDB/gbmam") + + speHeaders = ("taxid","num of seq","list of seq") + speTypes = (int,int,list) + speTable = ecoTable(speHeaders,speTypes) + + getIntraSpeciesDiversity(speTable, file, filter) + + +