#!/usr/bin/env python
#
# Provenance (from the original patch header):
#   commit  d6fcc7673b773a6fc35dc38e70a6397e47323dfc
#   author  Eric Coissac
#   date    Thu, 21 Jun 2007 13:58:27 +0000
#   git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/branches/refactoring@81 60f365c0-8329-0410-b2a4-ec073aeeaa1d
#   file    tools/ecoPCRFilter.py (new file, mode 100755)
"""Helpers for reading ecoPCR result files and ecoPCR binary record files.

The module provides generic file helpers (transparent gzip opening, position
and size queries, a textual progress bar), a delimiter-separated column
reader (``ColumnFile``) and iterators over ecoPCR result lines and binary
record files.

The code runs unchanged on Python 2.6+ and Python 3.
"""

import gzip
import itertools
import struct
import sys
import time

#####
#
#
# Generic file function
#
#
#####


def universalOpen(file, mode='r'):
    """Open *file* for reading, transparently handling ``.gz`` files.

    Parameters:
        file: a path (``str``) or an already opened stream.  Anything that
            is not a ``str`` is returned untouched.
        mode: ``'r'`` (default, text) or ``'rb'`` (binary).  Only used when
            *file* is a path.

    Returns:
        A readable stream.  A path ending in ``.gz`` is opened through
        ``gzip``; any other path through the builtin ``open``.
    """
    if not isinstance(file, str):
        return file
    if file.endswith('.gz'):
        # On Python 3 gzip streams are binary unless 't' is requested;
        # on Python 2 the default gzip stream already yields ``str`` lines.
        if 'b' in mode or sys.version_info[0] < 3:
            return gzip.open(file, 'rb')
        return gzip.open(file, 'rt')
    return open(file, mode)


def _underlyingStream(file):
    """Return the stream whose offset reflects the bytes stored on disk.

    For a gzip stream (possibly wrapped in a text layer exposing ``buffer``)
    this is the compressed file object; for anything else it is *file*.
    """
    inner = getattr(file, 'buffer', file)
    if isinstance(inner, gzip.GzipFile):
        # ``myfileobj`` is only set when GzipFile opened the path itself;
        # ``fileobj`` also covers a GzipFile built around an existing stream.
        return inner.myfileobj or inner.fileobj
    return file


def universalTell(file):
    """Return the current offset of *file*.

    For gzip streams the offset is measured in the compressed file, so it is
    comparable with the value returned by ``fileSize``.
    """
    return _underlyingStream(file).tell()


def fileSize(file):
    """Return the size in bytes of the (compressed, if gzip) file behind *file*.

    The stream is moved to its end to measure the size and is always moved
    back to the position it had on entry.
    """
    stream = _underlyingStream(file)
    pos = stream.tell()
    try:
        stream.seek(0, 2)
        length = stream.tell()
    finally:
        stream.seek(pos, 0)
    return length


def progressBar(pos, max, reset=False, delta=[]):
    """Write a one-line progress bar to ``sys.stderr``.

    Parameters:
        pos: current position (same unit as *max*).
        max: position corresponding to 100 %.  A zero *max* is displayed as
            100 %.
        reset: when true, restart the elapsed-time measurement.
        delta: two-slot list ``[start_time, last_time]``.  The mutable
            default is intentional: it keeps the start time between calls.

    The remaining time is extrapolated linearly from the elapsed time; it is
    shown as ``--:--:--`` while no progress has been made yet.
    """
    if reset:
        del delta[:]
    now = time.time()
    if not delta:
        delta.extend((now, now))
    delta[1] = now
    elapsed = delta[1] - delta[0]

    percent = float(pos) / max * 100 if max else 100.0
    if percent > 0:
        seconds = elapsed / percent * (100 - percent)
        if seconds < 0:
            # pos ran past max: nothing is left to wait for.
            seconds = 0
        remain = time.strftime('%H:%M:%S', time.gmtime(seconds))
    else:
        # No progress yet: the remaining time cannot be extrapolated.
        remain = '--:--:--'

    filled = int(percent / 2)
    bar = '#' * filled
    bar += '|/-\\-'[int(pos) % 5]
    bar += ' ' * (50 - filled)
    sys.stderr.write('\r%5.1f %% |%s] remain : %s' % (percent, bar, remain))


def endLessIterator(endedlist):
    """Yield every item of *endedlist*, then its last item forever.

    *endedlist* must support negative indexing (``endedlist[-1]``).
    """
    for x in endedlist:
        yield x
    while True:
        yield endedlist[-1]


class ColumnFile(object):
    """Iterator over the rows of a delimiter-separated text stream.

    Each row is returned as a list of fields, optionally stripped and
    converted.

    Parameters:
        stream: a path (``str``), opened with the builtin ``open``, or any
            iterable/iterator yielding text lines.
        sep: field delimiter passed to ``str.split`` (``None`` splits on
            runs of whitespace).
        strip: strip surrounding whitespace from every field.
        types: sequence of callables applied to the fields in order; the
            last one is reused for any extra field.  ``bool`` is replaced by
            ``ColumnFile.str2bool``.  Fields are always stripped before
            conversion.
        skip: lines whose first character equals this value are ignored.

    Raises:
        ValueError: if *stream* is neither a string nor iterable.
    """

    # First-letter codes understood by str2bool.
    # 'V' presumably stands for the French "vrai" -- TODO confirm.
    _BOOL_CODES = {'T': True, 'V': True, 'F': False}

    def __init__(self, stream, sep=None, strip=True, types=None, skip=None):
        if isinstance(stream, str):
            self._stream = open(stream)
        else:
            try:
                self._stream = iter(stream)
            except TypeError:
                raise ValueError('stream must be string or an iterator')
        self._delimiter = sep
        self._strip = strip
        if types:
            self._types = [ColumnFile.str2bool if t is bool else t
                           for t in types]
        else:
            self._types = None
        self._skip = skip

    @staticmethod
    def str2bool(x):
        """Convert a textual flag to ``bool`` using its first character.

        ``T``/``V`` (any case) give ``True``, ``F`` gives ``False`` and a
        digit gives ``bool(int(digit))``.

        Raises:
            ValueError: if the first character is none of the above.
        """
        code = x.strip()[:1].upper()
        if code in ColumnFile._BOOL_CODES:
            return ColumnFile._BOOL_CODES[code]
        if code.isdigit():
            return bool(int(code))
        raise ValueError('cannot interpret %r as a boolean' % (x,))

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next non-skipped row as a list of fields."""
        ligne = next(self._stream)
        # Slicing (not indexing) keeps an empty line from raising IndexError.
        while ligne[:1] == self._skip:
            ligne = next(self._stream)
        data = ligne.split(self._delimiter)
        if self._strip or self._types:
            data = [x.strip() for x in data]
        if self._types:
            converters = endLessIterator(self._types)
            data = [next(converters)(x) for x in data]
        return data

    # Python 2 iterator protocol.
    next = __next__


# Column names of an ecoPCR result line, in file order, with the converter
# applied to each column.
_ECOPCR_COLUMNS = (
    ('ac', str), ('sq_len', int), ('taxid', int),
    ('rank', str), ('sp_taxid', int), ('species', str),
    ('ge_taxid', int), ('genus', str), ('fa_taxid', int),
    ('family', str), ('sk_taxid', int), ('s_kgdom', str),
    ('strand', str), ('oligo_1', str), ('error_1', int),
    ('oligo_2', str), ('error_2', int), ('amp_len', int),
    ('sq_des', str), ('definition', str),
)


def ecoPCRResultIterator(file):
    """Iterate over an ecoPCR result file, yielding one dict per line.

    Parameters:
        file: a path (``.gz`` supported) or an iterable of text lines.

    Lines are split on ``|``; lines starting with ``#`` are skipped.  Each
    yielded dict maps the 20 column names (``ac``, ``sq_len``, ...,
    ``definition``) to the converted column value.

    Raises:
        ValueError: if a line does not hold exactly 20 columns or a numeric
            column cannot be converted to ``int``.
    """
    file = universalOpen(file)
    names = [name for name, _ in _ECOPCR_COLUMNS]
    types = [kind for _, kind in _ECOPCR_COLUMNS]
    data = ColumnFile(file, sep='|', types=types, skip='#')

    for row in data:
        if len(row) != len(names):
            raise ValueError('expected %d columns, found %d'
                             % (len(names), len(row)))
        yield dict(zip(names, row))


def ecoRecordIterator(file):
    """Iterate over the raw records of an ecoPCR binary file.

    Layout read by this function: a big-endian unsigned 32-bit record count,
    then for each record a big-endian unsigned 32-bit size followed by that
    many bytes.

    Parameters:
        file: a path (opened in binary mode, ``.gz`` supported) or an
            already opened binary stream.

    Yields:
        The payload of each record.

    Raises:
        struct.error: if a count/size field is incomplete.
        EOFError: if a record payload is shorter than its declared size.
    """
    file = universalOpen(file, 'rb')
    (recordCount,) = struct.unpack('>I', file.read(4))

    # repeat() avoids building a list of recordCount integers on Python 2.
    for _ in itertools.repeat(None, recordCount):
        (recordSize,) = struct.unpack('>I', file.read(4))
        record = file.read(recordSize)
        if len(record) != recordSize:
            raise EOFError('truncated record: expected %d bytes, got %d'
                           % (recordSize, len(record)))
        yield record


def ecoNameIterator(file):
    """Iterate over a name record file.

    Each record holds four big-endian unsigned 32-bit integers
    (is-scientific-name flag, name length, class length, taxid index)
    followed by the name and the class name concatenated.

    Yields:
        ``(name, classname, indextaxid)`` tuples; *name* and *classname*
        are the raw bytes read from the file.
    """
    for record in ecoRecordIterator(file):
        header = struct.unpack_from('>IIII', record)
        namelength, indextaxid = header[1], header[3]
        names = record[16:]
        yield (names[:namelength], names[namelength:], indextaxid)


def ecoTaxonomicIterator(file):
    """Iterate over a taxonomy record file.

    Each record holds four big-endian unsigned 32-bit integers (taxid,
    rank id, parent index, name length) followed by the name.

    Yields:
        ``(taxid, rankid, parentidx, name)`` tuples; *name* is every byte
        following the 16-byte header.
    """
    for record in ecoRecordIterator(file):
        taxid, rankid, parentidx, _nameLength = struct.unpack_from('>IIII',
                                                                   record)
        yield (taxid, rankid, parentidx, record[16:])


def ecoRankIterator(file=None):
    """Iterate over a rank record file, yielding each raw record.

    *file* must be supplied (path or binary stream); the ``None`` default is
    kept only for signature compatibility.
    """
    for record in ecoRecordIterator(file):
        yield record