#!/usr/bin/env python
"""Readers for ecoPCR result tables and length-prefixed binary record files.

The code is written so that it parses and runs on both Python 2 and
Python 3 (no ``xrange``, no ``raise X, msg``, no direct ``.next()`` calls).
"""

import gzip
import struct
import sys
import time

#####
#
#
# Generic file function
#
#
#####


def universalOpen(file):
    """Return an open stream for ``file``.

    A ``str`` is treated as a path: names ending in ``.gz`` are opened with
    ``gzip.open``, anything else with the builtin ``open``.  Any other
    object is assumed to already be a stream and is returned unchanged.

    NOTE(review): both calls use their default mode.  On Python 3 that means
    text for plain files and bytes for gzip files, so callers that need a
    specific kind of stream there should pass an already-open stream.
    """
    if not isinstance(file, str):
        return file
    if file.endswith('.gz'):
        return gzip.open(file)
    return open(file)


def _rawStream(file):
    """Return the stream whose position reflects on-disk progress.

    For a ``gzip.GzipFile`` this is the underlying compressed stream; every
    other object is returned as is.
    """
    if not isinstance(file, gzip.GzipFile):
        return file
    # ``myfileobj`` is only populated when GzipFile opened the path itself;
    # when it wraps an existing stream the stream is kept in ``fileobj``.
    raw = getattr(file, 'myfileobj', None)
    if raw is None:
        raw = file.fileobj
    return raw


def universalTell(file):
    """Return the current position in ``file``.

    For a ``gzip.GzipFile`` the position of the underlying compressed
    stream is reported.
    """
    return _rawStream(file).tell()


def fileSize(file):
    """Return the total size of ``file`` without moving its position.

    The size is measured by seeking to the end of the stream and then
    restoring the previous offset.  For a ``gzip.GzipFile`` the underlying
    compressed stream is measured.
    """
    raw = _rawStream(file)
    saved = raw.tell()
    raw.seek(0, 2)
    length = raw.tell()
    raw.seek(saved, 0)
    return length


def progressBar(pos, max, reset=False, delta=[]):
    """Write a one-line progress bar with a remaining-time estimate to stderr.

    :param pos: current position; must be an integer because it also selects
        the spinner character.
    :param max: position that corresponds to 100 %.
    :param reset: when true, restart the elapsed-time measurement.
    :param delta: storage for ``[start time, last update time]``.  The
        mutable default is intentional: it is the state shared between
        successive calls.
    """
    if reset:
        del delta[:]
    now = time.time()
    if not delta:
        delta.extend((now, now))
    delta[1] = now

    elapsed = delta[1] - delta[0]
    # A zero ``max`` means there is nothing to process: report completion
    # instead of dividing by zero.
    percent = float(pos) / max * 100 if max else 100.0

    if percent:
        remaining = elapsed / percent * (100 - percent)
        remain = time.strftime('%H:%M:%S', time.gmtime(remaining))
    else:
        # No progress yet, so no rate to extrapolate from.
        remain = '??:??:??'

    filled = int(percent / 2)
    bar = '#' * filled + '|/-\\-'[pos % 5] + ' ' * (50 - filled)
    sys.stderr.write('\r%5.1f %% |%s] remain : %s' % (percent, bar, remain))


def endLessIterator(endedlist):
    """Yield every item of ``endedlist``, then repeat its last item forever.

    ``endedlist`` must be a non-empty sequence supporting ``[-1]``.
    """
    for item in endedlist:
        yield item
    while True:
        yield endedlist[-1]


class ColumnFile(object):
    """Iterator that splits each line of a stream into a list of columns.

    :param stream: a path (``str``), opened with ``open``, or an iterator
        yielding lines.
    :param sep: column delimiter handed to ``str.split`` (``None`` splits on
        runs of whitespace).
    :param strip: strip surrounding whitespace from every column.
    :param types: optional sequence of converters applied column by column;
        the last converter is reused for any extra columns and ``bool`` is
        replaced by :meth:`str2bool`.  Columns are always stripped before
        conversion.
    :param skip: lines whose first character equals this value are ignored.
    :raises ValueError: if ``stream`` is neither a string nor an iterator.
    """

    def __init__(self, stream, sep=None, strip=True, types=None, skip=None):
        if isinstance(stream, str):
            stream = open(stream)
        elif not (hasattr(stream, 'next') or hasattr(stream, '__next__')):
            raise ValueError('stream must be string or an iterator')
        self._stream = stream
        self._delimiter = sep
        self._strip = strip
        if types:
            self._types = [ColumnFile.str2bool if t is bool else t
                           for t in types]
        else:
            self._types = None
        self._skip = skip

    @staticmethod
    def str2bool(x):
        """Convert a textual flag to ``bool`` using its first character.

        ``T`` and ``V`` (any case) give ``True`` and ``F`` gives ``False``.
        """
        # NOTE(review): eval() on file content.  Only a single upper-cased
        # character is ever evaluated, but a plain lookup table would be
        # safer; kept as is so that the accepted inputs and the exceptions
        # raised stay unchanged.
        return bool(eval(x.strip()[0].upper(),
                         {'T': True, 'V': True, 'F': False}))

    def __iter__(self):
        return self

    def _readLine(self):
        """Fetch the next raw line using whichever iterator protocol exists."""
        stream = self._stream
        fetch = getattr(stream, '__next__', None)
        if fetch is None:
            fetch = stream.next
        return fetch()

    def next(self):
        """Return the columns of the next non-skipped line.

        ``StopIteration`` from the underlying stream propagates unchanged.
        """
        line = self._readLine()
        # ``line[:1]`` rather than ``line[0]`` so an empty string cannot
        # raise IndexError.
        while line[:1] == self._skip:
            line = self._readLine()
        fields = line.split(self._delimiter)
        if self._strip or self._types:
            fields = [field.strip() for field in fields]
        if self._types:
            fields = [convert(field) for field, convert
                      in zip(fields, endLessIterator(self._types))]
        return fields

    # Python 3 iterator protocol.
    __next__ = next


# Column names of an ecoPCR result line, in file order.
_ECOPCR_FIELDS = ('ac', 'sq_len', 'taxid',
                  'rank', 'sp_taxid', 'species',
                  'ge_taxid', 'genus', 'fa_taxid',
                  'family', 'sk_taxid', 's_kgdom',
                  'strand', 'oligo_1', 'error_1',
                  'oligo_2', 'error_2', 'amp_len',
                  'sq_des', 'definition')

# Converter applied to each of the columns above.
_ECOPCR_TYPES = (str, int, int,
                 str, int, str,
                 int, str, int,
                 str, int, str,
                 str, str, int,
                 str, int, int,
                 str, str)


def ecoPCRResultIterator(file):
    """Yield one dict per data line of a ``|``-separated ecoPCR result file.

    Lines starting with ``#`` are skipped.  Every remaining line must hold
    exactly the columns listed in ``_ECOPCR_FIELDS``.

    :param file: path or open line iterator (see :func:`universalOpen`).
    :raises ValueError: if a line does not have the expected column count or
        an integer column cannot be converted.
    """
    stream = universalOpen(file)
    rows = ColumnFile(stream, sep='|', types=_ECOPCR_TYPES, skip='#')
    expected = len(_ECOPCR_FIELDS)
    for fields in rows:
        if len(fields) != expected:
            raise ValueError('expected %d columns, got %d'
                             % (expected, len(fields)))
        yield dict(zip(_ECOPCR_FIELDS, fields))


def ecoRecordIterator(file):
    """Yield the raw records of a length-prefixed binary file.

    The stream starts with a big-endian unsigned 32-bit record count; each
    record is preceded by its own big-endian unsigned 32-bit size.

    :param file: path or open stream (see :func:`universalOpen`).
    """
    stream = universalOpen(file)
    (remaining,) = struct.unpack('> I', stream.read(4))
    # Counter loop: avoids building a list of ``recordCount`` integers on
    # Python 2 while staying valid on Python 3.
    while remaining > 0:
        (recordSize,) = struct.unpack('>I', stream.read(4))
        yield stream.read(recordSize)
        remaining -= 1


def ecoNameIterator(file):
    """Yield ``(name, classname, indextaxid)`` for each record of ``file``.

    Each record holds four big-endian unsigned 32-bit integers followed by
    a byte string; the second integer is the length of ``name`` inside that
    string and the remainder of the string is ``classname``.
    """
    for record in ecoRecordIterator(file):
        lnames = len(record) - 16
        # The first and third header integers are not used here.
        (namelength, _, indextaxid, names) = struct.unpack(
            '> I I I I %ds' % lnames, record)[1:]
        yield (names[:namelength], names[namelength:], indextaxid)


def ecoTaxonomicIterator(file):
    """Yield ``(taxid, rankid, parentidx, name)`` for each record of ``file``.

    Each record holds four big-endian unsigned 32-bit integers followed by
    the name bytes; the fourth integer is not used and ``name`` is the whole
    remainder of the record.
    """
    for record in ecoRecordIterator(file):
        lnames = len(record) - 16
        (taxid, rankid, parentidx, _, name) = struct.unpack(
            '> I I I I %ds' % lnames, record)
        yield (taxid, rankid, parentidx, name)


def ecoRankIterator(file=None):
    """Yield each raw record of ``file`` unchanged."""
    for record in ecoRecordIterator(file):
        yield record