This commit is contained in:
2007-06-21 13:58:27 +00:00
parent 379ca7988b
commit d6fcc7673b

164
tools/ecoPCRFilter.py Executable file
View File

@ -0,0 +1,164 @@
#!/usr/bin/env python
import gzip
import struct
import sys
import time
#####
#
#
# Generic file functions
#
#
#####
def universalOpen(file):
    """Return a readable stream for *file*.

    Parameters:
        file: either a path given as a ``str`` or an already opened
              stream-like object.

    Returns:
        * ``gzip.open(file)`` when *file* is a path ending in ``.gz``
          (gzip's default mode is used, i.e. binary reading);
        * ``open(file)`` for any other path (default text read mode);
        * *file* itself, untouched, when it is not a ``str``.

    The caller is responsible for closing the returned stream.
    """
    if not isinstance(file, str):
        # Not a path: assume the caller already holds an open stream.
        return file
    if file.endswith('.gz'):
        return gzip.open(file)
    return open(file)
def universalTell(file):
    """Return the current position in the *physical* file behind *file*.

    For a ``gzip.GzipFile`` the position is taken on the underlying
    (compressed) file object rather than on the decompressed stream; for
    any other object ``file.tell()`` is returned directly.

    Parameters:
        file: an open stream, possibly a ``gzip.GzipFile``.

    Returns:
        The value of ``tell()`` on the underlying file object.
    """
    if isinstance(file, gzip.GzipFile):
        # ``myfileobj`` is only set when GzipFile opened the file itself
        # (from a filename).  When it wraps a caller-supplied stream it is
        # None, and the underlying object lives in ``fileobj`` instead.
        raw = getattr(file, 'myfileobj', None)
        if raw is None:
            raw = file.fileobj
        file = raw
    return file.tell()
def fileSize(file):
    """Return the size, in bytes, of the physical file behind *file*.

    For a ``gzip.GzipFile`` the size of the underlying (compressed) file
    object is measured.  The stream position is saved before seeking to the
    end and restored afterwards, so the call does not disturb reading.

    Parameters:
        file: an open, seekable stream, possibly a ``gzip.GzipFile``.

    Returns:
        The offset of the end of the underlying file object.
    """
    if isinstance(file, gzip.GzipFile):
        # ``myfileobj`` is None when GzipFile wraps a caller-supplied
        # stream; the underlying object is then available as ``fileobj``.
        raw = getattr(file, 'myfileobj', None)
        if raw is None:
            raw = file.fileobj
        file = raw
    saved = file.tell()
    file.seek(0, 2)       # whence=2: relative to the end of the file
    length = file.tell()
    file.seek(saved, 0)   # whence=0: back to the absolute saved offset
    return length
def progressBar(pos,max,reset=False,delta=[]):
    """Draw a one-line text progress bar on ``sys.stderr``.

    Parameters:
        pos:   current position (same unit as *max*).
        max:   position corresponding to 100 %.
        reset: when true, forget the stored start time and restart timing.
        delta: two-slot list ``[start_time, last_time]``.  The mutable
               default is deliberate: it keeps the start time between
               successive calls.  Pass your own list to time several bars
               independently.

    The line is written with a leading carriage return and no newline so
    that successive calls overwrite each other.  The remaining time is a
    linear extrapolation of the elapsed time; it is shown as ``--:--:--``
    while no progress has been made (no estimate is possible then).
    """
    if reset:
        del delta[:]
    now = time.time()
    if not delta:
        delta.extend((now, now))
    delta[1] = now
    elapsed = delta[1] - delta[0]

    # An empty job (max == 0) is reported as complete instead of dividing
    # by zero.
    if max:
        percent = float(pos) / max * 100
    else:
        percent = 100.0

    if percent > 0:
        seconds_left = elapsed / percent * (100 - percent)
        if seconds_left < 0:
            # pos overshot max: never hand a negative time to gmtime().
            seconds_left = 0
        remain = time.strftime('%H:%M:%S', time.gmtime(seconds_left))
    else:
        remain = '--:--:--'

    filled = int(percent / 2)           # the bar is 50 cells wide
    bar = '#' * filled
    bar += '|/-\\-'[int(pos) % 5]       # spinner cell driven by the position
    bar += ' ' * (50 - filled)
    sys.stderr.write('\r%5.1f %% |%s] remain : %s' % (percent, bar, remain))
def endLessIterator(endedlist):
    """Yield every item of *endedlist*, then repeat its last item forever.

    Parameters:
        endedlist: any non-empty iterable (a list, a tuple, a generator...).
                   It does not need to support indexing.

    Yields:
        The items in order, followed by an endless repetition of the last
        one.

    Raises:
        IndexError: once the input is exhausted, if it was empty (there is
                    no last item to repeat).
    """
    missing = object()      # sentinel: distinguishes "no item" from None
    last = missing
    for last in endedlist:
        yield last
    if last is missing:
        raise IndexError('endLessIterator() requires a non-empty iterable')
    while True:
        yield last
class ColumnFile(object):
    """Iterator turning the lines of a delimited text stream into lists.

    Each line is split on a separator, optionally stripped, and optionally
    converted column by column.  The object is its own iterator and works
    with both the Python 2 (``next``) and Python 3 (``__next__``) protocols.
    """

    def __init__(self,stream,sep=None,strip=True,types=None,skip=None):
        """Wrap *stream*.

        Parameters:
            stream: a file name (``str``, opened for reading) or an
                    iterator yielding text lines.
            sep:    separator handed to ``str.split`` (``None`` means any
                    run of whitespace).
            strip:  when true, surrounding whitespace is removed from every
                    field.
            types:  optional sequence of converters, one per column.  When
                    a line has more columns than converters, the last
                    converter is reused.  ``bool`` is replaced by
                    ``ColumnFile.str2bool``.  Fields are always stripped
                    before conversion.
            skip:   optional prefix; lines starting with it are ignored.

        Raises:
            ValueError: if *stream* is neither a ``str`` nor an iterator.
        """
        if isinstance(stream, str):
            self._stream = open(stream)
        elif hasattr(stream, 'next') or hasattr(stream, '__next__'):
            # Python 2 iterators expose next(), Python 3 ones __next__().
            self._stream = stream
        else:
            raise ValueError('stream must be string or an iterator')
        self._delimiter = sep
        self._strip = strip
        if types:
            # bool('False') is True, so plain bool is useless on text.
            self._types = [ColumnFile.str2bool if t is bool else t
                           for t in types]
        else:
            self._types = None
        self._skip = skip

    @staticmethod
    def str2bool(x):
        """Convert a textual flag to ``bool`` using its first character.

        ``T``/``V`` (any case) give True, ``F`` gives False, a digit gives
        the truth value of that digit.  The original implementation ran
        ``eval`` on the character; this explicit lookup gives the same
        results for those inputs without evaluating file content.

        Raises:
            IndexError: if *x* is empty or only whitespace.
            ValueError: if the first character is not a recognised flag.
        """
        flag = x.strip()[0].upper()
        if flag in ('T', 'V'):
            return True
        if flag == 'F':
            return False
        if flag in '0123456789':
            return bool(int(flag))
        raise ValueError('cannot interpret %r as a boolean' % (x,))

    def __iter__(self):
        return self

    def next(self):
        """Return the next non-skipped line as a list of fields.

        Raises:
            StopIteration: when the underlying stream is exhausted.
        """
        ligne = next(self._stream)
        # startswith() is safe on an empty line, unlike indexing [0].
        while self._skip and ligne.startswith(self._skip):
            ligne = next(self._stream)
        data = ligne.split(self._delimiter)
        if self._strip or self._types:
            data = [x.strip() for x in data]
        if self._types:
            converters = self._types
            count = len(converters)
            fallback = converters[-1]   # reused for any extra column
            data = [(converters[i] if i < count else fallback)(field)
                    for i, field in enumerate(data)]
        return data

    __next__ = next     # Python 3 iterator protocol
def ecoPCRResultIterator(file):
    """Iterate over the rows of a '|'-separated result file as dictionaries.

    Parameters:
        file: a path or an open stream, as accepted by ``universalOpen``.

    Yields:
        One ``dict`` per data line, with the keys ``ac``, ``sq_len``,
        ``taxid``, ``rank``, ``sp_taxid``, ``species``, ``ge_taxid``,
        ``genus``, ``fa_taxid``, ``family``, ``sk_taxid``, ``s_kgdom``,
        ``strand``, ``oligo_1``, ``error_1``, ``oligo_2``, ``error_2``,
        ``amp_len``, ``sq_des`` and ``definition``.  Values are ``int`` or
        ``str`` according to the converter tuple below.

    Lines starting with ``#`` are skipped.  Every remaining line must hold
    exactly twenty fields, otherwise the unpacking raises ``ValueError``.
    """
    column_types = (str, int, int,
                    str, int, str,
                    int, str, int,
                    str, int, str,
                    str, str, int,
                    str, int, int,
                    str, str)
    stream = universalOpen(file)
    rows = ColumnFile(stream, sep='|', types=column_types, skip='#')
    for row in rows:
        (ac, sq_len, taxid,
         rank, sp_taxid, species,
         ge_taxid, genus, fa_taxid,
         family, sk_taxid, s_kgdom,
         strand, oligo_1, error_1,
         oligo_2, error_2, amp_len,
         sq_des, definition) = row
        yield dict(ac=ac, sq_len=sq_len, taxid=taxid,
                   rank=rank, sp_taxid=sp_taxid, species=species,
                   ge_taxid=ge_taxid, genus=genus, fa_taxid=fa_taxid,
                   family=family, sk_taxid=sk_taxid, s_kgdom=s_kgdom,
                   strand=strand, oligo_1=oligo_1, error_1=error_1,
                   oligo_2=oligo_2, error_2=error_2, amp_len=amp_len,
                   sq_des=sq_des, definition=definition)
def ecoRecordIterator(file):
    """Iterate over the raw records of a length-prefixed binary file.

    Layout read by this function: a 4-byte big-endian unsigned record
    count, then for each record a 4-byte big-endian unsigned size followed
    by that many bytes of payload.

    Parameters:
        file: a path (``str``) or an open *binary* stream.  Plain paths are
              opened in ``'rb'`` mode so the payload is never altered by
              text-mode translation; ``.gz`` paths go through
              ``universalOpen``.

    Yields:
        The payload of each record, as returned by ``read()``.

    Raises:
        struct.error: if a count or size field cannot be read in full.

    A stream opened here is closed when the iteration ends (or the
    generator is closed); a stream supplied by the caller is left open.
    """
    opened_here = isinstance(file, str)
    if not opened_here:
        stream = file
    elif file.endswith('.gz'):
        stream = universalOpen(file)
    else:
        stream = open(file, 'rb')
    try:
        (recordCount,) = struct.unpack('>I', stream.read(4))
        # Plain counter: works on Python 2 and 3 without building a list
        # of recordCount integers.
        remaining = recordCount
        while remaining > 0:
            (recordSize,) = struct.unpack('>I', stream.read(4))
            yield stream.read(recordSize)
            remaining -= 1
    finally:
        if opened_here:
            stream.close()
def ecoNameIterator(file):
    """Decode the records of *file* into ``(name, classname, indextaxid)``.

    Each record from ``ecoRecordIterator`` is read as four big-endian
    unsigned 32-bit integers (16 bytes) followed by a text area.  The
    second integer gives the length of the name; whatever follows the name
    in the text area is returned as the class name.  The first and third
    integers are unpacked but not used.

    Parameters:
        file: forwarded unchanged to ``ecoRecordIterator``.

    Yields:
        ``(name, classname, indextaxid)`` tuples.
    """
    for raw in ecoRecordIterator(file):
        # Everything after the 16-byte integer header is the text area.
        layout = '> I I I I %ds' % (len(raw) - 16)
        _scientific, name_len, _class_len, taxon_index, text = \
            struct.unpack(layout, raw)
        yield (text[:name_len], text[name_len:], taxon_index)
def ecoTaxonomicIterator(file):
    """Decode the records of *file* into ``(taxid, rankid, parentidx, name)``.

    Each record from ``ecoRecordIterator`` is read as four big-endian
    unsigned 32-bit integers (16 bytes) followed by the name, which takes
    up the rest of the record.  The fourth integer (a name length) is
    unpacked but not used.

    Parameters:
        file: forwarded unchanged to ``ecoRecordIterator``.

    Yields:
        ``(taxid, rankid, parentidx, name)`` tuples.
    """
    for raw in ecoRecordIterator(file):
        # Everything after the 16-byte integer header is the name.
        layout = '> I I I I %ds' % (len(raw) - 16)
        fields = struct.unpack(layout, raw)
        taxid, rankid, parentidx = fields[:3]
        yield (taxid, rankid, parentidx, fields[4])
def ecoRankIterator(file=None):
    """Yield the raw records of *file*, one per rank entry.

    This is a thin pass-through over ``ecoRecordIterator``: records are
    yielded exactly as read, without any decoding.

    Parameters:
        file: forwarded unchanged to ``ecoRecordIterator``.
              NOTE(review): the ``None`` default does not look usable with
              the reader as written - confirm whether callers ever rely
              on it.
    """
    records = ecoRecordIterator(file)
    for rank in records:
        yield rank