git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/branches/refactoring@81 60f365c0-8329-0410-b2a4-ec073aeeaa1d

2007-06-21 13:58:27 +00:00
parent 379ca7988b
commit d6fcc7673b
1 changed files with 164 additions and 0 deletions
--- a/tools/ecoPCRFilter.py
+++ b/tools/ecoPCRFilter.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+
+import gzip
+import struct
+import sys
+import time
+
+#####
+#
+#
+# Generic file functions
+#
+#
+#####
+
+def universalOpen(file):
+    """Return a readable file object for *file*.
+
+    A string is treated as a path: a name ending in '.gz' is opened
+    with gzip.open, any other name with the builtin open. Anything
+    that is not a string is returned unchanged, on the assumption that
+    it already is an open file.
+    """
+    if not isinstance(file, str):
+        return file
+    if file.endswith('.gz'):
+        return gzip.open(file)
+    return open(file)
+
+def universalTell(file):
+    """Return the current byte offset in the underlying raw file.
+
+    For a gzip.GzipFile the offset is read from the wrapped compressed
+    file, not from the decompressed stream.
+    """
+    if isinstance(file, gzip.GzipFile):
+        # 'fileobj' always refers to the wrapped file, whereas
+        # 'myfileobj' is None when the GzipFile was built around an
+        # already opened file object.
+        file = file.fileobj
+    return file.tell()
+
+def fileSize(file):
+    """Return the size in bytes of the underlying raw file.
+
+    For a gzip.GzipFile this is the size of the wrapped compressed
+    file. The read position of the file is the same after the call as
+    before it.
+    """
+    if isinstance(file, gzip.GzipFile):
+        # 'fileobj' always refers to the wrapped file, whereas
+        # 'myfileobj' is None when the GzipFile was built around an
+        # already opened file object.
+        file = file.fileobj
+    pos = file.tell()
+    try:
+        # Whence 2: seek relative to the end of the file.
+        file.seek(0, 2)
+        length = file.tell()
+    finally:
+        # Put the caller's position back even if measuring failed.
+        file.seek(pos, 0)
+    return length
+
+def progressBar(pos,max,reset=False,delta=[]):
+    """Write a one-line textual progress bar to stderr.
+
+    pos   -- current position (an integer; it also selects the spinner
+             character)
+    max   -- position corresponding to 100 %
+    reset -- when true, restart the timer used for the estimate
+    delta -- [start time, time of the last call]; the mutable default
+             is deliberate: it is the storage that keeps the start
+             time from one call to the next
+
+    The remaining time is extrapolated linearly from the time elapsed
+    since the first call (or since the last reset). While no progress
+    has been made the estimate is shown as '--:--:--'.
+    """
+    if reset:
+        del delta[:]
+    if not delta:
+        now = time.time()
+        delta.append(now)
+        delta.append(now)
+
+    delta[1] = time.time()
+    elapsed = delta[1] - delta[0]
+    if max:
+        percent = float(pos) / max * 100
+    else:
+        # Nothing to process: avoid dividing by zero.
+        percent = 0.0
+    if percent > 0:
+        remain = time.strftime('%H:%M:%S',
+                               time.gmtime(elapsed / percent * (100 - percent)))
+    else:
+        # No progress yet, so there is nothing to extrapolate from.
+        remain = '--:--:--'
+    bar = '#' * int(percent / 2)
+    bar += '|/-\\-'[pos % 5]
+    bar += ' ' * (50 - int(percent / 2))
+    sys.stderr.write('\r%5.1f %% |%s] remain : %s' % (percent, bar, remain))
+
+def endLessIterator(endedlist):
+    """Yield every item of *endedlist*, then its last item forever.
+
+    The last item is looked up again on every step, so the sequence
+    must support indexing and must not be empty.
+    """
+    for item in endedlist:
+        yield item
+    while True:
+        yield endedlist[-1]
+
+class ColumnFile(object):
+    """Iterator splitting the lines of a text stream into columns.
+
+    Each call to next() reads lines from the stream until one does not
+    start with the `skip` character, splits it on `sep` and optionally
+    strips and converts the resulting fields.
+    """
+
+    def __init__(self,stream,sep=None,strip=True,types=None,skip=None):
+        """Build the iterator.
+
+        stream -- a file name (opened for reading) or an object with a
+                  next() method returning lines
+        sep    -- delimiter handed to str.split (None: any whitespace)
+        strip  -- when true, surrounding blanks are removed from fields
+        types  -- sequence of callables converting the fields in order;
+                  the last one is reused for any extra column, and
+                  bool is replaced by ColumnFile.str2bool
+        skip   -- lines whose first character equals this are ignored
+
+        Raises ValueError if stream is neither a string nor an iterator.
+        """
+        if isinstance(stream,str):
+            self._stream = open(stream)
+        elif hasattr(stream,'next'):
+            self._stream = stream
+        else:
+            raise ValueError('stream must be string or an iterator')
+        self._delimiter=sep
+        self._strip=strip
+        if types:
+            self._types=list(types)
+            for i, converter in enumerate(self._types):
+                # bool('F') is True, so bool needs a text-aware stand-in.
+                if converter is bool:
+                    self._types[i]=ColumnFile.str2bool
+        else:
+            self._types=None
+        self._skip = skip
+
+    @staticmethod
+    def str2bool(x):
+        """Convert text to a boolean using its first non-blank character.
+
+        'T' and 'V' (in either case) give True and 'F' gives False. Any
+        other character is evaluated as a Python expression, so a digit
+        gives its truth value and most other characters raise.
+        """
+        # NOTE(review): eval() runs on text coming from the input file.
+        # Only a single upper-cased character is ever evaluated, which
+        # limits the exposure, but an explicit lookup table would be
+        # safer if the accepted spellings can be pinned down.
+        return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
+
+    def __iter__(self):
+        """Return self: the object is its own iterator."""
+        return self
+
+    def next(self):
+        """Return the fields of the next non-skipped line as a list.
+
+        StopIteration raised by the underlying stream is propagated.
+        """
+        ligne = self._stream.next()
+        # Slice instead of indexing: an empty string must not raise
+        # IndexError, it simply does not match the skip character.
+        while ligne[:1] == self._skip:
+            ligne = self._stream.next()
+        data = ligne.split(self._delimiter)
+        # Fields are always stripped before conversion.
+        if self._strip or self._types:
+            data = [x.strip() for x in data]
+        if self._types:
+            # The endless iterator repeats the last converter, so zip()
+            # stops exactly when the fields run out.
+            converters = endLessIterator(self._types)
+            data = [convert(field) for field, convert in zip(data, converters)]
+        return data
+
+
+def ecoPCRResultIterator(file):
+    """Iterate over the rows of an ecoPCR result file.
+
+    *file* is a file name or an open file (see universalOpen). Lines
+    starting with '#' are ignored; every other line is split on '|'
+    into twenty stripped fields which are converted to str or int.
+
+    Yields one dictionary per line, keyed by field name. A line that
+    does not hold exactly twenty fields raises ValueError.
+    """
+    # (field name, converter), in the order of the columns in the file.
+    columns = (('ac', str), ('sq_len', int), ('taxid', int),
+               ('rank', str), ('sp_taxid', int), ('species', str),
+               ('ge_taxid', int), ('genus', str), ('fa_taxid', int),
+               ('family', str), ('sk_taxid', int), ('s_kgdom', str),
+               ('strand', str), ('oligo_1', str), ('error_1', int),
+               ('oligo_2', str), ('error_2', int), ('amp_len', int),
+               ('sq_des', str), ('definition', str))
+    names = tuple([name for name, converter in columns])
+    converters = tuple([converter for name, converter in columns])
+
+    rows = ColumnFile(universalOpen(file),
+                      sep='|',
+                      types=converters,
+                      skip='#')
+
+    for row in rows:
+        if len(row) != len(names):
+            raise ValueError('expected %d fields, found %d'
+                             % (len(names), len(row)))
+        yield dict(zip(names, row))
+
+
+def ecoRecordIterator(file):
+    """Iterate over the raw records of a binary record file.
+
+    *file* is a file name or an open file (see universalOpen). The data
+    start with a big-endian unsigned 32-bit record count; each record
+    is preceded by its own big-endian unsigned 32-bit size.
+
+    Yields the bytes of each record, without the size prefix.
+    """
+    stream = universalOpen(file)
+    remaining = struct.unpack('> I', stream.read(4))[0]
+
+    while remaining > 0:
+        size = struct.unpack('>I', stream.read(4))[0]
+        yield stream.read(size)
+        remaining -= 1
+
+           
+def ecoNameIterator(file):
+    """Iterate over the records of a binary name file.
+
+    Each record starts with four big-endian unsigned 32-bit integers
+    (going by the field names: a scientific-name flag, the name length,
+    the class-name length and the taxon index) followed by the name
+    and the class name stored back to back.
+
+    Yields (name, classname, indextaxid) tuples; classname is whatever
+    follows the name in the record.
+    """
+    for record in ecoRecordIterator(file):
+        # The four integers occupy the first 16 bytes of the record.
+        header, text = record[:16], record[16:]
+        fields = struct.unpack('> I I I I', header)
+        namelength, indextaxid = fields[1], fields[3]
+        yield (text[:namelength], text[namelength:], indextaxid)
+
+
+def ecoTaxonomicIterator(file):
+    """Iterate over the records of a binary taxonomy file.
+
+    Each record starts with four big-endian unsigned 32-bit integers
+    (going by the field names: taxid, rank id, parent index and name
+    length) followed by the name.
+
+    Yields (taxid, rankid, parentidx, name) tuples; name is everything
+    that follows the four integers, the stored length is not used.
+    """
+    for record in ecoRecordIterator(file):
+        # The four integers occupy the first 16 bytes of the record.
+        header, name = record[:16], record[16:]
+        fields = struct.unpack('> I I I I', header)
+        yield (fields[0], fields[1], fields[2], name)
+
+        
+def ecoRankIterator(file=None):
+    """Iterate over the records of a binary rank file.
+
+    The records produced by ecoRecordIterator are yielded unchanged.
+    """
+    records = ecoRecordIterator(file)
+    for rank in records:
+        yield rank