2008-06-02 10:02:43 +00:00
parent 0bf1761df6
commit d5fbfa6e50
7 changed files with 85 additions and 5 deletions
--- a/src/ecoTaxStat.py
+++ b/src/ecoTaxStat.py
@ -26,7 +26,7 @@ if __name__=='__main__':
    (options, entries) = optionParser()
    
    tax = taxonomy.Taxonomy(options.db)
-    seqd= sequence.ecoPCRDBSequenceIterator(options.db,taxonomy=tax)
+    seqd= sequence.EcoPCRDBSequenceIterator(options.db,taxonomy=tax)
    
    ranks = set(x for x in tax.rankIterator())
    
--- a/src/fastaGrep.py
+++ b/src/fastaGrep.py
@ -1,6 +1,5 @@
 #!/usr/local/bin/python

-import fileinput
 import re
 import sys

--- a/src/obitools/align/ssearch.py
+++ b/src/obitools/align/ssearch.py
@ -21,11 +21,14 @@ class SsearchParser(object):
            for line in props:
                subject,tab = line.split('\t')
                tab=tab.split()
-                ac = subject.split()[0]
+                ssp = subject.split()
+                ac = ssp[0]
+                dbl= int(ssp[-5][:-1])
                ident = float(tab[0])
                matchlen = int(tab[5]) - int(tab[4]) +1
                self.props.append({"ac"       :ac,
                                   "identity" :ident,
+                                   "subjectlength":dbl,
                                   'matchlength' : matchlen})
                
 def run(seq,database,program='fasta35',opts=''):
--- a/src/obitools/cns/fasta.py
+++ b/src/obitools/cns/fasta.py
@ -0,0 +1,8 @@
+from obitools.fasta import fastaNucIterator
+from obitools.cns import cnsTag
+
+def cnsFastaIterator(file):
+    
+    x = fastaNucIterator(file, cnsTag)
+    
+    return x
--- a/src/obitools/ecopcr/sequence.py
+++ b/src/obitools/ecopcr/sequence.py
@ -1,12 +1,13 @@
 from obitools import NucSequence
 from obitools.ecopcr import EcoPCRDBFile
 from obitools.ecopcr.taxonomy import Taxonomy
+from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
 from glob import glob
 import struct
 import gzip


-class ecoPCRDBSequenceIterator(EcoPCRDBFile):
+class EcoPCRDBSequenceIterator(EcoPCRDBFile):

    def __init__(self,path,taxonomy=None):
        self._path = path
@ -33,3 +34,64 @@ class ecoPCRDBSequenceIterator(EcoPCRDBFile):
        for seqfile in self._seqfilesFiles:
            for seq in self.__ecoSequenceIterator(seqfile):
                yield seq
+                
+class EcoPCRDBSequenceWriter(object):
+    
+    def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None):
+        self._taxonomy=taxonomy
+        self._filename="%s_%03d.sdx" % (dbname,fileidx)
+        self._file = open(self._filename,'wb')
+        self._sequenceCount=0
+        
+        self._file.write(struct.pack('> I',self._sequenceCount))
+        
+
+        
+        if type is not None:
+            assert ftid is not None,"You must specify an id attribute for features"
+            self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
+        else: 
+            self._annotation = None
+        
+    def _ecoSeqPacker(self,seq):
+    
+        compactseq = gzip.zlib.compress(str(seq),9)
+        cptseqlength  = len(compactseq)
+        delength   = len(seq.definition)
+        
+        totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
+        
+        if self._taxonomy is None:
+            taxon=-1
+        else:
+            taxon=self._taxonomy.findIndex(seq['taxid'])
+        
+        packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength),
+                             totalSize,
+                             taxon,
+                             seq.id,
+                             delength,
+                             len(seq),
+                             cptseqlength,
+                             seq.definition,
+                             compactseq)
+        
+        assert len(packed) == totalSize+4, "error in sequence packing"
+    
+        return packed
+
+        
+    def put(self,sequence):
+        if self._taxonomy is not None:
+            self.extractTaxon()
+        self._file.write(self._ecoSeqPacker(sequence))
+        if self._annotation is not None:
+            self._annotation.put(sequence, self._sequenceCount)
+        self._sequenceCount+=1
+        
+    def __del__(self):
+        self._file.seek(0,0)
+        self._file.write(struct.pack('> I',self._sequenceCount))
+        self._file.close()
+        
+    
--- a/src/obitools/ecopcr/taxonomy.py
+++ b/src/obitools/ecopcr/taxonomy.py
@ -97,6 +97,9 @@ class Taxonomy(EcoPCRDBFile):
            return self._ranks.index(rank)
        except ValueError:
            return None
+        
+    def findIndex(self,taxid):
+        return self._index[taxid]


    #####
--- a/src/obitools/fast.py
+++ b/src/obitools/fast.py
@ -21,6 +21,7 @@ class Fast(object):
        
        self._kup = kup
        self._hash= hash
+        self._seq = seq
        
    def __call__(self,seq):
        '''
@ -35,7 +36,7 @@ class Fast(object):
        @rtype: a int tuple (smax,pmax)
        '''
        histo={}
-        seq = str(seq)
+        seq = str(seq).upper()
        hash= self._hash
        kup = self._kup
        
@ -47,5 +48,9 @@ class Fast(object):
        smax = max(histo.values())
        pmax = [x for x in histo if histo[x]==smax]
        return smax,pmax
+    
+    def __len__(self):
+        return len(self._seq)
+