This commit is contained in:
2008-06-02 10:02:43 +00:00
parent 0bf1761df6
commit d5fbfa6e50
7 changed files with 85 additions and 5 deletions

View File

@ -26,7 +26,7 @@ if __name__=='__main__':
(options, entries) = optionParser()
tax = taxonomy.Taxonomy(options.db)
seqd= sequence.ecoPCRDBSequenceIterator(options.db,taxonomy=tax)
seqd= sequence.EcoPCRDBSequenceIterator(options.db,taxonomy=tax)
ranks = set(x for x in tax.rankIterator())

View File

@ -1,6 +1,5 @@
#!/usr/local/bin/python
import fileinput
import re
import sys

View File

@ -21,11 +21,14 @@ class SsearchParser(object):
for line in props:
subject,tab = line.split('\t')
tab=tab.split()
ac = subject.split()[0]
ssp = subject.split()
ac = ssp[0]
dbl= int(ssp[-5][:-1])
ident = float(tab[0])
matchlen = int(tab[5]) - int(tab[4]) +1
self.props.append({"ac" :ac,
"identity" :ident,
"subjectlength":dbl,
'matchlength' : matchlen})
def run(seq,database,program='fasta35',opts=''):

View File

@ -0,0 +1,8 @@
from obitools.fasta import fastaNucIterator
from obitools.cns import cnsTag
def cnsFastaIterator(file):
x = fastaNucIterator(file, cnsTag)
return x

View File

@ -1,12 +1,13 @@
from obitools import NucSequence
from obitools.ecopcr import EcoPCRDBFile
from obitools.ecopcr.taxonomy import Taxonomy
from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
from glob import glob
import struct
import gzip
class ecoPCRDBSequenceIterator(EcoPCRDBFile):
class EcoPCRDBSequenceIterator(EcoPCRDBFile):
def __init__(self,path,taxonomy=None):
self._path = path
@ -33,3 +34,64 @@ class ecoPCRDBSequenceIterator(EcoPCRDBFile):
for seqfile in self._seqfilesFiles:
for seq in self.__ecoSequenceIterator(seqfile):
yield seq
class EcoPCRDBSequenceWriter(object):
def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None):
self._taxonomy=taxonomy
self._filename="%s_%03d.sdx" % (dbname,fileidx)
self._file = open(self._filename,'wb')
self._sequenceCount=0
self._file.write(struct.pack('> I',self._sequenceCount))
if type is not None:
assert ftid is not None,"You must specify an id attribute for features"
self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
else:
self._annotation = None
def _ecoSeqPacker(self,seq):
compactseq = gzip.zlib.compress(str(seq),9)
cptseqlength = len(compactseq)
delength = len(seq.definition)
totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
if self._taxonomy is None:
taxon=-1
else:
taxon=self._taxonomy.findIndex(seq['taxid'])
packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength),
totalSize,
taxon,
seq.id,
delength,
len(seq),
cptseqlength,
seq.definition,
compactseq)
assert len(packed) == totalSize+4, "error in sequence packing"
return packed
def put(self,sequence):
if self._taxonomy is not None:
self.extractTaxon()
self._file.write(self._ecoSeqPacker(sequence))
if self._annotation is not None:
self._annotation.put(sequence, self._sequenceCount)
self._sequenceCount+=1
def __del__(self):
self._file.seek(0,0)
self._file.write(struct.pack('> I',self._sequenceCount))
self._file.close()

View File

@ -97,6 +97,9 @@ class Taxonomy(EcoPCRDBFile):
return self._ranks.index(rank)
except ValueError:
return None
def findIndex(self,taxid):
return self._index[taxid]
#####

View File

@ -21,6 +21,7 @@ class Fast(object):
self._kup = kup
self._hash= hash
self._seq = seq
def __call__(self,seq):
'''
@ -35,7 +36,7 @@ class Fast(object):
@rtype: a int tuple (smax,pmax)
'''
histo={}
seq = str(seq)
seq = str(seq).upper()
hash= self._hash
kup = self._kup
@ -47,5 +48,9 @@ class Fast(object):
smax = max(histo.values())
pmax = [x for x in histo if histo[x]==smax]
return smax,pmax
def __len__(self):
return len(self._seq)