This commit is contained in:
2008-06-02 10:02:43 +00:00
parent 0bf1761df6
commit d5fbfa6e50
7 changed files with 85 additions and 5 deletions

View File

@ -26,7 +26,7 @@ if __name__=='__main__':
(options, entries) = optionParser() (options, entries) = optionParser()
tax = taxonomy.Taxonomy(options.db) tax = taxonomy.Taxonomy(options.db)
seqd= sequence.ecoPCRDBSequenceIterator(options.db,taxonomy=tax) seqd= sequence.EcoPCRDBSequenceIterator(options.db,taxonomy=tax)
ranks = set(x for x in tax.rankIterator()) ranks = set(x for x in tax.rankIterator())

View File

@ -1,6 +1,5 @@
#!/usr/local/bin/python #!/usr/local/bin/python
import fileinput
import re import re
import sys import sys

View File

@ -21,11 +21,14 @@ class SsearchParser(object):
for line in props: for line in props:
subject,tab = line.split('\t') subject,tab = line.split('\t')
tab=tab.split() tab=tab.split()
ac = subject.split()[0] ssp = subject.split()
ac = ssp[0]
dbl= int(ssp[-5][:-1])
ident = float(tab[0]) ident = float(tab[0])
matchlen = int(tab[5]) - int(tab[4]) +1 matchlen = int(tab[5]) - int(tab[4]) +1
self.props.append({"ac" :ac, self.props.append({"ac" :ac,
"identity" :ident, "identity" :ident,
"subjectlength":dbl,
'matchlength' : matchlen}) 'matchlength' : matchlen})
def run(seq,database,program='fasta35',opts=''): def run(seq,database,program='fasta35',opts=''):

View File

@ -0,0 +1,8 @@
from obitools.fasta import fastaNucIterator
from obitools.cns import cnsTag
def cnsFastaIterator(file):
    """Return the iterator built by ``fastaNucIterator`` for *file*.

    ``cnsTag`` is handed to ``fastaNucIterator`` as its second argument.
    """
    return fastaNucIterator(file, cnsTag)

View File

@ -1,12 +1,13 @@
from obitools import NucSequence from obitools import NucSequence
from obitools.ecopcr import EcoPCRDBFile from obitools.ecopcr import EcoPCRDBFile
from obitools.ecopcr.taxonomy import Taxonomy from obitools.ecopcr.taxonomy import Taxonomy
from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
from glob import glob from glob import glob
import struct import struct
import gzip import gzip
class ecoPCRDBSequenceIterator(EcoPCRDBFile): class EcoPCRDBSequenceIterator(EcoPCRDBFile):
def __init__(self,path,taxonomy=None): def __init__(self,path,taxonomy=None):
self._path = path self._path = path
@ -33,3 +34,64 @@ class ecoPCRDBSequenceIterator(EcoPCRDBFile):
for seqfile in self._seqfilesFiles: for seqfile in self._seqfilesFiles:
for seq in self.__ecoSequenceIterator(seqfile): for seq in self.__ecoSequenceIterator(seqfile):
yield seq yield seq
class EcoPCRDBSequenceWriter(object):
    """Write sequences into one ``<dbname>_<fileidx>.sdx`` file.

    The file starts with a big-endian unsigned 32-bit record count,
    followed by one packed record per sequence.  The count is written
    as 0 when the file is created and rewritten with the real value by
    close().  The writer can be used as a context manager so the header
    is finalised deterministically instead of relying on garbage
    collection.
    """

    def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None):
        """Create the sequence file and write a placeholder record count.

        @param dbname: prefix of the database files
        @param fileidx: number used in the ``_NNN.sdx`` file name suffix
        @param taxonomy: object providing ``findIndex(taxid)``, or None to
                         store -1 as the taxon of every record
        @param ftid: forwarded to EcoPCRDBAnnotationWriter; required when
                     type is given
        @param type: when not None, an EcoPCRDBAnnotationWriter is created
                     and fed with every sequence passed to put()
        @param definition: forwarded to EcoPCRDBAnnotationWriter
        """
        # Bound before anything can fail so close()/__del__ stay safe
        # even when the constructor aborts early.
        self._file = None
        self._sequenceCount=0
        # Validate before touching the file system so a bad call does not
        # leave an empty file behind.
        if type is not None:
            assert ftid is not None,"You must specify an id attribute for features"
        self._taxonomy=taxonomy
        self._filename="%s_%03d.sdx" % (dbname,fileidx)
        self._file = open(self._filename,'wb')
        self._file.write(struct.pack('> I',self._sequenceCount))
        if type is not None:
            self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
        else:
            self._annotation = None

    @staticmethod
    def _asBytes(value):
        """Return value unchanged if it is a byte string, else UTF-8 encode it."""
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    def _ecoSeqPacker(self,seq):
        """Return the binary record for seq.

        Layout (big-endian): record size excluding the size field itself,
        taxon index (-1 without taxonomy), identifier padded or truncated
        to 20 bytes, definition length, sequence length, compressed
        length, definition, zlib-compressed sequence.
        """
        compactseq = gzip.zlib.compress(self._asBytes(str(seq)),9)
        cptseqlength = len(compactseq)
        # Lengths are measured on the encoded bytes so the header always
        # matches what is actually written.
        definition = self._asBytes(seq.definition)
        delength = len(definition)
        totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
        if self._taxonomy is None:
            taxon=-1
        else:
            taxon=self._taxonomy.findIndex(seq['taxid'])
        packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength),
                             totalSize,
                             taxon,
                             self._asBytes(seq.id),
                             delength,
                             len(seq),
                             cptseqlength,
                             definition,
                             compactseq)
        assert len(packed) == totalSize+4, "error in sequence packing"
        return packed

    def put(self,sequence):
        """Append sequence to the file and, if configured, to the annotation writer."""
        if self._taxonomy is not None:
            # The original called self.extractTaxon(), which this class does
            # not define; the taxon has to come from the sequence itself.
            # NOTE(review): assumes extractTaxon() fills sequence['taxid'] --
            # confirm against the sequence classes.
            extractTaxon = getattr(sequence, 'extractTaxon', None)
            if extractTaxon is not None:
                extractTaxon()
        self._file.write(self._ecoSeqPacker(sequence))
        if self._annotation is not None:
            self._annotation.put(sequence, self._sequenceCount)
        self._sequenceCount+=1

    def close(self):
        """Rewrite the record count at the start of the file and close it.

        Calling close() more than once is harmless.
        """
        handle = self._file
        if handle is None or handle.closed:
            return
        handle.seek(0,0)
        handle.write(struct.pack('> I',self._sequenceCount))
        handle.close()

    def __enter__(self):
        return self

    def __exit__(self,exc_type,exc_value,traceback):
        self.close()
        return False

    def __del__(self):
        # Kept as a safety net for callers that never call close().
        if getattr(self, '_file', None) is not None:
            self.close()

View File

@ -97,6 +97,9 @@ class Taxonomy(EcoPCRDBFile):
return self._ranks.index(rank) return self._ranks.index(rank)
except ValueError: except ValueError:
return None return None
def findIndex(self,taxid):
    """Return the entry that ``self._index`` holds for *taxid*.

    The lookup is a plain subscript, so whatever error ``self._index``
    raises for an unknown *taxid* propagates to the caller.
    """
    position = self._index[taxid]
    return position
##### #####

View File

@ -21,6 +21,7 @@ class Fast(object):
self._kup = kup self._kup = kup
self._hash= hash self._hash= hash
self._seq = seq
def __call__(self,seq): def __call__(self,seq):
''' '''
@ -35,7 +36,7 @@ class Fast(object):
@rtype: a int tuple (smax,pmax) @rtype: a int tuple (smax,pmax)
''' '''
histo={} histo={}
seq = str(seq) seq = str(seq).upper()
hash= self._hash hash= self._hash
kup = self._kup kup = self._kup
@ -47,5 +48,9 @@ class Fast(object):
smax = max(histo.values()) smax = max(histo.values())
pmax = [x for x in histo if histo[x]==smax] pmax = [x for x in histo if histo[x]==smax]
return smax,pmax return smax,pmax
def __len__(self):
    """Return the length of the sequence stored in ``self._seq``."""
    stored = self._seq
    return len(stored)