This commit is contained in:
@ -26,7 +26,7 @@ if __name__=='__main__':
|
||||
(options, entries) = optionParser()
|
||||
|
||||
tax = taxonomy.Taxonomy(options.db)
|
||||
seqd= sequence.ecoPCRDBSequenceIterator(options.db,taxonomy=tax)
|
||||
seqd= sequence.EcoPCRDBSequenceIterator(options.db,taxonomy=tax)
|
||||
|
||||
ranks = set(x for x in tax.rankIterator())
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
#!/usr/local/bin/python
|
||||
|
||||
import fileinput
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
@ -21,11 +21,14 @@ class SsearchParser(object):
|
||||
for line in props:
|
||||
subject,tab = line.split('\t')
|
||||
tab=tab.split()
|
||||
ac = subject.split()[0]
|
||||
ssp = subject.split()
|
||||
ac = ssp[0]
|
||||
dbl= int(ssp[-5][:-1])
|
||||
ident = float(tab[0])
|
||||
matchlen = int(tab[5]) - int(tab[4]) +1
|
||||
self.props.append({"ac" :ac,
|
||||
"identity" :ident,
|
||||
"subjectlength":dbl,
|
||||
'matchlength' : matchlen})
|
||||
|
||||
def run(seq,database,program='fasta35',opts=''):
|
||||
|
@ -0,0 +1,8 @@
|
||||
from obitools.fasta import fastaNucIterator
|
||||
from obitools.cns import cnsTag
|
||||
|
||||
def cnsFastaIterator(file):
|
||||
|
||||
x = fastaNucIterator(file, cnsTag)
|
||||
|
||||
return x
|
@ -1,12 +1,13 @@
|
||||
from obitools import NucSequence
|
||||
from obitools.ecopcr import EcoPCRDBFile
|
||||
from obitools.ecopcr.taxonomy import Taxonomy
|
||||
from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
|
||||
from glob import glob
|
||||
import struct
|
||||
import gzip
|
||||
|
||||
|
||||
class ecoPCRDBSequenceIterator(EcoPCRDBFile):
|
||||
class EcoPCRDBSequenceIterator(EcoPCRDBFile):
|
||||
|
||||
def __init__(self,path,taxonomy=None):
|
||||
self._path = path
|
||||
@ -33,3 +34,64 @@ class ecoPCRDBSequenceIterator(EcoPCRDBFile):
|
||||
for seqfile in self._seqfilesFiles:
|
||||
for seq in self.__ecoSequenceIterator(seqfile):
|
||||
yield seq
|
||||
|
||||
class EcoPCRDBSequenceWriter(object):
|
||||
|
||||
def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None):
|
||||
self._taxonomy=taxonomy
|
||||
self._filename="%s_%03d.sdx" % (dbname,fileidx)
|
||||
self._file = open(self._filename,'wb')
|
||||
self._sequenceCount=0
|
||||
|
||||
self._file.write(struct.pack('> I',self._sequenceCount))
|
||||
|
||||
|
||||
|
||||
if type is not None:
|
||||
assert ftid is not None,"You must specify an id attribute for features"
|
||||
self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
|
||||
else:
|
||||
self._annotation = None
|
||||
|
||||
def _ecoSeqPacker(self,seq):
|
||||
|
||||
compactseq = gzip.zlib.compress(str(seq),9)
|
||||
cptseqlength = len(compactseq)
|
||||
delength = len(seq.definition)
|
||||
|
||||
totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
|
||||
|
||||
if self._taxonomy is None:
|
||||
taxon=-1
|
||||
else:
|
||||
taxon=self._taxonomy.findIndex(seq['taxid'])
|
||||
|
||||
packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength),
|
||||
totalSize,
|
||||
taxon,
|
||||
seq.id,
|
||||
delength,
|
||||
len(seq),
|
||||
cptseqlength,
|
||||
seq.definition,
|
||||
compactseq)
|
||||
|
||||
assert len(packed) == totalSize+4, "error in sequence packing"
|
||||
|
||||
return packed
|
||||
|
||||
|
||||
def put(self,sequence):
|
||||
if self._taxonomy is not None:
|
||||
self.extractTaxon()
|
||||
self._file.write(self._ecoSeqPacker(sequence))
|
||||
if self._annotation is not None:
|
||||
self._annotation.put(sequence, self._sequenceCount)
|
||||
self._sequenceCount+=1
|
||||
|
||||
def __del__(self):
|
||||
self._file.seek(0,0)
|
||||
self._file.write(struct.pack('> I',self._sequenceCount))
|
||||
self._file.close()
|
||||
|
||||
|
||||
|
@ -97,6 +97,9 @@ class Taxonomy(EcoPCRDBFile):
|
||||
return self._ranks.index(rank)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
def findIndex(self,taxid):
|
||||
return self._index[taxid]
|
||||
|
||||
|
||||
#####
|
||||
|
@ -21,6 +21,7 @@ class Fast(object):
|
||||
|
||||
self._kup = kup
|
||||
self._hash= hash
|
||||
self._seq = seq
|
||||
|
||||
def __call__(self,seq):
|
||||
'''
|
||||
@ -35,7 +36,7 @@ class Fast(object):
|
||||
@rtype: a int tuple (smax,pmax)
|
||||
'''
|
||||
histo={}
|
||||
seq = str(seq)
|
||||
seq = str(seq).upper()
|
||||
hash= self._hash
|
||||
kup = self._kup
|
||||
|
||||
@ -47,5 +48,9 @@ class Fast(object):
|
||||
smax = max(histo.values())
|
||||
pmax = [x for x in histo if histo[x]==smax]
|
||||
return smax,pmax
|
||||
|
||||
def __len__(self):
|
||||
return len(self._seq)
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user