Patch several bugs or inconsistencies following the tutorial at Anthony's lab
This commit is contained in:
@ -1,11 +1,9 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||||
<?eclipse-pydev version="1.0"?>
|
<?eclipse-pydev version="1.0"?><pydev_project>
|
||||||
|
|
||||||
<pydev_project>
|
|
||||||
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
|
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
|
||||||
<path>/OBITools-1.0/src</path>
|
<path>/OBITools-1.0/src</path>
|
||||||
<path>/OBITools-1.0/textwrangler</path>
|
<path>/OBITools-1.0/textwrangler</path>
|
||||||
</pydev_pathproperty>
|
</pydev_pathproperty>
|
||||||
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
|
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
|
||||||
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python2.7</pydev_property>
|
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python-2.7</pydev_property>
|
||||||
</pydev_project>
|
</pydev_project>
|
||||||
|
@ -17,7 +17,7 @@ Standard output format
|
|||||||
Generating an ecoPCR database
|
Generating an ecoPCR database
|
||||||
.............................
|
.............................
|
||||||
|
|
||||||
.. cmdoption:: --ecopcrDB-output=<PREFIX_FILENAME>
|
.. cmdoption:: --ecopcrdb-output=<PREFIX_FILENAME>
|
||||||
|
|
||||||
Creates an ecoPCR database from sequence records results
|
Creates an ecoPCR database from sequence records results
|
||||||
|
|
||||||
|
31
setup.py
31
setup.py
@ -6,9 +6,16 @@
|
|||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from setuptools.core import setup
|
from setuptools import setup
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from distutils.core import setup
|
import ez_setup
|
||||||
|
ez_setup.use_setuptools()
|
||||||
|
from setuptools import setup
|
||||||
|
|
||||||
|
# try:
|
||||||
|
# from setuptools.core import setup
|
||||||
|
# except ImportError:
|
||||||
|
# from distutils.core import setup
|
||||||
from distutils.extension import Extension
|
from distutils.extension import Extension
|
||||||
from distutils.util import convert_path
|
from distutils.util import convert_path
|
||||||
from distutils import log
|
from distutils import log
|
||||||
@ -39,6 +46,10 @@ import glob
|
|||||||
|
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
|
# requires = ['Cython>=0.20', 'Sphinx>=1.2']
|
||||||
|
requires = ['Cython>=0.20']
|
||||||
|
|
||||||
|
|
||||||
class install_scripts(ori_install_scripts):
|
class install_scripts(ori_install_scripts):
|
||||||
|
|
||||||
def remove_deprecated_script(self):
|
def remove_deprecated_script(self):
|
||||||
@ -209,7 +220,7 @@ def findC(root,base=None,pyrexs=None):
|
|||||||
#from obitools.version import version as obiversion
|
#from obitools.version import version as obiversion
|
||||||
#sys.path.pop(0)
|
#sys.path.pop(0)
|
||||||
|
|
||||||
VERSION = "1.0.beta"
|
VERSION = "1.0.beta.2"
|
||||||
AUTHOR = 'Eric Coissac'
|
AUTHOR = 'Eric Coissac'
|
||||||
EMAIL = 'eric@coissac.eu'
|
EMAIL = 'eric@coissac.eu'
|
||||||
URL = 'www.grenoble.prabi.fr/trac/OBITools'
|
URL = 'www.grenoble.prabi.fr/trac/OBITools'
|
||||||
@ -242,6 +253,18 @@ else:
|
|||||||
|
|
||||||
setup(name="OBITools",
|
setup(name="OBITools",
|
||||||
description="Scripts and library for sequence analysis",
|
description="Scripts and library for sequence analysis",
|
||||||
|
classifiers=[
|
||||||
|
'Development Status :: 5 - Production/Stable',
|
||||||
|
'Environment :: Console',
|
||||||
|
'Intended Audience :: Research',
|
||||||
|
'License :: CeCILL-V2',
|
||||||
|
'Operating System :: Unix like',
|
||||||
|
'Programming Language :: Python',
|
||||||
|
'Programming Language :: Python :: 2',
|
||||||
|
'Topic :: NGS Data processing',
|
||||||
|
'Topic :: DNA metabarcoding',
|
||||||
|
'Topic :: Utilities',
|
||||||
|
],
|
||||||
version=VERSION,
|
version=VERSION,
|
||||||
author=AUTHOR,
|
author=AUTHOR,
|
||||||
author_email=EMAIL,
|
author_email=EMAIL,
|
||||||
@ -251,7 +274,7 @@ setup(name="OBITools",
|
|||||||
package_dir = {'': SRC},
|
package_dir = {'': SRC},
|
||||||
packages=findPackage(SRC),
|
packages=findPackage(SRC),
|
||||||
cmdclass = {'build_ext': build_ext,'build_scripts':build_scripts, 'install_scripts':install_scripts},
|
cmdclass = {'build_ext': build_ext,'build_scripts':build_scripts, 'install_scripts':install_scripts},
|
||||||
requires=['Cython (>=0.16)'],
|
install_requires=requires,
|
||||||
zip_safe = False,
|
zip_safe = False,
|
||||||
ext_modules=EXTENTION)
|
ext_modules=EXTENTION)
|
||||||
|
|
||||||
|
@ -202,8 +202,8 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
localdata=False
|
localdata=False
|
||||||
|
|
||||||
if options.write != '' :
|
# if options.write != '' :
|
||||||
options.write = open(options.write, 'w')
|
# options.write = open(options.write, 'w')
|
||||||
|
|
||||||
for t in options.newtaxon:
|
for t in options.newtaxon:
|
||||||
tx = t.split(':')
|
tx = t.split(':')
|
||||||
|
@ -1,296 +1,69 @@
|
|||||||
|
from obitools import utils
|
||||||
|
from obitools import NucSequence
|
||||||
|
from obitools.utils import universalOpen, universalTell, fileSize, progressBar
|
||||||
|
import struct
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
import time
|
|
||||||
import re
|
|
||||||
import shelve
|
|
||||||
|
|
||||||
from threading import Lock
|
class EcoPCRFile(utils.ColumnFile):
|
||||||
from logging import warning
|
def __init__(self,stream):
|
||||||
import urllib2
|
utils.ColumnFile.__init__(self,
|
||||||
|
stream, '|', True,
|
||||||
|
(str,int,int,
|
||||||
|
str,int,str,
|
||||||
|
int,str,int,
|
||||||
|
str,int,str,
|
||||||
|
str,str,int,float,
|
||||||
|
str,int,float,
|
||||||
|
int,
|
||||||
|
str,str), "#")
|
||||||
|
|
||||||
from obitools.gzip import GzipFile
|
|
||||||
from obitools.zipfile import ZipFile
|
|
||||||
import os.path
|
|
||||||
|
|
||||||
from _utils import FakeFile
|
|
||||||
from _utils import progressBar
|
|
||||||
|
|
||||||
try:
|
|
||||||
from collections import Counter
|
|
||||||
except ImportError:
|
|
||||||
from obitools.collections import Counter
|
|
||||||
|
|
||||||
|
|
||||||
class FileFormatError(Exception):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def universalOpen(file,*options):
|
|
||||||
'''
|
|
||||||
Open a file gzipped or not.
|
|
||||||
|
|
||||||
If file is a C{str} instance, file is
|
|
||||||
considered as a file name. In this case
|
|
||||||
the C{.gz} suffixe is tested to eventually
|
|
||||||
open it as a gzipped file.
|
|
||||||
|
|
||||||
If file is another kind of object, it is assumed
|
|
||||||
that this object follows the C{file} interface
|
|
||||||
and it is returned as is.
|
|
||||||
|
|
||||||
@param file: the file to open
|
|
||||||
@type file: C{str} or a file like object
|
|
||||||
|
|
||||||
@return: an iterator on text lines.
|
|
||||||
'''
|
|
||||||
if isinstance(file,str):
|
|
||||||
try:
|
|
||||||
if urllib2.urlparse.urlparse(file)[0]=='':
|
|
||||||
rep = open(file,*options)
|
|
||||||
else:
|
|
||||||
rep = urllib2.urlopen(file,timeout=15)
|
|
||||||
|
|
||||||
if file[-3:] == '.gz':
|
|
||||||
rep = GzipFile(fileobj=rep)
|
|
||||||
if file[-4:] == '.zip':
|
|
||||||
zip = ZipFile(file=rep)
|
|
||||||
data = zip.infolist()
|
|
||||||
assert len(data)==1,'Only zipped file containning a single file can be open'
|
|
||||||
name = data[0].filename
|
|
||||||
rep = zip.open(name)
|
|
||||||
except Exception as e:
|
|
||||||
print>>sys.stderr, e
|
|
||||||
sys.exit();
|
|
||||||
else:
|
|
||||||
rep = file
|
|
||||||
return rep
|
|
||||||
|
|
||||||
def universalTell(file):
|
|
||||||
'''
|
|
||||||
Return the position in the file even if
|
|
||||||
it is a gziped one.
|
|
||||||
|
|
||||||
@param file: the file to check
|
|
||||||
@type file: a C{file} like instance
|
|
||||||
|
|
||||||
@return: position in the file
|
|
||||||
@rtype: C{int}
|
|
||||||
'''
|
|
||||||
if isinstance(file, GzipFile):
|
|
||||||
file=file.myfileobj
|
|
||||||
return file.tell()
|
|
||||||
|
|
||||||
def fileSize(file):
|
|
||||||
'''
|
|
||||||
Return the file size even if it is a
|
|
||||||
gziped one.
|
|
||||||
|
|
||||||
@param file: the file to check
|
|
||||||
@type file: a C{file} like instance
|
|
||||||
|
|
||||||
@return: the size of the file
|
|
||||||
@rtype: C{int}
|
|
||||||
'''
|
|
||||||
if isinstance(file, GzipFile):
|
|
||||||
file=file.myfileobj
|
|
||||||
pos = file.tell()
|
|
||||||
file.seek(0,2)
|
|
||||||
length = file.tell()
|
|
||||||
file.seek(pos,0)
|
|
||||||
return length
|
|
||||||
|
|
||||||
|
|
||||||
def endLessIterator(endedlist):
|
|
||||||
for x in endedlist:
|
|
||||||
yield x
|
|
||||||
while(1):
|
|
||||||
yield endedlist[-1]
|
|
||||||
|
|
||||||
|
|
||||||
def multiLineWrapper(lineiterator):
|
|
||||||
'''
|
|
||||||
Aggregator of strings.
|
|
||||||
|
|
||||||
@param lineiterator: a stream of strings from an opened OBO file.
|
|
||||||
@type lineiterator: a stream of strings.
|
|
||||||
|
|
||||||
@return: an aggregated stanza.
|
|
||||||
@rtype: an iterator on str
|
|
||||||
|
|
||||||
@note: The aggregator aggregates strings from an opened OBO file.
|
|
||||||
When the length of a string is < 2, the current stanza is over.
|
|
||||||
'''
|
|
||||||
|
|
||||||
for line in lineiterator:
|
|
||||||
rep = [line]
|
|
||||||
while len(line)>=2 and line[-2]=='\\':
|
|
||||||
rep[-1]=rep[-1][0:-2]
|
|
||||||
try:
|
|
||||||
line = lineiterator.next()
|
|
||||||
except StopIteration:
|
|
||||||
raise FileFormatError
|
|
||||||
rep.append(line)
|
|
||||||
yield ''.join(rep)
|
|
||||||
|
|
||||||
|
|
||||||
def skipWhiteLineIterator(lineiterator):
|
|
||||||
'''
|
|
||||||
Curator of stanza.
|
|
||||||
|
|
||||||
@param lineiterator: a stream of strings from an opened OBO file.
|
|
||||||
@type lineiterator: a stream of strings.
|
|
||||||
|
|
||||||
@return: a stream of strings without blank strings.
|
|
||||||
@rtype: a stream of strings
|
|
||||||
|
|
||||||
@note: The curator skips white lines of the current stanza.
|
|
||||||
'''
|
|
||||||
|
|
||||||
for line in lineiterator:
|
|
||||||
cleanline = line.strip()
|
|
||||||
if cleanline:
|
|
||||||
yield line
|
|
||||||
else:
|
|
||||||
print 'skipped'
|
|
||||||
|
|
||||||
|
|
||||||
class ColumnFile(object):
|
|
||||||
|
|
||||||
def __init__(self,stream,sep=None,strip=True,
|
|
||||||
types=None,skip=None,head=None,
|
|
||||||
extra=None,
|
|
||||||
extraformat='([a-zA-Z]\w*) *= *([^;]+);'):
|
|
||||||
self._stream = universalOpen(stream)
|
|
||||||
self._delimiter=sep
|
|
||||||
self._strip=strip
|
|
||||||
self._extra=extra
|
|
||||||
self._extraformat = re.compile(extraformat)
|
|
||||||
|
|
||||||
if types:
|
|
||||||
self._types=[x for x in types]
|
|
||||||
for i in xrange(len(self._types)):
|
|
||||||
if self._types[i] is bool:
|
|
||||||
self._types[i]=ColumnFile.str2bool
|
|
||||||
else:
|
|
||||||
self._types=None
|
|
||||||
|
|
||||||
self._skip = skip
|
|
||||||
if skip is not None:
|
|
||||||
self._lskip= len(skip)
|
|
||||||
else:
|
|
||||||
self._lskip= 0
|
|
||||||
self._head=head
|
|
||||||
|
|
||||||
def str2bool(x):
|
|
||||||
return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
|
|
||||||
|
|
||||||
str2bool = staticmethod(str2bool)
|
|
||||||
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return self
|
|
||||||
|
|
||||||
def next(self):
|
def next(self):
|
||||||
|
data = utils.ColumnFile.next(self)
|
||||||
|
seq = NucSequence(data[0],data[20],data[21])
|
||||||
|
seq['seq_length_ori']=data[1]
|
||||||
|
seq['taxid']=data[2]
|
||||||
|
seq['rank']=data[3]
|
||||||
|
seq['species']=data[4]
|
||||||
|
seq['species_name']=data[5]
|
||||||
|
seq['genus']=data[6]
|
||||||
|
seq['genus_name']=data[7]
|
||||||
|
seq['family']=data[8]
|
||||||
|
seq['family_name']=data[9]
|
||||||
|
seq['strand']=data[12]
|
||||||
|
seq['forward_match']=data[13]
|
||||||
|
seq['forward_error']=data[14]
|
||||||
|
seq['forward_tm']=data[15]
|
||||||
|
seq['reverse_match']=data[16]
|
||||||
|
seq['reverse_error']=data[17]
|
||||||
|
seq['reverse_tm']=data[18]
|
||||||
|
|
||||||
def cast(txt,type):
|
|
||||||
try:
|
|
||||||
v = type(txt)
|
|
||||||
except:
|
|
||||||
v=None
|
|
||||||
return v
|
|
||||||
ligne = self._stream.next()
|
|
||||||
if self._skip is not None:
|
|
||||||
while ligne[0:self._lskip]==self._skip:
|
|
||||||
ligne = self._stream.next()
|
|
||||||
if self._extra is not None:
|
|
||||||
try:
|
|
||||||
(ligne,extra) = ligne.rsplit(self._extra,1)
|
|
||||||
extra = dict(self._extraformat.findall(extra))
|
|
||||||
except ValueError:
|
|
||||||
extra=None
|
|
||||||
else:
|
|
||||||
extra = None
|
|
||||||
data = ligne.split(self._delimiter)
|
|
||||||
if self._strip or self._types:
|
|
||||||
data = [x.strip() for x in data]
|
|
||||||
if self._types:
|
|
||||||
it = endLessIterator(self._types)
|
|
||||||
data = [cast(*x) for x in ((y,it.next()) for y in data)]
|
|
||||||
if self._head is not None:
|
|
||||||
data=dict(map(None, self._head,data))
|
|
||||||
if extra is not None:
|
|
||||||
data['__extra__']=extra
|
|
||||||
else:
|
|
||||||
if extra is not None:
|
|
||||||
data.append(extra)
|
|
||||||
return data
|
|
||||||
|
|
||||||
def tell(self):
|
|
||||||
return universalTell(self._stream)
|
|
||||||
|
|
||||||
|
|
||||||
class CachedDB(object):
|
|
||||||
|
|
||||||
def __init__(self,cachefile,masterdb):
|
|
||||||
self._cache = shelve.open(cachefile,'c')
|
|
||||||
self._db = masterdb
|
|
||||||
self._lock=Lock()
|
|
||||||
|
|
||||||
def _cacheSeq(self,seq):
|
|
||||||
self._lock.acquire()
|
|
||||||
self._cache[seq.id]=seq
|
|
||||||
self._lock.release()
|
|
||||||
return seq
|
return seq
|
||||||
|
|
||||||
def __getitem__(self,ac):
|
|
||||||
if isinstance(ac,str):
|
|
||||||
self._lock.acquire()
|
|
||||||
if ac in self._cache:
|
|
||||||
# print >>sys.stderr,"Use cache for %s" % ac
|
|
||||||
data = self._cache[ac]
|
|
||||||
self._lock.release()
|
|
||||||
|
|
||||||
else:
|
|
||||||
self._lock.release()
|
class EcoPCRDBFile(object):
|
||||||
data = self._db[ac]
|
|
||||||
self._cacheSeq(data)
|
def _ecoRecordIterator(self,file,noError=False):
|
||||||
return data
|
file = universalOpen(file,noError)
|
||||||
|
(recordCount,) = struct.unpack('> I',file.read(4))
|
||||||
|
self._recover=False
|
||||||
|
|
||||||
|
if recordCount:
|
||||||
|
for i in xrange(recordCount):
|
||||||
|
(recordSize,)=struct.unpack('>I',file.read(4))
|
||||||
|
record = file.read(recordSize)
|
||||||
|
yield record
|
||||||
else:
|
else:
|
||||||
self._lock.acquire()
|
print >> sys.stderr,"\n\n WARNING : EcoPCRDB reading set into recover data mode\n"
|
||||||
acs = [[x,self._cache.get(x,None)] for x in ac]
|
self._recover=True
|
||||||
self._lock.release()
|
ok=True
|
||||||
newacs = [ac for ac,cached in acs if cached is None]
|
while(ok):
|
||||||
if newacs:
|
try:
|
||||||
newseqs = self._db[newacs]
|
(recordSize,)=struct.unpack('>I',file.read(4))
|
||||||
else:
|
record = file.read(recordSize)
|
||||||
newseqs = iter([])
|
yield record
|
||||||
for r in acs:
|
except:
|
||||||
if r[1] is None:
|
ok=False
|
||||||
r[1]=self._cacheSeq(newseqs.next())
|
|
||||||
# else:
|
|
||||||
# print >>sys.stderr,"Use cache for %s" % r[0]
|
|
||||||
return (x[1] for x in acs)
|
|
||||||
|
|
||||||
|
|
||||||
def moduleInDevelopment(name):
|
|
||||||
Warning('This module %s is under development : use it with caution' % name)
|
|
||||||
|
|
||||||
|
|
||||||
def deprecatedScript(newscript):
|
|
||||||
current = sys.argv[0]
|
|
||||||
print >>sys.stderr," "
|
|
||||||
print >>sys.stderr," "
|
|
||||||
print >>sys.stderr," "
|
|
||||||
print >>sys.stderr,"#########################################################"
|
|
||||||
print >>sys.stderr,"# #"
|
|
||||||
print >>sys.stderr," W A R N I N G :"
|
|
||||||
print >>sys.stderr," %s is a deprecated script " % os.path.split(current)[1]
|
|
||||||
print >>sys.stderr," it will disappear in the next obitools version"
|
|
||||||
print >>sys.stderr," "
|
|
||||||
print >>sys.stderr," The new corresponding command is %s " % newscript
|
|
||||||
print >>sys.stderr,"# #"
|
|
||||||
print >>sys.stderr,"#########################################################"
|
|
||||||
print >>sys.stderr," "
|
|
||||||
print >>sys.stderr," "
|
|
||||||
print >>sys.stderr," "
|
|
||||||
|
@ -70,10 +70,9 @@ def loadTaxonomyDatabase(options):
|
|||||||
|
|
||||||
if isinstance(options.taxonomy, Taxonomy):
|
if isinstance(options.taxonomy, Taxonomy):
|
||||||
return options.taxonomy
|
return options.taxonomy
|
||||||
#taxonomy = ecobarcodeDatabaseConnection(options)
|
|
||||||
taxonomy = None
|
taxonomy = None
|
||||||
if (taxonomy is not None or
|
if (options.taxonomy is not None or
|
||||||
options.taxonomy is not None or
|
|
||||||
options.taxdump is not None):
|
options.taxdump is not None):
|
||||||
if options.taxdump is not None:
|
if options.taxdump is not None:
|
||||||
taxonomy = TaxonomyDump(options.taxdump)
|
taxonomy = TaxonomyDump(options.taxdump)
|
||||||
|
@ -1,12 +1,14 @@
|
|||||||
from obitools import NucSequence
|
from obitools import NucSequence
|
||||||
from obitools.ecopcr import EcoPCRDBFile
|
from obitools.ecopcr import EcoPCRDBFile
|
||||||
from obitools.ecopcr.taxonomy import EcoTaxonomyDB, ecoTaxonomyWriter
|
from obitools.ecopcr.taxonomy import EcoTaxonomyDB, ecoTaxonomyWriter
|
||||||
|
from obitools.ecopcr.options import loadTaxonomyDatabase
|
||||||
from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
|
from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
|
||||||
from obitools.utils import universalOpen
|
from obitools.utils import universalOpen
|
||||||
from glob import glob
|
from glob import glob
|
||||||
import struct
|
import struct
|
||||||
import gzip
|
import gzip
|
||||||
import sys
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
class EcoPCRDBSequenceIterator(EcoPCRDBFile):
|
class EcoPCRDBSequenceIterator(EcoPCRDBFile):
|
||||||
@ -40,11 +42,11 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
|
|||||||
for record in self._ecoRecordIterator(file):
|
for record in self._ecoRecordIterator(file):
|
||||||
lrecord = len(record)
|
lrecord = len(record)
|
||||||
lnames = lrecord - (4*4+20)
|
lnames = lrecord - (4*4+20)
|
||||||
(taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
|
(taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record) # @UnusedVariable
|
||||||
seqid=seqid.strip('\x00')
|
seqid=seqid.strip('\x00')
|
||||||
de = string[:deflength]
|
de = string[:deflength]
|
||||||
seq = gzip.zlib.decompress(string[deflength:])
|
seq = gzip.zlib.decompress(string[deflength:])
|
||||||
bioseq = NucSequence(seqid,seq,de,taxidx=taxid,taxid=self._taxonomy._taxonomy[taxid][0])
|
bioseq = NucSequence(seqid,seq,de,taxid=self._taxonomy._taxonomy[taxid][0])
|
||||||
yield bioseq
|
yield bioseq
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
@ -54,8 +56,26 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
|
|||||||
|
|
||||||
class EcoPCRDBSequenceWriter(object):
|
class EcoPCRDBSequenceWriter(object):
|
||||||
|
|
||||||
def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None,append=False):
|
def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False):
|
||||||
self._taxonomy=taxonomy
|
|
||||||
|
# Take care of the taxonomy associated to the database
|
||||||
|
|
||||||
|
self._taxonomy= loadTaxonomyDatabase(options)
|
||||||
|
dbname=options.ecopcroutput
|
||||||
|
|
||||||
|
if (self._taxonomy is not None
|
||||||
|
and (not hasattr(options,'ecodb') or options.ecodb!=dbname)):
|
||||||
|
print >> sys.stderr,"Writing the taxonomy file...",
|
||||||
|
ecoTaxonomyWriter(dbname,self._taxonomy)
|
||||||
|
print >> sys.stderr,"Ok"
|
||||||
|
|
||||||
|
# Identify the next sequence file number
|
||||||
|
if fileidx is None:
|
||||||
|
p = re.compile(r'([0-9]{3})\.sdx')
|
||||||
|
fileidx = max(list(int(p.search(i).group(1))
|
||||||
|
for i in glob('%s_[0-9][0-9][0-9].sdx' % dbname))+[0]
|
||||||
|
) +1
|
||||||
|
|
||||||
self._filename="%s_%03d.sdx" % (dbname,fileidx)
|
self._filename="%s_%03d.sdx" % (dbname,fileidx)
|
||||||
if append:
|
if append:
|
||||||
mode ='r+b'
|
mode ='r+b'
|
||||||
@ -73,11 +93,6 @@ class EcoPCRDBSequenceWriter(object):
|
|||||||
self._file = open(self._filename,mode)
|
self._file = open(self._filename,mode)
|
||||||
self._file.write(struct.pack('> I',self._sequenceCount))
|
self._file.write(struct.pack('> I',self._sequenceCount))
|
||||||
|
|
||||||
if self._taxonomy is not None:
|
|
||||||
print >> sys.stderr,"Writing the taxonomy file...",
|
|
||||||
ecoTaxonomyWriter(dbname,self._taxonomy)
|
|
||||||
print >> sys.stderr,"Ok"
|
|
||||||
|
|
||||||
if type is not None:
|
if type is not None:
|
||||||
assert ftid is not None,"You must specify an id attribute for features"
|
assert ftid is not None,"You must specify an id attribute for features"
|
||||||
self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
|
self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
|
||||||
|
@ -329,10 +329,10 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
|
|||||||
|
|
||||||
try :
|
try :
|
||||||
lt=0
|
lt=0
|
||||||
for record in self._ecoRecordIterator(self._localTaxonFile):
|
for record in self._ecoRecordIterator(self._localTaxonFile,noError=True):
|
||||||
lrecord = len(record)
|
lrecord = len(record)
|
||||||
lnames = lrecord - 16
|
lnames = lrecord - 16
|
||||||
(taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
|
(taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record) # @UnusedVariable
|
||||||
lt+=1
|
lt+=1
|
||||||
yield (taxid,rankid,parentidx,name,'local')
|
yield (taxid,rankid,parentidx,name,'local')
|
||||||
print >> sys.stderr, " [INFO : Local taxon file found] : %d added taxa" % lt
|
print >> sys.stderr, " [INFO : Local taxon file found] : %d added taxa" % lt
|
||||||
@ -344,7 +344,7 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
|
|||||||
yield record
|
yield record
|
||||||
|
|
||||||
def __ecoAliasIterator(self):
|
def __ecoAliasIterator(self):
|
||||||
for record in self._ecoRecordIterator(self._aliasFile):
|
for record in self._ecoRecordIterator(self._aliasFile,noError=True):
|
||||||
(taxid,index) = struct.unpack('> I i',record)
|
(taxid,index) = struct.unpack('> I i',record)
|
||||||
yield taxid,index
|
yield taxid,index
|
||||||
|
|
||||||
@ -402,7 +402,7 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
|
|||||||
|
|
||||||
try :
|
try :
|
||||||
self._preferedName = [(x[0],'obi',x[2])
|
self._preferedName = [(x[0],'obi',x[2])
|
||||||
for x in self.__ecoNameIterator(self._preferedNamesFile)]
|
for x in self.__ecoNameIterator(self._preferedNamesFile,noError=True)]
|
||||||
print >> sys.stderr, " [INFO : Preferred taxon name file found] : %d added taxa" % len(self._preferedName)
|
print >> sys.stderr, " [INFO : Preferred taxon name file found] : %d added taxa" % len(self._preferedName)
|
||||||
except:
|
except:
|
||||||
print >> sys.stderr, " [INFO : Preferred taxon name file not found]"
|
print >> sys.stderr, " [INFO : Preferred taxon name file not found]"
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
from obitools.fasta import formatFasta
|
from obitools.fasta import formatFasta
|
||||||
from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
|
#from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
|
||||||
|
|
||||||
cpdef printOutput(options,seq,output=sys.stdout):
|
cpdef printOutput(options,seq,output=sys.stdout):
|
||||||
if options.output is not None:
|
if options.output is not None:
|
||||||
|
@ -18,7 +18,6 @@ from obitools.fasta import formatFasta, rawFastaIterator,\
|
|||||||
from obitools.fastq import formatFastq
|
from obitools.fastq import formatFastq
|
||||||
|
|
||||||
from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
|
from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
|
||||||
from obitools.ecopcr.options import loadTaxonomyDatabase
|
|
||||||
|
|
||||||
from cPickle import dump,load,UnpicklingError
|
from cPickle import dump,load,UnpicklingError
|
||||||
|
|
||||||
@ -34,7 +33,7 @@ from obitools.format.sequence import skipOnErrorIterator
|
|||||||
from obitools import BioSequence
|
from obitools import BioSequence
|
||||||
from obitools.utils import FakeFile
|
from obitools.utils import FakeFile
|
||||||
|
|
||||||
|
from glob import glob
|
||||||
|
|
||||||
|
|
||||||
def binarySequenceIterator(lineiterator):
|
def binarySequenceIterator(lineiterator):
|
||||||
@ -168,7 +167,7 @@ def addOutputFormatOption(optionManager):
|
|||||||
# help="Output sequences in sap fasta format "
|
# help="Output sequences in sap fasta format "
|
||||||
# "(Sequence must have a taxid and a taxonomy has to be loaded)")
|
# "(Sequence must have a taxid and a taxonomy has to be loaded)")
|
||||||
|
|
||||||
group.add_option('--ecopcrDB-output',
|
group.add_option('--ecopcrdb-output',
|
||||||
action="store", dest="ecopcroutput",
|
action="store", dest="ecopcroutput",
|
||||||
default=None,
|
default=None,
|
||||||
help="Output sequences in ecopcr database format "
|
help="Output sequences in ecopcr database format "
|
||||||
@ -313,6 +312,10 @@ def sequenceWriterGenerator(options,output=sys.stdout):
|
|||||||
self._format=formatSAPFastaGenerator(options)
|
self._format=formatSAPFastaGenerator(options)
|
||||||
elif options.outputFormater is not None:
|
elif options.outputFormater is not None:
|
||||||
self._format=options.outputFormater
|
self._format=options.outputFormater
|
||||||
|
|
||||||
|
if hasattr(seq,'_hasTaxid') and seq._hasTaxid:
|
||||||
|
seq.extractTaxon()
|
||||||
|
|
||||||
s = self._format(seq,upper=self._upper)
|
s = self._format(seq,upper=self._upper)
|
||||||
try:
|
try:
|
||||||
self._file.write(s)
|
self._file.write(s)
|
||||||
@ -336,8 +339,7 @@ def sequenceWriterGenerator(options,output=sys.stdout):
|
|||||||
|
|
||||||
|
|
||||||
if options.ecopcroutput is not None:
|
if options.ecopcroutput is not None:
|
||||||
taxo = loadTaxonomyDatabase(options)
|
writer=EcoPCRDBSequenceWriter(options)
|
||||||
writer=EcoPCRDBSequenceWriter(options.ecopcroutput,taxonomy=taxo)
|
|
||||||
elif options.output==dump:
|
elif options.output==dump:
|
||||||
writer=BinaryWriter(options,output)
|
writer=BinaryWriter(options,output)
|
||||||
else:
|
else:
|
||||||
|
@ -6,7 +6,7 @@ from obitools.utils import universalOpen
|
|||||||
from obitools.utils import universalTell
|
from obitools.utils import universalTell
|
||||||
from obitools.utils import fileSize
|
from obitools.utils import fileSize
|
||||||
from obitools.ecopcr.sequence import EcoPCRDBSequenceIterator
|
from obitools.ecopcr.sequence import EcoPCRDBSequenceIterator
|
||||||
|
from glob import glob
|
||||||
from logging import debug
|
from logging import debug
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
@ -69,23 +69,30 @@ def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102):
|
|||||||
if files :
|
if files :
|
||||||
for f in files:
|
for f in files:
|
||||||
if (entryIterator != EcoPCRDBSequenceIterator) :
|
if (entryIterator != EcoPCRDBSequenceIterator) :
|
||||||
|
|
||||||
cfs.currentInputFileName=f
|
cfs.currentInputFileName=f
|
||||||
f = universalOpen(f)
|
try:
|
||||||
cfs.currentFile=f
|
f = universalOpen(f,noError=True)
|
||||||
cfs.currentFileSize=fileSize(cfs.currentFile)
|
except Exception as e:
|
||||||
debug(f)
|
if glob('%s_[0-9][0-9][0-9].sdx' % f):
|
||||||
|
entryIterator=EcoPCRDBSequenceIterator
|
||||||
if with_progress:
|
else:
|
||||||
f=fileWithProgressBar(f,step=histo_step)
|
print >>sys.stderr, e
|
||||||
|
sys.exit();
|
||||||
if entryIterator is None:
|
|
||||||
for line in f:
|
|
||||||
yield line
|
|
||||||
else:
|
else:
|
||||||
for entry in entryIterator(f):
|
cfs.currentFile=f
|
||||||
yield entry
|
cfs.currentFileSize=fileSize(cfs.currentFile)
|
||||||
else :
|
debug(f)
|
||||||
yield EcoPCRDBSequenceIterator(f)
|
|
||||||
|
if with_progress:
|
||||||
|
f=fileWithProgressBar(f,step=histo_step)
|
||||||
|
|
||||||
|
if entryIterator is None:
|
||||||
|
for line in f:
|
||||||
|
yield line
|
||||||
|
else:
|
||||||
|
for entry in entryIterator(f):
|
||||||
|
yield entry
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if entryIterator is None:
|
if entryIterator is None:
|
||||||
|
@ -26,7 +26,7 @@ class FileFormatError(Exception):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def universalOpen(file,*options):
|
def universalOpen(file,noError=False):
|
||||||
'''
|
'''
|
||||||
Open a file gziped or not.
|
Open a file gziped or not.
|
||||||
|
|
||||||
@ -47,7 +47,7 @@ def universalOpen(file,*options):
|
|||||||
if isinstance(file,str):
|
if isinstance(file,str):
|
||||||
try:
|
try:
|
||||||
if urllib2.urlparse.urlparse(file)[0]=='':
|
if urllib2.urlparse.urlparse(file)[0]=='':
|
||||||
rep = open(file,*options)
|
rep = open(file)
|
||||||
else:
|
else:
|
||||||
rep = urllib2.urlopen(file,timeout=15)
|
rep = urllib2.urlopen(file,timeout=15)
|
||||||
|
|
||||||
@ -60,8 +60,11 @@ def universalOpen(file,*options):
|
|||||||
name = data[0].filename
|
name = data[0].filename
|
||||||
rep = zip.open(name)
|
rep = zip.open(name)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print>>sys.stderr, e
|
if not noError:
|
||||||
sys.exit();
|
print >>sys.stderr, e
|
||||||
|
sys.exit();
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
else:
|
else:
|
||||||
rep = file
|
rep = file
|
||||||
return rep
|
return rep
|
||||||
|
Reference in New Issue
Block a user