Patch several bugs or inconsistencies following the tutorial at Anthony's lab

2014-06-05 14:30:20 +00:00
parent e0d8e2fe5d
commit 59f5ab4d55
12 changed files with 162 additions and 342 deletions
--- a/.pydevproject
+++ b/.pydevproject
@ -1,11 +1,9 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?eclipse-pydev version="1.0"?>
+<?eclipse-pydev version="1.0"?><pydev_project>
 <pydev_project>
 <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
 <path>/OBITools-1.0/src</path>
 <path>/OBITools-1.0/textwrangler</path>
 </pydev_pathproperty>
 <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
-<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python2.7</pydev_property>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python-2.7</pydev_property>
 </pydev_project>
--- a/doc/sphinx/source/optionsSet/outputformat.txt
+++ b/doc/sphinx/source/optionsSet/outputformat.txt
@ -17,7 +17,7 @@ Standard output format
 Generating an ecoPCR database
 .............................
-.. cmdoption::  --ecopcrDB-output=<PREFIX_FILENAME> 
+.. cmdoption::  --ecopcrdb-output=<PREFIX_FILENAME> 
      Creates an ecoPCR database from sequence records results
--- a/setup.py
+++ b/setup.py
@ -6,9 +6,16 @@
 try:
-   from setuptools.core import setup
+    from setuptools import setup
 except ImportError:
-   from distutils.core import setup
+    import ez_setup
    ez_setup.use_setuptools()
    from setuptools import setup
 # try:
 #    from setuptools.core import setup
 # except ImportError:
 #    from distutils.core import setup
 from distutils.extension import Extension
 from distutils.util import convert_path
 from distutils import log
@ -39,6 +46,10 @@ import glob
 from os import path
 # requires = ['Cython>=0.20', 'Sphinx>=1.2']
 requires = ['Cython>=0.20']
 class install_scripts(ori_install_scripts):
    def remove_deprecated_script(self):
@ -209,7 +220,7 @@ def findC(root,base=None,pyrexs=None):
 #from obitools.version import version as obiversion
 #sys.path.pop(0)
-VERSION =  "1.0.beta"
+VERSION =  "1.0.beta.2"
 AUTHOR  = 'Eric Coissac'
 EMAIL   = 'eric@coissac.eu'
 URL     = 'www.grenoble.prabi.fr/trac/OBITools'
@ -242,6 +253,18 @@ else:
 setup(name="OBITools",
      description="Scripts and library for sequence analysis",
      classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Environment :: Console',
        'Intended Audience :: Research',
        'License :: CeCILL-V2',
        'Operating System :: Unix like',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Topic :: NGS Data processing',
        'Topic :: DNA metabarcoding',
        'Topic :: Utilities',
      ],
      version=VERSION,
      author=AUTHOR,
      author_email=EMAIL,
@ -251,7 +274,7 @@ setup(name="OBITools",
      package_dir = {'': SRC},
      packages=findPackage(SRC),
      cmdclass = {'build_ext': build_ext,'build_scripts':build_scripts, 'install_scripts':install_scripts},
-      requires=['Cython (>=0.16)'],
+      install_requires=requires,
      zip_safe = False,
      ext_modules=EXTENTION)
--- a/src/obitaxonomy.py
+++ b/src/obitaxonomy.py
@ -202,8 +202,8 @@ if __name__ == '__main__':
    localdata=False
-    if options.write != '' :
+#     if options.write != '' :
-        options.write = open(options.write, 'w')
+#         options.write = open(options.write, 'w')
    for t in options.newtaxon:
        tx = t.split(':')
--- a/src/obitools/ecopcr/init.py
+++ b/src/obitools/ecopcr/init.py
@ -1,296 +1,69 @@
 from obitools import utils
 from obitools import NucSequence
 from obitools.utils import universalOpen, universalTell, fileSize, progressBar
 import struct
 import sys
 import time
 import re
 import shelve
-from threading import Lock
+class EcoPCRFile(utils.ColumnFile):
-from logging  import warning
+    def __init__(self,stream):
-import urllib2
+        utils.ColumnFile.__init__(self,
                                  stream, '|', True, 
                                  (str,int,int,
                                   str,int,str,
                                   int,str,int,
                                   str,int,str,
                                   str,str,int,float,
                                   str,int,float,
                                   int,
                                   str,str), "#")
 from obitools.gzip import GzipFile
 from obitools.zipfile import ZipFile
 import os.path
 from _utils import FakeFile
 from _utils import progressBar
 try:
    from collections import Counter
 except ImportError:
    from obitools.collections import Counter
 class FileFormatError(Exception):
    pass
 def universalOpen(file,*options):
    '''
    Open a file, compressed or not.

    When C{file} is a C{str} instance it is taken as a file name or,
    if it carries a URL scheme, as a URL. A name ending with C{.gz}
    is wrapped in a gzip reader; a name ending with C{.zip} must be
    an archive holding exactly one member, which is opened.

    Any other kind of object is assumed to follow the C{file}
    interface and is returned as is.

    On any error while opening, the error is printed on the standard
    error stream and the program exits.

    @param file: the file to open
    @type file: C{str} or a file like object

    @return: the opened stream.
    '''
    # Anything that is not a name is handed back untouched.
    if not isinstance(file,str):
        return file

    try:
        scheme = urllib2.urlparse.urlparse(file)[0]
        if scheme=='':
            # No URL scheme: plain local file.
            rep = open(file,*options)
        else:
            rep = urllib2.urlopen(file,timeout=15)

        if file.endswith('.gz'):
            rep = GzipFile(fileobj=rep)

        if file.endswith('.zip'):
            archive = ZipFile(file=rep)
            members = archive.infolist()
            assert len(members)==1,'Only zipped file containning a single file can be open'
            rep = archive.open(members[0].filename)
    except Exception as e:
        print>>sys.stderr, e
        sys.exit();

    return rep
 def universalTell(file):
    '''
    Give the current position in a file, looking through a
    C{GzipFile} wrapper to the underlying file object.

    @param file: the file to check
    @type file: a C{file} like instance

    @return: position in the file
    @rtype:  C{int}
    '''
    # For a gzip wrapper the position is read on the wrapped file object.
    stream = file.myfileobj if isinstance(file, GzipFile) else file
    return stream.tell()
 def fileSize(file):
    '''
    Give the size of a file, looking through a C{GzipFile}
    wrapper to the underlying file object.

    The current position is saved before seeking to the end of
    the file and restored afterwards.

    @param file: the file to check
    @type file: a C{file} like instance

    @return: the size of the file
    @rtype: C{int}
    '''
    target = file.myfileobj if isinstance(file, GzipFile) else file
    saved = target.tell()
    # Seek to the end (whence=2) to read the total length...
    target.seek(0,2)
    size = target.tell()
    # ...then go back where the caller was.
    target.seek(saved,0)
    return size
 def endLessIterator(endedlist):
    '''
    Iterate over a sequence, then repeat its last element forever.

    @param endedlist: the sequence to iterate over; it must support
                      indexing with C{-1}.

    @return: a never ending iterator
    '''
    for item in endedlist:
        yield item
    # The sequence is exhausted: keep serving its last element.
    while True:
        yield endedlist[-1]
 def multiLineWrapper(lineiterator):
    '''
    Join continued lines into single logical lines.

    A line whose next-to-last character is a backslash (a backslash
    placed just before the final character of the line) is glued to
    the following line after its last two characters are dropped.
    This is repeated as long as the newly read line is itself continued.

    @param lineiterator: the lines to aggregate.
    @type lineiterator: any iterable of C{str}

    @return: the aggregated lines.
    @rtype: an iterator on C{str}

    @raise FileFormatError: when the input ends on a line that still
                            asks for a continuation.
    '''
    # The for loop and the explicit reads below must consume the same
    # iterator. Calling iter() makes this true for any iterable
    # (a list for instance), not only for objects exposing .next().
    lines = iter(lineiterator)
    for line in lines:
        rep = [line]
        while len(line)>=2 and line[-2]=='\\':
            # Drop the backslash and the end-of-line character.
            rep[-1]=rep[-1][0:-2]
            try:
                line = next(lines)
            except StopIteration:
                raise FileFormatError
            rep.append(line)
        yield ''.join(rep)
 def skipWhiteLineIterator(lineiterator):
    '''
    Filter out the blank lines of a stream of lines.

    A line is considered blank when nothing remains after stripping
    its white spaces. Each skipped line is reported by a C{skipped}
    message on the standard error stream, so that the diagnostic
    never mixes with data written on the standard output.

    @param lineiterator: the lines to filter.
    @type lineiterator: any iterable of C{str}

    @return: the non blank lines, unmodified.
    @rtype: an iterator on C{str}
    '''
    for line in lineiterator:
        if line.strip():
            yield line
        else:
            sys.stderr.write('skipped\n')
 class ColumnFile(object):
    '''
    Iterator over the rows of a column delimited text stream.

    Each call to C{next} reads one line of the stream, splits it
    into columns and optionally strips, converts and labels them.
    '''
    def __init__(self,stream,sep=None,strip=True,
                 types=None,skip=None,head=None,
                 extra=None,
                 extraformat='([a-zA-Z]\w*) *= *([^;]+);'):
        '''
        @param stream: given to C{universalOpen} to get the stream of lines.
        @param sep: column delimiter given to C{str.split}.
        @param strip: if true, white spaces surrounding each column are removed.
        @param types: converters applied column by column; C{bool} is
                      replaced by C{ColumnFile.str2bool}.
        @param skip: prefix of the lines to ignore.
        @param head: column names; when given, rows are returned as C{dict}.
        @param extra: separator introducing a trailing block of extra fields.
        @param extraformat: regular expression extracting the (key,value)
                            pairs from the extra block.
        '''
        self._stream = universalOpen(stream)
        self._delimiter=sep
        self._strip=strip
        self._extra=extra
        self._extraformat = re.compile(extraformat)
        if types:
            # Work on a copy so that the caller's sequence is left untouched.
            self._types=[x for x in types]
            for i in xrange(len(self._types)):
                if self._types[i] is bool:
                    self._types[i]=ColumnFile.str2bool
        else:
            self._types=None
        self._skip = skip
        # Length of the skip prefix, cached for the comparison done in next().
        if skip is not None:
            self._lskip= len(skip)
        else:
            self._lskip= 0
        self._head=head
    def str2bool(x):
        # Converts a text to a boolean from its first non blank character,
        # upper-cased: T and V give True, F gives False.
        # NOTE(review): relies on eval(); any other leading character is
        # evaluated as a Python expression. Consider an explicit lookup if
        # the column content can come from an untrusted source.
        return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
    str2bool = staticmethod(str2bool)
    def __iter__(self):
        # The instance is its own iterator.
        return self
    def next(self):
        '''
        Read the next data line and return it as a list of columns,
        or as a C{dict} when column names were given.
        '''
        # NOTE(review): `seq` built below is never used in the rest of
        # this body and `data` is reassigned further down; these
        # statements look like they belong to another next() method
        # (one wrapping utils.ColumnFile) - confirm before relying on them.
        data = utils.ColumnFile.next(self)
        seq = NucSequence(data[0],data[20],data[21])
        seq['seq_length_ori']=data[1]
        seq['taxid']=data[2]
        seq['rank']=data[3]
        seq['species']=data[4]
        seq['species_name']=data[5]
        seq['genus']=data[6]
        seq['genus_name']=data[7]
        seq['family']=data[8]
        seq['family_name']=data[9]
        seq['strand']=data[12]
        seq['forward_match']=data[13]
        seq['forward_error']=data[14]
        seq['forward_tm']=data[15]
        seq['reverse_match']=data[16]
        seq['reverse_error']=data[17]
        seq['reverse_tm']=data[18]
        def cast(txt,type):
            # Convert txt with the given converter; any failure gives None.
            try:
                v = type(txt)
            except:
                v=None
            return v
        ligne = self._stream.next()
        # Ignore every line starting with the skip prefix.
        if self._skip is not None:
            while ligne[0:self._lskip]==self._skip:
                ligne = self._stream.next()
        if self._extra is not None:
            try:
                # Cut the line at the last occurrence of the extra separator;
                # the unpacking raises ValueError when the separator is absent.
                (ligne,extra) = ligne.rsplit(self._extra,1)
                extra = dict(self._extraformat.findall(extra))
            except ValueError:
                extra=None
        else:
            extra = None
        data = ligne.split(self._delimiter)
        if self._strip or self._types:
            data = [x.strip() for x in data]
        if self._types:
            # When there are more columns than converters, the last
            # converter is reused for the remaining columns.
            it = endLessIterator(self._types)
            data = [cast(*x) for x in ((y,it.next()) for y in data)]
        if self._head is not None:
            # Python 2 idiom: map(None,a,b) pairs the items of both
            # sequences, padding the shorter one with None.
            data=dict(map(None, self._head,data))
            if extra is not None:
                data['__extra__']=extra
        else:
            if extra is not None:
                data.append(extra)
        return data
    def tell(self):
        # Position in the underlying stream, as given by universalTell.
        return universalTell(self._stream)
 class CachedDB(object):
    def __init__(self,cachefile,masterdb):
        self._cache = shelve.open(cachefile,'c')
        self._db = masterdb
        self._lock=Lock()
    def _cacheSeq(self,seq):
        self._lock.acquire()
        self._cache[seq.id]=seq
        self._lock.release()
        return seq
    def __getitem__(self,ac):
        if isinstance(ac,str):
            self._lock.acquire()
            if ac in self._cache:
 #                print >>sys.stderr,"Use cache for %s" % ac
                data = self._cache[ac]
                self._lock.release()
-            else:
+        
-                self._lock.release()
+class EcoPCRDBFile(object):
-                data = self._db[ac]
+    
-                self._cacheSeq(data)
+    def _ecoRecordIterator(self,file,noError=False):
-            return data
+        file = universalOpen(file,noError)
        (recordCount,) = struct.unpack('> I',file.read(4))
        self._recover=False
        if recordCount:
            for i in xrange(recordCount):
                (recordSize,)=struct.unpack('>I',file.read(4))
                record = file.read(recordSize)
                yield record
        else:
-            self._lock.acquire()
+            print >> sys.stderr,"\n\n  WARNING : EcoPCRDB reading set into recover data mode\n"
-            acs = [[x,self._cache.get(x,None)] for x in ac]
+            self._recover=True
-            self._lock.release()
+            ok=True
-            newacs = [ac for ac,cached in acs if cached is None]
+            while(ok):
-            if newacs:
+                try:
-                newseqs = self._db[newacs]
+                    (recordSize,)=struct.unpack('>I',file.read(4))
-            else:
+                    record = file.read(recordSize)
-                newseqs = iter([])
+                    yield record
-            for r in acs:
+                except:
-                if r[1] is None:
+                    ok=False
                    r[1]=self._cacheSeq(newseqs.next())
 #                else:
 #                    print >>sys.stderr,"Use cache for %s" % r[0]
            return (x[1] for x in acs)
 def moduleInDevelopment(name):
    '''
    Warn the user that a module is still under development.

    The message is issued through the standard C{warnings} machinery,
    with the C{Warning} category, and is attributed to the caller of
    this function.

    @param name: the name of the module under development
    @type name: C{str}
    '''
    # Imported here so that the function does not depend on the
    # import block of the module.
    import warnings

    # Building a Warning instance alone reports nothing: it has to be
    # handed to warnings.warn to actually reach the user.
    warnings.warn('This module %s is under development : use it with caution' % name,
                  Warning,
                  stacklevel=2)
 def deprecatedScript(newscript):
    '''
    Print on the standard error stream a banner telling that the
    running script is deprecated.

    The name of the running script is taken from the last path
    component of C{sys.argv[0]}.

    @param newscript: the name of the command replacing the script
    '''
    blank  = "        "
    border = "#########################################################"
    edge   = "#                                                       #"
    scriptname = os.path.split(sys.argv[0])[1]

    banner = [blank,
              blank,
              blank,
              border,
              edge,
              "    W A R N I N G :",
              "        %s is a deprecated script                     " % scriptname,
              "        it will disappear in the next obitools version",
              blank,
              "    The new corresponding command is %s    " % newscript,
              edge,
              border,
              blank,
              blank,
              blank]

    for text in banner:
        print >>sys.stderr,text
--- a/src/obitools/ecopcr/options.py
+++ b/src/obitools/ecopcr/options.py
@ -70,10 +70,9 @@ def loadTaxonomyDatabase(options):
    if isinstance(options.taxonomy, Taxonomy):
        return options.taxonomy
-    #taxonomy = ecobarcodeDatabaseConnection(options)
+
    taxonomy = None
-    if (taxonomy is not None or 
+    if (options.taxonomy is not None or 
        options.taxonomy is not None or 
        options.taxdump is not None):
        if options.taxdump is not None:
            taxonomy = TaxonomyDump(options.taxdump)
--- a/src/obitools/ecopcr/sequence.py
+++ b/src/obitools/ecopcr/sequence.py
@ -1,12 +1,14 @@
 from obitools import NucSequence
 from obitools.ecopcr import EcoPCRDBFile
 from obitools.ecopcr.taxonomy import EcoTaxonomyDB, ecoTaxonomyWriter
 from obitools.ecopcr.options  import loadTaxonomyDatabase
 from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
 from obitools.utils import universalOpen
 from glob import glob
 import struct
 import gzip
 import sys
 import re
 class EcoPCRDBSequenceIterator(EcoPCRDBFile):
@ -40,11 +42,11 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
        for record in self._ecoRecordIterator(file):
            lrecord = len(record)
            lnames  = lrecord - (4*4+20)
-            (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
+            (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)  # @UnusedVariable
            seqid=seqid.strip('\x00')
            de = string[:deflength]
            seq = gzip.zlib.decompress(string[deflength:])
-            bioseq = NucSequence(seqid,seq,de,taxidx=taxid,taxid=self._taxonomy._taxonomy[taxid][0])
+            bioseq = NucSequence(seqid,seq,de,taxid=self._taxonomy._taxonomy[taxid][0])
            yield  bioseq
    def __iter__(self):
@ -54,8 +56,26 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
 class EcoPCRDBSequenceWriter(object):
-    def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None,append=False):
+    def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False):
-        self._taxonomy=taxonomy
+
        # Take care of the taxonomy associated to the database
        self._taxonomy= loadTaxonomyDatabase(options)
        dbname=options.ecopcroutput
        if (self._taxonomy is not None
            and (not hasattr(options,'ecodb') or options.ecodb!=dbname)):
            print >> sys.stderr,"Writing the taxonomy file...",
            ecoTaxonomyWriter(dbname,self._taxonomy)
            print >> sys.stderr,"Ok"
        # Identifiy the next sequence file numbre 
        if fileidx is None:
            p = re.compile(r'([0-9]{3})\.sdx')
            fileidx = max(list(int(p.search(i).group(1)) 
                               for i in glob('%s_[0-9][0-9][0-9].sdx' % dbname))+[0]
                          ) +1
        self._filename="%s_%03d.sdx" % (dbname,fileidx)
        if append:
            mode ='r+b'
@ -73,11 +93,6 @@ class EcoPCRDBSequenceWriter(object):
            self._file = open(self._filename,mode)
            self._file.write(struct.pack('> I',self._sequenceCount))
        if self._taxonomy is not None:
            print >> sys.stderr,"Writing the taxonomy file...",
            ecoTaxonomyWriter(dbname,self._taxonomy)
            print >> sys.stderr,"Ok"
        if type is not None:
            assert ftid is not None,"You must specify an id attribute for features"
            self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
--- a/src/obitools/ecopcr/taxonomy.py
+++ b/src/obitools/ecopcr/taxonomy.py
@ -329,10 +329,10 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
        try :
            lt=0
-            for record in self._ecoRecordIterator(self._localTaxonFile):
+            for record in self._ecoRecordIterator(self._localTaxonFile,noError=True):
                lrecord = len(record)
                lnames  = lrecord - 16
-                (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
+                (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)  # @UnusedVariable
                lt+=1
                yield  (taxid,rankid,parentidx,name,'local')
            print >> sys.stderr, " [INFO : Local taxon file found] : %d added taxa" % lt
@ -344,7 +344,7 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
            yield  record
    def __ecoAliasIterator(self):
-        for record in self._ecoRecordIterator(self._aliasFile):
+        for record in self._ecoRecordIterator(self._aliasFile,noError=True):
            (taxid,index) = struct.unpack('> I i',record)
            yield taxid,index
@ -402,7 +402,7 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
        try :
            self._preferedName = [(x[0],'obi',x[2]) 
-                                  for x in self.__ecoNameIterator(self._preferedNamesFile)]
+                                  for x in self.__ecoNameIterator(self._preferedNamesFile,noError=True)]
            print >> sys.stderr, " [INFO : Preferred taxon name file found] : %d added taxa" % len(self._preferedName)
        except:
            print >> sys.stderr, " [INFO : Preferred taxon name file not found]"
--- a/src/obitools/format/_format.pyx
+++ b/src/obitools/format/_format.pyx
@ -2,7 +2,7 @@
 import sys
 from obitools.fasta import formatFasta
-from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
+#from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
 cpdef printOutput(options,seq,output=sys.stdout):
    if options.output is not None:
--- a/src/obitools/format/options.py
+++ b/src/obitools/format/options.py
@ -18,7 +18,6 @@ from obitools.fasta import formatFasta, rawFastaIterator,\
 from obitools.fastq import formatFastq
 from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
 from obitools.ecopcr.options  import loadTaxonomyDatabase
 from cPickle import dump,load,UnpicklingError
@ -34,7 +33,7 @@ from obitools.format.sequence import skipOnErrorIterator
 from obitools import BioSequence
 from obitools.utils import FakeFile
-
+from glob import glob
 def binarySequenceIterator(lineiterator):    
@ -168,7 +167,7 @@ def addOutputFormatOption(optionManager):
 #                             help="Output sequences in sap fasta format "
 #                                  "(Sequence must have a taxid and a taxonomy has to be loaded)")
-    group.add_option('--ecopcrDB-output',
+    group.add_option('--ecopcrdb-output',
                             action="store", dest="ecopcroutput",
                             default=None,
                             help="Output sequences in ecopcr database format "
@ -313,6 +312,10 @@ def sequenceWriterGenerator(options,output=sys.stdout):
                        self._format=formatSAPFastaGenerator(options)
                elif options.outputFormater is not None:
                    self._format=options.outputFormater
            if hasattr(seq,'_hasTaxid') and seq._hasTaxid:
                seq.extractTaxon()
            s = self._format(seq,upper=self._upper)
            try:
                self._file.write(s)
@ -336,8 +339,7 @@ def sequenceWriterGenerator(options,output=sys.stdout):
    if options.ecopcroutput is not None:
-        taxo = loadTaxonomyDatabase(options)
+        writer=EcoPCRDBSequenceWriter(options)
        writer=EcoPCRDBSequenceWriter(options.ecopcroutput,taxonomy=taxo)
    elif options.output==dump:
        writer=BinaryWriter(options,output)
    else:
--- a/src/obitools/options/_options.pyx
+++ b/src/obitools/options/_options.pyx
@ -6,7 +6,7 @@ from obitools.utils import universalOpen
 from obitools.utils import universalTell
 from obitools.utils import fileSize
 from obitools.ecopcr.sequence import EcoPCRDBSequenceIterator
-
+from glob import glob 
 from logging import debug
 import sys
@ -69,23 +69,30 @@ def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102):
    if files :
        for f in files:
            if (entryIterator != EcoPCRDBSequenceIterator) :
                cfs.currentInputFileName=f
-                f = universalOpen(f)
+                try:
-                cfs.currentFile=f
+                    f = universalOpen(f,noError=True)
-                cfs.currentFileSize=fileSize(cfs.currentFile)
+                except Exception as e:    
-                debug(f)
+                    if glob('%s_[0-9][0-9][0-9].sdx' % f):
-            
+                        entryIterator=EcoPCRDBSequenceIterator
-                if with_progress:
+                    else:
-                    f=fileWithProgressBar(f,step=histo_step)
+                        print >>sys.stderr, e
-                    
+                        sys.exit();
                if entryIterator is None:
                    for line in f:
                        yield line
                else:
-                    for entry in entryIterator(f):
+                    cfs.currentFile=f
-                        yield entry
+                    cfs.currentFileSize=fileSize(cfs.currentFile)
-            else :
+                    debug(f)
-                yield EcoPCRDBSequenceIterator(f)
+                
                    if with_progress:
                        f=fileWithProgressBar(f,step=histo_step)           
            if entryIterator is None:
                for line in f:
                    yield line
            else:
                for entry in entryIterator(f):
                    yield entry
    else:
        if entryIterator is None:
--- a/src/obitools/utils/init.py
+++ b/src/obitools/utils/init.py
@ -26,7 +26,7 @@ class FileFormatError(Exception):
-def universalOpen(file,*options):
+def universalOpen(file,noError=False):
    '''
    Open a file gziped or not.
@ -47,7 +47,7 @@ def universalOpen(file,*options):
    if isinstance(file,str):
        try:
            if urllib2.urlparse.urlparse(file)[0]=='':
-                rep = open(file,*options)
+                rep = open(file)
            else:
                rep  = urllib2.urlopen(file,timeout=15)
@ -60,8 +60,11 @@ def universalOpen(file,*options):
                name = data[0].filename
                rep = zip.open(name)
        except Exception as e:
-            print>>sys.stderr, e
+            if not noError:
-            sys.exit();
+                print >>sys.stderr, e
                sys.exit();
            else:
                raise e
    else:
        rep = file
    return rep