Add documentation for the new options and an option to manage the ecotag

cache size
2015-07-03 10:39:59 +02:00
parent aa064dda57
commit 2af94b9da7
3 changed files with 76 additions and 28 deletions
--- a/doc/sphinx/source/optionsSet/inputformat.txt
+++ b/doc/sphinx/source/optionsSet/inputformat.txt
@ -3,6 +3,24 @@ Options to specify input format
 .. program:: obitools
 Restrict the analysis to a sub-part of the input file
 .....................................................
 .. cmdoption::  --skip <N>
      The first N sequence records of the file are discarded from the analysis and
      not reported to the output file.
 .. cmdoption::  --only <N>
      Only the next N sequence records of the file are analyzed. The following sequences
      in the file are neither analyzed nor reported to the output file.
      This option can be used conjointly with the `--skip` option.
 Sequence annotated format
 .........................
--- a/doc/sphinx/source/scripts/ecotag.rst
+++ b/doc/sphinx/source/scripts/ecotag.rst
@ -9,10 +9,15 @@
   .. cmdoption::  -m FLOAT, --minimum-identity=FLOAT
-        When sequence identity is less than FLOAT, the taxonomic 
+        When the best match with the reference database presents an identity
-        assignment for the sequence record is not indicated in ``ecotag``'s 
+        level below FLOAT, the taxonomic assignment for the sequence record
-        output. FLOAT is included in a [0,1] interval.
+        is not computed. The sequence record is nevertheless included in the
-        (This option doesn't seem to work).
+        output file. FLOAT is included in a [0,1] interval.
   .. cmdoption::    --minimum-circle=FLOAT
        minimum identity considered for the assignment circle.
        FLOAT is included in a [0,1] interval.
   .. cmdoption::  -x RANK, --explain=RANK
@ -40,8 +45,18 @@
        is expected to be assigned to the wrong taxon, for example because of
        taxonomic misidentification. FLOAT is included in a [0,1] interval.
   .. cmdoption::  --cache-size=INTEGER
        A cache for computed similarities is maintained by `ecotag`. The default
        size for this cache is 1,000,000 scores. This option allows changing
        the cache size.
   .. include:: ../optionsSet/taxonomyDB.txt
   .. include:: ../optionsSet/inputformat.txt
   .. include:: ../optionsSet/outputformat.txt
   .. include:: ../optionsSet/defaultoptions.txt
   :py:mod:`ecotag` added sequence attributes
@ -65,4 +80,3 @@
           - :doc:`species_list <../attributes/species_list>`
           - :doc:`species_name <../attributes/species_name>`
           - :doc:`taxid <../attributes/taxid>`
--- a/src/ecotag.py
+++ b/src/ecotag.py
@ -148,6 +148,13 @@ def addSearchOptions(optionManager):
                             default=0.0,
                             help='Tolerated rate of wrong assignation')    
    # Maximum number of alignment scores kept in the similarity cache;
    # the value is stored in options.cache.
    optionManager.add_option('--cache-size',
                             action='store',dest='cache',
                             type='int',
                             metavar='<SIZE>',
                             default=1000000,
                             help='Cache size for the alignment score')    
 def count(data):
    rep = {}
@ -203,6 +210,7 @@ def cachedLenLCS(s1,s2,minid,normalized,reference):
    global __LCSCache__
    global __INCache__
    global __OUTCache__
    global __CACHE_SIZE__
    pair=frozenset((s1.id,s2.id))
@ -217,7 +225,7 @@ def cachedLenLCS(s1,s2,minid,normalized,reference):
    __LCSCache__[pair]=rep
-    if len(__LCSCache__) > 1000000:
+    if len(__LCSCache__) > __CACHE_SIZE__:
        __LCSCache__.popitem(0)
    return rep
@ -292,10 +300,16 @@ if __name__=='__main__':
    __INCache__=1.0
    __OUTCache__=1.0
    optionParser = getOptionManager([addSearchOptions,addTaxonomyDBOptions,addInOutputOption],progdoc=__doc__)
    (options, entries) = optionParser()
    __CACHE_SIZE__=options.cache
    if __CACHE_SIZE__ < 10:
        __CACHE_SIZE__=10
    taxonomy = loadTaxonomyDatabase(options)
    writer = sequenceWriterGenerator(options)
@ -338,6 +352,8 @@ if __name__=='__main__':
    search = lcsIteratorSelf(entries,db,options)
    # Report the effective cache size; the %d placeholder needs its argument,
    # otherwise the literal "%d" is written to stderr.
    print >>sys.stderr,'\nCache size : %d\n' % __CACHE_SIZE__
    for seq,best,match in search:
        try:
@ -424,9 +440,9 @@ if __name__=='__main__':
            else:
                seq['species_name']=None
-        print >>sys.stderr,'\rCache size : %5.3f  ' % (__INCache__/__OUTCache__),
+        
        writer(seq)        
-                    
+    print >>sys.stderr,'\n%5.3f%% of the alignments was cached' % (__INCache__/(__INCache__+__OUTCache__)*100)