Add documentation for the new options and an option to manage the ecotag

cache size
2015-07-03 10:39:59 +02:00
parent aa064dda57
commit 2af94b9da7
3 changed files with 76 additions and 28 deletions
--- a/doc/sphinx/source/optionsSet/inputformat.txt
+++ b/doc/sphinx/source/optionsSet/inputformat.txt
@ -3,6 +3,24 @@ Options to specify input format

 .. program:: obitools

+
+Restrict the analysis to a sub-part of the input file
+.....................................................
+
+.. cmdoption::  --skip <N>
+
+      The first N sequence records of the file are discarded from the analysis and
+      are not reported to the output file.
+      
+
+.. cmdoption::  --only <N>
+
+      Only the next N sequence records of the file are analyzed. The following sequences
+      in the file are neither analyzed nor reported to the output file.
+      This option can be used in conjunction with the `--skip` option.
+      
+
+
 Sequence annotated format
 .........................

--- a/doc/sphinx/source/scripts/ecotag.rst
+++ b/doc/sphinx/source/scripts/ecotag.rst
@ -9,10 +9,15 @@

   .. cmdoption::  -m FLOAT, --minimum-identity=FLOAT

-        When sequence identity is less than FLOAT, the taxonomic 
-        assignment for the sequence record is not indicated in ``ecotag``'s 
-        output. FLOAT is included in a [0,1] interval.
-        (This option doesn't seem to work).
+        When the best match with the reference database presents an identity
+        level below FLOAT, the taxonomic assignment for the sequence record
+        is not computed. The sequence record is nevertheless included in the
+        output file. FLOAT is included in a [0,1] interval.
+
+   .. cmdoption::    --minimum-circle=FLOAT
+   
+        Minimum identity considered for the assignment circle.
+        FLOAT is included in a [0,1] interval.

   .. cmdoption::  -x RANK, --explain=RANK

@ -40,8 +45,18 @@
        is expected to be assigned to the wrong taxon, for example because of
        taxonomic misidentification. FLOAT is included in a [0,1] interval.

+   .. cmdoption::  --cache-size=INTEGER
+
+        A cache for computed similarities is maintained by `ecotag`. The default
+        size for this cache is 1,000,000 scores. This option allows changing
+        the cache size.
+
   .. include:: ../optionsSet/taxonomyDB.txt

+   .. include:: ../optionsSet/inputformat.txt
+
+   .. include:: ../optionsSet/outputformat.txt
+
   .. include:: ../optionsSet/defaultoptions.txt

   :py:mod:`ecotag` added sequence attributes
@ -65,4 +80,3 @@
           - :doc:`species_list <../attributes/species_list>`
           - :doc:`species_name <../attributes/species_name>`
           - :doc:`taxid <../attributes/taxid>`
-      
--- a/src/ecotag.py
+++ b/src/ecotag.py
@ -148,6 +148,13 @@ def addSearchOptions(optionManager):
                             default=0.0,
                             help='Tolerated rate of wrong assignation')    

+    optionManager.add_option('--cache-size',
+                             action='store',dest='cache',
+                             type='int',
+                             metavar='<SIZE>',
+                             default=1000000,
+                             help='Cache size for the aligment score')    
+

 def count(data):
    rep = {}
@ -203,6 +210,7 @@ def cachedLenLCS(s1,s2,minid,normalized,reference):
    global __LCSCache__
    global __INCache__
    global __OUTCache__
+    global __CACHE_SIZE__
    
    pair=frozenset((s1.id,s2.id))
    
@ -217,7 +225,7 @@ def cachedLenLCS(s1,s2,minid,normalized,reference):
        
    __LCSCache__[pair]=rep
    
-    if len(__LCSCache__) > 1000000:
+    if len(__LCSCache__) > __CACHE_SIZE__:
        __LCSCache__.popitem(0)
    return rep
    
@ -292,10 +300,16 @@ if __name__=='__main__':
    __INCache__=1.0
    __OUTCache__=1.0
    
+    
    optionParser = getOptionManager([addSearchOptions,addTaxonomyDBOptions,addInOutputOption],progdoc=__doc__)
    
    (options, entries) = optionParser()
    
+    __CACHE_SIZE__=options.cache
+    
+    if __CACHE_SIZE__ < 10:
+        __CACHE_SIZE__=10
+        
    taxonomy = loadTaxonomyDatabase(options)
    writer = sequenceWriterGenerator(options)
    
@ -338,6 +352,8 @@ if __name__=='__main__':

    search = lcsIteratorSelf(entries,db,options)
                     
+    print >>sys.stderr,'\nCache size : %d\n'
+
                    
    for seq,best,match in search:
        try:
@ -424,9 +440,9 @@ if __name__=='__main__':
            else:
                seq['species_name']=None
                        
-        print >>sys.stderr,'\rCache size : %5.3f  ' % (__INCache__/__OUTCache__),
+        
        writer(seq)        
-                    
+    print >>sys.stderr,'\n%5.3f% of the alignments was cached' % (__INCache__/(__INCache__+__OUTCache__)*100)