Add documentation for the new options and an option to manage the ecotag

cache size
2015-07-03 10:39:59 +02:00
parent aa064dda57
commit 2af94b9da7
3 changed files with 76 additions and 28 deletions
--- a/doc/sphinx/source/optionsSet/inputformat.txt
+++ b/doc/sphinx/source/optionsSet/inputformat.txt
@ -3,6 +3,24 @@ Options to specify input format

 .. program:: obitools

+
+Restrict the analysis to a sub-part of the input file
+.....................................................
+
+.. cmdoption::  --skip <N>
+
+      The first N sequence records of the file are discarded from the analysis and
+      not reported to the output file.
+      
+
+.. cmdoption::  --only <N>
+
+      Only the next N sequence records of the file are analyzed. The following sequences
+      in the file are neither analyzed nor reported to the output file.
+      This option can be used in combination with the `--skip` option.
+      
+
+
 Sequence annotated format
 .........................

--- a/doc/sphinx/source/scripts/ecotag.rst
+++ b/doc/sphinx/source/scripts/ecotag.rst
@ -1,55 +1,70 @@
 .. automodule:: ecotag
-   
+
   :py:mod:`ecotag` specific options
   ---------------------------------

-   .. cmdoption::  -R <FILENAME>, --ref-database=<FILENAME>   
-   
+   .. cmdoption::  -R <FILENAME>, --ref-database=<FILENAME>
+
        <FILENAME> is the fasta file containing the reference sequences

   .. cmdoption::  -m FLOAT, --minimum-identity=FLOAT
+
+        When the best match with the reference database presents an identity
+        level below FLOAT, the taxonomic assignment for the sequence record
+        is not computed. The sequence record is nevertheless included in the
+        output file. FLOAT is included in a [0,1] interval.
+
+   .. cmdoption::    --minimum-circle=FLOAT
   
-        When sequence identity is less than FLOAT, the taxonomic 
-        assignment for the sequence record is not indicated in ``ecotag``'s 
-        output. FLOAT is included in a [0,1] interval.
-        (This option doesn't seem to work).
+        Minimum identity considered for the assignment circle.
+        FLOAT is included in a [0,1] interval.

   .. cmdoption::  -x RANK, --explain=RANK
-   
+
   .. cmdoption::  -u, --uniq
-   
-        When this option is specified, the program first dereplicates the sequence 
-        records to work on unique sequences only. This option greatly improves 
+
+        When this option is specified, the program first dereplicates the sequence
+        records to work on unique sequences only. This option greatly improves
        the program's speed, especially for highly redundant datasets.

   .. cmdoption::  --sort=<KEY>
-   
+
        The output is sorted based on the values of the relevant attribute.

   .. cmdoption::  -r, --reverse
-   
+
        The output is sorted in reverse order (should be used with the --sort option).
-        (Works even if the --sort option is not set, but could not find on what 
+        (Works even if the --sort option is not set, but could not find on what
        the output is sorted).

   .. cmdoption::  -E FLOAT, --errors=FLOAT
-   
-        FLOAT is the fraction of reference sequences that will 
-        be ignored when looking for the most recent common ancestor. This 
-        option is useful when a non-negligible proportion of reference sequences 
-        is expected to be assigned to the wrong taxon, for example because of 
+
+        FLOAT is the fraction of reference sequences that will
+        be ignored when looking for the most recent common ancestor. This
+        option is useful when a non-negligible proportion of reference sequences
+        is expected to be assigned to the wrong taxon, for example because of
        taxonomic misidentification. FLOAT is included in a [0,1] interval.

+   .. cmdoption::  --cache-size=INTEGER
+
+        A cache for computed similarities is maintained by `ecotag`. The default
+        size for this cache is 1,000,000 scores. This option allows changing
+        the cache size.
+
   .. include:: ../optionsSet/taxonomyDB.txt
-   
+
+   .. include:: ../optionsSet/inputformat.txt
+
+   .. include:: ../optionsSet/outputformat.txt
+
   .. include:: ../optionsSet/defaultoptions.txt
-   
+
   :py:mod:`ecotag` added sequence attributes
   ------------------------------------------
-   
+
      .. hlist::
           :columns: 3
-           
+
           - :doc:`best_identity <../attributes/best_identity>`
           - :doc:`best_match <../attributes/best_match>`
           - :doc:`family <../attributes/family>`
@ -65,4 +80,3 @@
           - :doc:`species_list <../attributes/species_list>`
           - :doc:`species_name <../attributes/species_name>`
           - :doc:`taxid <../attributes/taxid>`
-      
--- a/src/ecotag.py
+++ b/src/ecotag.py
@ -148,6 +148,13 @@ def addSearchOptions(optionManager):
                             default=0.0,
                             help='Tolerated rate of wrong assignation')    

+    optionManager.add_option('--cache-size',
+                             action='store',dest='cache',
+                             type='int',
+                             metavar='<SIZE>',
+                             default=1000000,
+                             help='Cache size for the aligment score')    
+

 def count(data):
    rep = {}
@ -203,6 +210,7 @@ def cachedLenLCS(s1,s2,minid,normalized,reference):
    global __LCSCache__
    global __INCache__
    global __OUTCache__
+    global __CACHE_SIZE__
    
    pair=frozenset((s1.id,s2.id))
    
@ -217,7 +225,7 @@ def cachedLenLCS(s1,s2,minid,normalized,reference):
        
    __LCSCache__[pair]=rep
    
-    if len(__LCSCache__) > 1000000:
+    if len(__LCSCache__) > __CACHE_SIZE__:
        __LCSCache__.popitem(0)
    return rep
    
@ -292,9 +300,15 @@ if __name__=='__main__':
    __INCache__=1.0
    __OUTCache__=1.0
    
+    
    optionParser = getOptionManager([addSearchOptions,addTaxonomyDBOptions,addInOutputOption],progdoc=__doc__)
    
    (options, entries) = optionParser()
+    
+    __CACHE_SIZE__=options.cache
+    
+    if __CACHE_SIZE__ < 10:
+        __CACHE_SIZE__=10
        
    taxonomy = loadTaxonomyDatabase(options)
    writer = sequenceWriterGenerator(options)
@ -317,7 +331,7 @@ if __name__=='__main__':
    taxonlink = {}

    rankid = taxonomy.findRankByName(options.explain)
-    
+
    for seq in db:
        id = seq.id[0:46]
        seq.id=id
@ -338,6 +352,8 @@ if __name__=='__main__':

    search = lcsIteratorSelf(entries,db,options)
                     
+    print >>sys.stderr,'\nCache size : %d\n'
+
                    
    for seq,best,match in search:
        try:
@ -424,9 +440,9 @@ if __name__=='__main__':
            else:
                seq['species_name']=None
                        
-        print >>sys.stderr,'\rCache size : %5.3f  ' % (__INCache__/__OUTCache__),
+        
        writer(seq)        
-                    
+    print >>sys.stderr,'\n%5.3f% of the alignments was cached' % (__INCache__/(__INCache__+__OUTCache__)*100)