Add documentation for the new options and an option to manage the ecotag

cache size
2015-07-03 10:39:59 +02:00
parent aa064dda57
commit 2af94b9da7
3 changed files with 76 additions and 28 deletions
--- a/doc/sphinx/source/optionsSet/inputformat.txt
+++ b/doc/sphinx/source/optionsSet/inputformat.txt
@ -3,6 +3,24 @@ Options to specify input format
 .. program:: obitools
 Restrict the analysis to a sub-part of the input file
 .....................................................
 .. cmdoption::  --skip <N>
      The first N sequence records of the file are discarded from the analysis and
      not reported to the output file.
 .. cmdoption::  --only <N>
      Only the next N sequence records of the file are analyzed. The following sequences
      in the file are neither analyzed nor reported to the output file.
      This option can be used in conjunction with the `--skip` option.
 Sequence annotated format
 .........................
--- a/doc/sphinx/source/scripts/ecotag.rst
+++ b/doc/sphinx/source/scripts/ecotag.rst
@ -1,55 +1,70 @@
 .. automodule:: ecotag
-   
+
   :py:mod:`ecotag` specific options
   ---------------------------------
-   .. cmdoption::  -R <FILENAME>, --ref-database=<FILENAME>   
+   .. cmdoption::  -R <FILENAME>, --ref-database=<FILENAME>
-   
+
        <FILENAME> is the fasta file containing the reference sequences
   .. cmdoption::  -m FLOAT, --minimum-identity=FLOAT
        When the best match with the reference database presents an identity
        level below FLOAT, the taxonomic assignment for the sequence record
        is not computed. The sequence record is nevertheless included in the
        output file. FLOAT is included in a [0,1] interval.
   .. cmdoption::    --minimum-circle=FLOAT
-        When sequence identity is less than FLOAT, the taxonomic 
+        Minimum identity considered for the assignment circle.
-        assignment for the sequence record is not indicated in ``ecotag``'s 
+        FLOAT is included in a [0,1] interval.
        output. FLOAT is included in a [0,1] interval.
        (This option doesn't seem to work).
   .. cmdoption::  -x RANK, --explain=RANK
-   
+
   .. cmdoption::  -u, --uniq
-   
+
-        When this option is specified, the program first dereplicates the sequence 
+        When this option is specified, the program first dereplicates the sequence
-        records to work on unique sequences only. This option greatly improves 
+        records to work on unique sequences only. This option greatly improves
        the program's speed, especially for highly redundant datasets.
   .. cmdoption::  --sort=<KEY>
-   
+
        The output is sorted based on the values of the relevant attribute.
   .. cmdoption::  -r, --reverse
-   
+
        The output is sorted in reverse order (should be used with the --sort option).
-        (Works even if the --sort option is not set, but could not find on what 
+        (Works even if the --sort option is not set, but could not find on what
        the output is sorted).
   .. cmdoption::  -E FLOAT, --errors=FLOAT
-   
+
-        FLOAT is the fraction of reference sequences that will 
+        FLOAT is the fraction of reference sequences that will
-        be ignored when looking for the most recent common ancestor. This 
+        be ignored when looking for the most recent common ancestor. This
-        option is useful when a non-negligible proportion of reference sequences 
+        option is useful when a non-negligible proportion of reference sequences
-        is expected to be assigned to the wrong taxon, for example because of 
+        is expected to be assigned to the wrong taxon, for example because of
        taxonomic misidentification. FLOAT is included in a [0,1] interval.
   .. cmdoption::  --cache-size=INTEGER
        A cache for computed similarities is maintained by `ecotag`. The default
        size for this cache is 1,000,000 scores. This option allows changing
        the cache size.
   .. include:: ../optionsSet/taxonomyDB.txt
-   
+
   .. include:: ../optionsSet/inputformat.txt
   .. include:: ../optionsSet/outputformat.txt
   .. include:: ../optionsSet/defaultoptions.txt
-   
+
   :py:mod:`ecotag` added sequence attributes
   ------------------------------------------
-   
+
      .. hlist::
           :columns: 3
-           
+
           - :doc:`best_identity <../attributes/best_identity>`
           - :doc:`best_match <../attributes/best_match>`
           - :doc:`family <../attributes/family>`
@ -65,4 +80,3 @@
           - :doc:`species_list <../attributes/species_list>`
           - :doc:`species_name <../attributes/species_name>`
           - :doc:`taxid <../attributes/taxid>`
--- a/src/ecotag.py
+++ b/src/ecotag.py
@ -148,6 +148,13 @@ def addSearchOptions(optionManager):
                             default=0.0,
                             help='Tolerated rate of wrong assignation')    
    optionManager.add_option('--cache-size',
                             action='store',dest='cache',
                             type='int',
                             metavar='<SIZE>',
                             default=1000000,
                             help='Cache size for the aligment score')    
 def count(data):
    rep = {}
@ -203,6 +210,7 @@ def cachedLenLCS(s1,s2,minid,normalized,reference):
    global __LCSCache__
    global __INCache__
    global __OUTCache__
    global __CACHE_SIZE__
    pair=frozenset((s1.id,s2.id))
@ -217,7 +225,7 @@ def cachedLenLCS(s1,s2,minid,normalized,reference):
    __LCSCache__[pair]=rep
-    if len(__LCSCache__) > 1000000:
+    if len(__LCSCache__) > __CACHE_SIZE__:
        __LCSCache__.popitem(0)
    return rep
@ -292,9 +300,15 @@ if __name__=='__main__':
    __INCache__=1.0
    __OUTCache__=1.0
    optionParser = getOptionManager([addSearchOptions,addTaxonomyDBOptions,addInOutputOption],progdoc=__doc__)
    (options, entries) = optionParser()
    __CACHE_SIZE__=options.cache
    if __CACHE_SIZE__ < 10:
        __CACHE_SIZE__=10
    taxonomy = loadTaxonomyDatabase(options)
    writer = sequenceWriterGenerator(options)
@ -317,7 +331,7 @@ if __name__=='__main__':
    taxonlink = {}
    rankid = taxonomy.findRankByName(options.explain)
-    
+
    for seq in db:
        id = seq.id[0:46]
        seq.id=id
@ -338,6 +352,8 @@ if __name__=='__main__':
    search = lcsIteratorSelf(entries,db,options)
    print >>sys.stderr,'\nCache size : %d\n'
    for seq,best,match in search:
        try:
@ -424,9 +440,9 @@ if __name__=='__main__':
            else:
                seq['species_name']=None
-        print >>sys.stderr,'\rCache size : %5.3f  ' % (__INCache__/__OUTCache__),
+        
        writer(seq)        
-                    
+    print >>sys.stderr,'\n%5.3f% of the alignments was cached' % (__INCache__/(__INCache__+__OUTCache__)*100)