Add documentation for the new options and an option to manage the ecotag
cache size
This commit is contained in:
@ -3,6 +3,24 @@ Options to specify input format
|
||||
|
||||
.. program:: obitools
|
||||
|
||||
|
||||
Restrict the analysis to a sub-part of the input file
|
||||
.....................................................
|
||||
|
||||
.. cmdoption:: --skip <N>
|
||||
|
||||
The first N sequence records of the file are discarded from the analysis and
|
||||
not reported to the output file
|
||||
|
||||
|
||||
.. cmdoption:: --only <N>
|
||||
|
||||
Only the next N sequence records of the file are analyzed. The following sequences
|
||||
in the file are neither analyzed nor reported to the output file.
|
||||
This option can be used in conjunction with the `--skip` option.
|
||||
|
||||
|
||||
|
||||
Sequence annotated format
|
||||
.........................
|
||||
|
||||
|
@ -1,55 +1,70 @@
|
||||
.. automodule:: ecotag
|
||||
|
||||
|
||||
:py:mod:`ecotag` specific options
|
||||
---------------------------------
|
||||
|
||||
.. cmdoption:: -R <FILENAME>, --ref-database=<FILENAME>
|
||||
|
||||
.. cmdoption:: -R <FILENAME>, --ref-database=<FILENAME>
|
||||
|
||||
<FILENAME> is the fasta file containing the reference sequences
|
||||
|
||||
.. cmdoption:: -m FLOAT, --minimum-identity=FLOAT
|
||||
|
||||
When the best match with the reference database presents an identity
|
||||
level below FLOAT, the taxonomic assignment for the sequence record
|
||||
is not computed. The sequence record is nevertheless included in the
|
||||
output file. FLOAT is included in a [0,1] interval.
|
||||
|
||||
.. cmdoption:: --minimum-circle=FLOAT
|
||||
|
||||
When sequence identity is less than FLOAT, the taxonomic
|
||||
assignment for the sequence record is not indicated in ``ecotag``'s
|
||||
output. FLOAT is included in a [0,1] interval.
|
||||
(This option doesn't seem to work).
|
||||
Minimum identity considered for the assignment circle.
|
||||
FLOAT is included in a [0,1] interval.
|
||||
|
||||
.. cmdoption:: -x RANK, --explain=RANK
|
||||
|
||||
|
||||
.. cmdoption:: -u, --uniq
|
||||
|
||||
When this option is specified, the program first dereplicates the sequence
|
||||
records to work on unique sequences only. This option greatly improves
|
||||
|
||||
When this option is specified, the program first dereplicates the sequence
|
||||
records to work on unique sequences only. This option greatly improves
|
||||
the program's speed, especially for highly redundant datasets.
|
||||
|
||||
.. cmdoption:: --sort=<KEY>
|
||||
|
||||
|
||||
The output is sorted based on the values of the relevant attribute.
|
||||
|
||||
.. cmdoption:: -r, --reverse
|
||||
|
||||
|
||||
The output is sorted in reverse order (should be used with the --sort option).
|
||||
(Works even if the --sort option is not set, but could not find on what
|
||||
(Works even if the --sort option is not set, but could not find on what
|
||||
the output is sorted).
|
||||
|
||||
.. cmdoption:: -E FLOAT, --errors=FLOAT
|
||||
|
||||
FLOAT is the fraction of reference sequences that will
|
||||
be ignored when looking for the most recent common ancestor. This
|
||||
option is useful when a non-negligible proportion of reference sequences
|
||||
is expected to be assigned to the wrong taxon, for example because of
|
||||
|
||||
FLOAT is the fraction of reference sequences that will
|
||||
be ignored when looking for the most recent common ancestor. This
|
||||
option is useful when a non-negligible proportion of reference sequences
|
||||
is expected to be assigned to the wrong taxon, for example because of
|
||||
taxonomic misidentification. FLOAT is included in a [0,1] interval.
|
||||
|
||||
.. cmdoption:: --cache-size=INTEGER
|
||||
|
||||
A cache for computed similarities is maintained by `ecotag`. The default
|
||||
size for this cache is 1,000,000 scores. This option allows changing
|
||||
the cache size.
|
||||
|
||||
.. include:: ../optionsSet/taxonomyDB.txt
|
||||
|
||||
|
||||
.. include:: ../optionsSet/inputformat.txt
|
||||
|
||||
.. include:: ../optionsSet/outputformat.txt
|
||||
|
||||
.. include:: ../optionsSet/defaultoptions.txt
|
||||
|
||||
|
||||
:py:mod:`ecotag` added sequence attributes
|
||||
------------------------------------------
|
||||
|
||||
|
||||
.. hlist::
|
||||
:columns: 3
|
||||
|
||||
|
||||
- :doc:`best_identity <../attributes/best_identity>`
|
||||
- :doc:`best_match <../attributes/best_match>`
|
||||
- :doc:`family <../attributes/family>`
|
||||
@ -65,4 +80,3 @@
|
||||
- :doc:`species_list <../attributes/species_list>`
|
||||
- :doc:`species_name <../attributes/species_name>`
|
||||
- :doc:`taxid <../attributes/taxid>`
|
||||
|
||||
|
@ -148,6 +148,13 @@ def addSearchOptions(optionManager):
|
||||
default=0.0,
|
||||
help='Tolerated rate of wrong assignation')
|
||||
|
||||
optionManager.add_option('--cache-size',
|
||||
action='store',dest='cache',
|
||||
type='int',
|
||||
metavar='<SIZE>',
|
||||
default=1000000,
|
||||
help='Cache size for the aligment score')
|
||||
|
||||
|
||||
def count(data):
|
||||
rep = {}
|
||||
@ -203,6 +210,7 @@ def cachedLenLCS(s1,s2,minid,normalized,reference):
|
||||
global __LCSCache__
|
||||
global __INCache__
|
||||
global __OUTCache__
|
||||
global __CACHE_SIZE__
|
||||
|
||||
pair=frozenset((s1.id,s2.id))
|
||||
|
||||
@ -217,7 +225,7 @@ def cachedLenLCS(s1,s2,minid,normalized,reference):
|
||||
|
||||
__LCSCache__[pair]=rep
|
||||
|
||||
if len(__LCSCache__) > 1000000:
|
||||
if len(__LCSCache__) > __CACHE_SIZE__:
|
||||
__LCSCache__.popitem(0)
|
||||
return rep
|
||||
|
||||
@ -292,9 +300,15 @@ if __name__=='__main__':
|
||||
__INCache__=1.0
|
||||
__OUTCache__=1.0
|
||||
|
||||
|
||||
optionParser = getOptionManager([addSearchOptions,addTaxonomyDBOptions,addInOutputOption],progdoc=__doc__)
|
||||
|
||||
(options, entries) = optionParser()
|
||||
|
||||
__CACHE_SIZE__=options.cache
|
||||
|
||||
if __CACHE_SIZE__ < 10:
|
||||
__CACHE_SIZE__=10
|
||||
|
||||
taxonomy = loadTaxonomyDatabase(options)
|
||||
writer = sequenceWriterGenerator(options)
|
||||
@ -317,7 +331,7 @@ if __name__=='__main__':
|
||||
taxonlink = {}
|
||||
|
||||
rankid = taxonomy.findRankByName(options.explain)
|
||||
|
||||
|
||||
for seq in db:
|
||||
id = seq.id[0:46]
|
||||
seq.id=id
|
||||
@ -338,6 +352,8 @@ if __name__=='__main__':
|
||||
|
||||
search = lcsIteratorSelf(entries,db,options)
|
||||
|
||||
print >>sys.stderr,'\nCache size : %d\n'
|
||||
|
||||
|
||||
for seq,best,match in search:
|
||||
try:
|
||||
@ -424,9 +440,9 @@ if __name__=='__main__':
|
||||
else:
|
||||
seq['species_name']=None
|
||||
|
||||
print >>sys.stderr,'\rCache size : %5.3f ' % (__INCache__/__OUTCache__),
|
||||
|
||||
writer(seq)
|
||||
|
||||
print >>sys.stderr,'\n%5.3f% of the alignments was cached' % (__INCache__/(__INCache__+__OUTCache__)*100)
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user