This commit is contained in:
@@ -1,4 +1,58 @@
|
|||||||
#!/usr/local/bin/python
|
#!/usr/local/bin/python
|
||||||
|
'''
|
||||||
|
:py:mod:`ecotaxspecificity`: Evaluates barcode resolution
|
||||||
|
=========================================================
|
||||||
|
|
||||||
|
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>
|
||||||
|
|
||||||
|
The :py:mod:`ecotaxspecificity` command evaluates barcode resolution at different
|
||||||
|
taxonomic ranks.
|
||||||
|
|
||||||
|
As inputs, it takes a sequence record file annotated with taxids in the sequence
|
||||||
|
header, and a database formated as an ecopcr database (see :doc:`obitaxonomy
|
||||||
|
<obitaxonomy>`) or a NCBI taxdump (see NCBI ftp site).
|
||||||
|
|
||||||
|
An example of output is reported below::
|
||||||
|
|
||||||
|
Number of sequences added in graph: 284
|
||||||
|
Number of nodes in all components: 269
|
||||||
|
Number of sequences lost: 15!
|
||||||
|
rank taxon_ok taxon_total percent
|
||||||
|
order 8 8 100.00
|
||||||
|
superfamily 1 1 100.00
|
||||||
|
parvorder 1 1 100.00
|
||||||
|
subkingdom 1 1 100.00
|
||||||
|
superkingdom 1 1 100.00
|
||||||
|
kingdom 3 3 100.00
|
||||||
|
phylum 5 5 100.00
|
||||||
|
infraorder 1 1 100.00
|
||||||
|
subfamily 3 3 100.00
|
||||||
|
class 6 6 100.00
|
||||||
|
species 35 176 19.89
|
||||||
|
superorder 1 1 100.00
|
||||||
|
suborder 1 1 100.00
|
||||||
|
subtribe 1 1 100.00
|
||||||
|
subclass 3 3 100.00
|
||||||
|
genus 9 15 60.00
|
||||||
|
superclass 1 1 100.00
|
||||||
|
family 10 10 100.00
|
||||||
|
tribe 2 2 100.00
|
||||||
|
subphylum 1 1 100.00
|
||||||
|
|
||||||
|
|
||||||
|
In this example, the input sequence file contains 284 sequence records, but only
|
||||||
|
269 have been examined, because taxonomic information was not recovered for the
|
||||||
|
the 15 remaining ones.
|
||||||
|
|
||||||
|
"Taxon_total" refers to the number of different taxa observed at this rank
|
||||||
|
in the sequence record file (when taxonomic information is available at this
|
||||||
|
rank), and "taxon_ok" corresponds to the number of taxa that the barcode sequence
|
||||||
|
identifies unambiguously in the taxonomic database. In this example, the sequence
|
||||||
|
records correspond to 176 different species, but only 35 of these have specific
|
||||||
|
barcodes. "percent" is the percentage of unambiguously identified taxa among
|
||||||
|
the total number of taxa (taxon_ok/taxon_total*100).
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
import math
|
import math
|
||||||
import sys
|
import sys
|
||||||
@@ -16,13 +70,14 @@ from obitools.ecopcr.options import addTaxonomyDBOptions, loadTaxonomyDatabase
|
|||||||
|
|
||||||
|
|
||||||
def addSpecificityOptions(optionManager):
|
def addSpecificityOptions(optionManager):
|
||||||
optionManager.add_option('-e','--errors',
|
group = optionManager.add_option_group('ecotaxspecificity specific options')
|
||||||
|
group.add_option('-e','--errors',
|
||||||
action="store", dest="dist",
|
action="store", dest="dist",
|
||||||
metavar="###",
|
metavar="###",
|
||||||
type="int",
|
type="int",
|
||||||
default=1,
|
default=1,
|
||||||
help="Maximum errors between two sequences")
|
help="Maximum errors between two sequences")
|
||||||
optionManager.add_option('-q','--quorum',
|
group.add_option('-q','--quorum',
|
||||||
action="store", dest="quorum",
|
action="store", dest="quorum",
|
||||||
type="float",
|
type="float",
|
||||||
default=0.0,
|
default=0.0,
|
||||||
@@ -31,7 +86,7 @@ def addSpecificityOptions(optionManager):
|
|||||||
|
|
||||||
if __name__=='__main__':
|
if __name__=='__main__':
|
||||||
|
|
||||||
optionParser = getOptionManager([addInOutputOption,addTaxonomyDBOptions,addSpecificityOptions])
|
optionParser = getOptionManager([addInputFormatOption,addTaxonomyDBOptions,addSpecificityOptions])
|
||||||
|
|
||||||
(options, entries) = optionParser()
|
(options, entries) = optionParser()
|
||||||
|
|
||||||
@@ -110,9 +165,9 @@ if __name__=='__main__':
|
|||||||
indexbyseq[s].add(seq)
|
indexbyseq[s].add(seq)
|
||||||
yy = yy + 1
|
yy = yy + 1
|
||||||
|
|
||||||
print "Total Sequences added in graph: " + str(xx)
|
print "Number of sequences added in graph: " + str(xx)
|
||||||
print "Total nodes in all components: " + str (yy)
|
print "Number of nodes in all components: " + str (yy)
|
||||||
print "Lost sequences: " + str (xx-yy) + "!"
|
print "Number of sequences lost: " + str (xx-yy) + "!"
|
||||||
|
|
||||||
# since multiple different sequences have one key, we need to know what that key is for each sequence
|
# since multiple different sequences have one key, we need to know what that key is for each sequence
|
||||||
indexbykey={} #it will have elements like: {"seq1":key, "seq2":key, ...} where 'key' is the component key to which 'seqx' belongs
|
indexbykey={} #it will have elements like: {"seq1":key, "seq2":key, ...} where 'key' is the component key to which 'seqx' belongs
|
||||||
|
Reference in New Issue
Block a user