12 Commits

8 changed files with 135 additions and 90 deletions
+11 -2
View File
@@ -15,7 +15,7 @@
output file. FLOAT is included in a [0,1] interval.
.. cmdoption:: --minimum-circle=FLOAT
minimum identity considered for the assignment circle.
FLOAT is included in a [0,1] interval.
@@ -40,11 +40,20 @@
.. cmdoption:: -E FLOAT, --errors=FLOAT
FLOAT is the fraction of reference sequences that will
be ignored when looking for the most recent common ancestor. This
be ignored when looking for the lowest common ancestor. This
option is useful when a non-negligible proportion of reference sequences
is expected to be assigned to the wrong taxon, for example because of
taxonomic misidentification. FLOAT is included in a [0,1] interval.
.. cmdoption:: -M INTEGER, --min-matches=FLOAT
Define the minimum congruent assignation. If this minimum is reached and
the -E option is activated, the lowest common ancestor algorithm tolarated
that some sequences do not provide the same taxonomic annotation (see the
-E option).
.. cmdoption:: --cache-size=INTEGER
A cache for computed similarities is maintained by `ecotag`. the default
+1 -1
View File
@@ -19,7 +19,7 @@ from os import path
PACKAGE = "OBITools"
VERSION = "1.2.0"
VERSION = "1.2.3"
AUTHOR = 'Eric Coissac'
EMAIL = 'eric@coissac.eu'
URL = 'metabarcoding.org/obitools'
+18 -6
View File
@@ -1,4 +1,4 @@
#!/usr/local/bin/python
#!/usr/local/OBITools-1.1.22/bin/python
'''
:py:mod:`ecotag`: assigns sequences to taxa
===========================================
@@ -145,9 +145,16 @@ def addSearchOptions(optionManager):
optionManager.add_option('-E','--errors',
action='store',dest='error',
type='float',
default=0.0,
help='Tolerated rate of wrong assignation')
optionManager.add_option('-M','--min-matches',
action='store',dest='minmatches',
type="int",
default=1,
help='Minimum congruent assignation')
optionManager.add_option('--cache-size',
action='store',dest='cache',
type='int',
@@ -333,10 +340,10 @@ if __name__=='__main__':
rankid = taxonomy.findRankByName(options.explain)
for seq in db:
id = seq.id[0:46]
seq.id=id
assert id not in taxonlink
taxonlink[id]=int(seq['taxid'])
seqid = seq.id[0:46]
seq.id=seqid
assert seqid not in taxonlink
taxonlink[seqid]=int(seq['taxid'])
if options.uniq:
@@ -363,7 +370,12 @@ if __name__=='__main__':
if best[0]:
taxlist = set(taxonlink[p[0].id] for p in match)
lca = taxonomy.betterCommonTaxon(options.error,*tuple(taxlist))
if options.error > 0.0 and len(match) >= int(options.minmatches / (1.0 - options.error)):
lca = taxonomy.betterCommonTaxon(options.error,
*tuple(taxlist))
else:
lca = taxonomy.betterCommonTaxon(0.0,*tuple(taxlist))
scname = taxonomy.getScientificName(lca)
rank = taxonomy.getRank(lca)
if len(taxlist) < 15:
+2 -1
View File
@@ -48,7 +48,7 @@ Otherwise,
.. code-block:: bash
> obiaddtaxids -T species_name -g genus_identified.fasta \\
> obiaddtaxids -k species_name -g genus_identified.fasta \\
-u unidentified.fasta -d my_ecopcr_database \\
my_sequences.fasta > identified.fasta
@@ -340,6 +340,7 @@ if __name__=='__main__':
entries = entryIterator(entries)
elif options.db_type == 'UNITE_FULL' :
entryIterator = UNITEIterator_FULL
entries = entryIterator(entries)
elif options.db_type == 'UNITE_GENERAL' :
entryIterator = UNITEIterator_GENERAL
entries = entryIterator(entries)
+96 -73
View File
@@ -23,8 +23,11 @@ def minimum(seqs):
return min(s['select'] for s in seqs)
def maximum(seqs):
return max(s['select'] for s in seqs)
try:
return max(s['select'] for s in seqs)
except TypeError, e:
print >>sys.stderr, seqs
raise e
def mean(seqs):
ss= reduce(lambda x,y: x + y,(s['select'] for s in seqs),0)
return float(ss) / len(seqs)
@@ -101,6 +104,13 @@ def addSelectOptions(optionManager):
default=[],
help="attributes to merge within each group")
group.add_option('-s','--sample',
action="store", dest="sample",
metavar="<TAGNAME>",
type="str",
default=None,
help="Tag containing sample descriptions, the default value is set to *merged_sample*")
group.add_option('--merge-ids',
action="store_true", dest="mergeids",
default=False,
@@ -129,16 +139,22 @@ if __name__ == '__main__':
classes = {}
print >>sys.stderr,"\nLoading sequences...\n"
with_taxonomy=hasattr(options, 'taxonomy') and options.taxonomy is not None
nbseq=0
for s in entries:
nbseq+=1
category = []
if with_taxonomy:
environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
else:
environ = {'sequence':s,'random':random()}
for c in options.categories:
try:
if hasattr(options, 'taxonomy') and options.taxonomy is not None:
environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
else:
environ = {'sequence':s,'random':random()}
v = eval(c,environ,s)
category.append(v)
except:
@@ -149,11 +165,6 @@ if __name__ == '__main__':
group.append(s)
classes[category]= group
if hasattr(options, 'taxonomy') and options.taxonomy is not None:
environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
else:
environ = {'sequence':s, 'random':random()}
try:
select = eval(options.function,environ,s)
s['select']=select
@@ -181,75 +192,87 @@ if __name__ == '__main__':
i+=1
progressBar(i,lclasses,False,"%15s" % ("/".join(map(str,c)),))
seqs = classes[c]
sortclass(seqs, options)
if len(c)==1:
c=c[0]
if options.sample is not None:
subsets = {}
for s in seqs:
for sid in s[options.sample]:
ss = subsets.get(sid,[])
ss.append(s)
subsets[sid]=ss
else:
subsets={"all":seqs}
if options.number==1:
s = seqs[0]
for seqs in subsets.values():
sortclass(seqs, options)
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid_dist' not in s:
s["taxid_dist"]={}
if 'taxid' in s:
s["taxid_dist"][s.id]=s['taxid']
mkey = "merged_%s" % key
if mkey not in s:
if key in s:
s[mkey]={s[key]:1}
else:
s[mkey]={}
if 'count' not in s:
s['count']=1
if mergeIds:
s['merged']=[s.id]
for seq in seqs[1:]:
if len(c)==1:
c=c[0]
s['count']+=seq['count']
if options.number==1 and options.sample is None:
s = seqs[0]
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid_dist' in seq:
s["taxid_dist"].update(seq["taxid_dist"])
if 'taxid' in seq:
s["taxid_dist"][seq.id]=seq['taxid']
if 'taxid_dist' not in s:
s["taxid_dist"]={}
if 'taxid' in s:
s["taxid_dist"][s.id]=s['taxid']
mkey = "merged_%s" % key
if mkey in seq:
m = seq[mkey]
else:
if key in seq:
m={seq[key]:1}
allmkey = set(m.keys()) | set(s[mkey].keys())
s[mkey] = dict((k,m.get(k,0)+s[mkey].get(k,0)) for k in allmkey)
# if mkey in seq:
# for skey in seq[mkey]:
# if skey in s:
# s[mkey][skey]=s[mkey].get(seq[skey],0)+seq[mkey][skey]
# else:
# s[mkey][skey]=seq[mkey][skey]
#for key in seq.iterkeys():
# # Merger proprement l'attribut merged s'il exist
# if key in s and s[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged' and key!='select':
# del(s[key])
if mkey not in s:
if key in s:
s[mkey]={s[key]:1}
else:
s[mkey]={}
if 'count' not in s:
s['count']=1
if mergeIds:
s['merged'].append(seq.id)
if taxonomy is not None:
mergeTaxonomyClassification(seqs, taxonomy)
s['merged']=[s.id]
for seq in seqs[1:]:
for s in seqs[0:options.number]:
s['class']=c
del s['select']
writer(s)
s['count']+=seq['count']
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid_dist' in seq:
s["taxid_dist"].update(seq["taxid_dist"])
if 'taxid' in seq:
s["taxid_dist"][seq.id]=seq['taxid']
mkey = "merged_%s" % key
if mkey in seq:
m = seq[mkey]
else:
if key in seq:
m={seq[key]:1}
allmkey = set(m.keys()) | set(s[mkey].keys())
s[mkey] = dict((k,m.get(k,0)+s[mkey].get(k,0)) for k in allmkey)
if mergeIds:
s['merged'].append(seq.id)
if taxonomy is not None:
mergeTaxonomyClassification(seqs, taxonomy)
for s in seqs[0:options.number]:
s['class']=c
s['__@TOWRITE@__']=True
print >>sys.stderr,"\Writing sequences...\n"
progressBar(1,nbseq,True,'Writing')
i=0
for c in classes:
seqs = classes[c]
for s in seqs:
i+=1
progressBar(i,nbseq,False,"Writing")
if '__@TOWRITE@__' in s:
del s['__@TOWRITE@__']
del s['select']
writer(s)
print >>sys.stderr
+3 -3
View File
@@ -320,9 +320,9 @@ if __name__ == '__main__':
parent[3],parent[0],options.taxonomy._ranks[parent[1]])
localdata=True
for n in options.newname:
tx = t.split(t,':')
taxid = options.taxonomy.addPreferedName(tx[0].strip(),tx[1])
for t in options.newname:
tx = t.split(':')
taxid = options.taxonomy.addPreferedName(int(tx[1]), tx[0].strip())
print "name : %8d\t->\t%s" % (taxid,options.taxonomy.getPreferedName(taxid))
ecoTaxonomyWriter(options.ecodb,options.taxonomy,onlyLocal=True)
+3 -3
View File
@@ -269,7 +269,7 @@ class Taxonomy(object):
def addPreferedName(self,taxid,name):
idx = self.findIndex(taxid)
self._preferedName.append(name,'obi',idx)
self._preferedName.append([name,'obi',idx])
self._preferedidx[idx]=name
return taxid
@@ -399,10 +399,10 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
for t in self._taxonomy[self._localtaxon:]:
self._name.append((t[3],'scientific name',i))
i+=1
try :
self._preferedName = [(x[0],'obi',x[2])
for x in self.__ecoNameIterator(self._preferedNamesFile,noError=True)]
for x in self.__ecoNameIterator(self._preferedNamesFile)]
print >> sys.stderr, " [INFO : Preferred taxon name file found] : %d added taxa" % len(self._preferedName)
except:
print >> sys.stderr, " [INFO : Preferred taxon name file not found]"
+1 -1
View File
@@ -1,5 +1,5 @@
major = 1
minor = 2
serial= '0'
serial= '3'
version = "%2d.%02d %s" % (major,minor,serial)