Add new algorithm in EcoTag to reduce impact of database errors

This commit is contained in:
2012-05-08 18:31:37 +00:00
parent 657b3e33a8
commit 5205b9e478
2 changed files with 18 additions and 7 deletions

View File

@ -214,7 +214,7 @@ if __name__=='__main__':
if best[0] is not None:
taxlist = set(taxonlink[p[0].id] for p in match)
lca = taxonomy.lastCommonTaxon(*tuple(taxlist))
lca = taxonomy.betterCommonTaxon(0.2,*tuple(taxlist))
scname = taxonomy.getScientificName(lca)
rank = taxonomy.getRank(lca)

View File

@ -1,11 +1,12 @@
import struct
import sys
from itertools import count,imap
from itertools import count,imap,combinations
from obitools.ecopcr import EcoPCRDBFile
from obitools.utils import universalOpen
from obitools.utils import ColumnFile
import math
class Taxonomy(object):
def __init__(self):
@ -116,11 +117,21 @@ class Taxonomy(object):
return ancetre
def betterCommonTaxon(self,error=1,*taxids):
    """Return the taxids of the records attached to the last common taxon.

    The last common taxon of *taxids* is resolved with
    ``lastCommonTaxon``, translated to its position through
    ``self._index``, and every record of ``self._taxonomy`` whose field 2
    equals that position contributes its field 0 to the result.

    NOTE(review): field 2 is presumably the parent index and field 0 the
    taxid of the record -- confirm against the taxonomy loader.

    ``error`` is accepted but not used by this implementation.

    Returns a list (possibly empty), in ``self._taxonomy`` order.
    """
    ancestor = self.lastCommonTaxon(*taxids)
    ancestor_index = self._index[ancestor]
    children = []
    for record in self._taxonomy:
        if record[2] == ancestor_index:
            children.append(record[0])
    return children
def depth(self,taxid):
    """Return how many entries ``parentalTreeIterator`` yields for *taxid*.

    NOTE(review): presumably this is the number of nodes on the path from
    *taxid* up to the root -- confirm against ``parentalTreeIterator``.
    """
    # Count the entries one by one; only the number of items matters,
    # not their values.
    steps = 0
    for _ in self.parentalTreeIterator(taxid):
        steps += 1
    return steps
def betterCommonTaxon(self,error=0.2,*taxids):
    """Return a last common taxon that tolerates a fraction of outliers.

    Up to ``floor(len(taxids) * error)`` taxids may be ignored.  Every
    subset obtained by dropping that many taxids is reduced with
    ``lastCommonTaxon`` and the deepest result (according to
    ``self.depth``) is returned; on ties the first subset generated by
    ``itertools.combinations`` wins.  When no taxid may be dropped, the
    plain ``lastCommonTaxon`` of all *taxids* is returned.

    Parameters:
        error:  fraction of *taxids* that may be discarded.
        taxids: taxids to summarise, passed positionally.

    Returns whatever ``lastCommonTaxon`` returns for the selected subset.
    """
    total = len(taxids)
    # int(): math.floor returns a float on Python 2, and combinations()
    # requires an integer subset size.
    # min(): always keep at least one taxid, so that error >= 1 cannot
    # produce empty (or negative-sized) subsets.
    tolerated = min(int(math.floor(total * error)), total - 1)
    if tolerated < 1:
        return self.lastCommonTaxon(*taxids)
    # lastCommonTaxon takes the taxids as separate positional arguments,
    # so each subset tuple has to be unpacked.
    candidates = [self.lastCommonTaxon(*subset)
                  for subset in combinations(taxids, total - tolerated)]
    # max() returns the first deepest candidate, which is what a stable
    # descending sort followed by [0] would select, while computing each
    # depth only once.
    return max(candidates, key=self.depth)
def getPreferedName(self,taxid):