Add new algorithm in EcoTag to reduce impact of database errors

This commit is contained in:
2012-05-08 18:31:37 +00:00
parent 657b3e33a8
commit 5205b9e478
2 changed files with 18 additions and 7 deletions

View File

@@ -214,7 +214,7 @@ if __name__=='__main__':
if best[0] is not None: if best[0] is not None:
taxlist = set(taxonlink[p[0].id] for p in match) taxlist = set(taxonlink[p[0].id] for p in match)
lca = taxonomy.lastCommonTaxon(*tuple(taxlist)) lca = taxonomy.betterCommonTaxon(0.2,*tuple(taxlist))
scname = taxonomy.getScientificName(lca) scname = taxonomy.getScientificName(lca)
rank = taxonomy.getRank(lca) rank = taxonomy.getRank(lca)

View File

@@ -1,11 +1,12 @@
import struct import struct
import sys import sys
from itertools import count,imap from itertools import count,imap,combinations
from obitools.ecopcr import EcoPCRDBFile from obitools.ecopcr import EcoPCRDBFile
from obitools.utils import universalOpen from obitools.utils import universalOpen
from obitools.utils import ColumnFile from obitools.utils import ColumnFile
import math
class Taxonomy(object): class Taxonomy(object):
def __init__(self): def __init__(self):
@@ -116,11 +117,21 @@ class Taxonomy(object):
return ancetre return ancetre
def betterCommonTaxon(self,error=1,*taxids): def depth(self,taxid):
return len([x for x in self.parentalTreeIterator(taxid)])
def betterCommonTaxon(self,error=0.2,*taxids):
error = math.floor(len(taxids) * error)
if error >= 1:
possible = [self.lastCommonTaxon(x) for x in combinations(taxids,len(taxids)-error)]
possible.sort(cmp=lambda t1,t2: cmp(self.depth(t2),self.depth(t1)))
lca=possible[0]
else:
lca = self.lastCommonTaxon(*taxids) lca = self.lastCommonTaxon(*taxids)
idx = self._index[lca]
sublca = [t[0] for t in self._taxonomy if t[2]==idx] return lca
return sublca
def getPreferedName(self,taxid): def getPreferedName(self,taxid):