Add option to ecoPCRFormat to deal with an obischema db as taxonomy source

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@158 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
2008-02-08 10:27:42 +00:00
parent a1141a77b5
commit f68f4af244

View File

@ -7,6 +7,12 @@ import sys
import time
import getopt
try:
import psycopg2
_dbenable=True
except ImportError:
_dbenable=False
#####
#
#
@ -215,7 +221,56 @@ def readTaxonomyDump(taxdir):
return taxonomy,ranks,alternativeName,index
def readTaxonomyDB(dbname):
connection = psycopg2.connect(database=dbname)
cursor = connection.cursor()
cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon")
taxonomy=[list(x) for x in cursor]
cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class")
ranks=cursor.fetchall()
ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks))))
print >>sys.stderr,"Sorting taxons..."
taxonomy.sort(taxonCmp)
print >>sys.stderr,"Indexing taxonomy..."
index = {}
for t in taxonomy:
index[t[0]]=bsearchTaxon(taxonomy, t[0])
print >>sys.stderr,"Indexing parent and rank..."
for t in taxonomy:
t[1]=ranks[t[1]]
try:
t[2]=index[t[2]]
except KeyError,e:
if t[2] is None and t[0]==1:
t[2]=index[t[0]]
else:
raise e
cursor.execute("select taxid,name,category from ncbi_taxonomy.name")
alternativeName=[]
for taxid,name,classname in cursor:
alternativeName.append((name,classname,index[taxid]))
if classname == 'scientific name':
taxonomy[index[taxid]].append(name)
cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias")
print >>sys.stderr,"Adding taxid alias..."
for taxid,current in cursor:
if current is not None:
index[taxid]=index[current]
else:
index[taxid]=None
return taxonomy,ranks,alternativeName,index
#####
#
#
@ -293,11 +348,14 @@ def emblEntryParser(entry):
######################
_fastaSplit=re.compile(';\W*')
def parseFasta(seq):
seq=seq.split('\n')
title = seq[0].strip()[1:].split(None,1)
id=title[0]
if len(title) == 2:
field = title[1].split('; ')
field = _fastaSplit.split(title[1])
else:
field=[]
info = dict(x.split('=',1) for x in field if '=' in x)
@ -527,9 +585,10 @@ def ecoParseOptions(arguments):
}
o,filenames = getopt.getopt(arguments,
'ht:n:gfe',
'ht:T:n:gfe',
['help',
'taxonomy=',
'taxonomy_db=',
'name=',
'genbank',
'fasta',
@ -540,7 +599,11 @@ def ecoParseOptions(arguments):
printHelp()
exit()
elif name in ('-t','--taxonomy'):
opt['taxmod']='dump'
opt['taxdir']=value
elif name in ('-T','--taxonomy_db'):
opt['taxmod']='db'
opt['taxdb']=value
elif name in ('-n','--name'):
opt['prefix']=value
elif name in ('-g','--genbank'):
@ -578,7 +641,11 @@ if __name__ == '__main__':
opt,filenames = ecoParseOptions(sys.argv[1:])
taxonomy = readTaxonomyDump(opt['taxdir'])
if opt['taxmod']=='dump':
taxonomy = readTaxonomyDump(opt['taxdir'])
elif opt['taxmod']=='db':
taxonomy = readTaxonomyDB(opt['taxdb'])
ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])