Add option to ecoPCRFormat to deal with an obischema db as taxonomy source
git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@158 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
@ -7,6 +7,12 @@ import sys
|
|||||||
import time
|
import time
|
||||||
import getopt
|
import getopt
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psycopg2
|
||||||
|
_dbenable=True
|
||||||
|
except ImportError:
|
||||||
|
_dbenable=False
|
||||||
|
|
||||||
#####
|
#####
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
@ -215,7 +221,56 @@ def readTaxonomyDump(taxdir):
|
|||||||
|
|
||||||
return taxonomy,ranks,alternativeName,index
|
return taxonomy,ranks,alternativeName,index
|
||||||
|
|
||||||
|
def readTaxonomyDB(dbname):
    """Load the NCBI taxonomy from the ``ncbi_taxonomy`` schema of a database.

    Parameters:
        dbname: name of the PostgreSQL database, passed to
            ``psycopg2.connect(database=...)``.

    Returns:
        A ``(taxonomy, ranks, alternativeName, index)`` tuple, where

        * ``taxonomy`` is the list of taxa sorted with ``taxonCmp``; each
          taxon is a list ``[numid, rank, parent, ...]`` in which ``rank``
          has been replaced by its value in ``ranks`` and ``parent`` by its
          value in ``index``; every name of category 'scientific name' is
          appended to its taxon;
        * ``ranks`` maps each rank class to its position in the
          alphabetically ordered list of rank classes;
        * ``alternativeName`` is a list of ``(name, category, taxon_index)``
          tuples, one per row of ``ncbi_taxonomy.name``;
        * ``index`` maps each taxid to the value returned by
          ``bsearchTaxon`` for it; aliased (old) taxids map to the entry of
          their current taxid, or to ``None`` when they have none.

    Raises:
        ImportError: if psycopg2 could not be imported at module load time.
        KeyError: if a taxon refers to a parent, a rank or a taxid that is
            not present in the database (the root taxon, numid 1 with a
            NULL parent, is the only tolerated exception).
    """
    # The psycopg2 import at the top of the file is optional; fail with an
    # explicit message rather than a NameError on `psycopg2` below.
    if not _dbenable:
        raise ImportError("psycopg2 is required to read the taxonomy "
                          "from a database")

    connection = psycopg2.connect(database=dbname)
    try:
        cursor = connection.cursor()

        cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon")
        # Rows are converted to lists because they are modified in place
        # further down (rank/parent substitution, scientific name append).
        taxonomy = [list(row) for row in cursor]

        cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class")
        rank_rows = cursor.fetchall()
        ranks = dict((row[0], position)
                     for position, row in enumerate(rank_rows))

        print >>sys.stderr,"Sorting taxons..."
        taxonomy.sort(taxonCmp)

        print >>sys.stderr,"Indexing taxonomy..."
        index = {}
        for taxon in taxonomy:
            # NOTE(review): bsearchTaxon presumably returns the position of
            # the taxid in the sorted list -- confirm against its definition.
            index[taxon[0]] = bsearchTaxon(taxonomy, taxon[0])

        print >>sys.stderr,"Indexing parent and rank..."
        for taxon in taxonomy:
            taxon[1] = ranks[taxon[1]]
            try:
                taxon[2] = index[taxon[2]]
            except KeyError:
                # The root (numid 1) has a NULL parent in the database:
                # make it its own parent. Any other unknown parent is an
                # error and is re-raised with its original traceback.
                if taxon[2] is None and taxon[0] == 1:
                    taxon[2] = index[taxon[0]]
                else:
                    raise

        cursor.execute("select taxid,name,category from ncbi_taxonomy.name")

        alternativeName = []
        for taxid, name, classname in cursor:
            taxon_index = index[taxid]
            alternativeName.append((name, classname, taxon_index))
            if classname == 'scientific name':
                taxonomy[taxon_index].append(name)

        cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias")

        print >>sys.stderr,"Adding taxid alias..."
        for taxid, current in cursor:
            if current is not None:
                index[taxid] = index[current]
            else:
                index[taxid] = None

        return taxonomy, ranks, alternativeName, index
    finally:
        # Release the database connection (and its cursor) whether the
        # load succeeded or failed.
        connection.close()
|
||||||
|
|
||||||
#####
|
#####
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
@ -293,11 +348,14 @@ def emblEntryParser(entry):
|
|||||||
|
|
||||||
######################
|
######################
|
||||||
|
|
||||||
|
_fastaSplit=re.compile(';\W*')
|
||||||
|
|
||||||
def parseFasta(seq):
|
def parseFasta(seq):
|
||||||
|
seq=seq.split('\n')
|
||||||
title = seq[0].strip()[1:].split(None,1)
|
title = seq[0].strip()[1:].split(None,1)
|
||||||
id=title[0]
|
id=title[0]
|
||||||
if len(title) == 2:
|
if len(title) == 2:
|
||||||
field = title[1].split('; ')
|
field = _fastaSplit.split(title[1])
|
||||||
else:
|
else:
|
||||||
field=[]
|
field=[]
|
||||||
info = dict(x.split('=',1) for x in field if '=' in x)
|
info = dict(x.split('=',1) for x in field if '=' in x)
|
||||||
@ -527,9 +585,10 @@ def ecoParseOptions(arguments):
|
|||||||
}
|
}
|
||||||
|
|
||||||
o,filenames = getopt.getopt(arguments,
|
o,filenames = getopt.getopt(arguments,
|
||||||
'ht:n:gfe',
|
'ht:T:n:gfe',
|
||||||
['help',
|
['help',
|
||||||
'taxonomy=',
|
'taxonomy=',
|
||||||
|
'taxonomy_db=',
|
||||||
'name=',
|
'name=',
|
||||||
'genbank',
|
'genbank',
|
||||||
'fasta',
|
'fasta',
|
||||||
@ -540,7 +599,11 @@ def ecoParseOptions(arguments):
|
|||||||
printHelp()
|
printHelp()
|
||||||
exit()
|
exit()
|
||||||
elif name in ('-t','--taxonomy'):
|
elif name in ('-t','--taxonomy'):
|
||||||
|
opt['taxmod']='dump'
|
||||||
opt['taxdir']=value
|
opt['taxdir']=value
|
||||||
|
elif name in ('-T','--taxonomy_db'):
|
||||||
|
opt['taxmod']='db'
|
||||||
|
opt['taxdb']=value
|
||||||
elif name in ('-n','--name'):
|
elif name in ('-n','--name'):
|
||||||
opt['prefix']=value
|
opt['prefix']=value
|
||||||
elif name in ('-g','--genbank'):
|
elif name in ('-g','--genbank'):
|
||||||
@ -578,7 +641,11 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
opt,filenames = ecoParseOptions(sys.argv[1:])
|
opt,filenames = ecoParseOptions(sys.argv[1:])
|
||||||
|
|
||||||
taxonomy = readTaxonomyDump(opt['taxdir'])
|
if opt['taxmod']=='dump':
|
||||||
|
taxonomy = readTaxonomyDump(opt['taxdir'])
|
||||||
|
elif opt['taxmod']=='db':
|
||||||
|
taxonomy = readTaxonomyDB(opt['taxdb'])
|
||||||
|
|
||||||
|
|
||||||
ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])
|
ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user