diff --git a/tools/ecoPCRFormat.py b/tools/ecoPCRFormat.py index cbc20af..3884001 100755 --- a/tools/ecoPCRFormat.py +++ b/tools/ecoPCRFormat.py @@ -7,6 +7,12 @@ import sys import time import getopt +try: + import psycopg2 + _dbenable=True +except ImportError: + _dbenable=False + ##### # # @@ -215,7 +221,56 @@ def readTaxonomyDump(taxdir): return taxonomy,ranks,alternativeName,index +def readTaxonomyDB(dbname): + connection = psycopg2.connect(database=dbname) + + cursor = connection.cursor() + cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon") + taxonomy=[list(x) for x in cursor] + cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class") + ranks=cursor.fetchall() + ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks)))) + + print >>sys.stderr,"Sorting taxons..." + taxonomy.sort(taxonCmp) + + print >>sys.stderr,"Indexing taxonomy..." + index = {} + for t in taxonomy: + index[t[0]]=bsearchTaxon(taxonomy, t[0]) + + print >>sys.stderr,"Indexing parent and rank..." + for t in taxonomy: + t[1]=ranks[t[1]] + try: + t[2]=index[t[2]] + except KeyError,e: + if t[2] is None and t[0]==1: + t[2]=index[t[0]] + else: + raise e + + cursor.execute("select taxid,name,category from ncbi_taxonomy.name") + + alternativeName=[] + for taxid,name,classname in cursor: + alternativeName.append((name,classname,index[taxid])) + if classname == 'scientific name': + taxonomy[index[taxid]].append(name) + + cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias") + + print >>sys.stderr,"Adding taxid alias..." + for taxid,current in cursor: + if current is not None: + index[taxid]=index[current] + else: + index[taxid]=None + + + return taxonomy,ranks,alternativeName,index + ##### # # @@ -293,11 +348,14 @@ def emblEntryParser(entry): ###################### +_fastaSplit=re.compile(';\W*') + def parseFasta(seq): + seq=seq.split('\n') title = seq[0].strip()[1:].split(None,1) id=title[0] if len(title) == 2: - field = title[1].split('; ') + field = _fastaSplit.split(title[1]) else: field=[] info = dict(x.split('=',1) for x in field if '=' in x) @@ -527,9 +585,10 @@ def ecoParseOptions(arguments): } o,filenames = getopt.getopt(arguments, - 'ht:n:gfe', + 'ht:T:n:gfe', ['help', 'taxonomy=', + 'taxonomy_db=', 'name=', 'genbank', 'fasta', @@ -540,7 +599,11 @@ def ecoParseOptions(arguments): printHelp() exit() elif name in ('-t','--taxonomy'): + opt['taxmod']='dump' opt['taxdir']=value + elif name in ('-T','--taxonomy_db'): + opt['taxmod']='db' + opt['taxdb']=value elif name in ('-n','--name'): opt['prefix']=value elif name in ('-g','--genbank'): @@ -578,7 +641,11 @@ if __name__ == '__main__': opt,filenames = ecoParseOptions(sys.argv[1:]) - taxonomy = readTaxonomyDump(opt['taxdir']) + if opt['taxmod']=='dump': + taxonomy = readTaxonomyDump(opt['taxdir']) + elif opt['taxmod']=='db': + taxonomy = readTaxonomyDB(opt['taxdb']) + ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])