Update to consider evolution of the language
This commit is contained in:
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import re
|
||||
import gzip
|
||||
@ -6,12 +6,9 @@ import struct
|
||||
import sys
|
||||
import time
|
||||
import getopt
|
||||
from functools import cmp_to_key
|
||||
|
||||
try:
|
||||
import psycopg2
|
||||
_dbenable=True
|
||||
except ImportError:
|
||||
_dbenable=False
|
||||
_dbenable=False
|
||||
|
||||
#####
|
||||
#
|
||||
@ -80,15 +77,17 @@ class ColumnFile(object):
|
||||
def __init__(self,stream,sep=None,strip=True,types=None):
|
||||
if isinstance(stream,str):
|
||||
self._stream = open(stream)
|
||||
elif hasattr(stream,'next'):
|
||||
self._stream = stream
|
||||
else:
|
||||
raise ValueError,'stream must be string or an iterator'
|
||||
try:
|
||||
iter(stream)
|
||||
self._stream = stream
|
||||
except TypeError:
|
||||
raise ValueError('stream must be string or an iterator')
|
||||
self._delimiter=sep
|
||||
self._strip=strip
|
||||
if types:
|
||||
self._types=[x for x in types]
|
||||
for i in xrange(len(self._types)):
|
||||
for i in range(len(self._types)):
|
||||
if self._types[i] is bool:
|
||||
self._types[i]=ColumnFile.str2bool
|
||||
else:
|
||||
@ -103,14 +102,14 @@ class ColumnFile(object):
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def next(self):
|
||||
ligne = self._stream.next()
|
||||
def __next__(self):
|
||||
ligne = next(self._stream)
|
||||
data = ligne.split(self._delimiter)
|
||||
if self._strip or self._types:
|
||||
data = [x.strip() for x in data]
|
||||
if self._types:
|
||||
it = endLessIterator(self._types)
|
||||
data = [x[1](x[0]) for x in ((y,it.next()) for y in data)]
|
||||
data = [x[1](x[0]) for x in ((y,next(it)) for y in data)]
|
||||
return data
|
||||
|
||||
def taxonCmp(t1,t2):
|
||||
@ -125,14 +124,14 @@ def bsearchTaxon(taxonomy,taxid):
|
||||
begin = 0
|
||||
end = taxCount
|
||||
oldcheck=taxCount
|
||||
check = begin + end / 2
|
||||
check = int(begin + end / 2)
|
||||
while check != oldcheck and taxonomy[check][0]!=taxid :
|
||||
if taxonomy[check][0] < taxid:
|
||||
begin=check
|
||||
else:
|
||||
end=check
|
||||
oldcheck=check
|
||||
check = (begin + end) / 2
|
||||
check = int((begin + end) / 2)
|
||||
|
||||
|
||||
if taxonomy[check][0]==taxid:
|
||||
@ -152,22 +151,22 @@ def readNodeTable(file):
|
||||
str,str,bool,
|
||||
int,bool,int,
|
||||
bool,bool,bool,str))
|
||||
print >>sys.stderr,"Reading taxonomy dump file..."
|
||||
print("Reading taxonomy dump file...", file=sys.stderr)
|
||||
taxonomy=[[n[0],n[2],n[1]] for n in nodes]
|
||||
print >>sys.stderr,"List all taxonomy rank..."
|
||||
print("List all taxonomy rank...", file=sys.stderr)
|
||||
ranks =list(set(x[1] for x in taxonomy))
|
||||
ranks.sort()
|
||||
ranks = dict(map(None,ranks,xrange(len(ranks))))
|
||||
ranks = {rank: index for index, rank in enumerate(ranks)}
|
||||
|
||||
print >>sys.stderr,"Sorting taxons..."
|
||||
taxonomy.sort(taxonCmp)
|
||||
print("Sorting taxons...", file=sys.stderr)
|
||||
taxonomy.sort(key=lambda x: x[0])
|
||||
|
||||
print >>sys.stderr,"Indexing taxonomy..."
|
||||
print("Indexing taxonomy...", file=sys.stderr)
|
||||
index = {}
|
||||
for t in taxonomy:
|
||||
index[t[0]]=bsearchTaxon(taxonomy, t[0])
|
||||
|
||||
print >>sys.stderr,"Indexing parent and rank..."
|
||||
print("Indexing parent and rank...", file=sys.stderr)
|
||||
for t in taxonomy:
|
||||
t[1]=ranks[t[1]]
|
||||
t[2]=index[t[2]]
|
||||
@ -203,7 +202,7 @@ def deletedNodeIterator(file):
|
||||
def readTaxonomyDump(taxdir):
|
||||
taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
|
||||
|
||||
print >>sys.stderr,"Adding scientific name..."
|
||||
print("Adding scientific name...", file=sys.stderr)
|
||||
|
||||
alternativeName=[]
|
||||
for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir):
|
||||
@ -211,66 +210,16 @@ def readTaxonomyDump(taxdir):
|
||||
if classname == 'scientific name':
|
||||
taxonomy[index[taxid]].append(name)
|
||||
|
||||
print >>sys.stderr,"Adding taxid alias..."
|
||||
print("Adding taxid alias...", file=sys.stderr)
|
||||
for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
|
||||
index[taxid]=index[current]
|
||||
|
||||
print >>sys.stderr,"Adding deleted taxid..."
|
||||
print("Adding deleted taxid...", file=sys.stderr)
|
||||
for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
|
||||
index[taxid]=None
|
||||
|
||||
return taxonomy,ranks,alternativeName,index
|
||||
|
||||
def readTaxonomyDB(dbname):
|
||||
connection = psycopg2.connect(database=dbname)
|
||||
|
||||
cursor = connection.cursor()
|
||||
cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon")
|
||||
taxonomy=[list(x) for x in cursor]
|
||||
|
||||
cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class")
|
||||
ranks=cursor.fetchall()
|
||||
ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks))))
|
||||
|
||||
print >>sys.stderr,"Sorting taxons..."
|
||||
taxonomy.sort(taxonCmp)
|
||||
|
||||
print >>sys.stderr,"Indexing taxonomy..."
|
||||
index = {}
|
||||
for t in taxonomy:
|
||||
index[t[0]]=bsearchTaxon(taxonomy, t[0])
|
||||
|
||||
print >>sys.stderr,"Indexing parent and rank..."
|
||||
for t in taxonomy:
|
||||
t[1]=ranks[t[1]]
|
||||
try:
|
||||
t[2]=index[t[2]]
|
||||
except KeyError,e:
|
||||
if t[2] is None and t[0]==1:
|
||||
t[2]=index[t[0]]
|
||||
else:
|
||||
raise e
|
||||
|
||||
cursor.execute("select taxid,name,category from ncbi_taxonomy.name")
|
||||
|
||||
alternativeName=[]
|
||||
for taxid,name,classname in cursor:
|
||||
alternativeName.append((name,classname,index[taxid]))
|
||||
if classname == 'scientific name':
|
||||
taxonomy[index[taxid]].append(name)
|
||||
|
||||
cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias")
|
||||
|
||||
print >>sys.stderr,"Adding taxid alias..."
|
||||
for taxid,current in cursor:
|
||||
if current is not None:
|
||||
index[taxid]=index[current]
|
||||
else:
|
||||
index[taxid]=None
|
||||
|
||||
|
||||
return taxonomy,ranks,alternativeName,index
|
||||
|
||||
#####
|
||||
#
|
||||
#
|
||||
@ -282,22 +231,27 @@ def readTaxonomyDB(dbname):
|
||||
def entryIterator(file):
|
||||
file = universalOpen(file)
|
||||
rep =[]
|
||||
for ligne in file:
|
||||
ligne = file.readline()
|
||||
while ligne:
|
||||
rep.append(ligne)
|
||||
if ligne == '//\n':
|
||||
rep = ''.join(rep)
|
||||
yield rep
|
||||
rep = []
|
||||
ligne = file.readline()
|
||||
|
||||
def fastaEntryIterator(file):
|
||||
file = universalOpen(file)
|
||||
rep =[]
|
||||
for ligne in file:
|
||||
ligne = file.readline()
|
||||
while ligne:
|
||||
if ligne[0] == '>' and rep:
|
||||
rep = ''.join(rep)
|
||||
yield rep
|
||||
rep = []
|
||||
rep.append(ligne)
|
||||
ligne = file.readline()
|
||||
|
||||
if rep:
|
||||
rep = ''.join(rep)
|
||||
yield rep
|
||||
@ -418,7 +372,7 @@ def taxonomyInfo(entry,connection):
|
||||
|
||||
def ecoSeqPacker(sq):
|
||||
|
||||
compactseq = gzip.zlib.compress(sq['sequence'],9)
|
||||
compactseq = gzip.zlib.compress(bytes(sq['sequence'],"ascii"),9)
|
||||
cptseqlength = len(compactseq)
|
||||
delength = len(sq['definition'])
|
||||
|
||||
@ -427,11 +381,11 @@ def ecoSeqPacker(sq):
|
||||
packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength),
|
||||
totalSize,
|
||||
sq['taxid'],
|
||||
sq['id'],
|
||||
bytes(sq['id'],"ascii"),
|
||||
delength,
|
||||
len(sq['sequence']),
|
||||
cptseqlength,
|
||||
sq['definition'],
|
||||
bytes(sq['definition'],"ascii"),
|
||||
compactseq)
|
||||
|
||||
assert len(packed) == totalSize+4, "error in sequence packing"
|
||||
@ -450,7 +404,7 @@ def ecoTaxPacker(tx):
|
||||
tx[1],
|
||||
tx[2],
|
||||
namelength,
|
||||
tx[3])
|
||||
bytes(tx[3],"ascii"))
|
||||
|
||||
return packed
|
||||
|
||||
@ -460,7 +414,7 @@ def ecoRankPacker(rank):
|
||||
|
||||
packed = struct.pack('> I %ds' % namelength,
|
||||
namelength,
|
||||
rank)
|
||||
bytes(rank, 'ascii'))
|
||||
|
||||
return packed
|
||||
|
||||
@ -476,8 +430,8 @@ def ecoNamePacker(name):
|
||||
namelength,
|
||||
classlength,
|
||||
name[2],
|
||||
name[0],
|
||||
name[1])
|
||||
bytes(name[0], 'ascii'),
|
||||
bytes(name[1], 'ascii'))
|
||||
|
||||
return packed
|
||||
|
||||
@ -505,11 +459,11 @@ def ecoSeqWriter(file,input,taxindex,parser):
|
||||
skipped.append(entry['id'])
|
||||
where = universalTell(input)
|
||||
progressBar(where, inputsize)
|
||||
print >>sys.stderr," Readed sequences : %d " % seqcount,
|
||||
print(" Readed sequences : %d " % seqcount, end=' ', file=sys.stderr)
|
||||
else:
|
||||
skipped.append(entry['id'])
|
||||
|
||||
print >>sys.stderr
|
||||
print(file=sys.stderr)
|
||||
output.seek(0,0)
|
||||
output.write(struct.pack('> I',seqcount))
|
||||
|
||||
@ -530,7 +484,7 @@ def ecoRankWriter(file,ranks):
|
||||
output = open(file,'wb')
|
||||
output.write(struct.pack('> I',len(ranks)))
|
||||
|
||||
rankNames = ranks.keys()
|
||||
rankNames = list(ranks.keys())
|
||||
rankNames.sort()
|
||||
|
||||
for rank in rankNames:
|
||||
@ -552,7 +506,7 @@ def ecoNameWriter(file,names):
|
||||
output = open(file,'wb')
|
||||
output.write(struct.pack('> I',len(names)))
|
||||
|
||||
names.sort(nameCmp)
|
||||
names.sort(key=lambda x:x[0].upper())
|
||||
|
||||
for name in names:
|
||||
output.write(ecoNamePacker(name))
|
||||
@ -573,8 +527,8 @@ def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
|
||||
taxonomy[3],
|
||||
parser)
|
||||
if sk:
|
||||
print >>sys.stderr,"Skipped entry :"
|
||||
print >>sys.stderr,sk
|
||||
print("Skipped entry :", file=sys.stderr)
|
||||
print(sk, file=sys.stderr)
|
||||
|
||||
def ecoParseOptions(arguments):
|
||||
opt = {
|
||||
@ -618,34 +572,30 @@ def ecoParseOptions(arguments):
|
||||
opt['parser']=sequenceIteratorFactory(emblEntryParser,
|
||||
entryIterator)
|
||||
else:
|
||||
raise ValueError,'Unknown option %s' % name
|
||||
raise ValueError('Unknown option %s' % name)
|
||||
|
||||
return opt,filenames
|
||||
|
||||
def printHelp():
|
||||
print "-----------------------------------"
|
||||
print " ecoPCRFormat.py"
|
||||
print "-----------------------------------"
|
||||
print "ecoPCRFormat.py [option] <argument>"
|
||||
print "-----------------------------------"
|
||||
print "-e --embl :[E]mbl format"
|
||||
print "-f --fasta :[F]asta format"
|
||||
print "-g --genbank :[G]enbank format"
|
||||
print "-h --help :[H]elp - print this help"
|
||||
print "-n --name :[N]ame of the new database created"
|
||||
print "-t --taxonomy :[T]axonomy - path to the taxonomy database"
|
||||
print " :bcp-like dump from GenBank taxonomy database."
|
||||
print "-----------------------------------"
|
||||
print("-----------------------------------")
|
||||
print(" ecoPCRFormat.py")
|
||||
print("-----------------------------------")
|
||||
print("ecoPCRFormat.py [option] <argument>")
|
||||
print("-----------------------------------")
|
||||
print("-e --embl :[E]mbl format")
|
||||
print("-f --fasta :[F]asta format")
|
||||
print("-g --genbank :[G]enbank format")
|
||||
print("-h --help :[H]elp - print this help")
|
||||
print("-n --name :[N]ame of the new database created")
|
||||
print("-t --taxonomy :[T]axonomy - path to the taxonomy database")
|
||||
print(" :bcp-like dump from GenBank taxonomy database.")
|
||||
print("-----------------------------------")
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
opt,filenames = ecoParseOptions(sys.argv[1:])
|
||||
|
||||
if opt['taxmod']=='dump':
|
||||
taxonomy = readTaxonomyDump(opt['taxdir'])
|
||||
elif opt['taxmod']=='db':
|
||||
taxonomy = readTaxonomyDB(opt['taxdb'])
|
||||
|
||||
|
||||
taxonomy = readTaxonomyDump(opt['taxdir'])
|
||||
|
||||
ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])
|
||||
|
||||
|
Reference in New Issue
Block a user