448 lines
12 KiB
Python
448 lines
12 KiB
Python
|
#!/usr/bin/env python
|
||
|
|
||
|
import re
|
||
|
import gzip
|
||
|
import psycopg2
|
||
|
import struct
|
||
|
import sys
|
||
|
import time
|
||
|
import getopt
|
||
|
|
||
|
#####
|
||
|
#
|
||
|
#
|
||
|
# Generic file function
|
||
|
#
|
||
|
#
|
||
|
#####
|
||
|
|
||
|
def universalOpen(file):
|
||
|
if isinstance(file,str):
|
||
|
if file[-3:] == '.gz':
|
||
|
rep = gzip.open(file)
|
||
|
else:
|
||
|
rep = open(file)
|
||
|
else:
|
||
|
rep = file
|
||
|
return rep
|
||
|
|
||
|
def universalTell(file):
|
||
|
if isinstance(file, gzip.GzipFile):
|
||
|
file=file.myfileobj
|
||
|
return file.tell()
|
||
|
|
||
|
def fileSize(file):
|
||
|
if isinstance(file, gzip.GzipFile):
|
||
|
file=file.myfileobj
|
||
|
pos = file.tell()
|
||
|
file.seek(0,2)
|
||
|
length = file.tell()
|
||
|
file.seek(pos,0)
|
||
|
return length
|
||
|
|
||
|
def progressBar(pos,max,reset=False,delta=[]):
|
||
|
if reset:
|
||
|
del delta[:]
|
||
|
if not delta:
|
||
|
delta.append(time.time())
|
||
|
delta.append(time.time())
|
||
|
|
||
|
delta[1]=time.time()
|
||
|
elapsed = delta[1]-delta[0]
|
||
|
percent = float(pos)/max * 100
|
||
|
remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent)))
|
||
|
bar = '#' * int(percent/2)
|
||
|
bar+= '|/-\\-'[pos % 5]
|
||
|
bar+= ' ' * (50 - int(percent/2))
|
||
|
sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain))
|
||
|
|
||
|
#####
|
||
|
#
|
||
|
#
|
||
|
# NCBI Dump Taxonomy reader
|
||
|
#
|
||
|
#
|
||
|
#####
|
||
|
|
||
|
def endLessIterator(endedlist):
|
||
|
for x in endedlist:
|
||
|
yield x
|
||
|
while(1):
|
||
|
yield endedlist[-1]
|
||
|
|
||
|
|
||
|
class ColumnFile(object):
|
||
|
|
||
|
def __init__(self,stream,sep=None,strip=True,types=None):
|
||
|
if isinstance(stream,str):
|
||
|
self._stream = open(stream)
|
||
|
elif hasattr(stream,'next'):
|
||
|
self._stream = stream
|
||
|
else:
|
||
|
raise ValueError,'stream must be string or an iterator'
|
||
|
self._delimiter=sep
|
||
|
self._strip=strip
|
||
|
if types:
|
||
|
self._types=[x for x in types]
|
||
|
for i in xrange(len(self._types)):
|
||
|
if self._types[i] is bool:
|
||
|
self._types[i]=ColumnFile.str2bool
|
||
|
else:
|
||
|
self._types=None
|
||
|
|
||
|
def str2bool(x):
|
||
|
return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
|
||
|
|
||
|
str2bool = staticmethod(str2bool)
|
||
|
|
||
|
|
||
|
def __iter__(self):
|
||
|
return self
|
||
|
|
||
|
def next(self):
|
||
|
ligne = self._stream.next()
|
||
|
data = ligne.split(self._delimiter)
|
||
|
if self._strip or self._types:
|
||
|
data = [x.strip() for x in data]
|
||
|
if self._types:
|
||
|
it = endLessIterator(self._types)
|
||
|
data = [x[1](x[0]) for x in ((y,it.next()) for y in data)]
|
||
|
return data
|
||
|
|
||
|
def taxonCmp(t1,t2):
|
||
|
if t1[0] < t2[0]:
|
||
|
return -1
|
||
|
elif t1[0] > t2[0]:
|
||
|
return +1
|
||
|
return 0
|
||
|
|
||
|
def bsearchTaxon(taxonomy,taxid):
|
||
|
taxCount = len(taxonomy)
|
||
|
begin = 0
|
||
|
end = taxCount
|
||
|
oldcheck=taxCount
|
||
|
check = begin + end / 2
|
||
|
while check != oldcheck and taxonomy[check][0]!=taxid :
|
||
|
if taxonomy[check][0] < taxid:
|
||
|
begin=check
|
||
|
else:
|
||
|
end=check
|
||
|
oldcheck=check
|
||
|
check = (begin + end) / 2
|
||
|
|
||
|
|
||
|
if taxonomy[check][0]==taxid:
|
||
|
return check
|
||
|
else:
|
||
|
return None
|
||
|
|
||
|
|
||
|
|
||
|
def readNodeTable(file):
|
||
|
|
||
|
file = universalOpen(file)
|
||
|
|
||
|
nodes = ColumnFile(file,
|
||
|
sep='|',
|
||
|
types=(int,int,str,
|
||
|
str,str,bool,
|
||
|
int,bool,int,
|
||
|
bool,bool,bool,str))
|
||
|
print >>sys.stderr,"Reading taxonomy dump file..."
|
||
|
taxonomy=[[n[0],n[2],n[1]] for n in nodes]
|
||
|
|
||
|
print >>sys.stderr,"List all taxonomy rank..."
|
||
|
ranks =list(set(x[1] for x in taxonomy))
|
||
|
ranks.sort()
|
||
|
ranks = dict(map(None,ranks,xrange(len(ranks))))
|
||
|
|
||
|
print >>sys.stderr,"Sorting taxons..."
|
||
|
taxonomy.sort(taxonCmp)
|
||
|
|
||
|
print >>sys.stderr,"Indexing taxonomy..."
|
||
|
index = {}
|
||
|
for t in taxonomy:
|
||
|
index[t[0]]=bsearchTaxon(taxonomy, t[0])
|
||
|
|
||
|
print >>sys.stderr,"Indexing parent and rank..."
|
||
|
for t in taxonomy:
|
||
|
t[1]=ranks[t[1]]
|
||
|
t[2]=index[t[2]]
|
||
|
|
||
|
|
||
|
return taxonomy,ranks,index
|
||
|
|
||
|
def scientificNameIterator(file):
|
||
|
file = universalOpen(file)
|
||
|
names = ColumnFile(file,
|
||
|
sep='|',
|
||
|
types=(int,str,
|
||
|
str,str))
|
||
|
for taxid,name,unique,classname,white in names:
|
||
|
if classname == 'scientific name':
|
||
|
yield taxid,name
|
||
|
|
||
|
def mergedNodeIterator(file):
|
||
|
file = universalOpen(file)
|
||
|
merged = ColumnFile(file,
|
||
|
sep='|',
|
||
|
types=(int,int,str))
|
||
|
for taxid,current,white in merged:
|
||
|
yield taxid,current
|
||
|
|
||
|
def deletedNodeIterator(file):
|
||
|
file = universalOpen(file)
|
||
|
deleted = ColumnFile(file,
|
||
|
sep='|',
|
||
|
types=(int,str))
|
||
|
for taxid,white in deleted:
|
||
|
yield taxid
|
||
|
|
||
|
def readTaxonomyDump(taxdir):
|
||
|
taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
|
||
|
|
||
|
print >>sys.stderr,"Adding scientific name..."
|
||
|
for taxid,name in scientificNameIterator('%s/names.dmp' % taxdir):
|
||
|
taxonomy[index[taxid]].append(name)
|
||
|
|
||
|
print >>sys.stderr,"Adding taxid alias..."
|
||
|
for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
|
||
|
index[taxid]=index[current]
|
||
|
|
||
|
print >>sys.stderr,"Adding deleted taxid..."
|
||
|
for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
|
||
|
index[taxid]=None
|
||
|
|
||
|
return taxonomy,ranks,index
|
||
|
|
||
|
|
||
|
#####
|
||
|
#
|
||
|
#
|
||
|
# Genbank/EMBL sequence reader
|
||
|
#
|
||
|
#
|
||
|
#####
|
||
|
|
||
|
def entryIterator(file):
|
||
|
file = universalOpen(file)
|
||
|
rep =[]
|
||
|
for ligne in file:
|
||
|
rep.append(ligne)
|
||
|
if ligne == '//\n':
|
||
|
rep = ''.join(rep)
|
||
|
yield rep
|
||
|
rep = []
|
||
|
|
||
|
_cleanSeq = re.compile('[ \n0-9]+')
|
||
|
|
||
|
def cleanSeq(seq):
|
||
|
return _cleanSeq.sub('',seq)
|
||
|
|
||
|
|
||
|
_gbParseID = re.compile('(?<=^LOCUS {7})[^ ]+(?= )',re.MULTILINE)
|
||
|
_gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL)
|
||
|
_gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL)
|
||
|
_gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
|
||
|
|
||
|
def genbankParser(entry):
|
||
|
Id = _gbParseID.findall(entry)[0]
|
||
|
De = ' '.join(_gbParseDE.findall(entry)[0].split())
|
||
|
Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper())
|
||
|
Tx = int(_gbParseTX.findall(entry)[0])
|
||
|
return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
|
||
|
|
||
|
def sequenceIterator(file,parser):
|
||
|
for entry in entryIterator(file):
|
||
|
yield parser(entry)
|
||
|
|
||
|
def taxonomyInfo(entry,connection):
|
||
|
taxid = entry['taxid']
|
||
|
curseur = connection.cursor()
|
||
|
curseur.execute("""
|
||
|
select taxid,species,genus,family,
|
||
|
taxonomy.scientificName(taxid) as sn,
|
||
|
taxonomy.scientificName(species) as species_sn,
|
||
|
taxonomy.scientificName(genus) as genus_sn,
|
||
|
taxonomy.scientificName(family) as family_sn
|
||
|
from
|
||
|
(
|
||
|
select alias as taxid,
|
||
|
taxonomy.getSpecies(alias) as species,
|
||
|
taxonomy.getGenus(alias) as genus,
|
||
|
taxonomy.getFamily(alias) as family
|
||
|
from taxonomy.aliases
|
||
|
where id=%d ) as tax
|
||
|
""" % taxid)
|
||
|
rep = curseur.fetchone()
|
||
|
entry['current_taxid']=rep[0]
|
||
|
entry['species']=rep[1]
|
||
|
entry['genus']=rep[2]
|
||
|
entry['family']=rep[3]
|
||
|
entry['scientific_name']=rep[4]
|
||
|
entry['species_sn']=rep[5]
|
||
|
entry['genus_sn']=rep[6]
|
||
|
entry['family_sn']=rep[7]
|
||
|
return entry
|
||
|
|
||
|
#####
|
||
|
#
|
||
|
#
|
||
|
# Binary writer
|
||
|
#
|
||
|
#
|
||
|
#####
|
||
|
|
||
|
def ecoSeqPacker(sq):
|
||
|
|
||
|
compactseq = gzip.zlib.compress(sq['sequence'],9)
|
||
|
cptseqlength = len(compactseq)
|
||
|
delength = len(sq['definition'])
|
||
|
|
||
|
totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
|
||
|
|
||
|
packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength),
|
||
|
totalSize,
|
||
|
sq['taxid'],
|
||
|
sq['id'],
|
||
|
delength,
|
||
|
len(sq['sequence']),
|
||
|
cptseqlength,
|
||
|
sq['definition'],
|
||
|
compactseq)
|
||
|
|
||
|
assert len(packed) == totalSize+4, "error in sequence packing"
|
||
|
|
||
|
return packed
|
||
|
|
||
|
def ecoTaxPacker(tx):
|
||
|
|
||
|
namelength = len(tx[3])
|
||
|
|
||
|
totalSize = 4 + 4 + 4 + 4 + namelength
|
||
|
|
||
|
packed = struct.pack('> I I I I I %ds' % namelength,
|
||
|
totalSize,
|
||
|
tx[0],
|
||
|
tx[1],
|
||
|
tx[2],
|
||
|
namelength,
|
||
|
tx[3])
|
||
|
|
||
|
return packed
|
||
|
|
||
|
def ecoRankPacker(rank):
|
||
|
|
||
|
namelength = len(rank)
|
||
|
|
||
|
packed = struct.pack('> I %ds' % namelength,
|
||
|
namelength,
|
||
|
rank)
|
||
|
|
||
|
return packed
|
||
|
|
||
|
|
||
|
def ecoSeqWriter(file,input,taxindex,parser=genbankParser):
|
||
|
output = open(file,'wb')
|
||
|
input = universalOpen(input)
|
||
|
inputsize = fileSize(input)
|
||
|
entries = sequenceIterator(input, parser)
|
||
|
seqcount=0
|
||
|
skipped = []
|
||
|
|
||
|
output.write(struct.pack('> I',seqcount))
|
||
|
|
||
|
progressBar(1, inputsize,reset=True)
|
||
|
for entry in entries:
|
||
|
entry['taxid']=taxindex[entry['taxid']]
|
||
|
if entry['taxid'] is not None:
|
||
|
seqcount+=1
|
||
|
output.write(ecoSeqPacker(entry))
|
||
|
else:
|
||
|
skipped.append[entry['id']]
|
||
|
where = universalTell(input)
|
||
|
progressBar(where, inputsize)
|
||
|
print >>sys.stderr," Read sequences : %d " % seqcount,
|
||
|
|
||
|
print >>sys.stderr
|
||
|
output.seek(0,0)
|
||
|
output.write(struct.pack('> I',seqcount))
|
||
|
|
||
|
output.close()
|
||
|
return skipped
|
||
|
|
||
|
|
||
|
def ecoTaxWriter(file,taxonomy):
|
||
|
output = open(file,'wb')
|
||
|
output.write(struct.pack('> I',len(taxonomy)))
|
||
|
|
||
|
for tx in taxonomy:
|
||
|
output.write(ecoTaxPacker(tx))
|
||
|
|
||
|
output.close()
|
||
|
|
||
|
def ecoRankWriter(file,ranks):
|
||
|
output = open(file,'wb')
|
||
|
output.write(struct.pack('> I',len(ranks)))
|
||
|
|
||
|
rankNames = ranks.keys()
|
||
|
rankNames.sort()
|
||
|
|
||
|
for rank in rankNames:
|
||
|
output.write(ecoRankPacker(rank))
|
||
|
|
||
|
output.close()
|
||
|
|
||
|
def ecoDBWriter(prefix,taxonomy,seqFileNames,parser=genbankParser):
|
||
|
|
||
|
ecoRankWriter('%s.rdx' % prefix, taxonomy[1])
|
||
|
ecoTaxWriter('%s.tdx' % prefix, taxonomy[0])
|
||
|
|
||
|
filecount = 0
|
||
|
for filename in seqFileNames:
|
||
|
filecount+=1
|
||
|
sk=ecoSeqWriter('%s_%03d.sdx' % (prefix,filecount),
|
||
|
filename,
|
||
|
taxonomy[2],
|
||
|
parser)
|
||
|
if sk:
|
||
|
print >>sys.stderr,"Skipped entry :"
|
||
|
print >>sys.stderr,sk
|
||
|
|
||
|
def ecoParseOptions(arguments):
|
||
|
opt = {
|
||
|
'prefix' : 'ecodb',
|
||
|
'taxdir' : 'taxdump',
|
||
|
'parser' : genbankParser
|
||
|
}
|
||
|
|
||
|
o,filenames = getopt.getopt(arguments,
|
||
|
'ht:n:g',
|
||
|
['help',
|
||
|
'taxonomy=',
|
||
|
'name=',
|
||
|
'genbank'])
|
||
|
|
||
|
for name,value in o:
|
||
|
if name in ('-h','--help'):
|
||
|
pass
|
||
|
elif name in ('-t','--taxonomy'):
|
||
|
opt['taxdir']=value
|
||
|
elif name in ('-n','--name'):
|
||
|
opt['prefix']=value
|
||
|
elif name in ('-g','--genbank'):
|
||
|
opt['parser']=genbankParser
|
||
|
else:
|
||
|
raise ValueError,'Unknown option %s' % name
|
||
|
|
||
|
return opt,filenames
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
|
||
|
opt,filenames = ecoParseOptions(sys.argv[1:])
|
||
|
|
||
|
taxonomy = readTaxonomyDump(opt['taxdir'])
|
||
|
|
||
|
ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])
|
||
|
|