Initial version of ecoPCR
git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@5 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
447
tools/ecoPCRFormat.py
Executable file
447
tools/ecoPCRFormat.py
Executable file
@ -0,0 +1,447 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import re
|
||||
import gzip
|
||||
import psycopg2
|
||||
import struct
|
||||
import sys
|
||||
import time
|
||||
import getopt
|
||||
|
||||
#####
|
||||
#
|
||||
#
|
||||
# Generic file function
|
||||
#
|
||||
#
|
||||
#####
|
||||
|
||||
def universalOpen(file):
|
||||
if isinstance(file,str):
|
||||
if file[-3:] == '.gz':
|
||||
rep = gzip.open(file)
|
||||
else:
|
||||
rep = open(file)
|
||||
else:
|
||||
rep = file
|
||||
return rep
|
||||
|
||||
def universalTell(file):
|
||||
if isinstance(file, gzip.GzipFile):
|
||||
file=file.myfileobj
|
||||
return file.tell()
|
||||
|
||||
def fileSize(file):
|
||||
if isinstance(file, gzip.GzipFile):
|
||||
file=file.myfileobj
|
||||
pos = file.tell()
|
||||
file.seek(0,2)
|
||||
length = file.tell()
|
||||
file.seek(pos,0)
|
||||
return length
|
||||
|
||||
def progressBar(pos,max,reset=False,delta=[]):
|
||||
if reset:
|
||||
del delta[:]
|
||||
if not delta:
|
||||
delta.append(time.time())
|
||||
delta.append(time.time())
|
||||
|
||||
delta[1]=time.time()
|
||||
elapsed = delta[1]-delta[0]
|
||||
percent = float(pos)/max * 100
|
||||
remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent)))
|
||||
bar = '#' * int(percent/2)
|
||||
bar+= '|/-\\-'[pos % 5]
|
||||
bar+= ' ' * (50 - int(percent/2))
|
||||
sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain))
|
||||
|
||||
#####
|
||||
#
|
||||
#
|
||||
# NCBI Dump Taxonomy reader
|
||||
#
|
||||
#
|
||||
#####
|
||||
|
||||
def endLessIterator(endedlist):
|
||||
for x in endedlist:
|
||||
yield x
|
||||
while(1):
|
||||
yield endedlist[-1]
|
||||
|
||||
|
||||
class ColumnFile(object):
|
||||
|
||||
def __init__(self,stream,sep=None,strip=True,types=None):
|
||||
if isinstance(stream,str):
|
||||
self._stream = open(stream)
|
||||
elif hasattr(stream,'next'):
|
||||
self._stream = stream
|
||||
else:
|
||||
raise ValueError,'stream must be string or an iterator'
|
||||
self._delimiter=sep
|
||||
self._strip=strip
|
||||
if types:
|
||||
self._types=[x for x in types]
|
||||
for i in xrange(len(self._types)):
|
||||
if self._types[i] is bool:
|
||||
self._types[i]=ColumnFile.str2bool
|
||||
else:
|
||||
self._types=None
|
||||
|
||||
def str2bool(x):
|
||||
return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
|
||||
|
||||
str2bool = staticmethod(str2bool)
|
||||
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def next(self):
|
||||
ligne = self._stream.next()
|
||||
data = ligne.split(self._delimiter)
|
||||
if self._strip or self._types:
|
||||
data = [x.strip() for x in data]
|
||||
if self._types:
|
||||
it = endLessIterator(self._types)
|
||||
data = [x[1](x[0]) for x in ((y,it.next()) for y in data)]
|
||||
return data
|
||||
|
||||
def taxonCmp(t1,t2):
|
||||
if t1[0] < t2[0]:
|
||||
return -1
|
||||
elif t1[0] > t2[0]:
|
||||
return +1
|
||||
return 0
|
||||
|
||||
def bsearchTaxon(taxonomy,taxid):
|
||||
taxCount = len(taxonomy)
|
||||
begin = 0
|
||||
end = taxCount
|
||||
oldcheck=taxCount
|
||||
check = begin + end / 2
|
||||
while check != oldcheck and taxonomy[check][0]!=taxid :
|
||||
if taxonomy[check][0] < taxid:
|
||||
begin=check
|
||||
else:
|
||||
end=check
|
||||
oldcheck=check
|
||||
check = (begin + end) / 2
|
||||
|
||||
|
||||
if taxonomy[check][0]==taxid:
|
||||
return check
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
|
||||
def readNodeTable(file):
|
||||
|
||||
file = universalOpen(file)
|
||||
|
||||
nodes = ColumnFile(file,
|
||||
sep='|',
|
||||
types=(int,int,str,
|
||||
str,str,bool,
|
||||
int,bool,int,
|
||||
bool,bool,bool,str))
|
||||
print >>sys.stderr,"Reading taxonomy dump file..."
|
||||
taxonomy=[[n[0],n[2],n[1]] for n in nodes]
|
||||
|
||||
print >>sys.stderr,"List all taxonomy rank..."
|
||||
ranks =list(set(x[1] for x in taxonomy))
|
||||
ranks.sort()
|
||||
ranks = dict(map(None,ranks,xrange(len(ranks))))
|
||||
|
||||
print >>sys.stderr,"Sorting taxons..."
|
||||
taxonomy.sort(taxonCmp)
|
||||
|
||||
print >>sys.stderr,"Indexing taxonomy..."
|
||||
index = {}
|
||||
for t in taxonomy:
|
||||
index[t[0]]=bsearchTaxon(taxonomy, t[0])
|
||||
|
||||
print >>sys.stderr,"Indexing parent and rank..."
|
||||
for t in taxonomy:
|
||||
t[1]=ranks[t[1]]
|
||||
t[2]=index[t[2]]
|
||||
|
||||
|
||||
return taxonomy,ranks,index
|
||||
|
||||
def scientificNameIterator(file):
|
||||
file = universalOpen(file)
|
||||
names = ColumnFile(file,
|
||||
sep='|',
|
||||
types=(int,str,
|
||||
str,str))
|
||||
for taxid,name,unique,classname,white in names:
|
||||
if classname == 'scientific name':
|
||||
yield taxid,name
|
||||
|
||||
def mergedNodeIterator(file):
|
||||
file = universalOpen(file)
|
||||
merged = ColumnFile(file,
|
||||
sep='|',
|
||||
types=(int,int,str))
|
||||
for taxid,current,white in merged:
|
||||
yield taxid,current
|
||||
|
||||
def deletedNodeIterator(file):
|
||||
file = universalOpen(file)
|
||||
deleted = ColumnFile(file,
|
||||
sep='|',
|
||||
types=(int,str))
|
||||
for taxid,white in deleted:
|
||||
yield taxid
|
||||
|
||||
def readTaxonomyDump(taxdir):
|
||||
taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
|
||||
|
||||
print >>sys.stderr,"Adding scientific name..."
|
||||
for taxid,name in scientificNameIterator('%s/names.dmp' % taxdir):
|
||||
taxonomy[index[taxid]].append(name)
|
||||
|
||||
print >>sys.stderr,"Adding taxid alias..."
|
||||
for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
|
||||
index[taxid]=index[current]
|
||||
|
||||
print >>sys.stderr,"Adding deleted taxid..."
|
||||
for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
|
||||
index[taxid]=None
|
||||
|
||||
return taxonomy,ranks,index
|
||||
|
||||
|
||||
#####
|
||||
#
|
||||
#
|
||||
# Genbank/EMBL sequence reader
|
||||
#
|
||||
#
|
||||
#####
|
||||
|
||||
def entryIterator(file):
|
||||
file = universalOpen(file)
|
||||
rep =[]
|
||||
for ligne in file:
|
||||
rep.append(ligne)
|
||||
if ligne == '//\n':
|
||||
rep = ''.join(rep)
|
||||
yield rep
|
||||
rep = []
|
||||
|
||||
_cleanSeq = re.compile('[ \n0-9]+')
|
||||
|
||||
def cleanSeq(seq):
|
||||
return _cleanSeq.sub('',seq)
|
||||
|
||||
|
||||
_gbParseID = re.compile('(?<=^LOCUS {7})[^ ]+(?= )',re.MULTILINE)
|
||||
_gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL)
|
||||
_gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL)
|
||||
_gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
|
||||
|
||||
def genbankParser(entry):
|
||||
Id = _gbParseID.findall(entry)[0]
|
||||
De = ' '.join(_gbParseDE.findall(entry)[0].split())
|
||||
Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper())
|
||||
Tx = int(_gbParseTX.findall(entry)[0])
|
||||
return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
|
||||
|
||||
def sequenceIterator(file,parser):
|
||||
for entry in entryIterator(file):
|
||||
yield parser(entry)
|
||||
|
||||
def taxonomyInfo(entry,connection):
|
||||
taxid = entry['taxid']
|
||||
curseur = connection.cursor()
|
||||
curseur.execute("""
|
||||
select taxid,species,genus,family,
|
||||
taxonomy.scientificName(taxid) as sn,
|
||||
taxonomy.scientificName(species) as species_sn,
|
||||
taxonomy.scientificName(genus) as genus_sn,
|
||||
taxonomy.scientificName(family) as family_sn
|
||||
from
|
||||
(
|
||||
select alias as taxid,
|
||||
taxonomy.getSpecies(alias) as species,
|
||||
taxonomy.getGenus(alias) as genus,
|
||||
taxonomy.getFamily(alias) as family
|
||||
from taxonomy.aliases
|
||||
where id=%d ) as tax
|
||||
""" % taxid)
|
||||
rep = curseur.fetchone()
|
||||
entry['current_taxid']=rep[0]
|
||||
entry['species']=rep[1]
|
||||
entry['genus']=rep[2]
|
||||
entry['family']=rep[3]
|
||||
entry['scientific_name']=rep[4]
|
||||
entry['species_sn']=rep[5]
|
||||
entry['genus_sn']=rep[6]
|
||||
entry['family_sn']=rep[7]
|
||||
return entry
|
||||
|
||||
#####
|
||||
#
|
||||
#
|
||||
# Binary writer
|
||||
#
|
||||
#
|
||||
#####
|
||||
|
||||
def ecoSeqPacker(sq):
|
||||
|
||||
compactseq = gzip.zlib.compress(sq['sequence'],9)
|
||||
cptseqlength = len(compactseq)
|
||||
delength = len(sq['definition'])
|
||||
|
||||
totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
|
||||
|
||||
packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength),
|
||||
totalSize,
|
||||
sq['taxid'],
|
||||
sq['id'],
|
||||
delength,
|
||||
len(sq['sequence']),
|
||||
cptseqlength,
|
||||
sq['definition'],
|
||||
compactseq)
|
||||
|
||||
assert len(packed) == totalSize+4, "error in sequence packing"
|
||||
|
||||
return packed
|
||||
|
||||
def ecoTaxPacker(tx):
|
||||
|
||||
namelength = len(tx[3])
|
||||
|
||||
totalSize = 4 + 4 + 4 + 4 + namelength
|
||||
|
||||
packed = struct.pack('> I I I I I %ds' % namelength,
|
||||
totalSize,
|
||||
tx[0],
|
||||
tx[1],
|
||||
tx[2],
|
||||
namelength,
|
||||
tx[3])
|
||||
|
||||
return packed
|
||||
|
||||
def ecoRankPacker(rank):
|
||||
|
||||
namelength = len(rank)
|
||||
|
||||
packed = struct.pack('> I %ds' % namelength,
|
||||
namelength,
|
||||
rank)
|
||||
|
||||
return packed
|
||||
|
||||
|
||||
def ecoSeqWriter(file,input,taxindex,parser=genbankParser):
|
||||
output = open(file,'wb')
|
||||
input = universalOpen(input)
|
||||
inputsize = fileSize(input)
|
||||
entries = sequenceIterator(input, parser)
|
||||
seqcount=0
|
||||
skipped = []
|
||||
|
||||
output.write(struct.pack('> I',seqcount))
|
||||
|
||||
progressBar(1, inputsize,reset=True)
|
||||
for entry in entries:
|
||||
entry['taxid']=taxindex[entry['taxid']]
|
||||
if entry['taxid'] is not None:
|
||||
seqcount+=1
|
||||
output.write(ecoSeqPacker(entry))
|
||||
else:
|
||||
skipped.append[entry['id']]
|
||||
where = universalTell(input)
|
||||
progressBar(where, inputsize)
|
||||
print >>sys.stderr," Read sequences : %d " % seqcount,
|
||||
|
||||
print >>sys.stderr
|
||||
output.seek(0,0)
|
||||
output.write(struct.pack('> I',seqcount))
|
||||
|
||||
output.close()
|
||||
return skipped
|
||||
|
||||
|
||||
def ecoTaxWriter(file,taxonomy):
|
||||
output = open(file,'wb')
|
||||
output.write(struct.pack('> I',len(taxonomy)))
|
||||
|
||||
for tx in taxonomy:
|
||||
output.write(ecoTaxPacker(tx))
|
||||
|
||||
output.close()
|
||||
|
||||
def ecoRankWriter(file,ranks):
|
||||
output = open(file,'wb')
|
||||
output.write(struct.pack('> I',len(ranks)))
|
||||
|
||||
rankNames = ranks.keys()
|
||||
rankNames.sort()
|
||||
|
||||
for rank in rankNames:
|
||||
output.write(ecoRankPacker(rank))
|
||||
|
||||
output.close()
|
||||
|
||||
def ecoDBWriter(prefix,taxonomy,seqFileNames,parser=genbankParser):
|
||||
|
||||
ecoRankWriter('%s.rdx' % prefix, taxonomy[1])
|
||||
ecoTaxWriter('%s.tdx' % prefix, taxonomy[0])
|
||||
|
||||
filecount = 0
|
||||
for filename in seqFileNames:
|
||||
filecount+=1
|
||||
sk=ecoSeqWriter('%s_%03d.sdx' % (prefix,filecount),
|
||||
filename,
|
||||
taxonomy[2],
|
||||
parser)
|
||||
if sk:
|
||||
print >>sys.stderr,"Skipped entry :"
|
||||
print >>sys.stderr,sk
|
||||
|
||||
def ecoParseOptions(arguments):
|
||||
opt = {
|
||||
'prefix' : 'ecodb',
|
||||
'taxdir' : 'taxdump',
|
||||
'parser' : genbankParser
|
||||
}
|
||||
|
||||
o,filenames = getopt.getopt(arguments,
|
||||
'ht:n:g',
|
||||
['help',
|
||||
'taxonomy=',
|
||||
'name=',
|
||||
'genbank'])
|
||||
|
||||
for name,value in o:
|
||||
if name in ('-h','--help'):
|
||||
pass
|
||||
elif name in ('-t','--taxonomy'):
|
||||
opt['taxdir']=value
|
||||
elif name in ('-n','--name'):
|
||||
opt['prefix']=value
|
||||
elif name in ('-g','--genbank'):
|
||||
opt['parser']=genbankParser
|
||||
else:
|
||||
raise ValueError,'Unknown option %s' % name
|
||||
|
||||
return opt,filenames
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
opt,filenames = ecoParseOptions(sys.argv[1:])
|
||||
|
||||
taxonomy = readTaxonomyDump(opt['taxdir'])
|
||||
|
||||
ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])
|
||||
|
Reference in New Issue
Block a user