#!/usr/bin/env python
"""Build ecoPCR binary database files (.rdx/.tdx/.sdx) from an NCBI
taxonomy dump directory and one or more Genbank flat files.

Ported to Python 3 from the original Python 2 script; the on-disk
binary format (big-endian struct records) is unchanged.
"""

import getopt
import gzip
import re
import struct
import sys
import time
import zlib

# psycopg2 is only needed by taxonomyInfo(), which expects a live
# database connection created by the caller; keep the import optional
# so the rest of the tool runs without the driver installed.
try:
    import psycopg2  # noqa: F401  (kept for backward compatibility)
except ImportError:
    psycopg2 = None


#####
#
#  Generic file functions
#
#####

def universalOpen(file):
    """Return a text-mode stream for *file*.

    *file* may be a path (``.gz`` paths are transparently
    decompressed) or an already-open stream, which is returned as-is.
    """
    if isinstance(file, str):
        if file.endswith('.gz'):
            return gzip.open(file, 'rt')
        return open(file)
    return file


def _rawStream(stream):
    """Best-effort unwrap of *stream* down to the OS-level byte file.

    Progress reporting must measure position/size in the *compressed*
    file, as the original did through ``GzipFile.myfileobj``.
    """
    raw = getattr(stream, 'buffer', stream)   # TextIOWrapper -> binary layer
    raw = getattr(raw, 'myfileobj', raw)      # GzipFile -> compressed file
    return raw


def universalTell(file):
    """Return the byte offset in the underlying (compressed) file."""
    return _rawStream(file).tell()


def fileSize(file):
    """Return the total byte size of the underlying file, restoring the
    current position afterwards."""
    raw = _rawStream(file)
    pos = raw.tell()
    raw.seek(0, 2)                # jump to end to measure
    length = raw.tell()
    raw.seek(pos, 0)
    return length


def progressBar(pos, max, reset=False, delta=[]):
    """Draw a one-line text progress bar on stderr.

    ``delta`` is a *deliberate* mutable default: it persists across
    calls and holds [start_time, last_time]; pass ``reset=True`` to
    clear it at the start of a new task.  (``max`` shadows the builtin
    but the parameter name is kept for backward compatibility.)
    """
    if reset:
        del delta[:]
    if not delta:
        delta.append(time.time())
        delta.append(time.time())
    delta[1] = time.time()
    elapsed = delta[1] - delta[0]
    percent = float(pos) / max * 100
    if percent > 0:
        remain = time.strftime('%H:%M:%S',
                               time.gmtime(elapsed / percent * (100 - percent)))
    else:
        remain = '--:--:--'       # avoid ZeroDivisionError at pos == 0
    bar = '#' * int(percent / 2)
    bar += '|/-\\-'[pos % 5]      # spinner on the leading edge
    bar += ' ' * (50 - int(percent / 2))
    sys.stderr.write('\r%5.1f %% |%s] remain : %s' % (percent, bar, remain))


#####
#
#  NCBI dump taxonomy reader
#
#####

def endLessIterator(endedlist):
    """Yield every item of *endedlist*, then repeat its last item forever."""
    for x in endedlist:
        yield x
    while True:
        yield endedlist[-1]


class ColumnFile(object):
    """Iterate over a delimited text stream, one list of fields per line.

    *types* is an optional sequence of converters applied column-wise;
    when a line has more columns than converters the last converter is
    reused (dump files carry a trailing empty field after the final '|').
    """

    def __init__(self, stream, sep=None, strip=True, types=None):
        if isinstance(stream, str):
            self._stream = open(stream)
        elif hasattr(stream, '__next__'):
            self._stream = stream
        else:
            raise ValueError('stream must be string or an iterator')
        self._delimiter = sep
        self._strip = strip
        if types:
            # bool('False') is True, so boolean columns must go through
            # str2bool instead of the bool constructor.
            self._types = [ColumnFile.str2bool if t is bool else t
                           for t in types]
        else:
            self._types = None

    @staticmethod
    def str2bool(x):
        """Parse dump-file booleans: T/V -> True, F -> False, digits by value.

        Replaces the original eval()-based version with an explicit,
        safe lookup accepting the same inputs.
        """
        c = x.strip()[0].upper()
        if c.isdigit():
            return bool(int(c))
        return {'T': True, 'V': True, 'F': False}[c]

    def __iter__(self):
        return self

    def __next__(self):
        ligne = next(self._stream)
        data = ligne.split(self._delimiter)
        if self._strip or self._types:
            data = [x.strip() for x in data]
        if self._types:
            types = endLessIterator(self._types)
            data = [next(types)(value) for value in data]
        return data

    next = __next__  # Python-2-style alias kept for old callers


def taxonCmp(t1, t2):
    """Three-way comparison of taxonomy rows by taxid.

    Kept for backward compatibility; sorting now uses a key function.
    """
    if t1[0] < t2[0]:
        return -1
    elif t1[0] > t2[0]:
        return +1
    return 0


def bsearchTaxon(taxonomy, taxid):
    """Binary-search *taxonomy* (sorted by taxid) and return the row
    index of *taxid*, or None when absent."""
    begin = 0
    end = len(taxonomy)
    oldcheck = end
    # BUG FIX: the original computed "begin + end / 2" without
    # parentheses; it only worked because begin started at 0.
    check = (begin + end) // 2
    while check != oldcheck and taxonomy[check][0] != taxid:
        if taxonomy[check][0] < taxid:
            begin = check
        else:
            end = check
        oldcheck = check
        check = (begin + end) // 2
    if taxonomy[check][0] == taxid:
        return check
    return None


def readNodeTable(file):
    """Parse nodes.dmp and return (taxonomy, ranks, index).

    taxonomy rows are [taxid, rank_id, parent_row_index]; ranks maps
    rank name -> id; index maps taxid -> row index.
    """
    file = universalOpen(file)
    nodes = ColumnFile(file,
                       sep='|',
                       types=(int, int, str,
                              str, str, bool,
                              int, bool, int,
                              bool, bool, bool, str))
    print("Reading taxonomy dump file...", file=sys.stderr)
    # keep [taxid, rank, parent]; rank/parent are numerified below
    taxonomy = [[n[0], n[2], n[1]] for n in nodes]

    print("List all taxonomy rank...", file=sys.stderr)
    ranks = sorted(set(x[1] for x in taxonomy))
    ranks = dict((rank, i) for i, rank in enumerate(ranks))

    print("Sorting taxons...", file=sys.stderr)
    taxonomy.sort(key=lambda t: t[0])

    print("Indexing taxonomy...", file=sys.stderr)
    # rows are now sorted, so enumerate replaces the per-row binary
    # search of the original (same result, O(n) instead of O(n log n))
    index = {}
    for i, t in enumerate(taxonomy):
        index[t[0]] = i

    print("Indexing parent and rank...", file=sys.stderr)
    for t in taxonomy:
        t[1] = ranks[t[1]]
        t[2] = index[t[2]]

    return taxonomy, ranks, index


def scientificNameIterator(file):
    """Yield (taxid, name) for the 'scientific name' rows of names.dmp."""
    file = universalOpen(file)
    names = ColumnFile(file,
                       sep='|',
                       types=(int, str, str, str))
    for taxid, name, unique, classname, white in names:
        if classname == 'scientific name':
            yield taxid, name


def mergedNodeIterator(file):
    """Yield (old_taxid, current_taxid) pairs from merged.dmp."""
    file = universalOpen(file)
    merged = ColumnFile(file,
                        sep='|',
                        types=(int, int, str))
    for taxid, current, white in merged:
        yield taxid, current


def deletedNodeIterator(file):
    """Yield deleted taxids from delnodes.dmp."""
    file = universalOpen(file)
    deleted = ColumnFile(file,
                         sep='|',
                         types=(int, str))
    for taxid, white in deleted:
        yield taxid


def readTaxonomyDump(taxdir):
    """Load a full NCBI taxdump directory; return (taxonomy, ranks, index).

    Scientific names are appended as a 4th row element; merged taxids
    alias their current row in *index* and deleted taxids map to None.
    """
    taxonomy, ranks, index = readNodeTable('%s/nodes.dmp' % taxdir)

    print("Adding scientific name...", file=sys.stderr)
    for taxid, name in scientificNameIterator('%s/names.dmp' % taxdir):
        taxonomy[index[taxid]].append(name)

    print("Adding taxid alias...", file=sys.stderr)
    for taxid, current in mergedNodeIterator('%s/merged.dmp' % taxdir):
        index[taxid] = index[current]

    print("Adding deleted taxid...", file=sys.stderr)
    for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
        index[taxid] = None

    return taxonomy, ranks, index


#####
#
#  Genbank/EMBL sequence reader
#
#####

def entryIterator(file):
    """Yield flat-file entries, each terminated by a '//' line."""
    file = universalOpen(file)
    rep = []
    for ligne in file:
        rep.append(ligne)
        if ligne == '//\n':
            yield ''.join(rep)
            rep = []


_cleanSeq = re.compile('[ \n0-9]+')


def cleanSeq(seq):
    """Strip spaces, newlines and position numbers from a sequence block."""
    return _cleanSeq.sub('', seq)


# Genbank field extractors, anchored on the flat-file column layout.
# Patterns are unchanged from the original (now raw strings).
_gbParseID = re.compile(r'(?<=^LOCUS {7})[^ ]+(?= )', re.MULTILINE)
_gbParseDE = re.compile(r'(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',
                        re.MULTILINE | re.DOTALL)
_gbParseSQ = re.compile(r'(?<=^ORIGIN).+?(?=^//$)',
                        re.MULTILINE | re.DOTALL)
_gbParseTX = re.compile(r'(?<= /db_xref="taxon:)[0-9]+(?=")')


def genbankParser(entry):
    """Parse one Genbank entry into {id, taxid, definition, sequence}.

    Raises IndexError when a mandatory field is missing (unchanged
    behaviour from the original).
    """
    Id = _gbParseID.findall(entry)[0]
    De = ' '.join(_gbParseDE.findall(entry)[0].split())
    Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper())
    Tx = int(_gbParseTX.findall(entry)[0])
    return {'id': Id, 'taxid': Tx, 'definition': De, 'sequence': Sq}


def sequenceIterator(file, parser):
    """Yield a parsed record for every entry of *file*."""
    for entry in entryIterator(file):
        yield parser(entry)


def taxonomyInfo(entry, connection):
    """Augment *entry* in place with taxonomy columns fetched from the
    database reachable through *connection* (DB-API cursor protocol)."""
    taxid = entry['taxid']
    curseur = connection.cursor()
    # SECURITY FIX: parameterized query (the original interpolated
    # taxid into the SQL string with %d).
    curseur.execute("""
        select taxid,species,genus,family,
               taxonomy.scientificName(taxid)   as sn,
               taxonomy.scientificName(species) as species_sn,
               taxonomy.scientificName(genus)   as genus_sn,
               taxonomy.scientificName(family)  as family_sn
        from
            (
              select alias as taxid,
                     taxonomy.getSpecies(alias) as species,
                     taxonomy.getGenus(alias)   as genus,
                     taxonomy.getFamily(alias)  as family
              from taxonomy.aliases
              where id=%s ) as tax
        """, (taxid,))
    rep = curseur.fetchone()
    (entry['current_taxid'], entry['species'], entry['genus'],
     entry['family'], entry['scientific_name'], entry['species_sn'],
     entry['genus_sn'], entry['family_sn']) = rep
    return entry


#####
#
#  Binary writer
#
#####

def ecoSeqPacker(sq):
    """Pack one sequence record.

    Big-endian layout: total_size, taxid, id[20], def_len, seq_len,
    zseq_len, definition bytes, zlib-compressed sequence bytes.
    """
    definition = sq['definition'].encode('utf-8')
    sequence = sq['sequence'].encode('ascii')
    compactseq = zlib.compress(sequence, 9)
    cptseqlength = len(compactseq)
    delength = len(definition)

    totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength

    packed = struct.pack('> I I 20s I I I %ds %ds' % (delength, cptseqlength),
                         totalSize,
                         sq['taxid'],
                         sq['id'].encode('ascii'),
                         delength,
                         len(sequence),
                         cptseqlength,
                         definition,
                         compactseq)

    # totalSize deliberately excludes its own 4-byte field
    assert len(packed) == totalSize + 4, "error in sequence packing"
    return packed


def ecoTaxPacker(tx):
    """Pack one taxonomy row [taxid, rank_id, parent_index, name]."""
    name = tx[3].encode('utf-8')
    namelength = len(name)
    totalSize = 4 + 4 + 4 + 4 + namelength
    return struct.pack('> I I I I I %ds' % namelength,
                       totalSize, tx[0], tx[1], tx[2], namelength, name)


def ecoRankPacker(rank):
    """Pack one rank name as (length, bytes)."""
    name = rank.encode('utf-8')
    return struct.pack('> I %ds' % len(name), len(name), name)


def ecoSeqWriter(file, input, taxindex, parser=genbankParser):
    """Write every sequence of *input* to *file* (.sdx).

    Returns the list of entry ids skipped because their taxid is
    unknown or deleted.
    """
    output = open(file, 'wb')
    input = universalOpen(input)
    inputsize = fileSize(input)
    entries = sequenceIterator(input, parser)

    seqcount = 0
    skipped = []
    output.write(struct.pack('> I', seqcount))   # placeholder, patched below

    progressBar(1, inputsize, reset=True)
    for entry in entries:
        # .get() also skips taxids absent from the dump entirely
        # (the original raised KeyError for them)
        entry['taxid'] = taxindex.get(entry['taxid'])
        if entry['taxid'] is not None:
            seqcount += 1
            output.write(ecoSeqPacker(entry))
        else:
            # BUG FIX: the original wrote skipped.append[...] with
            # square brackets -- a TypeError at runtime
            skipped.append(entry['id'])
        progressBar(universalTell(input), inputsize)

    print(" Read sequences : %d " % seqcount, file=sys.stderr)

    # patch the real record count at the head of the file
    output.seek(0, 0)
    output.write(struct.pack('> I', seqcount))
    output.close()
    return skipped


def ecoTaxWriter(file, taxonomy):
    """Write the taxonomy table to *file* (.tdx)."""
    output = open(file, 'wb')
    output.write(struct.pack('> I', len(taxonomy)))
    for tx in taxonomy:
        output.write(ecoTaxPacker(tx))
    output.close()


def ecoRankWriter(file, ranks):
    """Write the sorted rank names to *file* (.rdx)."""
    output = open(file, 'wb')
    output.write(struct.pack('> I', len(ranks)))
    for rank in sorted(ranks):
        output.write(ecoRankPacker(rank))
    output.close()


def ecoDBWriter(prefix, taxonomy, seqFileNames, parser=genbankParser):
    """Write the complete database: <prefix>.rdx, <prefix>.tdx and one
    <prefix>_NNN.sdx per input sequence file."""
    ecoRankWriter('%s.rdx' % prefix, taxonomy[1])
    ecoTaxWriter('%s.tdx' % prefix, taxonomy[0])

    for filecount, filename in enumerate(seqFileNames, start=1):
        sk = ecoSeqWriter('%s_%03d.sdx' % (prefix, filecount),
                          filename,
                          taxonomy[2],
                          parser)
        if sk:
            print("Skipped entry :", file=sys.stderr)
            print(sk, file=sys.stderr)


def ecoParseOptions(arguments):
    """Parse command-line options; return (options_dict, filenames).

    Raises ValueError on an option getopt accepted but we don't know.
    """
    opt = {
        'prefix': 'ecodb',
        'taxdir': 'taxdump',
        'parser': genbankParser,
    }

    o, filenames = getopt.getopt(arguments,
                                 'ht:n:g',
                                 ['help', 'taxonomy=', 'name=', 'genbank'])

    for name, value in o:
        if name in ('-h', '--help'):
            pass                      # TODO: print a usage message
        elif name in ('-t', '--taxonomy'):
            opt['taxdir'] = value
        elif name in ('-n', '--name'):
            opt['prefix'] = value
        elif name in ('-g', '--genbank'):
            opt['parser'] = genbankParser
        else:
            raise ValueError('Unknown option %s' % name)

    return opt, filenames


if __name__ == '__main__':
    opt, filenames = ecoParseOptions(sys.argv[1:])
    taxonomy = readTaxonomyDump(opt['taxdir'])
    ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])