#!/usr/bin/env python2.7

import re
import gzip
import struct
import sys
import time
import getopt

_dbenable = False


#####
#
#
# Generic file functions
#
#
#####


def universalOpen(file):
    # Open a file name transparently, whether plain or gzip-compressed
    # ('.gz'); anything that is not a string is assumed to be an already
    # open file-like object and is returned unchanged.
    if isinstance(file, str):
        if file[-3:] == '.gz':
            rep = gzip.open(file)
        else:
            rep = open(file)
    else:
        rep = file
    return rep
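

# Illustrative usage (not part of the original script; 'entries.gb.gz' is a
# hypothetical file name):
#
#   fh = universalOpen('entries.gb.gz')   # transparently decompressed
#   fh = universalOpen(sys.stdin)         # file objects pass through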


def universalTell(file):
    # Return the position in the underlying raw file, even for gzip streams.
    if isinstance(file, gzip.GzipFile):
        file = file.myfileobj
    return file.tell()


def fileSize(file):
    # Return the size in bytes of the underlying raw file, preserving the
    # current read position.
    if isinstance(file, gzip.GzipFile):
        file = file.myfileobj
    pos = file.tell()
    file.seek(0, 2)
    length = file.tell()
    file.seek(pos, 0)
    return length


def progressBar(pos, max, reset=False, delta=[]):
    # 'delta' is a mutable default argument used on purpose: it keeps the
    # start time between calls. Pass reset=True to start a new bar.
    if reset:
        del delta[:]
    if not delta:
        delta.append(time.time())
        delta.append(time.time())

    delta[1] = time.time()
    elapsed = delta[1] - delta[0]
    percent = float(pos) / max * 100
    remain = time.strftime('%H:%M:%S', time.gmtime(elapsed / percent * (100 - percent)))
    bar = '#' * int(percent / 2)
    bar += '|/-\\-'[pos % 5]
    bar += ' ' * (50 - int(percent / 2))
    sys.stderr.write('\r%5.1f %% [%s] remain : %s' % (percent, bar, remain))
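

# Calling pattern used by ecoSeqWriter() below (sketch, not executed here):
#
#   progressBar(1, inputsize, reset=True)             # start a fresh bar
#   for entry in entries:
#       progressBar(universalTell(input), inputsize)  # update with position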


#####
#
#
# NCBI Dump Taxonomy reader
#
#
#####


def endLessIterator(endedlist):
    # Yield every item of the list, then repeat its last item forever.
    for x in endedlist:
        yield x
    while 1:
        yield endedlist[-1]


class ColumnFile(object):

    def __init__(self, stream, sep=None, strip=True, types=None):
        if isinstance(stream, str):
            self._stream = open(stream)
        elif hasattr(stream, 'next'):
            self._stream = stream
        else:
            raise ValueError, 'stream must be a string or an iterator'
        self._delimiter = sep
        self._strip = strip
        if types:
            self._types = [x for x in types]
            for i in xrange(len(self._types)):
                if self._types[i] is bool:
                    self._types[i] = ColumnFile.str2bool
        else:
            self._types = None

    def str2bool(x):
        # Map the first letter to a boolean: 'T' (true) and 'V' (French
        # "vrai") give True, 'F' gives False.
        return bool(eval(x.strip()[0].upper(), {'T': True, 'V': True, 'F': False}))

    str2bool = staticmethod(str2bool)

    def __iter__(self):
        return self

    def next(self):
        ligne = self._stream.next()
        data = ligne.split(self._delimiter)
        if self._strip or self._types:
            data = [x.strip() for x in data]
        if self._types:
            # The declared types are repeated through endLessIterator, so
            # the last one applies to any extra trailing column.
            it = endLessIterator(self._types)
            data = [x[1](x[0]) for x in ((y, it.next()) for y in data)]
        return data
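

# Illustrative example (not in the original script): parsing one line in
# the '|'-separated NCBI dump format.
#
#   cf = ColumnFile(iter(['1|1|no rank|\n']), sep='|', types=(int, int, str))
#   cf.next()   # -> [1, 1, 'no rank', '']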


def taxonCmp(t1, t2):
    # Order taxonomy rows by taxid (their first field).
    if t1[0] < t2[0]:
        return -1
    elif t1[0] > t2[0]:
        return +1
    return 0


def bsearchTaxon(taxonomy, taxid):
    # Binary search of a taxid in a taxonomy list sorted with taxonCmp();
    # returns the row index, or None if the taxid is absent.
    taxCount = len(taxonomy)
    begin = 0
    end = taxCount
    oldcheck = taxCount
    check = (begin + end) / 2
    while check != oldcheck and taxonomy[check][0] != taxid:
        if taxonomy[check][0] < taxid:
            begin = check
        else:
            end = check
        oldcheck = check
        check = (begin + end) / 2

    if taxonomy[check][0] == taxid:
        return check
    else:
        return None
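

# Illustrative example (not in the original script): rows must already be
# sorted by taxid, as readNodeTable() below guarantees.
#
#   bsearchTaxon([[1, 0, 0], [5, 0, 0], [9, 0, 0]], 5)   # -> 1
#   bsearchTaxon([[1, 0, 0], [5, 0, 0], [9, 0, 0]], 4)   # -> None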


def readNodeTable(file):

    file = universalOpen(file)

    nodes = ColumnFile(file,
                       sep='|',
                       types=(int, int, str,
                              str, str, bool,
                              int, bool, int,
                              bool, bool, bool, str))
    print >>sys.stderr, "Reading taxonomy dump file..."
    # A taxonomy row is stored as [taxid, rank, parent taxid].
    taxonomy = [[n[0], n[2], n[1]] for n in nodes]

    print >>sys.stderr, "Listing all taxonomy ranks..."
    ranks = list(set(x[1] for x in taxonomy))
    ranks.sort()
    # Map each rank name to its index in the sorted list of ranks.
    ranks = dict((rank, i) for i, rank in enumerate(ranks))

    print >>sys.stderr, "Sorting taxa..."
    taxonomy.sort(taxonCmp)

    print >>sys.stderr, "Indexing taxonomy..."
    index = {}
    for t in taxonomy:
        index[t[0]] = bsearchTaxon(taxonomy, t[0])

    print >>sys.stderr, "Indexing parent and rank..."
    for t in taxonomy:
        t[1] = ranks[t[1]]
        t[2] = index[t[2]]

    return taxonomy, ranks, index
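

# For reference, the leading '|'-separated columns of NCBI 'nodes.dmp' are
# taxid, parent taxid and rank; readNodeTable() keeps only these three.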


def nameIterator(file):
    file = universalOpen(file)
    names = ColumnFile(file,
                       sep='|',
                       types=(int, str,
                              str, str))
    # 'white' is the empty trailing field that follows the last '|'.
    for taxid, name, unique, classname, white in names:
        yield taxid, name, classname


def mergedNodeIterator(file):
    file = universalOpen(file)
    merged = ColumnFile(file,
                        sep='|',
                        types=(int, int, str))
    for taxid, current, white in merged:
        yield taxid, current


def deletedNodeIterator(file):
    file = universalOpen(file)
    deleted = ColumnFile(file,
                         sep='|',
                         types=(int, str))
    for taxid, white in deleted:
        yield taxid


def readTaxonomyDump(taxdir):
    taxonomy, ranks, index = readNodeTable('%s/nodes.dmp' % taxdir)

    print >>sys.stderr, "Adding scientific names..."

    alternativeName = []
    for taxid, name, classname in nameIterator('%s/names.dmp' % taxdir):
        alternativeName.append((name, classname, index[taxid]))
        if classname == 'scientific name':
            taxonomy[index[taxid]].append(name)

    print >>sys.stderr, "Adding taxid aliases..."
    for taxid, current in mergedNodeIterator('%s/merged.dmp' % taxdir):
        index[taxid] = index[current]

    print >>sys.stderr, "Adding deleted taxids..."
    for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
        index[taxid] = None

    return taxonomy, ranks, alternativeName, index
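

# Sketch of the returned structures (illustrative; 'taxdump' stands for any
# directory holding the NCBI dump files):
#
#   taxonomy, ranks, names, index = readTaxonomyDump('taxdump')
#
# Each taxonomy row ends up as [taxid, rank index, parent row index,
# scientific name]; 'index' maps any known taxid (including merged
# aliases) to its row number, and deleted taxids map to None.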


#####
#
#
# Genbank/EMBL sequence reader
#
#
#####


def entryIterator(file):
    # Yield flat-file (GenBank/EMBL) entries one at a time; an entry ends
    # with a '//' line.
    file = universalOpen(file)
    rep = []
    for ligne in file:
        rep.append(ligne)
        if ligne == '//\n':
            rep = ''.join(rep)
            yield rep
            rep = []


def fastaEntryIterator(file):
    # Yield fasta entries one at a time; an entry starts with a '>' line.
    file = universalOpen(file)
    rep = []
    for ligne in file:
        if ligne[0] == '>' and rep:
            rep = ''.join(rep)
            yield rep
            rep = []
        rep.append(ligne)
    if rep:
        rep = ''.join(rep)
        yield rep


_cleanSeq = re.compile('[ \n0-9]+')


def cleanSeq(seq):
    # Strip spaces, newlines and the position numbers that decorate
    # flat-file sequence blocks.
    return _cleanSeq.sub('', seq)


_gbParseID = re.compile('(?<=^LOCUS {7})[^ ]+(?= )', re.MULTILINE)
_gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])', re.MULTILINE | re.DOTALL)
_gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)', re.MULTILINE | re.DOTALL)
_gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')


def genbankEntryParser(entry):
    Id = _gbParseID.findall(entry)[0]
    De = ' '.join(_gbParseDE.findall(entry)[0].split())
    Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper())
    try:
        Tx = int(_gbParseTX.findall(entry)[0])
    except IndexError:
        # Entries without a /db_xref="taxon:..." qualifier get no taxid.
        Tx = None
    return {'id': Id, 'taxid': Tx, 'definition': De, 'sequence': Sq}


######################


# Remove the newline and 'DE' prefix of continuation lines; the leftover
# spaces are normalized by the caller.
_cleanDef = re.compile('\nDE')


def cleanDef(definition):
    return _cleanDef.sub('', definition)


_emblParseID = re.compile('(?<=^ID {3})[^ ]+(?=;)', re.MULTILINE)
_emblParseDE = re.compile('(?<=^DE {3}).+?\. *$(?=[^ ])', re.MULTILINE | re.DOTALL)
_emblParseSQ = re.compile('(?<=^ ).+?(?=^//$)', re.MULTILINE | re.DOTALL)
_emblParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')


def emblEntryParser(entry):
    Id = _emblParseID.findall(entry)[0]
    De = ' '.join(cleanDef(_emblParseDE.findall(entry)[0]).split())
    Sq = cleanSeq(_emblParseSQ.findall(entry)[0].upper())
    try:
        Tx = int(_emblParseTX.findall(entry)[0])
    except IndexError:
        Tx = None
    return {'id': Id, 'taxid': Tx, 'definition': De, 'sequence': Sq}


######################


_fastaSplit = re.compile(';\W*')


def parseFasta(seq):
    # Split a fasta entry into its header line and sequence lines.
    seq = seq.split('\n')
    title = seq[0].strip()[1:].split(None, 1)
    id = title[0]
    if len(title) == 2:
        # Header fields after the id are ';'-separated.
        field = _fastaSplit.split(title[1])
    else:
        field = []
    # 'key=value' fields go into 'info'; everything else is kept as the
    # definition.
    info = dict(x.split('=', 1) for x in field if '=' in x)
    definition = ' '.join([x for x in field if '=' not in x])
    seq = (''.join([x.strip() for x in seq[1:]])).upper()
    return id, seq, definition, info
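

# Illustrative example (not in the original script):
#
#   parseFasta('>seq1 taxid=9606; Homo sapiens\nacgt\nacgt\n')
#   # -> ('seq1', 'ACGTACGT', 'Homo sapiens', {'taxid': '9606'})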


def fastaEntryParser(entry):
    id, seq, definition, info = parseFasta(entry)
    Tx = info.get('taxid', None)
    if Tx is not None:
        Tx = int(Tx)
    return {'id': id, 'taxid': Tx, 'definition': definition, 'sequence': seq}


def sequenceIteratorFactory(entryParser, entryIterator):
    # Combine an entry iterator and an entry parser into a single
    # generator of parsed records.
    def sequenceIterator(file):
        for entry in entryIterator(file):
            yield entryParser(entry)
    return sequenceIterator
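

# Illustrative usage (mirrors what ecoParseOptions() sets up below;
# 'seqs.gb' is a hypothetical file name):
#
#   reader = sequenceIteratorFactory(genbankEntryParser, entryIterator)
#   for rec in reader('seqs.gb'):
#       print rec['id'], rec['taxid']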


def taxonomyInfo(entry, connection):
    # Enrich an entry through an SQL 'taxonomy' schema; requires an open
    # DB-API connection (unused unless such a connection is supplied).
    taxid = entry['taxid']
    curseur = connection.cursor()
    curseur.execute("""
        select taxid,species,genus,family,
               taxonomy.scientificName(taxid) as sn,
               taxonomy.scientificName(species) as species_sn,
               taxonomy.scientificName(genus) as genus_sn,
               taxonomy.scientificName(family) as family_sn
        from
          (
           select alias as taxid,
                  taxonomy.getSpecies(alias) as species,
                  taxonomy.getGenus(alias) as genus,
                  taxonomy.getFamily(alias) as family
           from taxonomy.aliases
           where id=%d ) as tax
        """ % taxid)
    rep = curseur.fetchone()
    entry['current_taxid'] = rep[0]
    entry['species'] = rep[1]
    entry['genus'] = rep[2]
    entry['family'] = rep[3]
    entry['scientific_name'] = rep[4]
    entry['species_sn'] = rep[5]
    entry['genus_sn'] = rep[6]
    entry['family_sn'] = rep[7]
    return entry


#####
#
#
# Binary writer
#
#
#####


def ecoSeqPacker(sq):
    # Pack one sequence record; the sequence itself is zlib-compressed.
    compactseq = gzip.zlib.compress(sq['sequence'], 9)
    cptseqlength = len(compactseq)
    delength = len(sq['definition'])

    # Record size, not counting the leading size field itself.
    totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength

    packed = struct.pack('> I I 20s I I I %ds %ds' % (delength, cptseqlength),
                         totalSize,
                         sq['taxid'],
                         sq['id'],
                         delength,
                         len(sq['sequence']),
                         cptseqlength,
                         sq['definition'],
                         compactseq)

    assert len(packed) == totalSize + 4, "error in sequence packing"

    return packed
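

# Record layout written by ecoSeqPacker() (big-endian, per the '>' in the
# struct format):
#
#   uint32   record size in bytes, excluding this field
#   uint32   taxid (row index in the .tdx file)
#   char[20] sequence id
#   uint32   definition length
#   uint32   uncompressed sequence length
#   uint32   compressed sequence length
#   bytes    definition, then zlib-compressed sequence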


def ecoTaxPacker(tx):
    # tx is a taxonomy row: [taxid, rank index, parent row index, name].
    namelength = len(tx[3])

    totalSize = 4 + 4 + 4 + 4 + namelength

    packed = struct.pack('> I I I I I %ds' % namelength,
                         totalSize,
                         tx[0],
                         tx[1],
                         tx[2],
                         namelength,
                         tx[3])

    return packed


def ecoRankPacker(rank):
    namelength = len(rank)

    packed = struct.pack('> I %ds' % namelength,
                         namelength,
                         rank)

    return packed


def ecoNamePacker(name):
    # name is a (name, class name, taxon row index) tuple built by
    # readTaxonomyDump().
    namelength = len(name[0])
    classlength = len(name[1])
    totalSize = namelength + classlength + 4 + 4 + 4 + 4

    packed = struct.pack('> I I I I I %ds %ds' % (namelength, classlength),
                         totalSize,
                         int(name[1] == 'scientific name'),
                         namelength,
                         classlength,
                         name[2],
                         name[0],
                         name[1])

    return packed


def ecoSeqWriter(file, input, taxindex, parser):
    output = open(file, 'wb')
    input = universalOpen(input)
    inputsize = fileSize(input)
    entries = parser(input)
    seqcount = 0
    skipped = []

    # Reserve the leading record count; it is rewritten once the real
    # count is known.
    output.write(struct.pack('> I', seqcount))

    progressBar(1, inputsize, reset=True)
    for entry in entries:
        if entry['taxid'] is not None:
            try:
                # Replace the NCBI taxid by its row index in the taxonomy.
                entry['taxid'] = taxindex[entry['taxid']]
            except KeyError:
                entry['taxid'] = None
            if entry['taxid'] is not None:
                seqcount += 1
                output.write(ecoSeqPacker(entry))
            else:
                skipped.append(entry['id'])
            where = universalTell(input)
            progressBar(where, inputsize)
            print >>sys.stderr, " Read sequences : %d " % seqcount,
        else:
            skipped.append(entry['id'])

    print >>sys.stderr
    output.seek(0, 0)
    output.write(struct.pack('> I', seqcount))

    output.close()
    return skipped


def ecoTaxWriter(file, taxonomy):
    output = open(file, 'wb')
    output.write(struct.pack('> I', len(taxonomy)))

    for tx in taxonomy:
        output.write(ecoTaxPacker(tx))

    output.close()


def ecoRankWriter(file, ranks):
    output = open(file, 'wb')
    output.write(struct.pack('> I', len(ranks)))

    rankNames = ranks.keys()
    rankNames.sort()

    for rank in rankNames:
        output.write(ecoRankPacker(rank))

    output.close()


def nameCmp(n1, n2):
    # Case-insensitive ordering on the name field.
    name1 = n1[0].upper()
    name2 = n2[0].upper()
    if name1 < name2:
        return -1
    elif name1 > name2:
        return 1
    return 0


def ecoNameWriter(file, names):
    output = open(file, 'wb')
    output.write(struct.pack('> I', len(names)))

    names.sort(nameCmp)

    for name in names:
        output.write(ecoNamePacker(name))

    output.close()


def ecoDBWriter(prefix, taxonomy, seqFileNames, parser):
    # Write the rank (.rdx), taxon (.tdx) and name (.ndx) index files,
    # then one sequence file (.sdx) per input file.
    ecoRankWriter('%s.rdx' % prefix, taxonomy[1])
    ecoTaxWriter('%s.tdx' % prefix, taxonomy[0])
    ecoNameWriter('%s.ndx' % prefix, taxonomy[2])

    filecount = 0
    for filename in seqFileNames:
        filecount += 1
        sk = ecoSeqWriter('%s_%03d.sdx' % (prefix, filecount),
                          filename,
                          taxonomy[3],
                          parser)
        if sk:
            print >>sys.stderr, "Skipped entries:"
            print >>sys.stderr, sk


def ecoParseOptions(arguments):
    opt = {
        'prefix': 'ecodb',
        'taxdir': 'taxdump',
        'parser': sequenceIteratorFactory(genbankEntryParser,
                                          entryIterator)
    }

    o, filenames = getopt.getopt(arguments,
                                 'ht:n:gfe',
                                 ['help',
                                  'taxonomy=',
                                  'name=',
                                  'genbank',
                                  'fasta',
                                  'embl'])

    for name, value in o:
        if name in ('-h', '--help'):
            printHelp()
            exit()
        elif name in ('-t', '--taxonomy'):
            opt['taxmod'] = 'dump'
            opt['taxdir'] = value
        elif name in ('-n', '--name'):
            opt['prefix'] = value
        elif name in ('-g', '--genbank'):
            opt['parser'] = sequenceIteratorFactory(genbankEntryParser,
                                                    entryIterator)
        elif name in ('-f', '--fasta'):
            opt['parser'] = sequenceIteratorFactory(fastaEntryParser,
                                                    fastaEntryIterator)
        elif name in ('-e', '--embl'):
            opt['parser'] = sequenceIteratorFactory(emblEntryParser,
                                                    entryIterator)
        else:
            raise ValueError, 'Unknown option %s' % name

    return opt, filenames


def printHelp():
    print "-----------------------------------"
    print " ecoPCRFormat.py"
    print "-----------------------------------"
    print "ecoPCRFormat.py [option] <argument>"
    print "-----------------------------------"
    print "-e --embl     :[E]mbl format"
    print "-f --fasta    :[F]asta format"
    print "-g --genbank  :[G]enbank format"
    print "-h --help     :[H]elp - print this help"
    print "-n --name     :[N]ame of the new database created"
    print "-t --taxonomy :[T]axonomy - path to the taxonomy database"
    print "              :bcp-like dump from GenBank taxonomy database."
    print "-----------------------------------"


if __name__ == '__main__':

    opt, filenames = ecoParseOptions(sys.argv[1:])

    taxonomy = readTaxonomyDump(opt['taxdir'])

    ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])
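
# Example invocation (hypothetical file names):
#
#   python ecoPCRFormat.py -f -t ./taxdump -n mydb seqs.fasta
#
# reads 'seqs.fasta' against the NCBI dump in './taxdump' and writes
# mydb.rdx, mydb.tdx, mydb.ndx and mydb_001.sdx.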