This commit is contained in:
2007-06-20 07:39:40 +00:00
parent 163693a132
commit c77c360782

View File

@ -171,15 +171,14 @@ def readNodeTable(file):
return taxonomy,ranks,index
def scientificNameIterator(file):
def nameIterator(file):
file = universalOpen(file)
names = ColumnFile(file,
sep='|',
types=(int,str,
str,str))
for taxid,name,unique,classname,white in names:
if classname == 'scientific name':
yield taxid,name
yield taxid,name,classname
def mergedNodeIterator(file):
file = universalOpen(file)
@ -201,8 +200,12 @@ def readTaxonomyDump(taxdir):
taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
print >>sys.stderr,"Adding scientific name..."
for taxid,name in scientificNameIterator('%s/names.dmp' % taxdir):
taxonomy[index[taxid]].append(name)
alternativeName=[]
for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir):
alternativeName.append((name,classname,index[taxid]))
if classname == 'scientific name':
taxonomy[index[taxid]].append(name)
print >>sys.stderr,"Adding taxid alias..."
for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
@ -212,7 +215,7 @@ def readTaxonomyDump(taxdir):
for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
index[taxid]=None
return taxonomy,ranks,index
return taxonomy,ranks,alternativeName,index
#####
@ -405,6 +408,22 @@ def ecoRankPacker(rank):
return packed
def ecoNamePacker(name):
namelength = len(name[0])
classlength= len(name[1])
totalSize = namelength + classlength + 4 + 4 + 4 + 4
packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength),
totalSize,
int(name[1]=='scientific name'),
namelength,
classlength,
name[2],
name[0],
name[1])
return packed
def ecoSeqWriter(file,input,taxindex,parser):
output = open(file,'wb')
@ -462,18 +481,40 @@ def ecoRankWriter(file,ranks):
output.write(ecoRankPacker(rank))
output.close()
def nameCmp(n1,n2):
name1=n1[0].upper()
name2=n2[0].upper()
if name1 < name2:
return -1
elif name1 > name2:
return 1
return 0
def ecoNameWriter(file,names):
output = open(file,'wb')
output.write(struct.pack('> I',len(names)))
names.sort(nameCmp)
for name in names:
output.write(ecoNamePacker(name))
output.close()
def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
ecoRankWriter('%s.rdx' % prefix, taxonomy[1])
ecoTaxWriter('%s.tdx' % prefix, taxonomy[0])
ecoNameWriter('%s.ndx' % prefix, taxonomy[2])
filecount = 0
for filename in seqFileNames:
filecount+=1
sk=ecoSeqWriter('%s_%03d.sdx' % (prefix,filecount),
filename,
taxonomy[2],
taxonomy[3],
parser)
if sk:
print >>sys.stderr,"Skipped entry :"
@ -526,9 +567,9 @@ def printHelp():
print "-----------------------------------"
print "ecoPCRFormat.py [option] <argument>"
print "-----------------------------------"
print "-e --embl :[E]mbl format file name"
print "-f --fasta :[F]asta format file name"
print "-g --genbank :[G]enbank format file name"
print "-e --embl :[E]mbl format"
print "-f --fasta :[F]asta format"
print "-g --genbank :[G]enbank format"
print "-h --help :[H]elp - print this help"
print "-n --name :[N]ame of the new database created"
print "-t --taxonomy :[T]axonomy - path to the taxonomy database"