From c77c3607828dd2987f107e603f8618f1a51eb81f Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 20 Jun 2007 07:39:40 +0000 Subject: [PATCH] git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/branches/refactoring@76 60f365c0-8329-0410-b2a4-ec073aeeaa1d --- tools/ecoPCRFormat.py | 63 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 11 deletions(-) diff --git a/tools/ecoPCRFormat.py b/tools/ecoPCRFormat.py index 9a3205d..0f237ae 100755 --- a/tools/ecoPCRFormat.py +++ b/tools/ecoPCRFormat.py @@ -171,15 +171,14 @@ def readNodeTable(file): return taxonomy,ranks,index -def scientificNameIterator(file): +def nameIterator(file): file = universalOpen(file) names = ColumnFile(file, sep='|', types=(int,str, str,str)) for taxid,name,unique,classname,white in names: - if classname == 'scientific name': - yield taxid,name + yield taxid,name,classname def mergedNodeIterator(file): file = universalOpen(file) @@ -201,8 +200,12 @@ def readTaxonomyDump(taxdir): taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir) print >>sys.stderr,"Adding scientific name..." - for taxid,name in scientificNameIterator('%s/names.dmp' % taxdir): - taxonomy[index[taxid]].append(name) + + alternativeName=[] + for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir): + alternativeName.append((name,classname,index[taxid])) + if classname == 'scientific name': + taxonomy[index[taxid]].append(name) print >>sys.stderr,"Adding taxid alias..." for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir): @@ -212,7 +215,7 @@ def readTaxonomyDump(taxdir): for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir): index[taxid]=None - return taxonomy,ranks,index + return taxonomy,ranks,alternativeName,index ##### @@ -405,6 +408,22 @@ def ecoRankPacker(rank): return packed +def ecoNamePacker(name): + + namelength = len(name[0]) + classlength= len(name[1]) + totalSize = namelength + classlength + 4 + 4 + 4 + 4 + + packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength), + totalSize, + int(name[1]=='scientific name'), + namelength, + classlength, + name[2], + name[0], + name[1]) + + return packed def ecoSeqWriter(file,input,taxindex,parser): output = open(file,'wb') @@ -462,18 +481,40 @@ def ecoRankWriter(file,ranks): output.write(ecoRankPacker(rank)) output.close() + +def nameCmp(n1,n2): + name1=n1[0].upper() + name2=n2[0].upper() + if name1 < name2: + return -1 + elif name1 > name2: + return 1 + return 0 + + +def ecoNameWriter(file,names): + output = open(file,'wb') + output.write(struct.pack('> I',len(names))) + + names.sort(nameCmp) + + for name in names: + output.write(ecoNamePacker(name)) + + output.close() def ecoDBWriter(prefix,taxonomy,seqFileNames,parser): ecoRankWriter('%s.rdx' % prefix, taxonomy[1]) ecoTaxWriter('%s.tdx' % prefix, taxonomy[0]) - + ecoNameWriter('%s.ndx' % prefix, taxonomy[2]) + filecount = 0 for filename in seqFileNames: filecount+=1 sk=ecoSeqWriter('%s_%03d.sdx' % (prefix,filecount), filename, - taxonomy[2], + taxonomy[3], parser) if sk: print >>sys.stderr,"Skipped entry :" @@ -526,9 +567,9 @@ def printHelp(): print "-----------------------------------" print "ecoPCRFormat.py [option] " print "-----------------------------------" - print "-e --embl :[E]mbl format file name" - print "-f --fasta :[F]asta format file name" - print "-g --genbank :[G]enbank format file name" + print "-e --embl :[E]mbl format" + print "-f --fasta :[F]asta format" + print "-g --genbank :[G]enbank format" print "-h --help :[H]elp - print this help" print "-n --name :[N]ame of the new database created" print "-t --taxonomy :[T]axonomy - path to the taxonomy database"