Update to account for the evolution of the language (port to Python 3)

This commit is contained in:
2023-06-29 12:12:46 +02:00
parent 73236c72a8
commit 92826de147
18 changed files with 146 additions and 199 deletions

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
import re
import gzip
@ -6,12 +6,9 @@ import struct
import sys
import time
import getopt
from functools import cmp_to_key
try:
import psycopg2
_dbenable=True
except ImportError:
_dbenable=False
_dbenable=False
#####
#
@ -80,15 +77,17 @@ class ColumnFile(object):
def __init__(self,stream,sep=None,strip=True,types=None):
if isinstance(stream,str):
self._stream = open(stream)
elif hasattr(stream,'next'):
self._stream = stream
else:
raise ValueError,'stream must be string or an iterator'
try:
iter(stream)
self._stream = stream
except TypeError:
raise ValueError('stream must be string or an iterator')
self._delimiter=sep
self._strip=strip
if types:
self._types=[x for x in types]
for i in xrange(len(self._types)):
for i in range(len(self._types)):
if self._types[i] is bool:
self._types[i]=ColumnFile.str2bool
else:
@ -103,14 +102,14 @@ class ColumnFile(object):
def __iter__(self):
return self
def next(self):
ligne = self._stream.next()
def __next__(self):
ligne = next(self._stream)
data = ligne.split(self._delimiter)
if self._strip or self._types:
data = [x.strip() for x in data]
if self._types:
it = endLessIterator(self._types)
data = [x[1](x[0]) for x in ((y,it.next()) for y in data)]
data = [x[1](x[0]) for x in ((y,next(it)) for y in data)]
return data
def taxonCmp(t1,t2):
@ -125,14 +124,14 @@ def bsearchTaxon(taxonomy,taxid):
begin = 0
end = taxCount
oldcheck=taxCount
check = begin + end / 2
check = int(begin + end / 2)
while check != oldcheck and taxonomy[check][0]!=taxid :
if taxonomy[check][0] < taxid:
begin=check
else:
end=check
oldcheck=check
check = (begin + end) / 2
check = int((begin + end) / 2)
if taxonomy[check][0]==taxid:
@ -152,22 +151,22 @@ def readNodeTable(file):
str,str,bool,
int,bool,int,
bool,bool,bool,str))
print >>sys.stderr,"Reading taxonomy dump file..."
print("Reading taxonomy dump file...", file=sys.stderr)
taxonomy=[[n[0],n[2],n[1]] for n in nodes]
print >>sys.stderr,"List all taxonomy rank..."
print("List all taxonomy rank...", file=sys.stderr)
ranks =list(set(x[1] for x in taxonomy))
ranks.sort()
ranks = dict(map(None,ranks,xrange(len(ranks))))
ranks = {rank: index for index, rank in enumerate(ranks)}
print >>sys.stderr,"Sorting taxons..."
taxonomy.sort(taxonCmp)
print("Sorting taxons...", file=sys.stderr)
taxonomy.sort(key=lambda x: x[0])
print >>sys.stderr,"Indexing taxonomy..."
print("Indexing taxonomy...", file=sys.stderr)
index = {}
for t in taxonomy:
index[t[0]]=bsearchTaxon(taxonomy, t[0])
print >>sys.stderr,"Indexing parent and rank..."
print("Indexing parent and rank...", file=sys.stderr)
for t in taxonomy:
t[1]=ranks[t[1]]
t[2]=index[t[2]]
@ -203,7 +202,7 @@ def deletedNodeIterator(file):
def readTaxonomyDump(taxdir):
taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
print >>sys.stderr,"Adding scientific name..."
print("Adding scientific name...", file=sys.stderr)
alternativeName=[]
for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir):
@ -211,66 +210,16 @@ def readTaxonomyDump(taxdir):
if classname == 'scientific name':
taxonomy[index[taxid]].append(name)
print >>sys.stderr,"Adding taxid alias..."
print("Adding taxid alias...", file=sys.stderr)
for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
index[taxid]=index[current]
print >>sys.stderr,"Adding deleted taxid..."
print("Adding deleted taxid...", file=sys.stderr)
for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
index[taxid]=None
return taxonomy,ranks,alternativeName,index
def readTaxonomyDB(dbname):
connection = psycopg2.connect(database=dbname)
cursor = connection.cursor()
cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon")
taxonomy=[list(x) for x in cursor]
cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class")
ranks=cursor.fetchall()
ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks))))
print >>sys.stderr,"Sorting taxons..."
taxonomy.sort(taxonCmp)
print >>sys.stderr,"Indexing taxonomy..."
index = {}
for t in taxonomy:
index[t[0]]=bsearchTaxon(taxonomy, t[0])
print >>sys.stderr,"Indexing parent and rank..."
for t in taxonomy:
t[1]=ranks[t[1]]
try:
t[2]=index[t[2]]
except KeyError,e:
if t[2] is None and t[0]==1:
t[2]=index[t[0]]
else:
raise e
cursor.execute("select taxid,name,category from ncbi_taxonomy.name")
alternativeName=[]
for taxid,name,classname in cursor:
alternativeName.append((name,classname,index[taxid]))
if classname == 'scientific name':
taxonomy[index[taxid]].append(name)
cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias")
print >>sys.stderr,"Adding taxid alias..."
for taxid,current in cursor:
if current is not None:
index[taxid]=index[current]
else:
index[taxid]=None
return taxonomy,ranks,alternativeName,index
#####
#
#
@ -282,22 +231,27 @@ def readTaxonomyDB(dbname):
def entryIterator(file):
    """Iterate over a flat-file database stream, yielding one entry at a time.

    Each yielded value is the full text of one record, accumulated up to and
    including its terminating '//' line.

    file -- a file name or an open stream; it is passed through universalOpen.
    """
    file = universalOpen(file)
    rep = []
    # readline() is used rather than iterating the stream directly,
    # presumably so universalTell() keeps reporting a usable position
    # on the input (see ecoSeqWriter's progress bar) — TODO confirm.
    ligne = file.readline()
    while ligne:
        rep.append(ligne)
        if ligne == '//\n':
            # End-of-entry marker reached: emit the accumulated record.
            yield ''.join(rep)
            rep = []
        ligne = file.readline()
def fastaEntryIterator(file):
    """Iterate over a FASTA stream, yielding one entry at a time.

    Each yielded value is the full text of one entry: its '>' header line
    plus all following sequence lines, up to the next header or end of file.

    file -- a file name or an open stream; it is passed through universalOpen.
    """
    file = universalOpen(file)
    rep = []
    # readline() is used rather than iterating the stream directly,
    # presumably so universalTell() keeps reporting a usable position
    # on the input — TODO confirm.
    ligne = file.readline()
    while ligne:
        if ligne[0] == '>' and rep:
            # A new header starts: emit the previously accumulated entry.
            yield ''.join(rep)
            rep = []
        rep.append(ligne)
        ligne = file.readline()
    # Flush the final entry (no trailing header follows it).
    if rep:
        yield ''.join(rep)
@ -418,7 +372,7 @@ def taxonomyInfo(entry,connection):
def ecoSeqPacker(sq):
compactseq = gzip.zlib.compress(sq['sequence'],9)
compactseq = gzip.zlib.compress(bytes(sq['sequence'],"ascii"),9)
cptseqlength = len(compactseq)
delength = len(sq['definition'])
@ -427,11 +381,11 @@ def ecoSeqPacker(sq):
packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength),
totalSize,
sq['taxid'],
sq['id'],
bytes(sq['id'],"ascii"),
delength,
len(sq['sequence']),
cptseqlength,
sq['definition'],
bytes(sq['definition'],"ascii"),
compactseq)
assert len(packed) == totalSize+4, "error in sequence packing"
@ -450,7 +404,7 @@ def ecoTaxPacker(tx):
tx[1],
tx[2],
namelength,
tx[3])
bytes(tx[3],"ascii"))
return packed
@ -460,7 +414,7 @@ def ecoRankPacker(rank):
packed = struct.pack('> I %ds' % namelength,
namelength,
rank)
bytes(rank, 'ascii'))
return packed
@ -476,8 +430,8 @@ def ecoNamePacker(name):
namelength,
classlength,
name[2],
name[0],
name[1])
bytes(name[0], 'ascii'),
bytes(name[1], 'ascii'))
return packed
@ -505,11 +459,11 @@ def ecoSeqWriter(file,input,taxindex,parser):
skipped.append(entry['id'])
where = universalTell(input)
progressBar(where, inputsize)
print >>sys.stderr," Readed sequences : %d " % seqcount,
print(" Readed sequences : %d " % seqcount, end=' ', file=sys.stderr)
else:
skipped.append(entry['id'])
print >>sys.stderr
print(file=sys.stderr)
output.seek(0,0)
output.write(struct.pack('> I',seqcount))
@ -530,7 +484,7 @@ def ecoRankWriter(file,ranks):
output = open(file,'wb')
output.write(struct.pack('> I',len(ranks)))
rankNames = ranks.keys()
rankNames = list(ranks.keys())
rankNames.sort()
for rank in rankNames:
@ -552,7 +506,7 @@ def ecoNameWriter(file,names):
output = open(file,'wb')
output.write(struct.pack('> I',len(names)))
names.sort(nameCmp)
names.sort(key=lambda x:x[0].upper())
for name in names:
output.write(ecoNamePacker(name))
@ -573,8 +527,8 @@ def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
taxonomy[3],
parser)
if sk:
print >>sys.stderr,"Skipped entry :"
print >>sys.stderr,sk
print("Skipped entry :", file=sys.stderr)
print(sk, file=sys.stderr)
def ecoParseOptions(arguments):
opt = {
@ -618,34 +572,30 @@ def ecoParseOptions(arguments):
opt['parser']=sequenceIteratorFactory(emblEntryParser,
entryIterator)
else:
raise ValueError,'Unknown option %s' % name
raise ValueError('Unknown option %s' % name)
return opt,filenames
def printHelp():
print "-----------------------------------"
print " ecoPCRFormat.py"
print "-----------------------------------"
print "ecoPCRFormat.py [option] <argument>"
print "-----------------------------------"
print "-e --embl :[E]mbl format"
print "-f --fasta :[F]asta format"
print "-g --genbank :[G]enbank format"
print "-h --help :[H]elp - print this help"
print "-n --name :[N]ame of the new database created"
print "-t --taxonomy :[T]axonomy - path to the taxonomy database"
print " :bcp-like dump from GenBank taxonomy database."
print "-----------------------------------"
print("-----------------------------------")
print(" ecoPCRFormat.py")
print("-----------------------------------")
print("ecoPCRFormat.py [option] <argument>")
print("-----------------------------------")
print("-e --embl :[E]mbl format")
print("-f --fasta :[F]asta format")
print("-g --genbank :[G]enbank format")
print("-h --help :[H]elp - print this help")
print("-n --name :[N]ame of the new database created")
print("-t --taxonomy :[T]axonomy - path to the taxonomy database")
print(" :bcp-like dump from GenBank taxonomy database.")
print("-----------------------------------")
if __name__ == '__main__':
    opt, filenames = ecoParseOptions(sys.argv[1:])
    # The PostgreSQL 'db' mode (readTaxonomyDB) was removed in this port;
    # the taxonomy is now always read from the NCBI dump directory.
    taxonomy = readTaxonomyDump(opt['taxdir'])
    ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])