Update tools/ecoPCRFormat.py
@@ -1,12 +1,14 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+from __future__ import print_function
 import re
 import gzip
 import struct
 import sys
 import time
 import getopt
 from functools import cmp_to_key
+import zlib
 
 _dbenable = False
@@ -20,29 +22,42 @@ _dbenable=False
 
 def universalOpen(file):
     if isinstance(file, str):
-        if file[-3:] == '.gz':
-            rep = gzip.open(file)
+        if file.endswith('.gz'):
+            if sys.version_info[0] < 3:
+                rep = gzip.open(file)
+            else:
+                rep = gzip.open(file, 'rt', encoding='latin-1')
         else:
-            rep = open(file)
+            rep = open(file, 'r')
     else:
         rep = file
     return rep
 
 def universalTell(file):
-    if isinstance(file, gzip.GzipFile):
-        file=file.myfileobj
+    if hasattr(file, 'myfileobj'):  # Python 3 gzip
+        return file.myfileobj.tell()
+    elif hasattr(file, 'fileobj'):  # Python 2 gzip
+        return file.fileobj.tell()
     return file.tell()
 
 def fileSize(file):
-    if isinstance(file, gzip.GzipFile):
-        file=file.myfileobj
-    pos = file.tell()
-    file.seek(0,2)
-    length = file.tell()
-    file.seek(pos,0)
+    if hasattr(file, 'myfileobj'):  # Python 3 gzip
+        f = file.myfileobj
+    elif hasattr(file, 'fileobj'):  # Python 2 gzip
+        f = file.fileobj
+    else:
+        f = file
+
+    if not hasattr(f, 'fileno'):
+        return None
+
+    pos = f.tell()
+    f.seek(0, 2)
+    length = f.tell()
+    f.seek(pos, 0)
    return length
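
Note: both helpers reach for the underlying raw file object because a GzipFile's own tell() reports the offset in the uncompressed stream, while the progress bar compares positions against the compressed size on disk. A small illustration of the two offsets (file name hypothetical; myfileobj is a CPython implementation detail, which is also why the hasattr probes above are guarded):

    import gzip

    fh = gzip.open('reads.fasta.gz', 'rb')   # hypothetical input; binary mode returns a GzipFile
    fh.readline()
    print(fh.tell())            # logical offset in the uncompressed stream
    print(fh.myfileobj.tell())  # physical offset in the compressed file, the unit fileSize() works in

One caveat: in Python 3, gzip.open(..., 'rt') returns an io.TextIOWrapper rather than a GzipFile, so text-mode handles produced by universalOpen appear to fall through to plain file.tell() here.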
 
-def progressBar(pos,max,reset=False,delta=[]):
+def progressBar(pos, max_size, reset=False, delta=[]):
     if reset:
         del delta[:]
     if not delta:
@@ -51,12 +66,28 @@ def progressBar(pos,max,reset=False,delta=[]):
 
         delta[1] = time.time()
     elapsed = delta[1] - delta[0]
-    percent = float(pos)/max * 100
-    remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent)))
-    bar = '#' * int(percent/2)
-    bar+= '|/-\\-'[pos % 5]
-    bar+= ' ' * (50 - int(percent/2))
-    sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain))
+
+    if max_size is None:
+        sys.stderr.write('\rReaded sequences : %d ' % pos)
+        sys.stderr.flush()
+        return
+
+    percent = float(pos) / max_size * 100 if max_size > 0 else 0
+
+    if percent > 0:
+        remain = elapsed / percent * (100 - percent)
+        remain_str = time.strftime('%H:%M:%S', time.gmtime(int(remain)))
+    else:
+        remain_str = '?'
+
+    bar_length = 50
+    filled = int(percent / 2)
+    bar = '#' * filled
+    bar += '|/-\\'[pos % 4]
+    bar += ' ' * (bar_length - filled - 1)
+
+    sys.stderr.write('\r%5.1f %% |%s] remain : %s' % (percent, bar, remain_str))
+    sys.stderr.flush()
 
 #####
 #
@@ -66,14 +97,25 @@ def progressBar(pos,max,reset=False,delta=[]):
 #
 #####
 
-def endLessIterator(endedlist):
-    for x in endedlist:
-        yield x
-    while(1):
-        yield endedlist[-1]
+class endLessIterator(object):
+    def __init__(self, endedlist):
+        self.endedlist = endedlist
+        self.index = 0
+
+    def __iter__(self):
+        return self
+
+    def next(self):  # Python 2
+        if self.index < len(self.endedlist):
+            item = self.endedlist[self.index]
+            self.index += 1
+            return item
+        else:
+            return self.endedlist[-1]
+
+    __next__ = next  # Python 3 compatibility
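
The generator is replaced by a class with an explicit next/__next__ pair so the same object works under both Python 2 and 3; the behaviour is unchanged — after the list is exhausted it repeats the final element forever. A quick illustrative check (not part of the commit; note that the reworked ColumnFile below no longer actually relies on this class):

    it = endLessIterator(['a', 'b'])
    print([next(it) for _ in range(5)])   # ['a', 'b', 'b', 'b', 'b']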
 
 class ColumnFile(object):
     def __init__(self, stream, sep=None, strip=True, types=None):
         if isinstance(stream, str):
             self._stream = open(stream)
@@ -85,66 +127,64 @@ class ColumnFile(object):
             raise ValueError('stream must be string or an iterator')
         self._delimiter = sep
         self._strip = strip
-        if types:
-            self._types=[x for x in types]
-            for i in range(len(self._types)):
-                if self._types[i] is bool:
-                    self._types[i]=ColumnFile.str2bool
-        else:
-            self._types=None
+        self._types = types
 
+    @staticmethod
     def str2bool(x):
-        return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
-    str2bool = staticmethod(str2bool)
+        val = x.strip()[0].upper()
+        if val in 'T1Y':
+            return True
+        elif val in 'F0N':
+            return False
+        raise ValueError("Invalid boolean value: %s" % x)
 
     def __iter__(self):
         return self
 
-    def __next__(self):
+    def next(self):  # Python 2
         ligne = next(self._stream)
         data = ligne.split(self._delimiter)
-        if self._strip or self._types:
+        if self._strip:
             data = [x.strip() for x in data]
 
         if self._types:
-            it = endLessIterator(self._types)
-            data = [x[1](x[0]) for x in ((y,next(it)) for y in data)]
+            typed_data = []
+            for i, val in enumerate(data):
+                if i < len(self._types):
+                    t = self._types[i]
+                    if t is bool:
+                        typed_data.append(self.str2bool(val))
+                    else:
+                        typed_data.append(t(val))
+                else:
+                    typed_data.append(val)
+            return typed_data
         return data
 
-def taxonCmp(t1,t2):
-    if t1[0] < t2[0]:
-        return -1
-    elif t1[0] > t2[0]:
-        return +1
-    return 0
+    __next__ = next  # Python 3 compatibility
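
For illustration, here is how the reworked ColumnFile behaves on a dump-style row (the one-line stream is fabricated, and the constructor details outside this hunk are assumed unchanged):

    rows = ColumnFile(iter(['1\t|\t1\t|\tno rank\t|\n']), sep='|', types=(int, int, str))
    print(next(rows))   # [1, 1, 'no rank', ''] -- fields beyond the types tuple pass through unconverted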
 
 def bsearchTaxon(taxonomy, taxid):
     taxCount = len(taxonomy)
     begin = 0
     end = taxCount
-    oldcheck=taxCount
-    check = int(begin + end / 2)
-    while check != oldcheck and taxonomy[check][0]!=taxid :
+    oldcheck = -1
+    check = int(begin + (end - begin) / 2)
+
+    while check != oldcheck and check < taxCount and taxonomy[check][0] != taxid:
         if taxonomy[check][0] < taxid:
             begin = check
         else:
             end = check
         oldcheck = check
-        check = int((begin + end) / 2)
+        check = int(begin + (end - begin) / 2)
 
-    if taxonomy[check][0]==taxid:
+    if check < taxCount and taxonomy[check][0] == taxid:
         return check
     else:
         return None
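
The old initialisation `int(begin + end / 2)` only happened to work because begin starts at 0: operator precedence divides end alone, so for any non-zero begin the "midpoint" lands outside the search window. A quick check of the corrected formula (illustrative, not in the commit):

    begin, end = 10, 20
    int(begin + end / 2)             # 20 -- old formula, outside the [begin, end) window
    int(begin + (end - begin) / 2)   # 15 -- corrected midpoint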
 
 def readNodeTable(file):
     file = universalOpen(file)
     nodes = ColumnFile(file,
                        sep='|',
                        types=(int, int, str,
@@ -152,35 +192,47 @@ def readNodeTable(file):
                               int, bool, int,
                               bool, bool, bool, str))
     print("Reading taxonomy dump file...", file=sys.stderr)
-    taxonomy=[[n[0],n[2],n[1]] for n in nodes]
+    sys.stderr.flush()
+
+    taxonomy = []
+    for n in nodes:
+        taxonomy.append([n[0], n[2], n[1]])
 
     print("List all taxonomy rank...", file=sys.stderr)
+    sys.stderr.flush()
+
     ranks = list(set(x[1] for x in taxonomy))
     ranks.sort()
-    ranks = {rank: index for index, rank in enumerate(ranks)}
+    rank_dict = {rank: idx for idx, rank in enumerate(ranks)}
 
     print("Sorting taxons...", file=sys.stderr)
+    sys.stderr.flush()
     taxonomy.sort(key=lambda x: x[0])
 
     print("Indexing taxonomy...", file=sys.stderr)
+    sys.stderr.flush()
+
     index = {}
-    for t in taxonomy:
-        index[t[0]]=bsearchTaxon(taxonomy, t[0])
+    for i, t in enumerate(taxonomy):
+        index[t[0]] = i
 
     print("Indexing parent and rank...", file=sys.stderr)
+    sys.stderr.flush()
+
     for t in taxonomy:
-        t[1]=ranks[t[1]]
-        t[2]=index[t[2]]
+        t[1] = rank_dict[t[1]]
+        t[2] = index.get(t[2], None)
 
-    return taxonomy,ranks,index
+    return taxonomy, rank_dict, index
 
 def nameIterator(file):
     file = universalOpen(file)
     names = ColumnFile(file,
                        sep='|',
-                       types=(int,str,
-                              str,str))
-    for taxid,name,unique,classname,white in names:
-        yield taxid, name, classname
+                       types=(int, str, str, str))
+    for row in names:
+        if len(row) >= 4:
+            taxid, name, unique, classname = row[:4]
+            yield taxid, name, classname
 
 def mergedNodeIterator(file):
@@ -188,7 +240,9 @@ def mergedNodeIterator(file):
     merged = ColumnFile(file,
                         sep='|',
                         types=(int, int, str))
-    for taxid,current,white in merged:
-        yield taxid, current
+    for row in merged:
+        if len(row) >= 2:
+            taxid, current = row[:2]
+            yield taxid, current
 
 def deletedNodeIterator(file):
@@ -196,26 +250,40 @@ def deletedNodeIterator(file):
     deleted = ColumnFile(file,
                          sep='|',
                          types=(int, str))
-    for taxid,white in deleted:
-        yield taxid
+    for row in deleted:
+        if row:
+            taxid = row[0]
+            yield taxid
 
 def readTaxonomyDump(taxdir):
-    taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
+    taxonomy, ranks, index = readNodeTable(taxdir + '/nodes.dmp')
+
+    # Initialize scientific name as empty string for each taxon
+    for tx in taxonomy:
+        tx.append("")  # Add fourth element: scientific name (empty at start)
 
     print("Adding scientific name...", file=sys.stderr)
+    sys.stderr.flush()
 
     alternativeName = []
-    for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir):
-        alternativeName.append((name,classname,index[taxid]))
-        if classname == 'scientific name':
-            taxonomy[index[taxid]].append(name)
+    for taxid, name, classname in nameIterator(taxdir + '/names.dmp'):
+        idx = index.get(taxid, None)
+        if idx is not None:
+            # Clean and ensure ASCII only
+            name_clean = ''.join(c for c in name if ord(c) < 128)
+            alternativeName.append((name_clean, classname, idx))
+            if classname == 'scientific name':
+                taxonomy[idx][3] = name_clean
 
     print("Adding taxid alias...", file=sys.stderr)
-    for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
-        index[taxid] = index[current]
+    sys.stderr.flush()
+    for taxid, current in mergedNodeIterator(taxdir + '/merged.dmp'):
+        if current in index:
+            index[taxid] = index[current]
 
     print("Adding deleted taxid...", file=sys.stderr)
-    for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
+    sys.stderr.flush()
+    for taxid in deletedNodeIterator(taxdir + '/delnodes.dmp'):
         index[taxid] = None
 
     return taxonomy, ranks, alternativeName, index
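
A hedged usage sketch (not in the commit): loading an NCBI-style dump directory, which is assumed to contain nodes.dmp, names.dmp, merged.dmp and delnodes.dmp (the path is hypothetical):

    taxonomy, ranks, names, index = readTaxonomyDump('taxdump')
    print(len(taxonomy), 'taxa;', len(ranks), 'ranks')
    root = taxonomy[index[1]]   # taxid 1 is the root of the NCBI taxonomy
    print(root[3])              # its scientific name, filled in by the loop above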
@@ -235,96 +303,124 @@ def entryIterator(file):
     while ligne:
         rep.append(ligne)
         if ligne == '//\n':
-            rep = ''.join(rep)
-            yield rep
+            yield ''.join(rep)
             rep = []
         ligne = file.readline()
+    if rep:
+        yield ''.join(rep)
 
 def fastaEntryIterator(file):
     file = universalOpen(file)
     rep = []
     ligne = file.readline()
     while ligne:
-        if ligne[0] == '>' and rep:
-            rep = ''.join(rep)
-            yield rep
+        if ligne.startswith('>') and rep:
+            yield ''.join(rep)
             rep = []
         rep.append(ligne)
         ligne = file.readline()
 
     if rep:
-        rep = ''.join(rep)
-        yield rep
+        yield ''.join(rep)
 
-_cleanSeq = re.compile('[ \n0-9]+')
+_cleanSeq = re.compile(r'[\s\d]')  # strips whitespace and digits
 
 def cleanSeq(seq):
     return _cleanSeq.sub('', seq)
 
-_gbParseID = re.compile('(?<=^LOCUS {7})[^ ]+(?= )',re.MULTILINE)
-_gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL)
-_gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL)
-_gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
+# GenBank patterns
+_gbParseID = re.compile(r'(?<=^LOCUS {7})[^ ]+(?= )', re.MULTILINE)
+_gbParseDE = re.compile(r'(?<=^DEFINITION {2}).+?\. *$', re.MULTILINE | re.DOTALL)
+_gbParseSQ = re.compile(r'(?<=^ORIGIN).+?(?=^//$)', re.MULTILINE | re.DOTALL)
+_gbParseTX = re.compile(r'(?<= /db_xref="taxon:)[0-9]+(?=")')
 
 def genbankEntryParser(entry):
-    Id = _gbParseID.findall(entry)[0]
-    De = ' '.join(_gbParseDE.findall(entry)[0].split())
-    Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper())
-    try:
-        Tx = int(_gbParseTX.findall(entry)[0])
-    except IndexError:
-        Tx = None
+    Id_match = _gbParseID.findall(entry)
+    Id = Id_match[0] if Id_match else "UNKNOWN_ID"
+
+    De_match = _gbParseDE.findall(entry)
+    De = ' '.join(De_match[0].split()) if De_match else ""
+
+    Sq_match = _gbParseSQ.findall(entry)
+    Sq = cleanSeq(Sq_match[0].upper()) if Sq_match else ""
+
+    Tx_match = _gbParseTX.findall(entry)
+    Tx = int(Tx_match[0]) if Tx_match else None
+
+    # Clean and ensure ASCII only
+    Id = ''.join(c for c in Id if ord(c) < 128)
+    De = ''.join(c for c in De if ord(c) < 128)
+
     return {'id': Id, 'taxid': Tx, 'definition': De, 'sequence': Sq}
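
To make the defensive findall pattern concrete, a minimal fabricated entry trimmed to just the fields the regexes read (the LOCUS id is followed by exactly 7 spaces, DEFINITION by 2):

    entry = (
        'LOCUS       AB000001 1 bp\n'
        'DEFINITION  Example sequence.\n'
        '  /db_xref="taxon:9606"\n'
        'ORIGIN\n'
        '        1 acgtacgtac\n'
        '//\n'
    )
    print(genbankEntryParser(entry))
    # {'id': 'AB000001', 'taxid': 9606, 'definition': 'Example sequence.', 'sequence': 'ACGTACGTAC'}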
 ######################
 
 _cleanDef = re.compile('[\nDE]')
 
 def cleanDef(definition):
     return _cleanDef.sub('',definition)
 
-_emblParseID = re.compile('(?<=^ID {3})[^ ]+(?=;)',re.MULTILINE)
-_emblParseDE = re.compile('(?<=^DE {3}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL)
-_emblParseSQ = re.compile('(?<=^     ).+?(?=^//$)',re.MULTILINE+re.DOTALL)
-_emblParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
+# EMBL patterns
+_emblParseID = re.compile(r'(?<=^ID {3})[^ ]+(?=;)', re.MULTILINE)
+_emblParseDE = re.compile(r'(?<=^DE {3}).+?\. *$', re.MULTILINE | re.DOTALL)
+_emblParseSQ = re.compile(r'(?<=^     ).+?(?=^//$)', re.MULTILINE | re.DOTALL)
+_emblParseTX = re.compile(r'(?<= /db_xref="taxon:)[0-9]+(?=")')
 
 def emblEntryParser(entry):
-    Id = _emblParseID.findall(entry)[0]
-    De = ' '.join(cleanDef(_emblParseDE.findall(entry)[0]).split())
-    Sq = cleanSeq(_emblParseSQ.findall(entry)[0].upper())
-    try:
-        Tx = int(_emblParseTX.findall(entry)[0])
-    except IndexError:
-        Tx = None
+    Id_match = _emblParseID.findall(entry)
+    Id = Id_match[0] if Id_match else "UNKNOWN_ID"
+
+    De_match = _emblParseDE.findall(entry)
+    De = ' '.join(De_match[0].replace('\nDE   ', ' ').split()) if De_match else ""
+
+    Sq_match = _emblParseSQ.findall(entry)
+    Sq = cleanSeq(Sq_match[0].upper()) if Sq_match else ""
+
+    Tx_match = _emblParseTX.findall(entry)
+    Tx = int(Tx_match[0]) if Tx_match else None
+
+    # Clean and ensure ASCII only
+    Id = ''.join(c for c in Id if ord(c) < 128)
+    De = ''.join(c for c in De if ord(c) < 128)
+
     return {'id': Id, 'taxid': Tx, 'definition': De, 'sequence': Sq}
 
 ######################
 
-_fastaSplit=re.compile(';\W*')
+# FASTA processing
+_fastaSplit = re.compile(r';\W*')
 
 def parseFasta(seq):
-    seq=seq.split('\n')
-    title = seq[0].strip()[1:].split(None,1)
-    id=title[0]
-    if len(title) == 2:
-        field = _fastaSplit.split(title[1])
-    else:
-        field=[]
-    info = dict(x.split('=',1) for x in field if '=' in x)
-    definition = ' '.join([x for x in field if '=' not in x])
-    seq=(''.join([x.strip() for x in seq[1:]])).upper()
-    return id,seq,definition,info
+    lines = seq.split('\n')
+    if not lines:
+        return "", "", "", {}
+
+    header = lines[0].strip()
+    if header.startswith('>'):
+        header = header[1:]
+
+    parts = header.split(None, 1)
+    seq_id = parts[0] if parts else ""
+    description = parts[1] if len(parts) > 1 else ""
+
+    fields = _fastaSplit.split(description) if description else []
+    info = {}
+    other_desc = []
+    for field in fields:
+        if '=' in field:
+            key, val = field.split('=', 1)
+            info[key.strip()] = val.strip()
+        else:
+            other_desc.append(field)
+
+    definition = ' '.join(other_desc)
+    sequence = ''.join(x.strip() for x in lines[1:]).upper()
+
+    # Clean and ensure ASCII only
+    seq_id = ''.join(c for c in seq_id if ord(c) < 128)
+    definition = ''.join(c for c in definition if ord(c) < 128)
+
+    return seq_id, sequence, definition, info
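
An illustrative call with a fabricated header — key=value fields land in info, everything else becomes the definition:

    seq_id, sequence, definition, info = parseFasta('>seq1 taxid=9606; a test sequence\nACGT\nacgt\n')
    # seq_id == 'seq1', sequence == 'ACGTACGT', info == {'taxid': '9606'}, definition == 'a test sequence'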
 
 def fastaEntryParser(entry):
-    id,seq,definition,info = parseFasta(entry)
+    seq_id, sequence, definition, info = parseFasta(entry)
     Tx = info.get('taxid', None)
     if Tx is not None:
-        Tx = int(Tx)
-    return {'id':id,'taxid':Tx,'definition':definition,'sequence':seq}
+        try:
+            Tx = int(Tx)
+        except ValueError:
+            Tx = None
+    return {'id': seq_id, 'taxid': Tx, 'definition': definition, 'sequence': sequence}
 
 def sequenceIteratorFactory(entryParser, entryIterator):
     def sequenceIterator(file):
@@ -332,144 +428,142 @@ def sequenceIteratorFactory(entryParser,entryIterator):
             yield entryParser(entry)
     return sequenceIterator
 
 def taxonomyInfo(entry,connection):
     taxid = entry['taxid']
     curseur = connection.cursor()
     curseur.execute("""
         select taxid,species,genus,family,
                taxonomy.scientificName(taxid) as sn,
                taxonomy.scientificName(species) as species_sn,
                taxonomy.scientificName(genus) as genus_sn,
                taxonomy.scientificName(family) as family_sn
         from
             (
              select alias as taxid,
                     taxonomy.getSpecies(alias) as species,
                     taxonomy.getGenus(alias) as genus,
                     taxonomy.getFamily(alias) as family
             from taxonomy.aliases
             where id=%d ) as tax
         """ % taxid)
     rep = curseur.fetchone()
     entry['current_taxid']=rep[0]
     entry['species']=rep[1]
     entry['genus']=rep[2]
     entry['family']=rep[3]
     entry['scientific_name']=rep[4]
     entry['species_sn']=rep[5]
     entry['genus_sn']=rep[6]
     entry['family_sn']=rep[7]
     return entry
 
 #####
 #
 #
-# Binary writer
+# Binary writer (CORRECTED VERSION)
 #
 #
 #####
 
 def ecoSeqPacker(sq):
-    compactseq = gzip.zlib.compress(bytes(sq['sequence'],"ascii"),9)
+    # Ensure sequence is ASCII uppercase
+    sequence = ''.join(c for c in sq['sequence'] if ord(c) < 128).upper()
+    seq_bytes = sequence.encode('ascii')
+    compactseq = zlib.compress(seq_bytes, 9)
     cptseqlength = len(compactseq)
-    delength = len(sq['definition'])
 
-    totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
+    # Clean definition to pure ASCII
+    definition = ''.join(c for c in sq['definition'] if ord(c) < 128)
+    de_bytes = definition.encode('ascii')
+    delength = len(de_bytes)
 
-    packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength),
-                         totalSize,
-                         sq['taxid'],
-                         bytes(sq['id'],"ascii"),
-                         delength,
-                         len(sq['sequence']),
-                         cptseqlength,
-                         bytes(sq['definition'],"ascii"),
-                         compactseq)
+    # Clean ID to pure ASCII and pad to 20 bytes
+    seq_id = ''.join(c for c in sq['id'] if ord(c) < 128)
+    seq_id_bytes = seq_id.encode('ascii')
+    if len(seq_id_bytes) > 20:
+        seq_id_bytes = seq_id_bytes[:20]
+    else:
+        seq_id_bytes = seq_id_bytes.ljust(20, b'\0')
 
-    assert len(packed) == totalSize+4, "error in sequence packing"
+    # Calculate total size (4 bytes for each integer + data lengths)
+    totalSize = 4 + 20 + 4 + 4 + 4 + delength + cptseqlength
+
+    # Pack with correct byte order and field order
+    packed = struct.pack('>I', totalSize)
+    packed += struct.pack('>I', sq['taxid'])
+    packed += seq_id_bytes
+    packed += struct.pack('>I', delength)
+    packed += struct.pack('>I', len(sequence))
+    packed += struct.pack('>I', cptseqlength)
+    packed += de_bytes
+    packed += compactseq
 
     return packed
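
The record layout is thus: a big-endian 4-byte total size (counting everything after itself), a 4-byte taxon index, a 20-byte NUL-padded id, then the definition length, raw sequence length and compressed length, followed by the two payloads. A hedged round-trip sketch (not part of the commit) for reading one record back:

    import struct, zlib

    def unpackEcoSeq(buf):
        total, = struct.unpack('>I', buf[:4])             # bytes following this field
        taxid, = struct.unpack('>I', buf[4:8])
        seq_id = buf[8:28].rstrip(b'\0').decode('ascii')  # 20-byte padded id
        delen, seqlen, cptlen = struct.unpack('>III', buf[28:40])
        definition = buf[40:40 + delen].decode('ascii')
        sequence = zlib.decompress(buf[40 + delen:40 + delen + cptlen]).decode('ascii')
        assert len(sequence) == seqlen and len(buf) == total + 4
        return taxid, seq_id, definition, sequence

    record = ecoSeqPacker({'id': 'AB000001', 'taxid': 42,
                           'definition': 'Example sequence.', 'sequence': 'ACGTACGT'})
    print(unpackEcoSeq(record))   # (42, 'AB000001', 'Example sequence.', 'ACGTACGT')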
 
 def ecoTaxPacker(tx):
-    namelength = len(tx[3])
+    # Clean name to pure ASCII
+    name_clean = ''.join(c for c in tx[3] if ord(c) < 128)
+    name_bytes = name_clean.encode('ascii')
+    namelength = len(name_bytes)
 
     totalSize = 4 + 4 + 4 + 4 + namelength
 
-    packed = struct.pack('> I I I I I %ds' % namelength,
-                         totalSize,
-                         tx[0],
-                         tx[1],
-                         tx[2],
-                         namelength,
-                         bytes(tx[3],"ascii"))
+    packed = struct.pack('>I', totalSize)
+    packed += struct.pack('>I', tx[0])  # taxid
+    packed += struct.pack('>I', tx[1])  # rank index
+    packed += struct.pack('>I', tx[2])  # parent index
+    packed += struct.pack('>I', namelength)
+    packed += name_bytes
 
     return packed
 
 def ecoRankPacker(rank):
-    namelength = len(rank)
-
-    packed = struct.pack('> I %ds' % namelength,
-                         namelength,
-                         bytes(rank, 'ascii'))
-
+    # Clean rank to pure ASCII
+    rank_clean = ''.join(c for c in rank if ord(c) < 128)
+    rank_bytes = rank_clean.encode('ascii')
+    namelength = len(rank_bytes)
+    packed = struct.pack('>I', namelength)
+    packed += rank_bytes
     return packed
 
 def ecoNamePacker(name):
-    namelength = len(name[0])
-    classlength= len(name[1])
-    totalSize = namelength + classlength + 4 + 4 + 4 + 4
+    # Clean name and class to pure ASCII
+    name_clean = ''.join(c for c in name[0] if ord(c) < 128)
+    class_clean = ''.join(c for c in name[1] if ord(c) < 128)
+
+    name_bytes = name_clean.encode('ascii')
+    class_bytes = class_clean.encode('ascii')
+    namelength = len(name_bytes)
+    classlength = len(class_bytes)
+    totalSize = 4 + 4 + 4 + 4 + namelength + classlength
 
-    packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength),
-                         totalSize,
-                         int(name[1]=='scientific name'),
-                         namelength,
-                         classlength,
-                         name[2],
-                         bytes(name[0], 'ascii'),
-                         bytes(name[1], 'ascii'))
+    packed = struct.pack('>I', totalSize)
+    packed += struct.pack('>I', 1 if name[1] == 'scientific name' else 0)
+    packed += struct.pack('>I', namelength)
+    packed += struct.pack('>I', classlength)
+    packed += struct.pack('>I', name[2])  # taxon index
+    packed += name_bytes
+    packed += class_bytes
 
     return packed
 
-def ecoSeqWriter(file,input,taxindex,parser):
+def ecoSeqWriter(file, input_file, taxindex, parser):
     output = open(file, 'wb')
-    input = universalOpen(input)
-    inputsize = fileSize(input)
-    entries = parser(input)
+    input_stream = universalOpen(input_file)
+    inputsize = fileSize(input_stream)
+    entries = parser(input_stream)
     seqcount = 0
     skipped = []
 
-    output.write(struct.pack('> I',seqcount))
+    # Write placeholder for sequence count
+    output.write(struct.pack('>I', 0))
+
+    progressBar(0, inputsize, reset=True)
 
-    progressBar(1, inputsize,reset=True)
     for entry in entries:
         if entry['taxid'] is not None:
             try:
-                entry['taxid']=taxindex[entry['taxid']]
-            except KeyError:
+                # Convert to integer taxid
+                taxid_int = int(entry['taxid'])
+                # Lookup in taxindex
+                entry['taxid'] = taxindex.get(taxid_int, None)
+            except (ValueError, TypeError):
                 entry['taxid'] = None
 
         if entry['taxid'] is not None:
             seqcount += 1
-            output.write(ecoSeqPacker(entry))
+            packed = ecoSeqPacker(entry)
+            output.write(packed)
         else:
             skipped.append(entry['id'])
-        where = universalTell(input)
-        progressBar(where, inputsize)
-        print(" Readed sequences : %d " % seqcount, end=' ', file=sys.stderr)
 
-    print(file=sys.stderr)
-    output.seek(0,0)
-    output.write(struct.pack('> I',seqcount))
+        # Update progress
+        if inputsize is not None:
+            where = universalTell(input_stream)
+            progressBar(where, inputsize)
+        else:
+            progressBar(seqcount, seqcount + 1)  # Fake progress
+
+        sys.stderr.write(" Readed sequences : %d \r" % seqcount)
+        sys.stderr.flush()
+
+    print("\n", file=sys.stderr)
+    # Update sequence count at beginning of file
+    output.seek(0)
+    output.write(struct.pack('>I', seqcount))
 
     output.close()
     return skipped
 
 def ecoTaxWriter(file, taxonomy):
     output = open(file, 'wb')
@@ -482,31 +576,22 @@ def ecoTaxWriter(file,taxonomy):
 
 def ecoRankWriter(file, ranks):
     output = open(file, 'wb')
-    output.write(struct.pack('> I',len(ranks)))
+    rank_list = sorted(ranks.keys())
+    print(rank_list)
+    print(len(rank_list))
+    output.write(struct.pack('>I', len(rank_list)))
 
-    rankNames = list(ranks.keys())
-    rankNames.sort()
-
-    for rank in rankNames:
+    for rank in rank_list:
         output.write(ecoRankPacker(rank))
 
     output.close()
 
-def nameCmp(n1,n2):
-    name1=n1[0].upper()
-    name2=n2[0].upper()
-    if name1 < name2:
-        return -1
-    elif name1 > name2:
-        return 1
-    return 0
-
 
 def ecoNameWriter(file, names):
     output = open(file, 'wb')
     output.write(struct.pack('>I', len(names)))
 
-    names.sort(key=lambda x:x[0].upper())
+    # Sort case-insensitive
+    names.sort(key=lambda x: x[0].lower())
 
     for name in names:
         output.write(ecoNamePacker(name))
@@ -514,30 +599,32 @@ def ecoNameWriter(file,names):
     output.close()
 
 def ecoDBWriter(prefix, taxonomy, seqFileNames, parser):
-    ecoRankWriter('%s.rdx' % prefix, taxonomy[1])
-    ecoTaxWriter('%s.tdx' % prefix, taxonomy[0])
-    ecoNameWriter('%s.ndx' % prefix, taxonomy[2])
+    ecoRankWriter(prefix + '.rdx', taxonomy[1])
+    ecoTaxWriter(prefix + '.tdx', taxonomy[0])
+    ecoNameWriter(prefix + '.ndx', taxonomy[2])
 
     filecount = 0
     for filename in seqFileNames:
         filecount += 1
-        sk=ecoSeqWriter('%s_%03d.sdx' % (prefix,filecount),
-                        filename,
-                        taxonomy[3],
-                        parser)
-        if sk:
-            print("Skipped entry :", file=sys.stderr)
-            print(sk, file=sys.stderr)
+        outfile = "%s_%03d.sdx" % (prefix, filecount)
+        print("\nProcessing:", filename, file=sys.stderr)
+        skipped = ecoSeqWriter(outfile, filename, taxonomy[3], parser)
+
+        if skipped:
+            print("\nSkipped entries without valid taxonomy:", file=sys.stderr)
+            for sid in skipped:
+                print(sid, file=sys.stderr)
+        else:
+            print("All entries processed successfully", file=sys.stderr)
 
 def ecoParseOptions(arguments):
     opt = {
         'prefix': 'ecodb',
         'taxdir': 'taxdump',
-        'parser' : sequenceIteratorFactory(genbankEntryParser,
-                                           entryIterator)
+        'parser': sequenceIteratorFactory(genbankEntryParser, entryIterator)
     }
 
     try:
         o, filenames = getopt.getopt(arguments,
                                      'ht:T:n:gfe',
                                      ['help',
@@ -547,32 +634,27 @@ def ecoParseOptions(arguments):
                                       'genbank',
                                       'fasta',
                                       'embl'])
     except getopt.GetoptError as err:
         print(str(err), file=sys.stderr)
         printHelp()
         sys.exit(2)
 
     for name, value in o:
         if name in ('-h', '--help'):
             printHelp()
-            exit()
+            sys.exit()
         elif name in ('-t', '--taxonomy'):
             opt['taxmod']='dump'
             opt['taxdir'] = value
         elif name in ('-T','--taxonomy_db'):
             opt['taxmod']='db'
             opt['taxdb']=value
         elif name in ('-n', '--name'):
             opt['prefix'] = value
         elif name in ('-g', '--genbank'):
-            opt['parser']=sequenceIteratorFactory(genbankEntryParser,
-                                                  entryIterator)
+            opt['parser'] = sequenceIteratorFactory(genbankEntryParser, entryIterator)
         elif name in ('-f', '--fasta'):
-            opt['parser']=sequenceIteratorFactory(fastaEntryParser,
-                                                  fastaEntryIterator)
+            opt['parser'] = sequenceIteratorFactory(fastaEntryParser, fastaEntryIterator)
         elif name in ('-e', '--embl'):
-            opt['parser']=sequenceIteratorFactory(emblEntryParser,
-                                                  entryIterator)
+            opt['parser'] = sequenceIteratorFactory(emblEntryParser, entryIterator)
         else:
-            raise ValueError('Unknown option %s' % name)
+            raise ValueError('Unknown option: %s' % name)
 
     return opt, filenames
@@ -580,22 +662,37 @@ def printHelp():
     print("-----------------------------------")
     print(" ecoPCRFormat.py")
     print("-----------------------------------")
-    print("ecoPCRFormat.py [option] <argument>")
-    print("-----------------------------------")
-    print("-e    --embl      :[E]mbl format")
-    print("-f    --fasta     :[F]asta format")
-    print("-g    --genbank   :[G]enbank format")
-    print("-h    --help      :[H]elp - print this help")
-    print("-n    --name      :[N]ame of the new database created")
-    print("-t    --taxonomy  :[T]axonomy - path to the taxonomy database")
-    print("                  :bcp-like dump from GenBank taxonomy database.")
+    print("Converts sequence databases to ecoPCR format")
+    print("Usage: ecoPCRFormat.py [options] <files>")
+    print("Options:")
+    print("  -e, --embl              Input in EMBL format")
+    print("  -f, --fasta             Input in FASTA format")
+    print("  -g, --genbank           Input in GenBank format (default)")
+    print("  -h, --help              Show this help message")
+    print("  -n NAME, --name=NAME    Database prefix (default: ecodb)")
+    print("  -t DIR, --taxonomy=DIR  Taxonomy directory (default: taxdump)")
     print("-----------------------------------")
 
 if __name__ == '__main__':
+    try:
+        opt, filenames = ecoParseOptions(sys.argv[1:])
+
+        if not filenames:
+            print("Error: No input files specified", file=sys.stderr)
+            printHelp()
+            sys.exit(1)
+
+        print("Loading taxonomy...", file=sys.stderr)
+        taxonomy = readTaxonomyDump(opt['taxdir'])
+
+        print("Processing sequence files...", file=sys.stderr)
+        ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])
+
+        print("Database creation complete", file=sys.stderr)
+
+    except Exception as e:
+        print("Error: " + str(e), file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
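
With those pieces in place, a typical invocation of the updated script would look something like this (file names hypothetical); it writes mydb.rdx, mydb.tdx and mydb.ndx for the taxonomy, plus one mydb_NNN.sdx per input file:

    python ecoPCRFormat.py -f -t ./taxdump -n mydb reads.fasta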