diff --git a/tools/ecoPCRFormat.py b/tools/ecoPCRFormat.py
index b849588..a91f831 100755
--- a/tools/ecoPCRFormat.py
+++ b/tools/ecoPCRFormat.py
@@ -1,14 +1,16 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import print_function
 
 import re
 import gzip
 import struct
 import sys
 import time
 import getopt
-from functools import cmp_to_key
+import zlib
 
-_dbenable=False
+_dbenable = False
 
 #####
 #
@@ -19,44 +21,73 @@ _dbenable=False
 #####
 
 def universalOpen(file):
-    if isinstance(file,str):
-        if file[-3:] == '.gz':
-            rep = gzip.open(file)
+    if isinstance(file, str):
+        if file.endswith('.gz'):
+            if sys.version_info[0] < 3:
+                rep = gzip.open(file)
+            else:
+                rep = gzip.open(file, 'rt', encoding='latin-1')
         else:
-            rep = open(file)
+            rep = open(file, 'r')
     else:
        rep = file
     return rep
 
 def universalTell(file):
-    if isinstance(file, gzip.GzipFile):
-        file=file.myfileobj
+    if hasattr(file, 'myfileobj'):    # gzip file opened by name
+        return file.myfileobj.tell()
+    elif hasattr(file, 'fileobj'):    # gzip wrapping an existing file object
+        return file.fileobj.tell()
     return file.tell()
 
 def fileSize(file):
-    if isinstance(file, gzip.GzipFile):
-        file=file.myfileobj
-    pos = file.tell()
-    file.seek(0,2)
-    length = file.tell()
-    file.seek(pos,0)
+    if hasattr(file, 'myfileobj'):    # gzip file opened by name
+        f = file.myfileobj
+    elif hasattr(file, 'fileobj'):    # gzip wrapping an existing file object
+        f = file.fileobj
+    else:
+        f = file
+
+    if not hasattr(f, 'fileno'):      # not a real file: size unknown
+        return None
+
+    pos = f.tell()
+    f.seek(0, 2)
+    length = f.tell()
+    f.seek(pos, 0)
     return length
 
-def progressBar(pos,max,reset=False,delta=[]):
+def progressBar(pos, max_size, reset=False, delta=[]):
     if reset:
         del delta[:]
     if not delta:
         delta.append(time.time())
         delta.append(time.time())
-    delta[1]=time.time()
-    elapsed = delta[1]-delta[0]
-    percent = float(pos)/max * 100
-    remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent)))
-    bar = '#' * int(percent/2)
-    bar+= '|/-\\-'[pos % 5]
-    bar+= ' ' * (50 - int(percent/2))
-    sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain))
+    delta[1] = time.time()
+    elapsed = delta[1] - delta[0]
+
+    if max_size is None:
+        sys.stderr.write('\rRead sequences : %d ' % pos)
+        sys.stderr.flush()
+        return
+
+    percent = float(pos) / max_size * 100 if max_size > 0 else 0
+
+    if percent > 0:
+        remain = elapsed / percent * (100 - percent)
+        remain_str = time.strftime('%H:%M:%S', time.gmtime(int(remain)))
+    else:
+        remain_str = '?'
+
+    bar_length = 50
+    filled = int(percent / 2)
+    bar = '#' * filled
+    bar += '|/-\\'[pos % 4]
+    bar += ' ' * (bar_length - filled - 1)
+
+    sys.stderr.write('\r%5.1f %% |%s] remain : %s' % (percent, bar, remain_str))
+    sys.stderr.flush()
 
 #####
 #
@@ -66,159 +97,196 @@ def progressBar(pos,max,reset=False,delta=[]):
 #
 #####
 
-def endLessIterator(endedlist):
-    for x in endedlist:
-        yield x
-    while(1):
-        yield endedlist[-1]
-
-class ColumnFile(object):
-
-    def __init__(self,stream,sep=None,strip=True,types=None):
-        if isinstance(stream,str):
-            self._stream = open(stream)
-        else:
-            try:
-                iter(stream)
-                self._stream = stream
-            except TypeError:
-                raise ValueError('stream must be string or an iterator')
-        self._delimiter=sep
-        self._strip=strip
-        if types:
-            self._types=[x for x in types]
-            for i in range(len(self._types)):
-                if self._types[i] is bool:
-                    self._types[i]=ColumnFile.str2bool
-        else:
-            self._types=None
-
-    def str2bool(x):
-        return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
-
-    str2bool = staticmethod(str2bool)
-
-
+class endLessIterator(object):
+    def __init__(self, endedlist):
+        self.endedlist = endedlist
+        self.index = 0
+
     def __iter__(self):
         return self
-
-    def __next__(self):
+
+    def next(self):                    # Python 2 iterator protocol
+        if self.index < len(self.endedlist):
+            item = self.endedlist[self.index]
+            self.index += 1
+            return item
+        else:
+            return self.endedlist[-1]
+
+    __next__ = next                    # Python 3 iterator protocol
+
+class ColumnFile(object):
+    def __init__(self, stream, sep=None, strip=True, types=None):
+        if isinstance(stream, str):
+            self._stream = open(stream)
+        else:
+            try:
+                iter(stream)
+                self._stream = stream
+            except TypeError:
+                raise ValueError('stream must be string or an iterator')
+        self._delimiter = sep
+        self._strip = strip
+        self._types = types
+
+    @staticmethod
+    def str2bool(x):
+        val = x.strip()[0].upper()
+        if val in 'T1Y':
+            return True
+        elif val in 'F0N':
+            return False
+        raise ValueError("Invalid boolean value: %s" % x)
+
+    def __iter__(self):
+        return self
+
+    def next(self):                    # Python 2 iterator protocol
         ligne = next(self._stream)
         data = ligne.split(self._delimiter)
-        if self._strip or self._types:
+        if self._strip:
             data = [x.strip() for x in data]
-        if self._types:
-            it = endLessIterator(self._types)
-            data = [x[1](x[0]) for x in ((y,next(it)) for y in data)]
-        return data
-
-def taxonCmp(t1,t2):
-    if t1[0] < t2[0]:
-        return -1
-    elif t1[0] > t2[0]:
-        return +1
-    return 0
-
-def bsearchTaxon(taxonomy,taxid):
+        if self._types:
+            typed_data = []
+            for i, val in enumerate(data):
+                if i < len(self._types):
+                    t = self._types[i]
+                    if t is bool:
+                        typed_data.append(self.str2bool(val))
+                    else:
+                        typed_data.append(t(val))
+                else:
+                    typed_data.append(val)
+            return typed_data
+        return data
+
+    __next__ = next                    # Python 3 iterator protocol
+
+def bsearchTaxon(taxonomy, taxid):
     taxCount = len(taxonomy)
     begin = 0
-    end   = taxCount
-    oldcheck=taxCount
-    check = int(begin + end / 2)
-    while check != oldcheck and taxonomy[check][0]!=taxid :
-        if taxonomy[check][0] < taxid:
-            begin=check
-        else:
-            end=check
-        oldcheck=check
-        check = int((begin + end) / 2)
-
-
-    if taxonomy[check][0]==taxid:
-        return check
-    else:
-        return None
-
-
-
-def readNodeTable(file):
+    end = taxCount
+    oldcheck = -1
+    check = int(begin + (end - begin) / 2)
+    while check != oldcheck and check < taxCount and taxonomy[check][0] != taxid:
+        if taxonomy[check][0] < taxid:
+            begin = check
+        else:
+            end = check
+
+        oldcheck = check
+        check = int(begin + (end - begin) / 2)
+
+    if check < taxCount and taxonomy[check][0] == taxid:
+        return check
+    return None
+
+def readNodeTable(file):
     file = universalOpen(file)
-
-    nodes = ColumnFile(file,
-                       sep='|',
-                       types=(int,int,str,
-                              str,str,bool,
-                              int,bool,int,
-                              bool,bool,bool,str))
+    nodes = ColumnFile(file,
+                       sep='|',
+                       types=(int, int, str,
+                              str, str, bool,
+                              int, bool, int,
+                              bool, bool, bool, str))
     print("Reading taxonomy dump file...", file=sys.stderr)
-    taxonomy=[[n[0],n[2],n[1]] for n in nodes]
-    print("List all taxonomy rank...", file=sys.stderr)
-    ranks =list(set(x[1] for x in taxonomy))
+    sys.stderr.flush()
+
+    taxonomy = []
+    for n in nodes:
+        taxonomy.append([n[0], n[2], n[1]])
+
+    print("List all taxonomy rank...", file=sys.stderr)
+    sys.stderr.flush()
+
+    ranks = list(set(x[1] for x in taxonomy))
     ranks.sort()
-    ranks = {rank: index for index, rank in enumerate(ranks)}
-
+    rank_dict = {rank: idx for idx, rank in enumerate(ranks)}
+
     print("Sorting taxons...", file=sys.stderr)
+    sys.stderr.flush()
     taxonomy.sort(key=lambda x: x[0])
 
     print("Indexing taxonomy...", file=sys.stderr)
+    sys.stderr.flush()
+
     index = {}
-    for t in taxonomy:
-        index[t[0]]=bsearchTaxon(taxonomy, t[0])
-
+    for i, t in enumerate(taxonomy):
+        index[t[0]] = i
+
     print("Indexing parent and rank...", file=sys.stderr)
+    sys.stderr.flush()
+
     for t in taxonomy:
-        t[1]=ranks[t[1]]
-        t[2]=index[t[2]]
-
-
-    return taxonomy,ranks,index
+        t[1] = rank_dict[t[1]]
+        t[2] = index.get(t[2], None)
+
+    return taxonomy, rank_dict, index
 
 def nameIterator(file):
     file = universalOpen(file)
-    names = ColumnFile(file,
-                       sep='|',
-                       types=(int,str,
-                              str,str))
-    for taxid,name,unique,classname,white in names:
-        yield taxid,name,classname
-
+    names = ColumnFile(file,
+                       sep='|',
+                       types=(int, str, str, str))
+    for row in names:
+        if len(row) >= 4:
+            taxid, name, unique, classname = row[:4]
+            yield taxid, name, classname
+
 def mergedNodeIterator(file):
     file = universalOpen(file)
-    merged = ColumnFile(file,
-                       sep='|',
-                       types=(int,int,str))
-    for taxid,current,white in merged:
-        yield taxid,current
-
+    merged = ColumnFile(file,
+                        sep='|',
+                        types=(int, int, str))
+    for row in merged:
+        if len(row) >= 2:
+            taxid, current = row[:2]
+            yield taxid, current
+
 def deletedNodeIterator(file):
     file = universalOpen(file)
-    deleted = ColumnFile(file,
-                       sep='|',
-                       types=(int,str))
-    for taxid,white in deleted:
-        yield taxid
-
-def readTaxonomyDump(taxdir):
-    taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
-
-    print("Adding scientific name...", file=sys.stderr)
-
-    alternativeName=[]
-    for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir):
-        alternativeName.append((name,classname,index[taxid]))
-        if classname == 'scientific name':
-            taxonomy[index[taxid]].append(name)
-
+    deleted = ColumnFile(file,
+                         sep='|',
+                         types=(int, str))
+    for row in deleted:
+        if row:
+            taxid = row[0]
+            yield taxid
+
+def readTaxonomyDump(taxdir):
+    taxonomy, ranks, index = readNodeTable(taxdir + '/nodes.dmp')
+
+    # Initialize the scientific name as an empty string for each taxon
+    for tx in taxonomy:
+        tx.append("")  # Fourth element: scientific name (filled below)
+
+    print("Adding scientific name...", file=sys.stderr)
+    sys.stderr.flush()
+
+    alternativeName = []
+    for taxid, name, classname in nameIterator(taxdir + '/names.dmp'):
+        idx = index.get(taxid, None)
+        if idx is not None:
+            # Keep only ASCII characters in names
+            name_clean = ''.join(c for c in name if ord(c) < 128)
+            alternativeName.append((name_clean, classname, idx))
+            if classname == 'scientific name':
+                taxonomy[idx][3] = name_clean
+
     print("Adding taxid alias...", file=sys.stderr)
-    for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
-        index[taxid]=index[current]
-
+    sys.stderr.flush()
+    for taxid, current in mergedNodeIterator(taxdir + '/merged.dmp'):
+        if current in index:
+            index[taxid] = index[current]
+
     print("Adding deleted taxid...", file=sys.stderr)
-    for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
-        index[taxid]=None
-
-    return taxonomy,ranks,alternativeName,index
+    sys.stderr.flush()
+    for taxid in deletedNodeIterator(taxdir + '/delnodes.dmp'):
+        index[taxid] = None
+
+    return taxonomy, ranks, alternativeName, index
 
 #####
 #
@@ -230,372 +298,401 @@ def readTaxonomyDump(taxdir):
 
 def entryIterator(file):
     file = universalOpen(file)
-    rep =[]
+    rep = []
     ligne = file.readline()
     while ligne:
         rep.append(ligne)
         if ligne == '//\n':
-            rep = ''.join(rep)
-            yield rep
+            yield ''.join(rep)
             rep = []
         ligne = file.readline()
-
+    if rep:
+        yield ''.join(rep)
+
 def fastaEntryIterator(file):
     file = universalOpen(file)
-    rep =[]
+    rep = []
     ligne = file.readline()
     while ligne:
-        if ligne[0] == '>' and rep:
-            rep = ''.join(rep)
-            yield rep
+        if ligne.startswith('>') and rep:
+            yield ''.join(rep)
             rep = []
         rep.append(ligne)
         ligne = file.readline()
-
     if rep:
-        rep = ''.join(rep)
-        yield rep
-
-_cleanSeq = re.compile('[ \n0-9]+')
-
+        yield ''.join(rep)
+
+_cleanSeq = re.compile(r'[\s\d]+')  # whitespace and digits stripped from sequences
+
 def cleanSeq(seq):
-    return _cleanSeq.sub('',seq)
-
-
-_gbParseID = re.compile('(?<=^LOCUS {7})[^ ]+(?= )',re.MULTILINE)
-_gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL)
-_gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL)
-_gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
-
+    return _cleanSeq.sub('', seq)
+
+# GenBank patterns
+_gbParseID = re.compile(r'(?<=^LOCUS {7})[^ ]+(?= )', re.MULTILINE)
+_gbParseDE = re.compile(r'(?<=^DEFINITION {2}).+?\. *$', re.MULTILINE | re.DOTALL)
+_gbParseSQ = re.compile(r'(?<=^ORIGIN).+?(?=^//$)', re.MULTILINE | re.DOTALL)
+_gbParseTX = re.compile(r'(?<= /db_xref="taxon:)[0-9]+(?=")')
+
 def genbankEntryParser(entry):
-    Id = _gbParseID.findall(entry)[0]
-    De = ' '.join(_gbParseDE.findall(entry)[0].split())
-    Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper())
-    try:
-        Tx = int(_gbParseTX.findall(entry)[0])
-    except IndexError:
-        Tx = None
-    return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
+    Id_match = _gbParseID.findall(entry)
+    Id = Id_match[0] if Id_match else "UNKNOWN_ID"
 
-######################
+    De_match = _gbParseDE.findall(entry)
+    De = ' '.join(De_match[0].split()) if De_match else ""
 
-_cleanDef = re.compile('[\nDE]')
+    Sq_match = _gbParseSQ.findall(entry)
+    Sq = cleanSeq(Sq_match[0].upper()) if Sq_match else ""
 
-def cleanDef(definition):
-    return _cleanDef.sub('',definition)
+    Tx_match = _gbParseTX.findall(entry)
+    Tx = int(Tx_match[0]) if Tx_match else None
 
-_emblParseID = re.compile('(?<=^ID {3})[^ ]+(?=;)',re.MULTILINE)
-_emblParseDE = re.compile('(?<=^DE {3}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL)
-_emblParseSQ = re.compile('(?<=^     ).+?(?=^//$)',re.MULTILINE+re.DOTALL)
-_emblParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
+    # Keep only ASCII characters
+    Id = ''.join(c for c in Id if ord(c) < 128)
+    De = ''.join(c for c in De if ord(c) < 128)
+
+    return {'id': Id, 'taxid': Tx, 'definition': De, 'sequence': Sq}
+
+# EMBL patterns
+_emblParseID = re.compile(r'(?<=^ID {3})[^ ]+(?=;)', re.MULTILINE)
+_emblParseDE = re.compile(r'(?<=^DE {3}).+?\. *$', re.MULTILINE | re.DOTALL)
+_emblParseSQ = re.compile(r'(?<=^ {5}).+?(?=^//$)', re.MULTILINE | re.DOTALL)
+_emblParseTX = re.compile(r'(?<= /db_xref="taxon:)[0-9]+(?=")')
 
 def emblEntryParser(entry):
-    Id = _emblParseID.findall(entry)[0]
-    De = ' '.join(cleanDef(_emblParseDE.findall(entry)[0]).split())
-    Sq = cleanSeq(_emblParseSQ.findall(entry)[0].upper())
-    try:
-        Tx = int(_emblParseTX.findall(entry)[0])
-    except IndexError:
-        Tx = None
-    return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
+    Id_match = _emblParseID.findall(entry)
+    Id = Id_match[0] if Id_match else "UNKNOWN_ID"
 
+    De_match = _emblParseDE.findall(entry)
+    De = ' '.join(De_match[0].replace('\nDE   ', ' ').split()) if De_match else ""
 
-######################
+    Sq_match = _emblParseSQ.findall(entry)
+    Sq = cleanSeq(Sq_match[0].upper()) if Sq_match else ""
 
-_fastaSplit=re.compile(';\W*')
+    Tx_match = _emblParseTX.findall(entry)
+    Tx = int(Tx_match[0]) if Tx_match else None
+
+    # Keep only ASCII characters
+    Id = ''.join(c for c in Id if ord(c) < 128)
+    De = ''.join(c for c in De if ord(c) < 128)
+
+    return {'id': Id, 'taxid': Tx, 'definition': De, 'sequence': Sq}
+
+# FASTA processing
+_fastaSplit = re.compile(r';\W*')
 
 def parseFasta(seq):
-    seq=seq.split('\n')
-    title = seq[0].strip()[1:].split(None,1)
-    id=title[0]
-    if len(title) == 2:
-        field = _fastaSplit.split(title[1])
-    else:
-        field=[]
-    info = dict(x.split('=',1) for x in field if '=' in x)
-    definition = ' '.join([x for x in field if '=' not in x])
-    seq=(''.join([x.strip() for x in seq[1:]])).upper()
-    return id,seq,definition,info
+    lines = seq.split('\n')
+    if not lines:
+        return "", "", "", {}
+
+    header = lines[0].strip()
+    if header.startswith('>'):
+        header = header[1:]
+
+    parts = header.split(None, 1)
+    seq_id = parts[0] if parts else ""
+    description = parts[1] if len(parts) > 1 else ""
+
+    fields = _fastaSplit.split(description) if description else []
+    info = {}
+    other_desc = []
+    for field in fields:
+        if '=' in field:
+            key, val = field.split('=', 1)
+            info[key.strip()] = val.strip()
+        else:
+            other_desc.append(field)
+
+    definition = ' '.join(other_desc)
+    sequence = ''.join(x.strip() for x in lines[1:]).upper()
+
+    # Keep only ASCII characters
+    seq_id = ''.join(c for c in seq_id if ord(c) < 128)
+    definition = ''.join(c for c in definition if ord(c) < 128)
+
+    return seq_id, sequence, definition, info
 
-
 def fastaEntryParser(entry):
-    id,seq,definition,info = parseFasta(entry)
-    Tx = info.get('taxid',None)
+    seq_id, sequence, definition, info = parseFasta(entry)
+    Tx = info.get('taxid', None)
     if Tx is not None:
-        Tx=int(Tx)
-    return {'id':id,'taxid':Tx,'definition':definition,'sequence':seq}
-
+        try:
+            Tx = int(Tx)
+        except ValueError:
+            Tx = None
+    return {'id': seq_id, 'taxid': Tx, 'definition': definition, 'sequence': sequence}
 
-def sequenceIteratorFactory(entryParser,entryIterator):
+def sequenceIteratorFactory(entryParser, entryIterator):
     def sequenceIterator(file):
         for entry in entryIterator(file):
             yield entryParser(entry)
     return sequenceIterator
 
+#####
+#
+#
+# Binary writer
+#
+#
+#####
 
-def taxonomyInfo(entry,connection):
-    taxid = entry['taxid']
-    curseur = connection.cursor()
-    curseur.execute("""
-        select taxid,species,genus,family,
-               taxonomy.scientificName(taxid) as sn,
-               taxonomy.scientificName(species) as species_sn,
-               taxonomy.scientificName(genus) as genus_sn,
-               taxonomy.scientificName(family) as family_sn
-        from
-            (
-             select alias as taxid,
-                    taxonomy.getSpecies(alias) as species,
-                    taxonomy.getGenus(alias) as genus,
-                    taxonomy.getFamily(alias) as family
-             from taxonomy.aliases
-             where id=%d ) as tax
-        """ % taxid)
-    rep = curseur.fetchone()
-    entry['current_taxid']=rep[0]
-    entry['species']=rep[1]
-    entry['genus']=rep[2]
-    entry['family']=rep[3]
-    entry['scientific_name']=rep[4]
-    entry['species_sn']=rep[5]
-    entry['genus_sn']=rep[6]
-    entry['family_sn']=rep[7]
-    return entry
-
-#####
-#
-#
-# Binary writer
-#
-#
-#####
-
 def ecoSeqPacker(sq):
-
-    compactseq = gzip.zlib.compress(bytes(sq['sequence'],"ascii"),9)
+    # Ensure the sequence is ASCII upper case before compressing
+    sequence = ''.join(c for c in sq['sequence'] if ord(c) < 128).upper()
+    seq_bytes = sequence.encode('ascii')
+    compactseq = zlib.compress(seq_bytes, 9)
     cptseqlength = len(compactseq)
-    delength = len(sq['definition'])
-
-    totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
-
-    packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength),
-                         totalSize,
-                         sq['taxid'],
-                         bytes(sq['id'],"ascii"),
-                         delength,
-                         len(sq['sequence']),
-                         cptseqlength,
-                         bytes(sq['definition'],"ascii"),
-                         compactseq)
-
-    assert len(packed) == totalSize+4, "error in sequence packing"
-
+
+    # Clean the definition to pure ASCII
+    definition = ''.join(c for c in sq['definition'] if ord(c) < 128)
+    de_bytes = definition.encode('ascii')
+    delength = len(de_bytes)
+
+    # Clean the id to pure ASCII and pad/truncate to 20 bytes
+    seq_id = ''.join(c for c in sq['id'] if ord(c) < 128)
+    seq_id_bytes = seq_id.encode('ascii')
+    if len(seq_id_bytes) > 20:
+        seq_id_bytes = seq_id_bytes[:20]
+    else:
+        seq_id_bytes = seq_id_bytes.ljust(20, b'\0')
+
+    # Record size: three 4-byte lengths and the taxid after the size field,
+    # the 20-byte id, then the definition and the compressed sequence
+    totalSize = 4 + 20 + 4 + 4 + 4 + delength + cptseqlength
+
+    # Pack big-endian, in the same field order as the original format
+    packed = struct.pack('>I', totalSize)
+    packed += struct.pack('>I', sq['taxid'])
+    packed += seq_id_bytes
+    packed += struct.pack('>I', delength)
+    packed += struct.pack('>I', len(sequence))
+    packed += struct.pack('>I', cptseqlength)
+    packed += de_bytes
+    packed += compactseq
+
     return packed
 
 def ecoTaxPacker(tx):
-
-    namelength = len(tx[3])
-
+    # Clean the scientific name to pure ASCII
+    name_clean = ''.join(c for c in tx[3] if ord(c) < 128)
+    name_bytes = name_clean.encode('ascii')
+    namelength = len(name_bytes)
     totalSize = 4 + 4 + 4 + 4 + namelength
-
-    packed = struct.pack('> I I I I I %ds' % namelength,
-                         totalSize,
-                         tx[0],
-                         tx[1],
-                         tx[2],
-                         namelength,
-                         bytes(tx[3],"ascii"))
-
+
+    packed = struct.pack('>I', totalSize)
+    packed += struct.pack('>I', tx[0])  # taxid
+    packed += struct.pack('>I', tx[1])  # rank index
+    packed += struct.pack('>I', tx[2])  # parent index
+    packed += struct.pack('>I', namelength)
+    packed += name_bytes
+
     return packed
 
 def ecoRankPacker(rank):
-
-    namelength = len(rank)
-
-    packed = struct.pack('> I %ds' % namelength,
-                         namelength,
-                         bytes(rank, 'ascii'))
-
+    # Clean the rank name to pure ASCII
+    rank_clean = ''.join(c for c in rank if ord(c) < 128)
+    rank_bytes = rank_clean.encode('ascii')
+    namelength = len(rank_bytes)
+    packed = struct.pack('>I', namelength)
+    packed += rank_bytes
     return packed
-
+
 def ecoNamePacker(name):
-
-    namelength = len(name[0])
-    classlength= len(name[1])
-    totalSize =  namelength + classlength + 4 + 4 + 4 + 4
-
-    packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength),
-                         totalSize,
-                         int(name[1]=='scientific name'),
-                         namelength,
-                         classlength,
-                         name[2],
-                         bytes(name[0], 'ascii'),
-                         bytes(name[1], 'ascii'))
-
+    # Clean the name and the name class to pure ASCII
+    name_clean = ''.join(c for c in name[0] if ord(c) < 128)
+    class_clean = ''.join(c for c in name[1] if ord(c) < 128)
+
+    name_bytes = name_clean.encode('ascii')
+    class_bytes = class_clean.encode('ascii')
+    namelength = len(name_bytes)
+    classlength = len(class_bytes)
+    totalSize = 4 + 4 + 4 + 4 + namelength + classlength
+
+    packed = struct.pack('>I', totalSize)
+    packed += struct.pack('>I', 1 if name[1] == 'scientific name' else 0)
+    packed += struct.pack('>I', namelength)
+    packed += struct.pack('>I', classlength)
+    packed += struct.pack('>I', name[2])  # taxon index
+    packed += name_bytes
+    packed += class_bytes
+
     return packed
-
-def ecoSeqWriter(file,input,taxindex,parser):
-    output = open(file,'wb')
-    input = universalOpen(input)
-    inputsize = fileSize(input)
-    entries = parser(input)
-    seqcount=0
+
+def ecoSeqWriter(file, input_file, taxindex, parser):
+    output = open(file, 'wb')
+    input_stream = universalOpen(input_file)
+    inputsize = fileSize(input_stream)
+    entries = parser(input_stream)
+    seqcount = 0
     skipped = []
-
-    output.write(struct.pack('> I',seqcount))
-
-    progressBar(1, inputsize,reset=True)
+
+    # Write a placeholder for the sequence count, patched at the end
+    output.write(struct.pack('>I', 0))
+
+    progressBar(0, inputsize, reset=True)
+
     for entry in entries:
         if entry['taxid'] is not None:
             try:
-                entry['taxid']=taxindex[entry['taxid']]
-            except KeyError:
-                entry['taxid']=None
-            if entry['taxid'] is not None:
-                seqcount+=1
-                output.write(ecoSeqPacker(entry))
-            else:
-                skipped.append(entry['id'])
-            where = universalTell(input)
-            progressBar(where, inputsize)
-            print(" Readed sequences : %d     " % seqcount, end=' ', file=sys.stderr)
-        else:
-            skipped.append(entry['id'])
-
-    print(file=sys.stderr)
-    output.seek(0,0)
-    output.write(struct.pack('> I',seqcount))
-
+                # Convert to an integer taxid, then look it up in the index
+                taxid_int = int(entry['taxid'])
+                entry['taxid'] = taxindex.get(taxid_int, None)
+            except (ValueError, TypeError):
+                entry['taxid'] = None
+
+        if entry['taxid'] is not None:
+            seqcount += 1
+            output.write(ecoSeqPacker(entry))
+        else:
+            skipped.append(entry['id'])
+
+        # Update the progress display
+        if inputsize is not None:
+            where = universalTell(input_stream)
+            progressBar(where, inputsize)
+        else:
+            progressBar(seqcount, None)  # input size unknown: show a running count
+
+        sys.stderr.write(" Read sequences : %d " % seqcount)
+        sys.stderr.flush()
+
+    print(file=sys.stderr)
+    # Patch the sequence count at the beginning of the file
+    output.seek(0)
+    output.write(struct.pack('>I', seqcount))
     output.close()
+
     return skipped
-
-def ecoTaxWriter(file,taxonomy):
-    output = open(file,'wb')
-    output.write(struct.pack('> I',len(taxonomy)))
-
+
+def ecoTaxWriter(file, taxonomy):
+    output = open(file, 'wb')
+    output.write(struct.pack('>I', len(taxonomy)))
+
     for tx in taxonomy:
         output.write(ecoTaxPacker(tx))
-
+
     output.close()
-
-def ecoRankWriter(file,ranks):
-    output = open(file,'wb')
-    output.write(struct.pack('> I',len(ranks)))
-    rankNames = list(ranks.keys())
-    rankNames.sort()
-
-    for rank in rankNames:
+
+def ecoRankWriter(file, ranks):
+    output = open(file, 'wb')
+    rank_list = sorted(ranks.keys())
+    output.write(struct.pack('>I', len(rank_list)))
+
+    for rank in rank_list:
         output.write(ecoRankPacker(rank))
 
     output.close()
 
-def nameCmp(n1,n2):
-    name1=n1[0].upper()
-    name2=n2[0].upper()
-    if name1 < name2:
-        return -1
-    elif name1 > name2:
-        return 1
-    return 0
+def ecoNameWriter(file, names):
+    output = open(file, 'wb')
+    output.write(struct.pack('>I', len(names)))
 
+    # Sort names case-insensitively
+    names.sort(key=lambda x: x[0].lower())
 
-def ecoNameWriter(file,names):
-    output = open(file,'wb')
-    output.write(struct.pack('> I',len(names)))
-
-    names.sort(key=lambda x:x[0].upper())
-
     for name in names:
         output.write(ecoNamePacker(name))
 
     output.close()
-
-def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
-
-    ecoRankWriter('%s.rdx' % prefix, taxonomy[1])
-    ecoTaxWriter('%s.tdx' % prefix, taxonomy[0])
-    ecoNameWriter('%s.ndx' % prefix, taxonomy[2])
-
+
+def ecoDBWriter(prefix, taxonomy, seqFileNames, parser):
+    ecoRankWriter(prefix + '.rdx', taxonomy[1])
+    ecoTaxWriter(prefix + '.tdx', taxonomy[0])
+    ecoNameWriter(prefix + '.ndx', taxonomy[2])
+
     filecount = 0
     for filename in seqFileNames:
-        filecount+=1
-        sk=ecoSeqWriter('%s_%03d.sdx' % (prefix,filecount),
-                        filename,
-                        taxonomy[3],
-                        parser)
-        if sk:
-            print("Skipped entry :", file=sys.stderr)
-            print(sk, file=sys.stderr)
-
+        filecount += 1
+        outfile = "%s_%03d.sdx" % (prefix, filecount)
+        print("\nProcessing:", filename, file=sys.stderr)
+        skipped = ecoSeqWriter(outfile, filename, taxonomy[3], parser)
+
+        if skipped:
+            print("\nSkipped entries without valid taxonomy:", file=sys.stderr)
+            for sid in skipped:
+                print(sid, file=sys.stderr)
+        else:
+            print("All entries processed successfully", file=sys.stderr)
+
 def ecoParseOptions(arguments):
     opt = {
-        'prefix' : 'ecodb',
-        'taxdir' : 'taxdump',
-        'parser' : sequenceIteratorFactory(genbankEntryParser,
-                                           entryIterator)
-           }
-
-    o,filenames = getopt.getopt(arguments,
-                                'ht:T:n:gfe',
-                                ['help',
-                                 'taxonomy=',
-                                 'taxonomy_db=',
-                                 'name=',
-                                 'genbank',
-                                 'fasta',
-                                 'embl'])
-
-    for name,value in o:
-        if name in ('-h','--help'):
+        'prefix': 'ecodb',
+        'taxdir': 'taxdump',
+        'parser': sequenceIteratorFactory(genbankEntryParser, entryIterator)
+    }
+
+    try:
+        o, filenames = getopt.getopt(arguments,
+                                     'ht:n:gfe',
+                                     ['help',
+                                      'taxonomy=',
+                                      'name=',
+                                      'genbank',
+                                      'fasta',
+                                      'embl'])
+    except getopt.GetoptError as err:
+        print(str(err), file=sys.stderr)
+        printHelp()
+        sys.exit(2)
+
+    for name, value in o:
+        if name in ('-h', '--help'):
             printHelp()
-            exit()
-        elif name in ('-t','--taxonomy'):
-            opt['taxmod']='dump'
-            opt['taxdir']=value
-        elif name in ('-T','--taxonomy_db'):
-            opt['taxmod']='db'
-            opt['taxdb']=value
-        elif name in ('-n','--name'):
-            opt['prefix']=value
-        elif name in ('-g','--genbank'):
-            opt['parser']=sequenceIteratorFactory(genbankEntryParser,
-                                                  entryIterator)
-
-        elif name in ('-f','--fasta'):
-            opt['parser']=sequenceIteratorFactory(fastaEntryParser,
-                                                  fastaEntryIterator)
-
-        elif name in ('-e','--embl'):
-            opt['parser']=sequenceIteratorFactory(emblEntryParser,
-                                                  entryIterator)
+            sys.exit()
+        elif name in ('-t', '--taxonomy'):
+            opt['taxdir'] = value
+        elif name in ('-n', '--name'):
+            opt['prefix'] = value
+        elif name in ('-g', '--genbank'):
+            opt['parser'] = sequenceIteratorFactory(genbankEntryParser, entryIterator)
+        elif name in ('-f', '--fasta'):
+            opt['parser'] = sequenceIteratorFactory(fastaEntryParser, fastaEntryIterator)
+        elif name in ('-e', '--embl'):
+            opt['parser'] = sequenceIteratorFactory(emblEntryParser, entryIterator)
         else:
-            raise ValueError('Unknown option %s' % name)
+            raise ValueError('Unknown option: %s' % name)
 
-    return opt,filenames
+    return opt, filenames
 
 def printHelp():
     print("-----------------------------------")
     print(" ecoPCRFormat.py")
     print("-----------------------------------")
-    print("ecoPCRFormat.py [option] <argument>")
-    print("-----------------------------------")
-    print("-e    --embl        :[E]mbl format")
-    print("-f    --fasta       :[F]asta format")
-    print("-g    --genbank     :[G]enbank format")
-    print("-h    --help        :[H]elp - print this help")
-    print("-n    --name        :[N]ame of the new database created")
-    print("-t    --taxonomy    :[T]axonomy - path to the taxonomy database")
-    print("                    :bcp-like dump from GenBank taxonomy database.")
+    print("Converts sequence databases to ecoPCR format")
+    print("Usage: ecoPCRFormat.py [options] <files>")
+    print("Options:")
+    print("  -e, --embl              Input in EMBL format")
+    print("  -f, --fasta             Input in FASTA format")
+    print("  -g, --genbank           Input in GenBank format (default)")
+    print("  -h, --help              Show this help message")
+    print("  -n NAME, --name=NAME    Database prefix (default: ecodb)")
+    print("  -t DIR, --taxonomy=DIR  Taxonomy dump directory (default: taxdump)")
    print("-----------------------------------")
 
 if __name__ == '__main__':
-
-    opt,filenames = ecoParseOptions(sys.argv[1:])
-
-    taxonomy = readTaxonomyDump(opt['taxdir'])
-
-    ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])
-
+    try:
+        opt, filenames = ecoParseOptions(sys.argv[1:])
+
+        if not filenames:
+            print("Error: No input files specified", file=sys.stderr)
+            printHelp()
+            sys.exit(1)
+
+        print("Loading taxonomy...", file=sys.stderr)
+        taxonomy = readTaxonomyDump(opt['taxdir'])
+
+        print("Processing sequence files...", file=sys.stderr)
+        ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])
+
+        print("Database creation complete", file=sys.stderr)
+
+    except Exception as e:
+        print("Error: " + str(e), file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
\ No newline at end of file
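
A minimal round-trip sketch, not part of the patch, for sanity-checking the .sdx record layout that the rewritten ecoSeqPacker emits: big-endian '>I' fields (record size, taxid index, definition length, raw sequence length, compressed length) around a 20-byte NUL-padded id, followed by the definition bytes and the zlib-compressed sequence. The helper name read_sdx_records is hypothetical; the file name matches the default 'ecodb' prefix.

    import struct
    import zlib

    def read_sdx_records(path):
        """Yield (id, taxid_index, definition, sequence) from an .sdx file."""
        with open(path, 'rb') as f:
            (count,) = struct.unpack('>I', f.read(4))      # sequence count header
            for _ in range(count):
                (total,) = struct.unpack('>I', f.read(4))  # record size, excluding this field
                body = f.read(total)
                (taxid,) = struct.unpack_from('>I', body, 0)
                seq_id = body[4:24].rstrip(b'\0').decode('ascii')
                delen, rawlen, cptlen = struct.unpack_from('>III', body, 24)
                definition = body[36:36 + delen].decode('ascii')
                seq = zlib.decompress(body[36 + delen:36 + delen + cptlen]).decode('ascii')
                assert len(seq) == rawlen, "sequence length field mismatch"
                yield seq_id, taxid, definition, seq

    for rec in read_sdx_records('ecodb_001.sdx'):
        print(rec[0], rec[1], len(rec[3]))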