From 6cd114113052d91c5c9ffac0b5f3ff5b7a59faaa Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 2 May 2007 10:10:27 +0000 Subject: [PATCH] Add capacity to format fasta files, with taxonomical data described in title line in the following format : [taxon: ####] where #### is a taxid from NCBI taxonomy. It can be as precise as possible (species, genus or even root) git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@13 60f365c0-8329-0410-b2a4-ec073aeeaa1d --- tools/ecoPCRFormat.py | 75 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/tools/ecoPCRFormat.py b/tools/ecoPCRFormat.py index 7324212..2d86065 100755 --- a/tools/ecoPCRFormat.py +++ b/tools/ecoPCRFormat.py @@ -2,7 +2,6 @@ import re import gzip -import psycopg2 import struct import sys import time @@ -233,7 +232,20 @@ def entryIterator(file): rep = ''.join(rep) yield rep rep = [] - + +def fastaEntryIterator(file): + file = universalOpen(file) + rep =[] + for ligne in file: + if ligne[0] == '>' and rep: + rep = ''.join(rep) + yield rep + rep = [] + rep.append(ligne) + if rep: + rep = ''.join(rep) + yield rep + _cleanSeq = re.compile('[ \n0-9]+') def cleanSeq(seq): @@ -245,7 +257,7 @@ _gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',re.MULTILINE+re.D _gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL) _gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")') -def genbankParser(entry): +def genbankEntryParser(entry): Id = _gbParseID.findall(entry)[0] De = ' '.join(_gbParseDE.findall(entry)[0].split()) Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper()) @@ -255,9 +267,34 @@ def genbankParser(entry): Tx = None return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq} -def sequenceIterator(file,parser): - for entry in entryIterator(file): - yield parser(entry) +_fastaParseID = re.compile('(?<=^>)[^ ]+') +_fastaParseDE = re.compile('(?<=^>).+',) +_fastaParseSQ = re.compile('^[^>].+',re.MULTILINE+re.DOTALL) +_fastaParseTX = re.compile('(?<=[[Tt]axon:) *[0-9]+ *(?=])') + +def fastaEntryParser(entry): + Id = _fastaParseID.findall(entry)[0] + De = _fastaParseDE.findall(entry)[0].split(None,1)[1:] + if not De: + De='' + else: + De=De[0] + Sq = cleanSeq(_fastaParseSQ.findall(entry)[0].upper()) + try: + Tx = int(_fastaParseTX.findall(entry)[0]) + except IndexError: + Tx = None + + return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq} + + + +def sequenceIteratorFactory(entryParser,entryIterator): + def sequenceIterator(file): + for entry in entryIterator(file): + yield entryParser(entry) + return sequenceIterator + def taxonomyInfo(entry,connection): taxid = entry['taxid'] @@ -345,11 +382,11 @@ def ecoRankPacker(rank): return packed -def ecoSeqWriter(file,input,taxindex,parser=genbankParser): +def ecoSeqWriter(file,input,taxindex,parser): output = open(file,'wb') input = universalOpen(input) inputsize = fileSize(input) - entries = sequenceIterator(input, parser) + entries = parser(input) seqcount=0 skipped = [] @@ -358,7 +395,10 @@ def ecoSeqWriter(file,input,taxindex,parser=genbankParser): progressBar(1, inputsize,reset=True) for entry in entries: if entry['taxid'] is not None: - entry['taxid']=taxindex[entry['taxid']] + try: + entry['taxid']=taxindex[entry['taxid']] + except KeyError: + entry['taxid']=None if entry['taxid'] is not None: seqcount+=1 output.write(ecoSeqPacker(entry)) @@ -399,7 +439,7 @@ def ecoRankWriter(file,ranks): output.close() -def ecoDBWriter(prefix,taxonomy,seqFileNames,parser=genbankParser): +def ecoDBWriter(prefix,taxonomy,seqFileNames,parser): ecoRankWriter('%s.rdx' % prefix, taxonomy[1]) ecoTaxWriter('%s.tdx' % prefix, taxonomy[0]) @@ -419,15 +459,17 @@ def ecoParseOptions(arguments): opt = { 'prefix' : 'ecodb', 'taxdir' : 'taxdump', - 'parser' : genbankParser + 'parser' : sequenceIteratorFactory(genbankEntryParser, + entryIterator) } o,filenames = getopt.getopt(arguments, - 'ht:n:g', + 'ht:n:gf', ['help', 'taxonomy=', 'name=', - 'genbank']) + 'genbank', + 'fasta']) for name,value in o: if name in ('-h','--help'): @@ -437,7 +479,12 @@ def ecoParseOptions(arguments): elif name in ('-n','--name'): opt['prefix']=value elif name in ('-g','--genbank'): - opt['parser']=genbankParser + opt['parser']=sequenceIteratorFactory(genbankEntryParser, + entryIterator + ) + elif name in ('-f','--fasta'): + opt['parser']=sequenceIteratorFactory(fastaEntryParser, + fastaEntryIterator) else: raise ValueError,'Unknown option %s' % name