From fc26a9fda38c70c55d6511768f09765302d72b22 Mon Sep 17 00:00:00 2001
From: Eric Coissac
Date: Fri, 15 Jun 2007 13:40:14 +0000
Subject: [PATCH] git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/branches/refactoring@67 60f365c0-8329-0410-b2a4-ec073aeeaa1d

---
Reviewer notes (free text between the "---" marker and the first
"diff --git" line; it is not part of the commit message):

* Purpose of the patch: add an EMBL flat-file entry parser (cleanDef,
  emblEntryParser), expose it through a new -e/--embl option in
  ecoParseOptions, and make -h/--help print a usage text (printHelp)
  and stop instead of doing nothing.

* Layout: this copy of the patch had lost its line breaks and its
  indentation. One diff line per row has been restored. Indentation and
  the position of blank context lines are a reconstruction that merely
  matches the hunk counts; check them against tools/ecoPCRFormat.py
  before applying. Runs of spaces inside string literals (help text,
  the _emblParseSQ look-behind) are kept exactly as they appeared here
  and may need the same check.

* Fix in an added line, _cleanDef: the pattern '[\nDE]' is a character
  class, so it removed every "D" and every "E" of the definition text
  along with the newlines ("DEAD box" became "A box"). The pattern now
  removes newlines and a "DE" tag only when it starts a line and is
  followed by three spaces.

* Fix in an added line, help branch: exit() is the interactive helper
  installed by the site module; sys.exit() is used instead. The file
  already references sys (sys.argv in the __main__ section).

* Both fixes replace one added line by one added line, so the hunk
  headers and the diffstat are unchanged. The blob ids on the "index"
  line still describe the originally recorded post-image.

 tools/ecoPCRFormat.py | 57 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 6 deletions(-)

diff --git a/tools/ecoPCRFormat.py b/tools/ecoPCRFormat.py
index 2d86065..9a3205d 100755
--- a/tools/ecoPCRFormat.py
+++ b/tools/ecoPCRFormat.py
@@ -267,6 +267,30 @@ def genbankEntryParser(entry):
         Tx = None
     return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
 
+######################
+
+_cleanDef = re.compile('\n|^DE(?= {3})',re.MULTILINE)  # newlines + leading DE tags only
+
+def cleanDef(definition):
+    return _cleanDef.sub('',definition)
+
+_emblParseID = re.compile('(?<=^ID {3})[^ ]+(?=;)',re.MULTILINE)
+_emblParseDE = re.compile('(?<=^DE {3}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL)
+_emblParseSQ = re.compile('(?<=^ ).+?(?=^//$)',re.MULTILINE+re.DOTALL)
+_emblParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
+
+def emblEntryParser(entry):
+    Id = _emblParseID.findall(entry)[0]
+    De = ' '.join(cleanDef(_emblParseDE.findall(entry)[0]).split())
+    Sq = cleanSeq(_emblParseSQ.findall(entry)[0].upper())
+    try:
+        Tx = int(_emblParseTX.findall(entry)[0])
+    except IndexError:
+        Tx = None
+    return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
+
+######################
+
 _fastaParseID = re.compile('(?<=^>)[^ ]+')
 _fastaParseDE = re.compile('(?<=^>).+',)
 _fastaParseSQ = re.compile('^[^>].+',re.MULTILINE+re.DOTALL)
@@ -464,37 +488,58 @@ def ecoParseOptions(arguments):
            }
 
     o,filenames = getopt.getopt(arguments,
-                                'ht:n:gf',
+                                'ht:n:gfe',
                                 ['help',
                                  'taxonomy=',
                                  'name=',
                                  'genbank',
-                                 'fasta'])
+                                 'fasta',
+                                 'embl'])
 
     for name,value in o:
         if name in ('-h','--help'):
-            pass
+            printHelp()
+            sys.exit()  # not exit(): that name is only a site-module helper
         elif name in ('-t','--taxonomy'):
             opt['taxdir']=value
         elif name in ('-n','--name'):
             opt['prefix']=value
         elif name in ('-g','--genbank'):
             opt['parser']=sequenceIteratorFactory(genbankEntryParser,
-                                                  entryIterator
-                                                  )
+                                                  entryIterator)
+
         elif name in ('-f','--fasta'):
             opt['parser']=sequenceIteratorFactory(fastaEntryParser,
                                                   fastaEntryIterator)
+
+        elif name in ('-e','--embl'):
+            opt['parser']=sequenceIteratorFactory(emblEntryParser,
+                                                  entryIterator)
         else:
             raise ValueError,'Unknown option %s' % name
 
     return opt,filenames
 
 
+def printHelp():
+    print "-----------------------------------"
+    print " ecoPCRFormat.py"
+    print "-----------------------------------"
+    print "ecoPCRFormat.py [option] "
+    print "-----------------------------------"
+    print "-e --embl :[E]mbl format file name"
+    print "-f --fasta :[F]asta format file name"
+    print "-g --genbank :[G]enbank format file name"
+    print "-h --help :[H]elp - print this help"
+    print "-n --name :[N]ame of the new database created"
+    print "-t --taxonomy :[T]axonomy - path to the taxonomy database"
+    print " :bcp-like dump from GenBank taxonomy database."
+    print "-----------------------------------"
+
 if __name__ == '__main__':
 
     opt,filenames = ecoParseOptions(sys.argv[1:])
 
     taxonomy = readTaxonomyDump(opt['taxdir'])
-
+
     ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])