Add capacity to format fasta files, with taxonomical data described in title line in the following format :

[taxon: ####] where #### is a taxid from NCBI taxonomy. It can be as precise as possible (species, genus or even root)
		

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@13 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
2007-05-02 10:10:27 +00:00
parent 420cdb877b
commit 6cd1141130

View File

@ -2,7 +2,6 @@
import re import re
import gzip import gzip
import psycopg2
import struct import struct
import sys import sys
import time import time
@ -233,7 +232,20 @@ def entryIterator(file):
rep = ''.join(rep) rep = ''.join(rep)
yield rep yield rep
rep = [] rep = []
def fastaEntryIterator(file):
file = universalOpen(file)
rep =[]
for ligne in file:
if ligne[0] == '>' and rep:
rep = ''.join(rep)
yield rep
rep = []
rep.append(ligne)
if rep:
rep = ''.join(rep)
yield rep
_cleanSeq = re.compile('[ \n0-9]+') _cleanSeq = re.compile('[ \n0-9]+')
def cleanSeq(seq): def cleanSeq(seq):
@ -245,7 +257,7 @@ _gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',re.MULTILINE+re.D
_gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL) _gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL)
_gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")') _gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
def genbankParser(entry): def genbankEntryParser(entry):
Id = _gbParseID.findall(entry)[0] Id = _gbParseID.findall(entry)[0]
De = ' '.join(_gbParseDE.findall(entry)[0].split()) De = ' '.join(_gbParseDE.findall(entry)[0].split())
Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper()) Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper())
@ -255,9 +267,34 @@ def genbankParser(entry):
Tx = None Tx = None
return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq} return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
def sequenceIterator(file,parser): _fastaParseID = re.compile('(?<=^>)[^ ]+')
for entry in entryIterator(file): _fastaParseDE = re.compile('(?<=^>).+',)
yield parser(entry) _fastaParseSQ = re.compile('^[^>].+',re.MULTILINE+re.DOTALL)
_fastaParseTX = re.compile('(?<=[[Tt]axon:) *[0-9]+ *(?=])')
def fastaEntryParser(entry):
Id = _fastaParseID.findall(entry)[0]
De = _fastaParseDE.findall(entry)[0].split(None,1)[1:]
if not De:
De=''
else:
De=De[0]
Sq = cleanSeq(_fastaParseSQ.findall(entry)[0].upper())
try:
Tx = int(_fastaParseTX.findall(entry)[0])
except IndexError:
Tx = None
return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
def sequenceIteratorFactory(entryParser,entryIterator):
def sequenceIterator(file):
for entry in entryIterator(file):
yield entryParser(entry)
return sequenceIterator
def taxonomyInfo(entry,connection): def taxonomyInfo(entry,connection):
taxid = entry['taxid'] taxid = entry['taxid']
@ -345,11 +382,11 @@ def ecoRankPacker(rank):
return packed return packed
def ecoSeqWriter(file,input,taxindex,parser=genbankParser): def ecoSeqWriter(file,input,taxindex,parser):
output = open(file,'wb') output = open(file,'wb')
input = universalOpen(input) input = universalOpen(input)
inputsize = fileSize(input) inputsize = fileSize(input)
entries = sequenceIterator(input, parser) entries = parser(input)
seqcount=0 seqcount=0
skipped = [] skipped = []
@ -358,7 +395,10 @@ def ecoSeqWriter(file,input,taxindex,parser=genbankParser):
progressBar(1, inputsize,reset=True) progressBar(1, inputsize,reset=True)
for entry in entries: for entry in entries:
if entry['taxid'] is not None: if entry['taxid'] is not None:
entry['taxid']=taxindex[entry['taxid']] try:
entry['taxid']=taxindex[entry['taxid']]
except KeyError:
entry['taxid']=None
if entry['taxid'] is not None: if entry['taxid'] is not None:
seqcount+=1 seqcount+=1
output.write(ecoSeqPacker(entry)) output.write(ecoSeqPacker(entry))
@ -399,7 +439,7 @@ def ecoRankWriter(file,ranks):
output.close() output.close()
def ecoDBWriter(prefix,taxonomy,seqFileNames,parser=genbankParser): def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
ecoRankWriter('%s.rdx' % prefix, taxonomy[1]) ecoRankWriter('%s.rdx' % prefix, taxonomy[1])
ecoTaxWriter('%s.tdx' % prefix, taxonomy[0]) ecoTaxWriter('%s.tdx' % prefix, taxonomy[0])
@ -419,15 +459,17 @@ def ecoParseOptions(arguments):
opt = { opt = {
'prefix' : 'ecodb', 'prefix' : 'ecodb',
'taxdir' : 'taxdump', 'taxdir' : 'taxdump',
'parser' : genbankParser 'parser' : sequenceIteratorFactory(genbankEntryParser,
entryIterator)
} }
o,filenames = getopt.getopt(arguments, o,filenames = getopt.getopt(arguments,
'ht:n:g', 'ht:n:gf',
['help', ['help',
'taxonomy=', 'taxonomy=',
'name=', 'name=',
'genbank']) 'genbank',
'fasta'])
for name,value in o: for name,value in o:
if name in ('-h','--help'): if name in ('-h','--help'):
@ -437,7 +479,12 @@ def ecoParseOptions(arguments):
elif name in ('-n','--name'): elif name in ('-n','--name'):
opt['prefix']=value opt['prefix']=value
elif name in ('-g','--genbank'): elif name in ('-g','--genbank'):
opt['parser']=genbankParser opt['parser']=sequenceIteratorFactory(genbankEntryParser,
entryIterator
)
elif name in ('-f','--fasta'):
opt['parser']=sequenceIteratorFactory(fastaEntryParser,
fastaEntryIterator)
else: else:
raise ValueError,'Unknown option %s' % name raise ValueError,'Unknown option %s' % name