Add the capacity to format FASTA files, with taxonomic data described in the title line in the following format:

[taxon: ####] where #### is a taxid from the NCBI taxonomy. It can be as precise as desired (a species, a genus, or even the root).

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@13 60f365c0-8329-0410-b2a4-ec073aeeaa1d
2007-05-02 10:10:27 +00:00
parent 420cdb877b
commit 6cd1141130
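For illustration only (not part of the commit), a FASTA title line carrying the tag described in the commit message might look like the record below; the identifier and description are hypothetical, and 9606 is the NCBI taxid of Homo sapiens:

>seq01 Homo sapiens COI fragment [taxon: 9606]
ACGTACGTACGTACGTACGT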


@@ -2,7 +2,6 @@
 import re
 import gzip
-import psycopg2
 import struct
 import sys
 import time
@@ -233,7 +232,20 @@ def entryIterator(file):
             rep = ''.join(rep)
             yield rep
             rep = []
 
+def fastaEntryIterator(file):
+    file = universalOpen(file)
+    rep =[]
+    for ligne in file:
+        if ligne[0] == '>' and rep:
+            rep = ''.join(rep)
+            yield rep
+            rep = []
+        rep.append(ligne)
+    if rep:
+        rep = ''.join(rep)
+        yield rep
+
 _cleanSeq = re.compile('[ \n0-9]+')
 
 def cleanSeq(seq):
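A minimal usage sketch of the new fastaEntryIterator generator, not part of the commit: it yields each FASTA record (title line plus sequence lines) as one raw string. It assumes the code runs in the same module as the functions above, and the input file name is made up.

# Illustration only -- 'test.fasta' is a hypothetical input file.
for entry in fastaEntryIterator('test.fasta'):
    # each entry is one raw record: the '>' title line plus its sequence lines
    print(entry.split('\n', 1)[0])          # show only the title line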
@@ -245,7 +257,7 @@ _gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',re.MULTILINE+re.D
 _gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL)
 _gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
 
-def genbankParser(entry):
+def genbankEntryParser(entry):
     Id = _gbParseID.findall(entry)[0]
     De = ' '.join(_gbParseDE.findall(entry)[0].split())
     Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper())
@@ -255,9 +267,34 @@ def genbankParser(entry):
         Tx = None
     return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
 
-def sequenceIterator(file,parser):
-    for entry in entryIterator(file):
-        yield parser(entry)
+_fastaParseID = re.compile('(?<=^>)[^ ]+')
+_fastaParseDE = re.compile('(?<=^>).+',)
+_fastaParseSQ = re.compile('^[^>].+',re.MULTILINE+re.DOTALL)
+_fastaParseTX = re.compile('(?<=[[Tt]axon:) *[0-9]+ *(?=])')
+
+def fastaEntryParser(entry):
+    Id = _fastaParseID.findall(entry)[0]
+    De = _fastaParseDE.findall(entry)[0].split(None,1)[1:]
+    if not De:
+        De=''
+    else:
+        De=De[0]
+    Sq = cleanSeq(_fastaParseSQ.findall(entry)[0].upper())
+    try:
+        Tx = int(_fastaParseTX.findall(entry)[0])
+    except IndexError:
+        Tx = None
+    return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
+
+def sequenceIteratorFactory(entryParser,entryIterator):
+    def sequenceIterator(file):
+        for entry in entryIterator(file):
+            yield entryParser(entry)
+    return sequenceIterator
+
 def taxonomyInfo(entry,connection):
     taxid = entry['taxid']
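A sketch of what the new FASTA parsing path produces, not part of the commit; the record is invented, 9606 is the NCBI taxid of Homo sapiens, and it assumes the code runs in the same module as the functions above.

# Illustration only -- input record and file name are hypothetical.
entry = '>seq01 Homo sapiens COI fragment [taxon: 9606]\nACGTACGTACGT\n'
print(fastaEntryParser(entry))
# expected (key order may vary):
# {'id': 'seq01', 'taxid': 9606,
#  'definition': 'Homo sapiens COI fragment [taxon: 9606]',
#  'sequence': 'ACGTACGTACGT'}

# sequenceIteratorFactory binds an entry iterator to an entry parser and
# returns a one-argument iterator over parsed records:
fastaIterator = sequenceIteratorFactory(fastaEntryParser, fastaEntryIterator)
for seq in fastaIterator('test.fasta'):      # 'test.fasta' is hypothetical
    print('%s %s' % (seq['id'], seq['taxid']))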
@@ -345,11 +382,11 @@ def ecoRankPacker(rank):
     return packed
 
-def ecoSeqWriter(file,input,taxindex,parser=genbankParser):
+def ecoSeqWriter(file,input,taxindex,parser):
     output = open(file,'wb')
     input = universalOpen(input)
     inputsize = fileSize(input)
-    entries = sequenceIterator(input, parser)
+    entries = parser(input)
 
     seqcount=0
     skipped = []
@@ -358,7 +395,10 @@ def ecoSeqWriter(file,input,taxindex,parser=genbankParser):
     progressBar(1, inputsize,reset=True)
     for entry in entries:
         if entry['taxid'] is not None:
-            entry['taxid']=taxindex[entry['taxid']]
+            try:
+                entry['taxid']=taxindex[entry['taxid']]
+            except KeyError:
+                entry['taxid']=None
         if entry['taxid'] is not None:
             seqcount+=1
             output.write(ecoSeqPacker(entry))
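The hunk above changes the taxid lookup so that a taxid absent from taxindex no longer raises KeyError: the entry's taxid is reset to None and the record is simply not written. A minimal sketch of that behaviour with made-up data (taxindex is assumed here to map NCBI taxids to internal indices):

# Illustration only -- hypothetical index and entry.
taxindex = {9606: 12}
entry = {'taxid': 4242424}               # taxid not present in the index
try:
    entry['taxid'] = taxindex[entry['taxid']]
except KeyError:
    entry['taxid'] = None                # record is skipped instead of crashing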
@@ -399,7 +439,7 @@ def ecoRankWriter(file,ranks):
     output.close()
 
-def ecoDBWriter(prefix,taxonomy,seqFileNames,parser=genbankParser):
+def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
     ecoRankWriter('%s.rdx' % prefix, taxonomy[1])
     ecoTaxWriter('%s.tdx' % prefix, taxonomy[0])
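Since ecoDBWriter no longer defaults to a GenBank parser, callers have to pass a sequence iterator explicitly. A hedged sketch, assuming (as the names suggest, not shown in this diff) that seqFileNames is a list of input files and that a taxonomy tuple has already been loaded elsewhere in the script:

# Hypothetical call -- 'taxonomy', the prefix and the file names are placeholders.
fastaIterator = sequenceIteratorFactory(fastaEntryParser, fastaEntryIterator)
ecoDBWriter('mydb', taxonomy, ['seqs1.fasta', 'seqs2.fasta'], fastaIterator)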
@@ -419,15 +459,17 @@ def ecoParseOptions(arguments):
     opt = {
         'prefix' : 'ecodb',
         'taxdir' : 'taxdump',
-        'parser' : genbankParser
+        'parser' : sequenceIteratorFactory(genbankEntryParser,
+                                           entryIterator)
         }
 
     o,filenames = getopt.getopt(arguments,
-                                'ht:n:g',
+                                'ht:n:gf',
                                 ['help',
                                  'taxonomy=',
                                  'name=',
-                                 'genbank'])
+                                 'genbank',
+                                 'fasta'])
 
     for name,value in o:
         if name in ('-h','--help'):
@@ -437,7 +479,12 @@ def ecoParseOptions(arguments):
         elif name in ('-n','--name'):
             opt['prefix']=value
         elif name in ('-g','--genbank'):
-            opt['parser']=genbankParser
+            opt['parser']=sequenceIteratorFactory(genbankEntryParser,
+                                                  entryIterator
+                                                  )
+        elif name in ('-f','--fasta'):
+            opt['parser']=sequenceIteratorFactory(fastaEntryParser,
+                                                  fastaEntryIterator)
         else:
             raise ValueError,'Unknown option %s' % name
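With the new -f/--fasta switch, the formatter can presumably be invoked on a FASTA file along these lines; the script name ecoPCRFormat.py, the taxdump directory and the input file name are assumptions, as they do not appear in this diff:

python ecoPCRFormat.py -f -t ./taxdump -n mydb sequences.fasta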