Add capacity to format fasta files, with taxonomical data described in title line in the following format :
[taxon: ####] where #### is a taxid from NCBI taxonomy. It can be as precise as possible (species, genus or even root) git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@13 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
@ -2,7 +2,6 @@
|
||||
|
||||
import re
|
||||
import gzip
|
||||
import psycopg2
|
||||
import struct
|
||||
import sys
|
||||
import time
|
||||
@ -233,7 +232,20 @@ def entryIterator(file):
|
||||
rep = ''.join(rep)
|
||||
yield rep
|
||||
rep = []
|
||||
|
||||
|
||||
def fastaEntryIterator(file):
|
||||
file = universalOpen(file)
|
||||
rep =[]
|
||||
for ligne in file:
|
||||
if ligne[0] == '>' and rep:
|
||||
rep = ''.join(rep)
|
||||
yield rep
|
||||
rep = []
|
||||
rep.append(ligne)
|
||||
if rep:
|
||||
rep = ''.join(rep)
|
||||
yield rep
|
||||
|
||||
_cleanSeq = re.compile('[ \n0-9]+')
|
||||
|
||||
def cleanSeq(seq):
|
||||
@ -245,7 +257,7 @@ _gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',re.MULTILINE+re.D
|
||||
_gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL)
|
||||
_gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
|
||||
|
||||
def genbankParser(entry):
|
||||
def genbankEntryParser(entry):
|
||||
Id = _gbParseID.findall(entry)[0]
|
||||
De = ' '.join(_gbParseDE.findall(entry)[0].split())
|
||||
Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper())
|
||||
@ -255,9 +267,34 @@ def genbankParser(entry):
|
||||
Tx = None
|
||||
return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
|
||||
|
||||
def sequenceIterator(file,parser):
|
||||
for entry in entryIterator(file):
|
||||
yield parser(entry)
|
||||
_fastaParseID = re.compile('(?<=^>)[^ ]+')
|
||||
_fastaParseDE = re.compile('(?<=^>).+',)
|
||||
_fastaParseSQ = re.compile('^[^>].+',re.MULTILINE+re.DOTALL)
|
||||
_fastaParseTX = re.compile('(?<=[[Tt]axon:) *[0-9]+ *(?=])')
|
||||
|
||||
def fastaEntryParser(entry):
|
||||
Id = _fastaParseID.findall(entry)[0]
|
||||
De = _fastaParseDE.findall(entry)[0].split(None,1)[1:]
|
||||
if not De:
|
||||
De=''
|
||||
else:
|
||||
De=De[0]
|
||||
Sq = cleanSeq(_fastaParseSQ.findall(entry)[0].upper())
|
||||
try:
|
||||
Tx = int(_fastaParseTX.findall(entry)[0])
|
||||
except IndexError:
|
||||
Tx = None
|
||||
|
||||
return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
|
||||
|
||||
|
||||
|
||||
def sequenceIteratorFactory(entryParser,entryIterator):
|
||||
def sequenceIterator(file):
|
||||
for entry in entryIterator(file):
|
||||
yield entryParser(entry)
|
||||
return sequenceIterator
|
||||
|
||||
|
||||
def taxonomyInfo(entry,connection):
|
||||
taxid = entry['taxid']
|
||||
@ -345,11 +382,11 @@ def ecoRankPacker(rank):
|
||||
return packed
|
||||
|
||||
|
||||
def ecoSeqWriter(file,input,taxindex,parser=genbankParser):
|
||||
def ecoSeqWriter(file,input,taxindex,parser):
|
||||
output = open(file,'wb')
|
||||
input = universalOpen(input)
|
||||
inputsize = fileSize(input)
|
||||
entries = sequenceIterator(input, parser)
|
||||
entries = parser(input)
|
||||
seqcount=0
|
||||
skipped = []
|
||||
|
||||
@ -358,7 +395,10 @@ def ecoSeqWriter(file,input,taxindex,parser=genbankParser):
|
||||
progressBar(1, inputsize,reset=True)
|
||||
for entry in entries:
|
||||
if entry['taxid'] is not None:
|
||||
entry['taxid']=taxindex[entry['taxid']]
|
||||
try:
|
||||
entry['taxid']=taxindex[entry['taxid']]
|
||||
except KeyError:
|
||||
entry['taxid']=None
|
||||
if entry['taxid'] is not None:
|
||||
seqcount+=1
|
||||
output.write(ecoSeqPacker(entry))
|
||||
@ -399,7 +439,7 @@ def ecoRankWriter(file,ranks):
|
||||
|
||||
output.close()
|
||||
|
||||
def ecoDBWriter(prefix,taxonomy,seqFileNames,parser=genbankParser):
|
||||
def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
|
||||
|
||||
ecoRankWriter('%s.rdx' % prefix, taxonomy[1])
|
||||
ecoTaxWriter('%s.tdx' % prefix, taxonomy[0])
|
||||
@ -419,15 +459,17 @@ def ecoParseOptions(arguments):
|
||||
opt = {
|
||||
'prefix' : 'ecodb',
|
||||
'taxdir' : 'taxdump',
|
||||
'parser' : genbankParser
|
||||
'parser' : sequenceIteratorFactory(genbankEntryParser,
|
||||
entryIterator)
|
||||
}
|
||||
|
||||
o,filenames = getopt.getopt(arguments,
|
||||
'ht:n:g',
|
||||
'ht:n:gf',
|
||||
['help',
|
||||
'taxonomy=',
|
||||
'name=',
|
||||
'genbank'])
|
||||
'genbank',
|
||||
'fasta'])
|
||||
|
||||
for name,value in o:
|
||||
if name in ('-h','--help'):
|
||||
@ -437,7 +479,12 @@ def ecoParseOptions(arguments):
|
||||
elif name in ('-n','--name'):
|
||||
opt['prefix']=value
|
||||
elif name in ('-g','--genbank'):
|
||||
opt['parser']=genbankParser
|
||||
opt['parser']=sequenceIteratorFactory(genbankEntryParser,
|
||||
entryIterator
|
||||
)
|
||||
elif name in ('-f','--fasta'):
|
||||
opt['parser']=sequenceIteratorFactory(fastaEntryParser,
|
||||
fastaEntryIterator)
|
||||
else:
|
||||
raise ValueError,'Unknown option %s' % name
|
||||
|
||||
|
Reference in New Issue
Block a user