Add automanagement of input and output format

This commit is contained in:
2010-04-27 15:12:36 +00:00
parent 8ef73843c1
commit 018ce968e6

View File

@@ -14,32 +14,33 @@ from obitools.fnaqual.quality import qualityIterator
from obitools.fasta import formatFasta
from obitools.fastq import formatFastq
import sys
from array import array
from itertools import chain
def addInputFormatOption(optionManager):
optionManager.add_option('--genbank',
action="store_const", dest="seqinformat",
default='fasta',
default=None,
const='genbank',
help="input file is in genbank format")
optionManager.add_option('--embl',
action="store_const", dest="seqinformat",
default='fasta',
default=None,
const='embl',
help="input file is in embl format")
optionManager.add_option('--fasta',
action="store_const", dest="seqinformat",
default='fasta',
default=None,
const='fasta',
help="input file is in fasta nucleic format (including obitools fasta extentions)")
optionManager.add_option('--fna',
action="store_const", dest="seqinformat",
default='fasta',
default=None,
const='fna',
help="input file is in fasta nucleic format produced by 454 sequencer pipeline")
@@ -51,19 +52,19 @@ def addInputFormatOption(optionManager):
optionManager.add_option('--sanger',
action="store_const", dest="seqinformat",
default='fasta',
default=None,
const='sanger',
help="input file is in sanger fastq nucleic format (standard fastq)")
optionManager.add_option('--solexa',
action="store_const", dest="seqinformat",
default='fasta',
default=None,
const='solexa',
help="input file is in fastq nucleic format produced by solexa sequencer")
optionManager.add_option('--illumina',
action="store_const", dest="seqinformat",
default='fasta',
default=None,
const='illumina',
help="input file is in fastq nucleic format produced by old solexa sequencer")
@@ -79,20 +80,94 @@ def addInputFormatOption(optionManager):
help="input file is protein sequences")
def addOutputFormatOption(optionManager):
    """Register the output-format command line switches.

    Both switches store a formatter callable in ``options.output``:

    * ``--fastq-output`` stores ``formatFastq``;
    * ``--fasta-output`` stores ``formatFasta``.

    The default is ``None`` (no explicit choice), which lets
    ``printOutput`` fall back on the formatter selected from the input
    format.

    :param optionManager: object exposing an optparse-style
                          ``add_option`` method.
    """
    optionManager.add_option('--fastq-output',
                             action="store_const", dest="output",
                             default=None,
                             const=formatFastq,
                             help="output sequences in sanger fastq format")

    optionManager.add_option('--fasta-output',
                             action="store_const", dest="output",
                             default=None,
                             const=formatFasta,
                             help="output sequences in obitools fasta format")
def entriesIterator(options):
def addInOutputOption(optionManager):
    """Register the input-format and the output-format options together.

    Applies ``addInputFormatOption`` and then ``addOutputFormatOption``
    to the same *optionManager*.

    :param optionManager: option manager handed unchanged to both
                          registration functions.
    """
    for register in (addInputFormatOption, addOutputFormatOption):
        register(optionManager)
def printOutput(options,seq,output=sys.stdout):
    """Format *seq* and write it, followed by a newline, to *output*.

    The formatter is selected in this order:

    1. ``options.output`` when it is not ``None``;
    2. ``options.outputFormater`` when it is not ``None``;
    3. ``formatFasta`` otherwise.

    An attribute missing from *options* is treated as ``None``.

    :param options: option object carrying the ``output`` and/or
                    ``outputFormater`` formatter callables.
    :param seq: the sequence object handed to the formatter.
    :param output: writable file-like object (defaults to ``sys.stdout``).

    :raise IOError: when the write fails for any reason other than a
                    broken pipe.  On a broken pipe (``EPIPE``) the process
                    exits with status 0.
    """
    import errno

    formatter = getattr(options, 'output', None)
    if formatter is None:
        formatter = getattr(options, 'outputFormater', None)
    if formatter is None:
        formatter = formatFasta

    r = formatter(seq)

    try:
        output.write('%s\n' % (r,))
    except IOError:
        # sys.exc_info() rather than an "except ... as" clause, so this
        # block does not depend on a particular exception-binding syntax.
        err = sys.exc_info()[1]
        if getattr(err, 'errno', None) == errno.EPIPE:
            # The consumer closed the pipe (e.g. "| head"): stop quietly.
            sys.exit(0)
        # Any other write failure must not be reported as a success.
        raise
def autoEntriesIterator(options):
options.outputFormater=formatFasta
def annotatedIterator(formatIterator):
    """Wrap *formatIterator* so each entry gets ``extractTaxon()`` called.

    As a side effect, calling this function sets
    ``options.outputFormater`` to ``formatFasta``.

    :param formatIterator: callable taking a line iterator and yielding
                           entries that expose ``extractTaxon()``.
    :return: a generator function taking a line iterator.
    """
    options.outputFormater = formatFasta

    def taxonAnnotated(lineiterator):
        for entry in formatIterator(lineiterator):
            # Called on every entry before it is handed to the consumer.
            entry.extractTaxon()
            yield entry

    return taxonAnnotated
def withQualIterator(qualityfile):
    """Build a reader that pairs fna fasta entries with quality records.

    As a side effect, calling this function sets
    ``options.outputFormater`` to ``formatFastq``.

    :param qualityfile: iterator whose ``next()`` returns the numeric
                        quality values of the next sequence.
    :return: a generator function taking a line iterator; each yielded
             sequence carries a ``quality`` attribute.
    """
    options.outputFormater = formatFastq

    def qualityAnnotated(lineiterator):
        for record in fnaFastaIterator(lineiterator):
            # One quality record is consumed per sequence, in file order.
            scores = qualityfile.next()
            # Each score x is stored as 10**(-x/10)
            # (presumably Phred score -> error probability; confirm).
            record.quality = array('d', (10.**(-x/10.) for x in scores))
            yield record

    return qualityAnnotated
def autoSequenceIterator(lineiterator):
    """Guess the input format from the first line and return a reader.

    The first line is consumed to detect the format, then pushed back in
    front of *lineiterator* before the selected reader is applied:

    * ``>``      : fasta; the reader depends on ``options.withqualfile``
                   and ``options.moltype``;
    * ``@``      : fastq, read with ``fastqSangerIterator``;
    * ``ID ``    : embl;
    * ``LOCUS `` : genbank.

    ``options.outputFormater`` is set to ``formatFastq`` when quality
    data is available (fastq input, or fasta plus a quality file) and to
    ``formatFasta`` otherwise.

    :param lineiterator: iterator over the lines of the input.
    :return: the iterator produced by the selected reader.
    :raise AssertionError: when the first line matches none of the
                           recognised formats (including an empty line).
    """
    options.outputFormater=formatFasta

    first = lineiterator.next()

    # startswith() rather than indexing, so an empty first line reaches
    # the explicit format error below instead of raising IndexError.
    if first.startswith('>'):
        if options.withqualfile is not None:
            qualfile=qualityIterator(options.withqualfile)
            reader=withQualIterator(qualfile)
            options.outputFormater=formatFastq
        elif options.moltype=='nuc':
            reader=fastaNucIterator
        elif options.moltype=='pep':
            reader=fastaAAIterator
        else:
            reader=fastaIterator
    elif first.startswith('@'):
        reader=fastqSangerIterator
        options.outputFormater=formatFastq
    elif first.startswith('ID '):
        # NOTE(review): the explicit --embl/--genbank paths wrap the
        # reader in annotatedIterator(); the auto-detected ones do not.
        # Confirm this difference is intended.
        reader=emblIterator
    elif first.startswith('LOCUS '):
        reader=genbankIterator
    else:
        raise AssertionError('file is not in fasta, fastq, embl, or genbank format')

    # Put the consumed first line back in front of the remaining lines.
    return reader(chain([first],lineiterator))
if options.seqinformat is None:
reader = autoSequenceIterator
else:
if options.seqinformat=='fasta':
if options.moltype=='nuc':
reader=fastaNucIterator
@@ -101,33 +176,27 @@ def entriesIterator(options):
else:
reader=fastaIterator
elif options.seqinformat=='genbank':
reader=genbankIterator
reader=annotatedIterator(genbankIterator)
elif options.seqinformat=='embl':
reader=emblIterator
reader=annotatedIterator(emblIterator)
elif options.seqinformat=='fna':
reader=fnaFastaIterator
elif options.seqinformat=='sanger':
options.outputFormater=formatFastq
reader=fastqSangerIterator
elif options.seqinformat=='solexa':
options.outputFormater=formatFastq
reader=fastqSolexaIterator
elif options.seqinformat=='illumina':
options.outputFormater=formatFastq
reader=fastqIlluminaIterator
if options.seqinformat=='fna' and options.withqualfile is not None:
qualfile=qualityIterator(options.withqualfile)
else:
qualfile=None
reader=withQualIterator(qualfile)
options.outputFormater=formatFastq
def iterator(lineiterator):
for s in reader(lineiterator):
if qualfile is not None:
q = qualfile.next()
quality = array('d',(10.**(-x/10.) for x in q))
s.quality=quality
s.extractTaxon()
yield s
return iterator
return reader