Add automanagement of input and output format

This commit is contained in:
2010-04-27 15:12:36 +00:00
parent 8ef73843c1
commit 018ce968e6

View File

@@ -14,32 +14,33 @@ from obitools.fnaqual.quality import qualityIterator
from obitools.fasta import formatFasta from obitools.fasta import formatFasta
from obitools.fastq import formatFastq from obitools.fastq import formatFastq
import sys
from array import array from array import array
from itertools import chain
def addInputFormatOption(optionManager): def addInputFormatOption(optionManager):
optionManager.add_option('--genbank', optionManager.add_option('--genbank',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='genbank', const='genbank',
help="input file is in genbank format") help="input file is in genbank format")
optionManager.add_option('--embl', optionManager.add_option('--embl',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='embl', const='embl',
help="input file is in embl format") help="input file is in embl format")
optionManager.add_option('--fasta', optionManager.add_option('--fasta',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='fasta', const='fasta',
help="input file is in fasta nucleic format (including obitools fasta extentions)") help="input file is in fasta nucleic format (including obitools fasta extentions)")
optionManager.add_option('--fna', optionManager.add_option('--fna',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='fna', const='fna',
help="input file is in fasta nucleic format produced by 454 sequencer pipeline") help="input file is in fasta nucleic format produced by 454 sequencer pipeline")
@@ -51,19 +52,19 @@ def addInputFormatOption(optionManager):
optionManager.add_option('--sanger', optionManager.add_option('--sanger',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='sanger', const='sanger',
help="input file is in sanger fastq nucleic format (standard fastq)") help="input file is in sanger fastq nucleic format (standard fastq)")
optionManager.add_option('--solexa', optionManager.add_option('--solexa',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='solexa', const='solexa',
help="input file is in fastq nucleic format produced by solexa sequencer") help="input file is in fastq nucleic format produced by solexa sequencer")
optionManager.add_option('--illumina', optionManager.add_option('--illumina',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='illumina', const='illumina',
help="input file is in fastq nucleic format produced by old solexa sequencer") help="input file is in fastq nucleic format produced by old solexa sequencer")
@@ -79,55 +80,123 @@ def addInputFormatOption(optionManager):
help="input file is protein sequences") help="input file is protein sequences")
def addOutputFormatOption(optionManager):
    """Register the command-line switches that force the output format.

    Both switches store their formatter function in ``options.output`` and
    default to ``None``, i.e. "no explicit choice was made on the command
    line".

    :param optionManager: object exposing an optparse-style ``add_option``
        method.
    """
    # (switch, formatter stored in options.output, help text)
    switches = (
        ('--fastq-output', formatFastq,
         "output sequences in sanger fastq format"),
        ('--fasta-output', formatFasta,
         "output sequences in obitools fasta format"),
    )
    for flag, formatter, description in switches:
        optionManager.add_option(flag,
                                 action="store_const", dest="output",
                                 default=None,
                                 const=formatter,
                                 help=description)
def addInOutputOption(optionManager):
    """Register the input-format options, then the output-format options.

    :param optionManager: option manager forwarded unchanged to
        ``addInputFormatOption`` and ``addOutputFormatOption``.
    """
    for register in (addInputFormatOption, addOutputFormatOption):
        register(optionManager)
def printOutput(options,seq,output=sys.stdout):
    """Format ``seq`` and write it to ``output`` followed by a newline.

    The formatter is chosen in this order:

    1. ``options.output`` (set by the ``--fastq-output`` / ``--fasta-output``
       switches) when it is not ``None``;
    2. ``options.outputFormater`` (set by ``autoEntriesIterator``) when it is
       not ``None``;
    3. ``formatFasta`` otherwise.

    Either attribute may be absent from ``options``; a missing attribute is
    treated like ``None`` so the function also works when the reader was not
    built through ``autoEntriesIterator``.

    :param options: parsed command-line options.
    :param seq: the sequence object handed to the formatter.
    :param output: writable file-like object, ``sys.stdout`` by default.
    """
    formatter = getattr(options, 'output', None)
    if formatter is None:
        formatter = getattr(options, 'outputFormater', None)
    if formatter is None:
        formatter = formatFasta

    r = formatter(seq)

    try:
        output.write('%s\n' % (r,))
    except IOError:
        # Exit quietly when the output can no longer be written
        # (presumably a closed pipe, e.g. ``| head`` -- TODO confirm).
        sys.exit(0)
def autoEntriesIterator(options):
    """Build the sequence reader for the requested or auto-detected format.

    When ``options.seqinformat`` is ``None`` the returned reader inspects the
    first input line to choose a parser: ``>`` selects fasta, ``@`` sanger
    fastq, ``ID `` embl and ``LOCUS `` genbank. Otherwise the parser named by
    ``options.seqinformat`` is used.

    As a side effect ``options.outputFormater`` is set to the formatter that
    matches the input (``formatFastq`` for fastq input or fasta input paired
    with a quality file, ``formatFasta`` otherwise). With auto-detection this
    only happens once the returned reader is called.

    :param options: parsed command-line options; ``seqinformat``, ``moltype``
        and ``withqualfile`` are read, ``outputFormater`` is written.
    :return: a callable taking an iterable of lines and returning an
        iterator over the parsed sequences.
    :raises ValueError: if ``options.seqinformat`` is not a known format.
    """
    options.outputFormater = formatFasta

    def annotatedIterator(formatIterator):
        # Wrap a parser so that extractTaxon() is called on every entry.
        options.outputFormater = formatFasta

        def iterator(lineiterator):
            for s in formatIterator(lineiterator):
                s.extractTaxon()
                yield s

        return iterator

    def withQualIterator(qualityfile):
        # Pair each 454 fasta record with the next record of the quality
        # file, storing the scores x converted with 10**(-x/10).
        options.outputFormater = formatFastq

        def iterator(lineiterator):
            for s in fnaFastaIterator(lineiterator):
                q = next(qualityfile)
                s.quality = array('d', (10. ** (-x / 10.) for x in q))
                yield s

        return iterator

    def autoSequenceIterator(lineiterator):
        # Pick the parser from the first line, then replay that line in
        # front of the remaining ones so the parser sees the whole input.
        options.outputFormater = formatFasta
        lineiterator = iter(lineiterator)
        try:
            first = next(lineiterator)
        except StopIteration:
            # Empty input: nothing to detect and nothing to parse.
            return iter(())

        if first.startswith('>'):
            if options.withqualfile is not None:
                # withQualIterator also switches the formatter to fastq.
                reader = withQualIterator(
                    qualityIterator(options.withqualfile))
            elif options.moltype == 'nuc':
                reader = fastaNucIterator
            elif options.moltype == 'pep':
                reader = fastaAAIterator
            else:
                reader = fastaIterator
        elif first.startswith('@'):
            reader = fastqSangerIterator
            options.outputFormater = formatFastq
        elif first.startswith('ID '):
            # NOTE(review): unlike the explicit --embl/--genbank path, the
            # auto-detected readers are not wrapped by annotatedIterator,
            # so extractTaxon() is not called here -- confirm intended.
            reader = emblIterator
        elif first.startswith('LOCUS '):
            reader = genbankIterator
        else:
            raise AssertionError(
                'file is not in fasta, fastq, embl, or genbank format')

        return reader(chain([first], lineiterator))

    informat = options.seqinformat

    if informat is None:
        reader = autoSequenceIterator
    elif informat == 'fasta':
        if options.moltype == 'nuc':
            reader = fastaNucIterator
        elif options.moltype == 'pep':
            reader = fastaAAIterator
        else:
            reader = fastaIterator
    elif informat == 'genbank':
        reader = annotatedIterator(genbankIterator)
    elif informat == 'embl':
        reader = annotatedIterator(emblIterator)
    elif informat == 'fna':
        if options.withqualfile is not None:
            # withQualIterator also switches the formatter to fastq.
            reader = withQualIterator(qualityIterator(options.withqualfile))
        else:
            reader = fnaFastaIterator
    elif informat == 'sanger':
        options.outputFormater = formatFastq
        reader = fastqSangerIterator
    elif informat == 'solexa':
        options.outputFormater = formatFastq
        reader = fastqSolexaIterator
    elif informat == 'illumina':
        options.outputFormater = formatFastq
        reader = fastqIlluminaIterator
    else:
        raise ValueError('unknown input sequence format: %r' % (informat,))

    return reader