Add automanagement of input and output format

This commit is contained in:
2010-04-27 15:12:36 +00:00
parent 8ef73843c1
commit 018ce968e6

View File

@@ -14,32 +14,33 @@ from obitools.fnaqual.quality import qualityIterator
from obitools.fasta import formatFasta from obitools.fasta import formatFasta
from obitools.fastq import formatFastq from obitools.fastq import formatFastq
import sys
from array import array from array import array
from itertools import chain
def addInputFormatOption(optionManager): def addInputFormatOption(optionManager):
optionManager.add_option('--genbank', optionManager.add_option('--genbank',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='genbank', const='genbank',
help="input file is in genbank format") help="input file is in genbank format")
optionManager.add_option('--embl', optionManager.add_option('--embl',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='embl', const='embl',
help="input file is in embl format") help="input file is in embl format")
optionManager.add_option('--fasta', optionManager.add_option('--fasta',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='fasta', const='fasta',
help="input file is in fasta nucleic format (including obitools fasta extentions)") help="input file is in fasta nucleic format (including obitools fasta extentions)")
optionManager.add_option('--fna', optionManager.add_option('--fna',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='fna', const='fna',
help="input file is in fasta nucleic format produced by 454 sequencer pipeline") help="input file is in fasta nucleic format produced by 454 sequencer pipeline")
@@ -51,19 +52,19 @@ def addInputFormatOption(optionManager):
optionManager.add_option('--sanger', optionManager.add_option('--sanger',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='sanger', const='sanger',
help="input file is in sanger fastq nucleic format (standard fastq)") help="input file is in sanger fastq nucleic format (standard fastq)")
optionManager.add_option('--solexa', optionManager.add_option('--solexa',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='solexa', const='solexa',
help="input file is in fastq nucleic format produced by solexa sequencer") help="input file is in fastq nucleic format produced by solexa sequencer")
optionManager.add_option('--illumina', optionManager.add_option('--illumina',
action="store_const", dest="seqinformat", action="store_const", dest="seqinformat",
default='fasta', default=None,
const='illumina', const='illumina',
help="input file is in fastq nucleic format produced by old solexa sequencer") help="input file is in fastq nucleic format produced by old solexa sequencer")
@@ -79,20 +80,94 @@ def addInputFormatOption(optionManager):
help="input file is protein sequences") help="input file is protein sequences")
def addOutputOption(optionManager): def addOutputFormatOption(optionManager):
optionManager.add_option('--fastq-output', optionManager.add_option('--fastq-output',
action="store_const", dest="output", action="store_const", dest="output",
default=formatFasta, default=None,
const=formatFastq, const=formatFastq,
help="output sequences in sanger fastq format") help="output sequences in sanger fastq format")
optionManager.add_option('--fasta-output', optionManager.add_option('--fasta-output',
action="store_const", dest="output", action="store_const", dest="output",
default=formatFasta, default=None,
const=formatFasta, const=formatFasta,
help="output sequences in obitools fasta format") help="output sequences in obitools fasta format")
def entriesIterator(options):
def addInOutputOption(optionManager):
    """Register the sequence input and output format options together.

    Runs addInputFormatOption and then addOutputFormatOption, in that
    order, on the same option manager.
    """
    for register in (addInputFormatOption, addOutputFormatOption):
        register(optionManager)
def printOutput(options,seq,output=sys.stdout):
    """Format seq and write it, followed by a newline, to output.

    The formatter is picked in this order:
      1. options.output, when it is not None;
      2. options.outputFormater, when that attribute exists and is not
         None (autoEntriesIterator assigns it while choosing a reader);
      3. formatFasta otherwise.

    options.outputFormater is read with getattr because none of the
    option declarations visible in this module creates it: an options
    object that never went through autoEntriesIterator must still fall
    back to formatFasta instead of raising AttributeError.

    An IOError raised while writing ends the program with exit status 0.
    NOTE(review): presumably meant for a downstream pipe that was closed
    early (e.g. `| head`) -- confirm that every other write failure
    should exit silently as well.
    """
    formatter = options.output
    if formatter is None:
        formatter = getattr(options, 'outputFormater', None)
    if formatter is None:
        formatter = formatFasta

    r = formatter(seq)
    try:
        # '%s' stringifies the formatter result the way print would.
        output.write('%s\n' % (r,))
    except IOError:
        sys.exit(0)
def autoEntriesIterator(options):
options.outputFormater=formatFasta
def annotatedIterator(formatIterator):
    """Wrap formatIterator so extractTaxon() is called on every entry.

    Building the wrapper also sets options.outputFormater to formatFasta.
    The returned function takes a line iterator and yields the entries
    produced by formatIterator, each after its extractTaxon() call.
    """
    options.outputFormater = formatFasta

    def withTaxon(lines):
        for entry in formatIterator(lines):
            entry.extractTaxon()
            yield entry

    return withTaxon
def withQualIterator(qualityfile):
    """Build a reader pairing fnaFastaIterator entries with quality records.

    Building the reader also sets options.outputFormater to formatFastq.
    The returned function takes a line iterator; for each sequence read
    by fnaFastaIterator it pulls the next record from qualityfile and
    stores the converted values in the sequence's `quality` attribute.
    """
    options.outputFormater = formatFastq

    def pairedReader(lines):
        for record in fnaFastaIterator(lines):
            scores = qualityfile.next()
            # Each score x becomes 10 ** (-x / 10), stored as C doubles.
            # NOTE(review): presumably Phred-scaled scores turned into
            # error probabilities -- confirm against qualityIterator.
            record.quality = array('d', [10.**(-x/10.) for x in scores])
            yield record

    return pairedReader
def autoSequenceIterator(lineiterator):
    """Guess the sequence format from the first line and return its reader.

    The first line of lineiterator selects the parser:
      '>'       fasta; with options.withqualfile set, the entries are
                paired with that quality file and the output formatter
                becomes formatFastq; otherwise options.moltype picks
                fastaNucIterator ('nuc'), fastaAAIterator ('pep') or
                fastaIterator
      '@'       sanger fastq (output formatter becomes formatFastq)
      'ID '     embl
      'LOCUS '  genbank

    options.outputFormater is reset to formatFasta first, so formatFasta
    is the formatter for every branch that does not override it.

    Returns the chosen reader applied to the full line stream, the
    sniffed first line included.

    Raises AssertionError when the first line matches none of the
    prefixes above (an empty first line included).
    """
    options.outputFormater = formatFasta
    first = lineiterator.next()

    # startswith() instead of indexing: an empty first line then reaches
    # the explicit format error below rather than raising IndexError.
    if first.startswith('>'):
        if options.withqualfile is not None:
            qualfile = qualityIterator(options.withqualfile)
            reader = withQualIterator(qualfile)
            options.outputFormater = formatFastq
        elif options.moltype == 'nuc':
            reader = fastaNucIterator
        elif options.moltype == 'pep':
            reader = fastaAAIterator
        else:
            reader = fastaIterator
    elif first.startswith('@'):
        reader = fastqSangerIterator
        options.outputFormater = formatFastq
    elif first.startswith('ID '):
        # NOTE(review): the explicit --embl / --genbank branches wrap the
        # parser in annotatedIterator; auto-detection does not -- confirm
        # whether extractTaxon() should also run here.
        reader = emblIterator
    elif first.startswith('LOCUS '):
        reader = genbankIterator
    else:
        raise AssertionError('file is not in fasta, fastq, embl, or genbank format')

    # The sniffed line was consumed above; put it back in front of the
    # remaining lines so the parser sees the complete input.
    return reader(chain([first], lineiterator))
if options.seqinformat is None:
reader = autoSequenceIterator
else:
if options.seqinformat=='fasta': if options.seqinformat=='fasta':
if options.moltype=='nuc': if options.moltype=='nuc':
reader=fastaNucIterator reader=fastaNucIterator
@@ -101,33 +176,27 @@ def entriesIterator(options):
else: else:
reader=fastaIterator reader=fastaIterator
elif options.seqinformat=='genbank': elif options.seqinformat=='genbank':
reader=genbankIterator reader=annotatedIterator(genbankIterator)
elif options.seqinformat=='embl': elif options.seqinformat=='embl':
reader=emblIterator reader=annotatedIterator(emblIterator)
elif options.seqinformat=='fna': elif options.seqinformat=='fna':
reader=fnaFastaIterator reader=fnaFastaIterator
elif options.seqinformat=='sanger': elif options.seqinformat=='sanger':
options.outputFormater=formatFastq
reader=fastqSangerIterator reader=fastqSangerIterator
elif options.seqinformat=='solexa': elif options.seqinformat=='solexa':
options.outputFormater=formatFastq
reader=fastqSolexaIterator reader=fastqSolexaIterator
elif options.seqinformat=='illumina': elif options.seqinformat=='illumina':
options.outputFormater=formatFastq
reader=fastqIlluminaIterator reader=fastqIlluminaIterator
if options.seqinformat=='fna' and options.withqualfile is not None: if options.seqinformat=='fna' and options.withqualfile is not None:
qualfile=qualityIterator(options.withqualfile) qualfile=qualityIterator(options.withqualfile)
else: reader=withQualIterator(qualfile)
qualfile=None options.outputFormater=formatFastq
def iterator(lineiterator): return reader
for s in reader(lineiterator):
if qualfile is not None:
q = qualfile.next()
quality = array('d',(10.**(-x/10.) for x in q))
s.quality=quality
s.extractTaxon()
yield s
return iterator