Add automanagement of input and output format
This commit is contained in:
@@ -14,32 +14,33 @@ from obitools.fnaqual.quality import qualityIterator
|
|||||||
from obitools.fasta import formatFasta
|
from obitools.fasta import formatFasta
|
||||||
from obitools.fastq import formatFastq
|
from obitools.fastq import formatFastq
|
||||||
|
|
||||||
|
import sys
|
||||||
from array import array
|
from array import array
|
||||||
|
from itertools import chain
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def addInputFormatOption(optionManager):
|
def addInputFormatOption(optionManager):
|
||||||
optionManager.add_option('--genbank',
|
optionManager.add_option('--genbank',
|
||||||
action="store_const", dest="seqinformat",
|
action="store_const", dest="seqinformat",
|
||||||
default='fasta',
|
default=None,
|
||||||
const='genbank',
|
const='genbank',
|
||||||
help="input file is in genbank format")
|
help="input file is in genbank format")
|
||||||
optionManager.add_option('--embl',
|
optionManager.add_option('--embl',
|
||||||
action="store_const", dest="seqinformat",
|
action="store_const", dest="seqinformat",
|
||||||
default='fasta',
|
default=None,
|
||||||
const='embl',
|
const='embl',
|
||||||
help="input file is in embl format")
|
help="input file is in embl format")
|
||||||
|
|
||||||
optionManager.add_option('--fasta',
|
optionManager.add_option('--fasta',
|
||||||
action="store_const", dest="seqinformat",
|
action="store_const", dest="seqinformat",
|
||||||
default='fasta',
|
default=None,
|
||||||
const='fasta',
|
const='fasta',
|
||||||
help="input file is in fasta nucleic format (including obitools fasta extensions)")
|
help="input file is in fasta nucleic format (including obitools fasta extensions)")
|
||||||
|
|
||||||
optionManager.add_option('--fna',
|
optionManager.add_option('--fna',
|
||||||
action="store_const", dest="seqinformat",
|
action="store_const", dest="seqinformat",
|
||||||
default='fasta',
|
default=None,
|
||||||
const='fna',
|
const='fna',
|
||||||
help="input file is in fasta nucleic format produced by 454 sequencer pipeline")
|
help="input file is in fasta nucleic format produced by 454 sequencer pipeline")
|
||||||
|
|
||||||
@@ -51,19 +52,19 @@ def addInputFormatOption(optionManager):
|
|||||||
|
|
||||||
optionManager.add_option('--sanger',
|
optionManager.add_option('--sanger',
|
||||||
action="store_const", dest="seqinformat",
|
action="store_const", dest="seqinformat",
|
||||||
default='fasta',
|
default=None,
|
||||||
const='sanger',
|
const='sanger',
|
||||||
help="input file is in sanger fastq nucleic format (standard fastq)")
|
help="input file is in sanger fastq nucleic format (standard fastq)")
|
||||||
|
|
||||||
optionManager.add_option('--solexa',
|
optionManager.add_option('--solexa',
|
||||||
action="store_const", dest="seqinformat",
|
action="store_const", dest="seqinformat",
|
||||||
default='fasta',
|
default=None,
|
||||||
const='solexa',
|
const='solexa',
|
||||||
help="input file is in fastq nucleic format produced by solexa sequencer")
|
help="input file is in fastq nucleic format produced by solexa sequencer")
|
||||||
|
|
||||||
optionManager.add_option('--illumina',
|
optionManager.add_option('--illumina',
|
||||||
action="store_const", dest="seqinformat",
|
action="store_const", dest="seqinformat",
|
||||||
default='fasta',
|
default=None,
|
||||||
const='illumina',
|
const='illumina',
|
||||||
help="input file is in fastq nucleic format produced by old solexa sequencer")
|
help="input file is in fastq nucleic format produced by old solexa sequencer")
|
||||||
|
|
||||||
@@ -79,55 +80,123 @@ def addInputFormatOption(optionManager):
|
|||||||
help="input file is protein sequences")
|
help="input file is protein sequences")
|
||||||
|
|
||||||
|
|
||||||
def addOutputOption(optionManager):
|
def addOutputFormatOption(optionManager):
|
||||||
optionManager.add_option('--fastq-output',
|
optionManager.add_option('--fastq-output',
|
||||||
action="store_const", dest="output",
|
action="store_const", dest="output",
|
||||||
default=formatFasta,
|
default=None,
|
||||||
const=formatFastq,
|
const=formatFastq,
|
||||||
help="output sequences in sanger fastq format")
|
help="output sequences in sanger fastq format")
|
||||||
optionManager.add_option('--fasta-output',
|
optionManager.add_option('--fasta-output',
|
||||||
action="store_const", dest="output",
|
action="store_const", dest="output",
|
||||||
default=formatFasta,
|
default=None,
|
||||||
const=formatFasta,
|
const=formatFasta,
|
||||||
help="output sequences in obitools fasta format")
|
help="output sequences in obitools fasta format")
|
||||||
|
|
||||||
|
|
||||||
def entriesIterator(options):
|
|
||||||
if options.seqinformat=='fasta':
|
def addInOutputOption(optionManager):
|
||||||
if options.moltype=='nuc':
|
addInputFormatOption(optionManager)
|
||||||
reader=fastaNucIterator
|
addOutputFormatOption(optionManager)
|
||||||
elif options.moltype=='pep':
|
|
||||||
reader=fastaAAIterator
|
|
||||||
else:
|
def printOutput(options,seq,output=sys.stdout):
|
||||||
reader=fastaIterator
|
if options.output is not None:
|
||||||
elif options.seqinformat=='genbank':
|
r=options.output(seq)
|
||||||
reader=genbankIterator
|
elif options.outputFormater is not None:
|
||||||
elif options.seqinformat=='embl':
|
r=options.outputFormater(seq)
|
||||||
reader=emblIterator
|
|
||||||
elif options.seqinformat=='fna':
|
|
||||||
reader=fnaFastaIterator
|
|
||||||
elif options.seqinformat=='sanger':
|
|
||||||
reader=fastqSangerIterator
|
|
||||||
elif options.seqinformat=='solexa':
|
|
||||||
reader=fastqSolexaIterator
|
|
||||||
elif options.seqinformat=='illumina':
|
|
||||||
reader=fastqIlluminaIterator
|
|
||||||
|
|
||||||
if options.seqinformat=='fna' and options.withqualfile is not None:
|
|
||||||
qualfile=qualityIterator(options.withqualfile)
|
|
||||||
else:
|
else:
|
||||||
qualfile=None
|
r=formatFasta(seq)
|
||||||
|
|
||||||
def iterator(lineiterator):
|
try:
|
||||||
for s in reader(lineiterator):
|
print >>output,r
|
||||||
if qualfile is not None:
|
except IOError:
|
||||||
q = qualfile.next()
|
sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def autoEntriesIterator(options):
|
||||||
|
options.outputFormater=formatFasta
|
||||||
|
|
||||||
|
def annotatedIterator(formatIterator):
|
||||||
|
options.outputFormater=formatFasta
|
||||||
|
def iterator(lineiterator):
|
||||||
|
for s in formatIterator(lineiterator):
|
||||||
|
s.extractTaxon()
|
||||||
|
yield s
|
||||||
|
|
||||||
|
return iterator
|
||||||
|
|
||||||
|
def withQualIterator(qualityfile):
|
||||||
|
options.outputFormater=formatFastq
|
||||||
|
def iterator(lineiterator):
|
||||||
|
for s in fnaFastaIterator(lineiterator):
|
||||||
|
q = qualityfile.next()
|
||||||
quality = array('d',(10.**(-x/10.) for x in q))
|
quality = array('d',(10.**(-x/10.) for x in q))
|
||||||
s.quality=quality
|
s.quality=quality
|
||||||
s.extractTaxon()
|
yield s
|
||||||
yield s
|
|
||||||
|
return iterator
|
||||||
|
|
||||||
|
def autoSequenceIterator(lineiterator):
|
||||||
|
options.outputFormater=formatFasta
|
||||||
|
first = lineiterator.next()
|
||||||
|
if first[0]==">":
|
||||||
|
if options.withqualfile is not None:
|
||||||
|
qualfile=qualityIterator(options.withqualfile)
|
||||||
|
reader=withQualIterator(qualfile)
|
||||||
|
options.outputFormater=formatFastq
|
||||||
|
elif options.moltype=='nuc':
|
||||||
|
reader=fastaNucIterator
|
||||||
|
elif options.moltype=='pep':
|
||||||
|
reader=fastaAAIterator
|
||||||
|
else:
|
||||||
|
reader=fastaIterator
|
||||||
|
elif first[0]=='@':
|
||||||
|
reader=fastqSangerIterator
|
||||||
|
options.outputFormater=formatFastq
|
||||||
|
elif first[0:3]=='ID ':
|
||||||
|
reader=emblIterator
|
||||||
|
elif first[0:6]=='LOCUS ':
|
||||||
|
reader=genbankIterator
|
||||||
|
else:
|
||||||
|
raise AssertionError,'file is not in fasta, fastq, embl, or genbank format'
|
||||||
|
|
||||||
|
input = reader(chain([first],lineiterator))
|
||||||
|
|
||||||
|
return input
|
||||||
|
|
||||||
|
if options.seqinformat is None:
|
||||||
|
reader = autoSequenceIterator
|
||||||
|
else:
|
||||||
|
if options.seqinformat=='fasta':
|
||||||
|
if options.moltype=='nuc':
|
||||||
|
reader=fastaNucIterator
|
||||||
|
elif options.moltype=='pep':
|
||||||
|
reader=fastaAAIterator
|
||||||
|
else:
|
||||||
|
reader=fastaIterator
|
||||||
|
elif options.seqinformat=='genbank':
|
||||||
|
reader=annotatedIterator(genbankIterator)
|
||||||
|
elif options.seqinformat=='embl':
|
||||||
|
reader=annotatedIterator(emblIterator)
|
||||||
|
elif options.seqinformat=='fna':
|
||||||
|
reader=fnaFastaIterator
|
||||||
|
elif options.seqinformat=='sanger':
|
||||||
|
options.outputFormater=formatFastq
|
||||||
|
reader=fastqSangerIterator
|
||||||
|
elif options.seqinformat=='solexa':
|
||||||
|
options.outputFormater=formatFastq
|
||||||
|
reader=fastqSolexaIterator
|
||||||
|
elif options.seqinformat=='illumina':
|
||||||
|
options.outputFormater=formatFastq
|
||||||
|
reader=fastqIlluminaIterator
|
||||||
|
|
||||||
return iterator
|
if options.seqinformat=='fna' and options.withqualfile is not None:
|
||||||
|
qualfile=qualityIterator(options.withqualfile)
|
||||||
|
reader=withQualIterator(qualfile)
|
||||||
|
options.outputFormater=formatFastq
|
||||||
|
|
||||||
|
return reader
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user