Add a factory that checks the file format and returns the matching
iterator. This first version only works with the fasta and fastq nucleic formats.
This commit is contained in:
86
python/obitools3/parsers/universal.pyx
Normal file
86
python/obitools3/parsers/universal.pyx
Normal file
@ -0,0 +1,86 @@
|
||||
import re
|
||||
from obitools3.parsers.fasta import fastaNucIterator
|
||||
from obitools3.parsers.fastq import fastqIterator
|
||||
|
||||
# A non-empty run of IUPAC nucleotide codes, matched case-insensitively.
oligore = re.compile("^[ACGTRYSWKMBDHVN]+$", re.I)

# A tag specification: one side alone, or two sides separated by ":".
# Each side is either an IUPAC oligo or "-".  The optional second half is
# grouped as ":(oligo|-)" so that "acgt:-" is accepted and "acgt-" is not.
tagre = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+|-))?$", re.I)


def is_ngsfilter_line(line):
    """Return True when *line* has the column layout of an ngsfilter line.

    The line is split on whitespace and must provide at least six fields.
    Only fields 3 to 6 are inspected (presumably the tag pair, the forward
    primer, the reverse primer and a boolean flag -- confirm against the
    ngsfilter format description):

    * field 3 must match ``tagre``;
    * fields 4 and 5 must each match ``oligore``;
    * field 6 must be exactly ``"F"`` or ``"T"``.

    Args:
        line: the text line to test.

    Returns:
        bool: True if every check passes, False otherwise.  The function
        never raises: input that cannot be split or matched (``None``,
        ``bytes``, too few fields) simply yields False.
    """
    try:
        parts = line.split()
    except (AttributeError, TypeError):
        # Not a string-like object at all.
        return False

    if len(parts) < 6:
        return False

    try:
        # Plain boolean logic: re.Match objects do not support "&", and
        # "a == x | b == y" does not parse as "(a == x) or (b == y)".
        return (tagre.match(parts[2]) is not None
                and oligore.match(parts[3]) is not None
                and oligore.match(parts[4]) is not None
                and parts[5] in ("F", "T"))
    except TypeError:
        # bytes fields cannot be matched against the str patterns.
        return False
|
||||
|
||||
def entryIteratorFactory(lineiterator,
                         int skip=0,
                         only=None,
                         bytes seqtype=b'nuc',
                         int qualityoffset=32,
                         bool noquality=False,
                         bool skiperror=True,
                         bool header=False,
                         bytes sep=None,
                         bytes dec=b'.',
                         bytes nastring=b"NA",
                         bool stripwhite=True,
                         bool blanklineskip=True,
                         bytes commentchar=b"#",
                         int buffersize=100000000):
    """Detect the format of a line source and return the matching parser.

    The first line of the input is read and its prefix decides the format:
    ``>`` fasta, ``@`` fastq, ``ID `` embl, ``LOCUS `` genbank,
    ``#@ecopcr-v2`` ecopcrfile, a line accepted by ``is_ngsfilter_line``
    ngsfilter, anything else tabular.  Only fasta (nucleic) and fastq are
    dispatched to a parser in this version.

    Args:
        lineiterator: a ``str``/``bytes`` value (opened with ``uopen``), a
            ``LineBuffer``, or any object accepted by ``LineBuffer``.
        skip: forwarded to the selected parser.
        only: forwarded to the selected parser.
        seqtype: only ``b'nuc'`` is supported for fasta input.
        qualityoffset: forwarded to the fastq parser.
        buffersize: size handed to ``LineBuffer`` when one has to be built.
        noquality, skiperror, header, sep, dec, nastring, stripwhite,
        blanklineskip, commentchar: accepted but not used by this version.

    Returns:
        tuple: ``(entry iterator, Nuc_Seq)``.

    Raises:
        NotImplementedError: for fasta input with a ``seqtype`` other than
            ``b'nuc'``, and for every detected format other than fasta and
            fastq.
        StopIteration: propagated from ``next()`` when the input yields no
            line at all.
    """
    # NOTE(review): uopen, LineBuffer, Nuc_Seq and the Cython ``bool`` type
    # are not imported in the visible part of this module -- confirm they are
    # brought into scope.
    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        lb = lineiterator
    else:
        lb = LineBuffer(lineiterator, buffersize)

    i = iter(lb)

    first = next(i)

    # A single if/elif chain: with two separate chains a fasta header was
    # detected and then immediately overwritten by the final "tabular" branch.
    # Slices are used instead of first[0] so that an empty first line falls
    # through to "tabular" rather than raising IndexError.
    if first[0:1] == ">":
        fileformat = b"fasta"
    elif first[0:1] == "@":
        fileformat = b"fastq"
    elif first[0:3] == 'ID ':
        fileformat = b"embl"
    elif first[0:6] == 'LOCUS ':
        fileformat = b"genbank"
    elif first[0:11] == '#@ecopcr-v2':
        fileformat = b"ecopcrfile"
    elif is_ngsfilter_line(first):
        fileformat = b"ngsfilter"
    else:
        fileformat = b"tabular"

    # NOTE(review): the parsers receive the original ``lineiterator`` together
    # with the already consumed ``first`` line; whether they still see the
    # lines buffered by ``lb`` depends on LineBuffer -- confirm.
    if fileformat == b'fasta':
        if seqtype == b'nuc':
            return (fastaNucIterator(lineiterator,
                                     skip, only,
                                     first),
                    Nuc_Seq)
        else:
            raise NotImplementedError()
    elif fileformat == b'fastq':
        return (fastqIterator(lineiterator,
                              skip, only,
                              qualityoffset,
                              first),
                Nuc_Seq)

    raise NotImplementedError('File format not yet implemented')
|
||||
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user