Add a factory that checks the file format and returns the correct

iterator. This first version works only with the fasta and fastq nucleic
formats.
This commit is contained in:
2017-07-27 16:02:52 +02:00
parent 0f6ae7dfa6
commit 8781ecab1f

View File

@ -0,0 +1,86 @@
import re
from obitools3.parsers.fasta import fastaNucIterator
from obitools3.parsers.fastq import fastqIterator
# Oligonucleotide column: one or more IUPAC nucleotide codes, matched
# case-insensitively.
oligore = re.compile("^[ACGTRYSWKMBDHVN]+$",re.I)

# Tag column: an IUPAC word or "-", optionally followed by either
# ":<IUPAC word>" or a bare "-".
# NOTE(review): as grouped, this accepts "acgt-" and rejects "acgt:-";
# if "tag:-" is meant to be valid the second group presumably should be
# "(:([ACGTRYSWKMBDHVN]+|-))?" -- confirm against the ngsfilter format
# before changing the pattern.
tagre = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I)


def is_ngsfilter_line(line):
    """Tell whether *line* looks like a record of an ngsfilter file.

    The line is split on whitespace and the following columns are
    checked (0-based indices):

    * column 2 must match ``tagre``;
    * columns 3 and 4 must each match ``oligore`` (presumably the
      forward and reverse primers -- verify against the format spec);
    * column 5 must be exactly ``"F"`` or ``"T"``.

    Any additional columns are ignored.

    Args:
        line: The text line to test (expected to be a ``str``).

    Returns:
        ``True`` when every checked column is valid, ``False`` otherwise,
        including when the line has fewer than six columns or is not a
        string that the patterns can be applied to.
    """
    try:
        parts = line.split()
        # Match objects do not support ``&`` and ``|`` binds tighter than
        # ``==``, so the checks are combined with plain boolean logic.
        return (tagre.match(parts[2]) is not None
                and oligore.match(parts[3]) is not None
                and oligore.match(parts[4]) is not None
                and parts[5] in ("F", "T"))
    except (AttributeError, IndexError, TypeError):
        # AttributeError: *line* has no ``split`` (e.g. ``None``).
        # IndexError:     fewer than six columns.
        # TypeError:      non-str columns (e.g. bytes) given to a str regex.
        return False
def entryIteratorFactory(lineiterator,
                         int skip=0,
                         only=None,
                         bytes seqtype=b'nuc',
                         int qualityoffset=32,
                         bool noquality=False,
                         bool skiperror=True,
                         bool header=False,
                         bytes sep=None,
                         bytes dec=b'.',
                         bytes nastring=b"NA",
                         bool stripwhite=True,
                         bool blanklineskip=True,
                         bytes commentchar=b"#",
                         int buffersize=100000000):
    """Guess the format of an input from its first line and build a parser.

    The first line read from the input is inspected to choose among
    ``fasta`` (leading ``>``), ``fastq`` (leading ``@``), ``embl``
    (``ID `` prefix), ``genbank`` (``LOCUS `` prefix), ``ecopcrfile``
    (``#@ecopcr-v2`` prefix), ``ngsfilter`` (accepted by
    ``is_ngsfilter_line``) and, failing all of these, ``tabular``.
    Only nucleic fasta and fastq inputs are handled so far.

    Args:
        lineiterator: A ``str``/``bytes`` value (opened with ``uopen``),
            a ``LineBuffer``, or any other object, which is then wrapped
            in ``LineBuffer(lineiterator, buffersize)``.
        skip: Forwarded to the fasta/fastq iterator.
        only: Forwarded to the fasta/fastq iterator.
        seqtype: Sequence type; only ``b'nuc'`` is supported for fasta.
            Not consulted for fastq.
        qualityoffset: Forwarded to the fastq iterator.
        buffersize: Size handed to ``LineBuffer`` when one is created.
        noquality, skiperror, header, sep, dec, nastring, stripwhite,
        blanklineskip, commentchar: Accepted but not read by this
            function yet.

    Returns:
        A ``(iterator, Nuc_Seq)`` tuple, where ``iterator`` is the result
        of ``fastaNucIterator`` or ``fastqIterator``.

    Raises:
        NotImplementedError: For a fasta input whose ``seqtype`` is not
            ``b'nuc'``, and for every detected format other than fasta
            and fastq.
        StopIteration: Propagated from ``next`` when the input yields no
            line at all.
    """
    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        lb = lineiterator
    else:
        lb = LineBuffer(lineiterator, buffersize)

    i = iter(lb)
    first = next(i)

    # A single if/elif chain: the first matching signature wins.  Slices
    # are used instead of first[0] so that an empty first line falls
    # through to the later tests instead of raising IndexError.
    if first[:1] == ">":
        fmt = b"fasta"
    elif first[:1] == "@":
        fmt = b"fastq"
    elif first[0:3] == 'ID ':
        fmt = b"embl"
    elif first[0:6] == 'LOCUS ':
        fmt = b"genbank"
    elif first[0:11] == '#@ecopcr-v2':
        fmt = b"ecopcrfile"
    elif is_ngsfilter_line(first):
        fmt = b"ngsfilter"
    else:
        fmt = b"tabular"

    # NOTE(review): the parsers receive `lineiterator`, not the LineBuffer
    # `lb` that the first line was read from; the already consumed line is
    # handed over as `first`.  Presumably the parsers cope with this --
    # verify that no line buffered by `lb` is lost.
    if fmt == b'fasta':
        if seqtype == b'nuc':
            return (fastaNucIterator(lineiterator,
                                     skip, only,
                                     first),
                    Nuc_Seq)
        else:
            raise NotImplementedError()
    elif fmt == b'fastq':
        return (fastqIterator(lineiterator,
                              skip, only,
                              qualityoffset,
                              first),
                Nuc_Seq)

    raise NotImplementedError('File format not yet implemented')