87 lines
2.3 KiB
Cython
87 lines
2.3 KiB
Cython
import re
|
|
from obitools3.parsers.fasta import fastaNucIterator
|
|
from obitools3.parsers.fastq import fastqIterator
|
|
|
|
# A primer/oligonucleotide: one or more IUPAC nucleotide codes,
# matched case-insensitively.
oligore = re.compile(r"^[ACGTRYSWKMBDHVN]+$", re.IGNORECASE)

# A tag specification: either a single item, or two items separated by a
# colon, where each item is an IUPAC word or "-".
# Group 1 is the first item; group 3 (when present) is the second one.
# The "-" alternative of the second item must sit INSIDE the colon group:
# written as "(:(WORD)|-)?" the pattern accepted "acgt-" and rejected
# "acgt:-".
tagre = re.compile(
    r"^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+|-))?$",
    re.IGNORECASE,
)
|
|
|
|
def is_ngsfilter_line(line):
    """
    Tell whether `line` has the column layout tested for the ngsfilter format.

    The line is split on whitespace and the following fields are checked:

    - field 2 must match `tagre`;
    - fields 3 and 4 must each match `oligore`;
    - field 5 must be exactly "F" or "T".

    Fields 0 and 1 are not inspected.

    @param line: the line to test (a `str`, since the patterns are `str`
                 patterns)
    @return: True if every check passes, False otherwise; this includes
             lines with fewer than six fields and objects that cannot be
             split or matched.
    """
    try:
        parts = line.split()
        # Match objects do not support `&`, and `|` binds tighter than `==`,
        # so the checks are combined with plain boolean operators.
        return (tagre.match(parts[2]) is not None
                and oligore.match(parts[3]) is not None
                and oligore.match(parts[4]) is not None
                and parts[5] in ("F", "T"))
    except (AttributeError, IndexError, TypeError):
        # AttributeError: `line` has no split() method (e.g. None)
        # IndexError:     fewer than six whitespace-separated fields
        # TypeError:      `line` is bytes while the patterns are str
        return False
|
|
|
|
def entryIteratorFactory(lineiterator,
                         int skip=0,
                         only=None,
                         bytes seqtype=b'nuc',
                         int qualityoffset=32,
                         bool noquality=False,
                         bool skiperror=True,
                         bool header=False,
                         bytes sep=None,
                         bytes dec=b'.',
                         bytes nastring=b"NA",
                         bool stripwhite=True,
                         bool blanklineskip=True,
                         bytes commentchar=b"#",
                         int buffersize=100000000):
    """
    Guess the format of an input from its first line and build the
    matching entry iterator.

    The format is chosen from the start of the first line:
    ">" -> fasta, "@" -> fastq, "ID " -> embl, "LOCUS " -> genbank,
    "#@ecopcr-v2" -> ecopcrfile, a line accepted by `is_ngsfilter_line`
    -> ngsfilter, anything else -> tabular.
    The tests compare against `str` literals, so lines are expected to be
    `str`.

    @param lineiterator: a `str`/`bytes` (opened with `uopen`), a
                         `LineBuffer`, or any object that `LineBuffer`
                         can wrap
    @param skip: passed to the fasta/fastq iterator
    @param only: passed to the fasta/fastq iterator
    @param seqtype: only b'nuc' is handled for fasta input
    @param qualityoffset: passed to the fastq iterator
    @param buffersize: passed to `LineBuffer` when wrapping `lineiterator`

    The remaining parameters (noquality, skiperror, header, sep, dec,
    nastring, stripwhite, blanklineskip, commentchar) are accepted but not
    used by this function.

    @return: a tuple (entry iterator, Nuc_Seq) for nucleic fasta and for
             fastq input
    @raise NotImplementedError: for non-nucleic fasta and for every other
                                detected format
    @raise StopIteration: propagated from `next()` when the input yields
                          no line at all
    """
    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        lb = lineiterator
    else:
        lb = LineBuffer(lineiterator, buffersize)

    i = iter(lb)

    # The first line is consumed here to detect the format; it is handed
    # to the chosen iterator further down.
    first = next(i)

    # A single if/elif chain: the first matching signature wins.
    # One-character signatures are tested with a slice, like the longer
    # ones, so that an empty first line falls through to "tabular" instead
    # of raising IndexError.
    if first[0:1] == ">":
        fileformat = b"fasta"
    elif first[0:1] == "@":
        fileformat = b"fastq"
    elif first[0:3] == 'ID ':
        fileformat = b"embl"
    elif first[0:6] == 'LOCUS ':
        fileformat = b"genbank"
    elif first[0:11] == '#@ecopcr-v2':
        fileformat = b"ecopcrfile"
    elif is_ngsfilter_line(first):
        fileformat = b"ngsfilter"
    else:
        fileformat = b"tabular"

    # NOTE(review): the iterators below receive `lineiterator`, not the
    # LineBuffer `lb` that `first` was read from. When the two differ,
    # lines already buffered by `lb` may not be seen again -- confirm
    # against fastaNucIterator / fastqIterator.
    if fileformat == b'fasta':
        if seqtype == b'nuc':
            return (fastaNucIterator(lineiterator,
                                     skip, only,
                                     first),
                    Nuc_Seq)
        else:
            raise NotImplementedError()
    elif fileformat == b'fastq':
        return (fastqIterator(lineiterator,
                              skip, only,
                              qualityoffset,
                              first),
                Nuc_Seq)

    raise NotImplementedError('File format not yet implemented')
|
|
|
|
|
|
|
|
|
|
|