import re

from obitools3.parsers.fasta import fastaNucIterator
from obitools3.parsers.fastq import fastqIterator


# One or more IUPAC nucleotide codes, case-insensitive.
oligore = re.compile("^[ACGTRYSWKMBDHVN]+$",re.I)

# Either an IUPAC word or "-", optionally followed by ":" + IUPAC word,
# or by a second "-".
tagre = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I)


def is_ngsfilter_line(line):
    """
    Tell whether `line` has the shape of an ngsfilter description line.

    The line is split on whitespace and accepted when:
      - the 3rd field matches `tagre`,
      - the 4th and 5th fields both match `oligore`,
      - the 6th field is exactly "F" or "T".

    Returns:
        bool: True when all four checks pass, False otherwise. A line with
        fewer than six fields, or a value that cannot be split or matched
        against the (str) patterns, yields False rather than an error.
    """
    try:
        parts = line.split()
        # Plain boolean logic: Match objects do not support `&`, and `|`
        # binds tighter than `==`, so the checks are spelled out explicitly.
        return (tagre.match(parts[2]) is not None
                and oligore.match(parts[3]) is not None
                and oligore.match(parts[4]) is not None
                and parts[5] in ("F", "T"))
    except (AttributeError, IndexError, TypeError):
        # AttributeError: `line` has no split(); IndexError: too few fields;
        # TypeError: str patterns applied to a bytes line.
        return False


def entryIteratorFactory(lineiterator,
                         int skip=0, only=None,
                         bytes seqtype=b'nuc',
                         int qualityoffset=32,
                         bool noquality=False,
                         bool skiperror=True,
                         bool header=False,
                         bytes sep=None,
                         bytes dec=b'.',
                         bytes nastring=b"NA",
                         bool stripwhite=True,
                         bool blanklineskip=True,
                         bytes commentchar=b"#",
                         int buffersize=100000000):
    """
    Guess the format of an input from its first line and build the
    matching entry iterator.

    Parameters:
        lineiterator: a file name (str or bytes, opened with `uopen`), a
            `LineBuffer`, or any other line source, which is then wrapped
            in a `LineBuffer` of size `buffersize`.
        skip, only: forwarded positionally to the selected parser.
        seqtype: only b'nuc' is supported for fasta input.
        qualityoffset: forwarded to the fastq parser.
        buffersize: size given to `LineBuffer` when wrapping the input.
        noquality, skiperror, header, sep, dec, nastring, stripwhite,
        blanklineskip, commentchar: accepted but not used by this function
            as written.

    Returns:
        tuple: `(iterator, Nuc_Seq)` for fasta (nucleic) and fastq inputs.

    Raises:
        NotImplementedError: for fasta input with a `seqtype` other than
            b'nuc', and for every recognised format other than fasta/fastq
            (embl, genbank, ecopcrfile, ngsfilter, tabular).
        StopIteration: propagated from `next()` when the input is empty.
    """
    if isinstance(lineiterator,(str,bytes)):
        lineiterator=uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        lb=lineiterator
    else:
        lb=LineBuffer(lineiterator,buffersize)

    i = iter(lb)
    first=next(i)

    # Single if/elif chain so that an early match (fasta) is not overwritten
    # by a later fallback. Slices are used instead of first[0] so an empty
    # first line falls through to "tabular" instead of raising IndexError.
    # NOTE(review): the comparisons use str literals, so `first` is presumably
    # a str line — confirm against what LineBuffer yields.
    if first[:1]==">":
        fmt=b"fasta"
    elif first[:1]=="@":
        fmt=b"fastq"
    elif first[0:3]=='ID ':
        fmt=b"embl"
    elif first[0:6]=='LOCUS ':
        fmt=b"genbank"
    elif first[0:11]=='#@ecopcr-v2':
        fmt=b"ecopcrfile"
    elif is_ngsfilter_line(first):
        fmt=b"ngsfilter"
    else:
        fmt=b"tabular"

    # NOTE(review): the parsers receive `lineiterator` (not `lb`) together
    # with the already consumed first line — confirm this is what they expect.
    if fmt==b'fasta':
        if seqtype == b'nuc':
            return (fastaNucIterator(lineiterator,
                                     skip,only,
                                     first),
                    Nuc_Seq)
        else:
            raise NotImplementedError()
    elif fmt==b'fastq':
        return (fastqIterator(lineiterator,
                              skip,only,
                              qualityoffset,
                              first),
                Nuc_Seq)

    raise NotImplementedError('File format not yet implemented')