# python/obitools3/parsers/universal.pyx
"""Format sniffing and dispatch to the matching sequence-file parser."""

import re
from obitools3.parsers.fasta import fastaNucIterator
from obitools3.parsers.fastq import fastqIterator

# NOTE(review): `uopen`, `LineBuffer` and `Nuc_Seq` are used below but no
# import/cimport for them is visible in this file -- confirm they are brought
# into scope elsewhere (e.g. a companion .pxd) or add the imports.

# A primer: one or more IUPAC nucleotide codes, case-insensitive.
oligore = re.compile("^[ACGTRYSWKMBDHVN]+$",re.I)

# A tag column: an IUPAC word or "-", optionally followed by ":<IUPAC word>"
# or by "-".
# NOTE(review): as written the optional part accepts ":ACGT" or a bare "-",
# but not ":-"; the grouping may have been meant as "(:([...]+|-))?" --
# confirm against the ngsfilter file specification before changing it.
tagre = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I)


def is_ngsfilter_line(line):
    """Tell whether *line* looks like a record of an ngsfilter file.

    The line is split on whitespace; it qualifies when the third column
    matches ``tagre``, the fourth and fifth columns match ``oligore`` and
    the sixth column is exactly ``"F"`` or ``"T"``.

    Returns:
        bool: ``True`` when every check passes, ``False`` otherwise
        (including when the line has fewer than six columns or is not a
        text string the patterns can be applied to).
    """
    try:
        parts = line.split()
        if len(parts) < 6:
            return False
        # `and` / `is not None` instead of `&=` on match objects: match
        # objects do not support `&`, so the bitwise form always raised.
        return (tagre.match(parts[2]) is not None
                and oligore.match(parts[3]) is not None
                and oligore.match(parts[4]) is not None
                and parts[5] in ("F", "T"))
    except (AttributeError, TypeError):
        # Not a str (no .split, or bytes fed to str patterns): not ngsfilter.
        return False


def entryIteratorFactory(lineiterator,
                         int skip=0,
                         only=None,
                         bytes seqtype=b'nuc',
                         int qualityoffset=32,
                         bool noquality=False,
                         bool skiperror=True,
                         bool header=False,
                         bytes sep=None,
                         bytes dec=b'.',
                         bytes nastring=b"NA",
                         bool stripwhite=True,
                         bool blanklineskip=True,
                         bytes commentchar=b"#",
                         int buffersize=100000000):
    """Guess the format of an input from its first line and build a parser.

    Args:
        lineiterator: a file name (``str`` or ``bytes``, opened with
            ``uopen``), a ``LineBuffer``, or any other line source, which
            is then wrapped in ``LineBuffer(lineiterator, buffersize)``.
        skip, only: forwarded unchanged to the selected parser.
        seqtype: only ``b'nuc'`` is supported for fasta input.
        qualityoffset: forwarded to the fastq parser.
        buffersize: size passed to ``LineBuffer`` when wrapping the input.
        noquality, skiperror, header, sep, dec, nastring, stripwhite,
        blanklineskip, commentchar: accepted but not used by this function
            yet.

    Returns:
        tuple: ``(entry_iterator, Nuc_Seq)`` for fasta and fastq inputs.

    Raises:
        NotImplementedError: for a fasta input whose ``seqtype`` is not
            ``b'nuc'``, and for every detected format other than fasta
            and fastq.
        StopIteration: propagated from ``next()`` when the input yields no
            line at all.
    """

    if isinstance(lineiterator,(str,bytes)):
        lineiterator=uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        lb=lineiterator
    else:
        lb=LineBuffer(lineiterator,buffersize)

    i = iter(lb)

    # The first line is consumed here for sniffing and handed to the parser
    # below so that it is not lost.
    first=next(i)

    # Slices rather than first[0] so an empty first line cannot raise
    # IndexError.  A single if/elif chain: with two separate `if`s a fasta
    # line fell through to the final `else` and was relabelled tabular.
    if first[0:1]==">":
        fmt=b"fasta"
    elif first[0:1]=="@":
        fmt=b"fastq"
    elif first[0:3]=='ID ':
        fmt=b"embl"
    elif first[0:6]=='LOCUS ':
        fmt=b"genbank"
    elif first[0:11]=='#@ecopcr-v2':
        fmt=b"ecopcrfile"
    elif is_ngsfilter_line(first):
        fmt=b"ngsfilter"
    else:
        fmt=b"tabular"

    # NOTE(review): the parsers receive `lineiterator`, not the LineBuffer
    # `lb` that the first line was read from -- confirm the parsers cope
    # with a source that has already been partly consumed.
    if fmt==b'fasta':
        if seqtype == b'nuc':
            return (fastaNucIterator(lineiterator,
                                     skip,only,
                                     first),
                    Nuc_Seq)
        else:
            raise NotImplementedError()
    elif fmt==b'fastq':
        return (fastqIterator(lineiterator,
                              skip,only,
                              qualityoffset,
                              first),
                Nuc_Seq)

    raise NotImplementedError('File format not yet implemented')