Add a factory that checks the file format and returns the correct iterator.

First version; it works only with the fasta and fastq nucleic formats.
2017-07-27 16:02:52 +02:00
parent 0f6ae7dfa6
commit 8781ecab1f
1 changed files with 86 additions and 0 deletions
--- a/python/obitools3/parsers/universal.pyx
+++ b/python/obitools3/parsers/universal.pyx
@ -0,0 +1,86 @@
+import re 
+from obitools3.parsers.fasta import fastaNucIterator
+from obitools3.parsers.fastq import fastqIterator
+
+# Matches a non-empty string made only of IUPAC nucleotide codes
+# (A, C, G, T and the ambiguity letters R, Y, S, W, K, M, B, D, H, V, N),
+# case-insensitively.
+oligore = re.compile("^[ACGTRYSWKMBDHVN]+$",re.I)
+# Matches a tag field: an oligo or "-", optionally followed by either
+# ":<oligo>" or a bare "-".
+# NOTE(review): the alternation in the second group binds as
+# (":<oligo>" | "-"), so "ACGT:-" is rejected while "ACGT-" is accepted;
+# confirm whether "(:(<oligo>|-))?" was intended.
+tagre   = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I)
+
+def is_ngsfilter_line(line):
+    """
+    Tell whether `line` looks like a record of an ngsfilter file.
+
+    The line is split on whitespace and must provide at least six fields
+    where the third one matches `tagre`, the fourth and fifth ones match
+    `oligore`, and the sixth one is exactly "F" or "T".
+
+    :param line: the line to test
+    :returns: True when every check succeeds, False otherwise (including
+              when the line has too few fields or cannot be split/matched)
+    """
+    try:
+        parts = line.split()
+        # Plain boolean logic: `&` is not defined between match objects and
+        # `|` binds tighter than `==`, so the bitwise operators cannot be
+        # used to combine these checks.
+        return (tagre.match(parts[2]) is not None
+                and oligore.match(parts[3]) is not None
+                and oligore.match(parts[4]) is not None
+                and parts[5] in ("F", "T"))
+    except (AttributeError, IndexError, TypeError):
+        # No split() method, fewer than six fields, or a value the str
+        # patterns cannot be applied to: not an ngsfilter line.
+        return False
+
+def entryIteratorFactory(lineiterator, 
+                  int skip=0,
+                  only=None,
+                  bytes seqtype=b'nuc',
+                  int qualityoffset=32,
+                  bool noquality=False,
+                  bool skiperror=True,
+                  bool header=False,
+                  bytes sep=None,
+                  bytes dec=b'.',
+                  bytes nastring=b"NA",
+                  bool stripwhite=True,
+                  bool blanklineskip=True,
+                  bytes commentchar=b"#",
+                  int buffersize=100000000):
+    """
+    Guess the format of a file from its first line and return the matching
+    entry iterator.
+
+    Only the fasta (with `seqtype == b'nuc'`) and fastq formats are handled
+    by this version; the embl, genbank, ecopcrfile, ngsfilter and tabular
+    formats are recognised but not implemented.
+
+    :param lineiterator: a file name (str or bytes, opened with `uopen`),
+                         a `LineBuffer`, or an object from which a
+                         `LineBuffer` can be built
+    :param skip: forwarded to the fasta/fastq iterator
+    :param only: forwarded to the fasta/fastq iterator
+    :param seqtype: kind of sequences; only b'nuc' is supported for fasta
+    :param qualityoffset: forwarded to the fastq iterator
+    :param buffersize: size given to the `LineBuffer` built around
+                       `lineiterator` when it is not already one
+
+    The remaining parameters (`noquality`, `skiperror`, `header`, `sep`,
+    `dec`, `nastring`, `stripwhite`, `blanklineskip`, `commentchar`) are
+    accepted but not used by this version.
+
+    :returns: a tuple `(iterator, Nuc_Seq)`
+    :raises NotImplementedError: for a fasta file whose `seqtype` is not
+                                 b'nuc' and for every other format
+    :raises StopIteration: if no line can be read from the input
+    """
+
+    # NOTE(review): uopen, LineBuffer and Nuc_Seq are not imported in the
+    # visible part of this module - confirm they are in scope.
+    if isinstance(lineiterator,(str,bytes)):
+        lineiterator=uopen(lineiterator)
+
+    if isinstance(lineiterator, LineBuffer):
+        lb=lineiterator
+    else:
+        lb=LineBuffer(lineiterator,buffersize)
+
+    i = iter(lb)
+
+    first=next(i)
+
+    # A single if/elif chain: the fasta test must belong to it, otherwise
+    # the final `else` overrides the detected format with b"tabular".
+    # Slicing (first[:1]) instead of indexing keeps an empty first line
+    # from raising IndexError.
+    # NOTE(review): the prefixes are str literals - confirm that the lines
+    # produced by LineBuffer are str and not bytes.
+    if first[:1]==">":
+        fileformat=b"fasta"
+    elif first[:1]=="@":
+        fileformat=b"fastq"
+    elif first[0:3]=='ID ':
+        fileformat=b"embl"
+    elif first[0:6]=='LOCUS ':
+        fileformat=b"genbank"
+    elif first[0:11]=='#@ecopcr-v2':
+        fileformat=b"ecopcrfile"
+    elif is_ngsfilter_line(first):
+        fileformat=b"ngsfilter"
+    else:
+        fileformat=b"tabular"
+
+    # NOTE(review): the original `lineiterator` (not the LineBuffer the
+    # first line was read through) is handed to the parsers together with
+    # that first line - confirm this is what the parsers expect.
+    if fileformat==b'fasta':
+        if seqtype == b'nuc':
+            return (fastaNucIterator(lineiterator,
+                                     skip,only,
+                                     first),
+                    Nuc_Seq)
+        else:
+            raise NotImplementedError()
+    elif fileformat==b'fastq':
+        return (fastqIterator(lineiterator,
+                              skip,only,
+                              qualityoffset,
+                              first),
+                Nuc_Seq)
+
+    raise NotImplementedError('File format not yet implemented')
+
+
+        
+
+