From 06178d9d61737020210c56d028780de2718f0dbc Mon Sep 17 00:00:00 2001
From: Celine Mercier <celine.mercier@metabarcoding.org>
Date: Wed, 20 Mar 2019 11:44:43 +0100
Subject: [PATCH] Genbank file parser functions that should have been included
 in a previous commit

---
 python/obitools3/parsers/genbank.cfiles | 110 ++++++++++++++
 python/obitools3/parsers/genbank.pxd    |   9 ++
 python/obitools3/parsers/genbank.pyx    | 193 ++++++++++++++++++++++++
 3 files changed, 312 insertions(+)
 create mode 100644 python/obitools3/parsers/genbank.cfiles
 create mode 100755 python/obitools3/parsers/genbank.pxd
 create mode 100755 python/obitools3/parsers/genbank.pyx

diff --git a/python/obitools3/parsers/genbank.cfiles b/python/obitools3/parsers/genbank.cfiles
new file mode 100644
index 0000000..fcf4279
--- /dev/null
+++ b/python/obitools3/parsers/genbank.cfiles
@@ -0,0 +1,110 @@
+../../../src/obi_lcs.h
+../../../src/obi_lcs.c
+../../../src/obierrno.h
+../../../src/obierrno.c
+../../../src/upperband.h
+../../../src/upperband.c
+../../../src/sse_banded_LCS_alignment.h
+../../../src/sse_banded_LCS_alignment.c
+../../../src/obiblob.h
+../../../src/obiblob.c
+../../../src/utils.h
+../../../src/utils.c
+../../../src/obidms.h
+../../../src/obidms.c
+../../../src/libjson/json_utils.h
+../../../src/libjson/json_utils.c
+../../../src/libjson/cJSON.h
+../../../src/libjson/cJSON.c
+../../../src/obiavl.h
+../../../src/obiavl.c
+../../../src/bloom.h
+../../../src/bloom.c
+../../../src/crc64.h
+../../../src/crc64.c
+../../../src/murmurhash2.h
+../../../src/murmurhash2.c
+../../../src/obidmscolumn.h
+../../../src/obidmscolumn.c
+../../../src/obitypes.h
+../../../src/obitypes.c
+../../../src/obidmscolumndir.h
+../../../src/obidmscolumndir.c
+../../../src/obiblob_indexer.h
+../../../src/obiblob_indexer.c
+../../../src/obiview.h
+../../../src/obiview.c
+../../../src/hashtable.h
+../../../src/hashtable.c
+../../../src/linked_list.h
+../../../src/linked_list.c
+../../../src/obidmscolumn_array.h
+../../../src/obidmscolumn_array.c
+../../../src/obidmscolumn_blob.h
+../../../src/obidmscolumn_blob.c
+../../../src/obidmscolumn_idx.h
+../../../src/obidmscolumn_idx.c
+../../../src/obidmscolumn_bool.h
+../../../src/obidmscolumn_bool.c
+../../../src/obidmscolumn_char.h
+../../../src/obidmscolumn_char.c
+../../../src/obidmscolumn_float.h
+../../../src/obidmscolumn_float.c
+../../../src/obidmscolumn_int.h
+../../../src/obidmscolumn_int.c
+../../../src/obidmscolumn_qual.h
+../../../src/obidmscolumn_qual.c
+../../../src/obidmscolumn_seq.h
+../../../src/obidmscolumn_seq.c
+../../../src/obidmscolumn_str.h
+../../../src/obidmscolumn_str.c
+../../../src/array_indexer.h
+../../../src/array_indexer.c
+../../../src/char_str_indexer.h
+../../../src/char_str_indexer.c
+../../../src/dna_seq_indexer.h
+../../../src/dna_seq_indexer.c
+../../../src/encode.c
+../../../src/encode.h
+../../../src/uint8_indexer.c
+../../../src/uint8_indexer.h
+../../../src/build_reference_db.c
+../../../src/build_reference_db.h
+../../../src/kmer_similarity.c
+../../../src/kmer_similarity.h
+../../../src/obi_clean.c
+../../../src/obi_clean.h
+../../../src/obi_ecopcr.c
+../../../src/obi_ecopcr.h
+../../../src/obi_ecotag.c
+../../../src/obi_ecotag.h
+../../../src/obidms_taxonomy.c
+../../../src/obidms_taxonomy.h
+../../../src/obilittlebigman.c
+../../../src/obilittlebigman.h
+../../../src/_sse.h
+../../../src/obidebug.h
+../../../src/libecoPCR/libapat/CODES/dft_code.h
+../../../src/libecoPCR/libapat/CODES/dna_code.h
+../../../src/libecoPCR/libapat/CODES/prot_code.h
+../../../src/libecoPCR/libapat/apat_parse.c
+../../../src/libecoPCR/libapat/apat_search.c
+../../../src/libecoPCR/libapat/apat.h
+../../../src/libecoPCR/libapat/Gmach.h
+../../../src/libecoPCR/libapat/Gtypes.h
+../../../src/libecoPCR/libapat/libstki.c
+../../../src/libecoPCR/libapat/libstki.h
+../../../src/libecoPCR/libthermo/nnparams.h
+../../../src/libecoPCR/libthermo/nnparams.c
+../../../src/libecoPCR/ecoapat.c
+../../../src/libecoPCR/ecodna.c
+../../../src/libecoPCR/ecoError.c
+../../../src/libecoPCR/ecoMalloc.c
+../../../src/libecoPCR/ecoPCR.h
+
+
+
+
+
+
+
diff --git a/python/obitools3/parsers/genbank.pxd b/python/obitools3/parsers/genbank.pxd
new file mode 100755
index 0000000..8962d37
--- /dev/null
+++ b/python/obitools3/parsers/genbank.pxd
@@ -0,0 +1,9 @@
+#cython: language_level=3
+
+from ..utils cimport str2bytes
+from .header cimport parseHeader
+from ..files.universalopener cimport uopen
+from ..files.linebuffer cimport LineBuffer
+
+
+    
\ No newline at end of file
diff --git a/python/obitools3/parsers/genbank.pyx b/python/obitools3/parsers/genbank.pyx
new file mode 100755
index 0000000..dcce178
--- /dev/null
+++ b/python/obitools3/parsers/genbank.pyx
@@ -0,0 +1,193 @@
+#cython: language_level=3
+
+'''
+Created on June 12th 2018
+
+@author: coissac/mercier
+'''
+
+
+import types
+import re
+import sys
+import os
+import glob
+
+from obitools3.files.universalopener cimport uopen
+from obitools3.utils cimport tostr
+from obitools3.dms.obiseq cimport Nuc_Seq
+from .embl_genbank_features import extractTaxon
+
+from libc.stdlib  cimport free, malloc, realloc
+from libc.string cimport strcpy, strlen
+
+
+_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)
+
+_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
+_seqMatcher    = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M)
+_cleanSeq      = re.compile(b'[ \n0-9]+')
+_acMatcher     = re.compile(b'(?<=^ACCESSION   ).+',re.M)
+_deMatcher     = re.compile(b'(?<=^DEFINITION  ).+\n( .+\n)*',re.M)
+_cleanDe       = re.compile(b'\n *')
+
+
+def genbankParser(bytes text):
+    
+    cdef Nuc_Seq seq
+        
+    try:
+        header = _headerMatcher.search(text).group()
+        
+        ft     = _featureMatcher.search(text).group()
+        
+        s      = _seqMatcher.search(text).group()
+        s      = _cleanSeq.sub(b'', s).upper()
+        
+        acs    = _acMatcher.search(text).group()
+        acs    = acs.split()
+        ac     = acs[0]
+        acs    = acs[1:]
+        
+        de     = _deMatcher.search(header).group()
+        de     = _cleanDe.sub(b' ',de).strip().strip(b'.')
+
+    except Exception as e:
+        print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
+        # Do not raise any Exception if you need the possibility to resume the generator
+        # (Python generators can't resume after any exception is raised)
+        return None
+    
+    tags = {}
+    extractTaxon(ft, tags)
+        
+    seq = Nuc_Seq(ac,
+                  s,
+                  definition=de,
+                  quality=None,
+                  offset=-1,
+                  tags=tags)
+    
+    return seq
+
+
+def genbankIterator_file(lineiterator, 
+                         int skip=0,
+                         only=None,
+                         firstline=None,
+                         int buffersize=100000000
+                         ):
+    cdef int        lines_to_skip, ionly, read
+    cdef Nuc_Seq    seq
+    cdef char*      entry = NULL
+    cdef size_t     entry_buffer_size 
+    cdef int        entry_len
+    cdef int        line_len   
+    
+    entry_buffer_size = 2048
+    
+    entry = <char*> malloc(entry_buffer_size*sizeof(char)) 
+
+    if only is None:
+        ionly = -1
+    else:
+        ionly = int(only)
+
+    if isinstance(lineiterator, (str, bytes)):
+        lineiterator=uopen(lineiterator)        
+    if isinstance(lineiterator, LineBuffer):
+        iterator = iter(lineiterator)
+    else:
+        if hasattr(lineiterator, "readlines"):
+            iterator = iter(LineBuffer(lineiterator, buffersize))
+        elif hasattr(lineiterator, '__next__'):
+            iterator = lineiterator
+        else:
+            raise Exception("Invalid line iterator")
+
+    skipped = 0
+    read = 0
+    
+    if firstline is None:
+        line = next(iterator)
+    else:
+        line = firstline       
+            
+    while True:
+        
+        if ionly >= 0 and read >= ionly-1:
+            break
+                
+        while skipped < skip:
+            line = next(iterator)
+            try:
+                while line[:2] != b'//':
+                    line = next(iterator)
+                line = next(iterator)
+            except StopIteration:
+                break
+            skipped += 1
+
+        try:
+            entry_len = 0
+            while line[:2] != b'//':
+                line_len = strlen(line)
+                while (entry_len + line_len) >= entry_buffer_size:
+                    entry_buffer_size*=2
+                    entry = <char*>realloc(entry, entry_buffer_size)
+                strcpy(entry+entry_len, line)
+                entry_len+=line_len
+                line = next(iterator)
+            # Add last line too because need the // flag to parse
+            line_len = strlen(line)
+            while (entry_len + line_len) >= entry_buffer_size:
+                entry_buffer_size*=2
+                entry = <char*>realloc(entry, entry_buffer_size)
+            strcpy(entry+entry_len, line)
+            line = next(iterator)
+        except StopIteration:
+            break
+        
+        seq = genbankParser(entry)
+
+        yield seq
+        read+=1
+    
+    yield seq
+    
+    free(entry)
+
+
+def genbankIterator_dir(dir_path, 
+                        int skip=0,
+                        only=None,
+                        firstline=None,
+                        int buffersize=100000000
+                       ):
+    path = dir_path
+    read = 0
+    for filename in glob.glob(os.path.join(path, b'*.gbff*')):
+        if read==only:
+            return
+        f = uopen(filename)
+        if only is not None:
+            only_f = only-read
+        else:
+            only_f = None
+        for seq in genbankIterator_file(f, skip=skip, only=only_f, buffersize=buffersize):
+            yield seq
+            read+=1
+
+
+def genbankIterator(obj, 
+                    int skip=0,
+                    only=None,
+                    firstline=None,
+                    int buffersize=100000000
+                   ):
+    if type(obj) == bytes or type(obj) == str :
+        return genbankIterator_dir(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize)
+    else:
+        return genbankIterator_file(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize)
+
+