From 06178d9d61737020210c56d028780de2718f0dbc Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Wed, 20 Mar 2019 11:44:43 +0100 Subject: [PATCH] Genbank file parser functions that should have been included in a previous commit --- python/obitools3/parsers/genbank.cfiles | 110 ++++++++++++++ python/obitools3/parsers/genbank.pxd | 9 ++ python/obitools3/parsers/genbank.pyx | 193 ++++++++++++++++++++++++ 3 files changed, 312 insertions(+) create mode 100644 python/obitools3/parsers/genbank.cfiles create mode 100755 python/obitools3/parsers/genbank.pxd create mode 100755 python/obitools3/parsers/genbank.pyx diff --git a/python/obitools3/parsers/genbank.cfiles b/python/obitools3/parsers/genbank.cfiles new file mode 100644 index 0000000..fcf4279 --- /dev/null +++ b/python/obitools3/parsers/genbank.cfiles @@ -0,0 +1,110 @@ +../../../src/obi_lcs.h +../../../src/obi_lcs.c +../../../src/obierrno.h +../../../src/obierrno.c +../../../src/upperband.h +../../../src/upperband.c +../../../src/sse_banded_LCS_alignment.h +../../../src/sse_banded_LCS_alignment.c +../../../src/obiblob.h +../../../src/obiblob.c +../../../src/utils.h +../../../src/utils.c +../../../src/obidms.h +../../../src/obidms.c +../../../src/libjson/json_utils.h +../../../src/libjson/json_utils.c +../../../src/libjson/cJSON.h +../../../src/libjson/cJSON.c +../../../src/obiavl.h +../../../src/obiavl.c +../../../src/bloom.h +../../../src/bloom.c +../../../src/crc64.h +../../../src/crc64.c +../../../src/murmurhash2.h +../../../src/murmurhash2.c +../../../src/obidmscolumn.h +../../../src/obidmscolumn.c +../../../src/obitypes.h +../../../src/obitypes.c +../../../src/obidmscolumndir.h +../../../src/obidmscolumndir.c +../../../src/obiblob_indexer.h +../../../src/obiblob_indexer.c +../../../src/obiview.h +../../../src/obiview.c +../../../src/hashtable.h +../../../src/hashtable.c +../../../src/linked_list.h +../../../src/linked_list.c +../../../src/obidmscolumn_array.h +../../../src/obidmscolumn_array.c +../../../src/obidmscolumn_blob.h +../../../src/obidmscolumn_blob.c +../../../src/obidmscolumn_idx.h +../../../src/obidmscolumn_idx.c +../../../src/obidmscolumn_bool.h +../../../src/obidmscolumn_bool.c +../../../src/obidmscolumn_char.h +../../../src/obidmscolumn_char.c +../../../src/obidmscolumn_float.h +../../../src/obidmscolumn_float.c +../../../src/obidmscolumn_int.h +../../../src/obidmscolumn_int.c +../../../src/obidmscolumn_qual.h +../../../src/obidmscolumn_qual.c +../../../src/obidmscolumn_seq.h +../../../src/obidmscolumn_seq.c +../../../src/obidmscolumn_str.h +../../../src/obidmscolumn_str.c +../../../src/array_indexer.h +../../../src/array_indexer.c +../../../src/char_str_indexer.h +../../../src/char_str_indexer.c +../../../src/dna_seq_indexer.h +../../../src/dna_seq_indexer.c +../../../src/encode.c +../../../src/encode.h +../../../src/uint8_indexer.c +../../../src/uint8_indexer.h +../../../src/build_reference_db.c +../../../src/build_reference_db.h +../../../src/kmer_similarity.c +../../../src/kmer_similarity.h +../../../src/obi_clean.c +../../../src/obi_clean.h +../../../src/obi_ecopcr.c +../../../src/obi_ecopcr.h +../../../src/obi_ecotag.c +../../../src/obi_ecotag.h +../../../src/obidms_taxonomy.c +../../../src/obidms_taxonomy.h +../../../src/obilittlebigman.c +../../../src/obilittlebigman.h +../../../src/_sse.h +../../../src/obidebug.h +../../../src/libecoPCR/libapat/CODES/dft_code.h +../../../src/libecoPCR/libapat/CODES/dna_code.h +../../../src/libecoPCR/libapat/CODES/prot_code.h +../../../src/libecoPCR/libapat/apat_parse.c +../../../src/libecoPCR/libapat/apat_search.c +../../../src/libecoPCR/libapat/apat.h +../../../src/libecoPCR/libapat/Gmach.h +../../../src/libecoPCR/libapat/Gtypes.h +../../../src/libecoPCR/libapat/libstki.c +../../../src/libecoPCR/libapat/libstki.h +../../../src/libecoPCR/libthermo/nnparams.h +../../../src/libecoPCR/libthermo/nnparams.c +../../../src/libecoPCR/ecoapat.c +../../../src/libecoPCR/ecodna.c +../../../src/libecoPCR/ecoError.c +../../../src/libecoPCR/ecoMalloc.c +../../../src/libecoPCR/ecoPCR.h + + + + + + + diff --git a/python/obitools3/parsers/genbank.pxd b/python/obitools3/parsers/genbank.pxd new file mode 100755 index 0000000..8962d37 --- /dev/null +++ b/python/obitools3/parsers/genbank.pxd @@ -0,0 +1,9 @@ +#cython: language_level=3 + +from ..utils cimport str2bytes +from .header cimport parseHeader +from ..files.universalopener cimport uopen +from ..files.linebuffer cimport LineBuffer + + + \ No newline at end of file diff --git a/python/obitools3/parsers/genbank.pyx b/python/obitools3/parsers/genbank.pyx new file mode 100755 index 0000000..dcce178 --- /dev/null +++ b/python/obitools3/parsers/genbank.pyx @@ -0,0 +1,193 @@ +#cython: language_level=3 + +''' +Created on June 12th 2018 + +@author: coissac/mercier +''' + + +import types +import re +import sys +import os +import glob + +from obitools3.files.universalopener cimport uopen +from obitools3.utils cimport tostr +from obitools3.dms.obiseq cimport Nuc_Seq +from .embl_genbank_features import extractTaxon + +from libc.stdlib cimport free, malloc, realloc +from libc.string cimport strcpy, strlen + + +_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M) + +_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M) +_seqMatcher = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M) +_cleanSeq = re.compile(b'[ \n0-9]+') +_acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M) +_deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M) +_cleanDe = re.compile(b'\n *') + + +def genbankParser(bytes text): + + cdef Nuc_Seq seq + + try: + header = _headerMatcher.search(text).group() + + ft = _featureMatcher.search(text).group() + + s = _seqMatcher.search(text).group() + s = _cleanSeq.sub(b'', s).upper() + + acs = _acMatcher.search(text).group() + acs = acs.split() + ac = acs[0] + acs = acs[1:] + + de = _deMatcher.search(header).group() + de = _cleanDe.sub(b' ',de).strip().strip(b'.') + + except Exception as e: + print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")") + # Do not raise any Exception if you need the possibility to resume the generator + # (Python generators can't resume after any exception is raised) + return None + + tags = {} + extractTaxon(ft, tags) + + seq = Nuc_Seq(ac, + s, + definition=de, + quality=None, + offset=-1, + tags=tags) + + return seq + + +def genbankIterator_file(lineiterator, + int skip=0, + only=None, + firstline=None, + int buffersize=100000000 + ): + cdef int lines_to_skip, ionly, read + cdef Nuc_Seq seq + cdef char* entry = NULL + cdef size_t entry_buffer_size + cdef int entry_len + cdef int line_len + + entry_buffer_size = 2048 + + entry = malloc(entry_buffer_size*sizeof(char)) + + if only is None: + ionly = -1 + else: + ionly = int(only) + + if isinstance(lineiterator, (str, bytes)): + lineiterator=uopen(lineiterator) + if isinstance(lineiterator, LineBuffer): + iterator = iter(lineiterator) + else: + if hasattr(lineiterator, "readlines"): + iterator = iter(LineBuffer(lineiterator, buffersize)) + elif hasattr(lineiterator, '__next__'): + iterator = lineiterator + else: + raise Exception("Invalid line iterator") + + skipped = 0 + read = 0 + + if firstline is None: + line = next(iterator) + else: + line = firstline + + while True: + + if ionly >= 0 and read >= ionly-1: + break + + while skipped < skip: + line = next(iterator) + try: + while line[:2] != b'//': + line = next(iterator) + line = next(iterator) + except StopIteration: + break + skipped += 1 + + try: + entry_len = 0 + while line[:2] != b'//': + line_len = strlen(line) + while (entry_len + line_len) >= entry_buffer_size: + entry_buffer_size*=2 + entry = realloc(entry, entry_buffer_size) + strcpy(entry+entry_len, line) + entry_len+=line_len + line = next(iterator) + # Add last line too because need the // flag to parse + line_len = strlen(line) + while (entry_len + line_len) >= entry_buffer_size: + entry_buffer_size*=2 + entry = realloc(entry, entry_buffer_size) + strcpy(entry+entry_len, line) + line = next(iterator) + except StopIteration: + break + + seq = genbankParser(entry) + + yield seq + read+=1 + + yield seq + + free(entry) + + +def genbankIterator_dir(dir_path, + int skip=0, + only=None, + firstline=None, + int buffersize=100000000 + ): + path = dir_path + read = 0 + for filename in glob.glob(os.path.join(path, b'*.gbff*')): + if read==only: + return + f = uopen(filename) + if only is not None: + only_f = only-read + else: + only_f = None + for seq in genbankIterator_file(f, skip=skip, only=only_f, buffersize=buffersize): + yield seq + read+=1 + + +def genbankIterator(obj, + int skip=0, + only=None, + firstline=None, + int buffersize=100000000 + ): + if type(obj) == bytes or type(obj) == str : + return genbankIterator_dir(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize) + else: + return genbankIterator_file(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize) + +