Genbank file parser functions that should have been included in a
previous commit
This commit is contained in:
110
python/obitools3/parsers/genbank.cfiles
Normal file
110
python/obitools3/parsers/genbank.cfiles
Normal file
@ -0,0 +1,110 @@
|
||||
../../../src/obi_lcs.h
|
||||
../../../src/obi_lcs.c
|
||||
../../../src/obierrno.h
|
||||
../../../src/obierrno.c
|
||||
../../../src/upperband.h
|
||||
../../../src/upperband.c
|
||||
../../../src/sse_banded_LCS_alignment.h
|
||||
../../../src/sse_banded_LCS_alignment.c
|
||||
../../../src/obiblob.h
|
||||
../../../src/obiblob.c
|
||||
../../../src/utils.h
|
||||
../../../src/utils.c
|
||||
../../../src/obidms.h
|
||||
../../../src/obidms.c
|
||||
../../../src/libjson/json_utils.h
|
||||
../../../src/libjson/json_utils.c
|
||||
../../../src/libjson/cJSON.h
|
||||
../../../src/libjson/cJSON.c
|
||||
../../../src/obiavl.h
|
||||
../../../src/obiavl.c
|
||||
../../../src/bloom.h
|
||||
../../../src/bloom.c
|
||||
../../../src/crc64.h
|
||||
../../../src/crc64.c
|
||||
../../../src/murmurhash2.h
|
||||
../../../src/murmurhash2.c
|
||||
../../../src/obidmscolumn.h
|
||||
../../../src/obidmscolumn.c
|
||||
../../../src/obitypes.h
|
||||
../../../src/obitypes.c
|
||||
../../../src/obidmscolumndir.h
|
||||
../../../src/obidmscolumndir.c
|
||||
../../../src/obiblob_indexer.h
|
||||
../../../src/obiblob_indexer.c
|
||||
../../../src/obiview.h
|
||||
../../../src/obiview.c
|
||||
../../../src/hashtable.h
|
||||
../../../src/hashtable.c
|
||||
../../../src/linked_list.h
|
||||
../../../src/linked_list.c
|
||||
../../../src/obidmscolumn_array.h
|
||||
../../../src/obidmscolumn_array.c
|
||||
../../../src/obidmscolumn_blob.h
|
||||
../../../src/obidmscolumn_blob.c
|
||||
../../../src/obidmscolumn_idx.h
|
||||
../../../src/obidmscolumn_idx.c
|
||||
../../../src/obidmscolumn_bool.h
|
||||
../../../src/obidmscolumn_bool.c
|
||||
../../../src/obidmscolumn_char.h
|
||||
../../../src/obidmscolumn_char.c
|
||||
../../../src/obidmscolumn_float.h
|
||||
../../../src/obidmscolumn_float.c
|
||||
../../../src/obidmscolumn_int.h
|
||||
../../../src/obidmscolumn_int.c
|
||||
../../../src/obidmscolumn_qual.h
|
||||
../../../src/obidmscolumn_qual.c
|
||||
../../../src/obidmscolumn_seq.h
|
||||
../../../src/obidmscolumn_seq.c
|
||||
../../../src/obidmscolumn_str.h
|
||||
../../../src/obidmscolumn_str.c
|
||||
../../../src/array_indexer.h
|
||||
../../../src/array_indexer.c
|
||||
../../../src/char_str_indexer.h
|
||||
../../../src/char_str_indexer.c
|
||||
../../../src/dna_seq_indexer.h
|
||||
../../../src/dna_seq_indexer.c
|
||||
../../../src/encode.c
|
||||
../../../src/encode.h
|
||||
../../../src/uint8_indexer.c
|
||||
../../../src/uint8_indexer.h
|
||||
../../../src/build_reference_db.c
|
||||
../../../src/build_reference_db.h
|
||||
../../../src/kmer_similarity.c
|
||||
../../../src/kmer_similarity.h
|
||||
../../../src/obi_clean.c
|
||||
../../../src/obi_clean.h
|
||||
../../../src/obi_ecopcr.c
|
||||
../../../src/obi_ecopcr.h
|
||||
../../../src/obi_ecotag.c
|
||||
../../../src/obi_ecotag.h
|
||||
../../../src/obidms_taxonomy.c
|
||||
../../../src/obidms_taxonomy.h
|
||||
../../../src/obilittlebigman.c
|
||||
../../../src/obilittlebigman.h
|
||||
../../../src/_sse.h
|
||||
../../../src/obidebug.h
|
||||
../../../src/libecoPCR/libapat/CODES/dft_code.h
|
||||
../../../src/libecoPCR/libapat/CODES/dna_code.h
|
||||
../../../src/libecoPCR/libapat/CODES/prot_code.h
|
||||
../../../src/libecoPCR/libapat/apat_parse.c
|
||||
../../../src/libecoPCR/libapat/apat_search.c
|
||||
../../../src/libecoPCR/libapat/apat.h
|
||||
../../../src/libecoPCR/libapat/Gmach.h
|
||||
../../../src/libecoPCR/libapat/Gtypes.h
|
||||
../../../src/libecoPCR/libapat/libstki.c
|
||||
../../../src/libecoPCR/libapat/libstki.h
|
||||
../../../src/libecoPCR/libthermo/nnparams.h
|
||||
../../../src/libecoPCR/libthermo/nnparams.c
|
||||
../../../src/libecoPCR/ecoapat.c
|
||||
../../../src/libecoPCR/ecodna.c
|
||||
../../../src/libecoPCR/ecoError.c
|
||||
../../../src/libecoPCR/ecoMalloc.c
|
||||
../../../src/libecoPCR/ecoPCR.h
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
9
python/obitools3/parsers/genbank.pxd
Executable file
9
python/obitools3/parsers/genbank.pxd
Executable file
@ -0,0 +1,9 @@
|
||||
#cython: language_level=3
|
||||
|
||||
from ..utils cimport str2bytes
|
||||
from .header cimport parseHeader
|
||||
from ..files.universalopener cimport uopen
|
||||
from ..files.linebuffer cimport LineBuffer
|
||||
|
||||
|
||||
|
193
python/obitools3/parsers/genbank.pyx
Executable file
193
python/obitools3/parsers/genbank.pyx
Executable file
@ -0,0 +1,193 @@
|
||||
#cython: language_level=3
|
||||
|
||||
'''
|
||||
Created on June 12th 2018
|
||||
|
||||
@author: coissac/mercier
|
||||
'''
|
||||
|
||||
|
||||
import types
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
|
||||
from obitools3.files.universalopener cimport uopen
|
||||
from obitools3.utils cimport tostr
|
||||
from obitools3.dms.obiseq cimport Nuc_Seq
|
||||
from .embl_genbank_features import extractTaxon
|
||||
|
||||
from libc.stdlib cimport free, malloc, realloc
|
||||
from libc.string cimport strcpy, strlen
|
||||
|
||||
|
||||
# Pre-compiled patterns used by genbankParser() to slice one GenBank entry
# (all of them operate on bytes).
# NOTE(review): GenBank flat files pad keywords such as ACCESSION/DEFINITION
# with several blanks; the single blank below still matches because
# genbankParser() splits/strips the captured text - confirm the literals
# against the repository copy of this file.
_MULTILINE_DOTALL = re.DOTALL | re.M

# FEATURES table, up to (not including) the ORIGIN line.
_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)', _MULTILINE_DOTALL)

# Header: from the LOCUS line up to (not including) the FEATURES line.
_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', _MULTILINE_DOTALL)

# Raw sequence block located between ORIGIN and the closing '//' line.
_seqMatcher = re.compile(b'(?<=ORIGIN).+(?=//\n)', _MULTILINE_DOTALL)

# Everything in the sequence block that is not a residue letter.
_cleanSeq = re.compile(b'[ \n0-9]+')

# Content of the ACCESSION line.
_acMatcher = re.compile(b'(?<=^ACCESSION ).+', re.M)

# Content of the DEFINITION line plus its indented continuation lines.
_deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*', re.M)

# Line break followed by the indentation of a continuation line.
_cleanDe = re.compile(b'\n *')
|
||||
|
||||
|
||||
def genbankParser(bytes text):
    '''
    Parse one GenBank flat-file entry into a Nuc_Seq.

    @param text: raw bytes of a single entry, from its LOCUS line up to and
                 including the terminating '//' line.

    @return: a Nuc_Seq whose id is the first accession number of the entry,
             with the upper-cased sequence, the DEFINITION text and the tags
             filled in by extractTaxon(); None when one of the expected
             sections cannot be found (a message is printed in that case).
    '''

    cdef Nuc_Seq seq

    try:
        header = _headerMatcher.search(text).group()

        ft = _featureMatcher.search(text).group()

        # Sequence block: drop position numbers, blanks and line breaks.
        s = _seqMatcher.search(text).group()
        s = _cleanSeq.sub(b'', s).upper()

        # Only the first accession number is used as the identifier.
        ac = _acMatcher.search(text).group().split()[0]

        # Fold a multi-line definition into one line, without the final dot.
        de = _deMatcher.search(header).group()
        de = _cleanDe.sub(b' ', de).strip().strip(b'.')

    except Exception as e:
        # Building the message must not raise either: the second token of the
        # entry is reported as its id, when there is one. Only the first
        # tokens are split off so that a large entry is not tokenized whole.
        fields = text.split(None, 2) if text else []
        seq_id = fields[1] if len(fields) > 1 else b'<unknown>'
        print("\nCould not import sequence id:", seq_id, "(error raised:", e, ")")
        # Do not raise any Exception if you need the possibility to resume the generator
        # (Python generators can't resume after any exception is raised)
        return None

    tags = {}
    extractTaxon(ft, tags)

    seq = Nuc_Seq(ac,
                  s,
                  definition=de,
                  quality=None,
                  offset=-1,
                  tags=tags)

    return seq
|
||||
|
||||
|
||||
def genbankIterator_file(lineiterator,
                         int skip=0,
                         only=None,
                         firstline=None,
                         int buffersize=100000000
                         ):
    '''
    Iterate over the entries of one GenBank flat file.

    @param lineiterator: the source of lines (bytes). Either a path (str or
                         bytes, opened with uopen), a LineBuffer, an object
                         with a readlines attribute (wrapped in a LineBuffer)
                         or any iterator implementing __next__.
    @param skip: number of leading entries to skip.
    @param only: maximum number of entries to yield; None or a negative
                 value means no limit.
    @param firstline: first line of the input when the caller already
                      consumed it from lineiterator.
    @param buffersize: buffer size handed to LineBuffer when one is created.

    @return: a generator yielding, for each entry, the result of
             genbankParser(): a Nuc_Seq, or None for an entry that could not
             be parsed.

    @raise Exception: if lineiterator is of none of the supported kinds.
    '''
    cdef int ionly, read
    cdef Nuc_Seq seq

    if only is None:
        ionly = -1
    else:
        ionly = int(only)

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)
    if isinstance(lineiterator, LineBuffer):
        iterator = iter(lineiterator)
    elif hasattr(lineiterator, "readlines"):
        iterator = iter(LineBuffer(lineiterator, buffersize))
    elif hasattr(lineiterator, '__next__'):
        iterator = lineiterator
    else:
        raise Exception("Invalid line iterator")

    # next() is always given a default: a StopIteration escaping from the
    # body of a generator is an error, and None doubles as the
    # end-of-input marker below.
    if firstline is None:
        line = next(iterator, None)
    else:
        line = firstline

    # Skip the requested number of entries: move to the '//' line closing
    # the current entry, then to the first line of the following one.
    skipped = 0
    while line is not None and skipped < skip:
        while line is not None and line[:2] != b'//':
            line = next(iterator, None)
        if line is None:
            break
        line = next(iterator, None)
        skipped += 1

    read = 0
    while line is not None and (ionly < 0 or read < ionly):

        # Gather the lines of the entry. The pieces are joined into a single
        # bytes object, so there is no manually managed buffer to grow or to
        # release when the generator is abandoned early.
        entry_lines = []
        while line is not None and line[:2] != b'//':
            entry_lines.append(line)
            line = next(iterator, None)

        if line is None:
            # Input ended before a '//' line: the entry is incomplete
            # (or only trailing lines were left), nothing to parse.
            break

        # Add last line too because need the // flag to parse
        entry_lines.append(line)

        # The entry is parsed as soon as it is complete, so the last entry
        # of the input is returned like any other one.
        seq = genbankParser(b''.join(entry_lines))

        yield seq
        read += 1

        line = next(iterator, None)
|
||||
|
||||
|
||||
def genbankIterator_dir(dir_path,
                        int skip=0,
                        only=None,
                        firstline=None,
                        int buffersize=100000000
                        ):
    '''
    Iterate over the entries of every '*.gbff*' file of a directory.

    @param dir_path: path of the directory, as str or bytes.
    @param skip: number of leading entries to skip; the value is handed to
                 genbankIterator_file() for each file.
                 NOTE(review): this skips `skip` entries in every file, not
                 `skip` entries overall - confirm this is intended.
    @param only: maximum total number of entries to yield over all the
                 files, None for no limit.
    @param firstline: accepted for signature compatibility with
                      genbankIterator_file(); not used here.
    @param buffersize: forwarded to genbankIterator_file().

    @return: a generator yielding what genbankIterator_file() yields for
             each file in turn.
    '''
    # os.path.join() and glob refuse to mix str and bytes: use a pattern of
    # the same type as the directory path.
    if isinstance(dir_path, bytes):
        pattern = b'*.gbff*'
    else:
        pattern = '*.gbff*'

    read = 0
    # glob returns the names in arbitrary order; sorting them makes the
    # output order (hence the effect of skip/only) reproducible.
    for filename in sorted(glob.glob(os.path.join(dir_path, pattern))):
        if read == only:
            return
        f = uopen(filename)
        # Entries still allowed for this file.
        if only is not None:
            only_f = only - read
        else:
            only_f = None
        for seq in genbankIterator_file(f, skip=skip, only=only_f, buffersize=buffersize):
            yield seq
            read += 1
|
||||
|
||||
|
||||
def genbankIterator(obj,
                    int skip=0,
                    only=None,
                    firstline=None,
                    int buffersize=100000000
                    ):
    '''
    Entry point: iterate over GenBank entries from a directory or a file.

    @param obj: a directory path (str or bytes), handled by
                genbankIterator_dir(), or anything accepted by
                genbankIterator_file() (file path, LineBuffer, file-like
                object, line iterator).
    @param skip: forwarded to the selected iterator.
    @param only: forwarded to the selected iterator.
    @param firstline: forwarded to the selected iterator.
    @param buffersize: forwarded to the selected iterator.

    @return: the generator returned by the selected iterator.
    '''
    # Only a path to an existing directory goes to the directory iterator;
    # a path to a regular file is opened by genbankIterator_file() itself
    # instead of being globbed as a directory and yielding nothing.
    if isinstance(obj, (bytes, str)) and os.path.isdir(obj):
        return genbankIterator_dir(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize)

    return genbankIterator_file(obj, skip=skip, only=only, firstline=firstline, buffersize=buffersize)
|
||||
|
||||
|
Reference in New Issue
Block a user