Compare commits

..

12 Commits

10 changed files with 180 additions and 143 deletions

View File

@ -6,7 +6,7 @@ recursive-include doc/sphinx/source *.txt *.rst *.py
recursive-include doc/sphinx/sphinxext *.py recursive-include doc/sphinx/sphinxext *.py
include doc/sphinx/Makefile include doc/sphinx/Makefile
include doc/sphinx/Doxyfile include doc/sphinx/Doxyfile
include README.txt include README.md
include requirements.txt include requirements.txt
include scripts/obi include scripts/obi

View File

@ -73,7 +73,7 @@ def addOptions(parser):
action="store_true", dest="import:preread", action="store_true", dest="import:preread",
default=False, default=False,
help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for " help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
"a much faster import.") "a much faster import. This option is not recommended and will slow down the import in any other case.")
def run(config): def run(config):
@ -236,7 +236,7 @@ def run(config):
dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \ dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
nb_elements_per_line=len(dict_dict[tag][0]), \ nb_elements_per_line=len(dict_dict[tag][0]), \
elements_names=list(dict_dict[tag][0])), \ elements_names=list(dict_dict[tag][0])), \
value_obitype) dict_dict[tag][1])
# Reinitialize the input # Reinitialize the input
@ -269,125 +269,134 @@ def run(config):
pb(i) pb(i)
elif not i%50000: elif not i%50000:
logger("info", "Imported %d entries", i) logger("info", "Imported %d entries", i)
if NUC_SEQS_view: try:
id_col[i] = entry.id
def_col[i] = entry.definition if NUC_SEQS_view:
seq_col[i] = entry.seq id_col[i] = entry.id
# Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet def_col[i] = entry.definition
if i == 0: seq_col[i] = entry.seq
get_quality = QUALITY_COLUMN in entry # Check if there is a sequencing quality associated by checking the first entry # TODO haven't found a more robust solution yet
if i == 0:
get_quality = QUALITY_COLUMN in entry
if get_quality:
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
qual_col = view[QUALITY_COLUMN]
if get_quality: if get_quality:
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL) qual_col[i] = entry.quality
qual_col = view[QUALITY_COLUMN]
if get_quality: for tag in entry :
qual_col[i] = entry.quality
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty
for tag in entry :
value = entry[tag]
if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN : # TODO dirty if tag == b"taxid":
tag = TAXID_COLUMN
value = entry[tag] if tag == b"count":
if tag == b"taxid": tag = COUNT_COLUMN
tag = TAXID_COLUMN if tag[:7] == b"merged_":
if tag == b"count": tag = MERGED_PREFIX+tag[7:]
tag = COUNT_COLUMN
if tag[:7] == b"merged_":
tag = MERGED_PREFIX+tag[7:]
if tag not in dcols :
value_type = type(value)
nb_elts = 1
value_obitype = OBI_VOID
if value_type == dict or value_type == list :
nb_elts = len(value)
elt_names = list(value)
else :
nb_elts = 1
elt_names = None
value_obitype = get_obitype(value)
if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
# Fill value
if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column
dcols[tag][0][i] = value
# TODO else log error?
else :
rewrite = False
# Check type adequation
old_type = dcols[tag][1]
new_type = OBI_VOID
new_type = update_obitype(old_type, value)
if old_type != new_type :
rewrite = True
try:
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
if type(value) == dict and \
dcols[tag][0].nb_elements_per_line == 1 \
and set(dcols[tag][0].elements_names) != set(value.keys()) :
raise IndexError # trigger column rewrite
# Fill value if tag not in dcols :
dcols[tag][0][i] = value
except IndexError :
value_type = type(value) value_type = type(value)
old_column = dcols[tag][0] nb_elts = 1
old_nb_elements_per_line = old_column.nb_elements_per_line value_obitype = OBI_VOID
new_nb_elements_per_line = 0
old_elements_names = old_column.elements_names if value_type == dict or value_type == list :
new_elements_names = None nb_elts = len(value)
elt_names = list(value)
else :
nb_elts = 1
elt_names = None
value_obitype = get_obitype(value)
if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
# Fill value
if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column
dcols[tag][0][i] = value
# TODO else log error?
##################################################################### else :
# Check the length and keys of column lines if needed rewrite = False
if value_type == dict : # Check dictionary keys
for k in value : # Check type adequation
if k not in old_elements_names : old_type = dcols[tag][1]
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value])) new_type = OBI_VOID
rewrite = True new_type = update_obitype(old_type, value)
break if old_type != new_type :
rewrite = True
elif value_type == list or value_type == tuple : # Check vector length
if old_nb_elements_per_line < len(value) : try:
new_nb_elements_per_line = len(value) # Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
rewrite = True if type(value) == dict and \
dcols[tag][0].nb_elements_per_line == 1 \
##################################################################### and set(dcols[tag][0].elements_names) != set(value.keys()) :
raise IndexError # trigger column rewrite
if rewrite :
if new_nb_elements_per_line == 0 and new_elements_names is not None :
new_nb_elements_per_line = len(new_elements_names)
# Reset obierrno
obi_errno = 0
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
new_data_type=new_type,
new_nb_elements_per_line=new_nb_elements_per_line,
new_elements_names=new_elements_names,
rewrite_last_line=False),
new_type)
# Update the dictionary:
for t in dcols :
dcols[t] = (view[t], dcols[t][1])
# Fill value # Fill value
dcols[tag][0][i] = value dcols[tag][0][i] = value
except IndexError :
value_type = type(value)
old_column = dcols[tag][0]
old_nb_elements_per_line = old_column.nb_elements_per_line
new_nb_elements_per_line = 0
old_elements_names = old_column.elements_names
new_elements_names = None
#####################################################################
# Check the length and keys of column lines if needed
if value_type == dict : # Check dictionary keys
for k in value :
if k not in old_elements_names :
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
rewrite = True
break
elif value_type == list or value_type == tuple : # Check vector length
if old_nb_elements_per_line < len(value) :
new_nb_elements_per_line = len(value)
rewrite = True
#####################################################################
if rewrite :
if new_nb_elements_per_line == 0 and new_elements_names is not None :
new_nb_elements_per_line = len(new_elements_names)
# Reset obierrno
obi_errno = 0
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
new_data_type=new_type,
new_nb_elements_per_line=new_nb_elements_per_line,
new_elements_names=new_elements_names,
rewrite_last_line=False),
new_type)
# Update the dictionary:
for t in dcols :
dcols[t] = (view[t], dcols[t][1])
# Fill value
dcols[tag][0][i] = value
except Exception as e:
print("\nCould not import sequence id:", entry.id, "(error raised:", e, ")")
if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
raise e
else:
pass
i+=1 i+=1
if pb is not None: if pb is not None:

View File

@ -63,6 +63,8 @@ cdef extern from "obidmscolumn.h" nogil:
char* obi_get_elements_names(OBIDMS_column_p column) char* obi_get_elements_names(OBIDMS_column_p column)
char* obi_column_formatted_infos(OBIDMS_column_p column)
index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name) index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name)
int obi_column_write_comments(OBIDMS_column_p column, const char* comments) int obi_column_write_comments(OBIDMS_column_p column, const char* comments)

View File

@ -14,6 +14,7 @@ from ..capi.obidms cimport obi_import_column
from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \ from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \
obi_close_column, \ obi_close_column, \
obi_get_elements_names, \ obi_get_elements_names, \
obi_column_formatted_infos, \
obi_column_write_comments obi_column_write_comments
from ..capi.obiutils cimport obi_format_date from ..capi.obiutils cimport obi_format_date
@ -38,7 +39,7 @@ from obitools3.utils cimport tobytes, \
from obitools3.dms.column import typed_column from obitools3.dms.column import typed_column
from libc.stdlib cimport free from libc.stdlib cimport free
import importlib import importlib
import inspect import inspect
@ -288,9 +289,15 @@ cdef class Column(OBIWrapper) :
@OBIWrapper.checkIsActive @OBIWrapper.checkIsActive
def __repr__(self) : def __repr__(self) :
cdef bytes s cdef bytes s
#cdef char* s_b
#cdef str s_str
#s_b = obi_column_formatted_infos(self.pointer())
#s_str = bytes2str(s_b)
#free(s_b)
s = self._alias + b", data type: " + self.data_type s = self._alias + b", data type: " + self.data_type
#return s_str
return bytes2str(s) return bytes2str(s)
def close(self): # TODO discuss, can't be called bc then bug when closing view that tries to close it in C def close(self): # TODO discuss, can't be called bc then bug when closing view that tries to close it in C

View File

@ -177,7 +177,7 @@ def emblIterator_dir(dir_path,
for filename in files: for filename in files:
if read==only: if read==only:
return return
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files))) print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
f = uopen(filename) f = uopen(filename)
if only is not None: if only is not None:
only_f = only-read only_f = only-read

View File

@ -104,6 +104,7 @@ def fastaNucIterator(lineiterator,
cdef bytes sequence cdef bytes sequence
cdef int skipped, ionly, read cdef int skipped, ionly, read
cdef Nuc_Seq seq cdef Nuc_Seq seq
cdef bint stop
if only is None: if only is None:
ionly = -1 ionly = -1
@ -130,7 +131,8 @@ def fastaNucIterator(lineiterator,
else: else:
line = firstline line = firstline
while True: stop=False
while not stop:
if ionly >= 0 and read >= ionly: if ionly >= 0 and read >= ionly:
break break
@ -153,7 +155,7 @@ def fastaNucIterator(lineiterator,
s.append(line[0:-1]) s.append(line[0:-1])
line = next(iterator) line = next(iterator)
except StopIteration: except StopIteration:
pass stop=True
sequence = b"".join(s) sequence = b"".join(s)

View File

@ -25,8 +25,9 @@ from libc.string cimport strcpy, strlen
_featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M) _featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)
_headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M) _headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
_seqMatcher = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M) _seqMatcher = re.compile(b'ORIGIN.+(?=//\n)', re.DOTALL + re.M)
_cleanSeq = re.compile(b'[ \n0-9]+') _cleanSeq1 = re.compile(b'ORIGIN.+\n')
_cleanSeq2 = re.compile(b'[ \n0-9]+')
_acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M) _acMatcher = re.compile(b'(?<=^ACCESSION ).+',re.M)
_deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M) _deMatcher = re.compile(b'(?<=^DEFINITION ).+\n( .+\n)*',re.M)
_cleanDe = re.compile(b'\n *') _cleanDe = re.compile(b'\n *')
@ -42,7 +43,8 @@ def genbankParser(bytes text):
ft = _featureMatcher.search(text).group() ft = _featureMatcher.search(text).group()
s = _seqMatcher.search(text).group() s = _seqMatcher.search(text).group()
s = _cleanSeq.sub(b'', s).upper() s = _cleanSeq1.sub(b'', s)
s = _cleanSeq2.sub(b'', s)
acs = _acMatcher.search(text).group() acs = _acMatcher.search(text).group()
acs = acs.split() acs = acs.split()
@ -51,23 +53,23 @@ def genbankParser(bytes text):
de = _deMatcher.search(header).group() de = _deMatcher.search(header).group()
de = _cleanDe.sub(b' ',de).strip().strip(b'.') de = _cleanDe.sub(b' ',de).strip().strip(b'.')
tags = {}
extractTaxon(ft, tags)
seq = Nuc_Seq(ac,
s,
definition=de,
quality=None,
offset=-1,
tags=tags)
except Exception as e: except Exception as e:
print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")") print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
# Do not raise any Exception if you need the possibility to resume the generator # Do not raise any Exception if you need the possibility to resume the generator
# (Python generators can't resume after any exception is raised) # (Python generators can't resume after any exception is raised)
return None return None
tags = {}
extractTaxon(ft, tags)
seq = Nuc_Seq(ac,
s,
definition=de,
quality=None,
offset=-1,
tags=tags)
return seq return seq
@ -171,10 +173,12 @@ def genbankIterator_dir(dir_path,
read = 0 read = 0
read_files = 0 read_files = 0
files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))] files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))]
files.extend([filename for filename in glob.glob(os.path.join(path, b'*.seq*'))]) # new genbank extension
files = list(set(files))
for filename in files: for filename in files:
if read==only: if read==only:
return return
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files))) print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
f = uopen(filename) f = uopen(filename)
if only is not None: if only is not None:
only_f = only-read only_f = only-read

View File

@ -1,5 +1,5 @@
major = 3 major = 3
minor = 0 minor = 0
serial= '0-beta16' serial= '0b21'
version ="%d.%02d.%s" % (major,minor,serial) version ="%d.%d.%s" % (major,minor,serial)

5
requirements.txt Executable file
View File

@ -0,0 +1,5 @@
--extra-index-url https://pypi.python.org/simple/
Cython>=0.24
Sphinx>=1.2.0
ipython>=3.0.0
breathe>=4.0.0

View File

@ -5,8 +5,9 @@ import re
import subprocess import subprocess
from distutils import log from distutils import log
from distutils.core import setup #from distutils.core import setup
from setuptools import setup # to work with pip
from distutils.core import Extension from distutils.core import Extension
from distutils.sysconfig import get_python_lib from distutils.sysconfig import get_python_lib
@ -88,9 +89,10 @@ PACKAGE = "OBITools3"
VERSION = version VERSION = version
AUTHOR = 'Celine Mercier' AUTHOR = 'Celine Mercier'
EMAIL = 'celine.mercier@metabarcoding.org' EMAIL = 'celine.mercier@metabarcoding.org'
URL = "http://metabarcoding.org/obitools3" URL = "https://metabarcoding.org/obitools3"
PLATFORMS = "posix"
LICENSE = "CeCILL-V2" LICENSE = "CeCILL-V2"
DESCRIPTION = "Tools and library for DNA metabarcoding", DESCRIPTION = "A package for the management of analyses and data in DNA metabarcoding."
PYTHONMIN = '3.5' PYTHONMIN = '3.5'
SRC = 'python' SRC = 'python'
@ -147,12 +149,18 @@ classifiers=['Development Status :: 4 - Beta',
'Topic :: Utilities', 'Topic :: Utilities',
] ]
with open("README.md", "r") as fh:
long_description = fh.read()
setup(name=PACKAGE, setup(name=PACKAGE,
description=DESCRIPTION, description=DESCRIPTION,
long_description=long_description,
long_description_content_type="text/markdown",
classifiers=classifiers, classifiers=classifiers,
version=VERSION, version=VERSION,
author=AUTHOR, author=AUTHOR,
author_email=EMAIL, author_email=EMAIL,
platforms=PLATFORMS,
license=LICENSE, license=LICENSE,
url=URL, url=URL,
ext_modules=xx, ext_modules=xx,