obi import: fixed a bug when skipping an entry

Switch to version 3.0.0b21
Genbank parser: now reading ORIGIN lines with comments without triggering error
2020-05-29 21:19:42 +02:00 · 2020-05-28 20:42:09 +02:00 · 2020-05-28 20:41:34 +02:00 · 2020-05-28 20:40:36 +02:00 · 2020-05-20 15:59:04 +02:00 · 2020-05-20 11:46:29 +02:00
7 changed files with 163 additions and 137 deletions
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -73,7 +73,7 @@ def addOptions(parser):
                     action="store_true", dest="import:preread",
                     default=False,
                     help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
-                          "a much faster import.")
+                          "a much faster import. This option is not recommended and will slow down the import in any other case.")
 def run(config):
@ -260,7 +260,6 @@ def run(config):
        if entry is None:  # error or exception handled at lower level, not raised because Python generators can't resume after any exception is raised
            if config['obi']['skiperror']:
                i-=1
                continue
            else:
                raise RollbackException("obi import error, rollbacking view", view)
@ -270,6 +269,8 @@ def run(config):
        elif not i%50000:
            logger("info", "Imported %d entries", i)
        try:
            if NUC_SEQS_view: 
                id_col[i] = entry.id
                def_col[i] = entry.definition
@ -388,6 +389,13 @@ def run(config):
                                # Fill value
                                dcols[tag][0][i] = value
        except Exception as e:
            print("\nCould not import sequence id:", entry.id, "(error raised:", e, ")")
            if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
                raise e
            else:
                pass
        i+=1
    if pb is not None:
--- a/python/obitools3/dms/capi/obidmscolumn.pxd
+++ b/python/obitools3/dms/capi/obidmscolumn.pxd
@ -63,6 +63,8 @@ cdef extern from "obidmscolumn.h" nogil:
    char* obi_get_elements_names(OBIDMS_column_p column)
    char* obi_column_formatted_infos(OBIDMS_column_p column)
    index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name)
    int obi_column_write_comments(OBIDMS_column_p column, const char* comments)
--- a/python/obitools3/dms/column/column.pyx
+++ b/python/obitools3/dms/column/column.pyx
@ -14,6 +14,7 @@ from ..capi.obidms cimport obi_import_column
 from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \
                                 obi_close_column, \
                                 obi_get_elements_names, \
                                 obi_column_formatted_infos, \
                                 obi_column_write_comments
 from ..capi.obiutils cimport obi_format_date
@ -288,7 +289,13 @@ cdef class Column(OBIWrapper) :
    @OBIWrapper.checkIsActive
    def __repr__(self) :
        cdef bytes s
        #cdef char* s_b
        #cdef str s_str
        #s_b = obi_column_formatted_infos(self.pointer())
        #s_str = bytes2str(s_b)
        #free(s_b)
        s = self._alias + b", data type: " + self.data_type
        #return s_str
        return bytes2str(s)
--- a/python/obitools3/parsers/embl.pyx
+++ b/python/obitools3/parsers/embl.pyx
@ -177,7 +177,7 @@ def emblIterator_dir(dir_path,
    for filename in files:
        if read==only:
            return
-        print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
+        print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
        f = uopen(filename)
        if only is not None:
            only_f = only-read
--- a/python/obitools3/parsers/genbank.pyx
+++ b/python/obitools3/parsers/genbank.pyx
@ -25,8 +25,9 @@ from libc.string cimport strcpy, strlen
 _featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)
 _headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
-_seqMatcher    = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M)
+_seqMatcher    = re.compile(b'ORIGIN.+(?=//\n)', re.DOTALL + re.M)
-_cleanSeq      = re.compile(b'[ \n0-9]+')
+_cleanSeq1     = re.compile(b'ORIGIN.+\n')
 _cleanSeq2     = re.compile(b'[ \n0-9]+')
 _acMatcher     = re.compile(b'(?<=^ACCESSION   ).+',re.M)
 _deMatcher     = re.compile(b'(?<=^DEFINITION  ).+\n( .+\n)*',re.M)
 _cleanDe       = re.compile(b'\n *')
@ -42,7 +43,8 @@ def genbankParser(bytes text):
        ft     = _featureMatcher.search(text).group()
        s      = _seqMatcher.search(text).group()
-        s      = _cleanSeq.sub(b'', s).upper()
+        s      = _cleanSeq1.sub(b'', s)
        s      = _cleanSeq2.sub(b'', s)
        acs    = _acMatcher.search(text).group()
        acs    = acs.split()
@ -52,12 +54,6 @@ def genbankParser(bytes text):
        de     = _deMatcher.search(header).group()
        de     = _cleanDe.sub(b' ',de).strip().strip(b'.')
    except Exception as e:
        print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
        # Do not raise any Exception if you need the possibility to resume the generator
        # (Python generators can't resume after any exception is raised)
        return None
        tags = {}
        extractTaxon(ft, tags)
@ -68,6 +64,12 @@ def genbankParser(bytes text):
                      offset=-1,
                      tags=tags)
    except Exception as e:
        print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
        # Do not raise any Exception if you need the possibility to resume the generator
        # (Python generators can't resume after any exception is raised)
        return None
    return seq
@ -176,7 +178,7 @@ def genbankIterator_dir(dir_path,
    for filename in files:
        if read==only:
            return
-        print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
+        print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
        f = uopen(filename)
        if only is not None:
            only_f = only-read
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '0b19'
+serial= '0b22'
 version ="%d.%d.%s" % (major,minor,serial)
--- a/setup.py
+++ b/setup.py
@ -89,9 +89,10 @@ PACKAGE     = "OBITools3"
 VERSION     = version
 AUTHOR      = 'Celine Mercier'
 EMAIL       = 'celine.mercier@metabarcoding.org'
-URL         = "http://metabarcoding.org/obitools3"
+URL         = "https://metabarcoding.org/obitools3"
 PLATFORMS   = "posix"
 LICENSE     = "CeCILL-V2"
-DESCRIPTION = "Tools and library for DNA metabarcoding",
+DESCRIPTION = "A package for the management of analyses and data in DNA metabarcoding."
 PYTHONMIN   = '3.5'
 SRC       = 'python'
@ -148,12 +149,18 @@ classifiers=['Development Status :: 4 - Beta',
             'Topic :: Utilities',
             ]
 with open("README.md", "r") as fh:
    long_description = fh.read()
 setup(name=PACKAGE,
      description=DESCRIPTION,
      long_description=long_description,
      long_description_content_type="text/markdown",
      classifiers=classifiers,
      version=VERSION,
      author=AUTHOR,
      author_email=EMAIL,
      platforms=PLATFORMS,
      license=LICENSE,
      url=URL,
      ext_modules=xx,
Author	SHA1	Message	Date
Celine Mercier	2a2c233936	obi import: fixed a bug when skipping an entry	2020-05-29 21:19:42 +02:00
Celine Mercier	faf8ea9d86	Switch to version 3.0.0b21	2020-05-28 20:42:09 +02:00
Celine Mercier	ffe2485e94	Genbank parser: now reading ORIGIN lines with comments without triggering error	2020-05-28 20:41:34 +02:00
Celine Mercier	6094ce2bbc	obi import: skip on error more robust	2020-05-28 20:40:36 +02:00
Celine Mercier	a7dcf16c06	Minor changes for pip release	2020-05-20 15:59:04 +02:00
Celine Mercier	f13f8f6165	obi import: minor doc/display improvements	2020-05-20 11:46:29 +02:00