Switch to version 3.0.0b21

Genbank parser: ORIGIN lines that contain comments are now read
without triggering an error
2020-05-28 20:42:09 +02:00 · 2020-05-28 20:41:34 +02:00 · 2020-05-28 20:40:36 +02:00 · 2020-05-20 15:59:04 +02:00 · 2020-05-20 11:46:29 +02:00 · 2020-05-20 10:29:36 +02:00
10 changed files with 180 additions and 143 deletions
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -6,7 +6,7 @@ recursive-include doc/sphinx/source *.txt *.rst *.py
 recursive-include doc/sphinx/sphinxext *.py
 include doc/sphinx/Makefile
 include doc/sphinx/Doxyfile
-include README.txt
+include README.md
 include requirements.txt
 include scripts/obi
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -73,7 +73,7 @@ def addOptions(parser):
                     action="store_true", dest="import:preread",
                     default=False,
                     help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
-                          "a much faster import.")
+                          "a much faster import. This option is not recommended and will slow down the import in any other case.")
 def run(config):
@ -236,7 +236,7 @@ def run(config):
            dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
                              nb_elements_per_line=len(dict_dict[tag][0]), \
                              elements_names=list(dict_dict[tag][0])), \
-                          value_obitype)
+                          dict_dict[tag][1])
        # Reinitialize the input
@ -269,125 +269,134 @@ def run(config):
            pb(i)
        elif not i%50000:
            logger("info", "Imported %d entries", i)
-              
+        
-        if NUC_SEQS_view: 
+        try:
-            id_col[i] = entry.id
+             
-            def_col[i] = entry.definition
+            if NUC_SEQS_view: 
-            seq_col[i] = entry.seq
+                id_col[i] = entry.id
-            # Check if there is a sequencing quality associated by checking the first entry    # TODO haven't found a more robust solution yet
+                def_col[i] = entry.definition
-            if i == 0:
+                seq_col[i] = entry.seq
-                get_quality = QUALITY_COLUMN in entry
+                # Check if there is a sequencing quality associated by checking the first entry    # TODO haven't found a more robust solution yet
                if i == 0:
                    get_quality = QUALITY_COLUMN in entry
                    if get_quality:
                        Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
                        qual_col = view[QUALITY_COLUMN]
                if get_quality:
-                    Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)
+                    qual_col[i] = entry.quality
-                    qual_col = view[QUALITY_COLUMN]
+             
-            if get_quality:
+            for tag in entry :
-                qual_col[i] = entry.quality
+                
-         
+                if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN :  # TODO dirty 
-        for tag in entry :
+                                    
-            
+                    value = entry[tag]
-            if tag != ID_COLUMN and tag != DEFINITION_COLUMN and tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN :  # TODO dirty 
+                    if tag == b"taxid":
-                                
+                        tag = TAXID_COLUMN
-                value = entry[tag]
+                    if tag == b"count":
-                if tag == b"taxid":
+                        tag = COUNT_COLUMN
-                    tag = TAXID_COLUMN
+                    if tag[:7] == b"merged_":
-                if tag == b"count":
+                        tag = MERGED_PREFIX+tag[7:]
                    tag = COUNT_COLUMN
                if tag[:7] == b"merged_":
                    tag = MERGED_PREFIX+tag[7:]
                if tag not in dcols :
                    value_type = type(value)
                    nb_elts = 1
                    value_obitype = OBI_VOID
                    if value_type == dict or value_type == list :
                        nb_elts = len(value)
                        elt_names = list(value)
                    else :
                        nb_elts = 1
                        elt_names = None
                    value_obitype = get_obitype(value)
                    if value_obitype != OBI_VOID :
                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
                        # Fill value
                        if value_type == dict and nb_elts == 1:  # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
                            value = value[list(value.keys())[0]]       # The solution is to transform the value in a simple atomic one acceptable by the column
                        dcols[tag][0][i] = value
                    # TODO else log error?
                else :
                    rewrite = False
                    # Check type adequation
                    old_type = dcols[tag][1]
                    new_type = OBI_VOID
                    new_type = update_obitype(old_type, value)
                    if old_type != new_type :
                        rewrite = True
                    try:
                        # Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key                        
                        if type(value) == dict and \
                            dcols[tag][0].nb_elements_per_line == 1 \
                            and set(dcols[tag][0].elements_names) != set(value.keys()) :
                            raise IndexError  # trigger column rewrite
-                        # Fill value
+                    if tag not in dcols :
-                        dcols[tag][0][i] = value
+                         
                    except IndexError :
                        value_type = type(value)
-                        old_column = dcols[tag][0]
+                        nb_elts = 1
-                        old_nb_elements_per_line = old_column.nb_elements_per_line
+                        value_obitype = OBI_VOID
-                        new_nb_elements_per_line = 0
+                         
-                        old_elements_names = old_column.elements_names
+                        if value_type == dict or value_type == list :
-                        new_elements_names = None
+                            nb_elts = len(value)
                            elt_names = list(value)
                        else :
                            nb_elts = 1
                            elt_names = None
                        value_obitype = get_obitype(value)
                        if value_obitype != OBI_VOID :
                            dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
                            # Fill value
                            if value_type == dict and nb_elts == 1:  # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
                                value = value[list(value.keys())[0]]       # The solution is to transform the value in a simple atomic one acceptable by the column
                            dcols[tag][0][i] = value
                        # TODO else log error?
-                        #####################################################################
+                    else :
-                         
+             
-                        # Check the length and keys of column lines if needed
+                        rewrite = False
-                        if value_type == dict :    # Check dictionary keys
+     
-                            for k in value :
+                        # Check type adequation
-                                if k not in old_elements_names :
+                        old_type = dcols[tag][1]
-                                    new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
+                        new_type = OBI_VOID
-                                    rewrite = True
+                        new_type = update_obitype(old_type, value)
-                                    break
+                        if old_type != new_type :
-                         
+                            rewrite = True
-                        elif value_type == list or value_type == tuple :  # Check vector length
+     
-                            if old_nb_elements_per_line < len(value) :
+                        try:
-                                new_nb_elements_per_line = len(value)
+                            # Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key                        
-                                rewrite = True
+                            if type(value) == dict and \
-                         
+                                dcols[tag][0].nb_elements_per_line == 1 \
-                        #####################################################################
+                                and set(dcols[tag][0].elements_names) != set(value.keys()) :
-                         
+                                raise IndexError  # trigger column rewrite
-                        if rewrite :
+                            
                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
                                new_nb_elements_per_line = len(new_elements_names)
                            # Reset obierrno 
                            obi_errno = 0
                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, 
                                                                                   new_data_type=new_type, 
                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
                                                                                   new_elements_names=new_elements_names,
                                                                                   rewrite_last_line=False), 
                                          new_type)
                            # Update the dictionary:
                            for t in dcols :
                                dcols[t] = (view[t], dcols[t][1])
                            # Fill value
                            dcols[tag][0][i] = value
-                                    
+                         
                        except IndexError :
                            value_type = type(value)
                            old_column = dcols[tag][0]
                            old_nb_elements_per_line = old_column.nb_elements_per_line
                            new_nb_elements_per_line = 0
                            old_elements_names = old_column.elements_names
                            new_elements_names = None
                            #####################################################################
                            # Check the length and keys of column lines if needed
                            if value_type == dict :    # Check dictionary keys
                                for k in value :
                                    if k not in old_elements_names :
                                        new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
                                        rewrite = True
                                        break
                            elif value_type == list or value_type == tuple :  # Check vector length
                                if old_nb_elements_per_line < len(value) :
                                    new_nb_elements_per_line = len(value)
                                    rewrite = True
                            #####################################################################
                            if rewrite :
                                if new_nb_elements_per_line == 0 and new_elements_names is not None :
                                    new_nb_elements_per_line = len(new_elements_names)
                                # Reset obierrno 
                                obi_errno = 0
                                dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, 
                                                                                       new_data_type=new_type, 
                                                                                       new_nb_elements_per_line=new_nb_elements_per_line,
                                                                                       new_elements_names=new_elements_names,
                                                                                       rewrite_last_line=False), 
                                              new_type)
                                # Update the dictionary:
                                for t in dcols :
                                    dcols[t] = (view[t], dcols[t][1])
                                # Fill value
                                dcols[tag][0][i] = value
        except Exception as e:
            print("\nCould not import sequence id:", entry.id, "(error raised:", e, ")")
            if 'skiperror' in config['obi'] and not config['obi']['skiperror']:
                raise e
            else:
                pass
        i+=1
    if pb is not None:
--- a/python/obitools3/dms/capi/obidmscolumn.pxd
+++ b/python/obitools3/dms/capi/obidmscolumn.pxd
@ -63,6 +63,8 @@ cdef extern from "obidmscolumn.h" nogil:
    char* obi_get_elements_names(OBIDMS_column_p column)
    char* obi_column_formatted_infos(OBIDMS_column_p column)
    index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name)
    int obi_column_write_comments(OBIDMS_column_p column, const char* comments)
--- a/python/obitools3/dms/column/column.pyx
+++ b/python/obitools3/dms/column/column.pyx
@ -14,6 +14,7 @@ from ..capi.obidms cimport obi_import_column
 from ..capi.obidmscolumn cimport OBIDMS_column_header_p, \
                                 obi_close_column, \
                                 obi_get_elements_names, \
                                 obi_column_formatted_infos, \
                                 obi_column_write_comments
 from ..capi.obiutils cimport obi_format_date
@ -38,7 +39,7 @@ from obitools3.utils cimport tobytes, \
 from obitools3.dms.column import typed_column
-from libc.stdlib  cimport free
+from libc.stdlib cimport free
 import importlib
 import inspect
@ -288,9 +289,15 @@ cdef class Column(OBIWrapper) :
    # Short, human-readable description of the column: its alias followed by
    # its data type, returned as a str.
    # NOTE(review): the decorator presumably rejects calls made on a wrapper
    # that is no longer active -- confirm against OBIWrapper.checkIsActive.
    @OBIWrapper.checkIsActive
    def __repr__(self) :
        cdef bytes s
        # Disabled alternative (kept for reference): ask the C layer for a
        # formatted description through obi_column_formatted_infos(), convert
        # it to str, then free the C buffer that was returned.
        #cdef char* s_b
        #cdef str s_str
        #s_b = obi_column_formatted_infos(self.pointer())
        #s_str = bytes2str(s_b)
        #free(s_b)
        # Current behaviour: build the description on the Python side as bytes
        # (self._alias and self.data_type are concatenated with a bytes literal).
        s = self._alias + b", data type: " + self.data_type
        #return s_str
        # Convert the bytes description to str for the caller.
        return bytes2str(s)
-
+    
    def close(self):  # TODO discuss, can't be called bc then bug when closing view that tries to close it in C
--- a/python/obitools3/parsers/embl.pyx
+++ b/python/obitools3/parsers/embl.pyx
@ -177,7 +177,7 @@ def emblIterator_dir(dir_path,
    for filename in files:
        if read==only:
            return
-        print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
+        print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
        f = uopen(filename)
        if only is not None:
            only_f = only-read
--- a/python/obitools3/parsers/fasta.pyx
+++ b/python/obitools3/parsers/fasta.pyx
@ -104,6 +104,7 @@ def fastaNucIterator(lineiterator,
    cdef bytes      sequence
    cdef int        skipped, ionly, read
    cdef Nuc_Seq    seq
    cdef bint       stop
    if only is None:
        ionly = -1
@ -130,7 +131,8 @@ def fastaNucIterator(lineiterator,
    else:
        line = firstline       
-    while True:
+    stop=False
    while not stop:
        if ionly >= 0 and read >= ionly:
            break
@ -153,7 +155,7 @@ def fastaNucIterator(lineiterator,
                s.append(line[0:-1])
                line = next(iterator)
        except StopIteration:
-            pass
+            stop=True
        sequence  = b"".join(s)        
--- a/python/obitools3/parsers/genbank.pyx
+++ b/python/obitools3/parsers/genbank.pyx
@ -25,8 +25,9 @@ from libc.string cimport strcpy, strlen
 _featureMatcher = re.compile(b'^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)
 _headerMatcher = re.compile(b'^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
-_seqMatcher    = re.compile(b'(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M)
+_seqMatcher    = re.compile(b'ORIGIN.+(?=//\n)', re.DOTALL + re.M)
-_cleanSeq      = re.compile(b'[ \n0-9]+')
+_cleanSeq1     = re.compile(b'ORIGIN.+\n')
 _cleanSeq2     = re.compile(b'[ \n0-9]+')
 _acMatcher     = re.compile(b'(?<=^ACCESSION   ).+',re.M)
 _deMatcher     = re.compile(b'(?<=^DEFINITION  ).+\n( .+\n)*',re.M)
 _cleanDe       = re.compile(b'\n *')
@ -42,7 +43,8 @@ def genbankParser(bytes text):
        ft     = _featureMatcher.search(text).group()
        s      = _seqMatcher.search(text).group()
-        s      = _cleanSeq.sub(b'', s).upper()
+        s      = _cleanSeq1.sub(b'', s)
        s      = _cleanSeq2.sub(b'', s)
        acs    = _acMatcher.search(text).group()
        acs    = acs.split()
@ -51,23 +53,23 @@ def genbankParser(bytes text):
        de     = _deMatcher.search(header).group()
        de     = _cleanDe.sub(b' ',de).strip().strip(b'.')
-
+    
        tags = {}
        extractTaxon(ft, tags)
        seq = Nuc_Seq(ac,
                      s,
                      definition=de,
                      quality=None,
                      offset=-1,
                      tags=tags)
    except Exception as e:
        print("\nCould not import sequence id:", text.split()[1], "(error raised:", e, ")")
        # Do not raise any Exception if you need the possibility to resume the generator
        # (Python generators can't resume after any exception is raised)
        return None
-    
+
    tags = {}
    extractTaxon(ft, tags)
    seq = Nuc_Seq(ac,
                  s,
                  definition=de,
                  quality=None,
                  offset=-1,
                  tags=tags)
    return seq
@ -171,10 +173,12 @@ def genbankIterator_dir(dir_path,
    read = 0
    read_files = 0
    files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))]
    files.extend([filename for filename in glob.glob(os.path.join(path, b'*.seq*'))])  # new genbank extension
    files = list(set(files))
    for filename in files:
        if read==only:
            return
-        print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
+        print("Parsing file %s (%d/%d)" % (tostr(filename), read_files+1, len(files)))
        f = uopen(filename)
        if only is not None:
            only_f = only-read
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '0-beta16'
+serial= '0b21'
-version ="%d.%02d.%s" % (major,minor,serial)
+version ="%d.%d.%s" % (major,minor,serial)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,5 @@
 --extra-index-url https://pypi.python.org/simple/
 Cython>=0.24
 Sphinx>=1.2.0
 ipython>=3.0.0
 breathe>=4.0.0
--- a/setup.py
+++ b/setup.py
@ -5,8 +5,9 @@ import re
 import subprocess
 from distutils import log
-from distutils.core import setup
+#from distutils.core import setup
-    
+from setuptools import setup    # to work with pip
 from distutils.core import Extension
 from distutils.sysconfig import get_python_lib
@ -88,9 +89,10 @@ PACKAGE     = "OBITools3"
 VERSION     = version
 AUTHOR      = 'Celine Mercier'
 EMAIL       = 'celine.mercier@metabarcoding.org'
-URL         = "http://metabarcoding.org/obitools3"
+URL         = "https://metabarcoding.org/obitools3"
 PLATFORMS   = "posix"
 LICENSE     = "CeCILL-V2"
-DESCRIPTION = "Tools and library for DNA metabarcoding",
+DESCRIPTION = "A package for the management of analyses and data in DNA metabarcoding."
 PYTHONMIN   = '3.5'
 SRC       = 'python'
@ -147,12 +149,18 @@ classifiers=['Development Status :: 4 - Beta',
             'Topic :: Utilities',
             ]
 with open("README.md", "r") as fh:
    long_description = fh.read()
 setup(name=PACKAGE,
      description=DESCRIPTION,
      long_description=long_description,
      long_description_content_type="text/markdown",
      classifiers=classifiers,
      version=VERSION,
      author=AUTHOR,
      author_email=EMAIL,
      platforms=PLATFORMS,
      license=LICENSE,
      url=URL,
      ext_modules=xx,
Author	SHA1	Message	Date
Celine Mercier	faf8ea9d86	Switch to version 3.0.0b21	2020-05-28 20:42:09 +02:00
Celine Mercier	ffe2485e94	Genbank parser: now reading ORIGIN lines with comments without triggering error	2020-05-28 20:41:34 +02:00
Celine Mercier	6094ce2bbc	obi import: skip on error more robust	2020-05-28 20:40:36 +02:00
Celine Mercier	a7dcf16c06	Minor changes for pip release	2020-05-20 15:59:04 +02:00
Celine Mercier	f13f8f6165	obi import: minor doc/display improvements	2020-05-20 11:46:29 +02:00
Celine Mercier	b5a29ac413	Switch to version 3.0.0b19	2020-05-20 10:29:36 +02:00
Celine Mercier	efd2b9d338	Cleaner installation	2020-05-20 10:29:12 +02:00
Celine Mercier	ca6e3e7aad	obi import: fixed to work with `seq` genbank extension	2020-05-20 10:28:14 +02:00
Celine Mercier	76ed8e18e5	Switch to version 3.0.0b18 with version formatting that fits setuptools	2020-05-18 17:08:55 +02:00
Celine Mercier	1d17f28aec	setup: now using setuptools instead of distutils to work with pip	2020-05-18 17:08:09 +02:00
Celine Mercier	fa834e4b8b	obi import: small bug fix	2020-05-18 17:06:58 +02:00
Celine Mercier	a72fea3cc9	Python: fasta parser: fixed a bug stopping the program when the last line contained a single nucleotide	2020-05-12 11:24:12 +02:00