switch to version 3.0.0-beta10

obi import: new option --preread that does a first read-through of the
dataset to prepare the dictionary columns in advance, which makes the import much faster when the dataset contains huge dictionaries.
2020-02-02 21:15:27 +01:00 · 2020-02-02 21:12:34 +01:00 · 2020-02-02 21:11:05 +01:00 · 2020-02-01 15:48:55 +01:00 · 2020-02-01 15:31:14 +01:00 · 2020-01-29 20:23:39 +01:00
21 changed files with 320 additions and 70 deletions
--- a/python/obitools3/apps/optiongroups/init.py
+++ b/python/obitools3/apps/optiongroups/init.py
@ -222,7 +222,7 @@ def __addDMSOutputOption(optionManager):
    group.add_argument('--no-create-dms',
                 action="store_true", dest="obi:nocreatedms",
                 default=False,
-                 help="Don't create an output DMS it does not already exist")
+                 help="Don't create an output DMS if it does not already exist")


 def __addEltLimitOption(optionManager):
--- a/python/obitools3/commands/alignpairedend.pyx
+++ b/python/obitools3/commands/alignpairedend.pyx
@ -14,7 +14,7 @@ from obitools3.libalign._qsrassemble import QSolexaRightReverseAssemble
 from obitools3.libalign._solexapairend import buildConsensus, buildJoinedSequence
 from obitools3.dms.obiseq cimport Nuc_Seq
 from obitools3.libalign.shifted_ali cimport Kmer_similarity, Ali_shifted
-from obitools3.commands.ngsfilter import REVERSE_SEQ_COLUMN_NAME, REVERSE_QUALITY_COLUMN_NAME
+from obitools3.dms.capi.obiview cimport REVERSE_SEQUENCE_COLUMN, REVERSE_QUALITY_COLUMN

 import sys
 import os
@ -102,7 +102,7 @@ def alignmentIterator(entries, aligner):
            seqR = reverse[i]
        else:
            seqF = Nuc_Seq.new_from_stored(entries[i])
-            seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME])
+            seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQUENCE_COLUMN], quality=seqF[REVERSE_QUALITY_COLUMN])
            seqR.index = i
        
        ali = aligner(seqF, seqR)
@ -196,8 +196,8 @@ def run(config):
                                  reversed_column=None)
    else:
        aligner = Kmer_similarity(entries, \
-                                  column2=entries[REVERSE_SEQ_COLUMN_NAME], \
-                                  qual_column2=entries[REVERSE_QUALITY_COLUMN_NAME], \
+                                  column2=entries[REVERSE_SEQUENCE_COLUMN], \
+                                  qual_column2=entries[REVERSE_QUALITY_COLUMN], \
                                  kmer_size=config['alignpairedend']['kmersize'], \
                                  reversed_column=entries[b'reversed'])  # column created by the ngsfilter tool
        
@ -221,7 +221,7 @@ def run(config):
            buildConsensus(ali, consensus, seqF)
        else:
            if not two_views:
-                seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality = seqF[REVERSE_QUALITY_COLUMN_NAME])
+                seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQUENCE_COLUMN], quality = seqF[REVERSE_QUALITY_COLUMN])
            else:
                seqR = reverse[i]
            buildJoinedSequence(ali, seqR, consensus, forward=seqF)
--- a/python/obitools3/commands/cat.pyx
+++ b/python/obitools3/commands/cat.pyx
@ -0,0 +1,122 @@
+#cython: language_level=3
+
+from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.dms import DMS
+from obitools3.dms.view.view cimport View
+from obitools3.uri.decode import open_uri
+from obitools3.apps.optiongroups import addMinimalOutputOption
+from obitools3.dms.view import RollbackException
+from obitools3.apps.config import logger
+from obitools3.utils cimport str2bytes
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
+from obitools3.dms.view.view cimport View
+from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, REVERSE_SEQUENCE_COLUMN, \
+                                        QUALITY_COLUMN, REVERSE_QUALITY_COLUMN
+from obitools3.dms.capi.obitypes cimport OBI_SEQ, OBI_QUAL
+from obitools3.dms.column.column cimport Column
+
+import time
+import sys
+ 
+from cpython.exc cimport PyErr_CheckSignals
+
+
+__title__="Concatenate views."
+
+ 
+def addOptions(parser):
+    # Register the command-line options of the "obi cat" command on `parser`.
+    # NOTE(review): `parser` is presumably an argparse-style parser; it is only
+    # used through add_argument_group()/add_argument() here - confirm with caller.
+    
+    # Output options provided by obitools3.apps.optiongroups.
+    addMinimalOutputOption(parser)
+
+    group=parser.add_argument_group('obi cat specific options')
+
+    # -c can be given several times: action="append" accumulates every value,
+    # in command-line order, under the "cat:views_to_cat" destination.
+    # NOTE(review): run() reads the list back as config["cat"]["views_to_cat"];
+    # the "cat:..." -> config["cat"][...] mapping is not visible here - confirm.
+    group.add_argument("-c",
+                       action="append", dest="cat:views_to_cat",
+                       metavar="<VIEW_NAME>",
+                       default=[],
+                       type=str,
+                       help="URI of a view to concatenate. (e.g. 'my_dms/my_view'). "
+                            "Several -c options can be used on the same "
+                            "command line.")
+
+     
+def run(config):
+    # Entry point of "obi cat": copy the lines of every view listed in
+    # config["cat"]["views_to_cat"], in order, into one new output view opened
+    # from config['obi']['outputURI'].
+    # Raises Exception if an input view or the output cannot be opened.
+     
+    DMS.obi_atexit()
+    
+    logger("info", "obi cat")
+
+    # Open the views to concatenate
+    iview_list = []          # opened input views, in command-line order
+    idms_list = []           # DMS of each input view (same order), closed at the end
+    total_len = 0            # total number of lines to copy, used to size the progress bar
+    remove_qual = False      # becomes True as soon as one input view lacks QUALITY_COLUMN
+    remove_rev_qual = False  # becomes True as soon as one input view lacks REVERSE_QUALITY_COLUMN
+    v_type = View_NUC_SEQS   # downgraded to View if any input is not a View_NUC_SEQS
+    for v_uri in config["cat"]["views_to_cat"]:
+        # NOTE(review): `input` shadows the Python builtin of the same name.
+        input = open_uri(v_uri)
+        if input is None:
+            raise Exception("Could not read input view")
+        # open_uri() result is used as: [0] the DMS, [1] the view, [2] the view type
+        i_dms = input[0]
+        i_view = input[1]
+        if input[2] != View_NUC_SEQS:  # Check view type (the output view is a nuc_seqs view only if all input views are nuc_seqs views)
+            v_type = View
+        if QUALITY_COLUMN not in i_view: # Check whether to keep the quality column in the output view (only if all input views have it)
+            remove_qual = True
+        if REVERSE_QUALITY_COLUMN not in i_view: # same as above for reverse quality
+            remove_rev_qual = True
+        total_len += len(i_view)
+        iview_list.append(i_view)
+        idms_list.append(i_dms)
+
+    # Open the output; output[0] is used as the output DMS and output[1] as the output view
+    output = open_uri(config['obi']['outputURI'],
+                      input=False, 
+                      newviewtype=v_type)
+    if output is None:
+        raise Exception("Could not create output view")
+    o_dms = output[0]
+    o_view = output[1]
+    
+    # Initialize quality columns and their associated sequence columns if needed
+    # (done before copying any line; each quality column is explicitly tied to
+    # the name and version of its sequence column).
+    # NOTE(review): with no -c option both flags stay False, so these columns
+    # are created in an otherwise empty output view - confirm this is intended.
+    if not remove_qual:
+        if NUC_SEQUENCE_COLUMN not in o_view:
+            Column.new_column(o_view, NUC_SEQUENCE_COLUMN, OBI_SEQ)
+        Column.new_column(o_view, QUALITY_COLUMN, OBI_QUAL, associated_column_name=NUC_SEQUENCE_COLUMN, associated_column_version=o_view[NUC_SEQUENCE_COLUMN].version)    
+    if not remove_rev_qual:
+        Column.new_column(o_view, REVERSE_SEQUENCE_COLUMN, OBI_SEQ)
+        Column.new_column(o_view, REVERSE_QUALITY_COLUMN, OBI_QUAL, associated_column_name=REVERSE_SEQUENCE_COLUMN, associated_column_version=o_view[REVERSE_SEQUENCE_COLUMN].version)
+
+    # Initialize the progress bar
+    pb = ProgressBar(total_len, config, seconde=5)
+    
+    # Copy every line of every input view; i is the running line index in the output view
+    i = 0
+    for v in iview_list:
+        for l in v:
+            PyErr_CheckSignals()  # checks for pending signals between lines
+            pb(i)
+            o_view[i] = l
+            i+=1
+
+    # Delete the quality columns if at least one input view lacked them
+    # (they were not created above, but may be present in the output view after the copy)
+    if QUALITY_COLUMN in o_view and remove_qual :
+        o_view.delete_column(QUALITY_COLUMN)
+    if REVERSE_QUALITY_COLUMN in o_view and remove_rev_qual :
+        o_view.delete_column(REVERSE_QUALITY_COLUMN)
+
+    pb(i, force=True)
+    print("", file=sys.stderr)
+    
+    # Save command config in DMS comments
+    command_line = " ".join(sys.argv[1:])
+    o_view.write_config(config, "cat", command_line, input_dms_name=[d.name for d in idms_list], input_view_name=[v.name for v in iview_list])
+    o_dms.record_command_line(command_line)
+
+    # NOTE(review): the disabled debug output below refers to `view`, which is not defined in this function.
+    #print("\n\nOutput view:\n````````````", file=sys.stderr)
+    #print(repr(view), file=sys.stderr)
+
+    # Close every input DMS, then the output DMS.
+    # NOTE(review): these closes are skipped if an exception is raised earlier in this function.
+    for d in idms_list:
+        d.close()
+    o_dms.close()
+    
+    logger("info", "Done.")
--- a/python/obitools3/commands/grep.pyx
+++ b/python/obitools3/commands/grep.pyx
@ -36,14 +36,13 @@ def addOptions(parser):
                       metavar="<PREDICATE>",
                       default=[],
                       type=str,
-                       help="Warning: use bytes for character strings (b'text' instead of 'text'). "
-                            "Python boolean expression to be evaluated in the "
+                       help="Python boolean expression to be evaluated in the "
                            "sequence/line context. The attribute name can be "
                            "used in the expression as a variable name. "
                            "An extra variable named 'sequence' or 'line' refers "
                            "to the sequence or line object itself. "
                            "Several -p options can be used on the same "
-                            "commande line.")
+                            "command line.")
 
    group.add_argument("-S", "--sequence",
                       action="store", dest="grep:seq_pattern",
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -11,6 +11,7 @@ from obitools3.dms.column.column cimport Column
 from obitools3.dms.obiseq cimport Nuc_Seq
 from obitools3.dms import DMS
 from obitools3.dms.taxo.taxo cimport Taxonomy
+from obitools3.files.uncompress cimport CompressedFile


 from obitools3.utils cimport tobytes, \
@ -65,6 +66,14 @@ def addOptions(parser):
    addTaxdumpInputOption(parser)
    addMinimalOutputOption(parser)

+    group = parser.add_argument_group('obi import specific options')
+
+    group.add_argument('--preread',
+                     action="store_true", dest="import:preread",
+                     default=False,
+                     help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
+                          "a much faster import.")
+

 def run(config):
    
@ -169,8 +178,6 @@ def run(config):

    if entry_count >= 0:
        pb = ProgressBar(entry_count, config, seconde=5)
-    
-    entries = input[1]
        
    NUC_SEQS_view = False
    if isinstance(output[1], View) :
@ -188,6 +195,60 @@ def run(config):
        
    dcols = {}
        
+    # First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
+    if config['import']['preread']:
+        logger("info", "First readthrough...")
+        entries = input[1]
+        i = 0
+        dict_dict = {}
+        for entry in entries:
+            PyErr_CheckSignals()
+        
+            if entry is None:  # error or exception handled at lower level, not raised because Python generators can't resume after any exception is raised
+                if config['obi']['skiperror']:
+                    i-=1
+                    continue
+                else:
+                    raise Exception("obi import error in first readthrough")
+            
+            if pb is not None:
+                pb(i)
+            elif not i%50000:
+                logger("info", "Read %d entries", i)
+    
+            for tag in entry :
+                if type(entry[tag]) == dict :
+                    if tag in dict_dict:
+                        dict_dict[tag][0].update(entry[tag].keys())
+                    else:
+                        dict_dict[tag] = [set(entry[tag].keys()), get_obitype(entry[tag])]
+            i+=1
+        
+        if pb is not None:
+            pb(i, force=True)
+            print("", file=sys.stderr)
+       
+        for tag in dict_dict:
+            dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
+                              nb_elements_per_line=len(dict_dict[tag][0]), \
+                              elements_names=list(dict_dict[tag][0])), \
+                          value_obitype)
+    
+        
+        # Reinitialize the input
+        if isinstance(input[0], CompressedFile):
+            input_is_file = True
+        if entry_count >= 0:
+            pb = ProgressBar(entry_count, config, seconde=5)
+        try:
+            input[0].close()
+        except AttributeError:
+            pass
+        input = open_uri(config['obi']['inputURI'], force_file=input_is_file)
+        if input is None:
+            raise Exception("Could not open input URI")
+    
+    entries = input[1]
    i = 0
    for entry in entries :
        
@ -247,6 +308,8 @@ def run(config):
                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
                                                 
                        # Fill value
+                        if value_type == dict and nb_elts == 1:  # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
+                            value = value[list(value.keys())[0]]       # The solution is to transform the value in a simple atomic one acceptable by the column
                        dcols[tag][0][i] = value
                     
                    # TODO else log error?
@ -263,6 +326,12 @@ def run(config):
                        rewrite = True
 
                    try:
+                        # Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key                        
+                        if type(value) == dict and \
+                            dcols[tag][0].nb_elements_per_line == 1 and len(value.keys()) == 1 \
+                            and dcols[tag][0].elements_names[0] != list(value.keys())[0] :
+                            raise IndexError  # trigger column rewrite
+                        
                        # Fill value
                        dcols[tag][0][i] = value
                     
--- a/python/obitools3/commands/ngsfilter.pyx
+++ b/python/obitools3/commands/ngsfilter.pyx
@ -13,6 +13,7 @@ from obitools3.libalign.apat_pattern import Primer_search
 from obitools3.dms.obiseq cimport Nuc_Seq
 from obitools3.dms.capi.obitypes cimport OBI_SEQ, OBI_QUAL
 from obitools3.dms.capi.apat cimport MAX_PATTERN
+from obitools3.dms.capi.obiview cimport REVERSE_SEQUENCE_COLUMN, REVERSE_QUALITY_COLUMN
 from obitools3.utils cimport tobytes

 from libc.stdint cimport INT32_MAX
@ -22,8 +23,8 @@ import sys
 from cpython.exc cimport PyErr_CheckSignals


-REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE"      # used by alignpairedend tool
-REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"   # used by alignpairedend tool
+#REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE"      # used by alignpairedend tool
+#REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"   # used by alignpairedend tool


 __title__="Assigns sequence records to the corresponding experiment/sample based on DNA tags and primers"
@ -259,8 +260,8 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):

    if not_aligned:
        sequences[1] = sequences[1].clone()
-        sequences[0][REVERSE_SEQ_COLUMN_NAME] = sequences[1].seq             # used by alignpairedend tool
-        sequences[0][REVERSE_QUALITY_COLUMN_NAME] = sequences[1].quality     # used by alignpairedend tool
+        sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq             # used by alignpairedend tool
+        sequences[0][REVERSE_QUALITY_COLUMN] = sequences[1].quality     # used by alignpairedend tool

    for seq in sequences:
        if hasattr(seq, "quality_array"): 
@ -295,8 +296,8 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):

    if directmatch is None:
        if not_aligned:
-            sequences[0][REVERSE_SEQ_COLUMN_NAME] = sequences[1].seq             # used by alignpairedend tool
-            sequences[0][REVERSE_QUALITY_COLUMN_NAME] = sequences[1].quality     # used by alignpairedend tool
+            sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq             # used by alignpairedend tool
+            sequences[0][REVERSE_QUALITY_COLUMN] = sequences[1].quality     # used by alignpairedend tool
        sequences[0][b'error']=b'No primer match'
        return False, sequences[0]

@ -314,8 +315,8 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
        sequences[0] = sequences[0][directmatch[1][2]:]
    else:
        sequences[1] = sequences[1][directmatch[1][2]:]
-        sequences[0][REVERSE_SEQ_COLUMN_NAME] = sequences[1].seq           # used by alignpairedend tool
-        sequences[0][REVERSE_QUALITY_COLUMN_NAME] = sequences[1].quality   # used by alignpairedend tool
+        sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq           # used by alignpairedend tool
+        sequences[0][REVERSE_QUALITY_COLUMN] = sequences[1].quality   # used by alignpairedend tool
    
    if directmatch[0].forward:
        sequences[0][b'direction']=b'forward'
@ -361,8 +362,8 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
                sequences[0] = sequences[0][:r[1]]
            else:
                sequences[1] = sequences[1][:r[1]]
-                sequences[0][REVERSE_SEQ_COLUMN_NAME] = sequences[1].seq           # used by alignpairedend tool
-                sequences[0][REVERSE_QUALITY_COLUMN_NAME] = sequences[1].quality   # used by alignpairedend tool
+                sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq           # used by alignpairedend tool
+                sequences[0][REVERSE_QUALITY_COLUMN] = sequences[1].quality   # used by alignpairedend tool
        # do the same on the other seq
        if first_match_first_seq: 
            r = direct_primer.revcomp(sequences[1])
@ -373,8 +374,8 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
                sequences[1] = sequences[1][:r[1]]
            else:
                sequences[0] = sequences[0][:r[1]] 
-                sequences[0][REVERSE_SEQ_COLUMN_NAME] = sequences[1].seq
-                sequences[0][REVERSE_QUALITY_COLUMN_NAME] = sequences[1].quality
+                sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq
+                sequences[0][REVERSE_QUALITY_COLUMN] = sequences[1].quality
    
    
    # Look for other primer in the other direction on the sequence, or
@ -442,8 +443,8 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
            sequences[1] = sequences[1][reversematch[1][2]:]
            if not directmatch[0].forward:
                sequences[1] = sequences[1].reverse_complement
-            sequences[0][REVERSE_SEQ_COLUMN_NAME] = sequences[1].seq           # used by alignpairedend tool
-            sequences[0][REVERSE_QUALITY_COLUMN_NAME] = sequences[1].quality   # used by alignpairedend tool
+            sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq           # used by alignpairedend tool
+            sequences[0][REVERSE_QUALITY_COLUMN] = sequences[1].quality   # used by alignpairedend tool
        else:
            sequences[0] = sequences[0][reversematch[1][2]:]
            
@ -605,12 +606,12 @@ def run(config):
                paired_p.revcomp.aligner = aligner
    
    if not_aligned:   # create columns used by alignpairedend tool
-        Column.new_column(o_view, REVERSE_SEQ_COLUMN_NAME, OBI_SEQ)
-        Column.new_column(o_view, REVERSE_QUALITY_COLUMN_NAME, OBI_QUAL, associated_column_name=REVERSE_SEQ_COLUMN_NAME, associated_column_version=o_view[REVERSE_SEQ_COLUMN_NAME].version)
+        Column.new_column(o_view, REVERSE_SEQUENCE_COLUMN, OBI_SEQ)
+        Column.new_column(o_view, REVERSE_QUALITY_COLUMN, OBI_QUAL, associated_column_name=REVERSE_SEQUENCE_COLUMN, associated_column_version=o_view[REVERSE_SEQUENCE_COLUMN].version)
        
        if unidentified is not None:
-            Column.new_column(unidentified, REVERSE_SEQ_COLUMN_NAME, OBI_SEQ)
-            Column.new_column(unidentified, REVERSE_QUALITY_COLUMN_NAME, OBI_QUAL, associated_column_name=REVERSE_SEQ_COLUMN_NAME, associated_column_version=unidentified[REVERSE_SEQ_COLUMN_NAME].version)
+            Column.new_column(unidentified, REVERSE_SEQUENCE_COLUMN, OBI_SEQ)
+            Column.new_column(unidentified, REVERSE_QUALITY_COLUMN, OBI_QUAL, associated_column_name=REVERSE_SEQUENCE_COLUMN, associated_column_version=unidentified[REVERSE_SEQUENCE_COLUMN].version)
    
    g = 0
    u = 0
--- a/python/obitools3/commands/uniq.pyx
+++ b/python/obitools3/commands/uniq.pyx
@ -8,7 +8,8 @@ from obitools3.dms.view import RollbackException
 from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column, Column_line
 from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN, TAXID_COLUMN, \
-                                        TAXID_DIST_COLUMN, MERGED_TAXID_COLUMN, MERGED_COLUMN, MERGED_PREFIX
+                                        TAXID_DIST_COLUMN, MERGED_TAXID_COLUMN, MERGED_COLUMN, MERGED_PREFIX, \
+                                        REVERSE_QUALITY_COLUMN
 from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t
 from obitools3.apps.optiongroups import addMinimalInputOption, \
                                        addMinimalOutputOption, \
@ -24,9 +25,6 @@ from cpython.exc cimport PyErr_CheckSignals

 __title__="Group sequence records together"

-
-REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"   # TODO from ngsfilter, move to C
-
 
 def addOptions(parser):
 
@ -496,8 +494,8 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, li
    # Deletes quality columns if there is one because the matching between sequence and quality will be broken (quality set to NA when sequence not)
    if QUALITY_COLUMN in view:
        o_view.delete_column(QUALITY_COLUMN)
-    if REVERSE_QUALITY_COLUMN_NAME in view:
-        o_view.delete_column(REVERSE_QUALITY_COLUMN_NAME)
+    if REVERSE_QUALITY_COLUMN in view:
+        o_view.delete_column(REVERSE_QUALITY_COLUMN)
    
    if taxonomy is not None:
        print("")  # TODO because in the middle of progress bar. Better solution?
--- a/python/obitools3/dms/capi/obiview.pxd
+++ b/python/obitools3/dms/capi/obiview.pxd
@ -24,6 +24,8 @@ cdef extern from "obiview.h" nogil:
    extern const_char_p ID_COLUMN
    extern const_char_p DEFINITION_COLUMN
    extern const_char_p QUALITY_COLUMN
+    extern const_char_p REVERSE_QUALITY_COLUMN
+    extern const_char_p REVERSE_SEQUENCE_COLUMN
    extern const_char_p COUNT_COLUMN
    extern const_char_p TAXID_COLUMN
    extern const_char_p MERGED_TAXID_COLUMN
@ -100,7 +102,7 @@ cdef extern from "obiview.h" nogil:
                            const_char_p comments,
                            bint create)

-    int obi_view_delete_column(Obiview_p view, const_char_p column_name)
+    int obi_view_delete_column(Obiview_p view, const_char_p column_name, bint delete_file)
            
    OBIDMS_column_p obi_view_get_column(Obiview_p view, const_char_p column_name)

--- a/python/obitools3/dms/column/column.pyx
+++ b/python/obitools3/dms/column/column.pyx
@ -21,7 +21,11 @@ from ..capi.obiutils cimport obi_format_date
 from ..capi.obiview cimport obi_view_add_column, \
                            obi_view_get_pointer_on_column_in_view, \
                            Obiview_p, \
-                            NUC_SEQUENCE_COLUMN
+                            NUC_SEQUENCE_COLUMN, \
+                            QUALITY_COLUMN, \
+                            REVERSE_SEQUENCE_COLUMN, \
+                            REVERSE_QUALITY_COLUMN
+

 from ..object cimport OBIDeactivatedInstanceError

@ -122,11 +126,18 @@ cdef class Column(OBIWrapper) :
        
        if data_type == OBI_QUAL:
            if associated_column_name_b == b"":
-                if NUC_SEQUENCE_COLUMN not in view:
-                     raise RuntimeError("Cannot create column %s in view %s: trying to create quality column but no NUC_SEQ column to associate it with in the view" % (bytes2str(column_name_b),
-                                                                           bytes2str(view.name)))
-                associated_column_name_b = NUC_SEQUENCE_COLUMN
-                associated_column_version = view[NUC_SEQUENCE_COLUMN].version
+                if column_name == QUALITY_COLUMN:
+                    if NUC_SEQUENCE_COLUMN not in view:
+                         raise RuntimeError("Cannot create column %s in view %s: trying to create quality column but no NUC_SEQ column to associate it with in the view" % (bytes2str(column_name_b),
+                                                                               bytes2str(view.name)))
+                    associated_column_name_b = NUC_SEQUENCE_COLUMN
+                    associated_column_version = view[NUC_SEQUENCE_COLUMN].version
+                elif column_name == REVERSE_QUALITY_COLUMN:
+                    if REVERSE_SEQUENCE_COLUMN not in view:
+                         raise RuntimeError("Cannot create column %s in view %s: trying to create reverse quality column but no REVERSE_SEQUENCE column to associate it with in the view" % (bytes2str(column_name_b),
+                                                                               bytes2str(view.name)))
+                    associated_column_name_b = REVERSE_SEQUENCE_COLUMN
+                    associated_column_version = view[REVERSE_SEQUENCE_COLUMN].version
        
        if (obi_view_add_column(view                      = view.pointer(),
                                column_name               = column_name_b,
--- a/python/obitools3/dms/dms.pyx
+++ b/python/obitools3/dms/dms.pyx
@ -259,7 +259,7 @@ cdef class DMS(OBIWrapper):
        for command in self.command_line_history:
            s+=b"#"
            s+=command[b"time"]
-            s+=b"\n"
+            s+=b"\nobi "
            s+=command[b"command"]
            s+=b"\n"
        return s
--- a/python/obitools3/dms/view/view.pxd
+++ b/python/obitools3/dms/view/view.pxd
@ -22,7 +22,8 @@ cdef class View(OBIWrapper):
    cdef inline Obiview_p pointer(self)   
        
    cpdef delete_column(self, 
-                        object column_name)
+                        object column_name,
+                        bint delete_file=*)
    
    cpdef rename_column(self, 
                        object current_name, 
--- a/python/obitools3/dms/view/view.pyx
+++ b/python/obitools3/dms/view/view.pyx
@ -227,7 +227,8 @@ cdef class View(OBIWrapper) :

    
    cpdef delete_column(self, 
-                        object column_name) :
+                        object column_name,
+                        bint delete_file=False) :

        cdef bytes column_name_b = tobytes(column_name)

@ -239,7 +240,7 @@ cdef class View(OBIWrapper) :
        col.close()
        
        # Remove the column from the view which closes the C structure
-        if obi_view_delete_column(self.pointer(), column_name_b) < 0 :
+        if obi_view_delete_column(self.pointer(), column_name_b, delete_file) < 0 :
            raise RollbackException("Problem deleting column %s from a view",
                            bytes2str(column_name_b), self)

@ -297,11 +298,17 @@ cdef class View(OBIWrapper) :
                                       nb_elements_per_line=new_nb_elements_per_line, elements_names=new_elements_names, 
                                       comments=old_column.comments, alias=column_name_b+tobytes('___new___'))
        
+        switch_to_dict = old_column.nb_elements_per_line == 1 and new_nb_elements_per_line > 1
+        ori_key = old_column._elements_names[0]
+        
        for i in range(length) :
-            new_column[i] = old_column[i]
+            if switch_to_dict :
+                new_column[i] = {ori_key: old_column[i]}
+            else:
+                new_column[i] = old_column[i]

        # Remove old column from view
-        self.delete_column(column_name_b)
+        self.delete_column(column_name_b, delete_file=True)

        # Rename new
        new_column.name = column_name_b
--- a/python/obitools3/libalign/_solexapairend.pyx
+++ b/python/obitools3/libalign/_solexapairend.pyx
@ -6,6 +6,7 @@ from .solexapairend import iterOnAligment
 from .shifted_ali cimport Ali_shifted

 from obitools3.dms.capi.obiview cimport Obiview_p, QUALITY_COLUMN, NUC_SEQUENCE_COLUMN, \
+                                        REVERSE_SEQUENCE_COLUMN, REVERSE_QUALITY_COLUMN, \
                                        obi_set_qual_int_with_elt_idx_and_col_p_in_view, \
                                        obi_set_str_with_elt_idx_and_col_p_in_view
                                        
@ -13,7 +14,6 @@ from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p

 from obitools3.dms.view.view cimport View
 from obitools3.dms.column.column cimport Column
-from obitools3.commands.ngsfilter import REVERSE_SEQ_COLUMN_NAME, REVERSE_QUALITY_COLUMN_NAME

 from math import log10

@ -233,7 +233,7 @@ def buildConsensus(ali, seq, ref_tags=None):
    seq[b'mode']=b'alignment'

    for tag in ref_tags:
-        if tag != REVERSE_SEQ_COLUMN_NAME and tag != REVERSE_QUALITY_COLUMN_NAME and \
+        if tag != REVERSE_SEQUENCE_COLUMN and tag != REVERSE_QUALITY_COLUMN and \
            tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN:
            seq[tag] = ref_tags[tag]

@ -254,7 +254,7 @@ def buildJoinedSequence(ali, reverse, seq, forward=None):
    seq[b"mode"]=b"joined"
    seq[b"pairedend_limit"]=len(forward)    
    for tag in forward:
-        if tag != REVERSE_SEQ_COLUMN_NAME and tag != REVERSE_QUALITY_COLUMN_NAME:
+        if tag != REVERSE_SEQUENCE_COLUMN and tag != REVERSE_QUALITY_COLUMN:
            seq[tag] = forward[tag]
    return seq

--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@ -171,7 +171,8 @@ Reads an URI and returns a tuple containing:
 def open_uri(uri,
             bint input=True,
             type newviewtype=View,
-             dms_only=False):
+             dms_only=False,
+             force_file=False):
    
    cdef bytes urib = tobytes(uri)
    cdef bytes scheme
@ -195,9 +196,9 @@ def open_uri(uri,
    if 'obi' not in config:
        config['obi']={}
    
-    try:
+    if not force_file and "defaultdms" in config["obi"]:
        default_dms=config["obi"]["defaultdms"]
-    except KeyError:
+    else:
        default_dms=None
        
    try:
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '0-beta5'
+serial= '0-beta10'

 version ="%d.%02d.%s" % (major,minor,serial)
--- a/src/obi_clean.c
+++ b/src/obi_clean.c
@ -409,8 +409,7 @@ int obi_clean(const char* dms_name,
 			stop = true;
 		}

-		#pragma omp parallel default(none) \
-					 	 	 shared(thread_count, seq_count, blob_array, complete_sample_count_array, alignment_result_array, \
+		#pragma omp parallel shared(thread_count, seq_count, blob_array, complete_sample_count_array, alignment_result_array, \
 					 	 			 stop, blob1, i, obi_errno, keep_running, stderr, max_ratio, iseq_column, i_view, \
 									 similarity_mode, reference, normalize, threshold, ktable, status_column, o_view, sample_count)
 		{
--- a/src/obidms.c
+++ b/src/obidms.c
@ -696,6 +696,12 @@ int obi_clean_dms(const char* dms_path)
 //		return -1;
 //	}

+	if (obi_close_dms(dms, true) < 0)
+	{
+		obidebug(1, "\nError closing a DMS after cleaning");
+		return -1;
+	}
+
 	return 0;
 }

--- a/src/obitypes.h
+++ b/src/obitypes.h
@ -34,8 +34,8 @@
 * @brief enum for the boolean OBIType.
 */
 typedef enum OBIBool {
-    FALSE      = 0,
-    TRUE       = 1,
+    OBIFalse   = 0,
+    OBITrue    = 1,
    OBIBool_NA = 2
 } obibool_t, *obibool_p; 		/**< a boolean true/false value */	// TODO check name convention?

--- a/src/obiview.c
+++ b/src/obiview.c
@ -2380,11 +2380,12 @@ int obi_view_add_column(Obiview_p    view,
 }


-int obi_view_delete_column(Obiview_p view, const char* column_name)
+int obi_view_delete_column(Obiview_p view, const char* column_name, bool delete_file)
 {
 	int  i;
 	bool found;
 	OBIDMS_column_p column;
+	char* col_to_delete_path;

 	// Check that the view is not read-only
 	if (view->read_only)
@ -2406,8 +2407,31 @@ int obi_view_delete_column(Obiview_p view, const char* column_name)
 				obidebug(1, "\nError getting a column from the linked list of column pointers of a view when deleting a column from a view");
 				return -1;
 			}
+			// Keep the column path if the file needs to be deleted
+			if (delete_file)
+			{
+				col_to_delete_path = obi_column_full_path(view->dms, column->header->name, column->header->version);
+				if (col_to_delete_path == NULL)
+				{
+					obidebug(1, "\nError getting a column file path when deleting a column");
+					return -1;
+				}
+			}

 			obi_close_column(column);
+
+			// Delete file if needed
+			if (delete_file)
+			{
+				if (remove(col_to_delete_path) < 0)
+				{
+					obi_set_errno(OBICOL_UNKNOWN_ERROR);
+					obidebug(1, "\nError deleting a column file when deleting unfinished columns: file %s", col_to_delete_path);
+					return -1;
+				}
+				free(col_to_delete_path);
+			}
+
 			view->columns = ll_delete(view->columns, i);
 			// TODO how do we check for error? NULL can be empty list
 			found = true;
@ -3047,7 +3071,7 @@ int obi_create_auto_id_column(Obiview_p view, const char* prefix)
 	// Delete old ID column if it exists
 	if (obi_view_get_column(view, ID_COLUMN) != NULL)
 	{
-		if (obi_view_delete_column(view, ID_COLUMN) < 0)
+		if (obi_view_delete_column(view, ID_COLUMN, false) < 0)
 		{
 			obidebug(1, "Error deleting an ID column to replace it in a view");
 			return -1;
--- a/src/obiview.h
+++ b/src/obiview.h
@ -52,6 +52,15 @@
 #define QUALITY_COLUMN "QUALITY"				/**< The name of the column containing the sequence qualities
 	 	 	 	 	 	 	 	 	 	 	 	 *   in NUC_SEQS_VIEW views.
                                	 	  	   	 */
+#define REVERSE_QUALITY_COLUMN "REVERSE_QUALITY" /**< The name of the column containing the sequence qualities
+ 	 	 	 	 	 	 	 	 	 	 	 	 *    of the reverse read (generated by ngsfilter, used by alignpairedend).
+                                	 	  	   	 */
+#define REVERSE_SEQUENCE_COLUMN "REVERSE_SEQUENCE" /**< The name of the column containing the sequence
+ 	 	 	 	 	 	 	 	 	 	 	 	 *    of the reverse read (generated by ngsfilter, used by alignpairedend).
+                                	 	  	   	 */
+#define QUALITY_COLUMN "QUALITY"				/**< The name of the column containing the sequence qualities
+ 	 	 	 	 	 	 	 	 	 	 	 	 *   in NUC_SEQS_VIEW views.
+                                	 	  	   	 */
 #define COUNT_COLUMN "COUNT"				    /**< The name of the column containing the sequence counts
 	 	 	 	 	 	 	 	 	 	 	 	 *   in NUC_SEQS_VIEW views.
                                	 	  	  	 */
@ -431,6 +440,7 @@ int obi_view_add_column(Obiview_p    view,
 *
 * @param view A pointer on the view.
 * @param column_name The name of the column that should be deleted from the view.
+ * @param delete_file Whether the column file should be deleted. Use with care with regard to dependencies.
 *
 * @returns A value indicating the success of the operation.
 * @retval 0 if the operation was successfully completed.
@ -439,7 +449,7 @@ int obi_view_add_column(Obiview_p    view,
 * @since February 2016
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
-int obi_view_delete_column(Obiview_p view, const char* column_name);
+int obi_view_delete_column(Obiview_p view, const char* column_name, bool delete_file);


 /**
--- a/src/sse_banded_LCS_alignment.c
+++ b/src/sse_banded_LCS_alignment.c
@ -951,15 +951,15 @@ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bo
 	// Put the DNA sequences in the int arrays. Longest sequence must be first argument of sse_align function
 	if (l2 > l1)
 	{
-		putSeqInSeq(iseq1, seq2, l2, TRUE);
-		putSeqInSeq(iseq2, seq1, l1, FALSE);
+		putSeqInSeq(iseq1, seq2, l2, true);
+		putSeqInSeq(iseq2, seq1, l1, false);
 		// Compute alignment
 		id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, similarity_mode, address, LCSmin, lcs_length, ali_length);
 	}
 	else
 	{
-		putSeqInSeq(iseq1, seq1, l1, TRUE);
-		putSeqInSeq(iseq2, seq2, l2, FALSE);
+		putSeqInSeq(iseq1, seq1, l1, true);
+		putSeqInSeq(iseq2, seq2, l2, false);
 		// Compute alignment
 		id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, similarity_mode, address, LCSmin, lcs_length, ali_length);
 	}
@ -1054,15 +1054,15 @@ double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double thr
 	// Put the DNA sequences in the int arrays. Longest sequence must be first argument of sse_align function
 	if (l2 > l1)
 	{
-		putBlobInSeq(iseq1, seq2, l2, TRUE);
-		putBlobInSeq(iseq2, seq1, l1, FALSE);
+		putBlobInSeq(iseq1, seq2, l2, true);
+		putBlobInSeq(iseq2, seq1, l1, false);
 		// Compute alignment
 		id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, similarity_mode, address, LCSmin, lcs_length, ali_length);
 	}
 	else
 	{
-		putBlobInSeq(iseq1, seq1, l1, TRUE);
-		putBlobInSeq(iseq2, seq2, l2, FALSE);
+		putBlobInSeq(iseq1, seq1, l1, true);
+		putBlobInSeq(iseq2, seq2, l2, false);
 		// Compute alignment
 		id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, similarity_mode, address, LCSmin, lcs_length, ali_length);
 	}
Author	SHA1	Message	Date
Celine Mercier	dc9f897917	switch to version 3.0.0-beta10	2020-02-02 21:15:27 +01:00
Celine Mercier	bb72682f7d	obi import: new option --preread to do a first readthrough of the dataset if it contains huge dictionaries for a much faster import.	2020-02-02 21:12:34 +01:00
Celine Mercier	52920c3c71	URI decoding: dirty temp fix for bug where default dms makes a mess when should guess file	2020-02-02 21:11:05 +01:00
Celine Mercier	18c22cecf9	switch to version 3.0.0-beta9	2020-02-01 15:48:55 +01:00
Celine Mercier	1bfb96023c	obi import: rewriting a column now deletes the old one to save disk space	2020-02-01 15:31:14 +01:00
Celine Mercier	c67d668989	obi import: fixed a bug when the first entry would contain a dictionary with one key. Switch to beta8	2020-01-29 20:23:39 +01:00
Celine Mercier	db0ac37d41	switch to version 3.0.0-beta7	2020-01-29 16:18:53 +01:00
Celine Mercier	d0c21ecd39	Removed an OpenMP clause that was not obligatory and triggered a known gcc bug involving macros	2020-01-24 16:00:53 +01:00
Celine Mercier	53212168a2	History: added 'obi' in bash history for practical reasons	2020-01-23 16:51:49 +01:00
Celine Mercier	b4b2e62195	Cleaner handling of reverse quality columns	2020-01-18 19:28:12 +01:00
Celine Mercier	ced82c4242	Switching to version 3.0-beta6	2020-01-18 17:29:23 +01:00
Celine Mercier	a524f8829e	New command: obi cat to concatenate views (not optimized yet)	2020-01-18 17:28:31 +01:00
Celine Mercier	5c9091e9eb	C: closing DMS after cleaning it instead of counting on upper layer	2020-01-18 17:27:35 +01:00
Celine Mercier	822000cb70	Fixes in documentation	2020-01-18 17:26:18 +01:00
Celine Mercier	b9cd9bee9a	C: Changed obibool definitions because of conflict with R	2020-01-06 15:11:31 +01:00