Switch to version 3.0.0b27

C: AVL trees: fixed a bug where the comparison of two crc64 values was computed
as their difference stored in an int64, which corrupted the trees and resulted in failed data dereplication; the values are now compared explicitly
2020-07-07 16:47:21 +02:00 · 2020-07-07 16:47:00 +02:00 · 2020-07-03 17:36:57 +02:00 · 2020-07-01 18:20:38 +02:00 · 2020-07-01 18:17:47 +02:00 · 2020-06-17 18:56:07 +02:00
25 changed files with 224 additions and 79 deletions
--- a/python/obitools3/commands/clean.pyx
+++ b/python/obitools3/commands/clean.pyx
@ -36,8 +36,7 @@ def addOptions(parser):
                       dest="clean:sample-tag-name",
                       metavar="<SAMPLE TAG NAME>",
                       type=str,
-                       default="merged_sample",
-                       help="Name of the tag where sample counts are kept.")
+                       help="Name of the tag where merged sample count informations are kept (typically generated by obi uniq, usually MERGED_sample, default: None).")
    
    group.add_argument('--ratio', '-r',
                       action="store", dest="clean:ratio",
@ -107,6 +106,9 @@ def run(config):
    command_line = " ".join(sys.argv[1:])
    comments = View.print_config(config, "clean", command_line, input_dms_name=[i_dms_name], input_view_name=[i_view_name])

+    if 'sample-tag-name' not in config['clean']:
+        config['clean']['sample-tag-name'] = ""
+        
    if obi_clean(i_dms.name_with_full_path, tobytes(i_view_name), tobytes(config['clean']['sample-tag-name']), tobytes(o_view_name), comments, \
              config['clean']['distance'], config['clean']['ratio'], config['clean']['heads-only'], config['clean']['thread-count']) < 0:
        raise Exception("Error running obiclean")
--- a/python/obitools3/commands/count.pyx
+++ b/python/obitools3/commands/count.pyx
@ -22,7 +22,7 @@ def addOptions(parser):
    group.add_argument('-s','--sequence',
                        action="store_true", dest="count:sequence",
                        default=False,
-                        help="Prints only the number of sequence records.")
+                        help="Prints only the number of sequence records (much faster, default: False).")
 
    group.add_argument('-a','--all',
                        action="store_true", dest="count:all",
--- a/python/obitools3/commands/ecopcr.pyx
+++ b/python/obitools3/commands/ecopcr.pyx
@ -35,12 +35,14 @@ def addOptions(parser):
                       action="store", dest="ecopcr:primer1",
                       metavar='<PRIMER>',
                       type=str,
+                       required=True,
                       help="Forward primer, length must be less than or equal to 32")

    group.add_argument('--primer2', '-R',
                       action="store", dest="ecopcr:primer2",
                       metavar='<PRIMER>',
                       type=str,
+                       required=True,
                       help="Reverse primer, length must be less than or equal to 32")

    group.add_argument('--error', '-e',
--- a/python/obitools3/commands/grep.pyx
+++ b/python/obitools3/commands/grep.pyx
@ -161,8 +161,7 @@ def obi_eval(compiled_expr, loc_env, line):
    return obi_eval_result
    

-def Filter_generator(options, tax_filter):
-    #taxfilter = taxonomyFilterGenerator(options)
+def Filter_generator(options, tax_filter, i_view):

    # Initialize conditions
    predicates = None
@ -171,6 +170,9 @@ def Filter_generator(options, tax_filter):
    attributes = None
    if "attributes" in options and len(options["attributes"]) > 0:
        attributes = options["attributes"]
+        for attribute in attributes:
+            if attribute not in i_view:
+                return None
    lmax = None
    if "lmax" in options:
        lmax = options["lmax"]
@ -196,6 +198,8 @@ def Filter_generator(options, tax_filter):
    if "attribute_patterns" in options and len(options["attribute_patterns"]) > 0:
        for p in options["attribute_patterns"]:
            attribute, pattern = p.split(":", 1)
+            if attribute not in i_view:
+                return None
            attribute_patterns[tobytes(attribute)] = re.compile(tobytes(pattern))
    
    def filter(line, loc_env):
@ -324,21 +328,29 @@ def run(config):
 
    # Apply filter
    tax_filter = Taxonomy_filter_generator(taxo, config["grep"])
-    filter = Filter_generator(config["grep"], tax_filter)
+    filter = Filter_generator(config["grep"], tax_filter, i_view)        
    selection = Line_selection(i_view)
-    for i in range(len(i_view)):
-        PyErr_CheckSignals()
-        pb(i)
-        line = i_view[i]
-                 
-        loc_env = {"sequence": line, "line": line, "taxonomy": taxo, "obi_eval_result": False}
-        
-        good = filter(line, loc_env)
- 
-        if good :
-            selection.append(i)
-
-    pb(i, force=True)
+    
+    if filter is None and config["grep"]["invert_selection"]: # all sequences are selected: filter is None if no line will be selected because some columns don't exist
+        for i in range(len(i_view)):
+            PyErr_CheckSignals()
+            pb(i)
+            selection.append(i)        
+    
+    elif filter is not None :   # filter is None if no line will be selected because some columns don't exist
+        for i in range(len(i_view)):
+            PyErr_CheckSignals()
+            pb(i)
+            line = i_view[i]
+                     
+            loc_env = {"sequence": line, "line": line, "taxonomy": taxo, "obi_eval_result": False}
+            
+            good = filter(line, loc_env)
+     
+            if good :
+                selection.append(i)
+    
+    pb(len(i_view), force=True)
    print("", file=sys.stderr)

    # Create output view with the line selection
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -260,7 +260,6 @@ def run(config):

        if entry is None:  # error or exception handled at lower level, not raised because Python generators can't resume after any exception is raised
            if config['obi']['skiperror']:
-                i-=1
                continue
            else:
                raise RollbackException("obi import error, rollbacking view", view)
--- a/python/obitools3/commands/ls.pyx
+++ b/python/obitools3/commands/ls.pyx
@ -34,9 +34,10 @@ def run(config):
    if input[2] == DMS and not config['ls']['longformat']:
        dms = input[0]
        l = []
-        for view in input[0]:
-            l.append(tostr(view) + "\t(Date created: " + str(bytes2str_object(dms[view].comments["Date created"]))+")")
-            dms[view].close()
+        for viewname in input[0]:
+            view = dms[viewname]
+            l.append(tostr(viewname) + "\t(Date created: " + str(bytes2str_object(view.comments["Date created"]))+")")
+            view.close()
        l.sort()
        for v in l:
            print(v)
--- a/python/obitools3/commands/ngsfilter.pyx
+++ b/python/obitools3/commands/ngsfilter.pyx
@ -42,6 +42,7 @@ def addOptions(parser):
                     metavar="<URI>",
                     type=str,
                     default=None,
+                     required=True,
                     help="URI to the view containing the samples definition (with tags, primers, sample names,...).\n"
                          "\nWarning: primer lengths must be less than or equal to 32")

@ -478,6 +479,8 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
    if not directmatch[0].forward:
        sequences[0] = sequences[0].reverse_complement
        sequences[0][b'reversed'] = True   # used by the alignpairedend tool (in kmer_similarity.c)
+    else:
+        sequences[0][b'reversed'] = False   # used by the alignpairedend tool (in kmer_similarity.c)

    sample=None
    if not no_tags:
@ -505,7 +508,7 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
                    sample=None
        
        if sample is None:
-            sequences[0][b'error']=b"No tags found"
+            sequences[0][b'error']=b"No sample with that tag combination"
            return False, sequences[0]
    
        sequences[0].update(sample)
--- a/python/obitools3/commands/uniq.pyx
+++ b/python/obitools3/commands/uniq.pyx
@ -307,6 +307,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
        for x in categories :
            catl.append(i_seq[x])    
          
+        #unique_id = tuple(catl) + (i_seq_col[i],)
        unique_id = tuple(catl) + (i_seq_col.get_line_idx(i),)
        #unique_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),)  # The line that cython can't read properly
         
@ -419,12 +420,12 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
    print("")
    logger("info", "Second browsing through the input")
    # Initialize the progress bar
-    pb = ProgressBar(len(uniques), seconde=5)
+    pb = ProgressBar(len(view), seconde=5)
    o_idx = 0
+    total_treated = 0
    
    for unique_id in uniques :
        PyErr_CheckSignals()
-        pb(o_idx)
        
        merged_sequences = uniques[unique_id]
        
@ -453,7 +454,9 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
            merged_dict[mkey] = {}

        for i_idx in merged_sequences:
-                        
+            PyErr_CheckSignals()
+            pb(total_treated)
+ 
            i_id = i_id_col[i_idx]
            i_seq = view[i_idx]

@ -504,7 +507,9 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
                if key != COUNT_COLUMN and key != ID_COLUMN and key != NUC_SEQUENCE_COLUMN and key in o_seq and o_seq[key] != i_seq[key] \
                    and key not in merged_dict :
                    o_seq[key] = None
-
+            
+            total_treated += 1
+            
        # Write merged dicts
        for mkey in merged_dict: 
            if mkey in str_merged_cols:
@ -526,7 +531,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
        o_count_col[o_idx] = o_count
        o_idx += 1
    
-    pb(len(uniques), force=True)
+    pb(len(view), force=True)
    
    # Deletes quality columns if there is one because the matching between sequence and quality will be broken (quality set to NA when sequence not)
    if QUALITY_COLUMN in view:
@ -588,10 +593,11 @@ def run(config):
    # Initialize the progress bar
    pb = ProgressBar(len(entries), config, seconde=5)
    
-    try:
-        uniq_sequences(entries, o_view, pb, config, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])       
-    except Exception, e:
-        raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
+    if len(entries) > 0:
+        try:
+            uniq_sequences(entries, o_view, pb, config, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])       
+        except Exception, e:
+            raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
    
    print("", file=sys.stderr)

--- a/python/obitools3/dms/column/column.pxd
+++ b/python/obitools3/dms/column/column.pxd
@ -22,6 +22,7 @@ cdef class Column(OBIWrapper) :
    
    cdef inline OBIDMS_column_p pointer(self)
    cdef read_elements_names(self)
+    cpdef list keys(self)
    
    @staticmethod
    cdef type get_column_class(obitype_t obitype, bint multi_elts, bint tuples)
--- a/python/obitools3/dms/column/column.pyx
+++ b/python/obitools3/dms/column/column.pyx
@ -323,7 +323,10 @@ cdef class Column(OBIWrapper) :
        free(elts_names_b)
        return elts_names_list
    
-    
+    cpdef list keys(self):
+        return self._elements_names
+
+
    # Column alias property getter and setter
    @property
    def name(self):
@ -340,7 +343,7 @@ cdef class Column(OBIWrapper) :
    @property
    def elements_names(self):
        return self._elements_names
- 
+       
    # nb_elements_per_line property getter
    @property
    def nb_elements_per_line(self):
--- a/python/obitools3/dms/dms.pyx
+++ b/python/obitools3/dms/dms.pyx
@ -227,7 +227,9 @@ cdef class DMS(OBIWrapper):
        cdef str s 
        s=""
        for view_name in self.keys():
-            s = s + repr(self.get_view(view_name)) + "\n"
+            view = self.get_view(view_name)
+            s = s + repr(view) + "\n"
+            view.close()
        return s
    

--- a/python/obitools3/dms/view/view.pyx
+++ b/python/obitools3/dms/view/view.pyx
@ -533,6 +533,7 @@ cdef class View(OBIWrapper) :
            for command in command_list:
                s+=b"obi "
                s+=command
+                s+=b"\n"
        return s


--- a/python/obitools3/format/tab.pyx
+++ b/python/obitools3/format/tab.pyx
@ -3,7 +3,7 @@
 cimport cython
 from obitools3.dms.view.view cimport Line
 from obitools3.utils cimport bytes2str_object, str2bytes, tobytes
-from obitools3.dms.column.column cimport Column_line
+from obitools3.dms.column.column cimport Column_line, Column_multi_elts


 cdef class TabFormat:
@ -25,19 +25,29 @@ cdef class TabFormat:
        for k in self.tags:
            
            if self.header and self.first_line:
-                value = tobytes(k)
+                if isinstance(data.view[k], Column_multi_elts):
+                    for k2 in data.view[k].keys():
+                        line.append(tobytes(k)+b':'+tobytes(k2))
+                else:
+                    line.append(tobytes(k))
            else:
                value = data[k]
-                if value is not None:
-                    if type(value) == Column_line:
-                        value = value.bytes()
+                if isinstance(data.view[k], Column_multi_elts):
+                    if value is None:  # all keys at None
+                        for k2 in data.view[k].keys(): # TODO could be much more efficient
+                            line.append(self.NAString)
                    else:
-                        value = str2bytes(str(bytes2str_object(value))) # genius programming
-                if value is None:
-                    value = self.NAString
-            
-            line.append(value)
-      	
+                        for k2 in data.view[k].keys(): # TODO could be much more efficient
+                            if value[k2] is not None:
+                                line.append(str2bytes(str(bytes2str_object(value[k2]))))  # genius programming
+                            else:
+                                line.append(self.NAString)
+                else:
+                    if value is not None:
+                        line.append(str2bytes(str(bytes2str_object(value))))
+                    else:
+                        line.append(self.NAString)
+                  	
        if self.first_line:
            self.first_line = False
      		
--- a/python/obitools3/libalign/_solexapairend.pyx
+++ b/python/obitools3/libalign/_solexapairend.pyx
@ -188,7 +188,7 @@ def buildConsensus(ali, seq, ref_tags=None):
        seq[b'shift']=ali.shift
    else:
        if len(ali[0])>999:   # TODO why?
-            raise AssertionError,"Too long alignemnt"
+            raise AssertionError,"Too long alignment"
    
        ic=IterOnConsensus(ali)
    
@ -250,11 +250,21 @@ def buildJoinedSequence(ali, reverse, seq, forward=None):
    quality.extend(reverse.quality)
    seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality)
    seq[b"score"]=ali.score
-    seq[b"ali_direction"]=ali.direction
+    if len(ali.direction) > 0:
+        seq[b"ali_direction"]=ali.direction
+    else:
+        seq[b"ali_direction"]=None
    seq[b"mode"]=b"joined"
-    seq[b"pairedend_limit"]=len(forward)    
+    seq[b"pairedend_limit"]=len(forward)
+    seq[b"ali_length"] = ali.consensus_len
+    if ali.consensus_len > 0:
+        seq[b"score_norm"]=float(ali.score)/ali.consensus_len
+    else:
+        seq[b"score_norm"]=0.0
+  
    for tag in forward:
-        if tag != REVERSE_SEQUENCE_COLUMN and tag != REVERSE_QUALITY_COLUMN:
+        if tag != REVERSE_SEQUENCE_COLUMN and tag != REVERSE_QUALITY_COLUMN and \
+            tag != NUC_SEQUENCE_COLUMN and tag != QUALITY_COLUMN:
            seq[tag] = forward[tag]
    return seq

--- a/python/obitools3/utils.pyx
+++ b/python/obitools3/utils.pyx
@ -166,7 +166,9 @@ cdef object bytes2str_object(object value):  # Only works if complex types are d
                    value[k] = bytes2str(v)
            if type(k) == bytes:
                value[bytes2str(k)] = value.pop(k)
-    elif isinstance(value, list):
+    elif isinstance(value, list) or isinstance(value, tuple):
+        if isinstance(value, tuple):
+            value = list(value)
        for i in range(len(value)):
            if isinstance(value[i], list) or isinstance(value[i], dict):
                value[i] = bytes2str_object(value[i])
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '0b21'
+serial= '0b27'

 version ="%d.%d.%s" % (major,minor,serial)
--- a/setup.py
+++ b/setup.py
@ -27,10 +27,11 @@ class Distribution(ori_Distribution):
        
        ori_Distribution.__init__(self, attrs)
        
-        self.global_options.insert(0,('cobitools3', None, "intall location of the C library"
+        self.global_options.insert(0,('cobitools3', None, "install location of the C library"
                                     ))

 from distutils.command.build import build as build_ori
+from setuptools.command.bdist_egg import bdist_egg as bdist_egg_ori
 from distutils.core import Command


@ -71,6 +72,12 @@ class build(build_ori):
        build_ori.run(self)


+class bdist_egg(bdist_egg_ori):
+    def run(self):
+        self.run_command('build_clib')
+        bdist_egg_ori.run(self)
+
+
 sys.path.append(os.path.abspath("python"))


@ -166,6 +173,7 @@ setup(name=PACKAGE,
      ext_modules=xx,
      distclass=Distribution,
      cmdclass={'build': build,
+                'bdist_egg': bdist_egg,
                'build_clib': build_clib},
      cobitools3=get_python_lib(),
      packages = findPackage('python'),
--- a/src/kmer_similarity.c
+++ b/src/kmer_similarity.c
@ -413,7 +413,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 	 	return NULL;
 	}

-	score = max_common_kmers + kmer_size - 1;  // aka the number of nucleotides in the longest stretch of kmers perfectly matching
+	if (max_common_kmers > 0)
+		score = max_common_kmers + kmer_size - 1;  // aka the number of nucleotides in the longest stretch of kmers perfectly matching
+	else
+		score = 0;
 	abs_shift = abs(best_shift);

 	// Save result in Obi_ali structure
@ -423,10 +426,15 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 	ali->shift = abs_shift;
 	ali->consensus_seq = NULL;
 	ali->consensus_qual = NULL;
-	if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
-		strcpy(ali->direction, "left");
+	if (score == 0)
+		ali->direction[0] = '\0';
 	else
-		strcpy(ali->direction, "right");
+	{
+		if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
+			strcpy(ali->direction, "left");
+		else
+			strcpy(ali->direction, "right");
+	}

 	// Build the consensus sequence if asked
 	if (build_consensus)
--- a/src/obi_clean.c
+++ b/src/obi_clean.c
@ -246,7 +246,16 @@ int obi_clean(const char* dms_name,

 	// Open the sample column if there is one
 	if ((strcmp(sample_column_name, "") == 0) || (sample_column_name == NULL))
-		sample_column = NULL;
+	{
+		fprintf(stderr, "Info: No sample information provided, assuming one sample.\n");
+		sample_column = obi_view_get_column(i_view, COUNT_COLUMN);
+		if (sample_column == NULL)
+		{
+			obidebug(1, "\nError getting the COUNT column");
+			return -1;
+		}
+		sample_count = 1;
+	}
 	else
 	{
 		sample_column = obi_view_get_column(i_view, sample_column_name);
@ -255,6 +264,13 @@ int obi_clean(const char* dms_name,
 			obidebug(1, "\nError getting the sample column");
 			return -1;
 		}
+		sample_count = (sample_column->header)->nb_elements_per_line;
+		// Check that the sample column is a merged column with all sample informations
+		if (sample_count == 1)
+		{
+			obidebug(1, "\n\nError: If a sample column is provided, it must contain 'merged' sample counts as built by obi uniq with the -m option\n");
+			return -1;
+		}
 	}

 	// Create the output view, or a temporary one if heads only
@ -279,8 +295,6 @@ int obi_clean(const char* dms_name,
 		return -1;
 	}

-	sample_count = (sample_column->header)->nb_elements_per_line;
-
 	// Create the output columns
 	if (create_output_columns(o_view, sample_column, sample_count) < 0)
 	{
@ -549,7 +563,7 @@ int obi_clean(const char* dms_name,

 	if (heads_only)
 	{
-		line_selection = malloc((o_view->infos)->line_count * sizeof(index_t));
+		line_selection = malloc((((o_view->infos)->line_count) + 1) * sizeof(index_t));
 		if (line_selection == NULL)
 		{
 			obi_set_errno(OBI_MALLOC_ERROR);
--- a/src/obi_clean.h
+++ b/src/obi_clean.h
@ -52,7 +52,8 @@
 *
 * @param dms A pointer on an OBIDMS.
 * @param i_view_name The name of the input view.
- * @param sample_column_name The name of the OBI_STR column in the input view where the sample information is kept.
+ * @param sample_column_name The name of the column in the input view where the sample information is kept.
+ *                           Must be merged informations as built by the obi uniq tool (checked by the function).
 *                           NULL or "" (empty string) if there is no sample information.
 * @param o_view_name The name of the output view where the results should be written (should not already exist).
 * @param o_view_comments The comments that should be associated with the output view.
--- a/src/obi_ecotag.c
+++ b/src/obi_ecotag.c
@ -71,9 +71,12 @@ static int create_output_columns(Obiview_p o_view);
 * @param name The assigned scientific name.
 * @param assigned_status_column A pointer on the column where the assigned status should be written.
 * @param assigned The assigned status (whether the sequence was assigned to a taxon or not).
- * @param best_match_column A pointer on the column where the list of ids of the best matches should be written.
+ * @param best_match_ids_column A pointer on the column where the list of ids of the best matches should be written.
 * @param best_match_ids The list of ids of the best matches as an array of the concatenated ids separated by '\0'.
 * @param best_match_ids_length The total length of the array of ids of best matches.
+ * @param best_match_taxids_column A pointer on the column where the list of taxids of the best matches should be written.
+ * @param best_match_taxids The list of taxids of the best matches as an array of the taxids.
+ * @param best_match_taxids_length The length of the array of taxids of best matches.
 * @param score_column A pointer on the column where the score should be written.
 * @param score The similarity score of the sequence with its best match(es).
 *
@ -87,7 +90,8 @@ int print_assignment_result(Obiview_p output_view, index_t line,
 							OBIDMS_column_p assigned_taxid_column, int32_t taxid,
 							OBIDMS_column_p assigned_name_column, const char* name,
 							OBIDMS_column_p assigned_status_column, bool assigned,
-							OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length,
+							OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length,
+							OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length,
 							OBIDMS_column_p score_column, double score);


@ -130,7 +134,14 @@ static int create_output_columns(Obiview_p o_view)
 	// Column for array of best match ids
 	if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
 	{
-		obidebug(1, "\nError creating the column for the array of ids of the best match in ecotag");
+		obidebug(1, "\nError creating the column for the array of ids of best matches in ecotag");
+		return -1;
+	}
+
+	// Column for array of best match taxids
+	if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
+	{
+		obidebug(1, "\nError creating the column for the array of taxids of best matches in ecotag");
 		return -1;
 	}

@ -142,7 +153,8 @@ int print_assignment_result(Obiview_p output_view, index_t line,
 							OBIDMS_column_p assigned_taxid_column, int32_t taxid,
 							OBIDMS_column_p assigned_name_column, const char* name,
 							OBIDMS_column_p assigned_status_column, bool assigned,
-							OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length,
+							OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length,
+							OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length,
 							OBIDMS_column_p score_column, double score)
 {
 	// Write the assigned taxid
@ -167,9 +179,16 @@ int print_assignment_result(Obiview_p output_view, index_t line,
 	}

 	// Write the best match ids
-	if (obi_set_array_with_col_p_in_view(output_view, best_match_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0)
+	if (obi_set_array_with_col_p_in_view(output_view, best_match_ids_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0)
 	{
-		obidebug(1, "\nError writing a assignment status in a column when writing ecotag results");
+		obidebug(1, "\nError writing the array of best match ids in a column when writing ecotag results");
+		return -1;
+	}
+
+	// Write the best match taxids
+	if (obi_set_array_with_col_p_in_view(output_view, best_match_taxids_column, line, best_match_taxids, (uint8_t)(sizeof(OBI_INT)*8), best_match_taxids_length) < 0)
+	{
+		obidebug(1, "\nError writing the array of best match taxids in a column when writing ecotag results");
 		return -1;
 	}

@ -235,6 +254,8 @@ int obi_ecotag(const char* dms_name,
 	char*     		best_match_ids;
 	char*			best_match_ids_to_store;
 	int32_t         best_match_ids_length;
+	int32_t*   		best_match_taxids;
+	int32_t*		best_match_taxids_to_store;
 	int				best_match_count;
 	int             buffer_size;
 	int 			best_match_ids_buffer_size;
@ -263,7 +284,8 @@ int obi_ecotag(const char* dms_name,
 	OBIDMS_column_p assigned_taxid_column = NULL;
 	OBIDMS_column_p assigned_name_column = NULL;
 	OBIDMS_column_p assigned_status_column = NULL;
-	OBIDMS_column_p best_match_column = NULL;
+	OBIDMS_column_p best_match_ids_column = NULL;
+	OBIDMS_column_p best_match_taxids_column = NULL;
 	OBIDMS_column_p lca_taxid_a_column = NULL;
 	OBIDMS_column_p score_a_column = NULL;
 	OBIDMS_column_p ref_taxid_column = NULL;
@ -396,7 +418,8 @@ int obi_ecotag(const char* dms_name,
 	assigned_taxid_column = obi_view_get_column(output_view, ECOTAG_TAXID_COLUMN_NAME);
 	assigned_name_column = obi_view_get_column(output_view, ECOTAG_NAME_COLUMN_NAME);
 	assigned_status_column = obi_view_get_column(output_view, ECOTAG_STATUS_COLUMN_NAME);
-	best_match_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME);
+	best_match_ids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME);
+	best_match_taxids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME);
 	score_column = obi_view_get_column(output_view, ECOTAG_SCORE_COLUMN_NAME);

 	// Open the used reference columns
@ -453,6 +476,14 @@ int obi_ecotag(const char* dms_name,
 		return -1;
 	}

+	best_match_taxids = (int32_t*) malloc(buffer_size* sizeof(int32_t));
+	if (best_match_taxids == NULL)
+	{
+		obi_set_errno(OBI_MALLOC_ERROR);
+		obidebug(1, "\nError allocating memory for the best match taxid array in ecotag");
+		return -1;
+	}
+
 	for (i=0; i < query_count; i++)
 	{
 		if (i%1000 == 0)
@ -514,7 +545,7 @@ int obi_ecotag(const char* dms_name,

 				// Store in best match array

-				// Grow match array if needed
+				// Grow match and taxid array if needed
 				if (best_match_count == buffer_size)
 				{
 					buffer_size = buffer_size*2;
@ -525,6 +556,13 @@ int obi_ecotag(const char* dms_name,
 						obidebug(1, "\nError reallocating match array when assigning");
 						return -1;
 					}
+					best_match_taxids = (int32_t*) realloc(best_match_taxids, buffer_size*sizeof(int32_t));
+					if (best_match_taxids == NULL)
+					{
+						obi_set_errno(OBI_MALLOC_ERROR);
+						obidebug(1, "\nError reallocating match taxids array when assigning");
+						return -1;
+					}
 				}

 				id = obi_get_str_with_elt_idx_and_col_p_in_view(ref_view, ref_id_column, j, 0);
@ -545,6 +583,7 @@ int obi_ecotag(const char* dms_name,

 				// Save match
 				best_match_array[best_match_count] = j;
+				best_match_taxids[best_match_count] = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, j, 0);
 				best_match_count++;
 				strcpy(best_match_ids+best_match_ids_length, id);
 				best_match_ids_length = best_match_ids_length + id_len + 1;
@ -629,6 +668,7 @@ int obi_ecotag(const char* dms_name,
 			else
 				lca_name = lca->name;
 			best_match_ids_to_store = best_match_ids;
+			best_match_taxids_to_store = best_match_taxids;
 		}
 		else
 		{
@ -636,6 +676,7 @@ int obi_ecotag(const char* dms_name,
 			lca_name = OBIStr_NA;
 			lca_taxid = OBIInt_NA;
 			best_match_ids_to_store = OBITuple_NA;
+			best_match_taxids_to_store = OBITuple_NA;
 			score = OBIFloat_NA;
 		}

@ -644,7 +685,8 @@ int obi_ecotag(const char* dms_name,
 								    assigned_taxid_column, lca_taxid,
 									assigned_name_column, lca_name,
 									assigned_status_column, assigned,
-									best_match_column, best_match_ids_to_store, best_match_ids_length,
+									best_match_ids_column, best_match_ids_to_store, best_match_ids_length,
+									best_match_taxids_column, best_match_taxids_to_store, best_match_count,
 									score_column, best_score
 									) < 0)
 							return -1;
@ -652,6 +694,7 @@ int obi_ecotag(const char* dms_name,

 	free(best_match_array);
 	free(best_match_ids);
+	free(best_match_taxids);

 	obi_close_taxonomy(taxonomy);
 	obi_save_and_close_view(query_view);
--- a/src/obi_ecotag.h
+++ b/src/obi_ecotag.h
@ -23,7 +23,8 @@
 #define ECOTAG_TAXID_COLUMN_NAME "TAXID"
 #define ECOTAG_NAME_COLUMN_NAME "SCIENTIFIC_NAME"
 #define ECOTAG_STATUS_COLUMN_NAME "ID_STATUS"
-#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH"
+#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH_IDS"
+#define ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME "BEST_MATCH_TAXIDS"
 #define ECOTAG_SCORE_COLUMN_NAME "BEST_IDENTITY"


--- a/src/obiavl.c
+++ b/src/obiavl.c
@ -2259,7 +2259,13 @@ index_t obi_avl_add(OBIDMS_avl_p avl, Obi_blob_p value)
 		parent = next;

 		// Compare the crc of the value with the crc of the current node
-		comp = (current_node->crc64) - crc;
+		//comp = (current_node->crc64) - crc;
+		if ((current_node->crc64) == crc)
+			comp = 0;
+		else if ((current_node->crc64) > crc)
+			comp = 1;
+		else
+			comp = -1;

 		if (comp == 0)
 		{ // check if really same value
@ -2354,7 +2360,13 @@ index_t obi_avl_find(OBIDMS_avl_p avl, Obi_blob_p value)
 		current_node = (avl->tree)+next;

 		// Compare the crc of the value with the crc of the current node
-		comp = (current_node->crc64) - crc;
+		//comp = (current_node->crc64) - crc;
+		if ((current_node->crc64) == crc)
+			comp = 0;
+		else if ((current_node->crc64) > crc)
+			comp = 1;
+		else
+			comp = -1;

 		if (comp == 0)
 		{ // Check if really same value
--- a/src/obidms.c
+++ b/src/obidms.c
@ -1496,7 +1496,7 @@ obiversion_t obi_import_column(const char* dms_path_1, const char* dms_path_2, c
 	memcpy(column_2->data, column_1->data, header_1->data_size);

 	// Copy the AVL files if there are some (overwriting the automatically created files)
-	if ((header_1->returned_data_type == OBI_STR) || (header_1->returned_data_type == OBI_SEQ) || (header_1->returned_data_type == OBI_QUAL))
+	if ((header_1->tuples) || ((header_1->returned_data_type == OBI_STR) || (header_1->returned_data_type == OBI_SEQ) || (header_1->returned_data_type == OBI_QUAL)))
 	{
 		avl_name_1 = (char*) malloc((strlen(header_1->indexer_name) + 1)  * sizeof(char));
 		if (avl_name_1 == NULL)
--- a/src/obidmscolumn.c
+++ b/src/obidmscolumn.c
@ -1350,6 +1350,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p     dms,
 		}
 		strncpy(header->indexer_name, final_indexer_name, INDEXER_MAX_NAME);
 	}
+	else
+		new_column->indexer = NULL;

 	// Fill the data with NA values
 	obi_ini_to_NA_values(new_column, 0, nb_lines);
@ -1558,6 +1560,8 @@ OBIDMS_column_p obi_open_column(OBIDMS_p     dms,
 			return NULL;
 		}
 	}
+	else
+		column->indexer = NULL;

 	if (close(column_file_descriptor) < 0)
 	{
@ -1693,8 +1697,8 @@ int obi_close_column(OBIDMS_column_p column)
 		if (obi_dms_unlist_column(column->dms, column) < 0)
 			ret_val = -1;

-		// If the data type is OBI_STR, OBI_SEQ or OBI_QUAL, the associated indexer is closed
-		if (((column->header)->returned_data_type == OBI_STR) || ((column->header)->returned_data_type == OBI_SEQ) || ((column->header)->returned_data_type == OBI_QUAL))
+		// If it's a tuple column or the data type is OBI_STR, OBI_SEQ or OBI_QUAL, the associated indexer is closed
+		if ((column->indexer) != NULL)
 			if (obi_close_indexer(column->indexer) < 0)
 				ret_val = -1;
Author	SHA1	Message	Date
Celine Mercier	9ace9989c4	Switch to version 3.0.0b27	2020-07-07 16:47:21 +02:00
Celine Mercier	a3ebe5f118	C: AVL trees: fixed a bug where storing the difference between 2 crc64 values in an int64 would mess trees up resulting in failed data dereplication	2020-07-07 16:47:00 +02:00
Celine Mercier	9100e14899	obi uniq: quick fix for bug where some sequences are not correctly dereplicated	2020-07-03 17:36:57 +02:00
Celine Mercier	ccda0661ce	small help documentation improvement	2020-07-01 18:20:38 +02:00
Celine Mercier	aab59f2214	obi clean: fixed a memory bug, fixed the behaviour when there is no sample info, and added checks, warnings and error handling when sample info is not dereplicated	2020-07-01 18:17:47 +02:00
Celine Mercier	ade1107b42	switch to version 3.0.0b26	2020-06-17 18:56:07 +02:00
Celine Mercier	9c7d24406f	export: dictionaries are now formatted like in the original OBITools when exporting in tabular format and tuple formatting is cleaner	2020-06-17 18:55:46 +02:00
Celine Mercier	03bc9915f2	Cython: utils: added handling of tuples to bytes2str_object function	2020-06-17 18:54:14 +02:00
Celine Mercier	24b1dab573	Cython: Columns: added a keys() method that returns all element names	2020-06-17 18:53:41 +02:00
Celine Mercier	7593673f3f	ngsfilter: now setting 'reversed' tag to False instead of None when false	2020-06-17 18:52:35 +02:00
Celine Mercier	aa01236cae	switch to version 3.0.0b25	2020-06-13 21:48:49 +02:00
Celine Mercier	49b8810a76	C: made indexer opening/closing cleaner	2020-06-13 21:47:03 +02:00
Celine Mercier	7a39df54c0	ls: fixed an issue where big DMS couldn't be read by ls	2020-06-13 21:45:22 +02:00
Celine Mercier	09e483b0d6	switch to temporary version 3.0.0b24a	2020-06-10 17:47:56 +02:00
Celine Mercier	14a2579173	uniq: now outputs an empty view if input view is empty instead of displaying an error	2020-06-10 17:47:26 +02:00
Celine Mercier	36a8aaa92e	grep: now creating empty views instead of displaying an error when selecting on a nonexistent column/tag	2020-06-10 16:57:42 +02:00
Celine Mercier	a17eb445c2	ngsfilter: made one of the tag error messages more accurate	2020-06-10 16:27:36 +02:00
Celine Mercier	e4a32788c2	Switch to version 3.0.0b24	2020-06-09 14:36:58 +02:00
Celine Mercier	2442cc80bf	Cython: View: fixed bash history display	2020-06-09 14:36:37 +02:00
Celine Mercier	aa836b2ace	uniq: improved progress bar of second browsing	2020-06-09 14:36:02 +02:00
Celine Mercier	8776ce22e6	C: fixed a bug where indexers referring to tuples of certain types were not properly closed and imported	2020-06-09 14:34:43 +02:00
Celine Mercier	4aa772c405	ecotag: Added list of taxids for all best matches (closes #80 )	2020-06-09 14:33:14 +02:00
Celine Mercier	b0b96ac37a	version 3.0.0b23a	2020-06-05 16:10:24 +02:00
Celine Mercier	687e42ad22	C: kmer alignment: fixed a bug where scores of 0 were at (0+kmer_length-1) (and now setting alignment direction to None if score is 0)	2020-06-05 16:09:33 +02:00
Celine Mercier	5fbbb6d304	alignpairedend: fixed a bug when rebuilding joined (unaligned) sequences where only the forward sequence was kept	2020-06-05 16:06:43 +02:00
Celine Mercier	359a9fe237	Switch to version 3.0.0b23	2020-06-04 15:35:03 +02:00
Celine Mercier	f9b6851f75	Python: correctly flagged some mandatory options as required	2020-06-04 15:34:24 +02:00
cmercier	29a2652bbf	Fixed installation on Ubuntu without pip	2020-06-04 15:06:35 +02:00
Celine Mercier	2a2c233936	obi import: fixed a bug when skipping an entry	2020-05-29 21:19:42 +02:00