ecotag: added separate threshold for minimum circle identity (and switch

to version 3.0.0b35
obi cat: fixed open file descriptor leak and switch to version 3.0.0b34
2020-09-25 16:22:09 +02:00 · 2020-08-28 10:41:22 +02:00 · 2020-08-27 18:38:16 +02:00 · 2020-08-13 18:17:09 +02:00 · 2020-08-12 17:55:08 +02:00 · 2020-07-31 16:43:35 +02:00
15 changed files with 118 additions and 51 deletions
--- a/python/obitools3/commands/cat.pyx
+++ b/python/obitools3/commands/cat.pyx
@ -4,7 +4,7 @@ from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.dms import DMS
 from obitools3.dms.view.view cimport View
 from obitools3.uri.decode import open_uri
-from obitools3.apps.optiongroups import addMinimalOutputOption
+from obitools3.apps.optiongroups import addMinimalOutputOption, addNoProgressBarOption
 from obitools3.dms.view import RollbackException
 from obitools3.apps.config import logger
 from obitools3.utils cimport str2bytes
@ -28,6 +28,7 @@ __title__="Concatenate views."
 def addOptions(parser):
    
    addMinimalOutputOption(parser)
+    addNoProgressBarOption(parser)

    group=parser.add_argument_group('obi cat specific options')

@ -47,9 +48,9 @@ def run(config):
    
    logger("info", "obi cat")

-    # Open the views to concatenate
-    iview_list = []
+    # Check the views to concatenate
    idms_list = []
+    iview_list = []
    total_len = 0
    remove_qual = False
    remove_rev_qual = False
@ -67,8 +68,9 @@ def run(config):
        if REVERSE_QUALITY_COLUMN not in i_view: # same as above for reverse quality
            remove_rev_qual = True
        total_len += len(i_view)
-        iview_list.append(i_view)
        idms_list.append(i_dms)
+        iview_list.append(i_view.name)
+        i_view.close()

    # Open the output: only the DMS
    output = open_uri(config['obi']['outputURI'],
@ -97,8 +99,10 @@ def run(config):
    # Initialize multiple elements columns
    if type(output_0)==BufferedWriter: 
        dict_cols = {}
-        for v in iview_list:
+        for v_uri in config["cat"]["views_to_cat"]:
+            v = open_uri(v_uri)[1]
            for coln in v.keys():
+                col = v[coln]
                if v[coln].nb_elements_per_line > 1:
                    if coln not in dict_cols:
                        dict_cols[coln] = {}
@ -108,6 +112,7 @@ def run(config):
                    else:
                        dict_cols[coln]['eltnames'] = set(v[coln].elements_names + list(dict_cols[coln]['eltnames']))
                        dict_cols[coln]['nbelts'] = len(dict_cols[coln]['eltnames'])
+            v.close()            
        for coln in dict_cols:
            Column.new_column(o_view, coln, dict_cols[coln]['obitype'], 
                              nb_elements_per_line=dict_cols[coln]['nbelts'], elements_names=list(dict_cols[coln]['eltnames']))
@ -119,7 +124,8 @@ def run(config):
        pb = None
    
    i = 0
-    for v in iview_list:
+    for v_uri in config["cat"]["views_to_cat"]:
+        v = open_uri(v_uri)[1]
        for entry in v:
            PyErr_CheckSignals()
            if pb is not None:
@ -130,6 +136,7 @@ def run(config):
            else:
                o_view[i] = entry
            i+=1
+        v.close()

    # Deletes quality columns if needed
    if type(output_0)!=BufferedWriter: 
@ -144,7 +151,7 @@ def run(config):
    
    # Save command config in DMS comments
    command_line = " ".join(sys.argv[1:])
-    o_view.write_config(config, "cat", command_line, input_dms_name=[d.name for d in idms_list], input_view_name=[v.name for v in iview_list])
+    o_view.write_config(config, "cat", command_line, input_dms_name=[d.name for d in idms_list], input_view_name=[vname for vname in iview_list])
    o_dms.record_command_line(command_line)

    #print("\n\nOutput view:\n````````````", file=sys.stderr)
--- a/python/obitools3/commands/ecotag.pyx
+++ b/python/obitools3/commands/ecotag.pyx
@ -41,6 +41,17 @@ def addOptions(parser):
                      help="Minimum identity to consider for assignment, as a normalized identity, e.g. 0.95 for an identity of 95%%. "
                           "Default: 0.00 (no threshold).")

+    group.add_argument('--minimum-circle','-c',
+                      action="store", dest="ecotag:bubble_threshold",
+                      metavar='<CIRCLE_THRESHOLD>',
+                      default=0.99,
+                      type=float,
+                      help="Minimum identity considered for the assignment circle "
+                           "(sequence is assigned to the LCA of all sequences within a similarity circle of the best matches; "
+                           "the threshold for this circle is the highest value between <CIRCLE_THRESHOLD> and the best assignment score found). "
+                           "Give value as a normalized identity, e.g. 0.95 for an identity of 95%%. "
+                           "Default: 0.99.")
+
 def run(config):
        
    DMS.obi_atexit()
@ -66,9 +77,8 @@ def run(config):
    ref_view_name = ref[1]

    # Check that the threshold demanded is greater than or equal to the threshold used to build the reference database
-    if config['ecotag']['threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
-        print("Error: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).", 
-              config['ecotag']['threshold'], ref_dms[ref_view_name].comments["ref_db_threshold"])
+    if config['ecotag']['bubble_threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
+        raise Exception(f"Error: The threshold demanded ({config['ecotag']['bubble_threshold']}) is lower than the threshold used to build the reference database ({float(ref_dms[ref_view_name].comments['ref_db_threshold'])}).")
    
    # Open the output: only the DMS
    output = open_uri(config['obi']['outputURI'],
@ -113,8 +123,9 @@ def run(config):
    if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
                  ref_dms.name_with_full_path, tobytes(ref_view_name), \
                  taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
-                  tobytes(o_view_name), comments, 
-                  config['ecotag']['threshold']) < 0:
+                  tobytes(o_view_name), comments, \
+                  config['ecotag']['threshold'], \
+                  config['ecotag']['bubble_threshold']) < 0:
        raise Exception("Error running ecotag")
    
    # If the input and output DMS are not the same, export result view to output DMS
--- a/python/obitools3/commands/export.pyx
+++ b/python/obitools3/commands/export.pyx
@ -89,7 +89,7 @@ def run(config):

    if pb is not None:
        pb(i, force=True)
-    print("", file=sys.stderr)
+        print("", file=sys.stderr)

    # TODO save command in input dms?
    
--- a/python/obitools3/commands/test.pyx
+++ b/python/obitools3/commands/test.pyx
@ -23,6 +23,7 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
 import shutil
 import string
 import random
+import sys
 from cpython.exc cimport PyErr_CheckSignals


@ -366,7 +367,7 @@ def random_new_view(config, infos, first=False):
            infos['view'] = View_NUC_SEQS.new(infos['dms'], random_unique_name(infos), comments=random_comments(config))   # TODO quality column
        else :
            infos['view'] = View.new(infos['dms'], random_unique_name(infos), comments=random_comments(config))   # TODO quality column
-        
+        infos['view'].write_config(config, "test", infos["command_line"], input_dms_name=[infos['dms'].name], input_view_name=["random"])
    print_test(config, repr(infos['view']))
    if v_to_clone is not None :
        if line_selection is None:
@ -497,7 +498,8 @@ def run(config):
                                    (b"OBI_SEQ", False): random_seq, (b"OBI_SEQ", True): random_seq_tuples,
                                    (b"OBI_STR", False): random_bytes, (b"OBI_STR", True): random_bytes_tuples
                                  },
-             'tests': [test_set_and_get, test_add_col, test_delete_col, test_col_alias, test_new_view]
+             'tests': [test_set_and_get, test_add_col, test_delete_col, test_col_alias, test_new_view],
+             'command_line': " ".join(sys.argv[1:])
            }
 
    # TODO ???
--- a/python/obitools3/commands/uniq.pyx
+++ b/python/obitools3/commands/uniq.pyx
@ -354,6 +354,9 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
        key = mergedKeys[k]
        merged_col_name = mergedKeys_m[k]
        
+        if merged_infos[merged_col_name]['nb_elts'] == 1:
+            raise Exception("Can't merge information from a tag with only one element (e.g. one sample ; don't use -m option)")
+        
        if merged_col_name in view:
            i_col = view[merged_col_name]
        else:
--- a/python/obitools3/dms/capi/obiecotag.pxd
+++ b/python/obitools3/dms/capi/obiecotag.pxd
@ -11,4 +11,5 @@ cdef extern from "obi_ecotag.h" nogil:
                   const char* taxonomy_name,
                   const char* output_view_name,
                   const char* output_view_comments,
-                   double ecotag_threshold)
+                   double ecotag_threshold,
+                   double bubble_threshold)
--- a/python/obitools3/dms/view/view.pyx
+++ b/python/obitools3/dms/view/view.pyx
@ -7,6 +7,7 @@ cdef dict __VIEW_CLASS__= {}
 from libc.stdlib  cimport malloc

 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.version import version

 from ..capi.obiview cimport Alias_column_pair_p, \
                            obi_new_view, \
@ -184,8 +185,14 @@ cdef class View(OBIWrapper) :
    
    @OBIWrapper.checkIsActive
    def __repr__(self) :
-        cdef str s = "#View name:\n{name:s}\n#Line count:\n{line_count:d}\n#Columns:\n".format(name       = bytes2str(self.name),
-                                                                                                                         line_count = self.line_count)
+        cdef str s
+        if self.read_only:   # can read date   
+            s = "#View name:\n{name:s}\n#Date created:\n{date:s}\n#Line count:\n{line_count:d}\n#Columns:\n".format(name       = bytes2str(self.name),
+                                                                                                                    line_count = self.line_count,
+                                                                                                                    date       = str(bytes2str_object(self.comments["Date created"])))        
+        else:
+            s = "#View name:\n{name:s}\n#Line count:\n{line_count:d}\n#Columns:\n".format(name       = bytes2str(self.name),
+                                                                                          line_count = self.line_count)
        for column_name in self.keys() :
            s = s + repr(self[column_name]) + '\n'
        return s
@ -434,6 +441,7 @@ cdef class View(OBIWrapper) :
            for i in range(len(input_view_name)):
                input_str.append(tostr(input_dms_name[i])+"/"+tostr(input_view_name[i]))
        comments["input_str"] = input_str
+        comments["version"] = version
        return bytes2str_object(comments)

    
--- a/python/obitools3/format/tab.pyx
+++ b/python/obitools3/format/tab.pyx
@ -5,6 +5,7 @@ from obitools3.dms.view.view cimport Line
 from obitools3.utils cimport bytes2str_object, str2bytes, tobytes
 from obitools3.dms.column.column cimport Column_line, Column_multi_elts

+import sys

 cdef class TabFormat:
    
@ -26,18 +27,22 @@ cdef class TabFormat:
            
            if self.header and self.first_line:
                if isinstance(data.view[k], Column_multi_elts):
-                    for k2 in data.view[k].keys():
+                    keys = data.view[k].keys()
+                    keys.sort()
+                    for k2 in keys:
                        line.append(tobytes(k)+b':'+tobytes(k2))
                else:
                    line.append(tobytes(k))
            else:
                value = data[k]
                if isinstance(data.view[k], Column_multi_elts):
+                    keys = data.view[k].keys()
+                    keys.sort()
                    if value is None:  # all keys at None
-                        for k2 in data.view[k].keys(): # TODO could be much more efficient
+                        for k2 in keys: # TODO could be much more efficient
                            line.append(self.NAString)
                    else:
-                        for k2 in data.view[k].keys(): # TODO could be much more efficient
+                        for k2 in keys: # TODO could be much more efficient
                            if value[k2] is not None:
                                line.append(str2bytes(str(bytes2str_object(value[k2]))))  # genius programming
                            else:
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@ -276,9 +276,9 @@ def open_uri(uri,
            iseq = urib
            objclass = bytes
    else:  # TODO update uopen to be able to write? 
-        if urip.path == b'-':
+        if not urip.path or urip.path == b'-':
            file = sys.stdout.buffer
-        elif urip.path :
+        else:
            file = open(urip.path, 'wb')

    if file is not None:
--- a/python/obitools3/utils.pxd
+++ b/python/obitools3/utils.pxd
@ -2,7 +2,7 @@

 from obitools3.dms.capi.obitypes cimport obitype_t, index_t

-cpdef bytes format_separator(bytes format)
+cpdef bytes format_uniq_pattern(bytes format)
 cpdef int count_entries(file, bytes format)

 cdef obi_errno_to_exception(index_t line_nb=*, object elt_id=*, str error_message=*)
--- a/python/obitools3/utils.pyx
+++ b/python/obitools3/utils.pyx
@ -24,11 +24,11 @@ import glob
 import gzip


-cpdef bytes format_separator(bytes format):
+cpdef bytes format_uniq_pattern(bytes format):
    if format == b"fasta":
        return b"\n>"
    elif format == b"fastq":
-        return b"\n@"
+        return b"\n\+\n"
    elif format == b"ngsfilter" or format == b"tabular":
        return b"\n"
    elif format == b"genbank" or format == b"embl":
@ -42,7 +42,7 @@ cpdef bytes format_separator(bytes format):
 cpdef int count_entries(file, bytes format):
    
    try:
-        sep = format_separator(format)
+        sep = format_uniq_pattern(format)
        if sep is None:
            return -1
        sep = re.compile(sep)
@ -72,7 +72,7 @@ cpdef int count_entries(file, bytes format):
                return -1
            mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            total_count += len(re.findall(sep, mmapped_file))
-            if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank":
+            if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank" and format != b"fastq":
                total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n)
            
    except:
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '0b30'
+serial= '0b35'

 version ="%d.%d.%s" % (major,minor,serial)
--- a/src/obi_ecotag.c
+++ b/src/obi_ecotag.c
@ -218,7 +218,8 @@ int obi_ecotag(const char* dms_name,
 			   const char* taxonomy_name,
 			   const char* output_view_name,
 			   const char* output_view_comments,
-			   double ecotag_threshold)    // TODO different threshold for the similarity sphere around ref seqs
+			   double ecotag_threshold,
+			   double bubble_threshold)
 {

 	// For each sequence
@ -239,6 +240,7 @@ int obi_ecotag(const char* dms_name,
 	index_t         query_seq_idx, ref_seq_idx;
 	double          score, best_score;
 	double			threshold;
+	double			lca_threshold;
 	int             lcs_length;
 	int             ali_length;
 	Kmer_table_p    ktable;
@ -389,10 +391,10 @@ int obi_ecotag(const char* dms_name,
 		return -1;
 	}
 	free(db_threshold_str);
-	if (ecotag_threshold < db_threshold)
+	if (bubble_threshold < db_threshold)
 	{
 		fprintf(stderr, "\nError: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).\n\n",
-				ecotag_threshold, db_threshold);
+				bubble_threshold, db_threshold);
 		return -1;
 	}

@ -597,11 +599,16 @@ int obi_ecotag(const char* dms_name,
 		{
 			best_match_idx = best_match_array[j];

-			// Find the LCA for the chosen threshold
+			// Find the LCA for the highest threshold between best_score and the chosen bubble threshold
 			score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);

+			if (bubble_threshold < best_score)
+				lca_threshold = best_score;
+			else
+				lca_threshold = bubble_threshold;
+
 			k = 0;
-			while ((k < lca_array_length) && (score_array[k] >= best_score))
+			while ((k < lca_array_length) && (score_array[k] >= lca_threshold))
 				k++;

 			if (k>0)
--- a/src/obi_ecotag.h
+++ b/src/obi_ecotag.h
@ -42,12 +42,14 @@
 * @param output_view_name The name to give to the output view.
 * @param output_view_comments The comments to associate to the output view.
 * @param ecotag_threshold The threshold at which to assign.
+ * @param bubble_threshold The threshold at which to look for an LCA (i.e. minimum identity considered for the assignment circle);
+ *                         the threshold actually used will be the highest between this value and the best assignment score found.
 *
 * 	The algorithm works like this:
 * 		For each query sequence:
 *			Align with reference database
 *			Keep the indices of all the best matches
- *			For each kept index, get the LCA at that threshold as stored in the reference database, then the LCA of those LCAs
+ *			For each kept index, get the LCA at the highest threshold between bubble_threshold and the best assignment score found (as stored in the reference database), then the LCA of those LCAs
 *			Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches)
 *
 * @returns A value indicating the success of the operation.
@ -65,7 +67,8 @@ int obi_ecotag(const char* dms_name,
 			   const char* taxonomy_name,
 			   const char* output_view_name,
 			   const char* output_view_comments,
-			   double ecotag_threshold);
+			   double ecotag_threshold,
+			   double bubble_threshold);


 #endif /* OBI_ECOTAG_H_ */
--- a/src/obidmscolumn.c
+++ b/src/obidmscolumn.c
@ -1725,16 +1725,32 @@ int obi_close_column(OBIDMS_column_p column)
 int obi_clone_column_indexer(OBIDMS_column_p column)
 {
 	char* new_indexer_name;
+	int i;

-	new_indexer_name = obi_build_indexer_name((column->header)->name, (column->header)->version);
-	if (new_indexer_name == NULL)
-		return -1;
-
-	column->indexer = obi_clone_indexer(column->indexer, new_indexer_name);	// TODO Need to lock this somehow?
-	if (column->indexer == NULL)
+	i=0;
+	while (true) // find avl name not already used
 	{
-		obidebug(1, "\nError cloning a column's indexer to make it writable");
-		return -1;
+		new_indexer_name = obi_build_indexer_name((column->header)->name, ((column->header)->version)+i);
+		if (new_indexer_name == NULL)
+			return -1;
+
+		column->indexer = obi_clone_indexer(column->indexer, new_indexer_name);	// TODO Need to lock this somehow?
+		if (column->indexer == NULL)
+		{
+			if (errno == EEXIST)
+			{
+				free(new_indexer_name);
+				i++;
+			}
+			else
+			{
+				free(new_indexer_name);
+				obidebug(1, "\nError cloning a column's indexer to make it writable");
+				return -1;
+			}
+		}
+		else
+			break;
 	}

 	strcpy((column->header)->indexer_name, new_indexer_name);
@ -2415,16 +2431,20 @@ char* obi_get_formatted_elements_names(OBIDMS_column_p column)
 }


-char* obi_column_formatted_infos(OBIDMS_column_p column)
+char* obi_column_formatted_infos(OBIDMS_column_p column, bool detailed)
 {
-	char* column_infos;
-	char* elt_names;
-
-	column_infos = malloc(1024 * sizeof(char));
+	char* column_infos = NULL;
+	char* elt_names = NULL;
+	char* column_name = NULL;
+	 // should be in view.c because alias exists in the context of view
+	column_infos = malloc(2048 * sizeof(char)); // TODO

 	elt_names = obi_get_formatted_elements_names(column);


+//  "column_name, data type: OBI_TYPE, element names: [formatted element names](, all comments)"
+
+
 	free(elt_names);
 	return column_infos;
 }
Author	SHA1	Message	Date
Celine Mercier	c4696ac865	ecotag: added separate threshold for minimum circle identity (and switch to version 3.0.0b35	2020-09-25 16:22:09 +02:00
Celine Mercier	11a0945a9b	obi cat: fixed open file descriptor leak and switch to version 3.0.0b34	2020-08-28 10:41:22 +02:00
Celine Mercier	f23c40c905	obi cat: fixed a bug introduced in 3.0.0b28 and switch to version 3.0.0b33	2020-08-27 18:38:16 +02:00
Celine Mercier	f99fc13b75	switch to version 3.0.0b32	2020-08-13 18:17:09 +02:00
Celine Mercier	1da6aac1b8	C: patch for failed creation of AVL with errno EEXIST	2020-08-12 17:55:08 +02:00
Celine Mercier	159803b40a	export: now automatically sorts dictionary keys alphabetically for tab/csv output	2020-07-31 16:43:35 +02:00
Celine Mercier	7dcbc34017	import: fixed entry count estimation when importing fastq files	2020-07-30 16:56:36 +02:00
Celine Mercier	db2202c8b4	uniq: added a check to make sure that there is more than one element for one tag when merging its information	2020-07-30 16:14:37 +02:00
Celine Mercier	d33ff97846	switch to version 3.0.0b31	2020-07-28 09:31:19 +02:00
Celine Mercier	1dcdf69f1f	export: fixed a bug introduced in version 3.0.0b28	2020-07-28 09:31:05 +02:00
Celine Mercier	dec114eed6	Python: added "date created" information in view representation	2020-07-27 17:38:45 +02:00
Celine Mercier	f36691053b	Python: added the OBITools3 version that generated the view in view comments	2020-07-27 16:50:00 +02:00