ecotag: added separate threshold for minimum circle identity (and switch

to version 3.0.0b35
obi cat: fixed open file descriptor leak and switch to version 3.0.0b34
2020-09-25 16:22:09 +02:00 · 2020-08-28 10:41:22 +02:00
6 changed files with 48 additions and 20 deletions
--- a/python/obitools3/commands/cat.pyx
+++ b/python/obitools3/commands/cat.pyx
@ -48,9 +48,9 @@ def run(config):
    
    logger("info", "obi cat")

-    # Open the views to concatenate
-    iview_list = []
+    # Check the views to concatenate
    idms_list = []
+    iview_list = []
    total_len = 0
    remove_qual = False
    remove_rev_qual = False
@ -68,8 +68,9 @@ def run(config):
        if REVERSE_QUALITY_COLUMN not in i_view: # same as above for reverse quality
            remove_rev_qual = True
        total_len += len(i_view)
-        iview_list.append(i_view)
        idms_list.append(i_dms)
+        iview_list.append(i_view.name)
+        i_view.close()

    # Open the output: only the DMS
    output = open_uri(config['obi']['outputURI'],
@ -98,8 +99,10 @@ def run(config):
    # Initialize multiple elements columns
    if type(output_0)==BufferedWriter: 
        dict_cols = {}
-        for v in iview_list:
+        for v_uri in config["cat"]["views_to_cat"]:
+            v = open_uri(v_uri)[1]
            for coln in v.keys():
+                col = v[coln]
                if v[coln].nb_elements_per_line > 1:
                    if coln not in dict_cols:
                        dict_cols[coln] = {}
@ -109,6 +112,7 @@ def run(config):
                    else:
                        dict_cols[coln]['eltnames'] = set(v[coln].elements_names + list(dict_cols[coln]['eltnames']))
                        dict_cols[coln]['nbelts'] = len(dict_cols[coln]['eltnames'])
+            v.close()            
        for coln in dict_cols:
            Column.new_column(o_view, coln, dict_cols[coln]['obitype'], 
                              nb_elements_per_line=dict_cols[coln]['nbelts'], elements_names=list(dict_cols[coln]['eltnames']))
@ -120,7 +124,8 @@ def run(config):
        pb = None
    
    i = 0
-    for v in iview_list:
+    for v_uri in config["cat"]["views_to_cat"]:
+        v = open_uri(v_uri)[1]
        for entry in v:
            PyErr_CheckSignals()
            if pb is not None:
@ -131,6 +136,7 @@ def run(config):
            else:
                o_view[i] = entry
            i+=1
+        v.close()

    # Deletes quality columns if needed
    if type(output_0)!=BufferedWriter: 
@ -145,7 +151,7 @@ def run(config):
    
    # Save command config in DMS comments
    command_line = " ".join(sys.argv[1:])
-    o_view.write_config(config, "cat", command_line, input_dms_name=[d.name for d in idms_list], input_view_name=[v.name for v in iview_list])
+    o_view.write_config(config, "cat", command_line, input_dms_name=[d.name for d in idms_list], input_view_name=[vname for vname in iview_list])
    o_dms.record_command_line(command_line)

    #print("\n\nOutput view:\n````````````", file=sys.stderr)
--- a/python/obitools3/commands/ecotag.pyx
+++ b/python/obitools3/commands/ecotag.pyx
@ -41,6 +41,17 @@ def addOptions(parser):
                      help="Minimum identity to consider for assignment, as a normalized identity, e.g. 0.95 for an identity of 95%%. "
                           "Default: 0.00 (no threshold).")

+    group.add_argument('--minimum-circle','-c',
+                      action="store", dest="ecotag:bubble_threshold",
+                      metavar='<CIRCLE_THRESHOLD>',
+                      default=0.99,
+                      type=float,
+                      help="Minimum identity considered for the assignment circle "
+                           "(sequence is assigned to the LCA of all sequences within a similarity circle of the best matches; "
+                           "the threshold for this circle is the highest value between <CIRCLE_THRESHOLD> and the best assignment score found). "
+                           "Give value as a normalized identity, e.g. 0.95 for an identity of 95%%. "
+                           "Default: 0.99.")
+
 def run(config):
        
    DMS.obi_atexit()
@ -66,9 +77,8 @@ def run(config):
    ref_view_name = ref[1]

    # Check that the threshold demanded is greater than or equal to the threshold used to build the reference database
-    if config['ecotag']['threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
-        print("Error: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).", 
-              config['ecotag']['threshold'], ref_dms[ref_view_name].comments["ref_db_threshold"])
+    if config['ecotag']['bubble_threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
+        raise Exception(f"Error: The threshold demanded ({config['ecotag']['bubble_threshold']}) is lower than the threshold used to build the reference database ({float(ref_dms[ref_view_name].comments['ref_db_threshold'])}).")
    
    # Open the output: only the DMS
    output = open_uri(config['obi']['outputURI'],
@ -113,8 +123,9 @@ def run(config):
    if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
                  ref_dms.name_with_full_path, tobytes(ref_view_name), \
                  taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
-                  tobytes(o_view_name), comments, 
-                  config['ecotag']['threshold']) < 0:
+                  tobytes(o_view_name), comments, \
+                  config['ecotag']['threshold'], \
+                  config['ecotag']['bubble_threshold']) < 0:
        raise Exception("Error running ecotag")
    
    # If the input and output DMS are not the same, export result view to output DMS
--- a/python/obitools3/dms/capi/obiecotag.pxd
+++ b/python/obitools3/dms/capi/obiecotag.pxd
@ -11,4 +11,5 @@ cdef extern from "obi_ecotag.h" nogil:
                   const char* taxonomy_name,
                   const char* output_view_name,
                   const char* output_view_comments,
-                   double ecotag_threshold)
+                   double ecotag_threshold,
+                   double bubble_threshold)
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '0b33'
+serial= '0b35'

 version ="%d.%d.%s" % (major,minor,serial)
--- a/src/obi_ecotag.c
+++ b/src/obi_ecotag.c
@ -218,7 +218,8 @@ int obi_ecotag(const char* dms_name,
 			   const char* taxonomy_name,
 			   const char* output_view_name,
 			   const char* output_view_comments,
-			   double ecotag_threshold)    // TODO different threshold for the similarity sphere around ref seqs
+			   double ecotag_threshold,
+			   double bubble_threshold)
 {

 	// For each sequence
@ -239,6 +240,7 @@ int obi_ecotag(const char* dms_name,
 	index_t         query_seq_idx, ref_seq_idx;
 	double          score, best_score;
 	double			threshold;
+	double			lca_threshold;
 	int             lcs_length;
 	int             ali_length;
 	Kmer_table_p    ktable;
@ -389,10 +391,10 @@ int obi_ecotag(const char* dms_name,
 		return -1;
 	}
 	free(db_threshold_str);
-	if (ecotag_threshold < db_threshold)
+	if (bubble_threshold < db_threshold)
 	{
 		fprintf(stderr, "\nError: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).\n\n",
-				ecotag_threshold, db_threshold);
+				bubble_threshold, db_threshold);
 		return -1;
 	}

@ -597,11 +599,16 @@ int obi_ecotag(const char* dms_name,
 		{
 			best_match_idx = best_match_array[j];

-			// Find the LCA for the chosen threshold
+			// Find the LCA for the highest threshold between best_score and the chosen bubble threshold
 			score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);

+			if (bubble_threshold < best_score)
+				lca_threshold = best_score;
+			else
+				lca_threshold = bubble_threshold;
+
 			k = 0;
-			while ((k < lca_array_length) && (score_array[k] >= best_score))
+			while ((k < lca_array_length) && (score_array[k] >= lca_threshold))
 				k++;

 			if (k>0)
--- a/src/obi_ecotag.h
+++ b/src/obi_ecotag.h
@ -42,12 +42,14 @@
 * @param output_view_name The name to give to the output view.
 * @param output_view_comments The comments to associate to the output view.
 * @param ecotag_threshold The threshold at which to assign.
+ * @param bubble_threshold The threshold at which to look for an LCA (i.e. minimum identity considered for the assignment circle);
+ *                         the threshold actually used will be the highest between this value and the best assignment score found.
 *
 * 	The algorithm works like this:
 * 		For each query sequence:
 *			Align with reference database
 *			Keep the indices of all the best matches
- *			For each kept index, get the LCA at that threshold as stored in the reference database, then the LCA of those LCAs
+ *			For each kept index, get the LCA at the highest threshold between bubble_threshold and the best assignment score found (as stored in the reference database), then the LCA of those LCAs
 *			Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches)
 *
 * @returns A value indicating the success of the operation.
@ -65,7 +67,8 @@ int obi_ecotag(const char* dms_name,
 			   const char* taxonomy_name,
 			   const char* output_view_name,
 			   const char* output_view_comments,
-			   double ecotag_threshold);
+			   double ecotag_threshold,
+			   double bubble_threshold);


 #endif /* OBI_ECOTAG_H_ */
Author	SHA1	Message	Date
Celine Mercier	c4696ac865	ecotag: added separate threshold for minimum circle identity (and switch to version 3.0.0b35	2020-09-25 16:22:09 +02:00
Celine Mercier	11a0945a9b	obi cat: fixed open file descriptor leak and switch to version 3.0.0b34	2020-08-28 10:41:22 +02:00