From 9672f01c6a806141add71da8725b1a425c1cbb3d Mon Sep 17 00:00:00 2001
From: Celine Mercier <celine.mercier@metabarcoding.org>
Date: Mon, 13 Jul 2020 15:59:50 +0200
Subject: [PATCH] alignpairedend: improved/fixed the output tags for the
 alignment score and lengths. Removed minimum score option

---
 python/obitools3/commands/alignpairedend.pyx | 24 ++++++++------------
 python/obitools3/libalign/_solexapairend.pyx | 10 ++++----
 src/kmer_similarity.c                        |  5 +++-
 src/kmer_similarity.h                        |  6 ++++-
 4 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/python/obitools3/commands/alignpairedend.pyx b/python/obitools3/commands/alignpairedend.pyx
index 50d679b..c7fd0d2 100755
--- a/python/obitools3/commands/alignpairedend.pyx
+++ b/python/obitools3/commands/alignpairedend.pyx
@@ -39,12 +39,13 @@ def addOptions(parser):
                      type=str,
                      help="URI to the reverse reads if they are in a different view than the forward reads")
 
-    group.add_argument('--score-min',
-                     action="store", dest="alignpairedend:smin",
-                     metavar="#.###",
-                     default=None,
-                     type=float,
-                     help="Minimum score for keeping alignments")
+#     group.add_argument('--score-min',
+#                      action="store", dest="alignpairedend:smin",
+#                      metavar="#.###",
+#                      default=None,
+#                      type=float,
+#                      help="Minimum score for keeping alignments. " 
+#                           "(for kmer alignment) The score is an approximation of the number of nucleotides matching in the overlap of the alignment.")
 
 #     group.add_argument('-A', '--true-ali',
 #                        action="store_true", dest="alignpairedend:trueali",
@@ -174,11 +175,6 @@ def run(config):
     
     Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)   #TODO output URI quality option?
 
-    if 'smin' in config['alignpairedend']:
-        smin = config['alignpairedend']['smin']
-    else:
-        smin = 0
-
     # Initialize the progress bar
     pb = ProgressBar(entries_len, config, seconde=5)
     
@@ -217,7 +213,7 @@ def run(config):
         else:
             seqF = forward[i]
                         
-        if ali.score > smin and ali.consensus_len > 0 :
+        if ali.overlap_len > 0 :
             buildConsensus(ali, consensus, seqF)
         else:
             if not two_views:
@@ -225,9 +221,7 @@ def run(config):
             else:
                 seqR = reverse[i]
             buildJoinedSequence(ali, seqR, consensus, forward=seqF)
-        
-        consensus[b"smin"] = smin
-           
+                   
         if kmer_ali :
             ali.free()
     
diff --git a/python/obitools3/libalign/_solexapairend.pyx b/python/obitools3/libalign/_solexapairend.pyx
index bf391c5..d708408 100755
--- a/python/obitools3/libalign/_solexapairend.pyx
+++ b/python/obitools3/libalign/_solexapairend.pyx
@@ -183,8 +183,9 @@ def buildConsensus(ali, seq, ref_tags=None):
         # doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values)
         #obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len)
         seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual)
-        seq[b'ali_length'] = ali.consensus_len
-        seq[b'score_norm']=float(ali.score)/ali.consensus_len
+        seq[b"seq_length"] = ali.consensus_len
+        seq[b"overlap_length"] = ali.overlap_len
+        seq[b'score_norm']=round(float(ali.score)/ali.overlap_len, 3)
         seq[b'shift']=ali.shift
     else:
         if len(ali[0])>999:   # TODO why?
@@ -256,9 +257,10 @@ def buildJoinedSequence(ali, reverse, seq, forward=None):
         seq[b"ali_direction"]=None
     seq[b"mode"]=b"joined"
     seq[b"pairedend_limit"]=len(forward)
-    seq[b"ali_length"] = ali.consensus_len
+    seq[b"seq_length"] = ali.consensus_len
+    seq[b"overlap_length"] = ali.overlap_len
     if ali.consensus_len > 0:
-        seq[b"score_norm"]=float(ali.score)/ali.consensus_len
+        seq[b'score_norm']=round(float(ali.score)/ali.overlap_len, 3)
     else:
         seq[b"score_norm"]=0.0
   
diff --git a/src/kmer_similarity.c b/src/kmer_similarity.c
index 4b4240f..b9a9950 100755
--- a/src/kmer_similarity.c
+++ b/src/kmer_similarity.c
@@ -414,7 +414,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 	}
 
 	if (max_common_kmers > 0)
-		score = max_common_kmers + kmer_size - 1;  // aka the number of nucleotides in the longest stretch of kmers perfectly matching
+		score = max_common_kmers + kmer_size - 1; // aka an approximation of the number of nucleotides matching in the overlap of the alignment.
+												  // It's an approximation because one mismatch produces kmer_size kmer mismatches if in the middle of the overlap,
+												  // and less for mismatches located towards the ends of the overlap. The case where there are the most mismatches is assumed,
+												  // meaning that the score will be often underestimated and never overestimated.
 	else
 		score = 0;
 	abs_shift = abs(best_shift);
diff --git a/src/kmer_similarity.h b/src/kmer_similarity.h
index ada665f..92f78a1 100755
--- a/src/kmer_similarity.h
+++ b/src/kmer_similarity.h
@@ -27,7 +27,11 @@
  * @brief Alignment structure, with informations about the similarity and to rebuild the alignment.
  */
 typedef struct Obi_ali {
-	int      score;	    		/**< Alignment score, corresponding to the number of nucleotides in the longest stretch of kmers perfectly matching.
+	int      score;	    		/**< Alignment score, corresponding to an approximation of the number of
+								 *   nucleotides matching in the overlap of the alignment.
+								 *   It's an approximation because one mismatch produces kmer_size kmer mismatches if in the middle of the overlap,
+							     *   and less for mismatches located towards the ends of the overlap. The case where there are the most mismatches is assumed,
+							     *   meaning that the score will be often underestimated and never overestimated.
 	 	 	 	 	 	 	 	 */
 	int      consensus_length; 	/**< Length of the final consensus sequence.
 	 	 	 	 	 	 	 	 */