alignpairedend: improved/fixed the output tags for the alignment score

and lengths. Removed minimum score option
2020-07-13 15:59:50 +02:00
parent ed9549acfb
commit 9672f01c6a
4 changed files with 24 additions and 21 deletions
--- a/python/obitools3/commands/alignpairedend.pyx
+++ b/python/obitools3/commands/alignpairedend.pyx
@ -39,12 +39,13 @@ def addOptions(parser):
                     type=str,
                     help="URI to the reverse reads if they are in a different view than the forward reads")

-    group.add_argument('--score-min',
-                     action="store", dest="alignpairedend:smin",
-                     metavar="#.###",
-                     default=None,
-                     type=float,
-                     help="Minimum score for keeping alignments")
+#     group.add_argument('--score-min',
+#                      action="store", dest="alignpairedend:smin",
+#                      metavar="#.###",
+#                      default=None,
+#                      type=float,
+#                      help="Minimum score for keeping alignments. " 
+#                           "(for kmer alignment) The score is an approximation of the number of nucleotides matching in the overlap of the alignment.")

 #     group.add_argument('-A', '--true-ali',
 #                        action="store_true", dest="alignpairedend:trueali",
@ -174,11 +175,6 @@ def run(config):
    
    Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)   #TODO output URI quality option?

-    if 'smin' in config['alignpairedend']:
-        smin = config['alignpairedend']['smin']
-    else:
-        smin = 0
-
    # Initialize the progress bar
    pb = ProgressBar(entries_len, config, seconde=5)
    
@ -217,7 +213,7 @@ def run(config):
        else:
            seqF = forward[i]
                        
-        if ali.score > smin and ali.consensus_len > 0 :
+        if ali.overlap_len > 0 :
            buildConsensus(ali, consensus, seqF)
        else:
            if not two_views:
@ -225,9 +221,7 @@ def run(config):
            else:
                seqR = reverse[i]
            buildJoinedSequence(ali, seqR, consensus, forward=seqF)
-        
-        consensus[b"smin"] = smin
-           
+                   
        if kmer_ali :
            ali.free()
    
--- a/python/obitools3/libalign/_solexapairend.pyx
+++ b/python/obitools3/libalign/_solexapairend.pyx
@ -183,8 +183,9 @@ def buildConsensus(ali, seq, ref_tags=None):
        # doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values)
        #obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len)
        seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual)
-        seq[b'ali_length'] = ali.consensus_len
-        seq[b'score_norm']=float(ali.score)/ali.consensus_len
+        seq[b"seq_length"] = ali.consensus_len
+        seq[b"overlap_length"] = ali.overlap_len
+        seq[b'score_norm']=round(float(ali.score)/ali.overlap_len, 3)
        seq[b'shift']=ali.shift
    else:
        if len(ali[0])>999:   # TODO why?
@ -256,9 +257,10 @@ def buildJoinedSequence(ali, reverse, seq, forward=None):
        seq[b"ali_direction"]=None
    seq[b"mode"]=b"joined"
    seq[b"pairedend_limit"]=len(forward)
-    seq[b"ali_length"] = ali.consensus_len
+    seq[b"seq_length"] = ali.consensus_len
+    seq[b"overlap_length"] = ali.overlap_len
    if ali.consensus_len > 0:
-        seq[b"score_norm"]=float(ali.score)/ali.consensus_len
+        seq[b'score_norm']=round(float(ali.score)/ali.overlap_len, 3)
    else:
        seq[b"score_norm"]=0.0
  
--- a/src/kmer_similarity.c
+++ b/src/kmer_similarity.c
@ -414,7 +414,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 	}

 	if (max_common_kmers > 0)
-		score = max_common_kmers + kmer_size - 1;  // aka the number of nucleotides in the longest stretch of kmers perfectly matching
+		score = max_common_kmers + kmer_size - 1; // aka an approximation of the number of nucleotides matching in the overlap of the alignment.
+												  // It's an approximation because one mismatch produces kmer_size kmer mismatches if in the middle of the overlap,
+												  // and less for mismatches located towards the ends of the overlap. The case where there are the most mismatches is assumed,
+												  // meaning that the score will be often underestimated and never overestimated.
 	else
 		score = 0;
 	abs_shift = abs(best_shift);
--- a/src/kmer_similarity.h
+++ b/src/kmer_similarity.h
@ -27,7 +27,11 @@
 * @brief Alignment structure, with informations about the similarity and to rebuild the alignment.
 */
 typedef struct Obi_ali {
-	int      score;	    		/**< Alignment score, corresponding to the number of nucleotides in the longest stretch of kmers perfectly matching.
+	int      score;	    		/**< Alignment score, corresponding to an approximation of the number of
+								 *   nucleotides matching in the overlap of the alignment.
+								 *   It's an approximation because one mismatch produces kmer_size kmer mismatches if in the middle of the overlap,
+							     *   and less for mismatches located towards the ends of the overlap. The case where there are the most mismatches is assumed,
+							     *   meaning that the score will be often underestimated and never overestimated.
 	 	 	 	 	 	 	 	 */
 	int      consensus_length; 	/**< Length of the final consensus sequence.
 	 	 	 	 	 	 	 	 */