From 9672f01c6a806141add71da8725b1a425c1cbb3d Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Mon, 13 Jul 2020 15:59:50 +0200 Subject: [PATCH] alignpairedend: improved/fixed the output tags for the alignment score and lengths. Removed minimum score option --- python/obitools3/commands/alignpairedend.pyx | 24 ++++++++------------ python/obitools3/libalign/_solexapairend.pyx | 10 ++++---- src/kmer_similarity.c | 5 +++- src/kmer_similarity.h | 6 ++++- 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/python/obitools3/commands/alignpairedend.pyx b/python/obitools3/commands/alignpairedend.pyx index 50d679b..c7fd0d2 100755 --- a/python/obitools3/commands/alignpairedend.pyx +++ b/python/obitools3/commands/alignpairedend.pyx @@ -39,12 +39,13 @@ def addOptions(parser): type=str, help="URI to the reverse reads if they are in a different view than the forward reads") - group.add_argument('--score-min', - action="store", dest="alignpairedend:smin", - metavar="#.###", - default=None, - type=float, - help="Minimum score for keeping alignments") +# group.add_argument('--score-min', +# action="store", dest="alignpairedend:smin", +# metavar="#.###", +# default=None, +# type=float, +# help="Minimum score for keeping alignments. " +# "(for kmer alignment) The score is an approximation of the number of nucleotides matching in the overlap of the alignment.") # group.add_argument('-A', '--true-ali', # action="store_true", dest="alignpairedend:trueali", @@ -174,11 +175,6 @@ def run(config): Column.new_column(view, QUALITY_COLUMN, OBI_QUAL) #TODO output URI quality option? - if 'smin' in config['alignpairedend']: - smin = config['alignpairedend']['smin'] - else: - smin = 0 - # Initialize the progress bar pb = ProgressBar(entries_len, config, seconde=5) @@ -217,7 +213,7 @@ def run(config): else: seqF = forward[i] - if ali.score > smin and ali.consensus_len > 0 : + if ali.overlap_len > 0 : buildConsensus(ali, consensus, seqF) else: if not two_views: @@ -225,9 +221,7 @@ def run(config): else: seqR = reverse[i] buildJoinedSequence(ali, seqR, consensus, forward=seqF) - - consensus[b"smin"] = smin - + if kmer_ali : ali.free() diff --git a/python/obitools3/libalign/_solexapairend.pyx b/python/obitools3/libalign/_solexapairend.pyx index bf391c5..d708408 100755 --- a/python/obitools3/libalign/_solexapairend.pyx +++ b/python/obitools3/libalign/_solexapairend.pyx @@ -183,8 +183,9 @@ def buildConsensus(ali, seq, ref_tags=None): # doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values) #obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len) seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual) - seq[b'ali_length'] = ali.consensus_len - seq[b'score_norm']=float(ali.score)/ali.consensus_len + seq[b"seq_length"] = ali.consensus_len + seq[b"overlap_length"] = ali.overlap_len + seq[b'score_norm']=round(float(ali.score)/ali.overlap_len, 3) seq[b'shift']=ali.shift else: if len(ali[0])>999: # TODO why? @@ -256,9 +257,10 @@ def buildJoinedSequence(ali, reverse, seq, forward=None): seq[b"ali_direction"]=None seq[b"mode"]=b"joined" seq[b"pairedend_limit"]=len(forward) - seq[b"ali_length"] = ali.consensus_len + seq[b"seq_length"] = ali.consensus_len + seq[b"overlap_length"] = ali.overlap_len if ali.consensus_len > 0: - seq[b"score_norm"]=float(ali.score)/ali.consensus_len + seq[b'score_norm']=round(float(ali.score)/ali.overlap_len, 3) else: seq[b"score_norm"]=0.0 diff --git a/src/kmer_similarity.c b/src/kmer_similarity.c index 4b4240f..b9a9950 100755 --- a/src/kmer_similarity.c +++ b/src/kmer_similarity.c @@ -414,7 +414,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 } if (max_common_kmers > 0) - score = max_common_kmers + kmer_size - 1; // aka the number of nucleotides in the longest stretch of kmers perfectly matching + score = max_common_kmers + kmer_size - 1; // aka an approximation of the number of nucleotides matching in the overlap of the alignment. + // It's an approximation because one mismatch produces kmer_size kmer mismatches if in the middle of the overlap, + // and less for mismatches located towards the ends of the overlap. The case where there are the most mismatches is assumed, + // meaning that the score will be often underestimated and never overestimated. else score = 0; abs_shift = abs(best_shift); diff --git a/src/kmer_similarity.h b/src/kmer_similarity.h index ada665f..92f78a1 100755 --- a/src/kmer_similarity.h +++ b/src/kmer_similarity.h @@ -27,7 +27,11 @@ * @brief Alignment structure, with informations about the similarity and to rebuild the alignment. */ typedef struct Obi_ali { - int score; /**< Alignment score, corresponding to the number of nucleotides in the longest stretch of kmers perfectly matching. + int score; /**< Alignment score, corresponding to an approximation of the number of + * nucleotides matching in the overlap of the alignment. + * It's an approximation because one mismatch produces kmer_size kmer mismatches if in the middle of the overlap, + * and less for mismatches located towards the ends of the overlap. The case where there are the most mismatches is assumed, + * meaning that the score will be often underestimated and never overestimated. */ int consensus_length; /**< Length of the final consensus sequence. */