alignpairedend: improved/fixed the output tags for the alignment score

and lengths. Removed minimum score option
This commit is contained in:
Celine Mercier
2020-07-13 15:59:50 +02:00
parent ed9549acfb
commit 9672f01c6a
4 changed files with 24 additions and 21 deletions

View File

@ -39,12 +39,13 @@ def addOptions(parser):
type=str,
help="URI to the reverse reads if they are in a different view than the forward reads")
group.add_argument('--score-min',
action="store", dest="alignpairedend:smin",
metavar="#.###",
default=None,
type=float,
help="Minimum score for keeping alignments")
# group.add_argument('--score-min',
# action="store", dest="alignpairedend:smin",
# metavar="#.###",
# default=None,
# type=float,
# help="Minimum score for keeping alignments. "
# "(for kmer alignment) The score is an approximation of the number of nucleotides matching in the overlap of the alignment.")
# group.add_argument('-A', '--true-ali',
# action="store_true", dest="alignpairedend:trueali",
@ -174,11 +175,6 @@ def run(config):
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL) #TODO output URI quality option?
if 'smin' in config['alignpairedend']:
smin = config['alignpairedend']['smin']
else:
smin = 0
# Initialize the progress bar
pb = ProgressBar(entries_len, config, seconde=5)
@ -217,7 +213,7 @@ def run(config):
else:
seqF = forward[i]
if ali.score > smin and ali.consensus_len > 0 :
if ali.overlap_len > 0 :
buildConsensus(ali, consensus, seqF)
else:
if not two_views:
@ -225,9 +221,7 @@ def run(config):
else:
seqR = reverse[i]
buildJoinedSequence(ali, seqR, consensus, forward=seqF)
consensus[b"smin"] = smin
if kmer_ali :
ali.free()

View File

@ -183,8 +183,9 @@ def buildConsensus(ali, seq, ref_tags=None):
# doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values)
#obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len)
seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual)
seq[b'ali_length'] = ali.consensus_len
seq[b'score_norm']=float(ali.score)/ali.consensus_len
seq[b"seq_length"] = ali.consensus_len
seq[b"overlap_length"] = ali.overlap_len
seq[b'score_norm']=round(float(ali.score)/ali.overlap_len, 3)
seq[b'shift']=ali.shift
else:
if len(ali[0])>999: # TODO why?
@ -256,9 +257,10 @@ def buildJoinedSequence(ali, reverse, seq, forward=None):
seq[b"ali_direction"]=None
seq[b"mode"]=b"joined"
seq[b"pairedend_limit"]=len(forward)
seq[b"ali_length"] = ali.consensus_len
seq[b"seq_length"] = ali.consensus_len
seq[b"overlap_length"] = ali.overlap_len
if ali.consensus_len > 0:
seq[b"score_norm"]=float(ali.score)/ali.consensus_len
seq[b'score_norm']=round(float(ali.score)/ali.overlap_len, 3)
else:
seq[b"score_norm"]=0.0

View File

@ -414,7 +414,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
}
if (max_common_kmers > 0)
score = max_common_kmers + kmer_size - 1; // aka the number of nucleotides in the longest stretch of kmers perfectly matching
score = max_common_kmers + kmer_size - 1; // aka an approximation of the number of nucleotides matching in the overlap of the alignment.
// It's an approximation because one mismatch produces kmer_size kmer mismatches if in the middle of the overlap,
// and less for mismatches located towards the ends of the overlap. The case where there are the most mismatches is assumed,
// meaning that the score will be often underestimated and never overestimated.
else
score = 0;
abs_shift = abs(best_shift);

View File

@ -27,7 +27,11 @@
* @brief Alignment structure, with informations about the similarity and to rebuild the alignment.
*/
typedef struct Obi_ali {
int score; /**< Alignment score, corresponding to the number of nucleotides in the longest stretch of kmers perfectly matching.
int score; /**< Alignment score, corresponding to an approximation of the number of
* nucleotides matching in the overlap of the alignment.
* It's an approximation because one mismatch produces kmer_size kmer mismatches if in the middle of the overlap,
* and less for mismatches located towards the ends of the overlap. The case where there are the most mismatches is assumed,
* meaning that the score will be often underestimated and never overestimated.
*/
int consensus_length; /**< Length of the final consensus sequence.
*/