alignpairedend: improved/fixed the output tags for the alignment score
and lengths. Removed minimum score option
This commit is contained in:
@ -39,12 +39,13 @@ def addOptions(parser):
|
|||||||
type=str,
|
type=str,
|
||||||
help="URI to the reverse reads if they are in a different view than the forward reads")
|
help="URI to the reverse reads if they are in a different view than the forward reads")
|
||||||
|
|
||||||
group.add_argument('--score-min',
|
# group.add_argument('--score-min',
|
||||||
action="store", dest="alignpairedend:smin",
|
# action="store", dest="alignpairedend:smin",
|
||||||
metavar="#.###",
|
# metavar="#.###",
|
||||||
default=None,
|
# default=None,
|
||||||
type=float,
|
# type=float,
|
||||||
help="Minimum score for keeping alignments")
|
# help="Minimum score for keeping alignments. "
|
||||||
|
# "(for kmer alignment) The score is an approximation of the number of nucleotides matching in the overlap of the alignment.")
|
||||||
|
|
||||||
# group.add_argument('-A', '--true-ali',
|
# group.add_argument('-A', '--true-ali',
|
||||||
# action="store_true", dest="alignpairedend:trueali",
|
# action="store_true", dest="alignpairedend:trueali",
|
||||||
@ -174,11 +175,6 @@ def run(config):
|
|||||||
|
|
||||||
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL) #TODO output URI quality option?
|
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL) #TODO output URI quality option?
|
||||||
|
|
||||||
if 'smin' in config['alignpairedend']:
|
|
||||||
smin = config['alignpairedend']['smin']
|
|
||||||
else:
|
|
||||||
smin = 0
|
|
||||||
|
|
||||||
# Initialize the progress bar
|
# Initialize the progress bar
|
||||||
pb = ProgressBar(entries_len, config, seconde=5)
|
pb = ProgressBar(entries_len, config, seconde=5)
|
||||||
|
|
||||||
@ -217,7 +213,7 @@ def run(config):
|
|||||||
else:
|
else:
|
||||||
seqF = forward[i]
|
seqF = forward[i]
|
||||||
|
|
||||||
if ali.score > smin and ali.consensus_len > 0 :
|
if ali.overlap_len > 0 :
|
||||||
buildConsensus(ali, consensus, seqF)
|
buildConsensus(ali, consensus, seqF)
|
||||||
else:
|
else:
|
||||||
if not two_views:
|
if not two_views:
|
||||||
@ -226,8 +222,6 @@ def run(config):
|
|||||||
seqR = reverse[i]
|
seqR = reverse[i]
|
||||||
buildJoinedSequence(ali, seqR, consensus, forward=seqF)
|
buildJoinedSequence(ali, seqR, consensus, forward=seqF)
|
||||||
|
|
||||||
consensus[b"smin"] = smin
|
|
||||||
|
|
||||||
if kmer_ali :
|
if kmer_ali :
|
||||||
ali.free()
|
ali.free()
|
||||||
|
|
||||||
|
@ -183,8 +183,9 @@ def buildConsensus(ali, seq, ref_tags=None):
|
|||||||
# doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values)
|
# doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values)
|
||||||
#obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len)
|
#obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len)
|
||||||
seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual)
|
seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual)
|
||||||
seq[b'ali_length'] = ali.consensus_len
|
seq[b"seq_length"] = ali.consensus_len
|
||||||
seq[b'score_norm']=float(ali.score)/ali.consensus_len
|
seq[b"overlap_length"] = ali.overlap_len
|
||||||
|
seq[b'score_norm']=round(float(ali.score)/ali.overlap_len, 3)
|
||||||
seq[b'shift']=ali.shift
|
seq[b'shift']=ali.shift
|
||||||
else:
|
else:
|
||||||
if len(ali[0])>999: # TODO why?
|
if len(ali[0])>999: # TODO why?
|
||||||
@ -256,9 +257,10 @@ def buildJoinedSequence(ali, reverse, seq, forward=None):
|
|||||||
seq[b"ali_direction"]=None
|
seq[b"ali_direction"]=None
|
||||||
seq[b"mode"]=b"joined"
|
seq[b"mode"]=b"joined"
|
||||||
seq[b"pairedend_limit"]=len(forward)
|
seq[b"pairedend_limit"]=len(forward)
|
||||||
seq[b"ali_length"] = ali.consensus_len
|
seq[b"seq_length"] = ali.consensus_len
|
||||||
|
seq[b"overlap_length"] = ali.overlap_len
|
||||||
if ali.consensus_len > 0:
|
if ali.consensus_len > 0:
|
||||||
seq[b"score_norm"]=float(ali.score)/ali.consensus_len
|
seq[b'score_norm']=round(float(ali.score)/ali.overlap_len, 3)
|
||||||
else:
|
else:
|
||||||
seq[b"score_norm"]=0.0
|
seq[b"score_norm"]=0.0
|
||||||
|
|
||||||
|
@ -414,7 +414,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (max_common_kmers > 0)
|
if (max_common_kmers > 0)
|
||||||
score = max_common_kmers + kmer_size - 1; // aka the number of nucleotides in the longest stretch of kmers perfectly matching
|
score = max_common_kmers + kmer_size - 1; // aka an approximation of the number of nucleotides matching in the overlap of the alignment.
|
||||||
|
// It's an approximation because one mismatch produces kmer_size kmer mismatches if in the middle of the overlap,
|
||||||
|
// and less for mismatches located towards the ends of the overlap. The case where there are the most mismatches is assumed,
|
||||||
|
// meaning that the score will be often underestimated and never overestimated.
|
||||||
else
|
else
|
||||||
score = 0;
|
score = 0;
|
||||||
abs_shift = abs(best_shift);
|
abs_shift = abs(best_shift);
|
||||||
|
@ -27,7 +27,11 @@
|
|||||||
* @brief Alignment structure, with informations about the similarity and to rebuild the alignment.
|
* @brief Alignment structure, with informations about the similarity and to rebuild the alignment.
|
||||||
*/
|
*/
|
||||||
typedef struct Obi_ali {
|
typedef struct Obi_ali {
|
||||||
int score; /**< Alignment score, corresponding to the number of nucleotides in the longest stretch of kmers perfectly matching.
|
int score; /**< Alignment score, corresponding to an approximation of the number of
|
||||||
|
* nucleotides matching in the overlap of the alignment.
|
||||||
|
* It's an approximation because one mismatch produces kmer_size kmer mismatches if in the middle of the overlap,
|
||||||
|
* and less for mismatches located towards the ends of the overlap. The case where there are the most mismatches is assumed,
|
||||||
|
* meaning that the score will be often underestimated and never overestimated.
|
||||||
*/
|
*/
|
||||||
int consensus_length; /**< Length of the final consensus sequence.
|
int consensus_length; /**< Length of the final consensus sequence.
|
||||||
*/
|
*/
|
||||||
|
Reference in New Issue
Block a user