Patch RPS12 detection
This commit is contained in:
@ -33,7 +33,7 @@ fi
|
||||
|
||||
shift
|
||||
|
||||
GENOME_LENGHT="$2"
|
||||
GENOME_LENGTH="$1"
|
||||
|
||||
if (( $# > 1 )) ; then
|
||||
TEMP=$2
|
||||
@ -54,7 +54,7 @@ pushTmpDir ORG.RPS12
|
||||
|
||||
# localize the gene on the chloroplast genome using blast
|
||||
loginfo "Locating RPS12 gene by similarity..."
|
||||
|
||||
loginfo " Considered genome length : $GENOME_LENGTH"
|
||||
|
||||
blastx \
|
||||
-query ${QUERY} \
|
||||
@ -82,11 +82,126 @@ blastx \
|
||||
|
||||
(BEST_EVAL > ($11 + 0.0)) {BEST_EVAL = ($11 + 0.0)}
|
||||
' \
|
||||
| $AwkCmd -v glength=${GENOME_LENGHT} \
|
||||
'!($7 + 0 > glength + 0 && $8 + 0 > glength + 0)' \
|
||||
| $AwkCmd -v glength="${GENOME_LENGTH}" \
|
||||
'!(($7 + 0) > (glength + 0) && $8 + (0 > glength + 0))' \
|
||||
| $AwkCmd ' BEGIN { FS = OFS = "\t" }
|
||||
|
||||
{
|
||||
subject = $2
|
||||
bitscore = $12 + 0
|
||||
s_start = $9 + 0
|
||||
s_end = $10 + 0
|
||||
|
||||
# Ajuster l ordre des coordonnées
|
||||
actual_start = (s_start < s_end) ? s_start : s_end
|
||||
actual_end = (s_start < s_end) ? s_end : s_start
|
||||
|
||||
# Stocker les données
|
||||
count[subject]++
|
||||
idx = count[subject]
|
||||
starts[subject, idx] = actual_start
|
||||
ends[subject, idx] = actual_end
|
||||
scores[subject, idx] = bitscore
|
||||
lines[subject, idx] = $0
|
||||
}
|
||||
|
||||
END {
|
||||
for (subject in count) {
|
||||
n = count[subject]
|
||||
|
||||
# Tri par score (décroissant) puis position (croissante)
|
||||
for (i = 1; i <= n; i++) {
|
||||
for (j = i + 1; j <= n; j++) {
|
||||
if (scores[subject, i] < scores[subject, j] ||
|
||||
(scores[subject, i] == scores[subject, j] &&
|
||||
starts[subject, i] > starts[subject, j])) {
|
||||
swap(starts, subject, i, j)
|
||||
swap(ends, subject, i, j)
|
||||
swap(scores, subject, i, j)
|
||||
swap(lines, subject, i, j)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Sélection des HSPs optimaux
|
||||
selected_count = 0
|
||||
delete selected_starts
|
||||
delete selected_ends
|
||||
delete selected_scores
|
||||
delete selected_lines
|
||||
|
||||
for (i = 1; i <= n; i++) {
|
||||
current_start = starts[subject, i]
|
||||
current_end = ends[subject, i]
|
||||
current_score = scores[subject, i]
|
||||
current_line = lines[subject, i]
|
||||
|
||||
# Vérifier les chevauchements avec les HSPs sélectionnés
|
||||
overlap_max = 0
|
||||
overlap_indices = ""
|
||||
for (j = 1; j <= selected_count; j++) {
|
||||
os = (current_start > selected_starts[j]) ? current_start : selected_starts[j]
|
||||
oe = (current_end < selected_ends[j]) ? current_end : selected_ends[j]
|
||||
if (os <= oe && (oe - os + 1) >= 15) {
|
||||
if (selected_scores[j] > overlap_max) overlap_max = selected_scores[j]
|
||||
overlap_indices = overlap_indices j " "
|
||||
}
|
||||
}
|
||||
|
||||
# Appliquer les règles de sélection
|
||||
if (overlap_indices == "") {
|
||||
# Aucun chevauchement : sélectionner
|
||||
add_to_selected(current_start, current_end, current_score, current_line)
|
||||
}
|
||||
else if (current_score >= overlap_max) {
|
||||
# Supprimer les HSPs moins bons et conserver les ex æquo
|
||||
new_selected_count = 0
|
||||
delete temp_selected
|
||||
|
||||
# Garder les non-chevauchants et les ex æquo
|
||||
for (j = 1; j <= selected_count; j++) {
|
||||
if (!index(overlap_indices, j " ") || selected_scores[j] == current_score) {
|
||||
temp_selected[++new_selected_count] = j
|
||||
}
|
||||
}
|
||||
|
||||
# Mettre à jour la liste sélectionnée
|
||||
selected_count = 0
|
||||
for (j = 1; j <= new_selected_count; j++) {
|
||||
idx = temp_selected[j]
|
||||
selected_count++
|
||||
selected_starts[selected_count] = selected_starts[idx]
|
||||
selected_ends[selected_count] = selected_ends[idx]
|
||||
selected_scores[selected_count] = selected_scores[idx]
|
||||
selected_lines[selected_count] = selected_lines[idx]
|
||||
}
|
||||
add_to_selected(current_start, current_end, current_score, current_line)
|
||||
}
|
||||
}
|
||||
|
||||
# Affichage des résultats
|
||||
for (j = 1; j <= selected_count; j++) {
|
||||
print selected_lines[j]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function add_to_selected(start, end, score, line) {
|
||||
selected_count++
|
||||
selected_starts[selected_count] = start
|
||||
selected_ends[selected_count] = end
|
||||
selected_scores[selected_count] = score
|
||||
selected_lines[selected_count] = line
|
||||
}
|
||||
|
||||
function swap(arr, subject, i, j, tmp) {
|
||||
tmp = arr[subject, i]
|
||||
arr[subject, i] = arr[subject, j]
|
||||
arr[subject, j] = tmp
|
||||
}
|
||||
' \
|
||||
> "rps12_locate.hsps"
|
||||
|
||||
|
||||
#
|
||||
# Extracting protein ids from selected blast HSPs
|
||||
#
|
||||
@ -127,6 +242,7 @@ blastx \
|
||||
-v chloro="${QUERY}" \
|
||||
-f $LIB_DIR/rps12_filter_3.awk
|
||||
|
||||
|
||||
nrps12=$(ls -1 rps12_fragments_*.fasta | wc -l)
|
||||
|
||||
if (( nrps12 > 1 )) ; then
|
||||
@ -161,6 +277,10 @@ blastx \
|
||||
|
||||
n=0
|
||||
for f in *.res ; do
|
||||
loginfo "processing $f"
|
||||
xxx=$(cat $f)
|
||||
echo -e "\n==============\n$xxx\n==============\n" 1>&2
|
||||
|
||||
((n=n+1))
|
||||
mv $f $f.ori
|
||||
if [[ -z "$TEMP" ]] ; then
|
||||
@ -168,7 +288,9 @@ blastx \
|
||||
else
|
||||
dest="$TEMP/$f"
|
||||
fi
|
||||
loginfo "Destination file $dest"
|
||||
header=$(head -1 ${f/.rps12.res/.fasta})
|
||||
loginfo "Header: $header"
|
||||
L2=$(sed -E 's/^.*limit=([0-9]+);.*$/\1/' <<< $header)
|
||||
S1=$(sed -E 's/^.*strand1=(R|F);.*$/\1/' <<< $header)
|
||||
S2=$(sed -E 's/^.*strand2=(R|F);.*$/\1/' <<< $header)
|
||||
@ -179,7 +301,7 @@ blastx \
|
||||
cat $f.ori \
|
||||
| $AwkCmd -v S1="$S1" -v F1="$F1" -v T1="$T1" \
|
||||
-v S2="$S2" -v F2="$F2" -v T2="$T2" -v L2="$L2" \
|
||||
-f $LIB_DIR/rps12_filter_4.awk \
|
||||
-f $LIB_DIR/rps12_filter_4.awk \
|
||||
| $AwkCmd '
|
||||
#
|
||||
# Normalize join(complement(A),complement(B),complement(C)) locations
|
||||
@ -201,7 +323,7 @@ blastx \
|
||||
sub(positions,rexons,$0)
|
||||
}
|
||||
{ print $0}
|
||||
' \
|
||||
' \
|
||||
| $AwkCmd '
|
||||
/^FT [^ ]/ && (length($0) > 80) {
|
||||
n = split($0,parts,",")
|
||||
@ -237,10 +359,9 @@ blastx \
|
||||
{
|
||||
print $0
|
||||
}
|
||||
' > $dest
|
||||
done
|
||||
|
||||
|
||||
' > "$dest"
|
||||
done
|
||||
|
||||
popTmpDir
|
||||
|
||||
exit 0
|
||||
|
@ -37,6 +37,8 @@ needfile "$Fasta"
|
||||
|
||||
GenomeLength=$1; shift
|
||||
|
||||
loginfo "Genome length set to : $GenomeLength bp"
|
||||
|
||||
# Genome names is set from the base
|
||||
# name of the genome file without its extension
|
||||
Genome=$(basename ${Fasta%.*})
|
||||
@ -95,7 +97,7 @@ fi
|
||||
|
||||
if [[ "$cdsdetection_pass2" == "yes" ]] ; then
|
||||
loginfo "running pass2:rps12 exonerate of $Genome on $DbRoot"
|
||||
$PROG_DIR/do_rps12.sh $Fasta $GenomeLength $temp
|
||||
"$PROG_DIR/do_rps12.sh" "$Fasta" "$GenomeLength" $temp
|
||||
fi
|
||||
|
||||
#
|
||||
@ -109,7 +111,7 @@ if [[ "$cdsdetection_pass3" == "yes" ]] ; then
|
||||
loginfo $fams
|
||||
loginfo "running pass3:$dir exonerate of $Genome on $DbRoot"
|
||||
for f in $fams ; do
|
||||
echo tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
|
||||
tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
|
||||
done
|
||||
fi
|
||||
done) | parallel -j $Threads
|
||||
|
@ -507,9 +507,9 @@ pushTmpDir ORG.organnot
|
||||
if [[ ! -z "${sequence}" ]] ; then
|
||||
echo "${sequence}" > toannotate.fasta
|
||||
sl=$(seqlength "toannotate.fasta")
|
||||
|
||||
|
||||
if (( sl >= minlength )) ; then
|
||||
|
||||
loginfo "Annotated genome length: $sl bp"
|
||||
seqid=$($AwkCmd '(NR==1) {print substr($1,2,1000)}' toannotate.fasta)
|
||||
# seqid=$(tr "." "_" <<< ${seqid})
|
||||
|
||||
|
Reference in New Issue
Block a user