Patch RPS12 detection
This commit is contained in:
@ -33,7 +33,7 @@ fi
|
|||||||
|
|
||||||
shift
|
shift
|
||||||
|
|
||||||
GENOME_LENGHT="$2"
|
GENOME_LENGTH="$1"
|
||||||
|
|
||||||
if (( $# > 1 )) ; then
|
if (( $# > 1 )) ; then
|
||||||
TEMP=$2
|
TEMP=$2
|
||||||
@ -54,7 +54,7 @@ pushTmpDir ORG.RPS12
|
|||||||
|
|
||||||
# localize the gene on the chloroplast genome using blast
|
# localize the gene on the chloroplast genome using blast
|
||||||
loginfo "Locating RPS12 gene by similarity..."
|
loginfo "Locating RPS12 gene by similarity..."
|
||||||
|
loginfo " Considered genome length : $GENOME_LENGTH"
|
||||||
|
|
||||||
blastx \
|
blastx \
|
||||||
-query ${QUERY} \
|
-query ${QUERY} \
|
||||||
@ -82,11 +82,126 @@ blastx \
|
|||||||
|
|
||||||
(BEST_EVAL > ($11 + 0.0)) {BEST_EVAL = ($11 + 0.0)}
|
(BEST_EVAL > ($11 + 0.0)) {BEST_EVAL = ($11 + 0.0)}
|
||||||
' \
|
' \
|
||||||
| $AwkCmd -v glength=${GENOME_LENGHT} \
|
| $AwkCmd -v glength="${GENOME_LENGTH}" \
|
||||||
'!($7 + 0 > glength + 0 && $8 + 0 > glength + 0)' \
|
'# Drop an HSP only when fields 7 and 8 (presumably query start/end) both exceed glength
!(($7 + 0) > (glength + 0) && ($8 + 0) > (glength + 0))' \
|
||||||
|
| $AwkCmd ' BEGIN { FS = OFS = "\t" }
|
||||||
|
|
||||||
|
{
|
||||||
|
subject = $2
|
||||||
|
bitscore = $12 + 0
|
||||||
|
s_start = $9 + 0
|
||||||
|
s_end = $10 + 0
|
||||||
|
|
||||||
|
# Normalize the coordinate order so that actual_start <= actual_end
|
||||||
|
actual_start = (s_start < s_end) ? s_start : s_end
|
||||||
|
actual_end = (s_start < s_end) ? s_end : s_start
|
||||||
|
|
||||||
|
# Store the HSP data, keyed by subject and by its per-subject index
|
||||||
|
count[subject]++
|
||||||
|
idx = count[subject]
|
||||||
|
starts[subject, idx] = actual_start
|
||||||
|
ends[subject, idx] = actual_end
|
||||||
|
scores[subject, idx] = bitscore
|
||||||
|
lines[subject, idx] = $0
|
||||||
|
}
|
||||||
|
|
||||||
|
END {
|
||||||
|
for (subject in count) {
|
||||||
|
n = count[subject]
|
||||||
|
|
||||||
|
# Sort by score (descending), then by start position (ascending)
|
||||||
|
for (i = 1; i <= n; i++) {
|
||||||
|
for (j = i + 1; j <= n; j++) {
|
||||||
|
if (scores[subject, i] < scores[subject, j] ||
|
||||||
|
(scores[subject, i] == scores[subject, j] &&
|
||||||
|
starts[subject, i] > starts[subject, j])) {
|
||||||
|
swap(starts, subject, i, j)
|
||||||
|
swap(ends, subject, i, j)
|
||||||
|
swap(scores, subject, i, j)
|
||||||
|
swap(lines, subject, i, j)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Select the best HSPs
|
||||||
|
selected_count = 0
|
||||||
|
delete selected_starts
|
||||||
|
delete selected_ends
|
||||||
|
delete selected_scores
|
||||||
|
delete selected_lines
|
||||||
|
|
||||||
|
for (i = 1; i <= n; i++) {
|
||||||
|
current_start = starts[subject, i]
|
||||||
|
current_end = ends[subject, i]
|
||||||
|
current_score = scores[subject, i]
|
||||||
|
current_line = lines[subject, i]
|
||||||
|
|
||||||
|
# Check for overlaps with the HSPs already selected
|
||||||
|
overlap_max = 0
|
||||||
|
overlap_indices = ""
|
||||||
|
for (j = 1; j <= selected_count; j++) {
|
||||||
|
os = (current_start > selected_starts[j]) ? current_start : selected_starts[j]
|
||||||
|
oe = (current_end < selected_ends[j]) ? current_end : selected_ends[j]
|
||||||
|
if (os <= oe && (oe - os + 1) >= 15) {
|
||||||
|
if (selected_scores[j] > overlap_max) overlap_max = selected_scores[j]
|
||||||
|
overlap_indices = overlap_indices j " "
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Apply the selection rules
|
||||||
|
if (overlap_indices == "") {
|
||||||
|
# No overlap: select this HSP
|
||||||
|
add_to_selected(current_start, current_end, current_score, current_line)
|
||||||
|
}
|
||||||
|
else if (current_score >= overlap_max) {
|
||||||
|
# Drop the lower-scoring overlapping HSPs and keep the ties
|
||||||
|
new_selected_count = 0
|
||||||
|
delete temp_selected
|
||||||
|
|
||||||
|
# Keep the non-overlapping HSPs and the ties
|
||||||
|
for (j = 1; j <= selected_count; j++) {
|
||||||
|
# Pad both sides with spaces so the test matches whole indices only (j=1 must not match inside "11 ")
if (!index(" " overlap_indices, " " j " ") || selected_scores[j] == current_score) {
|
||||||
|
temp_selected[++new_selected_count] = j
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Update the selected list
|
||||||
|
selected_count = 0
|
||||||
|
for (j = 1; j <= new_selected_count; j++) {
|
||||||
|
idx = temp_selected[j]
|
||||||
|
selected_count++
|
||||||
|
selected_starts[selected_count] = selected_starts[idx]
|
||||||
|
selected_ends[selected_count] = selected_ends[idx]
|
||||||
|
selected_scores[selected_count] = selected_scores[idx]
|
||||||
|
selected_lines[selected_count] = selected_lines[idx]
|
||||||
|
}
|
||||||
|
add_to_selected(current_start, current_end, current_score, current_line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Print the results
|
||||||
|
for (j = 1; j <= selected_count; j++) {
|
||||||
|
print selected_lines[j]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function add_to_selected(start, end, score, line) {
|
||||||
|
selected_count++
|
||||||
|
selected_starts[selected_count] = start
|
||||||
|
selected_ends[selected_count] = end
|
||||||
|
selected_scores[selected_count] = score
|
||||||
|
selected_lines[selected_count] = line
|
||||||
|
}
|
||||||
|
|
||||||
|
function swap(arr, subject, i, j, tmp) {
|
||||||
|
tmp = arr[subject, i]
|
||||||
|
arr[subject, i] = arr[subject, j]
|
||||||
|
arr[subject, j] = tmp
|
||||||
|
}
|
||||||
|
' \
|
||||||
> "rps12_locate.hsps"
|
> "rps12_locate.hsps"
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Extracting protein ids from selected blast HSPs
|
# Extracting protein ids from selected blast HSPs
|
||||||
#
|
#
|
||||||
@ -127,6 +242,7 @@ blastx \
|
|||||||
-v chloro="${QUERY}" \
|
-v chloro="${QUERY}" \
|
||||||
-f $LIB_DIR/rps12_filter_3.awk
|
-f $LIB_DIR/rps12_filter_3.awk
|
||||||
|
|
||||||
|
|
||||||
nrps12=$(ls -1 rps12_fragments_*.fasta | wc -l)
|
nrps12=$(ls -1 rps12_fragments_*.fasta | wc -l)
|
||||||
|
|
||||||
if (( nrps12 > 1 )) ; then
|
if (( nrps12 > 1 )) ; then
|
||||||
@ -161,6 +277,10 @@ blastx \
|
|||||||
|
|
||||||
n=0
|
n=0
|
||||||
for f in *.res ; do
|
for f in *.res ; do
|
||||||
|
loginfo "processing $f"
|
||||||
|
xxx=$(cat $f)
|
||||||
|
echo -e "\n==============\n$xxx\n==============\n" 1>&2
|
||||||
|
|
||||||
((n=n+1))
|
((n=n+1))
|
||||||
mv $f $f.ori
|
mv $f $f.ori
|
||||||
if [[ -z "$TEMP" ]] ; then
|
if [[ -z "$TEMP" ]] ; then
|
||||||
@ -168,7 +288,9 @@ blastx \
|
|||||||
else
|
else
|
||||||
dest="$TEMP/$f"
|
dest="$TEMP/$f"
|
||||||
fi
|
fi
|
||||||
|
loginfo "Destination file $dest"
|
||||||
header=$(head -1 ${f/.rps12.res/.fasta})
|
header=$(head -1 ${f/.rps12.res/.fasta})
|
||||||
|
loginfo "Header: $header"
|
||||||
L2=$(sed -E 's/^.*limit=([0-9]+);.*$/\1/' <<< $header)
|
L2=$(sed -E 's/^.*limit=([0-9]+);.*$/\1/' <<< $header)
|
||||||
S1=$(sed -E 's/^.*strand1=(R|F);.*$/\1/' <<< $header)
|
S1=$(sed -E 's/^.*strand1=(R|F);.*$/\1/' <<< $header)
|
||||||
S2=$(sed -E 's/^.*strand2=(R|F);.*$/\1/' <<< $header)
|
S2=$(sed -E 's/^.*strand2=(R|F);.*$/\1/' <<< $header)
|
||||||
@ -179,7 +301,7 @@ blastx \
|
|||||||
cat $f.ori \
|
cat $f.ori \
|
||||||
| $AwkCmd -v S1="$S1" -v F1="$F1" -v T1="$T1" \
|
| $AwkCmd -v S1="$S1" -v F1="$F1" -v T1="$T1" \
|
||||||
-v S2="$S2" -v F2="$F2" -v T2="$T2" -v L2="$L2" \
|
-v S2="$S2" -v F2="$F2" -v T2="$T2" -v L2="$L2" \
|
||||||
-f $LIB_DIR/rps12_filter_4.awk \
|
-f $LIB_DIR/rps12_filter_4.awk \
|
||||||
| $AwkCmd '
|
| $AwkCmd '
|
||||||
#
|
#
|
||||||
# Normalize join(complement(A),complement(B),complement(C)) locations
|
# Normalize join(complement(A),complement(B),complement(C)) locations
|
||||||
@ -201,7 +323,7 @@ blastx \
|
|||||||
sub(positions,rexons,$0)
|
sub(positions,rexons,$0)
|
||||||
}
|
}
|
||||||
{ print $0}
|
{ print $0}
|
||||||
' \
|
' \
|
||||||
| $AwkCmd '
|
| $AwkCmd '
|
||||||
/^FT [^ ]/ && (length($0) > 80) {
|
/^FT [^ ]/ && (length($0) > 80) {
|
||||||
n = split($0,parts,",")
|
n = split($0,parts,",")
|
||||||
@ -237,10 +359,9 @@ blastx \
|
|||||||
{
|
{
|
||||||
print $0
|
print $0
|
||||||
}
|
}
|
||||||
' > $dest
|
' > "$dest"
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
||||||
popTmpDir
|
popTmpDir
|
||||||
|
|
||||||
exit 0
|
exit 0
|
||||||
|
@ -37,6 +37,8 @@ needfile "$Fasta"
|
|||||||
|
|
||||||
GenomeLength=$1; shift
|
GenomeLength=$1; shift
|
||||||
|
|
||||||
|
loginfo "Genome length set to : $GenomeLength bp"
|
||||||
|
|
||||||
# Genome names is set from the base
|
# Genome names is set from the base
|
||||||
# name of the genome file without its extension
|
# name of the genome file without its extension
|
||||||
Genome=$(basename ${Fasta%.*})
|
Genome=$(basename ${Fasta%.*})
|
||||||
@ -95,7 +97,7 @@ fi
|
|||||||
|
|
||||||
if [[ "$cdsdetection_pass2" == "yes" ]] ; then
|
if [[ "$cdsdetection_pass2" == "yes" ]] ; then
|
||||||
loginfo "running pass2:rps12 exonerate of $Genome on $DbRoot"
|
loginfo "running pass2:rps12 exonerate of $Genome on $DbRoot"
|
||||||
$PROG_DIR/do_rps12.sh $Fasta $GenomeLength $temp
|
"$PROG_DIR/do_rps12.sh" "$Fasta" "$GenomeLength" $temp
|
||||||
fi
|
fi
|
||||||
|
|
||||||
#
|
#
|
||||||
@ -109,7 +111,7 @@ if [[ "$cdsdetection_pass3" == "yes" ]] ; then
|
|||||||
loginfo $fams
|
loginfo $fams
|
||||||
loginfo "running pass3:$dir exonerate of $Genome on $DbRoot"
|
loginfo "running pass3:$dir exonerate of $Genome on $DbRoot"
|
||||||
for f in $fams ; do
|
for f in $fams ; do
|
||||||
echo tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
|
# Emit the command line rather than running it: the enclosing subshell is piped to parallel
echo tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
done) | parallel -j $Threads
|
done) | parallel -j $Threads
|
||||||
|
@ -507,9 +507,9 @@ pushTmpDir ORG.organnot
|
|||||||
if [[ ! -z "${sequence}" ]] ; then
|
if [[ ! -z "${sequence}" ]] ; then
|
||||||
echo "${sequence}" > toannotate.fasta
|
echo "${sequence}" > toannotate.fasta
|
||||||
sl=$(seqlength "toannotate.fasta")
|
sl=$(seqlength "toannotate.fasta")
|
||||||
|
|
||||||
if (( sl >= minlength )) ; then
|
if (( sl >= minlength )) ; then
|
||||||
|
loginfo "Annotated genome length: $sl bp"
|
||||||
seqid=$($AwkCmd '(NR==1) {print substr($1,2,1000)}' toannotate.fasta)
|
seqid=$($AwkCmd '(NR==1) {print substr($1,2,1000)}' toannotate.fasta)
|
||||||
# seqid=$(tr "." "_" <<< ${seqid})
|
# seqid=$(tr "." "_" <<< ${seqid})
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user