From 17908e0df2b135da12f1c05098286225f73e2504 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Sun, 25 May 2025 13:41:47 +0200 Subject: [PATCH] Patch RPS12 detection --- detectors/cds/bin/do_rps12.sh | 143 +++++++++++++++++++++++++++++++--- detectors/cds/bin/go_cds.sh | 6 +- org-annotate.sh | 4 +- 3 files changed, 138 insertions(+), 15 deletions(-) diff --git a/detectors/cds/bin/do_rps12.sh b/detectors/cds/bin/do_rps12.sh index b1d33b5..5259c4e 100755 --- a/detectors/cds/bin/do_rps12.sh +++ b/detectors/cds/bin/do_rps12.sh @@ -33,7 +33,7 @@ fi shift -GENOME_LENGHT="$2" +GENOME_LENGTH="$1" if (( $# > 1 )) ; then TEMP=$2 @@ -54,7 +54,7 @@ pushTmpDir ORG.RPS12 # localize the gene on the chloroplast genome using blast loginfo "Locating RPS12 gene by similarity..." - +loginfo " Considered genome length : $GENOME_LENGTH" blastx \ -query ${QUERY} \ @@ -82,11 +82,126 @@ blastx \ (BEST_EVAL > ($11 + 0.0)) {BEST_EVAL = ($11 + 0.0)} ' \ - | $AwkCmd -v glength=${GENOME_LENGHT} \ - '!($7 + 0 > glength + 0 && $8 + 0 > glength + 0)' \ + | $AwkCmd -v glength="${GENOME_LENGTH}" \ + '!(($7 + 0) > (glength + 0) && $8 + (0 > glength + 0))' \ + | $AwkCmd ' BEGIN { FS = OFS = "\t" } + + { + subject = $2 + bitscore = $12 + 0 + s_start = $9 + 0 + s_end = $10 + 0 + + # Ajuster l ordre des coordonnées + actual_start = (s_start < s_end) ? s_start : s_end + actual_end = (s_start < s_end) ? s_end : s_start + + # Stocker les données + count[subject]++ + idx = count[subject] + starts[subject, idx] = actual_start + ends[subject, idx] = actual_end + scores[subject, idx] = bitscore + lines[subject, idx] = $0 + } + + END { + for (subject in count) { + n = count[subject] + + # Tri par score (décroissant) puis position (croissante) + for (i = 1; i <= n; i++) { + for (j = i + 1; j <= n; j++) { + if (scores[subject, i] < scores[subject, j] || + (scores[subject, i] == scores[subject, j] && + starts[subject, i] > starts[subject, j])) { + swap(starts, subject, i, j) + swap(ends, subject, i, j) + swap(scores, subject, i, j) + swap(lines, subject, i, j) + } + } + } + + # Sélection des HSPs optimaux + selected_count = 0 + delete selected_starts + delete selected_ends + delete selected_scores + delete selected_lines + + for (i = 1; i <= n; i++) { + current_start = starts[subject, i] + current_end = ends[subject, i] + current_score = scores[subject, i] + current_line = lines[subject, i] + + # Vérifier les chevauchements avec les HSPs sélectionnés + overlap_max = 0 + overlap_indices = "" + for (j = 1; j <= selected_count; j++) { + os = (current_start > selected_starts[j]) ? current_start : selected_starts[j] + oe = (current_end < selected_ends[j]) ? current_end : selected_ends[j] + if (os <= oe && (oe - os + 1) >= 15) { + if (selected_scores[j] > overlap_max) overlap_max = selected_scores[j] + overlap_indices = overlap_indices j " " + } + } + + # Appliquer les règles de sélection + if (overlap_indices == "") { + # Aucun chevauchement : sélectionner + add_to_selected(current_start, current_end, current_score, current_line) + } + else if (current_score >= overlap_max) { + # Supprimer les HSPs moins bons et conserver les ex æquo + new_selected_count = 0 + delete temp_selected + + # Garder les non-chevauchants et les ex æquo + for (j = 1; j <= selected_count; j++) { + if (!index(overlap_indices, j " ") || selected_scores[j] == current_score) { + temp_selected[++new_selected_count] = j + } + } + + # Mettre à jour la liste sélectionnée + selected_count = 0 + for (j = 1; j <= new_selected_count; j++) { + idx = temp_selected[j] + selected_count++ + selected_starts[selected_count] = selected_starts[idx] + selected_ends[selected_count] = selected_ends[idx] + selected_scores[selected_count] = selected_scores[idx] + selected_lines[selected_count] = selected_lines[idx] + } + add_to_selected(current_start, current_end, current_score, current_line) + } + } + + # Affichage des résultats + for (j = 1; j <= selected_count; j++) { + print selected_lines[j] + } + } + } + + function add_to_selected(start, end, score, line) { + selected_count++ + selected_starts[selected_count] = start + selected_ends[selected_count] = end + selected_scores[selected_count] = score + selected_lines[selected_count] = line + } + + function swap(arr, subject, i, j, tmp) { + tmp = arr[subject, i] + arr[subject, i] = arr[subject, j] + arr[subject, j] = tmp + } + ' \ > "rps12_locate.hsps" - # # Extracting protein ids from selected blast HSPs # @@ -127,6 +242,7 @@ blastx \ -v chloro="${QUERY}" \ -f $LIB_DIR/rps12_filter_3.awk + nrps12=$(ls -1 rps12_fragments_*.fasta | wc -l) if (( nrps12 > 1 )) ; then @@ -161,6 +277,10 @@ blastx \ n=0 for f in *.res ; do + loginfo "processing $f" + xxx=$(cat $f) + echo -e "\n==============\n$xxx\n==============\n" 1>&2 + ((n=n+1)) mv $f $f.ori if [[ -z "$TEMP" ]] ; then @@ -168,7 +288,9 @@ blastx \ else dest="$TEMP/$f" fi + loginfo "Destination file $dest" header=$(head -1 ${f/.rps12.res/.fasta}) + loginfo "Header: $header" L2=$(sed -E 's/^.*limit=([0-9]+);.*$/\1/' <<< $header) S1=$(sed -E 's/^.*strand1=(R|F);.*$/\1/' <<< $header) S2=$(sed -E 's/^.*strand2=(R|F);.*$/\1/' <<< $header) @@ -179,7 +301,7 @@ blastx \ cat $f.ori \ | $AwkCmd -v S1="$S1" -v F1="$F1" -v T1="$T1" \ -v S2="$S2" -v F2="$F2" -v T2="$T2" -v L2="$L2" \ - -f $LIB_DIR/rps12_filter_4.awk \ + -f $LIB_DIR/rps12_filter_4.awk \ | $AwkCmd ' # # Normalize join(complement(A),complement(B),complement(C)) locations @@ -201,7 +323,7 @@ blastx \ sub(positions,rexons,$0) } { print $0} - ' \ + ' \ | $AwkCmd ' /^FT [^ ]/ && (length($0) > 80) { n = split($0,parts,",") @@ -237,10 +359,9 @@ blastx \ { print $0 } - ' > $dest - done - - + ' > "$dest" + done + popTmpDir exit 0 diff --git a/detectors/cds/bin/go_cds.sh b/detectors/cds/bin/go_cds.sh index 990625a..b89f244 100755 --- a/detectors/cds/bin/go_cds.sh +++ b/detectors/cds/bin/go_cds.sh @@ -37,6 +37,8 @@ needfile "$Fasta" GenomeLength=$1; shift +loginfo "Genome length set to : $GenomeLength bp" + # Genome names is set from the base # name of the genome file without its extension Genome=$(basename ${Fasta%.*}) @@ -95,7 +97,7 @@ fi if [[ "$cdsdetection_pass2" == "yes" ]] ; then loginfo "running pass2:rps12 exonerate of $Genome on $DbRoot" - $PROG_DIR/do_rps12.sh $Fasta $GenomeLength $temp + "$PROG_DIR/do_rps12.sh" "$Fasta" "$GenomeLength" $temp fi # @@ -109,7 +111,7 @@ if [[ "$cdsdetection_pass3" == "yes" ]] ; then loginfo $fams loginfo "running pass3:$dir exonerate of $Genome on $DbRoot" for f in $fams ; do - echo tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp + tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp done fi done) | parallel -j $Threads diff --git a/org-annotate.sh b/org-annotate.sh index df4e839..b7f44ca 100755 --- a/org-annotate.sh +++ b/org-annotate.sh @@ -507,9 +507,9 @@ pushTmpDir ORG.organnot if [[ ! -z "${sequence}" ]] ; then echo "${sequence}" > toannotate.fasta sl=$(seqlength "toannotate.fasta") - + if (( sl >= minlength )) ; then - + loginfo "Annotated genome length: $sl bp" seqid=$($AwkCmd '(NR==1) {print substr($1,2,1000)}' toannotate.fasta) # seqid=$(tr "." "_" <<< ${seqid})