From 17908e0df2b135da12f1c05098286225f73e2504 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Sun, 25 May 2025 13:41:47 +0200
Subject: [PATCH] Patch RPS12 detection

---
 detectors/cds/bin/do_rps12.sh | 143 +++++++++++++++++++++++++++++++---
 detectors/cds/bin/go_cds.sh   |   6 +-
 org-annotate.sh               |   4 +-
 3 files changed, 138 insertions(+), 15 deletions(-)

diff --git a/detectors/cds/bin/do_rps12.sh b/detectors/cds/bin/do_rps12.sh
index b1d33b5..5259c4e 100755
--- a/detectors/cds/bin/do_rps12.sh
+++ b/detectors/cds/bin/do_rps12.sh
@@ -33,7 +33,7 @@ fi
 
 shift
 
-GENOME_LENGHT="$2"
+GENOME_LENGTH="$1"
 
 if (( $# > 1 )) ; then
     TEMP=$2
@@ -54,7 +54,7 @@ pushTmpDir ORG.RPS12
 
 # localize the gene on the chloroplast genome using blast
 loginfo "Locating RPS12 gene by similarity..."
-
+loginfo "  Considered genome length : $GENOME_LENGTH"
 
 blastx \
     -query ${QUERY} \
@@ -82,11 +82,126 @@ blastx \
                 
                 (BEST_EVAL > ($11 + 0.0)) {BEST_EVAL = ($11 + 0.0)} 
           ' \
-    | $AwkCmd -v glength=${GENOME_LENGHT}  \
-              '!($7 + 0 > glength +  0 && $8  + 0 > glength +  0)' \
+    | $AwkCmd -v glength="${GENOME_LENGTH}"  \
+              '!(($7 + 0) > (glength +  0) && $8  + (0 > glength +  0))' \
+    | $AwkCmd ' BEGIN { FS = OFS = "\t" }
+
+                {
+                    subject = $2
+                    bitscore = $12 + 0
+                    s_start = $9 + 0
+                    s_end = $10 + 0
+
+                    # Ajuster l ordre des coordonnées
+                    actual_start = (s_start < s_end) ? s_start : s_end
+                    actual_end = (s_start < s_end) ? s_end : s_start
+
+                    # Stocker les données
+                    count[subject]++
+                    idx = count[subject]
+                    starts[subject, idx] = actual_start
+                    ends[subject, idx] = actual_end
+                    scores[subject, idx] = bitscore
+                    lines[subject, idx] = $0
+                }
+
+                END {
+                    for (subject in count) {
+                        n = count[subject]
+                        
+                        # Tri par score (décroissant) puis position (croissante)
+                        for (i = 1; i <= n; i++) {
+                            for (j = i + 1; j <= n; j++) {
+                                if (scores[subject, i] < scores[subject, j] || 
+                                (scores[subject, i] == scores[subject, j] && 
+                                    starts[subject, i] > starts[subject, j])) {
+                                    swap(starts, subject, i, j)
+                                    swap(ends, subject, i, j)
+                                    swap(scores, subject, i, j)
+                                    swap(lines, subject, i, j)
+                                }
+                            }
+                        }
+
+                        # Sélection des HSPs optimaux
+                        selected_count = 0
+                        delete selected_starts
+                        delete selected_ends
+                        delete selected_scores
+                        delete selected_lines
+
+                        for (i = 1; i <= n; i++) {
+                            current_start = starts[subject, i]
+                            current_end = ends[subject, i]
+                            current_score = scores[subject, i]
+                            current_line = lines[subject, i]
+                            
+                            # Vérifier les chevauchements avec les HSPs sélectionnés
+                            overlap_max = 0
+                            overlap_indices = ""
+                            for (j = 1; j <= selected_count; j++) {
+                                os = (current_start > selected_starts[j]) ? current_start : selected_starts[j]
+                                oe = (current_end < selected_ends[j]) ? current_end : selected_ends[j]
+                                if (os <= oe && (oe - os + 1) >= 15) {
+                                    if (selected_scores[j] > overlap_max) overlap_max = selected_scores[j]
+                                    overlap_indices = overlap_indices j " "
+                                }
+                            }
+
+                            # Appliquer les règles de sélection
+                            if (overlap_indices == "") {
+                                # Aucun chevauchement : sélectionner
+                                add_to_selected(current_start, current_end, current_score, current_line)
+                            }
+                            else if (current_score >= overlap_max) {
+                                # Supprimer les HSPs moins bons et conserver les ex æquo
+                                new_selected_count = 0
+                                delete temp_selected
+
+                                # Garder les non-chevauchants et les ex æquo
+                                for (j = 1; j <= selected_count; j++) {
+                                    if (!index(overlap_indices, j " ") || selected_scores[j] == current_score) {
+                                        temp_selected[++new_selected_count] = j
+                                    }
+                                }
+                                
+                                # Mettre à jour la liste sélectionnée
+                                selected_count = 0
+                                for (j = 1; j <= new_selected_count; j++) {
+                                    idx = temp_selected[j]
+                                    selected_count++
+                                    selected_starts[selected_count] = selected_starts[idx]
+                                    selected_ends[selected_count] = selected_ends[idx]
+                                    selected_scores[selected_count] = selected_scores[idx]
+                                    selected_lines[selected_count] = selected_lines[idx]
+                                }
+                                add_to_selected(current_start, current_end, current_score, current_line)
+                            }
+                        }
+
+                        # Affichage des résultats
+                        for (j = 1; j <= selected_count; j++) {
+                            print selected_lines[j]
+                        }
+                    }
+                }
+
+                function add_to_selected(start, end, score, line) {
+                    selected_count++
+                    selected_starts[selected_count] = start
+                    selected_ends[selected_count] = end
+                    selected_scores[selected_count] = score
+                    selected_lines[selected_count] = line
+                }
+
+                function swap(arr, subject, i, j,    tmp) {
+                    tmp = arr[subject, i]
+                    arr[subject, i] = arr[subject, j]
+                    arr[subject, j] = tmp
+                }
+                ' \
     > "rps12_locate.hsps"
 
-    
     #
     # Extracting protein ids from selected blast HSPs
     #
@@ -127,6 +242,7 @@ blastx \
                   -v chloro="${QUERY}" \
                   -f $LIB_DIR/rps12_filter_3.awk
 
+    
     nrps12=$(ls -1 rps12_fragments_*.fasta | wc -l)
 
     if (( nrps12 > 1 )) ; then
@@ -161,6 +277,10 @@ blastx \
 
         n=0
         for f in *.res ; do
+            loginfo "processing $f"
+            xxx=$(cat $f)
+            echo -e "\n==============\n$xxx\n==============\n" 1>&2
+            
             ((n=n+1))
             mv $f $f.ori
             if [[ -z "$TEMP" ]] ; then
@@ -168,7 +288,9 @@ blastx \
             else
                 dest="$TEMP/$f"
             fi
+            loginfo "Destination file $dest"
             header=$(head -1 ${f/.rps12.res/.fasta})
+            loginfo "Header: $header"
             L2=$(sed -E 's/^.*limit=([0-9]+);.*$/\1/' <<< $header)
             S1=$(sed -E 's/^.*strand1=(R|F);.*$/\1/' <<< $header)
             S2=$(sed -E 's/^.*strand2=(R|F);.*$/\1/' <<< $header)
@@ -179,7 +301,7 @@ blastx \
             cat $f.ori \
             | $AwkCmd -v S1="$S1" -v F1="$F1" -v T1="$T1" \
                       -v S2="$S2" -v F2="$F2" -v T2="$T2" -v L2="$L2" \
-                      -f $LIB_DIR/rps12_filter_4.awk  \
+                      -f $LIB_DIR/rps12_filter_4.awk \
             | $AwkCmd '
               #
               # Normalize join(complement(A),complement(B),complement(C)) locations
@@ -201,7 +323,7 @@ blastx \
                     sub(positions,rexons,$0)
                 }
                 { print $0}
-            ' \
+            '  \
         | $AwkCmd '
           /^FT   [^ ]/ && (length($0) > 80) {
                  n = split($0,parts,",")
@@ -237,10 +359,9 @@ blastx \
             {
                 print $0
             }       
-        ' > $dest
-        done
-         
-    
+        '  > "$dest" 
+        done   
+
 popTmpDir
 
 exit 0
diff --git a/detectors/cds/bin/go_cds.sh b/detectors/cds/bin/go_cds.sh
index 990625a..b89f244 100755
--- a/detectors/cds/bin/go_cds.sh
+++ b/detectors/cds/bin/go_cds.sh
@@ -37,6 +37,8 @@ needfile "$Fasta"
 
 GenomeLength=$1; shift
 
+loginfo "Genome length set to : $GenomeLength bp"
+
 # Genome names is set from the base
 # name of the genome file without its extension
 Genome=$(basename ${Fasta%.*})
@@ -95,7 +97,7 @@ fi
 
 if [[ "$cdsdetection_pass2" == "yes" ]] ; then
     loginfo "running pass2:rps12 exonerate of $Genome on $DbRoot"
-    $PROG_DIR/do_rps12.sh $Fasta $GenomeLength $temp
+    "$PROG_DIR/do_rps12.sh" "$Fasta" "$GenomeLength" $temp
 fi
 
 #
@@ -109,7 +111,7 @@ if [[ "$cdsdetection_pass3" == "yes" ]] ; then
         loginfo $fams
         loginfo "running pass3:$dir exonerate of $Genome on $DbRoot"
         for f in $fams ; do
-          echo tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
+          tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
         done
     fi
     done) | parallel -j $Threads
diff --git a/org-annotate.sh b/org-annotate.sh
index df4e839..b7f44ca 100755
--- a/org-annotate.sh
+++ b/org-annotate.sh
@@ -507,9 +507,9 @@ pushTmpDir ORG.organnot
 		if [[ ! -z "${sequence}" ]] ; then
 			echo "${sequence}" > toannotate.fasta
 			sl=$(seqlength "toannotate.fasta")
-			
+		
 			if (( sl >= minlength )) ; then
-			
+				loginfo "Annotated genome length: $sl bp"	
 				seqid=$($AwkCmd '(NR==1) {print substr($1,2,1000)}' toannotate.fasta)
 				# seqid=$(tr "." "_" <<< ${seqid})