#!/bin/bash # # Annotate the RPS12 gene of a plastide genome # #======================================================================================== # # The RPS12 gene is one of the CDS coding for a riboosomal protein # Depending on the species, the gene is constituted of oone to three exons. # The exon one is not located close to the others and a trans-splicing is needed # to reconstruct the spliced mRNA. The exons 2 and eventuually 3 can be located in the # inverted repeats (IRs) and therfore they can exist in two copies. This can lead to two # ways to annotate RPS12 # # # go_rps12.sh # # - : The fasta file containing the normalized genome to annotate # # Results are printed to the standart output # #======================================================================================== # -- CAUTION -- Works as long than the script # is not called through a symlink THIS_DIR="$(dirname ${BASH_SOURCE[0]})" source "${THIS_DIR}/../../../scripts/bash_init.sh" if [[ ! "$1" =~ ^/ ]]; then QUERY="${CALL_DIR}/$1" else QUERY="$1" fi shift GENOME_LENGTH="$1" if (( $# > 1 )) ; then TEMP=$2 else TEMP="" fi DBROOT="$CDS_DATA_DIR/sp_chlorodb/RPS12" RPS12DB="${DBROOT}/rps12.fst" DELTA=50 AnnotFile="$CDS_DATA_DIR/sp_chlorodb/Annot.lst" ModelsDir="$CDS_DATA_DIR/sp_chlorodb/models" SEQLEN=$(seqlength "${QUERY}") pushTmpDir ORG.RPS12 # localize the gene on the chloroplast genome using blast loginfo "Locating RPS12 gene by similarity..." loginfo " Considered genome length : $GENOME_LENGTH" blastx \ -query ${QUERY} \ -db ${RPS12DB} \ -query_gencode 11 \ -outfmt 7 \ | $AwkCmd ' # Blast HSPs are filtered to keep only # at maximum the 20 first ones having an # e-value below 1e-20 BEGIN {BEST_EVAL = 1e-40; OUT = 0} /^#/ {next} ($2 == PREV_CDS) && (($11 + 0.0) < (1e-5 + 0.0)) { HSPs = HSPs "\n" $0; } (OUT < 20) && ($2 != PREV_CDS) && (BEST_EVAL < (1e-20 + 0.0)) { if (PREV_CDS) print HSPs; HSPs = $0; BEST_EVAL = 1; PREV_CDS = $2; OUT++ } {PREV_CDS = $2;} (BEST_EVAL > ($11 + 0.0)) {BEST_EVAL = ($11 + 0.0)} ' \ | $AwkCmd -v glength="${GENOME_LENGTH}" \ '!(($7 + 0) > (glength + 0) && $8 + (0 > glength + 0))' \ | $AwkCmd ' BEGIN { FS = OFS = "\t" } { subject = $2 bitscore = $12 + 0 s_start = $9 + 0 s_end = $10 + 0 # Ajuster l ordre des coordonnées actual_start = (s_start < s_end) ? s_start : s_end actual_end = (s_start < s_end) ? s_end : s_start # Stocker les données count[subject]++ idx = count[subject] starts[subject, idx] = actual_start ends[subject, idx] = actual_end scores[subject, idx] = bitscore lines[subject, idx] = $0 } END { for (subject in count) { n = count[subject] # Tri par score (décroissant) puis position (croissante) for (i = 1; i <= n; i++) { for (j = i + 1; j <= n; j++) { if (scores[subject, i] < scores[subject, j] || (scores[subject, i] == scores[subject, j] && starts[subject, i] > starts[subject, j])) { swap(starts, subject, i, j) swap(ends, subject, i, j) swap(scores, subject, i, j) swap(lines, subject, i, j) } } } # Sélection des HSPs optimaux selected_count = 0 delete selected_starts delete selected_ends delete selected_scores delete selected_lines for (i = 1; i <= n; i++) { current_start = starts[subject, i] current_end = ends[subject, i] current_score = scores[subject, i] current_line = lines[subject, i] # Vérifier les chevauchements avec les HSPs sélectionnés overlap_max = 0 overlap_indices = "" for (j = 1; j <= selected_count; j++) { os = (current_start > selected_starts[j]) ? current_start : selected_starts[j] oe = (current_end < selected_ends[j]) ? current_end : selected_ends[j] if (os <= oe && (oe - os + 1) >= 15) { if (selected_scores[j] > overlap_max) overlap_max = selected_scores[j] overlap_indices = overlap_indices j " " } } # Appliquer les règles de sélection if (overlap_indices == "") { # Aucun chevauchement : sélectionner add_to_selected(current_start, current_end, current_score, current_line) } else if (current_score >= overlap_max) { # Supprimer les HSPs moins bons et conserver les ex æquo new_selected_count = 0 delete temp_selected # Garder les non-chevauchants et les ex æquo for (j = 1; j <= selected_count; j++) { if (!index(overlap_indices, j " ") || selected_scores[j] == current_score) { temp_selected[++new_selected_count] = j } } # Mettre à jour la liste sélectionnée selected_count = 0 for (j = 1; j <= new_selected_count; j++) { idx = temp_selected[j] selected_count++ selected_starts[selected_count] = selected_starts[idx] selected_ends[selected_count] = selected_ends[idx] selected_scores[selected_count] = selected_scores[idx] selected_lines[selected_count] = selected_lines[idx] } add_to_selected(current_start, current_end, current_score, current_line) } } # Affichage des résultats for (j = 1; j <= selected_count; j++) { print selected_lines[j] } } } function add_to_selected(start, end, score, line) { selected_count++ selected_starts[selected_count] = start selected_ends[selected_count] = end selected_scores[selected_count] = score selected_lines[selected_count] = line } function swap(arr, subject, i, j, tmp) { tmp = arr[subject, i] arr[subject, i] = arr[subject, j] arr[subject, j] = tmp } ' \ > "rps12_locate.hsps" # # Extracting protein ids from selected blast HSPs # $AwkCmd '{print $2}' "rps12_locate.hsps" \ | sort \ | uniq > "dbsel.txt" # # Extract corresponding protein sequences # from the RPS12 database. # mkdir -p RPS12 $AwkCmd -v FILE="dbsel.txt" \ -f $LIB_DIR/subdb.awk ${RPS12DB} \ > "RPS12/rps12.fasta" cat "rps12_locate.hsps" \ | $AwkCmd '# Normalizes the writing of the forward and reverse strand matches ($7 <= $8) {print $7,$8,$9,$10,"F"} ($7 > $8) {print $8,$7,$9,$10,"R"}' \ | sort -n \ | uniq \ | $AwkCmd -f $LIB_DIR/rps12_filter_1.awk \ | sort -nk 3 \ | $AwkCmd '($3 != old3 || $4 != old4) { i++ old3=$3 old4=$4 } {print $0,i} ' \ | sort -nk 6 \ | $AwkCmd -f $LIB_DIR/rps12_filter_2.awk \ | $AwkCmd -v delta="$DELTA" \ -v seqlen="$SEQLEN" \ -v chloro="${QUERY}" \ -f $LIB_DIR/rps12_filter_3.awk nrps12=$(ls -1 rps12_fragments_*.fasta | wc -l) if (( nrps12 > 1 )) ; then message="$nrps12 versions" else message="$nrps12 version" fi loginfo "$message of the gene rps12 detected." # # Run exonarate on every fragment of chloroplast # # It should be one or two fragments # export PASS1_SPEEDUP=0 nbseq=0 for fasta in rps12_fragments_*.fasta ; do tcsh -f ${PROG_DIR}/do_exonerate.csh \ Pass2 \ $fasta \ "RPS12/rps12.fasta" \ $AnnotFile \ $ModelsDir $(pwd) ((nbseq=nbseq+1)) done # # Rewrite the coordinates of the genes on the extracted # fragment to the chloroplaste genome coordinates # n=0 for f in *.res ; do loginfo "processing $f" xxx=$(cat $f) echo -e "\n==============\n$xxx\n==============\n" 1>&2 ((n=n+1)) mv $f $f.ori if [[ -z "$TEMP" ]] ; then dest="/dev/stdout" else dest="$TEMP/$f" fi loginfo "Destination file $dest" header=$(head -1 ${f/.rps12.res/.fasta}) loginfo "Header: $header" L2=$(sed -E 's/^.*limit=([0-9]+);.*$/\1/' <<< $header) S1=$(sed -E 's/^.*strand1=(R|F);.*$/\1/' <<< $header) S2=$(sed -E 's/^.*strand2=(R|F);.*$/\1/' <<< $header) F1=$(sed -E 's/^.*from1=([0-9]+);.*$/\1/' <<< $header) F2=$(sed -E 's/^.*from2=([0-9]+);.*$/\1/' <<< $header) T1=$(sed -E 's/^.*to1=([0-9]+);.*$/\1/' <<< $header) T2=$(sed -E 's/^.*to2=([0-9]+);.*$/\1/' <<< $header) cat $f.ori \ | $AwkCmd -v S1="$S1" -v F1="$F1" -v T1="$T1" \ -v S2="$S2" -v F2="$F2" -v T2="$T2" -v L2="$L2" \ -f $LIB_DIR/rps12_filter_4.awk \ | $AwkCmd ' # # Normalize join(complement(A),complement(B),complement(C)) locations # into complement(join(C,B,A)) # /join\((complement\([0-9]+\.\.[0-9]+\),)+complement\([0-9]+\.\.[0-9]+\)\)/ \ { sub(/join\(complement/,"complement(join",$0) gsub(/\),complement\(/,",",$0) match($0,/[0-9]+\.\.[0-9]+(,[0-9]+\.\.[0-9]+)*/) positions=substr($0,RSTART,RLENGTH) n = split(positions,exons,",") for (i=1; i<=n; i++) { if (i > 1) rexons = exons[i] "," rexons else rexons = exons[i] } sub(positions,rexons,$0) } { print $0} ' \ | $AwkCmd ' /^FT [^ ]/ && (length($0) > 80) { n = split($0,parts,",") j = 1 for (i = 1; i <= n; i++) { if (length(line) + length(parts[i]) > 78) { print line "," line = "FT " j = i } if (i > j) line = line "," line = line parts[i] } $0 = line } {print $0} ' | \ $AwkCmd -v n=$n -v nbseq=$nbseq ' /^FT +\/gene="rps12"/ && (nbseq > 1) { sub(/rps12/,"rps12_" n,$0) } { print $0 } ' | \ $AwkCmd ' # # Adds the trans_splicing qualifier # /^FT \/translation=/ { print "FT /trans_splicing" } { print $0 } ' > "$dest" done popTmpDir exit 0 # NC_010654.fst # location=complement(join(77925..77967,78465..78700,52867..52980)); # location=join(complement(52867..52980),109583..109818,110316..110358); # 52837 52980 1 48 R 1 # 77928 77981 113 130 R 3 # 78458 78712 39 132 R 2 # 109571 109825 39 132 F 2 # 110302 110355 113 130 F 3 # /translation="MPTNPQLIRDARQQKKKKRGSRGLQRCPQRRGVCARVYNINPKK # --> MPTNPQLIRDARQQKKKKRGSRGLQRCPQRRGVCARVSNINPKK # ==> MPTNPQLIRDARQQKKKKRGSRGLQRCPQRRGVCARVSNINPKK # PNSALRKVARVRLTSGFEITAYIPGIGHNLQEHSVVLVRGGRVKDLPGVKYRIVRGTL # --> PNSALRKVARVRLTSGFEITAYIPGIGHNLQEHSVVLVRGGRVKDLPGVKYRIVRGTL # ==> PNSALRKVARVRLTSGFEITAYIPGIGHNLQEHSVVLVRGGRVKDLPGVKYRIVRGTL # DAVAVKNRQQGRSSAIWSQKAEKKVIHF" # --> DAVAVKNRQQGRSSAIWSQKAEKKVIHF # ==> DAVAVKNRQQGRSSAIWSQKAEKKVIHF # ADL.norm.fasta # 69300 69425 1 42 R 1 # 97365 97670 36 137 R 2 # 130601 130906 36 137 F 2 # NC_008822 # location=90942..91313; # 90942 91310 1 123 F 1 # location=complement(join(77925..77967,78465..78700,52867..52980)); # >RPS12_1 parts=2 limit=255; from1=52787; to1=53030; strand1=R; from2=77878; to2=78762; strand2=R; # join(51..159,312..553,1051..1092) # location=join(complement(52867..52980),109583..109818,110316..110358); # >RPS12_2 parts=2 limit=254; from1=52787; to1=53030; strand1=R; from2=109521; to2=110405; strand2=F; # join\((complement\([0-9]+\.\.[0-9]+\),)+complement\([0-9]+\.\.[0-9]+\)\) # cat NC_010654.annot.embl | awk -v tag="agser" 'BEGIN {n=0} /\locus_tag/ {n++; sub(/""/, sprintf("\"%s%04d\"",tag,n),$0)} {print $0}' | less # BRR.chloro_1 NC_018117_rps12_2 96.08 102 4 0 99022 98717 36 137 9e-57 195 # BRR.chloro_1 NC_018117_rps12_2 96.08 102 4 0 141187 141492 36 137 9e-57 195 # BRR.chloro_1 NC_018117_rps12_2 94.87 39 2 0 70611 70495 1 39 2e-16 78.6 # /trans_splicing