diff --git a/detectors/ir/.DS_Store b/detectors/ir/.DS_Store new file mode 100644 index 0000000..3b80de6 Binary files /dev/null and b/detectors/ir/.DS_Store differ diff --git a/detectors/ir/bin/go_ir.sh b/detectors/ir/bin/go_ir.sh new file mode 100755 index 0000000..181913b --- /dev/null +++ b/detectors/ir/bin/go_ir.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# +# Annotate the Inverted Repeats of a plastide genome +# +#======================================================================================== +# +# The SSC and LSC are approximatively mapped by similarity with a reference database +# Inverted repeats (IRs) are identified for maximizing the segregation between +# LSC and SSC match +# +# +# go_normalize.sh +# +# - : The fasta file containing the genome to normalize +# +# Results are printed to the standart output +# +#======================================================================================== + +# -- CAUTION -- Works as long than the script +# is not called through a symlink +SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})" +source ${SCRIPT_DIR}/../../normalize/lib/lookforIR.lib.sh + +pushTmpDir ORG.ir + + loginfo "Computing the genome size..." + genome_length=$(seqlength $QUERY) + loginfo " --> $genome_length bp" + loginfo "Done" + + IR=( $(lookForIR ${QUERY}) ) + + posIR1=${IR[4]} + posIR2=${IR[6]} + + let "lenIR= ( ${IR[5]} + ${IR[7]} ) / 2 " + + let "endIR2=$posIR2 + $lenIR - 1" + let "endIR1=$posIR1 + $lenIR - 1" + + beginLSC=1 + let "endLSC=$posIR1-1" + + + let "beginSSC=$endIR1+1" + let "endSSC=$posIR2-1" + + + echo "FT misc_feature ${beginLSC}..${endLSC}" + echo "FT /note=\"large single copy region (LSC)\"" + echo "FT repeat_region ${posIR1}..${endIR1}" + echo "FT /rpt_type=INVERTED" + echo "FT /note=\"left inverted repeat B; IRB\"" + echo "FT misc_feature ${beginSSC}..${endSSC}" + echo "FT /note=\"small single copy region (SSC)\"" + echo "FT repeat_region ${posIR2}..${endIR2}" + echo "FT /rpt_type=INVERTED" + echo "FT /note=\"left inverted repeat A; IRA\"" + + +popTmpDir + +exit 0 diff --git a/detectors/normalize/bin/go_normalize.sh b/detectors/normalize/bin/go_normalize.sh index 967c399..b772fae 100755 --- a/detectors/normalize/bin/go_normalize.sh +++ b/detectors/normalize/bin/go_normalize.sh @@ -3,104 +3,37 @@ # NORMALISATION D'UN PLASTIDE # #======================================================================================== -# Ce programme dispose de 4 fonctions pour traiter les donnees fasta issues de genbank -# - seqlength : compte le nombre de paire de base du fichier -# ex : seqlength $1 # -# - cutseq : permet de couper un morceau de la sequence -# cutseq [x] [y] -# [x] : coordonne du debut de la sequence a couper -# [y] : coordonne de la fin de la sequence a couper -# ex : cutseq $1 10 100 +# Normalize the way the chloroplaste genome sequence is linearized in the fasta file +# The normalized sequence is: # +# LSC + IRB + SSC + IRA # -# - revcomp : donne le brin reverse -# ex : $1 | revcomp +# The SSC and LSC are approximatively mapped by similarity with a reference database +# Inverted repeats (IRs) are identified for maximizing the segregation between +# LSC and SSC match +# # -# - formatfasta : permet de coller a la suite plusieurs morceaux de sequence au moment de -# la reecriture -# - joinfasta : enleve les titres au moment de la reecriture du fichier et renvoie les -# informations dans la fonction formatfasta -# ex : joinfasta $1 +# go_normalize.sh # -#======================================================================================== -# Pour lancer le programme, utiliser les commandes : -# chmod +x normalize_plastid.sh -#./normalize_plastid.sh [fichier].fasta +# - : The fasta file containing the genome to normalize # -# ex : seqlength $1 +# Results are printed to the standart output # -# cutseq $1 [x] [y] -# [x]:coordonne du debut [y]:coordonne de la fin de la sequence a couper -# ex : cutseq $1 10 100 -# -# ex : $1 | revcomp -# -# ex : joinfasta $1 #======================================================================================== # -- CAUTION -- Works as long than the script # is not called through a symlink SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})" -source "${SCRIPT_DIR}/../../../scripts/bash_init.sh" +source ${SCRIPT_DIR}/../lib/lookforIR.lib.sh -function lookForIR { - - local QUERY="$1" - local MATCHES=$(basename ${QUERY}) - MATCHES="${MATCHES/.*/.matches}" - - local REPEATS="${MATCHES/.*/.repseek}" - - loginfo "Locating SSC and LSC by similarity..." - blastn -db ${SCDB} \ - -query ${QUERY} \ - -outfmt 6 \ - -max_target_seqs 10000 | \ - awk '($4 > 1000) && ($3>80) { \ - SAME=(($7 < $8) && ($9 < $10)) || (($7 > $8) && ($9 > $10)); \ - if ($7 < $8) \ - {print substr($2,1,3),$7,$8,SAME} \ - else \ - {print substr($2,1,3),$8,$7,SAME}}' | \ - sort -nk 2 > ${MATCHES} - loginfo "Done" - - loginfo "Looking for long inverted repeats..." - repseek -c -p 0.001 -i ${QUERY} 2>> /dev/null > ${REPEATS} - loginfo " --> $(wc -l ${REPEATS} | awk '{print $1}') repeats identified" - loginfo "Done" - - loginfo "Marking and selecting the best inverted repeat..." - local IR=( $(${PROG_DIR}/selectIR.py ${MATCHES} ${REPEATS}) ) - loginfo "Done" - - loginfo " --> IR size : IRa = ${IR[5]} / IRb = ${IR[7]}" - loginfo " --> IR Score: ${IR[8]}" - - let "deltaIR=${IR[5]} - ${IR[7]}" - - if (( $deltaIR < -10 )) || (( $deltaIR > 10 )); then - logwarning "Differences between IR lengths ($deltaIR) is greater than 10" - fi - - - echo "${IR[@]}" -} pushTmpDir ORG.normalize - SCDB="${IR_DATA_DIR}/SC_RefDB" - QUERY="${CALL_DIR}/$1" - MATCHES="${1/.*/.matches}" - REPEATS="${1/.*/.repseek}" - tmpfasta1="tmp_$$_1.fasta" tmpfasta2="tmp_$$_2.fasta" - openLogFile "${QUERY/.*/.log}" - loginfo "Computing the genome size..." genome_length=$(seqlength $QUERY) loginfo " --> $genome_length bp" @@ -136,7 +69,7 @@ pushTmpDir ORG.normalize rm -f ${tmpfasta1} QUERY=${tmpfasta2} - loginfo "Recompute location of the IR..." + loginfo "Recomputing location of the IR..." declare -a IR=( $(lookForIR ${QUERY}) ) loginfo "Done" @@ -224,7 +157,7 @@ pushTmpDir ORG.normalize fi # Merges the four parts of the genome. - cat ${tmpSSC} ${tmpIR1} ${tmpLSC} ${tmpIR2} | joinfasta + cat ${tmpLSC} ${tmpIR2} ${tmpSSC} ${tmpIR1} | joinfasta diff --git a/detectors/normalize/lib/lookforIR.lib.sh b/detectors/normalize/lib/lookforIR.lib.sh new file mode 100644 index 0000000..31dd9da --- /dev/null +++ b/detectors/normalize/lib/lookforIR.lib.sh @@ -0,0 +1,53 @@ +source "${SCRIPT_DIR}/../../../scripts/bash_init.sh" + +SELECTIR="${PROG_DIR}/../../normalize/lib/selectIR.py" + +function lookForIR { + + local QUERY="$1" + local MATCHES=$(basename ${QUERY}) + MATCHES="${MATCHES/.*/}.matches" + + local REPEATS="${MATCHES/.*/}.repseek" + + loginfo "Locating SSC and LSC by similarity..." + blastn -db ${SCDB} \ + -query ${QUERY} \ + -outfmt 6 \ + -max_target_seqs 10000 | \ + awk '($4 > 1000) && ($3>80) { \ + SAME=(($7 < $8) && ($9 < $10)) || (($7 > $8) && ($9 > $10)); \ + if ($7 < $8) \ + {print substr($2,1,3),$7,$8,SAME} \ + else \ + {print substr($2,1,3),$8,$7,SAME}}' | \ + sort -nk 2 > ${MATCHES} + loginfo "Done" + + loginfo "Looking for long inverted repeats..." + repseek -c -p 0.001 -i ${QUERY} 2>> /dev/null > ${REPEATS} + loginfo " --> $(wc -l ${REPEATS} | awk '{print $1}') repeats identified" + loginfo "Done" + + loginfo "Marking and selecting the best inverted repeat..." + local IR=( $(${SELECTIR} ${MATCHES} ${REPEATS}) ) + loginfo "Done" + + loginfo " --> IR size : IRa = ${IR[5]} / IRb = ${IR[7]}" + loginfo " --> IR Score: ${IR[8]}" + + let "deltaIR=${IR[5]} - ${IR[7]}" + + if (( $deltaIR < -10 )) || (( $deltaIR > 10 )); then + logwarning "Differences between IR lengths ($deltaIR) is greater than 10" + fi + + + echo "${IR[@]}" +} + +SCDB="${IR_DATA_DIR}/SC_RefDB" +QUERY="${CALL_DIR}/$1" + + +openLogFile "${QUERY/.*/}.log" diff --git a/detectors/normalize/bin/selectIR.py b/detectors/normalize/lib/selectIR.py similarity index 100% rename from detectors/normalize/bin/selectIR.py rename to detectors/normalize/lib/selectIR.py