From 466308267e6b80f47e8923fb43caf5389d0f0ea5 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 2 May 2016 15:32:28 +0200 Subject: [PATCH] Add a patch for chloroplast annotation when no inverted repeats are detected Former-commit-id: 7e3ddd41cf0d0788223382fedbf45b183974233e Former-commit-id: e5a8ceb825f78d243e37d22cd6b2e91f403c0ee8 --- detectors/ir/bin/go_ir.sh | 54 +++--- detectors/normalize/bin/go_normalize.sh | 214 ++++++++++++----------- detectors/normalize/lib/lookforIR.lib.sh | 9 +- 3 files changed, 146 insertions(+), 131 deletions(-) diff --git a/detectors/ir/bin/go_ir.sh b/detectors/ir/bin/go_ir.sh index 2f7a0bb..4993096 100755 --- a/detectors/ir/bin/go_ir.sh +++ b/detectors/ir/bin/go_ir.sh @@ -29,35 +29,37 @@ pushTmpDir ORG.ir loginfo " --> $genome_length bp" loginfo "Done" - IR=( $(lookForIR ${QUERY}) ) - - posIR1=${IR[4]} - posIR2=${IR[6]} - - let "lenIR= ( ${IR[5]} + ${IR[7]} ) / 2 " + IRDetected=1 + IR=( $(lookForIR ${QUERY}) ) || IRDetected=0 - let "endIR2=$posIR2 + $lenIR - 1" - let "endIR1=$posIR1 + $lenIR - 1" - - beginLSC=1 - let "endLSC=$posIR1-1" - - - let "beginSSC=$endIR1+1" - let "endSSC=$posIR2-1" + if (( IRDetected == 1 )) ; then + posIR1=${IR[4]} + posIR2=${IR[6]} + + let "lenIR= ( ${IR[5]} + ${IR[7]} ) / 2 " + + let "endIR2=$posIR2 + $lenIR - 1" + let "endIR1=$posIR1 + $lenIR - 1" + + beginLSC=1 + let "endLSC=$posIR1-1" - echo "FT misc_feature ${beginLSC}..${endLSC}" - echo "FT /note=\"large single copy region (LSC)\"" - echo "FT repeat_region ${posIR1}..${endIR1}" - echo "FT /rpt_type=INVERTED" - echo "FT /note=\"left inverted repeat B; IRB\"" - echo "FT misc_feature ${beginSSC}..${endSSC}" - echo "FT /note=\"small single copy region (SSC)\"" - echo "FT repeat_region ${posIR2}..${endIR2}" - echo "FT /rpt_type=INVERTED" - echo "FT /note=\"left inverted repeat A; IRA\"" - + let "beginSSC=$endIR1+1" + let "endSSC=$posIR2-1" + + + echo "FT misc_feature ${beginLSC}..${endLSC}" + echo "FT /note=\"large single copy region (LSC)\"" + echo "FT repeat_region ${posIR1}..${endIR1}" + echo "FT /rpt_type=INVERTED" + echo "FT /note=\"left inverted repeat B; IRB\"" + echo "FT misc_feature ${beginSSC}..${endSSC}" + echo "FT /note=\"small single copy region (SSC)\"" + echo "FT repeat_region ${posIR2}..${endIR2}" + echo "FT /rpt_type=INVERTED" + echo "FT /note=\"left inverted repeat A; IRA\"" + fi popTmpDir diff --git a/detectors/normalize/bin/go_normalize.sh b/detectors/normalize/bin/go_normalize.sh index 0fbd8b9..3da5f8a 100755 --- a/detectors/normalize/bin/go_normalize.sh +++ b/detectors/normalize/bin/go_normalize.sh @@ -41,128 +41,134 @@ pushTmpDir ORG.normalize loginfo " --> $genome_length bp" loginfo "Done" - IR=( $(lookForIR ${QUERY}) ) + IRDetected=1 + IR=( $(lookForIR ${QUERY}) ) || IRDetected=0 - posIR1=${IR[4]} - posIR2=${IR[6]} + if (( IRDetected == 1 )) ; then - let "lenIR= ( ${IR[5]} + ${IR[7]} ) / 2 " - - let "endIR2=$posIR2 + $lenIR - 1" - let "endIR1=$posIR1 + $lenIR - 1" - - if (( "$endIR2" >= "$genome_length" )) ; then - loginfo "IRB is at the end of the original sequence" + posIR1=${IR[4]} + posIR2=${IR[6]} - # - # We just move the IRB at the begining of the sequence - # - - # Extract the IRB sequence - let "posCut=($endIR1+$posIR2)/2" - cutseq ${QUERY} ${posCut} ${genome_length} > ${tmpfasta1} - - # Append the remaining part of the genome - let "posCut=$posCut-1" - cutseq ${QUERY} 1 ${posCut} >> ${tmpfasta1} - - # merges both the parts - joinfasta ${tmpfasta1} > ${tmpfasta2} - rm -f ${tmpfasta1} - QUERY=${tmpfasta2} - - loginfo "Recomputing location of the IR..." - declare -a IR=( $(lookForIR ${QUERY}) ) - loginfo "Done" - - posIR1="${IR[4]}" - posIR2="${IR[6]}" - - let "lenIR=(${IR[5]} + ${IR[7]}) / 2 " + let "lenIR= ( ${IR[5]} + ${IR[7]} ) / 2 " let "endIR2=$posIR2 + $lenIR - 1" let "endIR1=$posIR1 + $lenIR - 1" - fi + if (( "$endIR2" >= "$genome_length" )) ; then + loginfo "IRB is at the end of the original sequence" + + # + # We just move the IRB at the begining of the sequence + # + + # Extract the IRB sequence + let "posCut=($endIR1+$posIR2)/2" + cutseq ${QUERY} ${posCut} ${genome_length} > ${tmpfasta1} - tmpIR1="tmp_$$_IR1.fasta" - tmpIR2="tmp_$$_IR2.fasta" + # Append the remaining part of the genome + let "posCut=$posCut-1" + cutseq ${QUERY} 1 ${posCut} >> ${tmpfasta1} + + # merges both the parts + joinfasta ${tmpfasta1} > ${tmpfasta2} + rm -f ${tmpfasta1} + QUERY=${tmpfasta2} - #enregistre les deux fragments IRa et IRb complet - cutseq ${QUERY} ${posIR1} ${endIR1} > ${tmpIR1} - cutseq ${QUERY} ${posIR2} ${endIR2} > ${tmpIR2} - - let "lenSC1=$posIR1 -1 + ($genome_length - endIR2)" - let "lenSC2=$posIR2 - $endIR1" - - center="${IR[0]}" + loginfo "Recomputing location of the IR..." + declare -a IR=( $(lookForIR ${QUERY}) ) + loginfo "Done" + + posIR1="${IR[4]}" + posIR2="${IR[6]}" + + let "lenIR=(${IR[5]} + ${IR[7]}) / 2 " - tmpLSC="tmp_$$_LSC.fasta" - tmpSSC="tmp_$$_SSC.fasta" - - # Extract the central SC present in between the two IRs - # considering it as LSC - - let "beginLSC=$endIR1+1" - let "endLSC=$posIR2-1" - cutseq ${QUERY} ${beginLSC} ${endLSC} > ${tmpLSC} - - strandLSC="${IR[1]}" - - - # Extract the external SC present in two parts - # Considering it as SSC - - let "beginSSC=$endIR2+1" - cutseq ${QUERY} ${beginSSC} ${genome_length} > ${tmpSSC} - - let "endSSC=$posIR1-1" - cutseq ${QUERY} 1 ${endSSC} >> ${tmpSSC} - - joinfasta ${tmpSSC} > ${tmpfasta1} - mv ${tmpfasta1} ${tmpSSC} - - strandSSC="${IR[3]}" - - - - if [[ "$center" == "SSC" ]]; then - - # Actually this is the oposite LSC is SSC and SSC is LSC - - # Exchanges the SSC and LSC sequences - mv ${tmpSSC} ${tmpfasta1} - mv ${tmpLSC} ${tmpSSC} - mv ${tmpfasta1} ${tmpLSC} + let "endIR2=$posIR2 + $lenIR - 1" + let "endIR1=$posIR1 + $lenIR - 1" + + fi - # Exchanges the IRa and IRb sequences - mv ${tmpIR1} ${tmpfasta1} - mv ${tmpIR2} ${tmpIR1} - mv ${tmpfasta1} ${tmpIR2} + tmpIR1="tmp_$$_IR1.fasta" + tmpIR2="tmp_$$_IR2.fasta" - # Exchanges the strand of both the Single copies - tmp=${strandSSC} - strandSSC=${strandLSC} - strandLSC=${tmp} + #enregistre les deux fragments IRa et IRb complet + cutseq ${QUERY} ${posIR1} ${endIR1} > ${tmpIR1} + cutseq ${QUERY} ${posIR2} ${endIR2} > ${tmpIR2} - fi + let "lenSC1=$posIR1 -1 + ($genome_length - endIR2)" + let "lenSC2=$posIR2 - $endIR1" + + center="${IR[0]}" + + tmpLSC="tmp_$$_LSC.fasta" + tmpSSC="tmp_$$_SSC.fasta" + + # Extract the central SC present in between the two IRs + # considering it as LSC - # Reverse complement the SSC if needed - if [[ "${strandSSC}" == "-" ]]; then - fastarevcomp -f ${tmpSSC} > ${tmpfasta1} + let "beginLSC=$endIR1+1" + let "endLSC=$posIR2-1" + cutseq ${QUERY} ${beginLSC} ${endLSC} > ${tmpLSC} + + strandLSC="${IR[1]}" + + + # Extract the external SC present in two parts + # Considering it as SSC + + let "beginSSC=$endIR2+1" + cutseq ${QUERY} ${beginSSC} ${genome_length} > ${tmpSSC} + + let "endSSC=$posIR1-1" + cutseq ${QUERY} 1 ${endSSC} >> ${tmpSSC} + + joinfasta ${tmpSSC} > ${tmpfasta1} mv ${tmpfasta1} ${tmpSSC} - fi + + strandSSC="${IR[3]}" + + - # Reverse complement the LSC if needed - if [[ "${strandLSC}" == "-" ]]; then - fastarevcomp -f ${tmpLSC} > ${tmpfasta1} - mv ${tmpfasta1} ${tmpLSC} - fi + if [[ "$center" == "SSC" ]]; then + + # Actually this is the oposite LSC is SSC and SSC is LSC - # Merges the four parts of the genome. - cat ${tmpLSC} ${tmpIR2} ${tmpSSC} ${tmpIR1} | joinfasta + # Exchanges the SSC and LSC sequences + mv ${tmpSSC} ${tmpfasta1} + mv ${tmpLSC} ${tmpSSC} + mv ${tmpfasta1} ${tmpLSC} + + # Exchanges the IRa and IRb sequences + mv ${tmpIR1} ${tmpfasta1} + mv ${tmpIR2} ${tmpIR1} + mv ${tmpfasta1} ${tmpIR2} + + # Exchanges the strand of both the Single copies + tmp=${strandSSC} + strandSSC=${strandLSC} + strandLSC=${tmp} + + fi + + # Reverse complement the SSC if needed + if [[ "${strandSSC}" == "-" ]]; then + fastarevcomp -f ${tmpSSC} > ${tmpfasta1} + mv ${tmpfasta1} ${tmpSSC} + fi + + # Reverse complement the LSC if needed + if [[ "${strandLSC}" == "-" ]]; then + fastarevcomp -f ${tmpLSC} > ${tmpfasta1} + mv ${tmpfasta1} ${tmpLSC} + fi + + # Merges the four parts of the genome. + cat ${tmpLSC} ${tmpIR2} ${tmpSSC} ${tmpIR1} | joinfasta - + else + # No IR detected --> normalization has no effect + cat ${QUERY} + fi popTmpDir exit 0 diff --git a/detectors/normalize/lib/lookforIR.lib.sh b/detectors/normalize/lib/lookforIR.lib.sh index 94ea75b..df0a22d 100644 --- a/detectors/normalize/lib/lookforIR.lib.sh +++ b/detectors/normalize/lib/lookforIR.lib.sh @@ -38,9 +38,16 @@ function lookForIR { loginfo "Looking for long inverted repeats..." repseek -c -p 0.001 -i ${QUERY} 2>> /dev/null > ${REPEATS} - loginfo " --> $(wc -l ${REPEATS} | awk '{print $1}') repeats identified" + nrepeat="$(wc -l ${REPEATS} | awk '{print $1}')" loginfo "Done" + if (( nrepeat == 0 )) ; then + logwarning "No inverted repeat identified" + return 1 + fi + + loginfo " --> ${nrepeat} repeats identified" + loginfo "Marking and selecting the best inverted repeat..." local IR=( $(${SELECTIR} ${MATCHES} ${REPEATS}) ) loginfo "Done"