From c32f7cdde6f40efe49a2d19198dcac455c65fdcb Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Sun, 11 Oct 2015 10:39:59 -0300 Subject: [PATCH] First version of the tRNA detector and of the global organnot.sh script Former-commit-id: f2a75cf99b24875c90c426c2afb22a75b972bf60 Former-commit-id: 65e3dfb35df06ca69bb29b690c9a40e8940ac6bf --- annotate_plastid.sh | 96 -------------------- detectors/ir/bin/go_ir.sh | 2 +- detectors/normalize/bin/go_normalize.sh | 2 + detectors/normalize/lib/lookforIR.lib.sh | 8 +- detectors/normalize/tools/buildSCDB.sh | 8 +- detectors/trna/bin/go_trna.sh | 42 +++++++++ detectors/trna/{ => lib}/aragorn_wrapper.awk | 70 +++++++++----- organnote.sh | 47 ++++++++++ scripts/bash_init.sh | 33 +++++-- 9 files changed, 175 insertions(+), 133 deletions(-) delete mode 100755 annotate_plastid.sh create mode 100755 detectors/trna/bin/go_trna.sh rename detectors/trna/{ => lib}/aragorn_wrapper.awk (77%) create mode 100755 organnote.sh diff --git a/annotate_plastid.sh b/annotate_plastid.sh deleted file mode 100755 index 62c5fe7..0000000 --- a/annotate_plastid.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash -# -# - -export ORGANNOT_HOME=`dirname $0` - -REPSEEK=${ORGANNOT_HOME}/repseek -SUMATRA=${ORGANNOT_HOME}/sumatra -ARAGORN=${ORGANNOT_HOME}/aragorn -WRAPARAGORN=${ORGANNOT_HOME}/aragorn_wrapper.awk -ECOFIND=${ORGANNOT_HOME}/ecofind - -function annotateCAU { - QUERY="$$.query.fasta" - echo $1 | sed 's/&/ /' | tr '@' '\n' > ${QUERY} - ${SUMATRA} -d -n ${QUERY} $2 2> /dev/null | \ - awk ' {n[$2]+=1;d[$2]+=$3} \ - END {for (i in n) \ - print i, n[i],d[i], d[i]/n[i]\ - }' | \ - sort -rnk4 | \ - egrep '^trn(I|M|fM)' | \ - tail -1 | \ - awk '{print $1,$NF}' - rm -rf ${QUERY} -} - -function gffTRNA { - - ${ARAGORN} -w -io -seq $3 | awk -v gid=${1} -f ${WRAPARAGORN} - -} - -# s'alimente avec un fichier.fasta -# $3 : nb de caractere du fichier, t : nb de caractere du titre, -# $1+1 : nb de retour chariot du fichier -function seqlength { - cat $1 | \ - wc |\ - awk -v t="`head -1 $1 | wc -c`" '{print $3 - t - $1 + 1}' -} - - -# recupere les informations issues du programme repseek avec l'origine des deux -# IR et leur taille -function lookforIR { - ${REPSEEK} -c -p 0.001 $1 | \ - grep 'Distant.inv' | \ - sort -n -k4 | \ - tail -1 | \ - awk '{print $7}' | \ - sed 's/-/ /g' -} - -# recupere le nom de la sequence analyse -function seqName { - head -n1 $1| \ - awk '{print $1}' | \ - sed 's/^>//' | \ - sed -E 's/.*\|([^|]+)\|/\1/' -} - -# cree un resume du fichier analyse au format gff -# ex : GFF (NC_*** Repseek IR1 start end . + . ) -function gffIR { - lseq=$2 - nom=$1 - lookforIR $3 | \ - awk -v nom="$nom" -v lseq="$lseq" \ - 'BEGIN {OFS="\t"} \ - { startIR1=$1; \ - startIR2=$2; \ - endIR1=startIR1 + $3 -1; \ - endIR2=startIR2 + $3 -1; \ - startSSC=1; \ - endSSC=startIR1-1; \ - startLSC=endIR1+1; \ - endLSC=startIR2-1; \ - \ - print nom,"RepSeek","misc_feature",startSSC,endSSC,"\.","+","\.","ID=SSC;note=small single copy region";\ - print nom,"RepSeek","repeat_region",startIR1,endIR1,"\.","+","\.","ID=IRA;note=inverted repeat A";\ - print nom,"RepSeek","misc_feature",startLSC,endLSC,"\.","+","\.","ID=LSC;note=large single copy region";\ - print nom,"RepSeek","repeat_region",startIR2,endIR2,"\.","-","\.","ID=IRB;note=inverted repeat B";\ - }' -} - - -echo "##gff-version 3" - - -genome=$1 -genome_name=`seqName $1` -genome_length=`seqlength $1` - -gffIR ${genome_name} ${genome_length} ${genome}| grep -v '^ *$' -gffTRNA ${genome_name} ${genome_length} ${genome}| grep -v '^ *$' diff --git a/detectors/ir/bin/go_ir.sh b/detectors/ir/bin/go_ir.sh index 181913b..18f903c 100755 --- a/detectors/ir/bin/go_ir.sh +++ b/detectors/ir/bin/go_ir.sh @@ -61,4 +61,4 @@ pushTmpDir ORG.ir popTmpDir -exit 0 +exit 0 \ No newline at end of file diff --git a/detectors/normalize/bin/go_normalize.sh b/detectors/normalize/bin/go_normalize.sh index b772fae..6f6b9a5 100755 --- a/detectors/normalize/bin/go_normalize.sh +++ b/detectors/normalize/bin/go_normalize.sh @@ -27,12 +27,14 @@ SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})" source ${SCRIPT_DIR}/../lib/lookforIR.lib.sh +ORG_DEBUG=1 pushTmpDir ORG.normalize tmpfasta1="tmp_$$_1.fasta" tmpfasta2="tmp_$$_2.fasta" + logdebug "Running on : $QUERY" loginfo "Computing the genome size..." genome_length=$(seqlength $QUERY) diff --git a/detectors/normalize/lib/lookforIR.lib.sh b/detectors/normalize/lib/lookforIR.lib.sh index 31dd9da..fc8c787 100644 --- a/detectors/normalize/lib/lookforIR.lib.sh +++ b/detectors/normalize/lib/lookforIR.lib.sh @@ -47,7 +47,9 @@ function lookForIR { } SCDB="${IR_DATA_DIR}/SC_RefDB" -QUERY="${CALL_DIR}/$1" - -openLogFile "${QUERY/.*/}.log" +if [[ ! "$1" =~ ^/ ]]; then + QUERY="${CALL_DIR}/$1" +else + QUERY="$1" +fi diff --git a/detectors/normalize/tools/buildSCDB.sh b/detectors/normalize/tools/buildSCDB.sh index 708a665..6855b44 100755 --- a/detectors/normalize/tools/buildSCDB.sh +++ b/detectors/normalize/tools/buildSCDB.sh @@ -31,7 +31,7 @@ pushTmpDir ORG.buildSCDB loginfo "Building LSC coorientation graph..." - ${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${LOGFILE} > LSC.tgf + ${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${ORG_LOGFILE} > LSC.tgf ${PROG_DIR}/cc.py LSC.tgf > LSC.cc loginfo " --> $(awk '{print $1}' LSC.cc | uniq | wc -l) connected componants" loginfo "Done" @@ -73,7 +73,7 @@ pushTmpDir ORG.buildSCDB loginfo "Checking LCS homogeneity..." - ${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${LOGFILE} > LSC_RefDB.tgf + ${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${ORG_LOGFILE} > LSC_RefDB.tgf ${PROG_DIR}/cc.py LSC_RefDB.tgf > LSC_RefDB.cc NCC=$(awk '{print $1}' LSC_RefDB.cc | uniq | wc -l) if (( $NCC == 1 )); then @@ -103,7 +103,7 @@ pushTmpDir ORG.buildSCDB loginfo "Building SSC coorientation graph..." - ${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${LOGFILE} > SSC.tgf + ${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${ORG_LOGFILE} > SSC.tgf ${PROG_DIR}/cc.py SSC.tgf > SSC.cc loginfo " --> $(awk '{print $1}' SSC.cc | uniq | wc -l) connected componants" loginfo "Done" @@ -146,7 +146,7 @@ pushTmpDir ORG.buildSCDB loginfo "Checking SSC homogeneity..." - ${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${LOGFILE} > SSC_RefDB.tgf + ${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${ORG_LOGFILE} > SSC_RefDB.tgf ${PROG_DIR}/cc.py SSC_RefDB.tgf > SSC_RefDB.cc NCC=$(awk '{print $1}' SSC_RefDB.cc | uniq | wc -l) if (( $NCC == 1 )); then diff --git a/detectors/trna/bin/go_trna.sh b/detectors/trna/bin/go_trna.sh new file mode 100755 index 0000000..6e2d29d --- /dev/null +++ b/detectors/trna/bin/go_trna.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# +# Annotate tRNA +# +#======================================================================================== +# +# Annotate tRNA based on the Aragorn software predictions. + +# go_trna.sh +# +# - : The fasta file containing the genome to annotate +# +# Results are printed to the standart output +# +#======================================================================================== + +# -- CAUTION -- Works as long than the script +# is not called through a symlink +SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})" +source "${SCRIPT_DIR}/../../../scripts/bash_init.sh" + +pushTmpDir ORG.trna + + CAUTRNADB="${TRNA_DATA_DIR}/CAU_tRNA_DB.fasta" + export CAUTRNADB + + if [[ ! "$1" =~ ^/ ]]; then + QUERY="${CALL_DIR}/$1" + else + QUERY="$1" + fi + + TRNA=$(basename ${QUERY}) + + aragorn -i -w -seq ${QUERY} | \ + ${PROG_DIR}/../lib/aragorn_wrapper.awk + + +popTmpDir + +exit 0 + diff --git a/detectors/trna/aragorn_wrapper.awk b/detectors/trna/lib/aragorn_wrapper.awk similarity index 77% rename from detectors/trna/aragorn_wrapper.awk rename to detectors/trna/lib/aragorn_wrapper.awk index 3a0ae67..6fdcba9 100755 --- a/detectors/trna/aragorn_wrapper.awk +++ b/detectors/trna/lib/aragorn_wrapper.awk @@ -7,17 +7,9 @@ function genomeid() { return gid; } -function home() { - "echo $ORGANNOT_HOME" | getline homedir; - return homedir; -} - -function prog(program) { - return home() "/" program; -} - -function trnalib(prognam) { - return home() "/lib/trnaCAU.ref.fasta"; +function trnalib() { + "echo $CAUTRNADB" | getline ref; + return ref } function awkPID() { @@ -65,19 +57,20 @@ function patchtRNA(anticodon,trna,seq) { if (anticodon == "cat") { file=printfasta(trna "_" anticodon,seq,""); - command= prog("sumatra") " -d -n " file " " trnalib(); + command= "sumatra -t 0.9 -x -n " file " " trnalib() " 2>> /dev/null"; while ((command | getline output) > 0) { split(output,field," "); - n[field[2]]++; - d[field[2]]=field[3]; + match(field[2],"trn.M?"); + trna=substr(field[2],RSTART,RLENGTH); + n[trna]+=field[5]; } close(command) - dmin=1; + nmax=0; for (i in n) { - dist=d[i]/n[i]; - if (dist < dmin) { - dmin=dist; + dist=n[i]; + if (n[i] > nmax) { + nmax=n[i]; trna=i; } } @@ -94,6 +87,42 @@ function gene2product(gene) { return "tRNA-" AA3[substr(gene,4,1)]; } +function emblTRNA(geneid,trna,loc,anti,intron,seq) { + if (loc ~ /^c/) { + sub("c\\[","complement(",loc); + sub("\\]",")",loc); + sub(",","..",loc)} + else { + sub("\\[","",loc); + sub("\\]","",loc); + sub(",","..",loc)} + + anti=toupper(anti); + gsub("T","U",anti); + product=gene2product(trna); + + if (intron!="") { + l=length(intron); + intron=substr(intron,2,l-2); + split(intron,intronpos,","); + ib=intronpos[0]; + ie=intronpos[1]; + match(loc,"[0-9][0-9]*"); + gb=substr(loc,RSTART,RLENGTH); + sub("\\.\\.",".." (gb + ib -2) "," (gb + ie) "..",loc); \ + sub("complement","complement(join",loc);\ + if (substr(loc,1,1) ~ /[0-9]/) { + loc="join("loc} + loc=loc")"; + } + + print "FT tRNA " loc; + print "FT /gene=\""trna"\""; + print "FT /anticodon=\""anti"\""; + print "FT /product=\""product"("anti")\""; + +} + function gffTRNA(geneid,trna,loc,anti,intron,seq) { if (loc ~ /^c/) { complement="-"; @@ -139,7 +168,6 @@ function gffTRNA(geneid,trna,loc,anti,intron,seq) { } BEGIN { - print ARGV[1]; AA1["Ala"]="A"; AA1["Cys"]="C"; AA1["Asp"]="D"; @@ -201,7 +229,7 @@ BEGIN { { seq=epissage(intron,seq); trna=patchtRNA(anti,trna,seq); # print geneid,trna,loc,anti,"'"intron"'",seq; - gffTRNA(geneid,trna,loc,anti,intron,seq); + emblTRNA(geneid,trna,loc,anti,intron,seq); seq="" } @@ -225,5 +253,5 @@ BEGIN { END { seq=epissage(intron,seq); trna=patchtRNA(anti,trna,seq); # print geneid,trna,loc,anti,"'"intron"'",seq; - gffTRNA(geneid,trna,loc,anti,intron,seq); + emblTRNA(geneid,trna,loc,anti,intron,seq); } diff --git a/organnote.sh b/organnote.sh new file mode 100755 index 0000000..8258582 --- /dev/null +++ b/organnote.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# +# + +# +# Annotate tRNA +# +#======================================================================================== +# +# Annotate tRNA based on the Aragorn software predictions. + +# go_trna.sh +# +# - : The fasta file containing the genome to annotate +# +# Results are printed to the standart output +# +#======================================================================================== + +# -- CAUTION -- Works as long than the script +# is not called through a symlink +SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})" +source "${SCRIPT_DIR}/scripts/bash_init.sh" + +pushTmpDir ORG.organnot + + if [[ ! "$1" =~ ^/ ]]; then + QUERY="${CALL_DIR}/$1" + else + QUERY="$1" + fi + + RESULTS=$(basename ${QUERY/.*/}) + LOG="${CALL_DIR}/${RESULTS}.log" + + rm -f ${LOG} + openLogFile ${LOG} + + ${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta" + + ${PROG_DIR}/detectors/ir/bin/go_ir.sh ${QUERY} > "${RESULTS}.annot" + ${PROG_DIR}/detectors/trna/bin/go_trna.sh ${QUERY} >> "${RESULTS}.annot" + + cat "${RESULTS}.annot" + +popTmpDir + diff --git a/scripts/bash_init.sh b/scripts/bash_init.sh index 161518b..6f9cc42 100644 --- a/scripts/bash_init.sh +++ b/scripts/bash_init.sh @@ -19,11 +19,18 @@ function getAbsolutePath { function pushTmpDir { TMP_DIR=$(mktemp -d -t "$1_proc_$$_") pushd $TMP_DIR >& /dev/null + TMP_DIR_STACK="$TMP_DIR $TMP_DIR_STACK" + logdebug "Pushing temp directory $TMP_DIR" + logdebug "Stack : ${TMP_DIR_STACK}" } function popTmpDir { + TMP_DIR=$(echo $TMP_DIR_STACK | awk '{print $1}') + TMP_DIR_STACK=$(echo $TMP_DIR_STACK | awk '{$1="";print $0}') popd >& /dev/null rm -rf $TMP_DIR >& /dev/null + logdebug "Poping temp directory $TMP_DIR" + logdebug "Stack : ${TMP_DIR_STACK}" } # Logging functions @@ -34,32 +41,42 @@ function errcho { } function openLogFile { - LOGFILE=$1 - touch ${LOGFILE} + ORG_LOGFILE=$1 + export ORG_LOGFILE + touch ${ORG_LOGFILE} } function loginfo { errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" - if [[ ! -z ${LOGFILE} ]]; then - echo `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" >> ${LOGFILE} + if [[ ! -z ${ORG_LOGFILE} ]]; then + echo `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" >> ${ORG_LOGFILE} fi } function logerror { errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" - if [[ ! -z ${LOGFILE} ]]; then - echo `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" >> ${LOGFILE} + if [[ ! -z ${ORG_LOGFILE} ]]; then + echo `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" >> ${ORG_LOGFILE} fi } function logwarning { errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" - if [[ ! -z ${LOGFILE} ]]; then - echo `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" >> ${LOGFILE} + if [[ ! -z ${ORG_LOGFILE} ]]; then + echo `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" >> ${ORG_LOGFILE} fi } +function logdebug { + if [[ ! -z ${ORG_DEBUG} ]]; then + errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA DEBUG ] $$ -- $1" + if [[ ! -z ${ORG_LOGFILE} ]]; then + echo `date +'%Y-%m-%d %H:%M:%S'` "[OA DEBUG ] $$ -- $1" >> ${ORG_LOGFILE} + fi + fi +} + # Sequence related functions # Counts how many sequences are stored in a fasta file