First version of the tRNA detector and of the global organnot.sh script

Former-commit-id: f2a75cf99b24875c90c426c2afb22a75b972bf60
Former-commit-id: 65e3dfb35df06ca69bb29b690c9a40e8940ac6bf
This commit is contained in:
2015-10-11 10:39:59 -03:00
parent 6015339839
commit c32f7cdde6
9 changed files with 175 additions and 133 deletions

View File

@ -1,96 +0,0 @@
#!/bin/bash
#
#
# Directory holding this script; every helper program below is looked up
# next to it. "$0" is quoted so that an install path containing whitespace
# is not word-split, and the assignment is separated from the export so a
# dirname failure is not hidden by the export builtin.
ORGANNOT_HOME=$(dirname -- "$0")
export ORGANNOT_HOME

# Paths of the external programs and of the awk wrapper used by the
# functions of this script.
REPSEEK="${ORGANNOT_HOME}/repseek"
SUMATRA="${ORGANNOT_HOME}/sumatra"
ARAGORN="${ORGANNOT_HOME}/aragorn"
WRAPARAGORN="${ORGANNOT_HOME}/aragorn_wrapper.awk"
ECOFIND="${ORGANNOT_HOME}/ecofind"
# Compare one query sequence against a reference FASTA with sumatra and
# report the best trnI / trnM / trnfM reference.
#
# Arguments:
#   $1 - query record encoded on a single line: the first '&' is turned
#        into a space and every '@' into a newline before the record is
#        written to a temporary FASTA file
#   $2 - reference FASTA file handed to sumatra
# Globals:
#   SUMATRA (read) - sumatra executable
# Outputs:
#   "<reference id> <mean of column 3>" for the reference whose id starts
#   with trnI, trnM or trnfM and has the lowest mean (presumably a
#   distance, since sumatra is run with -d -- confirm); nothing when no
#   such reference is reported.
function annotateCAU {
  local workdir query_file

  # Private temporary directory instead of a predictable "$$.query.fasta"
  # in the current directory.
  workdir=$(mktemp -d "${TMPDIR:-/tmp}/organnot.query.XXXXXX") || return 1
  query_file="${workdir}/query.fasta"

  printf '%s\n' "$1" | sed 's/&/ /' | tr '@' '\n' > "${query_file}"

  # Column 2 of the sumatra output is used as the reference id and
  # column 3 as the score; scores are averaged per reference id.
  # sumatra diagnostics are deliberately discarded, as before.
  "${SUMATRA}" -d -n "${query_file}" "$2" 2> /dev/null |
    awk '{ n[$2] += 1; d[$2] += $3 }
         END { for (i in n) print i, n[i], d[i], d[i] / n[i] }' |
    sort -rnk4 |
    grep -E '^trn(I|M|fM)' |
    tail -n 1 |
    awk '{ print $1, $NF }'

  rm -rf -- "${workdir}"
}
# Run Aragorn on a genome and pass its report through the awk wrapper.
#
# Arguments:
#   $1 - genome identifier, given to the wrapper as the awk variable `gid`
#   $2 - not used by this function (callers pass the genome length)
#   $3 - FASTA file to scan
# Globals:
#   ARAGORN, WRAPARAGORN (read)
# Outputs:
#   whatever the awk wrapper prints, on stdout
function gffTRNA {
  # All expansions are quoted so paths containing whitespace stay intact.
  "${ARAGORN}" -w -io -seq "$3" | awk -v gid="$1" -f "${WRAPARAGORN}"
}
# Print the number of sequence characters stored in a FASTA file.
#
# The value is computed as
#   (bytes in the file) - (bytes of the title line, newline included)
#   - (number of newlines in the file) + 1
# i.e. every byte that is neither part of the first line nor a newline.
#
# Arguments:
#   $1 - FASTA file
# Outputs:
#   the computed length on stdout
function seqlength {
  local fasta=$1
  local title_bytes

  # Size of the title line, including its trailing newline.
  title_bytes=$(head -n 1 -- "${fasta}" | wc -c) || return

  # wc prints "<lines> <words> <bytes>": $1 is the newline count and
  # $3 the byte count.
  wc < "${fasta}" |
    awk -v t="${title_bytes}" '{ print $3 - t - $1 + 1 }'
}
# Extract from the repseek report the origin of the two inverted repeats
# (IR) and their size.
#
# Only 'Distant.inv' lines are considered; they are sorted numerically on
# the key starting at column 4 and the last one is kept. Column 7 of that
# line is printed with every '-' replaced by a space, which gffIR reads as
# "<start IR1> <start IR2> <length>".
#
# Arguments:
#   $1 - FASTA file to analyse
# Globals:
#   REPSEEK (read) - repseek executable
function lookforIR {
  "${REPSEEK}" -c -p 0.001 "$1" |
    grep 'Distant.inv' |
    sort -n -k4 |
    tail -n 1 |
    awk '{ print $7 }' |
    sed 's/-/ /g'
}
# Print the name of the analysed sequence.
#
# The name is the first word of the first line of the file without its
# leading '>'. When that word contains a '|'-terminated field (for example
# "gi|123|ref|NC_000932.1|"), everything up to and including the last such
# field is replaced by the content of that field ("NC_000932.1").
#
# Arguments:
#   $1 - FASTA file
function seqName {
  head -n 1 -- "$1" |
    awk '{ print $1 }' |
    sed -E -e 's/^>//' -e 's/.*\|([^|]+)\|/\1/'
}
# Summarise the analysed file in GFF format.
# e.g.: NC_***  RepSeek  repeat_region  start  end  .  +  .  ID=IRA;...
#
# Four tab-separated records are derived from the single line printed by
# lookforIR ("<start IR1> <start IR2> <length>"):
#   SSC  misc_feature   1 .. startIR1-1
#   IRA  repeat_region  startIR1 .. startIR1+length-1   (strand +)
#   LSC  misc_feature   endIR1+1 .. startIR2-1
#   IRB  repeat_region  startIR2 .. startIR2+length-1   (strand -)
# Nothing is printed when lookforIR prints nothing.
#
# Arguments:
#   $1 - sequence name, used as first GFF column
#   $2 - sequence length (accepted for call compatibility, not used)
#   $3 - FASTA file handed to lookforIR
function gffIR {
  local nom=$1

  # The score and phase columns are written as a plain "." string: the
  # former "\." is an undefined awk escape that gawk warns about and that
  # other awk implementations print as a literal backslash-dot.
  lookforIR "$3" |
    awk -v nom="${nom}" '
      BEGIN { OFS = "\t" }
      {
        startIR1 = $1
        startIR2 = $2
        endIR1 = startIR1 + $3 - 1
        endIR2 = startIR2 + $3 - 1
        startSSC = 1
        endSSC = startIR1 - 1
        startLSC = endIR1 + 1
        endLSC = startIR2 - 1

        print nom, "RepSeek", "misc_feature", startSSC, endSSC, ".", "+", ".", "ID=SSC;note=small single copy region"
        print nom, "RepSeek", "repeat_region", startIR1, endIR1, ".", "+", ".", "ID=IRA;note=inverted repeat A"
        print nom, "RepSeek", "misc_feature", startLSC, endLSC, ".", "+", ".", "ID=LSC;note=large single copy region"
        print nom, "RepSeek", "repeat_region", startIR2, endIR2, ".", "-", ".", "ID=IRB;note=inverted repeat B"
      }'
}
# Main: annotate the genome file given as first argument and write the
# result as GFF version 3 on stdout.
echo "##gff-version 3"
genome=$1
genome_name=$(seqName "${genome}")
genome_length=$(seqlength "${genome}")

# Arguments are quoted so that an empty name or a path containing
# whitespace cannot shift the positional parameters of the generators.
# Blank lines produced by the generators are dropped from the report.
gffIR "${genome_name}" "${genome_length}" "${genome}" | grep -v '^ *$'
gffTRNA "${genome_name}" "${genome_length}" "${genome}" | grep -v '^ *$'

View File

@ -61,4 +61,4 @@ pushTmpDir ORG.ir
popTmpDir popTmpDir
exit 0 exit 0

View File

@ -27,12 +27,14 @@
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})" SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
source ${SCRIPT_DIR}/../lib/lookforIR.lib.sh source ${SCRIPT_DIR}/../lib/lookforIR.lib.sh
ORG_DEBUG=1
pushTmpDir ORG.normalize pushTmpDir ORG.normalize
tmpfasta1="tmp_$$_1.fasta" tmpfasta1="tmp_$$_1.fasta"
tmpfasta2="tmp_$$_2.fasta" tmpfasta2="tmp_$$_2.fasta"
logdebug "Running on : $QUERY"
loginfo "Computing the genome size..." loginfo "Computing the genome size..."
genome_length=$(seqlength $QUERY) genome_length=$(seqlength $QUERY)

View File

@ -47,7 +47,9 @@ function lookForIR {
} }
SCDB="${IR_DATA_DIR}/SC_RefDB" SCDB="${IR_DATA_DIR}/SC_RefDB"
QUERY="${CALL_DIR}/$1"
if [[ ! "$1" =~ ^/ ]]; then
openLogFile "${QUERY/.*/}.log" QUERY="${CALL_DIR}/$1"
else
QUERY="$1"
fi

View File

@ -31,7 +31,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Building LSC coorientation graph..." loginfo "Building LSC coorientation graph..."
${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${LOGFILE} > LSC.tgf ${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${ORG_LOGFILE} > LSC.tgf
${PROG_DIR}/cc.py LSC.tgf > LSC.cc ${PROG_DIR}/cc.py LSC.tgf > LSC.cc
loginfo " --> $(awk '{print $1}' LSC.cc | uniq | wc -l) connected componants" loginfo " --> $(awk '{print $1}' LSC.cc | uniq | wc -l) connected componants"
loginfo "Done" loginfo "Done"
@ -73,7 +73,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Checking LCS homogeneity..." loginfo "Checking LCS homogeneity..."
${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${LOGFILE} > LSC_RefDB.tgf ${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${ORG_LOGFILE} > LSC_RefDB.tgf
${PROG_DIR}/cc.py LSC_RefDB.tgf > LSC_RefDB.cc ${PROG_DIR}/cc.py LSC_RefDB.tgf > LSC_RefDB.cc
NCC=$(awk '{print $1}' LSC_RefDB.cc | uniq | wc -l) NCC=$(awk '{print $1}' LSC_RefDB.cc | uniq | wc -l)
if (( $NCC == 1 )); then if (( $NCC == 1 )); then
@ -103,7 +103,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Building SSC coorientation graph..." loginfo "Building SSC coorientation graph..."
${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${LOGFILE} > SSC.tgf ${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${ORG_LOGFILE} > SSC.tgf
${PROG_DIR}/cc.py SSC.tgf > SSC.cc ${PROG_DIR}/cc.py SSC.tgf > SSC.cc
loginfo " --> $(awk '{print $1}' SSC.cc | uniq | wc -l) connected componants" loginfo " --> $(awk '{print $1}' SSC.cc | uniq | wc -l) connected componants"
loginfo "Done" loginfo "Done"
@ -146,7 +146,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Checking SSC homogeneity..." loginfo "Checking SSC homogeneity..."
${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${LOGFILE} > SSC_RefDB.tgf ${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${ORG_LOGFILE} > SSC_RefDB.tgf
${PROG_DIR}/cc.py SSC_RefDB.tgf > SSC_RefDB.cc ${PROG_DIR}/cc.py SSC_RefDB.tgf > SSC_RefDB.cc
NCC=$(awk '{print $1}' SSC_RefDB.cc | uniq | wc -l) NCC=$(awk '{print $1}' SSC_RefDB.cc | uniq | wc -l)
if (( $NCC == 1 )); then if (( $NCC == 1 )); then

42
detectors/trna/bin/go_trna.sh Executable file
View File

@ -0,0 +1,42 @@
#!/bin/bash
#
# Annotate tRNA
#
#========================================================================================
#
# Annotate tRNA based on the Aragorn software predictions.
# go_trna.sh <FASTAFILE>
#
# - <FASTAFILE> : The fasta file containing the genome to annotate
#
# Results are printed to the standard output
#
#========================================================================================
# -- CAUTION -- Works as long as the script
# is not called through a symlink
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"

# Work in a scratch directory; it is removed by popTmpDir below.
pushTmpDir ORG.trna

# Reference database of CAU tRNA sequences. It is exported because the
# awk wrapper obtains its path from the environment.
# TRNA_DATA_DIR, CALL_DIR and PROG_DIR are presumably defined by
# bash_init.sh -- confirm.
CAUTRNADB="${TRNA_DATA_DIR}/CAU_tRNA_DB.fasta"
export CAUTRNADB

# A relative file name is resolved against CALL_DIR because the current
# directory is now the scratch directory.
if [[ ! "$1" =~ ^/ ]]; then
  QUERY="${CALL_DIR}/$1"
else
  QUERY="$1"
fi
TRNA=$(basename "${QUERY}")

# Quoted expansions keep paths containing whitespace in one piece.
aragorn -i -w -seq "${QUERY}" | \
  "${PROG_DIR}/../lib/aragorn_wrapper.awk"

popTmpDir
exit 0

View File

@ -7,17 +7,9 @@ function genomeid() {
return gid; return gid;
} }
function home() { function trnalib() {
"echo $ORGANNOT_HOME" | getline homedir; "echo $CAUTRNADB" | getline ref;
return homedir; return ref
}
function prog(program) {
return home() "/" program;
}
function trnalib(prognam) {
return home() "/lib/trnaCAU.ref.fasta";
} }
function awkPID() { function awkPID() {
@ -65,19 +57,20 @@ function patchtRNA(anticodon,trna,seq) {
if (anticodon == "cat") { if (anticodon == "cat") {
file=printfasta(trna "_" anticodon,seq,""); file=printfasta(trna "_" anticodon,seq,"");
command= prog("sumatra") " -d -n " file " " trnalib(); command= "sumatra -t 0.9 -x -n " file " " trnalib() " 2>> /dev/null";
while ((command | getline output) > 0) { while ((command | getline output) > 0) {
split(output,field," "); split(output,field," ");
n[field[2]]++; match(field[2],"trn.M?");
d[field[2]]=field[3]; trna=substr(field[2],RSTART,RLENGTH);
n[trna]+=field[5];
} }
close(command) close(command)
dmin=1; nmax=0;
for (i in n) { for (i in n) {
dist=d[i]/n[i]; dist=n[i];
if (dist < dmin) { if (n[i] > nmax) {
dmin=dist; nmax=n[i];
trna=i; trna=i;
} }
} }
@ -94,6 +87,42 @@ function gene2product(gene) {
return "tRNA-" AA3[substr(gene,4,1)]; return "tRNA-" AA3[substr(gene,4,1)];
} }
# Print one tRNA as an EMBL-style feature (four "FT" lines).
#
#   geneid - not used by this function
#   trna   - gene name, printed in /gene and passed to gene2product()
#   loc    - location as "[b,e]" or "c[b,e]" (leading c = complement strand)
#   anti   - anticodon; printed upper-cased with T replaced by U
#   intron - intron description, or "" when the tRNA has no intron
#   seq    - not used by this function
#
# product, l, intronpos, ib, ie and gb are not declared as extra parameters,
# so they are global awk variables.
function emblTRNA(geneid,trna,loc,anti,intron,seq) {
if (loc ~ /^c/) {
# "c[b,e]" becomes "complement(b..e)"
sub("c\\[","complement(",loc);
sub("\\]",")",loc);
sub(",","..",loc)}
else {
# "[b,e]" becomes "b..e"
sub("\\[","",loc);
sub("\\]","",loc);
sub(",","..",loc)}
anti=toupper(anti);
gsub("T","U",anti);
product=gene2product(trna);
if (intron!="") {
# Drop the first and last character of the intron description and split
# the remainder on the comma.
l=length(intron);
intron=substr(intron,2,l-2);
split(intron,intronpos,",");
# NOTE(review): awk's split() stores the pieces from index 1 upwards, so
# intronpos[0] is unset here and ib counts as 0 in the arithmetic below,
# while ie receives the first piece. Confirm whether intronpos[1] and
# intronpos[2] were intended.
ib=intronpos[0];
ie=intronpos[1];
# gb is the first number found in loc (start of the feature).
match(loc,"[0-9][0-9]*");
gb=substr(loc,RSTART,RLENGTH);
# The first ".." of loc is expanded into "..<gb+ib-2>,<gb+ie>.." so that the
# location is cut in two segments, which are then wrapped in join(...),
# inside complement(...) when loc is on the complement strand.
sub("\\.\\.",".." (gb + ib -2) "," (gb + ie) "..",loc); \
sub("complement","complement(join",loc);\
if (substr(loc,1,1) ~ /[0-9]/) {
loc="join("loc}
# Closes the join( opened above.
loc=loc")";
}
print "FT tRNA " loc;
print "FT /gene=\""trna"\"";
print "FT /anticodon=\""anti"\"";
print "FT /product=\""product"("anti")\"";
}
function gffTRNA(geneid,trna,loc,anti,intron,seq) { function gffTRNA(geneid,trna,loc,anti,intron,seq) {
if (loc ~ /^c/) { if (loc ~ /^c/) {
complement="-"; complement="-";
@ -139,7 +168,6 @@ function gffTRNA(geneid,trna,loc,anti,intron,seq) {
} }
BEGIN { BEGIN {
print ARGV[1];
AA1["Ala"]="A"; AA1["Ala"]="A";
AA1["Cys"]="C"; AA1["Cys"]="C";
AA1["Asp"]="D"; AA1["Asp"]="D";
@ -201,7 +229,7 @@ BEGIN {
{ seq=epissage(intron,seq); { seq=epissage(intron,seq);
trna=patchtRNA(anti,trna,seq); trna=patchtRNA(anti,trna,seq);
# print geneid,trna,loc,anti,"'"intron"'",seq; # print geneid,trna,loc,anti,"'"intron"'",seq;
gffTRNA(geneid,trna,loc,anti,intron,seq); emblTRNA(geneid,trna,loc,anti,intron,seq);
seq="" seq=""
} }
@ -225,5 +253,5 @@ BEGIN {
END { seq=epissage(intron,seq); END { seq=epissage(intron,seq);
trna=patchtRNA(anti,trna,seq); trna=patchtRNA(anti,trna,seq);
# print geneid,trna,loc,anti,"'"intron"'",seq; # print geneid,trna,loc,anti,"'"intron"'",seq;
gffTRNA(geneid,trna,loc,anti,intron,seq); emblTRNA(geneid,trna,loc,anti,intron,seq);
} }

47
organnote.sh Executable file
View File

@ -0,0 +1,47 @@
#!/bin/bash
#
#
#
# Annotate a genome
#
#========================================================================================
#
# Run the normalize, IR and tRNA detectors on a genome.
# organnote.sh <FASTAFILE>
#
# - <FASTAFILE> : The fasta file containing the genome to annotate
#
# Results are printed to the standard output; a log file named after the
# input file is recreated in CALL_DIR.
#
#========================================================================================
# -- CAUTION -- Works as long as the script
# is not called through a symlink
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
source "${SCRIPT_DIR}/scripts/bash_init.sh"

# Work in a scratch directory; the intermediate files written below with
# relative names live there and are removed by popTmpDir.
pushTmpDir ORG.organnot

# A relative file name is resolved against CALL_DIR because the current
# directory is now the scratch directory.
if [[ ! "$1" =~ ^/ ]]; then
  QUERY="${CALL_DIR}/$1"
else
  QUERY="$1"
fi

# Result prefix: the file name without its directory and without its
# extension(s). The directory is removed first so that a dot in a
# directory name cannot truncate the path.
RESULTS=$(basename "${QUERY}")
RESULTS=${RESULTS%%.*}

LOG="${CALL_DIR}/${RESULTS}.log"
rm -f -- "${LOG}"
openLogFile "${LOG}"

# NOTE(review): the normalized sequence is written to the scratch
# directory and is not read by the two detectors below, which run on the
# original file -- confirm whether they should use it instead.
"${PROG_DIR}/detectors/normalize/bin/go_normalize.sh" "${QUERY}" > "${RESULTS}.norm.fasta"
"${PROG_DIR}/detectors/ir/bin/go_ir.sh" "${QUERY}" > "${RESULTS}.annot"
"${PROG_DIR}/detectors/trna/bin/go_trna.sh" "${QUERY}" >> "${RESULTS}.annot"
cat "${RESULTS}.annot"

popTmpDir

View File

@ -19,11 +19,18 @@ function getAbsolutePath {
function pushTmpDir { function pushTmpDir {
TMP_DIR=$(mktemp -d -t "$1_proc_$$_") TMP_DIR=$(mktemp -d -t "$1_proc_$$_")
pushd $TMP_DIR >& /dev/null pushd $TMP_DIR >& /dev/null
TMP_DIR_STACK="$TMP_DIR $TMP_DIR_STACK"
logdebug "Pushing temp directory $TMP_DIR"
logdebug "Stack : ${TMP_DIR_STACK}"
} }
function popTmpDir { function popTmpDir {
TMP_DIR=$(echo $TMP_DIR_STACK | awk '{print $1}')
TMP_DIR_STACK=$(echo $TMP_DIR_STACK | awk '{$1="";print $0}')
popd >& /dev/null popd >& /dev/null
rm -rf $TMP_DIR >& /dev/null rm -rf $TMP_DIR >& /dev/null
logdebug "Poping temp directory $TMP_DIR"
logdebug "Stack : ${TMP_DIR_STACK}"
} }
# Logging functions # Logging functions
@ -34,32 +41,42 @@ function errcho {
} }
function openLogFile { function openLogFile {
LOGFILE=$1 ORG_LOGFILE=$1
touch ${LOGFILE} export ORG_LOGFILE
touch ${ORG_LOGFILE}
} }
function loginfo { function loginfo {
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1"
if [[ ! -z ${LOGFILE} ]]; then if [[ ! -z ${ORG_LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" >> ${LOGFILE} echo `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" >> ${ORG_LOGFILE}
fi fi
} }
function logerror { function logerror {
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1"
if [[ ! -z ${LOGFILE} ]]; then if [[ ! -z ${ORG_LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" >> ${LOGFILE} echo `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" >> ${ORG_LOGFILE}
fi fi
} }
function logwarning { function logwarning {
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1"
if [[ ! -z ${LOGFILE} ]]; then if [[ ! -z ${ORG_LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" >> ${LOGFILE} echo `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" >> ${ORG_LOGFILE}
fi fi
} }
# Emit a debug message, but only when ORG_DEBUG is set to a non-empty
# value. The message ($1) is prefixed with the current date and time, the
# "[OA DEBUG ]" tag and the shell PID; it is sent through errcho and, when
# ORG_LOGFILE is non-empty, appended to that file as well.
function logdebug {
  # Debug tracing is opt-in.
  [[ -n ${ORG_DEBUG} ]] || return 0

  errcho $(date +'%Y-%m-%d %H:%M:%S') "[OA DEBUG ] $$ -- $1"

  if [[ -n ${ORG_LOGFILE} ]]; then
    echo $(date +'%Y-%m-%d %H:%M:%S') "[OA DEBUG ] $$ -- $1" >> ${ORG_LOGFILE}
  fi
}
# Sequence related functions # Sequence related functions
# Counts how many sequences are stored in a fasta file # Counts how many sequences are stored in a fasta file