First version of the tRNA detector and of the global organnot.sh script

Former-commit-id: f2a75cf99b24875c90c426c2afb22a75b972bf60
Former-commit-id: 65e3dfb35df06ca69bb29b690c9a40e8940ac6bf
This commit is contained in:
2015-10-11 10:39:59 -03:00
parent 6015339839
commit c32f7cdde6
9 changed files with 175 additions and 133 deletions

View File

@ -1,96 +0,0 @@
#!/bin/bash
#
#
export ORGANNOT_HOME=`dirname $0`
REPSEEK=${ORGANNOT_HOME}/repseek
SUMATRA=${ORGANNOT_HOME}/sumatra
ARAGORN=${ORGANNOT_HOME}/aragorn
WRAPARAGORN=${ORGANNOT_HOME}/aragorn_wrapper.awk
ECOFIND=${ORGANNOT_HOME}/ecofind
function annotateCAU {
QUERY="$$.query.fasta"
echo $1 | sed 's/&/ /' | tr '@' '\n' > ${QUERY}
${SUMATRA} -d -n ${QUERY} $2 2> /dev/null | \
awk ' {n[$2]+=1;d[$2]+=$3} \
END {for (i in n) \
print i, n[i],d[i], d[i]/n[i]\
}' | \
sort -rnk4 | \
egrep '^trn(I|M|fM)' | \
tail -1 | \
awk '{print $1,$NF}'
rm -rf ${QUERY}
}
function gffTRNA {
${ARAGORN} -w -io -seq $3 | awk -v gid=${1} -f ${WRAPARAGORN}
}
# s'alimente avec un fichier.fasta
# $3 : nb de caractere du fichier, t : nb de caractere du titre,
# $1+1 : nb de retour chariot du fichier
function seqlength {
cat $1 | \
wc |\
awk -v t="`head -1 $1 | wc -c`" '{print $3 - t - $1 + 1}'
}
# recupere les informations issues du programme repseek avec l'origine des deux
# IR et leur taille
function lookforIR {
${REPSEEK} -c -p 0.001 $1 | \
grep 'Distant.inv' | \
sort -n -k4 | \
tail -1 | \
awk '{print $7}' | \
sed 's/-/ /g'
}
# recupere le nom de la sequence analyse
function seqName {
head -n1 $1| \
awk '{print $1}' | \
sed 's/^>//' | \
sed -E 's/.*\|([^|]+)\|/\1/'
}
# cree un resume du fichier analyse au format gff
# ex : GFF (NC_*** Repseek IR1 start end . + . )
function gffIR {
lseq=$2
nom=$1
lookforIR $3 | \
awk -v nom="$nom" -v lseq="$lseq" \
'BEGIN {OFS="\t"} \
{ startIR1=$1; \
startIR2=$2; \
endIR1=startIR1 + $3 -1; \
endIR2=startIR2 + $3 -1; \
startSSC=1; \
endSSC=startIR1-1; \
startLSC=endIR1+1; \
endLSC=startIR2-1; \
\
print nom,"RepSeek","misc_feature",startSSC,endSSC,"\.","+","\.","ID=SSC;note=small single copy region";\
print nom,"RepSeek","repeat_region",startIR1,endIR1,"\.","+","\.","ID=IRA;note=inverted repeat A";\
print nom,"RepSeek","misc_feature",startLSC,endLSC,"\.","+","\.","ID=LSC;note=large single copy region";\
print nom,"RepSeek","repeat_region",startIR2,endIR2,"\.","-","\.","ID=IRB;note=inverted repeat B";\
}'
}
echo "##gff-version 3"
genome=$1
genome_name=`seqName $1`
genome_length=`seqlength $1`
gffIR ${genome_name} ${genome_length} ${genome}| grep -v '^ *$'
gffTRNA ${genome_name} ${genome_length} ${genome}| grep -v '^ *$'

View File

@ -61,4 +61,4 @@ pushTmpDir ORG.ir
popTmpDir
exit 0
exit 0

View File

@ -27,12 +27,14 @@
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
source ${SCRIPT_DIR}/../lib/lookforIR.lib.sh
ORG_DEBUG=1
pushTmpDir ORG.normalize
tmpfasta1="tmp_$$_1.fasta"
tmpfasta2="tmp_$$_2.fasta"
logdebug "Running on : $QUERY"
loginfo "Computing the genome size..."
genome_length=$(seqlength $QUERY)

View File

@ -47,7 +47,9 @@ function lookForIR {
}
SCDB="${IR_DATA_DIR}/SC_RefDB"
QUERY="${CALL_DIR}/$1"
openLogFile "${QUERY/.*/}.log"
if [[ ! "$1" =~ ^/ ]]; then
QUERY="${CALL_DIR}/$1"
else
QUERY="$1"
fi

View File

@ -31,7 +31,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Building LSC coorientation graph..."
${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${LOGFILE} > LSC.tgf
${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${ORG_LOGFILE} > LSC.tgf
${PROG_DIR}/cc.py LSC.tgf > LSC.cc
loginfo " --> $(awk '{print $1}' LSC.cc | uniq | wc -l) connected componants"
loginfo "Done"
@ -73,7 +73,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Checking LCS homogeneity..."
${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${LOGFILE} > LSC_RefDB.tgf
${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${ORG_LOGFILE} > LSC_RefDB.tgf
${PROG_DIR}/cc.py LSC_RefDB.tgf > LSC_RefDB.cc
NCC=$(awk '{print $1}' LSC_RefDB.cc | uniq | wc -l)
if (( $NCC == 1 )); then
@ -103,7 +103,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Building SSC coorientation graph..."
${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${LOGFILE} > SSC.tgf
${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${ORG_LOGFILE} > SSC.tgf
${PROG_DIR}/cc.py SSC.tgf > SSC.cc
loginfo " --> $(awk '{print $1}' SSC.cc | uniq | wc -l) connected componants"
loginfo "Done"
@ -146,7 +146,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Checking SSC homogeneity..."
${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${LOGFILE} > SSC_RefDB.tgf
${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${ORG_LOGFILE} > SSC_RefDB.tgf
${PROG_DIR}/cc.py SSC_RefDB.tgf > SSC_RefDB.cc
NCC=$(awk '{print $1}' SSC_RefDB.cc | uniq | wc -l)
if (( $NCC == 1 )); then

42
detectors/trna/bin/go_trna.sh Executable file
View File

@ -0,0 +1,42 @@
#!/bin/bash
#
# Annotate tRNA
#
#========================================================================================
#
# Annotate tRNA based on the Aragorn software predictions.
# go_trna.sh <FASTAFILE>
#
# - <FASTAFILE> : The fasta file containing the genome to annotate
#
# Results are printed to the standart output
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
pushTmpDir ORG.trna
CAUTRNADB="${TRNA_DATA_DIR}/CAU_tRNA_DB.fasta"
export CAUTRNADB
if [[ ! "$1" =~ ^/ ]]; then
QUERY="${CALL_DIR}/$1"
else
QUERY="$1"
fi
TRNA=$(basename ${QUERY})
aragorn -i -w -seq ${QUERY} | \
${PROG_DIR}/../lib/aragorn_wrapper.awk
popTmpDir
exit 0

View File

@ -7,17 +7,9 @@ function genomeid() {
return gid;
}
function home() {
"echo $ORGANNOT_HOME" | getline homedir;
return homedir;
}
function prog(program) {
return home() "/" program;
}
function trnalib(prognam) {
return home() "/lib/trnaCAU.ref.fasta";
function trnalib() {
"echo $CAUTRNADB" | getline ref;
return ref
}
function awkPID() {
@ -65,19 +57,20 @@ function patchtRNA(anticodon,trna,seq) {
if (anticodon == "cat") {
file=printfasta(trna "_" anticodon,seq,"");
command= prog("sumatra") " -d -n " file " " trnalib();
command= "sumatra -t 0.9 -x -n " file " " trnalib() " 2>> /dev/null";
while ((command | getline output) > 0) {
split(output,field," ");
n[field[2]]++;
d[field[2]]=field[3];
match(field[2],"trn.M?");
trna=substr(field[2],RSTART,RLENGTH);
n[trna]+=field[5];
}
close(command)
dmin=1;
nmax=0;
for (i in n) {
dist=d[i]/n[i];
if (dist < dmin) {
dmin=dist;
dist=n[i];
if (n[i] > nmax) {
nmax=n[i];
trna=i;
}
}
@ -94,6 +87,42 @@ function gene2product(gene) {
return "tRNA-" AA3[substr(gene,4,1)];
}
function emblTRNA(geneid,trna,loc,anti,intron,seq) {
if (loc ~ /^c/) {
sub("c\\[","complement(",loc);
sub("\\]",")",loc);
sub(",","..",loc)}
else {
sub("\\[","",loc);
sub("\\]","",loc);
sub(",","..",loc)}
anti=toupper(anti);
gsub("T","U",anti);
product=gene2product(trna);
if (intron!="") {
l=length(intron);
intron=substr(intron,2,l-2);
split(intron,intronpos,",");
ib=intronpos[0];
ie=intronpos[1];
match(loc,"[0-9][0-9]*");
gb=substr(loc,RSTART,RLENGTH);
sub("\\.\\.",".." (gb + ib -2) "," (gb + ie) "..",loc); \
sub("complement","complement(join",loc);\
if (substr(loc,1,1) ~ /[0-9]/) {
loc="join("loc}
loc=loc")";
}
print "FT tRNA " loc;
print "FT /gene=\""trna"\"";
print "FT /anticodon=\""anti"\"";
print "FT /product=\""product"("anti")\"";
}
function gffTRNA(geneid,trna,loc,anti,intron,seq) {
if (loc ~ /^c/) {
complement="-";
@ -139,7 +168,6 @@ function gffTRNA(geneid,trna,loc,anti,intron,seq) {
}
BEGIN {
print ARGV[1];
AA1["Ala"]="A";
AA1["Cys"]="C";
AA1["Asp"]="D";
@ -201,7 +229,7 @@ BEGIN {
{ seq=epissage(intron,seq);
trna=patchtRNA(anti,trna,seq);
# print geneid,trna,loc,anti,"'"intron"'",seq;
gffTRNA(geneid,trna,loc,anti,intron,seq);
emblTRNA(geneid,trna,loc,anti,intron,seq);
seq=""
}
@ -225,5 +253,5 @@ BEGIN {
END { seq=epissage(intron,seq);
trna=patchtRNA(anti,trna,seq);
# print geneid,trna,loc,anti,"'"intron"'",seq;
gffTRNA(geneid,trna,loc,anti,intron,seq);
emblTRNA(geneid,trna,loc,anti,intron,seq);
}

47
organnote.sh Executable file
View File

@ -0,0 +1,47 @@
#!/bin/bash
#
#
#
# Annotate tRNA
#
#========================================================================================
#
# Annotate tRNA based on the Aragorn software predictions.
# go_trna.sh <FASTAFILE>
#
# - <FASTAFILE> : The fasta file containing the genome to annotate
#
# Results are printed to the standart output
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${SCRIPT_DIR}/scripts/bash_init.sh"
pushTmpDir ORG.organnot
if [[ ! "$1" =~ ^/ ]]; then
QUERY="${CALL_DIR}/$1"
else
QUERY="$1"
fi
RESULTS=$(basename ${QUERY/.*/})
LOG="${CALL_DIR}/${RESULTS}.log"
rm -f ${LOG}
openLogFile ${LOG}
${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta"
${PROG_DIR}/detectors/ir/bin/go_ir.sh ${QUERY} > "${RESULTS}.annot"
${PROG_DIR}/detectors/trna/bin/go_trna.sh ${QUERY} >> "${RESULTS}.annot"
cat "${RESULTS}.annot"
popTmpDir

View File

@ -19,11 +19,18 @@ function getAbsolutePath {
function pushTmpDir {
TMP_DIR=$(mktemp -d -t "$1_proc_$$_")
pushd $TMP_DIR >& /dev/null
TMP_DIR_STACK="$TMP_DIR $TMP_DIR_STACK"
logdebug "Pushing temp directory $TMP_DIR"
logdebug "Stack : ${TMP_DIR_STACK}"
}
function popTmpDir {
TMP_DIR=$(echo $TMP_DIR_STACK | awk '{print $1}')
TMP_DIR_STACK=$(echo $TMP_DIR_STACK | awk '{$1="";print $0}')
popd >& /dev/null
rm -rf $TMP_DIR >& /dev/null
logdebug "Poping temp directory $TMP_DIR"
logdebug "Stack : ${TMP_DIR_STACK}"
}
# Logging functions
@ -34,32 +41,42 @@ function errcho {
}
function openLogFile {
LOGFILE=$1
touch ${LOGFILE}
ORG_LOGFILE=$1
export ORG_LOGFILE
touch ${ORG_LOGFILE}
}
function loginfo {
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1"
if [[ ! -z ${LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" >> ${LOGFILE}
if [[ ! -z ${ORG_LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" >> ${ORG_LOGFILE}
fi
}
function logerror {
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1"
if [[ ! -z ${LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" >> ${LOGFILE}
if [[ ! -z ${ORG_LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" >> ${ORG_LOGFILE}
fi
}
function logwarning {
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1"
if [[ ! -z ${LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" >> ${LOGFILE}
if [[ ! -z ${ORG_LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" >> ${ORG_LOGFILE}
fi
}
function logdebug {
if [[ ! -z ${ORG_DEBUG} ]]; then
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA DEBUG ] $$ -- $1"
if [[ ! -z ${ORG_LOGFILE} ]]; then
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA DEBUG ] $$ -- $1" >> ${ORG_LOGFILE}
fi
fi
}
# Sequence related functions
# Counts how many sequences are stored in a fasta file