Files
annotate/detectors/trna/tools/extract_refCAUtRNA.sh
Eric Coissac ee634cc779 Simplify CAU tRNA reference database building to keep onlyCAU tRNA
from plastomes where the three categories of CAU tRNA (Met/Ile/fMet)
are annotated

Former-commit-id: 67dc445698e22fe8a503c6700977c79e4817d302
Former-commit-id: 6e84303543b0752a7946bdde6e5114cfe6eef8da
2018-04-05 17:55:31 +02:00

177 lines
4.0 KiB
Bash
Executable File

#!/bin/bash
#
# BUILD REFERENCE THE CAU TRNA LIBRARy
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
function taxid {
local gbk=$1
local CAT=cat
if [[ "$gbk" =~ \.gz$ ]] ; then
CAT=gzcat
fi
$CAT $gbk | \
egrep '/db_xref="taxon:[0-9]+"' | \
sed -E 's@ +/db_xref="taxon:([0-9]+)"@\1@'
}
function ac {
local gbk=$1
local CAT=cat
if [[ "$gbk" =~ \.gz$ ]] ; then
CAT=gzcat
fi
$CAT $gbk | \
head -1 | $AwkCmd '{print $2}'
}
function definition {
local gbk=$1
local CAT=cat
if [[ "$gbk" =~ \.gz$ ]] ; then
CAT=gzcat
fi
$CAT $gbk | \
$AwkCmd '/^DEFINITION/ {on=1} \
(on==1) {printf("%s ",$0)} \
(/\.$/ && (on==1)) {on=0;print ""}' $1 | \
sed 's/^DEFINITION *//' | \
sed 's/ *$//'
}
function gb2fasta {
local gbk=$1
local CAT=cat
if [[ "$gbk" =~ \.gz$ ]] ; then
CAT=gzcat
fi
AC=`ac $1`
TAXID=`taxid $1`
DEFINITION=`definition $1`
echo ">${AC} taxid=${TAXID}; ${DEFINITION}"
$CAT $gbk | \
$AwkCmd '/^\/\// {on=0} \
(on==1) {print $0} \
/^ORIGIN / {on=1}' | \
sed -E 's/^ *[0-9]+ +//' | \
sed 's/ //g'
}
function findCAUtrna {
FASTATMP="$$.genome.fasta"
gb2fasta $1 > ${FASTATMP}
aragorn -i -w -seq ${FASTATMP} | \
$AwkCmd '(on==1) && /^ *[0-9]+/ {on=0;print ""} \
(on==1) {printf($0)} \
/\(cat\)$/ {on=1; printf("%s ",$0)} \
END {print ""}' | \
$AwkCmd '{print $3,$6}' | \
sed -E 's/c?\[([0-9]+),([0-9]+)\]/\1 \2/' | \
sed 's/ /:/g'
rm ${FASTATMP}
}
function trnaAnnotations {
local gbk=$1
local CAT=cat
if [[ "$gbk" =~ \.gz$ ]] ; then
CAT=gzcat
fi
$CAT $gbk | \
$AwkCmd '/^ORIGIN/ {on=0} \
(on==1) {print $0} \
/^FEATURE/ {on=1}' | \
$AwkCmd '/^ [^ ]/ {print ""} \
{printf("%s ",$0)} \
END {print ""}' | \
sed 's/^ *//' | \
sed -E 's/ +/ /g' | \
grep '^tRNA' | grep '/gene="' | \
sed -E 's/([0-9]+)\.\.([0-9]+)/\1 \2/g' | \
sed -E 's/ [0-9]+,[0-9]+ / /g' | \
grep -v '>' | \
grep -v '<' | \
sed -E 's/join\(([0-9]+ [0-9]+)\)/\1/' | \
sed -E 's/complement\(([0-9]+ [0-9]+)\)/\1/' | \
sed -E 's/join\(([0-9]+ [0-9]+)\)/\1/' | \
sed 's/^tRNA *//' | \
sed -E 's@([0-9]+) +([0-9]+).*/gene="([^"]+)"@\1 \2 \3@' | \
$AwkCmd '{print $1,$2,$3}'
}
function annotateCAU {
DISTTMP="$$.trna.dist"
trna=(`echo $1 | sed 's/:/ /g'`)
$AwkCmd -v b=${trna[0]} -v e=${trna[1]} \
'{printf("sqrt((%d - %d)^2 + (%d - %d)^2)\n",$1,b,$2,e)}' $2 | \
bc -l | \
sed 's/\..*$//' > ${DISTTMP}
paste ${DISTTMP} $2 | sort -nk 1 | head -1 | $AwkCmd '{print $1,$4}'
rm -f ${DISTTMP}
}
function writeTRNA {
local gbk=$1
local CAT=cat
if [[ "$gbk" =~ \.gz$ ]] ; then
CAT=gzcat
fi
local TAXID=`taxid $gbk`
local AC=`ac $gbk`
local DEFINITION=`definition $gbk`
local TRNATMP="$$.trna.txt"
trnaAnnotations $gbk > ${TRNATMP}
ntrna=`wc -l ${TRNATMP} | $AwkCmd '{print $1}'`
if (( ntrna > 0 )); then
trnacau=`findCAUtrna $1`
for t in $trnacau; do
AA=(`annotateCAU $t ${TRNATMP}`)
distance=${AA[0]}
aa=`echo ${AA[1]} | sed -E 's/(t(rn|RNA)-?)?(I|M|fM).*$/trn\3/'`
if (( distance <= 10 )); then
echo ">${aa}_${AC} gbac=${AC}; trna=${aa}; taxid=${TAXID}; distance=${distance}; ${DEFINITION}"
echo "$t" | $AwkCmd -F ':' '{print $3}'
fi
done
fi
rm -f ${TRNATMP}
}
pushTmpDir ORG.buildCAUtRNA
for gb in $*; do
writeTRNA $gb
done
popTmpDir