2015-10-10 19:12:02 -03:00
|
|
|
#!/bin/bash
|
|
|
|
#
|
|
|
|
# BUILD THE REFERENCE CAU tRNA LIBRARY
|
|
|
|
#
|
|
|
|
#========================================================================================
|
|
|
|
|
|
|
|
# -- CAUTION -- Works as long as the script
|
|
|
|
# is not called through a symlink
|
2015-11-08 14:28:57 +01:00
|
|
|
# Directory holding this script. The inner expansion is quoted so that a
# checkout path containing blanks is not split before reaching dirname.
THIS_DIR="$(dirname "${BASH_SOURCE[0]}")"
# Load the shared initialisation script located relative to this directory.
source "${THIS_DIR}/../../../scripts/bash_init.sh"
|
2015-10-10 19:12:02 -03:00
|
|
|
|
|
|
|
# Print the taxon number(s) found in /db_xref="taxon:N" qualifiers of a
# GenBank flat file.
#   $1 - GenBank file; decompressed on the fly when its name ends in .gz
# Output: one number per matching qualifier line, on stdout.
function taxid {
  local gbk=$1
  local -a reader=(cat)

  # `gzip -dc` replaces `gzcat`, which is not installed on many Linux hosts.
  if [[ "$gbk" =~ \.gz$ ]]; then
    reader=(gzip -dc)
  fi

  "${reader[@]}" -- "$gbk" \
    | grep -E '/db_xref="taxon:[0-9]+"' \
    | sed -E 's@ +/db_xref="taxon:([0-9]+)"@\1@'
}
|
|
|
|
|
|
|
|
# Print the second whitespace-separated field of the first line of a GenBank
# flat file (the identifier that follows the LOCUS keyword).
#   $1 - GenBank file; decompressed on the fly when its name ends in .gz
# Globals: AwkCmd (read) - awk command to run; not set in this file
#          (presumably provided by the sourced init script), so plain `awk`
#          is used as a fallback when it is unset.
function ac {
  local gbk=$1
  local -a reader=(cat)

  # `gzip -dc` replaces `gzcat`, which is not installed on many Linux hosts.
  if [[ "$gbk" =~ \.gz$ ]]; then
    reader=(gzip -dc)
  fi

  # AwkCmd stays unquoted on purpose so it may carry extra words (options).
  "${reader[@]}" -- "$gbk" \
    | head -n 1 \
    | ${AwkCmd:-awk} '{print $2}'
}
|
|
|
|
|
|
|
|
# Print the DEFINITION text of a GenBank flat file on a single line.
# Lines are accumulated from the one starting with DEFINITION up to and
# including the first line that ends with a period; the DEFINITION keyword
# and trailing blanks are then removed.
#   $1 - GenBank file; decompressed on the fly when its name ends in .gz
# Globals: AwkCmd (read) - awk command to run; falls back to `awk` if unset.
function definition {
  local gbk=$1
  local -a reader=(cat)

  # `gzip -dc` replaces `gzcat`, which is not installed on many Linux hosts.
  if [[ "$gbk" =~ \.gz$ ]]; then
    reader=(gzip -dc)
  fi

  # awk must read the pipe only: giving it the file name as an operand made
  # it bypass the decompressor and parse raw gzip bytes for .gz inputs.
  "${reader[@]}" -- "$gbk" \
    | ${AwkCmd:-awk} '
        /^DEFINITION/          { on = 1 }
        (on == 1)              { printf("%s ", $0) }
        (/\.$/ && (on == 1))   { on = 0; print "" }' \
    | sed 's/^DEFINITION *//' \
    | sed 's/ *$//'
}
|
|
|
|
|
|
|
|
# Convert a GenBank flat file to FASTA on stdout.
# The header is ">AC taxid=TAXID; DEFINITION", built from the ac, taxid and
# definition functions; the sequence is every line between the ORIGIN line
# and the "//" terminator with position numbers and blanks removed.
#   $1 - GenBank file; decompressed on the fly when its name ends in .gz
# Globals: AwkCmd (read) - awk command to run; falls back to `awk` if unset.
function gb2fasta {
  local gbk=$1
  local -a reader=(cat)
  # Kept local so the caller's variables of similar purpose are not clobbered.
  local accession taxon description

  # `gzip -dc` replaces `gzcat`, which is not installed on many Linux hosts.
  if [[ "$gbk" =~ \.gz$ ]]; then
    reader=(gzip -dc)
  fi

  accession=$(ac "$gbk")
  taxon=$(taxid "$gbk")
  description=$(definition "$gbk")

  echo ">${accession} taxid=${taxon}; ${description}"

  # ORIGIN is matched without requiring a trailing blank, so files that
  # write the keyword alone on its line are handled as well.
  "${reader[@]}" -- "$gbk" \
    | ${AwkCmd:-awk} '
        /^\/\//    { on = 0 }
        (on == 1)  { print $0 }
        /^ORIGIN/  { on = 1 }' \
    | sed -E 's/^ *[0-9]+ +//' \
    | sed 's/ //g'
}
|
|
|
|
|
|
|
|
# List the tRNA genes that aragorn reports with a "(cat)" summary line.
# The GenBank file is first converted with gb2fasta into a temporary FASTA
# file in the current directory, which is removed before returning.
#   $1 - GenBank file
# Output: one "begin:end:sequence" token per hit, on stdout. Fields 3 and 6
#         of the joined aragorn record supply the location and the sequence.
# Globals: AwkCmd (read) - awk command to run; falls back to `awk` if unset.
function findCAUtrna {
  local fasta_tmp="$$.genome.fasta"

  gb2fasta "$1" > "$fasta_tmp"

  # First awk: glue each "(cat)" summary line and the lines that follow it
  #            into one record, ending at the next line starting with a number.
  #            The data is printed through "%s" so a stray "%" in the input
  #            is never interpreted as a format directive.
  # Second awk: blank lines (the END newline when no record is open) are
  #            dropped instead of being turned into a bogus ":" token.
  aragorn -i -w -seq "$fasta_tmp" \
    | ${AwkCmd:-awk} '
        (on == 1) && /^ *[0-9]+/  { on = 0; print "" }
        (on == 1)                 { printf("%s", $0) }
        /\(cat\)$/                { on = 1; printf("%s ", $0) }
        END                       { print "" }' \
    | ${AwkCmd:-awk} 'NF > 0 {print $3,$6}' \
    | sed -E 's/c?\[([0-9]+),([0-9]+)\]/\1 \2/' \
    | sed 's/ /:/g'

  rm -f -- "$fasta_tmp"
}
|
|
|
|
|
|
|
|
# Extract the annotated tRNA genes of a GenBank flat file.
# Only tRNA features carrying a /gene="..." qualifier are kept, and features
# whose location contains "<" or ">" are discarded.
#   $1 - GenBank file; decompressed on the fly when its name ends in .gz
# Output: one "begin end gene" line per retained feature, on stdout.
# Globals: AwkCmd (read) - awk command to run; falls back to `awk` if unset.
function trnaAnnotations {
  local gbk=$1
  local -a reader=(cat)

  # `gzip -dc` replaces `gzcat`, which is not installed on many Linux hosts.
  if [[ "$gbk" =~ \.gz$ ]]; then
    reader=(gzip -dc)
  fi

  # Stage 1: keep the lines strictly between FEATURES and ORIGIN.
  # Stage 2: join every feature onto one line. A feature starts on a line
  #          whose first non-blank character is in column 6 (feature key
  #          position in the GenBank feature table); qualifier lines are
  #          indented further and are appended to the current record.
  # Then:    normalise blanks, keep tRNA features with a gene name, turn
  #          "a..b" into "a b", collapse the inner bounds of multi-segment
  #          locations, unwrap join()/complement(), and print
  #          begin, end and gene name.
  "${reader[@]}" -- "$gbk" \
    | ${AwkCmd:-awk} '
        /^ORIGIN/   { on = 0 }
        (on == 1)   { print $0 }
        /^FEATURE/  { on = 1 }' \
    | ${AwkCmd:-awk} '
        match($0, /[^ ]/) == 6  { print "" }
                                { printf("%s ", $0) }
        END                     { print "" }' \
    | sed 's/^ *//' \
    | sed -E 's/ +/ /g' \
    | grep '^tRNA' \
    | grep '/gene="' \
    | sed -E 's/([0-9]+)\.\.([0-9]+)/\1 \2/g' \
    | sed -E 's/ [0-9]+,[0-9]+ / /g' \
    | grep -v '>' \
    | grep -v '<' \
    | sed -E 's/join\(([0-9]+ [0-9]+)\)/\1/' \
    | sed -E 's/complement\(([0-9]+ [0-9]+)\)/\1/' \
    | sed -E 's/join\(([0-9]+ [0-9]+)\)/\1/' \
    | sed 's/^tRNA *//' \
    | sed -E 's@([0-9]+) +([0-9]+).*/gene="([^"]+)"@\1 \2 \3@' \
    | ${AwkCmd:-awk} '{print $1,$2,$3}'
}
|
|
|
|
|
|
|
|
# Find the annotated tRNA closest to a predicted one.
#   $1 - "begin:end[:sequence]" token describing the predicted tRNA
#   $2 - annotation file with "begin end gene" lines
# Output: "distance gene" for the annotation line with the smallest distance,
#         where distance is sqrt((begin-b)^2 + (end-e)^2) truncated to an
#         integer. Nothing is printed when the annotation file is empty.
# Globals: AwkCmd (read) - awk command to run; falls back to `awk` if unset.
function annotateCAU {
  local begin end _rest

  # Split the token on ':'; anything after the second field is ignored.
  IFS=: read -r begin end _rest <<< "$1"

  # The distance is computed directly in awk and prefixed (tab-separated) to
  # each annotation line, so neither bc nor a scratch file is required.
  ${AwkCmd:-awk} -v b="$begin" -v e="$end" '
      {
        db = $1 - b
        de = $2 - e
        printf("%d\t%s\n", int(sqrt(db * db + de * de)), $0)
      }' "$2" \
    | sort -nk 1 \
    | head -n 1 \
    | ${AwkCmd:-awk} '{print $1,$4}'
}
|
|
|
|
|
|
|
|
# Write, in FASTA format, the predicted CAU tRNAs of one GenBank file that
# coincide with an annotated tRNA gene.
# Each token from findCAUtrna is matched to its nearest annotation with
# annotateCAU; hits whose distance is at most 10 are printed as a header
# line followed by the third ':'-separated field of the token (the sequence).
# The annotations go to a scratch file in the current directory, removed
# before returning.
#   $1 - GenBank file
# Globals: AwkCmd (read) - awk command to run; falls back to `awk` if unset.
function writeTRNA {
  local gbk=$1
  local trna_tmp="$$.trna.txt"
  # Declared separately from the assignments so a failing helper is not
  # hidden behind the exit status of `local`.
  local TAXID AC DEFINITION
  local ntrna trnacau t distance aa
  local -a hit

  TAXID=$(taxid "$gbk")
  AC=$(ac "$gbk")
  DEFINITION=$(definition "$gbk")

  trnaAnnotations "$gbk" > "$trna_tmp"
  ntrna=$(wc -l < "$trna_tmp")

  if (( ntrna > 0 )); then
    trnacau=$(findCAUtrna "$gbk")

    # Unquoted on purpose: one whitespace-separated token per predicted tRNA.
    for t in $trnacau; do
      # Unquoted on purpose: split "distance gene" into two array elements.
      hit=( $(annotateCAU "$t" "$trna_tmp") )
      distance=${hit[0]:-}

      # Without a numeric distance the arithmetic test below would be a
      # syntax error; such a token cannot be matched, so it is skipped.
      [[ "$distance" =~ ^[0-9]+$ ]] || continue

      # Normalise gene names such as tRNA-Met / trnM / trnfM to trnM / trnfM.
      aa=$(echo "${hit[1]:-}" | sed -E 's/(t(rn|RNA)-?)?(I|M|fM).*$/trn\3/')

      if (( distance <= 10 )); then
        echo ">${aa}_${AC} gbac=${AC}; trna=${aa}; taxid=${TAXID}; distance=${distance}; ${DEFINITION}"
        echo "$t" | ${AwkCmd:-awk} -F ':' '{print $3}'
      fi
    done
  fi

  rm -f -- "$trna_tmp"
}
|
|
|
|
|
|
|
|
# pushTmpDir / popTmpDir are not defined in this file (presumably provided by
# the sourced init script - confirm there what they do to the working dir).
pushTmpDir ORG.buildCAUtRNA

# "$@" keeps every command-line file name as a single word, so paths that
# contain blanks or glob characters reach writeTRNA unchanged.
for gb in "$@"; do
  writeTRNA "$gb"
done

popTmpDir
|
|
|
|
|
|
|
|
|