Files
annotate/detectors/trna/tools/extract_refCAUtRNA.sh
Eric Coissac 6015339839 First version of the scripts for building the CAU tRNA database
Former-commit-id: 443a0446133850bb57211c27d2001241b03472bd
Former-commit-id: 8af464a0786f81dddf795f0cbda9ada9974808bf
2015-10-10 19:12:02 -03:00

130 lines
3.3 KiB
Bash
Executable File

#!/bin/bash
#
#                 BUILD THE CAU tRNA REFERENCE LIBRARY
#
# Every command-line argument is handed to writeTRNA (see the loop at the
# end of this file); the functions below parse GenBank flat-file keywords
# (DEFINITION, FEATURES, ORIGIN, /db_xref) out of those files.
#
#========================================================================================

# -- CAUTION -- This works only as long as the script is not called
#               through a symlink: SCRIPT_DIR is derived from the path the
#               script was invoked with, not from its resolved location.
# The inner expansion is quoted so an install path containing spaces still
# resolves to a single directory name.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"

# Shared helpers; pushTmpDir / popTmpDir used at the bottom of this file are
# presumably defined there -- TODO confirm.
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
#######################################
# Print the taxonomy identifier found in a GenBank flat file.
#
# Selects the lines carrying a /db_xref="taxon:NNN" qualifier and reduces
# them to NNN. Only the first hit is printed: callers interpolate the value
# into a one-line FASTA header, and ac likewise reads only the first line
# of the file.
#
# Arguments: $1 - path to the GenBank flat file
# Outputs:   the numeric taxid on stdout; nothing when no taxon xref exists
#######################################
function taxid {
  # grep -E replaces the obsolescent egrep; -e/-- keep an odd file name from
  # being parsed as an option.
  grep -E -e '/db_xref="taxon:[0-9]+"' -- "$1" \
    | sed -E 's@ +/db_xref="taxon:([0-9]+)"@\1@' \
    | head -n 1
}
#######################################
# Print the second whitespace-separated field of the first line of a file.
#
# For a GenBank flat file the first line is presumably the LOCUS line, which
# makes this field the locus name / accession -- verify against the inputs.
#
# Arguments: $1 - path to the GenBank flat file
# Outputs:   the field on stdout (an empty line if the first line has fewer
#            than two fields, nothing for an empty file)
#######################################
function ac {
  # One awk instead of head | awk; the path is quoted so names containing
  # spaces are read as a single file.
  awk 'NR == 1 { print $2; exit }' "$1"
}
#######################################
# Print the DEFINITION entry of a GenBank flat file on a single line.
#
# The DEFINITION line and the lines that follow it are concatenated, each
# followed by one space, until the next line that starts in column one
# (the next keyword). Ending there, rather than at the first line ending in
# a period, keeps entries such as "Quercus sp." followed by a continuation
# line intact, and stops cleanly when the entry has no final period.
# Only the first DEFINITION of the file is reported, in line with ac which
# reads only the first line. The "DEFINITION" keyword and trailing blanks
# are stripped; the indentation of continuation lines is left untouched.
#
# Arguments: $1 - path to the GenBank flat file
# Outputs:   the definition text on stdout; nothing if there is none
#######################################
function definition {
  awk '
    on && /^[^ ]/  { exit }
    /^DEFINITION/  { on = 1 }
    on             { printf("%s ", $0) }
    END            { if (on) print "" }
  ' "$1" \
    | sed -e 's/^DEFINITION *//' -e 's/ *$//'
}
#######################################
# Convert a GenBank flat file to FASTA on stdout.
#
# The header line is ">AC taxid=TAXID; DEFINITION", built from the ac, taxid
# and definition helpers. The sequence is every line found after the ORIGIN
# line and before the "//" terminator, with the leading position counter and
# all blanks removed.
#
# The ORIGIN line is matched without requiring a trailing space, the same
# way trnaAnnotations matches it, so files whose ORIGIN line is bare still
# yield their sequence.
#
# Arguments: $1 - path to the GenBank flat file
# Outputs:   FASTA record on stdout
#######################################
function gb2fasta {
  local gbfile="$1"
  # Kept local so the header fields never overwrite same-named variables of
  # the caller.
  local accession taxon description

  accession=$(ac "${gbfile}")
  taxon=$(taxid "${gbfile}")
  description=$(definition "${gbfile}")

  printf '>%s taxid=%s; %s\n' "${accession}" "${taxon}" "${description}"

  # Rule order matters: the ORIGIN line switches printing on only after the
  # print rule has been evaluated, so ORIGIN itself is never emitted.
  awk '/^\/\// {on=0}
       (on==1) {print $0}
       /^ORIGIN/ {on=1}' "${gbfile}" \
    | sed -E -e 's/^ *[0-9]+ +//' -e 's/ //g'
}
#######################################
# List the tRNA genes with a CAU anticodon that aragorn predicts in a
# GenBank flat file.
#
# The file is converted with gb2fasta into a temporary FASTA file in the
# current directory and scanned with "aragorn -i -w -seq". Every report line
# ending in "(cat)" is glued to the lines that follow it, up to the next
# line starting with a number. From the glued record, fields 3 and 6 are
# kept and the "[start,end]" / "c[start,end]" location is unpacked.
# NOTE(review): this presumably relies on aragorn printing one summary line
# per gene followed by its sequence lines -- confirm against the aragorn
# version in use.
#
# The awk END rule closes the last record only when one is still open, so
# no blank record (which would end up as a bare ":" token) is emitted when
# the last reported gene is not a CAU tRNA. Sequence lines are printed with
# an explicit "%s" format instead of being used as the format string.
#
# Arguments: $1 - path to the GenBank flat file
# Outputs:   one "start:end:sequence" token per line on stdout
#######################################
function findCAUtrna {
  local fasta_tmp="$$.genome.fasta"

  gb2fasta "$1" > "${fasta_tmp}"

  aragorn -i -w -seq "${fasta_tmp}" \
    | awk '(on==1) && /^ *[0-9]+/ {on=0; print ""}
           (on==1)                {printf("%s", $0)}
           /\(cat\)$/             {on=1; printf("%s ", $0)}
           END                    {if (on==1) print ""}' \
    | awk '{print $3,$6}' \
    | sed -E 's/c?\[([0-9]+),([0-9]+)\]/\1 \2/' \
    | sed 's/ /:/g'

  rm -f "${fasta_tmp}"
}
#######################################
# List the complete tRNA features annotated in a GenBank flat file.
#
# Pipeline:
#   1. keep the lines between the FEATURES header and the ORIGIN line;
#   2. put every feature on one line. A new feature starts on a line with
#      exactly five leading spaces, which is where the feature key sits in
#      the GenBank feature table; qualifier and continuation lines are
#      indented further. Matching a single leading space never splits
#      anything, and no tRNA would ever be reported;
#   3. trim and squeeze blanks, keep the tRNA features that have a /gene
#      qualifier;
#   4. turn "a..b" into "a b" and collapse the inner bounds of a two-part
#      join, so "join(a..b,c..d)" becomes "join(a d)";
#   5. drop partial features (locations containing "<" or ">");
#   6. unwrap join(...) and complement(...), then keep start, end and the
#      gene name.
#
# Arguments: $1 - path to the GenBank flat file
# Outputs:   one "start end gene" line per tRNA on stdout
#######################################
function trnaAnnotations {
  awk '/^ORIGIN/  {on=0}
       (on==1)    {print $0}
       /^FEATURE/ {on=1}' "$1" \
    | awk '(match($0, /^ +/) && RLENGTH == 5) {print ""}
           {printf("%s ", $0)}
           END {print ""}' \
    | sed -E -e 's/^ *//' -e 's/ +/ /g' \
    | grep '^tRNA' \
    | grep '/gene="' \
    | sed -E -e 's/([0-9]+)\.\.([0-9]+)/\1 \2/g' \
             -e 's/ [0-9]+,[0-9]+ / /g' \
    | grep -v '[<>]' \
    | sed -E -e 's/join\(([0-9]+ [0-9]+)\)/\1/' \
             -e 's/complement\(([0-9]+ [0-9]+)\)/\1/' \
             -e 's/join\(([0-9]+ [0-9]+)\)/\1/' \
             -e 's/^tRNA *//' \
             -e 's@([0-9]+) +([0-9]+).*/gene="([^"]+)"@\1 \2 \3@' \
    | awk '{print $1,$2,$3}'
}
#######################################
# Find the annotated tRNA whose coordinates are closest to a predicted one.
#
# For every "start end gene" line of the annotation file the Euclidean
# distance sqrt((start - b)^2 + (end - e)^2) to the predicted (b, e) is
# computed and truncated to an integer; the line with the smallest distance
# wins.
#
# The distance is computed inside awk, so no temporary file, bc or paste is
# needed; the truncation matches cutting the decimals off the bc result.
#
# Arguments: $1 - predicted tRNA as "start:end:sequence"
#            $2 - path to a file of "start end gene" lines
# Outputs:   "distance gene" of the nearest annotation on stdout; nothing
#            when the annotation file is empty
#######################################
function annotateCAU {
  local -a span
  IFS=':' read -r -a span <<< "$1"

  # int() mirrors the integer conversion formerly done by printf %d, so
  # missing or non-numeric fields count as 0.
  awk -v b="${span[0]:-}" -v e="${span[1]:-}" \
    '{
       ds = int($1) - int(b)
       de = int($2) - int(e)
       printf("%d\t%s\n", int(sqrt(ds * ds + de * de)), $0)
     }' "$2" \
    | sort -nk 1 \
    | head -n 1 \
    | awk '{print $1,$4}'
}
#######################################
# Write, in FASTA format, the CAU tRNAs of one GenBank flat file that can be
# matched to an annotated tRNA.
#
# The annotated tRNAs (trnaAnnotations) are saved to a temporary file in the
# current directory. When there is at least one, every tRNA predicted by
# findCAUtrna is paired with its nearest annotation (annotateCAU) and
# reported if that distance is at most 10. The gene name is normalised to
# trnI, trnM or trnfM by the sed expression below.
#
# Predicted tokens that are not of the form "start:end:sequence" and
# annotateCAU answers without a numeric distance are skipped; an empty
# distance would otherwise evaluate as 0 and pass the threshold.
#
# Arguments: $1 - path to the GenBank flat file
# Outputs:   for each accepted tRNA, a header line
#            ">NAME_AC gbac=AC; trna=NAME; taxid=TAXID; distance=D; DEFINITION"
#            followed by the sequence, on stdout
#######################################
function writeTRNA {
  local gbfile="$1"
  local annotations="$$.trna.txt"
  local -r token_re='^[0-9]+:[0-9]+:[^:]+$'
  local -r number_re='^[0-9]+$'
  local taxon accession description
  local candidates candidate distance gene label

  taxon=$(taxid "${gbfile}")
  accession=$(ac "${gbfile}")
  description=$(definition "${gbfile}")

  trnaAnnotations "${gbfile}" > "${annotations}"

  # A non-empty file means at least one annotated tRNA line was written.
  if [[ -s "${annotations}" ]]; then
    candidates=$(findCAUtrna "${gbfile}")

    # Tokens never contain blanks, so reading line by line visits the same
    # items as word splitting would, without exposing them to globbing.
    while IFS= read -r candidate; do
      [[ "${candidate}" =~ ${token_re} ]] || continue

      read -r distance gene _ <<< "$(annotateCAU "${candidate}" "${annotations}")"
      [[ "${distance}" =~ ${number_re} ]] || continue

      label=$(printf '%s\n' "${gene}" \
        | sed -E 's/(t(rn|RNA)-?)?(I|M|fM).*$/trn\3/')

      if (( distance <= 10 )); then
        printf '>%s_%s gbac=%s; trna=%s; taxid=%s; distance=%s; %s\n' \
          "${label}" "${accession}" "${accession}" "${label}" \
          "${taxon}" "${distance}" "${description}"
        # Third field of the token: the predicted tRNA sequence.
        printf '%s\n' "${candidate##*:}"
      fi
    done <<< "${candidates}"
  fi

  rm -f "${annotations}"
}
# Main: run writeTRNA on every file named on the command line, between a
# pushTmpDir / popTmpDir pair. Both helpers are defined outside this file
# (presumably in bash_init.sh -- TODO confirm).
# NOTE(review): the helper functions create their scratch files in the
# current directory; if pushTmpDir changes directory, relative input paths
# would no longer resolve -- confirm callers pass absolute paths.
pushTmpDir ORG.buildCAUtRNA

# "$@" keeps each argument as one word, so file names containing blanks or
# glob characters reach writeTRNA unchanged.
for gb in "$@"; do
  writeTRNA "${gb}"
done

popTmpDir