First version of the scripts for building the CAU tRNA database

Former-commit-id: 443a0446133850bb57211c27d2001241b03472bd
Former-commit-id: 8af464a0786f81dddf795f0cbda9ada9974808bf
This commit is contained in:
2015-10-10 19:12:02 -03:00
parent c402923d0f
commit 6015339839
11 changed files with 1531 additions and 1289 deletions

1257
data/trna/CAU_tRNA_DB.fasta Normal file

File diff suppressed because it is too large Load Diff

18
data/trna/CAU_tRNA_DB.log Normal file
View File

@ -0,0 +1,18 @@
2015-10-10 18:52:35 [OA INFO ] 63859 -- Selecting Viridiplantae genebank entries...
2015-10-10 18:52:44 [OA INFO ] 63859 -- --> 695 entries selected
2015-10-10 18:52:44 [OA INFO ] 63859 -- Done
2015-10-10 18:52:44 [OA INFO ] 63859 -- Extracting the CAU tRNA from the plants entries...
2015-10-10 19:03:07 [OA INFO ] 63859 -- Done
2015-10-10 19:03:07 [OA INFO ] 63859 -- Sorting the CAU tRNA...
2015-10-10 19:03:07 [OA INFO ] 63859 -- Extract and dereplicate trnI...
2015-10-10 19:03:07 [OA INFO ] 63859 -- Extract and dereplicate trnM...
2015-10-10 19:03:07 [OA INFO ] 63859 -- Extract and dereplicate trnfM...
2015-10-10 19:03:07 [OA INFO ] 63859 -- Done
2015-10-10 19:03:07 [OA INFO ] 63859 -- Cleaning the CAU tRNA...
2015-10-10 19:03:07 [OA INFO ] 63859 -- First pass...
2015-10-10 19:03:09 [OA INFO ] 63859 -- --> 420 trnCAU.good.ids sequences kept
2015-10-10 19:03:09 [OA INFO ] 63859 -- Second pass...
2015-10-10 19:03:11 [OA INFO ] 63859 -- --> 419 trnCAU.good.ids sequences kept
2015-10-10 19:03:11 [OA INFO ] 63859 -- Done
2015-10-10 19:03:11 [OA INFO ] 63859 -- Installing the CAU tRNA database...
2015-10-10 19:03:11 [OA INFO ] 63859 -- Done

Binary file not shown.

View File

@ -1 +0,0 @@
0b9d49e3330cd8e6abe0c8b010e9931f7b2f22f3

Binary file not shown.

View File

@ -1 +0,0 @@
87cabbb1913329a9e693acad81654beb01f0e9bc

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,124 @@
#!/bin/bash
#
# BUILD REFERENCE THE CAU TRNA LIBRARy
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
function fasta1li {
awk '/^>/ {if (sequence) \
{print sequence}; \
print $0; \
sequence=""} \
!/^>/ {sequence = sequence $0} \
END {print sequence}' $1
}
function dereplicate {
DATA=$1
sumaclust -t 1 $DATA | \
fasta1li | \
grep -A 1 '^>' | \
grep -A1 'cluster_center=True;' | \
grep -v -- -- | \
sed -E "s/count=[0-9]+; //" | \
sed 's/cluster_weight/count/' | \
awk ' /^>/ {SEQ++;$1=$1"_"SEQ;print $0} \
!/^>/ {print $0}'
}
function extractSeqs {
rm -f $$.index
fastaindex -f $1 \
-i $$.index
for id in `cat $2`; do
fastafetch -f $1 \
-i $$.index \
-q $id
done
rm -f $$.index
}
function goodtrna {
local QUERY=$1
local REF=$2
sumatra -t 0.90 -x $QUERY $REF | \
sed -E 's/.(trn.M?)[_A-Z0-9]+/ \1 /' | \
sort -k 1,2 | \
awk '(OLD) && ($1!=OLD) {print OLD,c["trnM"],c["trnfM"],c["trnI"]} \
(OLD !=$1) {c["trnM"]=0;c["trnfM"]=0;c["trnI"]=0;OLD=$1} \
{c[$2]+=$5}' | awk '{p=0;} \
($2 > $3) && ($2 > $4) { print $0,"trnM";p=1 } \
($3 > $2) && ($3 > $4) {print $0,"trnfM";p=1} \
($4 > $2) && ($4 > $3) {print $0,"trnI";p=1} \
(p==0) {print $0,"----"}' | sed 's/_/ /' | \
awk '{print $1"_"$2,$3,$4,$5,$1,$6}' | \
awk '(($2+$3+$4) > 1) && ($5==$6) {print $1}'
}
pushTmpDir ORG.buildSCDB
CAUFILE=CAU.fasta
openLogFile "${TRNA_DATA_DIR}/CAU_tRNA_DB.log"
loginfo "Selecting Viridiplantae genebank entries..."
VIRIDIPLANTAE=$(${PROG_DIR}/../../normalize/tools/selectViridiplantae.sh $*)
loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected"
loginfo "Done"
loginfo "Extracting the CAU tRNA from the plants entries..."
${PROG_DIR}/extract_refCAUtRNA.sh ${VIRIDIPLANTAE} | \
fasta1li | \
egrep -A 1 '^>trn(I|M|fM)' | \
grep -v -- -- > ${CAUFILE}
loginfo "Done"
loginfo "Sorting the CAU tRNA..."
loginfo "Extract and dereplicate trnI..."
egrep -A 1 '^>trnI_' ${CAUFILE} | grep -v -- -- > trnI.fasta
dereplicate trnI.fasta > trnCAU.fasta
loginfo "Extract and dereplicate trnM..."
egrep -A 1 '^>trnM_' ${CAUFILE} | grep -v -- -- > trnM.fasta
dereplicate trnM.fasta >> trnCAU.fasta
loginfo "Extract and dereplicate trnfM..."
egrep -A 1 '^>trnfM_' ${CAUFILE} | grep -v -- -- > trnfM.fasta
dereplicate trnfM.fasta >> trnCAU.fasta
loginfo "Done"
loginfo "Cleaning the CAU tRNA..."
loginfo "First pass..."
goodtrna trnCAU.fasta trnCAU.fasta > trnCAU.good.ids
extractSeqs trnCAU.fasta trnCAU.good.ids > trnCAU.good.fasta
loginfo " --> $(wc -l trnCAU.good.ids) sequences kept"
loginfo "Second pass..."
goodtrna trnCAU.fasta trnCAU.good.fasta > trnCAU.good.ids
extractSeqs trnCAU.fasta trnCAU.good.ids > trnCAU.good.fasta
loginfo " --> $(wc -l trnCAU.good.ids) sequences kept"
loginfo "Done"
loginfo "Installing the CAU tRNA database..."
cp trnCAU.good.fasta "${TRNA_DATA_DIR}/CAU_tRNA_DB.fasta"
loginfo "Done"
popTmpDir

View File

@ -0,0 +1,129 @@
#!/bin/bash
#
# BUILD REFERENCE THE CAU TRNA LIBRARy
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
function taxid {
egrep '/db_xref="taxon:[0-9]+"' $1 | \
sed -E 's@ +/db_xref="taxon:([0-9]+)"@\1@'
}
function ac {
head -1 $1 | awk '{print $2}'
}
function definition {
awk '/^DEFINITION/ {on=1} \
(on==1) {printf("%s ",$0)} \
(/\.$/ && (on==1)) {on=0;print ""}' $1 | \
sed 's/^DEFINITION *//' | \
sed 's/ *$//'
}
function gb2fasta {
AC=`ac $1`
TAXID=`taxid $1`
DEFINITION=`definition $1`
echo ">${AC} taxid=${TAXID}; ${DEFINITION}"
awk '/^\/\// {on=0} \
(on==1) {print $0} \
/^ORIGIN / {on=1}' $1 | \
sed -E 's/^ *[0-9]+ +//' | \
sed 's/ //g'
}
function findCAUtrna {
FASTATMP="$$.genome.fasta"
gb2fasta $1 > ${FASTATMP}
aragorn -i -w -seq ${FASTATMP} | \
awk '(on==1) && /^ *[0-9]+/ {on=0;print ""} \
(on==1) {printf($0)} \
/\(cat\)$/ {on=1; printf("%s ",$0)} \
END {print ""}' | \
awk '{print $3,$6}' | \
sed -E 's/c?\[([0-9]+),([0-9]+)\]/\1 \2/' | \
sed 's/ /:/g'
rm ${FASTATMP}
}
function trnaAnnotations {
awk '/^ORIGIN/ {on=0} \
(on==1) {print $0} \
/^FEATURE/ {on=1}' $1 | \
awk '/^ [^ ]/ {print ""} \
{printf("%s ",$0)} \
END {print ""}' | \
sed 's/^ *//' | \
sed -E 's/ +/ /g' | \
grep '^tRNA' | grep '/gene="' | \
sed -E 's/([0-9]+)\.\.([0-9]+)/\1 \2/g' | \
sed -E 's/ [0-9]+,[0-9]+ / /g' | \
grep -v '>' | \
grep -v '<' | \
sed -E 's/join\(([0-9]+ [0-9]+)\)/\1/' | \
sed -E 's/complement\(([0-9]+ [0-9]+)\)/\1/' | \
sed -E 's/join\(([0-9]+ [0-9]+)\)/\1/' | \
sed 's/^tRNA *//' | \
sed -E 's@([0-9]+) +([0-9]+).*/gene="([^"]+)"@\1 \2 \3@' | \
awk '{print $1,$2,$3}'
}
function annotateCAU {
DISTTMP="$$.trna.dist"
trna=(`echo $1 | sed 's/:/ /g'`)
awk -v b=${trna[0]} -v e=${trna[1]} \
'{printf("sqrt((%d - %d)^2 + (%d - %d)^2)\n",$1,b,$2,e)}' $2 | \
bc -l | \
sed 's/\..*$//' > ${DISTTMP}
paste ${DISTTMP} $2 | sort -nk 1 | head -1 | awk '{print $1,$4}'
rm -f ${DISTTMP}
}
function writeTRNA {
TAXID=`taxid $1`
AC=`ac $1`
DEFINITION=`definition $1`
TRNATMP="$$.trna.txt"
trnaAnnotations $1 > ${TRNATMP}
ntrna=`wc -l ${TRNATMP} | awk '{print $1}'`
if (( ntrna > 0 )); then
trnacau=`findCAUtrna $1`
for t in $trnacau; do
AA=(`annotateCAU $t ${TRNATMP}`)
distance=${AA[0]}
aa=`echo ${AA[1]} | sed -E 's/(t(rn|RNA)-?)?(I|M|fM).*$/trn\3/'`
if (( distance <= 10 )); then
echo ">${aa}_${AC} gbac=${AC}; trna=${aa}; taxid=${TAXID}; distance=${distance}; ${DEFINITION}"
echo "$t" | awk -F ':' '{print $3}'
fi
done
fi
rm -f ${TRNATMP}
}
pushTmpDir ORG.buildCAUtRNA
for gb in $*; do
writeTRNA $gb
done
popTmpDir

View File

@ -1,24 +0,0 @@
#!/bin/bash
SUMATRA=`dirname $0`/sumatra
function annotateCAU {
QUERY="$$.query.fasta"
echo $1 | sed 's/&/ /' | tr '@' '\n' > ${QUERY}
${SUMATRA} -d -n ${QUERY} $2 2> /dev/null | \
awk ' {n[$2]+=1;d[$2]+=$3} \
END {for (i in n) \
print i, n[i],d[i], d[i]/n[i]\
}' | \
sort -rnk4 | \
egrep '^trn(I|M|fM)' | \
tail -1 | \
awk '{print $1,$NF}'
rm -rf ${QUERY}
}
for seq in `awk '(on==1) && /^>/ {print "";on=0} /^>/ {printf("%s@",$0)} ! /^>/ {on=1;printf($0)}' $1 | tr ' ' '&'`; do
new=(`annotateCAU ${seq} $1`)
echo $seq | sed 's/&/ /g' | sed -E 's/>([^ ]+) />'${new[0]}' /' | tr '@' '\n'
done

View File

@ -142,6 +142,9 @@ CALL_DIR="$(getAbsolutePath $(pwd))" # Directory from where the main
IR_DATA_DIR="${DATA_DIR}/ir" # Directory containing data related to the
# Inverted repeat strucuture
TRNA_DATA_DIR="${DATA_DIR}/trna" # Directory containing data related to the
# Inverted repeat strucuture
#
#