Files
annotate/detectors/cds/tools/go_rps12db.sh

107 lines
3.4 KiB
Bash
Raw Normal View History

#!/bin/bash
#
# BUILD REFERENCE : THE RPS12 LIBRARY
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
source "${THIS_DIR}/lib/clusterize_prot.sh"
function extract_rps12() {
$AwkCmd ' \
/^LOCUS/ {LOCUS=$2;} \
/^ [^ ]/ { if (CDS) { \
print LOCUS "/" feature; \
print "#################" \
} \
CDS=0; \
} \
/^ CDS / {CDS=1; \
$1=""; \
feature=""} \
(CDS) { sub(/^ */,"",$0); \
sub(/ *$/,"",$0); \
feature=feature $0} \
' \
| egrep -i '"rps12"' \
| $AwkCmd -F"/" ' \
function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \
} \
\
($1 != current) {current=$1; \
n=1 \
} \
{$1=$1 "_rps12_" n; \
n++; \
delete keys; \
for (i=3; i<=NF; i++) { \
split($i,key,"="); \
keys[key[1]]=key[2] \
} \
prot = keys["translation"]; \
gsub(/"/,"",prot); \
print ">" $1,"location=" $2 ";"; \
printfasta(prot) \
} \
'
}
pushTmpDir ORG.buildRPS12DB
RPS12FILE=RPS12_prot.fst
openLogFile "${CDS_DATA_DIR}/chlorodb/RPS12_DB.log"
loginfo "Selecting Viridiplantae genbank entries..."
VIRIDIPLANTAE=$(${PROG_DIR}/../../normalize/tools/selectViridiplantae.sh $*)
loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected"
loginfo "Done"
loginfo "Extracting the RPS12 protein sequences from the plants entries..."
( for gbk in ${VIRIDIPLANTAE} ; do
gzcat $gbk | \
extract_rps12
done ) > ${RPS12FILE}
loginfo "Done"
loginfo "Installing the RPS12 protein sequence database..."
cp ${RPS12FILE} "${CDS_DATA_DIR}/chlorodb/RPS12_DB.fst"
loginfo "Done"
popTmpDir
pushd "${CDS_DATA_DIR}/chlorodb"
loginfo "Clusterizing the RPS12 protein sequence database..."
rm -rf RPS12_DB.clean.fst
clusterize RPS12_DB
loginfo "Done"
loginfo " formatting Blast RPS12 DB"
timeoutcmd 300 makeblastdb -dbtype prot -in RPS12_DB.clean.fst >& /dev/null
loginfo "Done"
popd
#
# format blast protein database
#
loginfo "Done"