Adds detection of RPS12 and management of locus tags
Former-commit-id: b9b17708eaaa27580f1e99bd3c375d4b6aba4d79 Former-commit-id: 369361ffa58e65b19ab1005bdf7960924f24ca08
This commit is contained in:
106
detectors/cds/tools/go_rps12db.sh
Executable file
106
detectors/cds/tools/go_rps12db.sh
Executable file
@ -0,0 +1,106 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BUILD REFERENCE : THE RPS12 LIBRARY
|
||||
#
|
||||
#========================================================================================
|
||||
|
||||
# -- CAUTION -- Works as long as the script
|
||||
# is not called through a symlink
|
||||
|
||||
# Directory holding this script, derived from the path bash used to load it.
# The inner expansion is quoted so that a path containing whitespace is passed
# to dirname as a single argument.
THIS_DIR="$(dirname "${BASH_SOURCE[0]}")"

# Project-wide shell initialisation, then the protein clustering helpers.
# Both paths are relative to THIS_DIR.
source "${THIS_DIR}/../../../scripts/bash_init.sh"
source "${THIS_DIR}/lib/clusterize_prot.sh"
|
||||
|
||||
#
# extract_rps12
#
# Reads GenBank flat-file text on stdin and writes one FASTA entry on stdout
# for every CDS feature whose text contains "rps12" (double quotes included,
# case-insensitive).
#
# Pipeline:
#   1. first awk pass  : joins each CDS feature (location followed by its
#                        qualifier lines) into a single line "<LOCUS>/<feature>"
#   2. grep            : keeps the lines containing "rps12"
#   3. second awk pass : splits each line on "/", names the entry
#                        "<LOCUS>_rps12_<n>" (n restarts at 1 for each LOCUS)
#                        and prints the /translation value as 60-column FASTA
#
# Input  : stdin
# Output : stdout
# Needs  : $AwkCmd must hold the awk command to run.
#
# A pending CDS is flushed when a line starting in column one is met (for
# instance ORIGIN, // or the next LOCUS) and at end of input, so a CDS that
# is the last feature of a record is no longer lost or merged with the
# sequence lines that follow it.
#
# NOTE(review): GenBank flat files indent feature keys by five spaces, while
# the feature patterns below are written with a single leading space, exactly
# as they appear in this copy of the script -- confirm they match the data
# actually piped into this function.
#
function extract_rps12() {
    $AwkCmd '
        # Print the CDS being accumulated, if any, then leave CDS mode.
        function flush_cds() {
            if (CDS) {
                print LOCUS "/" feature
                print "#################"
            }
            CDS = 0
        }

        # A line starting in column one closes the current feature.
        # This rule must stay before the LOCUS rule so that the pending
        # CDS is printed with the LOCUS it belongs to.
        /^[^ ]/  { flush_cds() }

        /^LOCUS/ { LOCUS = $2 }

        # Any feature key line closes the previous feature.
        /^ [^ ]/ { flush_cds() }

        # Start of a CDS: drop the feature key, keep the location.
        /^ CDS / {
            CDS = 1
            $1 = ""
            feature = ""
        }

        # Accumulate the trimmed lines of the current CDS.
        (CDS) {
            sub(/^ */, "", $0)
            sub(/ *$/, "", $0)
            feature = feature $0
        }

        END { flush_cds() }
    ' \
    | grep -E -i '"rps12"' \
    | $AwkCmd -F"/" '
        # Print seq folded in lines of at most 60 characters.
        # pos and seqlen are local variables.
        function printfasta(seq,    pos, seqlen) {
            seqlen = length(seq)
            for (pos = 1; pos <= seqlen; pos += 60)
                print substr(seq, pos, 60)
        }

        # Restart the per-locus counter when the LOCUS changes.
        ($1 != current) {
            current = $1
            n = 1
        }

        {
            $1 = $1 "_rps12_" n
            n++

            # Fields 3..NF are the qualifiers, each as key=value.
            delete keys
            for (i = 3; i <= NF; i++) {
                split($i, key, "=")
                keys[key[1]] = key[2]
            }

            prot = keys["translation"]
            gsub(/"/, "", prot)

            print ">" $1, "location=" $2 ";"
            printfasta(prot)
        }
    '
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Main: build the RPS12 reference library from the files given as script
# arguments.
#
# Steps, as executed below:
#   1. inside a temporary directory, select the Viridiplantae entries and
#      extract their RPS12 proteins into ${RPS12FILE};
#   2. copy that file to ${CDS_DATA_DIR}/chlorodb/RPS12_DB.fst;
#   3. from ${CDS_DATA_DIR}/chlorodb, clusterize the database and format
#      RPS12_DB.clean.fst as a protein BLAST database.
#
# pushTmpDir, popTmpDir, openLogFile, loginfo, timeoutcmd and clusterize, as
# well as PROG_DIR and CDS_DATA_DIR, are not defined in this file; presumably
# they come from the sourced libraries -- verify there.
# ---------------------------------------------------------------------------
pushTmpDir ORG.buildRPS12DB

    RPS12FILE=RPS12_prot.fst

    openLogFile "${CDS_DATA_DIR}/chlorodb/RPS12_DB.log"

    loginfo "Selecting Viridiplantae genbank entries..."
        # "$@" hands every script argument over unchanged, without the
        # re-splitting and glob expansion an unquoted $* would cause.
        VIRIDIPLANTAE=$("${PROG_DIR}/../../normalize/tools/selectViridiplantae.sh" "$@")
        loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected"
    loginfo "Done"

    loginfo "Extracting the RPS12 protein sequences from the plants entries..."
        # VIRIDIPLANTAE is deliberately left unquoted: it is a
        # whitespace-separated list of file names.
        ( for gbk in ${VIRIDIPLANTAE} ; do
              gzcat "$gbk" | \
                  extract_rps12
          done ) > "${RPS12FILE}"
    loginfo "Done"

    loginfo "Installing the RPS12 protein sequence database..."

        cp "${RPS12FILE}" "${CDS_DATA_DIR}/chlorodb/RPS12_DB.fst"

    loginfo "Done"

popTmpDir

pushd "${CDS_DATA_DIR}/chlorodb"

    loginfo "Clusterizing the RPS12 protein sequence database..."
        # Remove any previous result before clustering again.
        rm -rf RPS12_DB.clean.fst
        clusterize RPS12_DB
    loginfo "Done"

    loginfo " formatting Blast RPS12 DB"
        # The output of makeblastdb (stdout and stderr) is discarded.
        timeoutcmd 300 makeblastdb -dbtype prot -in RPS12_DB.clean.fst >& /dev/null
    loginfo "Done"

popd

# Final end-of-script entry in the log.
loginfo "Done"
|
||||
|
@ -49,6 +49,7 @@ pushTmpDir ORG.rrna
|
||||
print "FT rRNA " loc; \
|
||||
print "FT /gene=\""rrna" rRNA\""
|
||||
print "FT /product=\""rrna" ribosomal RNA\""
|
||||
print "FT /locus_tag=\"\"";
|
||||
full=0
|
||||
}'
|
||||
|
||||
|
@ -167,7 +167,9 @@ function emblTRNA(geneid,trna,loc,anti,intron,notes,seq) {
|
||||
print "FT tRNA " loc;
|
||||
print "FT /gene=\""trna"\"";
|
||||
print "FT /anticodon=\""anti"\"";
|
||||
# print "FT /note=\"*anticodon: "anti"\"";
|
||||
print "FT /product=\""product"("anti")\"";
|
||||
print "FT /locus_tag=\"\"";
|
||||
# print "FT /inference=\"Aragorn-1.2.38\"";
|
||||
if (notes!="-")
|
||||
print "FT /note=\""notes"\"";
|
||||
|
@ -497,10 +497,29 @@ pushTmpDir ORG.organnot
|
||||
match($0,"^[0-9]* ");\
|
||||
line=substr($0,RLENGTH+1);\
|
||||
gsub("@","\n",line); \
|
||||
print line}'
|
||||
print line}' > "${RESULTS}.sorted.annot"
|
||||
loginfo "Done."
|
||||
|
||||
|
||||
# Resolve the empty locus_tag placeholders (FT lines carrying /locus_tag="")
# of "${RESULTS}.sorted.annot" and write the result on stdout:
#   - idprefix different from "no": each placeholder is numbered in order of
#     appearance and becomes /locus_tag="<idprefix>_<n>", n starting at 1;
#   - idprefix equal to "no": the placeholder lines are removed.
if [[ "$idprefix" != "no" ]] ; then
    loginfo "Adding locus tags..."
    $AwkCmd -v idprefix="$idprefix" '
        BEGIN { n = 1 }

        # Fill in the next empty locus tag and move the counter on.
        /^FT +\/locus_tag=""/ {
            sub(/locus_tag=""/, "locus_tag=\"" idprefix "_" n "\"", $0)
            n++
        }

        { print $0 }
    ' "${RESULTS}.sorted.annot"
    loginfo "Locus tags done."
else
    loginfo "Clearing locus tags..."
    # In an extended regular expression "/" is an ordinary character and
    # needs no backslash.
    grep -E -v '^FT +/locus_tag=""' \
        "${RESULTS}.sorted.annot"
    loginfo "Clearing of tags done."
fi
|
||||
|
||||
loginfo "Closing annotations table..."
|
||||
echo "XX"
|
||||
|
@ -206,9 +206,9 @@ function formatfasta {
|
||||
# Reverse complement a DNA string
|
||||
# - $1 : The DNA string to reverse complement
|
||||
function reversecomp {
|
||||
echo $1 \
|
||||
echo $* \
|
||||
| tr 'Aa' '@!' | tr 'Tt' 'Aa' | tr '@!' 'Tt' \
|
||||
| tr 'Cc' '@!' | tr 'Gg' 'Cc' | tr '@!' 'Gc' \
|
||||
| tr 'Cc' '@!' | tr 'Gg' 'Cc' | tr '@!' 'Gg' \
|
||||
| tr 'Mm' '@!' | tr 'Kk' 'Mm' | tr '@!' 'Kk' \
|
||||
| tr 'Rr' '@!' | tr 'Yy' 'Rr' | tr '@!' 'Yy' \
|
||||
| tr 'Ww' '@!' | tr 'Ss' 'Ww' | tr '@!' 'Ss' \
|
||||
|
Reference in New Issue
Block a user