Adds detection of RPS12 and management of locus tags

Former-commit-id: b9b17708eaaa27580f1e99bd3c375d4b6aba4d79
Former-commit-id: 369361ffa58e65b19ab1005bdf7960924f24ca08
This commit is contained in:
2022-02-14 14:21:50 +01:00
parent 9648bbb874
commit 59fcad1c42
5 changed files with 132 additions and 4 deletions

106
detectors/cds/tools/go_rps12db.sh Executable file
View File

@ -0,0 +1,106 @@
#!/bin/bash
#
# BUILD REFERENCE : THE RPS12 LIBRARY
#
#========================================================================================

# -- CAUTION -- This works only as long as the script
#               is not called through a symlink
#
# THIS_DIR is the directory holding this script; the two helper libraries
# are located relative to it. The inner expansion is quoted so that an
# installation path containing spaces is passed to dirname as one word.
THIS_DIR="$(dirname "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
source "${THIS_DIR}/lib/clusterize_prot.sh"
# Extract the RPS12 protein sequences from GenBank flat-file text.
#
# Input  (stdin) : GenBank flat-file text, one or several entries.
# Output (stdout): one FASTA record per CDS feature whose text contains
#                  "rps12" (double quotes included, case-insensitive):
#                      >LOCUS_rps12_N location=LOCATION;
#                      protein sequence folded at 60 residues per line
#                  N restarts at 1 for every LOCUS.
# Globals        : AwkCmd (read) - awk command to run. It is not defined in
#                  this file (presumably set by bash_init.sh) and is left
#                  unquoted as in the original call sites.
#
# NOTE(review): the joined feature record is split on "/", so a "/" inside
# a qualifier value would be taken for a qualifier boundary.
function extract_rps12() {
  # Stage 1: join all the lines of each CDS feature into a single record
  #          "LOCUS/location/qualifier/qualifier..." followed by a
  #          separator line.
  # Stage 2: keep only the records mentioning "rps12".
  # Stage 3: turn every kept record into a FASTA entry.
  $AwkCmd '
    # Emit the CDS currently being accumulated, if any, then leave CDS mode.
    function flush_cds() {
      if (CDS) {
        print LOCUS "/" feature
        print "#################"
      }
      CDS = 0
    }

    # Any line starting in the first column (LOCUS, ORIGIN, // ...) closes
    # the feature in progress. This rule must stay before the LOCUS rule so
    # that a pending CDS is reported under the entry it belongs to.
    /^[^ ]/  { flush_cds() }

    /^LOCUS/ { LOCUS = $2 }

    # A new feature key closes the previous feature.
    /^ [^ ]/ { flush_cds() }

    # Start of a CDS feature: drop the feature key, keep the location.
    /^ CDS / {
      CDS = 1
      $1 = ""
      feature = ""
    }

    # Accumulate the trimmed lines of the current CDS feature.
    (CDS) {
      sub(/^ */, "", $0)
      sub(/ *$/, "", $0)
      feature = feature $0
    }

    # A CDS still open at the end of the input is emitted as well.
    END { flush_cds() }
  ' \
    | grep -E -i '"rps12"' \
    | $AwkCmd -F"/" '
      # Print seq folded at 60 characters per line
      # (width and pos are local variables).
      function printfasta(seq,    width, pos) {
        width = length(seq)
        for (pos = 1; pos <= width; pos += 60)
          print substr(seq, pos, 60)
      }

      # First record of a new LOCUS: restart the per-entry counter.
      ($1 != current) {
        current = $1
        n = 1
      }

      {
        $1 = $1 "_rps12_" n
        n++

        # Index the qualifiers (fields 3..NF) by their name.
        delete keys
        for (i = 3; i <= NF; i++) {
          split($i, key, "=")
          keys[key[1]] = key[2]
        }

        prot = keys["translation"]
        gsub(/"/, "", prot)

        print ">" $1, "location=" $2 ";"
        printfasta(prot)
      }
    '
}
#
# Main driver: select the plant GenBank entries, extract their RPS12
# proteins, install the result in ${CDS_DATA_DIR}/chlorodb, clusterize it
# and format it as a Blast protein database.
#
# The command-line arguments are forwarded untouched to
# selectViridiplantae.sh.
# pushTmpDir, popTmpDir, openLogFile, loginfo, timeoutcmd and clusterize
# are not defined in this file; presumably they come from the two sourced
# libraries (bash_init.sh and lib/clusterize_prot.sh).
#

pushTmpDir ORG.buildRPS12DB

RPS12FILE=RPS12_prot.fst

openLogFile "${CDS_DATA_DIR}/chlorodb/RPS12_DB.log"

loginfo "Selecting Viridiplantae genbank entries..."
# "$@" keeps every argument as a single word, even when it contains
# spaces or glob characters.
VIRIDIPLANTAE=$("${PROG_DIR}/../../normalize/tools/selectViridiplantae.sh" "$@")
loginfo " --> $(echo "${VIRIDIPLANTAE}" | wc -w) entries selected"
loginfo "Done"

loginfo "Extracting the RPS12 protein sequences from the plants entries..."
# VIRIDIPLANTAE is deliberately left unquoted here: it is used as a
# whitespace-separated list of compressed GenBank files.
for gbk in ${VIRIDIPLANTAE} ; do
  gzcat "$gbk" \
    | extract_rps12
done > "${RPS12FILE}"
loginfo "Done"

loginfo "Installing the RPS12 protein sequence database..."
cp "${RPS12FILE}" "${CDS_DATA_DIR}/chlorodb/RPS12_DB.fst"
loginfo "Done"

popTmpDir

# Stop here if the database directory cannot be entered: the commands below
# use relative file names and must not run from another directory.
pushd "${CDS_DATA_DIR}/chlorodb" || {
  loginfo "Cannot enter ${CDS_DATA_DIR}/chlorodb"
  exit 1
}

loginfo "Clusterizing the RPS12 protein sequence database..."
rm -rf -- RPS12_DB.clean.fst
clusterize RPS12_DB
loginfo "Done"

#
# Format the clusterized sequences as a Blast protein database
#
loginfo " formatting Blast RPS12 DB"
timeoutcmd 300 makeblastdb -dbtype prot -in RPS12_DB.clean.fst > /dev/null 2>&1
loginfo "Done"

popd

loginfo "Done"

View File

@ -49,6 +49,7 @@ pushTmpDir ORG.rrna
print "FT rRNA " loc; \
print "FT /gene=\""rrna" rRNA\""
print "FT /product=\""rrna" ribosomal RNA\""
print "FT /locus_tag=\"\"";
full=0
}'

View File

@ -167,7 +167,9 @@ function emblTRNA(geneid,trna,loc,anti,intron,notes,seq) {
print "FT tRNA " loc;
print "FT /gene=\""trna"\"";
print "FT /anticodon=\""anti"\"";
# print "FT /note=\"*anticodon: "anti"\"";
print "FT /product=\""product"("anti")\"";
print "FT /locus_tag=\"\"";
# print "FT /inference=\"Aragorn-1.2.38\"";
if (notes!="-")
print "FT /note=\""notes"\"";

View File

@ -497,10 +497,29 @@ pushTmpDir ORG.organnot
match($0,"^[0-9]* ");\
line=substr($0,RLENGTH+1);\
gsub("@","\n",line); \
print line}'
print line}' > "${RESULTS}.sorted.annot"
loginfo "Done."
# Resolve the empty /locus_tag="" placeholder lines found in
# ${RESULTS}.sorted.annot and write the resulting table to stdout:
#   - idprefix different from "no": the placeholders are numbered in file
#     order as <idprefix>_1, <idprefix>_2, ...
#   - idprefix equal to "no": the placeholder lines are removed.
# idprefix, RESULTS, AwkCmd and loginfo are defined outside of this excerpt.
if [[ "$idprefix" != "no" ]] ; then
  loginfo "Adding locus tags..."
  # The file is given on stdin rather than as an operand so that awk can
  # never mistake a file name containing "=" for a variable assignment.
  $AwkCmd -v idprefix="$idprefix" '
    BEGIN { n = 1 }

    /^FT +\/locus_tag=""/ {
      sub(/locus_tag=""/, "locus_tag=\"" idprefix "_" n "\"", $0)
      n++
    }

    { print $0 }
  ' < "${RESULTS}.sorted.annot"
  loginfo "Locus tags done."
else
  loginfo "Clearing locus tags..."
  # grep -E replaces the obsolescent egrep; "/" needs no escaping in an ERE.
  grep -E -v '^FT +/locus_tag=""' \
    "${RESULTS}.sorted.annot"
  loginfo "Clearing of tags done."
fi
loginfo "Closing annotations table..."
echo "XX"

View File

@ -206,9 +206,9 @@ function formatfasta {
# Reverse complement a DNA string
# - $1 : The DNA string to reverse complement
function reversecomp {
echo $1 \
echo $* \
| tr 'Aa' '@!' | tr 'Tt' 'Aa' | tr '@!' 'Tt' \
| tr 'Cc' '@!' | tr 'Gg' 'Cc' | tr '@!' 'Gc' \
| tr 'Cc' '@!' | tr 'Gg' 'Cc' | tr '@!' 'Gg' \
| tr 'Mm' '@!' | tr 'Kk' 'Mm' | tr '@!' 'Kk' \
| tr 'Rr' '@!' | tr 'Yy' 'Rr' | tr '@!' 'Yy' \
| tr 'Ww' '@!' | tr 'Ss' 'Ww' | tr '@!' 'Ss' \