diff --git a/detectors/cds/tools/go_rps12db.sh b/detectors/cds/tools/go_rps12db.sh new file mode 100755 index 0000000..213b07d --- /dev/null +++ b/detectors/cds/tools/go_rps12db.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# +# BUILD REFERENCE : THE RPS12 LIBRARY +# +#======================================================================================== + +# -- CAUTION -- Works as long as the script +# is not called through a symlink + +THIS_DIR="$(dirname ${BASH_SOURCE[0]})" +source "${THIS_DIR}/../../../scripts/bash_init.sh" +source "${THIS_DIR}/lib/clusterize_prot.sh" + +function extract_rps12() { +$AwkCmd ' \ + /^LOCUS/ {LOCUS=$2;} \ + /^ [^ ]/ { if (CDS) { \ + print LOCUS "/" feature; \ + print "#################" \ + } \ + CDS=0; \ + } \ + /^ CDS / {CDS=1; \ + $1=""; \ + feature=""} \ + (CDS) { sub(/^ */,"",$0); \ + sub(/ *$/,"",$0); \ + feature=feature $0} \ + ' \ + | egrep -i '"rps12"' \ + | $AwkCmd -F"/" ' \ + function printfasta(seq) { \ + seqlen=length(seq); \ + for (i=1; i <= seqlen; i+=60) \ + print substr(seq,i,60); \ + } \ + \ + ($1 != current) {current=$1; \ + n=1 \ + } \ + {$1=$1 "_rps12_" n; \ + n++; \ + delete keys; \ + for (i=3; i<=NF; i++) { \ + split($i,key,"="); \ + keys[key[1]]=key[2] \ + } \ + prot = keys["translation"]; \ + gsub(/"/,"",prot); \ + print ">" $1,"location=" $2 ";"; \ + printfasta(prot) \ + } \ + ' +} + + +pushTmpDir ORG.buildRPS12DB + + RPS12FILE=RPS12_prot.fst + + openLogFile "${CDS_DATA_DIR}/chlorodb/RPS12_DB.log" + + loginfo "Selecting Viridiplantae genbank entries..." + VIRIDIPLANTAE=$(${PROG_DIR}/../../normalize/tools/selectViridiplantae.sh $*) + loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected" + loginfo "Done" + + loginfo "Extracting the RPS12 protein sequences from the plants entries..." + ( for gbk in ${VIRIDIPLANTAE} ; do + gzcat $gbk | \ + extract_rps12 + done ) > ${RPS12FILE} + loginfo "Done" + + loginfo "Installing the RPS12 protein sequence database..." 
+ + cp ${RPS12FILE} "${CDS_DATA_DIR}/chlorodb/RPS12_DB.fst" + + loginfo "Done" + +popTmpDir + +pushd "${CDS_DATA_DIR}/chlorodb" + + loginfo "Clusterizing the RPS12 protein sequence database..." + rm -rf RPS12_DB.clean.fst + clusterize RPS12_DB + loginfo "Done" + + loginfo " formatting Blast RPS12 DB" + timeoutcmd 300 makeblastdb -dbtype prot -in RPS12_DB.clean.fst >& /dev/null + loginfo "Done" + + + +popd + +# +# format blast protein database +# + + + + +loginfo "Done" + diff --git a/detectors/rrna/bin/go_rrna.sh b/detectors/rrna/bin/go_rrna.sh index 171b350..2946468 100755 --- a/detectors/rrna/bin/go_rrna.sh +++ b/detectors/rrna/bin/go_rrna.sh @@ -49,6 +49,7 @@ pushTmpDir ORG.rrna print "FT rRNA " loc; \ print "FT /gene=\""rrna" rRNA\"" print "FT /product=\""rrna" ribosomal RNA\"" + print "FT /locus_tag=\"\""; full=0 }' diff --git a/detectors/trna/lib/aragorn_wrapper.awk b/detectors/trna/lib/aragorn_wrapper.awk index cdda0ca..dae5e2c 100755 --- a/detectors/trna/lib/aragorn_wrapper.awk +++ b/detectors/trna/lib/aragorn_wrapper.awk @@ -167,7 +167,9 @@ function emblTRNA(geneid,trna,loc,anti,intron,notes,seq) { print "FT tRNA " loc; print "FT /gene=\""trna"\""; print "FT /anticodon=\""anti"\""; +# print "FT /note=\"*anticodon: "anti"\""; print "FT /product=\""product"("anti")\""; + print "FT /locus_tag=\"\""; # print "FT /inference=\"Aragorn-1.2.38\""; if (notes!="-") print "FT /note=\""notes"\""; diff --git a/org-annotate.sh b/org-annotate.sh index f60f098..8b90f9e 100755 --- a/org-annotate.sh +++ b/org-annotate.sh @@ -497,10 +497,29 @@ pushTmpDir ORG.organnot match($0,"^[0-9]* ");\ line=substr($0,RLENGTH+1);\ gsub("@","\n",line); \ - print line}' + print line}' > "${RESULTS}.sorted.annot" loginfo "Done." - + if [[ "$idprefix" != "no" ]] ; then + loginfo "Adding locus tags..." 
+ cat "${RESULTS}.sorted.annot" \ + | $AwkCmd -v idprefix="$idprefix" ' + BEGIN {n=1} + /^FT +\/locus_tag=""/ { + sub(/locus_tag=""/,"locus_tag=\""idprefix"_"n"\"",$0); + n++; + } + { + print $0 + } + ' + loginfo "Locus tags done." + else + loginfo "Clearing locus tags done." + egrep -v '^FT +\/locus_tag=""' \ + "${RESULTS}.sorted.annot" + loginfo "Clearing of tags done." + fi loginfo "Closing annotations table..." echo "XX" diff --git a/scripts/bash_init.sh b/scripts/bash_init.sh index 0b5941e..389d54e 100644 --- a/scripts/bash_init.sh +++ b/scripts/bash_init.sh @@ -206,9 +206,9 @@ function formatfasta { # Reverse complement a DNA string # - $1 : The DNA string to reverse complement function reversecomp { - echo $1 \ + echo $* \ | tr 'Aa' '@!' | tr 'Tt' 'Aa' | tr '@!' 'Tt' \ - | tr 'Cc' '@!' | tr 'Gg' 'Cc' | tr '@!' 'Gc' \ + | tr 'Cc' '@!' | tr 'Gg' 'Cc' | tr '@!' 'Gg' \ | tr 'Mm' '@!' | tr 'Kk' 'Mm' | tr '@!' 'Kk' \ | tr 'Rr' '@!' | tr 'Yy' 'Rr' | tr '@!' 'Yy' \ | tr 'Ww' '@!' | tr 'Ss' 'Ww' | tr '@!' 'Ss' \