Add annotation of nuclear rDNA cistron

Former-commit-id: ee54019ddddbea4d17956622968f6ce673b609e1
Former-commit-id: 5e5381cf59409ca3dc01098b0e3f330efe0a6a32
This commit is contained in:
2016-05-02 10:56:40 +02:00
parent 7d04371387
commit 8113b80d47
8 changed files with 72157 additions and 45 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,70 @@
#!/bin/bash
#
# Normalize the orientation of a nuclear rDNA cistron
#
#========================================================================================
#
# The rRNA profiles stored in ${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm are
# searched in the sequence with hmmsearch. A profile whose name ends with
# "_RC" votes for the "Reverse" orientation, any other profile votes for the
# "Forward" orientation. Only hits reaching both ends of the profile (at most
# 5 positions missing on each side) are allowed to vote.
#
# NOTE(review): the calling script seems to install this file as
#               detectors/normalizerdna/bin/go_normalizerdna.sh - confirm.
#
#    go_normalizerdna.sh <FASTAFILE>
#
#        - <FASTAFILE> : The fasta file containing the cistron to normalize
#
# The sequence is printed to the standard output: unchanged when every vote
# is "Forward", processed by `fastarevcomp -f` when every vote is "Reverse".
# When there is no vote, or when both orientations are voted for, an error is
# logged and the script exits with status 1.
#
#========================================================================================

# -- CAUTION -- Works as long as the script
#               is not called through a symlink
THIS_DIR="$(dirname "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

pushTmpDir ORG.its

loginfo "Normalizing nuclear rDNA cistron..."

RRNADB="${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm"

# A relative path is interpreted from ${CALL_DIR}
# (presumably the directory the user launched the command from - TODO confirm)
if [[ ! "$1" =~ ^/ ]]; then
    QUERY="${CALL_DIR}/$1"
else
    QUERY="$1"
fi

# First awk  : for each hit line of the hmmsearch report, prints the profile
#              name, the profile length (digits of the 3rd word of the
#              "Query:" line) and the columns 7, 8, 10 and 11 of the hit line
#              (presumably profile from/to and sequence from/to - confirm
#              against the hmmsearch report format).
# Second awk : keeps the hits spanning the whole profile and prints the
#              orientation they vote for.
# sort -u    : reduces the votes to the set of distinct orientations.
strand=( $(hmmsearch --max "${RRNADB}" "${QUERY}" | \
    $AwkCmd '/Query: / {
                 profil=$2
                 match($3,"[0-9][0-9]*")
                 lprof=substr($3,RSTART,RLENGTH)
             }
             / [0-9][0-9]* ! / {
                 print profil,lprof,$7,$8,$10,$11
             }' | \
    $AwkCmd '($3 <= 5) && (($2-$4) <= 5) {
                 print (($1 ~ /_RC$/) ? "Reverse" : "Forward")
             }' | sort -u) )

# Exactly one distinct orientation is required to take a decision
if [[ "${#strand[@]}" == 1 ]] ; then
    if [[ "${strand[0]}" == "Forward" ]] ; then
        cat "${QUERY}"
    else
        fastarevcomp -f "${QUERY}"
    fi
else
    logerror "Cannot determine the Cistron orientation"
    exit 1
fi

loginfo "Done."

popTmpDir

exit 0

View File

@ -0,0 +1,88 @@
#!/bin/bash
#
# Annotate the rRNA genes and the Internal Transcribed Spacers (ITS) of a
# nuclear rDNA cluster
#
#========================================================================================
#
# ITS1, 5.8S and ITS2 are located with ITSx. The rRNA genes are located with
# hmmsearch using the profiles of ${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm.
#
# NOTE(review): the calling script seems to install this file as
#               detectors/nucrrna/bin/go_nucrrna.sh - confirm.
#
#    go_nucrrna.sh <FASTAFILE>
#
#        - <FASTAFILE> : The fasta file containing the cluster to annotate
#
# Results (feature table lines starting with "FT") are printed to the
# standard output
#
#========================================================================================

# -- CAUTION -- Works as long as the script
#               is not called through a symlink
THIS_DIR="$(dirname "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

pushTmpDir ORG.its

loginfo "Annotating ITS and TSU..."

RRNADB="${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm"

# A relative path is interpreted from ${CALL_DIR}
# (presumably the directory the user launched the command from - TODO confirm)
if [[ ! "$1" =~ ^/ ]]; then
    QUERY="${CALL_DIR}/$1"
else
    QUERY="$1"
fi

ITSx -p "${ITS_DATA_DIR}/ITSx_db/HMMs" -i "${QUERY}" -o "output.itsx"

# Each array receives the "begin end" pair of one region. `sed -n ... p`
# prints only the lines where the region carries coordinates, so a region
# that ITSx did not locate leaves its array empty instead of filling it with
# the words of the unmodified line.
ITS1=( $(sed -nE 's/.*ITS1: *([0-9]+)-([0-9]+).*/\1 \2/p'  "output.itsx.positions.txt" ) )
ITS2=( $(sed -nE 's/.*ITS2: *([0-9]+)-([0-9]+).*/\1 \2/p'  "output.itsx.positions.txt" ) )
TSU=(  $(sed -nE 's/.*5\.8S: *([0-9]+)-([0-9]+).*/\1 \2/p' "output.itsx.positions.txt" ) )

# The operator must be surrounded by spaces: written as a single word the
# expression is a non-empty string test, which always succeeds.
if [[ ${#ITS1[@]} -eq 2 ]] ; then
    echo "FT misc_RNA ${ITS1[0]}..${ITS1[1]}"
    echo 'FT /gene="ITS1"'
    echo 'FT /note="internal transcribed spacer 1, ITS1"'
fi

if [[ ${#TSU[@]} -eq 2 ]] ; then
    echo "FT rRNA ${TSU[0]}..${TSU[1]}"
    echo 'FT /gene="5.8S rRNA"'
    echo 'FT /product="5.8S ribosomal nuclear RNA"'
fi

if [[ ${#ITS2[@]} -eq 2 ]] ; then
    echo "FT misc_RNA ${ITS2[0]}..${ITS2[1]}"
    echo 'FT /gene="ITS2"'
    echo 'FT /note="internal transcribed spacer 2, ITS2"'
fi

# First awk  : for each hit line of the hmmsearch report, prints the profile
#              name, the profile length (digits of the 3rd word of the
#              "Query:" line) and the columns 7, 8, 10 and 11 of the hit line
#              (presumably profile from/to and sequence from/to - confirm
#              against the hmmsearch report format).
# Second awk : for the hits missing at most 5 positions at each end of the
#              profile, stretches the sequence coordinates by the number of
#              missing positions, then prints one rRNA feature. Profiles
#              named *_RC are reported as complement(begin..end). The gene
#              name is the part of the profile name matched by "_..*S",
#              without its leading underscore.
hmmsearch --max "${RRNADB}" "${QUERY}" | \
    $AwkCmd '/Query: / {
                 profil=$2
                 match($3,"[0-9][0-9]*")
                 lprof=substr($3,RSTART,RLENGTH)
             }
             / [0-9][0-9]* ! / {
                 print profil,lprof,$7,$8,$10,$11
             }' | \
    $AwkCmd '($3 <=5) && (($2-$4) <=5) {
                 full=1
                 $5=$5-$3+1
                 $6=$6+($2-$4)
             }
             {
                 loc=$5".."$6
             }
             ($1 ~ /_RC$/) {
                 loc="complement("loc")"
             }
             (full==1) {
                 match($1,"_..*S")
                 rrna=substr($1,RSTART+1,RLENGTH-1)
                 print "FT rRNA " loc
                 print "FT /gene=\"rrn"rrna"\""
                 print "FT /product=\""rrna" ribosomal RNA\""
                 full=0
             }'

loginfo "Done."

popTmpDir

exit 0

View File

@ -23,9 +23,41 @@ taxid="no"
normalization="yes"
irdetection="yes"
organism="no"
types="chloro"
# Print the command line help on the standard output and terminate the script.
#
#   $1 : name of the program displayed in the synopsis
#   $2 : exit status of the script (0 when omitted)
#
# The synopsis lists only options declared in the getopt specification of
# this script; no -n|--no-normalization option is declared there, so it is
# not advertised.
function usage {
    echo "Usage:" ;
    echo " $1 "'[-t|--ncbi-taxid ###] \'
    echo ' [-i|--no-ir-detection] [-h|--help] \ '
    echo ' [-o|--organism <organism_name>] \ '
    echo ' [-c|--chloroplast|-r|--nuclear-rdna|-m|--mitochondrion] <FASTAFILE>'
    echo
    echo "Options:"
    echo ' -t ### | --ncbi-taxid ###'
    echo ' ### represents the ncbi taxid associated to the sequence'
    echo
    echo ' -i | --no-ir-detection'
    echo ' Does not look for inverted repeats in the plastid genome'
    echo
    echo ' -o | --organism <organism_name>'
    echo ' Allows for specifiying the organism name in the embl generated file'
    echo ' Spaces have to be substituted by underscore ex : Abies_alba'
    echo
    echo ' -c | --chloroplast'
    echo ' Selects for the annotation of a chloroplast genome'
    echo ' This is the default mode'
    echo
    echo ' -r | --nuclear-rdna'
    echo ' Selects for the annotation of the rDNA nuclear cistron'
    echo
    echo ' -m | --mitochondrion'
    echo ' Selects for the annotation of an animal mitochondrion genome'
    # Quoted, with a default, so a missing status cannot produce a bare `exit`
    exit "${2:-0}"
}
# options may be followed by one colon to indicate they have a required argument
if ! options=$(getopt -o t:o:ih -l ncbi-taxid:,organism,no-ir-detection,help -- "$@")
if ! options=$(getopt -o t:o:icrmh -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,help -- "$@")
then
# something went wrong, getopt will put out an error message for us
exit 1
@ -36,20 +68,13 @@ eval set -- "$options"
while [ $# -gt 0 ]
do
case $1 in
-t|--ncbi-taxid) taxid="$2" ; shift;;
-t|--ncbi-taxid) taxid="$2" ; shift ;;
-i|--no-ir-detection) irdetection="no" ;;
-o|--organism) organism="$2" ; shift;;
-h|--help) echo "Usage:" ;
echo " $0 "'[-t|--ncbi-taxid ###] [-n|--no-normalization] \'
echo " [-i|--no-ir-detection] [-h|--help] <FASTAFILE>"
echo
echo "Options:"
echo ' -t ### | --ncbi-taxid ###'
echo ' ### represents the ncbi taxid associated to the sequence'
echo
echo ' -i | --no-ir-detection'
echo ' Does not look for inverted repeats in the plastid genome'
exit 0;;
-o|--organism) organism="$2" ; shift ;;
-c|--chloroplast) types="chloro" ;;
-r|--nuclear-rdna) types="nucrdna" ;;
-m|--mitochondrion) types="mito" ;;
-h|--help) usage $0 0;;
(--) shift; break;;
(-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;;
(*) break;;
@ -57,6 +82,7 @@ do
shift
done
# Trace of the selected annotation mode. The option parser stores it in
# `types`; `type` is never set, so the previous form only printed an empty
# line. NOTE(review): sent to stderr so that the trace cannot end up in the
# annotation written on stdout - confirm this trace is still wanted.
echo "${types}" 1>&2
#############################
pushTmpDir ORG.organnot
@ -73,43 +99,78 @@ pushTmpDir ORG.organnot
rm -f ${LOG}
openLogFile ${LOG}
if [ "$irdetection"=="yes" ]; then
loginfo "Normalizing the structure of the Chloroplast sequence..."
loginfo " LSC + IRB + SSC + IRA"
${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta"
loginfo "Done."
case "$types" in
chloro)
loginfo "Annotating a plant chloroplast genome..."
if [ "$irdetection"=="yes" ]; then
loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
loginfo "Done."
loginfo "Normalizing the structure of the Chloroplast sequence..."
loginfo " LSC + IRB + SSC + IRA"
${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta"
loginfo "Done."
loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
loginfo "Done."
fi
loginfo "Annotating the tRNA..."
${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
loginfo "Annotating the rRNA genes..."
${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
loginfo "Annotating the CDS..."
tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
topology="circular"
defline="plastid, complete genome"
;;
nucrdna)
loginfo "Annotating a plant rDNA cistron..."
loginfo "Normalizing the structure of the cistron sequence..."
${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh ${QUERY} > "${RESULTS}.norm.fasta"
loginfo "Done."
loginfo "Annotating the rRNA genes..."
${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
loginfo "Done."
topology="linear"
defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
;;
mito)
loginfo "Annotating an animal mitochondrial genome..."
logerror "Not yet implemented"
topology="circular"
defline="mitochondrion, complete genome"
exit 1
;;
*)
    # Unknown annotation type: display the help and stop with an error
    # status (`usage` exits with its second argument). Calling the function
    # is required here - `echo usage ...` only printed the words and let the
    # script continue without topology nor defline.
    usage "$0" 1;;
esac
if [[ "${organism}" == "no" ]]; then
organism="{organism}"
else
organism="$(echo ${organism} | tr '_' ' ')"
fi
loginfo "Annotating the tRNA..."
${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
loginfo "Annotating the rRNA genes..."
${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
loginfo "Annotating the CDS..."
tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
loginfo "Done."
loginfo "Printing minimal header..."
echo "ID XXX; XXX; circular; genomic DNA; XXX; XXX; $(seqlength ${RESULTS}.norm.fasta) BP."
echo "ID XXX; XXX; ${topology}; genomic DNA; XXX; XXX; $(seqlength ${RESULTS}.norm.fasta) BP."
echo "XX"
echo "AC XXX;"
echo "DE ${organism} ${defline}."
echo "XX"
if [[ "${organism}" == "no" ]]; then
echo "DE {organism} plastid, complete genome."
else
echo "DE $(echo ${organism} | tr '_' ' ') plastid, complete genome."
fi
echo "XX"
loginfo "Done."
loginfo "Printing annotations header..."

View File

@ -182,6 +182,12 @@ CDS_DATA_DIR="${DATA_DIR}/cds" # Directory containing data related to
RRNA_DATA_DIR="${DATA_DIR}/rrna" # Directory containing data related to
# rRNAs detection
NUCRRNA_DATA_DIR="${DATA_DIR}/nucrrna" # Directory containing data related to
# nuclear rRNAs detection (HMM profiles)
ITS_DATA_DIR="${DATA_DIR}/its" # Directory containing data related to
# ITS detection (ITSx database)
#
#

View File

@ -25,31 +25,44 @@ PRTPATH = $(abspath $(PRTDIR))
DATADIR = $(CFGDIR)../data
DATAITS = $(DATADIR)/its
# hmmpress executable used to index the HMM profiles
HMMPRESS= $(BINDIR)/hmmpress
# Directory of the HMM profiles inside the expanded package
HMMDIR = $(PKGDIR)/ITSx_db/HMMs
# HMM profiles present in HMMDIR. The assignment is recursive (`=`) so the
# wildcard is evaluated again at each expansion: the profiles exist only
# once the package archive has been expanded.
HMMS = $(wildcard $(HMMDIR)/*.hmm)
# Names of the .h3p, .h3i and .h3m files associated to each profile
# (presumably the binary files written by hmmpress - confirm)
HMMP = $(patsubst %.hmm,%.hmm.h3p,$(HMMS))
HMMI = $(patsubst %.hmm,%.hmm.h3i,$(HMMS))
HMMM = $(patsubst %.hmm,%.hmm.h3m,$(HMMS))
#
# Rules
#
.PHONY: all clean test portclean pkg pkg.expand pkg.install
all:: pkg
# Index one HMM profile with hmmpress. An empty profile file is skipped and
# the rule still succeeds.
%.hmm.h3i: %.hmm
	echo $(HMMPRESS)
	if test -s $< ; then $(HMMPRESS) -f $< ; fi
all:: pkg.install
pkg.expand::
test -d $(PKGDIR) || mkdir $(PKGDIR)
$(TAR) zxf $(PKGTAR) -C $(PKGDIR) --strip-components 1
pkg.install:: pkg.expand
pkg.install:: pkg
@mkdir -p $(BINDIR)
@cp $(PKGDIR)/ITSx $(BINDIR)
@mkdir -p $(DATAITS)
@cp -r $(PKGDIR)/ITSx_db $(DATAITS)
@echo "+++++++++++ package $(PKG) done"
pkg:: pkg.install
pkg:: pkg.expand clean $(HMMI)
test::
echo No test available
# Remove the index files generated next to the HMM profiles. hmmpress writes
# four files per profile (.h3f, .h3i, .h3m, .h3p); the .h3f ones have no
# dedicated variable and are derived here from the profile list.
clean::
	rm -f $(HMMP) $(HMMI) $(HMMM) $(patsubst %.hmm,%.hmm.h3f,$(HMMS))
	echo Done
portclean::