Add annotation of nuclear rDNA cistron
Former-commit-id: ee54019ddddbea4d17956622968f6ce673b609e1 Former-commit-id: 5e5381cf59409ca3dc01098b0e3f330efe0a6a32
This commit is contained in:
29834
data/nucrrna/plants/nuc_RRNA.hmm
Normal file
29834
data/nucrrna/plants/nuc_RRNA.hmm
Normal file
File diff suppressed because it is too large
Load Diff
1440
data/nucrrna/plants/nuc_RRNA_18S.fst
Normal file
1440
data/nucrrna/plants/nuc_RRNA_18S.fst
Normal file
File diff suppressed because it is too large
Load Diff
40600
data/nucrrna/plants/nuc_RRNA_28S.fst
Normal file
40600
data/nucrrna/plants/nuc_RRNA_28S.fst
Normal file
File diff suppressed because it is too large
Load Diff
70
detectors/normalizerdna/bin/go_normalizerdna.sh
Executable file
70
detectors/normalizerdna/bin/go_normalizerdna.sh
Executable file
@ -0,0 +1,70 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Annotate the Intergenic Spacer (ITS) of nuclear rDNA cluster
|
||||||
|
#
|
||||||
|
#========================================================================================
|
||||||
|
#
|
||||||
|
# This script is based on ITSx
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# go_its.sh <FASTAFILE>
|
||||||
|
#
|
||||||
|
# - <FASTAFILE> : The fasta file containing the cluster to annotate
|
||||||
|
#
|
||||||
|
# Results are printed to the standart output
|
||||||
|
#
|
||||||
|
#========================================================================================
|
||||||
|
|
||||||
|
# -- CAUTION -- Works as long than the script
|
||||||
|
# is not called through a symlink
|
||||||
|
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||||
|
source "${THIS_DIR}/../../../scripts/bash_init.sh"
|
||||||
|
|
||||||
|
pushTmpDir ORG.its
|
||||||
|
|
||||||
|
loginfo "Normalizing nuclear rDNA cistron..."
|
||||||
|
|
||||||
|
RRNADB="${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm"
|
||||||
|
|
||||||
|
if [[ ! "$1" =~ ^/ ]]; then
|
||||||
|
QUERY="${CALL_DIR}/$1"
|
||||||
|
else
|
||||||
|
QUERY="$1"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
strand=( $(hmmsearch --max ${RRNADB} ${QUERY} | \
|
||||||
|
$AwkCmd '/Query: / { \
|
||||||
|
profil=$2; \
|
||||||
|
match($3,"[0-9][0-9]*");\
|
||||||
|
lprof=substr($3,RSTART,RLENGTH)} \
|
||||||
|
/ [0-9][0-9]* ! / { \
|
||||||
|
print profil,lprof,$7,$8,$10,$11}' | \
|
||||||
|
$AwkCmd '($3 <=5) && (($2-$4) <=5) { \
|
||||||
|
full=1;$5=$5-$3+1;$6=$6+($2-$4)} \
|
||||||
|
{loc="Forward"} \
|
||||||
|
($1 ~ /_RC$/) { \
|
||||||
|
loc="Reverse"} \
|
||||||
|
(full==1) {match($1,"_..*S");\
|
||||||
|
rrna=substr($1,RSTART+1,RLENGTH-1);\
|
||||||
|
print loc;\
|
||||||
|
full=0
|
||||||
|
}' | sort | uniq) )
|
||||||
|
|
||||||
|
if [[ "${#strand[@]}" == 1 ]] ; then
|
||||||
|
if [[ "${strand[0]}" == "Forward" ]] ; then
|
||||||
|
cat ${QUERY}
|
||||||
|
else
|
||||||
|
fastarevcomp -f ${QUERY}
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
logerror "Cannot determine the Cistron orientation"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
loginfo "Done."
|
||||||
|
|
||||||
|
popTmpDir
|
||||||
|
|
||||||
|
exit 0
|
||||||
|
|
88
detectors/nucrrna/bin/go_nucrrna.sh
Executable file
88
detectors/nucrrna/bin/go_nucrrna.sh
Executable file
@ -0,0 +1,88 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Annotate the Intergenic Spacer (ITS) of nuclear rDNA cluster
|
||||||
|
#
|
||||||
|
#========================================================================================
|
||||||
|
#
|
||||||
|
# This script is based on ITSx
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# go_its.sh <FASTAFILE>
|
||||||
|
#
|
||||||
|
# - <FASTAFILE> : The fasta file containing the cluster to annotate
|
||||||
|
#
|
||||||
|
# Results are printed to the standart output
|
||||||
|
#
|
||||||
|
#========================================================================================
|
||||||
|
|
||||||
|
# -- CAUTION -- Works as long than the script
|
||||||
|
# is not called through a symlink
|
||||||
|
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||||
|
source "${THIS_DIR}/../../../scripts/bash_init.sh"
|
||||||
|
|
||||||
|
pushTmpDir ORG.its
|
||||||
|
|
||||||
|
loginfo "Annotating ITS and TSU..."
|
||||||
|
|
||||||
|
RRNADB="${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm"
|
||||||
|
|
||||||
|
if [[ ! "$1" =~ ^/ ]]; then
|
||||||
|
QUERY="${CALL_DIR}/$1"
|
||||||
|
else
|
||||||
|
QUERY="$1"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
ITSx -p "${ITS_DATA_DIR}/ITSx_db/HMMs" -i "${QUERY}" -o "output.itsx"
|
||||||
|
|
||||||
|
ITS1=( $(sed -E 's/.*ITS1: *([0-9]+)-([0-9]+).*/\1 \2/' "output.itsx.positions.txt" ) )
|
||||||
|
ITS2=( $(sed -E 's/.*ITS2: *([0-9]+)-([0-9]+).*/\1 \2/' "output.itsx.positions.txt" ) )
|
||||||
|
TSU=( $(sed -E 's/.*5\.8S: *([0-9]+)-([0-9]+).*/\1 \2/' "output.itsx.positions.txt" ) )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if [[ ${#ITS1[@]}=="2" ]] ; then
|
||||||
|
echo "FT misc_RNA ${ITS1[0]}..${ITS1[1]}"
|
||||||
|
echo 'FT /gene="ITS1"'
|
||||||
|
echo 'FT /note="internal transcribed spacer 1, ITS1"'
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ${#TSU[@]}=="2" ]] ; then
|
||||||
|
echo "FT rRNA ${TSU[0]}..${TSU[1]}"
|
||||||
|
echo 'FT /gene="5.8S rRNA"'
|
||||||
|
echo 'FT /product="5.8S ribosomal nuclear RNA"'
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ${#ITS2[@]}=="2" ]] ; then
|
||||||
|
echo "FT misc_RNA ${ITS2[0]}..${ITS2[1]}"
|
||||||
|
echo 'FT /gene="ITS2"'
|
||||||
|
echo 'FT /note="internal transcribed spacer 2, ITS2"'
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
hmmsearch --max ${RRNADB} ${QUERY} | \
|
||||||
|
$AwkCmd '/Query: / { \
|
||||||
|
profil=$2; \
|
||||||
|
match($3,"[0-9][0-9]*");\
|
||||||
|
lprof=substr($3,RSTART,RLENGTH)} \
|
||||||
|
/ [0-9][0-9]* ! / { \
|
||||||
|
print profil,lprof,$7,$8,$10,$11}' | \
|
||||||
|
$AwkCmd '($3 <=5) && (($2-$4) <=5) { \
|
||||||
|
full=1;$5=$5-$3+1;$6=$6+($2-$4)} \
|
||||||
|
{loc=$5".."$6} \
|
||||||
|
($1 ~ /_RC$/) { \
|
||||||
|
loc="complement("loc")"} \
|
||||||
|
(full==1) {match($1,"_..*S");\
|
||||||
|
rrna=substr($1,RSTART+1,RLENGTH-1);\
|
||||||
|
print "FT rRNA " loc; \
|
||||||
|
print "FT /gene=\"rrn"rrna"\""
|
||||||
|
print "FT /product=\""rrna" ribosomal RNA\"";\
|
||||||
|
full=0
|
||||||
|
}'
|
||||||
|
|
||||||
|
loginfo "Done."
|
||||||
|
|
||||||
|
popTmpDir
|
||||||
|
|
||||||
|
exit 0
|
||||||
|
|
103
org-annotate.sh
103
org-annotate.sh
@ -23,9 +23,41 @@ taxid="no"
|
|||||||
normalization="yes"
|
normalization="yes"
|
||||||
irdetection="yes"
|
irdetection="yes"
|
||||||
organism="no"
|
organism="no"
|
||||||
|
types="chloro"
|
||||||
|
|
||||||
|
function usage {
|
||||||
|
echo "Usage:" ;
|
||||||
|
echo " $1 "'[-t|--ncbi-taxid ###] [-n|--no-normalization] \'
|
||||||
|
echo ' [-i|--no-ir-detection] [-h|--help] \ '
|
||||||
|
echo ' [-o|--organism <organism_name>] \ '
|
||||||
|
echo ' [-c|--chloroplast|-r|--nuclear-rdna|-m|--mitochondrion] <FASTAFILE>'
|
||||||
|
echo
|
||||||
|
echo "Options:"
|
||||||
|
echo ' -t ### | --ncbi-taxid ###'
|
||||||
|
echo ' ### represents the ncbi taxid associated to the sequence'
|
||||||
|
echo
|
||||||
|
echo ' -i | --no-ir-detection'
|
||||||
|
echo ' Does not look for inverted repeats in the plastid genome'
|
||||||
|
echo
|
||||||
|
echo ' -o | --organism <organism_name>'
|
||||||
|
echo ' Allows for specifiying the organism name in the embl generated file'
|
||||||
|
echo ' Spaces have to be substituted by underscore ex : Abies_alba'
|
||||||
|
echo
|
||||||
|
echo ' -c | --chloroplast'
|
||||||
|
echo ' Selects for the annotation of a chloroplast genome'
|
||||||
|
echo ' This is the default mode'
|
||||||
|
echo
|
||||||
|
echo ' -r | --nuclear-rdna'
|
||||||
|
echo ' Selects for the annotation of the rDNA nuclear cistron'
|
||||||
|
echo
|
||||||
|
echo ' -m | --mitochondrion'
|
||||||
|
echo ' Selects for the annotation of an animal mitochondrion genome'
|
||||||
|
exit $2
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# options may be followed by one colon to indicate they have a required argument
|
# options may be followed by one colon to indicate they have a required argument
|
||||||
if ! options=$(getopt -o t:o:ih -l ncbi-taxid:,organism,no-ir-detection,help -- "$@")
|
if ! options=$(getopt -o t:o:icrmh -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,help -- "$@")
|
||||||
then
|
then
|
||||||
# something went wrong, getopt will put out an error message for us
|
# something went wrong, getopt will put out an error message for us
|
||||||
exit 1
|
exit 1
|
||||||
@ -36,20 +68,13 @@ eval set -- "$options"
|
|||||||
while [ $# -gt 0 ]
|
while [ $# -gt 0 ]
|
||||||
do
|
do
|
||||||
case $1 in
|
case $1 in
|
||||||
-t|--ncbi-taxid) taxid="$2" ; shift;;
|
-t|--ncbi-taxid) taxid="$2" ; shift ;;
|
||||||
-i|--no-ir-detection) irdetection="no" ;;
|
-i|--no-ir-detection) irdetection="no" ;;
|
||||||
-o|--organism) organism="$2" ; shift;;
|
-o|--organism) organism="$2" ; shift ;;
|
||||||
-h|--help) echo "Usage:" ;
|
-c|--chloroplast) types="chloro" ;;
|
||||||
echo " $0 "'[-t|--ncbi-taxid ###] [-n|--no-normalization] \'
|
-r|--nuclear-rdna) types="nucrdna" ;;
|
||||||
echo " [-i|--no-ir-detection] [-h|--help] <FASTAFILE>"
|
-m|--mitochondrion) types="mito" ;;
|
||||||
echo
|
-h|--help) usage $0 0;;
|
||||||
echo "Options:"
|
|
||||||
echo ' -t ### | --ncbi-taxid ###'
|
|
||||||
echo ' ### represents the ncbi taxid associated to the sequence'
|
|
||||||
echo
|
|
||||||
echo ' -i | --no-ir-detection'
|
|
||||||
echo ' Does not look for inverted repeats in the plastid genome'
|
|
||||||
exit 0;;
|
|
||||||
(--) shift; break;;
|
(--) shift; break;;
|
||||||
(-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;;
|
(-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;;
|
||||||
(*) break;;
|
(*) break;;
|
||||||
@ -57,6 +82,7 @@ do
|
|||||||
shift
|
shift
|
||||||
done
|
done
|
||||||
|
|
||||||
|
echo $type
|
||||||
#############################
|
#############################
|
||||||
|
|
||||||
pushTmpDir ORG.organnot
|
pushTmpDir ORG.organnot
|
||||||
@ -74,6 +100,10 @@ pushTmpDir ORG.organnot
|
|||||||
rm -f ${LOG}
|
rm -f ${LOG}
|
||||||
openLogFile ${LOG}
|
openLogFile ${LOG}
|
||||||
|
|
||||||
|
case "$types" in
|
||||||
|
chloro)
|
||||||
|
loginfo "Annotating a plant chloroplast genome..."
|
||||||
|
|
||||||
if [ "$irdetection"=="yes" ]; then
|
if [ "$irdetection"=="yes" ]; then
|
||||||
|
|
||||||
loginfo "Normalizing the structure of the Chloroplast sequence..."
|
loginfo "Normalizing the structure of the Chloroplast sequence..."
|
||||||
@ -99,16 +129,47 @@ pushTmpDir ORG.organnot
|
|||||||
tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
|
tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
|
||||||
loginfo "Done."
|
loginfo "Done."
|
||||||
|
|
||||||
|
topology="circular"
|
||||||
|
defline="plastid, complete genome"
|
||||||
|
;;
|
||||||
|
nucrdna)
|
||||||
|
loginfo "Annotating a plant rDNA cistron..."
|
||||||
|
|
||||||
|
loginfo "Normalizing the structure of the cistron sequence..."
|
||||||
|
${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh ${QUERY} > "${RESULTS}.norm.fasta"
|
||||||
|
loginfo "Done."
|
||||||
|
|
||||||
|
loginfo "Annotating the rRNA genes..."
|
||||||
|
${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
|
||||||
|
loginfo "Done."
|
||||||
|
|
||||||
|
topology="linear"
|
||||||
|
defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
|
||||||
|
;;
|
||||||
|
mito)
|
||||||
|
loginfo "Annotating an animal mitochondrial genome..."
|
||||||
|
logerror "Not yet implemented"
|
||||||
|
|
||||||
|
topology="circular"
|
||||||
|
defline="mitochondrion, complete genome"
|
||||||
|
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo usage $0 1;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
if [[ "${organism}" == "no" ]]; then
|
||||||
|
organism="{organism}"
|
||||||
|
else
|
||||||
|
organism="$(echo ${organism} | tr '_' ' ')"
|
||||||
|
fi
|
||||||
|
|
||||||
loginfo "Printing minimal header..."
|
loginfo "Printing minimal header..."
|
||||||
echo "ID XXX; XXX; circular; genomic DNA; XXX; XXX; $(seqlength ${RESULTS}.norm.fasta) BP."
|
echo "ID XXX; XXX; ${topology}; genomic DNA; XXX; XXX; $(seqlength ${RESULTS}.norm.fasta) BP."
|
||||||
echo "XX"
|
echo "XX"
|
||||||
echo "AC XXX;"
|
echo "AC XXX;"
|
||||||
echo "XX"
|
echo "DE ${organism} ${defline}."
|
||||||
if [[ "${organism}" == "no" ]]; then
|
|
||||||
echo "DE {organism} plastid, complete genome."
|
|
||||||
else
|
|
||||||
echo "DE $(echo ${organism} | tr '_' ' ') plastid, complete genome."
|
|
||||||
fi
|
|
||||||
echo "XX"
|
echo "XX"
|
||||||
loginfo "Done."
|
loginfo "Done."
|
||||||
|
|
||||||
|
@ -182,6 +182,12 @@ CDS_DATA_DIR="${DATA_DIR}/cds" # Directory containing data related to
|
|||||||
RRNA_DATA_DIR="${DATA_DIR}/rrna" # Directory containing data related to
|
RRNA_DATA_DIR="${DATA_DIR}/rrna" # Directory containing data related to
|
||||||
# rRNAs detection
|
# rRNAs detection
|
||||||
|
|
||||||
|
NUCRRNA_DATA_DIR="${DATA_DIR}/nucrrna" # Directory containing data related to
|
||||||
|
# rRNAs detection
|
||||||
|
|
||||||
|
ITS_DATA_DIR="${DATA_DIR}/its" # Directory containing data related to
|
||||||
|
# rRNAs detection
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
|
@ -25,31 +25,44 @@ PRTPATH = $(abspath $(PRTDIR))
|
|||||||
DATADIR = $(CFGDIR)../data
|
DATADIR = $(CFGDIR)../data
|
||||||
DATAITS = $(DATADIR)/its
|
DATAITS = $(DATADIR)/its
|
||||||
|
|
||||||
|
HMMPRESS= $(BINDIR)/hmmpress
|
||||||
|
|
||||||
|
HMMDIR = $(PKGDIR)/ITSx_db/HMMs
|
||||||
|
HMMS = $(wildcard $(HMMDIR)/*.hmm)
|
||||||
|
HMMP = $(patsubst %.hmm,%.hmm.h3p,$(HMMS))
|
||||||
|
HMMI = $(patsubst %.hmm,%.hmm.h3i,$(HMMS))
|
||||||
|
HMMM = $(patsubst %.hmm,%.hmm.h3m,$(HMMS))
|
||||||
|
|
||||||
#
|
#
|
||||||
# Rules
|
# Rules
|
||||||
#
|
#
|
||||||
|
|
||||||
.PHONY: all clean test portclean pkg pkg.expand pkg.install
|
.PHONY: all clean test portclean pkg pkg.expand pkg.install
|
||||||
|
|
||||||
all:: pkg
|
%.hmm.h3i: %.hmm
|
||||||
|
echo $(HMMPRESS)
|
||||||
|
(! test -s $< ) || $(HMMPRESS) -f $<
|
||||||
|
|
||||||
|
all:: pkg.install
|
||||||
|
|
||||||
pkg.expand::
|
pkg.expand::
|
||||||
test -d $(PKGDIR) || mkdir $(PKGDIR)
|
test -d $(PKGDIR) || mkdir $(PKGDIR)
|
||||||
$(TAR) zxf $(PKGTAR) -C $(PKGDIR) --strip-components 1
|
$(TAR) zxf $(PKGTAR) -C $(PKGDIR) --strip-components 1
|
||||||
|
|
||||||
pkg.install:: pkg.expand
|
pkg.install:: pkg
|
||||||
@mkdir -p $(BINDIR)
|
@mkdir -p $(BINDIR)
|
||||||
@cp $(PKGDIR)/ITSx $(BINDIR)
|
@cp $(PKGDIR)/ITSx $(BINDIR)
|
||||||
@mkdir -p $(DATAITS)
|
@mkdir -p $(DATAITS)
|
||||||
@cp -r $(PKGDIR)/ITSx_db $(DATAITS)
|
@cp -r $(PKGDIR)/ITSx_db $(DATAITS)
|
||||||
@echo "+++++++++++ package $(PKG) done"
|
@echo "+++++++++++ package $(PKG) done"
|
||||||
|
|
||||||
pkg:: pkg.install
|
pkg:: pkg.expand clean $(HMMI)
|
||||||
|
|
||||||
test::
|
test::
|
||||||
echo No test available
|
echo No test available
|
||||||
|
|
||||||
clean::
|
clean::
|
||||||
|
\rm -f $(HMMP) $(HMMI) $(HMMM)
|
||||||
echo Done
|
echo Done
|
||||||
|
|
||||||
portclean::
|
portclean::
|
||||||
|
Reference in New Issue
Block a user