8113b80d4705f9429bc9e35d4a8e5c13137bc463

@@ -0,0 +1,70 @@
#!/bin/bash
#
#           Annotate the Intergenic Spacer (ITS) of nuclear rDNA cluster
#
#========================================================================================
#
# This script is based on ITSx
# 
#
#  go_its.sh <FASTAFILE>
#
#		- <FASTAFILE> : The fasta file containing the cluster to annotate
#
#  Results are printed to the standart output
#
#========================================================================================

# -- CAUTION -- Works as long than the script 
#               is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

pushTmpDir ORG.its

	loginfo "Normalizing nuclear rDNA cistron..."

	RRNADB="${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm"
	
	if [[ ! "$1" =~ ^/ ]]; then
		QUERY="${CALL_DIR}/$1"
	else
		QUERY="$1"
	fi


	strand=( $(hmmsearch --max ${RRNADB} ${QUERY} | \
				$AwkCmd '/Query: / { \
				                profil=$2; \
				                match($3,"[0-9][0-9]*");\
				                lprof=substr($3,RSTART,RLENGTH)} \
				     / [0-9][0-9]* ! / { \
				                print profil,lprof,$7,$8,$10,$11}' | \
				$AwkCmd '($3 <=5) && (($2-$4) <=5) { \
				                full=1;$5=$5-$3+1;$6=$6+($2-$4)}  \
				               {loc="Forward"} \
				     ($1 ~ /_RC$/) { \
				                loc="Reverse"} \
				     (full==1) {match($1,"_..*S");\
				                rrna=substr($1,RSTART+1,RLENGTH-1);\
				                print loc;\
				                full=0
				                }' | sort | uniq) )
				               
	if [[ "${#strand[@]}" == 1 ]] ; then
		if [[ "${strand[0]}" == "Forward" ]] ; then
			cat ${QUERY}
		else
			fastarevcomp -f ${QUERY}
		fi
	else
		logerror "Cannot determine the Cistron orientation"
		exit 1
	fi
	 
    loginfo "Done."

popTmpDir

exit 0

@@ -0,0 +1,88 @@
#!/bin/bash
#
#           Annotate the Intergenic Spacer (ITS) of nuclear rDNA cluster
#
#========================================================================================
#
# This script is based on ITSx
# 
#
#  go_its.sh <FASTAFILE>
#
#		- <FASTAFILE> : The fasta file containing the cluster to annotate
#
#  Results are printed to the standart output
#
#========================================================================================

# -- CAUTION -- Works as long than the script 
#               is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

pushTmpDir ORG.its

	loginfo "Annotating ITS and TSU..."

	RRNADB="${NUCRRNA_DATA_DIR}/plants/nuc_RRNA.hmm"
	
	if [[ ! "$1" =~ ^/ ]]; then
		QUERY="${CALL_DIR}/$1"
	else
		QUERY="$1"
	fi


    ITSx -p "${ITS_DATA_DIR}/ITSx_db/HMMs" -i "${QUERY}" -o "output.itsx"
    
    ITS1=( $(sed -E 's/.*ITS1: *([0-9]+)-([0-9]+).*/\1 \2/' "output.itsx.positions.txt" ) )
    ITS2=( $(sed -E 's/.*ITS2: *([0-9]+)-([0-9]+).*/\1 \2/' "output.itsx.positions.txt" ) )
    TSU=( $(sed -E 's/.*5\.8S: *([0-9]+)-([0-9]+).*/\1 \2/' "output.itsx.positions.txt" ) )


    if [[ ${#ITS1[@]}=="2" ]]  ; then
    	echo "FT   misc_RNA        ${ITS1[0]}..${ITS1[1]}"
		echo 'FT                   /gene="ITS1"'
		echo 'FT                   /note="internal transcribed spacer 1, ITS1"'
	fi
	
    if [[ ${#TSU[@]}=="2" ]]  ; then
		echo "FT   rRNA            ${TSU[0]}..${TSU[1]}"
		echo 'FT                   /gene="5.8S rRNA"'
		echo 'FT                   /product="5.8S ribosomal nuclear RNA"'
	fi
	
    if [[ ${#ITS2[@]}=="2" ]]  ; then
		echo "FT   misc_RNA        ${ITS2[0]}..${ITS2[1]}"
		echo 'FT                   /gene="ITS2"'
		echo 'FT                   /note="internal transcribed spacer 2, ITS2"'
	fi
    

	hmmsearch --max ${RRNADB} ${QUERY} | \
		$AwkCmd '/Query: / { \
		                profil=$2; \
		                match($3,"[0-9][0-9]*");\
		                lprof=substr($3,RSTART,RLENGTH)} \
		     / [0-9][0-9]* ! / { \
		                print profil,lprof,$7,$8,$10,$11}' | \
		$AwkCmd '($3 <=5) && (($2-$4) <=5) { \
		                full=1;$5=$5-$3+1;$6=$6+($2-$4)}  \
		               {loc=$5".."$6} \
		     ($1 ~ /_RC$/) { \
		                loc="complement("loc")"} \
		     (full==1) {match($1,"_..*S");\
		                rrna=substr($1,RSTART+1,RLENGTH-1);\
		                print "FT   rRNA            " loc; \
		                print "FT                   /gene=\"rrn"rrna"\""
		                print "FT                   /product=\""rrna" ribosomal RNA\"";\
		                full=0
		                }'	

    loginfo "Done."

popTmpDir

exit 0

@@ -23,9 +23,41 @@ taxid="no"
normalization="yes"
irdetection="yes"
organism="no"
types="chloro"

function usage {
	echo "Usage:" ;  
	echo "    $1 "'[-t|--ncbi-taxid ###] [-n|--no-normalization] \' 
	echo '       [-i|--no-ir-detection] [-h|--help] \ '
	echo '       [-o|--organism <organism_name>]  \ '
	echo '       [-c|--chloroplast|-r|--nuclear-rdna|-m|--mitochondrion] <FASTAFILE>' 
	echo
	echo "Options:"
	echo '  -t ### | --ncbi-taxid ###'
	echo '      ### represents the ncbi taxid associated to the sequence'
	echo
	echo '  -i     | --no-ir-detection'
	echo '      Does not look for inverted repeats in the plastid genome'
	echo
	echo '  -o     | --organism <organism_name>'
	echo '      Allows for specifiying the organism name in the embl generated file'
	echo '      Spaces have to be substituted by underscore ex : Abies_alba'
	echo
	echo '  -c     | --chloroplast'
	echo '      Selects for the annotation of a chloroplast genome'
	echo '      This is the default mode'
	echo
	echo '  -r     | --nuclear-rdna'
	echo '      Selects for the annotation of the rDNA nuclear cistron'
	echo
	echo '  -m     | --mitochondrion'
	echo '      Selects for the annotation of an animal mitochondrion genome'
   	exit $2
}


# options may be followed by one colon to indicate they have a required argument
if ! options=$(getopt -o t:o:ih -l ncbi-taxid:,organism,no-ir-detection,help -- "$@")
if ! options=$(getopt -o t:o:icrmh -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,help -- "$@")
then
    # something went wrong, getopt will put out an error message for us
    exit 1
@@ -36,20 +68,13 @@ eval set -- "$options"
while [ $# -gt 0 ]
do
    case $1 in
    -t|--ncbi-taxid) taxid="$2" ; shift;;
    -t|--ncbi-taxid) taxid="$2" ; shift ;;
    -i|--no-ir-detection)  irdetection="no" ;;
    -o|--organism) organism="$2" ; shift;;
    -h|--help)  echo "Usage:" ;  
    			echo "    $0 "'[-t|--ncbi-taxid ###] [-n|--no-normalization] \' 
    			echo "       [-i|--no-ir-detection] [-h|--help] <FASTAFILE>"
    			echo
    			echo "Options:"
    			echo '  -t ### | --ncbi-taxid ###'
    			echo '      ### represents the ncbi taxid associated to the sequence'
    			echo
    			echo '  -i     | --no-ir-detection'
    			echo '      Does not look for inverted repeats in the plastid genome'
               	exit 0;;
    -o|--organism) organism="$2" ; shift ;;
    -c|--chloroplast) types="chloro" ;;
    -r|--nuclear-rdna) types="nucrdna" ;;
    -m|--mitochondrion) types="mito" ;;
    -h|--help)  usage $0 0;;
    (--) shift; break;;
    (-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;;
    (*) break;;
@@ -57,6 +82,7 @@ do
    shift
done

echo $type
#############################

pushTmpDir ORG.organnot
@@ -73,43 +99,78 @@ pushTmpDir ORG.organnot
	
	rm -f ${LOG}
	openLogFile ${LOG}
		
	if [ "$irdetection"=="yes" ]; then

		loginfo "Normalizing the structure of the Chloroplast sequence..."
			loginfo "   LSC + IRB + SSC + IRA"
			${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta"
		loginfo "Done."
	case "$types" in 
		chloro) 
			loginfo "Annotating a plant chloroplast genome..."
	
			if [ "$irdetection"=="yes" ]; then
		
		loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
			${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"		
		loginfo "Done."
				loginfo "Normalizing the structure of the Chloroplast sequence..."
					loginfo "   LSC + IRB + SSC + IRA"
					${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta"
				loginfo "Done."
				
				loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
					${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"		
				loginfo "Done."
				
			fi
			
			loginfo "Annotating the tRNA..."
				${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
			loginfo "Done."
			
			loginfo "Annotating the rRNA genes..."
				${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
			loginfo "Done."
		
			loginfo "Annotating the CDS..."
				tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
			loginfo "Done."
			
			topology="circular"
			defline="plastid, complete genome"
			;;
		nucrdna) 
			loginfo "Annotating a plant rDNA cistron..."
			
			loginfo "Normalizing the structure of the cistron sequence..."
				${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh ${QUERY} > "${RESULTS}.norm.fasta"
			loginfo "Done."
			
			loginfo "Annotating the rRNA genes..."
				${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
			loginfo "Done."

			topology="linear"
			defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
			;;
		mito) 
			loginfo "Annotating an animal mitochondrial genome..."
			logerror "Not yet implemented"

			topology="circular"
			defline="mitochondrion, complete genome"

			exit 1
			;;
		*) 
			echo usage $0 1;;
	esac
					
	if [[ "${organism}" == "no" ]]; then
		organism="{organism}"
	else
		organism="$(echo ${organism} | tr '_' ' ')"
	fi
	
	loginfo "Annotating the tRNA..."
		${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
	loginfo "Done."
	
	loginfo "Annotating the rRNA genes..."
		${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
	loginfo "Done."

	loginfo "Annotating the CDS..."
		tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
	loginfo "Done."
	
	loginfo "Printing minimal header..."		
		echo "ID   XXX; XXX; circular; genomic DNA; XXX; XXX; $(seqlength ${RESULTS}.norm.fasta) BP."
		echo "ID   XXX; XXX; ${topology}; genomic DNA; XXX; XXX; $(seqlength ${RESULTS}.norm.fasta) BP."
		echo "XX"
		echo "AC   XXX;"
		echo "DE   ${organism} ${defline}."
		echo "XX"
		if [[ "${organism}" == "no" ]]; then
			echo "DE   {organism} plastid, complete genome."
		else
			echo "DE   $(echo ${organism} | tr '_' ' ') plastid, complete genome."
		fi
		echo "XX"	
	loginfo "Done."

	loginfo "Printing annotations header..."
 
@@ -182,6 +182,12 @@ CDS_DATA_DIR="${DATA_DIR}/cds"  				# Directory containing data related to
RRNA_DATA_DIR="${DATA_DIR}/rrna"  				# Directory containing data related to 
												# rRNAs detection
								
NUCRRNA_DATA_DIR="${DATA_DIR}/nucrrna"  				# Directory containing data related to 
												# rRNAs detection
								
ITS_DATA_DIR="${DATA_DIR}/its"  				# Directory containing data related to 
												# rRNAs detection
								

#
#
 
@@ -25,31 +25,44 @@ PRTPATH = $(abspath $(PRTDIR))
DATADIR = $(CFGDIR)../data
DATAITS = $(DATADIR)/its

HMMPRESS= $(BINDIR)/hmmpress

HMMDIR  = $(PKGDIR)/ITSx_db/HMMs
HMMS    = $(wildcard $(HMMDIR)/*.hmm)
HMMP    = $(patsubst %.hmm,%.hmm.h3p,$(HMMS))
HMMI    = $(patsubst %.hmm,%.hmm.h3i,$(HMMS))
HMMM    = $(patsubst %.hmm,%.hmm.h3m,$(HMMS))

#
# Rules
#

.PHONY: all clean test portclean pkg pkg.expand pkg.install

all:: pkg
%.hmm.h3i: %.hmm
	echo $(HMMPRESS)
	(! test -s $< ) || $(HMMPRESS) -f $<

all:: pkg.install

pkg.expand::
	test -d $(PKGDIR) || mkdir $(PKGDIR)
	$(TAR) zxf $(PKGTAR) -C $(PKGDIR) --strip-components 1

pkg.install:: pkg.expand
pkg.install:: pkg
	@mkdir -p $(BINDIR)
	@cp $(PKGDIR)/ITSx $(BINDIR)
	@mkdir -p $(DATAITS)
	@cp -r $(PKGDIR)/ITSx_db $(DATAITS)
	@echo "+++++++++++ package $(PKG) done"

pkg:: pkg.install
pkg:: pkg.expand clean $(HMMI) 

test::
	echo No test available
	
clean::
	\rm -f $(HMMP) $(HMMI) $(HMMM)
	echo Done

portclean::