From 775d1a7157f0c3c880fd51ba4e191ac390fd2810 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric.coissac@metabarcoding.org>
Date: Thu, 4 Nov 2021 21:55:59 +0100
Subject: [PATCH] Adds changes to conform with EMBL template files

Former-commit-id: 93da2c4c6fe0ca46de5adb439341e970ba5abd55
Former-commit-id: 0ba15a4167e769b8e13507bd399892eb6098b4c8
---
 detectors/nucrrna/bin/go_nucrrna.sh |  37 ++--
 org-annotate.sh                     | 257 ++++++++++++++++++++++++----
 2 files changed, 240 insertions(+), 54 deletions(-)

diff --git a/detectors/nucrrna/bin/go_nucrrna.sh b/detectors/nucrrna/bin/go_nucrrna.sh
index 4665d4f..4547633 100755
--- a/detectors/nucrrna/bin/go_nucrrna.sh
+++ b/detectors/nucrrna/bin/go_nucrrna.sh
@@ -50,7 +50,7 @@ pushTmpDir ORG.its
     if [[ ${#TSU[@]}=="2" ]]  ; then
 		echo "FT   rRNA            ${TSU[0]}..${TSU[1]}"
 		echo 'FT                   /gene="5.8S rRNA"'
-		echo 'FT                   /product="5.8S ribosomal nuclear RNA"'
+		echo 'FT                   /product="5.8S ribosomal RNA"'
 	fi
 	
     if [[ ${#ITS2[@]}=="2" ]]  ; then
@@ -61,23 +61,26 @@ pushTmpDir ORG.its
     
 
 	hmmsearch --max ${RRNADB} ${QUERY} | \
-		$AwkCmd '/Query: / { \
-		                profil=$2; \
-		                match($3,"[0-9][0-9]*");\
-		                lprof=substr($3,RSTART,RLENGTH)} \
-		     / [0-9][0-9]* ! / { \
+		$AwkCmd '/Query: / { 
+		                profil=$2
+		                match($3,"[0-9][0-9]*");
+		                lprof=substr($3,RSTART,RLENGTH)} 
+		     / [0-9][0-9]* ! / { 
 		                print profil,lprof,$7,$8,$10,$11}' | \
-		$AwkCmd '($3 <=5) && (($2-$4) <=5) { \
-		                full=1;$5=$5-$3+1;$6=$6+($2-$4)}  \
-		               {loc=$5".."$6} \
-		     ($1 ~ /_RC$/) { \
-		                loc="complement("loc")"} \
-		     (full==1) {match($1,"_..*S");\
-		                rrna=substr($1,RSTART+1,RLENGTH-1);\
-		                print "FT   rRNA            " loc; \
-		                print "FT                   /gene=\"rrn"rrna"\""
-		                print "FT                   /product=\""rrna" ribosomal RNA\"";\
-		                full=0
+		$AwkCmd '($3 <=5) && (($2-$4) <=5) { 
+		                  full=1;$5=$5-$3+1;$6=$6+($2-$4)
+						}  
+		                { loc=$5".."$6 } 
+		         ($1 ~ /_RC$/) { 
+		                  loc="complement("loc")" 
+						} 
+		         (full==1) { 
+					      match($1,"_..*S")
+		                  rrna=substr($1,RSTART+1,RLENGTH-1)
+		                  print "FT   rRNA            " loc
+		                  print "FT                   /gene=\""rrna" rRNA\""
+		                  print "FT                   /product=\""rrna" ribosomal RNA\""
+		                  full=0
 		                }'	
 
     loginfo "Done."
diff --git a/org-annotate.sh b/org-annotate.sh
index 1ac656e..ffdcebe 100755
--- a/org-annotate.sh
+++ b/org-annotate.sh
@@ -4,8 +4,23 @@
 #
 #                           Annotate Organelle 
 #
+# The org-annotate pipeline aims to annotate fasta files produced by assembling 
+# genome skimming. It has been developped in the context of the PhyloAlps 
+# (http://phyloalps.org) and of the PhyloNorway (http://phylonorway.no) projects.
+#
+# Today it is able to produce EMBL flat files suitable for submission to ENA/EBI
+# It provides annotation procedure for :
+#
+#     - Plant chloroplast genomes.
+#     - Nuclear rDNA Region.
+# 
+#
 #========================================================================================
 #
+# The template used for generating the EMBL files follows the recommendation presented
+# at ENA documentation website (at the date of 2021-11-04).
+# 
+# https://ena-docs.readthedocs.io/en/latest/submit/fileprep/sequence-flatfile.html
 #
 #========================================================================================
 
@@ -27,7 +42,12 @@ cdsdetection_pass1="yes"
 cdsdetection_pass2="yes"
 trnadetection="yes"
 rrnadetection="yes"
+idprefix="no"
 organism="no"
+country="no"
+specimen="no"
+project="no"
+resetorganism="yes"
 types="chloro"
 partial=0
 minlength=0
@@ -40,16 +60,8 @@ function usage {
 	echo '       [-c|--chloroplast|-r|--nuclear-rdna|-m|--mitochondrion] <FASTAFILE>' 
 	echo
 	echo "Options:"
-	echo '  -t ### | --ncbi-taxid ###'
-	echo '      ### represents the ncbi taxid associated to the sequence'
-	echo
-	echo '  -i     | --no-ir-detection'
-	echo '      Does not look for inverted repeats in the plastid genome'
-	echo
-	echo '  -o     | --organism <organism_name>'
-	echo '      Allows for specifiying the organism name in the embl generated file'
-	echo '      Spaces have to be substituted by underscore ex : Abies_alba'
-	echo
+	echo 
+	echo ' Defining the sequence category'
 	echo '  -c     | --chloroplast'
 	echo '      Selects for the annotation of a chloroplast genome'
 	echo '      This is the default mode'
@@ -57,15 +69,53 @@ function usage {
 	echo '  -r     | --nuclear-rdna'
 	echo '      Selects for the annotation of the rDNA nuclear cistron'
 	echo
-	echo '  -m     | --mitochondrion'
-	echo '      Selects for the annotation of an animal mitochondrion genome'
+#	echo '  -m     | --mitochondrion'
+#	echo '      Selects for the annotation of an animal mitochondrion genome'
 	echo
+	echo ' Providing information about the sequence'
+	echo '  -s     | --specimen ###'
+	echo '      Represents the specimen voucher identifier '
+	echo '      (e.g. for herbarium sample TROM_V_991090'
+	echo '      will be added to the /specimen_voucher qualifier'
+	echo
+	echo '  -t     | --ncbi-taxid ###'
+	echo '      Represents the ncbi taxid associated to the sequence'
+	echo
+	echo '  -o     | --organism <organism_name>'
+	echo '      Allows for specifiying the organism name in the embl generated file'
+	echo '      Spaces have to be substituted by underscore ex : Abies_alba'
+	echo
+	echo '  -b     | --country "<country_value>[:<region>][, <locality>]"'
+	echo '      Location (at least country) of collection'
+	echo
+	echo '  -f     | --not-force-ncbi'
+	echo '      Do not force the name of the organism to match the NCBI scientific name.'
+	echo '      if the provided NCB taxid is publically defined'
+	echo
+	echo ' Information related to an ENA project'
+	echo '  -P     | --project <ENA Project>'
+	echo '      The accession number of the related project in ENA'
+	echo
+	echo '  -i     | --id-prefix <prefix>'
+	echo '      prefix used to build the sequence identifier'
+	echo '      the number of the contig is append to the prefix'
+	echo '      to build the complete id. This id is used only if'
+	echo '      an ENA project is specified.'
+ 	echo
+	echo ' Annotation of partial sequences'
 	echo '  -p     | --partial'
 	echo '      Indicates that the genome sequence is partial and therefore in several contigs'
 	echo
 	echo '  -l     | --min-length'
 	echo '      Indicates for partial mode the minimum length of contig to annotate'
 	echo
+	echo ' Setting up the sequence annotation'
+	echo '  -N     | --no-normalization'
+	echo '      Does not normalize the sequence befire annotation'
+	echo
+	echo '  -I     | --no-ir-detection'
+	echo '      Does not look for inverted repeats in the plastid genome'
+	echo
 	echo '  -C     | --no-cds'
 	echo '      Do not annotate CDS'
 	echo
@@ -84,6 +134,59 @@ function usage {
 }
 
 
+function ncbiscientificname {
+	local efetch='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
+	local params='db=taxonomy&id='$1'&mode=text&report=xml'
+	local url="${efetch}?${params}" 
+	curl $url \
+	| grep '<ScientificName>' \
+	| sed 's/<ScientificName>//' \
+	| sed 's@</ScientificName>@@' \
+	| sed 's@^ *@@' \
+	| sed 's@ *$@@' \
+	| head -1
+}
+
+function split80 {
+	local text=$1
+	local pattern
+	local header
+
+	if (( $# >= 2 )) ; then
+		header=$2
+	else
+		header=""
+	fi
+
+	if (( $# >= 3 )) ; then
+		pattern=$3
+	else
+		pattern=" "
+	fi
+
+	echo $text \
+	| $AwkCmd -v pattern="$pattern" -v header="$header" '
+		BEGIN { line = header }
+		{
+			n = split($0,parts,pattern)
+			j = 1
+			for (i = 1; i <= n; i++) {
+				if (length(line) + length(parts[i]) > 79) {
+					print line
+					line = header
+					j = i
+				} 
+				if (i > j) line = line pattern
+				line = line parts[i]
+			}
+			$0 = line    
+        }		
+		END {
+			print $0
+		}
+	  '
+}
+
 function fastaIterator() {
 	$AwkCmd '/^>/ {if (seq) printf("%s\f",seq); seq=""} \
 	          {if (seq) seq=seq"\n"; seq=seq $1} \
@@ -91,7 +194,7 @@ function fastaIterator() {
 }
 
 # options may be followed by one colon to indicate they have a required argument
-if ! options=$(getopt -o t:o:icrmhpl:CDETR -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,partial,min-length:,help,no-cds,no-cds-pass1,no-cds-pass2,no-trna,no-rrna -- "$@")
+if ! options=$(getopt -o s:t:o:b:P:i:fcrmhpl:NICDETR -l specimen:,ncbi-taxid:,organism:,country:,project:,id-prefix:,not-force-ncbi,chloroplast,nuclear-rdna,mitochondrion,partial,min-length:,help,no-normalization,no-ir-detection,no-cds,no-cds-pass1,no-cds-pass2,no-trna,no-rrna -- "$@")
 then
     # something went wrong, getopt will put out an error message for us
     usage $0 1
@@ -102,15 +205,21 @@ eval set -- "$options"
 while [ $# -gt 0 ]
 do
     case $1 in
-    -t|--ncbi-taxid) taxid="$2" ; shift ;;
-    -i|--no-ir-detection)  irdetection="no" ;;
-    -o|--organism) organism="$2" ; shift ;;
     -c|--chloroplast) types="chloro" ;;
     -r|--nuclear-rdna) types="nucrdna" ;;
-    -m|--mitochondrion) types="mito" ;;
+#    -m|--mitochondrion) types="mito" ;;
+    -t|--ncbi-taxid) taxid="$2" ; shift ;;
+	-s|--specimen) specimen="$2" ; shift ;;
+	-b|--country) country="$2" ; shift ;;
+    -o|--organism) organism="$2" ; shift ;;
+    -P|--project) project="$2" ; shift ;;
+    -i|--id-prefix) idprefix="$2" ; shift ;;
+    -f|--not-force-ncbi)  resetorganism="no" ;;
     -p|--partial) partial="1" ;;
     -l|--min-length) minlength="$2" ; shift ;;
     -h|--help)  usage $0 0;;
+    -N|--no-normalization)  normalization="no" ;;
+    -I|--no-ir-detection)  irdetection="no" ;;
 	-C|--no-cds) cdsdetection="no";;
 	-D|--no-cds-pass1) cdsdetection_pass1="no";;
 	-E|--no-cds-pass2) cdsdetection_pass2="no";;
@@ -123,16 +232,33 @@ do
     shift
 done
 
-loginfo "Annotating mode.....: $types"
-loginfo "IR detection mode...: $irdetection"
-loginfo "CDS detection mode..: $cdsdetection"
-loginfo "           pass 1...: $cdsdetection_pass1"
-loginfo "           pass 2...: $cdsdetection_pass2"
-loginfo "tRNA detection mode.: $trnadetection"
-loginfo "rRNA detection mode.: $rrnadetection"
-loginfo "Organism............: $organism"
-loginfo "Partial mode........: $partial"
-loginfo "Minimum length......: $minlength"
+loginfo "NCBI taxid provided......: $taxid"
+
+if [[ "$taxid" != "no" ]] ; then
+	scientificname=$(ncbiscientificname $taxid)
+	loginfo "NCBI scientific name.....: $scientificname"	
+	if [[ -z "$scientificname" ]] ; then
+		loginfo "    Unknown taxid."
+	else
+		loginfo "Organism name from taxid.: $resetorganism"
+		if [[ "$resetorganism" == "yes" ]] ; then
+			organism=$(echo $scientificname | tr ' ' '_')
+		fi
+	fi
+fi
+
+loginfo "Annotating mode..........: $types"
+loginfo "Sequence normalization...: $normalization"
+loginfo "IR detection mode........: $irdetection"
+loginfo "CDS detection mode.......: $cdsdetection"
+loginfo "           pass 1........: $cdsdetection_pass1"
+loginfo "           pass 2........: $cdsdetection_pass2"
+loginfo "tRNA detection mode......: $trnadetection"
+loginfo "rRNA detection mode......: $rrnadetection"
+loginfo "Organism.................: $organism"
+loginfo "Country..................: $country"
+loginfo "Partial mode.............: $partial"
+loginfo "Minimum length...........: $minlength"
 
 
 #############################
@@ -155,8 +281,11 @@ pushTmpDir ORG.organnot
 	openLogFile ${LOG}
 
 	IFS=$'\f'
+
+	sequence_number=0
 	
 	for sequence in $(fastaIterator "${QUERY}") ; do
+	    let sequence_number=sequence_number+1
 		unset IFS
 		if [[ ! -z "${sequence}" ]] ; then
 			echo "${sequence}" > toannotate.fasta
@@ -172,10 +301,15 @@ pushTmpDir ORG.organnot
 						
 						if [[ "$irdetection" == "yes" ]] && (( partial == 0 )) ; then
 					
-							loginfo "Normalizing the structure of the Chloroplast sequence..."
+							if [[ "$normalization" == "yes" ]] ; then
+								loginfo "Normalizing the structure of the Chloroplast sequence..."
 								loginfo "   LSC + IRB + SSC + IRA"
 								${PROG_DIR}/detectors/normalize/bin/go_normalize.sh toannotate.fasta > "${RESULTS}.norm.fasta"
-							loginfo "Done."
+								loginfo "Done."
+							else
+								loginfo "No normalization the structure of the Chloroplast sequence..."
+								cat toannotate.fasta > "${RESULTS}.norm.fasta"
+							fi
 							
 							loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
 								${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"		
@@ -220,16 +354,40 @@ pushTmpDir ORG.organnot
 					nucrdna) 
 						loginfo "Annotating a plant rDNA cistron..."
 						
-						loginfo "Normalizing the structure of the cistron sequence..."
+						if [[ "$normalization" == "yes" ]] ; then
+							loginfo "Normalizing the structure of the cistron sequence..."
 							${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh toannotate.fasta > "${RESULTS}.norm.fasta"								
-						loginfo "Done."
+							loginfo "Done."
+						else
+							loginfo "No normalization of the structure of the cistron sequence..."
+							cat toannotate.fasta > "${RESULTS}.norm.fasta"
+						fi
 						
 						loginfo "Annotating the rRNA genes..."
 							${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
 						loginfo "Done."
 			
 						topology="linear"
-						defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
+						defline=$(cat "${RESULTS}.annot" \
+						        | grep "/gene=" \
+								| sed -E 's@^FT */gene="([^"]*)".*$@\1@' \
+								| sort -u \
+								| awk '{printf("%s;",$0)}' \
+								| awk 'BEGIN        {i=1} 
+								       /18S rRNA;/  {gene[i]="18S rRNA"; i++} 
+									   /ITS1;/      {gene[i]="ITS1"; i++} 
+									   /5.8S rRNA;/ {gene[i]="5.8S rRNA"; i++} 
+									   /ITS2;/      {gene[i]="ITS2"; i++} 
+									   /28S rRNA;/  {gene[i]="28S rRNA"; ii++} 
+									   END          {
+										for (i=1;i <= length(gene); i++) {
+											separator =""
+											if (i  < (length(gene)-1)) separator=", " 
+											if (i == (length(gene)-1)) separator=" and " 
+											printf("%s gene%s",gene[i],separator)
+											}
+										}') 
+						# defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
 						;;
 						
 					mito) 
@@ -256,14 +414,31 @@ pushTmpDir ORG.organnot
 				else
 					organism="$(echo ${organism} | tr '_' ' ')"
 				fi
+
+				if [[ "$specimen" != "no" ]] ; then
+					defline="${defline}, voucher ${specimen}"
+				fi
 				
 				sl=$(seqlength "${RESULTS}.norm.fasta")
 				
 				loginfo "Printing minimal header..."		
-					echo "ID   ${seqid}; ${seqid}; ${topology}; genomic DNA; XXX; XXX; ${sl} BP."
+					echo "ID   XXX; XXX; ${topology}; genomic DNA; XXX; XXX; ${sl} BP."
 					echo "XX"
-					echo "AC   ${seqid};"
-					echo "DE   ${organism} ${defline}."
+					echo "AC   XXX;"
+					echo "XX"
+					if [[ "${project}" != "no" ]] ; then 
+					sequence_number
+						if [[ "$idprefix" != "no" ]] ; then
+							seqid="${idprefix}${sequence_number}"
+						fi
+						echo "AC * _${seqid};"
+						echo "XX"
+						echo "PR   Project:${project};"              
+						echo "XX"
+					fi	
+					
+					split80 "${organism} ${defline}." "DE   "
+					echo "DE   "
 					echo "XX"
 				loginfo "Done."
 			
@@ -274,7 +449,7 @@ pushTmpDir ORG.organnot
 				loginfo "Printing the source feature"
 						echo "FT   source          1..${sl}"                               
 	
-					if [[ "${organism}" != "{organism}" ]] ; then 
+					if [[ "${organism}" != "no" ]] ; then 
 						echo "FT                   /organism=\"${organism}\""              
 					fi	
 					
@@ -292,11 +467,18 @@ pushTmpDir ORG.organnot
 					
 						echo "FT                   /mol_type=\"genomic DNA\""              
 					
+					if [[ "${specimen}" != "no" ]] ; then 
+						echo "FT                   /specimen_voucher=\"${specimen}\""            
+					fi
+					
 					if [[ "${taxid}" != "no" ]] ; then 
 						echo "FT                   /db_xref=\"taxon:${taxid}\""            
 					fi
 					
-					#	echo "FT                   /country=\"Poland: Bialowieza Forest\"" 
+					if [[ "${country}" != "no" ]] ; then 
+						echo "FT                   /country=\"${country}\""            
+					fi
+					
 				loginfo "Done."
 				
 				loginfo "Ordering annotations..."
@@ -379,4 +561,5 @@ pushTmpDir ORG.organnot
 		IFS=$'\f'
 	done # End of the loop over the sequences
 popTmpDir
+loginfo "Annotation done."