From cf5a5d1ce5c12a5b05f8b254d5af840fc6957443 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric.coissac@metabarcoding.org>
Date: Wed, 5 Oct 2016 15:11:26 +0200
Subject: [PATCH] Add management of partial sequences

Former-commit-id: abd6112ac558592616c197f2fe761880a847b031
Former-commit-id: d2ea2e7e306512bf8ef92d3497a0302919b81010
---
 org-annotate.sh | 345 +++++++++++++++++++++++++++++-------------------
 1 file changed, 208 insertions(+), 137 deletions(-)

diff --git a/org-annotate.sh b/org-annotate.sh
index ef2f1ea..70bc1f1 100755
--- a/org-annotate.sh
+++ b/org-annotate.sh
@@ -24,6 +24,7 @@ normalization="yes"
 irdetection="yes"
 organism="no"
 types="chloro"
+partial=0
 
 function usage {
 	echo "Usage:" ;  
@@ -52,12 +53,21 @@ function usage {
 	echo
 	echo '  -m     | --mitochondrion'
 	echo '      Selects for the annotation of an animal mitochondrion genome'
+	echo
+	echo '  -p     | --partial'
+	echo '      Indicates that the genome sequence is partial and therefore in several contigs'
    	exit $2
 }
 
 
+function fastaIterator() {
+	awk '/^>/ {if (seq) printf("%s\f",seq); seq=""} \
+	          {if (seq) seq=seq"\n"; seq=seq $1} \
+	          END {print seq}' "$1"
+}
+
 # options may be followed by one colon to indicate they have a required argument
-if ! options=$(getopt -o t:o:icrmh -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,help -- "$@")
+if ! options=$(getopt -o t:o:icrmhp -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,partial,help -- "$@")
 then
     # something went wrong, getopt will put out an error message for us
     usage $0 1
@@ -74,6 +84,7 @@ do
     -c|--chloroplast) types="chloro" ;;
     -r|--nuclear-rdna) types="nucrdna" ;;
     -m|--mitochondrion) types="mito" ;;
+    -p|--partial) partial="1" ;;
     -h|--help)  usage $0 0;;
     (--) shift; break;;
     (-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;;
@@ -101,151 +112,211 @@ pushTmpDir ORG.organnot
 	rm -f ${LOG}
 	openLogFile ${LOG}
 
-
-	case "$types" in 
-		chloro) 
-			loginfo "Annotating a plant chloroplast genome..."
+	IFS=$'\f'
 	
-			if [ "$irdetection"=="yes" ]; then
+	for sequence in $(fastaIterator "${QUERY}") ; do
+		unset IFS
+		if [[ ! -z "${sequence}" ]] ; then
+			echo "${sequence}" > toannotate.fasta
+			
+			seqid=$(awk '(NR==1) {print substr($1,2,1000)}' toannotate.fasta)
+			
+			case "$types" in 
+				chloro) 
+					loginfo "Annotating a plant chloroplast genome..."
+					
+					if [[ "$irdetection"=="yes" ]] && (( partial == 0 )) ; then
+				
+						loginfo "Normalizing the structure of the Chloroplast sequence..."
+							loginfo "   LSC + IRB + SSC + IRA"
+							${PROG_DIR}/detectors/normalize/bin/go_normalize.sh toannotate.fasta > "${RESULTS}.norm.fasta"
+						loginfo "Done."
+						
+						loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
+							${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"		
+						loginfo "Done."
+						
+					else
+						cat toannotate.fasta > "${RESULTS}.norm.fasta"
+						rm -f "${RESULTS}.annot"
+						touch "${RESULTS}.annot"
+					fi
+					
+					loginfo "Annotating the tRNA..."
+						${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
+					loginfo "Done."
+					
+					loginfo "Annotating the rRNA genes..."
+						${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
+					loginfo "Done."
+				
+					loginfo "Annotating the CDS..."
+						tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
+					loginfo "Done."
+					
+					if (( partial == 0 )) ; then 
+						topology="circular"
+						defline="plastid, complete genome"
+					else
+						topology="linear"
+						defline="plastid, partial sequence"
+					fi
+					;;
+					
+				nucrdna) 
+					loginfo "Annotating a plant rDNA cistron..."
+					
+					loginfo "Normalizing the structure of the cistron sequence..."
+						${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh toannotate.fasta > "${RESULTS}.norm.fasta"
+					loginfo "Done."
+					
+					loginfo "Annotating the rRNA genes..."
+						${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
+					loginfo "Done."
 		
-				loginfo "Normalizing the structure of the Chloroplast sequence..."
-					loginfo "   LSC + IRB + SSC + IRA"
-					${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta"
-				loginfo "Done."
-				
-				loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
-					${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"		
-				loginfo "Done."
-				
+					topology="linear"
+					defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
+					;;
+					
+				mito) 
+					loginfo "Annotating an animal mitochondrial genome..."
+					logerror "Not yet implemented"
+		
+					if (( partial == 0 )) ; then 
+						topology="circular"
+						defline="mitochondrion, complete genome"
+					else
+						topology="linear"
+						defline="mitochondrion, partial sequence"
+					fi
+					
+					exit 1
+					;;
+					
+				*) 
+					usage $0 1;;
+			esac
+								
+			if [[ "${organism}" == "no" ]]; then
+				organism="{organism}"
+			else
+				organism="$(echo ${organism} | tr '_' ' ')"
 			fi
 			
-			loginfo "Annotating the tRNA..."
-				${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
-			loginfo "Done."
+			sl=$(seqlength "${RESULTS}.norm.fasta")
 			
-			loginfo "Annotating the rRNA genes..."
-				${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
+			loginfo "Printing minimal header..."		
+				echo "ID   ${seqid}; ${seqid}; ${topology}; genomic DNA; XXX; XXX; ${sl} BP."
+				echo "XX"
+				echo "AC   ${seqid};"
+				echo "DE   ${organism} ${defline}."
+				echo "XX"
 			loginfo "Done."
 		
-			loginfo "Annotating the CDS..."
-				tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
-			loginfo "Done."
-			
-			topology="circular"
-			defline="plastid, complete genome"
-			;;
-		nucrdna) 
-			loginfo "Annotating a plant rDNA cistron..."
-			
-			loginfo "Normalizing the structure of the cistron sequence..."
-				${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh ${QUERY} > "${RESULTS}.norm.fasta"
-			loginfo "Done."
-			
-			loginfo "Annotating the rRNA genes..."
-				${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"
+			loginfo "Printing annotations header..."
+		    	echo "FH   Key             Location/Qualifiers"
 			loginfo "Done."
 
-			topology="linear"
-			defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene"
-			;;
-		mito) 
-			loginfo "Annotating an animal mitochondrial genome..."
-			logerror "Not yet implemented"
-
-			topology="circular"
-			defline="mitochondrion, complete genome"
-
-			exit 1
-			;;
-		*) 
-			usage $0 1;;
-	esac
-						
-	if [[ "${organism}" == "no" ]]; then
-		organism="{organism}"
-	else
-		organism="$(echo ${organism} | tr '_' ' ')"
-	fi
-	
-	loginfo "Printing minimal header..."		
-		echo "ID   XXX; XXX; ${topology}; genomic DNA; XXX; XXX; $(seqlength ${RESULTS}.norm.fasta) BP."
-		echo "XX"
-		echo "AC   XXX;"
-		echo "DE   ${organism} ${defline}."
-		echo "XX"
-	loginfo "Done."
-
-	loginfo "Printing annotations header..."
-    	echo "FH   Key             Location/Qualifiers"
-	loginfo "Done."
-	
-	loginfo "Ordering annotations..."
-		awk '/^.....(misc|repeat|rRNA|tRNA|gene)/ { \
-		        match($3,"[0-9][0-9]*"); \
-		        pos=substr($3,RSTART,RLENGTH)*1000 + 1; \
-		        print pos,$0;    \
-		        next} \
-		      { pos++; \
-		        print pos,$0}' "${RESULTS}.annot" | \
-		sort -nk1 |\
-		awk '{ \
-		        match($0,"^[0-9]* ");\
-		        line=substr($0,RLENGTH+1);\
-		        print line}' 
-	loginfo "Done."
-	
-	loginfo "Closing annotations table..."
-		echo "XX"
-	loginfo "Done."
-	
-	loginfo "Computing statistics on nucleotide usage..."
-		awk '! /^>/ { \
-			    seq=toupper($0); \
-				gsub(" ","",seq); \
-			    lseq=length(seq); \
-				for (i=0; i < lseq; i++) { \
-					freq[substr(seq,i,1)]++}\
-					} \
-			 END { \
-			 	other=0; \
-			 	for (i in freq) { \
-			 		if (i!="A" && i!="C" && i!="G" && i!="T") {\
-			 			other+=freq[i] \
-			 			} \
-			 		}; \
-			 		print "SQ   Sequence "\
-			 		      (freq["A"]+freq["C"]+freq["G"]+freq["T"]+other) \
-			 		      " BP; "\
-			 		      freq["A"]" A; "\
-			 		      freq["C"]" C; "\
-			 		      freq["G"]" G; "\
-			 		      freq["T"]" T; "\
-			 		      other" other;" \
-			 }' "${RESULTS}.norm.fasta"
-	loginfo "Done."
-	
-	loginfo "Reformating sequences..."
-		lines=$(wc -l "${RESULTS}.norm.fasta" | awk '{print $1}')
-		awk -v lines=$lines ' \
-			! /^>/ { \
-					seq=tolower($0); \
-					gsub(" ","",seq); \
-					printf("     ") ;\
-					for (i=0; i < 6; i++) { \
-						f=substr(seq,i * 10, 10); \
-						pos+=length(f); \
-						f = f  substr("          ",1,10-length(f)); \
-						printf("%s ",f) \
-					}; \
-					if (NR==lines) \
-					  {pos-=1}; \
-					printf("   %6d\n",pos) \
-			   }' "${RESULTS}.norm.fasta"
-	loginfo "Done."
-	
-	loginfo "Closing sequence part..."
-		echo "//"
-	loginfo "Done."
+			loginfo "Printing the source feature"
+					echo "FT   source          1..${sl}"                               >> "${RESULTS}.annot"
 
+				if [[ "${organism}" != "{organism}" ]] ; then 
+					echo "FT                   /organism=\"${organism}\""              >> "${RESULTS}.annot"
+				fi	
+				
+				case "${types}" in 
+					chloro)  
+						echo "FT                   /organelle=\"plastid:chloroplast\"" >> "${RESULTS}.annot"
+					;;
+					mito)    
+						echo "FT                   /organelle=\"mitochondrion\""       >> "${RESULTS}.annot"
+					;;
+					*) 
+						loginfo "Nuclear sequence"
+					;;
+				esac
+				
+					echo "FT                   /mol_type=\"genomic DNA\""              >> "${RESULTS}.annot"
+				
+				if [[ "${taxid}" != "no" ]] ; then 
+					echo "FT                   /db_xref=\"taxon:${taxid}\""            >> "${RESULTS}.annot"
+				fi
+				
+				#	echo "FT                   /country=\"Poland: Bialowieza Forest\"" >> "${RESULTS}.annot"
+			loginfo "Done."
+			
+			loginfo "Ordering annotations..."
+				awk '/^.....(misc|repeat|rRNA|tRNA|gene|source)/ { \
+				        match($3,"[0-9][0-9]*"); \
+				        pos=substr($3,RSTART,RLENGTH)*1000 + 1; \
+				        print pos,$0;    \
+				        next} \
+				      { pos++; \
+				        print pos,$0}' "${RESULTS}.annot" | \
+				sort -nk1 |\
+				awk '{ \
+				        match($0,"^[0-9]* ");\
+				        line=substr($0,RLENGTH+1);\
+				        print line}' 
+			loginfo "Done."
+			
+			
+			
+			loginfo "Closing annotations table..."
+				echo "XX"
+			loginfo "Done."
+			
+			loginfo "Computing statistics on nucleotide usage..."
+				awk '! /^>/ { \
+					    seq=toupper($0); \
+						gsub(" ","",seq); \
+					    lseq=length(seq); \
+						for (i=0; i < lseq; i++) { \
+							freq[substr(seq,i,1)]++}\
+							} \
+					 END { \
+					 	other=0; \
+					 	for (i in freq) { \
+					 		if (i!="A" && i!="C" && i!="G" && i!="T") {\
+					 			other+=freq[i] \
+					 			} \
+					 		}; \
+					 		print "SQ   Sequence "\
+					 		      (freq["A"]+freq["C"]+freq["G"]+freq["T"]+other) \
+					 		      " BP; "\
+					 		      freq["A"]" A; "\
+					 		      freq["C"]" C; "\
+					 		      freq["G"]" G; "\
+					 		      freq["T"]" T; "\
+					 		      other" other;" \
+					 }' "${RESULTS}.norm.fasta"
+			loginfo "Done."
+			
+			loginfo "Reformating sequences..."
+				lines=$(wc -l "${RESULTS}.norm.fasta" | awk '{print $1}')
+				awk -v lines=$lines ' \
+					! /^>/ { \
+							seq=tolower($0); \
+							gsub(" ","",seq); \
+							printf("     ") ;\
+							for (i=0; i < 6; i++) { \
+								f=substr(seq,i * 10, 10); \
+								pos+=length(f); \
+								f = f  substr("          ",1,10-length(f)); \
+								printf("%s ",f) \
+							}; \
+							if (NR==lines) \
+							  {pos-=1}; \
+							printf("   %6d\n",pos) \
+					   }' "${RESULTS}.norm.fasta"
+			loginfo "Done."
+			
+			loginfo "Closing sequence part..."
+				echo "//"
+			loginfo "Done."
+		fi
+		
+		IFS=$'\f'
+	done # End of the loop over the sequences
 popTmpDir