From 775d1a7157f0c3c880fd51ba4e191ac390fd2810 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 4 Nov 2021 21:55:59 +0100 Subject: [PATCH] Adds changes to conform with EMBL template files Former-commit-id: 93da2c4c6fe0ca46de5adb439341e970ba5abd55 Former-commit-id: 0ba15a4167e769b8e13507bd399892eb6098b4c8 --- detectors/nucrrna/bin/go_nucrrna.sh | 37 ++-- org-annotate.sh | 257 ++++++++++++++++++++++++---- 2 files changed, 240 insertions(+), 54 deletions(-) diff --git a/detectors/nucrrna/bin/go_nucrrna.sh b/detectors/nucrrna/bin/go_nucrrna.sh index 4665d4f..4547633 100755 --- a/detectors/nucrrna/bin/go_nucrrna.sh +++ b/detectors/nucrrna/bin/go_nucrrna.sh @@ -50,7 +50,7 @@ pushTmpDir ORG.its if [[ ${#TSU[@]}=="2" ]] ; then echo "FT rRNA ${TSU[0]}..${TSU[1]}" echo 'FT /gene="5.8S rRNA"' - echo 'FT /product="5.8S ribosomal nuclear RNA"' + echo 'FT /product="5.8S ribosomal RNA"' fi if [[ ${#ITS2[@]}=="2" ]] ; then @@ -61,23 +61,26 @@ pushTmpDir ORG.its hmmsearch --max ${RRNADB} ${QUERY} | \ - $AwkCmd '/Query: / { \ - profil=$2; \ - match($3,"[0-9][0-9]*");\ - lprof=substr($3,RSTART,RLENGTH)} \ - / [0-9][0-9]* ! / { \ + $AwkCmd '/Query: / { + profil=$2 + match($3,"[0-9][0-9]*"); + lprof=substr($3,RSTART,RLENGTH)} + / [0-9][0-9]* ! / { print profil,lprof,$7,$8,$10,$11}' | \ - $AwkCmd '($3 <=5) && (($2-$4) <=5) { \ - full=1;$5=$5-$3+1;$6=$6+($2-$4)} \ - {loc=$5".."$6} \ - ($1 ~ /_RC$/) { \ - loc="complement("loc")"} \ - (full==1) {match($1,"_..*S");\ - rrna=substr($1,RSTART+1,RLENGTH-1);\ - print "FT rRNA " loc; \ - print "FT /gene=\"rrn"rrna"\"" - print "FT /product=\""rrna" ribosomal RNA\"";\ - full=0 + $AwkCmd '($3 <=5) && (($2-$4) <=5) { + full=1;$5=$5-$3+1;$6=$6+($2-$4) + } + { loc=$5".."$6 } + ($1 ~ /_RC$/) { + loc="complement("loc")" + } + (full==1) { + match($1,"_..*S") + rrna=substr($1,RSTART+1,RLENGTH-1) + print "FT rRNA " loc + print "FT /gene=\""rrna" rRNA\"" + print "FT /product=\""rrna" ribosomal RNA\"" + full=0 }' loginfo "Done." diff --git a/org-annotate.sh b/org-annotate.sh index 1ac656e..ffdcebe 100755 --- a/org-annotate.sh +++ b/org-annotate.sh @@ -4,8 +4,23 @@ # # Annotate Organelle # +# The org-annotate pipeline aims to annotate fasta files produced by assembling +# genome skimming. It has been developped in the context of the PhyloAlps +# (http://phyloalps.org) and of the PhyloNorway (http://phylonorway.no) projects. +# +# Today it is able to produce EMBL flat files suitable for submission to ENA/EBI +# It provides annotation procedure for : +# +# - Plant chloroplast genomes. +# - Nuclear rDNA Region. +# +# #======================================================================================== # +# The template used for generating the EMBL files follows the recommendation presented +# at ENA documentation website (at the date of 2021-11-04). +# +# https://ena-docs.readthedocs.io/en/latest/submit/fileprep/sequence-flatfile.html # #======================================================================================== @@ -27,7 +42,12 @@ cdsdetection_pass1="yes" cdsdetection_pass2="yes" trnadetection="yes" rrnadetection="yes" +idprefix="no" organism="no" +country="no" +specimen="no" +project="no" +resetorganism="yes" types="chloro" partial=0 minlength=0 @@ -40,16 +60,8 @@ function usage { echo ' [-c|--chloroplast|-r|--nuclear-rdna|-m|--mitochondrion] ' echo echo "Options:" - echo ' -t ### | --ncbi-taxid ###' - echo ' ### represents the ncbi taxid associated to the sequence' - echo - echo ' -i | --no-ir-detection' - echo ' Does not look for inverted repeats in the plastid genome' - echo - echo ' -o | --organism ' - echo ' Allows for specifiying the organism name in the embl generated file' - echo ' Spaces have to be substituted by underscore ex : Abies_alba' - echo + echo + echo ' Defining the sequence category' echo ' -c | --chloroplast' echo ' Selects for the annotation of a chloroplast genome' echo ' This is the default mode' @@ -57,15 +69,53 @@ function usage { echo ' -r | --nuclear-rdna' echo ' Selects for the annotation of the rDNA nuclear cistron' echo - echo ' -m | --mitochondrion' - echo ' Selects for the annotation of an animal mitochondrion genome' +# echo ' -m | --mitochondrion' +# echo ' Selects for the annotation of an animal mitochondrion genome' echo + echo ' Providing information about the sequence' + echo ' -s | --specimen ###' + echo ' Represents the specimen voucher identifier ' + echo ' (e.g. for herbarium sample TROM_V_991090' + echo ' will be added to the /specimen_voucher qualifier' + echo + echo ' -t | --ncbi-taxid ###' + echo ' Represents the ncbi taxid associated to the sequence' + echo + echo ' -o | --organism ' + echo ' Allows for specifiying the organism name in the embl generated file' + echo ' Spaces have to be substituted by underscore ex : Abies_alba' + echo + echo ' -b | --country "[:][, ]"' + echo ' Location (at least country) of collection' + echo + echo ' -f | --not-force-ncbi' + echo ' Do not force the name of the organism to match the NCBI scientific name.' + echo ' if the provided NCB taxid is publically defined' + echo + echo ' Information related to an ENA project' + echo ' -P | --project ' + echo ' The accession number of the related project in ENA' + echo + echo ' -i | --id-prefix ' + echo ' prefix used to build the sequence identifier' + echo ' the number of the contig is append to the prefix' + echo ' to build the complete id. This id is used only if' + echo ' an ENA project is specified.' + echo + echo ' Annotation of partial sequences' echo ' -p | --partial' echo ' Indicates that the genome sequence is partial and therefore in several contigs' echo echo ' -l | --min-length' echo ' Indicates for partial mode the minimum length of contig to annotate' echo + echo ' Setting up the sequence annotation' + echo ' -N | --no-normalization' + echo ' Does not normalize the sequence befire annotation' + echo + echo ' -I | --no-ir-detection' + echo ' Does not look for inverted repeats in the plastid genome' + echo echo ' -C | --no-cds' echo ' Do not annotate CDS' echo @@ -84,6 +134,59 @@ function usage { } +function ncbiscientificname { + local efetch='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi' + local params='db=taxonomy&id='$1'&mode=text&report=xml' + local url="${efetch}?${params}" + curl $url \ + | grep '' \ + | sed 's///' \ + | sed 's@@@' \ + | sed 's@^ *@@' \ + | sed 's@ *$@@' \ + | head -1 +} + +function split80 { + local text=$1 + local pattern + local header + + if (( $# >= 2 )) ; then + header=$2 + else + header="" + fi + + if (( $# >= 3 )) ; then + pattern=$3 + else + pattern=" " + fi + + echo $text \ + | $AwkCmd -v pattern="$pattern" -v header="$header" ' + BEGIN { line = header } + { + n = split($0,parts,pattern) + j = 1 + for (i = 1; i <= n; i++) { + if (length(line) + length(parts[i]) > 79) { + print line + line = header + j = i + } + if (i > j) line = line pattern + line = line parts[i] + } + $0 = line + } + END { + print $0 + } + ' +} + function fastaIterator() { $AwkCmd '/^>/ {if (seq) printf("%s\f",seq); seq=""} \ {if (seq) seq=seq"\n"; seq=seq $1} \ @@ -91,7 +194,7 @@ function fastaIterator() { } # options may be followed by one colon to indicate they have a required argument -if ! options=$(getopt -o t:o:icrmhpl:CDETR -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,partial,min-length:,help,no-cds,no-cds-pass1,no-cds-pass2,no-trna,no-rrna -- "$@") +if ! options=$(getopt -o s:t:o:b:P:i:fcrmhpl:NICDETR -l specimen:,ncbi-taxid:,organism:,country:,project:,id-prefix:,not-force-ncbi,chloroplast,nuclear-rdna,mitochondrion,partial,min-length:,help,no-normalization,no-ir-detection,no-cds,no-cds-pass1,no-cds-pass2,no-trna,no-rrna -- "$@") then # something went wrong, getopt will put out an error message for us usage $0 1 @@ -102,15 +205,21 @@ eval set -- "$options" while [ $# -gt 0 ] do case $1 in - -t|--ncbi-taxid) taxid="$2" ; shift ;; - -i|--no-ir-detection) irdetection="no" ;; - -o|--organism) organism="$2" ; shift ;; -c|--chloroplast) types="chloro" ;; -r|--nuclear-rdna) types="nucrdna" ;; - -m|--mitochondrion) types="mito" ;; +# -m|--mitochondrion) types="mito" ;; + -t|--ncbi-taxid) taxid="$2" ; shift ;; + -s|--specimen) specimen="$2" ; shift ;; + -b|--country) country="$2" ; shift ;; + -o|--organism) organism="$2" ; shift ;; + -P|--project) project="$2" ; shift ;; + -i|--id-prefix) idprefix="$2" ; shift ;; + -f|--not-force-ncbi) resetorganism="no" ;; -p|--partial) partial="1" ;; -l|--min-length) minlength="$2" ; shift ;; -h|--help) usage $0 0;; + -N|--no-normalization) normalization="no" ;; + -I|--no-ir-detection) irdetection="no" ;; -C|--no-cds) cdsdetection="no";; -D|--no-cds-pass1) cdsdetection_pass1="no";; -E|--no-cds-pass2) cdsdetection_pass2="no";; @@ -123,16 +232,33 @@ do shift done -loginfo "Annotating mode.....: $types" -loginfo "IR detection mode...: $irdetection" -loginfo "CDS detection mode..: $cdsdetection" -loginfo " pass 1...: $cdsdetection_pass1" -loginfo " pass 2...: $cdsdetection_pass2" -loginfo "tRNA detection mode.: $trnadetection" -loginfo "rRNA detection mode.: $rrnadetection" -loginfo "Organism............: $organism" -loginfo "Partial mode........: $partial" -loginfo "Minimum length......: $minlength" +loginfo "NCBI taxid provided......: $taxid" + +if [[ "$taxid" != "no" ]] ; then + scientificname=$(ncbiscientificname $taxid) + loginfo "NCBI scientific name.....: $scientificname" + if [[ -z "$scientificname" ]] ; then + loginfo " Unknown taxid." + else + loginfo "Organism name from taxid.: $resetorganism" + if [[ "$resetorganism" == "yes" ]] ; then + organism=$(echo $scientificname | tr ' ' '_') + fi + fi +fi + +loginfo "Annotating mode..........: $types" +loginfo "Sequence normalization...: $normalization" +loginfo "IR detection mode........: $irdetection" +loginfo "CDS detection mode.......: $cdsdetection" +loginfo " pass 1........: $cdsdetection_pass1" +loginfo " pass 2........: $cdsdetection_pass2" +loginfo "tRNA detection mode......: $trnadetection" +loginfo "rRNA detection mode......: $rrnadetection" +loginfo "Organism.................: $organism" +loginfo "Country..................: $country" +loginfo "Partial mode.............: $partial" +loginfo "Minimum length...........: $minlength" ############################# @@ -155,8 +281,11 @@ pushTmpDir ORG.organnot openLogFile ${LOG} IFS=$'\f' + + sequence_number=0 for sequence in $(fastaIterator "${QUERY}") ; do + let sequence_number=sequence_number+1 unset IFS if [[ ! -z "${sequence}" ]] ; then echo "${sequence}" > toannotate.fasta @@ -172,10 +301,15 @@ pushTmpDir ORG.organnot if [[ "$irdetection" == "yes" ]] && (( partial == 0 )) ; then - loginfo "Normalizing the structure of the Chloroplast sequence..." + if [[ "$normalization" == "yes" ]] ; then + loginfo "Normalizing the structure of the Chloroplast sequence..." loginfo " LSC + IRB + SSC + IRA" ${PROG_DIR}/detectors/normalize/bin/go_normalize.sh toannotate.fasta > "${RESULTS}.norm.fasta" - loginfo "Done." + loginfo "Done." + else + loginfo "No normalization the structure of the Chloroplast sequence..." + cat toannotate.fasta > "${RESULTS}.norm.fasta" + fi loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..." ${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot" @@ -220,16 +354,40 @@ pushTmpDir ORG.organnot nucrdna) loginfo "Annotating a plant rDNA cistron..." - loginfo "Normalizing the structure of the cistron sequence..." + if [[ "$normalization" == "yes" ]] ; then + loginfo "Normalizing the structure of the cistron sequence..." ${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh toannotate.fasta > "${RESULTS}.norm.fasta" - loginfo "Done." + loginfo "Done." + else + loginfo "No normalization of the structure of the cistron sequence..." + cat toannotate.fasta > "${RESULTS}.norm.fasta" + fi loginfo "Annotating the rRNA genes..." ${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot" loginfo "Done." topology="linear" - defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene" + defline=$(cat "${RESULTS}.annot" \ + | grep "/gene=" \ + | sed -E 's@^FT */gene="([^"]*)".*$@\1@' \ + | sort -u \ + | awk '{printf("%s;",$0)}' \ + | awk 'BEGIN {i=1} + /18S rRNA;/ {gene[i]="18S rRNA"; i++} + /ITS1;/ {gene[i]="ITS1"; i++} + /5.8S rRNA;/ {gene[i]="5.8S rRNA"; i++} + /ITS2;/ {gene[i]="ITS2"; i++} + /28S rRNA;/ {gene[i]="28S rRNA"; ii++} + END { + for (i=1;i <= length(gene); i++) { + separator ="" + if (i < (length(gene)-1)) separator=", " + if (i == (length(gene)-1)) separator=" and " + printf("%s gene%s",gene[i],separator) + } + }') + # defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene" ;; mito) @@ -256,14 +414,31 @@ pushTmpDir ORG.organnot else organism="$(echo ${organism} | tr '_' ' ')" fi + + if [[ "$specimen" != "no" ]] ; then + defline="${defline}, voucher ${specimen}" + fi sl=$(seqlength "${RESULTS}.norm.fasta") loginfo "Printing minimal header..." - echo "ID ${seqid}; ${seqid}; ${topology}; genomic DNA; XXX; XXX; ${sl} BP." + echo "ID XXX; XXX; ${topology}; genomic DNA; XXX; XXX; ${sl} BP." echo "XX" - echo "AC ${seqid};" - echo "DE ${organism} ${defline}." + echo "AC XXX;" + echo "XX" + if [[ "${project}" != "no" ]] ; then + sequence_number + if [[ "$idprefix" != "no" ]] ; then + seqid="${idprefix}${sequence_number}" + fi + echo "AC * _${seqid};" + echo "XX" + echo "PR Project:${project};" + echo "XX" + fi + + split80 "${organism} ${defline}." "DE " + echo "DE " echo "XX" loginfo "Done." @@ -274,7 +449,7 @@ pushTmpDir ORG.organnot loginfo "Printing the source feature" echo "FT source 1..${sl}" - if [[ "${organism}" != "{organism}" ]] ; then + if [[ "${organism}" != "no" ]] ; then echo "FT /organism=\"${organism}\"" fi @@ -292,11 +467,18 @@ pushTmpDir ORG.organnot echo "FT /mol_type=\"genomic DNA\"" + if [[ "${specimen}" != "no" ]] ; then + echo "FT /specimen_voucher=\"${specimen}\"" + fi + if [[ "${taxid}" != "no" ]] ; then echo "FT /db_xref=\"taxon:${taxid}\"" fi - # echo "FT /country=\"Poland: Bialowieza Forest\"" + if [[ "${country}" != "no" ]] ; then + echo "FT /country=\"${country}\"" + fi + loginfo "Done." loginfo "Ordering annotations..." @@ -379,4 +561,5 @@ pushTmpDir ORG.organnot IFS=$'\f' done # End of the loop over the sequences popTmpDir +loginfo "Annotation done."