From cf5a5d1ce5c12a5b05f8b254d5af840fc6957443 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 5 Oct 2016 15:11:26 +0200 Subject: [PATCH] Add management of partial sequences Former-commit-id: abd6112ac558592616c197f2fe761880a847b031 Former-commit-id: d2ea2e7e306512bf8ef92d3497a0302919b81010 --- org-annotate.sh | 345 +++++++++++++++++++++++++++++------------------- 1 file changed, 208 insertions(+), 137 deletions(-) diff --git a/org-annotate.sh b/org-annotate.sh index ef2f1ea..70bc1f1 100755 --- a/org-annotate.sh +++ b/org-annotate.sh @@ -24,6 +24,7 @@ normalization="yes" irdetection="yes" organism="no" types="chloro" +partial=0 function usage { echo "Usage:" ; @@ -52,12 +53,21 @@ function usage { echo echo ' -m | --mitochondrion' echo ' Selects for the annotation of an animal mitochondrion genome' + echo + echo ' -p | --partial' + echo ' Indicates that the genome sequence is partial and therefore in several contigs' exit $2 } +function fastaIterator() { + awk '/^>/ {if (seq) printf("%s\f",seq); seq=""} \ + {if (seq) seq=seq"\n"; seq=seq $1} \ + END {print seq}' "$1" +} + # options may be followed by one colon to indicate they have a required argument -if ! options=$(getopt -o t:o:icrmh -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,help -- "$@") +if ! options=$(getopt -o t:o:icrmhp -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,partial,help -- "$@") then # something went wrong, getopt will put out an error message for us usage $0 1 @@ -74,6 +84,7 @@ do -c|--chloroplast) types="chloro" ;; -r|--nuclear-rdna) types="nucrdna" ;; -m|--mitochondrion) types="mito" ;; + -p|--partial) partial="1" ;; -h|--help) usage $0 0;; (--) shift; break;; (-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;; @@ -101,151 +112,211 @@ pushTmpDir ORG.organnot rm -f ${LOG} openLogFile ${LOG} - - case "$types" in - chloro) - loginfo "Annotating a plant chloroplast genome..." + IFS=$'\f' - if [ "$irdetection"=="yes" ]; then + for sequence in $(fastaIterator "${QUERY}") ; do + unset IFS + if [[ ! -z "${sequence}" ]] ; then + echo "${sequence}" > toannotate.fasta + + seqid=$(awk '(NR==1) {print substr($1,2,1000)}' toannotate.fasta) + + case "$types" in + chloro) + loginfo "Annotating a plant chloroplast genome..." + + if [[ "$irdetection"=="yes" ]] && (( partial == 0 )) ; then + + loginfo "Normalizing the structure of the Chloroplast sequence..." + loginfo " LSC + IRB + SSC + IRA" + ${PROG_DIR}/detectors/normalize/bin/go_normalize.sh toannotate.fasta > "${RESULTS}.norm.fasta" + loginfo "Done." + + loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..." + ${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot" + loginfo "Done." + + else + cat toannotate.fasta > "${RESULTS}.norm.fasta" + rm -f "${RESULTS}.annot" + touch "${RESULTS}.annot" + fi + + loginfo "Annotating the tRNA..." + ${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" + loginfo "Done." + + loginfo "Annotating the rRNA genes..." + ${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" + loginfo "Done." + + loginfo "Annotating the CDS..." + tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" + loginfo "Done." + + if (( partial == 0 )) ; then + topology="circular" + defline="plastid, complete genome" + else + topology="linear" + defline="plastid, partial sequence" + fi + ;; + + nucrdna) + loginfo "Annotating a plant rDNA cistron..." + + loginfo "Normalizing the structure of the cistron sequence..." + ${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh toannotate.fasta > "${RESULTS}.norm.fasta" + loginfo "Done." + + loginfo "Annotating the rRNA genes..." + ${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot" + loginfo "Done." - loginfo "Normalizing the structure of the Chloroplast sequence..." - loginfo " LSC + IRB + SSC + IRA" - ${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta" - loginfo "Done." - - loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..." - ${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot" - loginfo "Done." - + topology="linear" + defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene" + ;; + + mito) + loginfo "Annotating an animal mitochondrial genome..." + logerror "Not yet implemented" + + if (( partial == 0 )) ; then + topology="circular" + defline="mitochondrion, complete genome" + else + topology="linear" + defline="mitochondrion, partial sequence" + fi + + exit 1 + ;; + + *) + usage $0 1;; + esac + + if [[ "${organism}" == "no" ]]; then + organism="{organism}" + else + organism="$(echo ${organism} | tr '_' ' ')" fi - loginfo "Annotating the tRNA..." - ${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" - loginfo "Done." + sl=$(seqlength "${RESULTS}.norm.fasta") - loginfo "Annotating the rRNA genes..." - ${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" + loginfo "Printing minimal header..." + echo "ID ${seqid}; ${seqid}; ${topology}; genomic DNA; XXX; XXX; ${sl} BP." + echo "XX" + echo "AC ${seqid};" + echo "DE ${organism} ${defline}." + echo "XX" loginfo "Done." - loginfo "Annotating the CDS..." - tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" - loginfo "Done." - - topology="circular" - defline="plastid, complete genome" - ;; - nucrdna) - loginfo "Annotating a plant rDNA cistron..." - - loginfo "Normalizing the structure of the cistron sequence..." - ${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh ${QUERY} > "${RESULTS}.norm.fasta" - loginfo "Done." - - loginfo "Annotating the rRNA genes..." - ${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot" + loginfo "Printing annotations header..." + echo "FH Key Location/Qualifiers" loginfo "Done." - topology="linear" - defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene" - ;; - mito) - loginfo "Annotating an animal mitochondrial genome..." - logerror "Not yet implemented" - - topology="circular" - defline="mitochondrion, complete genome" - - exit 1 - ;; - *) - usage $0 1;; - esac - - if [[ "${organism}" == "no" ]]; then - organism="{organism}" - else - organism="$(echo ${organism} | tr '_' ' ')" - fi - - loginfo "Printing minimal header..." - echo "ID XXX; XXX; ${topology}; genomic DNA; XXX; XXX; $(seqlength ${RESULTS}.norm.fasta) BP." - echo "XX" - echo "AC XXX;" - echo "DE ${organism} ${defline}." - echo "XX" - loginfo "Done." - - loginfo "Printing annotations header..." - echo "FH Key Location/Qualifiers" - loginfo "Done." - - loginfo "Ordering annotations..." - awk '/^.....(misc|repeat|rRNA|tRNA|gene)/ { \ - match($3,"[0-9][0-9]*"); \ - pos=substr($3,RSTART,RLENGTH)*1000 + 1; \ - print pos,$0; \ - next} \ - { pos++; \ - print pos,$0}' "${RESULTS}.annot" | \ - sort -nk1 |\ - awk '{ \ - match($0,"^[0-9]* ");\ - line=substr($0,RLENGTH+1);\ - print line}' - loginfo "Done." - - loginfo "Closing annotations table..." - echo "XX" - loginfo "Done." - - loginfo "Computing statistics on nucleotide usage..." - awk '! /^>/ { \ - seq=toupper($0); \ - gsub(" ","",seq); \ - lseq=length(seq); \ - for (i=0; i < lseq; i++) { \ - freq[substr(seq,i,1)]++}\ - } \ - END { \ - other=0; \ - for (i in freq) { \ - if (i!="A" && i!="C" && i!="G" && i!="T") {\ - other+=freq[i] \ - } \ - }; \ - print "SQ Sequence "\ - (freq["A"]+freq["C"]+freq["G"]+freq["T"]+other) \ - " BP; "\ - freq["A"]" A; "\ - freq["C"]" C; "\ - freq["G"]" G; "\ - freq["T"]" T; "\ - other" other;" \ - }' "${RESULTS}.norm.fasta" - loginfo "Done." - - loginfo "Reformating sequences..." - lines=$(wc -l "${RESULTS}.norm.fasta" | awk '{print $1}') - awk -v lines=$lines ' \ - ! /^>/ { \ - seq=tolower($0); \ - gsub(" ","",seq); \ - printf(" ") ;\ - for (i=0; i < 6; i++) { \ - f=substr(seq,i * 10, 10); \ - pos+=length(f); \ - f = f substr(" ",1,10-length(f)); \ - printf("%s ",f) \ - }; \ - if (NR==lines) \ - {pos-=1}; \ - printf(" %6d\n",pos) \ - }' "${RESULTS}.norm.fasta" - loginfo "Done." - - loginfo "Closing sequence part..." - echo "//" - loginfo "Done." + loginfo "Printing the source feature" + echo "FT source 1..${sl}" >> "${RESULTS}.annot" + if [[ "${organism}" != "{organism}" ]] ; then + echo "FT /organism=\"${organism}\"" >> "${RESULTS}.annot" + fi + + case "${types}" in + chloro) + echo "FT /organelle=\"plastid:chloroplast\"" >> "${RESULTS}.annot" + ;; + mito) + echo "FT /organelle=\"mitochondrion\"" >> "${RESULTS}.annot" + ;; + *) + loginfo "Nuclear sequence" + ;; + esac + + echo "FT /mol_type=\"genomic DNA\"" >> "${RESULTS}.annot" + + if [[ "${taxid}" != "no" ]] ; then + echo "FT /db_xref=\"taxon:${taxid}\"" >> "${RESULTS}.annot" + fi + + # echo "FT /country=\"Poland: Bialowieza Forest\"" >> "${RESULTS}.annot" + loginfo "Done." + + loginfo "Ordering annotations..." + awk '/^.....(misc|repeat|rRNA|tRNA|gene|source)/ { \ + match($3,"[0-9][0-9]*"); \ + pos=substr($3,RSTART,RLENGTH)*1000 + 1; \ + print pos,$0; \ + next} \ + { pos++; \ + print pos,$0}' "${RESULTS}.annot" | \ + sort -nk1 |\ + awk '{ \ + match($0,"^[0-9]* ");\ + line=substr($0,RLENGTH+1);\ + print line}' + loginfo "Done." + + + + loginfo "Closing annotations table..." + echo "XX" + loginfo "Done." + + loginfo "Computing statistics on nucleotide usage..." + awk '! /^>/ { \ + seq=toupper($0); \ + gsub(" ","",seq); \ + lseq=length(seq); \ + for (i=0; i < lseq; i++) { \ + freq[substr(seq,i,1)]++}\ + } \ + END { \ + other=0; \ + for (i in freq) { \ + if (i!="A" && i!="C" && i!="G" && i!="T") {\ + other+=freq[i] \ + } \ + }; \ + print "SQ Sequence "\ + (freq["A"]+freq["C"]+freq["G"]+freq["T"]+other) \ + " BP; "\ + freq["A"]" A; "\ + freq["C"]" C; "\ + freq["G"]" G; "\ + freq["T"]" T; "\ + other" other;" \ + }' "${RESULTS}.norm.fasta" + loginfo "Done." + + loginfo "Reformating sequences..." + lines=$(wc -l "${RESULTS}.norm.fasta" | awk '{print $1}') + awk -v lines=$lines ' \ + ! /^>/ { \ + seq=tolower($0); \ + gsub(" ","",seq); \ + printf(" ") ;\ + for (i=0; i < 6; i++) { \ + f=substr(seq,i * 10, 10); \ + pos+=length(f); \ + f = f substr(" ",1,10-length(f)); \ + printf("%s ",f) \ + }; \ + if (NR==lines) \ + {pos-=1}; \ + printf(" %6d\n",pos) \ + }' "${RESULTS}.norm.fasta" + loginfo "Done." + + loginfo "Closing sequence part..." + echo "//" + loginfo "Done." + fi + + IFS=$'\f' + done # End of the loop over the sequences popTmpDir