diff --git a/org-annotate.sh b/org-annotate.sh index be61a37..da11b73 100755 --- a/org-annotate.sh +++ b/org-annotate.sh @@ -25,6 +25,7 @@ irdetection="yes" organism="no" types="chloro" partial=0 +minlength=0 function usage { echo "Usage:" ; @@ -56,6 +57,9 @@ function usage { echo echo ' -p | --partial' echo ' Indicates that the genome sequence is partial and therefore in several contigs' + echo + echo ' -l | --min-length' + echo ' Indicates for partial mode the minimum length of contig to annotate' exit $2 } @@ -67,7 +71,7 @@ function fastaIterator() { } # options may be followed by one colon to indicate they have a required argument -if ! options=$(getopt -o t:o:icrmhp -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,partial,help -- "$@") +if ! options=$(getopt -o t:o:icrmhpl: -l ncbi-taxid:,organism,no-ir-detection,chloroplast,nuclear-rdna,mitochondrion,partial,min-length:,help -- "$@") then # something went wrong, getopt will put out an error message for us usage $0 1 @@ -85,6 +89,7 @@ do -r|--nuclear-rdna) types="nucrdna" ;; -m|--mitochondrion) types="mito" ;; -p|--partial) partial="1" ;; + -l|--min-length) minlength="$2" ; shift ;; -h|--help) usage $0 0;; (--) shift; break;; (-*) echo "$0: error - unrecognized option $1" 1>&2; exit 1;; @@ -97,6 +102,7 @@ loginfo "Annotating mode.....: $types" loginfo "IR detection mode...: $irdetection" loginfo "Organism............: $organism" loginfo "Partial mode........: $partial" +loginfo "Minimum length......: $minlength" ############################# @@ -124,209 +130,213 @@ pushTmpDir ORG.organnot unset IFS if [[ ! -z "${sequence}" ]] ; then echo "${sequence}" > toannotate.fasta + sl=$(seqlength "toannotate.fasta") - seqid=$($AwkCmd '(NR==1) {print substr($1,2,1000)}' toannotate.fasta) + if (( sl >= minlength )) ; then - case "$types" in - chloro) - loginfo "Annotating a plant chloroplast genome..." - - if [[ "$irdetection" == "yes" ]] && (( partial == 0 )) ; then + seqid=$($AwkCmd '(NR==1) {print substr($1,2,1000)}' toannotate.fasta) - loginfo "Normalizing the structure of the Chloroplast sequence..." - loginfo " LSC + IRB + SSC + IRA" - ${PROG_DIR}/detectors/normalize/bin/go_normalize.sh toannotate.fasta > "${RESULTS}.norm.fasta" + case "$types" in + chloro) + loginfo "Annotating a plant chloroplast genome..." + + if [[ "$irdetection" == "yes" ]] && (( partial == 0 )) ; then + + loginfo "Normalizing the structure of the Chloroplast sequence..." + loginfo " LSC + IRB + SSC + IRA" + ${PROG_DIR}/detectors/normalize/bin/go_normalize.sh toannotate.fasta > "${RESULTS}.norm.fasta" + loginfo "Done." + + loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..." + ${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot" + loginfo "Done." + + else + loginfo "No normalization of the structure of the Chloroplast sequence..." + cat toannotate.fasta > "${RESULTS}.norm.fasta" + rm -f "${RESULTS}.annot" + touch "${RESULTS}.annot" + fi + + loginfo "Annotating the tRNA..." + ${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" loginfo "Done." - loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..." - ${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot" + loginfo "Annotating the rRNA genes..." + ${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" + loginfo "Done." + + loginfo "Annotating the CDS..." + tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.csh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" loginfo "Done." - else - loginfo "No normalization of the structure of the Chloroplast sequence..." - cat toannotate.fasta > "${RESULTS}.norm.fasta" - rm -f "${RESULTS}.annot" - touch "${RESULTS}.annot" - fi - - loginfo "Annotating the tRNA..." - ${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" - loginfo "Done." - - loginfo "Annotating the rRNA genes..." - ${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" - loginfo "Done." - - loginfo "Annotating the CDS..." - tcsh -f ${PROG_DIR}/detectors/cds/bin/go_cds.csh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" - loginfo "Done." - - if (( partial == 0 )) ; then - topology="circular" - defline="plastid, complete genome" - else - topology="linear" - defline="plastid, partial sequence" - fi - ;; - - nucrdna) - loginfo "Annotating a plant rDNA cistron..." - - loginfo "Normalizing the structure of the cistron sequence..." - ${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh toannotate.fasta > "${RESULTS}.norm.fasta" - loginfo "Done." - - loginfo "Annotating the rRNA genes..." - ${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot" - loginfo "Done." - - topology="linear" - defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene" - ;; - - mito) - loginfo "Annotating an animal mitochondrial genome..." - logerror "Not yet implemented" - - if (( partial == 0 )) ; then - topology="circular" - defline="mitochondrion, complete genome" - else - topology="linear" - defline="mitochondrion, partial sequence" - fi - - exit 1 - ;; - - *) - usage $0 1;; - esac - - if [[ "${organism}" == "no" ]]; then - organism="{organism}" - else - organism="$(echo ${organism} | tr '_' ' ')" - fi + if (( partial == 0 )) ; then + topology="circular" + defline="plastid, complete genome" + else + topology="linear" + defline="plastid, partial sequence" + fi + ;; + + nucrdna) + loginfo "Annotating a plant rDNA cistron..." + + loginfo "Normalizing the structure of the cistron sequence..." + ${PROG_DIR}/detectors/normalizerdna/bin/go_normalizerdna.sh toannotate.fasta > "${RESULTS}.norm.fasta" + loginfo "Done." + + loginfo "Annotating the rRNA genes..." + ${PROG_DIR}/detectors/nucrrna/bin/go_nucrrna.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot" + loginfo "Done." - sl=$(seqlength "${RESULTS}.norm.fasta") + topology="linear" + defline="18S rRNA gene, ITS1, 5.8S rRNA gene, ITS2 and 28S rRNA gene" + ;; + + mito) + loginfo "Annotating an animal mitochondrial genome..." + logerror "Not yet implemented" - loginfo "Printing minimal header..." - echo "ID ${seqid}; ${seqid}; ${topology}; genomic DNA; XXX; XXX; ${sl} BP." - echo "XX" - echo "AC ${seqid};" - echo "DE ${organism} ${defline}." - echo "XX" - loginfo "Done." - - loginfo "Printing annotations header..." - echo "FH Key Location/Qualifiers" - loginfo "Done." - - loginfo "Printing the source feature" - echo "FT source 1..${sl}" - - if [[ "${organism}" != "{organism}" ]] ; then - echo "FT /organism=\"${organism}\"" - fi - - case "${types}" in - chloro) - echo "FT /organelle=\"plastid:chloroplast\"" - ;; - mito) - echo "FT /organelle=\"mitochondrion\"" - ;; + if (( partial == 0 )) ; then + topology="circular" + defline="mitochondrion, complete genome" + else + topology="linear" + defline="mitochondrion, partial sequence" + fi + + exit 1 + ;; + *) - loginfo "Nuclear sequence" - ;; + usage $0 1;; esac - - echo "FT /mol_type=\"genomic DNA\"" - - if [[ "${taxid}" != "no" ]] ; then - echo "FT /db_xref=\"taxon:${taxid}\"" + + if [[ "${organism}" == "no" ]]; then + organism="{organism}" + else + organism="$(echo ${organism} | tr '_' ' ')" fi - # echo "FT /country=\"Poland: Bialowieza Forest\"" - loginfo "Done." - - loginfo "Ordering annotations..." - $AwkCmd '(entry && /^.....(misc|repeat|rRNA|tRNA|gene|source)/) { \ - print pos,entry } \ - /^.....(misc|repeat|rRNA|tRNA|gene|source)/ { \ - match($3,"[0-9][0-9]*"); \ - pos=substr($3,RSTART,RLENGTH)*1000 + 1; \ - entry=$0; \ - next} \ - { entry=entry "@" $0} \ - END {print pos,entry}' "${RESULTS}.annot" | \ - sort -nk1 |\ - $AwkCmd '{ \ - match($0,"^[0-9]* ");\ - line=substr($0,RLENGTH+1);\ - gsub("@","\n",line); \ - print line}' - loginfo "Done." - - - - loginfo "Closing annotations table..." - echo "XX" - loginfo "Done." - - loginfo "Computing statistics on nucleotide usage..." - $AwkCmd '! /^>/ { \ - seq=toupper($0); \ - gsub(" ","",seq); \ - lseq=length(seq); \ - for (i=0; i < lseq; i++) { \ - freq[substr(seq,i,1)]++}\ - } \ - END { \ - other=0; \ - for (i in freq) { \ - if (i!="A" && i!="C" && i!="G" && i!="T") {\ - other+=freq[i] \ - } \ - }; \ - print "SQ Sequence "\ - (freq["A"]+freq["C"]+freq["G"]+freq["T"]+other) \ - " BP; "\ - freq["A"]" A; "\ - freq["C"]" C; "\ - freq["G"]" G; "\ - freq["T"]" T; "\ - other" other;" \ - }' "${RESULTS}.norm.fasta" - loginfo "Done." - - loginfo "Reformating sequences..." - lines=$(wc -l "${RESULTS}.norm.fasta" | $AwkCmd '{print $1}') + sl=$(seqlength "${RESULTS}.norm.fasta") - loginfo "Sequence length $(seqlength ${RESULTS}.norm.fasta)" - loginfo "lines $lines" - formatfasta "${RESULTS}.norm.fasta" | \ - $AwkCmd -v lines=$lines ' \ - ! /^>/ { \ - seq=tolower($0); \ - gsub(" ","",seq); \ - printf(" ") ;\ - for (i=0; i < 6; i++) { \ - f=substr(seq,i * 10 + 1, 10); \ - pos+=length(f); \ - f = f substr(" ",1,10-length(f)); \ - printf("%s ",f) \ - }; \ - printf(" %6d\n",pos) \ - }' - loginfo "Done." + loginfo "Printing minimal header..." + echo "ID ${seqid}; ${seqid}; ${topology}; genomic DNA; XXX; XXX; ${sl} BP." + echo "XX" + echo "AC ${seqid};" + echo "DE ${organism} ${defline}." + echo "XX" + loginfo "Done." - loginfo "Closing sequence part..." - echo "//" - loginfo "Done." - fi + loginfo "Printing annotations header..." + echo "FH Key Location/Qualifiers" + loginfo "Done." + + loginfo "Printing the source feature" + echo "FT source 1..${sl}" + + if [[ "${organism}" != "{organism}" ]] ; then + echo "FT /organism=\"${organism}\"" + fi + + case "${types}" in + chloro) + echo "FT /organelle=\"plastid:chloroplast\"" + ;; + mito) + echo "FT /organelle=\"mitochondrion\"" + ;; + *) + loginfo "Nuclear sequence" + ;; + esac + + echo "FT /mol_type=\"genomic DNA\"" + + if [[ "${taxid}" != "no" ]] ; then + echo "FT /db_xref=\"taxon:${taxid}\"" + fi + + # echo "FT /country=\"Poland: Bialowieza Forest\"" + loginfo "Done." + + loginfo "Ordering annotations..." + $AwkCmd '(entry && /^.....(misc|repeat|rRNA|tRNA|gene|source)/) { \ + print pos,entry } \ + /^.....(misc|repeat|rRNA|tRNA|gene|source)/ { \ + match($3,"[0-9][0-9]*"); \ + pos=substr($3,RSTART,RLENGTH)*1000 + 1; \ + entry=$0; \ + next} \ + { entry=entry "@" $0} \ + END {print pos,entry}' "${RESULTS}.annot" | \ + sort -nk1 |\ + $AwkCmd '{ \ + match($0,"^[0-9]* ");\ + line=substr($0,RLENGTH+1);\ + gsub("@","\n",line); \ + print line}' + loginfo "Done." + + + + loginfo "Closing annotations table..." + echo "XX" + loginfo "Done." + + loginfo "Computing statistics on nucleotide usage..." + $AwkCmd '! /^>/ { \ + seq=toupper($0); \ + gsub(" ","",seq); \ + lseq=length(seq); \ + for (i=0; i < lseq; i++) { \ + freq[substr(seq,i,1)]++}\ + } \ + END { \ + other=0; \ + for (i in freq) { \ + if (i!="A" && i!="C" && i!="G" && i!="T") {\ + other+=freq[i] \ + } \ + }; \ + print "SQ Sequence "\ + (freq["A"]+freq["C"]+freq["G"]+freq["T"]+other) \ + " BP; "\ + freq["A"]" A; "\ + freq["C"]" C; "\ + freq["G"]" G; "\ + freq["T"]" T; "\ + other" other;" \ + }' "${RESULTS}.norm.fasta" + loginfo "Done." + + loginfo "Reformating sequences..." + lines=$(wc -l "${RESULTS}.norm.fasta" | $AwkCmd '{print $1}') + + loginfo "Sequence length $(seqlength ${RESULTS}.norm.fasta)" + loginfo "lines $lines" + formatfasta "${RESULTS}.norm.fasta" | \ + $AwkCmd -v lines=$lines ' \ + ! /^>/ { \ + seq=tolower($0); \ + gsub(" ","",seq); \ + printf(" ") ;\ + for (i=0; i < 6; i++) { \ + f=substr(seq,i * 10 + 1, 10); \ + pos+=length(f); \ + f = f substr(" ",1,10-length(f)); \ + printf("%s ",f) \ + }; \ + printf(" %6d\n",pos) \ + }' + loginfo "Done." + + loginfo "Closing sequence part..." + echo "//" + loginfo "Done." + fi # End of the minimum length condition + fi # End of not empty sequence condition IFS=$'\f' done # End of the loop over the sequences