From bff0beaeee33b0818149d652e51e2508fd2aae84 Mon Sep 17 00:00:00 2001
From: Eric Coissac
Date: Sun, 11 Oct 2015 11:58:43 -0300
Subject: [PATCH] Add the reformating of the sequence in EMBL format

Former-commit-id: f58f9d6702775280555d8d05fa167594e33620b2
Former-commit-id: 99eb5825fe75585ac86805ffdcca65f4ee7c0070
---
Review notes (text between "---" and the diffstat is not part of the commit):

* awk string positions start at 1. The nucleotide-count loop now scans
  positions 1..length, and the sequence formatter takes its 10-base blocks
  from position 10*i+1. Starting at 0 dropped or duplicated bases.
* With exact block boundaries the running base count is right on every
  line, so the last-line "pos-=1" adjustment and the "wc -l" line count
  that fed it are removed.
* Counts are printed with %d so that a base which never occurs shows 0,
  and blocks are padded with %-10s instead of a fixed pad string.
* NOTE(review): this patch was re-wrapped from a copy whose line breaks and
  runs of spaces had been collapsed. Indentation, the position of blank
  context lines and the spacing inside the "FH ...", "SQ ..." and
  printf(" ...") literals are reproduced as seen; check them against
  organnote.sh before applying (git apply --ignore-whitespace may be
  needed). The index line still carries the original blob ids.
* NOTE(review): the formatter emits at most 60 bases per input line;
  presumably the input FASTA is wrapped at 60 columns or less - confirm.

 organnote.sh | 91 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 87 insertions(+), 4 deletions(-)

diff --git a/organnote.sh b/organnote.sh
index 8258582..14e58d0 100755
--- a/organnote.sh
+++ b/organnote.sh
@@ -36,12 +36,95 @@ pushTmpDir ORG.organnot
   rm -f ${LOG}
   openLogFile ${LOG}
 
-  ${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta"
+  loginfo "Normalizing the structure of the Chloroplast sequence..."
+  loginfo " LSC + IRB + SSC + IRA"
+  ${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta"
+  loginfo "Done."
 
-  ${PROG_DIR}/detectors/ir/bin/go_ir.sh ${QUERY} > "${RESULTS}.annot"
-  ${PROG_DIR}/detectors/trna/bin/go_trna.sh ${QUERY} >> "${RESULTS}.annot"
+  loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
+  ${PROG_DIR}/detectors/ir/bin/go_ir.sh ${QUERY} > "${RESULTS}.annot"
+  loginfo "Done."
 
-  cat "${RESULTS}.annot"
+  loginfo "Annotating the tRNA..."
+  ${PROG_DIR}/detectors/trna/bin/go_trna.sh ${QUERY} >> "${RESULTS}.annot"
+  loginfo "Done."
+
+  loginfo "Printing annotations header..."
+  echo "XX"
+  echo "FH Key Location/Qualifiers"
+  loginfo "Done."
+
+  loginfo "Ordering annotations..."
+  # Lines with a non-blank at column 6 (presumably feature keys) get a sort
+  # key: the first number found in field 3, times 1000, plus 1. Lines with a
+  # blank there (presumably their qualifiers) take the previous key plus 1,
+  # so they stay behind their feature. The key is stripped after sorting.
+  awk '/^.....[^ ]/ { \
+        match($3,"[0-9][0-9]*"); \
+        pos=substr($3,RSTART,RLENGTH)*1000 + 1; \
+        print pos,$0} \
+       /^..... / { \
+        pos++; \
+        print pos,$0}' "${RESULTS}.annot" | \
+  sort -nk1 | \
+  awk '{ \
+        match($0,"^[0-9]* ");\
+        line=substr($0,RLENGTH+1,100000);\
+        print line}'
+  loginfo "Done."
+
+  loginfo "Closing annotations table..."
+  echo "XX"
+  loginfo "Done."
+
+  loginfo "Computing statistics on nucleotide usage..."
+  # Count every residue of the non-header lines and print the "SQ" line.
+  awk '
+    !/^>/ {
+      seq = toupper($0)
+      gsub(" ", "", seq)
+      lseq = length(seq)
+      # awk strings are 1-indexed: scan positions 1..lseq
+      for (i = 1; i <= lseq; i++) {
+        freq[substr(seq, i, 1)]++
+      }
+    }
+    END {
+      other = 0
+      for (i in freq) {
+        if (i != "A" && i != "C" && i != "G" && i != "T") {
+          other += freq[i]
+        }
+      }
+      total = freq["A"] + freq["C"] + freq["G"] + freq["T"] + other
+      printf("SQ Sequence %d BP; %d A; %d C; %d G; %d T; %d other;\n",
+             total, freq["A"], freq["C"], freq["G"], freq["T"], other)
+    }' "${QUERY}"
+  loginfo "Done."
+
+  loginfo "Reformating sequences..."
+  # Print each non-header line as six space-separated 10-base blocks followed
+  # by the running count of bases emitted so far.
+  awk '
+    !/^>/ {
+      seq = tolower($0)
+      gsub(" ", "", seq)
+      printf(" ")
+      for (i = 0; i < 6; i++) {
+        # substr() is 1-indexed: block i starts at position 10*i + 1
+        f = substr(seq, i * 10 + 1, 10)
+        pos += length(f)
+        # left-justify so a short final block is padded to 10 columns
+        printf("%-10s ", f)
+      }
+      printf(" %6d\n", pos)
+    }' "${QUERY}"
+  loginfo "Done."
+
+  loginfo "Closing sequence part..."
+  echo "//"
+  loginfo "Done."
+
 popTmpDir
 
 