diff --git a/detectors/normalize/tools/buildSCDB.sh b/detectors/normalize/tools/buildSCDB.sh index 3cfae96..3014eee 100755 --- a/detectors/normalize/tools/buildSCDB.sh +++ b/detectors/normalize/tools/buildSCDB.sh @@ -33,7 +33,7 @@ pushTmpDir ORG.buildSCDB loginfo "Building LSC coorientation graph..." ${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${ORG_LOGFILE} > LSC.tgf ${PROG_DIR}/cc.py LSC.tgf > LSC.cc - loginfo " --> $(awk '{print $1}' LSC.cc | uniq | wc -l) connected componants" + loginfo " --> $($AwkCmd '{print $1}' LSC.cc | uniq | wc -l) connected componants" loginfo "Done" @@ -46,7 +46,7 @@ pushTmpDir ORG.buildSCDB loginfo "Extracting main connected components for LCS..." rm -f LSC.direct.fasta touch LSC.direct.fasta - for id in `awk '($1==0) {print $2}' LSC.cc`; do + for id in `$AwkCmd '($1==0) {print $2}' LSC.cc`; do fastafetch -f LSC.fasta -i LSC.index -q "${id}" >> LSC.direct.fasta done loginfo " --> $(fastaCount LSC.direct.fasta) sequences" @@ -57,7 +57,7 @@ pushTmpDir ORG.buildSCDB loginfo "Extracting second connected components for LCS..." rm -f LSC.reverse.fasta touch LSC.reverse.fasta - for id in `awk '($1==1) {print $2}' LSC.cc`; do + for id in `$AwkCmd '($1==1) {print $2}' LSC.cc`; do fastafetch -f LSC.fasta -i LSC.index -q "${id}" >> LSC.reverse.fasta done loginfo " --> $(fastaCount LSC.reverse.fasta) sequences" @@ -75,7 +75,7 @@ pushTmpDir ORG.buildSCDB loginfo "Checking LCS homogeneity..." ${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${ORG_LOGFILE} > LSC_RefDB.tgf ${PROG_DIR}/cc.py LSC_RefDB.tgf > LSC_RefDB.cc - NCC=$(awk '{print $1}' LSC_RefDB.cc | uniq | wc -l) + NCC=$($AwkCmd '{print $1}' LSC_RefDB.cc | uniq | wc -l) if (( $NCC == 1 )); then loginfo " --> $NCC connected componants" else @@ -105,7 +105,7 @@ pushTmpDir ORG.buildSCDB loginfo "Building SSC coorientation graph..." ${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${ORG_LOGFILE} > SSC.tgf ${PROG_DIR}/cc.py SSC.tgf > SSC.cc - loginfo " --> $(awk '{print $1}' SSC.cc | uniq | wc -l) connected componants" + loginfo " --> $($AwkCmd '{print $1}' SSC.cc | uniq | wc -l) connected componants" loginfo "Done" @@ -119,7 +119,7 @@ pushTmpDir ORG.buildSCDB loginfo "Extracting main connected components for SSC..." rm -f SSC.direct.fasta touch SSC.direct.fasta - for id in `awk '($1==0) {print $2}' SSC.cc`; do + for id in `$AwkCmd '($1==0) {print $2}' SSC.cc`; do fastafetch -f SSC.fasta -i SSC.index -q "${id}" >> SSC.direct.fasta done loginfo " --> $(fastaCount SSC.direct.fasta) sequences" @@ -130,7 +130,7 @@ pushTmpDir ORG.buildSCDB loginfo "Extracting second connected components for SSC..." rm -f SSC.reverse.fasta touch SSC.reverse.fasta - for id in `awk '($1==1) {print $2}' SSC.cc`; do + for id in `$AwkCmd '($1==1) {print $2}' SSC.cc`; do fastafetch -f SSC.fasta -i SSC.index -q "${id}" >> SSC.reverse.fasta done loginfo " --> $(fastaCount SSC.reverse.fasta) sequences" @@ -148,7 +148,7 @@ pushTmpDir ORG.buildSCDB loginfo "Checking SSC homogeneity..." ${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${ORG_LOGFILE} > SSC_RefDB.tgf ${PROG_DIR}/cc.py SSC_RefDB.tgf > SSC_RefDB.cc - NCC=$(awk '{print $1}' SSC_RefDB.cc | uniq | wc -l) + NCC=$($AwkCmd '{print $1}' SSC_RefDB.cc | uniq | wc -l) if (( $NCC == 1 )); then loginfo " --> $NCC connected componants" else diff --git a/detectors/normalize/tools/coorienteSC.sh b/detectors/normalize/tools/coorienteSC.sh index ed6cc50..af4f38a 100755 --- a/detectors/normalize/tools/coorienteSC.sh +++ b/detectors/normalize/tools/coorienteSC.sh @@ -43,7 +43,7 @@ pushTmpDir ORG.coorienteSC loginfo "Running Blast..." blastn -db "${BLASTDB}" -query "${DATA}" -outfmt 6 | \ - awk ' \ + $AwkCmd ' \ ($4 > 1000) && ($3 > 70) \ ($1==QUERY) && \ ($2==SUBJECT) && \ @@ -65,7 +65,7 @@ pushTmpDir ORG.coorienteSC LDIFF= ($3/100.*$4) }} \ } \ END {print QUERY,SUBJECT,LSAME,LDIFF,(LSAME>LDIFF)}' | \ - awk -v minlength="${MINLENGTH}" \ + $AwkCmd -v minlength="${MINLENGTH}" \ ' (($3>minlength) || \ ($4 > minlength)) && \ ($3/($4+1) > 2) && \ @@ -75,7 +75,7 @@ pushTmpDir ORG.coorienteSC {print $1,$2,$5}}' | \ sort | \ uniq -c | \ - awk '($1==2) {print $2,$3,$4}' + $AwkCmd '($1==2) {print $2,$3,$4}' loginfo "Done" popTmpDir diff --git a/detectors/normalize/tools/extract_refLSC.sh b/detectors/normalize/tools/extract_refLSC.sh index 55bbcab..7930655 100755 --- a/detectors/normalize/tools/extract_refLSC.sh +++ b/detectors/normalize/tools/extract_refLSC.sh @@ -2,7 +2,7 @@ # - awk 'function printfasta(seq) { \ + gawk 'function printfasta(seq) { \ seqlen=length(seq); \ for (i=1; i <= seqlen; i+=60) \ print substr(seq,i,60); \ diff --git a/detectors/normalize/tools/extract_refSSC.sh b/detectors/normalize/tools/extract_refSSC.sh index 36660c6..e21d0e6 100755 --- a/detectors/normalize/tools/extract_refSSC.sh +++ b/detectors/normalize/tools/extract_refSSC.sh @@ -2,7 +2,7 @@ # - awk 'function printfasta(seq) { \ + gawk 'function printfasta(seq) { \ seqlen=length(seq); \ for (i=1; i <= seqlen; i+=60) \ print substr(seq,i,60); \ diff --git a/detectors/normalize/tools/selectViridiplantae.sh b/detectors/normalize/tools/selectViridiplantae.sh index 25e644e..895dd06 100755 --- a/detectors/normalize/tools/selectViridiplantae.sh +++ b/detectors/normalize/tools/selectViridiplantae.sh @@ -2,7 +2,7 @@ grep -A 1 ' ORGANISM' $* | \ grep -B 1 Viridiplantae | \ - awk '{print $1}' | \ + gawk '{print $1}' | \ grep '\.gbk' | \ sed -E 's/(^.*\.gbk).$/\1/' | \ uniq \ No newline at end of file diff --git a/detectors/rrna/bin/go_rrna.sh b/detectors/rrna/bin/go_rrna.sh index 4d79957..d987cb9 100755 --- a/detectors/rrna/bin/go_rrna.sh +++ b/detectors/rrna/bin/go_rrna.sh @@ -33,13 +33,13 @@ pushTmpDir ORG.rrna RRNA=$(basename ${QUERY}) hmmsearch --max ${RRNADB} ${QUERY} | \ - awk '/Query: / { \ + $AwkCmd '/Query: / { \ profil=$2; \ match($3,"[0-9][0-9]*");\ lprof=substr($3,RSTART,RLENGTH)} \ / [0-9][0-9]* ! / { \ print profil,lprof,$7,$8,$10,$11}' | \ - awk '($3 <=5) && (($2-$4) <=5) { \ + $AwkCmd '($3 <=5) && (($2-$4) <=5) { \ full=1;$5=$5-$3+1;$6=$6+($2-$4)} {loc=$5".."$6} \ ($1 ~ /_RC$/) { \ diff --git a/detectors/rrna/tools/buildRRNAModels.sh b/detectors/rrna/tools/buildRRNAModels.sh index 8a9561f..fdb3c55 100755 --- a/detectors/rrna/tools/buildRRNAModels.sh +++ b/detectors/rrna/tools/buildRRNAModels.sh @@ -11,7 +11,7 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh" function fasta1li { - awk '/^>/ {if (sequence) \ + $AwkCmd '/^>/ {if (sequence) \ {print sequence}; \ print $0; \ sequence=""} \ @@ -28,7 +28,7 @@ function dereplicate { grep -v -- -- | \ sed -E "s/count=[0-9]+; //" | \ sed 's/cluster_weight/count/' | \ - awk ' /^>/ {SEQ++;\ + $AwkCmd ' /^>/ {SEQ++;\ match($0,"count=[0-9][0-9]*;");\ count=substr($0,RSTART,RLENGTH);\ $1=$1"_"SEQ;\ @@ -58,7 +58,7 @@ function clustering { } function revcomp { - awk 'function printfasta(seq) { \ + $AwkCmd 'function printfasta(seq) { \ seqlen=length(seq); \ for (i=1; i <= seqlen; i+=60) \ print substr(seq,i,60); \ diff --git a/detectors/rrna/tools/extract_ref16S.sh b/detectors/rrna/tools/extract_ref16S.sh index 1337eef..86b93a3 100755 --- a/detectors/rrna/tools/extract_ref16S.sh +++ b/detectors/rrna/tools/extract_ref16S.sh @@ -2,7 +2,7 @@ # - awk 'function printfasta(seq) { \ + gawk 'function printfasta(seq) { \ seqlen=length(seq); \ for (i=1; i <= seqlen; i+=60) \ print substr(seq,i,60); \ diff --git a/detectors/rrna/tools/extract_ref23S.sh b/detectors/rrna/tools/extract_ref23S.sh index 31758c7..fc210df 100755 --- a/detectors/rrna/tools/extract_ref23S.sh +++ b/detectors/rrna/tools/extract_ref23S.sh @@ -2,7 +2,7 @@ # - awk 'function printfasta(seq) { \ + gawk 'function printfasta(seq) { \ seqlen=length(seq); \ for (i=1; i <= seqlen; i+=60) \ print substr(seq,i,60); \ diff --git a/detectors/rrna/tools/extract_ref4.5S.sh b/detectors/rrna/tools/extract_ref4.5S.sh index 6f59c29..35c2ba5 100755 --- a/detectors/rrna/tools/extract_ref4.5S.sh +++ b/detectors/rrna/tools/extract_ref4.5S.sh @@ -2,7 +2,7 @@ # - awk 'function printfasta(seq) { \ + gawk 'function printfasta(seq) { \ seqlen=length(seq); \ for (i=1; i <= seqlen; i+=60) \ print substr(seq,i,60); \ diff --git a/detectors/rrna/tools/extract_ref5S.sh b/detectors/rrna/tools/extract_ref5S.sh index bc90277..322227b 100755 --- a/detectors/rrna/tools/extract_ref5S.sh +++ b/detectors/rrna/tools/extract_ref5S.sh @@ -2,7 +2,7 @@ # - awk 'function printfasta(seq) { \ + gawk 'function printfasta(seq) { \ seqlen=length(seq); \ for (i=1; i <= seqlen; i+=60) \ print substr(seq,i,60); \ diff --git a/detectors/rrna/tools/revcomp_alignments.sh b/detectors/rrna/tools/revcomp_alignments.sh index e06a015..7d1d089 100644 --- a/detectors/rrna/tools/revcomp_alignments.sh +++ b/detectors/rrna/tools/revcomp_alignments.sh @@ -11,7 +11,7 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh" function revcomp { - awk 'function printfasta(seq) { \ + gawk 'function printfasta(seq) { \ seqlen=length(seq); \ for (i=1; i <= seqlen; i+=60) \ print substr(seq,i,60); \ diff --git a/detectors/trna/lib/aragorn_wrapper.awk b/detectors/trna/lib/aragorn_wrapper.awk index 6fdcba9..83b6a48 100755 --- a/detectors/trna/lib/aragorn_wrapper.awk +++ b/detectors/trna/lib/aragorn_wrapper.awk @@ -1,4 +1,4 @@ -#!/usr/bin/awk -f +#!/usr/bin/env gawk -f function genomeid() { if (gid=="") { gid="XXXXXXX"; diff --git a/detectors/trna/tools/buildCAURefDB.sh b/detectors/trna/tools/buildCAURefDB.sh index a0de820..c97c89e 100755 --- a/detectors/trna/tools/buildCAURefDB.sh +++ b/detectors/trna/tools/buildCAURefDB.sh @@ -11,7 +11,7 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh" function fasta1li { - awk '/^>/ {if (sequence) \ + $AwkCmd '/^>/ {if (sequence) \ {print sequence}; \ print $0; \ sequence=""} \ @@ -28,7 +28,7 @@ function dereplicate { grep -v -- -- | \ sed -E "s/count=[0-9]+; //" | \ sed 's/cluster_weight/count/' | \ - awk ' /^>/ {SEQ++;$1=$1"_"SEQ;print $0} \ + $AwkCmd ' /^>/ {SEQ++;$1=$1"_"SEQ;print $0} \ !/^>/ {print $0}' } @@ -52,15 +52,16 @@ function goodtrna { sumatra -t 0.90 -x $QUERY $REF | \ sed -E 's/.(trn.M?)[_A-Z0-9]+/ \1 /' | \ sort -k 1,2 | \ - awk '(OLD) && ($1!=OLD) {print OLD,c["trnM"],c["trnfM"],c["trnI"]} \ + $AwkCmd '(OLD) && ($1!=OLD) {print OLD,c["trnM"],c["trnfM"],c["trnI"]} \ (OLD !=$1) {c["trnM"]=0;c["trnfM"]=0;c["trnI"]=0;OLD=$1} \ - {c[$2]+=$5}' | awk '{p=0;} \ + {c[$2]+=$5}' | \ + $AwkCmd '{p=0;} \ ($2 > $3) && ($2 > $4) { print $0,"trnM";p=1 } \ ($3 > $2) && ($3 > $4) {print $0,"trnfM";p=1} \ ($4 > $2) && ($4 > $3) {print $0,"trnI";p=1} \ (p==0) {print $0,"----"}' | sed 's/_/ /' | \ - awk '{print $1"_"$2,$3,$4,$5,$1,$6}' | \ - awk '(($2+$3+$4) > 1) && ($5==$6) {print $1}' + $AwkCmd '{print $1"_"$2,$3,$4,$5,$1,$6}' | \ + $AwkCmd '(($2+$3+$4) > 1) && ($5==$6) {print $1}' } pushTmpDir ORG.buildSCDB diff --git a/detectors/trna/tools/extract_refCAUtRNA.sh b/detectors/trna/tools/extract_refCAUtRNA.sh index 6a8866e..f71f1e1 100755 --- a/detectors/trna/tools/extract_refCAUtRNA.sh +++ b/detectors/trna/tools/extract_refCAUtRNA.sh @@ -15,11 +15,11 @@ function taxid { } function ac { - head -1 $1 | awk '{print $2}' + head -1 $1 | $AwkCmd '{print $2}' } function definition { - awk '/^DEFINITION/ {on=1} \ + $AwkCmd '/^DEFINITION/ {on=1} \ (on==1) {printf("%s ",$0)} \ (/\.$/ && (on==1)) {on=0;print ""}' $1 | \ sed 's/^DEFINITION *//' | \ @@ -33,7 +33,7 @@ function gb2fasta { echo ">${AC} taxid=${TAXID}; ${DEFINITION}" - awk '/^\/\// {on=0} \ + $AwkCmd '/^\/\// {on=0} \ (on==1) {print $0} \ /^ORIGIN / {on=1}' $1 | \ sed -E 's/^ *[0-9]+ +//' | \ @@ -46,11 +46,11 @@ function findCAUtrna { gb2fasta $1 > ${FASTATMP} aragorn -i -w -seq ${FASTATMP} | \ - awk '(on==1) && /^ *[0-9]+/ {on=0;print ""} \ + $AwkCmd '(on==1) && /^ *[0-9]+/ {on=0;print ""} \ (on==1) {printf($0)} \ /\(cat\)$/ {on=1; printf("%s ",$0)} \ END {print ""}' | \ - awk '{print $3,$6}' | \ + $AwkCmd '{print $3,$6}' | \ sed -E 's/c?\[([0-9]+),([0-9]+)\]/\1 \2/' | \ sed 's/ /:/g' @@ -58,10 +58,10 @@ function findCAUtrna { } function trnaAnnotations { - awk '/^ORIGIN/ {on=0} \ + $AwkCmd '/^ORIGIN/ {on=0} \ (on==1) {print $0} \ /^FEATURE/ {on=1}' $1 | \ - awk '/^ [^ ]/ {print ""} \ + $AwkCmd '/^ [^ ]/ {print ""} \ {printf("%s ",$0)} \ END {print ""}' | \ sed 's/^ *//' | \ @@ -76,17 +76,17 @@ function trnaAnnotations { sed -E 's/join\(([0-9]+ [0-9]+)\)/\1/' | \ sed 's/^tRNA *//' | \ sed -E 's@([0-9]+) +([0-9]+).*/gene="([^"]+)"@\1 \2 \3@' | \ - awk '{print $1,$2,$3}' + $AwkCmd '{print $1,$2,$3}' } function annotateCAU { DISTTMP="$$.trna.dist" trna=(`echo $1 | sed 's/:/ /g'`) - awk -v b=${trna[0]} -v e=${trna[1]} \ + $AwkCmd -v b=${trna[0]} -v e=${trna[1]} \ '{printf("sqrt((%d - %d)^2 + (%d - %d)^2)\n",$1,b,$2,e)}' $2 | \ bc -l | \ sed 's/\..*$//' > ${DISTTMP} - paste ${DISTTMP} $2 | sort -nk 1 | head -1 | awk '{print $1,$4}' + paste ${DISTTMP} $2 | sort -nk 1 | head -1 | $AwkCmd '{print $1,$4}' rm -f ${DISTTMP} } @@ -98,7 +98,7 @@ function writeTRNA { TRNATMP="$$.trna.txt" trnaAnnotations $1 > ${TRNATMP} - ntrna=`wc -l ${TRNATMP} | awk '{print $1}'` + ntrna=`wc -l ${TRNATMP} | $AwkCmd '{print $1}'` if (( ntrna > 0 )); then trnacau=`findCAUtrna $1` @@ -110,7 +110,7 @@ function writeTRNA { if (( distance <= 10 )); then echo ">${aa}_${AC} gbac=${AC}; trna=${aa}; taxid=${TAXID}; distance=${distance}; ${DEFINITION}" - echo "$t" | awk -F ':' '{print $3}' + echo "$t" | $AwkCmd -F ':' '{print $3}' fi done fi diff --git a/scripts/bash_init.sh b/scripts/bash_init.sh index 73b0185..bb60ba2 100644 --- a/scripts/bash_init.sh +++ b/scripts/bash_init.sh @@ -3,6 +3,8 @@ # for setting up basic variables and functions # +export AwkCmd="gawk" + ######################## # # General usage functions @@ -25,8 +27,8 @@ function pushTmpDir { } function popTmpDir { - TMP_DIR=$(echo $TMP_DIR_STACK | awk '{print $1}') - TMP_DIR_STACK=$(echo $TMP_DIR_STACK | awk '{$1="";print $0}') + TMP_DIR=$(echo $TMP_DIR_STACK | $AwkCmd '{print $1}') + TMP_DIR_STACK=$(echo $TMP_DIR_STACK | $AwkCmd '{$1="";print $0}') popd >& /dev/null rm -rf $TMP_DIR >& /dev/null logdebug "Poping temp directory $TMP_DIR" @@ -91,7 +93,7 @@ function fastaCount { function seqlength { cat $1 | \ wc |\ - awk -v t="`head -1 $1 | wc -c`" '{print $3 - t - $1 + 1}' + $AwkCmd -v t="`head -1 $1 | wc -c`" '{print $3 - t - $1 + 1}' } # extract a subseq from a fasta sequence @@ -99,7 +101,7 @@ function seqlength { # - $2 : First position of the subsequence (first position is numered 1), # - $3 : End of the subsequence (included in the subsequence) function cutseq { - awk -v from=$2 -v end=$3 'function printfasta(seq) { \ + $AwkCmd -v from=$2 -v end=$3 'function printfasta(seq) { \ seqlen=length(seq); \ for (i=1; i <= seqlen; i+=60) \ print substr(seq,i,60); \ @@ -114,13 +116,13 @@ function cutseq { # a single sequence # - $1 : The fasta file containing the sequences to join function joinfasta { - awk '(NR==1 && /^>/) {print $0} \ + $AwkCmd '(NR==1 && /^>/) {print $0} \ ! /^>/ {print $0}' $1 | \ formatfasta } function formatfasta { - awk 'function printfasta(seq) { \ + $AwkCmd 'function printfasta(seq) { \ seqlen=length(seq); \ for (i=1; i <= seqlen; i+=60) \ print substr(seq,i,60); \