Merge branch 'master' of git.metabarcoding.org:org-asm/org-annotate

Former-commit-id: 0a81301c5e8bd922a52aac5350d89d75290e25be
Former-commit-id: ba26c3376ad23cf6c7ebd98c9e7f955bc70b51b3
This commit is contained in:
alain viari
2015-11-08 19:54:19 +01:00
16 changed files with 52 additions and 49 deletions

View File

@ -33,7 +33,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Building LSC coorientation graph..."
${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${ORG_LOGFILE} > LSC.tgf
${PROG_DIR}/cc.py LSC.tgf > LSC.cc
loginfo " --> $(awk '{print $1}' LSC.cc | uniq | wc -l) connected componants"
loginfo " --> $($AwkCmd '{print $1}' LSC.cc | uniq | wc -l) connected componants"
loginfo "Done"
@ -46,7 +46,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Extracting main connected components for LCS..."
rm -f LSC.direct.fasta
touch LSC.direct.fasta
for id in `awk '($1==0) {print $2}' LSC.cc`; do
for id in `$AwkCmd '($1==0) {print $2}' LSC.cc`; do
fastafetch -f LSC.fasta -i LSC.index -q "${id}" >> LSC.direct.fasta
done
loginfo " --> $(fastaCount LSC.direct.fasta) sequences"
@ -57,7 +57,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Extracting second connected components for LCS..."
rm -f LSC.reverse.fasta
touch LSC.reverse.fasta
for id in `awk '($1==1) {print $2}' LSC.cc`; do
for id in `$AwkCmd '($1==1) {print $2}' LSC.cc`; do
fastafetch -f LSC.fasta -i LSC.index -q "${id}" >> LSC.reverse.fasta
done
loginfo " --> $(fastaCount LSC.reverse.fasta) sequences"
@ -75,7 +75,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Checking LCS homogeneity..."
${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${ORG_LOGFILE} > LSC_RefDB.tgf
${PROG_DIR}/cc.py LSC_RefDB.tgf > LSC_RefDB.cc
NCC=$(awk '{print $1}' LSC_RefDB.cc | uniq | wc -l)
NCC=$($AwkCmd '{print $1}' LSC_RefDB.cc | uniq | wc -l)
if (( $NCC == 1 )); then
loginfo " --> $NCC connected componants"
else
@ -105,7 +105,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Building SSC coorientation graph..."
${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${ORG_LOGFILE} > SSC.tgf
${PROG_DIR}/cc.py SSC.tgf > SSC.cc
loginfo " --> $(awk '{print $1}' SSC.cc | uniq | wc -l) connected componants"
loginfo " --> $($AwkCmd '{print $1}' SSC.cc | uniq | wc -l) connected componants"
loginfo "Done"
@ -119,7 +119,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Extracting main connected components for SSC..."
rm -f SSC.direct.fasta
touch SSC.direct.fasta
for id in `awk '($1==0) {print $2}' SSC.cc`; do
for id in `$AwkCmd '($1==0) {print $2}' SSC.cc`; do
fastafetch -f SSC.fasta -i SSC.index -q "${id}" >> SSC.direct.fasta
done
loginfo " --> $(fastaCount SSC.direct.fasta) sequences"
@ -130,7 +130,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Extracting second connected components for SSC..."
rm -f SSC.reverse.fasta
touch SSC.reverse.fasta
for id in `awk '($1==1) {print $2}' SSC.cc`; do
for id in `$AwkCmd '($1==1) {print $2}' SSC.cc`; do
fastafetch -f SSC.fasta -i SSC.index -q "${id}" >> SSC.reverse.fasta
done
loginfo " --> $(fastaCount SSC.reverse.fasta) sequences"
@ -148,7 +148,7 @@ pushTmpDir ORG.buildSCDB
loginfo "Checking SSC homogeneity..."
${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${ORG_LOGFILE} > SSC_RefDB.tgf
${PROG_DIR}/cc.py SSC_RefDB.tgf > SSC_RefDB.cc
NCC=$(awk '{print $1}' SSC_RefDB.cc | uniq | wc -l)
NCC=$($AwkCmd '{print $1}' SSC_RefDB.cc | uniq | wc -l)
if (( $NCC == 1 )); then
loginfo " --> $NCC connected componants"
else

View File

@ -43,7 +43,7 @@ pushTmpDir ORG.coorienteSC
loginfo "Running Blast..."
blastn -db "${BLASTDB}" -query "${DATA}" -outfmt 6 | \
awk ' \
$AwkCmd ' \
($4 > 1000) && ($3 > 70) \
($1==QUERY) && \
($2==SUBJECT) && \
@ -65,7 +65,7 @@ pushTmpDir ORG.coorienteSC
LDIFF= ($3/100.*$4) }} \
} \
END {print QUERY,SUBJECT,LSAME,LDIFF,(LSAME>LDIFF)}' | \
awk -v minlength="${MINLENGTH}" \
$AwkCmd -v minlength="${MINLENGTH}" \
' (($3>minlength) || \
($4 > minlength)) && \
($3/($4+1) > 2) && \
@ -75,7 +75,7 @@ pushTmpDir ORG.coorienteSC
{print $1,$2,$5}}' | \
sort | \
uniq -c | \
awk '($1==2) {print $2,$3,$4}'
$AwkCmd '($1==2) {print $2,$3,$4}'
loginfo "Done"
popTmpDir

View File

@ -2,7 +2,7 @@
#
awk 'function printfasta(seq) { \
gawk 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -2,7 +2,7 @@
#
awk 'function printfasta(seq) { \
gawk 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -2,7 +2,7 @@
grep -A 1 ' ORGANISM' $* | \
grep -B 1 Viridiplantae | \
awk '{print $1}' | \
gawk '{print $1}' | \
grep '\.gbk' | \
sed -E 's/(^.*\.gbk).$/\1/' | \
uniq

View File

@ -33,13 +33,13 @@ pushTmpDir ORG.rrna
RRNA=$(basename ${QUERY})
hmmsearch --max ${RRNADB} ${QUERY} | \
awk '/Query: / { \
$AwkCmd '/Query: / { \
profil=$2; \
match($3,"[0-9][0-9]*");\
lprof=substr($3,RSTART,RLENGTH)} \
/ [0-9][0-9]* ! / { \
print profil,lprof,$7,$8,$10,$11}' | \
awk '($3 <=5) && (($2-$4) <=5) { \
$AwkCmd '($3 <=5) && (($2-$4) <=5) { \
full=1;$5=$5-$3+1;$6=$6+($2-$4)}
{loc=$5".."$6} \
($1 ~ /_RC$/) { \

View File

@ -11,7 +11,7 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh"
function fasta1li {
awk '/^>/ {if (sequence) \
$AwkCmd '/^>/ {if (sequence) \
{print sequence}; \
print $0; \
sequence=""} \
@ -28,7 +28,7 @@ function dereplicate {
grep -v -- -- | \
sed -E "s/count=[0-9]+; //" | \
sed 's/cluster_weight/count/' | \
awk ' /^>/ {SEQ++;\
$AwkCmd ' /^>/ {SEQ++;\
match($0,"count=[0-9][0-9]*;");\
count=substr($0,RSTART,RLENGTH);\
$1=$1"_"SEQ;\
@ -58,7 +58,7 @@ function clustering {
}
function revcomp {
awk 'function printfasta(seq) { \
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -2,7 +2,7 @@
#
awk 'function printfasta(seq) { \
gawk 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -2,7 +2,7 @@
#
awk 'function printfasta(seq) { \
gawk 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -2,7 +2,7 @@
#
awk 'function printfasta(seq) { \
gawk 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -2,7 +2,7 @@
#
awk 'function printfasta(seq) { \
gawk 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -11,7 +11,7 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh"
function revcomp {
awk 'function printfasta(seq) { \
gawk 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -1,4 +1,4 @@
#!/usr/bin/awk -f
#!/usr/bin/env gawk -f
function genomeid() {
if (gid=="") {
gid="XXXXXXX";

View File

@ -11,7 +11,7 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh"
function fasta1li {
awk '/^>/ {if (sequence) \
$AwkCmd '/^>/ {if (sequence) \
{print sequence}; \
print $0; \
sequence=""} \
@ -28,7 +28,7 @@ function dereplicate {
grep -v -- -- | \
sed -E "s/count=[0-9]+; //" | \
sed 's/cluster_weight/count/' | \
awk ' /^>/ {SEQ++;$1=$1"_"SEQ;print $0} \
$AwkCmd ' /^>/ {SEQ++;$1=$1"_"SEQ;print $0} \
!/^>/ {print $0}'
}
@ -52,15 +52,16 @@ function goodtrna {
sumatra -t 0.90 -x $QUERY $REF | \
sed -E 's/.(trn.M?)[_A-Z0-9]+/ \1 /' | \
sort -k 1,2 | \
awk '(OLD) && ($1!=OLD) {print OLD,c["trnM"],c["trnfM"],c["trnI"]} \
$AwkCmd '(OLD) && ($1!=OLD) {print OLD,c["trnM"],c["trnfM"],c["trnI"]} \
(OLD !=$1) {c["trnM"]=0;c["trnfM"]=0;c["trnI"]=0;OLD=$1} \
{c[$2]+=$5}' | awk '{p=0;} \
{c[$2]+=$5}' | \
$AwkCmd '{p=0;} \
($2 > $3) && ($2 > $4) { print $0,"trnM";p=1 } \
($3 > $2) && ($3 > $4) {print $0,"trnfM";p=1} \
($4 > $2) && ($4 > $3) {print $0,"trnI";p=1} \
(p==0) {print $0,"----"}' | sed 's/_/ /' | \
awk '{print $1"_"$2,$3,$4,$5,$1,$6}' | \
awk '(($2+$3+$4) > 1) && ($5==$6) {print $1}'
$AwkCmd '{print $1"_"$2,$3,$4,$5,$1,$6}' | \
$AwkCmd '(($2+$3+$4) > 1) && ($5==$6) {print $1}'
}
pushTmpDir ORG.buildSCDB

View File

@ -15,11 +15,11 @@ function taxid {
}
function ac {
head -1 $1 | awk '{print $2}'
head -1 $1 | $AwkCmd '{print $2}'
}
function definition {
awk '/^DEFINITION/ {on=1} \
$AwkCmd '/^DEFINITION/ {on=1} \
(on==1) {printf("%s ",$0)} \
(/\.$/ && (on==1)) {on=0;print ""}' $1 | \
sed 's/^DEFINITION *//' | \
@ -33,7 +33,7 @@ function gb2fasta {
echo ">${AC} taxid=${TAXID}; ${DEFINITION}"
awk '/^\/\// {on=0} \
$AwkCmd '/^\/\// {on=0} \
(on==1) {print $0} \
/^ORIGIN / {on=1}' $1 | \
sed -E 's/^ *[0-9]+ +//' | \
@ -46,11 +46,11 @@ function findCAUtrna {
gb2fasta $1 > ${FASTATMP}
aragorn -i -w -seq ${FASTATMP} | \
awk '(on==1) && /^ *[0-9]+/ {on=0;print ""} \
$AwkCmd '(on==1) && /^ *[0-9]+/ {on=0;print ""} \
(on==1) {printf($0)} \
/\(cat\)$/ {on=1; printf("%s ",$0)} \
END {print ""}' | \
awk '{print $3,$6}' | \
$AwkCmd '{print $3,$6}' | \
sed -E 's/c?\[([0-9]+),([0-9]+)\]/\1 \2/' | \
sed 's/ /:/g'
@ -58,10 +58,10 @@ function findCAUtrna {
}
function trnaAnnotations {
awk '/^ORIGIN/ {on=0} \
$AwkCmd '/^ORIGIN/ {on=0} \
(on==1) {print $0} \
/^FEATURE/ {on=1}' $1 | \
awk '/^ [^ ]/ {print ""} \
$AwkCmd '/^ [^ ]/ {print ""} \
{printf("%s ",$0)} \
END {print ""}' | \
sed 's/^ *//' | \
@ -76,17 +76,17 @@ function trnaAnnotations {
sed -E 's/join\(([0-9]+ [0-9]+)\)/\1/' | \
sed 's/^tRNA *//' | \
sed -E 's@([0-9]+) +([0-9]+).*/gene="([^"]+)"@\1 \2 \3@' | \
awk '{print $1,$2,$3}'
$AwkCmd '{print $1,$2,$3}'
}
function annotateCAU {
DISTTMP="$$.trna.dist"
trna=(`echo $1 | sed 's/:/ /g'`)
awk -v b=${trna[0]} -v e=${trna[1]} \
$AwkCmd -v b=${trna[0]} -v e=${trna[1]} \
'{printf("sqrt((%d - %d)^2 + (%d - %d)^2)\n",$1,b,$2,e)}' $2 | \
bc -l | \
sed 's/\..*$//' > ${DISTTMP}
paste ${DISTTMP} $2 | sort -nk 1 | head -1 | awk '{print $1,$4}'
paste ${DISTTMP} $2 | sort -nk 1 | head -1 | $AwkCmd '{print $1,$4}'
rm -f ${DISTTMP}
}
@ -98,7 +98,7 @@ function writeTRNA {
TRNATMP="$$.trna.txt"
trnaAnnotations $1 > ${TRNATMP}
ntrna=`wc -l ${TRNATMP} | awk '{print $1}'`
ntrna=`wc -l ${TRNATMP} | $AwkCmd '{print $1}'`
if (( ntrna > 0 )); then
trnacau=`findCAUtrna $1`
@ -110,7 +110,7 @@ function writeTRNA {
if (( distance <= 10 )); then
echo ">${aa}_${AC} gbac=${AC}; trna=${aa}; taxid=${TAXID}; distance=${distance}; ${DEFINITION}"
echo "$t" | awk -F ':' '{print $3}'
echo "$t" | $AwkCmd -F ':' '{print $3}'
fi
done
fi

View File

@ -3,6 +3,8 @@
# for setting up basic variables and functions
#
export AwkCmd="gawk"
########################
#
# General usage functions
@ -25,8 +27,8 @@ function pushTmpDir {
}
function popTmpDir {
TMP_DIR=$(echo $TMP_DIR_STACK | awk '{print $1}')
TMP_DIR_STACK=$(echo $TMP_DIR_STACK | awk '{$1="";print $0}')
TMP_DIR=$(echo $TMP_DIR_STACK | $AwkCmd '{print $1}')
TMP_DIR_STACK=$(echo $TMP_DIR_STACK | $AwkCmd '{$1="";print $0}')
popd >& /dev/null
rm -rf $TMP_DIR >& /dev/null
logdebug "Poping temp directory $TMP_DIR"
@ -91,7 +93,7 @@ function fastaCount {
function seqlength {
cat $1 | \
wc |\
awk -v t="`head -1 $1 | wc -c`" '{print $3 - t - $1 + 1}'
$AwkCmd -v t="`head -1 $1 | wc -c`" '{print $3 - t - $1 + 1}'
}
# extract a subseq from a fasta sequence
@ -99,7 +101,7 @@ function seqlength {
# - $2 : First position of the subsequence (first position is numbered 1),
# - $3 : End of the subsequence (included in the subsequence)
function cutseq {
awk -v from=$2 -v end=$3 'function printfasta(seq) { \
$AwkCmd -v from=$2 -v end=$3 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \
@ -114,13 +116,13 @@ function cutseq {
# a single sequence
# - $1 : The fasta file containing the sequences to join
function joinfasta {
awk '(NR==1 && /^>/) {print $0} \
$AwkCmd '(NR==1 && /^>/) {print $0} \
! /^>/ {print $0}' $1 | \
formatfasta
}
function formatfasta {
awk 'function printfasta(seq) { \
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \