First version including the rRNA detector
Former-commit-id: 865e2626c7cd92ea596aedefe2b760b3eaaa9c77 Former-commit-id: 94a2e642328eb679660c6ac5a6cb6803c2083632
This commit is contained in:
165
detectors/rrna/tools/buildRRNAModels.sh
Executable file
165
detectors/rrna/tools/buildRRNAModels.sh
Executable file
@ -0,0 +1,165 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BUILD RRNA models
|
||||
#
|
||||
#========================================================================================
|
||||
|
||||
# -- CAUTION -- Works as long than the script
|
||||
# is not called through a symlink
|
||||
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
|
||||
|
||||
function fasta1li {
|
||||
|
||||
awk '/^>/ {if (sequence) \
|
||||
{print sequence}; \
|
||||
print $0; \
|
||||
sequence=""} \
|
||||
!/^>/ {sequence = sequence $0} \
|
||||
END {print sequence}' $1
|
||||
}
|
||||
|
||||
function dereplicate {
|
||||
DATA=$1
|
||||
sumaclust -t 1 $DATA | \
|
||||
fasta1li | \
|
||||
grep -A 1 '^>' | \
|
||||
grep -A1 'cluster_center=True;' | \
|
||||
grep -v -- -- | \
|
||||
sed -E "s/count=[0-9]+; //" | \
|
||||
sed 's/cluster_weight/count/' | \
|
||||
awk ' /^>/ {SEQ++;\
|
||||
match($0,"count=[0-9][0-9]*;");\
|
||||
count=substr($0,RSTART,RLENGTH);\
|
||||
$1=$1"_"SEQ;\
|
||||
print $1,count} \
|
||||
!/^>/ {print $0}'
|
||||
}
|
||||
|
||||
|
||||
function clustering {
|
||||
DATA=$1
|
||||
rm -rf $DATA
|
||||
mkdir $DATA
|
||||
sumaclust -t 0.9 $DATA.fasta | \
|
||||
fasta1li > $DATA.clust.fasta
|
||||
cluster=$(grep '^>' $DATA.clust.fasta | \
|
||||
sed -E 's/.*cluster=([^;]+);.*$/\1/' | \
|
||||
sort -u)
|
||||
for c in $cluster; do
|
||||
w=$(grep "$c" "${DATA}.clust.fasta" | \
|
||||
head -1 | \
|
||||
sed -E 's/.*cluster_weight=([^;]+);.*$/\1/')
|
||||
out=$(printf "${DATA}/%05d_%s" $w $c)
|
||||
grep -A1 "$c" "${DATA}.clust.fasta" | \
|
||||
grep -v -- -- > "$out.fasta"
|
||||
muscle -in "$out.fasta" -out "$out.align.fasta"
|
||||
done
|
||||
}
|
||||
|
||||
function revcomp {
|
||||
awk 'function printfasta(seq) { \
|
||||
seqlen=length(seq); \
|
||||
for (i=1; i <= seqlen; i+=60) \
|
||||
print substr(seq,i,60); \
|
||||
} \
|
||||
function comp(seq) { \
|
||||
"echo "seq" | tr acgtACGT tgcaTGCA " | getline res; \
|
||||
close("echo "seq" | tr acgtACGT tgcaTGCA "); \
|
||||
return res; \
|
||||
} \
|
||||
function rev(seq) { \
|
||||
"echo "seq" | rev " | getline res; \
|
||||
close("echo "seq" | rev "); \
|
||||
return res; \
|
||||
} \
|
||||
function revcomp(seq) { \
|
||||
res=rev(comp(seq)); \
|
||||
return res; \
|
||||
} \
|
||||
\
|
||||
(seq) && /^>/ {print head; \
|
||||
printfasta(revcomp(seq)); \
|
||||
seq=""} \
|
||||
/^>/ {head=$0} \
|
||||
! /^>/ {seq=seq$0} \
|
||||
END { print head; \
|
||||
printfasta(revcomp(seq)); \
|
||||
}' $1
|
||||
}
|
||||
|
||||
|
||||
pushTmpDir ORG.buildRRNA
|
||||
|
||||
openLogFile "${RRNA_DATA_DIR}/rRNA_models.log"
|
||||
|
||||
loginfo "Selecting Viridiplantae genebank entries..."
|
||||
VIRIDIPLANTAE=$(${PROG_DIR}/../../normalize/tools/selectViridiplantae.sh $*)
|
||||
loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
loginfo "Extracting 4.5S rRNA sequences..."
|
||||
${PROG_DIR}/extract_ref4.5S.sh ${VIRIDIPLANTAE} > raw_4_5S.fasta
|
||||
loginfo " --> $(fastaCount raw_4_5S.fasta) retreived sequences"
|
||||
dereplicate raw_4_5S.fasta > 4_5S.fasta
|
||||
loginfo " --> $(fastaCount ${CALL_DIR}/4_5S.fasta) distinct sequences"
|
||||
loginfo "Done"
|
||||
|
||||
loginfo "Clustering 4.5S rRNA sequences..."
|
||||
clustering 4_5S
|
||||
loginfo "Done"
|
||||
|
||||
loginfo "Installing 4.5S rRNA sequences..."
|
||||
cp -r 4_5S "${RRNA_DATA_DIR}/RRNA_4_5S"
|
||||
loginfo "Done"
|
||||
|
||||
loginfo "Extracting 5S rRNA sequences..."
|
||||
${PROG_DIR}/extract_ref5S.sh ${VIRIDIPLANTAE} > raw_5S.fasta
|
||||
loginfo " --> $(fastaCount raw_5S.fasta) retreived sequences"
|
||||
dereplicate raw_5S.fasta > 5S.fasta
|
||||
loginfo " --> $(fastaCount 5S.fasta) distinct sequences"
|
||||
loginfo "Done"
|
||||
|
||||
loginfo "Clustering 5S rRNA sequences..."
|
||||
clustering 5S
|
||||
loginfo "Done"
|
||||
|
||||
loginfo "Installing 5S rRNA sequences..."
|
||||
cp -r 5S "${RRNA_DATA_DIR}/RRNA_5S"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
loginfo "Extracting 16S rRNA sequences..."
|
||||
${PROG_DIR}/extract_ref16S.sh ${VIRIDIPLANTAE} > raw_16S.fasta
|
||||
loginfo " --> $(fastaCount raw_16S.fasta) retreived sequences"
|
||||
dereplicate raw_16S.fasta > 16S.fasta
|
||||
loginfo " --> $(fastaCount 16S.fasta) distinct sequences"
|
||||
loginfo "Done"
|
||||
|
||||
loginfo "Clustering 16S rRNA sequences..."
|
||||
clustering 16S
|
||||
loginfo "Done"
|
||||
|
||||
loginfo "Installing 16S rRNA sequences..."
|
||||
cp -r 16S "${RRNA_DATA_DIR}/RRNA_16S"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
loginfo "Extracting 23S rRNA sequences..."
|
||||
${PROG_DIR}/extract_ref23S.sh ${VIRIDIPLANTAE} > raw_23S.fasta
|
||||
loginfo " --> $(fastaCount raw_23S.fasta) retreived sequences"
|
||||
dereplicate raw_23S.fasta > 23S.fasta
|
||||
loginfo " --> $(fastaCount 23S.fasta) distinct sequences"
|
||||
loginfo "Done"
|
||||
|
||||
loginfo "Clustering 23S rRNA sequences..."
|
||||
clustering 23S
|
||||
loginfo "Done"
|
||||
|
||||
loginfo "Installing 23S rRNA sequences..."
|
||||
cp -r 23S "${RRNA_DATA_DIR}/RRNA_23S"
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
popTmpDir
|
@ -39,7 +39,7 @@
|
||||
sequence=sequence seq \
|
||||
} \
|
||||
/^\/\// && FROM \
|
||||
{print ">"AC"_16S Strand="STRAND";", \
|
||||
{print ">RRNA16S_"AC" Strand="STRAND";", \
|
||||
"cut="FROM".."TO";", \
|
||||
"seq_length="LENGTH";"; \
|
||||
SS=substr(sequence,FROM,LENGTH); \
|
||||
|
50
detectors/rrna/tools/extract_ref23S.sh
Executable file
50
detectors/rrna/tools/extract_ref23S.sh
Executable file
@ -0,0 +1,50 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
|
||||
awk 'function printfasta(seq) { \
|
||||
seqlen=length(seq); \
|
||||
for (i=1; i <= seqlen; i+=60) \
|
||||
print substr(seq,i,60); \
|
||||
} \
|
||||
function comp(seq) { \
|
||||
"echo "seq" | tr acgtACGT tgcaTGCA " | getline res; \
|
||||
close("echo "seq" | tr acgtACGT tgcaTGCA "); \
|
||||
return res; \
|
||||
} \
|
||||
function rev(seq) { \
|
||||
"echo "seq" | rev " | getline res; \
|
||||
close("echo "seq" | rev "); \
|
||||
return res; \
|
||||
} \
|
||||
function revcomp(seq) { \
|
||||
res=rev(comp(seq)); \
|
||||
return res; \
|
||||
} \
|
||||
\
|
||||
/^LOCUS / {AC=$2; sequence=""; seqon=0; FROM="";TO=""} \
|
||||
/^ rRNA / {LOCUS=$2; STRAND=1} \
|
||||
/^ rRNA / && /complement/ {STRAND=0; \
|
||||
sub("complement\\(","",LOCUS); \
|
||||
sub("\\)","",LOCUS); \
|
||||
} \
|
||||
/23S/ {split(LOCUS,POS,"."); \
|
||||
FROM=POS[1]; \
|
||||
TO=POS[3]; \
|
||||
LENGTH=TO-FROM+1 \
|
||||
} \
|
||||
/^ORIGIN/ {seqon=1} \
|
||||
/^ *[1-9][0-9]* [a-z ]+$/ && seqon {seq=$2 $3 $4 $5 $6 $7; \
|
||||
gsub("[^acgt]","n",seq);\
|
||||
sequence=sequence seq \
|
||||
} \
|
||||
/^\/\// && FROM \
|
||||
{print ">RRNA23S_"AC" Strand="STRAND";", \
|
||||
"cut="FROM".."TO";", \
|
||||
"seq_length="LENGTH";"; \
|
||||
SS=substr(sequence,FROM,LENGTH); \
|
||||
if (! STRAND) \
|
||||
SS=revcomp(SS); \
|
||||
printfasta(SS); \
|
||||
} \
|
||||
' $*
|
@ -28,7 +28,7 @@
|
||||
sub("complement\\(","",LOCUS); \
|
||||
sub("\\)","",LOCUS); \
|
||||
} \
|
||||
/4.5S/ {split(LOCUS,POS,"."); \
|
||||
/4\.5S/ {split(LOCUS,POS,"."); \
|
||||
FROM=POS[1]; \
|
||||
TO=POS[3]; \
|
||||
LENGTH=TO-FROM+1 \
|
||||
@ -39,7 +39,7 @@
|
||||
sequence=sequence seq \
|
||||
} \
|
||||
/^\/\// && FROM \
|
||||
{print ">"AC"_4.5S Strand="STRAND";", \
|
||||
{print ">RRNA4.5S_"AC" Strand="STRAND";", \
|
||||
"cut="FROM".."TO";", \
|
||||
"seq_length="LENGTH";"; \
|
||||
SS=substr(sequence,FROM,LENGTH); \
|
||||
|
50
detectors/rrna/tools/extract_ref5S.sh
Executable file
50
detectors/rrna/tools/extract_ref5S.sh
Executable file
@ -0,0 +1,50 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
|
||||
awk 'function printfasta(seq) { \
|
||||
seqlen=length(seq); \
|
||||
for (i=1; i <= seqlen; i+=60) \
|
||||
print substr(seq,i,60); \
|
||||
} \
|
||||
function comp(seq) { \
|
||||
"echo "seq" | tr acgtACGT tgcaTGCA " | getline res; \
|
||||
close("echo "seq" | tr acgtACGT tgcaTGCA "); \
|
||||
return res; \
|
||||
} \
|
||||
function rev(seq) { \
|
||||
"echo "seq" | rev " | getline res; \
|
||||
close("echo "seq" | rev "); \
|
||||
return res; \
|
||||
} \
|
||||
function revcomp(seq) { \
|
||||
res=rev(comp(seq)); \
|
||||
return res; \
|
||||
} \
|
||||
\
|
||||
/^LOCUS / {AC=$2; sequence=""; seqon=0; FROM="";TO=""} \
|
||||
/^ rRNA / {LOCUS=$2; STRAND=1} \
|
||||
/^ rRNA / && /complement/ {STRAND=0; \
|
||||
sub("complement\\(","",LOCUS); \
|
||||
sub("\\)","",LOCUS); \
|
||||
} \
|
||||
/5S/ && ! /4.5S/ {split(LOCUS,POS,"."); \
|
||||
FROM=POS[1]; \
|
||||
TO=POS[3]; \
|
||||
LENGTH=TO-FROM+1 \
|
||||
} \
|
||||
/^ORIGIN/ {seqon=1} \
|
||||
/^ *[1-9][0-9]* [a-z ]+$/ && seqon {seq=$2 $3 $4 $5 $6 $7; \
|
||||
gsub("[^acgt]","n",seq);\
|
||||
sequence=sequence seq \
|
||||
} \
|
||||
/^\/\// && FROM \
|
||||
{print ">RRNA5S_"AC" Strand="STRAND";", \
|
||||
"cut="FROM".."TO";", \
|
||||
"seq_length="LENGTH";"; \
|
||||
SS=substr(sequence,FROM,LENGTH); \
|
||||
if (! STRAND) \
|
||||
SS=revcomp(SS); \
|
||||
printfasta(SS); \
|
||||
} \
|
||||
' $*
|
44
detectors/rrna/tools/revcomp_alignments.sh
Normal file
44
detectors/rrna/tools/revcomp_alignments.sh
Normal file
@ -0,0 +1,44 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Reverse complement a fasta formated alignment
|
||||
#
|
||||
#========================================================================================
|
||||
|
||||
# -- CAUTION -- Works as long than the script
|
||||
# is not called through a symlink
|
||||
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
|
||||
|
||||
|
||||
function revcomp {
|
||||
awk 'function printfasta(seq) { \
|
||||
seqlen=length(seq); \
|
||||
for (i=1; i <= seqlen; i+=60) \
|
||||
print substr(seq,i,60); \
|
||||
} \
|
||||
function comp(seq) { \
|
||||
"echo "seq" | tr acgtACGT tgcaTGCA " | getline res; \
|
||||
close("echo "seq" | tr acgtACGT tgcaTGCA "); \
|
||||
return res; \
|
||||
} \
|
||||
function rev(seq) { \
|
||||
"echo "seq" | rev " | getline res; \
|
||||
close("echo "seq" | rev "); \
|
||||
return res; \
|
||||
} \
|
||||
function revcomp(seq) { \
|
||||
res=rev(comp(seq)); \
|
||||
return res; \
|
||||
} \
|
||||
\
|
||||
(seq) && /^>/ {print head; \
|
||||
printfasta(revcomp(seq)); \
|
||||
seq=""} \
|
||||
/^>/ {head=$0} \
|
||||
! /^>/ {seq=seq$0} \
|
||||
END { print head; \
|
||||
printfasta(revcomp(seq)); \
|
||||
}' $*
|
||||
}
|
||||
|
||||
revcomp $*
|
Reference in New Issue
Block a user