first batch

Former-commit-id: 1eecb206a17c4aff21d1170b48db134ce3c4f14e
This commit is contained in:
Eric Coissac
2025-03-01 16:15:28 +01:00
parent 4e51d42b85
commit 2c012eec8e
596 changed files with 5247 additions and 77743 deletions

Binary file not shown.

View File

@ -0,0 +1,141 @@
#!/bin/bash
#
# BUILD RRNA models
#
#========================================================================================
# -- CAUTION -- Works as long as the script
#               is not called through a symlink
#
# Locate the directory holding this script and load the shared bash helpers
# relative to it. The script path is quoted so that an installation
# directory containing spaces does not break the lookup.
THIS_DIR="$(dirname -- "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
#######################################
# Rewrite a fasta stream so that every sequence sits on a single line.
# Globals:   AwkCmd (read) - awk command to run; left unquoted on purpose
#            so that it may carry options.
# Arguments: optional fasta file name(s); reads stdin when none is given.
# Outputs:   each header line followed by its sequence joined on one line.
#######################################
function fasta1li {
  $AwkCmd '
    /^>/ {
      # A new header closes the previous record, if it had any residue.
      if (sequence != "")
        print sequence
      print $0
      sequence = ""
      next
    }
    { sequence = sequence $0 }
    END {
      # Nothing pending (e.g. empty input): do not emit a blank line.
      if (sequence != "")
        print sequence
    }
  ' "$@"
}
#######################################
# Dereplicate a fasta file: cluster it with sumaclust at threshold 1 and
# keep only the records flagged as cluster centers.
# Each kept header is reduced to "<id>_<rank> count=<cluster_weight>;",
# where <rank> is the running number of the record in the output.
# Globals:   AwkCmd (read)
# Arguments: $1 - fasta file to dereplicate
# Outputs:   dereplicated fasta on stdout, one line per sequence
#######################################
function dereplicate {
  local data=$1

  sumaclust -t 1 "$data" \
    | fasta1li \
    | grep -A 1 '^>' \
    | grep -A 1 'cluster_center=True;' \
    | grep -v '^--$' \
    | sed -E 's/count=[0-9]+; //' \
    | sed 's/cluster_weight/count/' \
    | $AwkCmd '
        /^>/ {
          SEQ++
          # Extract the count attribute before $1 is rewritten.
          match($0, "count=[0-9][0-9]*;")
          count = substr($0, RSTART, RLENGTH)
          $1 = $1 "_" SEQ
          print $1, count
          next
        }
        { print $0 }
      '
  # Notes on the pipeline above:
  #  - grep -v "^--$" removes only the group separator that grep -A inserts
  #    between non-adjacent matches; a header or sequence that merely
  #    contains "--" is preserved.
  #  - the first sed drops the original per-sequence count so that the
  #    renamed cluster_weight becomes the only count attribute.
}
#######################################
# Cluster <name>.fasta with sumaclust (threshold 0.9) and align each cluster.
# Produces, in the current directory:
#   <name>.clust.fasta                   sumaclust output, one line per sequence
#   <name>/<weight>_<cluster>.fasta       members of one cluster
#   <name>/<weight>_<cluster>.align.fasta their muscle alignment
# where <weight> is the cluster weight zero-padded to 5 digits.
# The directory <name> is deleted and recreated on every call.
# Globals:   none
# Arguments: $1 - dataset name (basename of the fasta file, without ".fasta")
# Returns:   1 when no dataset name is given
#######################################
function clustering {
  local data=$1
  local clust_file="${data}.clust.fasta"
  local clusters c key w out

  if [[ -z "$data" ]]; then
    printf 'clustering: missing dataset name\n' >&2
    return 1
  fi

  rm -rf -- "$data"
  mkdir -- "$data"

  sumaclust -t 0.9 "${data}.fasta" \
    | fasta1li > "$clust_file"

  clusters=$(grep '^>' "$clust_file" \
    | sed -E 's/.*cluster=([^;]+);.*$/\1/' \
    | sort -u)

  # Cluster identifiers are sequence ids and hold no whitespace, so plain
  # word splitting is used to iterate over them.
  for c in $clusters; do
    # Select members on the complete, literal "cluster=<id>;" attribute.
    # Matching the bare id as a regex would also select every cluster whose
    # id merely starts with it (x_1 versus x_10).
    key="cluster=${c};"

    w=$(grep -F -- "$key" "$clust_file" \
      | head -1 \
      | sed -E 's/.*cluster_weight=([^;]+);.*$/\1/')

    out=$(printf '%s/%05d_%s' "$data" "$w" "$c")

    grep -F -A 1 -- "$key" "$clust_file" \
      | grep -v '^--$' > "${out}.fasta"

    muscle -in "${out}.fasta" -out "${out}.align.fasta"
  done
}
#######################################
# Reverse-complement every sequence of a fasta stream.
# Only a, c, g, t (either case) are complemented; any other character is
# kept as is. Output sequences are wrapped at 60 characters per line.
# The whole computation is done inside awk: no sequence data is ever
# handed to a shell command line.
# Globals:   AwkCmd (read)
# Arguments: optional fasta file name(s); reads stdin when none is given.
# Outputs:   reverse-complemented fasta on stdout; nothing for empty input.
#######################################
function revcomp {
  $AwkCmd '
    function printfasta(seq,    i, seqlen) {
      seqlen = length(seq)
      for (i = 1; i <= seqlen; i += 60)
        print substr(seq, i, 60)
    }

    function revcomp(seq,    i, base, res) {
      res = ""
      for (i = length(seq); i >= 1; i--) {
        base = substr(seq, i, 1)
        if (base in COMP)
          base = COMP[base]
        res = res base
      }
      return res
    }

    function emit() {
      print head
      printfasta(revcomp(seq))
      seq = ""
    }

    BEGIN {
      COMP["a"] = "t"; COMP["c"] = "g"; COMP["g"] = "c"; COMP["t"] = "a"
      COMP["A"] = "T"; COMP["C"] = "G"; COMP["G"] = "C"; COMP["T"] = "A"
    }

    /^>/ {
      # A record is written only once some sequence was collected for it.
      if (seq != "")
        emit()
      head = $0
      next
    }

    { seq = seq $0 }

    END {
      if (head != "" || seq != "")
        emit()
    }
  ' "$@"
}
#
# Main pipeline: for the 12S and then the 16S rRNA, extract the reference
# sequences from the selected Metazoa entries, dereplicate them, cluster
# and align them, and copy the resulting directory below RRNA_DATA_DIR.
#
# pushTmpDir, popTmpDir, loginfo, openLogFile and fastaCount, as well as
# PROG_DIR and RRNA_DATA_DIR, are not defined in this script; they are
# presumably provided by the sourced bash_init.sh -- verify there.
#
pushTmpDir ORG.buildRRNAMito

loginfo "Tempdir: $(pwd)"

openLogFile "${RRNA_DATA_DIR}/rRNA_mito_models.log"

loginfo "Selecting Metazoa genebank entries..."
# "$@" forwards every script argument unchanged, even those with spaces.
METAZOA=$("${PROG_DIR}/../../normalize/tools/selectMetazoa.sh" "$@")
loginfo " --> $(echo ${METAZOA} | wc -w) entries selected"
loginfo "Done"

loginfo "Extracting 12S rRNA sequences..."
rm -f raw_12S.fasta
# METAZOA is a whitespace separated list: it is left unquoted on purpose.
for f in ${METAZOA}; do
  loginfo "Extracting 12S rRNA sequences from ${f}..."
  "${PROG_DIR}/extract_ref12S.sh" "${f}" >> raw_12S.fasta
done
loginfo " --> $(fastaCount raw_12S.fasta) retreived sequences"
# Truncate rather than append, as done for 16S below, so that a 12S.fasta
# left over from a previous run can never be mixed into the result.
dereplicate raw_12S.fasta > 12S.fasta
loginfo " --> $(fastaCount 12S.fasta) distinct sequences"
loginfo "Done"

loginfo "Clustering 12S rRNA sequences..."
clustering 12S
loginfo "Done"

loginfo "Installing 12S rRNA sequences..."
cp -r 12S "${RRNA_DATA_DIR}/RRNA_12S_mito"
loginfo "Done"

loginfo "Extracting 16S rRNA sequences..."
rm -f raw_16S.fasta
for f in ${METAZOA}; do
  "${PROG_DIR}/extract_ref16S.sh" "${f}" >> raw_16S.fasta
done
loginfo " --> $(fastaCount raw_16S.fasta) retreived sequences"
dereplicate raw_16S.fasta > 16S.fasta
loginfo " --> $(fastaCount 16S.fasta) distinct sequences"
loginfo "Done"

loginfo "Clustering 16S rRNA sequences..."
clustering 16S
loginfo "Done"

loginfo "Installing 16S rRNA sequences..."
cp -r 16S "${RRNA_DATA_DIR}/RRNA_16S_mito"
loginfo "Done"

popTmpDir

View File

@ -0,0 +1,55 @@
#!/bin/bash
#
# -- CAUTION -- Works as long as the script
#               is not called through a symlink
#
# Locate the directory holding this script and load the shared bash helpers
# relative to it. The script path is quoted so that an installation
# directory containing spaces does not break the lookup.
THIS_DIR="$(dirname -- "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
#
# Extract the 12S rRNA sequence of every entry of the GenBank flat files
# given as arguments and print it in fasta format on stdout.
#
# How the awk program works:
#   - a LOCUS line starts a new entry: its second field is kept as AC and
#     the per-entry state (sequence, seqon, FROM, TO) is reset;
#   - an rRNA feature line stores its second field (the location) in LOCUS
#     with STRAND=1; when the line also contains "complement", STRAND is
#     set to 0 and the complement( ) wrapper is stripped from the location;
#   - any line matching 12S splits the stored location on "." and takes
#     FROM and TO from its first and third parts;
#   - after the ORIGIN line, fields 2 to 7 of each sequence line are
#     concatenated and every character other than a, c, g, t becomes n;
#   - at the // terminator, if FROM is set, the header
#       >RRNA12S_<AC> Strand=<STRAND>; cut=<FROM>..<TO>; seq_length=<LENGTH>;
#     is printed, followed by the selected region wrapped at 60 characters
#     per line and reverse-complemented when STRAND is 0.
#
# NOTE(review): the /12S/ rule is not restricted to the qualifiers of the
#   current rRNA feature; any later line of the entry containing 12S
#   re-reads the most recently stored location -- confirm this is intended.
# NOTE(review): the whitespace inside the regular expressions is kept
#   exactly as found in this file -- confirm it matches the GenBank layout.
#
# The input files are passed as "$@" so that names with spaces survive.
#
$AwkCmd 'function printfasta(seq) { \
    seqlen=length(seq); \
    for (i=1; i <= seqlen; i+=60) \
        print substr(seq,i,60); \
} \
function comp(seq) { \
    "echo "seq" | tr acgtACGT tgcaTGCA " | getline res; \
    close("echo "seq" | tr acgtACGT tgcaTGCA "); \
    return res; \
} \
function rev(seq) { \
    "echo "seq" | rev " | getline res; \
    close("echo "seq" | rev "); \
    return res; \
} \
function revcomp(seq) { \
    res=rev(comp(seq)); \
    return res; \
} \
\
/^LOCUS / {AC=$2; sequence=""; seqon=0; FROM="";TO=""} \
/^ rRNA / {LOCUS=$2; STRAND=1} \
/^ rRNA / && /complement/ {STRAND=0; \
    sub("complement\\(","",LOCUS); \
    sub("\\)","",LOCUS); \
} \
/12S/ {split(LOCUS,POS,"."); \
    FROM=POS[1]; \
    TO=POS[3]; \
    LENGTH=TO-FROM+1 \
} \
/^ORIGIN/ {seqon=1} \
/^ *[1-9][0-9]* [a-z ]+$/ && seqon {seq=$2 $3 $4 $5 $6 $7; \
    gsub("[^acgt]","n",seq);\
    sequence=sequence seq \
} \
/^\/\// && FROM \
{print ">RRNA12S_"AC" Strand="STRAND";", \
    "cut="FROM".."TO";", \
    "seq_length="LENGTH";"; \
    SS=substr(sequence,FROM,LENGTH); \
    if (! STRAND) \
        SS=revcomp(SS); \
    printfasta(SS); \
} \
' "$@"

View File

@ -2,7 +2,12 @@
#
gawk 'function printfasta(seq) { \
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -2,7 +2,12 @@
#
gawk 'function printfasta(seq) { \
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -1,8 +1,12 @@
#!/bin/bash
#
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
gawk 'function printfasta(seq) { \
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -2,7 +2,12 @@
#
gawk 'function printfasta(seq) { \
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -11,7 +11,7 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh"
function revcomp {
gawk 'function printfasta(seq) { \
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -0,0 +1,29 @@
#!/bin/bash
#
# splitgbk.sh:
# Split a gbk file in multiple files
# each containing a single sequence
#
#========================================================================================
# -- CAUTION -- Works as long as the script
#               is not called through a symlink
#
# Locate the directory holding this script and load the shared bash helpers
# relative to it. The script path is quoted so that an installation
# directory containing spaces does not break the lookup.
THIS_DIR="$(dirname -- "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
#######################################
# Split a GenBank flat file into one file per entry.
# Entries are written to <dir>/<stem>/<AC>.gbk, where <dir> is the
# directory of the input file, <stem> is its base name cut at the first
# dot, and <AC> is the second field of the LOCUS line of the entry.
# Output files are opened in append mode.
# Globals:   AwkCmd (read)
# Arguments: $1 - GenBank file to split
# Returns:   non-zero when the argument is missing, when no destination
#            name can be derived from it, or when mkdir/awk fail
#######################################
split_gbk() {
  local inputfile=${1:-}
  local base stem dest

  if [[ -z "$inputfile" ]]; then
    printf 'Usage: %s <file.gbk>\n' "${0##*/}" >&2
    return 1
  fi

  # Cut the extension(s) on the base name only: dots in the directory part
  # (./file.gbk, ../file.gbk, my.dir/file.gbk) must not truncate the path.
  base=${inputfile##*/}
  stem=${base%%.*}
  if [[ -z "$stem" ]]; then
    printf '%s: cannot derive a destination directory from %s\n' \
      "${0##*/}" "$inputfile" >&2
    return 1
  fi

  if [[ "$inputfile" == */* ]]; then
    dest="${inputfile%/*}/${stem}"
  else
    dest="$stem"
  fi

  mkdir -p -- "$dest" || return

  $AwkCmd -v dest="$dest" '
    /^LOCUS/ {
      AC = $2
      destfile = sprintf("%s/%s.gbk", dest, AC)
    }
    # Lines found before the first LOCUS line have no destination: skip them.
    destfile != "" {
      print $0 >> destfile
    }
    /^\/\// {
      if (destfile != "")
        close(destfile)
    }
  ' "$inputfile"
}

split_gbk "$@"