#!/bin/bash
#
# clustertizecore.sh -- cluster and clean up the core CDS protein database.
#
# Provenance (from the patch that introduced this script):
#   commit   20d0bcfbf8f0749e574e44abfba647f72f6bb213
#   author   Eric Coissac
#   date     Mon, 25 Apr 2016 23:41:18 +0200
#   subject  First trial to automatically clean up the core CDS database
#   Former-commit-id: dc61a61816084f385f1aa89324b08f81602b4353
#   Former-commit-id: ee8bf1a08e4af4f4d8d12a1e2a83c5f688e5f7e8
#   path     detectors/cds/lib/clustertizecore.sh (new file, mode 100755)
#
# For every <prot>.fst file in ${CDS_DATA_DIR}/chlorodb/core the script
# clusters the sequences with cd-hit and writes, in a directory named
# <prot>, one FASTA file per cluster plus <prot>.clean.fst, which gathers
# the sequences of every cluster holding at least 10 sequences.
#
# NOTE(review): CDS_DATA_DIR, AwkCmd, fasta1line and formatfasta are not
# defined in this file; they are presumably provided by
# scripts/bash_init.sh -- confirm.

THIS_DIR="$(dirname "${BASH_SOURCE[0]}")"

source "${THIS_DIR}/../../../scripts/bash_init.sh"

CORELIB="${CDS_DATA_DIR}/chlorodb/core"


#######################################
# Print the FASTA records whose name is listed in an id file.
# The name of a record is the first whitespace-delimited word of its
# header line, leading '>' included. Names are compared for strict
# equality, so characters such as '|' or '.' in an id have no special
# meaning and an id never matches a longer id sharing the same prefix.
# Globals:   AwkCmd (read)
# Arguments: $1 - file listing one '>name' per line
#            $2 - FASTA file to extract the records from
# Outputs:   the selected records on stdout
#######################################
function _cluster_records() {
  # $AwkCmd is deliberately left unquoted, as everywhere else in this
  # script: it may hold a command followed by options.
  $AwkCmd 'NR == FNR { wanted[$1] = 1; next }
           /^>/      { keep = ($1 in wanted) }
           keep' "$1" "$2"
}


#######################################
# Cluster the sequences of one protein family.
# Must be called from the directory holding <prot>.fst. The directory
# <prot> is deleted and recreated, then filled with:
#   <prot>.cluster.<size>.<cluster>.fst  one file per cluster, aligned
#                                        with clustalo when the cluster
#                                        holds more than one sequence
#   <prot>.clean.fst                     sequences of every cluster
#                                        holding at least 10 sequences
#   <prot>.cdhit.clstr                   cluster description from cd-hit
# Globals:   AwkCmd (read)
# Arguments: $1 - protein family name (input file is "$1.fst")
# Returns:   0 on success, 1 when the input file is missing or when the
#            working directory, cd-hit or fasta1line step fails
#######################################
function clusterize() {

  local prot="${1}"
  local fastain="${prot}.fst"
  local cdhitout="${prot}.cdhit"
  local fasta1="${prot}.1l.fst"
  local ids cluster size fsize alignment

  # Refuse to wipe previous results when there is nothing to cluster.
  if [[ -z "${prot}" || ! -f "${fastain}" ]] ; then
    echo "clusterize: input file '${fastain}' not found" >&2
    return 1
  fi

  rm -rf -- "${prot}"
  mkdir -p -- "${prot}" || return 1

  # NOTE(review): per the cd-hit user guide, '-d 0' makes the .clstr file
  # report each sequence name up to the first white space, which is what
  # the exact name matching below relies on -- confirm against the
  # installed cd-hit version.
  cd-hit -i "${fastain}" \
         -o "${prot}/${cdhitout}" \
         -c 0.6 -G 1 \
         -g 1 -s 0.8 \
         -b 150 -p 1 \
         -d 0 -n 3 || return 1

  # The FASTA records are later looked up in this reformatted copy.
  fasta1line "${fastain}" > "${prot}/${fasta1}" || return 1

  # Everything below removes files by relative name: never continue in
  # the wrong directory.
  pushd "${prot}" || return 1

  # The glob must stay unquoted to be expanded; -f keeps rm silent when
  # no id file is present.
  rm -f -- *.cluster.*.ids

  # Split the cd-hit cluster description into one id file per cluster.
  # Header lines ('>Cluster N') give the cluster number; on the other
  # lines the third field is the sequence name followed by '...'. Only
  # that three-dot suffix is removed, so a name ending with a dot stays
  # intact.
  $AwkCmd -v prot="${prot}" '
    /^>/   { cluster = $2 }
    ! /^>/ { sub(/\.\.\.$/, "", $3)
             filename = prot ".cluster." cluster ".ids"
             print $3 >> filename
             close(filename) }' "${cdhitout}.clstr"

  for ids in *.cluster.*.ids ; do
    # Without any id file the pattern is left unexpanded: skip it.
    [[ -e "${ids}" ]] || continue

    # The cluster number is the last dot-separated field before '.ids';
    # taking it from the end stays correct when ${prot} contains dots.
    cluster="${ids%.ids}"
    cluster="${cluster##*.}"
    printf -v cluster '%03d' "$(( 10#${cluster:-0} ))"

    size=$($AwkCmd 'END { print NR }' "${ids}")
    printf -v fsize '%05d' "${size}"

    alignment="${prot}.cluster.${fsize}.${cluster}.fst"

    if (( size > 1 )) ; then
      _cluster_records "${ids}" "${fasta1}" \
        | clustalo -i - > "${alignment}"
    else
      # A single sequence cannot be aligned: copy it as is.
      _cluster_records "${ids}" "${fasta1}" > "${alignment}"
    fi

    if (( size >= 10 )) ; then
      _cluster_records "${ids}" "${fasta1}" \
        | formatfasta >> "${prot}.clean.fst"
    fi

    rm -f -- "${ids}"

  done

  rm -f -- "${fasta1}" "${cdhitout}"

  popd

}



# Stop here when the database directory is unreachable: clusterize
# deletes directories relative to the current one.
pushd "${CORELIB}" || exit 1

for prot in *.fst ; do
  # Without any .fst file the pattern is left unexpanded: skip it.
  [[ -f "${prot}" ]] || continue

  # Strip the extension only when it is the suffix of the file name.
  prot="${prot%.fst}"
  clusterize "${prot}" \
    || echo "clustertizecore: clustering of '${prot}' failed" >&2
done

popd