A new set of cleaned proteins for the CDS detector, prepared using the
clusterizecore.sh script from the detectors/cds/lib folder. The CDS detector has been modified to use the clean.fst files. Former-commit-id: e30a53b5b6b658388af4b2640b30e6765c729894 Former-commit-id: 3015ad50d25248fb117ab00e816b00fde1f9ba1d
This commit is contained in:
@ -3,6 +3,8 @@
|
||||
#
|
||||
# -v MAX_SPAN ALLOW_STOP EXCLUDE
|
||||
#
|
||||
#
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
PROCINFO["sorted_in"] = "@ind_num_asc"
|
||||
|
@ -6,6 +6,9 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh"
|
||||
|
||||
CORELIB="${CDS_DATA_DIR}/chlorodb/core"
|
||||
|
||||
CDHIT_ID=0.7
|
||||
CDHIT_DELTA=0.8
|
||||
|
||||
|
||||
function clusterize() {
|
||||
|
||||
@ -17,12 +20,15 @@ function clusterize() {
|
||||
rm -rf "${prot}"
|
||||
mkdir -p "${prot}"
|
||||
|
||||
cd-hit -i "${fastain}" \
|
||||
-o "${prot}/${cdhitout}" \
|
||||
-c 0.6 -G 1 \
|
||||
-g 1 -s 0.8 \
|
||||
-b 150 -p 1 \
|
||||
-d 0 -n 3
|
||||
cd-hit -i "${fastain}" \
|
||||
-o "${prot}/${cdhitout}" \
|
||||
-c ${CDHIT_DELTA} \
|
||||
-G 1 \
|
||||
-g 1 \
|
||||
-aL 0.95 \
|
||||
-s ${CDHIT_ID} \
|
||||
-b 350 -p 1 \
|
||||
-d 0 -n 3
|
||||
|
||||
fasta1line "${fastain}" > "${prot}/${fasta1}"
|
||||
|
||||
@ -35,6 +41,8 @@ function clusterize() {
|
||||
filename=prot".cluster."cluster".ids"; \
|
||||
print $3 >> filename ; \
|
||||
close(filename) }' "${cdhitout}.clstr"
|
||||
|
||||
rm -f "../$prot.clean.fst"
|
||||
|
||||
for ids in *.cluster.*.ids ; do
|
||||
cluster=$(printf "%03d" $(echo "${ids}" | $AwkCmd -F'.' '{print $3}'))
|
||||
@ -52,10 +60,10 @@ function clusterize() {
|
||||
egrep -v -- '^--$' > "$alignment"
|
||||
fi
|
||||
|
||||
if (( size >= 10 )) ; then
|
||||
if (( size >= 5 )) ; then
|
||||
egrep -f "$ids" -A 1 "${fasta1}" | \
|
||||
egrep -v -- '^--$' | \
|
||||
formatfasta >> "$prot.clean.fst"
|
||||
formatfasta >> "../$prot.clean.fst"
|
||||
fi
|
||||
|
||||
rm -f "$ids"
|
||||
@ -73,6 +81,8 @@ function clusterize() {
|
||||
|
||||
pushd $CORELIB
|
||||
|
||||
rm -rf *.clean.fst
|
||||
|
||||
for prot in *.fst ; do
|
||||
prot="${prot/.fst/}"
|
||||
clusterize "$prot"
|
||||
|
Reference in New Issue
Block a user