A new set of cleaned proteins for the CDS detector, prepared using the
clusterizecore.sh script from the detectors/cds/lib folder. The CDS detector has been modified to use the clean.fst files. Former-commit-id: e30a53b5b6b658388af4b2640b30e6765c729894 Former-commit-id: 3015ad50d25248fb117ab00e816b00fde1f9ba1d
This commit is contained in:
@ -31,7 +31,7 @@ set GenoName = `basename $GenoFile:r`
|
||||
|
||||
set ProtFile = $Argv[1]; Shift
|
||||
set ProtDir = `dirname $ProtFile`
|
||||
set ProtName = `basename $ProtFile:r`
|
||||
set ProtName = `basename $ProtFile | $AwkCmd -F'.' '{print $1}'`
|
||||
set ProtType = `basename $ProtDir`
|
||||
|
||||
NeedFile $GenoFile
|
||||
@ -122,7 +122,7 @@ endif
|
||||
|
||||
if ($PASS1_SPEEDUP != 0) then
|
||||
|
||||
tcsh -f $PROG_DIR/do_filterbx.sh $GenoFile $ProtFile \
|
||||
tcsh -f $PROG_DIR/do_filterbx.csh $GenoFile $ProtFile \
|
||||
$PASS1_BLASTX_FILTER_IDMIN \
|
||||
$PASS1_BLASTX_FILTER_NBMIN \
|
||||
$PASS1_BLASTX_FILTER_NBMAX > D_$$
|
@ -55,10 +55,10 @@ endif
|
||||
|
||||
foreach dir ("core" "shell" "dust")
|
||||
if (-d $DbRoot/$dir) then
|
||||
set fams = `ls $DbRoot/$dir/*.fst`
|
||||
set fams = `ls $DbRoot/$dir/*.clean.fst`
|
||||
Notify "running pass1:$dir exonerate of $Genome on $DbRoot"
|
||||
foreach f ($fams)
|
||||
tcsh -f $PROG_DIR/do_exonerate.sh $Fasta $f $DbRoot/models $temp
|
||||
tcsh -f $PROG_DIR/do_exonerate.csh $Fasta $f $DbRoot/models $temp
|
||||
end
|
||||
endif
|
||||
end
|
@ -3,6 +3,8 @@
|
||||
#
|
||||
# -v MAX_SPAN ALLOW_STOP EXCLUDE
|
||||
#
|
||||
#
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
PROCINFO["sorted_in"] = "@ind_num_asc"
|
||||
|
@ -6,6 +6,9 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh"
|
||||
|
||||
CORELIB="${CDS_DATA_DIR}/chlorodb/core"
|
||||
|
||||
CDHIT_ID=0.7
|
||||
CDHIT_DELTA=0.8
|
||||
|
||||
|
||||
function clusterize() {
|
||||
|
||||
@ -17,12 +20,15 @@ function clusterize() {
|
||||
rm -rf "${prot}"
|
||||
mkdir -p "${prot}"
|
||||
|
||||
cd-hit -i "${fastain}" \
|
||||
-o "${prot}/${cdhitout}" \
|
||||
-c 0.6 -G 1 \
|
||||
-g 1 -s 0.8 \
|
||||
-b 150 -p 1 \
|
||||
-d 0 -n 3
|
||||
cd-hit -i "${fastain}" \
|
||||
-o "${prot}/${cdhitout}" \
|
||||
-c ${CDHIT_DELTA} \
|
||||
-G 1 \
|
||||
-g 1 \
|
||||
-aL 0.95 \
|
||||
-s ${CDHIT_ID} \
|
||||
-b 350 -p 1 \
|
||||
-d 0 -n 3
|
||||
|
||||
fasta1line "${fastain}" > "${prot}/${fasta1}"
|
||||
|
||||
@ -35,6 +41,8 @@ function clusterize() {
|
||||
filename=prot".cluster."cluster".ids"; \
|
||||
print $3 >> filename ; \
|
||||
close(filename) }' "${cdhitout}.clstr"
|
||||
|
||||
rm -f "../$prot.clean.fst"
|
||||
|
||||
for ids in *.cluster.*.ids ; do
|
||||
cluster=$(printf "%03d" $(echo "${ids}" | $AwkCmd -F'.' '{print $3}'))
|
||||
@ -52,10 +60,10 @@ function clusterize() {
|
||||
egrep -v -- '^--$' > "$alignment"
|
||||
fi
|
||||
|
||||
if (( size >= 10 )) ; then
|
||||
if (( size >= 5 )) ; then
|
||||
egrep -f "$ids" -A 1 "${fasta1}" | \
|
||||
egrep -v -- '^--$' | \
|
||||
formatfasta >> "$prot.clean.fst"
|
||||
formatfasta >> "../$prot.clean.fst"
|
||||
fi
|
||||
|
||||
rm -f "$ids"
|
||||
@ -73,6 +81,8 @@ function clusterize() {
|
||||
|
||||
pushd $CORELIB
|
||||
|
||||
rm -rf *.clean.fst
|
||||
|
||||
for prot in *.fst ; do
|
||||
prot="${prot/.fst/}"
|
||||
clusterize "$prot"
|
||||
|
Reference in New Issue
Block a user