A new set of cleaned protein sequences for the CDS detector, prepared using the

clusterizecore.sh script from the detectors/cds/lib folder.

The CDS detector has been modified to use the *.clean.fst files.


Former-commit-id: e30a53b5b6b658388af4b2640b30e6765c729894
Former-commit-id: 3015ad50d25248fb117ab00e816b00fde1f9ba1d
This commit is contained in:
2016-10-05 08:01:30 -03:00
parent 3a8860aaf7
commit d4da1d01fd
86 changed files with 202819 additions and 14 deletions

View File

@ -31,7 +31,7 @@ set GenoName = `basename $GenoFile:r`
set ProtFile = $Argv[1]; Shift
set ProtDir = `dirname $ProtFile`
set ProtName = `basename $ProtFile:r`
set ProtName = `basename $ProtFile | $AwkCmd -F'.' '{print $1}'`
set ProtType = `basename $ProtDir`
NeedFile $GenoFile
@ -122,7 +122,7 @@ endif
if ($PASS1_SPEEDUP != 0) then
tcsh -f $PROG_DIR/do_filterbx.sh $GenoFile $ProtFile \
tcsh -f $PROG_DIR/do_filterbx.csh $GenoFile $ProtFile \
$PASS1_BLASTX_FILTER_IDMIN \
$PASS1_BLASTX_FILTER_NBMIN \
$PASS1_BLASTX_FILTER_NBMAX > D_$$

View File

@ -55,10 +55,10 @@ endif
foreach dir ("core" "shell" "dust")
if (-d $DbRoot/$dir) then
set fams = `ls $DbRoot/$dir/*.fst`
set fams = `ls $DbRoot/$dir/*.clean.fst`
Notify "running pass1:$dir exonerate of $Genome on $DbRoot"
foreach f ($fams)
tcsh -f $PROG_DIR/do_exonerate.sh $Fasta $f $DbRoot/models $temp
tcsh -f $PROG_DIR/do_exonerate.csh $Fasta $f $DbRoot/models $temp
end
endif
end

View File

@ -3,6 +3,8 @@
#
# -v MAX_SPAN ALLOW_STOP EXCLUDE
#
#
#
BEGIN {
PROCINFO["sorted_in"] = "@ind_num_asc"

View File

@ -6,6 +6,9 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh"
CORELIB="${CDS_DATA_DIR}/chlorodb/core"
CDHIT_ID=0.7
CDHIT_DELTA=0.8
function clusterize() {
@ -17,12 +20,15 @@ function clusterize() {
rm -rf "${prot}"
mkdir -p "${prot}"
cd-hit -i "${fastain}" \
-o "${prot}/${cdhitout}" \
-c 0.6 -G 1 \
-g 1 -s 0.8 \
-b 150 -p 1 \
-d 0 -n 3
cd-hit -i "${fastain}" \
-o "${prot}/${cdhitout}" \
-c ${CDHIT_DELTA} \
-G 1 \
-g 1 \
-aL 0.95 \
-s ${CDHIT_ID} \
-b 350 -p 1 \
-d 0 -n 3
fasta1line "${fastain}" > "${prot}/${fasta1}"
@ -35,6 +41,8 @@ function clusterize() {
filename=prot".cluster."cluster".ids"; \
print $3 >> filename ; \
close(filename) }' "${cdhitout}.clstr"
rm -f "../$prot.clean.fst"
for ids in *.cluster.*.ids ; do
cluster=$(printf "%03d" $(echo "${ids}" | $AwkCmd -F'.' '{print $3}'))
@ -52,10 +60,10 @@ function clusterize() {
egrep -v -- '^--$' > "$alignment"
fi
if (( size >= 10 )) ; then
if (( size >= 5 )) ; then
egrep -f "$ids" -A 1 "${fasta1}" | \
egrep -v -- '^--$' | \
formatfasta >> "$prot.clean.fst"
formatfasta >> "../$prot.clean.fst"
fi
rm -f "$ids"
@ -73,6 +81,8 @@ function clusterize() {
pushd $CORELIB
rm -rf *.clean.fst
for prot in *.fst ; do
prot="${prot/.fst/}"
clusterize "$prot"