Change setup for the blast filtering before exonerate

Former-commit-id: 139685eca58c1fb2272854dee31de3821c54af80
Former-commit-id: dc5c345ce72e9895cbdcc3321499b869040a24da
This commit is contained in:
2022-02-17 18:41:27 +01:00
parent 3b584dbebd
commit 9d93a68b3a
3 changed files with 26 additions and 18 deletions

View File

@ -127,7 +127,10 @@ endif
# speedup exonerate
#
if ($PASS1_SPEEDUP != 0) then
set AvgLengh = `$AwkCmd '/>/ {nseq++} \\!/>/ {lseq+=length($0)} END {print int(lseq/nseq)}' $ProtFile`
Notify " Protein $ProtName has an average length of $AvgLengh"
if (($PASS1_SPEEDUP != 0) && ($AvgLengh > $PASS1_BLASTX_FILTER_MINLEN)) then
tcsh -f $PROG_DIR/do_filterbx.csh $GenoFile $ProtFile \
$PASS1_BLASTX_FILTER_IDMIN \
@ -135,20 +138,24 @@ if ($PASS1_SPEEDUP != 0) then
$PASS1_BLASTX_FILTER_NBMAX > D_$$
set n = `egrep "^>" D_$$ | wc -l`
if ($n > 0) then
if ($n >= $PASS1_SLOWDOWN) then
Notify " $n sequences kept"
set DbFile = D_$$
else
Notify " no sequence match"
if ($PASS1_SLOWDOWN != 0) then
Notify " reverting to original $ProtName"
if ($PASS1_SLOWDOWN > 0) then
Notify " not enought sequence match (only $n)"
Notify " reverting to complete $ProtName db"
set DbFile = $ProtFile
else
Notify " no blast match for $ProtName stop annotation"
echo "" > $base.exo.raw
goto parse
endif
endif
else
if ($AvgLengh <= $PASS1_BLASTX_FILTER_MINLEN) then
Notify " too short protein, reverting to original $ProtName"
endif
set DbFile = $ProtFile
endif
@ -207,7 +214,7 @@ set sp_match=`awk '/^e similarity/ {print $12}' $base.exo.best | head -1`
if ( ${%sp_match} > 0 ) then
set sp_ac=`awk -v sp="$sp_match" '($1 ~ sp) {sub(/SP_AC=/,"",$2); sub(/;$/,"",$2); print $2} ' $DbFile`
echo " " patch ac $sp_match to $sp_ac > /dev/stderr
Notify " for $ProtName patches swiss ID $sp_match to AC $sp_ac"
sed "s/$sp_match/$sp_ac/g" $base.exo.best > T_$$
mv T_$$ $base.exo.best
endif

View File

@ -66,15 +66,15 @@ Fasta="$temp/genome.fasta"
#
if [[ "$cdsdetection_pass1" == "yes" ]] ; then
for dir in "core" ; do
( for dir in "core" ; do
if [[ -d $DbRoot/$dir ]] ; then
fams=$(ls $DbRoot/$dir/*.fst)
loginfo "running pass1:$dir exonerate of $Genome on $DbRoot"
for f in $fams ; do
tcsh -f $PROG_DIR/do_exonerate.csh Pass1 $Fasta $f $AnnotFile $DbRoot/models $temp
echo tcsh -f $PROG_DIR/do_exonerate.csh Pass1 $Fasta $f $AnnotFile $DbRoot/models $temp
done
fi
done
done ) | parallel -j 8
mv $temp/genome.cds.fasta $Genome.cds_pass1.fasta
fi
@ -94,16 +94,16 @@ fi
#
if [[ "$cdsdetection_pass3" == "yes" ]] ; then
for dir in "shell" ; do
(for dir in "shell" ; do
if [[ -d $DbRoot/$dir ]] ; then
fams=$(ls $DbRoot/$dir/*.fst)
loginfo $fams
loginfo "running pass3:$dir exonerate of $Genome on $DbRoot"
for f in $fams ; do
tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
echo tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
done
fi
done
done) | parallel -j 8
mv $temp/genome.cds.fasta $Genome.cds_pass2.fasta
fi

View File

@ -15,13 +15,14 @@ AssignUndef TMP_CLEANUP 1
# pass1: exonerate speedup
#
AssignUndef PASS1_SPEEDUP 1
AssignUndef PASS1_SLOWDOWN 0
AssignUndef PASS1_SLOWDOWN 5
AssignUndef PASS1_BLASTX_FILTER_IDMIN 80
AssignUndef PASS1_BLASTX_FILTER_NBMIN 20
AssignUndef PASS1_BLASTX_FILTER_NBMAX 50
AssignUndef PASS1_BLASTX_FILTER_COMIN 0.8
AssignUndef PASS1_BLASTX_FILTER_GCODE 11
AssignUndef PASS1_BLASTX_FILTER_MINLEN 100
AssignUndef PASS1_BLASTX_FILTER_IDMIN 50
AssignUndef PASS1_BLASTX_FILTER_NBMIN 20
AssignUndef PASS1_BLASTX_FILTER_NBMAX 20
AssignUndef PASS1_BLASTX_FILTER_COMIN 0.8
AssignUndef PASS1_BLASTX_FILTER_GCODE 11
#
# pass1: exonerate parameters