Switch to a swissprot based reference database for CDS annotation

Former-commit-id: 3da31ce8a135394ecac041291134d61f11f06d8f
Former-commit-id: 406f41a7cb2db14ea832480b86f72a11d3b0ab4a
This commit is contained in:
2022-02-16 22:50:17 +01:00
parent 90b3ee9b04
commit 831669433e
644 changed files with 25433 additions and 485597 deletions

View File

@@ -6,10 +6,13 @@
#
# Annotate CDS using exonerate
#
# do_exonerate.sh <FASTAGENOM> <FASTAPROT> [<OUTDIR>]
# do_exonerate.sh <PASS> <FASTAGENOM> <FASTAPROT> <ANNOTFILE> <MODELDIR> [<OUTDIR>]
#
# - <PASS> : the pass running exonarate
# - <FASTAGENOM> : The fasta file containing the genome to annotate
# - <FASTAPROT> : The fasta file containing the protein family
# - <ANNOTFILE> : The annotation file used to add product info
# - <MODELDIR> : Directory containing model parameters for exonerate
#
# Results are in file : `basename <FASTAGENOM>:r`.`basename <FASTAPROT>:r`.res
#
@@ -26,17 +29,21 @@ alias Override 'if (-e \!:2) set \!:1 = \!:2'
NeedArg 2
set Pass = $Argv[1]; Shift
set GenoFile = $Argv[1]; Shift
set GenoName = `basename $GenoFile:r`
set ProtFile = $Argv[1]; Shift
set ProtDir = `dirname $ProtFile`
set DBDir = `dirname $ProtDir`
set ProtName = `basename $ProtFile | $AwkCmd -F'.' '{print $1}'`
set ProtType = `basename $ProtDir`
set AnnotFile = $Argv[1]; Shift
NeedFile $GenoFile
NeedFile $ProtFile
NeedFile $ProtDir/Annot.lst
NeedFile $AnnotFile
set ModelsDir = $PROG_DIR/../models
if ($#Argv > 0) then
@@ -188,7 +195,7 @@ if ( -z $base.exo.best) then
$AwkCmd -v MAX_SPAN=$PASS1_MAX_SPAN \
-v ALLOW_STOP=1 \
-v EXCLUDE=$GenoName \
-f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best
-f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best
endif
endif
@@ -196,7 +203,16 @@ endif
# get annotations
#
egrep "^$ProtName " $ProtDir/Annot.lst | $AwkCmd '{print "c annot", $0}' > T_$$
set sp_match=`awk '/^e similarity/ {print $12}' $base.exo.best | head -1`
if ( ${%sp_match} > 0 ) then
set sp_ac=`awk -v sp="$sp_match" '($1 ~ sp) {sub(/SP_AC=/,"",$2); sub(/;$/,"",$2); print $2} ' $DbFile`
echo " " patch ac $sp_match to $sp_ac > /dev/stderr
sed "s/$sp_match/$sp_ac/g" $base.exo.best > T_$$
mv T_$$ $base.exo.best
endif
egrep "^$ProtName " $AnnotFile | $AwkCmd '{print "c annot", $0}' > T_$$
#
# extend start/stop
@@ -213,7 +229,7 @@ $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/extend.awk \
# translate
#
echo "c pass pass1 $ProtType" > $base.iff
echo "c pass $Pass $ProtType" > $base.iff
$AwkCmd -v FASTA=$GenoFile -f $LIB_DIR/libutil.awk \
-f $LIB_DIR/translate.awk T_$$ >> $base.iff