Switch to a SwissProt-based reference database for CDS annotation

Former-commit-id: 3da31ce8a135394ecac041291134d61f11f06d8f
Former-commit-id: 406f41a7cb2db14ea832480b86f72a11d3b0ab4a
This commit is contained in:
2022-02-16 22:50:17 +01:00
parent 90b3ee9b04
commit 831669433e
644 changed files with 25433 additions and 485597 deletions

View File

@@ -6,10 +6,13 @@
#
# Annotate CDS using exonerate
#
# do_exonerate.sh <FASTAGENOM> <FASTAPROT> [<OUTDIR>]
# do_exonerate.sh <PASS> <FASTAGENOM> <FASTAPROT> <ANNOTFILE> <MODELDIR> [<OUTDIR>]
#
# - <PASS> : the pass running exonerate
# - <FASTAGENOM> : The fasta file containing the genome to annotate
# - <FASTAPROT> : The fasta file containing the protein family
# - <ANNOTFILE> : The annotation file used to add product info
# - <MODELDIR> : Directory containing model parameters for exonerate
#
# Results are in file : `basename <FASTAGENOM>:r`.`basename <FASTAPROT>:r`.res
#
@@ -26,17 +29,21 @@ alias Override 'if (-e \!:2) set \!:1 = \!:2'
NeedArg 2
set Pass = $Argv[1]; Shift
set GenoFile = $Argv[1]; Shift
set GenoName = `basename $GenoFile:r`
set ProtFile = $Argv[1]; Shift
set ProtDir = `dirname $ProtFile`
set DBDir = `dirname $ProtDir`
set ProtName = `basename $ProtFile | $AwkCmd -F'.' '{print $1}'`
set ProtType = `basename $ProtDir`
set AnnotFile = $Argv[1]; Shift
NeedFile $GenoFile
NeedFile $ProtFile
NeedFile $ProtDir/Annot.lst
NeedFile $AnnotFile
set ModelsDir = $PROG_DIR/../models
if ($#Argv > 0) then
@@ -188,7 +195,7 @@ if ( -z $base.exo.best) then
$AwkCmd -v MAX_SPAN=$PASS1_MAX_SPAN \
-v ALLOW_STOP=1 \
-v EXCLUDE=$GenoName \
-f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best
-f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best
endif
endif
@@ -196,7 +203,16 @@ endif
# get annotations
#
egrep "^$ProtName " $ProtDir/Annot.lst | $AwkCmd '{print "c annot", $0}' > T_$$
set sp_match=`awk '/^e similarity/ {print $12}' $base.exo.best | head -1`
if ( ${%sp_match} > 0 ) then
set sp_ac=`awk -v sp="$sp_match" '($1 ~ sp) {sub(/SP_AC=/,"",$2); sub(/;$/,"",$2); print $2} ' $DbFile`
echo " " patch ac $sp_match to $sp_ac > /dev/stderr
sed "s/$sp_match/$sp_ac/g" $base.exo.best > T_$$
mv T_$$ $base.exo.best
endif
egrep "^$ProtName " $AnnotFile | $AwkCmd '{print "c annot", $0}' > T_$$
#
# extend start/stop
@@ -213,7 +229,7 @@ $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/extend.awk \
# translate
#
echo "c pass pass1 $ProtType" > $base.iff
echo "c pass $Pass $ProtType" > $base.iff
$AwkCmd -v FASTA=$GenoFile -f $LIB_DIR/libutil.awk \
-f $LIB_DIR/translate.awk T_$$ >> $base.iff

View File

@@ -37,10 +37,13 @@ else
TEMP=""
fi
DBROOT="$CDS_DATA_DIR/chlorodb/RPS12"
RPS12DB="${DBROOT}/RPS12_DB.clean.fst"
DBROOT="$CDS_DATA_DIR/sp_chlorodb/RPS12"
RPS12DB="${DBROOT}/rps12.fst"
DELTA=50
AnnotFile="$CDS_DATA_DIR/sp_chlorodb/Annot.lst"
ModelsDir="$CDS_DATA_DIR/sp_chlorodb/models"
SEQLEN=$(seqlength "${QUERY}")
SEQUENCE=$(readfirstfastaseq "${QUERY}")
@@ -61,7 +64,9 @@ blastx \
BEGIN {BEST_EVAL = 1e-40;
OUT = 0}
/^#/ {next}
($2 == PREV_CDS) { HSPs = HSPs "\n" $0;}
($2 == PREV_CDS) && (($11 + 0.0) < (1e-5 + 0.0)) {
HSPs = HSPs "\n" $0;
}
(OUT < 20) && ($2 != PREV_CDS) && (BEST_EVAL < (1e-20 + 0.0)) {
if (PREV_CDS) print HSPs;
@@ -75,6 +80,7 @@ blastx \
(BEST_EVAL > ($11 + 0.0)) {BEST_EVAL = ($11 + 0.0)}
' > "rps12_locate.hsps"
#
# Extracting protein ids from selected blast HSPs
#
@@ -83,7 +89,6 @@ blastx \
| sort \
| uniq > "dbsel.txt"
#
# Extract corresponding protein sequences
# from the RPS12 database.
@@ -134,7 +139,7 @@ blastx \
}
}
' \
| sort -nk 3 \
| sort -nk 3 \
| $AwkCmd '($3 != old3 || $4 != old4) {
i++
old3=$3
@@ -262,13 +267,14 @@ blastx \
# It should be one or two fragments
#
export PASS1_SPEEDUP=0
cp $DBROOT/Annot.lst RPS12
nbseq = 0
nbseq=0
for fasta in rps12_fragments_*.fasta ; do
tcsh -f ${PROG_DIR}/do_exonerate.csh \
Pass2 \
$fasta \
"RPS12/rps12.fasta" \
$DBROOT/../models $(pwd)
$AnnotFile \
$ModelsDir $(pwd)
((nbseq=nbseq+1))
done

View File

@@ -35,19 +35,22 @@ Genome=$(basename ${Fasta%.*})
# DbRoot is set to its default value unless
# the second argument specifies another DbRoot
DbRoot="$CDS_DATA_DIR/chlorodb"
DbRoot="$CDS_DATA_DIR/sp_chlorodb"
if (( $# > 0)) ; then
DbRoot="$1"; Shift
fi
AnnotFile="$DbRoot/Annot.lst"
needdir $DbRoot
needdir $DbRoot/core
needfile $DbRoot/core/Annot.lst
needfile $AnnotFile
needdir $DbRoot/models
assignundef cdsdetection_pass1 yes
assignundef cdsdetection_pass2 yes
assignundef cdsdetection_pass3 yes
temp=$(mktempdir $(hostname))
@@ -63,33 +66,47 @@ Fasta="$temp/genome.fasta"
#
if [[ "$cdsdetection_pass1" == "yes" ]] ; then
for dir in "core" "shell" "dust" ; do
for dir in "core" ; do
if [[ -d $DbRoot/$dir ]] ; then
fams=$(ls $DbRoot/$dir/*.clean.fst)
fams=$(ls $DbRoot/$dir/*.fst)
loginfo "running pass1:$dir exonerate of $Genome on $DbRoot"
for f in $fams ; do
tcsh -f $PROG_DIR/do_exonerate.csh $Fasta $f $DbRoot/models $temp
tcsh -f $PROG_DIR/do_exonerate.csh Pass1 $Fasta $f $AnnotFile $DbRoot/models $temp
done
fi
done
cp $temp/genome.cds.fasta $Genome.cds.fasta
mv $temp/genome.cds.fasta $Genome.cds_pass1.fasta
fi
#
# pass2: transsplicing
# pass2: RPS12 gene with transsplicing
#
if [[ "$cdsdetection_pass2" == "yes" ]] ; then
loginfo "running pass2:rps12 exonerate of $Genome on $DbRoot"
$PROG_DIR/do_rps12.sh $Fasta $temp
fi
#
# pass3: prokov
# pass3: run exonerate on shell (dust is currently not processed)
#
if [[ "$cdsdetection_pass3" == "yes" ]] ; then
for dir in "shell" ; do
if [[ -d $DbRoot/$dir ]] ; then
fams=$(ls $DbRoot/$dir/*.fst)
loginfo $fams
loginfo "running pass3:$dir exonerate of $Genome on $DbRoot"
for f in $fams ; do
tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
done
fi
done
mv $temp/genome.cds.fasta $Genome.cds_pass2.fasta
fi
# $PROG_DIR/do_prokov.sh $Fasta $Genome.cds.fasta $temp
#