Switch to a swissprot based reference database for CDS annotation
Former-commit-id: 3da31ce8a135394ecac041291134d61f11f06d8f Former-commit-id: 406f41a7cb2db14ea832480b86f72a11d3b0ab4a
This commit is contained in:
@@ -6,10 +6,13 @@
|
||||
#
|
||||
# Annotate CDS using exonerate
|
||||
#
|
||||
# do_exonerate.sh <FASTAGENOM> <FASTAPROT> [<OUTDIR>]
|
||||
# do_exonerate.sh <PASS> <FASTAGENOM> <FASTAPROT> <ANNOTFILE> <MODELDIR> [<OUTDIR>]
|
||||
#
|
||||
# - <PASS> : the pass running exonerate
|
||||
# - <FASTAGENOM> : The fasta file containing the genome to annotate
|
||||
# - <FASTAPROT> : The fasta file containing the protein family
|
||||
# - <ANNOTFILE> : The annotation file used to add product info
|
||||
# - <MODELDIR> : Directory containing model parameters for exonerate
|
||||
#
|
||||
# Results are in file : `basename <FASTAGENOM>:r`.`basename <FASTAPROT>:r`.res
|
||||
#
|
||||
@@ -26,17 +29,21 @@ alias Override 'if (-e \!:2) set \!:1 = \!:2'
|
||||
|
||||
NeedArg 2
|
||||
|
||||
set Pass = $Argv[1]; Shift
|
||||
set GenoFile = $Argv[1]; Shift
|
||||
set GenoName = `basename $GenoFile:r`
|
||||
|
||||
set ProtFile = $Argv[1]; Shift
|
||||
set ProtDir = `dirname $ProtFile`
|
||||
set DBDir = `dirname $ProtDir`
|
||||
set ProtName = `basename $ProtFile | $AwkCmd -F'.' '{print $1}'`
|
||||
set ProtType = `basename $ProtDir`
|
||||
|
||||
set AnnotFile = $Argv[1]; Shift
|
||||
|
||||
NeedFile $GenoFile
|
||||
NeedFile $ProtFile
|
||||
NeedFile $ProtDir/Annot.lst
|
||||
NeedFile $AnnotFile
|
||||
|
||||
set ModelsDir = $PROG_DIR/../models
|
||||
if ($#Argv > 0) then
|
||||
@@ -188,7 +195,7 @@ if ( -z $base.exo.best) then
|
||||
$AwkCmd -v MAX_SPAN=$PASS1_MAX_SPAN \
|
||||
-v ALLOW_STOP=1 \
|
||||
-v EXCLUDE=$GenoName \
|
||||
-f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best
|
||||
-f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best
|
||||
endif
|
||||
endif
|
||||
|
||||
@@ -196,7 +203,16 @@ endif
|
||||
# get annotations
|
||||
#
|
||||
|
||||
egrep "^$ProtName " $ProtDir/Annot.lst | $AwkCmd '{print "c annot", $0}' > T_$$
|
||||
set sp_match=`awk '/^e similarity/ {print $12}' $base.exo.best | head -1`
|
||||
|
||||
if ( ${%sp_match} > 0 ) then
|
||||
set sp_ac=`awk -v sp="$sp_match" '($1 ~ sp) {sub(/SP_AC=/,"",$2); sub(/;$/,"",$2); print $2} ' $DbFile`
|
||||
echo " " patch ac $sp_match to $sp_ac > /dev/stderr
|
||||
sed "s/$sp_match/$sp_ac/g" $base.exo.best > T_$$
|
||||
mv T_$$ $base.exo.best
|
||||
endif
|
||||
|
||||
egrep "^$ProtName " $AnnotFile | $AwkCmd '{print "c annot", $0}' > T_$$
|
||||
|
||||
#
|
||||
# extend start/stop
|
||||
@@ -213,7 +229,7 @@ $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/extend.awk \
|
||||
# translate
|
||||
#
|
||||
|
||||
echo "c pass pass1 $ProtType" > $base.iff
|
||||
echo "c pass $Pass $ProtType" > $base.iff
|
||||
|
||||
$AwkCmd -v FASTA=$GenoFile -f $LIB_DIR/libutil.awk \
|
||||
-f $LIB_DIR/translate.awk T_$$ >> $base.iff
|
||||
|
@@ -37,10 +37,13 @@ else
|
||||
TEMP=""
|
||||
fi
|
||||
|
||||
DBROOT="$CDS_DATA_DIR/chlorodb/RPS12"
|
||||
RPS12DB="${DBROOT}/RPS12_DB.clean.fst"
|
||||
DBROOT="$CDS_DATA_DIR/sp_chlorodb/RPS12"
|
||||
RPS12DB="${DBROOT}/rps12.fst"
|
||||
DELTA=50
|
||||
|
||||
AnnotFile="$CDS_DATA_DIR/sp_chlorodb/Annot.lst"
|
||||
ModelsDir="$CDS_DATA_DIR/sp_chlorodb/models"
|
||||
|
||||
SEQLEN=$(seqlength "${QUERY}")
|
||||
SEQUENCE=$(readfirstfastaseq "${QUERY}")
|
||||
|
||||
@@ -61,7 +64,9 @@ blastx \
|
||||
BEGIN {BEST_EVAL = 1e-40;
|
||||
OUT = 0}
|
||||
/^#/ {next}
|
||||
($2 == PREV_CDS) { HSPs = HSPs "\n" $0;}
|
||||
($2 == PREV_CDS) && (($11 + 0.0) < (1e-5 + 0.0)) {
|
||||
HSPs = HSPs "\n" $0;
|
||||
}
|
||||
|
||||
(OUT < 20) && ($2 != PREV_CDS) && (BEST_EVAL < (1e-20 + 0.0)) {
|
||||
if (PREV_CDS) print HSPs;
|
||||
@@ -75,6 +80,7 @@ blastx \
|
||||
(BEST_EVAL > ($11 + 0.0)) {BEST_EVAL = ($11 + 0.0)}
|
||||
' > "rps12_locate.hsps"
|
||||
|
||||
|
||||
#
|
||||
# Extracting protein ids from selected blast HSPs
|
||||
#
|
||||
@@ -83,7 +89,6 @@ blastx \
|
||||
| sort \
|
||||
| uniq > "dbsel.txt"
|
||||
|
||||
|
||||
#
|
||||
# Extract corresponding protein sequences
|
||||
# from the RPS12 database.
|
||||
@@ -134,7 +139,7 @@ blastx \
|
||||
}
|
||||
}
|
||||
' \
|
||||
| sort -nk 3 \
|
||||
| sort -nk 3 \
|
||||
| $AwkCmd '($3 != old3 || $4 != old4) {
|
||||
i++
|
||||
old3=$3
|
||||
@@ -262,13 +267,14 @@ blastx \
|
||||
# It should be one or two fragments
|
||||
#
|
||||
export PASS1_SPEEDUP=0
|
||||
cp $DBROOT/Annot.lst RPS12
|
||||
nbseq = 0
|
||||
nbseq=0
|
||||
for fasta in rps12_fragments_*.fasta ; do
|
||||
tcsh -f ${PROG_DIR}/do_exonerate.csh \
|
||||
Pass2 \
|
||||
$fasta \
|
||||
"RPS12/rps12.fasta" \
|
||||
$DBROOT/../models $(pwd)
|
||||
$AnnotFile \
|
||||
$ModelsDir $(pwd)
|
||||
((nbseq=nbseq+1))
|
||||
done
|
||||
|
||||
|
@@ -35,19 +35,22 @@ Genome=$(basename ${Fasta%.*})
|
||||
# DbRoot is set to its default value unless
|
||||
# the second argument specifies another DbRoot
|
||||
|
||||
DbRoot="$CDS_DATA_DIR/chlorodb"
|
||||
DbRoot="$CDS_DATA_DIR/sp_chlorodb"
|
||||
|
||||
if (( $# > 0)) ; then
|
||||
DbRoot="$1"; Shift
|
||||
fi
|
||||
|
||||
AnnotFile="$DbRoot/Annot.lst"
|
||||
|
||||
needdir $DbRoot
|
||||
needdir $DbRoot/core
|
||||
needfile $DbRoot/core/Annot.lst
|
||||
needfile $AnnotFile
|
||||
needdir $DbRoot/models
|
||||
|
||||
assignundef cdsdetection_pass1 yes
|
||||
assignundef cdsdetection_pass2 yes
|
||||
assignundef cdsdetection_pass3 yes
|
||||
|
||||
temp=$(mktempdir $(hostname))
|
||||
|
||||
@@ -63,33 +66,47 @@ Fasta="$temp/genome.fasta"
|
||||
#
|
||||
|
||||
if [[ "$cdsdetection_pass1" == "yes" ]] ; then
|
||||
for dir in "core" "shell" "dust" ; do
|
||||
for dir in "core" ; do
|
||||
if [[ -d $DbRoot/$dir ]] ; then
|
||||
fams=$(ls $DbRoot/$dir/*.clean.fst)
|
||||
fams=$(ls $DbRoot/$dir/*.fst)
|
||||
loginfo "running pass1:$dir exonerate of $Genome on $DbRoot"
|
||||
for f in $fams ; do
|
||||
tcsh -f $PROG_DIR/do_exonerate.csh $Fasta $f $DbRoot/models $temp
|
||||
tcsh -f $PROG_DIR/do_exonerate.csh Pass1 $Fasta $f $AnnotFile $DbRoot/models $temp
|
||||
done
|
||||
fi
|
||||
done
|
||||
|
||||
cp $temp/genome.cds.fasta $Genome.cds.fasta
|
||||
|
||||
mv $temp/genome.cds.fasta $Genome.cds_pass1.fasta
|
||||
fi
|
||||
|
||||
|
||||
#
|
||||
# pass2: transsplicing
|
||||
# pass2: RPS12 gene with transsplicing
|
||||
#
|
||||
|
||||
if [[ "$cdsdetection_pass2" == "yes" ]] ; then
|
||||
loginfo "running pass2:rps12 exonerate of $Genome on $DbRoot"
|
||||
$PROG_DIR/do_rps12.sh $Fasta $temp
|
||||
fi
|
||||
|
||||
#
|
||||
# pass3: prokov
|
||||
# pass3: run exonerate on shell and dust
|
||||
#
|
||||
|
||||
if [[ "$cdsdetection_pass3" == "yes" ]] ; then
|
||||
for dir in "shell" ; do
|
||||
if [[ -d $DbRoot/$dir ]] ; then
|
||||
fams=$(ls $DbRoot/$dir/*.fst)
|
||||
loginfo $fams
|
||||
loginfo "running pass3:$dir exonerate of $Genome on $DbRoot"
|
||||
for f in $fams ; do
|
||||
tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
|
||||
done
|
||||
fi
|
||||
done
|
||||
mv $temp/genome.cds.fasta $Genome.cds_pass2.fasta
|
||||
fi
|
||||
|
||||
# $PROG_DIR/do_prokov.sh $Fasta $Genome.cds.fasta $temp
|
||||
|
||||
#
|
||||
|
Reference in New Issue
Block a user