go_pass1 moved to bin
Former-commit-id: 6be837df8518c9374c0a9de6ff717add6178d01b Former-commit-id: b2e3e8200b9e14447b7eeedd70191e10cf4b8a58
This commit is contained in:
@ -1,62 +0,0 @@
|
||||
#!/bin/csh -f
|
||||
#
|
||||
# filter a DB thru BlastX
|
||||
# usually to speedup further DB search
|
||||
#
|
||||
# output on stdout
|
||||
#
|
||||
# usage: go_filterbx.sh dna_fasta prot_fasta [idmin nbmin nbmax]
|
||||
#
|
||||
|
||||
setenv ORG_HOME `dirname $0`/../../..
|
||||
source $ORG_HOME/scripts/csh_init.sh
|
||||
|
||||
NeedArg 2
|
||||
|
||||
set GenoFile = $Argv[1]; Shift
|
||||
set ProtFile = $Argv[1]; Shift
|
||||
|
||||
NeedFile $GenoFile
|
||||
NeedFile $ProtFile
|
||||
|
||||
set IDMIN = 70
|
||||
set NBMIN = 50
|
||||
set NBMAX = 200
|
||||
|
||||
if ($#Argv >= 1) set IDMIN = $Argv[1]
|
||||
if ($#Argv >= 2) set NBMIN = $Argv[2]
|
||||
if ($#Argv >= 3) set NBMAX = $Argv[3]
|
||||
|
||||
#
|
||||
# format ProtFile
|
||||
#
|
||||
|
||||
if (! -e $ProtFile.phr) then
|
||||
Notify " formatting $ProtFile"
|
||||
(makeblastdb -dbtype prot -in $ProtFile) >& /dev/null
|
||||
CheckAbort 10 "makeblastdb failure"
|
||||
endif
|
||||
|
||||
#
|
||||
# blastx
|
||||
#
|
||||
|
||||
Notify " blasting $ProtFile"
|
||||
blastx -query $GenoFile -db $ProtFile -outfmt 7 |\
|
||||
$AwkCmd -v IDMIN=$IDMIN \
|
||||
-v NBMIN=$NBMIN \
|
||||
-v NBMAX=$NBMAX \
|
||||
-f $LIB_DIR/filterbx.awk > T_$$
|
||||
#
|
||||
# extract subdb
|
||||
#
|
||||
|
||||
$AwkCmd -v FILE=T_$$ -f $LIB_DIR/subdb.awk $ProtFile
|
||||
|
||||
#
|
||||
# end
|
||||
#
|
||||
|
||||
(\rm -f ?_$$) >> /dev/null
|
||||
|
||||
Exit 0
|
@ -1,197 +0,0 @@
|
||||
#!/bin/csh -f
|
||||
#
|
||||
# Annotate CDS - Pass1
|
||||
#
|
||||
#========================================================================================
|
||||
#
|
||||
# Annotate CDS of chlorodb/core proteins using exonerate
|
||||
#
|
||||
# pass1.sh <FASTAFILE> <FAMILY> [<OUTDIR>]
|
||||
#
|
||||
# - <FASTAFILE> : The fasta file containing the genome to annotate
|
||||
# - <FAMILY> : Name of the protein family (defined in chlorodb/core)
|
||||
#
|
||||
# Results are in file : `basename <FASTAFILE>:r`.<FAMILY>.res
|
||||
#
|
||||
#========================================================================================
|
||||
#
|
||||
# usage: go_pass1.sh fasta family [outdir]
|
||||
#
|
||||
|
||||
setenv ORG_HOME `dirname $0`/../../..
|
||||
source $ORG_HOME/scripts/csh_init.sh
|
||||
|
||||
set PARAMS_DIR = $LIB_DIR/../params
|
||||
set MODELS_DIR = $LIB_DIR/../models
|
||||
|
||||
alias Override 'if (-e \!:2) set \!:1 = \!:2'
|
||||
|
||||
NeedArg 2
|
||||
|
||||
set GenoFile = $Argv[1]
|
||||
set GenoName = `basename $GenoFile:r`
|
||||
set ProtName = $Argv[2]
|
||||
set ProtDir = $CDS_DATA_DIR/chlorodb/core
|
||||
set ProtFile = $ProtDir/$ProtName.fst
|
||||
|
||||
NeedFile $GenoFile
|
||||
NeedFile $ProtFile
|
||||
|
||||
set OutDir = .
|
||||
if ($#Argv >= 3) set OutDir = $3
|
||||
if (! -d $OutDir) mkdir $OutDir
|
||||
|
||||
#
|
||||
# general parameters
|
||||
#
|
||||
|
||||
source $PARAMS_DIR/default
|
||||
|
||||
#
|
||||
# family specific parameters
|
||||
#
|
||||
|
||||
if (-e $PARAMS_DIR/$ProtName) then
|
||||
source $PARAMS_DIR/$ProtName
|
||||
endif
|
||||
|
||||
#
|
||||
# start/stop/splice models
|
||||
#
|
||||
|
||||
if ($?STARTMODEL == 0) then
|
||||
set STARTMODEL = $MODELS_DIR/start.default.frq
|
||||
Override STARTMODEL $MODELS_DIR/start.$ProtName.frq
|
||||
endif
|
||||
|
||||
if ($?STOPMODEL == 0) then
|
||||
set STOPMODEL = $MODELS_DIR/stop.default.frq
|
||||
Override STOPMODEL $MODELS_DIR/stop.$ProtName.frq
|
||||
endif
|
||||
|
||||
if ($?SPLICE3MODEL == 0) then
|
||||
set SPLICE3MODEL = $MODELS_DIR/splice3.default.frq
|
||||
Override SPLICE3MODEL $MODELS_DIR/splice3.$ProtName.frq
|
||||
endif
|
||||
|
||||
if ($?SPLICE5MODEL == 0) then
|
||||
set SPLICE5MODEL = $MODELS_DIR/splice5.default.frq
|
||||
Override SPLICE5MODEL $MODELS_DIR/splice5.$ProtName.frq
|
||||
endif
|
||||
|
||||
#
|
||||
# out files prefix
|
||||
#
|
||||
|
||||
set base = $OutDir/$GenoName.$ProtName
|
||||
|
||||
#
|
||||
# skip exonerate calculations if already done
|
||||
#
|
||||
|
||||
if (-e $base.exo.raw) then
|
||||
Notify " file $base.exo.raw found <exonerate skipped>"
|
||||
goto parse
|
||||
endif
|
||||
|
||||
#
|
||||
# speedup exonerate
|
||||
#
|
||||
|
||||
if ($PASS1_SPEEDUP != 0) then
|
||||
|
||||
$LIB_DIR/go_filterbx.sh $GenoFile $ProtFile \
|
||||
$PASS1_BLASTX_FILTER_IDMIN \
|
||||
$PASS1_BLASTX_FILTER_NBMIN \
|
||||
$PASS1_BLASTX_FILTER_NBMAX > D_$$
|
||||
|
||||
set n = `egrep "^>" D_$$ | wc -l`
|
||||
if ($n > 0) then
|
||||
Notify " $n sequences kept"
|
||||
set DbFile = D_$$
|
||||
else
|
||||
Notify " no sequence match"
|
||||
if ($PASS1_SLOWDOWN != 0) then
|
||||
Notify " reverting to original $ProtName"
|
||||
set DbFile = $ProtFile
|
||||
else
|
||||
echo "" > $base.exo.raw
|
||||
goto parse
|
||||
endif
|
||||
endif
|
||||
else
|
||||
set DbFile = $ProtFile
|
||||
endif
|
||||
|
||||
#
|
||||
# run exonerate
|
||||
#
|
||||
|
||||
Notify " running exonerate of $GenoName on $ProtName"
|
||||
exonerate --model protein2genome \
|
||||
--percent $PASS1_PERCENT \
|
||||
--showalignment TRUE \
|
||||
--showvulgar TRUE \
|
||||
--showtargetgff TRUE \
|
||||
--geneticcode $PASS1_GENETIC_CODE \
|
||||
--minintron $PASS1_MIN_INTRON \
|
||||
--maxintron $PASS1_MAX_INTRON \
|
||||
--bestn $PASS1_BESTN \
|
||||
--frameshift $PASS1_FRAMESHIFT \
|
||||
--proteinsubmat $PASS1_SUBMAT \
|
||||
--splice3 $SPLICE3MODEL \
|
||||
--splice5 $SPLICE5MODEL \
|
||||
$DbFile $GenoFile > $base.exo.raw
|
||||
CheckAbort 20 "exonerate failure"
|
||||
|
||||
#
|
||||
# extract best clusters
|
||||
#
|
||||
parse:
|
||||
|
||||
$AwkCmd -v MAX_SPAN=$PASS1_MAX_SPAN \
|
||||
-v ALLOW_STOP=$PASS1_ALLOW_STOP \
|
||||
-v EXCLUDE=$GenoName \
|
||||
-f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best
|
||||
|
||||
#
|
||||
# get annotations
|
||||
#
|
||||
|
||||
egrep "^$ProtName " $CDS_DATA_DIR/chlorodb/core/Annot.lst |\
|
||||
awk '{print "c annot", $0}' > T_$$
|
||||
|
||||
#
|
||||
# extend start/stop
|
||||
#
|
||||
|
||||
$AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/extend.awk \
|
||||
-v FASTA=$GenoFile \
|
||||
-v START_MODEL=$STARTMODEL \
|
||||
-v STOP_MODEL=$STOPMODEL \
|
||||
-v START_WALK=$PASS1_START_WALK \
|
||||
-v STOP_WALK=$PASS1_STOP_WALK \
|
||||
$base.exo.best >> T_$$
|
||||
#
|
||||
# translate
|
||||
#
|
||||
|
||||
$AwkCmd -v FASTA=$GenoFile -f $LIB_DIR/libutil.awk \
|
||||
-f $LIB_DIR/translate.awk T_$$ > $base.iff
|
||||
|
||||
#
|
||||
# convert to embl
|
||||
#
|
||||
|
||||
$AwkCmd -f $LIB_DIR/toEmbl.awk $base.iff |\
|
||||
$AwkCmd -f $LIB_DIR/cutline.awk > $base.res
|
||||
|
||||
#
|
||||
# end
|
||||
#
|
||||
|
||||
Notify " output file: $base.res"
|
||||
|
||||
(\rm -f ?_$$) >> /dev/null
|
||||
|
||||
Exit 0
|
Reference in New Issue
Block a user