
Former-commit-id: 7017655ac86e7b7837c7b581bf8a1abb86c08b30 Former-commit-id: dcedd4e32e3c7ce302eed94abd2b975a4506df97
280 lines
7.9 KiB
Bash
Executable File
280 lines
7.9 KiB
Bash
Executable File
#!/bin/csh -f
|
|
#
|
|
# make ChloroDB's
|
|
#
|
|
# usage: copy genbank/embl files into 'DB_DIR/download'
|
|
# usage: [create a parameters.sh file in 'DB_DIR']
|
|
# usage: go_chlorodb [DB_DIR]
|
|
#
|
|
# Reset the ORG_SOURCED marker before loading the shared csh helpers
# (presumably a guard consulted by csh_init.sh - to be confirmed there).
unsetenv ORG_SOURCED

# ORG_HOME sits four levels above the directory holding this script.
setenv ORG_HOME `dirname $0`/../../../..
source $ORG_HOME/scripts/csh_init.sh

# DB to process: start from the default location and let the first
# argument, if any, override it.
# Argv, Shift and NeedDir are not defined in this file
# (presumably supplied by csh_init.sh).
set DB_BASE = $DATA_DIR/cds/chlorodb
if ($#Argv > 0) then
  set DB_BASE = $Argv[1]
  Shift
endif

# Turn it into a physical absolute path (symbolic links resolved).
set DB_BASE = `cd $DB_BASE && pwd -P`

# The input entries are expected in the download subdirectory.
NeedDir $DB_BASE/download

# Create the output subdirectories when they are missing.
foreach sub (info fasta)
  if (! -d $DB_BASE/$sub) mkdir $DB_BASE/$sub
end

# The db.* files produced further down use paths relative to this directory.
cd $DB_BASE/info
|
|
|
|
#
|
|
# params
|
|
#
|
|
|
|
# Load the DB parameters from the parameters.sh file of the DB directory.
# If that file is absent a default one is written first: its cutoffs are
# derived from the number of regular files found in the download directory
# (one half, one tenth and one quarter of it, each raised to 1 when the
# integer division yields 0).
set param_file = $DB_BASE/parameters.sh

if (-e $param_file) then
  Notify "DB parameters : $DB_BASE/parameters.sh"
else
  Notify "no $DB_BASE/parameters.sh found : creating one for you"

  @ n_files = `find $DB_BASE/download -maxdepth 1 -type f -print | wc -l`

  @ cut_half    = $n_files / 2
  @ cut_tenth   = $n_files / 10
  @ cut_quarter = $n_files / 4
  if ($cut_half    == 0) @ cut_half    = 1
  if ($cut_tenth   == 0) @ cut_tenth   = 1
  if ($cut_quarter == 0) @ cut_quarter = 1

  # first line truncates or creates the file, the others append to it
  echo "# sourced file" > $param_file
  echo "" >> $param_file
  echo "set CORE_NCDS_CUTOFF = $cut_half" >> $param_file
  echo "set CORE_START_ATG_CUTOFF = $cut_tenth" >> $param_file
  echo "set CORE_START_DFT_CUTOFF = $cut_tenth" >> $param_file
  echo "set CORE_START_OTH_CUTOFF = 10" >> $param_file
  echo "set CORE_STOP_CUTOFF = $cut_half" >> $param_file
  echo "set CORE_SPLICE_CUTOFF = $cut_tenth" >> $param_file
  echo "" >> $param_file
  echo "set SHEL_NCDS_CUTOFF = 10" >> $param_file
  echo "" >> $param_file
  echo "set CORE_DELTA = Inf" >> $param_file
  echo "set CORE_COVMIN = 30" >> $param_file
  echo "set CORE_PMAX = 1e-6" >> $param_file
  echo "set CORE_IDMIN = 30" >> $param_file
  echo "set CORE_SIZMIN = $cut_half" >> $param_file
  echo "" >> $param_file
  echo "set SHEL_DELTA = 0.5" >> $param_file
  echo "set SHEL_COVMIN = 30" >> $param_file
  echo "set SHEL_PMAX = 1e-6" >> $param_file
  echo "set SHEL_IDMIN = 30" >> $param_file
  echo "set SHEL_SIZMIN = $cut_quarter" >> $param_file
  echo "" >> $param_file
  echo "set DUST_DELTA = 0.5" >> $param_file
  echo "set DUST_COVMIN = 30" >> $param_file
  echo "set DUST_PMAX = 1e-6" >> $param_file
  echo "set DUST_IDMIN = 30" >> $param_file
  echo "set DUST_SIZMIN = 10" >> $param_file

  # show the generated defaults
  Cat $param_file
endif

# Bring the CORE_*, SHEL_* and DUST_* settings into this script.
source $param_file
|
|
|
|
##set CMIN_COD = 0
|
|
##set FMIN_COD = 0.01
|
|
|
|
#
|
|
# temporarily uncompress
|
|
#
|
|
|
|
# Expand in place every gzipped file found directly in the download
# directory, so that the conversion steps can read plain text.
set gz_list = `find $DB_BASE/download -maxdepth 1 -name \*.gz -print`

if ($#gz_list > 0) then
  Notify "uncompressing $#gz_list entries"
  foreach gz ($gz_list)
    gunzip -f $gz
  end
endif
|
|
|
|
#
|
|
# convert gbk/embl to fasta
|
|
#
|
|
|
|
# Collect the gbk and embl entries of the download directory.
# The list is kept in ff because the extraction loops further down
# iterate over the same variable.
set ff = `find $DB_BASE/download -maxdepth 1 \( -name \*.gbk -or -name \*.embl \) -print`

Notify "convert $#ff gbk/embl entries to fasta"

# One fasta file per entry, named after the entry without its extension.
# The awk converter is selected from the file extension (gbk or embl).
foreach entry ($ff)
  set ext  = $entry:e
  set stem = `basename $entry:r`
  $AwkCmd -f $LIB_DIR/$ext.tofasta.awk $entry > $DB_BASE/fasta/$stem.fst
end
|
|
|
|
#
|
|
# get gbk/embl info
|
|
#
|
|
|
|
# Build db.info.txt: one header line followed by the info records of
# every gbk/embl entry listed in ff.
Notify "get gbk/embl info for $#ff entries"

# Header only (HEADONLY=1 on an empty input).
# Run through the configured awk command, like every other awk call here.
echo "" | $AwkCmd -v HEADONLY=1 -f $LIB_DIR/gbk.info.awk > db.info.txt

# Per entry: flatten with the oneliner script, extract the info and
# drop the lines starting with a hash sign before appending.
# grep -v replaces the obsolescent egrep; the pattern is a plain anchor.
foreach f ($ff)
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.oneliner.awk $f |\
    $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/$typ.info.awk |\
    grep -v '^#' >> db.info.txt
end
|
|
|
|
#
|
|
# get cds info
|
|
#
|
|
|
|
# Build db.cds.txt: one header line followed by the cds records of
# every gbk/embl entry listed in ff, then derive the protein fasta file.
Notify "get gbk/embl cds for $#ff entries"

# Header only (HEADONLY=1 on an empty input).
# Run through the configured awk command, like every other awk call here.
echo "" | $AwkCmd -v HEADONLY=1 -f $LIB_DIR/gbk.cds_long.awk > db.cds.txt

# Per entry: flatten with the oneliner script, then extract the cds.
# The extractor receives the fasta file written for this entry by the
# conversion step through the FASTA variable.
# grep -v replaces the obsolescent egrep; the pattern is a plain anchor.
foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.oneliner.awk $f |\
    $AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
            -f $LIB_DIR/$typ.cds_long.awk |\
    grep -v '^#' >> db.cds.txt
end

# Protein sequences in fasta format, computed from the cds table.
Notify "get prots"
$AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/cds2fasta.awk db.cds.txt > db.prot.fst
|
|
|
|
#
|
|
# get introns
|
|
#
|
|
|
|
# Build db.intron.txt: one header line followed by the intron records of
# every gbk/embl entry listed in ff.
Notify "get gbk/embl introns for $#ff entries"

# Header only (HEADONLY=1 on an empty input).
# Run through the configured awk command, like every other awk call here.
echo "" | $AwkCmd -v HEADONLY=1 -f $LIB_DIR/gbk.intron.awk > db.intron.txt

# Per entry: flatten with the oneliner script, then extract the introns.
# The extractor receives the fasta file written for this entry by the
# conversion step through the FASTA variable.
# grep -v replaces the obsolescent egrep; the pattern is a plain anchor.
foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.oneliner.awk $f |\
    $AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
            -f $LIB_DIR/$typ.intron.awk |\
    grep -v '^#' >> db.intron.txt
end
|
|
|
|
#
|
|
# make models
|
|
#
|
|
|
|
# Build the models with make.models.r, then add the matrices.
Notify "Making models"

# Dump the model cutoffs as R-style assignments into a parameter file
# (presumably read by make.models.r - to be confirmed in that script).
set model_params = db.models.params.txt

echo -n "" > $model_params
echo "CORE_NCDS_CUTOFF <- $CORE_NCDS_CUTOFF" >> $model_params
echo "CORE_START_ATG_CUTOFF <- $CORE_START_ATG_CUTOFF" >> $model_params
echo "CORE_START_DFT_CUTOFF <- $CORE_START_DFT_CUTOFF" >> $model_params
echo "CORE_START_OTH_CUTOFF <- $CORE_START_OTH_CUTOFF" >> $model_params
echo "CORE_STOP_CUTOFF <- $CORE_STOP_CUTOFF" >> $model_params
echo "CORE_SPLICE_CUTOFF <- $CORE_SPLICE_CUTOFF" >> $model_params
echo "SHEL_NCDS_CUTOFF <- $SHEL_NCDS_CUTOFF" >> $model_params

# Run the R script, with both stdout and stderr sent through Cat.
$LIB_DIR/make.models.r |& Cat

# GetStatus, OnError and Error are helpers defined outside this file;
# a failure is reported to the user as a too stringent parameter set.
GetStatus
OnError then
  Error 2 "model parameter too stringent"
endif

# Copy the matrices shipped with the program into the models directory.
cp -f $PROG_DIR/matrices/* models
|
|
|
|
#
|
|
# make subDBs
|
|
#
|
|
|
|
# Build the core, shell and dust sub-DBs in cascade with go_subdb.sh.
# Each stage runs only if its pattern file exists.  After the core and
# shell stages, the pattern lines that join leaves unpaired with the
# Annot.lst of the produced DB and that have exactly 3 fields
# (presumably the entries discarded by go_subdb.sh) are appended
# to the pattern file of the next stage.
# Scratch files A_, B_ and C_ suffixed with the shell pid live in the
# current directory and are removed after use.

if (-e db.core.pat.txt) then
  Notify "Making core DB (take some time... please wait)"
  $PROG_DIR/subdb/go_subdb.sh db.prot.fst db.core.pat.txt \
    $CORE_DELTA $CORE_COVMIN $CORE_PMAX $CORE_IDMIN $CORE_SIZMIN

  # add discarded entries into shell
  if (-e db.core.pat.db/Annot.lst) then
    sort db.core.pat.txt > A_$$
    sort db.core.pat.db/Annot.lst > B_$$
    join -a1 A_$$ B_$$ | awk '(NF==3) {print $0}' > C_$$
    set n = `wc -l < C_$$`
    Notify "transfering $n discarded entries to shell"
    cat C_$$ >> db.shell.pat.txt
    \rm -f ?_$$
  endif
endif

if (-e db.shell.pat.txt) then
  Notify "Making shell DB (take some time... please wait)"
  $PROG_DIR/subdb/go_subdb.sh db.prot.fst db.shell.pat.txt \
    $SHEL_DELTA $SHEL_COVMIN $SHEL_PMAX $SHEL_IDMIN $SHEL_SIZMIN

  # add discarded entries into dust
  if (-e db.shell.pat.db/Annot.lst) then
    sort db.shell.pat.txt > A_$$
    sort db.shell.pat.db/Annot.lst > B_$$
    # overwrite, as in the core stage: appending would keep the content
    # of a leftover scratch file and fails on a missing file when the
    # noclobber option is set
    join -a1 A_$$ B_$$ | awk '(NF==3) {print $0}' > C_$$
    set n = `wc -l < C_$$`
    Notify "transfering $n discarded entries to dust"
    cat C_$$ >> db.dust.pat.txt
    \rm -f ?_$$
  endif
endif

if (-e db.dust.pat.txt) then
  Notify "Making dust DB (take some time... please wait)"
  $PROG_DIR/subdb/go_subdb.sh db.prot.fst db.dust.pat.txt \
    $DUST_DELTA $DUST_COVMIN $DUST_PMAX $DUST_IDMIN $DUST_SIZMIN
endif
|
|
|
|
#
|
|
# recompress entries
|
|
#
|
|
|
|
# Gzip every regular file found directly in the download directory.
set dl_files = `find $DB_BASE/download -maxdepth 1 -type f -print`

if ($#dl_files > 0) then
  Notify "recompressing $#dl_files entries"
  foreach entry ($dl_files)
    gzip -f $entry
  end
endif

# Gzip the fasta files written by the conversion step.
set fst_files = `find $DB_BASE/fasta -maxdepth 1 -name \*.fst -print`

if ($#fst_files > 0) then
  Notify "compressing $#fst_files fasta entries"
  foreach fst ($fst_files)
    gzip -f $fst
  end
endif
|
|
|
|
# install everything in proper directory
|
|
|
|
# Install the results under the DB directory.
# For each sub-DB any previous installation is removed first; the freshly
# built directory is moved in only if it contains an Annot.lst file.
foreach part (core shell dust)
  if (-e $DB_BASE/$part) \rm -r $DB_BASE/$part
  if (-d db.$part.pat.db) then
    if (-e db.$part.pat.db/Annot.lst) then
      Notify "installing $DB_BASE/$part"
      \mv -f db.$part.pat.db $DB_BASE/$part
    endif
  endif
end

# Same replacement policy for the models directory.
if (-e $DB_BASE/models) \rm -r $DB_BASE/models
if (-d models) \mv -f models $DB_BASE

Notify "Done"
exit 0
|
|
|