annotate/detectors/cds/tools/chlorodb/go_chlorodb.sh

#!/bin/csh -f
#
# make ChloroDB's
#
# usage: copy genbank/embl files into 'DB_DIR/download'
# usage: [create a paramter.sh file in 'DB_DIR']
# usage: go_chlorodb [DB_DIR]
#
unsetenv ORG_SOURCED

setenv ORG_HOME `dirname $0`/../../../..
source $ORG_HOME/scripts/csh_init.sh

#
# which DB to process
#

set DB_BASE = $DATA_DIR/cds/chlorodb  # default location

if ($#Argv > 0) then
  set DB_BASE = $Argv[1]; Shift
endif

set DB_BASE = `cd $DB_BASE && pwd -P`

NeedDir $DB_BASE/download

if (! -d $DB_BASE/info)  mkdir $DB_BASE/info
if (! -d $DB_BASE/fasta) mkdir $DB_BASE/fasta

cd $DB_BASE/info

#
# params
#

if (! -e $DB_BASE/parameters.sh) then
  Notify "no $DB_BASE/parameters.sh found : creating one for you"
  @ n = `find $DB_BASE/download -maxdepth 1 -type f -print | wc -l`
  @ cor_cutoff = $n / 2
  @ atg_cutoff = $n / 10
  @ dbs_cutoff = $n / 4
  if ($cor_cutoff == 0) @ cor_cutoff = 1
  if ($atg_cutoff == 0) @ atg_cutoff = 1
  if ($dbs_cutoff == 0) @ dbs_cutoff = 1
  echo "# sourced file"                          >  $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set CORE_NCDS_CUTOFF      = $cor_cutoff" >> $DB_BASE/parameters.sh
  echo "set CORE_START_ATG_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
  echo "set CORE_START_DFT_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
  echo "set CORE_START_OTH_CUTOFF = 10"          >> $DB_BASE/parameters.sh
  echo "set CORE_STOP_CUTOFF      = $cor_cutoff" >> $DB_BASE/parameters.sh
  echo "set CORE_SPLICE_CUTOFF    = $atg_cutoff" >> $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set SHEL_NCDS_CUTOFF      = 10"          >> $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set CORE_DELTA            = Inf"         >> $DB_BASE/parameters.sh
  echo "set CORE_COVMIN           = 30"          >> $DB_BASE/parameters.sh
  echo "set CORE_PMAX             = 1e-6"        >> $DB_BASE/parameters.sh
  echo "set CORE_IDMIN            = 30"          >> $DB_BASE/parameters.sh
  echo "set CORE_SIZMIN           = $cor_cutoff" >> $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set SHEL_DELTA            = 0.5"         >> $DB_BASE/parameters.sh
  echo "set SHEL_COVMIN           = 30"          >> $DB_BASE/parameters.sh
  echo "set SHEL_PMAX             = 1e-6"        >> $DB_BASE/parameters.sh
  echo "set SHEL_IDMIN            = 30"          >> $DB_BASE/parameters.sh
  echo "set SHEL_SIZMIN           = $dbs_cutoff" >> $DB_BASE/parameters.sh
  echo ""                                        >> $DB_BASE/parameters.sh
  echo "set DUST_DELTA            = 0.5"         >> $DB_BASE/parameters.sh
  echo "set DUST_COVMIN           = 30"          >> $DB_BASE/parameters.sh
  echo "set DUST_PMAX             = 1e-6"        >> $DB_BASE/parameters.sh
  echo "set DUST_IDMIN            = 30"          >> $DB_BASE/parameters.sh
  echo "set DUST_SIZMIN           = 10"          >> $DB_BASE/parameters.sh
  Cat $DB_BASE/parameters.sh
else
  Notify "DB parameters : $DB_BASE/parameters.sh"
endif

source $DB_BASE/parameters.sh

##set CMIN_COD = 0
##set FMIN_COD = 0.01

#
# temporarily uncompress
#

set ff = `find $DB_BASE/download -maxdepth 1 -name \*.gz -print`

if ($#ff != 0) then
  Notify "uncompressing $#ff entries"
  foreach f ($ff)
    gunzip -f $f
  end
endif

#
# convert gbk/embl to fasta
#

set ff = `find $DB_BASE/download -maxdepth 1 \( -name \*.gbk -or -name \*.embl \) -print`

Notify "convert $#ff gbk/embl entries to fasta"

foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.tofasta.awk $f > $DB_BASE/fasta/$nom.fst
end

#
# get gbk/embl info
#

Notify "get gbk/embl info for $#ff entries"

echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.info.awk > db.info.txt  # just get header

foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.oneliner.awk  $f |\
  $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/$typ.info.awk |\
  egrep -v '^#' >> db.info.txt
end

#
# get cds info
#

Notify "get gbk/embl cds for $#ff entries"

echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.cds_long.awk > db.cds.txt  # just get header

foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.oneliner.awk  $f |\
  $AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
          -f $LIB_DIR/$typ.cds_long.awk |\
  egrep -v '^#' >> db.cds.txt
end

#
# get fasta for  prots
#

Notify "get prots"
$AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/cds2fasta.awk db.cds.txt > db.prot.fst

#
# get introns
#

Notify "get gbk/embl introns for $#ff entries"

echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.intron.awk > db.intron.txt  # just get header

foreach f ($ff)
  set nom = `basename $f:r`
  set typ = $f:e
  $AwkCmd -f $LIB_DIR/$typ.oneliner.awk  $f |\
  $AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
          -f $LIB_DIR/$typ.intron.awk |\
  egrep -v '^#' >> db.intron.txt
end

#
# make models
#

Notify "Making models"

echo -n ""                                              >  db.models.params.txt
echo "CORE_NCDS_CUTOFF <- $CORE_NCDS_CUTOFF"            >> db.models.params.txt
echo "CORE_START_ATG_CUTOFF <- $CORE_START_ATG_CUTOFF"  >> db.models.params.txt
echo "CORE_START_DFT_CUTOFF <- $CORE_START_DFT_CUTOFF"  >> db.models.params.txt
echo "CORE_START_OTH_CUTOFF <- $CORE_START_OTH_CUTOFF"  >> db.models.params.txt
echo "CORE_STOP_CUTOFF <- $CORE_STOP_CUTOFF"            >> db.models.params.txt
echo "CORE_SPLICE_CUTOFF <- $CORE_SPLICE_CUTOFF"        >> db.models.params.txt
echo "SHEL_NCDS_CUTOFF <- $SHEL_NCDS_CUTOFF"            >> db.models.params.txt

$LIB_DIR/make.models.r |& Cat

GetStatus
OnError then 
  Error 2 "model parameter too stringent"
endif

#
# add matrices
#

cp -f $PROG_DIR/matrices/* models

#
# make subDBs
#

if (-e db.core.pat.txt) then
  Notify "Making core DB (take some time... please wait)"
  $PROG_DIR/subdb/go_subdb.sh db.prot.fst db.core.pat.txt \
    $CORE_DELTA $CORE_COVMIN $CORE_PMAX $CORE_IDMIN $CORE_SIZMIN
    
  # add discarded entries into shell
  if (-e db.core.pat.db/Annot.lst) then
    sort db.core.pat.txt > A_$$
    sort db.core.pat.db/Annot.lst > B_$$
    join -a1 A_$$ B_$$ | awk '(NF==3) {print $0}' > C_$$
    set n = `cat C_$$ | wc -l`
    Notify "transfering $n discarded entries to shell"
    cat C_$$ >> db.shell.pat.txt
    \rm -f ?_$$
  endif
endif

if (-e db.shell.pat.txt) then
  Notify "Making shell DB (take some time... please wait)"
  $PROG_DIR/subdb/go_subdb.sh db.prot.fst db.shell.pat.txt \
    $SHEL_DELTA $SHEL_COVMIN $SHEL_PMAX $SHEL_IDMIN $SHEL_SIZMIN

  # add discarded entries into dust
  if (-e db.shell.pat.db/Annot.lst) then
    sort db.shell.pat.txt > A_$$
    sort db.shell.pat.db/Annot.lst > B_$$
    join -a1 A_$$ B_$$ | awk '(NF==3) {print $0}' >> C_$$
    set n = `cat C_$$ | wc -l`
    Notify "transfering $n discarded entries to dust"
    cat C_$$ >> db.dust.pat.txt
    \rm -f ?_$$
  endif
endif

if (-e db.dust.pat.txt) then
  Notify "Making dust DB (take some time... please wait)"
  $PROG_DIR/subdb/go_subdb.sh db.prot.fst db.dust.pat.txt \
    $DUST_DELTA $DUST_COVMIN $DUST_PMAX $DUST_IDMIN $DUST_SIZMIN
endif

#
# recompress entries
#

set ff = `find $DB_BASE/download -maxdepth 1 -type f -print`

if ($#ff != 0) then
  Notify "recompressing $#ff entries"
  foreach f ($ff)
    gzip -f $f
  end
endif

# compress fasta

set ff = `find $DB_BASE/fasta -maxdepth 1 -name \*.fst -print`

if ($#ff != 0) then
  Notify "compressing $#ff fasta entries"
  foreach f ($ff)
    gzip -f $f
  end
endif

# install everything in proper directory

foreach dir ("core" "shell" "dust")
  if (-e $DB_BASE/$dir) \rm -r $DB_BASE/$dir
  if ((-d db.$dir.pat.db) && (-e db.$dir.pat.db/Annot.lst)) then
    Notify "installing $DB_BASE/$dir"
    \mv -f db.$dir.pat.db $DB_BASE/$dir
  endif
end

if (-e $DB_BASE/models) \rm -r $DB_BASE/models
if (-d models) \mv -f models $DB_BASE

Notify "Done"
exit 0