#!/bin/csh -f
#
# usage: go_subdb.sh prot.fst pat.txt [deltalen covmin pmax idmin sizmin]
# usage: prot.fst : proteins fasta file
# usage: pat.txt  : text file containing patterns and names for families to extract
# usage: output directory containing subdbs : basename <pat:r>.db
#

# Clear ORG_SOURCED before sourcing the init file.
# NOTE(review): presumably csh_init.sh uses ORG_SOURCED as an already-loaded
# guard, so clearing it forces a fresh load - confirm against csh_init.sh.
unsetenv ORG_SOURCED

# ORG_HOME is located relative to the directory of this script (five levels up).
setenv ORG_HOME `dirname $0`/../../../../..
source $ORG_HOME/scripts/csh_init.sh

# NeedArg, NeedFile, Shift, Notify and Warning, as well as the variables Argv,
# AwkCmd and LIB_DIR, are not defined in this file; they are expected to come
# from the init file sourced above.
# Two positional arguments are mandatory: the protein fasta and the pattern file.
NeedArg 2

set ProtFile = $Argv[1]; Shift
set PatFile  = $Argv[1]; Shift

# Both input files are checked before any output is created.
NeedFile $ProtFile
NeedFile $PatFile

#
# parameters
#

# Default values of the optional positional parameters, listed in the order
# they are read from the command line: deltalen covmin pmax idmin sizmin.
set Delta  = 0.5
set Covmin = 30
set Pmax   = 1e-6
set Idmin  = 30
set Sizmin = 10

# Each remaining argument, if any, overrides the next parameter in turn.
# Parameters without a matching argument keep their default.
foreach Param (Delta Covmin Pmax Idmin Sizmin)
  if ($#Argv > 0) then
    set $Param = $Argv[1]; Shift
  endif
end

#
# output directory
#

# Output names are built from the pattern file name with its directory and
# extension removed: <root>.db for the directory, <root>.log for the log.
set PatRoot = `basename $PatFile:r`
set OutDir  = ${PatRoot}.db
set OutLog  = ${PatRoot}.log

# Start from an empty output directory: any previous one is deleted first.
if (-d $OutDir) then
  \rm -r $OutDir
endif
mkdir $OutDir

# Create (or truncate) the log file.
echo -n '' > $OutLog

# Report <fasta> <tag>
#   Counts the lines of <fasta> that start with a > character (one per entry)
#   and appends one line to the log file: <tag> <basename of fasta> <count>.
#   The alias body is single-quoted, so the backquoted basename call and the
#   OutLog variable are evaluated each time Report runs, not at definition.
#   The awk program is spliced in with alternating quote styles so that the
#   stored alias text contains it as a single-quoted {print H,P,$1}.
alias Report 'egrep "^>" \!:1 | wc -l | awk -v P=`basename \!:1` -v H=\!:2 '"'{print H,P,"'$1}'"'"' >> $OutLog'

#
# remove entries with bad symbols
#

# Filter the input proteins through db.filter.sym.awk into the temporary file
# P_<pid>, logging the number of entries before and after the filter.
Notify "cleanup $ProtFile"

Report $ProtFile "init_size"

$AwkCmd -f $LIB_DIR/db.filter.sym.awk $ProtFile > P_$$

# The after-cleanup count must be taken on the filtered file, not on the input
# (counting the input again would just repeat the init_size value).
# The log line keeps the input basename so it lines up with the init_size line.
egrep "^>" P_$$ | wc -l | awk -v P=`basename $ProtFile` -v H="cleanup_size" '{print H,P,$1}' >> $OutLog

#
# select by name pattern
#

# Stage 1: for each family listed in the pattern file, extract the matching
# entries from the cleaned proteins into D_<pid>/<name>.fst.
Notify "select by patterns"

# One working directory per filtering stage:
# D = pattern filter, E = length filter, F = similarity filter.
mkdir D_$$
mkdir E_$$
mkdir F_$$

# Column 1 of the pattern file holds the family names.
set noms = `awk '{print $1}' $PatFile`

foreach nom ($noms)
  # Look the pattern (column 2) up with the same whitespace splitting used to
  # read the names, comparing the name as a plain string. This also works for
  # tab-separated files and for names containing regex metacharacters.
  set pat = `awk -v N="$nom" '($1 "") == (N "") {print $2}' $PatFile`
  $AwkCmd -f $LIB_DIR/db.filter.pat.awk -v PAT="$pat" P_$$ > D_$$/$nom.fst
  Report D_$$/$nom.fst "pattern_filter"
  set n = `egrep '^>' D_$$/$nom.fst | wc -l`
  Notify "  pattern : $nom : $n"
  # Families with fewer than Sizmin entries are dropped.
  if ($n < $Sizmin) \rm -f D_$$/$nom.fst
end

# Nothing left: skip the remaining stages and go straight to the cleanup.
set ok = `ls D_$$ | wc -l`
if ($ok == 0) then
  # Families are removed when smaller than Sizmin, so the remedy is a lower value.
  Warning "no entries found after pattern selection (decrease Sizmin = $Sizmin)"
  goto fin
endif

#
# select by length
#

# Stage 2: for each family kept by the pattern stage, keep only the entries
# selected by db.filter.len.awk (driven by DELTA) and write them to
# E_<pid>/<name>.fst.
Notify "select by length"

foreach f (D_$$/*.fst) 
  set nom = `basename $f:r`
  # M_<pid> receives the output of the length filter; db.subdb.awk then reads
  # it through FILE to extract the corresponding entries from the family file.
  $AwkCmd -v DELTA=$Delta -f $LIB_DIR/db.filter.len.awk $f > M_$$
  $AwkCmd -v FILE=M_$$ -f $LIB_DIR/db.subdb.awk $f > E_$$/$nom.fst
  Report E_$$/$nom.fst "length_filter"
  set n = `egrep '^>' E_$$/$nom.fst | wc -l`
  Notify "  length filter : $nom : $n"
  # Families with fewer than Sizmin entries are dropped.
  if ($n < $Sizmin) \rm -f E_$$/$nom.fst
end

# Nothing left: skip the remaining stages and go straight to the cleanup.
set ok = `ls E_$$ | wc -l`
if ($ok == 0) then
  # Families are removed when smaller than Sizmin, so the remedy is a lower value.
  Warning "no entries found after length selection (decrease Sizmin = $Sizmin)"
  goto fin
endif

#
# select by similarity
#

# Stage 3: for each family kept by the length stage, run an all-against-all
# blastp and keep the entries selected from the blast links; the result goes
# to F_<pid>/<name>.fst.
Notify "select by similarity"

foreach f (E_$$/*.fst) 
  set nom = `basename $f:r`

  Notify "  blasting $nom"

  # Blast the family against itself (tabular output with comments, format 7).
  # makeblastdb messages are appended to db.log in the current directory and
  # the database index files are removed once blastp is done.
  makeblastdb -dbtype 'prot' -in $f >>& db.log
  blastp -db $f -query $f -outfmt 7 > $f.blast.out
  \rm -f $f.p??

  # Turn blast hits into links using the coverage, p-value and identity
  # thresholds, then group them with db.cc.awk.
  # NOTE(review): cc presumably stands for connected components - confirm.
  $AwkCmd -v COVMIN=$Covmin -v PMAX=$Pmax -v IDMIN=$Idmin \
      -f $LIB_DIR/db.blastlink.awk $f.blast.out |\
  $AwkCmd -f $LIB_DIR/db.cc.awk > $f.cc.txt

  # Use the configured awk here as well, like every other library script.
  $AwkCmd -v NAME=$nom -f $LIB_DIR/db.reportcc.awk $f.cc.txt >> $OutLog

  # S_<pid> receives the selection made by db.selcc.awk; db.subdb.awk then
  # reads it through FILE to extract the corresponding entries.
  $AwkCmd -f $LIB_DIR/db.selcc.awk $f.cc.txt > S_$$
  $AwkCmd -v FILE=S_$$ -f $LIB_DIR/db.subdb.awk $f > F_$$/$nom.fst

  Report F_$$/$nom.fst "similarity_filter"

  set n = `egrep '^>' F_$$/$nom.fst | wc -l`
  Notify "  blast filter : $nom : $n"
  # Families with fewer than Sizmin entries are dropped.
  if ($n < $Sizmin) \rm -f F_$$/$nom.fst

end

# Nothing left: skip the remaining stages and go straight to the cleanup.
set ok = `ls F_$$ | wc -l`
if ($ok == 0) then
  # Families are removed when smaller than Sizmin, so the remedy is a lower value.
  Warning "no entries found after similarity selection (decrease Sizmin = $Sizmin)"
  goto fin
endif

#
# annotations
#

# Build F_<pid>/Annot.lst by joining, on the first field, the pattern file
# (name and last column, for lines with at least 3 columns) with the
# annotation lines produced by db.annot.awk for every surviving family.
echo -n "" > J_$$

foreach f (F_$$/*.fst) 
  $AwkCmd -f $LIB_DIR/db.annot.awk $f >> J_$$
end

# join needs both inputs ordered on the join field itself. A whole-line sort
# can order differently from a sort on field 1 (for instance in non-C locales
# when one name is a prefix of another), so sort explicitly on field 1,
# ignoring leading blanks.
awk '(NF >= 3) {print $1, $NF}' $PatFile | sort -k 1b,1 > A_$$
# Blank lines are discarded from the annotation list before joining.
sort -k 1b,1 J_$$ | egrep -v '^ *$' > B_$$
join A_$$ B_$$ > F_$$/Annot.lst

#
# copy files
#

# Publish the final stage: everything in F_<pid> (family files plus the
# annotation list) is moved into the output directory.
set nfiles = `ls F_$$/* | wc -l`
Notify "copy $nfiles files to $OutDir"

\mv -f F_$$/* $OutDir

#
# end
#

fin:

# Common exit path, reached either after the copy step or through goto when a
# stage left no family. An output directory without any .fst file is removed.
set nfst = `find $OutDir -name '*.fst' -print | wc -l`
if ($nfst != 0) then
  Notify "output directory : $OutDir : $nfst entries"
else
  Warning "no entries found : removing $OutDir"
  \rm -r $OutDir
endif

# Remove every temporary file and directory named <one char>_<pid>.
\rm -r ?_$$


exit 0