cds/tools/chlorodb added
Former-commit-id: 0579e878a69b7c285ca71870e9ca5730649a2fda Former-commit-id: 7cced5b488441d87bf070a9a444317db0e048880
This commit is contained in:
254
detectors/cds/tools/chlorodb/go_chlorodb.sh
Executable file
254
detectors/cds/tools/chlorodb/go_chlorodb.sh
Executable file
@ -0,0 +1,254 @@
|
||||
#!/bin/csh -f
|
||||
#
|
||||
# make ChloroDB's
|
||||
#
|
||||
# usage: copy genbank/embl files into 'DB_DIR/download'
|
||||
# usage: [create a parameters.sh file in 'DB_DIR']
|
||||
# usage: go_chlorodb [DB_DIR]
|
||||
#
|
||||
unsetenv ORG_SOURCED
|
||||
|
||||
setenv ORG_HOME `dirname $0`/../../../..
|
||||
source $ORG_HOME/scripts/csh_init.sh
|
||||
|
||||
#
|
||||
# which DB to process
|
||||
#
|
||||
|
||||
set DB_BASE = $DATA_DIR/cds/chlorodb # default location
|
||||
|
||||
if ($#Argv > 0) then
|
||||
set DB_BASE = $Argv[1]; Shift
|
||||
endif
|
||||
|
||||
set DB_BASE = `cd $DB_BASE && pwd -P`
|
||||
|
||||
NeedDir $DB_BASE/download
|
||||
|
||||
if (! -d $DB_BASE/info) mkdir $DB_BASE/info
|
||||
if (! -d $DB_BASE/fasta) mkdir $DB_BASE/fasta
|
||||
|
||||
cd $DB_BASE/info
|
||||
|
||||
#
|
||||
# params
|
||||
#
|
||||
|
||||
if (! -e $DB_BASE/parameters.sh) then
|
||||
# count the regular files directly under download/ (no recursion);
# -mindepth/-maxdepth is accepted by both BSD and GNU find, whereas
# the BSD-only '-depth 1' form makes GNU find abort
@ n = `find $DB_BASE/download -mindepth 1 -maxdepth 1 -type f -print | wc -l`
|
||||
@ cor_cutoff = $n / 2
|
||||
@ atg_cutoff = $n / 10
|
||||
@ dbs_cutoff = $n / 4
|
||||
if ($cor_cutoff == 0) @ cor_cutoff = 1
|
||||
if ($atg_cutoff == 0) @ atg_cutoff = 1
|
||||
if ($dbs_cutoff == 0) @ dbs_cutoff = 1
|
||||
echo "# sourced file" > $DB_BASE/parameters.sh
|
||||
echo "" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_NCDS_CUTOFF = $cor_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_START_ATG_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_START_DFT_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_START_OTH_CUTOFF = 10" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_STOP_CUTOFF = $cor_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_SPLICE_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "" >> $DB_BASE/parameters.sh
|
||||
echo "set SHEL_NCDS_CUTOFF = 10" >> $DB_BASE/parameters.sh
|
||||
echo "" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_DELTA = Inf" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_COVMIN = 30" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_PMAX = 1e-6" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_IDMIN = 30" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_SIZMIN = $cor_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "" >> $DB_BASE/parameters.sh
|
||||
echo "set SHEL_DELTA = 0.5" >> $DB_BASE/parameters.sh
|
||||
echo "set SHEL_COVMIN = 30" >> $DB_BASE/parameters.sh
|
||||
echo "set SHEL_PMAX = 1e-6" >> $DB_BASE/parameters.sh
|
||||
echo "set SHEL_IDMIN = 30" >> $DB_BASE/parameters.sh
|
||||
echo "set SHEL_SIZMIN = $dbs_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "" >> $DB_BASE/parameters.sh
|
||||
echo "set DUST_DELTA = 0.5" >> $DB_BASE/parameters.sh
|
||||
echo "set DUST_COVMIN = 30" >> $DB_BASE/parameters.sh
|
||||
echo "set DUST_PMAX = 1e-6" >> $DB_BASE/parameters.sh
|
||||
echo "set DUST_IDMIN = 30" >> $DB_BASE/parameters.sh
|
||||
echo "set DUST_SIZMIN = 10" >> $DB_BASE/parameters.sh
|
||||
|
||||
endif
|
||||
|
||||
source $DB_BASE/parameters.sh
|
||||
|
||||
##set CMIN_COD = 0
|
||||
##set FMIN_COD = 0.01
|
||||
|
||||
#
|
||||
# temporarily uncompress
|
||||
#
|
||||
|
||||
# list compressed entries directly under download/ (no recursion);
# -mindepth/-maxdepth works with both BSD and GNU find, unlike '-depth 1'
set ff = `find $DB_BASE/download -mindepth 1 -maxdepth 1 -name \*.gz -print`
|
||||
|
||||
if ($#ff != 0) then
|
||||
Notify "uncompressing $#ff entries"
|
||||
foreach f ($ff)
|
||||
gunzip -f $f
|
||||
end
|
||||
endif
|
||||
|
||||
#
|
||||
# convert gbk/embl to fasta
|
||||
#
|
||||
|
||||
# list genbank/embl entries directly under download/ (no recursion);
# -mindepth/-maxdepth works with both BSD and GNU find, unlike '-depth 1'
set ff = `find $DB_BASE/download -mindepth 1 -maxdepth 1 \( -name \*.gbk -or -name \*.embl \) -print`
|
||||
|
||||
Notify "convert $#ff gbk/embl entries to fasta"
|
||||
|
||||
foreach f ($ff)
|
||||
set nom = `basename $f:r`
|
||||
set typ = $f:e
|
||||
$AwkCmd -f $LIB_DIR/$typ.tofasta.awk $f > $DB_BASE/fasta/$nom.fst
|
||||
end
|
||||
|
||||
#
|
||||
# get gbk/embl info
|
||||
#
|
||||
|
||||
Notify "get gbk/embl info for $#ff entries"
|
||||
|
||||
echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.info.awk > db.info.txt # just get header
|
||||
|
||||
foreach f ($ff)
|
||||
set nom = `basename $f:r`
|
||||
set typ = $f:e
|
||||
$AwkCmd -f $LIB_DIR/$typ.oneliner.awk $f |\
|
||||
$AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/$typ.info.awk |\
|
||||
egrep -v '^#' >> db.info.txt
|
||||
end
|
||||
|
||||
#
|
||||
# get cds info
|
||||
#
|
||||
|
||||
Notify "get gbk/embl cds for $#ff entries"
|
||||
|
||||
echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.cds_long.awk > db.cds.txt # just get header
|
||||
|
||||
foreach f ($ff)
|
||||
set nom = `basename $f:r`
|
||||
set typ = $f:e
|
||||
$AwkCmd -f $LIB_DIR/$typ.oneliner.awk $f |\
|
||||
$AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
|
||||
-f $LIB_DIR/$typ.cds_long.awk |\
|
||||
egrep -v '^#' >> db.cds.txt
|
||||
end
|
||||
|
||||
#
|
||||
# get fasta for prots
|
||||
#
|
||||
|
||||
Notify "get prots"
|
||||
$AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/cds2fasta.awk db.cds.txt > db.prot.fst
|
||||
|
||||
#
|
||||
# get introns
|
||||
#
|
||||
|
||||
Notify "get gbk/embl introns for $#ff entries"
|
||||
|
||||
echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.intron.awk > db.intron.txt # just get header
|
||||
|
||||
foreach f ($ff)
|
||||
set nom = `basename $f:r`
|
||||
set typ = $f:e
|
||||
$AwkCmd -f $LIB_DIR/$typ.oneliner.awk $f |\
|
||||
$AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
|
||||
-f $LIB_DIR/$typ.intron.awk |\
|
||||
egrep -v '^#' >> db.intron.txt
|
||||
end
|
||||
|
||||
#
|
||||
# make models
|
||||
#
|
||||
|
||||
Notify "Making models"
|
||||
|
||||
echo -n "" > db.models.params.txt
|
||||
echo "CORE_NCDS_CUTOFF <- $CORE_NCDS_CUTOFF" >> db.models.params.txt
|
||||
echo "CORE_START_ATG_CUTOFF <- $CORE_START_ATG_CUTOFF" >> db.models.params.txt
|
||||
echo "CORE_START_DFT_CUTOFF <- $CORE_START_DFT_CUTOFF" >> db.models.params.txt
|
||||
echo "CORE_START_OTH_CUTOFF <- $CORE_START_OTH_CUTOFF" >> db.models.params.txt
|
||||
echo "CORE_STOP_CUTOFF <- $CORE_STOP_CUTOFF" >> db.models.params.txt
|
||||
echo "CORE_SPLICE_CUTOFF <- $CORE_SPLICE_CUTOFF" >> db.models.params.txt
|
||||
echo "SHEL_NCDS_CUTOFF <- $SHEL_NCDS_CUTOFF" >> db.models.params.txt
|
||||
|
||||
$LIB_DIR/make.models.r |& Cat
|
||||
|
||||
GetStatus
|
||||
OnError then
|
||||
Error 2 "model parameter too stringent"
|
||||
endif
|
||||
|
||||
#
|
||||
# add matrices
|
||||
#
|
||||
|
||||
cp -f $PROG_DIR/matrices/* models
|
||||
|
||||
#
|
||||
# make subDBs
|
||||
#
|
||||
|
||||
if (-e db.core.pat.txt) then
|
||||
Notify "Making core DB (take some time... please wait)"
|
||||
$PROG_DIR/subdb/go_subdb.sh db.prot.fst db.core.pat.txt \
|
||||
$CORE_DELTA $CORE_COVMIN $CORE_PMAX $CORE_IDMIN $CORE_SIZMIN
|
||||
endif
|
||||
|
||||
if (-e db.shell.pat.txt) then
|
||||
Notify "Making shell DB (take some time... please wait)"
|
||||
$PROG_DIR/subdb/go_subdb.sh db.prot.fst db.shell.pat.txt \
|
||||
$SHEL_DELTA $SHEL_COVMIN $SHEL_PMAX $SHEL_IDMIN $SHEL_SIZMIN
|
||||
endif
|
||||
|
||||
if (-e db.dust.pat.txt) then
|
||||
Notify "Making dust DB (take some time... please wait)"
|
||||
$PROG_DIR/subdb/go_subdb.sh db.prot.fst db.dust.pat.txt \
|
||||
$DUST_DELTA $DUST_COVMIN $DUST_PMAX $DUST_IDMIN $DUST_SIZMIN
|
||||
endif
|
||||
|
||||
#
|
||||
# recompress entries
|
||||
#
|
||||
|
||||
# list every regular file directly under download/ for recompression;
# -mindepth/-maxdepth works with both BSD and GNU find, unlike '-depth 1'
set ff = `find $DB_BASE/download -mindepth 1 -maxdepth 1 -type f -print`
|
||||
|
||||
if ($#ff != 0) then
|
||||
Notify "recompressing $#ff entries"
|
||||
foreach f ($ff)
|
||||
gzip -f $f
|
||||
end
|
||||
endif
|
||||
|
||||
# compress fasta
|
||||
|
||||
# list the fasta files directly under fasta/ for compression;
# -mindepth/-maxdepth works with both BSD and GNU find, unlike '-depth 1'
set ff = `find $DB_BASE/fasta -mindepth 1 -maxdepth 1 -name \*.fst -print`
|
||||
|
||||
if ($#ff != 0) then
|
||||
Notify "compressing $#ff fasta entries"
|
||||
foreach f ($ff)
|
||||
gzip -f $f
|
||||
end
|
||||
endif
|
||||
|
||||
# install everything in proper directory
|
||||
|
||||
foreach dir ("core" "shell" "dust")
|
||||
if (-e $DB_BASE/$dir) \rm -r $DB_BASE/$dir
|
||||
if ((-d db.$dir.pat.db) && (-e db.$dir.pat.db/Annot.lst)) then
|
||||
Notify "installing $DB_BASE/$dir"
|
||||
\mv -f db.$dir.pat.db $DB_BASE/$dir
|
||||
endif
|
||||
end
|
||||
|
||||
if (-e $DB_BASE/models) \rm -r $DB_BASE/models
|
||||
if (-d models) \mv -f models $DB_BASE
|
||||
|
||||
Notify "Done"
|
||||
exit 0
|
||||
|
29
detectors/cds/tools/chlorodb/matrices/blosum62.mat
Normal file
29
detectors/cds/tools/chlorodb/matrices/blosum62.mat
Normal file
@ -0,0 +1,29 @@
|
||||
#
|
||||
# blosum62 substitution matrix
|
||||
# with larger penalty for stops
|
||||
#
|
||||
A R N D C Q E G H I L K M F P S T W Y V B Z X *
|
||||
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -50
|
||||
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -50
|
||||
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -50
|
||||
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -50
|
||||
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -50
|
||||
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -50
|
||||
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -50
|
||||
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -50
|
||||
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -50
|
||||
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -50
|
||||
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -50
|
||||
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -50
|
||||
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -50
|
||||
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -50
|
||||
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -50
|
||||
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -50
|
||||
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -50
|
||||
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -50
|
||||
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -50
|
||||
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -50
|
||||
B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -50
|
||||
Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -50
|
||||
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -50
|
||||
* -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 1
|
195
detectors/cds/tools/chlorodb/subdb/go_subdb.sh
Executable file
195
detectors/cds/tools/chlorodb/subdb/go_subdb.sh
Executable file
@ -0,0 +1,195 @@
|
||||
#!/bin/csh -f
|
||||
#
|
||||
# usage: go_subdb.sh prot.fst pat.txt [deltalen covmin pmax idmin sizmin]
|
||||
# usage: prot.fst : proteins fasta file
|
||||
# usage: pat.txt : text file containing patterns and names for families to extract
|
||||
# usage: output directory containing subdbs : basename <pat:r>.db
|
||||
#
|
||||
|
||||
unsetenv ORG_SOURCED
|
||||
|
||||
setenv ORG_HOME `dirname $0`/../../../../..
|
||||
source $ORG_HOME/scripts/csh_init.sh
|
||||
|
||||
NeedArg 2
|
||||
|
||||
set ProtFile = $Argv[1]; Shift
|
||||
set PatFile = $Argv[1]; Shift
|
||||
|
||||
NeedFile $ProtFile
|
||||
NeedFile $PatFile
|
||||
|
||||
#
|
||||
# parameters
|
||||
#
|
||||
|
||||
set Delta = 0.5
|
||||
set Covmin = 30
|
||||
set Pmax = 1e-6
|
||||
set Idmin = 30
|
||||
set Sizmin = 5
|
||||
|
||||
if ($#Argv > 0) then
|
||||
set Delta = $Argv[1]; Shift
|
||||
endif
|
||||
|
||||
if ($#Argv > 0) then
|
||||
set Covmin = $Argv[1]; Shift
|
||||
endif
|
||||
|
||||
if ($#Argv > 0) then
|
||||
set Pmax = $Argv[1]; Shift
|
||||
endif
|
||||
|
||||
if ($#Argv > 0) then
|
||||
set Idmin = $Argv[1]; Shift
|
||||
endif
|
||||
|
||||
if ($#Argv > 0) then
|
||||
set Sizmin = $Argv[1]; Shift
|
||||
endif
|
||||
|
||||
#
|
||||
# output directory
|
||||
#
|
||||
|
||||
set OutDir = `basename $PatFile:r`.db
|
||||
|
||||
if (-d $OutDir) \rm -r $OutDir
|
||||
mkdir $OutDir
|
||||
|
||||
set OutLog = `basename $PatFile:r`.log
|
||||
|
||||
echo -n '' > $OutLog
|
||||
|
||||
alias Report 'egrep "^>" \!:1 | wc -l | awk -v P=`basename \!:1` -v H=\!:2 '"'{print H,P,"'$1}'"'"' >> $OutLog'
|
||||
|
||||
#
|
||||
# remove entries with bad symbols
|
||||
#
|
||||
|
||||
Notify "cleanup $ProtFile"
|
||||
|
||||
Report $ProtFile "init_size"
|
||||
|
||||
$AwkCmd -f $LIB_DIR/db.filter.sym.awk $ProtFile > P_$$
|
||||
|
||||
# log the number of entries left AFTER the symbol filter: count the
# headers of the cleaned file (P_$$), not of the raw input, which would
# just repeat the "init_size" value. The log label stays the basename
# of the input protein file, as in the "init_size" line.
egrep "^>" P_$$ | wc -l | awk -v P=`basename $ProtFile` -v H="cleanup_size" '{print H,P,$1}' >> $OutLog
|
||||
|
||||
#
|
||||
# select by name pattern
|
||||
#
|
||||
|
||||
Notify "select by patterns"
|
||||
|
||||
mkdir D_$$
|
||||
mkdir E_$$
|
||||
mkdir F_$$
|
||||
|
||||
set noms = `awk '{print $1}' $PatFile`
|
||||
|
||||
foreach nom ($noms)
|
||||
set pat = `egrep "^$nom " $PatFile | awk '{print $2}'`
|
||||
$AwkCmd -f $LIB_DIR/db.filter.pat.awk -v PAT="$pat" P_$$ > D_$$/$nom.fst
|
||||
set n = `egrep '^>' D_$$/$nom.fst | wc -l`
|
||||
Notify " pattern : $nom : $n"
|
||||
Report D_$$/$nom.fst "pattern_filter"
|
||||
if ($n <= $Sizmin) \rm -f D_$$/$nom.fst
|
||||
end
|
||||
|
||||
set ok = `ls D_$$ | wc -l`
|
||||
if ($ok == 0) goto fin
|
||||
|
||||
#
|
||||
# select by length
|
||||
#
|
||||
|
||||
Notify "select by length"
|
||||
|
||||
foreach f (D_$$/*.fst)
|
||||
set nom = `basename $f:r`
|
||||
$AwkCmd -f $LIB_DIR/db.getlen.awk $f > L_$$
|
||||
$LIB_DIR/db.filter.len.r L_$$ $Delta |\
|
||||
$AwkCmd '($NF == "TRUE") {print $2}' > M_$$
|
||||
$AwkCmd -v FILE=M_$$ -f $LIB_DIR/db.subdb.awk $f > E_$$/$nom.fst
|
||||
Report E_$$/$nom.fst "length_filter"
|
||||
set n = `egrep '^>' E_$$/$nom.fst | wc -l`
|
||||
Notify " length filter : $nom : $n"
|
||||
if ($n <= $Sizmin) \rm -f E_$$/$nom.fst
|
||||
end
|
||||
|
||||
set ok = `ls E_$$ | wc -l`
|
||||
if ($ok == 0) goto fin
|
||||
|
||||
|
||||
#
|
||||
# select by similarity
|
||||
#
|
||||
|
||||
Notify "select by similarity"
|
||||
|
||||
foreach f (E_$$/*.fst)
|
||||
set nom = `basename $f:r`
|
||||
|
||||
Notify " blasting $nom"
|
||||
|
||||
makeblastdb -dbtype 'prot' -in $f >>& db.log
|
||||
blastp -db $f -query $f -outfmt 7 > $f.blast.out
|
||||
\rm -f $f.p??
|
||||
|
||||
$AwkCmd -v COVMIN=$Covmin -v PMAX=$Pmax -v IDMIN=$Idmin \
|
||||
-f $LIB_DIR/db.blastlink.awk $f.blast.out |\
|
||||
$AwkCmd -f $LIB_DIR/db.todl.awk > G_$$
|
||||
|
||||
($LIB_DIR/db.cc.r G_$$ > $f.cc.txt) >>& db.log
|
||||
|
||||
# db.reportcc.awk calls asort(), a gawk extension, so run it through
# $AwkCmd like the other library awk scripts instead of the system awk.
# NOTE(review): assumes $AwkCmd resolves to a gawk-compatible awk - confirm in csh_init.sh
$AwkCmd -v NAME=$nom -f $LIB_DIR/db.reportcc.awk $f.cc.txt >> $OutLog
|
||||
|
||||
$AwkCmd -f $LIB_DIR/db.selcc.awk $f.cc.txt > S_$$
|
||||
$AwkCmd -v FILE=S_$$ -f $LIB_DIR/db.subdb.awk $f > F_$$/$nom.fst
|
||||
|
||||
Report F_$$/$nom.fst "similarity_filter"
|
||||
|
||||
set n = `egrep '^>' F_$$/$nom.fst | wc -l`
|
||||
Notify " blast filter : $nom : $n"
|
||||
if ($n <= $Sizmin) \rm -f F_$$/$nom.fst
|
||||
|
||||
end
|
||||
|
||||
# count what survived the similarity filter: its output lives in F_$$
# (D_$$ holds the pattern-filter output and is never empty at this point).
# An empty F_$$ must jump to 'fin', otherwise the F_$$/*.fst glob below
# has nothing to match.
set ok = `ls F_$$ | wc -l`
|
||||
if ($ok == 0) goto fin
|
||||
|
||||
#
|
||||
# annotations
|
||||
#
|
||||
|
||||
echo -n "" > J_$$
|
||||
|
||||
foreach f (F_$$/*.fst)
|
||||
$AwkCmd -f $LIB_DIR/db.annot.awk $f >> J_$$
|
||||
end
|
||||
|
||||
awk '(NF >= 3) {print $1, $NF}' $PatFile | sort > A_$$
|
||||
sort J_$$ | egrep -v '^ *$' > B_$$
|
||||
join A_$$ B_$$ > F_$$/Annot.lst
|
||||
|
||||
#
|
||||
# copy files
|
||||
#
|
||||
|
||||
set n = `ls F_$$/* | wc -l`
|
||||
Notify "copy $n files to $OutDir"
|
||||
|
||||
\mv -f F_$$/* $OutDir
|
||||
|
||||
#
|
||||
# end
|
||||
#
|
||||
|
||||
fin:
|
||||
Notify "output directory : $OutDir"
|
||||
|
||||
\rm -r ?_$$
|
||||
|
||||
|
||||
exit 0
|
39
detectors/cds/tools/chlorodb/subdb/lib/db.annot.awk
Normal file
39
detectors/cds/tools/chlorodb/subdb/lib/db.annot.awk
Normal file
@ -0,0 +1,39 @@
|
||||
#
|
||||
|
||||
/^>/ {
|
||||
N++
|
||||
na = split($1, a, "@")
|
||||
if (a[na-1] > NEXMAX) NEXMAX = a[na-1]
|
||||
NEX[a[na-1]]++
|
||||
ANNOT[$NF]++
|
||||
}
|
||||
|
||||
END {
|
||||
na = split(FILENAME, a, "/")
|
||||
na = split(a[na], a, "\\.")
|
||||
printf("%s %d ", a[1], N)
|
||||
s = ""
|
||||
for (i = 1 ; i <= NEXMAX ; i ++) {
|
||||
if (NEX[i] != 0)
|
||||
s = s "" i ":" NEX[i] "_"
|
||||
}
|
||||
gsub("_+$", "", s)
|
||||
printf("%s ", s)
|
||||
|
||||
s = (NEXMAX == 1) ? "MONEX" : "POLYEX"
|
||||
printf("%s ", s)
|
||||
|
||||
nmax = 0
|
||||
amax = "none"
|
||||
for (e in ANNOT) {
|
||||
if (ANNOT[e] > nmax) {
|
||||
nmax = ANNOT[e]
|
||||
amax = e
|
||||
}
|
||||
}
|
||||
print amax
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
48
detectors/cds/tools/chlorodb/subdb/lib/db.blastlink.awk
Normal file
48
detectors/cds/tools/chlorodb/subdb/lib/db.blastlink.awk
Normal file
@ -0,0 +1,48 @@
|
||||
#
|
||||
|
||||
function min(x, y) {
|
||||
return ((x < y) ? x : y)
|
||||
}
|
||||
|
||||
BEGIN {
|
||||
if (COVMIN == "") COVMIN = 50
|
||||
if (PMAX == "") PMAX = 1e-6
|
||||
if (IDMIN == "") IDMIN = 30
|
||||
}
|
||||
|
||||
/^#/ {
|
||||
hitnum = 0;
|
||||
next;
|
||||
}
|
||||
|
||||
{
|
||||
if ($1 == $2) next
|
||||
|
||||
hitnum++;
|
||||
|
||||
na = split($1, a, "@");
|
||||
if (na < 2) {
|
||||
print "query file not properly formatted" > "/dev/stderr"
|
||||
exit(1);
|
||||
}
|
||||
len1 = a[na];
|
||||
|
||||
na = split($2, a, "@");
|
||||
if (na < 2) {
|
||||
print "bank file not properly formatted" > "/dev/stderr"
|
||||
exit(1);
|
||||
}
|
||||
len2 = a[na];
|
||||
|
||||
id = $3 + 0.0;
|
||||
ali = $4;
|
||||
|
||||
covmin = ali * 100. / min(len1, len2);
|
||||
|
||||
proba = $11 + 0.0;
|
||||
|
||||
if ((covmin > COVMIN) && ((proba < PMAX) || (proba == 0)) && (id > IDMIN)) {
|
||||
print $1, $2, hitnum, id, covmin, proba, ali, len1, len2;
|
||||
}
|
||||
}
|
||||
|
18
detectors/cds/tools/chlorodb/subdb/lib/db.cc.r
Executable file
18
detectors/cds/tools/chlorodb/subdb/lib/db.cc.r
Executable file
@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
|
||||
require(igraph, warn.conflicts=F)
|
||||
|
||||
args <- commandArgs(T)
|
||||
path <- if(length(args) > 0) args[1] else 'graph.dl'
|
||||
|
||||
g <- read.graph(path, format='dl')
|
||||
|
||||
cc <- clusters(g)
|
||||
|
||||
res <- cbind(V(g)$name, membership(cc))
|
||||
|
||||
write.table(res, quote=FALSE, row.names=FALSE, col.names=FALSE)
|
||||
|
||||
quit(save="no")
|
||||
|
19
detectors/cds/tools/chlorodb/subdb/lib/db.filter.len.r
Executable file
19
detectors/cds/tools/chlorodb/subdb/lib/db.filter.len.r
Executable file
@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
|
||||
args <- commandArgs(T)
|
||||
path <- if(length(args) > 0) args[1] else 'len.txt'
|
||||
# relative length tolerance around the median (default 0.5).
# Command-line arguments arrive as character strings: convert to numeric
# here so the later '<= delta' test is a numeric comparison. R would
# otherwise coerce the numeric side to character and compare strings
# (e.g. 5e-05 <= "0.5" is FALSE).
delta <- if(length(args) > 1) as.numeric(args[2]) else 0.5
|
||||
|
||||
tab <- read.table(path, header=T)
|
||||
|
||||
lmed <- median(tab$len)
|
||||
|
||||
dlen <- lmed * as.numeric(delta)
|
||||
|
||||
tab$ok <- (abs(tab$len-lmed)/lmed) <= delta
|
||||
|
||||
write.table(tab, quote=F)
|
||||
|
||||
quit(save='no')
|
||||
|
10
detectors/cds/tools/chlorodb/subdb/lib/db.filter.pat.awk
Normal file
10
detectors/cds/tools/chlorodb/subdb/lib/db.filter.pat.awk
Normal file
@ -0,0 +1,10 @@
|
||||
#
|
||||
|
||||
/^>/ {
|
||||
split($1, a, "@")
|
||||
ok = a[3] ~ PAT
|
||||
}
|
||||
|
||||
ok {
|
||||
print $0
|
||||
}
|
30
detectors/cds/tools/chlorodb/subdb/lib/db.filter.sym.awk
Normal file
30
detectors/cds/tools/chlorodb/subdb/lib/db.filter.sym.awk
Normal file
@ -0,0 +1,30 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
|
||||
function Check(seq) {
|
||||
if (seq == "") return 0
|
||||
gsub("[ACDEFGHIKLMNPQRSTVWXY\n]+", "", seq)
|
||||
return (length(seq) == 0)
|
||||
}
|
||||
|
||||
/^>/ {
|
||||
if (Check(Seq)) {
|
||||
print Name
|
||||
printf("%s", Seq)
|
||||
}
|
||||
Name = $0
|
||||
Seq = ""
|
||||
next
|
||||
}
|
||||
|
||||
{
|
||||
Seq = Seq "" $0 "\n"
|
||||
}
|
||||
|
||||
END {
|
||||
if (Check(Seq)) {
|
||||
print Name
|
||||
printf("%s", Seq)
|
||||
}
|
||||
}
|
10
detectors/cds/tools/chlorodb/subdb/lib/db.getlen.awk
Normal file
10
detectors/cds/tools/chlorodb/subdb/lib/db.getlen.awk
Normal file
@ -0,0 +1,10 @@
|
||||
#
|
||||
BEGIN {
|
||||
print "id len"
|
||||
}
|
||||
|
||||
/^>/ {
|
||||
na = split($1, a, "@")
|
||||
print substr($1, 2), a[na]
|
||||
}
|
||||
|
15
detectors/cds/tools/chlorodb/subdb/lib/db.reportcc.awk
Normal file
15
detectors/cds/tools/chlorodb/subdb/lib/db.reportcc.awk
Normal file
@ -0,0 +1,15 @@
|
||||
#
|
||||
#
|
||||
|
||||
{
|
||||
cnt[$NF]++
|
||||
}
|
||||
|
||||
END {
|
||||
n = asort(cnt)
|
||||
printf("cc_size %s", NAME)
|
||||
for (i = n ; i >= 1 ; i--)
|
||||
printf(" %d", cnt[i])
|
||||
print ""
|
||||
}
|
||||
|
19
detectors/cds/tools/chlorodb/subdb/lib/db.selcc.awk
Normal file
19
detectors/cds/tools/chlorodb/subdb/lib/db.selcc.awk
Normal file
@ -0,0 +1,19 @@
|
||||
#
|
||||
|
||||
{
|
||||
N[$NF]++
|
||||
E[$NF, N[$NF]] = $1
|
||||
}
|
||||
|
||||
END {
|
||||
cmax = 1
|
||||
nmax = N[1]
|
||||
for (i in N) {
|
||||
if (N[i] > nmax) {
|
||||
nmax = N[i]
|
||||
cmax = i
|
||||
}
|
||||
}
|
||||
for (i = 1 ; i <= nmax ; i++)
|
||||
print E[cmax, i]
|
||||
}
|
17
detectors/cds/tools/chlorodb/subdb/lib/db.subdb.awk
Normal file
17
detectors/cds/tools/chlorodb/subdb/lib/db.subdb.awk
Normal file
@ -0,0 +1,17 @@
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
if (FILE == "") FILE = "db.sel.txt"
|
||||
while (getline < FILE)
|
||||
INC[$1] = $1
|
||||
close(FILE)
|
||||
}
|
||||
|
||||
/^>/ {
|
||||
name = substr($1, 2)
|
||||
ok = name in INC
|
||||
}
|
||||
|
||||
ok {
|
||||
print $0
|
||||
}
|
21
detectors/cds/tools/chlorodb/subdb/lib/db.todl.awk
Normal file
21
detectors/cds/tools/chlorodb/subdb/lib/db.todl.awk
Normal file
@ -0,0 +1,21 @@
|
||||
#
|
||||
|
||||
{
|
||||
node[$1]++
|
||||
node[$2]++
|
||||
link[++M] = $1 " " $2
|
||||
}
|
||||
|
||||
|
||||
END {
|
||||
for (n in node)
|
||||
N++
|
||||
print "DL n=" N
|
||||
print "format = edgelist1"
|
||||
print "labels embedded:"
|
||||
print "data:"
|
||||
for (i = 1 ; i <= M ; i++)
|
||||
print link[i]
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user