cds/tools/chlorodb added
Former-commit-id: 0579e878a69b7c285ca71870e9ca5730649a2fda Former-commit-id: 7cced5b488441d87bf070a9a444317db0e048880
This commit is contained in:
254
detectors/cds/tools/chlorodb/go_chlorodb.sh
Executable file
254
detectors/cds/tools/chlorodb/go_chlorodb.sh
Executable file
@ -0,0 +1,254 @@
|
||||
#!/bin/csh -f
|
||||
#
|
||||
# make ChloroDB's
|
||||
#
|
||||
# usage: copy genbank/embl files into 'DB_DIR/download'
|
||||
# usage: [create a paramter.sh file in 'DB_DIR']
|
||||
# usage: go_chlorodb [DB_DIR]
|
||||
#
|
||||
unsetenv ORG_SOURCED
|
||||
|
||||
setenv ORG_HOME `dirname $0`/../../../..
|
||||
source $ORG_HOME/scripts/csh_init.sh
|
||||
|
||||
#
|
||||
# which DB to process
|
||||
#
|
||||
|
||||
set DB_BASE = $DATA_DIR/cds/chlorodb # default location
|
||||
|
||||
if ($#Argv > 0) then
|
||||
set DB_BASE = $Argv[1]; Shift
|
||||
endif
|
||||
|
||||
set DB_BASE = `cd $DB_BASE && pwd -P`
|
||||
|
||||
NeedDir $DB_BASE/download
|
||||
|
||||
if (! -d $DB_BASE/info) mkdir $DB_BASE/info
|
||||
if (! -d $DB_BASE/fasta) mkdir $DB_BASE/fasta
|
||||
|
||||
cd $DB_BASE/info
|
||||
|
||||
#
|
||||
# params
|
||||
#
|
||||
|
||||
if (! -e $DB_BASE/parameters.sh) then
|
||||
@ n = `find $DB_BASE/download -depth 1 -type f -print | wc -l`
|
||||
@ cor_cutoff = $n / 2
|
||||
@ atg_cutoff = $n / 10
|
||||
@ dbs_cutoff = $n / 4
|
||||
if ($cor_cutoff == 0) @ cor_cutoff = 1
|
||||
if ($atg_cutoff == 0) @ atg_cutoff = 1
|
||||
if ($dbs_cutoff == 0) @ dbs_cutoff = 1
|
||||
echo "# sourced file" > $DB_BASE/parameters.sh
|
||||
echo "" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_NCDS_CUTOFF = $cor_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_START_ATG_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_START_DFT_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_START_OTH_CUTOFF = 10" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_STOP_CUTOFF = $cor_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_SPLICE_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "" >> $DB_BASE/parameters.sh
|
||||
echo "set SHEL_NCDS_CUTOFF = 10" >> $DB_BASE/parameters.sh
|
||||
echo "" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_DELTA = Inf" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_COVMIN = 30" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_PMAX = 1e-6" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_IDMIN = 30" >> $DB_BASE/parameters.sh
|
||||
echo "set CORE_SIZMIN = $cor_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "" >> $DB_BASE/parameters.sh
|
||||
echo "set SHEL_DELTA = 0.5" >> $DB_BASE/parameters.sh
|
||||
echo "set SHEL_COVMIN = 30" >> $DB_BASE/parameters.sh
|
||||
echo "set SHEL_PMAX = 1e-6" >> $DB_BASE/parameters.sh
|
||||
echo "set SHEL_IDMIN = 30" >> $DB_BASE/parameters.sh
|
||||
echo "set SHEL_SIZMIN = $dbs_cutoff" >> $DB_BASE/parameters.sh
|
||||
echo "" >> $DB_BASE/parameters.sh
|
||||
echo "set DUST_DELTA = 0.5" >> $DB_BASE/parameters.sh
|
||||
echo "set DUST_COVMIN = 30" >> $DB_BASE/parameters.sh
|
||||
echo "set DUST_PMAX = 1e-6" >> $DB_BASE/parameters.sh
|
||||
echo "set DUST_IDMIN = 30" >> $DB_BASE/parameters.sh
|
||||
echo "set DUST_SIZMIN = 10" >> $DB_BASE/parameters.sh
|
||||
|
||||
endif
|
||||
|
||||
source $DB_BASE/parameters.sh
|
||||
|
||||
##set CMIN_COD = 0
|
||||
##set FMIN_COD = 0.01
|
||||
|
||||
#
|
||||
# temporarily uncompress
|
||||
#
|
||||
|
||||
set ff = `find $DB_BASE/download -depth 1 -name \*.gz -print`
|
||||
|
||||
if ($#ff != 0) then
|
||||
Notify "uncompressing $#ff entries"
|
||||
foreach f ($ff)
|
||||
gunzip -f $f
|
||||
end
|
||||
endif
|
||||
|
||||
#
|
||||
# convert gbk/embl to fasta
|
||||
#
|
||||
|
||||
set ff = `find $DB_BASE/download -depth 1 \( -name \*.gbk -or -name \*.embl \) -print`
|
||||
|
||||
Notify "convert $#ff gbk/embl entries to fasta"
|
||||
|
||||
foreach f ($ff)
|
||||
set nom = `basename $f:r`
|
||||
set typ = $f:e
|
||||
$AwkCmd -f $LIB_DIR/$typ.tofasta.awk $f > $DB_BASE/fasta/$nom.fst
|
||||
end
|
||||
|
||||
#
|
||||
# get gbk/embl info
|
||||
#
|
||||
|
||||
Notify "get gbk/embl info for $#ff entries"
|
||||
|
||||
echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.info.awk > db.info.txt # just get header
|
||||
|
||||
foreach f ($ff)
|
||||
set nom = `basename $f:r`
|
||||
set typ = $f:e
|
||||
$AwkCmd -f $LIB_DIR/$typ.oneliner.awk $f |\
|
||||
$AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/$typ.info.awk |\
|
||||
egrep -v '^#' >> db.info.txt
|
||||
end
|
||||
|
||||
#
|
||||
# get cds info
|
||||
#
|
||||
|
||||
Notify "get gbk/embl cds for $#ff entries"
|
||||
|
||||
echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.cds_long.awk > db.cds.txt # just get header
|
||||
|
||||
foreach f ($ff)
|
||||
set nom = `basename $f:r`
|
||||
set typ = $f:e
|
||||
$AwkCmd -f $LIB_DIR/$typ.oneliner.awk $f |\
|
||||
$AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
|
||||
-f $LIB_DIR/$typ.cds_long.awk |\
|
||||
egrep -v '^#' >> db.cds.txt
|
||||
end
|
||||
|
||||
#
|
||||
# get fasta for prots
|
||||
#
|
||||
|
||||
Notify "get prots"
|
||||
$AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/cds2fasta.awk db.cds.txt > db.prot.fst
|
||||
|
||||
#
|
||||
# get introns
|
||||
#
|
||||
|
||||
Notify "get gbk/embl introns for $#ff entries"
|
||||
|
||||
echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.intron.awk > db.intron.txt # just get header
|
||||
|
||||
foreach f ($ff)
|
||||
set nom = `basename $f:r`
|
||||
set typ = $f:e
|
||||
$AwkCmd -f $LIB_DIR/$typ.oneliner.awk $f |\
|
||||
$AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
|
||||
-f $LIB_DIR/$typ.intron.awk |\
|
||||
egrep -v '^#' >> db.intron.txt
|
||||
end
|
||||
|
||||
#
|
||||
# make models
|
||||
#
|
||||
|
||||
Notify "Making models"
|
||||
|
||||
echo -n "" > db.models.params.txt
|
||||
echo "CORE_NCDS_CUTOFF <- $CORE_NCDS_CUTOFF" >> db.models.params.txt
|
||||
echo "CORE_START_ATG_CUTOFF <- $CORE_START_ATG_CUTOFF" >> db.models.params.txt
|
||||
echo "CORE_START_DFT_CUTOFF <- $CORE_START_DFT_CUTOFF" >> db.models.params.txt
|
||||
echo "CORE_START_OTH_CUTOFF <- $CORE_START_OTH_CUTOFF" >> db.models.params.txt
|
||||
echo "CORE_STOP_CUTOFF <- $CORE_STOP_CUTOFF" >> db.models.params.txt
|
||||
echo "CORE_SPLICE_CUTOFF <- $CORE_SPLICE_CUTOFF" >> db.models.params.txt
|
||||
echo "SHEL_NCDS_CUTOFF <- $SHEL_NCDS_CUTOFF" >> db.models.params.txt
|
||||
|
||||
$LIB_DIR/make.models.r |& Cat
|
||||
|
||||
GetStatus
|
||||
OnError then
|
||||
Error 2 "model parameter too stringent"
|
||||
endif
|
||||
|
||||
#
|
||||
# add matrices
|
||||
#
|
||||
|
||||
cp -f $PROG_DIR/matrices/* models
|
||||
|
||||
#
|
||||
# make subDBs
|
||||
#
|
||||
|
||||
if (-e db.core.pat.txt) then
|
||||
Notify "Making core DB (take some time... please wait)"
|
||||
$PROG_DIR/subdb/go_subdb.sh db.prot.fst db.core.pat.txt \
|
||||
$CORE_DELTA $CORE_COVMIN $CORE_PMAX $CORE_IDMIN $CORE_SIZMIN
|
||||
endif
|
||||
|
||||
if (-e db.shell.pat.txt) then
|
||||
Notify "Making shell DB (take some time... please wait)"
|
||||
$PROG_DIR/subdb/go_subdb.sh db.prot.fst db.shell.pat.txt \
|
||||
$SHEL_DELTA $SHEL_COVMIN $SHEL_PMAX $SHEL_IDMIN $SHEL_SIZMIN
|
||||
endif
|
||||
|
||||
if (-e db.dust.pat.txt) then
|
||||
Notify "Making dust DB (take some time... please wait)"
|
||||
$PROG_DIR/subdb/go_subdb.sh db.prot.fst db.dust.pat.txt \
|
||||
$DUST_DELTA $DUST_COVMIN $DUST_PMAX $DUST_IDMIN $DUST_SIZMIN
|
||||
endif
|
||||
|
||||
#
|
||||
# recompress entries
|
||||
#
|
||||
|
||||
set ff = `find $DB_BASE/download -depth 1 -type f -print`
|
||||
|
||||
if ($#ff != 0) then
|
||||
Notify "recompressing $#ff entries"
|
||||
foreach f ($ff)
|
||||
gzip -f $f
|
||||
end
|
||||
endif
|
||||
|
||||
# compress fasta
|
||||
|
||||
set ff = `find $DB_BASE/fasta -depth 1 -name \*.fst -print`
|
||||
|
||||
if ($#ff != 0) then
|
||||
Notify "compressing $#ff fasta entries"
|
||||
foreach f ($ff)
|
||||
gzip -f $f
|
||||
end
|
||||
endif
|
||||
|
||||
# install everything in proper directory
|
||||
|
||||
foreach dir ("core" "shell" "dust")
|
||||
if (-e $DB_BASE/$dir) \rm -r $DB_BASE/$dir
|
||||
if ((-d db.$dir.pat.db) && (-e db.$dir.pat.db/Annot.lst)) then
|
||||
Notify "installing $DB_BASE/$dir"
|
||||
\mv -f db.$dir.pat.db $DB_BASE/$dir
|
||||
endif
|
||||
end
|
||||
|
||||
if (-e $DB_BASE/models) \rm -r $DB_BASE/models
|
||||
if (-d models) \mv -f models $DB_BASE
|
||||
|
||||
Notify "Done"
|
||||
exit 0
|
||||
|
29
detectors/cds/tools/chlorodb/matrices/blosum62.mat
Normal file
29
detectors/cds/tools/chlorodb/matrices/blosum62.mat
Normal file
@ -0,0 +1,29 @@
|
||||
#
|
||||
# blosum62 substitution matrix
|
||||
# with larger penalty for stops
|
||||
#
|
||||
A R N D C Q E G H I L K M F P S T W Y V B Z X *
|
||||
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -50
|
||||
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -50
|
||||
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -50
|
||||
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -50
|
||||
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -50
|
||||
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -50
|
||||
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -50
|
||||
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -50
|
||||
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -50
|
||||
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -50
|
||||
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -50
|
||||
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -50
|
||||
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -50
|
||||
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -50
|
||||
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -50
|
||||
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -50
|
||||
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -50
|
||||
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -50
|
||||
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -50
|
||||
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -50
|
||||
B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -50
|
||||
Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -50
|
||||
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -50
|
||||
* -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 1
|
195
detectors/cds/tools/chlorodb/subdb/go_subdb.sh
Executable file
195
detectors/cds/tools/chlorodb/subdb/go_subdb.sh
Executable file
@ -0,0 +1,195 @@
|
||||
#!/bin/csh -f
|
||||
#
|
||||
# usage: go_subdb.sh prot.fst pat.txt [deltalen covmin pmax idmin sizmin]
|
||||
# usage: prot.fst : proteins fasta file
|
||||
# usage: pat.txt : text file containing patterns and names for families to extract
|
||||
# usage: output directory containig subdbs : basename <pat:r>.db
|
||||
#
|
||||
|
||||
unsetenv ORG_SOURCED
|
||||
|
||||
setenv ORG_HOME `dirname $0`/../../../../..
|
||||
source $ORG_HOME/scripts/csh_init.sh
|
||||
|
||||
NeedArg 2
|
||||
|
||||
set ProtFile = $Argv[1]; Shift
|
||||
set PatFile = $Argv[1]; Shift
|
||||
|
||||
NeedFile $ProtFile
|
||||
NeedFile $PatFile
|
||||
|
||||
#
|
||||
# parameters
|
||||
#
|
||||
|
||||
set Delta = 0.5
|
||||
set Covmin = 30
|
||||
set Pmax = 1e-6
|
||||
set Idmin = 30
|
||||
set Sizmin = 5
|
||||
|
||||
if ($#Argv > 0) then
|
||||
set Delta = $Argv[1]; Shift
|
||||
endif
|
||||
|
||||
if ($#Argv > 0) then
|
||||
set Covmin = $Argv[1]; Shift
|
||||
endif
|
||||
|
||||
if ($#Argv > 0) then
|
||||
set Pmax = $Argv[1]; Shift
|
||||
endif
|
||||
|
||||
if ($#Argv > 0) then
|
||||
set Idmin = $Argv[1]; Shift
|
||||
endif
|
||||
|
||||
if ($#Argv > 0) then
|
||||
set Sizmin = $Argv[1]; Shift
|
||||
endif
|
||||
|
||||
#
|
||||
# output directory
|
||||
#
|
||||
|
||||
set OutDir = `basename $PatFile:r`.db
|
||||
|
||||
if (-d $OutDir) \rm -r $OutDir
|
||||
mkdir $OutDir
|
||||
|
||||
set OutLog = `basename $PatFile:r`.log
|
||||
|
||||
echo -n '' > $OutLog
|
||||
|
||||
alias Report 'egrep "^>" \!:1 | wc -l | awk -v P=`basename \!:1` -v H=\!:2 '"'{print H,P,"'$1}'"'"' >> $OutLog'
|
||||
|
||||
#
|
||||
# remove entries with bad symbols
|
||||
#
|
||||
|
||||
Notify "cleanup $ProtFile"
|
||||
|
||||
Report $ProtFile "init_size"
|
||||
|
||||
$AwkCmd -f $LIB_DIR/db.filter.sym.awk $ProtFile > P_$$
|
||||
|
||||
Report $ProtFile "cleanup_size"
|
||||
|
||||
#
|
||||
# select by name pattern
|
||||
#
|
||||
|
||||
Notify "select by patterns"
|
||||
|
||||
mkdir D_$$
|
||||
mkdir E_$$
|
||||
mkdir F_$$
|
||||
|
||||
set noms = `awk '{print $1}' $PatFile`
|
||||
|
||||
foreach nom ($noms)
|
||||
set pat = `egrep "^$nom " $PatFile | awk '{print $2}'`
|
||||
$AwkCmd -f $LIB_DIR/db.filter.pat.awk -v PAT="$pat" P_$$ > D_$$/$nom.fst
|
||||
set n = `egrep '^>' D_$$/$nom.fst | wc -l`
|
||||
Notify " pattern : $nom : $n"
|
||||
Report D_$$/$nom.fst "pattern_filter"
|
||||
if ($n <= $Sizmin) \rm -f D_$$/$nom.fst
|
||||
end
|
||||
|
||||
set ok = `ls D_$$ | wc -l`
|
||||
if ($ok == 0) goto fin
|
||||
|
||||
#
|
||||
# select by length
|
||||
#
|
||||
|
||||
Notify "select by length"
|
||||
|
||||
foreach f (D_$$/*.fst)
|
||||
set nom = `basename $f:r`
|
||||
$AwkCmd -f $LIB_DIR/db.getlen.awk $f > L_$$
|
||||
$LIB_DIR/db.filter.len.r L_$$ $Delta |\
|
||||
$AwkCmd '($NF == "TRUE") {print $2}' > M_$$
|
||||
$AwkCmd -v FILE=M_$$ -f $LIB_DIR/db.subdb.awk $f > E_$$/$nom.fst
|
||||
Report E_$$/$nom.fst "length_filter"
|
||||
set n = `egrep '^>' E_$$/$nom.fst | wc -l`
|
||||
Notify " length filter : $nom : $n"
|
||||
if ($n <= $Sizmin) \rm -f E_$$/$nom.fst
|
||||
end
|
||||
|
||||
set ok = `ls E_$$ | wc -l`
|
||||
if ($ok == 0) goto fin
|
||||
|
||||
|
||||
#
|
||||
# select by similarity
|
||||
#
|
||||
|
||||
Notify "select by similarity"
|
||||
|
||||
foreach f (E_$$/*.fst)
|
||||
set nom = `basename $f:r`
|
||||
|
||||
Notify " blasting $nom"
|
||||
|
||||
makeblastdb -dbtype 'prot' -in $f >>& db.log
|
||||
blastp -db $f -query $f -outfmt 7 > $f.blast.out
|
||||
\rm -f $f.p??
|
||||
|
||||
$AwkCmd -v COVMIN=$Covmin -v PMAX=$Pmax -v IDMIN=$Idmin \
|
||||
-f $LIB_DIR/db.blastlink.awk $f.blast.out |\
|
||||
$AwkCmd -f $LIB_DIR/db.todl.awk > G_$$
|
||||
|
||||
($LIB_DIR/db.cc.r G_$$ > $f.cc.txt) >>& db.log
|
||||
|
||||
awk -v NAME=$nom -f $LIB_DIR/db.reportcc.awk $f.cc.txt >> $OutLog
|
||||
|
||||
$AwkCmd -f $LIB_DIR/db.selcc.awk $f.cc.txt > S_$$
|
||||
$AwkCmd -v FILE=S_$$ -f $LIB_DIR/db.subdb.awk $f > F_$$/$nom.fst
|
||||
|
||||
Report F_$$/$nom.fst "similarity_filter"
|
||||
|
||||
set n = `egrep '^>' F_$$/$nom.fst | wc -l`
|
||||
Notify " blast filter : $nom : $n"
|
||||
if ($n <= $Sizmin) \rm -f F_$$/$nom.fst
|
||||
|
||||
end
|
||||
|
||||
set ok = `ls D_$$ | wc -l`
|
||||
if ($ok == 0) goto fin
|
||||
|
||||
#
|
||||
# annotations
|
||||
#
|
||||
|
||||
echo -n "" > J_$$
|
||||
|
||||
foreach f (F_$$/*.fst)
|
||||
$AwkCmd -f $LIB_DIR/db.annot.awk $f >> J_$$
|
||||
end
|
||||
|
||||
awk '(NF >= 3) {print $1, $NF}' $PatFile | sort > A_$$
|
||||
sort J_$$ | egrep -v '^ *$' > B_$$
|
||||
join A_$$ B_$$ > F_$$/Annot.lst
|
||||
|
||||
#
|
||||
# copy files
|
||||
#
|
||||
|
||||
set n = `ls F_$$/* | wc -l`
|
||||
Notify "copy $n files to $OutDir"
|
||||
|
||||
\mv -f F_$$/* $OutDir
|
||||
|
||||
#
|
||||
# end
|
||||
#
|
||||
|
||||
fin:
|
||||
Notify "output directory : $OutDir"
|
||||
|
||||
\rm -r ?_$$
|
||||
|
||||
|
||||
exit 0
|
39
detectors/cds/tools/chlorodb/subdb/lib/db.annot.awk
Normal file
39
detectors/cds/tools/chlorodb/subdb/lib/db.annot.awk
Normal file
@ -0,0 +1,39 @@
|
||||
#
|
||||
|
||||
/^>/ {
|
||||
N++
|
||||
na = split($1, a, "@")
|
||||
if (a[na-1] > NEXMAX) NEXMAX = a[na-1]
|
||||
NEX[a[na-1]]++
|
||||
ANNOT[$NF]++
|
||||
}
|
||||
|
||||
END {
|
||||
na = split(FILENAME, a, "/")
|
||||
na = split(a[na], a, "\\.")
|
||||
printf("%s %d ", a[1], N)
|
||||
s = ""
|
||||
for (i = 1 ; i <= NEXMAX ; i ++) {
|
||||
if (NEX[i] != 0)
|
||||
s = s "" i ":" NEX[i] "_"
|
||||
}
|
||||
gsub("_+$", "", s)
|
||||
printf("%s ", s)
|
||||
|
||||
s = (NEXMAX == 1) ? "MONEX" : "POLYEX"
|
||||
printf("%s ", s)
|
||||
|
||||
nmax = 0
|
||||
amax = "none"
|
||||
for (e in ANNOT) {
|
||||
if (ANNOT[e] > nmax) {
|
||||
nmax = ANNOT[e]
|
||||
amax = e
|
||||
}
|
||||
}
|
||||
print amax
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
48
detectors/cds/tools/chlorodb/subdb/lib/db.blastlink.awk
Normal file
48
detectors/cds/tools/chlorodb/subdb/lib/db.blastlink.awk
Normal file
@ -0,0 +1,48 @@
|
||||
#
|
||||
|
||||
function min(x, y) {
|
||||
return ((x < y) ? x : y)
|
||||
}
|
||||
|
||||
BEGIN {
|
||||
if (COVMIN == "") COVMIN = 50
|
||||
if (PMAX == "") PMAX = 1e-6
|
||||
if (IDMIN == "") IDMIN = 30
|
||||
}
|
||||
|
||||
/^#/ {
|
||||
hitnum = 0;
|
||||
next;
|
||||
}
|
||||
|
||||
{
|
||||
if ($1 == $2) next
|
||||
|
||||
hitnum++;
|
||||
|
||||
na = split($1, a, "@");
|
||||
if (na < 2) {
|
||||
print "query file not properly formatted" > "/dev/stderr"
|
||||
exit(1);
|
||||
}
|
||||
len1 = a[na];
|
||||
|
||||
na = split($2, a, "@");
|
||||
if (na < 2) {
|
||||
print "bank file not properly formatted" > "/dev/stderr"
|
||||
exit(1);
|
||||
}
|
||||
len2 = a[na];
|
||||
|
||||
id = $3 + 0.0;
|
||||
ali = $4;
|
||||
|
||||
covmin = ali * 100. / min(len1, len2);
|
||||
|
||||
proba = $11 + 0.0;
|
||||
|
||||
if ((covmin > COVMIN) && ((proba < PMAX) || (proba == 0)) && (id > IDMIN)) {
|
||||
print $1, $2, hitnum, id, covmin, proba, ali, len1, len2;
|
||||
}
|
||||
}
|
||||
|
18
detectors/cds/tools/chlorodb/subdb/lib/db.cc.r
Executable file
18
detectors/cds/tools/chlorodb/subdb/lib/db.cc.r
Executable file
@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
|
||||
require(igraph, warn.conflicts=F)
|
||||
|
||||
args <- commandArgs(T)
|
||||
path <- if(length(args) > 0) args[1] else 'graph.dl'
|
||||
|
||||
g <- read.graph(path, format='dl')
|
||||
|
||||
cc <- clusters(g)
|
||||
|
||||
res <- cbind(V(g)$name, membership(cc))
|
||||
|
||||
write.table(res, quote=FALSE, row.names=FALSE, col.names=FALSE)
|
||||
|
||||
quit(save="no")
|
||||
|
19
detectors/cds/tools/chlorodb/subdb/lib/db.filter.len.r
Executable file
19
detectors/cds/tools/chlorodb/subdb/lib/db.filter.len.r
Executable file
@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
|
||||
args <- commandArgs(T)
|
||||
path <- if(length(args) > 0) args[1] else 'len.txt'
|
||||
delta <- if(length(args) > 1) args[2] else 0.5
|
||||
|
||||
tab <- read.table(path, header=T)
|
||||
|
||||
lmed <- median(tab$len)
|
||||
|
||||
dlen <- lmed * as.numeric(delta)
|
||||
|
||||
tab$ok <- (abs(tab$len-lmed)/lmed) <= delta
|
||||
|
||||
write.table(tab, quote=F)
|
||||
|
||||
quit(save='no')
|
||||
|
10
detectors/cds/tools/chlorodb/subdb/lib/db.filter.pat.awk
Normal file
10
detectors/cds/tools/chlorodb/subdb/lib/db.filter.pat.awk
Normal file
@ -0,0 +1,10 @@
|
||||
#
|
||||
|
||||
/^>/ {
|
||||
split($1, a, "@")
|
||||
ok = a[3] ~ PAT
|
||||
}
|
||||
|
||||
ok {
|
||||
print $0
|
||||
}
|
30
detectors/cds/tools/chlorodb/subdb/lib/db.filter.sym.awk
Normal file
30
detectors/cds/tools/chlorodb/subdb/lib/db.filter.sym.awk
Normal file
@ -0,0 +1,30 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
|
||||
function Check(seq) {
|
||||
if (seq == "") return 0
|
||||
gsub("[ACDEFGHIKLMNPQRSTVWXY\n]+", "", seq)
|
||||
return (length(seq) == 0)
|
||||
}
|
||||
|
||||
/^>/ {
|
||||
if (Check(Seq)) {
|
||||
print Name
|
||||
printf("%s", Seq)
|
||||
}
|
||||
Name = $0
|
||||
Seq = ""
|
||||
next
|
||||
}
|
||||
|
||||
{
|
||||
Seq = Seq "" $0 "\n"
|
||||
}
|
||||
|
||||
END {
|
||||
if (Check(Seq)) {
|
||||
print Name
|
||||
printf("%s", Seq)
|
||||
}
|
||||
}
|
10
detectors/cds/tools/chlorodb/subdb/lib/db.getlen.awk
Normal file
10
detectors/cds/tools/chlorodb/subdb/lib/db.getlen.awk
Normal file
@ -0,0 +1,10 @@
|
||||
#
|
||||
BEGIN {
|
||||
print "id len"
|
||||
}
|
||||
|
||||
/^>/ {
|
||||
na = split($1, a, "@")
|
||||
print substr($1, 2), a[na]
|
||||
}
|
||||
|
15
detectors/cds/tools/chlorodb/subdb/lib/db.reportcc.awk
Normal file
15
detectors/cds/tools/chlorodb/subdb/lib/db.reportcc.awk
Normal file
@ -0,0 +1,15 @@
|
||||
#
|
||||
#
|
||||
|
||||
{
|
||||
cnt[$NF]++
|
||||
}
|
||||
|
||||
END {
|
||||
n = asort(cnt)
|
||||
printf("cc_size %s", NAME)
|
||||
for (i = n ; i >= 1 ; i--)
|
||||
printf(" %d", cnt[i])
|
||||
print ""
|
||||
}
|
||||
|
19
detectors/cds/tools/chlorodb/subdb/lib/db.selcc.awk
Normal file
19
detectors/cds/tools/chlorodb/subdb/lib/db.selcc.awk
Normal file
@ -0,0 +1,19 @@
|
||||
#
|
||||
|
||||
{
|
||||
N[$NF]++
|
||||
E[$NF, N[$NF]] = $1
|
||||
}
|
||||
|
||||
END {
|
||||
cmax = 1
|
||||
nmax = N[1]
|
||||
for (i in N) {
|
||||
if (N[i] > nmax) {
|
||||
nmax = N[i]
|
||||
cmax = i
|
||||
}
|
||||
}
|
||||
for (i = 1 ; i <= nmax ; i++)
|
||||
print E[cmax, i]
|
||||
}
|
17
detectors/cds/tools/chlorodb/subdb/lib/db.subdb.awk
Normal file
17
detectors/cds/tools/chlorodb/subdb/lib/db.subdb.awk
Normal file
@ -0,0 +1,17 @@
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
if (FILE == "") FILE = "db.sel.txt"
|
||||
while (getline < FILE)
|
||||
INC[$1] = $1
|
||||
close(FILE)
|
||||
}
|
||||
|
||||
/^>/ {
|
||||
name = substr($1, 2)
|
||||
ok = name in INC
|
||||
}
|
||||
|
||||
ok {
|
||||
print $0
|
||||
}
|
21
detectors/cds/tools/chlorodb/subdb/lib/db.todl.awk
Normal file
21
detectors/cds/tools/chlorodb/subdb/lib/db.todl.awk
Normal file
@ -0,0 +1,21 @@
|
||||
#
|
||||
|
||||
{
|
||||
node[$1]++
|
||||
node[$2]++
|
||||
link[++M] = $1 " " $2
|
||||
}
|
||||
|
||||
|
||||
END {
|
||||
for (n in node)
|
||||
N++
|
||||
print "DL n=" N
|
||||
print "format = edgelist1"
|
||||
print "labels embedded:"
|
||||
print "data:"
|
||||
for (i = 1 ; i <= M ; i++)
|
||||
print link[i]
|
||||
}
|
||||
|
||||
|
@ -47,8 +47,8 @@ $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/$PrdType.cds_short.awk > P_$$
|
||||
|
||||
Notify "compare bank to predictions"
|
||||
|
||||
$AwkCmd -f $LIB_DIR/libnws.awk \
|
||||
-f $LIB_DIR/compareCds.awk \
|
||||
$AwkCmd -f $LIB_DIR/libnws.awk \
|
||||
-f $LIB_DIR/compare.cds.awk \
|
||||
R_$$ P_$$ > S_$$
|
||||
|
||||
# base statistics
|
||||
|
@ -15,6 +15,8 @@ NeedArg 1
|
||||
|
||||
egrep '^#|^MATCH' $* | awk -f $LIB_DIR/summary.cmp.awk > compare.txt
|
||||
|
||||
Notify "text file: compare.txt"
|
||||
|
||||
$LIB_DIR/summarize_cmp.r
|
||||
|
||||
|
||||
|
19
detectors/cds/tools/lib/cds2fasta.awk
Normal file
19
detectors/cds/tools/lib/cds2fasta.awk
Normal file
@ -0,0 +1,19 @@
|
||||
#
|
||||
# get fasta sequence from cds list
|
||||
#
|
||||
# [-v FIELD=13] CDS sequence
|
||||
# [-v FIELD=14] Prot Sequence
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
if (CHARPERLINE == "") CHARPERLINE = 50
|
||||
if (FIELD == "") FIELD = 14
|
||||
}
|
||||
|
||||
/^#/ { next }
|
||||
|
||||
{
|
||||
name = $1 "@" $2 "@" $3 "@" $5 "@" $6 "@" $7 "@" $8 "@" int($9/3)
|
||||
comment = $NF
|
||||
PrintFasta($FIELD, name " " comment)
|
||||
}
|
97
detectors/cds/tools/lib/embl.cds_long.awk
Normal file
97
detectors/cds/tools/lib/embl.cds_long.awk
Normal file
@ -0,0 +1,97 @@
|
||||
#
|
||||
# get cds features from embl (long version)
|
||||
#
|
||||
# -v FASTA
|
||||
|
||||
# @include lib.embl.awk
|
||||
|
||||
|
||||
BEGIN {
|
||||
print "#locus locustag genefam gene from to strand nexon length status start stop dnaseq protseq product"
|
||||
|
||||
if (HEADONLY != "") exit(0)
|
||||
|
||||
if (MAXSPAN == "") MAXSPAN = 10000
|
||||
|
||||
if (FASTA == "") Error("No FASTA file specified", 1)
|
||||
|
||||
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
|
||||
|
||||
Seq = tolower(ReadFasta(FASTA))
|
||||
}
|
||||
|
||||
/^ID / {
|
||||
locus = $2
|
||||
gsub(";", "", locus)
|
||||
incds = 0
|
||||
next
|
||||
}
|
||||
|
||||
/^FT CDS/ {
|
||||
revstrand = match($3, "^complement")
|
||||
s = substr($0, 22)
|
||||
gsub("^complement", "", s)
|
||||
ok = ! match(s, "complement|order")
|
||||
nexon = Nexons(s)
|
||||
SpanLocation(s, sloc)
|
||||
spanlen = sloc[2] - sloc[1] + 1
|
||||
len = LenLocation(s)
|
||||
ok = ok && (len < MAXSPAN)
|
||||
cdsseq = ok ? SeqLocation(Seq, s, revstrand) : "XXX"
|
||||
cstart = substr(cdsseq, 1,3)
|
||||
cstop = substr(cdsseq, length(cdsseq)-2)
|
||||
|
||||
gene = "none"
|
||||
locustag = "none"
|
||||
product = "none"
|
||||
translation = "X"
|
||||
incds = 1
|
||||
next
|
||||
}
|
||||
|
||||
(incds && /^FT [^ ]/) {
|
||||
print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),
|
||||
nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
|
||||
incds = 0
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/gene=/ {
|
||||
split($0, a, "=")
|
||||
gene = a[2]
|
||||
gsub("^[^a-z,A-Z]+", "", gene)
|
||||
gsub("\"", "", gene)
|
||||
gsub(" ", "_", gene)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/locus_tag=/ {
|
||||
split($0, a, "=")
|
||||
locustag = a[2]
|
||||
gsub("\"", "", locustag)
|
||||
gsub(" ", "_", locustag)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/product=/ {
|
||||
split($0, a, "=")
|
||||
product = a[2]
|
||||
gsub("\"", "", product)
|
||||
gsub(" ", "_", product)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/translation=/ {
|
||||
split($0, a, "=")
|
||||
translation = a[2]
|
||||
gsub("\"", "", translation)
|
||||
gsub(" ", "", translation)
|
||||
next
|
||||
}
|
||||
|
||||
END {
|
||||
if (incds) {
|
||||
print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),
|
||||
nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
|
||||
}
|
||||
}
|
97
detectors/cds/tools/lib/embl.info.awk
Normal file
97
detectors/cds/tools/lib/embl.info.awk
Normal file
@ -0,0 +1,97 @@
|
||||
#
|
||||
# get feature info from embl
|
||||
#
|
||||
|
||||
# @include libgbk.awk
|
||||
|
||||
function GC(s, _local_, i, len) {
|
||||
s = toupper(s)
|
||||
len = length(s)
|
||||
gsub("G|C", "", s)
|
||||
return ((len - length(s)) * 100 / (len ? len : 1))
|
||||
}
|
||||
|
||||
#
|
||||
# rules
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
print "#locus orga len oklen gc nbCds nbCds_int0 nbCds_int1 nbCds_intsup1 perCds_noex meanCdsSize nbtRNA nbrRNA nboRNA"
|
||||
}
|
||||
|
||||
/^ID/ {
|
||||
locus = $2
|
||||
gsub(";", "", locus)
|
||||
next
|
||||
}
|
||||
|
||||
/^OS/ {
|
||||
orga = substr($0, 6)
|
||||
gsub(" ", "_", orga)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT source/ {
|
||||
GetLoc($3, loc);
|
||||
len = loc[2];
|
||||
next
|
||||
}
|
||||
|
||||
/^FT CDS/ {
|
||||
meanCds = meanCds * nbCds + LenLocation($3)
|
||||
nbCds++
|
||||
meanCds /= nbCds
|
||||
n = Nexons($3)
|
||||
if (n > 3) n = 3
|
||||
nbCdx[n]++
|
||||
next
|
||||
}
|
||||
|
||||
/^FT tRNA/ {
|
||||
nbTrna++
|
||||
next
|
||||
}
|
||||
|
||||
/^FT rRNA/ {
|
||||
nbRrna++
|
||||
next
|
||||
}
|
||||
|
||||
/^FT mRNA/ {
|
||||
next
|
||||
}
|
||||
|
||||
/^FT .*RNA/ {
|
||||
nbOrna++
|
||||
next
|
||||
}
|
||||
|
||||
/^SQ / {
|
||||
inseq = 1
|
||||
seq = ""
|
||||
next
|
||||
}
|
||||
|
||||
inseq && /^ / {
|
||||
s = $0
|
||||
gsub("[0-9]+", "", s)
|
||||
gsub(" ", "", s)
|
||||
seq = seq "" s
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
oklen = (len == length(seq) ? "ok" : "wrong")
|
||||
gc = GC(seq)
|
||||
print locus, orga, len, oklen, gc, nbCds+0, nbCdx[1]+0, \
|
||||
nbCdx[2]+0, nbCdx[3]+0, (nbCdx[1]+0)*100/Max(1, nbCds+0), \
|
||||
meanCds+0, nbTrna+0, nbRrna+0, nbOrna+0
|
||||
nbCds = nbTrna = nbRrna = nbOrna = len = inseq = meanCds = 0
|
||||
delete nbCdx
|
||||
orga = locus = "?"
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
97
detectors/cds/tools/lib/embl.intron.awk
Normal file
97
detectors/cds/tools/lib/embl.intron.awk
Normal file
@ -0,0 +1,97 @@
|
||||
#
|
||||
# get intron features from embl
|
||||
#
|
||||
|
||||
# @include libembl.awk
|
||||
|
||||
BEGIN {
|
||||
print "#locus locustag genefam gene from to strand intron_num intron_nb acceptor-donor status"
|
||||
|
||||
if (HEADONLY != "") exit(0)
|
||||
|
||||
if (FASTA == "") Error("No FASTA file specified", 1)
|
||||
|
||||
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
|
||||
|
||||
Seq = tolower(ReadFasta(FASTA))
|
||||
}
|
||||
|
||||
/^ID / {
|
||||
locus = $2
|
||||
gsub(";", "", locus)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT CDS/ {
|
||||
revstrand = match($3, "^complement")
|
||||
s = substr($0, 22)
|
||||
gsub("^complement", "", s)
|
||||
ok = ! match(s, "complement|order")
|
||||
if (! ok) next
|
||||
|
||||
na = ParseLocation(s, locs)
|
||||
if (na < 2) next
|
||||
|
||||
delete SINfo
|
||||
Ninfo = 0
|
||||
|
||||
val = locs[1][1]
|
||||
for (i = 2 ; i <= na ; i++) {
|
||||
if (locs[i][1] < val) ok = 0
|
||||
val = locs[i][1]
|
||||
}
|
||||
if (! ok) next
|
||||
|
||||
val = locs[1][2]
|
||||
for (i = 2 ; i <= na ; i++) {
|
||||
if (locs[i][2] < val) ok = 0
|
||||
val = locs[i][2]
|
||||
}
|
||||
if (! ok) next
|
||||
|
||||
from = locs[1][2] + 1
|
||||
for (i = 2 ; i <= na ; i++) {
|
||||
to = locs[i][1] - 1
|
||||
inseq = SeqLocation(Seq, (from - 4) ".." (to + 4), revstrand)
|
||||
SINfo[++Ninfo] = from " " to " " (revstrand ? "R" : "D") " "\
|
||||
(revstrand ? na-i+1 : i-1) " " na-1 " "\
|
||||
substr(inseq, 1,4) "."\
|
||||
substr(inseq, 5,6) "-"\
|
||||
substr(inseq, length(inseq)-9, 6) "."\
|
||||
substr(inseq, length(inseq)-3, 4) " "\
|
||||
"ok"
|
||||
from = locs[i][2] + 1
|
||||
}
|
||||
|
||||
gene = "none"
|
||||
locustag = "none"
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/gene=/ {
|
||||
split($0, a, "=")
|
||||
gene = a[2]
|
||||
gsub("^[^a-z,A-Z]+", "", gene)
|
||||
gsub("\"", "", gene)
|
||||
gsub(" ", "_", gene)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/locus_tag=/ {
|
||||
split($0, a, "=")
|
||||
locustag = a[2]
|
||||
gsub("\"", "", locustag)
|
||||
gsub(" ", "_", locustag)
|
||||
next
|
||||
}
|
||||
|
||||
/^FT \/translation=/ {
|
||||
for (i = 1 ; i <= Ninfo ; i++)
|
||||
print locus, locustag, GeneFamily(gene), gene, SINfo[i]
|
||||
Ninfo = 0
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
locus = "?"
|
||||
}
|
36
detectors/cds/tools/lib/embl.tofasta.awk
Normal file
36
detectors/cds/tools/lib/embl.tofasta.awk
Normal file
@ -0,0 +1,36 @@
|
||||
#
|
||||
# get fasta sequence from embl
|
||||
#
|
||||
|
||||
/^ID / {
|
||||
locus = $2
|
||||
gsub(";", "", locus)
|
||||
next
|
||||
}
|
||||
|
||||
/^SQ / {
|
||||
inseq = 1
|
||||
nln = 0
|
||||
delete seq
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
inseq = 0
|
||||
print ">" locus
|
||||
for (i = 1 ; i <= nln ; i++)
|
||||
print seq[i]
|
||||
next
|
||||
}
|
||||
|
||||
inseq {
|
||||
s = $0
|
||||
gsub(" ", "", s)
|
||||
gsub("[0-9]+", "", s)
|
||||
seq[++nln] = s
|
||||
next
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
99
detectors/cds/tools/lib/gbk.cds_long.awk
Normal file
99
detectors/cds/tools/lib/gbk.cds_long.awk
Normal file
@ -0,0 +1,99 @@
|
||||
#
|
||||
# get cds features from genbank (long version)
|
||||
#
|
||||
# -v FASTA
|
||||
|
||||
# @include libgbk.awk
|
||||
|
||||
BEGIN {
|
||||
print "#locus locustag genefam gene from to strand nexon length status start stop dnaseq protseq product"
|
||||
|
||||
if (HEADONLY != "") exit(0)
|
||||
|
||||
if (MAXSPAN == "") MAXSPAN = 10000
|
||||
|
||||
if (FASTA == "") Error("No FASTA file specified", 1)
|
||||
|
||||
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
|
||||
|
||||
Seq = tolower(ReadFasta(FASTA))
|
||||
}
|
||||
|
||||
/^LOCUS/ {
|
||||
locus = $2
|
||||
incds = 0
|
||||
next
|
||||
}
|
||||
|
||||
/^ CDS/ {
|
||||
revstrand = match($2, "^complement")
|
||||
s = substr($0, 22)
|
||||
gsub("^complement", "", s)
|
||||
ok = ! match(s, "complement|order")
|
||||
nexon = Nexons(s)
|
||||
SpanLocation(s, sloc)
|
||||
spanlen = sloc[2] - sloc[1] + 1
|
||||
len = LenLocation(s)
|
||||
ok = ok && (len < MAXSPAN)
|
||||
cdsseq = ok ? SeqLocation(Seq, s, revstrand) : "XXX"
|
||||
cstart = substr(cdsseq, 1,3)
|
||||
cstop = substr(cdsseq, length(cdsseq)-2)
|
||||
|
||||
gene = "none"
|
||||
locustag = "none"
|
||||
product = "none"
|
||||
translation = "X"
|
||||
incds = 1
|
||||
next
|
||||
}
|
||||
|
||||
(incds && /^ [^ ]/) {
|
||||
print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),
|
||||
nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
|
||||
incds = 0
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/gene=/ {
|
||||
split($0, a, "=")
|
||||
gene = a[2]
|
||||
gsub("^[^a-z,A-Z]+", "", gene)
|
||||
gsub("\"", "", gene)
|
||||
gsub(" ", "_", gene)
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/locus_tag=/ {
|
||||
split($0, a, "=")
|
||||
locustag = a[2]
|
||||
gsub("\"", "", locustag)
|
||||
gsub(" ", "_", locustag)
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/product=/ {
|
||||
split($0, a, "=")
|
||||
product = a[2]
|
||||
gsub("\"", "", product)
|
||||
gsub(" ", "_", product)
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/translation=/ {
|
||||
split($0, a, "=")
|
||||
translation = a[2]
|
||||
gsub("\"", "", translation)
|
||||
gsub(" ", "", translation)
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
locus = "?"
|
||||
}
|
||||
|
||||
END {
|
||||
if (incds) {
|
||||
print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),
|
||||
nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
|
||||
}
|
||||
}
|
97
detectors/cds/tools/lib/gbk.info.awk
Normal file
97
detectors/cds/tools/lib/gbk.info.awk
Normal file
@ -0,0 +1,97 @@
|
||||
#
|
||||
# get feature info from genbank
|
||||
#
|
||||
|
||||
# @include libgbk.awk
|
||||
|
||||
function GC(s, _local_, i, len) {
|
||||
s = toupper(s)
|
||||
len = length(s)
|
||||
gsub("G|C", "", s)
|
||||
return ((len - length(s)) * 100 / len)
|
||||
}
|
||||
|
||||
#
|
||||
# rules
|
||||
#
|
||||
|
||||
BEGIN {
|
||||
print "#locus orga len oklen gc nbCds nbCds_int0 nbCds_int1 nbCds_intsup1 perCds_noex meanCdsSize nbtRNA nbrRNA nboRNA"
|
||||
}
|
||||
|
||||
/^LOCUS/ {
|
||||
locus = $2
|
||||
next
|
||||
}
|
||||
|
||||
/^ ORGANISM/ {
|
||||
orga = substr($0, 13)
|
||||
split(orga, a, ";")
|
||||
orga = a[1]
|
||||
gsub(" ", "_", orga)
|
||||
next
|
||||
}
|
||||
|
||||
/^ source/ {
|
||||
GetLoc($2, loc);
|
||||
len = loc[2];
|
||||
next
|
||||
}
|
||||
|
||||
/^ CDS/ {
|
||||
meanCds = meanCds * nbCds + LenLocation($2)
|
||||
nbCds++
|
||||
meanCds /= nbCds
|
||||
n = Nexons($2)
|
||||
if (n > 3) n = 3
|
||||
nbCdx[n]++
|
||||
next
|
||||
}
|
||||
|
||||
/^ tRNA/ {
|
||||
nbTrna++
|
||||
next
|
||||
}
|
||||
|
||||
/^ rRNA/ {
|
||||
nbRrna++
|
||||
next
|
||||
}
|
||||
|
||||
/^ mRNA/ {
|
||||
next
|
||||
}
|
||||
|
||||
/^ .*RNA/ {
|
||||
nbOrna++
|
||||
next
|
||||
}
|
||||
|
||||
/^ORIGIN/ {
|
||||
inseq = 1
|
||||
seq = ""
|
||||
next
|
||||
}
|
||||
|
||||
inseq && /^ +[1-9][0-9]*/ {
|
||||
s = substr($0, 11)
|
||||
gsub(" ", "", s)
|
||||
seq = seq "" s
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
oklen = (len == length(seq) ? "ok" : "wrong")
|
||||
gc = GC(seq)
|
||||
print locus, orga, len, oklen, gc, nbCds+0, nbCdx[1]+0, \
|
||||
nbCdx[2]+0, nbCdx[3]+0, (nbCdx[1]+0)*100/Max(1, nbCds+0), \
|
||||
meanCds+0, nbTrna+0, nbRrna+0, nbOrna+0
|
||||
nbCds = nbTrna = nbRrna = nbOrna = len = inseq = meanCds = 0
|
||||
delete nbCdx
|
||||
orga = locus = "?"
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
96
detectors/cds/tools/lib/gbk.intron.awk
Normal file
96
detectors/cds/tools/lib/gbk.intron.awk
Normal file
@ -0,0 +1,96 @@
|
||||
#
|
||||
# get intron features from genbank
|
||||
#
|
||||
|
||||
# @include libgbk.awk
|
||||
|
||||
BEGIN {
|
||||
print "#locus locustag genefam gene from to strand intron_num intron_nb acceptor-donor status"
|
||||
|
||||
if (HEADONLY != "") exit(0)
|
||||
|
||||
if (FASTA == "") Error("No FASTA file specified", 1)
|
||||
|
||||
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
|
||||
|
||||
Seq = tolower(ReadFasta(FASTA))
|
||||
}
|
||||
|
||||
/^LOCUS/ {
|
||||
locus = $2
|
||||
next
|
||||
}
|
||||
|
||||
/^ CDS/ {
|
||||
revstrand = match($2, "^complement")
|
||||
s = substr($0, 22)
|
||||
gsub("^complement", "", s)
|
||||
ok = ! match(s, "complement|order")
|
||||
if (! ok) next
|
||||
|
||||
na = ParseLocation(s, locs)
|
||||
if (na < 2) next
|
||||
|
||||
delete SINfo
|
||||
Ninfo = 0
|
||||
|
||||
val = locs[1][1]
|
||||
for (i = 2 ; i <= na ; i++) {
|
||||
if (locs[i][1] < val) ok = 0
|
||||
val = locs[i][1]
|
||||
}
|
||||
if (! ok) next
|
||||
|
||||
val = locs[1][2]
|
||||
for (i = 2 ; i <= na ; i++) {
|
||||
if (locs[i][2] < val) ok = 0
|
||||
val = locs[i][2]
|
||||
}
|
||||
if (! ok) next
|
||||
|
||||
from = locs[1][2] + 1
|
||||
for (i = 2 ; i <= na ; i++) {
|
||||
to = locs[i][1] - 1
|
||||
inseq = SeqLocation(Seq, (from - 4) ".." (to + 4), revstrand)
|
||||
SINfo[++Ninfo] = from " " to " " (revstrand ? "R" : "D") " "\
|
||||
(revstrand ? na-i+1 : i-1) " " na-1 " "\
|
||||
substr(inseq, 1,4) "."\
|
||||
substr(inseq, 5,6) "-"\
|
||||
substr(inseq, length(inseq)-9, 6) "."\
|
||||
substr(inseq, length(inseq)-3, 4) " "\
|
||||
"ok"
|
||||
from = locs[i][2] + 1
|
||||
}
|
||||
|
||||
gene = "none"
|
||||
locustag = "none"
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/gene=/ {
|
||||
split($0, a, "=")
|
||||
gene = a[2]
|
||||
gsub("^[^a-z,A-Z]+", "", gene)
|
||||
gsub("\"", "", gene)
|
||||
gsub(" ", "_", gene)
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/locus_tag=/ {
|
||||
split($0, a, "=")
|
||||
locustag = a[2]
|
||||
gsub("\"", "", locustag)
|
||||
gsub(" ", "_", locustag)
|
||||
next
|
||||
}
|
||||
|
||||
/^ \/translation=/ {
|
||||
for (i = 1 ; i <= Ninfo ; i++)
|
||||
print locus, locustag, GeneFamily(gene), gene, SINfo[i]
|
||||
Ninfo = 0
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
locus = "?"
|
||||
}
|
32
detectors/cds/tools/lib/gbk.tofasta.awk
Normal file
32
detectors/cds/tools/lib/gbk.tofasta.awk
Normal file
@ -0,0 +1,32 @@
|
||||
#
|
||||
# get fasta sequence from genbank
|
||||
#
|
||||
|
||||
/^LOCUS/ {
|
||||
locus = $2
|
||||
next
|
||||
}
|
||||
|
||||
/^ORIGIN/ {
|
||||
inseq = 1
|
||||
nln = 0
|
||||
delete seq
|
||||
}
|
||||
|
||||
inseq && /^ +[1-9][0-9]*/ {
|
||||
s = substr($0, 11)
|
||||
gsub(" ", "", s)
|
||||
seq[++nln] = s
|
||||
next
|
||||
}
|
||||
|
||||
/^\/\// {
|
||||
print ">" locus
|
||||
for (i = 1 ; i <= nln ; i++)
|
||||
print seq[i]
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
31
detectors/cds/tools/lib/install.rpackages.r
Executable file
31
detectors/cds/tools/lib/install.rpackages.r
Executable file
@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
# check and install required packages
|
||||
#
|
||||
|
||||
out <- function(...) {
|
||||
cat(paste0('+ ', ..., '\n'), file=stderr())
|
||||
}
|
||||
|
||||
installed <- function(package) {
|
||||
package %in% rownames(installed.packages())
|
||||
}
|
||||
|
||||
check <- function(package, repos="http://cran.univ-lyon1.fr") {
|
||||
if (installed(package)) {
|
||||
out("R package ", package, " installed")
|
||||
} else {
|
||||
out("Installing R package ", package, " from ", repos)
|
||||
install.packages(package, repos=repos)
|
||||
}
|
||||
invisible(installed(package))
|
||||
}
|
||||
|
||||
check("grid")
|
||||
check("gridExtra")
|
||||
check("vcd")
|
||||
check("plotrix")
|
||||
check("igraph")
|
||||
|
||||
quit(save='no', status=0)
|
||||
|
224
detectors/cds/tools/lib/make.models.r
Executable file
224
detectors/cds/tools/lib/make.models.r
Executable file
@ -0,0 +1,224 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
# compute start, stop, splice-junctions models for core DB
|
||||
#
|
||||
# source("make.models.r")
|
||||
#
|
||||
|
||||
LIB_DIR <- Sys.getenv("LIB_DIR")
|
||||
if (LIB_DIR == "") LIB_DIR = "."
|
||||
|
||||
source(paste0(LIB_DIR, "/util.base.r"))
|
||||
source(paste0(LIB_DIR, "/util.cons.r"))
|
||||
source(paste0(LIB_DIR, "/util.modelio.r"))
|
||||
|
||||
# -------------------------------
|
||||
# parameters
|
||||
# -------------------------------
|
||||
|
||||
# core cutoffs
|
||||
|
||||
source("db.models.params.txt")
|
||||
|
||||
# -------------------------------
|
||||
# genome infos
|
||||
# -------------------------------
|
||||
|
||||
notify("loading info table")
|
||||
chromo <- read.table("db.info.txt", com="", head=T, stringsAsFactors=F)
|
||||
|
||||
# -------------------------------
|
||||
# CDS
|
||||
# -------------------------------
|
||||
|
||||
notify("loading cds table")
|
||||
cds <- read.table("db.cds.txt", com="", header=T, stringsAsFactors=F)
|
||||
|
||||
cds$start <- as.factor(cds$start)
|
||||
cds$stop <- as.factor(cds$stop)
|
||||
|
||||
cds <- cds[cds$status=="Ok",]
|
||||
cds <- cds[cds$genefam!="none",]
|
||||
|
||||
cds$categ <- "dust"
|
||||
|
||||
x <- sort(table(cds$genefam), dec=T)
|
||||
ok <- names(x[x >= CORE_NCDS_CUTOFF])
|
||||
|
||||
cds$categ[cds$genefam %in% ok] <- "core"
|
||||
|
||||
x <- x[! names(x) %in% ok]
|
||||
ok <- names(x[x >= SHEL_NCDS_CUTOFF])
|
||||
|
||||
cds$categ[cds$genefam %in% ok] <- "shell"
|
||||
|
||||
#
|
||||
|
||||
cds.ori <- cds
|
||||
|
||||
cds.lst <- split(cds.ori, cds.ori$categ)
|
||||
|
||||
#
|
||||
# write out families
|
||||
#
|
||||
|
||||
# patterns & names
|
||||
|
||||
invisible(lapply(cds.lst, function(cds) {
|
||||
|
||||
x <- sort(table(cds$genefam), decreasing=T)
|
||||
tab <- paste0("^", names(x), "$")
|
||||
names(tab) <- names(x)
|
||||
|
||||
y <- sapply(split(cds$gene, cds$genefam), function(g) {
|
||||
head(names(sort(table(g), decreasing=T)), 1)
|
||||
})
|
||||
|
||||
tab <- cbind(tab, y[names(x)])
|
||||
|
||||
categ <- unique(cds$categ)
|
||||
f <- paste0("db.", categ, ".pat.txt")
|
||||
notify("writing patterns for", categ, ":", f)
|
||||
write.table(tab, file=f, quote=F, col.names=F, row.names=T)
|
||||
}))
|
||||
|
||||
# -------------------------------
|
||||
# Start models (core only)
|
||||
# -------------------------------
|
||||
|
||||
if (! "core" %in% names(cds.lst)) {
|
||||
notify("*** no gene found in core")
|
||||
notify("*** please change parameters")
|
||||
quit(save='no', status=1)
|
||||
}
|
||||
|
||||
cds <- cds.lst[["core"]]
|
||||
|
||||
#
|
||||
# start by genes
|
||||
#
|
||||
|
||||
tab <- split(cds$start, cds$genefam)
|
||||
|
||||
fatg <- sapply(tab, function(x) table(x)["atg"]/length(x)*100)
|
||||
names(fatg) <- names(tab)
|
||||
|
||||
start.dft <- names(which(fatg >= CORE_START_ATG_CUTOFF))
|
||||
start.spc <- names(which(fatg < CORE_START_ATG_CUTOFF))
|
||||
|
||||
tab <- cds[cds$genefam %in% start.dft,]
|
||||
tab <- table(tab$start)
|
||||
|
||||
# default model
|
||||
|
||||
x <- sort(tab[tab>=CORE_START_DFT_CUTOFF], decreasing=T)
|
||||
write.model.start(x, "default")
|
||||
|
||||
# gene specific models
|
||||
|
||||
invisible(sapply(start.spc, function(g) {
|
||||
x <- cds[cds$genefam == g,]
|
||||
tx <- table(x$start)
|
||||
tx <- sort(tx[tx>=CORE_START_OTH_CUTOFF], decreasing=T)
|
||||
write.model.start(tx, g)
|
||||
}))
|
||||
|
||||
# -------------------------------
|
||||
# Stop models (core only)
|
||||
# -------------------------------
|
||||
|
||||
# write default stop model
|
||||
|
||||
tab <- table(cds$stop)
|
||||
x <- sort(tab[tab>=CORE_STOP_CUTOFF], decreasing=T)
|
||||
write.model.stop(x, "default")
|
||||
|
||||
# -------------------------------
|
||||
# splice junctions
|
||||
# -------------------------------
|
||||
|
||||
notify("loading intron table")
|
||||
intron <- read.table("db.intron.txt", com="", header=T, stringsAsFactors=F)
|
||||
|
||||
# remove invalid sequences
|
||||
|
||||
intron$seq <- gsub("\\.|-", "", intron$acceptor.donor)
|
||||
|
||||
lseq <- nchar(gsub("[^acgt]", "", intron$seq))
|
||||
|
||||
intron <- intron[lseq == 20,]
|
||||
|
||||
# remove genes out of core
|
||||
|
||||
intron <- intron[intron$genefam %in% cds$genefam,]
|
||||
|
||||
# acceptors / donors
|
||||
|
||||
intron$acc <- substr(intron$seq, 5, 6)
|
||||
intron$don <- substr(intron$seq, 15, 16)
|
||||
|
||||
# consensus
|
||||
|
||||
cons.px <- cons.build(intron$acceptor.donor)
|
||||
cons.px <- cons.px[,! is.nan(colSums(cons.px))]
|
||||
|
||||
seq.px <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))
|
||||
|
||||
conf.px <- cons.confusion(cons.px, seq.px)
|
||||
|
||||
sfam <- split(conf.px$l2scor, intron$genefam)
|
||||
sfam <- sfam[order(sapply(sfam, median))]
|
||||
|
||||
# extract splice exceptions
|
||||
|
||||
name.bad <- names(which(sapply(sfam, median) < 0))
|
||||
name.spc <- names(which(sapply(sfam[name.bad], length) >= CORE_SPLICE_CUTOFF))
|
||||
name.ok <- setdiff(unique(intron$genefam), name.bad)
|
||||
name.bad <- setdiff(name.bad, name.spc)
|
||||
name.list <- c(sapply(name.spc, function(x) x), list(default=name.ok))
|
||||
|
||||
cons <- lapply(name.list, function(x) cons.build(intron[intron$genefam %in% x, "acceptor.donor"]))
|
||||
|
||||
# write junction models
|
||||
|
||||
invisible(sapply(names(cons), function(n) write.model.splice3(cons[[n]], n)))
|
||||
invisible(sapply(names(cons), function(n) write.model.splice5(cons[[n]], n)))
|
||||
|
||||
# use uniform model for bad guys
|
||||
|
||||
invisible(sapply(name.bad, function(n) write.unif.splice(3, n)))
|
||||
invisible(sapply(name.bad, function(n) write.unif.splice(5, n)))
|
||||
|
||||
invisible(write.unif.splice('', "none"))
|
||||
|
||||
# -------------------------------
|
||||
# keep data for plotting
|
||||
# -------------------------------
|
||||
|
||||
DB <- list()
|
||||
|
||||
params <- list()
|
||||
|
||||
params$CORE_NCDS_CUTOFF <- CORE_NCDS_CUTOFF
|
||||
params$CORE_START_ATG_CUTOFF <- CORE_START_ATG_CUTOFF
|
||||
params$CORE_START_DFT_CUTOFF <- CORE_START_DFT_CUTOFF
|
||||
params$CORE_START_OTH_CUTOFF <- CORE_START_OTH_CUTOFF
|
||||
params$CORE_STOP_CUTOFF <- CORE_STOP_CUTOFF
|
||||
params$CORE_SPLICE_CUTOFF <- CORE_SPLICE_CUTOFF
|
||||
|
||||
params$SHEL_NCDS_CUTOFF <- SHEL_NCDS_CUTOFF
|
||||
|
||||
DB$params <- params
|
||||
DB$chromo <- chromo
|
||||
DB$cds.lst <- cds.lst
|
||||
DB$intron <- intron
|
||||
DB$cons <- cons
|
||||
|
||||
notify("saving db.data.Rdata")
|
||||
save(DB, file="db.data.Rdata")
|
||||
|
||||
# -------------------------------
|
||||
# end
|
||||
# -------------------------------
|
||||
|
||||
quit(save='no')
|
424
detectors/cds/tools/lib/plot.models.r
Executable file
424
detectors/cds/tools/lib/plot.models.r
Executable file
@ -0,0 +1,424 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
# plots models previously computed by make.models.r
|
||||
#
|
||||
# source("plot.models.r")
|
||||
#
|
||||
|
||||
require(vcd)
|
||||
require(plotrix)
|
||||
|
||||
LIBDIR <- Sys.getenv("LIB_DIR")
|
||||
if (LIBDIR == "") LIBDIR = "."
|
||||
|
||||
source(paste0(LIBDIR, "/util.base.r"))
|
||||
source(paste0(LIBDIR, "/util.plot.r"))
|
||||
source(paste0(LIBDIR, "/util.cons.r"))
|
||||
source(paste0(LIBDIR, "/util.grid.r"))
|
||||
|
||||
# -------------------------------
|
||||
# setup
|
||||
# -------------------------------
|
||||
|
||||
OUT.DEV <- TRUE
|
||||
OUT.TYPE <- "pdf"
|
||||
OUT.FILE <- "models"
|
||||
|
||||
if (OUT.DEV) uplot.init.dev(OUT.FILE, OUT.TYPE)
|
||||
|
||||
# -------------------------------
|
||||
# Load data
|
||||
# -------------------------------
|
||||
|
||||
notify("loading DB data")
|
||||
load("db.data.Rdata")
|
||||
|
||||
params <- DB$params
|
||||
chromo <- DB$chromo
|
||||
cds.lst <- DB$cds.lst
|
||||
intron <- DB$intron
|
||||
cons <- DB$cons
|
||||
|
||||
# -------------------------------
|
||||
# Genomes infos
|
||||
# -------------------------------
|
||||
|
||||
grd.titlepage("Species")
|
||||
grd.textpage(lineno=1, "# org: ", nrow(chromo))
|
||||
|
||||
#
|
||||
# general stats
|
||||
#
|
||||
|
||||
grd.hist(chromo, "len", main="Histogram of chromosome length")
|
||||
grd.hist(chromo, "gc", pos.quant=c(0.75, 0.6), main="Histogram of chromosome GC")
|
||||
grd.hist(chromo, "nbCds")
|
||||
grd.fplot(chromo, "len", "nbCds")
|
||||
|
||||
#
|
||||
# nb cds no introns
|
||||
#
|
||||
|
||||
chromo$nbCds_Mono <- chromo$nbCds_int0
|
||||
chromo$nbCds_Poly <- chromo$nbCds_int1 + chromo$nbCds_intsup1
|
||||
chromo$percentPoly <- round(chromo$nbCds_Poly * 100 / (chromo$nbCds_Poly + chromo$nbCds_Mono))
|
||||
|
||||
grd.hist(chromo, "nbCds_Mono", main="Histogram of monoexonic Cds")
|
||||
grd.hist(chromo, "nbCds_Poly", main="Histogram of polyexonic Cds")
|
||||
grd.hist(chromo, "percentPoly", pos.sum=c(0.23,0.6), main="Histogram of % polyexonic")
|
||||
|
||||
grd.fplot(chromo, "nbCds", "nbCds_Mono", TRUE, ablin=list(a=0, b=1, col=3))
|
||||
grd.fplot(chromo, "nbCds", "nbCds_Poly")
|
||||
|
||||
#
|
||||
# cds size
|
||||
#
|
||||
|
||||
grd.hist(chromo, "meanCdsSize", pos.quant=NULL, main="Histogram of Cds size")
|
||||
|
||||
# -------------------------------
|
||||
# CDS
|
||||
# -------------------------------
|
||||
|
||||
cds.all <- do.call(rbind, cds.lst)
|
||||
|
||||
grd.titlepage("CDS")
|
||||
|
||||
grd.textpage(lineno=1, "# core cds core cutoff: ", params$CORE_NCDS_CUTOFF)
|
||||
grd.textpage(lineno=2, "# core cds shell cutoff: ", params$SHEL_NCDS_CUTOFF)
|
||||
|
||||
grd.textpage(lineno=4, "# total cds: ", nrow(cds.all))
|
||||
grd.textpage(lineno=5, "# core cds: ", nrow(cds.lst[["core"]]))
|
||||
grd.textpage(lineno=6, "# shell cds: ", nrow(cds.lst[["shell"]]))
|
||||
grd.textpage(lineno=7, "# dust cds: ", nrow(cds.lst[["dust"]]))
|
||||
|
||||
grd.textpage(lineno=9, "# total org: ", length(unique(cds.all$X.locus)))
|
||||
grd.textpage(lineno=10, "# core org: ", length(unique(cds.lst[["core"]]$X.locus)))
|
||||
grd.textpage(lineno=11, "# shell org: ", length(unique(cds.lst[["shell"]]$X.locus)))
|
||||
grd.textpage(lineno=12, "# dust org: ", length(unique(cds.lst[["dust"]]$X.locus)))
|
||||
|
||||
|
||||
grd.textpage(lineno=14, "# total families: ", length(unique(cds.all$genefam)))
|
||||
grd.textpage(lineno=15, "# core families: ", length(unique(cds.lst[["core"]]$genefam)))
|
||||
grd.textpage(lineno=16, "# shell families: ", length(unique(cds.lst[["shell"]]$genefam)))
|
||||
grd.textpage(lineno=17, "# dust families: ", length(unique(cds.lst[["dust"]]$genefam)))
|
||||
|
||||
uplot.setup(mfrow=c(2,2), xpd=NA)
|
||||
|
||||
x <- sapply(cds.lst, nrow)
|
||||
uplot.pie(x, main="CDS", text.r=1.1, col=c(3,2,4))
|
||||
|
||||
x <- sapply(cds.lst, function(cds) length(unique(cds$X.locus)))
|
||||
uplot.pie(x, main="ORG", text.r=1.1, col=c(3,2,4))
|
||||
|
||||
x <- sapply(cds.lst, function(cds) length(unique(cds$genefam)))
|
||||
uplot.pie(x, main="FAM", text.r=1.1, col=c(3,2,4))
|
||||
|
||||
uplot.setup(xpd=F)
|
||||
|
||||
#
|
||||
# plot genes cutoff
|
||||
#
|
||||
|
||||
cds.all <- do.call(rbind, cds.lst)
|
||||
cds.byfam <- split(cds.all, cds.all$genefam)
|
||||
|
||||
tab <- sort(sapply(cds.byfam, nrow), decreasing=T)
|
||||
cols <- rep("red", length(tab))
|
||||
cols[tab >= params$SHEL_NCDS_CUTOFF] <- "blue"
|
||||
cols[tab >= params$CORE_NCDS_CUTOFF] <- "green"
|
||||
barplot(tab, col=cols, border=NA, main="# genes")
|
||||
|
||||
cols <- cols[tab >= 50]
|
||||
tab <- tab[tab >= 50]
|
||||
barplot(tab, col=cols, border=NA, las=2, cex.names=0.5, main="# genes in core")
|
||||
abline(h=params$CORE_NCDS_CUTOFF, col=1)
|
||||
text(50, 200, "CORE_NCDS_CUTOFF", pos=3)
|
||||
|
||||
#
|
||||
# cds length for core
|
||||
#
|
||||
|
||||
invisible(sapply(c("core", "shell", "dust"), function(what) {
|
||||
|
||||
cds <- cds.lst[[what]]
|
||||
|
||||
x <- split(cds$length, cds$genefam)
|
||||
x <- x[order(sapply(x, mean))]
|
||||
|
||||
uplot.setup(mfrow=c(2,1))
|
||||
|
||||
boxplot(x, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1),
|
||||
las=2, cex.axis=0.5, main=paste0(what, " genes - length distribution"))
|
||||
|
||||
boxplot(x, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1), ylim=c(0,2000),
|
||||
las=2, cex.axis=0.5, main=paste0(what, " genes - length distribution zoom"))
|
||||
|
||||
uplot.setup()
|
||||
}))
|
||||
|
||||
# -------------------------------
|
||||
# starts & stops
|
||||
# -------------------------------
|
||||
|
||||
cds <- cds.lst[["core"]]
|
||||
|
||||
grd.titlepage("Starts and Stops")
|
||||
|
||||
tab <- sort(table(cds$start), dec=T)
|
||||
tab <- tab[tab >= 100]
|
||||
tab <- tab / sum(tab) * 100
|
||||
|
||||
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="start frequencies (%)")
|
||||
text(0.7, 50, round(tab[1], 2))
|
||||
text(1.9, 2.3, round(tab[2], 2))
|
||||
text(3.1, 2.3, round(tab[3], 2))
|
||||
|
||||
#
|
||||
# start by org and gc
|
||||
#
|
||||
|
||||
x <- split(cds$start, cds$X.locus)
|
||||
|
||||
fatg <- sapply(x, function(x) table(x)["atg"]/length(x)*100)
|
||||
names(fatg) <- names(x)
|
||||
chromo$fatg <- round(fatg[chromo$X.locus], 2)
|
||||
|
||||
fgtg <- sapply(x, function(x) table(x)["gtg"]/length(x)*100)
|
||||
names(fgtg) <- names(x)
|
||||
chromo$fgtg <- round(fgtg[chromo$X.locus], 2)
|
||||
|
||||
facg <- sapply(x, function(x) table(x)["acg"]/length(x)*100)
|
||||
names(facg) <- names(x)
|
||||
chromo$facg <- round(facg[chromo$X.locus], 2)
|
||||
|
||||
grd.hist(chromo, "fatg", pos.quant=c(0.5, 0.6), main="Histogram of atg freq. by org")
|
||||
grd.hist(chromo, "fgtg", main="Histogram of gtg freq. by org")
|
||||
grd.hist(chromo, "facg", pos.sum=c(0.3, 0.6), main="Histogram of acg freq. by org")
|
||||
|
||||
grd.fplot(chromo, "gc", "fatg", main="atg freq. by org GC", pos=c(0.2, 0.3))
|
||||
grd.fplot(chromo, "gc", "fgtg", main="gtg freq. by org GC")
|
||||
grd.fplot(chromo, "gc", "facg", main="acg freq. by org GC")
|
||||
|
||||
ter <- cbind(fatg, fgtg, facg)
|
||||
colnames(ter) <- c("ATG", "GTG", "ACG")
|
||||
igc <- cut(chromo$gc, breaks=quantile(chromo$gc, seq(0, 1, 0.1)), include.lowest=T, labels=1:10)
|
||||
cols <- rainbow(10)[igc]
|
||||
ternaryplot(ter, col=cols, cex=0.2, main="Start by org", labels="outside")
|
||||
|
||||
#
|
||||
# start by common genes
|
||||
#
|
||||
|
||||
x <- split(cds$start, cds$genefam)
|
||||
|
||||
fatg <- sapply(x, function(x) table(x)["atg"]/length(x)*100)
|
||||
names(fatg) <- names(x)
|
||||
|
||||
fgtg <- sapply(x, function(x) table(x)["gtg"]/length(x)*100)
|
||||
names(fgtg) <- names(x)
|
||||
|
||||
facg <- sapply(x, function(x) table(x)["acg"]/length(x)*100)
|
||||
names(facg) <- names(x)
|
||||
|
||||
barplot(sort(fatg)[1:10], las=2, main="atg freq. by gene")
|
||||
barplot(sort(fgtg, dec=T)[1:10], las=2, main="gtg freq. by gene")
|
||||
barplot(sort(facg, dec=T)[1:10], las=2, main="acg freq. by gene")
|
||||
|
||||
ter <- cbind(fatg, fgtg, facg)
|
||||
colnames(ter) <- c("ATG", "GTG", "ACG")
|
||||
ternaryplot(ter, col=1, cex=0.5, id=rownames(ter), main="Starts by genes", labels="outside")
|
||||
|
||||
# -------------------------------
|
||||
# stops
|
||||
# -------------------------------
|
||||
|
||||
#
|
||||
# stop by org and gc
|
||||
#
|
||||
|
||||
tab <- sort(table(cds$stop), dec=T)
|
||||
tab <- tab[tab >= 100]
|
||||
tab <- tab / sum(tab) * 100
|
||||
|
||||
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="stop frequencies (%)")
|
||||
text(0.7, 80, round(tab[1], 2))
|
||||
text(1.9, 30, round(tab[2], 2))
|
||||
text(3.1, 25, round(tab[3], 2))
|
||||
|
||||
x <- split(cds$stop, cds$X.locus)
|
||||
|
||||
ftaa <- sapply(x, function(x) table(x)["taa"]/length(x)*100)
|
||||
names(ftaa) <- names(x)
|
||||
chromo$ftaa <- round(ftaa[chromo$X.locus], 2)
|
||||
|
||||
ftag <- sapply(x, function(x) table(x)["tag"]/length(x)*100)
|
||||
names(ftag) <- names(x)
|
||||
chromo$ftag <- round(ftag[chromo$X.locus], 2)
|
||||
|
||||
ftga <- sapply(x, function(x) table(x)["tga"]/length(x)*100)
|
||||
names(ftga) <- names(x)
|
||||
chromo$ftga <- round(ftga[chromo$X.locus], 2)
|
||||
|
||||
grd.hist(chromo, "ftaa", pos.quant=c(0.7, 0.6), main="Histogram of taa freq. by org")
|
||||
grd.hist(chromo, "ftag", pos.quant=c(0.8, 0.6), main="Histogram of tag freq. by org")
|
||||
grd.hist(chromo, "ftga", pos.quant=c(0.8, 0.6), main="Histogram of tga freq. by org")
|
||||
|
||||
grd.fplot(chromo, "gc", "ftaa", main="taa freq. by org GC", pos=c(0.2, 0.3))
|
||||
grd.fplot(chromo, "gc", "ftag", main="tag freq. by org GC")
|
||||
grd.fplot(chromo, "gc", "ftga", main="tga freq. by org GC")
|
||||
|
||||
ter <- cbind(ftaa, ftag, ftga)
|
||||
colnames(ter) <- c("TAA", "TAG", "TGA")
|
||||
igc <- cut(chromo$gc, breaks=quantile(chromo$gc, seq(0, 1, 0.1)), include.lowest=T, labels=1:10)
|
||||
cols <- rainbow(10)[igc]
|
||||
ternaryplot(ter, col=cols, cex=0.2, main="Stops by org", labels="outside")
|
||||
|
||||
#
|
||||
# stop by common genes
|
||||
#
|
||||
|
||||
x <- split(cds$stop, cds$genefam)
|
||||
|
||||
ftaa <- sapply(x, function(x) table(x)["taa"]/length(x)*100)
|
||||
names(ftaa) <- names(x)
|
||||
|
||||
ftag <- sapply(x, function(x) table(x)["tag"]/length(x)*100)
|
||||
names(ftag) <- names(x)
|
||||
|
||||
ftga <- sapply(x, function(x) table(x)["tga"]/length(x)*100)
|
||||
names(ftga) <- names(x)
|
||||
|
||||
barplot(sort(ftaa), las=2, cex.names=0.5, ylim=c(0,100), main="taa freq. by gene")
|
||||
barplot(sort(ftag), las=2, cex.names=0.5, ylim=c(0,100), main="tag freq. by gene")
|
||||
barplot(sort(ftga), las=2, cex.names=0.5, ylim=c(0,100), main="tga freq. by gene")
|
||||
|
||||
ter <- cbind(ftaa, ftag, ftga)
|
||||
colnames(ter) <- c("TAA", "TAG", "TGA")
|
||||
ternaryplot(ter, col=1, cex=0.3, id=rownames(ter), main="Stops by genes", labels="outside")
|
||||
|
||||
# -------------------------------
|
||||
# splice junctions
|
||||
# -------------------------------
|
||||
|
||||
grd.titlepage("Splice Junctions")
|
||||
|
||||
grd.textpage(lineno=1, "# intron in core: ", nrow(intron))
|
||||
|
||||
#
|
||||
# intron size
|
||||
#
|
||||
|
||||
intron$size <- intron$to - intron$from + 1
|
||||
|
||||
grd.hist(intron, "size", pos.quant=NULL, main="Histogram of intron size", br=1000, xlim=c(0,2000))
|
||||
|
||||
#
|
||||
# nb intron / gene
|
||||
#
|
||||
|
||||
x <- split(intron, intron$genefam)
|
||||
x <- x[order(sapply(x, function(x) mean(x$intron_nb)), decreasing=T)]
|
||||
|
||||
nmax <- max(intron$intron_nb)
|
||||
lintron <- lapply(x, function(x) x$intron_nb)
|
||||
mintron <- sapply(lintron, function(x) table(factor(x, levels=1:nmax)))
|
||||
|
||||
lintron0 <- table(cds[cds$nexon == 1,"genefam"])[names(lintron)]
|
||||
mintron <- rbind("0"=lintron0, mintron)
|
||||
mintron <- t(t(mintron)/colSums(mintron))
|
||||
|
||||
mintron[mintron==0] <- NA
|
||||
|
||||
nn <- nrow(mintron)
|
||||
xx <- mintron[nn:1,]
|
||||
ll <- lapply(1:nn, function(i) xx[i,])
|
||||
mintron <- mintron[,do.call(order, c(ll, decreasing=T))]
|
||||
|
||||
battleship.plot(mintron, maxxspan=0.3, maxyspan=0.3,
|
||||
cex.labels=0.7,
|
||||
main="% intron per polyexonic gene")
|
||||
|
||||
#
|
||||
# acceptors / donors
|
||||
#
|
||||
|
||||
tab <- sort(table(intron$acc), dec=T)
|
||||
tab <- tab[tab >= 100]
|
||||
tab <- tab / sum(tab) * 100
|
||||
|
||||
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="acceptor frequencies (%)")
|
||||
text(0.7, 50, round(tab[1], 2))
|
||||
text(1.9, 3, round(tab[2], 2))
|
||||
text(3.1, 2.3, round(tab[3], 2))
|
||||
|
||||
tab <- sort(table(intron$don), dec=T)
|
||||
tab <- tab[tab >= 100]
|
||||
tab <- tab / sum(tab) * 100
|
||||
|
||||
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="donor frequencies (%)")
|
||||
text(0.7, 50, round(tab[1], 1))
|
||||
text(1.9, 40, round(tab[2], 1))
|
||||
text(3.1, 15, round(tab[3], 1))
|
||||
text(4.3, 12, round(tab[4], 1))
|
||||
|
||||
#
|
||||
# consensus all
|
||||
#
|
||||
|
||||
cons$all <- cons.build(intron$acceptor.donor)
|
||||
|
||||
invisible(sapply(rev(names(cons)), function(what) {
|
||||
cons.plot(cons[[what]], paste0("consensus ", what))
|
||||
}))
|
||||
|
||||
#
|
||||
# default consensus score by consensus length
|
||||
#
|
||||
|
||||
cons.def <- cons[["default"]]
|
||||
cons.def <- cons.def[,! is.nan(colSums(cons.def))]
|
||||
seq.def <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))
|
||||
|
||||
epx <- apply(cons.def, 2, function(col) -sum(col * log(col, base=4)))
|
||||
opx <- order(epx)
|
||||
|
||||
conf.def <- lapply(seq(2, length(opx), by=2), function(n) {
|
||||
pos <- head(opx, n)
|
||||
notify(n, "/", length(opx))
|
||||
cons.confusion(cons.def, seq.def, thresh=0, pos=pos)
|
||||
})
|
||||
|
||||
acc <- sapply(conf.def, function(x) x$acc)
|
||||
sen <- sapply(conf.def, function(x) x$sen)
|
||||
sel <- sapply(conf.def, function(x) x$sel)
|
||||
|
||||
plot(sel, ylim=c(0.7, 1), pch=1, type="b", main="accuracy by nb consensus positions", ylab="")
|
||||
lines(sen, type="b", pch=2)
|
||||
lines(acc, type="b", pch=3)
|
||||
legend(1, 0.95, c("sensit.", "select.", "accur."), pch=1:3, horiz=T, bty="n")
|
||||
|
||||
#
|
||||
# default consensus score by genes
|
||||
#
|
||||
|
||||
conf.def <- cons.confusion(cons.def, seq.def, thresh=0)
|
||||
cons.histconf(conf.def)
|
||||
|
||||
sfam <- split(conf.def$l2scor, intron$genefam)
|
||||
sfam <- sfam[order(sapply(sfam, median))]
|
||||
|
||||
boxplot(sfam, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1),
|
||||
las=2, cex.axis=0.7, main="default junction logr score by genes")
|
||||
abline(h=0)
|
||||
|
||||
#
|
||||
# end
|
||||
#
|
||||
|
||||
if (OUT.DEV) {
|
||||
cat("+ plot file:", paste0(OUT.FILE, ".", OUT.TYPE), "\n")
|
||||
invisible(dev.off())
|
||||
}
|
||||
|
||||
quit(save='no')
|
71
detectors/cds/tools/lib/summarize_cmp.r
Executable file
71
detectors/cds/tools/lib/summarize_cmp.r
Executable file
@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
# plot summary graphics of comparisons
|
||||
#
|
||||
#
|
||||
|
||||
LIBDIR <- Sys.getenv("LIB_DIR")
|
||||
if (LIBDIR == "") LIBDIR = "."
|
||||
|
||||
source(paste0(LIBDIR, "/util.plot.r"))
|
||||
|
||||
COLORS <- 2:10
|
||||
|
||||
#
|
||||
|
||||
OUT.DEV <- TRUE
|
||||
OUT.TYPE <- "pdf"
|
||||
OUT.FILE <- "compare"
|
||||
if (OUT.DEV) uplot.init.dev(OUT.FILE, OUT.TYPE)
|
||||
|
||||
#
|
||||
|
||||
tab <- read.table("compare.txt", header=T, comment.char="", stringsAsFactors=F)
|
||||
|
||||
#
|
||||
|
||||
par(xpd=NA)
|
||||
|
||||
#
|
||||
|
||||
sel <- c("cor", "alcor", "acc", "wrong", "over", "misstot")
|
||||
tab$ptot <- rowSums(tab[,sel])
|
||||
for (s in sel)
|
||||
tab[,paste0("p", s)] <- tab[,s] * 100 / tab$ptot
|
||||
|
||||
colors <- head(COLORS, length(sel))
|
||||
|
||||
cols <- paste0("p", sel)
|
||||
ord <- order(tab$pcor+tab$palcor+tab$pacc, decreasing=T)
|
||||
|
||||
barplot(t(tab[ord,cols]), names.arg=tab$X.org[ord],
|
||||
ylim=c(0,100), col=colors, las=2, cex.names=0.5)
|
||||
|
||||
legend(0, 110, sel, fill=colors, cex=0.7, horiz=T)
|
||||
|
||||
#
|
||||
|
||||
sel <- c("cor", "alcor", "acc", "wrong", "over", "misschlo")
|
||||
tab$rtot <- rowSums(tab[,sel])
|
||||
for (s in sel)
|
||||
tab[,paste0("r", s)] <- tab[,s] * 100 / tab$rtot
|
||||
|
||||
colors <- head(COLORS, length(sel))
|
||||
|
||||
cols <- paste0("r", sel)
|
||||
ord <- order(tab$rcor+tab$ralcor+tab$racc, decreasing=T)
|
||||
|
||||
barplot(t(tab[ord,cols]), names.arg=tab$X.org[ord],
|
||||
ylim=c(0,100), col=colors, las=2, cex.names=0.5)
|
||||
|
||||
legend(0, 110, sel, fill=colors, cex=0.7, horiz=T)
|
||||
|
||||
#
|
||||
|
||||
if (OUT.DEV) {
|
||||
cat("# plot file:", paste0(OUT.FILE, ".", OUT.TYPE), "\n")
|
||||
invisible(dev.off())
|
||||
}
|
||||
|
||||
quit(save='no')
|
||||
|
54
detectors/cds/tools/lib/summary.cmp.awk
Normal file
54
detectors/cds/tools/lib/summary.cmp.awk
Normal file
@ -0,0 +1,54 @@
|
||||
#
|
||||
#
|
||||
|
||||
function getOrg(s, _local_, a, na, org) {
|
||||
na = split(s, a, "/")
|
||||
na = split(a[na], a, "\\.")
|
||||
return a[1]
|
||||
}
|
||||
|
||||
|
||||
BEGIN {
|
||||
PROCINFO["sorted_in"] = "@ind_num_asc"
|
||||
print "#org tot cor alcor acc wrong over misstot misschlo missoth"
|
||||
}
|
||||
|
||||
/MISSED in ChloroDB/ {
|
||||
org = getOrg($1)
|
||||
Org[org]++
|
||||
Cnt[org]["MISSCHLORO"] = $2
|
||||
next
|
||||
}
|
||||
|
||||
/MISSED not in ChloroDB/ {
|
||||
org = getOrg($1)
|
||||
Org[org]++
|
||||
Cnt[org]["MISSNOTCHLORO"] = $2
|
||||
next
|
||||
}
|
||||
|
||||
/^#/ { next }
|
||||
|
||||
/^.*:MATCH/ {
|
||||
org = getOrg($1)
|
||||
Org[org]++
|
||||
split($NF, a, "\\.")
|
||||
Cnt[org][a[1]]++
|
||||
}
|
||||
|
||||
END {
|
||||
for (org in Org) {
|
||||
Cnt[org]["TOTAL"] = Cnt[org]["CORRECT"] + Cnt[org]["ALMOST_CORRECT"] \
|
||||
+ Cnt[org]["ACCEPTABLE"] + Cnt[org]["WRONG"] \
|
||||
+ Cnt[org]["MISSED"]
|
||||
}
|
||||
for (org in Org) {
|
||||
print org, Cnt[org]["TOTAL"]+0, Cnt[org]["CORRECT"]+0, \
|
||||
Cnt[org]["ALMOST_CORRECT"]+0, Cnt[org]["ACCEPTABLE"]+0, \
|
||||
Cnt[org]["WRONG"]+0, Cnt[org]["OVERPRED"]+0, \
|
||||
Cnt[org]["MISSED"]+0, \
|
||||
Cnt[org]["MISSCHLORO"]+0, Cnt[org]["MISSNOTCHLORO"]+0
|
||||
|
||||
}
|
||||
|
||||
}
|
12
detectors/cds/tools/lib/util.base.r
Normal file
12
detectors/cds/tools/lib/util.base.r
Normal file
@ -0,0 +1,12 @@
|
||||
#
|
||||
# R basic utilities
|
||||
#
|
||||
|
||||
#
|
||||
# notify on stderr
|
||||
#
|
||||
|
||||
notify <- function(...) cat("+", ..., "\n")
|
||||
|
||||
|
||||
|
109
detectors/cds/tools/lib/util.cons.r
Normal file
109
detectors/cds/tools/lib/util.cons.r
Normal file
@ -0,0 +1,109 @@
|
||||
#
|
||||
# R consensus utilities
|
||||
#
|
||||
|
||||
#
|
||||
# compute consensus
|
||||
#
|
||||
|
||||
cons.build <- function(seqs, backcount=1) {
|
||||
xx <- do.call(rbind, sapply(seqs, strsplit, "", USE.NAMES=F))
|
||||
lv <- c("a", "c", "g", "t", ".", "-")
|
||||
mx <- apply(xx, 2, function(x) table(factor(x, levels=lv)))[1:4,]
|
||||
cx <- colSums(mx)
|
||||
mx <- mx + backcount
|
||||
mx[,cx==0] <- 0
|
||||
apply(mx, 2, function(x) x / sum(x))
|
||||
}
|
||||
|
||||
#
|
||||
# score consensus
|
||||
#
|
||||
|
||||
cons.score <- function(cons, seq, pos=1:ncol(cons)) {
|
||||
seq <- strsplit(seq, "")[[1]]
|
||||
if (length(seq) != ncol(cons)) {
|
||||
warning("incompatible seq and cons size")
|
||||
return(NA)
|
||||
}
|
||||
ppx <- sapply(pos, function(i) cons[seq[i],i])
|
||||
sum(log10(ppx+1e-6))
|
||||
}
|
||||
|
||||
#
|
||||
# logratio to uniform model score
|
||||
#
|
||||
|
||||
cons.logratio <- function(cons, seq, m0=NULL, pos=1:ncol(cons)) {
|
||||
if (is.null(m0)) {
|
||||
m0 <- matrix(rep(0.25, 4), nrow=4, ncol=ncol(cons))
|
||||
rownames(m0) <- c('a', 'c', 'g', 't')
|
||||
}
|
||||
|
||||
sc <- cons.score(cons, seq, pos=pos)
|
||||
sc0 <- cons.score(m0, seq, pos=pos)
|
||||
|
||||
2 * (log(10^sc, base=2) - log(10^sc0, base=2))
|
||||
}
|
||||
|
||||
#
|
||||
# shuffle sequence
|
||||
#
|
||||
|
||||
seq.shuf <- function(seq) {
|
||||
paste0(sample(strsplit(seq, "")[[1]], nchar(seq), replace=F), collapse="")
|
||||
}
|
||||
|
||||
#
|
||||
# compute confusion matrix between actual and shuffled sequences
|
||||
#
|
||||
|
||||
cons.confusion <- function(cons, seq, m0=NULL, pos=1:ncol(cons), thresh=0) {
|
||||
som <- function(x) sum(x, na.rm=T)
|
||||
|
||||
res <- list()
|
||||
res$l2scor <- l2scor <- sapply(seq, function(s) cons.logratio(cons, s, m0=m0, pos=pos))
|
||||
|
||||
seq <- sapply(seq, seq.shuf)
|
||||
res$r2scor <- r2scor <- sapply(seq, function(s) cons.logratio(cons, s, m0=m0, pos=pos))
|
||||
|
||||
res$conf <- conf <- matrix(c(som(l2scor >= thresh), som(l2scor < thresh),
|
||||
som(r2scor >= thresh), som(r2scor < thresh)),
|
||||
nrow=2, byrow=T)
|
||||
|
||||
res$acc <- sum(diag(conf)) / sum(conf)
|
||||
res$sen <- conf[1,1] / sum(conf[1,])
|
||||
res$sel <- conf[1,1] / sum(conf[,1])
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
#
|
||||
# plot consensus
|
||||
#
|
||||
|
||||
|
||||
cons.plot <- function(cons, main="consensus") {
|
||||
cols <- c("blue", "orange", "red", "green")
|
||||
bp <- barplot(cons, col=cols, ylim=c(0,1), main=main)
|
||||
plx <- apply(cons, 2, function(col) -sum(col * log(col+1e-6, base=4)))
|
||||
lines(bp, plx, type="b", pch=19)
|
||||
legend(0, 1.1, c("a","c","g","t"), fill=cols, horiz=T, xpd=NA, bty="n")
|
||||
legend(20, 1.1, "entropy", pch=19, horiz=T, xpd=NA, bty="n")
|
||||
invisible()
|
||||
}
|
||||
|
||||
#
|
||||
# plot confusion scores histograms
|
||||
#
|
||||
|
||||
cons.histconf <- function(conf, main="junction logr score") {
|
||||
lrh <- hist(c(conf$l2scor, conf$r2scor), br=50, plot=F)
|
||||
lh <- hist(conf$l2scor, br=lrh$breaks, plot=F)
|
||||
rh <- hist(conf$r2scor, br=lrh$breaks, plot=F)
|
||||
xx <- rbind(lh$counts, rh$counts) / sum(lh$counts)
|
||||
colnames(xx) <- lrh$mids
|
||||
barplot(xx, col=c(3,2), beside=T, main=main)
|
||||
legend(0, 0.1, c("true", "shuffled"), fill=c(3,2), horiz=F, xpd=NA)
|
||||
invisible()
|
||||
}
|
93
detectors/cds/tools/lib/util.grid.r
Normal file
93
detectors/cds/tools/lib/util.grid.r
Normal file
@ -0,0 +1,93 @@
|
||||
#
|
||||
# R misc grid plotting
|
||||
#
|
||||
|
||||
require(grid)
|
||||
require(gridExtra)
|
||||
|
||||
#
|
||||
# get line height
|
||||
#
|
||||
|
||||
grd.lineheight <- function(s="X") {
|
||||
convertHeight(unit(1,"strheight", s), "native", valueOnly=T)
|
||||
}
|
||||
|
||||
#
|
||||
# quantile table
|
||||
#
|
||||
|
||||
grd.qtab <- function(df, what, cols, n=5) {
|
||||
df <- df[order(df[,what], decreasing=T),cols]
|
||||
sep <- head(df,1)
|
||||
sep[] <- "-"
|
||||
rbind(head(df, n), sep, tail(df, n))
|
||||
}
|
||||
|
||||
#
|
||||
# histogram with tables
|
||||
#
|
||||
|
||||
grd.hist <- function(df, what, cols = c(1,2, which(colnames(df) == what)),
|
||||
breaks=50, pos.sum=c(0.2,0.6), pos.quant=c(0.7,0.6), cex=0.7,
|
||||
main=paste0("Histogram of ", what), ...) {
|
||||
hist(df[,what], breaks=breaks, xlab=what, main=main, ...)
|
||||
if (! is.null(pos.sum)) {
|
||||
pushViewport(viewport(pos.sum[1], pos.sum[2], gp=gpar(cex=cex)))
|
||||
grid.table(x<-summary(df[,what]), rows=names(x))
|
||||
popViewport()
|
||||
}
|
||||
if (! is.null(pos.quant)) {
|
||||
pushViewport(viewport(pos.quant[1], pos.quant[2], gp=gpar(cex=cex)))
|
||||
grid.table(grd.qtab(df, what, cols), rows=NULL)
|
||||
popViewport()
|
||||
}
|
||||
invisible()
|
||||
}
|
||||
|
||||
#
|
||||
# plot with fit
|
||||
#
|
||||
|
||||
grd.fplot <- function(df, what.x, what.y, linfit=T, pos=c(0.2, 0.8), ablin=NULL, ...) {
|
||||
plot(df[,what.x], df[,what.y], xlab=what.x, ylab=what.y, ...)
|
||||
if (linfit) {
|
||||
fit <- lm(df[,what.y] ~ df[,what.x])
|
||||
abline(fit, col=2)
|
||||
pushViewport(viewport(gp=gpar(col=2)))
|
||||
a <- sprintf("%.2e", coef(fit)[2])
|
||||
b <- sprintf("%.2e", coef(fit)[1])
|
||||
grid.text(paste0(what.y, " = ", a, " * ", what.x, " + ", b),
|
||||
pos[1], pos[2], just="left")
|
||||
pos[2] = pos[2] - 2 * grd.lineheight()
|
||||
grid.text(paste0("R2=", round(summary(fit)$r.squared, 3)),
|
||||
pos[1], pos[2], just="left")
|
||||
popViewport()
|
||||
}
|
||||
if (! is.null(ablin))
|
||||
do.call(abline, ablin)
|
||||
invisible()
|
||||
}
|
||||
|
||||
#
|
||||
# write text
|
||||
#
|
||||
|
||||
grd.textpage <- function(..., lineno=0, left=0.1, top=0.9, cex=1, fact=1.4) {
|
||||
txt <- do.call(paste0, list(...))
|
||||
pushViewport(viewport(gp=gpar(cex=cex)))
|
||||
grid.text(txt, left, top-lineno*grd.lineheight()*fact, just="left")
|
||||
popViewport()
|
||||
invisible(txt)
|
||||
}
|
||||
|
||||
#
|
||||
# title page
|
||||
#
|
||||
|
||||
grd.titlepage <- function(title, x=0.5, y=0.7, cex=3, ...) {
|
||||
notify("processing", title)
|
||||
grid.newpage()
|
||||
grid.text(title, x, y, gp=gpar(cex=cex), ...)
|
||||
invisible()
|
||||
}
|
90
detectors/cds/tools/lib/util.modelio.r
Normal file
90
detectors/cds/tools/lib/util.modelio.r
Normal file
@ -0,0 +1,90 @@
|
||||
#
|
||||
# R models I/O utilities
|
||||
#
|
||||
|
||||
#
|
||||
# write start model
|
||||
#
|
||||
|
||||
write.model.start <- function(frq, what) {
|
||||
dir.create("models", showWarnings=F)
|
||||
fil <- paste0("models/start.", what, ".frq")
|
||||
notify("writing start model:", fil)
|
||||
cat("# start model :", what, "\n", file=fil)
|
||||
for (x in names(frq))
|
||||
cat(x, frq[x]/sum(frq), frq[x], "\n", file=fil, append=T)
|
||||
invisible(fil)
|
||||
}
|
||||
|
||||
#
|
||||
# write stop model
|
||||
#
|
||||
|
||||
write.model.stop <- function(frq, what) {
|
||||
dir.create("models", showWarnings=F)
|
||||
fil <- paste0("models/stop.", what, ".frq")
|
||||
notify("writing stop model:", fil)
|
||||
cat("# stop model :", what, "(freq. ignored)\n", file=fil)
|
||||
for (x in names(frq))
|
||||
cat(x, frq[x]/sum(frq), frq[x], "\n", file=fil, append=T)
|
||||
invisible(fil)
|
||||
}
|
||||
|
||||
#
|
||||
# write splice3 model
|
||||
# [FIXME] positions are hard-coded
|
||||
#
|
||||
|
||||
write.model.splice3 <- function(cons, what) {
|
||||
dir.create("models", showWarnings=F)
|
||||
fil <- paste0("models/splice3.", what, ".frq")
|
||||
notify("writing splice3 model:", fil)
|
||||
.catcons <- function(i) {
|
||||
cat(round(cons[c("a","c","g","t"), i]*100, 0), "\n",
|
||||
file=fil, append=T)
|
||||
}
|
||||
cat("# 3' splice model :", what, "\n", file=fil)
|
||||
cat("# A C G T\n", file=fil, append=T)
|
||||
sapply(seq.int(1, 4), .catcons)
|
||||
cat("splice\n", file=fil, append=T)
|
||||
sapply(seq.int(6, 11), .catcons)
|
||||
invisible(fil)
|
||||
}
|
||||
|
||||
#
|
||||
# write splice5 model
|
||||
# [FIXME] positions are hard-coded
|
||||
#
|
||||
|
||||
write.model.splice5 <- function(cons, what) {
|
||||
dir.create("models", showWarnings=F)
|
||||
fil <- paste0("models/splice5.", what, ".frq")
|
||||
notify("writing splice5 model:", fil)
|
||||
.catcons <- function(i) {
|
||||
cat(round(cons[c("a","c","g","t"), i]*100, 0), "\n",
|
||||
file=fil, append=T)
|
||||
}
|
||||
cat("# 5' splice model :", what, "\n", file=fil)
|
||||
cat("# A C G T\n", file=fil, append=T)
|
||||
sapply(seq.int(13, 18), .catcons)
|
||||
cat("splice\n", file=fil, append=T)
|
||||
sapply(seq.int(20, 23), .catcons)
|
||||
invisible(fil)
|
||||
}
|
||||
|
||||
#
|
||||
# write splice3/5 uniform model
|
||||
#
|
||||
|
||||
write.unif.splice <- function(pos, what) {
|
||||
dir.create("models", showWarnings=F)
|
||||
fil <- paste0("models/splice", pos, ".", what, ".frq")
|
||||
notify("writing uniform splice", pos, "model:", fil)
|
||||
cat("# 3'/5' splice null model", file=fil)
|
||||
cat("# A C G T\n", file=fil, append=T)
|
||||
cat("25 25 25 25\n", file=fil, append=T)
|
||||
cat("splice\n", file=fil, append=T)
|
||||
cat("25 25 25 25\n", file=fil, append=T)
|
||||
invisible(fil)
|
||||
}
|
||||
|
105
detectors/cds/tools/lib/util.plot.r
Normal file
105
detectors/cds/tools/lib/util.plot.r
Normal file
@ -0,0 +1,105 @@
|
||||
#
|
||||
# R plot utilities
|
||||
#
|
||||
|
||||
#
|
||||
# setup graphic device
|
||||
# tiff: high resolution 600 dpi
|
||||
# pdf
|
||||
#
|
||||
|
||||
uplot.init.dev <- function(fname, type="pdf", width=7, height=7, resol=600, ...) {
|
||||
fname <- paste0(fname, ".", type)
|
||||
res <- NULL
|
||||
if (type == "tiff") {
|
||||
res <- tiff(fname, width=width, height=height, units="in", res=resol, ...)
|
||||
}
|
||||
if (type == "pdf") {
|
||||
res <- pdf(fname, width=width, height=height, ...)
|
||||
}
|
||||
invisible(res)
|
||||
}
|
||||
|
||||
#
|
||||
# convert pdf to tiff using ghostscript
|
||||
#
|
||||
|
||||
uplot.convert2tiff <- function(fname, resol=600) {
|
||||
infile <- paste0(fname, ".pdf")
|
||||
oufile <- paste0(fname, ".tif")
|
||||
cmd <- paste0("echo quit | gs -r", resol, "-dBATCH -dNOPAUSE -sDEVICE=tiff12nc -sCompression=lzw -sOutputFile=", oufile, " ", infile)
|
||||
system(cmd)
|
||||
}
|
||||
|
||||
#
|
||||
# default plot setup
|
||||
#
|
||||
|
||||
uplot.setup <- function(mfrow=c(1,1),
|
||||
las=1,
|
||||
mgp=c(2, 0.7, 0),
|
||||
oma=c(0, 0, 0, 0),
|
||||
mar=c(4, 3, 3, 2),
|
||||
cex.main=1,
|
||||
font.main=1,
|
||||
family='Helvetica', ...) {
|
||||
par(mfrow=mfrow, las=las, mgp=mgp, oma=oma, mar=mar, cex.main=cex.main, font.main=font.main, family=family, ...)
|
||||
}
|
||||
|
||||
#
|
||||
# pie plot
|
||||
#
|
||||
|
||||
uplot.pie <- function(tab, main="", labels=c("name", "val", "per"), text.r=0.5, text.col="black", text.cex=1, main.pos=c(0,0), main.col="black", ...) {
|
||||
pie(tab, edges=2000, main="", labels="", ...)
|
||||
text(main.pos[1], main.pos[2], main, cex=1.5, col=main.col)
|
||||
prop <- tab/sum(tab)
|
||||
theta <- 2*pi * (cumsum(prop) - prop/2)
|
||||
lab <- list(name=names(tab), val=tab, per=sprintf("%d%%", round(prop*100)))
|
||||
lab <- apply(data.frame(lab[labels]), 1, paste, collapse="\n")
|
||||
if (length(lab) > 0)
|
||||
text(text.r*cos(theta), text.r*sin(theta), lab, cex=text.cex, col=text.col)
|
||||
invisible(NULL)
|
||||
}
|
||||
|
||||
#
|
||||
# plot utility : color representation of a table
|
||||
#
|
||||
|
||||
uplot.table <- function(tab, col=heat.colors(100), with.lines=TRUE) {
|
||||
image(as.matrix(tab), xaxt="n", yaxt="n", col=col)
|
||||
nli <- nrow(tab)
|
||||
nco <- ncol(tab)
|
||||
dx <- 0.5 / (nli-1)
|
||||
dy <- 0.5 / (nco-1)
|
||||
xf <- (seq_len(nli)-1)/(nli-1) - dx
|
||||
yf <- (seq_len(nco)-1)/(nco-1) - dy
|
||||
if (with.lines) {
|
||||
segments(xf, -dy, xf, 1+dy)
|
||||
segments(-dx, yf, 1+dx, yf)
|
||||
}
|
||||
Axis(c(0,1), at=xf+dx, side=1, labels=rownames(tab), las=2, cex.axis=0.5, padj=0)
|
||||
Axis(c(0,1), at=yf+dy, side=2, labels=colnames(tab), las=2, cex.axis=0.5, padj=0)
|
||||
invisible(NULL)
|
||||
}
|
||||
|
||||
#
|
||||
# plot utility : identify points within user's rectangle
|
||||
#
|
||||
|
||||
rect.identify <- function(data) {
|
||||
if (is.null(dim(data))) data <- cbind(seq_along(data), data)
|
||||
xy <- locator(n=2, type='n')
|
||||
r <- matrix(c(range(xy$x), range(xy$y)), ncol=2, byrow=TRUE)
|
||||
rect(r[1,1], r[2,1], r[1,2], r[2,2], border='red')
|
||||
.in.range <- function(p, r) {
|
||||
.in.int <- function(i) {
|
||||
(p[i] >= r[i,1]) && (p[i] <= r[i,2])
|
||||
}
|
||||
.in.int(1) && .in.int(2)
|
||||
}
|
||||
isel <- which(apply(data, 1, .in.range, r))
|
||||
points(data[isel,], col='red', pch=19)
|
||||
isel
|
||||
}
|
||||
|
Reference in New Issue
Block a user