cds/tools/chlorodb added

Former-commit-id: 0579e878a69b7c285ca71870e9ca5730649a2fda
Former-commit-id: 7cced5b488441d87bf070a9a444317db0e048880
This commit is contained in:
alain viari
2015-11-13 17:41:18 +01:00
parent 0d5f0c1f20
commit e4d6a8484d
585 changed files with 4750 additions and 50 deletions

View File

@ -0,0 +1,254 @@
#!/bin/csh -f
#
# make ChloroDB's
#
# usage: copy genbank/embl files into 'DB_DIR/download'
# usage: [create a paramter.sh file in 'DB_DIR']
# usage: go_chlorodb [DB_DIR]
#
unsetenv ORG_SOURCED
setenv ORG_HOME `dirname $0`/../../../..
source $ORG_HOME/scripts/csh_init.sh
#
# which DB to process
#
set DB_BASE = $DATA_DIR/cds/chlorodb # default location
if ($#Argv > 0) then
set DB_BASE = $Argv[1]; Shift
endif
set DB_BASE = `cd $DB_BASE && pwd -P`
NeedDir $DB_BASE/download
if (! -d $DB_BASE/info) mkdir $DB_BASE/info
if (! -d $DB_BASE/fasta) mkdir $DB_BASE/fasta
cd $DB_BASE/info
#
# params
#
if (! -e $DB_BASE/parameters.sh) then
@ n = `find $DB_BASE/download -depth 1 -type f -print | wc -l`
@ cor_cutoff = $n / 2
@ atg_cutoff = $n / 10
@ dbs_cutoff = $n / 4
if ($cor_cutoff == 0) @ cor_cutoff = 1
if ($atg_cutoff == 0) @ atg_cutoff = 1
if ($dbs_cutoff == 0) @ dbs_cutoff = 1
echo "# sourced file" > $DB_BASE/parameters.sh
echo "" >> $DB_BASE/parameters.sh
echo "set CORE_NCDS_CUTOFF = $cor_cutoff" >> $DB_BASE/parameters.sh
echo "set CORE_START_ATG_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
echo "set CORE_START_DFT_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
echo "set CORE_START_OTH_CUTOFF = 10" >> $DB_BASE/parameters.sh
echo "set CORE_STOP_CUTOFF = $cor_cutoff" >> $DB_BASE/parameters.sh
echo "set CORE_SPLICE_CUTOFF = $atg_cutoff" >> $DB_BASE/parameters.sh
echo "" >> $DB_BASE/parameters.sh
echo "set SHEL_NCDS_CUTOFF = 10" >> $DB_BASE/parameters.sh
echo "" >> $DB_BASE/parameters.sh
echo "set CORE_DELTA = Inf" >> $DB_BASE/parameters.sh
echo "set CORE_COVMIN = 30" >> $DB_BASE/parameters.sh
echo "set CORE_PMAX = 1e-6" >> $DB_BASE/parameters.sh
echo "set CORE_IDMIN = 30" >> $DB_BASE/parameters.sh
echo "set CORE_SIZMIN = $cor_cutoff" >> $DB_BASE/parameters.sh
echo "" >> $DB_BASE/parameters.sh
echo "set SHEL_DELTA = 0.5" >> $DB_BASE/parameters.sh
echo "set SHEL_COVMIN = 30" >> $DB_BASE/parameters.sh
echo "set SHEL_PMAX = 1e-6" >> $DB_BASE/parameters.sh
echo "set SHEL_IDMIN = 30" >> $DB_BASE/parameters.sh
echo "set SHEL_SIZMIN = $dbs_cutoff" >> $DB_BASE/parameters.sh
echo "" >> $DB_BASE/parameters.sh
echo "set DUST_DELTA = 0.5" >> $DB_BASE/parameters.sh
echo "set DUST_COVMIN = 30" >> $DB_BASE/parameters.sh
echo "set DUST_PMAX = 1e-6" >> $DB_BASE/parameters.sh
echo "set DUST_IDMIN = 30" >> $DB_BASE/parameters.sh
echo "set DUST_SIZMIN = 10" >> $DB_BASE/parameters.sh
endif
source $DB_BASE/parameters.sh
##set CMIN_COD = 0
##set FMIN_COD = 0.01
#
# temporarily uncompress
#
set ff = `find $DB_BASE/download -depth 1 -name \*.gz -print`
if ($#ff != 0) then
Notify "uncompressing $#ff entries"
foreach f ($ff)
gunzip -f $f
end
endif
#
# convert gbk/embl to fasta
#
set ff = `find $DB_BASE/download -depth 1 \( -name \*.gbk -or -name \*.embl \) -print`
Notify "convert $#ff gbk/embl entries to fasta"
foreach f ($ff)
set nom = `basename $f:r`
set typ = $f:e
$AwkCmd -f $LIB_DIR/$typ.tofasta.awk $f > $DB_BASE/fasta/$nom.fst
end
#
# get gbk/embl info
#
Notify "get gbk/embl info for $#ff entries"
echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.info.awk > db.info.txt # just get header
foreach f ($ff)
set nom = `basename $f:r`
set typ = $f:e
$AwkCmd -f $LIB_DIR/$typ.oneliner.awk $f |\
$AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/$typ.info.awk |\
egrep -v '^#' >> db.info.txt
end
#
# get cds info
#
Notify "get gbk/embl cds for $#ff entries"
echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.cds_long.awk > db.cds.txt # just get header
foreach f ($ff)
set nom = `basename $f:r`
set typ = $f:e
$AwkCmd -f $LIB_DIR/$typ.oneliner.awk $f |\
$AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
-f $LIB_DIR/$typ.cds_long.awk |\
egrep -v '^#' >> db.cds.txt
end
#
# get fasta for prots
#
Notify "get prots"
$AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/cds2fasta.awk db.cds.txt > db.prot.fst
#
# get introns
#
Notify "get gbk/embl introns for $#ff entries"
echo "" | awk -v HEADONLY=1 -f $LIB_DIR/gbk.intron.awk > db.intron.txt # just get header
foreach f ($ff)
set nom = `basename $f:r`
set typ = $f:e
$AwkCmd -f $LIB_DIR/$typ.oneliner.awk $f |\
$AwkCmd -v FASTA=$DB_BASE/fasta/$nom.fst -f $LIB_DIR/libutil.awk \
-f $LIB_DIR/$typ.intron.awk |\
egrep -v '^#' >> db.intron.txt
end
#
# make models
#
Notify "Making models"
echo -n "" > db.models.params.txt
echo "CORE_NCDS_CUTOFF <- $CORE_NCDS_CUTOFF" >> db.models.params.txt
echo "CORE_START_ATG_CUTOFF <- $CORE_START_ATG_CUTOFF" >> db.models.params.txt
echo "CORE_START_DFT_CUTOFF <- $CORE_START_DFT_CUTOFF" >> db.models.params.txt
echo "CORE_START_OTH_CUTOFF <- $CORE_START_OTH_CUTOFF" >> db.models.params.txt
echo "CORE_STOP_CUTOFF <- $CORE_STOP_CUTOFF" >> db.models.params.txt
echo "CORE_SPLICE_CUTOFF <- $CORE_SPLICE_CUTOFF" >> db.models.params.txt
echo "SHEL_NCDS_CUTOFF <- $SHEL_NCDS_CUTOFF" >> db.models.params.txt
$LIB_DIR/make.models.r |& Cat
GetStatus
OnError then
Error 2 "model parameter too stringent"
endif
#
# add matrices
#
cp -f $PROG_DIR/matrices/* models
#
# make subDBs
#
if (-e db.core.pat.txt) then
Notify "Making core DB (take some time... please wait)"
$PROG_DIR/subdb/go_subdb.sh db.prot.fst db.core.pat.txt \
$CORE_DELTA $CORE_COVMIN $CORE_PMAX $CORE_IDMIN $CORE_SIZMIN
endif
if (-e db.shell.pat.txt) then
Notify "Making shell DB (take some time... please wait)"
$PROG_DIR/subdb/go_subdb.sh db.prot.fst db.shell.pat.txt \
$SHEL_DELTA $SHEL_COVMIN $SHEL_PMAX $SHEL_IDMIN $SHEL_SIZMIN
endif
if (-e db.dust.pat.txt) then
Notify "Making dust DB (take some time... please wait)"
$PROG_DIR/subdb/go_subdb.sh db.prot.fst db.dust.pat.txt \
$DUST_DELTA $DUST_COVMIN $DUST_PMAX $DUST_IDMIN $DUST_SIZMIN
endif
#
# recompress entries
#
set ff = `find $DB_BASE/download -depth 1 -type f -print`
if ($#ff != 0) then
Notify "recompressing $#ff entries"
foreach f ($ff)
gzip -f $f
end
endif
# compress fasta
set ff = `find $DB_BASE/fasta -depth 1 -name \*.fst -print`
if ($#ff != 0) then
Notify "compressing $#ff fasta entries"
foreach f ($ff)
gzip -f $f
end
endif
# install everything in proper directory
foreach dir ("core" "shell" "dust")
if (-e $DB_BASE/$dir) \rm -r $DB_BASE/$dir
if ((-d db.$dir.pat.db) && (-e db.$dir.pat.db/Annot.lst)) then
Notify "installing $DB_BASE/$dir"
\mv -f db.$dir.pat.db $DB_BASE/$dir
endif
end
if (-e $DB_BASE/models) \rm -r $DB_BASE/models
if (-d models) \mv -f models $DB_BASE
Notify "Done"
exit 0

View File

@ -0,0 +1,29 @@
#
# blosum62 substitution matrix
# with larger penalty for stops
#
A R N D C Q E G H I L K M F P S T W Y V B Z X *
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -50
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -50
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -50
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -50
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -50
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -50
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -50
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -50
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -50
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -50
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -50
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -50
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -50
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -50
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -50
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -50
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -50
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -50
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -50
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -50
B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -50
Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -50
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -50
* -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 -50 1

View File

@ -0,0 +1,195 @@
#!/bin/csh -f
#
# usage: go_subdb.sh prot.fst pat.txt [deltalen covmin pmax idmin sizmin]
# usage: prot.fst : proteins fasta file
# usage: pat.txt : text file containing patterns and names for families to extract
# usage: output directory containig subdbs : basename <pat:r>.db
#
unsetenv ORG_SOURCED
setenv ORG_HOME `dirname $0`/../../../../..
source $ORG_HOME/scripts/csh_init.sh
NeedArg 2
set ProtFile = $Argv[1]; Shift
set PatFile = $Argv[1]; Shift
NeedFile $ProtFile
NeedFile $PatFile
#
# parameters
#
set Delta = 0.5
set Covmin = 30
set Pmax = 1e-6
set Idmin = 30
set Sizmin = 5
if ($#Argv > 0) then
set Delta = $Argv[1]; Shift
endif
if ($#Argv > 0) then
set Covmin = $Argv[1]; Shift
endif
if ($#Argv > 0) then
set Pmax = $Argv[1]; Shift
endif
if ($#Argv > 0) then
set Idmin = $Argv[1]; Shift
endif
if ($#Argv > 0) then
set Sizmin = $Argv[1]; Shift
endif
#
# output directory
#
set OutDir = `basename $PatFile:r`.db
if (-d $OutDir) \rm -r $OutDir
mkdir $OutDir
set OutLog = `basename $PatFile:r`.log
echo -n '' > $OutLog
alias Report 'egrep "^>" \!:1 | wc -l | awk -v P=`basename \!:1` -v H=\!:2 '"'{print H,P,"'$1}'"'"' >> $OutLog'
#
# remove entries with bad symbols
#
Notify "cleanup $ProtFile"
Report $ProtFile "init_size"
$AwkCmd -f $LIB_DIR/db.filter.sym.awk $ProtFile > P_$$
Report $ProtFile "cleanup_size"
#
# select by name pattern
#
Notify "select by patterns"
mkdir D_$$
mkdir E_$$
mkdir F_$$
set noms = `awk '{print $1}' $PatFile`
foreach nom ($noms)
set pat = `egrep "^$nom " $PatFile | awk '{print $2}'`
$AwkCmd -f $LIB_DIR/db.filter.pat.awk -v PAT="$pat" P_$$ > D_$$/$nom.fst
set n = `egrep '^>' D_$$/$nom.fst | wc -l`
Notify " pattern : $nom : $n"
Report D_$$/$nom.fst "pattern_filter"
if ($n <= $Sizmin) \rm -f D_$$/$nom.fst
end
set ok = `ls D_$$ | wc -l`
if ($ok == 0) goto fin
#
# select by length
#
Notify "select by length"
foreach f (D_$$/*.fst)
set nom = `basename $f:r`
$AwkCmd -f $LIB_DIR/db.getlen.awk $f > L_$$
$LIB_DIR/db.filter.len.r L_$$ $Delta |\
$AwkCmd '($NF == "TRUE") {print $2}' > M_$$
$AwkCmd -v FILE=M_$$ -f $LIB_DIR/db.subdb.awk $f > E_$$/$nom.fst
Report E_$$/$nom.fst "length_filter"
set n = `egrep '^>' E_$$/$nom.fst | wc -l`
Notify " length filter : $nom : $n"
if ($n <= $Sizmin) \rm -f E_$$/$nom.fst
end
set ok = `ls E_$$ | wc -l`
if ($ok == 0) goto fin
#
# select by similarity
#
Notify "select by similarity"
foreach f (E_$$/*.fst)
set nom = `basename $f:r`
Notify " blasting $nom"
makeblastdb -dbtype 'prot' -in $f >>& db.log
blastp -db $f -query $f -outfmt 7 > $f.blast.out
\rm -f $f.p??
$AwkCmd -v COVMIN=$Covmin -v PMAX=$Pmax -v IDMIN=$Idmin \
-f $LIB_DIR/db.blastlink.awk $f.blast.out |\
$AwkCmd -f $LIB_DIR/db.todl.awk > G_$$
($LIB_DIR/db.cc.r G_$$ > $f.cc.txt) >>& db.log
awk -v NAME=$nom -f $LIB_DIR/db.reportcc.awk $f.cc.txt >> $OutLog
$AwkCmd -f $LIB_DIR/db.selcc.awk $f.cc.txt > S_$$
$AwkCmd -v FILE=S_$$ -f $LIB_DIR/db.subdb.awk $f > F_$$/$nom.fst
Report F_$$/$nom.fst "similarity_filter"
set n = `egrep '^>' F_$$/$nom.fst | wc -l`
Notify " blast filter : $nom : $n"
if ($n <= $Sizmin) \rm -f F_$$/$nom.fst
end
set ok = `ls D_$$ | wc -l`
if ($ok == 0) goto fin
#
# annotations
#
echo -n "" > J_$$
foreach f (F_$$/*.fst)
$AwkCmd -f $LIB_DIR/db.annot.awk $f >> J_$$
end
awk '(NF >= 3) {print $1, $NF}' $PatFile | sort > A_$$
sort J_$$ | egrep -v '^ *$' > B_$$
join A_$$ B_$$ > F_$$/Annot.lst
#
# copy files
#
set n = `ls F_$$/* | wc -l`
Notify "copy $n files to $OutDir"
\mv -f F_$$/* $OutDir
#
# end
#
fin:
Notify "output directory : $OutDir"
\rm -r ?_$$
exit 0

View File

@ -0,0 +1,39 @@
#
/^>/ {
N++
na = split($1, a, "@")
if (a[na-1] > NEXMAX) NEXMAX = a[na-1]
NEX[a[na-1]]++
ANNOT[$NF]++
}
END {
na = split(FILENAME, a, "/")
na = split(a[na], a, "\\.")
printf("%s %d ", a[1], N)
s = ""
for (i = 1 ; i <= NEXMAX ; i ++) {
if (NEX[i] != 0)
s = s "" i ":" NEX[i] "_"
}
gsub("_+$", "", s)
printf("%s ", s)
s = (NEXMAX == 1) ? "MONEX" : "POLYEX"
printf("%s ", s)
nmax = 0
amax = "none"
for (e in ANNOT) {
if (ANNOT[e] > nmax) {
nmax = ANNOT[e]
amax = e
}
}
print amax
}

View File

@ -0,0 +1,48 @@
#
function min(x, y) {
return ((x < y) ? x : y)
}
BEGIN {
if (COVMIN == "") COVMIN = 50
if (PMAX == "") PMAX = 1e-6
if (IDMIN == "") IDMIN = 30
}
/^#/ {
hitnum = 0;
next;
}
{
if ($1 == $2) next
hitnum++;
na = split($1, a, "@");
if (na < 2) {
print "query file not properly formatted" > "/dev/stderr"
exit(1);
}
len1 = a[na];
na = split($2, a, "@");
if (na < 2) {
print "bank file not properly formatted" > "/dev/stderr"
exit(1);
}
len2 = a[na];
id = $3 + 0.0;
ali = $4;
covmin = ali * 100. / min(len1, len2);
proba = $11 + 0.0;
if ((covmin > COVMIN) && ((proba < PMAX) || (proba == 0)) && (id > IDMIN)) {
print $1, $2, hitnum, id, covmin, proba, ali, len1, len2;
}
}

View File

@ -0,0 +1,18 @@
#!/usr/bin/env Rscript
#
require(igraph, warn.conflicts=F)
args <- commandArgs(T)
path <- if(length(args) > 0) args[1] else 'graph.dl'
g <- read.graph(path, format='dl')
cc <- clusters(g)
res <- cbind(V(g)$name, membership(cc))
write.table(res, quote=FALSE, row.names=FALSE, col.names=FALSE)
quit(save="no")

View File

@ -0,0 +1,19 @@
#!/usr/bin/env Rscript
#
args <- commandArgs(T)
path <- if(length(args) > 0) args[1] else 'len.txt'
delta <- if(length(args) > 1) args[2] else 0.5
tab <- read.table(path, header=T)
lmed <- median(tab$len)
dlen <- lmed * as.numeric(delta)
tab$ok <- (abs(tab$len-lmed)/lmed) <= delta
write.table(tab, quote=F)
quit(save='no')

View File

@ -0,0 +1,10 @@
#
/^>/ {
split($1, a, "@")
ok = a[3] ~ PAT
}
ok {
print $0
}

View File

@ -0,0 +1,30 @@
#
#
#
function Check(seq) {
if (seq == "") return 0
gsub("[ACDEFGHIKLMNPQRSTVWXY\n]+", "", seq)
return (length(seq) == 0)
}
/^>/ {
if (Check(Seq)) {
print Name
printf("%s", Seq)
}
Name = $0
Seq = ""
next
}
{
Seq = Seq "" $0 "\n"
}
END {
if (Check(Seq)) {
print Name
printf("%s", Seq)
}
}

View File

@ -0,0 +1,10 @@
#
BEGIN {
print "id len"
}
/^>/ {
na = split($1, a, "@")
print substr($1, 2), a[na]
}

View File

@ -0,0 +1,15 @@
#
#
{
cnt[$NF]++
}
END {
n = asort(cnt)
printf("cc_size %s", NAME)
for (i = n ; i >= 1 ; i--)
printf(" %d", cnt[i])
print ""
}

View File

@ -0,0 +1,19 @@
#
{
N[$NF]++
E[$NF, N[$NF]] = $1
}
END {
cmax = 1
nmax = N[1]
for (i in N) {
if (N[i] > nmax) {
nmax = N[i]
cmax = i
}
}
for (i = 1 ; i <= nmax ; i++)
print E[cmax, i]
}

View File

@ -0,0 +1,17 @@
#
BEGIN {
if (FILE == "") FILE = "db.sel.txt"
while (getline < FILE)
INC[$1] = $1
close(FILE)
}
/^>/ {
name = substr($1, 2)
ok = name in INC
}
ok {
print $0
}

View File

@ -0,0 +1,21 @@
#
{
node[$1]++
node[$2]++
link[++M] = $1 " " $2
}
END {
for (n in node)
N++
print "DL n=" N
print "format = edgelist1"
print "labels embedded:"
print "data:"
for (i = 1 ; i <= M ; i++)
print link[i]
}

View File

@ -47,8 +47,8 @@ $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/$PrdType.cds_short.awk > P_$$
Notify "compare bank to predictions"
$AwkCmd -f $LIB_DIR/libnws.awk \
-f $LIB_DIR/compareCds.awk \
$AwkCmd -f $LIB_DIR/libnws.awk \
-f $LIB_DIR/compare.cds.awk \
R_$$ P_$$ > S_$$
# base statistics

View File

@ -15,6 +15,8 @@ NeedArg 1
egrep '^#|^MATCH' $* | awk -f $LIB_DIR/summary.cmp.awk > compare.txt
Notify "text file: compare.txt"
$LIB_DIR/summarize_cmp.r

View File

@ -0,0 +1,19 @@
#
# get fasta sequence from cds list
#
# [-v FIELD=13] CDS sequence
# [-v FIELD=14] Prot Sequence
#
BEGIN {
if (CHARPERLINE == "") CHARPERLINE = 50
if (FIELD == "") FIELD = 14
}
/^#/ { next }
{
name = $1 "@" $2 "@" $3 "@" $5 "@" $6 "@" $7 "@" $8 "@" int($9/3)
comment = $NF
PrintFasta($FIELD, name " " comment)
}

View File

@ -0,0 +1,97 @@
#
# get cds features from embl (long version)
#
# -v FASTA
# @include lib.embl.awk
BEGIN {
print "#locus locustag genefam gene from to strand nexon length status start stop dnaseq protseq product"
if (HEADONLY != "") exit(0)
if (MAXSPAN == "") MAXSPAN = 10000
if (FASTA == "") Error("No FASTA file specified", 1)
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
Seq = tolower(ReadFasta(FASTA))
}
/^ID / {
locus = $2
gsub(";", "", locus)
incds = 0
next
}
/^FT CDS/ {
revstrand = match($3, "^complement")
s = substr($0, 22)
gsub("^complement", "", s)
ok = ! match(s, "complement|order")
nexon = Nexons(s)
SpanLocation(s, sloc)
spanlen = sloc[2] - sloc[1] + 1
len = LenLocation(s)
ok = ok && (len < MAXSPAN)
cdsseq = ok ? SeqLocation(Seq, s, revstrand) : "XXX"
cstart = substr(cdsseq, 1,3)
cstop = substr(cdsseq, length(cdsseq)-2)
gene = "none"
locustag = "none"
product = "none"
translation = "X"
incds = 1
next
}
(incds && /^FT [^ ]/) {
print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),
nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
incds = 0
next
}
/^FT \/gene=/ {
split($0, a, "=")
gene = a[2]
gsub("^[^a-z,A-Z]+", "", gene)
gsub("\"", "", gene)
gsub(" ", "_", gene)
next
}
/^FT \/locus_tag=/ {
split($0, a, "=")
locustag = a[2]
gsub("\"", "", locustag)
gsub(" ", "_", locustag)
next
}
/^FT \/product=/ {
split($0, a, "=")
product = a[2]
gsub("\"", "", product)
gsub(" ", "_", product)
next
}
/^FT \/translation=/ {
split($0, a, "=")
translation = a[2]
gsub("\"", "", translation)
gsub(" ", "", translation)
next
}
END {
if (incds) {
print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),
nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
}
}

View File

@ -0,0 +1,97 @@
#
# get feature info from embl
#
# @include libgbk.awk
function GC(s, _local_, i, len) {
s = toupper(s)
len = length(s)
gsub("G|C", "", s)
return ((len - length(s)) * 100 / (len ? len : 1))
}
#
# rules
#
BEGIN {
print "#locus orga len oklen gc nbCds nbCds_int0 nbCds_int1 nbCds_intsup1 perCds_noex meanCdsSize nbtRNA nbrRNA nboRNA"
}
/^ID/ {
locus = $2
gsub(";", "", locus)
next
}
/^OS/ {
orga = substr($0, 6)
gsub(" ", "_", orga)
next
}
/^FT source/ {
GetLoc($3, loc);
len = loc[2];
next
}
/^FT CDS/ {
meanCds = meanCds * nbCds + LenLocation($3)
nbCds++
meanCds /= nbCds
n = Nexons($3)
if (n > 3) n = 3
nbCdx[n]++
next
}
/^FT tRNA/ {
nbTrna++
next
}
/^FT rRNA/ {
nbRrna++
next
}
/^FT mRNA/ {
next
}
/^FT .*RNA/ {
nbOrna++
next
}
/^SQ / {
inseq = 1
seq = ""
next
}
inseq && /^ / {
s = $0
gsub("[0-9]+", "", s)
gsub(" ", "", s)
seq = seq "" s
next
}
/^\/\// {
oklen = (len == length(seq) ? "ok" : "wrong")
gc = GC(seq)
print locus, orga, len, oklen, gc, nbCds+0, nbCdx[1]+0, \
nbCdx[2]+0, nbCdx[3]+0, (nbCdx[1]+0)*100/Max(1, nbCds+0), \
meanCds+0, nbTrna+0, nbRrna+0, nbOrna+0
nbCds = nbTrna = nbRrna = nbOrna = len = inseq = meanCds = 0
delete nbCdx
orga = locus = "?"
}

View File

@ -0,0 +1,97 @@
#
# get intron features from embl
#
# @include libembl.awk
BEGIN {
print "#locus locustag genefam gene from to strand intron_num intron_nb acceptor-donor status"
if (HEADONLY != "") exit(0)
if (FASTA == "") Error("No FASTA file specified", 1)
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
Seq = tolower(ReadFasta(FASTA))
}
/^ID / {
locus = $2
gsub(";", "", locus)
next
}
/^FT CDS/ {
revstrand = match($3, "^complement")
s = substr($0, 22)
gsub("^complement", "", s)
ok = ! match(s, "complement|order")
if (! ok) next
na = ParseLocation(s, locs)
if (na < 2) next
delete SINfo
Ninfo = 0
val = locs[1][1]
for (i = 2 ; i <= na ; i++) {
if (locs[i][1] < val) ok = 0
val = locs[i][1]
}
if (! ok) next
val = locs[1][2]
for (i = 2 ; i <= na ; i++) {
if (locs[i][2] < val) ok = 0
val = locs[i][2]
}
if (! ok) next
from = locs[1][2] + 1
for (i = 2 ; i <= na ; i++) {
to = locs[i][1] - 1
inseq = SeqLocation(Seq, (from - 4) ".." (to + 4), revstrand)
SINfo[++Ninfo] = from " " to " " (revstrand ? "R" : "D") " "\
(revstrand ? na-i+1 : i-1) " " na-1 " "\
substr(inseq, 1,4) "."\
substr(inseq, 5,6) "-"\
substr(inseq, length(inseq)-9, 6) "."\
substr(inseq, length(inseq)-3, 4) " "\
"ok"
from = locs[i][2] + 1
}
gene = "none"
locustag = "none"
next
}
/^FT \/gene=/ {
split($0, a, "=")
gene = a[2]
gsub("^[^a-z,A-Z]+", "", gene)
gsub("\"", "", gene)
gsub(" ", "_", gene)
next
}
/^FT \/locus_tag=/ {
split($0, a, "=")
locustag = a[2]
gsub("\"", "", locustag)
gsub(" ", "_", locustag)
next
}
/^FT \/translation=/ {
for (i = 1 ; i <= Ninfo ; i++)
print locus, locustag, GeneFamily(gene), gene, SINfo[i]
Ninfo = 0
next
}
/^\/\// {
locus = "?"
}

View File

@ -0,0 +1,36 @@
#
# get fasta sequence from embl
#
/^ID / {
locus = $2
gsub(";", "", locus)
next
}
/^SQ / {
inseq = 1
nln = 0
delete seq
next
}
/^\/\// {
inseq = 0
print ">" locus
for (i = 1 ; i <= nln ; i++)
print seq[i]
next
}
inseq {
s = $0
gsub(" ", "", s)
gsub("[0-9]+", "", s)
seq[++nln] = s
next
}

View File

@ -0,0 +1,99 @@
#
# get cds features from genbank (long version)
#
# -v FASTA
# @include libgbk.awk
BEGIN {
print "#locus locustag genefam gene from to strand nexon length status start stop dnaseq protseq product"
if (HEADONLY != "") exit(0)
if (MAXSPAN == "") MAXSPAN = 10000
if (FASTA == "") Error("No FASTA file specified", 1)
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
Seq = tolower(ReadFasta(FASTA))
}
/^LOCUS/ {
locus = $2
incds = 0
next
}
/^ CDS/ {
revstrand = match($2, "^complement")
s = substr($0, 22)
gsub("^complement", "", s)
ok = ! match(s, "complement|order")
nexon = Nexons(s)
SpanLocation(s, sloc)
spanlen = sloc[2] - sloc[1] + 1
len = LenLocation(s)
ok = ok && (len < MAXSPAN)
cdsseq = ok ? SeqLocation(Seq, s, revstrand) : "XXX"
cstart = substr(cdsseq, 1,3)
cstop = substr(cdsseq, length(cdsseq)-2)
gene = "none"
locustag = "none"
product = "none"
translation = "X"
incds = 1
next
}
(incds && /^ [^ ]/) {
print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),
nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
incds = 0
next
}
/^ \/gene=/ {
split($0, a, "=")
gene = a[2]
gsub("^[^a-z,A-Z]+", "", gene)
gsub("\"", "", gene)
gsub(" ", "_", gene)
next
}
/^ \/locus_tag=/ {
split($0, a, "=")
locustag = a[2]
gsub("\"", "", locustag)
gsub(" ", "_", locustag)
next
}
/^ \/product=/ {
split($0, a, "=")
product = a[2]
gsub("\"", "", product)
gsub(" ", "_", product)
next
}
/^ \/translation=/ {
split($0, a, "=")
translation = a[2]
gsub("\"", "", translation)
gsub(" ", "", translation)
next
}
/^\/\// {
locus = "?"
}
END {
if (incds) {
print locus, locustag, GeneFamily(gene), gene, sloc[1], sloc[2], (revstrand ? "R" : "D"),
nexon, len, (ok ? "Ok" : "Error"), cstart, cstop, cdsseq, translation, product
}
}

View File

@ -0,0 +1,97 @@
#
# get feature info from genbank
#
# @include libgbk.awk
function GC(s, _local_, i, len) {
s = toupper(s)
len = length(s)
gsub("G|C", "", s)
return ((len - length(s)) * 100 / len)
}
#
# rules
#
BEGIN {
print "#locus orga len oklen gc nbCds nbCds_int0 nbCds_int1 nbCds_intsup1 perCds_noex meanCdsSize nbtRNA nbrRNA nboRNA"
}
/^LOCUS/ {
locus = $2
next
}
/^ ORGANISM/ {
orga = substr($0, 13)
split(orga, a, ";")
orga = a[1]
gsub(" ", "_", orga)
next
}
/^ source/ {
GetLoc($2, loc);
len = loc[2];
next
}
/^ CDS/ {
meanCds = meanCds * nbCds + LenLocation($2)
nbCds++
meanCds /= nbCds
n = Nexons($2)
if (n > 3) n = 3
nbCdx[n]++
next
}
/^ tRNA/ {
nbTrna++
next
}
/^ rRNA/ {
nbRrna++
next
}
/^ mRNA/ {
next
}
/^ .*RNA/ {
nbOrna++
next
}
/^ORIGIN/ {
inseq = 1
seq = ""
next
}
inseq && /^ +[1-9][0-9]*/ {
s = substr($0, 11)
gsub(" ", "", s)
seq = seq "" s
next
}
/^\/\// {
oklen = (len == length(seq) ? "ok" : "wrong")
gc = GC(seq)
print locus, orga, len, oklen, gc, nbCds+0, nbCdx[1]+0, \
nbCdx[2]+0, nbCdx[3]+0, (nbCdx[1]+0)*100/Max(1, nbCds+0), \
meanCds+0, nbTrna+0, nbRrna+0, nbOrna+0
nbCds = nbTrna = nbRrna = nbOrna = len = inseq = meanCds = 0
delete nbCdx
orga = locus = "?"
}

View File

@ -0,0 +1,96 @@
#
# get intron features from genbank
#
# @include libgbk.awk
BEGIN {
print "#locus locustag genefam gene from to strand intron_num intron_nb acceptor-donor status"
if (HEADONLY != "") exit(0)
if (FASTA == "") Error("No FASTA file specified", 1)
if (! TestPath(FASTA)) Error("Fasta file: '" FASTA "' not found", 1)
Seq = tolower(ReadFasta(FASTA))
}
/^LOCUS/ {
locus = $2
next
}
/^ CDS/ {
revstrand = match($2, "^complement")
s = substr($0, 22)
gsub("^complement", "", s)
ok = ! match(s, "complement|order")
if (! ok) next
na = ParseLocation(s, locs)
if (na < 2) next
delete SINfo
Ninfo = 0
val = locs[1][1]
for (i = 2 ; i <= na ; i++) {
if (locs[i][1] < val) ok = 0
val = locs[i][1]
}
if (! ok) next
val = locs[1][2]
for (i = 2 ; i <= na ; i++) {
if (locs[i][2] < val) ok = 0
val = locs[i][2]
}
if (! ok) next
from = locs[1][2] + 1
for (i = 2 ; i <= na ; i++) {
to = locs[i][1] - 1
inseq = SeqLocation(Seq, (from - 4) ".." (to + 4), revstrand)
SINfo[++Ninfo] = from " " to " " (revstrand ? "R" : "D") " "\
(revstrand ? na-i+1 : i-1) " " na-1 " "\
substr(inseq, 1,4) "."\
substr(inseq, 5,6) "-"\
substr(inseq, length(inseq)-9, 6) "."\
substr(inseq, length(inseq)-3, 4) " "\
"ok"
from = locs[i][2] + 1
}
gene = "none"
locustag = "none"
next
}
/^ \/gene=/ {
split($0, a, "=")
gene = a[2]
gsub("^[^a-z,A-Z]+", "", gene)
gsub("\"", "", gene)
gsub(" ", "_", gene)
next
}
/^ \/locus_tag=/ {
split($0, a, "=")
locustag = a[2]
gsub("\"", "", locustag)
gsub(" ", "_", locustag)
next
}
/^ \/translation=/ {
for (i = 1 ; i <= Ninfo ; i++)
print locus, locustag, GeneFamily(gene), gene, SINfo[i]
Ninfo = 0
next
}
/^\/\// {
locus = "?"
}

View File

@ -0,0 +1,32 @@
#
# get fasta sequence from genbank
#
/^LOCUS/ {
locus = $2
next
}
/^ORIGIN/ {
inseq = 1
nln = 0
delete seq
}
inseq && /^ +[1-9][0-9]*/ {
s = substr($0, 11)
gsub(" ", "", s)
seq[++nln] = s
next
}
/^\/\// {
print ">" locus
for (i = 1 ; i <= nln ; i++)
print seq[i]
}

View File

@ -0,0 +1,31 @@
#!/usr/bin/env Rscript
#
# check and install required packages
#
out <- function(...) {
cat(paste0('+ ', ..., '\n'), file=stderr())
}
installed <- function(package) {
package %in% rownames(installed.packages())
}
check <- function(package, repos="http://cran.univ-lyon1.fr") {
if (installed(package)) {
out("R package ", package, " installed")
} else {
out("Installing R package ", package, " from ", repos)
install.packages(package, repos=repos)
}
invisible(installed(package))
}
check("grid")
check("gridExtra")
check("vcd")
check("plotrix")
check("igraph")
quit(save='no', status=0)

View File

@ -0,0 +1,224 @@
#!/usr/bin/env Rscript
#
# compute start, stop, splice-junctions models for core DB
#
# source("make.models.r")
#
LIB_DIR <- Sys.getenv("LIB_DIR")
if (LIB_DIR == "") LIB_DIR = "."
source(paste0(LIB_DIR, "/util.base.r"))
source(paste0(LIB_DIR, "/util.cons.r"))
source(paste0(LIB_DIR, "/util.modelio.r"))
# -------------------------------
# parameters
# -------------------------------
# core cutoffs
source("db.models.params.txt")
# -------------------------------
# genome infos
# -------------------------------
notify("loading info table")
chromo <- read.table("db.info.txt", com="", head=T, stringsAsFactors=F)
# -------------------------------
# CDS
# -------------------------------
notify("loading cds table")
cds <- read.table("db.cds.txt", com="", header=T, stringsAsFactors=F)
cds$start <- as.factor(cds$start)
cds$stop <- as.factor(cds$stop)
cds <- cds[cds$status=="Ok",]
cds <- cds[cds$genefam!="none",]
cds$categ <- "dust"
x <- sort(table(cds$genefam), dec=T)
ok <- names(x[x >= CORE_NCDS_CUTOFF])
cds$categ[cds$genefam %in% ok] <- "core"
x <- x[! names(x) %in% ok]
ok <- names(x[x >= SHEL_NCDS_CUTOFF])
cds$categ[cds$genefam %in% ok] <- "shell"
#
cds.ori <- cds
cds.lst <- split(cds.ori, cds.ori$categ)
#
# write out families
#
# patterns & names
invisible(lapply(cds.lst, function(cds) {
x <- sort(table(cds$genefam), decreasing=T)
tab <- paste0("^", names(x), "$")
names(tab) <- names(x)
y <- sapply(split(cds$gene, cds$genefam), function(g) {
head(names(sort(table(g), decreasing=T)), 1)
})
tab <- cbind(tab, y[names(x)])
categ <- unique(cds$categ)
f <- paste0("db.", categ, ".pat.txt")
notify("writing patterns for", categ, ":", f)
write.table(tab, file=f, quote=F, col.names=F, row.names=T)
}))
# -------------------------------
# Start models (core only)
# -------------------------------
if (! "core" %in% names(cds.lst)) {
notify("*** no gene found in core")
notify("*** please change parameters")
quit(save='no', status=1)
}
cds <- cds.lst[["core"]]
#
# start by genes
#
tab <- split(cds$start, cds$genefam)
fatg <- sapply(tab, function(x) table(x)["atg"]/length(x)*100)
names(fatg) <- names(tab)
start.dft <- names(which(fatg >= CORE_START_ATG_CUTOFF))
start.spc <- names(which(fatg < CORE_START_ATG_CUTOFF))
tab <- cds[cds$genefam %in% start.dft,]
tab <- table(tab$start)
# default model
x <- sort(tab[tab>=CORE_START_DFT_CUTOFF], decreasing=T)
write.model.start(x, "default")
# gene specific models
invisible(sapply(start.spc, function(g) {
x <- cds[cds$genefam == g,]
tx <- table(x$start)
tx <- sort(tx[tx>=CORE_START_OTH_CUTOFF], decreasing=T)
write.model.start(tx, g)
}))
# -------------------------------
# Stop models (core only)
# -------------------------------
# write default stop model
tab <- table(cds$stop)
x <- sort(tab[tab>=CORE_STOP_CUTOFF], decreasing=T)
write.model.stop(x, "default")
# -------------------------------
# splice junctions
# -------------------------------
notify("loading intron table")
intron <- read.table("db.intron.txt", com="", header=T, stringsAsFactors=F)
# remove invalid sequences
intron$seq <- gsub("\\.|-", "", intron$acceptor.donor)
lseq <- nchar(gsub("[^acgt]", "", intron$seq))
intron <- intron[lseq == 20,]
# remove genes out of core
intron <- intron[intron$genefam %in% cds$genefam,]
# acceptors / donors
intron$acc <- substr(intron$seq, 5, 6)
intron$don <- substr(intron$seq, 15, 16)
# consensus
cons.px <- cons.build(intron$acceptor.donor)
cons.px <- cons.px[,! is.nan(colSums(cons.px))]
seq.px <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))
conf.px <- cons.confusion(cons.px, seq.px)
sfam <- split(conf.px$l2scor, intron$genefam)
sfam <- sfam[order(sapply(sfam, median))]
# extract splice exceptions
name.bad <- names(which(sapply(sfam, median) < 0))
name.spc <- names(which(sapply(sfam[name.bad], length) >= CORE_SPLICE_CUTOFF))
name.ok <- setdiff(unique(intron$genefam), name.bad)
name.bad <- setdiff(name.bad, name.spc)
name.list <- c(sapply(name.spc, function(x) x), list(default=name.ok))
cons <- lapply(name.list, function(x) cons.build(intron[intron$genefam %in% x, "acceptor.donor"]))
# write junction models
invisible(sapply(names(cons), function(n) write.model.splice3(cons[[n]], n)))
invisible(sapply(names(cons), function(n) write.model.splice5(cons[[n]], n)))
# use uniform model for bad guys
invisible(sapply(name.bad, function(n) write.unif.splice(3, n)))
invisible(sapply(name.bad, function(n) write.unif.splice(5, n)))
invisible(write.unif.splice('', "none"))
# -------------------------------
# keep data for plotting
# -------------------------------
DB <- list()
params <- list()
params$CORE_NCDS_CUTOFF <- CORE_NCDS_CUTOFF
params$CORE_START_ATG_CUTOFF <- CORE_START_ATG_CUTOFF
params$CORE_START_DFT_CUTOFF <- CORE_START_DFT_CUTOFF
params$CORE_START_OTH_CUTOFF <- CORE_START_OTH_CUTOFF
params$CORE_STOP_CUTOFF <- CORE_STOP_CUTOFF
params$CORE_SPLICE_CUTOFF <- CORE_SPLICE_CUTOFF
params$SHEL_NCDS_CUTOFF <- SHEL_NCDS_CUTOFF
DB$params <- params
DB$chromo <- chromo
DB$cds.lst <- cds.lst
DB$intron <- intron
DB$cons <- cons
notify("saving db.data.Rdata")
save(DB, file="db.data.Rdata")
# -------------------------------
# end
# -------------------------------
quit(save='no')

View File

@ -0,0 +1,424 @@
#!/usr/bin/env Rscript
#
# plots models previously computed by make.models.r
#
# source("plot.models.r")
#
require(vcd)
require(plotrix)
LIBDIR <- Sys.getenv("LIB_DIR")
if (LIBDIR == "") LIBDIR = "."
source(paste0(LIBDIR, "/util.base.r"))
source(paste0(LIBDIR, "/util.plot.r"))
source(paste0(LIBDIR, "/util.cons.r"))
source(paste0(LIBDIR, "/util.grid.r"))
# -------------------------------
# setup
# -------------------------------
OUT.DEV <- TRUE
OUT.TYPE <- "pdf"
OUT.FILE <- "models"
if (OUT.DEV) uplot.init.dev(OUT.FILE, OUT.TYPE)
# -------------------------------
# Load data
# -------------------------------
notify("loading DB data")
load("db.data.Rdata")
params <- DB$params
chromo <- DB$chromo
cds.lst <- DB$cds.lst
intron <- DB$intron
cons <- DB$cons
# -------------------------------
# Genomes infos
# -------------------------------
grd.titlepage("Species")
grd.textpage(lineno=1, "# org: ", nrow(chromo))
#
# general stats
#
grd.hist(chromo, "len", main="Histogram of chromosome length")
grd.hist(chromo, "gc", pos.quant=c(0.75, 0.6), main="Histogram of chromosome GC")
grd.hist(chromo, "nbCds")
grd.fplot(chromo, "len", "nbCds")
#
# nb cds no introns
#
chromo$nbCds_Mono <- chromo$nbCds_int0
chromo$nbCds_Poly <- chromo$nbCds_int1 + chromo$nbCds_intsup1
chromo$percentPoly <- round(chromo$nbCds_Poly * 100 / (chromo$nbCds_Poly + chromo$nbCds_Mono))
grd.hist(chromo, "nbCds_Mono", main="Histogram of monoexonic Cds")
grd.hist(chromo, "nbCds_Poly", main="Histogram of polyexonic Cds")
grd.hist(chromo, "percentPoly", pos.sum=c(0.23,0.6), main="Histogram of % polyexonic")
grd.fplot(chromo, "nbCds", "nbCds_Mono", TRUE, ablin=list(a=0, b=1, col=3))
grd.fplot(chromo, "nbCds", "nbCds_Poly")
#
# cds size
#
grd.hist(chromo, "meanCdsSize", pos.quant=NULL, main="Histogram of Cds size")
# -------------------------------
# CDS
# -------------------------------
cds.all <- do.call(rbind, cds.lst)
grd.titlepage("CDS")
grd.textpage(lineno=1, "# core cds core cutoff: ", params$CORE_NCDS_CUTOFF)
grd.textpage(lineno=2, "# core cds shell cutoff: ", params$SHEL_NCDS_CUTOFF)
grd.textpage(lineno=4, "# total cds: ", nrow(cds.all))
grd.textpage(lineno=5, "# core cds: ", nrow(cds.lst[["core"]]))
grd.textpage(lineno=6, "# shell cds: ", nrow(cds.lst[["shell"]]))
grd.textpage(lineno=7, "# dust cds: ", nrow(cds.lst[["dust"]]))
grd.textpage(lineno=9, "# total org: ", length(unique(cds.all$X.locus)))
grd.textpage(lineno=10, "# core org: ", length(unique(cds.lst[["core"]]$X.locus)))
grd.textpage(lineno=11, "# shell org: ", length(unique(cds.lst[["shell"]]$X.locus)))
grd.textpage(lineno=12, "# dust org: ", length(unique(cds.lst[["dust"]]$X.locus)))
grd.textpage(lineno=14, "# total families: ", length(unique(cds.all$genefam)))
grd.textpage(lineno=15, "# core families: ", length(unique(cds.lst[["core"]]$genefam)))
grd.textpage(lineno=16, "# shell families: ", length(unique(cds.lst[["shell"]]$genefam)))
grd.textpage(lineno=17, "# dust families: ", length(unique(cds.lst[["dust"]]$genefam)))
uplot.setup(mfrow=c(2,2), xpd=NA)
x <- sapply(cds.lst, nrow)
uplot.pie(x, main="CDS", text.r=1.1, col=c(3,2,4))
x <- sapply(cds.lst, function(cds) length(unique(cds$X.locus)))
uplot.pie(x, main="ORG", text.r=1.1, col=c(3,2,4))
x <- sapply(cds.lst, function(cds) length(unique(cds$genefam)))
uplot.pie(x, main="FAM", text.r=1.1, col=c(3,2,4))
uplot.setup(xpd=F)
#
# plot genes cutoff
#
cds.all <- do.call(rbind, cds.lst)
cds.byfam <- split(cds.all, cds.all$genefam)
tab <- sort(sapply(cds.byfam, nrow), decreasing=T)
cols <- rep("red", length(tab))
cols[tab >= params$SHEL_NCDS_CUTOFF] <- "blue"
cols[tab >= params$CORE_NCDS_CUTOFF] <- "green"
barplot(tab, col=cols, border=NA, main="# genes")
cols <- cols[tab >= 50]
tab <- tab[tab >= 50]
barplot(tab, col=cols, border=NA, las=2, cex.names=0.5, main="# genes in core")
abline(h=params$CORE_NCDS_CUTOFF, col=1)
text(50, 200, "CORE_NCDS_CUTOFF", pos=3)
#
# cds length for core
#
invisible(sapply(c("core", "shell", "dust"), function(what) {
cds <- cds.lst[[what]]
x <- split(cds$length, cds$genefam)
x <- x[order(sapply(x, mean))]
uplot.setup(mfrow=c(2,1))
boxplot(x, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1),
las=2, cex.axis=0.5, main=paste0(what, " genes - length distribution"))
boxplot(x, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1), ylim=c(0,2000),
las=2, cex.axis=0.5, main=paste0(what, " genes - length distribution zoom"))
uplot.setup()
}))
# -------------------------------
# starts & stops
# -------------------------------
cds <- cds.lst[["core"]]
grd.titlepage("Starts and Stops")
tab <- sort(table(cds$start), dec=T)
tab <- tab[tab >= 100]
tab <- tab / sum(tab) * 100
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="start frequencies (%)")
text(0.7, 50, round(tab[1], 2))
text(1.9, 2.3, round(tab[2], 2))
text(3.1, 2.3, round(tab[3], 2))
#
# start by org and gc
#
x <- split(cds$start, cds$X.locus)
fatg <- sapply(x, function(x) table(x)["atg"]/length(x)*100)
names(fatg) <- names(x)
chromo$fatg <- round(fatg[chromo$X.locus], 2)
fgtg <- sapply(x, function(x) table(x)["gtg"]/length(x)*100)
names(fgtg) <- names(x)
chromo$fgtg <- round(fgtg[chromo$X.locus], 2)
facg <- sapply(x, function(x) table(x)["acg"]/length(x)*100)
names(facg) <- names(x)
chromo$facg <- round(facg[chromo$X.locus], 2)
grd.hist(chromo, "fatg", pos.quant=c(0.5, 0.6), main="Histogram of atg freq. by org")
grd.hist(chromo, "fgtg", main="Histogram of gtg freq. by org")
grd.hist(chromo, "facg", pos.sum=c(0.3, 0.6), main="Histogram of acg freq. by org")
grd.fplot(chromo, "gc", "fatg", main="atg freq. by org GC", pos=c(0.2, 0.3))
grd.fplot(chromo, "gc", "fgtg", main="gtg freq. by org GC")
grd.fplot(chromo, "gc", "facg", main="acg freq. by org GC")
ter <- cbind(fatg, fgtg, facg)
colnames(ter) <- c("ATG", "GTG", "ACG")
igc <- cut(chromo$gc, breaks=quantile(chromo$gc, seq(0, 1, 0.1)), include.lowest=T, labels=1:10)
cols <- rainbow(10)[igc]
ternaryplot(ter, col=cols, cex=0.2, main="Start by org", labels="outside")
#
# start by common genes
#
x <- split(cds$start, cds$genefam)
fatg <- sapply(x, function(x) table(x)["atg"]/length(x)*100)
names(fatg) <- names(x)
fgtg <- sapply(x, function(x) table(x)["gtg"]/length(x)*100)
names(fgtg) <- names(x)
facg <- sapply(x, function(x) table(x)["acg"]/length(x)*100)
names(facg) <- names(x)
barplot(sort(fatg)[1:10], las=2, main="atg freq. by gene")
barplot(sort(fgtg, dec=T)[1:10], las=2, main="gtg freq. by gene")
barplot(sort(facg, dec=T)[1:10], las=2, main="acg freq. by gene")
ter <- cbind(fatg, fgtg, facg)
colnames(ter) <- c("ATG", "GTG", "ACG")
ternaryplot(ter, col=1, cex=0.5, id=rownames(ter), main="Starts by genes", labels="outside")
# -------------------------------
# stops
# -------------------------------
#
# stop by org and gc
#
tab <- sort(table(cds$stop), dec=T)
tab <- tab[tab >= 100]
tab <- tab / sum(tab) * 100
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="stop frequencies (%)")
text(0.7, 80, round(tab[1], 2))
text(1.9, 30, round(tab[2], 2))
text(3.1, 25, round(tab[3], 2))
x <- split(cds$stop, cds$X.locus)
ftaa <- sapply(x, function(x) table(x)["taa"]/length(x)*100)
names(ftaa) <- names(x)
chromo$ftaa <- round(ftaa[chromo$X.locus], 2)
ftag <- sapply(x, function(x) table(x)["tag"]/length(x)*100)
names(ftag) <- names(x)
chromo$ftag <- round(ftag[chromo$X.locus], 2)
ftga <- sapply(x, function(x) table(x)["tga"]/length(x)*100)
names(ftga) <- names(x)
chromo$ftga <- round(ftga[chromo$X.locus], 2)
grd.hist(chromo, "ftaa", pos.quant=c(0.7, 0.6), main="Histogram of taa freq. by org")
grd.hist(chromo, "ftag", pos.quant=c(0.8, 0.6), main="Histogram of tag freq. by org")
grd.hist(chromo, "ftga", pos.quant=c(0.8, 0.6), main="Histogram of tga freq. by org")
grd.fplot(chromo, "gc", "ftaa", main="taa freq. by org GC", pos=c(0.2, 0.3))
grd.fplot(chromo, "gc", "ftag", main="tag freq. by org GC")
grd.fplot(chromo, "gc", "ftga", main="tga freq. by org GC")
ter <- cbind(ftaa, ftag, ftga)
colnames(ter) <- c("TAA", "TAG", "TGA")
igc <- cut(chromo$gc, breaks=quantile(chromo$gc, seq(0, 1, 0.1)), include.lowest=T, labels=1:10)
cols <- rainbow(10)[igc]
ternaryplot(ter, col=cols, cex=0.2, main="Stops by org", labels="outside")
#
# stop by common genes
#
x <- split(cds$stop, cds$genefam)
ftaa <- sapply(x, function(x) table(x)["taa"]/length(x)*100)
names(ftaa) <- names(x)
ftag <- sapply(x, function(x) table(x)["tag"]/length(x)*100)
names(ftag) <- names(x)
ftga <- sapply(x, function(x) table(x)["tga"]/length(x)*100)
names(ftga) <- names(x)
barplot(sort(ftaa), las=2, cex.names=0.5, ylim=c(0,100), main="taa freq. by gene")
barplot(sort(ftag), las=2, cex.names=0.5, ylim=c(0,100), main="tag freq. by gene")
barplot(sort(ftga), las=2, cex.names=0.5, ylim=c(0,100), main="tga freq. by gene")
ter <- cbind(ftaa, ftag, ftga)
colnames(ter) <- c("TAA", "TAG", "TGA")
ternaryplot(ter, col=1, cex=0.3, id=rownames(ter), main="Stops by genes", labels="outside")
# -------------------------------
# splice junctions
# -------------------------------
grd.titlepage("Splice Junctions")
grd.textpage(lineno=1, "# intron in core: ", nrow(intron))
#
# intron size
#
intron$size <- intron$to - intron$from + 1
grd.hist(intron, "size", pos.quant=NULL, main="Histogram of intron size", br=1000, xlim=c(0,2000))
#
# nb intron / gene
#
x <- split(intron, intron$genefam)
x <- x[order(sapply(x, function(x) mean(x$intron_nb)), decreasing=T)]
nmax <- max(intron$intron_nb)
lintron <- lapply(x, function(x) x$intron_nb)
mintron <- sapply(lintron, function(x) table(factor(x, levels=1:nmax)))
lintron0 <- table(cds[cds$nexon == 1,"genefam"])[names(lintron)]
mintron <- rbind("0"=lintron0, mintron)
mintron <- t(t(mintron)/colSums(mintron))
mintron[mintron==0] <- NA
nn <- nrow(mintron)
xx <- mintron[nn:1,]
ll <- lapply(1:nn, function(i) xx[i,])
mintron <- mintron[,do.call(order, c(ll, decreasing=T))]
battleship.plot(mintron, maxxspan=0.3, maxyspan=0.3,
cex.labels=0.7,
main="% intron per polyexonic gene")
#
# acceptors / donors
#
tab <- sort(table(intron$acc), dec=T)
tab <- tab[tab >= 100]
tab <- tab / sum(tab) * 100
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="acceptor frequencies (%)")
text(0.7, 50, round(tab[1], 2))
text(1.9, 3, round(tab[2], 2))
text(3.1, 2.3, round(tab[3], 2))
tab <- sort(table(intron$don), dec=T)
tab <- tab[tab >= 100]
tab <- tab / sum(tab) * 100
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="donor frequencies (%)")
text(0.7, 50, round(tab[1], 1))
text(1.9, 40, round(tab[2], 1))
text(3.1, 15, round(tab[3], 1))
text(4.3, 12, round(tab[4], 1))
#
# consensus all
#
cons$all <- cons.build(intron$acceptor.donor)
invisible(sapply(rev(names(cons)), function(what) {
cons.plot(cons[[what]], paste0("consensus ", what))
}))
#
# default consensus score by consensus length
#
cons.def <- cons[["default"]]
cons.def <- cons.def[,! is.nan(colSums(cons.def))]
seq.def <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))
epx <- apply(cons.def, 2, function(col) -sum(col * log(col, base=4)))
opx <- order(epx)
conf.def <- lapply(seq(2, length(opx), by=2), function(n) {
pos <- head(opx, n)
notify(n, "/", length(opx))
cons.confusion(cons.def, seq.def, thresh=0, pos=pos)
})
acc <- sapply(conf.def, function(x) x$acc)
sen <- sapply(conf.def, function(x) x$sen)
sel <- sapply(conf.def, function(x) x$sel)
plot(sel, ylim=c(0.7, 1), pch=1, type="b", main="accuracy by nb consensus positions", ylab="")
lines(sen, type="b", pch=2)
lines(acc, type="b", pch=3)
legend(1, 0.95, c("sensit.", "select.", "accur."), pch=1:3, horiz=T, bty="n")
#
# default consensus score by genes
#
conf.def <- cons.confusion(cons.def, seq.def, thresh=0)
cons.histconf(conf.def)
sfam <- split(conf.def$l2scor, intron$genefam)
sfam <- sfam[order(sapply(sfam, median))]
boxplot(sfam, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1),
las=2, cex.axis=0.7, main="default junction logr score by genes")
abline(h=0)
#
# end
#
if (OUT.DEV) {
cat("+ plot file:", paste0(OUT.FILE, ".", OUT.TYPE), "\n")
invisible(dev.off())
}
quit(save='no')

View File

@ -0,0 +1,71 @@
#!/usr/bin/env Rscript
#
# plot summary graphics of comparisons
#
#
LIBDIR <- Sys.getenv("LIB_DIR")
if (LIBDIR == "") LIBDIR = "."
source(paste0(LIBDIR, "/util.plot.r"))
COLORS <- 2:10
#
OUT.DEV <- TRUE
OUT.TYPE <- "pdf"
OUT.FILE <- "compare"
if (OUT.DEV) uplot.init.dev(OUT.FILE, OUT.TYPE)
#
tab <- read.table("compare.txt", header=T, comment.char="", stringsAsFactors=F)
#
par(xpd=NA)
#
sel <- c("cor", "alcor", "acc", "wrong", "over", "misstot")
tab$ptot <- rowSums(tab[,sel])
for (s in sel)
tab[,paste0("p", s)] <- tab[,s] * 100 / tab$ptot
colors <- head(COLORS, length(sel))
cols <- paste0("p", sel)
ord <- order(tab$pcor+tab$palcor+tab$pacc, decreasing=T)
barplot(t(tab[ord,cols]), names.arg=tab$X.org[ord],
ylim=c(0,100), col=colors, las=2, cex.names=0.5)
legend(0, 110, sel, fill=colors, cex=0.7, horiz=T)
#
sel <- c("cor", "alcor", "acc", "wrong", "over", "misschlo")
tab$rtot <- rowSums(tab[,sel])
for (s in sel)
tab[,paste0("r", s)] <- tab[,s] * 100 / tab$rtot
colors <- head(COLORS, length(sel))
cols <- paste0("r", sel)
ord <- order(tab$rcor+tab$ralcor+tab$racc, decreasing=T)
barplot(t(tab[ord,cols]), names.arg=tab$X.org[ord],
ylim=c(0,100), col=colors, las=2, cex.names=0.5)
legend(0, 110, sel, fill=colors, cex=0.7, horiz=T)
#
if (OUT.DEV) {
cat("# plot file:", paste0(OUT.FILE, ".", OUT.TYPE), "\n")
invisible(dev.off())
}
quit(save='no')

View File

@ -0,0 +1,54 @@
#
#
function getOrg(s, _local_, a, na, org) {
na = split(s, a, "/")
na = split(a[na], a, "\\.")
return a[1]
}
BEGIN {
PROCINFO["sorted_in"] = "@ind_num_asc"
print "#org tot cor alcor acc wrong over misstot misschlo missoth"
}
/MISSED in ChloroDB/ {
org = getOrg($1)
Org[org]++
Cnt[org]["MISSCHLORO"] = $2
next
}
/MISSED not in ChloroDB/ {
org = getOrg($1)
Org[org]++
Cnt[org]["MISSNOTCHLORO"] = $2
next
}
/^#/ { next }
/^.*:MATCH/ {
org = getOrg($1)
Org[org]++
split($NF, a, "\\.")
Cnt[org][a[1]]++
}
END {
for (org in Org) {
Cnt[org]["TOTAL"] = Cnt[org]["CORRECT"] + Cnt[org]["ALMOST_CORRECT"] \
+ Cnt[org]["ACCEPTABLE"] + Cnt[org]["WRONG"] \
+ Cnt[org]["MISSED"]
}
for (org in Org) {
print org, Cnt[org]["TOTAL"]+0, Cnt[org]["CORRECT"]+0, \
Cnt[org]["ALMOST_CORRECT"]+0, Cnt[org]["ACCEPTABLE"]+0, \
Cnt[org]["WRONG"]+0, Cnt[org]["OVERPRED"]+0, \
Cnt[org]["MISSED"]+0, \
Cnt[org]["MISSCHLORO"]+0, Cnt[org]["MISSNOTCHLORO"]+0
}
}

View File

@ -0,0 +1,12 @@
#
# R basic utilities
#
#
# notify on stderr
#
notify <- function(...) cat("+", ..., "\n")

View File

@ -0,0 +1,109 @@
#
# R consensus utilities
#
#
# compute consensus
#
cons.build <- function(seqs, backcount=1) {
xx <- do.call(rbind, sapply(seqs, strsplit, "", USE.NAMES=F))
lv <- c("a", "c", "g", "t", ".", "-")
mx <- apply(xx, 2, function(x) table(factor(x, levels=lv)))[1:4,]
cx <- colSums(mx)
mx <- mx + backcount
mx[,cx==0] <- 0
apply(mx, 2, function(x) x / sum(x))
}
#
# score consensus
#
cons.score <- function(cons, seq, pos=1:ncol(cons)) {
seq <- strsplit(seq, "")[[1]]
if (length(seq) != ncol(cons)) {
warning("incompatible seq and cons size")
return(NA)
}
ppx <- sapply(pos, function(i) cons[seq[i],i])
sum(log10(ppx+1e-6))
}
#
# logratio to uniform model score
#
cons.logratio <- function(cons, seq, m0=NULL, pos=1:ncol(cons)) {
if (is.null(m0)) {
m0 <- matrix(rep(0.25, 4), nrow=4, ncol=ncol(cons))
rownames(m0) <- c('a', 'c', 'g', 't')
}
sc <- cons.score(cons, seq, pos=pos)
sc0 <- cons.score(m0, seq, pos=pos)
2 * (log(10^sc, base=2) - log(10^sc0, base=2))
}
#
# shuffle sequence
#
seq.shuf <- function(seq) {
paste0(sample(strsplit(seq, "")[[1]], nchar(seq), replace=F), collapse="")
}
#
# compute confusion matrix between actual and shuffled sequences
#
cons.confusion <- function(cons, seq, m0=NULL, pos=1:ncol(cons), thresh=0) {
som <- function(x) sum(x, na.rm=T)
res <- list()
res$l2scor <- l2scor <- sapply(seq, function(s) cons.logratio(cons, s, m0=m0, pos=pos))
seq <- sapply(seq, seq.shuf)
res$r2scor <- r2scor <- sapply(seq, function(s) cons.logratio(cons, s, m0=m0, pos=pos))
res$conf <- conf <- matrix(c(som(l2scor >= thresh), som(l2scor < thresh),
som(r2scor >= thresh), som(r2scor < thresh)),
nrow=2, byrow=T)
res$acc <- sum(diag(conf)) / sum(conf)
res$sen <- conf[1,1] / sum(conf[1,])
res$sel <- conf[1,1] / sum(conf[,1])
res
}
#
# plot consensus
#
cons.plot <- function(cons, main="consensus") {
cols <- c("blue", "orange", "red", "green")
bp <- barplot(cons, col=cols, ylim=c(0,1), main=main)
plx <- apply(cons, 2, function(col) -sum(col * log(col+1e-6, base=4)))
lines(bp, plx, type="b", pch=19)
legend(0, 1.1, c("a","c","g","t"), fill=cols, horiz=T, xpd=NA, bty="n")
legend(20, 1.1, "entropy", pch=19, horiz=T, xpd=NA, bty="n")
invisible()
}
#
# plot confusion scores histograms
#
cons.histconf <- function(conf, main="junction logr score") {
lrh <- hist(c(conf$l2scor, conf$r2scor), br=50, plot=F)
lh <- hist(conf$l2scor, br=lrh$breaks, plot=F)
rh <- hist(conf$r2scor, br=lrh$breaks, plot=F)
xx <- rbind(lh$counts, rh$counts) / sum(lh$counts)
colnames(xx) <- lrh$mids
barplot(xx, col=c(3,2), beside=T, main=main)
legend(0, 0.1, c("true", "shuffled"), fill=c(3,2), horiz=F, xpd=NA)
invisible()
}

View File

@ -0,0 +1,93 @@
#
# R misc grid plotting
#
require(grid)
require(gridExtra)
#
# get line height
#
grd.lineheight <- function(s="X") {
convertHeight(unit(1,"strheight", s), "native", valueOnly=T)
}
#
# quantile table
#
grd.qtab <- function(df, what, cols, n=5) {
df <- df[order(df[,what], decreasing=T),cols]
sep <- head(df,1)
sep[] <- "-"
rbind(head(df, n), sep, tail(df, n))
}
#
# histogram with tables
#
grd.hist <- function(df, what, cols = c(1,2, which(colnames(df) == what)),
breaks=50, pos.sum=c(0.2,0.6), pos.quant=c(0.7,0.6), cex=0.7,
main=paste0("Histogram of ", what), ...) {
hist(df[,what], breaks=breaks, xlab=what, main=main, ...)
if (! is.null(pos.sum)) {
pushViewport(viewport(pos.sum[1], pos.sum[2], gp=gpar(cex=cex)))
grid.table(x<-summary(df[,what]), rows=names(x))
popViewport()
}
if (! is.null(pos.quant)) {
pushViewport(viewport(pos.quant[1], pos.quant[2], gp=gpar(cex=cex)))
grid.table(grd.qtab(df, what, cols), rows=NULL)
popViewport()
}
invisible()
}
#
# plot with fit
#
grd.fplot <- function(df, what.x, what.y, linfit=T, pos=c(0.2, 0.8), ablin=NULL, ...) {
plot(df[,what.x], df[,what.y], xlab=what.x, ylab=what.y, ...)
if (linfit) {
fit <- lm(df[,what.y] ~ df[,what.x])
abline(fit, col=2)
pushViewport(viewport(gp=gpar(col=2)))
a <- sprintf("%.2e", coef(fit)[2])
b <- sprintf("%.2e", coef(fit)[1])
grid.text(paste0(what.y, " = ", a, " * ", what.x, " + ", b),
pos[1], pos[2], just="left")
pos[2] = pos[2] - 2 * grd.lineheight()
grid.text(paste0("R2=", round(summary(fit)$r.squared, 3)),
pos[1], pos[2], just="left")
popViewport()
}
if (! is.null(ablin))
do.call(abline, ablin)
invisible()
}
#
# write text
#
grd.textpage <- function(..., lineno=0, left=0.1, top=0.9, cex=1, fact=1.4) {
txt <- do.call(paste0, list(...))
pushViewport(viewport(gp=gpar(cex=cex)))
grid.text(txt, left, top-lineno*grd.lineheight()*fact, just="left")
popViewport()
invisible(txt)
}
#
# title page
#
grd.titlepage <- function(title, x=0.5, y=0.7, cex=3, ...) {
notify("processing", title)
grid.newpage()
grid.text(title, x, y, gp=gpar(cex=cex), ...)
invisible()
}

View File

@ -0,0 +1,90 @@
#
# R models I/O utilities
#
#
# write start model
#
write.model.start <- function(frq, what) {
dir.create("models", showWarnings=F)
fil <- paste0("models/start.", what, ".frq")
notify("writing start model:", fil)
cat("# start model :", what, "\n", file=fil)
for (x in names(frq))
cat(x, frq[x]/sum(frq), frq[x], "\n", file=fil, append=T)
invisible(fil)
}
#
# write stop model
#
write.model.stop <- function(frq, what) {
dir.create("models", showWarnings=F)
fil <- paste0("models/stop.", what, ".frq")
notify("writing stop model:", fil)
cat("# stop model :", what, "(freq. ignored)\n", file=fil)
for (x in names(frq))
cat(x, frq[x]/sum(frq), frq[x], "\n", file=fil, append=T)
invisible(fil)
}
#
# write splice3 model
# [FIXME] positions are hard-coded
#
write.model.splice3 <- function(cons, what) {
dir.create("models", showWarnings=F)
fil <- paste0("models/splice3.", what, ".frq")
notify("writing splice3 model:", fil)
.catcons <- function(i) {
cat(round(cons[c("a","c","g","t"), i]*100, 0), "\n",
file=fil, append=T)
}
cat("# 3' splice model :", what, "\n", file=fil)
cat("# A C G T\n", file=fil, append=T)
sapply(seq.int(1, 4), .catcons)
cat("splice\n", file=fil, append=T)
sapply(seq.int(6, 11), .catcons)
invisible(fil)
}
#
# write splice5 model
# [FIXME] positions are hard-coded
#
write.model.splice5 <- function(cons, what) {
dir.create("models", showWarnings=F)
fil <- paste0("models/splice5.", what, ".frq")
notify("writing splice5 model:", fil)
.catcons <- function(i) {
cat(round(cons[c("a","c","g","t"), i]*100, 0), "\n",
file=fil, append=T)
}
cat("# 5' splice model :", what, "\n", file=fil)
cat("# A C G T\n", file=fil, append=T)
sapply(seq.int(13, 18), .catcons)
cat("splice\n", file=fil, append=T)
sapply(seq.int(20, 23), .catcons)
invisible(fil)
}
#
# write splice3/5 uniform model
#
write.unif.splice <- function(pos, what) {
dir.create("models", showWarnings=F)
fil <- paste0("models/splice", pos, ".", what, ".frq")
notify("writing uniform splice", pos, "model:", fil)
cat("# 3'/5' splice null model", file=fil)
cat("# A C G T\n", file=fil, append=T)
cat("25 25 25 25\n", file=fil, append=T)
cat("splice\n", file=fil, append=T)
cat("25 25 25 25\n", file=fil, append=T)
invisible(fil)
}

View File

@ -0,0 +1,105 @@
#
# R plot utilities
#
#
# setup graphic device
# tiff: high resolution 600 dpi
# pdf
#
uplot.init.dev <- function(fname, type="pdf", width=7, height=7, resol=600, ...) {
fname <- paste0(fname, ".", type)
res <- NULL
if (type == "tiff") {
res <- tiff(fname, width=width, height=height, units="in", res=resol, ...)
}
if (type == "pdf") {
res <- pdf(fname, width=width, height=height, ...)
}
invisible(res)
}
#
# convert pdf to tiff using ghostscript
#
uplot.convert2tiff <- function(fname, resol=600) {
infile <- paste0(fname, ".pdf")
oufile <- paste0(fname, ".tif")
cmd <- paste0("echo quit | gs -r", resol, "-dBATCH -dNOPAUSE -sDEVICE=tiff12nc -sCompression=lzw -sOutputFile=", oufile, " ", infile)
system(cmd)
}
#
# default plot setup
#
uplot.setup <- function(mfrow=c(1,1),
las=1,
mgp=c(2, 0.7, 0),
oma=c(0, 0, 0, 0),
mar=c(4, 3, 3, 2),
cex.main=1,
font.main=1,
family='Helvetica', ...) {
par(mfrow=mfrow, las=las, mgp=mgp, oma=oma, mar=mar, cex.main=cex.main, font.main=font.main, family=family, ...)
}
#
# pie plot
#
uplot.pie <- function(tab, main="", labels=c("name", "val", "per"), text.r=0.5, text.col="black", text.cex=1, main.pos=c(0,0), main.col="black", ...) {
pie(tab, edges=2000, main="", labels="", ...)
text(main.pos[1], main.pos[2], main, cex=1.5, col=main.col)
prop <- tab/sum(tab)
theta <- 2*pi * (cumsum(prop) - prop/2)
lab <- list(name=names(tab), val=tab, per=sprintf("%d%%", round(prop*100)))
lab <- apply(data.frame(lab[labels]), 1, paste, collapse="\n")
if (length(lab) > 0)
text(text.r*cos(theta), text.r*sin(theta), lab, cex=text.cex, col=text.col)
invisible(NULL)
}
#
# plot utility : color representation of a table
#
uplot.table <- function(tab, col=heat.colors(100), with.lines=TRUE) {
image(as.matrix(tab), xaxt="n", yaxt="n", col=col)
nli <- nrow(tab)
nco <- ncol(tab)
dx <- 0.5 / (nli-1)
dy <- 0.5 / (nco-1)
xf <- (seq_len(nli)-1)/(nli-1) - dx
yf <- (seq_len(nco)-1)/(nco-1) - dy
if (with.lines) {
segments(xf, -dy, xf, 1+dy)
segments(-dx, yf, 1+dx, yf)
}
Axis(c(0,1), at=xf+dx, side=1, labels=rownames(tab), las=2, cex.axis=0.5, padj=0)
Axis(c(0,1), at=yf+dy, side=2, labels=colnames(tab), las=2, cex.axis=0.5, padj=0)
invisible(NULL)
}
#
# plot utility : identify points within user's rectangle
#
rect.identify <- function(data) {
if (is.null(dim(data))) data <- cbind(seq_along(data), data)
xy <- locator(n=2, type='n')
r <- matrix(c(range(xy$x), range(xy$y)), ncol=2, byrow=TRUE)
rect(r[1,1], r[2,1], r[1,2], r[2,2], border='red')
.in.range <- function(p, r) {
.in.int <- function(i) {
(p[i] >= r[i,1]) && (p[i] <= r[i,2])
}
.in.int(1) && .in.int(2)
}
isel <- which(apply(data, 1, .in.range, r))
points(data[isel,], col='red', pch=19)
isel
}