
Former-commit-id: 0579e878a69b7c285ca71870e9ca5730649a2fda Former-commit-id: 7cced5b488441d87bf070a9a444317db0e048880
425 lines
12 KiB
R
Executable File
425 lines
12 KiB
R
Executable File
#!/usr/bin/env Rscript
|
|
#
|
|
# plots models previously computed by make.models.r
|
|
#
|
|
# source("plot.models.r")
|
|
#
|
|
|
|
require(vcd)
|
|
require(plotrix)
|
|
|
|
LIBDIR <- Sys.getenv("LIB_DIR")
|
|
if (LIBDIR == "") LIBDIR = "."
|
|
|
|
source(paste0(LIBDIR, "/util.base.r"))
|
|
source(paste0(LIBDIR, "/util.plot.r"))
|
|
source(paste0(LIBDIR, "/util.cons.r"))
|
|
source(paste0(LIBDIR, "/util.grid.r"))
|
|
|
|
# -------------------------------
|
|
# setup
|
|
# -------------------------------
|
|
|
|
OUT.DEV <- TRUE
|
|
OUT.TYPE <- "pdf"
|
|
OUT.FILE <- "models"
|
|
|
|
if (OUT.DEV) uplot.init.dev(OUT.FILE, OUT.TYPE)
|
|
|
|
# -------------------------------
|
|
# Load data
|
|
# -------------------------------
|
|
|
|
notify("loading DB data")
|
|
load("db.data.Rdata")
|
|
|
|
params <- DB$params
|
|
chromo <- DB$chromo
|
|
cds.lst <- DB$cds.lst
|
|
intron <- DB$intron
|
|
cons <- DB$cons
|
|
|
|
# -------------------------------
|
|
# Genomes infos
|
|
# -------------------------------
|
|
|
|
grd.titlepage("Species")
|
|
grd.textpage(lineno=1, "# org: ", nrow(chromo))
|
|
|
|
#
|
|
# general stats
|
|
#
|
|
|
|
grd.hist(chromo, "len", main="Histogram of chromosome length")
|
|
grd.hist(chromo, "gc", pos.quant=c(0.75, 0.6), main="Histogram of chromosome GC")
|
|
grd.hist(chromo, "nbCds")
|
|
grd.fplot(chromo, "len", "nbCds")
|
|
|
|
#
|
|
# nb cds no introns
|
|
#
|
|
|
|
chromo$nbCds_Mono <- chromo$nbCds_int0
|
|
chromo$nbCds_Poly <- chromo$nbCds_int1 + chromo$nbCds_intsup1
|
|
chromo$percentPoly <- round(chromo$nbCds_Poly * 100 / (chromo$nbCds_Poly + chromo$nbCds_Mono))
|
|
|
|
grd.hist(chromo, "nbCds_Mono", main="Histogram of monoexonic Cds")
|
|
grd.hist(chromo, "nbCds_Poly", main="Histogram of polyexonic Cds")
|
|
grd.hist(chromo, "percentPoly", pos.sum=c(0.23,0.6), main="Histogram of % polyexonic")
|
|
|
|
grd.fplot(chromo, "nbCds", "nbCds_Mono", TRUE, ablin=list(a=0, b=1, col=3))
|
|
grd.fplot(chromo, "nbCds", "nbCds_Poly")
|
|
|
|
#
|
|
# cds size
|
|
#
|
|
|
|
grd.hist(chromo, "meanCdsSize", pos.quant=NULL, main="Histogram of Cds size")
|
|
|
|
# -------------------------------
|
|
# CDS
|
|
# -------------------------------
|
|
|
|
cds.all <- do.call(rbind, cds.lst)
|
|
|
|
grd.titlepage("CDS")
|
|
|
|
grd.textpage(lineno=1, "# core cds core cutoff: ", params$CORE_NCDS_CUTOFF)
|
|
grd.textpage(lineno=2, "# core cds shell cutoff: ", params$SHEL_NCDS_CUTOFF)
|
|
|
|
grd.textpage(lineno=4, "# total cds: ", nrow(cds.all))
|
|
grd.textpage(lineno=5, "# core cds: ", nrow(cds.lst[["core"]]))
|
|
grd.textpage(lineno=6, "# shell cds: ", nrow(cds.lst[["shell"]]))
|
|
grd.textpage(lineno=7, "# dust cds: ", nrow(cds.lst[["dust"]]))
|
|
|
|
grd.textpage(lineno=9, "# total org: ", length(unique(cds.all$X.locus)))
|
|
grd.textpage(lineno=10, "# core org: ", length(unique(cds.lst[["core"]]$X.locus)))
|
|
grd.textpage(lineno=11, "# shell org: ", length(unique(cds.lst[["shell"]]$X.locus)))
|
|
grd.textpage(lineno=12, "# dust org: ", length(unique(cds.lst[["dust"]]$X.locus)))
|
|
|
|
|
|
grd.textpage(lineno=14, "# total families: ", length(unique(cds.all$genefam)))
|
|
grd.textpage(lineno=15, "# core families: ", length(unique(cds.lst[["core"]]$genefam)))
|
|
grd.textpage(lineno=16, "# shell families: ", length(unique(cds.lst[["shell"]]$genefam)))
|
|
grd.textpage(lineno=17, "# dust families: ", length(unique(cds.lst[["dust"]]$genefam)))
|
|
|
|
uplot.setup(mfrow=c(2,2), xpd=NA)
|
|
|
|
x <- sapply(cds.lst, nrow)
|
|
uplot.pie(x, main="CDS", text.r=1.1, col=c(3,2,4))
|
|
|
|
x <- sapply(cds.lst, function(cds) length(unique(cds$X.locus)))
|
|
uplot.pie(x, main="ORG", text.r=1.1, col=c(3,2,4))
|
|
|
|
x <- sapply(cds.lst, function(cds) length(unique(cds$genefam)))
|
|
uplot.pie(x, main="FAM", text.r=1.1, col=c(3,2,4))
|
|
|
|
uplot.setup(xpd=F)
|
|
|
|
#
|
|
# plot genes cutoff
|
|
#
|
|
|
|
cds.all <- do.call(rbind, cds.lst)
|
|
cds.byfam <- split(cds.all, cds.all$genefam)
|
|
|
|
tab <- sort(sapply(cds.byfam, nrow), decreasing=T)
|
|
cols <- rep("red", length(tab))
|
|
cols[tab >= params$SHEL_NCDS_CUTOFF] <- "blue"
|
|
cols[tab >= params$CORE_NCDS_CUTOFF] <- "green"
|
|
barplot(tab, col=cols, border=NA, main="# genes")
|
|
|
|
cols <- cols[tab >= 50]
|
|
tab <- tab[tab >= 50]
|
|
barplot(tab, col=cols, border=NA, las=2, cex.names=0.5, main="# genes in core")
|
|
abline(h=params$CORE_NCDS_CUTOFF, col=1)
|
|
text(50, 200, "CORE_NCDS_CUTOFF", pos=3)
|
|
|
|
#
|
|
# cds length for core
|
|
#
|
|
|
|
invisible(sapply(c("core", "shell", "dust"), function(what) {
|
|
|
|
cds <- cds.lst[[what]]
|
|
|
|
x <- split(cds$length, cds$genefam)
|
|
x <- x[order(sapply(x, mean))]
|
|
|
|
uplot.setup(mfrow=c(2,1))
|
|
|
|
boxplot(x, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1),
|
|
las=2, cex.axis=0.5, main=paste0(what, " genes - length distribution"))
|
|
|
|
boxplot(x, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1), ylim=c(0,2000),
|
|
las=2, cex.axis=0.5, main=paste0(what, " genes - length distribution zoom"))
|
|
|
|
uplot.setup()
|
|
}))
|
|
|
|
# -------------------------------
|
|
# starts & stops
|
|
# -------------------------------
|
|
|
|
cds <- cds.lst[["core"]]
|
|
|
|
grd.titlepage("Starts and Stops")
|
|
|
|
tab <- sort(table(cds$start), dec=T)
|
|
tab <- tab[tab >= 100]
|
|
tab <- tab / sum(tab) * 100
|
|
|
|
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="start frequencies (%)")
|
|
text(0.7, 50, round(tab[1], 2))
|
|
text(1.9, 2.3, round(tab[2], 2))
|
|
text(3.1, 2.3, round(tab[3], 2))
|
|
|
|
#
|
|
# start by org and gc
|
|
#
|
|
|
|
x <- split(cds$start, cds$X.locus)
|
|
|
|
fatg <- sapply(x, function(x) table(x)["atg"]/length(x)*100)
|
|
names(fatg) <- names(x)
|
|
chromo$fatg <- round(fatg[chromo$X.locus], 2)
|
|
|
|
fgtg <- sapply(x, function(x) table(x)["gtg"]/length(x)*100)
|
|
names(fgtg) <- names(x)
|
|
chromo$fgtg <- round(fgtg[chromo$X.locus], 2)
|
|
|
|
facg <- sapply(x, function(x) table(x)["acg"]/length(x)*100)
|
|
names(facg) <- names(x)
|
|
chromo$facg <- round(facg[chromo$X.locus], 2)
|
|
|
|
grd.hist(chromo, "fatg", pos.quant=c(0.5, 0.6), main="Histogram of atg freq. by org")
|
|
grd.hist(chromo, "fgtg", main="Histogram of gtg freq. by org")
|
|
grd.hist(chromo, "facg", pos.sum=c(0.3, 0.6), main="Histogram of acg freq. by org")
|
|
|
|
grd.fplot(chromo, "gc", "fatg", main="atg freq. by org GC", pos=c(0.2, 0.3))
|
|
grd.fplot(chromo, "gc", "fgtg", main="gtg freq. by org GC")
|
|
grd.fplot(chromo, "gc", "facg", main="acg freq. by org GC")
|
|
|
|
ter <- cbind(fatg, fgtg, facg)
|
|
colnames(ter) <- c("ATG", "GTG", "ACG")
|
|
igc <- cut(chromo$gc, breaks=quantile(chromo$gc, seq(0, 1, 0.1)), include.lowest=T, labels=1:10)
|
|
cols <- rainbow(10)[igc]
|
|
ternaryplot(ter, col=cols, cex=0.2, main="Start by org", labels="outside")
|
|
|
|
#
|
|
# start by common genes
|
|
#
|
|
|
|
x <- split(cds$start, cds$genefam)
|
|
|
|
fatg <- sapply(x, function(x) table(x)["atg"]/length(x)*100)
|
|
names(fatg) <- names(x)
|
|
|
|
fgtg <- sapply(x, function(x) table(x)["gtg"]/length(x)*100)
|
|
names(fgtg) <- names(x)
|
|
|
|
facg <- sapply(x, function(x) table(x)["acg"]/length(x)*100)
|
|
names(facg) <- names(x)
|
|
|
|
barplot(sort(fatg)[1:10], las=2, main="atg freq. by gene")
|
|
barplot(sort(fgtg, dec=T)[1:10], las=2, main="gtg freq. by gene")
|
|
barplot(sort(facg, dec=T)[1:10], las=2, main="acg freq. by gene")
|
|
|
|
ter <- cbind(fatg, fgtg, facg)
|
|
colnames(ter) <- c("ATG", "GTG", "ACG")
|
|
ternaryplot(ter, col=1, cex=0.5, id=rownames(ter), main="Starts by genes", labels="outside")
|
|
|
|
# -------------------------------
|
|
# stops
|
|
# -------------------------------
|
|
|
|
#
|
|
# stop by org and gc
|
|
#
|
|
|
|
tab <- sort(table(cds$stop), dec=T)
|
|
tab <- tab[tab >= 100]
|
|
tab <- tab / sum(tab) * 100
|
|
|
|
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="stop frequencies (%)")
|
|
text(0.7, 80, round(tab[1], 2))
|
|
text(1.9, 30, round(tab[2], 2))
|
|
text(3.1, 25, round(tab[3], 2))
|
|
|
|
x <- split(cds$stop, cds$X.locus)
|
|
|
|
ftaa <- sapply(x, function(x) table(x)["taa"]/length(x)*100)
|
|
names(ftaa) <- names(x)
|
|
chromo$ftaa <- round(ftaa[chromo$X.locus], 2)
|
|
|
|
ftag <- sapply(x, function(x) table(x)["tag"]/length(x)*100)
|
|
names(ftag) <- names(x)
|
|
chromo$ftag <- round(ftag[chromo$X.locus], 2)
|
|
|
|
ftga <- sapply(x, function(x) table(x)["tga"]/length(x)*100)
|
|
names(ftga) <- names(x)
|
|
chromo$ftga <- round(ftga[chromo$X.locus], 2)
|
|
|
|
grd.hist(chromo, "ftaa", pos.quant=c(0.7, 0.6), main="Histogram of taa freq. by org")
|
|
grd.hist(chromo, "ftag", pos.quant=c(0.8, 0.6), main="Histogram of tag freq. by org")
|
|
grd.hist(chromo, "ftga", pos.quant=c(0.8, 0.6), main="Histogram of tga freq. by org")
|
|
|
|
grd.fplot(chromo, "gc", "ftaa", main="taa freq. by org GC", pos=c(0.2, 0.3))
|
|
grd.fplot(chromo, "gc", "ftag", main="tag freq. by org GC")
|
|
grd.fplot(chromo, "gc", "ftga", main="tga freq. by org GC")
|
|
|
|
ter <- cbind(ftaa, ftag, ftga)
|
|
colnames(ter) <- c("TAA", "TAG", "TGA")
|
|
igc <- cut(chromo$gc, breaks=quantile(chromo$gc, seq(0, 1, 0.1)), include.lowest=T, labels=1:10)
|
|
cols <- rainbow(10)[igc]
|
|
ternaryplot(ter, col=cols, cex=0.2, main="Stops by org", labels="outside")
|
|
|
|
#
|
|
# stop by common genes
|
|
#
|
|
|
|
x <- split(cds$stop, cds$genefam)
|
|
|
|
ftaa <- sapply(x, function(x) table(x)["taa"]/length(x)*100)
|
|
names(ftaa) <- names(x)
|
|
|
|
ftag <- sapply(x, function(x) table(x)["tag"]/length(x)*100)
|
|
names(ftag) <- names(x)
|
|
|
|
ftga <- sapply(x, function(x) table(x)["tga"]/length(x)*100)
|
|
names(ftga) <- names(x)
|
|
|
|
barplot(sort(ftaa), las=2, cex.names=0.5, ylim=c(0,100), main="taa freq. by gene")
|
|
barplot(sort(ftag), las=2, cex.names=0.5, ylim=c(0,100), main="tag freq. by gene")
|
|
barplot(sort(ftga), las=2, cex.names=0.5, ylim=c(0,100), main="tga freq. by gene")
|
|
|
|
ter <- cbind(ftaa, ftag, ftga)
|
|
colnames(ter) <- c("TAA", "TAG", "TGA")
|
|
ternaryplot(ter, col=1, cex=0.3, id=rownames(ter), main="Stops by genes", labels="outside")
|
|
|
|
# -------------------------------
|
|
# splice junctions
|
|
# -------------------------------
|
|
|
|
grd.titlepage("Splice Junctions")
|
|
|
|
grd.textpage(lineno=1, "# intron in core: ", nrow(intron))
|
|
|
|
#
|
|
# intron size
|
|
#
|
|
|
|
intron$size <- intron$to - intron$from + 1
|
|
|
|
grd.hist(intron, "size", pos.quant=NULL, main="Histogram of intron size", br=1000, xlim=c(0,2000))
|
|
|
|
#
|
|
# nb intron / gene
|
|
#
|
|
|
|
x <- split(intron, intron$genefam)
|
|
x <- x[order(sapply(x, function(x) mean(x$intron_nb)), decreasing=T)]
|
|
|
|
nmax <- max(intron$intron_nb)
|
|
lintron <- lapply(x, function(x) x$intron_nb)
|
|
mintron <- sapply(lintron, function(x) table(factor(x, levels=1:nmax)))
|
|
|
|
lintron0 <- table(cds[cds$nexon == 1,"genefam"])[names(lintron)]
|
|
mintron <- rbind("0"=lintron0, mintron)
|
|
mintron <- t(t(mintron)/colSums(mintron))
|
|
|
|
mintron[mintron==0] <- NA
|
|
|
|
nn <- nrow(mintron)
|
|
xx <- mintron[nn:1,]
|
|
ll <- lapply(1:nn, function(i) xx[i,])
|
|
mintron <- mintron[,do.call(order, c(ll, decreasing=T))]
|
|
|
|
battleship.plot(mintron, maxxspan=0.3, maxyspan=0.3,
|
|
cex.labels=0.7,
|
|
main="% intron per polyexonic gene")
|
|
|
|
#
|
|
# acceptors / donors
|
|
#
|
|
|
|
tab <- sort(table(intron$acc), dec=T)
|
|
tab <- tab[tab >= 100]
|
|
tab <- tab / sum(tab) * 100
|
|
|
|
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="acceptor frequencies (%)")
|
|
text(0.7, 50, round(tab[1], 2))
|
|
text(1.9, 3, round(tab[2], 2))
|
|
text(3.1, 2.3, round(tab[3], 2))
|
|
|
|
tab <- sort(table(intron$don), dec=T)
|
|
tab <- tab[tab >= 100]
|
|
tab <- tab / sum(tab) * 100
|
|
|
|
barplot(tab, log="y", las=1, ylim=c(0.1, 100), main="donor frequencies (%)")
|
|
text(0.7, 50, round(tab[1], 1))
|
|
text(1.9, 40, round(tab[2], 1))
|
|
text(3.1, 15, round(tab[3], 1))
|
|
text(4.3, 12, round(tab[4], 1))
|
|
|
|
#
|
|
# consensus all
|
|
#
|
|
|
|
cons$all <- cons.build(intron$acceptor.donor)
|
|
|
|
invisible(sapply(rev(names(cons)), function(what) {
|
|
cons.plot(cons[[what]], paste0("consensus ", what))
|
|
}))
|
|
|
|
#
|
|
# default consensus score by consensus length
|
|
#
|
|
|
|
cons.def <- cons[["default"]]
|
|
cons.def <- cons.def[,! is.nan(colSums(cons.def))]
|
|
seq.def <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))
|
|
|
|
epx <- apply(cons.def, 2, function(col) -sum(col * log(col, base=4)))
|
|
opx <- order(epx)
|
|
|
|
conf.def <- lapply(seq(2, length(opx), by=2), function(n) {
|
|
pos <- head(opx, n)
|
|
notify(n, "/", length(opx))
|
|
cons.confusion(cons.def, seq.def, thresh=0, pos=pos)
|
|
})
|
|
|
|
acc <- sapply(conf.def, function(x) x$acc)
|
|
sen <- sapply(conf.def, function(x) x$sen)
|
|
sel <- sapply(conf.def, function(x) x$sel)
|
|
|
|
plot(sel, ylim=c(0.7, 1), pch=1, type="b", main="accuracy by nb consensus positions", ylab="")
|
|
lines(sen, type="b", pch=2)
|
|
lines(acc, type="b", pch=3)
|
|
legend(1, 0.95, c("sensit.", "select.", "accur."), pch=1:3, horiz=T, bty="n")
|
|
|
|
#
|
|
# default consensus score by genes
|
|
#
|
|
|
|
conf.def <- cons.confusion(cons.def, seq.def, thresh=0)
|
|
cons.histconf(conf.def)
|
|
|
|
sfam <- split(conf.def$l2scor, intron$genefam)
|
|
sfam <- sfam[order(sapply(sfam, median))]
|
|
|
|
boxplot(sfam, pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5, outcex = 0.1),
|
|
las=2, cex.axis=0.7, main="default junction logr score by genes")
|
|
abline(h=0)
|
|
|
|
#
|
|
# end
|
|
#
|
|
|
|
if (OUT.DEV) {
|
|
cat("+ plot file:", paste0(OUT.FILE, ".", OUT.TYPE), "\n")
|
|
invisible(dev.off())
|
|
}
|
|
|
|
quit(save='no')
|