Files
annotate/detectors/cds/tools/lib/make.models.r
alain viari e4d6a8484d cds/tools/chlorodb added
Former-commit-id: 0579e878a69b7c285ca71870e9ca5730649a2fda
Former-commit-id: 7cced5b488441d87bf070a9a444317db0e048880
2015-11-13 17:41:18 +01:00

225 lines
5.2 KiB
R
Executable File

#!/usr/bin/env Rscript
#
# compute start, stop, splice-junctions models for core DB
#
# source("make.models.r")
#
LIB_DIR <- Sys.getenv("LIB_DIR")
if (LIB_DIR == "") LIB_DIR = "."
source(paste0(LIB_DIR, "/util.base.r"))
source(paste0(LIB_DIR, "/util.cons.r"))
source(paste0(LIB_DIR, "/util.modelio.r"))
# -------------------------------
# parameters
# -------------------------------
# core cutoffs
source("db.models.params.txt")
# -------------------------------
# genome infos
# -------------------------------
notify("loading info table")
chromo <- read.table("db.info.txt", com="", head=T, stringsAsFactors=F)
# -------------------------------
# CDS
# -------------------------------
notify("loading cds table")
cds <- read.table("db.cds.txt", com="", header=T, stringsAsFactors=F)
cds$start <- as.factor(cds$start)
cds$stop <- as.factor(cds$stop)
cds <- cds[cds$status=="Ok",]
cds <- cds[cds$genefam!="none",]
cds$categ <- "dust"
x <- sort(table(cds$genefam), dec=T)
ok <- names(x[x >= CORE_NCDS_CUTOFF])
cds$categ[cds$genefam %in% ok] <- "core"
x <- x[! names(x) %in% ok]
ok <- names(x[x >= SHEL_NCDS_CUTOFF])
cds$categ[cds$genefam %in% ok] <- "shell"
#
cds.ori <- cds
cds.lst <- split(cds.ori, cds.ori$categ)
#
# write out families
#
# patterns & names
invisible(lapply(cds.lst, function(cds) {
x <- sort(table(cds$genefam), decreasing=T)
tab <- paste0("^", names(x), "$")
names(tab) <- names(x)
y <- sapply(split(cds$gene, cds$genefam), function(g) {
head(names(sort(table(g), decreasing=T)), 1)
})
tab <- cbind(tab, y[names(x)])
categ <- unique(cds$categ)
f <- paste0("db.", categ, ".pat.txt")
notify("writing patterns for", categ, ":", f)
write.table(tab, file=f, quote=F, col.names=F, row.names=T)
}))
# -------------------------------
# Start models (core only)
# -------------------------------
if (! "core" %in% names(cds.lst)) {
notify("*** no gene found in core")
notify("*** please change parameters")
quit(save='no', status=1)
}
cds <- cds.lst[["core"]]
#
# start by genes
#
tab <- split(cds$start, cds$genefam)
fatg <- sapply(tab, function(x) table(x)["atg"]/length(x)*100)
names(fatg) <- names(tab)
start.dft <- names(which(fatg >= CORE_START_ATG_CUTOFF))
start.spc <- names(which(fatg < CORE_START_ATG_CUTOFF))
tab <- cds[cds$genefam %in% start.dft,]
tab <- table(tab$start)
# default model
x <- sort(tab[tab>=CORE_START_DFT_CUTOFF], decreasing=T)
write.model.start(x, "default")
# gene specific models
invisible(sapply(start.spc, function(g) {
x <- cds[cds$genefam == g,]
tx <- table(x$start)
tx <- sort(tx[tx>=CORE_START_OTH_CUTOFF], decreasing=T)
write.model.start(tx, g)
}))
# -------------------------------
# Stop models (core only)
# -------------------------------
# write default stop model
tab <- table(cds$stop)
x <- sort(tab[tab>=CORE_STOP_CUTOFF], decreasing=T)
write.model.stop(x, "default")
# -------------------------------
# splice junctions
# -------------------------------
notify("loading intron table")
intron <- read.table("db.intron.txt", com="", header=T, stringsAsFactors=F)
# remove invalid sequences
intron$seq <- gsub("\\.|-", "", intron$acceptor.donor)
lseq <- nchar(gsub("[^acgt]", "", intron$seq))
intron <- intron[lseq == 20,]
# remove genes out of core
intron <- intron[intron$genefam %in% cds$genefam,]
# acceptors / donors
intron$acc <- substr(intron$seq, 5, 6)
intron$don <- substr(intron$seq, 15, 16)
# consensus
cons.px <- cons.build(intron$acceptor.donor)
cons.px <- cons.px[,! is.nan(colSums(cons.px))]
seq.px <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))
conf.px <- cons.confusion(cons.px, seq.px)
sfam <- split(conf.px$l2scor, intron$genefam)
sfam <- sfam[order(sapply(sfam, median))]
# extract splice exceptions
name.bad <- names(which(sapply(sfam, median) < 0))
name.spc <- names(which(sapply(sfam[name.bad], length) >= CORE_SPLICE_CUTOFF))
name.ok <- setdiff(unique(intron$genefam), name.bad)
name.bad <- setdiff(name.bad, name.spc)
name.list <- c(sapply(name.spc, function(x) x), list(default=name.ok))
cons <- lapply(name.list, function(x) cons.build(intron[intron$genefam %in% x, "acceptor.donor"]))
# write junction models
invisible(sapply(names(cons), function(n) write.model.splice3(cons[[n]], n)))
invisible(sapply(names(cons), function(n) write.model.splice5(cons[[n]], n)))
# use uniform model for bad guys
invisible(sapply(name.bad, function(n) write.unif.splice(3, n)))
invisible(sapply(name.bad, function(n) write.unif.splice(5, n)))
invisible(write.unif.splice('', "none"))
# -------------------------------
# keep data for plotting
# -------------------------------
DB <- list()
params <- list()
params$CORE_NCDS_CUTOFF <- CORE_NCDS_CUTOFF
params$CORE_START_ATG_CUTOFF <- CORE_START_ATG_CUTOFF
params$CORE_START_DFT_CUTOFF <- CORE_START_DFT_CUTOFF
params$CORE_START_OTH_CUTOFF <- CORE_START_OTH_CUTOFF
params$CORE_STOP_CUTOFF <- CORE_STOP_CUTOFF
params$CORE_SPLICE_CUTOFF <- CORE_SPLICE_CUTOFF
params$SHEL_NCDS_CUTOFF <- SHEL_NCDS_CUTOFF
DB$params <- params
DB$chromo <- chromo
DB$cds.lst <- cds.lst
DB$intron <- intron
DB$cons <- cons
notify("saving db.data.Rdata")
save(DB, file="db.data.Rdata")
# -------------------------------
# end
# -------------------------------
quit(save='no')