cds/tools/chlorodb added
Former-commit-id: 0579e878a69b7c285ca71870e9ca5730649a2fda Former-commit-id: 7cced5b488441d87bf070a9a444317db0e048880
This commit is contained in:
224
detectors/cds/tools/lib/make.models.r
Executable file
224
detectors/cds/tools/lib/make.models.r
Executable file
@ -0,0 +1,224 @@
|
||||
#!/usr/bin/env Rscript
|
||||
#
|
||||
# compute start, stop, splice-junctions models for core DB
|
||||
#
|
||||
# source("make.models.r")
|
||||
#
|
||||
|
||||
LIB_DIR <- Sys.getenv("LIB_DIR")
|
||||
if (LIB_DIR == "") LIB_DIR = "."
|
||||
|
||||
source(paste0(LIB_DIR, "/util.base.r"))
|
||||
source(paste0(LIB_DIR, "/util.cons.r"))
|
||||
source(paste0(LIB_DIR, "/util.modelio.r"))
|
||||
|
||||
# -------------------------------
|
||||
# parameters
|
||||
# -------------------------------
|
||||
|
||||
# core cutoffs
|
||||
|
||||
source("db.models.params.txt")
|
||||
|
||||
# -------------------------------
|
||||
# genome infos
|
||||
# -------------------------------
|
||||
|
||||
notify("loading info table")
|
||||
chromo <- read.table("db.info.txt", com="", head=T, stringsAsFactors=F)
|
||||
|
||||
# -------------------------------
|
||||
# CDS
|
||||
# -------------------------------
|
||||
|
||||
notify("loading cds table")
|
||||
cds <- read.table("db.cds.txt", com="", header=T, stringsAsFactors=F)
|
||||
|
||||
cds$start <- as.factor(cds$start)
|
||||
cds$stop <- as.factor(cds$stop)
|
||||
|
||||
cds <- cds[cds$status=="Ok",]
|
||||
cds <- cds[cds$genefam!="none",]
|
||||
|
||||
cds$categ <- "dust"
|
||||
|
||||
x <- sort(table(cds$genefam), dec=T)
|
||||
ok <- names(x[x >= CORE_NCDS_CUTOFF])
|
||||
|
||||
cds$categ[cds$genefam %in% ok] <- "core"
|
||||
|
||||
x <- x[! names(x) %in% ok]
|
||||
ok <- names(x[x >= SHEL_NCDS_CUTOFF])
|
||||
|
||||
cds$categ[cds$genefam %in% ok] <- "shell"
|
||||
|
||||
#
|
||||
|
||||
cds.ori <- cds
|
||||
|
||||
cds.lst <- split(cds.ori, cds.ori$categ)
|
||||
|
||||
#
|
||||
# write out families
|
||||
#
|
||||
|
||||
# patterns & names
|
||||
|
||||
invisible(lapply(cds.lst, function(cds) {
|
||||
|
||||
x <- sort(table(cds$genefam), decreasing=T)
|
||||
tab <- paste0("^", names(x), "$")
|
||||
names(tab) <- names(x)
|
||||
|
||||
y <- sapply(split(cds$gene, cds$genefam), function(g) {
|
||||
head(names(sort(table(g), decreasing=T)), 1)
|
||||
})
|
||||
|
||||
tab <- cbind(tab, y[names(x)])
|
||||
|
||||
categ <- unique(cds$categ)
|
||||
f <- paste0("db.", categ, ".pat.txt")
|
||||
notify("writing patterns for", categ, ":", f)
|
||||
write.table(tab, file=f, quote=F, col.names=F, row.names=T)
|
||||
}))
|
||||
|
||||
# -------------------------------
|
||||
# Start models (core only)
|
||||
# -------------------------------
|
||||
|
||||
if (! "core" %in% names(cds.lst)) {
|
||||
notify("*** no gene found in core")
|
||||
notify("*** please change parameters")
|
||||
quit(save='no', status=1)
|
||||
}
|
||||
|
||||
cds <- cds.lst[["core"]]
|
||||
|
||||
#
|
||||
# start by genes
|
||||
#
|
||||
|
||||
tab <- split(cds$start, cds$genefam)
|
||||
|
||||
fatg <- sapply(tab, function(x) table(x)["atg"]/length(x)*100)
|
||||
names(fatg) <- names(tab)
|
||||
|
||||
start.dft <- names(which(fatg >= CORE_START_ATG_CUTOFF))
|
||||
start.spc <- names(which(fatg < CORE_START_ATG_CUTOFF))
|
||||
|
||||
tab <- cds[cds$genefam %in% start.dft,]
|
||||
tab <- table(tab$start)
|
||||
|
||||
# default model
|
||||
|
||||
x <- sort(tab[tab>=CORE_START_DFT_CUTOFF], decreasing=T)
|
||||
write.model.start(x, "default")
|
||||
|
||||
# gene specific models
|
||||
|
||||
invisible(sapply(start.spc, function(g) {
|
||||
x <- cds[cds$genefam == g,]
|
||||
tx <- table(x$start)
|
||||
tx <- sort(tx[tx>=CORE_START_OTH_CUTOFF], decreasing=T)
|
||||
write.model.start(tx, g)
|
||||
}))
|
||||
|
||||
# -------------------------------
|
||||
# Stop models (core only)
|
||||
# -------------------------------
|
||||
|
||||
# write default stop model
|
||||
|
||||
tab <- table(cds$stop)
|
||||
x <- sort(tab[tab>=CORE_STOP_CUTOFF], decreasing=T)
|
||||
write.model.stop(x, "default")
|
||||
|
||||
# -------------------------------
|
||||
# splice junctions
|
||||
# -------------------------------
|
||||
|
||||
notify("loading intron table")
|
||||
intron <- read.table("db.intron.txt", com="", header=T, stringsAsFactors=F)
|
||||
|
||||
# remove invalid sequences
|
||||
|
||||
intron$seq <- gsub("\\.|-", "", intron$acceptor.donor)
|
||||
|
||||
lseq <- nchar(gsub("[^acgt]", "", intron$seq))
|
||||
|
||||
intron <- intron[lseq == 20,]
|
||||
|
||||
# remove genes out of core
|
||||
|
||||
intron <- intron[intron$genefam %in% cds$genefam,]
|
||||
|
||||
# acceptors / donors
|
||||
|
||||
intron$acc <- substr(intron$seq, 5, 6)
|
||||
intron$don <- substr(intron$seq, 15, 16)
|
||||
|
||||
# consensus
|
||||
|
||||
cons.px <- cons.build(intron$acceptor.donor)
|
||||
cons.px <- cons.px[,! is.nan(colSums(cons.px))]
|
||||
|
||||
seq.px <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))
|
||||
|
||||
conf.px <- cons.confusion(cons.px, seq.px)
|
||||
|
||||
sfam <- split(conf.px$l2scor, intron$genefam)
|
||||
sfam <- sfam[order(sapply(sfam, median))]
|
||||
|
||||
# extract splice exceptions
|
||||
|
||||
name.bad <- names(which(sapply(sfam, median) < 0))
|
||||
name.spc <- names(which(sapply(sfam[name.bad], length) >= CORE_SPLICE_CUTOFF))
|
||||
name.ok <- setdiff(unique(intron$genefam), name.bad)
|
||||
name.bad <- setdiff(name.bad, name.spc)
|
||||
name.list <- c(sapply(name.spc, function(x) x), list(default=name.ok))
|
||||
|
||||
cons <- lapply(name.list, function(x) cons.build(intron[intron$genefam %in% x, "acceptor.donor"]))
|
||||
|
||||
# write junction models
|
||||
|
||||
invisible(sapply(names(cons), function(n) write.model.splice3(cons[[n]], n)))
|
||||
invisible(sapply(names(cons), function(n) write.model.splice5(cons[[n]], n)))
|
||||
|
||||
# use uniform model for bad guys
|
||||
|
||||
invisible(sapply(name.bad, function(n) write.unif.splice(3, n)))
|
||||
invisible(sapply(name.bad, function(n) write.unif.splice(5, n)))
|
||||
|
||||
invisible(write.unif.splice('', "none"))
|
||||
|
||||
# -------------------------------
|
||||
# keep data for plotting
|
||||
# -------------------------------
|
||||
|
||||
DB <- list()
|
||||
|
||||
params <- list()
|
||||
|
||||
params$CORE_NCDS_CUTOFF <- CORE_NCDS_CUTOFF
|
||||
params$CORE_START_ATG_CUTOFF <- CORE_START_ATG_CUTOFF
|
||||
params$CORE_START_DFT_CUTOFF <- CORE_START_DFT_CUTOFF
|
||||
params$CORE_START_OTH_CUTOFF <- CORE_START_OTH_CUTOFF
|
||||
params$CORE_STOP_CUTOFF <- CORE_STOP_CUTOFF
|
||||
params$CORE_SPLICE_CUTOFF <- CORE_SPLICE_CUTOFF
|
||||
|
||||
params$SHEL_NCDS_CUTOFF <- SHEL_NCDS_CUTOFF
|
||||
|
||||
DB$params <- params
|
||||
DB$chromo <- chromo
|
||||
DB$cds.lst <- cds.lst
|
||||
DB$intron <- intron
|
||||
DB$cons <- cons
|
||||
|
||||
notify("saving db.data.Rdata")
|
||||
save(DB, file="db.data.Rdata")
|
||||
|
||||
# -------------------------------
|
||||
# end
|
||||
# -------------------------------
|
||||
|
||||
quit(save='no')
|
Reference in New Issue
Block a user