225 lines
5.2 KiB
R
225 lines
5.2 KiB
R
![]() |
#!/usr/bin/env Rscript
|
||
|
#
|
||
|
# compute start, stop, splice-junctions models for core DB
|
||
|
#
|
||
|
# source("make.models.r")
|
||
|
#
|
||
|
|
||
|
LIB_DIR <- Sys.getenv("LIB_DIR")
|
||
|
if (LIB_DIR == "") LIB_DIR = "."
|
||
|
|
||
|
source(paste0(LIB_DIR, "/util.base.r"))
|
||
|
source(paste0(LIB_DIR, "/util.cons.r"))
|
||
|
source(paste0(LIB_DIR, "/util.modelio.r"))
|
||
|
|
||
|
# -------------------------------
|
||
|
# parameters
|
||
|
# -------------------------------
|
||
|
|
||
|
# core cutoffs
|
||
|
|
||
|
source("db.models.params.txt")
|
||
|
|
||
|
# -------------------------------
|
||
|
# genome infos
|
||
|
# -------------------------------
|
||
|
|
||
|
notify("loading info table")
|
||
|
chromo <- read.table("db.info.txt", com="", head=T, stringsAsFactors=F)
|
||
|
|
||
|
# -------------------------------
|
||
|
# CDS
|
||
|
# -------------------------------
|
||
|
|
||
|
notify("loading cds table")
|
||
|
cds <- read.table("db.cds.txt", com="", header=T, stringsAsFactors=F)
|
||
|
|
||
|
cds$start <- as.factor(cds$start)
|
||
|
cds$stop <- as.factor(cds$stop)
|
||
|
|
||
|
cds <- cds[cds$status=="Ok",]
|
||
|
cds <- cds[cds$genefam!="none",]
|
||
|
|
||
|
cds$categ <- "dust"
|
||
|
|
||
|
x <- sort(table(cds$genefam), dec=T)
|
||
|
ok <- names(x[x >= CORE_NCDS_CUTOFF])
|
||
|
|
||
|
cds$categ[cds$genefam %in% ok] <- "core"
|
||
|
|
||
|
x <- x[! names(x) %in% ok]
|
||
|
ok <- names(x[x >= SHEL_NCDS_CUTOFF])
|
||
|
|
||
|
cds$categ[cds$genefam %in% ok] <- "shell"
|
||
|
|
||
|
#
|
||
|
|
||
|
cds.ori <- cds
|
||
|
|
||
|
cds.lst <- split(cds.ori, cds.ori$categ)
|
||
|
|
||
|
#
|
||
|
# write out families
|
||
|
#
|
||
|
|
||
|
# patterns & names
|
||
|
|
||
|
invisible(lapply(cds.lst, function(cds) {
|
||
|
|
||
|
x <- sort(table(cds$genefam), decreasing=T)
|
||
|
tab <- paste0("^", names(x), "$")
|
||
|
names(tab) <- names(x)
|
||
|
|
||
|
y <- sapply(split(cds$gene, cds$genefam), function(g) {
|
||
|
head(names(sort(table(g), decreasing=T)), 1)
|
||
|
})
|
||
|
|
||
|
tab <- cbind(tab, y[names(x)])
|
||
|
|
||
|
categ <- unique(cds$categ)
|
||
|
f <- paste0("db.", categ, ".pat.txt")
|
||
|
notify("writing patterns for", categ, ":", f)
|
||
|
write.table(tab, file=f, quote=F, col.names=F, row.names=T)
|
||
|
}))
|
||
|
|
||
|
# -------------------------------
|
||
|
# Start models (core only)
|
||
|
# -------------------------------
|
||
|
|
||
|
if (! "core" %in% names(cds.lst)) {
|
||
|
notify("*** no gene found in core")
|
||
|
notify("*** please change parameters")
|
||
|
quit(save='no', status=1)
|
||
|
}
|
||
|
|
||
|
cds <- cds.lst[["core"]]
|
||
|
|
||
|
#
|
||
|
# start by genes
|
||
|
#
|
||
|
|
||
|
tab <- split(cds$start, cds$genefam)
|
||
|
|
||
|
fatg <- sapply(tab, function(x) table(x)["atg"]/length(x)*100)
|
||
|
names(fatg) <- names(tab)
|
||
|
|
||
|
start.dft <- names(which(fatg >= CORE_START_ATG_CUTOFF))
|
||
|
start.spc <- names(which(fatg < CORE_START_ATG_CUTOFF))
|
||
|
|
||
|
tab <- cds[cds$genefam %in% start.dft,]
|
||
|
tab <- table(tab$start)
|
||
|
|
||
|
# default model
|
||
|
|
||
|
x <- sort(tab[tab>=CORE_START_DFT_CUTOFF], decreasing=T)
|
||
|
write.model.start(x, "default")
|
||
|
|
||
|
# gene specific models
|
||
|
|
||
|
invisible(sapply(start.spc, function(g) {
|
||
|
x <- cds[cds$genefam == g,]
|
||
|
tx <- table(x$start)
|
||
|
tx <- sort(tx[tx>=CORE_START_OTH_CUTOFF], decreasing=T)
|
||
|
write.model.start(tx, g)
|
||
|
}))
|
||
|
|
||
|
# -------------------------------
|
||
|
# Stop models (core only)
|
||
|
# -------------------------------
|
||
|
|
||
|
# write default stop model
|
||
|
|
||
|
tab <- table(cds$stop)
|
||
|
x <- sort(tab[tab>=CORE_STOP_CUTOFF], decreasing=T)
|
||
|
write.model.stop(x, "default")
|
||
|
|
||
|
# -------------------------------
|
||
|
# splice junctions
|
||
|
# -------------------------------
|
||
|
|
||
|
notify("loading intron table")
|
||
|
intron <- read.table("db.intron.txt", com="", header=T, stringsAsFactors=F)
|
||
|
|
||
|
# remove invalid sequences
|
||
|
|
||
|
intron$seq <- gsub("\\.|-", "", intron$acceptor.donor)
|
||
|
|
||
|
lseq <- nchar(gsub("[^acgt]", "", intron$seq))
|
||
|
|
||
|
intron <- intron[lseq == 20,]
|
||
|
|
||
|
# remove genes out of core
|
||
|
|
||
|
intron <- intron[intron$genefam %in% cds$genefam,]
|
||
|
|
||
|
# acceptors / donors
|
||
|
|
||
|
intron$acc <- substr(intron$seq, 5, 6)
|
||
|
intron$don <- substr(intron$seq, 15, 16)
|
||
|
|
||
|
# consensus
|
||
|
|
||
|
cons.px <- cons.build(intron$acceptor.donor)
|
||
|
cons.px <- cons.px[,! is.nan(colSums(cons.px))]
|
||
|
|
||
|
seq.px <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))
|
||
|
|
||
|
conf.px <- cons.confusion(cons.px, seq.px)
|
||
|
|
||
|
sfam <- split(conf.px$l2scor, intron$genefam)
|
||
|
sfam <- sfam[order(sapply(sfam, median))]
|
||
|
|
||
|
# extract splice exceptions
|
||
|
|
||
|
name.bad <- names(which(sapply(sfam, median) < 0))
|
||
|
name.spc <- names(which(sapply(sfam[name.bad], length) >= CORE_SPLICE_CUTOFF))
|
||
|
name.ok <- setdiff(unique(intron$genefam), name.bad)
|
||
|
name.bad <- setdiff(name.bad, name.spc)
|
||
|
name.list <- c(sapply(name.spc, function(x) x), list(default=name.ok))
|
||
|
|
||
|
cons <- lapply(name.list, function(x) cons.build(intron[intron$genefam %in% x, "acceptor.donor"]))
|
||
|
|
||
|
# write junction models
|
||
|
|
||
|
invisible(sapply(names(cons), function(n) write.model.splice3(cons[[n]], n)))
|
||
|
invisible(sapply(names(cons), function(n) write.model.splice5(cons[[n]], n)))
|
||
|
|
||
|
# use uniform model for bad guys
|
||
|
|
||
|
invisible(sapply(name.bad, function(n) write.unif.splice(3, n)))
|
||
|
invisible(sapply(name.bad, function(n) write.unif.splice(5, n)))
|
||
|
|
||
|
invisible(write.unif.splice('', "none"))
|
||
|
|
||
|
# -------------------------------
|
||
|
# keep data for plotting
|
||
|
# -------------------------------
|
||
|
|
||
|
DB <- list()
|
||
|
|
||
|
params <- list()
|
||
|
|
||
|
params$CORE_NCDS_CUTOFF <- CORE_NCDS_CUTOFF
|
||
|
params$CORE_START_ATG_CUTOFF <- CORE_START_ATG_CUTOFF
|
||
|
params$CORE_START_DFT_CUTOFF <- CORE_START_DFT_CUTOFF
|
||
|
params$CORE_START_OTH_CUTOFF <- CORE_START_OTH_CUTOFF
|
||
|
params$CORE_STOP_CUTOFF <- CORE_STOP_CUTOFF
|
||
|
params$CORE_SPLICE_CUTOFF <- CORE_SPLICE_CUTOFF
|
||
|
|
||
|
params$SHEL_NCDS_CUTOFF <- SHEL_NCDS_CUTOFF
|
||
|
|
||
|
DB$params <- params
|
||
|
DB$chromo <- chromo
|
||
|
DB$cds.lst <- cds.lst
|
||
|
DB$intron <- intron
|
||
|
DB$cons <- cons
|
||
|
|
||
|
notify("saving db.data.Rdata")
|
||
|
save(DB, file="db.data.Rdata")
|
||
|
|
||
|
# -------------------------------
|
||
|
# end
|
||
|
# -------------------------------
|
||
|
|
||
|
quit(save='no')
|