annotate/detectors/cds/tools/lib/make.models.r

#!/usr/bin/env Rscript
#
# compute start, stop, splice-junctions models for core DB
#
# source("make.models.r")
#

LIB_DIR <- Sys.getenv("LIB_DIR")
if (LIB_DIR == "") LIB_DIR = "."

source(paste0(LIB_DIR, "/util.base.r"))
source(paste0(LIB_DIR, "/util.cons.r"))
source(paste0(LIB_DIR, "/util.modelio.r"))

# -------------------------------
# parameters
# -------------------------------

# core cutoffs

source("db.models.params.txt")

# -------------------------------
# genome infos
# -------------------------------

notify("loading info table")
chromo <- read.table("db.info.txt", com="", head=T, stringsAsFactors=F)

# -------------------------------
# CDS
# -------------------------------

notify("loading cds table")
cds <- read.table("db.cds.txt", com="", header=T, stringsAsFactors=F)

cds$start <- as.factor(cds$start)
cds$stop <- as.factor(cds$stop)

cds <- cds[cds$status=="Ok",]
cds <- cds[cds$genefam!="none",]

cds$categ <- "dust"

x <- sort(table(cds$genefam), dec=T)
ok <- names(x[x >= CORE_NCDS_CUTOFF])

cds$categ[cds$genefam %in% ok] <- "core"

x <- x[! names(x) %in% ok]
ok <- names(x[x >= SHEL_NCDS_CUTOFF])

cds$categ[cds$genefam %in% ok] <- "shell"

#

cds.ori <- cds

cds.lst <- split(cds.ori, cds.ori$categ)

#
# write out families 
#

# patterns & names

invisible(lapply(cds.lst, function(cds) {

  x <- sort(table(cds$genefam), decreasing=T)
  tab <- paste0("^", names(x), "$")
  names(tab) <- names(x)

  y <- sapply(split(cds$gene, cds$genefam), function(g) {
    head(names(sort(table(g), decreasing=T)), 1)
  })
  
  tab <- cbind(tab, y[names(x)])

  categ <- unique(cds$categ)
  f <- paste0("db.", categ, ".pat.txt")
  notify("writing patterns for", categ, ":", f)
  write.table(tab, file=f, quote=F, col.names=F, row.names=T)
}))

# -------------------------------
# Start models (core only)
# -------------------------------

if (! "core" %in% names(cds.lst)) {
  notify("*** no gene found in core")
  notify("*** please change parameters")
  quit(save='no', status=1)
}

cds <- cds.lst[["core"]]

#
# start by genes
#

tab <- split(cds$start, cds$genefam)

fatg <- sapply(tab, function(x) table(x)["atg"]/length(x)*100)
names(fatg) <- names(tab)

start.dft <- names(which(fatg >= CORE_START_ATG_CUTOFF))
start.spc <- names(which(fatg < CORE_START_ATG_CUTOFF))

tab <- cds[cds$genefam %in% start.dft,]
tab <- table(tab$start)

# default model

x <- sort(tab[tab>=CORE_START_DFT_CUTOFF], decreasing=T)
write.model.start(x, "default")

# gene specific models

invisible(sapply(start.spc, function(g) {
  x <- cds[cds$genefam == g,]
  tx <- table(x$start)
  tx <- sort(tx[tx>=CORE_START_OTH_CUTOFF], decreasing=T)
  write.model.start(tx, g)
}))

# -------------------------------
# Stop models (core only)
# -------------------------------

# write default stop model

tab <- table(cds$stop)
x <- sort(tab[tab>=CORE_STOP_CUTOFF], decreasing=T)
write.model.stop(x, "default")

# -------------------------------
# splice junctions
# -------------------------------

notify("loading intron table")
intron <- read.table("db.intron.txt", com="", header=T, stringsAsFactors=F)

# remove invalid sequences

intron$seq <- gsub("\\.|-", "", intron$acceptor.donor)

lseq <- nchar(gsub("[^acgt]", "", intron$seq))

intron <- intron[lseq == 20,]

# remove genes out of core

intron <- intron[intron$genefam %in% cds$genefam,]

# acceptors / donors

intron$acc <- substr(intron$seq, 5, 6)
intron$don <- substr(intron$seq, 15, 16)

# consensus

cons.px <- cons.build(intron$acceptor.donor)
cons.px <- cons.px[,! is.nan(colSums(cons.px))]

seq.px <- sapply(intron$acceptor.donor, function(s) gsub("[^acgt]", "", s))

conf.px <- cons.confusion(cons.px, seq.px)

sfam <- split(conf.px$l2scor, intron$genefam)
sfam <- sfam[order(sapply(sfam, median))]

# extract splice exceptions

name.bad  <- names(which(sapply(sfam, median) < 0))
name.spc  <- names(which(sapply(sfam[name.bad], length) >= CORE_SPLICE_CUTOFF))
name.ok   <- setdiff(unique(intron$genefam), name.bad)
name.bad  <- setdiff(name.bad, name.spc)
name.list <- c(sapply(name.spc, function(x) x), list(default=name.ok))

cons <- lapply(name.list, function(x) cons.build(intron[intron$genefam %in% x, "acceptor.donor"]))

# write junction models

invisible(sapply(names(cons), function(n) write.model.splice3(cons[[n]], n)))
invisible(sapply(names(cons), function(n) write.model.splice5(cons[[n]], n)))

# use uniform model for bad guys

invisible(sapply(name.bad, function(n) write.unif.splice(3, n)))
invisible(sapply(name.bad, function(n) write.unif.splice(5, n)))
  
invisible(write.unif.splice('', "none"))

# -------------------------------
# keep data for plotting
# -------------------------------

DB <- list()

params <- list()

params$CORE_NCDS_CUTOFF      <- CORE_NCDS_CUTOFF
params$CORE_START_ATG_CUTOFF <- CORE_START_ATG_CUTOFF
params$CORE_START_DFT_CUTOFF <- CORE_START_DFT_CUTOFF
params$CORE_START_OTH_CUTOFF <- CORE_START_OTH_CUTOFF
params$CORE_STOP_CUTOFF      <- CORE_STOP_CUTOFF
params$CORE_SPLICE_CUTOFF    <- CORE_SPLICE_CUTOFF

params$SHEL_NCDS_CUTOFF      <- SHEL_NCDS_CUTOFF

DB$params  <- params
DB$chromo  <- chromo
DB$cds.lst <- cds.lst
DB$intron  <- intron
DB$cons    <- cons

notify("saving db.data.Rdata")
save(DB, file="db.data.Rdata")

# -------------------------------
# end
# -------------------------------

quit(save='no')