
Former-commit-id: 0579e878a69b7c285ca71870e9ca5730649a2fda Former-commit-id: 7cced5b488441d87bf070a9a444317db0e048880
110 lines
2.7 KiB
R
110 lines
2.7 KiB
R
#
|
|
# R consensus utilities
|
|
#
|
|
|
|
#
|
|
# compute consensus
|
|
#
|
|
|
|
cons.build <- function(seqs, backcount=1) {
|
|
xx <- do.call(rbind, sapply(seqs, strsplit, "", USE.NAMES=F))
|
|
lv <- c("a", "c", "g", "t", ".", "-")
|
|
mx <- apply(xx, 2, function(x) table(factor(x, levels=lv)))[1:4,]
|
|
cx <- colSums(mx)
|
|
mx <- mx + backcount
|
|
mx[,cx==0] <- 0
|
|
apply(mx, 2, function(x) x / sum(x))
|
|
}
|
|
|
|
#
|
|
# score consensus
|
|
#
|
|
|
|
cons.score <- function(cons, seq, pos=1:ncol(cons)) {
|
|
seq <- strsplit(seq, "")[[1]]
|
|
if (length(seq) != ncol(cons)) {
|
|
warning("incompatible seq and cons size")
|
|
return(NA)
|
|
}
|
|
ppx <- sapply(pos, function(i) cons[seq[i],i])
|
|
sum(log10(ppx+1e-6))
|
|
}
|
|
|
|
#
|
|
# logratio to uniform model score
|
|
#
|
|
|
|
cons.logratio <- function(cons, seq, m0=NULL, pos=1:ncol(cons)) {
|
|
if (is.null(m0)) {
|
|
m0 <- matrix(rep(0.25, 4), nrow=4, ncol=ncol(cons))
|
|
rownames(m0) <- c('a', 'c', 'g', 't')
|
|
}
|
|
|
|
sc <- cons.score(cons, seq, pos=pos)
|
|
sc0 <- cons.score(m0, seq, pos=pos)
|
|
|
|
2 * (log(10^sc, base=2) - log(10^sc0, base=2))
|
|
}
|
|
|
|
#
|
|
# shuffle sequence
|
|
#
|
|
|
|
seq.shuf <- function(seq) {
|
|
paste0(sample(strsplit(seq, "")[[1]], nchar(seq), replace=F), collapse="")
|
|
}
|
|
|
|
#
|
|
# compute confusion matrix between actual and shuffled sequences
|
|
#
|
|
|
|
cons.confusion <- function(cons, seq, m0=NULL, pos=1:ncol(cons), thresh=0) {
|
|
som <- function(x) sum(x, na.rm=T)
|
|
|
|
res <- list()
|
|
res$l2scor <- l2scor <- sapply(seq, function(s) cons.logratio(cons, s, m0=m0, pos=pos))
|
|
|
|
seq <- sapply(seq, seq.shuf)
|
|
res$r2scor <- r2scor <- sapply(seq, function(s) cons.logratio(cons, s, m0=m0, pos=pos))
|
|
|
|
res$conf <- conf <- matrix(c(som(l2scor >= thresh), som(l2scor < thresh),
|
|
som(r2scor >= thresh), som(r2scor < thresh)),
|
|
nrow=2, byrow=T)
|
|
|
|
res$acc <- sum(diag(conf)) / sum(conf)
|
|
res$sen <- conf[1,1] / sum(conf[1,])
|
|
res$sel <- conf[1,1] / sum(conf[,1])
|
|
|
|
res
|
|
}
|
|
|
|
#
|
|
# plot consensus
|
|
#
|
|
|
|
|
|
cons.plot <- function(cons, main="consensus") {
|
|
cols <- c("blue", "orange", "red", "green")
|
|
bp <- barplot(cons, col=cols, ylim=c(0,1), main=main)
|
|
plx <- apply(cons, 2, function(col) -sum(col * log(col+1e-6, base=4)))
|
|
lines(bp, plx, type="b", pch=19)
|
|
legend(0, 1.1, c("a","c","g","t"), fill=cols, horiz=T, xpd=NA, bty="n")
|
|
legend(20, 1.1, "entropy", pch=19, horiz=T, xpd=NA, bty="n")
|
|
invisible()
|
|
}
|
|
|
|
#
|
|
# plot confusion scores histograms
|
|
#
|
|
|
|
cons.histconf <- function(conf, main="junction logr score") {
|
|
lrh <- hist(c(conf$l2scor, conf$r2scor), br=50, plot=F)
|
|
lh <- hist(conf$l2scor, br=lrh$breaks, plot=F)
|
|
rh <- hist(conf$r2scor, br=lrh$breaks, plot=F)
|
|
xx <- rbind(lh$counts, rh$counts) / sum(lh$counts)
|
|
colnames(xx) <- lrh$mids
|
|
barplot(xx, col=c(3,2), beside=T, main=main)
|
|
legend(0, 0.1, c("true", "shuffled"), fill=c(3,2), horiz=F, xpd=NA)
|
|
invisible()
|
|
}
|