Files
ROBITools/R/choose.taxonomy.R
2018-02-20 06:40:29 +11:00

108 lines
4.2 KiB
R

#' @import ROBITaxonomy
#' @include 02_class_metabarcoding.data.R
NULL
#' Choose between databases for taxonomic classifications
#'
#' Chooses a sequence taxonomic assignment in order of preference for the different
#' reference databases that have been used when the assignment is above a certain threshold
#'
#'
#' @param x a \code{\link{metabarcoding.data}} object
#' @param taxonomy a \code{\linkS4class{taxonomy.obitools}} instance
#' @param dbrank string or vector indicating reference database names ranked by order of preference
#' @param thresh a best_identity threshold for applying priority. Default is \code{0.95}
#'
#' @return returns a data.frame with the refined taxonomic assignement and classic taxonomy description.
#'
#' @examples
#'
#' data(termes)
#'
#' taxo=default.taxonomy()
#'
#' #create artificial taxonomic assignments
#' attr(termes, "motus")["best_identity:DB1"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
#' attr(termes, "motus")["best_identity:DB2"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
#' attr(termes, "motus")["best_identity:DB3"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
#' attr(termes, "motus")["taxid_by_db:DB1"] = termes$motus$taxid
#' attr(termes, "motus")["taxid_by_db:DB2"] = sample(termes$motus$taxid,size=nrow(termes$motus), replace=F)
#' attr(termes, "motus")["taxid_by_db:DB3"] = sample(termes$motus$taxid,size=nrow(termes$motus), replace=F)
#'
#' #Run taxo.decider
#' termes.ok = taxo.decider(termes, taxo, "DB2", 0.95)
#' head(termes.ok$motus[union(grep("DB", colnames(termes.ok$motus)), grep("_ok", colnames(termes.ok$motus)))])
#'
#' termes.ok = taxo.decider(termes, taxo, c("DB3", "DB1"), 0.95)
#' head(termes.ok$motus[union(grep("DB", colnames(termes.ok$motus)), grep("_ok", colnames(termes.ok$motus)))])
#'
#' #Quick look at the enhancement in taxonomic assignements
#' par(mfrow=c(1,4))
#' for(i in grep("best_identity.", colnames(termes.ok$motus))){
#' hist(termes.ok$motus[,i], breaks=20, ylim=c(1,21), main=colnames(termes.ok$motus)[i], xlab="assignment score")
#' }
#'
#' @seealso \code{\linkS4class{taxonomy.obitools}}, and methods \code{\link{species}},\code{\link{genus}}, \code{\link{family}},\code{\link{kingdom}},
#' \code{\link{superkingdom}},\code{\link{taxonatrank}}, \code{\link{taxonmicank}}
#'
#' @author Lucie Zinger
#' @keywords taxonomy
#'
#' @export
#'
taxo.decider = function(x, taxonomy, dbrank, thresh=0.95) {
noms = colnames(x$motus)
best_ids_names = noms[grep("best_identity.", noms)]
best_ids = x$motus[,best_ids_names]
taxids = x$motus[, gsub("best_identity", "taxid_by_db", best_ids_names)]
dbs = unlist(lapply(strsplit(best_ids_names, "\\:"), "[[", 2))
#Set max indices
ind = as.vector(t(apply(best_ids,1,function(y) order(rank(-y, ties.method="max"), match(dbrank, dbs))))[,1])
#Set default vector: db, bestids, taxids with max score
db_ok = dbs[ind]
best_identity_ok = best_ids[cbind(1:length(ind), ind)]
taxids_by_db_ok = taxids[cbind(1:length(ind), ind)]
#Get vector of db index that should be used according to condition > thresh
db_choice = taxo.decider.routine(dbrank, best_ids, dbs, thresh)
#Replacing by right values according to db_ok
for(i in 1:length(dbrank)){
db_ok[which(db_choice==i)] = dbrank[i]
best_identity_ok[which(db_choice==i)] = best_ids[which(db_choice==i),grep(dbrank[i], colnames(best_ids))]
taxids_by_db_ok[which(db_choice==i)] = taxids[which(db_choice==i),grep(dbrank[i], colnames(taxids))]
}
decision = data.frame(db_ok, best_identity_ok, taxids_by_db_ok)
coltaxid = colnames(decision)[grep("taxid", colnames(decision))]
attr(x, "motus") = data.frame(x$motus, decision)
new.tax = get.classic.taxonomy(x, taxonomy, coltaxid)
attr(x, "motus") = data.frame(x$motus, new.tax)
return(x)
}
taxo.decider.routine = function(dbrank, best_ids, dbs, thresh) {
#Setting mask
mask = matrix(NA,nrow(best_ids),length(dbrank))
colnames(mask)=dbrank
#For each DB, see if condition T/F
for(i in dbrank){
mask[,i] = best_ids[,which(dbs==i)]>thresh
}
#Get the first occurence of T in the table
out = apply(mask, 1, function(x) which(x==T)[1])
return(out)
}