#' @include 02_class_metabarcoding.data.R NULL #11.03.2011 #L.Zinger ####################### #function anosim.pw ####################### #computes pairwise anosim computation #input: #dat: dissimilarity matrix #g: factor defining the grouping to test #permutations: nb of permutation to access anosim statistics #p.adjust.method: method of correction for multiple-testing # #output: a distance-like table containing: #in the upper triangle: the anosims R values #in the lower triangle: the adjusted p-values ### start anosim.pw<-function(dat, g, permutations, p.adjust.method, ...) { require(vegan) #data.trasformation dat<-as.matrix(dat) g<-factor(g) #empty object for result storage ano<-matrix(NA, nrow=nlevels(g), ncol=nlevels(g), dimnames=list(levels(g),levels(g))) p.val.tmp<-NULL #running anosims for(i in 1:(nlevels(g)-1)) for(j in (i+1):nlevels(g)){ tmp<-anosim(as.dist(dat[c(which(g==levels(g)[i]),which(g==levels(g)[j])), c(which(g==levels(g)[i]),which(g==levels(g)[j]))]), c(rep(levels(g)[i], length(which(g==levels(g)[i]))), rep(levels(g)[j], length(which(g==levels(g)[j])))), permutations) ano[i,j]<-tmp$statistic p.val.tmp<-append(p.val.tmp, tmp$signif) } #p value correction for multiple comparison p.val.tmp<-p.adjust(p.val.tmp, p.adjust.method ) #put the corrected p values in the anosim table tmp<-NULL tmp2<-NULL for(i in 1:(nlevels(g)-1)) for(j in (i+1):nlevels(g)){ tmp<-append(tmp,i) tmp2<-append(tmp2,j) } for(i in 1:length(p.val.tmp)){ ano[tmp2[i],tmp[i]]<-p.val.tmp[i]} return(ano) } ### end #23 Nov 2012 #L.Zinger ################### #function MOTUtable ################### # Generates ready-to-use MOTU tables and basic statistics on samples (i.e. sequencing depth, raw richness, and invsimpson index) #input: #x: an obitable output (samples should be indicated as e.g. "sample.A01r" in column names) #y: the column name by which that data are to be aggregated. Should be e.g. "cluster" or "species_name" #outputs: #x.otu: the ready-to-use MOTU table #x.rawstats: basic statistics on samples ### start MOTUtable<-function(x, y) { require(vegan) nom<-as.character(substitute(x)) tmp<-x[,c(grep(y, colnames(x)), grep("sample", colnames(x)))] tmp2<-t(aggregate(tmp[,-1], by=list(tmp[,1]), sum)) x.otu<-tmp2[-1,] colnames(x.otu)<-paste(y,tmp2[1,], sep=".") x.rawstats<-data.frame(Nb_ind=rowSums(x.otu), Raw_richness=specnumber(x.otu, MARGIN=1), Raw_eveness=diversity(x.otu, "invsimpson", MARGIN=1) ) #may have a pb in the rowSums depending on the R version (allows or not non-numeric) assign(paste(nom, y, sep="."),x.otu,env = .GlobalEnv) assign(paste(nom, y, "rawstats", sep="."),x.rawstats,env = .GlobalEnv) } ### end #26 Nov 2012 #F.Boyer ################### #function reads.frequency & filter.threshold ################### #can be used to filter the table of reads to have the sequences that represents at least 95% of the total reads by sample # #e.g. reads.treshold(reads.frequency(metabarcodingS4Obj@reads), 0.95) filter.threshold <- function(v, threshold) { o <- order(v, decreasing=T) ind <- which(cumsum(as.matrix(v[o]))>threshold) v[-o[seq(min(length(o), 1+length(o)-length(ind)))]] <- 0 v } reads.threshold <- function (reads, threshold, by.sample=T) { res <- apply(reads, MARGIN=ifelse(by.sample, 1, 2), filter.threshold, thr=threshold) if (by.sample) res <- t(res) data.frame(res) } reads.frequency <- function (reads, by.sample=T) { res <- apply(reads, MARGIN=ifelse(by.sample, 1, 2), function(v) {v/sum(v)}) if (by.sample) res <- t(res) data.frame(res) } #06 Jan 2013 #F.Boyer ################### #function removeOutliers ################### #given a contengency table and a distance matrix #returns the list of samples that should be removed in order to have only #distances below thresold #can't return only one sample # #e.g. intraBad <- lapply(levels(sample.desc$sampleName), function(group) {samples<-rownames(sample.desc)[sample.desc$sampleName==group]; removeOutliers(contingencyTable[samples,], thr=0.3, distFun = function(x) vegdist(x, method='bray'))}) #require(vegan) removeOutliers <- function(m, thr=0.3, distFun = function(x) vegdist(x, method='bray') ) { distMat <- as.matrix(distFun(m)) maxM <- max(distMat) theBadGuys =c() while (maxM>thr) { bad <- apply(distMat, MARGIN=1, function(row, maxM) {any(row==maxM)}, maxM=maxM) bad <- names(bad)[bad] bad <- apply(distMat[bad,], MARGIN=1, mean) badGuy <- names(bad)[bad==max(bad), drop=F][1] theBadGuys <- c(theBadGuys, badGuy) stillok <- rownames(distMat) != badGuy distMat <- distMat[stillok, stillok, drop=F] maxM <- max(distMat) } if (length(theBadGuys) >= (nrow(m)-1)) { theBadGuys <- rownames(m) } theBadGuys } #31.05.2013 #L.Zinger #getAttrPerS, a function allowing to get the values of a sequence attribute per sample #(e.g. best_identities, etc...) the output is a list with one dataframe per sample. #This dataframe contains: # first column (named as attr): the attribute value for each sequence present in the sample # second column (named weight): the corresponding number of reads in the sample getAttrPerS=function(x,attr){ #x: a metabarcoding object #attr: a character object corresponding to the attribute #for which values per sample are needed (should be equal to a colname in x@motus) if(class(x)[1]!= "metabarcoding.data") { stop("x is not a metabarcoding S4 object") } if(is.character(attr)==F) { stop("attr is not a character object") } x.motus = motus(x) x.reads = reads(x) otu = apply(x.reads, 1, function(y) x.motus[match(names(y[which(y!=0)]),x.motus$id), grep(attr, colnames(x.motus))]) reads = apply(x.reads, 1, function(y) y[which(y!=0)]) output = mapply(cbind, otu, reads) output = lapply(output, function(y) { colnames(y)=c(attr,"weight") return(y) }) return(output) } ### end getAttrPerS