Initial commit

This commit is contained in:
2016-01-13 10:20:39 +01:00
commit ffa9638359
62 changed files with 53833 additions and 0 deletions

19
ROBITools.Rproj Normal file
View File

@@ -0,0 +1,19 @@
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: ISO-8859-1
RnwWeave: knitr
LaTeX: pdfLaTeX
BuildType: Package
PackageUseDevtools: Yes
PackagePath: ROBITools
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace

3
ROBITools/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
/man/
/loopbenchmark.R
/Read-and-delete-me

38
ROBITools/DESCRIPTION Normal file
View File

@@ -0,0 +1,38 @@
Package: ROBITools
Type: Package
Title: Metabarcoding data biodiversity analysis
Version: 0.1
Date: 2012-08-23
Author: LECA - Laboratoire d'ecologie alpine
Maintainer: LECA OBITools team <obitools@metabarcoding.org>
Description: More about what it does (maybe more than one line)
License: CeCILL v2.0
LazyLoad: yes
Roxygen: list(wrap = FALSE)
Collate:
's3objects.R'
'ROBITools.R'
'02_class_metabarcoding.data.R'
'aggregate.R'
'choose.taxonomy.R'
'contaslayer.R'
'distrib.extrapol.R'
'experimental.section.R'
'export-metabarcoding.R'
'read.obitab.R'
'import.metabarcoding.R'
'import.ngsfilter.R'
'layers.metabarcoding.R'
'metabarcoding_threshold.R'
'mstat.R'
'obiclean.R'
'pcrslayer.R'
'plot.PCRplate.R'
'plot.seqinsample.R'
'rarefy.R'
'read.ngsfilter.R'
'read.sumatra.R'
'taxoDBtree.R'
'taxonomic.resolution.R'
'taxonomy_classic_table.R'
RoxygenNote: 5.0.1

16
ROBITools/LICENSE-SLRE Executable file
View File

@@ -0,0 +1,16 @@
Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
Copyright (c) 2013 Cesanta Software Limited
All rights reserved
This code is dual-licensed: you can redistribute it and/or modify
it under the terms of the GNU General Public License version 2 as
published by the Free Software Foundation. For the terms of this
license, see <http://www.gnu.org/licenses/>.
You are free to use this code under the terms of the GNU General
Public License, but WITHOUT ANY WARRANTY; without even the implied
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
Alternatively, you can license this code under a commercial
license, as set out in <http://cesanta.com/>.

52
ROBITools/NAMESPACE Normal file
View File

@@ -0,0 +1,52 @@
# Generated by roxygen2: do not edit by hand
S3method(aggregate,metabarcoding.data)
S3method(plot,PCRplate)
S3method(plot,seqinsample)
S3method(summary,taxores)
export(addS3Class)
export(colnames)
export(const.threshold.mask)
export(contaslayer)
export(createS3Class)
export(dbtree)
export(dist.center.group)
export(dist.clique.group)
export(dist.grid)
export(dm.univariate)
export(extracts.obiclean)
export(extracts.obiclean_cluster)
export(extrapol.freq)
export(get.classic.taxonomy)
export(import.metabarcoding.data)
export(import.ngsfilter.data)
export(layer.names)
export(m.bivariate)
export(m.univariate)
export(m.univariate.test)
export(m.weight)
export(map.extrapol.freq)
export(marginalsum)
export(metabarcoding.data)
export(motus)
export(normalize)
export(rarefy)
export(read.ngsfilter)
export(read.obitab)
export(reads)
export(rmS3Class)
export(rownames)
export(samples)
export(taxo.decider)
export(threshold)
export(threshold.mask)
export(threshold.set)
exportClasses(metabarcoding.data)
exportMethods("$")
exportMethods("$<-")
exportMethods("[[")
exportMethods("[[<-")
exportMethods(colnames)
exportMethods(rownames)
import(ROBITaxonomy)
import(igraph)

View File

@@ -0,0 +1,539 @@
#' @include ROBITools.R
#' @include s3objects.R
#' @import ROBITaxonomy
NULL
require(ROBITaxonomy)
#
# FOR THE DEVELOPPER : we have to check that the code doesn't relies on the
# fact that the xx@samples$sample column is not always
# identical to the rownames(xx@samples)
setClassUnion("characterOrNULL",c("character","NULL"))
setClassUnion("matrixOrfactorL",c("matrix","factor"))
#
# We specialize data.frame in two subclasses motus.frame and samples.frame
# for this we add to function insuring the type checking and the cast from
# data.frame
#
is.motus.frame= function(x) any(class(x)=="motus.frame")
is.samples.frame= function(x) any(class(x)=="samples.frame")
as.motus.frame= function(x) {
if (! is.data.frame(x))
stop("only cast from data.frame is allowed")
if (! is.motus.frame(x))
x = addS3Class(x,"motus.frame")
return(x)
}
as.samples.frame= function(x) {
if (! is.data.frame(x))
stop("only cast from data.frame is allowed")
if (! is.samples.frame(x))
x = addS3Class(x,"samples.frame")
return(x)
}
samples.frame=as.samples.frame
motus.frame=as.motus.frame
as.factor.or.matrix = function(x) {
if (is.matrix(x))
return(x)
if (is.factor(x)){
if (length(dim(x))!=2)
stop('Just factor with two dimensions are allowed')
return(x)
}
if (!is.data.frame(x))
stop('Just matrix, 2D factor and data.frame can be casted')
tps = sapply(x,class)
allna = sapply(x, function(y) all(is.na(y)))
if (all(tps==tps[[1]] | allna)) {
tps = tps[[1]]
}
else
stop('all the column of the data.frame must have the same type')
tps = tps[[1]]
x = as.matrix(x)
dx = dim(x)
if (tps=='factor')
x = factor(x)
dim(x)=dx
return(x)
}
#' DNA metabarcoding experiment description class
#'
#' A S4 class describing a DNA metabarcoding experiment. It groups
#' three data frames describing samples, motus and occurrences of
#' MOTUs per sample
#'
#'@section Slots:
#' \describe{
#' \item{\code{reads}:}{Matrix of class \code{"numeric"},
#' containing the counts of reads per samples
#' \itemize{
#' \item{1 samples per line}
#' \item{1 sequence per column}
#' }
#' }
#'
#' \item{\code{samples}:}{Object of class \code{"data.frame"}, describing samples
#' \itemize{
#' \item{1 samples per line}
#' \item{1 property per column}
#' }
#' }
#'
#' \item{\code{motus}:}{Object of class \code{"data.frame"}, describing MOTUs (sequences)
#' \itemize{
#' \item{1 MOTU per line}
#' \item{1 property per column}
#' }
#' }
#'
#' \item{\code{layers}:}{Object of class \code{"list"}, containing a set of data layers
#' linking motus and samples. Each element of the list is a matrix
#' of the same size than the \code{read} slot with
#' \itemize{
#' \item{1 samples per line}
#' \item{1 sequence per column}
#' }
#' }
#'
#' \item{\code{scount}:}{Object of class \code{"integer"}, containing the count of sample}
#'
#' \item{\code{mcount}:}{Object of class \code{"integer"}, containing the count of MOTUs}
#'
#' \item{\code{sample.margin}:}{Vector of class \code{"numeric"}, describing the total count of
#' sequence per sample. By default this slot is set by applying sum
#' to the reads data.frame lines}
#'
#' \item{\code{taxonomy}:}{Object of class \code{"taxonomy.obitools"}, linking the DNA metabarcoding
#' experiment to a taxonomy}
#'
#' \item{\code{taxid}:}{Vector of class \code{"character"}, list of MOTUs' attributes to manage as taxid}
#' }
#'
#' @seealso \code{\link{taxonomy.obitools}},
#' @name metabarcoding.data
#' @rdname metabarcoding-data-class
#' @keywords DNA metabarcoding
#' @author Eric Coissac
#' @exportClass metabarcoding.data
setClass("metabarcoding.data",
#
# Attribute declaration
#
representation(reads = "matrix",
samples = "data.frame",
motus = "data.frame",
layers = "list",
scount = "integer",
mcount = "integer",
sample.margin = "numeric",
taxonomy = "obitools.taxonomyOrNULL",
taxid = "characterOrNULL"
),
#
# Check object structure
#
validity = function(object) {
## object : nom reserve !
#
# Check that reads / samples and motus data.frames
# have compatible sizes
#
# reads line count = samples line count
# reads column count = motus line count
rsize = dim(object@reads)
ssize = dim(object@samples)
msize = dim(object@motus)
csize = length(object@sample.margin)
if (rsize[1] != ssize[1] &
rsize[2] != msize[1] &
rsize[1] != csize)
return(FALSE)
# if no layer, object is ok
if (length(object@layers)==0)
return(TRUE)
# otherwise we check the size of each layer as we
# did for reads
return(! any(sapply(object@layers,
function(l) any(dim(l)!=c(ssize[1],msize[1])))))
}
)
#
#' metabarcoding.data constructor
#'
#' @docType methods
#' @rdname initialize-methods
#' @aliases initialize-methods,metabarcoding.data
setMethod("initialize",
"metabarcoding.data",
function(.Object, reads,samples,motus,
taxonomy=NULL,taxid=NULL,
sample.margin=NA,
layers=list()) {
rn = rownames(reads)
cn = colnames(reads)
.Object@reads <- reads
# .Object@samples <- as.samples.frame(samples)
.Object@samples <- samples
row.names(.Object@samples) = rn
#.Object@motus <- as.motus.frame(motus)
.Object@motus <- motus
row.names(.Object@motus) = cn
# Set colnames and rownames to each layers
layers = lapply(layers, function(x) {colnames(x)=cn
rownames(x)=rn
return(x)})
.Object@layers <- layers
# Precompute sample count and motu count
.Object@scount = dim(.Object@samples)[1]
.Object@mcount = dim(.Object@motus)[1]
.Object@taxonomy = taxonomy
.Object@taxid = taxid
if (is.null(sample.margin))
.Object@sample.margin = rowSums(reads)
else
.Object@sample.margin = sample.margin
names(.Object@sample.margin) = rn
validObject(.Object) ## valide l'objet
return(.Object)
})
#
# metabarcoding.data getters
#
#' @export
setGeneric("reads", function(obj) {
return(standardGeneric("reads"))
})
#' Extracts the matrix describing MOTUs abondances
#'
#' Extract the the matrix describing MOTUs abondances (read counts)
#' from a \code{\link{metabarcoding.data}} instance.
#'
#' @param obj a \code{\link{metabarcoding.data}} instance
#' @return a matrix containing data about reads
#'
#' @examples
#' # load termite data set from the ROBITools sample data
#' data(termes)
#'
#' # Extract the matrix describing MOTUs abondances
#' d = reads(termes)
#'
#' head(d)
#'
#' @seealso \code{\link{metabarcoding.data}},
#' \code{\link{motus}}, \code{\link{samples}}
#'
#' @docType methods
#' @rdname read-methods
#' @aliases read-methods,metabarcoding.data
#' @author Eric Coissac
#'
setMethod("reads", "metabarcoding.data", function(obj) {
return(obj@reads)
})
# get samples data.frames
#' @export
setGeneric("samples", function(obj) {
return(standardGeneric("samples"))
})
#' Extracts the samples description data.frame
#'
#' Extract the sample description data.frame from a
#' \code{\link{metabarcoding.data}} instance.
#'
#' @param obj a \code{\link{metabarcoding.data}} instance
#' @return a data.frame containing data about sample
#'
#' @examples
#' # load termite data set from the ROBITools sample data
#' data(termes)
#'
#' # Extract the data frame describing samples
#' d = samples(termes)
#'
#' head(d)
#'
#' @seealso \code{\link{metabarcoding.data}},
#' \code{\link{motus}}, \code{\link{reads}}
#'
#' @docType methods
#' @rdname samples-methods
#' @aliases samples-methods,metabarcoding.data
#' @author Eric Coissac
#'
setMethod("samples", "metabarcoding.data", function(obj) {
return(obj@samples)
})
#' @export
setGeneric("motus", function(obj) {
return(standardGeneric("motus"))
})
#' Extracts the MOTU descriptions \code{data.frame}
#'
#' Extract the MOTUs description \code{data.frame} from a
#' \code{\link{metabarcoding.data}} instance.
#'
#' @param obj a \code{\link{metabarcoding.data}} instance
#' @return a data.frame containing data about MOTU
#'
#' @examples
#' # load termite data set from the ROBITools sample data
#' data(termes)
#'
#' # Extract the data.frame describing MOTUs
#' d = motus(termes)
#'
#' head(d)
#'
#' @seealso \code{\link{metabarcoding.data}},
#' \code{\link{reads}}, \code{\link{samples}}
#'
#' @docType methods
#' @rdname motu-methods
#' @aliases motu-methods,metabarcoding.data
#'
setMethod("motus", "metabarcoding.data", function(obj) {
return(obj@motus)
})
# get sample count
setGeneric("sample.count", function(obj) {
return(standardGeneric("sample.count"))
})
setMethod("sample.count", "metabarcoding.data", function(obj) {
return(obj@scount)
})
# get motu count
setGeneric("motu.count", function(obj) {
return(standardGeneric("motu.count"))
})
setMethod("motu.count", "metabarcoding.data", function(obj) {
return(obj@mcount)
})
# dim method
setMethod("dim", "metabarcoding.data", function(x) {
return(c(x@scount,x@mcount))
})
setMethod('[', "metabarcoding.data", function(x,i=NULL,j=NULL,...,drop=TRUE) {
# special case if samples are not specified (dimension 1)
if (!hasArg(i))
i = 1:x@scount
# special case if motus are not specified (dimension 2)
if (!hasArg(j))
j = 1:x@mcount
# special case if the layer attribut is specified
args = list(...)
if (!is.null(args$layer))
return(x[[args$layer]][i,j])
#####################
#
# normal case
#
r = x@reads[i,j,drop=FALSE]
if (sum(dim(r) > 1)==2 | ! drop)
{
# we do the selection on the motus and samples description data.frame
m = x@motus[j,,drop=FALSE]
s = x@samples[i,,drop=FALSE]
# we do the selection on each layers
l = lapply(x@layers,function(l) l[i,j,drop=FALSE])
newdata = copy.metabarcoding.data(x, reads=r, samples=s, motus=m, layers=l)
}
else
{
newdata = as.numeric(x@reads[i,j])
}
return(newdata)
})
setMethod('[<-', "metabarcoding.data",
function (x, i, j, ..., value) {
if (!hasArg(i))
i = 1:x@scount
if (!hasArg(j))
j = 1:x@mcount
args = list(...)
if (is.null(args$layer))
x@reads[i, j]=value
else
x[[args$layer]][i,j]=value
return(x)
})
#################################################
#
# User interface function to create
# metabarcoding.data objects
#
#################################################
#'@export
metabarcoding.data = function(reads,samples,motus,
taxonomy=NULL,taxid=NULL,
sample.margin=NULL,
layers=list()) {
rd = new('metabarcoding.data',
reads=reads,
samples=samples,
motus=motus,
taxonomy=taxonomy,
taxid=taxid,
sample.margin=sample.margin,
layers=layers
)
return(rd)
}
copy.metabarcoding.data = function(data,
reads=NULL,
samples=NULL,motus=NULL,
taxonomy=NULL,taxid=NULL,
sample.margin=NULL,
layers=NULL) {
if (is.null(reads))
reads = data@reads
if (is.null(samples))
samples = data@samples
if (is.null(motus))
motus = data@motus
if (is.null(taxonomy))
taxonomy = data@taxonomy
if (is.null(taxid))
taxid = data@taxid
if (is.null(sample.margin))
sample.margin = data@sample.margin
if (is.null(layers))
layers = data@layers
rd = new('metabarcoding.data',
reads=reads,
samples=samples,
motus=motus,
taxonomy=taxonomy,
taxid=taxid,
sample.margin=sample.margin,
layers=layers
)
return(rd)
}
#' @export
setGeneric('rownames')
#' @export
setMethod("rownames", "metabarcoding.data", function(x, do.NULL = TRUE, prefix = "col") {
return(rownames(x@reads,do.NULL,prefix))
})
#' @export
setGeneric('colnames')
#' @export
setMethod("colnames", "metabarcoding.data", function(x, do.NULL = TRUE, prefix = "col") {
return(colnames(x@reads,do.NULL,prefix))
})

33
ROBITools/R/ROBITools.R Normal file
View File

@@ -0,0 +1,33 @@
#' A package to manipulate DNA metabarcoding data.
#'
#' This package was written as a following of the OBITools.
#'
#' \tabular{ll}{
#' Package: \tab ROBITools\cr
#' Type: \tab Package\cr
#' Version: \tab 0.1\cr
#' Date: \tab 2013-06-27\cr
#' License: \tab CeCILL 2.0\cr
#' LazyLoad: \tab yes\cr
#'}
#'
#' @name ROBITools-package
#' @aliases ROBITools
#' @docType package
#' @title A package to manipulate DNA metabarcoding data.
#' @author Frederic Boyer
#' @author Aurelie Bonin
#' @author Lucie Zinger
#' @author Eric Coissac
#'
#' @references http://metabarcoding.org/obitools
#'
NA
.onLoad <- function(libname, pkgname) {
packageStartupMessage( "ROBITools package" )
#print(getwd())
}

229
ROBITools/R/aggregate.R Normal file
View File

@@ -0,0 +1,229 @@
#' @include 02_class_metabarcoding.data.R
NULL
# TODO: Add comment
#
# Author: coissac
###############################################################################
#' @export
aggregate.metabarcoding.data=function(x, by, FUN,...,
MARGIN='sample',
default.layer=NULL,
layers=NULL) {
uniq.value = function(z) {
if (is.null(z) |
any(is.na(z)) |
length(z)==0)
ans = NA
else {
if (all(z==z[1]))
ans = z[1]
else
ans = NA
}
if (is.factor(z))
ans = factor(ans,levels=levels(z))
return(ans)
}
#
# Deals with the supplementaty aggregate arguments
#
if (is.null(default.layer))
default.layer=uniq.value
if (is.null(layers)) {
layers = as.list(rep(c(default.layer),length(x@layers)))
names(layers)=layer.names(x)
}
else {
for (n in layer.names(x))
if (is.null(layers[[n]]))
layers[[n]]=default.layers
}
if (MARGIN == 'sample')
MARGIN=1
if (MARGIN == 'motu')
MARGIN=2
reads = x@reads
if (MARGIN==1) {
# prepare the aggrevation arguments for the read table
# from the function arguments
dotted = list(...)
if (length(dotted) > 0)
aggr.args = list(reads,by=by,FUN=FUN,...=dotted,simplify=FALSE)
else
aggr.args = list(reads,by=by,FUN=FUN,simplify=FALSE)
# Aggregate the read table
ragr = do.call(aggregate,aggr.args)
# extrat new ids from the aggregated table
ncat = length(by)
ids = as.character(interaction(ragr[,1:ncat,drop=FALSE]))
# remove the aggregations modalities to rebuild a correct
# reads table
ragr = as.matrix(ragr[,-(1:ncat),drop=FALSE])
dragr= dim(ragr)
cragr= colnames(ragr)
ragr = as.numeric(ragr)
dim(ragr)=dragr
colnames(ragr)=cragr
rownames(ragr)=ids
#
# Apply the same aggragation to each layer
#
ln = layer.names(x)
la = vector(mode="list",length(ln))
names(la)=ln
for (n in ln) {
f = layers[[n]]
if (is.factor(x[[n]])){
isfact = TRUE
lf = levels(x[[n]])
df = dim(x[[n]])
m = matrix(as.character(x[[n]]))
dim(m)=df
}
else
m = x[[n]]
aggr.args = list(m,by=by,FUN=f,simplify=FALSE)
lagr = do.call(aggregate,aggr.args)
lagr = as.factor.or.matrix(lagr[,-(1:ncat),drop=FALSE])
if (isfact){
df = dim(lagr)
lagr = factor(lagr,levels=lf)
dim(lagr)=df
}
rownames(lagr)=ids
la[[n]]=lagr
}
# aggragate the sample table according to the same criteria
#
# TODO: We have to take special care of factors in the samples
# data.frame
sagr = aggregate(samples(x),by,uniq.value,simplify=FALSE)
# move the first columns of the resulting data frame (the aggregations
# modalities to the last columns of the data.frame
sagr = sagr[,c((ncat+1):(dim(sagr)[2]),1:ncat),drop=FALSE]
larg = c(lapply(sagr,unlist),list(stringsAsFactors=FALSE))
sagr = do.call(data.frame,larg)
# set samples ids to the ids computed from modalities
sagr$id=ids
rownames(sagr)=ids
# build the new metabarcoding data instance
newdata = copy.metabarcoding.data(x,reads=ragr,samples=sagr)
}
else {
# prepare the aggregation arguments for the read table
# from the function arguments
# BECARFUL : the reads table is transposed
# standard aggregate runs by row and we want
# aggregation by column
dotted = list(...)
if (length(dotted) > 0)
aggr.args = list(t(reads),by=by,FUN=FUN,...=dotted,simplify=FALSE)
else
aggr.args = list(t(reads),by=by,FUN=FUN,simplify=FALSE)
# Aggregate the read table
ragr = do.call(aggregate.data.frame,aggr.args)
# extrat new ids from the aggregated table
ncat = length(by)
ids = as.character(interaction(ragr[,1:ncat,drop=FALSE]))
# remove the aggregations modalities to rebuild a correct
# reads table
ragr = t(ragr[,-(1:ncat),drop=FALSE])
dragr= dim(ragr)
rragr= rownames(ragr)
ragr = as.numeric(ragr)
dim(ragr)=dragr
colnames(ragr)=ids
rownames(ragr)=rragr
#
# Apply the same aggragation to each layer
#
ln = layer.names(x)
la = vector(mode="list",length(ln))
names(la)=ln
for (n in ln) {
f = layers[[n]]
if (is.factor(x[[n]])){
isfact = TRUE
lf = levels(x[[n]])
df = dim(x[[n]])
m = matrix(as.character(x[[n]]))
dim(m)=df
}
else
m = x[[n]]
aggr.args = list(t(m),by=by,FUN=f,simplify=FALSE)
lagr = do.call(aggregate,aggr.args)
lagr = t(as.factor.or.matrix(lagr[,-(1:ncat),drop=FALSE]))
if (isfact){
df = dim(lagr)
lagr = factor(lagr,levels=lf)
dim(lagr)=df
}
colnames(lagr)=ids
la[[n]]=lagr
}
# aggragate the motus table according to the same criteria
magr = aggregate(motus(x),by,uniq.value,simplify=FALSE)
# move the first columns of the resulting data frame (the aggregations
# modalities to the last columns of the data.frame
magr = magr[,c((ncat+1):(dim(magr)[2]),1:ncat),drop=FALSE]
larg = c(lapply(magr,unlist),list(stringsAsFactors=FALSE))
magr = do.call(data.frame,larg)
# set motus ids to the ids computed from modalities
magr$id=ids
rownames(magr)=ids
# build the new metabarcoding data instance
newdata = copy.metabarcoding.data(x,reads=ragr,motus=magr,layers=la)
}
return(newdata)
}

View File

@@ -0,0 +1,107 @@
#' @import ROBITaxonomy
#' @include 02_class_metabarcoding.data.R
NULL
#' Choose between databases for taxonomic classifications
#'
#' Chooses a sequence taxonomic assignment in order of preference for the different
#' reference databases that have been used when the assignment is above a certain threshold
#'
#'
#' @param x a \code{\link{metabarcoding.data}} object
#' @param taxonomy a \code{\linkS4class{taxonomy.obitools}} instance
#' @param dbrank string or vector indicating reference database names ranked by order of preference
#' @param thresh a best_identity threshold for applying priority. Default is \code{0.95}
#'
#' @return returns a data.frame with the refined taxonomic assignement and classic taxonomy description.
#'
#' @examples
#'
#' data(termes)
#'
#' taxo=default.taxonomy()
#'
#' #create artificial taxonomic assignments
#' attr(termes, "motus")["best_identity:DB1"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
#' attr(termes, "motus")["best_identity:DB2"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
#' attr(termes, "motus")["best_identity:DB3"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
#' attr(termes, "motus")["taxid_by_db:DB1"] = termes$motus$taxid
#' attr(termes, "motus")["taxid_by_db:DB2"] = sample(termes$motus$taxid,size=nrow(termes$motus), replace=F)
#' attr(termes, "motus")["taxid_by_db:DB3"] = sample(termes$motus$taxid,size=nrow(termes$motus), replace=F)
#'
#' #Run taxo.decider
#' termes.ok = taxo.decider(termes, taxo, "DB2", 0.95)
#' head(termes.ok$motus[union(grep("DB", colnames(termes.ok$motus)), grep("_ok", colnames(termes.ok$motus)))])
#'
#' termes.ok = taxo.decider(termes, taxo, c("DB3", "DB1"), 0.95)
#' head(termes.ok$motus[union(grep("DB", colnames(termes.ok$motus)), grep("_ok", colnames(termes.ok$motus)))])
#'
#' #Quick look at the enhancement in taxonomic assignements
#' par(mfrow=c(1,4))
#' for(i in grep("best_identity.", colnames(termes.ok$motus))){
#' hist(termes.ok$motus[,i], breaks=20, ylim=c(1,21), main=colnames(termes.ok$motus)[i], xlab="assignment score")
#' }
#'
#' @seealso \code{\linkS4class{taxonomy.obitools}}, and methods \code{\link{species}},\code{\link{genus}}, \code{\link{family}},\code{\link{kingdom}},
#' \code{\link{superkingdom}},\code{\link{taxonatrank}}, \code{\link{taxonmicank}}
#'
#' @author Lucie Zinger
#' @keywords taxonomy
#'
#' @export
#'
taxo.decider = function(x, taxonomy, dbrank, thresh=0.95) {
noms = colnames(x$motus)
best_ids_names = noms[grep("best_identity.", noms)]
best_ids = x$motus[,best_ids_names]
taxids = x$motus[, gsub("best_identity", "taxid_by_db", best_ids_names)]
dbs = unlist(lapply(strsplit(best_ids_names, "\\:"), "[[", 2))
#Set max indices
ind = as.vector(t(apply(best_ids,1,function(y) order(rank(-y, ties.method="max"), match(dbrank, dbs))))[,1])
#Set default vector: db, bestids, taxids with max score
db_ok = dbs[ind]
best_identity_ok = best_ids[cbind(1:length(ind), ind)]
taxids_by_db_ok = taxids[cbind(1:length(ind), ind)]
#Get vector of db index that should be used according to condition > thresh
db_choice = taxo.decider.routine(dbrank, best_ids, dbs, thresh)
#Replacing by right values according to db_ok
for(i in 1:length(dbrank)){
db_ok[which(db_choice==i)] = dbrank[i]
best_identity_ok[which(db_choice==i)] = best_ids[which(db_choice==i),grep(dbrank[i], colnames(best_ids))]
taxids_by_db_ok[which(db_choice==i)] = taxids[which(db_choice==i),grep(dbrank[i], colnames(taxids))]
}
decision = data.frame(db_ok, best_identity_ok, taxids_by_db_ok)
coltaxid = colnames(decision)[grep("taxid", colnames(decision))]
attr(x, "motus") = data.frame(x$motus, decision)
new.tax = get.classic.taxonomy(x, taxonomy, coltaxid)
attr(x, "motus") = data.frame(x$motus, new.tax)
return(x)
}
taxo.decider.routine = function(dbrank, best_ids, dbs, thresh) {
#Setting mask
mask = matrix(NA,nrow(best_ids),length(dbrank))
colnames(mask)=dbrank
#For each DB, see if condition T/F
for(i in dbrank){
mask[,i] = best_ids[,which(dbs==i)]>thresh
}
#Get the first occurence of T in the table
out = apply(mask, 1, function(x) which(x==T)[1])
return(out)
}

49
ROBITools/R/contaslayer.R Normal file
View File

@@ -0,0 +1,49 @@
#' @include 02_class_metabarcoding.data.R
NULL
#' Detects contaminants in metabarcoding data
#'
#' Detects sequences/motus in a \code{\link{metabarcoding.data}} object
#' for which frequencies over the entire dataset are maximum in negative controls and
#' hence, most likely to be contaminants.
#'
#'
#' @param x a \code{\link{metabarcoding.data}} object
#' @param controls a vector of samples names where conta are suspected to be detected
#' (typically negative control names).
#' @param clust a vector for grouping sequences. Default set to \code{NULL}.
#'
#' @return a vector containing the names of sequences identified as contaminants
#'
#' @examples
#'
#' data(termes)
#' termes.ok = termes[,colSums(termes$reads)>0]
#' neg = rownames(termes.ok)[grep("r",rownames(termes.ok))]
#'
#' #finds contaminants based on neg samples
#' contaslayer(termes.ok, neg)
#'
#' # extanding contamininant detection with grouping factor,
#' # typically obiclean/sumatra cluster or taxonomy membership
#' contaslayer(termes.ok, neg, termes.ok$motus$scientific_name)
#'
#' @seealso \code{\link{threshold}} for further trimming
#' @author Lucie Zinger
#' @export
contaslayer = function(x,controls,clust=NULL){
x.fcol = normalize(x, MARGIN=2)$reads
x.max = rownames(x.fcol[apply(x.fcol, 2, which.max),])
conta = colnames(x)[!is.na(match(x.max,controls))]
if (length(clust)!=0) {
agg = data.frame(conta.id=colnames(x.fcol), clust)
conta.ext = agg$conta.id[which(!is.na(match( agg$clust, agg$clust[match(conta,agg$conta.id)])))]
return(as.vector(conta.ext))
}
else {
return(conta)
}
}

View File

@@ -0,0 +1,178 @@
#' @include 02_class_metabarcoding.data.R
NULL
#' Read frequencies krigging
#'
#' Extrapolates read frequencies from a \code{\link{metabarcoding.data}} object in space for a finer resolution
#'
#' @param x a vector or matrix from a row-normalized read table
#' \code{\link{metabarcoding.data}} object
#' @param min.coord a vector of length = 2 indicating the minimum values of x and y
#' coordinates to be used for the predicted grid
#' @param max.coord a vector of length = 2 indicating the maximum values of x and y
#' coordinates to be used for the predicted grid
#' @param grid.grain an integer indicating the resolution (i.e. nb of subpoints) in x and y
#' coordinates required for the predicted grid
#' @param coords a dataframe containing the x and y coordinates of the abundances
#' from x to be extrapolated.
#' @param otus.table a motus data.frame containing motus informations of x
#' @param cutoff a cutoff below which abundances are set to 0.
#' This threshold also determines the value to be added to 0 values for log10
#' transformation
#' @param return.metabarcoding.data if \code{TRUE}, returns a \code{\link{metabarcoding.data}} object. Default is \code{FALSE}
#'
#' @return either a dataframe or a S3 object with a structure similar to \code{\link{metabarcoding.data}} object.
#' The number of samples corresponds to the predicted points.
#' The two last columns (if \code{return.metabarcoding.data==F}) or sample data.frame contains x y coordinates of the predicted grid
#' The all but last two columns (if \code{return.metabarcoding.data==F}) or read matrix contains the predicted log10 transformed relative abundances
#' instead of reads counts
#' If \code{return.metabarcoding.data==F} the motus data.frame contains the motus informations from x
#'
#' @examples
#'
#' data(termes)
#' #Create dummy spatial coordinates
#' attr(termes, "samples")[c("x", "y")] = expand.grid(1:7,1:3)
#'
#' #compute frequencies
#' attr(termes, "layers")[["reads.freq"]] = normalize(termes, MARGIN=1)$reads
#'
#' # Getting extrapolations
#' termes.pred = extrapol.freq(attr(termes, "layers")[["reads.freq"]], min.coord=c(1,1), max.coord=c(7,3),
#' grid.grain=100,termes$samples[,c("x", "y")], termes$motus, cutoff=1e-3)
#'
#' head(termes.pred$reads)
#' @seealso \code{\link{map.extrapol.freq}} as well as \code{sp} and \code{gstat} packages
#' @author Lucie Zinger
#' @export
extrapol.freq = function(x, min.coord, max.coord, grid.grain=100, coords, otus.table, cutoff=1e-3, return.metabarcoding.data = FALSE) {
require(gstat)
require(sp)
#predicted grid setting
new.x = seq(min.coord[1], max.coord[1], length.out = grid.grain)
new.y = seq(min.coord[2], max.coord[2], length.out = grid.grain)
grid.p=expand.grid(new.x, new.y)
colnames(grid.p)=c("x", "y")
S=sp::SpatialPoints(grid.p); sp::gridded(S)<-TRUE
m=gstat::vgm(50, "Exp", 100)
#krigging
preds = apply(x, 2, function(otu) {
otu[otu<cutoff] = cutoff
spj=cbind(coords,otu)
colnames(spj)=c("x", "y", "otu")
spj.g=gstat::gstat(id="Log10.freq", formula=log10(otu)~1,locations=~x+y,data=spj,model=m)
gstat::predict.gstat(spj.g, grid.p, quiet=T)$Log10.freq.pred
})
#formatting the output
colnames(preds) = rownames(otus.table)
rownames(preds) = paste("s", 1:nrow(grid.p), sep=".")
row.names(grid.p) = rownames(preds)
if(return.metabarcoding.data==F) {
out = data.frame(preds, grid.p)
} else{
out = metabarcoding.data(preds, grid.p, otus.table)
}
return(out)
}
#' Maps of krigged log10-transformed frequencies
#'
#' Maps the output of extrapol.freq
#'
#'
#' @param x an extrapol.freq output
#' @param path the path of the folder to export the map. Default is \code{NULL} and map is printed in Rplot/quartz
#' @param col.names a vector containing the names of the columns to be used for defining the file name. Typically
#' the column names containing the taxonomic information and/or sequence/motus id.
#' @param index an integer indicating column number of the motu/sequence to be plotted.
#' @param cutoff lower motu frequency accepted to consider motu abundance as different
#' from 0. Should be the same than the one used in extrapol.freq
#' @param add.points a 3-column data.frame containing factor levels and associated x and y coordinates
#' to be added to the map. Typically taxa observed in the field.
#' @param adj a value used for adjusting text position in the map. Default is \code{4}
#'
#' @return a map/png file displaying motus distribution.
#'
#' @examples
#'
#' data(termes)
#' attr(termes, "samples")[c("x", "y")] = expand.grid(1:7,1:3)
#'
#' #compute frequencies
#' attr(termes, "layers")[["reads.freq"]] = normalize(termes, MARGIN=1)$reads
#'
#' # Getting extrapolations
#' termes.pred = extrapol.freq(attr(termes, "layers")[["reads.freq"]],
#' grid.grain=100,termes$samples[,c("x", "y")], termes$motus, cutoff=1e-3)
#'
#' #mapping the distribution of the 3 most abundant sequences (caution, mfrow does not work for lattice's levelplot)
#' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 1, cutoff=1e-3)
#' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 2, cutoff=1e-3)
#' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 3, cutoff=1e-3)
#'
#' #dummy observationnal data
#' termes.obs = data.frame(x=c(2,3,5), y=c(2.7,2,2.6), taxa = rep("Isoptera Apicotermitinae", 3))
#' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 3, cutoff=1e-3, add.points=termes.obs)
#'
#' @seealso \code{\link{extrapol.freq}}, and \code{levelplot} from \code{lattice} package
#' @author Lucie Zinger
#' @export
map.extrapol.freq = function(x, path=NULL, col.name=NULL, index, cutoff=1e-3, add.points=NULL, adj=4) {
require(lattice)
if(!is.null(path)) {
x.motus = apply(x$motus,2,as.character)
name = gsub("\\.", "_", paste(gsub(", ", "_", toString(x.motus[index,col.name])), x.motus[index,"id"], sep="_"))
file.out = paste(path, "/", name, ".png", sep="")
}
z=x$reads[,index]
z[abs(z)>abs(log10(cutoff))]=log10(cutoff)
z[z>0] = 0
spj=as.data.frame(cbind(x$samples,z))
colnames(spj)=c("x", "y", "z")
map.out=levelplot(z~x+y, spj, col.regions=topo.colors(100),
at=seq(log10(cutoff),log10(1), by=0.2),
colorkey=list(at=seq(log10(cutoff),log10(1), by=0.2),
labels=list(at=seq(log10(cutoff),log10(1), by=0.2),
labels=round(10^seq(log10(cutoff),log10(1), by=0.2),3))),
aspect = "iso", contour=F, main=list(label=x$motus[index, "id"], cex=0.7))
if(!is.null(path)) {
png(file=file.out, width=800, height=800)
print(map.out)
if(!is.null(add.points)) {
n = (max(spj[,"y"])-min(spj["y"]))/length(unique(spj[,"y"]))*adj
trellis.focus("panel", 1, 1, highlight=FALSE)
lpoints(add.points[,"x"], add.points[,"y"], cex=0.7, lwd=3, col="red")
ltext(add.points[,"x"], add.points[,"y"]+n, add.points[,-match(c("x", "y"), colnames(add.points))], col="red", cex=1.5)
trellis.unfocus()
}
dev.off()
} else {
print(map.out)
if(!is.null(add.points)) {
n = (max(spj[,"y"])-min(spj["y"]))/length(unique(spj[,"y"]))*adj
trellis.focus("panel", 1, 1, highlight=FALSE)
lpoints(add.points[,"x"], add.points[,"y"], cex=0.7, lwd=3, col="red")
ltext(add.points[,"x"], add.points[,"y"]+n, add.points[,-match(c("x", "y"), colnames(add.points))], col="red", cex=1)
trellis.unfocus()
}
}
}

View File

@@ -0,0 +1,206 @@
#' @include 02_class_metabarcoding.data.R
NULL
#11.03.2011
#L.Zinger
#######################
#function anosim.pw
#######################
#computes pairwise anosim computation
#input:
#dat: dissimilarity matrix
#g: factor defining the grouping to test
#permutations: nb of permutation to access anosim statistics
#p.adjust.method: method of correction for multiple-testing
#
#output: a distance-like table containing:
#in the upper triangle: the anosims R values
#in the lower triangle: the adjusted p-values
### start
anosim.pw<-function(dat, g, permutations, p.adjust.method, ...) {
require(vegan)
#data.trasformation
dat<-as.matrix(dat)
g<-factor(g)
#empty object for result storage
ano<-matrix(NA, nrow=nlevels(g), ncol=nlevels(g), dimnames=list(levels(g),levels(g)))
p.val.tmp<-NULL
#running anosims
for(i in 1:(nlevels(g)-1)) for(j in (i+1):nlevels(g)){
tmp<-anosim(as.dist(dat[c(which(g==levels(g)[i]),which(g==levels(g)[j])),
c(which(g==levels(g)[i]),which(g==levels(g)[j]))]),
c(rep(levels(g)[i], length(which(g==levels(g)[i]))),
rep(levels(g)[j], length(which(g==levels(g)[j])))), permutations)
ano[i,j]<-tmp$statistic
p.val.tmp<-append(p.val.tmp, tmp$signif)
}
#p value correction for multiple comparison
p.val.tmp<-p.adjust(p.val.tmp, p.adjust.method )
#put the corrected p values in the anosim table
tmp<-NULL
tmp2<-NULL
for(i in 1:(nlevels(g)-1)) for(j in (i+1):nlevels(g)){
tmp<-append(tmp,i)
tmp2<-append(tmp2,j)
}
for(i in 1:length(p.val.tmp)){
ano[tmp2[i],tmp[i]]<-p.val.tmp[i]}
return(ano)
}
### end
#23 Nov 2012
#L.Zinger
###################
#function MOTUtable
###################
# Generates ready-to-use MOTU tables and basic statistics on samples (i.e. sequencing depth, raw richness, and invsimpson index)
#input:
#x: an obitable output (samples should be indicated as e.g. "sample.A01r" in column names)
#y: the column name by which that data are to be aggregated. Should be e.g. "cluster" or "species_name"
#outputs:
#x.otu: the ready-to-use MOTU table
#x.rawstats: basic statistics on samples
### start
MOTUtable<-function(x, y) {
require(vegan)
nom<-as.character(substitute(x))
tmp<-x[,c(grep(y, colnames(x)), grep("sample", colnames(x)))]
tmp2<-t(aggregate(tmp[,-1], by=list(tmp[,1]), sum))
x.otu<-tmp2[-1,]
colnames(x.otu)<-paste(y,tmp2[1,], sep=".")
x.rawstats<-data.frame(Nb_ind=rowSums(x.otu), Raw_richness=specnumber(x.otu, MARGIN=1), Raw_eveness=diversity(x.otu, "invsimpson", MARGIN=1) )
#may have a pb in the rowSums depending on the R version (allows or not non-numeric)
assign(paste(nom, y, sep="."),x.otu,env = .GlobalEnv)
assign(paste(nom, y, "rawstats", sep="."),x.rawstats,env = .GlobalEnv)
}
### end
#26 Nov 2012
#F.Boyer
###################
#function reads.frequency & filter.threshold
###################
#can be used to filter the table of reads to have the sequences that represents at least 95% of the total reads by sample
#
#e.g. reads.treshold(reads.frequency(metabarcodingS4Obj@reads), 0.95)
filter.threshold <- function(v, threshold) {
o <- order(v, decreasing=T)
ind <- which(cumsum(as.matrix(v[o]))>threshold)
v[-o[seq(min(length(o), 1+length(o)-length(ind)))]] <- 0
v
}
reads.threshold <- function (reads, threshold, by.sample=T) {
res <- apply(reads, MARGIN=ifelse(by.sample, 1, 2), filter.threshold, thr=threshold)
if (by.sample) res <- t(res)
data.frame(res)
}
reads.frequency <- function (reads, by.sample=T) {
res <- apply(reads, MARGIN=ifelse(by.sample, 1, 2), function(v) {v/sum(v)})
if (by.sample) res <- t(res)
data.frame(res)
}
#06 Jan 2013
#F.Boyer
###################
#function removeOutliers
###################
#given a contengency table and a distance matrix
#returns the list of samples that should be removed in order to have only
#distances below thresold
#can't return only one sample
#
#e.g. intraBad <- lapply(levels(sample.desc$sampleName), function(group) {samples<-rownames(sample.desc)[sample.desc$sampleName==group]; removeOutliers(contingencyTable[samples,], thr=0.3, distFun = function(x) vegdist(x, method='bray'))})
#require(vegan)
removeOutliers <- function(m, thr=0.3, distFun = function(x) vegdist(x, method='bray') ) {
distMat <- as.matrix(distFun(m))
maxM <- max(distMat)
theBadGuys =c()
while (maxM>thr) {
bad <- apply(distMat, MARGIN=1, function(row, maxM) {any(row==maxM)}, maxM=maxM)
bad <- names(bad)[bad]
bad <- apply(distMat[bad,], MARGIN=1, mean)
badGuy <- names(bad)[bad==max(bad), drop=F][1]
theBadGuys <- c(theBadGuys, badGuy)
stillok <- rownames(distMat) != badGuy
distMat <- distMat[stillok, stillok, drop=F]
maxM <- max(distMat)
}
if (length(theBadGuys) >= (nrow(m)-1)) {
theBadGuys <- rownames(m)
}
theBadGuys
}
#31.05.2013
#L.Zinger
#getAttrPerS, a function allowing to get the values of a sequence attribute per sample
#(e.g. best_identities, etc...) the output is a list with one dataframe per sample.
#This dataframe contains:
# first column (named as attr): the attribute value for each sequence present in the sample
# second column (named weight): the corresponding number of reads in the sample
getAttrPerS=function(x,attr){
#x: a metabarcoding object
#attr: a character object corresponding to the attribute
#for which values per sample are needed (should be equal to a colname in x@motus)
if(class(x)[1]!= "metabarcoding.data") {
stop("x is not a metabarcoding S4 object")
}
if(is.character(attr)==F) {
stop("attr is not a character object")
}
x.motus = motus(x)
x.reads = reads(x)
otu = apply(x.reads, 1, function(y) x.motus[match(names(y[which(y!=0)]),x.motus$id), grep(attr, colnames(x.motus))])
reads = apply(x.reads, 1, function(y) y[which(y!=0)])
output = mapply(cbind, otu, reads)
output = lapply(output, function(y) {
colnames(y)=c(attr,"weight")
return(y)
})
return(output)
}
### end getAttrPerS

View File

@@ -0,0 +1,62 @@
#' @include 02_class_metabarcoding.data.R
NULL
# TODO: Add comment
#
# Author: coissac
###############################################################################
require(utils)
expand.metabarcoding.data=function(data,minread=1) {
resultonesample=function(sample) {
mo= data@reads[sample,] >= minread
s = data@samples[rep(sample,sum(mo)),]
r = as.numeric(data@reads[sample,mo])
m = data@motus[mo,]
result = data.frame(s,frequency=r,m,
stringsAsFactors =FALSE,
row.names = NULL)
result
}
res = lapply(1:data@scount, resultonesample)
do.call(rbind,res)
}
#setGeneric("utils::write.csv")
write.csv.metabarcoding.data = function(...) {
Call <- match.call(expand.dots = TRUE)
if (!is.null(Call[["minread"]])) {
minread = Call[["minread"]]
Call = Call[!names(Call)=="minread"]
}
else
minread = 1
data = eval.parent(Call[[2L]])
data = expand.metabarcoding.data(data,minread)
Call[[1L]] <- as.name("write.csv")
Call[[2L]] <- as.name("data")
eval(Call)
}
#setGeneric("utils::write.csv2")
write.csv2.metabarcoding.data = function(...) {
Call <- match.call(expand.dots = TRUE)
if (!is.null(Call[["minread"]])) {
minread = Call[["minread"]]
Call = Call[!names(Call)=="minread"]
}
else
minread = 1
data = eval.parent(Call[[2L]])
data = expand.metabarcoding.data(data,minread)
Call[[1L]] <- as.name("write.csv2")
Call[[2L]] <- as.name("data")
eval(Call)
}

View File

@@ -0,0 +1,106 @@
#' @include read.obitab.R
#' @include 02_class_metabarcoding.data.R
NULL
#' Read a data file produced by the \code{obitab} command
#'
#' Read a data file issued from the conversion of a \strong{fasta}
#' file to a tabular file by the \code{obitab} command of the
#' \strong{OBITools} package
#'
#' @param file a string containing the file name of the obitab file.
#' @param sep Column separator in the obitab file.
#' The default separator is the tabulation.
#' @param sample A regular expression allowing to identify columns
#' from the file describing abundances of sequences per sample
#' @param sample.sep Separator between combined sample name.
#' @param attribute Separator used to split between sample 'tag' and sample name.
#'
#' @return a \code{\link{metabarcoding.data}} instance
#'
#' @examples
#' require(ROBITools)
#'
#' \dontshow{# switch the working directory to the data package directory}
#' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
#'
#' # read the termes.tab file
#' termes=import.metabarcoding.data('termes.tab')
#'
#' # print the number of samples and motus described in the file
#' dim(termes)
#'
#' @seealso \code{\link{metabarcoding.data}}
#'
#' @author Eric Coissac
#' @keywords DNA metabarcoding
#' @export
#'
import.metabarcoding.data = function(file,sep='\t',sample="sample",sample.sep="\\.",attribute=":") {
data=read.obitab(file,sep=sep)
# get the colnames matching the sample pattern
column=colnames(data)
pat = paste('(^|',sample.sep,')',sample,'[',sample.sep,attribute,']',sep='')
scol= grep(pat,column)
# reads informations about samples
reads = data[,scol]
names = colnames(reads)
names = strsplit(names,split=attribute)
# for sample name just remove the first part of the col names
# usally "sample:"
sample.names = sapply(names,function(a) paste(a[-1],collapse=attribute))
reads=t(reads)
rownames(reads)=sample.names
# sample's data
sample.data = data.frame(t(data.frame(strsplit(sample.names,split=attribute))))
rownames(sample.data)=sample.names
colnames(sample.data)=strsplit(names[[1]][1],split=attribute)
# motus information
motus = data[,-scol]
motus.id = motus$id
rownames(motus)=motus.id
colnames(reads)=motus.id
return(metabarcoding.data(reads,sample.data,motus))
}
#pcr = gh[,grep('^sample',colnames(gh))]
#pcr.names = colnames(pcr)
#pcr.names = sub('sample\\.','',pcr.names)
#sequencer = rep('Solexa',length(pcr.names))
#sequencer[grep('454',pcr.names)]='454'
#sequencer=factor(sequencer)
#
#tmp = strsplit(pcr.names,'\\.[A-Z](sol|454)\\.')
#
#sample = sapply(tmp,function(x) x[1])
#locality = factor(sapply(strsplit(sample,'_'),function(x) x[1]))
#sample = factor(sample)
#repeats= factor(sapply(tmp,function(x) x[2]))
#
#tmp = regexpr('[A-Z](454|sol)',pcr.names)
#run=factor(substr(pcr.names,tmp,tmp+attr(tmp,"match.length")-1))
#
#pcr.metadata = data.frame(run,sequencer,locality,sample,repeats)
#
#rownames(pcr.metadata)=pcr.names

View File

@@ -0,0 +1,79 @@
#' @include 02_class_metabarcoding.data.R
NULL
#' Read ngsfilter text file
#'
#' Reads the text file used for assigning reads to samples with the
#' \code{ngsfilter} command of the \strong{OBITools} package.
#'
#' @param file a string containing the file name for the \code{ngsfilter} command.
#' @param platewell a string corresponding to the tag used for storing the sample location
#' in the PCR plate. Should be of the form "nbPlate_Well" (e.g. "01_A02").
#' Default is \code{NULL}
#' @return \code{\link{import.ngsfilter.data}} returns a \code{\link{data.frame}} instance
#'
#' @examples
#' \dontshow{# switch the working directory to the data package directory}
#' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
#'
#' data(termes)
#'
#' # reading the termes_ngsfilt.txt file
#' termes.ngs=import.ngsfilter.data('termes_ngsfilt.txt', platewell="position")
#'
#' # including ngsfilter data into termes data
#' attr(termes, "samples") = termes.ngs[rownames(termes),]
#'
#' colnames(termes$samples)
#'
#' @seealso \code{\link{import.metabarcoding.data}} and \code{\link{read.obitab}} for other methods of data importation
#'
#' @author Lucie Zinger
#' @keywords DNA metabarcoding
#' @export
#'
import.ngsfilter.data = function(file, platewell=NULL) {
raw = read.table(file, sep="\t")
#get samples names
names = raw[,2]
#form first part of the output table (default ngsfilter text input)
out = raw[,-c(2,3,ncol(raw))]
colnames(out) = c("Experiment", "primerF", "primerR")
#add tags
out[,c("tagF", "tagR")] = do.call("rbind", strsplit(as.vector(raw[,3]), "\\:"))
#collect nb and names of additionnal information
max.add = max(unlist(lapply(strsplit(gsub("^F @ ","", raw[, ncol(raw)]), "; "), length)))
names.add = unique(unlist(lapply(strsplit(unlist(strsplit(gsub("^F @ ","", raw[, ncol(raw)]), "; ")), "="), "[[",1)))
#form table of additionnal info
form = lapply(strsplit(gsub("^F @ ","", raw[, ncol(raw)]), "; "), strsplit, "=")
additionnals = as.data.frame(do.call("rbind", lapply(form, function(y) {
val = rep(NA, , max.add)
names(val) = names.add
val[match(unlist(lapply(y, "[[", 1)), names(val))] = gsub(";", "",unlist(lapply(y, "[[", 2)))
val
})))
#create PCR plate coordinates
if(!is.null(platewell)) {
form = strsplit(as.vector(additionnals[, platewell]), "_")
nbPlate = as.numeric(gsub("^0", "", unlist(lapply(form, "[[", 1))))
wellPlate = unlist(lapply(form, "[[", 2))
xPlate = as.numeric(gsub("[A-Z]", "", wellPlate))
yPlate = as.numeric(as.factor(gsub("[0-9]*", "", wellPlate))) + 8*nbPlate
additionnals = additionnals[,-grep(platewell, colnames(additionnals))]
out = data.frame(out, additionnals, nbPlate, wellPlate, xPlate, yPlate)
}
else {
additionnals[,ncol(additionnals)] = gsub(";","", additionnals[,ncol(additionnals)])
out = data.frame(out, additionnals)
}
rownames(out) = names
return(out)
}

View File

@@ -0,0 +1,119 @@
#' @include 02_class_metabarcoding.data.R
NULL
#
#
# Managment of layers
#
# Layers a matrix or factors with the same dimension
# than the read matrix
#
# get motus data.frames
#' @export
setGeneric("layer.names", function(obj) {
return(standardGeneric("layer.names"))
})
#' Returns the names of all the layers
#'
#' \code{layer.names} extracts the list of all the layer
#' names attached to a \code{\link{metabarcoding.data}} instance.
#'
#' @param obj a \code{\link{metabarcoding.data}} instance
#' @return a vector of type \code{character} containing the
#' list of all the layer names.
#'
#' @docType methods
#' @rdname layer.names-methods
#' @aliases layer.names-methods,metabarcoding.data
#'
setMethod("layer.names", "metabarcoding.data", function(obj) {
return(names(obj@layers))
})
#' Returns the a layer associated to a \code{\link{metabarcoding.data}}
#'
#' [[ operator Extracts a layer
#' attached to a \code{\link{metabarcoding.data}} instance.
#'
#' @usage \method{[[}{unmutable}(x,i)
#'
#' @param x a \code{\link{metabarcoding.data}} instance
#' @return matrix or a factor.
#'
#' @docType methods
#' @rdname double-open-brace-methods
#' @aliases double-open-brace-methods,metabarcoding.data
#' @method [[
#' @export
#'
setMethod("[[", "metabarcoding.data",
function(x, i, j, ...) {
if (! is.character(i))
stop('Just named index must be used')
if (i=="reads")
return(x@reads)
if (i=="samples")
return(x@samples)
if (i=="motus")
return(x@motus)
if (i=="reads")
return(x@reads)
return(x@layers[[i,exact=TRUE]])
})
#' @method $
#' @export
setMethod("$", "metabarcoding.data",
function(x, name) {
return(x[[name]])
})
# set one data layer data.frames
#' @method [[<-
#' @export
setMethod("[[<-","metabarcoding.data",
function(x, i, j, ...,value) {
if (any(dim(value)!=c(x@scount,x@mcount)))
stop("data dimmension are not coherent with this metabarcoding.data")
if (hasArg('j'))
stop('Just one dimension must be specified')
if (! is.character(i))
stop('Just named index must be used')
if (i=='reads')
stop('you cannot change the reads layer by this way')
if (i=='motus' | i=='samples')
stop('layers cannot be names motus or samples')
value = as.factor.or.matrix(value)
rownames(value)=rownames(x@reads)
colnames(value)=colnames(x@reads)
x@layers[[i]]=value
return(x)
})
#' @method $<-
#' @export
setMethod("$<-","metabarcoding.data",
function(x, name, value) {
x[[name]]=value
return(x)
})

View File

@@ -0,0 +1,378 @@
#' @include 02_class_metabarcoding.data.R
NULL
#' @export
setGeneric("marginalsum", function(data,MARGIN="sample", na.rm = FALSE) {
return(standardGeneric("marginalsum"))
})
#' Computes marginal sums over read counts.
#'
#' Method \code{marginalsum} computes marginal sums over read counts of
#' a \code{\link{metabarcoding.data}} instance.
#'
#' @param data The \code{\linkS4class{metabarcoding.data}} instance
#' on which marginal sums have to be computed.
#' @param MARGIN Indicates if the sums have to be computed across
#' samples or motus.
#' Allowed values are :
#' \itemize{
#' \item{'sample' or 1} for computing sum across samples
#' \item{'motu' or 2} for computing sum across motus
#' }
#' @param na.rm Logical. Should missing values be omitted from the
#' calculations?
#'
#' @return Returns the vector of marginal sums as a \code{numeric} vector
#'
#' @examples
#' # load termite data set from the ROBITools sample data
#' data(termes)
#'
#' # Computes marginal sums per sample
#' ssum = marginalsum(termes,MARGIN="sample")
#'
#' # Computes marginal sums per MOTU
#' msum = marginalsum(termes,MARGIN="motu")
#'
#' @seealso \code{\linkS4class{metabarcoding.data}}
#'
#' @docType methods
#' @rdname marginalsum-methods
#' @aliases marginalsum-methods,metabarcoding.data
#' @author Aurelie Bonin
#'
setMethod("marginalsum", "metabarcoding.data", function(data,MARGIN='sample', na.rm = FALSE) {
if (MARGIN == 'sample')
MARGIN=1
if (MARGIN == 'motu')
MARGIN=2
readcount = reads(data)
if (MARGIN==1)
margesum = rowSums(readcount,na.rm=na.rm)
else
margesum = colSums(readcount,na.rm=na.rm)
return(margesum)
})
rowSums.metabarcoding.data = function (x, na.rm = FALSE, dims = 1L) {
print("coucou")
}
#' @export
setGeneric("normalize", function(data,MARGIN='sample',as.matrix=FALSE) {
return(standardGeneric("normalize"))
})
#' Normalizes read counts by sample or by MOTU.
#'
#' Method \code{normalize} computes a normalized read aboundancy matrix
#' (relative frequency matrix) of a \code{\link{metabarcoding.data}} instance.
#' Normalization can be done according aboundancies per sample or per MOTU.
#'
#' @param data The \code{\linkS4class{metabarcoding.data}} instance
#' on normalisation have to be computed.
#' @param MARGIN Indicates if the sums have to be computed across
#' samples or motus.
#' Allowed values are :
#' \itemize{
#' \item{'sample' or 1} for computing sum across samples
#' \item{'motu' or 2} for computing sum across motus
#' }
#' @param as.matrix Logical indicating if the normalized aboundancies
#' must be returned as a simple \code{matrix} (TRUE) or as a new
#' instance of the \code{\linkS4class{metabarcoding.data}} class
#' (FALSE, the default case).
#'
#' @return Returns a new instance of \code{\linkS4class{metabarcoding.data}}
#' or a \code{numeric} matrix according to the \code{return.as.matrix}
#' parameter.
#'
#' @examples
#' # load termite data set from the ROBITools sample data
#' data(termes)
#'
#' # Computes normalized aboundancies per sample
#' termes.norm = normalize(termes,MARGIN="sample")
#'
#' # Computes normalized aboundancies per sample and
#' # stores the result as a new layer into the thermes
#' # structure
#' termes$normalized = normalize(termes,MARGIN="sample",as.matrix=TRUE)
#'
#' @seealso \code{\linkS4class{metabarcoding.data}}
#'
#' @docType methods
#' @rdname normalize-methods
#' @aliases normalize-methods,metabarcoding.data
#' @author Aurelie Bonin
#'
setMethod("normalize", "metabarcoding.data", function(data,MARGIN="sample",as.matrix=FALSE) {
if (MARGIN == 'sample')
MARGIN=1
if (MARGIN == 'motu')
MARGIN=2
readcount = reads(data)
margesum = marginalsum(data,MARGIN,na.rm=TRUE)
readcount = sweep(readcount,MARGIN,margesum, FUN="/")
if (as.matrix)
newdata=readcount
else
newdata = copy.metabarcoding.data(data,reads=readcount)
return(newdata)
})
#' @export
setGeneric("threshold", function(data,MARGIN="sample",threshold=0.97) {
return(standardGeneric("threshold"))
})
#' Compute the cumulative threshold of read aboundances.
#'
#' The method \code{threshold} of the class \code{\linkS4class{metabarcoding.data}}
#' computes the thresold to be used for conserving just a part of the global
#' signal. This thresold is computed by ranking aboundances by decreasing order.
#' The cululative sums of these ranked abondencies are computed and the aboundance
#' corresponding to the first sum greater than the threshold is returned as result.
#'
#' @param data The \code{\linkS4class{metabarcoding.data}} instance
#' on normalisation have to be computed.
#' @param MARGIN Indicates if the sums have to be computed across
#' samples or motus.
#' Allowed values are :
#' \itemize{
#' \item{'sample' or 1} for computing sum across samples
#' \item{'motu' or 2} for computing sum across motus
#' }
#' @param threshold a numeric value between 0 and 1 indicating which part of
#' the signal must be conserved. Default value is setup to
#' 0.97 (97% of the total signal).
#'
#' @return a numeric vector containing the limit aboundancy to consider for
#' each sample or each MOTU according to the value of the \code{MARGIN}
#' parameter.
#'
#' @examples
#' # load termite data set from the ROBITools sample data
#' data(termes)
#'
#' # computes threshold value to used for keep 95% of
#' # the reads per MOTU
#'
#' t = threshold(termes,MARGIN='motu',threshold=0.95)
#'
#' @seealso \code{\linkS4class{metabarcoding.data}}, \code{\link{threshold.mask}}
#'
#' @docType methods
#' @rdname threshold-methods
#' @aliases threshold-methods,metabarcoding.data
#' @author Aurelie Bonin
#'
setMethod("threshold", "metabarcoding.data", function(data,MARGIN="sample",threshold=0.97) {
onethreshold=function(x,threshold) {
s = x[order(-x)]
cs= cumsum(s)
total=cs[length(cs)]
if (total > 0) {
cs= cs / total
cs = cs > threshold
t = s[cs][1]
}
else t=0
return(t)
}
if (MARGIN == 'sample')
MARGIN=1
if (MARGIN == 'motu')
MARGIN=2
readcount = reads(data)
t = apply(readcount,MARGIN,onethreshold,threshold)
return(t)
})
#' @export
setGeneric("threshold.mask", function(data,MARGIN,threshold=0.97,operator='<') {
return(standardGeneric("threshold.mask"))
})
#' Computes a cumulatif thresold mask for filtering read aboundancies.
#'
#' The method \code{threshold.mask} of the class \code{\linkS4class{metabarcoding.data}}
#' computes a logical matrix of the same size than the read matrix of the data parameter.
#' Each cell of this matrix contains a \code{TRUE} or a \code{FALSE} value according to the
#' relationship existing between the read abondancy and the corresponding theshold as computed
#' by the \code{\link{theshold}} method.
#'
#' (computed value) = (read aboundancy) operator (threshold value)
#'
#' for a cell in the result matrix, \code{(read aboundancy)} is extracted from the read layer.
#' \code{operator} is a comparaison operator and \code{(threshold value)} is estimated with the
#' \code{\link{theshold}} method.
#'
#' @param data The \code{\linkS4class{metabarcoding.data}} instance
#' on normalisation have to be computed.
#' @param MARGIN Indicates if the sums have to be computed across
#' samples or motus.
#' Allowed values are :
#' \itemize{
#' \item{'sample' or 1} for computing sum across samples
#' \item{'motu' or 2} for computing sum across motus
#' }
#' @param threshold a numeric value between 0 and 1 indicating which part of
#' the signal must be conserved. Default value is setup to
#' 0.97 (97% of the total signal).
#' @param operator is a logical comparison operator.
#'
#' @return A logical matrix usable for selecting cell in the read aboundancy matrix.
#'
#' @seealso \code{\linkS4class{metabarcoding.data}}, \code{\link{threshold.mask}}, \code{\link{threshold}}
#'
#' @docType methods
#' @rdname threshold-mask-methods
#' @aliases threshold.mask-methods,metabarcoding.data
#' @author Aurelie Bonin
#'
setMethod("threshold.mask", "metabarcoding.data", function(data,MARGIN,threshold=0.97,operator='<') {
if (MARGIN == 'sample')
MARGIN=1
if (MARGIN == 'motu')
MARGIN=2
readcount = reads(data)
t = threshold(data,MARGIN,threshold)
mask = apply(readcount,c(2,1)[MARGIN],operator,t)
if (MARGIN==2)
mask = t(mask)
return(mask)
})
#' @export
setGeneric("const.threshold.mask", function(data,MARGIN,threshold=0.01,operator='<') {
return(standardGeneric("const.threshold.mask"))
})
#' Computes a constant thresold mask for filtering read aboundancies.
#'
#' The method \code{const.threshold.mask} of the class \code{\linkS4class{metabarcoding.data}}
#' computes a logical matrix of the same size than the read matrix of the data parameter.
#' Each cell of this matrix contains a \code{TRUE} or a \code{FALSE} value according to the
#' relationship existing between the read abondancy and the global theshold.
#'
#' (computed value) = (normalized read aboundancy) operator (threshold value)
#'
#' for a cell in the result matrix, \code{(normalized read aboundancy)} is extracted from the read layer
#' after normalization.
#' \code{operator} is a comparaison operator and \code{(threshold value)} is estimated with the
#' \code{\link{theshold}} method.
#'
#' @param data The \code{\linkS4class{metabarcoding.data}} instance
#' on normalisation have to be computed.
#' @param MARGIN Indicates if the sums have to be computed across
#' samples or motus.
#' Allowed values are :
#' \itemize{
#' \item{'sample' or 1} for computing sum across samples
#' \item{'motu' or 2} for computing sum across motus
#' }
#' @param threshold a numeric value between 0 and 1 indicating which part of
#' the signal must be conserved. Default value is setup to
#' 0.01 (1% of the normalized signal).
#' @param operator is a logical comparison operator.
#'
#' @return A logical matrix usable for selecting cell in the read aboundancy matrix.
#'
#' @seealso \code{\linkS4class{metabarcoding.data}}, \code{\link{threshold.mask}}, \code{\link{normalize}}
#'
#' @docType methods
#' @rdname const-threshold-mask-methods
#' @aliases const.threshold.mask-methods,metabarcoding.data
#' @author Aurelie Bonin
#'
setMethod("const.threshold.mask", "metabarcoding.data", function(data,MARGIN,threshold=0.01,operator='<') {
if (MARGIN == 'sample')
MARGIN=1
if (MARGIN == 'motu')
MARGIN=2
readcount = normalize(data,MARGIN,as.matrix=TRUE)
mask = do.call(operator,list(readcount,threshold))
return(mask)
})
#' @export
setGeneric("threshold.set", function(data,
MARGIN,
threshold=0.97,
operator='<',
value=0,
normalize=TRUE,
mask.fun=threshold.mask) {
return(standardGeneric("threshold.set"))
})
setMethod("threshold.set", "metabarcoding.data", function(data,
MARGIN,
threshold=0.97,
operator='<',
value=0,
normalize=TRUE,
mask.fun=threshold.mask) {
if (MARGIN == 'sample')
MARGIN=1
if (MARGIN == 'motu')
MARGIN=2
readcount = reads(data)
if (normalize)
data = normalize(data,c(2,1)[MARGIN])
mask = mask.fun(data,MARGIN,threshold,operator)
readcount[mask] = value
newdata = copy.metabarcoding.data(data,reads=readcount)
return(newdata)
})

407
ROBITools/R/mstat.R Normal file
View File

@@ -0,0 +1,407 @@
#' @include 02_class_metabarcoding.data.R
#' @import igraph
NULL
require(igraph)
# pos = expand.grid(x,y)
#' Computes the pairwise distance matrix as a data.frame where
#'
#' @param x a vector for the X coordinates
#' @param y a vector for the Y coordinates
#' @param labels a vector with the sample names
#'
#' @return a data.frame instance of three columns
#' - a : The label of the first sample
#' - b : The label of the second sample
#' - dist : The euclidian distance beween sample a and b
#'
#' @examples
#' data(termes)
#' termes.ok = termes[,colSums(termes$reads)>0]
#' pos = expand.grid(1:3 * 10,1:7 * 10)
#' labels = rownames(termes.ok)
#' d = dist.grid(pos[,1],pos[2],labels)
#'
#' @export
dist.grid = function(x,y,labels=NULL){
pos = data.frame(x,y)
if (is.null(labels))
labels = as.character(interaction(pos))
else
labels = as.character(labels)
llabels=length(labels)
dpos=dist(pos)
a = rep(labels[1:(llabels-1)],(llabels-1):1)
b = do.call(c,(lapply(2:llabels, function(i) labels[i:llabels])))
return(data.frame(a,b,dist=as.vector(dpos)))
}
#' Builds the list of sample groups included in a circle around a central sample
#'
#' @param dtable a distance table between samples as
#' computed by \code{\link{dist.grid}}
#' @param radius the radius of the circle
#' @param center a \code{logical} value indicating if the center of
#' the group must be included in the group
#'
#' @return a list of vectors containing the labels of the group members
#'
#' @examples
#' data(termes)
#' termes.ok = termes[,colSums(termes$reads)>0]
#' pos = expand.grid(1:3 * 10,1:7 * 10)
#' labels = rownames(termes.ok)
#' d = dist.grid(pos[,1],pos[2],labels)
#' groups = dist.center.group(d,20)
#'
#' @export
dist.center.group=function(dtable,radius,center=TRUE) {
fgroup = function(c) {
ig = dtable[(dtable[,1]==c | dtable[,2]==c) & dtable[,3] <= radius,]
return(union(ig[,1],ig[,2]))
}
pos = as.character(union(dtable[,1],dtable[,2]))
g = lapply(pos,fgroup)
names(g) = pos
if (!center)
g = mapply(setdiff,g,pos)
return(g)
}
#' Builds the list of sample groups including samples closest than a define distance
#'
#' A graph is build by applying the threshold \code{dmax} to the distance matrix
#' A group is a clique max in this graph. Consequently all member pairs of a group
#' are distant by less or equal to \code{dmax}.
#'
#' @param dtable a distance table between samples as
#' computed by \code{\link{dist.grid}}
#' @param dmax the maximum distance between two samples
#'
#' @return a list of vectors containing the labels of the group members
#'
#' @examples
#' data(termes)
#' termes.ok = termes[,colSums(termes$reads)>0]
#' pos = expand.grid(1:3 * 10,1:7 * 10)
#' labels = rownames(termes.ok)
#' d = dist.grid(pos[,1],pos[2],labels)
#' groups = dist.clique.group(d,20)
#'
#' @export
dist.clique.group=function(dtable,dmax,center=True) {
gp = igraph::graph.edgelist(as.matrix(dtable[dtable$dist <= dmax,c('a','b')]),directed=FALSE)
g = igraph::maximal.cliques(gp)
return(lapply(g, function(i) igraph::V(gp)$name[i]))
}
#' Computes the univariate M statistics
#'
#' @param w the weigth matrix indicating the presence probability of each motu
#' in each samples. Each line corresponds to a sample and each column
#' to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
#' names. It is nice but not mandatory if the \code{colnames} refer to the MOTU id.
#'
#' @param groups the list of considered groups as computed by the \code{\link{dist.center.group}}
#' function
#'
#' @seealso \code{\link{dist.center.group}}
#' @seealso \code{\link{m.weight}}
#'
#' @examples
#' data(termes)
#' termes.ok = termes[,colSums(termes$reads)>0]
#' pos = expand.grid(1:3 * 10,1:7 * 10)
#' labels = rownames(termes.ok)
#' d = dist.grid(pos[,1],pos[2],labels)
#' groups = dist.center.group(d,20)
#' w = m.weight(termes.ok)
#' m = m.univariate(w,groups)
#'
#' @references Marcon, E., Puech, F., and Traissac, S. (2012).
#' Characterizing the relative spatial structure of point patterns.
#' International Journal of Ecology, 2012.
#'
#' @export
m.univariate = function(w,groups) {
nunivar = function(members,center) {
g = w[members,]
wn = colSums(g)
wa = sum(wn)
wn = wn - center
wa = wa - center
p = wn / wa * center
return(p)
}
centers = lapply(names(groups),function(x) w[x,])
Wf = colSums(w)
Wa = sum(Wf)
Denom.univar = colSums(w * (sweep(-w,2,Wf,'+') / (Wa - w)))
Num.univar = rowSums(mapply(nunivar,groups,centers))
Munivar=Num.univar/Denom.univar
Munivar[Denom.univar==0]=0
return(Munivar)
}
#' Computes the bivariate M statistics
#'
#' The function computes the bivariate M statiscics for a set of target species around a set of
#' focus species.
#'
#' @param w1 the weigth matrix indicating the presence probability of each motu
#' used as focus species in each samples. Each line corresponds to a sample and each column
#' to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
#' names. It is nice but not mandatory if the \code{colnames} refer to the MOTU id.
#'
#' @param w2 the weigth matrix indicating the presence probability of each motu
#' used as target species in each samples. Each line corresponds to a sample and each column
#' to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
#' names. It is nice but not mandatory if the \code{colnames} refer to the MOTU id.
#' if \code{w2} is not set, w1 is also used as target species. in this case the diagonal
#' of the matrix return contains the univariate M statistic for the diferent species.
#'
#' @param groups the list of considered groups as computed by the \code{\link{dist.center.group}}
#' function
#'
#' @return a matrix of M bivariate statistics with one focus species by row and one target species
#' by columns If \code{w2} is not specified the diagonal of the matrix is equal to the univariate
#' M statistic of the corresponding species.
#'
#' @seealso \code{\link{dist.center.group}}
#' @seealso \code{\link{m.weight}}
#'
#' @examples
#' data(termes)
#' termes.ok = termes[,colSums(termes$reads)>0]
#' pos = expand.grid(1:3 * 10,1:7 * 10)
#' labels = rownames(termes.ok)
#' d = dist.grid(pos[,1],pos[2],labels)
#' groups = dist.center.group(d,20)
#' w = m.weight(termes.ok)
#' m = m.bivariate(w,groups)
#'
#' @references Marcon, E., Puech, F., and Traissac, S. (2012).
#' Characterizing the relative spatial structure of point patterns.
#' International Journal of Ecology, 2012.
#'
#' @export
m.bivariate = function(w1,w2=NULL,groups) {
nunbivar = function(members,center) {
g = w2[members,]
wn = colSums(g)
wa = sum(wn)
if (self){
mwn = wn %*% t(rep(1,length(wn)))
diag(mwn)= wn - center
wa = wa - center
wna = mwn/wa
p = sweep(wna,2,center,'*')
#p = center %*% wna
}
else {
wna= matrix(wn/wa,nrow=1)
p = center %*% wna
}
return(p)
}
if (is.null(w2)){
self = TRUE
w2=w1
}
else {
self = FALSE
}
centers = lapply(names(groups),function(x) w[x,])
Wf = colSums(w1)
Wn = colSums(w2)
Wa = sum(Wn)
if (self){
Wn = sweep(-w1,2,Wn,'+')
Wna = Wn/(Wa - w1)
Denom.bivar = t(w1) %*% Wna
}
else {
Wna= t(Wn/Wa)
Denom.bivar = Wf %*% Wna
}
Num.bivar = matrix(0,nrow=ncol(w1),ncol=ncol(w2))
ng = length(groups)
for (i in 1:ng) {
Num.bivar = Num.bivar + nunbivar(groups[[i]],centers[[i]])
}
Mbivar=Num.bivar/Denom.bivar
Mbivar[Denom.bivar==0]=0
return(Mbivar)
}
#' Computes a weigth matrix from a \code{\linkS4class{metabarcoding.data}}
#'
#' The weight can be considered as a propability of presence of a MOTU in a
#' given sample. This function defines this probability as the fraction of
#' the maximal occurrence frequency over all samples.
#'
#' @param data a \code{\linkS4class{metabarcoding.data}} instance
#'
#' @return a weight matrix usable for M statistics
#'
#' @examples
#' data(termes)
#' termes.ok = termes[,colSums(termes$reads)>0]
#' w = m.weight(termes.ok)
#'
#' @export
m.weight = function(data) {
ndata = normalize(data,MARGIN='sample')
fmax=apply(ndata$reads,2,max)
w = sweep(ndata$reads,2,fmax,'/')
rownames(w)=rownames(ndata)
colnames(w)=colnames(ndata)
return(w)
}
#' Simulate null distribion of the M statistics by Monte-Carlo
#'
#' Computes the null empirical distribution of the M statistics
#' by shuffling MOTUs among location.
#'
#' @param w the weigth matrix indicating the presence probability of each motu
#' in each samples. Each line corresponds to a sample and each column
#' to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
#' names.
#' @param groups the list of considered groups as computed by the \code{\link{dist.center.group}}
#' function
#' @param resampling the number of simulation to establish the null distribution
#'
#' @return a matrix of M score under the null hypothesis of random distribution of MOTUs
#' with a MOTUs per line and a culumn per simulation
#'
#' @examples
#' data(termes)
#' termes.ok = termes[,colSums(termes$reads)>0]
#' pos = expand.grid(1:3 * 10,1:7 * 10)
#' labels = rownames(termes.ok)
#' d = dist.grid(pos[,1],pos[2],labels)
#' groups = dist.center.group(d,20)
#' w = m.weight(termes.ok)
#' dnull = dm.univariate(w,groups)
#'
#' @export
dm.univariate = function(w,groups,resampling=100) {
shuffle = function(w){
wr =apply(w,2,function(y) sample(y,length(y),replace=FALSE))
rownames(wr)=rownames(w)
return(wr)
}
msim = function(x) {
return(m.univariate(shuffle(w),groups))
}
dnull = mapply(msim,1:resampling)
rownames(dnull) = colnames(w)
return(dnull)
}
#' Test the significance of the M statistics by Monte-Carlo
#'
#' Computes computes the p.value the M statistics asociated to a MOTU
#' by shuffling MOTUs among location.
#'
#' @param w the weigth matrix indicating the presence probability of each motu
#' in each samples. Each line corresponds to a sample and each column
#' to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
#' names.
#' @param groups the list of considered groups as computed by the \code{\link{dist.center.group}}
#' function
#' @param resampling the number of simulation to establish the null distribution
#'
#' @param alternative a character value in \code{c('two.sided','less','greater')}
#' - two.sided : the m stat is check against the two side of the empirical
#' M distribution
#' - less : test if the M stat is lesser than the M observed in the the empirical
#' M distribution (exlusion hypothesis)
#' - greater : test if the M stat is greater than the M observed in the the empirical
#' M distribution (aggregation hypothesis)
#'
#' @return a vector of p.value with an attribute \code{m.stat} containing the actual M stat
#' for each MOTUs
#'
#' @examples
#' data(termes)
#' termes.ok = termes[,colSums(termes$reads)>0]
#' pos = expand.grid(1:3 * 10,1:7 * 10)
#' labels = rownames(termes.ok)
#' d = dist.grid(pos[,1],pos[2],labels)
#' groups = dist.center.group(d,20)
#' w = m.weight(termes.ok)
#' pval = m.univariate.test(w,groups)
#'
#' @export
m.univariate.test = function(w,groups,resampling=100,alternative='two.sided') {
dnull = dm.univariate(w,groups,resampling)
m = m.univariate(w,groups)
pnull = sapply(1:dim(dnull)[1],function(y) 1 - ecdf(dnull[y,])(m[y]))
p.value=NULL
if (alternative=='two.sided') {
p.value = mapply(min,pnull,1 - pnull)
}
if (alternative=='less') {
p.value = pnull
}
if (alternative=='greater') {
p.value = 1 - pnull
}
# Set p.value to 1 if the MOTU occurres in only one place
n = colSums(w > 0)
p.value[n==1]=1
names(p.value) = colnames(w)
attr(p.value,'m.stat')=m
return(p.value)
}

118
ROBITools/R/obiclean.R Normal file
View File

@@ -0,0 +1,118 @@
#' @include 02_class_metabarcoding.data.R
NULL
# TODO: Add comment
#
# Author: coissac
###############################################################################
#' @export
setGeneric("extracts.obiclean", function(obj) {
return(standardGeneric("extracts.obiclean"))
})
#' Extracts the obiclean results
#'
#' The method \code{extracts.obiclean} of the class \code{\linkS4class{metabarcoding.data}}
#' extracts \code{obiclean} results from the MOTUs descriptions include in the
#' \code{\linkS4class{metabarcoding.data}} instance.
#' When an \code{obitab} file is imported using the \code{\link{import.metabarcoding.data}}
#' if \code{obiclean} results are present in the file they are stored in the
#' \code{motu} data.frame. By calling this methods, MOTU descriptors describing
#' the \code{obiclean} status are moved to a set of layers.
#'
#' @param obj the \code{\linkS4class{metabarcoding.data}} to analyze
#'
#' @return the modified \code{\linkS4class{metabarcoding.data}} instance
#'
#' @examples
#'
#' # load termite data set from the ROBITools sample data
#' data(termes)
#'
#' # shows the initial list of layer names
#' layer.names(t)
#'
#' # extracts the obiclean status
#' termes = extracts.obiclean(termes)
#'
#' # shows the name of the newly created layers
#' layer.names(t)
#'
#'
#'
#' @seealso \code{\linkS4class{metabarcoding.data}}, \code{\link{threshold.mask}}, \code{\link{normalize}}
#'
#' @docType methods
#' @rdname extracts-obiclean-methods
#' @aliases extracts.obiclean-methods,metabarcoding.data
#' @author Eric Coissac
#'
setMethod("extracts.obiclean", "metabarcoding.data", function(obj) {
pat = "^obiclean_status:.*$"
cols = colnames(obj@motus)
cleancols = grep(pat,cols)
clean.names=cols[cleancols]
p = grep(pat,cols)
d = t(as.factor.or.matrix(obj@motus[,p]))
n = sapply(strsplit(cols[p],':'),function(y) y[[2]])
rownames(d)=n
d = d[rownames(obj@reads),]
obj[["obiclean_status"]]=d
newmotus = obj@motus[-cleancols]
pat = "^obiclean_count:.*$"
cols = colnames(newmotus)
cleancols = grep(pat,cols)
clean.names=cols[cleancols]
p = grep(pat,cols)
d = t(as.factor.or.matrix(newmotus[,p]))
n = sapply(strsplit(cols[p],':'),function(y) y[[2]])
rownames(d)=n
d = d[rownames(obj@reads),]
obj[["obiclean_count"]]=d
newmotus = newmotus[-cleancols]
pat = "^obiclean_cluster:.*$"
cols = colnames(newmotus)
cleancols = grep(pat,cols)
clean.names=cols[cleancols]
p = grep(pat,cols)
d = t(as.factor.or.matrix(newmotus[,p]))
n = sapply(strsplit(cols[p],':'),function(y) y[[2]])
rownames(d)=n
d = d[rownames(obj@reads),]
obj[["obiclean_cluster"]]=d
newmotus = newmotus[-cleancols]
newdata = copy.metabarcoding.data(obj,motus=newmotus)
return(newdata)
})
#' @export
setGeneric("extracts.obiclean_cluster", function(obj) {
return(standardGeneric("extracts.obiclean_cluster"))
})
setMethod("extracts.obiclean_cluster", "metabarcoding.data", function(obj) {
obiclean = extracts.obiclean(obj)
obihead = obiclean[,! is.na(obiclean$motus$obiclean_head)]
obihead$obiclean_count[is.na(obihead$obiclean_count)]=0
reads = obihead$obiclean_count
l = obihead@layers[layer.names(obihead) != "obiclean_count"]
newdata = copy.metabarcoding.data(obihead,reads=reads,layers=l)
return(newdata)
}
)

0
ROBITools/R/pcrslayer.R Normal file
View File

View File

@@ -0,0 +1,84 @@
#' @include 02_class_metabarcoding.data.R
NULL
#' Plot PCR plates
#'
#' Plots samples localization in PCR plates, and points out problematic samples if provided.
#'
#' @param x a \code{\link{metabarcoding.data}} object
#' @param samples a character vector containing names of problematic samples. Default is \code{NULL}
#' @param different a boolean indicating whether different tags where used in forward and reverse to identify samples. Default is \code{TRUE}
#' @param ... arguments ot be passed to methods, such as graphical parameters
#'
#' @return \code{\link{plot.PCRplate}} returns a plot displaying no more than 4 PCR plates, with problematic sample localization
#'
#' @examples
#' \dontshow{# switch the working directory to the data package directory}
#' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
#'
#' data(termes)
#'
#' # reading the termes_ngsfilt.txt file
#' termes.ngs=import.ngsfilter.data('termes_ngsfilt.txt', platewell="position")
#'
#' # including ngsfilter data into termes data
#' attr(termes, "samples") = termes.ngs[rownames(termes),]
#'
#' #plot PCR plate plan
#' col = rep("green", nrow(termes))
#' col[grep("r", rownames(termes))] = "red"
#' plot.PCRplate(termes, col=col)
#'
#' #highlighting location of samples with low identification score
#'
#' #low quality taxonomic assignements identification
#' library(plotrix)
#' weighted.hist(termes$motus$best_identity, colSums(termes$reads), breaks = 20, ylab = "Nb reads", xlab = "Ecotag scores", xaxis=F)
#' axis(1, labels = T)
#' lowqual.seq = rownames(termes$motus)[termes$motus$best_identity < 0.7]
#'
#' #identification and localization (in PCR plate) of samples with high proportions of low quality taxonomic assignements
#' termes.freq= normalize(termes, MARGIN=1)$reads
#' hist(log10(rowSums(termes.freq[,lowqual.seq]) + 1e-05), breaks = 20, xlab = "Prop low quality reads")
#' lowqual.sample = rownames(termes)[log10(rowSums(termes.freq[, lowqual.seq]) + 1e-05) > -0.5]
#'
#' plot.PCRplate(termes, lowqual.sample, col=col)
#'
#' @seealso \code{\link{import.metabarcoding.data}}
#'
#' @author Lucie Zinger
#' @keywords DNA metabarcoding
#' @export
#'
plot.PCRplate = function(x, samples=NULL, col="cyan2", different=T, ...) {
if(length(grep("xPlate", colnames(x$samples)))==0 |
length(grep("yPlate", colnames(x$samples)))==0) {
stop("samples/controls position in PCR plates (xPlate and yPlate) are not defined")
}
if(length(grep("tagF", colnames(x$samples)))==0 |
length(grep("tagR", colnames(x$samples)))==0) {
stop("tags (tagF and tagR) are not defined")
}
nplate = max(x$samples$nbPlate)
if(nplate>4) {
stop("Cannot plot more than 4 plates")
}
plot(x$samples$xPlate, -x$samples$yPlate, pch=19, xaxt="n", yaxt="n", col=col,
xlim=c(-5,17), ylab="y plate", xlab= "x plate", ylim=c(-4.5*8-5,0), ...)
if(different==T) {
text(-3, -unique(x$samples$yPlate[order(x$samples$yPlate)]), unique(x$samples$tagF[order(x$samples$yPlate)]), cex=0.5)
text(unique(x$samples$xPlate[order(x$samples$xPlate)]), -5, unique(x$samples$tagR[order(x$samples$xPlate)]), cex=0.5, srt=90)
}
abline(h=-seq(8.5,8*nplate+0.5,8), lty=2, col="grey")
segments(c(0,13), rep(min(-x$samples$yPlate),2), c(0,13), c(0,0), lty=2, col="grey")
#plot problematic samples
if(!is.null(samples)) {
points(x$samples[samples,"xPlate"], -x$samples[samples,"yPlate"], pch="x")
}
}

View File

@@ -0,0 +1,105 @@
#' @include 02_class_metabarcoding.data.R
NULL
#' Plot sequence abundance in samples
#'
#' Plots relative abundances of a set of sequences in all samples (log10 transformed)
#'
#'
#' @param x a \code{\link{metabarcoding.data}} object
#' @param seqset a vetcor with sequences names
#' @param seqtype a string indicating what type of sequences are displayed
#' @param controls a vector indicating the negative controls names in the x object.
#' Default is \code{NULL}
#'
#' @return returns a plot with the log10 transformed relative porportion of
#' selected MOTUs in each samples. If the number of samples is > 96,
#' then the plot is displayed in 4 panels
#'
#' @examples
#'
#' data(termes)
#'
#' seqset = rownames(termes$motus)[which(termes$motus$genus_name=="Anoplotermes")]
#' plot.seqinsample(termes, seqset, "Anoplotermes")
#'
#' controls = rownames(termes)[grep("r", rownames(termes))]
#' seqset = rownames(termes$motus)[which(termes$motus$best_identity<0.7)]
#' plot.seqinsample(termes, seqset, "Not assigned", controls)
#'
#' @seealso \code{\linkS4class{taxonomy.obitools}}, and method \code{\link{taxonmicank}}
#'
#' @author Lucie Zinger
#' @keywords metabarcoding
#'
#' @export
#'
plot.seqinsample = function(x, seqset, seqtype, controls=NULL){
require(vegan)
x.freq = vegan::decostand(x$reads,"total",1)
if(!is.null(controls)){
controls.ind = match(controls, rownames(x.freq))
}
if(nrow(x.freq)>96){
x.freq.parse = seq(0,round(nrow(x$samples), digit=0),
round(nrow(x$samples)/4, digit=0))
layout(matrix(c(1,2,3,1,4,5),3,2), height=c(0.3,1,1))
par(oma=c(1,1,1,0), mar=c(3,3,1,1))
#legend
breaks = seq(log10(1e-4),log10(1), length.out=100)
plot(breaks, rep(1,100), col=topo.colors(100), pch=15, cex=2, ylim=c(0,1.5),
xaxt="n", yaxt="n", bty='n')
text(breaks[seq(1,100,10)], rep(0.7,length(seq(1,100,10))),
round(10^breaks[seq(1,100,10)],4))
mtext("Seqence frequencies:", side=3, line=0, cex=0.8)
#plot
for(i in 1:(length(x.freq.parse)-1)) {
range = (x.freq.parse[i]+1):(x.freq.parse[i]+round(nrow(x$samples)/4, digit=0))
mat = x.freq[range,seqset]
image(log10(mat),col = topo.colors(100), xaxt="n", yaxt="n", breaks=c(breaks,0))
if(!is.null(controls)){
if(length(na.omit(match(controls.ind, range)))!=0){
abline(v=seq(0,1,l=round(nrow(x$samples)/4, digit=0))[match(controls.ind, range)],col="red", lty=3)
}}
axis(side=1,at=seq(0,1,l=round(nrow(x$samples)/4,digit=0)),
labels=rownames(x$samples)[range],
las=2, cex.axis=0.3)
}
mtext(side=2, paste(seqtype, "n = ", length(seqset)), outer=T, cex=0.7, font=3)
mtext(side=1, "Samples", cex=0.7, outer=T)
} else {
layout(matrix(c(1,2,1,2),2,2), height=c(0.3,1))
par(oma=c(1,1,1,0), mar=c(3,3,1,1))
#legend
breaks = seq(log10(1e-4),log10(1), length.out=100)
plot(breaks, rep(1,100), col=topo.colors(100), pch=15, cex=2, ylim=c(0,1.5),
xaxt="n", yaxt="n", bty='n')
text(breaks[seq(1,100,10)], rep(0.7,length(seq(1,100,10))),
round(10^breaks[seq(1,100,10)],4))
mtext("Seqence frequencies:", side=3, line=0, cex=0.8)
image(log10(x.freq[,seqset]),col = topo.colors(100), xaxt="n", yaxt="n", breaks=c(breaks,0))
if(!is.null(controls)){
abline(v=seq(0,1,l=round(nrow(x$samples), digit=0))[controls.ind],col="red", lty=3)
}
axis(side=1,at=seq(0,1,l=round(nrow(x$samples),digit=0)),
labels=rownames(x$samples),
las=2, cex.axis=0.3)
mtext(side=2, paste(seqtype, "n = ", length(seqset)), outer=T, cex=0.7, font=3)
mtext(side=1, "Samples", cex=0.7, outer=T)
}
}

99
ROBITools/R/rarefy.R Normal file
View File

@@ -0,0 +1,99 @@
#' @include 02_class_metabarcoding.data.R
NULL
# TODO: Add comment
#
# Author: coissac
###############################################################################
#' @export
setGeneric("rarefy", function(x,n,first.pass=0.95,pseudo.count=0,...) {
return(standardGeneric("rarefy"))
})
setMethod("rarefy", "ANY", function(x,n,first.pass=0.95,pseudo.count=0,sum=NA) {
if (is.na(sum))
sum=sum(x)
if (sum < sum(x))
stop("sum parameter must be greater or equal to sum(x)")
grey = sum-sum(x)
probs = x + pseudo.count
if (grey > 0)
probs = c(probs,grey)
# Just to ensure at least one execution of the loop
n1 = n * 2
while(n1 > n)
n1 = rpois(1,n * first.pass)
rep1 = as.vector(rmultinom(1,n1,probs))
n2 = sum(rep1)
levels = 1:length(probs)
rep2= as.vector(table(factor(sample(levels,
n - n2,
replace=TRUE,
prob = probs),
levels=levels)))
rep1 = (rep1 + rep2)
if (grey > 0)
rep1 = rep1[-length(rep1)]
return(rep1)
})
setMethod("rarefy", "metabarcoding.data", function(x,n,first.pass=0.95,pseudo.count=0,MARGIN='sample') {
if (MARGIN == 'sample')
MARGIN=1
if (MARGIN == 'motu')
MARGIN=2
dreads= dim(x@reads)
rreads= matrix(0,nrow = dreads[1] , ncol = dreads[2])
if (MARGIN == 1)
for (i in 1:dreads[1]) {
rreads[i,]=rarefy(x@reads[i,],
n=n,
first.pass=first.pass,
pseudo.count=pseudo.count)
}
# rreads = t(apply(reads,1,rarefy,n=n,
# first.pass=first.pass,
# pseudo.count=pseudo.count))
else
for (i in 1:dreads[2]) {
rreads[,i]=rarefy(x@reads[,i],
n=n,
first.pass=first.pass,
pseudo.count=pseudo.count)
}
# rreads = as.matrix(apply(reads,2,rarefy,n=n,
# first.pass=first.pass,
# pseudo.count=pseudo.count))
rreads=as.matrix(rreads)
rownames(rreads) = rownames(x@reads)
colnames(rreads) = colnames(x@reads)
newdata = copy.metabarcoding.data(x,reads=rreads)
return(newdata)
})

View File

@@ -0,0 +1,56 @@
#' @include 02_class_metabarcoding.data.R
NULL
#' Read an OBITools ngsfilter file
#'
#' Reads a ngsfilter file as formatted for the OBITools. For now, needs to be tab delimited till the "F" column.
#' Any additionnal information needs to be space delimited.
#'
#' @seealso \code{\link{import.metabarcoding.data}}
#' @author Lucie Zinger
#' @keywords data import
#' @export
#'
read.ngsfilter <- function(filename, decimal='.', as.is=!stringsAsFactors, stringsAsFactors = default.stringsAsFactors()) {
t<-read.table(file=filename, header=F, sep="\t", as.is=T)
beg <- t[,1:5]
colnames(beg) <- c('experiment','sample','tags','forward_primer','reverse_primer')
if (length(unique(beg$sample))==nrow(beg))
rownames(beg) <- beg$sample
end <- t[,c(2,6)]
#F <- unlist(lapply(end$V6, function(x) strsplit(x,"@")[[1]][1]))
rawextras <- unlist(lapply(end$V6, function(x) strsplit(x,"@")[[1]][2]))
rawextras <- lapply(rawextras, function(s) strsplit(s, '; ')[[1]])
rawextras <- lapply(rawextras, function(l) unlist(lapply(l, function(s) sub("^ +","",s))))
rawextras <- lapply(rawextras, function(l) unlist(lapply(l, function(s) sub(" +$","",s))))
rawextras <- lapply(rawextras, function(l) unlist(lapply(l, function(s) strsplit(s,"="))))
columnnames <- unique(unlist(lapply(rawextras, function(l) l[seq(1,length(l),2)])))
m <- matrix(nrow=nrow(end), ncol=length(columnnames))
colnames(m) <- columnnames
m <- as.data.frame(m)
#print(head(rawextras))
tt <- lapply(rawextras, function(l) list(l[seq(1,length(l),2)],l[seq(2,length(l),2)]))
invisible(lapply(1:length(tt), function(i){m[i,tt[[i]][[1]]] <<- tt[[i]][[2]]}))
invisible(lapply(colnames(m), function(n) m[,n] <<- type.convert(m[,n], dec=decimal, as.is=as.is)))
ngs = cbind(beg, m)
rownames(ngs) = ngs$sample
class(ngs)<-c('ngsfilter.data',class(ngs))
return(ngs)
}

39
ROBITools/R/read.obitab.R Normal file
View File

@@ -0,0 +1,39 @@
#' @include 02_class_metabarcoding.data.R
NULL
#' Reads a data file produced by the obitab command
#'
#' Read a data file issued from the convertion of a fasta
#' file to a tabular file by the obitab command
#'
#' @param file a string containing the file name of the obitab file.
#' @param sep Column separator in the obitab file.
#' The default separator is the tabulation.
#'
#' @return a \code{data.frame} instance containing the obitab file
#'
#' @examples
#' require(ROBITools)
#'
#' \dontshow{# switch the working directory to the data package directory}
#' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
#'
#' # read the termes.tab file
#' termes=read.obitab('termes.tab')
#'
#' # print the dimensions of the data.frame
#' dim(termes)
#'
#' @seealso \code{\link{import.metabarcoding.data}}
#' @author Eric Coissac
#' @export
#'
read.obitab <-
function(filename,sep='\t') {
data=read.delim(filename,sep=sep,strip.white=T,check.names =F)
data
}

View File

@@ -0,0 +1,17 @@
#' @include 02_class_metabarcoding.data.R
NULL
# TODO: Add comment
#
# Author: coissac
###############################################################################
read.sumatra = function(filename) {
data = read.table(filename,sep="\t",header=FALSE)
score = data[,3]
name.first = mapply(min,as.character(s[,1]),as.character(s[,2]))
name.second= mapply(max,as.character(s[,1]),as.character(s[,2]))
sname = as.character(interaction(data[,1],data[,2]))
}

123
ROBITools/R/s3objects.R Normal file
View File

@@ -0,0 +1,123 @@
# TODO: Add comment
#
# Author: coissac
###############################################################################
#' Adds a class into the class hierarchie attribute.
#'
#' \code{addS3Class} adds a new class name to the vector
#' of class associated to the object. This the way to
#' assign an object to an S3 class. \code{addS3Class} add
#' the new class name in front of the class vector
#'
#' @param object the object to modify
#' @param classname the name of the new class
#'
#' @return the object given as parametter casted to the new
#' class
#'
#' @examples
#' x = c(1,3,2,5)
#' x = addS3Class(x,"my.vector")
#' class(x)
#'
#' @seealso \code{\link{rmS3Class}}
#'
#' @note for efficiency purpose no check is done on the input
#' parametters
#'
#' @keywords system function
#'
#' @author Eric Coissac
#' @export
#'
addS3Class = function(object,classname) {
class(object) = c(classname,class(object))
return(object)
}
#' Removes a class from the class hierarchie attribute.
#'
#' \code{rmS3Class} removes a class name from the vector
#' of class associated to the object. This the way to
#' remove the association between an object and a S3 class.
#'
#' @param object the object to modify
#' @param classname the name of the class to remove
#'
#' @return the object given as parametter.
#'
#' @examples
#' x = c(1,3,2,5)
#' x = addS3Class(x,"my.vector")
#' class(x)
#' x = rmS3Class(x,"my.vector")
#' class(x)
#'
#' @seealso \code{\link{addS3Class}}
#'
#' @note for efficiency purpose no check is done on the input
#' parametters
#'
#' @keywords system function
#'
#' @author Eric Coissac
#' @export
#'
rmS3Class = function(object,classname) {
c = class(object)
if (! is.null(c))
index = match(classname,c)
class(object)=c[-index]
return(object)
}
#' create basic functions to manipulate a new S3 class
#'
#' createS3Class function create in the \code{package:ROBITools}
#' environment an \code{is.xxx} function and an \code{as.xxx} function
#' allowing to test if an abject belong the class \code{xxx} and to add
#' the class \code{xxx} to the class list of an object. \code{xxx} is a
#' generic class name that is specified through the \code{classname}
#' argument of the function.
#'
#' @param classname a \code{character string} indicating the name
#' of the new class.
#'
#' @examples
#'
#' # Create a new S3 class named mynewclass
#' createS3Class('mynewclass')
#'
#' #create a new vector object
#' x=c(1,4,6)
#'
#' # test if it belongs the new class, that is false
#' is.mynewclass(x)
#'
#' # Associate x to the new class
#' as.mynewclass(x)
#'
#' # test again if x belongs the new class, that is now true
#' is.mynewclass(x)
#'
#' @seealso \code{\link{rmS3Class}}
#'
#' @note Take care that the new functions are created in the
#' \code{package:ROBITools} environment.
#'
#' @keywords system function
#'
#' @author Eric Coissac
#' @export
#'
createS3Class = function(classname) {
is.class = function(object) any(class(object)==classname)
as.class = function(object) return(addS3Class(object,classname))
assign(paste('is',classname,sep="."),is.class,envir=globalenv())
assign(paste('as',classname,sep="."),as.class,envir=globalenv())
}

89
ROBITools/R/taxoDBtree.R Normal file
View File

@@ -0,0 +1,89 @@
#'@include 02_class_metabarcoding.data.R
#'@import ROBITaxonomy
NULL
#' Construct a taxonomic tree from a list of taxa
#'
#' Construct a graph from a table containing the taxonomic path of sequences
#'
#'
#' @param x a table containing the taxonomic path of the references. Typically an output from get.classic.taxonomy
#'
#' @return g a directed graph displaying the taxonomy hierarchy of the input data. Stored in a \code{\link{igraph}} object
#' where the taxonomic ranks of the vertices are added to the vertices attributes
#'
#' @examples
#'
#' data(termes)
#'
#' taxo=default.taxonomy()
#'
#' termes.taxo.table = get.classic.taxonomy(termes, taxo, "taxid")
#' head(termes.taxo.table)
#'
#' graph.tax.termes = dbtree(termes.taxo.table[,1:7])
#' library(igraph)
#'
#' #plot the tree
#' coord = layout.reingold.tilford(graph.tax.termes, root=1, circular=F)
#' v.cex = as.factor(V(graph.tax.termes)$rank)
#' levels(v.cex) = match(levels(v.cex), colnames(termes.taxo.table))
#' plot(graph.tax.termes, vertex.size=1, vertex.label.cex=2*(as.numeric(as.vector(v.cex))^-1), edge.arrow.size=0, layout=coord)
#'
#'
#' #Vizualization with sequence counts
#' tax.count = log10(colSums(termes$reads)[match(as.vector(V(graph.tax.termes)$name), termes$motus$scientific_name)])
#' tax.count[is.na(tax.count)|tax.count<0] = 0.01
#' V(graph.tax.termes)$count = unname(tax.count)
#'
#' plot(graph.tax.termes, vertex.size=V(graph.tax.termes)$count, vertex.label.cex=2*(as.numeric(as.vector(v.cex))^-1), edge.arrow.size=0, layout=coord)
#'
#'
#' @seealso \code{\link{get.classic.taxonomy}}
#' @author Lucie Zinger
#' @export
dbtree = function(x) {
#dealing with noranks
x2 = x
for (i in 1:ncol(x2)) {
x2[,i] = as.character(x[,i])
if(length(which(is.na(x[,i])==T))!=0) {
if(i==1) {
x2[which(is.na(x[,i])==T),i] = "NR"
} else {
x2[which(is.na(x[,i])==T),i] = as.character(x2[,i-1])[which(is.na(x2[,i])==T)]
}
}
}
#prepare an edgelist
edgelist = list()
for (i in 1:(ncol(x2)-1)){
out = x2[,c(i,i+1)]
out2 = out[-which(duplicated(out)==T),]
colnames(out2) = c("parent", "kid")
edgelist[[i]] = out2[which(out2[,1]!=out2[,2]),]
}
edgelist = do.call("rbind", edgelist)
#construct the graph
g = igraph::graph.edgelist(as.matrix(edgelist), directed=T)
#get taxorank for each taxa
ranks = do.call("rbind", lapply(1:ncol(x), function(y) {
out = cbind(unique(as.character(x[,y])), colnames(x)[y])
out
}))
#Assign nodes to taxorank
igraph::V(g)$rank = ranks[match(igraph::V(g)$name, ranks[,1]),2]
return(g)
}

View File

@@ -0,0 +1,74 @@
#' @import ROBITaxonomy
#' @include 02_class_metabarcoding.data.R
NULL
#' Dataset taxonomic resolution summary.
#'
#' Summarizes the taxonomic relution of reads and MOTUs over the entire dataset
#'
#'
#' @param x a \code{\link{metabarcoding.data}} object
#' @param colranks a string indicating column name where ranks are stored in \code{x}
#' @param colscores a string indicating column name where taxonomic identification scores are stored in \code{x}
#' @param thresh a threshold for defining at which taxonomic identification scores a sequence can be considered as "not assigned".
#' Default is \code{0.7}
#'
#' @return returns a data.frame and piecharts of the number/proportion of MOTUs/reads assigned to each taxonomic levels
#'
#' @examples
#'
#' data(termes)
#' taxo=default.taxonomy()
#'
#' termes.taxo.table = get.classic.taxonomy(termes, taxo, "taxid")
#' attr(termes, "motus") = data.frame(termes$motus, termes.taxo.table)
#' attr(termes, "motus")["count"] = colSums(termes$reads)
#'
#' summary.taxores(termes, "taxonomic_rank_ok","best_identity")
#'
#' @seealso \code{\linkS4class{taxonomy.obitools}}, and method \code{\link{taxonmicank}}
#'
#' @author Lucie Zinger
#' @keywords taxonomy
#'
#' @export
#'
summary.taxores = function(x,colranks,colscores, thresh=0.7){
#vector encompassing all ranked possible taxonomic levels
taxorank = c("superkingdom", "kingdom", "subkingdom", "superphylum", "phylum", "subphylum", "superclass", "class", "subclass", "infraclass",
"superorder", "order", "suborder", "infraorder", "parvorder", "superfamily", "family", "subfamily", "supertribe", "tribe",
"subtribe", "supergenus", "genus", "subgenus", "species group", "species subgroup", "superspecies", "species", "subspecies",
"varietas", "forma", "no rank", "not assigned")
#settings if thresh
ranks = as.vector(x$motus[,colranks])
ranks[x$motus[,colscores]<thresh] = "not assigned"
#nb of otus
tmp = table(ranks)
taxores.otu = tmp[match(taxorank, names(tmp))]
names(taxores.otu) = taxorank
taxores.otu[is.na(taxores.otu)] = 0
#nb of reads
tmp = aggregate(x$motus$count, by=list(ranks), sum)
taxores.reads = tmp[match(taxorank,tmp[,1]),2]
names(taxores.reads) = taxorank
taxores.reads[is.na(taxores.reads)] = 0
#plot
layout(matrix(c(1,2,1,3),2,2),heights=c(0.3,1))
col.tmp = c(rainbow(length(taxorank)-2,start=0, end=0.5, alpha=0.6), "lightgrey", "darkgrey")
par(mar=c(1,0,0,0), oma=c(0,0,2,0))
frame()
legend("top", taxorank, ncol=6, cex=0.8, fill=col.tmp)
pie(taxores.otu, col=col.tmp, border="lightgrey", labels="", clockwise=T)
mtext("OTUs", side=1, cex=1)
pie(taxores.reads, col=col.tmp, border="lightgrey", labels="", clockwise=T)
mtext("Reads", side=1, cex=1)
#result
out = data.frame(otu=taxores.otu, reads=taxores.reads)
out
}

View File

@@ -0,0 +1,53 @@
#' @import ROBITaxonomy
#' @include 02_class_metabarcoding.data.R
NULL
#' Get classical taxonomy format
#'
#' Creates a table with the classical taxonomic description (from phylum to species)
#'
#' @param x a \code{\link{metabarcoding.data}} object
#' @param taxonomy a instance of \code{\linkS4class{taxonomy.obitools}}
#' @param coltaxid a the name of the column containing taxids to be used for creating classical taxonomic description
#'
#' @return returns a data.frame with the classical taxonomic description ("kingdom", "phylum", "class", "order", "family", "genus", "species"), as well as
#' sequence taxonomic assignment rank and scientific name for each sequences stored in the \code{\link{metabarcoding.data}} object
#'
#' @examples
#'
#' data(termes)
#'
#' taxo=default.taxonomy()
#'
#' termes.taxo.table = get.classic.taxonomy(termes, taxo, "taxid")
#' head(termes.taxo.table)
#'
#' attr(termes, "motus") = data.frame(termes$motus, termes.taxo.table)
#'
#'
#' @seealso \code{\linkS4class{taxonomy.obitools}}, and methods \code{\link{species}},\code{\link{genus}}, \code{\link{family}},\code{\link{kingdom}},
#' \code{\link{superkingdom}},\code{\link{taxonatrank}}, \code{\link{taxonmicank}}
#'
#' @author Lucie Zinger
#' @keywords taxonomy
#' @export
#'
get.classic.taxonomy = function(x, taxonomy, coltaxid) {
classic.taxo = c("kingdom", "phylum", "class", "order", "family", "genus", "species")
taxids = x$motus[,coltaxid]
out = as.data.frame(do.call("cbind", lapply(classic.taxo, function(y) {
scientificname(taxonomy, taxonatrank(taxonomy,taxids,y))
})))
colnames(out) = paste(classic.taxo, "_name_ok", sep="")
rownames(out) = colnames(x)
out$scientific_name_ok = scientificname(taxonomy, taxids)
out$taxonomic_rank_ok = taxonomicrank(taxonomy, taxids)
return(out)
}

128
ROBITools/README-SLRE.md Executable file
View File

@@ -0,0 +1,128 @@
SLRE: Super Light Regular Expression library
============================================
SLRE is an ISO C library that implements a subset of Perl regular
expression syntax. Main features of SLRE are:
* Written in strict ANSI C'89
* Small size (compiled x86 code is about 5kB)
* Uses little stack and does no dynamic memory allocation
* Provides simple intuitive API
* Implements most useful subset of Perl regex syntax (see below)
* Easily extensible. E.g. if one wants to introduce a new
metacharacter `\i`, meaning "IPv4 address", it is easy to do so with SLRE.
SLRE is perfect for tasks like parsing network requests, configuration
files, user input, etc, when libraries like [PCRE](http://pcre.org) are too
heavyweight for the given task. Developers of embedded systems would benefit
most.
## Supported Syntax
(?i) Must be at the beginning of the regex. Makes match case-insensitive
^ Match beginning of a buffer
$ Match end of a buffer
() Grouping and substring capturing
\s Match whitespace
\S Match non-whitespace
\d Match decimal digit
\n Match new line character
\r Match line feed character
\f Match form feed character
\v Match vertical tab character
\t Match horizontal tab character
\b Match backspace character
+ Match one or more times (greedy)
+? Match one or more times (non-greedy)
* Match zero or more times (greedy)
*? Match zero or more times (non-greedy)
? Match zero or once (non-greedy)
x|y Match x or y (alternation operator)
\meta Match one of the meta character: ^$().[]*+?|\
\xHH Match byte with hex value 0xHH, e.g. \x4a
[...] Match any character from set. Ranges like [a-z] are supported
[^...] Match any character but ones from set
Under development: Unicode support.
## API
int slre_match(const char *regexp, const char *buf, int buf_len,
struct slre_cap *caps, int num_caps, int flags);
`slre_match()` matches string buffer `buf` of length `buf_len` against
regular expression `regexp`, which should conform the syntax outlined
above. If regular expression `regexp` contains brackets, `slre_match()`
can capture the respective substrings into the array of `struct slre_cap`
structures:
/* Stores matched fragment for the expression inside brackets */
struct slre_cap {
const char *ptr; /* Points to the matched fragment */
int len; /* Length of the matched fragment */
};
N-th member of the `caps` array will contain fragment that corresponds to the
N-th opening bracket in the `regex`, N is zero-based. `slre_match()` returns
number of bytes scanned from the beginning of the string. If return value is
greater or equal to 0, there is a match. If return value is less then 0, there
is no match. Negative return codes are as follows:
#define SLRE_NO_MATCH -1
#define SLRE_UNEXPECTED_QUANTIFIER -2
#define SLRE_UNBALANCED_BRACKETS -3
#define SLRE_INTERNAL_ERROR -4
#define SLRE_INVALID_CHARACTER_SET -5
#define SLRE_INVALID_METACHARACTER -6
#define SLRE_CAPS_ARRAY_TOO_SMALL -7
#define SLRE_TOO_MANY_BRANCHES -8
#define SLRE_TOO_MANY_BRACKETS -9
## Example: parsing HTTP request line
const char *request = " GET /index.html HTTP/1.0\r\n\r\n";
struct slre_cap caps[4];
if (slre_match("^\\s*(\\S+)\\s+(\\S+)\\s+HTTP/(\\d)\\.(\\d)",
request, strlen(request), caps, 4, 0) > 0) {
printf("Method: [%.*s], URI: [%.*s]\n",
caps[0].len, caps[0].ptr,
caps[1].len, caps[1].ptr);
} else {
printf("Error parsing [%s]\n", request);
}
## Example: find all URLs in a string
static const char *str =
"<img src=\"HTTPS://FOO.COM/x?b#c=tab1\"/> "
" <a href=\"http://cesanta.com\">some link</a>";
static const char *regex = "(?i)((https?://)[^\\s/'\"<>]+/?[^\\s'\"<>]*)";
struct slre_cap caps[2];
int i, j = 0, str_len = strlen(str);
while (j < str_len &&
(i = slre_match(regex, str + j, str_len - j, caps, 2, 0)) > 0) {
printf("Found URL: [%.*s]\n", caps[0].len, caps[0].ptr);
j += i;
}
Output:
Found URL: [HTTPS://FOO.COM/x?b#c=tab1]
Found URL: [http://cesanta.com]
# License
SLRE is released under
[GNU GPL v.2](http://www.gnu.org/licenses/old-licenses/gpl-2.0.html).
Businesses have an option to get non-restrictive, royalty-free commercial
license and professional support from
[Cesanta Software](http://cesanta.com).
[Super Light DNS Resolver](https://github.com/cesanta/sldr),
[Mongoose web server](https://github.com/cesanta/mongoose)
are other projects by Cesanta Software, developed with the same philosophy
of functionality and simplicity.

BIN
ROBITools/data/termes.rda Normal file

Binary file not shown.

47210
ROBITools/inst/extdata/termes.fasta vendored Normal file

File diff suppressed because it is too large Load Diff

1
ROBITools/inst/extdata/termes.tab vendored Normal file

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,21 @@
termes_data A01 acacacac:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01A; serie=01; coordX=5; coordY=5;
termes_data A02 acagcaca:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01B; serie=01; coordX=10; coordY=5;
termes_data A03 gtgtacat:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01C; serie=01; coordX=15; coordY=5;
termes_data A04 tatgtcag:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01D; serie=01; coordX=20; coordY=5;
termes_data A05 tagtcgca:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01E; serie=01; coordX=25; coordY=5;
termes_data A06 tactatac:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01F; serie=01; coordX=30; coordY=5;
termes_data A07 actagatc:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01G; serie=01; coordX=35; coordY=5;
termes_data A08 gatcgcga:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01H; serie=01; coordX=40; coordY=5;
termes_data A09 acacacac:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02A; serie=01; coordX=45; coordY=5;
termes_data A10 acagcaca:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02B; serie=01; coordX=50; coordY=5;
termes_data A11 gtgtacat:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02C; serie=01; coordX=55; coordY=5;
termes_data A12 tatgtcag:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02D; serie=01; coordX=60; coordY=5;
termes_data A13 tagtcgca:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02E; serie=02; coordX=65; coordY=5;
termes_data A14 tactatac:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02F; serie=02; coordX=70; coordY=5;
termes_data A15 actagatc:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02G; serie=02; coordX=75; coordY=5;
termes_data A16 gatcgcga:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02H; serie=02; coordX=80; coordY=5;
termes_data A17 acacacac:gtgtacat ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_03A; serie=02; coordX=85; coordY=5;
termes_data A18 acagcaca:gtgtacat ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_03B; serie=02; coordX=90; coordY=5;
termes_data A19 gtgtacat:gtgtacat ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_03C; serie=02; coordX=95; coordY=5;
termes_data A11r tcagtgtc:gtcgtaga ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=04_10B; serie=01; coordX=55; coordY=5;
termes_data A16r actctgct:gtcgtaga ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=04_10C; serie=02; coordX=80; coordY=5;

BIN
ROBITools/src/ROBITools.so Executable file

Binary file not shown.

26
ROBITools/src/ecoError.c Normal file
View File

@@ -0,0 +1,26 @@
#include "ecoPCR.h"
#include <stdio.h>
#include <stdlib.h>
/*
* print the message given as argument and exit the program
* @param error error number
* @param message the text explaining what's going on
* @param filename the file source where the program failed
* @param linenumber the line where it has failed
* filename and linenumber are written at pre-processing
* time by a macro
*/
void ecoError(int32_t error,
const char* message,
const char * filename,
int linenumber)
{
fprintf(stderr,"Error %d in file %s line %d : %s\n",
error,
filename,
linenumber,
message);
abort();
}

BIN
ROBITools/src/ecoError.o Normal file

Binary file not shown.

122
ROBITools/src/ecoIOUtils.c Normal file
View File

@@ -0,0 +1,122 @@
#include "ecoPCR.h"
#include <stdio.h>
#include <stdlib.h>
#define SWAPINT32(x) ((((x) << 24) & 0xFF000000) | (((x) << 8) & 0xFF0000) | \
(((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF))
int32_t is_big_endian()
{
int32_t i=1;
return (int32_t)((char*)&i)[0];
}
int32_t swap_int32_t(int32_t i)
{
return SWAPINT32(i);
}
/**
* Read part of the file
* @param *f the database
* @param recordSize the size to be read
*
* @return buffer
*/
void *read_ecorecord(FILE *f,int32_t *recordSize)
{
static void *buffer =NULL;
int32_t buffersize=0;
int32_t read;
if (!recordSize)
ECOERROR(ECO_ASSERT_ERROR,
"recordSize cannot be NULL");
read = fread(recordSize,
1,
sizeof(int32_t),
f);
if (feof(f))
return NULL;
if (read != sizeof(int32_t))
ECOERROR(ECO_IO_ERROR,"Reading record size error");
if (is_big_endian())
*recordSize=swap_int32_t(*recordSize);
if (buffersize < *recordSize)
{
if (buffer)
buffer = ECOREALLOC(buffer,*recordSize,
"Increase size of record buffer");
else
buffer = ECOMALLOC(*recordSize,
"Allocate record buffer");
}
read = fread(buffer,
1,
*recordSize,
f);
if (read != *recordSize)
ECOERROR(ECO_IO_ERROR,"Reading record data error");
return buffer;
};
/**
* Open the database and check it's readable
* @param filename name of the database (.sdx, .rdx, .tbx)
* @param sequencecount buffer - pointer to variable storing the number of occurence
* @param abort_on_open_error boolean to define the behaviour in case of error
* while opening the database
* @return FILE type
**/
FILE *open_ecorecorddb(const char *filename,
int32_t *sequencecount,
int32_t abort_on_open_error)
{
FILE *f;
int32_t read;
f = fopen(filename,"rb");
if (!f)
{
if (abort_on_open_error)
ECOERROR(ECO_IO_ERROR,"Cannot open file");
else
{
*sequencecount=0;
return NULL;
}
}
read = fread(sequencecount,
1,
sizeof(int32_t),
f);
if (read != sizeof(int32_t))
ECOERROR(ECO_IO_ERROR,"Reading record size error");
if (is_big_endian())
*sequencecount=swap_int32_t(*sequencecount);
return f;
}

BIN
ROBITools/src/ecoIOUtils.o Normal file

Binary file not shown.

79
ROBITools/src/ecoMalloc.c Normal file
View File

@@ -0,0 +1,79 @@
#include "ecoPCR.h"
#include <stdlib.h>
static int eco_log_malloc = 0;
void eco_trace_memory_allocation()
{
eco_log_malloc=1;
}
void eco_untrace_memory_allocation()
{
eco_log_malloc=0;
}
void *eco_malloc(int32_t chunksize,
const char *error_message,
const char *filename,
int32_t line)
{
void * chunk;
chunk = calloc(1,chunksize);
if (!chunk)
ecoError(ECO_MEM_ERROR,error_message,filename,line);
if (eco_log_malloc)
fprintf(stderr,
"Memory segment located at %p of size %d is allocated (file : %s [%d])",
chunk,
chunksize,
filename,
line);
return chunk;
}
void *eco_realloc(void *chunk,
int32_t newsize,
const char *error_message,
const char *filename,
int32_t line)
{
void *newchunk;
newchunk = realloc(chunk,newsize);
if (!newchunk)
ecoError(ECO_MEM_ERROR,error_message,filename,line);
if (eco_log_malloc)
fprintf(stderr,
"Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
chunk,
newchunk,
newsize,
filename,
line);
return newchunk;
}
void eco_free(void *chunk,
const char *error_message,
const char *filename,
int32_t line)
{
free(chunk);
if (eco_log_malloc)
fprintf(stderr,
"Memory segment %p is released => %s (file : %s [%d])",
chunk,
error_message,
filename,
line);
}

BIN
ROBITools/src/ecoMalloc.o Normal file

Binary file not shown.

283
ROBITools/src/ecoPCR.h Normal file
View File

@@ -0,0 +1,283 @@
#ifndef ECOPCR_H_
#define ECOPCR_H_
#include <stdio.h>
#include <inttypes.h>
#include <R.h>
#include <Rinternals.h>
#include <Rdefines.h>
//#ifndef H_apat
//#include "../libapat/apat.h"
//#endif
/*****************************************************
*
* Data type declarations
*
*****************************************************/
/*
*
* Sequence types
*
*/
typedef struct {
int32_t taxid;
char AC[20];
int32_t DE_length;
int32_t SQ_length;
int32_t CSQ_length;
char data[1];
} ecoseqformat_t;
typedef struct {
int32_t taxid;
int32_t SQ_length;
char *AC;
char *DE;
char *SQ;
} ecoseq_t;
/*
*
* Taxonomy taxon types
*
*/
typedef struct {
int32_t taxid;
int32_t rank;
int32_t parent;
int32_t namelength;
char name[1];
} ecotxformat_t;
typedef struct ecotxnode {
int32_t taxid;
int32_t rank;
int32_t farest;
struct ecotxnode *parent;
char *name;
} ecotx_t;
typedef struct {
int32_t count;
int32_t maxtaxid;
int32_t buffersize;
ecotx_t taxon[1];
} ecotxidx_t;
/*
*
* Taxonomy rank types
*
*/
typedef struct {
int32_t count;
char* label[1];
} ecorankidx_t;
/*
*
* Taxonomy name types
*
*/
typedef struct {
int32_t is_scientificname;
int32_t namelength;
int32_t classlength;
int32_t taxid;
char names[1];
} econameformat_t;
typedef struct {
char *name;
char *classname;
int32_t is_scientificname;
struct ecotxnode *taxon;
} econame_t;
typedef struct {
int32_t count;
econame_t names[1];
} econameidx_t;
typedef struct {
ecorankidx_t *ranks;
econameidx_t *names;
ecotxidx_t *taxons;
} ecotaxonomy_t;
/*****************************************************
*
* Function declarations
*
*****************************************************/
/*
*
* Low level system functions
*
*/
int32_t is_big_endian();
int32_t swap_int32_t(int32_t);
void *eco_malloc(int32_t chunksize,
const char *error_message,
const char *filename,
int32_t line);
void *eco_realloc(void *chunk,
int32_t chunksize,
const char *error_message,
const char *filename,
int32_t line);
void eco_free(void *chunk,
const char *error_message,
const char *filename,
int32_t line);
void eco_trace_memory_allocation();
void eco_untrace_memory_allocation();
#define ECOMALLOC(size,error_message) \
eco_malloc((size),(error_message),__FILE__,__LINE__)
#define ECOREALLOC(chunk,size,error_message) \
eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__)
#define ECOFREE(chunk,error_message) \
eco_free((chunk),(error_message),__FILE__,__LINE__)
/*
*
* Error managment
*
*/
void ecoError(int32_t,const char*,const char *,int);
#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__)
#define ECO_IO_ERROR (1)
#define ECO_MEM_ERROR (2)
#define ECO_ASSERT_ERROR (3)
#define ECO_NOTFOUND_ERROR (4)
/*
*
* Low level Disk access functions
*
*/
FILE *open_ecorecorddb(const char *filename,
int32_t *sequencecount,
int32_t abort_on_open_error);
void *read_ecorecord(FILE *,int32_t *recordSize);
/*
* Read function in internal binary format
*/
FILE *open_ecoseqdb(const char *filename,
int32_t *sequencecount);
ecoseq_t *readnext_ecoseq(FILE *);
ecorankidx_t *read_rankidx(const char *filename);
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy);
/**
* Read taxonomy data as formated by the ecoPCRFormat.py script.
*
* This function is normaly uses internaly by the read_taxonomy
* function and should not be called directly.
*
* @arg filename path to the *.tdx file of the reformated db
*
* @return pointer to a taxonomy index structure
*/
ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2);
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName);
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, int32_t rankidx);
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid);
int eco_isundertaxon(ecotx_t *taxon, int other_taxid);
ecoseq_t *ecoseq_iterator(const char *prefix);
ecoseq_t *new_ecoseq();
int32_t delete_ecoseq(ecoseq_t *);
ecoseq_t *new_ecoseq_with_data( char *AC,
char *DE,
char *SQ,
int32_t taxid
);
int32_t delete_taxon(ecotx_t *taxon);
int32_t delete_taxonomy(ecotxidx_t *index);
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy);
int32_t rank_index(const char* label,ecorankidx_t* ranks);
//int32_t delete_apatseq(SeqPtr pseq);
//PatternPtr buildPattern(const char *pat, int32_t error_max);
//PatternPtr complementPattern(PatternPtr pat);
//
//SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular);
//char *ecoComplementPattern(char *nucAcSeq);
//char *ecoComplementSequence(char *nucAcSeq);
//char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end);
ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
//int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid);
//int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid);
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
#endif /*ECOPCR_H_*/

156
ROBITools/src/ecodna.c Normal file
View File

@@ -0,0 +1,156 @@
#include <string.h>
#include "ecoPCR.h"
/*
* @doc: DNA alphabet (IUPAC)
*/
#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
/*
* @doc: complementary DNA alphabet (IUPAC)
*/
#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!]["
static char sNuc[] = LX_BIO_DNA_ALPHA;
static char sAnuc[] = LX_BIO_CDNA_ALPHA;
static char LXBioBaseComplement(char nucAc);
static char *LXBioSeqComplement(char *nucAcSeq);
static char *reverseSequence(char *str,char isPattern);
/* ---------------------------- */
char LXBioBaseComplement(char nucAc)
{
char *c;
if ((c = strchr(sNuc, nucAc)))
return sAnuc[(c - sNuc)];
else
return nucAc;
}
/* ---------------------------- */
char *LXBioSeqComplement(char *nucAcSeq)
{
char *s;
for (s = nucAcSeq ; *s ; s++)
*s = LXBioBaseComplement(*s);
return nucAcSeq;
}
char *reverseSequence(char *str,char isPattern)
{
char *sb, *se, c;
if (! str)
return str;
sb = str;
se = str + strlen(str) - 1;
while(sb <= se) {
c = *sb;
*sb++ = *se;
*se-- = c;
}
sb = str;
se = str + strlen(str) - 1;
if (isPattern)
for (;sb < se; sb++)
{
if (*sb=='#')
{
if (((se - sb) > 2) && (*(sb+2)=='!'))
{
*sb='!';
sb+=2;
*sb='#';
}
else
{
*sb=*(sb+1);
sb++;
*sb='#';
}
}
else if (*sb=='!')
{
*sb=*(sb-1);
*(sb-1)='!';
}
}
return str;
}
char *ecoComplementPattern(char *nucAcSeq)
{
return reverseSequence(LXBioSeqComplement(nucAcSeq),1);
}
char *ecoComplementSequence(char *nucAcSeq)
{
return reverseSequence(LXBioSeqComplement(nucAcSeq),0);
}
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end)
/*
extract subsequence from nucAcSeq [begin,end[
*/
{
static char *buffer = NULL;
static int32_t buffSize= 0;
int32_t length;
if (begin < end)
{
length = end - begin;
if (length >= buffSize)
{
buffSize = length+1;
if (buffer)
buffer=ECOREALLOC(buffer,buffSize,
"Error in reallocating sub sequence buffer");
else
buffer=ECOMALLOC(buffSize,
"Error in allocating sub sequence buffer");
}
strncpy(buffer,nucAcSeq + begin,length);
buffer[length]=0;
}
else
{
length = end + strlen(nucAcSeq) - begin;
if (length >= buffSize)
{
buffSize = length+1;
if (buffer)
buffer=ECOREALLOC(buffer,buffSize,
"Error in reallocating sub sequence buffer");
else
buffer=ECOMALLOC(buffSize,
"Error in allocating sub sequence buffer");
}
strncpy(buffer,nucAcSeq+begin,length - end);
strncpy(buffer+(length-end),nucAcSeq ,end);
buffer[length]=0;
}
return buffer;
}

BIN
ROBITools/src/ecodna.o Normal file

Binary file not shown.

20
ROBITools/src/ecofilter.c Normal file
View File

@@ -0,0 +1,20 @@
#include "ecoPCR.h"
int eco_is_taxid_included( ecotaxonomy_t *taxonomy,
int32_t *restricted_taxid,
int32_t tab_len,
int32_t taxid)
{
int i;
ecotx_t *taxon;
taxon = eco_findtaxonbytaxid(taxonomy, taxid);
if (taxon)
for (i=0; i < tab_len; i++)
if ( (taxon->taxid == restricted_taxid[i]) ||
(eco_isundertaxon(taxon, restricted_taxid[i])) )
return 1;
return 0;
}

BIN
ROBITools/src/ecofilter.o Normal file

Binary file not shown.

64
ROBITools/src/econame.c Normal file
View File

@@ -0,0 +1,64 @@
#include "ecoPCR.h"
#include <string.h>
#include <stdlib.h>
static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy);
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy)
{
int32_t count;
FILE *f;
econameidx_t *indexname;
int32_t i;
f = open_ecorecorddb(filename,&count,0);
if (f==NULL)
return NULL;
indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names");
indexname->count=count;
for (i=0; i < count; i++){
readnext_econame(f,(indexname->names)+i,taxonomy);
}
return indexname;
}
econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy)
{
econameformat_t *raw;
int32_t rs;
raw = read_ecorecord(f,&rs);
if (!raw)
return NULL;
if (is_big_endian())
{
raw->is_scientificname = swap_int32_t(raw->is_scientificname);
raw->namelength = swap_int32_t(raw->namelength);
raw->classlength = swap_int32_t(raw->classlength);
raw->taxid = swap_int32_t(raw->taxid);
}
name->is_scientificname=raw->is_scientificname;
name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name");
strncpy(name->name,raw->names,raw->namelength);
name->name[raw->namelength]=0;
name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname");
strncpy(name->classname,(raw->names+raw->namelength),raw->classlength);
name->classname[raw->classlength]=0;
name->taxon = taxonomy->taxons->taxon + raw->taxid;
return name;
}

BIN
ROBITools/src/econame.o Normal file

Binary file not shown.

55
ROBITools/src/ecorank.c Normal file
View File

@@ -0,0 +1,55 @@
#include "ecoPCR.h"
#include <string.h>
#include <stdlib.h>
static int compareRankLabel(const void *label1, const void *label2);
ecorankidx_t *read_rankidx(const char *filename)
{
int32_t count;
FILE *f;
ecorankidx_t *index;
int32_t i;
int32_t rs;
char *buffer;
f = open_ecorecorddb(filename,&count,0);
if (f==NULL)
return NULL;
index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1),
"Allocate rank index");
index->count=count;
for (i=0; i < count; i++)
{
buffer = read_ecorecord(f,&rs);
index->label[i]=(char*) ECOMALLOC(rs+1,
"Allocate rank label");
strncpy(index->label[i],buffer,rs);
}
return index;
}
int32_t rank_index(const char* label,ecorankidx_t* ranks)
{
char **rep;
rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel);
if (rep)
return rep-ranks->label;
// else
// ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found");
return -1;
}
int compareRankLabel(const void *label1, const void *label2)
{
return strcmp((const char*)label1,*(const char**)label2);
}

BIN
ROBITools/src/ecorank.o Normal file

Binary file not shown.

230
ROBITools/src/ecoseq.c Normal file
View File

@@ -0,0 +1,230 @@
#include "ecoPCR.h"
#include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
static FILE *open_seqfile(const char *prefix,int32_t index);
ecoseq_t *new_ecoseq()
{
void *tmp;
tmp = ECOMALLOC(sizeof(ecoseq_t),"Allocate new ecoseq structure");
return tmp;
}
int32_t delete_ecoseq(ecoseq_t * seq)
{
if (seq)
{
if (seq->AC)
ECOFREE(seq->AC,"Free sequence AC");
if (seq->DE)
ECOFREE(seq->DE,"Free sequence DE");
if (seq->SQ)
ECOFREE(seq->SQ,"Free sequence SQ");
ECOFREE(seq,"Free sequence structure");
return 0;
}
return 1;
}
ecoseq_t *new_ecoseq_with_data( char *AC,
char *DE,
char *SQ,
int32_t taxid_idx
)
{
ecoseq_t *tmp;
int32_t lstr;
tmp = new_ecoseq();
tmp->taxid=taxid_idx;
if (AC)
{
lstr =strlen(AC);
tmp->AC=ECOMALLOC((lstr+1) * sizeof(char),
"Allocate sequence accession");
strcpy(tmp->AC,AC);
}
if (DE)
{
lstr =strlen(DE);
tmp->DE=ECOMALLOC((lstr+1) * sizeof(char),
"Allocate sequence definition");
strcpy(tmp->DE,DE);
}
if (SQ)
{
lstr =strlen(SQ);
tmp->SQ=ECOMALLOC((lstr+1) * sizeof(char),
"Allocate sequence data");
strcpy(tmp->SQ,SQ);
}
return tmp;
}
/**
* ?? used ??
**/
FILE *open_ecoseqdb(const char *filename,
int32_t *sequencecount)
{
return open_ecorecorddb(filename,sequencecount,1);
}
ecoseq_t *readnext_ecoseq(FILE *f)
{
char *compressed=NULL;
ecoseqformat_t *raw;
ecoseq_t *seq;
int32_t comp_status;
unsigned long int seqlength;
int32_t rs;
char *c;
int32_t i;
raw = read_ecorecord(f,&rs);
if (!raw)
return NULL;
if (is_big_endian())
{
raw->CSQ_length = swap_int32_t(raw->CSQ_length);
raw->DE_length = swap_int32_t(raw->DE_length);
raw->SQ_length = swap_int32_t(raw->SQ_length);
raw->taxid = swap_int32_t(raw->taxid);
}
seq = new_ecoseq();
seq->taxid = raw->taxid;
seq->AC = ECOMALLOC(strlen(raw->AC) +1,
"Allocate Sequence Accesion number");
strncpy(seq->AC,raw->AC,strlen(raw->AC));
seq->DE = ECOMALLOC(raw->DE_length+1,
"Allocate Sequence definition");
strncpy(seq->DE,raw->data,raw->DE_length);
seqlength = seq->SQ_length = raw->SQ_length;
compressed = raw->data + raw->DE_length;
seq->SQ = ECOMALLOC(seqlength+1,
"Allocate sequence buffer");
comp_status = uncompress((unsigned char*)seq->SQ,
&seqlength,
(unsigned char*)compressed,
raw->CSQ_length);
if (comp_status != Z_OK)
ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data");
for (c=seq->SQ,i=0;i<seqlength;c++,i++)
*c=toupper(*c);
return seq;
}
/**
* Open the sequences database (.sdx file)
* @param prefix name of the database (radical without extension)
* @param index integer
*
* @return file object
*/
FILE *open_seqfile(const char *prefix,int32_t index)
{
char filename_buffer[1024];
int32_t filename_length;
FILE *input;
int32_t seqcount;
filename_length = snprintf(filename_buffer,
1023,
"%s_%03d.sdx",
prefix,
index);
// fprintf(stderr,"# Coucou %s\n",filename_buffer);
if (filename_length >= 1024)
ECOERROR(ECO_ASSERT_ERROR,"file name is too long");
filename_buffer[filename_length]=0;
input=open_ecorecorddb(filename_buffer,&seqcount,0);
if (input)
fprintf(stderr,"# Reading file %s containing %d sequences...\n",
filename_buffer,
seqcount);
return input;
}
ecoseq_t *ecoseq_iterator(const char *prefix)
{
static FILE *current_seq_file= NULL;
static int32_t current_file_idx = 1;
static char current_prefix[1024];
ecoseq_t *seq;
if (prefix)
{
current_file_idx = 1;
if (current_seq_file)
fclose(current_seq_file);
strncpy(current_prefix,prefix,1023);
current_prefix[1023]=0;
current_seq_file = open_seqfile(current_prefix,
current_file_idx);
if (!current_seq_file)
return NULL;
}
seq = readnext_ecoseq(current_seq_file);
if (!seq && feof(current_seq_file))
{
current_file_idx++;
fclose(current_seq_file);
current_seq_file = open_seqfile(current_prefix,
current_file_idx);
if (current_seq_file)
seq = readnext_ecoseq(current_seq_file);
}
return seq;
}

BIN
ROBITools/src/ecoseq.o Normal file

Binary file not shown.

437
ROBITools/src/ecotax.c Normal file
View File

@@ -0,0 +1,437 @@
#include "ecoPCR.h"
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <R.h>
#ifndef MAX
#define MAX(x,y) (((x)>(y)) ? (x):(y))
#endif
static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon);
/**
* Open the taxonomy database
* @param pointer to the database (.tdx file)
* @return a ecotxidx_t structure
*/
ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2)
{
int32_t count;
int32_t count2;
FILE *f;
FILE *f2;
ecotxidx_t *index;
struct ecotxnode *t;
int32_t i;
int32_t j;
f = open_ecorecorddb(filename,&count,0);
if (f==NULL) return NULL;
f2 = open_ecorecorddb(filename2,&count2,0);
index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count+count2-1),
"Allocate taxonomy");
index->count=count+count2;
index->buffersize = index->count;
index->maxtaxid=0;
REprintf("Readind %d taxa...\n",count);
for (i=0; i < count; i++){
readnext_ecotaxon(f,&(index->taxon[i]));
index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
index->taxon[i].parent->farest=0;
if (index->taxon[i].taxid > index->maxtaxid)
index->maxtaxid=index->taxon[i].taxid;
}
if (count2>0)
REprintf("Readind %d local taxa...\n",count2);
else
REprintf("No local taxon\n");
count = index->count;
for (; i < count; i++){
readnext_ecotaxon(f2,&(index->taxon[i]));
index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
index->taxon[i].parent->farest=0;
if (index->taxon[i].taxid > index->maxtaxid)
index->maxtaxid=index->taxon[i].taxid;
}
REprintf("Computing longest branches...\n",count);
for (i=0; i < count; i++){
t=index->taxon+i;
if (t->farest==-1)
{
t->farest=0;
while(t->parent != t)
{
j = t->farest + 1;
if (j > t->parent->farest)
{
t->parent->farest = j;
t=t->parent;
}
else
t=index->taxon;
}
}
}
return index;
}
int32_t delete_taxonomy(ecotxidx_t *index)
{
int32_t i;
if (index)
{
for (i=0; i< index->count; i++)
if (index->taxon[i].name)
ECOFREE(index->taxon[i].name,"Free scientific name");
ECOFREE(index,"Free Taxonomy");
return 0;
}
return 1;
}
int32_t delete_taxon(ecotx_t *taxon)
{
if (taxon)
{
if (taxon->name)
ECOFREE(taxon->name,"Free scientific name");
ECOFREE(taxon,"Free Taxon");
return 0;
}
return 1;
}
/**
* Read the database for a given taxon a save the data
* into the taxon structure(if any found)
* @param *f pointer to FILE type returned by fopen
* @param *taxon pointer to the structure
*
* @return a ecotx_t structure if any taxon found else NULL
*/
ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
{
ecotxformat_t *raw;
int32_t rs;
raw = read_ecorecord(f,&rs);
if (!raw)
return NULL;
if (is_big_endian())
{
raw->namelength = swap_int32_t(raw->namelength);
raw->parent = swap_int32_t(raw->parent);
raw->rank = swap_int32_t(raw->rank);
raw->taxid = swap_int32_t(raw->taxid);
}
taxon->parent = (ecotx_t*)((size_t)raw->parent);
taxon->taxid = raw->taxid;
taxon->rank = raw->rank;
taxon->farest = -1;
taxon->name = ECOMALLOC((raw->namelength+1) * sizeof(char),
"Allocate taxon scientific name");
strncpy(taxon->name,raw->name,raw->namelength);
return taxon;
}
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName)
{
ecotaxonomy_t *tax;
char *filename;
char *filename2;
int buffsize;
tax = ECOMALLOC(sizeof(ecotaxonomy_t),
"Allocate taxonomy structure");
tax->ranks =NULL;
tax->taxons=NULL;
tax->names =NULL;
buffsize = strlen(prefix)+10;
filename = ECOMALLOC(buffsize,
"Allocate filename");
filename2= ECOMALLOC(buffsize,
"Allocate filename");
snprintf(filename,buffsize,"%s.rdx",prefix);
tax->ranks = read_rankidx(filename);
if (tax->ranks == NULL)
{
ECOFREE(filename,"Desallocate filename 1");
ECOFREE(filename2,"Desallocate filename 2");
delete_ecotaxonomy(tax);
return NULL;
}
snprintf(filename,buffsize,"%s.tdx",prefix);
snprintf(filename2,buffsize,"%s.ldx",prefix);
tax->taxons = read_taxonomyidx(filename,filename2);
if (tax->taxons == NULL)
{
ECOFREE(filename,"Desallocate filename 1");
ECOFREE(filename,"Desallocate filename 2");
delete_ecotaxonomy(tax);
return NULL;
}
if (readAlternativeName)
{
snprintf(filename,buffsize,"%s.ndx",prefix);
tax->names=read_nameidx(filename,tax);
}
else
tax->names=NULL;
ECOFREE(filename,"Desallocate filename 1");
ECOFREE(filename2,"Desallocate filename 2");
return tax;
}
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy)
{
if (taxonomy)
{
if (taxonomy->ranks)
ECOFREE(taxonomy->ranks,"Free rank index");
if (taxonomy->names)
ECOFREE(taxonomy->names,"Free names index");
if (taxonomy->taxons)
ECOFREE(taxonomy->taxons,"Free taxon index");
ECOFREE(taxonomy,"Free taxonomy structure");
return 0;
}
return 1;
}
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon,
int32_t rankidx)
{
ecotx_t *current_taxon;
ecotx_t *next_taxon;
current_taxon = taxon;
next_taxon = current_taxon->parent;
while ((current_taxon!=next_taxon) && // I' am the root node
(current_taxon->rank!=rankidx))
{
current_taxon = next_taxon;
next_taxon = current_taxon->parent;
}
if (current_taxon->rank==rankidx)
return current_taxon;
else
return NULL;
}
static int bcomptaxon (const void * ptaxid, const void * ptaxon) {
ecotx_t *current_taxon = (ecotx_t*)ptaxon;
int32_t taxid=(int32_t)((size_t)ptaxid);
return taxid - current_taxon->taxid;
}
/**
* Get back information concerning a taxon from a taxonomic id
* @param *taxonomy the taxonomy database
* @param taxid the taxonomic id
*
* @result a ecotx_t structure containing the taxonimic information
**/
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy,
int32_t taxid)
{
ecotx_t *current_taxon;
int32_t taxoncount;
// int32_t i;
taxoncount=taxonomy->taxons->count;
current_taxon = (ecotx_t*) bsearch((const void *)((size_t)taxid),
(const void *)taxonomy->taxons->taxon,
taxoncount,
sizeof(ecotx_t),
bcomptaxon);
/* Old version
for (current_taxon=taxonomy->taxons->taxon,
i=0;
i < taxoncount;
i++,
current_taxon++){
if (current_taxon->taxid==taxid){
return current_taxon;
}
}
*/
return current_taxon;
}
/**
* Find out if taxon is son of other taxon (identified by its taxid)
* @param *taxon son taxon
* @param parent_taxid taxonomic id of the other taxon
*
* @return 1 is the other taxid math a parent taxid, else 0
**/
int eco_isundertaxon(ecotx_t *taxon,
int other_taxid)
{
ecotx_t *next_parent;
next_parent = taxon->parent;
while ( (other_taxid != next_parent->taxid) &&
(strcmp(next_parent->name, "root")) )
{
next_parent = next_parent->parent;
}
if (other_taxid == next_parent->taxid)
return 1;
else
return 0;
}
ecotx_t *eco_getspecies(ecotx_t *taxon,
ecotaxonomy_t *taxonomy)
{
static ecotaxonomy_t *tax=NULL;
static int32_t rankindex=-1;
if (taxonomy && tax!=taxonomy)
{
rankindex = rank_index("species",taxonomy->ranks);
tax=taxonomy;
}
if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex);
}
ecotx_t *eco_getgenus(ecotx_t *taxon,
ecotaxonomy_t *taxonomy)
{
static ecotaxonomy_t *tax=NULL;
static int32_t rankindex=-1;
if (taxonomy && tax!=taxonomy)
{
rankindex = rank_index("genus",taxonomy->ranks);
tax=taxonomy;
}
if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex);
}
ecotx_t *eco_getfamily(ecotx_t *taxon,
ecotaxonomy_t *taxonomy)
{
static ecotaxonomy_t *tax=NULL;
static int32_t rankindex=-1;
if (taxonomy && tax!=taxonomy)
{
rankindex = rank_index("family",taxonomy->ranks);
tax=taxonomy;
}
if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex);
}
ecotx_t *eco_getkingdom(ecotx_t *taxon,
ecotaxonomy_t *taxonomy)
{
static ecotaxonomy_t *tax=NULL;
static int32_t rankindex=-1;
if (taxonomy && tax!=taxonomy)
{
rankindex = rank_index("kingdom",taxonomy->ranks);
tax=taxonomy;
}
if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex);
}
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
ecotaxonomy_t *taxonomy)
{
static ecotaxonomy_t *tax=NULL;
static int32_t rankindex=-1;
if (taxonomy && tax!=taxonomy)
{
rankindex = rank_index("superkingdom",taxonomy->ranks);
tax=taxonomy;
}
if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex);
}

BIN
ROBITools/src/ecotax.o Normal file

Binary file not shown.

835
ROBITools/src/robitax.c Normal file
View File

@@ -0,0 +1,835 @@
/*
* robitax.c
*
* Created on: 17 janv. 2013
* Author: coissac
*/
#include "robitax.h"
#include <unistd.h>
//#include <regex.h>
#include "slre.h"
/**
* Return a pointeur to an obitools taxonomy C structure
* from an R instance of taxonomy.obitools
*
* The function checks if the pointer stored in the R object is set
* to NULL. In this case this means that we have to load the taxonomy
* from the disk.
*
* @param taxonomy an R object
* @type taxonomy SEXP
*
* @return a pointer to the C structure
* @rtype ecotaxonomy_t *
*/
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy)
{
char *pwd;
SEXP pointer;
SEXP rclass;
SEXP rdir;
SEXP rfile;
ecotaxonomy_t *ptax;
const char *class;
const char *file;
const char *dir;
int saved;
if (!IS_S4_OBJECT(Rtaxonomy) )
error("argument not taxonomy.obitools instance");
// We get the class name and compare it to "taxonomy.obitools"
rclass = getAttrib(Rtaxonomy, R_ClassSymbol);
class = CHAR(asChar(rclass));
if (strcmp(class,"taxonomy.obitools"))
error("argument not taxonomy.obitools instance");
pointer = R_do_slot(Rtaxonomy,mkString("pointer"));
saved = LOGICAL(R_do_slot(Rtaxonomy,mkString("saved")))[0];
ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(pointer);
// If the external pointer is set to NULL we have to load
// the taxonomy from file
if (ptax==NULL && saved)
{
pwd = getcwd(NULL,0);
rfile = R_do_slot(Rtaxonomy,mkString("dbname"));
file = CHAR(asChar(rfile));
rdir = R_do_slot(Rtaxonomy,mkString("workingdir"));
dir = CHAR(asChar(rdir));
chdir(dir);
ptax = read_taxonomy(file,1);
R_SetExternalPtrAddr(pointer,(void*)ptax);
chdir(pwd);
free(pwd);
}
if (ptax==NULL && ! saved)
error("The taxonomy instance is no more valid and must be rebuilt");
return ptax;
}
SEXP R_delete_taxonomy(SEXP Rtaxonomy)
{
ecotaxonomy_t *ptax;
// SEXP pointer;
ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(Rtaxonomy);
(void) delete_ecotaxonomy(ptax);
// Clear the external pointer
R_ClearExternalPtr(Rtaxonomy);
return R_NilValue;
}
SEXP R_read_taxonomy(SEXP filename, SEXP altenative)
{
int alt;
const char* file;
SEXP Rtax;
if (! isString(filename))
error("filename not character");
file = CHAR(STRING_ELT(filename, 0));
if (! isLogical(altenative))
error("altenative not logical");
alt = LOGICAL(altenative)[0];
ecotaxonomy_t *taxonomy = read_taxonomy(file,alt);
if (! taxonomy)
error("Cannot open the taxonomy database");
Rtax = PROTECT(R_MakeExternalPtr(taxonomy, mkString("ROBITools NCBI Taxonomy pointer"), R_NilValue));
R_RegisterCFinalizerEx(Rtax, (R_CFinalizer_t)R_delete_taxonomy,TRUE);
UNPROTECT(1);
return Rtax;
}
SEXP R_get_scientific_name(SEXP Rtaxonomy,SEXP Rtaxid)
{
ecotx_t *taxon;
ecotaxonomy_t *ptax;
int taxid;
// SEXP scname;
ptax = getTaxPointer(Rtaxonomy);
if (! isInteger(Rtaxid))
error("taxid not integer");
taxid = *INTEGER(Rtaxid);
if (! (taxid > 0))
error("taxid not positive");
taxon = eco_findtaxonbytaxid(ptax, taxid);
if (!taxon)
return ScalarString(R_NaString);
// error("unkown taxid");
return mkString(taxon->name);
}
SEXP R_get_rank(SEXP Rtaxonomy,SEXP Rtaxid)
{
ecotx_t *taxon;
ecotaxonomy_t *ptax;
int *taxid;
int ntaxid;
int i;
SEXP results;
// SEXP scname;
ptax = getTaxPointer(Rtaxonomy);
if (! isInteger(Rtaxid))
error("taxid not integer");
ntaxid = length(Rtaxid);
results = PROTECT(allocVector(STRSXP, ntaxid));
taxid = INTEGER(Rtaxid);
for (i=0; i < ntaxid; i++)
{
if (taxid[i]== NA_INTEGER || taxid[i] <= 0)
SET_STRING_ELT(results, i, R_NaString);
else {
taxon = eco_findtaxonbytaxid(ptax, taxid[i]);
if (!taxon)
SET_STRING_ELT(results, i, R_NaString);
else
SET_STRING_ELT(results, i, mkChar(ptax->ranks->label[taxon->rank]));
}
}
UNPROTECT(1);
return results;
}
SEXP R_findtaxonatrank(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rrank, SEXP Rname)
{
ecotx_t *taxon;
ecotx_t *rep;
ecotaxonomy_t *ptax;
int taxid;
int name;
const char *rank;
int rankidx;
// SEXP scname;
ptax = getTaxPointer(Rtaxonomy);
if (! isInteger(Rtaxid))
error("taxid not integer");
taxid = *INTEGER(Rtaxid);
if (! (taxid > 0))
error("taxid not positive");
if (! isString(Rrank))
error("rank not a string");
rank=CHAR(STRING_ELT(Rrank,0));
rankidx=rank_index(rank,ptax->ranks);
if (rankidx < 0)
error("unkown rank name");
if (! isLogical(Rname))
error("name not logical");
name = LOGICAL(Rname)[0];
taxon = eco_findtaxonbytaxid(ptax, taxid);
if (!taxon)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
rep = eco_findtaxonatrank(taxon,rankidx);
if (!rep)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
if (name)
return mkString(rep->name);
else
return ScalarInteger(rep->taxid);
}
SEXP R_get_species(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
{
ecotx_t *taxon;
ecotx_t *rep;
ecotaxonomy_t *ptax;
int taxid;
int name;
// SEXP scname;
ptax = getTaxPointer(Rtaxonomy);
if (! isInteger(Rtaxid))
error("taxid not integer");
taxid = *INTEGER(Rtaxid);
if (! (taxid > 0))
error("taxid not positive");
if (! isLogical(Rname))
error("name not logical");
name = LOGICAL(Rname)[0];
taxon = eco_findtaxonbytaxid(ptax, taxid);
if (!taxon)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
rep = eco_getspecies(taxon,ptax);
if (!rep)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
if (name)
return mkString(rep->name);
else
return ScalarInteger(rep->taxid);
}
SEXP R_get_genus(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
{
ecotx_t *taxon;
ecotx_t *rep;
ecotaxonomy_t *ptax;
int taxid;
int name;
// SEXP scname;
ptax = getTaxPointer(Rtaxonomy);
if (! isInteger(Rtaxid))
error("taxid not integer");
taxid = *INTEGER(Rtaxid);
if (! (taxid > 0))
error("taxid not positive");
if (! isLogical(Rname))
error("name not logical");
name = LOGICAL(Rname)[0];
taxon = eco_findtaxonbytaxid(ptax, taxid);
if (!taxon)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
rep = eco_getgenus(taxon,ptax);
if (!rep)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
if (name)
return mkString(rep->name);
else
return ScalarInteger(rep->taxid);
}
SEXP R_get_family(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
{
ecotx_t *taxon;
ecotx_t *rep;
ecotaxonomy_t *ptax;
int taxid;
int name;
// SEXP scname;
ptax = getTaxPointer(Rtaxonomy);
if (! isInteger(Rtaxid))
error("taxid not integer");
taxid = *INTEGER(Rtaxid);
if (! (taxid > 0))
error("taxid not positive");
if (! isLogical(Rname))
error("name not logical");
name = LOGICAL(Rname)[0];
taxon = eco_findtaxonbytaxid(ptax, taxid);
if (!taxon)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
rep = eco_getfamily(taxon,ptax);
if (!rep)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
if (name)
return mkString(rep->name);
else
return ScalarInteger(rep->taxid);
}
SEXP R_get_kingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
{
ecotx_t *taxon;
ecotx_t *rep;
ecotaxonomy_t *ptax;
int taxid;
int name;
// SEXP scname;
ptax = getTaxPointer(Rtaxonomy);
if (! isInteger(Rtaxid))
error("taxid not integer");
taxid = *INTEGER(Rtaxid);
if (! (taxid > 0))
error("taxid not positive");
if (! isLogical(Rname))
error("name not logical");
name = LOGICAL(Rname)[0];
taxon = eco_findtaxonbytaxid(ptax, taxid);
if (!taxon)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
rep = eco_getkingdom(taxon,ptax);
if (!rep)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
if (name)
return mkString(rep->name);
else
return ScalarInteger(rep->taxid);
}
SEXP R_get_superkingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
{
ecotx_t *taxon;
ecotx_t *rep;
ecotaxonomy_t *ptax;
int taxid;
int name;
// SEXP scname;
ptax = getTaxPointer(Rtaxonomy);
if (! isInteger(Rtaxid))
error("taxid not integer");
taxid = *INTEGER(Rtaxid);
if (! (taxid > 0))
error("taxid not positive");
if (! isLogical(Rname))
error("name not logical");
name = LOGICAL(Rname)[0];
taxon = eco_findtaxonbytaxid(ptax, taxid);
if (!taxon)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
rep = eco_getsuperkingdom(taxon,ptax);
if (!rep)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
if (name)
return mkString(rep->name);
else
return ScalarInteger(rep->taxid);
}
SEXP R_get_parent(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
{
ecotx_t *taxon;
ecotx_t *rep;
ecotaxonomy_t *ptax;
int taxid;
int name;
// SEXP scname;
ptax = getTaxPointer(Rtaxonomy);
if (! isInteger(Rtaxid))
error("taxid not integer");
taxid = *INTEGER(Rtaxid);
if (! (taxid > 0))
error("taxid not positive");
if (! isLogical(Rname))
error("name not logical");
name = LOGICAL(Rname)[0];
taxon = eco_findtaxonbytaxid(ptax, taxid);
if (!taxon)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
rep = taxon->parent;
if (rep->taxid==taxid)
{
if (name)
return ScalarString(R_NaString);
else
return ScalarInteger(R_NaInt);
}
if (name)
return mkString(rep->name);
else
return ScalarInteger(rep->taxid);
}
SEXP R_validate_taxid(SEXP Rtaxonomy,SEXP Rtaxid)
{
ecotx_t *taxon;
ecotaxonomy_t *ptax;
int taxid;
// int name;
// SEXP scname;
ptax = getTaxPointer(Rtaxonomy);
if (! isInteger(Rtaxid))
error("taxid not integer");
taxid = *INTEGER(Rtaxid);
if (! (taxid > 0))
return ScalarInteger(R_NaInt);
taxon = eco_findtaxonbytaxid(ptax, taxid);
if (!taxon)
return ScalarInteger(R_NaInt);
else
return ScalarInteger(taxon->taxid);
}
SEXP R_is_under_taxon(SEXP Rtaxonomy, SEXP Rtaxid, SEXP Rparent)
{
ecotx_t *taxon;
ecotaxonomy_t *ptax;
int taxid;
int parent;
int rep;
// SEXP isunder;
ptax = getTaxPointer(Rtaxonomy);
if (! isInteger(Rparent))
error("parent not integer");
parent = *INTEGER(Rparent);
if (parent <= 0)
return ScalarInteger(R_NaInt);
taxon = eco_findtaxonbytaxid(ptax, parent);
if (!taxon)
return ScalarInteger(R_NaInt);
if (! isInteger(Rtaxid))
error("taxid not integer");
taxid = *INTEGER(Rtaxid);
if (taxid <= 0)
return ScalarInteger(R_NaInt);
taxon = eco_findtaxonbytaxid(ptax, taxid);
if (!taxon)
return ScalarInteger(R_NaInt);
rep = eco_isundertaxon(taxon, parent);
return ScalarLogical(rep);
}
SEXP R_longest_path(SEXP Rtaxonomy,SEXP Rtaxid)
{
ecotx_t *taxon;
ecotaxonomy_t *ptax;
int taxid;
// int name;
// SEXP scname;
ptax = getTaxPointer(Rtaxonomy);
if (! isInteger(Rtaxid))
error("taxid not integer");
taxid = *INTEGER(Rtaxid);
if (taxid <= 0)
return ScalarInteger(R_NaInt);
taxon = eco_findtaxonbytaxid(ptax, taxid);
if (!taxon)
return ScalarInteger(R_NaInt);
else
return ScalarInteger(taxon->farest);
}
SEXP R_rank_list(SEXP Rtaxonomy)
{
int nrank;
int i;
ecotaxonomy_t *ptax;
SEXP rNames;
ptax = getTaxPointer(Rtaxonomy);
nrank = ptax->ranks->count;
rNames = PROTECT(allocVector(STRSXP, nrank));
for (i=0; i < nrank;i++)
SET_STRING_ELT(rNames, i, mkChar(ptax->ranks->label[i]));
UNPROTECT(1);
return rNames;
}
SEXP R_taxid_list(SEXP Rtaxonomy)
{
int ntaxid;
int i;
ecotaxonomy_t *ptax;
SEXP rTaxids;
ptax = getTaxPointer(Rtaxonomy);
ntaxid = ptax->taxons->count;
rTaxids = PROTECT(allocVector(INTSXP, ntaxid));
for (i=0; i < ntaxid;i++)
INTEGER(rTaxids)[i]=ptax->taxons->taxon[i].taxid;
UNPROTECT(1);
return rTaxids;
}
SEXP R_max_taxid(SEXP Rtaxonomy)
{
// int nrank;
// int i;
ecotaxonomy_t *ptax;
// SEXP rNames;
ptax = getTaxPointer(Rtaxonomy);
return ScalarInteger(ptax->taxons->maxtaxid);
}
SEXP R_length_taxonomy(SEXP Rtaxonomy)
{
ecotaxonomy_t *ptax;
ptax = getTaxPointer(Rtaxonomy);
return ScalarInteger(ptax->taxons->count);
}
SEXP R_ecofind(SEXP Rtaxonomy, SEXP Rpattern, SEXP Rrank, SEXP Ralternative)
{
ecotaxonomy_t *ptax;
econame_t *name;
char* pattern=NULL;
int re_match;
SEXP taxids;
int32_t* buffer;
int32_t tax_count = 0;
size_t j = 0;
int32_t rankfilter = 1;
int* ptaxid;
char *rankname=NULL;
int32_t nummatch = 0;
int32_t alternative = 0;
size_t bsize;
ptax = getTaxPointer(Rtaxonomy);
tax_count = ptax->taxons->count;
if (! isString(Rpattern))
error("pattern not a string");
pattern= (char*) CHAR(STRING_ELT(Rpattern,0));
if (! isNull(Rrank))
{
if (! isString(Rrank))
error("rank not a string");
rankname= (char*) CHAR(STRING_ELT(Rrank,0));
}
if (! isLogical(Ralternative))
error("rank not a logical");
alternative = LOGICAL(Ralternative)[0];
nummatch=0;
buffer = (int32_t*) malloc(100 * sizeof(int32_t));
bsize=100;
if (alternative && ptax->names!=NULL)
for (j=0,name=ptax->names->names;
j < ptax->names->count;
name++,j++)
{
if(rankname)
rankfilter = !(strcmp(rankname,ptax->ranks->label[name->taxon->rank]));
re_match = slre_match(pattern, name->name,
strlen(name->name),
NULL, 0,
SLRE_IGNORE_CASE);
if (re_match > 0 && rankfilter)
{
buffer[nummatch]=name->taxon->taxid;
nummatch++;
if (nummatch==bsize) {
bsize*=2;
buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t));
if (buffer==0)
{
// regfree(&re_preg);
error("Cannot allocate memory for the taxid list");
}
}
}
}
else
for (j=0; j < ptax->taxons->count;j++)
{
if(rankname)
rankfilter = !(strcmp(rankname,ptax->ranks->label[ptax->taxons->taxon[j].rank]));
// re_match = regexec (&re_preg, ptax->taxons->taxon[j].name, 0, NULL, 0);
re_match = slre_match(pattern, ptax->taxons->taxon[j].name,
strlen(ptax->taxons->taxon[j].name),
NULL, 0,
SLRE_IGNORE_CASE);
// if (!re_match && rankfilter)
if (re_match > 0 && rankfilter)
{
buffer[nummatch]=ptax->taxons->taxon[j].taxid;
nummatch++;
if (nummatch==bsize) {
bsize*=2;
buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t));
if (buffer==0)
{
// regfree(&re_preg);
error("Cannot allocate memory for the taxid list");
}
}
}
}
//regfree(&re_preg);
taxids = PROTECT(NEW_INTEGER(nummatch));
ptaxid = INTEGER(taxids);
for (j=0; j < nummatch; j++)
ptaxid[j]=buffer[j];
free(buffer);
UNPROTECT(1);
return taxids;
}

6
ROBITools/src/robitax.h Normal file
View File

@@ -0,0 +1,6 @@
#include "ecoPCR.h"
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
SEXP R_delete_taxonomy(SEXP Rtaxonomy);

BIN
ROBITools/src/robitax.o Normal file

Binary file not shown.

433
ROBITools/src/slre.c Executable file
View File

@@ -0,0 +1,433 @@
/*
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
* Copyright (c) 2013 Cesanta Software Limited
* All rights reserved
*
* This library is dual-licensed: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. For the terms of this
* license, see <http://www.gnu.org/licenses/>.
*
* You are free to use this library under the terms of the GNU General
* Public License, but WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* Alternatively, you can license this library under a commercial
* license, as set out in <http://cesanta.com/products.html>.
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "slre.h"
#define MAX_BRANCHES 100
#define MAX_BRACKETS 100
#define FAIL_IF(condition, error_code) if (condition) return (error_code)
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(ar) (sizeof(ar) / sizeof((ar)[0]))
#endif
#ifdef SLRE_DEBUG
#define DBG(x) printf x
#else
#define DBG(x)
#endif
struct bracket_pair {
const char *ptr; /* Points to the first char after '(' in regex */
int len; /* Length of the text between '(' and ')' */
int branches; /* Index in the branches array for this pair */
int num_branches; /* Number of '|' in this bracket pair */
};
struct branch {
int bracket_index; /* index for 'struct bracket_pair brackets' */
/* array defined below */
const char *schlong; /* points to the '|' character in the regex */
};
struct regex_info {
/*
* Describes all bracket pairs in the regular expression.
* First entry is always present, and grabs the whole regex.
*/
struct bracket_pair brackets[MAX_BRACKETS];
int num_brackets;
/*
* Describes alternations ('|' operators) in the regular expression.
* Each branch falls into a specific branch pair.
*/
struct branch branches[MAX_BRANCHES];
int num_branches;
/* Array of captures provided by the user */
struct slre_cap *caps;
int num_caps;
/* E.g. SLRE_IGNORE_CASE. See enum below */
int flags;
};
static int is_metacharacter(const unsigned char *s) {
static const char *metacharacters = "^$().[]*+?|\\Ssdbfnrtv";
return strchr(metacharacters, *s) != NULL;
}
static int op_len(const char *re) {
return re[0] == '\\' && re[1] == 'x' ? 4 : re[0] == '\\' ? 2 : 1;
}
static int set_len(const char *re, int re_len) {
int len = 0;
while (len < re_len && re[len] != ']') {
len += op_len(re + len);
}
return len <= re_len ? len + 1 : -1;
}
static int get_op_len(const char *re, int re_len) {
return re[0] == '[' ? set_len(re + 1, re_len - 1) + 1 : op_len(re);
}
static int is_quantifier(const char *re) {
return re[0] == '*' || re[0] == '+' || re[0] == '?';
}
static int toi(int x) {
return isdigit(x) ? x - '0' : x - 'W';
}
static int hextoi(const unsigned char *s) {
return (toi(tolower(s[0])) << 4) | toi(tolower(s[1]));
}
static int match_op(const unsigned char *re, const unsigned char *s,
struct regex_info *info) {
int result = 0;
switch (*re) {
case '\\':
/* Metacharacters */
switch (re[1]) {
case 'S': FAIL_IF(isspace(*s), SLRE_NO_MATCH); result++; break;
case 's': FAIL_IF(!isspace(*s), SLRE_NO_MATCH); result++; break;
case 'd': FAIL_IF(!isdigit(*s), SLRE_NO_MATCH); result++; break;
case 'b': FAIL_IF(*s != '\b', SLRE_NO_MATCH); result++; break;
case 'f': FAIL_IF(*s != '\f', SLRE_NO_MATCH); result++; break;
case 'n': FAIL_IF(*s != '\n', SLRE_NO_MATCH); result++; break;
case 'r': FAIL_IF(*s != '\r', SLRE_NO_MATCH); result++; break;
case 't': FAIL_IF(*s != '\t', SLRE_NO_MATCH); result++; break;
case 'v': FAIL_IF(*s != '\v', SLRE_NO_MATCH); result++; break;
case 'x':
/* Match byte, \xHH where HH is hexadecimal byte representaion */
FAIL_IF(hextoi(re + 2) != *s, SLRE_NO_MATCH);
result++;
break;
default:
/* Valid metacharacter check is done in bar() */
FAIL_IF(re[1] != s[0], SLRE_NO_MATCH);
result++;
break;
}
break;
case '|': FAIL_IF(1, SLRE_INTERNAL_ERROR); break;
case '$': FAIL_IF(1, SLRE_NO_MATCH); break;
case '.': result++; break;
default:
if (info->flags & SLRE_IGNORE_CASE) {
FAIL_IF(tolower(*re) != tolower(*s), SLRE_NO_MATCH);
} else {
FAIL_IF(*re != *s, SLRE_NO_MATCH);
}
result++;
break;
}
return result;
}
static int match_set(const char *re, int re_len, const char *s,
struct regex_info *info) {
int len = 0, result = -1, invert = re[0] == '^';
if (invert) re++, re_len--;
while (len <= re_len && re[len] != ']' && result <= 0) {
/* Support character range */
if (re[len] != '-' && re[len + 1] == '-' && re[len + 2] != ']' &&
re[len + 2] != '\0') {
result = info->flags && SLRE_IGNORE_CASE ?
*s >= re[len] && *s <= re[len + 2] :
tolower(*s) >= tolower(re[len]) && tolower(*s) <= tolower(re[len + 2]);
len += 3;
} else {
result = match_op((unsigned char *) re + len, (unsigned char *) s, info);
len += op_len(re + len);
}
}
return (!invert && result > 0) || (invert && result <= 0) ? 1 : -1;
}
static int doh(const char *s, int s_len, struct regex_info *info, int bi);
static int bar(const char *re, int re_len, const char *s, int s_len,
struct regex_info *info, int bi) {
/* i is offset in re, j is offset in s, bi is brackets index */
int i, j, n, step;
for (i = j = 0; i < re_len && j <= s_len; i += step) {
/* Handle quantifiers. Get the length of the chunk. */
step = re[i] == '(' ? info->brackets[bi + 1].len + 2 :
get_op_len(re + i, re_len - i);
DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n", __func__,
re_len - i, re + i, s_len - j, s + j, re_len, step, i, j));
FAIL_IF(is_quantifier(&re[i]), SLRE_UNEXPECTED_QUANTIFIER);
FAIL_IF(step <= 0, SLRE_INVALID_CHARACTER_SET);
if (i + step < re_len && is_quantifier(re + i + step)) {
DBG(("QUANTIFIER: [%.*s]%c [%.*s]\n", step, re + i,
re[i + step], s_len - j, s + j));
if (re[i + step] == '?') {
int result = bar(re + i, step, s + j, s_len - j, info, bi);
j += result > 0 ? result : 0;
i++;
} else if (re[i + step] == '+' || re[i + step] == '*') {
int j2 = j, nj = j, n1, n2 = -1, ni, non_greedy = 0;
/* Points to the regexp code after the quantifier */
ni = i + step + 1;
if (ni < re_len && re[ni] == '?') {
non_greedy = 1;
ni++;
}
do {
if ((n1 = bar(re + i, step, s + j2, s_len - j2, info, bi)) > 0) {
j2 += n1;
}
if (re[i + step] == '+' && n1 < 0) break;
if (ni >= re_len) {
/* After quantifier, there is nothing */
nj = j2;
} else if ((n2 = bar(re + ni, re_len - ni, s + j2,
s_len - j2, info, bi)) >= 0) {
/* Regex after quantifier matched */
nj = j2 + n2;
}
if (nj > j && non_greedy) break;
} while (n1 > 0);
if (n1 < 0 && re[i + step] == '*' &&
(n2 = bar(re + ni, re_len - ni, s + j, s_len - j, info, bi)) > 0) {
nj = j + n2;
}
DBG(("STAR/PLUS END: %d %d %d %d %d\n", j, nj, re_len - ni, n1, n2));
FAIL_IF(re[i + step] == '+' && nj == j, SLRE_NO_MATCH);
/* If while loop body above was not executed for the * quantifier, */
/* make sure the rest of the regex matches */
FAIL_IF(nj == j && ni < re_len && n2 < 0, SLRE_NO_MATCH);
/* Returning here cause we've matched the rest of RE already */
return nj;
}
continue;
}
if (re[i] == '[') {
n = match_set(re + i + 1, re_len - (i + 2), s + j, info);
DBG(("SET %.*s [%.*s] -> %d\n", step, re + i, s_len - j, s + j, n));
FAIL_IF(n <= 0, SLRE_NO_MATCH);
j += n;
} else if (re[i] == '(') {
n = SLRE_NO_MATCH;
bi++;
FAIL_IF(bi >= info->num_brackets, SLRE_INTERNAL_ERROR);
DBG(("CAPTURING [%.*s] [%.*s] [%s]\n",
step, re + i, s_len - j, s + j, re + i + step));
if (re_len - (i + step) <= 0) {
/* Nothing follows brackets */
n = doh(s + j, s_len - j, info, bi);
} else {
int j2;
for (j2 = 0; j2 <= s_len - j; j2++) {
if ((n = doh(s + j, s_len - (j + j2), info, bi)) >= 0 &&
bar(re + i + step, re_len - (i + step),
s + j + n, s_len - (j + n), info, bi) >= 0) break;
}
}
DBG(("CAPTURED [%.*s] [%.*s]:%d\n", step, re + i, s_len - j, s + j, n));
FAIL_IF(n < 0, n);
if (info->caps != NULL) {
info->caps[bi - 1].ptr = s + j;
info->caps[bi - 1].len = n;
}
j += n;
} else if (re[i] == '^') {
FAIL_IF(j != 0, SLRE_NO_MATCH);
} else if (re[i] == '$') {
FAIL_IF(j != s_len, SLRE_NO_MATCH);
} else {
FAIL_IF(j >= s_len, SLRE_NO_MATCH);
n = match_op((unsigned char *) (re + i), (unsigned char *) (s + j), info);
FAIL_IF(n <= 0, n);
j += n;
}
}
return j;
}
/* Process branch points */
static int doh(const char *s, int s_len, struct regex_info *info, int bi) {
const struct bracket_pair *b = &info->brackets[bi];
int i = 0, len, result;
const char *p;
do {
p = i == 0 ? b->ptr : info->branches[b->branches + i - 1].schlong + 1;
len = b->num_branches == 0 ? b->len :
i == b->num_branches ? (int) (b->ptr + b->len - p) :
(int) (info->branches[b->branches + i].schlong - p);
DBG(("%s %d %d [%.*s] [%.*s]\n", __func__, bi, i, len, p, s_len, s));
result = bar(p, len, s, s_len, info, bi);
DBG(("%s <- %d\n", __func__, result));
} while (result <= 0 && i++ < b->num_branches); /* At least 1 iteration */
return result;
}
static int baz(const char *s, int s_len, struct regex_info *info) {
int i, result = -1, is_anchored = info->brackets[0].ptr[0] == '^';
for (i = 0; i <= s_len; i++) {
result = doh(s + i, s_len - i, info, 0);
if (result >= 0) {
result += i;
break;
}
if (is_anchored) break;
}
return result;
}
static void setup_branch_points(struct regex_info *info) {
int i, j;
struct branch tmp;
/* First, sort branches. Must be stable, no qsort. Use bubble algo. */
for (i = 0; i < info->num_branches; i++) {
for (j = i + 1; j < info->num_branches; j++) {
if (info->branches[i].bracket_index > info->branches[j].bracket_index) {
tmp = info->branches[i];
info->branches[i] = info->branches[j];
info->branches[j] = tmp;
}
}
}
/*
* For each bracket, set their branch points. This way, for every bracket
* (i.e. every chunk of regex) we know all branch points before matching.
*/
for (i = j = 0; i < info->num_brackets; i++) {
info->brackets[i].num_branches = 0;
info->brackets[i].branches = j;
while (j < info->num_branches && info->branches[j].bracket_index == i) {
info->brackets[i].num_branches++;
j++;
}
}
}
static int foo(const char *re, int re_len, const char *s, int s_len,
struct regex_info *info) {
int i, step, depth = 0;
/* First bracket captures everything */
info->brackets[0].ptr = re;
info->brackets[0].len = re_len;
info->num_brackets = 1;
/* Make a single pass over regex string, memorize brackets and branches */
for (i = 0; i < re_len; i += step) {
step = get_op_len(re + i, re_len - i);
if (re[i] == '|') {
FAIL_IF(info->num_branches >= (int) ARRAY_SIZE(info->branches),
SLRE_TOO_MANY_BRANCHES);
info->branches[info->num_branches].bracket_index =
info->brackets[info->num_brackets - 1].len == -1 ?
info->num_brackets - 1 : depth;
info->branches[info->num_branches].schlong = &re[i];
info->num_branches++;
} else if (re[i] == '\\') {
FAIL_IF(i >= re_len - 1, SLRE_INVALID_METACHARACTER);
if (re[i + 1] == 'x') {
/* Hex digit specification must follow */
FAIL_IF(re[i + 1] == 'x' && i >= re_len - 3,
SLRE_INVALID_METACHARACTER);
FAIL_IF(re[i + 1] == 'x' && !(isxdigit(re[i + 2]) &&
isxdigit(re[i + 3])), SLRE_INVALID_METACHARACTER);
} else {
FAIL_IF(!is_metacharacter((unsigned char *) re + i + 1),
SLRE_INVALID_METACHARACTER);
}
} else if (re[i] == '(') {
FAIL_IF(info->num_brackets >= (int) ARRAY_SIZE(info->brackets),
SLRE_TOO_MANY_BRACKETS);
depth++; /* Order is important here. Depth increments first. */
info->brackets[info->num_brackets].ptr = re + i + 1;
info->brackets[info->num_brackets].len = -1;
info->num_brackets++;
FAIL_IF(info->num_caps > 0 && info->num_brackets - 1 > info->num_caps,
SLRE_CAPS_ARRAY_TOO_SMALL);
} else if (re[i] == ')') {
int ind = info->brackets[info->num_brackets - 1].len == -1 ?
info->num_brackets - 1 : depth;
info->brackets[ind].len = (int) (&re[i] - info->brackets[ind].ptr);
DBG(("SETTING BRACKET %d [%.*s]\n",
ind, info->brackets[ind].len, info->brackets[ind].ptr));
depth--;
FAIL_IF(depth < 0, SLRE_UNBALANCED_BRACKETS);
FAIL_IF(i > 0 && re[i - 1] == '(', SLRE_NO_MATCH);
}
}
FAIL_IF(depth != 0, SLRE_UNBALANCED_BRACKETS);
setup_branch_points(info);
return baz(s, s_len, info);
}
int slre_match(const char *regexp, const char *s, int s_len,
struct slre_cap *caps, int num_caps, int flags) {
struct regex_info info;
/* Initialize info structure */
info.flags = flags;
info.num_brackets = info.num_branches = 0;
info.num_caps = num_caps;
info.caps = caps;
DBG(("========================> [%s] [%.*s]\n", regexp, s_len, s));
return foo(regexp, (int) strlen(regexp), s, s_len, &info);
}

60
ROBITools/src/slre.h Executable file
View File

@@ -0,0 +1,60 @@
/*
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
* Copyright (c) 2013 Cesanta Software Limited
* All rights reserved
*
* This library is dual-licensed: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. For the terms of this
* license, see <http://www.gnu.org/licenses/>.
*
* You are free to use this library under the terms of the GNU General
* Public License, but WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* Alternatively, you can license this library under a commercial
* license, as set out in <http://cesanta.com/products.html>.
*/
/*
* This is a regular expression library that implements a subset of Perl RE.
* Please refer to README.md for a detailed reference.
*/
#ifndef SLRE_HEADER_DEFINED
#define SLRE_HEADER_DEFINED
#ifdef __cplusplus
extern "C" {
#endif
struct slre_cap {
const char *ptr;
int len;
};
int slre_match(const char *regexp, const char *buf, int buf_len,
struct slre_cap *caps, int num_caps, int flags);
/* Possible flags for slre_match() */
enum { SLRE_IGNORE_CASE = 1 };
/* slre_match() failure codes */
#define SLRE_NO_MATCH -1
#define SLRE_UNEXPECTED_QUANTIFIER -2
#define SLRE_UNBALANCED_BRACKETS -3
#define SLRE_INTERNAL_ERROR -4
#define SLRE_INVALID_CHARACTER_SET -5
#define SLRE_INVALID_METACHARACTER -6
#define SLRE_CAPS_ARRAY_TOO_SMALL -7
#define SLRE_TOO_MANY_BRANCHES -8
#define SLRE_TOO_MANY_BRACKETS -9
#ifdef __cplusplus
}
#endif
#endif /* SLRE_HEADER_DEFINED */

BIN
ROBITools/src/slre.o Normal file

Binary file not shown.

190
dev_notes.txt Normal file
View File

@@ -0,0 +1,190 @@
package.skeleton("ROBITools",c("robitools.motu.count","robitools.motus",
"robitools.reads","robitools.samples",
"robitools.sample.count",))
#include <R.h>
#include <Rinternals.h>
static void cooked_goose(SEXP foo)
{
if (TYPEOF(foo) != EXTPTRSXP)
error("argument not external pointer");
double *x = (double *) R_ExternalPtrAddr(foo);
int blather = x[0];
Free(x);
if (blather)
printf("finalizer ran\n");
}
SEXP blob(SEXP nin, SEXP blatherin)
{
if (! isInteger(nin))
error("n not integer");
int n = INTEGER(nin)[0];
if (! (n > 0))
error("n not positive");
if (! isLogical(blatherin))
error("blather not logical");
int blather = LOGICAL(blatherin)[0];
double *x = Calloc(n + 2, double);
GetRNGstate();
for (int i = 0; i < n; ++i)
x[i + 2] = norm_rand();
PutRNGstate();
x[1] = n;
x[0] = blather;
SEXP bar;
PROTECT(bar = R_MakeExternalPtr(x, R_NilValue, R_NilValue));
R_RegisterCFinalizer(bar, cooked_goose);
UNPROTECT(1);
return bar;
}
SEXP blub(SEXP foo)
{
if (TYPEOF(foo) != EXTPTRSXP)
error("argument not external pointer");
double *x = (double *) R_ExternalPtrAddr(foo);
int blather = x[0];
int n = x[1];
SEXP bar;
PROTECT(bar = allocVector(REALSXP, n));
for (int i = 0; i < n; ++i)
REAL(bar)[i] = x[i + 2];
UNPROTECT(1);
return bar;
}
blob <- function(n, blather = FALSE) {
stopifnot(is.numeric(n))
stopifnot(as.integer(n) == n)
stopifnot(n > 0)
stopifnot(is.logical(blather))
.Call("blob", as.integer(n), blather)
}
blub <- function(x) {
stopifnot(class(x) == "externalptr")
.Call("blub", x)
}
Hi Robert,
It looks like there is no way to explicitly make an S4 object call a
function when it is garbage collected unless you resort to tricks with
reg.finalizer.
It turns out that Prof. Ripley's reply (thanks!!) had enough hints in it
that I was able to get the effect I wanted by using R's external pointer
facility. In fact it works quite nicely.
In a nutshell, I create a C++ object (with new) and then wrap its pointer
with an R external pointer using
SEXP rExtPtr = R_MakeExternalPtr( cPtr, aTag, R_NilValue);
Where cPtr is the C++/C pointer to the object and aTag is an R symbol
describing the pointer type [e.g. SEXP aTag =
install("this_is_a_tag_for_a_pointer_to_my_object")]. The final argument is
"a value to protect". I don't know what this means, but all of the examples
I saw use R_NilValue.
If you want a C++ function to be called when R loses the reference to the
external pointer (actually when R garbage collects it, or when R quits), do
R_RegisterCFinalizerEx( rExtPtr, (R_CFinalizer_t)functionToBeCalled, TRUE );
The TRUE means that R will call the "functionToBeCalled" if the pointer is
still around when R quits. I guess if you set it to FALSE, then you are
assuming that your shell can delete memory and/or release resources when R
quits.
So return this external pointer to R (the function that new'ed it was called
by .Call or something similar) and stick it in a slot of your object. Then
when your object is garbage collected, "functionToBeCalled" will be called.
The slot would have the type "externalptr".
The functionToBeCalled contains the code to delete the C++ pointer or
release resources, for example...
SEXP functionToBeCalled( SEXP rExtPtr ) {
// Get the C++ pointer
MyThing* ptr = R_ExternalPtrAddr(rExtPtr);
// Delete it
delete ptr;
// Clear the external pointer
R_ClearExternalPtr(rExtPtr);
return R_NilValue;
}
And there you have it.
There doesn't seem to be any official documentation on this stuff (at least
none that I could find). The best references I found are on the R developers
web page. See the links within "some notes on _references, external
objects, or mutable state_ for R and a _simple implementation_ of external
references and finalization". Note that the documents are slightly out of
date (the function names have apparently been changed somewhat). The latter
one has some examples that are very helpful. And as Prof. Ripley pointed
out, RODBC uses this facility too, so look at that code.
Hope this was useful. Good luck.
SEXP
get(SEXP ext)
{
return mkString((char *) R_ExternalPtrAddr(ext));
}
SEXP
set(SEXP ext, SEXP str)
{
char *x = (char *) R_ExternalPtrAddr(ext);
snprintf(x, N_MAX, CHAR(STRING_ELT(str, 0)));
return ScalarLogical(TRUE);
}
> dyn.load("tmp.so")
> x <- .Call("create", list("info could be any R object", 1:5))
> .Call("get", x)
[1] "my name is joe"
> ## reference semantics!
> .Call("set", x, "i am sam i am")
[1] TRUE
> .Call("get", x)
[1] "i am sam i am"
> x <- NULL
> gc()
finalizing
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 339306 18.2 467875 25 407500 21.8
Vcells 202064 1.6 786432 6 380515 3.0
SEXP
incr(SEXP ext)
{
struct Foo *foo = (struct Foo*) R_ExternalPtrAddr(ext);
foo->x += 1;
return ScalarInteger(foo->x);
}
library(ROBITools)
library.dynam('ROBITools.so')
t=.Call('R_read_taxonomy','ecochange',TRUE)
.Call('R_get_scientific_name',t,as.integer(7742))