Initial commit
This commit is contained in:
19
ROBITools.Rproj
Normal file
19
ROBITools.Rproj
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
Version: 1.0
|
||||||
|
|
||||||
|
RestoreWorkspace: Default
|
||||||
|
SaveWorkspace: Default
|
||||||
|
AlwaysSaveHistory: Default
|
||||||
|
|
||||||
|
EnableCodeIndexing: Yes
|
||||||
|
UseSpacesForTab: Yes
|
||||||
|
NumSpacesForTab: 2
|
||||||
|
Encoding: ISO-8859-1
|
||||||
|
|
||||||
|
RnwWeave: knitr
|
||||||
|
LaTeX: pdfLaTeX
|
||||||
|
|
||||||
|
BuildType: Package
|
||||||
|
PackageUseDevtools: Yes
|
||||||
|
PackagePath: ROBITools
|
||||||
|
PackageInstallArgs: --no-multiarch --with-keep.source
|
||||||
|
PackageRoxygenize: rd,collate,namespace
|
3
ROBITools/.gitignore
vendored
Normal file
3
ROBITools/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
/man/
|
||||||
|
/loopbenchmark.R
|
||||||
|
/Read-and-delete-me
|
38
ROBITools/DESCRIPTION
Normal file
38
ROBITools/DESCRIPTION
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
Package: ROBITools
|
||||||
|
Type: Package
|
||||||
|
Title: Metabarcoding data biodiversity analysis
|
||||||
|
Version: 0.1
|
||||||
|
Date: 2012-08-23
|
||||||
|
Author: LECA - Laboratoire d'ecologie alpine
|
||||||
|
Maintainer: LECA OBITools team <obitools@metabarcoding.org>
|
||||||
|
Description: More about what it does (maybe more than one line)
|
||||||
|
License: CeCILL v2.0
|
||||||
|
LazyLoad: yes
|
||||||
|
Roxygen: list(wrap = FALSE)
|
||||||
|
Collate:
|
||||||
|
's3objects.R'
|
||||||
|
'ROBITools.R'
|
||||||
|
'02_class_metabarcoding.data.R'
|
||||||
|
'aggregate.R'
|
||||||
|
'choose.taxonomy.R'
|
||||||
|
'contaslayer.R'
|
||||||
|
'distrib.extrapol.R'
|
||||||
|
'experimental.section.R'
|
||||||
|
'export-metabarcoding.R'
|
||||||
|
'read.obitab.R'
|
||||||
|
'import.metabarcoding.R'
|
||||||
|
'import.ngsfilter.R'
|
||||||
|
'layers.metabarcoding.R'
|
||||||
|
'metabarcoding_threshold.R'
|
||||||
|
'mstat.R'
|
||||||
|
'obiclean.R'
|
||||||
|
'pcrslayer.R'
|
||||||
|
'plot.PCRplate.R'
|
||||||
|
'plot.seqinsample.R'
|
||||||
|
'rarefy.R'
|
||||||
|
'read.ngsfilter.R'
|
||||||
|
'read.sumatra.R'
|
||||||
|
'taxoDBtree.R'
|
||||||
|
'taxonomic.resolution.R'
|
||||||
|
'taxonomy_classic_table.R'
|
||||||
|
RoxygenNote: 5.0.1
|
16
ROBITools/LICENSE-SLRE
Executable file
16
ROBITools/LICENSE-SLRE
Executable file
@@ -0,0 +1,16 @@
|
|||||||
|
Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
|
||||||
|
Copyright (c) 2013 Cesanta Software Limited
|
||||||
|
All rights reserved
|
||||||
|
|
||||||
|
This code is dual-licensed: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License version 2 as
|
||||||
|
published by the Free Software Foundation. For the terms of this
|
||||||
|
license, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
You are free to use this code under the terms of the GNU General
|
||||||
|
Public License, but WITHOUT ANY WARRANTY; without even the implied
|
||||||
|
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
See the GNU General Public License for more details.
|
||||||
|
|
||||||
|
Alternatively, you can license this code under a commercial
|
||||||
|
license, as set out in <http://cesanta.com/>.
|
52
ROBITools/NAMESPACE
Normal file
52
ROBITools/NAMESPACE
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
# Generated by roxygen2: do not edit by hand
|
||||||
|
|
||||||
|
S3method(aggregate,metabarcoding.data)
|
||||||
|
S3method(plot,PCRplate)
|
||||||
|
S3method(plot,seqinsample)
|
||||||
|
S3method(summary,taxores)
|
||||||
|
export(addS3Class)
|
||||||
|
export(colnames)
|
||||||
|
export(const.threshold.mask)
|
||||||
|
export(contaslayer)
|
||||||
|
export(createS3Class)
|
||||||
|
export(dbtree)
|
||||||
|
export(dist.center.group)
|
||||||
|
export(dist.clique.group)
|
||||||
|
export(dist.grid)
|
||||||
|
export(dm.univariate)
|
||||||
|
export(extracts.obiclean)
|
||||||
|
export(extracts.obiclean_cluster)
|
||||||
|
export(extrapol.freq)
|
||||||
|
export(get.classic.taxonomy)
|
||||||
|
export(import.metabarcoding.data)
|
||||||
|
export(import.ngsfilter.data)
|
||||||
|
export(layer.names)
|
||||||
|
export(m.bivariate)
|
||||||
|
export(m.univariate)
|
||||||
|
export(m.univariate.test)
|
||||||
|
export(m.weight)
|
||||||
|
export(map.extrapol.freq)
|
||||||
|
export(marginalsum)
|
||||||
|
export(metabarcoding.data)
|
||||||
|
export(motus)
|
||||||
|
export(normalize)
|
||||||
|
export(rarefy)
|
||||||
|
export(read.ngsfilter)
|
||||||
|
export(read.obitab)
|
||||||
|
export(reads)
|
||||||
|
export(rmS3Class)
|
||||||
|
export(rownames)
|
||||||
|
export(samples)
|
||||||
|
export(taxo.decider)
|
||||||
|
export(threshold)
|
||||||
|
export(threshold.mask)
|
||||||
|
export(threshold.set)
|
||||||
|
exportClasses(metabarcoding.data)
|
||||||
|
exportMethods("$")
|
||||||
|
exportMethods("$<-")
|
||||||
|
exportMethods("[[")
|
||||||
|
exportMethods("[[<-")
|
||||||
|
exportMethods(colnames)
|
||||||
|
exportMethods(rownames)
|
||||||
|
import(ROBITaxonomy)
|
||||||
|
import(igraph)
|
539
ROBITools/R/02_class_metabarcoding.data.R
Normal file
539
ROBITools/R/02_class_metabarcoding.data.R
Normal file
@@ -0,0 +1,539 @@
|
|||||||
|
#' @include ROBITools.R
|
||||||
|
#' @include s3objects.R
|
||||||
|
#' @import ROBITaxonomy
|
||||||
|
NULL
|
||||||
|
|
||||||
|
require(ROBITaxonomy)
|
||||||
|
|
||||||
|
#
|
||||||
|
# FOR THE DEVELOPPER : we have to check that the code doesn't relies on the
|
||||||
|
# fact that the xx@samples$sample column is not always
|
||||||
|
# identical to the rownames(xx@samples)
|
||||||
|
|
||||||
|
setClassUnion("characterOrNULL",c("character","NULL"))
|
||||||
|
setClassUnion("matrixOrfactorL",c("matrix","factor"))
|
||||||
|
|
||||||
|
#
|
||||||
|
# We specialize data.frame in two subclasses motus.frame and samples.frame
|
||||||
|
# for this we add to function insuring the type checking and the cast from
|
||||||
|
# data.frame
|
||||||
|
#
|
||||||
|
|
||||||
|
is.motus.frame= function(x) any(class(x)=="motus.frame")
|
||||||
|
is.samples.frame= function(x) any(class(x)=="samples.frame")
|
||||||
|
|
||||||
|
as.motus.frame= function(x) {
|
||||||
|
if (! is.data.frame(x))
|
||||||
|
stop("only cast from data.frame is allowed")
|
||||||
|
if (! is.motus.frame(x))
|
||||||
|
x = addS3Class(x,"motus.frame")
|
||||||
|
|
||||||
|
return(x)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
as.samples.frame= function(x) {
|
||||||
|
if (! is.data.frame(x))
|
||||||
|
stop("only cast from data.frame is allowed")
|
||||||
|
if (! is.samples.frame(x))
|
||||||
|
x = addS3Class(x,"samples.frame")
|
||||||
|
return(x)
|
||||||
|
}
|
||||||
|
|
||||||
|
samples.frame=as.samples.frame
|
||||||
|
motus.frame=as.motus.frame
|
||||||
|
|
||||||
|
as.factor.or.matrix = function(x) {
|
||||||
|
if (is.matrix(x))
|
||||||
|
return(x)
|
||||||
|
|
||||||
|
if (is.factor(x)){
|
||||||
|
if (length(dim(x))!=2)
|
||||||
|
stop('Just factor with two dimensions are allowed')
|
||||||
|
return(x)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!is.data.frame(x))
|
||||||
|
stop('Just matrix, 2D factor and data.frame can be casted')
|
||||||
|
|
||||||
|
tps = sapply(x,class)
|
||||||
|
allna = sapply(x, function(y) all(is.na(y)))
|
||||||
|
|
||||||
|
if (all(tps==tps[[1]] | allna)) {
|
||||||
|
tps = tps[[1]]
|
||||||
|
}
|
||||||
|
else
|
||||||
|
stop('all the column of the data.frame must have the same type')
|
||||||
|
|
||||||
|
tps = tps[[1]]
|
||||||
|
|
||||||
|
x = as.matrix(x)
|
||||||
|
dx = dim(x)
|
||||||
|
if (tps=='factor')
|
||||||
|
x = factor(x)
|
||||||
|
dim(x)=dx
|
||||||
|
|
||||||
|
return(x)
|
||||||
|
}
|
||||||
|
|
||||||
|
#' DNA metabarcoding experiment description class
|
||||||
|
#'
|
||||||
|
#' A S4 class describing a DNA metabarcoding experiment. It groups
|
||||||
|
#' three data frames describing samples, motus and occurrences of
|
||||||
|
#' MOTUs per sample
|
||||||
|
#'
|
||||||
|
#'@section Slots:
|
||||||
|
#' \describe{
|
||||||
|
#' \item{\code{reads}:}{Matrix of class \code{"numeric"},
|
||||||
|
#' containing the counts of reads per samples
|
||||||
|
#' \itemize{
|
||||||
|
#' \item{1 samples per line}
|
||||||
|
#' \item{1 sequence per column}
|
||||||
|
#' }
|
||||||
|
#' }
|
||||||
|
#'
|
||||||
|
#' \item{\code{samples}:}{Object of class \code{"data.frame"}, describing samples
|
||||||
|
#' \itemize{
|
||||||
|
#' \item{1 samples per line}
|
||||||
|
#' \item{1 property per column}
|
||||||
|
#' }
|
||||||
|
#' }
|
||||||
|
#'
|
||||||
|
#' \item{\code{motus}:}{Object of class \code{"data.frame"}, describing MOTUs (sequences)
|
||||||
|
#' \itemize{
|
||||||
|
#' \item{1 MOTU per line}
|
||||||
|
#' \item{1 property per column}
|
||||||
|
#' }
|
||||||
|
#' }
|
||||||
|
#'
|
||||||
|
#' \item{\code{layers}:}{Object of class \code{"list"}, containing a set of data layers
|
||||||
|
#' linking motus and samples. Each element of the list is a matrix
|
||||||
|
#' of the same size than the \code{read} slot with
|
||||||
|
#' \itemize{
|
||||||
|
#' \item{1 samples per line}
|
||||||
|
#' \item{1 sequence per column}
|
||||||
|
#' }
|
||||||
|
#' }
|
||||||
|
#'
|
||||||
|
#' \item{\code{scount}:}{Object of class \code{"integer"}, containing the count of sample}
|
||||||
|
#'
|
||||||
|
#' \item{\code{mcount}:}{Object of class \code{"integer"}, containing the count of MOTUs}
|
||||||
|
#'
|
||||||
|
#' \item{\code{sample.margin}:}{Vector of class \code{"numeric"}, describing the total count of
|
||||||
|
#' sequence per sample. By default this slot is set by applying sum
|
||||||
|
#' to the reads data.frame lines}
|
||||||
|
#'
|
||||||
|
#' \item{\code{taxonomy}:}{Object of class \code{"taxonomy.obitools"}, linking the DNA metabarcoding
|
||||||
|
#' experiment to a taxonomy}
|
||||||
|
#'
|
||||||
|
#' \item{\code{taxid}:}{Vector of class \code{"character"}, list of MOTUs' attributes to manage as taxid}
|
||||||
|
#' }
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{taxonomy.obitools}},
|
||||||
|
#' @name metabarcoding.data
|
||||||
|
#' @rdname metabarcoding-data-class
|
||||||
|
#' @keywords DNA metabarcoding
|
||||||
|
#' @author Eric Coissac
|
||||||
|
#' @exportClass metabarcoding.data
|
||||||
|
|
||||||
|
setClass("metabarcoding.data",
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# Attribute declaration
|
||||||
|
#
|
||||||
|
|
||||||
|
representation(reads = "matrix",
|
||||||
|
samples = "data.frame",
|
||||||
|
motus = "data.frame",
|
||||||
|
layers = "list",
|
||||||
|
scount = "integer",
|
||||||
|
mcount = "integer",
|
||||||
|
sample.margin = "numeric",
|
||||||
|
taxonomy = "obitools.taxonomyOrNULL",
|
||||||
|
taxid = "characterOrNULL"
|
||||||
|
),
|
||||||
|
|
||||||
|
#
|
||||||
|
# Check object structure
|
||||||
|
#
|
||||||
|
|
||||||
|
validity = function(object) {
|
||||||
|
|
||||||
|
## object : nom reserve !
|
||||||
|
|
||||||
|
#
|
||||||
|
# Check that reads / samples and motus data.frames
|
||||||
|
# have compatible sizes
|
||||||
|
#
|
||||||
|
# reads line count = samples line count
|
||||||
|
# reads column count = motus line count
|
||||||
|
|
||||||
|
rsize = dim(object@reads)
|
||||||
|
ssize = dim(object@samples)
|
||||||
|
msize = dim(object@motus)
|
||||||
|
csize = length(object@sample.margin)
|
||||||
|
|
||||||
|
if (rsize[1] != ssize[1] &
|
||||||
|
rsize[2] != msize[1] &
|
||||||
|
rsize[1] != csize)
|
||||||
|
return(FALSE)
|
||||||
|
|
||||||
|
|
||||||
|
# if no layer, object is ok
|
||||||
|
|
||||||
|
if (length(object@layers)==0)
|
||||||
|
return(TRUE)
|
||||||
|
|
||||||
|
# otherwise we check the size of each layer as we
|
||||||
|
# did for reads
|
||||||
|
|
||||||
|
return(! any(sapply(object@layers,
|
||||||
|
function(l) any(dim(l)!=c(ssize[1],msize[1])))))
|
||||||
|
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
#' metabarcoding.data constructor
|
||||||
|
#'
|
||||||
|
#' @docType methods
|
||||||
|
#' @rdname initialize-methods
|
||||||
|
#' @aliases initialize-methods,metabarcoding.data
|
||||||
|
setMethod("initialize",
|
||||||
|
"metabarcoding.data",
|
||||||
|
function(.Object, reads,samples,motus,
|
||||||
|
taxonomy=NULL,taxid=NULL,
|
||||||
|
sample.margin=NA,
|
||||||
|
layers=list()) {
|
||||||
|
|
||||||
|
rn = rownames(reads)
|
||||||
|
cn = colnames(reads)
|
||||||
|
|
||||||
|
.Object@reads <- reads
|
||||||
|
|
||||||
|
# .Object@samples <- as.samples.frame(samples)
|
||||||
|
.Object@samples <- samples
|
||||||
|
row.names(.Object@samples) = rn
|
||||||
|
|
||||||
|
#.Object@motus <- as.motus.frame(motus)
|
||||||
|
.Object@motus <- motus
|
||||||
|
row.names(.Object@motus) = cn
|
||||||
|
|
||||||
|
|
||||||
|
# Set colnames and rownames to each layers
|
||||||
|
layers = lapply(layers, function(x) {colnames(x)=cn
|
||||||
|
rownames(x)=rn
|
||||||
|
return(x)})
|
||||||
|
.Object@layers <- layers
|
||||||
|
|
||||||
|
# Precompute sample count and motu count
|
||||||
|
|
||||||
|
.Object@scount = dim(.Object@samples)[1]
|
||||||
|
.Object@mcount = dim(.Object@motus)[1]
|
||||||
|
|
||||||
|
.Object@taxonomy = taxonomy
|
||||||
|
.Object@taxid = taxid
|
||||||
|
|
||||||
|
if (is.null(sample.margin))
|
||||||
|
.Object@sample.margin = rowSums(reads)
|
||||||
|
else
|
||||||
|
.Object@sample.margin = sample.margin
|
||||||
|
|
||||||
|
names(.Object@sample.margin) = rn
|
||||||
|
|
||||||
|
validObject(.Object) ## valide l'objet
|
||||||
|
|
||||||
|
return(.Object)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# metabarcoding.data getters
|
||||||
|
#
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("reads", function(obj) {
|
||||||
|
return(standardGeneric("reads"))
|
||||||
|
})
|
||||||
|
|
||||||
|
#' Extracts the matrix describing MOTUs abondances
|
||||||
|
#'
|
||||||
|
#' Extract the the matrix describing MOTUs abondances (read counts)
|
||||||
|
#' from a \code{\link{metabarcoding.data}} instance.
|
||||||
|
#'
|
||||||
|
#' @param obj a \code{\link{metabarcoding.data}} instance
|
||||||
|
#' @return a matrix containing data about reads
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' # load termite data set from the ROBITools sample data
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' # Extract the matrix describing MOTUs abondances
|
||||||
|
#' d = reads(termes)
|
||||||
|
#'
|
||||||
|
#' head(d)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{metabarcoding.data}},
|
||||||
|
#' \code{\link{motus}}, \code{\link{samples}}
|
||||||
|
#'
|
||||||
|
#' @docType methods
|
||||||
|
#' @rdname read-methods
|
||||||
|
#' @aliases read-methods,metabarcoding.data
|
||||||
|
#' @author Eric Coissac
|
||||||
|
#'
|
||||||
|
setMethod("reads", "metabarcoding.data", function(obj) {
|
||||||
|
return(obj@reads)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
# get samples data.frames
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("samples", function(obj) {
|
||||||
|
return(standardGeneric("samples"))
|
||||||
|
})
|
||||||
|
|
||||||
|
#' Extracts the samples description data.frame
|
||||||
|
#'
|
||||||
|
#' Extract the sample description data.frame from a
|
||||||
|
#' \code{\link{metabarcoding.data}} instance.
|
||||||
|
#'
|
||||||
|
#' @param obj a \code{\link{metabarcoding.data}} instance
|
||||||
|
#' @return a data.frame containing data about sample
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' # load termite data set from the ROBITools sample data
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' # Extract the data frame describing samples
|
||||||
|
#' d = samples(termes)
|
||||||
|
#'
|
||||||
|
#' head(d)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{metabarcoding.data}},
|
||||||
|
#' \code{\link{motus}}, \code{\link{reads}}
|
||||||
|
#'
|
||||||
|
#' @docType methods
|
||||||
|
#' @rdname samples-methods
|
||||||
|
#' @aliases samples-methods,metabarcoding.data
|
||||||
|
#' @author Eric Coissac
|
||||||
|
#'
|
||||||
|
setMethod("samples", "metabarcoding.data", function(obj) {
|
||||||
|
return(obj@samples)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("motus", function(obj) {
|
||||||
|
return(standardGeneric("motus"))
|
||||||
|
})
|
||||||
|
|
||||||
|
#' Extracts the MOTU descriptions \code{data.frame}
|
||||||
|
#'
|
||||||
|
#' Extract the MOTUs description \code{data.frame} from a
|
||||||
|
#' \code{\link{metabarcoding.data}} instance.
|
||||||
|
#'
|
||||||
|
#' @param obj a \code{\link{metabarcoding.data}} instance
|
||||||
|
#' @return a data.frame containing data about MOTU
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' # load termite data set from the ROBITools sample data
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' # Extract the data.frame describing MOTUs
|
||||||
|
#' d = motus(termes)
|
||||||
|
#'
|
||||||
|
#' head(d)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{metabarcoding.data}},
|
||||||
|
#' \code{\link{reads}}, \code{\link{samples}}
|
||||||
|
#'
|
||||||
|
#' @docType methods
|
||||||
|
#' @rdname motu-methods
|
||||||
|
#' @aliases motu-methods,metabarcoding.data
|
||||||
|
#'
|
||||||
|
setMethod("motus", "metabarcoding.data", function(obj) {
|
||||||
|
return(obj@motus)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
# get sample count
|
||||||
|
|
||||||
|
setGeneric("sample.count", function(obj) {
|
||||||
|
return(standardGeneric("sample.count"))
|
||||||
|
})
|
||||||
|
|
||||||
|
setMethod("sample.count", "metabarcoding.data", function(obj) {
|
||||||
|
return(obj@scount)
|
||||||
|
})
|
||||||
|
|
||||||
|
# get motu count
|
||||||
|
|
||||||
|
setGeneric("motu.count", function(obj) {
|
||||||
|
return(standardGeneric("motu.count"))
|
||||||
|
})
|
||||||
|
|
||||||
|
setMethod("motu.count", "metabarcoding.data", function(obj) {
|
||||||
|
return(obj@mcount)
|
||||||
|
})
|
||||||
|
|
||||||
|
# dim method
|
||||||
|
|
||||||
|
setMethod("dim", "metabarcoding.data", function(x) {
|
||||||
|
return(c(x@scount,x@mcount))
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
setMethod('[', "metabarcoding.data", function(x,i=NULL,j=NULL,...,drop=TRUE) {
|
||||||
|
|
||||||
|
# special case if samples are not specified (dimension 1)
|
||||||
|
if (!hasArg(i))
|
||||||
|
i = 1:x@scount
|
||||||
|
|
||||||
|
# special case if motus are not specified (dimension 2)
|
||||||
|
if (!hasArg(j))
|
||||||
|
j = 1:x@mcount
|
||||||
|
|
||||||
|
# special case if the layer attribut is specified
|
||||||
|
args = list(...)
|
||||||
|
|
||||||
|
if (!is.null(args$layer))
|
||||||
|
return(x[[args$layer]][i,j])
|
||||||
|
|
||||||
|
#####################
|
||||||
|
#
|
||||||
|
# normal case
|
||||||
|
#
|
||||||
|
|
||||||
|
r = x@reads[i,j,drop=FALSE]
|
||||||
|
|
||||||
|
if (sum(dim(r) > 1)==2 | ! drop)
|
||||||
|
{
|
||||||
|
|
||||||
|
# we do the selection on the motus and samples description data.frame
|
||||||
|
|
||||||
|
m = x@motus[j,,drop=FALSE]
|
||||||
|
s = x@samples[i,,drop=FALSE]
|
||||||
|
|
||||||
|
# we do the selection on each layers
|
||||||
|
l = lapply(x@layers,function(l) l[i,j,drop=FALSE])
|
||||||
|
|
||||||
|
newdata = copy.metabarcoding.data(x, reads=r, samples=s, motus=m, layers=l)
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
newdata = as.numeric(x@reads[i,j])
|
||||||
|
}
|
||||||
|
|
||||||
|
return(newdata)
|
||||||
|
|
||||||
|
})
|
||||||
|
|
||||||
|
setMethod('[<-', "metabarcoding.data",
|
||||||
|
function (x, i, j, ..., value) {
|
||||||
|
if (!hasArg(i))
|
||||||
|
i = 1:x@scount
|
||||||
|
|
||||||
|
if (!hasArg(j))
|
||||||
|
j = 1:x@mcount
|
||||||
|
|
||||||
|
args = list(...)
|
||||||
|
|
||||||
|
if (is.null(args$layer))
|
||||||
|
x@reads[i, j]=value
|
||||||
|
else
|
||||||
|
|
||||||
|
x[[args$layer]][i,j]=value
|
||||||
|
|
||||||
|
return(x)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#################################################
|
||||||
|
#
|
||||||
|
# User interface function to create
|
||||||
|
# metabarcoding.data objects
|
||||||
|
#
|
||||||
|
#################################################
|
||||||
|
|
||||||
|
#'@export
|
||||||
|
metabarcoding.data = function(reads,samples,motus,
|
||||||
|
taxonomy=NULL,taxid=NULL,
|
||||||
|
sample.margin=NULL,
|
||||||
|
layers=list()) {
|
||||||
|
rd = new('metabarcoding.data',
|
||||||
|
reads=reads,
|
||||||
|
samples=samples,
|
||||||
|
motus=motus,
|
||||||
|
taxonomy=taxonomy,
|
||||||
|
taxid=taxid,
|
||||||
|
sample.margin=sample.margin,
|
||||||
|
layers=layers
|
||||||
|
)
|
||||||
|
|
||||||
|
return(rd)
|
||||||
|
}
|
||||||
|
|
||||||
|
copy.metabarcoding.data = function(data,
|
||||||
|
reads=NULL,
|
||||||
|
samples=NULL,motus=NULL,
|
||||||
|
taxonomy=NULL,taxid=NULL,
|
||||||
|
sample.margin=NULL,
|
||||||
|
layers=NULL) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if (is.null(reads))
|
||||||
|
reads = data@reads
|
||||||
|
|
||||||
|
if (is.null(samples))
|
||||||
|
samples = data@samples
|
||||||
|
|
||||||
|
if (is.null(motus))
|
||||||
|
motus = data@motus
|
||||||
|
|
||||||
|
if (is.null(taxonomy))
|
||||||
|
taxonomy = data@taxonomy
|
||||||
|
|
||||||
|
if (is.null(taxid))
|
||||||
|
taxid = data@taxid
|
||||||
|
|
||||||
|
if (is.null(sample.margin))
|
||||||
|
sample.margin = data@sample.margin
|
||||||
|
|
||||||
|
if (is.null(layers))
|
||||||
|
layers = data@layers
|
||||||
|
|
||||||
|
|
||||||
|
rd = new('metabarcoding.data',
|
||||||
|
reads=reads,
|
||||||
|
samples=samples,
|
||||||
|
motus=motus,
|
||||||
|
taxonomy=taxonomy,
|
||||||
|
taxid=taxid,
|
||||||
|
sample.margin=sample.margin,
|
||||||
|
layers=layers
|
||||||
|
)
|
||||||
|
|
||||||
|
return(rd)
|
||||||
|
}
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric('rownames')
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setMethod("rownames", "metabarcoding.data", function(x, do.NULL = TRUE, prefix = "col") {
|
||||||
|
return(rownames(x@reads,do.NULL,prefix))
|
||||||
|
})
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric('colnames')
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setMethod("colnames", "metabarcoding.data", function(x, do.NULL = TRUE, prefix = "col") {
|
||||||
|
return(colnames(x@reads,do.NULL,prefix))
|
||||||
|
})
|
33
ROBITools/R/ROBITools.R
Normal file
33
ROBITools/R/ROBITools.R
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
#' A package to manipulate DNA metabarcoding data.
|
||||||
|
#'
|
||||||
|
#' This package was written as a following of the OBITools.
|
||||||
|
#'
|
||||||
|
#' \tabular{ll}{
|
||||||
|
#' Package: \tab ROBITools\cr
|
||||||
|
#' Type: \tab Package\cr
|
||||||
|
#' Version: \tab 0.1\cr
|
||||||
|
#' Date: \tab 2013-06-27\cr
|
||||||
|
#' License: \tab CeCILL 2.0\cr
|
||||||
|
#' LazyLoad: \tab yes\cr
|
||||||
|
#'}
|
||||||
|
#'
|
||||||
|
#' @name ROBITools-package
|
||||||
|
#' @aliases ROBITools
|
||||||
|
#' @docType package
|
||||||
|
#' @title A package to manipulate DNA metabarcoding data.
|
||||||
|
#' @author Frederic Boyer
|
||||||
|
#' @author Aurelie Bonin
|
||||||
|
#' @author Lucie Zinger
|
||||||
|
#' @author Eric Coissac
|
||||||
|
#'
|
||||||
|
#' @references http://metabarcoding.org/obitools
|
||||||
|
#'
|
||||||
|
NA
|
||||||
|
|
||||||
|
.onLoad <- function(libname, pkgname) {
|
||||||
|
|
||||||
|
packageStartupMessage( "ROBITools package" )
|
||||||
|
#print(getwd())
|
||||||
|
|
||||||
|
}
|
||||||
|
|
229
ROBITools/R/aggregate.R
Normal file
229
ROBITools/R/aggregate.R
Normal file
@@ -0,0 +1,229 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
# TODO: Add comment
|
||||||
|
#
|
||||||
|
# Author: coissac
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
aggregate.metabarcoding.data=function(x, by, FUN,...,
|
||||||
|
MARGIN='sample',
|
||||||
|
default.layer=NULL,
|
||||||
|
layers=NULL) {
|
||||||
|
|
||||||
|
uniq.value = function(z) {
|
||||||
|
|
||||||
|
if (is.null(z) |
|
||||||
|
any(is.na(z)) |
|
||||||
|
length(z)==0)
|
||||||
|
ans = NA
|
||||||
|
else {
|
||||||
|
if (all(z==z[1]))
|
||||||
|
ans = z[1]
|
||||||
|
else
|
||||||
|
ans = NA
|
||||||
|
}
|
||||||
|
if (is.factor(z))
|
||||||
|
ans = factor(ans,levels=levels(z))
|
||||||
|
|
||||||
|
return(ans)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# Deals with the supplementaty aggregate arguments
|
||||||
|
#
|
||||||
|
|
||||||
|
if (is.null(default.layer))
|
||||||
|
default.layer=uniq.value
|
||||||
|
|
||||||
|
|
||||||
|
if (is.null(layers)) {
|
||||||
|
layers = as.list(rep(c(default.layer),length(x@layers)))
|
||||||
|
names(layers)=layer.names(x)
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
for (n in layer.names(x))
|
||||||
|
if (is.null(layers[[n]]))
|
||||||
|
layers[[n]]=default.layers
|
||||||
|
}
|
||||||
|
|
||||||
|
if (MARGIN == 'sample')
|
||||||
|
MARGIN=1
|
||||||
|
|
||||||
|
if (MARGIN == 'motu')
|
||||||
|
MARGIN=2
|
||||||
|
|
||||||
|
reads = x@reads
|
||||||
|
|
||||||
|
if (MARGIN==1) {
|
||||||
|
# prepare the aggrevation arguments for the read table
|
||||||
|
# from the function arguments
|
||||||
|
dotted = list(...)
|
||||||
|
if (length(dotted) > 0)
|
||||||
|
aggr.args = list(reads,by=by,FUN=FUN,...=dotted,simplify=FALSE)
|
||||||
|
else
|
||||||
|
aggr.args = list(reads,by=by,FUN=FUN,simplify=FALSE)
|
||||||
|
|
||||||
|
# Aggregate the read table
|
||||||
|
ragr = do.call(aggregate,aggr.args)
|
||||||
|
|
||||||
|
# extrat new ids from the aggregated table
|
||||||
|
ncat = length(by)
|
||||||
|
ids = as.character(interaction(ragr[,1:ncat,drop=FALSE]))
|
||||||
|
|
||||||
|
# remove the aggregations modalities to rebuild a correct
|
||||||
|
# reads table
|
||||||
|
ragr = as.matrix(ragr[,-(1:ncat),drop=FALSE])
|
||||||
|
dragr= dim(ragr)
|
||||||
|
cragr= colnames(ragr)
|
||||||
|
ragr = as.numeric(ragr)
|
||||||
|
dim(ragr)=dragr
|
||||||
|
colnames(ragr)=cragr
|
||||||
|
rownames(ragr)=ids
|
||||||
|
|
||||||
|
#
|
||||||
|
# Apply the same aggragation to each layer
|
||||||
|
#
|
||||||
|
|
||||||
|
ln = layer.names(x)
|
||||||
|
|
||||||
|
la = vector(mode="list",length(ln))
|
||||||
|
names(la)=ln
|
||||||
|
|
||||||
|
for (n in ln) {
|
||||||
|
f = layers[[n]]
|
||||||
|
if (is.factor(x[[n]])){
|
||||||
|
isfact = TRUE
|
||||||
|
lf = levels(x[[n]])
|
||||||
|
df = dim(x[[n]])
|
||||||
|
m = matrix(as.character(x[[n]]))
|
||||||
|
dim(m)=df
|
||||||
|
}
|
||||||
|
else
|
||||||
|
m = x[[n]]
|
||||||
|
|
||||||
|
aggr.args = list(m,by=by,FUN=f,simplify=FALSE)
|
||||||
|
lagr = do.call(aggregate,aggr.args)
|
||||||
|
lagr = as.factor.or.matrix(lagr[,-(1:ncat),drop=FALSE])
|
||||||
|
|
||||||
|
if (isfact){
|
||||||
|
df = dim(lagr)
|
||||||
|
lagr = factor(lagr,levels=lf)
|
||||||
|
dim(lagr)=df
|
||||||
|
}
|
||||||
|
|
||||||
|
rownames(lagr)=ids
|
||||||
|
la[[n]]=lagr
|
||||||
|
}
|
||||||
|
|
||||||
|
# aggragate the sample table according to the same criteria
|
||||||
|
#
|
||||||
|
# TODO: We have to take special care of factors in the samples
|
||||||
|
# data.frame
|
||||||
|
|
||||||
|
sagr = aggregate(samples(x),by,uniq.value,simplify=FALSE)
|
||||||
|
|
||||||
|
# move the first columns of the resulting data frame (the aggregations
|
||||||
|
# modalities to the last columns of the data.frame
|
||||||
|
sagr = sagr[,c((ncat+1):(dim(sagr)[2]),1:ncat),drop=FALSE]
|
||||||
|
larg = c(lapply(sagr,unlist),list(stringsAsFactors=FALSE))
|
||||||
|
sagr = do.call(data.frame,larg)
|
||||||
|
|
||||||
|
# set samples ids to the ids computed from modalities
|
||||||
|
sagr$id=ids
|
||||||
|
rownames(sagr)=ids
|
||||||
|
|
||||||
|
# build the new metabarcoding data instance
|
||||||
|
newdata = copy.metabarcoding.data(x,reads=ragr,samples=sagr)
|
||||||
|
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
# prepare the aggregation arguments for the read table
|
||||||
|
# from the function arguments
|
||||||
|
# BECARFUL : the reads table is transposed
|
||||||
|
# standard aggregate runs by row and we want
|
||||||
|
# aggregation by column
|
||||||
|
|
||||||
|
dotted = list(...)
|
||||||
|
if (length(dotted) > 0)
|
||||||
|
aggr.args = list(t(reads),by=by,FUN=FUN,...=dotted,simplify=FALSE)
|
||||||
|
else
|
||||||
|
aggr.args = list(t(reads),by=by,FUN=FUN,simplify=FALSE)
|
||||||
|
|
||||||
|
|
||||||
|
# Aggregate the read table
|
||||||
|
ragr = do.call(aggregate.data.frame,aggr.args)
|
||||||
|
|
||||||
|
# extrat new ids from the aggregated table
|
||||||
|
ncat = length(by)
|
||||||
|
ids = as.character(interaction(ragr[,1:ncat,drop=FALSE]))
|
||||||
|
|
||||||
|
# remove the aggregations modalities to rebuild a correct
|
||||||
|
# reads table
|
||||||
|
|
||||||
|
ragr = t(ragr[,-(1:ncat),drop=FALSE])
|
||||||
|
dragr= dim(ragr)
|
||||||
|
rragr= rownames(ragr)
|
||||||
|
ragr = as.numeric(ragr)
|
||||||
|
dim(ragr)=dragr
|
||||||
|
colnames(ragr)=ids
|
||||||
|
rownames(ragr)=rragr
|
||||||
|
|
||||||
|
#
|
||||||
|
# Apply the same aggragation to each layer
|
||||||
|
#
|
||||||
|
|
||||||
|
ln = layer.names(x)
|
||||||
|
|
||||||
|
la = vector(mode="list",length(ln))
|
||||||
|
names(la)=ln
|
||||||
|
|
||||||
|
for (n in ln) {
|
||||||
|
f = layers[[n]]
|
||||||
|
|
||||||
|
if (is.factor(x[[n]])){
|
||||||
|
isfact = TRUE
|
||||||
|
lf = levels(x[[n]])
|
||||||
|
df = dim(x[[n]])
|
||||||
|
m = matrix(as.character(x[[n]]))
|
||||||
|
dim(m)=df
|
||||||
|
}
|
||||||
|
else
|
||||||
|
m = x[[n]]
|
||||||
|
|
||||||
|
aggr.args = list(t(m),by=by,FUN=f,simplify=FALSE)
|
||||||
|
lagr = do.call(aggregate,aggr.args)
|
||||||
|
lagr = t(as.factor.or.matrix(lagr[,-(1:ncat),drop=FALSE]))
|
||||||
|
|
||||||
|
if (isfact){
|
||||||
|
df = dim(lagr)
|
||||||
|
lagr = factor(lagr,levels=lf)
|
||||||
|
dim(lagr)=df
|
||||||
|
}
|
||||||
|
|
||||||
|
colnames(lagr)=ids
|
||||||
|
la[[n]]=lagr
|
||||||
|
}
|
||||||
|
|
||||||
|
# aggragate the motus table according to the same criteria
|
||||||
|
magr = aggregate(motus(x),by,uniq.value,simplify=FALSE)
|
||||||
|
|
||||||
|
# move the first columns of the resulting data frame (the aggregations
|
||||||
|
# modalities to the last columns of the data.frame
|
||||||
|
magr = magr[,c((ncat+1):(dim(magr)[2]),1:ncat),drop=FALSE]
|
||||||
|
larg = c(lapply(magr,unlist),list(stringsAsFactors=FALSE))
|
||||||
|
magr = do.call(data.frame,larg)
|
||||||
|
|
||||||
|
# set motus ids to the ids computed from modalities
|
||||||
|
magr$id=ids
|
||||||
|
rownames(magr)=ids
|
||||||
|
|
||||||
|
# build the new metabarcoding data instance
|
||||||
|
newdata = copy.metabarcoding.data(x,reads=ragr,motus=magr,layers=la)
|
||||||
|
}
|
||||||
|
|
||||||
|
return(newdata)
|
||||||
|
}
|
||||||
|
|
107
ROBITools/R/choose.taxonomy.R
Normal file
107
ROBITools/R/choose.taxonomy.R
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
#' @import ROBITaxonomy
|
||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#' Choose between databases for taxonomic classifications
|
||||||
|
#'
|
||||||
|
#' Chooses a sequence taxonomic assignment in order of preference for the different
|
||||||
|
#' reference databases that have been used when the assignment is above a certain threshold
|
||||||
|
#'
|
||||||
|
#'
|
||||||
|
#' @param x a \code{\link{metabarcoding.data}} object
|
||||||
|
#' @param taxonomy a \code{\linkS4class{taxonomy.obitools}} instance
|
||||||
|
#' @param dbrank string or vector indicating reference database names ranked by order of preference
|
||||||
|
#' @param thresh a best_identity threshold for applying priority. Default is \code{0.95}
|
||||||
|
#'
|
||||||
|
#' @return returns a data.frame with the refined taxonomic assignement and classic taxonomy description.
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#'
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' taxo=default.taxonomy()
|
||||||
|
#'
|
||||||
|
#' #create artificial taxonomic assignments
|
||||||
|
#' attr(termes, "motus")["best_identity:DB1"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
|
||||||
|
#' attr(termes, "motus")["best_identity:DB2"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
|
||||||
|
#' attr(termes, "motus")["best_identity:DB3"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
|
||||||
|
#' attr(termes, "motus")["taxid_by_db:DB1"] = termes$motus$taxid
|
||||||
|
#' attr(termes, "motus")["taxid_by_db:DB2"] = sample(termes$motus$taxid,size=nrow(termes$motus), replace=F)
|
||||||
|
#' attr(termes, "motus")["taxid_by_db:DB3"] = sample(termes$motus$taxid,size=nrow(termes$motus), replace=F)
|
||||||
|
#'
|
||||||
|
#' #Run taxo.decider
|
||||||
|
#' termes.ok = taxo.decider(termes, taxo, "DB2", 0.95)
|
||||||
|
#' head(termes.ok$motus[union(grep("DB", colnames(termes.ok$motus)), grep("_ok", colnames(termes.ok$motus)))])
|
||||||
|
#'
|
||||||
|
#' termes.ok = taxo.decider(termes, taxo, c("DB3", "DB1"), 0.95)
|
||||||
|
#' head(termes.ok$motus[union(grep("DB", colnames(termes.ok$motus)), grep("_ok", colnames(termes.ok$motus)))])
|
||||||
|
#'
|
||||||
|
#' #Quick look at the enhancement in taxonomic assignements
|
||||||
|
#' par(mfrow=c(1,4))
|
||||||
|
#' for(i in grep("best_identity.", colnames(termes.ok$motus))){
|
||||||
|
#' hist(termes.ok$motus[,i], breaks=20, ylim=c(1,21), main=colnames(termes.ok$motus)[i], xlab="assignment score")
|
||||||
|
#' }
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\linkS4class{taxonomy.obitools}}, and methods \code{\link{species}},\code{\link{genus}}, \code{\link{family}},\code{\link{kingdom}},
|
||||||
|
#' \code{\link{superkingdom}},\code{\link{taxonatrank}}, \code{\link{taxonmicank}}
|
||||||
|
#'
|
||||||
|
#' @author Lucie Zinger
|
||||||
|
#' @keywords taxonomy
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
|
||||||
|
taxo.decider = function(x, taxonomy, dbrank, thresh=0.95) {
|
||||||
|
|
||||||
|
noms = colnames(x$motus)
|
||||||
|
best_ids_names = noms[grep("best_identity.", noms)]
|
||||||
|
best_ids = x$motus[,best_ids_names]
|
||||||
|
taxids = x$motus[, gsub("best_identity", "taxid_by_db", best_ids_names)]
|
||||||
|
dbs = unlist(lapply(strsplit(best_ids_names, "\\:"), "[[", 2))
|
||||||
|
|
||||||
|
|
||||||
|
#Set max indices
|
||||||
|
ind = as.vector(t(apply(best_ids,1,function(y) order(rank(-y, ties.method="max"), match(dbrank, dbs))))[,1])
|
||||||
|
|
||||||
|
#Set default vector: db, bestids, taxids with max score
|
||||||
|
db_ok = dbs[ind]
|
||||||
|
best_identity_ok = best_ids[cbind(1:length(ind), ind)]
|
||||||
|
taxids_by_db_ok = taxids[cbind(1:length(ind), ind)]
|
||||||
|
|
||||||
|
#Get vector of db index that should be used according to condition > thresh
|
||||||
|
db_choice = taxo.decider.routine(dbrank, best_ids, dbs, thresh)
|
||||||
|
|
||||||
|
#Replacing by right values according to db_ok
|
||||||
|
for(i in 1:length(dbrank)){
|
||||||
|
db_ok[which(db_choice==i)] = dbrank[i]
|
||||||
|
best_identity_ok[which(db_choice==i)] = best_ids[which(db_choice==i),grep(dbrank[i], colnames(best_ids))]
|
||||||
|
taxids_by_db_ok[which(db_choice==i)] = taxids[which(db_choice==i),grep(dbrank[i], colnames(taxids))]
|
||||||
|
}
|
||||||
|
|
||||||
|
decision = data.frame(db_ok, best_identity_ok, taxids_by_db_ok)
|
||||||
|
|
||||||
|
coltaxid = colnames(decision)[grep("taxid", colnames(decision))]
|
||||||
|
|
||||||
|
attr(x, "motus") = data.frame(x$motus, decision)
|
||||||
|
new.tax = get.classic.taxonomy(x, taxonomy, coltaxid)
|
||||||
|
|
||||||
|
attr(x, "motus") = data.frame(x$motus, new.tax)
|
||||||
|
|
||||||
|
return(x)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
taxo.decider.routine = function(dbrank, best_ids, dbs, thresh) {
|
||||||
|
#Setting mask
|
||||||
|
mask = matrix(NA,nrow(best_ids),length(dbrank))
|
||||||
|
colnames(mask)=dbrank
|
||||||
|
#For each DB, see if condition T/F
|
||||||
|
for(i in dbrank){
|
||||||
|
mask[,i] = best_ids[,which(dbs==i)]>thresh
|
||||||
|
}
|
||||||
|
#Get the first occurence of T in the table
|
||||||
|
out = apply(mask, 1, function(x) which(x==T)[1])
|
||||||
|
return(out)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
49
ROBITools/R/contaslayer.R
Normal file
49
ROBITools/R/contaslayer.R
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#' Detects contaminants in metabarcoding data
|
||||||
|
#'
|
||||||
|
#' Detects sequences/motus in a \code{\link{metabarcoding.data}} object
|
||||||
|
#' for which frequencies over the entire dataset are maximum in negative controls and
|
||||||
|
#' hence, most likely to be contaminants.
|
||||||
|
#'
|
||||||
|
#'
|
||||||
|
#' @param x a \code{\link{metabarcoding.data}} object
|
||||||
|
#' @param controls a vector of samples names where conta are suspected to be detected
|
||||||
|
#' (typically negative control names).
|
||||||
|
#' @param clust a vector for grouping sequences. Default set to \code{NULL}.
|
||||||
|
#'
|
||||||
|
#' @return a vector containing the names of sequences identified as contaminants
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#'
|
||||||
|
#' data(termes)
|
||||||
|
#' termes.ok = termes[,colSums(termes$reads)>0]
|
||||||
|
#' neg = rownames(termes.ok)[grep("r",rownames(termes.ok))]
|
||||||
|
#'
|
||||||
|
#' #finds contaminants based on neg samples
|
||||||
|
#' contaslayer(termes.ok, neg)
|
||||||
|
#'
|
||||||
|
#' # extanding contamininant detection with grouping factor,
|
||||||
|
#' # typically obiclean/sumatra cluster or taxonomy membership
|
||||||
|
#' contaslayer(termes.ok, neg, termes.ok$motus$scientific_name)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{threshold}} for further trimming
|
||||||
|
#' @author Lucie Zinger
|
||||||
|
#' @export
|
||||||
|
|
||||||
|
contaslayer = function(x,controls,clust=NULL){
|
||||||
|
|
||||||
|
x.fcol = normalize(x, MARGIN=2)$reads
|
||||||
|
x.max = rownames(x.fcol[apply(x.fcol, 2, which.max),])
|
||||||
|
conta = colnames(x)[!is.na(match(x.max,controls))]
|
||||||
|
|
||||||
|
if (length(clust)!=0) {
|
||||||
|
agg = data.frame(conta.id=colnames(x.fcol), clust)
|
||||||
|
conta.ext = agg$conta.id[which(!is.na(match( agg$clust, agg$clust[match(conta,agg$conta.id)])))]
|
||||||
|
return(as.vector(conta.ext))
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return(conta)
|
||||||
|
}
|
||||||
|
}
|
178
ROBITools/R/distrib.extrapol.R
Normal file
178
ROBITools/R/distrib.extrapol.R
Normal file
@@ -0,0 +1,178 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#' Read frequencies krigging
|
||||||
|
#'
|
||||||
|
#' Extrapolates read frequencies from a \code{\link{metabarcoding.data}} object in space for a finer resolution
|
||||||
|
#'
|
||||||
|
#' @param x a vector or matrix from a row-normalized read table
|
||||||
|
#' \code{\link{metabarcoding.data}} object
|
||||||
|
#' @param min.coord a vector of length = 2 indicating the minimum values of x and y
|
||||||
|
#' coordinates to be used for the predicted grid
|
||||||
|
#' @param max.coord a vector of length = 2 indicating the maximum values of x and y
|
||||||
|
#' coordinates to be used for the predicted grid
|
||||||
|
#' @param grid.grain an integer indicating the resolution (i.e. nb of subpoints) in x and y
|
||||||
|
#' coordinates required for the predicted grid
|
||||||
|
#' @param coords a dataframe containing the x and y coordinates of the abundances
|
||||||
|
#' from x to be extrapolated.
|
||||||
|
#' @param otus.table a motus data.frame containing motus informations of x
|
||||||
|
#' @param cutoff a cutoff below which abundances are set to 0.
|
||||||
|
#' This threshold also determines the value to be added to 0 values for log10
|
||||||
|
#' transformation
|
||||||
|
#' @param return.metabarcoding.data if \code{TRUE}, returns a \code{\link{metabarcoding.data}} object. Default is \code{FALSE}
|
||||||
|
#'
|
||||||
|
#' @return either a dataframe or a S3 object with a structure similar to \code{\link{metabarcoding.data}} object.
|
||||||
|
#' The number of samples corresponds to the predicted points.
|
||||||
|
#' The two last columns (if \code{return.metabarcoding.data==F}) or sample data.frame contains x y coordinates of the predicted grid
|
||||||
|
#' The all but last two columns (if \code{return.metabarcoding.data==F}) or read matrix contains the predicted log10 transformed relative abundances
|
||||||
|
#' instead of reads counts
|
||||||
|
#' If \code{return.metabarcoding.data==F} the motus data.frame contains the motus informations from x
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#'
|
||||||
|
#' data(termes)
|
||||||
|
#' #Create dummy spatial coordinates
|
||||||
|
#' attr(termes, "samples")[c("x", "y")] = expand.grid(1:7,1:3)
|
||||||
|
#'
|
||||||
|
#' #compute frequencies
|
||||||
|
#' attr(termes, "layers")[["reads.freq"]] = normalize(termes, MARGIN=1)$reads
|
||||||
|
#'
|
||||||
|
#' # Getting extrapolations
|
||||||
|
#' termes.pred = extrapol.freq(attr(termes, "layers")[["reads.freq"]], min.coord=c(1,1), max.coord=c(7,3),
|
||||||
|
#' grid.grain=100,termes$samples[,c("x", "y")], termes$motus, cutoff=1e-3)
|
||||||
|
#'
|
||||||
|
#' head(termes.pred$reads)
|
||||||
|
#' @seealso \code{\link{map.extrapol.freq}} as well as \code{sp} and \code{gstat} packages
|
||||||
|
#' @author Lucie Zinger
|
||||||
|
#' @export
|
||||||
|
|
||||||
|
extrapol.freq = function(x, min.coord, max.coord, grid.grain=100, coords, otus.table, cutoff=1e-3, return.metabarcoding.data = FALSE) {
|
||||||
|
require(gstat)
|
||||||
|
require(sp)
|
||||||
|
|
||||||
|
#predicted grid setting
|
||||||
|
new.x = seq(min.coord[1], max.coord[1], length.out = grid.grain)
|
||||||
|
new.y = seq(min.coord[2], max.coord[2], length.out = grid.grain)
|
||||||
|
grid.p=expand.grid(new.x, new.y)
|
||||||
|
colnames(grid.p)=c("x", "y")
|
||||||
|
S=sp::SpatialPoints(grid.p); sp::gridded(S)<-TRUE
|
||||||
|
m=gstat::vgm(50, "Exp", 100)
|
||||||
|
|
||||||
|
#krigging
|
||||||
|
preds = apply(x, 2, function(otu) {
|
||||||
|
otu[otu<cutoff] = cutoff
|
||||||
|
spj=cbind(coords,otu)
|
||||||
|
colnames(spj)=c("x", "y", "otu")
|
||||||
|
spj.g=gstat::gstat(id="Log10.freq", formula=log10(otu)~1,locations=~x+y,data=spj,model=m)
|
||||||
|
gstat::predict.gstat(spj.g, grid.p, quiet=T)$Log10.freq.pred
|
||||||
|
})
|
||||||
|
|
||||||
|
#formatting the output
|
||||||
|
colnames(preds) = rownames(otus.table)
|
||||||
|
rownames(preds) = paste("s", 1:nrow(grid.p), sep=".")
|
||||||
|
row.names(grid.p) = rownames(preds)
|
||||||
|
|
||||||
|
if(return.metabarcoding.data==F) {
|
||||||
|
out = data.frame(preds, grid.p)
|
||||||
|
} else{
|
||||||
|
out = metabarcoding.data(preds, grid.p, otus.table)
|
||||||
|
}
|
||||||
|
return(out)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#' Maps of krigged log10-transformed frequencies
|
||||||
|
#'
|
||||||
|
#' Maps the output of extrapol.freq
|
||||||
|
#'
|
||||||
|
#'
|
||||||
|
#' @param x an extrapol.freq output
|
||||||
|
#' @param path the path of the folder to export the map. Default is \code{NULL} and map is printed in Rplot/quartz
|
||||||
|
#' @param col.names a vector containing the names of the columns to be used for defining the file name. Typically
|
||||||
|
#' the column names containing the taxonomic information and/or sequence/motus id.
|
||||||
|
#' @param index an integer indicating column number of the motu/sequence to be plotted.
|
||||||
|
#' @param cutoff lower motu frequency accepted to consider motu abundance as different
|
||||||
|
#' from 0. Should be the same than the one used in extrapol.freq
|
||||||
|
#' @param add.points a 3-column data.frame containing factor levels and associated x and y coordinates
|
||||||
|
#' to be added to the map. Typically taxa observed in the field.
|
||||||
|
#' @param adj a value used for adjusting text position in the map. Default is \code{4}
|
||||||
|
#'
|
||||||
|
#' @return a map/png file displaying motus distribution.
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#'
|
||||||
|
#' data(termes)
|
||||||
|
#' attr(termes, "samples")[c("x", "y")] = expand.grid(1:7,1:3)
|
||||||
|
#'
|
||||||
|
#' #compute frequencies
|
||||||
|
#' attr(termes, "layers")[["reads.freq"]] = normalize(termes, MARGIN=1)$reads
|
||||||
|
#'
|
||||||
|
#' # Getting extrapolations
|
||||||
|
#' termes.pred = extrapol.freq(attr(termes, "layers")[["reads.freq"]],
|
||||||
|
#' grid.grain=100,termes$samples[,c("x", "y")], termes$motus, cutoff=1e-3)
|
||||||
|
#'
|
||||||
|
#' #mapping the distribution of the 3 most abundant sequences (caution, mfrow does not work for lattice's levelplot)
|
||||||
|
#' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 1, cutoff=1e-3)
|
||||||
|
#' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 2, cutoff=1e-3)
|
||||||
|
#' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 3, cutoff=1e-3)
|
||||||
|
#'
|
||||||
|
#' #dummy observationnal data
|
||||||
|
#' termes.obs = data.frame(x=c(2,3,5), y=c(2.7,2,2.6), taxa = rep("Isoptera Apicotermitinae", 3))
|
||||||
|
#' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 3, cutoff=1e-3, add.points=termes.obs)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{extrapol.freq}}, and \code{levelplot} from \code{lattice} package
|
||||||
|
#' @author Lucie Zinger
|
||||||
|
#' @export
|
||||||
|
|
||||||
|
map.extrapol.freq = function(x, path=NULL, col.name=NULL, index, cutoff=1e-3, add.points=NULL, adj=4) {
|
||||||
|
|
||||||
|
require(lattice)
|
||||||
|
|
||||||
|
if(!is.null(path)) {
|
||||||
|
x.motus = apply(x$motus,2,as.character)
|
||||||
|
name = gsub("\\.", "_", paste(gsub(", ", "_", toString(x.motus[index,col.name])), x.motus[index,"id"], sep="_"))
|
||||||
|
file.out = paste(path, "/", name, ".png", sep="")
|
||||||
|
}
|
||||||
|
|
||||||
|
z=x$reads[,index]
|
||||||
|
z[abs(z)>abs(log10(cutoff))]=log10(cutoff)
|
||||||
|
z[z>0] = 0
|
||||||
|
spj=as.data.frame(cbind(x$samples,z))
|
||||||
|
colnames(spj)=c("x", "y", "z")
|
||||||
|
|
||||||
|
map.out=levelplot(z~x+y, spj, col.regions=topo.colors(100),
|
||||||
|
at=seq(log10(cutoff),log10(1), by=0.2),
|
||||||
|
colorkey=list(at=seq(log10(cutoff),log10(1), by=0.2),
|
||||||
|
labels=list(at=seq(log10(cutoff),log10(1), by=0.2),
|
||||||
|
labels=round(10^seq(log10(cutoff),log10(1), by=0.2),3))),
|
||||||
|
aspect = "iso", contour=F, main=list(label=x$motus[index, "id"], cex=0.7))
|
||||||
|
|
||||||
|
if(!is.null(path)) {
|
||||||
|
png(file=file.out, width=800, height=800)
|
||||||
|
print(map.out)
|
||||||
|
if(!is.null(add.points)) {
|
||||||
|
n = (max(spj[,"y"])-min(spj["y"]))/length(unique(spj[,"y"]))*adj
|
||||||
|
trellis.focus("panel", 1, 1, highlight=FALSE)
|
||||||
|
lpoints(add.points[,"x"], add.points[,"y"], cex=0.7, lwd=3, col="red")
|
||||||
|
ltext(add.points[,"x"], add.points[,"y"]+n, add.points[,-match(c("x", "y"), colnames(add.points))], col="red", cex=1.5)
|
||||||
|
trellis.unfocus()
|
||||||
|
}
|
||||||
|
dev.off()
|
||||||
|
|
||||||
|
} else {
|
||||||
|
print(map.out)
|
||||||
|
if(!is.null(add.points)) {
|
||||||
|
n = (max(spj[,"y"])-min(spj["y"]))/length(unique(spj[,"y"]))*adj
|
||||||
|
trellis.focus("panel", 1, 1, highlight=FALSE)
|
||||||
|
lpoints(add.points[,"x"], add.points[,"y"], cex=0.7, lwd=3, col="red")
|
||||||
|
ltext(add.points[,"x"], add.points[,"y"]+n, add.points[,-match(c("x", "y"), colnames(add.points))], col="red", cex=1)
|
||||||
|
trellis.unfocus()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
206
ROBITools/R/experimental.section.R
Normal file
206
ROBITools/R/experimental.section.R
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#11.03.2011
|
||||||
|
#L.Zinger
|
||||||
|
|
||||||
|
#######################
|
||||||
|
#function anosim.pw
|
||||||
|
#######################
|
||||||
|
#computes pairwise anosim computation
|
||||||
|
#input:
|
||||||
|
#dat: dissimilarity matrix
|
||||||
|
#g: factor defining the grouping to test
|
||||||
|
#permutations: nb of permutation to access anosim statistics
|
||||||
|
#p.adjust.method: method of correction for multiple-testing
|
||||||
|
#
|
||||||
|
#output: a distance-like table containing:
|
||||||
|
#in the upper triangle: the anosims R values
|
||||||
|
#in the lower triangle: the adjusted p-values
|
||||||
|
|
||||||
|
|
||||||
|
### start
|
||||||
|
|
||||||
|
anosim.pw<-function(dat, g, permutations, p.adjust.method, ...) {
|
||||||
|
require(vegan)
|
||||||
|
#data.trasformation
|
||||||
|
dat<-as.matrix(dat)
|
||||||
|
g<-factor(g)
|
||||||
|
|
||||||
|
#empty object for result storage
|
||||||
|
ano<-matrix(NA, nrow=nlevels(g), ncol=nlevels(g), dimnames=list(levels(g),levels(g)))
|
||||||
|
p.val.tmp<-NULL
|
||||||
|
#running anosims
|
||||||
|
for(i in 1:(nlevels(g)-1)) for(j in (i+1):nlevels(g)){
|
||||||
|
tmp<-anosim(as.dist(dat[c(which(g==levels(g)[i]),which(g==levels(g)[j])),
|
||||||
|
c(which(g==levels(g)[i]),which(g==levels(g)[j]))]),
|
||||||
|
c(rep(levels(g)[i], length(which(g==levels(g)[i]))),
|
||||||
|
rep(levels(g)[j], length(which(g==levels(g)[j])))), permutations)
|
||||||
|
ano[i,j]<-tmp$statistic
|
||||||
|
p.val.tmp<-append(p.val.tmp, tmp$signif)
|
||||||
|
}
|
||||||
|
|
||||||
|
#p value correction for multiple comparison
|
||||||
|
p.val.tmp<-p.adjust(p.val.tmp, p.adjust.method )
|
||||||
|
|
||||||
|
#put the corrected p values in the anosim table
|
||||||
|
tmp<-NULL
|
||||||
|
tmp2<-NULL
|
||||||
|
for(i in 1:(nlevels(g)-1)) for(j in (i+1):nlevels(g)){
|
||||||
|
tmp<-append(tmp,i)
|
||||||
|
tmp2<-append(tmp2,j)
|
||||||
|
}
|
||||||
|
for(i in 1:length(p.val.tmp)){
|
||||||
|
ano[tmp2[i],tmp[i]]<-p.val.tmp[i]}
|
||||||
|
|
||||||
|
return(ano)
|
||||||
|
}
|
||||||
|
|
||||||
|
### end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#23 Nov 2012
|
||||||
|
#L.Zinger
|
||||||
|
###################
|
||||||
|
#function MOTUtable
|
||||||
|
###################
|
||||||
|
# Generates ready-to-use MOTU tables and basic statistics on samples (i.e. sequencing depth, raw richness, and invsimpson index)
|
||||||
|
#input:
|
||||||
|
#x: an obitable output (samples should be indicated as e.g. "sample.A01r" in column names)
|
||||||
|
#y: the column name by which that data are to be aggregated. Should be e.g. "cluster" or "species_name"
|
||||||
|
#outputs:
|
||||||
|
#x.otu: the ready-to-use MOTU table
|
||||||
|
#x.rawstats: basic statistics on samples
|
||||||
|
|
||||||
|
### start
|
||||||
|
|
||||||
|
MOTUtable<-function(x, y) {
|
||||||
|
|
||||||
|
require(vegan)
|
||||||
|
nom<-as.character(substitute(x))
|
||||||
|
|
||||||
|
tmp<-x[,c(grep(y, colnames(x)), grep("sample", colnames(x)))]
|
||||||
|
tmp2<-t(aggregate(tmp[,-1], by=list(tmp[,1]), sum))
|
||||||
|
x.otu<-tmp2[-1,]
|
||||||
|
colnames(x.otu)<-paste(y,tmp2[1,], sep=".")
|
||||||
|
|
||||||
|
x.rawstats<-data.frame(Nb_ind=rowSums(x.otu), Raw_richness=specnumber(x.otu, MARGIN=1), Raw_eveness=diversity(x.otu, "invsimpson", MARGIN=1) )
|
||||||
|
#may have a pb in the rowSums depending on the R version (allows or not non-numeric)
|
||||||
|
|
||||||
|
assign(paste(nom, y, sep="."),x.otu,env = .GlobalEnv)
|
||||||
|
assign(paste(nom, y, "rawstats", sep="."),x.rawstats,env = .GlobalEnv)
|
||||||
|
}
|
||||||
|
|
||||||
|
### end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#26 Nov 2012
|
||||||
|
#F.Boyer
|
||||||
|
###################
|
||||||
|
#function reads.frequency & filter.threshold
|
||||||
|
###################
|
||||||
|
#can be used to filter the table of reads to have the sequences that represents at least 95% of the total reads by sample
|
||||||
|
#
|
||||||
|
#e.g. reads.treshold(reads.frequency(metabarcodingS4Obj@reads), 0.95)
|
||||||
|
|
||||||
|
|
||||||
|
filter.threshold <- function(v, threshold) {
|
||||||
|
o <- order(v, decreasing=T)
|
||||||
|
ind <- which(cumsum(as.matrix(v[o]))>threshold)
|
||||||
|
v[-o[seq(min(length(o), 1+length(o)-length(ind)))]] <- 0
|
||||||
|
v
|
||||||
|
}
|
||||||
|
|
||||||
|
reads.threshold <- function (reads, threshold, by.sample=T) {
|
||||||
|
res <- apply(reads, MARGIN=ifelse(by.sample, 1, 2), filter.threshold, thr=threshold)
|
||||||
|
if (by.sample) res <- t(res)
|
||||||
|
data.frame(res)
|
||||||
|
}
|
||||||
|
|
||||||
|
reads.frequency <- function (reads, by.sample=T) {
|
||||||
|
res <- apply(reads, MARGIN=ifelse(by.sample, 1, 2), function(v) {v/sum(v)})
|
||||||
|
if (by.sample) res <- t(res)
|
||||||
|
data.frame(res)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#06 Jan 2013
|
||||||
|
#F.Boyer
|
||||||
|
###################
|
||||||
|
#function removeOutliers
|
||||||
|
###################
|
||||||
|
#given a contengency table and a distance matrix
|
||||||
|
#returns the list of samples that should be removed in order to have only
|
||||||
|
#distances below thresold
|
||||||
|
#can't return only one sample
|
||||||
|
#
|
||||||
|
#e.g. intraBad <- lapply(levels(sample.desc$sampleName), function(group) {samples<-rownames(sample.desc)[sample.desc$sampleName==group]; removeOutliers(contingencyTable[samples,], thr=0.3, distFun = function(x) vegdist(x, method='bray'))})
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#require(vegan)
|
||||||
|
removeOutliers <- function(m, thr=0.3, distFun = function(x) vegdist(x, method='bray') ) {
|
||||||
|
distMat <- as.matrix(distFun(m))
|
||||||
|
maxM <- max(distMat)
|
||||||
|
theBadGuys =c()
|
||||||
|
|
||||||
|
while (maxM>thr) {
|
||||||
|
bad <- apply(distMat, MARGIN=1, function(row, maxM) {any(row==maxM)}, maxM=maxM)
|
||||||
|
bad <- names(bad)[bad]
|
||||||
|
bad <- apply(distMat[bad,], MARGIN=1, mean)
|
||||||
|
badGuy <- names(bad)[bad==max(bad), drop=F][1]
|
||||||
|
|
||||||
|
theBadGuys <- c(theBadGuys, badGuy)
|
||||||
|
|
||||||
|
stillok <- rownames(distMat) != badGuy
|
||||||
|
distMat <- distMat[stillok, stillok, drop=F]
|
||||||
|
maxM <- max(distMat)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (length(theBadGuys) >= (nrow(m)-1)) {
|
||||||
|
theBadGuys <- rownames(m)
|
||||||
|
}
|
||||||
|
theBadGuys
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#31.05.2013
|
||||||
|
#L.Zinger
|
||||||
|
#getAttrPerS, a function allowing to get the values of a sequence attribute per sample
|
||||||
|
#(e.g. best_identities, etc...) the output is a list with one dataframe per sample.
|
||||||
|
#This dataframe contains:
|
||||||
|
# first column (named as attr): the attribute value for each sequence present in the sample
|
||||||
|
# second column (named weight): the corresponding number of reads in the sample
|
||||||
|
|
||||||
|
getAttrPerS=function(x,attr){
|
||||||
|
#x: a metabarcoding object
|
||||||
|
#attr: a character object corresponding to the attribute
|
||||||
|
#for which values per sample are needed (should be equal to a colname in x@motus)
|
||||||
|
|
||||||
|
if(class(x)[1]!= "metabarcoding.data") {
|
||||||
|
stop("x is not a metabarcoding S4 object")
|
||||||
|
}
|
||||||
|
|
||||||
|
if(is.character(attr)==F) {
|
||||||
|
stop("attr is not a character object")
|
||||||
|
}
|
||||||
|
|
||||||
|
x.motus = motus(x)
|
||||||
|
x.reads = reads(x)
|
||||||
|
|
||||||
|
otu = apply(x.reads, 1, function(y) x.motus[match(names(y[which(y!=0)]),x.motus$id), grep(attr, colnames(x.motus))])
|
||||||
|
reads = apply(x.reads, 1, function(y) y[which(y!=0)])
|
||||||
|
|
||||||
|
output = mapply(cbind, otu, reads)
|
||||||
|
output = lapply(output, function(y) {
|
||||||
|
colnames(y)=c(attr,"weight")
|
||||||
|
return(y)
|
||||||
|
})
|
||||||
|
return(output)
|
||||||
|
}
|
||||||
|
### end getAttrPerS
|
||||||
|
|
62
ROBITools/R/export-metabarcoding.R
Normal file
62
ROBITools/R/export-metabarcoding.R
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: Add comment
|
||||||
|
#
|
||||||
|
# Author: coissac
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
require(utils)
|
||||||
|
|
||||||
|
expand.metabarcoding.data=function(data,minread=1) {
|
||||||
|
resultonesample=function(sample) {
|
||||||
|
mo= data@reads[sample,] >= minread
|
||||||
|
s = data@samples[rep(sample,sum(mo)),]
|
||||||
|
r = as.numeric(data@reads[sample,mo])
|
||||||
|
m = data@motus[mo,]
|
||||||
|
|
||||||
|
result = data.frame(s,frequency=r,m,
|
||||||
|
stringsAsFactors =FALSE,
|
||||||
|
row.names = NULL)
|
||||||
|
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
res = lapply(1:data@scount, resultonesample)
|
||||||
|
|
||||||
|
do.call(rbind,res)
|
||||||
|
}
|
||||||
|
|
||||||
|
#setGeneric("utils::write.csv")
|
||||||
|
write.csv.metabarcoding.data = function(...) {
|
||||||
|
Call <- match.call(expand.dots = TRUE)
|
||||||
|
if (!is.null(Call[["minread"]])) {
|
||||||
|
minread = Call[["minread"]]
|
||||||
|
Call = Call[!names(Call)=="minread"]
|
||||||
|
}
|
||||||
|
else
|
||||||
|
minread = 1
|
||||||
|
data = eval.parent(Call[[2L]])
|
||||||
|
data = expand.metabarcoding.data(data,minread)
|
||||||
|
Call[[1L]] <- as.name("write.csv")
|
||||||
|
Call[[2L]] <- as.name("data")
|
||||||
|
eval(Call)
|
||||||
|
}
|
||||||
|
|
||||||
|
#setGeneric("utils::write.csv2")
|
||||||
|
write.csv2.metabarcoding.data = function(...) {
|
||||||
|
Call <- match.call(expand.dots = TRUE)
|
||||||
|
if (!is.null(Call[["minread"]])) {
|
||||||
|
minread = Call[["minread"]]
|
||||||
|
Call = Call[!names(Call)=="minread"]
|
||||||
|
}
|
||||||
|
else
|
||||||
|
minread = 1
|
||||||
|
data = eval.parent(Call[[2L]])
|
||||||
|
data = expand.metabarcoding.data(data,minread)
|
||||||
|
Call[[1L]] <- as.name("write.csv2")
|
||||||
|
Call[[2L]] <- as.name("data")
|
||||||
|
eval(Call)
|
||||||
|
|
||||||
|
}
|
106
ROBITools/R/import.metabarcoding.R
Normal file
106
ROBITools/R/import.metabarcoding.R
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
#' @include read.obitab.R
|
||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#' Read a data file produced by the \code{obitab} command
|
||||||
|
#'
|
||||||
|
#' Read a data file issued from the conversion of a \strong{fasta}
|
||||||
|
#' file to a tabular file by the \code{obitab} command of the
|
||||||
|
#' \strong{OBITools} package
|
||||||
|
#'
|
||||||
|
#' @param file a string containing the file name of the obitab file.
|
||||||
|
#' @param sep Column separator in the obitab file.
|
||||||
|
#' The default separator is the tabulation.
|
||||||
|
#' @param sample A regular expression allowing to identify columns
|
||||||
|
#' from the file describing abundances of sequences per sample
|
||||||
|
#' @param sample.sep Separator between combined sample name.
|
||||||
|
#' @param attribute Separator used to split between sample 'tag' and sample name.
|
||||||
|
#'
|
||||||
|
#' @return a \code{\link{metabarcoding.data}} instance
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' require(ROBITools)
|
||||||
|
#'
|
||||||
|
#' \dontshow{# switch the working directory to the data package directory}
|
||||||
|
#' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
|
||||||
|
#'
|
||||||
|
#' # read the termes.tab file
|
||||||
|
#' termes=import.metabarcoding.data('termes.tab')
|
||||||
|
#'
|
||||||
|
#' # print the number of samples and motus described in the file
|
||||||
|
#' dim(termes)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{metabarcoding.data}}
|
||||||
|
#'
|
||||||
|
#' @author Eric Coissac
|
||||||
|
#' @keywords DNA metabarcoding
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
import.metabarcoding.data = function(file,sep='\t',sample="sample",sample.sep="\\.",attribute=":") {
|
||||||
|
|
||||||
|
data=read.obitab(file,sep=sep)
|
||||||
|
|
||||||
|
# get the colnames matching the sample pattern
|
||||||
|
|
||||||
|
column=colnames(data)
|
||||||
|
pat = paste('(^|',sample.sep,')',sample,'[',sample.sep,attribute,']',sep='')
|
||||||
|
scol= grep(pat,column)
|
||||||
|
|
||||||
|
# reads informations about samples
|
||||||
|
|
||||||
|
reads = data[,scol]
|
||||||
|
names = colnames(reads)
|
||||||
|
names = strsplit(names,split=attribute)
|
||||||
|
|
||||||
|
# for sample name just remove the first part of the col names
|
||||||
|
# usally "sample:"
|
||||||
|
|
||||||
|
sample.names = sapply(names,function(a) paste(a[-1],collapse=attribute))
|
||||||
|
|
||||||
|
reads=t(reads)
|
||||||
|
rownames(reads)=sample.names
|
||||||
|
|
||||||
|
# sample's data
|
||||||
|
|
||||||
|
sample.data = data.frame(t(data.frame(strsplit(sample.names,split=attribute))))
|
||||||
|
rownames(sample.data)=sample.names
|
||||||
|
colnames(sample.data)=strsplit(names[[1]][1],split=attribute)
|
||||||
|
|
||||||
|
|
||||||
|
# motus information
|
||||||
|
|
||||||
|
motus = data[,-scol]
|
||||||
|
|
||||||
|
motus.id = motus$id
|
||||||
|
|
||||||
|
rownames(motus)=motus.id
|
||||||
|
colnames(reads)=motus.id
|
||||||
|
|
||||||
|
|
||||||
|
return(metabarcoding.data(reads,sample.data,motus))
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#pcr = gh[,grep('^sample',colnames(gh))]
|
||||||
|
#pcr.names = colnames(pcr)
|
||||||
|
#pcr.names = sub('sample\\.','',pcr.names)
|
||||||
|
#sequencer = rep('Solexa',length(pcr.names))
|
||||||
|
#sequencer[grep('454',pcr.names)]='454'
|
||||||
|
#sequencer=factor(sequencer)
|
||||||
|
#
|
||||||
|
#tmp = strsplit(pcr.names,'\\.[A-Z](sol|454)\\.')
|
||||||
|
#
|
||||||
|
#sample = sapply(tmp,function(x) x[1])
|
||||||
|
#locality = factor(sapply(strsplit(sample,'_'),function(x) x[1]))
|
||||||
|
#sample = factor(sample)
|
||||||
|
#repeats= factor(sapply(tmp,function(x) x[2]))
|
||||||
|
#
|
||||||
|
#tmp = regexpr('[A-Z](454|sol)',pcr.names)
|
||||||
|
#run=factor(substr(pcr.names,tmp,tmp+attr(tmp,"match.length")-1))
|
||||||
|
#
|
||||||
|
#pcr.metadata = data.frame(run,sequencer,locality,sample,repeats)
|
||||||
|
#
|
||||||
|
#rownames(pcr.metadata)=pcr.names
|
||||||
|
|
||||||
|
|
79
ROBITools/R/import.ngsfilter.R
Normal file
79
ROBITools/R/import.ngsfilter.R
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#' Read ngsfilter text file
|
||||||
|
#'
|
||||||
|
#' Reads the text file used for assigning reads to samples with the
|
||||||
|
#' \code{ngsfilter} command of the \strong{OBITools} package.
|
||||||
|
#'
|
||||||
|
#' @param file a string containing the file name for the \code{ngsfilter} command.
|
||||||
|
#' @param platewell a string corresponding to the tag used for storing the sample location
|
||||||
|
#' in the PCR plate. Should be of the form "nbPlate_Well" (e.g. "01_A02").
|
||||||
|
#' Default is \code{NULL}
|
||||||
|
#' @return \code{\link{import.ngsfilter.data}} returns a \code{\link{data.frame}} instance
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' \dontshow{# switch the working directory to the data package directory}
|
||||||
|
#' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
|
||||||
|
#'
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' # reading the termes_ngsfilt.txt file
|
||||||
|
#' termes.ngs=import.ngsfilter.data('termes_ngsfilt.txt', platewell="position")
|
||||||
|
#'
|
||||||
|
#' # including ngsfilter data into termes data
|
||||||
|
#' attr(termes, "samples") = termes.ngs[rownames(termes),]
|
||||||
|
#'
|
||||||
|
#' colnames(termes$samples)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{import.metabarcoding.data}} and \code{\link{read.obitab}} for other methods of data importation
|
||||||
|
#'
|
||||||
|
#' @author Lucie Zinger
|
||||||
|
#' @keywords DNA metabarcoding
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
import.ngsfilter.data = function(file, platewell=NULL) {
|
||||||
|
raw = read.table(file, sep="\t")
|
||||||
|
|
||||||
|
#get samples names
|
||||||
|
names = raw[,2]
|
||||||
|
|
||||||
|
#form first part of the output table (default ngsfilter text input)
|
||||||
|
out = raw[,-c(2,3,ncol(raw))]
|
||||||
|
colnames(out) = c("Experiment", "primerF", "primerR")
|
||||||
|
|
||||||
|
#add tags
|
||||||
|
out[,c("tagF", "tagR")] = do.call("rbind", strsplit(as.vector(raw[,3]), "\\:"))
|
||||||
|
|
||||||
|
#collect nb and names of additionnal information
|
||||||
|
max.add = max(unlist(lapply(strsplit(gsub("^F @ ","", raw[, ncol(raw)]), "; "), length)))
|
||||||
|
names.add = unique(unlist(lapply(strsplit(unlist(strsplit(gsub("^F @ ","", raw[, ncol(raw)]), "; ")), "="), "[[",1)))
|
||||||
|
|
||||||
|
#form table of additionnal info
|
||||||
|
form = lapply(strsplit(gsub("^F @ ","", raw[, ncol(raw)]), "; "), strsplit, "=")
|
||||||
|
additionnals = as.data.frame(do.call("rbind", lapply(form, function(y) {
|
||||||
|
val = rep(NA, , max.add)
|
||||||
|
names(val) = names.add
|
||||||
|
val[match(unlist(lapply(y, "[[", 1)), names(val))] = gsub(";", "",unlist(lapply(y, "[[", 2)))
|
||||||
|
val
|
||||||
|
})))
|
||||||
|
|
||||||
|
#create PCR plate coordinates
|
||||||
|
if(!is.null(platewell)) {
|
||||||
|
form = strsplit(as.vector(additionnals[, platewell]), "_")
|
||||||
|
nbPlate = as.numeric(gsub("^0", "", unlist(lapply(form, "[[", 1))))
|
||||||
|
wellPlate = unlist(lapply(form, "[[", 2))
|
||||||
|
xPlate = as.numeric(gsub("[A-Z]", "", wellPlate))
|
||||||
|
yPlate = as.numeric(as.factor(gsub("[0-9]*", "", wellPlate))) + 8*nbPlate
|
||||||
|
|
||||||
|
additionnals = additionnals[,-grep(platewell, colnames(additionnals))]
|
||||||
|
out = data.frame(out, additionnals, nbPlate, wellPlate, xPlate, yPlate)
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
additionnals[,ncol(additionnals)] = gsub(";","", additionnals[,ncol(additionnals)])
|
||||||
|
out = data.frame(out, additionnals)
|
||||||
|
}
|
||||||
|
|
||||||
|
rownames(out) = names
|
||||||
|
return(out)
|
||||||
|
}
|
119
ROBITools/R/layers.metabarcoding.R
Normal file
119
ROBITools/R/layers.metabarcoding.R
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Managment of layers
|
||||||
|
#
|
||||||
|
# Layers a matrix or factors with the same dimension
|
||||||
|
# than the read matrix
|
||||||
|
#
|
||||||
|
|
||||||
|
# get motus data.frames
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("layer.names", function(obj) {
|
||||||
|
return(standardGeneric("layer.names"))
|
||||||
|
})
|
||||||
|
|
||||||
|
#' Returns the names of all the layers
|
||||||
|
#'
|
||||||
|
#' \code{layer.names} extracts the list of all the layer
|
||||||
|
#' names attached to a \code{\link{metabarcoding.data}} instance.
|
||||||
|
#'
|
||||||
|
#' @param obj a \code{\link{metabarcoding.data}} instance
|
||||||
|
#' @return a vector of type \code{character} containing the
|
||||||
|
#' list of all the layer names.
|
||||||
|
#'
|
||||||
|
#' @docType methods
|
||||||
|
#' @rdname layer.names-methods
|
||||||
|
#' @aliases layer.names-methods,metabarcoding.data
|
||||||
|
#'
|
||||||
|
setMethod("layer.names", "metabarcoding.data", function(obj) {
|
||||||
|
return(names(obj@layers))
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
#' Returns the a layer associated to a \code{\link{metabarcoding.data}}
|
||||||
|
#'
|
||||||
|
#' [[ operator Extracts a layer
|
||||||
|
#' attached to a \code{\link{metabarcoding.data}} instance.
|
||||||
|
#'
|
||||||
|
#' @usage \method{[[}{unmutable}(x,i)
|
||||||
|
#'
|
||||||
|
#' @param x a \code{\link{metabarcoding.data}} instance
|
||||||
|
#' @return matrix or a factor.
|
||||||
|
#'
|
||||||
|
#' @docType methods
|
||||||
|
#' @rdname double-open-brace-methods
|
||||||
|
#' @aliases double-open-brace-methods,metabarcoding.data
|
||||||
|
#' @method [[
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
setMethod("[[", "metabarcoding.data",
|
||||||
|
function(x, i, j, ...) {
|
||||||
|
|
||||||
|
if (! is.character(i))
|
||||||
|
stop('Just named index must be used')
|
||||||
|
|
||||||
|
if (i=="reads")
|
||||||
|
return(x@reads)
|
||||||
|
|
||||||
|
if (i=="samples")
|
||||||
|
return(x@samples)
|
||||||
|
|
||||||
|
if (i=="motus")
|
||||||
|
return(x@motus)
|
||||||
|
|
||||||
|
if (i=="reads")
|
||||||
|
return(x@reads)
|
||||||
|
|
||||||
|
return(x@layers[[i,exact=TRUE]])
|
||||||
|
})
|
||||||
|
|
||||||
|
#' @method $
|
||||||
|
#' @export
|
||||||
|
setMethod("$", "metabarcoding.data",
|
||||||
|
function(x, name) {
|
||||||
|
return(x[[name]])
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
# set one data layer data.frames
|
||||||
|
|
||||||
|
#' @method [[<-
|
||||||
|
#' @export
|
||||||
|
setMethod("[[<-","metabarcoding.data",
|
||||||
|
function(x, i, j, ...,value) {
|
||||||
|
|
||||||
|
if (any(dim(value)!=c(x@scount,x@mcount)))
|
||||||
|
stop("data dimmension are not coherent with this metabarcoding.data")
|
||||||
|
|
||||||
|
if (hasArg('j'))
|
||||||
|
stop('Just one dimension must be specified')
|
||||||
|
|
||||||
|
if (! is.character(i))
|
||||||
|
stop('Just named index must be used')
|
||||||
|
|
||||||
|
if (i=='reads')
|
||||||
|
stop('you cannot change the reads layer by this way')
|
||||||
|
|
||||||
|
if (i=='motus' | i=='samples')
|
||||||
|
stop('layers cannot be names motus or samples')
|
||||||
|
|
||||||
|
value = as.factor.or.matrix(value)
|
||||||
|
rownames(value)=rownames(x@reads)
|
||||||
|
colnames(value)=colnames(x@reads)
|
||||||
|
x@layers[[i]]=value
|
||||||
|
|
||||||
|
return(x)
|
||||||
|
})
|
||||||
|
|
||||||
|
#' @method $<-
|
||||||
|
#' @export
|
||||||
|
setMethod("$<-","metabarcoding.data",
|
||||||
|
function(x, name, value) {
|
||||||
|
|
||||||
|
x[[name]]=value
|
||||||
|
return(x)
|
||||||
|
})
|
378
ROBITools/R/metabarcoding_threshold.R
Normal file
378
ROBITools/R/metabarcoding_threshold.R
Normal file
@@ -0,0 +1,378 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("marginalsum", function(data,MARGIN="sample", na.rm = FALSE) {
|
||||||
|
return(standardGeneric("marginalsum"))
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
#' Computes marginal sums over read counts.
|
||||||
|
#'
|
||||||
|
#' Method \code{marginalsum} computes marginal sums over read counts of
|
||||||
|
#' a \code{\link{metabarcoding.data}} instance.
|
||||||
|
#'
|
||||||
|
#' @param data The \code{\linkS4class{metabarcoding.data}} instance
|
||||||
|
#' on which marginal sums have to be computed.
|
||||||
|
#' @param MARGIN Indicates if the sums have to be computed across
|
||||||
|
#' samples or motus.
|
||||||
|
#' Allowed values are :
|
||||||
|
#' \itemize{
|
||||||
|
#' \item{'sample' or 1} for computing sum across samples
|
||||||
|
#' \item{'motu' or 2} for computing sum across motus
|
||||||
|
#' }
|
||||||
|
#' @param na.rm Logical. Should missing values be omitted from the
|
||||||
|
#' calculations?
|
||||||
|
#'
|
||||||
|
#' @return Returns the vector of marginal sums as a \code{numeric} vector
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' # load termite data set from the ROBITools sample data
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' # Computes marginal sums per sample
|
||||||
|
#' ssum = marginalsum(termes,MARGIN="sample")
|
||||||
|
#'
|
||||||
|
#' # Computes marginal sums per MOTU
|
||||||
|
#' msum = marginalsum(termes,MARGIN="motu")
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\linkS4class{metabarcoding.data}}
|
||||||
|
#'
|
||||||
|
#' @docType methods
|
||||||
|
#' @rdname marginalsum-methods
|
||||||
|
#' @aliases marginalsum-methods,metabarcoding.data
|
||||||
|
#' @author Aurelie Bonin
|
||||||
|
#'
|
||||||
|
setMethod("marginalsum", "metabarcoding.data", function(data,MARGIN='sample', na.rm = FALSE) {
|
||||||
|
|
||||||
|
if (MARGIN == 'sample')
|
||||||
|
MARGIN=1
|
||||||
|
|
||||||
|
if (MARGIN == 'motu')
|
||||||
|
MARGIN=2
|
||||||
|
|
||||||
|
readcount = reads(data)
|
||||||
|
if (MARGIN==1)
|
||||||
|
margesum = rowSums(readcount,na.rm=na.rm)
|
||||||
|
else
|
||||||
|
margesum = colSums(readcount,na.rm=na.rm)
|
||||||
|
|
||||||
|
|
||||||
|
return(margesum)
|
||||||
|
})
|
||||||
|
|
||||||
|
rowSums.metabarcoding.data = function (x, na.rm = FALSE, dims = 1L) {
|
||||||
|
print("coucou")
|
||||||
|
}
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("normalize", function(data,MARGIN='sample',as.matrix=FALSE) {
|
||||||
|
return(standardGeneric("normalize"))
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
#' Normalizes read counts by sample or by MOTU.
|
||||||
|
#'
|
||||||
|
#' Method \code{normalize} computes a normalized read aboundancy matrix
|
||||||
|
#' (relative frequency matrix) of a \code{\link{metabarcoding.data}} instance.
|
||||||
|
#' Normalization can be done according aboundancies per sample or per MOTU.
|
||||||
|
#'
|
||||||
|
#' @param data The \code{\linkS4class{metabarcoding.data}} instance
|
||||||
|
#' on normalisation have to be computed.
|
||||||
|
#' @param MARGIN Indicates if the sums have to be computed across
|
||||||
|
#' samples or motus.
|
||||||
|
#' Allowed values are :
|
||||||
|
#' \itemize{
|
||||||
|
#' \item{'sample' or 1} for computing sum across samples
|
||||||
|
#' \item{'motu' or 2} for computing sum across motus
|
||||||
|
#' }
|
||||||
|
#' @param as.matrix Logical indicating if the normalized aboundancies
|
||||||
|
#' must be returned as a simple \code{matrix} (TRUE) or as a new
|
||||||
|
#' instance of the \code{\linkS4class{metabarcoding.data}} class
|
||||||
|
#' (FALSE, the default case).
|
||||||
|
#'
|
||||||
|
#' @return Returns a new instance of \code{\linkS4class{metabarcoding.data}}
|
||||||
|
#' or a \code{numeric} matrix according to the \code{return.as.matrix}
|
||||||
|
#' parameter.
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' # load termite data set from the ROBITools sample data
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' # Computes normalized aboundancies per sample
|
||||||
|
#' termes.norm = normalize(termes,MARGIN="sample")
|
||||||
|
#'
|
||||||
|
#' # Computes normalized aboundancies per sample and
|
||||||
|
#' # stores the result as a new layer into the thermes
|
||||||
|
#' # structure
|
||||||
|
#' termes$normalized = normalize(termes,MARGIN="sample",as.matrix=TRUE)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\linkS4class{metabarcoding.data}}
|
||||||
|
#'
|
||||||
|
#' @docType methods
|
||||||
|
#' @rdname normalize-methods
|
||||||
|
#' @aliases normalize-methods,metabarcoding.data
|
||||||
|
#' @author Aurelie Bonin
|
||||||
|
#'
|
||||||
|
setMethod("normalize", "metabarcoding.data", function(data,MARGIN="sample",as.matrix=FALSE) {
|
||||||
|
|
||||||
|
if (MARGIN == 'sample')
|
||||||
|
MARGIN=1
|
||||||
|
|
||||||
|
if (MARGIN == 'motu')
|
||||||
|
MARGIN=2
|
||||||
|
|
||||||
|
readcount = reads(data)
|
||||||
|
margesum = marginalsum(data,MARGIN,na.rm=TRUE)
|
||||||
|
|
||||||
|
readcount = sweep(readcount,MARGIN,margesum, FUN="/")
|
||||||
|
|
||||||
|
if (as.matrix)
|
||||||
|
newdata=readcount
|
||||||
|
else
|
||||||
|
newdata = copy.metabarcoding.data(data,reads=readcount)
|
||||||
|
|
||||||
|
return(newdata)
|
||||||
|
})
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("threshold", function(data,MARGIN="sample",threshold=0.97) {
|
||||||
|
return(standardGeneric("threshold"))
|
||||||
|
})
|
||||||
|
|
||||||
|
#' Compute the cumulative threshold of read aboundances.
|
||||||
|
#'
|
||||||
|
#' The method \code{threshold} of the class \code{\linkS4class{metabarcoding.data}}
|
||||||
|
#' computes the thresold to be used for conserving just a part of the global
|
||||||
|
#' signal. This thresold is computed by ranking aboundances by decreasing order.
|
||||||
|
#' The cululative sums of these ranked abondencies are computed and the aboundance
|
||||||
|
#' corresponding to the first sum greater than the threshold is returned as result.
|
||||||
|
#'
|
||||||
|
#' @param data The \code{\linkS4class{metabarcoding.data}} instance
|
||||||
|
#' on normalisation have to be computed.
|
||||||
|
#' @param MARGIN Indicates if the sums have to be computed across
|
||||||
|
#' samples or motus.
|
||||||
|
#' Allowed values are :
|
||||||
|
#' \itemize{
|
||||||
|
#' \item{'sample' or 1} for computing sum across samples
|
||||||
|
#' \item{'motu' or 2} for computing sum across motus
|
||||||
|
#' }
|
||||||
|
#' @param threshold a numeric value between 0 and 1 indicating which part of
|
||||||
|
#' the signal must be conserved. Default value is setup to
|
||||||
|
#' 0.97 (97% of the total signal).
|
||||||
|
#'
|
||||||
|
#' @return a numeric vector containing the limit aboundancy to consider for
|
||||||
|
#' each sample or each MOTU according to the value of the \code{MARGIN}
|
||||||
|
#' parameter.
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' # load termite data set from the ROBITools sample data
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' # computes threshold value to used for keep 95% of
|
||||||
|
#' # the reads per MOTU
|
||||||
|
#'
|
||||||
|
#' t = threshold(termes,MARGIN='motu',threshold=0.95)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\linkS4class{metabarcoding.data}}, \code{\link{threshold.mask}}
|
||||||
|
#'
|
||||||
|
#' @docType methods
|
||||||
|
#' @rdname threshold-methods
|
||||||
|
#' @aliases threshold-methods,metabarcoding.data
|
||||||
|
#' @author Aurelie Bonin
|
||||||
|
#'
|
||||||
|
setMethod("threshold", "metabarcoding.data", function(data,MARGIN="sample",threshold=0.97) {
|
||||||
|
|
||||||
|
|
||||||
|
onethreshold=function(x,threshold) {
|
||||||
|
s = x[order(-x)]
|
||||||
|
cs= cumsum(s)
|
||||||
|
total=cs[length(cs)]
|
||||||
|
if (total > 0) {
|
||||||
|
cs= cs / total
|
||||||
|
cs = cs > threshold
|
||||||
|
t = s[cs][1]
|
||||||
|
}
|
||||||
|
else t=0
|
||||||
|
|
||||||
|
return(t)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (MARGIN == 'sample')
|
||||||
|
MARGIN=1
|
||||||
|
|
||||||
|
if (MARGIN == 'motu')
|
||||||
|
MARGIN=2
|
||||||
|
|
||||||
|
readcount = reads(data)
|
||||||
|
|
||||||
|
t = apply(readcount,MARGIN,onethreshold,threshold)
|
||||||
|
|
||||||
|
return(t)
|
||||||
|
})
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("threshold.mask", function(data,MARGIN,threshold=0.97,operator='<') {
|
||||||
|
return(standardGeneric("threshold.mask"))
|
||||||
|
})
|
||||||
|
|
||||||
|
#' Computes a cumulatif thresold mask for filtering read aboundancies.
|
||||||
|
#'
|
||||||
|
#' The method \code{threshold.mask} of the class \code{\linkS4class{metabarcoding.data}}
|
||||||
|
#' computes a logical matrix of the same size than the read matrix of the data parameter.
|
||||||
|
#' Each cell of this matrix contains a \code{TRUE} or a \code{FALSE} value according to the
|
||||||
|
#' relationship existing between the read abondancy and the corresponding theshold as computed
|
||||||
|
#' by the \code{\link{theshold}} method.
|
||||||
|
#'
|
||||||
|
#' (computed value) = (read aboundancy) operator (threshold value)
|
||||||
|
#'
|
||||||
|
#' for a cell in the result matrix, \code{(read aboundancy)} is extracted from the read layer.
|
||||||
|
#' \code{operator} is a comparaison operator and \code{(threshold value)} is estimated with the
|
||||||
|
#' \code{\link{theshold}} method.
|
||||||
|
#'
|
||||||
|
#' @param data The \code{\linkS4class{metabarcoding.data}} instance
|
||||||
|
#' on normalisation have to be computed.
|
||||||
|
#' @param MARGIN Indicates if the sums have to be computed across
|
||||||
|
#' samples or motus.
|
||||||
|
#' Allowed values are :
|
||||||
|
#' \itemize{
|
||||||
|
#' \item{'sample' or 1} for computing sum across samples
|
||||||
|
#' \item{'motu' or 2} for computing sum across motus
|
||||||
|
#' }
|
||||||
|
#' @param threshold a numeric value between 0 and 1 indicating which part of
|
||||||
|
#' the signal must be conserved. Default value is setup to
|
||||||
|
#' 0.97 (97% of the total signal).
|
||||||
|
#' @param operator is a logical comparison operator.
|
||||||
|
#'
|
||||||
|
#' @return A logical matrix usable for selecting cell in the read aboundancy matrix.
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\linkS4class{metabarcoding.data}}, \code{\link{threshold.mask}}, \code{\link{threshold}}
|
||||||
|
#'
|
||||||
|
#' @docType methods
|
||||||
|
#' @rdname threshold-mask-methods
|
||||||
|
#' @aliases threshold.mask-methods,metabarcoding.data
|
||||||
|
#' @author Aurelie Bonin
|
||||||
|
#'
|
||||||
|
setMethod("threshold.mask", "metabarcoding.data", function(data,MARGIN,threshold=0.97,operator='<') {
|
||||||
|
|
||||||
|
|
||||||
|
if (MARGIN == 'sample')
|
||||||
|
MARGIN=1
|
||||||
|
|
||||||
|
if (MARGIN == 'motu')
|
||||||
|
MARGIN=2
|
||||||
|
|
||||||
|
readcount = reads(data)
|
||||||
|
|
||||||
|
t = threshold(data,MARGIN,threshold)
|
||||||
|
mask = apply(readcount,c(2,1)[MARGIN],operator,t)
|
||||||
|
|
||||||
|
if (MARGIN==2)
|
||||||
|
mask = t(mask)
|
||||||
|
|
||||||
|
return(mask)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("const.threshold.mask", function(data,MARGIN,threshold=0.01,operator='<') {
|
||||||
|
return(standardGeneric("const.threshold.mask"))
|
||||||
|
})
|
||||||
|
|
||||||
|
#' Computes a constant thresold mask for filtering read aboundancies.
|
||||||
|
#'
|
||||||
|
#' The method \code{const.threshold.mask} of the class \code{\linkS4class{metabarcoding.data}}
|
||||||
|
#' computes a logical matrix of the same size than the read matrix of the data parameter.
|
||||||
|
#' Each cell of this matrix contains a \code{TRUE} or a \code{FALSE} value according to the
|
||||||
|
#' relationship existing between the read abondancy and the global theshold.
|
||||||
|
#'
|
||||||
|
#' (computed value) = (normalized read aboundancy) operator (threshold value)
|
||||||
|
#'
|
||||||
|
#' for a cell in the result matrix, \code{(normalized read aboundancy)} is extracted from the read layer
|
||||||
|
#' after normalization.
|
||||||
|
#' \code{operator} is a comparaison operator and \code{(threshold value)} is estimated with the
|
||||||
|
#' \code{\link{theshold}} method.
|
||||||
|
#'
|
||||||
|
#' @param data The \code{\linkS4class{metabarcoding.data}} instance
|
||||||
|
#' on normalisation have to be computed.
|
||||||
|
#' @param MARGIN Indicates if the sums have to be computed across
|
||||||
|
#' samples or motus.
|
||||||
|
#' Allowed values are :
|
||||||
|
#' \itemize{
|
||||||
|
#' \item{'sample' or 1} for computing sum across samples
|
||||||
|
#' \item{'motu' or 2} for computing sum across motus
|
||||||
|
#' }
|
||||||
|
#' @param threshold a numeric value between 0 and 1 indicating which part of
|
||||||
|
#' the signal must be conserved. Default value is setup to
|
||||||
|
#' 0.01 (1% of the normalized signal).
|
||||||
|
#' @param operator is a logical comparison operator.
|
||||||
|
#'
|
||||||
|
#' @return A logical matrix usable for selecting cell in the read aboundancy matrix.
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\linkS4class{metabarcoding.data}}, \code{\link{threshold.mask}}, \code{\link{normalize}}
|
||||||
|
#'
|
||||||
|
#' @docType methods
|
||||||
|
#' @rdname const-threshold-mask-methods
|
||||||
|
#' @aliases const.threshold.mask-methods,metabarcoding.data
|
||||||
|
#' @author Aurelie Bonin
|
||||||
|
#'
|
||||||
|
setMethod("const.threshold.mask", "metabarcoding.data", function(data,MARGIN,threshold=0.01,operator='<') {
|
||||||
|
|
||||||
|
|
||||||
|
if (MARGIN == 'sample')
|
||||||
|
MARGIN=1
|
||||||
|
|
||||||
|
if (MARGIN == 'motu')
|
||||||
|
MARGIN=2
|
||||||
|
|
||||||
|
readcount = normalize(data,MARGIN,as.matrix=TRUE)
|
||||||
|
|
||||||
|
mask = do.call(operator,list(readcount,threshold))
|
||||||
|
|
||||||
|
return(mask)
|
||||||
|
})
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("threshold.set", function(data,
|
||||||
|
MARGIN,
|
||||||
|
threshold=0.97,
|
||||||
|
operator='<',
|
||||||
|
value=0,
|
||||||
|
normalize=TRUE,
|
||||||
|
mask.fun=threshold.mask) {
|
||||||
|
return(standardGeneric("threshold.set"))
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
setMethod("threshold.set", "metabarcoding.data", function(data,
|
||||||
|
MARGIN,
|
||||||
|
threshold=0.97,
|
||||||
|
operator='<',
|
||||||
|
value=0,
|
||||||
|
normalize=TRUE,
|
||||||
|
mask.fun=threshold.mask) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if (MARGIN == 'sample')
|
||||||
|
MARGIN=1
|
||||||
|
|
||||||
|
if (MARGIN == 'motu')
|
||||||
|
MARGIN=2
|
||||||
|
|
||||||
|
readcount = reads(data)
|
||||||
|
|
||||||
|
if (normalize)
|
||||||
|
data = normalize(data,c(2,1)[MARGIN])
|
||||||
|
|
||||||
|
mask = mask.fun(data,MARGIN,threshold,operator)
|
||||||
|
|
||||||
|
readcount[mask] = value
|
||||||
|
|
||||||
|
newdata = copy.metabarcoding.data(data,reads=readcount)
|
||||||
|
|
||||||
|
return(newdata)
|
||||||
|
|
||||||
|
})
|
407
ROBITools/R/mstat.R
Normal file
407
ROBITools/R/mstat.R
Normal file
@@ -0,0 +1,407 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
#' @import igraph
|
||||||
|
NULL
|
||||||
|
|
||||||
|
require(igraph)
|
||||||
|
|
||||||
|
# pos = expand.grid(x,y)
|
||||||
|
|
||||||
|
#' Computes the pairwise distance matrix as a data.frame where
|
||||||
|
#'
|
||||||
|
#' @param x a vector for the X coordinates
|
||||||
|
#' @param y a vector for the Y coordinates
|
||||||
|
#' @param labels a vector with the sample names
|
||||||
|
#'
|
||||||
|
#' @return a data.frame instance of three columns
|
||||||
|
#' - a : The label of the first sample
|
||||||
|
#' - b : The label of the second sample
|
||||||
|
#' - dist : The euclidian distance beween sample a and b
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' data(termes)
|
||||||
|
#' termes.ok = termes[,colSums(termes$reads)>0]
|
||||||
|
#' pos = expand.grid(1:3 * 10,1:7 * 10)
|
||||||
|
#' labels = rownames(termes.ok)
|
||||||
|
#' d = dist.grid(pos[,1],pos[2],labels)
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
dist.grid = function(x,y,labels=NULL){
|
||||||
|
pos = data.frame(x,y)
|
||||||
|
|
||||||
|
if (is.null(labels))
|
||||||
|
labels = as.character(interaction(pos))
|
||||||
|
else
|
||||||
|
labels = as.character(labels)
|
||||||
|
|
||||||
|
llabels=length(labels)
|
||||||
|
dpos=dist(pos)
|
||||||
|
|
||||||
|
a = rep(labels[1:(llabels-1)],(llabels-1):1)
|
||||||
|
b = do.call(c,(lapply(2:llabels, function(i) labels[i:llabels])))
|
||||||
|
|
||||||
|
return(data.frame(a,b,dist=as.vector(dpos)))
|
||||||
|
}
|
||||||
|
|
||||||
|
#' Builds the list of sample groups included in a circle around a central sample
|
||||||
|
#'
|
||||||
|
#' @param dtable a distance table between samples as
|
||||||
|
#' computed by \code{\link{dist.grid}}
|
||||||
|
#' @param radius the radius of the circle
|
||||||
|
#' @param center a \code{logical} value indicating if the center of
|
||||||
|
#' the group must be included in the group
|
||||||
|
#'
|
||||||
|
#' @return a list of vectors containing the labels of the group members
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' data(termes)
|
||||||
|
#' termes.ok = termes[,colSums(termes$reads)>0]
|
||||||
|
#' pos = expand.grid(1:3 * 10,1:7 * 10)
|
||||||
|
#' labels = rownames(termes.ok)
|
||||||
|
#' d = dist.grid(pos[,1],pos[2],labels)
|
||||||
|
#' groups = dist.center.group(d,20)
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
dist.center.group=function(dtable,radius,center=TRUE) {
|
||||||
|
|
||||||
|
fgroup = function(c) {
|
||||||
|
ig = dtable[(dtable[,1]==c | dtable[,2]==c) & dtable[,3] <= radius,]
|
||||||
|
return(union(ig[,1],ig[,2]))
|
||||||
|
}
|
||||||
|
|
||||||
|
pos = as.character(union(dtable[,1],dtable[,2]))
|
||||||
|
|
||||||
|
g = lapply(pos,fgroup)
|
||||||
|
names(g) = pos
|
||||||
|
|
||||||
|
if (!center)
|
||||||
|
g = mapply(setdiff,g,pos)
|
||||||
|
|
||||||
|
return(g)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#' Builds the list of sample groups including samples closest than a define distance
|
||||||
|
#'
|
||||||
|
#' A graph is build by applying the threshold \code{dmax} to the distance matrix
|
||||||
|
#' A group is a clique max in this graph. Consequently all member pairs of a group
|
||||||
|
#' are distant by less or equal to \code{dmax}.
|
||||||
|
#'
|
||||||
|
#' @param dtable a distance table between samples as
|
||||||
|
#' computed by \code{\link{dist.grid}}
|
||||||
|
#' @param dmax the maximum distance between two samples
|
||||||
|
#'
|
||||||
|
#' @return a list of vectors containing the labels of the group members
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' data(termes)
|
||||||
|
#' termes.ok = termes[,colSums(termes$reads)>0]
|
||||||
|
#' pos = expand.grid(1:3 * 10,1:7 * 10)
|
||||||
|
#' labels = rownames(termes.ok)
|
||||||
|
#' d = dist.grid(pos[,1],pos[2],labels)
|
||||||
|
#' groups = dist.clique.group(d,20)
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
dist.clique.group=function(dtable,dmax,center=True) {
|
||||||
|
gp = igraph::graph.edgelist(as.matrix(dtable[dtable$dist <= dmax,c('a','b')]),directed=FALSE)
|
||||||
|
g = igraph::maximal.cliques(gp)
|
||||||
|
return(lapply(g, function(i) igraph::V(gp)$name[i]))
|
||||||
|
}
|
||||||
|
|
||||||
|
#' Computes the univariate M statistics
|
||||||
|
#'
|
||||||
|
#' @param w the weigth matrix indicating the presence probability of each motu
|
||||||
|
#' in each samples. Each line corresponds to a sample and each column
|
||||||
|
#' to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
|
||||||
|
#' names. It is nice but not mandatory if the \code{colnames} refer to the MOTU id.
|
||||||
|
#'
|
||||||
|
#' @param groups the list of considered groups as computed by the \code{\link{dist.center.group}}
|
||||||
|
#' function
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{dist.center.group}}
|
||||||
|
#' @seealso \code{\link{m.weight}}
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' data(termes)
|
||||||
|
#' termes.ok = termes[,colSums(termes$reads)>0]
|
||||||
|
#' pos = expand.grid(1:3 * 10,1:7 * 10)
|
||||||
|
#' labels = rownames(termes.ok)
|
||||||
|
#' d = dist.grid(pos[,1],pos[2],labels)
|
||||||
|
#' groups = dist.center.group(d,20)
|
||||||
|
#' w = m.weight(termes.ok)
|
||||||
|
#' m = m.univariate(w,groups)
|
||||||
|
#'
|
||||||
|
#' @references Marcon, E., Puech, F., and Traissac, S. (2012).
|
||||||
|
#' Characterizing the relative spatial structure of point patterns.
|
||||||
|
#' International Journal of Ecology, 2012.
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
m.univariate = function(w,groups) {
|
||||||
|
|
||||||
|
nunivar = function(members,center) {
|
||||||
|
g = w[members,]
|
||||||
|
|
||||||
|
wn = colSums(g)
|
||||||
|
wa = sum(wn)
|
||||||
|
|
||||||
|
wn = wn - center
|
||||||
|
wa = wa - center
|
||||||
|
|
||||||
|
p = wn / wa * center
|
||||||
|
|
||||||
|
return(p)
|
||||||
|
}
|
||||||
|
|
||||||
|
centers = lapply(names(groups),function(x) w[x,])
|
||||||
|
|
||||||
|
Wf = colSums(w)
|
||||||
|
Wa = sum(Wf)
|
||||||
|
|
||||||
|
Denom.univar = colSums(w * (sweep(-w,2,Wf,'+') / (Wa - w)))
|
||||||
|
Num.univar = rowSums(mapply(nunivar,groups,centers))
|
||||||
|
|
||||||
|
Munivar=Num.univar/Denom.univar
|
||||||
|
Munivar[Denom.univar==0]=0
|
||||||
|
|
||||||
|
return(Munivar)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#' Computes the bivariate M statistics
|
||||||
|
#'
|
||||||
|
#' The function computes the bivariate M statiscics for a set of target species around a set of
|
||||||
|
#' focus species.
|
||||||
|
#'
|
||||||
|
#' @param w1 the weigth matrix indicating the presence probability of each motu
|
||||||
|
#' used as focus species in each samples. Each line corresponds to a sample and each column
|
||||||
|
#' to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
|
||||||
|
#' names. It is nice but not mandatory if the \code{colnames} refer to the MOTU id.
|
||||||
|
#'
|
||||||
|
#' @param w2 the weigth matrix indicating the presence probability of each motu
|
||||||
|
#' used as target species in each samples. Each line corresponds to a sample and each column
|
||||||
|
#' to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
|
||||||
|
#' names. It is nice but not mandatory if the \code{colnames} refer to the MOTU id.
|
||||||
|
#' if \code{w2} is not set, w1 is also used as target species. in this case the diagonal
|
||||||
|
#' of the matrix return contains the univariate M statistic for the diferent species.
|
||||||
|
#'
|
||||||
|
#' @param groups the list of considered groups as computed by the \code{\link{dist.center.group}}
|
||||||
|
#' function
|
||||||
|
#'
|
||||||
|
#' @return a matrix of M bivariate statistics with one focus species by row and one target species
|
||||||
|
#' by columns If \code{w2} is not specified the diagonal of the matrix is equal to the univariate
|
||||||
|
#' M statistic of the corresponding species.
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{dist.center.group}}
|
||||||
|
#' @seealso \code{\link{m.weight}}
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' data(termes)
|
||||||
|
#' termes.ok = termes[,colSums(termes$reads)>0]
|
||||||
|
#' pos = expand.grid(1:3 * 10,1:7 * 10)
|
||||||
|
#' labels = rownames(termes.ok)
|
||||||
|
#' d = dist.grid(pos[,1],pos[2],labels)
|
||||||
|
#' groups = dist.center.group(d,20)
|
||||||
|
#' w = m.weight(termes.ok)
|
||||||
|
#' m = m.bivariate(w,groups)
|
||||||
|
#'
|
||||||
|
#' @references Marcon, E., Puech, F., and Traissac, S. (2012).
|
||||||
|
#' Characterizing the relative spatial structure of point patterns.
|
||||||
|
#' International Journal of Ecology, 2012.
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
m.bivariate = function(w1,w2=NULL,groups) {
|
||||||
|
|
||||||
|
nunbivar = function(members,center) {
|
||||||
|
g = w2[members,]
|
||||||
|
|
||||||
|
wn = colSums(g)
|
||||||
|
wa = sum(wn)
|
||||||
|
|
||||||
|
if (self){
|
||||||
|
mwn = wn %*% t(rep(1,length(wn)))
|
||||||
|
diag(mwn)= wn - center
|
||||||
|
wa = wa - center
|
||||||
|
wna = mwn/wa
|
||||||
|
p = sweep(wna,2,center,'*')
|
||||||
|
#p = center %*% wna
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
wna= matrix(wn/wa,nrow=1)
|
||||||
|
p = center %*% wna
|
||||||
|
}
|
||||||
|
|
||||||
|
return(p)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is.null(w2)){
|
||||||
|
self = TRUE
|
||||||
|
w2=w1
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
self = FALSE
|
||||||
|
}
|
||||||
|
|
||||||
|
centers = lapply(names(groups),function(x) w[x,])
|
||||||
|
|
||||||
|
Wf = colSums(w1)
|
||||||
|
Wn = colSums(w2)
|
||||||
|
Wa = sum(Wn)
|
||||||
|
|
||||||
|
if (self){
|
||||||
|
Wn = sweep(-w1,2,Wn,'+')
|
||||||
|
Wna = Wn/(Wa - w1)
|
||||||
|
Denom.bivar = t(w1) %*% Wna
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Wna= t(Wn/Wa)
|
||||||
|
Denom.bivar = Wf %*% Wna
|
||||||
|
}
|
||||||
|
|
||||||
|
Num.bivar = matrix(0,nrow=ncol(w1),ncol=ncol(w2))
|
||||||
|
|
||||||
|
ng = length(groups)
|
||||||
|
|
||||||
|
for (i in 1:ng) {
|
||||||
|
Num.bivar = Num.bivar + nunbivar(groups[[i]],centers[[i]])
|
||||||
|
}
|
||||||
|
|
||||||
|
Mbivar=Num.bivar/Denom.bivar
|
||||||
|
|
||||||
|
Mbivar[Denom.bivar==0]=0
|
||||||
|
|
||||||
|
return(Mbivar)
|
||||||
|
}
|
||||||
|
|
||||||
|
#' Computes a weigth matrix from a \code{\linkS4class{metabarcoding.data}}
|
||||||
|
#'
|
||||||
|
#' The weight can be considered as a propability of presence of a MOTU in a
|
||||||
|
#' given sample. This function defines this probability as the fraction of
|
||||||
|
#' the maximal occurrence frequency over all samples.
|
||||||
|
#'
|
||||||
|
#' @param data a \code{\linkS4class{metabarcoding.data}} instance
|
||||||
|
#'
|
||||||
|
#' @return a weight matrix usable for M statistics
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' data(termes)
|
||||||
|
#' termes.ok = termes[,colSums(termes$reads)>0]
|
||||||
|
#' w = m.weight(termes.ok)
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
m.weight = function(data) {
|
||||||
|
ndata = normalize(data,MARGIN='sample')
|
||||||
|
fmax=apply(ndata$reads,2,max)
|
||||||
|
w = sweep(ndata$reads,2,fmax,'/')
|
||||||
|
rownames(w)=rownames(ndata)
|
||||||
|
colnames(w)=colnames(ndata)
|
||||||
|
return(w)
|
||||||
|
}
|
||||||
|
|
||||||
|
#' Simulate null distribion of the M statistics by Monte-Carlo
|
||||||
|
#'
|
||||||
|
#' Computes the null empirical distribution of the M statistics
|
||||||
|
#' by shuffling MOTUs among location.
|
||||||
|
#'
|
||||||
|
#' @param w the weigth matrix indicating the presence probability of each motu
|
||||||
|
#' in each samples. Each line corresponds to a sample and each column
|
||||||
|
#' to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
|
||||||
|
#' names.
|
||||||
|
#' @param groups the list of considered groups as computed by the \code{\link{dist.center.group}}
|
||||||
|
#' function
|
||||||
|
#' @param resampling the number of simulation to establish the null distribution
|
||||||
|
#'
|
||||||
|
#' @return a matrix of M score under the null hypothesis of random distribution of MOTUs
|
||||||
|
#' with a MOTUs per line and a culumn per simulation
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' data(termes)
|
||||||
|
#' termes.ok = termes[,colSums(termes$reads)>0]
|
||||||
|
#' pos = expand.grid(1:3 * 10,1:7 * 10)
|
||||||
|
#' labels = rownames(termes.ok)
|
||||||
|
#' d = dist.grid(pos[,1],pos[2],labels)
|
||||||
|
#' groups = dist.center.group(d,20)
|
||||||
|
#' w = m.weight(termes.ok)
|
||||||
|
#' dnull = dm.univariate(w,groups)
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
dm.univariate = function(w,groups,resampling=100) {
|
||||||
|
|
||||||
|
shuffle = function(w){
|
||||||
|
wr =apply(w,2,function(y) sample(y,length(y),replace=FALSE))
|
||||||
|
rownames(wr)=rownames(w)
|
||||||
|
return(wr)
|
||||||
|
}
|
||||||
|
|
||||||
|
msim = function(x) {
|
||||||
|
return(m.univariate(shuffle(w),groups))
|
||||||
|
}
|
||||||
|
|
||||||
|
dnull = mapply(msim,1:resampling)
|
||||||
|
|
||||||
|
rownames(dnull) = colnames(w)
|
||||||
|
|
||||||
|
return(dnull)
|
||||||
|
}
|
||||||
|
|
||||||
|
#' Test the significance of the M statistics by Monte-Carlo
|
||||||
|
#'
|
||||||
|
#' Computes computes the p.value the M statistics asociated to a MOTU
|
||||||
|
#' by shuffling MOTUs among location.
|
||||||
|
#'
|
||||||
|
#' @param w the weigth matrix indicating the presence probability of each motu
|
||||||
|
#' in each samples. Each line corresponds to a sample and each column
|
||||||
|
#' to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
|
||||||
|
#' names.
|
||||||
|
#' @param groups the list of considered groups as computed by the \code{\link{dist.center.group}}
|
||||||
|
#' function
|
||||||
|
#' @param resampling the number of simulation to establish the null distribution
|
||||||
|
#'
|
||||||
|
#' @param alternative a character value in \code{c('two.sided','less','greater')}
|
||||||
|
#' - two.sided : the m stat is check against the two side of the empirical
|
||||||
|
#' M distribution
|
||||||
|
#' - less : test if the M stat is lesser than the M observed in the the empirical
|
||||||
|
#' M distribution (exlusion hypothesis)
|
||||||
|
#' - greater : test if the M stat is greater than the M observed in the the empirical
|
||||||
|
#' M distribution (aggregation hypothesis)
|
||||||
|
#'
|
||||||
|
#' @return a vector of p.value with an attribute \code{m.stat} containing the actual M stat
|
||||||
|
#' for each MOTUs
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' data(termes)
|
||||||
|
#' termes.ok = termes[,colSums(termes$reads)>0]
|
||||||
|
#' pos = expand.grid(1:3 * 10,1:7 * 10)
|
||||||
|
#' labels = rownames(termes.ok)
|
||||||
|
#' d = dist.grid(pos[,1],pos[2],labels)
|
||||||
|
#' groups = dist.center.group(d,20)
|
||||||
|
#' w = m.weight(termes.ok)
|
||||||
|
#' pval = m.univariate.test(w,groups)
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
m.univariate.test = function(w,groups,resampling=100,alternative='two.sided') {
|
||||||
|
dnull = dm.univariate(w,groups,resampling)
|
||||||
|
m = m.univariate(w,groups)
|
||||||
|
pnull = sapply(1:dim(dnull)[1],function(y) 1 - ecdf(dnull[y,])(m[y]))
|
||||||
|
|
||||||
|
p.value=NULL
|
||||||
|
|
||||||
|
if (alternative=='two.sided') {
|
||||||
|
p.value = mapply(min,pnull,1 - pnull)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (alternative=='less') {
|
||||||
|
p.value = pnull
|
||||||
|
}
|
||||||
|
|
||||||
|
if (alternative=='greater') {
|
||||||
|
p.value = 1 - pnull
|
||||||
|
}
|
||||||
|
|
||||||
|
# Set p.value to 1 if the MOTU occurres in only one place
|
||||||
|
n = colSums(w > 0)
|
||||||
|
p.value[n==1]=1
|
||||||
|
|
||||||
|
names(p.value) = colnames(w)
|
||||||
|
attr(p.value,'m.stat')=m
|
||||||
|
|
||||||
|
return(p.value)
|
||||||
|
}
|
118
ROBITools/R/obiclean.R
Normal file
118
ROBITools/R/obiclean.R
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
# TODO: Add comment
|
||||||
|
#
|
||||||
|
# Author: coissac
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("extracts.obiclean", function(obj) {
|
||||||
|
return(standardGeneric("extracts.obiclean"))
|
||||||
|
})
|
||||||
|
|
||||||
|
#' Extracts the obiclean results
|
||||||
|
#'
|
||||||
|
#' The method \code{extracts.obiclean} of the class \code{\linkS4class{metabarcoding.data}}
|
||||||
|
#' extracts \code{obiclean} results from the MOTUs descriptions include in the
|
||||||
|
#' \code{\linkS4class{metabarcoding.data}} instance.
|
||||||
|
#' When an \code{obitab} file is imported using the \code{\link{import.metabarcoding.data}}
|
||||||
|
#' if \code{obiclean} results are present in the file they are stored in the
|
||||||
|
#' \code{motu} data.frame. By calling this methods, MOTU descriptors describing
|
||||||
|
#' the \code{obiclean} status are moved to a set of layers.
|
||||||
|
#'
|
||||||
|
#' @param obj the \code{\linkS4class{metabarcoding.data}} to analyze
|
||||||
|
#'
|
||||||
|
#' @return the modified \code{\linkS4class{metabarcoding.data}} instance
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#'
|
||||||
|
#' # load termite data set from the ROBITools sample data
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' # shows the initial list of layer names
|
||||||
|
#' layer.names(t)
|
||||||
|
#'
|
||||||
|
#' # extracts the obiclean status
|
||||||
|
#' termes = extracts.obiclean(termes)
|
||||||
|
#'
|
||||||
|
#' # shows the name of the newly created layers
|
||||||
|
#' layer.names(t)
|
||||||
|
#'
|
||||||
|
#'
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\linkS4class{metabarcoding.data}}, \code{\link{threshold.mask}}, \code{\link{normalize}}
|
||||||
|
#'
|
||||||
|
#' @docType methods
|
||||||
|
#' @rdname extracts-obiclean-methods
|
||||||
|
#' @aliases extracts.obiclean-methods,metabarcoding.data
|
||||||
|
#' @author Eric Coissac
|
||||||
|
#'
|
||||||
|
|
||||||
|
|
||||||
|
setMethod("extracts.obiclean", "metabarcoding.data", function(obj) {
|
||||||
|
|
||||||
|
pat = "^obiclean_status:.*$"
|
||||||
|
cols = colnames(obj@motus)
|
||||||
|
cleancols = grep(pat,cols)
|
||||||
|
clean.names=cols[cleancols]
|
||||||
|
p = grep(pat,cols)
|
||||||
|
d = t(as.factor.or.matrix(obj@motus[,p]))
|
||||||
|
n = sapply(strsplit(cols[p],':'),function(y) y[[2]])
|
||||||
|
rownames(d)=n
|
||||||
|
d = d[rownames(obj@reads),]
|
||||||
|
obj[["obiclean_status"]]=d
|
||||||
|
|
||||||
|
newmotus = obj@motus[-cleancols]
|
||||||
|
|
||||||
|
pat = "^obiclean_count:.*$"
|
||||||
|
cols = colnames(newmotus)
|
||||||
|
cleancols = grep(pat,cols)
|
||||||
|
clean.names=cols[cleancols]
|
||||||
|
p = grep(pat,cols)
|
||||||
|
d = t(as.factor.or.matrix(newmotus[,p]))
|
||||||
|
n = sapply(strsplit(cols[p],':'),function(y) y[[2]])
|
||||||
|
rownames(d)=n
|
||||||
|
d = d[rownames(obj@reads),]
|
||||||
|
obj[["obiclean_count"]]=d
|
||||||
|
|
||||||
|
newmotus = newmotus[-cleancols]
|
||||||
|
|
||||||
|
pat = "^obiclean_cluster:.*$"
|
||||||
|
cols = colnames(newmotus)
|
||||||
|
cleancols = grep(pat,cols)
|
||||||
|
clean.names=cols[cleancols]
|
||||||
|
p = grep(pat,cols)
|
||||||
|
d = t(as.factor.or.matrix(newmotus[,p]))
|
||||||
|
n = sapply(strsplit(cols[p],':'),function(y) y[[2]])
|
||||||
|
rownames(d)=n
|
||||||
|
d = d[rownames(obj@reads),]
|
||||||
|
obj[["obiclean_cluster"]]=d
|
||||||
|
|
||||||
|
newmotus = newmotus[-cleancols]
|
||||||
|
|
||||||
|
newdata = copy.metabarcoding.data(obj,motus=newmotus)
|
||||||
|
|
||||||
|
return(newdata)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("extracts.obiclean_cluster", function(obj) {
|
||||||
|
return(standardGeneric("extracts.obiclean_cluster"))
|
||||||
|
})
|
||||||
|
|
||||||
|
setMethod("extracts.obiclean_cluster", "metabarcoding.data", function(obj) {
|
||||||
|
|
||||||
|
obiclean = extracts.obiclean(obj)
|
||||||
|
obihead = obiclean[,! is.na(obiclean$motus$obiclean_head)]
|
||||||
|
obihead$obiclean_count[is.na(obihead$obiclean_count)]=0
|
||||||
|
reads = obihead$obiclean_count
|
||||||
|
|
||||||
|
l = obihead@layers[layer.names(obihead) != "obiclean_count"]
|
||||||
|
|
||||||
|
newdata = copy.metabarcoding.data(obihead,reads=reads,layers=l)
|
||||||
|
|
||||||
|
return(newdata)
|
||||||
|
}
|
||||||
|
)
|
0
ROBITools/R/pcrslayer.R
Normal file
0
ROBITools/R/pcrslayer.R
Normal file
84
ROBITools/R/plot.PCRplate.R
Normal file
84
ROBITools/R/plot.PCRplate.R
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#' Plot PCR plates
|
||||||
|
#'
|
||||||
|
#' Plots samples localization in PCR plates, and points out problematic samples if provided.
|
||||||
|
#'
|
||||||
|
#' @param x a \code{\link{metabarcoding.data}} object
|
||||||
|
#' @param samples a character vector containing names of problematic samples. Default is \code{NULL}
|
||||||
|
#' @param different a boolean indicating whether different tags where used in forward and reverse to identify samples. Default is \code{TRUE}
|
||||||
|
#' @param ... arguments ot be passed to methods, such as graphical parameters
|
||||||
|
#'
|
||||||
|
#' @return \code{\link{plot.PCRplate}} returns a plot displaying no more than 4 PCR plates, with problematic sample localization
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' \dontshow{# switch the working directory to the data package directory}
|
||||||
|
#' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
|
||||||
|
#'
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' # reading the termes_ngsfilt.txt file
|
||||||
|
#' termes.ngs=import.ngsfilter.data('termes_ngsfilt.txt', platewell="position")
|
||||||
|
#'
|
||||||
|
#' # including ngsfilter data into termes data
|
||||||
|
#' attr(termes, "samples") = termes.ngs[rownames(termes),]
|
||||||
|
#'
|
||||||
|
#' #plot PCR plate plan
|
||||||
|
#' col = rep("green", nrow(termes))
|
||||||
|
#' col[grep("r", rownames(termes))] = "red"
|
||||||
|
#' plot.PCRplate(termes, col=col)
|
||||||
|
#'
|
||||||
|
#' #highlighting location of samples with low identification score
|
||||||
|
#'
|
||||||
|
#' #low quality taxonomic assignements identification
|
||||||
|
#' library(plotrix)
|
||||||
|
#' weighted.hist(termes$motus$best_identity, colSums(termes$reads), breaks = 20, ylab = "Nb reads", xlab = "Ecotag scores", xaxis=F)
|
||||||
|
#' axis(1, labels = T)
|
||||||
|
#' lowqual.seq = rownames(termes$motus)[termes$motus$best_identity < 0.7]
|
||||||
|
#'
|
||||||
|
#' #identification and localization (in PCR plate) of samples with high proportions of low quality taxonomic assignements
|
||||||
|
#' termes.freq= normalize(termes, MARGIN=1)$reads
|
||||||
|
#' hist(log10(rowSums(termes.freq[,lowqual.seq]) + 1e-05), breaks = 20, xlab = "Prop low quality reads")
|
||||||
|
#' lowqual.sample = rownames(termes)[log10(rowSums(termes.freq[, lowqual.seq]) + 1e-05) > -0.5]
|
||||||
|
#'
|
||||||
|
#' plot.PCRplate(termes, lowqual.sample, col=col)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{import.metabarcoding.data}}
|
||||||
|
#'
|
||||||
|
#' @author Lucie Zinger
|
||||||
|
#' @keywords DNA metabarcoding
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
plot.PCRplate = function(x, samples=NULL, col="cyan2", different=T, ...) {
|
||||||
|
|
||||||
|
if(length(grep("xPlate", colnames(x$samples)))==0 |
|
||||||
|
length(grep("yPlate", colnames(x$samples)))==0) {
|
||||||
|
stop("samples/controls position in PCR plates (xPlate and yPlate) are not defined")
|
||||||
|
}
|
||||||
|
|
||||||
|
if(length(grep("tagF", colnames(x$samples)))==0 |
|
||||||
|
length(grep("tagR", colnames(x$samples)))==0) {
|
||||||
|
stop("tags (tagF and tagR) are not defined")
|
||||||
|
}
|
||||||
|
|
||||||
|
nplate = max(x$samples$nbPlate)
|
||||||
|
|
||||||
|
if(nplate>4) {
|
||||||
|
stop("Cannot plot more than 4 plates")
|
||||||
|
}
|
||||||
|
|
||||||
|
plot(x$samples$xPlate, -x$samples$yPlate, pch=19, xaxt="n", yaxt="n", col=col,
|
||||||
|
xlim=c(-5,17), ylab="y plate", xlab= "x plate", ylim=c(-4.5*8-5,0), ...)
|
||||||
|
if(different==T) {
|
||||||
|
text(-3, -unique(x$samples$yPlate[order(x$samples$yPlate)]), unique(x$samples$tagF[order(x$samples$yPlate)]), cex=0.5)
|
||||||
|
text(unique(x$samples$xPlate[order(x$samples$xPlate)]), -5, unique(x$samples$tagR[order(x$samples$xPlate)]), cex=0.5, srt=90)
|
||||||
|
}
|
||||||
|
abline(h=-seq(8.5,8*nplate+0.5,8), lty=2, col="grey")
|
||||||
|
segments(c(0,13), rep(min(-x$samples$yPlate),2), c(0,13), c(0,0), lty=2, col="grey")
|
||||||
|
|
||||||
|
#plot problematic samples
|
||||||
|
if(!is.null(samples)) {
|
||||||
|
points(x$samples[samples,"xPlate"], -x$samples[samples,"yPlate"], pch="x")
|
||||||
|
}
|
||||||
|
}
|
105
ROBITools/R/plot.seqinsample.R
Normal file
105
ROBITools/R/plot.seqinsample.R
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#' Plot sequence abundance in samples
|
||||||
|
#'
|
||||||
|
#' Plots relative abundances of a set of sequences in all samples (log10 transformed)
|
||||||
|
#'
|
||||||
|
#'
|
||||||
|
#' @param x a \code{\link{metabarcoding.data}} object
|
||||||
|
#' @param seqset a vetcor with sequences names
|
||||||
|
#' @param seqtype a string indicating what type of sequences are displayed
|
||||||
|
#' @param controls a vector indicating the negative controls names in the x object.
|
||||||
|
#' Default is \code{NULL}
|
||||||
|
#'
|
||||||
|
#' @return returns a plot with the log10 transformed relative porportion of
|
||||||
|
#' selected MOTUs in each samples. If the number of samples is > 96,
|
||||||
|
#' then the plot is displayed in 4 panels
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#'
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' seqset = rownames(termes$motus)[which(termes$motus$genus_name=="Anoplotermes")]
|
||||||
|
#' plot.seqinsample(termes, seqset, "Anoplotermes")
|
||||||
|
#'
|
||||||
|
#' controls = rownames(termes)[grep("r", rownames(termes))]
|
||||||
|
#' seqset = rownames(termes$motus)[which(termes$motus$best_identity<0.7)]
|
||||||
|
#' plot.seqinsample(termes, seqset, "Not assigned", controls)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\linkS4class{taxonomy.obitools}}, and method \code{\link{taxonmicank}}
|
||||||
|
#'
|
||||||
|
#' @author Lucie Zinger
|
||||||
|
#' @keywords metabarcoding
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
|
||||||
|
plot.seqinsample = function(x, seqset, seqtype, controls=NULL){
|
||||||
|
|
||||||
|
require(vegan)
|
||||||
|
|
||||||
|
x.freq = vegan::decostand(x$reads,"total",1)
|
||||||
|
|
||||||
|
if(!is.null(controls)){
|
||||||
|
controls.ind = match(controls, rownames(x.freq))
|
||||||
|
}
|
||||||
|
|
||||||
|
if(nrow(x.freq)>96){
|
||||||
|
x.freq.parse = seq(0,round(nrow(x$samples), digit=0),
|
||||||
|
round(nrow(x$samples)/4, digit=0))
|
||||||
|
|
||||||
|
layout(matrix(c(1,2,3,1,4,5),3,2), height=c(0.3,1,1))
|
||||||
|
par(oma=c(1,1,1,0), mar=c(3,3,1,1))
|
||||||
|
|
||||||
|
#legend
|
||||||
|
breaks = seq(log10(1e-4),log10(1), length.out=100)
|
||||||
|
plot(breaks, rep(1,100), col=topo.colors(100), pch=15, cex=2, ylim=c(0,1.5),
|
||||||
|
xaxt="n", yaxt="n", bty='n')
|
||||||
|
text(breaks[seq(1,100,10)], rep(0.7,length(seq(1,100,10))),
|
||||||
|
round(10^breaks[seq(1,100,10)],4))
|
||||||
|
mtext("Seqence frequencies:", side=3, line=0, cex=0.8)
|
||||||
|
|
||||||
|
#plot
|
||||||
|
for(i in 1:(length(x.freq.parse)-1)) {
|
||||||
|
range = (x.freq.parse[i]+1):(x.freq.parse[i]+round(nrow(x$samples)/4, digit=0))
|
||||||
|
mat = x.freq[range,seqset]
|
||||||
|
image(log10(mat),col = topo.colors(100), xaxt="n", yaxt="n", breaks=c(breaks,0))
|
||||||
|
|
||||||
|
if(!is.null(controls)){
|
||||||
|
if(length(na.omit(match(controls.ind, range)))!=0){
|
||||||
|
abline(v=seq(0,1,l=round(nrow(x$samples)/4, digit=0))[match(controls.ind, range)],col="red", lty=3)
|
||||||
|
}}
|
||||||
|
|
||||||
|
axis(side=1,at=seq(0,1,l=round(nrow(x$samples)/4,digit=0)),
|
||||||
|
labels=rownames(x$samples)[range],
|
||||||
|
las=2, cex.axis=0.3)
|
||||||
|
}
|
||||||
|
mtext(side=2, paste(seqtype, "n = ", length(seqset)), outer=T, cex=0.7, font=3)
|
||||||
|
mtext(side=1, "Samples", cex=0.7, outer=T)
|
||||||
|
|
||||||
|
} else {
|
||||||
|
layout(matrix(c(1,2,1,2),2,2), height=c(0.3,1))
|
||||||
|
par(oma=c(1,1,1,0), mar=c(3,3,1,1))
|
||||||
|
|
||||||
|
#legend
|
||||||
|
breaks = seq(log10(1e-4),log10(1), length.out=100)
|
||||||
|
plot(breaks, rep(1,100), col=topo.colors(100), pch=15, cex=2, ylim=c(0,1.5),
|
||||||
|
xaxt="n", yaxt="n", bty='n')
|
||||||
|
text(breaks[seq(1,100,10)], rep(0.7,length(seq(1,100,10))),
|
||||||
|
round(10^breaks[seq(1,100,10)],4))
|
||||||
|
mtext("Seqence frequencies:", side=3, line=0, cex=0.8)
|
||||||
|
|
||||||
|
image(log10(x.freq[,seqset]),col = topo.colors(100), xaxt="n", yaxt="n", breaks=c(breaks,0))
|
||||||
|
|
||||||
|
if(!is.null(controls)){
|
||||||
|
abline(v=seq(0,1,l=round(nrow(x$samples), digit=0))[controls.ind],col="red", lty=3)
|
||||||
|
}
|
||||||
|
axis(side=1,at=seq(0,1,l=round(nrow(x$samples),digit=0)),
|
||||||
|
labels=rownames(x$samples),
|
||||||
|
las=2, cex.axis=0.3)
|
||||||
|
mtext(side=2, paste(seqtype, "n = ", length(seqset)), outer=T, cex=0.7, font=3)
|
||||||
|
mtext(side=1, "Samples", cex=0.7, outer=T)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
99
ROBITools/R/rarefy.R
Normal file
99
ROBITools/R/rarefy.R
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
# TODO: Add comment
|
||||||
|
#
|
||||||
|
# Author: coissac
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
#' @export
|
||||||
|
setGeneric("rarefy", function(x,n,first.pass=0.95,pseudo.count=0,...) {
|
||||||
|
return(standardGeneric("rarefy"))
|
||||||
|
})
|
||||||
|
|
||||||
|
setMethod("rarefy", "ANY", function(x,n,first.pass=0.95,pseudo.count=0,sum=NA) {
|
||||||
|
|
||||||
|
if (is.na(sum))
|
||||||
|
sum=sum(x)
|
||||||
|
|
||||||
|
if (sum < sum(x))
|
||||||
|
stop("sum parameter must be greater or equal to sum(x)")
|
||||||
|
|
||||||
|
grey = sum-sum(x)
|
||||||
|
|
||||||
|
probs = x + pseudo.count
|
||||||
|
|
||||||
|
if (grey > 0)
|
||||||
|
probs = c(probs,grey)
|
||||||
|
|
||||||
|
# Just to ensure at least one execution of the loop
|
||||||
|
n1 = n * 2
|
||||||
|
|
||||||
|
while(n1 > n)
|
||||||
|
n1 = rpois(1,n * first.pass)
|
||||||
|
|
||||||
|
rep1 = as.vector(rmultinom(1,n1,probs))
|
||||||
|
n2 = sum(rep1)
|
||||||
|
|
||||||
|
levels = 1:length(probs)
|
||||||
|
|
||||||
|
rep2= as.vector(table(factor(sample(levels,
|
||||||
|
n - n2,
|
||||||
|
replace=TRUE,
|
||||||
|
prob = probs),
|
||||||
|
levels=levels)))
|
||||||
|
|
||||||
|
rep1 = (rep1 + rep2)
|
||||||
|
|
||||||
|
if (grey > 0)
|
||||||
|
rep1 = rep1[-length(rep1)]
|
||||||
|
|
||||||
|
return(rep1)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
setMethod("rarefy", "metabarcoding.data", function(x,n,first.pass=0.95,pseudo.count=0,MARGIN='sample') {
|
||||||
|
|
||||||
|
if (MARGIN == 'sample')
|
||||||
|
MARGIN=1
|
||||||
|
|
||||||
|
if (MARGIN == 'motu')
|
||||||
|
MARGIN=2
|
||||||
|
|
||||||
|
dreads= dim(x@reads)
|
||||||
|
rreads= matrix(0,nrow = dreads[1] , ncol = dreads[2])
|
||||||
|
|
||||||
|
if (MARGIN == 1)
|
||||||
|
for (i in 1:dreads[1]) {
|
||||||
|
rreads[i,]=rarefy(x@reads[i,],
|
||||||
|
n=n,
|
||||||
|
first.pass=first.pass,
|
||||||
|
pseudo.count=pseudo.count)
|
||||||
|
}
|
||||||
|
|
||||||
|
# rreads = t(apply(reads,1,rarefy,n=n,
|
||||||
|
# first.pass=first.pass,
|
||||||
|
# pseudo.count=pseudo.count))
|
||||||
|
else
|
||||||
|
for (i in 1:dreads[2]) {
|
||||||
|
rreads[,i]=rarefy(x@reads[,i],
|
||||||
|
n=n,
|
||||||
|
first.pass=first.pass,
|
||||||
|
pseudo.count=pseudo.count)
|
||||||
|
}
|
||||||
|
|
||||||
|
# rreads = as.matrix(apply(reads,2,rarefy,n=n,
|
||||||
|
# first.pass=first.pass,
|
||||||
|
# pseudo.count=pseudo.count))
|
||||||
|
|
||||||
|
rreads=as.matrix(rreads)
|
||||||
|
|
||||||
|
rownames(rreads) = rownames(x@reads)
|
||||||
|
colnames(rreads) = colnames(x@reads)
|
||||||
|
|
||||||
|
newdata = copy.metabarcoding.data(x,reads=rreads)
|
||||||
|
|
||||||
|
return(newdata)
|
||||||
|
|
||||||
|
})
|
||||||
|
|
56
ROBITools/R/read.ngsfilter.R
Normal file
56
ROBITools/R/read.ngsfilter.R
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#' Read an OBITools ngsfilter file
|
||||||
|
#'
|
||||||
|
#' Reads a ngsfilter file as formatted for the OBITools. For now, needs to be tab delimited till the "F" column.
|
||||||
|
#' Any additionnal information needs to be space delimited.
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{import.metabarcoding.data}}
|
||||||
|
#' @author Lucie Zinger
|
||||||
|
#' @keywords data import
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
|
||||||
|
read.ngsfilter <- function(filename, decimal='.', as.is=!stringsAsFactors, stringsAsFactors = default.stringsAsFactors()) {
|
||||||
|
|
||||||
|
t<-read.table(file=filename, header=F, sep="\t", as.is=T)
|
||||||
|
beg <- t[,1:5]
|
||||||
|
|
||||||
|
colnames(beg) <- c('experiment','sample','tags','forward_primer','reverse_primer')
|
||||||
|
if (length(unique(beg$sample))==nrow(beg))
|
||||||
|
rownames(beg) <- beg$sample
|
||||||
|
end <- t[,c(2,6)]
|
||||||
|
|
||||||
|
#F <- unlist(lapply(end$V6, function(x) strsplit(x,"@")[[1]][1]))
|
||||||
|
rawextras <- unlist(lapply(end$V6, function(x) strsplit(x,"@")[[1]][2]))
|
||||||
|
|
||||||
|
rawextras <- lapply(rawextras, function(s) strsplit(s, '; ')[[1]])
|
||||||
|
rawextras <- lapply(rawextras, function(l) unlist(lapply(l, function(s) sub("^ +","",s))))
|
||||||
|
rawextras <- lapply(rawextras, function(l) unlist(lapply(l, function(s) sub(" +$","",s))))
|
||||||
|
|
||||||
|
|
||||||
|
rawextras <- lapply(rawextras, function(l) unlist(lapply(l, function(s) strsplit(s,"="))))
|
||||||
|
|
||||||
|
|
||||||
|
columnnames <- unique(unlist(lapply(rawextras, function(l) l[seq(1,length(l),2)])))
|
||||||
|
|
||||||
|
m <- matrix(nrow=nrow(end), ncol=length(columnnames))
|
||||||
|
colnames(m) <- columnnames
|
||||||
|
m <- as.data.frame(m)
|
||||||
|
|
||||||
|
|
||||||
|
#print(head(rawextras))
|
||||||
|
|
||||||
|
|
||||||
|
tt <- lapply(rawextras, function(l) list(l[seq(1,length(l),2)],l[seq(2,length(l),2)]))
|
||||||
|
invisible(lapply(1:length(tt), function(i){m[i,tt[[i]][[1]]] <<- tt[[i]][[2]]}))
|
||||||
|
|
||||||
|
invisible(lapply(colnames(m), function(n) m[,n] <<- type.convert(m[,n], dec=decimal, as.is=as.is)))
|
||||||
|
|
||||||
|
ngs = cbind(beg, m)
|
||||||
|
rownames(ngs) = ngs$sample
|
||||||
|
class(ngs)<-c('ngsfilter.data',class(ngs))
|
||||||
|
|
||||||
|
return(ngs)
|
||||||
|
}
|
39
ROBITools/R/read.obitab.R
Normal file
39
ROBITools/R/read.obitab.R
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
|
||||||
|
#' Reads a data file produced by the obitab command
|
||||||
|
#'
|
||||||
|
#' Read a data file issued from the convertion of a fasta
|
||||||
|
#' file to a tabular file by the obitab command
|
||||||
|
#'
|
||||||
|
#' @param file a string containing the file name of the obitab file.
|
||||||
|
#' @param sep Column separator in the obitab file.
|
||||||
|
#' The default separator is the tabulation.
|
||||||
|
#'
|
||||||
|
#' @return a \code{data.frame} instance containing the obitab file
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' require(ROBITools)
|
||||||
|
#'
|
||||||
|
#' \dontshow{# switch the working directory to the data package directory}
|
||||||
|
#' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
|
||||||
|
#'
|
||||||
|
#' # read the termes.tab file
|
||||||
|
#' termes=read.obitab('termes.tab')
|
||||||
|
#'
|
||||||
|
#' # print the dimensions of the data.frame
|
||||||
|
#' dim(termes)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{import.metabarcoding.data}}
|
||||||
|
#' @author Eric Coissac
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
read.obitab <-
|
||||||
|
function(filename,sep='\t') {
|
||||||
|
|
||||||
|
data=read.delim(filename,sep=sep,strip.white=T,check.names =F)
|
||||||
|
data
|
||||||
|
|
||||||
|
}
|
||||||
|
|
17
ROBITools/R/read.sumatra.R
Normal file
17
ROBITools/R/read.sumatra.R
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: Add comment
|
||||||
|
#
|
||||||
|
# Author: coissac
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
read.sumatra = function(filename) {
|
||||||
|
data = read.table(filename,sep="\t",header=FALSE)
|
||||||
|
score = data[,3]
|
||||||
|
name.first = mapply(min,as.character(s[,1]),as.character(s[,2]))
|
||||||
|
name.second= mapply(max,as.character(s[,1]),as.character(s[,2]))
|
||||||
|
sname = as.character(interaction(data[,1],data[,2]))
|
||||||
|
}
|
123
ROBITools/R/s3objects.R
Normal file
123
ROBITools/R/s3objects.R
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
# TODO: Add comment
|
||||||
|
#
|
||||||
|
# Author: coissac
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
#' Adds a class into the class hierarchie attribute.
|
||||||
|
#'
|
||||||
|
#' \code{addS3Class} adds a new class name to the vector
|
||||||
|
#' of class associated to the object. This the way to
|
||||||
|
#' assign an object to an S3 class. \code{addS3Class} add
|
||||||
|
#' the new class name in front of the class vector
|
||||||
|
#'
|
||||||
|
#' @param object the object to modify
|
||||||
|
#' @param classname the name of the new class
|
||||||
|
#'
|
||||||
|
#' @return the object given as parametter casted to the new
|
||||||
|
#' class
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' x = c(1,3,2,5)
|
||||||
|
#' x = addS3Class(x,"my.vector")
|
||||||
|
#' class(x)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{rmS3Class}}
|
||||||
|
#'
|
||||||
|
#' @note for efficiency purpose no check is done on the input
|
||||||
|
#' parametters
|
||||||
|
#'
|
||||||
|
#' @keywords system function
|
||||||
|
#'
|
||||||
|
#' @author Eric Coissac
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
addS3Class = function(object,classname) {
|
||||||
|
class(object) = c(classname,class(object))
|
||||||
|
return(object)
|
||||||
|
}
|
||||||
|
|
||||||
|
#' Removes a class from the class hierarchie attribute.
|
||||||
|
#'
|
||||||
|
#' \code{rmS3Class} removes a class name from the vector
|
||||||
|
#' of class associated to the object. This the way to
|
||||||
|
#' remove the association between an object and a S3 class.
|
||||||
|
#'
|
||||||
|
#' @param object the object to modify
|
||||||
|
#' @param classname the name of the class to remove
|
||||||
|
#'
|
||||||
|
#' @return the object given as parametter.
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' x = c(1,3,2,5)
|
||||||
|
#' x = addS3Class(x,"my.vector")
|
||||||
|
#' class(x)
|
||||||
|
#' x = rmS3Class(x,"my.vector")
|
||||||
|
#' class(x)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{addS3Class}}
|
||||||
|
#'
|
||||||
|
#' @note for efficiency purpose no check is done on the input
|
||||||
|
#' parametters
|
||||||
|
#'
|
||||||
|
#' @keywords system function
|
||||||
|
#'
|
||||||
|
#' @author Eric Coissac
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
rmS3Class = function(object,classname) {
|
||||||
|
c = class(object)
|
||||||
|
if (! is.null(c))
|
||||||
|
index = match(classname,c)
|
||||||
|
class(object)=c[-index]
|
||||||
|
return(object)
|
||||||
|
}
|
||||||
|
|
||||||
|
#' create basic functions to manipulate a new S3 class
|
||||||
|
#'
|
||||||
|
#' createS3Class function create in the \code{package:ROBITools}
|
||||||
|
#' environment an \code{is.xxx} function and an \code{as.xxx} function
|
||||||
|
#' allowing to test if an abject belong the class \code{xxx} and to add
|
||||||
|
#' the class \code{xxx} to the class list of an object. \code{xxx} is a
|
||||||
|
#' generic class name that is specified through the \code{classname}
|
||||||
|
#' argument of the function.
|
||||||
|
#'
|
||||||
|
#' @param classname a \code{character string} indicating the name
|
||||||
|
#' of the new class.
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#'
|
||||||
|
#' # Create a new S3 class named mynewclass
|
||||||
|
#' createS3Class('mynewclass')
|
||||||
|
#'
|
||||||
|
#' #create a new vector object
|
||||||
|
#' x=c(1,4,6)
|
||||||
|
#'
|
||||||
|
#' # test if it belongs the new class, that is false
|
||||||
|
#' is.mynewclass(x)
|
||||||
|
#'
|
||||||
|
#' # Associate x to the new class
|
||||||
|
#' as.mynewclass(x)
|
||||||
|
#'
|
||||||
|
#' # test again if x belongs the new class, that is now true
|
||||||
|
#' is.mynewclass(x)
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{rmS3Class}}
|
||||||
|
#'
|
||||||
|
#' @note Take care that the new functions are created in the
|
||||||
|
#' \code{package:ROBITools} environment.
|
||||||
|
#'
|
||||||
|
#' @keywords system function
|
||||||
|
#'
|
||||||
|
#' @author Eric Coissac
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
createS3Class = function(classname) {
|
||||||
|
is.class = function(object) any(class(object)==classname)
|
||||||
|
as.class = function(object) return(addS3Class(object,classname))
|
||||||
|
|
||||||
|
assign(paste('is',classname,sep="."),is.class,envir=globalenv())
|
||||||
|
assign(paste('as',classname,sep="."),as.class,envir=globalenv())
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
89
ROBITools/R/taxoDBtree.R
Normal file
89
ROBITools/R/taxoDBtree.R
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
#'@include 02_class_metabarcoding.data.R
|
||||||
|
#'@import ROBITaxonomy
|
||||||
|
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#' Construct a taxonomic tree from a list of taxa
|
||||||
|
#'
|
||||||
|
#' Construct a graph from a table containing the taxonomic path of sequences
|
||||||
|
#'
|
||||||
|
#'
|
||||||
|
#' @param x a table containing the taxonomic path of the references. Typically an output from get.classic.taxonomy
|
||||||
|
#'
|
||||||
|
#' @return g a directed graph displaying the taxonomy hierarchy of the input data. Stored in a \code{\link{igraph}} object
|
||||||
|
#' where the taxonomic ranks of the vertices are added to the vertices attributes
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#'
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' taxo=default.taxonomy()
|
||||||
|
#'
|
||||||
|
#' termes.taxo.table = get.classic.taxonomy(termes, taxo, "taxid")
|
||||||
|
#' head(termes.taxo.table)
|
||||||
|
#'
|
||||||
|
#' graph.tax.termes = dbtree(termes.taxo.table[,1:7])
|
||||||
|
#' library(igraph)
|
||||||
|
#'
|
||||||
|
#' #plot the tree
|
||||||
|
#' coord = layout.reingold.tilford(graph.tax.termes, root=1, circular=F)
|
||||||
|
#' v.cex = as.factor(V(graph.tax.termes)$rank)
|
||||||
|
#' levels(v.cex) = match(levels(v.cex), colnames(termes.taxo.table))
|
||||||
|
#' plot(graph.tax.termes, vertex.size=1, vertex.label.cex=2*(as.numeric(as.vector(v.cex))^-1), edge.arrow.size=0, layout=coord)
|
||||||
|
#'
|
||||||
|
#'
|
||||||
|
#' #Vizualization with sequence counts
|
||||||
|
#' tax.count = log10(colSums(termes$reads)[match(as.vector(V(graph.tax.termes)$name), termes$motus$scientific_name)])
|
||||||
|
#' tax.count[is.na(tax.count)|tax.count<0] = 0.01
|
||||||
|
#' V(graph.tax.termes)$count = unname(tax.count)
|
||||||
|
#'
|
||||||
|
#' plot(graph.tax.termes, vertex.size=V(graph.tax.termes)$count, vertex.label.cex=2*(as.numeric(as.vector(v.cex))^-1), edge.arrow.size=0, layout=coord)
|
||||||
|
#'
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\link{get.classic.taxonomy}}
|
||||||
|
#' @author Lucie Zinger
|
||||||
|
#' @export
|
||||||
|
|
||||||
|
dbtree = function(x) {
|
||||||
|
|
||||||
|
#dealing with noranks
|
||||||
|
x2 = x
|
||||||
|
for (i in 1:ncol(x2)) {
|
||||||
|
x2[,i] = as.character(x[,i])
|
||||||
|
if(length(which(is.na(x[,i])==T))!=0) {
|
||||||
|
if(i==1) {
|
||||||
|
x2[which(is.na(x[,i])==T),i] = "NR"
|
||||||
|
} else {
|
||||||
|
x2[which(is.na(x[,i])==T),i] = as.character(x2[,i-1])[which(is.na(x2[,i])==T)]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#prepare an edgelist
|
||||||
|
edgelist = list()
|
||||||
|
|
||||||
|
for (i in 1:(ncol(x2)-1)){
|
||||||
|
out = x2[,c(i,i+1)]
|
||||||
|
out2 = out[-which(duplicated(out)==T),]
|
||||||
|
colnames(out2) = c("parent", "kid")
|
||||||
|
edgelist[[i]] = out2[which(out2[,1]!=out2[,2]),]
|
||||||
|
}
|
||||||
|
|
||||||
|
edgelist = do.call("rbind", edgelist)
|
||||||
|
|
||||||
|
|
||||||
|
#construct the graph
|
||||||
|
|
||||||
|
g = igraph::graph.edgelist(as.matrix(edgelist), directed=T)
|
||||||
|
|
||||||
|
#get taxorank for each taxa
|
||||||
|
ranks = do.call("rbind", lapply(1:ncol(x), function(y) {
|
||||||
|
out = cbind(unique(as.character(x[,y])), colnames(x)[y])
|
||||||
|
out
|
||||||
|
}))
|
||||||
|
|
||||||
|
#Assign nodes to taxorank
|
||||||
|
igraph::V(g)$rank = ranks[match(igraph::V(g)$name, ranks[,1]),2]
|
||||||
|
|
||||||
|
return(g)
|
||||||
|
}
|
74
ROBITools/R/taxonomic.resolution.R
Normal file
74
ROBITools/R/taxonomic.resolution.R
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
#' @import ROBITaxonomy
|
||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#' Dataset taxonomic resolution summary.
|
||||||
|
#'
|
||||||
|
#' Summarizes the taxonomic relution of reads and MOTUs over the entire dataset
|
||||||
|
#'
|
||||||
|
#'
|
||||||
|
#' @param x a \code{\link{metabarcoding.data}} object
|
||||||
|
#' @param colranks a string indicating column name where ranks are stored in \code{x}
|
||||||
|
#' @param colscores a string indicating column name where taxonomic identification scores are stored in \code{x}
|
||||||
|
#' @param thresh a threshold for defining at which taxonomic identification scores a sequence can be considered as "not assigned".
|
||||||
|
#' Default is \code{0.7}
|
||||||
|
#'
|
||||||
|
#' @return returns a data.frame and piecharts of the number/proportion of MOTUs/reads assigned to each taxonomic levels
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#'
|
||||||
|
#' data(termes)
|
||||||
|
#' taxo=default.taxonomy()
|
||||||
|
#'
|
||||||
|
#' termes.taxo.table = get.classic.taxonomy(termes, taxo, "taxid")
|
||||||
|
#' attr(termes, "motus") = data.frame(termes$motus, termes.taxo.table)
|
||||||
|
#' attr(termes, "motus")["count"] = colSums(termes$reads)
|
||||||
|
#'
|
||||||
|
#' summary.taxores(termes, "taxonomic_rank_ok","best_identity")
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\linkS4class{taxonomy.obitools}}, and method \code{\link{taxonmicank}}
|
||||||
|
#'
|
||||||
|
#' @author Lucie Zinger
|
||||||
|
#' @keywords taxonomy
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
summary.taxores = function(x,colranks,colscores, thresh=0.7){
|
||||||
|
|
||||||
|
#vector encompassing all ranked possible taxonomic levels
|
||||||
|
taxorank = c("superkingdom", "kingdom", "subkingdom", "superphylum", "phylum", "subphylum", "superclass", "class", "subclass", "infraclass",
|
||||||
|
"superorder", "order", "suborder", "infraorder", "parvorder", "superfamily", "family", "subfamily", "supertribe", "tribe",
|
||||||
|
"subtribe", "supergenus", "genus", "subgenus", "species group", "species subgroup", "superspecies", "species", "subspecies",
|
||||||
|
"varietas", "forma", "no rank", "not assigned")
|
||||||
|
|
||||||
|
#settings if thresh
|
||||||
|
ranks = as.vector(x$motus[,colranks])
|
||||||
|
ranks[x$motus[,colscores]<thresh] = "not assigned"
|
||||||
|
|
||||||
|
#nb of otus
|
||||||
|
tmp = table(ranks)
|
||||||
|
taxores.otu = tmp[match(taxorank, names(tmp))]
|
||||||
|
names(taxores.otu) = taxorank
|
||||||
|
taxores.otu[is.na(taxores.otu)] = 0
|
||||||
|
|
||||||
|
#nb of reads
|
||||||
|
tmp = aggregate(x$motus$count, by=list(ranks), sum)
|
||||||
|
taxores.reads = tmp[match(taxorank,tmp[,1]),2]
|
||||||
|
names(taxores.reads) = taxorank
|
||||||
|
taxores.reads[is.na(taxores.reads)] = 0
|
||||||
|
|
||||||
|
#plot
|
||||||
|
layout(matrix(c(1,2,1,3),2,2),heights=c(0.3,1))
|
||||||
|
col.tmp = c(rainbow(length(taxorank)-2,start=0, end=0.5, alpha=0.6), "lightgrey", "darkgrey")
|
||||||
|
par(mar=c(1,0,0,0), oma=c(0,0,2,0))
|
||||||
|
frame()
|
||||||
|
legend("top", taxorank, ncol=6, cex=0.8, fill=col.tmp)
|
||||||
|
pie(taxores.otu, col=col.tmp, border="lightgrey", labels="", clockwise=T)
|
||||||
|
mtext("OTUs", side=1, cex=1)
|
||||||
|
pie(taxores.reads, col=col.tmp, border="lightgrey", labels="", clockwise=T)
|
||||||
|
mtext("Reads", side=1, cex=1)
|
||||||
|
|
||||||
|
#result
|
||||||
|
out = data.frame(otu=taxores.otu, reads=taxores.reads)
|
||||||
|
out
|
||||||
|
}
|
53
ROBITools/R/taxonomy_classic_table.R
Normal file
53
ROBITools/R/taxonomy_classic_table.R
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
#' @import ROBITaxonomy
|
||||||
|
#' @include 02_class_metabarcoding.data.R
|
||||||
|
NULL
|
||||||
|
|
||||||
|
#' Get classical taxonomy format
|
||||||
|
#'
|
||||||
|
#' Creates a table with the classical taxonomic description (from phylum to species)
|
||||||
|
#'
|
||||||
|
#' @param x a \code{\link{metabarcoding.data}} object
|
||||||
|
#' @param taxonomy a instance of \code{\linkS4class{taxonomy.obitools}}
|
||||||
|
#' @param coltaxid a the name of the column containing taxids to be used for creating classical taxonomic description
|
||||||
|
#'
|
||||||
|
#' @return returns a data.frame with the classical taxonomic description ("kingdom", "phylum", "class", "order", "family", "genus", "species"), as well as
|
||||||
|
#' sequence taxonomic assignment rank and scientific name for each sequences stored in the \code{\link{metabarcoding.data}} object
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#'
|
||||||
|
#' data(termes)
|
||||||
|
#'
|
||||||
|
#' taxo=default.taxonomy()
|
||||||
|
#'
|
||||||
|
#' termes.taxo.table = get.classic.taxonomy(termes, taxo, "taxid")
|
||||||
|
#' head(termes.taxo.table)
|
||||||
|
#'
|
||||||
|
#' attr(termes, "motus") = data.frame(termes$motus, termes.taxo.table)
|
||||||
|
#'
|
||||||
|
#'
|
||||||
|
#' @seealso \code{\linkS4class{taxonomy.obitools}}, and methods \code{\link{species}},\code{\link{genus}}, \code{\link{family}},\code{\link{kingdom}},
|
||||||
|
#' \code{\link{superkingdom}},\code{\link{taxonatrank}}, \code{\link{taxonmicank}}
|
||||||
|
#'
|
||||||
|
#' @author Lucie Zinger
|
||||||
|
#' @keywords taxonomy
|
||||||
|
#' @export
|
||||||
|
#'
|
||||||
|
|
||||||
|
get.classic.taxonomy = function(x, taxonomy, coltaxid) {
|
||||||
|
|
||||||
|
classic.taxo = c("kingdom", "phylum", "class", "order", "family", "genus", "species")
|
||||||
|
|
||||||
|
taxids = x$motus[,coltaxid]
|
||||||
|
|
||||||
|
out = as.data.frame(do.call("cbind", lapply(classic.taxo, function(y) {
|
||||||
|
scientificname(taxonomy, taxonatrank(taxonomy,taxids,y))
|
||||||
|
})))
|
||||||
|
|
||||||
|
colnames(out) = paste(classic.taxo, "_name_ok", sep="")
|
||||||
|
rownames(out) = colnames(x)
|
||||||
|
|
||||||
|
out$scientific_name_ok = scientificname(taxonomy, taxids)
|
||||||
|
out$taxonomic_rank_ok = taxonomicrank(taxonomy, taxids)
|
||||||
|
|
||||||
|
return(out)
|
||||||
|
}
|
128
ROBITools/README-SLRE.md
Executable file
128
ROBITools/README-SLRE.md
Executable file
@@ -0,0 +1,128 @@
|
|||||||
|
SLRE: Super Light Regular Expression library
|
||||||
|
============================================
|
||||||
|
|
||||||
|
SLRE is an ISO C library that implements a subset of Perl regular
|
||||||
|
expression syntax. Main features of SLRE are:
|
||||||
|
|
||||||
|
* Written in strict ANSI C'89
|
||||||
|
* Small size (compiled x86 code is about 5kB)
|
||||||
|
* Uses little stack and does no dynamic memory allocation
|
||||||
|
* Provides simple intuitive API
|
||||||
|
* Implements most useful subset of Perl regex syntax (see below)
|
||||||
|
* Easily extensible. E.g. if one wants to introduce a new
|
||||||
|
metacharacter `\i`, meaning "IPv4 address", it is easy to do so with SLRE.
|
||||||
|
|
||||||
|
SLRE is perfect for tasks like parsing network requests, configuration
|
||||||
|
files, user input, etc, when libraries like [PCRE](http://pcre.org) are too
|
||||||
|
heavyweight for the given task. Developers of embedded systems would benefit
|
||||||
|
most.
|
||||||
|
|
||||||
|
## Supported Syntax
|
||||||
|
|
||||||
|
(?i) Must be at the beginning of the regex. Makes match case-insensitive
|
||||||
|
^ Match beginning of a buffer
|
||||||
|
$ Match end of a buffer
|
||||||
|
() Grouping and substring capturing
|
||||||
|
\s Match whitespace
|
||||||
|
\S Match non-whitespace
|
||||||
|
\d Match decimal digit
|
||||||
|
\n Match new line character
|
||||||
|
\r Match line feed character
|
||||||
|
\f Match form feed character
|
||||||
|
\v Match vertical tab character
|
||||||
|
\t Match horizontal tab character
|
||||||
|
\b Match backspace character
|
||||||
|
+ Match one or more times (greedy)
|
||||||
|
+? Match one or more times (non-greedy)
|
||||||
|
* Match zero or more times (greedy)
|
||||||
|
*? Match zero or more times (non-greedy)
|
||||||
|
? Match zero or once (non-greedy)
|
||||||
|
x|y Match x or y (alternation operator)
|
||||||
|
\meta Match one of the meta character: ^$().[]*+?|\
|
||||||
|
\xHH Match byte with hex value 0xHH, e.g. \x4a
|
||||||
|
[...] Match any character from set. Ranges like [a-z] are supported
|
||||||
|
[^...] Match any character but ones from set
|
||||||
|
|
||||||
|
Under development: Unicode support.
|
||||||
|
|
||||||
|
## API
|
||||||
|
|
||||||
|
int slre_match(const char *regexp, const char *buf, int buf_len,
|
||||||
|
struct slre_cap *caps, int num_caps, int flags);
|
||||||
|
|
||||||
|
`slre_match()` matches string buffer `buf` of length `buf_len` against
|
||||||
|
regular expression `regexp`, which should conform the syntax outlined
|
||||||
|
above. If regular expression `regexp` contains brackets, `slre_match()`
|
||||||
|
can capture the respective substrings into the array of `struct slre_cap`
|
||||||
|
structures:
|
||||||
|
|
||||||
|
/* Stores matched fragment for the expression inside brackets */
|
||||||
|
struct slre_cap {
|
||||||
|
const char *ptr; /* Points to the matched fragment */
|
||||||
|
int len; /* Length of the matched fragment */
|
||||||
|
};
|
||||||
|
|
||||||
|
N-th member of the `caps` array will contain fragment that corresponds to the
|
||||||
|
N-th opening bracket in the `regex`, N is zero-based. `slre_match()` returns
|
||||||
|
number of bytes scanned from the beginning of the string. If return value is
|
||||||
|
greater or equal to 0, there is a match. If return value is less then 0, there
|
||||||
|
is no match. Negative return codes are as follows:
|
||||||
|
|
||||||
|
#define SLRE_NO_MATCH -1
|
||||||
|
#define SLRE_UNEXPECTED_QUANTIFIER -2
|
||||||
|
#define SLRE_UNBALANCED_BRACKETS -3
|
||||||
|
#define SLRE_INTERNAL_ERROR -4
|
||||||
|
#define SLRE_INVALID_CHARACTER_SET -5
|
||||||
|
#define SLRE_INVALID_METACHARACTER -6
|
||||||
|
#define SLRE_CAPS_ARRAY_TOO_SMALL -7
|
||||||
|
#define SLRE_TOO_MANY_BRANCHES -8
|
||||||
|
#define SLRE_TOO_MANY_BRACKETS -9
|
||||||
|
|
||||||
|
|
||||||
|
## Example: parsing HTTP request line
|
||||||
|
|
||||||
|
const char *request = " GET /index.html HTTP/1.0\r\n\r\n";
|
||||||
|
struct slre_cap caps[4];
|
||||||
|
|
||||||
|
if (slre_match("^\\s*(\\S+)\\s+(\\S+)\\s+HTTP/(\\d)\\.(\\d)",
|
||||||
|
request, strlen(request), caps, 4, 0) > 0) {
|
||||||
|
printf("Method: [%.*s], URI: [%.*s]\n",
|
||||||
|
caps[0].len, caps[0].ptr,
|
||||||
|
caps[1].len, caps[1].ptr);
|
||||||
|
} else {
|
||||||
|
printf("Error parsing [%s]\n", request);
|
||||||
|
}
|
||||||
|
|
||||||
|
## Example: find all URLs in a string
|
||||||
|
|
||||||
|
static const char *str =
|
||||||
|
"<img src=\"HTTPS://FOO.COM/x?b#c=tab1\"/> "
|
||||||
|
" <a href=\"http://cesanta.com\">some link</a>";
|
||||||
|
|
||||||
|
static const char *regex = "(?i)((https?://)[^\\s/'\"<>]+/?[^\\s'\"<>]*)";
|
||||||
|
struct slre_cap caps[2];
|
||||||
|
int i, j = 0, str_len = strlen(str);
|
||||||
|
|
||||||
|
while (j < str_len &&
|
||||||
|
(i = slre_match(regex, str + j, str_len - j, caps, 2, 0)) > 0) {
|
||||||
|
printf("Found URL: [%.*s]\n", caps[0].len, caps[0].ptr);
|
||||||
|
j += i;
|
||||||
|
}
|
||||||
|
|
||||||
|
Output:
|
||||||
|
|
||||||
|
Found URL: [HTTPS://FOO.COM/x?b#c=tab1]
|
||||||
|
Found URL: [http://cesanta.com]
|
||||||
|
|
||||||
|
# License
|
||||||
|
|
||||||
|
SLRE is released under
|
||||||
|
[GNU GPL v.2](http://www.gnu.org/licenses/old-licenses/gpl-2.0.html).
|
||||||
|
Businesses have an option to get non-restrictive, royalty-free commercial
|
||||||
|
license and professional support from
|
||||||
|
[Cesanta Software](http://cesanta.com).
|
||||||
|
|
||||||
|
[Super Light DNS Resolver](https://github.com/cesanta/sldr),
|
||||||
|
[Mongoose web server](https://github.com/cesanta/mongoose)
|
||||||
|
are other projects by Cesanta Software, developed with the same philosophy
|
||||||
|
of functionality and simplicity.
|
BIN
ROBITools/data/termes.rda
Normal file
BIN
ROBITools/data/termes.rda
Normal file
Binary file not shown.
47210
ROBITools/inst/extdata/termes.fasta
vendored
Normal file
47210
ROBITools/inst/extdata/termes.fasta
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1
ROBITools/inst/extdata/termes.tab
vendored
Normal file
1
ROBITools/inst/extdata/termes.tab
vendored
Normal file
File diff suppressed because one or more lines are too long
21
ROBITools/inst/extdata/termes_ngsfilt.txt
vendored
Normal file
21
ROBITools/inst/extdata/termes_ngsfilt.txt
vendored
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
termes_data A01 acacacac:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01A; serie=01; coordX=5; coordY=5;
|
||||||
|
termes_data A02 acagcaca:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01B; serie=01; coordX=10; coordY=5;
|
||||||
|
termes_data A03 gtgtacat:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01C; serie=01; coordX=15; coordY=5;
|
||||||
|
termes_data A04 tatgtcag:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01D; serie=01; coordX=20; coordY=5;
|
||||||
|
termes_data A05 tagtcgca:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01E; serie=01; coordX=25; coordY=5;
|
||||||
|
termes_data A06 tactatac:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01F; serie=01; coordX=30; coordY=5;
|
||||||
|
termes_data A07 actagatc:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01G; serie=01; coordX=35; coordY=5;
|
||||||
|
termes_data A08 gatcgcga:acacacac ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_01H; serie=01; coordX=40; coordY=5;
|
||||||
|
termes_data A09 acacacac:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02A; serie=01; coordX=45; coordY=5;
|
||||||
|
termes_data A10 acagcaca:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02B; serie=01; coordX=50; coordY=5;
|
||||||
|
termes_data A11 gtgtacat:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02C; serie=01; coordX=55; coordY=5;
|
||||||
|
termes_data A12 tatgtcag:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02D; serie=01; coordX=60; coordY=5;
|
||||||
|
termes_data A13 tagtcgca:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02E; serie=02; coordX=65; coordY=5;
|
||||||
|
termes_data A14 tactatac:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02F; serie=02; coordX=70; coordY=5;
|
||||||
|
termes_data A15 actagatc:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02G; serie=02; coordX=75; coordY=5;
|
||||||
|
termes_data A16 gatcgcga:acagcaca ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_02H; serie=02; coordX=80; coordY=5;
|
||||||
|
termes_data A17 acacacac:gtgtacat ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_03A; serie=02; coordX=85; coordY=5;
|
||||||
|
termes_data A18 acagcaca:gtgtacat ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_03B; serie=02; coordX=90; coordY=5;
|
||||||
|
termes_data A19 gtgtacat:gtgtacat ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=01_03C; serie=02; coordX=95; coordY=5;
|
||||||
|
termes_data A11r tcagtgtc:gtcgtaga ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=04_10B; serie=01; coordX=55; coordY=5;
|
||||||
|
termes_data A16r actctgct:gtcgtaga ATTTCAGGTCAAGGTGCAGC TACAACCAAATCCAATTTCA F @ position=04_10C; serie=02; coordX=80; coordY=5;
|
BIN
ROBITools/src/ROBITools.so
Executable file
BIN
ROBITools/src/ROBITools.so
Executable file
Binary file not shown.
26
ROBITools/src/ecoError.c
Normal file
26
ROBITools/src/ecoError.c
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
#include "ecoPCR.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* print the message given as argument and exit the program
|
||||||
|
* @param error error number
|
||||||
|
* @param message the text explaining what's going on
|
||||||
|
* @param filename the file source where the program failed
|
||||||
|
* @param linenumber the line where it has failed
|
||||||
|
* filename and linenumber are written at pre-processing
|
||||||
|
* time by a macro
|
||||||
|
*/
|
||||||
|
void ecoError(int32_t error,
|
||||||
|
const char* message,
|
||||||
|
const char * filename,
|
||||||
|
int linenumber)
|
||||||
|
{
|
||||||
|
fprintf(stderr,"Error %d in file %s line %d : %s\n",
|
||||||
|
error,
|
||||||
|
filename,
|
||||||
|
linenumber,
|
||||||
|
message);
|
||||||
|
|
||||||
|
abort();
|
||||||
|
}
|
BIN
ROBITools/src/ecoError.o
Normal file
BIN
ROBITools/src/ecoError.o
Normal file
Binary file not shown.
122
ROBITools/src/ecoIOUtils.c
Normal file
122
ROBITools/src/ecoIOUtils.c
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
#include "ecoPCR.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#define SWAPINT32(x) ((((x) << 24) & 0xFF000000) | (((x) << 8) & 0xFF0000) | \
|
||||||
|
(((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF))
|
||||||
|
|
||||||
|
|
||||||
|
int32_t is_big_endian()
|
||||||
|
{
|
||||||
|
int32_t i=1;
|
||||||
|
|
||||||
|
return (int32_t)((char*)&i)[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int32_t swap_int32_t(int32_t i)
|
||||||
|
{
|
||||||
|
return SWAPINT32(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read part of the file
|
||||||
|
* @param *f the database
|
||||||
|
* @param recordSize the size to be read
|
||||||
|
*
|
||||||
|
* @return buffer
|
||||||
|
*/
|
||||||
|
void *read_ecorecord(FILE *f,int32_t *recordSize)
|
||||||
|
{
|
||||||
|
static void *buffer =NULL;
|
||||||
|
int32_t buffersize=0;
|
||||||
|
int32_t read;
|
||||||
|
|
||||||
|
if (!recordSize)
|
||||||
|
ECOERROR(ECO_ASSERT_ERROR,
|
||||||
|
"recordSize cannot be NULL");
|
||||||
|
|
||||||
|
read = fread(recordSize,
|
||||||
|
1,
|
||||||
|
sizeof(int32_t),
|
||||||
|
f);
|
||||||
|
|
||||||
|
if (feof(f))
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
if (read != sizeof(int32_t))
|
||||||
|
ECOERROR(ECO_IO_ERROR,"Reading record size error");
|
||||||
|
|
||||||
|
if (is_big_endian())
|
||||||
|
*recordSize=swap_int32_t(*recordSize);
|
||||||
|
|
||||||
|
if (buffersize < *recordSize)
|
||||||
|
{
|
||||||
|
if (buffer)
|
||||||
|
buffer = ECOREALLOC(buffer,*recordSize,
|
||||||
|
"Increase size of record buffer");
|
||||||
|
else
|
||||||
|
buffer = ECOMALLOC(*recordSize,
|
||||||
|
"Allocate record buffer");
|
||||||
|
}
|
||||||
|
|
||||||
|
read = fread(buffer,
|
||||||
|
1,
|
||||||
|
*recordSize,
|
||||||
|
f);
|
||||||
|
|
||||||
|
if (read != *recordSize)
|
||||||
|
ECOERROR(ECO_IO_ERROR,"Reading record data error");
|
||||||
|
|
||||||
|
return buffer;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Open the database and check it's readable
|
||||||
|
* @param filename name of the database (.sdx, .rdx, .tbx)
|
||||||
|
* @param sequencecount buffer - pointer to variable storing the number of occurence
|
||||||
|
* @param abort_on_open_error boolean to define the behaviour in case of error
|
||||||
|
* while opening the database
|
||||||
|
* @return FILE type
|
||||||
|
**/
|
||||||
|
FILE *open_ecorecorddb(const char *filename,
|
||||||
|
int32_t *sequencecount,
|
||||||
|
int32_t abort_on_open_error)
|
||||||
|
{
|
||||||
|
FILE *f;
|
||||||
|
int32_t read;
|
||||||
|
|
||||||
|
f = fopen(filename,"rb");
|
||||||
|
|
||||||
|
if (!f)
|
||||||
|
{
|
||||||
|
if (abort_on_open_error)
|
||||||
|
ECOERROR(ECO_IO_ERROR,"Cannot open file");
|
||||||
|
else
|
||||||
|
{
|
||||||
|
*sequencecount=0;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
read = fread(sequencecount,
|
||||||
|
1,
|
||||||
|
sizeof(int32_t),
|
||||||
|
f);
|
||||||
|
|
||||||
|
if (read != sizeof(int32_t))
|
||||||
|
ECOERROR(ECO_IO_ERROR,"Reading record size error");
|
||||||
|
|
||||||
|
if (is_big_endian())
|
||||||
|
*sequencecount=swap_int32_t(*sequencecount);
|
||||||
|
|
||||||
|
return f;
|
||||||
|
}
|
||||||
|
|
BIN
ROBITools/src/ecoIOUtils.o
Normal file
BIN
ROBITools/src/ecoIOUtils.o
Normal file
Binary file not shown.
79
ROBITools/src/ecoMalloc.c
Normal file
79
ROBITools/src/ecoMalloc.c
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
#include "ecoPCR.h"
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
static int eco_log_malloc = 0;
|
||||||
|
|
||||||
|
void eco_trace_memory_allocation()
|
||||||
|
{
|
||||||
|
eco_log_malloc=1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void eco_untrace_memory_allocation()
|
||||||
|
{
|
||||||
|
eco_log_malloc=0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void *eco_malloc(int32_t chunksize,
|
||||||
|
const char *error_message,
|
||||||
|
const char *filename,
|
||||||
|
int32_t line)
|
||||||
|
{
|
||||||
|
void * chunk;
|
||||||
|
|
||||||
|
chunk = calloc(1,chunksize);
|
||||||
|
|
||||||
|
if (!chunk)
|
||||||
|
ecoError(ECO_MEM_ERROR,error_message,filename,line);
|
||||||
|
|
||||||
|
if (eco_log_malloc)
|
||||||
|
fprintf(stderr,
|
||||||
|
"Memory segment located at %p of size %d is allocated (file : %s [%d])",
|
||||||
|
chunk,
|
||||||
|
chunksize,
|
||||||
|
filename,
|
||||||
|
line);
|
||||||
|
|
||||||
|
return chunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *eco_realloc(void *chunk,
|
||||||
|
int32_t newsize,
|
||||||
|
const char *error_message,
|
||||||
|
const char *filename,
|
||||||
|
int32_t line)
|
||||||
|
{
|
||||||
|
void *newchunk;
|
||||||
|
|
||||||
|
newchunk = realloc(chunk,newsize);
|
||||||
|
|
||||||
|
if (!newchunk)
|
||||||
|
ecoError(ECO_MEM_ERROR,error_message,filename,line);
|
||||||
|
|
||||||
|
if (eco_log_malloc)
|
||||||
|
fprintf(stderr,
|
||||||
|
"Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
|
||||||
|
chunk,
|
||||||
|
newchunk,
|
||||||
|
newsize,
|
||||||
|
filename,
|
||||||
|
line);
|
||||||
|
|
||||||
|
return newchunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
void eco_free(void *chunk,
|
||||||
|
const char *error_message,
|
||||||
|
const char *filename,
|
||||||
|
int32_t line)
|
||||||
|
{
|
||||||
|
free(chunk);
|
||||||
|
|
||||||
|
if (eco_log_malloc)
|
||||||
|
fprintf(stderr,
|
||||||
|
"Memory segment %p is released => %s (file : %s [%d])",
|
||||||
|
chunk,
|
||||||
|
error_message,
|
||||||
|
filename,
|
||||||
|
line);
|
||||||
|
}
|
BIN
ROBITools/src/ecoMalloc.o
Normal file
BIN
ROBITools/src/ecoMalloc.o
Normal file
Binary file not shown.
283
ROBITools/src/ecoPCR.h
Normal file
283
ROBITools/src/ecoPCR.h
Normal file
@@ -0,0 +1,283 @@
|
|||||||
|
#ifndef ECOPCR_H_
|
||||||
|
#define ECOPCR_H_
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <inttypes.h>
|
||||||
|
|
||||||
|
#include <R.h>
|
||||||
|
#include <Rinternals.h>
|
||||||
|
#include <Rdefines.h>
|
||||||
|
|
||||||
|
|
||||||
|
//#ifndef H_apat
|
||||||
|
//#include "../libapat/apat.h"
|
||||||
|
//#endif
|
||||||
|
|
||||||
|
/*****************************************************
|
||||||
|
*
|
||||||
|
* Data type declarations
|
||||||
|
*
|
||||||
|
*****************************************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* Sequence types
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
|
||||||
|
int32_t taxid;
|
||||||
|
char AC[20];
|
||||||
|
int32_t DE_length;
|
||||||
|
int32_t SQ_length;
|
||||||
|
int32_t CSQ_length;
|
||||||
|
|
||||||
|
char data[1];
|
||||||
|
|
||||||
|
} ecoseqformat_t;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int32_t taxid;
|
||||||
|
int32_t SQ_length;
|
||||||
|
char *AC;
|
||||||
|
char *DE;
|
||||||
|
char *SQ;
|
||||||
|
} ecoseq_t;
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* Taxonomy taxon types
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int32_t taxid;
|
||||||
|
int32_t rank;
|
||||||
|
int32_t parent;
|
||||||
|
int32_t namelength;
|
||||||
|
char name[1];
|
||||||
|
|
||||||
|
} ecotxformat_t;
|
||||||
|
|
||||||
|
typedef struct ecotxnode {
|
||||||
|
int32_t taxid;
|
||||||
|
int32_t rank;
|
||||||
|
int32_t farest;
|
||||||
|
struct ecotxnode *parent;
|
||||||
|
char *name;
|
||||||
|
} ecotx_t;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int32_t count;
|
||||||
|
int32_t maxtaxid;
|
||||||
|
int32_t buffersize;
|
||||||
|
ecotx_t taxon[1];
|
||||||
|
} ecotxidx_t;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* Taxonomy rank types
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int32_t count;
|
||||||
|
char* label[1];
|
||||||
|
} ecorankidx_t;
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* Taxonomy name types
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int32_t is_scientificname;
|
||||||
|
int32_t namelength;
|
||||||
|
int32_t classlength;
|
||||||
|
int32_t taxid;
|
||||||
|
char names[1];
|
||||||
|
} econameformat_t;
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
char *name;
|
||||||
|
char *classname;
|
||||||
|
int32_t is_scientificname;
|
||||||
|
struct ecotxnode *taxon;
|
||||||
|
} econame_t;
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int32_t count;
|
||||||
|
econame_t names[1];
|
||||||
|
} econameidx_t;
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
ecorankidx_t *ranks;
|
||||||
|
econameidx_t *names;
|
||||||
|
ecotxidx_t *taxons;
|
||||||
|
} ecotaxonomy_t;
|
||||||
|
|
||||||
|
|
||||||
|
/*****************************************************
|
||||||
|
*
|
||||||
|
* Function declarations
|
||||||
|
*
|
||||||
|
*****************************************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* Low level system functions
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
int32_t is_big_endian();
|
||||||
|
int32_t swap_int32_t(int32_t);
|
||||||
|
|
||||||
|
void *eco_malloc(int32_t chunksize,
|
||||||
|
const char *error_message,
|
||||||
|
const char *filename,
|
||||||
|
int32_t line);
|
||||||
|
|
||||||
|
|
||||||
|
void *eco_realloc(void *chunk,
|
||||||
|
int32_t chunksize,
|
||||||
|
const char *error_message,
|
||||||
|
const char *filename,
|
||||||
|
int32_t line);
|
||||||
|
|
||||||
|
void eco_free(void *chunk,
|
||||||
|
const char *error_message,
|
||||||
|
const char *filename,
|
||||||
|
int32_t line);
|
||||||
|
|
||||||
|
void eco_trace_memory_allocation();
|
||||||
|
void eco_untrace_memory_allocation();
|
||||||
|
|
||||||
|
#define ECOMALLOC(size,error_message) \
|
||||||
|
eco_malloc((size),(error_message),__FILE__,__LINE__)
|
||||||
|
|
||||||
|
#define ECOREALLOC(chunk,size,error_message) \
|
||||||
|
eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__)
|
||||||
|
|
||||||
|
#define ECOFREE(chunk,error_message) \
|
||||||
|
eco_free((chunk),(error_message),__FILE__,__LINE__)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* Error managment
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
void ecoError(int32_t,const char*,const char *,int);
|
||||||
|
|
||||||
|
#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__)
|
||||||
|
|
||||||
|
#define ECO_IO_ERROR (1)
|
||||||
|
#define ECO_MEM_ERROR (2)
|
||||||
|
#define ECO_ASSERT_ERROR (3)
|
||||||
|
#define ECO_NOTFOUND_ERROR (4)
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* Low level Disk access functions
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
FILE *open_ecorecorddb(const char *filename,
|
||||||
|
int32_t *sequencecount,
|
||||||
|
int32_t abort_on_open_error);
|
||||||
|
|
||||||
|
void *read_ecorecord(FILE *,int32_t *recordSize);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Read function in internal binary format
|
||||||
|
*/
|
||||||
|
|
||||||
|
FILE *open_ecoseqdb(const char *filename,
|
||||||
|
int32_t *sequencecount);
|
||||||
|
|
||||||
|
ecoseq_t *readnext_ecoseq(FILE *);
|
||||||
|
|
||||||
|
ecorankidx_t *read_rankidx(const char *filename);
|
||||||
|
|
||||||
|
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read taxonomy data as formated by the ecoPCRFormat.py script.
|
||||||
|
*
|
||||||
|
* This function is normaly uses internaly by the read_taxonomy
|
||||||
|
* function and should not be called directly.
|
||||||
|
*
|
||||||
|
* @arg filename path to the *.tdx file of the reformated db
|
||||||
|
*
|
||||||
|
* @return pointer to a taxonomy index structure
|
||||||
|
*/
|
||||||
|
|
||||||
|
ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2);
|
||||||
|
|
||||||
|
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName);
|
||||||
|
|
||||||
|
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, int32_t rankidx);
|
||||||
|
|
||||||
|
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid);
|
||||||
|
|
||||||
|
int eco_isundertaxon(ecotx_t *taxon, int other_taxid);
|
||||||
|
|
||||||
|
ecoseq_t *ecoseq_iterator(const char *prefix);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
ecoseq_t *new_ecoseq();
|
||||||
|
int32_t delete_ecoseq(ecoseq_t *);
|
||||||
|
ecoseq_t *new_ecoseq_with_data( char *AC,
|
||||||
|
char *DE,
|
||||||
|
char *SQ,
|
||||||
|
int32_t taxid
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
int32_t delete_taxon(ecotx_t *taxon);
|
||||||
|
int32_t delete_taxonomy(ecotxidx_t *index);
|
||||||
|
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy);
|
||||||
|
|
||||||
|
|
||||||
|
int32_t rank_index(const char* label,ecorankidx_t* ranks);
|
||||||
|
|
||||||
|
//int32_t delete_apatseq(SeqPtr pseq);
|
||||||
|
//PatternPtr buildPattern(const char *pat, int32_t error_max);
|
||||||
|
//PatternPtr complementPattern(PatternPtr pat);
|
||||||
|
//
|
||||||
|
//SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular);
|
||||||
|
|
||||||
|
//char *ecoComplementPattern(char *nucAcSeq);
|
||||||
|
//char *ecoComplementSequence(char *nucAcSeq);
|
||||||
|
//char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end);
|
||||||
|
|
||||||
|
ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||||
|
ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||||
|
ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||||
|
ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||||
|
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||||
|
|
||||||
|
//int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid);
|
||||||
|
//int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid);
|
||||||
|
|
||||||
|
|
||||||
|
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
|
||||||
|
|
||||||
|
#endif /*ECOPCR_H_*/
|
156
ROBITools/src/ecodna.c
Normal file
156
ROBITools/src/ecodna.c
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
#include <string.h>
|
||||||
|
#include "ecoPCR.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* @doc: DNA alphabet (IUPAC)
|
||||||
|
*/
|
||||||
|
#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* @doc: complementary DNA alphabet (IUPAC)
|
||||||
|
*/
|
||||||
|
#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!]["
|
||||||
|
|
||||||
|
|
||||||
|
static char sNuc[] = LX_BIO_DNA_ALPHA;
|
||||||
|
static char sAnuc[] = LX_BIO_CDNA_ALPHA;
|
||||||
|
|
||||||
|
static char LXBioBaseComplement(char nucAc);
|
||||||
|
static char *LXBioSeqComplement(char *nucAcSeq);
|
||||||
|
static char *reverseSequence(char *str,char isPattern);
|
||||||
|
|
||||||
|
|
||||||
|
/* ---------------------------- */
|
||||||
|
|
||||||
|
char LXBioBaseComplement(char nucAc)
|
||||||
|
{
|
||||||
|
char *c;
|
||||||
|
|
||||||
|
if ((c = strchr(sNuc, nucAc)))
|
||||||
|
return sAnuc[(c - sNuc)];
|
||||||
|
else
|
||||||
|
return nucAc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------------------------- */
|
||||||
|
|
||||||
|
char *LXBioSeqComplement(char *nucAcSeq)
|
||||||
|
{
|
||||||
|
char *s;
|
||||||
|
|
||||||
|
for (s = nucAcSeq ; *s ; s++)
|
||||||
|
*s = LXBioBaseComplement(*s);
|
||||||
|
|
||||||
|
return nucAcSeq;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
char *reverseSequence(char *str,char isPattern)
|
||||||
|
{
|
||||||
|
char *sb, *se, c;
|
||||||
|
|
||||||
|
if (! str)
|
||||||
|
return str;
|
||||||
|
|
||||||
|
sb = str;
|
||||||
|
se = str + strlen(str) - 1;
|
||||||
|
|
||||||
|
while(sb <= se) {
|
||||||
|
c = *sb;
|
||||||
|
*sb++ = *se;
|
||||||
|
*se-- = c;
|
||||||
|
}
|
||||||
|
|
||||||
|
sb = str;
|
||||||
|
se = str + strlen(str) - 1;
|
||||||
|
|
||||||
|
if (isPattern)
|
||||||
|
for (;sb < se; sb++)
|
||||||
|
{
|
||||||
|
if (*sb=='#')
|
||||||
|
{
|
||||||
|
if (((se - sb) > 2) && (*(sb+2)=='!'))
|
||||||
|
{
|
||||||
|
*sb='!';
|
||||||
|
sb+=2;
|
||||||
|
*sb='#';
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
*sb=*(sb+1);
|
||||||
|
sb++;
|
||||||
|
*sb='#';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (*sb=='!')
|
||||||
|
{
|
||||||
|
*sb=*(sb-1);
|
||||||
|
*(sb-1)='!';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *ecoComplementPattern(char *nucAcSeq)
|
||||||
|
{
|
||||||
|
return reverseSequence(LXBioSeqComplement(nucAcSeq),1);
|
||||||
|
}
|
||||||
|
|
||||||
|
char *ecoComplementSequence(char *nucAcSeq)
|
||||||
|
{
|
||||||
|
return reverseSequence(LXBioSeqComplement(nucAcSeq),0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end)
|
||||||
|
/*
|
||||||
|
extract subsequence from nucAcSeq [begin,end[
|
||||||
|
*/
|
||||||
|
{
|
||||||
|
static char *buffer = NULL;
|
||||||
|
static int32_t buffSize= 0;
|
||||||
|
int32_t length;
|
||||||
|
|
||||||
|
if (begin < end)
|
||||||
|
{
|
||||||
|
length = end - begin;
|
||||||
|
|
||||||
|
if (length >= buffSize)
|
||||||
|
{
|
||||||
|
buffSize = length+1;
|
||||||
|
if (buffer)
|
||||||
|
buffer=ECOREALLOC(buffer,buffSize,
|
||||||
|
"Error in reallocating sub sequence buffer");
|
||||||
|
else
|
||||||
|
buffer=ECOMALLOC(buffSize,
|
||||||
|
"Error in allocating sub sequence buffer");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
strncpy(buffer,nucAcSeq + begin,length);
|
||||||
|
buffer[length]=0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
length = end + strlen(nucAcSeq) - begin;
|
||||||
|
|
||||||
|
if (length >= buffSize)
|
||||||
|
{
|
||||||
|
buffSize = length+1;
|
||||||
|
if (buffer)
|
||||||
|
buffer=ECOREALLOC(buffer,buffSize,
|
||||||
|
"Error in reallocating sub sequence buffer");
|
||||||
|
else
|
||||||
|
buffer=ECOMALLOC(buffSize,
|
||||||
|
"Error in allocating sub sequence buffer");
|
||||||
|
|
||||||
|
}
|
||||||
|
strncpy(buffer,nucAcSeq+begin,length - end);
|
||||||
|
strncpy(buffer+(length-end),nucAcSeq ,end);
|
||||||
|
buffer[length]=0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
BIN
ROBITools/src/ecodna.o
Normal file
BIN
ROBITools/src/ecodna.o
Normal file
Binary file not shown.
20
ROBITools/src/ecofilter.c
Normal file
20
ROBITools/src/ecofilter.c
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
#include "ecoPCR.h"
|
||||||
|
|
||||||
|
int eco_is_taxid_included( ecotaxonomy_t *taxonomy,
|
||||||
|
int32_t *restricted_taxid,
|
||||||
|
int32_t tab_len,
|
||||||
|
int32_t taxid)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
ecotx_t *taxon;
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(taxonomy, taxid);
|
||||||
|
|
||||||
|
if (taxon)
|
||||||
|
for (i=0; i < tab_len; i++)
|
||||||
|
if ( (taxon->taxid == restricted_taxid[i]) ||
|
||||||
|
(eco_isundertaxon(taxon, restricted_taxid[i])) )
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
BIN
ROBITools/src/ecofilter.o
Normal file
BIN
ROBITools/src/ecofilter.o
Normal file
Binary file not shown.
64
ROBITools/src/econame.c
Normal file
64
ROBITools/src/econame.c
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
#include "ecoPCR.h"
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy);
|
||||||
|
|
||||||
|
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy)
|
||||||
|
{
|
||||||
|
|
||||||
|
int32_t count;
|
||||||
|
FILE *f;
|
||||||
|
econameidx_t *indexname;
|
||||||
|
int32_t i;
|
||||||
|
|
||||||
|
f = open_ecorecorddb(filename,&count,0);
|
||||||
|
|
||||||
|
if (f==NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names");
|
||||||
|
|
||||||
|
indexname->count=count;
|
||||||
|
|
||||||
|
for (i=0; i < count; i++){
|
||||||
|
readnext_econame(f,(indexname->names)+i,taxonomy);
|
||||||
|
}
|
||||||
|
|
||||||
|
return indexname;
|
||||||
|
}
|
||||||
|
|
||||||
|
econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy)
|
||||||
|
{
|
||||||
|
|
||||||
|
econameformat_t *raw;
|
||||||
|
int32_t rs;
|
||||||
|
|
||||||
|
raw = read_ecorecord(f,&rs);
|
||||||
|
|
||||||
|
if (!raw)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
if (is_big_endian())
|
||||||
|
{
|
||||||
|
raw->is_scientificname = swap_int32_t(raw->is_scientificname);
|
||||||
|
raw->namelength = swap_int32_t(raw->namelength);
|
||||||
|
raw->classlength = swap_int32_t(raw->classlength);
|
||||||
|
raw->taxid = swap_int32_t(raw->taxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
name->is_scientificname=raw->is_scientificname;
|
||||||
|
|
||||||
|
name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name");
|
||||||
|
strncpy(name->name,raw->names,raw->namelength);
|
||||||
|
name->name[raw->namelength]=0;
|
||||||
|
|
||||||
|
name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname");
|
||||||
|
strncpy(name->classname,(raw->names+raw->namelength),raw->classlength);
|
||||||
|
name->classname[raw->classlength]=0;
|
||||||
|
|
||||||
|
name->taxon = taxonomy->taxons->taxon + raw->taxid;
|
||||||
|
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
BIN
ROBITools/src/econame.o
Normal file
BIN
ROBITools/src/econame.o
Normal file
Binary file not shown.
55
ROBITools/src/ecorank.c
Normal file
55
ROBITools/src/ecorank.c
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
#include "ecoPCR.h"
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
static int compareRankLabel(const void *label1, const void *label2);
|
||||||
|
|
||||||
|
ecorankidx_t *read_rankidx(const char *filename)
|
||||||
|
{
|
||||||
|
int32_t count;
|
||||||
|
FILE *f;
|
||||||
|
ecorankidx_t *index;
|
||||||
|
int32_t i;
|
||||||
|
int32_t rs;
|
||||||
|
char *buffer;
|
||||||
|
|
||||||
|
f = open_ecorecorddb(filename,&count,0);
|
||||||
|
|
||||||
|
if (f==NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1),
|
||||||
|
"Allocate rank index");
|
||||||
|
|
||||||
|
index->count=count;
|
||||||
|
|
||||||
|
for (i=0; i < count; i++)
|
||||||
|
{
|
||||||
|
buffer = read_ecorecord(f,&rs);
|
||||||
|
index->label[i]=(char*) ECOMALLOC(rs+1,
|
||||||
|
"Allocate rank label");
|
||||||
|
strncpy(index->label[i],buffer,rs);
|
||||||
|
}
|
||||||
|
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t rank_index(const char* label,ecorankidx_t* ranks)
|
||||||
|
{
|
||||||
|
char **rep;
|
||||||
|
|
||||||
|
rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel);
|
||||||
|
|
||||||
|
if (rep)
|
||||||
|
return rep-ranks->label;
|
||||||
|
// else
|
||||||
|
// ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found");
|
||||||
|
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int compareRankLabel(const void *label1, const void *label2)
|
||||||
|
{
|
||||||
|
return strcmp((const char*)label1,*(const char**)label2);
|
||||||
|
}
|
BIN
ROBITools/src/ecorank.o
Normal file
BIN
ROBITools/src/ecorank.o
Normal file
Binary file not shown.
230
ROBITools/src/ecoseq.c
Normal file
230
ROBITools/src/ecoseq.c
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
#include "ecoPCR.h"
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <zlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <ctype.h>
|
||||||
|
|
||||||
|
static FILE *open_seqfile(const char *prefix,int32_t index);
|
||||||
|
|
||||||
|
|
||||||
|
ecoseq_t *new_ecoseq()
|
||||||
|
{
|
||||||
|
void *tmp;
|
||||||
|
|
||||||
|
tmp = ECOMALLOC(sizeof(ecoseq_t),"Allocate new ecoseq structure");
|
||||||
|
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t delete_ecoseq(ecoseq_t * seq)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (seq)
|
||||||
|
{
|
||||||
|
if (seq->AC)
|
||||||
|
ECOFREE(seq->AC,"Free sequence AC");
|
||||||
|
|
||||||
|
if (seq->DE)
|
||||||
|
ECOFREE(seq->DE,"Free sequence DE");
|
||||||
|
|
||||||
|
if (seq->SQ)
|
||||||
|
ECOFREE(seq->SQ,"Free sequence SQ");
|
||||||
|
|
||||||
|
ECOFREE(seq,"Free sequence structure");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
ecoseq_t *new_ecoseq_with_data( char *AC,
|
||||||
|
char *DE,
|
||||||
|
char *SQ,
|
||||||
|
int32_t taxid_idx
|
||||||
|
)
|
||||||
|
{
|
||||||
|
ecoseq_t *tmp;
|
||||||
|
int32_t lstr;
|
||||||
|
tmp = new_ecoseq();
|
||||||
|
|
||||||
|
tmp->taxid=taxid_idx;
|
||||||
|
|
||||||
|
if (AC)
|
||||||
|
{
|
||||||
|
lstr =strlen(AC);
|
||||||
|
tmp->AC=ECOMALLOC((lstr+1) * sizeof(char),
|
||||||
|
"Allocate sequence accession");
|
||||||
|
strcpy(tmp->AC,AC);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (DE)
|
||||||
|
{
|
||||||
|
lstr =strlen(DE);
|
||||||
|
tmp->DE=ECOMALLOC((lstr+1) * sizeof(char),
|
||||||
|
"Allocate sequence definition");
|
||||||
|
strcpy(tmp->DE,DE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (SQ)
|
||||||
|
{
|
||||||
|
lstr =strlen(SQ);
|
||||||
|
tmp->SQ=ECOMALLOC((lstr+1) * sizeof(char),
|
||||||
|
"Allocate sequence data");
|
||||||
|
strcpy(tmp->SQ,SQ);
|
||||||
|
}
|
||||||
|
return tmp;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* ?? used ??
|
||||||
|
**/
|
||||||
|
FILE *open_ecoseqdb(const char *filename,
|
||||||
|
int32_t *sequencecount)
|
||||||
|
{
|
||||||
|
return open_ecorecorddb(filename,sequencecount,1);
|
||||||
|
}
|
||||||
|
|
||||||
|
ecoseq_t *readnext_ecoseq(FILE *f)
|
||||||
|
{
|
||||||
|
char *compressed=NULL;
|
||||||
|
|
||||||
|
ecoseqformat_t *raw;
|
||||||
|
ecoseq_t *seq;
|
||||||
|
int32_t comp_status;
|
||||||
|
unsigned long int seqlength;
|
||||||
|
int32_t rs;
|
||||||
|
char *c;
|
||||||
|
int32_t i;
|
||||||
|
|
||||||
|
raw = read_ecorecord(f,&rs);
|
||||||
|
|
||||||
|
if (!raw)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
if (is_big_endian())
|
||||||
|
{
|
||||||
|
raw->CSQ_length = swap_int32_t(raw->CSQ_length);
|
||||||
|
raw->DE_length = swap_int32_t(raw->DE_length);
|
||||||
|
raw->SQ_length = swap_int32_t(raw->SQ_length);
|
||||||
|
raw->taxid = swap_int32_t(raw->taxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
seq = new_ecoseq();
|
||||||
|
|
||||||
|
seq->taxid = raw->taxid;
|
||||||
|
|
||||||
|
seq->AC = ECOMALLOC(strlen(raw->AC) +1,
|
||||||
|
"Allocate Sequence Accesion number");
|
||||||
|
strncpy(seq->AC,raw->AC,strlen(raw->AC));
|
||||||
|
|
||||||
|
|
||||||
|
seq->DE = ECOMALLOC(raw->DE_length+1,
|
||||||
|
"Allocate Sequence definition");
|
||||||
|
strncpy(seq->DE,raw->data,raw->DE_length);
|
||||||
|
|
||||||
|
seqlength = seq->SQ_length = raw->SQ_length;
|
||||||
|
|
||||||
|
compressed = raw->data + raw->DE_length;
|
||||||
|
|
||||||
|
seq->SQ = ECOMALLOC(seqlength+1,
|
||||||
|
"Allocate sequence buffer");
|
||||||
|
|
||||||
|
comp_status = uncompress((unsigned char*)seq->SQ,
|
||||||
|
&seqlength,
|
||||||
|
(unsigned char*)compressed,
|
||||||
|
raw->CSQ_length);
|
||||||
|
|
||||||
|
if (comp_status != Z_OK)
|
||||||
|
ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data");
|
||||||
|
|
||||||
|
for (c=seq->SQ,i=0;i<seqlength;c++,i++)
|
||||||
|
*c=toupper(*c);
|
||||||
|
|
||||||
|
|
||||||
|
return seq;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Open the sequences database (.sdx file)
|
||||||
|
* @param prefix name of the database (radical without extension)
|
||||||
|
* @param index integer
|
||||||
|
*
|
||||||
|
* @return file object
|
||||||
|
*/
|
||||||
|
FILE *open_seqfile(const char *prefix,int32_t index)
|
||||||
|
{
|
||||||
|
char filename_buffer[1024];
|
||||||
|
int32_t filename_length;
|
||||||
|
FILE *input;
|
||||||
|
int32_t seqcount;
|
||||||
|
|
||||||
|
filename_length = snprintf(filename_buffer,
|
||||||
|
1023,
|
||||||
|
"%s_%03d.sdx",
|
||||||
|
prefix,
|
||||||
|
index);
|
||||||
|
|
||||||
|
// fprintf(stderr,"# Coucou %s\n",filename_buffer);
|
||||||
|
|
||||||
|
|
||||||
|
if (filename_length >= 1024)
|
||||||
|
ECOERROR(ECO_ASSERT_ERROR,"file name is too long");
|
||||||
|
|
||||||
|
filename_buffer[filename_length]=0;
|
||||||
|
|
||||||
|
input=open_ecorecorddb(filename_buffer,&seqcount,0);
|
||||||
|
|
||||||
|
if (input)
|
||||||
|
fprintf(stderr,"# Reading file %s containing %d sequences...\n",
|
||||||
|
filename_buffer,
|
||||||
|
seqcount);
|
||||||
|
|
||||||
|
return input;
|
||||||
|
}
|
||||||
|
|
||||||
|
ecoseq_t *ecoseq_iterator(const char *prefix)
|
||||||
|
{
|
||||||
|
static FILE *current_seq_file= NULL;
|
||||||
|
static int32_t current_file_idx = 1;
|
||||||
|
static char current_prefix[1024];
|
||||||
|
ecoseq_t *seq;
|
||||||
|
|
||||||
|
if (prefix)
|
||||||
|
{
|
||||||
|
current_file_idx = 1;
|
||||||
|
|
||||||
|
if (current_seq_file)
|
||||||
|
fclose(current_seq_file);
|
||||||
|
|
||||||
|
strncpy(current_prefix,prefix,1023);
|
||||||
|
current_prefix[1023]=0;
|
||||||
|
|
||||||
|
current_seq_file = open_seqfile(current_prefix,
|
||||||
|
current_file_idx);
|
||||||
|
|
||||||
|
if (!current_seq_file)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
seq = readnext_ecoseq(current_seq_file);
|
||||||
|
|
||||||
|
if (!seq && feof(current_seq_file))
|
||||||
|
{
|
||||||
|
current_file_idx++;
|
||||||
|
fclose(current_seq_file);
|
||||||
|
current_seq_file = open_seqfile(current_prefix,
|
||||||
|
current_file_idx);
|
||||||
|
|
||||||
|
|
||||||
|
if (current_seq_file)
|
||||||
|
seq = readnext_ecoseq(current_seq_file);
|
||||||
|
}
|
||||||
|
|
||||||
|
return seq;
|
||||||
|
}
|
BIN
ROBITools/src/ecoseq.o
Normal file
BIN
ROBITools/src/ecoseq.o
Normal file
Binary file not shown.
437
ROBITools/src/ecotax.c
Normal file
437
ROBITools/src/ecotax.c
Normal file
@@ -0,0 +1,437 @@
|
|||||||
|
#include "ecoPCR.h"
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include <R.h>
|
||||||
|
|
||||||
|
#ifndef MAX
|
||||||
|
#define MAX(x,y) (((x)>(y)) ? (x):(y))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Open the taxonomy database
|
||||||
|
* @param pointer to the database (.tdx file)
|
||||||
|
* @return a ecotxidx_t structure
|
||||||
|
*/
|
||||||
|
ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2)
|
||||||
|
{
|
||||||
|
int32_t count;
|
||||||
|
int32_t count2;
|
||||||
|
FILE *f;
|
||||||
|
FILE *f2;
|
||||||
|
ecotxidx_t *index;
|
||||||
|
struct ecotxnode *t;
|
||||||
|
int32_t i;
|
||||||
|
int32_t j;
|
||||||
|
|
||||||
|
f = open_ecorecorddb(filename,&count,0);
|
||||||
|
|
||||||
|
if (f==NULL) return NULL;
|
||||||
|
|
||||||
|
f2 = open_ecorecorddb(filename2,&count2,0);
|
||||||
|
|
||||||
|
index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count+count2-1),
|
||||||
|
"Allocate taxonomy");
|
||||||
|
|
||||||
|
index->count=count+count2;
|
||||||
|
index->buffersize = index->count;
|
||||||
|
|
||||||
|
index->maxtaxid=0;
|
||||||
|
REprintf("Readind %d taxa...\n",count);
|
||||||
|
for (i=0; i < count; i++){
|
||||||
|
readnext_ecotaxon(f,&(index->taxon[i]));
|
||||||
|
index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
|
||||||
|
index->taxon[i].parent->farest=0;
|
||||||
|
if (index->taxon[i].taxid > index->maxtaxid)
|
||||||
|
index->maxtaxid=index->taxon[i].taxid;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (count2>0)
|
||||||
|
REprintf("Readind %d local taxa...\n",count2);
|
||||||
|
else
|
||||||
|
REprintf("No local taxon\n");
|
||||||
|
|
||||||
|
count = index->count;
|
||||||
|
|
||||||
|
for (; i < count; i++){
|
||||||
|
readnext_ecotaxon(f2,&(index->taxon[i]));
|
||||||
|
index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
|
||||||
|
index->taxon[i].parent->farest=0;
|
||||||
|
if (index->taxon[i].taxid > index->maxtaxid)
|
||||||
|
index->maxtaxid=index->taxon[i].taxid;
|
||||||
|
}
|
||||||
|
|
||||||
|
REprintf("Computing longest branches...\n",count);
|
||||||
|
|
||||||
|
for (i=0; i < count; i++){
|
||||||
|
t=index->taxon+i;
|
||||||
|
if (t->farest==-1)
|
||||||
|
{
|
||||||
|
t->farest=0;
|
||||||
|
while(t->parent != t)
|
||||||
|
{
|
||||||
|
j = t->farest + 1;
|
||||||
|
if (j > t->parent->farest)
|
||||||
|
{
|
||||||
|
t->parent->farest = j;
|
||||||
|
t=t->parent;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
t=index->taxon;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int32_t delete_taxonomy(ecotxidx_t *index)
|
||||||
|
{
|
||||||
|
int32_t i;
|
||||||
|
|
||||||
|
if (index)
|
||||||
|
{
|
||||||
|
for (i=0; i< index->count; i++)
|
||||||
|
if (index->taxon[i].name)
|
||||||
|
ECOFREE(index->taxon[i].name,"Free scientific name");
|
||||||
|
|
||||||
|
ECOFREE(index,"Free Taxonomy");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int32_t delete_taxon(ecotx_t *taxon)
|
||||||
|
{
|
||||||
|
if (taxon)
|
||||||
|
{
|
||||||
|
if (taxon->name)
|
||||||
|
ECOFREE(taxon->name,"Free scientific name");
|
||||||
|
|
||||||
|
ECOFREE(taxon,"Free Taxon");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the database for a given taxon a save the data
|
||||||
|
* into the taxon structure(if any found)
|
||||||
|
* @param *f pointer to FILE type returned by fopen
|
||||||
|
* @param *taxon pointer to the structure
|
||||||
|
*
|
||||||
|
* @return a ecotx_t structure if any taxon found else NULL
|
||||||
|
*/
|
||||||
|
ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
|
||||||
|
{
|
||||||
|
|
||||||
|
ecotxformat_t *raw;
|
||||||
|
int32_t rs;
|
||||||
|
|
||||||
|
raw = read_ecorecord(f,&rs);
|
||||||
|
|
||||||
|
if (!raw)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
if (is_big_endian())
|
||||||
|
{
|
||||||
|
raw->namelength = swap_int32_t(raw->namelength);
|
||||||
|
raw->parent = swap_int32_t(raw->parent);
|
||||||
|
raw->rank = swap_int32_t(raw->rank);
|
||||||
|
raw->taxid = swap_int32_t(raw->taxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
taxon->parent = (ecotx_t*)((size_t)raw->parent);
|
||||||
|
taxon->taxid = raw->taxid;
|
||||||
|
taxon->rank = raw->rank;
|
||||||
|
taxon->farest = -1;
|
||||||
|
|
||||||
|
taxon->name = ECOMALLOC((raw->namelength+1) * sizeof(char),
|
||||||
|
"Allocate taxon scientific name");
|
||||||
|
|
||||||
|
strncpy(taxon->name,raw->name,raw->namelength);
|
||||||
|
|
||||||
|
return taxon;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName)
|
||||||
|
{
|
||||||
|
ecotaxonomy_t *tax;
|
||||||
|
char *filename;
|
||||||
|
char *filename2;
|
||||||
|
int buffsize;
|
||||||
|
|
||||||
|
tax = ECOMALLOC(sizeof(ecotaxonomy_t),
|
||||||
|
"Allocate taxonomy structure");
|
||||||
|
|
||||||
|
tax->ranks =NULL;
|
||||||
|
tax->taxons=NULL;
|
||||||
|
tax->names =NULL;
|
||||||
|
|
||||||
|
buffsize = strlen(prefix)+10;
|
||||||
|
|
||||||
|
filename = ECOMALLOC(buffsize,
|
||||||
|
"Allocate filename");
|
||||||
|
filename2= ECOMALLOC(buffsize,
|
||||||
|
"Allocate filename");
|
||||||
|
|
||||||
|
snprintf(filename,buffsize,"%s.rdx",prefix);
|
||||||
|
|
||||||
|
tax->ranks = read_rankidx(filename);
|
||||||
|
|
||||||
|
if (tax->ranks == NULL)
|
||||||
|
{
|
||||||
|
ECOFREE(filename,"Desallocate filename 1");
|
||||||
|
ECOFREE(filename2,"Desallocate filename 2");
|
||||||
|
|
||||||
|
delete_ecotaxonomy(tax);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
snprintf(filename,buffsize,"%s.tdx",prefix);
|
||||||
|
snprintf(filename2,buffsize,"%s.ldx",prefix);
|
||||||
|
|
||||||
|
tax->taxons = read_taxonomyidx(filename,filename2);
|
||||||
|
|
||||||
|
if (tax->taxons == NULL)
|
||||||
|
{
|
||||||
|
ECOFREE(filename,"Desallocate filename 1");
|
||||||
|
ECOFREE(filename,"Desallocate filename 2");
|
||||||
|
|
||||||
|
delete_ecotaxonomy(tax);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (readAlternativeName)
|
||||||
|
{
|
||||||
|
snprintf(filename,buffsize,"%s.ndx",prefix);
|
||||||
|
tax->names=read_nameidx(filename,tax);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
tax->names=NULL;
|
||||||
|
|
||||||
|
ECOFREE(filename,"Desallocate filename 1");
|
||||||
|
ECOFREE(filename2,"Desallocate filename 2");
|
||||||
|
|
||||||
|
return tax;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy)
|
||||||
|
{
|
||||||
|
if (taxonomy)
|
||||||
|
{
|
||||||
|
if (taxonomy->ranks)
|
||||||
|
ECOFREE(taxonomy->ranks,"Free rank index");
|
||||||
|
|
||||||
|
if (taxonomy->names)
|
||||||
|
ECOFREE(taxonomy->names,"Free names index");
|
||||||
|
|
||||||
|
if (taxonomy->taxons)
|
||||||
|
ECOFREE(taxonomy->taxons,"Free taxon index");
|
||||||
|
|
||||||
|
ECOFREE(taxonomy,"Free taxonomy structure");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon,
|
||||||
|
int32_t rankidx)
|
||||||
|
{
|
||||||
|
ecotx_t *current_taxon;
|
||||||
|
ecotx_t *next_taxon;
|
||||||
|
|
||||||
|
current_taxon = taxon;
|
||||||
|
next_taxon = current_taxon->parent;
|
||||||
|
|
||||||
|
while ((current_taxon!=next_taxon) && // I' am the root node
|
||||||
|
(current_taxon->rank!=rankidx))
|
||||||
|
{
|
||||||
|
current_taxon = next_taxon;
|
||||||
|
next_taxon = current_taxon->parent;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current_taxon->rank==rankidx)
|
||||||
|
return current_taxon;
|
||||||
|
else
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int bcomptaxon (const void * ptaxid, const void * ptaxon) {
|
||||||
|
|
||||||
|
ecotx_t *current_taxon = (ecotx_t*)ptaxon;
|
||||||
|
int32_t taxid=(int32_t)((size_t)ptaxid);
|
||||||
|
return taxid - current_taxon->taxid;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get back information concerning a taxon from a taxonomic id
|
||||||
|
* @param *taxonomy the taxonomy database
|
||||||
|
* @param taxid the taxonomic id
|
||||||
|
*
|
||||||
|
* @result a ecotx_t structure containing the taxonimic information
|
||||||
|
**/
|
||||||
|
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy,
|
||||||
|
int32_t taxid)
|
||||||
|
{
|
||||||
|
ecotx_t *current_taxon;
|
||||||
|
int32_t taxoncount;
|
||||||
|
// int32_t i;
|
||||||
|
|
||||||
|
taxoncount=taxonomy->taxons->count;
|
||||||
|
|
||||||
|
current_taxon = (ecotx_t*) bsearch((const void *)((size_t)taxid),
|
||||||
|
(const void *)taxonomy->taxons->taxon,
|
||||||
|
taxoncount,
|
||||||
|
sizeof(ecotx_t),
|
||||||
|
bcomptaxon);
|
||||||
|
|
||||||
|
/* Old version
|
||||||
|
for (current_taxon=taxonomy->taxons->taxon,
|
||||||
|
i=0;
|
||||||
|
i < taxoncount;
|
||||||
|
i++,
|
||||||
|
current_taxon++){
|
||||||
|
if (current_taxon->taxid==taxid){
|
||||||
|
return current_taxon;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
return current_taxon;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find out if taxon is son of other taxon (identified by its taxid)
|
||||||
|
* @param *taxon son taxon
|
||||||
|
* @param parent_taxid taxonomic id of the other taxon
|
||||||
|
*
|
||||||
|
* @return 1 is the other taxid math a parent taxid, else 0
|
||||||
|
**/
|
||||||
|
int eco_isundertaxon(ecotx_t *taxon,
|
||||||
|
int other_taxid)
|
||||||
|
{
|
||||||
|
ecotx_t *next_parent;
|
||||||
|
|
||||||
|
next_parent = taxon->parent;
|
||||||
|
|
||||||
|
while ( (other_taxid != next_parent->taxid) &&
|
||||||
|
(strcmp(next_parent->name, "root")) )
|
||||||
|
{
|
||||||
|
next_parent = next_parent->parent;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (other_taxid == next_parent->taxid)
|
||||||
|
return 1;
|
||||||
|
else
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
ecotx_t *eco_getspecies(ecotx_t *taxon,
|
||||||
|
ecotaxonomy_t *taxonomy)
|
||||||
|
{
|
||||||
|
static ecotaxonomy_t *tax=NULL;
|
||||||
|
static int32_t rankindex=-1;
|
||||||
|
|
||||||
|
if (taxonomy && tax!=taxonomy)
|
||||||
|
{
|
||||||
|
rankindex = rank_index("species",taxonomy->ranks);
|
||||||
|
tax=taxonomy;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!tax || rankindex < 0)
|
||||||
|
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||||
|
|
||||||
|
return eco_findtaxonatrank(taxon,rankindex);
|
||||||
|
}
|
||||||
|
|
||||||
|
ecotx_t *eco_getgenus(ecotx_t *taxon,
|
||||||
|
ecotaxonomy_t *taxonomy)
|
||||||
|
{
|
||||||
|
static ecotaxonomy_t *tax=NULL;
|
||||||
|
static int32_t rankindex=-1;
|
||||||
|
|
||||||
|
if (taxonomy && tax!=taxonomy)
|
||||||
|
{
|
||||||
|
rankindex = rank_index("genus",taxonomy->ranks);
|
||||||
|
tax=taxonomy;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!tax || rankindex < 0)
|
||||||
|
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||||
|
|
||||||
|
return eco_findtaxonatrank(taxon,rankindex);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
ecotx_t *eco_getfamily(ecotx_t *taxon,
|
||||||
|
ecotaxonomy_t *taxonomy)
|
||||||
|
{
|
||||||
|
static ecotaxonomy_t *tax=NULL;
|
||||||
|
static int32_t rankindex=-1;
|
||||||
|
|
||||||
|
if (taxonomy && tax!=taxonomy)
|
||||||
|
{
|
||||||
|
rankindex = rank_index("family",taxonomy->ranks);
|
||||||
|
tax=taxonomy;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!tax || rankindex < 0)
|
||||||
|
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||||
|
|
||||||
|
return eco_findtaxonatrank(taxon,rankindex);
|
||||||
|
}
|
||||||
|
|
||||||
|
ecotx_t *eco_getkingdom(ecotx_t *taxon,
|
||||||
|
ecotaxonomy_t *taxonomy)
|
||||||
|
{
|
||||||
|
static ecotaxonomy_t *tax=NULL;
|
||||||
|
static int32_t rankindex=-1;
|
||||||
|
|
||||||
|
if (taxonomy && tax!=taxonomy)
|
||||||
|
{
|
||||||
|
rankindex = rank_index("kingdom",taxonomy->ranks);
|
||||||
|
tax=taxonomy;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!tax || rankindex < 0)
|
||||||
|
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||||
|
|
||||||
|
return eco_findtaxonatrank(taxon,rankindex);
|
||||||
|
}
|
||||||
|
|
||||||
|
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
|
||||||
|
ecotaxonomy_t *taxonomy)
|
||||||
|
{
|
||||||
|
static ecotaxonomy_t *tax=NULL;
|
||||||
|
static int32_t rankindex=-1;
|
||||||
|
|
||||||
|
if (taxonomy && tax!=taxonomy)
|
||||||
|
{
|
||||||
|
rankindex = rank_index("superkingdom",taxonomy->ranks);
|
||||||
|
tax=taxonomy;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!tax || rankindex < 0)
|
||||||
|
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||||
|
|
||||||
|
return eco_findtaxonatrank(taxon,rankindex);
|
||||||
|
}
|
BIN
ROBITools/src/ecotax.o
Normal file
BIN
ROBITools/src/ecotax.o
Normal file
Binary file not shown.
835
ROBITools/src/robitax.c
Normal file
835
ROBITools/src/robitax.c
Normal file
@@ -0,0 +1,835 @@
|
|||||||
|
/*
|
||||||
|
* robitax.c
|
||||||
|
*
|
||||||
|
* Created on: 17 janv. 2013
|
||||||
|
* Author: coissac
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "robitax.h"
|
||||||
|
#include <unistd.h>
|
||||||
|
//#include <regex.h>
|
||||||
|
#include "slre.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return a pointeur to an obitools taxonomy C structure
|
||||||
|
* from an R instance of taxonomy.obitools
|
||||||
|
*
|
||||||
|
* The function checks if the pointer stored in the R object is set
|
||||||
|
* to NULL. In this case this means that we have to load the taxonomy
|
||||||
|
* from the disk.
|
||||||
|
*
|
||||||
|
* @param taxonomy an R object
|
||||||
|
* @type taxonomy SEXP
|
||||||
|
*
|
||||||
|
* @return a pointer to the C structure
|
||||||
|
* @rtype ecotaxonomy_t *
|
||||||
|
*/
|
||||||
|
|
||||||
|
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy)
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
char *pwd;
|
||||||
|
SEXP pointer;
|
||||||
|
SEXP rclass;
|
||||||
|
SEXP rdir;
|
||||||
|
SEXP rfile;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
const char *class;
|
||||||
|
const char *file;
|
||||||
|
const char *dir;
|
||||||
|
|
||||||
|
int saved;
|
||||||
|
|
||||||
|
if (!IS_S4_OBJECT(Rtaxonomy) )
|
||||||
|
error("argument not taxonomy.obitools instance");
|
||||||
|
|
||||||
|
// We get the class name and compare it to "taxonomy.obitools"
|
||||||
|
rclass = getAttrib(Rtaxonomy, R_ClassSymbol);
|
||||||
|
class = CHAR(asChar(rclass));
|
||||||
|
|
||||||
|
if (strcmp(class,"taxonomy.obitools"))
|
||||||
|
error("argument not taxonomy.obitools instance");
|
||||||
|
|
||||||
|
pointer = R_do_slot(Rtaxonomy,mkString("pointer"));
|
||||||
|
saved = LOGICAL(R_do_slot(Rtaxonomy,mkString("saved")))[0];
|
||||||
|
ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(pointer);
|
||||||
|
|
||||||
|
// If the external pointer is set to NULL we have to load
|
||||||
|
// the taxonomy from file
|
||||||
|
if (ptax==NULL && saved)
|
||||||
|
{
|
||||||
|
pwd = getcwd(NULL,0);
|
||||||
|
|
||||||
|
rfile = R_do_slot(Rtaxonomy,mkString("dbname"));
|
||||||
|
file = CHAR(asChar(rfile));
|
||||||
|
|
||||||
|
rdir = R_do_slot(Rtaxonomy,mkString("workingdir"));
|
||||||
|
dir = CHAR(asChar(rdir));
|
||||||
|
|
||||||
|
chdir(dir);
|
||||||
|
|
||||||
|
ptax = read_taxonomy(file,1);
|
||||||
|
|
||||||
|
R_SetExternalPtrAddr(pointer,(void*)ptax);
|
||||||
|
|
||||||
|
chdir(pwd);
|
||||||
|
free(pwd);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ptax==NULL && ! saved)
|
||||||
|
error("The taxonomy instance is no more valid and must be rebuilt");
|
||||||
|
|
||||||
|
return ptax;
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_delete_taxonomy(SEXP Rtaxonomy)
|
||||||
|
{
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
// SEXP pointer;
|
||||||
|
|
||||||
|
ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(Rtaxonomy);
|
||||||
|
|
||||||
|
(void) delete_ecotaxonomy(ptax);
|
||||||
|
|
||||||
|
// Clear the external pointer
|
||||||
|
R_ClearExternalPtr(Rtaxonomy);
|
||||||
|
|
||||||
|
return R_NilValue;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
SEXP R_read_taxonomy(SEXP filename, SEXP altenative)
|
||||||
|
{
|
||||||
|
int alt;
|
||||||
|
const char* file;
|
||||||
|
SEXP Rtax;
|
||||||
|
|
||||||
|
if (! isString(filename))
|
||||||
|
error("filename not character");
|
||||||
|
file = CHAR(STRING_ELT(filename, 0));
|
||||||
|
|
||||||
|
if (! isLogical(altenative))
|
||||||
|
error("altenative not logical");
|
||||||
|
alt = LOGICAL(altenative)[0];
|
||||||
|
|
||||||
|
ecotaxonomy_t *taxonomy = read_taxonomy(file,alt);
|
||||||
|
|
||||||
|
if (! taxonomy)
|
||||||
|
error("Cannot open the taxonomy database");
|
||||||
|
|
||||||
|
Rtax = PROTECT(R_MakeExternalPtr(taxonomy, mkString("ROBITools NCBI Taxonomy pointer"), R_NilValue));
|
||||||
|
R_RegisterCFinalizerEx(Rtax, (R_CFinalizer_t)R_delete_taxonomy,TRUE);
|
||||||
|
|
||||||
|
UNPROTECT(1);
|
||||||
|
|
||||||
|
|
||||||
|
return Rtax;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
SEXP R_get_scientific_name(SEXP Rtaxonomy,SEXP Rtaxid)
|
||||||
|
{
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
int taxid;
|
||||||
|
// SEXP scname;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
if (! isInteger(Rtaxid))
|
||||||
|
error("taxid not integer");
|
||||||
|
|
||||||
|
taxid = *INTEGER(Rtaxid);
|
||||||
|
|
||||||
|
if (! (taxid > 0))
|
||||||
|
error("taxid not positive");
|
||||||
|
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||||
|
|
||||||
|
if (!taxon)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
// error("unkown taxid");
|
||||||
|
|
||||||
|
return mkString(taxon->name);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_get_rank(SEXP Rtaxonomy,SEXP Rtaxid)
|
||||||
|
{
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
int *taxid;
|
||||||
|
int ntaxid;
|
||||||
|
int i;
|
||||||
|
SEXP results;
|
||||||
|
// SEXP scname;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
if (! isInteger(Rtaxid))
|
||||||
|
error("taxid not integer");
|
||||||
|
|
||||||
|
ntaxid = length(Rtaxid);
|
||||||
|
|
||||||
|
results = PROTECT(allocVector(STRSXP, ntaxid));
|
||||||
|
|
||||||
|
taxid = INTEGER(Rtaxid);
|
||||||
|
|
||||||
|
for (i=0; i < ntaxid; i++)
|
||||||
|
{
|
||||||
|
if (taxid[i]== NA_INTEGER || taxid[i] <= 0)
|
||||||
|
SET_STRING_ELT(results, i, R_NaString);
|
||||||
|
else {
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, taxid[i]);
|
||||||
|
if (!taxon)
|
||||||
|
SET_STRING_ELT(results, i, R_NaString);
|
||||||
|
else
|
||||||
|
SET_STRING_ELT(results, i, mkChar(ptax->ranks->label[taxon->rank]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
UNPROTECT(1);
|
||||||
|
|
||||||
|
return results;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_findtaxonatrank(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rrank, SEXP Rname)
|
||||||
|
{
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotx_t *rep;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
int taxid;
|
||||||
|
int name;
|
||||||
|
const char *rank;
|
||||||
|
int rankidx;
|
||||||
|
// SEXP scname;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
if (! isInteger(Rtaxid))
|
||||||
|
error("taxid not integer");
|
||||||
|
|
||||||
|
taxid = *INTEGER(Rtaxid);
|
||||||
|
|
||||||
|
if (! (taxid > 0))
|
||||||
|
error("taxid not positive");
|
||||||
|
|
||||||
|
if (! isString(Rrank))
|
||||||
|
error("rank not a string");
|
||||||
|
|
||||||
|
rank=CHAR(STRING_ELT(Rrank,0));
|
||||||
|
|
||||||
|
rankidx=rank_index(rank,ptax->ranks);
|
||||||
|
|
||||||
|
if (rankidx < 0)
|
||||||
|
error("unkown rank name");
|
||||||
|
|
||||||
|
if (! isLogical(Rname))
|
||||||
|
error("name not logical");
|
||||||
|
name = LOGICAL(Rname)[0];
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||||
|
|
||||||
|
if (!taxon)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
rep = eco_findtaxonatrank(taxon,rankidx);
|
||||||
|
|
||||||
|
if (!rep)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name)
|
||||||
|
return mkString(rep->name);
|
||||||
|
else
|
||||||
|
return ScalarInteger(rep->taxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
SEXP R_get_species(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||||
|
{
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotx_t *rep;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
int taxid;
|
||||||
|
int name;
|
||||||
|
// SEXP scname;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
if (! isInteger(Rtaxid))
|
||||||
|
error("taxid not integer");
|
||||||
|
|
||||||
|
taxid = *INTEGER(Rtaxid);
|
||||||
|
|
||||||
|
if (! (taxid > 0))
|
||||||
|
error("taxid not positive");
|
||||||
|
|
||||||
|
if (! isLogical(Rname))
|
||||||
|
error("name not logical");
|
||||||
|
name = LOGICAL(Rname)[0];
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||||
|
|
||||||
|
if (!taxon)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
rep = eco_getspecies(taxon,ptax);
|
||||||
|
|
||||||
|
if (!rep)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name)
|
||||||
|
return mkString(rep->name);
|
||||||
|
else
|
||||||
|
return ScalarInteger(rep->taxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_get_genus(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||||
|
{
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotx_t *rep;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
int taxid;
|
||||||
|
int name;
|
||||||
|
// SEXP scname;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
if (! isInteger(Rtaxid))
|
||||||
|
error("taxid not integer");
|
||||||
|
|
||||||
|
taxid = *INTEGER(Rtaxid);
|
||||||
|
|
||||||
|
if (! (taxid > 0))
|
||||||
|
error("taxid not positive");
|
||||||
|
|
||||||
|
if (! isLogical(Rname))
|
||||||
|
error("name not logical");
|
||||||
|
name = LOGICAL(Rname)[0];
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||||
|
|
||||||
|
if (!taxon)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
rep = eco_getgenus(taxon,ptax);
|
||||||
|
|
||||||
|
if (!rep)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name)
|
||||||
|
return mkString(rep->name);
|
||||||
|
else
|
||||||
|
return ScalarInteger(rep->taxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_get_family(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||||
|
{
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotx_t *rep;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
int taxid;
|
||||||
|
int name;
|
||||||
|
// SEXP scname;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
if (! isInteger(Rtaxid))
|
||||||
|
error("taxid not integer");
|
||||||
|
|
||||||
|
taxid = *INTEGER(Rtaxid);
|
||||||
|
|
||||||
|
if (! (taxid > 0))
|
||||||
|
error("taxid not positive");
|
||||||
|
|
||||||
|
if (! isLogical(Rname))
|
||||||
|
error("name not logical");
|
||||||
|
name = LOGICAL(Rname)[0];
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||||
|
|
||||||
|
if (!taxon)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
rep = eco_getfamily(taxon,ptax);
|
||||||
|
|
||||||
|
if (!rep)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name)
|
||||||
|
return mkString(rep->name);
|
||||||
|
else
|
||||||
|
return ScalarInteger(rep->taxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_get_kingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||||
|
{
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotx_t *rep;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
int taxid;
|
||||||
|
int name;
|
||||||
|
// SEXP scname;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
if (! isInteger(Rtaxid))
|
||||||
|
error("taxid not integer");
|
||||||
|
|
||||||
|
taxid = *INTEGER(Rtaxid);
|
||||||
|
|
||||||
|
if (! (taxid > 0))
|
||||||
|
error("taxid not positive");
|
||||||
|
|
||||||
|
if (! isLogical(Rname))
|
||||||
|
error("name not logical");
|
||||||
|
name = LOGICAL(Rname)[0];
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||||
|
|
||||||
|
if (!taxon)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
rep = eco_getkingdom(taxon,ptax);
|
||||||
|
|
||||||
|
if (!rep)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name)
|
||||||
|
return mkString(rep->name);
|
||||||
|
else
|
||||||
|
return ScalarInteger(rep->taxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_get_superkingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||||
|
{
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotx_t *rep;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
int taxid;
|
||||||
|
int name;
|
||||||
|
// SEXP scname;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
if (! isInteger(Rtaxid))
|
||||||
|
error("taxid not integer");
|
||||||
|
|
||||||
|
taxid = *INTEGER(Rtaxid);
|
||||||
|
|
||||||
|
if (! (taxid > 0))
|
||||||
|
error("taxid not positive");
|
||||||
|
|
||||||
|
if (! isLogical(Rname))
|
||||||
|
error("name not logical");
|
||||||
|
name = LOGICAL(Rname)[0];
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||||
|
|
||||||
|
if (!taxon)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
rep = eco_getsuperkingdom(taxon,ptax);
|
||||||
|
|
||||||
|
if (!rep)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name)
|
||||||
|
return mkString(rep->name);
|
||||||
|
else
|
||||||
|
return ScalarInteger(rep->taxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_get_parent(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||||
|
{
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotx_t *rep;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
int taxid;
|
||||||
|
int name;
|
||||||
|
// SEXP scname;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
if (! isInteger(Rtaxid))
|
||||||
|
error("taxid not integer");
|
||||||
|
|
||||||
|
taxid = *INTEGER(Rtaxid);
|
||||||
|
|
||||||
|
if (! (taxid > 0))
|
||||||
|
error("taxid not positive");
|
||||||
|
|
||||||
|
if (! isLogical(Rname))
|
||||||
|
error("name not logical");
|
||||||
|
name = LOGICAL(Rname)[0];
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||||
|
|
||||||
|
if (!taxon)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
rep = taxon->parent;
|
||||||
|
|
||||||
|
if (rep->taxid==taxid)
|
||||||
|
{
|
||||||
|
if (name)
|
||||||
|
return ScalarString(R_NaString);
|
||||||
|
else
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name)
|
||||||
|
return mkString(rep->name);
|
||||||
|
else
|
||||||
|
return ScalarInteger(rep->taxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
SEXP R_validate_taxid(SEXP Rtaxonomy,SEXP Rtaxid)
|
||||||
|
{
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
int taxid;
|
||||||
|
// int name;
|
||||||
|
// SEXP scname;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
if (! isInteger(Rtaxid))
|
||||||
|
error("taxid not integer");
|
||||||
|
|
||||||
|
taxid = *INTEGER(Rtaxid);
|
||||||
|
|
||||||
|
if (! (taxid > 0))
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||||
|
|
||||||
|
if (!taxon)
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
else
|
||||||
|
return ScalarInteger(taxon->taxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
SEXP R_is_under_taxon(SEXP Rtaxonomy, SEXP Rtaxid, SEXP Rparent)
|
||||||
|
{
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
int taxid;
|
||||||
|
int parent;
|
||||||
|
int rep;
|
||||||
|
// SEXP isunder;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
if (! isInteger(Rparent))
|
||||||
|
error("parent not integer");
|
||||||
|
|
||||||
|
parent = *INTEGER(Rparent);
|
||||||
|
|
||||||
|
if (parent <= 0)
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, parent);
|
||||||
|
|
||||||
|
if (!taxon)
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
|
||||||
|
|
||||||
|
if (! isInteger(Rtaxid))
|
||||||
|
error("taxid not integer");
|
||||||
|
|
||||||
|
taxid = *INTEGER(Rtaxid);
|
||||||
|
|
||||||
|
if (taxid <= 0)
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||||
|
|
||||||
|
if (!taxon)
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
|
||||||
|
|
||||||
|
rep = eco_isundertaxon(taxon, parent);
|
||||||
|
|
||||||
|
return ScalarLogical(rep);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
SEXP R_longest_path(SEXP Rtaxonomy,SEXP Rtaxid)
|
||||||
|
{
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
int taxid;
|
||||||
|
// int name;
|
||||||
|
// SEXP scname;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
if (! isInteger(Rtaxid))
|
||||||
|
error("taxid not integer");
|
||||||
|
|
||||||
|
taxid = *INTEGER(Rtaxid);
|
||||||
|
|
||||||
|
if (taxid <= 0)
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
|
||||||
|
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||||
|
|
||||||
|
if (!taxon)
|
||||||
|
return ScalarInteger(R_NaInt);
|
||||||
|
else
|
||||||
|
return ScalarInteger(taxon->farest);
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_rank_list(SEXP Rtaxonomy)
|
||||||
|
{
|
||||||
|
int nrank;
|
||||||
|
int i;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
SEXP rNames;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
nrank = ptax->ranks->count;
|
||||||
|
|
||||||
|
rNames = PROTECT(allocVector(STRSXP, nrank));
|
||||||
|
|
||||||
|
for (i=0; i < nrank;i++)
|
||||||
|
SET_STRING_ELT(rNames, i, mkChar(ptax->ranks->label[i]));
|
||||||
|
|
||||||
|
UNPROTECT(1);
|
||||||
|
|
||||||
|
return rNames;
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_taxid_list(SEXP Rtaxonomy)
|
||||||
|
{
|
||||||
|
int ntaxid;
|
||||||
|
int i;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
SEXP rTaxids;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
ntaxid = ptax->taxons->count;
|
||||||
|
rTaxids = PROTECT(allocVector(INTSXP, ntaxid));
|
||||||
|
|
||||||
|
for (i=0; i < ntaxid;i++)
|
||||||
|
INTEGER(rTaxids)[i]=ptax->taxons->taxon[i].taxid;
|
||||||
|
|
||||||
|
UNPROTECT(1);
|
||||||
|
|
||||||
|
return rTaxids;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_max_taxid(SEXP Rtaxonomy)
|
||||||
|
{
|
||||||
|
// int nrank;
|
||||||
|
// int i;
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
// SEXP rNames;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
return ScalarInteger(ptax->taxons->maxtaxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_length_taxonomy(SEXP Rtaxonomy)
|
||||||
|
{
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
|
||||||
|
return ScalarInteger(ptax->taxons->count);
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP R_ecofind(SEXP Rtaxonomy, SEXP Rpattern, SEXP Rrank, SEXP Ralternative)
|
||||||
|
{
|
||||||
|
ecotaxonomy_t *ptax;
|
||||||
|
econame_t *name;
|
||||||
|
char* pattern=NULL;
|
||||||
|
int re_match;
|
||||||
|
SEXP taxids;
|
||||||
|
int32_t* buffer;
|
||||||
|
int32_t tax_count = 0;
|
||||||
|
size_t j = 0;
|
||||||
|
int32_t rankfilter = 1;
|
||||||
|
int* ptaxid;
|
||||||
|
char *rankname=NULL;
|
||||||
|
int32_t nummatch = 0;
|
||||||
|
int32_t alternative = 0;
|
||||||
|
|
||||||
|
size_t bsize;
|
||||||
|
|
||||||
|
ptax = getTaxPointer(Rtaxonomy);
|
||||||
|
tax_count = ptax->taxons->count;
|
||||||
|
|
||||||
|
if (! isString(Rpattern))
|
||||||
|
error("pattern not a string");
|
||||||
|
|
||||||
|
pattern= (char*) CHAR(STRING_ELT(Rpattern,0));
|
||||||
|
|
||||||
|
if (! isNull(Rrank))
|
||||||
|
{
|
||||||
|
if (! isString(Rrank))
|
||||||
|
error("rank not a string");
|
||||||
|
|
||||||
|
rankname= (char*) CHAR(STRING_ELT(Rrank,0));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (! isLogical(Ralternative))
|
||||||
|
error("rank not a logical");
|
||||||
|
|
||||||
|
alternative = LOGICAL(Ralternative)[0];
|
||||||
|
|
||||||
|
|
||||||
|
nummatch=0;
|
||||||
|
buffer = (int32_t*) malloc(100 * sizeof(int32_t));
|
||||||
|
bsize=100;
|
||||||
|
|
||||||
|
if (alternative && ptax->names!=NULL)
|
||||||
|
for (j=0,name=ptax->names->names;
|
||||||
|
j < ptax->names->count;
|
||||||
|
name++,j++)
|
||||||
|
{
|
||||||
|
if(rankname)
|
||||||
|
rankfilter = !(strcmp(rankname,ptax->ranks->label[name->taxon->rank]));
|
||||||
|
|
||||||
|
re_match = slre_match(pattern, name->name,
|
||||||
|
strlen(name->name),
|
||||||
|
NULL, 0,
|
||||||
|
SLRE_IGNORE_CASE);
|
||||||
|
|
||||||
|
if (re_match > 0 && rankfilter)
|
||||||
|
{
|
||||||
|
buffer[nummatch]=name->taxon->taxid;
|
||||||
|
nummatch++;
|
||||||
|
if (nummatch==bsize) {
|
||||||
|
bsize*=2;
|
||||||
|
buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t));
|
||||||
|
if (buffer==0)
|
||||||
|
{
|
||||||
|
// regfree(&re_preg);
|
||||||
|
error("Cannot allocate memory for the taxid list");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
else
|
||||||
|
for (j=0; j < ptax->taxons->count;j++)
|
||||||
|
{
|
||||||
|
if(rankname)
|
||||||
|
rankfilter = !(strcmp(rankname,ptax->ranks->label[ptax->taxons->taxon[j].rank]));
|
||||||
|
|
||||||
|
// re_match = regexec (&re_preg, ptax->taxons->taxon[j].name, 0, NULL, 0);
|
||||||
|
re_match = slre_match(pattern, ptax->taxons->taxon[j].name,
|
||||||
|
strlen(ptax->taxons->taxon[j].name),
|
||||||
|
NULL, 0,
|
||||||
|
SLRE_IGNORE_CASE);
|
||||||
|
|
||||||
|
|
||||||
|
// if (!re_match && rankfilter)
|
||||||
|
if (re_match > 0 && rankfilter)
|
||||||
|
{
|
||||||
|
buffer[nummatch]=ptax->taxons->taxon[j].taxid;
|
||||||
|
nummatch++;
|
||||||
|
if (nummatch==bsize) {
|
||||||
|
bsize*=2;
|
||||||
|
buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t));
|
||||||
|
if (buffer==0)
|
||||||
|
{
|
||||||
|
// regfree(&re_preg);
|
||||||
|
error("Cannot allocate memory for the taxid list");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//regfree(&re_preg);
|
||||||
|
|
||||||
|
taxids = PROTECT(NEW_INTEGER(nummatch));
|
||||||
|
ptaxid = INTEGER(taxids);
|
||||||
|
|
||||||
|
for (j=0; j < nummatch; j++)
|
||||||
|
ptaxid[j]=buffer[j];
|
||||||
|
|
||||||
|
free(buffer);
|
||||||
|
|
||||||
|
UNPROTECT(1);
|
||||||
|
return taxids;
|
||||||
|
}
|
6
ROBITools/src/robitax.h
Normal file
6
ROBITools/src/robitax.h
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
#include "ecoPCR.h"
|
||||||
|
|
||||||
|
|
||||||
|
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
|
||||||
|
SEXP R_delete_taxonomy(SEXP Rtaxonomy);
|
||||||
|
|
BIN
ROBITools/src/robitax.o
Normal file
BIN
ROBITools/src/robitax.o
Normal file
Binary file not shown.
433
ROBITools/src/slre.c
Executable file
433
ROBITools/src/slre.c
Executable file
@@ -0,0 +1,433 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
|
||||||
|
* Copyright (c) 2013 Cesanta Software Limited
|
||||||
|
* All rights reserved
|
||||||
|
*
|
||||||
|
* This library is dual-licensed: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License version 2 as
|
||||||
|
* published by the Free Software Foundation. For the terms of this
|
||||||
|
* license, see <http://www.gnu.org/licenses/>.
|
||||||
|
*
|
||||||
|
* You are free to use this library under the terms of the GNU General
|
||||||
|
* Public License, but WITHOUT ANY WARRANTY; without even the implied
|
||||||
|
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
* See the GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* Alternatively, you can license this library under a commercial
|
||||||
|
* license, as set out in <http://cesanta.com/products.html>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "slre.h"
|
||||||
|
|
||||||
|
#define MAX_BRANCHES 100
|
||||||
|
#define MAX_BRACKETS 100
|
||||||
|
#define FAIL_IF(condition, error_code) if (condition) return (error_code)
|
||||||
|
|
||||||
|
#ifndef ARRAY_SIZE
|
||||||
|
#define ARRAY_SIZE(ar) (sizeof(ar) / sizeof((ar)[0]))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef SLRE_DEBUG
|
||||||
|
#define DBG(x) printf x
|
||||||
|
#else
|
||||||
|
#define DBG(x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct bracket_pair {
|
||||||
|
const char *ptr; /* Points to the first char after '(' in regex */
|
||||||
|
int len; /* Length of the text between '(' and ')' */
|
||||||
|
int branches; /* Index in the branches array for this pair */
|
||||||
|
int num_branches; /* Number of '|' in this bracket pair */
|
||||||
|
};
|
||||||
|
|
||||||
|
struct branch {
|
||||||
|
int bracket_index; /* index for 'struct bracket_pair brackets' */
|
||||||
|
/* array defined below */
|
||||||
|
const char *schlong; /* points to the '|' character in the regex */
|
||||||
|
};
|
||||||
|
|
||||||
|
struct regex_info {
|
||||||
|
/*
|
||||||
|
* Describes all bracket pairs in the regular expression.
|
||||||
|
* First entry is always present, and grabs the whole regex.
|
||||||
|
*/
|
||||||
|
struct bracket_pair brackets[MAX_BRACKETS];
|
||||||
|
int num_brackets;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Describes alternations ('|' operators) in the regular expression.
|
||||||
|
* Each branch falls into a specific branch pair.
|
||||||
|
*/
|
||||||
|
struct branch branches[MAX_BRANCHES];
|
||||||
|
int num_branches;
|
||||||
|
|
||||||
|
/* Array of captures provided by the user */
|
||||||
|
struct slre_cap *caps;
|
||||||
|
int num_caps;
|
||||||
|
|
||||||
|
/* E.g. SLRE_IGNORE_CASE. See enum below */
|
||||||
|
int flags;
|
||||||
|
};
|
||||||
|
|
||||||
|
static int is_metacharacter(const unsigned char *s) {
|
||||||
|
static const char *metacharacters = "^$().[]*+?|\\Ssdbfnrtv";
|
||||||
|
return strchr(metacharacters, *s) != NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int op_len(const char *re) {
|
||||||
|
return re[0] == '\\' && re[1] == 'x' ? 4 : re[0] == '\\' ? 2 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int set_len(const char *re, int re_len) {
|
||||||
|
int len = 0;
|
||||||
|
|
||||||
|
while (len < re_len && re[len] != ']') {
|
||||||
|
len += op_len(re + len);
|
||||||
|
}
|
||||||
|
|
||||||
|
return len <= re_len ? len + 1 : -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int get_op_len(const char *re, int re_len) {
|
||||||
|
return re[0] == '[' ? set_len(re + 1, re_len - 1) + 1 : op_len(re);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int is_quantifier(const char *re) {
|
||||||
|
return re[0] == '*' || re[0] == '+' || re[0] == '?';
|
||||||
|
}
|
||||||
|
|
||||||
|
static int toi(int x) {
|
||||||
|
return isdigit(x) ? x - '0' : x - 'W';
|
||||||
|
}
|
||||||
|
|
||||||
|
static int hextoi(const unsigned char *s) {
|
||||||
|
return (toi(tolower(s[0])) << 4) | toi(tolower(s[1]));
|
||||||
|
}
|
||||||
|
|
||||||
|
static int match_op(const unsigned char *re, const unsigned char *s,
|
||||||
|
struct regex_info *info) {
|
||||||
|
int result = 0;
|
||||||
|
switch (*re) {
|
||||||
|
case '\\':
|
||||||
|
/* Metacharacters */
|
||||||
|
switch (re[1]) {
|
||||||
|
case 'S': FAIL_IF(isspace(*s), SLRE_NO_MATCH); result++; break;
|
||||||
|
case 's': FAIL_IF(!isspace(*s), SLRE_NO_MATCH); result++; break;
|
||||||
|
case 'd': FAIL_IF(!isdigit(*s), SLRE_NO_MATCH); result++; break;
|
||||||
|
case 'b': FAIL_IF(*s != '\b', SLRE_NO_MATCH); result++; break;
|
||||||
|
case 'f': FAIL_IF(*s != '\f', SLRE_NO_MATCH); result++; break;
|
||||||
|
case 'n': FAIL_IF(*s != '\n', SLRE_NO_MATCH); result++; break;
|
||||||
|
case 'r': FAIL_IF(*s != '\r', SLRE_NO_MATCH); result++; break;
|
||||||
|
case 't': FAIL_IF(*s != '\t', SLRE_NO_MATCH); result++; break;
|
||||||
|
case 'v': FAIL_IF(*s != '\v', SLRE_NO_MATCH); result++; break;
|
||||||
|
|
||||||
|
case 'x':
|
||||||
|
/* Match byte, \xHH where HH is hexadecimal byte representaion */
|
||||||
|
FAIL_IF(hextoi(re + 2) != *s, SLRE_NO_MATCH);
|
||||||
|
result++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
/* Valid metacharacter check is done in bar() */
|
||||||
|
FAIL_IF(re[1] != s[0], SLRE_NO_MATCH);
|
||||||
|
result++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '|': FAIL_IF(1, SLRE_INTERNAL_ERROR); break;
|
||||||
|
case '$': FAIL_IF(1, SLRE_NO_MATCH); break;
|
||||||
|
case '.': result++; break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
if (info->flags & SLRE_IGNORE_CASE) {
|
||||||
|
FAIL_IF(tolower(*re) != tolower(*s), SLRE_NO_MATCH);
|
||||||
|
} else {
|
||||||
|
FAIL_IF(*re != *s, SLRE_NO_MATCH);
|
||||||
|
}
|
||||||
|
result++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int match_set(const char *re, int re_len, const char *s,
|
||||||
|
struct regex_info *info) {
|
||||||
|
int len = 0, result = -1, invert = re[0] == '^';
|
||||||
|
|
||||||
|
if (invert) re++, re_len--;
|
||||||
|
|
||||||
|
while (len <= re_len && re[len] != ']' && result <= 0) {
|
||||||
|
/* Support character range */
|
||||||
|
if (re[len] != '-' && re[len + 1] == '-' && re[len + 2] != ']' &&
|
||||||
|
re[len + 2] != '\0') {
|
||||||
|
result = info->flags && SLRE_IGNORE_CASE ?
|
||||||
|
*s >= re[len] && *s <= re[len + 2] :
|
||||||
|
tolower(*s) >= tolower(re[len]) && tolower(*s) <= tolower(re[len + 2]);
|
||||||
|
len += 3;
|
||||||
|
} else {
|
||||||
|
result = match_op((unsigned char *) re + len, (unsigned char *) s, info);
|
||||||
|
len += op_len(re + len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return (!invert && result > 0) || (invert && result <= 0) ? 1 : -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int doh(const char *s, int s_len, struct regex_info *info, int bi);
|
||||||
|
|
||||||
|
static int bar(const char *re, int re_len, const char *s, int s_len,
|
||||||
|
struct regex_info *info, int bi) {
|
||||||
|
/* i is offset in re, j is offset in s, bi is brackets index */
|
||||||
|
int i, j, n, step;
|
||||||
|
|
||||||
|
for (i = j = 0; i < re_len && j <= s_len; i += step) {
|
||||||
|
|
||||||
|
/* Handle quantifiers. Get the length of the chunk. */
|
||||||
|
step = re[i] == '(' ? info->brackets[bi + 1].len + 2 :
|
||||||
|
get_op_len(re + i, re_len - i);
|
||||||
|
|
||||||
|
DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n", __func__,
|
||||||
|
re_len - i, re + i, s_len - j, s + j, re_len, step, i, j));
|
||||||
|
|
||||||
|
FAIL_IF(is_quantifier(&re[i]), SLRE_UNEXPECTED_QUANTIFIER);
|
||||||
|
FAIL_IF(step <= 0, SLRE_INVALID_CHARACTER_SET);
|
||||||
|
|
||||||
|
if (i + step < re_len && is_quantifier(re + i + step)) {
|
||||||
|
DBG(("QUANTIFIER: [%.*s]%c [%.*s]\n", step, re + i,
|
||||||
|
re[i + step], s_len - j, s + j));
|
||||||
|
if (re[i + step] == '?') {
|
||||||
|
int result = bar(re + i, step, s + j, s_len - j, info, bi);
|
||||||
|
j += result > 0 ? result : 0;
|
||||||
|
i++;
|
||||||
|
} else if (re[i + step] == '+' || re[i + step] == '*') {
|
||||||
|
int j2 = j, nj = j, n1, n2 = -1, ni, non_greedy = 0;
|
||||||
|
|
||||||
|
/* Points to the regexp code after the quantifier */
|
||||||
|
ni = i + step + 1;
|
||||||
|
if (ni < re_len && re[ni] == '?') {
|
||||||
|
non_greedy = 1;
|
||||||
|
ni++;
|
||||||
|
}
|
||||||
|
|
||||||
|
do {
|
||||||
|
if ((n1 = bar(re + i, step, s + j2, s_len - j2, info, bi)) > 0) {
|
||||||
|
j2 += n1;
|
||||||
|
}
|
||||||
|
if (re[i + step] == '+' && n1 < 0) break;
|
||||||
|
|
||||||
|
if (ni >= re_len) {
|
||||||
|
/* After quantifier, there is nothing */
|
||||||
|
nj = j2;
|
||||||
|
} else if ((n2 = bar(re + ni, re_len - ni, s + j2,
|
||||||
|
s_len - j2, info, bi)) >= 0) {
|
||||||
|
/* Regex after quantifier matched */
|
||||||
|
nj = j2 + n2;
|
||||||
|
}
|
||||||
|
if (nj > j && non_greedy) break;
|
||||||
|
} while (n1 > 0);
|
||||||
|
|
||||||
|
if (n1 < 0 && re[i + step] == '*' &&
|
||||||
|
(n2 = bar(re + ni, re_len - ni, s + j, s_len - j, info, bi)) > 0) {
|
||||||
|
nj = j + n2;
|
||||||
|
}
|
||||||
|
|
||||||
|
DBG(("STAR/PLUS END: %d %d %d %d %d\n", j, nj, re_len - ni, n1, n2));
|
||||||
|
FAIL_IF(re[i + step] == '+' && nj == j, SLRE_NO_MATCH);
|
||||||
|
|
||||||
|
/* If while loop body above was not executed for the * quantifier, */
|
||||||
|
/* make sure the rest of the regex matches */
|
||||||
|
FAIL_IF(nj == j && ni < re_len && n2 < 0, SLRE_NO_MATCH);
|
||||||
|
|
||||||
|
/* Returning here cause we've matched the rest of RE already */
|
||||||
|
return nj;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (re[i] == '[') {
|
||||||
|
n = match_set(re + i + 1, re_len - (i + 2), s + j, info);
|
||||||
|
DBG(("SET %.*s [%.*s] -> %d\n", step, re + i, s_len - j, s + j, n));
|
||||||
|
FAIL_IF(n <= 0, SLRE_NO_MATCH);
|
||||||
|
j += n;
|
||||||
|
} else if (re[i] == '(') {
|
||||||
|
n = SLRE_NO_MATCH;
|
||||||
|
bi++;
|
||||||
|
FAIL_IF(bi >= info->num_brackets, SLRE_INTERNAL_ERROR);
|
||||||
|
DBG(("CAPTURING [%.*s] [%.*s] [%s]\n",
|
||||||
|
step, re + i, s_len - j, s + j, re + i + step));
|
||||||
|
|
||||||
|
if (re_len - (i + step) <= 0) {
|
||||||
|
/* Nothing follows brackets */
|
||||||
|
n = doh(s + j, s_len - j, info, bi);
|
||||||
|
} else {
|
||||||
|
int j2;
|
||||||
|
for (j2 = 0; j2 <= s_len - j; j2++) {
|
||||||
|
if ((n = doh(s + j, s_len - (j + j2), info, bi)) >= 0 &&
|
||||||
|
bar(re + i + step, re_len - (i + step),
|
||||||
|
s + j + n, s_len - (j + n), info, bi) >= 0) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DBG(("CAPTURED [%.*s] [%.*s]:%d\n", step, re + i, s_len - j, s + j, n));
|
||||||
|
FAIL_IF(n < 0, n);
|
||||||
|
if (info->caps != NULL) {
|
||||||
|
info->caps[bi - 1].ptr = s + j;
|
||||||
|
info->caps[bi - 1].len = n;
|
||||||
|
}
|
||||||
|
j += n;
|
||||||
|
} else if (re[i] == '^') {
|
||||||
|
FAIL_IF(j != 0, SLRE_NO_MATCH);
|
||||||
|
} else if (re[i] == '$') {
|
||||||
|
FAIL_IF(j != s_len, SLRE_NO_MATCH);
|
||||||
|
} else {
|
||||||
|
FAIL_IF(j >= s_len, SLRE_NO_MATCH);
|
||||||
|
n = match_op((unsigned char *) (re + i), (unsigned char *) (s + j), info);
|
||||||
|
FAIL_IF(n <= 0, n);
|
||||||
|
j += n;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return j;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Process branch points */
|
||||||
|
static int doh(const char *s, int s_len, struct regex_info *info, int bi) {
|
||||||
|
const struct bracket_pair *b = &info->brackets[bi];
|
||||||
|
int i = 0, len, result;
|
||||||
|
const char *p;
|
||||||
|
|
||||||
|
do {
|
||||||
|
p = i == 0 ? b->ptr : info->branches[b->branches + i - 1].schlong + 1;
|
||||||
|
len = b->num_branches == 0 ? b->len :
|
||||||
|
i == b->num_branches ? (int) (b->ptr + b->len - p) :
|
||||||
|
(int) (info->branches[b->branches + i].schlong - p);
|
||||||
|
DBG(("%s %d %d [%.*s] [%.*s]\n", __func__, bi, i, len, p, s_len, s));
|
||||||
|
result = bar(p, len, s, s_len, info, bi);
|
||||||
|
DBG(("%s <- %d\n", __func__, result));
|
||||||
|
} while (result <= 0 && i++ < b->num_branches); /* At least 1 iteration */
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int baz(const char *s, int s_len, struct regex_info *info) {
|
||||||
|
int i, result = -1, is_anchored = info->brackets[0].ptr[0] == '^';
|
||||||
|
|
||||||
|
for (i = 0; i <= s_len; i++) {
|
||||||
|
result = doh(s + i, s_len - i, info, 0);
|
||||||
|
if (result >= 0) {
|
||||||
|
result += i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (is_anchored) break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void setup_branch_points(struct regex_info *info) {
|
||||||
|
int i, j;
|
||||||
|
struct branch tmp;
|
||||||
|
|
||||||
|
/* First, sort branches. Must be stable, no qsort. Use bubble algo. */
|
||||||
|
for (i = 0; i < info->num_branches; i++) {
|
||||||
|
for (j = i + 1; j < info->num_branches; j++) {
|
||||||
|
if (info->branches[i].bracket_index > info->branches[j].bracket_index) {
|
||||||
|
tmp = info->branches[i];
|
||||||
|
info->branches[i] = info->branches[j];
|
||||||
|
info->branches[j] = tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For each bracket, set their branch points. This way, for every bracket
|
||||||
|
* (i.e. every chunk of regex) we know all branch points before matching.
|
||||||
|
*/
|
||||||
|
for (i = j = 0; i < info->num_brackets; i++) {
|
||||||
|
info->brackets[i].num_branches = 0;
|
||||||
|
info->brackets[i].branches = j;
|
||||||
|
while (j < info->num_branches && info->branches[j].bracket_index == i) {
|
||||||
|
info->brackets[i].num_branches++;
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int foo(const char *re, int re_len, const char *s, int s_len,
|
||||||
|
struct regex_info *info) {
|
||||||
|
int i, step, depth = 0;
|
||||||
|
|
||||||
|
/* First bracket captures everything */
|
||||||
|
info->brackets[0].ptr = re;
|
||||||
|
info->brackets[0].len = re_len;
|
||||||
|
info->num_brackets = 1;
|
||||||
|
|
||||||
|
/* Make a single pass over regex string, memorize brackets and branches */
|
||||||
|
for (i = 0; i < re_len; i += step) {
|
||||||
|
step = get_op_len(re + i, re_len - i);
|
||||||
|
|
||||||
|
if (re[i] == '|') {
|
||||||
|
FAIL_IF(info->num_branches >= (int) ARRAY_SIZE(info->branches),
|
||||||
|
SLRE_TOO_MANY_BRANCHES);
|
||||||
|
info->branches[info->num_branches].bracket_index =
|
||||||
|
info->brackets[info->num_brackets - 1].len == -1 ?
|
||||||
|
info->num_brackets - 1 : depth;
|
||||||
|
info->branches[info->num_branches].schlong = &re[i];
|
||||||
|
info->num_branches++;
|
||||||
|
} else if (re[i] == '\\') {
|
||||||
|
FAIL_IF(i >= re_len - 1, SLRE_INVALID_METACHARACTER);
|
||||||
|
if (re[i + 1] == 'x') {
|
||||||
|
/* Hex digit specification must follow */
|
||||||
|
FAIL_IF(re[i + 1] == 'x' && i >= re_len - 3,
|
||||||
|
SLRE_INVALID_METACHARACTER);
|
||||||
|
FAIL_IF(re[i + 1] == 'x' && !(isxdigit(re[i + 2]) &&
|
||||||
|
isxdigit(re[i + 3])), SLRE_INVALID_METACHARACTER);
|
||||||
|
} else {
|
||||||
|
FAIL_IF(!is_metacharacter((unsigned char *) re + i + 1),
|
||||||
|
SLRE_INVALID_METACHARACTER);
|
||||||
|
}
|
||||||
|
} else if (re[i] == '(') {
|
||||||
|
FAIL_IF(info->num_brackets >= (int) ARRAY_SIZE(info->brackets),
|
||||||
|
SLRE_TOO_MANY_BRACKETS);
|
||||||
|
depth++; /* Order is important here. Depth increments first. */
|
||||||
|
info->brackets[info->num_brackets].ptr = re + i + 1;
|
||||||
|
info->brackets[info->num_brackets].len = -1;
|
||||||
|
info->num_brackets++;
|
||||||
|
FAIL_IF(info->num_caps > 0 && info->num_brackets - 1 > info->num_caps,
|
||||||
|
SLRE_CAPS_ARRAY_TOO_SMALL);
|
||||||
|
} else if (re[i] == ')') {
|
||||||
|
int ind = info->brackets[info->num_brackets - 1].len == -1 ?
|
||||||
|
info->num_brackets - 1 : depth;
|
||||||
|
info->brackets[ind].len = (int) (&re[i] - info->brackets[ind].ptr);
|
||||||
|
DBG(("SETTING BRACKET %d [%.*s]\n",
|
||||||
|
ind, info->brackets[ind].len, info->brackets[ind].ptr));
|
||||||
|
depth--;
|
||||||
|
FAIL_IF(depth < 0, SLRE_UNBALANCED_BRACKETS);
|
||||||
|
FAIL_IF(i > 0 && re[i - 1] == '(', SLRE_NO_MATCH);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
FAIL_IF(depth != 0, SLRE_UNBALANCED_BRACKETS);
|
||||||
|
setup_branch_points(info);
|
||||||
|
|
||||||
|
return baz(s, s_len, info);
|
||||||
|
}
|
||||||
|
|
||||||
|
int slre_match(const char *regexp, const char *s, int s_len,
|
||||||
|
struct slre_cap *caps, int num_caps, int flags) {
|
||||||
|
struct regex_info info;
|
||||||
|
|
||||||
|
/* Initialize info structure */
|
||||||
|
info.flags = flags;
|
||||||
|
info.num_brackets = info.num_branches = 0;
|
||||||
|
info.num_caps = num_caps;
|
||||||
|
info.caps = caps;
|
||||||
|
|
||||||
|
DBG(("========================> [%s] [%.*s]\n", regexp, s_len, s));
|
||||||
|
return foo(regexp, (int) strlen(regexp), s, s_len, &info);
|
||||||
|
}
|
60
ROBITools/src/slre.h
Executable file
60
ROBITools/src/slre.h
Executable file
@@ -0,0 +1,60 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
|
||||||
|
* Copyright (c) 2013 Cesanta Software Limited
|
||||||
|
* All rights reserved
|
||||||
|
*
|
||||||
|
* This library is dual-licensed: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License version 2 as
|
||||||
|
* published by the Free Software Foundation. For the terms of this
|
||||||
|
* license, see <http://www.gnu.org/licenses/>.
|
||||||
|
*
|
||||||
|
* You are free to use this library under the terms of the GNU General
|
||||||
|
* Public License, but WITHOUT ANY WARRANTY; without even the implied
|
||||||
|
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||||
|
* See the GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* Alternatively, you can license this library under a commercial
|
||||||
|
* license, as set out in <http://cesanta.com/products.html>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is a regular expression library that implements a subset of Perl RE.
|
||||||
|
* Please refer to README.md for a detailed reference.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SLRE_HEADER_DEFINED
|
||||||
|
#define SLRE_HEADER_DEFINED
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct slre_cap {
|
||||||
|
const char *ptr;
|
||||||
|
int len;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
int slre_match(const char *regexp, const char *buf, int buf_len,
|
||||||
|
struct slre_cap *caps, int num_caps, int flags);
|
||||||
|
|
||||||
|
/* Possible flags for slre_match() */
|
||||||
|
enum { SLRE_IGNORE_CASE = 1 };
|
||||||
|
|
||||||
|
|
||||||
|
/* slre_match() failure codes */
|
||||||
|
#define SLRE_NO_MATCH -1
|
||||||
|
#define SLRE_UNEXPECTED_QUANTIFIER -2
|
||||||
|
#define SLRE_UNBALANCED_BRACKETS -3
|
||||||
|
#define SLRE_INTERNAL_ERROR -4
|
||||||
|
#define SLRE_INVALID_CHARACTER_SET -5
|
||||||
|
#define SLRE_INVALID_METACHARACTER -6
|
||||||
|
#define SLRE_CAPS_ARRAY_TOO_SMALL -7
|
||||||
|
#define SLRE_TOO_MANY_BRANCHES -8
|
||||||
|
#define SLRE_TOO_MANY_BRACKETS -9
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* SLRE_HEADER_DEFINED */
|
BIN
ROBITools/src/slre.o
Normal file
BIN
ROBITools/src/slre.o
Normal file
Binary file not shown.
190
dev_notes.txt
Normal file
190
dev_notes.txt
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
package.skeleton("ROBITools",c("robitools.motu.count","robitools.motus",
|
||||||
|
"robitools.reads","robitools.samples",
|
||||||
|
"robitools.sample.count",))
|
||||||
|
|
||||||
|
|
||||||
|
#include <R.h>
|
||||||
|
#include <Rinternals.h>
|
||||||
|
|
||||||
|
static void cooked_goose(SEXP foo)
|
||||||
|
{
|
||||||
|
if (TYPEOF(foo) != EXTPTRSXP)
|
||||||
|
error("argument not external pointer");
|
||||||
|
double *x = (double *) R_ExternalPtrAddr(foo);
|
||||||
|
int blather = x[0];
|
||||||
|
Free(x);
|
||||||
|
if (blather)
|
||||||
|
printf("finalizer ran\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP blob(SEXP nin, SEXP blatherin)
|
||||||
|
{
|
||||||
|
if (! isInteger(nin))
|
||||||
|
error("n not integer");
|
||||||
|
int n = INTEGER(nin)[0];
|
||||||
|
if (! (n > 0))
|
||||||
|
error("n not positive");
|
||||||
|
if (! isLogical(blatherin))
|
||||||
|
error("blather not logical");
|
||||||
|
int blather = LOGICAL(blatherin)[0];
|
||||||
|
|
||||||
|
double *x = Calloc(n + 2, double);
|
||||||
|
|
||||||
|
GetRNGstate();
|
||||||
|
for (int i = 0; i < n; ++i)
|
||||||
|
x[i + 2] = norm_rand();
|
||||||
|
PutRNGstate();
|
||||||
|
x[1] = n;
|
||||||
|
x[0] = blather;
|
||||||
|
|
||||||
|
SEXP bar;
|
||||||
|
PROTECT(bar = R_MakeExternalPtr(x, R_NilValue, R_NilValue));
|
||||||
|
R_RegisterCFinalizer(bar, cooked_goose);
|
||||||
|
UNPROTECT(1);
|
||||||
|
return bar;
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP blub(SEXP foo)
|
||||||
|
{
|
||||||
|
if (TYPEOF(foo) != EXTPTRSXP)
|
||||||
|
error("argument not external pointer");
|
||||||
|
|
||||||
|
double *x = (double *) R_ExternalPtrAddr(foo);
|
||||||
|
int blather = x[0];
|
||||||
|
int n = x[1];
|
||||||
|
|
||||||
|
SEXP bar;
|
||||||
|
PROTECT(bar = allocVector(REALSXP, n));
|
||||||
|
for (int i = 0; i < n; ++i)
|
||||||
|
REAL(bar)[i] = x[i + 2];
|
||||||
|
UNPROTECT(1);
|
||||||
|
return bar;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
blob <- function(n, blather = FALSE) {
|
||||||
|
stopifnot(is.numeric(n))
|
||||||
|
stopifnot(as.integer(n) == n)
|
||||||
|
stopifnot(n > 0)
|
||||||
|
stopifnot(is.logical(blather))
|
||||||
|
.Call("blob", as.integer(n), blather)
|
||||||
|
}
|
||||||
|
|
||||||
|
blub <- function(x) {
|
||||||
|
stopifnot(class(x) == "externalptr")
|
||||||
|
.Call("blub", x)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Hi Robert,
|
||||||
|
|
||||||
|
It looks like there is no way to explicitly make an S4 object call a
|
||||||
|
function when it is garbage collected unless you resort to tricks with
|
||||||
|
reg.finalizer.
|
||||||
|
|
||||||
|
It turns out that Prof. Ripley's reply (thanks!!) had enough hints in it
|
||||||
|
that I was able to get the effect I wanted by using R's external pointer
|
||||||
|
facility. In fact it works quite nicely.
|
||||||
|
|
||||||
|
In a nutshell, I create a C++ object (with new) and then wrap its pointer
|
||||||
|
with an R external pointer using
|
||||||
|
SEXP rExtPtr = R_MakeExternalPtr( cPtr, aTag, R_NilValue);
|
||||||
|
|
||||||
|
Where cPtr is the C++/C pointer to the object and aTag is an R symbol
|
||||||
|
describing the pointer type [e.g. SEXP aTag =
|
||||||
|
install("this_is_a_tag_for_a_pointer_to_my_object")]. The final argument is
|
||||||
|
"a value to protect". I don't know what this means, but all of the examples
|
||||||
|
I saw use R_NilValue.
|
||||||
|
|
||||||
|
If you want a C++ function to be called when R loses the reference to the
|
||||||
|
external pointer (actually when R garbage collects it, or when R quits), do
|
||||||
|
R_RegisterCFinalizerEx( rExtPtr, (R_CFinalizer_t)functionToBeCalled, TRUE );
|
||||||
|
|
||||||
|
The TRUE means that R will call the "functionToBeCalled" if the pointer is
|
||||||
|
still around when R quits. I guess if you set it to FALSE, then you are
|
||||||
|
assuming that your shell can delete memory and/or release resources when R
|
||||||
|
quits.
|
||||||
|
|
||||||
|
So return this external pointer to R (the function that new'ed it was called
|
||||||
|
by .Call or something similar) and stick it in a slot of your object. Then
|
||||||
|
when your object is garbage collected, "functionToBeCalled" will be called.
|
||||||
|
The slot would have the type "externalptr".
|
||||||
|
|
||||||
|
The functionToBeCalled contains the code to delete the C++ pointer or
|
||||||
|
release resources, for example...
|
||||||
|
|
||||||
|
SEXP functionToBeCalled( SEXP rExtPtr ) {
|
||||||
|
// Get the C++ pointer
|
||||||
|
MyThing* ptr = R_ExternalPtrAddr(rExtPtr);
|
||||||
|
|
||||||
|
// Delete it
|
||||||
|
delete ptr;
|
||||||
|
|
||||||
|
// Clear the external pointer
|
||||||
|
R_ClearExternalPtr(rExtPtr);
|
||||||
|
|
||||||
|
return R_NilValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
And there you have it.
|
||||||
|
|
||||||
|
There doesn't seem to be any official documentation on this stuff (at least
|
||||||
|
none that I could find). The best references I found are on the R developers
|
||||||
|
web page. See the links within "some notes on _references, external
|
||||||
|
objects, or mutable state_ for R and a _simple implementation_ of external
|
||||||
|
references and finalization". Note that the documents are slightly out of
|
||||||
|
date (the function names have apparently been changed somewhat). The latter
|
||||||
|
one has some examples that are very helpful. And as Prof. Ripley pointed
|
||||||
|
out, RODBC uses this facility too, so look at that code.
|
||||||
|
|
||||||
|
Hope this was useful. Good luck.
|
||||||
|
|
||||||
|
|
||||||
|
SEXP
|
||||||
|
get(SEXP ext)
|
||||||
|
{
|
||||||
|
return mkString((char *) R_ExternalPtrAddr(ext));
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP
|
||||||
|
|
||||||
|
set(SEXP ext, SEXP str)
|
||||||
|
{
|
||||||
|
char *x = (char *) R_ExternalPtrAddr(ext);
|
||||||
|
snprintf(x, N_MAX, CHAR(STRING_ELT(str, 0)));
|
||||||
|
return ScalarLogical(TRUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
> dyn.load("tmp.so")
|
||||||
|
> x <- .Call("create", list("info could be any R object", 1:5))
|
||||||
|
> .Call("get", x)
|
||||||
|
[1] "my name is joe"
|
||||||
|
> ## reference semantics!
|
||||||
|
> .Call("set", x, "i am sam i am")
|
||||||
|
[1] TRUE
|
||||||
|
> .Call("get", x)
|
||||||
|
[1] "i am sam i am"
|
||||||
|
> x <- NULL
|
||||||
|
> gc()
|
||||||
|
finalizing
|
||||||
|
used (Mb) gc trigger (Mb) max used (Mb)
|
||||||
|
Ncells 339306 18.2 467875 25 407500 21.8
|
||||||
|
Vcells 202064 1.6 786432 6 380515 3.0
|
||||||
|
|
||||||
|
|
||||||
|
SEXP
|
||||||
|
incr(SEXP ext)
|
||||||
|
{
|
||||||
|
struct Foo *foo = (struct Foo*) R_ExternalPtrAddr(ext);
|
||||||
|
foo->x += 1;
|
||||||
|
return ScalarInteger(foo->x);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
library(ROBITools)
|
||||||
|
library.dynam('ROBITools.so')
|
||||||
|
t=.Call('R_read_taxonomy','ecochange',TRUE)
|
||||||
|
.Call('R_get_scientific_name',t,as.integer(7742))
|
Reference in New Issue
Block a user