commit 05efb38fed446afa73bcfe159280b0c5ca1a69cd Author: Eric Coissac Date: Wed Jan 13 10:05:51 2016 +0100 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4bf7159 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/man/ +/vignettes/ +/Read-and-delete-me diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..c1eafea --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,20 @@ +Package: ROBITaxonomy +Type: Package +Title: Metabarcoding data biodiversity analysis +Version: 0.1 +Date: 2012-08-23 +Author: LECA - Laboratoire d'ecologie alpine +Maintainer: LECA OBITools team +Description: More about what it does (maybe more than one line) +License: CeCILL v2.0 +LazyLoad: yes +Roxygen: list(wrap = FALSE) +Collate: + 'ROBITaxonomy.R' + 'taxonomy.R' + 'basic.R' + 'default.R' + 'distance.R' + 'lca.R' + 'rank.R' +RoxygenNote: 5.0.1 diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..6d3b1ea --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,28 @@ +# Generated by roxygen2: do not edit by hand + +export(default.taxonomy) +export(distance.taxonomy) +export(ecofind) +export(family) +export(genus) +export(is.obitools.taxonomy) +export(is.subcladeof) +export(kingdom) +export(length.obitools.taxonomy) +export(longest.path) +export(lowest.common.ancestor) +export(max.obitools.taxonomy) +export(parent) +export(path) +export(rank.list) +export(read.taxonomy) +export(scientificname) +export(species) +export(superkingdom) +export(taxid.list) +export(taxonatrank) +export(taxonomicrank) +export(validate) +exportClasses(obitools.taxonomy) +exportClasses(obitools.taxonomyOrNULL) +useDynLib(ROBITaxonomy) diff --git a/R/ROBITaxonomy.R b/R/ROBITaxonomy.R new file mode 100644 index 0000000..e69de29 diff --git a/R/basic.R b/R/basic.R new file mode 100644 index 0000000..6600492 --- /dev/null +++ b/R/basic.R @@ -0,0 +1,359 @@ +#' @include taxonomy.R +NULL + +#' @export +setGeneric("scientificname", function(taxonomy,taxid) { + return(standardGeneric("scientificname")) +}) + +#' Returns the scientific name corresponding to a \emph{NCBI taxid} +#' +#' \code{scientificname} function in package \pkg{\link{ROBITaxonomy}} returns the +#' scientific name corresponding to a \emph{NCBI taxid}. +#' +#' @param taxonomy a \code{\link{obitools.taxonomy}} instance +#' @param taxid an integer value or a vector of integer representing NCBI +#' taxonomic identifiers. +#' @return The scientific name of the corresponding taxon as a string or a +#' vector of string if the \code{taxid} argument is itself a vector +#' +#' @examples +#' # load the default taxonomy database include in the ROBITaxonomy library +#' taxo=default.taxonomy() +#' +#' # build a vector of 6 taxids corresponding to species +#' sp.taxid=c(7000,7004,7007,7009,7010,7011) +#' +#' # look for the scientific names correponding to these taxids +#' scientificname(taxo,sp.taxid) +#' +#' @seealso class \code{\linkS4class{obitools.taxonomy}} +#' +#' +#' @docType methods +#' @rdname scientificname-methods +#' @aliases scientificname-methods,obitools.taxonomy +#' @author Eric Coissac +#' +setMethod("scientificname", "obitools.taxonomy",function(taxonomy,taxid) { + getscname = function(t) { + if (is.na(t)) + return(NA) + else + return( .Call('R_get_scientific_name', + taxonomy, + t, + PACKAGE="ROBITaxonomy")) + } + + taxid = as.integer(taxid) + sapply(taxid,getscname) +}) + + +###################################################################### +###################################################################### + + +#' @export +setGeneric("parent", function(taxonomy,taxid,name=FALSE) { + return(standardGeneric("parent")) +}) + +#' Returns the parent taxon corresponding to a \emph{NCBI taxid} +#' +#' \code{parent} function in package \pkg{\link{ROBITaxonomy}} returns the +#' parent taxon corresponding to a \emph{NCBI taxid}. +#' +#' @param taxonomy a \code{\link{obitools.taxonomy}} instance +#' @param taxid an integer value or a vector of integer representing NCBI +#' taxonomic identifiers. +#' @param name A logical value \code{TRUE} or \code{FALSE} indicating if the +#' method returns a taxid or a scientific name. +#' +#' @return \describe{ \item{If \code{name==FALSE}}{the taxid of the +#' parent taxon as an integer or a vector of +#' integers if the \code{taxid} argument is itself +#' a vector} +#' \item{If \code{name==TRUE}}{the scientific name of the +#' parent taxon as a string or a vector of +#' string if the \code{taxid} argument is itself a +#' vector} } +#' +#' @examples +#' # load the default taxonomy database include in the ROBITaxonomy library +#' taxo=default.taxonomy() +#' +#' # build a vector of 6 taxids corresponding to species +#' sp.taxid=c(7000,7004,7007,7009,7010,7011) +#' +#' # look for the parent taxa correponding to these taxids +#' parent(taxo,sp.taxid) +#' +#' # same things but scientific names are returned +#' parent(taxo,sp.taxid,TRUE) +#' +#' @seealso class \code{\linkS4class{obitools.taxonomy}} +#' +#' +#' @docType methods +#' @rdname parent-methods +#' @aliases parent-methods,obitools.taxonomy +#' @author Eric Coissac +#' +setMethod("parent", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) { + getp = function(t) { + if (is.na(t)) + return(NA) + else + return(.Call('R_get_parent', + taxonomy, + as.integer(t), + name, + PACKAGE="ROBITaxonomy")) + } + + taxid = as.integer(taxid) + name = as.logical(name[1]) + sapply(taxid,getp) +}) + + + +###################################################################### +###################################################################### + + + +#' @export +setGeneric("taxid.list", function(taxonomy) { + return(standardGeneric("taxid.list")) +}) + + +#' Returns the list of all taxids belonging the taxonomy. +#' +#' \code{taxid.list} returns the list of all taxids included in the +#' instance of the class \code{\linkS4class{obitools.taxonomy}} +#' +#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use. +#' +#' @return an \code{integer} vector containing the list of taxids. +#' +#' @examples +#' # loads the default taxonomy database +#' taxo=default.taxonomy() +#' +#' # returns the count of taxa described in the taxonomy +#' length(taxo) +#' +#' # extracts the list of all valid taxids +#' good = taxid.list(taxo) +#' +#' # returns the size of the returned list +#' length(good) +#' +#' @seealso \code{\linkS4class{obitools.taxonomy}} +#' +#' @author Eric Coissac +#' @keywords taxonomy +#' @docType methods +#' @rdname taxid.list-method +#' @aliases taxid.list +#' +setMethod("taxid.list", "obitools.taxonomy", + function(taxonomy) { + return(.Call('R_taxid_list', + taxonomy, + PACKAGE="ROBITaxonomy")) + }) + +###################################################################### +###################################################################### + + + +#' Returns the count of taxa in the taxonomy. +#' +#' \code{length} returns the count of taxa included in the +#' instance of the class \code{\linkS4class{obitools.taxonomy}} +#' +#' @param x the \code{\linkS4class{obitools.taxonomy}} to use. +#' +#' @return an \code{integer} corresponding to the count of taxa. +#' +#' @examples +#' # loads the default taxonomy database +#' taxo=default.taxonomy() +#' +#' # returns the count of taxa described in the taxonomy +#' length(taxo) +#' +#' @seealso \code{\link{length}}, \code{\linkS4class{obitools.taxonomy}} +#' +#' @author Eric Coissac +#' @keywords taxonomy +#' @export length.obitools.taxonomy +#' +length.obitools.taxonomy = function(x) +{ + return(.Call('R_length_taxonomy', + x, + PACKAGE="ROBITaxonomy")) +} + + +###################################################################### +###################################################################### + +setGeneric('max') + +#' Returns the maximum taxid in the taxonomy. +#' +#' \code{length} returns the maximum taxid included in the +#' instance of the class \code{\linkS4class{obitools.taxonomy}} +#' +#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use. +#' @param na.rm included for compatibility purpose, this parameter as +#' no effect on this implementation of \code{max} +#' +#' @return an \code{integer} corresponding to the count of taxa. +#' +#' @examples +#' # load the default taxonomy database +#' taxo=default.taxonomy() +#' +#' # gets the larger taxid of the database +#' max(taxo) +#' +#' @seealso \code{\link{max}}, \code{\linkS4class{obitools.taxonomy}} +#' +#' @author Eric Coissac +#' @keywords taxonomy +#' @export max.obitools.taxonomy +#' +max.obitools.taxonomy=function(taxonomy,na.rm = FALSE) { + return(.Call('R_max_taxid', + taxonomy, + PACKAGE="ROBITaxonomy")) +} + +#' @export +setGeneric("ecofind", function(taxonomy,patterns,rank=NULL,alternative=FALSE) { + return(standardGeneric("ecofind")) +}) + +#' Returns taxids associated to the names +#' +#' Return the set of taxids having their name matching the given pattern. +#' +#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use. +#' @param patterns one or several regular pattern used to select the the taxa. +#' @param rank a \code{character} indicating a taxonomic rank. If not \code{NULL} +#' only taxids correponding to this rank are returned. +#' @param alternative A logical value \code{TRUE} or \code{FALSE} indicating +#' if the function must only look for a scientific name. +#' +#' @return if just one pattern is given, an integer vector is returned with the +#' corresponding taxids. If a list of patterns is given, the function +#' returns a list of integer vectors, each vector containing the taxids +#' corresponding to a pattern. The returned list is in the same order +#' than the given patern list. +#' +#' @examples +#' # load the default taxonomy database +#' taxo=default.taxonomy() +#' +#' # retreives the Vertebrata taxid +#' taxid = ecofind(taxo,"Vertebrata") +#' +#' taxid +#' scientificname(taxo,taxid) +#' +#' +#' taxid = ecofind(taxo,"^Vertebrata$") +#' +#' taxid +#' scientificname(taxo,taxid) +#' +#' +#' @author Eric Coissac +#' @keywords taxonomy +#' @docType methods +#' @rdname ecofind-method +#' @aliases ecofind,obitools.taxonomy +#' +setMethod("ecofind", "obitools.taxonomy",function(taxonomy,patterns,rank=NULL,alternative=FALSE) { + getp = function(t) { + if (is.na(t)) + return(NA) + else + return(unique(.Call('R_ecofind', + taxonomy, + t, + rank, + alternative, + PACKAGE="ROBITaxonomy"))) + } + + patterns = as.character(patterns) + taxid=lapply(patterns,getp) + if (length(taxid)==1) + taxid=taxid[[1]] + + return(taxid) +}) + + + +#' @export +setGeneric("validate", function(taxonomy,taxid) { + return(standardGeneric("validate")) +}) + +#' Checks that a \emph{taxid} is really present in taxonomy +#' +#' \code{validate} function in package \pkg{\link{ROBITaxonomy}} checks +#' that a \emph{taxid} is declared in the considered taxonomy. +#' +#' @param taxonomy a \code{\link{obitools.taxonomy}} instance +#' @param taxid an integer value or a vector of integer representing NCBI +#' taxonomic identifiers. +#' +#' @return The taxid if it exists, NA otherwise. If the input taxid is a +#' vector of integer returns an integer vector composed of validated +#' taxids and NA values. +#' +#' @examples +#' # load the default taxonomy database include in the ROBITaxonomy library +#' taxo=default.taxonomy() +#' +#' # build a vector of 101 taxids +#' sp.taxid=c(7000:7100) +#' +#' # checks the list of taxids +#' validate(taxo,sp.taxid) +#' +#' @seealso class \code{\linkS4class{obitools.taxonomy}} +#' +#' +#' @docType methods +#' @rdname validate-methods +#' @aliases validate-methods,obitools.taxonomy +#' @author Eric Coissac +#' +setMethod("validate", "obitools.taxonomy",function(taxonomy,taxid) { + getp = function(t) { + if (is.na(t)) + return(NA) + else + return(.Call('R_validate_taxid', + taxonomy, + t, + PACKAGE="ROBITaxonomy")) + } + + taxid = as.integer(taxid) + sapply(taxid,getp) +}) + diff --git a/R/default.R b/R/default.R new file mode 100644 index 0000000..e298b39 --- /dev/null +++ b/R/default.R @@ -0,0 +1,52 @@ +#' @include taxonomy.R +NULL + + +# +# +# Manage le loading of the default taxonomy +# +# + +.__default__taxonomy__ = NULL + +#' Returns the default taxonomy +#' +#' Returns a \code{\linkS4class{obitools.taxonomy}} instance corresponding +#' to a NCBI taxonomy included by default in the \pkg{\link{ROBITaxonomy}} package. +#' +#' @return a \code{\linkS4class{obitools.taxonomy}} instance. +#' +#' @examples +#' +#' # Load the default taxonomy +#' taxo = default.taxonomy() +#' +#' # and use it for requesting a scientific name +#' scientificname(taxo,7742) +#' +#' @seealso \code{\linkS4class{obitools.taxonomy}} +#' +#' @author Eric Coissac +#' @keywords taxonomy +#' @export +#' +default.taxonomy = function() { + if (is.null(get(".__default__taxonomy__",envir = environment()))) + assign(".__default__taxonomy__", + read.taxonomy(paste(system.file("extdata", + package="ROBITaxonomy"), + 'ncbitaxo', + sep='/')), + envir=globalenv()) + + return(get(".__default__taxonomy__",envir = globalenv())) +} + + +#' @export +#' +is.obitools.taxonomy = function(taxonomy) { + class(t)[1] == "obitools.taxonomy" +} + diff --git a/R/distance.R b/R/distance.R new file mode 100644 index 0000000..2632205 --- /dev/null +++ b/R/distance.R @@ -0,0 +1,189 @@ +#' @include taxonomy.R +NULL + +#' @export +setGeneric("longest.path", function(taxonomy,taxid) { + return(standardGeneric("longest.path")) +}) + +#' Returns the longuest path from a taxon. +#' +#' The method \code{longest.path} returns the length of the +#' path linking a taxid to the farest leaf belonging this taxid. +#' +#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use. +#' +#' @param taxid an \code{integer} vector containing the list of taxids. +#' +#' @return an \code{integer} vector containing the list length. +#' +#' @examples +#' # loads the default taxonomy database +#' taxo=default.taxonomy() +#' +#' # returns the longest path in the taxonomy (from the root node) +#' longest.path(taxo,1) +#' +#' +#' @seealso \code{\linkS4class{obitools.taxonomy}} +#' +#' @author Eric Coissac +#' @keywords taxonomy +#' @docType methods +#' @rdname longest.path-method +#' @aliases longest.path,obitools.taxonomy +#' +setMethod("longest.path", "obitools.taxonomy", + function(taxonomy,taxid) { + getp = function(t) { + if (is.na(t)) + return(NA) + else + return(.Call('R_longest_path', + taxonomy, + t, + PACKAGE="ROBITaxonomy")) + } + + taxid = as.integer(taxid) + sapply(taxid,getp) + }) + + +#' @export +setGeneric("distance.taxonomy", function(taxonomy,taxid1,taxid2=NULL,name=F) { + return(standardGeneric("distance.taxonomy")) +}) + + +#' Computes a distance matrix between taxids +#' +#' The method \code{taxonomy.distance} computes a distance matrix between a +#' set of taxids. The distance between two taxa is based on the topology of +#' the taxonomomy tree. +#' +#' \deqn{ d(Taxon_A,Taxon_B) = \frac{longest.path(lca(Taxon_A,Taxon_B))}{max(longest.path(Taxon_A),longest.path(Taxon_B))}} +#' { longest.path(lca(Taxon_A,Taxon_B)) / max(longest.path(Taxon_A),longest.path(Taxon_B)) } +#' +#' +#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use. +#' +#' @param taxid1 an \code{integer} vector containing a list of taxids. +#' +#' @param taxid2 an \code{integer} vector containing a list of taxids. +#' If \code{taxid2} is set to \code{NULL} (it's default value) +#' then the \code{taxid2} list is considered as equal to +#' \code{taxid1} list. +#' @param name A logical value \code{TRUE} or \code{FALSE} indicating +#' if the method return distance matrix annotated by taxids or +#' by scientific names. +#' +#' @return the distance matrix between taxids specified in the \code{taxid1} +#' set and the \code{taxid2} set. +#' +#' @examples +#' # loads the default taxonomy database +#' taxo=default.taxonomy() +#' +#' # build a vector of 6 taxids corresponding to species +#' sp.taxid=c(7000,7004,7007,7009,7010,7011) +#' +#' # computes the distance matrix between taxids +#' distance.taxonomy(taxo,sp.taxid) +#' +#' # Same thing but the matrix is annotated by scientific names +#' distance.taxonomy(taxo,sp.taxid,name=TRUE) +#' +#' @seealso \code{\link{longest.path}} +#' +#' @author Eric Coissac +#' @keywords taxonomy +#' @docType methods +#' @rdname distance.taxonomy-method +#' @aliases taxonomy.distance,obitools.taxonomy +#' +setMethod("distance.taxonomy", "obitools.taxonomy", + function(taxonomy,taxid1,taxid2=NULL,name=F) { + taxdist = function(r) + { + t1=r[1] + t2=r[2] + if (is.na(t1) | is.na(t2)) + return(NA) + + p1 = path(taxonomy,t1) + p2 = path(taxonomy,t2) + + minp = min(length(p1),length(p2)) + common = sum(p1[1:minp] == p2[1:minp]) + lca = p1[common] + lp = longest.path(taxonomy,lca) + return(lp/(lp+common)) + } + + multitaxdist=function(t1,t2) { + apply(data.frame(t1,t2),1,taxdist) + } + + taxid1 = taxid1[! is.na(validate(taxonomy,taxid1))] + t1 = path(taxonomy,taxid1) + + same = is.null(taxid2) + + if (same) + { + ntaxon = length(taxid1) + t2 = t1[unlist(sapply(2:ntaxon, + function(x) x:ntaxon))] + t1 = t1[rep(1:(ntaxon-1),(ntaxon-1):1)] + } + else + { + taxid2 = taxid2[! is.na(validate(taxonomy,taxid2))] + t2 = path(taxonomy,taxid2) + nt1 = length(taxid1) + nt2 = length(taxid2) + t1 = t1[rep(1:nt1,nt2)] + t2 = t2[rep(1:nt2,rep(nt1,nt2))] + } + + lmin = mapply(function(a,b) min(length(a),length(b)), + t1, + t2) + + llca = mapply(function(x,y,l) sum(x[1:l]==y[1:l]), + t1, + t2, + lmin) + + lb = longest.path(taxonomy,mapply(function(x,y) x[y],t1,llca)) + d = as.double(lb / (lb + llca)) + + if (same) { + attr(d, "Size") <- ntaxon + if (name) + attr(d, "Labels") <- scientificname(taxonomy,taxid1) + else + attr(d, "Labels") <- as.character(taxid1) + attr(d, "Diag") <- FALSE + attr(d, "Upper") <- FALSE + attr(d, "method") <- NULL + attr(d, "call") <- match.call() + class(d) <- "dist" + } + else { + if (name) + d = matrix(d,nt1,nt2, + dimnames=list(scientificname(taxonomy,taxid1), + scientificname(taxonomy,taxid2))) + else + d = matrix(d,nt1,nt2, + dimnames=list(as.character(taxid1), + as.character(taxid2))) + + } + + return(d) + }) + + diff --git a/R/lca.R b/R/lca.R new file mode 100644 index 0000000..31d981e --- /dev/null +++ b/R/lca.R @@ -0,0 +1,122 @@ +#' @include taxonomy.R +NULL + +#' @export +setGeneric("lowest.common.ancestor", function(taxonomy,taxid,threshold=1.0,error=0,name=FALSE) { + return(standardGeneric("lowest.common.ancestor")) +}) + +#' Computes the lowest common ancestor in the taxonomy tree between a set of taxa +#' +#' The \code{lowest.common.ancestor} function in package \pkg{ROBITaxonomy} computes +#' the lowest common ancestor of a set of taxids. The lowest common ancestor (LCA) +#' is the most precise taxonomic group shared by all the considered taxa. Tha +#' \code{lowest.common.ancestor} function implemented in the \pkg{ROBITaxonomy} +#' package, considers a fuzzy definition of the LCA as the most precise +#' taxonomic group shared by a quorum of the considered taxa. +#' +#' @param taxonomy an instance of \code{\linkS4class{obitools.taxonomy}} +#' @param taxid an integer value or a vector of integer representing NCBI +#' taxonomic identifiers. +#' @param threshold a numeric value between 0.0 and 1.0 indicating the minimum +#' quorum of taxid that must belong the LCA. +#' @param error an integer value indicating the maximum count of taxids that +#' have not to belong the returned taxid. A \code{threshold} below 1.0 have +#' priority on the \code{error} parameter. +#' @param name A logical value \code{TRUE} or \code{FALSE} indicating if the +#' method return a \emph{taxid} or a scientific name. +#' +#' @return Depending on the value of the \code{name} argument, set by default +#' to \code{FALSE} the method returns : +#' \describe{ +#' \item{If \code{name==FALSE}}{ the taxid of the taxon corresponding +#' to the LCA as an integer value} +#' \item{If \code{name==TRUE}}{ the scientific name of the taxon +#' corresponding to the LCA as a string} +#' } +#' +#' @examples +#' require(ROBITaxonomy) +#' +#' \dontshow{# switch the working directory to the data package directory} +#' \dontshow{setwd(system.file("extdata", package="ROBITaxonomy"))} +#' +#' # read the taxonomy database +#' +#' taxo=read.taxonomy('ncbitaxo') +#' +#' # build a vector of 6 taxids corresponding to species +#' +#' sp.taxid=c(7000,7004,7007,7009,7010,7011) +#' +#' # look for the lowest common ancestor taxids +#' +#' lowest.common.ancestor(taxo,sp.taxid) +#' +#' # same thing but returns results as a vector of scientific names +#' lowest.common.ancestor(taxo,sp.taxid,name=TRUE) +#' +#' # If we accept than 2 or 1 taxa do not belong the LCA +#' lowest.common.ancestor(taxo,sp.taxid,name=TRUE,error=2) +#' lowest.common.ancestor(taxo,sp.taxid,name=TRUE,error=1) +#' +#' # Partial LCA can also be speciefied as the minimal frequency of +#' # taxa belonging the LCA +#' lowest.common.ancestor(taxo,sp.taxid,name=TRUE,threshold=0.8) +#' +#' @seealso class \code{\linkS4class{obitools.taxonomy}}, +#' and methods \code{\link{path}}, \code{\link{parent}}, +#' +#' @author Eric Coissac +#' @keywords taxonomy +#' @docType methods +#' @rdname lowest.common.ancestor-method +#' @aliases lowest.common.ancestor,obitools.taxonomy +#' +setMethod("lowest.common.ancestor", "obitools.taxonomy", + function(taxonomy,taxid,threshold=1.0,error=0,name=FALSE) { + + if (threshold != 1.0) + error=as.integer(floor(length(taxid) * (1-threshold))) + + + # + # Remove nod valid taxid + # + + taxid = validate(taxonomy,taxid) + if (any(is.na(taxid))) + return(NA) + + ntaxid=length(taxid) + nok = ntaxid - error + if (ntaxid==1) + return(taxid) + + allpath = path(taxonomy,taxid) + minlength= min(vapply(allpath,length,0)) + + lca=NA + for (i in 1:minlength) { + + n = vapply(allpath,function(x) x[i],0) + nt = table(n) + mt = max(nt) + if (mt >= nok) { + p = nt[nt==mt] + if (length(p)==1) + lca=as.integer(names(p)[1]) + else + break + } + else + break + } + + if (name) + return(scientificname(taxonomy,lca)) + else + return(lca) + }) + + diff --git a/R/rank.R b/R/rank.R new file mode 100644 index 0000000..a2228b6 --- /dev/null +++ b/R/rank.R @@ -0,0 +1,483 @@ +#' @include taxonomy.R +NULL + +#' @export +setGeneric("rank.list", function(taxonomy) { + return(standardGeneric("rank.list")) +}) + +#' Returns the list of taxonomic ranks +#' +#' The \code{rank.list} function returns the list of all taxonomic +#' ranks described in the taxonomy +#' +#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance +#' +#' @return a vector of type \code{character} containing the taxonomic rank names +#' +#' @examples +#' # read the taxonomy database +#' taxo=default.taxonomy() +#' +#' # returns the taxonomic rank for all taxid between 1000 and 1020 +#' rank.list(taxo) +#' +#' @docType methods +#' @rdname rank.list-methods +#' @aliases rank.list-methods,obitools.taxonomy +#' @author Eric Coissac +#' @keywords taxonomy +#' +setMethod("rank.list", "obitools.taxonomy", + function(taxonomy) { + return(.Call('R_rank_list', + taxonomy, + PACKAGE="ROBITaxonomy")) + }) + + +#' @export +setGeneric("taxonomicrank", function(taxonomy,taxid) { + return(standardGeneric("taxonomicrank")) +}) + +#' Returns the taxonomic rank associated to a taxid +#' +#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance +#' @param taxid a vector of taxid to analyse +#' +#' @return a vector of type \code{character} containing the taxonomic ranks +#' +#' @examples +#' # read the taxonomy database +#' taxo=default.taxonomy() +#' +#' # returns the taxonomic rank for all taxid between 1000 and 1020 +#' taxonomicrank(taxo,1000:1020) +#' +#' @docType methods +#' @rdname taxonomicrank-methods +#' @aliases taxonomicrank-methods,obitools.taxonomy +#' @author Eric Coissac +#' @keywords taxonomy +#' +setMethod("taxonomicrank", "obitools.taxonomy",function(taxonomy,taxid) { + taxid = as.integer(taxid) + return(.Call('R_get_rank', + taxonomy, + taxid, + PACKAGE="ROBITaxonomy")) +}) + +#' @export +setGeneric("taxonatrank", function(taxonomy,taxid,rank,name=FALSE) { + return(standardGeneric("taxonatrank")) +}) + +#' Extracts the taxid at a specified taxonomic rank. +#' +#' The \code{taxonatrank} method of \code{\linkS4class{obitools.taxonomy}} class +#' returns the \emph{taxid} or the scientific name corresponding +#' to a \emph{taxid}.at a specified taxonomic rank +#' +#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance +#' @param taxid a vector of taxid to analyse +#' @param rank a \code{character} indicating the desired rank +#' @param name A logical value \code{TRUE} or \code{FALSE} indicating +#' if the method return a taxid or a scientific name. +#' +#' @return \describe{ +#' \item{If \code{name==FALSE}}{the taxid of the corresponding +#' taxon as an integer or a vector of integers +#' if the \code{taxid} argument is itself +#' a vector} +#' \item{If \code{name==TRUE}}{the scientific name of the corresponding +#' taxon as a string or a vector of string +#' if the \code{taxid} argument is itself +#' a vector} +#' } +#' +#' @examples +#' # read the taxonomy database +#' taxo=default.taxonomy() +#' +#' # build a vector of 6 taxids corresponding to species +#' sp.taxid=c(7000,7004,7007,7009,7010,7011) +#' +#' # look for the subfamily taxids +#' taxonatrank(taxo,sp.taxid,"subfamily") +#' +#' # same thing but returns results as a vector of scientific names +#' taxonatrank(taxo,sp.taxid,"subfamily",TRUE) +#' +#' @seealso class \code{\linkS4class{obitools.taxonomy}}, +#' and methods \code{\link{species}},\code{\link{genus}}, +#' \code{\link{family}},\code{\link{kingdom}}, +#' \code{\link{superkingdom}} +#' +#' @docType methods +#' @rdname taxonatrank-methods +#' @aliases taxonatrank-methods,obitools.taxonomy +#' @author Eric Coissac +#' @keywords taxonomy +#' +setMethod("taxonatrank", "obitools.taxonomy",function(taxonomy,taxid,rank,name=FALSE) { + getsp = function(t) { + if (is.na(t[1]) | is.na(t[2])) + return(NA) + else + return(.Call('R_findtaxonatrank',taxonomy, + as.integer(t[1]), + t[2], + name, + PACKAGE="ROBITaxonomy")) + } + + rank = as.character(rank) + name = as.logical(name[1]) + + apply(data.frame(taxid,rank),1,getsp) +}) + + +#' @export +setGeneric("species", function(taxonomy,taxid,name=FALSE) { + return(standardGeneric("species")) +}) + +#' Extracts the species corresponding to a taxid +#' +#' The \code{species} method of \code{\linkS4class{obitools.taxonomy}} class +#' returns the \emph{taxid} or the scientific name of the species corresponding +#' to a \emph{taxid}. +#' +#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance +#' @param taxid a vector of taxid to analyse +#' @param name A logical value \code{TRUE} or \code{FALSE} indicating +#' if the method return a taxid or a scientific name. +#' +#' @return \describe{ +#' \item{If \code{name==FALSE}}{the taxid of the corresponding +#' taxon as an integer or a vector of integers +#' if the \code{taxid} argument is itself +#' a vector} +#' \item{If \code{name==TRUE}}{the scientific name of the corresponding +#' taxon as a string or a vector of string +#' if the \code{taxid} argument is itself +#' a vector} +#' } +#' +#' @examples +#' # read the taxonomy database +#' taxo=default.taxonomy() +#' +#' # build a vector of 6 taxids corresponding to species +#' sp.taxid=c(7000,7004,7007,7009,7010,7011) +#' +#' # look for the species taxids +#' species(taxo,sp.taxid) +#' +#' # same thing but returns results as a vector of scientific names +#' species(taxo,sp.taxid,TRUE) +#' +#' @seealso class \code{\linkS4class{obitools.taxonomy}}, +#' and methods \code{\link{taxonatrank}},\code{\link{genus}}, +#' \code{\link{family}},\code{\link{kingdom}}, +#' \code{\link{superkingdom}} +#' +#' @docType methods +#' @rdname species-methods +#' @aliases species-methods,obitools.taxonomy +#' @author Eric Coissac +#' @keywords taxonomy +#' +setMethod("species", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) { + getsp = function(t) { + if (is.na(t)) + return(NA) + else + return(.Call('R_get_species', + taxonomy, + t, + name, + PACKAGE="ROBITaxonomy")) + } + + taxid = as.integer(taxid) + name = as.logical(name[1]) + sapply(taxid,getsp) +}) + +#' @export +setGeneric("genus", function(taxonomy,taxid,name=FALSE) { + return(standardGeneric("genus")) +}) + +#' Extracts the genus corresponding to a taxid +#' +#' The \code{genus} method of \code{\linkS4class{obitools.taxonomy}} class +#' returns the \emph{taxid} or the scientific name of the genus corresponding +#' to a \emph{taxid}. +#' +#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance +#' @param taxid a vector of taxid to analyse +#' @param name A logical value \code{TRUE} or \code{FALSE} indicating +#' if the method return a taxid or a scientific name. +#' +#' @return \describe{ +#' \item{If \code{name==FALSE}}{the taxid of the corresponding +#' taxon as an integer or a vector of integers +#' if the \code{taxid} argument is itself +#' a vector} +#' \item{If \code{name==TRUE}}{the scientific name of the corresponding +#' taxon as a string or a vector of string +#' if the \code{taxid} argument is itself +#' a vector} +#' } +#' +#' @examples +#' # read the taxonomy database +#' taxo=default.taxonomy() +#' +#' # build a vector of 6 taxids corresponding to species +#' sp.taxid=c(7000,7004,7007,7009,7010,7011) +#' +#' # look for the genus taxids +#' genus(taxo,sp.taxid) +#' +#' # same thing but returns results as a vector of scientific names +#' genus(taxo,sp.taxid,TRUE) +#' +#' @seealso class \code{\linkS4class{obitools.taxonomy}}, +#' and methods \code{\link{species}},\code{\link{taxonatrank}}, +#' \code{\link{family}},\code{\link{kingdom}}, +#' \code{\link{superkingdom}} +#' +#' @docType methods +#' @rdname genus-methods +#' @aliases genus-methods,obitools.taxonomy +#' @author Eric Coissac +#' @keywords taxonomy +#' +setMethod("genus", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) { + getsp = function(t) { + if (is.na(t)) + return(NA) + else + return(.Call('R_get_genus', + taxonomy, + t, + name, + PACKAGE="ROBITaxonomy")) + } + + taxid = as.integer(taxid) + name = as.logical(name[1]) + sapply(taxid,getsp) +}) + +#' @export +setGeneric("family", function(taxonomy,taxid,name=FALSE) { + return(standardGeneric("family")) +}) + +#' Extracts the family corresponding to a taxid +#' +#' The \code{family} method of \code{\linkS4class{obitools.taxonomy}} class +#' returns the \emph{taxid} or the scientific name of the family corresponding +#' to a \emph{taxid}. +#' +#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance +#' @param taxid a vector of taxid to analyse +#' @param name A logical value \code{TRUE} or \code{FALSE} indicating +#' if the method return a taxid or a scientific name. +#' +#' @return \describe{ +#' \item{If \code{name==FALSE}}{the taxid of the corresponding +#' taxon as an integer or a vector of integers +#' if the \code{taxid} argument is itself +#' a vector} +#' \item{If \code{name==TRUE}}{the scientific name of the corresponding +#' taxon as a string or a vector of string +#' if the \code{taxid} argument is itself +#' a vector} +#' } +#' +#' @examples +#' # read the taxonomy database +#' taxo=default.taxonomy() +#' +#' # build a vector of 6 taxids corresponding to species +#' sp.taxid=c(7000,7004,7007,7009,7010,7011) +#' +#' # look for the family taxids +#' family(taxo,sp.taxid) +#' +#' # same thing but returns results as a vector of scientific names +#' family(taxo,sp.taxid,TRUE) +#' +#' @seealso class \code{\linkS4class{obitools.taxonomy}}, +#' and methods \code{\link{species}},\code{\link{genus}}, +#' \code{\link{taxonatrank}},\code{\link{kingdom}}, +#' \code{\link{superkingdom}} +#' +#' @docType methods +#' @rdname family-methods +#' @aliases family-methods,obitools.taxonomy +#' @author Eric Coissac +#' @keywords taxonomy +#' +setMethod("family", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) { + getsp = function(t) { + if (is.na(t)) + return(NA) + else + return(.Call('R_get_family', + taxonomy, + t, + name, + PACKAGE="ROBITaxonomy")) + } + + taxid = as.integer(taxid) + name = as.logical(name[1]) + sapply(taxid,getsp) +}) + +#' @export +setGeneric("kingdom", function(taxonomy,taxid,name=FALSE) { + return(standardGeneric("kingdom")) +}) + +#' Extracts the kingdom corresponding to a taxid +#' +#' The \code{kingdom} method of \code{\linkS4class{obitools.taxonomy}} class +#' returns the \emph{taxid} or the scientific name of the kingdom corresponding +#' to a \emph{taxid}. +#' +#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance +#' @param taxid a vector of taxid to analyse +#' @param name A logical value \code{TRUE} or \code{FALSE} indicating +#' if the method return a taxid or a scientific name. +#' +#' @return \describe{ +#' \item{If \code{name==FALSE}}{the taxid of the corresponding +#' taxon as an integer or a vector of integers +#' if the \code{taxid} argument is itself +#' a vector} +#' \item{If \code{name==TRUE}}{the scientific name of the corresponding +#' taxon as a string or a vector of string +#' if the \code{taxid} argument is itself +#' a vector} +#' } +#' +#' @examples +#' # read the taxonomy database +#' taxo=default.taxonomy() +#' +#' # build a vector of 6 taxids corresponding to species +#' sp.taxid=c(7000,7004,7007,7009,7010,7011) +#' +#' # look for the kingdom taxids +#' kingdom(taxo,sp.taxid) +#' +#' # same thing but returns results as a vector of scientific names +#' kingdom(taxo,sp.taxid,TRUE) +#' +#' @seealso class \code{\linkS4class{obitools.taxonomy}}, +#' and methods \code{\link{species}},\code{\link{genus}}, +#' \code{\link{family}},\code{\link{taxonatrank}}, +#' \code{\link{superkingdom}} +#' +#' @docType methods +#' @rdname kingdom-methods +#' @aliases kingdom-methods,obitools.taxonomy +#' @author Eric Coissac +#' @keywords taxonomy +#' +setMethod("kingdom", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) { + getsp = function(t) { + if (is.na(t)) + return(NA) + else + return(.Call('R_get_kingdom', + taxonomy, + t, + name, + PACKAGE="ROBITaxonomy")) + } + + taxid = as.integer(taxid) + name = as.logical(name[1]) + sapply(taxid,getsp) +}) + +#' @export +setGeneric("superkingdom", function(taxonomy,taxid,name=FALSE) { + return(standardGeneric("superkingdom")) +}) + +#' Extracts the superkingdom corresponding to a taxid +#' +#' The \code{superkingdom} method of \code{\linkS4class{obitools.taxonomy}} class +#' returns the \emph{taxid} or the scientific name of the superkingdom corresponding +#' to a \emph{taxid}. +#' +#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance +#' @param taxid a vector of taxid to analyse +#' @param name A logical value \code{TRUE} or \code{FALSE} indicating +#' if the method return a taxid or a scientific name. +#' +#' @return \describe{ +#' \item{If \code{name==FALSE}}{the taxid of the corresponding +#' taxon as an integer or a vector of integers +#' if the \code{taxid} argument is itself +#' a vector} +#' \item{If \code{name==TRUE}}{the scientific name of the corresponding +#' taxon as a string or a vector of string +#' if the \code{taxid} argument is itself +#' a vector} +#' } +#' +#' @examples +#' # read the taxonomy database +#' taxo=default.taxonomy() +#' +#' # build a vector of 6 taxids corresponding to species +#' sp.taxid=c(7000,7004,7007,7009,7010,7011) +#' +#' # look for the superkingdom taxids +#' superkingdom(taxo,sp.taxid) +#' +#' # same thing but returns results as a vector of scientific names +#' superkingdom(taxo,sp.taxid,TRUE) +#' +#' @seealso class \code{\linkS4class{obitools.taxonomy}}, +#' and methods \code{\link{species}},\code{\link{genus}}, +#' \code{\link{family}},\code{\link{kingdom}}, +#' \code{\link{taxonatrank}} +#' +#' @docType methods +#' @rdname superkingdom-methods +#' @aliases superkingdom-methods,obitools.taxonomy +#' @author Eric Coissac +#' @keywords taxonomy +#' +setMethod("superkingdom", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) { + getsp = function(t) { + if (is.na(t)) + return(NA) + else + return(.Call('R_get_superkingdom', + taxonomy, + t, + name, + PACKAGE="ROBITaxonomy")) + } + + taxid = as.integer(taxid) + name = as.logical(name[1]) + sapply(taxid,getsp) +}) + + diff --git a/R/taxonomy.R b/R/taxonomy.R new file mode 100644 index 0000000..8cb46fb --- /dev/null +++ b/R/taxonomy.R @@ -0,0 +1,216 @@ +#' @include ROBITaxonomy.R +#' @useDynLib ROBITaxonomy +NULL + +#' Gives access to a taxonomy preformated by OBITools +#' +#' A S4 class describing a taxonomy. It allows access to +#' taxonomy formated for OBITools. +#' +#' @references \describe{ +#' \item{NCBI Taxonomy : }{\url{http://www.ncbi.nlm.nih.gov/taxonomy}} +#' \item{OBITools : }{\url{http://metabarcoding/obitools/doc}} +#' } +#' +#' @seealso \code{\link{read.taxonomy}} +#' +#' @name obitools.taxonomy +#' @rdname obitools-taxonomy-class +#' @keywords taxonomy +#' @author Eric Coissac +#' @exportClass obitools.taxonomy +#' +setClass("obitools.taxonomy", + + + # + # Attribute declaration + # + + # data.frame containing the counts of reads per samples + # 1 samples per line + # 1 sequence per column + + representation( + + # An external pointer structure to + # the C taxonomy structure + + pointer = "externalptr", + + # the name of the database on the hard disk + + dbname = 'character', + + # the working directory when the taxonomy + # object is created. + # This inforation combined with bname allows + # to reload taxonomy from disk + + workingdir = 'character', + + # Indicate if the taxonomy is saved in a file + # Taxonomy created in R or modified in R are + # not saved ==> This have to be take into + # consideration but how ??? + saved = 'logical' + ), + + # + # Check object structure + # + + validity = function(object) { + return(TRUE) + } +) + + + +#' obitools.taxonomy constructor +#' +#' --> this constructor have not to be called directly +#' use the read.obitools.taxonomy function to +#' create a new instance of taxonomy +#' +#' @docType methods +#' @rdname initialize-methods-obitools.taxonomy +#' @aliases initialize-methods,obitools.taxonomy +setMethod("initialize", + "obitools.taxonomy", + function(.Object, pointer,dbname,workingdir,saved) { + .Object@pointer <- pointer + .Object@dbname <- dbname + .Object@workingdir <- workingdir + .Object@saved <- saved + + validObject(.Object) ## valide l'objet + return(.Object) + }) + + +#' @exportClass obitools.taxonomyOrNULL +setClassUnion("obitools.taxonomyOrNULL",c("obitools.taxonomy","NULL")) + + +#' @export +setGeneric("path", function(taxonomy,taxid,name=FALSE) { + return(standardGeneric("path")) +}) + + + +setMethod("path", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) { + getp = function(t) { + if (is.na(t)) + return(NA) + else + { + path=c() + + t=.Call('R_validate_taxid', + taxonomy, + as.integer(t), + PACKAGE="ROBITaxonomy") + + if (is.na(t)) + return(NA) + + repeat { + if (name) + path = c(scientificname(taxonomy,t),path) + else + path = c(t,path) + + t = .Call('R_get_parent', + taxonomy, + t, + FALSE, + PACKAGE="ROBITaxonomy") + if (is.na(t)) + break + } + + return(path) + } + } + + taxid=as.integer(taxid) + name=as.logical(name) + + p = lapply(taxid,getp) + d = dim(p) + + if (!is.null(d)) + if (d[2]==1) + p = as.vector(p) + + return(p) +}) + + +#' @export +setGeneric("is.subcladeof", function(taxonomy,taxid,parent) { + return(standardGeneric("is.subcladeof")) +}) + +setMethod("is.subcladeof", "obitools.taxonomy",function(taxonomy,taxid,parent) { + taxid = as.integer(taxid) + parent= as.integer(parent) + return(.Call('R_is_under_taxon', + taxonomy, + taxid, + parent, + PACKAGE="ROBITaxonomy")) +}) + + + +build.taxonomy = function(pointer,dbname,workingdir,saved) { + rd <- new('obitools.taxonomy', + pointer=pointer, + dbname=dbname, + workingdir=workingdir, + saved=saved + ) + return(rd) +} + + +#' Reads a taxonomy +#' +#' \code{read.taxonomy} reads a taxonomy formated by OBITools. +#' NCBI taxonomy can be download from the NCBI FTP site in taxdump format. +#' The taxdump must be formated using the obitaxonomy command from OBITools +#' before being used in R. A OBITools formated taxonomy is composed of 3 files +#' with the same prefix name and suffixes .tdx, .rdx, .ndx, two extra files +#' suffixed .adx and .ldx can also be present. +#' +#' @param dbname A character string containing the file name of the database +#' +#' @return an instance of the class \code{\linkS4class{obitools.taxonomy}} +#' +#' @examples +#' +#' \dontshow{# switch the working directory to the data package directory} +#' \dontshow{setwd(system.file("extdata", package="ROBITaxonomy"))} +#' +#' # read the taxonomy ncbi +#' ncbi = read.taxonomy("ncbitaxo") +#' +#' # and use it for requesting a scientific name +#' scientificname(ncbi,7742) +#' +#' @seealso \code{\linkS4class{obitools.taxonomy}} +#' +#' @author Eric Coissac +#' @keywords taxonomy +#' @export + +read.taxonomy = function(dbname) { + t <- .Call('R_read_taxonomy',dbname,TRUE,PACKAGE="ROBITaxonomy") + + return(build.taxonomy(t,dbname,getwd(),TRUE)) +} + + diff --git a/ROBITaxonomy.Rproj b/ROBITaxonomy.Rproj new file mode 100644 index 0000000..4aff1be --- /dev/null +++ b/ROBITaxonomy.Rproj @@ -0,0 +1,20 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: knitr +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/inst/extdata/.DS_Store b/inst/extdata/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/inst/extdata/.DS_Store differ diff --git a/inst/extdata/ncbitaxo.adx b/inst/extdata/ncbitaxo.adx new file mode 100644 index 0000000..8e02583 Binary files /dev/null and b/inst/extdata/ncbitaxo.adx differ diff --git a/inst/extdata/ncbitaxo.ndx b/inst/extdata/ncbitaxo.ndx new file mode 100644 index 0000000..46c99d7 Binary files /dev/null and b/inst/extdata/ncbitaxo.ndx differ diff --git a/inst/extdata/ncbitaxo.rdx b/inst/extdata/ncbitaxo.rdx new file mode 100644 index 0000000..2eda8ab Binary files /dev/null and b/inst/extdata/ncbitaxo.rdx differ diff --git a/inst/extdata/ncbitaxo.tdx b/inst/extdata/ncbitaxo.tdx new file mode 100644 index 0000000..d30b9a3 Binary files /dev/null and b/inst/extdata/ncbitaxo.tdx differ diff --git a/src/ROBITaxonomy.so b/src/ROBITaxonomy.so new file mode 100755 index 0000000..c41f4e7 Binary files /dev/null and b/src/ROBITaxonomy.so differ diff --git a/src/ecoError.c b/src/ecoError.c new file mode 100644 index 0000000..aa33609 --- /dev/null +++ b/src/ecoError.c @@ -0,0 +1,24 @@ +#include "ecoPCR.h" +#include +#include + +/* + * print the message given as argument and exit the program + * @param error error number + * @param message the text explaining what's going on + * @param filename the file source where the program failed + * @param linenumber the line where it has failed + * filename and linenumber are written at pre-processing + * time by a macro + */ +void ecoError(int32_t errorcode, + const char* message, + const char * filename, + int linenumber) +{ + error("Error %d in file %s line %d : %s", + errorcode, + filename, + linenumber, + message); +} diff --git a/src/ecoError.o b/src/ecoError.o new file mode 100644 index 0000000..fe3967d Binary files /dev/null and b/src/ecoError.o differ diff --git a/src/ecoIOUtils.c b/src/ecoIOUtils.c new file mode 100644 index 0000000..d0c397e --- /dev/null +++ b/src/ecoIOUtils.c @@ -0,0 +1,122 @@ +#include "ecoPCR.h" +#include +#include + +#define SWAPINT32(x) ((((x) << 24) & 0xFF000000) | (((x) << 8) & 0xFF0000) | \ + (((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF)) + + +int32_t is_big_endian() +{ + int32_t i=1; + + return (int32_t)((char*)&i)[0]; +} + + + + +int32_t swap_int32_t(int32_t i) +{ + return SWAPINT32(i); +} + + +/** + * Read part of the file + * @param *f the database + * @param recordSize the size to be read + * + * @return buffer + */ +void *read_ecorecord(FILE *f,int32_t *recordSize) +{ + static void *buffer =NULL; + int32_t buffersize=0; + int32_t read; + + if (!recordSize) + ECOERROR(ECO_ASSERT_ERROR, + "recordSize cannot be NULL"); + + read = fread(recordSize, + 1, + sizeof(int32_t), + f); + + if (feof(f)) + return NULL; + + if (read != sizeof(int32_t)) + ECOERROR(ECO_IO_ERROR,"Reading record size error"); + + if (is_big_endian()) + *recordSize=swap_int32_t(*recordSize); + + if (buffersize < *recordSize) + { + if (buffer) + buffer = ECOREALLOC(buffer,*recordSize, + "Increase size of record buffer"); + else + buffer = ECOMALLOC(*recordSize, + "Allocate record buffer"); + } + + read = fread(buffer, + 1, + *recordSize, + f); + + if (read != *recordSize) + ECOERROR(ECO_IO_ERROR,"Reading record data error"); + + return buffer; +} + + + + + +/** + * Open the database and check it's readable + * @param filename name of the database (.sdx, .rdx, .tbx) + * @param sequencecount buffer - pointer to variable storing the number of occurence + * @param abort_on_open_error boolean to define the behaviour in case of error + * while opening the database + * @return FILE type + **/ +FILE *open_ecorecorddb(const char *filename, + int32_t *sequencecount, + int32_t abort_on_open_error) +{ + FILE *f; + int32_t read; + + f = fopen(filename,"rb"); + + if (!f) + { + if (abort_on_open_error) + ECOERROR(ECO_IO_ERROR,"Cannot open file"); + else + { + *sequencecount=0; + return NULL; + } + } + + read = fread(sequencecount, + 1, + sizeof(int32_t), + f); + + if (read != sizeof(int32_t)) + ECOERROR(ECO_IO_ERROR,"Reading record size error"); + + if (is_big_endian()) + *sequencecount=swap_int32_t(*sequencecount); + + return f; +} + diff --git a/src/ecoIOUtils.o b/src/ecoIOUtils.o new file mode 100644 index 0000000..2a12abe Binary files /dev/null and b/src/ecoIOUtils.o differ diff --git a/src/ecoMalloc.c b/src/ecoMalloc.c new file mode 100644 index 0000000..0d99b50 --- /dev/null +++ b/src/ecoMalloc.c @@ -0,0 +1,76 @@ +#include "ecoPCR.h" +#include + +static int eco_log_malloc = 0; + +void eco_trace_memory_allocation() +{ + eco_log_malloc=1; +} + +void eco_untrace_memory_allocation() +{ + eco_log_malloc=0; +} + + +void *eco_malloc(int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line) +{ + void * chunk; + + chunk = calloc(1,chunksize); + + if (!chunk) + ecoError(ECO_MEM_ERROR,error_message,filename,line); + + if (eco_log_malloc) + REprintf("Memory segment located at %p of size %d is allocated (file : %s [%d])", + chunk, + chunksize, + filename, + line); + + return chunk; +} + +void *eco_realloc(void *chunk, + int32_t newsize, + const char *error_message, + const char *filename, + int32_t line) +{ + void *newchunk; + + newchunk = realloc(chunk,newsize); + + if (!newchunk) + ecoError(ECO_MEM_ERROR,error_message,filename,line); + + if (eco_log_malloc) + REprintf("Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])", + chunk, + newchunk, + newsize, + filename, + line); + + return newchunk; +} + +void eco_free(void *chunk, + const char *error_message, + const char *filename, + int32_t line) +{ + free(chunk); + + if (eco_log_malloc) + REprintf("Memory segment %p is released => %s (file : %s [%d])", + chunk, + error_message, + filename, + line); +} diff --git a/src/ecoMalloc.o b/src/ecoMalloc.o new file mode 100644 index 0000000..ce7a5cd Binary files /dev/null and b/src/ecoMalloc.o differ diff --git a/src/ecoPCR.h b/src/ecoPCR.h new file mode 100644 index 0000000..2fa8aa4 --- /dev/null +++ b/src/ecoPCR.h @@ -0,0 +1,285 @@ +#ifndef ECOPCR_H_ +#define ECOPCR_H_ + +#include +#include + +#include +#include +#include + + +//#ifndef H_apat +//#include "../libapat/apat.h" +//#endif + +/***************************************************** + * + * Data type declarations + * + *****************************************************/ + +/* + * + * Sequence types + * + */ + +typedef struct { + + int32_t taxid; + char AC[20]; + int32_t DE_length; + int32_t SQ_length; + int32_t CSQ_length; + + char data[1]; + +} ecoseqformat_t; + +typedef struct { + int32_t taxid; + int32_t SQ_length; + char *AC; + char *DE; + char *SQ; +} ecoseq_t; + +/* + * + * Taxonomy taxon types + * + */ + + +typedef struct { + int32_t taxid; + int32_t rank; + int32_t parent; + int32_t namelength; + char name[1]; + +} ecotxformat_t; + +typedef struct ecotxnode { + int32_t taxid; + int32_t rank; + int32_t farest; + struct ecotxnode *parent; + char *name; +} ecotx_t; + +typedef struct { + int32_t count; + int32_t maxtaxid; + int32_t buffersize; + ecotx_t taxon[1]; +} ecotxidx_t; + + +/* + * + * Taxonomy rank types + * + */ + +typedef struct { + int32_t count; + char* label[1]; +} ecorankidx_t; + +/* + * + * Taxonomy name types + * + */ + +typedef struct { + int32_t is_scientificname; + int32_t namelength; + int32_t classlength; + int32_t taxid; + char names[1]; +} econameformat_t; + + + typedef struct { + char *name; + char *classname; + int32_t is_scientificname; + struct ecotxnode *taxon; +} econame_t; + + +typedef struct { + int32_t count; + econame_t names[1]; +} econameidx_t; + + + typedef struct { + ecorankidx_t *ranks; + econameidx_t *names; + ecotxidx_t *taxons; +} ecotaxonomy_t; + + +/***************************************************** + * + * Function declarations + * + *****************************************************/ + +/* + * + * Low level system functions + * + */ + +int32_t is_big_endian(); +int32_t swap_int32_t(int32_t); + +void *eco_malloc(int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line); + + +void *eco_realloc(void *chunk, + int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line); + +void eco_free(void *chunk, + const char *error_message, + const char *filename, + int32_t line); + +void eco_trace_memory_allocation(); +void eco_untrace_memory_allocation(); + +#define ECOMALLOC(size,error_message) \ + eco_malloc((size),(error_message),__FILE__,__LINE__) + +#define ECOREALLOC(chunk,size,error_message) \ + eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__) + +#define ECOFREE(chunk,error_message) \ + eco_free((chunk),(error_message),__FILE__,__LINE__) + + + + +/* + * + * Error managment + * + */ + + +void ecoError(int32_t,const char*,const char *,int); + +#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__) + +#define ECO_IO_ERROR (1) +#define ECO_MEM_ERROR (2) +#define ECO_ASSERT_ERROR (3) +#define ECO_NOTFOUND_ERROR (4) + + +/* + * + * Low level Disk access functions + * + */ + +FILE *open_ecorecorddb(const char *filename, + int32_t *sequencecount, + int32_t abort_on_open_error); + +void *read_ecorecord(FILE *,int32_t *recordSize); + + + +/* + * Read function in internal binary format + */ + +FILE *open_ecoseqdb(const char *filename, + int32_t *sequencecount); + +ecoseq_t *readnext_ecoseq(FILE *); + +ecorankidx_t *read_rankidx(const char *filename); + +econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy); + + + + /** + * Read taxonomy data as formated by the ecoPCRFormat.py script. + * + * This function is normaly uses internaly by the read_taxonomy + * function and should not be called directly. + * + * @arg filename path to the *.tdx file of the reformated db + * + * @return pointer to a taxonomy index structure + */ + +ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2); + +ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName); + +ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, int32_t rankidx); + +ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid); + +int eco_isundertaxon(ecotx_t *taxon, int other_taxid); + +ecoseq_t *ecoseq_iterator(const char *prefix); + + + +ecoseq_t *new_ecoseq(); +int32_t delete_ecoseq(ecoseq_t *); +ecoseq_t *new_ecoseq_with_data( char *AC, + char *DE, + char *SQ, + int32_t taxid + ); + + +int32_t delete_taxon(ecotx_t *taxon); +int32_t delete_rankidx(ecorankidx_t *rankidx); +int32_t delete_nameidx(econameidx_t *nameidx); +int32_t delete_taxonomy(ecotxidx_t *index); +int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy); + + +int32_t rank_index(const char* label,ecorankidx_t* ranks); + +//int32_t delete_apatseq(SeqPtr pseq); +//PatternPtr buildPattern(const char *pat, int32_t error_max); +//PatternPtr complementPattern(PatternPtr pat); +// +//SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular); + +//char *ecoComplementPattern(char *nucAcSeq); +//char *ecoComplementSequence(char *nucAcSeq); +//char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end); + +ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); + +//int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid); +//int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid); + + +ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy); + +#endif /*ECOPCR_H_*/ diff --git a/src/ecodna.c b/src/ecodna.c new file mode 100644 index 0000000..86d2012 --- /dev/null +++ b/src/ecodna.c @@ -0,0 +1,156 @@ +#include +#include "ecoPCR.h" + +/* + * @doc: DNA alphabet (IUPAC) + */ +#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]" + +/* + * @doc: complementary DNA alphabet (IUPAC) + */ +#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!][" + + +static char sNuc[] = LX_BIO_DNA_ALPHA; +static char sAnuc[] = LX_BIO_CDNA_ALPHA; + +static char LXBioBaseComplement(char nucAc); +static char *LXBioSeqComplement(char *nucAcSeq); +static char *reverseSequence(char *str,char isPattern); + + +/* ---------------------------- */ + +char LXBioBaseComplement(char nucAc) +{ + char *c; + + if ((c = strchr(sNuc, nucAc))) + return sAnuc[(c - sNuc)]; + else + return nucAc; +} + +/* ---------------------------- */ + +char *LXBioSeqComplement(char *nucAcSeq) +{ + char *s; + + for (s = nucAcSeq ; *s ; s++) + *s = LXBioBaseComplement(*s); + + return nucAcSeq; +} + + +char *reverseSequence(char *str,char isPattern) +{ + char *sb, *se, c; + + if (! str) + return str; + + sb = str; + se = str + strlen(str) - 1; + + while(sb <= se) { + c = *sb; + *sb++ = *se; + *se-- = c; + } + + sb = str; + se = str + strlen(str) - 1; + + if (isPattern) + for (;sb < se; sb++) + { + if (*sb=='#') + { + if (((se - sb) > 2) && (*(sb+2)=='!')) + { + *sb='!'; + sb+=2; + *sb='#'; + } + else + { + *sb=*(sb+1); + sb++; + *sb='#'; + } + } + else if (*sb=='!') + { + *sb=*(sb-1); + *(sb-1)='!'; + } + } + + return str; +} + +char *ecoComplementPattern(char *nucAcSeq) +{ + return reverseSequence(LXBioSeqComplement(nucAcSeq),1); +} + +char *ecoComplementSequence(char *nucAcSeq) +{ + return reverseSequence(LXBioSeqComplement(nucAcSeq),0); +} + + +char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end) +/* + extract subsequence from nucAcSeq [begin,end[ +*/ +{ + static char *buffer = NULL; + static int32_t buffSize= 0; + int32_t length; + + if (begin < end) + { + length = end - begin; + + if (length >= buffSize) + { + buffSize = length+1; + if (buffer) + buffer=ECOREALLOC(buffer,buffSize, + "Error in reallocating sub sequence buffer"); + else + buffer=ECOMALLOC(buffSize, + "Error in allocating sub sequence buffer"); + + } + + strncpy(buffer,nucAcSeq + begin,length); + buffer[length]=0; + } + else + { + length = end + strlen(nucAcSeq) - begin; + + if (length >= buffSize) + { + buffSize = length+1; + if (buffer) + buffer=ECOREALLOC(buffer,buffSize, + "Error in reallocating sub sequence buffer"); + else + buffer=ECOMALLOC(buffSize, + "Error in allocating sub sequence buffer"); + + } + strncpy(buffer,nucAcSeq+begin,length - end); + strncpy(buffer+(length-end),nucAcSeq ,end); + buffer[length]=0; + } + + return buffer; +} + diff --git a/src/ecodna.o b/src/ecodna.o new file mode 100644 index 0000000..d85ab88 Binary files /dev/null and b/src/ecodna.o differ diff --git a/src/ecofilter.c b/src/ecofilter.c new file mode 100644 index 0000000..64276c0 --- /dev/null +++ b/src/ecofilter.c @@ -0,0 +1,20 @@ +#include "ecoPCR.h" + +int eco_is_taxid_included( ecotaxonomy_t *taxonomy, + int32_t *restricted_taxid, + int32_t tab_len, + int32_t taxid) +{ + int i; + ecotx_t *taxon; + + taxon = eco_findtaxonbytaxid(taxonomy, taxid); + + if (taxon) + for (i=0; i < tab_len; i++) + if ( (taxon->taxid == restricted_taxid[i]) || + (eco_isundertaxon(taxon, restricted_taxid[i])) ) + return 1; + + return 0; +} diff --git a/src/ecofilter.o b/src/ecofilter.o new file mode 100644 index 0000000..d3596f6 Binary files /dev/null and b/src/ecofilter.o differ diff --git a/src/econame.c b/src/econame.c new file mode 100644 index 0000000..52bbe42 --- /dev/null +++ b/src/econame.c @@ -0,0 +1,86 @@ +#include "ecoPCR.h" +#include +#include + +static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy); + +econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy) +{ + + int32_t count; + FILE *f; + econameidx_t *indexname; + int32_t i; + + f = open_ecorecorddb(filename,&count,0); + + if (f==NULL) + return NULL; + + indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names"); + + indexname->count=count; + + for (i=0; i < count; i++){ + readnext_econame(f,(indexname->names)+i,taxonomy); + } + + return indexname; +} + +int32_t delete_nameidx(econameidx_t *nameidx) +{ + size_t i; + + if (nameidx) { + for (i=0; i < nameidx->count; i++) { + if (nameidx->names[i].name) + ECOFREE(nameidx->names[i].name, + "Desallocate name"); + if (nameidx->names[i].classname) + ECOFREE(nameidx->names[i].classname, + "Desallocate classname"); + } + + ECOFREE(nameidx,"Desallocate name index"); + + return 0; + } + + return 1; +} + +econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy) +{ + + econameformat_t *raw; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->is_scientificname = swap_int32_t(raw->is_scientificname); + raw->namelength = swap_int32_t(raw->namelength); + raw->classlength = swap_int32_t(raw->classlength); + raw->taxid = swap_int32_t(raw->taxid); + } + + name->is_scientificname=raw->is_scientificname; + + name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name"); + strncpy(name->name,raw->names,raw->namelength); + name->name[raw->namelength]=0; + + name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname"); + strncpy(name->classname,(raw->names+raw->namelength),raw->classlength); + name->classname[raw->classlength]=0; + + name->taxon = taxonomy->taxons->taxon + raw->taxid; + + return name; +} + diff --git a/src/econame.o b/src/econame.o new file mode 100644 index 0000000..a565233 Binary files /dev/null and b/src/econame.o differ diff --git a/src/ecorank.c b/src/ecorank.c new file mode 100644 index 0000000..00ba7fc --- /dev/null +++ b/src/ecorank.c @@ -0,0 +1,75 @@ +#include "ecoPCR.h" +#include +#include + +static int compareRankLabel(const void *label1, const void *label2); + +ecorankidx_t *read_rankidx(const char *filename) +{ + int32_t count; + FILE *f; + ecorankidx_t *index; + int32_t i; + int32_t rs; + char *buffer; + + f = open_ecorecorddb(filename,&count,0); + + if (f==NULL) + return NULL; + + index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1), + "Allocate rank index"); + + index->count=count; + + for (i=0; i < count; i++) + { + buffer = read_ecorecord(f,&rs); + index->label[i]=(char*) ECOMALLOC(rs+1, + "Allocate rank label"); + strncpy(index->label[i],buffer,rs); + } + + return index; +} + +int32_t delete_rankidx(ecorankidx_t *rankidx) +{ + size_t i; + + if (rankidx) { + for (i=0; i < rankidx->count; i++) + if (rankidx->label[i]) + ECOFREE(rankidx->label[i], + "Desallocate rank label"); + + ECOFREE(rankidx,"Desallocate rank index"); + + return 0; + } + + return 1; +} + +int32_t rank_index(const char* label,ecorankidx_t* ranks) +{ + char **rep; + + rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel); + + if (rep) + return rep-ranks->label; +// else +// ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found"); + + return -1; +} + + +int compareRankLabel(const void *label1, const void *label2) +{ + return strcmp((const char*)label1,*(const char**)label2); +} + + diff --git a/src/ecorank.o b/src/ecorank.o new file mode 100644 index 0000000..783e087 Binary files /dev/null and b/src/ecorank.o differ diff --git a/src/ecoseq.c b/src/ecoseq.c new file mode 100644 index 0000000..b2128a0 --- /dev/null +++ b/src/ecoseq.c @@ -0,0 +1,231 @@ +#include "ecoPCR.h" +#include +#include +#include +#include +#include +#include + +static FILE *open_seqfile(const char *prefix,int32_t index); + + +ecoseq_t *new_ecoseq() +{ + void *tmp; + + tmp = ECOMALLOC(sizeof(ecoseq_t),"Allocate new ecoseq structure"); + + return tmp; +} + +int32_t delete_ecoseq(ecoseq_t * seq) +{ + + if (seq) + { + if (seq->AC) + ECOFREE(seq->AC,"Free sequence AC"); + + if (seq->DE) + ECOFREE(seq->DE,"Free sequence DE"); + + if (seq->SQ) + ECOFREE(seq->SQ,"Free sequence SQ"); + + ECOFREE(seq,"Free sequence structure"); + + return 0; + + } + + return 1; +} + +ecoseq_t *new_ecoseq_with_data( char *AC, + char *DE, + char *SQ, + int32_t taxid_idx + ) +{ + ecoseq_t *tmp; + int32_t lstr; + tmp = new_ecoseq(); + + tmp->taxid=taxid_idx; + + if (AC) + { + lstr =strlen(AC); + tmp->AC=ECOMALLOC((lstr+1) * sizeof(char), + "Allocate sequence accession"); + strcpy(tmp->AC,AC); + } + + if (DE) + { + lstr =strlen(DE); + tmp->DE=ECOMALLOC((lstr+1) * sizeof(char), + "Allocate sequence definition"); + strcpy(tmp->DE,DE); + } + + if (SQ) + { + lstr =strlen(SQ); + tmp->SQ=ECOMALLOC((lstr+1) * sizeof(char), + "Allocate sequence data"); + strcpy(tmp->SQ,SQ); + } + return tmp; + +} + +/** + * ?? used ?? + **/ +FILE *open_ecoseqdb(const char *filename, + int32_t *sequencecount) +{ + return open_ecorecorddb(filename,sequencecount,1); +} + +ecoseq_t *readnext_ecoseq(FILE *f) +{ + char *compressed=NULL; + + ecoseqformat_t *raw; + ecoseq_t *seq; + int32_t comp_status; + unsigned long int seqlength; + int32_t rs; + char *c; + int32_t i; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->CSQ_length = swap_int32_t(raw->CSQ_length); + raw->DE_length = swap_int32_t(raw->DE_length); + raw->SQ_length = swap_int32_t(raw->SQ_length); + raw->taxid = swap_int32_t(raw->taxid); + } + + seq = new_ecoseq(); + + seq->taxid = raw->taxid; + + seq->AC = ECOMALLOC(strlen(raw->AC) +1, + "Allocate Sequence Accesion number"); + strncpy(seq->AC,raw->AC,strlen(raw->AC)); + + + seq->DE = ECOMALLOC(raw->DE_length+1, + "Allocate Sequence definition"); + strncpy(seq->DE,raw->data,raw->DE_length); + + seqlength = seq->SQ_length = raw->SQ_length; + + compressed = raw->data + raw->DE_length; + + seq->SQ = ECOMALLOC(seqlength+1, + "Allocate sequence buffer"); +/* + comp_status = uncompress((unsigned char*)seq->SQ, + &seqlength, + (unsigned char*)compressed, + raw->CSQ_length); +*/ + + if (comp_status != Z_OK) + ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data"); + + for (c=seq->SQ,i=0;i= 1024) + ECOERROR(ECO_ASSERT_ERROR,"file name is too long"); + + filename_buffer[filename_length]=0; + + input=open_ecorecorddb(filename_buffer,&seqcount,0); + + if (input) + REprintf("# Reading file %s containing %d sequences...\n", + filename_buffer, + seqcount); + + return input; +} + +ecoseq_t *ecoseq_iterator(const char *prefix) +{ + static FILE *current_seq_file= NULL; + static int32_t current_file_idx = 1; + static char current_prefix[1024]; + ecoseq_t *seq; + + if (prefix) + { + current_file_idx = 1; + + if (current_seq_file) + fclose(current_seq_file); + + strncpy(current_prefix,prefix,1023); + current_prefix[1023]=0; + + current_seq_file = open_seqfile(current_prefix, + current_file_idx); + + if (!current_seq_file) + return NULL; + + } + + seq = readnext_ecoseq(current_seq_file); + + if (!seq && feof(current_seq_file)) + { + current_file_idx++; + fclose(current_seq_file); + current_seq_file = open_seqfile(current_prefix, + current_file_idx); + + + if (current_seq_file) + seq = readnext_ecoseq(current_seq_file); + } + + return seq; +} diff --git a/src/ecoseq.o b/src/ecoseq.o new file mode 100644 index 0000000..0476882 Binary files /dev/null and b/src/ecoseq.o differ diff --git a/src/ecotax.c b/src/ecotax.c new file mode 100644 index 0000000..6b57fbf --- /dev/null +++ b/src/ecotax.c @@ -0,0 +1,420 @@ +#include "ecoPCR.h" +#include +#include +#include + +#include + +#ifndef MAX +#define MAX(x,y) (((x)>(y)) ? (x):(y)) +#endif + +static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon); + + /** + * Open the taxonomy database + * @param pointer to the database (.tdx file) + * @return a ecotxidx_t structure + */ +ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2) +{ + int32_t count; + int32_t count2; + FILE *f; + FILE *f2; + ecotxidx_t *index; + struct ecotxnode *t; + int32_t i; + int32_t j; + + f = open_ecorecorddb(filename,&count,0); + + if (f==NULL) return NULL; + + f2 = open_ecorecorddb(filename2,&count2,0); + + index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count+count2-1), + "Allocate taxonomy"); + + index->count=count+count2; + index->buffersize = index->count; + + index->maxtaxid=0; + REprintf("Readind %d taxa...\n",count); + for (i=0; i < count; i++){ + readnext_ecotaxon(f,&(index->taxon[i])); + index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent; + index->taxon[i].parent->farest=0; + if (index->taxon[i].taxid > index->maxtaxid) + index->maxtaxid=index->taxon[i].taxid; + } + + + if (count2>0) + REprintf("Readind %d local taxa...\n",count2); + else + REprintf("No local taxon\n"); + + count = index->count; + + for (; i < count; i++){ + readnext_ecotaxon(f2,&(index->taxon[i])); + index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent; + index->taxon[i].parent->farest=0; + if (index->taxon[i].taxid > index->maxtaxid) + index->maxtaxid=index->taxon[i].taxid; + } + + REprintf("Computing longest branches...\n",count); + + for (i=0; i < count; i++){ + t=index->taxon+i; + if (t->farest==-1) + { + t->farest=0; + while(t->parent != t) + { + j = t->farest + 1; + if (j > t->parent->farest) + { + t->parent->farest = j; + t=t->parent; + } + else + t=index->taxon; + } + } + } + + return index; +} + + +int32_t delete_taxonomy(ecotxidx_t *index) +{ + int32_t i; + + if (index) + { + for (i=0; i< index->count; i++) + if (index->taxon[i].name) + ECOFREE(index->taxon[i].name,"Free scientific name"); + + + ECOFREE(index,"Free Taxonomy"); + + return 0; + } + + return 1; +} + + + +int32_t delete_taxon(ecotx_t *taxon) +{ + if (taxon) + { + if (taxon->name) + ECOFREE(taxon->name,"Free scientific name"); + + ECOFREE(taxon,"Free Taxon"); + + return 0; + } + + return 1; +} + + +/** + * Read the database for a given taxon a save the data + * into the taxon structure(if any found) + * @param *f pointer to FILE type returned by fopen + * @param *taxon pointer to the structure + * + * @return a ecotx_t structure if any taxon found else NULL + */ +ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon) +{ + + ecotxformat_t *raw; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->namelength = swap_int32_t(raw->namelength); + raw->parent = swap_int32_t(raw->parent); + raw->rank = swap_int32_t(raw->rank); + raw->taxid = swap_int32_t(raw->taxid); + } + + taxon->parent = (ecotx_t*)((size_t)raw->parent); + taxon->taxid = raw->taxid; + taxon->rank = raw->rank; + taxon->farest = -1; + + taxon->name = ECOMALLOC((raw->namelength+1) * sizeof(char), + "Allocate taxon scientific name"); + + strncpy(taxon->name,raw->name,raw->namelength); + + return taxon; +} + + +ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName) +{ + ecotaxonomy_t *tax; + char *filename; + char *filename2; + int buffsize; + + tax = ECOMALLOC(sizeof(ecotaxonomy_t), + "Allocate taxonomy structure"); + + tax->ranks =NULL; + tax->taxons=NULL; + tax->names =NULL; + + buffsize = strlen(prefix)+10; + + filename = ECOMALLOC(buffsize, + "Allocate filename"); + filename2= ECOMALLOC(buffsize, + "Allocate filename"); + + snprintf(filename,buffsize,"%s.rdx",prefix); + + tax->ranks = read_rankidx(filename); + + if (tax->ranks == NULL) + { + ECOFREE(filename,"Desallocate filename 1"); + ECOFREE(filename2,"Desallocate filename 2"); + + delete_ecotaxonomy(tax); + return NULL; + } + + snprintf(filename,buffsize,"%s.tdx",prefix); + snprintf(filename2,buffsize,"%s.ldx",prefix); + + tax->taxons = read_taxonomyidx(filename,filename2); + + if (tax->taxons == NULL) + { + ECOFREE(filename,"Desallocate filename 1"); + ECOFREE(filename,"Desallocate filename 2"); + + delete_ecotaxonomy(tax); + return NULL; + } + + if (readAlternativeName) + { + snprintf(filename,buffsize,"%s.ndx",prefix); + tax->names=read_nameidx(filename,tax); + } + else + tax->names=NULL; + + ECOFREE(filename,"Desallocate filename 1"); + ECOFREE(filename2,"Desallocate filename 2"); + + return tax; + +} + + + +int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy) +{ + if (taxonomy) + { + delete_rankidx(taxonomy->ranks); + delete_nameidx(taxonomy->names); + delete_taxonomy(taxonomy->taxons); + + ECOFREE(taxonomy,"Free taxonomy structure"); + + return 0; + } + + return 1; +} + +ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, + int32_t rankidx) +{ + ecotx_t *current_taxon; + ecotx_t *next_taxon; + + current_taxon = taxon; + next_taxon = current_taxon->parent; + + while ((current_taxon!=next_taxon) && // I' am the root node + (current_taxon->rank!=rankidx)) + { + current_taxon = next_taxon; + next_taxon = current_taxon->parent; + } + + if (current_taxon->rank==rankidx) + return current_taxon; + else + return NULL; +} + +static int bcomptaxon (const void * ptaxid, const void * ptaxon) { + + ecotx_t *current_taxon = (ecotx_t*)ptaxon; + int32_t taxid=(int32_t)((size_t)ptaxid); + return taxid - current_taxon->taxid; +} + +/** + * Get back information concerning a taxon from a taxonomic id + * @param *taxonomy the taxonomy database + * @param taxid the taxonomic id + * + * @result a ecotx_t structure containing the taxonimic information + **/ +ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, + int32_t taxid) +{ + ecotx_t *current_taxon; + int32_t taxoncount; + + taxoncount=taxonomy->taxons->count; + + current_taxon = (ecotx_t*) bsearch((const void *)((size_t)taxid), + (const void *)taxonomy->taxons->taxon, + taxoncount, + sizeof(ecotx_t), + bcomptaxon); + + return current_taxon; +} + +/** + * Find out if taxon is son of other taxon (identified by its taxid) + * @param *taxon son taxon + * @param parent_taxid taxonomic id of the other taxon + * + * @return 1 is the other taxid math a parent taxid, else 0 + **/ +int eco_isundertaxon(ecotx_t *taxon, + int other_taxid) +{ + ecotx_t *next_parent; + + next_parent = taxon->parent; + + while ( (other_taxid != next_parent->taxid) && + (strcmp(next_parent->name, "root")) ) + { + next_parent = next_parent->parent; + } + + if (other_taxid == next_parent->taxid) + return 1; + else + return 0; +} + +ecotx_t *eco_getspecies(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("species",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getgenus(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("genus",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + + +ecotx_t *eco_getfamily(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("family",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getkingdom(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("kingdom",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getsuperkingdom(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("superkingdom",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} diff --git a/src/ecotax.o b/src/ecotax.o new file mode 100644 index 0000000..510f5c1 Binary files /dev/null and b/src/ecotax.o differ diff --git a/src/robitax.c b/src/robitax.c new file mode 100644 index 0000000..8445e3e --- /dev/null +++ b/src/robitax.c @@ -0,0 +1,845 @@ +/* + * robitax.c + * + * Created on: 17 janv. 2013 + * Author: coissac + */ + +#include "robitax.h" +#include +//#include +#include "slre.h" + +/** + * Return a pointeur to an obitools taxonomy C structure + * from an R instance of taxonomy.obitools + * + * The function checks if the pointer stored in the R object is set + * to NULL. In this case this means that we have to load the taxonomy + * from the disk. + * + * @param taxonomy an R object + * @type taxonomy SEXP + * + * @return a pointer to the C structure + * @rtype ecotaxonomy_t * + */ + +ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy) +{ + + + char *pwd; + SEXP pointer; + SEXP rclass; + SEXP rdir; + SEXP rfile; + ecotaxonomy_t *ptax; + const char *class; + const char *file; + const char *dir; + + int saved; + + if (!IS_S4_OBJECT(Rtaxonomy) ) + error("argument not obitools.taxonomy instance"); + + // We get the class name and compare it to "taxonomy.obitools" + rclass = getAttrib(Rtaxonomy, R_ClassSymbol); + class = CHAR(asChar(rclass)); + + if (strcmp(class,"obitools.taxonomy")) + error("argument not obitools.taxonomy instance"); + + pointer = R_do_slot(Rtaxonomy,mkString("pointer")); + saved = LOGICAL(R_do_slot(Rtaxonomy,mkString("saved")))[0]; + ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(pointer); + + // If the external pointer is set to NULL we have to load + // the taxonomy from file + if (ptax==NULL && saved) + { + pwd = getcwd(NULL,0); + + rfile = R_do_slot(Rtaxonomy,mkString("dbname")); + file = CHAR(asChar(rfile)); + + rdir = R_do_slot(Rtaxonomy,mkString("workingdir")); + dir = CHAR(asChar(rdir)); + + chdir(dir); + + ptax = read_taxonomy(file,1); + + R_SetExternalPtrAddr(pointer,(void*)ptax); + + chdir(pwd); + free(pwd); + } + + if (ptax==NULL && ! saved) + error("The taxonomy instance is no more valid and must be rebuilt"); + + return ptax; +} + +SEXP R_delete_taxonomy(SEXP Rtaxonomy) +{ + ecotaxonomy_t *ptax; +// SEXP pointer; + + ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(Rtaxonomy); + + (void) delete_ecotaxonomy(ptax); + + // Clear the external pointer + R_ClearExternalPtr(Rtaxonomy); + + return R_NilValue; + +} + + + +SEXP R_read_taxonomy(SEXP filename, SEXP altenative) +{ + int alt; + const char* file; + SEXP Rtax; + + if (! isString(filename)) + error("filename not character"); + file = CHAR(STRING_ELT(filename, 0)); + + if (! isLogical(altenative)) + error("altenative not logical"); + alt = LOGICAL(altenative)[0]; + + ecotaxonomy_t *taxonomy = read_taxonomy(file,alt); + + if (! taxonomy) + error("Cannot open the taxonomy database"); + + Rtax = PROTECT(R_MakeExternalPtr(taxonomy, mkString("ROBITools NCBI Taxonomy pointer"), R_NilValue)); + R_RegisterCFinalizerEx(Rtax, (R_CFinalizer_t)R_delete_taxonomy,TRUE); + + UNPROTECT(1); + + + return Rtax; +} + + +SEXP R_get_scientific_name(SEXP Rtaxonomy,SEXP Rtaxid) +{ + ecotx_t *taxon; + ecotaxonomy_t *ptax; + int taxid; +// SEXP scname; + + ptax = getTaxPointer(Rtaxonomy); + + if (! isInteger(Rtaxid)) + error("taxid not integer"); + + taxid = *INTEGER(Rtaxid); + + if (! (taxid > 0)) + error("taxid not positive"); + + + taxon = eco_findtaxonbytaxid(ptax, taxid); + + if (!taxon) + return ScalarString(R_NaString); + // error("unkown taxid"); + + return mkString(taxon->name); + +} + +SEXP R_get_rank(SEXP Rtaxonomy,SEXP Rtaxid) +{ + ecotx_t *taxon; + ecotaxonomy_t *ptax; + int *taxid; + int ntaxid; + int i; + SEXP results; +// SEXP scname; + + ptax = getTaxPointer(Rtaxonomy); + + if (! isInteger(Rtaxid)) + error("taxid not integer"); + + ntaxid = length(Rtaxid); + + results = PROTECT(allocVector(STRSXP, ntaxid)); + + taxid = INTEGER(Rtaxid); + + for (i=0; i < ntaxid; i++) + { + if (taxid[i]== NA_INTEGER || taxid[i] <= 0) + SET_STRING_ELT(results, i, R_NaString); + else { + taxon = eco_findtaxonbytaxid(ptax, taxid[i]); + if (!taxon) + SET_STRING_ELT(results, i, R_NaString); + else + SET_STRING_ELT(results, i, mkChar(ptax->ranks->label[taxon->rank])); + } + } + + UNPROTECT(1); + + return results; + +} + +SEXP R_findtaxonatrank(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rrank, SEXP Rname) +{ + ecotx_t *taxon; + ecotx_t *rep; + ecotaxonomy_t *ptax; + int taxid; + int name; + const char *rank; + int rankidx; +// SEXP scname; + + ptax = getTaxPointer(Rtaxonomy); + + if (! isInteger(Rtaxid)) + error("taxid not integer"); + + taxid = *INTEGER(Rtaxid); + + if (! (taxid > 0)) + error("taxid not positive"); + + if (! isString(Rrank)) + error("rank not a string"); + + rank=CHAR(STRING_ELT(Rrank,0)); + + rankidx=rank_index(rank,ptax->ranks); + + if (rankidx < 0) + error("unkown rank name"); + + if (! isLogical(Rname)) + error("name not logical"); + name = LOGICAL(Rname)[0]; + + taxon = eco_findtaxonbytaxid(ptax, taxid); + + if (!taxon) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + rep = eco_findtaxonatrank(taxon,rankidx); + + if (!rep) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + if (name) + return mkString(rep->name); + else + return ScalarInteger(rep->taxid); +} + + +SEXP R_get_species(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname) +{ + ecotx_t *taxon; + ecotx_t *rep; + ecotaxonomy_t *ptax; + int taxid; + int name; +// SEXP scname; + + ptax = getTaxPointer(Rtaxonomy); + + if (! isInteger(Rtaxid)) + error("taxid not integer"); + + taxid = *INTEGER(Rtaxid); + + if (! (taxid > 0)) + error("taxid not positive"); + + if (! isLogical(Rname)) + error("name not logical"); + name = LOGICAL(Rname)[0]; + + taxon = eco_findtaxonbytaxid(ptax, taxid); + + if (!taxon) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + rep = eco_getspecies(taxon,ptax); + + if (!rep) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + if (name) + return mkString(rep->name); + else + return ScalarInteger(rep->taxid); +} + +SEXP R_get_genus(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname) +{ + ecotx_t *taxon; + ecotx_t *rep; + ecotaxonomy_t *ptax; + int taxid; + int name; +// SEXP scname; + + ptax = getTaxPointer(Rtaxonomy); + + if (! isInteger(Rtaxid)) + error("taxid not integer"); + + taxid = *INTEGER(Rtaxid); + + if (! (taxid > 0)) + error("taxid not positive"); + + if (! isLogical(Rname)) + error("name not logical"); + name = LOGICAL(Rname)[0]; + + taxon = eco_findtaxonbytaxid(ptax, taxid); + + if (!taxon) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + rep = eco_getgenus(taxon,ptax); + + if (!rep) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + if (name) + return mkString(rep->name); + else + return ScalarInteger(rep->taxid); +} + +SEXP R_get_family(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname) +{ + ecotx_t *taxon; + ecotx_t *rep; + ecotaxonomy_t *ptax; + int taxid; + int name; +// SEXP scname; + + ptax = getTaxPointer(Rtaxonomy); + + if (! isInteger(Rtaxid)) + error("taxid not integer"); + + taxid = *INTEGER(Rtaxid); + + if (! (taxid > 0)) + error("taxid not positive"); + + if (! isLogical(Rname)) + error("name not logical"); + name = LOGICAL(Rname)[0]; + + taxon = eco_findtaxonbytaxid(ptax, taxid); + + if (!taxon) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + rep = eco_getfamily(taxon,ptax); + + if (!rep) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + if (name) + return mkString(rep->name); + else + return ScalarInteger(rep->taxid); +} + +SEXP R_get_kingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname) +{ + ecotx_t *taxon; + ecotx_t *rep; + ecotaxonomy_t *ptax; + int taxid; + int name; +// SEXP scname; + + ptax = getTaxPointer(Rtaxonomy); + + if (! isInteger(Rtaxid)) + error("taxid not integer"); + + taxid = *INTEGER(Rtaxid); + + if (! (taxid > 0)) + error("taxid not positive"); + + if (! isLogical(Rname)) + error("name not logical"); + name = LOGICAL(Rname)[0]; + + taxon = eco_findtaxonbytaxid(ptax, taxid); + + if (!taxon) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + rep = eco_getkingdom(taxon,ptax); + + if (!rep) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + if (name) + return mkString(rep->name); + else + return ScalarInteger(rep->taxid); +} + +SEXP R_get_superkingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname) +{ + ecotx_t *taxon; + ecotx_t *rep; + ecotaxonomy_t *ptax; + int taxid; + int name; +// SEXP scname; + + ptax = getTaxPointer(Rtaxonomy); + + if (! isInteger(Rtaxid)) + error("taxid not integer"); + + taxid = *INTEGER(Rtaxid); + + if (! (taxid > 0)) + error("taxid not positive"); + + if (! isLogical(Rname)) + error("name not logical"); + name = LOGICAL(Rname)[0]; + + taxon = eco_findtaxonbytaxid(ptax, taxid); + + if (!taxon) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + rep = eco_getsuperkingdom(taxon,ptax); + + if (!rep) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + if (name) + return mkString(rep->name); + else + return ScalarInteger(rep->taxid); +} + +SEXP R_get_parent(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname) +{ + ecotx_t *taxon; + ecotx_t *rep; + ecotaxonomy_t *ptax; + int taxid; + int name; +// SEXP scname; + + ptax = getTaxPointer(Rtaxonomy); + + if (! isInteger(Rtaxid)) + error("taxid not integer"); + + taxid = *INTEGER(Rtaxid); + + if (! (taxid > 0)) + error("taxid not positive"); + + if (! isLogical(Rname)) + error("name not logical"); + name = LOGICAL(Rname)[0]; + + taxon = eco_findtaxonbytaxid(ptax, taxid); + + if (!taxon) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + rep = taxon->parent; + + if (rep->taxid==taxid) + { + if (name) + return ScalarString(R_NaString); + else + return ScalarInteger(R_NaInt); + } + + if (name) + return mkString(rep->name); + else + return ScalarInteger(rep->taxid); +} + + +SEXP R_validate_taxid(SEXP Rtaxonomy,SEXP Rtaxid) +{ + ecotx_t *taxon; + ecotaxonomy_t *ptax; + int taxid; +// int name; +// SEXP scname; + + ptax = getTaxPointer(Rtaxonomy); + + if (! isInteger(Rtaxid)) + error("taxid not integer"); + + taxid = *INTEGER(Rtaxid); + + if (! (taxid > 0)) + return ScalarInteger(R_NaInt); + + taxon = eco_findtaxonbytaxid(ptax, taxid); + + if (!taxon) + return ScalarInteger(R_NaInt); + else + return ScalarInteger(taxon->taxid); +} + + +SEXP R_is_under_taxon(SEXP Rtaxonomy, SEXP Rtaxid, SEXP Rparent) +{ + ecotx_t *taxon; + ecotaxonomy_t *ptax; + int taxid; + int *ptaxid; + int ntaxid; + int parent; + SEXP rep; + int* prep; + size_t i; +// SEXP isunder; + + ptax = getTaxPointer(Rtaxonomy); + + if (! isInteger(Rparent)) + error("parent not integer"); + + if (GET_LENGTH(Rparent) > 1) + error("parent must have a length equal to one."); + + parent = *INTEGER(Rparent); + + if (! isInteger(Rtaxid)) + error("taxid not integer"); + + ntaxid = GET_LENGTH(Rtaxid); + rep = PROTECT(NEW_LOGICAL(ntaxid)); + prep=LOGICAL(rep); + + ptaxid = INTEGER_POINTER(Rtaxid); + + + if (parent > 0) { + taxon = eco_findtaxonbytaxid(ptax, parent); + if (taxon) { + for (i=0; i < ntaxid; i++) { + if (ptaxid[i] <= 0) + prep[i]=NA_LOGICAL; + else { + taxon = eco_findtaxonbytaxid(ptax, ptaxid[i]); + prep[i] = eco_isundertaxon(taxon, parent); + } + } + } + } + else + { + for (i=0; i < ntaxid; i++) + prep[i]=NA_LOGICAL; + } + + unprotect(1); + return rep; +} + + +SEXP R_longest_path(SEXP Rtaxonomy,SEXP Rtaxid) +{ + ecotx_t *taxon; + ecotaxonomy_t *ptax; + int taxid; +// int name; +// SEXP scname; + + ptax = getTaxPointer(Rtaxonomy); + + if (! isInteger(Rtaxid)) + error("taxid not integer"); + + taxid = *INTEGER(Rtaxid); + + if (taxid <= 0) + return ScalarInteger(R_NaInt); + + taxon = eco_findtaxonbytaxid(ptax, taxid); + + if (!taxon) + return ScalarInteger(R_NaInt); + else + return ScalarInteger(taxon->farest); +} + +SEXP R_rank_list(SEXP Rtaxonomy) +{ + int nrank; + int i; + ecotaxonomy_t *ptax; + SEXP rNames; + + ptax = getTaxPointer(Rtaxonomy); + + nrank = ptax->ranks->count; + + rNames = PROTECT(allocVector(STRSXP, nrank)); + + for (i=0; i < nrank;i++) + SET_STRING_ELT(rNames, i, mkChar(ptax->ranks->label[i])); + + UNPROTECT(1); + + return rNames; +} + +SEXP R_taxid_list(SEXP Rtaxonomy) +{ + int ntaxid; + int i; + ecotaxonomy_t *ptax; + SEXP rTaxids; + + ptax = getTaxPointer(Rtaxonomy); + ntaxid = ptax->taxons->count; + rTaxids = PROTECT(allocVector(INTSXP, ntaxid)); + + for (i=0; i < ntaxid;i++) + INTEGER(rTaxids)[i]=ptax->taxons->taxon[i].taxid; + + UNPROTECT(1); + + return rTaxids; + +} + +SEXP R_max_taxid(SEXP Rtaxonomy) +{ +// int nrank; +// int i; + ecotaxonomy_t *ptax; +// SEXP rNames; + + ptax = getTaxPointer(Rtaxonomy); + + return ScalarInteger(ptax->taxons->maxtaxid); +} + +SEXP R_length_taxonomy(SEXP Rtaxonomy) +{ + ecotaxonomy_t *ptax; + + ptax = getTaxPointer(Rtaxonomy); + + return ScalarInteger(ptax->taxons->count); +} + +SEXP R_ecofind(SEXP Rtaxonomy, SEXP Rpattern, SEXP Rrank, SEXP Ralternative) +{ + ecotaxonomy_t *ptax; + econame_t *name; + char* pattern=NULL; + int re_match; + SEXP taxids; + int32_t* buffer; + int32_t tax_count = 0; + size_t j = 0; + int32_t rankfilter = 1; + int* ptaxid; + char *rankname=NULL; + int32_t nummatch = 0; + int32_t alternative = 0; + + size_t bsize; + + ptax = getTaxPointer(Rtaxonomy); + tax_count = ptax->taxons->count; + + if (! isString(Rpattern)) + error("pattern not a string"); + + pattern= (char*) CHAR(STRING_ELT(Rpattern,0)); + + if (! isNull(Rrank)) + { + if (! isString(Rrank)) + error("rank not a string"); + + rankname= (char*) CHAR(STRING_ELT(Rrank,0)); + } + + if (! isLogical(Ralternative)) + error("rank not a logical"); + + alternative = LOGICAL(Ralternative)[0]; + + + nummatch=0; + buffer = (int32_t*) malloc(100 * sizeof(int32_t)); + bsize=100; + + if (alternative && ptax->names!=NULL) + for (j=0,name=ptax->names->names; + j < ptax->names->count; + name++,j++) + { + if(rankname) + rankfilter = !(strcmp(rankname,ptax->ranks->label[name->taxon->rank])); + + re_match = slre_match(pattern, name->name, + strlen(name->name), + NULL, 0, + SLRE_IGNORE_CASE); + + if (re_match > 0 && rankfilter) + { + buffer[nummatch]=name->taxon->taxid; + nummatch++; + if (nummatch==bsize) { + bsize*=2; + buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t)); + if (buffer==0) + { + // regfree(&re_preg); + error("Cannot allocate memory for the taxid list"); + } + } + } + + } + else + for (j=0; j < ptax->taxons->count;j++) + { + if(rankname) + rankfilter = !(strcmp(rankname,ptax->ranks->label[ptax->taxons->taxon[j].rank])); + +// re_match = regexec (&re_preg, ptax->taxons->taxon[j].name, 0, NULL, 0); + re_match = slre_match(pattern, ptax->taxons->taxon[j].name, + strlen(ptax->taxons->taxon[j].name), + NULL, 0, + SLRE_IGNORE_CASE); + + +// if (!re_match && rankfilter) + if (re_match > 0 && rankfilter) + { + buffer[nummatch]=ptax->taxons->taxon[j].taxid; + nummatch++; + if (nummatch==bsize) { + bsize*=2; + buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t)); + if (buffer==0) + { + // regfree(&re_preg); + error("Cannot allocate memory for the taxid list"); + } + } + } + + } + + //regfree(&re_preg); + + taxids = PROTECT(NEW_INTEGER(nummatch)); + ptaxid = INTEGER(taxids); + + for (j=0; j < nummatch; j++) + ptaxid[j]=buffer[j]; + + free(buffer); + + UNPROTECT(1); + return taxids; +} diff --git a/src/robitax.h b/src/robitax.h new file mode 100644 index 0000000..52dec45 --- /dev/null +++ b/src/robitax.h @@ -0,0 +1,6 @@ +#include "ecoPCR.h" + + +ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy); +SEXP R_delete_taxonomy(SEXP Rtaxonomy); + diff --git a/src/robitax.o b/src/robitax.o new file mode 100644 index 0000000..6a95c65 Binary files /dev/null and b/src/robitax.o differ diff --git a/src/slre.c b/src/slre.c new file mode 100755 index 0000000..4a7fd89 --- /dev/null +++ b/src/slre.c @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2004-2013 Sergey Lyubka + * Copyright (c) 2013 Cesanta Software Limited + * All rights reserved + * + * This library is dual-licensed: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. For the terms of this + * license, see . + * + * You are free to use this library under the terms of the GNU General + * Public License, but WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * Alternatively, you can license this library under a commercial + * license, as set out in . + */ + +#include +#include +#include + +#include "slre.h" + +#define MAX_BRANCHES 100 +#define MAX_BRACKETS 100 +#define FAIL_IF(condition, error_code) if (condition) return (error_code) + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(ar) (sizeof(ar) / sizeof((ar)[0])) +#endif + +#ifdef SLRE_DEBUG +#define DBG(x) printf x +#else +#define DBG(x) +#endif + +struct bracket_pair { + const char *ptr; /* Points to the first char after '(' in regex */ + int len; /* Length of the text between '(' and ')' */ + int branches; /* Index in the branches array for this pair */ + int num_branches; /* Number of '|' in this bracket pair */ +}; + +struct branch { + int bracket_index; /* index for 'struct bracket_pair brackets' */ + /* array defined below */ + const char *schlong; /* points to the '|' character in the regex */ +}; + +struct regex_info { + /* + * Describes all bracket pairs in the regular expression. + * First entry is always present, and grabs the whole regex. + */ + struct bracket_pair brackets[MAX_BRACKETS]; + int num_brackets; + + /* + * Describes alternations ('|' operators) in the regular expression. + * Each branch falls into a specific branch pair. + */ + struct branch branches[MAX_BRANCHES]; + int num_branches; + + /* Array of captures provided by the user */ + struct slre_cap *caps; + int num_caps; + + /* E.g. SLRE_IGNORE_CASE. See enum below */ + int flags; +}; + +static int is_metacharacter(const unsigned char *s) { + static const char *metacharacters = "^$().[]*+?|\\Ssdbfnrtv"; + return strchr(metacharacters, *s) != NULL; +} + +static int op_len(const char *re) { + return re[0] == '\\' && re[1] == 'x' ? 4 : re[0] == '\\' ? 2 : 1; +} + +static int set_len(const char *re, int re_len) { + int len = 0; + + while (len < re_len && re[len] != ']') { + len += op_len(re + len); + } + + return len <= re_len ? len + 1 : -1; +} + +static int get_op_len(const char *re, int re_len) { + return re[0] == '[' ? set_len(re + 1, re_len - 1) + 1 : op_len(re); +} + +static int is_quantifier(const char *re) { + return re[0] == '*' || re[0] == '+' || re[0] == '?'; +} + +static int toi(int x) { + return isdigit(x) ? x - '0' : x - 'W'; +} + +static int hextoi(const unsigned char *s) { + return (toi(tolower(s[0])) << 4) | toi(tolower(s[1])); +} + +static int match_op(const unsigned char *re, const unsigned char *s, + struct regex_info *info) { + int result = 0; + switch (*re) { + case '\\': + /* Metacharacters */ + switch (re[1]) { + case 'S': FAIL_IF(isspace(*s), SLRE_NO_MATCH); result++; break; + case 's': FAIL_IF(!isspace(*s), SLRE_NO_MATCH); result++; break; + case 'd': FAIL_IF(!isdigit(*s), SLRE_NO_MATCH); result++; break; + case 'b': FAIL_IF(*s != '\b', SLRE_NO_MATCH); result++; break; + case 'f': FAIL_IF(*s != '\f', SLRE_NO_MATCH); result++; break; + case 'n': FAIL_IF(*s != '\n', SLRE_NO_MATCH); result++; break; + case 'r': FAIL_IF(*s != '\r', SLRE_NO_MATCH); result++; break; + case 't': FAIL_IF(*s != '\t', SLRE_NO_MATCH); result++; break; + case 'v': FAIL_IF(*s != '\v', SLRE_NO_MATCH); result++; break; + + case 'x': + /* Match byte, \xHH where HH is hexadecimal byte representaion */ + FAIL_IF(hextoi(re + 2) != *s, SLRE_NO_MATCH); + result++; + break; + + default: + /* Valid metacharacter check is done in bar() */ + FAIL_IF(re[1] != s[0], SLRE_NO_MATCH); + result++; + break; + } + break; + + case '|': FAIL_IF(1, SLRE_INTERNAL_ERROR); break; + case '$': FAIL_IF(1, SLRE_NO_MATCH); break; + case '.': result++; break; + + default: + if (info->flags & SLRE_IGNORE_CASE) { + FAIL_IF(tolower(*re) != tolower(*s), SLRE_NO_MATCH); + } else { + FAIL_IF(*re != *s, SLRE_NO_MATCH); + } + result++; + break; + } + + return result; +} + +static int match_set(const char *re, int re_len, const char *s, + struct regex_info *info) { + int len = 0, result = -1, invert = re[0] == '^'; + + if (invert) re++, re_len--; + + while (len <= re_len && re[len] != ']' && result <= 0) { + /* Support character range */ + if (re[len] != '-' && re[len + 1] == '-' && re[len + 2] != ']' && + re[len + 2] != '\0') { + result = info->flags && SLRE_IGNORE_CASE ? + *s >= re[len] && *s <= re[len + 2] : + tolower(*s) >= tolower(re[len]) && tolower(*s) <= tolower(re[len + 2]); + len += 3; + } else { + result = match_op((unsigned char *) re + len, (unsigned char *) s, info); + len += op_len(re + len); + } + } + return (!invert && result > 0) || (invert && result <= 0) ? 1 : -1; +} + +static int doh(const char *s, int s_len, struct regex_info *info, int bi); + +static int bar(const char *re, int re_len, const char *s, int s_len, + struct regex_info *info, int bi) { + /* i is offset in re, j is offset in s, bi is brackets index */ + int i, j, n, step; + + for (i = j = 0; i < re_len && j <= s_len; i += step) { + + /* Handle quantifiers. Get the length of the chunk. */ + step = re[i] == '(' ? info->brackets[bi + 1].len + 2 : + get_op_len(re + i, re_len - i); + + DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n", __func__, + re_len - i, re + i, s_len - j, s + j, re_len, step, i, j)); + + FAIL_IF(is_quantifier(&re[i]), SLRE_UNEXPECTED_QUANTIFIER); + FAIL_IF(step <= 0, SLRE_INVALID_CHARACTER_SET); + + if (i + step < re_len && is_quantifier(re + i + step)) { + DBG(("QUANTIFIER: [%.*s]%c [%.*s]\n", step, re + i, + re[i + step], s_len - j, s + j)); + if (re[i + step] == '?') { + int result = bar(re + i, step, s + j, s_len - j, info, bi); + j += result > 0 ? result : 0; + i++; + } else if (re[i + step] == '+' || re[i + step] == '*') { + int j2 = j, nj = j, n1, n2 = -1, ni, non_greedy = 0; + + /* Points to the regexp code after the quantifier */ + ni = i + step + 1; + if (ni < re_len && re[ni] == '?') { + non_greedy = 1; + ni++; + } + + do { + if ((n1 = bar(re + i, step, s + j2, s_len - j2, info, bi)) > 0) { + j2 += n1; + } + if (re[i + step] == '+' && n1 < 0) break; + + if (ni >= re_len) { + /* After quantifier, there is nothing */ + nj = j2; + } else if ((n2 = bar(re + ni, re_len - ni, s + j2, + s_len - j2, info, bi)) >= 0) { + /* Regex after quantifier matched */ + nj = j2 + n2; + } + if (nj > j && non_greedy) break; + } while (n1 > 0); + + if (n1 < 0 && re[i + step] == '*' && + (n2 = bar(re + ni, re_len - ni, s + j, s_len - j, info, bi)) > 0) { + nj = j + n2; + } + + DBG(("STAR/PLUS END: %d %d %d %d %d\n", j, nj, re_len - ni, n1, n2)); + FAIL_IF(re[i + step] == '+' && nj == j, SLRE_NO_MATCH); + + /* If while loop body above was not executed for the * quantifier, */ + /* make sure the rest of the regex matches */ + FAIL_IF(nj == j && ni < re_len && n2 < 0, SLRE_NO_MATCH); + + /* Returning here cause we've matched the rest of RE already */ + return nj; + } + continue; + } + + if (re[i] == '[') { + n = match_set(re + i + 1, re_len - (i + 2), s + j, info); + DBG(("SET %.*s [%.*s] -> %d\n", step, re + i, s_len - j, s + j, n)); + FAIL_IF(n <= 0, SLRE_NO_MATCH); + j += n; + } else if (re[i] == '(') { + n = SLRE_NO_MATCH; + bi++; + FAIL_IF(bi >= info->num_brackets, SLRE_INTERNAL_ERROR); + DBG(("CAPTURING [%.*s] [%.*s] [%s]\n", + step, re + i, s_len - j, s + j, re + i + step)); + + if (re_len - (i + step) <= 0) { + /* Nothing follows brackets */ + n = doh(s + j, s_len - j, info, bi); + } else { + int j2; + for (j2 = 0; j2 <= s_len - j; j2++) { + if ((n = doh(s + j, s_len - (j + j2), info, bi)) >= 0 && + bar(re + i + step, re_len - (i + step), + s + j + n, s_len - (j + n), info, bi) >= 0) break; + } + } + + DBG(("CAPTURED [%.*s] [%.*s]:%d\n", step, re + i, s_len - j, s + j, n)); + FAIL_IF(n < 0, n); + if (info->caps != NULL) { + info->caps[bi - 1].ptr = s + j; + info->caps[bi - 1].len = n; + } + j += n; + } else if (re[i] == '^') { + FAIL_IF(j != 0, SLRE_NO_MATCH); + } else if (re[i] == '$') { + FAIL_IF(j != s_len, SLRE_NO_MATCH); + } else { + FAIL_IF(j >= s_len, SLRE_NO_MATCH); + n = match_op((unsigned char *) (re + i), (unsigned char *) (s + j), info); + FAIL_IF(n <= 0, n); + j += n; + } + } + + return j; +} + +/* Process branch points */ +static int doh(const char *s, int s_len, struct regex_info *info, int bi) { + const struct bracket_pair *b = &info->brackets[bi]; + int i = 0, len, result; + const char *p; + + do { + p = i == 0 ? b->ptr : info->branches[b->branches + i - 1].schlong + 1; + len = b->num_branches == 0 ? b->len : + i == b->num_branches ? (int) (b->ptr + b->len - p) : + (int) (info->branches[b->branches + i].schlong - p); + DBG(("%s %d %d [%.*s] [%.*s]\n", __func__, bi, i, len, p, s_len, s)); + result = bar(p, len, s, s_len, info, bi); + DBG(("%s <- %d\n", __func__, result)); + } while (result <= 0 && i++ < b->num_branches); /* At least 1 iteration */ + + return result; +} + +static int baz(const char *s, int s_len, struct regex_info *info) { + int i, result = -1, is_anchored = info->brackets[0].ptr[0] == '^'; + + for (i = 0; i <= s_len; i++) { + result = doh(s + i, s_len - i, info, 0); + if (result >= 0) { + result += i; + break; + } + if (is_anchored) break; + } + + return result; +} + +static void setup_branch_points(struct regex_info *info) { + int i, j; + struct branch tmp; + + /* First, sort branches. Must be stable, no qsort. Use bubble algo. */ + for (i = 0; i < info->num_branches; i++) { + for (j = i + 1; j < info->num_branches; j++) { + if (info->branches[i].bracket_index > info->branches[j].bracket_index) { + tmp = info->branches[i]; + info->branches[i] = info->branches[j]; + info->branches[j] = tmp; + } + } + } + + /* + * For each bracket, set their branch points. This way, for every bracket + * (i.e. every chunk of regex) we know all branch points before matching. + */ + for (i = j = 0; i < info->num_brackets; i++) { + info->brackets[i].num_branches = 0; + info->brackets[i].branches = j; + while (j < info->num_branches && info->branches[j].bracket_index == i) { + info->brackets[i].num_branches++; + j++; + } + } +} + +static int foo(const char *re, int re_len, const char *s, int s_len, + struct regex_info *info) { + int i, step, depth = 0; + + /* First bracket captures everything */ + info->brackets[0].ptr = re; + info->brackets[0].len = re_len; + info->num_brackets = 1; + + /* Make a single pass over regex string, memorize brackets and branches */ + for (i = 0; i < re_len; i += step) { + step = get_op_len(re + i, re_len - i); + + if (re[i] == '|') { + FAIL_IF(info->num_branches >= (int) ARRAY_SIZE(info->branches), + SLRE_TOO_MANY_BRANCHES); + info->branches[info->num_branches].bracket_index = + info->brackets[info->num_brackets - 1].len == -1 ? + info->num_brackets - 1 : depth; + info->branches[info->num_branches].schlong = &re[i]; + info->num_branches++; + } else if (re[i] == '\\') { + FAIL_IF(i >= re_len - 1, SLRE_INVALID_METACHARACTER); + if (re[i + 1] == 'x') { + /* Hex digit specification must follow */ + FAIL_IF(re[i + 1] == 'x' && i >= re_len - 3, + SLRE_INVALID_METACHARACTER); + FAIL_IF(re[i + 1] == 'x' && !(isxdigit(re[i + 2]) && + isxdigit(re[i + 3])), SLRE_INVALID_METACHARACTER); + } else { + FAIL_IF(!is_metacharacter((unsigned char *) re + i + 1), + SLRE_INVALID_METACHARACTER); + } + } else if (re[i] == '(') { + FAIL_IF(info->num_brackets >= (int) ARRAY_SIZE(info->brackets), + SLRE_TOO_MANY_BRACKETS); + depth++; /* Order is important here. Depth increments first. */ + info->brackets[info->num_brackets].ptr = re + i + 1; + info->brackets[info->num_brackets].len = -1; + info->num_brackets++; + FAIL_IF(info->num_caps > 0 && info->num_brackets - 1 > info->num_caps, + SLRE_CAPS_ARRAY_TOO_SMALL); + } else if (re[i] == ')') { + int ind = info->brackets[info->num_brackets - 1].len == -1 ? + info->num_brackets - 1 : depth; + info->brackets[ind].len = (int) (&re[i] - info->brackets[ind].ptr); + DBG(("SETTING BRACKET %d [%.*s]\n", + ind, info->brackets[ind].len, info->brackets[ind].ptr)); + depth--; + FAIL_IF(depth < 0, SLRE_UNBALANCED_BRACKETS); + FAIL_IF(i > 0 && re[i - 1] == '(', SLRE_NO_MATCH); + } + } + + FAIL_IF(depth != 0, SLRE_UNBALANCED_BRACKETS); + setup_branch_points(info); + + return baz(s, s_len, info); +} + +int slre_match(const char *regexp, const char *s, int s_len, + struct slre_cap *caps, int num_caps, int flags) { + struct regex_info info; + + /* Initialize info structure */ + info.flags = flags; + info.num_brackets = info.num_branches = 0; + info.num_caps = num_caps; + info.caps = caps; + + DBG(("========================> [%s] [%.*s]\n", regexp, s_len, s)); + return foo(regexp, (int) strlen(regexp), s, s_len, &info); +} diff --git a/src/slre.h b/src/slre.h new file mode 100755 index 0000000..8929574 --- /dev/null +++ b/src/slre.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2004-2013 Sergey Lyubka + * Copyright (c) 2013 Cesanta Software Limited + * All rights reserved + * + * This library is dual-licensed: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. For the terms of this + * license, see . + * + * You are free to use this library under the terms of the GNU General + * Public License, but WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * Alternatively, you can license this library under a commercial + * license, as set out in . + */ + +/* + * This is a regular expression library that implements a subset of Perl RE. + * Please refer to README.md for a detailed reference. + */ + +#ifndef SLRE_HEADER_DEFINED +#define SLRE_HEADER_DEFINED + +#ifdef __cplusplus +extern "C" { +#endif + +struct slre_cap { + const char *ptr; + int len; +}; + + +int slre_match(const char *regexp, const char *buf, int buf_len, + struct slre_cap *caps, int num_caps, int flags); + +/* Possible flags for slre_match() */ +enum { SLRE_IGNORE_CASE = 1 }; + + +/* slre_match() failure codes */ +#define SLRE_NO_MATCH -1 +#define SLRE_UNEXPECTED_QUANTIFIER -2 +#define SLRE_UNBALANCED_BRACKETS -3 +#define SLRE_INTERNAL_ERROR -4 +#define SLRE_INVALID_CHARACTER_SET -5 +#define SLRE_INVALID_METACHARACTER -6 +#define SLRE_CAPS_ARRAY_TOO_SMALL -7 +#define SLRE_TOO_MANY_BRANCHES -8 +#define SLRE_TOO_MANY_BRACKETS -9 + +#ifdef __cplusplus +} +#endif + +#endif /* SLRE_HEADER_DEFINED */ diff --git a/src/slre.o b/src/slre.o new file mode 100644 index 0000000..2890c56 Binary files /dev/null and b/src/slre.o differ