initial commit
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
/man/
|
||||
/vignettes/
|
||||
/Read-and-delete-me
|
||||
+20
@@ -0,0 +1,20 @@
|
||||
Package: ROBITaxonomy
|
||||
Type: Package
|
||||
Title: Metabarcoding data biodiversity analysis
|
||||
Version: 0.1
|
||||
Date: 2012-08-23
|
||||
Author: LECA - Laboratoire d'ecologie alpine
|
||||
Maintainer: LECA OBITools team <obitools@metabarcoding.org>
|
||||
Description: More about what it does (maybe more than one line)
|
||||
License: CeCILL v2.0
|
||||
LazyLoad: yes
|
||||
Roxygen: list(wrap = FALSE)
|
||||
Collate:
|
||||
'ROBITaxonomy.R'
|
||||
'taxonomy.R'
|
||||
'basic.R'
|
||||
'default.R'
|
||||
'distance.R'
|
||||
'lca.R'
|
||||
'rank.R'
|
||||
RoxygenNote: 5.0.1
|
||||
@@ -0,0 +1,28 @@
|
||||
# Generated by roxygen2: do not edit by hand
|
||||
|
||||
export(default.taxonomy)
|
||||
export(distance.taxonomy)
|
||||
export(ecofind)
|
||||
export(family)
|
||||
export(genus)
|
||||
export(is.obitools.taxonomy)
|
||||
export(is.subcladeof)
|
||||
export(kingdom)
|
||||
export(length.obitools.taxonomy)
|
||||
export(longest.path)
|
||||
export(lowest.common.ancestor)
|
||||
export(max.obitools.taxonomy)
|
||||
export(parent)
|
||||
export(path)
|
||||
export(rank.list)
|
||||
export(read.taxonomy)
|
||||
export(scientificname)
|
||||
export(species)
|
||||
export(superkingdom)
|
||||
export(taxid.list)
|
||||
export(taxonatrank)
|
||||
export(taxonomicrank)
|
||||
export(validate)
|
||||
exportClasses(obitools.taxonomy)
|
||||
exportClasses(obitools.taxonomyOrNULL)
|
||||
useDynLib(ROBITaxonomy)
|
||||
@@ -0,0 +1,359 @@
|
||||
#' @include taxonomy.R
|
||||
NULL
|
||||
|
||||
#' @export
|
||||
setGeneric("scientificname", function(taxonomy,taxid) {
|
||||
return(standardGeneric("scientificname"))
|
||||
})
|
||||
|
||||
#' Returns the scientific name corresponding to a \emph{NCBI taxid}
|
||||
#'
|
||||
#' \code{scientificname} function in package \pkg{\link{ROBITaxonomy}} returns the
|
||||
#' scientific name corresponding to a \emph{NCBI taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\link{obitools.taxonomy}} instance
|
||||
#' @param taxid an integer value or a vector of integer representing NCBI
|
||||
#' taxonomic identifiers.
|
||||
#' @return The scientific name of the corresponding taxon as a string or a
|
||||
#' vector of string if the \code{taxid} argument is itself a vector
|
||||
#'
|
||||
#' @examples
|
||||
#' # load the default taxonomy database include in the ROBITaxonomy library
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the scientific names correponding to these taxids
|
||||
#' scientificname(taxo,sp.taxid)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname scientificname-methods
|
||||
#' @aliases scientificname-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#'
|
||||
setMethod("scientificname", "obitools.taxonomy",function(taxonomy,taxid) {
|
||||
getscname = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return( .Call('R_get_scientific_name',
|
||||
taxonomy,
|
||||
t,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
sapply(taxid,getscname)
|
||||
})
|
||||
|
||||
|
||||
######################################################################
|
||||
######################################################################
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("parent", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("parent"))
|
||||
})
|
||||
|
||||
#' Returns the parent taxon corresponding to a \emph{NCBI taxid}
|
||||
#'
|
||||
#' \code{parent} function in package \pkg{\link{ROBITaxonomy}} returns the
|
||||
#' parent taxon corresponding to a \emph{NCBI taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\link{obitools.taxonomy}} instance
|
||||
#' @param taxid an integer value or a vector of integer representing NCBI
|
||||
#' taxonomic identifiers.
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating if the
|
||||
#' method returns a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{ \item{If \code{name==FALSE}}{the taxid of the
|
||||
#' parent taxon as an integer or a vector of
|
||||
#' integers if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the
|
||||
#' parent taxon as a string or a vector of
|
||||
#' string if the \code{taxid} argument is itself a
|
||||
#' vector} }
|
||||
#'
|
||||
#' @examples
|
||||
#' # load the default taxonomy database include in the ROBITaxonomy library
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the parent taxa correponding to these taxids
|
||||
#' parent(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same things but scientific names are returned
|
||||
#' parent(taxo,sp.taxid,TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname parent-methods
|
||||
#' @aliases parent-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#'
|
||||
setMethod("parent", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_get_parent',
|
||||
taxonomy,
|
||||
as.integer(t),
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
name = as.logical(name[1])
|
||||
sapply(taxid,getp)
|
||||
})
|
||||
|
||||
|
||||
|
||||
######################################################################
|
||||
######################################################################
|
||||
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("taxid.list", function(taxonomy) {
|
||||
return(standardGeneric("taxid.list"))
|
||||
})
|
||||
|
||||
|
||||
#' Returns the list of all taxids belonging the taxonomy.
|
||||
#'
|
||||
#' \code{taxid.list} returns the list of all taxids included in the
|
||||
#' instance of the class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
|
||||
#'
|
||||
#' @return an \code{integer} vector containing the list of taxids.
|
||||
#'
|
||||
#' @examples
|
||||
#' # loads the default taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # returns the count of taxa described in the taxonomy
|
||||
#' length(taxo)
|
||||
#'
|
||||
#' # extracts the list of all valid taxids
|
||||
#' good = taxid.list(taxo)
|
||||
#'
|
||||
#' # returns the size of the returned list
|
||||
#' length(good)
|
||||
#'
|
||||
#' @seealso \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @docType methods
|
||||
#' @rdname taxid.list-method
|
||||
#' @aliases taxid.list
|
||||
#'
|
||||
setMethod("taxid.list", "obitools.taxonomy",
|
||||
function(taxonomy) {
|
||||
return(.Call('R_taxid_list',
|
||||
taxonomy,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
})
|
||||
|
||||
######################################################################
|
||||
######################################################################
|
||||
|
||||
|
||||
|
||||
#' Returns the count of taxa in the taxonomy.
|
||||
#'
|
||||
#' \code{length} returns the count of taxa included in the
|
||||
#' instance of the class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @param x the \code{\linkS4class{obitools.taxonomy}} to use.
|
||||
#'
|
||||
#' @return an \code{integer} corresponding to the count of taxa.
|
||||
#'
|
||||
#' @examples
|
||||
#' # loads the default taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # returns the count of taxa described in the taxonomy
|
||||
#' length(taxo)
|
||||
#'
|
||||
#' @seealso \code{\link{length}}, \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @export length.obitools.taxonomy
|
||||
#'
|
||||
length.obitools.taxonomy = function(x)
|
||||
{
|
||||
return(.Call('R_length_taxonomy',
|
||||
x,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
|
||||
######################################################################
|
||||
######################################################################
|
||||
|
||||
setGeneric('max')
|
||||
|
||||
#' Returns the maximum taxid in the taxonomy.
|
||||
#'
|
||||
#' \code{length} returns the maximum taxid included in the
|
||||
#' instance of the class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
|
||||
#' @param na.rm included for compatibility purpose, this parameter as
|
||||
#' no effect on this implementation of \code{max}
|
||||
#'
|
||||
#' @return an \code{integer} corresponding to the count of taxa.
|
||||
#'
|
||||
#' @examples
|
||||
#' # load the default taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # gets the larger taxid of the database
|
||||
#' max(taxo)
|
||||
#'
|
||||
#' @seealso \code{\link{max}}, \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @export max.obitools.taxonomy
|
||||
#'
|
||||
max.obitools.taxonomy=function(taxonomy,na.rm = FALSE) {
|
||||
return(.Call('R_max_taxid',
|
||||
taxonomy,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
#' @export
|
||||
setGeneric("ecofind", function(taxonomy,patterns,rank=NULL,alternative=FALSE) {
|
||||
return(standardGeneric("ecofind"))
|
||||
})
|
||||
|
||||
#' Returns taxids associated to the names
|
||||
#'
|
||||
#' Return the set of taxids having their name matching the given pattern.
|
||||
#'
|
||||
#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
|
||||
#' @param patterns one or several regular pattern used to select the the taxa.
|
||||
#' @param rank a \code{character} indicating a taxonomic rank. If not \code{NULL}
|
||||
#' only taxids correponding to this rank are returned.
|
||||
#' @param alternative A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the function must only look for a scientific name.
|
||||
#'
|
||||
#' @return if just one pattern is given, an integer vector is returned with the
|
||||
#' corresponding taxids. If a list of patterns is given, the function
|
||||
#' returns a list of integer vectors, each vector containing the taxids
|
||||
#' corresponding to a pattern. The returned list is in the same order
|
||||
#' than the given patern list.
|
||||
#'
|
||||
#' @examples
|
||||
#' # load the default taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # retreives the Vertebrata taxid
|
||||
#' taxid = ecofind(taxo,"Vertebrata")
|
||||
#'
|
||||
#' taxid
|
||||
#' scientificname(taxo,taxid)
|
||||
#'
|
||||
#'
|
||||
#' taxid = ecofind(taxo,"^Vertebrata$")
|
||||
#'
|
||||
#' taxid
|
||||
#' scientificname(taxo,taxid)
|
||||
#'
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @docType methods
|
||||
#' @rdname ecofind-method
|
||||
#' @aliases ecofind,obitools.taxonomy
|
||||
#'
|
||||
setMethod("ecofind", "obitools.taxonomy",function(taxonomy,patterns,rank=NULL,alternative=FALSE) {
|
||||
getp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(unique(.Call('R_ecofind',
|
||||
taxonomy,
|
||||
t,
|
||||
rank,
|
||||
alternative,
|
||||
PACKAGE="ROBITaxonomy")))
|
||||
}
|
||||
|
||||
patterns = as.character(patterns)
|
||||
taxid=lapply(patterns,getp)
|
||||
if (length(taxid)==1)
|
||||
taxid=taxid[[1]]
|
||||
|
||||
return(taxid)
|
||||
})
|
||||
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("validate", function(taxonomy,taxid) {
|
||||
return(standardGeneric("validate"))
|
||||
})
|
||||
|
||||
#' Checks that a \emph{taxid} is really present in taxonomy
|
||||
#'
|
||||
#' \code{validate} function in package \pkg{\link{ROBITaxonomy}} checks
|
||||
#' that a \emph{taxid} is declared in the considered taxonomy.
|
||||
#'
|
||||
#' @param taxonomy a \code{\link{obitools.taxonomy}} instance
|
||||
#' @param taxid an integer value or a vector of integer representing NCBI
|
||||
#' taxonomic identifiers.
|
||||
#'
|
||||
#' @return The taxid if it exists, NA otherwise. If the input taxid is a
|
||||
#' vector of integer returns an integer vector composed of validated
|
||||
#' taxids and NA values.
|
||||
#'
|
||||
#' @examples
|
||||
#' # load the default taxonomy database include in the ROBITaxonomy library
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 101 taxids
|
||||
#' sp.taxid=c(7000:7100)
|
||||
#'
|
||||
#' # checks the list of taxids
|
||||
#' validate(taxo,sp.taxid)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname validate-methods
|
||||
#' @aliases validate-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#'
|
||||
setMethod("validate", "obitools.taxonomy",function(taxonomy,taxid) {
|
||||
getp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_validate_taxid',
|
||||
taxonomy,
|
||||
t,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
sapply(taxid,getp)
|
||||
})
|
||||
|
||||
+52
@@ -0,0 +1,52 @@
|
||||
#' @include taxonomy.R
|
||||
NULL
|
||||
|
||||
|
||||
#
|
||||
#
|
||||
# Manage le loading of the default taxonomy
|
||||
#
|
||||
#
|
||||
|
||||
.__default__taxonomy__ = NULL
|
||||
|
||||
#' Returns the default taxonomy
|
||||
#'
|
||||
#' Returns a \code{\linkS4class{obitools.taxonomy}} instance corresponding
|
||||
#' to a NCBI taxonomy included by default in the \pkg{\link{ROBITaxonomy}} package.
|
||||
#'
|
||||
#' @return a \code{\linkS4class{obitools.taxonomy}} instance.
|
||||
#'
|
||||
#' @examples
|
||||
#'
|
||||
#' # Load the default taxonomy
|
||||
#' taxo = default.taxonomy()
|
||||
#'
|
||||
#' # and use it for requesting a scientific name
|
||||
#' scientificname(taxo,7742)
|
||||
#'
|
||||
#' @seealso \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @export
|
||||
#'
|
||||
default.taxonomy = function() {
|
||||
if (is.null(get(".__default__taxonomy__",envir = environment())))
|
||||
assign(".__default__taxonomy__",
|
||||
read.taxonomy(paste(system.file("extdata",
|
||||
package="ROBITaxonomy"),
|
||||
'ncbitaxo',
|
||||
sep='/')),
|
||||
envir=globalenv())
|
||||
|
||||
return(get(".__default__taxonomy__",envir = globalenv()))
|
||||
}
|
||||
|
||||
|
||||
#' @export
|
||||
#'
|
||||
is.obitools.taxonomy = function(taxonomy) {
|
||||
class(t)[1] == "obitools.taxonomy"
|
||||
}
|
||||
|
||||
+189
@@ -0,0 +1,189 @@
|
||||
#' @include taxonomy.R
|
||||
NULL
|
||||
|
||||
#' @export
|
||||
setGeneric("longest.path", function(taxonomy,taxid) {
|
||||
return(standardGeneric("longest.path"))
|
||||
})
|
||||
|
||||
#' Returns the longuest path from a taxon.
|
||||
#'
|
||||
#' The method \code{longest.path} returns the length of the
|
||||
#' path linking a taxid to the farest leaf belonging this taxid.
|
||||
#'
|
||||
#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
|
||||
#'
|
||||
#' @param taxid an \code{integer} vector containing the list of taxids.
|
||||
#'
|
||||
#' @return an \code{integer} vector containing the list length.
|
||||
#'
|
||||
#' @examples
|
||||
#' # loads the default taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # returns the longest path in the taxonomy (from the root node)
|
||||
#' longest.path(taxo,1)
|
||||
#'
|
||||
#'
|
||||
#' @seealso \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @docType methods
|
||||
#' @rdname longest.path-method
|
||||
#' @aliases longest.path,obitools.taxonomy
|
||||
#'
|
||||
setMethod("longest.path", "obitools.taxonomy",
|
||||
function(taxonomy,taxid) {
|
||||
getp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_longest_path',
|
||||
taxonomy,
|
||||
t,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
sapply(taxid,getp)
|
||||
})
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("distance.taxonomy", function(taxonomy,taxid1,taxid2=NULL,name=F) {
|
||||
return(standardGeneric("distance.taxonomy"))
|
||||
})
|
||||
|
||||
|
||||
#' Computes a distance matrix between taxids
|
||||
#'
|
||||
#' The method \code{taxonomy.distance} computes a distance matrix between a
|
||||
#' set of taxids. The distance between two taxa is based on the topology of
|
||||
#' the taxonomomy tree.
|
||||
#'
|
||||
#' \deqn{ d(Taxon_A,Taxon_B) = \frac{longest.path(lca(Taxon_A,Taxon_B))}{max(longest.path(Taxon_A),longest.path(Taxon_B))}}
|
||||
#' { longest.path(lca(Taxon_A,Taxon_B)) / max(longest.path(Taxon_A),longest.path(Taxon_B)) }
|
||||
#'
|
||||
#'
|
||||
#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
|
||||
#'
|
||||
#' @param taxid1 an \code{integer} vector containing a list of taxids.
|
||||
#'
|
||||
#' @param taxid2 an \code{integer} vector containing a list of taxids.
|
||||
#' If \code{taxid2} is set to \code{NULL} (it's default value)
|
||||
#' then the \code{taxid2} list is considered as equal to
|
||||
#' \code{taxid1} list.
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return distance matrix annotated by taxids or
|
||||
#' by scientific names.
|
||||
#'
|
||||
#' @return the distance matrix between taxids specified in the \code{taxid1}
|
||||
#' set and the \code{taxid2} set.
|
||||
#'
|
||||
#' @examples
|
||||
#' # loads the default taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # computes the distance matrix between taxids
|
||||
#' distance.taxonomy(taxo,sp.taxid)
|
||||
#'
|
||||
#' # Same thing but the matrix is annotated by scientific names
|
||||
#' distance.taxonomy(taxo,sp.taxid,name=TRUE)
|
||||
#'
|
||||
#' @seealso \code{\link{longest.path}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @docType methods
|
||||
#' @rdname distance.taxonomy-method
|
||||
#' @aliases taxonomy.distance,obitools.taxonomy
|
||||
#'
|
||||
setMethod("distance.taxonomy", "obitools.taxonomy",
|
||||
function(taxonomy,taxid1,taxid2=NULL,name=F) {
|
||||
taxdist = function(r)
|
||||
{
|
||||
t1=r[1]
|
||||
t2=r[2]
|
||||
if (is.na(t1) | is.na(t2))
|
||||
return(NA)
|
||||
|
||||
p1 = path(taxonomy,t1)
|
||||
p2 = path(taxonomy,t2)
|
||||
|
||||
minp = min(length(p1),length(p2))
|
||||
common = sum(p1[1:minp] == p2[1:minp])
|
||||
lca = p1[common]
|
||||
lp = longest.path(taxonomy,lca)
|
||||
return(lp/(lp+common))
|
||||
}
|
||||
|
||||
multitaxdist=function(t1,t2) {
|
||||
apply(data.frame(t1,t2),1,taxdist)
|
||||
}
|
||||
|
||||
taxid1 = taxid1[! is.na(validate(taxonomy,taxid1))]
|
||||
t1 = path(taxonomy,taxid1)
|
||||
|
||||
same = is.null(taxid2)
|
||||
|
||||
if (same)
|
||||
{
|
||||
ntaxon = length(taxid1)
|
||||
t2 = t1[unlist(sapply(2:ntaxon,
|
||||
function(x) x:ntaxon))]
|
||||
t1 = t1[rep(1:(ntaxon-1),(ntaxon-1):1)]
|
||||
}
|
||||
else
|
||||
{
|
||||
taxid2 = taxid2[! is.na(validate(taxonomy,taxid2))]
|
||||
t2 = path(taxonomy,taxid2)
|
||||
nt1 = length(taxid1)
|
||||
nt2 = length(taxid2)
|
||||
t1 = t1[rep(1:nt1,nt2)]
|
||||
t2 = t2[rep(1:nt2,rep(nt1,nt2))]
|
||||
}
|
||||
|
||||
lmin = mapply(function(a,b) min(length(a),length(b)),
|
||||
t1,
|
||||
t2)
|
||||
|
||||
llca = mapply(function(x,y,l) sum(x[1:l]==y[1:l]),
|
||||
t1,
|
||||
t2,
|
||||
lmin)
|
||||
|
||||
lb = longest.path(taxonomy,mapply(function(x,y) x[y],t1,llca))
|
||||
d = as.double(lb / (lb + llca))
|
||||
|
||||
if (same) {
|
||||
attr(d, "Size") <- ntaxon
|
||||
if (name)
|
||||
attr(d, "Labels") <- scientificname(taxonomy,taxid1)
|
||||
else
|
||||
attr(d, "Labels") <- as.character(taxid1)
|
||||
attr(d, "Diag") <- FALSE
|
||||
attr(d, "Upper") <- FALSE
|
||||
attr(d, "method") <- NULL
|
||||
attr(d, "call") <- match.call()
|
||||
class(d) <- "dist"
|
||||
}
|
||||
else {
|
||||
if (name)
|
||||
d = matrix(d,nt1,nt2,
|
||||
dimnames=list(scientificname(taxonomy,taxid1),
|
||||
scientificname(taxonomy,taxid2)))
|
||||
else
|
||||
d = matrix(d,nt1,nt2,
|
||||
dimnames=list(as.character(taxid1),
|
||||
as.character(taxid2)))
|
||||
|
||||
}
|
||||
|
||||
return(d)
|
||||
})
|
||||
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
#' @include taxonomy.R
|
||||
NULL
|
||||
|
||||
#' @export
|
||||
setGeneric("lowest.common.ancestor", function(taxonomy,taxid,threshold=1.0,error=0,name=FALSE) {
|
||||
return(standardGeneric("lowest.common.ancestor"))
|
||||
})
|
||||
|
||||
#' Computes the lowest common ancestor in the taxonomy tree between a set of taxa
|
||||
#'
|
||||
#' The \code{lowest.common.ancestor} function in package \pkg{ROBITaxonomy} computes
|
||||
#' the lowest common ancestor of a set of taxids. The lowest common ancestor (LCA)
|
||||
#' is the most precise taxonomic group shared by all the considered taxa. Tha
|
||||
#' \code{lowest.common.ancestor} function implemented in the \pkg{ROBITaxonomy}
|
||||
#' package, considers a fuzzy definition of the LCA as the most precise
|
||||
#' taxonomic group shared by a quorum of the considered taxa.
|
||||
#'
|
||||
#' @param taxonomy an instance of \code{\linkS4class{obitools.taxonomy}}
|
||||
#' @param taxid an integer value or a vector of integer representing NCBI
|
||||
#' taxonomic identifiers.
|
||||
#' @param threshold a numeric value between 0.0 and 1.0 indicating the minimum
|
||||
#' quorum of taxid that must belong the LCA.
|
||||
#' @param error an integer value indicating the maximum count of taxids that
|
||||
#' have not to belong the returned taxid. A \code{threshold} below 1.0 have
|
||||
#' priority on the \code{error} parameter.
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating if the
|
||||
#' method return a \emph{taxid} or a scientific name.
|
||||
#'
|
||||
#' @return Depending on the value of the \code{name} argument, set by default
|
||||
#' to \code{FALSE} the method returns :
|
||||
#' \describe{
|
||||
#' \item{If \code{name==FALSE}}{ the taxid of the taxon corresponding
|
||||
#' to the LCA as an integer value}
|
||||
#' \item{If \code{name==TRUE}}{ the scientific name of the taxon
|
||||
#' corresponding to the LCA as a string}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' require(ROBITaxonomy)
|
||||
#'
|
||||
#' \dontshow{# switch the working directory to the data package directory}
|
||||
#' \dontshow{setwd(system.file("extdata", package="ROBITaxonomy"))}
|
||||
#'
|
||||
#' # read the taxonomy database
|
||||
#'
|
||||
#' taxo=read.taxonomy('ncbitaxo')
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#'
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the lowest common ancestor taxids
|
||||
#'
|
||||
#' lowest.common.ancestor(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' lowest.common.ancestor(taxo,sp.taxid,name=TRUE)
|
||||
#'
|
||||
#' # If we accept than 2 or 1 taxa do not belong the LCA
|
||||
#' lowest.common.ancestor(taxo,sp.taxid,name=TRUE,error=2)
|
||||
#' lowest.common.ancestor(taxo,sp.taxid,name=TRUE,error=1)
|
||||
#'
|
||||
#' # Partial LCA can also be speciefied as the minimal frequency of
|
||||
#' # taxa belonging the LCA
|
||||
#' lowest.common.ancestor(taxo,sp.taxid,name=TRUE,threshold=0.8)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{path}}, \code{\link{parent}},
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @docType methods
|
||||
#' @rdname lowest.common.ancestor-method
|
||||
#' @aliases lowest.common.ancestor,obitools.taxonomy
|
||||
#'
|
||||
setMethod("lowest.common.ancestor", "obitools.taxonomy",
|
||||
function(taxonomy,taxid,threshold=1.0,error=0,name=FALSE) {
|
||||
|
||||
if (threshold != 1.0)
|
||||
error=as.integer(floor(length(taxid) * (1-threshold)))
|
||||
|
||||
|
||||
#
|
||||
# Remove nod valid taxid
|
||||
#
|
||||
|
||||
taxid = validate(taxonomy,taxid)
|
||||
if (any(is.na(taxid)))
|
||||
return(NA)
|
||||
|
||||
ntaxid=length(taxid)
|
||||
nok = ntaxid - error
|
||||
if (ntaxid==1)
|
||||
return(taxid)
|
||||
|
||||
allpath = path(taxonomy,taxid)
|
||||
minlength= min(vapply(allpath,length,0))
|
||||
|
||||
lca=NA
|
||||
for (i in 1:minlength) {
|
||||
|
||||
n = vapply(allpath,function(x) x[i],0)
|
||||
nt = table(n)
|
||||
mt = max(nt)
|
||||
if (mt >= nok) {
|
||||
p = nt[nt==mt]
|
||||
if (length(p)==1)
|
||||
lca=as.integer(names(p)[1])
|
||||
else
|
||||
break
|
||||
}
|
||||
else
|
||||
break
|
||||
}
|
||||
|
||||
if (name)
|
||||
return(scientificname(taxonomy,lca))
|
||||
else
|
||||
return(lca)
|
||||
})
|
||||
|
||||
|
||||
@@ -0,0 +1,483 @@
|
||||
#' @include taxonomy.R
|
||||
NULL
|
||||
|
||||
#' @export
|
||||
setGeneric("rank.list", function(taxonomy) {
|
||||
return(standardGeneric("rank.list"))
|
||||
})
|
||||
|
||||
#' Returns the list of taxonomic ranks
|
||||
#'
|
||||
#' The \code{rank.list} function returns the list of all taxonomic
|
||||
#' ranks described in the taxonomy
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#'
|
||||
#' @return a vector of type \code{character} containing the taxonomic rank names
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # returns the taxonomic rank for all taxid between 1000 and 1020
|
||||
#' rank.list(taxo)
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname rank.list-methods
|
||||
#' @aliases rank.list-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("rank.list", "obitools.taxonomy",
|
||||
function(taxonomy) {
|
||||
return(.Call('R_rank_list',
|
||||
taxonomy,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
})
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("taxonomicrank", function(taxonomy,taxid) {
|
||||
return(standardGeneric("taxonomicrank"))
|
||||
})
|
||||
|
||||
#' Returns the taxonomic rank associated to a taxid
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#'
|
||||
#' @return a vector of type \code{character} containing the taxonomic ranks
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # returns the taxonomic rank for all taxid between 1000 and 1020
|
||||
#' taxonomicrank(taxo,1000:1020)
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname taxonomicrank-methods
|
||||
#' @aliases taxonomicrank-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("taxonomicrank", "obitools.taxonomy",function(taxonomy,taxid) {
|
||||
taxid = as.integer(taxid)
|
||||
return(.Call('R_get_rank',
|
||||
taxonomy,
|
||||
taxid,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
})
|
||||
|
||||
#' @export
|
||||
setGeneric("taxonatrank", function(taxonomy,taxid,rank,name=FALSE) {
|
||||
return(standardGeneric("taxonatrank"))
|
||||
})
|
||||
|
||||
#' Extracts the taxid at a specified taxonomic rank.
|
||||
#'
|
||||
#' The \code{taxonatrank} method of \code{\linkS4class{obitools.taxonomy}} class
|
||||
#' returns the \emph{taxid} or the scientific name corresponding
|
||||
#' to a \emph{taxid}.at a specified taxonomic rank
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#' @param rank a \code{character} indicating the desired rank
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{
|
||||
#' \item{If \code{name==FALSE}}{the taxid of the corresponding
|
||||
#' taxon as an integer or a vector of integers
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the corresponding
|
||||
#' taxon as a string or a vector of string
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the subfamily taxids
|
||||
#' taxonatrank(taxo,sp.taxid,"subfamily")
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' taxonatrank(taxo,sp.taxid,"subfamily",TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{species}},\code{\link{genus}},
|
||||
#' \code{\link{family}},\code{\link{kingdom}},
|
||||
#' \code{\link{superkingdom}}
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname taxonatrank-methods
|
||||
#' @aliases taxonatrank-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("taxonatrank", "obitools.taxonomy",function(taxonomy,taxid,rank,name=FALSE) {
|
||||
getsp = function(t) {
|
||||
if (is.na(t[1]) | is.na(t[2]))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_findtaxonatrank',taxonomy,
|
||||
as.integer(t[1]),
|
||||
t[2],
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
rank = as.character(rank)
|
||||
name = as.logical(name[1])
|
||||
|
||||
apply(data.frame(taxid,rank),1,getsp)
|
||||
})
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("species", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("species"))
|
||||
})
|
||||
|
||||
#' Extracts the species corresponding to a taxid
|
||||
#'
|
||||
#' The \code{species} method of \code{\linkS4class{obitools.taxonomy}} class
|
||||
#' returns the \emph{taxid} or the scientific name of the species corresponding
|
||||
#' to a \emph{taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{
|
||||
#' \item{If \code{name==FALSE}}{the taxid of the corresponding
|
||||
#' taxon as an integer or a vector of integers
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the corresponding
|
||||
#' taxon as a string or a vector of string
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the species taxids
|
||||
#' species(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' species(taxo,sp.taxid,TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{taxonatrank}},\code{\link{genus}},
|
||||
#' \code{\link{family}},\code{\link{kingdom}},
|
||||
#' \code{\link{superkingdom}}
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname species-methods
|
||||
#' @aliases species-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("species", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getsp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_get_species',
|
||||
taxonomy,
|
||||
t,
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
name = as.logical(name[1])
|
||||
sapply(taxid,getsp)
|
||||
})
|
||||
|
||||
#' @export
|
||||
setGeneric("genus", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("genus"))
|
||||
})
|
||||
|
||||
#' Extracts the genus corresponding to a taxid
|
||||
#'
|
||||
#' The \code{genus} method of \code{\linkS4class{obitools.taxonomy}} class
|
||||
#' returns the \emph{taxid} or the scientific name of the genus corresponding
|
||||
#' to a \emph{taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{
|
||||
#' \item{If \code{name==FALSE}}{the taxid of the corresponding
|
||||
#' taxon as an integer or a vector of integers
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the corresponding
|
||||
#' taxon as a string or a vector of string
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the genus taxids
|
||||
#' genus(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' genus(taxo,sp.taxid,TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{species}},\code{\link{taxonatrank}},
|
||||
#' \code{\link{family}},\code{\link{kingdom}},
|
||||
#' \code{\link{superkingdom}}
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname genus-methods
|
||||
#' @aliases genus-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("genus", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getsp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_get_genus',
|
||||
taxonomy,
|
||||
t,
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
name = as.logical(name[1])
|
||||
sapply(taxid,getsp)
|
||||
})
|
||||
|
||||
#' @export
|
||||
setGeneric("family", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("family"))
|
||||
})
|
||||
|
||||
#' Extracts the family corresponding to a taxid
|
||||
#'
|
||||
#' The \code{family} method of \code{\linkS4class{obitools.taxonomy}} class
|
||||
#' returns the \emph{taxid} or the scientific name of the family corresponding
|
||||
#' to a \emph{taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{
|
||||
#' \item{If \code{name==FALSE}}{the taxid of the corresponding
|
||||
#' taxon as an integer or a vector of integers
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the corresponding
|
||||
#' taxon as a string or a vector of string
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the family taxids
|
||||
#' family(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' family(taxo,sp.taxid,TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{species}},\code{\link{genus}},
|
||||
#' \code{\link{taxonatrank}},\code{\link{kingdom}},
|
||||
#' \code{\link{superkingdom}}
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname family-methods
|
||||
#' @aliases family-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("family", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getsp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_get_family',
|
||||
taxonomy,
|
||||
t,
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
name = as.logical(name[1])
|
||||
sapply(taxid,getsp)
|
||||
})
|
||||
|
||||
#' @export
|
||||
setGeneric("kingdom", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("kingdom"))
|
||||
})
|
||||
|
||||
#' Extracts the kingdom corresponding to a taxid
|
||||
#'
|
||||
#' The \code{kingdom} method of \code{\linkS4class{obitools.taxonomy}} class
|
||||
#' returns the \emph{taxid} or the scientific name of the kingdom corresponding
|
||||
#' to a \emph{taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{
|
||||
#' \item{If \code{name==FALSE}}{the taxid of the corresponding
|
||||
#' taxon as an integer or a vector of integers
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the corresponding
|
||||
#' taxon as a string or a vector of string
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the kingdom taxids
|
||||
#' kingdom(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' kingdom(taxo,sp.taxid,TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{species}},\code{\link{genus}},
|
||||
#' \code{\link{family}},\code{\link{taxonatrank}},
|
||||
#' \code{\link{superkingdom}}
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname kingdom-methods
|
||||
#' @aliases kingdom-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("kingdom", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getsp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_get_kingdom',
|
||||
taxonomy,
|
||||
t,
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
name = as.logical(name[1])
|
||||
sapply(taxid,getsp)
|
||||
})
|
||||
|
||||
#' @export
|
||||
setGeneric("superkingdom", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("superkingdom"))
|
||||
})
|
||||
|
||||
#' Extracts the superkingdom corresponding to a taxid
|
||||
#'
|
||||
#' The \code{superkingdom} method of \code{\linkS4class{obitools.taxonomy}} class
|
||||
#' returns the \emph{taxid} or the scientific name of the superkingdom corresponding
|
||||
#' to a \emph{taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{
|
||||
#' \item{If \code{name==FALSE}}{the taxid of the corresponding
|
||||
#' taxon as an integer or a vector of integers
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the corresponding
|
||||
#' taxon as a string or a vector of string
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the superkingdom taxids
|
||||
#' superkingdom(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' superkingdom(taxo,sp.taxid,TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{species}},\code{\link{genus}},
|
||||
#' \code{\link{family}},\code{\link{kingdom}},
|
||||
#' \code{\link{taxonatrank}}
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname superkingdom-methods
|
||||
#' @aliases superkingdom-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("superkingdom", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getsp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_get_superkingdom',
|
||||
taxonomy,
|
||||
t,
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
name = as.logical(name[1])
|
||||
sapply(taxid,getsp)
|
||||
})
|
||||
|
||||
|
||||
+216
@@ -0,0 +1,216 @@
|
||||
#' @include ROBITaxonomy.R
|
||||
#' @useDynLib ROBITaxonomy
|
||||
NULL
|
||||
|
||||
#' Gives access to a taxonomy preformated by OBITools
|
||||
#'
|
||||
#' A S4 class describing a taxonomy. It allows access to
|
||||
#' taxonomy formated for OBITools.
|
||||
#'
|
||||
#' @references \describe{
|
||||
#' \item{NCBI Taxonomy : }{\url{http://www.ncbi.nlm.nih.gov/taxonomy}}
|
||||
#' \item{OBITools : }{\url{http://metabarcoding/obitools/doc}}
|
||||
#' }
|
||||
#'
|
||||
#' @seealso \code{\link{read.taxonomy}}
|
||||
#'
|
||||
#' @name obitools.taxonomy
|
||||
#' @rdname obitools-taxonomy-class
|
||||
#' @keywords taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @exportClass obitools.taxonomy
|
||||
#'
|
||||
setClass("obitools.taxonomy",
|
||||
|
||||
|
||||
#
|
||||
# Attribute declaration
|
||||
#
|
||||
|
||||
# data.frame containing the counts of reads per samples
|
||||
# 1 samples per line
|
||||
# 1 sequence per column
|
||||
|
||||
representation(
|
||||
|
||||
# An external pointer structure to
|
||||
# the C taxonomy structure
|
||||
|
||||
pointer = "externalptr",
|
||||
|
||||
# the name of the database on the hard disk
|
||||
|
||||
dbname = 'character',
|
||||
|
||||
# the working directory when the taxonomy
|
||||
# object is created.
|
||||
# This inforation combined with bname allows
|
||||
# to reload taxonomy from disk
|
||||
|
||||
workingdir = 'character',
|
||||
|
||||
# Indicate if the taxonomy is saved in a file
|
||||
# Taxonomy created in R or modified in R are
|
||||
# not saved ==> This have to be take into
|
||||
# consideration but how ???
|
||||
saved = 'logical'
|
||||
),
|
||||
|
||||
#
|
||||
# Check object structure
|
||||
#
|
||||
|
||||
validity = function(object) {
|
||||
return(TRUE)
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
|
||||
#' obitools.taxonomy constructor
|
||||
#'
|
||||
#' --> this constructor have not to be called directly
|
||||
#' use the read.obitools.taxonomy function to
|
||||
#' create a new instance of taxonomy
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname initialize-methods-obitools.taxonomy
|
||||
#' @aliases initialize-methods,obitools.taxonomy
|
||||
setMethod("initialize",
|
||||
"obitools.taxonomy",
|
||||
function(.Object, pointer,dbname,workingdir,saved) {
|
||||
.Object@pointer <- pointer
|
||||
.Object@dbname <- dbname
|
||||
.Object@workingdir <- workingdir
|
||||
.Object@saved <- saved
|
||||
|
||||
validObject(.Object) ## valide l'objet
|
||||
return(.Object)
|
||||
})
|
||||
|
||||
|
||||
#' @exportClass obitools.taxonomyOrNULL
|
||||
setClassUnion("obitools.taxonomyOrNULL",c("obitools.taxonomy","NULL"))
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("path", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("path"))
|
||||
})
|
||||
|
||||
|
||||
|
||||
setMethod("path", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
{
|
||||
path=c()
|
||||
|
||||
t=.Call('R_validate_taxid',
|
||||
taxonomy,
|
||||
as.integer(t),
|
||||
PACKAGE="ROBITaxonomy")
|
||||
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
|
||||
repeat {
|
||||
if (name)
|
||||
path = c(scientificname(taxonomy,t),path)
|
||||
else
|
||||
path = c(t,path)
|
||||
|
||||
t = .Call('R_get_parent',
|
||||
taxonomy,
|
||||
t,
|
||||
FALSE,
|
||||
PACKAGE="ROBITaxonomy")
|
||||
if (is.na(t))
|
||||
break
|
||||
}
|
||||
|
||||
return(path)
|
||||
}
|
||||
}
|
||||
|
||||
taxid=as.integer(taxid)
|
||||
name=as.logical(name)
|
||||
|
||||
p = lapply(taxid,getp)
|
||||
d = dim(p)
|
||||
|
||||
if (!is.null(d))
|
||||
if (d[2]==1)
|
||||
p = as.vector(p)
|
||||
|
||||
return(p)
|
||||
})
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("is.subcladeof", function(taxonomy,taxid,parent) {
|
||||
return(standardGeneric("is.subcladeof"))
|
||||
})
|
||||
|
||||
setMethod("is.subcladeof", "obitools.taxonomy",function(taxonomy,taxid,parent) {
|
||||
taxid = as.integer(taxid)
|
||||
parent= as.integer(parent)
|
||||
return(.Call('R_is_under_taxon',
|
||||
taxonomy,
|
||||
taxid,
|
||||
parent,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
})
|
||||
|
||||
|
||||
|
||||
build.taxonomy = function(pointer,dbname,workingdir,saved) {
|
||||
rd <- new('obitools.taxonomy',
|
||||
pointer=pointer,
|
||||
dbname=dbname,
|
||||
workingdir=workingdir,
|
||||
saved=saved
|
||||
)
|
||||
return(rd)
|
||||
}
|
||||
|
||||
|
||||
#' Reads a taxonomy
|
||||
#'
|
||||
#' \code{read.taxonomy} reads a taxonomy formated by OBITools.
|
||||
#' NCBI taxonomy can be download from the NCBI FTP site in taxdump format.
|
||||
#' The taxdump must be formated using the obitaxonomy command from OBITools
|
||||
#' before being used in R. A OBITools formated taxonomy is composed of 3 files
|
||||
#' with the same prefix name and suffixes .tdx, .rdx, .ndx, two extra files
|
||||
#' suffixed .adx and .ldx can also be present.
|
||||
#'
|
||||
#' @param dbname A character string containing the file name of the database
|
||||
#'
|
||||
#' @return an instance of the class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @examples
|
||||
#'
|
||||
#' \dontshow{# switch the working directory to the data package directory}
|
||||
#' \dontshow{setwd(system.file("extdata", package="ROBITaxonomy"))}
|
||||
#'
|
||||
#' # read the taxonomy ncbi
|
||||
#' ncbi = read.taxonomy("ncbitaxo")
|
||||
#'
|
||||
#' # and use it for requesting a scientific name
|
||||
#' scientificname(ncbi,7742)
|
||||
#'
|
||||
#' @seealso \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @export
|
||||
|
||||
read.taxonomy = function(dbname) {
|
||||
t <- .Call('R_read_taxonomy',dbname,TRUE,PACKAGE="ROBITaxonomy")
|
||||
|
||||
return(build.taxonomy(t,dbname,getwd(),TRUE))
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
Version: 1.0
|
||||
|
||||
RestoreWorkspace: Default
|
||||
SaveWorkspace: Default
|
||||
AlwaysSaveHistory: Default
|
||||
|
||||
EnableCodeIndexing: Yes
|
||||
UseSpacesForTab: Yes
|
||||
NumSpacesForTab: 2
|
||||
Encoding: UTF-8
|
||||
|
||||
RnwWeave: knitr
|
||||
LaTeX: pdfLaTeX
|
||||
|
||||
AutoAppendNewline: Yes
|
||||
|
||||
BuildType: Package
|
||||
PackageUseDevtools: Yes
|
||||
PackageInstallArgs: --no-multiarch --with-keep.source
|
||||
PackageRoxygenize: rd,collate,namespace
|
||||
Vendored
BIN
Binary file not shown.
Vendored
BIN
Binary file not shown.
Vendored
BIN
Binary file not shown.
Vendored
BIN
Binary file not shown.
Vendored
BIN
Binary file not shown.
Executable
BIN
Binary file not shown.
@@ -0,0 +1,24 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/*
|
||||
* print the message given as argument and exit the program
|
||||
* @param error error number
|
||||
* @param message the text explaining what's going on
|
||||
* @param filename the file source where the program failed
|
||||
* @param linenumber the line where it has failed
|
||||
* filename and linenumber are written at pre-processing
|
||||
* time by a macro
|
||||
*/
|
||||
void ecoError(int32_t errorcode,
|
||||
const char* message,
|
||||
const char * filename,
|
||||
int linenumber)
|
||||
{
|
||||
error("Error %d in file %s line %d : %s",
|
||||
errorcode,
|
||||
filename,
|
||||
linenumber,
|
||||
message);
|
||||
}
|
||||
Binary file not shown.
@@ -0,0 +1,122 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define SWAPINT32(x) ((((x) << 24) & 0xFF000000) | (((x) << 8) & 0xFF0000) | \
|
||||
(((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF))
|
||||
|
||||
|
||||
int32_t is_big_endian()
|
||||
{
|
||||
int32_t i=1;
|
||||
|
||||
return (int32_t)((char*)&i)[0];
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
int32_t swap_int32_t(int32_t i)
|
||||
{
|
||||
return SWAPINT32(i);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Read part of the file
|
||||
* @param *f the database
|
||||
* @param recordSize the size to be read
|
||||
*
|
||||
* @return buffer
|
||||
*/
|
||||
void *read_ecorecord(FILE *f,int32_t *recordSize)
|
||||
{
|
||||
static void *buffer =NULL;
|
||||
int32_t buffersize=0;
|
||||
int32_t read;
|
||||
|
||||
if (!recordSize)
|
||||
ECOERROR(ECO_ASSERT_ERROR,
|
||||
"recordSize cannot be NULL");
|
||||
|
||||
read = fread(recordSize,
|
||||
1,
|
||||
sizeof(int32_t),
|
||||
f);
|
||||
|
||||
if (feof(f))
|
||||
return NULL;
|
||||
|
||||
if (read != sizeof(int32_t))
|
||||
ECOERROR(ECO_IO_ERROR,"Reading record size error");
|
||||
|
||||
if (is_big_endian())
|
||||
*recordSize=swap_int32_t(*recordSize);
|
||||
|
||||
if (buffersize < *recordSize)
|
||||
{
|
||||
if (buffer)
|
||||
buffer = ECOREALLOC(buffer,*recordSize,
|
||||
"Increase size of record buffer");
|
||||
else
|
||||
buffer = ECOMALLOC(*recordSize,
|
||||
"Allocate record buffer");
|
||||
}
|
||||
|
||||
read = fread(buffer,
|
||||
1,
|
||||
*recordSize,
|
||||
f);
|
||||
|
||||
if (read != *recordSize)
|
||||
ECOERROR(ECO_IO_ERROR,"Reading record data error");
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Open the database and check it's readable
|
||||
* @param filename name of the database (.sdx, .rdx, .tbx)
|
||||
* @param sequencecount buffer - pointer to variable storing the number of occurence
|
||||
* @param abort_on_open_error boolean to define the behaviour in case of error
|
||||
* while opening the database
|
||||
* @return FILE type
|
||||
**/
|
||||
FILE *open_ecorecorddb(const char *filename,
|
||||
int32_t *sequencecount,
|
||||
int32_t abort_on_open_error)
|
||||
{
|
||||
FILE *f;
|
||||
int32_t read;
|
||||
|
||||
f = fopen(filename,"rb");
|
||||
|
||||
if (!f)
|
||||
{
|
||||
if (abort_on_open_error)
|
||||
ECOERROR(ECO_IO_ERROR,"Cannot open file");
|
||||
else
|
||||
{
|
||||
*sequencecount=0;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
read = fread(sequencecount,
|
||||
1,
|
||||
sizeof(int32_t),
|
||||
f);
|
||||
|
||||
if (read != sizeof(int32_t))
|
||||
ECOERROR(ECO_IO_ERROR,"Reading record size error");
|
||||
|
||||
if (is_big_endian())
|
||||
*sequencecount=swap_int32_t(*sequencecount);
|
||||
|
||||
return f;
|
||||
}
|
||||
|
||||
Binary file not shown.
@@ -0,0 +1,76 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
static int eco_log_malloc = 0;
|
||||
|
||||
void eco_trace_memory_allocation()
|
||||
{
|
||||
eco_log_malloc=1;
|
||||
}
|
||||
|
||||
void eco_untrace_memory_allocation()
|
||||
{
|
||||
eco_log_malloc=0;
|
||||
}
|
||||
|
||||
|
||||
void *eco_malloc(int32_t chunksize,
|
||||
const char *error_message,
|
||||
const char *filename,
|
||||
int32_t line)
|
||||
{
|
||||
void * chunk;
|
||||
|
||||
chunk = calloc(1,chunksize);
|
||||
|
||||
if (!chunk)
|
||||
ecoError(ECO_MEM_ERROR,error_message,filename,line);
|
||||
|
||||
if (eco_log_malloc)
|
||||
REprintf("Memory segment located at %p of size %d is allocated (file : %s [%d])",
|
||||
chunk,
|
||||
chunksize,
|
||||
filename,
|
||||
line);
|
||||
|
||||
return chunk;
|
||||
}
|
||||
|
||||
void *eco_realloc(void *chunk,
|
||||
int32_t newsize,
|
||||
const char *error_message,
|
||||
const char *filename,
|
||||
int32_t line)
|
||||
{
|
||||
void *newchunk;
|
||||
|
||||
newchunk = realloc(chunk,newsize);
|
||||
|
||||
if (!newchunk)
|
||||
ecoError(ECO_MEM_ERROR,error_message,filename,line);
|
||||
|
||||
if (eco_log_malloc)
|
||||
REprintf("Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
|
||||
chunk,
|
||||
newchunk,
|
||||
newsize,
|
||||
filename,
|
||||
line);
|
||||
|
||||
return newchunk;
|
||||
}
|
||||
|
||||
void eco_free(void *chunk,
|
||||
const char *error_message,
|
||||
const char *filename,
|
||||
int32_t line)
|
||||
{
|
||||
free(chunk);
|
||||
|
||||
if (eco_log_malloc)
|
||||
REprintf("Memory segment %p is released => %s (file : %s [%d])",
|
||||
chunk,
|
||||
error_message,
|
||||
filename,
|
||||
line);
|
||||
}
|
||||
Binary file not shown.
+285
@@ -0,0 +1,285 @@
|
||||
#ifndef ECOPCR_H_
|
||||
#define ECOPCR_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#include <R.h>
|
||||
#include <Rinternals.h>
|
||||
#include <Rdefines.h>
|
||||
|
||||
|
||||
//#ifndef H_apat
|
||||
//#include "../libapat/apat.h"
|
||||
//#endif
|
||||
|
||||
/*****************************************************
|
||||
*
|
||||
* Data type declarations
|
||||
*
|
||||
*****************************************************/
|
||||
|
||||
/*
|
||||
*
|
||||
* Sequence types
|
||||
*
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
|
||||
int32_t taxid;
|
||||
char AC[20];
|
||||
int32_t DE_length;
|
||||
int32_t SQ_length;
|
||||
int32_t CSQ_length;
|
||||
|
||||
char data[1];
|
||||
|
||||
} ecoseqformat_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t taxid;
|
||||
int32_t SQ_length;
|
||||
char *AC;
|
||||
char *DE;
|
||||
char *SQ;
|
||||
} ecoseq_t;
|
||||
|
||||
/*
|
||||
*
|
||||
* Taxonomy taxon types
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
typedef struct {
|
||||
int32_t taxid;
|
||||
int32_t rank;
|
||||
int32_t parent;
|
||||
int32_t namelength;
|
||||
char name[1];
|
||||
|
||||
} ecotxformat_t;
|
||||
|
||||
typedef struct ecotxnode {
|
||||
int32_t taxid;
|
||||
int32_t rank;
|
||||
int32_t farest;
|
||||
struct ecotxnode *parent;
|
||||
char *name;
|
||||
} ecotx_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t count;
|
||||
int32_t maxtaxid;
|
||||
int32_t buffersize;
|
||||
ecotx_t taxon[1];
|
||||
} ecotxidx_t;
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
* Taxonomy rank types
|
||||
*
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
int32_t count;
|
||||
char* label[1];
|
||||
} ecorankidx_t;
|
||||
|
||||
/*
|
||||
*
|
||||
* Taxonomy name types
|
||||
*
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
int32_t is_scientificname;
|
||||
int32_t namelength;
|
||||
int32_t classlength;
|
||||
int32_t taxid;
|
||||
char names[1];
|
||||
} econameformat_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
char *name;
|
||||
char *classname;
|
||||
int32_t is_scientificname;
|
||||
struct ecotxnode *taxon;
|
||||
} econame_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
int32_t count;
|
||||
econame_t names[1];
|
||||
} econameidx_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
ecorankidx_t *ranks;
|
||||
econameidx_t *names;
|
||||
ecotxidx_t *taxons;
|
||||
} ecotaxonomy_t;
|
||||
|
||||
|
||||
/*****************************************************
|
||||
*
|
||||
* Function declarations
|
||||
*
|
||||
*****************************************************/
|
||||
|
||||
/*
|
||||
*
|
||||
* Low level system functions
|
||||
*
|
||||
*/
|
||||
|
||||
int32_t is_big_endian();
|
||||
int32_t swap_int32_t(int32_t);
|
||||
|
||||
void *eco_malloc(int32_t chunksize,
|
||||
const char *error_message,
|
||||
const char *filename,
|
||||
int32_t line);
|
||||
|
||||
|
||||
void *eco_realloc(void *chunk,
|
||||
int32_t chunksize,
|
||||
const char *error_message,
|
||||
const char *filename,
|
||||
int32_t line);
|
||||
|
||||
void eco_free(void *chunk,
|
||||
const char *error_message,
|
||||
const char *filename,
|
||||
int32_t line);
|
||||
|
||||
void eco_trace_memory_allocation();
|
||||
void eco_untrace_memory_allocation();
|
||||
|
||||
#define ECOMALLOC(size,error_message) \
|
||||
eco_malloc((size),(error_message),__FILE__,__LINE__)
|
||||
|
||||
#define ECOREALLOC(chunk,size,error_message) \
|
||||
eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__)
|
||||
|
||||
#define ECOFREE(chunk,error_message) \
|
||||
eco_free((chunk),(error_message),__FILE__,__LINE__)
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
* Error managment
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
void ecoError(int32_t,const char*,const char *,int);
|
||||
|
||||
#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__)
|
||||
|
||||
#define ECO_IO_ERROR (1)
|
||||
#define ECO_MEM_ERROR (2)
|
||||
#define ECO_ASSERT_ERROR (3)
|
||||
#define ECO_NOTFOUND_ERROR (4)
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
* Low level Disk access functions
|
||||
*
|
||||
*/
|
||||
|
||||
FILE *open_ecorecorddb(const char *filename,
|
||||
int32_t *sequencecount,
|
||||
int32_t abort_on_open_error);
|
||||
|
||||
void *read_ecorecord(FILE *,int32_t *recordSize);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Read function in internal binary format
|
||||
*/
|
||||
|
||||
FILE *open_ecoseqdb(const char *filename,
|
||||
int32_t *sequencecount);
|
||||
|
||||
ecoseq_t *readnext_ecoseq(FILE *);
|
||||
|
||||
ecorankidx_t *read_rankidx(const char *filename);
|
||||
|
||||
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Read taxonomy data as formated by the ecoPCRFormat.py script.
|
||||
*
|
||||
* This function is normaly uses internaly by the read_taxonomy
|
||||
* function and should not be called directly.
|
||||
*
|
||||
* @arg filename path to the *.tdx file of the reformated db
|
||||
*
|
||||
* @return pointer to a taxonomy index structure
|
||||
*/
|
||||
|
||||
ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2);
|
||||
|
||||
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName);
|
||||
|
||||
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, int32_t rankidx);
|
||||
|
||||
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid);
|
||||
|
||||
int eco_isundertaxon(ecotx_t *taxon, int other_taxid);
|
||||
|
||||
ecoseq_t *ecoseq_iterator(const char *prefix);
|
||||
|
||||
|
||||
|
||||
ecoseq_t *new_ecoseq();
|
||||
int32_t delete_ecoseq(ecoseq_t *);
|
||||
ecoseq_t *new_ecoseq_with_data( char *AC,
|
||||
char *DE,
|
||||
char *SQ,
|
||||
int32_t taxid
|
||||
);
|
||||
|
||||
|
||||
int32_t delete_taxon(ecotx_t *taxon);
|
||||
int32_t delete_rankidx(ecorankidx_t *rankidx);
|
||||
int32_t delete_nameidx(econameidx_t *nameidx);
|
||||
int32_t delete_taxonomy(ecotxidx_t *index);
|
||||
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy);
|
||||
|
||||
|
||||
int32_t rank_index(const char* label,ecorankidx_t* ranks);
|
||||
|
||||
//int32_t delete_apatseq(SeqPtr pseq);
|
||||
//PatternPtr buildPattern(const char *pat, int32_t error_max);
|
||||
//PatternPtr complementPattern(PatternPtr pat);
|
||||
//
|
||||
//SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular);
|
||||
|
||||
//char *ecoComplementPattern(char *nucAcSeq);
|
||||
//char *ecoComplementSequence(char *nucAcSeq);
|
||||
//char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end);
|
||||
|
||||
ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
|
||||
//int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid);
|
||||
//int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid);
|
||||
|
||||
|
||||
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
|
||||
|
||||
#endif /*ECOPCR_H_*/
|
||||
+156
@@ -0,0 +1,156 @@
|
||||
#include <string.h>
|
||||
#include "ecoPCR.h"
|
||||
|
||||
/*
|
||||
* @doc: DNA alphabet (IUPAC)
|
||||
*/
|
||||
#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
|
||||
|
||||
/*
|
||||
* @doc: complementary DNA alphabet (IUPAC)
|
||||
*/
|
||||
#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!]["
|
||||
|
||||
|
||||
static char sNuc[] = LX_BIO_DNA_ALPHA;
|
||||
static char sAnuc[] = LX_BIO_CDNA_ALPHA;
|
||||
|
||||
static char LXBioBaseComplement(char nucAc);
|
||||
static char *LXBioSeqComplement(char *nucAcSeq);
|
||||
static char *reverseSequence(char *str,char isPattern);
|
||||
|
||||
|
||||
/* ---------------------------- */
|
||||
|
||||
char LXBioBaseComplement(char nucAc)
|
||||
{
|
||||
char *c;
|
||||
|
||||
if ((c = strchr(sNuc, nucAc)))
|
||||
return sAnuc[(c - sNuc)];
|
||||
else
|
||||
return nucAc;
|
||||
}
|
||||
|
||||
/* ---------------------------- */
|
||||
|
||||
char *LXBioSeqComplement(char *nucAcSeq)
|
||||
{
|
||||
char *s;
|
||||
|
||||
for (s = nucAcSeq ; *s ; s++)
|
||||
*s = LXBioBaseComplement(*s);
|
||||
|
||||
return nucAcSeq;
|
||||
}
|
||||
|
||||
|
||||
char *reverseSequence(char *str,char isPattern)
|
||||
{
|
||||
char *sb, *se, c;
|
||||
|
||||
if (! str)
|
||||
return str;
|
||||
|
||||
sb = str;
|
||||
se = str + strlen(str) - 1;
|
||||
|
||||
while(sb <= se) {
|
||||
c = *sb;
|
||||
*sb++ = *se;
|
||||
*se-- = c;
|
||||
}
|
||||
|
||||
sb = str;
|
||||
se = str + strlen(str) - 1;
|
||||
|
||||
if (isPattern)
|
||||
for (;sb < se; sb++)
|
||||
{
|
||||
if (*sb=='#')
|
||||
{
|
||||
if (((se - sb) > 2) && (*(sb+2)=='!'))
|
||||
{
|
||||
*sb='!';
|
||||
sb+=2;
|
||||
*sb='#';
|
||||
}
|
||||
else
|
||||
{
|
||||
*sb=*(sb+1);
|
||||
sb++;
|
||||
*sb='#';
|
||||
}
|
||||
}
|
||||
else if (*sb=='!')
|
||||
{
|
||||
*sb=*(sb-1);
|
||||
*(sb-1)='!';
|
||||
}
|
||||
}
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
char *ecoComplementPattern(char *nucAcSeq)
|
||||
{
|
||||
return reverseSequence(LXBioSeqComplement(nucAcSeq),1);
|
||||
}
|
||||
|
||||
char *ecoComplementSequence(char *nucAcSeq)
|
||||
{
|
||||
return reverseSequence(LXBioSeqComplement(nucAcSeq),0);
|
||||
}
|
||||
|
||||
|
||||
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end)
|
||||
/*
|
||||
extract subsequence from nucAcSeq [begin,end[
|
||||
*/
|
||||
{
|
||||
static char *buffer = NULL;
|
||||
static int32_t buffSize= 0;
|
||||
int32_t length;
|
||||
|
||||
if (begin < end)
|
||||
{
|
||||
length = end - begin;
|
||||
|
||||
if (length >= buffSize)
|
||||
{
|
||||
buffSize = length+1;
|
||||
if (buffer)
|
||||
buffer=ECOREALLOC(buffer,buffSize,
|
||||
"Error in reallocating sub sequence buffer");
|
||||
else
|
||||
buffer=ECOMALLOC(buffSize,
|
||||
"Error in allocating sub sequence buffer");
|
||||
|
||||
}
|
||||
|
||||
strncpy(buffer,nucAcSeq + begin,length);
|
||||
buffer[length]=0;
|
||||
}
|
||||
else
|
||||
{
|
||||
length = end + strlen(nucAcSeq) - begin;
|
||||
|
||||
if (length >= buffSize)
|
||||
{
|
||||
buffSize = length+1;
|
||||
if (buffer)
|
||||
buffer=ECOREALLOC(buffer,buffSize,
|
||||
"Error in reallocating sub sequence buffer");
|
||||
else
|
||||
buffer=ECOMALLOC(buffSize,
|
||||
"Error in allocating sub sequence buffer");
|
||||
|
||||
}
|
||||
strncpy(buffer,nucAcSeq+begin,length - end);
|
||||
strncpy(buffer+(length-end),nucAcSeq ,end);
|
||||
buffer[length]=0;
|
||||
}
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
Binary file not shown.
@@ -0,0 +1,20 @@
|
||||
#include "ecoPCR.h"
|
||||
|
||||
int eco_is_taxid_included( ecotaxonomy_t *taxonomy,
|
||||
int32_t *restricted_taxid,
|
||||
int32_t tab_len,
|
||||
int32_t taxid)
|
||||
{
|
||||
int i;
|
||||
ecotx_t *taxon;
|
||||
|
||||
taxon = eco_findtaxonbytaxid(taxonomy, taxid);
|
||||
|
||||
if (taxon)
|
||||
for (i=0; i < tab_len; i++)
|
||||
if ( (taxon->taxid == restricted_taxid[i]) ||
|
||||
(eco_isundertaxon(taxon, restricted_taxid[i])) )
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
Binary file not shown.
@@ -0,0 +1,86 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy);
|
||||
|
||||
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
|
||||
int32_t count;
|
||||
FILE *f;
|
||||
econameidx_t *indexname;
|
||||
int32_t i;
|
||||
|
||||
f = open_ecorecorddb(filename,&count,0);
|
||||
|
||||
if (f==NULL)
|
||||
return NULL;
|
||||
|
||||
indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names");
|
||||
|
||||
indexname->count=count;
|
||||
|
||||
for (i=0; i < count; i++){
|
||||
readnext_econame(f,(indexname->names)+i,taxonomy);
|
||||
}
|
||||
|
||||
return indexname;
|
||||
}
|
||||
|
||||
int32_t delete_nameidx(econameidx_t *nameidx)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
if (nameidx) {
|
||||
for (i=0; i < nameidx->count; i++) {
|
||||
if (nameidx->names[i].name)
|
||||
ECOFREE(nameidx->names[i].name,
|
||||
"Desallocate name");
|
||||
if (nameidx->names[i].classname)
|
||||
ECOFREE(nameidx->names[i].classname,
|
||||
"Desallocate classname");
|
||||
}
|
||||
|
||||
ECOFREE(nameidx,"Desallocate name index");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
|
||||
econameformat_t *raw;
|
||||
int32_t rs;
|
||||
|
||||
raw = read_ecorecord(f,&rs);
|
||||
|
||||
if (!raw)
|
||||
return NULL;
|
||||
|
||||
if (is_big_endian())
|
||||
{
|
||||
raw->is_scientificname = swap_int32_t(raw->is_scientificname);
|
||||
raw->namelength = swap_int32_t(raw->namelength);
|
||||
raw->classlength = swap_int32_t(raw->classlength);
|
||||
raw->taxid = swap_int32_t(raw->taxid);
|
||||
}
|
||||
|
||||
name->is_scientificname=raw->is_scientificname;
|
||||
|
||||
name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name");
|
||||
strncpy(name->name,raw->names,raw->namelength);
|
||||
name->name[raw->namelength]=0;
|
||||
|
||||
name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname");
|
||||
strncpy(name->classname,(raw->names+raw->namelength),raw->classlength);
|
||||
name->classname[raw->classlength]=0;
|
||||
|
||||
name->taxon = taxonomy->taxons->taxon + raw->taxid;
|
||||
|
||||
return name;
|
||||
}
|
||||
|
||||
Binary file not shown.
@@ -0,0 +1,75 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static int compareRankLabel(const void *label1, const void *label2);
|
||||
|
||||
ecorankidx_t *read_rankidx(const char *filename)
|
||||
{
|
||||
int32_t count;
|
||||
FILE *f;
|
||||
ecorankidx_t *index;
|
||||
int32_t i;
|
||||
int32_t rs;
|
||||
char *buffer;
|
||||
|
||||
f = open_ecorecorddb(filename,&count,0);
|
||||
|
||||
if (f==NULL)
|
||||
return NULL;
|
||||
|
||||
index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1),
|
||||
"Allocate rank index");
|
||||
|
||||
index->count=count;
|
||||
|
||||
for (i=0; i < count; i++)
|
||||
{
|
||||
buffer = read_ecorecord(f,&rs);
|
||||
index->label[i]=(char*) ECOMALLOC(rs+1,
|
||||
"Allocate rank label");
|
||||
strncpy(index->label[i],buffer,rs);
|
||||
}
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
int32_t delete_rankidx(ecorankidx_t *rankidx)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
if (rankidx) {
|
||||
for (i=0; i < rankidx->count; i++)
|
||||
if (rankidx->label[i])
|
||||
ECOFREE(rankidx->label[i],
|
||||
"Desallocate rank label");
|
||||
|
||||
ECOFREE(rankidx,"Desallocate rank index");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int32_t rank_index(const char* label,ecorankidx_t* ranks)
|
||||
{
|
||||
char **rep;
|
||||
|
||||
rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel);
|
||||
|
||||
if (rep)
|
||||
return rep-ranks->label;
|
||||
// else
|
||||
// ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found");
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
int compareRankLabel(const void *label1, const void *label2)
|
||||
{
|
||||
return strcmp((const char*)label1,*(const char**)label2);
|
||||
}
|
||||
|
||||
|
||||
Binary file not shown.
+231
@@ -0,0 +1,231 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <zlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
|
||||
static FILE *open_seqfile(const char *prefix,int32_t index);
|
||||
|
||||
|
||||
ecoseq_t *new_ecoseq()
|
||||
{
|
||||
void *tmp;
|
||||
|
||||
tmp = ECOMALLOC(sizeof(ecoseq_t),"Allocate new ecoseq structure");
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
int32_t delete_ecoseq(ecoseq_t * seq)
|
||||
{
|
||||
|
||||
if (seq)
|
||||
{
|
||||
if (seq->AC)
|
||||
ECOFREE(seq->AC,"Free sequence AC");
|
||||
|
||||
if (seq->DE)
|
||||
ECOFREE(seq->DE,"Free sequence DE");
|
||||
|
||||
if (seq->SQ)
|
||||
ECOFREE(seq->SQ,"Free sequence SQ");
|
||||
|
||||
ECOFREE(seq,"Free sequence structure");
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
ecoseq_t *new_ecoseq_with_data( char *AC,
|
||||
char *DE,
|
||||
char *SQ,
|
||||
int32_t taxid_idx
|
||||
)
|
||||
{
|
||||
ecoseq_t *tmp;
|
||||
int32_t lstr;
|
||||
tmp = new_ecoseq();
|
||||
|
||||
tmp->taxid=taxid_idx;
|
||||
|
||||
if (AC)
|
||||
{
|
||||
lstr =strlen(AC);
|
||||
tmp->AC=ECOMALLOC((lstr+1) * sizeof(char),
|
||||
"Allocate sequence accession");
|
||||
strcpy(tmp->AC,AC);
|
||||
}
|
||||
|
||||
if (DE)
|
||||
{
|
||||
lstr =strlen(DE);
|
||||
tmp->DE=ECOMALLOC((lstr+1) * sizeof(char),
|
||||
"Allocate sequence definition");
|
||||
strcpy(tmp->DE,DE);
|
||||
}
|
||||
|
||||
if (SQ)
|
||||
{
|
||||
lstr =strlen(SQ);
|
||||
tmp->SQ=ECOMALLOC((lstr+1) * sizeof(char),
|
||||
"Allocate sequence data");
|
||||
strcpy(tmp->SQ,SQ);
|
||||
}
|
||||
return tmp;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* ?? used ??
|
||||
**/
|
||||
FILE *open_ecoseqdb(const char *filename,
|
||||
int32_t *sequencecount)
|
||||
{
|
||||
return open_ecorecorddb(filename,sequencecount,1);
|
||||
}
|
||||
|
||||
ecoseq_t *readnext_ecoseq(FILE *f)
|
||||
{
|
||||
char *compressed=NULL;
|
||||
|
||||
ecoseqformat_t *raw;
|
||||
ecoseq_t *seq;
|
||||
int32_t comp_status;
|
||||
unsigned long int seqlength;
|
||||
int32_t rs;
|
||||
char *c;
|
||||
int32_t i;
|
||||
|
||||
raw = read_ecorecord(f,&rs);
|
||||
|
||||
if (!raw)
|
||||
return NULL;
|
||||
|
||||
if (is_big_endian())
|
||||
{
|
||||
raw->CSQ_length = swap_int32_t(raw->CSQ_length);
|
||||
raw->DE_length = swap_int32_t(raw->DE_length);
|
||||
raw->SQ_length = swap_int32_t(raw->SQ_length);
|
||||
raw->taxid = swap_int32_t(raw->taxid);
|
||||
}
|
||||
|
||||
seq = new_ecoseq();
|
||||
|
||||
seq->taxid = raw->taxid;
|
||||
|
||||
seq->AC = ECOMALLOC(strlen(raw->AC) +1,
|
||||
"Allocate Sequence Accesion number");
|
||||
strncpy(seq->AC,raw->AC,strlen(raw->AC));
|
||||
|
||||
|
||||
seq->DE = ECOMALLOC(raw->DE_length+1,
|
||||
"Allocate Sequence definition");
|
||||
strncpy(seq->DE,raw->data,raw->DE_length);
|
||||
|
||||
seqlength = seq->SQ_length = raw->SQ_length;
|
||||
|
||||
compressed = raw->data + raw->DE_length;
|
||||
|
||||
seq->SQ = ECOMALLOC(seqlength+1,
|
||||
"Allocate sequence buffer");
|
||||
/*
|
||||
comp_status = uncompress((unsigned char*)seq->SQ,
|
||||
&seqlength,
|
||||
(unsigned char*)compressed,
|
||||
raw->CSQ_length);
|
||||
*/
|
||||
|
||||
if (comp_status != Z_OK)
|
||||
ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data");
|
||||
|
||||
for (c=seq->SQ,i=0;i<seqlength;c++,i++)
|
||||
*c=toupper(*c);
|
||||
|
||||
|
||||
return seq;
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the sequences database (.sdx file)
|
||||
* @param prefix name of the database (radical without extension)
|
||||
* @param index integer
|
||||
*
|
||||
* @return file object
|
||||
*/
|
||||
FILE *open_seqfile(const char *prefix,int32_t index)
|
||||
{
|
||||
char filename_buffer[1024];
|
||||
int32_t filename_length;
|
||||
FILE *input;
|
||||
int32_t seqcount;
|
||||
|
||||
filename_length = snprintf(filename_buffer,
|
||||
1023,
|
||||
"%s_%03d.sdx",
|
||||
prefix,
|
||||
index);
|
||||
|
||||
// fprintf(stderr,"# Coucou %s\n",filename_buffer);
|
||||
|
||||
|
||||
if (filename_length >= 1024)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"file name is too long");
|
||||
|
||||
filename_buffer[filename_length]=0;
|
||||
|
||||
input=open_ecorecorddb(filename_buffer,&seqcount,0);
|
||||
|
||||
if (input)
|
||||
REprintf("# Reading file %s containing %d sequences...\n",
|
||||
filename_buffer,
|
||||
seqcount);
|
||||
|
||||
return input;
|
||||
}
|
||||
|
||||
ecoseq_t *ecoseq_iterator(const char *prefix)
|
||||
{
|
||||
static FILE *current_seq_file= NULL;
|
||||
static int32_t current_file_idx = 1;
|
||||
static char current_prefix[1024];
|
||||
ecoseq_t *seq;
|
||||
|
||||
if (prefix)
|
||||
{
|
||||
current_file_idx = 1;
|
||||
|
||||
if (current_seq_file)
|
||||
fclose(current_seq_file);
|
||||
|
||||
strncpy(current_prefix,prefix,1023);
|
||||
current_prefix[1023]=0;
|
||||
|
||||
current_seq_file = open_seqfile(current_prefix,
|
||||
current_file_idx);
|
||||
|
||||
if (!current_seq_file)
|
||||
return NULL;
|
||||
|
||||
}
|
||||
|
||||
seq = readnext_ecoseq(current_seq_file);
|
||||
|
||||
if (!seq && feof(current_seq_file))
|
||||
{
|
||||
current_file_idx++;
|
||||
fclose(current_seq_file);
|
||||
current_seq_file = open_seqfile(current_prefix,
|
||||
current_file_idx);
|
||||
|
||||
|
||||
if (current_seq_file)
|
||||
seq = readnext_ecoseq(current_seq_file);
|
||||
}
|
||||
|
||||
return seq;
|
||||
}
|
||||
Binary file not shown.
+420
@@ -0,0 +1,420 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <R.h>
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x,y) (((x)>(y)) ? (x):(y))
|
||||
#endif
|
||||
|
||||
static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon);
|
||||
|
||||
/**
|
||||
* Open the taxonomy database
|
||||
* @param pointer to the database (.tdx file)
|
||||
* @return a ecotxidx_t structure
|
||||
*/
|
||||
ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2)
|
||||
{
|
||||
int32_t count;
|
||||
int32_t count2;
|
||||
FILE *f;
|
||||
FILE *f2;
|
||||
ecotxidx_t *index;
|
||||
struct ecotxnode *t;
|
||||
int32_t i;
|
||||
int32_t j;
|
||||
|
||||
f = open_ecorecorddb(filename,&count,0);
|
||||
|
||||
if (f==NULL) return NULL;
|
||||
|
||||
f2 = open_ecorecorddb(filename2,&count2,0);
|
||||
|
||||
index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count+count2-1),
|
||||
"Allocate taxonomy");
|
||||
|
||||
index->count=count+count2;
|
||||
index->buffersize = index->count;
|
||||
|
||||
index->maxtaxid=0;
|
||||
REprintf("Readind %d taxa...\n",count);
|
||||
for (i=0; i < count; i++){
|
||||
readnext_ecotaxon(f,&(index->taxon[i]));
|
||||
index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
|
||||
index->taxon[i].parent->farest=0;
|
||||
if (index->taxon[i].taxid > index->maxtaxid)
|
||||
index->maxtaxid=index->taxon[i].taxid;
|
||||
}
|
||||
|
||||
|
||||
if (count2>0)
|
||||
REprintf("Readind %d local taxa...\n",count2);
|
||||
else
|
||||
REprintf("No local taxon\n");
|
||||
|
||||
count = index->count;
|
||||
|
||||
for (; i < count; i++){
|
||||
readnext_ecotaxon(f2,&(index->taxon[i]));
|
||||
index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
|
||||
index->taxon[i].parent->farest=0;
|
||||
if (index->taxon[i].taxid > index->maxtaxid)
|
||||
index->maxtaxid=index->taxon[i].taxid;
|
||||
}
|
||||
|
||||
REprintf("Computing longest branches...\n",count);
|
||||
|
||||
for (i=0; i < count; i++){
|
||||
t=index->taxon+i;
|
||||
if (t->farest==-1)
|
||||
{
|
||||
t->farest=0;
|
||||
while(t->parent != t)
|
||||
{
|
||||
j = t->farest + 1;
|
||||
if (j > t->parent->farest)
|
||||
{
|
||||
t->parent->farest = j;
|
||||
t=t->parent;
|
||||
}
|
||||
else
|
||||
t=index->taxon;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
|
||||
int32_t delete_taxonomy(ecotxidx_t *index)
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
if (index)
|
||||
{
|
||||
for (i=0; i< index->count; i++)
|
||||
if (index->taxon[i].name)
|
||||
ECOFREE(index->taxon[i].name,"Free scientific name");
|
||||
|
||||
|
||||
ECOFREE(index,"Free Taxonomy");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int32_t delete_taxon(ecotx_t *taxon)
|
||||
{
|
||||
if (taxon)
|
||||
{
|
||||
if (taxon->name)
|
||||
ECOFREE(taxon->name,"Free scientific name");
|
||||
|
||||
ECOFREE(taxon,"Free Taxon");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Read the database for a given taxon a save the data
|
||||
* into the taxon structure(if any found)
|
||||
* @param *f pointer to FILE type returned by fopen
|
||||
* @param *taxon pointer to the structure
|
||||
*
|
||||
* @return a ecotx_t structure if any taxon found else NULL
|
||||
*/
|
||||
ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
|
||||
{
|
||||
|
||||
ecotxformat_t *raw;
|
||||
int32_t rs;
|
||||
|
||||
raw = read_ecorecord(f,&rs);
|
||||
|
||||
if (!raw)
|
||||
return NULL;
|
||||
|
||||
if (is_big_endian())
|
||||
{
|
||||
raw->namelength = swap_int32_t(raw->namelength);
|
||||
raw->parent = swap_int32_t(raw->parent);
|
||||
raw->rank = swap_int32_t(raw->rank);
|
||||
raw->taxid = swap_int32_t(raw->taxid);
|
||||
}
|
||||
|
||||
taxon->parent = (ecotx_t*)((size_t)raw->parent);
|
||||
taxon->taxid = raw->taxid;
|
||||
taxon->rank = raw->rank;
|
||||
taxon->farest = -1;
|
||||
|
||||
taxon->name = ECOMALLOC((raw->namelength+1) * sizeof(char),
|
||||
"Allocate taxon scientific name");
|
||||
|
||||
strncpy(taxon->name,raw->name,raw->namelength);
|
||||
|
||||
return taxon;
|
||||
}
|
||||
|
||||
|
||||
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName)
|
||||
{
|
||||
ecotaxonomy_t *tax;
|
||||
char *filename;
|
||||
char *filename2;
|
||||
int buffsize;
|
||||
|
||||
tax = ECOMALLOC(sizeof(ecotaxonomy_t),
|
||||
"Allocate taxonomy structure");
|
||||
|
||||
tax->ranks =NULL;
|
||||
tax->taxons=NULL;
|
||||
tax->names =NULL;
|
||||
|
||||
buffsize = strlen(prefix)+10;
|
||||
|
||||
filename = ECOMALLOC(buffsize,
|
||||
"Allocate filename");
|
||||
filename2= ECOMALLOC(buffsize,
|
||||
"Allocate filename");
|
||||
|
||||
snprintf(filename,buffsize,"%s.rdx",prefix);
|
||||
|
||||
tax->ranks = read_rankidx(filename);
|
||||
|
||||
if (tax->ranks == NULL)
|
||||
{
|
||||
ECOFREE(filename,"Desallocate filename 1");
|
||||
ECOFREE(filename2,"Desallocate filename 2");
|
||||
|
||||
delete_ecotaxonomy(tax);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
snprintf(filename,buffsize,"%s.tdx",prefix);
|
||||
snprintf(filename2,buffsize,"%s.ldx",prefix);
|
||||
|
||||
tax->taxons = read_taxonomyidx(filename,filename2);
|
||||
|
||||
if (tax->taxons == NULL)
|
||||
{
|
||||
ECOFREE(filename,"Desallocate filename 1");
|
||||
ECOFREE(filename,"Desallocate filename 2");
|
||||
|
||||
delete_ecotaxonomy(tax);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (readAlternativeName)
|
||||
{
|
||||
snprintf(filename,buffsize,"%s.ndx",prefix);
|
||||
tax->names=read_nameidx(filename,tax);
|
||||
}
|
||||
else
|
||||
tax->names=NULL;
|
||||
|
||||
ECOFREE(filename,"Desallocate filename 1");
|
||||
ECOFREE(filename2,"Desallocate filename 2");
|
||||
|
||||
return tax;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
if (taxonomy)
|
||||
{
|
||||
delete_rankidx(taxonomy->ranks);
|
||||
delete_nameidx(taxonomy->names);
|
||||
delete_taxonomy(taxonomy->taxons);
|
||||
|
||||
ECOFREE(taxonomy,"Free taxonomy structure");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon,
|
||||
int32_t rankidx)
|
||||
{
|
||||
ecotx_t *current_taxon;
|
||||
ecotx_t *next_taxon;
|
||||
|
||||
current_taxon = taxon;
|
||||
next_taxon = current_taxon->parent;
|
||||
|
||||
while ((current_taxon!=next_taxon) && // I' am the root node
|
||||
(current_taxon->rank!=rankidx))
|
||||
{
|
||||
current_taxon = next_taxon;
|
||||
next_taxon = current_taxon->parent;
|
||||
}
|
||||
|
||||
if (current_taxon->rank==rankidx)
|
||||
return current_taxon;
|
||||
else
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int bcomptaxon (const void * ptaxid, const void * ptaxon) {
|
||||
|
||||
ecotx_t *current_taxon = (ecotx_t*)ptaxon;
|
||||
int32_t taxid=(int32_t)((size_t)ptaxid);
|
||||
return taxid - current_taxon->taxid;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get back information concerning a taxon from a taxonomic id
|
||||
* @param *taxonomy the taxonomy database
|
||||
* @param taxid the taxonomic id
|
||||
*
|
||||
* @result a ecotx_t structure containing the taxonimic information
|
||||
**/
|
||||
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy,
|
||||
int32_t taxid)
|
||||
{
|
||||
ecotx_t *current_taxon;
|
||||
int32_t taxoncount;
|
||||
|
||||
taxoncount=taxonomy->taxons->count;
|
||||
|
||||
current_taxon = (ecotx_t*) bsearch((const void *)((size_t)taxid),
|
||||
(const void *)taxonomy->taxons->taxon,
|
||||
taxoncount,
|
||||
sizeof(ecotx_t),
|
||||
bcomptaxon);
|
||||
|
||||
return current_taxon;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find out if taxon is son of other taxon (identified by its taxid)
|
||||
* @param *taxon son taxon
|
||||
* @param parent_taxid taxonomic id of the other taxon
|
||||
*
|
||||
* @return 1 is the other taxid math a parent taxid, else 0
|
||||
**/
|
||||
int eco_isundertaxon(ecotx_t *taxon,
|
||||
int other_taxid)
|
||||
{
|
||||
ecotx_t *next_parent;
|
||||
|
||||
next_parent = taxon->parent;
|
||||
|
||||
while ( (other_taxid != next_parent->taxid) &&
|
||||
(strcmp(next_parent->name, "root")) )
|
||||
{
|
||||
next_parent = next_parent->parent;
|
||||
}
|
||||
|
||||
if (other_taxid == next_parent->taxid)
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
ecotx_t *eco_getspecies(ecotx_t *taxon,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
static ecotaxonomy_t *tax=NULL;
|
||||
static int32_t rankindex=-1;
|
||||
|
||||
if (taxonomy && tax!=taxonomy)
|
||||
{
|
||||
rankindex = rank_index("species",taxonomy->ranks);
|
||||
tax=taxonomy;
|
||||
}
|
||||
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
|
||||
ecotx_t *eco_getgenus(ecotx_t *taxon,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
static ecotaxonomy_t *tax=NULL;
|
||||
static int32_t rankindex=-1;
|
||||
|
||||
if (taxonomy && tax!=taxonomy)
|
||||
{
|
||||
rankindex = rank_index("genus",taxonomy->ranks);
|
||||
tax=taxonomy;
|
||||
}
|
||||
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
|
||||
|
||||
ecotx_t *eco_getfamily(ecotx_t *taxon,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
static ecotaxonomy_t *tax=NULL;
|
||||
static int32_t rankindex=-1;
|
||||
|
||||
if (taxonomy && tax!=taxonomy)
|
||||
{
|
||||
rankindex = rank_index("family",taxonomy->ranks);
|
||||
tax=taxonomy;
|
||||
}
|
||||
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
|
||||
ecotx_t *eco_getkingdom(ecotx_t *taxon,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
static ecotaxonomy_t *tax=NULL;
|
||||
static int32_t rankindex=-1;
|
||||
|
||||
if (taxonomy && tax!=taxonomy)
|
||||
{
|
||||
rankindex = rank_index("kingdom",taxonomy->ranks);
|
||||
tax=taxonomy;
|
||||
}
|
||||
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
|
||||
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
static ecotaxonomy_t *tax=NULL;
|
||||
static int32_t rankindex=-1;
|
||||
|
||||
if (taxonomy && tax!=taxonomy)
|
||||
{
|
||||
rankindex = rank_index("superkingdom",taxonomy->ranks);
|
||||
tax=taxonomy;
|
||||
}
|
||||
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
Binary file not shown.
+845
@@ -0,0 +1,845 @@
|
||||
/*
|
||||
* robitax.c
|
||||
*
|
||||
* Created on: 17 janv. 2013
|
||||
* Author: coissac
|
||||
*/
|
||||
|
||||
#include "robitax.h"
|
||||
#include <unistd.h>
|
||||
//#include <regex.h>
|
||||
#include "slre.h"
|
||||
|
||||
/**
|
||||
* Return a pointeur to an obitools taxonomy C structure
|
||||
* from an R instance of taxonomy.obitools
|
||||
*
|
||||
* The function checks if the pointer stored in the R object is set
|
||||
* to NULL. In this case this means that we have to load the taxonomy
|
||||
* from the disk.
|
||||
*
|
||||
* @param taxonomy an R object
|
||||
* @type taxonomy SEXP
|
||||
*
|
||||
* @return a pointer to the C structure
|
||||
* @rtype ecotaxonomy_t *
|
||||
*/
|
||||
|
||||
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy)
|
||||
{
|
||||
|
||||
|
||||
char *pwd;
|
||||
SEXP pointer;
|
||||
SEXP rclass;
|
||||
SEXP rdir;
|
||||
SEXP rfile;
|
||||
ecotaxonomy_t *ptax;
|
||||
const char *class;
|
||||
const char *file;
|
||||
const char *dir;
|
||||
|
||||
int saved;
|
||||
|
||||
if (!IS_S4_OBJECT(Rtaxonomy) )
|
||||
error("argument not obitools.taxonomy instance");
|
||||
|
||||
// We get the class name and compare it to "taxonomy.obitools"
|
||||
rclass = getAttrib(Rtaxonomy, R_ClassSymbol);
|
||||
class = CHAR(asChar(rclass));
|
||||
|
||||
if (strcmp(class,"obitools.taxonomy"))
|
||||
error("argument not obitools.taxonomy instance");
|
||||
|
||||
pointer = R_do_slot(Rtaxonomy,mkString("pointer"));
|
||||
saved = LOGICAL(R_do_slot(Rtaxonomy,mkString("saved")))[0];
|
||||
ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(pointer);
|
||||
|
||||
// If the external pointer is set to NULL we have to load
|
||||
// the taxonomy from file
|
||||
if (ptax==NULL && saved)
|
||||
{
|
||||
pwd = getcwd(NULL,0);
|
||||
|
||||
rfile = R_do_slot(Rtaxonomy,mkString("dbname"));
|
||||
file = CHAR(asChar(rfile));
|
||||
|
||||
rdir = R_do_slot(Rtaxonomy,mkString("workingdir"));
|
||||
dir = CHAR(asChar(rdir));
|
||||
|
||||
chdir(dir);
|
||||
|
||||
ptax = read_taxonomy(file,1);
|
||||
|
||||
R_SetExternalPtrAddr(pointer,(void*)ptax);
|
||||
|
||||
chdir(pwd);
|
||||
free(pwd);
|
||||
}
|
||||
|
||||
if (ptax==NULL && ! saved)
|
||||
error("The taxonomy instance is no more valid and must be rebuilt");
|
||||
|
||||
return ptax;
|
||||
}
|
||||
|
||||
SEXP R_delete_taxonomy(SEXP Rtaxonomy)
|
||||
{
|
||||
ecotaxonomy_t *ptax;
|
||||
// SEXP pointer;
|
||||
|
||||
ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(Rtaxonomy);
|
||||
|
||||
(void) delete_ecotaxonomy(ptax);
|
||||
|
||||
// Clear the external pointer
|
||||
R_ClearExternalPtr(Rtaxonomy);
|
||||
|
||||
return R_NilValue;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
SEXP R_read_taxonomy(SEXP filename, SEXP altenative)
|
||||
{
|
||||
int alt;
|
||||
const char* file;
|
||||
SEXP Rtax;
|
||||
|
||||
if (! isString(filename))
|
||||
error("filename not character");
|
||||
file = CHAR(STRING_ELT(filename, 0));
|
||||
|
||||
if (! isLogical(altenative))
|
||||
error("altenative not logical");
|
||||
alt = LOGICAL(altenative)[0];
|
||||
|
||||
ecotaxonomy_t *taxonomy = read_taxonomy(file,alt);
|
||||
|
||||
if (! taxonomy)
|
||||
error("Cannot open the taxonomy database");
|
||||
|
||||
Rtax = PROTECT(R_MakeExternalPtr(taxonomy, mkString("ROBITools NCBI Taxonomy pointer"), R_NilValue));
|
||||
R_RegisterCFinalizerEx(Rtax, (R_CFinalizer_t)R_delete_taxonomy,TRUE);
|
||||
|
||||
UNPROTECT(1);
|
||||
|
||||
|
||||
return Rtax;
|
||||
}
|
||||
|
||||
|
||||
SEXP R_get_scientific_name(SEXP Rtaxonomy,SEXP Rtaxid)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
return ScalarString(R_NaString);
|
||||
// error("unkown taxid");
|
||||
|
||||
return mkString(taxon->name);
|
||||
|
||||
}
|
||||
|
||||
SEXP R_get_rank(SEXP Rtaxonomy,SEXP Rtaxid)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotaxonomy_t *ptax;
|
||||
int *taxid;
|
||||
int ntaxid;
|
||||
int i;
|
||||
SEXP results;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
ntaxid = length(Rtaxid);
|
||||
|
||||
results = PROTECT(allocVector(STRSXP, ntaxid));
|
||||
|
||||
taxid = INTEGER(Rtaxid);
|
||||
|
||||
for (i=0; i < ntaxid; i++)
|
||||
{
|
||||
if (taxid[i]== NA_INTEGER || taxid[i] <= 0)
|
||||
SET_STRING_ELT(results, i, R_NaString);
|
||||
else {
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid[i]);
|
||||
if (!taxon)
|
||||
SET_STRING_ELT(results, i, R_NaString);
|
||||
else
|
||||
SET_STRING_ELT(results, i, mkChar(ptax->ranks->label[taxon->rank]));
|
||||
}
|
||||
}
|
||||
|
||||
UNPROTECT(1);
|
||||
|
||||
return results;
|
||||
|
||||
}
|
||||
|
||||
SEXP R_findtaxonatrank(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rrank, SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
const char *rank;
|
||||
int rankidx;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isString(Rrank))
|
||||
error("rank not a string");
|
||||
|
||||
rank=CHAR(STRING_ELT(Rrank,0));
|
||||
|
||||
rankidx=rank_index(rank,ptax->ranks);
|
||||
|
||||
if (rankidx < 0)
|
||||
error("unkown rank name");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
rep = eco_findtaxonatrank(taxon,rankidx);
|
||||
|
||||
if (!rep)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
|
||||
SEXP R_get_species(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
rep = eco_getspecies(taxon,ptax);
|
||||
|
||||
if (!rep)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
SEXP R_get_genus(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
rep = eco_getgenus(taxon,ptax);
|
||||
|
||||
if (!rep)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
SEXP R_get_family(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
rep = eco_getfamily(taxon,ptax);
|
||||
|
||||
if (!rep)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
SEXP R_get_kingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
rep = eco_getkingdom(taxon,ptax);
|
||||
|
||||
if (!rep)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
SEXP R_get_superkingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
rep = eco_getsuperkingdom(taxon,ptax);
|
||||
|
||||
if (!rep)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
SEXP R_get_parent(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
rep = taxon->parent;
|
||||
|
||||
if (rep->taxid==taxid)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
|
||||
SEXP R_validate_taxid(SEXP Rtaxonomy,SEXP Rtaxid)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
// int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
return ScalarInteger(R_NaInt);
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
return ScalarInteger(R_NaInt);
|
||||
else
|
||||
return ScalarInteger(taxon->taxid);
|
||||
}
|
||||
|
||||
|
||||
SEXP R_is_under_taxon(SEXP Rtaxonomy, SEXP Rtaxid, SEXP Rparent)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int *ptaxid;
|
||||
int ntaxid;
|
||||
int parent;
|
||||
SEXP rep;
|
||||
int* prep;
|
||||
size_t i;
|
||||
// SEXP isunder;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rparent))
|
||||
error("parent not integer");
|
||||
|
||||
if (GET_LENGTH(Rparent) > 1)
|
||||
error("parent must have a length equal to one.");
|
||||
|
||||
parent = *INTEGER(Rparent);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
ntaxid = GET_LENGTH(Rtaxid);
|
||||
rep = PROTECT(NEW_LOGICAL(ntaxid));
|
||||
prep=LOGICAL(rep);
|
||||
|
||||
ptaxid = INTEGER_POINTER(Rtaxid);
|
||||
|
||||
|
||||
if (parent > 0) {
|
||||
taxon = eco_findtaxonbytaxid(ptax, parent);
|
||||
if (taxon) {
|
||||
for (i=0; i < ntaxid; i++) {
|
||||
if (ptaxid[i] <= 0)
|
||||
prep[i]=NA_LOGICAL;
|
||||
else {
|
||||
taxon = eco_findtaxonbytaxid(ptax, ptaxid[i]);
|
||||
prep[i] = eco_isundertaxon(taxon, parent);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i=0; i < ntaxid; i++)
|
||||
prep[i]=NA_LOGICAL;
|
||||
}
|
||||
|
||||
unprotect(1);
|
||||
return rep;
|
||||
}
|
||||
|
||||
|
||||
SEXP R_longest_path(SEXP Rtaxonomy,SEXP Rtaxid)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
// int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (taxid <= 0)
|
||||
return ScalarInteger(R_NaInt);
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
return ScalarInteger(R_NaInt);
|
||||
else
|
||||
return ScalarInteger(taxon->farest);
|
||||
}
|
||||
|
||||
SEXP R_rank_list(SEXP Rtaxonomy)
|
||||
{
|
||||
int nrank;
|
||||
int i;
|
||||
ecotaxonomy_t *ptax;
|
||||
SEXP rNames;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
nrank = ptax->ranks->count;
|
||||
|
||||
rNames = PROTECT(allocVector(STRSXP, nrank));
|
||||
|
||||
for (i=0; i < nrank;i++)
|
||||
SET_STRING_ELT(rNames, i, mkChar(ptax->ranks->label[i]));
|
||||
|
||||
UNPROTECT(1);
|
||||
|
||||
return rNames;
|
||||
}
|
||||
|
||||
SEXP R_taxid_list(SEXP Rtaxonomy)
|
||||
{
|
||||
int ntaxid;
|
||||
int i;
|
||||
ecotaxonomy_t *ptax;
|
||||
SEXP rTaxids;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
ntaxid = ptax->taxons->count;
|
||||
rTaxids = PROTECT(allocVector(INTSXP, ntaxid));
|
||||
|
||||
for (i=0; i < ntaxid;i++)
|
||||
INTEGER(rTaxids)[i]=ptax->taxons->taxon[i].taxid;
|
||||
|
||||
UNPROTECT(1);
|
||||
|
||||
return rTaxids;
|
||||
|
||||
}
|
||||
|
||||
SEXP R_max_taxid(SEXP Rtaxonomy)
|
||||
{
|
||||
// int nrank;
|
||||
// int i;
|
||||
ecotaxonomy_t *ptax;
|
||||
// SEXP rNames;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
return ScalarInteger(ptax->taxons->maxtaxid);
|
||||
}
|
||||
|
||||
SEXP R_length_taxonomy(SEXP Rtaxonomy)
|
||||
{
|
||||
ecotaxonomy_t *ptax;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
return ScalarInteger(ptax->taxons->count);
|
||||
}
|
||||
|
||||
SEXP R_ecofind(SEXP Rtaxonomy, SEXP Rpattern, SEXP Rrank, SEXP Ralternative)
|
||||
{
|
||||
ecotaxonomy_t *ptax;
|
||||
econame_t *name;
|
||||
char* pattern=NULL;
|
||||
int re_match;
|
||||
SEXP taxids;
|
||||
int32_t* buffer;
|
||||
int32_t tax_count = 0;
|
||||
size_t j = 0;
|
||||
int32_t rankfilter = 1;
|
||||
int* ptaxid;
|
||||
char *rankname=NULL;
|
||||
int32_t nummatch = 0;
|
||||
int32_t alternative = 0;
|
||||
|
||||
size_t bsize;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
tax_count = ptax->taxons->count;
|
||||
|
||||
if (! isString(Rpattern))
|
||||
error("pattern not a string");
|
||||
|
||||
pattern= (char*) CHAR(STRING_ELT(Rpattern,0));
|
||||
|
||||
if (! isNull(Rrank))
|
||||
{
|
||||
if (! isString(Rrank))
|
||||
error("rank not a string");
|
||||
|
||||
rankname= (char*) CHAR(STRING_ELT(Rrank,0));
|
||||
}
|
||||
|
||||
if (! isLogical(Ralternative))
|
||||
error("rank not a logical");
|
||||
|
||||
alternative = LOGICAL(Ralternative)[0];
|
||||
|
||||
|
||||
nummatch=0;
|
||||
buffer = (int32_t*) malloc(100 * sizeof(int32_t));
|
||||
bsize=100;
|
||||
|
||||
if (alternative && ptax->names!=NULL)
|
||||
for (j=0,name=ptax->names->names;
|
||||
j < ptax->names->count;
|
||||
name++,j++)
|
||||
{
|
||||
if(rankname)
|
||||
rankfilter = !(strcmp(rankname,ptax->ranks->label[name->taxon->rank]));
|
||||
|
||||
re_match = slre_match(pattern, name->name,
|
||||
strlen(name->name),
|
||||
NULL, 0,
|
||||
SLRE_IGNORE_CASE);
|
||||
|
||||
if (re_match > 0 && rankfilter)
|
||||
{
|
||||
buffer[nummatch]=name->taxon->taxid;
|
||||
nummatch++;
|
||||
if (nummatch==bsize) {
|
||||
bsize*=2;
|
||||
buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t));
|
||||
if (buffer==0)
|
||||
{
|
||||
// regfree(&re_preg);
|
||||
error("Cannot allocate memory for the taxid list");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
for (j=0; j < ptax->taxons->count;j++)
|
||||
{
|
||||
if(rankname)
|
||||
rankfilter = !(strcmp(rankname,ptax->ranks->label[ptax->taxons->taxon[j].rank]));
|
||||
|
||||
// re_match = regexec (&re_preg, ptax->taxons->taxon[j].name, 0, NULL, 0);
|
||||
re_match = slre_match(pattern, ptax->taxons->taxon[j].name,
|
||||
strlen(ptax->taxons->taxon[j].name),
|
||||
NULL, 0,
|
||||
SLRE_IGNORE_CASE);
|
||||
|
||||
|
||||
// if (!re_match && rankfilter)
|
||||
if (re_match > 0 && rankfilter)
|
||||
{
|
||||
buffer[nummatch]=ptax->taxons->taxon[j].taxid;
|
||||
nummatch++;
|
||||
if (nummatch==bsize) {
|
||||
bsize*=2;
|
||||
buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t));
|
||||
if (buffer==0)
|
||||
{
|
||||
// regfree(&re_preg);
|
||||
error("Cannot allocate memory for the taxid list");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//regfree(&re_preg);
|
||||
|
||||
taxids = PROTECT(NEW_INTEGER(nummatch));
|
||||
ptaxid = INTEGER(taxids);
|
||||
|
||||
for (j=0; j < nummatch; j++)
|
||||
ptaxid[j]=buffer[j];
|
||||
|
||||
free(buffer);
|
||||
|
||||
UNPROTECT(1);
|
||||
return taxids;
|
||||
}
|
||||
@@ -0,0 +1,6 @@
|
||||
#include "ecoPCR.h"
|
||||
|
||||
|
||||
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
|
||||
SEXP R_delete_taxonomy(SEXP Rtaxonomy);
|
||||
|
||||
Binary file not shown.
Executable
+433
@@ -0,0 +1,433 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
|
||||
* Copyright (c) 2013 Cesanta Software Limited
|
||||
* All rights reserved
|
||||
*
|
||||
* This library is dual-licensed: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation. For the terms of this
|
||||
* license, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You are free to use this library under the terms of the GNU General
|
||||
* Public License, but WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU General Public License for more details.
|
||||
*
|
||||
* Alternatively, you can license this library under a commercial
|
||||
* license, as set out in <http://cesanta.com/products.html>.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "slre.h"
|
||||
|
||||
#define MAX_BRANCHES 100
|
||||
#define MAX_BRACKETS 100
|
||||
#define FAIL_IF(condition, error_code) if (condition) return (error_code)
|
||||
|
||||
#ifndef ARRAY_SIZE
|
||||
#define ARRAY_SIZE(ar) (sizeof(ar) / sizeof((ar)[0]))
|
||||
#endif
|
||||
|
||||
#ifdef SLRE_DEBUG
|
||||
#define DBG(x) printf x
|
||||
#else
|
||||
#define DBG(x)
|
||||
#endif
|
||||
|
||||
struct bracket_pair {
|
||||
const char *ptr; /* Points to the first char after '(' in regex */
|
||||
int len; /* Length of the text between '(' and ')' */
|
||||
int branches; /* Index in the branches array for this pair */
|
||||
int num_branches; /* Number of '|' in this bracket pair */
|
||||
};
|
||||
|
||||
struct branch {
|
||||
int bracket_index; /* index for 'struct bracket_pair brackets' */
|
||||
/* array defined below */
|
||||
const char *schlong; /* points to the '|' character in the regex */
|
||||
};
|
||||
|
||||
struct regex_info {
|
||||
/*
|
||||
* Describes all bracket pairs in the regular expression.
|
||||
* First entry is always present, and grabs the whole regex.
|
||||
*/
|
||||
struct bracket_pair brackets[MAX_BRACKETS];
|
||||
int num_brackets;
|
||||
|
||||
/*
|
||||
* Describes alternations ('|' operators) in the regular expression.
|
||||
* Each branch falls into a specific branch pair.
|
||||
*/
|
||||
struct branch branches[MAX_BRANCHES];
|
||||
int num_branches;
|
||||
|
||||
/* Array of captures provided by the user */
|
||||
struct slre_cap *caps;
|
||||
int num_caps;
|
||||
|
||||
/* E.g. SLRE_IGNORE_CASE. See enum below */
|
||||
int flags;
|
||||
};
|
||||
|
||||
static int is_metacharacter(const unsigned char *s) {
|
||||
static const char *metacharacters = "^$().[]*+?|\\Ssdbfnrtv";
|
||||
return strchr(metacharacters, *s) != NULL;
|
||||
}
|
||||
|
||||
static int op_len(const char *re) {
|
||||
return re[0] == '\\' && re[1] == 'x' ? 4 : re[0] == '\\' ? 2 : 1;
|
||||
}
|
||||
|
||||
static int set_len(const char *re, int re_len) {
|
||||
int len = 0;
|
||||
|
||||
while (len < re_len && re[len] != ']') {
|
||||
len += op_len(re + len);
|
||||
}
|
||||
|
||||
return len <= re_len ? len + 1 : -1;
|
||||
}
|
||||
|
||||
static int get_op_len(const char *re, int re_len) {
|
||||
return re[0] == '[' ? set_len(re + 1, re_len - 1) + 1 : op_len(re);
|
||||
}
|
||||
|
||||
static int is_quantifier(const char *re) {
|
||||
return re[0] == '*' || re[0] == '+' || re[0] == '?';
|
||||
}
|
||||
|
||||
static int toi(int x) {
|
||||
return isdigit(x) ? x - '0' : x - 'W';
|
||||
}
|
||||
|
||||
static int hextoi(const unsigned char *s) {
|
||||
return (toi(tolower(s[0])) << 4) | toi(tolower(s[1]));
|
||||
}
|
||||
|
||||
static int match_op(const unsigned char *re, const unsigned char *s,
|
||||
struct regex_info *info) {
|
||||
int result = 0;
|
||||
switch (*re) {
|
||||
case '\\':
|
||||
/* Metacharacters */
|
||||
switch (re[1]) {
|
||||
case 'S': FAIL_IF(isspace(*s), SLRE_NO_MATCH); result++; break;
|
||||
case 's': FAIL_IF(!isspace(*s), SLRE_NO_MATCH); result++; break;
|
||||
case 'd': FAIL_IF(!isdigit(*s), SLRE_NO_MATCH); result++; break;
|
||||
case 'b': FAIL_IF(*s != '\b', SLRE_NO_MATCH); result++; break;
|
||||
case 'f': FAIL_IF(*s != '\f', SLRE_NO_MATCH); result++; break;
|
||||
case 'n': FAIL_IF(*s != '\n', SLRE_NO_MATCH); result++; break;
|
||||
case 'r': FAIL_IF(*s != '\r', SLRE_NO_MATCH); result++; break;
|
||||
case 't': FAIL_IF(*s != '\t', SLRE_NO_MATCH); result++; break;
|
||||
case 'v': FAIL_IF(*s != '\v', SLRE_NO_MATCH); result++; break;
|
||||
|
||||
case 'x':
|
||||
/* Match byte, \xHH where HH is hexadecimal byte representaion */
|
||||
FAIL_IF(hextoi(re + 2) != *s, SLRE_NO_MATCH);
|
||||
result++;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* Valid metacharacter check is done in bar() */
|
||||
FAIL_IF(re[1] != s[0], SLRE_NO_MATCH);
|
||||
result++;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case '|': FAIL_IF(1, SLRE_INTERNAL_ERROR); break;
|
||||
case '$': FAIL_IF(1, SLRE_NO_MATCH); break;
|
||||
case '.': result++; break;
|
||||
|
||||
default:
|
||||
if (info->flags & SLRE_IGNORE_CASE) {
|
||||
FAIL_IF(tolower(*re) != tolower(*s), SLRE_NO_MATCH);
|
||||
} else {
|
||||
FAIL_IF(*re != *s, SLRE_NO_MATCH);
|
||||
}
|
||||
result++;
|
||||
break;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static int match_set(const char *re, int re_len, const char *s,
|
||||
struct regex_info *info) {
|
||||
int len = 0, result = -1, invert = re[0] == '^';
|
||||
|
||||
if (invert) re++, re_len--;
|
||||
|
||||
while (len <= re_len && re[len] != ']' && result <= 0) {
|
||||
/* Support character range */
|
||||
if (re[len] != '-' && re[len + 1] == '-' && re[len + 2] != ']' &&
|
||||
re[len + 2] != '\0') {
|
||||
result = info->flags && SLRE_IGNORE_CASE ?
|
||||
*s >= re[len] && *s <= re[len + 2] :
|
||||
tolower(*s) >= tolower(re[len]) && tolower(*s) <= tolower(re[len + 2]);
|
||||
len += 3;
|
||||
} else {
|
||||
result = match_op((unsigned char *) re + len, (unsigned char *) s, info);
|
||||
len += op_len(re + len);
|
||||
}
|
||||
}
|
||||
return (!invert && result > 0) || (invert && result <= 0) ? 1 : -1;
|
||||
}
|
||||
|
||||
static int doh(const char *s, int s_len, struct regex_info *info, int bi);
|
||||
|
||||
static int bar(const char *re, int re_len, const char *s, int s_len,
|
||||
struct regex_info *info, int bi) {
|
||||
/* i is offset in re, j is offset in s, bi is brackets index */
|
||||
int i, j, n, step;
|
||||
|
||||
for (i = j = 0; i < re_len && j <= s_len; i += step) {
|
||||
|
||||
/* Handle quantifiers. Get the length of the chunk. */
|
||||
step = re[i] == '(' ? info->brackets[bi + 1].len + 2 :
|
||||
get_op_len(re + i, re_len - i);
|
||||
|
||||
DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n", __func__,
|
||||
re_len - i, re + i, s_len - j, s + j, re_len, step, i, j));
|
||||
|
||||
FAIL_IF(is_quantifier(&re[i]), SLRE_UNEXPECTED_QUANTIFIER);
|
||||
FAIL_IF(step <= 0, SLRE_INVALID_CHARACTER_SET);
|
||||
|
||||
if (i + step < re_len && is_quantifier(re + i + step)) {
|
||||
DBG(("QUANTIFIER: [%.*s]%c [%.*s]\n", step, re + i,
|
||||
re[i + step], s_len - j, s + j));
|
||||
if (re[i + step] == '?') {
|
||||
int result = bar(re + i, step, s + j, s_len - j, info, bi);
|
||||
j += result > 0 ? result : 0;
|
||||
i++;
|
||||
} else if (re[i + step] == '+' || re[i + step] == '*') {
|
||||
int j2 = j, nj = j, n1, n2 = -1, ni, non_greedy = 0;
|
||||
|
||||
/* Points to the regexp code after the quantifier */
|
||||
ni = i + step + 1;
|
||||
if (ni < re_len && re[ni] == '?') {
|
||||
non_greedy = 1;
|
||||
ni++;
|
||||
}
|
||||
|
||||
do {
|
||||
if ((n1 = bar(re + i, step, s + j2, s_len - j2, info, bi)) > 0) {
|
||||
j2 += n1;
|
||||
}
|
||||
if (re[i + step] == '+' && n1 < 0) break;
|
||||
|
||||
if (ni >= re_len) {
|
||||
/* After quantifier, there is nothing */
|
||||
nj = j2;
|
||||
} else if ((n2 = bar(re + ni, re_len - ni, s + j2,
|
||||
s_len - j2, info, bi)) >= 0) {
|
||||
/* Regex after quantifier matched */
|
||||
nj = j2 + n2;
|
||||
}
|
||||
if (nj > j && non_greedy) break;
|
||||
} while (n1 > 0);
|
||||
|
||||
if (n1 < 0 && re[i + step] == '*' &&
|
||||
(n2 = bar(re + ni, re_len - ni, s + j, s_len - j, info, bi)) > 0) {
|
||||
nj = j + n2;
|
||||
}
|
||||
|
||||
DBG(("STAR/PLUS END: %d %d %d %d %d\n", j, nj, re_len - ni, n1, n2));
|
||||
FAIL_IF(re[i + step] == '+' && nj == j, SLRE_NO_MATCH);
|
||||
|
||||
/* If while loop body above was not executed for the * quantifier, */
|
||||
/* make sure the rest of the regex matches */
|
||||
FAIL_IF(nj == j && ni < re_len && n2 < 0, SLRE_NO_MATCH);
|
||||
|
||||
/* Returning here cause we've matched the rest of RE already */
|
||||
return nj;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (re[i] == '[') {
|
||||
n = match_set(re + i + 1, re_len - (i + 2), s + j, info);
|
||||
DBG(("SET %.*s [%.*s] -> %d\n", step, re + i, s_len - j, s + j, n));
|
||||
FAIL_IF(n <= 0, SLRE_NO_MATCH);
|
||||
j += n;
|
||||
} else if (re[i] == '(') {
|
||||
n = SLRE_NO_MATCH;
|
||||
bi++;
|
||||
FAIL_IF(bi >= info->num_brackets, SLRE_INTERNAL_ERROR);
|
||||
DBG(("CAPTURING [%.*s] [%.*s] [%s]\n",
|
||||
step, re + i, s_len - j, s + j, re + i + step));
|
||||
|
||||
if (re_len - (i + step) <= 0) {
|
||||
/* Nothing follows brackets */
|
||||
n = doh(s + j, s_len - j, info, bi);
|
||||
} else {
|
||||
int j2;
|
||||
for (j2 = 0; j2 <= s_len - j; j2++) {
|
||||
if ((n = doh(s + j, s_len - (j + j2), info, bi)) >= 0 &&
|
||||
bar(re + i + step, re_len - (i + step),
|
||||
s + j + n, s_len - (j + n), info, bi) >= 0) break;
|
||||
}
|
||||
}
|
||||
|
||||
DBG(("CAPTURED [%.*s] [%.*s]:%d\n", step, re + i, s_len - j, s + j, n));
|
||||
FAIL_IF(n < 0, n);
|
||||
if (info->caps != NULL) {
|
||||
info->caps[bi - 1].ptr = s + j;
|
||||
info->caps[bi - 1].len = n;
|
||||
}
|
||||
j += n;
|
||||
} else if (re[i] == '^') {
|
||||
FAIL_IF(j != 0, SLRE_NO_MATCH);
|
||||
} else if (re[i] == '$') {
|
||||
FAIL_IF(j != s_len, SLRE_NO_MATCH);
|
||||
} else {
|
||||
FAIL_IF(j >= s_len, SLRE_NO_MATCH);
|
||||
n = match_op((unsigned char *) (re + i), (unsigned char *) (s + j), info);
|
||||
FAIL_IF(n <= 0, n);
|
||||
j += n;
|
||||
}
|
||||
}
|
||||
|
||||
return j;
|
||||
}
|
||||
|
||||
/* Process branch points */
|
||||
static int doh(const char *s, int s_len, struct regex_info *info, int bi) {
|
||||
const struct bracket_pair *b = &info->brackets[bi];
|
||||
int i = 0, len, result;
|
||||
const char *p;
|
||||
|
||||
do {
|
||||
p = i == 0 ? b->ptr : info->branches[b->branches + i - 1].schlong + 1;
|
||||
len = b->num_branches == 0 ? b->len :
|
||||
i == b->num_branches ? (int) (b->ptr + b->len - p) :
|
||||
(int) (info->branches[b->branches + i].schlong - p);
|
||||
DBG(("%s %d %d [%.*s] [%.*s]\n", __func__, bi, i, len, p, s_len, s));
|
||||
result = bar(p, len, s, s_len, info, bi);
|
||||
DBG(("%s <- %d\n", __func__, result));
|
||||
} while (result <= 0 && i++ < b->num_branches); /* At least 1 iteration */
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static int baz(const char *s, int s_len, struct regex_info *info) {
|
||||
int i, result = -1, is_anchored = info->brackets[0].ptr[0] == '^';
|
||||
|
||||
for (i = 0; i <= s_len; i++) {
|
||||
result = doh(s + i, s_len - i, info, 0);
|
||||
if (result >= 0) {
|
||||
result += i;
|
||||
break;
|
||||
}
|
||||
if (is_anchored) break;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static void setup_branch_points(struct regex_info *info) {
|
||||
int i, j;
|
||||
struct branch tmp;
|
||||
|
||||
/* First, sort branches. Must be stable, no qsort. Use bubble algo. */
|
||||
for (i = 0; i < info->num_branches; i++) {
|
||||
for (j = i + 1; j < info->num_branches; j++) {
|
||||
if (info->branches[i].bracket_index > info->branches[j].bracket_index) {
|
||||
tmp = info->branches[i];
|
||||
info->branches[i] = info->branches[j];
|
||||
info->branches[j] = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* For each bracket, set their branch points. This way, for every bracket
|
||||
* (i.e. every chunk of regex) we know all branch points before matching.
|
||||
*/
|
||||
for (i = j = 0; i < info->num_brackets; i++) {
|
||||
info->brackets[i].num_branches = 0;
|
||||
info->brackets[i].branches = j;
|
||||
while (j < info->num_branches && info->branches[j].bracket_index == i) {
|
||||
info->brackets[i].num_branches++;
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int foo(const char *re, int re_len, const char *s, int s_len,
|
||||
struct regex_info *info) {
|
||||
int i, step, depth = 0;
|
||||
|
||||
/* First bracket captures everything */
|
||||
info->brackets[0].ptr = re;
|
||||
info->brackets[0].len = re_len;
|
||||
info->num_brackets = 1;
|
||||
|
||||
/* Make a single pass over regex string, memorize brackets and branches */
|
||||
for (i = 0; i < re_len; i += step) {
|
||||
step = get_op_len(re + i, re_len - i);
|
||||
|
||||
if (re[i] == '|') {
|
||||
FAIL_IF(info->num_branches >= (int) ARRAY_SIZE(info->branches),
|
||||
SLRE_TOO_MANY_BRANCHES);
|
||||
info->branches[info->num_branches].bracket_index =
|
||||
info->brackets[info->num_brackets - 1].len == -1 ?
|
||||
info->num_brackets - 1 : depth;
|
||||
info->branches[info->num_branches].schlong = &re[i];
|
||||
info->num_branches++;
|
||||
} else if (re[i] == '\\') {
|
||||
FAIL_IF(i >= re_len - 1, SLRE_INVALID_METACHARACTER);
|
||||
if (re[i + 1] == 'x') {
|
||||
/* Hex digit specification must follow */
|
||||
FAIL_IF(re[i + 1] == 'x' && i >= re_len - 3,
|
||||
SLRE_INVALID_METACHARACTER);
|
||||
FAIL_IF(re[i + 1] == 'x' && !(isxdigit(re[i + 2]) &&
|
||||
isxdigit(re[i + 3])), SLRE_INVALID_METACHARACTER);
|
||||
} else {
|
||||
FAIL_IF(!is_metacharacter((unsigned char *) re + i + 1),
|
||||
SLRE_INVALID_METACHARACTER);
|
||||
}
|
||||
} else if (re[i] == '(') {
|
||||
FAIL_IF(info->num_brackets >= (int) ARRAY_SIZE(info->brackets),
|
||||
SLRE_TOO_MANY_BRACKETS);
|
||||
depth++; /* Order is important here. Depth increments first. */
|
||||
info->brackets[info->num_brackets].ptr = re + i + 1;
|
||||
info->brackets[info->num_brackets].len = -1;
|
||||
info->num_brackets++;
|
||||
FAIL_IF(info->num_caps > 0 && info->num_brackets - 1 > info->num_caps,
|
||||
SLRE_CAPS_ARRAY_TOO_SMALL);
|
||||
} else if (re[i] == ')') {
|
||||
int ind = info->brackets[info->num_brackets - 1].len == -1 ?
|
||||
info->num_brackets - 1 : depth;
|
||||
info->brackets[ind].len = (int) (&re[i] - info->brackets[ind].ptr);
|
||||
DBG(("SETTING BRACKET %d [%.*s]\n",
|
||||
ind, info->brackets[ind].len, info->brackets[ind].ptr));
|
||||
depth--;
|
||||
FAIL_IF(depth < 0, SLRE_UNBALANCED_BRACKETS);
|
||||
FAIL_IF(i > 0 && re[i - 1] == '(', SLRE_NO_MATCH);
|
||||
}
|
||||
}
|
||||
|
||||
FAIL_IF(depth != 0, SLRE_UNBALANCED_BRACKETS);
|
||||
setup_branch_points(info);
|
||||
|
||||
return baz(s, s_len, info);
|
||||
}
|
||||
|
||||
int slre_match(const char *regexp, const char *s, int s_len,
|
||||
struct slre_cap *caps, int num_caps, int flags) {
|
||||
struct regex_info info;
|
||||
|
||||
/* Initialize info structure */
|
||||
info.flags = flags;
|
||||
info.num_brackets = info.num_branches = 0;
|
||||
info.num_caps = num_caps;
|
||||
info.caps = caps;
|
||||
|
||||
DBG(("========================> [%s] [%.*s]\n", regexp, s_len, s));
|
||||
return foo(regexp, (int) strlen(regexp), s, s_len, &info);
|
||||
}
|
||||
Executable
+60
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
|
||||
* Copyright (c) 2013 Cesanta Software Limited
|
||||
* All rights reserved
|
||||
*
|
||||
* This library is dual-licensed: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation. For the terms of this
|
||||
* license, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You are free to use this library under the terms of the GNU General
|
||||
* Public License, but WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU General Public License for more details.
|
||||
*
|
||||
* Alternatively, you can license this library under a commercial
|
||||
* license, as set out in <http://cesanta.com/products.html>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This is a regular expression library that implements a subset of Perl RE.
|
||||
* Please refer to README.md for a detailed reference.
|
||||
*/
|
||||
|
||||
#ifndef SLRE_HEADER_DEFINED
|
||||
#define SLRE_HEADER_DEFINED
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct slre_cap {
|
||||
const char *ptr;
|
||||
int len;
|
||||
};
|
||||
|
||||
|
||||
int slre_match(const char *regexp, const char *buf, int buf_len,
|
||||
struct slre_cap *caps, int num_caps, int flags);
|
||||
|
||||
/* Possible flags for slre_match() */
|
||||
enum { SLRE_IGNORE_CASE = 1 };
|
||||
|
||||
|
||||
/* slre_match() failure codes */
|
||||
#define SLRE_NO_MATCH -1
|
||||
#define SLRE_UNEXPECTED_QUANTIFIER -2
|
||||
#define SLRE_UNBALANCED_BRACKETS -3
|
||||
#define SLRE_INTERNAL_ERROR -4
|
||||
#define SLRE_INVALID_CHARACTER_SET -5
|
||||
#define SLRE_INVALID_METACHARACTER -6
|
||||
#define SLRE_CAPS_ARRAY_TOO_SMALL -7
|
||||
#define SLRE_TOO_MANY_BRANCHES -8
|
||||
#define SLRE_TOO_MANY_BRACKETS -9
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* SLRE_HEADER_DEFINED */
|
||||
BIN
Binary file not shown.
Reference in New Issue
Block a user