initial commit
This commit is contained in:
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
/man/
|
||||
/vignettes/
|
||||
/Read-and-delete-me
|
20
DESCRIPTION
Normal file
20
DESCRIPTION
Normal file
@@ -0,0 +1,20 @@
|
||||
Package: ROBITaxonomy
|
||||
Type: Package
|
||||
Title: Metabarcoding data biodiversity analysis
|
||||
Version: 0.1
|
||||
Date: 2012-08-23
|
||||
Author: LECA - Laboratoire d'ecologie alpine
|
||||
Maintainer: LECA OBITools team <obitools@metabarcoding.org>
|
||||
Description: More about what it does (maybe more than one line)
|
||||
License: CeCILL v2.0
|
||||
LazyLoad: yes
|
||||
Roxygen: list(wrap = FALSE)
|
||||
Collate:
|
||||
'ROBITaxonomy.R'
|
||||
'taxonomy.R'
|
||||
'basic.R'
|
||||
'default.R'
|
||||
'distance.R'
|
||||
'lca.R'
|
||||
'rank.R'
|
||||
RoxygenNote: 5.0.1
|
28
NAMESPACE
Normal file
28
NAMESPACE
Normal file
@@ -0,0 +1,28 @@
|
||||
# Generated by roxygen2: do not edit by hand
|
||||
|
||||
export(default.taxonomy)
|
||||
export(distance.taxonomy)
|
||||
export(ecofind)
|
||||
export(family)
|
||||
export(genus)
|
||||
export(is.obitools.taxonomy)
|
||||
export(is.subcladeof)
|
||||
export(kingdom)
|
||||
export(length.obitools.taxonomy)
|
||||
export(longest.path)
|
||||
export(lowest.common.ancestor)
|
||||
export(max.obitools.taxonomy)
|
||||
export(parent)
|
||||
export(path)
|
||||
export(rank.list)
|
||||
export(read.taxonomy)
|
||||
export(scientificname)
|
||||
export(species)
|
||||
export(superkingdom)
|
||||
export(taxid.list)
|
||||
export(taxonatrank)
|
||||
export(taxonomicrank)
|
||||
export(validate)
|
||||
exportClasses(obitools.taxonomy)
|
||||
exportClasses(obitools.taxonomyOrNULL)
|
||||
useDynLib(ROBITaxonomy)
|
0
R/ROBITaxonomy.R
Normal file
0
R/ROBITaxonomy.R
Normal file
359
R/basic.R
Normal file
359
R/basic.R
Normal file
@@ -0,0 +1,359 @@
|
||||
#' @include taxonomy.R
|
||||
NULL
|
||||
|
||||
#' @export
|
||||
setGeneric("scientificname", function(taxonomy,taxid) {
|
||||
return(standardGeneric("scientificname"))
|
||||
})
|
||||
|
||||
#' Returns the scientific name corresponding to a \emph{NCBI taxid}
|
||||
#'
|
||||
#' \code{scientificname} function in package \pkg{\link{ROBITaxonomy}} returns the
|
||||
#' scientific name corresponding to a \emph{NCBI taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\link{obitools.taxonomy}} instance
|
||||
#' @param taxid an integer value or a vector of integer representing NCBI
|
||||
#' taxonomic identifiers.
|
||||
#' @return The scientific name of the corresponding taxon as a string or a
|
||||
#' vector of string if the \code{taxid} argument is itself a vector
|
||||
#'
|
||||
#' @examples
|
||||
#' # load the default taxonomy database include in the ROBITaxonomy library
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the scientific names correponding to these taxids
|
||||
#' scientificname(taxo,sp.taxid)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname scientificname-methods
|
||||
#' @aliases scientificname-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#'
|
||||
setMethod("scientificname", "obitools.taxonomy",function(taxonomy,taxid) {
|
||||
getscname = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return( .Call('R_get_scientific_name',
|
||||
taxonomy,
|
||||
t,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
sapply(taxid,getscname)
|
||||
})
|
||||
|
||||
|
||||
######################################################################
|
||||
######################################################################
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("parent", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("parent"))
|
||||
})
|
||||
|
||||
#' Returns the parent taxon corresponding to a \emph{NCBI taxid}
|
||||
#'
|
||||
#' \code{parent} function in package \pkg{\link{ROBITaxonomy}} returns the
|
||||
#' parent taxon corresponding to a \emph{NCBI taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\link{obitools.taxonomy}} instance
|
||||
#' @param taxid an integer value or a vector of integer representing NCBI
|
||||
#' taxonomic identifiers.
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating if the
|
||||
#' method returns a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{ \item{If \code{name==FALSE}}{the taxid of the
|
||||
#' parent taxon as an integer or a vector of
|
||||
#' integers if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the
|
||||
#' parent taxon as a string or a vector of
|
||||
#' string if the \code{taxid} argument is itself a
|
||||
#' vector} }
|
||||
#'
|
||||
#' @examples
|
||||
#' # load the default taxonomy database include in the ROBITaxonomy library
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the parent taxa correponding to these taxids
|
||||
#' parent(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same things but scientific names are returned
|
||||
#' parent(taxo,sp.taxid,TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname parent-methods
|
||||
#' @aliases parent-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#'
|
||||
setMethod("parent", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_get_parent',
|
||||
taxonomy,
|
||||
as.integer(t),
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
name = as.logical(name[1])
|
||||
sapply(taxid,getp)
|
||||
})
|
||||
|
||||
|
||||
|
||||
######################################################################
|
||||
######################################################################
|
||||
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("taxid.list", function(taxonomy) {
|
||||
return(standardGeneric("taxid.list"))
|
||||
})
|
||||
|
||||
|
||||
#' Returns the list of all taxids belonging the taxonomy.
|
||||
#'
|
||||
#' \code{taxid.list} returns the list of all taxids included in the
|
||||
#' instance of the class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
|
||||
#'
|
||||
#' @return an \code{integer} vector containing the list of taxids.
|
||||
#'
|
||||
#' @examples
|
||||
#' # loads the default taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # returns the count of taxa described in the taxonomy
|
||||
#' length(taxo)
|
||||
#'
|
||||
#' # extracts the list of all valid taxids
|
||||
#' good = taxid.list(taxo)
|
||||
#'
|
||||
#' # returns the size of the returned list
|
||||
#' length(good)
|
||||
#'
|
||||
#' @seealso \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @docType methods
|
||||
#' @rdname taxid.list-method
|
||||
#' @aliases taxid.list
|
||||
#'
|
||||
setMethod("taxid.list", "obitools.taxonomy",
|
||||
function(taxonomy) {
|
||||
return(.Call('R_taxid_list',
|
||||
taxonomy,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
})
|
||||
|
||||
######################################################################
|
||||
######################################################################
|
||||
|
||||
|
||||
|
||||
#' Returns the count of taxa in the taxonomy.
|
||||
#'
|
||||
#' \code{length} returns the count of taxa included in the
|
||||
#' instance of the class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @param x the \code{\linkS4class{obitools.taxonomy}} to use.
|
||||
#'
|
||||
#' @return an \code{integer} corresponding to the count of taxa.
|
||||
#'
|
||||
#' @examples
|
||||
#' # loads the default taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # returns the count of taxa described in the taxonomy
|
||||
#' length(taxo)
|
||||
#'
|
||||
#' @seealso \code{\link{length}}, \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @export length.obitools.taxonomy
|
||||
#'
|
||||
length.obitools.taxonomy = function(x)
|
||||
{
|
||||
return(.Call('R_length_taxonomy',
|
||||
x,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
|
||||
######################################################################
|
||||
######################################################################
|
||||
|
||||
setGeneric('max')
|
||||
|
||||
#' Returns the maximum taxid in the taxonomy.
|
||||
#'
|
||||
#' \code{length} returns the maximum taxid included in the
|
||||
#' instance of the class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
|
||||
#' @param na.rm included for compatibility purpose, this parameter as
|
||||
#' no effect on this implementation of \code{max}
|
||||
#'
|
||||
#' @return an \code{integer} corresponding to the count of taxa.
|
||||
#'
|
||||
#' @examples
|
||||
#' # load the default taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # gets the larger taxid of the database
|
||||
#' max(taxo)
|
||||
#'
|
||||
#' @seealso \code{\link{max}}, \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @export max.obitools.taxonomy
|
||||
#'
|
||||
max.obitools.taxonomy=function(taxonomy,na.rm = FALSE) {
|
||||
return(.Call('R_max_taxid',
|
||||
taxonomy,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
#' @export
|
||||
setGeneric("ecofind", function(taxonomy,patterns,rank=NULL,alternative=FALSE) {
|
||||
return(standardGeneric("ecofind"))
|
||||
})
|
||||
|
||||
#' Returns taxids associated to the names
|
||||
#'
|
||||
#' Return the set of taxids having their name matching the given pattern.
|
||||
#'
|
||||
#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
|
||||
#' @param patterns one or several regular pattern used to select the the taxa.
|
||||
#' @param rank a \code{character} indicating a taxonomic rank. If not \code{NULL}
|
||||
#' only taxids correponding to this rank are returned.
|
||||
#' @param alternative A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the function must only look for a scientific name.
|
||||
#'
|
||||
#' @return if just one pattern is given, an integer vector is returned with the
|
||||
#' corresponding taxids. If a list of patterns is given, the function
|
||||
#' returns a list of integer vectors, each vector containing the taxids
|
||||
#' corresponding to a pattern. The returned list is in the same order
|
||||
#' than the given patern list.
|
||||
#'
|
||||
#' @examples
|
||||
#' # load the default taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # retreives the Vertebrata taxid
|
||||
#' taxid = ecofind(taxo,"Vertebrata")
|
||||
#'
|
||||
#' taxid
|
||||
#' scientificname(taxo,taxid)
|
||||
#'
|
||||
#'
|
||||
#' taxid = ecofind(taxo,"^Vertebrata$")
|
||||
#'
|
||||
#' taxid
|
||||
#' scientificname(taxo,taxid)
|
||||
#'
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @docType methods
|
||||
#' @rdname ecofind-method
|
||||
#' @aliases ecofind,obitools.taxonomy
|
||||
#'
|
||||
setMethod("ecofind", "obitools.taxonomy",function(taxonomy,patterns,rank=NULL,alternative=FALSE) {
|
||||
getp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(unique(.Call('R_ecofind',
|
||||
taxonomy,
|
||||
t,
|
||||
rank,
|
||||
alternative,
|
||||
PACKAGE="ROBITaxonomy")))
|
||||
}
|
||||
|
||||
patterns = as.character(patterns)
|
||||
taxid=lapply(patterns,getp)
|
||||
if (length(taxid)==1)
|
||||
taxid=taxid[[1]]
|
||||
|
||||
return(taxid)
|
||||
})
|
||||
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("validate", function(taxonomy,taxid) {
|
||||
return(standardGeneric("validate"))
|
||||
})
|
||||
|
||||
#' Checks that a \emph{taxid} is really present in taxonomy
|
||||
#'
|
||||
#' \code{validate} function in package \pkg{\link{ROBITaxonomy}} checks
|
||||
#' that a \emph{taxid} is declared in the considered taxonomy.
|
||||
#'
|
||||
#' @param taxonomy a \code{\link{obitools.taxonomy}} instance
|
||||
#' @param taxid an integer value or a vector of integer representing NCBI
|
||||
#' taxonomic identifiers.
|
||||
#'
|
||||
#' @return The taxid if it exists, NA otherwise. If the input taxid is a
|
||||
#' vector of integer returns an integer vector composed of validated
|
||||
#' taxids and NA values.
|
||||
#'
|
||||
#' @examples
|
||||
#' # load the default taxonomy database include in the ROBITaxonomy library
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 101 taxids
|
||||
#' sp.taxid=c(7000:7100)
|
||||
#'
|
||||
#' # checks the list of taxids
|
||||
#' validate(taxo,sp.taxid)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname validate-methods
|
||||
#' @aliases validate-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#'
|
||||
setMethod("validate", "obitools.taxonomy",function(taxonomy,taxid) {
|
||||
getp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_validate_taxid',
|
||||
taxonomy,
|
||||
t,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
sapply(taxid,getp)
|
||||
})
|
||||
|
52
R/default.R
Normal file
52
R/default.R
Normal file
@@ -0,0 +1,52 @@
|
||||
#' @include taxonomy.R
|
||||
NULL
|
||||
|
||||
|
||||
#
|
||||
#
|
||||
# Manage le loading of the default taxonomy
|
||||
#
|
||||
#
|
||||
|
||||
.__default__taxonomy__ = NULL
|
||||
|
||||
#' Returns the default taxonomy
|
||||
#'
|
||||
#' Returns a \code{\linkS4class{obitools.taxonomy}} instance corresponding
|
||||
#' to a NCBI taxonomy included by default in the \pkg{\link{ROBITaxonomy}} package.
|
||||
#'
|
||||
#' @return a \code{\linkS4class{obitools.taxonomy}} instance.
|
||||
#'
|
||||
#' @examples
|
||||
#'
|
||||
#' # Load the default taxonomy
|
||||
#' taxo = default.taxonomy()
|
||||
#'
|
||||
#' # and use it for requesting a scientific name
|
||||
#' scientificname(taxo,7742)
|
||||
#'
|
||||
#' @seealso \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @export
|
||||
#'
|
||||
default.taxonomy = function() {
|
||||
if (is.null(get(".__default__taxonomy__",envir = environment())))
|
||||
assign(".__default__taxonomy__",
|
||||
read.taxonomy(paste(system.file("extdata",
|
||||
package="ROBITaxonomy"),
|
||||
'ncbitaxo',
|
||||
sep='/')),
|
||||
envir=globalenv())
|
||||
|
||||
return(get(".__default__taxonomy__",envir = globalenv()))
|
||||
}
|
||||
|
||||
|
||||
#' @export
|
||||
#'
|
||||
is.obitools.taxonomy = function(taxonomy) {
|
||||
class(t)[1] == "obitools.taxonomy"
|
||||
}
|
||||
|
189
R/distance.R
Normal file
189
R/distance.R
Normal file
@@ -0,0 +1,189 @@
|
||||
#' @include taxonomy.R
|
||||
NULL
|
||||
|
||||
#' @export
|
||||
setGeneric("longest.path", function(taxonomy,taxid) {
|
||||
return(standardGeneric("longest.path"))
|
||||
})
|
||||
|
||||
#' Returns the longuest path from a taxon.
|
||||
#'
|
||||
#' The method \code{longest.path} returns the length of the
|
||||
#' path linking a taxid to the farest leaf belonging this taxid.
|
||||
#'
|
||||
#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
|
||||
#'
|
||||
#' @param taxid an \code{integer} vector containing the list of taxids.
|
||||
#'
|
||||
#' @return an \code{integer} vector containing the list length.
|
||||
#'
|
||||
#' @examples
|
||||
#' # loads the default taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # returns the longest path in the taxonomy (from the root node)
|
||||
#' longest.path(taxo,1)
|
||||
#'
|
||||
#'
|
||||
#' @seealso \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @docType methods
|
||||
#' @rdname longest.path-method
|
||||
#' @aliases longest.path,obitools.taxonomy
|
||||
#'
|
||||
setMethod("longest.path", "obitools.taxonomy",
|
||||
function(taxonomy,taxid) {
|
||||
getp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_longest_path',
|
||||
taxonomy,
|
||||
t,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
sapply(taxid,getp)
|
||||
})
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("distance.taxonomy", function(taxonomy,taxid1,taxid2=NULL,name=F) {
|
||||
return(standardGeneric("distance.taxonomy"))
|
||||
})
|
||||
|
||||
|
||||
#' Computes a distance matrix between taxids
|
||||
#'
|
||||
#' The method \code{taxonomy.distance} computes a distance matrix between a
|
||||
#' set of taxids. The distance between two taxa is based on the topology of
|
||||
#' the taxonomomy tree.
|
||||
#'
|
||||
#' \deqn{ d(Taxon_A,Taxon_B) = \frac{longest.path(lca(Taxon_A,Taxon_B))}{max(longest.path(Taxon_A),longest.path(Taxon_B))}}
|
||||
#' { longest.path(lca(Taxon_A,Taxon_B)) / max(longest.path(Taxon_A),longest.path(Taxon_B)) }
|
||||
#'
|
||||
#'
|
||||
#' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
|
||||
#'
|
||||
#' @param taxid1 an \code{integer} vector containing a list of taxids.
|
||||
#'
|
||||
#' @param taxid2 an \code{integer} vector containing a list of taxids.
|
||||
#' If \code{taxid2} is set to \code{NULL} (it's default value)
|
||||
#' then the \code{taxid2} list is considered as equal to
|
||||
#' \code{taxid1} list.
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return distance matrix annotated by taxids or
|
||||
#' by scientific names.
|
||||
#'
|
||||
#' @return the distance matrix between taxids specified in the \code{taxid1}
|
||||
#' set and the \code{taxid2} set.
|
||||
#'
|
||||
#' @examples
|
||||
#' # loads the default taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # computes the distance matrix between taxids
|
||||
#' distance.taxonomy(taxo,sp.taxid)
|
||||
#'
|
||||
#' # Same thing but the matrix is annotated by scientific names
|
||||
#' distance.taxonomy(taxo,sp.taxid,name=TRUE)
|
||||
#'
|
||||
#' @seealso \code{\link{longest.path}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @docType methods
|
||||
#' @rdname distance.taxonomy-method
|
||||
#' @aliases taxonomy.distance,obitools.taxonomy
|
||||
#'
|
||||
setMethod("distance.taxonomy", "obitools.taxonomy",
|
||||
function(taxonomy,taxid1,taxid2=NULL,name=F) {
|
||||
taxdist = function(r)
|
||||
{
|
||||
t1=r[1]
|
||||
t2=r[2]
|
||||
if (is.na(t1) | is.na(t2))
|
||||
return(NA)
|
||||
|
||||
p1 = path(taxonomy,t1)
|
||||
p2 = path(taxonomy,t2)
|
||||
|
||||
minp = min(length(p1),length(p2))
|
||||
common = sum(p1[1:minp] == p2[1:minp])
|
||||
lca = p1[common]
|
||||
lp = longest.path(taxonomy,lca)
|
||||
return(lp/(lp+common))
|
||||
}
|
||||
|
||||
multitaxdist=function(t1,t2) {
|
||||
apply(data.frame(t1,t2),1,taxdist)
|
||||
}
|
||||
|
||||
taxid1 = taxid1[! is.na(validate(taxonomy,taxid1))]
|
||||
t1 = path(taxonomy,taxid1)
|
||||
|
||||
same = is.null(taxid2)
|
||||
|
||||
if (same)
|
||||
{
|
||||
ntaxon = length(taxid1)
|
||||
t2 = t1[unlist(sapply(2:ntaxon,
|
||||
function(x) x:ntaxon))]
|
||||
t1 = t1[rep(1:(ntaxon-1),(ntaxon-1):1)]
|
||||
}
|
||||
else
|
||||
{
|
||||
taxid2 = taxid2[! is.na(validate(taxonomy,taxid2))]
|
||||
t2 = path(taxonomy,taxid2)
|
||||
nt1 = length(taxid1)
|
||||
nt2 = length(taxid2)
|
||||
t1 = t1[rep(1:nt1,nt2)]
|
||||
t2 = t2[rep(1:nt2,rep(nt1,nt2))]
|
||||
}
|
||||
|
||||
lmin = mapply(function(a,b) min(length(a),length(b)),
|
||||
t1,
|
||||
t2)
|
||||
|
||||
llca = mapply(function(x,y,l) sum(x[1:l]==y[1:l]),
|
||||
t1,
|
||||
t2,
|
||||
lmin)
|
||||
|
||||
lb = longest.path(taxonomy,mapply(function(x,y) x[y],t1,llca))
|
||||
d = as.double(lb / (lb + llca))
|
||||
|
||||
if (same) {
|
||||
attr(d, "Size") <- ntaxon
|
||||
if (name)
|
||||
attr(d, "Labels") <- scientificname(taxonomy,taxid1)
|
||||
else
|
||||
attr(d, "Labels") <- as.character(taxid1)
|
||||
attr(d, "Diag") <- FALSE
|
||||
attr(d, "Upper") <- FALSE
|
||||
attr(d, "method") <- NULL
|
||||
attr(d, "call") <- match.call()
|
||||
class(d) <- "dist"
|
||||
}
|
||||
else {
|
||||
if (name)
|
||||
d = matrix(d,nt1,nt2,
|
||||
dimnames=list(scientificname(taxonomy,taxid1),
|
||||
scientificname(taxonomy,taxid2)))
|
||||
else
|
||||
d = matrix(d,nt1,nt2,
|
||||
dimnames=list(as.character(taxid1),
|
||||
as.character(taxid2)))
|
||||
|
||||
}
|
||||
|
||||
return(d)
|
||||
})
|
||||
|
||||
|
122
R/lca.R
Normal file
122
R/lca.R
Normal file
@@ -0,0 +1,122 @@
|
||||
#' @include taxonomy.R
|
||||
NULL
|
||||
|
||||
#' @export
|
||||
setGeneric("lowest.common.ancestor", function(taxonomy,taxid,threshold=1.0,error=0,name=FALSE) {
|
||||
return(standardGeneric("lowest.common.ancestor"))
|
||||
})
|
||||
|
||||
#' Computes the lowest common ancestor in the taxonomy tree between a set of taxa
|
||||
#'
|
||||
#' The \code{lowest.common.ancestor} function in package \pkg{ROBITaxonomy} computes
|
||||
#' the lowest common ancestor of a set of taxids. The lowest common ancestor (LCA)
|
||||
#' is the most precise taxonomic group shared by all the considered taxa. Tha
|
||||
#' \code{lowest.common.ancestor} function implemented in the \pkg{ROBITaxonomy}
|
||||
#' package, considers a fuzzy definition of the LCA as the most precise
|
||||
#' taxonomic group shared by a quorum of the considered taxa.
|
||||
#'
|
||||
#' @param taxonomy an instance of \code{\linkS4class{obitools.taxonomy}}
|
||||
#' @param taxid an integer value or a vector of integer representing NCBI
|
||||
#' taxonomic identifiers.
|
||||
#' @param threshold a numeric value between 0.0 and 1.0 indicating the minimum
|
||||
#' quorum of taxid that must belong the LCA.
|
||||
#' @param error an integer value indicating the maximum count of taxids that
|
||||
#' have not to belong the returned taxid. A \code{threshold} below 1.0 have
|
||||
#' priority on the \code{error} parameter.
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating if the
|
||||
#' method return a \emph{taxid} or a scientific name.
|
||||
#'
|
||||
#' @return Depending on the value of the \code{name} argument, set by default
|
||||
#' to \code{FALSE} the method returns :
|
||||
#' \describe{
|
||||
#' \item{If \code{name==FALSE}}{ the taxid of the taxon corresponding
|
||||
#' to the LCA as an integer value}
|
||||
#' \item{If \code{name==TRUE}}{ the scientific name of the taxon
|
||||
#' corresponding to the LCA as a string}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' require(ROBITaxonomy)
|
||||
#'
|
||||
#' \dontshow{# switch the working directory to the data package directory}
|
||||
#' \dontshow{setwd(system.file("extdata", package="ROBITaxonomy"))}
|
||||
#'
|
||||
#' # read the taxonomy database
|
||||
#'
|
||||
#' taxo=read.taxonomy('ncbitaxo')
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#'
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the lowest common ancestor taxids
|
||||
#'
|
||||
#' lowest.common.ancestor(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' lowest.common.ancestor(taxo,sp.taxid,name=TRUE)
|
||||
#'
|
||||
#' # If we accept than 2 or 1 taxa do not belong the LCA
|
||||
#' lowest.common.ancestor(taxo,sp.taxid,name=TRUE,error=2)
|
||||
#' lowest.common.ancestor(taxo,sp.taxid,name=TRUE,error=1)
|
||||
#'
|
||||
#' # Partial LCA can also be speciefied as the minimal frequency of
|
||||
#' # taxa belonging the LCA
|
||||
#' lowest.common.ancestor(taxo,sp.taxid,name=TRUE,threshold=0.8)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{path}}, \code{\link{parent}},
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @docType methods
|
||||
#' @rdname lowest.common.ancestor-method
|
||||
#' @aliases lowest.common.ancestor,obitools.taxonomy
|
||||
#'
|
||||
setMethod("lowest.common.ancestor", "obitools.taxonomy",
|
||||
function(taxonomy,taxid,threshold=1.0,error=0,name=FALSE) {
|
||||
|
||||
if (threshold != 1.0)
|
||||
error=as.integer(floor(length(taxid) * (1-threshold)))
|
||||
|
||||
|
||||
#
|
||||
# Remove nod valid taxid
|
||||
#
|
||||
|
||||
taxid = validate(taxonomy,taxid)
|
||||
if (any(is.na(taxid)))
|
||||
return(NA)
|
||||
|
||||
ntaxid=length(taxid)
|
||||
nok = ntaxid - error
|
||||
if (ntaxid==1)
|
||||
return(taxid)
|
||||
|
||||
allpath = path(taxonomy,taxid)
|
||||
minlength= min(vapply(allpath,length,0))
|
||||
|
||||
lca=NA
|
||||
for (i in 1:minlength) {
|
||||
|
||||
n = vapply(allpath,function(x) x[i],0)
|
||||
nt = table(n)
|
||||
mt = max(nt)
|
||||
if (mt >= nok) {
|
||||
p = nt[nt==mt]
|
||||
if (length(p)==1)
|
||||
lca=as.integer(names(p)[1])
|
||||
else
|
||||
break
|
||||
}
|
||||
else
|
||||
break
|
||||
}
|
||||
|
||||
if (name)
|
||||
return(scientificname(taxonomy,lca))
|
||||
else
|
||||
return(lca)
|
||||
})
|
||||
|
||||
|
483
R/rank.R
Normal file
483
R/rank.R
Normal file
@@ -0,0 +1,483 @@
|
||||
#' @include taxonomy.R
|
||||
NULL
|
||||
|
||||
#' @export
|
||||
setGeneric("rank.list", function(taxonomy) {
|
||||
return(standardGeneric("rank.list"))
|
||||
})
|
||||
|
||||
#' Returns the list of taxonomic ranks
|
||||
#'
|
||||
#' The \code{rank.list} function returns the list of all taxonomic
|
||||
#' ranks described in the taxonomy
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#'
|
||||
#' @return a vector of type \code{character} containing the taxonomic rank names
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # returns the taxonomic rank for all taxid between 1000 and 1020
|
||||
#' rank.list(taxo)
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname rank.list-methods
|
||||
#' @aliases rank.list-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("rank.list", "obitools.taxonomy",
|
||||
function(taxonomy) {
|
||||
return(.Call('R_rank_list',
|
||||
taxonomy,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
})
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("taxonomicrank", function(taxonomy,taxid) {
|
||||
return(standardGeneric("taxonomicrank"))
|
||||
})
|
||||
|
||||
#' Returns the taxonomic rank associated to a taxid
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#'
|
||||
#' @return a vector of type \code{character} containing the taxonomic ranks
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # returns the taxonomic rank for all taxid between 1000 and 1020
|
||||
#' taxonomicrank(taxo,1000:1020)
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname taxonomicrank-methods
|
||||
#' @aliases taxonomicrank-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("taxonomicrank", "obitools.taxonomy",function(taxonomy,taxid) {
|
||||
taxid = as.integer(taxid)
|
||||
return(.Call('R_get_rank',
|
||||
taxonomy,
|
||||
taxid,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
})
|
||||
|
||||
#' @export
|
||||
setGeneric("taxonatrank", function(taxonomy,taxid,rank,name=FALSE) {
|
||||
return(standardGeneric("taxonatrank"))
|
||||
})
|
||||
|
||||
#' Extracts the taxid at a specified taxonomic rank.
|
||||
#'
|
||||
#' The \code{taxonatrank} method of \code{\linkS4class{obitools.taxonomy}} class
|
||||
#' returns the \emph{taxid} or the scientific name corresponding
|
||||
#' to a \emph{taxid}.at a specified taxonomic rank
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#' @param rank a \code{character} indicating the desired rank
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{
|
||||
#' \item{If \code{name==FALSE}}{the taxid of the corresponding
|
||||
#' taxon as an integer or a vector of integers
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the corresponding
|
||||
#' taxon as a string or a vector of string
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the subfamily taxids
|
||||
#' taxonatrank(taxo,sp.taxid,"subfamily")
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' taxonatrank(taxo,sp.taxid,"subfamily",TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{species}},\code{\link{genus}},
|
||||
#' \code{\link{family}},\code{\link{kingdom}},
|
||||
#' \code{\link{superkingdom}}
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname taxonatrank-methods
|
||||
#' @aliases taxonatrank-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("taxonatrank", "obitools.taxonomy",function(taxonomy,taxid,rank,name=FALSE) {
|
||||
getsp = function(t) {
|
||||
if (is.na(t[1]) | is.na(t[2]))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_findtaxonatrank',taxonomy,
|
||||
as.integer(t[1]),
|
||||
t[2],
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
rank = as.character(rank)
|
||||
name = as.logical(name[1])
|
||||
|
||||
apply(data.frame(taxid,rank),1,getsp)
|
||||
})
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("species", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("species"))
|
||||
})
|
||||
|
||||
#' Extracts the species corresponding to a taxid
|
||||
#'
|
||||
#' The \code{species} method of \code{\linkS4class{obitools.taxonomy}} class
|
||||
#' returns the \emph{taxid} or the scientific name of the species corresponding
|
||||
#' to a \emph{taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{
|
||||
#' \item{If \code{name==FALSE}}{the taxid of the corresponding
|
||||
#' taxon as an integer or a vector of integers
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the corresponding
|
||||
#' taxon as a string or a vector of string
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the species taxids
|
||||
#' species(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' species(taxo,sp.taxid,TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{taxonatrank}},\code{\link{genus}},
|
||||
#' \code{\link{family}},\code{\link{kingdom}},
|
||||
#' \code{\link{superkingdom}}
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname species-methods
|
||||
#' @aliases species-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("species", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getsp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_get_species',
|
||||
taxonomy,
|
||||
t,
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
name = as.logical(name[1])
|
||||
sapply(taxid,getsp)
|
||||
})
|
||||
|
||||
#' @export
|
||||
setGeneric("genus", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("genus"))
|
||||
})
|
||||
|
||||
#' Extracts the genus corresponding to a taxid
|
||||
#'
|
||||
#' The \code{genus} method of \code{\linkS4class{obitools.taxonomy}} class
|
||||
#' returns the \emph{taxid} or the scientific name of the genus corresponding
|
||||
#' to a \emph{taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{
|
||||
#' \item{If \code{name==FALSE}}{the taxid of the corresponding
|
||||
#' taxon as an integer or a vector of integers
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the corresponding
|
||||
#' taxon as a string or a vector of string
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the genus taxids
|
||||
#' genus(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' genus(taxo,sp.taxid,TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{species}},\code{\link{taxonatrank}},
|
||||
#' \code{\link{family}},\code{\link{kingdom}},
|
||||
#' \code{\link{superkingdom}}
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname genus-methods
|
||||
#' @aliases genus-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("genus", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getsp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_get_genus',
|
||||
taxonomy,
|
||||
t,
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
name = as.logical(name[1])
|
||||
sapply(taxid,getsp)
|
||||
})
|
||||
|
||||
#' @export
|
||||
setGeneric("family", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("family"))
|
||||
})
|
||||
|
||||
#' Extracts the family corresponding to a taxid
|
||||
#'
|
||||
#' The \code{family} method of \code{\linkS4class{obitools.taxonomy}} class
|
||||
#' returns the \emph{taxid} or the scientific name of the family corresponding
|
||||
#' to a \emph{taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{
|
||||
#' \item{If \code{name==FALSE}}{the taxid of the corresponding
|
||||
#' taxon as an integer or a vector of integers
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the corresponding
|
||||
#' taxon as a string or a vector of string
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the family taxids
|
||||
#' family(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' family(taxo,sp.taxid,TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{species}},\code{\link{genus}},
|
||||
#' \code{\link{taxonatrank}},\code{\link{kingdom}},
|
||||
#' \code{\link{superkingdom}}
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname family-methods
|
||||
#' @aliases family-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("family", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getsp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_get_family',
|
||||
taxonomy,
|
||||
t,
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
name = as.logical(name[1])
|
||||
sapply(taxid,getsp)
|
||||
})
|
||||
|
||||
#' @export
|
||||
setGeneric("kingdom", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("kingdom"))
|
||||
})
|
||||
|
||||
#' Extracts the kingdom corresponding to a taxid
|
||||
#'
|
||||
#' The \code{kingdom} method of \code{\linkS4class{obitools.taxonomy}} class
|
||||
#' returns the \emph{taxid} or the scientific name of the kingdom corresponding
|
||||
#' to a \emph{taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{
|
||||
#' \item{If \code{name==FALSE}}{the taxid of the corresponding
|
||||
#' taxon as an integer or a vector of integers
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the corresponding
|
||||
#' taxon as a string or a vector of string
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the kingdom taxids
|
||||
#' kingdom(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' kingdom(taxo,sp.taxid,TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{species}},\code{\link{genus}},
|
||||
#' \code{\link{family}},\code{\link{taxonatrank}},
|
||||
#' \code{\link{superkingdom}}
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname kingdom-methods
|
||||
#' @aliases kingdom-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("kingdom", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getsp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_get_kingdom',
|
||||
taxonomy,
|
||||
t,
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
name = as.logical(name[1])
|
||||
sapply(taxid,getsp)
|
||||
})
|
||||
|
||||
#' @export
|
||||
setGeneric("superkingdom", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("superkingdom"))
|
||||
})
|
||||
|
||||
#' Extracts the superkingdom corresponding to a taxid
|
||||
#'
|
||||
#' The \code{superkingdom} method of \code{\linkS4class{obitools.taxonomy}} class
|
||||
#' returns the \emph{taxid} or the scientific name of the superkingdom corresponding
|
||||
#' to a \emph{taxid}.
|
||||
#'
|
||||
#' @param taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
|
||||
#' @param taxid a vector of taxid to analyse
|
||||
#' @param name A logical value \code{TRUE} or \code{FALSE} indicating
|
||||
#' if the method return a taxid or a scientific name.
|
||||
#'
|
||||
#' @return \describe{
|
||||
#' \item{If \code{name==FALSE}}{the taxid of the corresponding
|
||||
#' taxon as an integer or a vector of integers
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' \item{If \code{name==TRUE}}{the scientific name of the corresponding
|
||||
#' taxon as a string or a vector of string
|
||||
#' if the \code{taxid} argument is itself
|
||||
#' a vector}
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' # read the taxonomy database
|
||||
#' taxo=default.taxonomy()
|
||||
#'
|
||||
#' # build a vector of 6 taxids corresponding to species
|
||||
#' sp.taxid=c(7000,7004,7007,7009,7010,7011)
|
||||
#'
|
||||
#' # look for the superkingdom taxids
|
||||
#' superkingdom(taxo,sp.taxid)
|
||||
#'
|
||||
#' # same thing but returns results as a vector of scientific names
|
||||
#' superkingdom(taxo,sp.taxid,TRUE)
|
||||
#'
|
||||
#' @seealso class \code{\linkS4class{obitools.taxonomy}},
|
||||
#' and methods \code{\link{species}},\code{\link{genus}},
|
||||
#' \code{\link{family}},\code{\link{kingdom}},
|
||||
#' \code{\link{taxonatrank}}
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname superkingdom-methods
|
||||
#' @aliases superkingdom-methods,obitools.taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#'
|
||||
setMethod("superkingdom", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getsp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
return(.Call('R_get_superkingdom',
|
||||
taxonomy,
|
||||
t,
|
||||
name,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
}
|
||||
|
||||
taxid = as.integer(taxid)
|
||||
name = as.logical(name[1])
|
||||
sapply(taxid,getsp)
|
||||
})
|
||||
|
||||
|
216
R/taxonomy.R
Normal file
216
R/taxonomy.R
Normal file
@@ -0,0 +1,216 @@
|
||||
#' @include ROBITaxonomy.R
|
||||
#' @useDynLib ROBITaxonomy
|
||||
NULL
|
||||
|
||||
#' Gives access to a taxonomy preformated by OBITools
|
||||
#'
|
||||
#' A S4 class describing a taxonomy. It allows access to
|
||||
#' taxonomy formated for OBITools.
|
||||
#'
|
||||
#' @references \describe{
|
||||
#' \item{NCBI Taxonomy : }{\url{http://www.ncbi.nlm.nih.gov/taxonomy}}
|
||||
#' \item{OBITools : }{\url{http://metabarcoding/obitools/doc}}
|
||||
#' }
|
||||
#'
|
||||
#' @seealso \code{\link{read.taxonomy}}
|
||||
#'
|
||||
#' @name obitools.taxonomy
|
||||
#' @rdname obitools-taxonomy-class
|
||||
#' @keywords taxonomy
|
||||
#' @author Eric Coissac
|
||||
#' @exportClass obitools.taxonomy
|
||||
#'
|
||||
setClass("obitools.taxonomy",
|
||||
|
||||
|
||||
#
|
||||
# Attribute declaration
|
||||
#
|
||||
|
||||
# data.frame containing the counts of reads per samples
|
||||
# 1 samples per line
|
||||
# 1 sequence per column
|
||||
|
||||
representation(
|
||||
|
||||
# An external pointer structure to
|
||||
# the C taxonomy structure
|
||||
|
||||
pointer = "externalptr",
|
||||
|
||||
# the name of the database on the hard disk
|
||||
|
||||
dbname = 'character',
|
||||
|
||||
# the working directory when the taxonomy
|
||||
# object is created.
|
||||
# This inforation combined with bname allows
|
||||
# to reload taxonomy from disk
|
||||
|
||||
workingdir = 'character',
|
||||
|
||||
# Indicate if the taxonomy is saved in a file
|
||||
# Taxonomy created in R or modified in R are
|
||||
# not saved ==> This have to be take into
|
||||
# consideration but how ???
|
||||
saved = 'logical'
|
||||
),
|
||||
|
||||
#
|
||||
# Check object structure
|
||||
#
|
||||
|
||||
validity = function(object) {
|
||||
return(TRUE)
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
|
||||
#' obitools.taxonomy constructor
|
||||
#'
|
||||
#' --> this constructor have not to be called directly
|
||||
#' use the read.obitools.taxonomy function to
|
||||
#' create a new instance of taxonomy
|
||||
#'
|
||||
#' @docType methods
|
||||
#' @rdname initialize-methods-obitools.taxonomy
|
||||
#' @aliases initialize-methods,obitools.taxonomy
|
||||
setMethod("initialize",
|
||||
"obitools.taxonomy",
|
||||
function(.Object, pointer,dbname,workingdir,saved) {
|
||||
.Object@pointer <- pointer
|
||||
.Object@dbname <- dbname
|
||||
.Object@workingdir <- workingdir
|
||||
.Object@saved <- saved
|
||||
|
||||
validObject(.Object) ## valide l'objet
|
||||
return(.Object)
|
||||
})
|
||||
|
||||
|
||||
#' @exportClass obitools.taxonomyOrNULL
|
||||
setClassUnion("obitools.taxonomyOrNULL",c("obitools.taxonomy","NULL"))
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("path", function(taxonomy,taxid,name=FALSE) {
|
||||
return(standardGeneric("path"))
|
||||
})
|
||||
|
||||
|
||||
|
||||
setMethod("path", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
|
||||
getp = function(t) {
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
else
|
||||
{
|
||||
path=c()
|
||||
|
||||
t=.Call('R_validate_taxid',
|
||||
taxonomy,
|
||||
as.integer(t),
|
||||
PACKAGE="ROBITaxonomy")
|
||||
|
||||
if (is.na(t))
|
||||
return(NA)
|
||||
|
||||
repeat {
|
||||
if (name)
|
||||
path = c(scientificname(taxonomy,t),path)
|
||||
else
|
||||
path = c(t,path)
|
||||
|
||||
t = .Call('R_get_parent',
|
||||
taxonomy,
|
||||
t,
|
||||
FALSE,
|
||||
PACKAGE="ROBITaxonomy")
|
||||
if (is.na(t))
|
||||
break
|
||||
}
|
||||
|
||||
return(path)
|
||||
}
|
||||
}
|
||||
|
||||
taxid=as.integer(taxid)
|
||||
name=as.logical(name)
|
||||
|
||||
p = lapply(taxid,getp)
|
||||
d = dim(p)
|
||||
|
||||
if (!is.null(d))
|
||||
if (d[2]==1)
|
||||
p = as.vector(p)
|
||||
|
||||
return(p)
|
||||
})
|
||||
|
||||
|
||||
#' @export
|
||||
setGeneric("is.subcladeof", function(taxonomy,taxid,parent) {
|
||||
return(standardGeneric("is.subcladeof"))
|
||||
})
|
||||
|
||||
setMethod("is.subcladeof", "obitools.taxonomy",function(taxonomy,taxid,parent) {
|
||||
taxid = as.integer(taxid)
|
||||
parent= as.integer(parent)
|
||||
return(.Call('R_is_under_taxon',
|
||||
taxonomy,
|
||||
taxid,
|
||||
parent,
|
||||
PACKAGE="ROBITaxonomy"))
|
||||
})
|
||||
|
||||
|
||||
|
||||
build.taxonomy = function(pointer,dbname,workingdir,saved) {
|
||||
rd <- new('obitools.taxonomy',
|
||||
pointer=pointer,
|
||||
dbname=dbname,
|
||||
workingdir=workingdir,
|
||||
saved=saved
|
||||
)
|
||||
return(rd)
|
||||
}
|
||||
|
||||
|
||||
#' Reads a taxonomy
|
||||
#'
|
||||
#' \code{read.taxonomy} reads a taxonomy formated by OBITools.
|
||||
#' NCBI taxonomy can be download from the NCBI FTP site in taxdump format.
|
||||
#' The taxdump must be formated using the obitaxonomy command from OBITools
|
||||
#' before being used in R. A OBITools formated taxonomy is composed of 3 files
|
||||
#' with the same prefix name and suffixes .tdx, .rdx, .ndx, two extra files
|
||||
#' suffixed .adx and .ldx can also be present.
|
||||
#'
|
||||
#' @param dbname A character string containing the file name of the database
|
||||
#'
|
||||
#' @return an instance of the class \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @examples
|
||||
#'
|
||||
#' \dontshow{# switch the working directory to the data package directory}
|
||||
#' \dontshow{setwd(system.file("extdata", package="ROBITaxonomy"))}
|
||||
#'
|
||||
#' # read the taxonomy ncbi
|
||||
#' ncbi = read.taxonomy("ncbitaxo")
|
||||
#'
|
||||
#' # and use it for requesting a scientific name
|
||||
#' scientificname(ncbi,7742)
|
||||
#'
|
||||
#' @seealso \code{\linkS4class{obitools.taxonomy}}
|
||||
#'
|
||||
#' @author Eric Coissac
|
||||
#' @keywords taxonomy
|
||||
#' @export
|
||||
|
||||
read.taxonomy = function(dbname) {
|
||||
t <- .Call('R_read_taxonomy',dbname,TRUE,PACKAGE="ROBITaxonomy")
|
||||
|
||||
return(build.taxonomy(t,dbname,getwd(),TRUE))
|
||||
}
|
||||
|
||||
|
20
ROBITaxonomy.Rproj
Normal file
20
ROBITaxonomy.Rproj
Normal file
@@ -0,0 +1,20 @@
|
||||
Version: 1.0
|
||||
|
||||
RestoreWorkspace: Default
|
||||
SaveWorkspace: Default
|
||||
AlwaysSaveHistory: Default
|
||||
|
||||
EnableCodeIndexing: Yes
|
||||
UseSpacesForTab: Yes
|
||||
NumSpacesForTab: 2
|
||||
Encoding: UTF-8
|
||||
|
||||
RnwWeave: knitr
|
||||
LaTeX: pdfLaTeX
|
||||
|
||||
AutoAppendNewline: Yes
|
||||
|
||||
BuildType: Package
|
||||
PackageUseDevtools: Yes
|
||||
PackageInstallArgs: --no-multiarch --with-keep.source
|
||||
PackageRoxygenize: rd,collate,namespace
|
BIN
inst/extdata/.DS_Store
vendored
Normal file
BIN
inst/extdata/.DS_Store
vendored
Normal file
Binary file not shown.
BIN
inst/extdata/ncbitaxo.adx
vendored
Normal file
BIN
inst/extdata/ncbitaxo.adx
vendored
Normal file
Binary file not shown.
BIN
inst/extdata/ncbitaxo.ndx
vendored
Normal file
BIN
inst/extdata/ncbitaxo.ndx
vendored
Normal file
Binary file not shown.
BIN
inst/extdata/ncbitaxo.rdx
vendored
Normal file
BIN
inst/extdata/ncbitaxo.rdx
vendored
Normal file
Binary file not shown.
BIN
inst/extdata/ncbitaxo.tdx
vendored
Normal file
BIN
inst/extdata/ncbitaxo.tdx
vendored
Normal file
Binary file not shown.
BIN
src/ROBITaxonomy.so
Executable file
BIN
src/ROBITaxonomy.so
Executable file
Binary file not shown.
24
src/ecoError.c
Normal file
24
src/ecoError.c
Normal file
@@ -0,0 +1,24 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/*
|
||||
* print the message given as argument and exit the program
|
||||
* @param error error number
|
||||
* @param message the text explaining what's going on
|
||||
* @param filename the file source where the program failed
|
||||
* @param linenumber the line where it has failed
|
||||
* filename and linenumber are written at pre-processing
|
||||
* time by a macro
|
||||
*/
|
||||
void ecoError(int32_t errorcode,
|
||||
const char* message,
|
||||
const char * filename,
|
||||
int linenumber)
|
||||
{
|
||||
error("Error %d in file %s line %d : %s",
|
||||
errorcode,
|
||||
filename,
|
||||
linenumber,
|
||||
message);
|
||||
}
|
BIN
src/ecoError.o
Normal file
BIN
src/ecoError.o
Normal file
Binary file not shown.
122
src/ecoIOUtils.c
Normal file
122
src/ecoIOUtils.c
Normal file
@@ -0,0 +1,122 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define SWAPINT32(x) ((((x) << 24) & 0xFF000000) | (((x) << 8) & 0xFF0000) | \
|
||||
(((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF))
|
||||
|
||||
|
||||
int32_t is_big_endian()
|
||||
{
|
||||
int32_t i=1;
|
||||
|
||||
return (int32_t)((char*)&i)[0];
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
int32_t swap_int32_t(int32_t i)
|
||||
{
|
||||
return SWAPINT32(i);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Read part of the file
|
||||
* @param *f the database
|
||||
* @param recordSize the size to be read
|
||||
*
|
||||
* @return buffer
|
||||
*/
|
||||
void *read_ecorecord(FILE *f,int32_t *recordSize)
|
||||
{
|
||||
static void *buffer =NULL;
|
||||
int32_t buffersize=0;
|
||||
int32_t read;
|
||||
|
||||
if (!recordSize)
|
||||
ECOERROR(ECO_ASSERT_ERROR,
|
||||
"recordSize cannot be NULL");
|
||||
|
||||
read = fread(recordSize,
|
||||
1,
|
||||
sizeof(int32_t),
|
||||
f);
|
||||
|
||||
if (feof(f))
|
||||
return NULL;
|
||||
|
||||
if (read != sizeof(int32_t))
|
||||
ECOERROR(ECO_IO_ERROR,"Reading record size error");
|
||||
|
||||
if (is_big_endian())
|
||||
*recordSize=swap_int32_t(*recordSize);
|
||||
|
||||
if (buffersize < *recordSize)
|
||||
{
|
||||
if (buffer)
|
||||
buffer = ECOREALLOC(buffer,*recordSize,
|
||||
"Increase size of record buffer");
|
||||
else
|
||||
buffer = ECOMALLOC(*recordSize,
|
||||
"Allocate record buffer");
|
||||
}
|
||||
|
||||
read = fread(buffer,
|
||||
1,
|
||||
*recordSize,
|
||||
f);
|
||||
|
||||
if (read != *recordSize)
|
||||
ECOERROR(ECO_IO_ERROR,"Reading record data error");
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Open the database and check it's readable
|
||||
* @param filename name of the database (.sdx, .rdx, .tbx)
|
||||
* @param sequencecount buffer - pointer to variable storing the number of occurence
|
||||
* @param abort_on_open_error boolean to define the behaviour in case of error
|
||||
* while opening the database
|
||||
* @return FILE type
|
||||
**/
|
||||
FILE *open_ecorecorddb(const char *filename,
|
||||
int32_t *sequencecount,
|
||||
int32_t abort_on_open_error)
|
||||
{
|
||||
FILE *f;
|
||||
int32_t read;
|
||||
|
||||
f = fopen(filename,"rb");
|
||||
|
||||
if (!f)
|
||||
{
|
||||
if (abort_on_open_error)
|
||||
ECOERROR(ECO_IO_ERROR,"Cannot open file");
|
||||
else
|
||||
{
|
||||
*sequencecount=0;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
read = fread(sequencecount,
|
||||
1,
|
||||
sizeof(int32_t),
|
||||
f);
|
||||
|
||||
if (read != sizeof(int32_t))
|
||||
ECOERROR(ECO_IO_ERROR,"Reading record size error");
|
||||
|
||||
if (is_big_endian())
|
||||
*sequencecount=swap_int32_t(*sequencecount);
|
||||
|
||||
return f;
|
||||
}
|
||||
|
BIN
src/ecoIOUtils.o
Normal file
BIN
src/ecoIOUtils.o
Normal file
Binary file not shown.
76
src/ecoMalloc.c
Normal file
76
src/ecoMalloc.c
Normal file
@@ -0,0 +1,76 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
static int eco_log_malloc = 0;
|
||||
|
||||
void eco_trace_memory_allocation()
|
||||
{
|
||||
eco_log_malloc=1;
|
||||
}
|
||||
|
||||
void eco_untrace_memory_allocation()
|
||||
{
|
||||
eco_log_malloc=0;
|
||||
}
|
||||
|
||||
|
||||
void *eco_malloc(int32_t chunksize,
|
||||
const char *error_message,
|
||||
const char *filename,
|
||||
int32_t line)
|
||||
{
|
||||
void * chunk;
|
||||
|
||||
chunk = calloc(1,chunksize);
|
||||
|
||||
if (!chunk)
|
||||
ecoError(ECO_MEM_ERROR,error_message,filename,line);
|
||||
|
||||
if (eco_log_malloc)
|
||||
REprintf("Memory segment located at %p of size %d is allocated (file : %s [%d])",
|
||||
chunk,
|
||||
chunksize,
|
||||
filename,
|
||||
line);
|
||||
|
||||
return chunk;
|
||||
}
|
||||
|
||||
void *eco_realloc(void *chunk,
|
||||
int32_t newsize,
|
||||
const char *error_message,
|
||||
const char *filename,
|
||||
int32_t line)
|
||||
{
|
||||
void *newchunk;
|
||||
|
||||
newchunk = realloc(chunk,newsize);
|
||||
|
||||
if (!newchunk)
|
||||
ecoError(ECO_MEM_ERROR,error_message,filename,line);
|
||||
|
||||
if (eco_log_malloc)
|
||||
REprintf("Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
|
||||
chunk,
|
||||
newchunk,
|
||||
newsize,
|
||||
filename,
|
||||
line);
|
||||
|
||||
return newchunk;
|
||||
}
|
||||
|
||||
void eco_free(void *chunk,
|
||||
const char *error_message,
|
||||
const char *filename,
|
||||
int32_t line)
|
||||
{
|
||||
free(chunk);
|
||||
|
||||
if (eco_log_malloc)
|
||||
REprintf("Memory segment %p is released => %s (file : %s [%d])",
|
||||
chunk,
|
||||
error_message,
|
||||
filename,
|
||||
line);
|
||||
}
|
BIN
src/ecoMalloc.o
Normal file
BIN
src/ecoMalloc.o
Normal file
Binary file not shown.
285
src/ecoPCR.h
Normal file
285
src/ecoPCR.h
Normal file
@@ -0,0 +1,285 @@
|
||||
#ifndef ECOPCR_H_
|
||||
#define ECOPCR_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#include <R.h>
|
||||
#include <Rinternals.h>
|
||||
#include <Rdefines.h>
|
||||
|
||||
|
||||
//#ifndef H_apat
|
||||
//#include "../libapat/apat.h"
|
||||
//#endif
|
||||
|
||||
/*****************************************************
|
||||
*
|
||||
* Data type declarations
|
||||
*
|
||||
*****************************************************/
|
||||
|
||||
/*
|
||||
*
|
||||
* Sequence types
|
||||
*
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
|
||||
int32_t taxid;
|
||||
char AC[20];
|
||||
int32_t DE_length;
|
||||
int32_t SQ_length;
|
||||
int32_t CSQ_length;
|
||||
|
||||
char data[1];
|
||||
|
||||
} ecoseqformat_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t taxid;
|
||||
int32_t SQ_length;
|
||||
char *AC;
|
||||
char *DE;
|
||||
char *SQ;
|
||||
} ecoseq_t;
|
||||
|
||||
/*
|
||||
*
|
||||
* Taxonomy taxon types
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
typedef struct {
|
||||
int32_t taxid;
|
||||
int32_t rank;
|
||||
int32_t parent;
|
||||
int32_t namelength;
|
||||
char name[1];
|
||||
|
||||
} ecotxformat_t;
|
||||
|
||||
typedef struct ecotxnode {
|
||||
int32_t taxid;
|
||||
int32_t rank;
|
||||
int32_t farest;
|
||||
struct ecotxnode *parent;
|
||||
char *name;
|
||||
} ecotx_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t count;
|
||||
int32_t maxtaxid;
|
||||
int32_t buffersize;
|
||||
ecotx_t taxon[1];
|
||||
} ecotxidx_t;
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
* Taxonomy rank types
|
||||
*
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
int32_t count;
|
||||
char* label[1];
|
||||
} ecorankidx_t;
|
||||
|
||||
/*
|
||||
*
|
||||
* Taxonomy name types
|
||||
*
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
int32_t is_scientificname;
|
||||
int32_t namelength;
|
||||
int32_t classlength;
|
||||
int32_t taxid;
|
||||
char names[1];
|
||||
} econameformat_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
char *name;
|
||||
char *classname;
|
||||
int32_t is_scientificname;
|
||||
struct ecotxnode *taxon;
|
||||
} econame_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
int32_t count;
|
||||
econame_t names[1];
|
||||
} econameidx_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
ecorankidx_t *ranks;
|
||||
econameidx_t *names;
|
||||
ecotxidx_t *taxons;
|
||||
} ecotaxonomy_t;
|
||||
|
||||
|
||||
/*****************************************************
|
||||
*
|
||||
* Function declarations
|
||||
*
|
||||
*****************************************************/
|
||||
|
||||
/*
|
||||
*
|
||||
* Low level system functions
|
||||
*
|
||||
*/
|
||||
|
||||
int32_t is_big_endian();
|
||||
int32_t swap_int32_t(int32_t);
|
||||
|
||||
void *eco_malloc(int32_t chunksize,
|
||||
const char *error_message,
|
||||
const char *filename,
|
||||
int32_t line);
|
||||
|
||||
|
||||
void *eco_realloc(void *chunk,
|
||||
int32_t chunksize,
|
||||
const char *error_message,
|
||||
const char *filename,
|
||||
int32_t line);
|
||||
|
||||
void eco_free(void *chunk,
|
||||
const char *error_message,
|
||||
const char *filename,
|
||||
int32_t line);
|
||||
|
||||
void eco_trace_memory_allocation();
|
||||
void eco_untrace_memory_allocation();
|
||||
|
||||
#define ECOMALLOC(size,error_message) \
|
||||
eco_malloc((size),(error_message),__FILE__,__LINE__)
|
||||
|
||||
#define ECOREALLOC(chunk,size,error_message) \
|
||||
eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__)
|
||||
|
||||
#define ECOFREE(chunk,error_message) \
|
||||
eco_free((chunk),(error_message),__FILE__,__LINE__)
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
* Error managment
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
void ecoError(int32_t,const char*,const char *,int);
|
||||
|
||||
#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__)
|
||||
|
||||
#define ECO_IO_ERROR (1)
|
||||
#define ECO_MEM_ERROR (2)
|
||||
#define ECO_ASSERT_ERROR (3)
|
||||
#define ECO_NOTFOUND_ERROR (4)
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
* Low level Disk access functions
|
||||
*
|
||||
*/
|
||||
|
||||
FILE *open_ecorecorddb(const char *filename,
|
||||
int32_t *sequencecount,
|
||||
int32_t abort_on_open_error);
|
||||
|
||||
void *read_ecorecord(FILE *,int32_t *recordSize);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Read function in internal binary format
|
||||
*/
|
||||
|
||||
FILE *open_ecoseqdb(const char *filename,
|
||||
int32_t *sequencecount);
|
||||
|
||||
ecoseq_t *readnext_ecoseq(FILE *);
|
||||
|
||||
ecorankidx_t *read_rankidx(const char *filename);
|
||||
|
||||
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Read taxonomy data as formated by the ecoPCRFormat.py script.
|
||||
*
|
||||
* This function is normaly uses internaly by the read_taxonomy
|
||||
* function and should not be called directly.
|
||||
*
|
||||
* @arg filename path to the *.tdx file of the reformated db
|
||||
*
|
||||
* @return pointer to a taxonomy index structure
|
||||
*/
|
||||
|
||||
ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2);
|
||||
|
||||
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName);
|
||||
|
||||
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, int32_t rankidx);
|
||||
|
||||
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid);
|
||||
|
||||
int eco_isundertaxon(ecotx_t *taxon, int other_taxid);
|
||||
|
||||
ecoseq_t *ecoseq_iterator(const char *prefix);
|
||||
|
||||
|
||||
|
||||
ecoseq_t *new_ecoseq();
|
||||
int32_t delete_ecoseq(ecoseq_t *);
|
||||
ecoseq_t *new_ecoseq_with_data( char *AC,
|
||||
char *DE,
|
||||
char *SQ,
|
||||
int32_t taxid
|
||||
);
|
||||
|
||||
|
||||
int32_t delete_taxon(ecotx_t *taxon);
|
||||
int32_t delete_rankidx(ecorankidx_t *rankidx);
|
||||
int32_t delete_nameidx(econameidx_t *nameidx);
|
||||
int32_t delete_taxonomy(ecotxidx_t *index);
|
||||
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy);
|
||||
|
||||
|
||||
int32_t rank_index(const char* label,ecorankidx_t* ranks);
|
||||
|
||||
//int32_t delete_apatseq(SeqPtr pseq);
|
||||
//PatternPtr buildPattern(const char *pat, int32_t error_max);
|
||||
//PatternPtr complementPattern(PatternPtr pat);
|
||||
//
|
||||
//SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular);
|
||||
|
||||
//char *ecoComplementPattern(char *nucAcSeq);
|
||||
//char *ecoComplementSequence(char *nucAcSeq);
|
||||
//char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end);
|
||||
|
||||
ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
|
||||
//int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid);
|
||||
//int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid);
|
||||
|
||||
|
||||
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
|
||||
|
||||
#endif /*ECOPCR_H_*/
|
156
src/ecodna.c
Normal file
156
src/ecodna.c
Normal file
@@ -0,0 +1,156 @@
|
||||
#include <string.h>
|
||||
#include "ecoPCR.h"
|
||||
|
||||
/*
|
||||
* @doc: DNA alphabet (IUPAC)
|
||||
*/
|
||||
#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
|
||||
|
||||
/*
|
||||
* @doc: complementary DNA alphabet (IUPAC)
|
||||
*/
|
||||
#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!]["
|
||||
|
||||
|
||||
static char sNuc[] = LX_BIO_DNA_ALPHA;
|
||||
static char sAnuc[] = LX_BIO_CDNA_ALPHA;
|
||||
|
||||
static char LXBioBaseComplement(char nucAc);
|
||||
static char *LXBioSeqComplement(char *nucAcSeq);
|
||||
static char *reverseSequence(char *str,char isPattern);
|
||||
|
||||
|
||||
/* ---------------------------- */
|
||||
|
||||
char LXBioBaseComplement(char nucAc)
|
||||
{
|
||||
char *c;
|
||||
|
||||
if ((c = strchr(sNuc, nucAc)))
|
||||
return sAnuc[(c - sNuc)];
|
||||
else
|
||||
return nucAc;
|
||||
}
|
||||
|
||||
/* ---------------------------- */
|
||||
|
||||
char *LXBioSeqComplement(char *nucAcSeq)
|
||||
{
|
||||
char *s;
|
||||
|
||||
for (s = nucAcSeq ; *s ; s++)
|
||||
*s = LXBioBaseComplement(*s);
|
||||
|
||||
return nucAcSeq;
|
||||
}
|
||||
|
||||
|
||||
char *reverseSequence(char *str,char isPattern)
|
||||
{
|
||||
char *sb, *se, c;
|
||||
|
||||
if (! str)
|
||||
return str;
|
||||
|
||||
sb = str;
|
||||
se = str + strlen(str) - 1;
|
||||
|
||||
while(sb <= se) {
|
||||
c = *sb;
|
||||
*sb++ = *se;
|
||||
*se-- = c;
|
||||
}
|
||||
|
||||
sb = str;
|
||||
se = str + strlen(str) - 1;
|
||||
|
||||
if (isPattern)
|
||||
for (;sb < se; sb++)
|
||||
{
|
||||
if (*sb=='#')
|
||||
{
|
||||
if (((se - sb) > 2) && (*(sb+2)=='!'))
|
||||
{
|
||||
*sb='!';
|
||||
sb+=2;
|
||||
*sb='#';
|
||||
}
|
||||
else
|
||||
{
|
||||
*sb=*(sb+1);
|
||||
sb++;
|
||||
*sb='#';
|
||||
}
|
||||
}
|
||||
else if (*sb=='!')
|
||||
{
|
||||
*sb=*(sb-1);
|
||||
*(sb-1)='!';
|
||||
}
|
||||
}
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
char *ecoComplementPattern(char *nucAcSeq)
|
||||
{
|
||||
return reverseSequence(LXBioSeqComplement(nucAcSeq),1);
|
||||
}
|
||||
|
||||
char *ecoComplementSequence(char *nucAcSeq)
|
||||
{
|
||||
return reverseSequence(LXBioSeqComplement(nucAcSeq),0);
|
||||
}
|
||||
|
||||
|
||||
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end)
|
||||
/*
|
||||
extract subsequence from nucAcSeq [begin,end[
|
||||
*/
|
||||
{
|
||||
static char *buffer = NULL;
|
||||
static int32_t buffSize= 0;
|
||||
int32_t length;
|
||||
|
||||
if (begin < end)
|
||||
{
|
||||
length = end - begin;
|
||||
|
||||
if (length >= buffSize)
|
||||
{
|
||||
buffSize = length+1;
|
||||
if (buffer)
|
||||
buffer=ECOREALLOC(buffer,buffSize,
|
||||
"Error in reallocating sub sequence buffer");
|
||||
else
|
||||
buffer=ECOMALLOC(buffSize,
|
||||
"Error in allocating sub sequence buffer");
|
||||
|
||||
}
|
||||
|
||||
strncpy(buffer,nucAcSeq + begin,length);
|
||||
buffer[length]=0;
|
||||
}
|
||||
else
|
||||
{
|
||||
length = end + strlen(nucAcSeq) - begin;
|
||||
|
||||
if (length >= buffSize)
|
||||
{
|
||||
buffSize = length+1;
|
||||
if (buffer)
|
||||
buffer=ECOREALLOC(buffer,buffSize,
|
||||
"Error in reallocating sub sequence buffer");
|
||||
else
|
||||
buffer=ECOMALLOC(buffSize,
|
||||
"Error in allocating sub sequence buffer");
|
||||
|
||||
}
|
||||
strncpy(buffer,nucAcSeq+begin,length - end);
|
||||
strncpy(buffer+(length-end),nucAcSeq ,end);
|
||||
buffer[length]=0;
|
||||
}
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
BIN
src/ecodna.o
Normal file
BIN
src/ecodna.o
Normal file
Binary file not shown.
20
src/ecofilter.c
Normal file
20
src/ecofilter.c
Normal file
@@ -0,0 +1,20 @@
|
||||
#include "ecoPCR.h"
|
||||
|
||||
int eco_is_taxid_included( ecotaxonomy_t *taxonomy,
|
||||
int32_t *restricted_taxid,
|
||||
int32_t tab_len,
|
||||
int32_t taxid)
|
||||
{
|
||||
int i;
|
||||
ecotx_t *taxon;
|
||||
|
||||
taxon = eco_findtaxonbytaxid(taxonomy, taxid);
|
||||
|
||||
if (taxon)
|
||||
for (i=0; i < tab_len; i++)
|
||||
if ( (taxon->taxid == restricted_taxid[i]) ||
|
||||
(eco_isundertaxon(taxon, restricted_taxid[i])) )
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
BIN
src/ecofilter.o
Normal file
BIN
src/ecofilter.o
Normal file
Binary file not shown.
86
src/econame.c
Normal file
86
src/econame.c
Normal file
@@ -0,0 +1,86 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy);
|
||||
|
||||
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
|
||||
int32_t count;
|
||||
FILE *f;
|
||||
econameidx_t *indexname;
|
||||
int32_t i;
|
||||
|
||||
f = open_ecorecorddb(filename,&count,0);
|
||||
|
||||
if (f==NULL)
|
||||
return NULL;
|
||||
|
||||
indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names");
|
||||
|
||||
indexname->count=count;
|
||||
|
||||
for (i=0; i < count; i++){
|
||||
readnext_econame(f,(indexname->names)+i,taxonomy);
|
||||
}
|
||||
|
||||
return indexname;
|
||||
}
|
||||
|
||||
int32_t delete_nameidx(econameidx_t *nameidx)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
if (nameidx) {
|
||||
for (i=0; i < nameidx->count; i++) {
|
||||
if (nameidx->names[i].name)
|
||||
ECOFREE(nameidx->names[i].name,
|
||||
"Desallocate name");
|
||||
if (nameidx->names[i].classname)
|
||||
ECOFREE(nameidx->names[i].classname,
|
||||
"Desallocate classname");
|
||||
}
|
||||
|
||||
ECOFREE(nameidx,"Desallocate name index");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
|
||||
econameformat_t *raw;
|
||||
int32_t rs;
|
||||
|
||||
raw = read_ecorecord(f,&rs);
|
||||
|
||||
if (!raw)
|
||||
return NULL;
|
||||
|
||||
if (is_big_endian())
|
||||
{
|
||||
raw->is_scientificname = swap_int32_t(raw->is_scientificname);
|
||||
raw->namelength = swap_int32_t(raw->namelength);
|
||||
raw->classlength = swap_int32_t(raw->classlength);
|
||||
raw->taxid = swap_int32_t(raw->taxid);
|
||||
}
|
||||
|
||||
name->is_scientificname=raw->is_scientificname;
|
||||
|
||||
name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name");
|
||||
strncpy(name->name,raw->names,raw->namelength);
|
||||
name->name[raw->namelength]=0;
|
||||
|
||||
name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname");
|
||||
strncpy(name->classname,(raw->names+raw->namelength),raw->classlength);
|
||||
name->classname[raw->classlength]=0;
|
||||
|
||||
name->taxon = taxonomy->taxons->taxon + raw->taxid;
|
||||
|
||||
return name;
|
||||
}
|
||||
|
BIN
src/econame.o
Normal file
BIN
src/econame.o
Normal file
Binary file not shown.
75
src/ecorank.c
Normal file
75
src/ecorank.c
Normal file
@@ -0,0 +1,75 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static int compareRankLabel(const void *label1, const void *label2);
|
||||
|
||||
ecorankidx_t *read_rankidx(const char *filename)
|
||||
{
|
||||
int32_t count;
|
||||
FILE *f;
|
||||
ecorankidx_t *index;
|
||||
int32_t i;
|
||||
int32_t rs;
|
||||
char *buffer;
|
||||
|
||||
f = open_ecorecorddb(filename,&count,0);
|
||||
|
||||
if (f==NULL)
|
||||
return NULL;
|
||||
|
||||
index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1),
|
||||
"Allocate rank index");
|
||||
|
||||
index->count=count;
|
||||
|
||||
for (i=0; i < count; i++)
|
||||
{
|
||||
buffer = read_ecorecord(f,&rs);
|
||||
index->label[i]=(char*) ECOMALLOC(rs+1,
|
||||
"Allocate rank label");
|
||||
strncpy(index->label[i],buffer,rs);
|
||||
}
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
int32_t delete_rankidx(ecorankidx_t *rankidx)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
if (rankidx) {
|
||||
for (i=0; i < rankidx->count; i++)
|
||||
if (rankidx->label[i])
|
||||
ECOFREE(rankidx->label[i],
|
||||
"Desallocate rank label");
|
||||
|
||||
ECOFREE(rankidx,"Desallocate rank index");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int32_t rank_index(const char* label,ecorankidx_t* ranks)
|
||||
{
|
||||
char **rep;
|
||||
|
||||
rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel);
|
||||
|
||||
if (rep)
|
||||
return rep-ranks->label;
|
||||
// else
|
||||
// ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found");
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
int compareRankLabel(const void *label1, const void *label2)
|
||||
{
|
||||
return strcmp((const char*)label1,*(const char**)label2);
|
||||
}
|
||||
|
||||
|
BIN
src/ecorank.o
Normal file
BIN
src/ecorank.o
Normal file
Binary file not shown.
231
src/ecoseq.c
Normal file
231
src/ecoseq.c
Normal file
@@ -0,0 +1,231 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <zlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
|
||||
static FILE *open_seqfile(const char *prefix,int32_t index);
|
||||
|
||||
|
||||
ecoseq_t *new_ecoseq()
|
||||
{
|
||||
void *tmp;
|
||||
|
||||
tmp = ECOMALLOC(sizeof(ecoseq_t),"Allocate new ecoseq structure");
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
int32_t delete_ecoseq(ecoseq_t * seq)
|
||||
{
|
||||
|
||||
if (seq)
|
||||
{
|
||||
if (seq->AC)
|
||||
ECOFREE(seq->AC,"Free sequence AC");
|
||||
|
||||
if (seq->DE)
|
||||
ECOFREE(seq->DE,"Free sequence DE");
|
||||
|
||||
if (seq->SQ)
|
||||
ECOFREE(seq->SQ,"Free sequence SQ");
|
||||
|
||||
ECOFREE(seq,"Free sequence structure");
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
ecoseq_t *new_ecoseq_with_data( char *AC,
|
||||
char *DE,
|
||||
char *SQ,
|
||||
int32_t taxid_idx
|
||||
)
|
||||
{
|
||||
ecoseq_t *tmp;
|
||||
int32_t lstr;
|
||||
tmp = new_ecoseq();
|
||||
|
||||
tmp->taxid=taxid_idx;
|
||||
|
||||
if (AC)
|
||||
{
|
||||
lstr =strlen(AC);
|
||||
tmp->AC=ECOMALLOC((lstr+1) * sizeof(char),
|
||||
"Allocate sequence accession");
|
||||
strcpy(tmp->AC,AC);
|
||||
}
|
||||
|
||||
if (DE)
|
||||
{
|
||||
lstr =strlen(DE);
|
||||
tmp->DE=ECOMALLOC((lstr+1) * sizeof(char),
|
||||
"Allocate sequence definition");
|
||||
strcpy(tmp->DE,DE);
|
||||
}
|
||||
|
||||
if (SQ)
|
||||
{
|
||||
lstr =strlen(SQ);
|
||||
tmp->SQ=ECOMALLOC((lstr+1) * sizeof(char),
|
||||
"Allocate sequence data");
|
||||
strcpy(tmp->SQ,SQ);
|
||||
}
|
||||
return tmp;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* ?? used ??
|
||||
**/
|
||||
FILE *open_ecoseqdb(const char *filename,
|
||||
int32_t *sequencecount)
|
||||
{
|
||||
return open_ecorecorddb(filename,sequencecount,1);
|
||||
}
|
||||
|
||||
ecoseq_t *readnext_ecoseq(FILE *f)
|
||||
{
|
||||
char *compressed=NULL;
|
||||
|
||||
ecoseqformat_t *raw;
|
||||
ecoseq_t *seq;
|
||||
int32_t comp_status;
|
||||
unsigned long int seqlength;
|
||||
int32_t rs;
|
||||
char *c;
|
||||
int32_t i;
|
||||
|
||||
raw = read_ecorecord(f,&rs);
|
||||
|
||||
if (!raw)
|
||||
return NULL;
|
||||
|
||||
if (is_big_endian())
|
||||
{
|
||||
raw->CSQ_length = swap_int32_t(raw->CSQ_length);
|
||||
raw->DE_length = swap_int32_t(raw->DE_length);
|
||||
raw->SQ_length = swap_int32_t(raw->SQ_length);
|
||||
raw->taxid = swap_int32_t(raw->taxid);
|
||||
}
|
||||
|
||||
seq = new_ecoseq();
|
||||
|
||||
seq->taxid = raw->taxid;
|
||||
|
||||
seq->AC = ECOMALLOC(strlen(raw->AC) +1,
|
||||
"Allocate Sequence Accesion number");
|
||||
strncpy(seq->AC,raw->AC,strlen(raw->AC));
|
||||
|
||||
|
||||
seq->DE = ECOMALLOC(raw->DE_length+1,
|
||||
"Allocate Sequence definition");
|
||||
strncpy(seq->DE,raw->data,raw->DE_length);
|
||||
|
||||
seqlength = seq->SQ_length = raw->SQ_length;
|
||||
|
||||
compressed = raw->data + raw->DE_length;
|
||||
|
||||
seq->SQ = ECOMALLOC(seqlength+1,
|
||||
"Allocate sequence buffer");
|
||||
/*
|
||||
comp_status = uncompress((unsigned char*)seq->SQ,
|
||||
&seqlength,
|
||||
(unsigned char*)compressed,
|
||||
raw->CSQ_length);
|
||||
*/
|
||||
|
||||
if (comp_status != Z_OK)
|
||||
ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data");
|
||||
|
||||
for (c=seq->SQ,i=0;i<seqlength;c++,i++)
|
||||
*c=toupper(*c);
|
||||
|
||||
|
||||
return seq;
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the sequences database (.sdx file)
|
||||
* @param prefix name of the database (radical without extension)
|
||||
* @param index integer
|
||||
*
|
||||
* @return file object
|
||||
*/
|
||||
FILE *open_seqfile(const char *prefix,int32_t index)
|
||||
{
|
||||
char filename_buffer[1024];
|
||||
int32_t filename_length;
|
||||
FILE *input;
|
||||
int32_t seqcount;
|
||||
|
||||
filename_length = snprintf(filename_buffer,
|
||||
1023,
|
||||
"%s_%03d.sdx",
|
||||
prefix,
|
||||
index);
|
||||
|
||||
// fprintf(stderr,"# Coucou %s\n",filename_buffer);
|
||||
|
||||
|
||||
if (filename_length >= 1024)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"file name is too long");
|
||||
|
||||
filename_buffer[filename_length]=0;
|
||||
|
||||
input=open_ecorecorddb(filename_buffer,&seqcount,0);
|
||||
|
||||
if (input)
|
||||
REprintf("# Reading file %s containing %d sequences...\n",
|
||||
filename_buffer,
|
||||
seqcount);
|
||||
|
||||
return input;
|
||||
}
|
||||
|
||||
ecoseq_t *ecoseq_iterator(const char *prefix)
|
||||
{
|
||||
static FILE *current_seq_file= NULL;
|
||||
static int32_t current_file_idx = 1;
|
||||
static char current_prefix[1024];
|
||||
ecoseq_t *seq;
|
||||
|
||||
if (prefix)
|
||||
{
|
||||
current_file_idx = 1;
|
||||
|
||||
if (current_seq_file)
|
||||
fclose(current_seq_file);
|
||||
|
||||
strncpy(current_prefix,prefix,1023);
|
||||
current_prefix[1023]=0;
|
||||
|
||||
current_seq_file = open_seqfile(current_prefix,
|
||||
current_file_idx);
|
||||
|
||||
if (!current_seq_file)
|
||||
return NULL;
|
||||
|
||||
}
|
||||
|
||||
seq = readnext_ecoseq(current_seq_file);
|
||||
|
||||
if (!seq && feof(current_seq_file))
|
||||
{
|
||||
current_file_idx++;
|
||||
fclose(current_seq_file);
|
||||
current_seq_file = open_seqfile(current_prefix,
|
||||
current_file_idx);
|
||||
|
||||
|
||||
if (current_seq_file)
|
||||
seq = readnext_ecoseq(current_seq_file);
|
||||
}
|
||||
|
||||
return seq;
|
||||
}
|
BIN
src/ecoseq.o
Normal file
BIN
src/ecoseq.o
Normal file
Binary file not shown.
420
src/ecotax.c
Normal file
420
src/ecotax.c
Normal file
@@ -0,0 +1,420 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <R.h>
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x,y) (((x)>(y)) ? (x):(y))
|
||||
#endif
|
||||
|
||||
static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon);
|
||||
|
||||
/**
|
||||
* Open the taxonomy database
|
||||
* @param pointer to the database (.tdx file)
|
||||
* @return a ecotxidx_t structure
|
||||
*/
|
||||
ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2)
|
||||
{
|
||||
int32_t count;
|
||||
int32_t count2;
|
||||
FILE *f;
|
||||
FILE *f2;
|
||||
ecotxidx_t *index;
|
||||
struct ecotxnode *t;
|
||||
int32_t i;
|
||||
int32_t j;
|
||||
|
||||
f = open_ecorecorddb(filename,&count,0);
|
||||
|
||||
if (f==NULL) return NULL;
|
||||
|
||||
f2 = open_ecorecorddb(filename2,&count2,0);
|
||||
|
||||
index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count+count2-1),
|
||||
"Allocate taxonomy");
|
||||
|
||||
index->count=count+count2;
|
||||
index->buffersize = index->count;
|
||||
|
||||
index->maxtaxid=0;
|
||||
REprintf("Readind %d taxa...\n",count);
|
||||
for (i=0; i < count; i++){
|
||||
readnext_ecotaxon(f,&(index->taxon[i]));
|
||||
index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
|
||||
index->taxon[i].parent->farest=0;
|
||||
if (index->taxon[i].taxid > index->maxtaxid)
|
||||
index->maxtaxid=index->taxon[i].taxid;
|
||||
}
|
||||
|
||||
|
||||
if (count2>0)
|
||||
REprintf("Readind %d local taxa...\n",count2);
|
||||
else
|
||||
REprintf("No local taxon\n");
|
||||
|
||||
count = index->count;
|
||||
|
||||
for (; i < count; i++){
|
||||
readnext_ecotaxon(f2,&(index->taxon[i]));
|
||||
index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
|
||||
index->taxon[i].parent->farest=0;
|
||||
if (index->taxon[i].taxid > index->maxtaxid)
|
||||
index->maxtaxid=index->taxon[i].taxid;
|
||||
}
|
||||
|
||||
REprintf("Computing longest branches...\n",count);
|
||||
|
||||
for (i=0; i < count; i++){
|
||||
t=index->taxon+i;
|
||||
if (t->farest==-1)
|
||||
{
|
||||
t->farest=0;
|
||||
while(t->parent != t)
|
||||
{
|
||||
j = t->farest + 1;
|
||||
if (j > t->parent->farest)
|
||||
{
|
||||
t->parent->farest = j;
|
||||
t=t->parent;
|
||||
}
|
||||
else
|
||||
t=index->taxon;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
|
||||
int32_t delete_taxonomy(ecotxidx_t *index)
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
if (index)
|
||||
{
|
||||
for (i=0; i< index->count; i++)
|
||||
if (index->taxon[i].name)
|
||||
ECOFREE(index->taxon[i].name,"Free scientific name");
|
||||
|
||||
|
||||
ECOFREE(index,"Free Taxonomy");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int32_t delete_taxon(ecotx_t *taxon)
|
||||
{
|
||||
if (taxon)
|
||||
{
|
||||
if (taxon->name)
|
||||
ECOFREE(taxon->name,"Free scientific name");
|
||||
|
||||
ECOFREE(taxon,"Free Taxon");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Read the database for a given taxon a save the data
|
||||
* into the taxon structure(if any found)
|
||||
* @param *f pointer to FILE type returned by fopen
|
||||
* @param *taxon pointer to the structure
|
||||
*
|
||||
* @return a ecotx_t structure if any taxon found else NULL
|
||||
*/
|
||||
ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
|
||||
{
|
||||
|
||||
ecotxformat_t *raw;
|
||||
int32_t rs;
|
||||
|
||||
raw = read_ecorecord(f,&rs);
|
||||
|
||||
if (!raw)
|
||||
return NULL;
|
||||
|
||||
if (is_big_endian())
|
||||
{
|
||||
raw->namelength = swap_int32_t(raw->namelength);
|
||||
raw->parent = swap_int32_t(raw->parent);
|
||||
raw->rank = swap_int32_t(raw->rank);
|
||||
raw->taxid = swap_int32_t(raw->taxid);
|
||||
}
|
||||
|
||||
taxon->parent = (ecotx_t*)((size_t)raw->parent);
|
||||
taxon->taxid = raw->taxid;
|
||||
taxon->rank = raw->rank;
|
||||
taxon->farest = -1;
|
||||
|
||||
taxon->name = ECOMALLOC((raw->namelength+1) * sizeof(char),
|
||||
"Allocate taxon scientific name");
|
||||
|
||||
strncpy(taxon->name,raw->name,raw->namelength);
|
||||
|
||||
return taxon;
|
||||
}
|
||||
|
||||
|
||||
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName)
|
||||
{
|
||||
ecotaxonomy_t *tax;
|
||||
char *filename;
|
||||
char *filename2;
|
||||
int buffsize;
|
||||
|
||||
tax = ECOMALLOC(sizeof(ecotaxonomy_t),
|
||||
"Allocate taxonomy structure");
|
||||
|
||||
tax->ranks =NULL;
|
||||
tax->taxons=NULL;
|
||||
tax->names =NULL;
|
||||
|
||||
buffsize = strlen(prefix)+10;
|
||||
|
||||
filename = ECOMALLOC(buffsize,
|
||||
"Allocate filename");
|
||||
filename2= ECOMALLOC(buffsize,
|
||||
"Allocate filename");
|
||||
|
||||
snprintf(filename,buffsize,"%s.rdx",prefix);
|
||||
|
||||
tax->ranks = read_rankidx(filename);
|
||||
|
||||
if (tax->ranks == NULL)
|
||||
{
|
||||
ECOFREE(filename,"Desallocate filename 1");
|
||||
ECOFREE(filename2,"Desallocate filename 2");
|
||||
|
||||
delete_ecotaxonomy(tax);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
snprintf(filename,buffsize,"%s.tdx",prefix);
|
||||
snprintf(filename2,buffsize,"%s.ldx",prefix);
|
||||
|
||||
tax->taxons = read_taxonomyidx(filename,filename2);
|
||||
|
||||
if (tax->taxons == NULL)
|
||||
{
|
||||
ECOFREE(filename,"Desallocate filename 1");
|
||||
ECOFREE(filename,"Desallocate filename 2");
|
||||
|
||||
delete_ecotaxonomy(tax);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (readAlternativeName)
|
||||
{
|
||||
snprintf(filename,buffsize,"%s.ndx",prefix);
|
||||
tax->names=read_nameidx(filename,tax);
|
||||
}
|
||||
else
|
||||
tax->names=NULL;
|
||||
|
||||
ECOFREE(filename,"Desallocate filename 1");
|
||||
ECOFREE(filename2,"Desallocate filename 2");
|
||||
|
||||
return tax;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
if (taxonomy)
|
||||
{
|
||||
delete_rankidx(taxonomy->ranks);
|
||||
delete_nameidx(taxonomy->names);
|
||||
delete_taxonomy(taxonomy->taxons);
|
||||
|
||||
ECOFREE(taxonomy,"Free taxonomy structure");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon,
|
||||
int32_t rankidx)
|
||||
{
|
||||
ecotx_t *current_taxon;
|
||||
ecotx_t *next_taxon;
|
||||
|
||||
current_taxon = taxon;
|
||||
next_taxon = current_taxon->parent;
|
||||
|
||||
while ((current_taxon!=next_taxon) && // I' am the root node
|
||||
(current_taxon->rank!=rankidx))
|
||||
{
|
||||
current_taxon = next_taxon;
|
||||
next_taxon = current_taxon->parent;
|
||||
}
|
||||
|
||||
if (current_taxon->rank==rankidx)
|
||||
return current_taxon;
|
||||
else
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int bcomptaxon (const void * ptaxid, const void * ptaxon) {
|
||||
|
||||
ecotx_t *current_taxon = (ecotx_t*)ptaxon;
|
||||
int32_t taxid=(int32_t)((size_t)ptaxid);
|
||||
return taxid - current_taxon->taxid;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get back information concerning a taxon from a taxonomic id
|
||||
* @param *taxonomy the taxonomy database
|
||||
* @param taxid the taxonomic id
|
||||
*
|
||||
* @result a ecotx_t structure containing the taxonimic information
|
||||
**/
|
||||
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy,
|
||||
int32_t taxid)
|
||||
{
|
||||
ecotx_t *current_taxon;
|
||||
int32_t taxoncount;
|
||||
|
||||
taxoncount=taxonomy->taxons->count;
|
||||
|
||||
current_taxon = (ecotx_t*) bsearch((const void *)((size_t)taxid),
|
||||
(const void *)taxonomy->taxons->taxon,
|
||||
taxoncount,
|
||||
sizeof(ecotx_t),
|
||||
bcomptaxon);
|
||||
|
||||
return current_taxon;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find out if taxon is son of other taxon (identified by its taxid)
|
||||
* @param *taxon son taxon
|
||||
* @param parent_taxid taxonomic id of the other taxon
|
||||
*
|
||||
* @return 1 is the other taxid math a parent taxid, else 0
|
||||
**/
|
||||
int eco_isundertaxon(ecotx_t *taxon,
|
||||
int other_taxid)
|
||||
{
|
||||
ecotx_t *next_parent;
|
||||
|
||||
next_parent = taxon->parent;
|
||||
|
||||
while ( (other_taxid != next_parent->taxid) &&
|
||||
(strcmp(next_parent->name, "root")) )
|
||||
{
|
||||
next_parent = next_parent->parent;
|
||||
}
|
||||
|
||||
if (other_taxid == next_parent->taxid)
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
ecotx_t *eco_getspecies(ecotx_t *taxon,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
static ecotaxonomy_t *tax=NULL;
|
||||
static int32_t rankindex=-1;
|
||||
|
||||
if (taxonomy && tax!=taxonomy)
|
||||
{
|
||||
rankindex = rank_index("species",taxonomy->ranks);
|
||||
tax=taxonomy;
|
||||
}
|
||||
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
|
||||
ecotx_t *eco_getgenus(ecotx_t *taxon,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
static ecotaxonomy_t *tax=NULL;
|
||||
static int32_t rankindex=-1;
|
||||
|
||||
if (taxonomy && tax!=taxonomy)
|
||||
{
|
||||
rankindex = rank_index("genus",taxonomy->ranks);
|
||||
tax=taxonomy;
|
||||
}
|
||||
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
|
||||
|
||||
ecotx_t *eco_getfamily(ecotx_t *taxon,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
static ecotaxonomy_t *tax=NULL;
|
||||
static int32_t rankindex=-1;
|
||||
|
||||
if (taxonomy && tax!=taxonomy)
|
||||
{
|
||||
rankindex = rank_index("family",taxonomy->ranks);
|
||||
tax=taxonomy;
|
||||
}
|
||||
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
|
||||
ecotx_t *eco_getkingdom(ecotx_t *taxon,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
static ecotaxonomy_t *tax=NULL;
|
||||
static int32_t rankindex=-1;
|
||||
|
||||
if (taxonomy && tax!=taxonomy)
|
||||
{
|
||||
rankindex = rank_index("kingdom",taxonomy->ranks);
|
||||
tax=taxonomy;
|
||||
}
|
||||
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
|
||||
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
static ecotaxonomy_t *tax=NULL;
|
||||
static int32_t rankindex=-1;
|
||||
|
||||
if (taxonomy && tax!=taxonomy)
|
||||
{
|
||||
rankindex = rank_index("superkingdom",taxonomy->ranks);
|
||||
tax=taxonomy;
|
||||
}
|
||||
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
BIN
src/ecotax.o
Normal file
BIN
src/ecotax.o
Normal file
Binary file not shown.
845
src/robitax.c
Normal file
845
src/robitax.c
Normal file
@@ -0,0 +1,845 @@
|
||||
/*
|
||||
* robitax.c
|
||||
*
|
||||
* Created on: 17 janv. 2013
|
||||
* Author: coissac
|
||||
*/
|
||||
|
||||
#include "robitax.h"
|
||||
#include <unistd.h>
|
||||
//#include <regex.h>
|
||||
#include "slre.h"
|
||||
|
||||
/**
|
||||
* Return a pointeur to an obitools taxonomy C structure
|
||||
* from an R instance of taxonomy.obitools
|
||||
*
|
||||
* The function checks if the pointer stored in the R object is set
|
||||
* to NULL. In this case this means that we have to load the taxonomy
|
||||
* from the disk.
|
||||
*
|
||||
* @param taxonomy an R object
|
||||
* @type taxonomy SEXP
|
||||
*
|
||||
* @return a pointer to the C structure
|
||||
* @rtype ecotaxonomy_t *
|
||||
*/
|
||||
|
||||
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy)
|
||||
{
|
||||
|
||||
|
||||
char *pwd;
|
||||
SEXP pointer;
|
||||
SEXP rclass;
|
||||
SEXP rdir;
|
||||
SEXP rfile;
|
||||
ecotaxonomy_t *ptax;
|
||||
const char *class;
|
||||
const char *file;
|
||||
const char *dir;
|
||||
|
||||
int saved;
|
||||
|
||||
if (!IS_S4_OBJECT(Rtaxonomy) )
|
||||
error("argument not obitools.taxonomy instance");
|
||||
|
||||
// We get the class name and compare it to "taxonomy.obitools"
|
||||
rclass = getAttrib(Rtaxonomy, R_ClassSymbol);
|
||||
class = CHAR(asChar(rclass));
|
||||
|
||||
if (strcmp(class,"obitools.taxonomy"))
|
||||
error("argument not obitools.taxonomy instance");
|
||||
|
||||
pointer = R_do_slot(Rtaxonomy,mkString("pointer"));
|
||||
saved = LOGICAL(R_do_slot(Rtaxonomy,mkString("saved")))[0];
|
||||
ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(pointer);
|
||||
|
||||
// If the external pointer is set to NULL we have to load
|
||||
// the taxonomy from file
|
||||
if (ptax==NULL && saved)
|
||||
{
|
||||
pwd = getcwd(NULL,0);
|
||||
|
||||
rfile = R_do_slot(Rtaxonomy,mkString("dbname"));
|
||||
file = CHAR(asChar(rfile));
|
||||
|
||||
rdir = R_do_slot(Rtaxonomy,mkString("workingdir"));
|
||||
dir = CHAR(asChar(rdir));
|
||||
|
||||
chdir(dir);
|
||||
|
||||
ptax = read_taxonomy(file,1);
|
||||
|
||||
R_SetExternalPtrAddr(pointer,(void*)ptax);
|
||||
|
||||
chdir(pwd);
|
||||
free(pwd);
|
||||
}
|
||||
|
||||
if (ptax==NULL && ! saved)
|
||||
error("The taxonomy instance is no more valid and must be rebuilt");
|
||||
|
||||
return ptax;
|
||||
}
|
||||
|
||||
SEXP R_delete_taxonomy(SEXP Rtaxonomy)
|
||||
{
|
||||
ecotaxonomy_t *ptax;
|
||||
// SEXP pointer;
|
||||
|
||||
ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(Rtaxonomy);
|
||||
|
||||
(void) delete_ecotaxonomy(ptax);
|
||||
|
||||
// Clear the external pointer
|
||||
R_ClearExternalPtr(Rtaxonomy);
|
||||
|
||||
return R_NilValue;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
SEXP R_read_taxonomy(SEXP filename, SEXP altenative)
|
||||
{
|
||||
int alt;
|
||||
const char* file;
|
||||
SEXP Rtax;
|
||||
|
||||
if (! isString(filename))
|
||||
error("filename not character");
|
||||
file = CHAR(STRING_ELT(filename, 0));
|
||||
|
||||
if (! isLogical(altenative))
|
||||
error("altenative not logical");
|
||||
alt = LOGICAL(altenative)[0];
|
||||
|
||||
ecotaxonomy_t *taxonomy = read_taxonomy(file,alt);
|
||||
|
||||
if (! taxonomy)
|
||||
error("Cannot open the taxonomy database");
|
||||
|
||||
Rtax = PROTECT(R_MakeExternalPtr(taxonomy, mkString("ROBITools NCBI Taxonomy pointer"), R_NilValue));
|
||||
R_RegisterCFinalizerEx(Rtax, (R_CFinalizer_t)R_delete_taxonomy,TRUE);
|
||||
|
||||
UNPROTECT(1);
|
||||
|
||||
|
||||
return Rtax;
|
||||
}
|
||||
|
||||
|
||||
SEXP R_get_scientific_name(SEXP Rtaxonomy,SEXP Rtaxid)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
return ScalarString(R_NaString);
|
||||
// error("unkown taxid");
|
||||
|
||||
return mkString(taxon->name);
|
||||
|
||||
}
|
||||
|
||||
SEXP R_get_rank(SEXP Rtaxonomy,SEXP Rtaxid)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotaxonomy_t *ptax;
|
||||
int *taxid;
|
||||
int ntaxid;
|
||||
int i;
|
||||
SEXP results;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
ntaxid = length(Rtaxid);
|
||||
|
||||
results = PROTECT(allocVector(STRSXP, ntaxid));
|
||||
|
||||
taxid = INTEGER(Rtaxid);
|
||||
|
||||
for (i=0; i < ntaxid; i++)
|
||||
{
|
||||
if (taxid[i]== NA_INTEGER || taxid[i] <= 0)
|
||||
SET_STRING_ELT(results, i, R_NaString);
|
||||
else {
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid[i]);
|
||||
if (!taxon)
|
||||
SET_STRING_ELT(results, i, R_NaString);
|
||||
else
|
||||
SET_STRING_ELT(results, i, mkChar(ptax->ranks->label[taxon->rank]));
|
||||
}
|
||||
}
|
||||
|
||||
UNPROTECT(1);
|
||||
|
||||
return results;
|
||||
|
||||
}
|
||||
|
||||
SEXP R_findtaxonatrank(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rrank, SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
const char *rank;
|
||||
int rankidx;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isString(Rrank))
|
||||
error("rank not a string");
|
||||
|
||||
rank=CHAR(STRING_ELT(Rrank,0));
|
||||
|
||||
rankidx=rank_index(rank,ptax->ranks);
|
||||
|
||||
if (rankidx < 0)
|
||||
error("unkown rank name");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
rep = eco_findtaxonatrank(taxon,rankidx);
|
||||
|
||||
if (!rep)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
|
||||
SEXP R_get_species(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
rep = eco_getspecies(taxon,ptax);
|
||||
|
||||
if (!rep)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
SEXP R_get_genus(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
rep = eco_getgenus(taxon,ptax);
|
||||
|
||||
if (!rep)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
SEXP R_get_family(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
rep = eco_getfamily(taxon,ptax);
|
||||
|
||||
if (!rep)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
SEXP R_get_kingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
rep = eco_getkingdom(taxon,ptax);
|
||||
|
||||
if (!rep)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
SEXP R_get_superkingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
rep = eco_getsuperkingdom(taxon,ptax);
|
||||
|
||||
if (!rep)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
SEXP R_get_parent(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotx_t *rep;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
error("taxid not positive");
|
||||
|
||||
if (! isLogical(Rname))
|
||||
error("name not logical");
|
||||
name = LOGICAL(Rname)[0];
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
rep = taxon->parent;
|
||||
|
||||
if (rep->taxid==taxid)
|
||||
{
|
||||
if (name)
|
||||
return ScalarString(R_NaString);
|
||||
else
|
||||
return ScalarInteger(R_NaInt);
|
||||
}
|
||||
|
||||
if (name)
|
||||
return mkString(rep->name);
|
||||
else
|
||||
return ScalarInteger(rep->taxid);
|
||||
}
|
||||
|
||||
|
||||
SEXP R_validate_taxid(SEXP Rtaxonomy,SEXP Rtaxid)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
// int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (! (taxid > 0))
|
||||
return ScalarInteger(R_NaInt);
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
return ScalarInteger(R_NaInt);
|
||||
else
|
||||
return ScalarInteger(taxon->taxid);
|
||||
}
|
||||
|
||||
|
||||
SEXP R_is_under_taxon(SEXP Rtaxonomy, SEXP Rtaxid, SEXP Rparent)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
int *ptaxid;
|
||||
int ntaxid;
|
||||
int parent;
|
||||
SEXP rep;
|
||||
int* prep;
|
||||
size_t i;
|
||||
// SEXP isunder;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rparent))
|
||||
error("parent not integer");
|
||||
|
||||
if (GET_LENGTH(Rparent) > 1)
|
||||
error("parent must have a length equal to one.");
|
||||
|
||||
parent = *INTEGER(Rparent);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
ntaxid = GET_LENGTH(Rtaxid);
|
||||
rep = PROTECT(NEW_LOGICAL(ntaxid));
|
||||
prep=LOGICAL(rep);
|
||||
|
||||
ptaxid = INTEGER_POINTER(Rtaxid);
|
||||
|
||||
|
||||
if (parent > 0) {
|
||||
taxon = eco_findtaxonbytaxid(ptax, parent);
|
||||
if (taxon) {
|
||||
for (i=0; i < ntaxid; i++) {
|
||||
if (ptaxid[i] <= 0)
|
||||
prep[i]=NA_LOGICAL;
|
||||
else {
|
||||
taxon = eco_findtaxonbytaxid(ptax, ptaxid[i]);
|
||||
prep[i] = eco_isundertaxon(taxon, parent);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i=0; i < ntaxid; i++)
|
||||
prep[i]=NA_LOGICAL;
|
||||
}
|
||||
|
||||
unprotect(1);
|
||||
return rep;
|
||||
}
|
||||
|
||||
|
||||
SEXP R_longest_path(SEXP Rtaxonomy,SEXP Rtaxid)
|
||||
{
|
||||
ecotx_t *taxon;
|
||||
ecotaxonomy_t *ptax;
|
||||
int taxid;
|
||||
// int name;
|
||||
// SEXP scname;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
if (! isInteger(Rtaxid))
|
||||
error("taxid not integer");
|
||||
|
||||
taxid = *INTEGER(Rtaxid);
|
||||
|
||||
if (taxid <= 0)
|
||||
return ScalarInteger(R_NaInt);
|
||||
|
||||
taxon = eco_findtaxonbytaxid(ptax, taxid);
|
||||
|
||||
if (!taxon)
|
||||
return ScalarInteger(R_NaInt);
|
||||
else
|
||||
return ScalarInteger(taxon->farest);
|
||||
}
|
||||
|
||||
SEXP R_rank_list(SEXP Rtaxonomy)
|
||||
{
|
||||
int nrank;
|
||||
int i;
|
||||
ecotaxonomy_t *ptax;
|
||||
SEXP rNames;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
nrank = ptax->ranks->count;
|
||||
|
||||
rNames = PROTECT(allocVector(STRSXP, nrank));
|
||||
|
||||
for (i=0; i < nrank;i++)
|
||||
SET_STRING_ELT(rNames, i, mkChar(ptax->ranks->label[i]));
|
||||
|
||||
UNPROTECT(1);
|
||||
|
||||
return rNames;
|
||||
}
|
||||
|
||||
SEXP R_taxid_list(SEXP Rtaxonomy)
|
||||
{
|
||||
int ntaxid;
|
||||
int i;
|
||||
ecotaxonomy_t *ptax;
|
||||
SEXP rTaxids;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
ntaxid = ptax->taxons->count;
|
||||
rTaxids = PROTECT(allocVector(INTSXP, ntaxid));
|
||||
|
||||
for (i=0; i < ntaxid;i++)
|
||||
INTEGER(rTaxids)[i]=ptax->taxons->taxon[i].taxid;
|
||||
|
||||
UNPROTECT(1);
|
||||
|
||||
return rTaxids;
|
||||
|
||||
}
|
||||
|
||||
SEXP R_max_taxid(SEXP Rtaxonomy)
|
||||
{
|
||||
// int nrank;
|
||||
// int i;
|
||||
ecotaxonomy_t *ptax;
|
||||
// SEXP rNames;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
return ScalarInteger(ptax->taxons->maxtaxid);
|
||||
}
|
||||
|
||||
SEXP R_length_taxonomy(SEXP Rtaxonomy)
|
||||
{
|
||||
ecotaxonomy_t *ptax;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
|
||||
return ScalarInteger(ptax->taxons->count);
|
||||
}
|
||||
|
||||
SEXP R_ecofind(SEXP Rtaxonomy, SEXP Rpattern, SEXP Rrank, SEXP Ralternative)
|
||||
{
|
||||
ecotaxonomy_t *ptax;
|
||||
econame_t *name;
|
||||
char* pattern=NULL;
|
||||
int re_match;
|
||||
SEXP taxids;
|
||||
int32_t* buffer;
|
||||
int32_t tax_count = 0;
|
||||
size_t j = 0;
|
||||
int32_t rankfilter = 1;
|
||||
int* ptaxid;
|
||||
char *rankname=NULL;
|
||||
int32_t nummatch = 0;
|
||||
int32_t alternative = 0;
|
||||
|
||||
size_t bsize;
|
||||
|
||||
ptax = getTaxPointer(Rtaxonomy);
|
||||
tax_count = ptax->taxons->count;
|
||||
|
||||
if (! isString(Rpattern))
|
||||
error("pattern not a string");
|
||||
|
||||
pattern= (char*) CHAR(STRING_ELT(Rpattern,0));
|
||||
|
||||
if (! isNull(Rrank))
|
||||
{
|
||||
if (! isString(Rrank))
|
||||
error("rank not a string");
|
||||
|
||||
rankname= (char*) CHAR(STRING_ELT(Rrank,0));
|
||||
}
|
||||
|
||||
if (! isLogical(Ralternative))
|
||||
error("rank not a logical");
|
||||
|
||||
alternative = LOGICAL(Ralternative)[0];
|
||||
|
||||
|
||||
nummatch=0;
|
||||
buffer = (int32_t*) malloc(100 * sizeof(int32_t));
|
||||
bsize=100;
|
||||
|
||||
if (alternative && ptax->names!=NULL)
|
||||
for (j=0,name=ptax->names->names;
|
||||
j < ptax->names->count;
|
||||
name++,j++)
|
||||
{
|
||||
if(rankname)
|
||||
rankfilter = !(strcmp(rankname,ptax->ranks->label[name->taxon->rank]));
|
||||
|
||||
re_match = slre_match(pattern, name->name,
|
||||
strlen(name->name),
|
||||
NULL, 0,
|
||||
SLRE_IGNORE_CASE);
|
||||
|
||||
if (re_match > 0 && rankfilter)
|
||||
{
|
||||
buffer[nummatch]=name->taxon->taxid;
|
||||
nummatch++;
|
||||
if (nummatch==bsize) {
|
||||
bsize*=2;
|
||||
buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t));
|
||||
if (buffer==0)
|
||||
{
|
||||
// regfree(&re_preg);
|
||||
error("Cannot allocate memory for the taxid list");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
for (j=0; j < ptax->taxons->count;j++)
|
||||
{
|
||||
if(rankname)
|
||||
rankfilter = !(strcmp(rankname,ptax->ranks->label[ptax->taxons->taxon[j].rank]));
|
||||
|
||||
// re_match = regexec (&re_preg, ptax->taxons->taxon[j].name, 0, NULL, 0);
|
||||
re_match = slre_match(pattern, ptax->taxons->taxon[j].name,
|
||||
strlen(ptax->taxons->taxon[j].name),
|
||||
NULL, 0,
|
||||
SLRE_IGNORE_CASE);
|
||||
|
||||
|
||||
// if (!re_match && rankfilter)
|
||||
if (re_match > 0 && rankfilter)
|
||||
{
|
||||
buffer[nummatch]=ptax->taxons->taxon[j].taxid;
|
||||
nummatch++;
|
||||
if (nummatch==bsize) {
|
||||
bsize*=2;
|
||||
buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t));
|
||||
if (buffer==0)
|
||||
{
|
||||
// regfree(&re_preg);
|
||||
error("Cannot allocate memory for the taxid list");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//regfree(&re_preg);
|
||||
|
||||
taxids = PROTECT(NEW_INTEGER(nummatch));
|
||||
ptaxid = INTEGER(taxids);
|
||||
|
||||
for (j=0; j < nummatch; j++)
|
||||
ptaxid[j]=buffer[j];
|
||||
|
||||
free(buffer);
|
||||
|
||||
UNPROTECT(1);
|
||||
return taxids;
|
||||
}
|
6
src/robitax.h
Normal file
6
src/robitax.h
Normal file
@@ -0,0 +1,6 @@
|
||||
#include "ecoPCR.h"
|
||||
|
||||
|
||||
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
|
||||
SEXP R_delete_taxonomy(SEXP Rtaxonomy);
|
||||
|
BIN
src/robitax.o
Normal file
BIN
src/robitax.o
Normal file
Binary file not shown.
433
src/slre.c
Executable file
433
src/slre.c
Executable file
@@ -0,0 +1,433 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
|
||||
* Copyright (c) 2013 Cesanta Software Limited
|
||||
* All rights reserved
|
||||
*
|
||||
* This library is dual-licensed: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation. For the terms of this
|
||||
* license, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You are free to use this library under the terms of the GNU General
|
||||
* Public License, but WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU General Public License for more details.
|
||||
*
|
||||
* Alternatively, you can license this library under a commercial
|
||||
* license, as set out in <http://cesanta.com/products.html>.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "slre.h"
|
||||
|
||||
#define MAX_BRANCHES 100
|
||||
#define MAX_BRACKETS 100
|
||||
#define FAIL_IF(condition, error_code) if (condition) return (error_code)
|
||||
|
||||
#ifndef ARRAY_SIZE
|
||||
#define ARRAY_SIZE(ar) (sizeof(ar) / sizeof((ar)[0]))
|
||||
#endif
|
||||
|
||||
#ifdef SLRE_DEBUG
|
||||
#define DBG(x) printf x
|
||||
#else
|
||||
#define DBG(x)
|
||||
#endif
|
||||
|
||||
struct bracket_pair {
|
||||
const char *ptr; /* Points to the first char after '(' in regex */
|
||||
int len; /* Length of the text between '(' and ')' */
|
||||
int branches; /* Index in the branches array for this pair */
|
||||
int num_branches; /* Number of '|' in this bracket pair */
|
||||
};
|
||||
|
||||
struct branch {
|
||||
int bracket_index; /* index for 'struct bracket_pair brackets' */
|
||||
/* array defined below */
|
||||
const char *schlong; /* points to the '|' character in the regex */
|
||||
};
|
||||
|
||||
struct regex_info {
|
||||
/*
|
||||
* Describes all bracket pairs in the regular expression.
|
||||
* First entry is always present, and grabs the whole regex.
|
||||
*/
|
||||
struct bracket_pair brackets[MAX_BRACKETS];
|
||||
int num_brackets;
|
||||
|
||||
/*
|
||||
* Describes alternations ('|' operators) in the regular expression.
|
||||
* Each branch falls into a specific branch pair.
|
||||
*/
|
||||
struct branch branches[MAX_BRANCHES];
|
||||
int num_branches;
|
||||
|
||||
/* Array of captures provided by the user */
|
||||
struct slre_cap *caps;
|
||||
int num_caps;
|
||||
|
||||
/* E.g. SLRE_IGNORE_CASE. See enum below */
|
||||
int flags;
|
||||
};
|
||||
|
||||
static int is_metacharacter(const unsigned char *s) {
|
||||
static const char *metacharacters = "^$().[]*+?|\\Ssdbfnrtv";
|
||||
return strchr(metacharacters, *s) != NULL;
|
||||
}
|
||||
|
||||
static int op_len(const char *re) {
|
||||
return re[0] == '\\' && re[1] == 'x' ? 4 : re[0] == '\\' ? 2 : 1;
|
||||
}
|
||||
|
||||
static int set_len(const char *re, int re_len) {
|
||||
int len = 0;
|
||||
|
||||
while (len < re_len && re[len] != ']') {
|
||||
len += op_len(re + len);
|
||||
}
|
||||
|
||||
return len <= re_len ? len + 1 : -1;
|
||||
}
|
||||
|
||||
static int get_op_len(const char *re, int re_len) {
|
||||
return re[0] == '[' ? set_len(re + 1, re_len - 1) + 1 : op_len(re);
|
||||
}
|
||||
|
||||
static int is_quantifier(const char *re) {
|
||||
return re[0] == '*' || re[0] == '+' || re[0] == '?';
|
||||
}
|
||||
|
||||
static int toi(int x) {
|
||||
return isdigit(x) ? x - '0' : x - 'W';
|
||||
}
|
||||
|
||||
static int hextoi(const unsigned char *s) {
|
||||
return (toi(tolower(s[0])) << 4) | toi(tolower(s[1]));
|
||||
}
|
||||
|
||||
static int match_op(const unsigned char *re, const unsigned char *s,
|
||||
struct regex_info *info) {
|
||||
int result = 0;
|
||||
switch (*re) {
|
||||
case '\\':
|
||||
/* Metacharacters */
|
||||
switch (re[1]) {
|
||||
case 'S': FAIL_IF(isspace(*s), SLRE_NO_MATCH); result++; break;
|
||||
case 's': FAIL_IF(!isspace(*s), SLRE_NO_MATCH); result++; break;
|
||||
case 'd': FAIL_IF(!isdigit(*s), SLRE_NO_MATCH); result++; break;
|
||||
case 'b': FAIL_IF(*s != '\b', SLRE_NO_MATCH); result++; break;
|
||||
case 'f': FAIL_IF(*s != '\f', SLRE_NO_MATCH); result++; break;
|
||||
case 'n': FAIL_IF(*s != '\n', SLRE_NO_MATCH); result++; break;
|
||||
case 'r': FAIL_IF(*s != '\r', SLRE_NO_MATCH); result++; break;
|
||||
case 't': FAIL_IF(*s != '\t', SLRE_NO_MATCH); result++; break;
|
||||
case 'v': FAIL_IF(*s != '\v', SLRE_NO_MATCH); result++; break;
|
||||
|
||||
case 'x':
|
||||
/* Match byte, \xHH where HH is hexadecimal byte representaion */
|
||||
FAIL_IF(hextoi(re + 2) != *s, SLRE_NO_MATCH);
|
||||
result++;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* Valid metacharacter check is done in bar() */
|
||||
FAIL_IF(re[1] != s[0], SLRE_NO_MATCH);
|
||||
result++;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case '|': FAIL_IF(1, SLRE_INTERNAL_ERROR); break;
|
||||
case '$': FAIL_IF(1, SLRE_NO_MATCH); break;
|
||||
case '.': result++; break;
|
||||
|
||||
default:
|
||||
if (info->flags & SLRE_IGNORE_CASE) {
|
||||
FAIL_IF(tolower(*re) != tolower(*s), SLRE_NO_MATCH);
|
||||
} else {
|
||||
FAIL_IF(*re != *s, SLRE_NO_MATCH);
|
||||
}
|
||||
result++;
|
||||
break;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static int match_set(const char *re, int re_len, const char *s,
|
||||
struct regex_info *info) {
|
||||
int len = 0, result = -1, invert = re[0] == '^';
|
||||
|
||||
if (invert) re++, re_len--;
|
||||
|
||||
while (len <= re_len && re[len] != ']' && result <= 0) {
|
||||
/* Support character range */
|
||||
if (re[len] != '-' && re[len + 1] == '-' && re[len + 2] != ']' &&
|
||||
re[len + 2] != '\0') {
|
||||
result = info->flags && SLRE_IGNORE_CASE ?
|
||||
*s >= re[len] && *s <= re[len + 2] :
|
||||
tolower(*s) >= tolower(re[len]) && tolower(*s) <= tolower(re[len + 2]);
|
||||
len += 3;
|
||||
} else {
|
||||
result = match_op((unsigned char *) re + len, (unsigned char *) s, info);
|
||||
len += op_len(re + len);
|
||||
}
|
||||
}
|
||||
return (!invert && result > 0) || (invert && result <= 0) ? 1 : -1;
|
||||
}
|
||||
|
||||
static int doh(const char *s, int s_len, struct regex_info *info, int bi);
|
||||
|
||||
static int bar(const char *re, int re_len, const char *s, int s_len,
|
||||
struct regex_info *info, int bi) {
|
||||
/* i is offset in re, j is offset in s, bi is brackets index */
|
||||
int i, j, n, step;
|
||||
|
||||
for (i = j = 0; i < re_len && j <= s_len; i += step) {
|
||||
|
||||
/* Handle quantifiers. Get the length of the chunk. */
|
||||
step = re[i] == '(' ? info->brackets[bi + 1].len + 2 :
|
||||
get_op_len(re + i, re_len - i);
|
||||
|
||||
DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n", __func__,
|
||||
re_len - i, re + i, s_len - j, s + j, re_len, step, i, j));
|
||||
|
||||
FAIL_IF(is_quantifier(&re[i]), SLRE_UNEXPECTED_QUANTIFIER);
|
||||
FAIL_IF(step <= 0, SLRE_INVALID_CHARACTER_SET);
|
||||
|
||||
if (i + step < re_len && is_quantifier(re + i + step)) {
|
||||
DBG(("QUANTIFIER: [%.*s]%c [%.*s]\n", step, re + i,
|
||||
re[i + step], s_len - j, s + j));
|
||||
if (re[i + step] == '?') {
|
||||
int result = bar(re + i, step, s + j, s_len - j, info, bi);
|
||||
j += result > 0 ? result : 0;
|
||||
i++;
|
||||
} else if (re[i + step] == '+' || re[i + step] == '*') {
|
||||
int j2 = j, nj = j, n1, n2 = -1, ni, non_greedy = 0;
|
||||
|
||||
/* Points to the regexp code after the quantifier */
|
||||
ni = i + step + 1;
|
||||
if (ni < re_len && re[ni] == '?') {
|
||||
non_greedy = 1;
|
||||
ni++;
|
||||
}
|
||||
|
||||
do {
|
||||
if ((n1 = bar(re + i, step, s + j2, s_len - j2, info, bi)) > 0) {
|
||||
j2 += n1;
|
||||
}
|
||||
if (re[i + step] == '+' && n1 < 0) break;
|
||||
|
||||
if (ni >= re_len) {
|
||||
/* After quantifier, there is nothing */
|
||||
nj = j2;
|
||||
} else if ((n2 = bar(re + ni, re_len - ni, s + j2,
|
||||
s_len - j2, info, bi)) >= 0) {
|
||||
/* Regex after quantifier matched */
|
||||
nj = j2 + n2;
|
||||
}
|
||||
if (nj > j && non_greedy) break;
|
||||
} while (n1 > 0);
|
||||
|
||||
if (n1 < 0 && re[i + step] == '*' &&
|
||||
(n2 = bar(re + ni, re_len - ni, s + j, s_len - j, info, bi)) > 0) {
|
||||
nj = j + n2;
|
||||
}
|
||||
|
||||
DBG(("STAR/PLUS END: %d %d %d %d %d\n", j, nj, re_len - ni, n1, n2));
|
||||
FAIL_IF(re[i + step] == '+' && nj == j, SLRE_NO_MATCH);
|
||||
|
||||
/* If while loop body above was not executed for the * quantifier, */
|
||||
/* make sure the rest of the regex matches */
|
||||
FAIL_IF(nj == j && ni < re_len && n2 < 0, SLRE_NO_MATCH);
|
||||
|
||||
/* Returning here cause we've matched the rest of RE already */
|
||||
return nj;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (re[i] == '[') {
|
||||
n = match_set(re + i + 1, re_len - (i + 2), s + j, info);
|
||||
DBG(("SET %.*s [%.*s] -> %d\n", step, re + i, s_len - j, s + j, n));
|
||||
FAIL_IF(n <= 0, SLRE_NO_MATCH);
|
||||
j += n;
|
||||
} else if (re[i] == '(') {
|
||||
n = SLRE_NO_MATCH;
|
||||
bi++;
|
||||
FAIL_IF(bi >= info->num_brackets, SLRE_INTERNAL_ERROR);
|
||||
DBG(("CAPTURING [%.*s] [%.*s] [%s]\n",
|
||||
step, re + i, s_len - j, s + j, re + i + step));
|
||||
|
||||
if (re_len - (i + step) <= 0) {
|
||||
/* Nothing follows brackets */
|
||||
n = doh(s + j, s_len - j, info, bi);
|
||||
} else {
|
||||
int j2;
|
||||
for (j2 = 0; j2 <= s_len - j; j2++) {
|
||||
if ((n = doh(s + j, s_len - (j + j2), info, bi)) >= 0 &&
|
||||
bar(re + i + step, re_len - (i + step),
|
||||
s + j + n, s_len - (j + n), info, bi) >= 0) break;
|
||||
}
|
||||
}
|
||||
|
||||
DBG(("CAPTURED [%.*s] [%.*s]:%d\n", step, re + i, s_len - j, s + j, n));
|
||||
FAIL_IF(n < 0, n);
|
||||
if (info->caps != NULL) {
|
||||
info->caps[bi - 1].ptr = s + j;
|
||||
info->caps[bi - 1].len = n;
|
||||
}
|
||||
j += n;
|
||||
} else if (re[i] == '^') {
|
||||
FAIL_IF(j != 0, SLRE_NO_MATCH);
|
||||
} else if (re[i] == '$') {
|
||||
FAIL_IF(j != s_len, SLRE_NO_MATCH);
|
||||
} else {
|
||||
FAIL_IF(j >= s_len, SLRE_NO_MATCH);
|
||||
n = match_op((unsigned char *) (re + i), (unsigned char *) (s + j), info);
|
||||
FAIL_IF(n <= 0, n);
|
||||
j += n;
|
||||
}
|
||||
}
|
||||
|
||||
return j;
|
||||
}
|
||||
|
||||
/* Process branch points */
|
||||
static int doh(const char *s, int s_len, struct regex_info *info, int bi) {
|
||||
const struct bracket_pair *b = &info->brackets[bi];
|
||||
int i = 0, len, result;
|
||||
const char *p;
|
||||
|
||||
do {
|
||||
p = i == 0 ? b->ptr : info->branches[b->branches + i - 1].schlong + 1;
|
||||
len = b->num_branches == 0 ? b->len :
|
||||
i == b->num_branches ? (int) (b->ptr + b->len - p) :
|
||||
(int) (info->branches[b->branches + i].schlong - p);
|
||||
DBG(("%s %d %d [%.*s] [%.*s]\n", __func__, bi, i, len, p, s_len, s));
|
||||
result = bar(p, len, s, s_len, info, bi);
|
||||
DBG(("%s <- %d\n", __func__, result));
|
||||
} while (result <= 0 && i++ < b->num_branches); /* At least 1 iteration */
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static int baz(const char *s, int s_len, struct regex_info *info) {
|
||||
int i, result = -1, is_anchored = info->brackets[0].ptr[0] == '^';
|
||||
|
||||
for (i = 0; i <= s_len; i++) {
|
||||
result = doh(s + i, s_len - i, info, 0);
|
||||
if (result >= 0) {
|
||||
result += i;
|
||||
break;
|
||||
}
|
||||
if (is_anchored) break;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static void setup_branch_points(struct regex_info *info) {
|
||||
int i, j;
|
||||
struct branch tmp;
|
||||
|
||||
/* First, sort branches. Must be stable, no qsort. Use bubble algo. */
|
||||
for (i = 0; i < info->num_branches; i++) {
|
||||
for (j = i + 1; j < info->num_branches; j++) {
|
||||
if (info->branches[i].bracket_index > info->branches[j].bracket_index) {
|
||||
tmp = info->branches[i];
|
||||
info->branches[i] = info->branches[j];
|
||||
info->branches[j] = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* For each bracket, set their branch points. This way, for every bracket
|
||||
* (i.e. every chunk of regex) we know all branch points before matching.
|
||||
*/
|
||||
for (i = j = 0; i < info->num_brackets; i++) {
|
||||
info->brackets[i].num_branches = 0;
|
||||
info->brackets[i].branches = j;
|
||||
while (j < info->num_branches && info->branches[j].bracket_index == i) {
|
||||
info->brackets[i].num_branches++;
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int foo(const char *re, int re_len, const char *s, int s_len,
|
||||
struct regex_info *info) {
|
||||
int i, step, depth = 0;
|
||||
|
||||
/* First bracket captures everything */
|
||||
info->brackets[0].ptr = re;
|
||||
info->brackets[0].len = re_len;
|
||||
info->num_brackets = 1;
|
||||
|
||||
/* Make a single pass over regex string, memorize brackets and branches */
|
||||
for (i = 0; i < re_len; i += step) {
|
||||
step = get_op_len(re + i, re_len - i);
|
||||
|
||||
if (re[i] == '|') {
|
||||
FAIL_IF(info->num_branches >= (int) ARRAY_SIZE(info->branches),
|
||||
SLRE_TOO_MANY_BRANCHES);
|
||||
info->branches[info->num_branches].bracket_index =
|
||||
info->brackets[info->num_brackets - 1].len == -1 ?
|
||||
info->num_brackets - 1 : depth;
|
||||
info->branches[info->num_branches].schlong = &re[i];
|
||||
info->num_branches++;
|
||||
} else if (re[i] == '\\') {
|
||||
FAIL_IF(i >= re_len - 1, SLRE_INVALID_METACHARACTER);
|
||||
if (re[i + 1] == 'x') {
|
||||
/* Hex digit specification must follow */
|
||||
FAIL_IF(re[i + 1] == 'x' && i >= re_len - 3,
|
||||
SLRE_INVALID_METACHARACTER);
|
||||
FAIL_IF(re[i + 1] == 'x' && !(isxdigit(re[i + 2]) &&
|
||||
isxdigit(re[i + 3])), SLRE_INVALID_METACHARACTER);
|
||||
} else {
|
||||
FAIL_IF(!is_metacharacter((unsigned char *) re + i + 1),
|
||||
SLRE_INVALID_METACHARACTER);
|
||||
}
|
||||
} else if (re[i] == '(') {
|
||||
FAIL_IF(info->num_brackets >= (int) ARRAY_SIZE(info->brackets),
|
||||
SLRE_TOO_MANY_BRACKETS);
|
||||
depth++; /* Order is important here. Depth increments first. */
|
||||
info->brackets[info->num_brackets].ptr = re + i + 1;
|
||||
info->brackets[info->num_brackets].len = -1;
|
||||
info->num_brackets++;
|
||||
FAIL_IF(info->num_caps > 0 && info->num_brackets - 1 > info->num_caps,
|
||||
SLRE_CAPS_ARRAY_TOO_SMALL);
|
||||
} else if (re[i] == ')') {
|
||||
int ind = info->brackets[info->num_brackets - 1].len == -1 ?
|
||||
info->num_brackets - 1 : depth;
|
||||
info->brackets[ind].len = (int) (&re[i] - info->brackets[ind].ptr);
|
||||
DBG(("SETTING BRACKET %d [%.*s]\n",
|
||||
ind, info->brackets[ind].len, info->brackets[ind].ptr));
|
||||
depth--;
|
||||
FAIL_IF(depth < 0, SLRE_UNBALANCED_BRACKETS);
|
||||
FAIL_IF(i > 0 && re[i - 1] == '(', SLRE_NO_MATCH);
|
||||
}
|
||||
}
|
||||
|
||||
FAIL_IF(depth != 0, SLRE_UNBALANCED_BRACKETS);
|
||||
setup_branch_points(info);
|
||||
|
||||
return baz(s, s_len, info);
|
||||
}
|
||||
|
||||
int slre_match(const char *regexp, const char *s, int s_len,
|
||||
struct slre_cap *caps, int num_caps, int flags) {
|
||||
struct regex_info info;
|
||||
|
||||
/* Initialize info structure */
|
||||
info.flags = flags;
|
||||
info.num_brackets = info.num_branches = 0;
|
||||
info.num_caps = num_caps;
|
||||
info.caps = caps;
|
||||
|
||||
DBG(("========================> [%s] [%.*s]\n", regexp, s_len, s));
|
||||
return foo(regexp, (int) strlen(regexp), s, s_len, &info);
|
||||
}
|
60
src/slre.h
Executable file
60
src/slre.h
Executable file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
|
||||
* Copyright (c) 2013 Cesanta Software Limited
|
||||
* All rights reserved
|
||||
*
|
||||
* This library is dual-licensed: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation. For the terms of this
|
||||
* license, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You are free to use this library under the terms of the GNU General
|
||||
* Public License, but WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU General Public License for more details.
|
||||
*
|
||||
* Alternatively, you can license this library under a commercial
|
||||
* license, as set out in <http://cesanta.com/products.html>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This is a regular expression library that implements a subset of Perl RE.
|
||||
* Please refer to README.md for a detailed reference.
|
||||
*/
|
||||
|
||||
#ifndef SLRE_HEADER_DEFINED
|
||||
#define SLRE_HEADER_DEFINED
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct slre_cap {
|
||||
const char *ptr;
|
||||
int len;
|
||||
};
|
||||
|
||||
|
||||
int slre_match(const char *regexp, const char *buf, int buf_len,
|
||||
struct slre_cap *caps, int num_caps, int flags);
|
||||
|
||||
/* Possible flags for slre_match() */
|
||||
enum { SLRE_IGNORE_CASE = 1 };
|
||||
|
||||
|
||||
/* slre_match() failure codes */
|
||||
#define SLRE_NO_MATCH -1
|
||||
#define SLRE_UNEXPECTED_QUANTIFIER -2
|
||||
#define SLRE_UNBALANCED_BRACKETS -3
|
||||
#define SLRE_INTERNAL_ERROR -4
|
||||
#define SLRE_INVALID_CHARACTER_SET -5
|
||||
#define SLRE_INVALID_METACHARACTER -6
|
||||
#define SLRE_CAPS_ARRAY_TOO_SMALL -7
|
||||
#define SLRE_TOO_MANY_BRANCHES -8
|
||||
#define SLRE_TOO_MANY_BRACKETS -9
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* SLRE_HEADER_DEFINED */
|
BIN
src/slre.o
Normal file
BIN
src/slre.o
Normal file
Binary file not shown.
Reference in New Issue
Block a user