initial commit

2016-01-13 10:05:51 +01:00
commit 05efb38fed
42 changed files with 4331 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,3 @@
 /man/
 /vignettes/
 /Read-and-delete-me
--- a/20
+++ b/20
@@ -0,0 +1,20 @@
 Package: ROBITaxonomy
 Type: Package
 Title: Metabarcoding data biodiversity analysis
 Version: 0.1
 Date: 2012-08-23
 Author: LECA - Laboratoire d'ecologie alpine
 Maintainer: LECA OBITools team <obitools@metabarcoding.org>
 Description: More about what it does (maybe more than one line)
 License: CeCILL v2.0
 LazyLoad: yes
 Roxygen: list(wrap = FALSE)
 Collate:
    'ROBITaxonomy.R'
    'taxonomy.R'
    'basic.R'
    'default.R'
    'distance.R'
    'lca.R'
    'rank.R'
 RoxygenNote: 5.0.1
--- a/28
+++ b/28
@@ -0,0 +1,28 @@
 # Generated by roxygen2: do not edit by hand
 export(default.taxonomy)
 export(distance.taxonomy)
 export(ecofind)
 export(family)
 export(genus)
 export(is.obitools.taxonomy)
 export(is.subcladeof)
 export(kingdom)
 export(length.obitools.taxonomy)
 export(longest.path)
 export(lowest.common.ancestor)
 export(max.obitools.taxonomy)
 export(parent)
 export(path)
 export(rank.list)
 export(read.taxonomy)
 export(scientificname)
 export(species)
 export(superkingdom)
 export(taxid.list)
 export(taxonatrank)
 export(taxonomicrank)
 export(validate)
 exportClasses(obitools.taxonomy)
 exportClasses(obitools.taxonomyOrNULL)
 useDynLib(ROBITaxonomy)
--- a/R/ROBITaxonomy.R
+++ b/R/ROBITaxonomy.R
--- a/R/basic.R
+++ b/R/basic.R
@@ -0,0 +1,359 @@
 #' @include taxonomy.R
 NULL
 #' @export
 setGeneric("scientificname", function(taxonomy,taxid) {
  return(standardGeneric("scientificname"))
 })
 #' Returns the scientific name corresponding to a \emph{NCBI taxid}
 #' 
 #' \code{scientificname} function in package \pkg{\link{ROBITaxonomy}} returns the 
 #' scientific name corresponding to a \emph{NCBI taxid}.
 #' 
 #' @param   taxonomy a \code{\link{obitools.taxonomy}} instance
 #' @param   taxid an integer value or a vector of integer representing NCBI 
 #'          taxonomic identifiers.
 #' @return  The scientific name of the corresponding taxon as a string or a 
 #'          vector of string if the \code{taxid} argument is itself a vector
 #' 
 #' @examples
 #' # load the default taxonomy database include in the ROBITaxonomy library
 #' taxo=default.taxonomy()
 #' 
 #' # build a vector of 6 taxids corresponding to species
 #' sp.taxid=c(7000,7004,7007,7009,7010,7011)
 #' 
 #' # look for the scientific names correponding to these taxids
 #' scientificname(taxo,sp.taxid)
 #' 
 #' @seealso class \code{\linkS4class{obitools.taxonomy}}
 #'          
 #'          
 #' @docType methods
 #' @rdname scientificname-methods
 #' @aliases scientificname-methods,obitools.taxonomy
 #' @author Eric Coissac
 #' 
 setMethod("scientificname", "obitools.taxonomy",function(taxonomy,taxid) {
  getscname = function(t)  {	
    if (is.na(t))
      return(NA)
    else
      return(	.Call('R_get_scientific_name',
                    taxonomy,
                    t,
                    PACKAGE="ROBITaxonomy"))
  }
  taxid = as.integer(taxid)
  sapply(taxid,getscname)
 })
 ######################################################################
 ######################################################################
 #' @export
 setGeneric("parent", function(taxonomy,taxid,name=FALSE) {
  return(standardGeneric("parent"))
 })
 #' Returns the parent taxon corresponding to a \emph{NCBI taxid}
 #' 
 #' \code{parent} function in package \pkg{\link{ROBITaxonomy}} returns the 
 #' parent taxon corresponding to a \emph{NCBI taxid}.
 #' 
 #' @param   taxonomy a \code{\link{obitools.taxonomy}} instance
 #' @param   taxid an integer value or a vector of integer representing NCBI 
 #'          taxonomic identifiers.
 #' @param   name A logical value \code{TRUE} or \code{FALSE} indicating if the 
 #'          method returns a taxid or a scientific name.
 #'                            
 #' @return  \describe{ \item{If \code{name==FALSE}}{the taxid of the 
 #'                            parent taxon as an integer or a vector of 
 #'                            integers if the \code{taxid} argument is itself 
 #'                            a vector} 
 #'                      \item{If \code{name==TRUE}}{the scientific name of the
 #'                            parent taxon as a string or a vector of 
 #'                            string if  the \code{taxid} argument is itself a 
 #'                            vector} }
 #' 
 #' @examples
 #' # load the default taxonomy database include in the ROBITaxonomy library
 #' taxo=default.taxonomy()
 #' 
 #' # build a vector of 6 taxids corresponding to species
 #' sp.taxid=c(7000,7004,7007,7009,7010,7011)
 #' 
 #' # look for the parent taxa correponding to these taxids
 #' parent(taxo,sp.taxid)
 #' 
 #' # same things but scientific names are returned
 #' parent(taxo,sp.taxid,TRUE)
 #' 
 #' @seealso class \code{\linkS4class{obitools.taxonomy}}
 #'          
 #'          
 #' @docType methods
 #' @rdname parent-methods
 #' @aliases parent-methods,obitools.taxonomy
 #' @author Eric Coissac
 #' 
 setMethod("parent", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
  getp = function(t)	 {	
    if (is.na(t))
      return(NA)
    else
      return(.Call('R_get_parent',
                   taxonomy,
                   as.integer(t),
                   name,
                   PACKAGE="ROBITaxonomy"))
  }
  taxid = as.integer(taxid)
  name  = as.logical(name[1])
  sapply(taxid,getp)
 })
 ######################################################################
 ######################################################################
 #' @export
 setGeneric("taxid.list", function(taxonomy) {
  return(standardGeneric("taxid.list"))
 })
 #' Returns the list of all taxids belonging the taxonomy.
 #'  
 #' \code{taxid.list} returns the list of all taxids included in the
 #' instance of the class \code{\linkS4class{obitools.taxonomy}}
 #' 
 #' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
 #' 
 #' @return an \code{integer} vector containing the list of taxids.
 #' 
 #' @examples
 #' # loads the default taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # returns the count of taxa described in the taxonomy
 #' length(taxo)
 #' 
 #' # extracts the list of all valid taxids
 #' good = taxid.list(taxo)
 #' 
 #' # returns the size of the returned list
 #' length(good)
 #' 
 #' @seealso \code{\linkS4class{obitools.taxonomy}}
 #' 
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' @docType methods
 #' @rdname taxid.list-method
 #' @aliases taxid.list
 #' 
 setMethod("taxid.list", "obitools.taxonomy",
          function(taxonomy) {
            return(.Call('R_taxid_list',
                         taxonomy,
                         PACKAGE="ROBITaxonomy"))
          })
 ######################################################################
 ######################################################################
 #' Returns the count of taxa in the taxonomy.
 #'  
 #' \code{length} returns the count of taxa included in the
 #' instance of the class \code{\linkS4class{obitools.taxonomy}}
 #' 
 #' @param x the \code{\linkS4class{obitools.taxonomy}} to use.
 #' 
 #' @return an \code{integer} corresponding to the count of taxa.
 #' 
 #' @examples
 #' # loads the default taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # returns the count of taxa described in the taxonomy
 #' length(taxo)
 #' 
 #' @seealso \code{\link{length}}, \code{\linkS4class{obitools.taxonomy}}
 #' 
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' @export  length.obitools.taxonomy
 #' 
 length.obitools.taxonomy = function(x) 
 {
  return(.Call('R_length_taxonomy',
               x,
               PACKAGE="ROBITaxonomy"))
 }
 ######################################################################
 ######################################################################
 setGeneric('max')  	
 #' Returns the maximum taxid in the taxonomy.
 #'  
 #' \code{length} returns the maximum taxid included in the
 #' instance of the class \code{\linkS4class{obitools.taxonomy}}
 #' 
 #' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
 #' @param na.rm included for compatibility purpose, this parameter as
 #'              no effect on this implementation of \code{max}
 #' 
 #' @return an \code{integer} corresponding to the count of taxa.
 #' 
 #' @examples
 #' # load the default taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # gets the larger taxid of the database
 #' max(taxo)
 #' 
 #' @seealso \code{\link{max}}, \code{\linkS4class{obitools.taxonomy}}
 #' 
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' @export  max.obitools.taxonomy
 #' 
 max.obitools.taxonomy=function(taxonomy,na.rm = FALSE) {
  return(.Call('R_max_taxid',
               taxonomy,
               PACKAGE="ROBITaxonomy"))
 }
 #' @export
 setGeneric("ecofind", function(taxonomy,patterns,rank=NULL,alternative=FALSE) {
  return(standardGeneric("ecofind"))
 })
 #' Returns taxids associated to the names
 #' 
 #' Return the set of taxids having their name matching the given pattern.
 #' 
 #' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
 #' @param patterns one or several regular pattern used to select the the taxa.
 #' @param   rank a \code{character} indicating a taxonomic rank. If not \code{NULL}
 #'               only taxids correponding to this rank are returned.
 #' @param   alternative  A logical value \code{TRUE} or \code{FALSE} indicating 
 #'          if the function must only look for a scientific name.
 #' 
 #' @return if just one pattern is given, an integer vector is returned with the
 #'         corresponding taxids. If a list of patterns is given, the function
 #'         returns a list of integer vectors, each vector containing the taxids
 #'         corresponding to a pattern. The returned list is in the same order 
 #'         than the given patern list.
 #' 
 #' @examples
 #' # load the default taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # retreives the Vertebrata taxid
 #' taxid = ecofind(taxo,"Vertebrata")
 #' 
 #' taxid
 #' scientificname(taxo,taxid)
 #' 
 #' 
 #' taxid = ecofind(taxo,"^Vertebrata$")
 #' 
 #' taxid
 #' scientificname(taxo,taxid)
 #' 
 #'
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' @docType methods
 #' @rdname ecofind-method
 #' @aliases ecofind,obitools.taxonomy
 #' 
 setMethod("ecofind", "obitools.taxonomy",function(taxonomy,patterns,rank=NULL,alternative=FALSE) {
  getp = function(t)   {	
    if (is.na(t))
      return(NA)
    else
      return(unique(.Call('R_ecofind',
                          taxonomy,
                          t,
                          rank,
                          alternative,
                          PACKAGE="ROBITaxonomy")))
  }
  patterns = as.character(patterns)
  taxid=lapply(patterns,getp)
  if (length(taxid)==1)
    taxid=taxid[[1]]
  return(taxid)
 })
 #' @export
 setGeneric("validate", function(taxonomy,taxid) {
  return(standardGeneric("validate"))
 })
 #' Checks that a \emph{taxid} is really present in taxonomy
 #' 
 #' \code{validate} function in package \pkg{\link{ROBITaxonomy}} checks 
 #' that a \emph{taxid} is declared in the considered taxonomy.
 #' 
 #' @param   taxonomy a \code{\link{obitools.taxonomy}} instance
 #' @param   taxid an integer value or a vector of integer representing NCBI 
 #'          taxonomic identifiers.
 #'                            
 #' @return  The taxid if it exists, NA otherwise. If the input taxid is a 
 #'          vector of integer returns an integer vector composed of validated
 #'          taxids and NA values.
 #' 
 #' @examples
 #' # load the default taxonomy database include in the ROBITaxonomy library
 #' taxo=default.taxonomy()
 #' 
 #' # build a vector of 101 taxids 
 #' sp.taxid=c(7000:7100)
 #' 
 #' # checks the list of taxids
 #' validate(taxo,sp.taxid)
 #' 
 #' @seealso class \code{\linkS4class{obitools.taxonomy}}
 #'          
 #'          
 #' @docType methods
 #' @rdname validate-methods
 #' @aliases validate-methods,obitools.taxonomy
 #' @author Eric Coissac
 #' 
 setMethod("validate", "obitools.taxonomy",function(taxonomy,taxid) {
  getp = function(t)   {  
    if (is.na(t))
      return(NA)
    else
      return(.Call('R_validate_taxid',
                   taxonomy,
                   t,
                   PACKAGE="ROBITaxonomy"))
  }
  taxid = as.integer(taxid)
  sapply(taxid,getp)
 })
--- a/R/default.R
+++ b/R/default.R
@@ -0,0 +1,52 @@
 #' @include taxonomy.R
 NULL
 #
 #
 # Manage le loading of the default taxonomy
 #
 #
 .__default__taxonomy__ = NULL
 #' Returns the default taxonomy
 #' 
 #' Returns a \code{\linkS4class{obitools.taxonomy}} instance corresponding
 #' to a NCBI taxonomy included by default in the \pkg{\link{ROBITaxonomy}} package.
 #' 
 #' @return a \code{\linkS4class{obitools.taxonomy}} instance.
 #' 
 #' @examples
 #' 
 #' # Load the default taxonomy
 #' taxo = default.taxonomy()
 #' 
 #' # and use it for requesting a scientific name
 #' scientificname(taxo,7742)
 #' 
 #' @seealso \code{\linkS4class{obitools.taxonomy}}
 #' 
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' @export
 #' 
 default.taxonomy = function() {
  if (is.null(get(".__default__taxonomy__",envir = environment())))
    assign(".__default__taxonomy__", 
           read.taxonomy(paste(system.file("extdata", 
                                           package="ROBITaxonomy"),
                                           'ncbitaxo',
                                        sep='/')), 
           envir=globalenv())
  return(get(".__default__taxonomy__",envir = globalenv()))
 }
 #' @export
 #' 
 is.obitools.taxonomy = function(taxonomy) {
  class(t)[1] == "obitools.taxonomy"
 }
--- a/R/distance.R
+++ b/R/distance.R
@@ -0,0 +1,189 @@
 #' @include taxonomy.R
 NULL
 #' @export
 setGeneric("longest.path", function(taxonomy,taxid) {
  return(standardGeneric("longest.path"))
 })
 #' Returns the longuest path from a taxon.
 #' 
 #' The method \code{longest.path} returns the length of the
 #' path linking a taxid to the farest leaf belonging this taxid.
 #' 
 #' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
 #' 
 #' @param taxid an \code{integer} vector containing the list of taxids.
 #' 
 #' @return an \code{integer} vector containing the list length.
 #' 
 #' @examples
 #' # loads the default taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # returns the longest path in the taxonomy (from the root node)
 #' longest.path(taxo,1)
 #' 
 #' 
 #' @seealso \code{\linkS4class{obitools.taxonomy}}
 #' 
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' @docType methods
 #' @rdname longest.path-method
 #' @aliases longest.path,obitools.taxonomy
 #' 
 setMethod("longest.path", "obitools.taxonomy",
          function(taxonomy,taxid) {
            getp = function(t)   {	
              if (is.na(t))
                return(NA)
              else
                return(.Call('R_longest_path',
                             taxonomy,
                             t,
                             PACKAGE="ROBITaxonomy"))
            }
            taxid = as.integer(taxid)
            sapply(taxid,getp)
          })
 #' @export
 setGeneric("distance.taxonomy", function(taxonomy,taxid1,taxid2=NULL,name=F) {
  return(standardGeneric("distance.taxonomy"))
 })
 #' Computes a distance matrix between taxids
 #' 
 #' The method \code{taxonomy.distance} computes a distance matrix between a
 #' set of taxids. The distance between two taxa is based on the topology of
 #' the taxonomomy tree.
 #' 
 #' \deqn{ d(Taxon_A,Taxon_B) = \frac{longest.path(lca(Taxon_A,Taxon_B))}{max(longest.path(Taxon_A),longest.path(Taxon_B))}}
 #'                                  { longest.path(lca(Taxon_A,Taxon_B)) / max(longest.path(Taxon_A),longest.path(Taxon_B)) }
 #' 
 #' 
 #' @param taxonomy the \code{\linkS4class{obitools.taxonomy}} to use.
 #' 
 #' @param taxid1 an \code{integer} vector containing a list of taxids.
 #' 
 #' @param taxid2 an \code{integer} vector containing a list of taxids.
 #'               If \code{taxid2} is set to \code{NULL} (it's default value)
 #'               then the \code{taxid2} list is considered as equal to 
 #'               \code{taxid1} list. 
 #' @param name  A logical value \code{TRUE} or \code{FALSE} indicating 
 #'          if the method return distance matrix annotated by taxids or 
 #'          by scientific names.
 #'          
 #' @return the distance matrix between taxids specified in the \code{taxid1}
 #'         set and the \code{taxid2} set.
 #'         
 #' @examples
 #' # loads the default taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # build a vector of 6 taxids corresponding to species
 #' sp.taxid=c(7000,7004,7007,7009,7010,7011)
 #' 
 #' # computes the distance matrix between taxids
 #' distance.taxonomy(taxo,sp.taxid)
 #' 
 #' # Same thing but the matrix is annotated by scientific names
 #' distance.taxonomy(taxo,sp.taxid,name=TRUE)
 #' 
 #' @seealso \code{\link{longest.path}}
 #' 
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' @docType methods
 #' @rdname distance.taxonomy-method
 #' @aliases taxonomy.distance,obitools.taxonomy
 #' 
 setMethod("distance.taxonomy", "obitools.taxonomy",
          function(taxonomy,taxid1,taxid2=NULL,name=F) {
            taxdist = function(r)
            {	
              t1=r[1]
              t2=r[2]
              if (is.na(t1) | is.na(t2))
                return(NA)
              p1 = path(taxonomy,t1)
              p2 = path(taxonomy,t2)
              minp = min(length(p1),length(p2))
              common = sum(p1[1:minp] == p2[1:minp])
              lca = p1[common]
              lp = longest.path(taxonomy,lca)
              return(lp/(lp+common))
            }
            multitaxdist=function(t1,t2) {
              apply(data.frame(t1,t2),1,taxdist)
            }
            taxid1 = taxid1[! is.na(validate(taxonomy,taxid1))]
            t1 = path(taxonomy,taxid1)
            same = is.null(taxid2)
            if (same)
            {
              ntaxon = length(taxid1) 
              t2 = t1[unlist(sapply(2:ntaxon,
                                    function(x) x:ntaxon))]
              t1 = t1[rep(1:(ntaxon-1),(ntaxon-1):1)]
            }
            else
            {
              taxid2 = taxid2[! is.na(validate(taxonomy,taxid2))]
              t2 = path(taxonomy,taxid2)
              nt1 = length(taxid1)
              nt2 = length(taxid2)
              t1 = t1[rep(1:nt1,nt2)]
              t2 = t2[rep(1:nt2,rep(nt1,nt2))]
            }
            lmin = mapply(function(a,b) min(length(a),length(b)),
                          t1,
                          t2)
            llca = mapply(function(x,y,l) sum(x[1:l]==y[1:l]),
                          t1,
                          t2,
                          lmin)
            lb   = longest.path(taxonomy,mapply(function(x,y) x[y],t1,llca))
            d    = as.double(lb / (lb + llca))
            if (same) {
              attr(d, "Size") <- ntaxon
              if (name)
                attr(d, "Labels") <- scientificname(taxonomy,taxid1)
              else
                attr(d, "Labels") <- as.character(taxid1)
              attr(d, "Diag") <- FALSE
              attr(d, "Upper") <- FALSE
              attr(d, "method") <- NULL
              attr(d, "call") <- match.call()
              class(d) <- "dist"
            }
            else {
              if (name)
                d = matrix(d,nt1,nt2,
                           dimnames=list(scientificname(taxonomy,taxid1),
                                         scientificname(taxonomy,taxid2)))
              else
                d = matrix(d,nt1,nt2,
                           dimnames=list(as.character(taxid1),
                                         as.character(taxid2)))
            }
            return(d)
          })
--- a/R/lca.R
+++ b/R/lca.R
@@ -0,0 +1,122 @@
 #' @include taxonomy.R
 NULL
 #' @export
 setGeneric("lowest.common.ancestor", function(taxonomy,taxid,threshold=1.0,error=0,name=FALSE) {
  return(standardGeneric("lowest.common.ancestor"))
 })
 #' Computes the lowest common ancestor in the taxonomy tree between a set of taxa
 #' 
 #' The \code{lowest.common.ancestor} function in package \pkg{ROBITaxonomy} computes
 #' the lowest common ancestor of a set of taxids. The lowest common ancestor (LCA)
 #' is the most precise taxonomic group shared by all the considered taxa. Tha
 #' \code{lowest.common.ancestor} function implemented in the \pkg{ROBITaxonomy}
 #' package, considers a fuzzy definition of the LCA as the most precise
 #' taxonomic group shared by a quorum of the considered taxa.
 #' 
 #' @param taxonomy an instance of \code{\linkS4class{obitools.taxonomy}}
 #' @param taxid an integer value or a vector of integer representing NCBI 
 #'   taxonomic identifiers.
 #' @param threshold a numeric value between 0.0 and 1.0 indicating the minimum 
 #' quorum of taxid that must belong the LCA.
 #' @param error an integer value indicating the maximum count of taxids that 
 #' have not to belong the returned taxid. A \code{threshold} below 1.0 have
 #'  priority on the \code{error} parameter.
 #' @param name A logical value \code{TRUE} or \code{FALSE} indicating if the
 #' method return a \emph{taxid} or a scientific name.
 #' 
 #' @return Depending on the value of the \code{name} argument, set by default 
 #'         to \code{FALSE} the method returns :
 #'    \describe{  
 #'           \item{If \code{name==FALSE}}{ the taxid of the taxon corresponding 
 #'             to the LCA as an integer value}
 #'           \item{If \code{name==TRUE}}{ the scientific name of the taxon 
 #'                 corresponding to the LCA as a string}
 #'          }
 #'          
 #' @examples
 #' require(ROBITaxonomy)
 #'
 #' \dontshow{# switch the working directory to the data package directory}
 #' \dontshow{setwd(system.file("extdata", package="ROBITaxonomy"))}
 #'   
 #' # read the taxonomy database
 #'   
 #' taxo=read.taxonomy('ncbitaxo')
 #' 
 #' # build a vector of 6 taxids corresponding to species
 #' 
 #' sp.taxid=c(7000,7004,7007,7009,7010,7011)
 #' 
 #' # look for the lowest common ancestor taxids
 #' 
 #' lowest.common.ancestor(taxo,sp.taxid)
 #' 
 #' # same thing but returns results as a vector of scientific names
 #' lowest.common.ancestor(taxo,sp.taxid,name=TRUE)
 #' 
 #' # If we accept than 2 or 1 taxa do not belong the LCA
 #' lowest.common.ancestor(taxo,sp.taxid,name=TRUE,error=2)
 #' lowest.common.ancestor(taxo,sp.taxid,name=TRUE,error=1)
 #' 
 #' # Partial LCA can also be speciefied as the minimal frequency of
 #' # taxa belonging the LCA 
 #' lowest.common.ancestor(taxo,sp.taxid,name=TRUE,threshold=0.8)
 #' 
 #' @seealso class \code{\linkS4class{obitools.taxonomy}},
 #' and methods \code{\link{path}}, \code{\link{parent}},
 #' 
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' @docType methods
 #' @rdname lowest.common.ancestor-method
 #' @aliases lowest.common.ancestor,obitools.taxonomy
 #' 
 setMethod("lowest.common.ancestor", "obitools.taxonomy",
          function(taxonomy,taxid,threshold=1.0,error=0,name=FALSE) {
            if (threshold != 1.0)
              error=as.integer(floor(length(taxid) * (1-threshold)))
            #
            # Remove nod valid taxid
            #
            taxid = validate(taxonomy,taxid)
            if (any(is.na(taxid)))
              return(NA)
            ntaxid=length(taxid)
            nok = ntaxid - error
            if (ntaxid==1)
              return(taxid)
            allpath  = path(taxonomy,taxid)
            minlength= min(vapply(allpath,length,0))
            lca=NA
            for (i in 1:minlength) {
              n = vapply(allpath,function(x) x[i],0)
              nt = table(n)
              mt = max(nt)
              if (mt >= nok) {
                p = nt[nt==mt]
                if (length(p)==1)
                  lca=as.integer(names(p)[1])
                else
                  break
              }
              else
                  break
            }
            if (name)
              return(scientificname(taxonomy,lca))
            else
              return(lca)
          })
--- a/R/rank.R
+++ b/R/rank.R
@@ -0,0 +1,483 @@
 #' @include taxonomy.R
 NULL
 #' @export
 setGeneric("rank.list", function(taxonomy) {
  return(standardGeneric("rank.list"))
 })
 #' Returns the list of taxonomic ranks
 #' 
 #' The \code{rank.list} function returns the list of all taxonomic
 #' ranks described in the taxonomy
 #' 
 #' @param   taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
 #' 
 #' @return a vector of type \code{character} containing the taxonomic rank names
 #'  
 #' @examples
 #' # read the taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # returns the taxonomic rank for all taxid between 1000 and 1020
 #' rank.list(taxo)
 #' 
 #' @docType methods
 #' @rdname rank.list-methods
 #' @aliases rank.list-methods,obitools.taxonomy
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' 
 setMethod("rank.list", "obitools.taxonomy",
          function(taxonomy) {
            return(.Call('R_rank_list',
                         taxonomy,
                         PACKAGE="ROBITaxonomy"))
          })
 #' @export
 setGeneric("taxonomicrank", function(taxonomy,taxid) {
  return(standardGeneric("taxonomicrank"))
 })
 #' Returns the taxonomic rank associated to a taxid
 #' 
 #' @param   taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
 #' @param   taxid a vector of taxid to analyse
 #'
 #' @return a vector of type \code{character} containing the taxonomic ranks
 #'  
 #' @examples
 #' # read the taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # returns the taxonomic rank for all taxid between 1000 and 1020
 #' taxonomicrank(taxo,1000:1020)
 #' 
 #' @docType methods
 #' @rdname taxonomicrank-methods
 #' @aliases taxonomicrank-methods,obitools.taxonomy
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' 
 setMethod("taxonomicrank", "obitools.taxonomy",function(taxonomy,taxid) {  
  taxid = as.integer(taxid)
  return(.Call('R_get_rank',
               taxonomy,
               taxid,
               PACKAGE="ROBITaxonomy"))
 })
 #' @export
 setGeneric("taxonatrank", function(taxonomy,taxid,rank,name=FALSE) {
  return(standardGeneric("taxonatrank"))
 })
 #' Extracts the taxid at a specified taxonomic rank.
 #' 
 #' The \code{taxonatrank} method of \code{\linkS4class{obitools.taxonomy}} class
 #' returns the \emph{taxid} or the scientific name corresponding 
 #' to a \emph{taxid}.at a specified taxonomic rank
 #' 
 #' @param   taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
 #' @param   taxid a vector of taxid to analyse
 #' @param   rank a \code{character} indicating the desired rank
 #' @param   name  A logical value \code{TRUE} or \code{FALSE} indicating 
 #'          if the method return a taxid or a scientific name.
 #'          
 #' @return  \describe{
 #'   \item{If \code{name==FALSE}}{the taxid of the corresponding 
 #'                                taxon as an integer or a vector of integers 
 #'                                if the \code{taxid} argument is itself 
 #'                                a vector}
 #'   \item{If \code{name==TRUE}}{the scientific name of the corresponding 
 #'                               taxon as a string or a vector of string 
 #'                               if the \code{taxid} argument is itself 
 #'                               a vector}
 #'  }
 #'  
 #' @examples
 #' # read the taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # build a vector of 6 taxids corresponding to species
 #' sp.taxid=c(7000,7004,7007,7009,7010,7011)
 #' 
 #' # look for the subfamily taxids
 #' taxonatrank(taxo,sp.taxid,"subfamily")
 #' 
 #' # same thing but returns results as a vector of scientific names
 #' taxonatrank(taxo,sp.taxid,"subfamily",TRUE)
 #' 
 #' @seealso class \code{\linkS4class{obitools.taxonomy}},
 #' and methods \code{\link{species}},\code{\link{genus}},
 #' \code{\link{family}},\code{\link{kingdom}},
 #' \code{\link{superkingdom}}
 #' 
 #' @docType methods
 #' @rdname taxonatrank-methods
 #' @aliases taxonatrank-methods,obitools.taxonomy
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' 
 setMethod("taxonatrank", "obitools.taxonomy",function(taxonomy,taxid,rank,name=FALSE) {
  getsp = function(t)   {	
    if (is.na(t[1]) | is.na(t[2]))
      return(NA)
    else
      return(.Call('R_findtaxonatrank',taxonomy,
                   as.integer(t[1]),
                   t[2],
                   name,
                   PACKAGE="ROBITaxonomy"))
  }
  rank  = as.character(rank)
  name  = as.logical(name[1])
  apply(data.frame(taxid,rank),1,getsp)
 })
 #' @export
 setGeneric("species", function(taxonomy,taxid,name=FALSE) {
  return(standardGeneric("species"))
 })
 #' Extracts the species corresponding to a taxid
 #' 
 #' The \code{species} method of \code{\linkS4class{obitools.taxonomy}} class
 #' returns the \emph{taxid} or the scientific name of the species corresponding 
 #' to a \emph{taxid}.
 #' 
 #' @param   taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
 #' @param   taxid a vector of taxid to analyse
 #' @param   name  A logical value \code{TRUE} or \code{FALSE} indicating 
 #'          if the method return a taxid or a scientific name.
 #'          
 #' @return  \describe{
 #'   \item{If \code{name==FALSE}}{the taxid of the corresponding 
 #'                                taxon as an integer or a vector of integers 
 #'                                if the \code{taxid} argument is itself 
 #'                                a vector}
 #'   \item{If \code{name==TRUE}}{the scientific name of the corresponding 
 #'                               taxon as a string or a vector of string 
 #'                               if the \code{taxid} argument is itself 
 #'                               a vector}
 #'  }
 #'  
 #' @examples
 #' # read the taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # build a vector of 6 taxids corresponding to species
 #' sp.taxid=c(7000,7004,7007,7009,7010,7011)
 #' 
 #' # look for the species taxids
 #' species(taxo,sp.taxid)
 #' 
 #' # same thing but returns results as a vector of scientific names
 #' species(taxo,sp.taxid,TRUE)
 #' 
 #' @seealso class \code{\linkS4class{obitools.taxonomy}},
 #' and methods \code{\link{taxonatrank}},\code{\link{genus}},
 #' \code{\link{family}},\code{\link{kingdom}},
 #' \code{\link{superkingdom}}
 #' 
 #' @docType methods
 #' @rdname species-methods
 #' @aliases species-methods,obitools.taxonomy
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' 
 setMethod("species", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
  getsp = function(t)	 {	
    if (is.na(t))
      return(NA)
    else
      return(.Call('R_get_species',
                   taxonomy,
                   t,
                   name,
                   PACKAGE="ROBITaxonomy"))
  }
  taxid = as.integer(taxid)
  name  = as.logical(name[1])
  sapply(taxid,getsp)
 })
 #' @export
 setGeneric("genus", function(taxonomy,taxid,name=FALSE) {
  return(standardGeneric("genus"))
 })
 #' Extracts the genus corresponding to a taxid
 #' 
 #' The \code{genus} method of \code{\linkS4class{obitools.taxonomy}} class
 #' returns the \emph{taxid} or the scientific name of the genus corresponding 
 #' to a \emph{taxid}.
 #' 
 #' @param   taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
 #' @param   taxid a vector of taxid to analyse
 #' @param   name  A logical value \code{TRUE} or \code{FALSE} indicating 
 #'          if the method return a taxid or a scientific name.
 #'          
 #' @return  \describe{
 #'   \item{If \code{name==FALSE}}{the taxid of the corresponding 
 #'                                taxon as an integer or a vector of integers 
 #'                                if the \code{taxid} argument is itself 
 #'                                a vector}
 #'   \item{If \code{name==TRUE}}{the scientific name of the corresponding 
 #'                               taxon as a string or a vector of string 
 #'                               if the \code{taxid} argument is itself 
 #'                               a vector}
 #'  }
 #'  
 #' @examples
 #' # read the taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # build a vector of 6 taxids corresponding to species
 #' sp.taxid=c(7000,7004,7007,7009,7010,7011)
 #' 
 #' # look for the genus taxids
 #' genus(taxo,sp.taxid)
 #' 
 #' # same thing but returns results as a vector of scientific names
 #' genus(taxo,sp.taxid,TRUE)
 #' 
 #' @seealso class \code{\linkS4class{obitools.taxonomy}},
 #' and methods \code{\link{species}},\code{\link{taxonatrank}},
 #' \code{\link{family}},\code{\link{kingdom}},
 #' \code{\link{superkingdom}}
 #' 
 #' @docType methods
 #' @rdname genus-methods
 #' @aliases genus-methods,obitools.taxonomy
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' 
 setMethod("genus", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
  getsp = function(t)	 {	
    if (is.na(t))
      return(NA)
    else
      return(.Call('R_get_genus',
                   taxonomy,
                   t,
                   name,
                   PACKAGE="ROBITaxonomy"))
  }
  taxid = as.integer(taxid)
  name  = as.logical(name[1])
  sapply(taxid,getsp)
 })
 #' @export
 setGeneric("family", function(taxonomy,taxid,name=FALSE) {
  return(standardGeneric("family"))
 })
 #' Extracts the family corresponding to a taxid
 #' 
 #' The \code{family} method of \code{\linkS4class{obitools.taxonomy}} class
 #' returns the \emph{taxid} or the scientific name of the family corresponding 
 #' to a \emph{taxid}.
 #' 
 #' @param   taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
 #' @param   taxid a vector of taxid to analyse
 #' @param   name  A logical value \code{TRUE} or \code{FALSE} indicating 
 #'          if the method return a taxid or a scientific name.
 #'          
 #' @return  \describe{
 #'   \item{If \code{name==FALSE}}{the taxid of the corresponding 
 #'                                taxon as an integer or a vector of integers 
 #'                                if the \code{taxid} argument is itself 
 #'                                a vector}
 #'   \item{If \code{name==TRUE}}{the scientific name of the corresponding 
 #'                               taxon as a string or a vector of string 
 #'                               if the \code{taxid} argument is itself 
 #'                               a vector}
 #'  }
 #'  
 #' @examples
 #' # read the taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # build a vector of 6 taxids corresponding to species
 #' sp.taxid=c(7000,7004,7007,7009,7010,7011)
 #' 
 #' # look for the family taxids
 #' family(taxo,sp.taxid)
 #' 
 #' # same thing but returns results as a vector of scientific names
 #' family(taxo,sp.taxid,TRUE)
 #' 
 #' @seealso class \code{\linkS4class{obitools.taxonomy}},
 #' and methods \code{\link{species}},\code{\link{genus}},
 #' \code{\link{taxonatrank}},\code{\link{kingdom}},
 #' \code{\link{superkingdom}}
 #' 
 #' @docType methods
 #' @rdname family-methods
 #' @aliases family-methods,obitools.taxonomy
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' 
 setMethod("family", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
  getsp = function(t)	 {	
    if (is.na(t))
      return(NA)
    else
      return(.Call('R_get_family',
                   taxonomy,
                   t,
                   name,
                   PACKAGE="ROBITaxonomy"))
  }
  taxid = as.integer(taxid)
  name  = as.logical(name[1])
  sapply(taxid,getsp)
 })
 #' @export
 setGeneric("kingdom", function(taxonomy,taxid,name=FALSE) {
  return(standardGeneric("kingdom"))
 })
 #' Extracts the kingdom corresponding to a taxid
 #' 
 #' The \code{kingdom} method of \code{\linkS4class{obitools.taxonomy}} class
 #' returns the \emph{taxid} or the scientific name of the kingdom corresponding 
 #' to a \emph{taxid}.
 #' 
 #' @param   taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
 #' @param   taxid a vector of taxid to analyse
 #' @param   name  A logical value \code{TRUE} or \code{FALSE} indicating 
 #'          if the method return a taxid or a scientific name.
 #'          
 #' @return  \describe{
 #'   \item{If \code{name==FALSE}}{the taxid of the corresponding 
 #'                                taxon as an integer or a vector of integers 
 #'                                if the \code{taxid} argument is itself 
 #'                                a vector}
 #'   \item{If \code{name==TRUE}}{the scientific name of the corresponding 
 #'                               taxon as a string or a vector of string 
 #'                               if the \code{taxid} argument is itself 
 #'                               a vector}
 #'  }
 #'  
 #' @examples
 #' # read the taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # build a vector of 6 taxids corresponding to species
 #' sp.taxid=c(7000,7004,7007,7009,7010,7011)
 #' 
 #' # look for the kingdom taxids
 #' kingdom(taxo,sp.taxid)
 #' 
 #' # same thing but returns results as a vector of scientific names
 #' kingdom(taxo,sp.taxid,TRUE)
 #' 
 #' @seealso class \code{\linkS4class{obitools.taxonomy}},
 #' and methods \code{\link{species}},\code{\link{genus}},
 #' \code{\link{family}},\code{\link{taxonatrank}},
 #' \code{\link{superkingdom}}
 #' 
 #' @docType methods
 #' @rdname kingdom-methods
 #' @aliases kingdom-methods,obitools.taxonomy
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' 
 setMethod("kingdom", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
  getsp = function(t)	 {	
    if (is.na(t))
      return(NA)
    else
      return(.Call('R_get_kingdom',
                   taxonomy,
                   t,
                   name,
                   PACKAGE="ROBITaxonomy"))
  }
  taxid = as.integer(taxid)
  name  = as.logical(name[1])
  sapply(taxid,getsp)
 })
 #' @export
 setGeneric("superkingdom", function(taxonomy,taxid,name=FALSE) {
  return(standardGeneric("superkingdom"))
 })
 #' Extracts the superkingdom corresponding to a taxid
 #' 
 #' The \code{superkingdom} method of \code{\linkS4class{obitools.taxonomy}} class
 #' returns the \emph{taxid} or the scientific name of the superkingdom corresponding 
 #' to a \emph{taxid}.
 #' 
 #' @param   taxonomy a \code{\linkS4class{obitools.taxonomy}} instance
 #' @param   taxid a vector of taxid to analyse
 #' @param   name  A logical value \code{TRUE} or \code{FALSE} indicating 
 #'          if the method return a taxid or a scientific name.
 #'          
 #' @return  \describe{
 #'   \item{If \code{name==FALSE}}{the taxid of the corresponding 
 #'                                taxon as an integer or a vector of integers 
 #'                                if the \code{taxid} argument is itself 
 #'                                a vector}
 #'   \item{If \code{name==TRUE}}{the scientific name of the corresponding 
 #'                               taxon as a string or a vector of string 
 #'                               if the \code{taxid} argument is itself 
 #'                               a vector}
 #'  }
 #'  
 #' @examples
 #' # read the taxonomy database
 #' taxo=default.taxonomy()
 #' 
 #' # build a vector of 6 taxids corresponding to species
 #' sp.taxid=c(7000,7004,7007,7009,7010,7011)
 #' 
 #' # look for the superkingdom taxids
 #' superkingdom(taxo,sp.taxid)
 #' 
 #' # same thing but returns results as a vector of scientific names
 #' superkingdom(taxo,sp.taxid,TRUE)
 #' 
 #' @seealso class \code{\linkS4class{obitools.taxonomy}},
 #' and methods \code{\link{species}},\code{\link{genus}},
 #' \code{\link{family}},\code{\link{kingdom}},
 #' \code{\link{taxonatrank}}
 #' 
 #' @docType methods
 #' @rdname superkingdom-methods
 #' @aliases superkingdom-methods,obitools.taxonomy
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' 
 setMethod("superkingdom", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
  getsp = function(t)	 {	
    if (is.na(t))
      return(NA)
    else
      return(.Call('R_get_superkingdom',
                   taxonomy,
                   t,
                   name,
                   PACKAGE="ROBITaxonomy"))
  }
  taxid = as.integer(taxid)
  name  = as.logical(name[1])
  sapply(taxid,getsp)
 })
--- a/R/taxonomy.R
+++ b/R/taxonomy.R
@@ -0,0 +1,216 @@
 #' @include ROBITaxonomy.R
 #' @useDynLib ROBITaxonomy
 NULL
 #' Gives access to a taxonomy preformated by OBITools
 #' 
 #' A S4 class describing a taxonomy. It allows access to 
 #' taxonomy formated for OBITools.
 #'
 #' @references \describe{
 #'  \item{NCBI Taxonomy : }{\url{http://www.ncbi.nlm.nih.gov/taxonomy}}
 #'  \item{OBITools : }{\url{http://metabarcoding/obitools/doc}}
 #'  }
 #'  
 #' @seealso \code{\link{read.taxonomy}}
 #'  
 #' @name obitools.taxonomy
 #' @rdname obitools-taxonomy-class
 #' @keywords taxonomy
 #' @author Eric Coissac
 #' @exportClass obitools.taxonomy
 #' 
 setClass("obitools.taxonomy",
         #
         # Attribute declaration
         #
         # data.frame containing the counts of reads per samples
         #      1 samples per line
         #      1 sequence per column
         representation(
           # An external pointer structure to 
           # the C taxonomy structure
           pointer    = "externalptr",
           # the name of the database on the hard disk
           dbname     = 'character',
           # the working directory when the taxonomy
           # object is created. 
           # This inforation combined with bname allows
           # to reload taxonomy from disk
           workingdir = 'character',
           # Indicate if the taxonomy is saved in a file
           # Taxonomy created in R or modified in R are
           # not saved ==> This have to be take into 
           # consideration but how ???
           saved = 'logical'
         ),
         #
         # Check object structure 
         #
         validity = function(object) { 			
           return(TRUE)
         }
 )
 #' obitools.taxonomy constructor
 #' 
 #' --> this constructor have not to be called directly
 #'     use the read.obitools.taxonomy function to
 #'     create a new instance of taxonomy
 #'
 #' @docType methods
 #' @rdname initialize-methods-obitools.taxonomy
 #' @aliases initialize-methods,obitools.taxonomy
 setMethod("initialize",
          "obitools.taxonomy",
          function(.Object, pointer,dbname,workingdir,saved) {
            .Object@pointer    <- pointer
            .Object@dbname     <- dbname
            .Object@workingdir <- workingdir
            .Object@saved      <- saved
            validObject(.Object) ## valide l'objet
            return(.Object)
          })
 #' @exportClass obitools.taxonomyOrNULL
 setClassUnion("obitools.taxonomyOrNULL",c("obitools.taxonomy","NULL"))
 #' @export
 setGeneric("path", function(taxonomy,taxid,name=FALSE) {
  return(standardGeneric("path"))
 })
 setMethod("path", "obitools.taxonomy",function(taxonomy,taxid,name=FALSE) {
  getp = function(t)	 {	
    if (is.na(t))
      return(NA)
    else
    {
      path=c()
      t=.Call('R_validate_taxid',
              taxonomy,
              as.integer(t),
              PACKAGE="ROBITaxonomy")
      if (is.na(t))
        return(NA)
      repeat {
        if (name)
          path = c(scientificname(taxonomy,t),path)
        else
          path = c(t,path)
        t = .Call('R_get_parent',
                  taxonomy,
                  t,
                  FALSE,
                  PACKAGE="ROBITaxonomy")
        if (is.na(t))
          break
      }
      return(path)
    }
  }
  taxid=as.integer(taxid)
  name=as.logical(name)
  p = lapply(taxid,getp)
  d = dim(p)
  if (!is.null(d))
    if (d[2]==1)
      p = as.vector(p)
  return(p)
 })
 #' @export
 setGeneric("is.subcladeof", function(taxonomy,taxid,parent) {
  return(standardGeneric("is.subcladeof"))
 })
 setMethod("is.subcladeof", "obitools.taxonomy",function(taxonomy,taxid,parent) {
  taxid = as.integer(taxid)
  parent= as.integer(parent)
  return(.Call('R_is_under_taxon',
               taxonomy,
               taxid,
               parent,
               PACKAGE="ROBITaxonomy"))
 })
 build.taxonomy = function(pointer,dbname,workingdir,saved) {
  rd <- new('obitools.taxonomy',
            pointer=pointer,
            dbname=dbname,
            workingdir=workingdir,
            saved=saved
  )
  return(rd)
 }
 #' Reads a taxonomy
 #' 
 #' \code{read.taxonomy} reads a taxonomy formated by OBITools.
 #' NCBI taxonomy can be download from the NCBI FTP site in taxdump format.
 #' The taxdump must be formated using the obitaxonomy command from OBITools
 #' before being used in R. A OBITools formated taxonomy is composed of 3 files 
 #' with the same prefix name and suffixes .tdx, .rdx, .ndx, two extra files 
 #' suffixed .adx and .ldx can also be present. 
 #' 
 #' @param  dbname A character string containing the file name of the database
 #' 
 #' @return an instance of the class \code{\linkS4class{obitools.taxonomy}}
 #' 
 #' @examples
 #' 
 #' \dontshow{# switch the working directory to the data package directory}
 #' \dontshow{setwd(system.file("extdata", package="ROBITaxonomy"))}
 #'
 #' # read the taxonomy ncbi
 #' ncbi = read.taxonomy("ncbitaxo")
 #' 
 #' # and use it for requesting a scientific name
 #' scientificname(ncbi,7742)
 #' 
 #' @seealso \code{\linkS4class{obitools.taxonomy}}
 #' 
 #' @author Eric Coissac
 #' @keywords taxonomy
 #' @export
 read.taxonomy = function(dbname) {
  t <- .Call('R_read_taxonomy',dbname,TRUE,PACKAGE="ROBITaxonomy")	
  return(build.taxonomy(t,dbname,getwd(),TRUE))
 }
--- a/ROBITaxonomy.Rproj
+++ b/ROBITaxonomy.Rproj
@@ -0,0 +1,20 @@
 Version: 1.0
 RestoreWorkspace: Default
 SaveWorkspace: Default
 AlwaysSaveHistory: Default
 EnableCodeIndexing: Yes
 UseSpacesForTab: Yes
 NumSpacesForTab: 2
 Encoding: UTF-8
 RnwWeave: knitr
 LaTeX: pdfLaTeX
 AutoAppendNewline: Yes
 BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
 PackageRoxygenize: rd,collate,namespace
--- a/inst/extdata/.DS_Store
+++ b/inst/extdata/.DS_Store
--- a/inst/extdata/ncbitaxo.adx
+++ b/inst/extdata/ncbitaxo.adx
--- a/inst/extdata/ncbitaxo.ndx
+++ b/inst/extdata/ncbitaxo.ndx
--- a/inst/extdata/ncbitaxo.rdx
+++ b/inst/extdata/ncbitaxo.rdx
--- a/inst/extdata/ncbitaxo.tdx
+++ b/inst/extdata/ncbitaxo.tdx
--- a/src/ROBITaxonomy.so
+++ b/src/ROBITaxonomy.so
--- a/src/ecoError.c
+++ b/src/ecoError.c
@@ -0,0 +1,24 @@
 #include "ecoPCR.h"
 #include <stdio.h>
 #include <stdlib.h>
 /*
 * print the message given as argument and exit the program
 * @param error		error number	
 * @param message 	the text explaining what's going on
 * @param filename	the file source where the program failed
 * @param linenumber	the line where it has failed
 * filename and linenumber are written at pre-processing 
 * time by a macro
 */
 void ecoError(int32_t errorcode,
              const char* message,
              const char * filename,
              int linenumber)
 {
  error("Error %d in file %s line %d : %s",
                 errorcode,
 	               filename,
 	               linenumber,
 	               message);
 }
--- a/src/ecoError.o
+++ b/src/ecoError.o
--- a/src/ecoIOUtils.c
+++ b/src/ecoIOUtils.c
@@ -0,0 +1,122 @@
 #include "ecoPCR.h"
 #include <stdio.h>
 #include <stdlib.h>
 #define SWAPINT32(x)     ((((x) << 24) & 0xFF000000) | (((x) <<  8) & 0xFF0000) | \
                         (((x) >>  8) & 0xFF00)     | (((x) >> 24) & 0xFF))
 int32_t is_big_endian()
 {
 	int32_t i=1;
 	return (int32_t)((char*)&i)[0];
 }
 int32_t swap_int32_t(int32_t i)
 {
 	return SWAPINT32(i);
 }
 /**
 * Read part of the file
 * @param	*f	the database
 * @param	recordSize the size to be read
 * 
 * @return	buffer
 */
 void *read_ecorecord(FILE *f,int32_t *recordSize)
 {
 	static void *buffer    =NULL;
 	int32_t      buffersize=0;
 	int32_t      read;
 	if (!recordSize)
 		ECOERROR(ECO_ASSERT_ERROR,
 		         "recordSize cannot be NULL");
 	read = fread(recordSize,
 	      		 1,
 	      		 sizeof(int32_t),
 	             f);
 	if (feof(f))
 		return NULL;
 	if (read != sizeof(int32_t))
 		ECOERROR(ECO_IO_ERROR,"Reading record size error");
 	if (is_big_endian())
 		*recordSize=swap_int32_t(*recordSize);
 	if (buffersize < *recordSize)
 	{
 		if (buffer)
 			buffer = ECOREALLOC(buffer,*recordSize,
 			                    "Increase size of record buffer");
 		else
 			buffer = ECOMALLOC(*recordSize,
 			                    "Allocate record buffer");
 	}
 	read = fread(buffer,
 	             1,
 				 *recordSize,
 				 f);
 	if (read != *recordSize)
 		ECOERROR(ECO_IO_ERROR,"Reading record data error");
 	return buffer;	 
 }
 /**
 * Open the database and check it's readable
 * @param 	filename 		name of the database (.sdx, .rdx, .tbx)
 * @param 	sequencecount	buffer - pointer to variable storing the number of occurence    
 * @param 	abort_on_open_error		 	boolean to define the behaviour in case of error 
 * 										while opening the database
 * @return 	FILE type
 **/
 FILE *open_ecorecorddb(const char *filename,
                       int32_t    *sequencecount,
                       int32_t    abort_on_open_error)
 {
    FILE        *f;
 	int32_t      read;
 	f = fopen(filename,"rb");
 	if (!f)
 		{
 			if (abort_on_open_error)
 		 		ECOERROR(ECO_IO_ERROR,"Cannot open file");
 		 	else
 		 	{
 		 		*sequencecount=0;
 		 		return NULL;
 		 	}
 		}
 	read = fread(sequencecount,
 	      		 1,
 	      		 sizeof(int32_t),
 	      		 f);
 	if (read != sizeof(int32_t))
 		ECOERROR(ECO_IO_ERROR,"Reading record size error");
 	if (is_big_endian())
 		*sequencecount=swap_int32_t(*sequencecount);
 	return f;                  
 }
--- a/src/ecoIOUtils.o
+++ b/src/ecoIOUtils.o
--- a/src/ecoMalloc.c
+++ b/src/ecoMalloc.c
@@ -0,0 +1,76 @@
 #include "ecoPCR.h"
 #include <stdlib.h>
 static int eco_log_malloc = 0;
 void    eco_trace_memory_allocation()
 {
 	eco_log_malloc=1;
 }
 void    eco_untrace_memory_allocation()
 {
 	eco_log_malloc=0;
 }
 void   *eco_malloc(int32_t chunksize,
                   const char *error_message,
                   const char *filename,
                   int32_t    line)
 {
 	void * chunk;
 	chunk = calloc(1,chunksize);
 	if (!chunk)
 		ecoError(ECO_MEM_ERROR,error_message,filename,line);
 	if (eco_log_malloc)
 		REprintf("Memory segment located at %p of size %d is allocated (file : %s [%d])",
 			       chunk,
 			       chunksize,
 			       filename,
 			       line);
 	return chunk;
 }
 void   *eco_realloc(void *chunk,
                    int32_t newsize,
                    const char *error_message,
                    const char *filename,
                    int32_t    line)
 {
 	void *newchunk;
 	newchunk = realloc(chunk,newsize);
 	if (!newchunk)
 		ecoError(ECO_MEM_ERROR,error_message,filename,line);
 	if (eco_log_malloc)
 		REprintf("Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
 			        chunk,
 			        newchunk,
 			        newsize,
 			        filename,
 			        line);
 	return newchunk;	
 }
 void    eco_free(void *chunk,
                 const char *error_message,
                 const char *filename,
                 int32_t    line)
 {
 	free(chunk);
 	if (eco_log_malloc)
 		REprintf("Memory segment %p is released => %s (file : %s [%d])",
 			       chunk,
 			       error_message,
 			       filename,
 			       line);
 }
--- a/src/ecoMalloc.o
+++ b/src/ecoMalloc.o
--- a/src/ecoPCR.h
+++ b/src/ecoPCR.h
@@ -0,0 +1,285 @@
 #ifndef ECOPCR_H_
 #define ECOPCR_H_
 #include <stdio.h>
 #include <inttypes.h>
 #include <R.h>
 #include <Rinternals.h>
 #include <Rdefines.h>
 //#ifndef H_apat
 //#include "../libapat/apat.h"
 //#endif
 /*****************************************************
 * 
 *  Data type declarations
 * 
 *****************************************************/
 /*
 * 
 *  Sequence types
 * 
 */
 typedef struct {
 	int32_t  taxid;
 	char     AC[20];
 	int32_t  DE_length;
 	int32_t  SQ_length;
 	int32_t  CSQ_length;
 	char     data[1];
 } ecoseqformat_t;
 typedef struct {
 	int32_t taxid;
 	int32_t SQ_length;
 	char    *AC;
 	char    *DE;
 	char    *SQ;
 } ecoseq_t;
 /*
 * 
 * Taxonomy taxon types
 * 
 */
 typedef struct {
 	int32_t  taxid;
 	int32_t  rank;
 	int32_t	 parent;
 	int32_t  namelength;
 	char     name[1];
 } ecotxformat_t;
 typedef struct ecotxnode {
 	int32_t           taxid;
 	int32_t           rank;
 	int32_t  		      farest;
 	struct ecotxnode  *parent;
 	char              *name;
 } ecotx_t;
 typedef struct {
 	int32_t count;
 	int32_t maxtaxid;
  int32_t buffersize;
 	ecotx_t taxon[1];
 } ecotxidx_t;
 /*
 * 
 * Taxonomy rank types
 * 
 */
 typedef struct {
 	int32_t count;
 	char*   label[1];
 } ecorankidx_t;
 /*
 * 
 * Taxonomy name types
 * 
 */
 typedef struct {
 	int32_t is_scientificname;
 	int32_t  namelength;
 	int32_t  classlength;
 	int32_t  taxid;
 	char     names[1];	
 } econameformat_t;
 typedef struct {
 	char 	*name;
 	char 	*classname;
 	int32_t is_scientificname;
 	struct ecotxnode  *taxon;
 } econame_t;
 typedef struct {
 	int32_t count;
 	econame_t   names[1];
 } econameidx_t;
 typedef struct {
 	ecorankidx_t *ranks;
 	econameidx_t *names;
 	ecotxidx_t   *taxons;
 } ecotaxonomy_t;
 /*****************************************************
 * 
 *  Function declarations
 * 
 *****************************************************/
 /*
 * 
 * Low level system functions
 * 
 */
 int32_t is_big_endian();
 int32_t swap_int32_t(int32_t);
 void   *eco_malloc(int32_t chunksize,
                   const char *error_message,
                   const char *filename,
                   int32_t    line);
 void   *eco_realloc(void *chunk,
                    int32_t chunksize,
                    const char *error_message,
                    const char *filename,
                    int32_t    line);
 void    eco_free(void *chunk,
                 const char *error_message,
                 const char *filename,
                 int32_t    line);
 void    eco_trace_memory_allocation();
 void    eco_untrace_memory_allocation();
 #define ECOMALLOC(size,error_message) \
 	    eco_malloc((size),(error_message),__FILE__,__LINE__)
 #define ECOREALLOC(chunk,size,error_message) \
        eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__)
 #define ECOFREE(chunk,error_message) \
        eco_free((chunk),(error_message),__FILE__,__LINE__)
 /*
 * 
 * Error managment
 * 
 */
 void ecoError(int32_t,const char*,const char *,int);
 #define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__)
 #define ECO_IO_ERROR       (1)
 #define ECO_MEM_ERROR      (2)
 #define ECO_ASSERT_ERROR   (3)
 #define ECO_NOTFOUND_ERROR (4)
 /*
 * 
 * Low level Disk access functions
 * 
 */
 FILE *open_ecorecorddb(const char *filename,
                       int32_t    *sequencecount,
                       int32_t    abort_on_open_error);
 void *read_ecorecord(FILE *,int32_t *recordSize);
 /* 
 *   Read function in internal binary format
 */
 FILE             *open_ecoseqdb(const char *filename,
                                int32_t    *sequencecount);
 ecoseq_t         *readnext_ecoseq(FILE *);
 ecorankidx_t     *read_rankidx(const char *filename);
 econameidx_t     *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy);
 	/**
 	 * Read taxonomy data as formated by the ecoPCRFormat.py script.
 	 * 
 	 * This function is normaly uses internaly by the read_taxonomy
 	 * function and should not be called directly.
 	 * 
 	 * @arg filename  path to the *.tdx file of the reformated db
 	 * 
 	 * @return pointer to a taxonomy index structure
 	 */
 ecotxidx_t       *read_taxonomyidx(const char *filename,const char *filename2);
 ecotaxonomy_t    *read_taxonomy(const char *prefix,int32_t readAlternativeName);
 ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, int32_t rankidx);
 ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid);
 int eco_isundertaxon(ecotx_t *taxon, int other_taxid);
 ecoseq_t *ecoseq_iterator(const char *prefix);
 ecoseq_t *new_ecoseq();
 int32_t   delete_ecoseq(ecoseq_t *);
 ecoseq_t *new_ecoseq_with_data( char *AC,
 								char *DE,
 								char *SQ,
 								int32_t   taxid
 								);
 int32_t delete_taxon(ecotx_t *taxon);
 int32_t delete_rankidx(ecorankidx_t *rankidx);
 int32_t delete_nameidx(econameidx_t *nameidx);
 int32_t delete_taxonomy(ecotxidx_t *index);
 int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy);
 int32_t rank_index(const char* label,ecorankidx_t* ranks);
 //int32_t  delete_apatseq(SeqPtr pseq);
 //PatternPtr buildPattern(const char *pat, int32_t error_max);
 //PatternPtr complementPattern(PatternPtr pat);
 //
 //SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular);
 //char *ecoComplementPattern(char *nucAcSeq);
 //char *ecoComplementSequence(char *nucAcSeq);
 //char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end);
 ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
 ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
 ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
 ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
 ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
 //int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid);
 //int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid);
 ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
 #endif /*ECOPCR_H_*/
--- a/src/ecodna.c
+++ b/src/ecodna.c
@@ -0,0 +1,156 @@
 #include <string.h>
 #include "ecoPCR.h"
 /*
 * @doc: DNA alphabet (IUPAC)
 */
 #define LX_BIO_DNA_ALPHA   "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
 /*
 * @doc: complementary DNA alphabet (IUPAC)
 */
 #define LX_BIO_CDNA_ALPHA  "TVGHEFCDIJMLKNOPQYSAABWXRZ#!]["
 static char sNuc[]     = LX_BIO_DNA_ALPHA;
 static char sAnuc[]    = LX_BIO_CDNA_ALPHA;
 static char LXBioBaseComplement(char nucAc);
 static char *LXBioSeqComplement(char *nucAcSeq);
 static char *reverseSequence(char *str,char isPattern);
 /* ---------------------------- */
 char LXBioBaseComplement(char nucAc)
 {
    char *c;
    if ((c = strchr(sNuc, nucAc)))
        return sAnuc[(c - sNuc)];
    else
        return nucAc;
 }
 /* ---------------------------- */
 char *LXBioSeqComplement(char *nucAcSeq)
 {
    char *s;
    for (s = nucAcSeq ; *s ; s++)
        *s = LXBioBaseComplement(*s);
    return nucAcSeq;
 }
 char *reverseSequence(char *str,char isPattern)
 {
        char *sb, *se, c;
        if (! str)
            return str;
        sb = str;
        se = str + strlen(str) - 1;
        while(sb <= se) {
           c    = *sb;
          *sb++ = *se;
          *se-- = c;
        }
 		sb = str;
 		se = str + strlen(str) - 1;
 		if (isPattern)
 			for (;sb < se; sb++)
 			{
 				if (*sb=='#')
 				{
 					if (((se - sb) > 2) && (*(sb+2)=='!'))
 					{
 						*sb='!';
 						sb+=2;
 						*sb='#';
 					}
 					else
 					{
 						*sb=*(sb+1);
 						sb++;
 						*sb='#';
 					}
 				}
 				else if (*sb=='!')
 					{
 						*sb=*(sb-1);
 						*(sb-1)='!';
 					}
 			}
        return str;
 }
 char *ecoComplementPattern(char *nucAcSeq)
 {
    return reverseSequence(LXBioSeqComplement(nucAcSeq),1);
 }
 char *ecoComplementSequence(char *nucAcSeq)
 {
    return reverseSequence(LXBioSeqComplement(nucAcSeq),0);
 }
 char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end)
 /*
   extract subsequence from nucAcSeq [begin,end[
 */
 {
 	static char *buffer  = NULL;
 	static int32_t buffSize= 0;
 	int32_t length;
 	if (begin < end)
 	{
 		length = end - begin;
 		if (length >= buffSize)
 		{
 			buffSize = length+1;
 			if (buffer)
 				buffer=ECOREALLOC(buffer,buffSize,
 						   	      "Error in reallocating sub sequence buffer");
 			else
 				buffer=ECOMALLOC(buffSize,
 				          		 "Error in allocating sub sequence buffer");
 		}
 		strncpy(buffer,nucAcSeq + begin,length);
 		buffer[length]=0;
 	}
 	else
 	{
 		length = end + strlen(nucAcSeq) - begin;
 		if (length >= buffSize)
 		{
 			buffSize = length+1;
 			if (buffer)
 				buffer=ECOREALLOC(buffer,buffSize,
 						   	      "Error in reallocating sub sequence buffer");
 			else
 				buffer=ECOMALLOC(buffSize,
 				          		 "Error in allocating sub sequence buffer");
 		}
 		strncpy(buffer,nucAcSeq+begin,length - end);
 		strncpy(buffer+(length-end),nucAcSeq ,end);
 		buffer[length]=0;
 	}
 	return buffer;
 }
--- a/src/ecodna.o
+++ b/src/ecodna.o
--- a/src/ecofilter.c
+++ b/src/ecofilter.c
@@ -0,0 +1,20 @@
 #include "ecoPCR.h"
 int eco_is_taxid_included(	ecotaxonomy_t *taxonomy, 
 							int32_t *restricted_taxid, 
 							int32_t tab_len, 
 							int32_t taxid)
 {
 	int i;
 	ecotx_t *taxon;
 	taxon = eco_findtaxonbytaxid(taxonomy, taxid);
 	if (taxon)
 		for (i=0; i < tab_len; i++)
 			if ( (taxon->taxid == restricted_taxid[i]) ||
 				 (eco_isundertaxon(taxon, restricted_taxid[i])) )
 				return 1;
 	return 0;
 }
--- a/src/ecofilter.o
+++ b/src/ecofilter.o
--- a/src/econame.c
+++ b/src/econame.c
@@ -0,0 +1,86 @@
 #include "ecoPCR.h"
 #include <string.h>
 #include <stdlib.h>
 static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy);
 econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy)
 {
 	int32_t      		count;
 	FILE         		*f;
 	econameidx_t		*indexname;
 	int32_t      		i;
 	f = open_ecorecorddb(filename,&count,0);
 	if (f==NULL)
 		return NULL;
 	indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names");
 	indexname->count=count;
 	for (i=0; i < count; i++){
 		readnext_econame(f,(indexname->names)+i,taxonomy);
 	}
 	return indexname;
 }
 int32_t delete_nameidx(econameidx_t *nameidx)
 {
 	size_t i;
 	if (nameidx) {
 		for (i=0; i < nameidx->count; i++) {
 			if (nameidx->names[i].name)
 				ECOFREE(nameidx->names[i].name,
 						"Desallocate name");
 			if (nameidx->names[i].classname)
 				ECOFREE(nameidx->names[i].classname,
 						"Desallocate classname");
 		}
 		ECOFREE(nameidx,"Desallocate name index");
 		return 0;
 	}
 	return 1;
 }
 econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy)
 {
 	econameformat_t *raw;
 	int32_t  rs;
 	raw = read_ecorecord(f,&rs);
 	if (!raw)
 		return NULL;
 	if (is_big_endian())
 	{
 		raw->is_scientificname 	= swap_int32_t(raw->is_scientificname);
 		raw->namelength 	    = swap_int32_t(raw->namelength);
 		raw->classlength        = swap_int32_t(raw->classlength);
 		raw->taxid  	        = swap_int32_t(raw->taxid); 
 	}
 	name->is_scientificname=raw->is_scientificname;
 	name->name   	= ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name");
 	strncpy(name->name,raw->names,raw->namelength);
 	name->name[raw->namelength]=0;
 	name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname");
 	strncpy(name->classname,(raw->names+raw->namelength),raw->classlength);
 	name->classname[raw->classlength]=0;
 	name->taxon = taxonomy->taxons->taxon + raw->taxid;
 	return name;
 }
--- a/src/econame.o
+++ b/src/econame.o
--- a/src/ecorank.c
+++ b/src/ecorank.c
@@ -0,0 +1,75 @@
 #include "ecoPCR.h"
 #include <string.h>
 #include <stdlib.h>
 static int compareRankLabel(const void *label1, const void *label2);
 ecorankidx_t     *read_rankidx(const char *filename)
 {
 	int32_t      count;
 	FILE         *f;
 	ecorankidx_t *index;
 	int32_t      i;
 	int32_t      rs;
 	char         *buffer;
 	f = open_ecorecorddb(filename,&count,0);
 	if (f==NULL)
 		return NULL;
 	index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1),
 	                                  "Allocate rank index");
 	index->count=count;                                 
 	for (i=0; i < count; i++)
 		{
 			buffer = read_ecorecord(f,&rs);
 			index->label[i]=(char*) ECOMALLOC(rs+1,
 			                                  "Allocate rank label");
 			strncpy(index->label[i],buffer,rs);
 		}
 	return index;
 }
 int32_t delete_rankidx(ecorankidx_t *rankidx)
 {
 	size_t i;
 	if (rankidx) {
 		for (i=0; i < rankidx->count; i++)
 			if (rankidx->label[i])
 				ECOFREE(rankidx->label[i],
 						"Desallocate rank label");
 		ECOFREE(rankidx,"Desallocate rank index");
 		return 0;
 	}
 	return 1;
 }
 int32_t rank_index(const char* label,ecorankidx_t* ranks)
 {
 	char **rep;
 	rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel);
 	if (rep)
 		return rep-ranks->label;
 //	else
 //		ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found");
 	return -1;
 }
 int compareRankLabel(const void *label1, const void *label2)
 {
 	return strcmp((const char*)label1,*(const char**)label2);
 }
--- a/src/ecorank.o
+++ b/src/ecorank.o
--- a/src/ecoseq.c
+++ b/src/ecoseq.c
@@ -0,0 +1,231 @@
 #include "ecoPCR.h"
 #include <stdlib.h>
 #include <string.h>
 #include <zlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <ctype.h>
 static FILE *open_seqfile(const char *prefix,int32_t index);
 ecoseq_t *new_ecoseq()
 {
 	void *tmp;
 	tmp = ECOMALLOC(sizeof(ecoseq_t),"Allocate new ecoseq structure");
 	return tmp;
 }
 int32_t delete_ecoseq(ecoseq_t * seq)
 {
 	if (seq)
 	{
 		if (seq->AC)
 			ECOFREE(seq->AC,"Free sequence AC");
 		if (seq->DE)
 			ECOFREE(seq->DE,"Free sequence DE");
 		if (seq->SQ)
 			ECOFREE(seq->SQ,"Free sequence SQ");
 		ECOFREE(seq,"Free sequence structure");
 		return 0;
 	}
 	return 1;
 }
 ecoseq_t *new_ecoseq_with_data( char *AC,
 								char *DE,
 								char *SQ,
 								int32_t   taxid_idx
 								)
 {
 	ecoseq_t *tmp;
 	int32_t lstr;
 	tmp = new_ecoseq();
 	tmp->taxid=taxid_idx;
 	if (AC)
 		{
 			lstr =strlen(AC);
 			tmp->AC=ECOMALLOC((lstr+1) * sizeof(char),
 			                  "Allocate sequence accession");
 			strcpy(tmp->AC,AC);
 		}
 	if (DE)
 		{
 			lstr =strlen(DE);
 			tmp->DE=ECOMALLOC((lstr+1) * sizeof(char),
 			                  "Allocate sequence definition");
 			strcpy(tmp->DE,DE);
 		}
 	if (SQ)
 		{
 			lstr =strlen(SQ);
 			tmp->SQ=ECOMALLOC((lstr+1) * sizeof(char),
 			                  "Allocate sequence data");
 			strcpy(tmp->SQ,SQ);
 		}
 	return tmp;
 }
 /**
 * ?? used ??
 **/
 FILE *open_ecoseqdb(const char *filename,
                    int32_t    *sequencecount)
 {
 	return open_ecorecorddb(filename,sequencecount,1);
 }
 ecoseq_t *readnext_ecoseq(FILE *f)
 {
 	char     *compressed=NULL;
 	ecoseqformat_t *raw;
 	ecoseq_t *seq;
 	int32_t  comp_status;
 	unsigned long int seqlength;
 	int32_t  rs;
 	char *c;
 	int32_t i;
 	raw = read_ecorecord(f,&rs);
 	if (!raw)
 		return NULL;
 	if (is_big_endian())
 	{
 		raw->CSQ_length = swap_int32_t(raw->CSQ_length);
 		raw->DE_length  = swap_int32_t(raw->DE_length);
 		raw->SQ_length  = swap_int32_t(raw->SQ_length);
 		raw->taxid      = swap_int32_t(raw->taxid);
 	}
 	seq = new_ecoseq();
 	seq->taxid = raw->taxid;
    seq->AC    = ECOMALLOC(strlen(raw->AC) +1,
                           "Allocate Sequence Accesion number");
    strncpy(seq->AC,raw->AC,strlen(raw->AC));
    seq->DE    = ECOMALLOC(raw->DE_length+1,
                           "Allocate Sequence definition");
    strncpy(seq->DE,raw->data,raw->DE_length);
 	seqlength = seq->SQ_length = raw->SQ_length;
    compressed = raw->data + raw->DE_length;
    seq->SQ = ECOMALLOC(seqlength+1,
                        "Allocate sequence buffer");
 /*
    comp_status = uncompress((unsigned char*)seq->SQ,
                             &seqlength,
                             (unsigned char*)compressed,
                             raw->CSQ_length);
 */
    if (comp_status != Z_OK)
    	ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data");
    for (c=seq->SQ,i=0;i<seqlength;c++,i++)
    	*c=toupper(*c);
 	return seq;
 }
 /**
 * Open the sequences database (.sdx file)
 * @param	prefix	name of the database (radical without extension)
 * @param	index 	integer
 *
 * @return	file object
 */
 FILE *open_seqfile(const char *prefix,int32_t index)
 {
 	char           filename_buffer[1024];
 	int32_t        filename_length;
 	FILE           *input;
 	int32_t        seqcount;
 	filename_length = snprintf(filename_buffer,
 								1023,
 	                           "%s_%03d.sdx",
 	                           prefix,
 	                           index);
 		//	fprintf(stderr,"# Coucou %s\n",filename_buffer);
 	if (filename_length >= 1024)
 		ECOERROR(ECO_ASSERT_ERROR,"file name is too long");
 	filename_buffer[filename_length]=0;
 	input=open_ecorecorddb(filename_buffer,&seqcount,0);
 	if (input)
 		REprintf("# Reading file %s containing %d sequences...\n",
 				     filename_buffer,
 				     seqcount);
 	return input;
 }
 ecoseq_t *ecoseq_iterator(const char *prefix)
 {
 	static FILE    *current_seq_file= NULL;
 	static int32_t current_file_idx = 1;
 	static char    current_prefix[1024];
 	ecoseq_t       *seq;
 	if (prefix)
 	{
 		current_file_idx = 1;
 		if (current_seq_file)
 			fclose(current_seq_file);
 		strncpy(current_prefix,prefix,1023);
 		current_prefix[1023]=0;
 		current_seq_file = open_seqfile(current_prefix,
 		 							    current_file_idx);
 		if (!current_seq_file)
 			return NULL;
 	}
 	seq = readnext_ecoseq(current_seq_file);
 	if (!seq && feof(current_seq_file))
 	{
 		current_file_idx++;
 		fclose(current_seq_file);
 		current_seq_file = open_seqfile(current_prefix,
 		 							    current_file_idx);
 		if (current_seq_file)
 			seq = readnext_ecoseq(current_seq_file);
 	}
 	return seq;
 }
--- a/src/ecoseq.o
+++ b/src/ecoseq.o
--- a/src/ecotax.c
+++ b/src/ecotax.c
@@ -0,0 +1,420 @@
 #include "ecoPCR.h"
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <R.h>
 #ifndef MAX
 #define MAX(x,y) (((x)>(y)) ? (x):(y))
 #endif
 static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon);
 /** 
 * Open the taxonomy database 
 * @param	pointer to the database (.tdx file) 
 * @return	a ecotxidx_t structure 
 */
 ecotxidx_t     *read_taxonomyidx(const char *filename,const char *filename2)
 {
 	int32_t      count;
 	int32_t      count2;
 	FILE         *f;
 	FILE         *f2;
 	ecotxidx_t *index;
 	struct ecotxnode  *t;
 	int32_t      i;
 	int32_t      j;
 	f  = open_ecorecorddb(filename,&count,0);
 	if (f==NULL) return NULL;
 	f2 = open_ecorecorddb(filename2,&count2,0);
 	index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count+count2-1),
 	                                  "Allocate taxonomy");
 	index->count=count+count2;
    index->buffersize = index->count;
 	index->maxtaxid=0;
 	REprintf("Readind %d taxa...\n",count);
 	for (i=0; i < count; i++){
 		readnext_ecotaxon(f,&(index->taxon[i]));
 		index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
 		index->taxon[i].parent->farest=0;
 		if (index->taxon[i].taxid > index->maxtaxid)
 			index->maxtaxid=index->taxon[i].taxid;
 	}					
 	if (count2>0)
 		REprintf("Readind %d local taxa...\n",count2);
 	else
 		REprintf("No local taxon\n");
 	count = index->count;
 	for (; i < count; i++){
 		readnext_ecotaxon(f2,&(index->taxon[i]));
 		index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
 		index->taxon[i].parent->farest=0;
 		if (index->taxon[i].taxid > index->maxtaxid)
 			index->maxtaxid=index->taxon[i].taxid;
 	}
 	REprintf("Computing longest branches...\n",count);
 	for (i=0; i < count; i++){
 		t=index->taxon+i;
 		if (t->farest==-1)
 		{
 			t->farest=0;
            while(t->parent != t)
            {
            	j = t->farest + 1;
            	if (j > t->parent->farest)
            	{
            		t->parent->farest = j;
            		t=t->parent;
            	}
            	else
            		t=index->taxon;
            }
 		}
 	}
 	return index;
 }
 int32_t delete_taxonomy(ecotxidx_t *index)
 {
 	int32_t i;
 	if (index)
 	{
 		for (i=0; i< index->count; i++)
 			if (index->taxon[i].name)
 				ECOFREE(index->taxon[i].name,"Free scientific name");
 		ECOFREE(index,"Free Taxonomy");
 		return 0;
 	}
 	return 1;
 }
 int32_t delete_taxon(ecotx_t *taxon)
 {
 	if (taxon)
 	{
 		if (taxon->name)
 			ECOFREE(taxon->name,"Free scientific name");
 		ECOFREE(taxon,"Free Taxon");
 		return 0;
 	}
 	return 1;
 }
 /**
 * Read the database for a given taxon a save the data 
 * into the taxon structure(if any found)
 * @param	*f	pointer to FILE type returned by fopen
 * @param	*taxon	pointer to the structure
 * 
 * @return	a ecotx_t structure if any taxon found else NULL 
 */
 ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
 {
 	ecotxformat_t *raw;
 	int32_t  rs;
 	raw = read_ecorecord(f,&rs);
 	if (!raw)
 		return NULL;
 	if (is_big_endian())
 	{
 		raw->namelength = swap_int32_t(raw->namelength);
 		raw->parent     = swap_int32_t(raw->parent);
 		raw->rank       = swap_int32_t(raw->rank);
 		raw->taxid      = swap_int32_t(raw->taxid); 
 	}
 	taxon->parent = (ecotx_t*)((size_t)raw->parent);
 	taxon->taxid  = raw->taxid;
 	taxon->rank   = raw->rank;
 	taxon->farest = -1;
 	taxon->name   = ECOMALLOC((raw->namelength+1) * sizeof(char),
 	                          "Allocate taxon scientific name");
 	strncpy(taxon->name,raw->name,raw->namelength);
 	return taxon;
 }
 ecotaxonomy_t    *read_taxonomy(const char *prefix,int32_t readAlternativeName)
 {
 	ecotaxonomy_t *tax;
 	char          *filename;
 	char          *filename2;
 	int           buffsize;
 	tax = ECOMALLOC(sizeof(ecotaxonomy_t),
 	                "Allocate taxonomy structure");
 	tax->ranks =NULL;
 	tax->taxons=NULL;
 	tax->names =NULL;
 	buffsize = strlen(prefix)+10;
 	filename = ECOMALLOC(buffsize,
 	                     "Allocate filename");
 	filename2= ECOMALLOC(buffsize,
 	                     "Allocate filename");
 	snprintf(filename,buffsize,"%s.rdx",prefix);
 	tax->ranks = read_rankidx(filename);
 	if (tax->ranks == NULL)
 	{
 		ECOFREE(filename,"Desallocate filename 1");
 		ECOFREE(filename2,"Desallocate filename 2");
 		delete_ecotaxonomy(tax);
 		return NULL;
 	}
 	snprintf(filename,buffsize,"%s.tdx",prefix);
 	snprintf(filename2,buffsize,"%s.ldx",prefix);
 	tax->taxons = read_taxonomyidx(filename,filename2);
 	if (tax->taxons == NULL)
 	{
 		ECOFREE(filename,"Desallocate filename 1");
 		ECOFREE(filename,"Desallocate filename 2");
 		delete_ecotaxonomy(tax);
 		return NULL;
 	}
 	if (readAlternativeName)
 	{
  	   snprintf(filename,buffsize,"%s.ndx",prefix);
 	   tax->names=read_nameidx(filename,tax);
 	}
 	else
 	   tax->names=NULL;
 	ECOFREE(filename,"Desallocate filename 1");
 	ECOFREE(filename2,"Desallocate filename 2");
 	return tax;
 }
 int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy)
 {
 	if (taxonomy)
 	{
 		delete_rankidx(taxonomy->ranks);
 		delete_nameidx(taxonomy->names);
 		delete_taxonomy(taxonomy->taxons);
 		ECOFREE(taxonomy,"Free taxonomy structure");
 		return 0;
 	}
 	return 1;
 }
 ecotx_t *eco_findtaxonatrank(ecotx_t *taxon,
                                int32_t rankidx)
 {
 	ecotx_t *current_taxon;
 	ecotx_t *next_taxon;
 	current_taxon = taxon;
 	next_taxon    = current_taxon->parent;
 	while ((current_taxon!=next_taxon) &&  // I' am the root node
 		   (current_taxon->rank!=rankidx))
 		   {
 		   	current_taxon = next_taxon;
 		   	next_taxon    = current_taxon->parent;
 		   }
 	if (current_taxon->rank==rankidx)
 		return current_taxon;
 	else
 		return NULL;
 }
 static int bcomptaxon (const void * ptaxid, const void * ptaxon) {
  ecotx_t    *current_taxon = (ecotx_t*)ptaxon;
  int32_t    taxid=(int32_t)((size_t)ptaxid);
  return taxid - current_taxon->taxid;
 }
 /**
 * Get back information concerning a taxon from a taxonomic id
 * @param 	*taxonomy 	the taxonomy database
 * @param	taxid		the taxonomic id 
 * 
 * @result	a ecotx_t structure containing the taxonimic information  
 **/
 ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, 
 							  int32_t taxid)
 {
 	ecotx_t    *current_taxon;
 	int32_t     taxoncount;
  taxoncount=taxonomy->taxons->count;
  current_taxon = (ecotx_t*)  bsearch((const void *)((size_t)taxid), 
                                      (const void *)taxonomy->taxons->taxon, 
                                      taxoncount, 
                                      sizeof(ecotx_t), 
                                      bcomptaxon);
 	return current_taxon;	
 }
 /**
 * Find out if taxon is son of other taxon (identified by its taxid)
 * @param	*taxon son 		taxon
 * @param	parent_taxid 	taxonomic id of the other taxon
 * 
 * @return 	1 is the other taxid math a parent taxid, else 0
 **/
 int eco_isundertaxon(ecotx_t *taxon, 
 						int other_taxid)
 {
 	ecotx_t *next_parent;
 	next_parent = taxon->parent;	
 	while ( (other_taxid != next_parent->taxid) && 
 			(strcmp(next_parent->name, "root")) )
 	{
 		next_parent = next_parent->parent;
 	}
 	if (other_taxid == next_parent->taxid)
 		return 1;
 	else
 		return 0;
 }
 ecotx_t *eco_getspecies(ecotx_t *taxon,
 						ecotaxonomy_t *taxonomy)
 {
 	static ecotaxonomy_t *tax=NULL;
 	static int32_t		 rankindex=-1;
 	if (taxonomy && tax!=taxonomy)
 	{
 		rankindex = rank_index("species",taxonomy->ranks);
 		tax=taxonomy;
 	}
 	if (!tax || rankindex < 0)
 		ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
 	return eco_findtaxonatrank(taxon,rankindex);
 }
 ecotx_t *eco_getgenus(ecotx_t *taxon,
 						ecotaxonomy_t *taxonomy)
 {
 	static ecotaxonomy_t *tax=NULL;
 	static int32_t		 rankindex=-1;
 	if (taxonomy && tax!=taxonomy)
 	{
 		rankindex = rank_index("genus",taxonomy->ranks);
 		tax=taxonomy;
 	}
 	if (!tax || rankindex < 0)
 		ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
 	return eco_findtaxonatrank(taxon,rankindex);
 }
 ecotx_t *eco_getfamily(ecotx_t *taxon,
 						ecotaxonomy_t *taxonomy)
 {
 	static ecotaxonomy_t *tax=NULL;
 	static int32_t		 rankindex=-1;
 	if (taxonomy && tax!=taxonomy)
 	{
 		rankindex = rank_index("family",taxonomy->ranks);
 		tax=taxonomy;
 	}
 	if (!tax || rankindex < 0)
 		ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
 	return eco_findtaxonatrank(taxon,rankindex);
 }
 ecotx_t *eco_getkingdom(ecotx_t *taxon,
 						ecotaxonomy_t *taxonomy)
 {
 	static ecotaxonomy_t *tax=NULL;
 	static int32_t		 rankindex=-1;
 	if (taxonomy && tax!=taxonomy)
 	{
 		rankindex = rank_index("kingdom",taxonomy->ranks);
 		tax=taxonomy;
 	}
 	if (!tax || rankindex < 0)
 		ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
 	return eco_findtaxonatrank(taxon,rankindex);
 }
 ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
 						ecotaxonomy_t *taxonomy)
 {
 	static ecotaxonomy_t *tax=NULL;
 	static int32_t		 rankindex=-1;
 	if (taxonomy && tax!=taxonomy)
 	{
 		rankindex = rank_index("superkingdom",taxonomy->ranks);
 		tax=taxonomy;
 	}
 	if (!tax || rankindex < 0)
 		ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
 	return eco_findtaxonatrank(taxon,rankindex);
 }
--- a/src/ecotax.o
+++ b/src/ecotax.o
--- a/src/robitax.c
+++ b/src/robitax.c
@@ -0,0 +1,845 @@
 /*
 * robitax.c
 *
 *  Created on: 17 janv. 2013
 *      Author: coissac
 */
 #include "robitax.h"
 #include <unistd.h>
 //#include <regex.h>
 #include "slre.h"
 /**
 * Return a pointeur to an obitools taxonomy C structure
 * from an R instance of taxonomy.obitools
 *
 * The function checks if the pointer stored in the R object is set
 * to NULL. In this case this means that we have to load the taxonomy
 * from the disk.
 *
 * @param taxonomy an R object
 * @type  taxonomy SEXP
 *
 * @return a pointer to the C structure
 * @rtype  ecotaxonomy_t *
 */
 ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy)
 {
 	char *pwd;
 	SEXP pointer;
 	SEXP rclass;
 	SEXP rdir;
 	SEXP rfile;
 	ecotaxonomy_t *ptax;
 	const char *class;
 	const char *file;
 	const char *dir;
  int saved;
    if (!IS_S4_OBJECT(Rtaxonomy) )
        error("argument not obitools.taxonomy instance");
 	// We get the class name and compare it to "taxonomy.obitools"
    rclass = getAttrib(Rtaxonomy, R_ClassSymbol);
    class = CHAR(asChar(rclass));
    if (strcmp(class,"obitools.taxonomy"))
        error("argument not obitools.taxonomy instance");
    pointer = R_do_slot(Rtaxonomy,mkString("pointer"));
    saved = LOGICAL(R_do_slot(Rtaxonomy,mkString("saved")))[0];
    ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(pointer);
    // If the external pointer is set to NULL we have to load
    // the taxonomy from file
    if (ptax==NULL && saved)
    {
    	pwd = getcwd(NULL,0);
    	rfile = R_do_slot(Rtaxonomy,mkString("dbname"));
    	file  = CHAR(asChar(rfile));
    	rdir  = R_do_slot(Rtaxonomy,mkString("workingdir"));
    	dir   = CHAR(asChar(rdir));
    	chdir(dir);
    	ptax = read_taxonomy(file,1);
    	R_SetExternalPtrAddr(pointer,(void*)ptax);
    	chdir(pwd);
    	free(pwd);
    }
    if (ptax==NULL && ! saved)
      error("The taxonomy instance is no more valid and must be rebuilt");
    return ptax;
 }
 SEXP R_delete_taxonomy(SEXP Rtaxonomy)
 {
 	ecotaxonomy_t *ptax;
 //	SEXP pointer;
    ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(Rtaxonomy);
    (void) delete_ecotaxonomy(ptax);
    // Clear the external pointer
    R_ClearExternalPtr(Rtaxonomy);
    return R_NilValue;
 }
 SEXP R_read_taxonomy(SEXP filename, SEXP altenative)
 {
 	int   alt;
 	const char* file;
 	SEXP  Rtax;
    if (! isString(filename))
        error("filename not character");
    file = CHAR(STRING_ELT(filename, 0));
    if (! isLogical(altenative))
        error("altenative not logical");
    alt = LOGICAL(altenative)[0];
    ecotaxonomy_t *taxonomy = read_taxonomy(file,alt);
    if (! taxonomy)
    	error("Cannot open the taxonomy database");
 	Rtax = PROTECT(R_MakeExternalPtr(taxonomy, mkString("ROBITools NCBI Taxonomy pointer"), R_NilValue));
 	R_RegisterCFinalizerEx(Rtax, (R_CFinalizer_t)R_delete_taxonomy,TRUE);
    UNPROTECT(1);
 	return Rtax;
 }
 SEXP R_get_scientific_name(SEXP Rtaxonomy,SEXP Rtaxid)
 {
 	ecotx_t *taxon;
 	ecotaxonomy_t *ptax;
 	int taxid;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
 		return ScalarString(R_NaString);
 		// error("unkown taxid");
 	return mkString(taxon->name);
 }
 SEXP R_get_rank(SEXP Rtaxonomy,SEXP Rtaxid)
 {
 	ecotx_t *taxon;
 	ecotaxonomy_t *ptax;
 	int *taxid;
  int ntaxid;
  int i;
  SEXP results;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    ntaxid = length(Rtaxid);
    results = PROTECT(allocVector(STRSXP, ntaxid));
    taxid = INTEGER(Rtaxid);
    for (i=0; i < ntaxid; i++)
    {
      if (taxid[i]== NA_INTEGER || taxid[i] <= 0)
        SET_STRING_ELT(results, i, R_NaString);
      else {
        taxon = eco_findtaxonbytaxid(ptax, taxid[i]);
        if (!taxon)
          SET_STRING_ELT(results, i, R_NaString);
        else
          SET_STRING_ELT(results, i, mkChar(ptax->ranks->label[taxon->rank]));
      }
    }
  UNPROTECT(1);
 	return results;
 }
 SEXP R_findtaxonatrank(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rrank, SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 	const char *rank;
 	int   rankidx;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isString(Rrank))
        error("rank not a string");
    rank=CHAR(STRING_ELT(Rrank,0));
    rankidx=rank_index(rank,ptax->ranks);
    if (rankidx < 0)
        error("unkown rank name");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = eco_findtaxonatrank(taxon,rankidx);
 	if (!rep)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_get_species(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = eco_getspecies(taxon,ptax);
 	if (!rep)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_get_genus(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = eco_getgenus(taxon,ptax);
 	if (!rep)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_get_family(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = eco_getfamily(taxon,ptax);
 	if (!rep)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_get_kingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = eco_getkingdom(taxon,ptax);
 	if (!rep)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_get_superkingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = eco_getsuperkingdom(taxon,ptax);
 	if (!rep)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_get_parent(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = taxon->parent;
 	if (rep->taxid==taxid)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_validate_taxid(SEXP Rtaxonomy,SEXP Rtaxid)
 {
 	ecotx_t *taxon;
 	ecotaxonomy_t *ptax;
 	int taxid;
 //	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
    	return ScalarInteger(R_NaInt);
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
 		return ScalarInteger(R_NaInt);
 	else
 		return ScalarInteger(taxon->taxid);
 }
 SEXP R_is_under_taxon(SEXP Rtaxonomy, SEXP Rtaxid, SEXP Rparent)
 {
 	ecotx_t *taxon;
 	ecotaxonomy_t *ptax;
 	int taxid;
  int *ptaxid;
  int ntaxid;
 	int parent;
 	SEXP rep;
  int* prep;
  size_t i;
 //	SEXP isunder;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rparent))
        error("parent not integer");
    if (GET_LENGTH(Rparent) > 1)
        error("parent must have a length equal to one.");
    parent = *INTEGER(Rparent);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    ntaxid = GET_LENGTH(Rtaxid);
    rep = PROTECT(NEW_LOGICAL(ntaxid));
    prep=LOGICAL(rep);
    ptaxid = INTEGER_POINTER(Rtaxid); 
    if (parent > 0) {
        taxon = eco_findtaxonbytaxid(ptax, parent);
        if (taxon) {
          for (i=0; i < ntaxid; i++) {
            if (ptaxid[i] <= 0)
              prep[i]=NA_LOGICAL;
            else {
              taxon = eco_findtaxonbytaxid(ptax, ptaxid[i]);
              prep[i] = eco_isundertaxon(taxon, parent);            
            }  
          }          
        }
    }    
    else
    {
      for (i=0; i < ntaxid; i++)
        prep[i]=NA_LOGICAL;
    }
  unprotect(1);
 	return rep;
 }
 SEXP R_longest_path(SEXP Rtaxonomy,SEXP Rtaxid)
 {
 	ecotx_t *taxon;
 	ecotaxonomy_t *ptax;
 	int taxid;
 //	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (taxid <= 0)
    	return ScalarInteger(R_NaInt);
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
 		return ScalarInteger(R_NaInt);
 	else
 		return ScalarInteger(taxon->farest);
 }
 SEXP R_rank_list(SEXP Rtaxonomy)
 {
 	int nrank;
 	int i;
 	ecotaxonomy_t *ptax;
 	SEXP rNames;
    ptax = getTaxPointer(Rtaxonomy);
    nrank = ptax->ranks->count;
    rNames = PROTECT(allocVector(STRSXP, nrank));
 	for (i=0; i < nrank;i++)
 		SET_STRING_ELT(rNames, i, mkChar(ptax->ranks->label[i]));
 	UNPROTECT(1);
 	return rNames;
 }
 SEXP R_taxid_list(SEXP Rtaxonomy)
 {
 	int ntaxid;
 	int i;
 	ecotaxonomy_t *ptax;
 	SEXP rTaxids;
    ptax = getTaxPointer(Rtaxonomy);
    ntaxid  = ptax->taxons->count;
    rTaxids = PROTECT(allocVector(INTSXP, ntaxid));
 	for (i=0; i < ntaxid;i++)
 		INTEGER(rTaxids)[i]=ptax->taxons->taxon[i].taxid;
 	UNPROTECT(1);
 	return rTaxids;
 }
 SEXP R_max_taxid(SEXP Rtaxonomy)
 {
 //	int nrank;
 //	int i;
 	ecotaxonomy_t *ptax;
 //	SEXP rNames;
    ptax = getTaxPointer(Rtaxonomy);
    return ScalarInteger(ptax->taxons->maxtaxid);
 }
 SEXP R_length_taxonomy(SEXP Rtaxonomy)
 {
 	ecotaxonomy_t *ptax;
    ptax = getTaxPointer(Rtaxonomy);
    return ScalarInteger(ptax->taxons->count);
 }
 SEXP R_ecofind(SEXP Rtaxonomy, SEXP Rpattern, SEXP Rrank, SEXP Ralternative)
 {
  ecotaxonomy_t *ptax;
  econame_t		*name;
  char*     pattern=NULL;
 	int				re_match;
  SEXP      taxids;
  int32_t*  buffer;
  int32_t		tax_count	= 0;
  size_t 		j 		= 0;
  int32_t		rankfilter 	= 1;
  int*      ptaxid;
  char			*rankname=NULL;
  int32_t			nummatch 	= 0;
  int32_t         alternative = 0;
  size_t    bsize;
  ptax = getTaxPointer(Rtaxonomy);
  tax_count = ptax->taxons->count;
  if (! isString(Rpattern))
      error("pattern not a string");
  pattern= (char*) CHAR(STRING_ELT(Rpattern,0));
  if (! isNull(Rrank))
  {
    if (! isString(Rrank))
      error("rank not a string");
    rankname= (char*) CHAR(STRING_ELT(Rrank,0));
  }
  if (! isLogical(Ralternative))
      error("rank not a logical");
  alternative = LOGICAL(Ralternative)[0];
 	nummatch=0;
  buffer = (int32_t*) malloc(100 * sizeof(int32_t));
  bsize=100;
  if (alternative && ptax->names!=NULL)
 	  for (j=0,name=ptax->names->names;
 			  j < ptax->names->count;
 			  name++,j++)
 	  {
 		  if(rankname)
 			  rankfilter = !(strcmp(rankname,ptax->ranks->label[name->taxon->rank]));
  	  re_match = slre_match(pattern, name->name, 
                            strlen(name->name), 
                            NULL, 0, 
                            SLRE_IGNORE_CASE);
  	  if (re_match > 0 && rankfilter)
 		  {
 			  buffer[nummatch]=name->taxon->taxid;
 			  nummatch++;
 			  if (nummatch==bsize) {
 				  bsize*=2;
 				  buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t));
 				  if (buffer==0)
 				  {
 					  // regfree(&re_preg);
 					  error("Cannot allocate memory for the taxid list");
 				  }
 			  }
 		  }
 	  }
  else
 	  for (j=0; j < ptax->taxons->count;j++)
 	  {
 		  if(rankname)
 			  rankfilter = !(strcmp(rankname,ptax->ranks->label[ptax->taxons->taxon[j].rank]));
 //		  re_match = regexec (&re_preg, ptax->taxons->taxon[j].name, 0, NULL, 0);
      re_match = slre_match(pattern, ptax->taxons->taxon[j].name, 
                            strlen(ptax->taxons->taxon[j].name), 
                            NULL, 0, 
                            SLRE_IGNORE_CASE);
 //  	  if (!re_match && rankfilter)
  	  if (re_match > 0 && rankfilter)
 		  {
 			  buffer[nummatch]=ptax->taxons->taxon[j].taxid;
 			  nummatch++;
 			  if (nummatch==bsize) {
 				  bsize*=2;
 				  buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t));
 				  if (buffer==0)
 				  {
 					  // regfree(&re_preg);
 					  error("Cannot allocate memory for the taxid list");
 				  }
 			  }
 		  }
 	  }
   	//regfree(&re_preg);
    taxids = PROTECT(NEW_INTEGER(nummatch));
    ptaxid = INTEGER(taxids);
    for (j=0; j < nummatch; j++)
      ptaxid[j]=buffer[j];
    free(buffer);
    UNPROTECT(1);
    return taxids;
 }
--- a/src/robitax.h
+++ b/src/robitax.h
@@ -0,0 +1,6 @@
 #include "ecoPCR.h"
 ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
 SEXP R_delete_taxonomy(SEXP Rtaxonomy);
--- a/src/robitax.o
+++ b/src/robitax.o
--- a/src/slre.c
+++ b/src/slre.c
@@ -0,0 +1,433 @@
 /*
 * Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
 * Copyright (c) 2013 Cesanta Software Limited
 * All rights reserved
 *
 * This library is dual-licensed: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation. For the terms of this
 * license, see <http://www.gnu.org/licenses/>.
 *
 * You are free to use this library under the terms of the GNU General
 * Public License, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * Alternatively, you can license this library under a commercial
 * license, as set out in <http://cesanta.com/products.html>.
 */
 #include <stdio.h>
 #include <ctype.h>
 #include <string.h>
 #include "slre.h"
 #define MAX_BRANCHES 100
 #define MAX_BRACKETS 100
 #define FAIL_IF(condition, error_code) if (condition) return (error_code)
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(ar) (sizeof(ar) / sizeof((ar)[0]))
 #endif
 #ifdef SLRE_DEBUG
 #define DBG(x) printf x
 #else
 #define DBG(x)
 #endif
 struct bracket_pair {
  const char *ptr;  /* Points to the first char after '(' in regex  */
  int len;          /* Length of the text between '(' and ')'       */
  int branches;     /* Index in the branches array for this pair    */
  int num_branches; /* Number of '|' in this bracket pair           */
 };
 struct branch {
  int bracket_index;    /* index for 'struct bracket_pair brackets' */
                        /* array defined below                      */
  const char *schlong;  /* points to the '|' character in the regex */
 };
 struct regex_info {
  /*
   * Describes all bracket pairs in the regular expression.
   * First entry is always present, and grabs the whole regex.
   */
  struct bracket_pair brackets[MAX_BRACKETS];
  int num_brackets;
  /*
   * Describes alternations ('|' operators) in the regular expression.
   * Each branch falls into a specific branch pair.
   */
  struct branch branches[MAX_BRANCHES];
  int num_branches;
  /* Array of captures provided by the user */
  struct slre_cap *caps;
  int num_caps;
  /* E.g. SLRE_IGNORE_CASE. See enum below */
  int flags;
 };
 static int is_metacharacter(const unsigned char *s) {
  static const char *metacharacters = "^$().[]*+?|\\Ssdbfnrtv";
  return strchr(metacharacters, *s) != NULL;
 }
 static int op_len(const char *re) {
  return re[0] == '\\' && re[1] == 'x' ? 4 : re[0] == '\\' ? 2 : 1;
 }
 static int set_len(const char *re, int re_len) {
  int len = 0;
  while (len < re_len && re[len] != ']') {
    len += op_len(re + len);
  }
  return len <= re_len ? len + 1 : -1;
 }
 static int get_op_len(const char *re, int re_len) {
  return re[0] == '[' ? set_len(re + 1, re_len - 1) + 1 : op_len(re);
 }
 static int is_quantifier(const char *re) {
  return re[0] == '*' || re[0] == '+' || re[0] == '?';
 }
 static int toi(int x) {
  return isdigit(x) ? x - '0' : x - 'W';
 }
 static int hextoi(const unsigned char *s) {
  return (toi(tolower(s[0])) << 4) | toi(tolower(s[1]));
 }
 static int match_op(const unsigned char *re, const unsigned char *s,
                    struct regex_info *info) {
  int result = 0;
  switch (*re) {
    case '\\':
      /* Metacharacters */
      switch (re[1]) {
        case 'S': FAIL_IF(isspace(*s), SLRE_NO_MATCH); result++; break;
        case 's': FAIL_IF(!isspace(*s), SLRE_NO_MATCH); result++; break;
        case 'd': FAIL_IF(!isdigit(*s), SLRE_NO_MATCH); result++; break;
        case 'b': FAIL_IF(*s != '\b', SLRE_NO_MATCH); result++; break;
        case 'f': FAIL_IF(*s != '\f', SLRE_NO_MATCH); result++; break;
        case 'n': FAIL_IF(*s != '\n', SLRE_NO_MATCH); result++; break;
        case 'r': FAIL_IF(*s != '\r', SLRE_NO_MATCH); result++; break;
        case 't': FAIL_IF(*s != '\t', SLRE_NO_MATCH); result++; break;
        case 'v': FAIL_IF(*s != '\v', SLRE_NO_MATCH); result++; break;
        case 'x':
          /* Match byte, \xHH where HH is hexadecimal byte representaion */
          FAIL_IF(hextoi(re + 2) != *s, SLRE_NO_MATCH);
          result++;
          break;
        default:
          /* Valid metacharacter check is done in bar() */
          FAIL_IF(re[1] != s[0], SLRE_NO_MATCH);
          result++;
          break;
      }
      break;
    case '|': FAIL_IF(1, SLRE_INTERNAL_ERROR); break;
    case '$': FAIL_IF(1, SLRE_NO_MATCH); break;
    case '.': result++; break;
    default:
      if (info->flags & SLRE_IGNORE_CASE) {
        FAIL_IF(tolower(*re) != tolower(*s), SLRE_NO_MATCH);
      } else {
        FAIL_IF(*re != *s, SLRE_NO_MATCH);
      }
      result++;
      break;
  }
  return result;
 }
 static int match_set(const char *re, int re_len, const char *s,
                     struct regex_info *info) {
  int len = 0, result = -1, invert = re[0] == '^';
  if (invert) re++, re_len--;
  while (len <= re_len && re[len] != ']' && result <= 0) {
    /* Support character range */
    if (re[len] != '-' && re[len + 1] == '-' && re[len + 2] != ']' &&
        re[len + 2] != '\0') {
      result = info->flags &&  SLRE_IGNORE_CASE ?
        *s >= re[len] && *s <= re[len + 2] :
        tolower(*s) >= tolower(re[len]) && tolower(*s) <= tolower(re[len + 2]);
      len += 3;
    } else {
      result = match_op((unsigned char *) re + len, (unsigned char *) s, info);
      len += op_len(re + len);
    }
  }
  return (!invert && result > 0) || (invert && result <= 0) ? 1 : -1;
 }
 static int doh(const char *s, int s_len, struct regex_info *info, int bi);
 static int bar(const char *re, int re_len, const char *s, int s_len,
               struct regex_info *info, int bi) {
  /* i is offset in re, j is offset in s, bi is brackets index */
  int i, j, n, step;
  for (i = j = 0; i < re_len && j <= s_len; i += step) {
    /* Handle quantifiers. Get the length of the chunk. */
    step = re[i] == '(' ? info->brackets[bi + 1].len + 2 :
      get_op_len(re + i, re_len - i);
    DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n", __func__,
         re_len - i, re + i, s_len - j, s + j, re_len, step, i, j));
    FAIL_IF(is_quantifier(&re[i]), SLRE_UNEXPECTED_QUANTIFIER);
    FAIL_IF(step <= 0, SLRE_INVALID_CHARACTER_SET);
    if (i + step < re_len && is_quantifier(re + i + step)) {
      DBG(("QUANTIFIER: [%.*s]%c [%.*s]\n", step, re + i,
           re[i + step], s_len - j, s + j));
      if (re[i + step] == '?') {
        int result = bar(re + i, step, s + j, s_len - j, info, bi);
        j += result > 0 ? result : 0;
        i++;
      } else if (re[i + step] == '+' || re[i + step] == '*') {
        int j2 = j, nj = j, n1, n2 = -1, ni, non_greedy = 0;
        /* Points to the regexp code after the quantifier */
        ni = i + step + 1;
        if (ni < re_len && re[ni] == '?') {
          non_greedy = 1;
          ni++;
        }
        do {
          if ((n1 = bar(re + i, step, s + j2, s_len - j2, info, bi)) > 0) {
            j2 += n1;
          }
          if (re[i + step] == '+' && n1 < 0) break;
          if (ni >= re_len) {
            /* After quantifier, there is nothing */
            nj = j2;
          } else if ((n2 = bar(re + ni, re_len - ni, s + j2,
                               s_len - j2, info, bi)) >= 0) {
            /* Regex after quantifier matched */
            nj = j2 + n2;
          }
          if (nj > j && non_greedy) break;
        } while (n1 > 0);
        if (n1 < 0 && re[i + step] == '*' &&
            (n2 = bar(re + ni, re_len - ni, s + j, s_len - j, info, bi)) > 0) {
          nj = j + n2;
        }
        DBG(("STAR/PLUS END: %d %d %d %d %d\n", j, nj, re_len - ni, n1, n2));
        FAIL_IF(re[i + step] == '+' && nj == j, SLRE_NO_MATCH);
        /* If while loop body above was not executed for the * quantifier,  */
        /* make sure the rest of the regex matches                          */
        FAIL_IF(nj == j && ni < re_len && n2 < 0, SLRE_NO_MATCH);
        /* Returning here cause we've matched the rest of RE already */
        return nj;
      }
      continue;
    }
    if (re[i] == '[') {
      n = match_set(re + i + 1, re_len - (i + 2), s + j, info);
      DBG(("SET %.*s [%.*s] -> %d\n", step, re + i, s_len - j, s + j, n));
      FAIL_IF(n <= 0, SLRE_NO_MATCH);
      j += n;
    } else if (re[i] == '(') {
      n = SLRE_NO_MATCH;
      bi++;
      FAIL_IF(bi >= info->num_brackets, SLRE_INTERNAL_ERROR);
      DBG(("CAPTURING [%.*s] [%.*s] [%s]\n",
           step, re + i, s_len - j, s + j, re + i + step));
      if (re_len - (i + step) <= 0) {
        /* Nothing follows brackets */
        n = doh(s + j, s_len - j, info, bi);
      } else {
        int j2;
        for (j2 = 0; j2 <= s_len - j; j2++) {
          if ((n = doh(s + j, s_len - (j + j2), info, bi)) >= 0 &&
              bar(re + i + step, re_len - (i + step),
                  s + j + n, s_len - (j + n), info, bi) >= 0) break;
        }
      }
      DBG(("CAPTURED [%.*s] [%.*s]:%d\n", step, re + i, s_len - j, s + j, n));
      FAIL_IF(n < 0, n);
      if (info->caps != NULL) {
        info->caps[bi - 1].ptr = s + j;
        info->caps[bi - 1].len = n;
      }
      j += n;
    } else if (re[i] == '^') {
      FAIL_IF(j != 0, SLRE_NO_MATCH);
    } else if (re[i] == '$') {
      FAIL_IF(j != s_len, SLRE_NO_MATCH);
    } else {
      FAIL_IF(j >= s_len, SLRE_NO_MATCH);
      n = match_op((unsigned char *) (re + i), (unsigned char *) (s + j), info);
      FAIL_IF(n <= 0, n);
      j += n;
    }
  }
  return j;
 }
 /* Process branch points */
 static int doh(const char *s, int s_len, struct regex_info *info, int bi) {
  const struct bracket_pair *b = &info->brackets[bi];
  int i = 0, len, result;
  const char *p;
  do {
    p = i == 0 ? b->ptr : info->branches[b->branches + i - 1].schlong + 1;
    len = b->num_branches == 0 ? b->len :
      i == b->num_branches ? (int) (b->ptr + b->len - p) :
      (int) (info->branches[b->branches + i].schlong - p);
    DBG(("%s %d %d [%.*s] [%.*s]\n", __func__, bi, i, len, p, s_len, s));
    result = bar(p, len, s, s_len, info, bi);
    DBG(("%s <- %d\n", __func__, result));
  } while (result <= 0 && i++ < b->num_branches);  /* At least 1 iteration */
  return result;
 }
 static int baz(const char *s, int s_len, struct regex_info *info) {
  int i, result = -1, is_anchored = info->brackets[0].ptr[0] == '^';
  for (i = 0; i <= s_len; i++) {
    result = doh(s + i, s_len - i, info, 0);
    if (result >= 0) {
      result += i;
      break;
    }
    if (is_anchored) break;
  }
  return result;
 }
 static void setup_branch_points(struct regex_info *info) {
  int i, j;
  struct branch tmp;
  /* First, sort branches. Must be stable, no qsort. Use bubble algo. */
  for (i = 0; i < info->num_branches; i++) {
    for (j = i + 1; j < info->num_branches; j++) {
      if (info->branches[i].bracket_index > info->branches[j].bracket_index) {
        tmp = info->branches[i];
        info->branches[i] = info->branches[j];
        info->branches[j] = tmp;
      }
    }
  }
  /*
   * For each bracket, set their branch points. This way, for every bracket
   * (i.e. every chunk of regex) we know all branch points before matching.
   */
  for (i = j = 0; i < info->num_brackets; i++) {
    info->brackets[i].num_branches = 0;
    info->brackets[i].branches = j;
    while (j < info->num_branches && info->branches[j].bracket_index == i) {
      info->brackets[i].num_branches++;
      j++;
    }
  }
 }
 static int foo(const char *re, int re_len, const char *s, int s_len,
               struct regex_info *info) {
  int i, step, depth = 0;
  /* First bracket captures everything */
  info->brackets[0].ptr = re;
  info->brackets[0].len = re_len;
  info->num_brackets = 1;
  /* Make a single pass over regex string, memorize brackets and branches */
  for (i = 0; i < re_len; i += step) {
    step = get_op_len(re + i, re_len - i);
    if (re[i] == '|') {
      FAIL_IF(info->num_branches >= (int) ARRAY_SIZE(info->branches),
              SLRE_TOO_MANY_BRANCHES);
      info->branches[info->num_branches].bracket_index =
        info->brackets[info->num_brackets - 1].len == -1 ?
        info->num_brackets - 1 : depth;
      info->branches[info->num_branches].schlong = &re[i];
      info->num_branches++;
    } else if (re[i] == '\\') {
      FAIL_IF(i >= re_len - 1, SLRE_INVALID_METACHARACTER);
      if (re[i + 1] == 'x') {
        /* Hex digit specification must follow */
        FAIL_IF(re[i + 1] == 'x' && i >= re_len - 3,
                SLRE_INVALID_METACHARACTER);
        FAIL_IF(re[i + 1] ==  'x' && !(isxdigit(re[i + 2]) &&
                isxdigit(re[i + 3])), SLRE_INVALID_METACHARACTER);
      } else {
        FAIL_IF(!is_metacharacter((unsigned char *) re + i + 1),
                SLRE_INVALID_METACHARACTER);
      }
    } else if (re[i] == '(') {
      FAIL_IF(info->num_brackets >= (int) ARRAY_SIZE(info->brackets),
              SLRE_TOO_MANY_BRACKETS);
      depth++;  /* Order is important here. Depth increments first. */
      info->brackets[info->num_brackets].ptr = re + i + 1;
      info->brackets[info->num_brackets].len = -1;
      info->num_brackets++;
      FAIL_IF(info->num_caps > 0 && info->num_brackets - 1 > info->num_caps,
              SLRE_CAPS_ARRAY_TOO_SMALL);
    } else if (re[i] == ')') {
      int ind = info->brackets[info->num_brackets - 1].len == -1 ?
        info->num_brackets - 1 : depth;
      info->brackets[ind].len = (int) (&re[i] - info->brackets[ind].ptr);
      DBG(("SETTING BRACKET %d [%.*s]\n",
           ind, info->brackets[ind].len, info->brackets[ind].ptr));
      depth--;
      FAIL_IF(depth < 0, SLRE_UNBALANCED_BRACKETS);
      FAIL_IF(i > 0 && re[i - 1] == '(', SLRE_NO_MATCH);
    }
  }
  FAIL_IF(depth != 0, SLRE_UNBALANCED_BRACKETS);
  setup_branch_points(info);
  return baz(s, s_len, info);
 }
 int slre_match(const char *regexp, const char *s, int s_len,
               struct slre_cap *caps, int num_caps, int flags) {
  struct regex_info info;
  /* Initialize info structure */
  info.flags = flags;
  info.num_brackets = info.num_branches = 0;
  info.num_caps = num_caps;
  info.caps = caps;
  DBG(("========================> [%s] [%.*s]\n", regexp, s_len, s));
  return foo(regexp, (int) strlen(regexp), s, s_len, &info);
 }
--- a/src/slre.h
+++ b/src/slre.h
@@ -0,0 +1,60 @@
 /*
 * Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
 * Copyright (c) 2013 Cesanta Software Limited
 * All rights reserved
 *
 * This library is dual-licensed: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation. For the terms of this
 * license, see <http://www.gnu.org/licenses/>.
 *
 * You are free to use this library under the terms of the GNU General
 * Public License, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * Alternatively, you can license this library under a commercial
 * license, as set out in <http://cesanta.com/products.html>.
 */
 /*
 * This is a regular expression library that implements a subset of Perl RE.
 * Please refer to README.md for a detailed reference.
 */
 #ifndef SLRE_HEADER_DEFINED
 #define SLRE_HEADER_DEFINED
 #ifdef __cplusplus
 extern "C" {
 #endif
 struct slre_cap {
  const char *ptr;
  int len;
 };
 int slre_match(const char *regexp, const char *buf, int buf_len,
               struct slre_cap *caps, int num_caps, int flags);
 /* Possible flags for slre_match() */
 enum { SLRE_IGNORE_CASE = 1 };
 /* slre_match() failure codes */
 #define SLRE_NO_MATCH               -1
 #define SLRE_UNEXPECTED_QUANTIFIER  -2
 #define SLRE_UNBALANCED_BRACKETS    -3
 #define SLRE_INTERNAL_ERROR         -4
 #define SLRE_INVALID_CHARACTER_SET  -5
 #define SLRE_INVALID_METACHARACTER  -6
 #define SLRE_CAPS_ARRAY_TOO_SMALL   -7
 #define SLRE_TOO_MANY_BRANCHES      -8
 #define SLRE_TOO_MANY_BRACKETS      -9
 #ifdef __cplusplus
 }
 #endif
 #endif  /* SLRE_HEADER_DEFINED */
--- a/src/slre.o
+++ b/src/slre.o