Initial commit

2016-01-13 10:20:39 +01:00
commit ffa9638359
62 changed files with 53833 additions and 0 deletions
--- a/ROBITools.Rproj
+++ b/ROBITools.Rproj
@@ -0,0 +1,19 @@
 Version: 1.0
 RestoreWorkspace: Default
 SaveWorkspace: Default
 AlwaysSaveHistory: Default
 EnableCodeIndexing: Yes
 UseSpacesForTab: Yes
 NumSpacesForTab: 2
 Encoding: ISO-8859-1
 RnwWeave: knitr
 LaTeX: pdfLaTeX
 BuildType: Package
 PackageUseDevtools: Yes
 PackagePath: ROBITools
 PackageInstallArgs: --no-multiarch --with-keep.source
 PackageRoxygenize: rd,collate,namespace
--- a/ROBITools/.gitignore
+++ b/ROBITools/.gitignore
@@ -0,0 +1,3 @@
 /man/
 /loopbenchmark.R
 /Read-and-delete-me
--- a/ROBITools/DESCRIPTION
+++ b/ROBITools/DESCRIPTION
@@ -0,0 +1,38 @@
 Package: ROBITools
 Type: Package
 Title: Metabarcoding data biodiversity analysis
 Version: 0.1
 Date: 2012-08-23
 Author: LECA - Laboratoire d'ecologie alpine
 Maintainer: LECA OBITools team <obitools@metabarcoding.org>
 Description: More about what it does (maybe more than one line)
 License: CeCILL v2.0
 LazyLoad: yes
 Roxygen: list(wrap = FALSE)
 Collate:
    's3objects.R'
    'ROBITools.R'
    '02_class_metabarcoding.data.R'
    'aggregate.R'
    'choose.taxonomy.R'
    'contaslayer.R'
    'distrib.extrapol.R'
    'experimental.section.R'
    'export-metabarcoding.R'
    'read.obitab.R'
    'import.metabarcoding.R'
    'import.ngsfilter.R'
    'layers.metabarcoding.R'
    'metabarcoding_threshold.R'
    'mstat.R'
    'obiclean.R'
    'pcrslayer.R'
    'plot.PCRplate.R'
    'plot.seqinsample.R'
    'rarefy.R'
    'read.ngsfilter.R'
    'read.sumatra.R'
    'taxoDBtree.R'
    'taxonomic.resolution.R'
    'taxonomy_classic_table.R'
 RoxygenNote: 5.0.1
--- a/ROBITools/LICENSE-SLRE
+++ b/ROBITools/LICENSE-SLRE
@@ -0,0 +1,16 @@
 Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
 Copyright (c) 2013 Cesanta Software Limited
 All rights reserved
 This code is dual-licensed: you can redistribute it and/or modify
 it under the terms of the GNU General Public License version 2 as
 published by the Free Software Foundation. For the terms of this
 license, see <http://www.gnu.org/licenses/>.
 You are free to use this code under the terms of the GNU General
 Public License, but WITHOUT ANY WARRANTY; without even the implied
 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 See the GNU General Public License for more details.
 Alternatively, you can license this code under a commercial
 license, as set out in <http://cesanta.com/>.
--- a/ROBITools/NAMESPACE
+++ b/ROBITools/NAMESPACE
@@ -0,0 +1,52 @@
 # Generated by roxygen2: do not edit by hand
 S3method(aggregate,metabarcoding.data)
 S3method(plot,PCRplate)
 S3method(plot,seqinsample)
 S3method(summary,taxores)
 export(addS3Class)
 export(colnames)
 export(const.threshold.mask)
 export(contaslayer)
 export(createS3Class)
 export(dbtree)
 export(dist.center.group)
 export(dist.clique.group)
 export(dist.grid)
 export(dm.univariate)
 export(extracts.obiclean)
 export(extracts.obiclean_cluster)
 export(extrapol.freq)
 export(get.classic.taxonomy)
 export(import.metabarcoding.data)
 export(import.ngsfilter.data)
 export(layer.names)
 export(m.bivariate)
 export(m.univariate)
 export(m.univariate.test)
 export(m.weight)
 export(map.extrapol.freq)
 export(marginalsum)
 export(metabarcoding.data)
 export(motus)
 export(normalize)
 export(rarefy)
 export(read.ngsfilter)
 export(read.obitab)
 export(reads)
 export(rmS3Class)
 export(rownames)
 export(samples)
 export(taxo.decider)
 export(threshold)
 export(threshold.mask)
 export(threshold.set)
 exportClasses(metabarcoding.data)
 exportMethods("$")
 exportMethods("$<-")
 exportMethods("[[")
 exportMethods("[[<-")
 exportMethods(colnames)
 exportMethods(rownames)
 import(ROBITaxonomy)
 import(igraph)
--- a/ROBITools/R/02_class_metabarcoding.data.R
+++ b/ROBITools/R/02_class_metabarcoding.data.R
@@ -0,0 +1,539 @@
 #' @include ROBITools.R
 #' @include s3objects.R
 #' @import ROBITaxonomy 
 NULL
 require(ROBITaxonomy)
 #
 # FOR THE DEVELOPPER : we have to check that the code doesn't relies on the
 #                      fact that the xx@samples$sample column is not always 
 #                      identical to the rownames(xx@samples)
 setClassUnion("characterOrNULL",c("character","NULL"))
 setClassUnion("matrixOrfactorL",c("matrix","factor"))
 #
 # We specialize data.frame in two subclasses motus.frame and samples.frame
 # for this we add to function insuring the type checking and the cast from
 # data.frame
 #
 is.motus.frame= function(x) any(class(x)=="motus.frame")
 is.samples.frame= function(x) any(class(x)=="samples.frame")
 as.motus.frame= function(x) {
  if (! is.data.frame(x))
    stop("only cast from data.frame is allowed")
  if (! is.motus.frame(x))
    x = addS3Class(x,"motus.frame")
  return(x)
 }
 as.samples.frame= function(x) {
  if (! is.data.frame(x))
    stop("only cast from data.frame is allowed")
  if (! is.samples.frame(x))
    x = addS3Class(x,"samples.frame")
  return(x)
 }
 samples.frame=as.samples.frame
 motus.frame=as.motus.frame
 as.factor.or.matrix = function(x) {
  if (is.matrix(x))
    return(x)
  if (is.factor(x)){
    if (length(dim(x))!=2)
      stop('Just factor with two dimensions are allowed')
    return(x)
  }
  if (!is.data.frame(x))
    stop('Just matrix, 2D factor and data.frame can be casted')
  tps   = sapply(x,class)
  allna = sapply(x, function(y) all(is.na(y)))
  if (all(tps==tps[[1]] | allna)) {
    tps = tps[[1]] 
  }
  else
    stop('all the column of the data.frame must have the same type')
  tps = tps[[1]]
  x = as.matrix(x)
  dx = dim(x)
  if (tps=='factor')
    x = factor(x)
    dim(x)=dx
  return(x)
 }
 #' DNA metabarcoding experiment description class
 #' 
 #' A S4 class describing a DNA metabarcoding experiment. It groups
 #' three data frames describing samples, motus and occurrences of 
 #' MOTUs per sample
 #'
 #'@section Slots: 
 #'  \describe{
 #'    \item{\code{reads}:}{Matrix of class \code{"numeric"},  
 #'                         containing the counts of reads per samples
 #'                         \itemize{
 #'                           \item{1 samples per line}
 #'                           \item{1 sequence per column}
 #'                         }
 #'                        }
 #'                        
 #'    \item{\code{samples}:}{Object of class \code{"data.frame"}, describing samples
 #'                         \itemize{
 #'                           \item{1 samples per line}
 #'                           \item{1 property per column}
 #'                         }
 #'                       }
 #'                       
 #'    \item{\code{motus}:}{Object of class \code{"data.frame"}, describing MOTUs (sequences)
 #'                         \itemize{
 #'                           \item{1 MOTU per line}
 #'                           \item{1 property per column}
 #'                         }
 #'                       }
 #'                       
 #'    \item{\code{layers}:}{Object of class \code{"list"}, containing a set of data layers 
 #'                          linking motus and samples. Each element of the list is a matrix
 #'                          of the same size than the \code{read} slot with
 #'                         \itemize{
 #'                           \item{1 samples per line}
 #'                           \item{1 sequence per column}
 #'                         }
 #'                       }
 #'        
 #'    \item{\code{scount}:}{Object of class \code{"integer"}, containing the count of sample}             
 #'        
 #'    \item{\code{mcount}:}{Object of class \code{"integer"}, containing the count of MOTUs}             
 #'        
 #'    \item{\code{sample.margin}:}{Vector of class \code{"numeric"},  describing the total count of 
 #'                          sequence per sample. By default this slot is set by applying sum
 #'                         to the reads data.frame lines}             
 #'
 #'    \item{\code{taxonomy}:}{Object of class \code{"taxonomy.obitools"}, linking the DNA metabarcoding
 #'                            experiment to a taxonomy}             
 #'        
 #'    \item{\code{taxid}:}{Vector of class \code{"character"}, list of MOTUs' attributes to manage as taxid}             
 #'  }
 #'
 #' @seealso \code{\link{taxonomy.obitools}},
 #' @name metabarcoding.data
 #' @rdname metabarcoding-data-class
 #' @keywords DNA metabarcoding
 #' @author Eric Coissac
 #' @exportClass metabarcoding.data
 setClass("metabarcoding.data",
 		#
 		# Attribute declaration
 		#
 		 representation(reads         = "matrix", 
 						        samples       = "data.frame",
  		  		        motus         = "data.frame",
        						layers        = "list",
    				        scount        = "integer",
        						mcount        = "integer",
 				 		        sample.margin = "numeric",
 						        taxonomy      = "obitools.taxonomyOrNULL",
 						        taxid         = "characterOrNULL"
 						),
 		#
 		# Check object structure 
 		#
 		validity = function(object) { 
 						## object : nom reserve !
 			#
 			# Check that reads / samples and motus data.frames
 			# have compatible sizes
 			#
 			#   reads line count   = samples line count
 			#   reads column count = motus   line count
 			rsize = dim(object@reads)
 			ssize = dim(object@samples)
 			msize = dim(object@motus)
 			csize = length(object@sample.margin)
 			if (rsize[1] != ssize[1] & 
 				rsize[2] != msize[1] &
 				rsize[1] != csize)
 				return(FALSE)
 			# if no layer, object is ok
 			if (length(object@layers)==0)
 				return(TRUE)
 			# otherwise we check the size of each layer as we
 			# did for reads
 			return(! any(sapply(object@layers, 
 					   function(l) any(dim(l)!=c(ssize[1],msize[1])))))
 		}
 )
 #
 #' metabarcoding.data constructor
 #' 
 #' @docType methods
 #' @rdname initialize-methods
 #' @aliases initialize-methods,metabarcoding.data
 setMethod("initialize",
 		  "metabarcoding.data",
 			function(.Object, reads,samples,motus,
 					          taxonomy=NULL,taxid=NULL,
 							  sample.margin=NA,
 							  layers=list()) {
        rn = rownames(reads)
        cn = colnames(reads)
 				.Object@reads   <- reads
 				# .Object@samples <- as.samples.frame(samples)
        .Object@samples <- samples
        row.names(.Object@samples) = rn
 				#.Object@motus   <- as.motus.frame(motus)
        .Object@motus   <- motus
        row.names(.Object@motus) = cn
        # Set colnames and rownames to each layers
        layers = lapply(layers, function(x) {colnames(x)=cn 
                                             rownames(x)=rn 
                                             return(x)})
 		.Object@layers   <- layers
 				# Precompute sample count and motu count
 		.Object@scount = dim(.Object@samples)[1]
 		.Object@mcount = dim(.Object@motus)[1]
 		.Object@taxonomy = taxonomy
 		.Object@taxid = taxid
 		if (is.null(sample.margin))
 			.Object@sample.margin = rowSums(reads)
 		else
 			.Object@sample.margin = sample.margin
        names(.Object@sample.margin) = rn
 		validObject(.Object) ## valide l'objet
 		return(.Object)
 		})
 #
 # metabarcoding.data getters
 #
 #' @export
 setGeneric("reads", function(obj) {
 			return(standardGeneric("reads"))
 		})
 #' Extracts the matrix describing MOTUs abondances
 #' 
 #' Extract the the matrix describing MOTUs abondances (read counts) 
 #' from a \code{\link{metabarcoding.data}} instance.
 #' 
 #' @param   obj a \code{\link{metabarcoding.data}} instance
 #' @return  a matrix containing data about reads 
 #' 
 #' @examples
 #' # load termite data set from the ROBITools sample data
 #' data(termes)
 #' 
 #' # Extract the matrix describing MOTUs abondances
 #' d = reads(termes)
 #' 
 #' head(d)
 #' 
 #' @seealso \code{\link{metabarcoding.data}},
 #'          \code{\link{motus}}, \code{\link{samples}}
 #'          
 #' @docType methods
 #' @rdname read-methods
 #' @aliases read-methods,metabarcoding.data
 #' @author Eric Coissac
 #' 
 setMethod("reads", "metabarcoding.data", function(obj) {
 			return(obj@reads)
 		})
 # get samples data.frames
 #' @export
 setGeneric("samples", function(obj) {
 			return(standardGeneric("samples"))
 		})
 #' Extracts the samples description data.frame
 #' 
 #' Extract the sample description data.frame from a 
 #' \code{\link{metabarcoding.data}} instance.
 #' 
 #' @param   obj a \code{\link{metabarcoding.data}} instance
 #' @return  a data.frame containing data about sample 
 #' 
 #' @examples
 #' # load termite data set from the ROBITools sample data
 #' data(termes)
 #' 
 #' # Extract the data frame describing samples
 #' d = samples(termes)
 #' 
 #' head(d)
 #' 
 #' @seealso \code{\link{metabarcoding.data}},
 #'          \code{\link{motus}}, \code{\link{reads}}
 #'          
 #' @docType methods
 #' @rdname samples-methods
 #' @aliases samples-methods,metabarcoding.data
 #' @author Eric Coissac
 #' 
 setMethod("samples", "metabarcoding.data", function(obj) {
 			return(obj@samples)
 		})
 #' @export
 setGeneric("motus", function(obj) {
 			return(standardGeneric("motus"))
 		})
 #' Extracts the MOTU descriptions \code{data.frame}
 #' 
 #' Extract the MOTUs description \code{data.frame} from a 
 #' \code{\link{metabarcoding.data}} instance.
 #' 
 #' @param   obj a \code{\link{metabarcoding.data}} instance
 #' @return  a data.frame containing data about MOTU 
 #' 
 #' @examples
 #' # load termite data set from the ROBITools sample data
 #' data(termes)
 #' 
 #' # Extract the data.frame describing MOTUs
 #' d = motus(termes)
 #' 
 #' head(d)
 #' 
 #' @seealso \code{\link{metabarcoding.data}},
 #'          \code{\link{reads}}, \code{\link{samples}}
 #'          
 #' @docType methods
 #' @rdname motu-methods
 #' @aliases motu-methods,metabarcoding.data
 #' 
 setMethod("motus", "metabarcoding.data", function(obj) {
 			return(obj@motus)
 		})
 # get sample count 
 setGeneric("sample.count", function(obj) {
 			return(standardGeneric("sample.count"))
 		})
 setMethod("sample.count", "metabarcoding.data", function(obj) {
 			return(obj@scount)
 		})
 # get motu count 
 setGeneric("motu.count", function(obj) {
 			return(standardGeneric("motu.count"))
 		})
 setMethod("motu.count", "metabarcoding.data", function(obj) {
 			return(obj@mcount)
 		})
 # dim method
 setMethod("dim", "metabarcoding.data", function(x) {
 			return(c(x@scount,x@mcount))
 		})
 setMethod('[', "metabarcoding.data", function(x,i=NULL,j=NULL,...,drop=TRUE) {
      # special case if samples are not specified (dimension 1)
 			if (!hasArg(i))
 				i = 1:x@scount
 			# special case if motus are not specified (dimension 2)
 			if (!hasArg(j))
 				j = 1:x@mcount
      # special case if the layer attribut is specified
 			args = list(...)
 			if (!is.null(args$layer))
          return(x[[args$layer]][i,j])	
      #####################
      #
      # normal case 
      #
 			r = x@reads[i,j,drop=FALSE]
 			if (sum(dim(r) > 1)==2 | ! drop)
 			{
        # we do the selection on the motus and samples description data.frame
 				m = x@motus[j,,drop=FALSE]
 				s = x@samples[i,,drop=FALSE]
        # we do the selection on each layers
        l = lapply(x@layers,function(l) l[i,j,drop=FALSE])
 				newdata = copy.metabarcoding.data(x, reads=r, samples=s, motus=m, layers=l)
 			}
 			else
 			{
 				newdata = as.numeric(x@reads[i,j])
 			}
 			return(newdata)
 		})
 setMethod('[<-', "metabarcoding.data",
          function (x, i, j, ..., value) {
            if (!hasArg(i))
              i = 1:x@scount
            if (!hasArg(j))
              j = 1:x@mcount
            args = list(...)
            if (is.null(args$layer))
              x@reads[i, j]=value
            else
              x[[args$layer]][i,j]=value
            return(x)
          })
 #################################################
 #
 # User interface function to create 
 # metabarcoding.data objects
 #
 #################################################
 #'@export
 metabarcoding.data = function(reads,samples,motus,
 							  taxonomy=NULL,taxid=NULL,
 							  sample.margin=NULL,
                layers=list()) {
 	rd = new('metabarcoding.data',
 			reads=reads,
 			samples=samples,
 			motus=motus,
 			taxonomy=taxonomy,
 			taxid=taxid,
 			sample.margin=sample.margin,
      layers=layers
 	)
 	return(rd)
 }
 copy.metabarcoding.data = function(data, 
                                   reads=NULL,
                                   samples=NULL,motus=NULL,
 										               taxonomy=NULL,taxid=NULL,
 										               sample.margin=NULL,
                                   layers=NULL) {
 		if (is.null(reads))
 			reads = data@reads
 		if (is.null(samples))
 			samples = data@samples
 		if (is.null(motus))
 			motus = data@motus
 		if (is.null(taxonomy))
 			taxonomy = data@taxonomy
 		if (is.null(taxid))
 			taxid = data@taxid
 		if (is.null(sample.margin))
 			sample.margin = data@sample.margin
 		if (is.null(layers))
 		  layers = data@layers
 		rd = new('metabarcoding.data',
 				reads=reads,
 				samples=samples,
 				motus=motus,
 				taxonomy=taxonomy,
 				taxid=taxid,
 				sample.margin=sample.margin,
        		layers=layers
 		)
 		return(rd)
 }
 #' @export
 setGeneric('rownames')
 #' @export
 setMethod("rownames", "metabarcoding.data", function(x, do.NULL = TRUE, prefix = "col") {
  return(rownames(x@reads,do.NULL,prefix))
 })
 #' @export
 setGeneric('colnames')
 #' @export
 setMethod("colnames", "metabarcoding.data", function(x, do.NULL = TRUE, prefix = "col") {
  return(colnames(x@reads,do.NULL,prefix))
 })
--- a/ROBITools/R/ROBITools.R
+++ b/ROBITools/R/ROBITools.R
@@ -0,0 +1,33 @@
 #' A package to manipulate DNA metabarcoding data.
 #' 
 #' This package was written as a following of the OBITools. 
 #' 
 #' \tabular{ll}{
 #'  Package: \tab ROBITools\cr
 #'  Type: \tab Package\cr
 #'  Version: \tab 0.1\cr
 #'  Date: \tab 2013-06-27\cr
 #'  License: \tab CeCILL 2.0\cr
 #'  LazyLoad: \tab yes\cr
 #'}
 #' 
 #' @name ROBITools-package
 #' @aliases ROBITools
 #' @docType package
 #' @title  A package to manipulate DNA metabarcoding data.
 #' @author Frederic Boyer
 #' @author Aurelie Bonin
 #' @author Lucie Zinger
 #' @author Eric Coissac
 #' 
 #' @references http://metabarcoding.org/obitools
 #' 
 NA
 .onLoad <- function(libname, pkgname) { 
  packageStartupMessage( "ROBITools package" )
  #print(getwd())
 }
--- a/ROBITools/R/aggregate.R
+++ b/ROBITools/R/aggregate.R
@@ -0,0 +1,229 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 # TODO: Add comment
 # 
 # Author: coissac
 ###############################################################################
 #' @export
 aggregate.metabarcoding.data=function(x, by, FUN,..., 
 		                              MARGIN='sample',
 									  default.layer=NULL,
 									  layers=NULL) {	
 	uniq.value = function(z) {
 		if (is.null(z) | 
 				any(is.na(z)) |
 				length(z)==0)
 			ans = NA
 		else {
      if (all(z==z[1]))
 			  ans = z[1]
 		  else
 			  ans = NA
 		}
 		if (is.factor(z))
 			ans = factor(ans,levels=levels(z))
 		return(ans)
 	}			
 	#
 	# Deals with the supplementaty aggregate arguments
 	#
 	if (is.null(default.layer))
 		default.layer=uniq.value
 	if (is.null(layers)) {
 		layers = as.list(rep(c(default.layer),length(x@layers)))
 		names(layers)=layer.names(x)
 	}
 	else {
 		for (n in layer.names(x))
 			if (is.null(layers[[n]]))
 				layers[[n]]=default.layers
 	}
 	if (MARGIN == 'sample')
 		MARGIN=1
 	if (MARGIN == 'motu')
 		MARGIN=2
 	reads = x@reads
 	if (MARGIN==1) {
 		# prepare the aggrevation arguments for the read table
 		# from the function arguments
 	  dotted = list(...)
 	  if (length(dotted) > 0)
 	    aggr.args = list(reads,by=by,FUN=FUN,...=dotted,simplify=FALSE)
 	  else
 	    aggr.args = list(reads,by=by,FUN=FUN,simplify=FALSE)
 		# Aggregate the read table
 		ragr = do.call(aggregate,aggr.args)
 		# extrat new ids from the aggregated table
 		ncat = length(by)
 		ids  = as.character(interaction(ragr[,1:ncat,drop=FALSE]))
 		# remove the aggregations modalities to rebuild a correct
 		# reads table
 		ragr = as.matrix(ragr[,-(1:ncat),drop=FALSE])
 		dragr= dim(ragr)
 		cragr= colnames(ragr)
 		ragr = as.numeric(ragr)
 		dim(ragr)=dragr
 		colnames(ragr)=cragr		
 		rownames(ragr)=ids
 		#
 		# Apply the same aggragation to each layer
 		#
 		ln = layer.names(x)
    la = vector(mode="list",length(ln))
    names(la)=ln
 		for (n in ln) {
 			f = layers[[n]]
 			if (is.factor(x[[n]])){
 			  isfact = TRUE
 			  lf = levels(x[[n]])
 			  df = dim(x[[n]])
 			  m = matrix(as.character(x[[n]]))
 			  dim(m)=df
 			}
 			else
 			  m = x[[n]]
      aggr.args = list(m,by=by,FUN=f,simplify=FALSE)
 			lagr = do.call(aggregate,aggr.args)
 			lagr = as.factor.or.matrix(lagr[,-(1:ncat),drop=FALSE])
 			if (isfact){
 			  df = dim(lagr)
 			  lagr = factor(lagr,levels=lf)
 			  dim(lagr)=df
 			}
      rownames(lagr)=ids
 			la[[n]]=lagr
 		}
 		# aggragate the sample table according to the same criteria
 		#
 		# TODO: We have to take special care of factors in the samples
 		#       data.frame
 		sagr = aggregate(samples(x),by,uniq.value,simplify=FALSE)
 		# move the first columns of the resulting data frame (the aggregations
 		# modalities to the last columns of the data.frame
 		sagr = sagr[,c((ncat+1):(dim(sagr)[2]),1:ncat),drop=FALSE]
 		larg = c(lapply(sagr,unlist),list(stringsAsFactors=FALSE))	
 		sagr = do.call(data.frame,larg)
 		# set samples ids to the ids computed from modalities
 		sagr$id=ids
 		rownames(sagr)=ids
 		# build the new metabarcoding data instance
 		newdata = copy.metabarcoding.data(x,reads=ragr,samples=sagr)
 	}
 	else {
 		# prepare the aggregation arguments for the read table
 		# from the function arguments
 		# BECARFUL : the reads table is transposed
 		#            standard aggregate runs by row and we want
 		# 			 aggregation by column
    dotted = list(...)
    if (length(dotted) > 0)
 		  aggr.args = list(t(reads),by=by,FUN=FUN,...=dotted,simplify=FALSE)
    else
      aggr.args = list(t(reads),by=by,FUN=FUN,simplify=FALSE)
    # Aggregate the read table
 		ragr = do.call(aggregate.data.frame,aggr.args)
 		# extrat new ids from the aggregated table
 		ncat = length(by)
 		ids  = as.character(interaction(ragr[,1:ncat,drop=FALSE]))
 		# remove the aggregations modalities to rebuild a correct
 		# reads table
 		ragr = t(ragr[,-(1:ncat),drop=FALSE])		
    dragr= dim(ragr)
    rragr= rownames(ragr)
    ragr = as.numeric(ragr)
    dim(ragr)=dragr
    colnames(ragr)=ids
    rownames(ragr)=rragr
 		#
 		# Apply the same aggragation to each layer
 		#
 		ln = layer.names(x)
    la = vector(mode="list",length(ln))
    names(la)=ln
    for (n in ln) {
 			f = layers[[n]]
      if (is.factor(x[[n]])){
 			  isfact = TRUE
        lf = levels(x[[n]])
        df = dim(x[[n]])
        m = matrix(as.character(x[[n]]))
        dim(m)=df
      }
      else
          m = x[[n]]
      aggr.args = list(t(m),by=by,FUN=f,simplify=FALSE)
 			lagr = do.call(aggregate,aggr.args)
 			lagr = t(as.factor.or.matrix(lagr[,-(1:ncat),drop=FALSE]))
      if (isfact){
        df = dim(lagr)
        lagr = factor(lagr,levels=lf)
        dim(lagr)=df
 			}
      colnames(lagr)=ids
 			la[[n]]=lagr
 		}
 		# aggragate the motus table according to the same criteria
 		magr = aggregate(motus(x),by,uniq.value,simplify=FALSE)
 		# move the first columns of the resulting data frame (the aggregations
 		# modalities to the last columns of the data.frame
 		magr = magr[,c((ncat+1):(dim(magr)[2]),1:ncat),drop=FALSE]
 		larg = c(lapply(magr,unlist),list(stringsAsFactors=FALSE))	
 		magr = do.call(data.frame,larg)
 		# set motus ids to the ids computed from modalities
 		magr$id=ids
 		rownames(magr)=ids
 		# build the new metabarcoding data instance
 		newdata = copy.metabarcoding.data(x,reads=ragr,motus=magr,layers=la)
 	}
 	return(newdata)			
 }
--- a/ROBITools/R/choose.taxonomy.R
+++ b/ROBITools/R/choose.taxonomy.R
@@ -0,0 +1,107 @@
 #' @import ROBITaxonomy 
 #' @include 02_class_metabarcoding.data.R
 NULL
 #' Choose between databases for taxonomic classifications
 #' 
 #' Chooses a sequence taxonomic assignment in order of preference for the different 
 #' reference databases that have been used when the assignment is above a certain threshold
 #' 
 #' 
 #' @param x a \code{\link{metabarcoding.data}} object
 #' @param taxonomy a \code{\linkS4class{taxonomy.obitools}} instance
 #' @param dbrank string or vector indicating reference database names ranked by order of preference
 #' @param thresh a best_identity threshold for applying priority. Default is \code{0.95}
 #' 
 #' @return returns a data.frame with the refined taxonomic assignement and classic taxonomy description.
 #' 
 #' @examples
 #' 
 #' data(termes)
 #' 
 #' taxo=default.taxonomy()
 #' 
 #' #create artificial taxonomic assignments
 #' attr(termes, "motus")["best_identity:DB1"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
 #' attr(termes, "motus")["best_identity:DB2"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
 #' attr(termes, "motus")["best_identity:DB3"] = sample(seq(0.5,1,0.001),size=nrow(termes$motus), replace=T)
 #' attr(termes, "motus")["taxid_by_db:DB1"] = termes$motus$taxid
 #' attr(termes, "motus")["taxid_by_db:DB2"] = sample(termes$motus$taxid,size=nrow(termes$motus), replace=F)
 #' attr(termes, "motus")["taxid_by_db:DB3"] = sample(termes$motus$taxid,size=nrow(termes$motus), replace=F)
 #' 
 #' #Run taxo.decider
 #' termes.ok = taxo.decider(termes, taxo, "DB2", 0.95)
 #' head(termes.ok$motus[union(grep("DB",  colnames(termes.ok$motus)), grep("_ok", colnames(termes.ok$motus)))])
 #' 
 #' termes.ok = taxo.decider(termes, taxo, c("DB3", "DB1"), 0.95)
 #' head(termes.ok$motus[union(grep("DB",  colnames(termes.ok$motus)), grep("_ok", colnames(termes.ok$motus)))])
 #' 
 #' #Quick look at the enhancement in taxonomic assignements
 #' par(mfrow=c(1,4))
 #' for(i in grep("best_identity.", colnames(termes.ok$motus))){
 #' hist(termes.ok$motus[,i], breaks=20, ylim=c(1,21), main=colnames(termes.ok$motus)[i], xlab="assignment score")
 #' }
 #'     
 #' @seealso \code{\linkS4class{taxonomy.obitools}}, and methods \code{\link{species}},\code{\link{genus}}, \code{\link{family}},\code{\link{kingdom}},
 #'          \code{\link{superkingdom}},\code{\link{taxonatrank}}, \code{\link{taxonmicank}}
 #'
 #' @author Lucie Zinger
 #' @keywords taxonomy
 #' 
 #' @export
 #' 
 taxo.decider = function(x, taxonomy, dbrank, thresh=0.95) {
  noms = colnames(x$motus)
  best_ids_names = noms[grep("best_identity.", noms)]
  best_ids = x$motus[,best_ids_names]
  taxids = x$motus[, gsub("best_identity", "taxid_by_db", best_ids_names)]
  dbs = unlist(lapply(strsplit(best_ids_names, "\\:"), "[[", 2))
  #Set max indices
  ind = as.vector(t(apply(best_ids,1,function(y) order(rank(-y, ties.method="max"), match(dbrank, dbs))))[,1])
  #Set default vector: db, bestids, taxids with max score
  db_ok = dbs[ind]
  best_identity_ok = best_ids[cbind(1:length(ind), ind)]
  taxids_by_db_ok = taxids[cbind(1:length(ind), ind)]
  #Get vector of db index that should be used according to condition > thresh
  db_choice = taxo.decider.routine(dbrank, best_ids, dbs, thresh)
  #Replacing by right values according to db_ok
  for(i in 1:length(dbrank)){
    db_ok[which(db_choice==i)] = dbrank[i]
    best_identity_ok[which(db_choice==i)] = best_ids[which(db_choice==i),grep(dbrank[i], colnames(best_ids))]
    taxids_by_db_ok[which(db_choice==i)] = taxids[which(db_choice==i),grep(dbrank[i], colnames(taxids))]
  }
  decision = data.frame(db_ok, best_identity_ok, taxids_by_db_ok)
  coltaxid = colnames(decision)[grep("taxid", colnames(decision))]
  attr(x, "motus") = data.frame(x$motus, decision)
  new.tax = get.classic.taxonomy(x, taxonomy, coltaxid)
  attr(x, "motus") = data.frame(x$motus, new.tax)
  return(x)  
 }
 taxo.decider.routine = function(dbrank, best_ids, dbs, thresh) {
  #Setting mask 
  mask = matrix(NA,nrow(best_ids),length(dbrank))
  colnames(mask)=dbrank
  #For each DB, see if condition T/F
  for(i in dbrank){
    mask[,i] = best_ids[,which(dbs==i)]>thresh
  }
  #Get the first occurence of T in the table
  out = apply(mask, 1, function(x) which(x==T)[1])
  return(out)
 }
--- a/ROBITools/R/contaslayer.R
+++ b/ROBITools/R/contaslayer.R
@@ -0,0 +1,49 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 #' Detects contaminants in metabarcoding data
 #' 
 #' Detects sequences/motus in a \code{\link{metabarcoding.data}} object
 #' for which frequencies over the entire dataset are maximum in negative controls and 
 #' hence, most likely to be contaminants. 
 #' 
 #' 
 #' @param x a \code{\link{metabarcoding.data}} object
 #' @param controls a vector of samples names where conta are suspected to be detected 
 #'                 (typically negative control names).
 #' @param clust a vector for grouping sequences. Default set to \code{NULL}.
 #'
 #' @return a vector containing the names of sequences identified as contaminants
 #'
 #' @examples
 #' 
 #' data(termes)
 #' termes.ok = termes[,colSums(termes$reads)>0]
 #' neg = rownames(termes.ok)[grep("r",rownames(termes.ok))]
 #' 
 #' #finds contaminants based on neg samples
 #' contaslayer(termes.ok, neg)
 #' 
 #' # extanding contamininant detection with grouping factor, 
 #' # typically obiclean/sumatra cluster or taxonomy membership
 #' contaslayer(termes.ok, neg, termes.ok$motus$scientific_name)
 #'   
 #' @seealso \code{\link{threshold}} for further trimming
 #' @author Lucie Zinger
 #' @export
 contaslayer = function(x,controls,clust=NULL){
  x.fcol = normalize(x, MARGIN=2)$reads
  x.max = rownames(x.fcol[apply(x.fcol, 2, which.max),])
  conta = colnames(x)[!is.na(match(x.max,controls))]
  if (length(clust)!=0) {
    agg = data.frame(conta.id=colnames(x.fcol), clust)
    conta.ext = agg$conta.id[which(!is.na(match( agg$clust, agg$clust[match(conta,agg$conta.id)])))]
    return(as.vector(conta.ext))
  } 
  else {
    return(conta)
  }
 }  
--- a/ROBITools/R/distrib.extrapol.R
+++ b/ROBITools/R/distrib.extrapol.R
@@ -0,0 +1,178 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 #' Read frequencies krigging
 #' 
 #' Extrapolates read frequencies from a \code{\link{metabarcoding.data}} object in space for a finer resolution 
 #' 
 #' @param x a vector or matrix from a row-normalized read table 
 #'          \code{\link{metabarcoding.data}} object
 #' @param min.coord a vector of length = 2 indicating the minimum values of x and y
 #'                  coordinates to be used for the predicted grid
 #' @param max.coord a vector of length = 2 indicating the maximum values of x and y
 #'                  coordinates to be used for the predicted grid
 #' @param grid.grain an integer indicating the resolution (i.e. nb of subpoints) in x and y
 #'                  coordinates required for the predicted grid
 #' @param coords a dataframe containing the x and y coordinates of the abundances 
 #'               from x to be extrapolated.
 #' @param otus.table a motus data.frame containing motus informations of x
 #' @param cutoff a cutoff below which abundances are set to 0. 
 #'               This threshold also determines the value to be added to 0 values for log10 
 #'               transformation
 #' @param return.metabarcoding.data if \code{TRUE}, returns a \code{\link{metabarcoding.data}} object. Default is \code{FALSE}
 #'
 #' @return either a dataframe or a S3 object with a structure similar to \code{\link{metabarcoding.data}} object. 
 #'         The number of samples corresponds to the predicted points.
 #'         The two last columns (if \code{return.metabarcoding.data==F}) or sample data.frame contains x y coordinates of the predicted grid
 #'         The all but last two columns (if \code{return.metabarcoding.data==F}) or read matrix contains the predicted log10 transformed relative abundances 
 #'         instead of reads counts
 #'         If \code{return.metabarcoding.data==F} the motus data.frame contains the motus informations from x
 #'
 #' @examples
 #' 
 #' data(termes)
 #' #Create dummy spatial coordinates
 #' attr(termes, "samples")[c("x", "y")] = expand.grid(1:7,1:3)
 #' 
 #' #compute frequencies
 #' attr(termes, "layers")[["reads.freq"]] = normalize(termes, MARGIN=1)$reads
 #' 
 #' # Getting extrapolations
 #' termes.pred = extrapol.freq(attr(termes, "layers")[["reads.freq"]], min.coord=c(1,1), max.coord=c(7,3), 
 #'                             grid.grain=100,termes$samples[,c("x", "y")], termes$motus, cutoff=1e-3)
 #'
 #' head(termes.pred$reads)
 #' @seealso \code{\link{map.extrapol.freq}} as well as \code{sp} and \code{gstat} packages
 #' @author Lucie Zinger
 #' @export
 extrapol.freq = function(x, min.coord, max.coord, grid.grain=100, coords, otus.table, cutoff=1e-3, return.metabarcoding.data = FALSE) {
  require(gstat)
  require(sp)
  #predicted grid setting
  new.x = seq(min.coord[1], max.coord[1], length.out = grid.grain)
  new.y = seq(min.coord[2], max.coord[2], length.out = grid.grain)
  grid.p=expand.grid(new.x, new.y)
  colnames(grid.p)=c("x", "y")
  S=sp::SpatialPoints(grid.p); sp::gridded(S)<-TRUE
  m=gstat::vgm(50, "Exp", 100)
  #krigging
  preds = apply(x, 2, function(otu) {
    otu[otu<cutoff] = cutoff
    spj=cbind(coords,otu)
    colnames(spj)=c("x", "y", "otu")
    spj.g=gstat::gstat(id="Log10.freq", formula=log10(otu)~1,locations=~x+y,data=spj,model=m)
    gstat::predict.gstat(spj.g, grid.p, quiet=T)$Log10.freq.pred
  })
  #formatting the output
  colnames(preds) = rownames(otus.table)
  rownames(preds) = paste("s", 1:nrow(grid.p), sep=".")
  row.names(grid.p) = rownames(preds)
  if(return.metabarcoding.data==F) {
    out = data.frame(preds, grid.p)
  } else{ 
    out = metabarcoding.data(preds, grid.p, otus.table)
  }
  return(out)
 }
 #' Maps of krigged log10-transformed frequencies
 #' 
 #' Maps the output of extrapol.freq
 #' 
 #' 
 #' @param x an extrapol.freq output
 #' @param path the path of the folder to export the map. Default is \code{NULL} and map is printed in Rplot/quartz
 #' @param col.names a vector containing the names of the columns to be used for defining the file name. Typically
 #'                  the column names containing the taxonomic information and/or sequence/motus id.
 #' @param index  an integer indicating column number of the motu/sequence to be plotted.
 #' @param cutoff  lower motu frequency accepted to consider motu abundance as different
 #'                from 0. Should be the same than the one used in extrapol.freq
 #' @param add.points a 3-column data.frame containing factor levels and associated x and y coordinates
 #'                   to be added to the map. Typically taxa observed in the field.
 #' @param adj a value used for adjusting text position in the map. Default is \code{4}
 #'
 #' @return a map/png file displaying motus distribution.
 #'
 #' @examples
 #' 
 #' data(termes)
 #' attr(termes, "samples")[c("x", "y")] = expand.grid(1:7,1:3)
 #' 
 #' #compute frequencies
 #' attr(termes, "layers")[["reads.freq"]] = normalize(termes, MARGIN=1)$reads
 #' 
 #' # Getting extrapolations
 #' termes.pred = extrapol.freq(attr(termes, "layers")[["reads.freq"]], 
 #' grid.grain=100,termes$samples[,c("x", "y")], termes$motus, cutoff=1e-3)
 #' 
 #' #mapping the distribution of the 3 most abundant sequences (caution, mfrow does not work for lattice's levelplot)
 #' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 1, cutoff=1e-3)
 #' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 2, cutoff=1e-3)
 #' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 3, cutoff=1e-3)
 #' 
 #' #dummy observationnal data
 #' termes.obs = data.frame(x=c(2,3,5), y=c(2.7,2,2.6), taxa = rep("Isoptera Apicotermitinae", 3))
 #' map.extrapol.freq(termes.pred, path=NULL, col.name=NULL, 3, cutoff=1e-3, add.points=termes.obs)
 #' 
 #' @seealso \code{\link{extrapol.freq}}, and \code{levelplot} from \code{lattice} package
 #' @author Lucie Zinger
 #' @export
 map.extrapol.freq = function(x, path=NULL, col.name=NULL, index, cutoff=1e-3, add.points=NULL, adj=4) {
  require(lattice)
  if(!is.null(path)) {
    x.motus = apply(x$motus,2,as.character)
    name = gsub("\\.", "_", paste(gsub(", ", "_", toString(x.motus[index,col.name])), x.motus[index,"id"], sep="_"))
    file.out = paste(path, "/", name, ".png", sep="")
  }
  z=x$reads[,index]
  z[abs(z)>abs(log10(cutoff))]=log10(cutoff)
  z[z>0] = 0
  spj=as.data.frame(cbind(x$samples,z))
  colnames(spj)=c("x", "y", "z")
  map.out=levelplot(z~x+y, spj, col.regions=topo.colors(100), 
                    at=seq(log10(cutoff),log10(1), by=0.2), 
                    colorkey=list(at=seq(log10(cutoff),log10(1), by=0.2),
                                  labels=list(at=seq(log10(cutoff),log10(1), by=0.2),
                                              labels=round(10^seq(log10(cutoff),log10(1), by=0.2),3))),
                    aspect = "iso", contour=F, main=list(label=x$motus[index, "id"], cex=0.7))         
  if(!is.null(path)) {
    png(file=file.out, width=800, height=800)  
    print(map.out)
    if(!is.null(add.points)) {
      n = (max(spj[,"y"])-min(spj["y"]))/length(unique(spj[,"y"]))*adj
      trellis.focus("panel", 1, 1, highlight=FALSE)
      lpoints(add.points[,"x"], add.points[,"y"], cex=0.7, lwd=3, col="red")
      ltext(add.points[,"x"], add.points[,"y"]+n, add.points[,-match(c("x", "y"), colnames(add.points))], col="red", cex=1.5)
      trellis.unfocus()
    }
    dev.off() 
  } else {
    print(map.out)
    if(!is.null(add.points)) {
      n = (max(spj[,"y"])-min(spj["y"]))/length(unique(spj[,"y"]))*adj
      trellis.focus("panel", 1, 1, highlight=FALSE)
      lpoints(add.points[,"x"], add.points[,"y"], cex=0.7, lwd=3, col="red")
      ltext(add.points[,"x"], add.points[,"y"]+n, add.points[,-match(c("x", "y"), colnames(add.points))], col="red", cex=1)
      trellis.unfocus()
    }
  }
 }
--- a/ROBITools/R/experimental.section.R
+++ b/ROBITools/R/experimental.section.R
@@ -0,0 +1,206 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 #11.03.2011
 #L.Zinger
 #######################
 #function anosim.pw
 #######################
 #computes pairwise anosim computation
 #input:
 #dat: dissimilarity matrix
 #g: factor defining the grouping to test
 #permutations: nb of permutation to access anosim statistics
 #p.adjust.method: method of correction for multiple-testing
 #
 #output: a distance-like table containing:
 #in the upper triangle: the anosims R values
 #in the lower triangle: the adjusted p-values
 ### start
 anosim.pw<-function(dat, g, permutations, p.adjust.method, ...) {
 	require(vegan)
 	#data.trasformation
 	dat<-as.matrix(dat)
 	g<-factor(g)
 	#empty object for result storage
 	ano<-matrix(NA, nrow=nlevels(g), ncol=nlevels(g), dimnames=list(levels(g),levels(g)))
 	p.val.tmp<-NULL
 	#running anosims
 	for(i in 1:(nlevels(g)-1)) for(j in (i+1):nlevels(g)){
 			tmp<-anosim(as.dist(dat[c(which(g==levels(g)[i]),which(g==levels(g)[j])),
 									c(which(g==levels(g)[i]),which(g==levels(g)[j]))]),
 					c(rep(levels(g)[i], length(which(g==levels(g)[i]))),
 							rep(levels(g)[j], length(which(g==levels(g)[j])))), permutations)
 			ano[i,j]<-tmp$statistic
 			p.val.tmp<-append(p.val.tmp, tmp$signif)
 		}
 	#p value correction for multiple comparison
 	p.val.tmp<-p.adjust(p.val.tmp, p.adjust.method )
 	#put the corrected p values in the anosim table
 	tmp<-NULL
 	tmp2<-NULL
 	for(i in 1:(nlevels(g)-1)) for(j in (i+1):nlevels(g)){
 			tmp<-append(tmp,i)
 			tmp2<-append(tmp2,j)
 		}
 	for(i in 1:length(p.val.tmp)){
 		ano[tmp2[i],tmp[i]]<-p.val.tmp[i]}
 	return(ano)
 }
 ### end
 #23 Nov 2012
 #L.Zinger
 ###################
 #function MOTUtable
 ###################
 # Generates ready-to-use MOTU tables and basic statistics on samples (i.e. sequencing depth, raw richness, and invsimpson index)
 #input:
 #x: an obitable output (samples should be indicated as e.g. "sample.A01r" in column names)
 #y: the column name by which that data are to be aggregated. Should be e.g. "cluster" or "species_name"
 #outputs:
 #x.otu: the ready-to-use MOTU table
 #x.rawstats: basic statistics on samples
 ### start
 MOTUtable<-function(x, y) {
 	require(vegan)
 	nom<-as.character(substitute(x))
 	tmp<-x[,c(grep(y, colnames(x)), grep("sample", colnames(x)))]
 	tmp2<-t(aggregate(tmp[,-1], by=list(tmp[,1]), sum))
 	x.otu<-tmp2[-1,]
 	colnames(x.otu)<-paste(y,tmp2[1,], sep=".")
 	x.rawstats<-data.frame(Nb_ind=rowSums(x.otu), Raw_richness=specnumber(x.otu, MARGIN=1), Raw_eveness=diversity(x.otu, "invsimpson", MARGIN=1) )
 	#may have a pb in the rowSums depending on the R version (allows or not non-numeric)
 	assign(paste(nom, y, sep="."),x.otu,env = .GlobalEnv)
 	assign(paste(nom, y, "rawstats", sep="."),x.rawstats,env = .GlobalEnv)
 }
 ### end
 #26 Nov 2012
 #F.Boyer
 ###################
 #function reads.frequency & filter.threshold
 ###################
 #can be used to filter the table of reads to have the sequences that represents at least 95% of the total reads by sample
 #
 #e.g. reads.treshold(reads.frequency(metabarcodingS4Obj@reads), 0.95)
 filter.threshold <- function(v, threshold) {
 	o <- order(v, decreasing=T)
 	ind <- which(cumsum(as.matrix(v[o]))>threshold)
 	v[-o[seq(min(length(o), 1+length(o)-length(ind)))]] <- 0
 	v
 }
 reads.threshold  <- function (reads, threshold, by.sample=T) {
 	res <- apply(reads, MARGIN=ifelse(by.sample, 1, 2), filter.threshold, thr=threshold)
 	if (by.sample) res <- t(res)
 	data.frame(res)
 }
 reads.frequency <- function (reads, by.sample=T) {
 	res <- apply(reads, MARGIN=ifelse(by.sample, 1, 2), function(v) {v/sum(v)})
 	if (by.sample) res <- t(res)
 	data.frame(res)
 }
 #06 Jan 2013
 #F.Boyer
 ###################
 #function removeOutliers
 ###################
 #given a contengency table and a distance matrix
 #returns the list of samples that should be removed in order to have only 
 #distances below thresold
 #can't return only one sample
 #
 #e.g. intraBad <- lapply(levels(sample.desc$sampleName), function(group) {samples<-rownames(sample.desc)[sample.desc$sampleName==group]; removeOutliers(contingencyTable[samples,], thr=0.3, distFun = function(x) vegdist(x, method='bray'))})
 #require(vegan)
 removeOutliers <- function(m, thr=0.3, distFun = function(x) vegdist(x, method='bray') ) {
 	distMat <- as.matrix(distFun(m))
 	maxM <- max(distMat)
 	theBadGuys =c()
 	while (maxM>thr) {
 		bad <- apply(distMat, MARGIN=1, function(row, maxM) {any(row==maxM)}, maxM=maxM)    
 		bad <- names(bad)[bad]
 		bad <- apply(distMat[bad,], MARGIN=1, mean)
 		badGuy <- names(bad)[bad==max(bad), drop=F][1]
 		theBadGuys <- c(theBadGuys, badGuy)
 		stillok <- rownames(distMat) != badGuy
 		distMat <- distMat[stillok, stillok, drop=F]
 		maxM <- max(distMat)
 	}
 	if (length(theBadGuys) >= (nrow(m)-1)) {
 		theBadGuys <- rownames(m)    
 	}
 	theBadGuys
 }
 #31.05.2013
 #L.Zinger
 #getAttrPerS, a function allowing to get the values of a sequence attribute per sample
 #(e.g. best_identities, etc...) the output is a list with one dataframe per sample.
 #This dataframe contains:
 #	 first column (named as attr): the attribute value for each sequence present in the sample
 #	 second column (named weight): the corresponding number of reads in the sample
 getAttrPerS=function(x,attr){
 	#x: a metabarcoding object
 	#attr: a character object corresponding to the attribute
 	#for which values per sample are needed (should be equal to a colname in x@motus)
 	if(class(x)[1]!= "metabarcoding.data") {
 		stop("x is not a metabarcoding S4 object")
 	}
 	if(is.character(attr)==F) {
 		stop("attr is not a character object")
 	}
 	x.motus = motus(x)
 	x.reads = reads(x)
 	otu = apply(x.reads, 1, function(y) x.motus[match(names(y[which(y!=0)]),x.motus$id), grep(attr, colnames(x.motus))])
 	reads = apply(x.reads, 1, function(y) y[which(y!=0)])
 	output = mapply(cbind, otu, reads)
 	output = lapply(output, function(y) {
 				colnames(y)=c(attr,"weight")
 				return(y)
 			})
 	return(output)
 }
 ### end getAttrPerS
--- a/ROBITools/R/export-metabarcoding.R
+++ b/ROBITools/R/export-metabarcoding.R
@@ -0,0 +1,62 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 # TODO: Add comment
 # 
 # Author: coissac
 ###############################################################################
 require(utils)
 expand.metabarcoding.data=function(data,minread=1) {
 	resultonesample=function(sample) {
 		mo= data@reads[sample,] >= minread
 		s = data@samples[rep(sample,sum(mo)),]
 		r = as.numeric(data@reads[sample,mo])
 		m = data@motus[mo,]
 		result = data.frame(s,frequency=r,m, 
 				stringsAsFactors =FALSE,
 				row.names = NULL)
 		result
 	}
 	res = lapply(1:data@scount, resultonesample)
 	do.call(rbind,res)	
 }
 #setGeneric("utils::write.csv")
 write.csv.metabarcoding.data = function(...) {
 	Call <- match.call(expand.dots = TRUE)
 	if (!is.null(Call[["minread"]])) {
 		minread = Call[["minread"]]
 		Call = Call[!names(Call)=="minread"]
 	}
 	else
 		minread = 1
 	data = eval.parent(Call[[2L]])
 	data = expand.metabarcoding.data(data,minread)
 	Call[[1L]] <- as.name("write.csv")
 	Call[[2L]] <- as.name("data")
 	eval(Call)
 }
 #setGeneric("utils::write.csv2")
 write.csv2.metabarcoding.data = function(...) {
 	Call <- match.call(expand.dots = TRUE)
 	if (!is.null(Call[["minread"]])) {
 		minread = Call[["minread"]]
 		Call = Call[!names(Call)=="minread"]
 	}
 	else
 		minread = 1
 	data = eval.parent(Call[[2L]])
 	data = expand.metabarcoding.data(data,minread)
 	Call[[1L]] <- as.name("write.csv2")
 	Call[[2L]] <- as.name("data")
 	eval(Call)
 }
--- a/ROBITools/R/import.metabarcoding.R
+++ b/ROBITools/R/import.metabarcoding.R
@@ -0,0 +1,106 @@
 #' @include read.obitab.R
 #' @include 02_class_metabarcoding.data.R
 NULL
 #' Read a data file produced by the \code{obitab} command 
 #' 
 #' Read a data file issued from the conversion of a \strong{fasta} 
 #' file to a tabular file by the \code{obitab} command of the 
 #' \strong{OBITools} package 
 #' 
 #' @param file a string containing the file name of the obitab file.
 #' @param sep  Column separator in the obitab file. 
 #'             The default separator is the tabulation.
 #' @param sample A regular expression allowing to identify columns 
 #'               from the file describing abundances of sequences per sample
 #' @param sample.sep Separator between combined sample name.
 #' @param attribute Separator used to split between sample 'tag' and sample name.
 #' 
 #' @return a \code{\link{metabarcoding.data}} instance
 #' 
 #' @examples
 #' require(ROBITools)
 #' 
 #' \dontshow{# switch the working directory to the data package directory}
 #' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
 #' 
 #' # read the termes.tab file
 #' termes=import.metabarcoding.data('termes.tab')
 #' 
 #' # print the number of samples and motus described in the file
 #' dim(termes)
 #'   
 #' @seealso \code{\link{metabarcoding.data}}
 #'
 #' @author Eric Coissac
 #' @keywords DNA metabarcoding
 #' @export
 #' 
 import.metabarcoding.data = function(file,sep='\t',sample="sample",sample.sep="\\.",attribute=":") {
 	data=read.obitab(file,sep=sep)
 	# get the colnames matching the sample pattern
 	column=colnames(data)
 	pat = paste('(^|',sample.sep,')',sample,'[',sample.sep,attribute,']',sep='')
 	scol= grep(pat,column)
 	# reads informations about samples
 	reads  = data[,scol]
 	names  = colnames(reads)
 	names  = strsplit(names,split=attribute)
 			# for sample name just remove the first part of the col names
 			# usally "sample:"
 	sample.names = sapply(names,function(a) paste(a[-1],collapse=attribute))	
 	reads=t(reads)
 	rownames(reads)=sample.names
 	# sample's data
 	sample.data = data.frame(t(data.frame(strsplit(sample.names,split=attribute))))
 	rownames(sample.data)=sample.names
 	colnames(sample.data)=strsplit(names[[1]][1],split=attribute)
 	# motus information 
 	motus = data[,-scol]
 	motus.id = motus$id
 	rownames(motus)=motus.id
 	colnames(reads)=motus.id
 	return(metabarcoding.data(reads,sample.data,motus))
 }
 #pcr = gh[,grep('^sample',colnames(gh))]
 #pcr.names = colnames(pcr)
 #pcr.names = sub('sample\\.','',pcr.names)
 #sequencer = rep('Solexa',length(pcr.names))
 #sequencer[grep('454',pcr.names)]='454'
 #sequencer=factor(sequencer)
 #
 #tmp = strsplit(pcr.names,'\\.[A-Z](sol|454)\\.')
 #
 #sample = sapply(tmp,function(x) x[1])
 #locality = factor(sapply(strsplit(sample,'_'),function(x) x[1]))
 #sample = factor(sample)
 #repeats= factor(sapply(tmp,function(x) x[2]))
 #
 #tmp = regexpr('[A-Z](454|sol)',pcr.names)
 #run=factor(substr(pcr.names,tmp,tmp+attr(tmp,"match.length")-1))
 #
 #pcr.metadata = data.frame(run,sequencer,locality,sample,repeats)
 #
 #rownames(pcr.metadata)=pcr.names
--- a/ROBITools/R/import.ngsfilter.R
+++ b/ROBITools/R/import.ngsfilter.R
@@ -0,0 +1,79 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 #' Read ngsfilter text file
 #' 
 #' Reads  the text file used for assigning reads to samples with the
 #'  \code{ngsfilter} command of the \strong{OBITools} package. 
 #' 
 #' @param file a string containing the file name for the \code{ngsfilter} command.
 #' @param platewell a string corresponding to the tag used for storing the sample location
 #'                  in the PCR plate. Should be of the form "nbPlate_Well" (e.g. "01_A02").
 #'                  Default is \code{NULL}
 #' @return \code{\link{import.ngsfilter.data}} returns a \code{\link{data.frame}} instance
 #' 
 #' @examples
 #' \dontshow{# switch the working directory to the data package directory}
 #' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
 #' 
 #' data(termes)
 #' 
 #' # reading the termes_ngsfilt.txt file
 #' termes.ngs=import.ngsfilter.data('termes_ngsfilt.txt', platewell="position")
 #' 
 #' # including ngsfilter data into termes data
 #' attr(termes, "samples") = termes.ngs[rownames(termes),]
 #' 
 #' colnames(termes$samples)
 #'   
 #' @seealso \code{\link{import.metabarcoding.data}} and \code{\link{read.obitab}} for other methods of data importation 
 #'
 #' @author Lucie Zinger
 #' @keywords DNA metabarcoding
 #' @export
 #' 
 import.ngsfilter.data = function(file, platewell=NULL) {
  raw = read.table(file, sep="\t")
  #get samples names
  names = raw[,2]
  #form first part of the output table (default ngsfilter text input)
  out = raw[,-c(2,3,ncol(raw))]
  colnames(out) = c("Experiment", "primerF", "primerR")
  #add tags
  out[,c("tagF", "tagR")] = do.call("rbind", strsplit(as.vector(raw[,3]), "\\:"))
  #collect nb and names of additionnal information
  max.add = max(unlist(lapply(strsplit(gsub("^F @ ","", raw[, ncol(raw)]), "; "), length)))
  names.add = unique(unlist(lapply(strsplit(unlist(strsplit(gsub("^F @ ","", raw[, ncol(raw)]), "; ")), "="), "[[",1)))
  #form table of additionnal info
  form = lapply(strsplit(gsub("^F @ ","", raw[, ncol(raw)]), "; "), strsplit, "=")
  additionnals = as.data.frame(do.call("rbind", lapply(form, function(y) {
    val = rep(NA, , max.add)
    names(val) = names.add
    val[match(unlist(lapply(y, "[[", 1)), names(val))] = gsub(";", "",unlist(lapply(y, "[[", 2)))
    val
  })))
  #create PCR plate coordinates
  if(!is.null(platewell)) {
    form = strsplit(as.vector(additionnals[, platewell]), "_")
    nbPlate = as.numeric(gsub("^0", "", unlist(lapply(form, "[[", 1))))
    wellPlate = unlist(lapply(form, "[[", 2))
    xPlate = as.numeric(gsub("[A-Z]", "", wellPlate))
    yPlate = as.numeric(as.factor(gsub("[0-9]*", "", wellPlate))) + 8*nbPlate
    additionnals = additionnals[,-grep(platewell, colnames(additionnals))]
    out = data.frame(out, additionnals, nbPlate, wellPlate, xPlate, yPlate)
  }
  else {
    additionnals[,ncol(additionnals)] = gsub(";","", additionnals[,ncol(additionnals)])
    out = data.frame(out, additionnals)
  }
  rownames(out) = names
  return(out)
 }
--- a/ROBITools/R/layers.metabarcoding.R
+++ b/ROBITools/R/layers.metabarcoding.R
@@ -0,0 +1,119 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 #
 #
 # Managment of layers
 #
 # Layers a matrix or factors with the same dimension
 # than the read matrix
 #
 # get motus data.frames
 #' @export
 setGeneric("layer.names", function(obj) {
  return(standardGeneric("layer.names"))
 })
 #' Returns the names of all the layers 
 #' 
 #' \code{layer.names} extracts the list of all the layer
 #' names attached to a \code{\link{metabarcoding.data}} instance.
 #' 
 #' @param   obj a \code{\link{metabarcoding.data}} instance
 #' @return  a vector of type \code{character} containing the
 #'          list of all the layer names.
 #' 
 #' @docType methods
 #' @rdname layer.names-methods
 #' @aliases layer.names-methods,metabarcoding.data
 #' 
 setMethod("layer.names", "metabarcoding.data", function(obj) {
  return(names(obj@layers))
 })
 #' Returns the a layer associated to a \code{\link{metabarcoding.data}}
 #' 
 #' [[ operator Extracts a layer
 #' attached to a \code{\link{metabarcoding.data}} instance.
 #' 
 #' @usage \method{[[}{unmutable}(x,i)
 #'
 #' @param   x a \code{\link{metabarcoding.data}} instance
 #' @return  matrix or a factor.
 #' 
 #' @docType methods
 #' @rdname double-open-brace-methods
 #' @aliases double-open-brace-methods,metabarcoding.data
 #' @method [[
 #' @export
 #' 
 setMethod("[[", "metabarcoding.data", 
          function(x, i, j, ...) {
            if (! is.character(i))
              stop('Just named index must be used')  
            if (i=="reads")
              return(x@reads)
            if (i=="samples")
              return(x@samples)
            if (i=="motus")
              return(x@motus)
            if (i=="reads")
              return(x@reads)
            return(x@layers[[i,exact=TRUE]])
          })
 #' @method $
 #' @export
 setMethod("$", "metabarcoding.data", 
          function(x, name) {
            return(x[[name]])
          })
 # set one data layer data.frames
 #' @method [[<-
 #' @export
 setMethod("[[<-","metabarcoding.data", 
          function(x, i, j, ...,value) {
            if (any(dim(value)!=c(x@scount,x@mcount)))
              stop("data dimmension are not coherent with this metabarcoding.data")
            if (hasArg('j'))
              stop('Just one dimension must be specified')
            if (! is.character(i))
              stop('Just named index must be used')
            if (i=='reads')
              stop('you cannot change the reads layer by this way')
            if (i=='motus' | i=='samples')
              stop('layers cannot be names motus or samples')
            value = as.factor.or.matrix(value)
            rownames(value)=rownames(x@reads)
            colnames(value)=colnames(x@reads)
            x@layers[[i]]=value
            return(x)
          })
 #' @method $<-
 #' @export
 setMethod("$<-","metabarcoding.data", 
          function(x, name, value) {
            x[[name]]=value
            return(x)
          })
--- a/ROBITools/R/metabarcoding_threshold.R
+++ b/ROBITools/R/metabarcoding_threshold.R
@@ -0,0 +1,378 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 #' @export
 setGeneric("marginalsum", function(data,MARGIN="sample", na.rm = FALSE) {
 			return(standardGeneric("marginalsum"))
 		})
 #' Computes marginal sums over read counts.
 #' 
 #' Method \code{marginalsum} computes marginal sums over read counts of
 #' a \code{\link{metabarcoding.data}} instance.
 #' 
 #' @param data The \code{\linkS4class{metabarcoding.data}} instance
 #'             on which marginal sums have to be computed.
 #' @param MARGIN Indicates if the sums have to be computed across 
 #'               samples or motus. 
 #'               Allowed values are :
 #'               \itemize{
 #'                 \item{'sample' or 1} for computing sum across samples
 #'                 \item{'motu' or 2} for computing sum across motus
 #'                 }
 #' @param na.rm  Logical. Should missing values be omitted from the 
 #'               calculations?
 #'               
 #' @return Returns the vector of marginal sums as a \code{numeric} vector
 #' 
 #' @examples
 #' # load termite data set from the ROBITools sample data
 #' data(termes)
 #' 
 #' # Computes marginal sums per sample
 #' ssum = marginalsum(termes,MARGIN="sample")
 #' 
 #' # Computes marginal sums per MOTU
 #' msum = marginalsum(termes,MARGIN="motu")
 #' 
 #' @seealso \code{\linkS4class{metabarcoding.data}}
 #'
 #' @docType methods
 #' @rdname marginalsum-methods
 #' @aliases marginalsum-methods,metabarcoding.data
 #' @author Aurelie Bonin
 #' 
 setMethod("marginalsum", "metabarcoding.data", function(data,MARGIN='sample', na.rm = FALSE) {
 	if (MARGIN == 'sample')
 		MARGIN=1
 	if (MARGIN == 'motu')
 		MARGIN=2
 	readcount = reads(data)
  if (MARGIN==1)    
 	  margesum = rowSums(readcount,na.rm=na.rm)
  else
    margesum = colSums(readcount,na.rm=na.rm)
 	return(margesum)
 })
 rowSums.metabarcoding.data = function (x, na.rm = FALSE, dims = 1L) {
  print("coucou")
 }
 #' @export
 setGeneric("normalize", function(data,MARGIN='sample',as.matrix=FALSE) {
 			return(standardGeneric("normalize"))
 		})
 #' Normalizes read counts by sample or by MOTU.
 #' 
 #' Method \code{normalize} computes a normalized read aboundancy matrix
 #' (relative frequency matrix) of a \code{\link{metabarcoding.data}} instance.
 #' Normalization can be done according aboundancies per sample or per MOTU.
 #' 
 #' @param data The \code{\linkS4class{metabarcoding.data}} instance
 #'             on normalisation have to be computed.
 #' @param MARGIN Indicates if the sums have to be computed across 
 #'               samples or motus. 
 #'               Allowed values are :
 #'               \itemize{
 #'                 \item{'sample' or 1} for computing sum across samples
 #'                 \item{'motu' or 2} for computing sum across motus
 #'                 }
 #' @param as.matrix Logical indicating if the normalized aboundancies
 #'               must be returned as a simple \code{matrix} (TRUE) or as a new
 #'               instance of the \code{\linkS4class{metabarcoding.data}} class
 #'               (FALSE, the default case).
 #'               
 #' @return Returns a new instance of \code{\linkS4class{metabarcoding.data}}
 #'         or a \code{numeric} matrix according to the \code{return.as.matrix}
 #'         parameter.
 #' 
 #' @examples
 #' # load termite data set from the ROBITools sample data
 #' data(termes)
 #' 
 #' # Computes normalized aboundancies per sample
 #' termes.norm = normalize(termes,MARGIN="sample")
 #' 
 #' # Computes normalized aboundancies per sample and
 #' # stores the result as a new layer into the thermes
 #' # structure
 #' termes$normalized = normalize(termes,MARGIN="sample",as.matrix=TRUE)
 #' 
 #' @seealso \code{\linkS4class{metabarcoding.data}}
 #'
 #' @docType methods
 #' @rdname normalize-methods
 #' @aliases normalize-methods,metabarcoding.data
 #' @author Aurelie Bonin
 #' 
 setMethod("normalize", "metabarcoding.data", function(data,MARGIN="sample",as.matrix=FALSE) {
 	if (MARGIN == 'sample')
 		MARGIN=1
 	if (MARGIN == 'motu')
 		MARGIN=2
 	readcount = reads(data)
 	margesum  = marginalsum(data,MARGIN,na.rm=TRUE)
 	readcount = sweep(readcount,MARGIN,margesum, FUN="/")
  if (as.matrix)
    newdata=readcount
  else
 	  newdata = copy.metabarcoding.data(data,reads=readcount)
 	return(newdata)
 })
 #' @export
 setGeneric("threshold", function(data,MARGIN="sample",threshold=0.97) {
 			return(standardGeneric("threshold"))
 		})
 #' Compute the cumulative threshold of read aboundances.
 #' 
 #' The method \code{threshold} of the class \code{\linkS4class{metabarcoding.data}}
 #' computes the thresold to be used for conserving just a part of the global
 #' signal. This thresold is computed by ranking aboundances by decreasing order.
 #' The cululative sums of these ranked abondencies are computed and the aboundance
 #' corresponding to the first sum greater than the threshold is returned as result.
 #' 
 #' @param data The \code{\linkS4class{metabarcoding.data}} instance
 #'             on normalisation have to be computed.
 #' @param MARGIN Indicates if the sums have to be computed across 
 #'               samples or motus. 
 #'               Allowed values are :
 #'               \itemize{
 #'                 \item{'sample' or 1} for computing sum across samples
 #'                 \item{'motu' or 2} for computing sum across motus
 #'                 }
 #' @param threshold a numeric value between 0 and 1 indicating which part of 
 #'                  the signal must be conserved. Default value is setup to
 #'                  0.97 (97% of the total signal).
 #'                  
 #' @return a numeric vector containing the limit aboundancy to consider for
 #'         each sample or each MOTU according to the value of the \code{MARGIN} 
 #'         parameter.
 #'                  
 #' @examples
 #' # load termite data set from the ROBITools sample data
 #' data(termes)
 #' 
 #' # computes threshold value to used for keep 95% of 
 #' # the reads per MOTU
 #' 
 #' t = threshold(termes,MARGIN='motu',threshold=0.95)
 #'  
 #' @seealso \code{\linkS4class{metabarcoding.data}}, \code{\link{threshold.mask}}
 #'
 #' @docType methods
 #' @rdname threshold-methods
 #' @aliases threshold-methods,metabarcoding.data
 #' @author Aurelie Bonin
 #'
 setMethod("threshold", "metabarcoding.data", function(data,MARGIN="sample",threshold=0.97) {
 	onethreshold=function(x,threshold) {
 		s = x[order(-x)]
 		cs= cumsum(s) 
    total=cs[length(cs)]
    if (total > 0) {
 		  cs= cs / total
 		  cs = cs > threshold
 		  t = s[cs][1]
    }
    else t=0
    return(t)
 	}
 	if (MARGIN == 'sample')
 		MARGIN=1
 	if (MARGIN == 'motu')
 		MARGIN=2
 	readcount = reads(data)
 	t = apply(readcount,MARGIN,onethreshold,threshold)
 	return(t)
 })
 #' @export
 setGeneric("threshold.mask", function(data,MARGIN,threshold=0.97,operator='<') {
 			return(standardGeneric("threshold.mask"))
 		})
 #' Computes a cumulatif thresold mask for filtering read aboundancies.
 #' 
 #' The method \code{threshold.mask} of the class \code{\linkS4class{metabarcoding.data}}
 #' computes a logical matrix of the same size than the read matrix of the data parameter.
 #' Each cell of this matrix contains a \code{TRUE} or a \code{FALSE} value according to the
 #' relationship existing between the read abondancy and the corresponding theshold as computed
 #' by the \code{\link{theshold}} method.
 #' 
 #' (computed value) = (read aboundancy) operator (threshold value)
 #' 
 #' for a cell in the result matrix, \code{(read aboundancy)} is extracted from the read layer.
 #' \code{operator} is a comparaison operator and \code{(threshold value)} is estimated with the
 #' \code{\link{theshold}} method.
 #' 
 #' @param data The \code{\linkS4class{metabarcoding.data}} instance
 #'             on normalisation have to be computed.
 #' @param MARGIN Indicates if the sums have to be computed across 
 #'               samples or motus. 
 #'               Allowed values are :
 #'               \itemize{
 #'                 \item{'sample' or 1} for computing sum across samples
 #'                 \item{'motu' or 2} for computing sum across motus
 #'                 }
 #' @param threshold a numeric value between 0 and 1 indicating which part of 
 #'                  the signal must be conserved. Default value is setup to
 #'                  0.97 (97% of the total signal).
 #' @param operator is a logical comparison operator.
 #' 
 #' @return A logical matrix usable for selecting cell in the read aboundancy matrix.
 #'                   
 #' @seealso \code{\linkS4class{metabarcoding.data}}, \code{\link{threshold.mask}}, \code{\link{threshold}}
 #'
 #' @docType methods
 #' @rdname threshold-mask-methods
 #' @aliases threshold.mask-methods,metabarcoding.data
 #' @author Aurelie Bonin
 #' 
 setMethod("threshold.mask", "metabarcoding.data", function(data,MARGIN,threshold=0.97,operator='<') {
 	if (MARGIN == 'sample')
 		MARGIN=1
 	if (MARGIN == 'motu')
 		MARGIN=2
 	readcount = reads(data)
 	t = threshold(data,MARGIN,threshold)
 	mask = apply(readcount,c(2,1)[MARGIN],operator,t)
 	if (MARGIN==2)
 		mask = t(mask)
 	return(mask)
 })
 #' @export
 setGeneric("const.threshold.mask", function(data,MARGIN,threshold=0.01,operator='<') {
 			return(standardGeneric("const.threshold.mask"))
 		})
 #' Computes a constant thresold mask for filtering read aboundancies.
 #' 
 #' The method \code{const.threshold.mask} of the class \code{\linkS4class{metabarcoding.data}}
 #' computes a logical matrix of the same size than the read matrix of the data parameter.
 #' Each cell of this matrix contains a \code{TRUE} or a \code{FALSE} value according to the
 #' relationship existing between the read abondancy and the global theshold.
 #' 
 #' (computed value) = (normalized read aboundancy) operator (threshold value)
 #' 
 #' for a cell in the result matrix, \code{(normalized read aboundancy)} is extracted from the read layer
 #' after normalization.
 #' \code{operator} is a comparaison operator and \code{(threshold value)} is estimated with the
 #' \code{\link{theshold}} method.
 #' 
 #' @param data The \code{\linkS4class{metabarcoding.data}} instance
 #'             on normalisation have to be computed.
 #' @param MARGIN Indicates if the sums have to be computed across 
 #'               samples or motus. 
 #'               Allowed values are :
 #'               \itemize{
 #'                 \item{'sample' or 1} for computing sum across samples
 #'                 \item{'motu' or 2} for computing sum across motus
 #'                 }
 #' @param threshold a numeric value between 0 and 1 indicating which part of 
 #'                  the signal must be conserved. Default value is setup to
 #'                  0.01 (1% of the normalized signal).
 #' @param operator is a logical comparison operator.
 #' 
 #' @return A logical matrix usable for selecting cell in the read aboundancy matrix.
 #'                   
 #' @seealso \code{\linkS4class{metabarcoding.data}}, \code{\link{threshold.mask}}, \code{\link{normalize}}
 #'
 #' @docType methods
 #' @rdname const-threshold-mask-methods
 #' @aliases const.threshold.mask-methods,metabarcoding.data
 #' @author Aurelie Bonin
 #' 
 setMethod("const.threshold.mask", "metabarcoding.data", function(data,MARGIN,threshold=0.01,operator='<') {
  if (MARGIN == 'sample')
    MARGIN=1
  if (MARGIN == 'motu')
    MARGIN=2
  readcount = normalize(data,MARGIN,as.matrix=TRUE)	
  mask = do.call(operator,list(readcount,threshold))
  return(mask)
 })
 #' @export
 setGeneric("threshold.set", function(data,
 				MARGIN,
 				threshold=0.97,
 				operator='<',
 				value=0,
 				normalize=TRUE,
 				mask.fun=threshold.mask) {
 			return(standardGeneric("threshold.set"))
 		})
 setMethod("threshold.set", "metabarcoding.data", function(data,
 		MARGIN,
 		threshold=0.97,
 		operator='<',
 		value=0,
 		normalize=TRUE,
 		mask.fun=threshold.mask) {
 	if (MARGIN == 'sample')
 		MARGIN=1
 	if (MARGIN == 'motu')
 		MARGIN=2
 	readcount = reads(data)
 	if (normalize)
 		data = normalize(data,c(2,1)[MARGIN])
 	mask = mask.fun(data,MARGIN,threshold,operator)
 	readcount[mask] = value
 	newdata = copy.metabarcoding.data(data,reads=readcount)
 	return(newdata)
 })
--- a/ROBITools/R/mstat.R
+++ b/ROBITools/R/mstat.R
@@ -0,0 +1,407 @@
 #' @include 02_class_metabarcoding.data.R
 #' @import igraph
 NULL
 require(igraph)
 # pos = expand.grid(x,y)
 #' Computes the pairwise distance matrix as a data.frame where
 #' 
 #' @param x a vector for the X coordinates
 #' @param y a vector for the Y coordinates
 #' @param labels a vector with the sample names
 #' 
 #' @return a data.frame instance of three columns
 #'    - a : The label of the first sample
 #'    - b : The label of the second sample
 #'    - dist : The euclidian distance beween sample a and b
 #' 
 #' @examples
 #' data(termes)
 #' termes.ok = termes[,colSums(termes$reads)>0]
 #' pos = expand.grid(1:3 * 10,1:7 * 10)
 #' labels = rownames(termes.ok)
 #' d = dist.grid(pos[,1],pos[2],labels)
 #' 
 #' @export
 dist.grid = function(x,y,labels=NULL){
  pos = data.frame(x,y)
  if (is.null(labels))
    labels = as.character(interaction(pos))
  else
    labels = as.character(labels)
  llabels=length(labels)
  dpos=dist(pos)
  a = rep(labels[1:(llabels-1)],(llabels-1):1)
  b = do.call(c,(lapply(2:llabels, function(i) labels[i:llabels])))
  return(data.frame(a,b,dist=as.vector(dpos)))
 }
 #' Builds the list of sample groups included in a circle around a central sample
 #' 
 #' @param dtable a distance table between samples as 
 #'               computed by \code{\link{dist.grid}}
 #' @param radius the radius of the circle
 #' @param center a \code{logical} value indicating if the center of
 #'               the group must be included in the group
 #'               
 #' @return a list of vectors containing the labels of the group members 
 #'          
 #' @examples
 #' data(termes)
 #' termes.ok = termes[,colSums(termes$reads)>0]
 #' pos = expand.grid(1:3 * 10,1:7 * 10)
 #' labels = rownames(termes.ok)
 #' d = dist.grid(pos[,1],pos[2],labels)
 #' groups = dist.center.group(d,20)
 #' 
 #' @export
 dist.center.group=function(dtable,radius,center=TRUE) {
  fgroup = function(c) {
    ig = dtable[(dtable[,1]==c | dtable[,2]==c) & dtable[,3] <= radius,]
    return(union(ig[,1],ig[,2]))
  }
  pos = as.character(union(dtable[,1],dtable[,2]))
  g = lapply(pos,fgroup)
  names(g) = pos
  if (!center)
    g = mapply(setdiff,g,pos)
  return(g)
 }
 #' Builds the list of sample groups including samples closest than a define distance
 #' 
 #' A graph is build by applying the threshold \code{dmax} to the distance matrix
 #' A group is a clique max in this graph. Consequently all member pairs of a group
 #' are distant by less or equal to \code{dmax}.
 #' 
 #' @param dtable a distance table between samples as 
 #'               computed by \code{\link{dist.grid}}
 #' @param dmax the maximum distance between two samples
 #'               
 #' @return a list of vectors containing the labels of the group members 
 #'          
 #' @examples
 #' data(termes)
 #' termes.ok = termes[,colSums(termes$reads)>0]
 #' pos = expand.grid(1:3 * 10,1:7 * 10)
 #' labels = rownames(termes.ok)
 #' d = dist.grid(pos[,1],pos[2],labels)
 #' groups = dist.clique.group(d,20)
 #' 
 #' @export
 dist.clique.group=function(dtable,dmax,center=True) {
  gp = igraph::graph.edgelist(as.matrix(dtable[dtable$dist <= dmax,c('a','b')]),directed=FALSE)
  g  = igraph::maximal.cliques(gp)
  return(lapply(g, function(i) igraph::V(gp)$name[i]))
 }
 #' Computes the univariate M statistics 
 #' 
 #' @param w the weigth matrix indicating the presence probability of each motu
 #'          in each samples. Each line corresponds to a sample and each column
 #'          to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
 #'          names.  It is nice but not mandatory if the \code{colnames} refer to the MOTU id.
 #'          
 #' @param groups the list of considered groups as computed by the \code{\link{dist.center.group}}
 #'               function
 #'
 #' @seealso \code{\link{dist.center.group}}
 #' @seealso \code{\link{m.weight}}
 #' 
 #' @examples
 #' data(termes)
 #' termes.ok = termes[,colSums(termes$reads)>0]
 #' pos = expand.grid(1:3 * 10,1:7 * 10)
 #' labels = rownames(termes.ok)
 #' d = dist.grid(pos[,1],pos[2],labels)
 #' groups = dist.center.group(d,20)
 #' w = m.weight(termes.ok)
 #' m = m.univariate(w,groups)
 #' 
 #' @references Marcon, E., Puech, F., and Traissac, S. (2012). 
 #'             Characterizing the relative spatial structure of point patterns. 
 #'             International Journal of Ecology, 2012.
 #'             
 #' @export
 m.univariate = function(w,groups) {
  nunivar = function(members,center) {
    g = w[members,]
    wn = colSums(g)
    wa = sum(wn)
    wn = wn - center
    wa = wa - center
    p = wn / wa * center
    return(p)
  }
  centers = lapply(names(groups),function(x) w[x,])
  Wf = colSums(w)
  Wa = sum(Wf)
  Denom.univar = colSums(w * (sweep(-w,2,Wf,'+') / (Wa - w)))
  Num.univar   = rowSums(mapply(nunivar,groups,centers))
  Munivar=Num.univar/Denom.univar
  Munivar[Denom.univar==0]=0
  return(Munivar)
 }
 #' Computes the bivariate M statistics 
 #' 
 #' The function computes the bivariate M statiscics for a set of target species around a set of
 #' focus species.
 #' 
 #' @param w1 the weigth matrix indicating the presence probability of each motu
 #'          used as focus species in each samples. Each line corresponds to a sample and each column
 #'          to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
 #'          names. It is nice but not mandatory if the \code{colnames} refer to the MOTU id.
 #'          
 #' @param w2 the weigth matrix indicating the presence probability of each motu
 #'          used as target species in each samples. Each line corresponds to a sample and each column
 #'          to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
 #'          names. It is nice but not mandatory if the \code{colnames} refer to the MOTU id.
 #'          if \code{w2} is not set, w1 is also used as target species. in this case the diagonal
 #'          of the matrix return contains the univariate M statistic for the diferent species.
 #'          
 #' @param groups the list of considered groups as computed by the \code{\link{dist.center.group}}
 #'               function
 #'               
 #' @return a matrix of M bivariate statistics with one focus species by row and one target species 
 #'         by columns If \code{w2} is not specified the diagonal of the matrix is equal to the univariate
 #'         M statistic of the corresponding species.
 #'
 #' @seealso \code{\link{dist.center.group}}
 #' @seealso \code{\link{m.weight}}
 #' 
 #' @examples
 #' data(termes)
 #' termes.ok = termes[,colSums(termes$reads)>0]
 #' pos = expand.grid(1:3 * 10,1:7 * 10)
 #' labels = rownames(termes.ok)
 #' d = dist.grid(pos[,1],pos[2],labels)
 #' groups = dist.center.group(d,20)
 #' w = m.weight(termes.ok)
 #' m = m.bivariate(w,groups)
 #' 
 #' @references Marcon, E., Puech, F., and Traissac, S. (2012). 
 #'             Characterizing the relative spatial structure of point patterns. 
 #'             International Journal of Ecology, 2012.
 #'             
 #' @export
 m.bivariate = function(w1,w2=NULL,groups) {
  nunbivar = function(members,center) {
    g = w2[members,]
    wn = colSums(g)
    wa = sum(wn)
    if (self){
      mwn  = wn %*% t(rep(1,length(wn)))
      diag(mwn)= wn - center
      wa = wa - center
      wna = mwn/wa
      p = sweep(wna,2,center,'*')
      #p = center %*% wna
    }
    else {
      wna= matrix(wn/wa,nrow=1)
      p = center %*% wna
    }    
    return(p)
  }
  if (is.null(w2)){
    self = TRUE
    w2=w1
  }
  else {
    self = FALSE
  }
  centers = lapply(names(groups),function(x) w[x,])
  Wf = colSums(w1)
  Wn = colSums(w2)
  Wa = sum(Wn)
  if (self){      
    Wn = sweep(-w1,2,Wn,'+')
    Wna = Wn/(Wa - w1)
    Denom.bivar = t(w1) %*% Wna
  }
  else {
    Wna= t(Wn/Wa)
    Denom.bivar = Wf %*% Wna    
  }
  Num.bivar = matrix(0,nrow=ncol(w1),ncol=ncol(w2))
  ng = length(groups)
  for (i in 1:ng) {
    Num.bivar = Num.bivar + nunbivar(groups[[i]],centers[[i]])
  }
  Mbivar=Num.bivar/Denom.bivar
  Mbivar[Denom.bivar==0]=0
  return(Mbivar)
 }
 #' Computes a weigth matrix from a \code{\linkS4class{metabarcoding.data}}
 #' 
 #' The weight can be considered as a propability of presence of a MOTU in a 
 #' given sample. This function defines this probability as the fraction of
 #' the maximal occurrence frequency over all samples. 
 #' 
 #' @param data a \code{\linkS4class{metabarcoding.data}} instance
 #' 
 #' @return a weight matrix usable for M statistics
 #' 
 #' @examples
 #' data(termes)
 #' termes.ok = termes[,colSums(termes$reads)>0]
 #' w = m.weight(termes.ok)
 #' 
 #' @export
 m.weight = function(data) {
  ndata = normalize(data,MARGIN='sample')
  fmax=apply(ndata$reads,2,max)
  w = sweep(ndata$reads,2,fmax,'/')
  rownames(w)=rownames(ndata)
  colnames(w)=colnames(ndata)
  return(w)
 }
 #' Simulate null distribion of the M statistics by Monte-Carlo
 #' 
 #' Computes the null empirical distribution of the M statistics
 #' by shuffling MOTUs among location.
 #' 
 #' @param w the weigth matrix indicating the presence probability of each motu
 #'          in each samples. Each line corresponds to a sample and each column
 #'          to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
 #'          names.
 #' @param groups the list of considered groups as computed by the \code{\link{dist.center.group}}
 #'               function
 #' @param resampling the number of simulation to establish the null distribution
 #' 
 #' @return a matrix of M score under the null hypothesis of random distribution of MOTUs
 #'         with a MOTUs per line and a culumn per simulation
 #'
 #' @examples       
 #' data(termes)
 #' termes.ok = termes[,colSums(termes$reads)>0]
 #' pos = expand.grid(1:3 * 10,1:7 * 10)
 #' labels = rownames(termes.ok)
 #' d = dist.grid(pos[,1],pos[2],labels)
 #' groups = dist.center.group(d,20)
 #' w = m.weight(termes.ok)
 #' dnull = dm.univariate(w,groups)
 #' 
 #' @export              
 dm.univariate = function(w,groups,resampling=100) {
  shuffle = function(w){
    wr =apply(w,2,function(y) sample(y,length(y),replace=FALSE))
    rownames(wr)=rownames(w)
    return(wr)
  }
  msim = function(x) {
    return(m.univariate(shuffle(w),groups))
  }
  dnull = mapply(msim,1:resampling)
  rownames(dnull) = colnames(w)
  return(dnull)
 }
 #' Test the significance of the M statistics by Monte-Carlo
 #' 
 #' Computes computes the p.value the M statistics asociated to a MOTU
 #' by shuffling MOTUs among location.
 #' 
 #' @param w the weigth matrix indicating the presence probability of each motu
 #'          in each samples. Each line corresponds to a sample and each column
 #'          to a MOTU. \code{rownames} of the \code{w} matrix must be the sample
 #'          names.
 #' @param groups the list of considered groups as computed by the \code{\link{dist.center.group}}
 #'               function
 #' @param resampling the number of simulation to establish the null distribution
 #' 
 #' @param alternative a character value in \code{c('two.sided','less','greater')}
 #'               - two.sided : the m stat is check against the two side of the empirical
 #'                             M distribution
 #'               - less : test if the M stat is lesser than the M observed in the the empirical
 #'                             M distribution (exlusion hypothesis)
 #'               - greater : test if the M stat is greater than the M observed in the the empirical
 #'                             M distribution (aggregation hypothesis)
 #' 
 #' @return a vector of p.value with an attribute \code{m.stat} containing the actual M stat
 #'         for each MOTUs
 #'
 #' @examples       
 #' data(termes)
 #' termes.ok = termes[,colSums(termes$reads)>0]
 #' pos = expand.grid(1:3 * 10,1:7 * 10)
 #' labels = rownames(termes.ok)
 #' d = dist.grid(pos[,1],pos[2],labels)
 #' groups = dist.center.group(d,20)
 #' w = m.weight(termes.ok)
 #' pval = m.univariate.test(w,groups)
 #' 
 #' @export              
 m.univariate.test = function(w,groups,resampling=100,alternative='two.sided') {
  dnull = dm.univariate(w,groups,resampling)
  m = m.univariate(w,groups)
  pnull = sapply(1:dim(dnull)[1],function(y) 1 - ecdf(dnull[y,])(m[y]))
  p.value=NULL
  if (alternative=='two.sided') {
    p.value = mapply(min,pnull,1 - pnull)
  }
  if (alternative=='less') {
    p.value = pnull
  }
  if (alternative=='greater') {
    p.value = 1 - pnull
  }
  # Set p.value to 1 if the MOTU occurres in only one place
  n = colSums(w > 0)
  p.value[n==1]=1
  names(p.value) = colnames(w)
  attr(p.value,'m.stat')=m
  return(p.value)
 }
--- a/ROBITools/R/obiclean.R
+++ b/ROBITools/R/obiclean.R
@@ -0,0 +1,118 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 # TODO: Add comment
 # 
 # Author: coissac
 ###############################################################################
 #' @export
 setGeneric("extracts.obiclean", function(obj) {
 			return(standardGeneric("extracts.obiclean"))
 		})
 #' Extracts the obiclean results
 #' 
 #' The method \code{extracts.obiclean} of the class \code{\linkS4class{metabarcoding.data}}
 #' extracts \code{obiclean} results from the MOTUs descriptions include in the 
 #' \code{\linkS4class{metabarcoding.data}} instance. 
 #' When an \code{obitab} file is imported using the \code{\link{import.metabarcoding.data}}
 #' if \code{obiclean} results are present in the file they are stored in the 
 #' \code{motu} data.frame. By calling this methods, MOTU descriptors describing 
 #' the \code{obiclean} status are moved to a set of layers.
 #' 
 #' @param obj the \code{\linkS4class{metabarcoding.data}} to analyze
 #' 
 #' @return the modified \code{\linkS4class{metabarcoding.data}} instance
 #' 
 #' @examples
 #' 
 #' # load termite data set from the ROBITools sample data
 #' data(termes)
 #' 
 #' # shows the initial list of layer names
 #' layer.names(t)
 #' 
 #' # extracts the obiclean status
 #' termes = extracts.obiclean(termes)
 #' 
 #' # shows the name of the newly created layers
 #' layer.names(t)
 #' 
 #' 
 #' 
 #' @seealso \code{\linkS4class{metabarcoding.data}}, \code{\link{threshold.mask}}, \code{\link{normalize}}
 #'
 #' @docType methods
 #' @rdname extracts-obiclean-methods
 #' @aliases extracts.obiclean-methods,metabarcoding.data
 #' @author Eric Coissac
 #' 
 setMethod("extracts.obiclean", "metabarcoding.data", function(obj) {
 			pat = "^obiclean_status:.*$"
 			cols = colnames(obj@motus)
 			cleancols = grep(pat,cols)
 			clean.names=cols[cleancols]
      p = grep(pat,cols)
      d = t(as.factor.or.matrix(obj@motus[,p]))
      n = sapply(strsplit(cols[p],':'),function(y) y[[2]])
      rownames(d)=n
      d = d[rownames(obj@reads),]
      obj[["obiclean_status"]]=d
      newmotus = obj@motus[-cleancols]
 			pat = "^obiclean_count:.*$"
 			cols = colnames(newmotus)
 			cleancols = grep(pat,cols)
 			clean.names=cols[cleancols]
 			p = grep(pat,cols)
 			d = t(as.factor.or.matrix(newmotus[,p]))
 			n = sapply(strsplit(cols[p],':'),function(y) y[[2]])
 			rownames(d)=n
 			d = d[rownames(obj@reads),]
 			obj[["obiclean_count"]]=d
 			newmotus = newmotus[-cleancols]
 			pat = "^obiclean_cluster:.*$"
 			cols = colnames(newmotus)
 			cleancols = grep(pat,cols)
 			clean.names=cols[cleancols]
 			p = grep(pat,cols)
 			d = t(as.factor.or.matrix(newmotus[,p]))
 			n = sapply(strsplit(cols[p],':'),function(y) y[[2]])
 			rownames(d)=n
 			d = d[rownames(obj@reads),]
 			obj[["obiclean_cluster"]]=d
 			newmotus = newmotus[-cleancols]
 			newdata = copy.metabarcoding.data(obj,motus=newmotus)
 			return(newdata)
 		})
 #' @export
 setGeneric("extracts.obiclean_cluster", function(obj) {
  return(standardGeneric("extracts.obiclean_cluster"))
 })
 setMethod("extracts.obiclean_cluster", "metabarcoding.data", function(obj) {
    obiclean = extracts.obiclean(obj)
    obihead  = obiclean[,! is.na(obiclean$motus$obiclean_head)]
    obihead$obiclean_count[is.na(obihead$obiclean_count)]=0
    reads = obihead$obiclean_count
    l = obihead@layers[layer.names(obihead) != "obiclean_count"]
    newdata = copy.metabarcoding.data(obihead,reads=reads,layers=l)
    return(newdata)
 }
 )
--- a/ROBITools/R/pcrslayer.R
+++ b/ROBITools/R/pcrslayer.R
--- a/ROBITools/R/plot.PCRplate.R
+++ b/ROBITools/R/plot.PCRplate.R
@@ -0,0 +1,84 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 #' Plot PCR plates
 #' 
 #' Plots samples localization in PCR plates, and points out problematic samples if provided.
 #' 
 #' @param x a \code{\link{metabarcoding.data}} object
 #' @param samples a character vector containing names of problematic samples. Default is \code{NULL}
 #' @param different a boolean indicating whether different tags where used in forward and reverse to identify samples. Default is \code{TRUE}
 #' @param ... arguments ot be passed to methods, such as graphical parameters
 #' 
 #' @return \code{\link{plot.PCRplate}} returns a plot displaying no more than 4 PCR plates, with problematic sample localization
 #' 
 #' @examples
 #' \dontshow{# switch the working directory to the data package directory}
 #' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
 #' 
 #' data(termes)
 #' 
 #' # reading the termes_ngsfilt.txt file
 #' termes.ngs=import.ngsfilter.data('termes_ngsfilt.txt', platewell="position")
 #' 
 #' # including ngsfilter data into termes data
 #' attr(termes, "samples") = termes.ngs[rownames(termes),]
 #' 
 #' #plot PCR plate plan
 #' col = rep("green", nrow(termes))
 #' col[grep("r", rownames(termes))] = "red"
 #' plot.PCRplate(termes, col=col)
 #' 
 #' #highlighting location of samples with low identification score
 #' 
 #' #low quality taxonomic assignements identification
 #' library(plotrix)
 #' weighted.hist(termes$motus$best_identity, colSums(termes$reads), breaks = 20, ylab = "Nb reads", xlab = "Ecotag scores", xaxis=F)
 #' axis(1, labels = T)
 #' lowqual.seq = rownames(termes$motus)[termes$motus$best_identity < 0.7]
 #' 
 #' #identification and localization (in PCR plate) of samples with high proportions of low quality taxonomic assignements
 #' termes.freq= normalize(termes, MARGIN=1)$reads
 #' hist(log10(rowSums(termes.freq[,lowqual.seq]) + 1e-05), breaks = 20, xlab = "Prop low quality reads")
 #' lowqual.sample = rownames(termes)[log10(rowSums(termes.freq[, lowqual.seq]) + 1e-05) > -0.5]
 #' 
 #' plot.PCRplate(termes, lowqual.sample, col=col)
 #'   
 #' @seealso \code{\link{import.metabarcoding.data}}
 #'
 #' @author Lucie Zinger
 #' @keywords DNA metabarcoding
 #' @export
 #' 
 plot.PCRplate = function(x, samples=NULL, col="cyan2", different=T, ...) {
  if(length(grep("xPlate", colnames(x$samples)))==0 | 
       length(grep("yPlate", colnames(x$samples)))==0) {
    stop("samples/controls position in PCR plates (xPlate and yPlate) are not defined")
  }
  if(length(grep("tagF", colnames(x$samples)))==0 | 
       length(grep("tagR", colnames(x$samples)))==0) {
    stop("tags (tagF and tagR) are not defined")
  }
  nplate = max(x$samples$nbPlate)
  if(nplate>4) {
    stop("Cannot plot more than 4 plates")
  }
    plot(x$samples$xPlate, -x$samples$yPlate, pch=19, xaxt="n", yaxt="n", col=col,
         xlim=c(-5,17), ylab="y plate", xlab= "x plate", ylim=c(-4.5*8-5,0), ...)
    if(different==T) {
      text(-3, -unique(x$samples$yPlate[order(x$samples$yPlate)]), unique(x$samples$tagF[order(x$samples$yPlate)]), cex=0.5)
      text(unique(x$samples$xPlate[order(x$samples$xPlate)]), -5, unique(x$samples$tagR[order(x$samples$xPlate)]), cex=0.5, srt=90)
      }
      abline(h=-seq(8.5,8*nplate+0.5,8), lty=2, col="grey")
      segments(c(0,13), rep(min(-x$samples$yPlate),2), c(0,13), c(0,0), lty=2, col="grey")
    #plot problematic samples
    if(!is.null(samples)) {
      points(x$samples[samples,"xPlate"], -x$samples[samples,"yPlate"], pch="x")
    }  
 }
--- a/ROBITools/R/plot.seqinsample.R
+++ b/ROBITools/R/plot.seqinsample.R
@@ -0,0 +1,105 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 #' Plot sequence abundance in samples
 #' 
 #' Plots relative abundances of a set of sequences in all samples (log10 transformed)
 #' 
 #' 
 #' @param x          a \code{\link{metabarcoding.data}} object
 #' @param seqset     a vetcor with sequences names
 #' @param seqtype    a string indicating what type of sequences are displayed
 #' @param controls   a vector indicating the negative controls names in the x object.
 #'                   Default is \code{NULL}
 #' 
 #' @return returns a plot with the log10 transformed relative porportion of 
 #'         selected MOTUs in each samples. If the number of samples is > 96,
 #'         then the plot is displayed in 4 panels
 #' 
 #' @examples
 #' 
 #' data(termes)
 #' 
 #' seqset = rownames(termes$motus)[which(termes$motus$genus_name=="Anoplotermes")]
 #' plot.seqinsample(termes, seqset, "Anoplotermes")
 #' 
 #' controls = rownames(termes)[grep("r", rownames(termes))]
 #' seqset = rownames(termes$motus)[which(termes$motus$best_identity<0.7)]
 #' plot.seqinsample(termes, seqset, "Not assigned", controls)
 #' 
 #' @seealso \code{\linkS4class{taxonomy.obitools}}, and method \code{\link{taxonmicank}}
 #'
 #' @author Lucie Zinger
 #' @keywords metabarcoding
 #' 
 #' @export
 #' 
 plot.seqinsample = function(x, seqset, seqtype, controls=NULL){
  require(vegan)
  x.freq = vegan::decostand(x$reads,"total",1)
  if(!is.null(controls)){
    controls.ind = match(controls, rownames(x.freq))
  }
  if(nrow(x.freq)>96){
    x.freq.parse = seq(0,round(nrow(x$samples), digit=0),
                       round(nrow(x$samples)/4, digit=0))
    layout(matrix(c(1,2,3,1,4,5),3,2), height=c(0.3,1,1))
    par(oma=c(1,1,1,0), mar=c(3,3,1,1))
    #legend
    breaks = seq(log10(1e-4),log10(1), length.out=100)
    plot(breaks, rep(1,100), col=topo.colors(100), pch=15, cex=2, ylim=c(0,1.5),
         xaxt="n", yaxt="n", bty='n')
    text(breaks[seq(1,100,10)], rep(0.7,length(seq(1,100,10))), 
         round(10^breaks[seq(1,100,10)],4))
    mtext("Seqence frequencies:", side=3, line=0, cex=0.8)
    #plot
    for(i in 1:(length(x.freq.parse)-1)) {
      range = (x.freq.parse[i]+1):(x.freq.parse[i]+round(nrow(x$samples)/4, digit=0))
      mat = x.freq[range,seqset]
      image(log10(mat),col = topo.colors(100), xaxt="n", yaxt="n", breaks=c(breaks,0))
      if(!is.null(controls)){
        if(length(na.omit(match(controls.ind, range)))!=0){
          abline(v=seq(0,1,l=round(nrow(x$samples)/4, digit=0))[match(controls.ind, range)],col="red", lty=3)
        }}
      axis(side=1,at=seq(0,1,l=round(nrow(x$samples)/4,digit=0)),
           labels=rownames(x$samples)[range],
           las=2, cex.axis=0.3)
    }
    mtext(side=2, paste(seqtype, "n = ", length(seqset)), outer=T, cex=0.7, font=3)
    mtext(side=1, "Samples", cex=0.7, outer=T)
  } else {
    layout(matrix(c(1,2,1,2),2,2), height=c(0.3,1))
    par(oma=c(1,1,1,0), mar=c(3,3,1,1))
    #legend
    breaks = seq(log10(1e-4),log10(1), length.out=100)
    plot(breaks, rep(1,100), col=topo.colors(100), pch=15, cex=2, ylim=c(0,1.5),
         xaxt="n", yaxt="n", bty='n')
    text(breaks[seq(1,100,10)], rep(0.7,length(seq(1,100,10))), 
         round(10^breaks[seq(1,100,10)],4))
    mtext("Seqence frequencies:", side=3, line=0, cex=0.8)
    image(log10(x.freq[,seqset]),col = topo.colors(100), xaxt="n", yaxt="n", breaks=c(breaks,0))
    if(!is.null(controls)){
      abline(v=seq(0,1,l=round(nrow(x$samples), digit=0))[controls.ind],col="red", lty=3)
      }
    axis(side=1,at=seq(0,1,l=round(nrow(x$samples),digit=0)),
         labels=rownames(x$samples),
         las=2, cex.axis=0.3)
  mtext(side=2, paste(seqtype, "n = ", length(seqset)), outer=T, cex=0.7, font=3)
  mtext(side=1, "Samples", cex=0.7, outer=T)
  }
 }
--- a/ROBITools/R/rarefy.R
+++ b/ROBITools/R/rarefy.R
@@ -0,0 +1,99 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 # TODO: Add comment
 # 
 # Author: coissac
 ###############################################################################
 #' @export
 setGeneric("rarefy", function(x,n,first.pass=0.95,pseudo.count=0,...) {
 			return(standardGeneric("rarefy"))
 		})
 setMethod("rarefy", "ANY", function(x,n,first.pass=0.95,pseudo.count=0,sum=NA) {
 	if (is.na(sum))
 		sum=sum(x)
 	if (sum < sum(x))
 		stop("sum parameter must be greater or equal to sum(x)")
  grey = sum-sum(x)
 	probs = x + pseudo.count
  if (grey > 0)
 	  probs = c(probs,grey)
 	# Just to ensure at least one execution of the loop
 	n1 = n * 2
 	while(n1 > n)
 		n1 = rpois(1,n * first.pass)
 	rep1 = as.vector(rmultinom(1,n1,probs))
 	n2  = sum(rep1)
 	levels = 1:length(probs)
 	rep2= as.vector(table(factor(sample(levels,
 						   	                      n - n2,
 							                        replace=TRUE, 
 							                        prob = probs),
 					                            levels=levels)))
 	rep1 = (rep1 + rep2)
  if (grey > 0)
    rep1 = rep1[-length(rep1)]
 	return(rep1)
 })
 setMethod("rarefy", "metabarcoding.data", function(x,n,first.pass=0.95,pseudo.count=0,MARGIN='sample') {
 	if (MARGIN == 'sample')
 		MARGIN=1
 	if (MARGIN == 'motu')
 		MARGIN=2
  dreads= dim(x@reads)
  rreads= matrix(0,nrow = dreads[1] , ncol = dreads[2])
 	if (MARGIN == 1)
      for (i in 1:dreads[1]) {
        rreads[i,]=rarefy(x@reads[i,],
                          n=n,
                          first.pass=first.pass,
                          pseudo.count=pseudo.count)
      }
 # 		rreads = t(apply(reads,1,rarefy,n=n,
 # 					 	 first.pass=first.pass,
 # 						 pseudo.count=pseudo.count))
 	else
 	  for (i in 1:dreads[2]) {
 	    rreads[,i]=rarefy(x@reads[,i],
 	                      n=n,
 	                      first.pass=first.pass,
 	                      pseudo.count=pseudo.count)
 	  }
 # rreads =   as.matrix(apply(reads,2,rarefy,n=n,
 # 								   first.pass=first.pass,
 # 								   pseudo.count=pseudo.count))
  rreads=as.matrix(rreads)
 	rownames(rreads) = rownames(x@reads)
 	colnames(rreads) = colnames(x@reads)
 	newdata = copy.metabarcoding.data(x,reads=rreads)
 	return(newdata)
 })
--- a/ROBITools/R/read.ngsfilter.R
+++ b/ROBITools/R/read.ngsfilter.R
@@ -0,0 +1,56 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 #' Read an OBITools ngsfilter file
 #' 
 #' Reads a ngsfilter file as formatted for the OBITools. For now, needs to be tab delimited till the "F" column. 
 #' Any additionnal information needs to be space delimited.
 #' 
 #' @seealso \code{\link{import.metabarcoding.data}}
 #' @author Lucie Zinger
 #' @keywords data import
 #' @export
 #' 
 read.ngsfilter <- function(filename, decimal='.', as.is=!stringsAsFactors, stringsAsFactors = default.stringsAsFactors()) {
 	t<-read.table(file=filename, header=F, sep="\t", as.is=T)
 	beg <- t[,1:5]
 	colnames(beg) <- c('experiment','sample','tags','forward_primer','reverse_primer')
 	if (length(unique(beg$sample))==nrow(beg)) 
 		rownames(beg) <- beg$sample
 	end <- t[,c(2,6)]
 	#F <- unlist(lapply(end$V6, function(x) strsplit(x,"@")[[1]][1]))
 	rawextras <- unlist(lapply(end$V6, function(x) strsplit(x,"@")[[1]][2]))
 	rawextras <- lapply(rawextras, function(s) strsplit(s, '; ')[[1]])
 	rawextras <- lapply(rawextras, function(l) unlist(lapply(l, function(s) sub("^ +","",s))))
 	rawextras <- lapply(rawextras, function(l) unlist(lapply(l, function(s) sub(" +$","",s))))
 	rawextras <- lapply(rawextras, function(l) unlist(lapply(l, function(s) strsplit(s,"="))))
 	columnnames <- unique(unlist(lapply(rawextras, function(l) l[seq(1,length(l),2)])))
 	m <- matrix(nrow=nrow(end), ncol=length(columnnames))
 	colnames(m) <- columnnames
 	m <- as.data.frame(m)    
 	#print(head(rawextras))
 	tt <- lapply(rawextras, function(l) list(l[seq(1,length(l),2)],l[seq(2,length(l),2)]))
 	invisible(lapply(1:length(tt), function(i){m[i,tt[[i]][[1]]] <<- tt[[i]][[2]]}))
 	invisible(lapply(colnames(m), function(n) m[,n] <<- type.convert(m[,n], dec=decimal, as.is=as.is)))
 	ngs = cbind(beg, m)
 	rownames(ngs) = ngs$sample
 	class(ngs)<-c('ngsfilter.data',class(ngs))
 	return(ngs)
 }
--- a/ROBITools/R/read.obitab.R
+++ b/ROBITools/R/read.obitab.R
@@ -0,0 +1,39 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 #' Reads a data file produced by the obitab command
 #'
 #' Read a data file issued from the convertion of a fasta 
 #' file to a tabular file by the obitab command
 #' 
 #' @param file a string containing the file name of the obitab file.
 #' @param sep  Column separator in the obitab file. 
 #'             The default separator is the tabulation.
 #'
 #' @return a \code{data.frame} instance containing the obitab file
 #'
 #' @examples
 #' require(ROBITools)
 #' 
 #' \dontshow{# switch the working directory to the data package directory}
 #' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
 #' 
 #' # read the termes.tab file
 #' termes=read.obitab('termes.tab')
 #' 
 #' # print the dimensions of the data.frame
 #' dim(termes)
 #'   
 #' @seealso \code{\link{import.metabarcoding.data}}
 #' @author Eric Coissac
 #' @export
 #'
 read.obitab <-
 function(filename,sep='\t') {
   data=read.delim(filename,sep=sep,strip.white=T,check.names =F)
   data
 }
--- a/ROBITools/R/read.sumatra.R
+++ b/ROBITools/R/read.sumatra.R
@@ -0,0 +1,17 @@
 #' @include 02_class_metabarcoding.data.R
 NULL
 # TODO: Add comment
 # 
 # Author: coissac
 ###############################################################################
 read.sumatra = function(filename) {
 	data = read.table(filename,sep="\t",header=FALSE)
 	score = data[,3]
 	name.first = mapply(min,as.character(s[,1]),as.character(s[,2]))
 	name.second= mapply(max,as.character(s[,1]),as.character(s[,2]))
 	sname = as.character(interaction(data[,1],data[,2]))
 }
--- a/ROBITools/R/s3objects.R
+++ b/ROBITools/R/s3objects.R
@@ -0,0 +1,123 @@
 # TODO: Add comment
 # 
 # Author: coissac
 ###############################################################################
 #' Adds a class into the class hierarchie attribute.
 #' 
 #' \code{addS3Class} adds a new class name to the vector
 #' of class associated to the object. This the way to
 #' assign an object to an S3 class. \code{addS3Class} add
 #' the new class name in front of the class vector
 #' 
 #' @param object the object to modify
 #' @param classname the name of the new class
 #' 
 #' @return the object given as parametter casted to the new
 #'         class
 #' 
 #' @examples
 #' x = c(1,3,2,5)
 #' x = addS3Class(x,"my.vector")
 #' class(x)
 #' 
 #' @seealso \code{\link{rmS3Class}}
 #' 
 #' @note for efficiency purpose no check is done on the input
 #'       parametters
 #'       
 #' @keywords system function
 #' 
 #' @author Eric Coissac
 #' @export
 #' 
 addS3Class = function(object,classname) {
 	class(object) = c(classname,class(object))
 	return(object)
 }
 #' Removes a class from the class hierarchie attribute.
 #' 
 #' \code{rmS3Class} removes a class name from the vector
 #' of class associated to the object. This the way to
 #' remove the association between an object and a S3 class. 
 #' 
 #' @param object the object to modify
 #' @param classname the name of the class to remove
 #' 
 #' @return the object given as parametter.
 #' 
 #' @examples
 #' x = c(1,3,2,5)
 #' x = addS3Class(x,"my.vector")
 #' class(x)
 #' x = rmS3Class(x,"my.vector")
 #' class(x)
 #' 
 #' @seealso \code{\link{addS3Class}}
 #' 
 #' @note for efficiency purpose no check is done on the input
 #'       parametters
 #'       
 #' @keywords system function
 #' 
 #' @author Eric Coissac
 #' @export
 #' 
 rmS3Class = function(object,classname) {
 	c = class(object)
 	if (! is.null(c))
 		index = match(classname,c)
 		class(object)=c[-index]
 	return(object)
 }
 #' create basic functions to manipulate a new S3 class
 #' 
 #' createS3Class function create in the \code{package:ROBITools}
 #' environment an \code{is.xxx} function and an \code{as.xxx} function
 #' allowing to test if an abject belong the class \code{xxx} and to add
 #' the class \code{xxx} to the class list of an object. \code{xxx} is a 
 #' generic class name that is specified through the \code{classname}
 #' argument of the function.
 #' 
 #' @param classname a \code{character string} indicating the name
 #'         of the new class.
 #'         
 #' @examples
 #' 
 #' # Create a new S3 class named mynewclass
 #' createS3Class('mynewclass')
 #' 
 #' #create a new vector object
 #' x=c(1,4,6)
 #' 
 #' # test if it belongs the new class, that is false
 #' is.mynewclass(x)
 #' 
 #' # Associate x to the new class
 #' as.mynewclass(x)
 #' 
 #' # test again if x belongs the new class, that is now true
 #' is.mynewclass(x)
 #' 
 #' @seealso \code{\link{rmS3Class}}
 #' 
 #' @note Take care that the new functions are created in the 
 #' \code{package:ROBITools} environment.
 #' 
 #' @keywords system function
 #' 
 #' @author Eric Coissac
 #' @export
 #' 
 createS3Class = function(classname) {
  is.class = function(object) any(class(object)==classname)
  as.class = function(object) return(addS3Class(object,classname))
  assign(paste('is',classname,sep="."),is.class,envir=globalenv())
  assign(paste('as',classname,sep="."),as.class,envir=globalenv())
 }
--- a/ROBITools/R/taxoDBtree.R
+++ b/ROBITools/R/taxoDBtree.R
@@ -0,0 +1,89 @@
 #'@include 02_class_metabarcoding.data.R
 #'@import ROBITaxonomy 
 NULL
 #' Construct a taxonomic tree from a list of taxa
 #' 
 #' Construct a graph from a table containing the taxonomic path of sequences
 #' 
 #' 
 #' @param x a table containing the taxonomic path of the references. Typically an output from get.classic.taxonomy
 #' 
 #' @return g a directed graph displaying the taxonomy hierarchy of the input data. Stored in a \code{\link{igraph}} object 
 #'         where the taxonomic ranks of the vertices are added to the vertices attributes
 #'
 #' @examples
 #' 
 #' data(termes)
 #' 
 #' taxo=default.taxonomy()
 #' 
 #' termes.taxo.table = get.classic.taxonomy(termes, taxo, "taxid")
 #' head(termes.taxo.table)
 #' 
 #' graph.tax.termes = dbtree(termes.taxo.table[,1:7])
 #' library(igraph)
 #' 
 #' #plot the tree
 #' coord = layout.reingold.tilford(graph.tax.termes, root=1, circular=F)
 #' v.cex = as.factor(V(graph.tax.termes)$rank)
 #' levels(v.cex) = match(levels(v.cex), colnames(termes.taxo.table))
 #' plot(graph.tax.termes, vertex.size=1, vertex.label.cex=2*(as.numeric(as.vector(v.cex))^-1), edge.arrow.size=0, layout=coord)
 #' 
 #' 
 #' #Vizualization with sequence counts
 #' tax.count = log10(colSums(termes$reads)[match(as.vector(V(graph.tax.termes)$name), termes$motus$scientific_name)])
 #' tax.count[is.na(tax.count)|tax.count<0] = 0.01
 #' V(graph.tax.termes)$count = unname(tax.count)
 #' 
 #' plot(graph.tax.termes, vertex.size=V(graph.tax.termes)$count, vertex.label.cex=2*(as.numeric(as.vector(v.cex))^-1), edge.arrow.size=0, layout=coord)
 #' 
 #'   
 #' @seealso \code{\link{get.classic.taxonomy}}
 #' @author Lucie Zinger
 #' @export
 dbtree = function(x) {
  #dealing with noranks
  x2 = x
  for (i in 1:ncol(x2)) {
    x2[,i] = as.character(x[,i])
    if(length(which(is.na(x[,i])==T))!=0) {
      if(i==1) {
        x2[which(is.na(x[,i])==T),i] = "NR"
      } else {
        x2[which(is.na(x[,i])==T),i] = as.character(x2[,i-1])[which(is.na(x2[,i])==T)]
      }
    }
  }
  #prepare an edgelist
  edgelist = list()
  for (i in 1:(ncol(x2)-1)){
    out = x2[,c(i,i+1)]
    out2 = out[-which(duplicated(out)==T),]
    colnames(out2) = c("parent", "kid")
    edgelist[[i]] = out2[which(out2[,1]!=out2[,2]),]
  } 
  edgelist = do.call("rbind", edgelist)
  #construct the graph
  g = igraph::graph.edgelist(as.matrix(edgelist), directed=T)
  #get taxorank for each taxa
  ranks = do.call("rbind", lapply(1:ncol(x), function(y) {
    out = cbind(unique(as.character(x[,y])), colnames(x)[y])
    out
  }))
  #Assign nodes to taxorank
  igraph::V(g)$rank = ranks[match(igraph::V(g)$name, ranks[,1]),2]
  return(g)
 }
--- a/ROBITools/R/taxonomic.resolution.R
+++ b/ROBITools/R/taxonomic.resolution.R
@@ -0,0 +1,74 @@
 #' @import ROBITaxonomy
 #' @include 02_class_metabarcoding.data.R
 NULL
 #' Dataset taxonomic resolution summary.
 #' 
 #' Summarizes the taxonomic relution of reads and MOTUs over the entire dataset
 #' 
 #' 
 #' @param x          a \code{\link{metabarcoding.data}} object
 #' @param colranks   a string indicating column name where ranks are stored in \code{x}
 #' @param colscores  a string indicating column name where taxonomic identification scores are stored in \code{x}
 #' @param thresh     a threshold for defining at which taxonomic identification scores a sequence can be considered as "not assigned". 
 #'                   Default is \code{0.7}
 #' 
 #' @return returns a data.frame and piecharts of the number/proportion of MOTUs/reads assigned to each taxonomic levels
 #' 
 #' @examples
 #' 
 #' data(termes)
 #' taxo=default.taxonomy()
 #' 
 #' termes.taxo.table = get.classic.taxonomy(termes, taxo, "taxid")
 #' attr(termes, "motus") = data.frame(termes$motus, termes.taxo.table)
 #' attr(termes, "motus")["count"] = colSums(termes$reads)
 #' 
 #' summary.taxores(termes, "taxonomic_rank_ok","best_identity")
 #' 
 #' @seealso \code{\linkS4class{taxonomy.obitools}}, and method \code{\link{taxonmicank}}
 #'
 #' @author Lucie Zinger
 #' @keywords taxonomy
 #' 
 #' @export
 #' 
 summary.taxores = function(x,colranks,colscores, thresh=0.7){
  #vector encompassing all ranked possible taxonomic levels
  taxorank = c("superkingdom", "kingdom", "subkingdom", "superphylum", "phylum", "subphylum", "superclass", "class", "subclass", "infraclass",
               "superorder", "order", "suborder", "infraorder", "parvorder", "superfamily", "family", "subfamily", "supertribe", "tribe", 
               "subtribe", "supergenus", "genus", "subgenus", "species group", "species subgroup", "superspecies", "species", "subspecies",
               "varietas", "forma", "no rank", "not assigned")
  #settings if thresh
  ranks = as.vector(x$motus[,colranks])
  ranks[x$motus[,colscores]<thresh] =  "not assigned"
  #nb of otus
  tmp = table(ranks)
  taxores.otu = tmp[match(taxorank, names(tmp))]
  names(taxores.otu) = taxorank
  taxores.otu[is.na(taxores.otu)] = 0
  #nb of reads
  tmp = aggregate(x$motus$count, by=list(ranks), sum)
  taxores.reads = tmp[match(taxorank,tmp[,1]),2]
  names(taxores.reads) = taxorank
  taxores.reads[is.na(taxores.reads)] = 0
  #plot
  layout(matrix(c(1,2,1,3),2,2),heights=c(0.3,1))
  col.tmp = c(rainbow(length(taxorank)-2,start=0, end=0.5, alpha=0.6), "lightgrey", "darkgrey")
  par(mar=c(1,0,0,0), oma=c(0,0,2,0))
  frame()
  legend("top", taxorank, ncol=6, cex=0.8, fill=col.tmp)
  pie(taxores.otu, col=col.tmp, border="lightgrey", labels="", clockwise=T)
  mtext("OTUs", side=1, cex=1)
  pie(taxores.reads, col=col.tmp, border="lightgrey", labels="", clockwise=T)
  mtext("Reads", side=1, cex=1)
  #result
  out = data.frame(otu=taxores.otu, reads=taxores.reads)
  out
 }
--- a/ROBITools/R/taxonomy_classic_table.R
+++ b/ROBITools/R/taxonomy_classic_table.R
@@ -0,0 +1,53 @@
 #' @import ROBITaxonomy
 #' @include 02_class_metabarcoding.data.R
 NULL
 #' Get classical taxonomy format
 #' 
 #' Creates a table with the classical taxonomic description (from phylum to species)
 #' 
 #' @param x a \code{\link{metabarcoding.data}} object
 #' @param taxonomy a instance of \code{\linkS4class{taxonomy.obitools}}
 #' @param coltaxid a the name of the column containing taxids to be used for creating classical taxonomic description
 #' 
 #' @return returns a data.frame with the classical taxonomic description ("kingdom", "phylum", "class", "order", "family", "genus", "species"), as well as
 #'         sequence taxonomic assignment rank and scientific name for each sequences stored in the \code{\link{metabarcoding.data}} object
 #' 
 #' @examples
 #' 
 #' data(termes)
 #' 
 #' taxo=default.taxonomy()
 #' 
 #' termes.taxo.table = get.classic.taxonomy(termes, taxo, "taxid")
 #' head(termes.taxo.table)
 #' 
 #' attr(termes, "motus") = data.frame(termes$motus, termes.taxo.table)
 #' 
 #'   
 #' @seealso \code{\linkS4class{taxonomy.obitools}}, and methods \code{\link{species}},\code{\link{genus}}, \code{\link{family}},\code{\link{kingdom}},
 #'          \code{\link{superkingdom}},\code{\link{taxonatrank}}, \code{\link{taxonmicank}}
 #'
 #' @author Lucie Zinger
 #' @keywords taxonomy
 #' @export
 #' 
 get.classic.taxonomy = function(x, taxonomy, coltaxid) {
  classic.taxo = c("kingdom", "phylum", "class", "order", "family", "genus", "species")
  taxids = x$motus[,coltaxid]
  out = as.data.frame(do.call("cbind", lapply(classic.taxo, function(y) {
    scientificname(taxonomy, taxonatrank(taxonomy,taxids,y))
  })))
  colnames(out) = paste(classic.taxo, "_name_ok", sep="")
  rownames(out) = colnames(x)
  out$scientific_name_ok = scientificname(taxonomy, taxids)
  out$taxonomic_rank_ok = taxonomicrank(taxonomy, taxids)
  return(out)
 }
--- a/ROBITools/README-SLRE.md
+++ b/ROBITools/README-SLRE.md
@@ -0,0 +1,128 @@
 SLRE: Super Light Regular Expression library
 ============================================
 SLRE is an ISO C library that implements a subset of Perl regular
 expression syntax. Main features of SLRE are:
   * Written in strict ANSI C'89
   * Small size (compiled x86 code is about 5kB)
   * Uses little stack and does no dynamic memory allocation
   * Provides simple intuitive API
   * Implements most useful subset of Perl regex syntax (see below)
   * Easily extensible. E.g. if one wants to introduce a new
 metacharacter `\i`, meaning "IPv4 address", it is easy to do so with SLRE.
 SLRE is perfect for tasks like parsing network requests, configuration
 files, user input, etc, when libraries like [PCRE](http://pcre.org) are too
 heavyweight for the given task. Developers of embedded systems would benefit
 most.
 ## Supported Syntax
    (?i)    Must be at the beginning of the regex. Makes match case-insensitive
    ^       Match beginning of a buffer
    $       Match end of a buffer
    ()      Grouping and substring capturing
    \s      Match whitespace
    \S      Match non-whitespace
    \d      Match decimal digit
    \n      Match new line character
    \r      Match line feed character
    \f      Match form feed character
    \v      Match vertical tab character
    \t      Match horizontal tab character
    \b      Match backspace character
    +       Match one or more times (greedy)
    +?      Match one or more times (non-greedy)
    *       Match zero or more times (greedy)
    *?      Match zero or more times (non-greedy)
    ?       Match zero or once (non-greedy)
    x|y     Match x or y (alternation operator)
    \meta   Match one of the meta character: ^$().[]*+?|\
    \xHH    Match byte with hex value 0xHH, e.g. \x4a
    [...]   Match any character from set. Ranges like [a-z] are supported
    [^...]  Match any character but ones from set
 Under development: Unicode support.
 ## API
    int slre_match(const char *regexp, const char *buf, int buf_len,
                   struct slre_cap *caps, int num_caps, int flags);
 `slre_match()` matches string buffer `buf` of length `buf_len` against
 regular expression `regexp`, which should conform the syntax outlined
 above. If regular expression `regexp` contains brackets, `slre_match()`
 can capture the respective substrings into the array of `struct slre_cap`
 structures:
    /* Stores matched fragment for the expression inside brackets */
    struct slre_cap {
      const char *ptr;  /* Points to the matched fragment */
      int len;          /* Length of the matched fragment */
    };
 N-th member of the `caps` array will contain fragment that corresponds to the
 N-th opening bracket in the `regex`, N is zero-based. `slre_match()` returns
 number of bytes scanned from the beginning of the string. If return value is
 greater or equal to 0, there is a match. If return value is less then 0, there
 is no match. Negative return codes are as follows:
    #define SLRE_NO_MATCH               -1
    #define SLRE_UNEXPECTED_QUANTIFIER  -2
    #define SLRE_UNBALANCED_BRACKETS    -3
    #define SLRE_INTERNAL_ERROR         -4
    #define SLRE_INVALID_CHARACTER_SET  -5
    #define SLRE_INVALID_METACHARACTER  -6
    #define SLRE_CAPS_ARRAY_TOO_SMALL   -7
    #define SLRE_TOO_MANY_BRANCHES      -8
    #define SLRE_TOO_MANY_BRACKETS      -9
 ## Example: parsing HTTP request line
    const char *request = " GET /index.html HTTP/1.0\r\n\r\n";
    struct slre_cap caps[4];
    if (slre_match("^\\s*(\\S+)\\s+(\\S+)\\s+HTTP/(\\d)\\.(\\d)",
                   request, strlen(request), caps, 4, 0) > 0) {
      printf("Method: [%.*s], URI: [%.*s]\n",
             caps[0].len, caps[0].ptr,
             caps[1].len, caps[1].ptr);
    } else {
      printf("Error parsing [%s]\n", request);
    }
 ## Example: find all URLs in a string
    static const char *str =
      "<img src=\"HTTPS://FOO.COM/x?b#c=tab1\"/> "
      "  <a href=\"http://cesanta.com\">some link</a>";
    static const char *regex = "(?i)((https?://)[^\\s/'\"<>]+/?[^\\s'\"<>]*)";
    struct slre_cap caps[2];
    int i, j = 0, str_len = strlen(str);
    while (j < str_len &&
           (i = slre_match(regex, str + j, str_len - j, caps, 2, 0)) > 0) {
      printf("Found URL: [%.*s]\n", caps[0].len, caps[0].ptr);
      j += i;
    }
 Output:
    Found URL: [HTTPS://FOO.COM/x?b#c=tab1]
    Found URL: [http://cesanta.com]
 # License
 SLRE is released under
 [GNU GPL v.2](http://www.gnu.org/licenses/old-licenses/gpl-2.0.html).
 Businesses have an option to get non-restrictive, royalty-free commercial
 license and professional support from
 [Cesanta Software](http://cesanta.com).
 [Super Light DNS Resolver](https://github.com/cesanta/sldr),
 [Mongoose web server](https://github.com/cesanta/mongoose)
 are other projects by Cesanta Software, developed with the same philosophy
 of functionality and simplicity.
--- a/ROBITools/data/termes.rda
+++ b/ROBITools/data/termes.rda
--- a/ROBITools/inst/extdata/termes.fasta
+++ b/ROBITools/inst/extdata/termes.fasta
--- a/ROBITools/inst/extdata/termes.tab
+++ b/ROBITools/inst/extdata/termes.tab
--- a/ROBITools/inst/extdata/termes_ngsfilt.txt
+++ b/ROBITools/inst/extdata/termes_ngsfilt.txt
@@ -0,0 +1,21 @@
 termes_data	A01	acacacac:acacacac	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_01A; serie=01; coordX=5; coordY=5;
 termes_data	A02	acagcaca:acacacac	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_01B; serie=01; coordX=10; coordY=5;
 termes_data	A03	gtgtacat:acacacac	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_01C; serie=01; coordX=15; coordY=5;
 termes_data	A04	tatgtcag:acacacac	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_01D; serie=01; coordX=20; coordY=5;
 termes_data	A05	tagtcgca:acacacac	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_01E; serie=01; coordX=25; coordY=5;
 termes_data	A06	tactatac:acacacac	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_01F; serie=01; coordX=30; coordY=5;
 termes_data	A07	actagatc:acacacac	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_01G; serie=01; coordX=35; coordY=5;
 termes_data	A08	gatcgcga:acacacac	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_01H; serie=01; coordX=40; coordY=5;
 termes_data	A09	acacacac:acagcaca	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_02A; serie=01; coordX=45; coordY=5;
 termes_data	A10	acagcaca:acagcaca	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_02B; serie=01; coordX=50; coordY=5;
 termes_data	A11	gtgtacat:acagcaca	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_02C; serie=01; coordX=55; coordY=5;
 termes_data	A12	tatgtcag:acagcaca	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_02D; serie=01; coordX=60; coordY=5;
 termes_data	A13	tagtcgca:acagcaca	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_02E; serie=02; coordX=65; coordY=5;
 termes_data	A14	tactatac:acagcaca	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_02F; serie=02; coordX=70; coordY=5;
 termes_data	A15	actagatc:acagcaca	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_02G; serie=02; coordX=75; coordY=5;
 termes_data	A16	gatcgcga:acagcaca	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_02H; serie=02; coordX=80; coordY=5;
 termes_data	A17	acacacac:gtgtacat	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_03A; serie=02; coordX=85; coordY=5;
 termes_data	A18	acagcaca:gtgtacat	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_03B; serie=02; coordX=90; coordY=5;
 termes_data	A19	gtgtacat:gtgtacat	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=01_03C; serie=02; coordX=95; coordY=5;
 termes_data	A11r	tcagtgtc:gtcgtaga	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=04_10B; serie=01; coordX=55; coordY=5;
 termes_data	A16r	actctgct:gtcgtaga	ATTTCAGGTCAAGGTGCAGC	TACAACCAAATCCAATTTCA	F @ position=04_10C; serie=02; coordX=80; coordY=5;
--- a/ROBITools/src/ROBITools.so
+++ b/ROBITools/src/ROBITools.so
--- a/ROBITools/src/ecoError.c
+++ b/ROBITools/src/ecoError.c
@@ -0,0 +1,26 @@
 #include "ecoPCR.h"
 #include <stdio.h>
 #include <stdlib.h>
 /*
 * print the message given as argument and exit the program
 * @param error		error number	
 * @param message 	the text explaining what's going on
 * @param filename	the file source where the program failed
 * @param linenumber	the line where it has failed
 * filename and linenumber are written at pre-processing 
 * time by a macro
 */
 void ecoError(int32_t error,
              const char* message,
              const char * filename,
              int linenumber)
 {
 	fprintf(stderr,"Error %d in file %s line %d : %s\n",
 	               error,
 	               filename,
 	               linenumber,
 	               message);
 	abort();
 }
--- a/ROBITools/src/ecoError.o
+++ b/ROBITools/src/ecoError.o
--- a/ROBITools/src/ecoIOUtils.c
+++ b/ROBITools/src/ecoIOUtils.c
@@ -0,0 +1,122 @@
 #include "ecoPCR.h"
 #include <stdio.h>
 #include <stdlib.h>
 #define SWAPINT32(x)     ((((x) << 24) & 0xFF000000) | (((x) <<  8) & 0xFF0000) | \
                         (((x) >>  8) & 0xFF00)     | (((x) >> 24) & 0xFF))
 int32_t is_big_endian()
 {
 	int32_t i=1;
 	return (int32_t)((char*)&i)[0];
 }
 int32_t swap_int32_t(int32_t i)
 {
 	return SWAPINT32(i);
 }
 /**
 * Read part of the file
 * @param	*f	the database
 * @param	recordSize the size to be read
 * 
 * @return	buffer
 */
 void *read_ecorecord(FILE *f,int32_t *recordSize)
 {
 	static void *buffer    =NULL;
 	int32_t      buffersize=0;
 	int32_t      read;
 	if (!recordSize)
 		ECOERROR(ECO_ASSERT_ERROR,
 		         "recordSize cannot be NULL");
 	read = fread(recordSize,
 	      		 1,
 	      		 sizeof(int32_t),
 	             f);
 	if (feof(f))
 		return NULL;
 	if (read != sizeof(int32_t))
 		ECOERROR(ECO_IO_ERROR,"Reading record size error");
 	if (is_big_endian())
 		*recordSize=swap_int32_t(*recordSize);
 	if (buffersize < *recordSize)
 	{
 		if (buffer)
 			buffer = ECOREALLOC(buffer,*recordSize,
 			                    "Increase size of record buffer");
 		else
 			buffer = ECOMALLOC(*recordSize,
 			                    "Allocate record buffer");
 	}
 	read = fread(buffer,
 	             1,
 				 *recordSize,
 				 f);
 	if (read != *recordSize)
 		ECOERROR(ECO_IO_ERROR,"Reading record data error");
 	return buffer;	 
 };
 /**
 * Open the database and check it's readable
 * @param 	filename 		name of the database (.sdx, .rdx, .tbx)
 * @param 	sequencecount	buffer - pointer to variable storing the number of occurence    
 * @param 	abort_on_open_error		 	boolean to define the behaviour in case of error 
 * 										while opening the database
 * @return 	FILE type
 **/
 FILE *open_ecorecorddb(const char *filename,
                       int32_t    *sequencecount,
                       int32_t    abort_on_open_error)
 {
    FILE        *f;
 	int32_t      read;
 	f = fopen(filename,"rb");
 	if (!f)
 		{
 			if (abort_on_open_error)
 		 		ECOERROR(ECO_IO_ERROR,"Cannot open file");
 		 	else
 		 	{
 		 		*sequencecount=0;
 		 		return NULL;
 		 	}
 		}
 	read = fread(sequencecount,
 	      		 1,
 	      		 sizeof(int32_t),
 	      		 f);
 	if (read != sizeof(int32_t))
 		ECOERROR(ECO_IO_ERROR,"Reading record size error");
 	if (is_big_endian())
 		*sequencecount=swap_int32_t(*sequencecount);
 	return f;                  
 }
--- a/ROBITools/src/ecoIOUtils.o
+++ b/ROBITools/src/ecoIOUtils.o
--- a/ROBITools/src/ecoMalloc.c
+++ b/ROBITools/src/ecoMalloc.c
@@ -0,0 +1,79 @@
 #include "ecoPCR.h"
 #include <stdlib.h>
 static int eco_log_malloc = 0;
 void    eco_trace_memory_allocation()
 {
 	eco_log_malloc=1;
 }
 void    eco_untrace_memory_allocation()
 {
 	eco_log_malloc=0;
 }
 void   *eco_malloc(int32_t chunksize,
                   const char *error_message,
                   const char *filename,
                   int32_t    line)
 {
 	void * chunk;
 	chunk = calloc(1,chunksize);
 	if (!chunk)
 		ecoError(ECO_MEM_ERROR,error_message,filename,line);
 	if (eco_log_malloc)
 		fprintf(stderr,
 			    "Memory segment located at %p of size %d is allocated (file : %s [%d])",
 			    chunk,
 			    chunksize,
 			    filename,
 			    line);
 	return chunk;
 }
 void   *eco_realloc(void *chunk,
                    int32_t newsize,
                    const char *error_message,
                    const char *filename,
                    int32_t    line)
 {
 	void *newchunk;
 	newchunk = realloc(chunk,newsize);
 	if (!newchunk)
 		ecoError(ECO_MEM_ERROR,error_message,filename,line);
 	if (eco_log_malloc)
 		fprintf(stderr,
 			    "Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
 			    chunk,
 			    newchunk,
 			    newsize,
 			    filename,
 			    line);
 	return newchunk;	
 }
 void    eco_free(void *chunk,
                 const char *error_message,
                 const char *filename,
                 int32_t    line)
 {
 	free(chunk);
 	if (eco_log_malloc)
 		fprintf(stderr,
 			    "Memory segment %p is released => %s (file : %s [%d])",
 			    chunk,
 			    error_message,
 			    filename,
 			    line);
 }
--- a/ROBITools/src/ecoMalloc.o
+++ b/ROBITools/src/ecoMalloc.o
--- a/ROBITools/src/ecoPCR.h
+++ b/ROBITools/src/ecoPCR.h
@@ -0,0 +1,283 @@
 #ifndef ECOPCR_H_
 #define ECOPCR_H_
 #include <stdio.h>
 #include <inttypes.h>
 #include <R.h>
 #include <Rinternals.h>
 #include <Rdefines.h>
 //#ifndef H_apat
 //#include "../libapat/apat.h"
 //#endif
 /*****************************************************
 * 
 *  Data type declarations
 * 
 *****************************************************/
 /*
 * 
 *  Sequence types
 * 
 */
 typedef struct {
 	int32_t  taxid;
 	char     AC[20];
 	int32_t  DE_length;
 	int32_t  SQ_length;
 	int32_t  CSQ_length;
 	char     data[1];
 } ecoseqformat_t;
 typedef struct {
 	int32_t taxid;
 	int32_t SQ_length;
 	char    *AC;
 	char    *DE;
 	char    *SQ;
 } ecoseq_t;
 /*
 * 
 * Taxonomy taxon types
 * 
 */
 typedef struct {
 	int32_t  taxid;
 	int32_t  rank;
 	int32_t	 parent;
 	int32_t  namelength;
 	char     name[1];
 } ecotxformat_t;
 typedef struct ecotxnode {
 	int32_t           taxid;
 	int32_t           rank;
 	int32_t  		      farest;
 	struct ecotxnode  *parent;
 	char              *name;
 } ecotx_t;
 typedef struct {
 	int32_t count;
 	int32_t maxtaxid;
  int32_t buffersize;
 	ecotx_t taxon[1];
 } ecotxidx_t;
 /*
 * 
 * Taxonomy rank types
 * 
 */
 typedef struct {
 	int32_t count;
 	char*   label[1];
 } ecorankidx_t;
 /*
 * 
 * Taxonomy name types
 * 
 */
 typedef struct {
 	int32_t is_scientificname;
 	int32_t  namelength;
 	int32_t  classlength;
 	int32_t  taxid;
 	char     names[1];	
 } econameformat_t;
 typedef struct {
 	char 	*name;
 	char 	*classname;
 	int32_t is_scientificname;
 	struct ecotxnode  *taxon;
 } econame_t;
 typedef struct {
 	int32_t count;
 	econame_t   names[1];
 } econameidx_t;
 typedef struct {
 	ecorankidx_t *ranks;
 	econameidx_t *names;
 	ecotxidx_t   *taxons;
 } ecotaxonomy_t;
 /*****************************************************
 * 
 *  Function declarations
 * 
 *****************************************************/
 /*
 * 
 * Low level system functions
 * 
 */
 int32_t is_big_endian();
 int32_t swap_int32_t(int32_t);
 void   *eco_malloc(int32_t chunksize,
                   const char *error_message,
                   const char *filename,
                   int32_t    line);
 void   *eco_realloc(void *chunk,
                    int32_t chunksize,
                    const char *error_message,
                    const char *filename,
                    int32_t    line);
 void    eco_free(void *chunk,
                 const char *error_message,
                 const char *filename,
                 int32_t    line);
 void    eco_trace_memory_allocation();
 void    eco_untrace_memory_allocation();
 #define ECOMALLOC(size,error_message) \
 	    eco_malloc((size),(error_message),__FILE__,__LINE__)
 #define ECOREALLOC(chunk,size,error_message) \
        eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__)
 #define ECOFREE(chunk,error_message) \
        eco_free((chunk),(error_message),__FILE__,__LINE__)
 /*
 * 
 * Error managment
 * 
 */
 void ecoError(int32_t,const char*,const char *,int);
 #define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__)
 #define ECO_IO_ERROR       (1)
 #define ECO_MEM_ERROR      (2)
 #define ECO_ASSERT_ERROR   (3)
 #define ECO_NOTFOUND_ERROR (4)
 /*
 * 
 * Low level Disk access functions
 * 
 */
 FILE *open_ecorecorddb(const char *filename,
                       int32_t    *sequencecount,
                       int32_t    abort_on_open_error);
 void *read_ecorecord(FILE *,int32_t *recordSize);
 /* 
 *   Read function in internal binary format
 */
 FILE             *open_ecoseqdb(const char *filename,
                                int32_t    *sequencecount);
 ecoseq_t         *readnext_ecoseq(FILE *);
 ecorankidx_t     *read_rankidx(const char *filename);
 econameidx_t     *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy);
 	/**
 	 * Read taxonomy data as formated by the ecoPCRFormat.py script.
 	 * 
 	 * This function is normaly uses internaly by the read_taxonomy
 	 * function and should not be called directly.
 	 * 
 	 * @arg filename  path to the *.tdx file of the reformated db
 	 * 
 	 * @return pointer to a taxonomy index structure
 	 */
 ecotxidx_t       *read_taxonomyidx(const char *filename,const char *filename2);
 ecotaxonomy_t    *read_taxonomy(const char *prefix,int32_t readAlternativeName);
 ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, int32_t rankidx);
 ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid);
 int eco_isundertaxon(ecotx_t *taxon, int other_taxid);
 ecoseq_t *ecoseq_iterator(const char *prefix);
 ecoseq_t *new_ecoseq();
 int32_t   delete_ecoseq(ecoseq_t *);
 ecoseq_t *new_ecoseq_with_data( char *AC,
 								char *DE,
 								char *SQ,
 								int32_t   taxid
 								);
 int32_t delete_taxon(ecotx_t *taxon);
 int32_t delete_taxonomy(ecotxidx_t *index);
 int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy);
 int32_t rank_index(const char* label,ecorankidx_t* ranks);
 //int32_t  delete_apatseq(SeqPtr pseq);
 //PatternPtr buildPattern(const char *pat, int32_t error_max);
 //PatternPtr complementPattern(PatternPtr pat);
 //
 //SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular);
 //char *ecoComplementPattern(char *nucAcSeq);
 //char *ecoComplementSequence(char *nucAcSeq);
 //char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end);
 ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
 ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
 ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
 ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
 ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
 //int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid);
 //int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid);
 ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
 #endif /*ECOPCR_H_*/
--- a/ROBITools/src/ecodna.c
+++ b/ROBITools/src/ecodna.c
@@ -0,0 +1,156 @@
 #include <string.h>
 #include "ecoPCR.h"
 /*
 * @doc: DNA alphabet (IUPAC)
 */
 #define LX_BIO_DNA_ALPHA   "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
 /*
 * @doc: complementary DNA alphabet (IUPAC)
 */
 #define LX_BIO_CDNA_ALPHA  "TVGHEFCDIJMLKNOPQYSAABWXRZ#!]["
 static char sNuc[]     = LX_BIO_DNA_ALPHA;
 static char sAnuc[]    = LX_BIO_CDNA_ALPHA;
 static char LXBioBaseComplement(char nucAc);
 static char *LXBioSeqComplement(char *nucAcSeq);
 static char *reverseSequence(char *str,char isPattern);
 /* ---------------------------- */
 char LXBioBaseComplement(char nucAc)
 {
    char *c;
    if ((c = strchr(sNuc, nucAc)))
        return sAnuc[(c - sNuc)];
    else
        return nucAc;
 }
 /* ---------------------------- */
 char *LXBioSeqComplement(char *nucAcSeq)
 {
    char *s;
    for (s = nucAcSeq ; *s ; s++)
        *s = LXBioBaseComplement(*s);
    return nucAcSeq;
 }
 char *reverseSequence(char *str,char isPattern)
 {
        char *sb, *se, c;
        if (! str)
            return str;
        sb = str;
        se = str + strlen(str) - 1;
        while(sb <= se) {
           c    = *sb;
          *sb++ = *se;
          *se-- = c;
        }
 		sb = str;
 		se = str + strlen(str) - 1;
 		if (isPattern)
 			for (;sb < se; sb++)
 			{
 				if (*sb=='#')
 				{
 					if (((se - sb) > 2) && (*(sb+2)=='!'))
 					{
 						*sb='!';
 						sb+=2;
 						*sb='#';
 					}
 					else
 					{
 						*sb=*(sb+1);
 						sb++;
 						*sb='#';
 					}
 				}
 				else if (*sb=='!')
 					{
 						*sb=*(sb-1);
 						*(sb-1)='!';
 					}
 			}
        return str;
 }
 char *ecoComplementPattern(char *nucAcSeq)
 {
    return reverseSequence(LXBioSeqComplement(nucAcSeq),1);
 }
 char *ecoComplementSequence(char *nucAcSeq)
 {
    return reverseSequence(LXBioSeqComplement(nucAcSeq),0);
 }
 char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end)
 /*
   extract subsequence from nucAcSeq [begin,end[
 */
 {
 	static char *buffer  = NULL;
 	static int32_t buffSize= 0;
 	int32_t length;
 	if (begin < end)
 	{
 		length = end - begin;
 		if (length >= buffSize)
 		{
 			buffSize = length+1;
 			if (buffer)
 				buffer=ECOREALLOC(buffer,buffSize,
 						   	      "Error in reallocating sub sequence buffer");
 			else
 				buffer=ECOMALLOC(buffSize,
 				          		 "Error in allocating sub sequence buffer");
 		}
 		strncpy(buffer,nucAcSeq + begin,length);
 		buffer[length]=0;
 	}
 	else
 	{
 		length = end + strlen(nucAcSeq) - begin;
 		if (length >= buffSize)
 		{
 			buffSize = length+1;
 			if (buffer)
 				buffer=ECOREALLOC(buffer,buffSize,
 						   	      "Error in reallocating sub sequence buffer");
 			else
 				buffer=ECOMALLOC(buffSize,
 				          		 "Error in allocating sub sequence buffer");
 		}
 		strncpy(buffer,nucAcSeq+begin,length - end);
 		strncpy(buffer+(length-end),nucAcSeq ,end);
 		buffer[length]=0;
 	}
 	return buffer;
 }
--- a/ROBITools/src/ecodna.o
+++ b/ROBITools/src/ecodna.o
--- a/ROBITools/src/ecofilter.c
+++ b/ROBITools/src/ecofilter.c
@@ -0,0 +1,20 @@
 #include "ecoPCR.h"
 int eco_is_taxid_included(	ecotaxonomy_t *taxonomy, 
 							int32_t *restricted_taxid, 
 							int32_t tab_len, 
 							int32_t taxid)
 {
 	int i;
 	ecotx_t *taxon;
 	taxon = eco_findtaxonbytaxid(taxonomy, taxid);
 	if (taxon)
 		for (i=0; i < tab_len; i++)
 			if ( (taxon->taxid == restricted_taxid[i]) ||
 				 (eco_isundertaxon(taxon, restricted_taxid[i])) )
 				return 1;
 	return 0;
 }
--- a/ROBITools/src/ecofilter.o
+++ b/ROBITools/src/ecofilter.o
--- a/ROBITools/src/econame.c
+++ b/ROBITools/src/econame.c
@@ -0,0 +1,64 @@
 #include "ecoPCR.h"
 #include <string.h>
 #include <stdlib.h>
 static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy);
 econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy)
 {
 	int32_t      		count;
 	FILE         		*f;
 	econameidx_t		*indexname;
 	int32_t      		i;
 	f = open_ecorecorddb(filename,&count,0);
 	if (f==NULL)
 		return NULL;
 	indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names");
 	indexname->count=count;
 	for (i=0; i < count; i++){
 		readnext_econame(f,(indexname->names)+i,taxonomy);
 	}
 	return indexname;
 }
 econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy)
 {
 	econameformat_t *raw;
 	int32_t  rs;
 	raw = read_ecorecord(f,&rs);
 	if (!raw)
 		return NULL;
 	if (is_big_endian())
 	{
 		raw->is_scientificname 	= swap_int32_t(raw->is_scientificname);
 		raw->namelength 	    = swap_int32_t(raw->namelength);
 		raw->classlength        = swap_int32_t(raw->classlength);
 		raw->taxid  	        = swap_int32_t(raw->taxid); 
 	}
 	name->is_scientificname=raw->is_scientificname;
 	name->name   	= ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name");
 	strncpy(name->name,raw->names,raw->namelength);
 	name->name[raw->namelength]=0;
 	name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname");
 	strncpy(name->classname,(raw->names+raw->namelength),raw->classlength);
 	name->classname[raw->classlength]=0;
 	name->taxon = taxonomy->taxons->taxon + raw->taxid;
 	return name;
 }
--- a/ROBITools/src/econame.o
+++ b/ROBITools/src/econame.o
--- a/ROBITools/src/ecorank.c
+++ b/ROBITools/src/ecorank.c
@@ -0,0 +1,55 @@
 #include "ecoPCR.h"
 #include <string.h>
 #include <stdlib.h>
 static int compareRankLabel(const void *label1, const void *label2);
 ecorankidx_t     *read_rankidx(const char *filename)
 {
 	int32_t      count;
 	FILE         *f;
 	ecorankidx_t *index;
 	int32_t      i;
 	int32_t      rs;
 	char         *buffer;
 	f = open_ecorecorddb(filename,&count,0);
 	if (f==NULL)
 		return NULL;
 	index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1),
 	                                  "Allocate rank index");
 	index->count=count;                                 
 	for (i=0; i < count; i++)
 		{
 			buffer = read_ecorecord(f,&rs);
 			index->label[i]=(char*) ECOMALLOC(rs+1,
 			                                  "Allocate rank label");
 			strncpy(index->label[i],buffer,rs);
 		}
 	return index;
 }
 int32_t rank_index(const char* label,ecorankidx_t* ranks)
 {
 	char **rep;
 	rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel);
 	if (rep)
 		return rep-ranks->label;
 //	else
 //		ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found");
 	return -1;
 }
 int compareRankLabel(const void *label1, const void *label2)
 {
 	return strcmp((const char*)label1,*(const char**)label2);
 }
--- a/ROBITools/src/ecorank.o
+++ b/ROBITools/src/ecorank.o
--- a/ROBITools/src/ecoseq.c
+++ b/ROBITools/src/ecoseq.c
@@ -0,0 +1,230 @@
 #include "ecoPCR.h"
 #include <stdlib.h>
 #include <string.h>
 #include <zlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <ctype.h>
 static FILE *open_seqfile(const char *prefix,int32_t index);
 ecoseq_t *new_ecoseq()
 {
 	void *tmp;
 	tmp = ECOMALLOC(sizeof(ecoseq_t),"Allocate new ecoseq structure");
 	return tmp;
 }
 int32_t delete_ecoseq(ecoseq_t * seq)
 {
 	if (seq)
 	{
 		if (seq->AC)
 			ECOFREE(seq->AC,"Free sequence AC");
 		if (seq->DE)
 			ECOFREE(seq->DE,"Free sequence DE");
 		if (seq->SQ)
 			ECOFREE(seq->SQ,"Free sequence SQ");
 		ECOFREE(seq,"Free sequence structure");
 		return 0;
 	}
 	return 1;
 }
 ecoseq_t *new_ecoseq_with_data( char *AC,
 								char *DE,
 								char *SQ,
 								int32_t   taxid_idx
 								)
 {
 	ecoseq_t *tmp;
 	int32_t lstr;
 	tmp = new_ecoseq();
 	tmp->taxid=taxid_idx;
 	if (AC)
 		{
 			lstr =strlen(AC);
 			tmp->AC=ECOMALLOC((lstr+1) * sizeof(char),
 			                  "Allocate sequence accession");
 			strcpy(tmp->AC,AC);
 		}
 	if (DE)
 		{
 			lstr =strlen(DE);
 			tmp->DE=ECOMALLOC((lstr+1) * sizeof(char),
 			                  "Allocate sequence definition");
 			strcpy(tmp->DE,DE);
 		}
 	if (SQ)
 		{
 			lstr =strlen(SQ);
 			tmp->SQ=ECOMALLOC((lstr+1) * sizeof(char),
 			                  "Allocate sequence data");
 			strcpy(tmp->SQ,SQ);
 		}
 	return tmp;
 }
 /**
 * ?? used ??
 **/
 FILE *open_ecoseqdb(const char *filename,
                    int32_t    *sequencecount)
 {
 	return open_ecorecorddb(filename,sequencecount,1);
 }
 ecoseq_t *readnext_ecoseq(FILE *f)
 {
 	char     *compressed=NULL;
 	ecoseqformat_t *raw;
 	ecoseq_t *seq;
 	int32_t  comp_status;
 	unsigned long int seqlength;
 	int32_t  rs;
 	char *c;
 	int32_t i;
 	raw = read_ecorecord(f,&rs);
 	if (!raw)
 		return NULL;
 	if (is_big_endian())
 	{
 		raw->CSQ_length = swap_int32_t(raw->CSQ_length);
 		raw->DE_length  = swap_int32_t(raw->DE_length);
 		raw->SQ_length  = swap_int32_t(raw->SQ_length);
 		raw->taxid      = swap_int32_t(raw->taxid);
 	}
 	seq = new_ecoseq();
 	seq->taxid = raw->taxid;
    seq->AC    = ECOMALLOC(strlen(raw->AC) +1,
                           "Allocate Sequence Accesion number");
    strncpy(seq->AC,raw->AC,strlen(raw->AC));
    seq->DE    = ECOMALLOC(raw->DE_length+1,
                           "Allocate Sequence definition");
    strncpy(seq->DE,raw->data,raw->DE_length);
 	seqlength = seq->SQ_length = raw->SQ_length;
    compressed = raw->data + raw->DE_length;
    seq->SQ = ECOMALLOC(seqlength+1,
                        "Allocate sequence buffer");
    comp_status = uncompress((unsigned char*)seq->SQ,
                             &seqlength,
                             (unsigned char*)compressed,
                             raw->CSQ_length);
    if (comp_status != Z_OK)
    	ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data");
    for (c=seq->SQ,i=0;i<seqlength;c++,i++)
    	*c=toupper(*c);
 	return seq;
 }
 /**
 * Open the sequences database (.sdx file)
 * @param	prefix	name of the database (radical without extension)
 * @param	index 	integer
 *
 * @return	file object
 */
 FILE *open_seqfile(const char *prefix,int32_t index)
 {
 	char           filename_buffer[1024];
 	int32_t        filename_length;
 	FILE           *input;
 	int32_t        seqcount;
 	filename_length = snprintf(filename_buffer,
 								1023,
 	                           "%s_%03d.sdx",
 	                           prefix,
 	                           index);
 		//	fprintf(stderr,"# Coucou %s\n",filename_buffer);
 	if (filename_length >= 1024)
 		ECOERROR(ECO_ASSERT_ERROR,"file name is too long");
 	filename_buffer[filename_length]=0;
 	input=open_ecorecorddb(filename_buffer,&seqcount,0);
 	if (input)
 		fprintf(stderr,"# Reading file %s containing %d sequences...\n",
 				filename_buffer,
 				seqcount);
 	return input;
 }
 ecoseq_t *ecoseq_iterator(const char *prefix)
 {
 	static FILE    *current_seq_file= NULL;
 	static int32_t current_file_idx = 1;
 	static char    current_prefix[1024];
 	ecoseq_t       *seq;
 	if (prefix)
 	{
 		current_file_idx = 1;
 		if (current_seq_file)
 			fclose(current_seq_file);
 		strncpy(current_prefix,prefix,1023);
 		current_prefix[1023]=0;
 		current_seq_file = open_seqfile(current_prefix,
 		 							    current_file_idx);
 		if (!current_seq_file)
 			return NULL;
 	}
 	seq = readnext_ecoseq(current_seq_file);
 	if (!seq && feof(current_seq_file))
 	{
 		current_file_idx++;
 		fclose(current_seq_file);
 		current_seq_file = open_seqfile(current_prefix,
 		 							    current_file_idx);
 		if (current_seq_file)
 			seq = readnext_ecoseq(current_seq_file);
 	}
 	return seq;
 }
--- a/ROBITools/src/ecoseq.o
+++ b/ROBITools/src/ecoseq.o
--- a/ROBITools/src/ecotax.c
+++ b/ROBITools/src/ecotax.c
@@ -0,0 +1,437 @@
 #include "ecoPCR.h"
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <R.h>
 #ifndef MAX
 #define MAX(x,y) (((x)>(y)) ? (x):(y))
 #endif
 static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon);
 /** 
 * Open the taxonomy database 
 * @param	pointer to the database (.tdx file) 
 * @return	a ecotxidx_t structure 
 */
 ecotxidx_t     *read_taxonomyidx(const char *filename,const char *filename2)
 {
 	int32_t      count;
 	int32_t      count2;
 	FILE         *f;
 	FILE         *f2;
 	ecotxidx_t *index;
 	struct ecotxnode  *t;
 	int32_t      i;
 	int32_t      j;
 	f  = open_ecorecorddb(filename,&count,0);
 	if (f==NULL) return NULL;
 	f2 = open_ecorecorddb(filename2,&count2,0);
 	index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count+count2-1),
 	                                  "Allocate taxonomy");
 	index->count=count+count2;
  index->buffersize = index->count;
 	index->maxtaxid=0;
 	REprintf("Readind %d taxa...\n",count);
 	for (i=0; i < count; i++){
 		readnext_ecotaxon(f,&(index->taxon[i]));
 		index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
 		index->taxon[i].parent->farest=0;
 		if (index->taxon[i].taxid > index->maxtaxid)
 			index->maxtaxid=index->taxon[i].taxid;
 	}					
 	if (count2>0)
 		REprintf("Readind %d local taxa...\n",count2);
 	else
 		REprintf("No local taxon\n");
 	count = index->count;
 	for (; i < count; i++){
 		readnext_ecotaxon(f2,&(index->taxon[i]));
 		index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
 		index->taxon[i].parent->farest=0;
 		if (index->taxon[i].taxid > index->maxtaxid)
 			index->maxtaxid=index->taxon[i].taxid;
 	}
 	REprintf("Computing longest branches...\n",count);
 	for (i=0; i < count; i++){
 		t=index->taxon+i;
 		if (t->farest==-1)
 		{
 			t->farest=0;
            while(t->parent != t)
            {
            	j = t->farest + 1;
            	if (j > t->parent->farest)
            	{
            		t->parent->farest = j;
            		t=t->parent;
            	}
            	else
            		t=index->taxon;
            }
 		}
 	}
 	return index;
 }
 int32_t delete_taxonomy(ecotxidx_t *index)
 {
 	int32_t i;
 	if (index)
 	{
 		for (i=0; i< index->count; i++)
 			if (index->taxon[i].name)
 				ECOFREE(index->taxon[i].name,"Free scientific name");
 		ECOFREE(index,"Free Taxonomy");
 		return 0;
 	}
 	return 1;
 }
 int32_t delete_taxon(ecotx_t *taxon)
 {
 	if (taxon)
 	{
 		if (taxon->name)
 			ECOFREE(taxon->name,"Free scientific name");
 		ECOFREE(taxon,"Free Taxon");
 		return 0;
 	}
 	return 1;
 }
 /**
 * Read the database for a given taxon a save the data 
 * into the taxon structure(if any found)
 * @param	*f	pointer to FILE type returned by fopen
 * @param	*taxon	pointer to the structure
 * 
 * @return	a ecotx_t structure if any taxon found else NULL 
 */
 ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
 {
 	ecotxformat_t *raw;
 	int32_t  rs;
 	raw = read_ecorecord(f,&rs);
 	if (!raw)
 		return NULL;
 	if (is_big_endian())
 	{
 		raw->namelength = swap_int32_t(raw->namelength);
 		raw->parent     = swap_int32_t(raw->parent);
 		raw->rank       = swap_int32_t(raw->rank);
 		raw->taxid      = swap_int32_t(raw->taxid); 
 	}
 	taxon->parent = (ecotx_t*)((size_t)raw->parent);
 	taxon->taxid  = raw->taxid;
 	taxon->rank   = raw->rank;
 	taxon->farest = -1;
 	taxon->name   = ECOMALLOC((raw->namelength+1) * sizeof(char),
 	                          "Allocate taxon scientific name");
 	strncpy(taxon->name,raw->name,raw->namelength);
 	return taxon;
 }
 ecotaxonomy_t    *read_taxonomy(const char *prefix,int32_t readAlternativeName)
 {
 	ecotaxonomy_t *tax;
 	char          *filename;
 	char          *filename2;
 	int           buffsize;
 	tax = ECOMALLOC(sizeof(ecotaxonomy_t),
 	                "Allocate taxonomy structure");
 	tax->ranks =NULL;
 	tax->taxons=NULL;
 	tax->names =NULL;
 	buffsize = strlen(prefix)+10;
 	filename = ECOMALLOC(buffsize,
 	                     "Allocate filename");
 	filename2= ECOMALLOC(buffsize,
 	                     "Allocate filename");
 	snprintf(filename,buffsize,"%s.rdx",prefix);
 	tax->ranks = read_rankidx(filename);
 	if (tax->ranks == NULL)
 	{
 		ECOFREE(filename,"Desallocate filename 1");
 		ECOFREE(filename2,"Desallocate filename 2");
 		delete_ecotaxonomy(tax);
 		return NULL;
 	}
 	snprintf(filename,buffsize,"%s.tdx",prefix);
 	snprintf(filename2,buffsize,"%s.ldx",prefix);
 	tax->taxons = read_taxonomyidx(filename,filename2);
 	if (tax->taxons == NULL)
 	{
 		ECOFREE(filename,"Desallocate filename 1");
 		ECOFREE(filename,"Desallocate filename 2");
 		delete_ecotaxonomy(tax);
 		return NULL;
 	}
 	if (readAlternativeName)
 	{
  	   snprintf(filename,buffsize,"%s.ndx",prefix);
 	   tax->names=read_nameidx(filename,tax);
 	}
 	else
 	   tax->names=NULL;
 	ECOFREE(filename,"Desallocate filename 1");
 	ECOFREE(filename2,"Desallocate filename 2");
 	return tax;
 }
 int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy)
 {
 	if (taxonomy)
 	{
 		if (taxonomy->ranks)
 			ECOFREE(taxonomy->ranks,"Free rank index");
 		if (taxonomy->names)
 			ECOFREE(taxonomy->names,"Free names index");
 		if (taxonomy->taxons)
 			ECOFREE(taxonomy->taxons,"Free taxon index");
 		ECOFREE(taxonomy,"Free taxonomy structure");
 		return 0;
 	}
 	return 1;
 }
 ecotx_t *eco_findtaxonatrank(ecotx_t *taxon,
                                int32_t rankidx)
 {
 	ecotx_t *current_taxon;
 	ecotx_t *next_taxon;
 	current_taxon = taxon;
 	next_taxon    = current_taxon->parent;
 	while ((current_taxon!=next_taxon) &&  // I' am the root node
 		   (current_taxon->rank!=rankidx))
 		   {
 		   	current_taxon = next_taxon;
 		   	next_taxon    = current_taxon->parent;
 		   }
 	if (current_taxon->rank==rankidx)
 		return current_taxon;
 	else
 		return NULL;
 }
 static int bcomptaxon (const void * ptaxid, const void * ptaxon) {
  ecotx_t    *current_taxon = (ecotx_t*)ptaxon;
  int32_t    taxid=(int32_t)((size_t)ptaxid);
  return taxid - current_taxon->taxid;
 }
 /**
 * Get back information concerning a taxon from a taxonomic id
 * @param 	*taxonomy 	the taxonomy database
 * @param	taxid		the taxonomic id 
 * 
 * @result	a ecotx_t structure containing the taxonimic information  
 **/
 ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, 
 							  int32_t taxid)
 {
 	ecotx_t    *current_taxon;
 	int32_t     taxoncount;
 //	int32_t     i;
 	taxoncount=taxonomy->taxons->count;
  current_taxon = (ecotx_t*)  bsearch((const void *)((size_t)taxid), 
                                      (const void *)taxonomy->taxons->taxon, 
                                      taxoncount, 
                                      sizeof(ecotx_t), 
                                      bcomptaxon);
 /* Old version  
 	for (current_taxon=taxonomy->taxons->taxon,
 	     i=0;
 	     i < taxoncount;
 	     i++,
 	     current_taxon++){
 	     if (current_taxon->taxid==taxid){
 	     	return current_taxon;
 	     }
 	 }
 */
 	return current_taxon;	
 }
 /**
 * Find out if taxon is son of other taxon (identified by its taxid)
 * @param	*taxon son 		taxon
 * @param	parent_taxid 	taxonomic id of the other taxon
 * 
 * @return 	1 is the other taxid math a parent taxid, else 0
 **/
 int eco_isundertaxon(ecotx_t *taxon, 
 						int other_taxid)
 {
 	ecotx_t *next_parent;
 	next_parent = taxon->parent;	
 	while ( (other_taxid != next_parent->taxid) && 
 			(strcmp(next_parent->name, "root")) )
 	{
 		next_parent = next_parent->parent;
 	}
 	if (other_taxid == next_parent->taxid)
 		return 1;
 	else
 		return 0;
 }
 ecotx_t *eco_getspecies(ecotx_t *taxon,
 						ecotaxonomy_t *taxonomy)
 {
 	static ecotaxonomy_t *tax=NULL;
 	static int32_t		 rankindex=-1;
 	if (taxonomy && tax!=taxonomy)
 	{
 		rankindex = rank_index("species",taxonomy->ranks);
 		tax=taxonomy;
 	}
 	if (!tax || rankindex < 0)
 		ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
 	return eco_findtaxonatrank(taxon,rankindex);
 }
 ecotx_t *eco_getgenus(ecotx_t *taxon,
 						ecotaxonomy_t *taxonomy)
 {
 	static ecotaxonomy_t *tax=NULL;
 	static int32_t		 rankindex=-1;
 	if (taxonomy && tax!=taxonomy)
 	{
 		rankindex = rank_index("genus",taxonomy->ranks);
 		tax=taxonomy;
 	}
 	if (!tax || rankindex < 0)
 		ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
 	return eco_findtaxonatrank(taxon,rankindex);
 }
 ecotx_t *eco_getfamily(ecotx_t *taxon,
 						ecotaxonomy_t *taxonomy)
 {
 	static ecotaxonomy_t *tax=NULL;
 	static int32_t		 rankindex=-1;
 	if (taxonomy && tax!=taxonomy)
 	{
 		rankindex = rank_index("family",taxonomy->ranks);
 		tax=taxonomy;
 	}
 	if (!tax || rankindex < 0)
 		ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
 	return eco_findtaxonatrank(taxon,rankindex);
 }
 ecotx_t *eco_getkingdom(ecotx_t *taxon,
 						ecotaxonomy_t *taxonomy)
 {
 	static ecotaxonomy_t *tax=NULL;
 	static int32_t		 rankindex=-1;
 	if (taxonomy && tax!=taxonomy)
 	{
 		rankindex = rank_index("kingdom",taxonomy->ranks);
 		tax=taxonomy;
 	}
 	if (!tax || rankindex < 0)
 		ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
 	return eco_findtaxonatrank(taxon,rankindex);
 }
 ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
 						ecotaxonomy_t *taxonomy)
 {
 	static ecotaxonomy_t *tax=NULL;
 	static int32_t		 rankindex=-1;
 	if (taxonomy && tax!=taxonomy)
 	{
 		rankindex = rank_index("superkingdom",taxonomy->ranks);
 		tax=taxonomy;
 	}
 	if (!tax || rankindex < 0)
 		ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
 	return eco_findtaxonatrank(taxon,rankindex);
 }
--- a/ROBITools/src/ecotax.o
+++ b/ROBITools/src/ecotax.o
--- a/ROBITools/src/robitax.c
+++ b/ROBITools/src/robitax.c
@@ -0,0 +1,835 @@
 /*
 * robitax.c
 *
 *  Created on: 17 janv. 2013
 *      Author: coissac
 */
 #include "robitax.h"
 #include <unistd.h>
 //#include <regex.h>
 #include "slre.h"
 /**
 * Return a pointeur to an obitools taxonomy C structure
 * from an R instance of taxonomy.obitools
 *
 * The function checks if the pointer stored in the R object is set
 * to NULL. In this case this means that we have to load the taxonomy
 * from the disk.
 *
 * @param taxonomy an R object
 * @type  taxonomy SEXP
 *
 * @return a pointer to the C structure
 * @rtype  ecotaxonomy_t *
 */
 ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy)
 {
 	char *pwd;
 	SEXP pointer;
 	SEXP rclass;
 	SEXP rdir;
 	SEXP rfile;
 	ecotaxonomy_t *ptax;
 	const char *class;
 	const char *file;
 	const char *dir;
  int saved;
    if (!IS_S4_OBJECT(Rtaxonomy) )
        error("argument not taxonomy.obitools instance");
 	// We get the class name and compare it to "taxonomy.obitools"
    rclass = getAttrib(Rtaxonomy, R_ClassSymbol);
    class = CHAR(asChar(rclass));
    if (strcmp(class,"taxonomy.obitools"))
        error("argument not taxonomy.obitools instance");
    pointer = R_do_slot(Rtaxonomy,mkString("pointer"));
    saved = LOGICAL(R_do_slot(Rtaxonomy,mkString("saved")))[0];
    ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(pointer);
    // If the external pointer is set to NULL we have to load
    // the taxonomy from file
    if (ptax==NULL && saved)
    {
    	pwd = getcwd(NULL,0);
    	rfile = R_do_slot(Rtaxonomy,mkString("dbname"));
    	file  = CHAR(asChar(rfile));
    	rdir  = R_do_slot(Rtaxonomy,mkString("workingdir"));
    	dir   = CHAR(asChar(rdir));
    	chdir(dir);
    	ptax = read_taxonomy(file,1);
    	R_SetExternalPtrAddr(pointer,(void*)ptax);
    	chdir(pwd);
    	free(pwd);
    }
    if (ptax==NULL && ! saved)
      error("The taxonomy instance is no more valid and must be rebuilt");
    return ptax;
 }
 SEXP R_delete_taxonomy(SEXP Rtaxonomy)
 {
 	ecotaxonomy_t *ptax;
 //	SEXP pointer;
    ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(Rtaxonomy);
    (void) delete_ecotaxonomy(ptax);
    // Clear the external pointer
    R_ClearExternalPtr(Rtaxonomy);
    return R_NilValue;
 }
 SEXP R_read_taxonomy(SEXP filename, SEXP altenative)
 {
 	int   alt;
 	const char* file;
 	SEXP  Rtax;
    if (! isString(filename))
        error("filename not character");
    file = CHAR(STRING_ELT(filename, 0));
    if (! isLogical(altenative))
        error("altenative not logical");
    alt = LOGICAL(altenative)[0];
    ecotaxonomy_t *taxonomy = read_taxonomy(file,alt);
    if (! taxonomy)
    	error("Cannot open the taxonomy database");
 	Rtax = PROTECT(R_MakeExternalPtr(taxonomy, mkString("ROBITools NCBI Taxonomy pointer"), R_NilValue));
 	R_RegisterCFinalizerEx(Rtax, (R_CFinalizer_t)R_delete_taxonomy,TRUE);
    UNPROTECT(1);
 	return Rtax;
 }
 SEXP R_get_scientific_name(SEXP Rtaxonomy,SEXP Rtaxid)
 {
 	ecotx_t *taxon;
 	ecotaxonomy_t *ptax;
 	int taxid;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
 		return ScalarString(R_NaString);
 		// error("unkown taxid");
 	return mkString(taxon->name);
 }
 SEXP R_get_rank(SEXP Rtaxonomy,SEXP Rtaxid)
 {
 	ecotx_t *taxon;
 	ecotaxonomy_t *ptax;
 	int *taxid;
  int ntaxid;
  int i;
  SEXP results;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    ntaxid = length(Rtaxid);
    results = PROTECT(allocVector(STRSXP, ntaxid));
    taxid = INTEGER(Rtaxid);
    for (i=0; i < ntaxid; i++)
    {
      if (taxid[i]== NA_INTEGER || taxid[i] <= 0)
        SET_STRING_ELT(results, i, R_NaString);
      else {
        taxon = eco_findtaxonbytaxid(ptax, taxid[i]);
        if (!taxon)
          SET_STRING_ELT(results, i, R_NaString);
        else
          SET_STRING_ELT(results, i, mkChar(ptax->ranks->label[taxon->rank]));
      }
    }
  UNPROTECT(1);
 	return results;
 }
 SEXP R_findtaxonatrank(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rrank, SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 	const char *rank;
 	int   rankidx;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isString(Rrank))
        error("rank not a string");
    rank=CHAR(STRING_ELT(Rrank,0));
    rankidx=rank_index(rank,ptax->ranks);
    if (rankidx < 0)
        error("unkown rank name");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = eco_findtaxonatrank(taxon,rankidx);
 	if (!rep)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_get_species(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = eco_getspecies(taxon,ptax);
 	if (!rep)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_get_genus(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = eco_getgenus(taxon,ptax);
 	if (!rep)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_get_family(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = eco_getfamily(taxon,ptax);
 	if (!rep)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_get_kingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = eco_getkingdom(taxon,ptax);
 	if (!rep)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_get_superkingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = eco_getsuperkingdom(taxon,ptax);
 	if (!rep)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_get_parent(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
 {
 	ecotx_t *taxon;
 	ecotx_t *rep;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
        error("taxid not positive");
    if (! isLogical(Rname))
        error("name not logical");
    name = LOGICAL(Rname)[0];
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	rep = taxon->parent;
 	if (rep->taxid==taxid)
  {
 		if (name)
 			return ScalarString(R_NaString);
 		else
 			return ScalarInteger(R_NaInt);
  }
 	if (name)
 		return mkString(rep->name);
 	else
 		return ScalarInteger(rep->taxid);
 }
 SEXP R_validate_taxid(SEXP Rtaxonomy,SEXP Rtaxid)
 {
 	ecotx_t *taxon;
 	ecotaxonomy_t *ptax;
 	int taxid;
 //	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (! (taxid > 0))
    	return ScalarInteger(R_NaInt);
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
 		return ScalarInteger(R_NaInt);
 	else
 		return ScalarInteger(taxon->taxid);
 }
 SEXP R_is_under_taxon(SEXP Rtaxonomy, SEXP Rtaxid, SEXP Rparent)
 {
 	ecotx_t *taxon;
 	ecotaxonomy_t *ptax;
 	int taxid;
 	int parent;
 	int rep;
 //	SEXP isunder;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rparent))
        error("parent not integer");
    parent = *INTEGER(Rparent);
    if (parent <= 0)
    	return ScalarInteger(R_NaInt);
 	taxon = eco_findtaxonbytaxid(ptax, parent);
 	if (!taxon)
 		return ScalarInteger(R_NaInt);
 	if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (taxid <= 0)
    	return ScalarInteger(R_NaInt);
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
 		return ScalarInteger(R_NaInt);
 	rep = eco_isundertaxon(taxon, parent);
 	return ScalarLogical(rep);
 }
 SEXP R_longest_path(SEXP Rtaxonomy,SEXP Rtaxid)
 {
 	ecotx_t *taxon;
 	ecotaxonomy_t *ptax;
 	int taxid;
 //	int name;
 //	SEXP scname;
    ptax = getTaxPointer(Rtaxonomy);
    if (! isInteger(Rtaxid))
        error("taxid not integer");
    taxid = *INTEGER(Rtaxid);
    if (taxid <= 0)
    	return ScalarInteger(R_NaInt);
 	taxon = eco_findtaxonbytaxid(ptax, taxid);
 	if (!taxon)
 		return ScalarInteger(R_NaInt);
 	else
 		return ScalarInteger(taxon->farest);
 }
 SEXP R_rank_list(SEXP Rtaxonomy)
 {
 	int nrank;
 	int i;
 	ecotaxonomy_t *ptax;
 	SEXP rNames;
    ptax = getTaxPointer(Rtaxonomy);
    nrank = ptax->ranks->count;
    rNames = PROTECT(allocVector(STRSXP, nrank));
 	for (i=0; i < nrank;i++)
 		SET_STRING_ELT(rNames, i, mkChar(ptax->ranks->label[i]));
 	UNPROTECT(1);
 	return rNames;
 }
 SEXP R_taxid_list(SEXP Rtaxonomy)
 {
 	int ntaxid;
 	int i;
 	ecotaxonomy_t *ptax;
 	SEXP rTaxids;
    ptax = getTaxPointer(Rtaxonomy);
    ntaxid  = ptax->taxons->count;
    rTaxids = PROTECT(allocVector(INTSXP, ntaxid));
 	for (i=0; i < ntaxid;i++)
 		INTEGER(rTaxids)[i]=ptax->taxons->taxon[i].taxid;
 	UNPROTECT(1);
 	return rTaxids;
 }
 SEXP R_max_taxid(SEXP Rtaxonomy)
 {
 //	int nrank;
 //	int i;
 	ecotaxonomy_t *ptax;
 //	SEXP rNames;
    ptax = getTaxPointer(Rtaxonomy);
    return ScalarInteger(ptax->taxons->maxtaxid);
 }
 SEXP R_length_taxonomy(SEXP Rtaxonomy)
 {
 	ecotaxonomy_t *ptax;
    ptax = getTaxPointer(Rtaxonomy);
    return ScalarInteger(ptax->taxons->count);
 }
 SEXP R_ecofind(SEXP Rtaxonomy, SEXP Rpattern, SEXP Rrank, SEXP Ralternative)
 {
  ecotaxonomy_t *ptax;
  econame_t		*name;
  char*     pattern=NULL;
 	int				re_match;
  SEXP      taxids;
  int32_t*  buffer;
  int32_t		tax_count	= 0;
  size_t 		j 		= 0;
  int32_t		rankfilter 	= 1;
  int*      ptaxid;
  char			*rankname=NULL;
  int32_t			nummatch 	= 0;
  int32_t         alternative = 0;
  size_t    bsize;
  ptax = getTaxPointer(Rtaxonomy);
  tax_count = ptax->taxons->count;
  if (! isString(Rpattern))
      error("pattern not a string");
  pattern= (char*) CHAR(STRING_ELT(Rpattern,0));
  if (! isNull(Rrank))
  {
    if (! isString(Rrank))
      error("rank not a string");
    rankname= (char*) CHAR(STRING_ELT(Rrank,0));
  }
  if (! isLogical(Ralternative))
      error("rank not a logical");
  alternative = LOGICAL(Ralternative)[0];
 	nummatch=0;
  buffer = (int32_t*) malloc(100 * sizeof(int32_t));
  bsize=100;
  if (alternative && ptax->names!=NULL)
 	  for (j=0,name=ptax->names->names;
 			  j < ptax->names->count;
 			  name++,j++)
 	  {
 		  if(rankname)
 			  rankfilter = !(strcmp(rankname,ptax->ranks->label[name->taxon->rank]));
  	  re_match = slre_match(pattern, name->name, 
                            strlen(name->name), 
                            NULL, 0, 
                            SLRE_IGNORE_CASE);
  	  if (re_match > 0 && rankfilter)
 		  {
 			  buffer[nummatch]=name->taxon->taxid;
 			  nummatch++;
 			  if (nummatch==bsize) {
 				  bsize*=2;
 				  buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t));
 				  if (buffer==0)
 				  {
 					  // regfree(&re_preg);
 					  error("Cannot allocate memory for the taxid list");
 				  }
 			  }
 		  }
 	  }
  else
 	  for (j=0; j < ptax->taxons->count;j++)
 	  {
 		  if(rankname)
 			  rankfilter = !(strcmp(rankname,ptax->ranks->label[ptax->taxons->taxon[j].rank]));
 //		  re_match = regexec (&re_preg, ptax->taxons->taxon[j].name, 0, NULL, 0);
      re_match = slre_match(pattern, ptax->taxons->taxon[j].name, 
                            strlen(ptax->taxons->taxon[j].name), 
                            NULL, 0, 
                            SLRE_IGNORE_CASE);
 //  	  if (!re_match && rankfilter)
  	  if (re_match > 0 && rankfilter)
 		  {
 			  buffer[nummatch]=ptax->taxons->taxon[j].taxid;
 			  nummatch++;
 			  if (nummatch==bsize) {
 				  bsize*=2;
 				  buffer = (int32_t*) realloc(buffer, bsize * sizeof(int32_t));
 				  if (buffer==0)
 				  {
 					  // regfree(&re_preg);
 					  error("Cannot allocate memory for the taxid list");
 				  }
 			  }
 		  }
 	  }
   	//regfree(&re_preg);
    taxids = PROTECT(NEW_INTEGER(nummatch));
    ptaxid = INTEGER(taxids);
    for (j=0; j < nummatch; j++)
      ptaxid[j]=buffer[j];
    free(buffer);
    UNPROTECT(1);
    return taxids;
 }
--- a/ROBITools/src/robitax.h
+++ b/ROBITools/src/robitax.h
@@ -0,0 +1,6 @@
 #include "ecoPCR.h"
 ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
 SEXP R_delete_taxonomy(SEXP Rtaxonomy);
--- a/ROBITools/src/robitax.o
+++ b/ROBITools/src/robitax.o
--- a/ROBITools/src/slre.c
+++ b/ROBITools/src/slre.c
@@ -0,0 +1,433 @@
 /*
 * Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
 * Copyright (c) 2013 Cesanta Software Limited
 * All rights reserved
 *
 * This library is dual-licensed: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation. For the terms of this
 * license, see <http://www.gnu.org/licenses/>.
 *
 * You are free to use this library under the terms of the GNU General
 * Public License, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * Alternatively, you can license this library under a commercial
 * license, as set out in <http://cesanta.com/products.html>.
 */
 #include <stdio.h>
 #include <ctype.h>
 #include <string.h>
 #include "slre.h"
 #define MAX_BRANCHES 100
 #define MAX_BRACKETS 100
 #define FAIL_IF(condition, error_code) if (condition) return (error_code)
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(ar) (sizeof(ar) / sizeof((ar)[0]))
 #endif
 #ifdef SLRE_DEBUG
 #define DBG(x) printf x
 #else
 #define DBG(x)
 #endif
 struct bracket_pair {
  const char *ptr;  /* Points to the first char after '(' in regex  */
  int len;          /* Length of the text between '(' and ')'       */
  int branches;     /* Index in the branches array for this pair    */
  int num_branches; /* Number of '|' in this bracket pair           */
 };
 struct branch {
  int bracket_index;    /* index for 'struct bracket_pair brackets' */
                        /* array defined below                      */
  const char *schlong;  /* points to the '|' character in the regex */
 };
 struct regex_info {
  /*
   * Describes all bracket pairs in the regular expression.
   * First entry is always present, and grabs the whole regex.
   */
  struct bracket_pair brackets[MAX_BRACKETS];
  int num_brackets;
  /*
   * Describes alternations ('|' operators) in the regular expression.
   * Each branch falls into a specific branch pair.
   */
  struct branch branches[MAX_BRANCHES];
  int num_branches;
  /* Array of captures provided by the user */
  struct slre_cap *caps;
  int num_caps;
  /* E.g. SLRE_IGNORE_CASE. See enum below */
  int flags;
 };
 static int is_metacharacter(const unsigned char *s) {
  static const char *metacharacters = "^$().[]*+?|\\Ssdbfnrtv";
  return strchr(metacharacters, *s) != NULL;
 }
 static int op_len(const char *re) {
  return re[0] == '\\' && re[1] == 'x' ? 4 : re[0] == '\\' ? 2 : 1;
 }
 static int set_len(const char *re, int re_len) {
  int len = 0;
  while (len < re_len && re[len] != ']') {
    len += op_len(re + len);
  }
  return len <= re_len ? len + 1 : -1;
 }
 static int get_op_len(const char *re, int re_len) {
  return re[0] == '[' ? set_len(re + 1, re_len - 1) + 1 : op_len(re);
 }
 static int is_quantifier(const char *re) {
  return re[0] == '*' || re[0] == '+' || re[0] == '?';
 }
 static int toi(int x) {
  return isdigit(x) ? x - '0' : x - 'W';
 }
 static int hextoi(const unsigned char *s) {
  return (toi(tolower(s[0])) << 4) | toi(tolower(s[1]));
 }
 static int match_op(const unsigned char *re, const unsigned char *s,
                    struct regex_info *info) {
  int result = 0;
  switch (*re) {
    case '\\':
      /* Metacharacters */
      switch (re[1]) {
        case 'S': FAIL_IF(isspace(*s), SLRE_NO_MATCH); result++; break;
        case 's': FAIL_IF(!isspace(*s), SLRE_NO_MATCH); result++; break;
        case 'd': FAIL_IF(!isdigit(*s), SLRE_NO_MATCH); result++; break;
        case 'b': FAIL_IF(*s != '\b', SLRE_NO_MATCH); result++; break;
        case 'f': FAIL_IF(*s != '\f', SLRE_NO_MATCH); result++; break;
        case 'n': FAIL_IF(*s != '\n', SLRE_NO_MATCH); result++; break;
        case 'r': FAIL_IF(*s != '\r', SLRE_NO_MATCH); result++; break;
        case 't': FAIL_IF(*s != '\t', SLRE_NO_MATCH); result++; break;
        case 'v': FAIL_IF(*s != '\v', SLRE_NO_MATCH); result++; break;
        case 'x':
          /* Match byte, \xHH where HH is hexadecimal byte representaion */
          FAIL_IF(hextoi(re + 2) != *s, SLRE_NO_MATCH);
          result++;
          break;
        default:
          /* Valid metacharacter check is done in bar() */
          FAIL_IF(re[1] != s[0], SLRE_NO_MATCH);
          result++;
          break;
      }
      break;
    case '|': FAIL_IF(1, SLRE_INTERNAL_ERROR); break;
    case '$': FAIL_IF(1, SLRE_NO_MATCH); break;
    case '.': result++; break;
    default:
      if (info->flags & SLRE_IGNORE_CASE) {
        FAIL_IF(tolower(*re) != tolower(*s), SLRE_NO_MATCH);
      } else {
        FAIL_IF(*re != *s, SLRE_NO_MATCH);
      }
      result++;
      break;
  }
  return result;
 }
 static int match_set(const char *re, int re_len, const char *s,
                     struct regex_info *info) {
  int len = 0, result = -1, invert = re[0] == '^';
  if (invert) re++, re_len--;
  while (len <= re_len && re[len] != ']' && result <= 0) {
    /* Support character range */
    if (re[len] != '-' && re[len + 1] == '-' && re[len + 2] != ']' &&
        re[len + 2] != '\0') {
      result = info->flags &&  SLRE_IGNORE_CASE ?
        *s >= re[len] && *s <= re[len + 2] :
        tolower(*s) >= tolower(re[len]) && tolower(*s) <= tolower(re[len + 2]);
      len += 3;
    } else {
      result = match_op((unsigned char *) re + len, (unsigned char *) s, info);
      len += op_len(re + len);
    }
  }
  return (!invert && result > 0) || (invert && result <= 0) ? 1 : -1;
 }
 static int doh(const char *s, int s_len, struct regex_info *info, int bi);
 static int bar(const char *re, int re_len, const char *s, int s_len,
               struct regex_info *info, int bi) {
  /* i is offset in re, j is offset in s, bi is brackets index */
  int i, j, n, step;
  for (i = j = 0; i < re_len && j <= s_len; i += step) {
    /* Handle quantifiers. Get the length of the chunk. */
    step = re[i] == '(' ? info->brackets[bi + 1].len + 2 :
      get_op_len(re + i, re_len - i);
    DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n", __func__,
         re_len - i, re + i, s_len - j, s + j, re_len, step, i, j));
    FAIL_IF(is_quantifier(&re[i]), SLRE_UNEXPECTED_QUANTIFIER);
    FAIL_IF(step <= 0, SLRE_INVALID_CHARACTER_SET);
    if (i + step < re_len && is_quantifier(re + i + step)) {
      DBG(("QUANTIFIER: [%.*s]%c [%.*s]\n", step, re + i,
           re[i + step], s_len - j, s + j));
      if (re[i + step] == '?') {
        int result = bar(re + i, step, s + j, s_len - j, info, bi);
        j += result > 0 ? result : 0;
        i++;
      } else if (re[i + step] == '+' || re[i + step] == '*') {
        int j2 = j, nj = j, n1, n2 = -1, ni, non_greedy = 0;
        /* Points to the regexp code after the quantifier */
        ni = i + step + 1;
        if (ni < re_len && re[ni] == '?') {
          non_greedy = 1;
          ni++;
        }
        do {
          if ((n1 = bar(re + i, step, s + j2, s_len - j2, info, bi)) > 0) {
            j2 += n1;
          }
          if (re[i + step] == '+' && n1 < 0) break;
          if (ni >= re_len) {
            /* After quantifier, there is nothing */
            nj = j2;
          } else if ((n2 = bar(re + ni, re_len - ni, s + j2,
                               s_len - j2, info, bi)) >= 0) {
            /* Regex after quantifier matched */
            nj = j2 + n2;
          }
          if (nj > j && non_greedy) break;
        } while (n1 > 0);
        if (n1 < 0 && re[i + step] == '*' &&
            (n2 = bar(re + ni, re_len - ni, s + j, s_len - j, info, bi)) > 0) {
          nj = j + n2;
        }
        DBG(("STAR/PLUS END: %d %d %d %d %d\n", j, nj, re_len - ni, n1, n2));
        FAIL_IF(re[i + step] == '+' && nj == j, SLRE_NO_MATCH);
        /* If while loop body above was not executed for the * quantifier,  */
        /* make sure the rest of the regex matches                          */
        FAIL_IF(nj == j && ni < re_len && n2 < 0, SLRE_NO_MATCH);
        /* Returning here cause we've matched the rest of RE already */
        return nj;
      }
      continue;
    }
    if (re[i] == '[') {
      n = match_set(re + i + 1, re_len - (i + 2), s + j, info);
      DBG(("SET %.*s [%.*s] -> %d\n", step, re + i, s_len - j, s + j, n));
      FAIL_IF(n <= 0, SLRE_NO_MATCH);
      j += n;
    } else if (re[i] == '(') {
      n = SLRE_NO_MATCH;
      bi++;
      FAIL_IF(bi >= info->num_brackets, SLRE_INTERNAL_ERROR);
      DBG(("CAPTURING [%.*s] [%.*s] [%s]\n",
           step, re + i, s_len - j, s + j, re + i + step));
      if (re_len - (i + step) <= 0) {
        /* Nothing follows brackets */
        n = doh(s + j, s_len - j, info, bi);
      } else {
        int j2;
        for (j2 = 0; j2 <= s_len - j; j2++) {
          if ((n = doh(s + j, s_len - (j + j2), info, bi)) >= 0 &&
              bar(re + i + step, re_len - (i + step),
                  s + j + n, s_len - (j + n), info, bi) >= 0) break;
        }
      }
      DBG(("CAPTURED [%.*s] [%.*s]:%d\n", step, re + i, s_len - j, s + j, n));
      FAIL_IF(n < 0, n);
      if (info->caps != NULL) {
        info->caps[bi - 1].ptr = s + j;
        info->caps[bi - 1].len = n;
      }
      j += n;
    } else if (re[i] == '^') {
      FAIL_IF(j != 0, SLRE_NO_MATCH);
    } else if (re[i] == '$') {
      FAIL_IF(j != s_len, SLRE_NO_MATCH);
    } else {
      FAIL_IF(j >= s_len, SLRE_NO_MATCH);
      n = match_op((unsigned char *) (re + i), (unsigned char *) (s + j), info);
      FAIL_IF(n <= 0, n);
      j += n;
    }
  }
  return j;
 }
 /* Process branch points */
 static int doh(const char *s, int s_len, struct regex_info *info, int bi) {
  const struct bracket_pair *b = &info->brackets[bi];
  int i = 0, len, result;
  const char *p;
  do {
    p = i == 0 ? b->ptr : info->branches[b->branches + i - 1].schlong + 1;
    len = b->num_branches == 0 ? b->len :
      i == b->num_branches ? (int) (b->ptr + b->len - p) :
      (int) (info->branches[b->branches + i].schlong - p);
    DBG(("%s %d %d [%.*s] [%.*s]\n", __func__, bi, i, len, p, s_len, s));
    result = bar(p, len, s, s_len, info, bi);
    DBG(("%s <- %d\n", __func__, result));
  } while (result <= 0 && i++ < b->num_branches);  /* At least 1 iteration */
  return result;
 }
 static int baz(const char *s, int s_len, struct regex_info *info) {
  int i, result = -1, is_anchored = info->brackets[0].ptr[0] == '^';
  for (i = 0; i <= s_len; i++) {
    result = doh(s + i, s_len - i, info, 0);
    if (result >= 0) {
      result += i;
      break;
    }
    if (is_anchored) break;
  }
  return result;
 }
 static void setup_branch_points(struct regex_info *info) {
  int i, j;
  struct branch tmp;
  /* First, sort branches. Must be stable, no qsort. Use bubble algo. */
  for (i = 0; i < info->num_branches; i++) {
    for (j = i + 1; j < info->num_branches; j++) {
      if (info->branches[i].bracket_index > info->branches[j].bracket_index) {
        tmp = info->branches[i];
        info->branches[i] = info->branches[j];
        info->branches[j] = tmp;
      }
    }
  }
  /*
   * For each bracket, set their branch points. This way, for every bracket
   * (i.e. every chunk of regex) we know all branch points before matching.
   */
  for (i = j = 0; i < info->num_brackets; i++) {
    info->brackets[i].num_branches = 0;
    info->brackets[i].branches = j;
    while (j < info->num_branches && info->branches[j].bracket_index == i) {
      info->brackets[i].num_branches++;
      j++;
    }
  }
 }
 static int foo(const char *re, int re_len, const char *s, int s_len,
               struct regex_info *info) {
  int i, step, depth = 0;
  /* First bracket captures everything */
  info->brackets[0].ptr = re;
  info->brackets[0].len = re_len;
  info->num_brackets = 1;
  /* Make a single pass over regex string, memorize brackets and branches */
  for (i = 0; i < re_len; i += step) {
    step = get_op_len(re + i, re_len - i);
    if (re[i] == '|') {
      FAIL_IF(info->num_branches >= (int) ARRAY_SIZE(info->branches),
              SLRE_TOO_MANY_BRANCHES);
      info->branches[info->num_branches].bracket_index =
        info->brackets[info->num_brackets - 1].len == -1 ?
        info->num_brackets - 1 : depth;
      info->branches[info->num_branches].schlong = &re[i];
      info->num_branches++;
    } else if (re[i] == '\\') {
      FAIL_IF(i >= re_len - 1, SLRE_INVALID_METACHARACTER);
      if (re[i + 1] == 'x') {
        /* Hex digit specification must follow */
        FAIL_IF(re[i + 1] == 'x' && i >= re_len - 3,
                SLRE_INVALID_METACHARACTER);
        FAIL_IF(re[i + 1] ==  'x' && !(isxdigit(re[i + 2]) &&
                isxdigit(re[i + 3])), SLRE_INVALID_METACHARACTER);
      } else {
        FAIL_IF(!is_metacharacter((unsigned char *) re + i + 1),
                SLRE_INVALID_METACHARACTER);
      }
    } else if (re[i] == '(') {
      FAIL_IF(info->num_brackets >= (int) ARRAY_SIZE(info->brackets),
              SLRE_TOO_MANY_BRACKETS);
      depth++;  /* Order is important here. Depth increments first. */
      info->brackets[info->num_brackets].ptr = re + i + 1;
      info->brackets[info->num_brackets].len = -1;
      info->num_brackets++;
      FAIL_IF(info->num_caps > 0 && info->num_brackets - 1 > info->num_caps,
              SLRE_CAPS_ARRAY_TOO_SMALL);
    } else if (re[i] == ')') {
      int ind = info->brackets[info->num_brackets - 1].len == -1 ?
        info->num_brackets - 1 : depth;
      info->brackets[ind].len = (int) (&re[i] - info->brackets[ind].ptr);
      DBG(("SETTING BRACKET %d [%.*s]\n",
           ind, info->brackets[ind].len, info->brackets[ind].ptr));
      depth--;
      FAIL_IF(depth < 0, SLRE_UNBALANCED_BRACKETS);
      FAIL_IF(i > 0 && re[i - 1] == '(', SLRE_NO_MATCH);
    }
  }
  FAIL_IF(depth != 0, SLRE_UNBALANCED_BRACKETS);
  setup_branch_points(info);
  return baz(s, s_len, info);
 }
 int slre_match(const char *regexp, const char *s, int s_len,
               struct slre_cap *caps, int num_caps, int flags) {
  struct regex_info info;
  /* Initialize info structure */
  info.flags = flags;
  info.num_brackets = info.num_branches = 0;
  info.num_caps = num_caps;
  info.caps = caps;
  DBG(("========================> [%s] [%.*s]\n", regexp, s_len, s));
  return foo(regexp, (int) strlen(regexp), s, s_len, &info);
 }
--- a/ROBITools/src/slre.h
+++ b/ROBITools/src/slre.h
@@ -0,0 +1,60 @@
 /*
 * Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
 * Copyright (c) 2013 Cesanta Software Limited
 * All rights reserved
 *
 * This library is dual-licensed: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation. For the terms of this
 * license, see <http://www.gnu.org/licenses/>.
 *
 * You are free to use this library under the terms of the GNU General
 * Public License, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * Alternatively, you can license this library under a commercial
 * license, as set out in <http://cesanta.com/products.html>.
 */
 /*
 * This is a regular expression library that implements a subset of Perl RE.
 * Please refer to README.md for a detailed reference.
 */
 #ifndef SLRE_HEADER_DEFINED
 #define SLRE_HEADER_DEFINED
 #ifdef __cplusplus
 extern "C" {
 #endif
 struct slre_cap {
  const char *ptr;
  int len;
 };
 int slre_match(const char *regexp, const char *buf, int buf_len,
               struct slre_cap *caps, int num_caps, int flags);
 /* Possible flags for slre_match() */
 enum { SLRE_IGNORE_CASE = 1 };
 /* slre_match() failure codes */
 #define SLRE_NO_MATCH               -1
 #define SLRE_UNEXPECTED_QUANTIFIER  -2
 #define SLRE_UNBALANCED_BRACKETS    -3
 #define SLRE_INTERNAL_ERROR         -4
 #define SLRE_INVALID_CHARACTER_SET  -5
 #define SLRE_INVALID_METACHARACTER  -6
 #define SLRE_CAPS_ARRAY_TOO_SMALL   -7
 #define SLRE_TOO_MANY_BRANCHES      -8
 #define SLRE_TOO_MANY_BRACKETS      -9
 #ifdef __cplusplus
 }
 #endif
 #endif  /* SLRE_HEADER_DEFINED */
--- a/ROBITools/src/slre.o
+++ b/ROBITools/src/slre.o
--- a/dev_notes.txt
+++ b/dev_notes.txt
@@ -0,0 +1,190 @@
 package.skeleton("ROBITools",c("robitools.motu.count","robitools.motus",
                               "robitools.reads","robitools.samples",
                               "robitools.sample.count",))
 #include <R.h>
 #include <Rinternals.h>
 static void cooked_goose(SEXP foo)
 {
    if (TYPEOF(foo) != EXTPTRSXP)
        error("argument not external pointer");
    double *x = (double *) R_ExternalPtrAddr(foo);
    int blather = x[0];
    Free(x);
    if (blather)
        printf("finalizer ran\n");
 }
 SEXP blob(SEXP nin, SEXP blatherin)
 {
    if (! isInteger(nin))
        error("n not integer");
    int n = INTEGER(nin)[0];
    if (! (n > 0))
        error("n not positive");
    if (! isLogical(blatherin))
        error("blather not logical");
    int blather = LOGICAL(blatherin)[0];
    double *x = Calloc(n + 2, double);
    GetRNGstate();
    for (int i = 0; i < n; ++i)
        x[i + 2] = norm_rand();
    PutRNGstate();
    x[1] = n;
    x[0] = blather;
    SEXP bar;
    PROTECT(bar = R_MakeExternalPtr(x, R_NilValue, R_NilValue));
    R_RegisterCFinalizer(bar, cooked_goose);
    UNPROTECT(1);
    return bar;
 }
 SEXP blub(SEXP foo)
 {
    if (TYPEOF(foo) != EXTPTRSXP)
        error("argument not external pointer");
    double *x = (double *) R_ExternalPtrAddr(foo);
    int blather = x[0];
    int n = x[1];
    SEXP bar;
    PROTECT(bar = allocVector(REALSXP, n));
    for (int i = 0; i < n; ++i)
        REAL(bar)[i] = x[i + 2];
    UNPROTECT(1);
    return bar;
 }
 blob <- function(n, blather = FALSE) {
    stopifnot(is.numeric(n))
    stopifnot(as.integer(n) == n)
    stopifnot(n > 0)
    stopifnot(is.logical(blather))
    .Call("blob", as.integer(n), blather)
 }
 blub <- function(x) {
    stopifnot(class(x) == "externalptr")
    .Call("blub", x)
 }
 Hi Robert,
 It looks like there is no way to explicitly make an S4 object call a
 function when it is garbage collected unless you resort to tricks with
 reg.finalizer.
 It turns out that Prof. Ripley's reply (thanks!!) had enough hints in it
 that I was able to get the effect I wanted by using R's external pointer
 facility. In fact it works quite nicely.
 In a nutshell, I create a C++ object (with new) and then wrap its pointer
 with an R external pointer using
 SEXP rExtPtr = R_MakeExternalPtr( cPtr, aTag, R_NilValue);
 Where cPtr is the C++/C pointer to the object and aTag is an R symbol
 describing the pointer type [e.g. SEXP aTag =
 install("this_is_a_tag_for_a_pointer_to_my_object")]. The final argument is
 "a value to protect". I don't know what this means, but all of the examples
 I saw use R_NilValue.
 If you want a C++ function to be called when R loses the reference to the
 external pointer (actually when R garbage collects it, or when R quits), do
 R_RegisterCFinalizerEx( rExtPtr, (R_CFinalizer_t)functionToBeCalled, TRUE );
 The TRUE means that R will call the "functionToBeCalled" if the pointer is
 still around when R quits. I guess if you set it to FALSE, then you are
 assuming that your shell can delete memory and/or release resources when R
 quits. 
 So return this external pointer to R (the function that new'ed it was called
 by .Call or something similar) and stick it in a slot of your object. Then
 when your object is garbage collected, "functionToBeCalled" will be called.
 The slot would have the type "externalptr".
 The functionToBeCalled contains the code to delete the C++ pointer or
 release resources, for example...
 SEXP functionToBeCalled( SEXP rExtPtr ) {
  // Get the C++ pointer
  MyThing* ptr = R_ExternalPtrAddr(rExtPtr);
  // Delete it
  delete ptr;
  // Clear the external pointer
  R_ClearExternalPtr(rExtPtr);
  return R_NilValue;
 }
 And there you have it.
 There doesn't seem to be any official documentation on this stuff (at least
 none that I could find). The best references I found are on the R developers
 web page. See the links within  "some notes on _references, external
 objects, or mutable state_ for R and a _simple implementation_ of external
 references and finalization". Note that the documents are slightly out of
 date (the function names have apparently been changed somewhat). The latter
 one has some examples that are very helpful. And as Prof. Ripley pointed
 out, RODBC uses this facility too, so look at that code.
 Hope this was useful. Good luck.
 SEXP
 get(SEXP ext)
 {
    return mkString((char *) R_ExternalPtrAddr(ext));
 }
 SEXP
 set(SEXP ext, SEXP str)
 {
    char *x = (char *) R_ExternalPtrAddr(ext);
    snprintf(x, N_MAX, CHAR(STRING_ELT(str, 0)));
    return ScalarLogical(TRUE);
 }
 > dyn.load("tmp.so")
 > x <- .Call("create", list("info could be any R object", 1:5))
 > .Call("get", x)
 [1] "my name is joe"
 > ## reference semantics!
 > .Call("set", x, "i am sam i am")
 [1] TRUE
 > .Call("get", x)
 [1] "i am sam i am"
 > x <- NULL
 > gc()
 finalizing
         used (Mb) gc trigger (Mb) max used (Mb)
 Ncells 339306 18.2     467875   25   407500 21.8
 Vcells 202064  1.6     786432    6   380515  3.0
 SEXP
 incr(SEXP ext)
 {
    struct Foo *foo = (struct Foo*) R_ExternalPtrAddr(ext);
    foo->x += 1;
    return ScalarInteger(foo->x);
 }
 library(ROBITools)
 library.dynam('ROBITools.so')
 t=.Call('R_read_taxonomy','ecochange',TRUE)
 .Call('R_get_scientific_name',t,as.integer(7742))