# ROBITools/R/import.metabarcoding.R

#' @include read.obitab.R
#' @include 02_class_metabarcoding.data.R
NULL

#' Read a data file produced by the \code{obitab} command
#'
#' Read a data file issued from the conversion of a \strong{fasta}
#' file to a tabular file by the \code{obitab} command of the
#' \strong{OBITools} package
#'
#' The columns whose name matches the sample pattern are used as the
#' read counts (one column per sample). All the other columns are used
#' as the description of the MOTUs. The \code{id} column of the file
#' is used to name the MOTUs.
#'
#' An error is raised when no column name matches the sample pattern.
#'
#' @param file a string containing the file name of the obitab file.
#' @param sep  Column separator in the obitab file.
#'             The default separator is the tabulation.
#' @param sample A regular expression allowing to identify columns
#'               from the file describing abundances of sequences per sample
#' @param sample.sep Separator between combined sample name.
#'               It is inserted as is in a regular expression.
#' @param attribute Separator used to split between sample 'tag' and sample name.
#'               It is used as a regular expression to split the column names.
#'
#' @return a \code{\link{metabarcoding.data}} instance
#'
#' @examples
#' require(ROBITools)
#'
#' \dontshow{# switch the working directory to the data package directory}
#' \dontshow{setwd(system.file("extdata", package="ROBITools"))}
#'
#' # read the termes.tab file
#' termes=import.metabarcoding.data('termes.tab')
#'
#' # print the number of samples and motus described in the file
#' dim(termes)
#'
#' @seealso \code{\link{metabarcoding.data}}
#'
#' @author Eric Coissac
#' @keywords DNA metabarcoding
#' @export
#'
import.metabarcoding.data <- function(file,sep='\t',sample="sample",sample.sep="\\.",attribute=":") {

	data <- read.obitab(file,sep=sep)

	# get the colnames matching the sample pattern:
	# the sample tag, at the beginning of the name or after a sample
	# separator, followed by a sample separator or by the attribute separator

	column <- colnames(data)
	pat    <- paste0('(^|',sample.sep,')',sample,'[',sample.sep,attribute,']')
	scol   <- grep(pat,column)

	# without any sample column nothing can be built, fail with an
	# explicit message instead of a later out of bounds subscript
	if (length(scol) == 0L) {
		stop("no column name matches the sample pattern '",pat,"'",
		     call.=FALSE)
	}

	# reads informations about samples
	# drop=FALSE keeps a data.frame even when a single column is selected

	reads = data[,scol,drop=FALSE]
	parts = strsplit(colnames(reads),split=attribute)

	# for sample name just remove the first part of the col names
	# usually "sample:"

	sample.names <- vapply(parts,
	                       function(a) paste(a[-1],collapse=attribute),
	                       character(1))

	# one row per sample, one column per motu
	reads <- t(reads)
	rownames(reads) <- sample.names

	# sample's data
	# one row per sample, one column per part of the sample name;
	# columns are labelled from the tag of the first sample column

	sample.data <- data.frame(t(data.frame(strsplit(sample.names,split=attribute))))
	rownames(sample.data) <- sample.names
	colnames(sample.data) <- strsplit(parts[[1]][1],split=attribute)


	# motus information
	# every column that is not a sample column describes the motus

	motus <- data[,-scol,drop=FALSE]

	motus.id <- motus$id

	rownames(motus) <- motus.id
	colnames(reads) <- motus.id


	metabarcoding.data(reads,sample.data,motus)

}


#pcr = gh[,grep('^sample',colnames(gh))]
#pcr.names = colnames(pcr)
#pcr.names = sub('sample\\.','',pcr.names)
#sequencer = rep('Solexa',length(pcr.names))
#sequencer[grep('454',pcr.names)]='454'
#sequencer=factor(sequencer)
#
#tmp = strsplit(pcr.names,'\\.[A-Z](sol|454)\\.')
#
#sample = sapply(tmp,function(x) x[1])
#locality = factor(sapply(strsplit(sample,'_'),function(x) x[1]))
#sample = factor(sample)
#repeats= factor(sapply(tmp,function(x) x[2]))
#
#tmp = regexpr('[A-Z](454|sol)',pcr.names)
#run=factor(substr(pcr.names,tmp,tmp+attr(tmp,"match.length")-1))
#
#pcr.metadata = data.frame(run,sequencer,locality,sample,repeats)
#
#rownames(pcr.metadata)=pcr.names