Files
biodiversity-metrics/RawData/prepare_data.R

145 lines
5.5 KiB
R

# Positive-control dataset (raw) ----
#
# Reads the annotated MOTU table, splits it into a samples x MOTUs count
# matrix (`positive.count`), a per-MOTU description (`positive.motus`) and a
# per-sample description (`positive.samples`), derives `plants.16` from the
# MOTUs flagged as true, and stores the four objects with
# usethis::use_data().
positive <- read.delim(
  "RawData/positifs.uniq.annotated.txt",
  sep = "\t",
  header = TRUE
)

columns <- names(positive)
columns.info <- c("id", "dilution", "species_name", "taxid", "true", "sequence")

# Read-count columns are the ones whose name starts with "sample.".
columns.counts <- grep("^sample\\.", columns, value = TRUE)

# Transpose so that rows are samples and columns are MOTUs. drop = FALSE keeps
# a data frame (and therefore the sample name) even with one count column.
positive.count <- t(positive[, columns.counts, drop = FALSE])

motus <- as.data.frame(positive[, columns.info])
positive.motus <- data.frame(
  dilution = as.numeric(motus$dilution) / 2,
  species = as.character(motus$species_name),
  taxid = as.integer(motus$taxid),
  # read.delim() converts a column that only holds True/False into a logical
  # vector, in which case a comparison against "True" is FALSE everywhere
  # (TRUE is coerced to "TRUE"). Comparing the upper-cased character form
  # gives the intended flag for both the logical and the text representation.
  true = toupper(as.character(motus$true)) == "TRUE"
)

# Sample names encode the experimental design as "_"-separated fields.
samples.names <- rownames(positive.count)
samples <- t(simplify2array(strsplit(samples.names, split = "_")))
# Example of the split matrix:
#      [,1]        [,2]  [,3]  [,4] [,5] [,6]
# [1,] "sample.TM" "POS" "d16" "1"  "a"  "A1"
# [2,] "sample.TM" "POS" "d16" "1"  "a"  "B1"
# [3,] "sample.TM" "POS" "d16" "1"  "b"  "A2"
# [4,] "sample.TM" "POS" "d16" "1"  "b"  "B2"
# [5,] "sample.TM" "POS" "d16" "2"  "a"  "A1"
# [6,] "sample.TM" "POS" "d16" "2"  "a"  "B1"

# Keep fields 3 to 6; drop = FALSE keeps the matrix shape for a single sample.
samples <- as.data.frame(samples[, 3:6, drop = FALSE])
names(samples) <- c("dilution", "repeats", "PCR", "Plate")

positive.samples <- data.frame(
  # The dilution field looks like "d16": strip the leading letter, then take
  # the integer quotient of 32 by that number.
  dilution = 32 %/% as.integer(substr(as.character(samples$dilution), 2, 10)),
  # One factor level per observed (repeats, PCR, Plate) combination.
  repeats = interaction(samples[, 2:4], drop = TRUE)
)

# Samples are labelled by their column name, MOTUs by their id.
rownames(positive.samples) <- samples.names
rownames(positive.count) <- samples.names
rownames(positive.motus) <- positive$id
colnames(positive.count) <- positive$id

# plants.16: the MOTUs flagged as true, as (species, taxid, dilution), sorted
# by increasing 1/dilution. The dilution is then overwritten by a strict
# two-fold series 1/2, 1/4, ... following that order, together with its log10.
plants.16 <- positive.motus[
  positive.motus$true,
  c("species", "taxid", "dilution")
]
plants.16 <- plants.16[order(1 / plants.16$dilution), ]
plants.16$log10.dilution <- -seq_len(nrow(plants.16)) / log(10) * log(2)
plants.16$dilution <- 1 / (2^seq_len(nrow(plants.16)))

usethis::use_data(positive.samples, overwrite = TRUE)
usethis::use_data(positive.motus, overwrite = TRUE)
usethis::use_data(positive.count, overwrite = TRUE)
usethis::use_data(plants.16, overwrite = TRUE)
# Positive-control dataset (cleaned) ----
#
# Same processing as for the raw positive-control table, applied to the
# cleaned file: builds a samples x MOTUs count matrix
# (`positive.clean.count`), a per-MOTU description (`positive.clean.motus`)
# and a per-sample description (`positive.clean.samples`), then stores them
# with usethis::use_data().
positive.clean <- read.delim(
  "RawData/positifs.uniq.annotated.clean.txt",
  sep = "\t",
  header = TRUE
)

columns <- names(positive.clean)
columns.info <- c("id", "dilution", "species_name", "taxid", "true", "sequence")

# Read-count columns are the ones whose name starts with "sample.".
columns.counts <- grep("^sample\\.", columns, value = TRUE)

# Transpose so that rows are samples and columns are MOTUs. drop = FALSE keeps
# a data frame (and therefore the sample name) even with one count column.
positive.clean.count <- t(positive.clean[, columns.counts, drop = FALSE])

motus.clean <- as.data.frame(positive.clean[, columns.info])
positive.clean.motus <- data.frame(
  dilution = as.numeric(motus.clean$dilution) / 2,
  species = as.character(motus.clean$species_name),
  taxid = as.integer(motus.clean$taxid),
  # read.delim() converts a column that only holds True/False into a logical
  # vector, in which case a comparison against "True" is FALSE everywhere
  # (TRUE is coerced to "TRUE"). Comparing the upper-cased character form
  # gives the intended flag for both the logical and the text representation.
  true = toupper(as.character(motus.clean$true)) == "TRUE"
)

# Sample names encode the experimental design as "_"-separated fields;
# fields 3 to 6 are kept as dilution, repeats, PCR and Plate.
samples.names <- rownames(positive.clean.count)
samples <- t(simplify2array(strsplit(samples.names, split = "_")))
# drop = FALSE keeps the matrix shape when there is a single sample.
samples <- as.data.frame(samples[, 3:6, drop = FALSE])
names(samples) <- c("dilution", "repeats", "PCR", "Plate")

positive.clean.samples <- data.frame(
  # The dilution field is a letter followed by a number: strip the leading
  # letter, then take the integer quotient of 32 by that number.
  dilution = 32 %/% as.integer(substr(as.character(samples$dilution), 2, 10)),
  # One factor level per observed (repeats, PCR, Plate) combination.
  repeats = interaction(samples[, 2:4], drop = TRUE)
)

# Samples are labelled by their column name, MOTUs by their id.
rownames(positive.clean.samples) <- samples.names
rownames(positive.clean.count) <- samples.names
rownames(positive.clean.motus) <- positive.clean$id
colnames(positive.clean.count) <- positive.clean$id

usethis::use_data(positive.clean.samples, overwrite = TRUE)
usethis::use_data(positive.clean.motus, overwrite = TRUE)
usethis::use_data(positive.clean.count, overwrite = TRUE)
#
# Litter/Soil dataset
#
# Reads the litter/soil MOTU table and its sample sheet, then builds a
# samples x MOTUs count matrix (`guiana.count`), a per-MOTU taxonomic
# description (`guiana.motus`) and a per-sample description
# (`guiana.samples`), and stores them with usethis::use_data().
guiana <- read.delim(
  "RawData/litiere_ins_cl97_agg_filt_tax.tab",
  header = TRUE,
  sep = "\t"
)

columns <- names(guiana)
columns.info <- c(
  "id", "best_identity.order_filtered_embl_r136_noenv_INS",
  "taxid",
  "phylum_name", "order_name", "class_name", "family_name", "genus_name",
  "species_name",
  "sequence"
)

# Read-count columns are the ones whose name starts with "sample.".
columns.counts <- grep("^sample\\.", columns, value = TRUE)

# Strip only the leading "sample." prefix. The pattern is anchored and the dot
# escaped so that "sample" followed by any character elsewhere in a name is
# left untouched.
samples.names <- sub("^sample\\.", "", columns.counts)

# Transpose so that rows are samples and columns are MOTUs. drop = FALSE keeps
# a data frame even when there is a single count column.
guiana.count <- t(guiana[, columns.counts, drop = FALSE])

motus <- as.data.frame(guiana[, columns.info])
guiana.motus <- data.frame(
  # Sequential identifiers EUK000001, EUK000002, ...; seq_len() yields an
  # empty vector (instead of c(1, 0)) when the table has no rows.
  id = sprintf("EUK%06d", seq_len(nrow(motus))),
  best_id = motus$best_identity.order_filtered_embl_r136_noenv_INS,
  taxid = as.integer(motus$taxid),
  species = factor(as.character(motus$species_name)),
  genus = factor(as.character(motus$genus_name)),
  family = factor(as.character(motus$family_name)),
  class = factor(as.character(motus$class_name)),
  order = factor(as.character(motus$order_name)),
  phylum = factor(as.character(motus$phylum_name)),
  sequence = as.character(motus$sequence),
  stringsAsFactors = FALSE
)

samples <- read.delim("RawData/Litiere_sample_list.txt", header = TRUE)
# NOTE(review): rows are selected by row name, so this presumably relies on
# read.delim() having used the sample names as row names - confirm against
# the layout of the sample list file.
guiana.samples <- samples[samples.names, ]
# Sample identifier: the name without its trailing "_r" + one character.
guiana.samples$sample <- as.factor(sub("_r.$", "", samples.names))

# Samples are labelled by their stripped column name, MOTUs by the new ids.
rownames(guiana.count) <- samples.names
colnames(guiana.count) <- guiana.motus$id
rownames(guiana.motus) <- guiana.motus$id

usethis::use_data(guiana.samples, overwrite = TRUE)
usethis::use_data(guiana.motus, overwrite = TRUE)
usethis::use_data(guiana.count, overwrite = TRUE)