Initial commit of the project
This commit is contained in:
144
RawData/prepare_data.R
Normal file
144
RawData/prepare_data.R
Normal file
@@ -0,0 +1,144 @@
|
||||
positive = read.delim("RawData/positifs.uniq.annotated.txt",
|
||||
sep="\t",
|
||||
header = TRUE)
|
||||
|
||||
|
||||
columns = names(positive)
|
||||
columns.info = c("id", "dilution", "species_name", "taxid", "true", "sequence")
|
||||
columns.counts= columns[grep("^sample\\.",columns)]
|
||||
|
||||
positive.count = t(positive[,columns.counts])
|
||||
|
||||
motus = as.data.frame(positive[,columns.info])
|
||||
positive.motus = data.frame(dilution = as.numeric(motus$dilution)/2,
|
||||
species = as.character(motus$species_name),
|
||||
taxid = as.integer(motus$taxid),
|
||||
true = motus$true == "True"
|
||||
)
|
||||
|
||||
|
||||
samples.names = rownames(positive.count)
|
||||
|
||||
samples = t(simplify2array(strsplit(samples.names,split="_")))
|
||||
|
||||
# [,1] [,2] [,3] [,4] [,5] [,6]
|
||||
# [1,] "sample.TM" "POS" "d16" "1" "a" "A1"
|
||||
# [2,] "sample.TM" "POS" "d16" "1" "a" "B1"
|
||||
# [3,] "sample.TM" "POS" "d16" "1" "b" "A2"
|
||||
# [4,] "sample.TM" "POS" "d16" "1" "b" "B2"
|
||||
# [5,] "sample.TM" "POS" "d16" "2" "a" "A1"
|
||||
# [6,] "sample.TM" "POS" "d16" "2" "a" "B1"
|
||||
|
||||
samples = as.data.frame(samples[,3:6])
|
||||
names(samples) = c("dilution","repeats","PCR","Plate")
|
||||
|
||||
positive.samples = data.frame(dilution = 32%/%as.integer(substr(as.character(samples$dilution),2,10)),
|
||||
repeats = interaction(samples[,2:4],drop = TRUE)
|
||||
)
|
||||
|
||||
rownames(positive.samples)= samples.names
|
||||
rownames(positive.count) = samples.names
|
||||
|
||||
rownames(positive.motus) = positive$id
|
||||
colnames(positive.count) = positive$id
|
||||
|
||||
plants.16 = positive.motus[positive.motus$true,][,c(2,3,1)]
|
||||
|
||||
plants.16 = plants.16[order(1/plants.16$dilution),]
|
||||
plants.16$log10.dilution = - seq_len(nrow(plants.16)) / log(10)*log(2)
|
||||
plants.16$dilution = 1/(2^seq_len(nrow(plants.16)))
|
||||
usethis::use_data(positive.samples,overwrite = TRUE)
|
||||
usethis::use_data(positive.motus,overwrite = TRUE)
|
||||
usethis::use_data(positive.count,overwrite = TRUE)
|
||||
usethis::use_data(plants.16,overwrite = TRUE)
|
||||
|
||||
positive.clean = read.delim("RawData/positifs.uniq.annotated.clean.txt",
|
||||
sep="\t",
|
||||
header = TRUE)
|
||||
|
||||
columns = names(positive.clean)
|
||||
columns.info = c("id", "dilution", "species_name", "taxid", "true", "sequence")
|
||||
columns.counts= columns[grep("^sample\\.",columns)]
|
||||
|
||||
positive.clean.count = t(positive.clean[,columns.counts])
|
||||
|
||||
motus.clean = as.data.frame(positive.clean[,columns.info])
|
||||
positive.clean.motus = data.frame(dilution = as.numeric(motus.clean$dilution)/2,
|
||||
species = as.character(motus.clean$species_name),
|
||||
taxid = as.integer(motus.clean$taxid),
|
||||
true = motus.clean$true == "True"
|
||||
)
|
||||
|
||||
samples.names = rownames(positive.clean.count)
|
||||
|
||||
samples = t(simplify2array(strsplit(samples.names,split="_")))
|
||||
|
||||
samples = as.data.frame(samples[,3:6])
|
||||
names(samples) = c("dilution","repeats","PCR","Plate")
|
||||
|
||||
positive.clean.samples = data.frame(dilution = 32%/%as.integer(substr(as.character(samples$dilution),2,10)),
|
||||
repeats = interaction(samples[,2:4],drop = TRUE)
|
||||
)
|
||||
|
||||
rownames(positive.clean.samples)= samples.names
|
||||
rownames(positive.clean.count) = samples.names
|
||||
|
||||
rownames(positive.clean.motus) = positive.clean$id
|
||||
colnames(positive.clean.count) = positive.clean$id
|
||||
|
||||
usethis::use_data(positive.clean.samples,overwrite = TRUE)
|
||||
usethis::use_data(positive.clean.motus,overwrite = TRUE)
|
||||
usethis::use_data(positive.clean.count,overwrite = TRUE)
|
||||
|
||||
|
||||
#
|
||||
# Litter/Soil dataset
|
||||
#
|
||||
|
||||
|
||||
guiana = read.delim("RawData/litiere_ins_cl97_agg_filt_tax.tab",
|
||||
header = TRUE,
|
||||
sep="\t")
|
||||
|
||||
columns = names(guiana)
|
||||
|
||||
columns.info = c("id","best_identity.order_filtered_embl_r136_noenv_INS",
|
||||
"taxid",
|
||||
"phylum_name","order_name","class_name","family_name","genus_name","species_name",
|
||||
"sequence")
|
||||
|
||||
columns.counts= columns[grep("^sample\\.",columns)]
|
||||
|
||||
samples.names = gsub(pattern = "sample.",
|
||||
replacement = "",
|
||||
columns.counts)
|
||||
|
||||
guiana.count = t(guiana[,columns.counts])
|
||||
|
||||
motus = as.data.frame(guiana[,columns.info])
|
||||
guiana.motus = data.frame(id = paste("EUK",sprintf("%06d",1:nrow(motus)),sep=""),
|
||||
best_id = motus$best_identity.order_filtered_embl_r136_noenv_INS,
|
||||
taxid = as.integer(motus$taxid),
|
||||
species = factor(as.character(motus$species_name)),
|
||||
genus = factor(as.character(motus$genus_name)),
|
||||
family = factor(as.character(motus$family_name)),
|
||||
class = factor(as.character(motus$class_name)),
|
||||
order = factor(as.character(motus$order_name)),
|
||||
phylum = factor(as.character(motus$phylum_name)),
|
||||
sequence = as.character(motus$sequence),
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
|
||||
samples = read.delim("RawData/Litiere_sample_list.txt",header=TRUE)
|
||||
|
||||
guiana.samples = samples[samples.names,]
|
||||
guiana.samples$sample = as.factor(sub("_r.$","",samples.names))
|
||||
|
||||
|
||||
rownames(guiana.count) = samples.names
|
||||
colnames(guiana.count) = guiana.motus$id
|
||||
rownames(guiana.motus) = guiana.motus$id
|
||||
|
||||
usethis::use_data(guiana.samples,overwrite = TRUE)
|
||||
usethis::use_data(guiana.motus,overwrite = TRUE)
|
||||
usethis::use_data(guiana.count,overwrite = TRUE)
|
||||
Reference in New Issue
Block a user