4 Commits

200 changed files with 7894 additions and 2651 deletions

Lecture.html: new file (2,125 lines); diff suppressed because it is too large.


@@ -1,16 +1,16 @@
--- ---
title: "Biodiversity metrics \ and metabarcoding" title: "Biodiversity metrics \ and metabarcoding"
author: "Eric Coissac" author: "Eric Coissac"
date: "28/01/2019" date: "02/02/2024"
bibliography: inst/REFERENCES.bib bibliography: inst/REFERENCES.bib
output: format:
ioslides_presentation: revealjs:
widescreen: true
smaller: true smaller: true
css: slides.css transition: slide
mathjax: local scrollable: true
self_contained: false theme: simple
slidy_presentation: default html-math-method: mathjax
editor: visual
--- ---
```{r setup, include=FALSE} ```{r setup, include=FALSE}
@@ -18,15 +18,16 @@ library(knitr)
library(tidyverse) library(tidyverse)
library(kableExtra) library(kableExtra)
library(latex2exp) library(latex2exp)
library(MetabarSchool)
opts_chunk$set(echo = FALSE, opts_chunk$set(echo = FALSE,
cache = TRUE, cache = TRUE,
cache.lazy = FALSE) cache.lazy = FALSE)
``` ```
# Summary # Summary
- The MetabarSchool Package
- What do the reading numbers per PCR mean? - What do the reading numbers per PCR mean?
- Rarefaction vs. relative frequencies - Rarefaction vs. relative frequencies
- alpha diversity metrics - alpha diversity metrics
@@ -34,6 +35,28 @@ opts_chunk$set(echo = FALSE,
- multidimensional analysis - multidimensional analysis
- comparison between datasets - comparison between datasets
# The MetabarSchool Package
## Installing the package
You need the *devtools* package
```{r eval=FALSE, echo=TRUE}
install.packages("devtools",dependencies = TRUE)
```
Then you can install *MetabarSchool*
```{r eval=FALSE, echo=TRUE}
devtools::install_git("https://git.metabarcoding.org/MetabarcodingSchool/biodiversity-metrics.git")
```
You will also need the *vegan* package
```{r eval=FALSE, echo=TRUE}
install.packages("vegan",dependencies = TRUE)
```
# The dataset # The dataset
## The mock community {.flexbox .vcenter .smaller} ## The mock community {.flexbox .vcenter .smaller}
@@ -59,22 +82,21 @@ data("positive.samples")
- `r nrow(positive.samples)` PCR of the mock community using SPER02 trnL-P6-Loop primers - `r nrow(positive.samples)` PCR of the mock community using SPER02 trnL-P6-Loop primers
- `r length(table(positive.samples$dilution))` dilutions of the mock - `r length(table(positive.samples$dilution))` dilutions of the mock community: `r paste0('1/',names(table(positive.samples$dilution)))`
community: `r paste0('1/',names(table(positive.samples$dilution)))`
- `r as.numeric(table(positive.samples$dilution)[1])` repeats per dilution - `r as.numeric(table(positive.samples$dilution)[1])` repeats per dilution
## Loading data ## Loading data
```{r echo=TRUE} ```{r echo=TRUE}
library(MetabarSchool)
data("positive.count") data("positive.count")
data("positive.samples") data("positive.samples")
data("positive.motus") data("positive.motus")
``` ```
- `positive.count` read count matrix - `positive.count` read count matrix $`r nrow(positive.count)` \; PCRs \; \times \; `r ncol(positive.count)` \; MOTUs$
$`r nrow(positive.count)` \; PCRs \; \times \; `r ncol(positive.count)` \; MOTUs$
```{r} ```{r}
knitr::kable(positive.count[1:5,1:5], knitr::kable(positive.count[1:5,1:5],
@@ -85,22 +107,22 @@ knitr::kable(positive.count[1:5,1:5],
``` ```
<br> <br>
```{r echo=TRUE,eval=FALSE} ```{r echo=TRUE,eval=FALSE}
positive.count[1:5,1:5] positive.count[1:5,1:5]
``` ```
## Loading data ## Loading data
```{r echo=TRUE} ```{r echo=TRUE}
library(MetabarSchool)
data("positive.count") data("positive.count")
data("positive.samples") data("positive.samples")
data("positive.motus") data("positive.motus")
``` ```
- `positive.samples` a `r nrow(positive.samples)` rows `data.frame` of - `positive.samples` a `r nrow(positive.samples)` rows `data.frame` of `r ncol(positive.samples)` columns describing each PCR
`r ncol(positive.samples)` columns describing each PCR
```{r} ```{r}
knitr::kable(head(positive.samples,n=3), knitr::kable(head(positive.samples,n=3),
@@ -110,21 +132,22 @@ knitr::kable(head(positive.samples,n=3),
``` ```
<br> <br>
```{r echo=TRUE,eval=FALSE} ```{r echo=TRUE,eval=FALSE}
head(positive.samples,n=3) head(positive.samples,n=3)
``` ```
## Loading data ## Loading data
```{r echo=TRUE} ```{r echo=TRUE}
library(MetabarSchool)
data("positive.count") data("positive.count")
data("positive.samples") data("positive.samples")
data("positive.motus") data("positive.motus")
``` ```
- `positive.motus` : a `r nrow(positive.motus)` rows `data.frame` of - `positive.motus` : a `r nrow(positive.motus)` rows `data.frame` of `r ncol(positive.motus)` columns describing each MOTU
`r ncol(positive.motus)` columns describing each MOTU
```{r} ```{r}
knitr::kable(head(positive.motus,n=3), knitr::kable(head(positive.motus,n=3),
@@ -134,6 +157,7 @@ knitr::kable(head(positive.motus,n=3),
``` ```
<br> <br>
```{r echo=TRUE,eval=FALSE} ```{r echo=TRUE,eval=FALSE}
head(positive.motus,n=3) head(positive.motus,n=3)
``` ```
@@ -146,7 +170,6 @@ Singleton sequences are observed only once over the complete dataset.
table(colSums(positive.count) == 1) table(colSums(positive.count) == 1)
``` ```
```{r} ```{r}
kable(t(table(colSums(positive.count) == 1)), kable(t(table(colSums(positive.count) == 1)),
format = "html") %>% format = "html") %>%
@@ -164,11 +187,9 @@ positive.count = positive.count[,are.not.singleton]
positive.motus = positive.motus[are.not.singleton,] positive.motus = positive.motus[are.not.singleton,]
``` ```
- `positive.count` is now a - `positive.count` is now a $`r nrow(positive.count)` \; PCRs \; \times \; `r ncol(positive.count)` \; MOTUs$ matrix
$`r nrow(positive.count)` \; PCRs \; \times \; `r ncol(positive.count)` \; MOTUs$
matrix
## Not all the PCR have the number of reads {.flexbox .vcenter} ## Not all the PCRs have the same number of reads {.flexbox .vcenter}
Despite all standardization efforts Despite all standardization efforts
@@ -180,9 +201,9 @@ hist(rowSums(positive.count),
main = "Number of read per PCR") main = "Number of read per PCR")
``` ```
<div class="green"> ::: green
Is it related to the amount of DNA in the extract ? Is it related to the amount of DNA in the extract ?
</div> :::
## What do the reading numbers per PCR mean? {.smaller} ## What do the reading numbers per PCR mean? {.smaller}
@@ -192,17 +213,13 @@ boxplot(rowSums(positive.count) ~ positive.samples$dilution,log="y")
abline(h = median(rowSums(positive.count)),lw=2,col="red",lty=2) abline(h = median(rowSums(positive.count)),lw=2,col="red",lty=2)
``` ```
```{r} ```{r}
SC = summary(aov((rowSums(positive.count)) ~ positive.samples$dilution))[[1]]$`Sum Sq` SC = summary(aov((rowSums(positive.count)) ~ positive.samples$dilution))[[1]]$`Sum Sq`
``` ```
<div class="red2"> ::: red2
<center> <center>Only `r round((SC/sum(SC)*100)[1],1)`% of the PCR read count variation is explained by dilution</center>
Only `r round((SC/sum(SC)*100)[1],1)`% of the PCR read count :::
variation is explain by dilution
</center>
</div>
## You must normalize your read counts ## You must normalize your read counts
@@ -212,7 +229,6 @@ Two options:
Randomly subsample the same number of reads for all the PCRs Randomly subsample the same number of reads for all the PCRs
### Relative frequencies ### Relative frequencies
Divide the read count of each MOTU in each sample by the total read count of the same sample Divide the read count of each MOTU in each sample by the total read count of the same sample
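As an illustration of the two normalization options, here is a minimal R sketch, assuming `positive.count` is loaded from the MetabarSchool package and that *vegan* is installed; the rarefaction depth is an arbitrary choice:

```r
# Sketch of both normalizations (assumes positive.count as loaded above)
library(vegan)

# Option 1 - rarefaction: subsample every PCR down to the same read depth
depth <- min(rowSums(positive.count))        # or a fixed value such as 2000
count.rarefied <- rrarefy(positive.count, depth)

# Option 2 - relative frequencies: divide each row by its total read count
count.relfreq <- decostand(positive.count, method = "total")
head(rowSums(count.relfreq))                 # every PCR now sums to 1
```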
@@ -239,13 +255,19 @@ positive.count.rarefied = rrarefy(positive.count,2000)
## Rarefying read count (2) {.flexbox .vcenter} ## Rarefying read count (2) {.flexbox .vcenter}
```{r fig.height=3} ```{r fig.height=4}
par(mfrow=c(1,2),bg=NA) par(mfrow=c(1,2),bg=NA)
hist(log10(colSums(positive.count)+1), hist(log10(colSums(positive.count)+1),
main = "Not rarefied", main = "Not rarefied",
xlim = c(0,6),
ylim = c(0,2300),
breaks = 30,
xlab = TeX("$\\log_{10}(reads per MOTUs)$")) xlab = TeX("$\\log_{10}(reads per MOTUs)$"))
hist(log10(colSums(positive.count.rarefied)+1), hist(log10(colSums(positive.count.rarefied)+1),
main = "Rarefied data", main = "Rarefied data",
xlim = c(0,6),
ylim = c(0,2300),
breaks = 30,
xlab = TeX("$\\log_{10}(reads per MOTUs)$")) xlab = TeX("$\\log_{10}(reads per MOTUs)$"))
``` ```
@@ -282,9 +304,7 @@ positive.count.rarefied = positive.count.rarefied[,are.still.present]
positive.motus.rare = positive.motus[are.still.present,] positive.motus.rare = positive.motus[are.still.present,]
``` ```
<center> <center>positive.motus.rare is now a $`r nrow(positive.count.rarefied)` \; PCRs \; \times \; `r ncol(positive.count.rarefied)` \; MOTUs$</center>
positive.motus.rare is now a $`r nrow(positive.count.rarefied)` \; PCRs \; \times \; `r ncol(positive.count.rarefied)` \; MOTUs$
</center>
## Why rarefying ? {.vcenter .columns-2} ## Why rarefying ? {.vcenter .columns-2}
@@ -292,8 +312,7 @@ positive.motus.rare is now a $`r nrow(positive.count.rarefied)` \; PCRs \; \time
knitr::include_graphics("figures/subsampling.svg") knitr::include_graphics("figures/subsampling.svg")
``` ```
<br><br><br><br> <br><br><br><br> Increasing the number of reads just increases the description of the part of the PCR product you have sequenced.
Increasing the number of reads just increase the description of the subpart of the PCR you have sequenced.
## Transforming read counts to relative frequencies ## Transforming read counts to relative frequencies
@@ -312,26 +331,21 @@ table(colSums(positive.count.relfreq) == 0)
## The different types of diversity {.vcenter} ## The different types of diversity {.vcenter}
<div style="float: left; width: 40%;"> ::: {style="float: left; width: 40%;"}
```{r} ```{r}
knitr::include_graphics("figures/diversity.svg") knitr::include_graphics("figures/diversity.svg")
``` ```
</div> :::
<div style="float: left; width: 60%;"> ::: {style="float: left; width: 60%;"}
<br><br> @Whittaker:10:00 <br><br><br><br>
<br><br> - $\alpha\text{-diversity}$ : Mean diversity per site ($species/site$)
@Whittaker:10:00
<br><br><br><br>
- $\alpha-diversity$ : Mean diversity per site ($species/site$) - $\gamma\text{-diversity}$ : Regional biodiversity ($species/region$)
- $\gamma-diversity$ : Regional biodiversity ($species/region$)
- $\beta-diversity$ : $\beta = \frac{\gamma}{\alpha}$ ($site$)
</div>
- $\beta\text{-diversity}$ : $\beta = \frac{\gamma}{\alpha}$ ($sites/region$)
:::
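To make Whittaker's relation concrete, here is a small worked example in R on a hypothetical three-site presence/absence table (site and species names are made up for illustration):

```r
# Hypothetical presence/absence matrix: 3 sites x 5 species
sites <- rbind(site1 = c(A = 1, B = 1, C = 1, D = 0, E = 0),
               site2 = c(A = 1, B = 0, C = 1, D = 1, E = 0),
               site3 = c(A = 0, B = 1, C = 1, D = 1, E = 1))
alpha <- mean(rowSums(sites))        # mean richness per site
gamma <- sum(colSums(sites) > 0)     # regional richness
beta  <- gamma / alpha               # multiplicative beta-diversity
c(alpha = alpha, gamma = gamma, beta = beta)
```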
# $\alpha$-diversity # $\alpha$-diversity
@@ -341,7 +355,6 @@ knitr::include_graphics("figures/diversity.svg")
knitr::include_graphics("figures/alpha_diversity.svg") knitr::include_graphics("figures/alpha_diversity.svg")
``` ```
```{r out.width = "400px"} ```{r out.width = "400px"}
E1 = c(A=0.25,B=0.25,C=0.25,D=0.25,E=0,F=0,G=0) E1 = c(A=0.25,B=0.25,C=0.25,D=0.25,E=0,F=0,G=0)
E2 = c(A=0.55,B=0.07,C=0.02,D=0.17,E=0.07,F=0.07,G=0.03) E2 = c(A=0.55,B=0.07,C=0.02,D=0.17,E=0.07,F=0.07,G=0.03)
@@ -352,7 +365,6 @@ kable(environments,
kable_styling(position = "center") kable_styling(position = "center")
``` ```
## Richness {.flexbox .vcenter} ## Richness {.flexbox .vcenter}
The actual number of species present in your environment, whatever their abundances The actual number of species present in your environment, whatever their abundances
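The chunk that computes the richness is not shown in this hunk; a minimal sketch, assuming `environments` is the two-row frequency table built from `E1` and `E2` above:

```r
# Richness: number of species with a non-zero frequency in each environment
S <- rowSums(environments > 0)
S
```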
@@ -374,17 +386,15 @@ kable(data.frame(S=S),
## Gini-Simpson's index {.smaller} ## Gini-Simpson's index {.smaller}
<div style="float: left; width: 60%;"> ::: {style="float: left; width: 60%;"}
The Simpson's index is the probability of having the same species twice when you randomly select two specimens. The Simpson's index is the probability of having the same species twice when you randomly select two specimens. <br> <br>
<br> :::
<br>
</div> ::: {style="float: right; width: 40%;"}
<div style="float: right; width: 40%;">
$$ $$
\lambda =\sum _{i=1}^{S}p_{i}^{2} \lambda =\sum _{i=1}^{S}p_{i}^{2}
$$ $$ <br>
<br> :::
</div>
<center> <center>
@@ -409,30 +419,22 @@ kable(data.frame(`Gini-Simpson`=GS),
kable_styling(position = "center") kable_styling(position = "center")
``` ```
## Shanon entropy {.smaller} ## Shannon entropy {.smaller}
<div style="float: left; width: 65%;"> Shannon entropy is based on information theory:
Shanon entropy is based on information theory.
Let $X$ be a uniformly distributed random variable with values in $A$ <center>$H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i}$</center>
if $A$ is a community where every species are equally represented then $$
H(A) = \log|A|
$$ $$
H(X) = \log|A|
$$
<br>
</div>
<div style="float: right; width: 35%;">
$$
H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i}
$$
<br>
</div>
<center> <center>
```{r out.width = "400px"} ```{r out.width = "400px"}
knitr::include_graphics("figures/alpha_diversity.svg") knitr::include_graphics("figures/alpha_diversity.svg")
``` ```
</center> </center>
```{r echo=TRUE} ```{r echo=TRUE}
@@ -440,7 +442,7 @@ H = - rowSums(environments * log(environments),na.rm = TRUE)
``` ```
```{r} ```{r}
kable(data.frame(`Shanon index`=H), kable(data.frame(`Shannon index`=H),
format="html", format="html",
align = 'rr') %>% align = 'rr') %>%
kable_styling(position = "center") kable_styling(position = "center")
@@ -448,23 +450,24 @@ kable(data.frame(`Shanon index`=H),
## Hill's number {.smaller} ## Hill's number {.smaller}
<div style="float: left; width: 50%;"> ::: {style="float: left; width: 50%;"}
As : As : $$
$$ H(A) = \log|A| \;\Rightarrow\; ^1D = e^{H(A)}
H(X) = \log|A| \;\Rightarrow\; ^2D = e^{H(X)} $$ <br>
$$ :::
<br>
</div> ::: {style="float: right; width: 50%;"}
<div style="float: right; width: 50%;"> where $^1D$ is the theoretical number of species in a evenly distributed community that would have the same Shannon's entropy than ours.
where $^2D$ is the theoretical number of species in a evenly distributed community that would have the same Shanon's entropy than ours. :::
</div>
<center> <center>
<BR>
<BR> <BR> <BR>
```{r out.width = "400px"} ```{r out.width = "400px"}
knitr::include_graphics("figures/alpha_diversity.svg") knitr::include_graphics("figures/alpha_diversity.svg")
``` ```
</center> </center>
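The chunk producing the Hill number is elided in this hunk; a minimal sketch of the relation $^1D = e^{H}$, assuming `H` is the Shannon entropy computed on the previous slide (the name `D1` is mine, not the lecture's):

```r
# Hill number of order 1: effective number of species
D1 <- exp(H)
D1
```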
```{r echo=TRUE} ```{r echo=TRUE}
@@ -483,7 +486,7 @@ kable(data.frame(`Hill Numbers`=D2),
Based on the generalized entropy @Tsallis:94:00 we can propose a generalized form of the logarithm. Based on the generalized entropy @Tsallis:94:00 we can propose a generalized form of the logarithm.
$$ $$
^q\log(x) = \frac{x^{(1-q)}}{1-q} ^q\log(x) = \frac{x^{(1-q)}-1}{1-q}
$$ $$
The function is not defined for $q=1$ but when $q \longrightarrow 1\;,\; ^q\log(x) \longrightarrow \log(x)$ The function is not defined for $q=1$ but when $q \longrightarrow 1\;,\; ^q\log(x) \longrightarrow \log(x)$
@@ -491,14 +494,14 @@ The function is not defined for $q=1$ but when $q \longrightarrow 1\;,\; ^q\log(
$$ $$
^q\log(x) = \left\{ ^q\log(x) = \left\{
\begin{align} \begin{align}
\log(x),& \text{if } x = 1\\ \log(x),& \text{if } q = 1\\
\frac{x^{(1-q)}}{1-q},& \text{otherwise} \frac{x^{(1-q)}-1}{1-q},& \text{otherwise}
\end{align} \end{align}
\right. \right.
$$ $$
```{r echo=TRUE, eval=FALSE} ```{r echo=TRUE, eval=FALSE}
log.q = function(x,q=1) { log_q = function(x,q=1) {
if (q==1) if (q==1)
log(x) log(x)
else else
@@ -506,6 +509,28 @@ log.q = function(x,q=1) {
} }
``` ```
## Impact of $q$ on the `log_q` function
```{r}
layout(matrix(c(1,1,2),nrow=1))
qs = seq(0,5,by=1)
x = seq(0.001,1,length.out = 100)
plot(x,log_q(x,0),lty=2,lwd=3,col=0,type="l",
ylab = TeX("$^q\\log$"),
ylim=c(-10,0))
for (i in seq_along(qs)) {
points(x,log_q(x,qs[i]),lty=1,lwd=1,col=i,type="l")
}
points(x,log(x),lty=2,lwd=3,col="red",type="l")
plot(0,type='n',axes=FALSE,ann=FALSE)
legend("topleft",legend = qs,fill = seq_along(qs),cex=1.5)
```
## And its inverse function {.flexbox .vcenter} ## And its inverse function {.flexbox .vcenter}
$$ $$
@@ -516,8 +541,9 @@ $$
\end{align} \end{align}
\right. \right.
$$ $$
```{r echo=TRUE, eval=FALSE} ```{r echo=TRUE, eval=FALSE}
exp.q = function(x,q=1) { exp_q = function(x,q=1) {
if (q==1) if (q==1)
exp(x) exp(x)
else else
@@ -525,41 +551,41 @@ exp.q = function(x,q=1) {
} }
``` ```
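Since the body of `exp_q` is truncated in this hunk, here is a self-contained sketch of both generalized functions, assuming the standard Tsallis definitions used elsewhere in the lecture, together with a quick check that `exp_q` inverts `log_q`:

```r
# Generalized (Tsallis) logarithm and its inverse (assumed standard definitions)
log_q <- function(x, q = 1) {
  if (q == 1) log(x) else (x^(1 - q) - 1) / (1 - q)
}
exp_q <- function(x, q = 1) {
  if (q == 1) exp(x) else (1 + (1 - q) * x)^(1 / (1 - q))
}

x <- c(0.1, 0.5, 1, 2)
all.equal(exp_q(log_q(x, q = 2), q = 2), x)   # TRUE: exp_q inverts log_q
```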
## Generalised Shanon entropy ## Generalised Shannon entropy
$$ $$
^qH = - \sum_{i=1}^S pi \times ^q\log pi ^qH = - \sum_{i=1}^S p_i \; ^q\log p_i
$$ $$
```{r echo=TRUE, eval=FALSE} ```{r echo=TRUE, eval=FALSE}
H.q = function(x,q=1) { H_q = function(x,q=1) {
sum(x * log.q(1/x,q),na.rm = TRUE) sum(x * log_q(1/x,q),na.rm = TRUE)
} }
``` ```
and generalized the previously presented Hill's number and generalized the previously presented Hill's number
$$ $$
^qD=^qe^{^qH} ^qD=^qe^{^qH}
$$ $$
```{r echo=TRUE, eval=FALSE} ```{r echo=TRUE, eval=FALSE}
D.q = function(x,q=1) { D_q = function(x,q=1) {
exp.q(H.q(x,q),q) exp_q(H_q(x,q),q)
} }
``` ```
## Biodiversity spectrum (1) {.flexbox .vcenter} ## Biodiversity spectrum (1) {.flexbox .vcenter}
```{r echo=TRUE, eval=FALSE} ```{r echo=TRUE, eval=FALSE}
H.spectrum = function(x,q=1) { H_spectrum = function(x,q=1) {
sapply(q,function(Q) H.q(x,Q)) sapply(q,function(Q) H_q(x,Q))
} }
``` ```
```{r echo=TRUE, eval=FALSE} ```{r echo=TRUE, eval=FALSE}
D.spectrum = function(x,q=1) { D_spectrum = function(x,q=1) {
sapply(q,function(Q) D.q(x,Q)) sapply(q,function(Q) D_q(x,Q))
} }
``` ```
@@ -568,8 +594,8 @@ D.spectrum = function(x,q=1) {
```{r echo=TRUE,warning=FALSE,error=FALSE} ```{r echo=TRUE,warning=FALSE,error=FALSE}
library(MetabarSchool) library(MetabarSchool)
qs = seq(from=0,to=3,by=0.1) qs = seq(from=0,to=3,by=0.1)
environments.hq = apply(environments,MARGIN = 1,H.spectrum,q=qs) environments.hq = apply(environments,MARGIN = 1,H_spectrum,q=qs)
environments.dq = apply(environments,MARGIN = 1,D.spectrum,q=qs) environments.dq = apply(environments,MARGIN = 1,D_spectrum,q=qs)
``` ```
```{r} ```{r}
@@ -593,7 +619,7 @@ abline(v=c(0,1,2),lty=2,col=4:6)
- $^0H(X) = S - 1$ : the richness minus one. - $^0H(X) = S - 1$ : the richness minus one.
- $^1H(X) = H^{\prime}$ : the Shanon's entropy. - $^1H(X) = H^{\prime}$ : the Shannon's entropy.
- $^2H(X) = 1 - \lambda$ : Gini-Simpson's index. - $^2H(X) = 1 - \lambda$ : Gini-Simpson's index.
@@ -606,18 +632,20 @@ abline(v=c(0,1,2),lty=2,col=4:6)
- $^2D(X) = 1 / \lambda$ : The number of species in an even community having the same Gini-Simpson's index. - $^2D(X) = 1 / \lambda$ : The number of species in an even community having the same Gini-Simpson's index.
<br> <br>
<center> <center>
$q$ can be considered as a penalty you give to rare species $q$ can be considered as a penalty you give to rare species
**when $q=0$ all the species have the same weight** **when** $q=0$ all the species have the same weight
</center> </center>
## Biodiversity spectrum of the mock community ## Biodiversity spectrum of the mock community
```{r echo=TRUE} ```{r echo=TRUE}
H.mock = H.spectrum(plants.16$dilution,qs) H.mock = H_spectrum(plants.16$dilution,qs)
D.mock = D.spectrum(plants.16$dilution,qs) D.mock = D_spectrum(plants.16$dilution,qs)
``` ```
```{r} ```{r}
@@ -640,9 +668,10 @@ abline(v=c(0,1,2),lty=2,col=4:6)
```{r echo=TRUE} ```{r echo=TRUE}
positive.H = apply(positive.count.relfreq, positive.H = apply(positive.count.relfreq,
MARGIN = 1, MARGIN = 1,
FUN = H.spectrum, FUN = H_spectrum,
q=qs) q=qs)
``` ```
```{r} ```{r}
par(bg=NA) par(bg=NA)
boxplot(t(positive.H), boxplot(t(positive.H),
@@ -654,7 +683,6 @@ points(H.mock,col="red",type="l")
## Biodiversity spectrum and metabarcoding (2) {.flexbox .vcenter .smaller} ## Biodiversity spectrum and metabarcoding (2) {.flexbox .vcenter .smaller}
```{r} ```{r}
par(bg=NA) par(bg=NA)
boxplot(t(positive.H)[,11:31], boxplot(t(positive.H)[,11:31],
@@ -672,7 +700,7 @@ positive.H.means = rowMeans(positive.H)
```{r echo=TRUE} ```{r echo=TRUE}
positive.D = apply(positive.count.relfreq, positive.D = apply(positive.count.relfreq,
MARGIN = 1, MARGIN = 1,
FUN = D.spectrum, FUN = D_spectrum,
q=qs) q=qs)
``` ```
@@ -709,7 +737,6 @@ obiclean -s merged_sample -H -C -r 0.1 \
> positifs.uniq.annotated.clean.fasta > positifs.uniq.annotated.clean.fasta
``` ```
## Impact of data cleaning on $\alpha$-diversity (2) ## Impact of data cleaning on $\alpha$-diversity (2)
```{r echo=TRUE} ```{r echo=TRUE}
@@ -720,7 +747,7 @@ positive.clean.count.relfreq = decostand(positive.clean.count,
positive.clean.H = apply(positive.clean.count.relfreq, positive.clean.H = apply(positive.clean.count.relfreq,
MARGIN = 1, MARGIN = 1,
FUN = H.spectrum, FUN = H_spectrum,
q=qs) q=qs)
``` ```
@@ -738,7 +765,7 @@ points(H.mock,col="red",type="l")
```{r echo=TRUE} ```{r echo=TRUE}
positive.clean.D = apply(positive.clean.count.relfreq, positive.clean.D = apply(positive.clean.count.relfreq,
MARGIN = 1, MARGIN = 1,
FUN = D.spectrum, FUN = D_spectrum,
q=qs) q=qs)
``` ```
@@ -753,16 +780,11 @@ points(D.mock,col="red",type="l")
positive.clean.D.means = rowMeans(positive.D) positive.clean.D.means = rowMeans(positive.D)
``` ```
# $\beta$-diversity # $\beta$-diversity
## Dissimilarity indices or non-metric distances {.flexbox .vcenter} ## Dissimilarity indices or non-metric distances {.flexbox .vcenter}
<center>
A dissimilarity index $d(A,B)$ is a numerical measurement <center>A dissimilarity index $d(A,B)$ is a numerical measurement <br> of how far apart objects $A$ and $B$ are.</center>
<br>
of how far apart objects $A$ and $B$ are.
</center>
### Properties ### Properties
@@ -794,17 +816,15 @@ $$
## Metrics or distances ## Metrics or distances
<div style="float: left; width: 50%;"> ::: {style="float: left; width: 50%;"}
```{r out.width = "400px"} ```{r out.width = "400px"}
knitr::include_graphics("figures/metric.svg") knitr::include_graphics("figures/metric.svg")
``` ```
</div> :::
<div style="float: right; width: 50%;">
::: {style="float: right; width: 50%;"}
A metric is a dissimilarity index satisfying *subadditivity*, also known as the *triangle inequality* A metric is a dissimilarity index satisfying *subadditivity*, also known as the *triangle inequality*
$$ $$
\begin{align} \begin{align}
d(A,B) \geqslant& 0 \\ d(A,B) \geqslant& 0 \\
@@ -813,20 +833,18 @@ d(A,B) =& \;0 \iff A = B \\
d(A,B) \leqslant& \;d(A,C) + d(C,B) d(A,B) \leqslant& \;d(A,C) + d(C,B)
\end{align} \end{align}
$$ $$
:::
</div>
## Some metrics ## Some metrics
<div style="float: left; width: 50%;"> ::: columns
::: {.column width="40%"}
```{r out.width = "400px"} ```{r out.width = "400px"}
knitr::include_graphics("figures/Distance.svg") knitr::include_graphics("figures/Distance.svg")
``` ```
:::
</div> ::: {.column width="60%"}
<div style="float: right; width: 50%;">
### Computing ### Computing
$$ $$
@@ -836,8 +854,8 @@ d_m =& |x_A - x_B| + |y_A - y_B| \\
d_c =& \max(|x_A - x_B| , |y_A - y_B|) \\ d_c =& \max(|x_A - x_B| , |y_A - y_B|) \\
\end{align} \end{align}
$$ $$
:::
</div> :::
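A small R sketch of the three distances named on this slide, computed on two made-up points (the coordinates are arbitrary):

```r
# Euclidean, Manhattan and Chebyshev distances between two hypothetical points
A <- c(x = 1, y = 2)
B <- c(x = 4, y = 6)
d_e <- sqrt(sum((A - B)^2))   # Euclidean
d_m <- sum(abs(A - B))        # Manhattan
d_c <- max(abs(A - B))        # Chebyshev
c(euclidean = d_e, manhattan = d_m, chebyshev = d_c)
```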
## Generalizable on a n-dimension space {.smaller} ## Generalizable on a n-dimension space {.smaller}
@@ -852,7 +870,6 @@ $$
with $a_i$ and $b_i$ being respectively the value of the $i^{th}$ variable for $A$ and $B$. with $a_i$ and $b_i$ being respectively the value of the $i^{th}$ variable for $A$ and $B$.
$$ $$
\begin{align} \begin{align}
d_e =& \sqrt{\sum_{i=1}^{n}(a_i - b_i)^2 } \\ d_e =& \sqrt{\sum_{i=1}^{n}(a_i - b_i)^2 } \\
@@ -875,14 +892,14 @@ $$
## Metrics and ultrametrics ## Metrics and ultrametrics
<div style="float: left; width: 50%;"> ::: columns
::: {.column width="40%"}
```{r out.width = "400px"} ```{r out.width = "400px"}
knitr::include_graphics("figures/ultrametric.svg") knitr::include_graphics("figures/ultrametric.svg")
``` ```
</div> :::
<div style="float: right; width: 50%;">
::: {.column width="60%"}
### Metric ### Metric
$$ $$
@@ -894,9 +911,8 @@ $$
$$ $$
d(x,z)\leq \max(d(x,y),d(y,z)) d(x,z)\leq \max(d(x,y),d(y,z))
$$ $$
:::
:::
</div>
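A concrete way to see an ultrametric in R: cophenetic distances read off a dendrogram satisfy the ultrametric inequality. A minimal sketch on a base-R dataset (the dataset and the checked triple are arbitrary choices of mine):

```r
# Cophenetic distances from a hierarchical clustering are ultrametric
d  <- dist(USArrests[1:10, ])           # an ordinary (Euclidean) distance
hc <- hclust(d, method = "average")     # build a dendrogram
du <- as.matrix(cophenetic(hc))         # ultrametric distance matrix
# check d(x,z) <= max(d(x,y), d(y,z)) on one triple of leaves
du["Alabama", "Arizona"] <= max(du["Alabama", "Alaska"], du["Alaska", "Arizona"])
```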
## Why is it nice to use metrics? {.flexbox .vcenter} ## Why is it nice to use metrics? {.flexbox .vcenter}
@@ -905,7 +921,6 @@ $$
- This means that rotations are not changing distances between objects - This means that rotations are not changing distances between objects
- Multidimensional scaling (PCA, PCoA, CoA...) are rotations - Multidimensional scaling (PCA, PCoA, CoA...) are rotations
## The data set {.flexbox .vcenter} ## The data set {.flexbox .vcenter}
**We analyzed two forest sites in French Guiana** **We analyzed two forest sites in French Guiana**
@@ -926,7 +941,6 @@ data("guiana.motus")
data("guiana.samples") data("guiana.samples")
``` ```
## Clean out bad PCR cycle 1 {.flexbox .vcenter .smaller} ## Clean out bad PCR cycle 1 {.flexbox .vcenter .smaller}
```{r echo=TRUE,fig.height=2.5} ```{r echo=TRUE,fig.height=2.5}
@@ -934,6 +948,7 @@ s = tag_bad_pcr(guiana.samples$sample,guiana.count)
guiana.count.clean = guiana.count[s$keep,] guiana.count.clean = guiana.count[s$keep,]
guiana.samples.clean = guiana.samples[s$keep,] guiana.samples.clean = guiana.samples[s$keep,]
``` ```
```{r echo=TRUE} ```{r echo=TRUE}
table(s$keep) table(s$keep)
``` ```
@@ -965,7 +980,7 @@ table(s$keep)
## Averaging good PCR replicates (1) {.flexbox .vcenter} ## Averaging good PCR replicates (1) {.flexbox .vcenter}
```{r echo=TRUE} ```{r echo=TRUE}
guiana.samples.clean = cbind(guiana.samples.clean,s) guiana.samples.clean = cbind(guiana.samples.clean,s[rownames(guiana.samples.clean),])
guiana.count.mean = aggregate(decostand(guiana.count.clean,method = "total"), guiana.count.mean = aggregate(decostand(guiana.count.clean,method = "total"),
by = list(guiana.samples.clean$sample), by = list(guiana.samples.clean$sample),
@@ -1023,18 +1038,20 @@ xy = xy[,1:2]
xy.hellinger = decostand(xy,method = "hellinger") xy.hellinger = decostand(xy,method = "hellinger")
``` ```
<div style="float: left; width: 50%;"> ::: columns
::: {.column width="40%"}
```{r, fig.width=4,fig.height=4} ```{r, fig.width=4,fig.height=4}
par(bg=NA) par(bg=NA)
plot(xy.hellinger,asp=1) plot(xy.hellinger,asp=1)
``` ```
</div> :::
<div style="float: right; width: 50%;">
::: {.column width="60%"}
```{r out.width = "400px"} ```{r out.width = "400px"}
knitr::include_graphics("figures/euclidean_hellinger.svg") knitr::include_graphics("figures/euclidean_hellinger.svg")
``` ```
</div> :::
:::
## Bray-Curtis distance on relative frequencies ## Bray-Curtis distance on relative frequencies
@@ -1051,15 +1068,15 @@ BC_{jk}=\frac{\sum _{i=1}^{p}(N_{ij} - min(N_{ij},N_{ik}) + (N_{ik} - min(N_{ij}
$$ $$
$$ $$
BC_{jk}=\frac{\sum _{i=1}^{p}]N_{ij} - N_{ik}|}{\sum _{i=1}^{p}N_{ij}+\sum _{i=1}^{p}N_{ik}} BC_{jk}=\frac{\sum _{i=1}^{p}|N_{ij} - N_{ik}|}{\sum _{i=1}^{p}N_{ij}+\sum _{i=1}^{p}N_{ik}}
$$ $$
$$ $$
BC_{jk}=\frac{\sum _{i=1}^{p}]N_{ij} - N_{ik}|}{1+1} BC_{jk}=\frac{\sum _{i=1}^{p}|N_{ij} - N_{ik}|}{1+1}
$$ $$
$$ $$
BC_{jk}=\frac{1}{2}\sum _{i=1}^{p}]N_{ij} - N_{ik}| BC_{jk}=\frac{1}{2}\sum _{i=1}^{p}|N_{ij} - N_{ik}|
$$ $$
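A quick R sketch of the simplification above: on relative frequencies, Bray-Curtis reduces to half the Manhattan (L1) distance. The toy counts are made up; `vegdist` and `decostand` are the standard vegan functions:

```r
library(vegan)
counts  <- rbind(pcr1 = c(10, 0, 5, 35), pcr2 = c(20, 10, 0, 20))
relfreq <- decostand(counts, method = "total")   # rows now sum to 1
vegdist(relfreq, method = "bray")                # Bray-Curtis distance
sum(abs(relfreq[1, ] - relfreq[2, ])) / 2        # same value: half the L1 distance
```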
## Principal coordinate analysis (1) {.flexbox .vcenter} ## Principal coordinate analysis (1) {.flexbox .vcenter}
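The chunk that actually builds the PCoA is elided in this hunk; a minimal sketch of the usual workflow, assuming `guiana.relfreq.final` is the samples x MOTUs relative-frequency table prepared earlier (the intermediate object names are mine):

```r
library(vegan)
guiana.bc.dist <- vegdist(guiana.relfreq.final, method = "bray")  # Bray-Curtis
guiana.bc.pcoa <- cmdscale(guiana.bc.dist, k = 3, eig = TRUE)     # classical MDS / PCoA
plot(guiana.bc.pcoa$points[, 1:2], asp = 1,
     xlab = "Axis 1", ylab = "Axis 2")
```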
@@ -1086,7 +1103,7 @@ plot(guiana.bc.pcoa$points[,1:2],
xlab="Axis 1", xlab="Axis 1",
ylab="Axis 2", ylab="Axis 2",
main = "Bray Curtis on Rel. Freqs") main = "Bray Curtis on Rel. Freqs")
plot(guiana.euc.pcoa$points[,1:2], plot(guiana.euc.pcoa$points[,1],-guiana.euc.pcoa$points[,2],
col = samples.type, col = samples.type,
asp = 1, asp = 1,
xlab="Axis 1", xlab="Axis 1",
@@ -1100,7 +1117,7 @@ plot(guiana.jac.1.pcoa$points[,1:2],
xlab="Axis 1", xlab="Axis 1",
ylab="Axis 2", ylab="Axis 2",
main = "Jaccard on presence (0.1%)") main = "Jaccard on presence (0.1%)")
plot(guiana.jac.10.pcoa$points[,1:2], plot(-guiana.jac.10.pcoa$points[,1],guiana.jac.10.pcoa$points[,2],
col = samples.type, col = samples.type,
asp = 1, asp = 1,
xlab="Axis 1", xlab="Axis 1",
@@ -1139,12 +1156,66 @@ plot(0,type='n',axes=FALSE,ann=FALSE)
legend("topleft",legend = levels(samples.type),fill = 1:4,cex=1.2) legend("topleft",legend = levels(samples.type),fill = 1:4,cex=1.2)
``` ```
````{=html}
<!---
## Computation of norms
```{r guiana_norm, echo=TRUE}
guiana.n1.dist = norm(guiana.relfreq.final,l=1)
guiana.n2.dist = norm(guiana.relfreq.final^(1/2),l=2)
guiana.n3.dist = norm(guiana.relfreq.final^(1/3),l=3)
guiana.n4.dist = norm(guiana.relfreq.final^(1/100),l=100)
```
## pCoA on norms
```{r dependson="guiana_norm"}
guiana.n1.pcoa = cmdscale(guiana.n1.dist,k=3,eig = TRUE)
guiana.n2.pcoa = cmdscale(guiana.n2.dist,k=3,eig = TRUE)
guiana.n3.pcoa = cmdscale(guiana.n3.dist,k=3,eig = TRUE)
guiana.n4.pcoa = cmdscale(guiana.n4.dist,k=3,eig = TRUE)
```
```{r}
par(mfrow=c(2,3),bg=NA)
plot(guiana.n1.pcoa$points[,1],guiana.n1.pcoa$points[,2],
col = samples.type,
asp = 1,
xlab="Axis 1",
ylab="Axis 2",
main = "Norm 1 on Hellinger")
plot(guiana.n2.pcoa$points[,1],-guiana.n2.pcoa$points[,2],
col = samples.type,
asp = 1,
xlab="Axis 1",
ylab="Axis 2",
main = "Norm 2 on Hellinger")
plot(0,type='n',axes=FALSE,ann=FALSE)
legend("topleft",legend = levels(samples.type),fill = 1:4,cex=1.2)
plot(-guiana.n3.pcoa$points[,1],-guiana.n3.pcoa$points[,2],
col = samples.type,
asp = 1,
xlab="Axis 1",
ylab="Axis 2",
main = "Norm 3 on Hellinger")
plot(-guiana.n4.pcoa$points[,1],-guiana.n4.pcoa$points[,2],
col = samples.type,
asp = 1,
xlab="Axis 1",
ylab="Axis 2",
main = "Norm 4 on Hellinger")
```
--->
````
## Comparing diversity of the environments ## Comparing diversity of the environments
```{r} ```{r}
guiana.relfreq.final = apply(guiana.relfreq.final, guiana.relfreq.final = apply(guiana.relfreq.final,
MARGIN = 1, MARGIN = 1,
FUN = H.spectrum, FUN = H_spectrum,
q=qs) q=qs)
``` ```
@@ -1174,7 +1245,4 @@ boxplot(t(guiana.relfreq.final[,samples.type=="soil.Petit Plateau"]),log="y",
names=qs,las=2,col=4,add=TRUE) names=qs,las=2,col=4,add=TRUE)
``` ```
## Bibliography ## Bibliography


@@ -0,0 +1,17 @@
knitr
tidyverse
ggplot2
tibble
tidyr
readr
purrr
dplyr
stringr
forcats
lubridate
kableExtra
latex2exp
MetabarSchool
permute
lattice
vegan

4 binary image files added (40 KiB, 59 KiB, 54 KiB, 46 KiB); file contents not shown.

Some files were not shown because too many files have changed in this diff.