New version of the lecture

2024-02-02 09:49:21 +01:00
parent 050956c01b
commit f6431654dc
190 changed files with 7703 additions and 2629 deletions
@@ -1,16 +1,16 @@
 ---
 title: "Biodiversity metrics \ and metabarcoding"
 author: "Eric Coissac"
-date: "28/01/2019"
+date: "02/02/2024"
 bibliography: inst/REFERENCES.bib
-output:
-  ioslides_presentation: 
-    widescreen: true
+format: 
+  revealjs:
     smaller: true
-    css: slides.css
-    mathjax: local
-    self_contained: false
-  slidy_presentation: default
+    transition: slide
+    scrollable: true
+    theme: simple
+    html-math-method: mathjax
+editor: visual
 ---

 ```{r setup, include=FALSE}
@@ -25,7 +25,6 @@ opts_chunk$set(echo = FALSE,
               cache.lazy = FALSE)
 ```

-
 # Summary

 -   The MetabarSchool Package
@@ -38,7 +37,7 @@ opts_chunk$set(echo = FALSE,

 # The MetabarSchool Package

-## Instaling the package
+## Installing the package

 You need the *devtools* package

@@ -83,8 +82,7 @@ data("positive.samples")

 -   `r nrow(positive.samples)` PCR of the mock community using SPER02 trnL-P6-Loop primers

-  - `r length(table(positive.samples$dilution))` dilutions of the mock 
-    community: `r paste0('1/',names(table(positive.samples$dilution)))` 
+    -   `r length(table(positive.samples$dilution))` dilutions of the mock community: `r paste0('1/',names(table(positive.samples$dilution)))`

    -   `r as.numeric(table(positive.samples$dilution)[1])` repeats per dilution

@@ -98,9 +96,7 @@ data("positive.samples")
 data("positive.motus")
 ```

- `positive.count` read count matrix 
-  $`r nrow(positive.count)` \; PCRs \; \times \;  `r ncol(positive.count)` \; MOTUs$
-  
+-   `positive.count` read count matrix $`r nrow(positive.count)` \; PCRs \; \times \; `r ncol(positive.count)` \; MOTUs$

 ```{r}
 knitr::kable(positive.count[1:5,1:5],
@@ -111,11 +107,11 @@ knitr::kable(positive.count[1:5,1:5],
 ```

 <br>
+
 ```{r echo=TRUE,eval=FALSE}
 positive.count[1:5,1:5]
 ```

-
 ## Loading data

 ```{r echo=TRUE}
@@ -126,9 +122,7 @@ data("positive.samples")
 data("positive.motus")
 ```

- `positive.samples` a `r nrow(positive.samples)` rows `data.frame` of 
-   `r ncol(positive.samples)` columns describing each PCR
-
+-   `positive.samples` a `r nrow(positive.samples)` rows `data.frame` of `r ncol(positive.samples)` columns describing each PCR

 ```{r}
 knitr::kable(head(positive.samples,n=3),
@@ -138,11 +132,11 @@ knitr::kable(head(positive.samples,n=3),
 ```

 <br>
+
 ```{r echo=TRUE,eval=FALSE}
 head(positive.samples,n=3)
 ```

-
 ## Loading data

 ```{r echo=TRUE}
@@ -153,8 +147,7 @@ data("positive.samples")
 data("positive.motus")
 ```

- `positive.motus` : a `r nrow(positive.motus)` rows `data.frame` of 
-   `r ncol(positive.motus)` columns describing each MOTU
+-   `positive.motus` : a `r nrow(positive.motus)` rows `data.frame` of `r ncol(positive.motus)` columns describing each MOTU

 ```{r}
 knitr::kable(head(positive.motus,n=3),
@@ -164,6 +157,7 @@ knitr::kable(head(positive.motus,n=3),
 ```

 <br>
+
 ```{r echo=TRUE,eval=FALSE}
 head(positive.motus,n=3)
 ```
@@ -176,7 +170,6 @@ Singleton sequences are observed only once over the complete dataset.
 table(colSums(positive.count) == 1)
 ```

-
 ```{r}
 kable(t(table(colSums(positive.count) == 1)),
             format = "html") %>% 
@@ -194,9 +187,7 @@ positive.count = positive.count[,are.not.singleton]
 positive.motus = positive.motus[are.not.singleton,]
 ```

- `positive.count` is now a 
-  $`r nrow(positive.count)` \; PCRs \; \times \;  `r ncol(positive.count)` \; MOTUs$
-  matrix 
+-   `positive.count` is now a $`r nrow(positive.count)` \; PCRs \; \times \; `r ncol(positive.count)` \; MOTUs$ matrix

 ## Not all the PCR have the same number of reads {.flexbox .vcenter}

@@ -210,9 +201,9 @@ hist(rowSums(positive.count),
     main = "Number of read per PCR")
 ```

-<div class="green">
+::: green
 Is it related to the amount of DNA in the extract ?
-</div>
+:::

 ## What do the reading numbers per PCR mean? {.smaller}

@@ -222,17 +213,13 @@ boxplot(rowSums(positive.count) ~ positive.samples$dilution,log="y")
 abline(h = median(rowSums(positive.count)),lw=2,col="red",lty=2)
 ```

-
 ```{r}
 SC = summary(aov((rowSums(positive.count)) ~ positive.samples$dilution))[[1]]$`Sum Sq`
 ```

-<div class="red2">
-<center>
-  Only `r round((SC/sum(SC)*100)[1],1)`% of the PCR read count 
-  variation is explain by dilution
-</center>
-</div>
+::: red2
+<center>Only `r round((SC/sum(SC)*100)[1],1)`% of the PCR read count variation is explained by dilution</center>
+:::

 ## You must normalize your read counts

@@ -242,7 +229,6 @@ Two options:

 Randomly subsample the same number of reads for all the PCRs

-
 ### Relative frequencies

 Divide the read count of each MOTU in each sample by the total read count of the same sample
@@ -318,9 +304,7 @@ positive.count.rarefied = positive.count.rarefied[,are.still.present]
 positive.motus.rare = positive.motus[are.still.present,]
 ```

-<center>
-positive.motus.rare is now a $`r nrow(positive.count.rarefied)` \; PCRs \; \times \;  `r ncol(positive.count.rarefied)` \; MOTUs$
-</center>  
+<center>positive.motus.rare is now a $`r nrow(positive.count.rarefied)` \; PCRs \; \times \; `r ncol(positive.count.rarefied)` \; MOTUs$</center>

 ## Why rarefying ? {.vcenter .columns-2}

@@ -328,8 +312,7 @@ positive.motus.rare is now a $`r nrow(positive.count.rarefied)` \; PCRs \; \time
 knitr::include_graphics("figures/subsampling.svg")
 ```

-<br><br><br><br>
-Increasing the number of reads just increase the description of the subpart of the PCR you have sequenced.
+<br><br><br><br> Increasing the number of reads just increases the description of the subpart of the PCR you have sequenced.

 ## Transforming read counts to relative frequencies

@@ -348,26 +331,21 @@ table(colSums(positive.count.relfreq) == 0)

 ## The different types of diversity {.vcenter}

-<div style="float: left; width: 40%;">
+::: {style="float: left; width: 40%;"}
 ```{r}
 knitr::include_graphics("figures/diversity.svg")
 ```
-</div>
+:::

-<div style="float: left; width: 60%;">
-
-<br><br>
-@Whittaker:10:00
-<br><br><br><br>
+::: {style="float: left; width: 60%;"}
+<br><br> @Whittaker:10:00 <br><br><br><br>

 -   $\alpha\text{-diversity}$ : Mean diversity per site ($species/site$)

 -   $\gamma\text{-diversity}$ : Regional biodiversity ($species/region$)

 -   $\beta\text{-diversity}$ : $\beta = \frac{\gamma}{\alpha}$ ($sites/region$)
-
-</div>
-
+:::

 # $\alpha$-diversity

@@ -377,7 +355,6 @@ knitr::include_graphics("figures/diversity.svg")
 knitr::include_graphics("figures/alpha_diversity.svg")
 ```

-
 ```{r out.width = "400px"}
 E1 = c(A=0.25,B=0.25,C=0.25,D=0.25,E=0,F=0,G=0)
 E2 = c(A=0.55,B=0.07,C=0.02,D=0.17,E=0.07,F=0.07,G=0.03)
@@ -388,7 +365,6 @@ kable(environments,
  kable_styling(position = "center")
 ```

-
 ## Richness {.flexbox .vcenter}

 The actual number of species present in your environment, whatever their abundances
@@ -410,17 +386,15 @@ kable(data.frame(S=S),

 ## Gini-Simpson's index {.smaller}

-<div style="float: left; width: 60%;">
-The Simpson's index is the probability of having the same species twice when you randomly select two specimens.
-<br>
-<br>
-</div>
-<div style="float: right; width: 40%;">
+::: {style="float: left; width: 60%;"}
+The Simpson's index is the probability of having the same species twice when you randomly select two specimens. <br> <br>
+:::
+
+::: {style="float: right; width: 40%;"}
 $$
 \lambda =\sum _{i=1}^{S}p_{i}^{2}
-$$
-<br>
-</div>
+$$ <br>
+:::

 <center>

@@ -449,20 +423,18 @@ kable(data.frame(`Gini-Simpson`=GS),

 Shannon entropy is based on information theory:

-<center>
-$H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i}$
-</center>
+<center>$H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i}$</center>

-
-if $A$ is a community where every species are equally represented then 
-$$
+if $A$ is a community where all species are equally represented then $$
 H(A) = \log|A|
 $$

 <center>
+
 ```{r out.width = "400px"}
 knitr::include_graphics("figures/alpha_diversity.svg")
 ```
+
 </center>

 ```{r echo=TRUE}
@@ -478,23 +450,24 @@ kable(data.frame(`Shannon index`=H),

 ## Hill's number {.smaller}

-<div style="float: left; width: 50%;">
-As :
-$$
+::: {style="float: left; width: 50%;"}
+As : $$
 H(A) = \log|A| \;\Rightarrow\; ^1D = e^{H(A)}
-$$
-<br>
-</div>
-<div style="float: right; width: 50%;">
+$$ <br>
+:::
+
+::: {style="float: right; width: 50%;"}
 where $^1D$ is the theoretical number of species in an evenly distributed community that would have the same Shannon's entropy as ours.
-</div>
+:::

 <center>
-<BR>
-<BR>
+
+<BR> <BR>
+
 ```{r out.width = "400px"}
 knitr::include_graphics("figures/alpha_diversity.svg")
 ```
+
 </center>

 ```{r echo=TRUE}
@@ -513,7 +486,7 @@ kable(data.frame(`Hill Numbers`=D2),
 Based on the generalized entropy @Tsallis:94:00 we can propose a generalized form of logarithm.

 $$
-^q\log(x) = \frac{x^{(1-q)}}{1-q}
+^q\log(x) = \frac{x^{(1-q)}-1}{1-q}
 $$

 The function is not defined for $q=1$ but when $q \longrightarrow 1\;,\; ^q\log(x) \longrightarrow \log(x)$
@@ -521,8 +494,8 @@ The function is not defined for $q=1$ but when $q \longrightarrow 1\;,\; ^q\log(
 $$
 ^q\log(x) = \left\{ 
             \begin{align}
-               \log(x),& \text{if } x = 1\\
-               \frac{x^{(1-q)}}{1-q},& \text{otherwise}
+               \log(x),& \text{if } q = 1\\
+               \frac{x^{(1-q)}-1}{1-q},& \text{otherwise}
             \end{align}
           \right.
 $$
@@ -568,6 +541,7 @@ $$
             \end{align}
           \right.
 $$
+
 ```{r echo=TRUE, eval=FALSE}
 exp_q = function(x,q=1) {
  if (q==1)
@@ -589,12 +563,12 @@ H_q = function(x,q=1) {
 }
 ```

-
 and generalized the previously presented Hill's number

 $$
 ^qD=^qe^{^qH}
 $$
+
 ```{r echo=TRUE, eval=FALSE}
 D_q = function(x,q=1) {
  exp_q(H_q(x,q),q)
@@ -658,10 +632,12 @@ abline(v=c(0,1,2),lty=2,col=4:6)
 -   $^2D(X) = 1 / \lambda$ : The number of species in an even community having the same Gini-Simpson's index.

 <br>
+
 <center>
+
 $q$ can be considered as a penalty you give to rare species

-**when $q=0$ all the species have the same weight**
+**when** $q=0$ **all the species have the same weight**

 </center>

@@ -695,6 +671,7 @@ positive.H = apply(positive.count.relfreq,
                   FUN = H_spectrum,
                   q=qs)
 ```
+
 ```{r}
 par(bg=NA) 
 boxplot(t(positive.H),
@@ -706,7 +683,6 @@ points(H.mock,col="red",type="l")

 ## Biodiversity spectrum and metabarcoding (2) {.flexbox .vcenter .smaller}

-
 ```{r}
 par(bg=NA) 
 boxplot(t(positive.H)[,11:31],
@@ -761,7 +737,6 @@ obiclean -s merged_sample -H -C -r 0.1 \
      > positifs.uniq.annotated.clean.fasta
 ```

-
 ## Impact of data cleaning on $\alpha$-diversity (2)

 ```{r echo=TRUE}
@@ -805,16 +780,11 @@ points(D.mock,col="red",type="l")
 positive.clean.D.means = rowMeans(positive.D)
 ```

-
 # $\beta$-diversity

-
 ## Dissimilarity indices or non-metric distances {.flexbox .vcenter}
-<center>
-A dissimilarity index $d(A,B)$ is a numerical measurement 
-<br>
-of how far apart  objects $A$ and $B$ are.
-</center>
+
+<center>A dissimilarity index $d(A,B)$ is a numerical measurement <br> of how far apart objects $A$ and $B$ are.</center>

 ### Properties

@@ -846,17 +816,15 @@ $$

 ## Metrics or distances

-<div style="float: left; width: 50%;">
+::: {style="float: left; width: 50%;"}
 ```{r out.width = "400px"}
 knitr::include_graphics("figures/metric.svg")
 ```
-</div>
-
-<div style="float: right; width: 50%;">
+:::

+::: {style="float: right; width: 50%;"}
 A metric is a dissimilarity index verifying the *subadditivity* also named *triangle inequality*

-
 $$
 \begin{align}
 d(A,B) \geqslant& 0 \\
@@ -865,20 +833,18 @@ d(A,B) =& \;0 \iff A = B \\
 d(A,B) \leqslant& \;d(A,C) + d(C,B)
 \end{align}
 $$
-
-</div>
+:::

 ## Some metrics

-<div style="float: left; width: 50%;">
-
+::: columns
+::: {.column width="40%"}
 ```{r out.width = "400px"}
 knitr::include_graphics("figures/Distance.svg")
 ```
+:::

-</div>
-<div style="float: right; width: 50%;">
-
+::: {.column width="60%"}
 ### Computing

 $$
@@ -888,8 +854,8 @@ d_m =& |x_A - x_B| + |y_A - y_B| \\
 d_c =& \max(|x_A - x_B| , |y_A - y_B|) \\
 \end{align}
 $$
-
-</div>
+:::
+:::

 ## Generalizable on a n-dimension space {.smaller}

@@ -904,7 +870,6 @@ $$

 with $a_i$ and $b_i$ being respectively the value of the $i^{th}$ variable for $A$ and $B$.

-
 $$
 \begin{align}
 d_e =& \sqrt{\sum_{i=1}^{n}(a_i - b_i)^2 } \\
@@ -927,14 +892,14 @@ $$

 ## Metrics and ultrametrics

-<div style="float: left; width: 50%;">
+::: columns
+::: {.column width="40%"}
 ```{r out.width = "400px"}
 knitr::include_graphics("figures/ultrametric.svg")
 ```
-</div>
-
-<div style="float: right; width: 50%;">
+:::

+::: {.column width="60%"}
 ### Metric

 $$
@@ -946,9 +911,8 @@ $$
 $$
 d(x,z)\leq \max(d(x,y),d(y,z))
 $$
-
-
-</div>
+:::
+:::

 ## Why it is nice to use metrics ? {.flexbox .vcenter}

@@ -957,7 +921,6 @@ $$
 -   This means that rotations are not changing distances between objects
 -   Multidimensional scaling (PCA, PCoA, CoA...) are rotations

-
 ## The data set {.flexbox .vcenter}

 **We analyzed two forest sites in French Guiana**
@@ -978,7 +941,6 @@ data("guiana.motus")
 data("guiana.samples")
 ```

-
 ## Clean out bad PCR cycle 1 {.flexbox .vcenter .smaller}

 ```{r echo=TRUE,fig.height=2.5}
@@ -986,6 +948,7 @@ s = tag_bad_pcr(guiana.samples$sample,guiana.count)
 guiana.count.clean = guiana.count[s$keep,]
 guiana.samples.clean = guiana.samples[s$keep,]
 ```
+
 ```{r echo=TRUE}
 table(s$keep)
 ```
@@ -1017,7 +980,7 @@ table(s$keep)
 ## Averaging good PCR replicates (1) {.flexbox .vcenter}

 ```{r echo=TRUE}
-guiana.samples.clean = cbind(guiana.samples.clean,s)
+guiana.samples.clean = cbind(guiana.samples.clean,s[rownames(guiana.samples.clean),])

 guiana.count.mean = aggregate(decostand(guiana.count.clean,method = "total"),
                              by = list(guiana.samples.clean$sample),
@@ -1075,18 +1038,20 @@ xy = xy[,1:2]
 xy.hellinger = decostand(xy,method = "hellinger")
 ```

-<div style="float: left; width: 50%;">
-
+::: columns
+::: {.column width="40%"}
 ```{r, fig.width=4,fig.height=4}
 par(bg=NA)
 plot(xy.hellinger,asp=1)
 ```
-</div>
-<div style="float: right; width: 50%;">
+:::
+
+::: {.column width="60%"}
 ```{r out.width = "400px"}
 knitr::include_graphics("figures/euclidean_hellinger.svg")
 ```
-</div>
+:::
+:::

 ## Bray-Curtis distance on relative frequencies

@@ -1191,6 +1156,7 @@ plot(0,type='n',axes=FALSE,ann=FALSE)
 legend("topleft",legend = levels(samples.type),fill = 1:4,cex=1.2)
 ```

+````{=html}
 <!---
 ## Computation of norms 

@@ -1242,6 +1208,7 @@ plot(-guiana.n4.pcoa$points[,1],-guiana.n4.pcoa$points[,2],
 ```

 --->
+````

 ## Comparing diversity of the environments

@@ -1278,7 +1245,4 @@ boxplot(t(guiana.relfreq.final[,samples.type=="soil.Petit Plateau"]),log="y",
        names=qs,las=2,col=4,add=TRUE)
 ```

-
-
-
 ## Bibliography
@@ -0,0 +1,17 @@
+knitr
+tidyverse
+ggplot2
+tibble
+tidyr
+readr
+purrr
+dplyr
+stringr
+forcats
+lubridate
+kableExtra
+latex2exp
+MetabarSchool
+permute
+lattice
+vegan
--- a/Show More
+++ b/Show More