Some corrections on the lecture

2019-11-03 13:12:58 -05:00
parent cf80ff9f6a
commit 050956c01b
7 changed files with 246 additions and 104 deletions
--- a/index.Rmd
+++ b/index.Rmd
@@ -28,6 +28,7 @@ opts_chunk$set(echo = FALSE,

 # Summary

+- The MetabarSchool Package
 - What do the reading numbers per PCR mean?
 - Rarefaction vs. relative frequencies
 - alpha diversity metrics
@@ -35,6 +36,28 @@ opts_chunk$set(echo = FALSE,
 - multidimentionnal analysis
 - comparison between datasets

+# The MetabarSchool Package
+
+## Installing the package
+
+You need the *devtools* package
+
+```{r eval=FALSE, echo=TRUE}
+install.packages("devtools",dependencies = TRUE)
+```
+
+Then you can install *MetabarSchool*
+
+```{r  eval=FALSE, echo=TRUE}
+devtools::install_git("https://git.metabarcoding.org/MetabarcodingSchool/biodiversity-metrics.git")
+```
+
+You will also need the *vegan* package
+
+```{r eval=FALSE, echo=TRUE}
+install.packages("vegan",dependencies = TRUE)
+```
+
 # The dataset

 ## The mock community {.flexbox .vcenter .smaller}
@@ -68,6 +91,8 @@ data("positive.samples")
 ## Loading data

 ```{r echo=TRUE}
+library(MetabarSchool)
+
 data("positive.count")
 data("positive.samples")
 data("positive.motus")
@@ -94,6 +119,8 @@ positive.count[1:5,1:5]
 ## Loading data

 ```{r echo=TRUE}
+library(MetabarSchool)
+
 data("positive.count")
 data("positive.samples")
 data("positive.motus")
@@ -119,6 +146,8 @@ head(positive.samples,n=3)
 ## Loading data

 ```{r echo=TRUE}
+library(MetabarSchool)
+
 data("positive.count")
 data("positive.samples")
 data("positive.motus")
@@ -169,7 +198,7 @@ positive.motus = positive.motus[are.not.singleton,]
  $`r nrow(positive.count)` \; PCRs \; \times \;  `r ncol(positive.count)` \; MOTUs$
  matrix 
  
-## Not all the PCR have the number of reads  {.flexbox .vcenter}
+## Not all PCRs have the same number of reads  {.flexbox .vcenter}

 Despite all standardization efforts

@@ -240,13 +269,19 @@ positive.count.rarefied = rrarefy(positive.count,2000)

 ## Rarefying read count (2)   {.flexbox .vcenter}

-```{r fig.height=3}
+```{r fig.height=4}
 par(mfrow=c(1,2),bg=NA)
 hist(log10(colSums(positive.count)+1),
     main = "Not rarefied",
+     xlim = c(0,6),
+     ylim = c(0,2300),
+     breaks = 30,
     xlab = TeX("$\\log_{10}(reads per MOTUs)$"))
 hist(log10(colSums(positive.count.rarefied)+1),
     main = "Rarefied data",
+     xlim = c(0,6),
+     ylim = c(0,2300),
+     breaks = 30,
     xlab = TeX("$\\log_{10}(reads per MOTUs)$"))
 ```

@@ -325,11 +360,11 @@ knitr::include_graphics("figures/diversity.svg")
@Whittaker:10:00
 <br><br><br><br>

- $\alpha-diversity$ : Mean diversity per site ($species/site$)
+- $\alpha\text{-diversity}$ : Mean diversity per site ($species/site$)

- $\gamma-diversity$ : Regional biodiversity   ($species/region$)
+- $\gamma\text{-diversity}$ : Regional biodiversity   ($species/region$)

- $\beta-diversity$  : $\beta = \frac{\gamma}{\alpha}$ ($site$)
+- $\beta\text{-diversity}$  : $\beta = \frac{\gamma}{\alpha}$ ($sites/region$)

 </div>

@@ -410,25 +445,19 @@ kable(data.frame(`Gini-Simpson`=GS),
  kable_styling(position = "center")
 ```

-## Shanon entropy   {.smaller}
+## Shannon entropy   {.smaller}

-<div style="float: left; width: 65%;">
-Shanon entropy is based on information theory. 
+Shannon entropy is based on information theory:

-Let $X$ be a uniformly distributed random variable with values in $A$ 
+<center>
+$H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i}$
+</center>

-$$
-H(X) = \log|A|
-$$

-<br>
-</div>
-<div style="float: right; width: 35%;">
+If $A$ is a community where all species are equally represented, then
 $$
-H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i}
+H(A) = \log|A|
 $$
-<br>
-</div>

 <center>
 ```{r out.width = "400px"}
@@ -441,7 +470,7 @@ H = - rowSums(environments * log(environments),na.rm = TRUE)
 ```

 ```{r}
-kable(data.frame(`Shanon index`=H),
+kable(data.frame(`Shannon index`=H),
             format="html",
             align = 'rr') %>% 
  kable_styling(position = "center")
@@ -452,12 +481,12 @@ kable(data.frame(`Shanon index`=H),
 <div style="float: left; width: 50%;">
 As :
 $$
-H(X) = \log|A| \;\Rightarrow\; ^1D = e^{H(X)}
+H(A) = \log|A| \;\Rightarrow\; ^1D = e^{H(A)}
 $$
 <br>
 </div>
 <div style="float: right; width: 50%;">
-where $^1D$ is the theoretical number of species in a evenly distributed community that would have the same Shanon's entropy than ours.
+where $^1D$ is the theoretical number of species in an evenly distributed community that would have the same Shannon entropy as ours.
 </div>

 <center>
@@ -548,10 +577,10 @@ exp_q = function(x,q=1) {
 }
 ```

-## Generalised Shanon entropy
+## Generalised Shannon entropy

 $$
-^qH = - \sum_{i=1}^S pi \times ^q\log pi
+^qH = - \sum_{i=1}^S p_i \; ^q\log p_i
 $$

 ```{r echo=TRUE, eval=FALSE}
@@ -616,7 +645,7 @@ abline(v=c(0,1,2),lty=2,col=4:6)

 - $^0H(X) = S - 1$ : the richness minus one.

- $^1H(X) = H^{\prime}$ : the Shanon's entropy.
+- $^1H(X) = H^{\prime}$ : Shannon's entropy.

 - $^2H(X) = 1 - \lambda$ : Gini-Simpson's index.

@@ -1074,15 +1103,15 @@ BC_{jk}=\frac{\sum _{i=1}^{p}(N_{ij} - min(N_{ij},N_{ik}) + (N_{ik} - min(N_{ij}
 $$

 $$
-BC_{jk}=\frac{\sum _{i=1}^{p}]N_{ij} - N_{ik}|}{\sum _{i=1}^{p}N_{ij}+\sum _{i=1}^{p}N_{ik}} 
+BC_{jk}=\frac{\sum _{i=1}^{p}|N_{ij} - N_{ik}|}{\sum _{i=1}^{p}N_{ij}+\sum _{i=1}^{p}N_{ik}} 
 $$

 $$
-BC_{jk}=\frac{\sum _{i=1}^{p}]N_{ij} - N_{ik}|}{1+1}
+BC_{jk}=\frac{\sum _{i=1}^{p}|N_{ij} - N_{ik}|}{1+1}
 $$

 $$
-BC_{jk}=\frac{1}{2}\sum _{i=1}^{p}]N_{ij} - N_{ik}|
+BC_{jk}=\frac{1}{2}\sum _{i=1}^{p}|N_{ij} - N_{ik}|
 $$

 ## Principale coordinate analysis (1) {.flexbox .vcenter}
@@ -1109,7 +1138,7 @@ plot(guiana.bc.pcoa$points[,1:2],
     xlab="Axis 1",
     ylab="Axis 2",
     main = "Bray Curtis on Rel. Freqs")
-plot(guiana.euc.pcoa$points[,1:2],
+plot(guiana.euc.pcoa$points[,1],-guiana.euc.pcoa$points[,2],
     col = samples.type,
     asp = 1,
     xlab="Axis 1",
@@ -1123,7 +1152,7 @@ plot(guiana.jac.1.pcoa$points[,1:2],
     xlab="Axis 1",
     ylab="Axis 2",
     main = "Jaccard on presence (0.1%)")
-plot(guiana.jac.10.pcoa$points[,1:2],
+plot(-guiana.jac.10.pcoa$points[,1],guiana.jac.10.pcoa$points[,2],
     col = samples.type,
     asp = 1,
     xlab="Axis 1",
@@ -1162,6 +1191,58 @@ plot(0,type='n',axes=FALSE,ann=FALSE)
 legend("topleft",legend = levels(samples.type),fill = 1:4,cex=1.2)
 ```

+<!---
+## Computation of norms 
+
+```{r guiana_norm, echo=TRUE}
+guiana.n1.dist = norm(guiana.relfreq.final,l=1)
+guiana.n2.dist = norm(guiana.relfreq.final^(1/2),l=2)
+guiana.n3.dist = norm(guiana.relfreq.final^(1/3),l=3)
+guiana.n4.dist = norm(guiana.relfreq.final^(1/100),l=100)
+```
+
+## pCoA on norms 
+
+```{r dependson="guiana_norm"}
+guiana.n1.pcoa  = cmdscale(guiana.n1.dist,k=3,eig = TRUE)
+guiana.n2.pcoa  = cmdscale(guiana.n2.dist,k=3,eig = TRUE)
+guiana.n3.pcoa  = cmdscale(guiana.n3.dist,k=3,eig = TRUE)
+guiana.n4.pcoa  = cmdscale(guiana.n4.dist,k=3,eig = TRUE)
+```
+
+```{r}
+par(mfrow=c(2,3),bg=NA)
+plot(guiana.n1.pcoa$points[,1],guiana.n1.pcoa$points[,2],
+     col = samples.type,
+     asp = 1,
+     xlab="Axis 1",
+     ylab="Axis 2",
+     main = "Norm 1 on Hellinger")
+plot(guiana.n2.pcoa$points[,1],-guiana.n2.pcoa$points[,2],
+     col = samples.type,
+     asp = 1,
+     xlab="Axis 1",
+     ylab="Axis 2",
+     main = "Norm 2 on Hellinger")
+plot(0,type='n',axes=FALSE,ann=FALSE)
+legend("topleft",legend = levels(samples.type),fill = 1:4,cex=1.2)
+plot(-guiana.n3.pcoa$points[,1],-guiana.n3.pcoa$points[,2],
+     col = samples.type,
+     asp = 1,
+     xlab="Axis 1",
+     ylab="Axis 2",
+     main = "Norm 3 on Hellinger")
+plot(-guiana.n4.pcoa$points[,1],-guiana.n4.pcoa$points[,2],
+     col = samples.type,
+     asp = 1,
+     xlab="Axis 1",
+     ylab="Axis 2",
+     main = "Norm 4 on Hellinger")
+
+```
+
+--->
+
 ## Comparing diversity of the environments

 ```{r}