diff --git a/NAMESPACE b/NAMESPACE index 835f9f3..1d0f808 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,5 +7,6 @@ export(H_spectrum) export(exp_q) export(log_q) export(mode) +export(norm) export(tag_bad_pcr) importFrom(Rdpack,reprompt) diff --git a/R/norme.R b/R/norme.R new file mode 100644 index 0000000..5d45a23 --- /dev/null +++ b/R/norme.R @@ -0,0 +1,16 @@ +#' @export +norm <- function(data,l=2) { + no <- function(x,y) sum(abs(data[x,]-data[y,])^l)^(1/l) + n = nrow(data) + d = matrix(0,nrow = n,ncol = n) + for (i in 1:n) + for (j in i:n) { + d[i,j] <- no(i,j) + d[j,i] <- d[i,j] + } + + rownames(d) = rownames(data) + colnames(d) = rownames(data) + + as.dist(d) +} diff --git a/figures/dist_hellinger.afdesign b/figures/dist_hellinger.afdesign index fb6838d..ac8d351 100644 Binary files a/figures/dist_hellinger.afdesign and b/figures/dist_hellinger.afdesign differ diff --git a/figures/diversity.afdesign b/figures/diversity.afdesign index 807a904..5491315 100644 Binary files a/figures/diversity.afdesign and b/figures/diversity.afdesign differ diff --git a/figures/subsampling.svg b/figures/subsampling.svg index aaebcec..c6073fb 100644 --- a/figures/subsampling.svg +++ b/figures/subsampling.svg @@ -1,7 +1,7 @@ - - + + @@ -19,30 +19,30 @@ - + - + - + - Subsampling + Subsampling - Subsampling + Subsampling - Subsampling + Subsampling - - - + + + diff --git a/index.Rmd b/index.Rmd index cf2940b..9f911cd 100644 --- a/index.Rmd +++ b/index.Rmd @@ -28,6 +28,7 @@ opts_chunk$set(echo = FALSE, # Summary +- The MetabarSchool Package - What do the reading numbers per PCR mean? - Rarefaction vs. relative frequencies - alpha diversity metrics @@ -35,6 +36,28 @@ opts_chunk$set(echo = FALSE, - multidimentionnal analysis - comparison between datasets +# The MetabarSchool Package + +## Instaling the package + +You need the *devtools* package + +```{r eval=FALSE, echo=TRUE} +install.packages("devtools",dependencies = TRUE) +``` + +Then you can install *MetabarSchool* + +```{r eval=FALSE, echo=TRUE} +devtools::install_git("https://git.metabarcoding.org/MetabarcodingSchool/biodiversity-metrics.git") +``` + +You will also need the *vegan* package + +```{r eval=FALSE, echo=TRUE} +install.packages("vegan",dependencies = TRUE) +``` + # The dataset ## The mock community {.flexbox .vcenter .smaller} @@ -68,6 +91,8 @@ data("positive.samples") ## Loading data ```{r echo=TRUE} +library(MetabarSchool) + data("positive.count") data("positive.samples") data("positive.motus") @@ -94,6 +119,8 @@ positive.count[1:5,1:5] ## Loading data ```{r echo=TRUE} +library(MetabarSchool) + data("positive.count") data("positive.samples") data("positive.motus") @@ -119,6 +146,8 @@ head(positive.samples,n=3) ## Loading data ```{r echo=TRUE} +library(MetabarSchool) + data("positive.count") data("positive.samples") data("positive.motus") @@ -169,7 +198,7 @@ positive.motus = positive.motus[are.not.singleton,] $`r nrow(positive.count)` \; PCRs \; \times \; `r ncol(positive.count)` \; MOTUs$ matrix -## Not all the PCR have the number of reads {.flexbox .vcenter} +## Not all the PCR have the same number of reads {.flexbox .vcenter} Despite all standardization efforts @@ -240,13 +269,19 @@ positive.count.rarefied = rrarefy(positive.count,2000) ## Rarefying read count (2) {.flexbox .vcenter} -```{r fig.height=3} +```{r fig.height=4} par(mfrow=c(1,2),bg=NA) hist(log10(colSums(positive.count)+1), main = "Not rarefied", + xlim = c(0,6), + ylim = c(0,2300), + breaks = 30, xlab = TeX("$\\log_{10}(reads per MOTUs)$")) hist(log10(colSums(positive.count.rarefied)+1), main = "Rarefied data", + xlim = c(0,6), + ylim = c(0,2300), + breaks = 30, xlab = TeX("$\\log_{10}(reads per MOTUs)$")) ``` @@ -325,11 +360,11 @@ knitr::include_graphics("figures/diversity.svg") @Whittaker:10:00



-- $\alpha-diversity$ : Mean diversity per site ($species/site$) +- $\alpha\text{-diversity}$ : Mean diversity per site ($species/site$) -- $\gamma-diversity$ : Regional biodiversity ($species/region$) +- $\gamma\text{-diversity}$ : Regional biodiversity ($species/region$) -- $\beta-diversity$ : $\beta = \frac{\gamma}{\alpha}$ ($site$) +- $\beta\text{-diversity}$ : $\beta = \frac{\gamma}{\alpha}$ ($sites/region$) @@ -410,25 +445,19 @@ kable(data.frame(`Gini-Simpson`=GS), kable_styling(position = "center") ``` -## Shanon entropy {.smaller} +## Shannon entropy {.smaller} -
-Shanon entropy is based on information theory. +Shannon entropy is based on information theory: -Let $X$ be a uniformly distributed random variable with values in $A$ +
+$H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i}$ +
-$$ -H(X) = \log|A| -$$ -
-
-
+if $A$ is a community where every species are equally represented then $$ -H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i} +H(A) = \log|A| $$ -
-
```{r out.width = "400px"} @@ -441,7 +470,7 @@ H = - rowSums(environments * log(environments),na.rm = TRUE) ``` ```{r} -kable(data.frame(`Shanon index`=H), +kable(data.frame(`Shannon index`=H), format="html", align = 'rr') %>% kable_styling(position = "center") @@ -452,12 +481,12 @@ kable(data.frame(`Shanon index`=H),
As : $$ -H(X) = \log|A| \;\Rightarrow\; ^1D = e^{H(X)} +H(A) = \log|A| \;\Rightarrow\; ^1D = e^{H(A)} $$
-where $^1D$ is the theoretical number of species in a evenly distributed community that would have the same Shanon's entropy than ours. +where $^1D$ is the theoretical number of species in a evenly distributed community that would have the same Shannon's entropy than ours.
@@ -548,10 +577,10 @@ exp_q = function(x,q=1) { } ``` -## Generalised Shanon entropy +## Generalised Shannon entropy $$ -^qH = - \sum_{i=1}^S pi \times ^q\log pi +^qH = - \sum_{i=1}^S p_i \; ^q\log p_i $$ ```{r echo=TRUE, eval=FALSE} @@ -616,7 +645,7 @@ abline(v=c(0,1,2),lty=2,col=4:6) - $^0H(X) = S - 1$ : the richness minus one. -- $^1H(X) = H^{\prime}$ : the Shanon's entropy. +- $^1H(X) = H^{\prime}$ : the Shannon's entropy. - $^2H(X) = 1 - \lambda$ : Gini-Simpson's index. @@ -1074,15 +1103,15 @@ BC_{jk}=\frac{\sum _{i=1}^{p}(N_{ij} - min(N_{ij},N_{ik}) + (N_{ik} - min(N_{ij} $$ $$ -BC_{jk}=\frac{\sum _{i=1}^{p}]N_{ij} - N_{ik}|}{\sum _{i=1}^{p}N_{ij}+\sum _{i=1}^{p}N_{ik}} +BC_{jk}=\frac{\sum _{i=1}^{p}|N_{ij} - N_{ik}|}{\sum _{i=1}^{p}N_{ij}+\sum _{i=1}^{p}N_{ik}} $$ $$ -BC_{jk}=\frac{\sum _{i=1}^{p}]N_{ij} - N_{ik}|}{1+1} +BC_{jk}=\frac{\sum _{i=1}^{p}|N_{ij} - N_{ik}|}{1+1} $$ $$ -BC_{jk}=\frac{1}{2}\sum _{i=1}^{p}]N_{ij} - N_{ik}| +BC_{jk}=\frac{1}{2}\sum _{i=1}^{p}|N_{ij} - N_{ik}| $$ ## Principale coordinate analysis (1) {.flexbox .vcenter} @@ -1109,7 +1138,7 @@ plot(guiana.bc.pcoa$points[,1:2], xlab="Axis 1", ylab="Axis 2", main = "Bray Curtis on Rel. Freqs") -plot(guiana.euc.pcoa$points[,1:2], +plot(guiana.euc.pcoa$points[,1],-guiana.euc.pcoa$points[,2], col = samples.type, asp = 1, xlab="Axis 1", @@ -1123,7 +1152,7 @@ plot(guiana.jac.1.pcoa$points[,1:2], xlab="Axis 1", ylab="Axis 2", main = "Jaccard on presence (0.1%)") -plot(guiana.jac.10.pcoa$points[,1:2], +plot(-guiana.jac.10.pcoa$points[,1],guiana.jac.10.pcoa$points[,2], col = samples.type, asp = 1, xlab="Axis 1", @@ -1162,6 +1191,58 @@ plot(0,type='n',axes=FALSE,ann=FALSE) legend("topleft",legend = levels(samples.type),fill = 1:4,cex=1.2) ``` + + ## Comparing diversity of the environments ```{r} diff --git a/index.html b/index.html index eeb275f..078b191 100644 --- a/index.html +++ b/index.html @@ -4,7 +4,6 @@ Biodiversity metrics and metabarcoding - @@ -82,6 +81,10 @@ transition: opacity 0.6s ease-in 0.4s; opacity: 0; } +/* https://github.com/ropensci/plotly/pull/524#issuecomment-468142578 */ +slide:not(.current) .plotly.html-widget{ + display: block; +} @@ -106,14 +109,31 @@

Summary

    +
  • The MetabarSchool Package
  • What do the reading numbers per PCR mean?
  • -
  • Rarefaction vs. relative frequencies
  • +
  • Rarefaction vs. relative frequencies
  • alpha diversity metrics
  • beta diversity metrics
  • multidimentionnal analysis
  • comparison between datasets
+

The MetabarSchool Package

+ +

Instaling the package

+ +

You need the devtools package

+ +
install.packages("devtools",dependencies = TRUE)
+ +

Then you can install MetabarSchool

+ +
devtools::install_git("https://git.metabarcoding.org/MetabarcodingSchool/biodiversity-metrics.git")
+ +

You will also need the vegan package

+ +
install.packages("vegan",dependencies = TRUE)
+

The dataset

The mock community

@@ -609,14 +629,19 @@ Lotus corniculatus

The experiment

    -
  • 192 PCR of the mock community using SPER02 trnL-P6-Loop primers

  • +
  • 192 PCR of the mock community using SPER02 trnL-P6-Loop primers

    + +
    • 6 dilutions of the mock community: 1/1, 1/2, 1/4, 1/8, 1/16, 1/32

    • 32 repeats per dilution

    • +

Loading data

-
data("positive.count")
+
library(MetabarSchool)
+
+data("positive.count")
 data("positive.samples")
 data("positive.motus")
@@ -880,7 +905,9 @@ sample.TM_POS_d16_2_a_A1

Loading data

-
data("positive.count")
+
library(MetabarSchool)
+
+data("positive.count")
 data("positive.samples")
 data("positive.motus")
@@ -992,7 +1019,9 @@ sample.TM_POS_d16_1_b_A2

Loading data

-
data("positive.count")
+
library(MetabarSchool)
+
+data("positive.count")
 data("positive.samples")
 data("positive.motus")
@@ -1212,11 +1241,11 @@ positive.motus = positive.motus[are.not.singleton,]
  • positive.count is now a \(192 \; PCRs \; \times \; 5579 \; MOTUs\) matrix
  • -

    Not all the PCR have the number of reads

    +

    Not all the PCR have the same number of reads

    Despite all standardization efforts

    -

    +

    Is it related to the amount of DNA in the extract ?

    @@ -1227,7 +1256,7 @@ positive.motus = positive.motus[are.not.singleton,] boxplot(rowSums(positive.count) ~ positive.samples$dilution,log="y") abline(h = median(rowSums(positive.count)),lw=2,col="red",lty=2) -

    +

    @@ -1268,7 +1297,7 @@ Only 7.4% of the PCR read count variation is explain by dilution

    Rarefying read count (2)

    -

    +

    Rarefying read count (3)

    @@ -1284,16 +1313,16 @@ are.still.present[1:5]
    ## are.still.present
     ## FALSE  TRUE 
    -##  1942  3637
    +## 1886 3693

    Rarefying read count (4)

    par(bg=NA) 
     boxplot(colSums(positive.count) ~ are.still.present, log="y")
    -

    +

    -

    The MOTUs removed by rarefaction were at most occurring 21 times

    +

    The MOTUs removed by rarefaction were at most occurring 13 times

    The MOTUs kept by rarefaction were at least occurring 2 times

    @@ -1306,7 +1335,7 @@ positive.motus.rare = positive.motus[are.still.present,]
    -positive.motus.rare is now a \(192 \; PCRs \; \times \; 3637 \; MOTUs\) +positive.motus.rare is now a \(192 \; PCRs \; \times \; 3693 \; MOTUs\)
    @@ -1340,9 +1369,9 @@ positive.motus.rare is now a \(192 \; PCRs \; \times \; 3637 \; MOTUs\)



    Whittaker (2010)



      -
    • \(\alpha-diversity\) : Mean diversity per site (\(species/site\))

    • -
    • \(\gamma-diversity\) : Regional biodiversity (\(species/region\))

    • -
    • \(\beta-diversity\) : \(\beta = \frac{\gamma}{\alpha}\) (\(site\))

    • +
    • \(\alpha\text{-diversity}\) : Mean diversity per site (\(species/site\))

    • +
    • \(\gamma\text{-diversity}\) : Regional biodiversity (\(species/region\))

    • +
    • \(\beta\text{-diversity}\) : \(\beta = \frac{\gamma}{\alpha}\) (\(sites/region\))

    \(\alpha\)-diversity

    @@ -1583,10 +1612,10 @@ Environment.2 -

    Gini-Simpson's index

    +

    Gini-Simpson’s index

    -

    The Simpson's index is the probability of having the same species twice when you randomly select two specimens.

    +

    The Simpson’s index is the probability of having the same species twice when you randomly select two specimens.

    \[ @@ -1597,7 +1626,7 @@ Environment.2

    \(\lambda\) decrease when complexity of your ecosystem increase.

    -

    Gini-Simpson's index defined as \(1-\lambda\) increase with diversity

    +

    Gini-Simpson’s index defined as \(1-\lambda\) increase with diversity

    @@ -1663,24 +1692,20 @@ Environment.2 -

    Shanon entropy

    +

    Shannon entropy

    -
    -

    Shanon entropy is based on information theory.

    +

    Shannon entropy is based on information theory:

    -

    Let \(X\) be a uniformly distributed random variable with values in \(A\)

    +
    -

    \[ -H(X) = \log|A| +\(H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i}\) + +

    + +

    if \(A\) is a community where every species are equally represented then \[ +H(A) = \log|A| \]

    -


    - -
    -

    \[ -H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i} -\]

    -
    @@ -1701,7 +1726,7 @@ H^{\prime }=-\sum _{i=1}^{S}p_{i}\log p_{i} -Shanon.index +Shannon.index @@ -1747,15 +1772,15 @@ Environment.2 -

    Hill's number

    +

    Hill’s number

    As : \[ -H(X) = \log|A| \;\Rightarrow\; ^1D = e^{H(X)} +H(A) = \log|A| \;\Rightarrow\; ^1D = e^{H(A)} \]

    -

    where \(^1D\) is the theoretical number of species in a evenly distributed community that would have the same Shanon's entropy than ours.

    +

    where \(^1D\) is the theoretical number of species in a evenly distributed community that would have the same Shannon’s entropy than ours.

    @@ -1851,7 +1876,7 @@ Environment.2

    Impact of \(q\) on the log_q function

    -

    +

    And its inverse function

    @@ -1871,17 +1896,17 @@ Environment.2 (1 + (1-q)*x)^(1/(1-q)) } -

    Generalised Shanon entropy

    +

    Generalised Shannon entropy

    \[ -^qH = - \sum_{i=1}^S pi \times ^q\log pi +^qH = - \sum_{i=1}^S p_i \; ^q\log p_i \]

    H_q = function(x,q=1) {
       sum(x * log_q(1/x,q),na.rm = TRUE)
     }
    -

    and generalized the previously presented Hill's number

    +

    and generalized the previously presented Hill’s number

    \[ ^qD=^qe^{^qH} @@ -1908,22 +1933,22 @@ qs = seq(from=0,to=3,by=0.1) environments.hq = apply(environments,MARGIN = 1,H_spectrum,q=qs) environments.dq = apply(environments,MARGIN = 1,D_spectrum,q=qs) -

    +

    Generalized entropy \(vs\) \(\alpha\)-diversity indices

    • \(^0H(X) = S - 1\) : the richness minus one.

    • -
    • \(^1H(X) = H^{\prime}\) : the Shanon's entropy.

    • -
    • \(^2H(X) = 1 - \lambda\) : Gini-Simpson's index.

    • +
    • \(^1H(X) = H^{\prime}\) : the Shannon’s entropy.

    • +
    • \(^2H(X) = 1 - \lambda\) : Gini-Simpson’s index.

    -

    When computing the exponential of entropy : Hill's number

    +

    When computing the exponential of entropy : Hill’s number

    • \(^0D(X) = S\) : The richness.

    • \(^1D(X) = e^{H^{\prime}}\) : The number of species in an even community having the same \(H^{\prime}\).

    • -
    • \(^2D(X) = 1 / \lambda\) : The number of species in an even community having the same Gini-Simpson's index.

    • +
    • \(^2D(X) = 1 / \lambda\) : The number of species in an even community having the same Gini-Simpson’s index.


    @@ -1941,7 +1966,7 @@ environments.dq = apply(environments,MARGIN = 1,D_spectrum,q=qs)
    H.mock = H_spectrum(plants.16$dilution,qs)
     D.mock = D_spectrum(plants.16$dilution,qs)
    -

    +

    Biodiversity spectrum and metabarcoding (1)

    @@ -1950,11 +1975,11 @@ D.mock = D_spectrum(plants.16$dilution,qs) FUN = H_spectrum, q=qs) -

    +

    Biodiversity spectrum and metabarcoding (2)

    -

    +

    Biodiversity spectrum and metabarcoding (3)

    @@ -1963,7 +1988,7 @@ D.mock = D_spectrum(plants.16$dilution,qs) FUN = D_spectrum, q=qs) -

    +

    Impact of data cleaning on \(\alpha\)-diversity (1)

    @@ -1999,7 +2024,7 @@ positive.clean.H = apply(positive.clean.count.relfreq, FUN = H_spectrum, q=qs) -

    +

    Impact of data cleaning on \(\alpha\)-diversity (3)

    @@ -2008,7 +2033,7 @@ positive.clean.H = apply(positive.clean.count.relfreq, FUN = D_spectrum, q=qs) -

    +

    \(\beta\)-diversity

    @@ -2102,7 +2127,7 @@ d_c =& \max\limits_{1\leqslant i \leqslant n}\left|a_i - b_i\right| \\ \end{align} \]

    -

    For the fun… ;-)

    +

    For the fun… ;-)

    You can generalize those distances as a norm of order \(k\)

    @@ -2140,7 +2165,7 @@ d(x,z)\leq \max(d(x,y),d(y,z))
  • A metric induce a metric space
  • In a metric space rotations are isometries
  • This means that rotations are not changing distances between objects
  • -
  • Multidimensional scaling (PCA, PCoA, CoA…) are rotations
  • +
  • Multidimensional scaling (PCA, PCoA, CoA…) are rotations
  • The data set

    @@ -2167,7 +2192,7 @@ data("guiana.samples")
    s = tag_bad_pcr(guiana.samples$sample,guiana.count)
    -

    +

    guiana.count.clean = guiana.count[s$keep,]
     guiana.samples.clean = guiana.samples[s$keep,]
    @@ -2182,7 +2207,7 @@ guiana.samples.clean = guiana.samples[s$keep,]
    s = tag_bad_pcr(guiana.samples.clean$sample,guiana.count.clean)
    -

    +

    guiana.count.clean = guiana.count.clean[s$keep,]
     guiana.samples.clean = guiana.samples.clean[s$keep,]
    @@ -2197,7 +2222,7 @@ guiana.samples.clean = guiana.samples.clean[s$keep,]
    s = tag_bad_pcr(guiana.samples.clean$sample,guiana.count.clean)
    -

    +

    guiana.count.clean = guiana.count.clean[s$keep,]
     guiana.samples.clean = guiana.samples.clean[s$keep,]
    @@ -2264,7 +2289,7 @@ xy = xy[,1:2] xy.hellinger = decostand(xy,method = "hellinger")
    -

    +

    @@ -2284,15 +2309,15 @@ BC_{jk}=\frac{\sum _{i=1}^{p}(N_{ij} - min(N_{ij},N_{ik}) + (N_{ik} - min(N_{ij} \]

    \[ -BC_{jk}=\frac{\sum _{i=1}^{p}]N_{ij} - N_{ik}|}{\sum _{i=1}^{p}N_{ij}+\sum _{i=1}^{p}N_{ik}} +BC_{jk}=\frac{\sum _{i=1}^{p}|N_{ij} - N_{ik}|}{\sum _{i=1}^{p}N_{ij}+\sum _{i=1}^{p}N_{ik}} \]

    \[ -BC_{jk}=\frac{\sum _{i=1}^{p}]N_{ij} - N_{ik}|}{1+1} +BC_{jk}=\frac{\sum _{i=1}^{p}|N_{ij} - N_{ik}|}{1+1} \]

    \[ -BC_{jk}=\frac{1}{2}\sum _{i=1}^{p}]N_{ij} - N_{ik}| +BC_{jk}=\frac{1}{2}\sum _{i=1}^{p}|N_{ij} - N_{ik}| \]

    Principale coordinate analysis (1)

    @@ -2305,17 +2330,36 @@ guiana.jac.50.pcoa = cmdscale(guiana.jac.50.dist,k=3,eig = TRUE)

    Principale coordinate analysis (2)

    -

    +

    Principale composante analysis

    guiana.hellinger.pca = prcomp(guiana.hellinger.final,center = TRUE, scale. = FALSE)
    -

    +

    + +

    Comparing diversity of the environments

    -

    +

    Bibliography