mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-07 16:20:27 +00:00
319 lines
9.9 KiB
Plaintext
319 lines
9.9 KiB
Plaintext
```{r}
|
|
library(tidyverse)
|
|
```
|
|
|
|
```{r}
|
|
x <- sample(1:4096, 29, replace=TRUE)
|
|
```
|
|
|
|
```{r}
|
|
emax <- function(lseq,word_size) {
|
|
nword = lseq - word_size + 1
|
|
nalpha = 4^word_size
|
|
|
|
if (nalpha < nword) {
|
|
cov = nword %/% nalpha
|
|
remains = nword %% nalpha
|
|
f1 = cov/nword
|
|
f2 = (cov+1)/nword
|
|
print(c(nalpha - remains,f1,remains,f2))
|
|
e = -(nalpha - remains) * f1 * log(f1) -
|
|
remains * f2 * log(f2)
|
|
} else {
|
|
e = log(nword)
|
|
}
|
|
|
|
e
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
ec <- function(data,kmer_size) {
|
|
table <- table(data)
|
|
s <- sum(table)
|
|
e <- sum(table * log(table))/s
|
|
ed <- log(s) - e
|
|
|
|
em <- emax(s+kmer_size-1,kmer_size)
|
|
|
|
ed/em
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
ef <- function(data,kmer_size) {
|
|
table <- table(data)
|
|
s <- sum(table)
|
|
f <- table / s
|
|
|
|
f <- as.numeric(f)
|
|
f <- f[f > 0]
|
|
|
|
em <- emax(s+kmer_size-1,kmer_size)
|
|
ed <- -sum(f * log(f))
|
|
|
|
print(c(ed,em,ed/em))
|
|
|
|
ed/em
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
okmer <- function(data,kmer_size) {
|
|
str_sub(data,1:(nchar(data)-kmer_size+1)) %>%
|
|
str_sub(1,kmer_size)
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
# Normalisation circulaire: retourne le plus petit k-mer par rotation circulaire
|
|
normalize_circular <- function(kmer) {
|
|
if (nchar(kmer) == 0) return(kmer)
|
|
|
|
canonical <- kmer
|
|
n <- nchar(kmer)
|
|
|
|
# Tester toutes les rotations circulaires
|
|
for (i in 2:n) {
|
|
rotated <- paste0(str_sub(kmer, i, n), str_sub(kmer, 1, i-1))
|
|
if (rotated < canonical) {
|
|
canonical <- rotated
|
|
}
|
|
}
|
|
|
|
canonical
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
# Fonction totient d'Euler: compte le nombre d'entiers de 1 à n coprimes avec n
|
|
euler_totient <- function(n) {
|
|
if (n <= 0) return(0)
|
|
|
|
result <- n
|
|
p <- 2
|
|
|
|
# Traiter tous les facteurs premiers
|
|
while (p * p <= n) {
|
|
if (n %% p == 0) {
|
|
# Retirer toutes les occurrences de p
|
|
while (n %% p == 0) {
|
|
n <- n %/% p
|
|
}
|
|
# Appliquer la formule: φ(n) = n * (1 - 1/p)
|
|
result <- result - result %/% p
|
|
}
|
|
p <- p + 1
|
|
}
|
|
|
|
# Si n est toujours > 1, alors c'est un facteur premier
|
|
if (n > 1) {
|
|
result <- result - result %/% n
|
|
}
|
|
|
|
result
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
# Retourne tous les diviseurs de n
|
|
divisors <- function(n) {
|
|
if (n <= 0) return(integer(0))
|
|
|
|
divs <- c()
|
|
i <- 1
|
|
while (i * i <= n) {
|
|
if (n %% i == 0) {
|
|
divs <- c(divs, i)
|
|
if (i != n %/% i) {
|
|
divs <- c(divs, n %/% i)
|
|
}
|
|
}
|
|
i <- i + 1
|
|
}
|
|
|
|
sort(divs)
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
# Compte le nombre de colliers (necklaces) distincts de longueur n
|
|
# sur un alphabet de taille a en utilisant la formule de Moreau:
|
|
# N(n, a) = (1/n) * Σ φ(d) * a^(n/d)
|
|
# où la somme est sur tous les diviseurs d de n, et φ est la fonction totient d'Euler
|
|
necklace_count <- function(n, alphabet_size) {
|
|
if (n <= 0) return(0)
|
|
|
|
divs <- divisors(n)
|
|
sum_val <- 0
|
|
|
|
for (d in divs) {
|
|
# Calculer alphabet_size^(n/d)
|
|
power <- alphabet_size^(n %/% d)
|
|
sum_val <- sum_val + euler_totient(d) * power
|
|
}
|
|
|
|
sum_val %/% n
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
# Nombre de classes d'équivalence pour les k-mers normalisés
|
|
# Utilise la formule exacte de Moreau pour compter les colliers (necklaces)
|
|
n_normalized_kmers <- function(kmer_size) {
|
|
# Valeurs exactes pré-calculées pour k=1 à 6
|
|
if (kmer_size == 1) return(4)
|
|
if (kmer_size == 2) return(10)
|
|
if (kmer_size == 3) return(24)
|
|
if (kmer_size == 4) return(70)
|
|
if (kmer_size == 5) return(208)
|
|
if (kmer_size == 6) return(700)
|
|
|
|
# Pour k > 6, utiliser la formule de Moreau (exacte)
|
|
# Alphabet ADN a 4 bases
|
|
necklace_count(kmer_size, 4)
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
# Entropie maximale pour k-mers normalisés
|
|
enmax <- function(lseq, word_size) {
|
|
nword = lseq - word_size + 1
|
|
nalpha = n_normalized_kmers(word_size)
|
|
|
|
if (nalpha < nword) {
|
|
cov = nword %/% nalpha
|
|
remains = nword %% nalpha
|
|
f1 = cov/nword
|
|
f2 = (cov+1)/nword
|
|
e = -(nalpha - remains) * f1 * log(f1) -
|
|
remains * f2 * log(f2)
|
|
} else {
|
|
e = log(nword)
|
|
}
|
|
|
|
e
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
# Entropie normalisée avec normalisation circulaire des k-mers
|
|
ecn <- function(data, kmer_size) {
|
|
# Normaliser tous les k-mers
|
|
normalized_data <- sapply(data, normalize_circular)
|
|
|
|
# Calculer la table des fréquences
|
|
table <- table(normalized_data)
|
|
s <- sum(table)
|
|
e <- sum(table * log(table))/s
|
|
ed <- log(s) - e
|
|
|
|
# Entropie maximale avec normalisation
|
|
em <- enmax(s + kmer_size - 1, kmer_size)
|
|
|
|
ed/em
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
k<-'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
|
|
ec(okmer(k,1),1)
|
|
ec(okmer(k,2),2)
|
|
ec(okmer(k,3),3)
|
|
ec(okmer(k,4),4)
|
|
```
|
|
|
|
```{r}
|
|
k<-'atatatatatatatatatatatatatatata'
|
|
ef(okmer(k,1),1)
|
|
ef(okmer(k,2),2)
|
|
ef(okmer(k,3),3)
|
|
ef(okmer(k,4),4)
|
|
```
|
|
|
|
```{r}
|
|
k<-'aaaaaaaaaaaaaaaattttttttttttttt'
|
|
ef(okmer(k,1),1)
|
|
ef(okmer(k,2),2)
|
|
ef(okmer(k,3),3)
|
|
ef(okmer(k,4),4)
|
|
```
|
|
|
|
```{r}
|
|
k<-'atgatgatgatgatgatgatgatgatgatga'
|
|
ef(okmer(k,1),1)
|
|
ef(okmer(k,2),2)
|
|
ef(okmer(k,3),3)
|
|
ef(okmer(k,4),4)
|
|
```
|
|
|
|
```{r}
|
|
k<-'atcgatcgatcgatcgatcgatcgatcgact'
|
|
ecn(okmer(k,1),1)
|
|
ecn(okmer(k,2),2)
|
|
ecn(okmer(k,3),3)
|
|
ecn(okmer(k,4),4)
|
|
```
|
|
|
|
```{r}
|
|
k<-paste(sample(rep(c("a","c","g","t"),8),31),collapse="")
|
|
k <- "actatggcaagtcgtaaccgcgcttatcagg"
|
|
ecn(okmer(k,1),1)
|
|
ecn(okmer(k,2),2)
|
|
ecn(okmer(k,3),3)
|
|
ecn(okmer(k,4),4)
|
|
```
|
|
|
|
aattaaaaaaacaagataaaataatattttt
|
|
|
|
```{r}
|
|
k<-'aattaaaaaaacaagataaaataatattttt'
|
|
ecn(okmer(k,1),1)
|
|
ecn(okmer(k,2),2)
|
|
ecn(okmer(k,3),3)
|
|
ecn(okmer(k,4),4)
|
|
```
|
|
|
|
atg tga gat ,,,,
|
|
|
|
cat tca atc
|
|
|
|
tgatgatgatgatgatgatgatgatgatg
|
|
|
|
## Tests de normalisation circulaire
|
|
|
|
```{r}
|
|
# Test de la fonction de normalisation
|
|
normalize_circular("ca") # devrait donner "ac"
|
|
normalize_circular("tgca") # devrait donner "atgc"
|
|
normalize_circular("acgt") # devrait donner "acgt"
|
|
```
|
|
|
|
```{r}
|
|
# Comparaison ec vs ecn sur une séquence répétitive
|
|
# Les k-mers "atg", "tga", "gat" sont équivalents par rotation
|
|
k <- 'atgatgatgatgatgatgatgatgatgatga'
|
|
cat("Séquence:", k, "\n")
|
|
cat("ec(k,3) =", ec(okmer(k,3),3), "\n")
|
|
cat("ecn(k,3) =", ecn(okmer(k,3),3), "\n")
|
|
```
|
|
|
|
```{r}
|
|
# Comparaison sur séquence aléatoire
|
|
k <- "actatggcaagtcgtaaccgcgcttatcagg"
|
|
cat("Séquence:", k, "\n")
|
|
cat("Sans normalisation:\n")
|
|
cat(" ec(k,2) =", ec(okmer(k,2),2), "\n")
|
|
cat(" ec(k,3) =", ec(okmer(k,3),3), "\n")
|
|
cat(" ec(k,4) =", ec(okmer(k,4),4), "\n")
|
|
cat("Avec normalisation circulaire:\n")
|
|
cat(" ecn(k,2) =", ecn(okmer(k,2),2), "\n")
|
|
cat(" ecn(k,3) =", ecn(okmer(k,3),3), "\n")
|
|
cat(" ecn(k,4) =", ecn(okmer(k,4),4), "\n")
|
|
```
|
|
|
|
```{r}
|
|
re <- rev(c(0.8108602271901116,0.8108602271901116,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.7800272339058549,0.7800272339058549,0.7751610144606091,0.7751610144606091,0.7751610144606091,0.764858185548322,0.7325526601302021,0.7137620699527615,0.6789199521982864,0.6584536373623372,0.634002687184193,0.6075290415873623,0.5785545803330997,0.5785545803330997,0.5503220289212184,0.5315314387437778,0.4966893209893028,0.46077361820145696,0.42388221293245526,0.4009547969713408,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.390029016052137,0.42781461756157363,0.45192285937059073,0.47238917420654,0.47238917420654,0.47238917420654,0.5092805794755417,0.5451962822633876,0.5800384000178626,0.602395141014297,0.6046146614886381,0.6046146614886381,0.6119084258128231,0.6119084258128231,0.6214217106113492,0.6424704346756562,0.6482381543085467,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6828444721058894,0.6950205907027562,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.7208976112999935))
|
|
|
|
di <- c(0.7208976112999935,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6950205907027562,0.6828444721058894,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6482381543085467,0.6424704346756562,0.6214217106113492,0.6119084258128231,0.6119084258128231,0.6046146614886382,0.6046146614886382,0.6023951410142971,0.5800384000178627,0.5451962822633876,0.5092805794755418,0.47238917420654003,0.47238917420654003,0.47238917420654003,0.4519228593705908,0.4278146175615737,0.39002901605213713,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677883,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.40095479697134073,0.42388221293245526,0.46077361820145696,0.4966893209893028,0.5315314387437778,0.5503220289212184,0.5785545803330997,0.5785545803330997,0.6075290415873625,0.6340026871841933,0.6584536373623374,0.6789199521982866,0.7137620699527616,0.7325526601302023,0.7648581855483221,0.7751610144606093,0.7751610144606093,0.7751610144606093,0.7800272339058549,0.7800272339058549,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8108602271901116,0.8108602271901116)
|
|
``` |