mirror of https://github.com/metabarcoding/obitools4.git (synced 2025-12-07 16:20:27 +00:00)

Commit: implementation of obilowmask
.gitignore (vendored): 4 lines added

@@ -16,6 +16,7 @@
 **/*.tgz
 **/*.yaml
 **/*.csv
+xx

 .rhistory
 /.vscode
@@ -27,3 +28,6 @@
 !/obitests/**
 !/sample/**
 LLM/**
+*_files
+
+entropy.html
cmd/obitools/obilowmask/main.go: new file, 47 lines

@@ -0,0 +1,47 @@
package main

import (
	"os"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obilowmask"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)

func main() {

	defer obiseq.LogBioSeqStatus()

	// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
	// f, err := os.Create("cpu.pprof")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// pprof.StartCPUProfile(f)
	// defer pprof.StopCPUProfile()

	// go tool trace cpu.trace
	// ftrace, err := os.Create("cpu.trace")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// trace.Start(ftrace)
	// defer trace.Stop()

	optionParser := obioptions.GenerateOptionParser(
		"obilowmask",
		"masks low complexity regions in a sequence file",
		obilowmask.OptionSet)

	_, args := optionParser(os.Args)

	sequences, err := obiconvert.CLIReadBioSequences(args...)
	obiconvert.OpenSequenceDataErrorMessage(args, err)

	selected := obilowmask.CLISequenceEntropyMasker(sequences)
	obiconvert.CLIWriteBioSequences(selected, true)
	obiutils.WaitForLastPipe()
}
@@ -23,6 +23,10 @@ var __single_base_code__ = []byte{0,
 	0, 0, 0,
 }
+
+func EncodeNucleotide(b byte) byte {
+	return __single_base_code__[b&31]
+}

 // Encode4mer transforms an obiseq.BioSequence into a sequence
 // of kmer of length 4. Each letter of the sequence not belonging to
 // A, C, G, T, U is considered as an A. The kmer is encoded as a byte
@@ -54,7 +58,7 @@ func Encode4mer(seq *obiseq.BioSequence, buffer *[]byte) []byte {
 	code = 0
 	for ; i < 4; i++ {
 		code <<= 2
-		code += __single_base_code__[rawseq[i]&31]
+		code += EncodeNucleotide(rawseq[i])
 	}

 	*buffer = append((*buffer), code)
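The new EncodeNucleotide helper exploits a convenient property of ASCII: b & 31 maps every letter to its alphabet position regardless of case ('a' and 'A' both give 1, 'c'/'C' give 3, and so on), so a single 32-entry table serves both lowercase and uppercase input. The values of the real __single_base_code__ table are only partially visible in this diff, so the following is a standalone sketch of the idea with a locally built table; the treatment of 'u' as 't' is an assumption.

package main

import "fmt"

// Illustrative stand-in for __single_base_code__: a 32-entry table indexed
// by ASCII&31, mapping a=0, c=1, g=2, t=3 (u assumed to behave like t);
// every other letter falls back to 0, i.e. is treated as an a.
var code [32]byte

func init() {
	code['c'&31] = 1
	code['g'&31] = 2
	code['t'&31] = 3
	code['u'&31] = 3
}

func encodeNucleotide(b byte) byte {
	return code[b&31] // b&31 erases the ASCII case bit, so 'A' and 'a' share an entry
}

func main() {
	for _, b := range []byte("acgtACGT") {
		fmt.Printf("%c=%d ", b, encodeNucleotide(b))
	}
	fmt.Println() // a=0 c=1 g=2 t=3 A=0 C=1 G=2 T=3
}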
pkg/obikmer/kmernorm.go: new file, 5503 lines (diff not shown, file too large)
pkg/obikmer/kmernorm_test.go: new file, 77 lines

@@ -0,0 +1,77 @@
package obikmer

import "testing"

func TestNormalize(t *testing.T) {
	tests := []struct {
		name     string
		kmer     string
		expected string
	}{
		// Tests with k=1
		{"k=1 a", "a", "a"},
		{"k=1 c", "c", "c"},

		// Tests with k=2
		{"k=2 ca", "ca", "ac"},
		{"k=2 ac", "ac", "ac"},

		// Tests with k=4
		{"k=4 acgt", "acgt", "acgt"},
		{"k=4 cgta", "cgta", "acgt"},
		{"k=4 gtac", "gtac", "acgt"},
		{"k=4 tacg", "tacg", "acgt"},
		{"k=4 tgca", "tgca", "atgc"},

		// Tests with k=6
		{"k=6 aaaaaa", "aaaaaa", "aaaaaa"},
		{"k=6 tttttt", "tttttt", "tttttt"},

		// Tests with k>6 (computed on the fly)
		{"k=7 aaaaaaa", "aaaaaaa", "aaaaaaa"},
		{"k=7 tgcatgc", "tgcatgc", "atgctgc"},
		{"k=7 gcatgct", "gcatgct", "atgctgc"},
		{"k=8 acgtacgt", "acgtacgt", "acgtacgt"},
		{"k=8 gtacgtac", "gtacgtac", "acgtacgt"},
		{"k=10 acgtacgtac", "acgtacgtac", "acacgtacgt"},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := Normalize(tt.kmer)
			if result != tt.expected {
				t.Errorf("Normalize(%q) = %q, want %q", tt.kmer, result, tt.expected)
			}
		})
	}
}

func TestNormalizeTableConsistency(t *testing.T) {
	// Check that every k-mer in the table maps to the correct result
	// by comparing against the on-the-fly computation
	for kmer, expected := range LexicographicNormalization {
		calculated := getCanonicalCircular(kmer)
		if calculated != expected {
			t.Errorf("Table inconsistency for %q: table=%q, calculated=%q",
				kmer, expected, calculated)
		}
	}
}

func BenchmarkNormalizeSmall(b *testing.B) {
	// Benchmark for k<=6 (uses the lookup table)
	kmer := "acgtac"
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = Normalize(kmer)
	}
}

func BenchmarkNormalizeLarge(b *testing.B) {
	// Benchmark for k>6 (computed on the fly)
	kmer := "acgtacgtac"
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = Normalize(kmer)
	}
}
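The tests above rely on getCanonicalCircular, defined in the (suppressed) kmernorm.go. The canonical form being tested is the lexicographically smallest circular rotation of the k-mer. A minimal, naive O(k²) reference of that operation, offered only as an illustration and not as the shipped implementation, could look like this:

package main

import "fmt"

// canonicalCircular returns the lexicographically smallest circular
// rotation of kmer; a brute-force reference for the tested behaviour.
func canonicalCircular(kmer string) string {
	canonical := kmer
	for i := 1; i < len(kmer); i++ {
		if r := kmer[i:] + kmer[:i]; r < canonical {
			canonical = r
		}
	}
	return canonical
}

func main() {
	fmt.Println(canonicalCircular("tgcatgc")) // atgctgc, matching the k=7 test case
}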
pkg/obikmer/kmernormint.go: new file, 5670 lines (diff not shown, file too large)

pkg/obikmer/kmernormint_test.go: new file, 357 lines

@@ -0,0 +1,357 @@
package obikmer

import (
	"fmt"
	"testing"
)

func TestEncodeDecodeKmer(t *testing.T) {
	tests := []struct {
		kmer string
		code int
	}{
		{"a", 0},
		{"c", 1},
		{"g", 2},
		{"t", 3},
		{"aa", 0},
		{"ac", 1},
		{"ca", 4},
		{"acgt", 27},  // 0b00011011
		{"cgta", 108}, // 0b01101100
		{"tttt", 255}, // 0b11111111
	}

	for _, tt := range tests {
		t.Run(tt.kmer, func(t *testing.T) {
			// Test encoding
			encoded := EncodeKmer(tt.kmer)
			if encoded != tt.code {
				t.Errorf("EncodeKmer(%q) = %d, want %d", tt.kmer, encoded, tt.code)
			}

			// Test decoding
			decoded := DecodeKmer(tt.code, len(tt.kmer))
			if decoded != tt.kmer {
				t.Errorf("DecodeKmer(%d, %d) = %q, want %q", tt.code, len(tt.kmer), decoded, tt.kmer)
			}
		})
	}
}

func TestNormalizeInt(t *testing.T) {
	tests := []struct {
		name     string
		kmer     string
		expected string
	}{
		// Tests with k=1
		{"k=1 a", "a", "a"},
		{"k=1 c", "c", "c"},

		// Tests with k=2
		{"k=2 ca", "ca", "ac"},
		{"k=2 ac", "ac", "ac"},
		{"k=2 ta", "ta", "at"},

		// Tests with k=4: all rotations of "acgt"
		{"k=4 acgt", "acgt", "acgt"},
		{"k=4 cgta", "cgta", "acgt"},
		{"k=4 gtac", "gtac", "acgt"},
		{"k=4 tacg", "tacg", "acgt"},

		// Tests with k=4: rotations of "tgca"
		{"k=4 tgca", "tgca", "atgc"},
		{"k=4 gcat", "gcat", "atgc"},
		{"k=4 catg", "catg", "atgc"},
		{"k=4 atgc", "atgc", "atgc"},

		// Tests with k=3: rotations of "atg"
		{"k=3 atg", "atg", "atg"},
		{"k=3 tga", "tga", "atg"},
		{"k=3 gat", "gat", "atg"},

		// Tests with k=6
		{"k=6 aaaaaa", "aaaaaa", "aaaaaa"},
		{"k=6 tttttt", "tttttt", "tttttt"},

		// Tests with k>6 (computed on the fly)
		{"k=7 aaaaaaa", "aaaaaaa", "aaaaaaa"},
		{"k=7 tgcatgc", "tgcatgc", "atgctgc"},
		{"k=7 gcatgct", "gcatgct", "atgctgc"},
		{"k=8 acgtacgt", "acgtacgt", "acgtacgt"},
		{"k=8 gtacgtac", "gtacgtac", "acgtacgt"},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			kmerCode := EncodeKmer(tt.kmer)
			expectedCode := EncodeKmer(tt.expected)

			result := NormalizeInt(kmerCode, len(tt.kmer))

			if result != expectedCode {
				resultKmer := DecodeKmer(result, len(tt.kmer))
				t.Errorf("NormalizeInt(%q) = %q (code %d), want %q (code %d)",
					tt.kmer, resultKmer, result, tt.expected, expectedCode)
			}
		})
	}
}

func TestNormalizeIntConsistencyWithString(t *testing.T) {
	// Check that NormalizeInt gives the same result as Normalize
	// for every k-mer of size 1 to 4 (kept small so the tests stay fast)
	bases := []byte{'a', 'c', 'g', 't'}

	var testKmers func(current string, maxSize int)
	testKmers = func(current string, maxSize int) {
		if len(current) > 0 {
			// Test normalization
			normalizedStr := Normalize(current)
			normalizedStrCode := EncodeKmer(normalizedStr)

			kmerCode := EncodeKmer(current)
			normalizedInt := NormalizeInt(kmerCode, len(current))

			if normalizedInt != normalizedStrCode {
				normalizedIntStr := DecodeKmer(normalizedInt, len(current))
				t.Errorf("Inconsistency for %q: Normalize=%q (code %d), NormalizeInt=%q (code %d)",
					current, normalizedStr, normalizedStrCode, normalizedIntStr, normalizedInt)
			}
		}

		if len(current) < maxSize {
			for _, base := range bases {
				testKmers(current+string(base), maxSize)
			}
		}
	}

	testKmers("", 4) // Test up to k=4 to keep the runtime reasonable
}

func TestCircularRotations(t *testing.T) {
	// Check that all circular rotations yield the same canonical form
	tests := []struct {
		kmers     []string
		canonical string
	}{
		{[]string{"atg", "tga", "gat"}, "atg"},
		{[]string{"acgt", "cgta", "gtac", "tacg"}, "acgt"},
		{[]string{"tgca", "gcat", "catg", "atgc"}, "atgc"},
	}

	for _, tt := range tests {
		canonicalCode := EncodeKmer(tt.canonical)

		for _, kmer := range tt.kmers {
			kmerCode := EncodeKmer(kmer)
			result := NormalizeInt(kmerCode, len(kmer))

			if result != canonicalCode {
				resultKmer := DecodeKmer(result, len(kmer))
				t.Errorf("NormalizeInt(%q) = %q, want %q", kmer, resultKmer, tt.canonical)
			}
		}
	}
}

func BenchmarkNormalizeIntSmall(b *testing.B) {
	// Benchmark for k<=6 (uses the lookup table)
	kmer := "acgtac"
	kmerCode := EncodeKmer(kmer)
	kmerSize := len(kmer)

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = NormalizeInt(kmerCode, kmerSize)
	}
}

func BenchmarkNormalizeIntLarge(b *testing.B) {
	// Benchmark for k>6 (computed on the fly)
	kmer := "acgtacgtac"
	kmerCode := EncodeKmer(kmer)
	kmerSize := len(kmer)

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = NormalizeInt(kmerCode, kmerSize)
	}
}

func BenchmarkEncodeKmer(b *testing.B) {
	kmer := "acgtacgt"

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = EncodeKmer(kmer)
	}
}

func TestCanonicalKmerCount(t *testing.T) {
	// Test exact counts for k=1 to 6
	tests := []struct {
		k        int
		expected int
	}{
		{1, 4},
		{2, 10},
		{3, 24},
		{4, 70},
		{5, 208},
		{6, 700},
	}

	for _, tt := range tests {
		t.Run(fmt.Sprintf("k=%d", tt.k), func(t *testing.T) {
			result := CanonicalKmerCount(tt.k)
			if result != tt.expected {
				t.Errorf("CanonicalKmerCount(%d) = %d, want %d", tt.k, result, tt.expected)
			}
		})
	}

	// Verify counts match table sizes
	for k := 1; k <= 6; k++ {
		// Count unique canonical codes in the table
		uniqueCodes := make(map[int]bool)
		for _, canonicalCode := range LexicographicNormalizationInt[k] {
			uniqueCodes[canonicalCode] = true
		}

		expected := len(uniqueCodes)
		result := CanonicalKmerCount(k)

		if result != expected {
			t.Errorf("CanonicalKmerCount(%d) = %d, but table has %d unique canonical codes",
				k, result, expected)
		}
	}
}

func TestNecklaceCountFormula(t *testing.T) {
	// Verify that Moreau's formula reproduces the hardcoded values for k=1 to 6
	// (exact values for k=7+ are computed by the formula itself)
	tests := []struct {
		k        int
		expected int
	}{
		{1, 4},
		{2, 10},
		{3, 24},
		{4, 70},
		{5, 208},
		{6, 700},
	}

	for _, tt := range tests {
		t.Run(fmt.Sprintf("k=%d", tt.k), func(t *testing.T) {
			result := necklaceCount(tt.k, 4)
			if result != tt.expected {
				t.Errorf("necklaceCount(%d, 4) = %d, want %d", tt.k, result, tt.expected)
			}
		})
	}
}

func TestNecklaceCountByBruteForce(t *testing.T) {
	// Verify the necklace count for k=7 and k=8 by brute force:
	// generate all 4^k k-mers and count the unique normalized ones
	bases := []byte{'a', 'c', 'g', 't'}

	for _, k := range []int{7, 8} {
		t.Run(fmt.Sprintf("k=%d", k), func(t *testing.T) {
			unique := make(map[int]bool)

			// Generate all possible k-mers
			var generate func(current int, depth int)
			generate = func(current int, depth int) {
				if depth == k {
					// Normalize and add to the set
					normalized := NormalizeInt(current, k)
					unique[normalized] = true
					return
				}

				for _, base := range bases {
					newCode := (current << 2) | int(EncodeNucleotide(base))
					generate(newCode, depth+1)
				}
			}

			generate(0, 0)

			bruteForceCount := len(unique)
			formulaCount := necklaceCount(k, 4)

			if bruteForceCount != formulaCount {
				t.Errorf("For k=%d: brute force count = %d, formula count = %d",
					k, bruteForceCount, formulaCount)
			}

			t.Logf("k=%d: unique canonical k-mers = %d (formula matches brute force)", k, bruteForceCount)
		})
	}
}

func TestEulerTotient(t *testing.T) {
	tests := []struct {
		n        int
		expected int
	}{
		{1, 1},
		{2, 1},
		{3, 2},
		{4, 2},
		{5, 4},
		{6, 2},
		{7, 6},
		{8, 4},
		{9, 6},
		{10, 4},
		{12, 4},
		{15, 8},
		{20, 8},
	}

	for _, tt := range tests {
		t.Run(fmt.Sprintf("φ(%d)", tt.n), func(t *testing.T) {
			result := eulerTotient(tt.n)
			if result != tt.expected {
				t.Errorf("eulerTotient(%d) = %d, want %d", tt.n, result, tt.expected)
			}
		})
	}
}

func TestDivisors(t *testing.T) {
	tests := []struct {
		n        int
		expected []int
	}{
		{1, []int{1}},
		{2, []int{1, 2}},
		{6, []int{1, 2, 3, 6}},
		{12, []int{1, 2, 3, 4, 6, 12}},
		{15, []int{1, 3, 5, 15}},
		{20, []int{1, 2, 4, 5, 10, 20}},
	}

	for _, tt := range tests {
		t.Run(fmt.Sprintf("divisors(%d)", tt.n), func(t *testing.T) {
			result := divisors(tt.n)
			if len(result) != len(tt.expected) {
				t.Errorf("divisors(%d) = %v, want %v", tt.n, result, tt.expected)
				return
			}
			for i := range result {
				if result[i] != tt.expected[i] {
					t.Errorf("divisors(%d) = %v, want %v", tt.n, result, tt.expected)
					return
				}
			}
		})
	}
}
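The helpers eulerTotient, divisors, and necklaceCount exercised above live in the (suppressed) kmernormint.go. Moreau's necklace-counting formula, N(n, a) = (1/n) * Σ_{d|n} φ(d) * a^(n/d), counts the circular-rotation equivalence classes that NormalizeInt collapses k-mers into. A self-contained sketch consistent with the values the tests check (N(6, 4) = 700, and by brute force N(7, 4) = 2344, N(8, 4) = 8230); the names mirror the tests, but this is only an illustration, not the shipped implementation:

package main

import "fmt"

// eulerTotient computes φ(n) by trial division over the prime factors.
func eulerTotient(n int) int {
	result := n
	for p := 2; p*p <= n; p++ {
		if n%p == 0 {
			for n%p == 0 {
				n /= p
			}
			result -= result / p // φ(n) = n * Π(1 - 1/p)
		}
	}
	if n > 1 {
		result -= result / n
	}
	return result
}

// necklaceCount applies Moreau's formula N(n, a) = (1/n) Σ_{d|n} φ(d)*a^(n/d).
func necklaceCount(n, alphabetSize int) int {
	sum := 0
	for d := 1; d <= n; d++ {
		if n%d == 0 {
			power := 1
			for i := 0; i < n/d; i++ {
				power *= alphabetSize
			}
			sum += eulerTotient(d) * power
		}
	}
	return sum / n
}

func main() {
	for k := 1; k <= 8; k++ {
		fmt.Printf("N(%d, 4) = %d\n", k, necklaceCount(k, 4))
	}
	// Prints 4, 10, 24, 70, 208, 700, 2344, 8230; the first six match
	// the hardcoded values checked in TestNecklaceCountFormula.
}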
@@ -8,7 +8,7 @@ import (
 // corresponds to the last commit, and not the one when the file will be
 // committed

-var _Commit = "7500ee1"
+var _Commit = "07cdd6f"
 var _Version = "Release 4.4.0"

 // Version returns the version of the obitools package.
pkg/obiseq/kmers.go: new file, 23 lines

@@ -0,0 +1,23 @@
package obiseq

import "iter"

func (seq *BioSequence) Kmers(k int) iter.Seq[[]byte] {
	return func(yield func([]byte) bool) {
		// Handle an invalid k or a sequence that is too short
		if k <= 0 || k > len(seq.sequence) {
			return
		}

		// Iterate over every possible k-mer
		for i := 0; i <= len(seq.sequence)-k; i++ {
			// Extract the current k-mer
			kmer := seq.sequence[i : i+k]

			// Hand it to the consumer and stop if asked to
			if !yield(kmer) {
				return
			}
		}
	}
}
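Kmers returns a Go 1.23-style push iterator, so callers can range over the windows directly. A standalone sketch of the same sliding-window pattern over a plain byte slice (kmersOf is a hypothetical stand-in, written this way to avoid depending on BioSequence internals):

package main

import (
	"fmt"
	"iter"
)

// kmersOf mirrors BioSequence.Kmers for a plain byte slice.
func kmersOf(s []byte, k int) iter.Seq[[]byte] {
	return func(yield func([]byte) bool) {
		if k <= 0 || k > len(s) {
			return
		}
		for i := 0; i <= len(s)-k; i++ {
			if !yield(s[i : i+k]) { // windows alias the underlying slice
				return
			}
		}
	}
}

func main() {
	for kmer := range kmersOf([]byte("acgtac"), 4) {
		fmt.Println(string(kmer)) // acgt, cgta, gtac
	}
}

Note that each yielded window aliases the underlying storage, exactly as in Kmers; a consumer that keeps k-mers beyond one iteration must copy them.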
pkg/obitools/obilowmask/entropy.qmd: new file, 319 lines

@@ -0,0 +1,319 @@
```{r}
library(tidyverse)
```

```{r}
x <- sample(1:4096, 29, replace=TRUE)
```

```{r}
emax <- function(lseq, word_size) {
  nword = lseq - word_size + 1
  nalpha = 4^word_size

  if (nalpha < nword) {
    cov = nword %/% nalpha
    remains = nword %% nalpha
    f1 = cov/nword
    f2 = (cov+1)/nword
    print(c(nalpha - remains, f1, remains, f2))
    e = -(nalpha - remains) * f1 * log(f1) -
        remains * f2 * log(f2)
  } else {
    e = log(nword)
  }

  e
}
```

```{r}
ec <- function(data, kmer_size) {
  table <- table(data)
  s <- sum(table)
  e <- sum(table * log(table))/s
  ed <- log(s) - e

  em <- emax(s+kmer_size-1, kmer_size)

  ed/em
}
```

```{r}
ef <- function(data, kmer_size) {
  table <- table(data)
  s <- sum(table)
  f <- table / s

  f <- as.numeric(f)
  f <- f[f > 0]

  em <- emax(s+kmer_size-1, kmer_size)
  ed <- -sum(f * log(f))

  print(c(ed, em, ed/em))

  ed/em
}
```

```{r}
okmer <- function(data, kmer_size) {
  str_sub(data, 1:(nchar(data)-kmer_size+1)) %>%
    str_sub(1, kmer_size)
}
```

```{r}
# Circular normalization: returns the smallest k-mer over all circular rotations
normalize_circular <- function(kmer) {
  if (nchar(kmer) == 0) return(kmer)

  canonical <- kmer
  n <- nchar(kmer)

  # Try every circular rotation
  for (i in 2:n) {
    rotated <- paste0(str_sub(kmer, i, n), str_sub(kmer, 1, i-1))
    if (rotated < canonical) {
      canonical <- rotated
    }
  }

  canonical
}
```

```{r}
# Euler's totient function: counts the integers from 1 to n coprime with n
euler_totient <- function(n) {
  if (n <= 0) return(0)

  result <- n
  p <- 2

  # Process every prime factor
  while (p * p <= n) {
    if (n %% p == 0) {
      # Strip every occurrence of p
      while (n %% p == 0) {
        n <- n %/% p
      }
      # Apply the formula: φ(n) = n * (1 - 1/p)
      result <- result - result %/% p
    }
    p <- p + 1
  }

  # If n is still > 1, it is itself a prime factor
  if (n > 1) {
    result <- result - result %/% n
  }

  result
}
```

```{r}
# Returns all divisors of n
divisors <- function(n) {
  if (n <= 0) return(integer(0))

  divs <- c()
  i <- 1
  while (i * i <= n) {
    if (n %% i == 0) {
      divs <- c(divs, i)
      if (i != n %/% i) {
        divs <- c(divs, n %/% i)
      }
    }
    i <- i + 1
  }

  sort(divs)
}
```

```{r}
# Counts the number of distinct necklaces of length n over an alphabet
# of size a using Moreau's formula:
# N(n, a) = (1/n) * Σ φ(d) * a^(n/d)
# where the sum runs over all divisors d of n, and φ is Euler's totient function
necklace_count <- function(n, alphabet_size) {
  if (n <= 0) return(0)

  divs <- divisors(n)
  sum_val <- 0

  for (d in divs) {
    # Compute alphabet_size^(n/d)
    power <- alphabet_size^(n %/% d)
    sum_val <- sum_val + euler_totient(d) * power
  }

  sum_val %/% n
}
```

```{r}
# Number of equivalence classes for normalized k-mers.
# Uses Moreau's exact necklace-counting formula.
n_normalized_kmers <- function(kmer_size) {
  # Exact precomputed values for k=1 to 6
  if (kmer_size == 1) return(4)
  if (kmer_size == 2) return(10)
  if (kmer_size == 3) return(24)
  if (kmer_size == 4) return(70)
  if (kmer_size == 5) return(208)
  if (kmer_size == 6) return(700)

  # For k > 6, use Moreau's (exact) formula;
  # the DNA alphabet has 4 bases
  necklace_count(kmer_size, 4)
}
```

```{r}
# Maximum entropy for normalized k-mers
enmax <- function(lseq, word_size) {
  nword = lseq - word_size + 1
  nalpha = n_normalized_kmers(word_size)

  if (nalpha < nword) {
    cov = nword %/% nalpha
    remains = nword %% nalpha
    f1 = cov/nword
    f2 = (cov+1)/nword
    e = -(nalpha - remains) * f1 * log(f1) -
        remains * f2 * log(f2)
  } else {
    e = log(nword)
  }

  e
}
```

```{r}
# Normalized entropy with circular normalization of the k-mers
ecn <- function(data, kmer_size) {
  # Normalize every k-mer
  normalized_data <- sapply(data, normalize_circular)

  # Build the frequency table
  table <- table(normalized_data)
  s <- sum(table)
  e <- sum(table * log(table))/s
  ed <- log(s) - e

  # Maximum entropy under normalization
  em <- enmax(s + kmer_size - 1, kmer_size)

  ed/em
}
```

```{r}
k <- 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
ec(okmer(k,1),1)
ec(okmer(k,2),2)
ec(okmer(k,3),3)
ec(okmer(k,4),4)
```

```{r}
k <- 'atatatatatatatatatatatatatatata'
ef(okmer(k,1),1)
ef(okmer(k,2),2)
ef(okmer(k,3),3)
ef(okmer(k,4),4)
```

```{r}
k <- 'aaaaaaaaaaaaaaaattttttttttttttt'
ef(okmer(k,1),1)
ef(okmer(k,2),2)
ef(okmer(k,3),3)
ef(okmer(k,4),4)
```

```{r}
k <- 'atgatgatgatgatgatgatgatgatgatga'
ef(okmer(k,1),1)
ef(okmer(k,2),2)
ef(okmer(k,3),3)
ef(okmer(k,4),4)
```

```{r}
k <- 'atcgatcgatcgatcgatcgatcgatcgact'
ecn(okmer(k,1),1)
ecn(okmer(k,2),2)
ecn(okmer(k,3),3)
ecn(okmer(k,4),4)
```

```{r}
k <- paste(sample(rep(c("a","c","g","t"),8),31), collapse="")
k <- "actatggcaagtcgtaaccgcgcttatcagg"
ecn(okmer(k,1),1)
ecn(okmer(k,2),2)
ecn(okmer(k,3),3)
ecn(okmer(k,4),4)
```

aattaaaaaaacaagataaaataatattttt

```{r}
k <- 'aattaaaaaaacaagataaaataatattttt'
ecn(okmer(k,1),1)
ecn(okmer(k,2),2)
ecn(okmer(k,3),3)
ecn(okmer(k,4),4)
```

atg tga gat ,,,,

cat tca atc

tgatgatgatgatgatgatgatgatgatg

## Circular normalization tests

```{r}
# Check the normalization function
normalize_circular("ca")   # should give "ac"
normalize_circular("tgca") # should give "atgc"
normalize_circular("acgt") # should give "acgt"
```

```{r}
# Compare ec vs ecn on a repetitive sequence.
# The k-mers "atg", "tga", and "gat" are equivalent under rotation.
k <- 'atgatgatgatgatgatgatgatgatgatga'
cat("Sequence:", k, "\n")
cat("ec(k,3)  =", ec(okmer(k,3),3), "\n")
cat("ecn(k,3) =", ecn(okmer(k,3),3), "\n")
```

```{r}
# Comparison on a random sequence
k <- "actatggcaagtcgtaaccgcgcttatcagg"
cat("Sequence:", k, "\n")
cat("Without normalization:\n")
cat("  ec(k,2) =", ec(okmer(k,2),2), "\n")
cat("  ec(k,3) =", ec(okmer(k,3),3), "\n")
cat("  ec(k,4) =", ec(okmer(k,4),4), "\n")
cat("With circular normalization:\n")
cat("  ecn(k,2) =", ecn(okmer(k,2),2), "\n")
cat("  ecn(k,3) =", ecn(okmer(k,3),3), "\n")
cat("  ecn(k,4) =", ecn(okmer(k,4),4), "\n")
```

```{r}
re <- rev(c(0.8108602271901116,0.8108602271901116,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.7800272339058549,0.7800272339058549,0.7751610144606091,0.7751610144606091,0.7751610144606091,0.764858185548322,0.7325526601302021,0.7137620699527615,0.6789199521982864,0.6584536373623372,0.634002687184193,0.6075290415873623,0.5785545803330997,0.5785545803330997,0.5503220289212184,0.5315314387437778,0.4966893209893028,0.46077361820145696,0.42388221293245526,0.4009547969713408,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.390029016052137,0.42781461756157363,0.45192285937059073,0.47238917420654,0.47238917420654,0.47238917420654,0.5092805794755417,0.5451962822633876,0.5800384000178626,0.602395141014297,0.6046146614886381,0.6046146614886381,0.6119084258128231,0.6119084258128231,0.6214217106113492,0.6424704346756562,0.6482381543085467,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6828444721058894,0.6950205907027562,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.7208976112999935))

di <- c(0.7208976112999935,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6950205907027562,0.6828444721058894,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6482381543085467,0.6424704346756562,0.6214217106113492,0.6119084258128231,0.6119084258128231,0.6046146614886382,0.6046146614886382,0.6023951410142971,0.5800384000178627,0.5451962822633876,0.5092805794755418,0.47238917420654003,0.47238917420654003,0.47238917420654003,0.4519228593705908,0.4278146175615737,0.39002901605213713,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677883,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.40095479697134073,0.42388221293245526,0.46077361820145696,0.4966893209893028,0.5315314387437778,0.5503220289212184,0.5785545803330997,0.5785545803330997,0.6075290415873625,0.6340026871841933,0.6584536373623374,0.6789199521982866,0.7137620699527616,0.7325526601302023,0.7648581855483221,0.7751610144606093,0.7751610144606093,0.7751610144606093,0.7800272339058549,0.7800272339058549,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8108602271901116,0.8108602271901116)
```
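As a concrete check of the emax/enmax arithmetic explored in the notebook: for a sequence of length 31 analysed with word_size = 2 under circular normalization, nword = 30 and nalpha = 10 canonical 2-mers, so cov = 3, remains = 0, f1 = 0.1, and the maximum entropy is -10 * 0.1 * log(0.1) = log(10) ≈ 2.3026. A small Go sketch of the same computation (canonical counts hardcoded for word sizes 1 to 6; this mirrors the R enmax and is not the shipped implementation):

package main

import (
	"fmt"
	"math"
)

// Canonical k-mer counts for word sizes 1..6, the values tested in
// kmernormint_test.go; valid only for indices 1 to 6 in this sketch.
var canonicalCount = []int{0, 4, 10, 24, 70, 208, 700}

func emax(lseq, wordSize int) float64 {
	nw := lseq - wordSize + 1
	na := canonicalCount[wordSize]
	if nw < na {
		return math.Log(float64(nw))
	}
	cov := nw / na
	remains := nw - na*cov
	f1 := float64(cov) / float64(nw)
	f2 := float64(cov+1) / float64(nw)
	return -(float64(na-remains)*f1*math.Log(f1) +
		float64(remains)*f2*math.Log(f2))
}

func main() {
	fmt.Println(emax(31, 2)) // 2.302585..., i.e. log(10)
}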
pkg/obitools/obilowmask/obilowmask.go: new file, 440 lines

@@ -0,0 +1,440 @@
package obilowmask

import (
	"math"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
	log "github.com/sirupsen/logrus"
)

// MaskingMode defines how to handle low-complexity regions
type MaskingMode int

const (
	Mask  MaskingMode = iota // Mask mode: replace low-complexity regions with masked characters
	Split                    // Split mode: split the sequence into high-complexity fragments
)

// LowMaskWorker creates a worker that masks low-complexity regions in DNA sequences.
//
// Algorithm principle:
// compute the normalized entropy of each k-mer at several scales (wordSize = 1 to level_max);
// k-mers whose entropy falls below the threshold are masked.
//
// Parameters:
//   - kmer_size: size of the sliding window used for the entropy calculation
//   - level_max: maximum word size used for the entropy calculation (finest scale)
//   - threshold: normalized entropy threshold below which masking occurs (between 0 and 1)
//   - mode: Mask (masking) or Split (splitting)
//   - maskChar: character used for masking (typically 'n' or 'N')
//
// Returns: a SeqWorker function that can be applied to each sequence
func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {

	// ========================================================================
	// FUNCTION 1: emax - Theoretical maximum entropy
	// ========================================================================
	// Computes the maximum entropy of a k-mer of length lseq over words of size word_size.
	//
	// The maximum entropy corresponds to the most uniform word distribution possible:
	// - If there are more positions (nw) than possible canonical words (na),
	//   some words must appear several times
	// - We compute the entropy of the distribution in which every word appears
	//   either cov or cov+1 times (the most uniform distribution achievable)
	//
	// IMPORTANT: uses CanonicalKmerCount to get the actual number of canonical words
	// after circular normalization (e.g., "atg", "tga", "gat" → all "atg").
	// This is much smaller than 4^word_size (e.g., 10 instead of 16 for word_size=2).
	emax := func(lseq, word_size int) float64 {
		nw := lseq - word_size + 1                  // Number of words in a k-mer of length lseq
		na := obikmer.CanonicalKmerCount(word_size) // Number of canonical words after normalization

		// Case 1: fewer positions than possible words.
		// The maximum entropy is simply log(nw), since at most nw distinct words can occur.
		if nw < na {
			return math.Log(float64(nw))
		}

		// Case 2: more positions than possible words.
		// Some words must appear several times.
		cov := nw / na             // Average coverage (mean number of occurrences per word)
		remains := nw - (na * cov) // Number of words that get one extra occurrence

		// Frequencies in the optimal distribution:
		// - (na - remains) words appear cov times   → frequency f1 = cov/nw
		// - remains words appear (cov+1) times      → frequency f2 = (cov+1)/nw
		f1 := float64(cov) / float64(nw)
		f2 := float64(cov+1) / float64(nw)

		// Shannon entropy: H = -Σ p(i) * log(p(i))
		// where p(i) is the probability of observing word i
		return -(float64(na-remains)*f1*math.Log(f1) +
			float64(remains)*f2*math.Log(f2))
	}

	// ========================================================================
	// FUNCTION 2: maskAmbiguities - Mark positions containing ambiguities
	// ========================================================================
	// Identifies the positions holding ambiguous nucleotides (N, Y, R, etc.)
	// and marks every k-mer that contains them.
	//
	// Returns: a slice where maskPositions[i] = -1 if position i is part of a
	// k-mer containing an ambiguity, 0 otherwise
	maskAmbiguities := func(sequence []byte) []int {
		maskPositions := make([]int, len(sequence))
		for i, nuc := range sequence {
			// Any nucleotide other than lowercase a, c, g or t is an ambiguity
			if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
				// Mark all positions of the k-mers containing this nucleotide:
				// a k-mer starting at position (i - kmer_size + 1) still contains position i
				end := max(0, i-kmer_size+1)
				for j := i; j >= end; j-- {
					maskPositions[j] = -1
				}
			}
		}
		return maskPositions
	}

	// ========================================================================
	// FUNCTION 3: cleanTable - Reset a frequency table to zero
	// ========================================================================
	cleanTable := func(table []int, over int) {
		for i := 0; i < over; i++ {
			table[i] = 0
		}
	}

	// ========================================================================
	// FUNCTION 4: slidingMin - Sliding minimum over a window
	// ========================================================================
	// Applies a sliding window of size window over data and replaces each
	// value with the minimum over the window of values ending at that position
	// (a trailing sliding minimum).
	//
	// Uses a MinMultiset to maintain the window minimum efficiently.
	slidingMin := func(data []float64, window int) {
		minimier := obiutils.NewMinMultiset(func(a, b float64) bool { return a < b })
		ldata := len(data)
		mem := make([]float64, window) // Circular buffer storing the window values

		// Initialize the buffer with a sentinel value
		for i := range mem {
			mem[i] = 10000
		}

		for i, v := range data {
			// Fetch the old value leaving the window
			m := mem[i%window]
			mem[i%window] = v

			// Remove the old value from the multiset if it was valid
			if m < 10000 {
				minimier.RemoveOne(m)
			}

			// Add the new value only if a full window still lies ahead of us
			if (ldata - i) >= window {
				minimier.Add(v)
			}

			// Fetch and store the current minimum
			var ok bool
			if data[i], ok = minimier.Min(); !ok {
				log.Error("problem with minimum entropy")
				data[i] = 0.0
			}

			//xx, _ := minimier.Min()
			//log.Warnf("Pos: %d n: %d min: %.3f -> %.3f", i, minimier.Len(), v, xx)
		}
	}

	// ========================================================================
	// FUNCTION 5: computeEntropies - Normalized entropy at each position
	// ========================================================================
	// The central function: computes the entropy of each k-mer of the sequence
	// at a given scale (wordSize).
	//
	// Algorithm:
	// 1. Encode the sequence into words (subsequences of size wordSize)
	// 2. For each k-mer, count the frequencies of the words it contains
	// 3. Compute the normalized entropy = observed_entropy / maximum_entropy
	// 4. Apply a sliding-min filter to smooth the results
	//
	// IMPORTANT: NormalizeInt performs circular normalization of the words,
	// so "atg", "tga", and "gat" are counted as the same word.
	computeEntropies := func(sequence []byte,
		maskPositions []int, // Positions of ambiguities
		entropies []float64, // Output: normalized entropy at each position
		table []int, // Frequency table for the words (reused between calls)
		words []int, // Buffer storing the encoded words (reused)
		wordSize int) { // Word size (scale of the analysis)

		lseq := len(sequence)                   // Sequence length
		tableSize := 1 << (wordSize * 2)        // Actual table size (must fit all codes 0 to 4^wordSize-1)
		nwords := kmer_size - wordSize + 1      // Number of words in a k-mer
		float_nwords := float64(nwords)
		log_nwords := math.Log(float_nwords)    // log(nwords), used in the entropy calculation
		entropyMax := emax(kmer_size, wordSize) // Theoretical maximum entropy (uses CanonicalKmerCount internally)

		// Reset the frequency table (the entire table, not just the canonical entries)
		cleanTable(table, tableSize)

		for i := 1; i < lseq; i++ {
			entropies[i] = 6
		}
		end := lseq - wordSize + 1 // Last position where a word can start

		// ====================================================================
		// STEP 1: Encode all words of the sequence
		// ====================================================================
		// Left-shift encoding: each nucleotide takes 2 bits
		// a=00, c=01, g=10, t=11

		mask := (1 << (wordSize * 2)) - 1 // Mask keeping only the last wordSize*2 bits

		// Initialize the first word (all its nucleotides except the last one)
		word_index := 0
		for i := 0; i < wordSize-1; i++ {
			word_index = (word_index << 2) + int(obikmer.EncodeNucleotide(sequence[i]))
		}

		// Encode all words with a sliding window
		for i, j := 0, wordSize-1; i < end; i, j = i+1, j+1 {
			// Shift left by 2 bits, mask, and append the new nucleotide
			word_index = ((word_index << 2) & mask) + int(obikmer.EncodeNucleotide(sequence[j]))

			// *** CIRCULAR NORMALIZATION ***
			// Convert the word to its canonical form (smallest circular rotation);
			// this is where "atg", "tga", "gat" all become "atg"
			words[i] = obikmer.NormalizeInt(word_index, wordSize)
		}

		// ====================================================================
		// STEP 2: Compute the entropy of each k-mer with a sliding window
		// ====================================================================
		s := 0            // Number of words handled in the current k-mer
		sum_n_logn := 0.0 // Sum of n*log(n), used in the entropy calculation
		entropy := 1.0    // Current normalized entropy
		cleaned := true   // Flag indicating whether the table has been cleaned

		for i := range end {
			s++

			switch {
			// CASE 1: Filling phase (fewer than nwords words collected)
			case s < nwords:
				cleaned = false
				table[words[i]]++ // Increment the word frequency

			// CASE 2: The position contains an ambiguity
			case i >= (nwords-1) && maskPositions[i-nwords+1] < 0:
				entropies[i-nwords+1] = 4.0 // Mark the entropy as invalid
				if !cleaned {
					cleanTable(table, tableSize) // Reset the table
				}
				cleaned = true
				s = 0
				sum_n_logn = 0.0

			// CASE 3: First complete k-mer (s == nwords)
			case s == nwords:
				cleaned = false
				table[words[i]]++

				// Shannon entropy: H = -Σ p(i)*log(p(i))
				//                    = log(N) - (1/N)*Σ n(i)*log(n(i))
				// where N = nwords and n(i) = frequency of word i
				//
				// NOTE: we iterate over the whole table (tableSize = 4^wordSize) to collect the counts.
				// Canonical codes are not contiguous (e.g., for k=2: {0,1,2,3,5,6,7,10,11,15}),
				// so the full table must be scanned even though only ~10 entries are non-zero.
				sum_n_logn = 0
				for j := range tableSize {
					n := float64(table[j])
					if n > 0 {
						sum_n_logn += n * math.Log(n)
					}
				}
				// Normalized entropy = observed entropy / maximum entropy
				entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax

			// CASE 4: Sliding window (s > nwords).
			// Update the entropy incrementally by adding the new word
			// and removing the old one.
			case s > nwords:
				cleaned = false

				new_word := words[i]
				old_word := words[i-nwords]

				// Optimization: only update if the word actually changes
				if old_word != new_word {
					table[new_word]++
					table[old_word]--

					n_old := float64(table[old_word])
					n_new := float64(table[new_word])

					// Incremental update of sum_n_logn:
					// remove the old word's contribution (before the decrement)
					sum_n_logn -= (n_old + 1) * math.Log(n_old+1)
					// add the old word's contribution (after the decrement)
					if n_old > 0 {
						sum_n_logn += n_old * math.Log(n_old)
					}
					// add the new word's contribution (after the increment)
					if n_new > 0 {
						sum_n_logn += n_new * math.Log(n_new)
					}
					// remove the new word's contribution (before the increment)
					if n_new > 1 {
						sum_n_logn -= (n_new - 1) * math.Log(n_new-1)
					}
				}

				entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
			}

			// Store the entropy at the position of the k-mer's first word
			if s >= nwords && maskPositions[i-nwords+1] >= 0 {
				if entropy == 0 {
					log.Errorf("Zero entropy @ position %d", i-nwords+1)
				}
				entropies[i-nwords+1] = entropy
			}
		}

		// ====================================================================
		// STEP 3: Apply the sliding-min filter
		// ====================================================================
		// Replace each entropy with the minimum over a window of size kmer_size,
		// which makes the detection of low-complexity regions more robust
		slidingMin(entropies, kmer_size)
		// log.Warnf("%v\n%v", e, entropies)
	}

	// ========================================================================
	// FUNCTION 6: applyMaskMode - Apply the masking to the sequence
	// ========================================================================
	// NOTE: the mask parameter is currently unused; masking is performed by
	// switching the selected positions to uppercase.
	applyMaskMode := func(sequence *obiseq.BioSequence, maskPositions []bool, mask byte) (obiseq.BioSequenceSlice, error) {
		// Work on a copy so the original sequence is left untouched
		seqCopy := sequence.Copy()
		sequenceBytes := seqCopy.Sequence()

		// Mask the identified positions
		for i := 0; i < len(sequenceBytes); i++ {
			if maskPositions[i] {
				// The operation &^ 32 converts to UPPERCASE (clears bit 5)
				sequenceBytes[i] = sequenceBytes[i] &^ 32
			}
		}

		return obiseq.BioSequenceSlice{seqCopy}, nil
	}

	// ========================================================================
	// FUNCTION 7: masking - Main masking function
	// ========================================================================
	// Computes the entropies at every scale and masks the positions
	// whose minimum entropy falls below the threshold.
	masking := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		bseq := sequence.Sequence()

		// Identify ambiguities
		maskPositions := maskAmbiguities(bseq)

		// Initialize the data structures
		mask := make([]int, len(bseq))          // Stores the scale at which the minimum entropy was found
		entropies := make([]float64, len(bseq)) // Minimum entropy at each position
		for i := range entropies {
			entropies[i] = 4.0 // Very high initial value
		}

		freqs := make([]int, 1<<(2*level_max)) // Frequency table (maximum size)
		words := make([]int, len(bseq))        // Buffer for the encoded words

		// ====================================================================
		// Entropy at the maximum scale (level_max)
		// ====================================================================
		computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max)

		// Initialize the mask with level_max everywhere (ambiguities excepted)
		for i := range bseq {
			v := level_max
			// if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
			// 	v = 0
			// }
			mask[i] = v
		}

		// ====================================================================
		// Entropy at the lower scales
		// ====================================================================
		entropies2 := make([]float64, len(bseq))

		for ws := level_max - 1; ws > 0; ws-- {
			// Recompute the entropies at word size ws and keep, for each
			// position, the minimum entropy and the scale that produced it
			computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws)
			for i, e2 := range entropies2 {
				if e2 < entropies[i] {
					entropies[i] = e2
					mask[i] = ws
				}
			}
		}

		// Force the entropy to 0 at ambiguous positions
		for i, nuc := range bseq {
			if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
				entropies[i] = 0
			}
		}

		// ====================================================================
		// Identify the positions to mask
		// ====================================================================
		remove := make([]bool, len(entropies))
		for i, e := range entropies {
			remove[i] = e <= threshold
		}

		// Save metadata in the sequence attributes
		sequence.SetAttribute("mask", mask)
		sequence.SetAttribute("Entropies", entropies)

		return applyMaskMode(sequence, remove, maskChar)
	}

	return masking
}

// CLISequenceEntropyMasker creates an iterator that applies entropy masking
// to every sequence of an input iterator.
//
// The worker is configured from the command-line parameters.
func CLISequenceEntropyMasker(iterator obiiter.IBioSequence) obiiter.IBioSequence {
	var newIter obiiter.IBioSequence

	worker := LowMaskWorker(
		CLIKmerSize(),
		CLILevelMax(),
		CLIThreshold(),
		CLIMaskingMode(),
		CLIMaskingChar(),
	)

	// Apply the worker in parallel
	newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers())

	// Filter out the resulting empty sequences
	return newIter.FilterEmpty()
}
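The CASE 4 bookkeeping in computeEntropies maintains Σ n·log(n) incrementally: when one count drops from n+1 to n and another rises from m-1 to m, the sum loses (n+1)log(n+1), gains n·log(n), gains m·log(m), and loses (m-1)log(m-1). The following standalone sketch checks that four-term update against full recomputation over many random moves; it illustrates the invariant only and is not the production code.

package main

import (
	"fmt"
	"math"
	"math/rand"
)

// nLogN recomputes Σ n*log(n) over all positive counts from scratch.
func nLogN(counts []int) float64 {
	s := 0.0
	for _, c := range counts {
		if c > 0 {
			n := float64(c)
			s += n * math.Log(n)
		}
	}
	return s
}

func main() {
	counts := []int{5, 3, 2, 0, 4}
	sum := nLogN(counts)

	for step := 0; step < 1000; step++ {
		// Move one occurrence from bucket src to bucket dst, exactly like
		// replacing old_word by new_word in the sliding window above.
		src := rand.Intn(len(counts))
		for counts[src] == 0 {
			src = rand.Intn(len(counts))
		}
		dst := rand.Intn(len(counts))
		if src == dst {
			continue
		}

		counts[dst]++
		counts[src]--
		nOld := float64(counts[src])
		nNew := float64(counts[dst])

		// The same four-term incremental update as in CASE 4
		sum -= (nOld + 1) * math.Log(nOld+1)
		if nOld > 0 {
			sum += nOld * math.Log(nOld)
		}
		sum += nNew * math.Log(nNew) // nNew >= 1, and the term is 0 when nNew == 1
		if nNew > 1 {
			sum -= (nNew - 1) * math.Log(nNew-1)
		}
	}

	fmt.Printf("incremental: %.12f  recomputed: %.12f\n", sum, nLogN(counts))
	// The two values agree up to floating-point rounding.
}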
pkg/obitools/obilowmask/options.go: new file, 72 lines

@@ -0,0 +1,72 @@
package obilowmask

import (
	"strings"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
	"github.com/DavidGamba/go-getoptions"

	log "github.com/sirupsen/logrus"
)

var __kmer_size__ = 31
var __level_max__ = 6
var __threshold__ = 0.5
var __split_mode__ = false
var __mask__ = "."

func LowMaskOptionSet(options *getoptions.GetOpt) {

	options.IntVar(&__kmer_size__, "kmer-size", __kmer_size__,
		options.Description("Size of the kmer considered to estimate entropy."),
	)

	options.IntVar(&__level_max__, "entropy_size", __level_max__,
		options.Description("Maximum word size considered for the entropy estimate"),
	)

	options.Float64Var(&__threshold__, "threshold", __threshold__,
		options.Description("Entropy threshold used to mask a kmer"),
	)

	options.BoolVar(&__split_mode__, "split-mode", __split_mode__,
		options.Description("In split mode, input sequences are split to remove the masked regions"),
	)

	options.StringVar(&__mask__, "masking-char", __mask__,
		options.Description("Character used to mask low-complexity regions"),
	)
}

func OptionSet(options *getoptions.GetOpt) {
	LowMaskOptionSet(options)
	obiconvert.InputOptionSet(options)
}

func CLIKmerSize() int {
	return __kmer_size__
}

func CLILevelMax() int {
	return __level_max__
}

func CLIThreshold() float64 {
	return __threshold__
}

func CLIMaskingMode() MaskingMode {
	if __split_mode__ {
		return Split
	} else {
		return Mask
	}
}

func CLIMaskingChar() byte {
	mask := strings.TrimSpace(__mask__)
	if len(mask) != 1 {
		log.Fatalf("--masking-char option accepts a single character, not %s", mask)
	}
	return []byte(mask)[0]
}
pkg/obiutils/minmultiset.go: new file, 118 lines

@@ -0,0 +1,118 @@
package obiutils

import (
	"container/heap"
)

// MinMultiset maintains a multiset of values and exposes the current minimum.
// T must be comparable so it can serve as a map key. Ordering is defined by less.
type MinMultiset[T comparable] struct {
	pq      priorityQueue[T] // min-heap
	less    func(a, b T) bool
	count   map[T]int // logical cardinality per value
	pending map[T]int // deletions waiting to be applied (lazy delete)
	size    int       // total logical size
}

// NewMinMultiset creates a multiset. less must impose a strict total order.
func NewMinMultiset[T comparable](less func(a, b T) bool) *MinMultiset[T] {
	m := &MinMultiset[T]{
		pq:      priorityQueue[T]{less: less},
		less:    less,
		count:   make(map[T]int),
		pending: make(map[T]int),
	}
	heap.Init(&m.pq)
	return m
}

// Add inserts one occurrence.
func (m *MinMultiset[T]) Add(v T) {
	heap.Push(&m.pq, v)
	m.count[v]++
	m.size++
}

// RemoveOne removes ONE occurrence of v. Returns false if v is absent.
func (m *MinMultiset[T]) RemoveOne(v T) bool {
	if m.count[v] == 0 {
		return false
	}
	m.count[v]--
	m.pending[v]++
	m.size--
	m.shrink()
	return true
}

// Min returns the current minimum. ok=false if the multiset is empty.
func (m *MinMultiset[T]) Min() (v T, ok bool) {
	if m.size == 0 {
		var zero T
		return zero, false
	}
	m.cleanTop()
	return m.pq.data[0], true
}

// Len returns the logical size.
func (m *MinMultiset[T]) Len() int { return m.size }

// --- internals ---

// cleanTop pops from the heap every top value marked for deletion.
func (m *MinMultiset[T]) cleanTop() {
	for m.pq.Len() > 0 {
		top := m.pq.data[0]
		if m.pending[top] > 0 {
			m.pending[top]--
			if m.pending[top] == 0 {
				delete(m.pending, top) // keep the map tidy
			}
			heap.Pop(&m.pq)
			continue
		}
		break
	}
}

// shrink rebalances the heap when too many tombstones accumulate.
func (m *MinMultiset[T]) shrink() {
	// light cleanup on removal, to bound the drift
	if m.pq.Len() > 0 {
		m.cleanTop()
	}
}

// priorityQueue implements heap.Interface for T.
type priorityQueue[T any] struct {
	data []T
	less func(a, b T) bool
}

func (q priorityQueue[T]) Len() int           { return len(q.data) }
func (q priorityQueue[T]) Less(i, j int) bool { return q.less(q.data[i], q.data[j]) }
func (q priorityQueue[T]) Swap(i, j int)      { q.data[i], q.data[j] = q.data[j], q.data[i] }
func (q *priorityQueue[T]) Push(x any)        { q.data = append(q.data, x.(T)) }
func (q *priorityQueue[T]) Pop() any {
	n := len(q.data)
	x := q.data[n-1]
	q.data = q.data[:n-1]
	return x
}

func (q priorityQueue[T]) peek() (T, bool) {
	if len(q.data) == 0 {
		var z T
		return z, false
	}
	return q.data[0], true
}

func (q *priorityQueue[T]) Top() (T, bool) { return q.peek() }

func (q *priorityQueue[T]) PushValue(v T) { heap.Push(q, v) }

func (q *priorityQueue[T]) PopValue() (T, bool) {
	if q.Len() == 0 {
		var z T
		return z, false
	}
	return heap.Pop(q).(T), true
}
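A short usage sketch of MinMultiset, matching how slidingMin in obilowmask.go drives it (Add when a value enters the window, RemoveOne when it leaves, Min to query). Every call shown is defined in the file above; only the example values are made up.

package main

import (
	"fmt"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)

func main() {
	m := obiutils.NewMinMultiset(func(a, b float64) bool { return a < b })

	m.Add(3.0)
	m.Add(1.0)
	m.Add(1.0)
	m.Add(2.0)

	v, _ := m.Min()
	fmt.Println(v, m.Len()) // 1 4

	m.RemoveOne(1.0) // one occurrence of the duplicate leaves the window
	v, _ = m.Min()
	fmt.Println(v, m.Len()) // 1 3

	m.RemoveOne(1.0) // lazily deleted; the tombstone is resolved on the next Min
	v, _ = m.Min()
	fmt.Println(v, m.Len()) // 2 2
}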