pkg/obikmer/entropy.go

package obikmer

import "math"

// KmerEntropy computes the entropy of a single encoded k-mer.
//
// The algorithm mirrors the Rust obiskbuilder entropy: it decodes the k-mer
// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
// normalizes them by circular canonical form, counts their frequencies, and
// computes Shannon entropy corrected for class sizes, normalized by the
// maximum possible entropy over 4^ws raw bins.
// The returned value is the minimum entropy across all word sizes.
//
// Correction for small sequences: the raw entropy H = log(N) - Σ f·log(f)/N
// under-estimates the true complexity when many raw words collapse to the same
// canonical form.  Adding Σ f·log(class_size)/N recovers the entropy of the
// underlying uncollapsed distribution (assuming uniform mixing within each
// equivalence class).
//
// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
// while a value close to 1 indicates high complexity.
func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
	if k < 1 || levelMax < 1 {
		return 1.0
	}
	if levelMax >= k {
		levelMax = k - 1
	}
	if levelMax < 1 {
		return 1.0
	}

	// Decode k-mer to DNA sequence
	var seqBuf [32]byte
	seq := DecodeKmer(kmer, k, seqBuf[:])

	// Pre-compute nLogN lookup
	nLogN := make([]float64, k+1)
	for i := 1; i <= k; i++ {
		nLogN[i] = float64(i) * math.Log(float64(i))
	}

	// Build circular-canonical normalization tables per word size
	normTables := make([][]int, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		size := 1 << (ws * 2)
		normTables[ws] = make([]int, size)
		for code := 0; code < size; code++ {
			normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
		}
	}

	// Build ln(class_size) tables: for each canonical form, how many raw
	// words map to it under circular normalization.
	classLogSizeTables := make([][]float64, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		tableSize := 1 << (ws * 2)
		classSize := make([]int, tableSize)
		for code := 0; code < tableSize; code++ {
			classSize[normTables[ws][code]]++
		}
		classLogSizeTables[ws] = make([]float64, tableSize)
		for j := 0; j < tableSize; j++ {
			if classSize[j] > 0 {
				classLogSizeTables[ws][j] = math.Log(float64(classSize[j]))
			}
		}
	}

	minEntropy := math.MaxFloat64

	for ws := 1; ws <= levelMax; ws++ {
		nwords := k - ws + 1
		if nwords < 1 {
			continue
		}

		// Count circular-canonical sub-word frequencies
		tableSize := 1 << (ws * 2)
		table := make([]int, tableSize)
		mask := (1 << (ws * 2)) - 1

		wordIndex := 0
		for i := 0; i < ws-1; i++ {
			wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
		}

		for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
			wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
			normWord := normTables[ws][wordIndex]
			table[normWord]++
		}

		// Compute emax over 4^ws raw bins (uncollapsed distribution).
		floatNwords := float64(nwords)
		logNwords := math.Log(floatNwords)
		na := tableSize // 4^ws
		var emax float64
		if nwords < na {
			emax = logNwords
		} else {
			cov := nwords / na
			remains := nwords - (na * cov)
			f1 := float64(cov) / floatNwords
			f2 := float64(cov+1) / floatNwords
			emax = -(float64(na-remains)*f1*math.Log(f1) +
				float64(remains)*f2*math.Log(f2))
		}

		if emax <= 0 {
			continue
		}

		// Accumulate Σ f·log(f) and Σ f·log(class_size) over canonical forms.
		classLogSize := classLogSizeTables[ws]
		var sumNLogN, sumClassLogN float64
		for j := 0; j < tableSize; j++ {
			n := table[j]
			if n > 0 {
				sumNLogN += nLogN[n]
				sumClassLogN += float64(n) * classLogSize[j]
			}
		}

		// Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N
		entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax
		if entropy < 0 {
			entropy = 0
		}

		if entropy < minEntropy {
			minEntropy = entropy
		}
	}

	if minEntropy == math.MaxFloat64 {
		return 1.0
	}

	return math.Round(minEntropy*10000) / 10000
}

// KmerEntropyFilter is a reusable entropy filter for batch processing.
// It pre-computes normalization tables and lookup values to avoid repeated
// allocation across millions of k-mers.
//
// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
// Each goroutine must create its own instance via NewKmerEntropyFilter.
type KmerEntropyFilter struct {
	k                   int
	levelMax            int
	threshold           float64
	nLogN               []float64
	normTables          [][]int
	classLogSizeTables  [][]float64
	emaxValues          []float64
	logNwords           []float64
	// Pre-allocated frequency tables reused across Entropy() calls.
	// One per word size (index 0 unused). Reset to zero before each use.
	freqTables [][]int
}

// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
	if levelMax >= k {
		levelMax = k - 1
	}
	if levelMax < 1 {
		levelMax = 1
	}

	nLogN := make([]float64, k+1)
	for i := 1; i <= k; i++ {
		nLogN[i] = float64(i) * math.Log(float64(i))
	}

	normTables := make([][]int, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		size := 1 << (ws * 2)
		normTables[ws] = make([]int, size)
		for code := 0; code < size; code++ {
			normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
		}
	}

	// ln(class_size) for each canonical form under circular normalization.
	classLogSizeTables := make([][]float64, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		tableSize := 1 << (ws * 2)
		classSize := make([]int, tableSize)
		for code := 0; code < tableSize; code++ {
			classSize[normTables[ws][code]]++
		}
		classLogSizeTables[ws] = make([]float64, tableSize)
		for j := 0; j < tableSize; j++ {
			if classSize[j] > 0 {
				classLogSizeTables[ws][j] = math.Log(float64(classSize[j]))
			}
		}
	}

	// Pre-compute emax and logNwords per word size.
	// emax uses 4^ws raw bins to match the corrected entropy.
	emaxValues := make([]float64, levelMax+1)
	logNwords := make([]float64, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		nw := k - ws + 1
		na := 1 << (ws * 2) // 4^ws raw bins
		floatNw := float64(nw)
		logNwords[ws] = math.Log(floatNw)
		if nw < na {
			emaxValues[ws] = logNwords[ws]
		} else {
			cov := nw / na
			remains := nw - (na * cov)
			f1 := float64(cov) / floatNw
			f2 := float64(cov+1) / floatNw
			emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
				float64(remains)*f2*math.Log(f2))
		}
	}

	// Pre-allocate frequency tables per word size
	freqTables := make([][]int, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		freqTables[ws] = make([]int, 1<<(ws*2))
	}

	return &KmerEntropyFilter{
		k:                  k,
		levelMax:           levelMax,
		threshold:          threshold,
		nLogN:              nLogN,
		normTables:         normTables,
		classLogSizeTables: classLogSizeTables,
		emaxValues:         emaxValues,
		logNwords:          logNwords,
		freqTables:         freqTables,
	}
}

// Accept returns true if the k-mer has entropy strictly above the threshold.
// Low-complexity k-mers (entropy <= threshold) are rejected.
func (ef *KmerEntropyFilter) Accept(kmer uint64) bool {
	return ef.Entropy(kmer) > ef.threshold
}

// Entropy computes the entropy for a single k-mer using pre-computed tables.
func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
	k := ef.k

	// Decode k-mer to DNA sequence
	var seqBuf [32]byte
	seq := DecodeKmer(kmer, k, seqBuf[:])

	minEntropy := math.MaxFloat64

	for ws := 1; ws <= ef.levelMax; ws++ {
		nwords := k - ws + 1
		if nwords < 1 {
			continue
		}

		emax := ef.emaxValues[ws]
		if emax <= 0 {
			continue
		}

		// Count circular-canonical sub-word frequencies
		tableSize := 1 << (ws * 2)
		table := ef.freqTables[ws]
		clear(table)
		mask := (1 << (ws * 2)) - 1
		normTable := ef.normTables[ws]

		wordIndex := 0
		for i := 0; i < ws-1; i++ {
			wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
		}

		for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
			wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
			normWord := normTable[wordIndex]
			table[normWord]++
		}

		floatNwords := float64(nwords)
		logNwords := ef.logNwords[ws]
		classLogSize := ef.classLogSizeTables[ws]

		var sumNLogN, sumClassLogN float64
		for j := 0; j < tableSize; j++ {
			n := table[j]
			if n > 0 {
				sumNLogN += ef.nLogN[n]
				sumClassLogN += float64(n) * classLogSize[j]
			}
		}

		// Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N
		entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax
		if entropy < 0 {
			entropy = 0
		}

		if entropy < minEntropy {
			minEntropy = entropy
		}
	}

	if minEntropy == math.MaxFloat64 {
		return 1.0
	}

	return math.Round(minEntropy*10000) / 10000
}
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`package obikmer`

			`import "math"`

			`// KmerEntropy computes the entropy of a single encoded k-mer.`
			`//`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`// The algorithm mirrors the Rust obiskbuilder entropy: it decodes the k-mer`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,`
			`// normalizes them by circular canonical form, counts their frequencies, and`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`// computes Shannon entropy corrected for class sizes, normalized by the`
			`// maximum possible entropy over 4^ws raw bins.`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`// The returned value is the minimum entropy across all word sizes.`
			`//`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`// Correction for small sequences: the raw entropy H = log(N) - Σ f·log(f)/N`
			`// under-estimates the true complexity when many raw words collapse to the same`
			`// canonical form. Adding Σ f·log(class_size)/N recovers the entropy of the`
			`// underlying uncollapsed distribution (assuming uniform mixing within each`
			`// equivalence class).`
			`//`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`// A value close to 0 indicates very low complexity (e.g. "AAAA..."),`
			`// while a value close to 1 indicates high complexity.`
			`func KmerEntropy(kmer uint64, k int, levelMax int) float64 {`
			`if k < 1 \|\| levelMax < 1 {`
			`return 1.0`
			`}`
			`if levelMax >= k {`
			`levelMax = k - 1`
			`}`
			`if levelMax < 1 {`
			`return 1.0`
			`}`

			`// Decode k-mer to DNA sequence`
			`var seqBuf [32]byte`
			`seq := DecodeKmer(kmer, k, seqBuf[:])`

Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`// Pre-compute nLogN lookup`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`nLogN := make([]float64, k+1)`
			`for i := 1; i <= k; i++ {`
			`nLogN[i] = float64(i) * math.Log(float64(i))`
			`}`

			`// Build circular-canonical normalization tables per word size`
			`normTables := make([][]int, levelMax+1)`
			`for ws := 1; ws <= levelMax; ws++ {`
			`size := 1 << (ws * 2)`
			`normTables[ws] = make([]int, size)`
			`for code := 0; code < size; code++ {`
			`normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))`
			`}`
			`}`

Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`// Build ln(class_size) tables: for each canonical form, how many raw`
			`// words map to it under circular normalization.`
			`classLogSizeTables := make([][]float64, levelMax+1)`
			`for ws := 1; ws <= levelMax; ws++ {`
			`tableSize := 1 << (ws * 2)`
			`classSize := make([]int, tableSize)`
			`for code := 0; code < tableSize; code++ {`
			`classSize[normTables[ws][code]]++`
			`}`
			`classLogSizeTables[ws] = make([]float64, tableSize)`
			`for j := 0; j < tableSize; j++ {`
			`if classSize[j] > 0 {`
			`classLogSizeTables[ws][j] = math.Log(float64(classSize[j]))`
			`}`
			`}`
			`}`

Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`minEntropy := math.MaxFloat64`

			`for ws := 1; ws <= levelMax; ws++ {`
			`nwords := k - ws + 1`
			`if nwords < 1 {`
			`continue`
			`}`

			`// Count circular-canonical sub-word frequencies`
			`tableSize := 1 << (ws * 2)`
			`table := make([]int, tableSize)`
			`mask := (1 << (ws * 2)) - 1`

			`wordIndex := 0`
			`for i := 0; i < ws-1; i++ {`
			`wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))`
			`}`

			`for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {`
			`wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))`
			`normWord := normTables[ws][wordIndex]`
			`table[normWord]++`
			`}`

Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`// Compute emax over 4^ws raw bins (uncollapsed distribution).`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`floatNwords := float64(nwords)`
			`logNwords := math.Log(floatNwords)`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`na := tableSize // 4^ws`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`var emax float64`
			`if nwords < na {`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`emax = logNwords`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`} else {`
			`cov := nwords / na`
			`remains := nwords - (na * cov)`
			`f1 := float64(cov) / floatNwords`
			`f2 := float64(cov+1) / floatNwords`
			`emax = -(float64(na-remains)f1math.Log(f1) +`
			`float64(remains)f2math.Log(f2))`
			`}`

			`if emax <= 0 {`
			`continue`
			`}`

Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`// Accumulate Σ f·log(f) and Σ f·log(class_size) over canonical forms.`
			`classLogSize := classLogSizeTables[ws]`
			`var sumNLogN, sumClassLogN float64`
			`for j := 0; j < tableSize; j++ {`
			`n := table[j]`
			`if n > 0 {`
			`sumNLogN += nLogN[n]`
			`sumClassLogN += float64(n) * classLogSize[j]`
			`}`
			`}`

			`// Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N`
			`entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`if entropy < 0 {`
			`entropy = 0`
			`}`

			`if entropy < minEntropy {`
			`minEntropy = entropy`
			`}`
			`}`

			`if minEntropy == math.MaxFloat64 {`
			`return 1.0`
			`}`

			`return math.Round(minEntropy*10000) / 10000`
			`}`

			`// KmerEntropyFilter is a reusable entropy filter for batch processing.`
			`// It pre-computes normalization tables and lookup values to avoid repeated`
			`// allocation across millions of k-mers.`
			`//`
			`// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.`
			`// Each goroutine must create its own instance via NewKmerEntropyFilter.`
			`type KmerEntropyFilter struct {`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`k int`
			`levelMax int`
			`threshold float64`
			`nLogN []float64`
			`normTables [][]int`
			`classLogSizeTables [][]float64`
			`emaxValues []float64`
			`logNwords []float64`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`// Pre-allocated frequency tables reused across Entropy() calls.`
			`// One per word size (index 0 unused). Reset to zero before each use.`
			`freqTables [][]int`
			`}`

			`// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.`
			`func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {`
			`if levelMax >= k {`
			`levelMax = k - 1`
			`}`
			`if levelMax < 1 {`
			`levelMax = 1`
			`}`

			`nLogN := make([]float64, k+1)`
			`for i := 1; i <= k; i++ {`
			`nLogN[i] = float64(i) * math.Log(float64(i))`
			`}`

			`normTables := make([][]int, levelMax+1)`
			`for ws := 1; ws <= levelMax; ws++ {`
			`size := 1 << (ws * 2)`
			`normTables[ws] = make([]int, size)`
			`for code := 0; code < size; code++ {`
			`normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))`
			`}`
			`}`

Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`// ln(class_size) for each canonical form under circular normalization.`
			`classLogSizeTables := make([][]float64, levelMax+1)`
			`for ws := 1; ws <= levelMax; ws++ {`
			`tableSize := 1 << (ws * 2)`
			`classSize := make([]int, tableSize)`
			`for code := 0; code < tableSize; code++ {`
			`classSize[normTables[ws][code]]++`
			`}`
			`classLogSizeTables[ws] = make([]float64, tableSize)`
			`for j := 0; j < tableSize; j++ {`
			`if classSize[j] > 0 {`
			`classLogSizeTables[ws][j] = math.Log(float64(classSize[j]))`
			`}`
			`}`
			`}`

			`// Pre-compute emax and logNwords per word size.`
			`// emax uses 4^ws raw bins to match the corrected entropy.`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`emaxValues := make([]float64, levelMax+1)`
			`logNwords := make([]float64, levelMax+1)`
			`for ws := 1; ws <= levelMax; ws++ {`
			`nw := k - ws + 1`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`na := 1 << (ws * 2) // 4^ws raw bins`
			`floatNw := float64(nw)`
			`logNwords[ws] = math.Log(floatNw)`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`if nw < na {`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`emaxValues[ws] = logNwords[ws]`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`} else {`
			`cov := nw / na`
			`remains := nw - (na * cov)`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`f1 := float64(cov) / floatNw`
			`f2 := float64(cov+1) / floatNw`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`emaxValues[ws] = -(float64(na-remains)f1math.Log(f1) +`
			`float64(remains)f2math.Log(f2))`
			`}`
			`}`

			`// Pre-allocate frequency tables per word size`
			`freqTables := make([][]int, levelMax+1)`
			`for ws := 1; ws <= levelMax; ws++ {`
			`freqTables[ws] = make([]int, 1<<(ws*2))`
			`}`

			`return &KmerEntropyFilter{`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`k: k,`
			`levelMax: levelMax,`
			`threshold: threshold,`
			`nLogN: nLogN,`
			`normTables: normTables,`
			`classLogSizeTables: classLogSizeTables,`
			`emaxValues: emaxValues,`
			`logNwords: logNwords,`
			`freqTables: freqTables,`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`}`
			`}`

			`// Accept returns true if the k-mer has entropy strictly above the threshold.`
			`// Low-complexity k-mers (entropy <= threshold) are rejected.`
			`func (ef *KmerEntropyFilter) Accept(kmer uint64) bool {`
			`return ef.Entropy(kmer) > ef.threshold`
			`}`

			`// Entropy computes the entropy for a single k-mer using pre-computed tables.`
			`func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {`
			`k := ef.k`

			`// Decode k-mer to DNA sequence`
			`var seqBuf [32]byte`
			`seq := DecodeKmer(kmer, k, seqBuf[:])`

			`minEntropy := math.MaxFloat64`

			`for ws := 1; ws <= ef.levelMax; ws++ {`
			`nwords := k - ws + 1`
			`if nwords < 1 {`
			`continue`
			`}`

			`emax := ef.emaxValues[ws]`
			`if emax <= 0 {`
			`continue`
			`}`

			`// Count circular-canonical sub-word frequencies`
			`tableSize := 1 << (ws * 2)`
			`table := ef.freqTables[ws]`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`clear(table)`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`mask := (1 << (ws * 2)) - 1`
			`normTable := ef.normTables[ws]`

			`wordIndex := 0`
			`for i := 0; i < ws-1; i++ {`
			`wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))`
			`}`

			`for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {`
			`wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))`
			`normWord := normTable[wordIndex]`
			`table[normWord]++`
			`}`

			`floatNwords := float64(nwords)`
			`logNwords := ef.logNwords[ws]`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`classLogSize := ef.classLogSizeTables[ws]`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`var sumNLogN, sumClassLogN float64`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`for j := 0; j < tableSize; j++ {`
			`n := table[j]`
			`if n > 0 {`
			`sumNLogN += ef.nLogN[n]`
Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`sumClassLogN += float64(n) * classLogSize[j]`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`}`
			`}`

Correct Shannon entropy bias for canonical k-mers 2026-05-17 14:52:31 +08:00			`// Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N`
			`entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax`
Add entropy-based filtering for k-mers 2026-02-10 18:19:57 +01:00			`if entropy < 0 {`
			`entropy = 0`
			`}`

			`if entropy < minEntropy {`
			`minEntropy = entropy`
			`}`
			`}`

			`if minEntropy == math.MaxFloat64 {`
			`return 1.0`
			`}`

			`return math.Round(minEntropy*10000) / 10000`
			`}`