obitools4/pkg/obikmer/entropy.go

package obikmer

import "math"

// KmerEntropy computes the entropy of a single encoded k-mer.
//
// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer
// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
// normalizes them by circular canonical form, counts their frequencies, and
// computes Shannon entropy normalized by the maximum possible entropy.
// The returned value is the minimum entropy across all word sizes.
//
// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
// while a value close to 1 indicates high complexity.
//
// Parameters:
//   - kmer: the encoded k-mer (2 bits per base)
//   - k: the k-mer size
//   - levelMax: maximum sub-word size for entropy (typically 6)
//
// Returns:
//   - minimum normalized entropy across all word sizes 1..levelMax
func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
	if k < 1 || levelMax < 1 {
		return 1.0
	}
	if levelMax >= k {
		levelMax = k - 1
	}
	if levelMax < 1 {
		return 1.0
	}

	// Decode k-mer to DNA sequence
	var seqBuf [32]byte
	seq := DecodeKmer(kmer, k, seqBuf[:])

	// Pre-compute nLogN lookup (same as lowmask)
	nLogN := make([]float64, k+1)
	for i := 1; i <= k; i++ {
		nLogN[i] = float64(i) * math.Log(float64(i))
	}

	// Build circular-canonical normalization tables per word size
	normTables := make([][]int, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		size := 1 << (ws * 2)
		normTables[ws] = make([]int, size)
		for code := 0; code < size; code++ {
			normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
		}
	}

	minEntropy := math.MaxFloat64

	for ws := 1; ws <= levelMax; ws++ {
		nwords := k - ws + 1
		if nwords < 1 {
			continue
		}

		// Count circular-canonical sub-word frequencies
		tableSize := 1 << (ws * 2)
		table := make([]int, tableSize)
		mask := (1 << (ws * 2)) - 1

		wordIndex := 0
		for i := 0; i < ws-1; i++ {
			wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
		}

		for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
			wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
			normWord := normTables[ws][wordIndex]
			table[normWord]++
		}

		// Compute Shannon entropy
		floatNwords := float64(nwords)
		logNwords := math.Log(floatNwords)

		var sumNLogN float64
		for j := 0; j < tableSize; j++ {
			n := table[j]
			if n > 0 {
				sumNLogN += nLogN[n]
			}
		}

		// Compute emax (maximum possible entropy for this word size)
		na := CanonicalCircularKmerCount(ws)
		var emax float64
		if nwords < na {
			emax = math.Log(float64(nwords))
		} else {
			cov := nwords / na
			remains := nwords - (na * cov)
			f1 := float64(cov) / floatNwords
			f2 := float64(cov+1) / floatNwords
			emax = -(float64(na-remains)*f1*math.Log(f1) +
				float64(remains)*f2*math.Log(f2))
		}

		if emax <= 0 {
			continue
		}

		entropy := (logNwords - sumNLogN/floatNwords) / emax
		if entropy < 0 {
			entropy = 0
		}

		if entropy < minEntropy {
			minEntropy = entropy
		}
	}

	if minEntropy == math.MaxFloat64 {
		return 1.0
	}

	return math.Round(minEntropy*10000) / 10000
}

// KmerEntropyFilter is a reusable entropy filter for batch processing.
// It pre-computes normalization tables and lookup values to avoid repeated
// allocation across millions of k-mers.
//
// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
// Each goroutine must create its own instance via NewKmerEntropyFilter.
type KmerEntropyFilter struct {
	k          int
	levelMax   int
	threshold  float64
	nLogN      []float64
	normTables [][]int
	emaxValues []float64
	logNwords  []float64
	// Pre-allocated frequency tables reused across Entropy() calls.
	// One per word size (index 0 unused). Reset to zero before each use.
	freqTables [][]int
}

// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
//
// Parameters:
//   - k: the k-mer size
//   - levelMax: maximum sub-word size for entropy (typically 6)
//   - threshold: entropy threshold (k-mers with entropy <= threshold are rejected)
func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
	if levelMax >= k {
		levelMax = k - 1
	}
	if levelMax < 1 {
		levelMax = 1
	}

	nLogN := make([]float64, k+1)
	for i := 1; i <= k; i++ {
		nLogN[i] = float64(i) * math.Log(float64(i))
	}

	normTables := make([][]int, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		size := 1 << (ws * 2)
		normTables[ws] = make([]int, size)
		for code := 0; code < size; code++ {
			normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
		}
	}

	emaxValues := make([]float64, levelMax+1)
	logNwords := make([]float64, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		nw := k - ws + 1
		na := CanonicalCircularKmerCount(ws)
		if nw < na {
			logNwords[ws] = math.Log(float64(nw))
			emaxValues[ws] = math.Log(float64(nw))
		} else {
			cov := nw / na
			remains := nw - (na * cov)
			f1 := float64(cov) / float64(nw)
			f2 := float64(cov+1) / float64(nw)
			logNwords[ws] = math.Log(float64(nw))
			emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
				float64(remains)*f2*math.Log(f2))
		}
	}

	// Pre-allocate frequency tables per word size
	freqTables := make([][]int, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		freqTables[ws] = make([]int, 1<<(ws*2))
	}

	return &KmerEntropyFilter{
		k:          k,
		levelMax:   levelMax,
		threshold:  threshold,
		nLogN:      nLogN,
		normTables: normTables,
		emaxValues: emaxValues,
		logNwords:  logNwords,
		freqTables: freqTables,
	}
}

// Accept returns true if the k-mer has entropy strictly above the threshold.
// Low-complexity k-mers (entropy <= threshold) are rejected.
func (ef *KmerEntropyFilter) Accept(kmer uint64) bool {
	return ef.Entropy(kmer) > ef.threshold
}

// Entropy computes the entropy for a single k-mer using pre-computed tables.
func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
	k := ef.k

	// Decode k-mer to DNA sequence
	var seqBuf [32]byte
	seq := DecodeKmer(kmer, k, seqBuf[:])

	minEntropy := math.MaxFloat64

	for ws := 1; ws <= ef.levelMax; ws++ {
		nwords := k - ws + 1
		if nwords < 1 {
			continue
		}

		emax := ef.emaxValues[ws]
		if emax <= 0 {
			continue
		}

		// Count circular-canonical sub-word frequencies
		tableSize := 1 << (ws * 2)
		table := ef.freqTables[ws]
		clear(table) // reset to zero
		mask := (1 << (ws * 2)) - 1
		normTable := ef.normTables[ws]

		wordIndex := 0
		for i := 0; i < ws-1; i++ {
			wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
		}

		for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
			wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
			normWord := normTable[wordIndex]
			table[normWord]++
		}

		// Compute Shannon entropy
		floatNwords := float64(nwords)
		logNwords := ef.logNwords[ws]

		var sumNLogN float64
		for j := 0; j < tableSize; j++ {
			n := table[j]
			if n > 0 {
				sumNLogN += ef.nLogN[n]
			}
		}

		entropy := (logNwords - sumNLogN/floatNwords) / emax
		if entropy < 0 {
			entropy = 0
		}

		if entropy < minEntropy {
			minEntropy = entropy
		}
	}

	if minEntropy == math.MaxFloat64 {
		return 1.0
	}

	return math.Round(minEntropy*10000) / 10000
}