mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 21:40:52 +00:00
Add entropy-based filtering for k-mers
This commit introduces entropy-based filtering for k-mers to remove low-complexity sequences. It adds: - New KmerEntropy and KmerEntropyFilter functions in pkg/obikmer/entropy.go for computing and filtering k-mer entropy - Integration of entropy filtering in the k-mer set builder (pkg/obikmer/kmer_set_builder.go) - A new 'filter' command in obik tool (pkg/obitools/obik/filter.go) to apply entropy filtering on existing indices - CLI options for configuring entropy filtering during index building and filtering The entropy filter helps improve the quality of k-mer sets by removing repetitive sequences that may interfere with downstream analyses.
This commit is contained in:
281
pkg/obikmer/entropy.go
Normal file
281
pkg/obikmer/entropy.go
Normal file
@@ -0,0 +1,281 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import "math"
|
||||||
|
|
||||||
|
// KmerEntropy computes the entropy of a single encoded k-mer.
|
||||||
|
//
|
||||||
|
// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer
|
||||||
|
// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
|
||||||
|
// normalizes them by circular canonical form, counts their frequencies, and
|
||||||
|
// computes Shannon entropy normalized by the maximum possible entropy.
|
||||||
|
// The returned value is the minimum entropy across all word sizes.
|
||||||
|
//
|
||||||
|
// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
|
||||||
|
// while a value close to 1 indicates high complexity.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - kmer: the encoded k-mer (2 bits per base)
|
||||||
|
// - k: the k-mer size
|
||||||
|
// - levelMax: maximum sub-word size for entropy (typically 6)
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - minimum normalized entropy across all word sizes 1..levelMax
|
||||||
|
func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
|
||||||
|
if k < 1 || levelMax < 1 {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
if levelMax >= k {
|
||||||
|
levelMax = k - 1
|
||||||
|
}
|
||||||
|
if levelMax < 1 {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode k-mer to DNA sequence
|
||||||
|
var seqBuf [32]byte
|
||||||
|
seq := DecodeKmer(kmer, k, seqBuf[:])
|
||||||
|
|
||||||
|
// Pre-compute nLogN lookup (same as lowmask)
|
||||||
|
nLogN := make([]float64, k+1)
|
||||||
|
for i := 1; i <= k; i++ {
|
||||||
|
nLogN[i] = float64(i) * math.Log(float64(i))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build circular-canonical normalization tables per word size
|
||||||
|
normTables := make([][]int, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
size := 1 << (ws * 2)
|
||||||
|
normTables[ws] = make([]int, size)
|
||||||
|
for code := 0; code < size; code++ {
|
||||||
|
normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
minEntropy := math.MaxFloat64
|
||||||
|
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
nwords := k - ws + 1
|
||||||
|
if nwords < 1 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count circular-canonical sub-word frequencies
|
||||||
|
tableSize := 1 << (ws * 2)
|
||||||
|
table := make([]int, tableSize)
|
||||||
|
mask := (1 << (ws * 2)) - 1
|
||||||
|
|
||||||
|
wordIndex := 0
|
||||||
|
for i := 0; i < ws-1; i++ {
|
||||||
|
wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
|
||||||
|
wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
|
||||||
|
normWord := normTables[ws][wordIndex]
|
||||||
|
table[normWord]++
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute Shannon entropy
|
||||||
|
floatNwords := float64(nwords)
|
||||||
|
logNwords := math.Log(floatNwords)
|
||||||
|
|
||||||
|
var sumNLogN float64
|
||||||
|
for j := 0; j < tableSize; j++ {
|
||||||
|
n := table[j]
|
||||||
|
if n > 0 {
|
||||||
|
sumNLogN += nLogN[n]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute emax (maximum possible entropy for this word size)
|
||||||
|
na := CanonicalCircularKmerCount(ws)
|
||||||
|
var emax float64
|
||||||
|
if nwords < na {
|
||||||
|
emax = math.Log(float64(nwords))
|
||||||
|
} else {
|
||||||
|
cov := nwords / na
|
||||||
|
remains := nwords - (na * cov)
|
||||||
|
f1 := float64(cov) / floatNwords
|
||||||
|
f2 := float64(cov+1) / floatNwords
|
||||||
|
emax = -(float64(na-remains)*f1*math.Log(f1) +
|
||||||
|
float64(remains)*f2*math.Log(f2))
|
||||||
|
}
|
||||||
|
|
||||||
|
if emax <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
entropy := (logNwords - sumNLogN/floatNwords) / emax
|
||||||
|
if entropy < 0 {
|
||||||
|
entropy = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
if entropy < minEntropy {
|
||||||
|
minEntropy = entropy
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if minEntropy == math.MaxFloat64 {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
|
||||||
|
return math.Round(minEntropy*10000) / 10000
|
||||||
|
}
|
||||||
|
|
||||||
|
// KmerEntropyFilter is a reusable entropy filter for batch processing.
// It pre-computes normalization tables and lookup values to avoid repeated
// allocation across millions of k-mers.
//
// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
// Each goroutine must create its own instance via NewKmerEntropyFilter.
type KmerEntropyFilter struct {
	k         int     // k-mer size
	levelMax  int     // maximum sub-word size considered (clamped to k-1 by the constructor)
	threshold float64 // entropy threshold used by Accept (entropy <= threshold is rejected)

	nLogN      []float64 // n*log(n) lookup for n in [0, k]
	normTables [][]int   // circular-canonical code tables, indexed by word size
	emaxValues []float64 // maximum possible entropy per word size
	logNwords  []float64 // log(number of sub-words) per word size

	// Pre-allocated frequency tables reused across Entropy() calls.
	// One per word size (index 0 unused). Reset to zero before each use.
	freqTables [][]int
}
|
||||||
|
|
||||||
|
// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - k: the k-mer size
|
||||||
|
// - levelMax: maximum sub-word size for entropy (typically 6)
|
||||||
|
// - threshold: entropy threshold (k-mers with entropy <= threshold are rejected)
|
||||||
|
func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
|
||||||
|
if levelMax >= k {
|
||||||
|
levelMax = k - 1
|
||||||
|
}
|
||||||
|
if levelMax < 1 {
|
||||||
|
levelMax = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
nLogN := make([]float64, k+1)
|
||||||
|
for i := 1; i <= k; i++ {
|
||||||
|
nLogN[i] = float64(i) * math.Log(float64(i))
|
||||||
|
}
|
||||||
|
|
||||||
|
normTables := make([][]int, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
size := 1 << (ws * 2)
|
||||||
|
normTables[ws] = make([]int, size)
|
||||||
|
for code := 0; code < size; code++ {
|
||||||
|
normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
emaxValues := make([]float64, levelMax+1)
|
||||||
|
logNwords := make([]float64, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
nw := k - ws + 1
|
||||||
|
na := CanonicalCircularKmerCount(ws)
|
||||||
|
if nw < na {
|
||||||
|
logNwords[ws] = math.Log(float64(nw))
|
||||||
|
emaxValues[ws] = math.Log(float64(nw))
|
||||||
|
} else {
|
||||||
|
cov := nw / na
|
||||||
|
remains := nw - (na * cov)
|
||||||
|
f1 := float64(cov) / float64(nw)
|
||||||
|
f2 := float64(cov+1) / float64(nw)
|
||||||
|
logNwords[ws] = math.Log(float64(nw))
|
||||||
|
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
|
||||||
|
float64(remains)*f2*math.Log(f2))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pre-allocate frequency tables per word size
|
||||||
|
freqTables := make([][]int, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
freqTables[ws] = make([]int, 1<<(ws*2))
|
||||||
|
}
|
||||||
|
|
||||||
|
return &KmerEntropyFilter{
|
||||||
|
k: k,
|
||||||
|
levelMax: levelMax,
|
||||||
|
threshold: threshold,
|
||||||
|
nLogN: nLogN,
|
||||||
|
normTables: normTables,
|
||||||
|
emaxValues: emaxValues,
|
||||||
|
logNwords: logNwords,
|
||||||
|
freqTables: freqTables,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Accept returns true if the k-mer has entropy strictly above the threshold.
|
||||||
|
// Low-complexity k-mers (entropy <= threshold) are rejected.
|
||||||
|
func (ef *KmerEntropyFilter) Accept(kmer uint64) bool {
|
||||||
|
return ef.Entropy(kmer) > ef.threshold
|
||||||
|
}
|
||||||
|
|
||||||
|
// Entropy computes the entropy for a single k-mer using pre-computed tables.
|
||||||
|
func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
|
||||||
|
k := ef.k
|
||||||
|
|
||||||
|
// Decode k-mer to DNA sequence
|
||||||
|
var seqBuf [32]byte
|
||||||
|
seq := DecodeKmer(kmer, k, seqBuf[:])
|
||||||
|
|
||||||
|
minEntropy := math.MaxFloat64
|
||||||
|
|
||||||
|
for ws := 1; ws <= ef.levelMax; ws++ {
|
||||||
|
nwords := k - ws + 1
|
||||||
|
if nwords < 1 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
emax := ef.emaxValues[ws]
|
||||||
|
if emax <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count circular-canonical sub-word frequencies
|
||||||
|
tableSize := 1 << (ws * 2)
|
||||||
|
table := ef.freqTables[ws]
|
||||||
|
clear(table) // reset to zero
|
||||||
|
mask := (1 << (ws * 2)) - 1
|
||||||
|
normTable := ef.normTables[ws]
|
||||||
|
|
||||||
|
wordIndex := 0
|
||||||
|
for i := 0; i < ws-1; i++ {
|
||||||
|
wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
|
||||||
|
wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
|
||||||
|
normWord := normTable[wordIndex]
|
||||||
|
table[normWord]++
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute Shannon entropy
|
||||||
|
floatNwords := float64(nwords)
|
||||||
|
logNwords := ef.logNwords[ws]
|
||||||
|
|
||||||
|
var sumNLogN float64
|
||||||
|
for j := 0; j < tableSize; j++ {
|
||||||
|
n := table[j]
|
||||||
|
if n > 0 {
|
||||||
|
sumNLogN += ef.nLogN[n]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
entropy := (logNwords - sumNLogN/floatNwords) / emax
|
||||||
|
if entropy < 0 {
|
||||||
|
entropy = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
if entropy < minEntropy {
|
||||||
|
minEntropy = entropy
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if minEntropy == math.MaxFloat64 {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
|
||||||
|
return math.Round(minEntropy*10000) / 10000
|
||||||
|
}
|
||||||
@@ -5,11 +5,12 @@ import (
|
|||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"runtime"
|
"slices"
|
||||||
"sort"
|
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
"github.com/schollz/progressbar/v3"
|
||||||
)
|
)
|
||||||
|
|
||||||
// BuilderOption is a functional option for KmerSetGroupBuilder.
|
// BuilderOption is a functional option for KmerSetGroupBuilder.
|
||||||
@@ -19,6 +20,8 @@ type builderConfig struct {
|
|||||||
minFreq int // 0 means no frequency filtering (simple dedup)
|
minFreq int // 0 means no frequency filtering (simple dedup)
|
||||||
maxFreq int // 0 means no upper bound
|
maxFreq int // 0 means no upper bound
|
||||||
saveFreqTopN int // >0 means save the N most frequent k-mers per set to CSV
|
saveFreqTopN int // >0 means save the N most frequent k-mers per set to CSV
|
||||||
|
entropyThreshold float64 // >0 means filter k-mers with entropy <= threshold
|
||||||
|
entropyLevelMax int // max sub-word size for entropy (typically 6)
|
||||||
}
|
}
|
||||||
|
|
||||||
// WithMinFrequency activates frequency filtering mode.
|
// WithMinFrequency activates frequency filtering mode.
|
||||||
@@ -45,6 +48,16 @@ func WithSaveFreqKmers(n int) BuilderOption {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WithEntropyFilter activates entropy-based low-complexity filtering.
|
||||||
|
// K-mers with entropy <= threshold are discarded during finalization.
|
||||||
|
// levelMax is the maximum sub-word size for entropy computation (typically 6).
|
||||||
|
func WithEntropyFilter(threshold float64, levelMax int) BuilderOption {
|
||||||
|
return func(c *builderConfig) {
|
||||||
|
c.entropyThreshold = threshold
|
||||||
|
c.entropyLevelMax = levelMax
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// KmerSetGroupBuilder constructs a KmerSetGroup on disk.
|
// KmerSetGroupBuilder constructs a KmerSetGroup on disk.
|
||||||
// During construction, super-kmers are written to temporary .skm files
|
// During construction, super-kmers are written to temporary .skm files
|
||||||
// partitioned by minimizer. On Close(), each partition is finalized
|
// partitioned by minimizer. On Close(), each partition is finalized
|
||||||
@@ -299,7 +312,17 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process partitions in parallel
|
// =====================================================================
|
||||||
|
// 2-stage pipeline: readers (pure I/O) → workers (CPU + write)
|
||||||
|
//
|
||||||
|
// - nReaders goroutines read .skm files (pure I/O, fast)
|
||||||
|
// - nWorkers goroutines extract k-mers, sort, dedup, filter, write .kdi
|
||||||
|
//
|
||||||
|
// One unbuffered channel between stages. Readers are truly I/O-bound
|
||||||
|
// (small files, buffered reads), workers are CPU-bound and stay busy.
|
||||||
|
// =====================================================================
|
||||||
|
totalJobs := b.n * b.P
|
||||||
|
|
||||||
counts := make([][]uint64, b.n)
|
counts := make([][]uint64, b.n)
|
||||||
spectra := make([][]map[int]uint64, b.n)
|
spectra := make([][]map[int]uint64, b.n)
|
||||||
var topKmers [][]*TopNKmers
|
var topKmers [][]*TopNKmers
|
||||||
@@ -314,27 +337,71 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
nWorkers := runtime.NumCPU()
|
nCPU := obidefault.ParallelWorkers()
|
||||||
if nWorkers > b.P {
|
|
||||||
nWorkers = b.P
|
// Stage sizing
|
||||||
|
nWorkers := nCPU // CPU-bound: one per core
|
||||||
|
nReaders := nCPU / 4 // pure I/O: few goroutines suffice
|
||||||
|
if nReaders < 2 {
|
||||||
|
nReaders = 2
|
||||||
|
}
|
||||||
|
if nReaders > 4 {
|
||||||
|
nReaders = 4
|
||||||
|
}
|
||||||
|
if nWorkers > totalJobs {
|
||||||
|
nWorkers = totalJobs
|
||||||
|
}
|
||||||
|
if nReaders > totalJobs {
|
||||||
|
nReaders = totalJobs
|
||||||
}
|
}
|
||||||
|
|
||||||
type job struct {
|
var bar *progressbar.ProgressBar
|
||||||
|
if obidefault.ProgressBar() {
|
||||||
|
pbopt := []progressbar.Option{
|
||||||
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
|
progressbar.OptionSetWidth(15),
|
||||||
|
progressbar.OptionShowCount(),
|
||||||
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetPredictTime(true),
|
||||||
|
progressbar.OptionSetDescription("[Finalizing partitions]"),
|
||||||
|
}
|
||||||
|
bar = progressbar.NewOptions(totalJobs, pbopt...)
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Channel types ---
|
||||||
|
type partitionData struct {
|
||||||
|
setIdx int
|
||||||
|
partIdx int
|
||||||
|
skmers []SuperKmer // raw super-kmers from I/O stage
|
||||||
|
}
|
||||||
|
|
||||||
|
type readJob struct {
|
||||||
setIdx int
|
setIdx int
|
||||||
partIdx int
|
partIdx int
|
||||||
}
|
}
|
||||||
|
|
||||||
jobs := make(chan job, b.n*b.P)
|
dataCh := make(chan *partitionData) // unbuffered
|
||||||
var wg sync.WaitGroup
|
readJobs := make(chan readJob, totalJobs)
|
||||||
|
|
||||||
var errMu sync.Mutex
|
var errMu sync.Mutex
|
||||||
var firstErr error
|
var firstErr error
|
||||||
|
|
||||||
for w := 0; w < nWorkers; w++ {
|
// Fill job queue (buffered, all jobs pre-loaded)
|
||||||
wg.Add(1)
|
for s := 0; s < b.n; s++ {
|
||||||
|
for p := 0; p < b.P; p++ {
|
||||||
|
readJobs <- readJob{s, p}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close(readJobs)
|
||||||
|
|
||||||
|
// --- Stage 1: Readers (pure I/O) ---
|
||||||
|
var readWg sync.WaitGroup
|
||||||
|
for w := 0; w < nReaders; w++ {
|
||||||
|
readWg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer readWg.Done()
|
||||||
for j := range jobs {
|
for rj := range readJobs {
|
||||||
partSpec, partTop, err := b.finalizePartition(j.setIdx, j.partIdx, &counts[j.setIdx][j.partIdx])
|
skmers, err := b.loadPartitionRaw(rj.setIdx, rj.partIdx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
errMu.Lock()
|
errMu.Lock()
|
||||||
if firstErr == nil {
|
if firstErr == nil {
|
||||||
@@ -342,21 +409,62 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
|||||||
}
|
}
|
||||||
errMu.Unlock()
|
errMu.Unlock()
|
||||||
}
|
}
|
||||||
spectra[j.setIdx][j.partIdx] = partSpec
|
dataCh <- &partitionData{rj.setIdx, rj.partIdx, skmers}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
readWg.Wait()
|
||||||
|
close(dataCh)
|
||||||
|
}()
|
||||||
|
|
||||||
|
// --- Stage 2: Workers (CPU: extract k-mers + sort/filter + write .kdi) ---
|
||||||
|
var workWg sync.WaitGroup
|
||||||
|
for w := 0; w < nWorkers; w++ {
|
||||||
|
workWg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer workWg.Done()
|
||||||
|
for pd := range dataCh {
|
||||||
|
// CPU: extract canonical k-mers from super-kmers
|
||||||
|
kmers := extractCanonicalKmers(pd.skmers, b.k)
|
||||||
|
pd.skmers = nil // allow GC of raw super-kmers
|
||||||
|
|
||||||
|
// CPU: sort, dedup, filter
|
||||||
|
filtered, spectrum, topN := b.sortFilterPartition(kmers)
|
||||||
|
kmers = nil // allow GC of unsorted data
|
||||||
|
|
||||||
|
// I/O: write .kdi file
|
||||||
|
globalIdx := b.startIndex + pd.setIdx
|
||||||
|
kdiPath := filepath.Join(b.dir,
|
||||||
|
fmt.Sprintf("set_%d", globalIdx),
|
||||||
|
fmt.Sprintf("part_%04d.kdi", pd.partIdx))
|
||||||
|
|
||||||
|
n, err := b.writePartitionKdi(kdiPath, filtered)
|
||||||
|
if err != nil {
|
||||||
|
errMu.Lock()
|
||||||
|
if firstErr == nil {
|
||||||
|
firstErr = err
|
||||||
|
}
|
||||||
|
errMu.Unlock()
|
||||||
|
}
|
||||||
|
counts[pd.setIdx][pd.partIdx] = n
|
||||||
|
spectra[pd.setIdx][pd.partIdx] = spectrum
|
||||||
if topKmers != nil {
|
if topKmers != nil {
|
||||||
topKmers[j.setIdx][j.partIdx] = partTop
|
topKmers[pd.setIdx][pd.partIdx] = topN
|
||||||
|
}
|
||||||
|
if bar != nil {
|
||||||
|
bar.Add(1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
|
|
||||||
for s := 0; s < b.n; s++ {
|
workWg.Wait()
|
||||||
for p := 0; p < b.P; p++ {
|
|
||||||
jobs <- job{s, p}
|
if bar != nil {
|
||||||
|
fmt.Fprintln(os.Stderr)
|
||||||
}
|
}
|
||||||
}
|
|
||||||
close(jobs)
|
|
||||||
wg.Wait()
|
|
||||||
|
|
||||||
if firstErr != nil {
|
if firstErr != nil {
|
||||||
return nil, firstErr
|
return nil, firstErr
|
||||||
@@ -449,58 +557,89 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
|||||||
return ksg, nil
|
return ksg, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// finalizePartition processes a single partition: load SKM, extract k-mers,
|
// loadPartitionRaw reads a .skm file and returns raw super-kmers.
|
||||||
// sort, dedup/count, write KDI. Returns a partial frequency spectrum
|
// This is pure I/O — no k-mer extraction is done here.
|
||||||
// (frequency → count of distinct k-mers) computed before filtering,
|
// Returns nil (not an error) if the .skm file is empty or missing.
|
||||||
// and optionally the top-N most frequent k-mers.
|
func (b *KmerSetGroupBuilder) loadPartitionRaw(setIdx, partIdx int) ([]SuperKmer, error) {
|
||||||
func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint64) (map[int]uint64, *TopNKmers, error) {
|
|
||||||
// setIdx is local (0..n-1); build dirs use local index, output dirs use global
|
|
||||||
skmPath := filepath.Join(b.dir, ".build",
|
skmPath := filepath.Join(b.dir, ".build",
|
||||||
fmt.Sprintf("set_%d", setIdx),
|
fmt.Sprintf("set_%d", setIdx),
|
||||||
fmt.Sprintf("part_%04d.skm", partIdx))
|
fmt.Sprintf("part_%04d.skm", partIdx))
|
||||||
|
|
||||||
globalIdx := b.startIndex + setIdx
|
fi, err := os.Stat(skmPath)
|
||||||
kdiPath := filepath.Join(b.dir,
|
|
||||||
fmt.Sprintf("set_%d", globalIdx),
|
|
||||||
fmt.Sprintf("part_%04d.kdi", partIdx))
|
|
||||||
|
|
||||||
// Load super-kmers and extract canonical k-mers
|
|
||||||
reader, err := NewSkmReader(skmPath)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// If file doesn't exist or is empty, write empty KDI
|
return nil, nil // empty partition, not an error
|
||||||
return nil, nil, b.writeEmptyKdi(kdiPath, count)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var kmers []uint64
|
reader, err := NewSkmReader(skmPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Estimate capacity from file size. Each super-kmer record is
|
||||||
|
// 2 bytes (length) + packed bases (~k/4 bytes), so roughly
|
||||||
|
// (2 + k/4) bytes per super-kmer on average.
|
||||||
|
avgRecordSize := 2 + b.k/4
|
||||||
|
if avgRecordSize < 4 {
|
||||||
|
avgRecordSize = 4
|
||||||
|
}
|
||||||
|
estCount := int(fi.Size()) / avgRecordSize
|
||||||
|
|
||||||
|
skmers := make([]SuperKmer, 0, estCount)
|
||||||
for {
|
for {
|
||||||
sk, ok := reader.Next()
|
sk, ok := reader.Next()
|
||||||
if !ok {
|
if !ok {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
for kmer := range IterCanonicalKmers(sk.Sequence, b.k) {
|
skmers = append(skmers, sk)
|
||||||
kmers = append(kmers, kmer)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
reader.Close()
|
reader.Close()
|
||||||
|
|
||||||
|
return skmers, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractCanonicalKmers extracts all canonical k-mers from a slice of super-kmers.
// This is CPU-bound work (sliding-window forward/reverse complement).
//
// Parameters:
//   - skmers: super-kmers whose Sequence fields are scanned
//   - k: the k-mer size
//
// Returns a flat slice of all canonical k-mers, in super-kmer order.
func extractCanonicalKmers(skmers []SuperKmer, k int) []uint64 {
	// Pre-compute total capacity to avoid repeated slice growth.
	// Each super-kmer of length L yields L-k+1 canonical k-mers.
	total := 0
	for i := range skmers {
		n := len(skmers[i].Sequence) - k + 1
		if n > 0 {
			total += n
		}
	}

	kmers := make([]uint64, 0, total)
	for _, sk := range skmers {
		for kmer := range IterCanonicalKmers(sk.Sequence, k) {
			kmers = append(kmers, kmer)
		}
	}
	return kmers
}
|
||||||
|
|
||||||
|
// sortFilterPartition sorts, deduplicates, and filters k-mers in memory (CPU-bound).
|
||||||
|
// Returns the filtered sorted slice, frequency spectrum, and optional top-N.
|
||||||
|
func (b *KmerSetGroupBuilder) sortFilterPartition(kmers []uint64) ([]uint64, map[int]uint64, *TopNKmers) {
|
||||||
if len(kmers) == 0 {
|
if len(kmers) == 0 {
|
||||||
return nil, nil, b.writeEmptyKdi(kdiPath, count)
|
return nil, nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort
|
// Sort (CPU-bound) — slices.Sort avoids reflection overhead of sort.Slice
|
||||||
sort.Slice(kmers, func(i, j int) bool { return kmers[i] < kmers[j] })
|
slices.Sort(kmers)
|
||||||
|
|
||||||
// Write KDI based on mode
|
|
||||||
w, err := NewKdiWriter(kdiPath)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
minFreq := b.config.minFreq
|
minFreq := b.config.minFreq
|
||||||
if minFreq <= 0 {
|
if minFreq <= 0 {
|
||||||
minFreq = 1 // simple dedup
|
minFreq = 1 // simple dedup
|
||||||
}
|
}
|
||||||
maxFreq := b.config.maxFreq // 0 means no upper bound
|
maxFreq := b.config.maxFreq
|
||||||
|
|
||||||
|
// Prepare entropy filter if requested
|
||||||
|
var entropyFilter *KmerEntropyFilter
|
||||||
|
if b.config.entropyThreshold > 0 && b.config.entropyLevelMax > 0 {
|
||||||
|
entropyFilter = NewKmerEntropyFilter(b.k, b.config.entropyLevelMax, b.config.entropyThreshold)
|
||||||
|
}
|
||||||
|
|
||||||
// Prepare top-N collector if requested
|
// Prepare top-N collector if requested
|
||||||
var topN *TopNKmers
|
var topN *TopNKmers
|
||||||
@@ -508,8 +647,10 @@ func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint
|
|||||||
topN = NewTopNKmers(b.config.saveFreqTopN)
|
topN = NewTopNKmers(b.config.saveFreqTopN)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Linear scan: count consecutive identical values and accumulate spectrum
|
// Linear scan: count consecutive identical values, filter, accumulate spectrum
|
||||||
partSpectrum := make(map[int]uint64)
|
partSpectrum := make(map[int]uint64)
|
||||||
|
filtered := make([]uint64, 0, len(kmers)/2)
|
||||||
|
|
||||||
i := 0
|
i := 0
|
||||||
for i < len(kmers) {
|
for i < len(kmers) {
|
||||||
val := kmers[i]
|
val := kmers[i]
|
||||||
@@ -522,16 +663,33 @@ func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint
|
|||||||
topN.Add(val, c)
|
topN.Add(val, c)
|
||||||
}
|
}
|
||||||
if c >= minFreq && (maxFreq <= 0 || c <= maxFreq) {
|
if c >= minFreq && (maxFreq <= 0 || c <= maxFreq) {
|
||||||
if err := w.Write(val); err != nil {
|
if entropyFilter == nil || entropyFilter.Accept(val) {
|
||||||
w.Close()
|
filtered = append(filtered, val)
|
||||||
return nil, nil, err
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
i += c
|
i += c
|
||||||
}
|
}
|
||||||
|
|
||||||
*count = w.Count()
|
return filtered, partSpectrum, topN
|
||||||
return partSpectrum, topN, w.Close()
|
}
|
||||||
|
|
||||||
|
// writePartitionKdi writes a sorted slice of k-mers to a .kdi file (I/O-bound).
// Returns the number of k-mers written.
//
// On a write error the writer is closed (its Close error is intentionally
// discarded in favor of the original write error) and 0 is returned.
// On success, the Close error (e.g. flush failure) is propagated.
func (b *KmerSetGroupBuilder) writePartitionKdi(kdiPath string, kmers []uint64) (uint64, error) {
	w, err := NewKdiWriter(kdiPath)
	if err != nil {
		return 0, err
	}

	for _, val := range kmers {
		if err := w.Write(val); err != nil {
			w.Close()
			return 0, err
		}
	}

	// Capture the count before Close, since the final count is only
	// meaningful once all writes have succeeded.
	n := w.Count()
	return n, w.Close()
}
|
}
|
||||||
|
|
||||||
func (b *KmerSetGroupBuilder) writeEmptyKdi(path string, count *uint64) error {
|
func (b *KmerSetGroupBuilder) writeEmptyKdi(path string, count *uint64) error {
|
||||||
|
|||||||
@@ -128,6 +128,27 @@ func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
|
|||||||
return ksg, nil
|
return ksg, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NewFilteredKmerSetGroup creates a KmerSetGroup from pre-computed data.
// Used by the filter command to construct a new group after filtering partitions.
//
// Parameters:
//   - directory: on-disk location of the new group
//   - k, m, partitions, n: k-mer size, minimizer size, partition count, set count
//   - setsIDs: identifier of each set (expected length n)
//   - counts: total k-mer count per set (expected length n)
//   - setsMetadata: per-set metadata maps (expected length n)
//
// The slices are stored as-is (not copied); callers must not mutate them
// afterwards. The error result is always nil in the current implementation;
// it is kept for signature symmetry with OpenKmerSetGroup.
func NewFilteredKmerSetGroup(
	directory string, k, m, partitions, n int,
	setsIDs []string, counts []uint64,
	setsMetadata []map[string]interface{},
) (*KmerSetGroup, error) {
	ksg := &KmerSetGroup{
		path:         directory,
		k:            k,
		m:            m,
		partitions:   partitions,
		n:            n,
		setsIDs:      setsIDs,
		counts:       counts,
		setsMetadata: setsMetadata,
		Metadata:     make(map[string]interface{}),
	}
	return ksg, nil
}
|
||||||
|
|
||||||
// SaveMetadata writes the metadata.toml file. This is useful after
|
// SaveMetadata writes the metadata.toml file. This is useful after
|
||||||
// modifying attributes or IDs on an already-finalized index.
|
// modifying attributes or IDs on an already-finalized index.
|
||||||
func (ksg *KmerSetGroup) SaveMetadata() error {
|
func (ksg *KmerSetGroup) SaveMetadata() error {
|
||||||
|
|||||||
344
pkg/obitools/obik/filter.go
Normal file
344
pkg/obitools/obik/filter.go
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
|
||||||
|
"github.com/schollz/progressbar/v3"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// KmerFilter is a predicate applied to individual k-mers during filtering.
// Returns true if the k-mer should be kept.
type KmerFilter func(kmer uint64) bool

// KmerFilterFactory creates a new KmerFilter instance.
// Each goroutine should call the factory to get its own filter,
// since some filters (e.g. KmerEntropyFilter) are not thread-safe.
type KmerFilterFactory func() KmerFilter

// chainFilterFactories combines multiple KmerFilterFactory into one.
// The resulting factory creates a filter that accepts a k-mer only
// if all individual filters accept it.
func chainFilterFactories(factories []KmerFilterFactory) KmerFilterFactory {
	// No filters: everything passes.
	if len(factories) == 0 {
		return func() KmerFilter {
			return func(uint64) bool { return true }
		}
	}

	// A single filter needs no chaining overhead.
	if len(factories) == 1 {
		return factories[0]
	}

	// Several filters: instantiate each one per factory call and
	// short-circuit on the first rejection.
	return func() KmerFilter {
		chain := make([]KmerFilter, 0, len(factories))
		for _, mk := range factories {
			chain = append(chain, mk())
		}
		return func(kmer uint64) bool {
			for _, accept := range chain {
				if !accept(kmer) {
					return false
				}
			}
			return true
		}
	}
}
||||||
|
|
||||||
|
// runFilter implements the "obik filter" subcommand.
// It reads an existing kmer index, applies a chain of filters,
// and writes a new filtered index.
//
// On success, the destination directory contains one "set_<i>" directory
// per selected source set (re-numbered from 0), the filtered .kdi
// partitions, a copy of each set's spectrum.bin (when present), and the
// group metadata annotated with the applied filter parameters.
//
// NOTE(review): ctx is accepted but never consulted for cancellation —
// a long filtering run cannot be interrupted through the context;
// confirm whether this is intended.
//
// NOTE(review): on error the partially written destination directory is
// left on disk; confirm whether it should be cleaned up.
func runFilter(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
	if len(args) < 1 {
		return fmt.Errorf("usage: obik filter [options] <source_index> --out <dest_index>")
	}

	srcDir := args[0]
	destDir := CLIOutputDirectory()
	// "-" conventionally means stdout elsewhere in the CLI; a directory
	// output is mandatory here, so both empty and "-" are rejected.
	if destDir == "" || destDir == "-" {
		return fmt.Errorf("--out option is required and must specify a destination directory")
	}

	// Open source index
	src, err := obikmer.OpenKmerSetGroup(srcDir)
	if err != nil {
		return fmt.Errorf("failed to open source index: %w", err)
	}

	k := src.K()

	// Build filter factory chain from CLI options.
	// Factories are used so each goroutine creates its own filter instance,
	// since some filters (e.g. KmerEntropyFilter) have mutable state.
	var factories []KmerFilterFactory
	var filterDescriptions []string

	// Entropy filter
	entropyThreshold := CLIIndexEntropyThreshold()
	entropySize := CLIIndexEntropySize()
	if entropyThreshold > 0 {
		factories = append(factories, func() KmerFilter {
			ef := obikmer.NewKmerEntropyFilter(k, entropySize, entropyThreshold)
			return ef.Accept
		})
		filterDescriptions = append(filterDescriptions,
			fmt.Sprintf("entropy(threshold=%.4f, level-max=%d)", entropyThreshold, entropySize))
	}

	// Future filters will be added here, e.g.:
	// quorumFilter, frequencyFilter, ...

	if len(factories) == 0 {
		return fmt.Errorf("no filter specified; use --entropy-filter or other filter options")
	}

	filterFactory := chainFilterFactories(factories)

	// Resolve set selection (default: all sets)
	patterns := CLISetPatterns()
	var setIndices []int
	if len(patterns) > 0 {
		setIndices, err = src.MatchSetIDs(patterns)
		if err != nil {
			return fmt.Errorf("failed to match set patterns: %w", err)
		}
		if len(setIndices) == 0 {
			return fmt.Errorf("no sets match the given patterns")
		}
	} else {
		// No pattern given: select every set in the source group.
		setIndices = make([]int, src.Size())
		for i := range setIndices {
			setIndices[i] = i
		}
	}

	log.Infof("Filtering %d set(s) from %s with: %s",
		len(setIndices), srcDir, strings.Join(filterDescriptions, " + "))

	// Create destination directory
	if err := os.MkdirAll(destDir, 0755); err != nil {
		return fmt.Errorf("failed to create destination: %w", err)
	}

	P := src.Partitions()

	// Progress bar for partition filtering
	totalPartitions := len(setIndices) * P
	var bar *progressbar.ProgressBar
	if obidefault.ProgressBar() {
		pbopt := []progressbar.Option{
			progressbar.OptionSetWriter(os.Stderr),
			progressbar.OptionSetWidth(15),
			progressbar.OptionShowCount(),
			progressbar.OptionShowIts(),
			progressbar.OptionSetPredictTime(true),
			progressbar.OptionSetDescription("[Filtering partitions]"),
		}
		bar = progressbar.NewOptions(totalPartitions, pbopt...)
	}

	// Process each selected set
	newCounts := make([]uint64, len(setIndices))

	for si, srcIdx := range setIndices {
		setID := src.SetIDOf(srcIdx)
		if setID == "" {
			// Fall back to a synthetic ID for log messages only.
			setID = fmt.Sprintf("set_%d", srcIdx)
		}

		// Destination sets are re-numbered densely from 0 (si), which may
		// differ from the source index (srcIdx) when a pattern selects a subset.
		destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", si))
		if err := os.MkdirAll(destSetDir, 0755); err != nil {
			return fmt.Errorf("failed to create set directory: %w", err)
		}

		// Process partitions in parallel
		nWorkers := obidefault.ParallelWorkers()
		if nWorkers > P {
			nWorkers = P
		}

		var totalKept atomic.Uint64
		var totalProcessed atomic.Uint64

		type job struct {
			partIdx int
		}

		// jobs is buffered with capacity P and exactly P jobs are sent,
		// so the producer loop below can never block even if all workers
		// bail out early on error.
		jobs := make(chan job, P)
		var wg sync.WaitGroup
		var errMu sync.Mutex
		var firstErr error

		for w := 0; w < nWorkers; w++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				// Each goroutine gets its own filter instance
				workerFilter := filterFactory()
				for j := range jobs {
					kept, processed, err := filterPartition(
						src.PartitionPath(srcIdx, j.partIdx),
						filepath.Join(destSetDir, fmt.Sprintf("part_%04d.kdi", j.partIdx)),
						workerFilter,
					)
					if err != nil {
						// Record only the first error; this worker stops,
						// remaining jobs are drained by the other workers.
						errMu.Lock()
						if firstErr == nil {
							firstErr = err
						}
						errMu.Unlock()
						return
					}
					totalKept.Add(kept)
					totalProcessed.Add(processed)
					if bar != nil {
						// Progress display only; the returned error is ignored.
						bar.Add(1)
					}
				}
			}()
		}

		for p := 0; p < P; p++ {
			jobs <- job{p}
		}
		close(jobs)
		wg.Wait()

		if firstErr != nil {
			return fmt.Errorf("failed to filter set %q: %w", setID, firstErr)
		}

		kept := totalKept.Load()
		processed := totalProcessed.Load()
		newCounts[si] = kept
		// max(processed, 1) guards the division when the set holds no k-mers.
		log.Infof("Set %q: %d/%d k-mers kept (%.1f%% removed)",
			setID, kept, processed,
			100.0*float64(processed-kept)/float64(max(processed, 1)))

		// Copy spectrum.bin if it exists
		srcSpecPath := src.SpectrumPath(srcIdx)
		if _, err := os.Stat(srcSpecPath); err == nil {
			destSpecPath := filepath.Join(destSetDir, "spectrum.bin")
			if err := copyFileHelper(srcSpecPath, destSpecPath); err != nil {
				// Best effort: a missing spectrum copy does not fail the run.
				log.Warnf("Could not copy spectrum for set %q: %v", setID, err)
			}
		}
	}

	if bar != nil {
		// Terminate the progress-bar line before further log output.
		fmt.Fprintln(os.Stderr)
	}

	// Build destination metadata
	setsIDs := make([]string, len(setIndices))
	setsMetadata := make([]map[string]interface{}, len(setIndices))
	for i, srcIdx := range setIndices {
		setsIDs[i] = src.SetIDOf(srcIdx)
		setsMetadata[i] = src.AllSetMetadata(srcIdx)
		if setsMetadata[i] == nil {
			setsMetadata[i] = make(map[string]interface{})
		}
	}

	// Write metadata for the filtered index
	dest, err := obikmer.NewFilteredKmerSetGroup(
		destDir, k, src.M(), P,
		len(setIndices), setsIDs, newCounts, setsMetadata,
	)
	if err != nil {
		return fmt.Errorf("failed to create filtered metadata: %w", err)
	}

	// Copy group-level metadata and record applied filters
	for key, value := range src.Metadata {
		dest.SetAttribute(key, value)
	}
	if entropyThreshold > 0 {
		dest.SetAttribute("entropy_filter", entropyThreshold)
		dest.SetAttribute("entropy_filter_size", entropySize)
	}
	dest.SetAttribute("filtered_from", srcDir)

	if err := dest.SaveMetadata(); err != nil {
		return fmt.Errorf("failed to save metadata: %w", err)
	}

	log.Info("Done.")
	return nil
}
|
||||||
|
|
||||||
|
// filterPartition reads a single .kdi partition, applies the filter predicate,
|
||||||
|
// and writes the accepted k-mers to a new .kdi file.
|
||||||
|
// Returns (kept, processed, error).
|
||||||
|
func filterPartition(srcPath, destPath string, accept KmerFilter) (uint64, uint64, error) {
|
||||||
|
reader, err := obikmer.NewKdiReader(srcPath)
|
||||||
|
if err != nil {
|
||||||
|
// Empty partition — write empty KDI
|
||||||
|
w, err2 := obikmer.NewKdiWriter(destPath)
|
||||||
|
if err2 != nil {
|
||||||
|
return 0, 0, err2
|
||||||
|
}
|
||||||
|
return 0, 0, w.Close()
|
||||||
|
}
|
||||||
|
defer reader.Close()
|
||||||
|
|
||||||
|
w, err := obikmer.NewKdiWriter(destPath)
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var kept, processed uint64
|
||||||
|
for {
|
||||||
|
kmer, ok := reader.Next()
|
||||||
|
if !ok {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
processed++
|
||||||
|
if accept(kmer) {
|
||||||
|
if err := w.Write(kmer); err != nil {
|
||||||
|
w.Close()
|
||||||
|
return 0, 0, err
|
||||||
|
}
|
||||||
|
kept++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return kept, processed, w.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// copyFileHelper copies a file (used for spectrum.bin etc.)
|
||||||
|
func copyFileHelper(src, dst string) error {
|
||||||
|
in, err := os.Open(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer in.Close()
|
||||||
|
|
||||||
|
out, err := os.Create(dst)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer out.Close()
|
||||||
|
|
||||||
|
buf := make([]byte, 32*1024)
|
||||||
|
for {
|
||||||
|
n, readErr := in.Read(buf)
|
||||||
|
if n > 0 {
|
||||||
|
if _, writeErr := out.Write(buf[:n]); writeErr != nil {
|
||||||
|
return writeErr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if readErr != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out.Close()
|
||||||
|
}
|
||||||
@@ -33,6 +33,9 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
|
|||||||
|
|
||||||
maxOcc := CLIMaxOccurrence()
|
maxOcc := CLIMaxOccurrence()
|
||||||
|
|
||||||
|
entropyThreshold := CLIIndexEntropyThreshold()
|
||||||
|
entropySize := CLIIndexEntropySize()
|
||||||
|
|
||||||
// Build options
|
// Build options
|
||||||
var opts []obikmer.BuilderOption
|
var opts []obikmer.BuilderOption
|
||||||
if minOcc > 1 {
|
if minOcc > 1 {
|
||||||
@@ -44,6 +47,9 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
|
|||||||
if topN := CLISaveFreqKmer(); topN > 0 {
|
if topN := CLISaveFreqKmer(); topN > 0 {
|
||||||
opts = append(opts, obikmer.WithSaveFreqKmers(topN))
|
opts = append(opts, obikmer.WithSaveFreqKmers(topN))
|
||||||
}
|
}
|
||||||
|
if entropyThreshold > 0 {
|
||||||
|
opts = append(opts, obikmer.WithEntropyFilter(entropyThreshold, entropySize))
|
||||||
|
}
|
||||||
|
|
||||||
// Determine whether to append to existing group or create new
|
// Determine whether to append to existing group or create new
|
||||||
var builder *obikmer.KmerSetGroupBuilder
|
var builder *obikmer.KmerSetGroupBuilder
|
||||||
@@ -115,6 +121,11 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
|
|||||||
ksg.SetAttribute("max_occurrence", maxOcc)
|
ksg.SetAttribute("max_occurrence", maxOcc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if entropyThreshold > 0 {
|
||||||
|
ksg.SetAttribute("entropy_filter", entropyThreshold)
|
||||||
|
ksg.SetAttribute("entropy_filter_size", entropySize)
|
||||||
|
}
|
||||||
|
|
||||||
if err := ksg.SaveMetadata(); err != nil {
|
if err := ksg.SaveMetadata(); err != nil {
|
||||||
return fmt.Errorf("failed to save metadata: %w", err)
|
return fmt.Errorf("failed to save metadata: %w", err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -74,4 +74,11 @@ func OptionSet(opt *getoptions.GetOpt) {
|
|||||||
obiconvert.OutputOptionSet(matchCmd)
|
obiconvert.OutputOptionSet(matchCmd)
|
||||||
SetSelectionOptionSet(matchCmd)
|
SetSelectionOptionSet(matchCmd)
|
||||||
matchCmd.SetCommandFn(runMatch)
|
matchCmd.SetCommandFn(runMatch)
|
||||||
|
|
||||||
|
// filter: filter an index to remove low-complexity k-mers
|
||||||
|
filterCmd := opt.NewCommand("filter", "Filter a kmer index to remove low-complexity k-mers")
|
||||||
|
obiconvert.OutputModeOptionSet(filterCmd, false)
|
||||||
|
EntropyFilterOptionSet(filterCmd)
|
||||||
|
SetSelectionOptionSet(filterCmd)
|
||||||
|
filterCmd.SetCommandFn(runFilter)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -105,6 +105,8 @@ var _minOccurrence = 1
|
|||||||
var _maxOccurrence = 0
|
var _maxOccurrence = 0
|
||||||
var _saveFullFilter = false
|
var _saveFullFilter = false
|
||||||
var _saveFreqKmer = 0
|
var _saveFreqKmer = 0
|
||||||
|
var _indexEntropyThreshold = 0.0
|
||||||
|
var _indexEntropySize = 6
|
||||||
|
|
||||||
// KmerIndexOptionSet defines every option related to kmer index building.
|
// KmerIndexOptionSet defines every option related to kmer index building.
|
||||||
func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
||||||
@@ -133,6 +135,22 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
|||||||
|
|
||||||
options.IntVar(&_saveFreqKmer, "save-freq-kmer", _saveFreqKmer,
|
options.IntVar(&_saveFreqKmer, "save-freq-kmer", _saveFreqKmer,
|
||||||
options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv)."))
|
options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv)."))
|
||||||
|
|
||||||
|
options.Float64Var(&_indexEntropyThreshold, "entropy-filter", _indexEntropyThreshold,
|
||||||
|
options.Description("Filter low-complexity k-mers with entropy <= threshold (0 = disabled)."))
|
||||||
|
|
||||||
|
options.IntVar(&_indexEntropySize, "entropy-filter-size", _indexEntropySize,
|
||||||
|
options.Description("Maximum word size for entropy filter computation (default 6)."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// EntropyFilterOptionSet registers entropy filter options for commands
// that process existing indices (e.g. filter).
//
// It registers the same --entropy-filter and --entropy-filter-size flags
// as KmerIndexOptionSet, bound to the same package-level variables, so
// the CLIIndexEntropyThreshold / CLIIndexEntropySize accessors return the
// parsed values for both the index and filter commands.
func EntropyFilterOptionSet(options *getoptions.GetOpt) {
	options.Float64Var(&_indexEntropyThreshold, "entropy-filter", _indexEntropyThreshold,
		options.Description("Filter low-complexity k-mers with entropy <= threshold (0 = disabled)."))

	options.IntVar(&_indexEntropySize, "entropy-filter-size", _indexEntropySize,
		options.Description("Maximum word size for entropy filter computation (default 6)."))
}
||||||
|
|
||||||
// ==============================
|
// ==============================
|
||||||
@@ -262,6 +280,16 @@ func CLIKeepShorter() bool {
|
|||||||
return _keepShorter
|
return _keepShorter
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CLIIndexEntropyThreshold returns the entropy filter threshold for index building (0 = disabled).
//
// K-mers with normalized entropy <= this value are filtered out.
// The value is set by the --entropy-filter CLI option and defaults to 0,
// which disables entropy filtering entirely.
func CLIIndexEntropyThreshold() float64 {
	return _indexEntropyThreshold
}
|
||||||
|
|
||||||
|
// CLIIndexEntropySize returns the entropy filter word size for index building.
//
// This is the maximum sub-word size considered by the entropy computation,
// set by the --entropy-filter-size CLI option (default 6).
func CLIIndexEntropySize() int {
	return _indexEntropySize
}
|
||||||
|
|
||||||
// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output.
|
// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output.
|
||||||
func OutputFormatOptionSet(options *getoptions.GetOpt) {
|
func OutputFormatOptionSet(options *getoptions.GetOpt) {
|
||||||
options.BoolVar(&_jsonOutput, "json-output", false,
|
options.BoolVar(&_jsonOutput, "json-output", false,
|
||||||
|
|||||||
Reference in New Issue
Block a user