Add entropy-based filtering for k-mers

This commit introduces entropy-based filtering of k-mers to remove low-complexity sequences. It adds:

- A KmerEntropy function and a reusable KmerEntropyFilter type in pkg/obikmer/entropy.go for computing k-mer entropy and filtering on it
- Integration of entropy filtering into the k-mer set builder (pkg/obikmer/kmer_set_builder.go)
- A new 'filter' command in the obik tool (pkg/obitools/obik/filter.go) to apply entropy filtering to existing indices
- CLI options for configuring entropy filtering during index building and filtering

The entropy filter improves the quality of k-mer sets by removing repetitive sequences that may interfere with downstream analyses.
2026-05-09 15:40:40 +00:00 · 2026-02-10 18:19:57 +01:00
parent c6e04265f1
commit bebbbbfe7d
7 changed files with 910 additions and 60 deletions
@@ -0,0 +1,281 @@
+package obikmer
+
+import "math"
+
+// KmerEntropy computes the entropy of a single encoded k-mer.
+//
+// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer
+// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
+// normalizes them by circular canonical form, counts their frequencies, and
+// computes Shannon entropy normalized by the maximum possible entropy.
+// The returned value is the minimum entropy across all word sizes.
+//
+// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
+// while a value close to 1 indicates high complexity.
+//
+// Parameters:
+//   - kmer: the encoded k-mer (2 bits per base)
+//   - k: the k-mer size
+//   - levelMax: maximum sub-word size for entropy (typically 6)
+//
+// Returns:
+//   - minimum normalized entropy across all word sizes 1..levelMax
+func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
+	if k < 1 || levelMax < 1 {
+		return 1.0
+	}
+	if levelMax >= k {
+		levelMax = k - 1
+	}
+	if levelMax < 1 {
+		return 1.0
+	}
+
+	// Decode k-mer to DNA sequence
+	var seqBuf [32]byte
+	seq := DecodeKmer(kmer, k, seqBuf[:])
+
+	// Pre-compute nLogN lookup (same as lowmask)
+	nLogN := make([]float64, k+1)
+	for i := 1; i <= k; i++ {
+		nLogN[i] = float64(i) * math.Log(float64(i))
+	}
+
+	// Build circular-canonical normalization tables per word size
+	normTables := make([][]int, levelMax+1)
+	for ws := 1; ws <= levelMax; ws++ {
+		size := 1 << (ws * 2)
+		normTables[ws] = make([]int, size)
+		for code := 0; code < size; code++ {
+			normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
+		}
+	}
+
+	minEntropy := math.MaxFloat64
+
+	for ws := 1; ws <= levelMax; ws++ {
+		nwords := k - ws + 1
+		if nwords < 1 {
+			continue
+		}
+
+		// Count circular-canonical sub-word frequencies
+		tableSize := 1 << (ws * 2)
+		table := make([]int, tableSize)
+		mask := (1 << (ws * 2)) - 1
+
+		wordIndex := 0
+		for i := 0; i < ws-1; i++ {
+			wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
+		}
+
+		for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
+			wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
+			normWord := normTables[ws][wordIndex]
+			table[normWord]++
+		}
+
+		// Compute Shannon entropy
+		floatNwords := float64(nwords)
+		logNwords := math.Log(floatNwords)
+
+		var sumNLogN float64
+		for j := 0; j < tableSize; j++ {
+			n := table[j]
+			if n > 0 {
+				sumNLogN += nLogN[n]
+			}
+		}
+
+		// Compute emax (maximum possible entropy for this word size)
+		na := CanonicalCircularKmerCount(ws)
+		var emax float64
+		if nwords < na {
+			emax = math.Log(float64(nwords))
+		} else {
+			cov := nwords / na
+			remains := nwords - (na * cov)
+			f1 := float64(cov) / floatNwords
+			f2 := float64(cov+1) / floatNwords
+			emax = -(float64(na-remains)*f1*math.Log(f1) +
+				float64(remains)*f2*math.Log(f2))
+		}
+
+		if emax <= 0 {
+			continue
+		}
+
+		entropy := (logNwords - sumNLogN/floatNwords) / emax
+		if entropy < 0 {
+			entropy = 0
+		}
+
+		if entropy < minEntropy {
+			minEntropy = entropy
+		}
+	}
+
+	if minEntropy == math.MaxFloat64 {
+		return 1.0
+	}
+
+	return math.Round(minEntropy*10000) / 10000
+}
+
+// KmerEntropyFilter is a reusable entropy filter for batch processing.
+// It pre-computes normalization tables and lookup values to avoid repeated
+// allocation across millions of k-mers.
+//
+// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use, because
+// Entropy overwrites the shared freqTables scratch space on every call.
+// Each goroutine must create its own instance via NewKmerEntropyFilter.
+type KmerEntropyFilter struct {
+	k          int       // k-mer size, in bases
+	levelMax   int       // largest sub-word size evaluated; the constructor lowers it to k-1 and never lets it fall below 1
+	threshold  float64   // Accept rejects k-mers whose entropy is <= this value
+	nLogN      []float64 // nLogN[n] = n*ln(n) for n in 1..k; index 0 is left at 0
+	normTables [][]int   // normTables[ws][code] = NormalizeCircular(code, ws); index 0 unused
+	emaxValues []float64 // emaxValues[ws] = maximum entropy used to normalize word size ws; index 0 unused
+	logNwords  []float64 // logNwords[ws] = ln(k-ws+1), the log of the number of sub-words of size ws; index 0 unused
+	// Pre-allocated frequency tables reused across Entropy() calls.
+	// One per word size (index 0 unused). Reset to zero before each use.
+	freqTables [][]int
+}
+
+// NewKmerEntropyFilter builds a KmerEntropyFilter for k-mers of size k and
+// fills all of its lookup tables once, so that they can be reused by every
+// later call to Entropy or Accept.
+//
+// Parameters:
+//   - k: the k-mer size
+//   - levelMax: maximum sub-word size for entropy (typically 6); a value
+//     greater than or equal to k is lowered to k-1, and the result is raised
+//     to 1 if it would otherwise be smaller
+//   - threshold: entropy threshold (k-mers with entropy <= threshold are
+//     rejected by Accept)
+func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
+	// Cap the sub-word size below k, but always keep at least size 1.
+	levelMax = max(min(levelMax, k-1), 1)
+
+	ef := &KmerEntropyFilter{
+		k:          k,
+		levelMax:   levelMax,
+		threshold:  threshold,
+		nLogN:      make([]float64, k+1),
+		normTables: make([][]int, levelMax+1),
+		emaxValues: make([]float64, levelMax+1),
+		logNwords:  make([]float64, levelMax+1),
+		freqTables: make([][]int, levelMax+1),
+	}
+
+	// n*ln(n) for every possible occurrence count of a sub-word.
+	for n := 1; n <= k; n++ {
+		x := float64(n)
+		ef.nLogN[n] = x * math.Log(x)
+	}
+
+	for ws := 1; ws <= levelMax; ws++ {
+		// 4^ws possible words of ws bases at 2 bits per base.
+		nCodes := 1 << (ws * 2)
+
+		// Map every raw word code to its circular-canonical representative.
+		canon := make([]int, nCodes)
+		for code := range canon {
+			canon[code] = int(NormalizeCircular(uint64(code), ws))
+		}
+		ef.normTables[ws] = canon
+
+		// Scratch counters, zeroed again by Entropy before each use.
+		ef.freqTables[ws] = make([]int, nCodes)
+
+		nWords := k - ws + 1
+		total := float64(nWords)
+		ef.logNwords[ws] = math.Log(total)
+
+		classes := CanonicalCircularKmerCount(ws)
+		if nWords < classes {
+			// Fewer words than classes: the most even spread puts each
+			// word in its own class.
+			ef.emaxValues[ws] = math.Log(total)
+			continue
+		}
+
+		// Otherwise the most even spread gives every class `base` words and
+		// `extra` of the classes one additional word.
+		base := nWords / classes
+		extra := nWords - (classes * base)
+		pBase := float64(base) / total
+		pPlus := float64(base+1) / total
+		ef.emaxValues[ws] = -(float64(classes-extra)*pBase*math.Log(pBase) +
+			float64(extra)*pPlus*math.Log(pPlus))
+	}
+
+	return ef
+}
+
+// Accept reports whether the k-mer passes the filter, i.e. whether its
+// entropy is strictly greater than the configured threshold. K-mers whose
+// entropy is equal to or below the threshold are rejected as low-complexity.
+func (ef *KmerEntropyFilter) Accept(kmer uint64) bool {
+	score := ef.Entropy(kmer)
+	return score > ef.threshold
+}
+
+// Entropy returns the normalized entropy of one encoded k-mer, using the
+// tables prepared by NewKmerEntropyFilter.
+//
+// For every sub-word size ws in 1..levelMax it slides a window of ws bases
+// over the decoded k-mer, maps each window to its circular-canonical code and
+// tallies the codes in the scratch table for that size. The Shannon entropy
+// of the tally is divided by the pre-computed maximum for ws, and the
+// smallest ratio over all sizes is returned, rounded to four decimals.
+// Sizes whose maximum entropy is not positive are skipped; if every size is
+// skipped the result is 1.0.
+//
+// The scratch tables are overwritten on each call, so this method must not
+// be called concurrently on the same filter.
+func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
+	size := ef.k
+
+	// Decode into a stack buffer; 32 bases is what a uint64 holds at 2 bits each.
+	var decodeBuf [32]byte
+	bases := DecodeKmer(kmer, size, decodeBuf[:])
+
+	lowest := math.MaxFloat64
+	for ws := 1; ws <= ef.levelMax; ws++ {
+		windows := size - ws + 1
+		norm := ef.emaxValues[ws]
+		if windows < 1 || norm <= 0 {
+			continue
+		}
+
+		counts := ef.freqTables[ws]
+		clear(counts) // start from an all-zero tally
+		canon := ef.normTables[ws]
+		keep := 1<<(ws*2) - 1 // keeps the low 2*ws bits, i.e. the last ws bases
+
+		// Prime the rolling code with the first ws-1 bases.
+		code := 0
+		for pos := 0; pos < ws-1; pos++ {
+			code = (code << 2) + int(EncodeNucleotide(bases[pos]))
+		}
+
+		// Each further base completes one window.
+		for pos := ws - 1; pos < size; pos++ {
+			code = ((code << 2) & keep) + int(EncodeNucleotide(bases[pos]))
+			counts[canon[code]]++
+		}
+
+		// Shannon entropy in the form ln(N) - (sum of n*ln(n)) / N.
+		var weighted float64
+		for _, c := range counts {
+			if c > 0 {
+				weighted += ef.nLogN[c]
+			}
+		}
+
+		ratio := (ef.logNwords[ws] - weighted/float64(windows)) / norm
+		if ratio < 0 {
+			ratio = 0
+		}
+		if ratio < lowest {
+			lowest = ratio
+		}
+	}
+
+	// No word size produced a value.
+	if lowest == math.MaxFloat64 {
+		return 1.0
+	}
+
+	return math.Round(lowest*10000) / 10000
+}
@@ -5,20 +5,23 @@ import (
 	"math"
 	"os"
 	"path/filepath"
-	"runtime"
-	"sort"
+	"slices"
 	"sync"

+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+	"github.com/schollz/progressbar/v3"
 )

 // BuilderOption is a functional option for KmerSetGroupBuilder.
 type BuilderOption func(*builderConfig)

 type builderConfig struct {
-	minFreq      int // 0 means no frequency filtering (simple dedup)
-	maxFreq      int // 0 means no upper bound
-	saveFreqTopN int // >0 means save the N most frequent k-mers per set to CSV
+	minFreq          int     // 0 means no frequency filtering (simple dedup)
+	maxFreq          int     // 0 means no upper bound
+	saveFreqTopN     int     // >0 means save the N most frequent k-mers per set to CSV
+	entropyThreshold float64 // >0 (with entropyLevelMax > 0) means discard k-mers with entropy <= threshold
+	entropyLevelMax  int     // max sub-word size for entropy (typically 6); must be > 0 for the entropy filter to be built
 }

 // WithMinFrequency activates frequency filtering mode.
@@ -45,6 +48,16 @@ func WithSaveFreqKmers(n int) BuilderOption {
 	}
 }

+// WithEntropyFilter returns a BuilderOption that records the settings for
+// entropy-based low-complexity filtering in the builder configuration.
+// K-mers with entropy <= threshold are discarded during finalization;
+// levelMax is the maximum sub-word size used to compute the entropy
+// (typically 6). The option performs no validation of either value.
+func WithEntropyFilter(threshold float64, levelMax int) BuilderOption {
+	return func(cfg *builderConfig) {
+		cfg.entropyLevelMax = levelMax
+		cfg.entropyThreshold = threshold
+	}
+}
+
 // KmerSetGroupBuilder constructs a KmerSetGroup on disk.
 // During construction, super-kmers are written to temporary .skm files
 // partitioned by minimizer. On Close(), each partition is finalized
@@ -299,7 +312,17 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
 		}
 	}

-	// Process partitions in parallel
+	// =====================================================================
+	// 2-stage pipeline: readers (pure I/O) → workers (CPU + write)
+	//
+	// - nReaders goroutines read .skm files (pure I/O, fast)
+	// - nWorkers goroutines extract k-mers, sort, dedup, filter, write .kdi
+	//
+	// One unbuffered channel between stages. Readers are truly I/O-bound
+	// (small files, buffered reads), workers are CPU-bound and stay busy.
+	// =====================================================================
+	totalJobs := b.n * b.P
+
 	counts := make([][]uint64, b.n)
 	spectra := make([][]map[int]uint64, b.n)
 	var topKmers [][]*TopNKmers
@@ -314,27 +337,71 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
 		}
 	}

-	nWorkers := runtime.NumCPU()
-	if nWorkers > b.P {
-		nWorkers = b.P
+	nCPU := obidefault.ParallelWorkers()
+
+	// Stage sizing
+	nWorkers := nCPU     // CPU-bound: one per core
+	nReaders := nCPU / 4 // pure I/O: few goroutines suffice
+	if nReaders < 2 {
+		nReaders = 2
+	}
+	if nReaders > 4 {
+		nReaders = 4
+	}
+	if nWorkers > totalJobs {
+		nWorkers = totalJobs
+	}
+	if nReaders > totalJobs {
+		nReaders = totalJobs
 	}

-	type job struct {
+	var bar *progressbar.ProgressBar
+	if obidefault.ProgressBar() {
+		pbopt := []progressbar.Option{
+			progressbar.OptionSetWriter(os.Stderr),
+			progressbar.OptionSetWidth(15),
+			progressbar.OptionShowCount(),
+			progressbar.OptionShowIts(),
+			progressbar.OptionSetPredictTime(true),
+			progressbar.OptionSetDescription("[Finalizing partitions]"),
+		}
+		bar = progressbar.NewOptions(totalJobs, pbopt...)
+	}
+
+	// --- Channel types ---
+	type partitionData struct {
+		setIdx  int
+		partIdx int
+		skmers  []SuperKmer // raw super-kmers from I/O stage
+	}
+
+	type readJob struct {
 		setIdx  int
 		partIdx int
 	}

-	jobs := make(chan job, b.n*b.P)
-	var wg sync.WaitGroup
+	dataCh := make(chan *partitionData) // unbuffered
+	readJobs := make(chan readJob, totalJobs)
+
 	var errMu sync.Mutex
 	var firstErr error

-	for w := 0; w < nWorkers; w++ {
-		wg.Add(1)
+	// Fill job queue (buffered, all jobs pre-loaded)
+	for s := 0; s < b.n; s++ {
+		for p := 0; p < b.P; p++ {
+			readJobs <- readJob{s, p}
+		}
+	}
+	close(readJobs)
+
+	// --- Stage 1: Readers (pure I/O) ---
+	var readWg sync.WaitGroup
+	for w := 0; w < nReaders; w++ {
+		readWg.Add(1)
 		go func() {
-			defer wg.Done()
-			for j := range jobs {
-				partSpec, partTop, err := b.finalizePartition(j.setIdx, j.partIdx, &counts[j.setIdx][j.partIdx])
+			defer readWg.Done()
+			for rj := range readJobs {
+				skmers, err := b.loadPartitionRaw(rj.setIdx, rj.partIdx)
 				if err != nil {
 					errMu.Lock()
 					if firstErr == nil {
@@ -342,21 +409,62 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
 					}
 					errMu.Unlock()
 				}
-				spectra[j.setIdx][j.partIdx] = partSpec
+				dataCh <- &partitionData{rj.setIdx, rj.partIdx, skmers}
+			}
+		}()
+	}
+
+	go func() {
+		readWg.Wait()
+		close(dataCh)
+	}()
+
+	// --- Stage 2: Workers (CPU: extract k-mers + sort/filter + write .kdi) ---
+	var workWg sync.WaitGroup
+	for w := 0; w < nWorkers; w++ {
+		workWg.Add(1)
+		go func() {
+			defer workWg.Done()
+			for pd := range dataCh {
+				// CPU: extract canonical k-mers from super-kmers
+				kmers := extractCanonicalKmers(pd.skmers, b.k)
+				pd.skmers = nil // allow GC of raw super-kmers
+
+				// CPU: sort, dedup, filter
+				filtered, spectrum, topN := b.sortFilterPartition(kmers)
+				kmers = nil // allow GC of unsorted data
+
+				// I/O: write .kdi file
+				globalIdx := b.startIndex + pd.setIdx
+				kdiPath := filepath.Join(b.dir,
+					fmt.Sprintf("set_%d", globalIdx),
+					fmt.Sprintf("part_%04d.kdi", pd.partIdx))
+
+				n, err := b.writePartitionKdi(kdiPath, filtered)
+				if err != nil {
+					errMu.Lock()
+					if firstErr == nil {
+						firstErr = err
+					}
+					errMu.Unlock()
+				}
+				counts[pd.setIdx][pd.partIdx] = n
+				spectra[pd.setIdx][pd.partIdx] = spectrum
 				if topKmers != nil {
-					topKmers[j.setIdx][j.partIdx] = partTop
+					topKmers[pd.setIdx][pd.partIdx] = topN
+				}
+				if bar != nil {
+					bar.Add(1)
 				}
 			}
 		}()
 	}

-	for s := 0; s < b.n; s++ {
-		for p := 0; p < b.P; p++ {
-			jobs <- job{s, p}
-		}
+	workWg.Wait()
+
+	if bar != nil {
+		fmt.Fprintln(os.Stderr)
 	}
-	close(jobs)
-	wg.Wait()

 	if firstErr != nil {
 		return nil, firstErr
@@ -449,58 +557,89 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
 	return ksg, nil
 }

-// finalizePartition processes a single partition: load SKM, extract k-mers,
-// sort, dedup/count, write KDI. Returns a partial frequency spectrum
-// (frequency → count of distinct k-mers) computed before filtering,
-// and optionally the top-N most frequent k-mers.
-func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint64) (map[int]uint64, *TopNKmers, error) {
-	// setIdx is local (0..n-1); build dirs use local index, output dirs use global
+// loadPartitionRaw reads the .skm file of one (set, partition) pair from the
+// .build directory and returns its raw super-kmers.
+// This is pure I/O — no k-mer extraction is done here.
+//
+// Any failure to stat or open the file is treated as an empty partition and
+// yields (nil, nil). As written, every return path has a nil error, so
+// callers never observe a non-nil error from this function.
+// NOTE(review): this also hides errors other than "file does not exist"
+// (e.g. permission problems) — confirm that this is intended.
+func (b *KmerSetGroupBuilder) loadPartitionRaw(setIdx, partIdx int) ([]SuperKmer, error) {
 	skmPath := filepath.Join(b.dir, ".build",
 		fmt.Sprintf("set_%d", setIdx),
 		fmt.Sprintf("part_%04d.skm", partIdx))

-	globalIdx := b.startIndex + setIdx
-	kdiPath := filepath.Join(b.dir,
-		fmt.Sprintf("set_%d", globalIdx),
-		fmt.Sprintf("part_%04d.kdi", partIdx))
-
-	// Load super-kmers and extract canonical k-mers
-	reader, err := NewSkmReader(skmPath)
+	fi, err := os.Stat(skmPath)
 	if err != nil {
-		// If file doesn't exist or is empty, write empty KDI
-		return nil, nil, b.writeEmptyKdi(kdiPath, count)
+		return nil, nil // any Stat failure is treated as an empty partition, not an error
 	}

-	var kmers []uint64
+	reader, err := NewSkmReader(skmPath)
+	if err != nil {
+		return nil, nil // open failures are likewise reported as an empty partition
+	}
+
+	// Estimate capacity from file size. Each super-kmer record is
+	// 2 bytes (length) + packed bases (~k/4 bytes), so roughly
+	// (2 + k/4) bytes per super-kmer on average.
+	// This is only a capacity hint: append grows the slice if it is too small.
+	avgRecordSize := 2 + b.k/4
+	if avgRecordSize < 4 {
+		avgRecordSize = 4
+	}
+	estCount := int(fi.Size()) / avgRecordSize
+
+	skmers := make([]SuperKmer, 0, estCount)
 	for {
 		sk, ok := reader.Next()
 		if !ok {
 			break
 		}
-		for kmer := range IterCanonicalKmers(sk.Sequence, b.k) {
-			kmers = append(kmers, kmer)
-		}
+		skmers = append(skmers, sk)
 	}
 	reader.Close()

+	return skmers, nil
+}
+
+// extractCanonicalKmers flattens a batch of super-kmers into the canonical
+// k-mers yielded by IterCanonicalKmers, keeping the input order. Duplicates
+// are kept: the result is neither sorted nor deduplicated.
+// This is CPU-bound work (sliding-window forward/reverse complement).
+func extractCanonicalKmers(skmers []SuperKmer, k int) []uint64 {
+	// Size the output once up front. A super-kmer of length L is expected to
+	// yield L-k+1 canonical k-mers, and none when it is shorter than k.
+	capacity := 0
+	for idx := range skmers {
+		capacity += max(len(skmers[idx].Sequence)-k+1, 0)
+	}
+
+	out := make([]uint64, 0, capacity)
+	for idx := range skmers {
+		for km := range IterCanonicalKmers(skmers[idx].Sequence, k) {
+			out = append(out, km)
+		}
+	}
+	return out
+}
+
+// sortFilterPartition sorts, deduplicates, and filters k-mers in memory (CPU-bound).
+// Returns the filtered sorted slice, frequency spectrum, and optional top-N.
+func (b *KmerSetGroupBuilder) sortFilterPartition(kmers []uint64) ([]uint64, map[int]uint64, *TopNKmers) {
 	if len(kmers) == 0 {
-		return nil, nil, b.writeEmptyKdi(kdiPath, count)
+		return nil, nil, nil
 	}

-	// Sort
-	sort.Slice(kmers, func(i, j int) bool { return kmers[i] < kmers[j] })
-
-	// Write KDI based on mode
-	w, err := NewKdiWriter(kdiPath)
-	if err != nil {
-		return nil, nil, err
-	}
+	// Sort (CPU-bound) — slices.Sort avoids reflection overhead of sort.Slice
+	slices.Sort(kmers)

 	minFreq := b.config.minFreq
 	if minFreq <= 0 {
 		minFreq = 1 // simple dedup
 	}
-	maxFreq := b.config.maxFreq // 0 means no upper bound
+	maxFreq := b.config.maxFreq
+
+	// Prepare entropy filter if requested
+	var entropyFilter *KmerEntropyFilter
+	if b.config.entropyThreshold > 0 && b.config.entropyLevelMax > 0 {
+		entropyFilter = NewKmerEntropyFilter(b.k, b.config.entropyLevelMax, b.config.entropyThreshold)
+	}

 	// Prepare top-N collector if requested
 	var topN *TopNKmers
@@ -508,8 +647,10 @@ func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint
 		topN = NewTopNKmers(b.config.saveFreqTopN)
 	}

-	// Linear scan: count consecutive identical values and accumulate spectrum
+	// Linear scan: count consecutive identical values, filter, accumulate spectrum
 	partSpectrum := make(map[int]uint64)
+	filtered := make([]uint64, 0, len(kmers)/2)
+
 	i := 0
 	for i < len(kmers) {
 		val := kmers[i]
@@ -522,16 +663,33 @@ func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint
 			topN.Add(val, c)
 		}
 		if c >= minFreq && (maxFreq <= 0 || c <= maxFreq) {
-			if err := w.Write(val); err != nil {
-				w.Close()
-				return nil, nil, err
+			if entropyFilter == nil || entropyFilter.Accept(val) {
+				filtered = append(filtered, val)
 			}
 		}
 		i += c
 	}

-	*count = w.Count()
-	return partSpectrum, topN, w.Close()
+	return filtered, partSpectrum, topN
+}
+
+// writePartitionKdi creates the .kdi file at kdiPath and writes the given
+// k-mers to it in slice order (I/O-bound). The file is created even when
+// kmers is empty.
+//
+// It returns the writer's count of k-mers together with the error from
+// closing the file. If the file cannot be created or a write fails, it
+// returns 0 and that error; after a failed write the writer is still closed,
+// and the error from that close is ignored.
+func (b *KmerSetGroupBuilder) writePartitionKdi(kdiPath string, kmers []uint64) (uint64, error) {
+	out, err := NewKdiWriter(kdiPath)
+	if err != nil {
+		return 0, err
+	}
+
+	for idx := range kmers {
+		if err = out.Write(kmers[idx]); err != nil {
+			out.Close()
+			return 0, err
+		}
+	}
+
+	// Read the count before closing, as the original ordering does.
+	written := out.Count()
+	err = out.Close()
+	return written, err
+}

 func (b *KmerSetGroupBuilder) writeEmptyKdi(path string, count *uint64) error {
@@ -128,6 +128,27 @@ func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
 	return ksg, nil
 }

+// NewFilteredKmerSetGroup assembles an in-memory KmerSetGroup from values
+// that were computed elsewhere. It is used by the filter command to describe
+// the new group once its partitions have been filtered.
+//
+// The function only fills in the struct: the slices are stored as given
+// (not copied), Metadata starts as an empty map, and the returned error is
+// always nil.
+func NewFilteredKmerSetGroup(
+	directory string, k, m, partitions, n int,
+	setsIDs []string, counts []uint64,
+	setsMetadata []map[string]interface{},
+) (*KmerSetGroup, error) {
+	group := new(KmerSetGroup)
+
+	group.path = directory
+	group.k = k
+	group.m = m
+	group.partitions = partitions
+	group.n = n
+
+	group.setsIDs = setsIDs
+	group.counts = counts
+	group.setsMetadata = setsMetadata
+	group.Metadata = map[string]interface{}{}
+
+	return group, nil
+}
+
 // SaveMetadata writes the metadata.toml file. This is useful after
 // modifying attributes or IDs on an already-finalized index.
 func (ksg *KmerSetGroup) SaveMetadata() error {