Add entropy-based filtering for k-mers

This commit introduces entropy-based filtering for k-mers to remove low-complexity sequences. It adds:

- New KmerEntropy and KmerEntropyFilter functions in pkg/obikmer/entropy.go for computing k-mer entropy and filtering k-mers by it
- Integration of entropy filtering in the k-mer set builder (pkg/obikmer/kmer_set_builder.go)
- A new 'filter' command in the obik tool (pkg/obitools/obik/filter.go) to apply entropy filtering to existing indices
- CLI options for configuring entropy filtering during index building and filtering

The entropy filter helps improve the quality of k-mer sets by removing repetitive sequences that may interfere with downstream analyses.
This commit is contained in:
Eric Coissac
2026-02-10 18:19:57 +01:00
parent c6e04265f1
commit bebbbbfe7d
7 changed files with 910 additions and 60 deletions

View File

@@ -33,6 +33,9 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
maxOcc := CLIMaxOccurrence()
entropyThreshold := CLIIndexEntropyThreshold()
entropySize := CLIIndexEntropySize()
// Build options
var opts []obikmer.BuilderOption
if minOcc > 1 {
@@ -44,6 +47,9 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
if topN := CLISaveFreqKmer(); topN > 0 {
opts = append(opts, obikmer.WithSaveFreqKmers(topN))
}
if entropyThreshold > 0 {
opts = append(opts, obikmer.WithEntropyFilter(entropyThreshold, entropySize))
}
// Determine whether to append to existing group or create new
var builder *obikmer.KmerSetGroupBuilder
@@ -115,6 +121,11 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
ksg.SetAttribute("max_occurrence", maxOcc)
}
if entropyThreshold > 0 {
ksg.SetAttribute("entropy_filter", entropyThreshold)
ksg.SetAttribute("entropy_filter_size", entropySize)
}
if err := ksg.SaveMetadata(); err != nil {
return fmt.Errorf("failed to save metadata: %w", err)
}