mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 21:40:52 +00:00
Add entropy-based filtering for k-mers
This commit introduces entropy-based filtering for k-mers to remove low-complexity sequences. It adds: - New KmerEntropy and KmerEntropyFilter functions in pkg/obikmer/entropy.go for computing and filtering k-mer entropy - Integration of entropy filtering in the k-mer set builder (pkg/obikmer/kmer_set_builder.go) - A new 'filter' command in obik tool (pkg/obitools/obik/filter.go) to apply entropy filtering on existing indices - CLI options for configuring entropy filtering during index building and filtering The entropy filter helps improve the quality of k-mer sets by removing repetitive sequences that may interfere with downstream analyses.
This commit is contained in:
281
pkg/obikmer/entropy.go
Normal file
281
pkg/obikmer/entropy.go
Normal file
@@ -0,0 +1,281 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import "math"
|
||||||
|
|
||||||
|
// KmerEntropy computes the entropy of a single encoded k-mer.
|
||||||
|
//
|
||||||
|
// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer
|
||||||
|
// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
|
||||||
|
// normalizes them by circular canonical form, counts their frequencies, and
|
||||||
|
// computes Shannon entropy normalized by the maximum possible entropy.
|
||||||
|
// The returned value is the minimum entropy across all word sizes.
|
||||||
|
//
|
||||||
|
// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
|
||||||
|
// while a value close to 1 indicates high complexity.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - kmer: the encoded k-mer (2 bits per base)
|
||||||
|
// - k: the k-mer size
|
||||||
|
// - levelMax: maximum sub-word size for entropy (typically 6)
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - minimum normalized entropy across all word sizes 1..levelMax
|
||||||
|
func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
|
||||||
|
if k < 1 || levelMax < 1 {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
if levelMax >= k {
|
||||||
|
levelMax = k - 1
|
||||||
|
}
|
||||||
|
if levelMax < 1 {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode k-mer to DNA sequence
|
||||||
|
var seqBuf [32]byte
|
||||||
|
seq := DecodeKmer(kmer, k, seqBuf[:])
|
||||||
|
|
||||||
|
// Pre-compute nLogN lookup (same as lowmask)
|
||||||
|
nLogN := make([]float64, k+1)
|
||||||
|
for i := 1; i <= k; i++ {
|
||||||
|
nLogN[i] = float64(i) * math.Log(float64(i))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build circular-canonical normalization tables per word size
|
||||||
|
normTables := make([][]int, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
size := 1 << (ws * 2)
|
||||||
|
normTables[ws] = make([]int, size)
|
||||||
|
for code := 0; code < size; code++ {
|
||||||
|
normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
minEntropy := math.MaxFloat64
|
||||||
|
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
nwords := k - ws + 1
|
||||||
|
if nwords < 1 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count circular-canonical sub-word frequencies
|
||||||
|
tableSize := 1 << (ws * 2)
|
||||||
|
table := make([]int, tableSize)
|
||||||
|
mask := (1 << (ws * 2)) - 1
|
||||||
|
|
||||||
|
wordIndex := 0
|
||||||
|
for i := 0; i < ws-1; i++ {
|
||||||
|
wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
|
||||||
|
wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
|
||||||
|
normWord := normTables[ws][wordIndex]
|
||||||
|
table[normWord]++
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute Shannon entropy
|
||||||
|
floatNwords := float64(nwords)
|
||||||
|
logNwords := math.Log(floatNwords)
|
||||||
|
|
||||||
|
var sumNLogN float64
|
||||||
|
for j := 0; j < tableSize; j++ {
|
||||||
|
n := table[j]
|
||||||
|
if n > 0 {
|
||||||
|
sumNLogN += nLogN[n]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute emax (maximum possible entropy for this word size)
|
||||||
|
na := CanonicalCircularKmerCount(ws)
|
||||||
|
var emax float64
|
||||||
|
if nwords < na {
|
||||||
|
emax = math.Log(float64(nwords))
|
||||||
|
} else {
|
||||||
|
cov := nwords / na
|
||||||
|
remains := nwords - (na * cov)
|
||||||
|
f1 := float64(cov) / floatNwords
|
||||||
|
f2 := float64(cov+1) / floatNwords
|
||||||
|
emax = -(float64(na-remains)*f1*math.Log(f1) +
|
||||||
|
float64(remains)*f2*math.Log(f2))
|
||||||
|
}
|
||||||
|
|
||||||
|
if emax <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
entropy := (logNwords - sumNLogN/floatNwords) / emax
|
||||||
|
if entropy < 0 {
|
||||||
|
entropy = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
if entropy < minEntropy {
|
||||||
|
minEntropy = entropy
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if minEntropy == math.MaxFloat64 {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
|
||||||
|
return math.Round(minEntropy*10000) / 10000
|
||||||
|
}
|
||||||
|
|
||||||
|
// KmerEntropyFilter is a reusable entropy filter for batch processing.
// It pre-computes normalization tables and lookup values to avoid repeated
// allocation across millions of k-mers.
//
// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
// Each goroutine must create its own instance via NewKmerEntropyFilter.
type KmerEntropyFilter struct {
	k         int     // k-mer size
	levelMax  int     // maximum sub-word size considered (clamped to k-1 by the constructor)
	threshold float64 // entropy threshold used by Accept (entropy <= threshold is rejected)

	nLogN      []float64 // n*log(n) lookup for n in [0, k]
	normTables [][]int   // circular-canonical code tables, indexed by word size
	emaxValues []float64 // maximum possible entropy per word size
	logNwords  []float64 // log(number of sub-words) per word size

	// Pre-allocated frequency tables reused across Entropy() calls.
	// One per word size (index 0 unused). Reset to zero before each use.
	freqTables [][]int
}
|
||||||
|
|
||||||
|
// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - k: the k-mer size
|
||||||
|
// - levelMax: maximum sub-word size for entropy (typically 6)
|
||||||
|
// - threshold: entropy threshold (k-mers with entropy <= threshold are rejected)
|
||||||
|
func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
|
||||||
|
if levelMax >= k {
|
||||||
|
levelMax = k - 1
|
||||||
|
}
|
||||||
|
if levelMax < 1 {
|
||||||
|
levelMax = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
nLogN := make([]float64, k+1)
|
||||||
|
for i := 1; i <= k; i++ {
|
||||||
|
nLogN[i] = float64(i) * math.Log(float64(i))
|
||||||
|
}
|
||||||
|
|
||||||
|
normTables := make([][]int, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
size := 1 << (ws * 2)
|
||||||
|
normTables[ws] = make([]int, size)
|
||||||
|
for code := 0; code < size; code++ {
|
||||||
|
normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
emaxValues := make([]float64, levelMax+1)
|
||||||
|
logNwords := make([]float64, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
nw := k - ws + 1
|
||||||
|
na := CanonicalCircularKmerCount(ws)
|
||||||
|
if nw < na {
|
||||||
|
logNwords[ws] = math.Log(float64(nw))
|
||||||
|
emaxValues[ws] = math.Log(float64(nw))
|
||||||
|
} else {
|
||||||
|
cov := nw / na
|
||||||
|
remains := nw - (na * cov)
|
||||||
|
f1 := float64(cov) / float64(nw)
|
||||||
|
f2 := float64(cov+1) / float64(nw)
|
||||||
|
logNwords[ws] = math.Log(float64(nw))
|
||||||
|
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
|
||||||
|
float64(remains)*f2*math.Log(f2))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pre-allocate frequency tables per word size
|
||||||
|
freqTables := make([][]int, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
freqTables[ws] = make([]int, 1<<(ws*2))
|
||||||
|
}
|
||||||
|
|
||||||
|
return &KmerEntropyFilter{
|
||||||
|
k: k,
|
||||||
|
levelMax: levelMax,
|
||||||
|
threshold: threshold,
|
||||||
|
nLogN: nLogN,
|
||||||
|
normTables: normTables,
|
||||||
|
emaxValues: emaxValues,
|
||||||
|
logNwords: logNwords,
|
||||||
|
freqTables: freqTables,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Accept returns true if the k-mer has entropy strictly above the threshold.
|
||||||
|
// Low-complexity k-mers (entropy <= threshold) are rejected.
|
||||||
|
func (ef *KmerEntropyFilter) Accept(kmer uint64) bool {
|
||||||
|
return ef.Entropy(kmer) > ef.threshold
|
||||||
|
}
|
||||||
|
|
||||||
|
// Entropy computes the entropy for a single k-mer using pre-computed tables.
|
||||||
|
func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
|
||||||
|
k := ef.k
|
||||||
|
|
||||||
|
// Decode k-mer to DNA sequence
|
||||||
|
var seqBuf [32]byte
|
||||||
|
seq := DecodeKmer(kmer, k, seqBuf[:])
|
||||||
|
|
||||||
|
minEntropy := math.MaxFloat64
|
||||||
|
|
||||||
|
for ws := 1; ws <= ef.levelMax; ws++ {
|
||||||
|
nwords := k - ws + 1
|
||||||
|
if nwords < 1 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
emax := ef.emaxValues[ws]
|
||||||
|
if emax <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count circular-canonical sub-word frequencies
|
||||||
|
tableSize := 1 << (ws * 2)
|
||||||
|
table := ef.freqTables[ws]
|
||||||
|
clear(table) // reset to zero
|
||||||
|
mask := (1 << (ws * 2)) - 1
|
||||||
|
normTable := ef.normTables[ws]
|
||||||
|
|
||||||
|
wordIndex := 0
|
||||||
|
for i := 0; i < ws-1; i++ {
|
||||||
|
wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
|
||||||
|
wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
|
||||||
|
normWord := normTable[wordIndex]
|
||||||
|
table[normWord]++
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute Shannon entropy
|
||||||
|
floatNwords := float64(nwords)
|
||||||
|
logNwords := ef.logNwords[ws]
|
||||||
|
|
||||||
|
var sumNLogN float64
|
||||||
|
for j := 0; j < tableSize; j++ {
|
||||||
|
n := table[j]
|
||||||
|
if n > 0 {
|
||||||
|
sumNLogN += ef.nLogN[n]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
entropy := (logNwords - sumNLogN/floatNwords) / emax
|
||||||
|
if entropy < 0 {
|
||||||
|
entropy = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
if entropy < minEntropy {
|
||||||
|
minEntropy = entropy
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if minEntropy == math.MaxFloat64 {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
|
||||||
|
return math.Round(minEntropy*10000) / 10000
|
||||||
|
}
|
||||||
@@ -5,11 +5,12 @@ import (
|
|||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"runtime"
|
"slices"
|
||||||
"sort"
|
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
"github.com/schollz/progressbar/v3"
|
||||||
)
|
)
|
||||||
|
|
||||||
// BuilderOption is a functional option for KmerSetGroupBuilder.
|
// BuilderOption is a functional option for KmerSetGroupBuilder.
|
||||||
@@ -19,6 +20,8 @@ type builderConfig struct {
|
|||||||
minFreq int // 0 means no frequency filtering (simple dedup)
|
minFreq int // 0 means no frequency filtering (simple dedup)
|
||||||
maxFreq int // 0 means no upper bound
|
maxFreq int // 0 means no upper bound
|
||||||
saveFreqTopN int // >0 means save the N most frequent k-mers per set to CSV
|
saveFreqTopN int // >0 means save the N most frequent k-mers per set to CSV
|
||||||
|
entropyThreshold float64 // >0 means filter k-mers with entropy <= threshold
|
||||||
|
entropyLevelMax int // max sub-word size for entropy (typically 6)
|
||||||
}
|
}
|
||||||
|
|
||||||
// WithMinFrequency activates frequency filtering mode.
|
// WithMinFrequency activates frequency filtering mode.
|
||||||
@@ -45,6 +48,16 @@ func WithSaveFreqKmers(n int) BuilderOption {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WithEntropyFilter activates entropy-based low-complexity filtering.
|
||||||
|
// K-mers with entropy <= threshold are discarded during finalization.
|
||||||
|
// levelMax is the maximum sub-word size for entropy computation (typically 6).
|
||||||
|
func WithEntropyFilter(threshold float64, levelMax int) BuilderOption {
|
||||||
|
return func(c *builderConfig) {
|
||||||
|
c.entropyThreshold = threshold
|
||||||
|
c.entropyLevelMax = levelMax
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// KmerSetGroupBuilder constructs a KmerSetGroup on disk.
|
// KmerSetGroupBuilder constructs a KmerSetGroup on disk.
|
||||||
// During construction, super-kmers are written to temporary .skm files
|
// During construction, super-kmers are written to temporary .skm files
|
||||||
// partitioned by minimizer. On Close(), each partition is finalized
|
// partitioned by minimizer. On Close(), each partition is finalized
|
||||||
@@ -299,7 +312,17 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process partitions in parallel
|
// =====================================================================
|
||||||
|
// 2-stage pipeline: readers (pure I/O) → workers (CPU + write)
|
||||||
|
//
|
||||||
|
// - nReaders goroutines read .skm files (pure I/O, fast)
|
||||||
|
// - nWorkers goroutines extract k-mers, sort, dedup, filter, write .kdi
|
||||||
|
//
|
||||||
|
// One unbuffered channel between stages. Readers are truly I/O-bound
|
||||||
|
// (small files, buffered reads), workers are CPU-bound and stay busy.
|
||||||
|
// =====================================================================
|
||||||
|
totalJobs := b.n * b.P
|
||||||
|
|
||||||
counts := make([][]uint64, b.n)
|
counts := make([][]uint64, b.n)
|
||||||
spectra := make([][]map[int]uint64, b.n)
|
spectra := make([][]map[int]uint64, b.n)
|
||||||
var topKmers [][]*TopNKmers
|
var topKmers [][]*TopNKmers
|
||||||
@@ -314,27 +337,71 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
nWorkers := runtime.NumCPU()
|
nCPU := obidefault.ParallelWorkers()
|
||||||
if nWorkers > b.P {
|
|
||||||
nWorkers = b.P
|
// Stage sizing
|
||||||
|
nWorkers := nCPU // CPU-bound: one per core
|
||||||
|
nReaders := nCPU / 4 // pure I/O: few goroutines suffice
|
||||||
|
if nReaders < 2 {
|
||||||
|
nReaders = 2
|
||||||
|
}
|
||||||
|
if nReaders > 4 {
|
||||||
|
nReaders = 4
|
||||||
|
}
|
||||||
|
if nWorkers > totalJobs {
|
||||||
|
nWorkers = totalJobs
|
||||||
|
}
|
||||||
|
if nReaders > totalJobs {
|
||||||
|
nReaders = totalJobs
|
||||||
}
|
}
|
||||||
|
|
||||||
type job struct {
|
var bar *progressbar.ProgressBar
|
||||||
|
if obidefault.ProgressBar() {
|
||||||
|
pbopt := []progressbar.Option{
|
||||||
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
|
progressbar.OptionSetWidth(15),
|
||||||
|
progressbar.OptionShowCount(),
|
||||||
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetPredictTime(true),
|
||||||
|
progressbar.OptionSetDescription("[Finalizing partitions]"),
|
||||||
|
}
|
||||||
|
bar = progressbar.NewOptions(totalJobs, pbopt...)
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Channel types ---
|
||||||
|
type partitionData struct {
|
||||||
|
setIdx int
|
||||||
|
partIdx int
|
||||||
|
skmers []SuperKmer // raw super-kmers from I/O stage
|
||||||
|
}
|
||||||
|
|
||||||
|
type readJob struct {
|
||||||
setIdx int
|
setIdx int
|
||||||
partIdx int
|
partIdx int
|
||||||
}
|
}
|
||||||
|
|
||||||
jobs := make(chan job, b.n*b.P)
|
dataCh := make(chan *partitionData) // unbuffered
|
||||||
var wg sync.WaitGroup
|
readJobs := make(chan readJob, totalJobs)
|
||||||
|
|
||||||
var errMu sync.Mutex
|
var errMu sync.Mutex
|
||||||
var firstErr error
|
var firstErr error
|
||||||
|
|
||||||
for w := 0; w < nWorkers; w++ {
|
// Fill job queue (buffered, all jobs pre-loaded)
|
||||||
wg.Add(1)
|
for s := 0; s < b.n; s++ {
|
||||||
|
for p := 0; p < b.P; p++ {
|
||||||
|
readJobs <- readJob{s, p}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close(readJobs)
|
||||||
|
|
||||||
|
// --- Stage 1: Readers (pure I/O) ---
|
||||||
|
var readWg sync.WaitGroup
|
||||||
|
for w := 0; w < nReaders; w++ {
|
||||||
|
readWg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer readWg.Done()
|
||||||
for j := range jobs {
|
for rj := range readJobs {
|
||||||
partSpec, partTop, err := b.finalizePartition(j.setIdx, j.partIdx, &counts[j.setIdx][j.partIdx])
|
skmers, err := b.loadPartitionRaw(rj.setIdx, rj.partIdx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
errMu.Lock()
|
errMu.Lock()
|
||||||
if firstErr == nil {
|
if firstErr == nil {
|
||||||
@@ -342,21 +409,62 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
|||||||
}
|
}
|
||||||
errMu.Unlock()
|
errMu.Unlock()
|
||||||
}
|
}
|
||||||
spectra[j.setIdx][j.partIdx] = partSpec
|
dataCh <- &partitionData{rj.setIdx, rj.partIdx, skmers}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
readWg.Wait()
|
||||||
|
close(dataCh)
|
||||||
|
}()
|
||||||
|
|
||||||
|
// --- Stage 2: Workers (CPU: extract k-mers + sort/filter + write .kdi) ---
|
||||||
|
var workWg sync.WaitGroup
|
||||||
|
for w := 0; w < nWorkers; w++ {
|
||||||
|
workWg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer workWg.Done()
|
||||||
|
for pd := range dataCh {
|
||||||
|
// CPU: extract canonical k-mers from super-kmers
|
||||||
|
kmers := extractCanonicalKmers(pd.skmers, b.k)
|
||||||
|
pd.skmers = nil // allow GC of raw super-kmers
|
||||||
|
|
||||||
|
// CPU: sort, dedup, filter
|
||||||
|
filtered, spectrum, topN := b.sortFilterPartition(kmers)
|
||||||
|
kmers = nil // allow GC of unsorted data
|
||||||
|
|
||||||
|
// I/O: write .kdi file
|
||||||
|
globalIdx := b.startIndex + pd.setIdx
|
||||||
|
kdiPath := filepath.Join(b.dir,
|
||||||
|
fmt.Sprintf("set_%d", globalIdx),
|
||||||
|
fmt.Sprintf("part_%04d.kdi", pd.partIdx))
|
||||||
|
|
||||||
|
n, err := b.writePartitionKdi(kdiPath, filtered)
|
||||||
|
if err != nil {
|
||||||
|
errMu.Lock()
|
||||||
|
if firstErr == nil {
|
||||||
|
firstErr = err
|
||||||
|
}
|
||||||
|
errMu.Unlock()
|
||||||
|
}
|
||||||
|
counts[pd.setIdx][pd.partIdx] = n
|
||||||
|
spectra[pd.setIdx][pd.partIdx] = spectrum
|
||||||
if topKmers != nil {
|
if topKmers != nil {
|
||||||
topKmers[j.setIdx][j.partIdx] = partTop
|
topKmers[pd.setIdx][pd.partIdx] = topN
|
||||||
|
}
|
||||||
|
if bar != nil {
|
||||||
|
bar.Add(1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
|
|
||||||
for s := 0; s < b.n; s++ {
|
workWg.Wait()
|
||||||
for p := 0; p < b.P; p++ {
|
|
||||||
jobs <- job{s, p}
|
if bar != nil {
|
||||||
|
fmt.Fprintln(os.Stderr)
|
||||||
}
|
}
|
||||||
}
|
|
||||||
close(jobs)
|
|
||||||
wg.Wait()
|
|
||||||
|
|
||||||
if firstErr != nil {
|
if firstErr != nil {
|
||||||
return nil, firstErr
|
return nil, firstErr
|
||||||
@@ -449,58 +557,89 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
|||||||
return ksg, nil
|
return ksg, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// finalizePartition processes a single partition: load SKM, extract k-mers,
|
// loadPartitionRaw reads a .skm file and returns raw super-kmers.
|
||||||
// sort, dedup/count, write KDI. Returns a partial frequency spectrum
|
// This is pure I/O — no k-mer extraction is done here.
|
||||||
// (frequency → count of distinct k-mers) computed before filtering,
|
// Returns nil (not an error) if the .skm file is empty or missing.
|
||||||
// and optionally the top-N most frequent k-mers.
|
func (b *KmerSetGroupBuilder) loadPartitionRaw(setIdx, partIdx int) ([]SuperKmer, error) {
|
||||||
func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint64) (map[int]uint64, *TopNKmers, error) {
|
|
||||||
// setIdx is local (0..n-1); build dirs use local index, output dirs use global
|
|
||||||
skmPath := filepath.Join(b.dir, ".build",
|
skmPath := filepath.Join(b.dir, ".build",
|
||||||
fmt.Sprintf("set_%d", setIdx),
|
fmt.Sprintf("set_%d", setIdx),
|
||||||
fmt.Sprintf("part_%04d.skm", partIdx))
|
fmt.Sprintf("part_%04d.skm", partIdx))
|
||||||
|
|
||||||
globalIdx := b.startIndex + setIdx
|
fi, err := os.Stat(skmPath)
|
||||||
kdiPath := filepath.Join(b.dir,
|
|
||||||
fmt.Sprintf("set_%d", globalIdx),
|
|
||||||
fmt.Sprintf("part_%04d.kdi", partIdx))
|
|
||||||
|
|
||||||
// Load super-kmers and extract canonical k-mers
|
|
||||||
reader, err := NewSkmReader(skmPath)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// If file doesn't exist or is empty, write empty KDI
|
return nil, nil // empty partition, not an error
|
||||||
return nil, nil, b.writeEmptyKdi(kdiPath, count)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var kmers []uint64
|
reader, err := NewSkmReader(skmPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Estimate capacity from file size. Each super-kmer record is
|
||||||
|
// 2 bytes (length) + packed bases (~k/4 bytes), so roughly
|
||||||
|
// (2 + k/4) bytes per super-kmer on average.
|
||||||
|
avgRecordSize := 2 + b.k/4
|
||||||
|
if avgRecordSize < 4 {
|
||||||
|
avgRecordSize = 4
|
||||||
|
}
|
||||||
|
estCount := int(fi.Size()) / avgRecordSize
|
||||||
|
|
||||||
|
skmers := make([]SuperKmer, 0, estCount)
|
||||||
for {
|
for {
|
||||||
sk, ok := reader.Next()
|
sk, ok := reader.Next()
|
||||||
if !ok {
|
if !ok {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
for kmer := range IterCanonicalKmers(sk.Sequence, b.k) {
|
skmers = append(skmers, sk)
|
||||||
kmers = append(kmers, kmer)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
reader.Close()
|
reader.Close()
|
||||||
|
|
||||||
|
return skmers, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractCanonicalKmers extracts all canonical k-mers from a slice of super-kmers.
// This is CPU-bound work (sliding-window forward/reverse complement).
//
// Parameters:
//   - skmers: super-kmers whose Sequence fields are scanned
//   - k: the k-mer size
//
// Returns a flat slice of all canonical k-mers, in super-kmer order.
func extractCanonicalKmers(skmers []SuperKmer, k int) []uint64 {
	// Pre-compute total capacity to avoid repeated slice growth.
	// Each super-kmer of length L yields L-k+1 canonical k-mers.
	total := 0
	for i := range skmers {
		n := len(skmers[i].Sequence) - k + 1
		if n > 0 {
			total += n
		}
	}

	kmers := make([]uint64, 0, total)
	for _, sk := range skmers {
		for kmer := range IterCanonicalKmers(sk.Sequence, k) {
			kmers = append(kmers, kmer)
		}
	}
	return kmers
}
|
||||||
|
|
||||||
|
// sortFilterPartition sorts, deduplicates, and filters k-mers in memory (CPU-bound).
|
||||||
|
// Returns the filtered sorted slice, frequency spectrum, and optional top-N.
|
||||||
|
func (b *KmerSetGroupBuilder) sortFilterPartition(kmers []uint64) ([]uint64, map[int]uint64, *TopNKmers) {
|
||||||
if len(kmers) == 0 {
|
if len(kmers) == 0 {
|
||||||
return nil, nil, b.writeEmptyKdi(kdiPath, count)
|
return nil, nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort
|
// Sort (CPU-bound) — slices.Sort avoids reflection overhead of sort.Slice
|
||||||
sort.Slice(kmers, func(i, j int) bool { return kmers[i] < kmers[j] })
|
slices.Sort(kmers)
|
||||||
|
|
||||||
// Write KDI based on mode
|
|
||||||
w, err := NewKdiWriter(kdiPath)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
minFreq := b.config.minFreq
|
minFreq := b.config.minFreq
|
||||||
if minFreq <= 0 {
|
if minFreq <= 0 {
|
||||||
minFreq = 1 // simple dedup
|
minFreq = 1 // simple dedup
|
||||||
}
|
}
|
||||||
maxFreq := b.config.maxFreq // 0 means no upper bound
|
maxFreq := b.config.maxFreq
|
||||||
|
|
||||||
|
// Prepare entropy filter if requested
|
||||||
|
var entropyFilter *KmerEntropyFilter
|
||||||
|
if b.config.entropyThreshold > 0 && b.config.entropyLevelMax > 0 {
|
||||||
|
entropyFilter = NewKmerEntropyFilter(b.k, b.config.entropyLevelMax, b.config.entropyThreshold)
|
||||||
|
}
|
||||||
|
|
||||||
// Prepare top-N collector if requested
|
// Prepare top-N collector if requested
|
||||||
var topN *TopNKmers
|
var topN *TopNKmers
|
||||||
@@ -508,8 +647,10 @@ func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint
|
|||||||
topN = NewTopNKmers(b.config.saveFreqTopN)
|
topN = NewTopNKmers(b.config.saveFreqTopN)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Linear scan: count consecutive identical values and accumulate spectrum
|
// Linear scan: count consecutive identical values, filter, accumulate spectrum
|
||||||
partSpectrum := make(map[int]uint64)
|
partSpectrum := make(map[int]uint64)
|
||||||
|
filtered := make([]uint64, 0, len(kmers)/2)
|
||||||
|
|
||||||
i := 0
|
i := 0
|
||||||
for i < len(kmers) {
|
for i < len(kmers) {
|
||||||
val := kmers[i]
|
val := kmers[i]
|
||||||
@@ -522,16 +663,33 @@ func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint
|
|||||||
topN.Add(val, c)
|
topN.Add(val, c)
|
||||||
}
|
}
|
||||||
if c >= minFreq && (maxFreq <= 0 || c <= maxFreq) {
|
if c >= minFreq && (maxFreq <= 0 || c <= maxFreq) {
|
||||||
if err := w.Write(val); err != nil {
|
if entropyFilter == nil || entropyFilter.Accept(val) {
|
||||||
w.Close()
|
filtered = append(filtered, val)
|
||||||
return nil, nil, err
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
i += c
|
i += c
|
||||||
}
|
}
|
||||||
|
|
||||||
*count = w.Count()
|
return filtered, partSpectrum, topN
|
||||||
return partSpectrum, topN, w.Close()
|
}
|
||||||
|
|
||||||
|
// writePartitionKdi writes a sorted slice of k-mers to a .kdi file (I/O-bound).
// Returns the number of k-mers written.
//
// On a write error the writer is closed (its Close error is intentionally
// discarded in favor of the original write error) and 0 is returned.
// On success, the Close error (e.g. flush failure) is propagated.
func (b *KmerSetGroupBuilder) writePartitionKdi(kdiPath string, kmers []uint64) (uint64, error) {
	w, err := NewKdiWriter(kdiPath)
	if err != nil {
		return 0, err
	}

	for _, val := range kmers {
		if err := w.Write(val); err != nil {
			w.Close()
			return 0, err
		}
	}

	// Capture the count before Close, since the final count is only
	// meaningful once all writes have succeeded.
	n := w.Count()
	return n, w.Close()
}
|
}
|
||||||
|
|
||||||
func (b *KmerSetGroupBuilder) writeEmptyKdi(path string, count *uint64) error {
|
func (b *KmerSetGroupBuilder) writeEmptyKdi(path string, count *uint64) error {
|
||||||
|
|||||||
@@ -128,6 +128,27 @@ func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
|
|||||||
return ksg, nil
|
return ksg, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NewFilteredKmerSetGroup creates a KmerSetGroup from pre-computed data.
// Used by the filter command to construct a new group after filtering partitions.
//
// Parameters:
//   - directory: on-disk location of the new group
//   - k, m, partitions, n: k-mer size, minimizer size, partition count, set count
//   - setsIDs: identifier of each set (expected length n)
//   - counts: total k-mer count per set (expected length n)
//   - setsMetadata: per-set metadata maps (expected length n)
//
// The slices are stored as-is (not copied); callers must not mutate them
// afterwards. The error result is always nil in the current implementation;
// it is kept for signature symmetry with OpenKmerSetGroup.
func NewFilteredKmerSetGroup(
	directory string, k, m, partitions, n int,
	setsIDs []string, counts []uint64,
	setsMetadata []map[string]interface{},
) (*KmerSetGroup, error) {
	ksg := &KmerSetGroup{
		path:         directory,
		k:            k,
		m:            m,
		partitions:   partitions,
		n:            n,
		setsIDs:      setsIDs,
		counts:       counts,
		setsMetadata: setsMetadata,
		Metadata:     make(map[string]interface{}),
	}
	return ksg, nil
}
|
||||||
|
|
||||||
// SaveMetadata writes the metadata.toml file. This is useful after
|
// SaveMetadata writes the metadata.toml file. This is useful after
|
||||||
// modifying attributes or IDs on an already-finalized index.
|
// modifying attributes or IDs on an already-finalized index.
|
||||||
func (ksg *KmerSetGroup) SaveMetadata() error {
|
func (ksg *KmerSetGroup) SaveMetadata() error {
|
||||||
|
|||||||
344
pkg/obitools/obik/filter.go
Normal file
344
pkg/obitools/obik/filter.go
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
|
||||||
|
"github.com/schollz/progressbar/v3"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// KmerFilter is a predicate applied to individual k-mers during filtering.
// Returns true if the k-mer should be kept.
type KmerFilter func(kmer uint64) bool

// KmerFilterFactory creates a new KmerFilter instance.
// Each goroutine should call the factory to get its own filter,
// since some filters (e.g. KmerEntropyFilter) are not thread-safe.
type KmerFilterFactory func() KmerFilter

// chainFilterFactories combines multiple KmerFilterFactory into one.
// The resulting factory creates a filter that accepts a k-mer only
// if all individual filters accept it.
func chainFilterFactories(factories []KmerFilterFactory) KmerFilterFactory {
	// No filters: everything passes.
	if len(factories) == 0 {
		return func() KmerFilter {
			return func(uint64) bool { return true }
		}
	}

	// A single filter needs no chaining overhead.
	if len(factories) == 1 {
		return factories[0]
	}

	// Several filters: instantiate each one per factory call and
	// short-circuit on the first rejection.
	return func() KmerFilter {
		chain := make([]KmerFilter, 0, len(factories))
		for _, mk := range factories {
			chain = append(chain, mk())
		}
		return func(kmer uint64) bool {
			for _, accept := range chain {
				if !accept(kmer) {
					return false
				}
			}
			return true
		}
	}
}
||||||
|
|
||||||
|
// runFilter implements the "obik filter" subcommand.
// It reads an existing kmer index, applies a chain of filters,
// and writes a new filtered index.
//
// On success, the destination directory contains one "set_<i>" directory
// per selected source set (re-numbered from 0), the filtered .kdi
// partitions, a copy of each set's spectrum.bin (when present), and the
// group metadata annotated with the applied filter parameters.
//
// NOTE(review): ctx is accepted but never consulted for cancellation —
// a long filtering run cannot be interrupted through the context;
// confirm whether this is intended.
//
// NOTE(review): on error the partially written destination directory is
// left on disk; confirm whether it should be cleaned up.
func runFilter(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
	if len(args) < 1 {
		return fmt.Errorf("usage: obik filter [options] <source_index> --out <dest_index>")
	}

	srcDir := args[0]
	destDir := CLIOutputDirectory()
	// "-" conventionally means stdout elsewhere in the CLI; a directory
	// output is mandatory here, so both empty and "-" are rejected.
	if destDir == "" || destDir == "-" {
		return fmt.Errorf("--out option is required and must specify a destination directory")
	}

	// Open source index
	src, err := obikmer.OpenKmerSetGroup(srcDir)
	if err != nil {
		return fmt.Errorf("failed to open source index: %w", err)
	}

	k := src.K()

	// Build filter factory chain from CLI options.
	// Factories are used so each goroutine creates its own filter instance,
	// since some filters (e.g. KmerEntropyFilter) have mutable state.
	var factories []KmerFilterFactory
	var filterDescriptions []string

	// Entropy filter
	entropyThreshold := CLIIndexEntropyThreshold()
	entropySize := CLIIndexEntropySize()
	if entropyThreshold > 0 {
		factories = append(factories, func() KmerFilter {
			ef := obikmer.NewKmerEntropyFilter(k, entropySize, entropyThreshold)
			return ef.Accept
		})
		filterDescriptions = append(filterDescriptions,
			fmt.Sprintf("entropy(threshold=%.4f, level-max=%d)", entropyThreshold, entropySize))
	}

	// Future filters will be added here, e.g.:
	// quorumFilter, frequencyFilter, ...

	if len(factories) == 0 {
		return fmt.Errorf("no filter specified; use --entropy-filter or other filter options")
	}

	filterFactory := chainFilterFactories(factories)

	// Resolve set selection (default: all sets)
	patterns := CLISetPatterns()
	var setIndices []int
	if len(patterns) > 0 {
		setIndices, err = src.MatchSetIDs(patterns)
		if err != nil {
			return fmt.Errorf("failed to match set patterns: %w", err)
		}
		if len(setIndices) == 0 {
			return fmt.Errorf("no sets match the given patterns")
		}
	} else {
		// No pattern given: select every set in the source group.
		setIndices = make([]int, src.Size())
		for i := range setIndices {
			setIndices[i] = i
		}
	}

	log.Infof("Filtering %d set(s) from %s with: %s",
		len(setIndices), srcDir, strings.Join(filterDescriptions, " + "))

	// Create destination directory
	if err := os.MkdirAll(destDir, 0755); err != nil {
		return fmt.Errorf("failed to create destination: %w", err)
	}

	P := src.Partitions()

	// Progress bar for partition filtering
	totalPartitions := len(setIndices) * P
	var bar *progressbar.ProgressBar
	if obidefault.ProgressBar() {
		pbopt := []progressbar.Option{
			progressbar.OptionSetWriter(os.Stderr),
			progressbar.OptionSetWidth(15),
			progressbar.OptionShowCount(),
			progressbar.OptionShowIts(),
			progressbar.OptionSetPredictTime(true),
			progressbar.OptionSetDescription("[Filtering partitions]"),
		}
		bar = progressbar.NewOptions(totalPartitions, pbopt...)
	}

	// Process each selected set
	newCounts := make([]uint64, len(setIndices))

	for si, srcIdx := range setIndices {
		setID := src.SetIDOf(srcIdx)
		if setID == "" {
			// Fall back to a synthetic ID for log messages only.
			setID = fmt.Sprintf("set_%d", srcIdx)
		}

		// Destination sets are re-numbered densely from 0 (si), which may
		// differ from the source index (srcIdx) when a pattern selects a subset.
		destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", si))
		if err := os.MkdirAll(destSetDir, 0755); err != nil {
			return fmt.Errorf("failed to create set directory: %w", err)
		}

		// Process partitions in parallel
		nWorkers := obidefault.ParallelWorkers()
		if nWorkers > P {
			nWorkers = P
		}

		var totalKept atomic.Uint64
		var totalProcessed atomic.Uint64

		type job struct {
			partIdx int
		}

		// jobs is buffered with capacity P and exactly P jobs are sent,
		// so the producer loop below can never block even if all workers
		// bail out early on error.
		jobs := make(chan job, P)
		var wg sync.WaitGroup
		var errMu sync.Mutex
		var firstErr error

		for w := 0; w < nWorkers; w++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				// Each goroutine gets its own filter instance
				workerFilter := filterFactory()
				for j := range jobs {
					kept, processed, err := filterPartition(
						src.PartitionPath(srcIdx, j.partIdx),
						filepath.Join(destSetDir, fmt.Sprintf("part_%04d.kdi", j.partIdx)),
						workerFilter,
					)
					if err != nil {
						// Record only the first error; this worker stops,
						// remaining jobs are drained by the other workers.
						errMu.Lock()
						if firstErr == nil {
							firstErr = err
						}
						errMu.Unlock()
						return
					}
					totalKept.Add(kept)
					totalProcessed.Add(processed)
					if bar != nil {
						// Progress display only; the returned error is ignored.
						bar.Add(1)
					}
				}
			}()
		}

		for p := 0; p < P; p++ {
			jobs <- job{p}
		}
		close(jobs)
		wg.Wait()

		if firstErr != nil {
			return fmt.Errorf("failed to filter set %q: %w", setID, firstErr)
		}

		kept := totalKept.Load()
		processed := totalProcessed.Load()
		newCounts[si] = kept
		// max(processed, 1) guards the division when the set holds no k-mers.
		log.Infof("Set %q: %d/%d k-mers kept (%.1f%% removed)",
			setID, kept, processed,
			100.0*float64(processed-kept)/float64(max(processed, 1)))

		// Copy spectrum.bin if it exists
		srcSpecPath := src.SpectrumPath(srcIdx)
		if _, err := os.Stat(srcSpecPath); err == nil {
			destSpecPath := filepath.Join(destSetDir, "spectrum.bin")
			if err := copyFileHelper(srcSpecPath, destSpecPath); err != nil {
				// Best effort: a missing spectrum copy does not fail the run.
				log.Warnf("Could not copy spectrum for set %q: %v", setID, err)
			}
		}
	}

	if bar != nil {
		// Terminate the progress-bar line before further log output.
		fmt.Fprintln(os.Stderr)
	}

	// Build destination metadata
	setsIDs := make([]string, len(setIndices))
	setsMetadata := make([]map[string]interface{}, len(setIndices))
	for i, srcIdx := range setIndices {
		setsIDs[i] = src.SetIDOf(srcIdx)
		setsMetadata[i] = src.AllSetMetadata(srcIdx)
		if setsMetadata[i] == nil {
			setsMetadata[i] = make(map[string]interface{})
		}
	}

	// Write metadata for the filtered index
	dest, err := obikmer.NewFilteredKmerSetGroup(
		destDir, k, src.M(), P,
		len(setIndices), setsIDs, newCounts, setsMetadata,
	)
	if err != nil {
		return fmt.Errorf("failed to create filtered metadata: %w", err)
	}

	// Copy group-level metadata and record applied filters
	for key, value := range src.Metadata {
		dest.SetAttribute(key, value)
	}
	if entropyThreshold > 0 {
		dest.SetAttribute("entropy_filter", entropyThreshold)
		dest.SetAttribute("entropy_filter_size", entropySize)
	}
	dest.SetAttribute("filtered_from", srcDir)

	if err := dest.SaveMetadata(); err != nil {
		return fmt.Errorf("failed to save metadata: %w", err)
	}

	log.Info("Done.")
	return nil
}
|
||||||
|
|
||||||
|
// filterPartition reads a single .kdi partition, applies the filter predicate,
|
||||||
|
// and writes the accepted k-mers to a new .kdi file.
|
||||||
|
// Returns (kept, processed, error).
|
||||||
|
func filterPartition(srcPath, destPath string, accept KmerFilter) (uint64, uint64, error) {
|
||||||
|
reader, err := obikmer.NewKdiReader(srcPath)
|
||||||
|
if err != nil {
|
||||||
|
// Empty partition — write empty KDI
|
||||||
|
w, err2 := obikmer.NewKdiWriter(destPath)
|
||||||
|
if err2 != nil {
|
||||||
|
return 0, 0, err2
|
||||||
|
}
|
||||||
|
return 0, 0, w.Close()
|
||||||
|
}
|
||||||
|
defer reader.Close()
|
||||||
|
|
||||||
|
w, err := obikmer.NewKdiWriter(destPath)
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var kept, processed uint64
|
||||||
|
for {
|
||||||
|
kmer, ok := reader.Next()
|
||||||
|
if !ok {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
processed++
|
||||||
|
if accept(kmer) {
|
||||||
|
if err := w.Write(kmer); err != nil {
|
||||||
|
w.Close()
|
||||||
|
return 0, 0, err
|
||||||
|
}
|
||||||
|
kept++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return kept, processed, w.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// copyFileHelper copies a file (used for spectrum.bin etc.)
|
||||||
|
func copyFileHelper(src, dst string) error {
|
||||||
|
in, err := os.Open(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer in.Close()
|
||||||
|
|
||||||
|
out, err := os.Create(dst)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer out.Close()
|
||||||
|
|
||||||
|
buf := make([]byte, 32*1024)
|
||||||
|
for {
|
||||||
|
n, readErr := in.Read(buf)
|
||||||
|
if n > 0 {
|
||||||
|
if _, writeErr := out.Write(buf[:n]); writeErr != nil {
|
||||||
|
return writeErr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if readErr != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out.Close()
|
||||||
|
}
|
||||||
@@ -33,6 +33,9 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
|
|||||||
|
|
||||||
maxOcc := CLIMaxOccurrence()
|
maxOcc := CLIMaxOccurrence()
|
||||||
|
|
||||||
|
entropyThreshold := CLIIndexEntropyThreshold()
|
||||||
|
entropySize := CLIIndexEntropySize()
|
||||||
|
|
||||||
// Build options
|
// Build options
|
||||||
var opts []obikmer.BuilderOption
|
var opts []obikmer.BuilderOption
|
||||||
if minOcc > 1 {
|
if minOcc > 1 {
|
||||||
@@ -44,6 +47,9 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
|
|||||||
if topN := CLISaveFreqKmer(); topN > 0 {
|
if topN := CLISaveFreqKmer(); topN > 0 {
|
||||||
opts = append(opts, obikmer.WithSaveFreqKmers(topN))
|
opts = append(opts, obikmer.WithSaveFreqKmers(topN))
|
||||||
}
|
}
|
||||||
|
if entropyThreshold > 0 {
|
||||||
|
opts = append(opts, obikmer.WithEntropyFilter(entropyThreshold, entropySize))
|
||||||
|
}
|
||||||
|
|
||||||
// Determine whether to append to existing group or create new
|
// Determine whether to append to existing group or create new
|
||||||
var builder *obikmer.KmerSetGroupBuilder
|
var builder *obikmer.KmerSetGroupBuilder
|
||||||
@@ -115,6 +121,11 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
|
|||||||
ksg.SetAttribute("max_occurrence", maxOcc)
|
ksg.SetAttribute("max_occurrence", maxOcc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if entropyThreshold > 0 {
|
||||||
|
ksg.SetAttribute("entropy_filter", entropyThreshold)
|
||||||
|
ksg.SetAttribute("entropy_filter_size", entropySize)
|
||||||
|
}
|
||||||
|
|
||||||
if err := ksg.SaveMetadata(); err != nil {
|
if err := ksg.SaveMetadata(); err != nil {
|
||||||
return fmt.Errorf("failed to save metadata: %w", err)
|
return fmt.Errorf("failed to save metadata: %w", err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -74,4 +74,11 @@ func OptionSet(opt *getoptions.GetOpt) {
|
|||||||
obiconvert.OutputOptionSet(matchCmd)
|
obiconvert.OutputOptionSet(matchCmd)
|
||||||
SetSelectionOptionSet(matchCmd)
|
SetSelectionOptionSet(matchCmd)
|
||||||
matchCmd.SetCommandFn(runMatch)
|
matchCmd.SetCommandFn(runMatch)
|
||||||
|
|
||||||
|
// filter: filter an index to remove low-complexity k-mers
|
||||||
|
filterCmd := opt.NewCommand("filter", "Filter a kmer index to remove low-complexity k-mers")
|
||||||
|
obiconvert.OutputModeOptionSet(filterCmd, false)
|
||||||
|
EntropyFilterOptionSet(filterCmd)
|
||||||
|
SetSelectionOptionSet(filterCmd)
|
||||||
|
filterCmd.SetCommandFn(runFilter)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -105,6 +105,8 @@ var _minOccurrence = 1
|
|||||||
var _maxOccurrence = 0
|
var _maxOccurrence = 0
|
||||||
var _saveFullFilter = false
|
var _saveFullFilter = false
|
||||||
var _saveFreqKmer = 0
|
var _saveFreqKmer = 0
|
||||||
|
var _indexEntropyThreshold = 0.0
|
||||||
|
var _indexEntropySize = 6
|
||||||
|
|
||||||
// KmerIndexOptionSet defines every option related to kmer index building.
|
// KmerIndexOptionSet defines every option related to kmer index building.
|
||||||
func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
||||||
@@ -133,6 +135,22 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
|||||||
|
|
||||||
options.IntVar(&_saveFreqKmer, "save-freq-kmer", _saveFreqKmer,
|
options.IntVar(&_saveFreqKmer, "save-freq-kmer", _saveFreqKmer,
|
||||||
options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv)."))
|
options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv)."))
|
||||||
|
|
||||||
|
options.Float64Var(&_indexEntropyThreshold, "entropy-filter", _indexEntropyThreshold,
|
||||||
|
options.Description("Filter low-complexity k-mers with entropy <= threshold (0 = disabled)."))
|
||||||
|
|
||||||
|
options.IntVar(&_indexEntropySize, "entropy-filter-size", _indexEntropySize,
|
||||||
|
options.Description("Maximum word size for entropy filter computation (default 6)."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// EntropyFilterOptionSet registers entropy filter options for commands
// that process existing indices (e.g. filter).
//
// It registers the same --entropy-filter and --entropy-filter-size flags
// as KmerIndexOptionSet, bound to the same package-level variables, so
// the CLIIndexEntropyThreshold / CLIIndexEntropySize accessors return the
// parsed values for both the index and filter commands.
func EntropyFilterOptionSet(options *getoptions.GetOpt) {
	options.Float64Var(&_indexEntropyThreshold, "entropy-filter", _indexEntropyThreshold,
		options.Description("Filter low-complexity k-mers with entropy <= threshold (0 = disabled)."))

	options.IntVar(&_indexEntropySize, "entropy-filter-size", _indexEntropySize,
		options.Description("Maximum word size for entropy filter computation (default 6)."))
}
||||||
|
|
||||||
// ==============================
|
// ==============================
|
||||||
@@ -262,6 +280,16 @@ func CLIKeepShorter() bool {
|
|||||||
return _keepShorter
|
return _keepShorter
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CLIIndexEntropyThreshold returns the entropy filter threshold for index building (0 = disabled).
//
// K-mers with normalized entropy <= this value are filtered out.
// The value is set by the --entropy-filter CLI option and defaults to 0,
// which disables entropy filtering entirely.
func CLIIndexEntropyThreshold() float64 {
	return _indexEntropyThreshold
}
|
||||||
|
|
||||||
|
// CLIIndexEntropySize returns the entropy filter word size for index building.
//
// This is the maximum sub-word size considered by the entropy computation,
// set by the --entropy-filter-size CLI option (default 6).
func CLIIndexEntropySize() int {
	return _indexEntropySize
}
|
||||||
|
|
||||||
// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output.
|
// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output.
|
||||||
func OutputFormatOptionSet(options *getoptions.GetOpt) {
|
func OutputFormatOptionSet(options *getoptions.GetOpt) {
|
||||||
options.BoolVar(&_jsonOutput, "json-output", false,
|
options.BoolVar(&_jsonOutput, "json-output", false,
|
||||||
|
|||||||
Reference in New Issue
Block a user