mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor k-mer index building to use disk-based KmerSetGroupBuilder
Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change introduces a more efficient and scalable approach to building k-mer indices by using partitioned disk storage with streaming operations. - Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder - Add support for frequency filtering via WithMinFrequency option - Remove deprecated k-mer set persistence methods - Update CLI to use new builder approach - Add new disk-based k-mer operations (union, intersect, difference, quorum) - Introduce KDI (K-mer Delta Index) file format for efficient storage - Add K-way merge operations for combining sorted k-mer streams - Update documentation and examples to reflect new API This refactoring provides better memory usage, faster operations on large datasets, and more flexible k-mer set operations.
This commit is contained in:
96
pkg/obikmer/kdi_reader.go
Normal file
96
pkg/obikmer/kdi_reader.go
Normal file
@@ -0,0 +1,96 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
)
|
||||
|
||||
// KdiReader reads k-mers from a .kdi file using streaming delta-varint decoding.
|
||||
type KdiReader struct {
|
||||
r *bufio.Reader
|
||||
file *os.File
|
||||
count uint64 // total number of k-mers
|
||||
read uint64 // number of k-mers already consumed
|
||||
prev uint64 // last decoded value
|
||||
started bool // whether first value has been read
|
||||
}
|
||||
|
||||
// NewKdiReader opens a .kdi file for streaming reading.
|
||||
func NewKdiReader(path string) (*KdiReader, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
r := bufio.NewReaderSize(f, 65536)
|
||||
|
||||
// Read and verify magic
|
||||
var magic [4]byte
|
||||
if _, err := io.ReadFull(r, magic[:]); err != nil {
|
||||
f.Close()
|
||||
return nil, fmt.Errorf("kdi: read magic: %w", err)
|
||||
}
|
||||
if magic != kdiMagic {
|
||||
f.Close()
|
||||
return nil, fmt.Errorf("kdi: bad magic %v", magic)
|
||||
}
|
||||
|
||||
// Read count
|
||||
var countBuf [8]byte
|
||||
if _, err := io.ReadFull(r, countBuf[:]); err != nil {
|
||||
f.Close()
|
||||
return nil, fmt.Errorf("kdi: read count: %w", err)
|
||||
}
|
||||
count := binary.LittleEndian.Uint64(countBuf[:])
|
||||
|
||||
return &KdiReader{
|
||||
r: r,
|
||||
file: f,
|
||||
count: count,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Next returns the next k-mer and true, or (0, false) when exhausted.
|
||||
func (kr *KdiReader) Next() (uint64, bool) {
|
||||
if kr.read >= kr.count {
|
||||
return 0, false
|
||||
}
|
||||
|
||||
if !kr.started {
|
||||
// Read first value as absolute uint64 LE
|
||||
var buf [8]byte
|
||||
if _, err := io.ReadFull(kr.r, buf[:]); err != nil {
|
||||
return 0, false
|
||||
}
|
||||
kr.prev = binary.LittleEndian.Uint64(buf[:])
|
||||
kr.started = true
|
||||
kr.read++
|
||||
return kr.prev, true
|
||||
}
|
||||
|
||||
// Read delta varint
|
||||
delta, err := DecodeVarint(kr.r)
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
kr.prev += delta
|
||||
kr.read++
|
||||
return kr.prev, true
|
||||
}
|
||||
|
||||
// Count returns the total number of k-mers in this partition.
|
||||
func (kr *KdiReader) Count() uint64 {
|
||||
return kr.count
|
||||
}
|
||||
|
||||
// Remaining returns how many k-mers have not been read yet.
|
||||
func (kr *KdiReader) Remaining() uint64 {
|
||||
return kr.count - kr.read
|
||||
}
|
||||
|
||||
// Close closes the underlying file.
|
||||
func (kr *KdiReader) Close() error {
|
||||
return kr.file.Close()
|
||||
}
|
||||
Reference in New Issue
Block a user