mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor k-mer index building to use disk-based KmerSetGroupBuilder
Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change makes k-mer index building more efficient and scalable by using partitioned disk storage with streaming operations.

- Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder
- Add support for frequency filtering via the WithMinFrequency option
- Remove deprecated k-mer set persistence methods
- Update the CLI to use the new builder approach
- Add new disk-based k-mer operations (union, intersect, difference, quorum)
- Introduce the KDI (K-mer Delta Index) file format for efficient storage
- Add k-way merge operations for combining sorted k-mer streams
- Update documentation and examples to reflect the new API

This refactoring provides better memory usage, faster operations on large datasets, and more flexible k-mer set operations.
This commit is contained in:
74
pkg/obikmer/skm_writer.go
Normal file
74
pkg/obikmer/skm_writer.go
Normal file
@@ -0,0 +1,74 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/binary"
|
||||
"os"
|
||||
)
|
||||
|
||||
// SkmWriter serializes super-kmers into a binary .skm file through a
// buffered writer.
//
// Each super-kmer is stored as one record:
//
//	[len:  uint16, little-endian]  number of bases in the super-kmer
//	[data: ceil(len/4) bytes]      bases packed four per byte, 2 bits each
//
// Documented nucleotide codes: A=00, C=01, G=10, T=11.
// Within a byte the first base occupies the two most significant bits; when
// len is not a multiple of 4 the unused low bits of the final byte are zero.
type SkmWriter struct {
	// w buffers writes in front of file.
	w *bufio.Writer
	// file is the underlying output file, closed by Close.
	file *os.File
}
|
||||
|
||||
// NewSkmWriter creates a new SkmWriter writing to the given file path.
|
||||
func NewSkmWriter(path string) (*SkmWriter, error) {
|
||||
f, err := os.Create(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &SkmWriter{
|
||||
w: bufio.NewWriterSize(f, 65536),
|
||||
file: f,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Write encodes a SuperKmer to the .skm file.
|
||||
// The sequence bytes are packed 2 bits per base.
|
||||
func (sw *SkmWriter) Write(sk SuperKmer) error {
|
||||
seq := sk.Sequence
|
||||
seqLen := uint16(len(seq))
|
||||
|
||||
// Write length
|
||||
var lenbuf [2]byte
|
||||
binary.LittleEndian.PutUint16(lenbuf[:], seqLen)
|
||||
if _, err := sw.w.Write(lenbuf[:]); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Encode and write packed sequence (2 bits/base)
|
||||
nBytes := (int(seqLen) + 3) / 4
|
||||
for i := 0; i < nBytes; i++ {
|
||||
var packed byte
|
||||
for j := 0; j < 4; j++ {
|
||||
pos := i*4 + j
|
||||
packed <<= 2
|
||||
if pos < int(seqLen) {
|
||||
packed |= __single_base_code__[seq[pos]&31]
|
||||
}
|
||||
}
|
||||
if err := sw.w.WriteByte(packed); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Close flushes buffered data and closes the underlying file.
|
||||
func (sw *SkmWriter) Close() error {
|
||||
if err := sw.w.Flush(); err != nil {
|
||||
sw.file.Close()
|
||||
return err
|
||||
}
|
||||
return sw.file.Close()
|
||||
}
|
||||
Reference in New Issue
Block a user