Files
obitools4/pkg/obikmer/spectrum.go
Eric Coissac f2937af1ad Add max frequency filtering and top-kmer saving capabilities
This commit introduces max frequency filtering to limit k-mer occurrences and adds functionality to save the N most frequent k-mers per set to CSV files. It also includes the ability to output k-mer frequency spectra as CSV and updates the CLI options accordingly.
2026-02-10 09:27:04 +01:00

254 lines
6.0 KiB
Go

package obikmer
import (
"bufio"
"container/heap"
"encoding/csv"
"fmt"
"os"
"sort"
"strconv"
)
// KSP file magic bytes: "KSP\x01" (K-mer SPectrum v1)
var kspMagic = [4]byte{'K', 'S', 'P', 0x01}
// SpectrumEntry represents one entry in a k-mer frequency spectrum.
type SpectrumEntry struct {
Frequency int // how many times a k-mer was observed
Count uint64 // how many distinct k-mers have this frequency
}
// KmerSpectrum represents the frequency distribution of k-mers.
// Entries are sorted by Frequency in ascending order and only include
// non-zero counts.
type KmerSpectrum struct {
Entries []SpectrumEntry
}
// MaxFrequency returns the highest frequency in the spectrum, or 0 if empty.
func (s *KmerSpectrum) MaxFrequency() int {
if len(s.Entries) == 0 {
return 0
}
return s.Entries[len(s.Entries)-1].Frequency
}
// ToMap converts a KmerSpectrum back to a map for easy lookup.
func (s *KmerSpectrum) ToMap() map[int]uint64 {
m := make(map[int]uint64, len(s.Entries))
for _, e := range s.Entries {
m[e.Frequency] = e.Count
}
return m
}
// MapToSpectrum converts a map[int]uint64 to a sorted KmerSpectrum.
func MapToSpectrum(m map[int]uint64) *KmerSpectrum {
entries := make([]SpectrumEntry, 0, len(m))
for freq, count := range m {
if count > 0 {
entries = append(entries, SpectrumEntry{Frequency: freq, Count: count})
}
}
sort.Slice(entries, func(i, j int) bool {
return entries[i].Frequency < entries[j].Frequency
})
return &KmerSpectrum{Entries: entries}
}
// MergeSpectraMaps adds all entries from b into a.
func MergeSpectraMaps(a, b map[int]uint64) {
for freq, count := range b {
a[freq] += count
}
}
// WriteSpectrum writes a KmerSpectrum to a binary file.
//
// Format:
//
// [magic: 4 bytes "KSP\x01"]
// [n_entries: varint]
// For each entry (sorted by frequency ascending):
// [frequency: varint]
// [count: varint]
func WriteSpectrum(path string, spectrum *KmerSpectrum) error {
f, err := os.Create(path)
if err != nil {
return fmt.Errorf("create spectrum file: %w", err)
}
w := bufio.NewWriterSize(f, 65536)
// Magic
if _, err := w.Write(kspMagic[:]); err != nil {
f.Close()
return err
}
// Number of entries
if _, err := EncodeVarint(w, uint64(len(spectrum.Entries))); err != nil {
f.Close()
return err
}
// Entries
for _, e := range spectrum.Entries {
if _, err := EncodeVarint(w, uint64(e.Frequency)); err != nil {
f.Close()
return err
}
if _, err := EncodeVarint(w, e.Count); err != nil {
f.Close()
return err
}
}
if err := w.Flush(); err != nil {
f.Close()
return err
}
return f.Close()
}
// ReadSpectrum reads a KmerSpectrum from a binary file.
func ReadSpectrum(path string) (*KmerSpectrum, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
r := bufio.NewReaderSize(f, 65536)
// Check magic
var magic [4]byte
if _, err := r.Read(magic[:]); err != nil {
return nil, fmt.Errorf("read spectrum magic: %w", err)
}
if magic != kspMagic {
return nil, fmt.Errorf("invalid spectrum file magic: %v", magic)
}
// Number of entries
nEntries, err := DecodeVarint(r)
if err != nil {
return nil, fmt.Errorf("read spectrum entry count: %w", err)
}
entries := make([]SpectrumEntry, nEntries)
for i := uint64(0); i < nEntries; i++ {
freq, err := DecodeVarint(r)
if err != nil {
return nil, fmt.Errorf("read spectrum freq at entry %d: %w", i, err)
}
count, err := DecodeVarint(r)
if err != nil {
return nil, fmt.Errorf("read spectrum count at entry %d: %w", i, err)
}
entries[i] = SpectrumEntry{
Frequency: int(freq),
Count: count,
}
}
return &KmerSpectrum{Entries: entries}, nil
}
// KmerFreq associates a k-mer (encoded as uint64) with its observed frequency.
type KmerFreq struct {
Kmer uint64
Freq int
}
// kmerFreqHeap is a min-heap of KmerFreq ordered by Freq (lowest first).
// Used to maintain a top-N most frequent k-mers set.
type kmerFreqHeap []KmerFreq
func (h kmerFreqHeap) Len() int { return len(h) }
func (h kmerFreqHeap) Less(i, j int) bool { return h[i].Freq < h[j].Freq }
func (h kmerFreqHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *kmerFreqHeap) Push(x interface{}) { *h = append(*h, x.(KmerFreq)) }
func (h *kmerFreqHeap) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
*h = old[:n-1]
return x
}
// TopNKmers maintains a collection of the N most frequent k-mers
// using a min-heap. Thread-safe usage requires external synchronization.
type TopNKmers struct {
n int
h kmerFreqHeap
}
// NewTopNKmers creates a new top-N collector.
func NewTopNKmers(n int) *TopNKmers {
return &TopNKmers{
n: n,
h: make(kmerFreqHeap, 0, n+1),
}
}
// Add considers a k-mer with the given frequency for inclusion in the top-N.
func (t *TopNKmers) Add(kmer uint64, freq int) {
if t.n <= 0 {
return
}
if len(t.h) < t.n {
heap.Push(&t.h, KmerFreq{Kmer: kmer, Freq: freq})
} else if freq > t.h[0].Freq {
t.h[0] = KmerFreq{Kmer: kmer, Freq: freq}
heap.Fix(&t.h, 0)
}
}
// Results returns the collected k-mers sorted by frequency descending.
func (t *TopNKmers) Results() []KmerFreq {
result := make([]KmerFreq, len(t.h))
copy(result, t.h)
sort.Slice(result, func(i, j int) bool {
return result[i].Freq > result[j].Freq
})
return result
}
// MergeTopN merges another TopNKmers into this one.
func (t *TopNKmers) MergeTopN(other *TopNKmers) {
if other == nil {
return
}
for _, kf := range other.h {
t.Add(kf.Kmer, kf.Freq)
}
}
// WriteTopKmersCSV writes the top k-mers to a CSV file.
// Columns: sequence, frequency
func WriteTopKmersCSV(path string, topKmers []KmerFreq, k int) error {
f, err := os.Create(path)
if err != nil {
return fmt.Errorf("create top-kmers file: %w", err)
}
defer f.Close()
w := csv.NewWriter(f)
defer w.Flush()
if err := w.Write([]string{"sequence", "frequency"}); err != nil {
return err
}
buf := make([]byte, k)
for _, kf := range topKmers {
seq := DecodeKmer(kf.Kmer, k, buf)
if err := w.Write([]string{string(seq), strconv.Itoa(kf.Freq)}); err != nil {
return err
}
}
return nil
}