mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 21:40:52 +00:00
254 lines
6.0 KiB
Go
254 lines
6.0 KiB
Go
|
|
package obikmer
|
||
|
|
|
||
|
|
import (
|
||
|
|
"bufio"
|
||
|
|
"container/heap"
|
||
|
|
"encoding/csv"
|
||
|
|
"fmt"
|
||
|
|
"os"
|
||
|
|
"sort"
|
||
|
|
"strconv"
|
||
|
|
)
|
||
|
|
|
||
|
|
// KSP file magic bytes: "KSP\x01" (K-mer SPectrum v1)
|
||
|
|
var kspMagic = [4]byte{'K', 'S', 'P', 0x01}
|
||
|
|
|
||
|
|
// SpectrumEntry represents one entry in a k-mer frequency spectrum.
|
||
|
|
type SpectrumEntry struct {
|
||
|
|
Frequency int // how many times a k-mer was observed
|
||
|
|
Count uint64 // how many distinct k-mers have this frequency
|
||
|
|
}
|
||
|
|
|
||
|
|
// KmerSpectrum represents the frequency distribution of k-mers.
|
||
|
|
// Entries are sorted by Frequency in ascending order and only include
|
||
|
|
// non-zero counts.
|
||
|
|
type KmerSpectrum struct {
|
||
|
|
Entries []SpectrumEntry
|
||
|
|
}
|
||
|
|
|
||
|
|
// MaxFrequency returns the highest frequency in the spectrum, or 0 if empty.
|
||
|
|
func (s *KmerSpectrum) MaxFrequency() int {
|
||
|
|
if len(s.Entries) == 0 {
|
||
|
|
return 0
|
||
|
|
}
|
||
|
|
return s.Entries[len(s.Entries)-1].Frequency
|
||
|
|
}
|
||
|
|
|
||
|
|
// ToMap converts a KmerSpectrum back to a map for easy lookup.
|
||
|
|
func (s *KmerSpectrum) ToMap() map[int]uint64 {
|
||
|
|
m := make(map[int]uint64, len(s.Entries))
|
||
|
|
for _, e := range s.Entries {
|
||
|
|
m[e.Frequency] = e.Count
|
||
|
|
}
|
||
|
|
return m
|
||
|
|
}
|
||
|
|
|
||
|
|
// MapToSpectrum converts a map[int]uint64 to a sorted KmerSpectrum.
|
||
|
|
func MapToSpectrum(m map[int]uint64) *KmerSpectrum {
|
||
|
|
entries := make([]SpectrumEntry, 0, len(m))
|
||
|
|
for freq, count := range m {
|
||
|
|
if count > 0 {
|
||
|
|
entries = append(entries, SpectrumEntry{Frequency: freq, Count: count})
|
||
|
|
}
|
||
|
|
}
|
||
|
|
sort.Slice(entries, func(i, j int) bool {
|
||
|
|
return entries[i].Frequency < entries[j].Frequency
|
||
|
|
})
|
||
|
|
return &KmerSpectrum{Entries: entries}
|
||
|
|
}
|
||
|
|
|
||
|
|
// MergeSpectraMaps adds all entries from b into a.
|
||
|
|
func MergeSpectraMaps(a, b map[int]uint64) {
|
||
|
|
for freq, count := range b {
|
||
|
|
a[freq] += count
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// WriteSpectrum writes a KmerSpectrum to a binary file.
|
||
|
|
//
|
||
|
|
// Format:
|
||
|
|
//
|
||
|
|
// [magic: 4 bytes "KSP\x01"]
|
||
|
|
// [n_entries: varint]
|
||
|
|
// For each entry (sorted by frequency ascending):
|
||
|
|
// [frequency: varint]
|
||
|
|
// [count: varint]
|
||
|
|
func WriteSpectrum(path string, spectrum *KmerSpectrum) error {
|
||
|
|
f, err := os.Create(path)
|
||
|
|
if err != nil {
|
||
|
|
return fmt.Errorf("create spectrum file: %w", err)
|
||
|
|
}
|
||
|
|
w := bufio.NewWriterSize(f, 65536)
|
||
|
|
|
||
|
|
// Magic
|
||
|
|
if _, err := w.Write(kspMagic[:]); err != nil {
|
||
|
|
f.Close()
|
||
|
|
return err
|
||
|
|
}
|
||
|
|
|
||
|
|
// Number of entries
|
||
|
|
if _, err := EncodeVarint(w, uint64(len(spectrum.Entries))); err != nil {
|
||
|
|
f.Close()
|
||
|
|
return err
|
||
|
|
}
|
||
|
|
|
||
|
|
// Entries
|
||
|
|
for _, e := range spectrum.Entries {
|
||
|
|
if _, err := EncodeVarint(w, uint64(e.Frequency)); err != nil {
|
||
|
|
f.Close()
|
||
|
|
return err
|
||
|
|
}
|
||
|
|
if _, err := EncodeVarint(w, e.Count); err != nil {
|
||
|
|
f.Close()
|
||
|
|
return err
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if err := w.Flush(); err != nil {
|
||
|
|
f.Close()
|
||
|
|
return err
|
||
|
|
}
|
||
|
|
return f.Close()
|
||
|
|
}
|
||
|
|
|
||
|
|
// ReadSpectrum reads a KmerSpectrum from a binary file.
|
||
|
|
func ReadSpectrum(path string) (*KmerSpectrum, error) {
|
||
|
|
f, err := os.Open(path)
|
||
|
|
if err != nil {
|
||
|
|
return nil, err
|
||
|
|
}
|
||
|
|
defer f.Close()
|
||
|
|
|
||
|
|
r := bufio.NewReaderSize(f, 65536)
|
||
|
|
|
||
|
|
// Check magic
|
||
|
|
var magic [4]byte
|
||
|
|
if _, err := r.Read(magic[:]); err != nil {
|
||
|
|
return nil, fmt.Errorf("read spectrum magic: %w", err)
|
||
|
|
}
|
||
|
|
if magic != kspMagic {
|
||
|
|
return nil, fmt.Errorf("invalid spectrum file magic: %v", magic)
|
||
|
|
}
|
||
|
|
|
||
|
|
// Number of entries
|
||
|
|
nEntries, err := DecodeVarint(r)
|
||
|
|
if err != nil {
|
||
|
|
return nil, fmt.Errorf("read spectrum entry count: %w", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
entries := make([]SpectrumEntry, nEntries)
|
||
|
|
for i := uint64(0); i < nEntries; i++ {
|
||
|
|
freq, err := DecodeVarint(r)
|
||
|
|
if err != nil {
|
||
|
|
return nil, fmt.Errorf("read spectrum freq at entry %d: %w", i, err)
|
||
|
|
}
|
||
|
|
count, err := DecodeVarint(r)
|
||
|
|
if err != nil {
|
||
|
|
return nil, fmt.Errorf("read spectrum count at entry %d: %w", i, err)
|
||
|
|
}
|
||
|
|
entries[i] = SpectrumEntry{
|
||
|
|
Frequency: int(freq),
|
||
|
|
Count: count,
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return &KmerSpectrum{Entries: entries}, nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// KmerFreq associates a k-mer (encoded as uint64) with its observed frequency.
|
||
|
|
type KmerFreq struct {
|
||
|
|
Kmer uint64
|
||
|
|
Freq int
|
||
|
|
}
|
||
|
|
|
||
|
|
// kmerFreqHeap is a min-heap of KmerFreq ordered by Freq (lowest first).
|
||
|
|
// Used to maintain a top-N most frequent k-mers set.
|
||
|
|
type kmerFreqHeap []KmerFreq
|
||
|
|
|
||
|
|
func (h kmerFreqHeap) Len() int { return len(h) }
|
||
|
|
func (h kmerFreqHeap) Less(i, j int) bool { return h[i].Freq < h[j].Freq }
|
||
|
|
func (h kmerFreqHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
|
||
|
|
func (h *kmerFreqHeap) Push(x interface{}) { *h = append(*h, x.(KmerFreq)) }
|
||
|
|
func (h *kmerFreqHeap) Pop() interface{} {
|
||
|
|
old := *h
|
||
|
|
n := len(old)
|
||
|
|
x := old[n-1]
|
||
|
|
*h = old[:n-1]
|
||
|
|
return x
|
||
|
|
}
|
||
|
|
|
||
|
|
// TopNKmers maintains a collection of the N most frequent k-mers
|
||
|
|
// using a min-heap. Thread-safe usage requires external synchronization.
|
||
|
|
type TopNKmers struct {
|
||
|
|
n int
|
||
|
|
h kmerFreqHeap
|
||
|
|
}
|
||
|
|
|
||
|
|
// NewTopNKmers creates a new top-N collector.
|
||
|
|
func NewTopNKmers(n int) *TopNKmers {
|
||
|
|
return &TopNKmers{
|
||
|
|
n: n,
|
||
|
|
h: make(kmerFreqHeap, 0, n+1),
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Add considers a k-mer with the given frequency for inclusion in the top-N.
|
||
|
|
func (t *TopNKmers) Add(kmer uint64, freq int) {
|
||
|
|
if t.n <= 0 {
|
||
|
|
return
|
||
|
|
}
|
||
|
|
if len(t.h) < t.n {
|
||
|
|
heap.Push(&t.h, KmerFreq{Kmer: kmer, Freq: freq})
|
||
|
|
} else if freq > t.h[0].Freq {
|
||
|
|
t.h[0] = KmerFreq{Kmer: kmer, Freq: freq}
|
||
|
|
heap.Fix(&t.h, 0)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Results returns the collected k-mers sorted by frequency descending.
|
||
|
|
func (t *TopNKmers) Results() []KmerFreq {
|
||
|
|
result := make([]KmerFreq, len(t.h))
|
||
|
|
copy(result, t.h)
|
||
|
|
sort.Slice(result, func(i, j int) bool {
|
||
|
|
return result[i].Freq > result[j].Freq
|
||
|
|
})
|
||
|
|
return result
|
||
|
|
}
|
||
|
|
|
||
|
|
// MergeTopN merges another TopNKmers into this one.
|
||
|
|
func (t *TopNKmers) MergeTopN(other *TopNKmers) {
|
||
|
|
if other == nil {
|
||
|
|
return
|
||
|
|
}
|
||
|
|
for _, kf := range other.h {
|
||
|
|
t.Add(kf.Kmer, kf.Freq)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// WriteTopKmersCSV writes the top k-mers to a CSV file.
|
||
|
|
// Columns: sequence, frequency
|
||
|
|
func WriteTopKmersCSV(path string, topKmers []KmerFreq, k int) error {
|
||
|
|
f, err := os.Create(path)
|
||
|
|
if err != nil {
|
||
|
|
return fmt.Errorf("create top-kmers file: %w", err)
|
||
|
|
}
|
||
|
|
defer f.Close()
|
||
|
|
|
||
|
|
w := csv.NewWriter(f)
|
||
|
|
defer w.Flush()
|
||
|
|
|
||
|
|
if err := w.Write([]string{"sequence", "frequency"}); err != nil {
|
||
|
|
return err
|
||
|
|
}
|
||
|
|
|
||
|
|
buf := make([]byte, k)
|
||
|
|
for _, kf := range topKmers {
|
||
|
|
seq := DecodeKmer(kf.Kmer, k, buf)
|
||
|
|
if err := w.Write([]string{string(seq), strconv.Itoa(kf.Freq)}); err != nil {
|
||
|
|
return err
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return nil
|
||
|
|
}
|