mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 05:20:52 +00:00
Refactor kmer index package to use disk-based partitioning with minimizer - Replace roaring64 bitmaps with disk-based kmer index - Implement partitioned kmer sets with delta-varint encoding - Add support for frequency filtering during construction - Introduce new builder pattern for index construction - Add streaming operations for set operations (union, intersect, etc.) - Add support for super-kmer encoding during construction - Update command line tool to use new index format - Remove dependency on roaring bitmap library This change introduces a new architecture for kmer indexing that is more memory efficient and scalable for large datasets.
348 lines
9.2 KiB
Go
348 lines
9.2 KiB
Go
package obikmer
|
||
|
||
import (
|
||
"fmt"
|
||
|
||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
|
||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||
)
|
||
|
||
// KmerSetGroup represents a vector of KmerSet
|
||
// Used to manage multiple k-mer sets (for example, by frequency level)
|
||
type KmerSetGroup struct {
|
||
id string // Unique identifier of the KmerSetGroup
|
||
k int // Size of k-mers (immutable)
|
||
sets []*KmerSet // Vector of KmerSet
|
||
Metadata map[string]interface{} // Group metadata (not individual sets)
|
||
}
|
||
|
||
// NewKmerSetGroup creates a new group of n KmerSets
|
||
func NewKmerSetGroup(k int, n int) *KmerSetGroup {
|
||
if n < 1 {
|
||
panic("KmerSetGroup size must be >= 1")
|
||
}
|
||
|
||
sets := make([]*KmerSet, n)
|
||
for i := range sets {
|
||
sets[i] = NewKmerSet(k)
|
||
}
|
||
|
||
return &KmerSetGroup{
|
||
k: k,
|
||
sets: sets,
|
||
Metadata: make(map[string]interface{}),
|
||
}
|
||
}
|
||
|
||
// K returns the size of k-mers (immutable)
|
||
func (ksg *KmerSetGroup) K() int {
|
||
return ksg.k
|
||
}
|
||
|
||
// Size returns the number of KmerSet in the group
|
||
func (ksg *KmerSetGroup) Size() int {
|
||
return len(ksg.sets)
|
||
}
|
||
|
||
// Get returns the KmerSet at the given index
|
||
// Returns nil if the index is invalid
|
||
func (ksg *KmerSetGroup) Get(index int) *KmerSet {
|
||
if index < 0 || index >= len(ksg.sets) {
|
||
return nil
|
||
}
|
||
return ksg.sets[index]
|
||
}
|
||
|
||
// Set replaces the KmerSet at the given index
|
||
// Panics if the index is invalid or if k does not match
|
||
func (ksg *KmerSetGroup) Set(index int, ks *KmerSet) {
|
||
if index < 0 || index >= len(ksg.sets) {
|
||
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
||
}
|
||
if ks.k != ksg.k {
|
||
panic(fmt.Sprintf("KmerSet k mismatch: expected %d, got %d", ksg.k, ks.k))
|
||
}
|
||
ksg.sets[index] = ks
|
||
}
|
||
|
||
// Len returns the number of k-mers in a specific KmerSet
|
||
// Without argument: returns the number of k-mers in the last KmerSet
|
||
// With argument index: returns the number of k-mers in the KmerSet at this index
|
||
func (ksg *KmerSetGroup) Len(index ...int) uint64 {
|
||
if len(index) == 0 {
|
||
// Without argument: last KmerSet
|
||
return ksg.sets[len(ksg.sets)-1].Len()
|
||
}
|
||
|
||
// With argument: specific KmerSet
|
||
idx := index[0]
|
||
if idx < 0 || idx >= len(ksg.sets) {
|
||
return 0
|
||
}
|
||
return ksg.sets[idx].Len()
|
||
}
|
||
|
||
// MemoryUsage returns the total memory usage in bytes
|
||
func (ksg *KmerSetGroup) MemoryUsage() uint64 {
|
||
total := uint64(0)
|
||
for _, ks := range ksg.sets {
|
||
total += ks.MemoryUsage()
|
||
}
|
||
return total
|
||
}
|
||
|
||
// Clear empties all KmerSet in the group
|
||
func (ksg *KmerSetGroup) Clear() {
|
||
for _, ks := range ksg.sets {
|
||
ks.Clear()
|
||
}
|
||
}
|
||
|
||
// Copy creates a complete copy of the group (consistent with BioSequence.Copy)
|
||
func (ksg *KmerSetGroup) Copy() *KmerSetGroup {
|
||
copiedSets := make([]*KmerSet, len(ksg.sets))
|
||
for i, ks := range ksg.sets {
|
||
copiedSets[i] = ks.Copy() // Copy each KmerSet with its metadata
|
||
}
|
||
|
||
// Copy group metadata
|
||
groupMetadata := make(map[string]interface{}, len(ksg.Metadata))
|
||
for k, v := range ksg.Metadata {
|
||
groupMetadata[k] = v
|
||
}
|
||
|
||
return &KmerSetGroup{
|
||
id: ksg.id,
|
||
k: ksg.k,
|
||
sets: copiedSets,
|
||
Metadata: groupMetadata,
|
||
}
|
||
}
|
||
|
||
// Id returns the identifier of the KmerSetGroup (consistent with BioSequence.Id)
|
||
func (ksg *KmerSetGroup) Id() string {
|
||
return ksg.id
|
||
}
|
||
|
||
// SetId sets the identifier of the KmerSetGroup (consistent with BioSequence.SetId)
|
||
func (ksg *KmerSetGroup) SetId(id string) {
|
||
ksg.id = id
|
||
}
|
||
|
||
// AddSequence adds all k-mers from a sequence to a specific KmerSet
|
||
func (ksg *KmerSetGroup) AddSequence(seq *obiseq.BioSequence, index int) {
|
||
if index < 0 || index >= len(ksg.sets) {
|
||
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
||
}
|
||
ksg.sets[index].AddSequence(seq)
|
||
}
|
||
|
||
// AddSequences adds all k-mers from multiple sequences to a specific KmerSet
|
||
func (ksg *KmerSetGroup) AddSequences(sequences *obiseq.BioSequenceSlice, index int) {
|
||
if index < 0 || index >= len(ksg.sets) {
|
||
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
||
}
|
||
ksg.sets[index].AddSequences(sequences)
|
||
}
|
||
|
||
// AddSequenceSlice adds all k-mers from a slice of sequences to a specific KmerSet
|
||
func (ksg *KmerSetGroup) AddSequenceSlice(sequences *obiseq.BioSequenceSlice, index int) {
|
||
if index < 0 || index >= len(ksg.sets) {
|
||
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
||
}
|
||
ksg.sets[index].AddSequenceSlice(sequences)
|
||
}
|
||
|
||
// Union returns the union of all KmerSet in the group
|
||
// Optimization: starts from the largest set to minimize operations
|
||
func (ksg *KmerSetGroup) Union() *KmerSet {
|
||
if len(ksg.sets) == 0 {
|
||
return NewKmerSet(ksg.k)
|
||
}
|
||
|
||
if len(ksg.sets) == 1 {
|
||
return ksg.sets[0].Copy()
|
||
}
|
||
|
||
// Find the index of the largest set (the one with the most k-mers)
|
||
maxIdx := 0
|
||
maxCard := ksg.sets[0].Len()
|
||
for i := 1; i < len(ksg.sets); i++ {
|
||
card := ksg.sets[i].Len()
|
||
if card > maxCard {
|
||
maxCard = card
|
||
maxIdx = i
|
||
}
|
||
}
|
||
|
||
// Copy the largest set and perform unions in-place
|
||
result := ksg.sets[maxIdx].bitmap.Clone()
|
||
for i := 0; i < len(ksg.sets); i++ {
|
||
if i != maxIdx {
|
||
result.Or(ksg.sets[i].bitmap)
|
||
}
|
||
}
|
||
|
||
return NewKmerSetFromBitmap(ksg.k, result)
|
||
}
|
||
|
||
// Intersect returns the intersection of all KmerSet in the group
|
||
// Optimization: starts from the smallest set to minimize operations
|
||
func (ksg *KmerSetGroup) Intersect() *KmerSet {
|
||
if len(ksg.sets) == 0 {
|
||
return NewKmerSet(ksg.k)
|
||
}
|
||
|
||
if len(ksg.sets) == 1 {
|
||
return ksg.sets[0].Copy()
|
||
}
|
||
|
||
// Find the index of the smallest set (the one with the fewest k-mers)
|
||
minIdx := 0
|
||
minCard := ksg.sets[0].Len()
|
||
for i := 1; i < len(ksg.sets); i++ {
|
||
card := ksg.sets[i].Len()
|
||
if card < minCard {
|
||
minCard = card
|
||
minIdx = i
|
||
}
|
||
}
|
||
|
||
// Copy the smallest set and perform intersections in-place
|
||
result := ksg.sets[minIdx].bitmap.Clone()
|
||
for i := 0; i < len(ksg.sets); i++ {
|
||
if i != minIdx {
|
||
result.And(ksg.sets[i].bitmap)
|
||
}
|
||
}
|
||
|
||
return NewKmerSetFromBitmap(ksg.k, result)
|
||
}
|
||
|
||
// Stats returns statistics for each KmerSet in the group
|
||
type KmerSetGroupStats struct {
|
||
K int
|
||
Size int // Number of KmerSet
|
||
TotalBytes uint64 // Total memory used
|
||
Sets []KmerSetStats // Stats of each KmerSet
|
||
}
|
||
|
||
// KmerSetStats gathers the statistics of a single KmerSet of a group.
type KmerSetStats struct {
	// Index is the position of the KmerSet in the group.
	Index int
	// Len is the number of k-mers in the set.
	Len uint64
	// SizeBytes is the size of the set in bytes.
	SizeBytes uint64
}
|
||
|
||
func (ksg *KmerSetGroup) Stats() KmerSetGroupStats {
|
||
stats := KmerSetGroupStats{
|
||
K: ksg.k,
|
||
Size: len(ksg.sets),
|
||
Sets: make([]KmerSetStats, len(ksg.sets)),
|
||
}
|
||
|
||
for i, ks := range ksg.sets {
|
||
sizeBytes := ks.MemoryUsage()
|
||
stats.Sets[i] = KmerSetStats{
|
||
Index: i,
|
||
Len: ks.Len(),
|
||
SizeBytes: sizeBytes,
|
||
}
|
||
stats.TotalBytes += sizeBytes
|
||
}
|
||
|
||
return stats
|
||
}
|
||
|
||
func (ksgs KmerSetGroupStats) String() string {
|
||
result := fmt.Sprintf(`KmerSetGroup Statistics (k=%d, size=%d):
|
||
Total memory: %.2f MB
|
||
|
||
Set breakdown:
|
||
`, ksgs.K, ksgs.Size, float64(ksgs.TotalBytes)/1024/1024)
|
||
|
||
for _, set := range ksgs.Sets {
|
||
result += fmt.Sprintf(" Set[%d]: %d k-mers (%.2f MB)\n",
|
||
set.Index,
|
||
set.Len,
|
||
float64(set.SizeBytes)/1024/1024)
|
||
}
|
||
|
||
return result
|
||
}
|
||
|
||
// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix for all KmerSets in the group.
|
||
// Returns a triangular distance matrix where element (i, j) represents the Jaccard distance
|
||
// between set i and set j.
|
||
//
|
||
// The Jaccard distance is: 1 - (|A ∩ B| / |A ∪ B|)
|
||
//
|
||
// The matrix labels are set to the IDs of the individual KmerSets if available,
|
||
// otherwise they are set to "set_0", "set_1", etc.
|
||
//
|
||
// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
|
||
// Space complexity: O(n²) for the distance matrix
|
||
func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix {
|
||
n := len(ksg.sets)
|
||
|
||
// Create labels from set IDs
|
||
labels := make([]string, n)
|
||
for i, ks := range ksg.sets {
|
||
if ks.Id() != "" {
|
||
labels[i] = ks.Id()
|
||
} else {
|
||
labels[i] = fmt.Sprintf("set_%d", i)
|
||
}
|
||
}
|
||
|
||
dm := obidist.NewDistMatrixWithLabels(labels)
|
||
|
||
// Compute pairwise distances
|
||
for i := 0; i < n-1; i++ {
|
||
for j := i + 1; j < n; j++ {
|
||
distance := ksg.sets[i].JaccardDistance(ksg.sets[j])
|
||
dm.Set(i, j, distance)
|
||
}
|
||
}
|
||
|
||
return dm
|
||
}
|
||
|
||
// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix for all KmerSets in the group.
|
||
// Returns a similarity matrix where element (i, j) represents the Jaccard similarity
|
||
// between set i and set j.
|
||
//
|
||
// The Jaccard similarity is: |A ∩ B| / |A ∪ B|
|
||
//
|
||
// The diagonal is 1.0 (similarity of a set to itself).
|
||
//
|
||
// The matrix labels are set to the IDs of the individual KmerSets if available,
|
||
// otherwise they are set to "set_0", "set_1", etc.
|
||
//
|
||
// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
|
||
// Space complexity: O(n²) for the similarity matrix
|
||
func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
|
||
n := len(ksg.sets)
|
||
|
||
// Create labels from set IDs
|
||
labels := make([]string, n)
|
||
for i, ks := range ksg.sets {
|
||
if ks.Id() != "" {
|
||
labels[i] = ks.Id()
|
||
} else {
|
||
labels[i] = fmt.Sprintf("set_%d", i)
|
||
}
|
||
}
|
||
|
||
sm := obidist.NewSimilarityMatrixWithLabels(labels)
|
||
|
||
// Compute pairwise similarities
|
||
for i := 0; i < n-1; i++ {
|
||
for j := i + 1; j < n; j++ {
|
||
similarity := ksg.sets[i].JaccardSimilarity(ksg.sets[j])
|
||
sm.Set(i, j, similarity)
|
||
}
|
||
}
|
||
|
||
return sm
|
||
}
|