mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor k-mer index building to use disk-based KmerSetGroupBuilder
Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change introduces a more efficient and scalable approach to building k-mer indices by using partitioned disk storage with streaming operations. - Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder - Add support for frequency filtering via WithMinFrequency option - Remove deprecated k-mer set persistence methods - Update CLI to use new builder approach - Add new disk-based k-mer operations (union, intersect, difference, quorum) - Introduce KDI (K-mer Delta Index) file format for efficient storage - Add K-way merge operations for combining sorted k-mer streams - Update documentation and examples to reflect new API This refactoring provides better memory usage, faster operations on large datasets, and more flexible k-mer set operations.
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -33,3 +33,4 @@ LLM/**
|
||||
entropy.html
|
||||
bug_id.txt
|
||||
obilowmask_ref
|
||||
test_*
|
||||
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
func main() {
|
||||
optionParser := obioptions.GenerateOptionParser(
|
||||
"obikindex",
|
||||
"builds a roaring kmer index from sequence files",
|
||||
"builds a disk-based kmer index from sequence files",
|
||||
obikindex.OptionSet)
|
||||
|
||||
_, args := optionParser(os.Args)
|
||||
|
||||
5
go.mod
5
go.mod
@@ -14,6 +14,7 @@ require (
|
||||
github.com/goccy/go-json v0.10.3
|
||||
github.com/klauspost/pgzip v1.2.6
|
||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58
|
||||
github.com/pelletier/go-toml/v2 v2.2.4
|
||||
github.com/rrethy/ahocorasick v1.0.0
|
||||
github.com/schollz/progressbar/v3 v3.13.1
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
@@ -27,14 +28,10 @@ require (
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/RoaringBitmap/roaring v1.9.4 // indirect
|
||||
github.com/bits-and-blooms/bitset v1.12.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect
|
||||
github.com/kr/pretty v0.3.1 // indirect
|
||||
github.com/kr/text v0.2.0 // indirect
|
||||
github.com/mschoch/smat v0.2.0 // indirect
|
||||
github.com/pelletier/go-toml/v2 v2.2.4 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/rogpeppe/go-internal v1.12.0 // indirect
|
||||
)
|
||||
|
||||
6
go.sum
6
go.sum
@@ -4,12 +4,8 @@ github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E
|
||||
github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
|
||||
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
|
||||
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
|
||||
github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv2QzDdQ=
|
||||
github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
|
||||
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0=
|
||||
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM=
|
||||
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
|
||||
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
|
||||
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
|
||||
github.com/chen3feng/stl4go v0.1.1 h1:0L1+mDw7pomftKDruM23f1mA7miavOj6C6MZeadzN2Q=
|
||||
@@ -51,8 +47,6 @@ github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZ
|
||||
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
|
||||
github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
|
||||
github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
|
||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
|
||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
|
||||
github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
|
||||
|
||||
@@ -1,292 +0,0 @@
|
||||
# Filtre de Fréquence avec v Niveaux de Roaring Bitmaps
|
||||
|
||||
## Algorithme
|
||||
|
||||
```go
|
||||
Pour chaque k-mer rencontré dans les données:
|
||||
c = 0
|
||||
tant que (k-mer ∈ index[c] ET c < v):
|
||||
c++
|
||||
|
||||
si c < v:
|
||||
index[c].insert(k-mer)
|
||||
```
|
||||
|
||||
**Résultat** : `index[v-1]` contient les k-mers vus **≥ v fois**
|
||||
|
||||
---
|
||||
|
||||
## Exemple d'exécution (v=3)
|
||||
|
||||
```
|
||||
Données:
|
||||
Read1: kmer X
|
||||
Read2: kmer X
|
||||
Read3: kmer X (X vu 3 fois)
|
||||
Read4: kmer Y
|
||||
Read5: kmer Y (Y vu 2 fois)
|
||||
Read6: kmer Z (Z vu 1 fois)
|
||||
|
||||
Exécution:
|
||||
|
||||
Read1 (X):
|
||||
c=0: X ∉ index[0] → index[0].add(X)
|
||||
État: index[0]={X}, index[1]={}, index[2]={}
|
||||
|
||||
Read2 (X):
|
||||
c=0: X ∈ index[0] → c=1
|
||||
c=1: X ∉ index[1] → index[1].add(X)
|
||||
État: index[0]={X}, index[1]={X}, index[2]={}
|
||||
|
||||
Read3 (X):
|
||||
c=0: X ∈ index[0] → c=1
|
||||
c=1: X ∈ index[1] → c=2
|
||||
c=2: X ∉ index[2] → index[2].add(X)
|
||||
État: index[0]={X}, index[1]={X}, index[2]={X}
|
||||
|
||||
Read4 (Y):
|
||||
c=0: Y ∉ index[0] → index[0].add(Y)
|
||||
État: index[0]={X,Y}, index[1]={X}, index[2]={X}
|
||||
|
||||
Read5 (Y):
|
||||
c=0: Y ∈ index[0] → c=1
|
||||
c=1: Y ∉ index[1] → index[1].add(Y)
|
||||
État: index[0]={X,Y}, index[1]={X,Y}, index[2]={X}
|
||||
|
||||
Read6 (Z):
|
||||
c=0: Z ∉ index[0] → index[0].add(Z)
|
||||
État: index[0]={X,Y,Z}, index[1]={X,Y}, index[2]={X}
|
||||
|
||||
Résultat final:
|
||||
index[0] (freq≥1): {X, Y, Z}
|
||||
index[1] (freq≥2): {X, Y}
|
||||
index[2] (freq≥3): {X} ← K-mers filtrés ✓
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Utilisation
|
||||
|
||||
```go
|
||||
// Créer le filtre
|
||||
filter := obikmer.NewFrequencyFilter(31, 3) // k=31, minFreq=3
|
||||
|
||||
// Ajouter les séquences
|
||||
for _, read := range reads {
|
||||
filter.AddSequence(read)
|
||||
}
|
||||
|
||||
// Récupérer les k-mers filtrés (freq ≥ 3)
|
||||
filtered := filter.GetFilteredSet("filtered")
|
||||
fmt.Printf("K-mers de qualité: %d\n", filtered.Cardinality())
|
||||
|
||||
// Statistiques
|
||||
stats := filter.Stats()
|
||||
fmt.Println(stats.String())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance
|
||||
|
||||
### Complexité
|
||||
|
||||
**Par k-mer** :
|
||||
- Lookups : Moyenne ~v/2, pire cas v
|
||||
- Insertions : 1 Add
|
||||
- **Pas de Remove** ✅
|
||||
|
||||
**Total pour n k-mers** :
|
||||
- Temps : O(n × v/2)
|
||||
- Mémoire : O(unique_kmers × v × 2 bytes)
|
||||
|
||||
### Early exit pour distribution skewed
|
||||
|
||||
Avec distribution typique (séquençage) :
|
||||
```
|
||||
80% singletons → 1 lookup (early exit)
|
||||
15% freq 2-3 → 2-3 lookups
|
||||
5% freq ≥4 → jusqu'à v lookups
|
||||
|
||||
Moyenne réelle : ~2 lookups/kmer (au lieu de v/2)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Mémoire
|
||||
|
||||
### Pour 10^8 k-mers uniques
|
||||
|
||||
| v (minFreq) | Nombre bitmaps | Mémoire | vs map simple |
|
||||
|-------------|----------------|---------|---------------|
|
||||
| v=2 | 2 | ~400 MB | 6x moins |
|
||||
| v=3 | 3 | ~600 MB | 4x moins |
|
||||
| v=5 | 5 | ~1 GB | 2.4x moins |
|
||||
| v=10 | 10 | ~2 GB | 1.2x moins |
|
||||
| v=20 | 20 | ~4 GB | ~égal |
|
||||
|
||||
**Note** : Avec distribution skewed (beaucoup de singletons), la mémoire réelle est bien plus faible car les niveaux hauts ont peu d'éléments.
|
||||
|
||||
### Exemple réaliste (séquençage)
|
||||
|
||||
Pour 10^8 k-mers totaux, v=3 :
|
||||
```
|
||||
Distribution:
|
||||
80% singletons → 80M dans index[0]
|
||||
15% freq 2-3 → 15M dans index[1]
|
||||
5% freq ≥3 → 5M dans index[2]
|
||||
|
||||
Mémoire:
|
||||
index[0]: 80M × 2 bytes = 160 MB
|
||||
index[1]: 15M × 2 bytes = 30 MB
|
||||
index[2]: 5M × 2 bytes = 10 MB
|
||||
Total: ~200 MB ✅
|
||||
|
||||
vs map simple: 80M × 24 bytes = ~2 GB
|
||||
Réduction: 10x
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Comparaison des approches
|
||||
|
||||
| Approche | Mémoire (10^8 kmers) | Passes | Lookups/kmer | Quand utiliser |
|
||||
|----------|----------------------|--------|--------------|----------------|
|
||||
| **v-Bitmaps** | **200-600 MB** | **1** | **~2 (avg)** | **Standard** ✅ |
|
||||
| Map simple | 2.4 GB | 1 | 1 | Si RAM illimitée |
|
||||
| Multi-pass | 400 MB | v | v | Si I/O pas cher |
|
||||
|
||||
---
|
||||
|
||||
## Avantages de v-Bitmaps
|
||||
|
||||
✅ **Une seule passe** sur les données
|
||||
✅ **Mémoire optimale** avec Roaring bitmaps
|
||||
✅ **Pas de Remove** (seulement Contains + Add)
|
||||
✅ **Early exit** efficace sur singletons
|
||||
✅ **Scalable** jusqu'à v~10-20
|
||||
✅ **Simple** à implémenter et comprendre
|
||||
|
||||
---
|
||||
|
||||
## Cas d'usage typiques
|
||||
|
||||
### 1. Éliminer erreurs de séquençage
|
||||
|
||||
```go
|
||||
filter := obikmer.NewFrequencyFilter(31, 3)
|
||||
|
||||
// Traiter FASTQ
|
||||
for read := range StreamFastq("sample.fastq") {
|
||||
filter.AddSequence(read)
|
||||
}
|
||||
|
||||
// K-mers de qualité (pas d'erreurs)
|
||||
cleaned := filter.GetFilteredSet("cleaned")
|
||||
```
|
||||
|
||||
**Résultat** : Élimine 70-80% des k-mers (erreurs)
|
||||
|
||||
### 2. Assemblage de génome
|
||||
|
||||
```go
|
||||
filter := obikmer.NewFrequencyFilter(31, 2)
|
||||
|
||||
// Filtrer avant l'assemblage
|
||||
for read := range reads {
|
||||
filter.AddSequence(read)
|
||||
}
|
||||
|
||||
solidKmers := filter.GetFilteredSet("solid")
|
||||
// Utiliser solidKmers pour le graphe de Bruijn
|
||||
```
|
||||
|
||||
### 3. Comparaison de génomes
|
||||
|
||||
```go
|
||||
collection := obikmer.NewKmerSetCollection(31)
|
||||
|
||||
for _, genome := range genomes {
|
||||
filter := obikmer.NewFrequencyFilter(31, 3)
|
||||
filter.AddSequences(genome.Reads)
|
||||
|
||||
cleaned := filter.GetFilteredSet(genome.ID)
|
||||
collection.Add(cleaned)
|
||||
}
|
||||
|
||||
// Analyses comparatives sur k-mers de qualité
|
||||
matrix := collection.ParallelPairwiseJaccard(8)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Limites
|
||||
|
||||
**Pour v > 20** :
|
||||
- Trop de lookups (v lookups/kmer)
|
||||
- Mémoire importante (v × 200MB pour 10^8 kmers)
|
||||
|
||||
**Solutions alternatives pour v > 20** :
|
||||
- Utiliser map simple (9 bytes/kmer) si RAM disponible
|
||||
- Algorithme différent (sketch, probabiliste)
|
||||
|
||||
---
|
||||
|
||||
## Optimisations possibles
|
||||
|
||||
### 1. Parallélisation
|
||||
|
||||
```go
|
||||
// Traiter plusieurs fichiers en parallèle
|
||||
filters := make([]*FrequencyFilter, numFiles)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for i, file := range files {
|
||||
wg.Add(1)
|
||||
go func(idx int, f string) {
|
||||
defer wg.Done()
|
||||
filters[idx] = ProcessFile(f, k, minFreq)
|
||||
}(i, file)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// Merger les résultats
|
||||
merged := MergeFilters(filters)
|
||||
```
|
||||
|
||||
### 2. Streaming avec seuil adaptatif
|
||||
|
||||
```go
|
||||
// Commencer avec v=5, réduire progressivement
|
||||
filter := obikmer.NewFrequencyFilter(31, 5)
|
||||
|
||||
// ... traitement ...
|
||||
|
||||
// Si trop de mémoire, réduire à v=3
|
||||
if filter.MemoryUsage() > threshold {
|
||||
filter = ConvertToLowerThreshold(filter, 3)
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Récapitulatif final
|
||||
|
||||
**Pour filtrer les k-mers par fréquence ≥ v :**
|
||||
|
||||
1. **Créer** : `filter := NewFrequencyFilter(k, v)`
|
||||
2. **Traiter** : `filter.AddSequence(read)` pour chaque read
|
||||
3. **Résultat** : `filtered := filter.GetFilteredSet(id)`
|
||||
|
||||
**Mémoire** : ~2v MB par million de k-mers uniques
|
||||
**Temps** : Une seule passe, ~2 lookups/kmer en moyenne
|
||||
**Optimal pour** : v ≤ 20, distribution skewed (séquençage)
|
||||
|
||||
---
|
||||
|
||||
## Code fourni
|
||||
|
||||
1. **frequency_filter.go** - Implémentation complète
|
||||
2. **examples_frequency_filter_final.go** - Exemples d'utilisation
|
||||
|
||||
**Tout est prêt à utiliser !** 🚀
|
||||
@@ -1,320 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"obikmer"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// ==========================================
|
||||
// EXEMPLE 1 : Utilisation basique
|
||||
// ==========================================
|
||||
fmt.Println("=== EXEMPLE 1 : Utilisation basique ===\n")
|
||||
|
||||
k := 31
|
||||
minFreq := 3 // Garder les k-mers vus ≥3 fois
|
||||
|
||||
// Créer le filtre
|
||||
filter := obikmer.NewFrequencyFilter(k, minFreq)
|
||||
|
||||
// Simuler des séquences avec différentes fréquences
|
||||
sequences := [][]byte{
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X (freq=2)
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X (freq=3) ✓
|
||||
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Kmer Y
|
||||
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Kmer Y (freq=2) ✗
|
||||
[]byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // Kmer Z (freq=1) ✗
|
||||
}
|
||||
|
||||
fmt.Printf("Traitement de %d séquences...\n", len(sequences))
|
||||
for _, seq := range sequences {
|
||||
filter.AddSequence(seq)
|
||||
}
|
||||
|
||||
// Récupérer les k-mers filtrés
|
||||
filtered := filter.GetFilteredSet("filtered")
|
||||
fmt.Printf("\nK-mers avec freq ≥ %d: %d\n", minFreq, filtered.Cardinality())
|
||||
|
||||
// Statistiques
|
||||
stats := filter.Stats()
|
||||
fmt.Println("\n" + stats.String())
|
||||
|
||||
// ==========================================
|
||||
// EXEMPLE 2 : Vérifier les niveaux
|
||||
// ==========================================
|
||||
fmt.Println("\n=== EXEMPLE 2 : Inspection des niveaux ===\n")
|
||||
|
||||
// Vérifier chaque niveau
|
||||
for level := 0; level < minFreq; level++ {
|
||||
levelSet := filter.GetKmersAtLevel(level)
|
||||
fmt.Printf("Niveau %d (freq≥%d): %d k-mers\n",
|
||||
level+1, level+1, levelSet.Cardinality())
|
||||
}
|
||||
|
||||
// ==========================================
|
||||
// EXEMPLE 3 : Données réalistes
|
||||
// ==========================================
|
||||
fmt.Println("\n=== EXEMPLE 3 : Simulation données séquençage ===\n")
|
||||
|
||||
filter2 := obikmer.NewFrequencyFilter(31, 3)
|
||||
|
||||
// Simuler un dataset réaliste :
|
||||
// - 1000 reads
|
||||
// - 80% contiennent des erreurs (singletons)
|
||||
// - 15% vrais k-mers à basse fréquence
|
||||
// - 5% vrais k-mers à haute fréquence
|
||||
|
||||
// Vraie séquence répétée
|
||||
trueSeq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG")
|
||||
for i := 0; i < 50; i++ {
|
||||
filter2.AddSequence(trueSeq)
|
||||
}
|
||||
|
||||
// Séquence à fréquence moyenne
|
||||
mediumSeq := []byte("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC")
|
||||
for i := 0; i < 5; i++ {
|
||||
filter2.AddSequence(mediumSeq)
|
||||
}
|
||||
|
||||
// Erreurs de séquençage (singletons)
|
||||
for i := 0; i < 100; i++ {
|
||||
errorSeq := []byte(fmt.Sprintf("TTTTTTTTTTTTTTTTTTTTTTTTTTTT%03d", i))
|
||||
filter2.AddSequence(errorSeq)
|
||||
}
|
||||
|
||||
stats2 := filter2.Stats()
|
||||
fmt.Println(stats2.String())
|
||||
|
||||
fmt.Println("Distribution attendue:")
|
||||
fmt.Println(" - Beaucoup de singletons (erreurs)")
|
||||
fmt.Println(" - Peu de k-mers à haute fréquence (signal)")
|
||||
fmt.Println(" → Filtrage efficace !")
|
||||
|
||||
// ==========================================
|
||||
// EXEMPLE 4 : Tester différents seuils
|
||||
// ==========================================
|
||||
fmt.Println("\n=== EXEMPLE 4 : Comparaison de seuils ===\n")
|
||||
|
||||
testSeqs := [][]byte{
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // freq=5
|
||||
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"),
|
||||
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"),
|
||||
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // freq=3
|
||||
[]byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // freq=1
|
||||
}
|
||||
|
||||
for _, minFreq := range []int{2, 3, 5} {
|
||||
f := obikmer.NewFrequencyFilter(31, minFreq)
|
||||
f.AddSequences(testSeqs)
|
||||
|
||||
fmt.Printf("minFreq=%d: %d k-mers retenus (%.2f MB)\n",
|
||||
minFreq,
|
||||
f.Cardinality(),
|
||||
float64(f.MemoryUsage())/1024/1024)
|
||||
}
|
||||
|
||||
// ==========================================
|
||||
// EXEMPLE 5 : Comparaison mémoire
|
||||
// ==========================================
|
||||
fmt.Println("\n=== EXEMPLE 5 : Comparaison mémoire ===\n")
|
||||
|
||||
filter3 := obikmer.NewFrequencyFilter(31, 3)
|
||||
|
||||
// Simuler 10000 séquences
|
||||
for i := 0; i < 10000; i++ {
|
||||
seq := make([]byte, 100)
|
||||
for j := range seq {
|
||||
seq[j] = "ACGT"[(i+j)%4]
|
||||
}
|
||||
filter3.AddSequence(seq)
|
||||
}
|
||||
|
||||
fmt.Println(filter3.CompareWithSimpleMap())
|
||||
|
||||
// ==========================================
|
||||
// EXEMPLE 6 : Workflow complet
|
||||
// ==========================================
|
||||
fmt.Println("\n=== EXEMPLE 6 : Workflow complet ===\n")
|
||||
|
||||
fmt.Println("1. Créer le filtre")
|
||||
finalFilter := obikmer.NewFrequencyFilter(31, 3)
|
||||
|
||||
fmt.Println("2. Traiter les données (simulation)")
|
||||
// En pratique : lire depuis FASTQ
|
||||
// for read := range ReadFastq("data.fastq") {
|
||||
// finalFilter.AddSequence(read)
|
||||
// }
|
||||
|
||||
// Simulation
|
||||
for i := 0; i < 1000; i++ {
|
||||
seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG")
|
||||
finalFilter.AddSequence(seq)
|
||||
}
|
||||
|
||||
fmt.Println("3. Récupérer les k-mers filtrés")
|
||||
result := finalFilter.GetFilteredSet("final")
|
||||
|
||||
fmt.Println("4. Utiliser le résultat")
|
||||
fmt.Printf(" K-mers de qualité: %d\n", result.Cardinality())
|
||||
fmt.Printf(" Mémoire utilisée: %.2f MB\n", float64(finalFilter.MemoryUsage())/1024/1024)
|
||||
|
||||
fmt.Println("5. Sauvegarder (optionnel)")
|
||||
// result.Save("filtered_kmers.bin")
|
||||
|
||||
// ==========================================
|
||||
// EXEMPLE 7 : Vérification individuelle
|
||||
// ==========================================
|
||||
fmt.Println("\n=== EXEMPLE 7 : Vérification de k-mers spécifiques ===\n")
|
||||
|
||||
checkFilter := obikmer.NewFrequencyFilter(31, 3)
|
||||
|
||||
testSeq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG")
|
||||
for i := 0; i < 5; i++ {
|
||||
checkFilter.AddSequence(testSeq)
|
||||
}
|
||||
|
||||
var kmers []uint64
|
||||
kmers = obikmer.EncodeKmers(testSeq, 31, &kmers)
|
||||
|
||||
if len(kmers) > 0 {
|
||||
testKmer := kmers[0]
|
||||
|
||||
fmt.Printf("K-mer test: 0x%016X\n", testKmer)
|
||||
fmt.Printf(" Présent dans filtre: %v\n", checkFilter.Contains(testKmer))
|
||||
fmt.Printf(" Fréquence approx: %d\n", checkFilter.GetFrequency(testKmer))
|
||||
}
|
||||
|
||||
// ==========================================
|
||||
// EXEMPLE 8 : Intégration avec collection
|
||||
// ==========================================
|
||||
fmt.Println("\n=== EXEMPLE 8 : Intégration avec KmerSetCollection ===\n")
|
||||
|
||||
// Créer une collection de génomes filtrés
|
||||
collection := obikmer.NewKmerSetCollection(31)
|
||||
|
||||
genomes := map[string][][]byte{
|
||||
"Genome1": {
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
||||
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Erreur
|
||||
},
|
||||
"Genome2": {
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
||||
[]byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // Erreur
|
||||
},
|
||||
}
|
||||
|
||||
for id, sequences := range genomes {
|
||||
// Filtrer chaque génome
|
||||
genomeFilter := obikmer.NewFrequencyFilter(31, 3)
|
||||
genomeFilter.AddSequences(sequences)
|
||||
|
||||
// Ajouter à la collection
|
||||
filteredSet := genomeFilter.GetFilteredSet(id)
|
||||
collection.Add(filteredSet)
|
||||
|
||||
fmt.Printf("%s: %d k-mers de qualité\n", id, filteredSet.Cardinality())
|
||||
}
|
||||
|
||||
// Analyser la collection
|
||||
fmt.Println("\nAnalyse comparative:")
|
||||
collectionStats := collection.ComputeStats()
|
||||
fmt.Printf(" Core genome: %d k-mers\n", collectionStats.CoreSize)
|
||||
fmt.Printf(" Pan genome: %d k-mers\n", collectionStats.PanGenomeSize)
|
||||
|
||||
// ==========================================
|
||||
// RÉSUMÉ
|
||||
// ==========================================
|
||||
fmt.Println("\n=== RÉSUMÉ ===\n")
|
||||
fmt.Println("Le FrequencyFilter permet de:")
|
||||
fmt.Println(" ✓ Filtrer les k-mers par fréquence minimale")
|
||||
fmt.Println(" ✓ Utiliser une mémoire optimale avec Roaring bitmaps")
|
||||
fmt.Println(" ✓ Une seule passe sur les données")
|
||||
fmt.Println(" ✓ Éliminer efficacement les erreurs de séquençage")
|
||||
fmt.Println("")
|
||||
fmt.Println("Workflow typique:")
|
||||
fmt.Println(" 1. filter := NewFrequencyFilter(k, minFreq)")
|
||||
fmt.Println(" 2. for each sequence: filter.AddSequence(seq)")
|
||||
fmt.Println(" 3. filtered := filter.GetFilteredSet(id)")
|
||||
fmt.Println(" 4. Utiliser filtered dans vos analyses")
|
||||
}
|
||||
|
||||
// ==================================
|
||||
// FONCTION HELPER POUR BENCHMARKS
|
||||
// ==================================
|
||||
|
||||
func BenchmarkFrequencyFilter() {
|
||||
k := 31
|
||||
minFreq := 3
|
||||
|
||||
// Test avec différentes tailles
|
||||
sizes := []int{1000, 10000, 100000}
|
||||
|
||||
fmt.Println("\n=== BENCHMARK ===\n")
|
||||
|
||||
for _, size := range sizes {
|
||||
filter := obikmer.NewFrequencyFilter(k, minFreq)
|
||||
|
||||
// Générer des séquences
|
||||
for i := 0; i < size; i++ {
|
||||
seq := make([]byte, 100)
|
||||
for j := range seq {
|
||||
seq[j] = "ACGT"[(i+j)%4]
|
||||
}
|
||||
filter.AddSequence(seq)
|
||||
}
|
||||
|
||||
fmt.Printf("Size=%d reads:\n", size)
|
||||
fmt.Printf(" Filtered k-mers: %d\n", filter.Cardinality())
|
||||
fmt.Printf(" Memory: %.2f MB\n", float64(filter.MemoryUsage())/1024/1024)
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
|
||||
// ==================================
|
||||
// FONCTION POUR DONNÉES RÉELLES
|
||||
// ==================================
|
||||
|
||||
func ProcessRealData() {
|
||||
// Exemple pour traiter de vraies données FASTQ
|
||||
|
||||
k := 31
|
||||
minFreq := 3
|
||||
|
||||
filter := obikmer.NewFrequencyFilter(k, minFreq)
|
||||
|
||||
// Pseudo-code pour lire un FASTQ
|
||||
/*
|
||||
fastqFile := "sample.fastq"
|
||||
reader := NewFastqReader(fastqFile)
|
||||
|
||||
for reader.HasNext() {
|
||||
read := reader.Next()
|
||||
filter.AddSequence(read.Sequence)
|
||||
}
|
||||
|
||||
// Récupérer le résultat
|
||||
filtered := filter.GetFilteredSet("sample_filtered")
|
||||
filtered.Save("sample_filtered_kmers.bin")
|
||||
|
||||
// Stats
|
||||
stats := filter.Stats()
|
||||
fmt.Println(stats.String())
|
||||
*/
|
||||
|
||||
fmt.Println("Workflow pour données réelles:")
|
||||
fmt.Println(" 1. Créer le filtre avec minFreq approprié (2-5 typique)")
|
||||
fmt.Println(" 2. Stream les reads depuis FASTQ")
|
||||
fmt.Println(" 3. Récupérer les k-mers filtrés")
|
||||
fmt.Println(" 4. Utiliser pour assemblage/comparaison/etc.")
|
||||
|
||||
_ = filter // unused
|
||||
}
|
||||
@@ -1,317 +0,0 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
// FrequencyFilter filters k-mers by minimum frequency
|
||||
// Specialization of KmerSetGroup where index[i] contains k-mers seen at least i+1 times
|
||||
type FrequencyFilter struct {
|
||||
*KmerSetGroup // Group of KmerSet (one per frequency level)
|
||||
MinFreq int // v - minimum required frequency
|
||||
}
|
||||
|
||||
// NewFrequencyFilter creates a new frequency filter
|
||||
// minFreq: minimum number d'occurrences required (v)
|
||||
func NewFrequencyFilter(k, minFreq int) *FrequencyFilter {
|
||||
ff := &FrequencyFilter{
|
||||
KmerSetGroup: NewKmerSetGroup(k, minFreq),
|
||||
MinFreq: minFreq,
|
||||
}
|
||||
|
||||
// Initialize group metadata
|
||||
ff.SetAttribute("type", "FrequencyFilter")
|
||||
ff.SetAttribute("min_freq", minFreq)
|
||||
|
||||
// Initialize metadata for each level
|
||||
for i := 0; i < minFreq; i++ {
|
||||
level := ff.Get(i)
|
||||
level.SetAttribute("level", i)
|
||||
level.SetAttribute("min_occurrences", i+1)
|
||||
level.SetId(fmt.Sprintf("level_%d", i))
|
||||
}
|
||||
|
||||
return ff
|
||||
}
|
||||
|
||||
// AddSequence adds all k-mers from a sequence to the filter
|
||||
// Uses an iterator to avoid allocating an intermediate vector
|
||||
func (ff *FrequencyFilter) AddSequence(seq *obiseq.BioSequence) {
|
||||
rawSeq := seq.Sequence()
|
||||
for canonical := range IterCanonicalKmers(rawSeq, ff.K()) {
|
||||
ff.AddKmerCode(canonical)
|
||||
}
|
||||
}
|
||||
|
||||
// AddKmerCode adds an encoded k-mer to the filter (main algorithm)
|
||||
func (ff *FrequencyFilter) AddKmerCode(kmer uint64) {
|
||||
// Find the current level of the k-mer
|
||||
c := 0
|
||||
for c < ff.MinFreq && ff.Get(c).Contains(kmer) {
|
||||
c++
|
||||
}
|
||||
|
||||
// Add to next level (if not yet at maximum)
|
||||
if c < ff.MinFreq {
|
||||
ff.Get(c).AddKmerCode(kmer)
|
||||
}
|
||||
}
|
||||
|
||||
// AddCanonicalKmerCode adds an encoded canonical k-mer to the filter
|
||||
func (ff *FrequencyFilter) AddCanonicalKmerCode(kmer uint64) {
|
||||
canonical := CanonicalKmer(kmer, ff.K())
|
||||
ff.AddKmerCode(canonical)
|
||||
}
|
||||
|
||||
// AddKmer adds a k-mer to the filter by encoding the sequence
|
||||
// The sequence must have exactly k nucleotides
|
||||
// Zero-allocation: encodes directly without creating an intermediate slice
|
||||
func (ff *FrequencyFilter) AddKmer(seq []byte) {
|
||||
kmer := EncodeKmer(seq, ff.K())
|
||||
ff.AddKmerCode(kmer)
|
||||
}
|
||||
|
||||
// AddCanonicalKmer adds a canonical k-mer to the filter by encoding the sequence
|
||||
// The sequence must have exactly k nucleotides
|
||||
// Zero-allocation: encodes directly in canonical form without creating an intermediate slice
|
||||
func (ff *FrequencyFilter) AddCanonicalKmer(seq []byte) {
|
||||
canonical := EncodeCanonicalKmer(seq, ff.K())
|
||||
ff.AddKmerCode(canonical)
|
||||
}
|
||||
|
||||
// GetFilteredSet returns a KmerSet of k-mers with frequency ≥ minFreq
|
||||
func (ff *FrequencyFilter) GetFilteredSet() *KmerSet {
|
||||
// Filtered k-mers are in the last level
|
||||
return ff.Get(ff.MinFreq - 1).Copy()
|
||||
}
|
||||
|
||||
// GetKmersAtLevel returns a KmerSet of k-mers seen at least (level+1) times
|
||||
// level doit être dans [0, minFreq-1]
|
||||
func (ff *FrequencyFilter) GetKmersAtLevel(level int) *KmerSet {
|
||||
ks := ff.Get(level)
|
||||
if ks == nil {
|
||||
return NewKmerSet(ff.K())
|
||||
}
|
||||
return ks.Copy()
|
||||
}
|
||||
|
||||
// Stats returns statistics on frequency levels
|
||||
func (ff *FrequencyFilter) Stats() FrequencyFilterStats {
|
||||
stats := FrequencyFilterStats{
|
||||
MinFreq: ff.MinFreq,
|
||||
Levels: make([]LevelStats, ff.MinFreq),
|
||||
}
|
||||
|
||||
for i := 0; i < ff.MinFreq; i++ {
|
||||
ks := ff.Get(i)
|
||||
card := ks.Len()
|
||||
sizeBytes := ks.MemoryUsage()
|
||||
|
||||
stats.Levels[i] = LevelStats{
|
||||
Level: i + 1, // Level 1 = freq ≥ 1
|
||||
Cardinality: card,
|
||||
SizeBytes: sizeBytes,
|
||||
}
|
||||
|
||||
stats.TotalBytes += sizeBytes
|
||||
}
|
||||
|
||||
// The last level contains the result
|
||||
stats.FilteredKmers = stats.Levels[ff.MinFreq-1].Cardinality
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
// FrequencyFilterStats summarizes the state of a FrequencyFilter.
type FrequencyFilterStats struct {
	MinFreq       int    // Minimum frequency the filter enforces.
	FilteredKmers uint64 // Number of k-mers whose frequency is >= MinFreq.
	TotalBytes    uint64 // Total memory consumed by all levels.
	Levels        []LevelStats
}

// LevelStats describes a single frequency level of the filter.
type LevelStats struct {
	Level       int    // K-mers at this level have frequency >= Level.
	Cardinality uint64 // Number of k-mers stored at this level.
	SizeBytes   uint64 // Memory footprint of this level, in bytes.
}

// String renders the statistics as a human-readable multi-line report:
// a header with the filtered count and total memory, followed by one
// line per frequency level.
func (ffs FrequencyFilterStats) String() string {
	report := fmt.Sprintf(`Frequency Filter Statistics (minFreq=%d):
Filtered k-mers (freq≥%d): %d
Total memory: %.2f MB

Level breakdown:
`, ffs.MinFreq, ffs.MinFreq, ffs.FilteredKmers, float64(ffs.TotalBytes)/1024/1024)

	for _, lvl := range ffs.Levels {
		report += fmt.Sprintf(" freq≥%d: %d k-mers (%.2f MB)\n",
			lvl.Level,
			lvl.Cardinality,
			float64(lvl.SizeBytes)/1024/1024)
	}

	return report
}
|
||||
|
||||
// Clear libère la mémoire de tous les niveaux
|
||||
// (héritée de KmerSetGroup mais redéfinie pour clarté)
|
||||
func (ff *FrequencyFilter) Clear() {
|
||||
ff.KmerSetGroup.Clear()
|
||||
}
|
||||
|
||||
// ==================================
|
||||
// BATCH PROCESSING
|
||||
// ==================================
|
||||
|
||||
// AddSequences adds multiple sequences in batch
|
||||
func (ff *FrequencyFilter) AddSequences(sequences *obiseq.BioSequenceSlice) {
|
||||
for _, seq := range *sequences {
|
||||
ff.AddSequence(seq)
|
||||
}
|
||||
}
|
||||
|
||||
// AddSequenceSlice adds all k-mers from a slice of sequences to the filter
|
||||
func (ff *FrequencyFilter) AddSequenceSlice(sequences *obiseq.BioSequenceSlice) {
|
||||
for _, seq := range *sequences {
|
||||
ff.AddSequence(seq)
|
||||
}
|
||||
}
|
||||
|
||||
// ==================================
|
||||
// PERSISTANCE
|
||||
// ==================================
|
||||
|
||||
// Save sauvegarde le FrequencyFilter dans un répertoire
|
||||
// Utilise le format de sérialisation du KmerSetGroup sous-jacent
|
||||
// Les métadonnées incluent le type "FrequencyFilter" et min_freq
|
||||
//
|
||||
// Format:
|
||||
// - directory/metadata.{toml,yaml,json} - métadonnées du filtre
|
||||
// - directory/set_0.roaring - k-mers vus ≥1 fois
|
||||
// - directory/set_1.roaring - k-mers vus ≥2 fois
|
||||
// - ...
|
||||
// - directory/set_{minFreq-1}.roaring - k-mers vus ≥minFreq fois
|
||||
//
|
||||
// Parameters:
|
||||
// - directory: répertoire de destination
|
||||
// - format: format des métadonnées (FormatTOML, FormatYAML, FormatJSON)
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// err := ff.Save("./my_filter", obikmer.FormatTOML)
|
||||
func (ff *FrequencyFilter) Save(directory string, format MetadataFormat) error {
|
||||
// Déléguer à KmerSetGroup qui gère déjà tout
|
||||
return ff.KmerSetGroup.Save(directory, format)
|
||||
}
|
||||
|
||||
// LoadFrequencyFilter charge un FrequencyFilter depuis un répertoire
|
||||
// Vérifie que les métadonnées correspondent à un FrequencyFilter
|
||||
//
|
||||
// Parameters:
|
||||
// - directory: répertoire source
|
||||
//
|
||||
// Returns:
|
||||
// - *FrequencyFilter: le filtre chargé
|
||||
// - error: erreur si le chargement échoue ou si ce n'est pas un FrequencyFilter
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// ff, err := obikmer.LoadFrequencyFilter("./my_filter")
|
||||
func LoadFrequencyFilter(directory string) (*FrequencyFilter, error) {
|
||||
// Charger le KmerSetGroup
|
||||
ksg, err := LoadKmerSetGroup(directory)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Vérifier que c'est bien un FrequencyFilter
|
||||
if typeAttr, ok := ksg.GetAttribute("type"); !ok || typeAttr != "FrequencyFilter" {
|
||||
return nil, fmt.Errorf("loaded data is not a FrequencyFilter (type=%v)", typeAttr)
|
||||
}
|
||||
|
||||
// Récupérer min_freq
|
||||
minFreqAttr, ok := ksg.GetIntAttribute("min_freq")
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("FrequencyFilter missing min_freq attribute")
|
||||
}
|
||||
|
||||
// Créer le FrequencyFilter
|
||||
ff := &FrequencyFilter{
|
||||
KmerSetGroup: ksg,
|
||||
MinFreq: minFreqAttr,
|
||||
}
|
||||
|
||||
return ff, nil
|
||||
}
|
||||
|
||||
// ==================================
|
||||
// UTILITAIRES
|
||||
// ==================================
|
||||
|
||||
// Contains vérifie si un k-mer a atteint la fréquence minimale
|
||||
func (ff *FrequencyFilter) Contains(kmer uint64) bool {
|
||||
canonical := CanonicalKmer(kmer, ff.K())
|
||||
return ff.Get(ff.MinFreq - 1).Contains(canonical)
|
||||
}
|
||||
|
||||
// GetFrequency returns the approximate frequency of a k-mer
|
||||
// Retourne le niveau maximum atteint (freq ≥ niveau)
|
||||
func (ff *FrequencyFilter) GetFrequency(kmer uint64) int {
|
||||
canonical := CanonicalKmer(kmer, ff.K())
|
||||
|
||||
freq := 0
|
||||
for i := 0; i < ff.MinFreq; i++ {
|
||||
if ff.Get(i).Contains(canonical) {
|
||||
freq = i + 1
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return freq
|
||||
}
|
||||
|
||||
// Len returns the number of filtered k-mers or at a specific level
|
||||
// Without argument: returns the number of k-mers with freq ≥ minFreq (last level)
|
||||
// With argument level: returns the number of k-mers with freq ≥ (level+1)
|
||||
// Exemple: Len() pour les k-mers filtrés, Len(2) pour freq ≥ 3
|
||||
// (héritée de KmerSetGroup mais redéfinie pour la documentation)
|
||||
func (ff *FrequencyFilter) Len(level ...int) uint64 {
|
||||
return ff.KmerSetGroup.Len(level...)
|
||||
}
|
||||
|
||||
// MemoryUsage returns memory usage in bytes
|
||||
// (héritée de KmerSetGroup mais redéfinie pour clarté)
|
||||
func (ff *FrequencyFilter) MemoryUsage() uint64 {
|
||||
return ff.KmerSetGroup.MemoryUsage()
|
||||
}
|
||||
|
||||
// ==================================
|
||||
// COMPARAISON AVEC D'AUTRES APPROCHES
|
||||
// ==================================
|
||||
|
||||
// CompareWithSimpleMap compare la mémoire avec une simple map
|
||||
func (ff *FrequencyFilter) CompareWithSimpleMap() string {
|
||||
totalKmers := ff.Get(0).Len()
|
||||
|
||||
simpleMapBytes := totalKmers * 24 // ~24 bytes par entrée
|
||||
roaringBytes := ff.MemoryUsage()
|
||||
|
||||
reduction := float64(simpleMapBytes) / float64(roaringBytes)
|
||||
|
||||
return fmt.Sprintf(`Memory Comparison for %d k-mers:
|
||||
Simple map[uint64]uint32: %.2f MB
|
||||
Roaring filter (v=%d): %.2f MB
|
||||
Reduction: %.1fx
|
||||
`,
|
||||
totalKmers,
|
||||
float64(simpleMapBytes)/1024/1024,
|
||||
ff.MinFreq,
|
||||
float64(roaringBytes)/1024/1024,
|
||||
reduction,
|
||||
)
|
||||
}
|
||||
86
pkg/obikmer/kdi_merge.go
Normal file
86
pkg/obikmer/kdi_merge.go
Normal file
@@ -0,0 +1,86 @@
|
||||
package obikmer
|
||||
|
||||
import "container/heap"
|
||||
|
||||
// mergeItem represents an element in the min-heap for k-way merge.
type mergeItem struct {
	value uint64
	idx   int // index of the reader that produced this value
}

// mergeHeap implements heap.Interface for k-way merge. The heap is ordered
// by k-mer value, so the root is always the smallest pending value across
// all input streams. Uses `any` (Go 1.18+) instead of `interface{}` to
// match modern style elsewhere in the module.
type mergeHeap []mergeItem

func (h mergeHeap) Len() int           { return len(h) }
func (h mergeHeap) Less(i, j int) bool { return h[i].value < h[j].value }
func (h mergeHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }

// Push appends an item; container/heap restores the invariant afterwards.
func (h *mergeHeap) Push(x any) { *h = append(*h, x.(mergeItem)) }

// Pop removes and returns the last element; container/heap has already
// moved the minimum there before calling this.
func (h *mergeHeap) Pop() any {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[:n-1]
	return x
}
|
||||
|
||||
// KWayMerge performs a k-way merge of multiple sorted KdiReader streams.
// For each unique k-mer value, it reports the value and the number of
// input streams that contained it (count).
//
// The merge is streaming: only one head element per reader is held in
// memory (in the min-heap), regardless of partition sizes.
type KWayMerge struct {
	h       mergeHeap    // min-heap of the current head of each stream
	readers []*KdiReader // one sorted input stream per element
}
|
||||
|
||||
// NewKWayMerge creates a k-way merge from multiple KdiReaders.
|
||||
// Each reader must produce values in sorted (ascending) order.
|
||||
func NewKWayMerge(readers []*KdiReader) *KWayMerge {
|
||||
m := &KWayMerge{
|
||||
h: make(mergeHeap, 0, len(readers)),
|
||||
readers: readers,
|
||||
}
|
||||
|
||||
// Initialize heap with first value from each reader
|
||||
for i, r := range readers {
|
||||
if v, ok := r.Next(); ok {
|
||||
m.h = append(m.h, mergeItem{value: v, idx: i})
|
||||
}
|
||||
}
|
||||
heap.Init(&m.h)
|
||||
|
||||
return m
|
||||
}
|
||||
|
||||
// Next returns the next smallest k-mer value, the number of readers
|
||||
// that contained this value (count), and true.
|
||||
// Returns (0, 0, false) when all streams are exhausted.
|
||||
func (m *KWayMerge) Next() (kmer uint64, count int, ok bool) {
|
||||
if len(m.h) == 0 {
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
minVal := m.h[0].value
|
||||
count = 0
|
||||
|
||||
// Pop all items with the same value
|
||||
for len(m.h) > 0 && m.h[0].value == minVal {
|
||||
item := heap.Pop(&m.h).(mergeItem)
|
||||
count++
|
||||
// Advance that reader
|
||||
if v, ok := m.readers[item.idx].Next(); ok {
|
||||
heap.Push(&m.h, mergeItem{value: v, idx: item.idx})
|
||||
}
|
||||
}
|
||||
|
||||
return minVal, count, true
|
||||
}
|
||||
|
||||
// Close closes all underlying readers.
|
||||
func (m *KWayMerge) Close() error {
|
||||
var firstErr error
|
||||
for _, r := range m.readers {
|
||||
if err := r.Close(); err != nil && firstErr == nil {
|
||||
firstErr = err
|
||||
}
|
||||
}
|
||||
return firstErr
|
||||
}
|
||||
159
pkg/obikmer/kdi_merge_test.go
Normal file
159
pkg/obikmer/kdi_merge_test.go
Normal file
@@ -0,0 +1,159 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// writeKdi is a helper that writes sorted kmers to a .kdi file.
// It fails the calling test immediately on any creation/write/close error
// and returns the path of the created file.
func writeKdi(t *testing.T, dir, name string, kmers []uint64) string {
	t.Helper()
	path := filepath.Join(dir, name)
	w, err := NewKdiWriter(path)
	if err != nil {
		t.Fatal(err)
	}
	for _, v := range kmers {
		if err := w.Write(v); err != nil {
			t.Fatal(err)
		}
	}
	// Close also patches the k-mer count into the header.
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}
	return path
}
|
||||
|
||||
func TestKWayMergeBasic(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
// Three sorted streams
|
||||
p1 := writeKdi(t, dir, "a.kdi", []uint64{1, 3, 5, 7})
|
||||
p2 := writeKdi(t, dir, "b.kdi", []uint64{2, 3, 6, 7})
|
||||
p3 := writeKdi(t, dir, "c.kdi", []uint64{3, 4, 7, 8})
|
||||
|
||||
r1, _ := NewKdiReader(p1)
|
||||
r2, _ := NewKdiReader(p2)
|
||||
r3, _ := NewKdiReader(p3)
|
||||
|
||||
m := NewKWayMerge([]*KdiReader{r1, r2, r3})
|
||||
defer m.Close()
|
||||
|
||||
type result struct {
|
||||
kmer uint64
|
||||
count int
|
||||
}
|
||||
var results []result
|
||||
for {
|
||||
kmer, count, ok := m.Next()
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
results = append(results, result{kmer, count})
|
||||
}
|
||||
|
||||
expected := []result{
|
||||
{1, 1}, {2, 1}, {3, 3}, {4, 1}, {5, 1}, {6, 1}, {7, 3}, {8, 1},
|
||||
}
|
||||
if len(results) != len(expected) {
|
||||
t.Fatalf("got %d results, want %d", len(results), len(expected))
|
||||
}
|
||||
for i, exp := range expected {
|
||||
if results[i] != exp {
|
||||
t.Errorf("result %d: got %+v, want %+v", i, results[i], exp)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestKWayMergeSingleStream(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
p := writeKdi(t, dir, "a.kdi", []uint64{10, 20, 30})
|
||||
|
||||
r, _ := NewKdiReader(p)
|
||||
m := NewKWayMerge([]*KdiReader{r})
|
||||
defer m.Close()
|
||||
|
||||
vals := []uint64{10, 20, 30}
|
||||
for _, expected := range vals {
|
||||
kmer, count, ok := m.Next()
|
||||
if !ok {
|
||||
t.Fatal("unexpected EOF")
|
||||
}
|
||||
if kmer != expected || count != 1 {
|
||||
t.Fatalf("got (%d, %d), want (%d, 1)", kmer, count, expected)
|
||||
}
|
||||
}
|
||||
_, _, ok := m.Next()
|
||||
if ok {
|
||||
t.Fatal("expected EOF")
|
||||
}
|
||||
}
|
||||
|
||||
func TestKWayMergeEmpty(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
p1 := writeKdi(t, dir, "a.kdi", nil)
|
||||
p2 := writeKdi(t, dir, "b.kdi", nil)
|
||||
|
||||
r1, _ := NewKdiReader(p1)
|
||||
r2, _ := NewKdiReader(p2)
|
||||
|
||||
m := NewKWayMerge([]*KdiReader{r1, r2})
|
||||
defer m.Close()
|
||||
|
||||
_, _, ok := m.Next()
|
||||
if ok {
|
||||
t.Fatal("expected no results from empty streams")
|
||||
}
|
||||
}
|
||||
|
||||
func TestKWayMergeDisjoint(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
p1 := writeKdi(t, dir, "a.kdi", []uint64{1, 2, 3})
|
||||
p2 := writeKdi(t, dir, "b.kdi", []uint64{10, 20, 30})
|
||||
|
||||
r1, _ := NewKdiReader(p1)
|
||||
r2, _ := NewKdiReader(p2)
|
||||
|
||||
m := NewKWayMerge([]*KdiReader{r1, r2})
|
||||
defer m.Close()
|
||||
|
||||
expected := []uint64{1, 2, 3, 10, 20, 30}
|
||||
for _, exp := range expected {
|
||||
kmer, count, ok := m.Next()
|
||||
if !ok {
|
||||
t.Fatal("unexpected EOF")
|
||||
}
|
||||
if kmer != exp || count != 1 {
|
||||
t.Fatalf("got (%d, %d), want (%d, 1)", kmer, count, exp)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestKWayMergeAllSame(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
p1 := writeKdi(t, dir, "a.kdi", []uint64{42})
|
||||
p2 := writeKdi(t, dir, "b.kdi", []uint64{42})
|
||||
p3 := writeKdi(t, dir, "c.kdi", []uint64{42})
|
||||
|
||||
r1, _ := NewKdiReader(p1)
|
||||
r2, _ := NewKdiReader(p2)
|
||||
r3, _ := NewKdiReader(p3)
|
||||
|
||||
m := NewKWayMerge([]*KdiReader{r1, r2, r3})
|
||||
defer m.Close()
|
||||
|
||||
kmer, count, ok := m.Next()
|
||||
if !ok {
|
||||
t.Fatal("expected one result")
|
||||
}
|
||||
if kmer != 42 || count != 3 {
|
||||
t.Fatalf("got (%d, %d), want (42, 3)", kmer, count)
|
||||
}
|
||||
_, _, ok = m.Next()
|
||||
if ok {
|
||||
t.Fatal("expected EOF")
|
||||
}
|
||||
}
|
||||
96
pkg/obikmer/kdi_reader.go
Normal file
96
pkg/obikmer/kdi_reader.go
Normal file
@@ -0,0 +1,96 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
)
|
||||
|
||||
// KdiReader reads k-mers from a .kdi file using streaming delta-varint decoding.
// It is a forward-only reader: values are returned in ascending order, one
// per call to Next, without loading the whole partition into memory.
type KdiReader struct {
	r       *bufio.Reader
	file    *os.File
	count   uint64 // total number of k-mers (from the file header)
	read    uint64 // number of k-mers already consumed
	prev    uint64 // last decoded value (delta accumulator)
	started bool   // whether first value has been read
}
|
||||
|
||||
// NewKdiReader opens a .kdi file for streaming reading.
|
||||
func NewKdiReader(path string) (*KdiReader, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
r := bufio.NewReaderSize(f, 65536)
|
||||
|
||||
// Read and verify magic
|
||||
var magic [4]byte
|
||||
if _, err := io.ReadFull(r, magic[:]); err != nil {
|
||||
f.Close()
|
||||
return nil, fmt.Errorf("kdi: read magic: %w", err)
|
||||
}
|
||||
if magic != kdiMagic {
|
||||
f.Close()
|
||||
return nil, fmt.Errorf("kdi: bad magic %v", magic)
|
||||
}
|
||||
|
||||
// Read count
|
||||
var countBuf [8]byte
|
||||
if _, err := io.ReadFull(r, countBuf[:]); err != nil {
|
||||
f.Close()
|
||||
return nil, fmt.Errorf("kdi: read count: %w", err)
|
||||
}
|
||||
count := binary.LittleEndian.Uint64(countBuf[:])
|
||||
|
||||
return &KdiReader{
|
||||
r: r,
|
||||
file: f,
|
||||
count: count,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Next returns the next k-mer and true, or (0, false) when exhausted.
//
// NOTE(review): an I/O or varint-decoding error mid-stream is reported the
// same way as normal exhaustion (0, false). Callers can detect a truncated
// read afterwards by checking Remaining() > 0.
func (kr *KdiReader) Next() (uint64, bool) {
	if kr.read >= kr.count {
		return 0, false
	}

	if !kr.started {
		// Read first value as absolute uint64 LE
		var buf [8]byte
		if _, err := io.ReadFull(kr.r, buf[:]); err != nil {
			return 0, false
		}
		kr.prev = binary.LittleEndian.Uint64(buf[:])
		kr.started = true
		kr.read++
		return kr.prev, true
	}

	// Read delta varint; values are stored as differences from the previous
	// (sorted) k-mer, so the absolute value is rebuilt by accumulation.
	delta, err := DecodeVarint(kr.r)
	if err != nil {
		return 0, false
	}
	kr.prev += delta
	kr.read++
	return kr.prev, true
}
|
||||
|
||||
// Count returns the total number of k-mers in this partition.
|
||||
func (kr *KdiReader) Count() uint64 {
|
||||
return kr.count
|
||||
}
|
||||
|
||||
// Remaining returns how many k-mers have not been read yet.
|
||||
func (kr *KdiReader) Remaining() uint64 {
|
||||
return kr.count - kr.read
|
||||
}
|
||||
|
||||
// Close closes the underlying file.
|
||||
func (kr *KdiReader) Close() error {
|
||||
return kr.file.Close()
|
||||
}
|
||||
255
pkg/obikmer/kdi_test.go
Normal file
255
pkg/obikmer/kdi_test.go
Normal file
@@ -0,0 +1,255 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestKdiRoundTrip writes a sorted set of k-mers (including large deltas
// and near-maximal values) and verifies the reader returns exactly the
// same sequence, the same count, and then EOF.
func TestKdiRoundTrip(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "test.kdi")

	// Sorted k-mer values
	kmers := []uint64{10, 20, 30, 100, 200, 500, 10000, 1 << 40, 1<<62 - 1}

	w, err := NewKdiWriter(path)
	if err != nil {
		t.Fatal(err)
	}
	for _, v := range kmers {
		if err := w.Write(v); err != nil {
			t.Fatal(err)
		}
	}
	if w.Count() != uint64(len(kmers)) {
		t.Fatalf("writer count: got %d, want %d", w.Count(), len(kmers))
	}
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	// Read back
	r, err := NewKdiReader(path)
	if err != nil {
		t.Fatal(err)
	}
	defer r.Close()

	if r.Count() != uint64(len(kmers)) {
		t.Fatalf("reader count: got %d, want %d", r.Count(), len(kmers))
	}

	for i, expected := range kmers {
		got, ok := r.Next()
		if !ok {
			t.Fatalf("unexpected EOF at index %d", i)
		}
		if got != expected {
			t.Fatalf("kmer %d: got %d, want %d", i, got, expected)
		}
	}

	// After the declared count is consumed, Next must report exhaustion.
	_, ok := r.Next()
	if ok {
		t.Fatal("expected EOF after all k-mers")
	}
}
|
||||
|
||||
func TestKdiEmpty(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "empty.kdi")
|
||||
|
||||
w, err := NewKdiWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
r, err := NewKdiReader(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
if r.Count() != 0 {
|
||||
t.Fatalf("expected count 0, got %d", r.Count())
|
||||
}
|
||||
|
||||
_, ok := r.Next()
|
||||
if ok {
|
||||
t.Fatal("expected no k-mers in empty file")
|
||||
}
|
||||
}
|
||||
|
||||
func TestKdiSingleValue(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "single.kdi")
|
||||
|
||||
w, err := NewKdiWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Write(42); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
r, err := NewKdiReader(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
if r.Count() != 1 {
|
||||
t.Fatalf("expected count 1, got %d", r.Count())
|
||||
}
|
||||
|
||||
v, ok := r.Next()
|
||||
if !ok {
|
||||
t.Fatal("expected one k-mer")
|
||||
}
|
||||
if v != 42 {
|
||||
t.Fatalf("got %d, want 42", v)
|
||||
}
|
||||
}
|
||||
|
||||
func TestKdiFileSize(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "size.kdi")
|
||||
|
||||
// Write: magic(4) + count(8) + first(8) = 20 bytes
|
||||
w, err := NewKdiWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Write(0); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// magic(4) + count(8) + first(8) = 20
|
||||
if info.Size() != 20 {
|
||||
t.Fatalf("file size: got %d, want 20", info.Size())
|
||||
}
|
||||
}
|
||||
|
||||
// TestKdiDeltaCompression checks that dense consecutive values compress to
// one varint byte per delta, pinning the exact file size, and that the
// compressed data still round-trips losslessly.
func TestKdiDeltaCompression(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "delta.kdi")

	// Dense consecutive values should compress well
	n := 10000
	kmers := make([]uint64, n)
	for i := range kmers {
		kmers[i] = uint64(i * 2) // even numbers
	}

	w, err := NewKdiWriter(path)
	if err != nil {
		t.Fatal(err)
	}
	for _, v := range kmers {
		if err := w.Write(v); err != nil {
			t.Fatal(err)
		}
	}
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	// Each delta is 2, encoded as 1 byte varint
	// Total: magic(4) + count(8) + first(8) + (n-1)*1 = 20 + 9999 bytes
	info, err := os.Stat(path)
	if err != nil {
		t.Fatal(err)
	}
	expected := int64(20 + n - 1)
	if info.Size() != expected {
		t.Fatalf("file size: got %d, want %d", info.Size(), expected)
	}

	// Verify round-trip
	r, err := NewKdiReader(path)
	if err != nil {
		t.Fatal(err)
	}
	defer r.Close()

	for i, expected := range kmers {
		got, ok := r.Next()
		if !ok {
			t.Fatalf("unexpected EOF at index %d", i)
		}
		if got != expected {
			t.Fatalf("kmer %d: got %d, want %d", i, got, expected)
		}
	}
}
|
||||
|
||||
// TestKdiFromRealKmers exercises the writer/reader with k-mers extracted
// from an actual nucleotide sequence: extract, sort, dedup (the writer
// requires strictly increasing input), write, then verify the read-back.
func TestKdiFromRealKmers(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "real.kdi")

	// Extract k-mers from a sequence, sort, dedup, write to KDI
	seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
	k := 15

	var kmers []uint64
	for kmer := range IterCanonicalKmers(seq, k) {
		kmers = append(kmers, kmer)
	}
	sort.Slice(kmers, func(i, j int) bool { return kmers[i] < kmers[j] })
	// Dedup in place: reuses the sorted slice's backing array.
	deduped := kmers[:0]
	for i, v := range kmers {
		if i == 0 || v != kmers[i-1] {
			deduped = append(deduped, v)
		}
	}

	w, err := NewKdiWriter(path)
	if err != nil {
		t.Fatal(err)
	}
	for _, v := range deduped {
		if err := w.Write(v); err != nil {
			t.Fatal(err)
		}
	}
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	// Read back and verify
	r, err := NewKdiReader(path)
	if err != nil {
		t.Fatal(err)
	}
	defer r.Close()

	if r.Count() != uint64(len(deduped)) {
		t.Fatalf("count: got %d, want %d", r.Count(), len(deduped))
	}

	for i, expected := range deduped {
		got, ok := r.Next()
		if !ok {
			t.Fatalf("unexpected EOF at index %d", i)
		}
		if got != expected {
			t.Fatalf("kmer %d: got %d, want %d", i, got, expected)
		}
	}
}
|
||||
113
pkg/obikmer/kdi_writer.go
Normal file
113
pkg/obikmer/kdi_writer.go
Normal file
@@ -0,0 +1,113 @@
|
||||
package obikmer
|
||||
|
||||
import (
	"bufio"
	"encoding/binary"
	"fmt"
	"os"
)
|
||||
|
||||
// KDI file magic bytes: "KDI\x01" (3-byte format identifier + version byte).
var kdiMagic = [4]byte{'K', 'D', 'I', 0x01}
|
||||
|
||||
// KdiWriter writes a sorted sequence of uint64 k-mers to a .kdi file
// using delta-varint encoding.
//
// Format:
//
//	[magic: 4 bytes "KDI\x01"]
//	[count: uint64 LE]   number of k-mers
//	[first: uint64 LE]   first k-mer (absolute value)
//	[delta_1: varint]    arr[1] - arr[0]
//	[delta_2: varint]    arr[2] - arr[1]
//	...
//
// The caller must write k-mers in strictly increasing order.
type KdiWriter struct {
	w     *bufio.Writer
	file  *os.File
	count uint64 // number of k-mers written so far
	prev  uint64 // last value written (base for the next delta)
	first bool   // true until the first (absolute) value has been written
	path  string // destination path (kept for diagnostics)
}
|
||||
|
||||
// NewKdiWriter creates a new KdiWriter writing to the given file path.
|
||||
// The header (magic + count placeholder) is written immediately.
|
||||
// Count is patched on Close().
|
||||
func NewKdiWriter(path string) (*KdiWriter, error) {
|
||||
f, err := os.Create(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
w := bufio.NewWriterSize(f, 65536)
|
||||
|
||||
// Write magic
|
||||
if _, err := w.Write(kdiMagic[:]); err != nil {
|
||||
f.Close()
|
||||
return nil, err
|
||||
}
|
||||
// Write placeholder for count (will be patched on Close)
|
||||
var countBuf [8]byte
|
||||
if _, err := w.Write(countBuf[:]); err != nil {
|
||||
f.Close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &KdiWriter{
|
||||
w: w,
|
||||
file: f,
|
||||
first: true,
|
||||
path: path,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Write adds a k-mer to the file. K-mers must be written in strictly
|
||||
// increasing order.
|
||||
func (kw *KdiWriter) Write(kmer uint64) error {
|
||||
if kw.first {
|
||||
// Write first value as absolute uint64 LE
|
||||
var buf [8]byte
|
||||
binary.LittleEndian.PutUint64(buf[:], kmer)
|
||||
if _, err := kw.w.Write(buf[:]); err != nil {
|
||||
return err
|
||||
}
|
||||
kw.prev = kmer
|
||||
kw.first = false
|
||||
} else {
|
||||
delta := kmer - kw.prev
|
||||
if _, err := EncodeVarint(kw.w, delta); err != nil {
|
||||
return err
|
||||
}
|
||||
kw.prev = kmer
|
||||
}
|
||||
kw.count++
|
||||
return nil
|
||||
}
|
||||
|
||||
// Count returns the number of k-mers written so far.
|
||||
func (kw *KdiWriter) Count() uint64 {
|
||||
return kw.count
|
||||
}
|
||||
|
||||
// Close flushes buffered data, patches the count in the header,
|
||||
// and closes the file.
|
||||
func (kw *KdiWriter) Close() error {
|
||||
if err := kw.w.Flush(); err != nil {
|
||||
kw.file.Close()
|
||||
return err
|
||||
}
|
||||
|
||||
// Patch count at offset 4 (after magic)
|
||||
if _, err := kw.file.Seek(4, 0); err != nil {
|
||||
kw.file.Close()
|
||||
return err
|
||||
}
|
||||
var countBuf [8]byte
|
||||
binary.LittleEndian.PutUint64(countBuf[:], kw.count)
|
||||
if _, err := kw.file.Write(countBuf[:]); err != nil {
|
||||
kw.file.Close()
|
||||
return err
|
||||
}
|
||||
|
||||
return kw.file.Close()
|
||||
}
|
||||
@@ -1,204 +0,0 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sync"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
)
|
||||
|
||||
// DefaultMinimizerSize returns ceil(k / 2.5) as a reasonable default
// minimizer size, clamped to the valid range [1, k-1].
func DefaultMinimizerSize(k int) int {
	size := int(math.Ceil(float64(k) / 2.5))
	if size < 1 {
		size = 1
	}
	// A minimizer must be shorter than the k-mer itself.
	if size >= k {
		size = k - 1
	}
	return size
}
|
||||
|
||||
// MinMinimizerSize returns the minimum m such that 4^m >= nworkers,
// i.e. ceil(log(nworkers) / log(4)).
//
// The computation is done with integer arithmetic: the previous
// math.Log-based formula can mis-round at exact powers of four
// because of floating-point imprecision.
func MinMinimizerSize(nworkers int) int {
	if nworkers <= 1 {
		return 1
	}
	m := 0
	capacity := 1
	for capacity < nworkers {
		capacity *= 4
		m++
	}
	return m
}
|
||||
|
||||
// ValidateMinimizerSize checks and adjusts the minimizer size to satisfy constraints:
//   - m >= ceil(log(nworkers)/log(4)) so that the minimizer space can be
//     partitioned over all workers
//   - 1 <= m < k (a minimizer must be shorter than the k-mer)
//
// A too-small m is raised (with a warning); the clamps are applied in order,
// so the upper bound m < k wins if the two constraints conflict.
func ValidateMinimizerSize(m, k, nworkers int) int {
	minM := MinMinimizerSize(nworkers)
	if m < minM {
		log.Warnf("Minimizer size %d too small for %d workers (4^%d = %d < %d), adjusting to %d",
			m, nworkers, m, 1<<(2*m), nworkers, minM)
		m = minM
	}
	if m < 1 {
		m = 1
	}
	if m >= k {
		m = k - 1
	}
	return m
}
|
||||
|
||||
// BuildKmerIndex builds a KmerSet from an iterator using parallel super-kmer partitioning.
//
// The algorithm:
//  1. Extract super-kmers from each sequence using IterSuperKmers
//  2. Route each super-kmer to a worker based on minimizer % nworkers
//  3. Each worker extracts canonical k-mers and adds them to its local KmerSet
//  4. Merge all KmerSets via Union
//
// Because every k-mer of a super-kmer shares the same minimizer, all
// occurrences of a given k-mer are routed to the same worker, so the
// per-worker sets form (mostly) disjoint partitions of the k-mer space.
//
// Parameters:
//   - iterator: source of BioSequence batches
//   - k: k-mer size (1-31)
//   - m: minimizer size (1 to k-1)
func BuildKmerIndex(iterator obiiter.IBioSequence, k, m int) *KmerSet {
	nproc := obidefault.ParallelWorkers()
	m = ValidateMinimizerSize(m, k, nproc)

	// Channels to route super-kmers to workers
	channels := make([]chan SuperKmer, nproc)
	for i := range channels {
		channels[i] = make(chan SuperKmer, 1024)
	}

	// Workers: each manages a partition of the minimizer space
	sets := make([]*KmerSet, nproc)
	waiter := sync.WaitGroup{}
	waiter.Add(nproc)
	for i := 0; i < nproc; i++ {
		sets[i] = NewKmerSet(k)
		go func(ch chan SuperKmer, ks *KmerSet) {
			defer waiter.Done()
			for sk := range ch {
				for kmer := range IterCanonicalKmers(sk.Sequence, k) {
					ks.AddKmerCode(kmer)
				}
			}
		}(channels[i], sets[i])
	}

	// Reader: extract super-kmers and route them
	seqCount := 0
	for iterator.Next() {
		batch := iterator.Get()
		for _, seq := range batch.Slice() {
			rawSeq := seq.Sequence()
			if len(rawSeq) < k {
				// Too short to contain a single k-mer.
				continue
			}
			for sk := range IterSuperKmers(rawSeq, k, m) {
				worker := int(sk.Minimizer % uint64(nproc))
				channels[worker] <- sk
			}
			seqCount++
		}
	}

	// Close channels to signal workers to finish
	for _, ch := range channels {
		close(ch)
	}
	waiter.Wait()

	log.Infof("Processed %d sequences", seqCount)

	// Merge partitions (mostly disjoint -> fast union)
	result := sets[0]
	for i := 1; i < nproc; i++ {
		result.bitmap.Or(sets[i].bitmap)
	}

	log.Infof("Index contains %d k-mers (%.2f MB)",
		result.Len(), float64(result.MemoryUsage())/1024/1024)

	return result
}
|
||||
|
||||
// BuildFrequencyFilterIndex builds a FrequencyFilter from an iterator
// using parallel super-kmer partitioning.
//
// Each worker manages its own FrequencyFilter for its partition of the
// minimizer space. Since all k-mers sharing a minimizer go to the same worker,
// the frequency counting is correct per partition.
//
// Parameters:
//   - iterator: source of BioSequence batches
//   - k: k-mer size (1-31)
//   - m: minimizer size (1 to k-1)
//   - minFreq: minimum frequency threshold (>= 1)
func BuildFrequencyFilterIndex(iterator obiiter.IBioSequence, k, m, minFreq int) *FrequencyFilter {
	nproc := obidefault.ParallelWorkers()
	m = ValidateMinimizerSize(m, k, nproc)

	// Channels to route super-kmers to workers
	channels := make([]chan SuperKmer, nproc)
	for i := range channels {
		channels[i] = make(chan SuperKmer, 1024)
	}

	// Workers: each manages a local FrequencyFilter
	filters := make([]*FrequencyFilter, nproc)
	waiter := sync.WaitGroup{}
	waiter.Add(nproc)
	for i := 0; i < nproc; i++ {
		filters[i] = NewFrequencyFilter(k, minFreq)
		go func(ch chan SuperKmer, ff *FrequencyFilter) {
			defer waiter.Done()
			for sk := range ch {
				for kmer := range IterCanonicalKmers(sk.Sequence, k) {
					ff.AddKmerCode(kmer)
				}
			}
		}(channels[i], filters[i])
	}

	// Reader: extract super-kmers and route them
	seqCount := 0
	for iterator.Next() {
		batch := iterator.Get()
		for _, seq := range batch.Slice() {
			rawSeq := seq.Sequence()
			if len(rawSeq) < k {
				// Too short to contain a single k-mer.
				continue
			}
			for sk := range IterSuperKmers(rawSeq, k, m) {
				worker := int(sk.Minimizer % uint64(nproc))
				channels[worker] <- sk
			}
			seqCount++
		}
	}

	// Close channels to signal workers to finish
	for _, ch := range channels {
		close(ch)
	}
	waiter.Wait()

	log.Infof("Processed %d sequences", seqCount)

	// Merge FrequencyFilters: union level by level. Partitions are disjoint,
	// so a per-level bitmap OR preserves the per-k-mer frequency levels.
	result := filters[0]
	for i := 1; i < nproc; i++ {
		for level := 0; level < minFreq; level++ {
			result.Get(level).bitmap.Or(filters[i].Get(level).bitmap)
		}
	}

	stats := result.Stats()
	log.Infof("FrequencyFilter: %d k-mers with freq >= %d (%.2f MB total)",
		stats.FilteredKmers, minFreq, float64(stats.TotalBytes)/1024/1024)

	return result
}
|
||||
@@ -1,224 +0,0 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"github.com/RoaringBitmap/roaring/roaring64"
|
||||
)
|
||||
|
||||
// KmerSet wraps a set of k-mers stored in a Roaring Bitmap.
// Provides utility methods for manipulating k-mer sets: membership,
// batch insertion from sequences, cardinality and memory accounting.
type KmerSet struct {
	id       string                 // Unique identifier of the KmerSet
	k        int                    // Size of k-mers (immutable)
	bitmap   *roaring64.Bitmap      // Bitmap containing the encoded k-mers
	Metadata map[string]interface{} // User metadata (key=atomic value)
}
|
||||
|
||||
// NewKmerSet creates a new empty KmerSet for k-mers of size k,
// backed by a fresh roaring bitmap and an empty metadata map.
func NewKmerSet(k int) *KmerSet {
	return &KmerSet{
		k:        k,
		bitmap:   roaring64.New(),
		Metadata: make(map[string]interface{}),
	}
}
|
||||
|
||||
// NewKmerSetFromBitmap creates a KmerSet from an existing bitmap.
// The bitmap is adopted, not copied: the caller must not mutate it afterwards.
func NewKmerSetFromBitmap(k int, bitmap *roaring64.Bitmap) *KmerSet {
	return &KmerSet{
		k:        k,
		bitmap:   bitmap,
		Metadata: make(map[string]interface{}),
	}
}
|
||||
|
||||
// K returns the size of k-mers (immutable for the lifetime of the set).
func (ks *KmerSet) K() int {
	return ks.k
}
|
||||
|
||||
// AddKmerCode adds an already-encoded k-mer to the set, as-is
// (no canonicalization is applied).
func (ks *KmerSet) AddKmerCode(kmer uint64) {
	ks.bitmap.Add(kmer)
}
|
||||
|
||||
// AddCanonicalKmerCode adds an encoded k-mer to the set after reducing it
// to its canonical form (min of forward and reverse-complement encodings).
func (ks *KmerSet) AddCanonicalKmerCode(kmer uint64) {
	canonical := CanonicalKmer(kmer, ks.k)
	ks.bitmap.Add(canonical)
}
|
||||
|
||||
// AddKmer adds a k-mer to the set by encoding the sequence.
// The sequence must have exactly k nucleotides.
// Zero-allocation: encodes directly without creating an intermediate slice.
func (ks *KmerSet) AddKmer(seq []byte) {
	kmer := EncodeKmer(seq, ks.k)
	ks.bitmap.Add(kmer)
}
|
||||
|
||||
// AddCanonicalKmer adds a canonical k-mer to the set by encoding the sequence.
// The sequence must have exactly k nucleotides.
// Zero-allocation: encodes directly in canonical form without creating an
// intermediate slice.
func (ks *KmerSet) AddCanonicalKmer(seq []byte) {
	canonical := EncodeCanonicalKmer(seq, ks.k)
	ks.bitmap.Add(canonical)
}
|
||||
|
||||
// AddSequence adds all canonical k-mers from a sequence to the set.
// Uses an iterator to avoid allocating an intermediate vector.
// Sequences shorter than k contribute no k-mers.
func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) {
	rawSeq := seq.Sequence()
	for canonical := range IterCanonicalKmers(rawSeq, ks.k) {
		ks.bitmap.Add(canonical)
	}
}
|
||||
|
||||
// AddSequences adds all canonical k-mers from multiple sequences in batch.
func (ks *KmerSet) AddSequences(sequences *obiseq.BioSequenceSlice) {
	for _, seq := range *sequences {
		ks.AddSequence(seq)
	}
}
|
||||
|
||||
// AddSequenceSlice adds all k-mers from a slice of sequences
|
||||
func (ks *KmerSet) AddSequenceSlice(sequences *obiseq.BioSequenceSlice) {
|
||||
for _, seq := range *sequences {
|
||||
ks.AddSequence(seq)
|
||||
}
|
||||
}
|
||||
|
||||
// Contains reports whether the encoded k-mer is present in the set.
func (ks *KmerSet) Contains(kmer uint64) bool {
	return ks.bitmap.Contains(kmer)
}
|
||||
|
||||
// Len returns the number of distinct k-mers in the set.
func (ks *KmerSet) Len() uint64 {
	return ks.bitmap.GetCardinality()
}
|
||||
|
||||
// MemoryUsage returns the in-memory size of the underlying bitmap in bytes.
func (ks *KmerSet) MemoryUsage() uint64 {
	return ks.bitmap.GetSizeInBytes()
}
|
||||
|
||||
// Clear empties the set. Metadata and the identifier are left untouched.
func (ks *KmerSet) Clear() {
	ks.bitmap.Clear()
}
|
||||
|
||||
// Copy creates a copy of the set (consistent with BioSequence.Copy)
|
||||
func (ks *KmerSet) Copy() *KmerSet {
|
||||
// Copy metadata
|
||||
metadata := make(map[string]interface{}, len(ks.Metadata))
|
||||
for k, v := range ks.Metadata {
|
||||
metadata[k] = v
|
||||
}
|
||||
|
||||
return &KmerSet{
|
||||
id: ks.id,
|
||||
k: ks.k,
|
||||
bitmap: ks.bitmap.Clone(),
|
||||
Metadata: metadata,
|
||||
}
|
||||
}
|
||||
|
||||
// Id returns the identifier of the KmerSet (consistent with BioSequence.Id).
func (ks *KmerSet) Id() string {
	return ks.id
}
|
||||
|
||||
// SetId sets the identifier of the KmerSet (consistent with BioSequence.SetId).
func (ks *KmerSet) SetId(id string) {
	ks.id = id
}
|
||||
|
||||
// Union returns a new KmerSet holding the union of this set with another.
// Neither operand is modified. Panics if the two sets have different k.
func (ks *KmerSet) Union(other *KmerSet) *KmerSet {
	if ks.k != other.k {
		panic(fmt.Sprintf("Cannot union KmerSets with different k values: %d vs %d", ks.k, other.k))
	}
	result := ks.bitmap.Clone()
	result.Or(other.bitmap)
	return NewKmerSetFromBitmap(ks.k, result)
}
|
||||
|
||||
// Intersect returns a new KmerSet holding the intersection of this set
// with another. Neither operand is modified. Panics if k differs.
func (ks *KmerSet) Intersect(other *KmerSet) *KmerSet {
	if ks.k != other.k {
		panic(fmt.Sprintf("Cannot intersect KmerSets with different k values: %d vs %d", ks.k, other.k))
	}
	result := ks.bitmap.Clone()
	result.And(other.bitmap)
	return NewKmerSetFromBitmap(ks.k, result)
}
|
||||
|
||||
// Difference returns a new KmerSet holding the difference of this set
// with another (this - other). Neither operand is modified.
// Panics if the two sets have different k.
func (ks *KmerSet) Difference(other *KmerSet) *KmerSet {
	if ks.k != other.k {
		panic(fmt.Sprintf("Cannot subtract KmerSets with different k values: %d vs %d", ks.k, other.k))
	}
	result := ks.bitmap.Clone()
	result.AndNot(other.bitmap)
	return NewKmerSetFromBitmap(ks.k, result)
}
|
||||
|
||||
// JaccardDistance computes the Jaccard distance between two KmerSets.
|
||||
// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A ∪ B|)
|
||||
// where A and B are the two sets.
|
||||
//
|
||||
// Returns:
|
||||
// - 0.0 when sets are identical (distance = 0, similarity = 1)
|
||||
// - 1.0 when sets are completely disjoint (distance = 1, similarity = 0)
|
||||
// - 1.0 when both sets are empty (by convention)
|
||||
//
|
||||
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
|
||||
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
|
||||
func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 {
|
||||
if ks.k != other.k {
|
||||
panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k))
|
||||
}
|
||||
|
||||
// Compute intersection cardinality
|
||||
intersectionCard := ks.bitmap.AndCardinality(other.bitmap)
|
||||
|
||||
// Compute union cardinality
|
||||
unionCard := ks.bitmap.OrCardinality(other.bitmap)
|
||||
|
||||
// If union is empty, both sets are empty - return 1.0 by convention
|
||||
if unionCard == 0 {
|
||||
return 1.0
|
||||
}
|
||||
|
||||
// Jaccard similarity = |A ∩ B| / |A ∪ B|
|
||||
similarity := float64(intersectionCard) / float64(unionCard)
|
||||
|
||||
// Jaccard distance = 1 - similarity
|
||||
return 1.0 - similarity
|
||||
}
|
||||
|
||||
// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets.
// The Jaccard similarity is defined as: |A ∩ B| / |A ∪ B|
//
// Returns:
//   - 1.0 when sets are identical (maximum similarity)
//   - 0.0 when sets are completely disjoint (no similarity)
//   - 0.0 when both sets are empty (by convention, since JaccardDistance
//     returns 1.0 in that case)
//
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 {
	return 1.0 - ks.JaccardDistance(other)
}
|
||||
|
||||
// Iterator returns an iterator over all k-mers in the set,
// in increasing encoded order.
func (ks *KmerSet) Iterator() roaring64.IntIterable64 {
	return ks.bitmap.Iterator()
}
|
||||
|
||||
// Bitmap returns the underlying bitmap (for compatibility).
// Mutating the returned bitmap mutates the set.
func (ks *KmerSet) Bitmap() *roaring64.Bitmap {
	return ks.bitmap
}
|
||||
@@ -1,362 +0,0 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
// ==================================
|
||||
// KMER SET ATTRIBUTE API
|
||||
// Mimic BioSequence attribute API from obiseq/attributes.go
|
||||
// ==================================
|
||||
|
||||
// HasAttribute reports whether the attribute key exists in the metadata.
func (ks *KmerSet) HasAttribute(key string) bool {
	_, ok := ks.Metadata[key]
	return ok
}
|
||||
|
||||
// GetAttribute retrieves the value of an attribute.
// Special cases: "id" delegates to Id(), "k" delegates to K();
// both always report ok=true.
func (ks *KmerSet) GetAttribute(key string) (interface{}, bool) {
	switch key {
	case "id":
		return ks.Id(), true
	case "k":
		return ks.K(), true
	default:
		value, ok := ks.Metadata[key]
		return value, ok
	}
}
|
||||
|
||||
// SetAttribute sets the value of an attribute.
// Special cases: "id" delegates to SetId() (panics if the value is not
// a string); "k" is immutable and always panics.
func (ks *KmerSet) SetAttribute(key string, value interface{}) {
	switch key {
	case "id":
		if id, ok := value.(string); ok {
			ks.SetId(id)
		} else {
			panic(fmt.Sprintf("id must be a string, got %T", value))
		}
	case "k":
		panic("k is immutable and cannot be modified via SetAttribute")
	default:
		ks.Metadata[key] = value
	}
}
|
||||
|
||||
// DeleteAttribute removes an attribute from the metadata.
// Deleting a missing key is a no-op.
func (ks *KmerSet) DeleteAttribute(key string) {
	delete(ks.Metadata, key)
}
|
||||
|
||||
// RemoveAttribute removes an attribute (alias of DeleteAttribute).
func (ks *KmerSet) RemoveAttribute(key string) {
	ks.DeleteAttribute(key)
}
|
||||
|
||||
// RenameAttribute renomme un attribut
|
||||
func (ks *KmerSet) RenameAttribute(newName, oldName string) {
|
||||
if value, ok := ks.Metadata[oldName]; ok {
|
||||
ks.Metadata[newName] = value
|
||||
delete(ks.Metadata, oldName)
|
||||
}
|
||||
}
|
||||
|
||||
// GetIntAttribute retrieves an attribute as an int.
// Converts from int, int64, float64 (truncated) and numeric strings.
// Returns (0, false) when the key is missing or not convertible.
func (ks *KmerSet) GetIntAttribute(key string) (int, bool) {
	value, ok := ks.Metadata[key]
	if !ok {
		return 0, false
	}

	switch v := value.(type) {
	case int:
		return v, true
	case int64:
		return int(v), true
	case float64:
		return int(v), true
	case string:
		if i, err := strconv.Atoi(v); err == nil {
			return i, true
		}
	}
	return 0, false
}
|
||||
|
||||
// GetFloatAttribute retrieves an attribute as a float64.
// Converts from float64, float32, int, int64 and numeric strings.
// Returns (0, false) when the key is missing or not convertible.
func (ks *KmerSet) GetFloatAttribute(key string) (float64, bool) {
	value, ok := ks.Metadata[key]
	if !ok {
		return 0, false
	}

	switch v := value.(type) {
	case float64:
		return v, true
	case float32:
		return float64(v), true
	case int:
		return float64(v), true
	case int64:
		return float64(v), true
	case string:
		if f, err := strconv.ParseFloat(v, 64); err == nil {
			return f, true
		}
	}
	return 0, false
}
|
||||
|
||||
// GetNumericAttribute retrieves a numeric attribute (alias of GetFloatAttribute).
func (ks *KmerSet) GetNumericAttribute(key string) (float64, bool) {
	return ks.GetFloatAttribute(key)
}
|
||||
|
||||
// GetStringAttribute retrieves an attribute as a string.
// Non-string values are formatted with fmt ("%v") and still report ok=true;
// only a missing key yields ("", false).
func (ks *KmerSet) GetStringAttribute(key string) (string, bool) {
	value, ok := ks.Metadata[key]
	if !ok {
		return "", false
	}

	switch v := value.(type) {
	case string:
		return v, true
	default:
		return fmt.Sprintf("%v", v), true
	}
}
|
||||
|
||||
// GetBoolAttribute retrieves an attribute as a boolean.
// Converts from bool, int (non-zero is true) and strings accepted by
// strconv.ParseBool. Returns (false, false) when missing or not convertible.
func (ks *KmerSet) GetBoolAttribute(key string) (bool, bool) {
	value, ok := ks.Metadata[key]
	if !ok {
		return false, false
	}

	switch v := value.(type) {
	case bool:
		return v, true
	case int:
		return v != 0, true
	case string:
		if b, err := strconv.ParseBool(v); err == nil {
			return b, true
		}
	}
	return false, false
}
|
||||
|
||||
// AttributeKeys returns the set of metadata attribute keys.
// The special keys "id" and "k" are NOT included, since they are not
// stored in the Metadata map.
func (ks *KmerSet) AttributeKeys() obiutils.Set[string] {
	keys := obiutils.MakeSet[string]()
	for key := range ks.Metadata {
		keys.Add(key)
	}
	return keys
}
|
||||
|
||||
// Keys returns the set of attribute keys (alias of AttributeKeys).
func (ks *KmerSet) Keys() obiutils.Set[string] {
	return ks.AttributeKeys()
}
|
||||
|
||||
// ==================================
|
||||
// KMER SET GROUP ATTRIBUTE API
|
||||
// Métadonnées du groupe + accès via Get() pour les sets individuels
|
||||
// ==================================
|
||||
|
||||
// HasAttribute reports whether the attribute key exists in the group metadata.
func (ksg *KmerSetGroup) HasAttribute(key string) bool {
	_, ok := ksg.Metadata[key]
	return ok
}
|
||||
|
||||
// GetAttribute retrieves the value of a group attribute.
// Special cases: "id" delegates to Id(), "k" delegates to K();
// both always report ok=true.
func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool) {
	switch key {
	case "id":
		return ksg.Id(), true
	case "k":
		return ksg.K(), true
	default:
		value, ok := ksg.Metadata[key]
		return value, ok
	}
}
|
||||
|
||||
// SetAttribute sets the value of a group attribute.
// Special cases: "id" delegates to SetId() (panics if the value is not
// a string); "k" is immutable and always panics.
func (ksg *KmerSetGroup) SetAttribute(key string, value interface{}) {
	switch key {
	case "id":
		if id, ok := value.(string); ok {
			ksg.SetId(id)
		} else {
			panic(fmt.Sprintf("id must be a string, got %T", value))
		}
	case "k":
		panic("k is immutable and cannot be modified via SetAttribute")
	default:
		ksg.Metadata[key] = value
	}
}
|
||||
|
||||
// DeleteAttribute removes a group attribute. Missing keys are a no-op.
func (ksg *KmerSetGroup) DeleteAttribute(key string) {
	delete(ksg.Metadata, key)
}
|
||||
|
||||
// RemoveAttribute removes a group attribute (alias of DeleteAttribute).
func (ksg *KmerSetGroup) RemoveAttribute(key string) {
	ksg.DeleteAttribute(key)
}
|
||||
|
||||
// RenameAttribute renomme un attribut du groupe
|
||||
func (ksg *KmerSetGroup) RenameAttribute(newName, oldName string) {
|
||||
if value, ok := ksg.Metadata[oldName]; ok {
|
||||
ksg.Metadata[newName] = value
|
||||
delete(ksg.Metadata, oldName)
|
||||
}
|
||||
}
|
||||
|
||||
// GetIntAttribute retrieves a group attribute as an int.
// Goes through GetAttribute, so the special keys "id"/"k" are honored
// ("k" converts since K() returns an int).
// Converts from int, int64, float64 (truncated) and numeric strings.
func (ksg *KmerSetGroup) GetIntAttribute(key string) (int, bool) {
	value, ok := ksg.GetAttribute(key)
	if !ok {
		return 0, false
	}

	switch v := value.(type) {
	case int:
		return v, true
	case int64:
		return int(v), true
	case float64:
		return int(v), true
	case string:
		if i, err := strconv.Atoi(v); err == nil {
			return i, true
		}
	}
	return 0, false
}
|
||||
|
||||
// GetFloatAttribute retrieves a group attribute as a float64.
// Goes through GetAttribute, so the special keys "id"/"k" are honored.
// Converts from float64, float32, int, int64 and numeric strings.
func (ksg *KmerSetGroup) GetFloatAttribute(key string) (float64, bool) {
	value, ok := ksg.GetAttribute(key)
	if !ok {
		return 0, false
	}

	switch v := value.(type) {
	case float64:
		return v, true
	case float32:
		return float64(v), true
	case int:
		return float64(v), true
	case int64:
		return float64(v), true
	case string:
		if f, err := strconv.ParseFloat(v, 64); err == nil {
			return f, true
		}
	}
	return 0, false
}
|
||||
|
||||
// GetNumericAttribute retrieves a numeric group attribute
// (alias of GetFloatAttribute).
func (ksg *KmerSetGroup) GetNumericAttribute(key string) (float64, bool) {
	return ksg.GetFloatAttribute(key)
}
|
||||
|
||||
// GetStringAttribute retrieves a group attribute as a string.
// Non-string values are formatted with fmt ("%v") and still report ok=true;
// only a missing key yields ("", false).
func (ksg *KmerSetGroup) GetStringAttribute(key string) (string, bool) {
	value, ok := ksg.GetAttribute(key)
	if !ok {
		return "", false
	}

	switch v := value.(type) {
	case string:
		return v, true
	default:
		return fmt.Sprintf("%v", v), true
	}
}
|
||||
|
||||
// GetBoolAttribute retrieves a group attribute as a boolean.
// Converts from bool, int (non-zero is true) and strings accepted by
// strconv.ParseBool. Returns (false, false) when missing or not convertible.
func (ksg *KmerSetGroup) GetBoolAttribute(key string) (bool, bool) {
	value, ok := ksg.GetAttribute(key)
	if !ok {
		return false, false
	}

	switch v := value.(type) {
	case bool:
		return v, true
	case int:
		return v != 0, true
	case string:
		if b, err := strconv.ParseBool(v); err == nil {
			return b, true
		}
	}
	return false, false
}
|
||||
|
||||
// AttributeKeys returns the set of group metadata attribute keys.
// The special keys "id" and "k" are NOT included, since they are not
// stored in the Metadata map.
func (ksg *KmerSetGroup) AttributeKeys() obiutils.Set[string] {
	keys := obiutils.MakeSet[string]()
	for key := range ksg.Metadata {
		keys.Add(key)
	}
	return keys
}
|
||||
|
||||
// Keys returns the set of group attribute keys (alias of AttributeKeys).
func (ksg *KmerSetGroup) Keys() obiutils.Set[string] {
	return ksg.AttributeKeys()
}
|
||||
|
||||
// ==================================
|
||||
// MÉTHODES POUR ACCÉDER AUX ATTRIBUTS DES SETS INDIVIDUELS VIA Get()
|
||||
// Architecture zero-copy: ksg.Get(i).SetAttribute(...)
|
||||
// ==================================
|
||||
|
||||
// Exemple d'utilisation:
|
||||
// Pour accéder aux métadonnées d'un KmerSet individuel dans un groupe:
|
||||
// ks := ksg.Get(0)
|
||||
// ks.SetAttribute("level", 1)
|
||||
// hasLevel := ks.HasAttribute("level")
|
||||
//
|
||||
// Pour les métadonnées du groupe:
|
||||
// ksg.SetAttribute("name", "FrequencyFilter")
|
||||
// name, ok := ksg.GetStringAttribute("name")
|
||||
|
||||
// AllAttributeKeys returns all unique attribute keys of the group AND all
// its member sets, merged into a single set.
func (ksg *KmerSetGroup) AllAttributeKeys() obiutils.Set[string] {
	keys := obiutils.MakeSet[string]()

	// Collect group-level keys.
	for key := range ksg.Metadata {
		keys.Add(key)
	}

	// Collect keys from every member set.
	for _, ks := range ksg.sets {
		for key := range ks.Metadata {
			keys.Add(key)
		}
	}

	return keys
}
|
||||
361
pkg/obikmer/kmer_set_builder.go
Normal file
361
pkg/obikmer/kmer_set_builder.go
Normal file
@@ -0,0 +1,361 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
// BuilderOption is a functional option for KmerSetGroupBuilder.
type BuilderOption func(*builderConfig)

// builderConfig holds the optional settings applied by BuilderOption
// values in NewKmerSetGroupBuilder.
type builderConfig struct {
	minFreq int // 0 means no frequency filtering (simple dedup)
}
|
||||
|
||||
// WithMinFrequency activates frequency filtering mode.
// Only k-mers seen >= minFreq times are kept in the final index.
// Values <= 0 are equivalent to no filtering (simple dedup).
func WithMinFrequency(minFreq int) BuilderOption {
	return func(c *builderConfig) {
		c.minFreq = minFreq
	}
}
|
||||
|
||||
// KmerSetGroupBuilder constructs a KmerSetGroup on disk.
// During construction, super-kmers are written to temporary .skm files
// partitioned by minimizer. On Close(), each partition is finalized
// (sort, dedup, optional frequency filter) into .kdi files.
// AddSequence/AddSuperKmer are safe for concurrent use thanks to the
// per-writer mutexes; Close must be called exactly once.
type KmerSetGroupBuilder struct {
	dir     string
	k       int // k-mer size
	m       int // minimizer size
	n       int // number of sets
	P       int // number of partitions
	config  builderConfig
	writers [][]*SkmWriter  // [setIndex][partIndex]
	mu      [][]sync.Mutex  // per-writer mutex for concurrent access
	closed  bool            // set by Close to reject double finalization
}
|
||||
|
||||
// NewKmerSetGroupBuilder creates a builder for a new KmerSetGroup.
|
||||
//
|
||||
// Parameters:
|
||||
// - directory: destination directory (created if necessary)
|
||||
// - k: k-mer size (1-31)
|
||||
// - m: minimizer size (-1 for auto = ceil(k/2.5))
|
||||
// - n: number of sets in the group
|
||||
// - P: number of partitions (-1 for auto)
|
||||
// - options: optional builder options (e.g. WithMinFrequency)
|
||||
func NewKmerSetGroupBuilder(directory string, k, m, n, P int,
|
||||
options ...BuilderOption) (*KmerSetGroupBuilder, error) {
|
||||
|
||||
if k < 2 || k > 31 {
|
||||
return nil, fmt.Errorf("obikmer: k must be between 2 and 31, got %d", k)
|
||||
}
|
||||
if n < 1 {
|
||||
return nil, fmt.Errorf("obikmer: n must be >= 1, got %d", n)
|
||||
}
|
||||
|
||||
// Auto minimizer size
|
||||
if m < 0 {
|
||||
m = int(math.Ceil(float64(k) / 2.5))
|
||||
}
|
||||
if m < 1 {
|
||||
m = 1
|
||||
}
|
||||
if m >= k {
|
||||
m = k - 1
|
||||
}
|
||||
|
||||
// Auto partition count
|
||||
if P < 0 {
|
||||
// Use 4^m as the maximum, capped at a reasonable value
|
||||
maxP := 1 << (2 * m) // 4^m
|
||||
P = maxP
|
||||
if P > 4096 {
|
||||
P = 4096
|
||||
}
|
||||
if P < 64 {
|
||||
P = 64
|
||||
}
|
||||
}
|
||||
|
||||
// Apply options
|
||||
var config builderConfig
|
||||
for _, opt := range options {
|
||||
opt(&config)
|
||||
}
|
||||
|
||||
// Create build directory structure
|
||||
buildDir := filepath.Join(directory, ".build")
|
||||
for s := 0; s < n; s++ {
|
||||
setDir := filepath.Join(buildDir, fmt.Sprintf("set_%d", s))
|
||||
if err := os.MkdirAll(setDir, 0755); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: create build dir: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Create SKM writers
|
||||
writers := make([][]*SkmWriter, n)
|
||||
mutexes := make([][]sync.Mutex, n)
|
||||
for s := 0; s < n; s++ {
|
||||
writers[s] = make([]*SkmWriter, P)
|
||||
mutexes[s] = make([]sync.Mutex, P)
|
||||
for p := 0; p < P; p++ {
|
||||
path := filepath.Join(buildDir, fmt.Sprintf("set_%d", s),
|
||||
fmt.Sprintf("part_%04d.skm", p))
|
||||
w, err := NewSkmWriter(path)
|
||||
if err != nil {
|
||||
// Close already-created writers
|
||||
for ss := 0; ss <= s; ss++ {
|
||||
for pp := 0; pp < P; pp++ {
|
||||
if writers[ss][pp] != nil {
|
||||
writers[ss][pp].Close()
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("obikmer: create skm writer: %w", err)
|
||||
}
|
||||
writers[s][p] = w
|
||||
}
|
||||
}
|
||||
|
||||
return &KmerSetGroupBuilder{
|
||||
dir: directory,
|
||||
k: k,
|
||||
m: m,
|
||||
n: n,
|
||||
P: P,
|
||||
config: config,
|
||||
writers: writers,
|
||||
mu: mutexes,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// AddSequence extracts super-kmers from a sequence and writes them
// to the appropriate partition files for the given set.
// Out-of-range setIndex values and sequences shorter than k are
// silently ignored.
// Safe for concurrent use: each (set, partition) writer is guarded
// by its own mutex.
// NOTE(review): the SkmWriter.Write result is discarded here — confirm
// whether write errors should be surfaced (e.g. recorded for Close).
func (b *KmerSetGroupBuilder) AddSequence(setIndex int, seq *obiseq.BioSequence) {
	if setIndex < 0 || setIndex >= b.n {
		return
	}
	rawSeq := seq.Sequence()
	if len(rawSeq) < b.k {
		return
	}
	for sk := range IterSuperKmers(rawSeq, b.k, b.m) {
		// Partition is chosen by the super-kmer's minimizer value.
		part := int(sk.Minimizer % uint64(b.P))
		b.mu[setIndex][part].Lock()
		b.writers[setIndex][part].Write(sk)
		b.mu[setIndex][part].Unlock()
	}
}
|
||||
|
||||
// AddSuperKmer writes a single super-kmer to the appropriate partition
// of the given set. Out-of-range setIndex values are silently ignored.
// Safe for concurrent use (per-writer mutex).
// NOTE(review): the SkmWriter.Write result is discarded — same concern
// as in AddSequence.
func (b *KmerSetGroupBuilder) AddSuperKmer(setIndex int, sk SuperKmer) {
	if setIndex < 0 || setIndex >= b.n {
		return
	}
	part := int(sk.Minimizer % uint64(b.P))
	b.mu[setIndex][part].Lock()
	b.writers[setIndex][part].Write(sk)
	b.mu[setIndex][part].Unlock()
}
|
||||
|
||||
// Close finalizes the construction:
//  1. Flush and close all SKM writers
//  2. For each partition of each set (in parallel):
//     - Load super-kmers from .skm
//     - Extract canonical k-mers
//     - Sort and deduplicate (count if frequency filter)
//     - Write .kdi file
//  3. Write metadata.toml
//  4. Remove .build/ directory
//
// Returns the finalized KmerSetGroup in read-only mode.
// Close must be called at most once; a second call returns an error.
// It must not run concurrently with AddSequence/AddSuperKmer.
// NOTE(review): on a finalization error the .build/ directory is left
// in place — confirm whether callers rely on that for debugging.
func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
	if b.closed {
		return nil, fmt.Errorf("obikmer: builder already closed")
	}
	b.closed = true

	// 1. Close all SKM writers so the .skm files are complete on disk.
	for s := 0; s < b.n; s++ {
		for p := 0; p < b.P; p++ {
			if err := b.writers[s][p].Close(); err != nil {
				return nil, fmt.Errorf("obikmer: close skm writer set=%d part=%d: %w", s, p, err)
			}
		}
	}

	// 2. Create output directory structure (<dir>/set_<s>/).
	for s := 0; s < b.n; s++ {
		setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", s))
		if err := os.MkdirAll(setDir, 0755); err != nil {
			return nil, fmt.Errorf("obikmer: create set dir: %w", err)
		}
	}

	// Per-partition k-mer counts, filled by the workers below.
	// Each worker writes a distinct cell, so no locking is needed.
	counts := make([][]uint64, b.n)
	for s := 0; s < b.n; s++ {
		counts[s] = make([]uint64, b.P)
	}

	nWorkers := runtime.NumCPU()
	if nWorkers > b.P {
		nWorkers = b.P
	}

	type job struct {
		setIdx  int
		partIdx int
	}

	// Buffered so all jobs can be enqueued before workers drain them.
	jobs := make(chan job, b.n*b.P)
	var wg sync.WaitGroup
	var errMu sync.Mutex
	var firstErr error

	for w := 0; w < nWorkers; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := range jobs {
				if err := b.finalizePartition(j.setIdx, j.partIdx, &counts[j.setIdx][j.partIdx]); err != nil {
					// Keep only the first error; later ones are dropped.
					errMu.Lock()
					if firstErr == nil {
						firstErr = err
					}
					errMu.Unlock()
				}
			}
		}()
	}

	for s := 0; s < b.n; s++ {
		for p := 0; p < b.P; p++ {
			jobs <- job{s, p}
		}
	}
	close(jobs)
	wg.Wait()

	if firstErr != nil {
		return nil, firstErr
	}

	// 3. Aggregate per-partition counts and write metadata.
	totalCounts := make([]uint64, b.n)
	for s := 0; s < b.n; s++ {
		for p := 0; p < b.P; p++ {
			totalCounts[s] += counts[s][p]
		}
	}

	setsIDs := make([]string, b.n)

	ksg := &KmerSetGroup{
		path:       b.dir,
		k:          b.k,
		m:          b.m,
		partitions: b.P,
		n:          b.n,
		setsIDs:    setsIDs,
		counts:     totalCounts,
		Metadata:   make(map[string]interface{}),
	}

	if err := ksg.saveMetadata(); err != nil {
		return nil, fmt.Errorf("obikmer: write metadata: %w", err)
	}

	// 4. Remove the temporary .build/ directory (best-effort).
	buildDir := filepath.Join(b.dir, ".build")
	os.RemoveAll(buildDir)

	return ksg, nil
}
|
||||
|
||||
// finalizePartition processes a single partition: load SKM, extract k-mers,
// sort, dedup/count, write KDI.
// The number of k-mers written is stored through count (the caller owns
// a distinct cell per partition, so no synchronization is needed).
// With no frequency filter configured, minFreq is forced to 1, which
// reduces the counting pass to a plain deduplication.
func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint64) error {
	skmPath := filepath.Join(b.dir, ".build",
		fmt.Sprintf("set_%d", setIdx),
		fmt.Sprintf("part_%04d.skm", partIdx))

	kdiPath := filepath.Join(b.dir,
		fmt.Sprintf("set_%d", setIdx),
		fmt.Sprintf("part_%04d.kdi", partIdx))

	// Load super-kmers and extract canonical k-mers.
	reader, err := NewSkmReader(skmPath)
	if err != nil {
		// If file doesn't exist or is empty, write empty KDI.
		// NOTE(review): any reader error is treated as "no data" here —
		// confirm genuine I/O failures should not be surfaced instead.
		return b.writeEmptyKdi(kdiPath, count)
	}

	var kmers []uint64
	for {
		sk, ok := reader.Next()
		if !ok {
			break
		}
		for kmer := range IterCanonicalKmers(sk.Sequence, b.k) {
			kmers = append(kmers, kmer)
		}
	}
	reader.Close()

	if len(kmers) == 0 {
		return b.writeEmptyKdi(kdiPath, count)
	}

	// Sort so identical k-mers become adjacent for the counting pass.
	sort.Slice(kmers, func(i, j int) bool { return kmers[i] < kmers[j] })

	// Write KDI based on mode (dedup vs frequency filter).
	w, err := NewKdiWriter(kdiPath)
	if err != nil {
		return err
	}

	minFreq := b.config.minFreq
	if minFreq <= 0 {
		minFreq = 1 // simple dedup
	}

	// Linear scan: count consecutive identical values and emit each
	// distinct k-mer at most once, only if frequent enough.
	i := 0
	for i < len(kmers) {
		val := kmers[i]
		c := 1
		for i+c < len(kmers) && kmers[i+c] == val {
			c++
		}
		if c >= minFreq {
			if err := w.Write(val); err != nil {
				w.Close()
				return err
			}
		}
		i += c
	}

	*count = w.Count()
	return w.Close()
}
|
||||
|
||||
// writeEmptyKdi creates a valid KDI file containing zero k-mers at path
// and stores 0 through count. Used when a partition received no data.
func (b *KmerSetGroupBuilder) writeEmptyKdi(path string, count *uint64) error {
	w, err := NewKdiWriter(path)
	if err != nil {
		return err
	}
	*count = 0
	return w.Close()
}
|
||||
278
pkg/obikmer/kmer_set_builder_test.go
Normal file
278
pkg/obikmer/kmer_set_builder_test.go
Normal file
@@ -0,0 +1,278 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"testing"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
// TestBuilderBasic builds a single-set group from one sequence and checks
// the stored parameters, the unique-k-mer count against a direct
// computation, and that the group iterator yields the same k-mers in
// strictly increasing order.
func TestBuilderBasic(t *testing.T) {
	dir := t.TempDir()

	builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64)
	if err != nil {
		t.Fatal(err)
	}

	seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "")
	builder.AddSequence(0, seq)

	ksg, err := builder.Close()
	if err != nil {
		t.Fatal(err)
	}

	if ksg.K() != 15 {
		t.Fatalf("K() = %d, want 15", ksg.K())
	}
	if ksg.M() != 7 {
		t.Fatalf("M() = %d, want 7", ksg.M())
	}
	if ksg.Partitions() != 64 {
		t.Fatalf("Partitions() = %d, want 64", ksg.Partitions())
	}
	if ksg.Size() != 1 {
		t.Fatalf("Size() = %d, want 1", ksg.Size())
	}
	if ksg.Len(0) == 0 {
		t.Fatal("Len(0) = 0, expected some k-mers")
	}

	// Verify k-mers match what we'd compute directly.
	var expected []uint64
	for kmer := range IterCanonicalKmers(seq.Sequence(), 15) {
		expected = append(expected, kmer)
	}
	sort.Slice(expected, func(i, j int) bool { return expected[i] < expected[j] })
	// Dedup in place (expected is sorted, so duplicates are adjacent).
	deduped := expected[:0]
	for i, v := range expected {
		if i == 0 || v != expected[i-1] {
			deduped = append(deduped, v)
		}
	}

	if ksg.Len(0) != uint64(len(deduped)) {
		t.Fatalf("Len(0) = %d, expected %d unique k-mers", ksg.Len(0), len(deduped))
	}

	// Check iterator output.
	var fromIter []uint64
	for kmer := range ksg.Iterator(0) {
		fromIter = append(fromIter, kmer)
	}
	// The iterator does a k-way merge so should be sorted and duplicate-free.
	for i := 1; i < len(fromIter); i++ {
		if fromIter[i] <= fromIter[i-1] {
			t.Fatalf("iterator not sorted at %d: %d <= %d", i, fromIter[i], fromIter[i-1])
		}
	}
	if len(fromIter) != len(deduped) {
		t.Fatalf("iterator yielded %d k-mers, expected %d", len(fromIter), len(deduped))
	}
	for i, v := range fromIter {
		if v != deduped[i] {
			t.Fatalf("iterator kmer %d: got %d, want %d", i, v, deduped[i])
		}
	}
}
|
||||
|
||||
// TestBuilderMultipleSequences checks that a single set accumulates
// k-mers from several distinct sequences without error.
func TestBuilderMultipleSequences(t *testing.T) {
	dir := t.TempDir()

	builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64)
	if err != nil {
		t.Fatal(err)
	}

	seqs := []string{
		"ACGTACGTACGTACGTACGTACGTACGT",
		"TTTTTTTTTTTTTTTTTTTTTTTTT",
		"GGGGGGGGGGGGGGGGGGGGGGGG",
	}
	for _, s := range seqs {
		seq := obiseq.NewBioSequence("", []byte(s), "")
		builder.AddSequence(0, seq)
	}

	ksg, err := builder.Close()
	if err != nil {
		t.Fatal(err)
	}

	if ksg.Len(0) == 0 {
		t.Fatal("expected k-mers after multiple sequences")
	}
}
|
||||
|
||||
// TestBuilderFrequencyFilter adds the same sequence three times with
// WithMinFrequency(3): every k-mer reaches the threshold, so the final
// count must equal the number of unique canonical k-mers.
func TestBuilderFrequencyFilter(t *testing.T) {
	dir := t.TempDir()

	builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64,
		WithMinFrequency(3))
	if err != nil {
		t.Fatal(err)
	}

	// Add same sequence 3 times — all k-mers should survive freq=3
	seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "")
	for i := 0; i < 3; i++ {
		builder.AddSequence(0, seq)
	}

	ksg, err := builder.Close()
	if err != nil {
		t.Fatal(err)
	}

	// All k-mers appear exactly 3 times → all should survive.
	// Compute the expected unique-k-mer count directly.
	var expected []uint64
	for kmer := range IterCanonicalKmers(seq.Sequence(), 15) {
		expected = append(expected, kmer)
	}
	sort.Slice(expected, func(i, j int) bool { return expected[i] < expected[j] })
	// Dedup in place (sorted, so duplicates are adjacent).
	deduped := expected[:0]
	for i, v := range expected {
		if i == 0 || v != expected[i-1] {
			deduped = append(deduped, v)
		}
	}

	if ksg.Len(0) != uint64(len(deduped)) {
		t.Fatalf("Len(0) = %d, expected %d (all k-mers at freq=3)", ksg.Len(0), len(deduped))
	}
}
|
||||
|
||||
// TestBuilderFrequencyFilterRejects checks that k-mers below the
// frequency threshold are discarded: freq=2 with minFreq=5 must
// produce an empty set.
func TestBuilderFrequencyFilterRejects(t *testing.T) {
	dir := t.TempDir()

	builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64,
		WithMinFrequency(5))
	if err != nil {
		t.Fatal(err)
	}

	// Use a non-repetitive sequence so each canonical k-mer appears once per pass.
	// Adding it twice gives freq=2 per kmer, which is < minFreq=5 → all rejected.
	seq := obiseq.NewBioSequence("test",
		[]byte("ACGATCGATCTAGCTAGCTGATCGATCGATCG"), "")
	builder.AddSequence(0, seq)
	builder.AddSequence(0, seq)

	ksg, err := builder.Close()
	if err != nil {
		t.Fatal(err)
	}

	if ksg.Len(0) != 0 {
		t.Fatalf("Len(0) = %d, expected 0 (all k-mers at freq=2 < minFreq=5)", ksg.Len(0))
	}
}
|
||||
|
||||
func TestBuilderMultipleSets(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 3, 64)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
seqs := []string{
|
||||
"ACGTACGTACGTACGTACGTACGTACGT",
|
||||
"TTTTTTTTTTTTTTTTTTTTTTTTT",
|
||||
"GGGGGGGGGGGGGGGGGGGGGGGG",
|
||||
}
|
||||
for i, s := range seqs {
|
||||
seq := obiseq.NewBioSequence("", []byte(s), "")
|
||||
builder.AddSequence(i, seq)
|
||||
}
|
||||
|
||||
ksg, err := builder.Close()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if ksg.Size() != 3 {
|
||||
t.Fatalf("Size() = %d, want 3", ksg.Size())
|
||||
}
|
||||
for s := 0; s < 3; s++ {
|
||||
if ksg.Len(s) == 0 {
|
||||
t.Fatalf("Len(%d) = 0, expected some k-mers", s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuilderOpenRoundTrip(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "")
|
||||
builder.AddSequence(0, seq)
|
||||
|
||||
ksg1, err := builder.Close()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Reopen
|
||||
ksg2, err := OpenKmerSetGroup(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if ksg2.K() != ksg1.K() {
|
||||
t.Fatalf("K mismatch: %d vs %d", ksg2.K(), ksg1.K())
|
||||
}
|
||||
if ksg2.M() != ksg1.M() {
|
||||
t.Fatalf("M mismatch: %d vs %d", ksg2.M(), ksg1.M())
|
||||
}
|
||||
if ksg2.Partitions() != ksg1.Partitions() {
|
||||
t.Fatalf("Partitions mismatch: %d vs %d", ksg2.Partitions(), ksg1.Partitions())
|
||||
}
|
||||
if ksg2.Len(0) != ksg1.Len(0) {
|
||||
t.Fatalf("Len mismatch: %d vs %d", ksg2.Len(0), ksg1.Len(0))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuilderAttributes(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "")
|
||||
builder.AddSequence(0, seq)
|
||||
|
||||
ksg, err := builder.Close()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
ksg.SetId("my_index")
|
||||
ksg.SetAttribute("organism", "test")
|
||||
ksg.SaveMetadata()
|
||||
|
||||
// Reopen and check
|
||||
ksg2, err := OpenKmerSetGroup(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if ksg2.Id() != "my_index" {
|
||||
t.Fatalf("Id() = %q, want %q", ksg2.Id(), "my_index")
|
||||
}
|
||||
if !ksg2.HasAttribute("organism") {
|
||||
t.Fatal("expected 'organism' attribute")
|
||||
}
|
||||
v, _ := ksg2.GetAttribute("organism")
|
||||
if v != "test" {
|
||||
t.Fatalf("organism = %v, want 'test'", v)
|
||||
}
|
||||
}
|
||||
580
pkg/obikmer/kmer_set_disk.go
Normal file
580
pkg/obikmer/kmer_set_disk.go
Normal file
@@ -0,0 +1,580 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"iter"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
|
||||
"github.com/pelletier/go-toml/v2"
|
||||
)
|
||||
|
||||
// MetadataFormat represents the metadata serialization format.
// Currently only TOML is actually written for disk-based indices
// (see saveMetadata); the other values are kept for backward
// compatibility with CLI options.
type MetadataFormat int

// Supported metadata formats. FormatTOML is the zero value and the
// only format used by this package today.
const (
	FormatTOML MetadataFormat = iota
	FormatYAML
	FormatJSON
)
||||
|
||||
// String returns the file extension for the format.
|
||||
func (f MetadataFormat) String() string {
|
||||
switch f {
|
||||
case FormatTOML:
|
||||
return "toml"
|
||||
case FormatYAML:
|
||||
return "yaml"
|
||||
case FormatJSON:
|
||||
return "json"
|
||||
default:
|
||||
return "toml"
|
||||
}
|
||||
}
|
||||
|
||||
// KmerSetGroup is a disk-based collection of N k-mer sets sharing the same
// k, m, and partition count P. After construction (via KmerSetGroupBuilder),
// it is immutable and all operations are streaming (partition by partition).
//
// On disk, the layout is <path>/set_<s>/part_<pppp>.kdi plus a
// <path>/metadata.toml descriptor (see diskMetadata).
//
// A KmerSetGroup with Size()==1 is effectively a KmerSet (singleton).
type KmerSetGroup struct {
	path       string                 // root directory of the on-disk index
	id         string                 // user-assigned identifier (persisted via SaveMetadata)
	k          int                    // k-mer size
	m          int                    // minimizer size
	partitions int                    // number of partitions P (KDI files per set)
	n          int                    // number of sets N
	setsIDs    []string               // IDs of individual sets (may be empty strings)
	counts     []uint64               // total k-mer count per set (sum over partitions)
	Metadata   map[string]interface{} // group-level user metadata (see the attribute API)
}
|
||||
|
||||
// diskMetadata is the TOML-serializable structure for metadata.toml.
// It mirrors the fixed fields of KmerSetGroup plus the user metadata
// map; optional fields are omitted when empty so older files with
// missing SetsIDs/Counts can still be opened (see OpenKmerSetGroup).
type diskMetadata struct {
	ID           string                 `toml:"id,omitempty"`
	K            int                    `toml:"k"`
	M            int                    `toml:"m"`
	Partitions   int                    `toml:"partitions"`
	Type         string                 `toml:"type"` // always "KmerSetGroup" when written by this package
	Size         int                    `toml:"size"` // number of sets N
	SetsIDs      []string               `toml:"sets_ids,omitempty"`
	Counts       []uint64               `toml:"counts,omitempty"`
	UserMetadata map[string]interface{} `toml:"user_metadata,omitempty"`
}
|
||||
|
||||
// OpenKmerSetGroup opens a finalized index directory in read-only mode.
// It reads <directory>/metadata.toml and reconstructs the group
// descriptor. Missing optional fields are repaired: absent set IDs
// become empty strings, and absent counts are recomputed by scanning
// every partition's KDI header.
func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
	metaPath := filepath.Join(directory, "metadata.toml")
	f, err := os.Open(metaPath)
	if err != nil {
		return nil, fmt.Errorf("obikmer: open metadata: %w", err)
	}
	defer f.Close()

	var meta diskMetadata
	if err := toml.NewDecoder(f).Decode(&meta); err != nil {
		return nil, fmt.Errorf("obikmer: decode metadata: %w", err)
	}

	ksg := &KmerSetGroup{
		path:       directory,
		id:         meta.ID,
		k:          meta.K,
		m:          meta.M,
		partitions: meta.Partitions,
		n:          meta.Size,
		setsIDs:    meta.SetsIDs,
		counts:     meta.Counts,
		Metadata:   meta.UserMetadata,
	}
	if ksg.Metadata == nil {
		ksg.Metadata = make(map[string]interface{})
	}
	if ksg.setsIDs == nil {
		ksg.setsIDs = make([]string, ksg.n)
	}
	if ksg.counts == nil {
		// Compute counts by scanning partitions. Unreadable partition
		// files are skipped (treated as empty) — best effort.
		ksg.counts = make([]uint64, ksg.n)
		for s := 0; s < ksg.n; s++ {
			for p := 0; p < ksg.partitions; p++ {
				path := ksg.partitionPath(s, p)
				r, err := NewKdiReader(path)
				if err != nil {
					continue
				}
				ksg.counts[s] += r.Count()
				r.Close()
			}
		}
	}

	return ksg, nil
}
|
||||
|
||||
// SaveMetadata writes the metadata.toml file. This is needed after
// modifying attributes or IDs on an already-finalized index: such
// changes live only in memory until persisted by this call.
func (ksg *KmerSetGroup) SaveMetadata() error {
	return ksg.saveMetadata()
}
|
||||
|
||||
// saveMetadata writes the metadata.toml file (internal).
|
||||
func (ksg *KmerSetGroup) saveMetadata() error {
|
||||
meta := diskMetadata{
|
||||
ID: ksg.id,
|
||||
K: ksg.k,
|
||||
M: ksg.m,
|
||||
Partitions: ksg.partitions,
|
||||
Type: "KmerSetGroup",
|
||||
Size: ksg.n,
|
||||
SetsIDs: ksg.setsIDs,
|
||||
Counts: ksg.counts,
|
||||
UserMetadata: ksg.Metadata,
|
||||
}
|
||||
|
||||
metaPath := filepath.Join(ksg.path, "metadata.toml")
|
||||
f, err := os.Create(metaPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return toml.NewEncoder(f).Encode(meta)
|
||||
}
|
||||
|
||||
// partitionPath returns the file path for partition p of set s, i.e.
// <root>/set_<s>/part_<pppp>.kdi with a zero-padded partition index.
func (ksg *KmerSetGroup) partitionPath(setIndex, partIndex int) string {
	return filepath.Join(ksg.path, fmt.Sprintf("set_%d", setIndex),
		fmt.Sprintf("part_%04d.kdi", partIndex))
}
|
||||
|
||||
// Path returns the root directory of the index.
func (ksg *KmerSetGroup) Path() string {
	return ksg.path
}

// K returns the k-mer size.
func (ksg *KmerSetGroup) K() int {
	return ksg.k
}

// M returns the minimizer size.
func (ksg *KmerSetGroup) M() int {
	return ksg.m
}

// Partitions returns the number of partitions P.
func (ksg *KmerSetGroup) Partitions() int {
	return ksg.partitions
}

// Size returns the number of sets N in the group.
func (ksg *KmerSetGroup) Size() int {
	return ksg.n
}

// Id returns the group identifier.
func (ksg *KmerSetGroup) Id() string {
	return ksg.id
}

// SetId sets the group identifier in memory only; call SaveMetadata
// afterwards to persist the change to metadata.toml.
func (ksg *KmerSetGroup) SetId(id string) {
	ksg.id = id
}
|
||||
|
||||
// Len returns the total number of k-mers.
|
||||
// Without argument: total across all sets.
|
||||
// With argument setIndex: count for that specific set.
|
||||
func (ksg *KmerSetGroup) Len(setIndex ...int) uint64 {
|
||||
if len(setIndex) == 0 {
|
||||
var total uint64
|
||||
for _, c := range ksg.counts {
|
||||
total += c
|
||||
}
|
||||
return total
|
||||
}
|
||||
idx := setIndex[0]
|
||||
if idx < 0 || idx >= ksg.n {
|
||||
return 0
|
||||
}
|
||||
return ksg.counts[idx]
|
||||
}
|
||||
|
||||
// Contains checks if a k-mer is present in the specified set.
// The partition holding the k-mer is not known here (it was chosen from
// the minimizer at construction time), so every partition of the set is
// scanned concurrently. Each scan is a linear pass over a sorted KDI
// stream that stops early once the stream's values exceed the target,
// so the average cost is O(total_kmers / P) per partition.
func (ksg *KmerSetGroup) Contains(setIndex int, kmer uint64) bool {
	if setIndex < 0 || setIndex >= ksg.n {
		return false
	}

	// One goroutine per partition; each sends exactly one result, and
	// the buffer makes every send non-blocking.
	type result struct {
		found bool
	}
	ch := make(chan result, ksg.partitions)

	for p := 0; p < ksg.partitions; p++ {
		go func(part int) {
			r, err := NewKdiReader(ksg.partitionPath(setIndex, part))
			if err != nil {
				// An unreadable/missing partition is treated as empty.
				ch <- result{false}
				return
			}
			defer r.Close()
			for {
				v, ok := r.Next()
				if !ok {
					ch <- result{false}
					return
				}
				if v == kmer {
					ch <- result{true}
					return
				}
				if v > kmer {
					// Stream is sorted: the target can no longer appear.
					ch <- result{false}
					return
				}
			}
		}(p)
	}

	for i := 0; i < ksg.partitions; i++ {
		res := <-ch
		if res.found {
			// Drain remaining goroutines asynchronously so they can
			// complete their (buffered) sends and be reclaimed.
			go func() {
				for j := i + 1; j < ksg.partitions; j++ {
					<-ch
				}
			}()
			return true
		}
	}
	return false
}
|
||||
|
||||
// Iterator returns an iterator over all k-mers in the specified set.
// Every non-empty partition stream (each individually sorted) is fed
// through a k-way merge, so the yielded values are merged in sorted
// order across partitions. An out-of-range setIndex yields nothing;
// unreadable partition files are skipped (treated as empty).
func (ksg *KmerSetGroup) Iterator(setIndex int) iter.Seq[uint64] {
	return func(yield func(uint64) bool) {
		if setIndex < 0 || setIndex >= ksg.n {
			return
		}

		// Open a reader per non-empty partition.
		readers := make([]*KdiReader, 0, ksg.partitions)
		for p := 0; p < ksg.partitions; p++ {
			r, err := NewKdiReader(ksg.partitionPath(setIndex, p))
			if err != nil {
				continue
			}
			if r.Count() > 0 {
				readers = append(readers, r)
			} else {
				r.Close()
			}
		}

		if len(readers) == 0 {
			return
		}

		m := NewKWayMerge(readers)
		defer m.Close() // also closes the underlying readers

		for {
			kmer, _, ok := m.Next()
			if !ok {
				return
			}
			if !yield(kmer) {
				return
			}
		}
	}
}
|
||||
|
||||
// ==============================
|
||||
// Attribute API (compatible with old API)
|
||||
// ==============================
|
||||
|
||||
// HasAttribute checks if a user metadata key exists. Note that it only
// consults the Metadata map; the virtual keys "id" and "k" handled by
// GetAttribute are not reported here.
func (ksg *KmerSetGroup) HasAttribute(key string) bool {
	_, ok := ksg.Metadata[key]
	return ok
}
|
||||
|
||||
// GetAttribute returns the value of an attribute. The keys "id" and
// "k" are virtual: they are served from the group's fixed fields
// rather than from the user Metadata map.
func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool) {
	switch key {
	case "id":
		return ksg.Id(), true
	case "k":
		return ksg.K(), true
	default:
		value, ok := ksg.Metadata[key]
		return value, ok
	}
}
|
||||
|
||||
// SetAttribute sets a metadata attribute. The virtual key "id"
// redirects to SetId and requires a string value; "k" is immutable.
// Both misuses panic (programmer error). Changes are in-memory only
// until SaveMetadata is called.
func (ksg *KmerSetGroup) SetAttribute(key string, value interface{}) {
	switch key {
	case "id":
		if id, ok := value.(string); ok {
			ksg.SetId(id)
		} else {
			panic(fmt.Sprintf("id must be a string, got %T", value))
		}
	case "k":
		panic("k is immutable")
	default:
		ksg.Metadata[key] = value
	}
}
|
||||
|
||||
// DeleteAttribute removes a user metadata attribute. Virtual keys such
// as "id" and "k" are not stored in the map and are therefore unaffected.
func (ksg *KmerSetGroup) DeleteAttribute(key string) {
	delete(ksg.Metadata, key)
}
|
||||
|
||||
// GetIntAttribute returns an attribute as int.
|
||||
func (ksg *KmerSetGroup) GetIntAttribute(key string) (int, bool) {
|
||||
v, ok := ksg.GetAttribute(key)
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
switch val := v.(type) {
|
||||
case int:
|
||||
return val, true
|
||||
case int64:
|
||||
return int(val), true
|
||||
case float64:
|
||||
return int(val), true
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// GetStringAttribute returns an attribute as string.
|
||||
func (ksg *KmerSetGroup) GetStringAttribute(key string) (string, bool) {
|
||||
v, ok := ksg.GetAttribute(key)
|
||||
if !ok {
|
||||
return "", false
|
||||
}
|
||||
if s, ok := v.(string); ok {
|
||||
return s, true
|
||||
}
|
||||
return fmt.Sprintf("%v", v), true
|
||||
}
|
||||
|
||||
// ==============================
|
||||
// Jaccard metrics (streaming, disk-based)
|
||||
// ==============================
|
||||
|
||||
// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix
|
||||
// for all sets in the group. Operates partition by partition in streaming.
|
||||
func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix {
|
||||
n := ksg.n
|
||||
labels := make([]string, n)
|
||||
for i := 0; i < n; i++ {
|
||||
if i < len(ksg.setsIDs) && ksg.setsIDs[i] != "" {
|
||||
labels[i] = ksg.setsIDs[i]
|
||||
} else {
|
||||
labels[i] = fmt.Sprintf("set_%d", i)
|
||||
}
|
||||
}
|
||||
|
||||
dm := obidist.NewDistMatrixWithLabels(labels)
|
||||
|
||||
// Accumulate intersection and union counts
|
||||
intersections := make([][]uint64, n)
|
||||
unions := make([][]uint64, n)
|
||||
for i := 0; i < n; i++ {
|
||||
intersections[i] = make([]uint64, n)
|
||||
unions[i] = make([]uint64, n)
|
||||
}
|
||||
|
||||
// Process partition by partition
|
||||
var mu sync.Mutex
|
||||
var wg sync.WaitGroup
|
||||
|
||||
for p := 0; p < ksg.partitions; p++ {
|
||||
wg.Add(1)
|
||||
go func(part int) {
|
||||
defer wg.Done()
|
||||
|
||||
// Open all set readers for this partition
|
||||
readers := make([]*KdiReader, n)
|
||||
for s := 0; s < n; s++ {
|
||||
r, err := NewKdiReader(ksg.partitionPath(s, part))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
readers[s] = r
|
||||
}
|
||||
defer func() {
|
||||
for _, r := range readers {
|
||||
if r != nil {
|
||||
r.Close()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Merge all N readers to count intersections and unions
|
||||
activeReaders := make([]*KdiReader, 0, n)
|
||||
activeIndices := make([]int, 0, n)
|
||||
for i, r := range readers {
|
||||
if r != nil && r.Count() > 0 {
|
||||
activeReaders = append(activeReaders, r)
|
||||
activeIndices = append(activeIndices, i)
|
||||
}
|
||||
}
|
||||
if len(activeReaders) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
merge := NewKWayMerge(activeReaders)
|
||||
// Don't close merge here since readers are managed above
|
||||
// We only want to iterate
|
||||
|
||||
// We need per-set presence tracking, so we use a custom merge
|
||||
// Rebuild with a direct approach
|
||||
merge.Close() // close the merge (which closes readers)
|
||||
|
||||
// Reopen readers for custom merge
|
||||
for s := 0; s < n; s++ {
|
||||
readers[s] = nil
|
||||
r, err := NewKdiReader(ksg.partitionPath(s, part))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if r.Count() > 0 {
|
||||
readers[s] = r
|
||||
} else {
|
||||
r.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// Custom k-way merge that tracks which sets contain each kmer
|
||||
type entry struct {
|
||||
val uint64
|
||||
setIdx int
|
||||
}
|
||||
|
||||
// Use a simpler approach: read all values for this partition into memory
|
||||
// for each set, then do a merge
|
||||
setKmers := make([][]uint64, n)
|
||||
for s := 0; s < n; s++ {
|
||||
if readers[s] == nil {
|
||||
continue
|
||||
}
|
||||
kmers := make([]uint64, 0, readers[s].Count())
|
||||
for {
|
||||
v, ok := readers[s].Next()
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
kmers = append(kmers, v)
|
||||
}
|
||||
setKmers[s] = kmers
|
||||
readers[s].Close()
|
||||
readers[s] = nil
|
||||
}
|
||||
|
||||
// Count pairwise intersections using sorted merge
|
||||
// For each pair (i,j), count kmers present in both
|
||||
localInter := make([][]uint64, n)
|
||||
localUnion := make([][]uint64, n)
|
||||
for i := 0; i < n; i++ {
|
||||
localInter[i] = make([]uint64, n)
|
||||
localUnion[i] = make([]uint64, n)
|
||||
}
|
||||
|
||||
for i := 0; i < n; i++ {
|
||||
localUnion[i][i] = uint64(len(setKmers[i]))
|
||||
for j := i + 1; j < n; j++ {
|
||||
a, b := setKmers[i], setKmers[j]
|
||||
var inter uint64
|
||||
ai, bi := 0, 0
|
||||
for ai < len(a) && bi < len(b) {
|
||||
if a[ai] == b[bi] {
|
||||
inter++
|
||||
ai++
|
||||
bi++
|
||||
} else if a[ai] < b[bi] {
|
||||
ai++
|
||||
} else {
|
||||
bi++
|
||||
}
|
||||
}
|
||||
localInter[i][j] = inter
|
||||
localUnion[i][j] = uint64(len(a)) + uint64(len(b)) - inter
|
||||
}
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
for i := 0; i < n; i++ {
|
||||
for j := i; j < n; j++ {
|
||||
intersections[i][j] += localInter[i][j]
|
||||
unions[i][j] += localUnion[i][j]
|
||||
}
|
||||
}
|
||||
mu.Unlock()
|
||||
}(p)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// Compute distances from accumulated counts
|
||||
for i := 0; i < n-1; i++ {
|
||||
for j := i + 1; j < n; j++ {
|
||||
u := unions[i][j]
|
||||
if u == 0 {
|
||||
dm.Set(i, j, 1.0)
|
||||
} else {
|
||||
dm.Set(i, j, 1.0-float64(intersections[i][j])/float64(u))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return dm
|
||||
}
|
||||
|
||||
// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix.
|
||||
func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
|
||||
n := ksg.n
|
||||
labels := make([]string, n)
|
||||
for i := 0; i < n; i++ {
|
||||
if i < len(ksg.setsIDs) && ksg.setsIDs[i] != "" {
|
||||
labels[i] = ksg.setsIDs[i]
|
||||
} else {
|
||||
labels[i] = fmt.Sprintf("set_%d", i)
|
||||
}
|
||||
}
|
||||
|
||||
// Reuse distance computation
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
sm := obidist.NewSimilarityMatrixWithLabels(labels)
|
||||
|
||||
for i := 0; i < n-1; i++ {
|
||||
for j := i + 1; j < n; j++ {
|
||||
sm.Set(i, j, 1.0-dm.Get(i, j))
|
||||
}
|
||||
}
|
||||
|
||||
return sm
|
||||
}
|
||||
568
pkg/obikmer/kmer_set_disk_ops.go
Normal file
568
pkg/obikmer/kmer_set_disk_ops.go
Normal file
@@ -0,0 +1,568 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Union computes the union of all sets in the group, producing a new
// singleton KmerSetGroup on disk rooted at outputDir. A k-mer is in
// the result if it appears in at least one set (quorum >= 1).
func (ksg *KmerSetGroup) Union(outputDir string) (*KmerSetGroup, error) {
	return ksg.quorumOp(outputDir, 1, ksg.n)
}

// Intersect computes the intersection of all sets, producing a new
// singleton KmerSetGroup on disk rooted at outputDir. A k-mer is in
// the result if it appears in every set (quorum == N).
func (ksg *KmerSetGroup) Intersect(outputDir string) (*KmerSetGroup, error) {
	return ksg.quorumOp(outputDir, ksg.n, ksg.n)
}

// Difference computes set_0 minus the union of all other sets, written
// as a singleton group rooted at outputDir.
func (ksg *KmerSetGroup) Difference(outputDir string) (*KmerSetGroup, error) {
	return ksg.differenceOp(outputDir)
}

// QuorumAtLeast returns the k-mers present in at least q sets.
func (ksg *KmerSetGroup) QuorumAtLeast(q int, outputDir string) (*KmerSetGroup, error) {
	return ksg.quorumOp(outputDir, q, ksg.n)
}

// QuorumExactly returns the k-mers present in exactly q sets.
func (ksg *KmerSetGroup) QuorumExactly(q int, outputDir string) (*KmerSetGroup, error) {
	return ksg.quorumOp(outputDir, q, q)
}

// QuorumAtMost returns the k-mers present in at most q sets.
// (K-mers absent from every set are not representable, so the
// effective quorum range is [1, q].)
func (ksg *KmerSetGroup) QuorumAtMost(q int, outputDir string) (*KmerSetGroup, error) {
	return ksg.quorumOp(outputDir, 1, q)
}
|
||||
|
||||
// UnionWith merges this group with another, producing a new KmerSetGroup
// at outputDir whose set_i is the union of this.set_i and other.set_i.
// Both groups must have the same k, m, P, and N (see checkCompatible).
func (ksg *KmerSetGroup) UnionWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error) {
	if err := ksg.checkCompatible(other); err != nil {
		return nil, err
	}
	return ksg.pairwiseOp(other, outputDir, mergeUnion)
}

// IntersectWith merges this group with another, producing a new KmerSetGroup
// at outputDir whose set_i is the intersection of this.set_i and other.set_i.
// Both groups must have the same k, m, P, and N (see checkCompatible).
func (ksg *KmerSetGroup) IntersectWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error) {
	if err := ksg.checkCompatible(other); err != nil {
		return nil, err
	}
	return ksg.pairwiseOp(other, outputDir, mergeIntersect)
}
|
||||
|
||||
// ==============================
|
||||
// Internal implementation
|
||||
// ==============================
|
||||
|
||||
func (ksg *KmerSetGroup) checkCompatible(other *KmerSetGroup) error {
|
||||
if ksg.k != other.k {
|
||||
return fmt.Errorf("obikmer: incompatible k: %d vs %d", ksg.k, other.k)
|
||||
}
|
||||
if ksg.m != other.m {
|
||||
return fmt.Errorf("obikmer: incompatible m: %d vs %d", ksg.m, other.m)
|
||||
}
|
||||
if ksg.partitions != other.partitions {
|
||||
return fmt.Errorf("obikmer: incompatible partitions: %d vs %d", ksg.partitions, other.partitions)
|
||||
}
|
||||
if ksg.n != other.n {
|
||||
return fmt.Errorf("obikmer: incompatible size: %d vs %d", ksg.n, other.n)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// quorumOp processes all N sets partition by partition.
// For each partition, it opens N KdiReaders and does a k-way merge;
// a k-mer is written to the result when minQ <= count <= maxQ, where
// count is the number of input sets containing it. The result is a
// brand-new singleton group (n == 1) rooted at outputDir, with its
// metadata.toml written before returning.
func (ksg *KmerSetGroup) quorumOp(outputDir string, minQ, maxQ int) (*KmerSetGroup, error) {
	// Clamp the quorum bounds to the meaningful range [1, n].
	if minQ < 1 {
		minQ = 1
	}
	if maxQ > ksg.n {
		maxQ = ksg.n
	}

	// Create output structure: the result is a single set, set_0.
	setDir := filepath.Join(outputDir, "set_0")
	if err := os.MkdirAll(setDir, 0755); err != nil {
		return nil, err
	}

	// counts[p] receives the number of k-mers kept in partition p.
	// Each worker writes a distinct index, so no locking is needed.
	counts := make([]uint64, ksg.partitions)

	nWorkers := runtime.NumCPU()
	if nWorkers > ksg.partitions {
		nWorkers = ksg.partitions
	}

	// Buffered to hold every partition index, so the producer loop
	// below never blocks even if workers exit early on error.
	jobs := make(chan int, ksg.partitions)
	var wg sync.WaitGroup
	var errMu sync.Mutex
	var firstErr error // first worker error observed; later ones are dropped

	for w := 0; w < nWorkers; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for p := range jobs {
				c, err := ksg.quorumPartition(p, setDir, minQ, maxQ)
				if err != nil {
					errMu.Lock()
					if firstErr == nil {
						firstErr = err
					}
					errMu.Unlock()
					return
				}
				counts[p] = c
			}
		}()
	}

	for p := 0; p < ksg.partitions; p++ {
		jobs <- p
	}
	close(jobs)
	wg.Wait()

	if firstErr != nil {
		return nil, firstErr
	}

	var totalCount uint64
	for _, c := range counts {
		totalCount += c
	}

	result := &KmerSetGroup{
		path:       outputDir,
		k:          ksg.k,
		m:          ksg.m,
		partitions: ksg.partitions,
		n:          1,
		setsIDs:    []string{""},
		counts:     []uint64{totalCount},
		Metadata:   make(map[string]interface{}),
	}

	// Persist metadata.toml so the result can be reopened later.
	if err := result.saveMetadata(); err != nil {
		return nil, err
	}

	return result, nil
}
|
||||
|
||||
// quorumPartition processes a single partition for quorum filtering.
// It k-way merges partition partIdx of every set and writes to
// outSetDir the k-mers whose multiplicity (number of sets containing
// them) lies in [minQ, maxQ]. Returns the number of k-mers written.
func (ksg *KmerSetGroup) quorumPartition(partIdx int, outSetDir string, minQ, maxQ int) (uint64, error) {
	// Open readers for all sets; empty partitions are dropped immediately.
	readers := make([]*KdiReader, 0, ksg.n)
	for s := 0; s < ksg.n; s++ {
		r, err := NewKdiReader(ksg.partitionPath(s, partIdx))
		if err != nil {
			// Close already-opened readers before bailing out.
			for _, rr := range readers {
				rr.Close()
			}
			return 0, err
		}
		if r.Count() > 0 {
			readers = append(readers, r)
		} else {
			r.Close()
		}
	}

	outPath := filepath.Join(outSetDir, fmt.Sprintf("part_%04d.kdi", partIdx))

	if len(readers) == 0 {
		// All inputs empty: still emit an empty KDI file so the output
		// directory layout stays complete.
		w, err := NewKdiWriter(outPath)
		if err != nil {
			return 0, err
		}
		return 0, w.Close()
	}

	merge := NewKWayMerge(readers)
	// merge.Close() will close readers

	w, err := NewKdiWriter(outPath)
	if err != nil {
		merge.Close()
		return 0, err
	}

	for {
		// count is the number of input streams containing kmer.
		kmer, count, ok := merge.Next()
		if !ok {
			break
		}
		if count >= minQ && count <= maxQ {
			if err := w.Write(kmer); err != nil {
				merge.Close()
				w.Close()
				return 0, err
			}
		}
	}

	merge.Close()
	cnt := w.Count()
	return cnt, w.Close()
}
|
||||
|
||||
// differenceOp computes set_0 minus the union of all other sets,
// producing a new singleton group rooted at outputDir. Partitions are
// processed in parallel by a bounded worker pool; the per-partition
// work is delegated to differencePartition.
func (ksg *KmerSetGroup) differenceOp(outputDir string) (*KmerSetGroup, error) {
	if ksg.n < 1 {
		return nil, fmt.Errorf("obikmer: difference requires at least 1 set")
	}

	// Create output structure: the result is a single set, set_0.
	setDir := filepath.Join(outputDir, "set_0")
	if err := os.MkdirAll(setDir, 0755); err != nil {
		return nil, err
	}

	// counts[p] receives the number of k-mers kept in partition p;
	// each worker writes a distinct index, so no locking is needed.
	counts := make([]uint64, ksg.partitions)

	nWorkers := runtime.NumCPU()
	if nWorkers > ksg.partitions {
		nWorkers = ksg.partitions
	}

	// Buffered to hold every partition index so the producer never
	// blocks, even if workers exit early on error.
	jobs := make(chan int, ksg.partitions)
	var wg sync.WaitGroup
	var errMu sync.Mutex
	var firstErr error // first worker error observed; later ones are dropped

	for w := 0; w < nWorkers; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for p := range jobs {
				c, err := ksg.differencePartition(p, setDir)
				if err != nil {
					errMu.Lock()
					if firstErr == nil {
						firstErr = err
					}
					errMu.Unlock()
					return
				}
				counts[p] = c
			}
		}()
	}

	for p := 0; p < ksg.partitions; p++ {
		jobs <- p
	}
	close(jobs)
	wg.Wait()

	if firstErr != nil {
		return nil, firstErr
	}

	var totalCount uint64
	for _, c := range counts {
		totalCount += c
	}

	result := &KmerSetGroup{
		path:       outputDir,
		k:          ksg.k,
		m:          ksg.m,
		partitions: ksg.partitions,
		n:          1,
		setsIDs:    []string{""},
		counts:     []uint64{totalCount},
		Metadata:   make(map[string]interface{}),
	}

	// Persist metadata.toml so the result can be reopened later.
	if err := result.saveMetadata(); err != nil {
		return nil, err
	}

	return result, nil
}
|
||||
|
||||
// differencePartition computes set_0 - union(set_1..set_{n-1}) for one
// partition and writes the surviving k-mers to outSetDir. Both input
// streams are sorted, so the subtraction is a single linear
// two-pointer pass. Returns the number of k-mers written.
func (ksg *KmerSetGroup) differencePartition(partIdx int, outSetDir string) (uint64, error) {
	outPath := filepath.Join(outSetDir, fmt.Sprintf("part_%04d.kdi", partIdx))

	// Open set_0 reader
	r0, err := NewKdiReader(ksg.partitionPath(0, partIdx))
	if err != nil {
		return 0, err
	}

	if r0.Count() == 0 {
		// Nothing to subtract from: emit an empty KDI file so the
		// output directory layout stays complete.
		r0.Close()
		w, err := NewKdiWriter(outPath)
		if err != nil {
			return 0, err
		}
		return 0, w.Close()
	}

	// Open readers for the other sets; empty ones are dropped right away.
	var otherReaders []*KdiReader
	for s := 1; s < ksg.n; s++ {
		r, err := NewKdiReader(ksg.partitionPath(s, partIdx))
		if err != nil {
			r0.Close()
			for _, rr := range otherReaders {
				rr.Close()
			}
			return 0, err
		}
		if r.Count() > 0 {
			otherReaders = append(otherReaders, r)
		} else {
			r.Close()
		}
	}

	w, err := NewKdiWriter(outPath)
	if err != nil {
		r0.Close()
		for _, rr := range otherReaders {
			rr.Close()
		}
		return 0, err
	}

	if len(otherReaders) == 0 {
		// No other sets — the difference is just a copy of set_0.
		for {
			v, ok := r0.Next()
			if !ok {
				break
			}
			if err := w.Write(v); err != nil {
				r0.Close()
				w.Close()
				return 0, err
			}
		}
		r0.Close()
		cnt := w.Count()
		return cnt, w.Close()
	}

	// Merge the other sets into a single sorted "subtraction" stream.
	otherMerge := NewKWayMerge(otherReaders)

	// Streaming difference: advance both sorted streams in lockstep.
	v0, ok0 := r0.Next()
	vo, _, oko := otherMerge.Next()

	for ok0 {
		if !oko || v0 < vo {
			// v0 is absent from the other sets → keep it.
			if err := w.Write(v0); err != nil {
				r0.Close()
				otherMerge.Close()
				w.Close()
				return 0, err
			}
			v0, ok0 = r0.Next()
		} else if v0 == vo {
			// v0 is present in another set → drop it, advance both streams.
			v0, ok0 = r0.Next()
			vo, _, oko = otherMerge.Next()
		} else {
			// vo < v0 → the subtraction stream is behind; advance it.
			vo, _, oko = otherMerge.Next()
		}
	}

	r0.Close()
	otherMerge.Close()
	cnt := w.Count()
	return cnt, w.Close()
}
|
||||
|
||||
// mergeMode defines how to combine two sorted k-mer streams during
// pairwise operations (see pairwiseOp).
type mergeMode int

const (
	mergeUnion     mergeMode = iota // emit a k-mer present in either input
	mergeIntersect                  // emit a k-mer present in both inputs
)
|
||||
|
||||
// pairwiseOp applies a merge operation between corresponding sets of two groups.
|
||||
func (ksg *KmerSetGroup) pairwiseOp(other *KmerSetGroup, outputDir string, mode mergeMode) (*KmerSetGroup, error) {
|
||||
for s := 0; s < ksg.n; s++ {
|
||||
setDir := filepath.Join(outputDir, fmt.Sprintf("set_%d", s))
|
||||
if err := os.MkdirAll(setDir, 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
counts := make([][]uint64, ksg.n)
|
||||
for s := 0; s < ksg.n; s++ {
|
||||
counts[s] = make([]uint64, ksg.partitions)
|
||||
}
|
||||
|
||||
nWorkers := runtime.NumCPU()
|
||||
if nWorkers > ksg.partitions {
|
||||
nWorkers = ksg.partitions
|
||||
}
|
||||
|
||||
type job struct {
|
||||
setIdx int
|
||||
partIdx int
|
||||
}
|
||||
jobs := make(chan job, ksg.n*ksg.partitions)
|
||||
var wg sync.WaitGroup
|
||||
var errMu sync.Mutex
|
||||
var firstErr error
|
||||
|
||||
for w := 0; w < nWorkers; w++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for j := range jobs {
|
||||
c, err := pairwiseMergePartition(
|
||||
ksg.partitionPath(j.setIdx, j.partIdx),
|
||||
other.partitionPath(j.setIdx, j.partIdx),
|
||||
filepath.Join(outputDir, fmt.Sprintf("set_%d", j.setIdx),
|
||||
fmt.Sprintf("part_%04d.kdi", j.partIdx)),
|
||||
mode,
|
||||
)
|
||||
if err != nil {
|
||||
errMu.Lock()
|
||||
if firstErr == nil {
|
||||
firstErr = err
|
||||
}
|
||||
errMu.Unlock()
|
||||
return
|
||||
}
|
||||
counts[j.setIdx][j.partIdx] = c
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
for s := 0; s < ksg.n; s++ {
|
||||
for p := 0; p < ksg.partitions; p++ {
|
||||
jobs <- job{s, p}
|
||||
}
|
||||
}
|
||||
close(jobs)
|
||||
wg.Wait()
|
||||
|
||||
if firstErr != nil {
|
||||
return nil, firstErr
|
||||
}
|
||||
|
||||
totalCounts := make([]uint64, ksg.n)
|
||||
setsIDs := make([]string, ksg.n)
|
||||
for s := 0; s < ksg.n; s++ {
|
||||
for p := 0; p < ksg.partitions; p++ {
|
||||
totalCounts[s] += counts[s][p]
|
||||
}
|
||||
}
|
||||
|
||||
result := &KmerSetGroup{
|
||||
path: outputDir,
|
||||
k: ksg.k,
|
||||
m: ksg.m,
|
||||
partitions: ksg.partitions,
|
||||
n: ksg.n,
|
||||
setsIDs: setsIDs,
|
||||
counts: totalCounts,
|
||||
Metadata: make(map[string]interface{}),
|
||||
}
|
||||
|
||||
if err := result.saveMetadata(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// pairwiseMergePartition merges two KDI files (sorted streams) with the given mode.
|
||||
func pairwiseMergePartition(pathA, pathB, outPath string, mode mergeMode) (uint64, error) {
|
||||
rA, err := NewKdiReader(pathA)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
rB, err := NewKdiReader(pathB)
|
||||
if err != nil {
|
||||
rA.Close()
|
||||
return 0, err
|
||||
}
|
||||
|
||||
w, err := NewKdiWriter(outPath)
|
||||
if err != nil {
|
||||
rA.Close()
|
||||
rB.Close()
|
||||
return 0, err
|
||||
}
|
||||
|
||||
cnt, mergeErr := doPairwiseMerge(rA, rB, w, mode)
|
||||
rA.Close()
|
||||
rB.Close()
|
||||
closeErr := w.Close()
|
||||
if mergeErr != nil {
|
||||
return 0, mergeErr
|
||||
}
|
||||
return cnt, closeErr
|
||||
}
|
||||
|
||||
func doPairwiseMerge(rA, rB *KdiReader, w *KdiWriter, mode mergeMode) (uint64, error) {
|
||||
vA, okA := rA.Next()
|
||||
vB, okB := rB.Next()
|
||||
|
||||
for okA && okB {
|
||||
if vA == vB {
|
||||
if err := w.Write(vA); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
vA, okA = rA.Next()
|
||||
vB, okB = rB.Next()
|
||||
} else if vA < vB {
|
||||
if mode == mergeUnion {
|
||||
if err := w.Write(vA); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
vA, okA = rA.Next()
|
||||
} else {
|
||||
if mode == mergeUnion {
|
||||
if err := w.Write(vB); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
vB, okB = rB.Next()
|
||||
}
|
||||
}
|
||||
|
||||
if mode == mergeUnion {
|
||||
for okA {
|
||||
if err := w.Write(vA); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
vA, okA = rA.Next()
|
||||
}
|
||||
for okB {
|
||||
if err := w.Write(vB); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
vB, okB = rB.Next()
|
||||
}
|
||||
}
|
||||
|
||||
return w.Count(), nil
|
||||
}
|
||||
251
pkg/obikmer/kmer_set_disk_ops_test.go
Normal file
251
pkg/obikmer/kmer_set_disk_ops_test.go
Normal file
@@ -0,0 +1,251 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
// buildGroupFromSeqs creates a KmerSetGroup with one set per sequence.
|
||||
func buildGroupFromSeqs(t *testing.T, dir string, k, m int, seqs []string) *KmerSetGroup {
|
||||
t.Helper()
|
||||
n := len(seqs)
|
||||
builder, err := NewKmerSetGroupBuilder(dir, k, m, n, 64)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for i, s := range seqs {
|
||||
seq := obiseq.NewBioSequence("", []byte(s), "")
|
||||
builder.AddSequence(i, seq)
|
||||
}
|
||||
ksg, err := builder.Close()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
return ksg
|
||||
}
|
||||
|
||||
func collectKmers(t *testing.T, ksg *KmerSetGroup, setIdx int) []uint64 {
|
||||
t.Helper()
|
||||
var result []uint64
|
||||
for kmer := range ksg.Iterator(setIdx) {
|
||||
result = append(result, kmer)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func TestDiskOpsUnion(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
indexDir := filepath.Join(dir, "index")
|
||||
outDir := filepath.Join(dir, "union")
|
||||
|
||||
// Two sequences with some overlap
|
||||
seqs := []string{
|
||||
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
|
||||
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
|
||||
}
|
||||
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
|
||||
|
||||
result, err := ksg.Union(outDir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Union should have at least as many k-mers as each individual set
|
||||
unionLen := result.Len(0)
|
||||
if unionLen == 0 {
|
||||
t.Fatal("union is empty")
|
||||
}
|
||||
if unionLen < ksg.Len(0) || unionLen < ksg.Len(1) {
|
||||
t.Fatalf("union (%d) smaller than an input set (%d, %d)", unionLen, ksg.Len(0), ksg.Len(1))
|
||||
}
|
||||
|
||||
// Union should not exceed the sum of both sets
|
||||
if unionLen > ksg.Len(0)+ksg.Len(1) {
|
||||
t.Fatalf("union (%d) larger than sum of sets (%d)", unionLen, ksg.Len(0)+ksg.Len(1))
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiskOpsIntersect(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
indexDir := filepath.Join(dir, "index")
|
||||
outDir := filepath.Join(dir, "intersect")
|
||||
|
||||
// Two sequences with some shared k-mers
|
||||
seqs := []string{
|
||||
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
|
||||
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
|
||||
}
|
||||
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
|
||||
|
||||
result, err := ksg.Intersect(outDir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
interLen := result.Len(0)
|
||||
// Intersection should not be bigger than any individual set
|
||||
if interLen > ksg.Len(0) || interLen > ksg.Len(1) {
|
||||
t.Fatalf("intersection (%d) larger than input sets (%d, %d)", interLen, ksg.Len(0), ksg.Len(1))
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiskOpsDifference(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
indexDir := filepath.Join(dir, "index")
|
||||
outDir := filepath.Join(dir, "diff")
|
||||
|
||||
seqs := []string{
|
||||
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
|
||||
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
|
||||
}
|
||||
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
|
||||
|
||||
result, err := ksg.Difference(outDir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
diffLen := result.Len(0)
|
||||
// Difference = set_0 - set_1, so should be <= set_0
|
||||
if diffLen > ksg.Len(0) {
|
||||
t.Fatalf("difference (%d) larger than set_0 (%d)", diffLen, ksg.Len(0))
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiskOpsConsistency(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
indexDir := filepath.Join(dir, "index")
|
||||
|
||||
seqs := []string{
|
||||
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
|
||||
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
|
||||
}
|
||||
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
|
||||
|
||||
unionResult, err := ksg.Union(filepath.Join(dir, "union"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
interResult, err := ksg.Intersect(filepath.Join(dir, "intersect"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
diffResult, err := ksg.Difference(filepath.Join(dir, "diff"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
unionLen := unionResult.Len(0)
|
||||
interLen := interResult.Len(0)
|
||||
diffLen := diffResult.Len(0)
|
||||
|
||||
// |A ∪ B| = |A| + |B| - |A ∩ B|
|
||||
expectedUnion := ksg.Len(0) + ksg.Len(1) - interLen
|
||||
if unionLen != expectedUnion {
|
||||
t.Fatalf("|A∪B|=%d, expected |A|+|B|-|A∩B|=%d+%d-%d=%d",
|
||||
unionLen, ksg.Len(0), ksg.Len(1), interLen, expectedUnion)
|
||||
}
|
||||
|
||||
// |A \ B| = |A| - |A ∩ B|
|
||||
expectedDiff := ksg.Len(0) - interLen
|
||||
if diffLen != expectedDiff {
|
||||
t.Fatalf("|A\\B|=%d, expected |A|-|A∩B|=%d-%d=%d",
|
||||
diffLen, ksg.Len(0), interLen, expectedDiff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiskOpsQuorum(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
indexDir := filepath.Join(dir, "index")
|
||||
|
||||
// Three sets
|
||||
seqs := []string{
|
||||
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
|
||||
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
|
||||
"GATCGATCGATCGAAATTTCCCGGG",
|
||||
}
|
||||
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
|
||||
|
||||
// QuorumAtLeast(1) = Union
|
||||
q1, err := ksg.QuorumAtLeast(1, filepath.Join(dir, "q1"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
union, err := ksg.Union(filepath.Join(dir, "union"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if q1.Len(0) != union.Len(0) {
|
||||
t.Fatalf("QuorumAtLeast(1)=%d != Union=%d", q1.Len(0), union.Len(0))
|
||||
}
|
||||
|
||||
// QuorumAtLeast(3) = Intersect
|
||||
q3, err := ksg.QuorumAtLeast(3, filepath.Join(dir, "q3"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
inter, err := ksg.Intersect(filepath.Join(dir, "inter"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if q3.Len(0) != inter.Len(0) {
|
||||
t.Fatalf("QuorumAtLeast(3)=%d != Intersect=%d", q3.Len(0), inter.Len(0))
|
||||
}
|
||||
|
||||
// QuorumAtLeast(2) should be between Intersect and Union
|
||||
q2, err := ksg.QuorumAtLeast(2, filepath.Join(dir, "q2"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if q2.Len(0) < q3.Len(0) || q2.Len(0) > q1.Len(0) {
|
||||
t.Fatalf("QuorumAtLeast(2)=%d not between intersect=%d and union=%d",
|
||||
q2.Len(0), q3.Len(0), q1.Len(0))
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiskOpsJaccard(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
indexDir := filepath.Join(dir, "index")
|
||||
|
||||
seqs := []string{
|
||||
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
|
||||
"ACGATCGATCTAGCTAGCTGATCGATCGATCG", // identical to first
|
||||
"TTTTTTTTTTTTTTTTTTTTTTTTT", // completely different
|
||||
}
|
||||
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
if dm == nil {
|
||||
t.Fatal("JaccardDistanceMatrix returned nil")
|
||||
}
|
||||
|
||||
// Identical sets should have distance 0
|
||||
d01 := dm.Get(0, 1)
|
||||
if d01 != 0.0 {
|
||||
t.Fatalf("distance(0,1) = %f, expected 0.0 for identical sets", d01)
|
||||
}
|
||||
|
||||
// Completely different sets should have distance 1.0
|
||||
d02 := dm.Get(0, 2)
|
||||
if d02 != 1.0 {
|
||||
t.Fatalf("distance(0,2) = %f, expected 1.0 for disjoint sets", d02)
|
||||
}
|
||||
|
||||
// Similarity matrix
|
||||
sm := ksg.JaccardSimilarityMatrix()
|
||||
if sm == nil {
|
||||
t.Fatal("JaccardSimilarityMatrix returned nil")
|
||||
}
|
||||
|
||||
s01 := sm.Get(0, 1)
|
||||
if s01 != 1.0 {
|
||||
t.Fatalf("similarity(0,1) = %f, expected 1.0 for identical sets", s01)
|
||||
}
|
||||
|
||||
s02 := sm.Get(0, 2)
|
||||
if s02 != 0.0 {
|
||||
t.Fatalf("similarity(0,2) = %f, expected 0.0 for disjoint sets", s02)
|
||||
}
|
||||
}
|
||||
@@ -1,347 +0,0 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
// KmerSetGroup represents a vector of KmerSet
|
||||
// Used to manage multiple k-mer sets (for example, by frequency level)
|
||||
type KmerSetGroup struct {
|
||||
id string // Unique identifier of the KmerSetGroup
|
||||
k int // Size of k-mers (immutable)
|
||||
sets []*KmerSet // Vector of KmerSet
|
||||
Metadata map[string]interface{} // Group metadata (not individual sets)
|
||||
}
|
||||
|
||||
// NewKmerSetGroup creates a new group of n KmerSets
|
||||
func NewKmerSetGroup(k int, n int) *KmerSetGroup {
|
||||
if n < 1 {
|
||||
panic("KmerSetGroup size must be >= 1")
|
||||
}
|
||||
|
||||
sets := make([]*KmerSet, n)
|
||||
for i := range sets {
|
||||
sets[i] = NewKmerSet(k)
|
||||
}
|
||||
|
||||
return &KmerSetGroup{
|
||||
k: k,
|
||||
sets: sets,
|
||||
Metadata: make(map[string]interface{}),
|
||||
}
|
||||
}
|
||||
|
||||
// K returns the size of k-mers (immutable)
|
||||
func (ksg *KmerSetGroup) K() int {
|
||||
return ksg.k
|
||||
}
|
||||
|
||||
// Size returns the number of KmerSet in the group
|
||||
func (ksg *KmerSetGroup) Size() int {
|
||||
return len(ksg.sets)
|
||||
}
|
||||
|
||||
// Get returns the KmerSet at the given index
|
||||
// Returns nil if the index is invalid
|
||||
func (ksg *KmerSetGroup) Get(index int) *KmerSet {
|
||||
if index < 0 || index >= len(ksg.sets) {
|
||||
return nil
|
||||
}
|
||||
return ksg.sets[index]
|
||||
}
|
||||
|
||||
// Set replaces the KmerSet at the given index
|
||||
// Panics if the index is invalid or if k does not match
|
||||
func (ksg *KmerSetGroup) Set(index int, ks *KmerSet) {
|
||||
if index < 0 || index >= len(ksg.sets) {
|
||||
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
||||
}
|
||||
if ks.k != ksg.k {
|
||||
panic(fmt.Sprintf("KmerSet k mismatch: expected %d, got %d", ksg.k, ks.k))
|
||||
}
|
||||
ksg.sets[index] = ks
|
||||
}
|
||||
|
||||
// Len returns the number of k-mers in a specific KmerSet
|
||||
// Without argument: returns the number of k-mers in the last KmerSet
|
||||
// With argument index: returns the number of k-mers in the KmerSet at this index
|
||||
func (ksg *KmerSetGroup) Len(index ...int) uint64 {
|
||||
if len(index) == 0 {
|
||||
// Without argument: last KmerSet
|
||||
return ksg.sets[len(ksg.sets)-1].Len()
|
||||
}
|
||||
|
||||
// With argument: specific KmerSet
|
||||
idx := index[0]
|
||||
if idx < 0 || idx >= len(ksg.sets) {
|
||||
return 0
|
||||
}
|
||||
return ksg.sets[idx].Len()
|
||||
}
|
||||
|
||||
// MemoryUsage returns the total memory usage in bytes
|
||||
func (ksg *KmerSetGroup) MemoryUsage() uint64 {
|
||||
total := uint64(0)
|
||||
for _, ks := range ksg.sets {
|
||||
total += ks.MemoryUsage()
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
// Clear empties all KmerSet in the group
|
||||
func (ksg *KmerSetGroup) Clear() {
|
||||
for _, ks := range ksg.sets {
|
||||
ks.Clear()
|
||||
}
|
||||
}
|
||||
|
||||
// Copy creates a complete copy of the group (consistent with BioSequence.Copy)
|
||||
func (ksg *KmerSetGroup) Copy() *KmerSetGroup {
|
||||
copiedSets := make([]*KmerSet, len(ksg.sets))
|
||||
for i, ks := range ksg.sets {
|
||||
copiedSets[i] = ks.Copy() // Copy each KmerSet with its metadata
|
||||
}
|
||||
|
||||
// Copy group metadata
|
||||
groupMetadata := make(map[string]interface{}, len(ksg.Metadata))
|
||||
for k, v := range ksg.Metadata {
|
||||
groupMetadata[k] = v
|
||||
}
|
||||
|
||||
return &KmerSetGroup{
|
||||
id: ksg.id,
|
||||
k: ksg.k,
|
||||
sets: copiedSets,
|
||||
Metadata: groupMetadata,
|
||||
}
|
||||
}
|
||||
|
||||
// Id returns the identifier of the KmerSetGroup (consistent with BioSequence.Id)
|
||||
func (ksg *KmerSetGroup) Id() string {
|
||||
return ksg.id
|
||||
}
|
||||
|
||||
// SetId sets the identifier of the KmerSetGroup (consistent with BioSequence.SetId)
|
||||
func (ksg *KmerSetGroup) SetId(id string) {
|
||||
ksg.id = id
|
||||
}
|
||||
|
||||
// AddSequence adds all k-mers from a sequence to a specific KmerSet
|
||||
func (ksg *KmerSetGroup) AddSequence(seq *obiseq.BioSequence, index int) {
|
||||
if index < 0 || index >= len(ksg.sets) {
|
||||
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
||||
}
|
||||
ksg.sets[index].AddSequence(seq)
|
||||
}
|
||||
|
||||
// AddSequences adds all k-mers from multiple sequences to a specific KmerSet
|
||||
func (ksg *KmerSetGroup) AddSequences(sequences *obiseq.BioSequenceSlice, index int) {
|
||||
if index < 0 || index >= len(ksg.sets) {
|
||||
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
||||
}
|
||||
ksg.sets[index].AddSequences(sequences)
|
||||
}
|
||||
|
||||
// AddSequenceSlice adds all k-mers from a slice of sequences to a specific KmerSet
|
||||
func (ksg *KmerSetGroup) AddSequenceSlice(sequences *obiseq.BioSequenceSlice, index int) {
|
||||
if index < 0 || index >= len(ksg.sets) {
|
||||
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
||||
}
|
||||
ksg.sets[index].AddSequenceSlice(sequences)
|
||||
}
|
||||
|
||||
// Union returns the union of all KmerSet in the group
|
||||
// Optimization: starts from the largest set to minimize operations
|
||||
func (ksg *KmerSetGroup) Union() *KmerSet {
|
||||
if len(ksg.sets) == 0 {
|
||||
return NewKmerSet(ksg.k)
|
||||
}
|
||||
|
||||
if len(ksg.sets) == 1 {
|
||||
return ksg.sets[0].Copy()
|
||||
}
|
||||
|
||||
// Find the index of the largest set (the one with the most k-mers)
|
||||
maxIdx := 0
|
||||
maxCard := ksg.sets[0].Len()
|
||||
for i := 1; i < len(ksg.sets); i++ {
|
||||
card := ksg.sets[i].Len()
|
||||
if card > maxCard {
|
||||
maxCard = card
|
||||
maxIdx = i
|
||||
}
|
||||
}
|
||||
|
||||
// Copy the largest set and perform unions in-place
|
||||
result := ksg.sets[maxIdx].bitmap.Clone()
|
||||
for i := 0; i < len(ksg.sets); i++ {
|
||||
if i != maxIdx {
|
||||
result.Or(ksg.sets[i].bitmap)
|
||||
}
|
||||
}
|
||||
|
||||
return NewKmerSetFromBitmap(ksg.k, result)
|
||||
}
|
||||
|
||||
// Intersect returns the intersection of all KmerSet in the group
|
||||
// Optimization: starts from the smallest set to minimize operations
|
||||
func (ksg *KmerSetGroup) Intersect() *KmerSet {
|
||||
if len(ksg.sets) == 0 {
|
||||
return NewKmerSet(ksg.k)
|
||||
}
|
||||
|
||||
if len(ksg.sets) == 1 {
|
||||
return ksg.sets[0].Copy()
|
||||
}
|
||||
|
||||
// Find the index of the smallest set (the one with the fewest k-mers)
|
||||
minIdx := 0
|
||||
minCard := ksg.sets[0].Len()
|
||||
for i := 1; i < len(ksg.sets); i++ {
|
||||
card := ksg.sets[i].Len()
|
||||
if card < minCard {
|
||||
minCard = card
|
||||
minIdx = i
|
||||
}
|
||||
}
|
||||
|
||||
// Copy the smallest set and perform intersections in-place
|
||||
result := ksg.sets[minIdx].bitmap.Clone()
|
||||
for i := 0; i < len(ksg.sets); i++ {
|
||||
if i != minIdx {
|
||||
result.And(ksg.sets[i].bitmap)
|
||||
}
|
||||
}
|
||||
|
||||
return NewKmerSetFromBitmap(ksg.k, result)
|
||||
}
|
||||
|
||||
// Stats returns statistics for each KmerSet in the group
|
||||
type KmerSetGroupStats struct {
|
||||
K int
|
||||
Size int // Number of KmerSet
|
||||
TotalBytes uint64 // Total memory used
|
||||
Sets []KmerSetStats // Stats of each KmerSet
|
||||
}
|
||||
|
||||
type KmerSetStats struct {
|
||||
Index int // Index of the KmerSet in the group
|
||||
Len uint64 // Number of k-mers
|
||||
SizeBytes uint64 // Size in bytes
|
||||
}
|
||||
|
||||
func (ksg *KmerSetGroup) Stats() KmerSetGroupStats {
|
||||
stats := KmerSetGroupStats{
|
||||
K: ksg.k,
|
||||
Size: len(ksg.sets),
|
||||
Sets: make([]KmerSetStats, len(ksg.sets)),
|
||||
}
|
||||
|
||||
for i, ks := range ksg.sets {
|
||||
sizeBytes := ks.MemoryUsage()
|
||||
stats.Sets[i] = KmerSetStats{
|
||||
Index: i,
|
||||
Len: ks.Len(),
|
||||
SizeBytes: sizeBytes,
|
||||
}
|
||||
stats.TotalBytes += sizeBytes
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
func (ksgs KmerSetGroupStats) String() string {
|
||||
result := fmt.Sprintf(`KmerSetGroup Statistics (k=%d, size=%d):
|
||||
Total memory: %.2f MB
|
||||
|
||||
Set breakdown:
|
||||
`, ksgs.K, ksgs.Size, float64(ksgs.TotalBytes)/1024/1024)
|
||||
|
||||
for _, set := range ksgs.Sets {
|
||||
result += fmt.Sprintf(" Set[%d]: %d k-mers (%.2f MB)\n",
|
||||
set.Index,
|
||||
set.Len,
|
||||
float64(set.SizeBytes)/1024/1024)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix for all KmerSets in the group.
|
||||
// Returns a triangular distance matrix where element (i, j) represents the Jaccard distance
|
||||
// between set i and set j.
|
||||
//
|
||||
// The Jaccard distance is: 1 - (|A ∩ B| / |A ∪ B|)
|
||||
//
|
||||
// The matrix labels are set to the IDs of the individual KmerSets if available,
|
||||
// otherwise they are set to "set_0", "set_1", etc.
|
||||
//
|
||||
// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
|
||||
// Space complexity: O(n²) for the distance matrix
|
||||
func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix {
|
||||
n := len(ksg.sets)
|
||||
|
||||
// Create labels from set IDs
|
||||
labels := make([]string, n)
|
||||
for i, ks := range ksg.sets {
|
||||
if ks.Id() != "" {
|
||||
labels[i] = ks.Id()
|
||||
} else {
|
||||
labels[i] = fmt.Sprintf("set_%d", i)
|
||||
}
|
||||
}
|
||||
|
||||
dm := obidist.NewDistMatrixWithLabels(labels)
|
||||
|
||||
// Compute pairwise distances
|
||||
for i := 0; i < n-1; i++ {
|
||||
for j := i + 1; j < n; j++ {
|
||||
distance := ksg.sets[i].JaccardDistance(ksg.sets[j])
|
||||
dm.Set(i, j, distance)
|
||||
}
|
||||
}
|
||||
|
||||
return dm
|
||||
}
|
||||
|
||||
// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix for all KmerSets in the group.
|
||||
// Returns a similarity matrix where element (i, j) represents the Jaccard similarity
|
||||
// between set i and set j.
|
||||
//
|
||||
// The Jaccard similarity is: |A ∩ B| / |A ∪ B|
|
||||
//
|
||||
// The diagonal is 1.0 (similarity of a set to itself).
|
||||
//
|
||||
// The matrix labels are set to the IDs of the individual KmerSets if available,
|
||||
// otherwise they are set to "set_0", "set_1", etc.
|
||||
//
|
||||
// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
|
||||
// Space complexity: O(n²) for the similarity matrix
|
||||
func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
|
||||
n := len(ksg.sets)
|
||||
|
||||
// Create labels from set IDs
|
||||
labels := make([]string, n)
|
||||
for i, ks := range ksg.sets {
|
||||
if ks.Id() != "" {
|
||||
labels[i] = ks.Id()
|
||||
} else {
|
||||
labels[i] = fmt.Sprintf("set_%d", i)
|
||||
}
|
||||
}
|
||||
|
||||
sm := obidist.NewSimilarityMatrixWithLabels(labels)
|
||||
|
||||
// Compute pairwise similarities
|
||||
for i := 0; i < n-1; i++ {
|
||||
for j := i + 1; j < n; j++ {
|
||||
similarity := ksg.sets[i].JaccardSimilarity(ksg.sets[j])
|
||||
sm.Set(i, j, similarity)
|
||||
}
|
||||
}
|
||||
|
||||
return sm
|
||||
}
|
||||
@@ -1,231 +0,0 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestKmerSetGroupJaccardDistanceMatrix(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 3)
|
||||
|
||||
// Set 0: {1, 2, 3}
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(0).AddKmerCode(2)
|
||||
ksg.Get(0).AddKmerCode(3)
|
||||
ksg.Get(0).SetId("set_A")
|
||||
|
||||
// Set 1: {2, 3, 4}
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(1).AddKmerCode(3)
|
||||
ksg.Get(1).AddKmerCode(4)
|
||||
ksg.Get(1).SetId("set_B")
|
||||
|
||||
// Set 2: {5, 6, 7}
|
||||
ksg.Get(2).AddKmerCode(5)
|
||||
ksg.Get(2).AddKmerCode(6)
|
||||
ksg.Get(2).AddKmerCode(7)
|
||||
ksg.Get(2).SetId("set_C")
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
|
||||
// Check labels
|
||||
if dm.GetLabel(0) != "set_A" {
|
||||
t.Errorf("Expected label 'set_A' at index 0, got '%s'", dm.GetLabel(0))
|
||||
}
|
||||
if dm.GetLabel(1) != "set_B" {
|
||||
t.Errorf("Expected label 'set_B' at index 1, got '%s'", dm.GetLabel(1))
|
||||
}
|
||||
if dm.GetLabel(2) != "set_C" {
|
||||
t.Errorf("Expected label 'set_C' at index 2, got '%s'", dm.GetLabel(2))
|
||||
}
|
||||
|
||||
// Check distances
|
||||
// Distance(0, 1):
|
||||
// Intersection: {2, 3} -> 2 elements
|
||||
// Union: {1, 2, 3, 4} -> 4 elements
|
||||
// Similarity: 2/4 = 0.5
|
||||
// Distance: 1 - 0.5 = 0.5
|
||||
expectedDist01 := 0.5
|
||||
actualDist01 := dm.Get(0, 1)
|
||||
if math.Abs(actualDist01-expectedDist01) > 1e-10 {
|
||||
t.Errorf("Distance(0, 1): expected %f, got %f", expectedDist01, actualDist01)
|
||||
}
|
||||
|
||||
// Distance(0, 2):
|
||||
// Intersection: {} -> 0 elements
|
||||
// Union: {1, 2, 3, 5, 6, 7} -> 6 elements
|
||||
// Similarity: 0/6 = 0
|
||||
// Distance: 1 - 0 = 1.0
|
||||
expectedDist02 := 1.0
|
||||
actualDist02 := dm.Get(0, 2)
|
||||
if math.Abs(actualDist02-expectedDist02) > 1e-10 {
|
||||
t.Errorf("Distance(0, 2): expected %f, got %f", expectedDist02, actualDist02)
|
||||
}
|
||||
|
||||
// Distance(1, 2):
|
||||
// Intersection: {} -> 0 elements
|
||||
// Union: {2, 3, 4, 5, 6, 7} -> 6 elements
|
||||
// Similarity: 0/6 = 0
|
||||
// Distance: 1 - 0 = 1.0
|
||||
expectedDist12 := 1.0
|
||||
actualDist12 := dm.Get(1, 2)
|
||||
if math.Abs(actualDist12-expectedDist12) > 1e-10 {
|
||||
t.Errorf("Distance(1, 2): expected %f, got %f", expectedDist12, actualDist12)
|
||||
}
|
||||
|
||||
// Check symmetry
|
||||
if dm.Get(0, 1) != dm.Get(1, 0) {
|
||||
t.Errorf("Matrix not symmetric: Get(0, 1) = %f, Get(1, 0) = %f",
|
||||
dm.Get(0, 1), dm.Get(1, 0))
|
||||
}
|
||||
|
||||
// Check diagonal
|
||||
if dm.Get(0, 0) != 0.0 {
|
||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(0, 0))
|
||||
}
|
||||
if dm.Get(1, 1) != 0.0 {
|
||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(1, 1))
|
||||
}
|
||||
if dm.Get(2, 2) != 0.0 {
|
||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(2, 2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardSimilarityMatrix(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 3)
|
||||
|
||||
// Set 0: {1, 2, 3}
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(0).AddKmerCode(2)
|
||||
ksg.Get(0).AddKmerCode(3)
|
||||
|
||||
// Set 1: {2, 3, 4}
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(1).AddKmerCode(3)
|
||||
ksg.Get(1).AddKmerCode(4)
|
||||
|
||||
// Set 2: {1, 2, 3} (same as set 0)
|
||||
ksg.Get(2).AddKmerCode(1)
|
||||
ksg.Get(2).AddKmerCode(2)
|
||||
ksg.Get(2).AddKmerCode(3)
|
||||
|
||||
sm := ksg.JaccardSimilarityMatrix()
|
||||
|
||||
// Check similarities
|
||||
// Similarity(0, 1): 0.5 (as calculated above)
|
||||
expectedSim01 := 0.5
|
||||
actualSim01 := sm.Get(0, 1)
|
||||
if math.Abs(actualSim01-expectedSim01) > 1e-10 {
|
||||
t.Errorf("Similarity(0, 1): expected %f, got %f", expectedSim01, actualSim01)
|
||||
}
|
||||
|
||||
// Similarity(0, 2): 1.0 (identical sets)
|
||||
expectedSim02 := 1.0
|
||||
actualSim02 := sm.Get(0, 2)
|
||||
if math.Abs(actualSim02-expectedSim02) > 1e-10 {
|
||||
t.Errorf("Similarity(0, 2): expected %f, got %f", expectedSim02, actualSim02)
|
||||
}
|
||||
|
||||
// Similarity(1, 2): 0.5
|
||||
// Intersection: {2, 3} -> 2
|
||||
// Union: {1, 2, 3, 4} -> 4
|
||||
// Similarity: 2/4 = 0.5
|
||||
expectedSim12 := 0.5
|
||||
actualSim12 := sm.Get(1, 2)
|
||||
if math.Abs(actualSim12-expectedSim12) > 1e-10 {
|
||||
t.Errorf("Similarity(1, 2): expected %f, got %f", expectedSim12, actualSim12)
|
||||
}
|
||||
|
||||
// Check diagonal (similarity to self = 1.0)
|
||||
if sm.Get(0, 0) != 1.0 {
|
||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(0, 0))
|
||||
}
|
||||
if sm.Get(1, 1) != 1.0 {
|
||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(1, 1))
|
||||
}
|
||||
if sm.Get(2, 2) != 1.0 {
|
||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(2, 2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardMatricesRelation(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 4)
|
||||
|
||||
// Create different sets
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(0).AddKmerCode(2)
|
||||
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(1).AddKmerCode(3)
|
||||
|
||||
ksg.Get(2).AddKmerCode(1)
|
||||
ksg.Get(2).AddKmerCode(2)
|
||||
ksg.Get(2).AddKmerCode(3)
|
||||
|
||||
ksg.Get(3).AddKmerCode(10)
|
||||
ksg.Get(3).AddKmerCode(20)
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
sm := ksg.JaccardSimilarityMatrix()
|
||||
|
||||
// For all pairs (including diagonal), distance + similarity should equal 1.0
|
||||
for i := 0; i < 4; i++ {
|
||||
for j := 0; j < 4; j++ {
|
||||
distance := dm.Get(i, j)
|
||||
similarity := sm.Get(i, j)
|
||||
sum := distance + similarity
|
||||
|
||||
if math.Abs(sum-1.0) > 1e-10 {
|
||||
t.Errorf("At (%d, %d): distance %f + similarity %f = %f, expected 1.0",
|
||||
i, j, distance, similarity, sum)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardMatrixLabels(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 3)
|
||||
|
||||
// Don't set IDs - should use default labels
|
||||
ksg.Get(0).AddKmerCode(1)
|
||||
ksg.Get(1).AddKmerCode(2)
|
||||
ksg.Get(2).AddKmerCode(3)
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
|
||||
// Check default labels
|
||||
if dm.GetLabel(0) != "set_0" {
|
||||
t.Errorf("Expected default label 'set_0', got '%s'", dm.GetLabel(0))
|
||||
}
|
||||
if dm.GetLabel(1) != "set_1" {
|
||||
t.Errorf("Expected default label 'set_1', got '%s'", dm.GetLabel(1))
|
||||
}
|
||||
if dm.GetLabel(2) != "set_2" {
|
||||
t.Errorf("Expected default label 'set_2', got '%s'", dm.GetLabel(2))
|
||||
}
|
||||
}
|
||||
|
||||
func TestKmerSetGroupJaccardMatrixSize(t *testing.T) {
|
||||
ksg := NewKmerSetGroup(5, 5)
|
||||
|
||||
for i := 0; i < 5; i++ {
|
||||
ksg.Get(i).AddKmerCode(uint64(i))
|
||||
}
|
||||
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
|
||||
if dm.Size() != 5 {
|
||||
t.Errorf("Expected matrix size 5, got %d", dm.Size())
|
||||
}
|
||||
|
||||
// All sets are disjoint, so all distances should be 1.0
|
||||
for i := 0; i < 5; i++ {
|
||||
for j := i + 1; j < 5; j++ {
|
||||
dist := dm.Get(i, j)
|
||||
if math.Abs(dist-1.0) > 1e-10 {
|
||||
t.Errorf("Expected distance 1.0 for disjoint sets (%d, %d), got %f",
|
||||
i, j, dist)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,235 +0,0 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"container/heap"
|
||||
|
||||
"github.com/RoaringBitmap/roaring/roaring64"
|
||||
)
|
||||
|
||||
// heapItem represents an element in the min-heap for k-way merge
|
||||
type heapItem struct {
|
||||
value uint64
|
||||
idx int
|
||||
}
|
||||
|
||||
// kmerMinHeap implements heap.Interface for k-way merge algorithm
|
||||
type kmerMinHeap []heapItem
|
||||
|
||||
func (h kmerMinHeap) Len() int { return len(h) }
|
||||
func (h kmerMinHeap) Less(i, j int) bool { return h[i].value < h[j].value }
|
||||
func (h kmerMinHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
|
||||
|
||||
func (h *kmerMinHeap) Push(x interface{}) {
|
||||
*h = append(*h, x.(heapItem))
|
||||
}
|
||||
|
||||
func (h *kmerMinHeap) Pop() interface{} {
|
||||
old := *h
|
||||
n := len(old)
|
||||
x := old[n-1]
|
||||
*h = old[0 : n-1]
|
||||
return x
|
||||
}
|
||||
|
||||
// QuorumAtLeast returns k-mers present in at least q sets
|
||||
//
|
||||
// Algorithm: K-way merge with min-heap counting
|
||||
//
|
||||
// The algorithm processes all k-mers in sorted order using a min-heap:
|
||||
//
|
||||
// 1. Initialize one iterator per non-empty set
|
||||
// 2. Build a min-heap of (value, set_index) pairs, one per iterator
|
||||
// 3. While heap is not empty:
|
||||
// a. Extract the minimum value v from heap
|
||||
// b. Pop ALL heap items with value == v (counting occurrences)
|
||||
// c. If count >= q, add v to result
|
||||
// d. Advance each popped iterator and re-insert into heap if valid
|
||||
//
|
||||
// This ensures each unique k-mer is counted exactly once across all sets.
|
||||
//
|
||||
// Time complexity: O(M log N)
|
||||
// - M = sum of all set cardinalities (total k-mer occurrences)
|
||||
// - N = number of sets
|
||||
// - Each k-mer occurrence is inserted/extracted from heap once: O(M) operations
|
||||
// - Each heap operation costs O(log N)
|
||||
//
|
||||
// Space complexity: O(N)
|
||||
// - Heap contains at most N elements (one per set iterator)
|
||||
// - Output bitmap size depends on quorum result
|
||||
//
|
||||
// Special cases (optimized):
|
||||
// - q <= 0: returns empty set
|
||||
// - q == 1: delegates to Union() (native OR operations)
|
||||
// - q == n: delegates to Intersect() (native AND operations)
|
||||
// - q > n: returns empty set (impossible to satisfy)
|
||||
func (ksg *KmerSetGroup) QuorumAtLeast(q int) *KmerSet {
|
||||
n := len(ksg.sets)
|
||||
|
||||
// Edge cases
|
||||
if q <= 0 || n == 0 {
|
||||
return NewKmerSet(ksg.k)
|
||||
}
|
||||
if q > n {
|
||||
return NewKmerSet(ksg.k)
|
||||
}
|
||||
if q == 1 {
|
||||
return ksg.Union()
|
||||
}
|
||||
if q == n {
|
||||
return ksg.Intersect()
|
||||
}
|
||||
|
||||
// Initialize iterators for all non-empty sets
|
||||
iterators := make([]roaring64.IntIterable64, 0, n)
|
||||
iterIndices := make([]int, 0, n)
|
||||
|
||||
for i, set := range ksg.sets {
|
||||
if set.Len() > 0 {
|
||||
iter := set.bitmap.Iterator()
|
||||
if iter.HasNext() {
|
||||
iterators = append(iterators, iter)
|
||||
iterIndices = append(iterIndices, i)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(iterators) == 0 {
|
||||
return NewKmerSet(ksg.k)
|
||||
}
|
||||
|
||||
// Initialize heap with first value from each iterator
|
||||
h := make(kmerMinHeap, len(iterators))
|
||||
for i, iter := range iterators {
|
||||
h[i] = heapItem{value: iter.Next(), idx: i}
|
||||
}
|
||||
heap.Init(&h)
|
||||
|
||||
// Result bitmap
|
||||
result := roaring64.New()
|
||||
|
||||
// K-way merge with counting
|
||||
for len(h) > 0 {
|
||||
minVal := h[0].value
|
||||
count := 0
|
||||
activeIndices := make([]int, 0, len(h))
|
||||
|
||||
// Pop all elements with same value (count occurrences)
|
||||
for len(h) > 0 && h[0].value == minVal {
|
||||
item := heap.Pop(&h).(heapItem)
|
||||
count++
|
||||
activeIndices = append(activeIndices, item.idx)
|
||||
}
|
||||
|
||||
// Add to result if quorum reached
|
||||
if count >= q {
|
||||
result.Add(minVal)
|
||||
}
|
||||
|
||||
// Advance iterators and re-insert into heap
|
||||
for _, iterIdx := range activeIndices {
|
||||
if iterators[iterIdx].HasNext() {
|
||||
heap.Push(&h, heapItem{
|
||||
value: iterators[iterIdx].Next(),
|
||||
idx: iterIdx,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return NewKmerSetFromBitmap(ksg.k, result)
|
||||
}
|
||||
|
||||
// QuorumAtMost returns k-mers present in at most q sets
|
||||
//
|
||||
// Algorithm: Uses the mathematical identity
|
||||
// AtMost(q) = Union() - AtLeast(q+1)
|
||||
//
|
||||
// Proof:
|
||||
// - Union() contains all k-mers present in at least 1 set
|
||||
// - AtLeast(q+1) contains all k-mers present in q+1 or more sets
|
||||
// - Their difference contains only k-mers present in at most q sets
|
||||
//
|
||||
// Implementation:
|
||||
// 1. Compute U = Union()
|
||||
// 2. Compute A = QuorumAtLeast(q+1)
|
||||
// 3. Return U - A using bitmap AndNot operation
|
||||
//
|
||||
// Time complexity: O(M log N)
|
||||
// - Union(): O(M) with native OR operations
|
||||
// - QuorumAtLeast(q+1): O(M log N)
|
||||
// - AndNot: O(|U|) where |U| <= M
|
||||
// - Total: O(M log N)
|
||||
//
|
||||
// Space complexity: O(N)
|
||||
// - Inherited from QuorumAtLeast heap
|
||||
//
|
||||
// Special cases:
|
||||
// - q <= 0: returns empty set
|
||||
// - q >= n: returns Union() (all k-mers are in at most n sets)
|
||||
func (ksg *KmerSetGroup) QuorumAtMost(q int) *KmerSet {
|
||||
n := len(ksg.sets)
|
||||
|
||||
// Edge cases
|
||||
if q <= 0 {
|
||||
return NewKmerSet(ksg.k)
|
||||
}
|
||||
if q >= n {
|
||||
return ksg.Union()
|
||||
}
|
||||
|
||||
// Compute Union() - AtLeast(q+1)
|
||||
union := ksg.Union()
|
||||
atLeastQ1 := ksg.QuorumAtLeast(q + 1)
|
||||
|
||||
// Difference: elements in union but not in atLeastQ1
|
||||
result := union.bitmap.Clone()
|
||||
result.AndNot(atLeastQ1.bitmap)
|
||||
|
||||
return NewKmerSetFromBitmap(ksg.k, result)
|
||||
}
|
||||
|
||||
// QuorumExactly returns k-mers present in exactly q sets
|
||||
//
|
||||
// Algorithm: Uses the mathematical identity
|
||||
// Exactly(q) = AtLeast(q) - AtLeast(q+1)
|
||||
//
|
||||
// Proof:
|
||||
// - AtLeast(q) contains all k-mers present in q or more sets
|
||||
// - AtLeast(q+1) contains all k-mers present in q+1 or more sets
|
||||
// - Their difference contains only k-mers present in exactly q sets
|
||||
//
|
||||
// Implementation:
|
||||
// 1. Compute A = QuorumAtLeast(q)
|
||||
// 2. Compute B = QuorumAtLeast(q+1)
|
||||
// 3. Return A - B using bitmap AndNot operation
|
||||
//
|
||||
// Time complexity: O(M log N)
|
||||
// - Two calls to QuorumAtLeast: 2 * O(M log N)
|
||||
// - One AndNot operation: O(|A|) where |A| <= M
|
||||
// - Total: O(M log N) since AndNot is dominated by merge operations
|
||||
//
|
||||
// Space complexity: O(N)
|
||||
// - Inherited from QuorumAtLeast heap
|
||||
// - Two temporary bitmaps for intermediate results
|
||||
//
|
||||
// Special cases:
|
||||
// - q <= 0: returns empty set
|
||||
// - q > n: returns empty set (impossible to have k-mer in more than n sets)
|
||||
func (ksg *KmerSetGroup) QuorumExactly(q int) *KmerSet {
|
||||
n := len(ksg.sets)
|
||||
|
||||
// Edge cases
|
||||
if q <= 0 || q > n {
|
||||
return NewKmerSet(ksg.k)
|
||||
}
|
||||
|
||||
// Compute AtLeast(q) - AtLeast(q+1)
|
||||
aq := ksg.QuorumAtLeast(q)
|
||||
aq1 := ksg.QuorumAtLeast(q + 1)
|
||||
|
||||
// Difference: elements in aq but not in aq1
|
||||
result := aq.bitmap.Clone()
|
||||
result.AndNot(aq1.bitmap)
|
||||
|
||||
return NewKmerSetFromBitmap(ksg.k, result)
|
||||
}
|
||||
@@ -1,395 +0,0 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestQuorumAtLeastEdgeCases tests edge cases for QuorumAtLeast:
// all-empty groups, non-positive quorums, and quorums larger than the
// number of sets — all of which must return an empty set.
func TestQuorumAtLeastEdgeCases(t *testing.T) {
	k := 5

	// Test group with all empty sets
	emptyGroup := NewKmerSetGroup(k, 3)
	result := emptyGroup.QuorumAtLeast(1)
	if result.Len() != 0 {
		t.Errorf("Empty sets: expected 0 k-mers, got %d", result.Len())
	}

	// Test q <= 0
	group := NewKmerSetGroup(k, 3)
	result = group.QuorumAtLeast(0)
	if result.Len() != 0 {
		t.Errorf("q=0: expected 0 k-mers, got %d", result.Len())
	}

	result = group.QuorumAtLeast(-1)
	if result.Len() != 0 {
		t.Errorf("q=-1: expected 0 k-mers, got %d", result.Len())
	}

	// Test q > n
	group.Get(0).AddKmerCode(1)
	result = group.QuorumAtLeast(10)
	if result.Len() != 0 {
		t.Errorf("q>n: expected 0 k-mers, got %d", result.Len())
	}
}
|
||||
|
||||
// TestQuorumAtLeastQ1 tests q=1 (should equal Union), exercising the
// optimized delegation path.
func TestQuorumAtLeastQ1(t *testing.T) {
	k := 5
	group := NewKmerSetGroup(k, 3)

	// Add different k-mers to each set
	group.Get(0).AddKmerCode(1)
	group.Get(0).AddKmerCode(2)
	group.Get(1).AddKmerCode(2)
	group.Get(1).AddKmerCode(3)
	group.Get(2).AddKmerCode(3)
	group.Get(2).AddKmerCode(4)

	quorum := group.QuorumAtLeast(1)
	union := group.Union()

	if quorum.Len() != union.Len() {
		t.Errorf("QuorumAtLeast(1) length %d != Union length %d", quorum.Len(), union.Len())
	}

	// Check all elements match
	for kmer := uint64(1); kmer <= 4; kmer++ {
		if quorum.Contains(kmer) != union.Contains(kmer) {
			t.Errorf("Mismatch for k-mer %d", kmer)
		}
	}
}
|
||||
|
||||
// TestQuorumAtLeastQN tests q=n (should equal Intersect), exercising the
// optimized delegation path: only k-mers shared by all sets survive.
func TestQuorumAtLeastQN(t *testing.T) {
	k := 5
	group := NewKmerSetGroup(k, 3)

	// Add some common k-mers and some unique
	for i := 0; i < 3; i++ {
		group.Get(i).AddKmerCode(10) // common to all
		group.Get(i).AddKmerCode(20) // common to all
	}
	group.Get(0).AddKmerCode(1) // unique to set 0
	group.Get(1).AddKmerCode(2) // unique to set 1

	quorum := group.QuorumAtLeast(3)
	intersect := group.Intersect()

	if quorum.Len() != intersect.Len() {
		t.Errorf("QuorumAtLeast(n) length %d != Intersect length %d", quorum.Len(), intersect.Len())
	}

	if quorum.Len() != 2 {
		t.Errorf("Expected 2 common k-mers, got %d", quorum.Len())
	}

	if !quorum.Contains(10) || !quorum.Contains(20) {
		t.Error("Missing common k-mers")
	}

	if quorum.Contains(1) || quorum.Contains(2) {
		t.Error("Unique k-mers should not be in result")
	}
}
|
||||
|
||||
// TestQuorumAtLeastGeneral tests general quorum values on a staircase
// fixture where k-mer i appears in exactly i of the 5 sets, so the
// expected result of QuorumAtLeast(q) is {q, q+1, ..., 5}.
func TestQuorumAtLeastGeneral(t *testing.T) {
	k := 5
	group := NewKmerSetGroup(k, 5)

	// Setup: k-mer i appears in i sets (for i=1..5)
	// k-mer 1: in set 0
	// k-mer 2: in sets 0,1
	// k-mer 3: in sets 0,1,2
	// k-mer 4: in sets 0,1,2,3
	// k-mer 5: in sets 0,1,2,3,4 (all)

	for kmer := uint64(1); kmer <= 5; kmer++ {
		for setIdx := 0; setIdx < int(kmer); setIdx++ {
			group.Get(setIdx).AddKmerCode(kmer)
		}
	}

	tests := []struct {
		q        int
		expected map[uint64]bool
	}{
		{1, map[uint64]bool{1: true, 2: true, 3: true, 4: true, 5: true}},
		{2, map[uint64]bool{2: true, 3: true, 4: true, 5: true}},
		{3, map[uint64]bool{3: true, 4: true, 5: true}},
		{4, map[uint64]bool{4: true, 5: true}},
		{5, map[uint64]bool{5: true}},
	}

	for _, tt := range tests {
		result := group.QuorumAtLeast(tt.q)

		if result.Len() != uint64(len(tt.expected)) {
			t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len())
		}

		for kmer := uint64(1); kmer <= 5; kmer++ {
			shouldContain := tt.expected[kmer]
			doesContain := result.Contains(kmer)
			if shouldContain != doesContain {
				t.Errorf("q=%d, k-mer=%d: expected contains=%v, got %v", tt.q, kmer, shouldContain, doesContain)
			}
		}
	}
}
|
||||
|
||||
// TestQuorumExactlyBasic tests QuorumExactly basic functionality on the
// staircase fixture (k-mer i appears in exactly i sets), so the result of
// QuorumExactly(q) must be the singleton {q}.
func TestQuorumExactlyBasic(t *testing.T) {
	k := 5
	group := NewKmerSetGroup(k, 5)

	// Setup: k-mer i appears in exactly i sets
	for kmer := uint64(1); kmer <= 5; kmer++ {
		for setIdx := 0; setIdx < int(kmer); setIdx++ {
			group.Get(setIdx).AddKmerCode(kmer)
		}
	}

	tests := []struct {
		q        int
		expected []uint64
	}{
		{1, []uint64{1}},
		{2, []uint64{2}},
		{3, []uint64{3}},
		{4, []uint64{4}},
		{5, []uint64{5}},
	}

	for _, tt := range tests {
		result := group.QuorumExactly(tt.q)

		if result.Len() != uint64(len(tt.expected)) {
			t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len())
		}

		for _, kmer := range tt.expected {
			if !result.Contains(kmer) {
				t.Errorf("q=%d: missing k-mer %d", tt.q, kmer)
			}
		}
	}
}
|
||||
|
||||
// TestQuorumIdentity tests the mathematical identity
// Exactly(q) = AtLeast(q) - AtLeast(q+1): every member of Exactly(q) must
// appear in AtLeast(q) and must be absent from AtLeast(q+1).
func TestQuorumIdentity(t *testing.T) {
	k := 5
	group := NewKmerSetGroup(k, 4)

	// Add random distribution
	group.Get(0).AddKmerCode(1)
	group.Get(0).AddKmerCode(2)
	group.Get(0).AddKmerCode(3)

	group.Get(1).AddKmerCode(2)
	group.Get(1).AddKmerCode(3)
	group.Get(1).AddKmerCode(4)

	group.Get(2).AddKmerCode(3)
	group.Get(2).AddKmerCode(4)

	group.Get(3).AddKmerCode(4)

	for q := 1; q <= 4; q++ {
		exactly := group.QuorumExactly(q)
		atLeast := group.QuorumAtLeast(q)
		atLeastPlus1 := group.QuorumAtLeast(q + 1)

		// Verify: every element in exactly(q) is in atLeast(q)
		iter := exactly.Iterator()
		for iter.HasNext() {
			kmer := iter.Next()
			if !atLeast.Contains(kmer) {
				t.Errorf("q=%d: k-mer %d in Exactly but not in AtLeast", q, kmer)
			}
			if atLeastPlus1.Contains(kmer) {
				t.Errorf("q=%d: k-mer %d in Exactly but also in AtLeast(q+1)", q, kmer)
			}
		}
	}
}
|
||||
|
||||
// TestQuorumDisjointSets tests quorum on completely disjoint sets:
// a quorum of 1 returns every k-mer, a quorum of 2 returns none.
func TestQuorumDisjointSets(t *testing.T) {
	k := 5
	group := NewKmerSetGroup(k, 3)

	// Each set has unique k-mers
	group.Get(0).AddKmerCode(1)
	group.Get(1).AddKmerCode(2)
	group.Get(2).AddKmerCode(3)

	// q=1 should give all
	result := group.QuorumAtLeast(1)
	if result.Len() != 3 {
		t.Errorf("Disjoint sets q=1: expected 3, got %d", result.Len())
	}

	// q=2 should give none
	result = group.QuorumAtLeast(2)
	if result.Len() != 0 {
		t.Errorf("Disjoint sets q=2: expected 0, got %d", result.Len())
	}
}
|
||||
|
||||
// TestQuorumIdenticalSets tests quorum on identical sets: when every set
// contains the same k-mers, any quorum q <= n returns all of them.
func TestQuorumIdenticalSets(t *testing.T) {
	k := 5
	group := NewKmerSetGroup(k, 3)

	// All sets have same k-mers
	for i := 0; i < 3; i++ {
		group.Get(i).AddKmerCode(10)
		group.Get(i).AddKmerCode(20)
		group.Get(i).AddKmerCode(30)
	}

	// Any q <= n should give all k-mers
	for q := 1; q <= 3; q++ {
		result := group.QuorumAtLeast(q)
		if result.Len() != 3 {
			t.Errorf("Identical sets q=%d: expected 3, got %d", q, result.Len())
		}
	}
}
|
||||
|
||||
// TestQuorumLargeNumbers tests with large k-mer values near the top of the
// uint64 range (realistic k=21 encodings). largeKmers[i] is inserted into
// sets 0..i, so a quorum of 2 must select largeKmers[1] and largeKmers[2].
func TestQuorumLargeNumbers(t *testing.T) {
	k := 21
	group := NewKmerSetGroup(k, 3)

	// Use large uint64 values (actual k-mer encodings)
	largeKmers := []uint64{
		0x1234567890ABCDEF,
		0xFEDCBA0987654321,
		0xAAAAAAAAAAAAAAAA,
	}

	// Add to multiple sets
	for i := 0; i < 3; i++ {
		for j := 0; j <= i; j++ {
			group.Get(j).AddKmerCode(largeKmers[i])
		}
	}

	result := group.QuorumAtLeast(2)
	if result.Len() != 2 {
		t.Errorf("Large numbers q=2: expected 2, got %d", result.Len())
	}

	if !result.Contains(largeKmers[1]) || !result.Contains(largeKmers[2]) {
		t.Error("Large numbers: wrong k-mers in result")
	}
}
|
||||
|
||||
// TestQuorumAtMostBasic tests QuorumAtMost basic functionality on the
// staircase fixture (k-mer i appears in exactly i sets): the result of
// QuorumAtMost(q) must be {1, ..., min(q, 5)}.
func TestQuorumAtMostBasic(t *testing.T) {
	k := 5
	group := NewKmerSetGroup(k, 5)

	// Setup: k-mer i appears in exactly i sets
	for kmer := uint64(1); kmer <= 5; kmer++ {
		for setIdx := 0; setIdx < int(kmer); setIdx++ {
			group.Get(setIdx).AddKmerCode(kmer)
		}
	}

	tests := []struct {
		q        int
		expected []uint64
	}{
		{0, []uint64{}},               // at most 0: none
		{1, []uint64{1}},              // at most 1: only k-mer 1
		{2, []uint64{1, 2}},           // at most 2: k-mers 1,2
		{3, []uint64{1, 2, 3}},        // at most 3: k-mers 1,2,3
		{4, []uint64{1, 2, 3, 4}},     // at most 4: k-mers 1,2,3,4
		{5, []uint64{1, 2, 3, 4, 5}},  // at most 5: all k-mers
		{10, []uint64{1, 2, 3, 4, 5}}, // at most 10: all k-mers
	}

	for _, tt := range tests {
		result := group.QuorumAtMost(tt.q)

		if result.Len() != uint64(len(tt.expected)) {
			t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len())
		}

		for _, kmer := range tt.expected {
			if !result.Contains(kmer) {
				t.Errorf("q=%d: missing k-mer %d", tt.q, kmer)
			}
		}
	}
}
|
||||
|
||||
// TestQuorumComplementIdentity tests that AtLeast and AtMost are
// complementary: AtMost(q) and AtLeast(q+1) must partition the Union
// (their union equals Union(), their intersection is empty).
func TestQuorumComplementIdentity(t *testing.T) {
	k := 5
	group := NewKmerSetGroup(k, 4)

	// Add random distribution
	group.Get(0).AddKmerCode(1)
	group.Get(0).AddKmerCode(2)
	group.Get(0).AddKmerCode(3)

	group.Get(1).AddKmerCode(2)
	group.Get(1).AddKmerCode(3)
	group.Get(1).AddKmerCode(4)

	group.Get(2).AddKmerCode(3)
	group.Get(2).AddKmerCode(4)

	group.Get(3).AddKmerCode(4)

	union := group.Union()

	for q := 1; q < 4; q++ {
		atMost := group.QuorumAtMost(q)
		atLeast := group.QuorumAtLeast(q + 1)

		// Verify: AtMost(q) ∪ AtLeast(q+1) = Union()
		combined := atMost.Union(atLeast)

		if combined.Len() != union.Len() {
			t.Errorf("q=%d: AtMost(q) ∪ AtLeast(q+1) has %d k-mers, Union has %d",
				q, combined.Len(), union.Len())
		}

		// Verify: AtMost(q) ∩ AtLeast(q+1) = ∅
		overlap := atMost.Intersect(atLeast)
		if overlap.Len() != 0 {
			t.Errorf("q=%d: AtMost(q) and AtLeast(q+1) overlap with %d k-mers",
				q, overlap.Len())
		}
	}
}
|
||||
|
||||
// BenchmarkQuorumAtLeast benchmarks quorum operations on a group of 10 sets
// populated so that each k-mer occurs in a varying number of sets.
func BenchmarkQuorumAtLeast(b *testing.B) {
	k := 21
	n := 10
	group := NewKmerSetGroup(k, n)

	// Populate with realistic data: k-mer j lands in set i whenever
	// j mod n <= i, giving overlapping but non-identical sets.
	for i := 0; i < n; i++ {
		for j := uint64(0); j < 10000; j++ {
			if (j % uint64(n)) <= uint64(i) {
				group.Get(i).AddKmerCode(j)
			}
		}
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = group.QuorumAtLeast(5)
	}
}
|
||||
@@ -1,376 +0,0 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/pelletier/go-toml/v2"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// MetadataFormat represents the metadata serialization format used when
// persisting KmerSet/KmerSetGroup metadata to disk.
type MetadataFormat int

const (
	FormatTOML MetadataFormat = iota
	FormatYAML
	FormatJSON
)

// String returns the file extension associated with the format
// ("toml", "yaml" or "json"). Unknown values fall back to "toml".
func (f MetadataFormat) String() string {
	switch f {
	case FormatYAML:
		return "yaml"
	case FormatJSON:
		return "json"
	default:
		// FormatTOML and any unrecognized value map to the TOML extension.
		return "toml"
	}
}
|
||||
|
||||
// KmerSetMetadata holds the metadata of a KmerSet or KmerSetGroup as
// serialized alongside its .roaring bitmap files.
type KmerSetMetadata struct {
	ID           string                   `toml:"id,omitempty" yaml:"id,omitempty" json:"id,omitempty"`                                  // Unique identifier
	K            int                      `toml:"k" yaml:"k" json:"k"`                                                                   // k-mer size
	Type         string                   `toml:"type" yaml:"type" json:"type"`                                                          // "KmerSet" or "KmerSetGroup"
	Size         int                      `toml:"size" yaml:"size" json:"size"`                                                          // 1 for a KmerSet, n for a KmerSetGroup
	Files        []string                 `toml:"files" yaml:"files" json:"files"`                                                       // List of .roaring files
	SetsIDs      []string                 `toml:"sets_ids,omitempty" yaml:"sets_ids,omitempty" json:"sets_ids,omitempty"`                // IDs of the individual KmerSets
	UserMetadata map[string]interface{}   `toml:"user_metadata,omitempty" yaml:"user_metadata,omitempty" json:"user_metadata,omitempty"` // User metadata of the KmerSet or KmerSetGroup
	SetsMetadata []map[string]interface{} `toml:"sets_metadata,omitempty" yaml:"sets_metadata,omitempty" json:"sets_metadata,omitempty"` // Metadata of each individual KmerSet within a KmerSetGroup
}
|
||||
|
||||
// Save writes the KmerSet into directory.
// Layout: directory/metadata.{toml,yaml,json} + directory/set_0.roaring
func (ks *KmerSet) Save(directory string, format MetadataFormat) error {
	// Create the directory if needed
	if err := os.MkdirAll(directory, 0755); err != nil {
		return fmt.Errorf("failed to create directory %s: %w", directory, err)
	}

	// Metadata describing this single-set layout
	metadata := KmerSetMetadata{
		ID:           ks.id,
		K:            ks.k,
		Type:         "KmerSet",
		Size:         1,
		Files:        []string{"set_0.roaring"},
		UserMetadata: ks.Metadata, // persist the user-supplied metadata
	}

	// Write the metadata file
	if err := saveMetadata(filepath.Join(directory, "metadata."+format.String()), metadata, format); err != nil {
		return err
	}

	// Write the bitmap
	bitmapPath := filepath.Join(directory, "set_0.roaring")
	file, err := os.Create(bitmapPath)
	if err != nil {
		return fmt.Errorf("failed to create bitmap file %s: %w", bitmapPath, err)
	}
	defer file.Close()

	if _, err := ks.bitmap.WriteTo(file); err != nil {
		return fmt.Errorf("failed to write bitmap: %w", err)
	}

	return nil
}
|
||||
|
||||
// LoadKmerSet loads a KmerSet previously written by (*KmerSet).Save from
// directory, trying metadata.toml, .yaml and .json in turn.
func LoadKmerSet(directory string) (*KmerSet, error) {
	// Read the metadata (trying every supported format)
	metadata, err := loadMetadata(directory)
	if err != nil {
		return nil, err
	}

	// Check the persisted type
	if metadata.Type != "KmerSet" {
		return nil, fmt.Errorf("invalid type: expected KmerSet, got %s", metadata.Type)
	}

	// A KmerSet is stored as exactly one bitmap file
	if metadata.Size != 1 || len(metadata.Files) != 1 {
		return nil, fmt.Errorf("KmerSet must have exactly 1 bitmap file, got %d", len(metadata.Files))
	}

	// Load the bitmap
	bitmapPath := filepath.Join(directory, metadata.Files[0])
	file, err := os.Open(bitmapPath)
	if err != nil {
		return nil, fmt.Errorf("failed to open bitmap file %s: %w", bitmapPath, err)
	}
	defer file.Close()

	ks := NewKmerSet(metadata.K)

	// Restore the identifier
	ks.id = metadata.ID

	// Restore the user metadata
	if metadata.UserMetadata != nil {
		ks.Metadata = metadata.UserMetadata
	}

	if _, err := ks.bitmap.ReadFrom(file); err != nil {
		return nil, fmt.Errorf("failed to read bitmap: %w", err)
	}

	return ks, nil
}
|
||||
|
||||
// Save writes the KmerSetGroup into directory.
// Layout: directory/metadata.{toml,yaml,json} + directory/set_0.roaring, set_1.roaring, ...
func (ksg *KmerSetGroup) Save(directory string, format MetadataFormat) error {
	// Create the directory if needed
	if err := os.MkdirAll(directory, 0755); err != nil {
		return fmt.Errorf("failed to create directory %s: %w", directory, err)
	}

	// One bitmap file per set, named by position
	files := make([]string, len(ksg.sets))
	for i := range ksg.sets {
		files[i] = fmt.Sprintf("set_%d.roaring", i)
	}

	// Collect the ID and metadata of each individual KmerSet
	setsIDs := make([]string, len(ksg.sets))
	setsMetadata := make([]map[string]interface{}, len(ksg.sets))
	for i, ks := range ksg.sets {
		setsIDs[i] = ks.id
		setsMetadata[i] = ks.Metadata
	}

	metadata := KmerSetMetadata{
		ID:           ksg.id,
		K:            ksg.k,
		Type:         "KmerSetGroup",
		Size:         len(ksg.sets),
		Files:        files,
		SetsIDs:      setsIDs,      // ID of each set
		UserMetadata: ksg.Metadata, // group-level metadata
		SetsMetadata: setsMetadata, // per-set metadata
	}

	// Write the metadata file
	if err := saveMetadata(filepath.Join(directory, "metadata."+format.String()), metadata, format); err != nil {
		return err
	}

	// Write each bitmap; close explicitly inside the loop so file handles
	// are released per iteration rather than at function return.
	for i, ks := range ksg.sets {
		bitmapPath := filepath.Join(directory, files[i])
		file, err := os.Create(bitmapPath)
		if err != nil {
			return fmt.Errorf("failed to create bitmap file %s: %w", bitmapPath, err)
		}

		if _, err := ks.bitmap.WriteTo(file); err != nil {
			file.Close()
			return fmt.Errorf("failed to write bitmap %d: %w", i, err)
		}
		file.Close()
	}

	return nil
}
|
||||
|
||||
// LoadKmerSetGroup loads a KmerSetGroup previously written by
// (*KmerSetGroup).Save from directory.
func LoadKmerSetGroup(directory string) (*KmerSetGroup, error) {
	// Read the metadata (trying every supported format)
	metadata, err := loadMetadata(directory)
	if err != nil {
		return nil, err
	}

	// Check the persisted type
	if metadata.Type != "KmerSetGroup" {
		return nil, fmt.Errorf("invalid type: expected KmerSetGroup, got %s", metadata.Type)
	}

	// Check consistency between the declared size and the file list
	if metadata.Size != len(metadata.Files) {
		return nil, fmt.Errorf("size mismatch: size=%d but %d files listed", metadata.Size, len(metadata.Files))
	}

	// Create the group
	ksg := NewKmerSetGroup(metadata.K, metadata.Size)

	// Restore the group identifier
	ksg.id = metadata.ID

	// Restore the group-level metadata
	if metadata.UserMetadata != nil {
		ksg.Metadata = metadata.UserMetadata
	}

	// Restore the ID of each KmerSet (silently skipped if absent or inconsistent)
	if metadata.SetsIDs != nil && len(metadata.SetsIDs) == metadata.Size {
		for i := range ksg.sets {
			ksg.sets[i].id = metadata.SetsIDs[i]
		}
	}

	// Restore the metadata of each individual KmerSet
	if metadata.SetsMetadata != nil {
		if len(metadata.SetsMetadata) != metadata.Size {
			return nil, fmt.Errorf("sets metadata size mismatch: expected %d, got %d", metadata.Size, len(metadata.SetsMetadata))
		}
		for i := range ksg.sets {
			ksg.sets[i].Metadata = metadata.SetsMetadata[i]
		}
	}

	// Load each bitmap; close explicitly inside the loop so file handles
	// are released per iteration rather than at function return.
	for i, filename := range metadata.Files {
		bitmapPath := filepath.Join(directory, filename)
		file, err := os.Open(bitmapPath)
		if err != nil {
			return nil, fmt.Errorf("failed to open bitmap file %s: %w", bitmapPath, err)
		}

		if _, err := ksg.sets[i].bitmap.ReadFrom(file); err != nil {
			file.Close()
			return nil, fmt.Errorf("failed to read bitmap %d: %w", i, err)
		}
		file.Close()
	}

	return ksg, nil
}
|
||||
|
||||
// saveMetadata encodes metadata into the file at path using the
// requested serialization format.
func saveMetadata(path string, metadata KmerSetMetadata, format MetadataFormat) error {
	file, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("failed to create metadata file %s: %w", path, err)
	}
	defer file.Close()

	// All three encoders share the same single-method surface.
	var encoder interface{ Encode(interface{}) error }

	switch format {
	case FormatTOML:
		encoder = toml.NewEncoder(file)
	case FormatYAML:
		// NOTE(review): yaml.Encoder also exposes Close(), which finalizes
		// the YAML stream; it is not called here — confirm the output is
		// complete for single-document encoding.
		encoder = yaml.NewEncoder(file)
	case FormatJSON:
		jsonEncoder := json.NewEncoder(file)
		jsonEncoder.SetIndent("", " ")
		encoder = jsonEncoder
	default:
		return fmt.Errorf("unsupported format: %v", format)
	}

	if err := encoder.Encode(metadata); err != nil {
		return fmt.Errorf("failed to encode metadata: %w", err)
	}

	return nil
}
|
||||
|
||||
// loadMetadata reads metadata from directory, trying the TOML, YAML and
// JSON files in that order and returning the first one that decodes
// successfully.
func loadMetadata(directory string) (*KmerSetMetadata, error) {
	formats := []MetadataFormat{FormatTOML, FormatYAML, FormatJSON}

	var lastErr error
	for _, format := range formats {
		path := filepath.Join(directory, "metadata."+format.String())

		// Skip formats whose metadata file is absent
		if _, err := os.Stat(path); os.IsNotExist(err) {
			continue
		}

		metadata, err := loadMetadataFromFile(path, format)
		if err != nil {
			// Remember the failure but keep trying the remaining formats
			lastErr = err
			continue
		}
		return metadata, nil
	}

	if lastErr != nil {
		return nil, fmt.Errorf("failed to load metadata: %w", lastErr)
	}
	return nil, fmt.Errorf("no metadata file found in %s (tried .toml, .yaml, .json)", directory)
}
|
||||
|
||||
// loadMetadataFromFile decodes the metadata file at path using the given
// serialization format.
func loadMetadataFromFile(path string, format MetadataFormat) (*KmerSetMetadata, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("failed to open metadata file %s: %w", path, err)
	}
	defer file.Close()

	var metadata KmerSetMetadata
	// All three decoders share the same single-method surface.
	var decoder interface{ Decode(interface{}) error }

	switch format {
	case FormatTOML:
		decoder = toml.NewDecoder(file)
	case FormatYAML:
		decoder = yaml.NewDecoder(file)
	case FormatJSON:
		decoder = json.NewDecoder(file)
	default:
		return nil, fmt.Errorf("unsupported format: %v", format)
	}

	if err := decoder.Decode(&metadata); err != nil {
		return nil, fmt.Errorf("failed to decode metadata: %w", err)
	}

	return &metadata, nil
}
|
||||
|
||||
// DetectFormat reports which metadata format is present in directory,
// checking for a metadata file in TOML, then YAML, then JSON order.
func DetectFormat(directory string) (MetadataFormat, error) {
	formats := []MetadataFormat{FormatTOML, FormatYAML, FormatJSON}

	for _, format := range formats {
		path := filepath.Join(directory, "metadata."+format.String())
		if _, err := os.Stat(path); err == nil {
			return format, nil
		}
	}

	// FormatTOML here is only a placeholder zero result; the error is set.
	return FormatTOML, fmt.Errorf("no metadata file found in %s", directory)
}
|
||||
|
||||
// IsKmerSetDirectory reports whether directory contains a persisted
// KmerSet or KmerSetGroup, returning the detected type string on success.
func IsKmerSetDirectory(directory string) (bool, string, error) {
	metadata, err := loadMetadata(directory)
	if err != nil {
		return false, "", err
	}

	return true, metadata.Type, nil
}
|
||||
|
||||
// ListBitmapFiles liste tous les fichiers .roaring dans un répertoire
|
||||
func ListBitmapFiles(directory string) ([]string, error) {
|
||||
entries, err := os.ReadDir(directory)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read directory %s: %w", directory, err)
|
||||
}
|
||||
|
||||
var files []string
|
||||
for _, entry := range entries {
|
||||
if !entry.IsDir() && strings.HasSuffix(entry.Name(), ".roaring") {
|
||||
files = append(files, entry.Name())
|
||||
}
|
||||
}
|
||||
|
||||
return files, nil
|
||||
}
|
||||
@@ -1,272 +0,0 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestJaccardDistanceIdentical checks that two sets with the same content
// are at distance 0.0 and similarity 1.0.
func TestJaccardDistanceIdentical(t *testing.T) {
	ks1 := NewKmerSet(5)
	ks1.AddKmerCode(100)
	ks1.AddKmerCode(200)
	ks1.AddKmerCode(300)

	ks2 := NewKmerSet(5)
	ks2.AddKmerCode(100)
	ks2.AddKmerCode(200)
	ks2.AddKmerCode(300)

	distance := ks1.JaccardDistance(ks2)
	similarity := ks1.JaccardSimilarity(ks2)

	if distance != 0.0 {
		t.Errorf("Expected distance 0.0 for identical sets, got %f", distance)
	}

	if similarity != 1.0 {
		t.Errorf("Expected similarity 1.0 for identical sets, got %f", similarity)
	}
}
|
||||
|
||||
func TestJaccardDistanceDisjoint(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(100)
|
||||
ks1.AddKmerCode(200)
|
||||
ks1.AddKmerCode(300)
|
||||
|
||||
ks2 := NewKmerSet(5)
|
||||
ks2.AddKmerCode(400)
|
||||
ks2.AddKmerCode(500)
|
||||
ks2.AddKmerCode(600)
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
if distance != 1.0 {
|
||||
t.Errorf("Expected distance 1.0 for disjoint sets, got %f", distance)
|
||||
}
|
||||
|
||||
if similarity != 0.0 {
|
||||
t.Errorf("Expected similarity 0.0 for disjoint sets, got %f", similarity)
|
||||
}
|
||||
}
|
||||
|
||||
// TestJaccardDistancePartialOverlap checks the distance and similarity
// of two sets that share half of their union.
func TestJaccardDistancePartialOverlap(t *testing.T) {
	// Set 1: {1, 2, 3}
	ks1 := NewKmerSet(5)
	ks1.AddKmerCode(1)
	ks1.AddKmerCode(2)
	ks1.AddKmerCode(3)

	// Set 2: {2, 3, 4}
	ks2 := NewKmerSet(5)
	ks2.AddKmerCode(2)
	ks2.AddKmerCode(3)
	ks2.AddKmerCode(4)

	// Intersection: {2, 3} -> cardinality = 2
	// Union: {1, 2, 3, 4} -> cardinality = 4
	// Similarity = 2/4 = 0.5
	// Distance = 1 - 0.5 = 0.5

	distance := ks1.JaccardDistance(ks2)
	similarity := ks1.JaccardSimilarity(ks2)

	expectedDistance := 0.5
	expectedSimilarity := 0.5

	// Compare with a small tolerance to absorb floating-point noise.
	if math.Abs(distance-expectedDistance) > 1e-10 {
		t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
	}

	if math.Abs(similarity-expectedSimilarity) > 1e-10 {
		t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
	}
}
|
||||
|
||||
// TestJaccardDistanceOneSubsetOfOther checks the case where one set is
// a strict subset of the other: the intersection is the smaller set
// and the union is the larger one.
func TestJaccardDistanceOneSubsetOfOther(t *testing.T) {
	// Set 1: {1, 2}
	ks1 := NewKmerSet(5)
	ks1.AddKmerCode(1)
	ks1.AddKmerCode(2)

	// Set 2: {1, 2, 3, 4}
	ks2 := NewKmerSet(5)
	ks2.AddKmerCode(1)
	ks2.AddKmerCode(2)
	ks2.AddKmerCode(3)
	ks2.AddKmerCode(4)

	// Intersection: {1, 2} -> cardinality = 2
	// Union: {1, 2, 3, 4} -> cardinality = 4
	// Similarity = 2/4 = 0.5
	// Distance = 1 - 0.5 = 0.5

	distance := ks1.JaccardDistance(ks2)
	similarity := ks1.JaccardSimilarity(ks2)

	expectedDistance := 0.5
	expectedSimilarity := 0.5

	// Compare with a small tolerance to absorb floating-point noise.
	if math.Abs(distance-expectedDistance) > 1e-10 {
		t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
	}

	if math.Abs(similarity-expectedSimilarity) > 1e-10 {
		t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
	}
}
|
||||
|
||||
func TestJaccardDistanceEmptySets(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks2 := NewKmerSet(5)
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
// By convention, distance = 1.0 for empty sets
|
||||
if distance != 1.0 {
|
||||
t.Errorf("Expected distance 1.0 for empty sets, got %f", distance)
|
||||
}
|
||||
|
||||
if similarity != 0.0 {
|
||||
t.Errorf("Expected similarity 0.0 for empty sets, got %f", similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceOneEmpty(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(1)
|
||||
ks1.AddKmerCode(2)
|
||||
ks1.AddKmerCode(3)
|
||||
|
||||
ks2 := NewKmerSet(5)
|
||||
|
||||
distance := ks1.JaccardDistance(ks2)
|
||||
similarity := ks1.JaccardSimilarity(ks2)
|
||||
|
||||
// Intersection: {} -> cardinality = 0
|
||||
// Union: {1, 2, 3} -> cardinality = 3
|
||||
// Similarity = 0/3 = 0.0
|
||||
// Distance = 1.0
|
||||
|
||||
if distance != 1.0 {
|
||||
t.Errorf("Expected distance 1.0 when one set is empty, got %f", distance)
|
||||
}
|
||||
|
||||
if similarity != 0.0 {
|
||||
t.Errorf("Expected similarity 0.0 when one set is empty, got %f", similarity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJaccardDistanceDifferentK(t *testing.T) {
|
||||
ks1 := NewKmerSet(5)
|
||||
ks1.AddKmerCode(1)
|
||||
|
||||
ks2 := NewKmerSet(7)
|
||||
ks2.AddKmerCode(1)
|
||||
|
||||
defer func() {
|
||||
if r := recover(); r == nil {
|
||||
t.Errorf("Expected panic when computing Jaccard distance with different k values")
|
||||
}
|
||||
}()
|
||||
|
||||
_ = ks1.JaccardDistance(ks2)
|
||||
}
|
||||
|
||||
// TestJaccardDistanceSimilarityRelation checks the identity
// distance + similarity == 1 across several set configurations.
func TestJaccardDistanceSimilarityRelation(t *testing.T) {
	// Test that distance + similarity = 1.0 for all cases
	testCases := []struct {
		name string
		ks1  *KmerSet
		ks2  *KmerSet
	}{
		{
			name: "partial overlap",
			ks1: func() *KmerSet {
				ks := NewKmerSet(5)
				ks.AddKmerCode(1)
				ks.AddKmerCode(2)
				ks.AddKmerCode(3)
				return ks
			}(),
			ks2: func() *KmerSet {
				ks := NewKmerSet(5)
				ks.AddKmerCode(2)
				ks.AddKmerCode(3)
				ks.AddKmerCode(4)
				ks.AddKmerCode(5)
				return ks
			}(),
		},
		{
			name: "identical",
			ks1: func() *KmerSet {
				ks := NewKmerSet(5)
				ks.AddKmerCode(10)
				ks.AddKmerCode(20)
				return ks
			}(),
			ks2: func() *KmerSet {
				ks := NewKmerSet(5)
				ks.AddKmerCode(10)
				ks.AddKmerCode(20)
				return ks
			}(),
		},
		{
			name: "disjoint",
			ks1: func() *KmerSet {
				ks := NewKmerSet(5)
				ks.AddKmerCode(1)
				return ks
			}(),
			ks2: func() *KmerSet {
				ks := NewKmerSet(5)
				ks.AddKmerCode(100)
				return ks
			}(),
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			distance := tc.ks1.JaccardDistance(tc.ks2)
			similarity := tc.ks1.JaccardSimilarity(tc.ks2)

			sum := distance + similarity

			// Tolerance absorbs floating-point rounding in the sum.
			if math.Abs(sum-1.0) > 1e-10 {
				t.Errorf("Expected distance + similarity = 1.0, got %f + %f = %f",
					distance, similarity, sum)
			}
		})
	}
}
|
||||
|
||||
// TestJaccardDistanceSymmetry checks that both the distance and the
// similarity are symmetric: d(a, b) == d(b, a).
func TestJaccardDistanceSymmetry(t *testing.T) {
	ks1 := NewKmerSet(5)
	ks1.AddKmerCode(1)
	ks1.AddKmerCode(2)
	ks1.AddKmerCode(3)

	ks2 := NewKmerSet(5)
	ks2.AddKmerCode(2)
	ks2.AddKmerCode(3)
	ks2.AddKmerCode(4)

	// Evaluate both orders of arguments.
	distance1 := ks1.JaccardDistance(ks2)
	distance2 := ks2.JaccardDistance(ks1)

	similarity1 := ks1.JaccardSimilarity(ks2)
	similarity2 := ks2.JaccardSimilarity(ks1)

	if math.Abs(distance1-distance2) > 1e-10 {
		t.Errorf("Jaccard distance not symmetric: %f vs %f", distance1, distance2)
	}

	if math.Abs(similarity1-similarity2) > 1e-10 {
		t.Errorf("Jaccard similarity not symmetric: %f vs %f", similarity1, similarity2)
	}
}
|
||||
47
pkg/obikmer/minimizer_utils.go
Normal file
47
pkg/obikmer/minimizer_utils.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// DefaultMinimizerSize returns ceil(k / 2.5) as a reasonable default
// minimizer size, then clamps the result first to at least 1 and then
// to at most k-1 (the second clamp wins for degenerate k).
func DefaultMinimizerSize(k int) int {
	size := int(math.Ceil(float64(k) / 2.5))
	if size < 1 {
		size = 1
	}
	if size >= k {
		size = k - 1
	}
	return size
}
|
||||
|
||||
// MinMinimizerSize returns the minimum minimizer size m such that the
// number of distinct minimizers, 4^m, is at least nworkers, i.e.
// ceil(log4(nworkers)). For nworkers <= 1 it returns 1 so a minimizer
// always exists.
//
// The value is computed with integer arithmetic rather than
// math.Ceil(math.Log(n)/math.Log(4)): the floating-point log ratio can
// land an ulp above an integer at exact powers of four (e.g.
// nworkers = 16), making Ceil overshoot by one.
func MinMinimizerSize(nworkers int) int {
	if nworkers <= 1 {
		return 1
	}
	m := 0
	// Invariant: capacity == 4^m; stop as soon as 4^m >= nworkers.
	for capacity := 1; capacity < nworkers; capacity *= 4 {
		m++
	}
	return m
}
|
||||
|
||||
// ValidateMinimizerSize checks and adjusts the minimizer size to satisfy constraints:
|
||||
// - m >= ceil(log(nworkers)/log(4))
|
||||
// - 1 <= m < k
|
||||
func ValidateMinimizerSize(m, k, nworkers int) int {
|
||||
minM := MinMinimizerSize(nworkers)
|
||||
if m < minM {
|
||||
log.Warnf("Minimizer size %d too small for %d workers (4^%d = %d < %d), adjusting to %d",
|
||||
m, nworkers, m, 1<<(2*m), nworkers, minM)
|
||||
m = minM
|
||||
}
|
||||
if m < 1 {
|
||||
m = 1
|
||||
}
|
||||
if m >= k {
|
||||
m = k - 1
|
||||
}
|
||||
return m
|
||||
}
|
||||
67
pkg/obikmer/skm_reader.go
Normal file
67
pkg/obikmer/skm_reader.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/binary"
|
||||
"io"
|
||||
"os"
|
||||
)
|
||||
|
||||
// decode2bit maps 2-bit codes back to nucleotide bytes, in the order
// of the A=00, C=01, G=10, T=11 encoding used by SkmWriter; decoded
// bases are emitted in lowercase.
var decode2bit = [4]byte{'a', 'c', 'g', 't'}

// SkmReader reads super-kmers from a binary .skm file.
// It is the read-side counterpart of SkmWriter and expects the same
// record layout: a little-endian uint16 base count followed by the
// 2-bit-packed sequence bytes.
type SkmReader struct {
	r    *bufio.Reader // buffered reads over the underlying file
	file *os.File      // underlying file, released by Close
}
|
||||
|
||||
// NewSkmReader opens a .skm file for reading.
|
||||
func NewSkmReader(path string) (*SkmReader, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &SkmReader{
|
||||
r: bufio.NewReaderSize(f, 65536),
|
||||
file: f,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Next reads the next super-kmer record from the file.
// It returns the decoded SuperKmer and true, or a zero SuperKmer and
// false when no complete record could be read.
//
// NOTE(review): any read error — including a record truncated mid-file
// — is reported the same way as a clean EOF; callers cannot tell
// corruption apart from end-of-file.
func (sr *SkmReader) Next() (SuperKmer, bool) {
	// Read the record header: sequence length in bases (uint16 LE).
	var lenbuf [2]byte
	if _, err := io.ReadFull(sr.r, lenbuf[:]); err != nil {
		return SuperKmer{}, false
	}
	seqLen := int(binary.LittleEndian.Uint16(lenbuf[:]))

	// Read the packed payload: 2 bits per base, 4 bases per byte.
	nBytes := (seqLen + 3) / 4
	packed := make([]byte, nBytes)
	if _, err := io.ReadFull(sr.r, packed); err != nil {
		return SuperKmer{}, false
	}

	// Decode to nucleotide bytes. Bases are stored most-significant
	// first within each byte, so base i sits at bit offset 6-(i%4)*2.
	seq := make([]byte, seqLen)
	for i := 0; i < seqLen; i++ {
		byteIdx := i / 4
		bitPos := uint(6 - (i%4)*2)
		code := (packed[byteIdx] >> bitPos) & 0x03
		seq[i] = decode2bit[code]
	}

	return SuperKmer{
		Sequence: seq,
		Start:    0,
		End:      seqLen,
	}, true
}
|
||||
|
||||
// Close closes the underlying file.
|
||||
func (sr *SkmReader) Close() error {
|
||||
return sr.file.Close()
|
||||
}
|
||||
176
pkg/obikmer/skm_test.go
Normal file
176
pkg/obikmer/skm_test.go
Normal file
@@ -0,0 +1,176 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestSkmRoundTrip writes the super-kmers extracted from a known
// sequence to a .skm file and reads them back, checking that every
// record survives the 2-bit pack/unpack cycle.
func TestSkmRoundTrip(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "test.skm")

	// Create super-kmers from a known sequence
	seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
	k := 21
	m := 9
	superKmers := ExtractSuperKmers(seq, k, m, nil)
	if len(superKmers) == 0 {
		t.Fatal("no super-kmers extracted")
	}

	// Write
	w, err := NewSkmWriter(path)
	if err != nil {
		t.Fatal(err)
	}
	for _, sk := range superKmers {
		if err := w.Write(sk); err != nil {
			t.Fatal(err)
		}
	}
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	// Read back
	r, err := NewSkmReader(path)
	if err != nil {
		t.Fatal(err)
	}
	defer r.Close()

	idx := 0
	for {
		sk, ok := r.Next()
		if !ok {
			break
		}
		if idx >= len(superKmers) {
			t.Fatal("read more super-kmers than written")
		}
		expected := superKmers[idx]
		if len(sk.Sequence) != len(expected.Sequence) {
			t.Fatalf("super-kmer %d: length mismatch: got %d, want %d",
				idx, len(sk.Sequence), len(expected.Sequence))
		}
		// Compare nucleotide-by-nucleotide (case insensitive since decode produces lowercase)
		for j := range sk.Sequence {
			// |0x20 lowercases ASCII letters on both sides.
			got := sk.Sequence[j] | 0x20
			want := expected.Sequence[j] | 0x20
			if got != want {
				t.Fatalf("super-kmer %d pos %d: got %c, want %c", idx, j, got, want)
			}
		}
		idx++
	}
	if idx != len(superKmers) {
		t.Fatalf("read %d super-kmers, want %d", idx, len(superKmers))
	}
}
|
||||
|
||||
func TestSkmEmptyFile(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "empty.skm")
|
||||
|
||||
// Write nothing
|
||||
w, err := NewSkmWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Read back
|
||||
r, err := NewSkmReader(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
_, ok := r.Next()
|
||||
if ok {
|
||||
t.Fatal("expected no super-kmers in empty file")
|
||||
}
|
||||
}
|
||||
|
||||
// TestSkmSingleBase round-trips sequences of length 1 to 5 so that
// every possible padding remainder (len % 4) of the 2-bit packing is
// exercised.
func TestSkmSingleBase(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "single.skm")

	// Test with sequences of various lengths to check padding
	sequences := [][]byte{
		[]byte("A"),
		[]byte("AC"),
		[]byte("ACG"),
		[]byte("ACGT"),
		[]byte("ACGTA"),
	}

	w, err := NewSkmWriter(path)
	if err != nil {
		t.Fatal(err)
	}
	for _, seq := range sequences {
		sk := SuperKmer{Sequence: seq}
		if err := w.Write(sk); err != nil {
			t.Fatal(err)
		}
	}
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	r, err := NewSkmReader(path)
	if err != nil {
		t.Fatal(err)
	}
	defer r.Close()

	for i, expected := range sequences {
		sk, ok := r.Next()
		if !ok {
			t.Fatalf("expected super-kmer %d, got EOF", i)
		}
		if len(sk.Sequence) != len(expected) {
			t.Fatalf("sk %d: length %d, want %d", i, len(sk.Sequence), len(expected))
		}
		for j := range sk.Sequence {
			// Case-insensitive compare: the reader emits lowercase.
			got := sk.Sequence[j] | 0x20
			want := expected[j] | 0x20
			if got != want {
				t.Fatalf("sk %d pos %d: got %c, want %c", i, j, got, want)
			}
		}
	}
}
|
||||
|
||||
func TestSkmFileSize(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "size.skm")
|
||||
|
||||
// Write a sequence of known length
|
||||
seq := []byte("ACGTACGTAC") // 10 bases
|
||||
sk := SuperKmer{Sequence: seq}
|
||||
|
||||
w, err := NewSkmWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Write(sk); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Expected: 2 bytes (length) + ceil(10/4)=3 bytes (data) = 5 bytes
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if info.Size() != 5 {
|
||||
t.Fatalf("file size: got %d, want 5", info.Size())
|
||||
}
|
||||
}
|
||||
74
pkg/obikmer/skm_writer.go
Normal file
74
pkg/obikmer/skm_writer.go
Normal file
@@ -0,0 +1,74 @@
|
||||
package obikmer
|
||||
|
||||
import (
	"bufio"
	"encoding/binary"
	"fmt"
	"os"
)
|
||||
|
||||
// SkmWriter writes super-kmers to a binary .skm file.
//
// Format per super-kmer:
//
//	[len: uint16 LE]          length of the super-kmer in bases
//	[data: ceil(len/4) bytes] sequence encoded 2 bits/base, packed
//
// Nucleotide encoding: A=00, C=01, G=10, T=11.
// The last byte is zero-padded on the low bits if len%4 != 0.
// Writes are buffered; Close must be called to flush them to disk.
type SkmWriter struct {
	w    *bufio.Writer // buffered writes over the underlying file
	file *os.File      // underlying file, released by Close
}
|
||||
|
||||
// NewSkmWriter creates a new SkmWriter writing to the given file path.
|
||||
func NewSkmWriter(path string) (*SkmWriter, error) {
|
||||
f, err := os.Create(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &SkmWriter{
|
||||
w: bufio.NewWriterSize(f, 65536),
|
||||
file: f,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Write encodes a SuperKmer to the .skm file.
|
||||
// The sequence bytes are packed 2 bits per base.
|
||||
func (sw *SkmWriter) Write(sk SuperKmer) error {
|
||||
seq := sk.Sequence
|
||||
seqLen := uint16(len(seq))
|
||||
|
||||
// Write length
|
||||
var lenbuf [2]byte
|
||||
binary.LittleEndian.PutUint16(lenbuf[:], seqLen)
|
||||
if _, err := sw.w.Write(lenbuf[:]); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Encode and write packed sequence (2 bits/base)
|
||||
nBytes := (int(seqLen) + 3) / 4
|
||||
for i := 0; i < nBytes; i++ {
|
||||
var packed byte
|
||||
for j := 0; j < 4; j++ {
|
||||
pos := i*4 + j
|
||||
packed <<= 2
|
||||
if pos < int(seqLen) {
|
||||
packed |= __single_base_code__[seq[pos]&31]
|
||||
}
|
||||
}
|
||||
if err := sw.w.WriteByte(packed); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Close flushes buffered data and closes the underlying file.
|
||||
func (sw *SkmWriter) Close() error {
|
||||
if err := sw.w.Flush(); err != nil {
|
||||
sw.file.Close()
|
||||
return err
|
||||
}
|
||||
return sw.file.Close()
|
||||
}
|
||||
53
pkg/obikmer/varint.go
Normal file
53
pkg/obikmer/varint.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package obikmer
|
||||
|
||||
import "io"
|
||||
|
||||
// EncodeVarint writes v to w as an unsigned base-128 varint (identical
// to protobuf unsigned varint encoding): 7 payload bits per byte, the
// high bit set on every byte except the last.
// It returns the number of bytes written.
func EncodeVarint(w io.Writer, v uint64) (int, error) {
	var scratch [10]byte // a uint64 varint needs at most 10 bytes
	size := 0
	for ; v >= 0x80; v >>= 7 {
		scratch[size] = byte(v) | 0x80
		size++
	}
	scratch[size] = byte(v)
	size++
	return w.Write(scratch[:size])
}
|
||||
|
||||
// DecodeVarint reads an unsigned base-128 varint from r and returns
// the decoded uint64. Bytes are consumed until one with a clear high
// bit terminates the value; anything longer than the 10 bytes a
// uint64 can need is rejected as malformed.
func DecodeVarint(r io.Reader) (uint64, error) {
	var (
		result uint64
		one    [1]byte
	)
	for shift := uint(0); ; shift += 7 {
		if shift >= 70 {
			// 10 bytes already consumed: no valid uint64 varint is longer.
			return 0, io.ErrUnexpectedEOF
		}
		if _, err := io.ReadFull(r, one[:]); err != nil {
			return 0, err
		}
		result |= uint64(one[0]&0x7F) << shift
		if one[0]&0x80 == 0 {
			return result, nil
		}
	}
}
|
||||
|
||||
// VarintLen returns the number of bytes EncodeVarint would use for v:
// one byte per started group of 7 significant bits, at least one.
func VarintLen(v uint64) int {
	size := 1
	for v >>= 7; v != 0; v >>= 7 {
		size++
	}
	return size
}
|
||||
82
pkg/obikmer/varint_test.go
Normal file
82
pkg/obikmer/varint_test.go
Normal file
@@ -0,0 +1,82 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestVarintRoundTrip encodes and decodes values around every 7-bit
// group boundary (plus both extremes), checking that each value
// survives and that the byte count matches VarintLen.
func TestVarintRoundTrip(t *testing.T) {
	values := []uint64{
		0, 1, 127, 128, 255, 256,
		16383, 16384,
		1<<21 - 1, 1 << 21,
		1<<28 - 1, 1 << 28,
		1<<35 - 1, 1 << 35,
		1<<42 - 1, 1 << 42,
		1<<49 - 1, 1 << 49,
		1<<56 - 1, 1 << 56,
		1<<63 - 1, 1 << 63,
		^uint64(0), // max uint64
	}

	for _, v := range values {
		var buf bytes.Buffer
		n, err := EncodeVarint(&buf, v)
		if err != nil {
			t.Fatalf("EncodeVarint(%d): %v", v, err)
		}
		if n != VarintLen(v) {
			t.Fatalf("EncodeVarint(%d): wrote %d bytes, VarintLen says %d", v, n, VarintLen(v))
		}

		decoded, err := DecodeVarint(&buf)
		if err != nil {
			t.Fatalf("DecodeVarint for %d: %v", v, err)
		}
		if decoded != v {
			t.Fatalf("roundtrip failed: encoded %d, decoded %d", v, decoded)
		}
	}
}
|
||||
|
||||
func TestVarintLen(t *testing.T) {
|
||||
tests := []struct {
|
||||
value uint64
|
||||
expected int
|
||||
}{
|
||||
{0, 1},
|
||||
{127, 1},
|
||||
{128, 2},
|
||||
{16383, 2},
|
||||
{16384, 3},
|
||||
{^uint64(0), 10},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
got := VarintLen(tc.value)
|
||||
if got != tc.expected {
|
||||
t.Errorf("VarintLen(%d) = %d, want %d", tc.value, got, tc.expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestVarintSequence writes several varints back-to-back into a single
// buffer and checks they decode in order — i.e. each decode consumes
// exactly the bytes belonging to its own value.
func TestVarintSequence(t *testing.T) {
	var buf bytes.Buffer
	values := []uint64{0, 42, 1000000, ^uint64(0), 1}

	for _, v := range values {
		if _, err := EncodeVarint(&buf, v); err != nil {
			t.Fatalf("EncodeVarint(%d): %v", v, err)
		}
	}

	for _, expected := range values {
		got, err := DecodeVarint(&buf)
		if err != nil {
			t.Fatalf("DecodeVarint: %v", err)
		}
		if got != expected {
			t.Errorf("got %d, want %d", got, expected)
		}
	}
}
|
||||
@@ -7,8 +7,8 @@ import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
)
|
||||
|
||||
// CLIBuildKmerIndex reads sequences from the iterator and builds a kmer index
|
||||
// saved as a roaring bitmap directory.
|
||||
// CLIBuildKmerIndex reads sequences from the iterator and builds a
|
||||
// disk-based kmer index using the KmerSetGroupBuilder.
|
||||
func CLIBuildKmerIndex(iterator obiiter.IBioSequence) {
|
||||
// Validate output directory
|
||||
outDir := CLIOutputDirectory()
|
||||
@@ -31,64 +31,56 @@ func CLIBuildKmerIndex(iterator obiiter.IBioSequence) {
|
||||
log.Fatalf("Invalid min-occurrence: %d (must be >= 1)", minOcc)
|
||||
}
|
||||
|
||||
// Resolve metadata format
|
||||
format := CLIMetadataFormat()
|
||||
|
||||
log.Infof("Building kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
|
||||
|
||||
if minOcc <= 1 {
|
||||
// Simple KmerSet mode
|
||||
ks := obikmer.BuildKmerIndex(iterator, k, m)
|
||||
// Build options
|
||||
var opts []obikmer.BuilderOption
|
||||
if minOcc > 1 {
|
||||
opts = append(opts, obikmer.WithMinFrequency(minOcc))
|
||||
}
|
||||
|
||||
// Apply metadata
|
||||
applyKmerSetMetadata(ks)
|
||||
// Create builder (1 set, auto partitions)
|
||||
builder, err := obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to create kmer index builder: %v", err)
|
||||
}
|
||||
|
||||
// Save
|
||||
log.Infof("Saving KmerSet to %s", outDir)
|
||||
if err := ks.Save(outDir, format); err != nil {
|
||||
log.Fatalf("Failed to save kmer index: %v", err)
|
||||
}
|
||||
} else {
|
||||
// FrequencyFilter mode
|
||||
ff := obikmer.BuildFrequencyFilterIndex(iterator, k, m, minOcc)
|
||||
|
||||
if CLISaveFullFilter() {
|
||||
// Save the full filter (all levels)
|
||||
applyMetadataGroup(ff.KmerSetGroup)
|
||||
|
||||
log.Infof("Saving full FrequencyFilter to %s", outDir)
|
||||
if err := ff.Save(outDir, format); err != nil {
|
||||
log.Fatalf("Failed to save frequency filter: %v", err)
|
||||
}
|
||||
} else {
|
||||
// Save only the filtered KmerSet (k-mers with freq >= minOcc)
|
||||
ks := ff.GetFilteredSet()
|
||||
applyKmerSetMetadata(ks)
|
||||
ks.SetAttribute("min_occurrence", minOcc)
|
||||
|
||||
log.Infof("Saving filtered KmerSet (freq >= %d) to %s", minOcc, outDir)
|
||||
if err := ks.Save(outDir, format); err != nil {
|
||||
log.Fatalf("Failed to save filtered kmer index: %v", err)
|
||||
}
|
||||
// Feed sequences
|
||||
seqCount := 0
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
for _, seq := range batch.Slice() {
|
||||
builder.AddSequence(0, seq)
|
||||
seqCount++
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("Processed %d sequences", seqCount)
|
||||
|
||||
// Finalize
|
||||
ksg, err := builder.Close()
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to finalize kmer index: %v", err)
|
||||
}
|
||||
|
||||
// Apply metadata
|
||||
applyMetadata(ksg)
|
||||
|
||||
if minOcc > 1 {
|
||||
ksg.SetAttribute("min_occurrence", minOcc)
|
||||
}
|
||||
|
||||
// Persist metadata updates
|
||||
if err := ksg.SaveMetadata(); err != nil {
|
||||
log.Fatalf("Failed to save metadata: %v", err)
|
||||
}
|
||||
|
||||
log.Infof("Index contains %d k-mers in %s", ksg.Len(0), outDir)
|
||||
log.Info("Done.")
|
||||
}
|
||||
|
||||
// applyKmerSetMetadata sets index-id and --set-tag metadata on a KmerSet.
|
||||
func applyKmerSetMetadata(ks *obikmer.KmerSet) {
|
||||
if id := CLIIndexId(); id != "" {
|
||||
ks.SetId(id)
|
||||
}
|
||||
|
||||
for key, value := range CLISetTag() {
|
||||
ks.SetAttribute(key, value)
|
||||
}
|
||||
}
|
||||
|
||||
// applyMetadataGroup sets index-id and --set-tag metadata on a KmerSetGroup.
|
||||
func applyMetadataGroup(ksg *obikmer.KmerSetGroup) {
|
||||
// applyMetadata sets index-id and --set-tag metadata on a KmerSetGroup.
|
||||
func applyMetadata(ksg *obikmer.KmerSetGroup) {
|
||||
if id := CLIIndexId(); id != "" {
|
||||
ksg.SetId(id)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user