diff --git a/.gitignore b/.gitignore index b3d2a3c..4cf23ab 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,4 @@ LLM/** entropy.html bug_id.txt obilowmask_ref +test_* diff --git a/cmd/obitools/obikindex/main.go b/cmd/obitools/obikindex/main.go index 010ceab..cb31985 100644 --- a/cmd/obitools/obikindex/main.go +++ b/cmd/obitools/obikindex/main.go @@ -11,7 +11,7 @@ import ( func main() { optionParser := obioptions.GenerateOptionParser( "obikindex", - "builds a roaring kmer index from sequence files", + "builds a disk-based kmer index from sequence files", obikindex.OptionSet) _, args := optionParser(os.Args) diff --git a/go.mod b/go.mod index 9c0a019..e40a216 100644 --- a/go.mod +++ b/go.mod @@ -14,6 +14,7 @@ require ( github.com/goccy/go-json v0.10.3 github.com/klauspost/pgzip v1.2.6 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 + github.com/pelletier/go-toml/v2 v2.2.4 github.com/rrethy/ahocorasick v1.0.0 github.com/schollz/progressbar/v3 v3.13.1 github.com/sirupsen/logrus v1.9.3 @@ -27,14 +28,10 @@ require ( ) require ( - github.com/RoaringBitmap/roaring v1.9.4 // indirect - github.com/bits-and-blooms/bitset v1.12.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect github.com/kr/pretty v0.3.1 // indirect github.com/kr/text v0.2.0 // indirect - github.com/mschoch/smat v0.2.0 // indirect - github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rogpeppe/go-internal v1.12.0 // indirect ) diff --git a/go.sum b/go.sum index 52d2591..ec95ed3 100644 --- a/go.sum +++ b/go.sum @@ -4,12 +4,8 @@ github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac= github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI= github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8= -github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv2QzDdQ= -github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90= github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0= github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM= -github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA= -github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs= github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= github.com/chen3feng/stl4go v0.1.1 h1:0L1+mDw7pomftKDruM23f1mA7miavOj6C6MZeadzN2Q= @@ -51,8 +47,6 @@ github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZ github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= -github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= -github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0= github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= diff --git a/kmer_roaring_index/FREQUENCY_FILTER_FINAL.md b/kmer_roaring_index/FREQUENCY_FILTER_FINAL.md deleted file mode 100644 index d00be69..0000000 --- a/kmer_roaring_index/FREQUENCY_FILTER_FINAL.md +++ /dev/null @@ -1,292 +0,0 @@ -# Filtre de Fréquence avec v Niveaux de Roaring Bitmaps - -## Algorithme - -```go -Pour chaque k-mer rencontré dans les données: - c = 0 - tant que (k-mer ∈ index[c] ET c < v): - c++ - - si c < v: - index[c].insert(k-mer) -``` - -**Résultat** : `index[v-1]` contient les k-mers vus **≥ v fois** - ---- - -## Exemple d'exécution (v=3) - -``` -Données: - Read1: kmer X - Read2: kmer X - Read3: kmer X (X vu 3 fois) - Read4: kmer Y - Read5: kmer Y (Y vu 2 fois) - Read6: kmer Z (Z vu 1 fois) - -Exécution: - -Read1 (X): - c=0: X ∉ index[0] → index[0].add(X) - État: index[0]={X}, index[1]={}, index[2]={} - -Read2 (X): - c=0: X ∈ index[0] → c=1 - c=1: X ∉ index[1] → index[1].add(X) - État: index[0]={X}, index[1]={X}, index[2]={} - -Read3 (X): - c=0: X ∈ index[0] → c=1 - c=1: X ∈ index[1] → c=2 - c=2: X ∉ index[2] → index[2].add(X) - État: index[0]={X}, index[1]={X}, index[2]={X} - -Read4 (Y): - c=0: Y ∉ index[0] → index[0].add(Y) - État: index[0]={X,Y}, index[1]={X}, index[2]={X} - -Read5 (Y): - c=0: Y ∈ index[0] → c=1 - c=1: Y ∉ index[1] → index[1].add(Y) - État: index[0]={X,Y}, index[1]={X,Y}, index[2]={X} - -Read6 (Z): - c=0: Z ∉ index[0] → index[0].add(Z) - État: index[0]={X,Y,Z}, index[1]={X,Y}, index[2]={X} - -Résultat final: - index[0] (freq≥1): {X, Y, Z} - index[1] (freq≥2): {X, Y} - index[2] (freq≥3): {X} ← K-mers filtrés ✓ -``` - ---- - -## Utilisation - -```go -// Créer le filtre -filter := obikmer.NewFrequencyFilter(31, 3) // k=31, minFreq=3 - -// Ajouter les séquences -for _, read := range reads { - filter.AddSequence(read) -} - -// Récupérer les k-mers filtrés (freq ≥ 3) -filtered := filter.GetFilteredSet("filtered") -fmt.Printf("K-mers de qualité: %d\n", filtered.Cardinality()) - -// Statistiques -stats := filter.Stats() -fmt.Println(stats.String()) -``` - ---- - -## Performance - -### Complexité - -**Par k-mer** : -- Lookups : Moyenne ~v/2, pire cas v -- Insertions : 1 Add -- **Pas de Remove** ✅ - -**Total pour n k-mers** : -- Temps : O(n × v/2) -- Mémoire : O(unique_kmers × v × 2 bytes) - -### Early exit pour distribution skewed - -Avec distribution typique (séquençage) : -``` -80% singletons → 1 lookup (early exit) -15% freq 2-3 → 2-3 lookups -5% freq ≥4 → jusqu'à v lookups - -Moyenne réelle : ~2 lookups/kmer (au lieu de v/2) -``` - ---- - -## Mémoire - -### Pour 10^8 k-mers uniques - -| v (minFreq) | Nombre bitmaps | Mémoire | vs map simple | -|-------------|----------------|---------|---------------| -| v=2 | 2 | ~400 MB | 6x moins | -| v=3 | 3 | ~600 MB | 4x moins | -| v=5 | 5 | ~1 GB | 2.4x moins | -| v=10 | 10 | ~2 GB | 1.2x moins | -| v=20 | 20 | ~4 GB | ~égal | - -**Note** : Avec distribution skewed (beaucoup de singletons), la mémoire réelle est bien plus faible car les niveaux hauts ont peu d'éléments. - -### Exemple réaliste (séquençage) - -Pour 10^8 k-mers totaux, v=3 : -``` -Distribution: - 80% singletons → 80M dans index[0] - 15% freq 2-3 → 15M dans index[1] - 5% freq ≥3 → 5M dans index[2] - -Mémoire: - index[0]: 80M × 2 bytes = 160 MB - index[1]: 15M × 2 bytes = 30 MB - index[2]: 5M × 2 bytes = 10 MB - Total: ~200 MB ✅ - -vs map simple: 80M × 24 bytes = ~2 GB -Réduction: 10x -``` - ---- - -## Comparaison des approches - -| Approche | Mémoire (10^8 kmers) | Passes | Lookups/kmer | Quand utiliser | -|----------|----------------------|--------|--------------|----------------| -| **v-Bitmaps** | **200-600 MB** | **1** | **~2 (avg)** | **Standard** ✅ | -| Map simple | 2.4 GB | 1 | 1 | Si RAM illimitée | -| Multi-pass | 400 MB | v | v | Si I/O pas cher | - ---- - -## Avantages de v-Bitmaps - -✅ **Une seule passe** sur les données -✅ **Mémoire optimale** avec Roaring bitmaps -✅ **Pas de Remove** (seulement Contains + Add) -✅ **Early exit** efficace sur singletons -✅ **Scalable** jusqu'à v~10-20 -✅ **Simple** à implémenter et comprendre - ---- - -## Cas d'usage typiques - -### 1. Éliminer erreurs de séquençage - -```go -filter := obikmer.NewFrequencyFilter(31, 3) - -// Traiter FASTQ -for read := range StreamFastq("sample.fastq") { - filter.AddSequence(read) -} - -// K-mers de qualité (pas d'erreurs) -cleaned := filter.GetFilteredSet("cleaned") -``` - -**Résultat** : Élimine 70-80% des k-mers (erreurs) - -### 2. Assemblage de génome - -```go -filter := obikmer.NewFrequencyFilter(31, 2) - -// Filtrer avant l'assemblage -for read := range reads { - filter.AddSequence(read) -} - -solidKmers := filter.GetFilteredSet("solid") -// Utiliser solidKmers pour le graphe de Bruijn -``` - -### 3. Comparaison de génomes - -```go -collection := obikmer.NewKmerSetCollection(31) - -for _, genome := range genomes { - filter := obikmer.NewFrequencyFilter(31, 3) - filter.AddSequences(genome.Reads) - - cleaned := filter.GetFilteredSet(genome.ID) - collection.Add(cleaned) -} - -// Analyses comparatives sur k-mers de qualité -matrix := collection.ParallelPairwiseJaccard(8) -``` - ---- - -## Limites - -**Pour v > 20** : -- Trop de lookups (v lookups/kmer) -- Mémoire importante (v × 200MB pour 10^8 kmers) - -**Solutions alternatives pour v > 20** : -- Utiliser map simple (9 bytes/kmer) si RAM disponible -- Algorithme différent (sketch, probabiliste) - ---- - -## Optimisations possibles - -### 1. Parallélisation - -```go -// Traiter plusieurs fichiers en parallèle -filters := make([]*FrequencyFilter, numFiles) - -var wg sync.WaitGroup -for i, file := range files { - wg.Add(1) - go func(idx int, f string) { - defer wg.Done() - filters[idx] = ProcessFile(f, k, minFreq) - }(i, file) -} -wg.Wait() - -// Merger les résultats -merged := MergeFilters(filters) -``` - -### 2. Streaming avec seuil adaptatif - -```go -// Commencer avec v=5, réduire progressivement -filter := obikmer.NewFrequencyFilter(31, 5) - -// ... traitement ... - -// Si trop de mémoire, réduire à v=3 -if filter.MemoryUsage() > threshold { - filter = ConvertToLowerThreshold(filter, 3) -} -``` - ---- - -## Récapitulatif final - -**Pour filtrer les k-mers par fréquence ≥ v :** - -1. **Créer** : `filter := NewFrequencyFilter(k, v)` -2. **Traiter** : `filter.AddSequence(read)` pour chaque read -3. **Résultat** : `filtered := filter.GetFilteredSet(id)` - -**Mémoire** : ~2v MB par million de k-mers uniques -**Temps** : Une seule passe, ~2 lookups/kmer en moyenne -**Optimal pour** : v ≤ 20, distribution skewed (séquençage) - ---- - -## Code fourni - -1. **frequency_filter.go** - Implémentation complète -2. **examples_frequency_filter_final.go** - Exemples d'utilisation - -**Tout est prêt à utiliser !** 🚀 diff --git a/kmer_roaring_index/examples_frequency_filter_final.go b/kmer_roaring_index/examples_frequency_filter_final.go deleted file mode 100644 index b2a83d6..0000000 --- a/kmer_roaring_index/examples_frequency_filter_final.go +++ /dev/null @@ -1,320 +0,0 @@ -package main - -import ( - "fmt" - "obikmer" -) - -func main() { - // ========================================== - // EXEMPLE 1 : Utilisation basique - // ========================================== - fmt.Println("=== EXEMPLE 1 : Utilisation basique ===\n") - - k := 31 - minFreq := 3 // Garder les k-mers vus ≥3 fois - - // Créer le filtre - filter := obikmer.NewFrequencyFilter(k, minFreq) - - // Simuler des séquences avec différentes fréquences - sequences := [][]byte{ - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X (freq=2) - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X (freq=3) ✓ - []byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Kmer Y - []byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Kmer Y (freq=2) ✗ - []byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // Kmer Z (freq=1) ✗ - } - - fmt.Printf("Traitement de %d séquences...\n", len(sequences)) - for _, seq := range sequences { - filter.AddSequence(seq) - } - - // Récupérer les k-mers filtrés - filtered := filter.GetFilteredSet("filtered") - fmt.Printf("\nK-mers avec freq ≥ %d: %d\n", minFreq, filtered.Cardinality()) - - // Statistiques - stats := filter.Stats() - fmt.Println("\n" + stats.String()) - - // ========================================== - // EXEMPLE 2 : Vérifier les niveaux - // ========================================== - fmt.Println("\n=== EXEMPLE 2 : Inspection des niveaux ===\n") - - // Vérifier chaque niveau - for level := 0; level < minFreq; level++ { - levelSet := filter.GetKmersAtLevel(level) - fmt.Printf("Niveau %d (freq≥%d): %d k-mers\n", - level+1, level+1, levelSet.Cardinality()) - } - - // ========================================== - // EXEMPLE 3 : Données réalistes - // ========================================== - fmt.Println("\n=== EXEMPLE 3 : Simulation données séquençage ===\n") - - filter2 := obikmer.NewFrequencyFilter(31, 3) - - // Simuler un dataset réaliste : - // - 1000 reads - // - 80% contiennent des erreurs (singletons) - // - 15% vrais k-mers à basse fréquence - // - 5% vrais k-mers à haute fréquence - - // Vraie séquence répétée - trueSeq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG") - for i := 0; i < 50; i++ { - filter2.AddSequence(trueSeq) - } - - // Séquence à fréquence moyenne - mediumSeq := []byte("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC") - for i := 0; i < 5; i++ { - filter2.AddSequence(mediumSeq) - } - - // Erreurs de séquençage (singletons) - for i := 0; i < 100; i++ { - errorSeq := []byte(fmt.Sprintf("TTTTTTTTTTTTTTTTTTTTTTTTTTTT%03d", i)) - filter2.AddSequence(errorSeq) - } - - stats2 := filter2.Stats() - fmt.Println(stats2.String()) - - fmt.Println("Distribution attendue:") - fmt.Println(" - Beaucoup de singletons (erreurs)") - fmt.Println(" - Peu de k-mers à haute fréquence (signal)") - fmt.Println(" → Filtrage efficace !") - - // ========================================== - // EXEMPLE 4 : Tester différents seuils - // ========================================== - fmt.Println("\n=== EXEMPLE 4 : Comparaison de seuils ===\n") - - testSeqs := [][]byte{ - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // freq=5 - []byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), - []byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), - []byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // freq=3 - []byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // freq=1 - } - - for _, minFreq := range []int{2, 3, 5} { - f := obikmer.NewFrequencyFilter(31, minFreq) - f.AddSequences(testSeqs) - - fmt.Printf("minFreq=%d: %d k-mers retenus (%.2f MB)\n", - minFreq, - f.Cardinality(), - float64(f.MemoryUsage())/1024/1024) - } - - // ========================================== - // EXEMPLE 5 : Comparaison mémoire - // ========================================== - fmt.Println("\n=== EXEMPLE 5 : Comparaison mémoire ===\n") - - filter3 := obikmer.NewFrequencyFilter(31, 3) - - // Simuler 10000 séquences - for i := 0; i < 10000; i++ { - seq := make([]byte, 100) - for j := range seq { - seq[j] = "ACGT"[(i+j)%4] - } - filter3.AddSequence(seq) - } - - fmt.Println(filter3.CompareWithSimpleMap()) - - // ========================================== - // EXEMPLE 6 : Workflow complet - // ========================================== - fmt.Println("\n=== EXEMPLE 6 : Workflow complet ===\n") - - fmt.Println("1. Créer le filtre") - finalFilter := obikmer.NewFrequencyFilter(31, 3) - - fmt.Println("2. Traiter les données (simulation)") - // En pratique : lire depuis FASTQ - // for read := range ReadFastq("data.fastq") { - // finalFilter.AddSequence(read) - // } - - // Simulation - for i := 0; i < 1000; i++ { - seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG") - finalFilter.AddSequence(seq) - } - - fmt.Println("3. Récupérer les k-mers filtrés") - result := finalFilter.GetFilteredSet("final") - - fmt.Println("4. Utiliser le résultat") - fmt.Printf(" K-mers de qualité: %d\n", result.Cardinality()) - fmt.Printf(" Mémoire utilisée: %.2f MB\n", float64(finalFilter.MemoryUsage())/1024/1024) - - fmt.Println("5. Sauvegarder (optionnel)") - // result.Save("filtered_kmers.bin") - - // ========================================== - // EXEMPLE 7 : Vérification individuelle - // ========================================== - fmt.Println("\n=== EXEMPLE 7 : Vérification de k-mers spécifiques ===\n") - - checkFilter := obikmer.NewFrequencyFilter(31, 3) - - testSeq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG") - for i := 0; i < 5; i++ { - checkFilter.AddSequence(testSeq) - } - - var kmers []uint64 - kmers = obikmer.EncodeKmers(testSeq, 31, &kmers) - - if len(kmers) > 0 { - testKmer := kmers[0] - - fmt.Printf("K-mer test: 0x%016X\n", testKmer) - fmt.Printf(" Présent dans filtre: %v\n", checkFilter.Contains(testKmer)) - fmt.Printf(" Fréquence approx: %d\n", checkFilter.GetFrequency(testKmer)) - } - - // ========================================== - // EXEMPLE 8 : Intégration avec collection - // ========================================== - fmt.Println("\n=== EXEMPLE 8 : Intégration avec KmerSetCollection ===\n") - - // Créer une collection de génomes filtrés - collection := obikmer.NewKmerSetCollection(31) - - genomes := map[string][][]byte{ - "Genome1": { - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Erreur - }, - "Genome2": { - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // Erreur - }, - } - - for id, sequences := range genomes { - // Filtrer chaque génome - genomeFilter := obikmer.NewFrequencyFilter(31, 3) - genomeFilter.AddSequences(sequences) - - // Ajouter à la collection - filteredSet := genomeFilter.GetFilteredSet(id) - collection.Add(filteredSet) - - fmt.Printf("%s: %d k-mers de qualité\n", id, filteredSet.Cardinality()) - } - - // Analyser la collection - fmt.Println("\nAnalyse comparative:") - collectionStats := collection.ComputeStats() - fmt.Printf(" Core genome: %d k-mers\n", collectionStats.CoreSize) - fmt.Printf(" Pan genome: %d k-mers\n", collectionStats.PanGenomeSize) - - // ========================================== - // RÉSUMÉ - // ========================================== - fmt.Println("\n=== RÉSUMÉ ===\n") - fmt.Println("Le FrequencyFilter permet de:") - fmt.Println(" ✓ Filtrer les k-mers par fréquence minimale") - fmt.Println(" ✓ Utiliser une mémoire optimale avec Roaring bitmaps") - fmt.Println(" ✓ Une seule passe sur les données") - fmt.Println(" ✓ Éliminer efficacement les erreurs de séquençage") - fmt.Println("") - fmt.Println("Workflow typique:") - fmt.Println(" 1. filter := NewFrequencyFilter(k, minFreq)") - fmt.Println(" 2. for each sequence: filter.AddSequence(seq)") - fmt.Println(" 3. filtered := filter.GetFilteredSet(id)") - fmt.Println(" 4. Utiliser filtered dans vos analyses") -} - -// ================================== -// FONCTION HELPER POUR BENCHMARKS -// ================================== - -func BenchmarkFrequencyFilter() { - k := 31 - minFreq := 3 - - // Test avec différentes tailles - sizes := []int{1000, 10000, 100000} - - fmt.Println("\n=== BENCHMARK ===\n") - - for _, size := range sizes { - filter := obikmer.NewFrequencyFilter(k, minFreq) - - // Générer des séquences - for i := 0; i < size; i++ { - seq := make([]byte, 100) - for j := range seq { - seq[j] = "ACGT"[(i+j)%4] - } - filter.AddSequence(seq) - } - - fmt.Printf("Size=%d reads:\n", size) - fmt.Printf(" Filtered k-mers: %d\n", filter.Cardinality()) - fmt.Printf(" Memory: %.2f MB\n", float64(filter.MemoryUsage())/1024/1024) - fmt.Println() - } -} - -// ================================== -// FONCTION POUR DONNÉES RÉELLES -// ================================== - -func ProcessRealData() { - // Exemple pour traiter de vraies données FASTQ - - k := 31 - minFreq := 3 - - filter := obikmer.NewFrequencyFilter(k, minFreq) - - // Pseudo-code pour lire un FASTQ - /* - fastqFile := "sample.fastq" - reader := NewFastqReader(fastqFile) - - for reader.HasNext() { - read := reader.Next() - filter.AddSequence(read.Sequence) - } - - // Récupérer le résultat - filtered := filter.GetFilteredSet("sample_filtered") - filtered.Save("sample_filtered_kmers.bin") - - // Stats - stats := filter.Stats() - fmt.Println(stats.String()) - */ - - fmt.Println("Workflow pour données réelles:") - fmt.Println(" 1. Créer le filtre avec minFreq approprié (2-5 typique)") - fmt.Println(" 2. Stream les reads depuis FASTQ") - fmt.Println(" 3. Récupérer les k-mers filtrés") - fmt.Println(" 4. Utiliser pour assemblage/comparaison/etc.") - - _ = filter // unused -} diff --git a/pkg/obikmer/frequency_filter.go b/pkg/obikmer/frequency_filter.go deleted file mode 100644 index 4bd9fd3..0000000 --- a/pkg/obikmer/frequency_filter.go +++ /dev/null @@ -1,317 +0,0 @@ -package obikmer - -import ( - "fmt" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" -) - -// FrequencyFilter filters k-mers by minimum frequency -// Specialization of KmerSetGroup where index[i] contains k-mers seen at least i+1 times -type FrequencyFilter struct { - *KmerSetGroup // Group of KmerSet (one per frequency level) - MinFreq int // v - minimum required frequency -} - -// NewFrequencyFilter creates a new frequency filter -// minFreq: minimum number d'occurrences required (v) -func NewFrequencyFilter(k, minFreq int) *FrequencyFilter { - ff := &FrequencyFilter{ - KmerSetGroup: NewKmerSetGroup(k, minFreq), - MinFreq: minFreq, - } - - // Initialize group metadata - ff.SetAttribute("type", "FrequencyFilter") - ff.SetAttribute("min_freq", minFreq) - - // Initialize metadata for each level - for i := 0; i < minFreq; i++ { - level := ff.Get(i) - level.SetAttribute("level", i) - level.SetAttribute("min_occurrences", i+1) - level.SetId(fmt.Sprintf("level_%d", i)) - } - - return ff -} - -// AddSequence adds all k-mers from a sequence to the filter -// Uses an iterator to avoid allocating an intermediate vector -func (ff *FrequencyFilter) AddSequence(seq *obiseq.BioSequence) { - rawSeq := seq.Sequence() - for canonical := range IterCanonicalKmers(rawSeq, ff.K()) { - ff.AddKmerCode(canonical) - } -} - -// AddKmerCode adds an encoded k-mer to the filter (main algorithm) -func (ff *FrequencyFilter) AddKmerCode(kmer uint64) { - // Find the current level of the k-mer - c := 0 - for c < ff.MinFreq && ff.Get(c).Contains(kmer) { - c++ - } - - // Add to next level (if not yet at maximum) - if c < ff.MinFreq { - ff.Get(c).AddKmerCode(kmer) - } -} - -// AddCanonicalKmerCode adds an encoded canonical k-mer to the filter -func (ff *FrequencyFilter) AddCanonicalKmerCode(kmer uint64) { - canonical := CanonicalKmer(kmer, ff.K()) - ff.AddKmerCode(canonical) -} - -// AddKmer adds a k-mer to the filter by encoding the sequence -// The sequence must have exactly k nucleotides -// Zero-allocation: encodes directly without creating an intermediate slice -func (ff *FrequencyFilter) AddKmer(seq []byte) { - kmer := EncodeKmer(seq, ff.K()) - ff.AddKmerCode(kmer) -} - -// AddCanonicalKmer adds a canonical k-mer to the filter by encoding the sequence -// The sequence must have exactly k nucleotides -// Zero-allocation: encodes directly in canonical form without creating an intermediate slice -func (ff *FrequencyFilter) AddCanonicalKmer(seq []byte) { - canonical := EncodeCanonicalKmer(seq, ff.K()) - ff.AddKmerCode(canonical) -} - -// GetFilteredSet returns a KmerSet of k-mers with frequency ≥ minFreq -func (ff *FrequencyFilter) GetFilteredSet() *KmerSet { - // Filtered k-mers are in the last level - return ff.Get(ff.MinFreq - 1).Copy() -} - -// GetKmersAtLevel returns a KmerSet of k-mers seen at least (level+1) times -// level doit être dans [0, minFreq-1] -func (ff *FrequencyFilter) GetKmersAtLevel(level int) *KmerSet { - ks := ff.Get(level) - if ks == nil { - return NewKmerSet(ff.K()) - } - return ks.Copy() -} - -// Stats returns statistics on frequency levels -func (ff *FrequencyFilter) Stats() FrequencyFilterStats { - stats := FrequencyFilterStats{ - MinFreq: ff.MinFreq, - Levels: make([]LevelStats, ff.MinFreq), - } - - for i := 0; i < ff.MinFreq; i++ { - ks := ff.Get(i) - card := ks.Len() - sizeBytes := ks.MemoryUsage() - - stats.Levels[i] = LevelStats{ - Level: i + 1, // Level 1 = freq ≥ 1 - Cardinality: card, - SizeBytes: sizeBytes, - } - - stats.TotalBytes += sizeBytes - } - - // The last level contains the result - stats.FilteredKmers = stats.Levels[ff.MinFreq-1].Cardinality - - return stats -} - -// FrequencyFilterStats contains the filter statistics -type FrequencyFilterStats struct { - MinFreq int - FilteredKmers uint64 // K-mers with freq ≥ minFreq - TotalBytes uint64 // Total memory used - Levels []LevelStats -} - -// LevelStats contains the stats of a level -type LevelStats struct { - Level int // freq ≥ Level - Cardinality uint64 // Number of k-mers - SizeBytes uint64 // Size in bytes -} - -func (ffs FrequencyFilterStats) String() string { - result := fmt.Sprintf(`Frequency Filter Statistics (minFreq=%d): - Filtered k-mers (freq≥%d): %d - Total memory: %.2f MB - -Level breakdown: -`, ffs.MinFreq, ffs.MinFreq, ffs.FilteredKmers, float64(ffs.TotalBytes)/1024/1024) - - for _, level := range ffs.Levels { - result += fmt.Sprintf(" freq≥%d: %d k-mers (%.2f MB)\n", - level.Level, - level.Cardinality, - float64(level.SizeBytes)/1024/1024) - } - - return result -} - -// Clear libère la mémoire de tous les niveaux -// (héritée de KmerSetGroup mais redéfinie pour clarté) -func (ff *FrequencyFilter) Clear() { - ff.KmerSetGroup.Clear() -} - -// ================================== -// BATCH PROCESSING -// ================================== - -// AddSequences adds multiple sequences in batch -func (ff *FrequencyFilter) AddSequences(sequences *obiseq.BioSequenceSlice) { - for _, seq := range *sequences { - ff.AddSequence(seq) - } -} - -// AddSequenceSlice adds all k-mers from a slice of sequences to the filter -func (ff *FrequencyFilter) AddSequenceSlice(sequences *obiseq.BioSequenceSlice) { - for _, seq := range *sequences { - ff.AddSequence(seq) - } -} - -// ================================== -// PERSISTANCE -// ================================== - -// Save sauvegarde le FrequencyFilter dans un répertoire -// Utilise le format de sérialisation du KmerSetGroup sous-jacent -// Les métadonnées incluent le type "FrequencyFilter" et min_freq -// -// Format: -// - directory/metadata.{toml,yaml,json} - métadonnées du filtre -// - directory/set_0.roaring - k-mers vus ≥1 fois -// - directory/set_1.roaring - k-mers vus ≥2 fois -// - ... -// - directory/set_{minFreq-1}.roaring - k-mers vus ≥minFreq fois -// -// Parameters: -// - directory: répertoire de destination -// - format: format des métadonnées (FormatTOML, FormatYAML, FormatJSON) -// -// Example: -// -// err := ff.Save("./my_filter", obikmer.FormatTOML) -func (ff *FrequencyFilter) Save(directory string, format MetadataFormat) error { - // Déléguer à KmerSetGroup qui gère déjà tout - return ff.KmerSetGroup.Save(directory, format) -} - -// LoadFrequencyFilter charge un FrequencyFilter depuis un répertoire -// Vérifie que les métadonnées correspondent à un FrequencyFilter -// -// Parameters: -// - directory: répertoire source -// -// Returns: -// - *FrequencyFilter: le filtre chargé -// - error: erreur si le chargement échoue ou si ce n'est pas un FrequencyFilter -// -// Example: -// -// ff, err := obikmer.LoadFrequencyFilter("./my_filter") -func LoadFrequencyFilter(directory string) (*FrequencyFilter, error) { - // Charger le KmerSetGroup - ksg, err := LoadKmerSetGroup(directory) - if err != nil { - return nil, err - } - - // Vérifier que c'est bien un FrequencyFilter - if typeAttr, ok := ksg.GetAttribute("type"); !ok || typeAttr != "FrequencyFilter" { - return nil, fmt.Errorf("loaded data is not a FrequencyFilter (type=%v)", typeAttr) - } - - // Récupérer min_freq - minFreqAttr, ok := ksg.GetIntAttribute("min_freq") - if !ok { - return nil, fmt.Errorf("FrequencyFilter missing min_freq attribute") - } - - // Créer le FrequencyFilter - ff := &FrequencyFilter{ - KmerSetGroup: ksg, - MinFreq: minFreqAttr, - } - - return ff, nil -} - -// ================================== -// UTILITAIRES -// ================================== - -// Contains vérifie si un k-mer a atteint la fréquence minimale -func (ff *FrequencyFilter) Contains(kmer uint64) bool { - canonical := CanonicalKmer(kmer, ff.K()) - return ff.Get(ff.MinFreq - 1).Contains(canonical) -} - -// GetFrequency returns the approximate frequency of a k-mer -// Retourne le niveau maximum atteint (freq ≥ niveau) -func (ff *FrequencyFilter) GetFrequency(kmer uint64) int { - canonical := CanonicalKmer(kmer, ff.K()) - - freq := 0 - for i := 0; i < ff.MinFreq; i++ { - if ff.Get(i).Contains(canonical) { - freq = i + 1 - } else { - break - } - } - - return freq -} - -// Len returns the number of filtered k-mers or at a specific level -// Without argument: returns the number of k-mers with freq ≥ minFreq (last level) -// With argument level: returns the number of k-mers with freq ≥ (level+1) -// Exemple: Len() pour les k-mers filtrés, Len(2) pour freq ≥ 3 -// (héritée de KmerSetGroup mais redéfinie pour la documentation) -func (ff *FrequencyFilter) Len(level ...int) uint64 { - return ff.KmerSetGroup.Len(level...) -} - -// MemoryUsage returns memory usage in bytes -// (héritée de KmerSetGroup mais redéfinie pour clarté) -func (ff *FrequencyFilter) MemoryUsage() uint64 { - return ff.KmerSetGroup.MemoryUsage() -} - -// ================================== -// COMPARAISON AVEC D'AUTRES APPROCHES -// ================================== - -// CompareWithSimpleMap compare la mémoire avec une simple map -func (ff *FrequencyFilter) CompareWithSimpleMap() string { - totalKmers := ff.Get(0).Len() - - simpleMapBytes := totalKmers * 24 // ~24 bytes par entrée - roaringBytes := ff.MemoryUsage() - - reduction := float64(simpleMapBytes) / float64(roaringBytes) - - return fmt.Sprintf(`Memory Comparison for %d k-mers: - Simple map[uint64]uint32: %.2f MB - Roaring filter (v=%d): %.2f MB - Reduction: %.1fx -`, - totalKmers, - float64(simpleMapBytes)/1024/1024, - ff.MinFreq, - float64(roaringBytes)/1024/1024, - reduction, - ) -} diff --git a/pkg/obikmer/kdi_merge.go b/pkg/obikmer/kdi_merge.go new file mode 100644 index 0000000..bf0dcf5 --- /dev/null +++ b/pkg/obikmer/kdi_merge.go @@ -0,0 +1,86 @@ +package obikmer + +import "container/heap" + +// mergeItem represents an element in the min-heap for k-way merge. +type mergeItem struct { + value uint64 + idx int // index of the reader that produced this value +} + +// mergeHeap implements heap.Interface for k-way merge. +type mergeHeap []mergeItem + +func (h mergeHeap) Len() int { return len(h) } +func (h mergeHeap) Less(i, j int) bool { return h[i].value < h[j].value } +func (h mergeHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } +func (h *mergeHeap) Push(x interface{}) { *h = append(*h, x.(mergeItem)) } +func (h *mergeHeap) Pop() interface{} { + old := *h + n := len(old) + x := old[n-1] + *h = old[:n-1] + return x +} + +// KWayMerge performs a k-way merge of multiple sorted KdiReader streams. +// For each unique k-mer value, it reports the value and the number of +// input streams that contained it (count). +type KWayMerge struct { + h mergeHeap + readers []*KdiReader +} + +// NewKWayMerge creates a k-way merge from multiple KdiReaders. +// Each reader must produce values in sorted (ascending) order. +func NewKWayMerge(readers []*KdiReader) *KWayMerge { + m := &KWayMerge{ + h: make(mergeHeap, 0, len(readers)), + readers: readers, + } + + // Initialize heap with first value from each reader + for i, r := range readers { + if v, ok := r.Next(); ok { + m.h = append(m.h, mergeItem{value: v, idx: i}) + } + } + heap.Init(&m.h) + + return m +} + +// Next returns the next smallest k-mer value, the number of readers +// that contained this value (count), and true. +// Returns (0, 0, false) when all streams are exhausted. +func (m *KWayMerge) Next() (kmer uint64, count int, ok bool) { + if len(m.h) == 0 { + return 0, 0, false + } + + minVal := m.h[0].value + count = 0 + + // Pop all items with the same value + for len(m.h) > 0 && m.h[0].value == minVal { + item := heap.Pop(&m.h).(mergeItem) + count++ + // Advance that reader + if v, ok := m.readers[item.idx].Next(); ok { + heap.Push(&m.h, mergeItem{value: v, idx: item.idx}) + } + } + + return minVal, count, true +} + +// Close closes all underlying readers. +func (m *KWayMerge) Close() error { + var firstErr error + for _, r := range m.readers { + if err := r.Close(); err != nil && firstErr == nil { + firstErr = err + } + } + return firstErr +} diff --git a/pkg/obikmer/kdi_merge_test.go b/pkg/obikmer/kdi_merge_test.go new file mode 100644 index 0000000..56aa028 --- /dev/null +++ b/pkg/obikmer/kdi_merge_test.go @@ -0,0 +1,159 @@ +package obikmer + +import ( + "path/filepath" + "testing" +) + +// writeKdi is a helper that writes sorted kmers to a .kdi file. +func writeKdi(t *testing.T, dir, name string, kmers []uint64) string { + t.Helper() + path := filepath.Join(dir, name) + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + for _, v := range kmers { + if err := w.Write(v); err != nil { + t.Fatal(err) + } + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + return path +} + +func TestKWayMergeBasic(t *testing.T) { + dir := t.TempDir() + + // Three sorted streams + p1 := writeKdi(t, dir, "a.kdi", []uint64{1, 3, 5, 7}) + p2 := writeKdi(t, dir, "b.kdi", []uint64{2, 3, 6, 7}) + p3 := writeKdi(t, dir, "c.kdi", []uint64{3, 4, 7, 8}) + + r1, _ := NewKdiReader(p1) + r2, _ := NewKdiReader(p2) + r3, _ := NewKdiReader(p3) + + m := NewKWayMerge([]*KdiReader{r1, r2, r3}) + defer m.Close() + + type result struct { + kmer uint64 + count int + } + var results []result + for { + kmer, count, ok := m.Next() + if !ok { + break + } + results = append(results, result{kmer, count}) + } + + expected := []result{ + {1, 1}, {2, 1}, {3, 3}, {4, 1}, {5, 1}, {6, 1}, {7, 3}, {8, 1}, + } + if len(results) != len(expected) { + t.Fatalf("got %d results, want %d", len(results), len(expected)) + } + for i, exp := range expected { + if results[i] != exp { + t.Errorf("result %d: got %+v, want %+v", i, results[i], exp) + } + } +} + +func TestKWayMergeSingleStream(t *testing.T) { + dir := t.TempDir() + p := writeKdi(t, dir, "a.kdi", []uint64{10, 20, 30}) + + r, _ := NewKdiReader(p) + m := NewKWayMerge([]*KdiReader{r}) + defer m.Close() + + vals := []uint64{10, 20, 30} + for _, expected := range vals { + kmer, count, ok := m.Next() + if !ok { + t.Fatal("unexpected EOF") + } + if kmer != expected || count != 1 { + t.Fatalf("got (%d, %d), want (%d, 1)", kmer, count, expected) + } + } + _, _, ok := m.Next() + if ok { + t.Fatal("expected EOF") + } +} + +func TestKWayMergeEmpty(t *testing.T) { + dir := t.TempDir() + + p1 := writeKdi(t, dir, "a.kdi", nil) + p2 := writeKdi(t, dir, "b.kdi", nil) + + r1, _ := NewKdiReader(p1) + r2, _ := NewKdiReader(p2) + + m := NewKWayMerge([]*KdiReader{r1, r2}) + defer m.Close() + + _, _, ok := m.Next() + if ok { + t.Fatal("expected no results from empty streams") + } +} + +func TestKWayMergeDisjoint(t *testing.T) { + dir := t.TempDir() + + p1 := writeKdi(t, dir, "a.kdi", []uint64{1, 2, 3}) + p2 := writeKdi(t, dir, "b.kdi", []uint64{10, 20, 30}) + + r1, _ := NewKdiReader(p1) + r2, _ := NewKdiReader(p2) + + m := NewKWayMerge([]*KdiReader{r1, r2}) + defer m.Close() + + expected := []uint64{1, 2, 3, 10, 20, 30} + for _, exp := range expected { + kmer, count, ok := m.Next() + if !ok { + t.Fatal("unexpected EOF") + } + if kmer != exp || count != 1 { + t.Fatalf("got (%d, %d), want (%d, 1)", kmer, count, exp) + } + } +} + +func TestKWayMergeAllSame(t *testing.T) { + dir := t.TempDir() + + p1 := writeKdi(t, dir, "a.kdi", []uint64{42}) + p2 := writeKdi(t, dir, "b.kdi", []uint64{42}) + p3 := writeKdi(t, dir, "c.kdi", []uint64{42}) + + r1, _ := NewKdiReader(p1) + r2, _ := NewKdiReader(p2) + r3, _ := NewKdiReader(p3) + + m := NewKWayMerge([]*KdiReader{r1, r2, r3}) + defer m.Close() + + kmer, count, ok := m.Next() + if !ok { + t.Fatal("expected one result") + } + if kmer != 42 || count != 3 { + t.Fatalf("got (%d, %d), want (42, 3)", kmer, count) + } + _, _, ok = m.Next() + if ok { + t.Fatal("expected EOF") + } +} diff --git a/pkg/obikmer/kdi_reader.go b/pkg/obikmer/kdi_reader.go new file mode 100644 index 0000000..22722f9 --- /dev/null +++ b/pkg/obikmer/kdi_reader.go @@ -0,0 +1,96 @@ +package obikmer + +import ( + "bufio" + "encoding/binary" + "fmt" + "io" + "os" +) + +// KdiReader reads k-mers from a .kdi file using streaming delta-varint decoding. +type KdiReader struct { + r *bufio.Reader + file *os.File + count uint64 // total number of k-mers + read uint64 // number of k-mers already consumed + prev uint64 // last decoded value + started bool // whether first value has been read +} + +// NewKdiReader opens a .kdi file for streaming reading. +func NewKdiReader(path string) (*KdiReader, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + r := bufio.NewReaderSize(f, 65536) + + // Read and verify magic + var magic [4]byte + if _, err := io.ReadFull(r, magic[:]); err != nil { + f.Close() + return nil, fmt.Errorf("kdi: read magic: %w", err) + } + if magic != kdiMagic { + f.Close() + return nil, fmt.Errorf("kdi: bad magic %v", magic) + } + + // Read count + var countBuf [8]byte + if _, err := io.ReadFull(r, countBuf[:]); err != nil { + f.Close() + return nil, fmt.Errorf("kdi: read count: %w", err) + } + count := binary.LittleEndian.Uint64(countBuf[:]) + + return &KdiReader{ + r: r, + file: f, + count: count, + }, nil +} + +// Next returns the next k-mer and true, or (0, false) when exhausted. +func (kr *KdiReader) Next() (uint64, bool) { + if kr.read >= kr.count { + return 0, false + } + + if !kr.started { + // Read first value as absolute uint64 LE + var buf [8]byte + if _, err := io.ReadFull(kr.r, buf[:]); err != nil { + return 0, false + } + kr.prev = binary.LittleEndian.Uint64(buf[:]) + kr.started = true + kr.read++ + return kr.prev, true + } + + // Read delta varint + delta, err := DecodeVarint(kr.r) + if err != nil { + return 0, false + } + kr.prev += delta + kr.read++ + return kr.prev, true +} + +// Count returns the total number of k-mers in this partition. +func (kr *KdiReader) Count() uint64 { + return kr.count +} + +// Remaining returns how many k-mers have not been read yet. +func (kr *KdiReader) Remaining() uint64 { + return kr.count - kr.read +} + +// Close closes the underlying file. +func (kr *KdiReader) Close() error { + return kr.file.Close() +} diff --git a/pkg/obikmer/kdi_test.go b/pkg/obikmer/kdi_test.go new file mode 100644 index 0000000..5b94092 --- /dev/null +++ b/pkg/obikmer/kdi_test.go @@ -0,0 +1,255 @@ +package obikmer + +import ( + "os" + "path/filepath" + "sort" + "testing" +) + +func TestKdiRoundTrip(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "test.kdi") + + // Sorted k-mer values + kmers := []uint64{10, 20, 30, 100, 200, 500, 10000, 1 << 40, 1<<62 - 1} + + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + for _, v := range kmers { + if err := w.Write(v); err != nil { + t.Fatal(err) + } + } + if w.Count() != uint64(len(kmers)) { + t.Fatalf("writer count: got %d, want %d", w.Count(), len(kmers)) + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Read back + r, err := NewKdiReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + if r.Count() != uint64(len(kmers)) { + t.Fatalf("reader count: got %d, want %d", r.Count(), len(kmers)) + } + + for i, expected := range kmers { + got, ok := r.Next() + if !ok { + t.Fatalf("unexpected EOF at index %d", i) + } + if got != expected { + t.Fatalf("kmer %d: got %d, want %d", i, got, expected) + } + } + + _, ok := r.Next() + if ok { + t.Fatal("expected EOF after all k-mers") + } +} + +func TestKdiEmpty(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "empty.kdi") + + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + r, err := NewKdiReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + if r.Count() != 0 { + t.Fatalf("expected count 0, got %d", r.Count()) + } + + _, ok := r.Next() + if ok { + t.Fatal("expected no k-mers in empty file") + } +} + +func TestKdiSingleValue(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "single.kdi") + + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + if err := w.Write(42); err != nil { + t.Fatal(err) + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + r, err := NewKdiReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + if r.Count() != 1 { + t.Fatalf("expected count 1, got %d", r.Count()) + } + + v, ok := r.Next() + if !ok { + t.Fatal("expected one k-mer") + } + if v != 42 { + t.Fatalf("got %d, want 42", v) + } +} + +func TestKdiFileSize(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "size.kdi") + + // Write: magic(4) + count(8) + first(8) = 20 bytes + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + if err := w.Write(0); err != nil { + t.Fatal(err) + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + info, err := os.Stat(path) + if err != nil { + t.Fatal(err) + } + // magic(4) + count(8) + first(8) = 20 + if info.Size() != 20 { + t.Fatalf("file size: got %d, want 20", info.Size()) + } +} + +func TestKdiDeltaCompression(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "delta.kdi") + + // Dense consecutive values should compress well + n := 10000 + kmers := make([]uint64, n) + for i := range kmers { + kmers[i] = uint64(i * 2) // even numbers + } + + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + for _, v := range kmers { + if err := w.Write(v); err != nil { + t.Fatal(err) + } + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Each delta is 2, encoded as 1 byte varint + // Total: magic(4) + count(8) + first(8) + (n-1)*1 = 20 + 9999 bytes + info, err := os.Stat(path) + if err != nil { + t.Fatal(err) + } + expected := int64(20 + n - 1) + if info.Size() != expected { + t.Fatalf("file size: got %d, want %d", info.Size(), expected) + } + + // Verify round-trip + r, err := NewKdiReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + for i, expected := range kmers { + got, ok := r.Next() + if !ok { + t.Fatalf("unexpected EOF at index %d", i) + } + if got != expected { + t.Fatalf("kmer %d: got %d, want %d", i, got, expected) + } + } +} + +func TestKdiFromRealKmers(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "real.kdi") + + // Extract k-mers from a sequence, sort, dedup, write to KDI + seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT") + k := 15 + + var kmers []uint64 + for kmer := range IterCanonicalKmers(seq, k) { + kmers = append(kmers, kmer) + } + sort.Slice(kmers, func(i, j int) bool { return kmers[i] < kmers[j] }) + // Dedup + deduped := kmers[:0] + for i, v := range kmers { + if i == 0 || v != kmers[i-1] { + deduped = append(deduped, v) + } + } + + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + for _, v := range deduped { + if err := w.Write(v); err != nil { + t.Fatal(err) + } + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Read back and verify + r, err := NewKdiReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + if r.Count() != uint64(len(deduped)) { + t.Fatalf("count: got %d, want %d", r.Count(), len(deduped)) + } + + for i, expected := range deduped { + got, ok := r.Next() + if !ok { + t.Fatalf("unexpected EOF at index %d", i) + } + if got != expected { + t.Fatalf("kmer %d: got %d, want %d", i, got, expected) + } + } +} diff --git a/pkg/obikmer/kdi_writer.go b/pkg/obikmer/kdi_writer.go new file mode 100644 index 0000000..f853586 --- /dev/null +++ b/pkg/obikmer/kdi_writer.go @@ -0,0 +1,113 @@ +package obikmer + +import ( + "bufio" + "encoding/binary" + "os" +) + +// KDI file magic bytes: "KDI\x01" +var kdiMagic = [4]byte{'K', 'D', 'I', 0x01} + +// KdiWriter writes a sorted sequence of uint64 k-mers to a .kdi file +// using delta-varint encoding. +// +// Format: +// +// [magic: 4 bytes "KDI\x01"] +// [count: uint64 LE] number of k-mers +// [first: uint64 LE] first k-mer (absolute value) +// [delta_1: varint] arr[1] - arr[0] +// [delta_2: varint] arr[2] - arr[1] +// ... +// +// The caller must write k-mers in strictly increasing order. +type KdiWriter struct { + w *bufio.Writer + file *os.File + count uint64 + prev uint64 + first bool + path string +} + +// NewKdiWriter creates a new KdiWriter writing to the given file path. +// The header (magic + count placeholder) is written immediately. +// Count is patched on Close(). +func NewKdiWriter(path string) (*KdiWriter, error) { + f, err := os.Create(path) + if err != nil { + return nil, err + } + w := bufio.NewWriterSize(f, 65536) + + // Write magic + if _, err := w.Write(kdiMagic[:]); err != nil { + f.Close() + return nil, err + } + // Write placeholder for count (will be patched on Close) + var countBuf [8]byte + if _, err := w.Write(countBuf[:]); err != nil { + f.Close() + return nil, err + } + + return &KdiWriter{ + w: w, + file: f, + first: true, + path: path, + }, nil +} + +// Write adds a k-mer to the file. K-mers must be written in strictly +// increasing order. +func (kw *KdiWriter) Write(kmer uint64) error { + if kw.first { + // Write first value as absolute uint64 LE + var buf [8]byte + binary.LittleEndian.PutUint64(buf[:], kmer) + if _, err := kw.w.Write(buf[:]); err != nil { + return err + } + kw.prev = kmer + kw.first = false + } else { + delta := kmer - kw.prev + if _, err := EncodeVarint(kw.w, delta); err != nil { + return err + } + kw.prev = kmer + } + kw.count++ + return nil +} + +// Count returns the number of k-mers written so far. +func (kw *KdiWriter) Count() uint64 { + return kw.count +} + +// Close flushes buffered data, patches the count in the header, +// and closes the file. +func (kw *KdiWriter) Close() error { + if err := kw.w.Flush(); err != nil { + kw.file.Close() + return err + } + + // Patch count at offset 4 (after magic) + if _, err := kw.file.Seek(4, 0); err != nil { + kw.file.Close() + return err + } + var countBuf [8]byte + binary.LittleEndian.PutUint64(countBuf[:], kw.count) + if _, err := kw.file.Write(countBuf[:]); err != nil { + kw.file.Close() + return err + } + + return kw.file.Close() +} diff --git a/pkg/obikmer/kmer_index_builder.go b/pkg/obikmer/kmer_index_builder.go deleted file mode 100644 index 4f5e49a..0000000 --- a/pkg/obikmer/kmer_index_builder.go +++ /dev/null @@ -1,204 +0,0 @@ -package obikmer - -import ( - "math" - "sync" - - log "github.com/sirupsen/logrus" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" -) - -// DefaultMinimizerSize returns ceil(k / 2.5) as a reasonable default minimizer size. -func DefaultMinimizerSize(k int) int { - m := int(math.Ceil(float64(k) / 2.5)) - if m < 1 { - m = 1 - } - if m >= k { - m = k - 1 - } - return m -} - -// MinMinimizerSize returns the minimum m such that 4^m >= nworkers, -// i.e. ceil(log(nworkers) / log(4)). -func MinMinimizerSize(nworkers int) int { - if nworkers <= 1 { - return 1 - } - return int(math.Ceil(math.Log(float64(nworkers)) / math.Log(4))) -} - -// ValidateMinimizerSize checks and adjusts the minimizer size to satisfy constraints: -// - m >= ceil(log(nworkers)/log(4)) -// - 1 <= m < k -func ValidateMinimizerSize(m, k, nworkers int) int { - minM := MinMinimizerSize(nworkers) - if m < minM { - log.Warnf("Minimizer size %d too small for %d workers (4^%d = %d < %d), adjusting to %d", - m, nworkers, m, 1<<(2*m), nworkers, minM) - m = minM - } - if m < 1 { - m = 1 - } - if m >= k { - m = k - 1 - } - return m -} - -// BuildKmerIndex builds a KmerSet from an iterator using parallel super-kmer partitioning. -// -// The algorithm: -// 1. Extract super-kmers from each sequence using IterSuperKmers -// 2. Route each super-kmer to a worker based on minimizer % nworkers -// 3. Each worker extracts canonical k-mers and adds them to its local KmerSet -// 4. Merge all KmerSets via Union -// -// Parameters: -// - iterator: source of BioSequence batches -// - k: k-mer size (1-31) -// - m: minimizer size (1 to k-1) -func BuildKmerIndex(iterator obiiter.IBioSequence, k, m int) *KmerSet { - nproc := obidefault.ParallelWorkers() - m = ValidateMinimizerSize(m, k, nproc) - - // Channels to route super-kmers to workers - channels := make([]chan SuperKmer, nproc) - for i := range channels { - channels[i] = make(chan SuperKmer, 1024) - } - - // Workers: each manages a partition of the minimizer space - sets := make([]*KmerSet, nproc) - waiter := sync.WaitGroup{} - waiter.Add(nproc) - for i := 0; i < nproc; i++ { - sets[i] = NewKmerSet(k) - go func(ch chan SuperKmer, ks *KmerSet) { - defer waiter.Done() - for sk := range ch { - for kmer := range IterCanonicalKmers(sk.Sequence, k) { - ks.AddKmerCode(kmer) - } - } - }(channels[i], sets[i]) - } - - // Reader: extract super-kmers and route them - seqCount := 0 - for iterator.Next() { - batch := iterator.Get() - for _, seq := range batch.Slice() { - rawSeq := seq.Sequence() - if len(rawSeq) < k { - continue - } - for sk := range IterSuperKmers(rawSeq, k, m) { - worker := int(sk.Minimizer % uint64(nproc)) - channels[worker] <- sk - } - seqCount++ - } - } - - // Close channels to signal workers to finish - for _, ch := range channels { - close(ch) - } - waiter.Wait() - - log.Infof("Processed %d sequences", seqCount) - - // Merge partitions (mostly disjoint -> fast union) - result := sets[0] - for i := 1; i < nproc; i++ { - result.bitmap.Or(sets[i].bitmap) - } - - log.Infof("Index contains %d k-mers (%.2f MB)", - result.Len(), float64(result.MemoryUsage())/1024/1024) - - return result -} - -// BuildFrequencyFilterIndex builds a FrequencyFilter from an iterator -// using parallel super-kmer partitioning. -// -// Each worker manages its own FrequencyFilter for its partition of the -// minimizer space. Since all k-mers sharing a minimizer go to the same worker, -// the frequency counting is correct per partition. -// -// Parameters: -// - iterator: source of BioSequence batches -// - k: k-mer size (1-31) -// - m: minimizer size (1 to k-1) -// - minFreq: minimum frequency threshold (>= 1) -func BuildFrequencyFilterIndex(iterator obiiter.IBioSequence, k, m, minFreq int) *FrequencyFilter { - nproc := obidefault.ParallelWorkers() - m = ValidateMinimizerSize(m, k, nproc) - - // Channels to route super-kmers to workers - channels := make([]chan SuperKmer, nproc) - for i := range channels { - channels[i] = make(chan SuperKmer, 1024) - } - - // Workers: each manages a local FrequencyFilter - filters := make([]*FrequencyFilter, nproc) - waiter := sync.WaitGroup{} - waiter.Add(nproc) - for i := 0; i < nproc; i++ { - filters[i] = NewFrequencyFilter(k, minFreq) - go func(ch chan SuperKmer, ff *FrequencyFilter) { - defer waiter.Done() - for sk := range ch { - for kmer := range IterCanonicalKmers(sk.Sequence, k) { - ff.AddKmerCode(kmer) - } - } - }(channels[i], filters[i]) - } - - // Reader: extract super-kmers and route them - seqCount := 0 - for iterator.Next() { - batch := iterator.Get() - for _, seq := range batch.Slice() { - rawSeq := seq.Sequence() - if len(rawSeq) < k { - continue - } - for sk := range IterSuperKmers(rawSeq, k, m) { - worker := int(sk.Minimizer % uint64(nproc)) - channels[worker] <- sk - } - seqCount++ - } - } - - // Close channels to signal workers to finish - for _, ch := range channels { - close(ch) - } - waiter.Wait() - - log.Infof("Processed %d sequences", seqCount) - - // Merge FrequencyFilters: union level by level - result := filters[0] - for i := 1; i < nproc; i++ { - for level := 0; level < minFreq; level++ { - result.Get(level).bitmap.Or(filters[i].Get(level).bitmap) - } - } - - stats := result.Stats() - log.Infof("FrequencyFilter: %d k-mers with freq >= %d (%.2f MB total)", - stats.FilteredKmers, minFreq, float64(stats.TotalBytes)/1024/1024) - - return result -} diff --git a/pkg/obikmer/kmer_set.go b/pkg/obikmer/kmer_set.go deleted file mode 100644 index ad56e8f..0000000 --- a/pkg/obikmer/kmer_set.go +++ /dev/null @@ -1,224 +0,0 @@ -package obikmer - -import ( - "fmt" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "github.com/RoaringBitmap/roaring/roaring64" -) - -// KmerSet wraps a set of k-mers stored in a Roaring Bitmap -// Provides utility methods for manipulating k-mer sets -type KmerSet struct { - id string // Unique identifier of the KmerSet - k int // Size of k-mers (immutable) - bitmap *roaring64.Bitmap // Bitmap containing the k-mers - Metadata map[string]interface{} // User metadata (key=atomic value) -} - -// NewKmerSet creates a new empty KmerSet -func NewKmerSet(k int) *KmerSet { - return &KmerSet{ - k: k, - bitmap: roaring64.New(), - Metadata: make(map[string]interface{}), - } -} - -// NewKmerSetFromBitmap creates a KmerSet from an existing bitmap -func NewKmerSetFromBitmap(k int, bitmap *roaring64.Bitmap) *KmerSet { - return &KmerSet{ - k: k, - bitmap: bitmap, - Metadata: make(map[string]interface{}), - } -} - -// K returns the size of k-mers (immutable) -func (ks *KmerSet) K() int { - return ks.k -} - -// AddKmerCode adds an encoded k-mer to the set -func (ks *KmerSet) AddKmerCode(kmer uint64) { - ks.bitmap.Add(kmer) -} - -// AddCanonicalKmerCode adds an encoded canonical k-mer to the set -func (ks *KmerSet) AddCanonicalKmerCode(kmer uint64) { - canonical := CanonicalKmer(kmer, ks.k) - ks.bitmap.Add(canonical) -} - -// AddKmer adds a k-mer to the set by encoding the sequence -// The sequence must have exactly k nucleotides -// Zero-allocation: encodes directly without creating an intermediate slice -func (ks *KmerSet) AddKmer(seq []byte) { - kmer := EncodeKmer(seq, ks.k) - ks.bitmap.Add(kmer) -} - -// AddCanonicalKmer adds a canonical k-mer to the set by encoding the sequence -// The sequence must have exactly k nucleotides -// Zero-allocation: encodes directly in canonical form without creating an intermediate slice -func (ks *KmerSet) AddCanonicalKmer(seq []byte) { - canonical := EncodeCanonicalKmer(seq, ks.k) - ks.bitmap.Add(canonical) -} - -// AddSequence adds all k-mers from a sequence to the set -// Uses an iterator to avoid allocating an intermediate vector -func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) { - rawSeq := seq.Sequence() - for canonical := range IterCanonicalKmers(rawSeq, ks.k) { - ks.bitmap.Add(canonical) - } -} - -// AddSequences adds all k-mers from multiple sequences in batch -func (ks *KmerSet) AddSequences(sequences *obiseq.BioSequenceSlice) { - for _, seq := range *sequences { - ks.AddSequence(seq) - } -} - -// AddSequenceSlice adds all k-mers from a slice of sequences -func (ks *KmerSet) AddSequenceSlice(sequences *obiseq.BioSequenceSlice) { - for _, seq := range *sequences { - ks.AddSequence(seq) - } -} - -// Contains checks if a k-mer is in the set -func (ks *KmerSet) Contains(kmer uint64) bool { - return ks.bitmap.Contains(kmer) -} - -// Len returns the number of k-mers in the set -func (ks *KmerSet) Len() uint64 { - return ks.bitmap.GetCardinality() -} - -// MemoryUsage returns memory usage in bytes -func (ks *KmerSet) MemoryUsage() uint64 { - return ks.bitmap.GetSizeInBytes() -} - -// Clear empties the set -func (ks *KmerSet) Clear() { - ks.bitmap.Clear() -} - -// Copy creates a copy of the set (consistent with BioSequence.Copy) -func (ks *KmerSet) Copy() *KmerSet { - // Copy metadata - metadata := make(map[string]interface{}, len(ks.Metadata)) - for k, v := range ks.Metadata { - metadata[k] = v - } - - return &KmerSet{ - id: ks.id, - k: ks.k, - bitmap: ks.bitmap.Clone(), - Metadata: metadata, - } -} - -// Id returns the identifier of the KmerSet (consistent with BioSequence.Id) -func (ks *KmerSet) Id() string { - return ks.id -} - -// SetId sets the identifier of the KmerSet (consistent with BioSequence.SetId) -func (ks *KmerSet) SetId(id string) { - ks.id = id -} - -// Union returns the union of this set with another -func (ks *KmerSet) Union(other *KmerSet) *KmerSet { - if ks.k != other.k { - panic(fmt.Sprintf("Cannot union KmerSets with different k values: %d vs %d", ks.k, other.k)) - } - result := ks.bitmap.Clone() - result.Or(other.bitmap) - return NewKmerSetFromBitmap(ks.k, result) -} - -// Intersect returns the intersection of this set with another -func (ks *KmerSet) Intersect(other *KmerSet) *KmerSet { - if ks.k != other.k { - panic(fmt.Sprintf("Cannot intersect KmerSets with different k values: %d vs %d", ks.k, other.k)) - } - result := ks.bitmap.Clone() - result.And(other.bitmap) - return NewKmerSetFromBitmap(ks.k, result) -} - -// Difference returns the difference of this set with another (this - other) -func (ks *KmerSet) Difference(other *KmerSet) *KmerSet { - if ks.k != other.k { - panic(fmt.Sprintf("Cannot subtract KmerSets with different k values: %d vs %d", ks.k, other.k)) - } - result := ks.bitmap.Clone() - result.AndNot(other.bitmap) - return NewKmerSetFromBitmap(ks.k, result) -} - -// JaccardDistance computes the Jaccard distance between two KmerSets. -// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A ∪ B|) -// where A and B are the two sets. -// -// Returns: -// - 0.0 when sets are identical (distance = 0, similarity = 1) -// - 1.0 when sets are completely disjoint (distance = 1, similarity = 0) -// - 1.0 when both sets are empty (by convention) -// -// Time complexity: O(|A| + |B|) for Roaring Bitmap operations -// Space complexity: O(1) as operations are done in-place on temporary bitmaps -func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 { - if ks.k != other.k { - panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k)) - } - - // Compute intersection cardinality - intersectionCard := ks.bitmap.AndCardinality(other.bitmap) - - // Compute union cardinality - unionCard := ks.bitmap.OrCardinality(other.bitmap) - - // If union is empty, both sets are empty - return 1.0 by convention - if unionCard == 0 { - return 1.0 - } - - // Jaccard similarity = |A ∩ B| / |A ∪ B| - similarity := float64(intersectionCard) / float64(unionCard) - - // Jaccard distance = 1 - similarity - return 1.0 - similarity -} - -// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets. -// The Jaccard similarity is defined as: |A ∩ B| / |A ∪ B| -// -// Returns: -// - 1.0 when sets are identical (maximum similarity) -// - 0.0 when sets are completely disjoint (no similarity) -// - 0.0 when both sets are empty (by convention) -// -// Time complexity: O(|A| + |B|) for Roaring Bitmap operations -// Space complexity: O(1) as operations are done in-place on temporary bitmaps -func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 { - return 1.0 - ks.JaccardDistance(other) -} - -// Iterator returns an iterator over all k-mers in the set -func (ks *KmerSet) Iterator() roaring64.IntIterable64 { - return ks.bitmap.Iterator() -} - -// Bitmap returns the underlying bitmap (for compatibility) -func (ks *KmerSet) Bitmap() *roaring64.Bitmap { - return ks.bitmap -} diff --git a/pkg/obikmer/kmer_set_attributes.go b/pkg/obikmer/kmer_set_attributes.go deleted file mode 100644 index 82151f8..0000000 --- a/pkg/obikmer/kmer_set_attributes.go +++ /dev/null @@ -1,362 +0,0 @@ -package obikmer - -import ( - "fmt" - "strconv" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" -) - -// ================================== -// KMER SET ATTRIBUTE API -// Mimic BioSequence attribute API from obiseq/attributes.go -// ================================== - -// HasAttribute vérifie si une clé d'attribut existe -func (ks *KmerSet) HasAttribute(key string) bool { - _, ok := ks.Metadata[key] - return ok -} - -// GetAttribute récupère la valeur d'un attribut -// Cas particuliers: "id" utilise Id(), "k" utilise K() -func (ks *KmerSet) GetAttribute(key string) (interface{}, bool) { - switch key { - case "id": - return ks.Id(), true - case "k": - return ks.K(), true - default: - value, ok := ks.Metadata[key] - return value, ok - } -} - -// SetAttribute sets the value of an attribute -// Cas particuliers: "id" utilise SetId(), "k" est immutable (panique) -func (ks *KmerSet) SetAttribute(key string, value interface{}) { - switch key { - case "id": - if id, ok := value.(string); ok { - ks.SetId(id) - } else { - panic(fmt.Sprintf("id must be a string, got %T", value)) - } - case "k": - panic("k is immutable and cannot be modified via SetAttribute") - default: - ks.Metadata[key] = value - } -} - -// DeleteAttribute supprime un attribut -func (ks *KmerSet) DeleteAttribute(key string) { - delete(ks.Metadata, key) -} - -// RemoveAttribute supprime un attribut (alias de DeleteAttribute) -func (ks *KmerSet) RemoveAttribute(key string) { - ks.DeleteAttribute(key) -} - -// RenameAttribute renomme un attribut -func (ks *KmerSet) RenameAttribute(newName, oldName string) { - if value, ok := ks.Metadata[oldName]; ok { - ks.Metadata[newName] = value - delete(ks.Metadata, oldName) - } -} - -// GetIntAttribute récupère un attribut en tant qu'entier -func (ks *KmerSet) GetIntAttribute(key string) (int, bool) { - value, ok := ks.Metadata[key] - if !ok { - return 0, false - } - - switch v := value.(type) { - case int: - return v, true - case int64: - return int(v), true - case float64: - return int(v), true - case string: - if i, err := strconv.Atoi(v); err == nil { - return i, true - } - } - return 0, false -} - -// GetFloatAttribute récupère un attribut en tant que float64 -func (ks *KmerSet) GetFloatAttribute(key string) (float64, bool) { - value, ok := ks.Metadata[key] - if !ok { - return 0, false - } - - switch v := value.(type) { - case float64: - return v, true - case float32: - return float64(v), true - case int: - return float64(v), true - case int64: - return float64(v), true - case string: - if f, err := strconv.ParseFloat(v, 64); err == nil { - return f, true - } - } - return 0, false -} - -// GetNumericAttribute récupère un attribut numérique (alias de GetFloatAttribute) -func (ks *KmerSet) GetNumericAttribute(key string) (float64, bool) { - return ks.GetFloatAttribute(key) -} - -// GetStringAttribute récupère un attribut en tant que chaîne -func (ks *KmerSet) GetStringAttribute(key string) (string, bool) { - value, ok := ks.Metadata[key] - if !ok { - return "", false - } - - switch v := value.(type) { - case string: - return v, true - default: - return fmt.Sprintf("%v", v), true - } -} - -// GetBoolAttribute récupère un attribut en tant que booléen -func (ks *KmerSet) GetBoolAttribute(key string) (bool, bool) { - value, ok := ks.Metadata[key] - if !ok { - return false, false - } - - switch v := value.(type) { - case bool: - return v, true - case int: - return v != 0, true - case string: - if b, err := strconv.ParseBool(v); err == nil { - return b, true - } - } - return false, false -} - -// AttributeKeys returns the set of attribute keys -func (ks *KmerSet) AttributeKeys() obiutils.Set[string] { - keys := obiutils.MakeSet[string]() - for key := range ks.Metadata { - keys.Add(key) - } - return keys -} - -// Keys returns the set of attribute keys (alias of AttributeKeys) -func (ks *KmerSet) Keys() obiutils.Set[string] { - return ks.AttributeKeys() -} - -// ================================== -// KMER SET GROUP ATTRIBUTE API -// Métadonnées du groupe + accès via Get() pour les sets individuels -// ================================== - -// HasAttribute vérifie si une clé d'attribut existe pour le groupe -func (ksg *KmerSetGroup) HasAttribute(key string) bool { - _, ok := ksg.Metadata[key] - return ok -} - -// GetAttribute récupère la valeur d'un attribut du groupe -// Cas particuliers: "id" utilise Id(), "k" utilise K() -func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool) { - switch key { - case "id": - return ksg.Id(), true - case "k": - return ksg.K(), true - default: - value, ok := ksg.Metadata[key] - return value, ok - } -} - -// SetAttribute sets the value of an attribute du groupe -// Cas particuliers: "id" utilise SetId(), "k" est immutable (panique) -func (ksg *KmerSetGroup) SetAttribute(key string, value interface{}) { - switch key { - case "id": - if id, ok := value.(string); ok { - ksg.SetId(id) - } else { - panic(fmt.Sprintf("id must be a string, got %T", value)) - } - case "k": - panic("k is immutable and cannot be modified via SetAttribute") - default: - ksg.Metadata[key] = value - } -} - -// DeleteAttribute supprime un attribut du groupe -func (ksg *KmerSetGroup) DeleteAttribute(key string) { - delete(ksg.Metadata, key) -} - -// RemoveAttribute supprime un attribut du groupe (alias) -func (ksg *KmerSetGroup) RemoveAttribute(key string) { - ksg.DeleteAttribute(key) -} - -// RenameAttribute renomme un attribut du groupe -func (ksg *KmerSetGroup) RenameAttribute(newName, oldName string) { - if value, ok := ksg.Metadata[oldName]; ok { - ksg.Metadata[newName] = value - delete(ksg.Metadata, oldName) - } -} - -// GetIntAttribute récupère un attribut entier du groupe -func (ksg *KmerSetGroup) GetIntAttribute(key string) (int, bool) { - value, ok := ksg.GetAttribute(key) - if !ok { - return 0, false - } - - switch v := value.(type) { - case int: - return v, true - case int64: - return int(v), true - case float64: - return int(v), true - case string: - if i, err := strconv.Atoi(v); err == nil { - return i, true - } - } - return 0, false -} - -// GetFloatAttribute récupère un attribut float64 du groupe -func (ksg *KmerSetGroup) GetFloatAttribute(key string) (float64, bool) { - value, ok := ksg.GetAttribute(key) - if !ok { - return 0, false - } - - switch v := value.(type) { - case float64: - return v, true - case float32: - return float64(v), true - case int: - return float64(v), true - case int64: - return float64(v), true - case string: - if f, err := strconv.ParseFloat(v, 64); err == nil { - return f, true - } - } - return 0, false -} - -// GetNumericAttribute récupère un attribut numérique du groupe -func (ksg *KmerSetGroup) GetNumericAttribute(key string) (float64, bool) { - return ksg.GetFloatAttribute(key) -} - -// GetStringAttribute récupère un attribut chaîne du groupe -func (ksg *KmerSetGroup) GetStringAttribute(key string) (string, bool) { - value, ok := ksg.GetAttribute(key) - if !ok { - return "", false - } - - switch v := value.(type) { - case string: - return v, true - default: - return fmt.Sprintf("%v", v), true - } -} - -// GetBoolAttribute récupère un attribut booléen du groupe -func (ksg *KmerSetGroup) GetBoolAttribute(key string) (bool, bool) { - value, ok := ksg.GetAttribute(key) - if !ok { - return false, false - } - - switch v := value.(type) { - case bool: - return v, true - case int: - return v != 0, true - case string: - if b, err := strconv.ParseBool(v); err == nil { - return b, true - } - } - return false, false -} - -// AttributeKeys returns the set of attribute keys du groupe -func (ksg *KmerSetGroup) AttributeKeys() obiutils.Set[string] { - keys := obiutils.MakeSet[string]() - for key := range ksg.Metadata { - keys.Add(key) - } - return keys -} - -// Keys returns the set of group attribute keys (alias) -func (ksg *KmerSetGroup) Keys() obiutils.Set[string] { - return ksg.AttributeKeys() -} - -// ================================== -// MÉTHODES POUR ACCÉDER AUX ATTRIBUTS DES SETS INDIVIDUELS VIA Get() -// Architecture zero-copy: ksg.Get(i).SetAttribute(...) -// ================================== - -// Exemple d'utilisation: -// Pour accéder aux métadonnées d'un KmerSet individuel dans un groupe: -// ks := ksg.Get(0) -// ks.SetAttribute("level", 1) -// hasLevel := ks.HasAttribute("level") -// -// Pour les métadonnées du groupe: -// ksg.SetAttribute("name", "FrequencyFilter") -// name, ok := ksg.GetStringAttribute("name") - -// AllAttributeKeys returns all unique attribute keys of the group AND all its sets -func (ksg *KmerSetGroup) AllAttributeKeys() obiutils.Set[string] { - keys := obiutils.MakeSet[string]() - - // Ajouter les clés du groupe - for key := range ksg.Metadata { - keys.Add(key) - } - - // Ajouter les clés de chaque set - for _, ks := range ksg.sets { - for key := range ks.Metadata { - keys.Add(key) - } - } - - return keys -} diff --git a/pkg/obikmer/kmer_set_builder.go b/pkg/obikmer/kmer_set_builder.go new file mode 100644 index 0000000..b987093 --- /dev/null +++ b/pkg/obikmer/kmer_set_builder.go @@ -0,0 +1,361 @@ +package obikmer + +import ( + "fmt" + "math" + "os" + "path/filepath" + "runtime" + "sort" + "sync" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) + +// BuilderOption is a functional option for KmerSetGroupBuilder. +type BuilderOption func(*builderConfig) + +type builderConfig struct { + minFreq int // 0 means no frequency filtering (simple dedup) +} + +// WithMinFrequency activates frequency filtering mode. +// Only k-mers seen >= minFreq times are kept in the final index. +func WithMinFrequency(minFreq int) BuilderOption { + return func(c *builderConfig) { + c.minFreq = minFreq + } +} + +// KmerSetGroupBuilder constructs a KmerSetGroup on disk. +// During construction, super-kmers are written to temporary .skm files +// partitioned by minimizer. On Close(), each partition is finalized +// (sort, dedup, optional frequency filter) into .kdi files. +type KmerSetGroupBuilder struct { + dir string + k int + m int + n int // number of sets + P int // number of partitions + config builderConfig + writers [][]*SkmWriter // [setIndex][partIndex] + mu [][]sync.Mutex // per-writer mutex for concurrent access + closed bool +} + +// NewKmerSetGroupBuilder creates a builder for a new KmerSetGroup. +// +// Parameters: +// - directory: destination directory (created if necessary) +// - k: k-mer size (1-31) +// - m: minimizer size (-1 for auto = ceil(k/2.5)) +// - n: number of sets in the group +// - P: number of partitions (-1 for auto) +// - options: optional builder options (e.g. WithMinFrequency) +func NewKmerSetGroupBuilder(directory string, k, m, n, P int, + options ...BuilderOption) (*KmerSetGroupBuilder, error) { + + if k < 2 || k > 31 { + return nil, fmt.Errorf("obikmer: k must be between 2 and 31, got %d", k) + } + if n < 1 { + return nil, fmt.Errorf("obikmer: n must be >= 1, got %d", n) + } + + // Auto minimizer size + if m < 0 { + m = int(math.Ceil(float64(k) / 2.5)) + } + if m < 1 { + m = 1 + } + if m >= k { + m = k - 1 + } + + // Auto partition count + if P < 0 { + // Use 4^m as the maximum, capped at a reasonable value + maxP := 1 << (2 * m) // 4^m + P = maxP + if P > 4096 { + P = 4096 + } + if P < 64 { + P = 64 + } + } + + // Apply options + var config builderConfig + for _, opt := range options { + opt(&config) + } + + // Create build directory structure + buildDir := filepath.Join(directory, ".build") + for s := 0; s < n; s++ { + setDir := filepath.Join(buildDir, fmt.Sprintf("set_%d", s)) + if err := os.MkdirAll(setDir, 0755); err != nil { + return nil, fmt.Errorf("obikmer: create build dir: %w", err) + } + } + + // Create SKM writers + writers := make([][]*SkmWriter, n) + mutexes := make([][]sync.Mutex, n) + for s := 0; s < n; s++ { + writers[s] = make([]*SkmWriter, P) + mutexes[s] = make([]sync.Mutex, P) + for p := 0; p < P; p++ { + path := filepath.Join(buildDir, fmt.Sprintf("set_%d", s), + fmt.Sprintf("part_%04d.skm", p)) + w, err := NewSkmWriter(path) + if err != nil { + // Close already-created writers + for ss := 0; ss <= s; ss++ { + for pp := 0; pp < P; pp++ { + if writers[ss][pp] != nil { + writers[ss][pp].Close() + } + } + } + return nil, fmt.Errorf("obikmer: create skm writer: %w", err) + } + writers[s][p] = w + } + } + + return &KmerSetGroupBuilder{ + dir: directory, + k: k, + m: m, + n: n, + P: P, + config: config, + writers: writers, + mu: mutexes, + }, nil +} + +// AddSequence extracts super-kmers from a sequence and writes them +// to the appropriate partition files for the given set. +func (b *KmerSetGroupBuilder) AddSequence(setIndex int, seq *obiseq.BioSequence) { + if setIndex < 0 || setIndex >= b.n { + return + } + rawSeq := seq.Sequence() + if len(rawSeq) < b.k { + return + } + for sk := range IterSuperKmers(rawSeq, b.k, b.m) { + part := int(sk.Minimizer % uint64(b.P)) + b.mu[setIndex][part].Lock() + b.writers[setIndex][part].Write(sk) + b.mu[setIndex][part].Unlock() + } +} + +// AddSuperKmer writes a single super-kmer to the appropriate partition. +func (b *KmerSetGroupBuilder) AddSuperKmer(setIndex int, sk SuperKmer) { + if setIndex < 0 || setIndex >= b.n { + return + } + part := int(sk.Minimizer % uint64(b.P)) + b.mu[setIndex][part].Lock() + b.writers[setIndex][part].Write(sk) + b.mu[setIndex][part].Unlock() +} + +// Close finalizes the construction: +// 1. Flush and close all SKM writers +// 2. For each partition of each set (in parallel): +// - Load super-kmers from .skm +// - Extract canonical k-mers +// - Sort and deduplicate (count if frequency filter) +// - Write .kdi file +// 3. Write metadata.toml +// 4. Remove .build/ directory +// +// Returns the finalized KmerSetGroup in read-only mode. +func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) { + if b.closed { + return nil, fmt.Errorf("obikmer: builder already closed") + } + b.closed = true + + // 1. Close all SKM writers + for s := 0; s < b.n; s++ { + for p := 0; p < b.P; p++ { + if err := b.writers[s][p].Close(); err != nil { + return nil, fmt.Errorf("obikmer: close skm writer set=%d part=%d: %w", s, p, err) + } + } + } + + // 2. Create output directory structure + for s := 0; s < b.n; s++ { + setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", s)) + if err := os.MkdirAll(setDir, 0755); err != nil { + return nil, fmt.Errorf("obikmer: create set dir: %w", err) + } + } + + // Process partitions in parallel + counts := make([][]uint64, b.n) + for s := 0; s < b.n; s++ { + counts[s] = make([]uint64, b.P) + } + + nWorkers := runtime.NumCPU() + if nWorkers > b.P { + nWorkers = b.P + } + + type job struct { + setIdx int + partIdx int + } + + jobs := make(chan job, b.n*b.P) + var wg sync.WaitGroup + var errMu sync.Mutex + var firstErr error + + for w := 0; w < nWorkers; w++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := range jobs { + if err := b.finalizePartition(j.setIdx, j.partIdx, &counts[j.setIdx][j.partIdx]); err != nil { + errMu.Lock() + if firstErr == nil { + firstErr = err + } + errMu.Unlock() + } + } + }() + } + + for s := 0; s < b.n; s++ { + for p := 0; p < b.P; p++ { + jobs <- job{s, p} + } + } + close(jobs) + wg.Wait() + + if firstErr != nil { + return nil, firstErr + } + + // 3. Build KmerSetGroup and write metadata + totalCounts := make([]uint64, b.n) + for s := 0; s < b.n; s++ { + for p := 0; p < b.P; p++ { + totalCounts[s] += counts[s][p] + } + } + + setsIDs := make([]string, b.n) + + ksg := &KmerSetGroup{ + path: b.dir, + k: b.k, + m: b.m, + partitions: b.P, + n: b.n, + setsIDs: setsIDs, + counts: totalCounts, + Metadata: make(map[string]interface{}), + } + + if err := ksg.saveMetadata(); err != nil { + return nil, fmt.Errorf("obikmer: write metadata: %w", err) + } + + // 4. Remove .build/ directory + buildDir := filepath.Join(b.dir, ".build") + os.RemoveAll(buildDir) + + return ksg, nil +} + +// finalizePartition processes a single partition: load SKM, extract k-mers, +// sort, dedup/count, write KDI. +func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint64) error { + skmPath := filepath.Join(b.dir, ".build", + fmt.Sprintf("set_%d", setIdx), + fmt.Sprintf("part_%04d.skm", partIdx)) + + kdiPath := filepath.Join(b.dir, + fmt.Sprintf("set_%d", setIdx), + fmt.Sprintf("part_%04d.kdi", partIdx)) + + // Load super-kmers and extract canonical k-mers + reader, err := NewSkmReader(skmPath) + if err != nil { + // If file doesn't exist or is empty, write empty KDI + return b.writeEmptyKdi(kdiPath, count) + } + + var kmers []uint64 + for { + sk, ok := reader.Next() + if !ok { + break + } + for kmer := range IterCanonicalKmers(sk.Sequence, b.k) { + kmers = append(kmers, kmer) + } + } + reader.Close() + + if len(kmers) == 0 { + return b.writeEmptyKdi(kdiPath, count) + } + + // Sort + sort.Slice(kmers, func(i, j int) bool { return kmers[i] < kmers[j] }) + + // Write KDI based on mode + w, err := NewKdiWriter(kdiPath) + if err != nil { + return err + } + + minFreq := b.config.minFreq + if minFreq <= 0 { + minFreq = 1 // simple dedup + } + + // Linear scan: count consecutive identical values + i := 0 + for i < len(kmers) { + val := kmers[i] + c := 1 + for i+c < len(kmers) && kmers[i+c] == val { + c++ + } + if c >= minFreq { + if err := w.Write(val); err != nil { + w.Close() + return err + } + } + i += c + } + + *count = w.Count() + return w.Close() +} + +func (b *KmerSetGroupBuilder) writeEmptyKdi(path string, count *uint64) error { + w, err := NewKdiWriter(path) + if err != nil { + return err + } + *count = 0 + return w.Close() +} diff --git a/pkg/obikmer/kmer_set_builder_test.go b/pkg/obikmer/kmer_set_builder_test.go new file mode 100644 index 0000000..47b47d6 --- /dev/null +++ b/pkg/obikmer/kmer_set_builder_test.go @@ -0,0 +1,278 @@ +package obikmer + +import ( + "sort" + "testing" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) + +func TestBuilderBasic(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64) + if err != nil { + t.Fatal(err) + } + + seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "") + builder.AddSequence(0, seq) + + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + if ksg.K() != 15 { + t.Fatalf("K() = %d, want 15", ksg.K()) + } + if ksg.M() != 7 { + t.Fatalf("M() = %d, want 7", ksg.M()) + } + if ksg.Partitions() != 64 { + t.Fatalf("Partitions() = %d, want 64", ksg.Partitions()) + } + if ksg.Size() != 1 { + t.Fatalf("Size() = %d, want 1", ksg.Size()) + } + if ksg.Len(0) == 0 { + t.Fatal("Len(0) = 0, expected some k-mers") + } + + // Verify k-mers match what we'd compute directly + var expected []uint64 + for kmer := range IterCanonicalKmers(seq.Sequence(), 15) { + expected = append(expected, kmer) + } + sort.Slice(expected, func(i, j int) bool { return expected[i] < expected[j] }) + // Dedup + deduped := expected[:0] + for i, v := range expected { + if i == 0 || v != expected[i-1] { + deduped = append(deduped, v) + } + } + + if ksg.Len(0) != uint64(len(deduped)) { + t.Fatalf("Len(0) = %d, expected %d unique k-mers", ksg.Len(0), len(deduped)) + } + + // Check iterator + var fromIter []uint64 + for kmer := range ksg.Iterator(0) { + fromIter = append(fromIter, kmer) + } + // The iterator does a k-way merge so should be sorted + for i := 1; i < len(fromIter); i++ { + if fromIter[i] <= fromIter[i-1] { + t.Fatalf("iterator not sorted at %d: %d <= %d", i, fromIter[i], fromIter[i-1]) + } + } + if len(fromIter) != len(deduped) { + t.Fatalf("iterator yielded %d k-mers, expected %d", len(fromIter), len(deduped)) + } + for i, v := range fromIter { + if v != deduped[i] { + t.Fatalf("iterator kmer %d: got %d, want %d", i, v, deduped[i]) + } + } +} + +func TestBuilderMultipleSequences(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64) + if err != nil { + t.Fatal(err) + } + + seqs := []string{ + "ACGTACGTACGTACGTACGTACGTACGT", + "TTTTTTTTTTTTTTTTTTTTTTTTT", + "GGGGGGGGGGGGGGGGGGGGGGGG", + } + for _, s := range seqs { + seq := obiseq.NewBioSequence("", []byte(s), "") + builder.AddSequence(0, seq) + } + + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + if ksg.Len(0) == 0 { + t.Fatal("expected k-mers after multiple sequences") + } +} + +func TestBuilderFrequencyFilter(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64, + WithMinFrequency(3)) + if err != nil { + t.Fatal(err) + } + + // Add same sequence 3 times — all k-mers should survive freq=3 + seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "") + for i := 0; i < 3; i++ { + builder.AddSequence(0, seq) + } + + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + // All k-mers appear exactly 3 times → all should survive + var expected []uint64 + for kmer := range IterCanonicalKmers(seq.Sequence(), 15) { + expected = append(expected, kmer) + } + sort.Slice(expected, func(i, j int) bool { return expected[i] < expected[j] }) + deduped := expected[:0] + for i, v := range expected { + if i == 0 || v != expected[i-1] { + deduped = append(deduped, v) + } + } + + if ksg.Len(0) != uint64(len(deduped)) { + t.Fatalf("Len(0) = %d, expected %d (all k-mers at freq=3)", ksg.Len(0), len(deduped)) + } +} + +func TestBuilderFrequencyFilterRejects(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64, + WithMinFrequency(5)) + if err != nil { + t.Fatal(err) + } + + // Use a non-repetitive sequence so each canonical k-mer appears once per pass. + // Adding it twice gives freq=2 per kmer, which is < minFreq=5 → all rejected. + seq := obiseq.NewBioSequence("test", + []byte("ACGATCGATCTAGCTAGCTGATCGATCGATCG"), "") + builder.AddSequence(0, seq) + builder.AddSequence(0, seq) + + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + if ksg.Len(0) != 0 { + t.Fatalf("Len(0) = %d, expected 0 (all k-mers at freq=2 < minFreq=5)", ksg.Len(0)) + } +} + +func TestBuilderMultipleSets(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 3, 64) + if err != nil { + t.Fatal(err) + } + + seqs := []string{ + "ACGTACGTACGTACGTACGTACGTACGT", + "TTTTTTTTTTTTTTTTTTTTTTTTT", + "GGGGGGGGGGGGGGGGGGGGGGGG", + } + for i, s := range seqs { + seq := obiseq.NewBioSequence("", []byte(s), "") + builder.AddSequence(i, seq) + } + + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + if ksg.Size() != 3 { + t.Fatalf("Size() = %d, want 3", ksg.Size()) + } + for s := 0; s < 3; s++ { + if ksg.Len(s) == 0 { + t.Fatalf("Len(%d) = 0, expected some k-mers", s) + } + } +} + +func TestBuilderOpenRoundTrip(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64) + if err != nil { + t.Fatal(err) + } + + seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "") + builder.AddSequence(0, seq) + + ksg1, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + // Reopen + ksg2, err := OpenKmerSetGroup(dir) + if err != nil { + t.Fatal(err) + } + + if ksg2.K() != ksg1.K() { + t.Fatalf("K mismatch: %d vs %d", ksg2.K(), ksg1.K()) + } + if ksg2.M() != ksg1.M() { + t.Fatalf("M mismatch: %d vs %d", ksg2.M(), ksg1.M()) + } + if ksg2.Partitions() != ksg1.Partitions() { + t.Fatalf("Partitions mismatch: %d vs %d", ksg2.Partitions(), ksg1.Partitions()) + } + if ksg2.Len(0) != ksg1.Len(0) { + t.Fatalf("Len mismatch: %d vs %d", ksg2.Len(0), ksg1.Len(0)) + } +} + +func TestBuilderAttributes(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64) + if err != nil { + t.Fatal(err) + } + + seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "") + builder.AddSequence(0, seq) + + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + ksg.SetId("my_index") + ksg.SetAttribute("organism", "test") + ksg.SaveMetadata() + + // Reopen and check + ksg2, err := OpenKmerSetGroup(dir) + if err != nil { + t.Fatal(err) + } + + if ksg2.Id() != "my_index" { + t.Fatalf("Id() = %q, want %q", ksg2.Id(), "my_index") + } + if !ksg2.HasAttribute("organism") { + t.Fatal("expected 'organism' attribute") + } + v, _ := ksg2.GetAttribute("organism") + if v != "test" { + t.Fatalf("organism = %v, want 'test'", v) + } +} diff --git a/pkg/obikmer/kmer_set_disk.go b/pkg/obikmer/kmer_set_disk.go new file mode 100644 index 0000000..0ac623c --- /dev/null +++ b/pkg/obikmer/kmer_set_disk.go @@ -0,0 +1,580 @@ +package obikmer + +import ( + "fmt" + "iter" + "os" + "path/filepath" + "sync" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist" + "github.com/pelletier/go-toml/v2" +) + +// MetadataFormat represents the metadata serialization format. +// Currently only TOML is used for disk-based indices, but the type +// is kept for backward compatibility with CLI options. +type MetadataFormat int + +const ( + FormatTOML MetadataFormat = iota + FormatYAML + FormatJSON +) + +// String returns the file extension for the format. +func (f MetadataFormat) String() string { + switch f { + case FormatTOML: + return "toml" + case FormatYAML: + return "yaml" + case FormatJSON: + return "json" + default: + return "toml" + } +} + +// KmerSetGroup is a disk-based collection of N k-mer sets sharing the same +// k, m, and partition count P. After construction (via KmerSetGroupBuilder), +// it is immutable and all operations are streaming (partition by partition). +// +// A KmerSetGroup with Size()==1 is effectively a KmerSet (singleton). +type KmerSetGroup struct { + path string // root directory + id string // user-assigned identifier + k int // k-mer size + m int // minimizer size + partitions int // number of partitions P + n int // number of sets N + setsIDs []string // IDs of individual sets + counts []uint64 // total k-mer count per set (sum over partitions) + Metadata map[string]interface{} // group-level user metadata +} + +// diskMetadata is the TOML-serializable structure for metadata.toml. +type diskMetadata struct { + ID string `toml:"id,omitempty"` + K int `toml:"k"` + M int `toml:"m"` + Partitions int `toml:"partitions"` + Type string `toml:"type"` + Size int `toml:"size"` + SetsIDs []string `toml:"sets_ids,omitempty"` + Counts []uint64 `toml:"counts,omitempty"` + UserMetadata map[string]interface{} `toml:"user_metadata,omitempty"` +} + +// OpenKmerSetGroup opens a finalized index directory in read-only mode. +func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) { + metaPath := filepath.Join(directory, "metadata.toml") + f, err := os.Open(metaPath) + if err != nil { + return nil, fmt.Errorf("obikmer: open metadata: %w", err) + } + defer f.Close() + + var meta diskMetadata + if err := toml.NewDecoder(f).Decode(&meta); err != nil { + return nil, fmt.Errorf("obikmer: decode metadata: %w", err) + } + + ksg := &KmerSetGroup{ + path: directory, + id: meta.ID, + k: meta.K, + m: meta.M, + partitions: meta.Partitions, + n: meta.Size, + setsIDs: meta.SetsIDs, + counts: meta.Counts, + Metadata: meta.UserMetadata, + } + if ksg.Metadata == nil { + ksg.Metadata = make(map[string]interface{}) + } + if ksg.setsIDs == nil { + ksg.setsIDs = make([]string, ksg.n) + } + if ksg.counts == nil { + // Compute counts by scanning partitions + ksg.counts = make([]uint64, ksg.n) + for s := 0; s < ksg.n; s++ { + for p := 0; p < ksg.partitions; p++ { + path := ksg.partitionPath(s, p) + r, err := NewKdiReader(path) + if err != nil { + continue + } + ksg.counts[s] += r.Count() + r.Close() + } + } + } + + return ksg, nil +} + +// SaveMetadata writes the metadata.toml file. This is useful after +// modifying attributes or IDs on an already-finalized index. +func (ksg *KmerSetGroup) SaveMetadata() error { + return ksg.saveMetadata() +} + +// saveMetadata writes the metadata.toml file (internal). +func (ksg *KmerSetGroup) saveMetadata() error { + meta := diskMetadata{ + ID: ksg.id, + K: ksg.k, + M: ksg.m, + Partitions: ksg.partitions, + Type: "KmerSetGroup", + Size: ksg.n, + SetsIDs: ksg.setsIDs, + Counts: ksg.counts, + UserMetadata: ksg.Metadata, + } + + metaPath := filepath.Join(ksg.path, "metadata.toml") + f, err := os.Create(metaPath) + if err != nil { + return err + } + defer f.Close() + + return toml.NewEncoder(f).Encode(meta) +} + +// partitionPath returns the file path for partition p of set s. +func (ksg *KmerSetGroup) partitionPath(setIndex, partIndex int) string { + return filepath.Join(ksg.path, fmt.Sprintf("set_%d", setIndex), + fmt.Sprintf("part_%04d.kdi", partIndex)) +} + +// Path returns the root directory of the index. +func (ksg *KmerSetGroup) Path() string { + return ksg.path +} + +// K returns the k-mer size. +func (ksg *KmerSetGroup) K() int { + return ksg.k +} + +// M returns the minimizer size. +func (ksg *KmerSetGroup) M() int { + return ksg.m +} + +// Partitions returns the number of partitions P. +func (ksg *KmerSetGroup) Partitions() int { + return ksg.partitions +} + +// Size returns the number of sets N. +func (ksg *KmerSetGroup) Size() int { + return ksg.n +} + +// Id returns the group identifier. +func (ksg *KmerSetGroup) Id() string { + return ksg.id +} + +// SetId sets the group identifier and persists the change. +func (ksg *KmerSetGroup) SetId(id string) { + ksg.id = id +} + +// Len returns the total number of k-mers. +// Without argument: total across all sets. +// With argument setIndex: count for that specific set. +func (ksg *KmerSetGroup) Len(setIndex ...int) uint64 { + if len(setIndex) == 0 { + var total uint64 + for _, c := range ksg.counts { + total += c + } + return total + } + idx := setIndex[0] + if idx < 0 || idx >= ksg.n { + return 0 + } + return ksg.counts[idx] +} + +// Contains checks if a k-mer is present in the specified set. +// Uses binary search on the appropriate partition's KDI file. +func (ksg *KmerSetGroup) Contains(setIndex int, kmer uint64) bool { + if setIndex < 0 || setIndex >= ksg.n { + return false + } + // Determine partition from minimizer + // For a canonical k-mer, we need to find which partition it would fall into. + // The partition is determined by the minimizer during construction. + // For Contains, we must scan all partitions of this set (linear search within each). + // A full binary-search approach would require an index file. + // For now, scan the partition determined by the k-mer's minimizer. + // Since we don't know the minimizer, we do a linear scan of all partitions. + // This is O(total_kmers / P) per partition on average. + + // Optimization: scan all partitions in parallel + type result struct { + found bool + } + ch := make(chan result, ksg.partitions) + + for p := 0; p < ksg.partitions; p++ { + go func(part int) { + r, err := NewKdiReader(ksg.partitionPath(setIndex, part)) + if err != nil { + ch <- result{false} + return + } + defer r.Close() + for { + v, ok := r.Next() + if !ok { + ch <- result{false} + return + } + if v == kmer { + ch <- result{true} + return + } + if v > kmer { + ch <- result{false} + return + } + } + }(p) + } + + for i := 0; i < ksg.partitions; i++ { + res := <-ch + if res.found { + // Drain remaining goroutines + go func() { + for j := i + 1; j < ksg.partitions; j++ { + <-ch + } + }() + return true + } + } + return false +} + +// Iterator returns an iterator over all k-mers in the specified set, +// in sorted order within each partition. Since partitions are independent, +// to get a globally sorted stream, use iteratorSorted. +func (ksg *KmerSetGroup) Iterator(setIndex int) iter.Seq[uint64] { + return func(yield func(uint64) bool) { + if setIndex < 0 || setIndex >= ksg.n { + return + } + + // Open all partition readers and merge them + readers := make([]*KdiReader, 0, ksg.partitions) + for p := 0; p < ksg.partitions; p++ { + r, err := NewKdiReader(ksg.partitionPath(setIndex, p)) + if err != nil { + continue + } + if r.Count() > 0 { + readers = append(readers, r) + } else { + r.Close() + } + } + + if len(readers) == 0 { + return + } + + m := NewKWayMerge(readers) + defer m.Close() + + for { + kmer, _, ok := m.Next() + if !ok { + return + } + if !yield(kmer) { + return + } + } + } +} + +// ============================== +// Attribute API (compatible with old API) +// ============================== + +// HasAttribute checks if a metadata key exists. +func (ksg *KmerSetGroup) HasAttribute(key string) bool { + _, ok := ksg.Metadata[key] + return ok +} + +// GetAttribute returns the value of an attribute. +func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool) { + switch key { + case "id": + return ksg.Id(), true + case "k": + return ksg.K(), true + default: + value, ok := ksg.Metadata[key] + return value, ok + } +} + +// SetAttribute sets a metadata attribute. +func (ksg *KmerSetGroup) SetAttribute(key string, value interface{}) { + switch key { + case "id": + if id, ok := value.(string); ok { + ksg.SetId(id) + } else { + panic(fmt.Sprintf("id must be a string, got %T", value)) + } + case "k": + panic("k is immutable") + default: + ksg.Metadata[key] = value + } +} + +// DeleteAttribute removes a metadata attribute. +func (ksg *KmerSetGroup) DeleteAttribute(key string) { + delete(ksg.Metadata, key) +} + +// GetIntAttribute returns an attribute as int. +func (ksg *KmerSetGroup) GetIntAttribute(key string) (int, bool) { + v, ok := ksg.GetAttribute(key) + if !ok { + return 0, false + } + switch val := v.(type) { + case int: + return val, true + case int64: + return int(val), true + case float64: + return int(val), true + } + return 0, false +} + +// GetStringAttribute returns an attribute as string. +func (ksg *KmerSetGroup) GetStringAttribute(key string) (string, bool) { + v, ok := ksg.GetAttribute(key) + if !ok { + return "", false + } + if s, ok := v.(string); ok { + return s, true + } + return fmt.Sprintf("%v", v), true +} + +// ============================== +// Jaccard metrics (streaming, disk-based) +// ============================== + +// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix +// for all sets in the group. Operates partition by partition in streaming. +func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix { + n := ksg.n + labels := make([]string, n) + for i := 0; i < n; i++ { + if i < len(ksg.setsIDs) && ksg.setsIDs[i] != "" { + labels[i] = ksg.setsIDs[i] + } else { + labels[i] = fmt.Sprintf("set_%d", i) + } + } + + dm := obidist.NewDistMatrixWithLabels(labels) + + // Accumulate intersection and union counts + intersections := make([][]uint64, n) + unions := make([][]uint64, n) + for i := 0; i < n; i++ { + intersections[i] = make([]uint64, n) + unions[i] = make([]uint64, n) + } + + // Process partition by partition + var mu sync.Mutex + var wg sync.WaitGroup + + for p := 0; p < ksg.partitions; p++ { + wg.Add(1) + go func(part int) { + defer wg.Done() + + // Open all set readers for this partition + readers := make([]*KdiReader, n) + for s := 0; s < n; s++ { + r, err := NewKdiReader(ksg.partitionPath(s, part)) + if err != nil { + continue + } + readers[s] = r + } + defer func() { + for _, r := range readers { + if r != nil { + r.Close() + } + } + }() + + // Merge all N readers to count intersections and unions + activeReaders := make([]*KdiReader, 0, n) + activeIndices := make([]int, 0, n) + for i, r := range readers { + if r != nil && r.Count() > 0 { + activeReaders = append(activeReaders, r) + activeIndices = append(activeIndices, i) + } + } + if len(activeReaders) == 0 { + return + } + + merge := NewKWayMerge(activeReaders) + // Don't close merge here since readers are managed above + // We only want to iterate + + // We need per-set presence tracking, so we use a custom merge + // Rebuild with a direct approach + merge.Close() // close the merge (which closes readers) + + // Reopen readers for custom merge + for s := 0; s < n; s++ { + readers[s] = nil + r, err := NewKdiReader(ksg.partitionPath(s, part)) + if err != nil { + continue + } + if r.Count() > 0 { + readers[s] = r + } else { + r.Close() + } + } + + // Custom k-way merge that tracks which sets contain each kmer + type entry struct { + val uint64 + setIdx int + } + + // Use a simpler approach: read all values for this partition into memory + // for each set, then do a merge + setKmers := make([][]uint64, n) + for s := 0; s < n; s++ { + if readers[s] == nil { + continue + } + kmers := make([]uint64, 0, readers[s].Count()) + for { + v, ok := readers[s].Next() + if !ok { + break + } + kmers = append(kmers, v) + } + setKmers[s] = kmers + readers[s].Close() + readers[s] = nil + } + + // Count pairwise intersections using sorted merge + // For each pair (i,j), count kmers present in both + localInter := make([][]uint64, n) + localUnion := make([][]uint64, n) + for i := 0; i < n; i++ { + localInter[i] = make([]uint64, n) + localUnion[i] = make([]uint64, n) + } + + for i := 0; i < n; i++ { + localUnion[i][i] = uint64(len(setKmers[i])) + for j := i + 1; j < n; j++ { + a, b := setKmers[i], setKmers[j] + var inter uint64 + ai, bi := 0, 0 + for ai < len(a) && bi < len(b) { + if a[ai] == b[bi] { + inter++ + ai++ + bi++ + } else if a[ai] < b[bi] { + ai++ + } else { + bi++ + } + } + localInter[i][j] = inter + localUnion[i][j] = uint64(len(a)) + uint64(len(b)) - inter + } + } + + mu.Lock() + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + intersections[i][j] += localInter[i][j] + unions[i][j] += localUnion[i][j] + } + } + mu.Unlock() + }(p) + } + wg.Wait() + + // Compute distances from accumulated counts + for i := 0; i < n-1; i++ { + for j := i + 1; j < n; j++ { + u := unions[i][j] + if u == 0 { + dm.Set(i, j, 1.0) + } else { + dm.Set(i, j, 1.0-float64(intersections[i][j])/float64(u)) + } + } + } + + return dm +} + +// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix. +func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix { + n := ksg.n + labels := make([]string, n) + for i := 0; i < n; i++ { + if i < len(ksg.setsIDs) && ksg.setsIDs[i] != "" { + labels[i] = ksg.setsIDs[i] + } else { + labels[i] = fmt.Sprintf("set_%d", i) + } + } + + // Reuse distance computation + dm := ksg.JaccardDistanceMatrix() + sm := obidist.NewSimilarityMatrixWithLabels(labels) + + for i := 0; i < n-1; i++ { + for j := i + 1; j < n; j++ { + sm.Set(i, j, 1.0-dm.Get(i, j)) + } + } + + return sm +} diff --git a/pkg/obikmer/kmer_set_disk_ops.go b/pkg/obikmer/kmer_set_disk_ops.go new file mode 100644 index 0000000..4c96624 --- /dev/null +++ b/pkg/obikmer/kmer_set_disk_ops.go @@ -0,0 +1,568 @@ +package obikmer + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + "sync" +) + +// Union computes the union of all sets in the group, producing a new +// singleton KmerSetGroup on disk. A k-mer is in the result if it +// appears in any set. +func (ksg *KmerSetGroup) Union(outputDir string) (*KmerSetGroup, error) { + return ksg.quorumOp(outputDir, 1, ksg.n) +} + +// Intersect computes the intersection of all sets, producing a new +// singleton KmerSetGroup on disk. A k-mer is in the result if it +// appears in every set. +func (ksg *KmerSetGroup) Intersect(outputDir string) (*KmerSetGroup, error) { + return ksg.quorumOp(outputDir, ksg.n, ksg.n) +} + +// Difference computes set_0 minus the union of all other sets. +func (ksg *KmerSetGroup) Difference(outputDir string) (*KmerSetGroup, error) { + return ksg.differenceOp(outputDir) +} + +// QuorumAtLeast returns k-mers present in at least q sets. +func (ksg *KmerSetGroup) QuorumAtLeast(q int, outputDir string) (*KmerSetGroup, error) { + return ksg.quorumOp(outputDir, q, ksg.n) +} + +// QuorumExactly returns k-mers present in exactly q sets. +func (ksg *KmerSetGroup) QuorumExactly(q int, outputDir string) (*KmerSetGroup, error) { + return ksg.quorumOp(outputDir, q, q) +} + +// QuorumAtMost returns k-mers present in at most q sets. +func (ksg *KmerSetGroup) QuorumAtMost(q int, outputDir string) (*KmerSetGroup, error) { + return ksg.quorumOp(outputDir, 1, q) +} + +// UnionWith merges this group with another, producing a new KmerSetGroup +// whose set_i is the union of this.set_i and other.set_i. +// Both groups must have the same k, m, P, and N. +func (ksg *KmerSetGroup) UnionWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error) { + if err := ksg.checkCompatible(other); err != nil { + return nil, err + } + return ksg.pairwiseOp(other, outputDir, mergeUnion) +} + +// IntersectWith merges this group with another, producing a new KmerSetGroup +// whose set_i is the intersection of this.set_i and other.set_i. +func (ksg *KmerSetGroup) IntersectWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error) { + if err := ksg.checkCompatible(other); err != nil { + return nil, err + } + return ksg.pairwiseOp(other, outputDir, mergeIntersect) +} + +// ============================== +// Internal implementation +// ============================== + +func (ksg *KmerSetGroup) checkCompatible(other *KmerSetGroup) error { + if ksg.k != other.k { + return fmt.Errorf("obikmer: incompatible k: %d vs %d", ksg.k, other.k) + } + if ksg.m != other.m { + return fmt.Errorf("obikmer: incompatible m: %d vs %d", ksg.m, other.m) + } + if ksg.partitions != other.partitions { + return fmt.Errorf("obikmer: incompatible partitions: %d vs %d", ksg.partitions, other.partitions) + } + if ksg.n != other.n { + return fmt.Errorf("obikmer: incompatible size: %d vs %d", ksg.n, other.n) + } + return nil +} + +// quorumOp processes all N sets partition by partition. +// For each partition, it opens N KdiReaders and does a k-way merge. +// A kmer is written to the result if minQ <= count <= maxQ. +func (ksg *KmerSetGroup) quorumOp(outputDir string, minQ, maxQ int) (*KmerSetGroup, error) { + if minQ < 1 { + minQ = 1 + } + if maxQ > ksg.n { + maxQ = ksg.n + } + + // Create output structure + setDir := filepath.Join(outputDir, "set_0") + if err := os.MkdirAll(setDir, 0755); err != nil { + return nil, err + } + + counts := make([]uint64, ksg.partitions) + + nWorkers := runtime.NumCPU() + if nWorkers > ksg.partitions { + nWorkers = ksg.partitions + } + + jobs := make(chan int, ksg.partitions) + var wg sync.WaitGroup + var errMu sync.Mutex + var firstErr error + + for w := 0; w < nWorkers; w++ { + wg.Add(1) + go func() { + defer wg.Done() + for p := range jobs { + c, err := ksg.quorumPartition(p, setDir, minQ, maxQ) + if err != nil { + errMu.Lock() + if firstErr == nil { + firstErr = err + } + errMu.Unlock() + return + } + counts[p] = c + } + }() + } + + for p := 0; p < ksg.partitions; p++ { + jobs <- p + } + close(jobs) + wg.Wait() + + if firstErr != nil { + return nil, firstErr + } + + var totalCount uint64 + for _, c := range counts { + totalCount += c + } + + result := &KmerSetGroup{ + path: outputDir, + k: ksg.k, + m: ksg.m, + partitions: ksg.partitions, + n: 1, + setsIDs: []string{""}, + counts: []uint64{totalCount}, + Metadata: make(map[string]interface{}), + } + + if err := result.saveMetadata(); err != nil { + return nil, err + } + + return result, nil +} + +// quorumPartition processes a single partition for quorum filtering. +func (ksg *KmerSetGroup) quorumPartition(partIdx int, outSetDir string, minQ, maxQ int) (uint64, error) { + // Open readers for all sets + readers := make([]*KdiReader, 0, ksg.n) + for s := 0; s < ksg.n; s++ { + r, err := NewKdiReader(ksg.partitionPath(s, partIdx)) + if err != nil { + // Close already-opened readers + for _, rr := range readers { + rr.Close() + } + return 0, err + } + if r.Count() > 0 { + readers = append(readers, r) + } else { + r.Close() + } + } + + outPath := filepath.Join(outSetDir, fmt.Sprintf("part_%04d.kdi", partIdx)) + + if len(readers) == 0 { + // Write empty KDI + w, err := NewKdiWriter(outPath) + if err != nil { + return 0, err + } + return 0, w.Close() + } + + merge := NewKWayMerge(readers) + // merge.Close() will close readers + + w, err := NewKdiWriter(outPath) + if err != nil { + merge.Close() + return 0, err + } + + for { + kmer, count, ok := merge.Next() + if !ok { + break + } + if count >= minQ && count <= maxQ { + if err := w.Write(kmer); err != nil { + merge.Close() + w.Close() + return 0, err + } + } + } + + merge.Close() + cnt := w.Count() + return cnt, w.Close() +} + +// differenceOp computes set_0 minus the union of all other sets. +func (ksg *KmerSetGroup) differenceOp(outputDir string) (*KmerSetGroup, error) { + if ksg.n < 1 { + return nil, fmt.Errorf("obikmer: difference requires at least 1 set") + } + + setDir := filepath.Join(outputDir, "set_0") + if err := os.MkdirAll(setDir, 0755); err != nil { + return nil, err + } + + counts := make([]uint64, ksg.partitions) + + nWorkers := runtime.NumCPU() + if nWorkers > ksg.partitions { + nWorkers = ksg.partitions + } + + jobs := make(chan int, ksg.partitions) + var wg sync.WaitGroup + var errMu sync.Mutex + var firstErr error + + for w := 0; w < nWorkers; w++ { + wg.Add(1) + go func() { + defer wg.Done() + for p := range jobs { + c, err := ksg.differencePartition(p, setDir) + if err != nil { + errMu.Lock() + if firstErr == nil { + firstErr = err + } + errMu.Unlock() + return + } + counts[p] = c + } + }() + } + + for p := 0; p < ksg.partitions; p++ { + jobs <- p + } + close(jobs) + wg.Wait() + + if firstErr != nil { + return nil, firstErr + } + + var totalCount uint64 + for _, c := range counts { + totalCount += c + } + + result := &KmerSetGroup{ + path: outputDir, + k: ksg.k, + m: ksg.m, + partitions: ksg.partitions, + n: 1, + setsIDs: []string{""}, + counts: []uint64{totalCount}, + Metadata: make(map[string]interface{}), + } + + if err := result.saveMetadata(); err != nil { + return nil, err + } + + return result, nil +} + +// differencePartition computes set_0 - union(set_1..set_{n-1}) for one partition. +func (ksg *KmerSetGroup) differencePartition(partIdx int, outSetDir string) (uint64, error) { + outPath := filepath.Join(outSetDir, fmt.Sprintf("part_%04d.kdi", partIdx)) + + // Open set_0 reader + r0, err := NewKdiReader(ksg.partitionPath(0, partIdx)) + if err != nil { + return 0, err + } + + if r0.Count() == 0 { + r0.Close() + w, err := NewKdiWriter(outPath) + if err != nil { + return 0, err + } + return 0, w.Close() + } + + // Open readers for the other sets and merge them + var otherReaders []*KdiReader + for s := 1; s < ksg.n; s++ { + r, err := NewKdiReader(ksg.partitionPath(s, partIdx)) + if err != nil { + r0.Close() + for _, rr := range otherReaders { + rr.Close() + } + return 0, err + } + if r.Count() > 0 { + otherReaders = append(otherReaders, r) + } else { + r.Close() + } + } + + w, err := NewKdiWriter(outPath) + if err != nil { + r0.Close() + for _, rr := range otherReaders { + rr.Close() + } + return 0, err + } + + if len(otherReaders) == 0 { + // No other sets — copy set_0 + for { + v, ok := r0.Next() + if !ok { + break + } + if err := w.Write(v); err != nil { + r0.Close() + w.Close() + return 0, err + } + } + r0.Close() + cnt := w.Count() + return cnt, w.Close() + } + + // Merge other sets to get the "subtraction" stream + otherMerge := NewKWayMerge(otherReaders) + + // Streaming difference: advance both streams + v0, ok0 := r0.Next() + vo, _, oko := otherMerge.Next() + + for ok0 { + if !oko || v0 < vo { + // v0 not in others → emit + if err := w.Write(v0); err != nil { + r0.Close() + otherMerge.Close() + w.Close() + return 0, err + } + v0, ok0 = r0.Next() + } else if v0 == vo { + // v0 in others → skip + v0, ok0 = r0.Next() + vo, _, oko = otherMerge.Next() + } else { + // vo < v0 → advance others + vo, _, oko = otherMerge.Next() + } + } + + r0.Close() + otherMerge.Close() + cnt := w.Count() + return cnt, w.Close() +} + +// mergeMode defines how to combine two values during pairwise operations. +type mergeMode int + +const ( + mergeUnion mergeMode = iota // emit if in either + mergeIntersect // emit if in both +) + +// pairwiseOp applies a merge operation between corresponding sets of two groups. +func (ksg *KmerSetGroup) pairwiseOp(other *KmerSetGroup, outputDir string, mode mergeMode) (*KmerSetGroup, error) { + for s := 0; s < ksg.n; s++ { + setDir := filepath.Join(outputDir, fmt.Sprintf("set_%d", s)) + if err := os.MkdirAll(setDir, 0755); err != nil { + return nil, err + } + } + + counts := make([][]uint64, ksg.n) + for s := 0; s < ksg.n; s++ { + counts[s] = make([]uint64, ksg.partitions) + } + + nWorkers := runtime.NumCPU() + if nWorkers > ksg.partitions { + nWorkers = ksg.partitions + } + + type job struct { + setIdx int + partIdx int + } + jobs := make(chan job, ksg.n*ksg.partitions) + var wg sync.WaitGroup + var errMu sync.Mutex + var firstErr error + + for w := 0; w < nWorkers; w++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := range jobs { + c, err := pairwiseMergePartition( + ksg.partitionPath(j.setIdx, j.partIdx), + other.partitionPath(j.setIdx, j.partIdx), + filepath.Join(outputDir, fmt.Sprintf("set_%d", j.setIdx), + fmt.Sprintf("part_%04d.kdi", j.partIdx)), + mode, + ) + if err != nil { + errMu.Lock() + if firstErr == nil { + firstErr = err + } + errMu.Unlock() + return + } + counts[j.setIdx][j.partIdx] = c + } + }() + } + + for s := 0; s < ksg.n; s++ { + for p := 0; p < ksg.partitions; p++ { + jobs <- job{s, p} + } + } + close(jobs) + wg.Wait() + + if firstErr != nil { + return nil, firstErr + } + + totalCounts := make([]uint64, ksg.n) + setsIDs := make([]string, ksg.n) + for s := 0; s < ksg.n; s++ { + for p := 0; p < ksg.partitions; p++ { + totalCounts[s] += counts[s][p] + } + } + + result := &KmerSetGroup{ + path: outputDir, + k: ksg.k, + m: ksg.m, + partitions: ksg.partitions, + n: ksg.n, + setsIDs: setsIDs, + counts: totalCounts, + Metadata: make(map[string]interface{}), + } + + if err := result.saveMetadata(); err != nil { + return nil, err + } + + return result, nil +} + +// pairwiseMergePartition merges two KDI files (sorted streams) with the given mode. +func pairwiseMergePartition(pathA, pathB, outPath string, mode mergeMode) (uint64, error) { + rA, err := NewKdiReader(pathA) + if err != nil { + return 0, err + } + rB, err := NewKdiReader(pathB) + if err != nil { + rA.Close() + return 0, err + } + + w, err := NewKdiWriter(outPath) + if err != nil { + rA.Close() + rB.Close() + return 0, err + } + + cnt, mergeErr := doPairwiseMerge(rA, rB, w, mode) + rA.Close() + rB.Close() + closeErr := w.Close() + if mergeErr != nil { + return 0, mergeErr + } + return cnt, closeErr +} + +func doPairwiseMerge(rA, rB *KdiReader, w *KdiWriter, mode mergeMode) (uint64, error) { + vA, okA := rA.Next() + vB, okB := rB.Next() + + for okA && okB { + if vA == vB { + if err := w.Write(vA); err != nil { + return 0, err + } + vA, okA = rA.Next() + vB, okB = rB.Next() + } else if vA < vB { + if mode == mergeUnion { + if err := w.Write(vA); err != nil { + return 0, err + } + } + vA, okA = rA.Next() + } else { + if mode == mergeUnion { + if err := w.Write(vB); err != nil { + return 0, err + } + } + vB, okB = rB.Next() + } + } + + if mode == mergeUnion { + for okA { + if err := w.Write(vA); err != nil { + return 0, err + } + vA, okA = rA.Next() + } + for okB { + if err := w.Write(vB); err != nil { + return 0, err + } + vB, okB = rB.Next() + } + } + + return w.Count(), nil +} diff --git a/pkg/obikmer/kmer_set_disk_ops_test.go b/pkg/obikmer/kmer_set_disk_ops_test.go new file mode 100644 index 0000000..1ca3cfd --- /dev/null +++ b/pkg/obikmer/kmer_set_disk_ops_test.go @@ -0,0 +1,251 @@ +package obikmer + +import ( + "path/filepath" + "testing" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) + +// buildGroupFromSeqs creates a KmerSetGroup with one set per sequence. +func buildGroupFromSeqs(t *testing.T, dir string, k, m int, seqs []string) *KmerSetGroup { + t.Helper() + n := len(seqs) + builder, err := NewKmerSetGroupBuilder(dir, k, m, n, 64) + if err != nil { + t.Fatal(err) + } + for i, s := range seqs { + seq := obiseq.NewBioSequence("", []byte(s), "") + builder.AddSequence(i, seq) + } + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + return ksg +} + +func collectKmers(t *testing.T, ksg *KmerSetGroup, setIdx int) []uint64 { + t.Helper() + var result []uint64 + for kmer := range ksg.Iterator(setIdx) { + result = append(result, kmer) + } + return result +} + +func TestDiskOpsUnion(t *testing.T) { + dir := t.TempDir() + indexDir := filepath.Join(dir, "index") + outDir := filepath.Join(dir, "union") + + // Two sequences with some overlap + seqs := []string{ + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", + "CTAGCTAGCTGATCGATCGATCGTTTAAACCC", + } + ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs) + + result, err := ksg.Union(outDir) + if err != nil { + t.Fatal(err) + } + + // Union should have at least as many k-mers as each individual set + unionLen := result.Len(0) + if unionLen == 0 { + t.Fatal("union is empty") + } + if unionLen < ksg.Len(0) || unionLen < ksg.Len(1) { + t.Fatalf("union (%d) smaller than an input set (%d, %d)", unionLen, ksg.Len(0), ksg.Len(1)) + } + + // Union should not exceed the sum of both sets + if unionLen > ksg.Len(0)+ksg.Len(1) { + t.Fatalf("union (%d) larger than sum of sets (%d)", unionLen, ksg.Len(0)+ksg.Len(1)) + } +} + +func TestDiskOpsIntersect(t *testing.T) { + dir := t.TempDir() + indexDir := filepath.Join(dir, "index") + outDir := filepath.Join(dir, "intersect") + + // Two sequences with some shared k-mers + seqs := []string{ + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", + "CTAGCTAGCTGATCGATCGATCGTTTAAACCC", + } + ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs) + + result, err := ksg.Intersect(outDir) + if err != nil { + t.Fatal(err) + } + + interLen := result.Len(0) + // Intersection should not be bigger than any individual set + if interLen > ksg.Len(0) || interLen > ksg.Len(1) { + t.Fatalf("intersection (%d) larger than input sets (%d, %d)", interLen, ksg.Len(0), ksg.Len(1)) + } +} + +func TestDiskOpsDifference(t *testing.T) { + dir := t.TempDir() + indexDir := filepath.Join(dir, "index") + outDir := filepath.Join(dir, "diff") + + seqs := []string{ + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", + "CTAGCTAGCTGATCGATCGATCGTTTAAACCC", + } + ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs) + + result, err := ksg.Difference(outDir) + if err != nil { + t.Fatal(err) + } + + diffLen := result.Len(0) + // Difference = set_0 - set_1, so should be <= set_0 + if diffLen > ksg.Len(0) { + t.Fatalf("difference (%d) larger than set_0 (%d)", diffLen, ksg.Len(0)) + } +} + +func TestDiskOpsConsistency(t *testing.T) { + dir := t.TempDir() + indexDir := filepath.Join(dir, "index") + + seqs := []string{ + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", + "CTAGCTAGCTGATCGATCGATCGTTTAAACCC", + } + ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs) + + unionResult, err := ksg.Union(filepath.Join(dir, "union")) + if err != nil { + t.Fatal(err) + } + interResult, err := ksg.Intersect(filepath.Join(dir, "intersect")) + if err != nil { + t.Fatal(err) + } + diffResult, err := ksg.Difference(filepath.Join(dir, "diff")) + if err != nil { + t.Fatal(err) + } + + unionLen := unionResult.Len(0) + interLen := interResult.Len(0) + diffLen := diffResult.Len(0) + + // |A ∪ B| = |A| + |B| - |A ∩ B| + expectedUnion := ksg.Len(0) + ksg.Len(1) - interLen + if unionLen != expectedUnion { + t.Fatalf("|A∪B|=%d, expected |A|+|B|-|A∩B|=%d+%d-%d=%d", + unionLen, ksg.Len(0), ksg.Len(1), interLen, expectedUnion) + } + + // |A \ B| = |A| - |A ∩ B| + expectedDiff := ksg.Len(0) - interLen + if diffLen != expectedDiff { + t.Fatalf("|A\\B|=%d, expected |A|-|A∩B|=%d-%d=%d", + diffLen, ksg.Len(0), interLen, expectedDiff) + } +} + +func TestDiskOpsQuorum(t *testing.T) { + dir := t.TempDir() + indexDir := filepath.Join(dir, "index") + + // Three sets + seqs := []string{ + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", + "CTAGCTAGCTGATCGATCGATCGTTTAAACCC", + "GATCGATCGATCGAAATTTCCCGGG", + } + ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs) + + // QuorumAtLeast(1) = Union + q1, err := ksg.QuorumAtLeast(1, filepath.Join(dir, "q1")) + if err != nil { + t.Fatal(err) + } + union, err := ksg.Union(filepath.Join(dir, "union")) + if err != nil { + t.Fatal(err) + } + if q1.Len(0) != union.Len(0) { + t.Fatalf("QuorumAtLeast(1)=%d != Union=%d", q1.Len(0), union.Len(0)) + } + + // QuorumAtLeast(3) = Intersect + q3, err := ksg.QuorumAtLeast(3, filepath.Join(dir, "q3")) + if err != nil { + t.Fatal(err) + } + inter, err := ksg.Intersect(filepath.Join(dir, "inter")) + if err != nil { + t.Fatal(err) + } + if q3.Len(0) != inter.Len(0) { + t.Fatalf("QuorumAtLeast(3)=%d != Intersect=%d", q3.Len(0), inter.Len(0)) + } + + // QuorumAtLeast(2) should be between Intersect and Union + q2, err := ksg.QuorumAtLeast(2, filepath.Join(dir, "q2")) + if err != nil { + t.Fatal(err) + } + if q2.Len(0) < q3.Len(0) || q2.Len(0) > q1.Len(0) { + t.Fatalf("QuorumAtLeast(2)=%d not between intersect=%d and union=%d", + q2.Len(0), q3.Len(0), q1.Len(0)) + } +} + +func TestDiskOpsJaccard(t *testing.T) { + dir := t.TempDir() + indexDir := filepath.Join(dir, "index") + + seqs := []string{ + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", // identical to first + "TTTTTTTTTTTTTTTTTTTTTTTTT", // completely different + } + ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs) + + dm := ksg.JaccardDistanceMatrix() + if dm == nil { + t.Fatal("JaccardDistanceMatrix returned nil") + } + + // Identical sets should have distance 0 + d01 := dm.Get(0, 1) + if d01 != 0.0 { + t.Fatalf("distance(0,1) = %f, expected 0.0 for identical sets", d01) + } + + // Completely different sets should have distance 1.0 + d02 := dm.Get(0, 2) + if d02 != 1.0 { + t.Fatalf("distance(0,2) = %f, expected 1.0 for disjoint sets", d02) + } + + // Similarity matrix + sm := ksg.JaccardSimilarityMatrix() + if sm == nil { + t.Fatal("JaccardSimilarityMatrix returned nil") + } + + s01 := sm.Get(0, 1) + if s01 != 1.0 { + t.Fatalf("similarity(0,1) = %f, expected 1.0 for identical sets", s01) + } + + s02 := sm.Get(0, 2) + if s02 != 0.0 { + t.Fatalf("similarity(0,2) = %f, expected 0.0 for disjoint sets", s02) + } +} diff --git a/pkg/obikmer/kmer_set_group.go b/pkg/obikmer/kmer_set_group.go deleted file mode 100644 index 037641e..0000000 --- a/pkg/obikmer/kmer_set_group.go +++ /dev/null @@ -1,347 +0,0 @@ -package obikmer - -import ( - "fmt" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" -) - -// KmerSetGroup represents a vector of KmerSet -// Used to manage multiple k-mer sets (for example, by frequency level) -type KmerSetGroup struct { - id string // Unique identifier of the KmerSetGroup - k int // Size of k-mers (immutable) - sets []*KmerSet // Vector of KmerSet - Metadata map[string]interface{} // Group metadata (not individual sets) -} - -// NewKmerSetGroup creates a new group of n KmerSets -func NewKmerSetGroup(k int, n int) *KmerSetGroup { - if n < 1 { - panic("KmerSetGroup size must be >= 1") - } - - sets := make([]*KmerSet, n) - for i := range sets { - sets[i] = NewKmerSet(k) - } - - return &KmerSetGroup{ - k: k, - sets: sets, - Metadata: make(map[string]interface{}), - } -} - -// K returns the size of k-mers (immutable) -func (ksg *KmerSetGroup) K() int { - return ksg.k -} - -// Size returns the number of KmerSet in the group -func (ksg *KmerSetGroup) Size() int { - return len(ksg.sets) -} - -// Get returns the KmerSet at the given index -// Returns nil if the index is invalid -func (ksg *KmerSetGroup) Get(index int) *KmerSet { - if index < 0 || index >= len(ksg.sets) { - return nil - } - return ksg.sets[index] -} - -// Set replaces the KmerSet at the given index -// Panics if the index is invalid or if k does not match -func (ksg *KmerSetGroup) Set(index int, ks *KmerSet) { - if index < 0 || index >= len(ksg.sets) { - panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets))) - } - if ks.k != ksg.k { - panic(fmt.Sprintf("KmerSet k mismatch: expected %d, got %d", ksg.k, ks.k)) - } - ksg.sets[index] = ks -} - -// Len returns the number of k-mers in a specific KmerSet -// Without argument: returns the number of k-mers in the last KmerSet -// With argument index: returns the number of k-mers in the KmerSet at this index -func (ksg *KmerSetGroup) Len(index ...int) uint64 { - if len(index) == 0 { - // Without argument: last KmerSet - return ksg.sets[len(ksg.sets)-1].Len() - } - - // With argument: specific KmerSet - idx := index[0] - if idx < 0 || idx >= len(ksg.sets) { - return 0 - } - return ksg.sets[idx].Len() -} - -// MemoryUsage returns the total memory usage in bytes -func (ksg *KmerSetGroup) MemoryUsage() uint64 { - total := uint64(0) - for _, ks := range ksg.sets { - total += ks.MemoryUsage() - } - return total -} - -// Clear empties all KmerSet in the group -func (ksg *KmerSetGroup) Clear() { - for _, ks := range ksg.sets { - ks.Clear() - } -} - -// Copy creates a complete copy of the group (consistent with BioSequence.Copy) -func (ksg *KmerSetGroup) Copy() *KmerSetGroup { - copiedSets := make([]*KmerSet, len(ksg.sets)) - for i, ks := range ksg.sets { - copiedSets[i] = ks.Copy() // Copy each KmerSet with its metadata - } - - // Copy group metadata - groupMetadata := make(map[string]interface{}, len(ksg.Metadata)) - for k, v := range ksg.Metadata { - groupMetadata[k] = v - } - - return &KmerSetGroup{ - id: ksg.id, - k: ksg.k, - sets: copiedSets, - Metadata: groupMetadata, - } -} - -// Id returns the identifier of the KmerSetGroup (consistent with BioSequence.Id) -func (ksg *KmerSetGroup) Id() string { - return ksg.id -} - -// SetId sets the identifier of the KmerSetGroup (consistent with BioSequence.SetId) -func (ksg *KmerSetGroup) SetId(id string) { - ksg.id = id -} - -// AddSequence adds all k-mers from a sequence to a specific KmerSet -func (ksg *KmerSetGroup) AddSequence(seq *obiseq.BioSequence, index int) { - if index < 0 || index >= len(ksg.sets) { - panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets))) - } - ksg.sets[index].AddSequence(seq) -} - -// AddSequences adds all k-mers from multiple sequences to a specific KmerSet -func (ksg *KmerSetGroup) AddSequences(sequences *obiseq.BioSequenceSlice, index int) { - if index < 0 || index >= len(ksg.sets) { - panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets))) - } - ksg.sets[index].AddSequences(sequences) -} - -// AddSequenceSlice adds all k-mers from a slice of sequences to a specific KmerSet -func (ksg *KmerSetGroup) AddSequenceSlice(sequences *obiseq.BioSequenceSlice, index int) { - if index < 0 || index >= len(ksg.sets) { - panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets))) - } - ksg.sets[index].AddSequenceSlice(sequences) -} - -// Union returns the union of all KmerSet in the group -// Optimization: starts from the largest set to minimize operations -func (ksg *KmerSetGroup) Union() *KmerSet { - if len(ksg.sets) == 0 { - return NewKmerSet(ksg.k) - } - - if len(ksg.sets) == 1 { - return ksg.sets[0].Copy() - } - - // Find the index of the largest set (the one with the most k-mers) - maxIdx := 0 - maxCard := ksg.sets[0].Len() - for i := 1; i < len(ksg.sets); i++ { - card := ksg.sets[i].Len() - if card > maxCard { - maxCard = card - maxIdx = i - } - } - - // Copy the largest set and perform unions in-place - result := ksg.sets[maxIdx].bitmap.Clone() - for i := 0; i < len(ksg.sets); i++ { - if i != maxIdx { - result.Or(ksg.sets[i].bitmap) - } - } - - return NewKmerSetFromBitmap(ksg.k, result) -} - -// Intersect returns the intersection of all KmerSet in the group -// Optimization: starts from the smallest set to minimize operations -func (ksg *KmerSetGroup) Intersect() *KmerSet { - if len(ksg.sets) == 0 { - return NewKmerSet(ksg.k) - } - - if len(ksg.sets) == 1 { - return ksg.sets[0].Copy() - } - - // Find the index of the smallest set (the one with the fewest k-mers) - minIdx := 0 - minCard := ksg.sets[0].Len() - for i := 1; i < len(ksg.sets); i++ { - card := ksg.sets[i].Len() - if card < minCard { - minCard = card - minIdx = i - } - } - - // Copy the smallest set and perform intersections in-place - result := ksg.sets[minIdx].bitmap.Clone() - for i := 0; i < len(ksg.sets); i++ { - if i != minIdx { - result.And(ksg.sets[i].bitmap) - } - } - - return NewKmerSetFromBitmap(ksg.k, result) -} - -// Stats returns statistics for each KmerSet in the group -type KmerSetGroupStats struct { - K int - Size int // Number of KmerSet - TotalBytes uint64 // Total memory used - Sets []KmerSetStats // Stats of each KmerSet -} - -type KmerSetStats struct { - Index int // Index of the KmerSet in the group - Len uint64 // Number of k-mers - SizeBytes uint64 // Size in bytes -} - -func (ksg *KmerSetGroup) Stats() KmerSetGroupStats { - stats := KmerSetGroupStats{ - K: ksg.k, - Size: len(ksg.sets), - Sets: make([]KmerSetStats, len(ksg.sets)), - } - - for i, ks := range ksg.sets { - sizeBytes := ks.MemoryUsage() - stats.Sets[i] = KmerSetStats{ - Index: i, - Len: ks.Len(), - SizeBytes: sizeBytes, - } - stats.TotalBytes += sizeBytes - } - - return stats -} - -func (ksgs KmerSetGroupStats) String() string { - result := fmt.Sprintf(`KmerSetGroup Statistics (k=%d, size=%d): - Total memory: %.2f MB - -Set breakdown: -`, ksgs.K, ksgs.Size, float64(ksgs.TotalBytes)/1024/1024) - - for _, set := range ksgs.Sets { - result += fmt.Sprintf(" Set[%d]: %d k-mers (%.2f MB)\n", - set.Index, - set.Len, - float64(set.SizeBytes)/1024/1024) - } - - return result -} - -// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix for all KmerSets in the group. -// Returns a triangular distance matrix where element (i, j) represents the Jaccard distance -// between set i and set j. -// -// The Jaccard distance is: 1 - (|A ∩ B| / |A ∪ B|) -// -// The matrix labels are set to the IDs of the individual KmerSets if available, -// otherwise they are set to "set_0", "set_1", etc. -// -// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets -// Space complexity: O(n²) for the distance matrix -func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix { - n := len(ksg.sets) - - // Create labels from set IDs - labels := make([]string, n) - for i, ks := range ksg.sets { - if ks.Id() != "" { - labels[i] = ks.Id() - } else { - labels[i] = fmt.Sprintf("set_%d", i) - } - } - - dm := obidist.NewDistMatrixWithLabels(labels) - - // Compute pairwise distances - for i := 0; i < n-1; i++ { - for j := i + 1; j < n; j++ { - distance := ksg.sets[i].JaccardDistance(ksg.sets[j]) - dm.Set(i, j, distance) - } - } - - return dm -} - -// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix for all KmerSets in the group. -// Returns a similarity matrix where element (i, j) represents the Jaccard similarity -// between set i and set j. -// -// The Jaccard similarity is: |A ∩ B| / |A ∪ B| -// -// The diagonal is 1.0 (similarity of a set to itself). -// -// The matrix labels are set to the IDs of the individual KmerSets if available, -// otherwise they are set to "set_0", "set_1", etc. -// -// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets -// Space complexity: O(n²) for the similarity matrix -func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix { - n := len(ksg.sets) - - // Create labels from set IDs - labels := make([]string, n) - for i, ks := range ksg.sets { - if ks.Id() != "" { - labels[i] = ks.Id() - } else { - labels[i] = fmt.Sprintf("set_%d", i) - } - } - - sm := obidist.NewSimilarityMatrixWithLabels(labels) - - // Compute pairwise similarities - for i := 0; i < n-1; i++ { - for j := i + 1; j < n; j++ { - similarity := ksg.sets[i].JaccardSimilarity(ksg.sets[j]) - sm.Set(i, j, similarity) - } - } - - return sm -} diff --git a/pkg/obikmer/kmer_set_group_jaccard_test.go b/pkg/obikmer/kmer_set_group_jaccard_test.go deleted file mode 100644 index 1e17d02..0000000 --- a/pkg/obikmer/kmer_set_group_jaccard_test.go +++ /dev/null @@ -1,231 +0,0 @@ -package obikmer - -import ( - "math" - "testing" -) - -func TestKmerSetGroupJaccardDistanceMatrix(t *testing.T) { - ksg := NewKmerSetGroup(5, 3) - - // Set 0: {1, 2, 3} - ksg.Get(0).AddKmerCode(1) - ksg.Get(0).AddKmerCode(2) - ksg.Get(0).AddKmerCode(3) - ksg.Get(0).SetId("set_A") - - // Set 1: {2, 3, 4} - ksg.Get(1).AddKmerCode(2) - ksg.Get(1).AddKmerCode(3) - ksg.Get(1).AddKmerCode(4) - ksg.Get(1).SetId("set_B") - - // Set 2: {5, 6, 7} - ksg.Get(2).AddKmerCode(5) - ksg.Get(2).AddKmerCode(6) - ksg.Get(2).AddKmerCode(7) - ksg.Get(2).SetId("set_C") - - dm := ksg.JaccardDistanceMatrix() - - // Check labels - if dm.GetLabel(0) != "set_A" { - t.Errorf("Expected label 'set_A' at index 0, got '%s'", dm.GetLabel(0)) - } - if dm.GetLabel(1) != "set_B" { - t.Errorf("Expected label 'set_B' at index 1, got '%s'", dm.GetLabel(1)) - } - if dm.GetLabel(2) != "set_C" { - t.Errorf("Expected label 'set_C' at index 2, got '%s'", dm.GetLabel(2)) - } - - // Check distances - // Distance(0, 1): - // Intersection: {2, 3} -> 2 elements - // Union: {1, 2, 3, 4} -> 4 elements - // Similarity: 2/4 = 0.5 - // Distance: 1 - 0.5 = 0.5 - expectedDist01 := 0.5 - actualDist01 := dm.Get(0, 1) - if math.Abs(actualDist01-expectedDist01) > 1e-10 { - t.Errorf("Distance(0, 1): expected %f, got %f", expectedDist01, actualDist01) - } - - // Distance(0, 2): - // Intersection: {} -> 0 elements - // Union: {1, 2, 3, 5, 6, 7} -> 6 elements - // Similarity: 0/6 = 0 - // Distance: 1 - 0 = 1.0 - expectedDist02 := 1.0 - actualDist02 := dm.Get(0, 2) - if math.Abs(actualDist02-expectedDist02) > 1e-10 { - t.Errorf("Distance(0, 2): expected %f, got %f", expectedDist02, actualDist02) - } - - // Distance(1, 2): - // Intersection: {} -> 0 elements - // Union: {2, 3, 4, 5, 6, 7} -> 6 elements - // Similarity: 0/6 = 0 - // Distance: 1 - 0 = 1.0 - expectedDist12 := 1.0 - actualDist12 := dm.Get(1, 2) - if math.Abs(actualDist12-expectedDist12) > 1e-10 { - t.Errorf("Distance(1, 2): expected %f, got %f", expectedDist12, actualDist12) - } - - // Check symmetry - if dm.Get(0, 1) != dm.Get(1, 0) { - t.Errorf("Matrix not symmetric: Get(0, 1) = %f, Get(1, 0) = %f", - dm.Get(0, 1), dm.Get(1, 0)) - } - - // Check diagonal - if dm.Get(0, 0) != 0.0 { - t.Errorf("Diagonal should be 0, got %f", dm.Get(0, 0)) - } - if dm.Get(1, 1) != 0.0 { - t.Errorf("Diagonal should be 0, got %f", dm.Get(1, 1)) - } - if dm.Get(2, 2) != 0.0 { - t.Errorf("Diagonal should be 0, got %f", dm.Get(2, 2)) - } -} - -func TestKmerSetGroupJaccardSimilarityMatrix(t *testing.T) { - ksg := NewKmerSetGroup(5, 3) - - // Set 0: {1, 2, 3} - ksg.Get(0).AddKmerCode(1) - ksg.Get(0).AddKmerCode(2) - ksg.Get(0).AddKmerCode(3) - - // Set 1: {2, 3, 4} - ksg.Get(1).AddKmerCode(2) - ksg.Get(1).AddKmerCode(3) - ksg.Get(1).AddKmerCode(4) - - // Set 2: {1, 2, 3} (same as set 0) - ksg.Get(2).AddKmerCode(1) - ksg.Get(2).AddKmerCode(2) - ksg.Get(2).AddKmerCode(3) - - sm := ksg.JaccardSimilarityMatrix() - - // Check similarities - // Similarity(0, 1): 0.5 (as calculated above) - expectedSim01 := 0.5 - actualSim01 := sm.Get(0, 1) - if math.Abs(actualSim01-expectedSim01) > 1e-10 { - t.Errorf("Similarity(0, 1): expected %f, got %f", expectedSim01, actualSim01) - } - - // Similarity(0, 2): 1.0 (identical sets) - expectedSim02 := 1.0 - actualSim02 := sm.Get(0, 2) - if math.Abs(actualSim02-expectedSim02) > 1e-10 { - t.Errorf("Similarity(0, 2): expected %f, got %f", expectedSim02, actualSim02) - } - - // Similarity(1, 2): 0.5 - // Intersection: {2, 3} -> 2 - // Union: {1, 2, 3, 4} -> 4 - // Similarity: 2/4 = 0.5 - expectedSim12 := 0.5 - actualSim12 := sm.Get(1, 2) - if math.Abs(actualSim12-expectedSim12) > 1e-10 { - t.Errorf("Similarity(1, 2): expected %f, got %f", expectedSim12, actualSim12) - } - - // Check diagonal (similarity to self = 1.0) - if sm.Get(0, 0) != 1.0 { - t.Errorf("Diagonal should be 1.0, got %f", sm.Get(0, 0)) - } - if sm.Get(1, 1) != 1.0 { - t.Errorf("Diagonal should be 1.0, got %f", sm.Get(1, 1)) - } - if sm.Get(2, 2) != 1.0 { - t.Errorf("Diagonal should be 1.0, got %f", sm.Get(2, 2)) - } -} - -func TestKmerSetGroupJaccardMatricesRelation(t *testing.T) { - ksg := NewKmerSetGroup(5, 4) - - // Create different sets - ksg.Get(0).AddKmerCode(1) - ksg.Get(0).AddKmerCode(2) - - ksg.Get(1).AddKmerCode(2) - ksg.Get(1).AddKmerCode(3) - - ksg.Get(2).AddKmerCode(1) - ksg.Get(2).AddKmerCode(2) - ksg.Get(2).AddKmerCode(3) - - ksg.Get(3).AddKmerCode(10) - ksg.Get(3).AddKmerCode(20) - - dm := ksg.JaccardDistanceMatrix() - sm := ksg.JaccardSimilarityMatrix() - - // For all pairs (including diagonal), distance + similarity should equal 1.0 - for i := 0; i < 4; i++ { - for j := 0; j < 4; j++ { - distance := dm.Get(i, j) - similarity := sm.Get(i, j) - sum := distance + similarity - - if math.Abs(sum-1.0) > 1e-10 { - t.Errorf("At (%d, %d): distance %f + similarity %f = %f, expected 1.0", - i, j, distance, similarity, sum) - } - } - } -} - -func TestKmerSetGroupJaccardMatrixLabels(t *testing.T) { - ksg := NewKmerSetGroup(5, 3) - - // Don't set IDs - should use default labels - ksg.Get(0).AddKmerCode(1) - ksg.Get(1).AddKmerCode(2) - ksg.Get(2).AddKmerCode(3) - - dm := ksg.JaccardDistanceMatrix() - - // Check default labels - if dm.GetLabel(0) != "set_0" { - t.Errorf("Expected default label 'set_0', got '%s'", dm.GetLabel(0)) - } - if dm.GetLabel(1) != "set_1" { - t.Errorf("Expected default label 'set_1', got '%s'", dm.GetLabel(1)) - } - if dm.GetLabel(2) != "set_2" { - t.Errorf("Expected default label 'set_2', got '%s'", dm.GetLabel(2)) - } -} - -func TestKmerSetGroupJaccardMatrixSize(t *testing.T) { - ksg := NewKmerSetGroup(5, 5) - - for i := 0; i < 5; i++ { - ksg.Get(i).AddKmerCode(uint64(i)) - } - - dm := ksg.JaccardDistanceMatrix() - - if dm.Size() != 5 { - t.Errorf("Expected matrix size 5, got %d", dm.Size()) - } - - // All sets are disjoint, so all distances should be 1.0 - for i := 0; i < 5; i++ { - for j := i + 1; j < 5; j++ { - dist := dm.Get(i, j) - if math.Abs(dist-1.0) > 1e-10 { - t.Errorf("Expected distance 1.0 for disjoint sets (%d, %d), got %f", - i, j, dist) - } - } - } -} diff --git a/pkg/obikmer/kmer_set_group_quorum.go b/pkg/obikmer/kmer_set_group_quorum.go deleted file mode 100644 index 4f21f95..0000000 --- a/pkg/obikmer/kmer_set_group_quorum.go +++ /dev/null @@ -1,235 +0,0 @@ -package obikmer - -import ( - "container/heap" - - "github.com/RoaringBitmap/roaring/roaring64" -) - -// heapItem represents an element in the min-heap for k-way merge -type heapItem struct { - value uint64 - idx int -} - -// kmerMinHeap implements heap.Interface for k-way merge algorithm -type kmerMinHeap []heapItem - -func (h kmerMinHeap) Len() int { return len(h) } -func (h kmerMinHeap) Less(i, j int) bool { return h[i].value < h[j].value } -func (h kmerMinHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } - -func (h *kmerMinHeap) Push(x interface{}) { - *h = append(*h, x.(heapItem)) -} - -func (h *kmerMinHeap) Pop() interface{} { - old := *h - n := len(old) - x := old[n-1] - *h = old[0 : n-1] - return x -} - -// QuorumAtLeast returns k-mers present in at least q sets -// -// Algorithm: K-way merge with min-heap counting -// -// The algorithm processes all k-mers in sorted order using a min-heap: -// -// 1. Initialize one iterator per non-empty set -// 2. Build a min-heap of (value, set_index) pairs, one per iterator -// 3. While heap is not empty: -// a. Extract the minimum value v from heap -// b. Pop ALL heap items with value == v (counting occurrences) -// c. If count >= q, add v to result -// d. Advance each popped iterator and re-insert into heap if valid -// -// This ensures each unique k-mer is counted exactly once across all sets. -// -// Time complexity: O(M log N) -// - M = sum of all set cardinalities (total k-mer occurrences) -// - N = number of sets -// - Each k-mer occurrence is inserted/extracted from heap once: O(M) operations -// - Each heap operation costs O(log N) -// -// Space complexity: O(N) -// - Heap contains at most N elements (one per set iterator) -// - Output bitmap size depends on quorum result -// -// Special cases (optimized): -// - q <= 0: returns empty set -// - q == 1: delegates to Union() (native OR operations) -// - q == n: delegates to Intersect() (native AND operations) -// - q > n: returns empty set (impossible to satisfy) -func (ksg *KmerSetGroup) QuorumAtLeast(q int) *KmerSet { - n := len(ksg.sets) - - // Edge cases - if q <= 0 || n == 0 { - return NewKmerSet(ksg.k) - } - if q > n { - return NewKmerSet(ksg.k) - } - if q == 1 { - return ksg.Union() - } - if q == n { - return ksg.Intersect() - } - - // Initialize iterators for all non-empty sets - iterators := make([]roaring64.IntIterable64, 0, n) - iterIndices := make([]int, 0, n) - - for i, set := range ksg.sets { - if set.Len() > 0 { - iter := set.bitmap.Iterator() - if iter.HasNext() { - iterators = append(iterators, iter) - iterIndices = append(iterIndices, i) - } - } - } - - if len(iterators) == 0 { - return NewKmerSet(ksg.k) - } - - // Initialize heap with first value from each iterator - h := make(kmerMinHeap, len(iterators)) - for i, iter := range iterators { - h[i] = heapItem{value: iter.Next(), idx: i} - } - heap.Init(&h) - - // Result bitmap - result := roaring64.New() - - // K-way merge with counting - for len(h) > 0 { - minVal := h[0].value - count := 0 - activeIndices := make([]int, 0, len(h)) - - // Pop all elements with same value (count occurrences) - for len(h) > 0 && h[0].value == minVal { - item := heap.Pop(&h).(heapItem) - count++ - activeIndices = append(activeIndices, item.idx) - } - - // Add to result if quorum reached - if count >= q { - result.Add(minVal) - } - - // Advance iterators and re-insert into heap - for _, iterIdx := range activeIndices { - if iterators[iterIdx].HasNext() { - heap.Push(&h, heapItem{ - value: iterators[iterIdx].Next(), - idx: iterIdx, - }) - } - } - } - - return NewKmerSetFromBitmap(ksg.k, result) -} - -// QuorumAtMost returns k-mers present in at most q sets -// -// Algorithm: Uses the mathematical identity -// AtMost(q) = Union() - AtLeast(q+1) -// -// Proof: -// - Union() contains all k-mers present in at least 1 set -// - AtLeast(q+1) contains all k-mers present in q+1 or more sets -// - Their difference contains only k-mers present in at most q sets -// -// Implementation: -// 1. Compute U = Union() -// 2. Compute A = QuorumAtLeast(q+1) -// 3. Return U - A using bitmap AndNot operation -// -// Time complexity: O(M log N) -// - Union(): O(M) with native OR operations -// - QuorumAtLeast(q+1): O(M log N) -// - AndNot: O(|U|) where |U| <= M -// - Total: O(M log N) -// -// Space complexity: O(N) -// - Inherited from QuorumAtLeast heap -// -// Special cases: -// - q <= 0: returns empty set -// - q >= n: returns Union() (all k-mers are in at most n sets) -func (ksg *KmerSetGroup) QuorumAtMost(q int) *KmerSet { - n := len(ksg.sets) - - // Edge cases - if q <= 0 { - return NewKmerSet(ksg.k) - } - if q >= n { - return ksg.Union() - } - - // Compute Union() - AtLeast(q+1) - union := ksg.Union() - atLeastQ1 := ksg.QuorumAtLeast(q + 1) - - // Difference: elements in union but not in atLeastQ1 - result := union.bitmap.Clone() - result.AndNot(atLeastQ1.bitmap) - - return NewKmerSetFromBitmap(ksg.k, result) -} - -// QuorumExactly returns k-mers present in exactly q sets -// -// Algorithm: Uses the mathematical identity -// Exactly(q) = AtLeast(q) - AtLeast(q+1) -// -// Proof: -// - AtLeast(q) contains all k-mers present in q or more sets -// - AtLeast(q+1) contains all k-mers present in q+1 or more sets -// - Their difference contains only k-mers present in exactly q sets -// -// Implementation: -// 1. Compute A = QuorumAtLeast(q) -// 2. Compute B = QuorumAtLeast(q+1) -// 3. Return A - B using bitmap AndNot operation -// -// Time complexity: O(M log N) -// - Two calls to QuorumAtLeast: 2 * O(M log N) -// - One AndNot operation: O(|A|) where |A| <= M -// - Total: O(M log N) since AndNot is dominated by merge operations -// -// Space complexity: O(N) -// - Inherited from QuorumAtLeast heap -// - Two temporary bitmaps for intermediate results -// -// Special cases: -// - q <= 0: returns empty set -// - q > n: returns empty set (impossible to have k-mer in more than n sets) -func (ksg *KmerSetGroup) QuorumExactly(q int) *KmerSet { - n := len(ksg.sets) - - // Edge cases - if q <= 0 || q > n { - return NewKmerSet(ksg.k) - } - - // Compute AtLeast(q) - AtLeast(q+1) - aq := ksg.QuorumAtLeast(q) - aq1 := ksg.QuorumAtLeast(q + 1) - - // Difference: elements in aq but not in aq1 - result := aq.bitmap.Clone() - result.AndNot(aq1.bitmap) - - return NewKmerSetFromBitmap(ksg.k, result) -} diff --git a/pkg/obikmer/kmer_set_group_quorum_test.go b/pkg/obikmer/kmer_set_group_quorum_test.go deleted file mode 100644 index ab11319..0000000 --- a/pkg/obikmer/kmer_set_group_quorum_test.go +++ /dev/null @@ -1,395 +0,0 @@ -package obikmer - -import ( - "testing" -) - -// TestQuorumAtLeastEdgeCases tests edge cases for QuorumAtLeast -func TestQuorumAtLeastEdgeCases(t *testing.T) { - k := 5 - - // Test group with all empty sets - emptyGroup := NewKmerSetGroup(k, 3) - result := emptyGroup.QuorumAtLeast(1) - if result.Len() != 0 { - t.Errorf("Empty sets: expected 0 k-mers, got %d", result.Len()) - } - - // Test q <= 0 - group := NewKmerSetGroup(k, 3) - result = group.QuorumAtLeast(0) - if result.Len() != 0 { - t.Errorf("q=0: expected 0 k-mers, got %d", result.Len()) - } - - result = group.QuorumAtLeast(-1) - if result.Len() != 0 { - t.Errorf("q=-1: expected 0 k-mers, got %d", result.Len()) - } - - // Test q > n - group.Get(0).AddKmerCode(1) - result = group.QuorumAtLeast(10) - if result.Len() != 0 { - t.Errorf("q>n: expected 0 k-mers, got %d", result.Len()) - } -} - -// TestQuorumAtLeastQ1 tests q=1 (should equal Union) -func TestQuorumAtLeastQ1(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 3) - - // Add different k-mers to each set - group.Get(0).AddKmerCode(1) - group.Get(0).AddKmerCode(2) - group.Get(1).AddKmerCode(2) - group.Get(1).AddKmerCode(3) - group.Get(2).AddKmerCode(3) - group.Get(2).AddKmerCode(4) - - quorum := group.QuorumAtLeast(1) - union := group.Union() - - if quorum.Len() != union.Len() { - t.Errorf("QuorumAtLeast(1) length %d != Union length %d", quorum.Len(), union.Len()) - } - - // Check all elements match - for kmer := uint64(1); kmer <= 4; kmer++ { - if quorum.Contains(kmer) != union.Contains(kmer) { - t.Errorf("Mismatch for k-mer %d", kmer) - } - } -} - -// TestQuorumAtLeastQN tests q=n (should equal Intersect) -func TestQuorumAtLeastQN(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 3) - - // Add some common k-mers and some unique - for i := 0; i < 3; i++ { - group.Get(i).AddKmerCode(10) // common to all - group.Get(i).AddKmerCode(20) // common to all - } - group.Get(0).AddKmerCode(1) // unique to set 0 - group.Get(1).AddKmerCode(2) // unique to set 1 - - quorum := group.QuorumAtLeast(3) - intersect := group.Intersect() - - if quorum.Len() != intersect.Len() { - t.Errorf("QuorumAtLeast(n) length %d != Intersect length %d", quorum.Len(), intersect.Len()) - } - - if quorum.Len() != 2 { - t.Errorf("Expected 2 common k-mers, got %d", quorum.Len()) - } - - if !quorum.Contains(10) || !quorum.Contains(20) { - t.Error("Missing common k-mers") - } - - if quorum.Contains(1) || quorum.Contains(2) { - t.Error("Unique k-mers should not be in result") - } -} - -// TestQuorumAtLeastGeneral tests general quorum values -func TestQuorumAtLeastGeneral(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 5) - - // Setup: k-mer i appears in i sets (for i=1..5) - // k-mer 1: in set 0 - // k-mer 2: in sets 0,1 - // k-mer 3: in sets 0,1,2 - // k-mer 4: in sets 0,1,2,3 - // k-mer 5: in sets 0,1,2,3,4 (all) - - for kmer := uint64(1); kmer <= 5; kmer++ { - for setIdx := 0; setIdx < int(kmer); setIdx++ { - group.Get(setIdx).AddKmerCode(kmer) - } - } - - tests := []struct { - q int - expected map[uint64]bool - }{ - {1, map[uint64]bool{1: true, 2: true, 3: true, 4: true, 5: true}}, - {2, map[uint64]bool{2: true, 3: true, 4: true, 5: true}}, - {3, map[uint64]bool{3: true, 4: true, 5: true}}, - {4, map[uint64]bool{4: true, 5: true}}, - {5, map[uint64]bool{5: true}}, - } - - for _, tt := range tests { - result := group.QuorumAtLeast(tt.q) - - if result.Len() != uint64(len(tt.expected)) { - t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len()) - } - - for kmer := uint64(1); kmer <= 5; kmer++ { - shouldContain := tt.expected[kmer] - doesContain := result.Contains(kmer) - if shouldContain != doesContain { - t.Errorf("q=%d, k-mer=%d: expected contains=%v, got %v", tt.q, kmer, shouldContain, doesContain) - } - } - } -} - -// TestQuorumExactlyBasic tests QuorumExactly basic functionality -func TestQuorumExactlyBasic(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 5) - - // Setup: k-mer i appears in exactly i sets - for kmer := uint64(1); kmer <= 5; kmer++ { - for setIdx := 0; setIdx < int(kmer); setIdx++ { - group.Get(setIdx).AddKmerCode(kmer) - } - } - - tests := []struct { - q int - expected []uint64 - }{ - {1, []uint64{1}}, - {2, []uint64{2}}, - {3, []uint64{3}}, - {4, []uint64{4}}, - {5, []uint64{5}}, - } - - for _, tt := range tests { - result := group.QuorumExactly(tt.q) - - if result.Len() != uint64(len(tt.expected)) { - t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len()) - } - - for _, kmer := range tt.expected { - if !result.Contains(kmer) { - t.Errorf("q=%d: missing k-mer %d", tt.q, kmer) - } - } - } -} - -// TestQuorumIdentity tests the mathematical identity: Exactly(q) = AtLeast(q) - AtLeast(q+1) -func TestQuorumIdentity(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 4) - - // Add random distribution - group.Get(0).AddKmerCode(1) - group.Get(0).AddKmerCode(2) - group.Get(0).AddKmerCode(3) - - group.Get(1).AddKmerCode(2) - group.Get(1).AddKmerCode(3) - group.Get(1).AddKmerCode(4) - - group.Get(2).AddKmerCode(3) - group.Get(2).AddKmerCode(4) - - group.Get(3).AddKmerCode(4) - - for q := 1; q <= 4; q++ { - exactly := group.QuorumExactly(q) - atLeast := group.QuorumAtLeast(q) - atLeastPlus1 := group.QuorumAtLeast(q + 1) - - // Verify: every element in exactly(q) is in atLeast(q) - iter := exactly.Iterator() - for iter.HasNext() { - kmer := iter.Next() - if !atLeast.Contains(kmer) { - t.Errorf("q=%d: k-mer %d in Exactly but not in AtLeast", q, kmer) - } - if atLeastPlus1.Contains(kmer) { - t.Errorf("q=%d: k-mer %d in Exactly but also in AtLeast(q+1)", q, kmer) - } - } - } -} - -// TestQuorumDisjointSets tests quorum on completely disjoint sets -func TestQuorumDisjointSets(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 3) - - // Each set has unique k-mers - group.Get(0).AddKmerCode(1) - group.Get(1).AddKmerCode(2) - group.Get(2).AddKmerCode(3) - - // q=1 should give all - result := group.QuorumAtLeast(1) - if result.Len() != 3 { - t.Errorf("Disjoint sets q=1: expected 3, got %d", result.Len()) - } - - // q=2 should give none - result = group.QuorumAtLeast(2) - if result.Len() != 0 { - t.Errorf("Disjoint sets q=2: expected 0, got %d", result.Len()) - } -} - -// TestQuorumIdenticalSets tests quorum on identical sets -func TestQuorumIdenticalSets(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 3) - - // All sets have same k-mers - for i := 0; i < 3; i++ { - group.Get(i).AddKmerCode(10) - group.Get(i).AddKmerCode(20) - group.Get(i).AddKmerCode(30) - } - - // Any q <= n should give all k-mers - for q := 1; q <= 3; q++ { - result := group.QuorumAtLeast(q) - if result.Len() != 3 { - t.Errorf("Identical sets q=%d: expected 3, got %d", q, result.Len()) - } - } -} - -// TestQuorumLargeNumbers tests with large k-mer values -func TestQuorumLargeNumbers(t *testing.T) { - k := 21 - group := NewKmerSetGroup(k, 3) - - // Use large uint64 values (actual k-mer encodings) - largeKmers := []uint64{ - 0x1234567890ABCDEF, - 0xFEDCBA0987654321, - 0xAAAAAAAAAAAAAAAA, - } - - // Add to multiple sets - for i := 0; i < 3; i++ { - for j := 0; j <= i; j++ { - group.Get(j).AddKmerCode(largeKmers[i]) - } - } - - result := group.QuorumAtLeast(2) - if result.Len() != 2 { - t.Errorf("Large numbers q=2: expected 2, got %d", result.Len()) - } - - if !result.Contains(largeKmers[1]) || !result.Contains(largeKmers[2]) { - t.Error("Large numbers: wrong k-mers in result") - } -} - -// TestQuorumAtMostBasic tests QuorumAtMost basic functionality -func TestQuorumAtMostBasic(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 5) - - // Setup: k-mer i appears in exactly i sets - for kmer := uint64(1); kmer <= 5; kmer++ { - for setIdx := 0; setIdx < int(kmer); setIdx++ { - group.Get(setIdx).AddKmerCode(kmer) - } - } - - tests := []struct { - q int - expected []uint64 - }{ - {0, []uint64{}}, // at most 0: none - {1, []uint64{1}}, // at most 1: only k-mer 1 - {2, []uint64{1, 2}}, // at most 2: k-mers 1,2 - {3, []uint64{1, 2, 3}}, // at most 3: k-mers 1,2,3 - {4, []uint64{1, 2, 3, 4}}, // at most 4: k-mers 1,2,3,4 - {5, []uint64{1, 2, 3, 4, 5}}, // at most 5: all k-mers - {10, []uint64{1, 2, 3, 4, 5}}, // at most 10: all k-mers - } - - for _, tt := range tests { - result := group.QuorumAtMost(tt.q) - - if result.Len() != uint64(len(tt.expected)) { - t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len()) - } - - for _, kmer := range tt.expected { - if !result.Contains(kmer) { - t.Errorf("q=%d: missing k-mer %d", tt.q, kmer) - } - } - } -} - -// TestQuorumComplementIdentity tests that AtLeast and AtMost are complementary -func TestQuorumComplementIdentity(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 4) - - // Add random distribution - group.Get(0).AddKmerCode(1) - group.Get(0).AddKmerCode(2) - group.Get(0).AddKmerCode(3) - - group.Get(1).AddKmerCode(2) - group.Get(1).AddKmerCode(3) - group.Get(1).AddKmerCode(4) - - group.Get(2).AddKmerCode(3) - group.Get(2).AddKmerCode(4) - - group.Get(3).AddKmerCode(4) - - union := group.Union() - - for q := 1; q < 4; q++ { - atMost := group.QuorumAtMost(q) - atLeast := group.QuorumAtLeast(q + 1) - - // Verify: AtMost(q) ∪ AtLeast(q+1) = Union() - combined := atMost.Union(atLeast) - - if combined.Len() != union.Len() { - t.Errorf("q=%d: AtMost(q) ∪ AtLeast(q+1) has %d k-mers, Union has %d", - q, combined.Len(), union.Len()) - } - - // Verify: AtMost(q) ∩ AtLeast(q+1) = ∅ - overlap := atMost.Intersect(atLeast) - if overlap.Len() != 0 { - t.Errorf("q=%d: AtMost(q) and AtLeast(q+1) overlap with %d k-mers", - q, overlap.Len()) - } - } -} - -// BenchmarkQuorumAtLeast benchmarks quorum operations -func BenchmarkQuorumAtLeast(b *testing.B) { - k := 21 - n := 10 - group := NewKmerSetGroup(k, n) - - // Populate with realistic data - for i := 0; i < n; i++ { - for j := uint64(0); j < 10000; j++ { - if (j % uint64(n)) <= uint64(i) { - group.Get(i).AddKmerCode(j) - } - } - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _ = group.QuorumAtLeast(5) - } -} diff --git a/pkg/obikmer/kmer_set_persistence.go b/pkg/obikmer/kmer_set_persistence.go deleted file mode 100644 index 3bdc2ae..0000000 --- a/pkg/obikmer/kmer_set_persistence.go +++ /dev/null @@ -1,376 +0,0 @@ -package obikmer - -import ( - "encoding/json" - "fmt" - "os" - "path/filepath" - "strings" - - "github.com/pelletier/go-toml/v2" - "gopkg.in/yaml.v3" -) - -// MetadataFormat represents the metadata serialization format -type MetadataFormat int - -const ( - FormatTOML MetadataFormat = iota - FormatYAML - FormatJSON -) - -// String returns the file extension for the format -func (f MetadataFormat) String() string { - switch f { - case FormatTOML: - return "toml" - case FormatYAML: - return "yaml" - case FormatJSON: - return "json" - default: - return "toml" - } -} - -// KmerSetMetadata contient les métadonnées d'un KmerSet ou KmerSetGroup -type KmerSetMetadata struct { - ID string `toml:"id,omitempty" yaml:"id,omitempty" json:"id,omitempty"` // Identifiant unique - K int `toml:"k" yaml:"k" json:"k"` // Taille des k-mers - Type string `toml:"type" yaml:"type" json:"type"` // "KmerSet" ou "KmerSetGroup" - Size int `toml:"size" yaml:"size" json:"size"` // 1 pour KmerSet, n pour KmerSetGroup - Files []string `toml:"files" yaml:"files" json:"files"` // Liste des fichiers .roaring - SetsIDs []string `toml:"sets_ids,omitempty" yaml:"sets_ids,omitempty" json:"sets_ids,omitempty"` // IDs des KmerSet individuels - UserMetadata map[string]interface{} `toml:"user_metadata,omitempty" yaml:"user_metadata,omitempty" json:"user_metadata,omitempty"` // Métadonnées KmerSet ou KmerSetGroup - SetsMetadata []map[string]interface{} `toml:"sets_metadata,omitempty" yaml:"sets_metadata,omitempty" json:"sets_metadata,omitempty"` // Métadonnées des KmerSet individuels dans un KmerSetGroup -} - -// SaveKmerSet sauvegarde un KmerSet dans un répertoire -// Format: directory/metadata.{toml,yaml,json} + directory/set_0.roaring -func (ks *KmerSet) Save(directory string, format MetadataFormat) error { - // Créer le répertoire si nécessaire - if err := os.MkdirAll(directory, 0755); err != nil { - return fmt.Errorf("failed to create directory %s: %w", directory, err) - } - - // Métadonnées - metadata := KmerSetMetadata{ - ID: ks.id, - K: ks.k, - Type: "KmerSet", - Size: 1, - Files: []string{"set_0.roaring"}, - UserMetadata: ks.Metadata, // Sauvegarder les métadonnées utilisateur - } - - // Sauvegarder les métadonnées - if err := saveMetadata(filepath.Join(directory, "metadata."+format.String()), metadata, format); err != nil { - return err - } - - // Sauvegarder le bitmap - bitmapPath := filepath.Join(directory, "set_0.roaring") - file, err := os.Create(bitmapPath) - if err != nil { - return fmt.Errorf("failed to create bitmap file %s: %w", bitmapPath, err) - } - defer file.Close() - - if _, err := ks.bitmap.WriteTo(file); err != nil { - return fmt.Errorf("failed to write bitmap: %w", err) - } - - return nil -} - -// LoadKmerSet charge un KmerSet depuis un répertoire -func LoadKmerSet(directory string) (*KmerSet, error) { - // Lire les métadonnées (essayer tous les formats) - metadata, err := loadMetadata(directory) - if err != nil { - return nil, err - } - - // Vérifier le type - if metadata.Type != "KmerSet" { - return nil, fmt.Errorf("invalid type: expected KmerSet, got %s", metadata.Type) - } - - // Vérifier qu'il n'y a qu'un seul fichier - if metadata.Size != 1 || len(metadata.Files) != 1 { - return nil, fmt.Errorf("KmerSet must have exactly 1 bitmap file, got %d", len(metadata.Files)) - } - - // Charger le bitmap - bitmapPath := filepath.Join(directory, metadata.Files[0]) - file, err := os.Open(bitmapPath) - if err != nil { - return nil, fmt.Errorf("failed to open bitmap file %s: %w", bitmapPath, err) - } - defer file.Close() - - ks := NewKmerSet(metadata.K) - - // Charger l'ID - ks.id = metadata.ID - - // Charger les métadonnées utilisateur - if metadata.UserMetadata != nil { - ks.Metadata = metadata.UserMetadata - } - - if _, err := ks.bitmap.ReadFrom(file); err != nil { - return nil, fmt.Errorf("failed to read bitmap: %w", err) - } - - return ks, nil -} - -// SaveKmerSetGroup sauvegarde un KmerSetGroup dans un répertoire -// Format: directory/metadata.{toml,yaml,json} + directory/set_0.roaring, set_1.roaring, ... -func (ksg *KmerSetGroup) Save(directory string, format MetadataFormat) error { - // Créer le répertoire si nécessaire - if err := os.MkdirAll(directory, 0755); err != nil { - return fmt.Errorf("failed to create directory %s: %w", directory, err) - } - - // Métadonnées - files := make([]string, len(ksg.sets)) - for i := range ksg.sets { - files[i] = fmt.Sprintf("set_%d.roaring", i) - } - - // Collecter les IDs et métadonnées de chaque KmerSet individuel - setsIDs := make([]string, len(ksg.sets)) - setsMetadata := make([]map[string]interface{}, len(ksg.sets)) - for i, ks := range ksg.sets { - setsIDs[i] = ks.id - setsMetadata[i] = ks.Metadata - } - - metadata := KmerSetMetadata{ - ID: ksg.id, - K: ksg.k, - Type: "KmerSetGroup", - Size: len(ksg.sets), - Files: files, - SetsIDs: setsIDs, // IDs de chaque set - UserMetadata: ksg.Metadata, // Métadonnées du groupe - SetsMetadata: setsMetadata, // Métadonnées de chaque set - } - - // Sauvegarder les métadonnées - if err := saveMetadata(filepath.Join(directory, "metadata."+format.String()), metadata, format); err != nil { - return err - } - - // Sauvegarder chaque bitmap - for i, ks := range ksg.sets { - bitmapPath := filepath.Join(directory, files[i]) - file, err := os.Create(bitmapPath) - if err != nil { - return fmt.Errorf("failed to create bitmap file %s: %w", bitmapPath, err) - } - - if _, err := ks.bitmap.WriteTo(file); err != nil { - file.Close() - return fmt.Errorf("failed to write bitmap %d: %w", i, err) - } - file.Close() - } - - return nil -} - -// LoadKmerSetGroup charge un KmerSetGroup depuis un répertoire -func LoadKmerSetGroup(directory string) (*KmerSetGroup, error) { - // Lire les métadonnées (essayer tous les formats) - metadata, err := loadMetadata(directory) - if err != nil { - return nil, err - } - - // Vérifier le type - if metadata.Type != "KmerSetGroup" { - return nil, fmt.Errorf("invalid type: expected KmerSetGroup, got %s", metadata.Type) - } - - // Vérifier la cohérence - if metadata.Size != len(metadata.Files) { - return nil, fmt.Errorf("size mismatch: size=%d but %d files listed", metadata.Size, len(metadata.Files)) - } - - // Créer le groupe - ksg := NewKmerSetGroup(metadata.K, metadata.Size) - - // Charger l'ID du groupe - ksg.id = metadata.ID - - // Charger les métadonnées du groupe - if metadata.UserMetadata != nil { - ksg.Metadata = metadata.UserMetadata - } - - // Charger les IDs de chaque KmerSet - if metadata.SetsIDs != nil && len(metadata.SetsIDs) == metadata.Size { - for i := range ksg.sets { - ksg.sets[i].id = metadata.SetsIDs[i] - } - } - - // Charger les métadonnées de chaque KmerSet individuel - if metadata.SetsMetadata != nil { - if len(metadata.SetsMetadata) != metadata.Size { - return nil, fmt.Errorf("sets metadata size mismatch: expected %d, got %d", metadata.Size, len(metadata.SetsMetadata)) - } - for i := range ksg.sets { - ksg.sets[i].Metadata = metadata.SetsMetadata[i] - } - } - - // Charger chaque bitmap - for i, filename := range metadata.Files { - bitmapPath := filepath.Join(directory, filename) - file, err := os.Open(bitmapPath) - if err != nil { - return nil, fmt.Errorf("failed to open bitmap file %s: %w", bitmapPath, err) - } - - if _, err := ksg.sets[i].bitmap.ReadFrom(file); err != nil { - file.Close() - return nil, fmt.Errorf("failed to read bitmap %d: %w", i, err) - } - file.Close() - } - - return ksg, nil -} - -// saveMetadata sauvegarde les métadonnées dans le format spécifié -func saveMetadata(path string, metadata KmerSetMetadata, format MetadataFormat) error { - file, err := os.Create(path) - if err != nil { - return fmt.Errorf("failed to create metadata file %s: %w", path, err) - } - defer file.Close() - - var encoder interface{ Encode(interface{}) error } - - switch format { - case FormatTOML: - encoder = toml.NewEncoder(file) - case FormatYAML: - encoder = yaml.NewEncoder(file) - case FormatJSON: - jsonEncoder := json.NewEncoder(file) - jsonEncoder.SetIndent("", " ") - encoder = jsonEncoder - default: - return fmt.Errorf("unsupported format: %v", format) - } - - if err := encoder.Encode(metadata); err != nil { - return fmt.Errorf("failed to encode metadata: %w", err) - } - - return nil -} - -// loadMetadata charge les métadonnées depuis un répertoire -// Essaie tous les formats (TOML, YAML, JSON) dans l'ordre -func loadMetadata(directory string) (*KmerSetMetadata, error) { - formats := []MetadataFormat{FormatTOML, FormatYAML, FormatJSON} - - var lastErr error - for _, format := range formats { - path := filepath.Join(directory, "metadata."+format.String()) - - // Vérifier si le fichier existe - if _, err := os.Stat(path); os.IsNotExist(err) { - continue - } - - metadata, err := loadMetadataFromFile(path, format) - if err != nil { - lastErr = err - continue - } - return metadata, nil - } - - if lastErr != nil { - return nil, fmt.Errorf("failed to load metadata: %w", lastErr) - } - return nil, fmt.Errorf("no metadata file found in %s (tried .toml, .yaml, .json)", directory) -} - -// loadMetadataFromFile charge les métadonnées depuis un fichier spécifique -func loadMetadataFromFile(path string, format MetadataFormat) (*KmerSetMetadata, error) { - file, err := os.Open(path) - if err != nil { - return nil, fmt.Errorf("failed to open metadata file %s: %w", path, err) - } - defer file.Close() - - var metadata KmerSetMetadata - var decoder interface{ Decode(interface{}) error } - - switch format { - case FormatTOML: - decoder = toml.NewDecoder(file) - case FormatYAML: - decoder = yaml.NewDecoder(file) - case FormatJSON: - decoder = json.NewDecoder(file) - default: - return nil, fmt.Errorf("unsupported format: %v", format) - } - - if err := decoder.Decode(&metadata); err != nil { - return nil, fmt.Errorf("failed to decode metadata: %w", err) - } - - return &metadata, nil -} - -// DetectFormat détecte le format des métadonnées dans un répertoire -func DetectFormat(directory string) (MetadataFormat, error) { - formats := []MetadataFormat{FormatTOML, FormatYAML, FormatJSON} - - for _, format := range formats { - path := filepath.Join(directory, "metadata."+format.String()) - if _, err := os.Stat(path); err == nil { - return format, nil - } - } - - return FormatTOML, fmt.Errorf("no metadata file found in %s", directory) -} - -// IsKmerSetDirectory vérifie si un répertoire contient un KmerSet ou KmerSetGroup -func IsKmerSetDirectory(directory string) (bool, string, error) { - metadata, err := loadMetadata(directory) - if err != nil { - return false, "", err - } - - return true, metadata.Type, nil -} - -// ListBitmapFiles liste tous les fichiers .roaring dans un répertoire -func ListBitmapFiles(directory string) ([]string, error) { - entries, err := os.ReadDir(directory) - if err != nil { - return nil, fmt.Errorf("failed to read directory %s: %w", directory, err) - } - - var files []string - for _, entry := range entries { - if !entry.IsDir() && strings.HasSuffix(entry.Name(), ".roaring") { - files = append(files, entry.Name()) - } - } - - return files, nil -} diff --git a/pkg/obikmer/kmer_set_test.go b/pkg/obikmer/kmer_set_test.go deleted file mode 100644 index 77144c7..0000000 --- a/pkg/obikmer/kmer_set_test.go +++ /dev/null @@ -1,272 +0,0 @@ -package obikmer - -import ( - "math" - "testing" -) - -func TestJaccardDistanceIdentical(t *testing.T) { - ks1 := NewKmerSet(5) - ks1.AddKmerCode(100) - ks1.AddKmerCode(200) - ks1.AddKmerCode(300) - - ks2 := NewKmerSet(5) - ks2.AddKmerCode(100) - ks2.AddKmerCode(200) - ks2.AddKmerCode(300) - - distance := ks1.JaccardDistance(ks2) - similarity := ks1.JaccardSimilarity(ks2) - - if distance != 0.0 { - t.Errorf("Expected distance 0.0 for identical sets, got %f", distance) - } - - if similarity != 1.0 { - t.Errorf("Expected similarity 1.0 for identical sets, got %f", similarity) - } -} - -func TestJaccardDistanceDisjoint(t *testing.T) { - ks1 := NewKmerSet(5) - ks1.AddKmerCode(100) - ks1.AddKmerCode(200) - ks1.AddKmerCode(300) - - ks2 := NewKmerSet(5) - ks2.AddKmerCode(400) - ks2.AddKmerCode(500) - ks2.AddKmerCode(600) - - distance := ks1.JaccardDistance(ks2) - similarity := ks1.JaccardSimilarity(ks2) - - if distance != 1.0 { - t.Errorf("Expected distance 1.0 for disjoint sets, got %f", distance) - } - - if similarity != 0.0 { - t.Errorf("Expected similarity 0.0 for disjoint sets, got %f", similarity) - } -} - -func TestJaccardDistancePartialOverlap(t *testing.T) { - // Set 1: {1, 2, 3} - ks1 := NewKmerSet(5) - ks1.AddKmerCode(1) - ks1.AddKmerCode(2) - ks1.AddKmerCode(3) - - // Set 2: {2, 3, 4} - ks2 := NewKmerSet(5) - ks2.AddKmerCode(2) - ks2.AddKmerCode(3) - ks2.AddKmerCode(4) - - // Intersection: {2, 3} -> cardinality = 2 - // Union: {1, 2, 3, 4} -> cardinality = 4 - // Similarity = 2/4 = 0.5 - // Distance = 1 - 0.5 = 0.5 - - distance := ks1.JaccardDistance(ks2) - similarity := ks1.JaccardSimilarity(ks2) - - expectedDistance := 0.5 - expectedSimilarity := 0.5 - - if math.Abs(distance-expectedDistance) > 1e-10 { - t.Errorf("Expected distance %f, got %f", expectedDistance, distance) - } - - if math.Abs(similarity-expectedSimilarity) > 1e-10 { - t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity) - } -} - -func TestJaccardDistanceOneSubsetOfOther(t *testing.T) { - // Set 1: {1, 2} - ks1 := NewKmerSet(5) - ks1.AddKmerCode(1) - ks1.AddKmerCode(2) - - // Set 2: {1, 2, 3, 4} - ks2 := NewKmerSet(5) - ks2.AddKmerCode(1) - ks2.AddKmerCode(2) - ks2.AddKmerCode(3) - ks2.AddKmerCode(4) - - // Intersection: {1, 2} -> cardinality = 2 - // Union: {1, 2, 3, 4} -> cardinality = 4 - // Similarity = 2/4 = 0.5 - // Distance = 1 - 0.5 = 0.5 - - distance := ks1.JaccardDistance(ks2) - similarity := ks1.JaccardSimilarity(ks2) - - expectedDistance := 0.5 - expectedSimilarity := 0.5 - - if math.Abs(distance-expectedDistance) > 1e-10 { - t.Errorf("Expected distance %f, got %f", expectedDistance, distance) - } - - if math.Abs(similarity-expectedSimilarity) > 1e-10 { - t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity) - } -} - -func TestJaccardDistanceEmptySets(t *testing.T) { - ks1 := NewKmerSet(5) - ks2 := NewKmerSet(5) - - distance := ks1.JaccardDistance(ks2) - similarity := ks1.JaccardSimilarity(ks2) - - // By convention, distance = 1.0 for empty sets - if distance != 1.0 { - t.Errorf("Expected distance 1.0 for empty sets, got %f", distance) - } - - if similarity != 0.0 { - t.Errorf("Expected similarity 0.0 for empty sets, got %f", similarity) - } -} - -func TestJaccardDistanceOneEmpty(t *testing.T) { - ks1 := NewKmerSet(5) - ks1.AddKmerCode(1) - ks1.AddKmerCode(2) - ks1.AddKmerCode(3) - - ks2 := NewKmerSet(5) - - distance := ks1.JaccardDistance(ks2) - similarity := ks1.JaccardSimilarity(ks2) - - // Intersection: {} -> cardinality = 0 - // Union: {1, 2, 3} -> cardinality = 3 - // Similarity = 0/3 = 0.0 - // Distance = 1.0 - - if distance != 1.0 { - t.Errorf("Expected distance 1.0 when one set is empty, got %f", distance) - } - - if similarity != 0.0 { - t.Errorf("Expected similarity 0.0 when one set is empty, got %f", similarity) - } -} - -func TestJaccardDistanceDifferentK(t *testing.T) { - ks1 := NewKmerSet(5) - ks1.AddKmerCode(1) - - ks2 := NewKmerSet(7) - ks2.AddKmerCode(1) - - defer func() { - if r := recover(); r == nil { - t.Errorf("Expected panic when computing Jaccard distance with different k values") - } - }() - - _ = ks1.JaccardDistance(ks2) -} - -func TestJaccardDistanceSimilarityRelation(t *testing.T) { - // Test that distance + similarity = 1.0 for all cases - testCases := []struct { - name string - ks1 *KmerSet - ks2 *KmerSet - }{ - { - name: "partial overlap", - ks1: func() *KmerSet { - ks := NewKmerSet(5) - ks.AddKmerCode(1) - ks.AddKmerCode(2) - ks.AddKmerCode(3) - return ks - }(), - ks2: func() *KmerSet { - ks := NewKmerSet(5) - ks.AddKmerCode(2) - ks.AddKmerCode(3) - ks.AddKmerCode(4) - ks.AddKmerCode(5) - return ks - }(), - }, - { - name: "identical", - ks1: func() *KmerSet { - ks := NewKmerSet(5) - ks.AddKmerCode(10) - ks.AddKmerCode(20) - return ks - }(), - ks2: func() *KmerSet { - ks := NewKmerSet(5) - ks.AddKmerCode(10) - ks.AddKmerCode(20) - return ks - }(), - }, - { - name: "disjoint", - ks1: func() *KmerSet { - ks := NewKmerSet(5) - ks.AddKmerCode(1) - return ks - }(), - ks2: func() *KmerSet { - ks := NewKmerSet(5) - ks.AddKmerCode(100) - return ks - }(), - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - distance := tc.ks1.JaccardDistance(tc.ks2) - similarity := tc.ks1.JaccardSimilarity(tc.ks2) - - sum := distance + similarity - - if math.Abs(sum-1.0) > 1e-10 { - t.Errorf("Expected distance + similarity = 1.0, got %f + %f = %f", - distance, similarity, sum) - } - }) - } -} - -func TestJaccardDistanceSymmetry(t *testing.T) { - ks1 := NewKmerSet(5) - ks1.AddKmerCode(1) - ks1.AddKmerCode(2) - ks1.AddKmerCode(3) - - ks2 := NewKmerSet(5) - ks2.AddKmerCode(2) - ks2.AddKmerCode(3) - ks2.AddKmerCode(4) - - distance1 := ks1.JaccardDistance(ks2) - distance2 := ks2.JaccardDistance(ks1) - - similarity1 := ks1.JaccardSimilarity(ks2) - similarity2 := ks2.JaccardSimilarity(ks1) - - if math.Abs(distance1-distance2) > 1e-10 { - t.Errorf("Jaccard distance not symmetric: %f vs %f", distance1, distance2) - } - - if math.Abs(similarity1-similarity2) > 1e-10 { - t.Errorf("Jaccard similarity not symmetric: %f vs %f", similarity1, similarity2) - } -} diff --git a/pkg/obikmer/minimizer_utils.go b/pkg/obikmer/minimizer_utils.go new file mode 100644 index 0000000..8221eb3 --- /dev/null +++ b/pkg/obikmer/minimizer_utils.go @@ -0,0 +1,47 @@ +package obikmer + +import ( + "math" + + log "github.com/sirupsen/logrus" +) + +// DefaultMinimizerSize returns ceil(k / 2.5) as a reasonable default minimizer size. +func DefaultMinimizerSize(k int) int { + m := int(math.Ceil(float64(k) / 2.5)) + if m < 1 { + m = 1 + } + if m >= k { + m = k - 1 + } + return m +} + +// MinMinimizerSize returns the minimum m such that 4^m >= nworkers, +// i.e. ceil(log(nworkers) / log(4)). +func MinMinimizerSize(nworkers int) int { + if nworkers <= 1 { + return 1 + } + return int(math.Ceil(math.Log(float64(nworkers)) / math.Log(4))) +} + +// ValidateMinimizerSize checks and adjusts the minimizer size to satisfy constraints: +// - m >= ceil(log(nworkers)/log(4)) +// - 1 <= m < k +func ValidateMinimizerSize(m, k, nworkers int) int { + minM := MinMinimizerSize(nworkers) + if m < minM { + log.Warnf("Minimizer size %d too small for %d workers (4^%d = %d < %d), adjusting to %d", + m, nworkers, m, 1<<(2*m), nworkers, minM) + m = minM + } + if m < 1 { + m = 1 + } + if m >= k { + m = k - 1 + } + return m +} diff --git a/pkg/obikmer/skm_reader.go b/pkg/obikmer/skm_reader.go new file mode 100644 index 0000000..64ef6f9 --- /dev/null +++ b/pkg/obikmer/skm_reader.go @@ -0,0 +1,67 @@ +package obikmer + +import ( + "bufio" + "encoding/binary" + "io" + "os" +) + +// decode2bit maps 2-bit codes back to nucleotide bytes. +var decode2bit = [4]byte{'a', 'c', 'g', 't'} + +// SkmReader reads super-kmers from a binary .skm file. +type SkmReader struct { + r *bufio.Reader + file *os.File +} + +// NewSkmReader opens a .skm file for reading. +func NewSkmReader(path string) (*SkmReader, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + return &SkmReader{ + r: bufio.NewReaderSize(f, 65536), + file: f, + }, nil +} + +// Next reads the next super-kmer from the file. +// Returns the SuperKmer and true, or a zero SuperKmer and false at EOF. +func (sr *SkmReader) Next() (SuperKmer, bool) { + // Read length + var lenbuf [2]byte + if _, err := io.ReadFull(sr.r, lenbuf[:]); err != nil { + return SuperKmer{}, false + } + seqLen := int(binary.LittleEndian.Uint16(lenbuf[:])) + + // Read packed bytes + nBytes := (seqLen + 3) / 4 + packed := make([]byte, nBytes) + if _, err := io.ReadFull(sr.r, packed); err != nil { + return SuperKmer{}, false + } + + // Decode to nucleotide bytes + seq := make([]byte, seqLen) + for i := 0; i < seqLen; i++ { + byteIdx := i / 4 + bitPos := uint(6 - (i%4)*2) + code := (packed[byteIdx] >> bitPos) & 0x03 + seq[i] = decode2bit[code] + } + + return SuperKmer{ + Sequence: seq, + Start: 0, + End: seqLen, + }, true +} + +// Close closes the underlying file. +func (sr *SkmReader) Close() error { + return sr.file.Close() +} diff --git a/pkg/obikmer/skm_test.go b/pkg/obikmer/skm_test.go new file mode 100644 index 0000000..7bc4734 --- /dev/null +++ b/pkg/obikmer/skm_test.go @@ -0,0 +1,176 @@ +package obikmer + +import ( + "os" + "path/filepath" + "testing" +) + +func TestSkmRoundTrip(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "test.skm") + + // Create super-kmers from a known sequence + seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT") + k := 21 + m := 9 + superKmers := ExtractSuperKmers(seq, k, m, nil) + if len(superKmers) == 0 { + t.Fatal("no super-kmers extracted") + } + + // Write + w, err := NewSkmWriter(path) + if err != nil { + t.Fatal(err) + } + for _, sk := range superKmers { + if err := w.Write(sk); err != nil { + t.Fatal(err) + } + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Read back + r, err := NewSkmReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + idx := 0 + for { + sk, ok := r.Next() + if !ok { + break + } + if idx >= len(superKmers) { + t.Fatal("read more super-kmers than written") + } + expected := superKmers[idx] + if len(sk.Sequence) != len(expected.Sequence) { + t.Fatalf("super-kmer %d: length mismatch: got %d, want %d", + idx, len(sk.Sequence), len(expected.Sequence)) + } + // Compare nucleotide-by-nucleotide (case insensitive since decode produces lowercase) + for j := range sk.Sequence { + got := sk.Sequence[j] | 0x20 + want := expected.Sequence[j] | 0x20 + if got != want { + t.Fatalf("super-kmer %d pos %d: got %c, want %c", idx, j, got, want) + } + } + idx++ + } + if idx != len(superKmers) { + t.Fatalf("read %d super-kmers, want %d", idx, len(superKmers)) + } +} + +func TestSkmEmptyFile(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "empty.skm") + + // Write nothing + w, err := NewSkmWriter(path) + if err != nil { + t.Fatal(err) + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Read back + r, err := NewSkmReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + _, ok := r.Next() + if ok { + t.Fatal("expected no super-kmers in empty file") + } +} + +func TestSkmSingleBase(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "single.skm") + + // Test with sequences of various lengths to check padding + sequences := [][]byte{ + []byte("A"), + []byte("AC"), + []byte("ACG"), + []byte("ACGT"), + []byte("ACGTA"), + } + + w, err := NewSkmWriter(path) + if err != nil { + t.Fatal(err) + } + for _, seq := range sequences { + sk := SuperKmer{Sequence: seq} + if err := w.Write(sk); err != nil { + t.Fatal(err) + } + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + r, err := NewSkmReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + for i, expected := range sequences { + sk, ok := r.Next() + if !ok { + t.Fatalf("expected super-kmer %d, got EOF", i) + } + if len(sk.Sequence) != len(expected) { + t.Fatalf("sk %d: length %d, want %d", i, len(sk.Sequence), len(expected)) + } + for j := range sk.Sequence { + got := sk.Sequence[j] | 0x20 + want := expected[j] | 0x20 + if got != want { + t.Fatalf("sk %d pos %d: got %c, want %c", i, j, got, want) + } + } + } +} + +func TestSkmFileSize(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "size.skm") + + // Write a sequence of known length + seq := []byte("ACGTACGTAC") // 10 bases + sk := SuperKmer{Sequence: seq} + + w, err := NewSkmWriter(path) + if err != nil { + t.Fatal(err) + } + if err := w.Write(sk); err != nil { + t.Fatal(err) + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Expected: 2 bytes (length) + ceil(10/4)=3 bytes (data) = 5 bytes + info, err := os.Stat(path) + if err != nil { + t.Fatal(err) + } + if info.Size() != 5 { + t.Fatalf("file size: got %d, want 5", info.Size()) + } +} diff --git a/pkg/obikmer/skm_writer.go b/pkg/obikmer/skm_writer.go new file mode 100644 index 0000000..1123d2e --- /dev/null +++ b/pkg/obikmer/skm_writer.go @@ -0,0 +1,74 @@ +package obikmer + +import ( + "bufio" + "encoding/binary" + "os" +) + +// SkmWriter writes super-kmers to a binary .skm file. +// +// Format per super-kmer: +// +// [len: uint16 LE] length of the super-kmer in bases +// [data: ceil(len/4) bytes] sequence encoded 2 bits/base, packed +// +// Nucleotide encoding: A=00, C=01, G=10, T=11. +// The last byte is zero-padded on the low bits if len%4 != 0. +type SkmWriter struct { + w *bufio.Writer + file *os.File +} + +// NewSkmWriter creates a new SkmWriter writing to the given file path. +func NewSkmWriter(path string) (*SkmWriter, error) { + f, err := os.Create(path) + if err != nil { + return nil, err + } + return &SkmWriter{ + w: bufio.NewWriterSize(f, 65536), + file: f, + }, nil +} + +// Write encodes a SuperKmer to the .skm file. +// The sequence bytes are packed 2 bits per base. +func (sw *SkmWriter) Write(sk SuperKmer) error { + seq := sk.Sequence + seqLen := uint16(len(seq)) + + // Write length + var lenbuf [2]byte + binary.LittleEndian.PutUint16(lenbuf[:], seqLen) + if _, err := sw.w.Write(lenbuf[:]); err != nil { + return err + } + + // Encode and write packed sequence (2 bits/base) + nBytes := (int(seqLen) + 3) / 4 + for i := 0; i < nBytes; i++ { + var packed byte + for j := 0; j < 4; j++ { + pos := i*4 + j + packed <<= 2 + if pos < int(seqLen) { + packed |= __single_base_code__[seq[pos]&31] + } + } + if err := sw.w.WriteByte(packed); err != nil { + return err + } + } + + return nil +} + +// Close flushes buffered data and closes the underlying file. +func (sw *SkmWriter) Close() error { + if err := sw.w.Flush(); err != nil { + sw.file.Close() + return err + } + return sw.file.Close() +} diff --git a/pkg/obikmer/varint.go b/pkg/obikmer/varint.go new file mode 100644 index 0000000..cae6475 --- /dev/null +++ b/pkg/obikmer/varint.go @@ -0,0 +1,53 @@ +package obikmer + +import "io" + +// EncodeVarint writes a uint64 value as a variable-length integer to w. +// Uses 7 bits per byte with the high bit as a continuation flag +// (identical to protobuf unsigned varint encoding). +// Returns the number of bytes written. +func EncodeVarint(w io.Writer, v uint64) (int, error) { + var buf [10]byte // max 10 bytes for uint64 varint + n := 0 + for v >= 0x80 { + buf[n] = byte(v) | 0x80 + v >>= 7 + n++ + } + buf[n] = byte(v) + n++ + return w.Write(buf[:n]) +} + +// DecodeVarint reads a variable-length encoded uint64 from r. +// Returns the decoded value and any error encountered. +func DecodeVarint(r io.Reader) (uint64, error) { + var val uint64 + var shift uint + var buf [1]byte + + for { + if _, err := io.ReadFull(r, buf[:]); err != nil { + return 0, err + } + b := buf[0] + val |= uint64(b&0x7F) << shift + if b < 0x80 { + return val, nil + } + shift += 7 + if shift >= 70 { + return 0, io.ErrUnexpectedEOF + } + } +} + +// VarintLen returns the number of bytes needed to encode v as a varint. +func VarintLen(v uint64) int { + n := 1 + for v >= 0x80 { + v >>= 7 + n++ + } + return n +} diff --git a/pkg/obikmer/varint_test.go b/pkg/obikmer/varint_test.go new file mode 100644 index 0000000..ebe466b --- /dev/null +++ b/pkg/obikmer/varint_test.go @@ -0,0 +1,82 @@ +package obikmer + +import ( + "bytes" + "testing" +) + +func TestVarintRoundTrip(t *testing.T) { + values := []uint64{ + 0, 1, 127, 128, 255, 256, + 16383, 16384, + 1<<21 - 1, 1 << 21, + 1<<28 - 1, 1 << 28, + 1<<35 - 1, 1 << 35, + 1<<42 - 1, 1 << 42, + 1<<49 - 1, 1 << 49, + 1<<56 - 1, 1 << 56, + 1<<63 - 1, 1 << 63, + ^uint64(0), // max uint64 + } + + for _, v := range values { + var buf bytes.Buffer + n, err := EncodeVarint(&buf, v) + if err != nil { + t.Fatalf("EncodeVarint(%d): %v", v, err) + } + if n != VarintLen(v) { + t.Fatalf("EncodeVarint(%d): wrote %d bytes, VarintLen says %d", v, n, VarintLen(v)) + } + + decoded, err := DecodeVarint(&buf) + if err != nil { + t.Fatalf("DecodeVarint for %d: %v", v, err) + } + if decoded != v { + t.Fatalf("roundtrip failed: encoded %d, decoded %d", v, decoded) + } + } +} + +func TestVarintLen(t *testing.T) { + tests := []struct { + value uint64 + expected int + }{ + {0, 1}, + {127, 1}, + {128, 2}, + {16383, 2}, + {16384, 3}, + {^uint64(0), 10}, + } + + for _, tc := range tests { + got := VarintLen(tc.value) + if got != tc.expected { + t.Errorf("VarintLen(%d) = %d, want %d", tc.value, got, tc.expected) + } + } +} + +func TestVarintSequence(t *testing.T) { + var buf bytes.Buffer + values := []uint64{0, 42, 1000000, ^uint64(0), 1} + + for _, v := range values { + if _, err := EncodeVarint(&buf, v); err != nil { + t.Fatalf("EncodeVarint(%d): %v", v, err) + } + } + + for _, expected := range values { + got, err := DecodeVarint(&buf) + if err != nil { + t.Fatalf("DecodeVarint: %v", err) + } + if got != expected { + t.Errorf("got %d, want %d", got, expected) + } + } +} diff --git a/pkg/obitools/obikindex/obikindex.go b/pkg/obitools/obikindex/obikindex.go index 3fb634c..6c504cf 100644 --- a/pkg/obitools/obikindex/obikindex.go +++ b/pkg/obitools/obikindex/obikindex.go @@ -7,8 +7,8 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" ) -// CLIBuildKmerIndex reads sequences from the iterator and builds a kmer index -// saved as a roaring bitmap directory. +// CLIBuildKmerIndex reads sequences from the iterator and builds a +// disk-based kmer index using the KmerSetGroupBuilder. func CLIBuildKmerIndex(iterator obiiter.IBioSequence) { // Validate output directory outDir := CLIOutputDirectory() @@ -31,64 +31,56 @@ func CLIBuildKmerIndex(iterator obiiter.IBioSequence) { log.Fatalf("Invalid min-occurrence: %d (must be >= 1)", minOcc) } - // Resolve metadata format - format := CLIMetadataFormat() - log.Infof("Building kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc) - if minOcc <= 1 { - // Simple KmerSet mode - ks := obikmer.BuildKmerIndex(iterator, k, m) + // Build options + var opts []obikmer.BuilderOption + if minOcc > 1 { + opts = append(opts, obikmer.WithMinFrequency(minOcc)) + } - // Apply metadata - applyKmerSetMetadata(ks) + // Create builder (1 set, auto partitions) + builder, err := obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...) + if err != nil { + log.Fatalf("Failed to create kmer index builder: %v", err) + } - // Save - log.Infof("Saving KmerSet to %s", outDir) - if err := ks.Save(outDir, format); err != nil { - log.Fatalf("Failed to save kmer index: %v", err) - } - } else { - // FrequencyFilter mode - ff := obikmer.BuildFrequencyFilterIndex(iterator, k, m, minOcc) - - if CLISaveFullFilter() { - // Save the full filter (all levels) - applyMetadataGroup(ff.KmerSetGroup) - - log.Infof("Saving full FrequencyFilter to %s", outDir) - if err := ff.Save(outDir, format); err != nil { - log.Fatalf("Failed to save frequency filter: %v", err) - } - } else { - // Save only the filtered KmerSet (k-mers with freq >= minOcc) - ks := ff.GetFilteredSet() - applyKmerSetMetadata(ks) - ks.SetAttribute("min_occurrence", minOcc) - - log.Infof("Saving filtered KmerSet (freq >= %d) to %s", minOcc, outDir) - if err := ks.Save(outDir, format); err != nil { - log.Fatalf("Failed to save filtered kmer index: %v", err) - } + // Feed sequences + seqCount := 0 + for iterator.Next() { + batch := iterator.Get() + for _, seq := range batch.Slice() { + builder.AddSequence(0, seq) + seqCount++ } } + log.Infof("Processed %d sequences", seqCount) + + // Finalize + ksg, err := builder.Close() + if err != nil { + log.Fatalf("Failed to finalize kmer index: %v", err) + } + + // Apply metadata + applyMetadata(ksg) + + if minOcc > 1 { + ksg.SetAttribute("min_occurrence", minOcc) + } + + // Persist metadata updates + if err := ksg.SaveMetadata(); err != nil { + log.Fatalf("Failed to save metadata: %v", err) + } + + log.Infof("Index contains %d k-mers in %s", ksg.Len(0), outDir) log.Info("Done.") } -// applyKmerSetMetadata sets index-id and --set-tag metadata on a KmerSet. -func applyKmerSetMetadata(ks *obikmer.KmerSet) { - if id := CLIIndexId(); id != "" { - ks.SetId(id) - } - - for key, value := range CLISetTag() { - ks.SetAttribute(key, value) - } -} - -// applyMetadataGroup sets index-id and --set-tag metadata on a KmerSetGroup. -func applyMetadataGroup(ksg *obikmer.KmerSetGroup) { +// applyMetadata sets index-id and --set-tag metadata on a KmerSetGroup. +func applyMetadata(ksg *obikmer.KmerSetGroup) { if id := CLIIndexId(); id != "" { ksg.SetId(id) }