diff --git a/.gitignore b/.gitignore index b3d2a3c..4cf23ab 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,4 @@ LLM/** entropy.html bug_id.txt obilowmask_ref +test_* diff --git a/blackboard/Prospective/kmer_disk_index_plan.md b/blackboard/Prospective/kmer_disk_index_plan.md new file mode 100644 index 0000000..43e5af3 --- /dev/null +++ b/blackboard/Prospective/kmer_disk_index_plan.md @@ -0,0 +1,508 @@ +# Plan de refonte du package obikmer : index disk-based par partitions minimizer + +## Constat + +Les roaring64 bitmaps ne sont pas adaptés au stockage de 10^10 k-mers +(k=31) dispersés sur un espace de 2^62. L'overhead structurel (containers +roaring par high key 32 bits) dépasse la taille des données elles-mêmes, +et les opérations `Or()` entre bitmaps fragmentés ne terminent pas en +temps raisonnable. + +## Principe de la nouvelle architecture + +Un `KmerSet` est un ensemble trié de k-mers canoniques (uint64) stocké +sur disque, partitionné par minimizer. Chaque partition est un fichier +binaire contenant des uint64 triés, compressés par delta-varint. + +Un `KmerSetGroup` est un répertoire contenant N ensembles partitionnés +de la même façon (même k, même m, même P). + +Un `KmerSet` est un `KmerSetGroup` de taille 1 (singleton). + +Les opérations ensemblistes se font partition par partition, en merge +streaming, sans charger l'index complet en mémoire. + +## Cycle de vie d'un index + +L'index a trois phases distinctes : + +1. **Phase de construction (mutable)** : on ouvre un index, on y ajoute + des séquences. Pour chaque séquence, les super-kmers sont extraits + et écrits de manière compacte (2 bits/base) dans le fichier + temporaire de partition correspondant (`minimizer % P`). Les + super-kmers sont une représentation compressée naturelle des k-mers + chevauchants : un super-kmer de longueur L encode L-k+1 k-mers en + ne stockant que ~L/4 bytes au lieu de (L-k+1) × 8 bytes. + +2. 
**Phase de clôture (optimisation)** : on ferme l'index, ce qui + déclenche le traitement **partition par partition** (indépendant, + parallélisable) : + - Charger les super-kmers de la partition + - En extraire tous les k-mers canoniques + - Trier le tableau de k-mers + - Dédupliquer (et compter si FrequencyFilter) + - Delta-encoder et écrire le fichier .kdi final + Après clôture, l'index est statique et immuable. + +3. **Phase de lecture (immutable)** : opérations ensemblistes, + Jaccard, Quorum, Contains, itération. Toutes en streaming. + +--- + +## Format sur disque + +### Index finalisé + +``` +index_dir/ + metadata.toml + set_0/ + part_0000.kdi + part_0001.kdi + ... + part_{P-1}.kdi + set_1/ + part_0000.kdi + ... + ... + set_{N-1}/ + ... +``` + +### Fichiers temporaires pendant la construction + +``` +index_dir/ + .build/ + set_0/ + part_0000.skm # super-kmers encodés 2 bits/base + part_0001.skm + ... + set_1/ + ... +``` + +Le répertoire `.build/` est supprimé après Close(). + +### metadata.toml + +```toml +id = "mon_index" +k = 31 +m = 13 +partitions = 1024 +type = "KmerSetGroup" # ou "KmerSet" (N=1) +size = 3 # nombre de sets (N) +sets_ids = ["genome_A", "genome_B", "genome_C"] + +[user_metadata] +organism = "Triticum aestivum" + +[sets_metadata] +# métadonnées individuelles par set si nécessaire +``` + +### Fichier .kdi (Kmer Delta Index) + +Format binaire : + +``` +[magic: 4 bytes "KDI\x01"] +[count: uint64 little-endian] # nombre de k-mers dans cette partition +[first: uint64 little-endian] # premier k-mer (valeur absolue) +[delta_1: varint] # arr[1] - arr[0] +[delta_2: varint] # arr[2] - arr[1] +... +[delta_{count-1}: varint] # arr[count-1] - arr[count-2] +``` + +Varint : encoding unsigned, 7 bits utiles par byte, bit de poids fort += continuation (identique au varint protobuf). + +Fichier vide (partition sans k-mer) : magic + count=0. 
+ +### Fichier .skm (Super-Kmer temporaire) + +Format binaire, séquence de super-kmers encodés : + +``` +[len: uint16 little-endian] # longueur du super-kmer en bases +[sequence: ceil(len/4) bytes] # séquence encodée 2 bits/base, packed +... +``` + +**Compression par rapport au stockage de k-mers bruts** : + +Un super-kmer de longueur L contient L-k+1 k-mers. +- Stockage super-kmer : 2 + ceil(L/4) bytes +- Stockage k-mers bruts : (L-k+1) × 8 bytes + +Exemple avec k=31, super-kmer typique L=50 : +- Super-kmer : 2 + 13 = 15 bytes → encode 20 k-mers +- K-mers bruts : 20 × 8 = 160 bytes +- **Facteur de compression : ~10×** + +Pour un génome de 10 Gbases (~10^10 k-mers bruts) : +- K-mers bruts : ~80 Go par set temporaire +- Super-kmers : **~8 Go** par set temporaire + +Avec FrequencyFilter et couverture 30× : +- K-mers bruts : ~2.4 To +- Super-kmers : **~240 Go** + +--- + +## FrequencyFilter + +Le FrequencyFilter n'est plus un type de données séparé. C'est un +**mode de construction** du builder. Le résultat est un KmerSetGroup +standard. + +### Principe + +Pendant la construction, tous les super-kmers sont écrits dans les +fichiers temporaires .skm, y compris les doublons (chaque occurrence +de chaque séquence est écrite). + +Pendant Close(), pour chaque partition : +1. Charger tous les super-kmers de la partition +2. Extraire tous les k-mers canoniques dans un tableau []uint64 +3. Trier le tableau +4. Parcourir linéairement : les k-mers identiques sont consécutifs +5. Compter les occurrences de chaque k-mer +6. Si count >= minFreq → écrire dans le .kdi final (une seule fois) +7. 
Sinon → ignorer + +### Dimensionnement + +Pour un génome de 10 Gbases avec couverture 30× : +- N_brut ≈ 3×10^11 k-mers bruts +- Espace temporaire .skm ≈ 240 Go (compressé super-kmer) +- RAM par partition pendant Close() : + Avec P=1024 : ~3×10^8 k-mers/partition × 8 = **~2.4 Go** + Avec P=4096 : ~7.3×10^7 k-mers/partition × 8 = **~600 Mo** + +Le choix de P détermine le compromis nombre de fichiers vs RAM par +partition. + +### Sans FrequencyFilter (déduplication simple) + +Pour de la déduplication simple (chaque k-mer écrit une fois), le +builder peut dédupliquer au niveau des buffers en RAM avant flush. +Cela réduit significativement l'espace temporaire car les doublons +au sein d'un même buffer (provenant de séquences proches) sont +éliminés immédiatement. + +--- + +## API publique visée + +### Structures + +```go +// KmerSetGroup est l'entité de base. +// Un KmerSet est un KmerSetGroup avec Size() == 1. +type KmerSetGroup struct { + // champs internes : path, k, m, P, N, metadata, état +} + +// KmerSetGroupBuilder construit un KmerSetGroup mutable. +type KmerSetGroupBuilder struct { + // champs internes : buffers I/O par partition et par set, + // fichiers temporaires .skm, paramètres (minFreq, etc.) +} +``` + +### Construction + +```go +// NewKmerSetGroupBuilder crée un builder pour un nouveau KmerSetGroup. +// directory : répertoire de destination +// k : taille des k-mers (1-31) +// m : taille des minimizers (-1 pour auto = ceil(k/2.5)) +// n : nombre de sets dans le groupe +// P : nombre de partitions (-1 pour auto) +// options : options de construction (FrequencyFilter, etc.) +func NewKmerSetGroupBuilder(directory string, k, m, n, P int, + options ...BuilderOption) (*KmerSetGroupBuilder, error) + +// WithMinFrequency active le mode FrequencyFilter. +// Seuls les k-mers vus >= minFreq fois sont conservés dans l'index +// final. Les super-kmers sont écrits avec leurs doublons pendant +// la construction ; le comptage exact se fait au Close(). 
+func WithMinFrequency(minFreq int) BuilderOption + +// AddSequence extrait les super-kmers d'une séquence et les écrit +// dans les fichiers temporaires de partition du set i. +func (b *KmerSetGroupBuilder) AddSequence(setIndex int, seq *obiseq.BioSequence) + +// AddSuperKmer écrit un super-kmer dans le fichier temporaire de +// sa partition pour le set i. +func (b *KmerSetGroupBuilder) AddSuperKmer(setIndex int, sk SuperKmer) + +// Close finalise la construction : +// - flush des buffers d'écriture +// - pour chaque partition de chaque set (parallélisable) : +// - charger les super-kmers depuis le .skm +// - extraire les k-mers canoniques +// - trier, dédupliquer (compter si freq filter) +// - delta-encoder et écrire le .kdi +// - écrire metadata.toml +// - supprimer le répertoire .build/ +// Retourne le KmerSetGroup en lecture seule. +func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) +``` + +### Lecture et opérations + +```go +// OpenKmerSetGroup ouvre un index finalisé en lecture seule. +func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) + +// --- Métadonnées (API inchangée) --- +func (ksg *KmerSetGroup) K() int +func (ksg *KmerSetGroup) M() int // nouveau : taille du minimizer +func (ksg *KmerSetGroup) Partitions() int // nouveau : nombre de partitions +func (ksg *KmerSetGroup) Size() int +func (ksg *KmerSetGroup) Id() string +func (ksg *KmerSetGroup) SetId(id string) +func (ksg *KmerSetGroup) HasAttribute(key string) bool +func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool) +func (ksg *KmerSetGroup) SetAttribute(key string, value interface{}) +// ... etc (toute l'API attributs actuelle est conservée) + +// --- Opérations ensemblistes --- +// Toutes produisent un nouveau KmerSetGroup singleton sur disque. +// Opèrent partition par partition en streaming. 
+ +func (ksg *KmerSetGroup) Union(outputDir string) (*KmerSetGroup, error) +func (ksg *KmerSetGroup) Intersect(outputDir string) (*KmerSetGroup, error) +func (ksg *KmerSetGroup) Difference(outputDir string) (*KmerSetGroup, error) +func (ksg *KmerSetGroup) QuorumAtLeast(q int, outputDir string) (*KmerSetGroup, error) +func (ksg *KmerSetGroup) QuorumExactly(q int, outputDir string) (*KmerSetGroup, error) +func (ksg *KmerSetGroup) QuorumAtMost(q int, outputDir string) (*KmerSetGroup, error) + +// --- Opérations entre deux KmerSetGroups --- +// Les deux groupes doivent avoir les mêmes k, m, P. + +func (ksg *KmerSetGroup) UnionWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error) +func (ksg *KmerSetGroup) IntersectWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error) + +// --- Métriques (résultat en mémoire, pas de sortie disque) --- + +func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix +func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix + +// --- Accès individuel --- + +func (ksg *KmerSetGroup) Len(setIndex ...int) uint64 +func (ksg *KmerSetGroup) Contains(setIndex int, kmer uint64) bool +func (ksg *KmerSetGroup) Iterator(setIndex int) iter.Seq[uint64] +``` + +--- + +## Implémentation interne + +### Primitives bas niveau + +**`varint.go`** : encode/decode varint uint64 + +```go +func EncodeVarint(w io.Writer, v uint64) (int, error) +func DecodeVarint(r io.Reader) (uint64, error) +``` + +### Format .kdi + +**`kdi_writer.go`** : écriture d'un fichier .kdi à partir d'un flux +trié de uint64 (delta-encode au vol). + +```go +type KdiWriter struct { ... } +func NewKdiWriter(path string) (*KdiWriter, error) +func (w *KdiWriter) Write(kmer uint64) error +func (w *KdiWriter) Close() error +``` + +**`kdi_reader.go`** : lecture streaming d'un fichier .kdi (décode +les deltas au vol). + +```go +type KdiReader struct { ... 
} +func NewKdiReader(path string) (*KdiReader, error) +func (r *KdiReader) Next() (uint64, bool) +func (r *KdiReader) Count() uint64 +func (r *KdiReader) Close() error +``` + +### Format .skm + +**`skm_writer.go`** : écriture de super-kmers encodés 2 bits/base. + +```go +type SkmWriter struct { ... } +func NewSkmWriter(path string) (*SkmWriter, error) +func (w *SkmWriter) Write(sk SuperKmer) error +func (w *SkmWriter) Close() error +``` + +**`skm_reader.go`** : lecture de super-kmers depuis un fichier .skm. + +```go +type SkmReader struct { ... } +func NewSkmReader(path string) (*SkmReader, error) +func (r *SkmReader) Next() (SuperKmer, bool) +func (r *SkmReader) Close() error +``` + +### Merge streaming + +**`kdi_merge.go`** : k-way merge de plusieurs flux triés. + +```go +type KWayMerge struct { ... } +func NewKWayMerge(readers []*KdiReader) *KWayMerge +func (m *KWayMerge) Next() (kmer uint64, count int, ok bool) +func (m *KWayMerge) Close() error +``` + +### Builder + +**`kmer_set_builder.go`** : construction d'un KmerSetGroup. + +Le builder gère : +- P × N écrivains .skm bufferisés (un par partition × set) +- À la clôture : traitement partition par partition + (parallélisable sur plusieurs cores) + +Gestion mémoire des buffers d'écriture : +- Chaque SkmWriter a un buffer I/O de taille raisonnable (~64 Ko) +- Avec P=1024 et N=1 : 1024 × 64 Ko = 64 Mo de buffers +- Avec P=1024 et N=10 : 640 Mo de buffers +- Pas de buffer de k-mers en RAM : tout est écrit sur disque + immédiatement via les super-kmers + +RAM pendant Close() (tri d'une partition) : +- Charger les super-kmers → extraire les k-mers → tableau []uint64 +- Avec P=1024 et 10^10 k-mers/set : ~10^7 k-mers/partition × 8 = ~80 Mo +- Avec FrequencyFilter (doublons) et couverture 30× : + ~3×10^8/partition × 8 = ~2.4 Go (ajustable via P) + +### Structure disk-based + +**`kmer_set_disk.go`** : KmerSetGroup en lecture seule. 
+ +**`kmer_set_disk_ops.go`** : opérations ensemblistes par merge +streaming partition par partition. + +--- + +## Ce qui change par rapport à l'API actuelle + +### Changements de sémantique + +| Aspect | Ancien (roaring) | Nouveau (disk-based) | +|---|---|---| +| Stockage | En mémoire (roaring64.Bitmap) | Sur disque (.kdi delta-encoded) | +| Temporaire construction | En mémoire | Super-kmers sur disque (.skm 2 bits/base) | +| Mutabilité | Mutable à tout moment | Builder → Close() → immutable | +| Opérations ensemblistes | Résultat en mémoire | Résultat sur disque (nouveau répertoire) | +| Contains | O(1) roaring lookup | O(log n) recherche binaire sur .kdi | +| Itération | Roaring iterator | Streaming décodage delta-varint | + +### API conservée (signatures identiques ou quasi-identiques) + +- `KmerSetGroup` : `K()`, `Size()`, `Id()`, `SetId()` +- Toute l'API attributs +- `JaccardDistanceMatrix()`, `JaccardSimilarityMatrix()` +- `Len()`, `Contains()` + +### API modifiée + +- `Union()`, `Intersect()`, etc. : ajout du paramètre `outputDir` +- `QuorumAtLeast()`, etc. 
: idem +- Construction : `NewKmerSetGroupBuilder()` + `AddSequence()` + `Close()` + au lieu de manipulation directe + +### API supprimée + +- `KmerSet` comme type distinct (remplacé par KmerSetGroup singleton) +- `FrequencyFilter` comme type distinct (mode du Builder) +- Tout accès direct à `roaring64.Bitmap` +- `KmerSet.Copy()` (copie de répertoire à la place) +- `KmerSet.Union()`, `.Intersect()`, `.Difference()` (deviennent méthodes + de KmerSetGroup avec outputDir) + +--- + +## Fichiers à créer / modifier dans pkg/obikmer + +### Nouveaux fichiers + +| Fichier | Contenu | +|---|---| +| `varint.go` | Encode/Decode varint uint64 | +| `kdi_writer.go` | Écrivain de fichiers .kdi (delta-encoded) | +| `kdi_reader.go` | Lecteur streaming de fichiers .kdi | +| `skm_writer.go` | Écrivain de super-kmers encodés 2 bits/base | +| `skm_reader.go` | Lecteur de super-kmers depuis .skm | +| `kdi_merge.go` | K-way merge streaming de flux triés | +| `kmer_set_builder.go` | KmerSetGroupBuilder (construction) | +| `kmer_set_disk.go` | KmerSetGroup disk-based (lecture, métadonnées) | +| `kmer_set_disk_ops.go` | Opérations ensemblistes streaming | + +### Fichiers à supprimer + +| Fichier | Raison | +|---|---| +| `kmer_set.go` | Remplacé par kmer_set_disk.go | +| `kmer_set_group.go` | Idem | +| `kmer_set_attributes.go` | Intégré dans kmer_set_disk.go | +| `kmer_set_persistence.go` | L'index est nativement sur disque | +| `kmer_set_group_quorum.go` | Intégré dans kmer_set_disk_ops.go | +| `frequency_filter.go` | Mode du Builder, plus de type séparé | +| `kmer_index_builder.go` | Remplacé par kmer_set_builder.go | + +### Fichiers conservés tels quels + +| Fichier | Contenu | +|---|---| +| `encodekmer.go` | Encodage/décodage k-mers | +| `superkmer.go` | Structure SuperKmer | +| `superkmer_iter.go` | IterSuperKmers, IterCanonicalKmers | +| `encodefourmer.go` | Encode4mer | +| `counting.go` | Count4Mer | +| `kmermap.go` | KmerMap (usage indépendant) | +| `debruijn.go` | Graphe de de Bruijn 
| + +--- + +## Ordre d'implémentation + +1. `varint.go` + tests +2. `skm_writer.go` + `skm_reader.go` + tests +3. `kdi_writer.go` + `kdi_reader.go` + tests +4. `kdi_merge.go` + tests +5. `kmer_set_builder.go` + tests (construction + Close) +6. `kmer_set_disk.go` (structure, métadonnées, Open) +7. `kmer_set_disk_ops.go` + tests (Union, Intersect, Quorum, Jaccard) +8. Adaptation de `pkg/obitools/obikindex/` +9. Suppression des anciens fichiers roaring +10. Adaptation des tests existants + +Chaque étape est testable indépendamment. + +--- + +## Dépendances externes + +### Supprimées + +- `github.com/RoaringBitmap/roaring` : plus nécessaire pour les + index k-mers (vérifier si d'autres packages l'utilisent encore) + +### Ajoutées + +- Aucune. Varint, delta-encoding, merge, encodage 2 bits/base : + tout est implémentable en Go standard. diff --git a/cmd/obitools/obik/main.go b/cmd/obitools/obik/main.go new file mode 100644 index 0000000..694d1ab --- /dev/null +++ b/cmd/obitools/obik/main.go @@ -0,0 +1,34 @@ +package main + +import ( + "context" + "errors" + "os" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obik" + "github.com/DavidGamba/go-getoptions" +) + +func main() { + defer obiseq.LogBioSeqStatus() + + opt, parser := obioptions.GenerateSubcommandParser( + "obik", + "Manage disk-based kmer indices", + obik.OptionSet, + ) + + _, remaining := parser(os.Args) + + err := opt.Dispatch(context.Background(), remaining) + if err != nil { + if errors.Is(err, getoptions.ErrorHelpCalled) { + os.Exit(0) + } + log.Fatalf("Error: %v", err) + } +} diff --git a/cmd/obitools/obilowmask/main.go b/cmd/obitools/obilowmask/main.go deleted file mode 100644 index ec43a54..0000000 --- a/cmd/obitools/obilowmask/main.go +++ /dev/null @@ -1,47 +0,0 @@ -package main - -import ( - "os" - - 
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obilowmask" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" -) - -func main() { - - defer obiseq.LogBioSeqStatus() - - // go tool pprof -http=":8000" ./obipairing ./cpu.pprof - // f, err := os.Create("cpu.pprof") - // if err != nil { - // log.Fatal(err) - // } - // pprof.StartCPUProfile(f) - // defer pprof.StopCPUProfile() - - // go tool trace cpu.trace - // ftrace, err := os.Create("cpu.trace") - // if err != nil { - // log.Fatal(err) - // } - // trace.Start(ftrace) - // defer trace.Stop() - - optionParser := obioptions.GenerateOptionParser( - "obimicrosat", - "looks for microsatellites sequences in a sequence file", - obilowmask.OptionSet) - - _, args := optionParser(os.Args) - - sequences, err := obiconvert.CLIReadBioSequences(args...) 
- obiconvert.OpenSequenceDataErrorMessage(args, err) - - selected := obilowmask.CLISequenceEntropyMasker(sequences) - obiconvert.CLIWriteBioSequences(selected, true) - obiutils.WaitForLastPipe() - -} diff --git a/cmd/obitools/obisuperkmer/main.go b/cmd/obitools/obisuperkmer/main.go deleted file mode 100644 index aedaa80..0000000 --- a/cmd/obitools/obisuperkmer/main.go +++ /dev/null @@ -1,34 +0,0 @@ -package main - -import ( - "os" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obisuperkmer" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" -) - -func main() { - // Generate option parser - optionParser := obioptions.GenerateOptionParser( - "obisuperkmer", - "extract super k-mers from sequence files", - obisuperkmer.OptionSet) - - // Parse command-line arguments - _, args := optionParser(os.Args) - - // Read input sequences - sequences, err := obiconvert.CLIReadBioSequences(args...) 
- obiconvert.OpenSequenceDataErrorMessage(args, err) - - // Extract super k-mers - superkmers := obisuperkmer.CLIExtractSuperKmers(sequences) - - // Write output sequences - obiconvert.CLIWriteBioSequences(superkmers, true) - - // Wait for pipeline completion - obiutils.WaitForLastPipe() -} diff --git a/go.mod b/go.mod index 9c0a019..e40a216 100644 --- a/go.mod +++ b/go.mod @@ -14,6 +14,7 @@ require ( github.com/goccy/go-json v0.10.3 github.com/klauspost/pgzip v1.2.6 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 + github.com/pelletier/go-toml/v2 v2.2.4 github.com/rrethy/ahocorasick v1.0.0 github.com/schollz/progressbar/v3 v3.13.1 github.com/sirupsen/logrus v1.9.3 @@ -27,14 +28,10 @@ require ( ) require ( - github.com/RoaringBitmap/roaring v1.9.4 // indirect - github.com/bits-and-blooms/bitset v1.12.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect github.com/kr/pretty v0.3.1 // indirect github.com/kr/text v0.2.0 // indirect - github.com/mschoch/smat v0.2.0 // indirect - github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rogpeppe/go-internal v1.12.0 // indirect ) diff --git a/go.sum b/go.sum index 52d2591..ec95ed3 100644 --- a/go.sum +++ b/go.sum @@ -4,12 +4,8 @@ github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac= github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI= github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8= -github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv2QzDdQ= -github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90= github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0= 
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM= -github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA= -github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs= github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= github.com/chen3feng/stl4go v0.1.1 h1:0L1+mDw7pomftKDruM23f1mA7miavOj6C6MZeadzN2Q= @@ -51,8 +47,6 @@ github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZ github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= -github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= -github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0= github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= diff --git a/kmer_roaring_index/FREQUENCY_FILTER_FINAL.md b/kmer_roaring_index/FREQUENCY_FILTER_FINAL.md deleted file mode 100644 index d00be69..0000000 --- a/kmer_roaring_index/FREQUENCY_FILTER_FINAL.md +++ /dev/null @@ -1,292 +0,0 @@ -# Filtre de Fréquence avec v Niveaux de Roaring Bitmaps - -## Algorithme - -```go -Pour chaque k-mer rencontré dans les données: - c = 0 - tant que (k-mer ∈ index[c] ET c < v): - c++ - - si c < v: - index[c].insert(k-mer) -``` - -**Résultat** : `index[v-1]` contient les k-mers vus **≥ 
v fois** - ---- - -## Exemple d'exécution (v=3) - -``` -Données: - Read1: kmer X - Read2: kmer X - Read3: kmer X (X vu 3 fois) - Read4: kmer Y - Read5: kmer Y (Y vu 2 fois) - Read6: kmer Z (Z vu 1 fois) - -Exécution: - -Read1 (X): - c=0: X ∉ index[0] → index[0].add(X) - État: index[0]={X}, index[1]={}, index[2]={} - -Read2 (X): - c=0: X ∈ index[0] → c=1 - c=1: X ∉ index[1] → index[1].add(X) - État: index[0]={X}, index[1]={X}, index[2]={} - -Read3 (X): - c=0: X ∈ index[0] → c=1 - c=1: X ∈ index[1] → c=2 - c=2: X ∉ index[2] → index[2].add(X) - État: index[0]={X}, index[1]={X}, index[2]={X} - -Read4 (Y): - c=0: Y ∉ index[0] → index[0].add(Y) - État: index[0]={X,Y}, index[1]={X}, index[2]={X} - -Read5 (Y): - c=0: Y ∈ index[0] → c=1 - c=1: Y ∉ index[1] → index[1].add(Y) - État: index[0]={X,Y}, index[1]={X,Y}, index[2]={X} - -Read6 (Z): - c=0: Z ∉ index[0] → index[0].add(Z) - État: index[0]={X,Y,Z}, index[1]={X,Y}, index[2]={X} - -Résultat final: - index[0] (freq≥1): {X, Y, Z} - index[1] (freq≥2): {X, Y} - index[2] (freq≥3): {X} ← K-mers filtrés ✓ -``` - ---- - -## Utilisation - -```go -// Créer le filtre -filter := obikmer.NewFrequencyFilter(31, 3) // k=31, minFreq=3 - -// Ajouter les séquences -for _, read := range reads { - filter.AddSequence(read) -} - -// Récupérer les k-mers filtrés (freq ≥ 3) -filtered := filter.GetFilteredSet("filtered") -fmt.Printf("K-mers de qualité: %d\n", filtered.Cardinality()) - -// Statistiques -stats := filter.Stats() -fmt.Println(stats.String()) -``` - ---- - -## Performance - -### Complexité - -**Par k-mer** : -- Lookups : Moyenne ~v/2, pire cas v -- Insertions : 1 Add -- **Pas de Remove** ✅ - -**Total pour n k-mers** : -- Temps : O(n × v/2) -- Mémoire : O(unique_kmers × v × 2 bytes) - -### Early exit pour distribution skewed - -Avec distribution typique (séquençage) : -``` -80% singletons → 1 lookup (early exit) -15% freq 2-3 → 2-3 lookups -5% freq ≥4 → jusqu'à v lookups - -Moyenne réelle : ~2 lookups/kmer (au lieu de v/2) -``` - ---- 
- -## Mémoire - -### Pour 10^8 k-mers uniques - -| v (minFreq) | Nombre bitmaps | Mémoire | vs map simple | -|-------------|----------------|---------|---------------| -| v=2 | 2 | ~400 MB | 6x moins | -| v=3 | 3 | ~600 MB | 4x moins | -| v=5 | 5 | ~1 GB | 2.4x moins | -| v=10 | 10 | ~2 GB | 1.2x moins | -| v=20 | 20 | ~4 GB | ~égal | - -**Note** : Avec distribution skewed (beaucoup de singletons), la mémoire réelle est bien plus faible car les niveaux hauts ont peu d'éléments. - -### Exemple réaliste (séquençage) - -Pour 10^8 k-mers totaux, v=3 : -``` -Distribution: - 80% singletons → 80M dans index[0] - 15% freq 2-3 → 15M dans index[1] - 5% freq ≥3 → 5M dans index[2] - -Mémoire: - index[0]: 80M × 2 bytes = 160 MB - index[1]: 15M × 2 bytes = 30 MB - index[2]: 5M × 2 bytes = 10 MB - Total: ~200 MB ✅ - -vs map simple: 80M × 24 bytes = ~2 GB -Réduction: 10x -``` - ---- - -## Comparaison des approches - -| Approche | Mémoire (10^8 kmers) | Passes | Lookups/kmer | Quand utiliser | -|----------|----------------------|--------|--------------|----------------| -| **v-Bitmaps** | **200-600 MB** | **1** | **~2 (avg)** | **Standard** ✅ | -| Map simple | 2.4 GB | 1 | 1 | Si RAM illimitée | -| Multi-pass | 400 MB | v | v | Si I/O pas cher | - ---- - -## Avantages de v-Bitmaps - -✅ **Une seule passe** sur les données -✅ **Mémoire optimale** avec Roaring bitmaps -✅ **Pas de Remove** (seulement Contains + Add) -✅ **Early exit** efficace sur singletons -✅ **Scalable** jusqu'à v~10-20 -✅ **Simple** à implémenter et comprendre - ---- - -## Cas d'usage typiques - -### 1. Éliminer erreurs de séquençage - -```go -filter := obikmer.NewFrequencyFilter(31, 3) - -// Traiter FASTQ -for read := range StreamFastq("sample.fastq") { - filter.AddSequence(read) -} - -// K-mers de qualité (pas d'erreurs) -cleaned := filter.GetFilteredSet("cleaned") -``` - -**Résultat** : Élimine 70-80% des k-mers (erreurs) - -### 2. 
Assemblage de génome - -```go -filter := obikmer.NewFrequencyFilter(31, 2) - -// Filtrer avant l'assemblage -for read := range reads { - filter.AddSequence(read) -} - -solidKmers := filter.GetFilteredSet("solid") -// Utiliser solidKmers pour le graphe de Bruijn -``` - -### 3. Comparaison de génomes - -```go -collection := obikmer.NewKmerSetCollection(31) - -for _, genome := range genomes { - filter := obikmer.NewFrequencyFilter(31, 3) - filter.AddSequences(genome.Reads) - - cleaned := filter.GetFilteredSet(genome.ID) - collection.Add(cleaned) -} - -// Analyses comparatives sur k-mers de qualité -matrix := collection.ParallelPairwiseJaccard(8) -``` - ---- - -## Limites - -**Pour v > 20** : -- Trop de lookups (v lookups/kmer) -- Mémoire importante (v × 200MB pour 10^8 kmers) - -**Solutions alternatives pour v > 20** : -- Utiliser map simple (9 bytes/kmer) si RAM disponible -- Algorithme différent (sketch, probabiliste) - ---- - -## Optimisations possibles - -### 1. Parallélisation - -```go -// Traiter plusieurs fichiers en parallèle -filters := make([]*FrequencyFilter, numFiles) - -var wg sync.WaitGroup -for i, file := range files { - wg.Add(1) - go func(idx int, f string) { - defer wg.Done() - filters[idx] = ProcessFile(f, k, minFreq) - }(i, file) -} -wg.Wait() - -// Merger les résultats -merged := MergeFilters(filters) -``` - -### 2. Streaming avec seuil adaptatif - -```go -// Commencer avec v=5, réduire progressivement -filter := obikmer.NewFrequencyFilter(31, 5) - -// ... traitement ... - -// Si trop de mémoire, réduire à v=3 -if filter.MemoryUsage() > threshold { - filter = ConvertToLowerThreshold(filter, 3) -} -``` - ---- - -## Récapitulatif final - -**Pour filtrer les k-mers par fréquence ≥ v :** - -1. **Créer** : `filter := NewFrequencyFilter(k, v)` -2. **Traiter** : `filter.AddSequence(read)` pour chaque read -3. 
**Résultat** : `filtered := filter.GetFilteredSet(id)` - -**Mémoire** : ~2v MB par million de k-mers uniques -**Temps** : Une seule passe, ~2 lookups/kmer en moyenne -**Optimal pour** : v ≤ 20, distribution skewed (séquençage) - ---- - -## Code fourni - -1. **frequency_filter.go** - Implémentation complète -2. **examples_frequency_filter_final.go** - Exemples d'utilisation - -**Tout est prêt à utiliser !** 🚀 diff --git a/kmer_roaring_index/examples_frequency_filter_final.go b/kmer_roaring_index/examples_frequency_filter_final.go deleted file mode 100644 index b2a83d6..0000000 --- a/kmer_roaring_index/examples_frequency_filter_final.go +++ /dev/null @@ -1,320 +0,0 @@ -package main - -import ( - "fmt" - "obikmer" -) - -func main() { - // ========================================== - // EXEMPLE 1 : Utilisation basique - // ========================================== - fmt.Println("=== EXEMPLE 1 : Utilisation basique ===\n") - - k := 31 - minFreq := 3 // Garder les k-mers vus ≥3 fois - - // Créer le filtre - filter := obikmer.NewFrequencyFilter(k, minFreq) - - // Simuler des séquences avec différentes fréquences - sequences := [][]byte{ - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X (freq=2) - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X (freq=3) ✓ - []byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Kmer Y - []byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Kmer Y (freq=2) ✗ - []byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // Kmer Z (freq=1) ✗ - } - - fmt.Printf("Traitement de %d séquences...\n", len(sequences)) - for _, seq := range sequences { - filter.AddSequence(seq) - } - - // Récupérer les k-mers filtrés - filtered := filter.GetFilteredSet("filtered") - fmt.Printf("\nK-mers avec freq ≥ %d: %d\n", minFreq, filtered.Cardinality()) - - // Statistiques - stats := filter.Stats() - fmt.Println("\n" + stats.String()) - - // ========================================== - // EXEMPLE 2 : Vérifier les niveaux - // 
========================================== - fmt.Println("\n=== EXEMPLE 2 : Inspection des niveaux ===\n") - - // Vérifier chaque niveau - for level := 0; level < minFreq; level++ { - levelSet := filter.GetKmersAtLevel(level) - fmt.Printf("Niveau %d (freq≥%d): %d k-mers\n", - level+1, level+1, levelSet.Cardinality()) - } - - // ========================================== - // EXEMPLE 3 : Données réalistes - // ========================================== - fmt.Println("\n=== EXEMPLE 3 : Simulation données séquençage ===\n") - - filter2 := obikmer.NewFrequencyFilter(31, 3) - - // Simuler un dataset réaliste : - // - 1000 reads - // - 80% contiennent des erreurs (singletons) - // - 15% vrais k-mers à basse fréquence - // - 5% vrais k-mers à haute fréquence - - // Vraie séquence répétée - trueSeq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG") - for i := 0; i < 50; i++ { - filter2.AddSequence(trueSeq) - } - - // Séquence à fréquence moyenne - mediumSeq := []byte("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC") - for i := 0; i < 5; i++ { - filter2.AddSequence(mediumSeq) - } - - // Erreurs de séquençage (singletons) - for i := 0; i < 100; i++ { - errorSeq := []byte(fmt.Sprintf("TTTTTTTTTTTTTTTTTTTTTTTTTTTT%03d", i)) - filter2.AddSequence(errorSeq) - } - - stats2 := filter2.Stats() - fmt.Println(stats2.String()) - - fmt.Println("Distribution attendue:") - fmt.Println(" - Beaucoup de singletons (erreurs)") - fmt.Println(" - Peu de k-mers à haute fréquence (signal)") - fmt.Println(" → Filtrage efficace !") - - // ========================================== - // EXEMPLE 4 : Tester différents seuils - // ========================================== - fmt.Println("\n=== EXEMPLE 4 : Comparaison de seuils ===\n") - - testSeqs := [][]byte{ - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // freq=5 - 
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), - []byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), - []byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // freq=3 - []byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // freq=1 - } - - for _, minFreq := range []int{2, 3, 5} { - f := obikmer.NewFrequencyFilter(31, minFreq) - f.AddSequences(testSeqs) - - fmt.Printf("minFreq=%d: %d k-mers retenus (%.2f MB)\n", - minFreq, - f.Cardinality(), - float64(f.MemoryUsage())/1024/1024) - } - - // ========================================== - // EXEMPLE 5 : Comparaison mémoire - // ========================================== - fmt.Println("\n=== EXEMPLE 5 : Comparaison mémoire ===\n") - - filter3 := obikmer.NewFrequencyFilter(31, 3) - - // Simuler 10000 séquences - for i := 0; i < 10000; i++ { - seq := make([]byte, 100) - for j := range seq { - seq[j] = "ACGT"[(i+j)%4] - } - filter3.AddSequence(seq) - } - - fmt.Println(filter3.CompareWithSimpleMap()) - - // ========================================== - // EXEMPLE 6 : Workflow complet - // ========================================== - fmt.Println("\n=== EXEMPLE 6 : Workflow complet ===\n") - - fmt.Println("1. Créer le filtre") - finalFilter := obikmer.NewFrequencyFilter(31, 3) - - fmt.Println("2. Traiter les données (simulation)") - // En pratique : lire depuis FASTQ - // for read := range ReadFastq("data.fastq") { - // finalFilter.AddSequence(read) - // } - - // Simulation - for i := 0; i < 1000; i++ { - seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG") - finalFilter.AddSequence(seq) - } - - fmt.Println("3. Récupérer les k-mers filtrés") - result := finalFilter.GetFilteredSet("final") - - fmt.Println("4. Utiliser le résultat") - fmt.Printf(" K-mers de qualité: %d\n", result.Cardinality()) - fmt.Printf(" Mémoire utilisée: %.2f MB\n", float64(finalFilter.MemoryUsage())/1024/1024) - - fmt.Println("5. 
Sauvegarder (optionnel)") - // result.Save("filtered_kmers.bin") - - // ========================================== - // EXEMPLE 7 : Vérification individuelle - // ========================================== - fmt.Println("\n=== EXEMPLE 7 : Vérification de k-mers spécifiques ===\n") - - checkFilter := obikmer.NewFrequencyFilter(31, 3) - - testSeq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG") - for i := 0; i < 5; i++ { - checkFilter.AddSequence(testSeq) - } - - var kmers []uint64 - kmers = obikmer.EncodeKmers(testSeq, 31, &kmers) - - if len(kmers) > 0 { - testKmer := kmers[0] - - fmt.Printf("K-mer test: 0x%016X\n", testKmer) - fmt.Printf(" Présent dans filtre: %v\n", checkFilter.Contains(testKmer)) - fmt.Printf(" Fréquence approx: %d\n", checkFilter.GetFrequency(testKmer)) - } - - // ========================================== - // EXEMPLE 8 : Intégration avec collection - // ========================================== - fmt.Println("\n=== EXEMPLE 8 : Intégration avec KmerSetCollection ===\n") - - // Créer une collection de génomes filtrés - collection := obikmer.NewKmerSetCollection(31) - - genomes := map[string][][]byte{ - "Genome1": { - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Erreur - }, - "Genome2": { - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("ACGTACGTACGTACGTACGTACGTACGTACG"), - []byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // Erreur - }, - } - - for id, sequences := range genomes { - // Filtrer chaque génome - genomeFilter := obikmer.NewFrequencyFilter(31, 3) - genomeFilter.AddSequences(sequences) - - // Ajouter à la collection - filteredSet := genomeFilter.GetFilteredSet(id) - collection.Add(filteredSet) - - fmt.Printf("%s: %d k-mers de qualité\n", id, filteredSet.Cardinality()) - } - - // Analyser la collection - fmt.Println("\nAnalyse comparative:") - collectionStats 
:= collection.ComputeStats() - fmt.Printf(" Core genome: %d k-mers\n", collectionStats.CoreSize) - fmt.Printf(" Pan genome: %d k-mers\n", collectionStats.PanGenomeSize) - - // ========================================== - // RÉSUMÉ - // ========================================== - fmt.Println("\n=== RÉSUMÉ ===\n") - fmt.Println("Le FrequencyFilter permet de:") - fmt.Println(" ✓ Filtrer les k-mers par fréquence minimale") - fmt.Println(" ✓ Utiliser une mémoire optimale avec Roaring bitmaps") - fmt.Println(" ✓ Une seule passe sur les données") - fmt.Println(" ✓ Éliminer efficacement les erreurs de séquençage") - fmt.Println("") - fmt.Println("Workflow typique:") - fmt.Println(" 1. filter := NewFrequencyFilter(k, minFreq)") - fmt.Println(" 2. for each sequence: filter.AddSequence(seq)") - fmt.Println(" 3. filtered := filter.GetFilteredSet(id)") - fmt.Println(" 4. Utiliser filtered dans vos analyses") -} - -// ================================== -// FONCTION HELPER POUR BENCHMARKS -// ================================== - -func BenchmarkFrequencyFilter() { - k := 31 - minFreq := 3 - - // Test avec différentes tailles - sizes := []int{1000, 10000, 100000} - - fmt.Println("\n=== BENCHMARK ===\n") - - for _, size := range sizes { - filter := obikmer.NewFrequencyFilter(k, minFreq) - - // Générer des séquences - for i := 0; i < size; i++ { - seq := make([]byte, 100) - for j := range seq { - seq[j] = "ACGT"[(i+j)%4] - } - filter.AddSequence(seq) - } - - fmt.Printf("Size=%d reads:\n", size) - fmt.Printf(" Filtered k-mers: %d\n", filter.Cardinality()) - fmt.Printf(" Memory: %.2f MB\n", float64(filter.MemoryUsage())/1024/1024) - fmt.Println() - } -} - -// ================================== -// FONCTION POUR DONNÉES RÉELLES -// ================================== - -func ProcessRealData() { - // Exemple pour traiter de vraies données FASTQ - - k := 31 - minFreq := 3 - - filter := obikmer.NewFrequencyFilter(k, minFreq) - - // Pseudo-code pour lire un FASTQ - /* - fastqFile := 
"sample.fastq" - reader := NewFastqReader(fastqFile) - - for reader.HasNext() { - read := reader.Next() - filter.AddSequence(read.Sequence) - } - - // Récupérer le résultat - filtered := filter.GetFilteredSet("sample_filtered") - filtered.Save("sample_filtered_kmers.bin") - - // Stats - stats := filter.Stats() - fmt.Println(stats.String()) - */ - - fmt.Println("Workflow pour données réelles:") - fmt.Println(" 1. Créer le filtre avec minFreq approprié (2-5 typique)") - fmt.Println(" 2. Stream les reads depuis FASTQ") - fmt.Println(" 3. Récupérer les k-mers filtrés") - fmt.Println(" 4. Utiliser pour assemblage/comparaison/etc.") - - _ = filter // unused -} diff --git a/pkg/obikmer/entropy.go b/pkg/obikmer/entropy.go new file mode 100644 index 0000000..94f8ab7 --- /dev/null +++ b/pkg/obikmer/entropy.go @@ -0,0 +1,281 @@ +package obikmer + +import "math" + +// KmerEntropy computes the entropy of a single encoded k-mer. +// +// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer +// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax, +// normalizes them by circular canonical form, counts their frequencies, and +// computes Shannon entropy normalized by the maximum possible entropy. +// The returned value is the minimum entropy across all word sizes. +// +// A value close to 0 indicates very low complexity (e.g. "AAAA..."), +// while a value close to 1 indicates high complexity. 
+// +// Parameters: +// - kmer: the encoded k-mer (2 bits per base) +// - k: the k-mer size +// - levelMax: maximum sub-word size for entropy (typically 6) +// +// Returns: +// - minimum normalized entropy across all word sizes 1..levelMax +func KmerEntropy(kmer uint64, k int, levelMax int) float64 { + if k < 1 || levelMax < 1 { + return 1.0 + } + if levelMax >= k { + levelMax = k - 1 + } + if levelMax < 1 { + return 1.0 + } + + // Decode k-mer to DNA sequence + var seqBuf [32]byte + seq := DecodeKmer(kmer, k, seqBuf[:]) + + // Pre-compute nLogN lookup (same as lowmask) + nLogN := make([]float64, k+1) + for i := 1; i <= k; i++ { + nLogN[i] = float64(i) * math.Log(float64(i)) + } + + // Build circular-canonical normalization tables per word size + normTables := make([][]int, levelMax+1) + for ws := 1; ws <= levelMax; ws++ { + size := 1 << (ws * 2) + normTables[ws] = make([]int, size) + for code := 0; code < size; code++ { + normTables[ws][code] = int(NormalizeCircular(uint64(code), ws)) + } + } + + minEntropy := math.MaxFloat64 + + for ws := 1; ws <= levelMax; ws++ { + nwords := k - ws + 1 + if nwords < 1 { + continue + } + + // Count circular-canonical sub-word frequencies + tableSize := 1 << (ws * 2) + table := make([]int, tableSize) + mask := (1 << (ws * 2)) - 1 + + wordIndex := 0 + for i := 0; i < ws-1; i++ { + wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i])) + } + + for i, j := 0, ws-1; j < k; i, j = i+1, j+1 { + wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j])) + normWord := normTables[ws][wordIndex] + table[normWord]++ + } + + // Compute Shannon entropy + floatNwords := float64(nwords) + logNwords := math.Log(floatNwords) + + var sumNLogN float64 + for j := 0; j < tableSize; j++ { + n := table[j] + if n > 0 { + sumNLogN += nLogN[n] + } + } + + // Compute emax (maximum possible entropy for this word size) + na := CanonicalCircularKmerCount(ws) + var emax float64 + if nwords < na { + emax = math.Log(float64(nwords)) + } else { + 
cov := nwords / na + remains := nwords - (na * cov) + f1 := float64(cov) / floatNwords + f2 := float64(cov+1) / floatNwords + emax = -(float64(na-remains)*f1*math.Log(f1) + + float64(remains)*f2*math.Log(f2)) + } + + if emax <= 0 { + continue + } + + entropy := (logNwords - sumNLogN/floatNwords) / emax + if entropy < 0 { + entropy = 0 + } + + if entropy < minEntropy { + minEntropy = entropy + } + } + + if minEntropy == math.MaxFloat64 { + return 1.0 + } + + return math.Round(minEntropy*10000) / 10000 +} + +// KmerEntropyFilter is a reusable entropy filter for batch processing. +// It pre-computes normalization tables and lookup values to avoid repeated +// allocation across millions of k-mers. +// +// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use. +// Each goroutine must create its own instance via NewKmerEntropyFilter. +type KmerEntropyFilter struct { + k int + levelMax int + threshold float64 + nLogN []float64 + normTables [][]int + emaxValues []float64 + logNwords []float64 + // Pre-allocated frequency tables reused across Entropy() calls. + // One per word size (index 0 unused). Reset to zero before each use. + freqTables [][]int +} + +// NewKmerEntropyFilter creates an entropy filter with pre-computed tables. 
+// +// Parameters: +// - k: the k-mer size +// - levelMax: maximum sub-word size for entropy (typically 6) +// - threshold: entropy threshold (k-mers with entropy <= threshold are rejected) +func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter { + if levelMax >= k { + levelMax = k - 1 + } + if levelMax < 1 { + levelMax = 1 + } + + nLogN := make([]float64, k+1) + for i := 1; i <= k; i++ { + nLogN[i] = float64(i) * math.Log(float64(i)) + } + + normTables := make([][]int, levelMax+1) + for ws := 1; ws <= levelMax; ws++ { + size := 1 << (ws * 2) + normTables[ws] = make([]int, size) + for code := 0; code < size; code++ { + normTables[ws][code] = int(NormalizeCircular(uint64(code), ws)) + } + } + + emaxValues := make([]float64, levelMax+1) + logNwords := make([]float64, levelMax+1) + for ws := 1; ws <= levelMax; ws++ { + nw := k - ws + 1 + na := CanonicalCircularKmerCount(ws) + if nw < na { + logNwords[ws] = math.Log(float64(nw)) + emaxValues[ws] = math.Log(float64(nw)) + } else { + cov := nw / na + remains := nw - (na * cov) + f1 := float64(cov) / float64(nw) + f2 := float64(cov+1) / float64(nw) + logNwords[ws] = math.Log(float64(nw)) + emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) + + float64(remains)*f2*math.Log(f2)) + } + } + + // Pre-allocate frequency tables per word size + freqTables := make([][]int, levelMax+1) + for ws := 1; ws <= levelMax; ws++ { + freqTables[ws] = make([]int, 1<<(ws*2)) + } + + return &KmerEntropyFilter{ + k: k, + levelMax: levelMax, + threshold: threshold, + nLogN: nLogN, + normTables: normTables, + emaxValues: emaxValues, + logNwords: logNwords, + freqTables: freqTables, + } +} + +// Accept returns true if the k-mer has entropy strictly above the threshold. +// Low-complexity k-mers (entropy <= threshold) are rejected. +func (ef *KmerEntropyFilter) Accept(kmer uint64) bool { + return ef.Entropy(kmer) > ef.threshold +} + +// Entropy computes the entropy for a single k-mer using pre-computed tables. 
+func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 { + k := ef.k + + // Decode k-mer to DNA sequence + var seqBuf [32]byte + seq := DecodeKmer(kmer, k, seqBuf[:]) + + minEntropy := math.MaxFloat64 + + for ws := 1; ws <= ef.levelMax; ws++ { + nwords := k - ws + 1 + if nwords < 1 { + continue + } + + emax := ef.emaxValues[ws] + if emax <= 0 { + continue + } + + // Count circular-canonical sub-word frequencies + tableSize := 1 << (ws * 2) + table := ef.freqTables[ws] + clear(table) // reset to zero + mask := (1 << (ws * 2)) - 1 + normTable := ef.normTables[ws] + + wordIndex := 0 + for i := 0; i < ws-1; i++ { + wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i])) + } + + for i, j := 0, ws-1; j < k; i, j = i+1, j+1 { + wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j])) + normWord := normTable[wordIndex] + table[normWord]++ + } + + // Compute Shannon entropy + floatNwords := float64(nwords) + logNwords := ef.logNwords[ws] + + var sumNLogN float64 + for j := 0; j < tableSize; j++ { + n := table[j] + if n > 0 { + sumNLogN += ef.nLogN[n] + } + } + + entropy := (logNwords - sumNLogN/floatNwords) / emax + if entropy < 0 { + entropy = 0 + } + + if entropy < minEntropy { + minEntropy = entropy + } + } + + if minEntropy == math.MaxFloat64 { + return 1.0 + } + + return math.Round(minEntropy*10000) / 10000 +} diff --git a/pkg/obikmer/frequency_filter.go b/pkg/obikmer/frequency_filter.go deleted file mode 100644 index 91b3b84..0000000 --- a/pkg/obikmer/frequency_filter.go +++ /dev/null @@ -1,310 +0,0 @@ -package obikmer - -import ( - "fmt" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" -) - -// FrequencyFilter filters k-mers by minimum frequency -// Specialization of KmerSetGroup where index[i] contains k-mers seen at least i+1 times -type FrequencyFilter struct { - *KmerSetGroup // Group of KmerSet (one per frequency level) - MinFreq int // v - minimum required frequency -} - -// NewFrequencyFilter creates a new frequency 
filter -// minFreq: minimum number d'occurrences required (v) -func NewFrequencyFilter(k, minFreq int) *FrequencyFilter { - ff := &FrequencyFilter{ - KmerSetGroup: NewKmerSetGroup(k, minFreq), - MinFreq: minFreq, - } - - // Initialize group metadata - ff.SetAttribute("type", "FrequencyFilter") - ff.SetAttribute("min_freq", minFreq) - - // Initialize metadata for each level - for i := 0; i < minFreq; i++ { - level := ff.Get(i) - level.SetAttribute("level", i) - level.SetAttribute("min_occurrences", i+1) - level.SetId(fmt.Sprintf("level_%d", i)) - } - - return ff -} - -// AddSequence adds all k-mers from a sequence to the filter -// Uses an iterator to avoid allocating an intermediate vector -func (ff *FrequencyFilter) AddSequence(seq *obiseq.BioSequence) { - rawSeq := seq.Sequence() - for canonical := range IterCanonicalKmers(rawSeq, ff.K()) { - ff.AddKmerCode(canonical) - } -} - -// AddKmerCode adds an encoded k-mer to the filter (main algorithm) -func (ff *FrequencyFilter) AddKmerCode(kmer uint64) { - // Find the current level of the k-mer - c := 0 - for c < ff.MinFreq && ff.Get(c).Contains(kmer) { - c++ - } - - // Add to next level (if not yet at maximum) - if c < ff.MinFreq { - ff.Get(c).AddKmerCode(kmer) - } -} - -// AddCanonicalKmerCode adds an encoded canonical k-mer to the filter -func (ff *FrequencyFilter) AddCanonicalKmerCode(kmer uint64) { - canonical := CanonicalKmer(kmer, ff.K()) - ff.AddKmerCode(canonical) -} - -// AddKmer adds a k-mer to the filter by encoding the sequence -// The sequence must have exactly k nucleotides -// Zero-allocation: encodes directly without creating an intermediate slice -func (ff *FrequencyFilter) AddKmer(seq []byte) { - kmer := EncodeKmer(seq, ff.K()) - ff.AddKmerCode(kmer) -} - -// AddCanonicalKmer adds a canonical k-mer to the filter by encoding the sequence -// The sequence must have exactly k nucleotides -// Zero-allocation: encodes directly in canonical form without creating an intermediate slice -func (ff 
*FrequencyFilter) AddCanonicalKmer(seq []byte) { - canonical := EncodeCanonicalKmer(seq, ff.K()) - ff.AddKmerCode(canonical) -} - -// GetFilteredSet returns a KmerSet of k-mers with frequency ≥ minFreq -func (ff *FrequencyFilter) GetFilteredSet() *KmerSet { - // Filtered k-mers are in the last level - return ff.Get(ff.MinFreq - 1).Copy() -} - -// GetKmersAtLevel returns a KmerSet of k-mers seen at least (level+1) times -// level doit être dans [0, minFreq-1] -func (ff *FrequencyFilter) GetKmersAtLevel(level int) *KmerSet { - ks := ff.Get(level) - if ks == nil { - return NewKmerSet(ff.K()) - } - return ks.Copy() -} - -// Stats returns statistics on frequency levels -func (ff *FrequencyFilter) Stats() FrequencyFilterStats { - stats := FrequencyFilterStats{ - MinFreq: ff.MinFreq, - Levels: make([]LevelStats, ff.MinFreq), - } - - for i := 0; i < ff.MinFreq; i++ { - ks := ff.Get(i) - card := ks.Len() - sizeBytes := ks.MemoryUsage() - - stats.Levels[i] = LevelStats{ - Level: i + 1, // Level 1 = freq ≥ 1 - Cardinality: card, - SizeBytes: sizeBytes, - } - - stats.TotalBytes += sizeBytes - } - - // The last level contains the result - stats.FilteredKmers = stats.Levels[ff.MinFreq-1].Cardinality - - return stats -} - -// FrequencyFilterStats contains the filter statistics -type FrequencyFilterStats struct { - MinFreq int - FilteredKmers uint64 // K-mers with freq ≥ minFreq - TotalBytes uint64 // Total memory used - Levels []LevelStats -} - -// LevelStats contains the stats of a level -type LevelStats struct { - Level int // freq ≥ Level - Cardinality uint64 // Number of k-mers - SizeBytes uint64 // Size in bytes -} - -func (ffs FrequencyFilterStats) String() string { - result := fmt.Sprintf(`Frequency Filter Statistics (minFreq=%d): - Filtered k-mers (freq≥%d): %d - Total memory: %.2f MB - -Level breakdown: -`, ffs.MinFreq, ffs.MinFreq, ffs.FilteredKmers, float64(ffs.TotalBytes)/1024/1024) - - for _, level := range ffs.Levels { - result += fmt.Sprintf(" freq≥%d: %d k-mers 
(%.2f MB)\n", - level.Level, - level.Cardinality, - float64(level.SizeBytes)/1024/1024) - } - - return result -} - -// Clear libère la mémoire de tous les niveaux -// (héritée de KmerSetGroup mais redéfinie pour clarté) -func (ff *FrequencyFilter) Clear() { - ff.KmerSetGroup.Clear() -} - -// ================================== -// BATCH PROCESSING -// ================================== - -// AddSequences adds multiple sequences in batch -func (ff *FrequencyFilter) AddSequences(sequences *obiseq.BioSequenceSlice) { - for _, seq := range *sequences { - ff.AddSequence(seq) - } -} - -// ================================== -// PERSISTANCE -// ================================== - -// Save sauvegarde le FrequencyFilter dans un répertoire -// Utilise le format de sérialisation du KmerSetGroup sous-jacent -// Les métadonnées incluent le type "FrequencyFilter" et min_freq -// -// Format: -// - directory/metadata.{toml,yaml,json} - métadonnées du filtre -// - directory/set_0.roaring - k-mers vus ≥1 fois -// - directory/set_1.roaring - k-mers vus ≥2 fois -// - ... 
-// - directory/set_{minFreq-1}.roaring - k-mers vus ≥minFreq fois -// -// Parameters: -// - directory: répertoire de destination -// - format: format des métadonnées (FormatTOML, FormatYAML, FormatJSON) -// -// Example: -// -// err := ff.Save("./my_filter", obikmer.FormatTOML) -func (ff *FrequencyFilter) Save(directory string, format MetadataFormat) error { - // Déléguer à KmerSetGroup qui gère déjà tout - return ff.KmerSetGroup.Save(directory, format) -} - -// LoadFrequencyFilter charge un FrequencyFilter depuis un répertoire -// Vérifie que les métadonnées correspondent à un FrequencyFilter -// -// Parameters: -// - directory: répertoire source -// -// Returns: -// - *FrequencyFilter: le filtre chargé -// - error: erreur si le chargement échoue ou si ce n'est pas un FrequencyFilter -// -// Example: -// -// ff, err := obikmer.LoadFrequencyFilter("./my_filter") -func LoadFrequencyFilter(directory string) (*FrequencyFilter, error) { - // Charger le KmerSetGroup - ksg, err := LoadKmerSetGroup(directory) - if err != nil { - return nil, err - } - - // Vérifier que c'est bien un FrequencyFilter - if typeAttr, ok := ksg.GetAttribute("type"); !ok || typeAttr != "FrequencyFilter" { - return nil, fmt.Errorf("loaded data is not a FrequencyFilter (type=%v)", typeAttr) - } - - // Récupérer min_freq - minFreqAttr, ok := ksg.GetIntAttribute("min_freq") - if !ok { - return nil, fmt.Errorf("FrequencyFilter missing min_freq attribute") - } - - // Créer le FrequencyFilter - ff := &FrequencyFilter{ - KmerSetGroup: ksg, - MinFreq: minFreqAttr, - } - - return ff, nil -} - -// ================================== -// UTILITAIRES -// ================================== - -// Contains vérifie si un k-mer a atteint la fréquence minimale -func (ff *FrequencyFilter) Contains(kmer uint64) bool { - canonical := CanonicalKmer(kmer, ff.K()) - return ff.Get(ff.MinFreq - 1).Contains(canonical) -} - -// GetFrequency returns the approximate frequency of a k-mer -// Retourne le niveau maximum atteint 
(freq ≥ niveau) -func (ff *FrequencyFilter) GetFrequency(kmer uint64) int { - canonical := CanonicalKmer(kmer, ff.K()) - - freq := 0 - for i := 0; i < ff.MinFreq; i++ { - if ff.Get(i).Contains(canonical) { - freq = i + 1 - } else { - break - } - } - - return freq -} - -// Len returns the number of filtered k-mers or at a specific level -// Without argument: returns the number of k-mers with freq ≥ minFreq (last level) -// With argument level: returns the number of k-mers with freq ≥ (level+1) -// Exemple: Len() pour les k-mers filtrés, Len(2) pour freq ≥ 3 -// (héritée de KmerSetGroup mais redéfinie pour la documentation) -func (ff *FrequencyFilter) Len(level ...int) uint64 { - return ff.KmerSetGroup.Len(level...) -} - -// MemoryUsage returns memory usage in bytes -// (héritée de KmerSetGroup mais redéfinie pour clarté) -func (ff *FrequencyFilter) MemoryUsage() uint64 { - return ff.KmerSetGroup.MemoryUsage() -} - -// ================================== -// COMPARAISON AVEC D'AUTRES APPROCHES -// ================================== - -// CompareWithSimpleMap compare la mémoire avec une simple map -func (ff *FrequencyFilter) CompareWithSimpleMap() string { - totalKmers := ff.Get(0).Len() - - simpleMapBytes := totalKmers * 24 // ~24 bytes par entrée - roaringBytes := ff.MemoryUsage() - - reduction := float64(simpleMapBytes) / float64(roaringBytes) - - return fmt.Sprintf(`Memory Comparison for %d k-mers: - Simple map[uint64]uint32: %.2f MB - Roaring filter (v=%d): %.2f MB - Reduction: %.1fx -`, - totalKmers, - float64(simpleMapBytes)/1024/1024, - ff.MinFreq, - float64(roaringBytes)/1024/1024, - reduction, - ) -} diff --git a/pkg/obikmer/kdi_merge.go b/pkg/obikmer/kdi_merge.go new file mode 100644 index 0000000..bf0dcf5 --- /dev/null +++ b/pkg/obikmer/kdi_merge.go @@ -0,0 +1,86 @@ +package obikmer + +import "container/heap" + +// mergeItem represents an element in the min-heap for k-way merge. 
+type mergeItem struct { + value uint64 + idx int // index of the reader that produced this value +} + +// mergeHeap implements heap.Interface for k-way merge. +type mergeHeap []mergeItem + +func (h mergeHeap) Len() int { return len(h) } +func (h mergeHeap) Less(i, j int) bool { return h[i].value < h[j].value } +func (h mergeHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } +func (h *mergeHeap) Push(x interface{}) { *h = append(*h, x.(mergeItem)) } +func (h *mergeHeap) Pop() interface{} { + old := *h + n := len(old) + x := old[n-1] + *h = old[:n-1] + return x +} + +// KWayMerge performs a k-way merge of multiple sorted KdiReader streams. +// For each unique k-mer value, it reports the value and the number of +// input streams that contained it (count). +type KWayMerge struct { + h mergeHeap + readers []*KdiReader +} + +// NewKWayMerge creates a k-way merge from multiple KdiReaders. +// Each reader must produce values in sorted (ascending) order. +func NewKWayMerge(readers []*KdiReader) *KWayMerge { + m := &KWayMerge{ + h: make(mergeHeap, 0, len(readers)), + readers: readers, + } + + // Initialize heap with first value from each reader + for i, r := range readers { + if v, ok := r.Next(); ok { + m.h = append(m.h, mergeItem{value: v, idx: i}) + } + } + heap.Init(&m.h) + + return m +} + +// Next returns the next smallest k-mer value, the number of readers +// that contained this value (count), and true. +// Returns (0, 0, false) when all streams are exhausted. +func (m *KWayMerge) Next() (kmer uint64, count int, ok bool) { + if len(m.h) == 0 { + return 0, 0, false + } + + minVal := m.h[0].value + count = 0 + + // Pop all items with the same value + for len(m.h) > 0 && m.h[0].value == minVal { + item := heap.Pop(&m.h).(mergeItem) + count++ + // Advance that reader + if v, ok := m.readers[item.idx].Next(); ok { + heap.Push(&m.h, mergeItem{value: v, idx: item.idx}) + } + } + + return minVal, count, true +} + +// Close closes all underlying readers. 
+func (m *KWayMerge) Close() error { + var firstErr error + for _, r := range m.readers { + if err := r.Close(); err != nil && firstErr == nil { + firstErr = err + } + } + return firstErr +} diff --git a/pkg/obikmer/kdi_merge_test.go b/pkg/obikmer/kdi_merge_test.go new file mode 100644 index 0000000..56aa028 --- /dev/null +++ b/pkg/obikmer/kdi_merge_test.go @@ -0,0 +1,159 @@ +package obikmer + +import ( + "path/filepath" + "testing" +) + +// writeKdi is a helper that writes sorted kmers to a .kdi file. +func writeKdi(t *testing.T, dir, name string, kmers []uint64) string { + t.Helper() + path := filepath.Join(dir, name) + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + for _, v := range kmers { + if err := w.Write(v); err != nil { + t.Fatal(err) + } + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + return path +} + +func TestKWayMergeBasic(t *testing.T) { + dir := t.TempDir() + + // Three sorted streams + p1 := writeKdi(t, dir, "a.kdi", []uint64{1, 3, 5, 7}) + p2 := writeKdi(t, dir, "b.kdi", []uint64{2, 3, 6, 7}) + p3 := writeKdi(t, dir, "c.kdi", []uint64{3, 4, 7, 8}) + + r1, _ := NewKdiReader(p1) + r2, _ := NewKdiReader(p2) + r3, _ := NewKdiReader(p3) + + m := NewKWayMerge([]*KdiReader{r1, r2, r3}) + defer m.Close() + + type result struct { + kmer uint64 + count int + } + var results []result + for { + kmer, count, ok := m.Next() + if !ok { + break + } + results = append(results, result{kmer, count}) + } + + expected := []result{ + {1, 1}, {2, 1}, {3, 3}, {4, 1}, {5, 1}, {6, 1}, {7, 3}, {8, 1}, + } + if len(results) != len(expected) { + t.Fatalf("got %d results, want %d", len(results), len(expected)) + } + for i, exp := range expected { + if results[i] != exp { + t.Errorf("result %d: got %+v, want %+v", i, results[i], exp) + } + } +} + +func TestKWayMergeSingleStream(t *testing.T) { + dir := t.TempDir() + p := writeKdi(t, dir, "a.kdi", []uint64{10, 20, 30}) + + r, _ := NewKdiReader(p) + m := NewKWayMerge([]*KdiReader{r}) + defer 
m.Close() + + vals := []uint64{10, 20, 30} + for _, expected := range vals { + kmer, count, ok := m.Next() + if !ok { + t.Fatal("unexpected EOF") + } + if kmer != expected || count != 1 { + t.Fatalf("got (%d, %d), want (%d, 1)", kmer, count, expected) + } + } + _, _, ok := m.Next() + if ok { + t.Fatal("expected EOF") + } +} + +func TestKWayMergeEmpty(t *testing.T) { + dir := t.TempDir() + + p1 := writeKdi(t, dir, "a.kdi", nil) + p2 := writeKdi(t, dir, "b.kdi", nil) + + r1, _ := NewKdiReader(p1) + r2, _ := NewKdiReader(p2) + + m := NewKWayMerge([]*KdiReader{r1, r2}) + defer m.Close() + + _, _, ok := m.Next() + if ok { + t.Fatal("expected no results from empty streams") + } +} + +func TestKWayMergeDisjoint(t *testing.T) { + dir := t.TempDir() + + p1 := writeKdi(t, dir, "a.kdi", []uint64{1, 2, 3}) + p2 := writeKdi(t, dir, "b.kdi", []uint64{10, 20, 30}) + + r1, _ := NewKdiReader(p1) + r2, _ := NewKdiReader(p2) + + m := NewKWayMerge([]*KdiReader{r1, r2}) + defer m.Close() + + expected := []uint64{1, 2, 3, 10, 20, 30} + for _, exp := range expected { + kmer, count, ok := m.Next() + if !ok { + t.Fatal("unexpected EOF") + } + if kmer != exp || count != 1 { + t.Fatalf("got (%d, %d), want (%d, 1)", kmer, count, exp) + } + } +} + +func TestKWayMergeAllSame(t *testing.T) { + dir := t.TempDir() + + p1 := writeKdi(t, dir, "a.kdi", []uint64{42}) + p2 := writeKdi(t, dir, "b.kdi", []uint64{42}) + p3 := writeKdi(t, dir, "c.kdi", []uint64{42}) + + r1, _ := NewKdiReader(p1) + r2, _ := NewKdiReader(p2) + r3, _ := NewKdiReader(p3) + + m := NewKWayMerge([]*KdiReader{r1, r2, r3}) + defer m.Close() + + kmer, count, ok := m.Next() + if !ok { + t.Fatal("expected one result") + } + if kmer != 42 || count != 3 { + t.Fatalf("got (%d, %d), want (42, 3)", kmer, count) + } + _, _, ok = m.Next() + if ok { + t.Fatal("expected EOF") + } +} diff --git a/pkg/obikmer/kdi_reader.go b/pkg/obikmer/kdi_reader.go new file mode 100644 index 0000000..09cbb46 --- /dev/null +++ b/pkg/obikmer/kdi_reader.go @@ 
-0,0 +1,170 @@ +package obikmer + +import ( + "bufio" + "encoding/binary" + "fmt" + "io" + "os" +) + +// KdiReader reads k-mers from a .kdi file using streaming delta-varint decoding. +type KdiReader struct { + r *bufio.Reader + file *os.File + count uint64 // total number of k-mers + read uint64 // number of k-mers already consumed + prev uint64 // last decoded value + started bool // whether first value has been read + index *KdxIndex // optional sparse index for seeking +} + +// NewKdiReader opens a .kdi file for streaming reading (no index). +func NewKdiReader(path string) (*KdiReader, error) { + return openKdiReader(path, nil) +} + +// NewKdiIndexedReader opens a .kdi file with its companion .kdx index +// loaded for fast seeking. If the .kdx file does not exist, it gracefully +// falls back to sequential reading. +func NewKdiIndexedReader(path string) (*KdiReader, error) { + kdxPath := KdxPathForKdi(path) + idx, err := LoadKdxIndex(kdxPath) + if err != nil { + // Index load failed — fall back to non-indexed + return openKdiReader(path, nil) + } + // idx may be nil if file does not exist — that's fine + return openKdiReader(path, idx) +} + +func openKdiReader(path string, idx *KdxIndex) (*KdiReader, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + r := bufio.NewReaderSize(f, 65536) + + // Read and verify magic + var magic [4]byte + if _, err := io.ReadFull(r, magic[:]); err != nil { + f.Close() + return nil, fmt.Errorf("kdi: read magic: %w", err) + } + if magic != kdiMagic { + f.Close() + return nil, fmt.Errorf("kdi: bad magic %v", magic) + } + + // Read count + var countBuf [8]byte + if _, err := io.ReadFull(r, countBuf[:]); err != nil { + f.Close() + return nil, fmt.Errorf("kdi: read count: %w", err) + } + count := binary.LittleEndian.Uint64(countBuf[:]) + + return &KdiReader{ + r: r, + file: f, + count: count, + index: idx, + }, nil +} + +// Next returns the next k-mer and true, or (0, false) when exhausted. 
+func (kr *KdiReader) Next() (uint64, bool) { + if kr.read >= kr.count { + return 0, false + } + + if !kr.started { + // Read first value as absolute uint64 LE + var buf [8]byte + if _, err := io.ReadFull(kr.r, buf[:]); err != nil { + return 0, false + } + kr.prev = binary.LittleEndian.Uint64(buf[:]) + kr.started = true + kr.read++ + return kr.prev, true + } + + // Read delta varint + delta, err := DecodeVarint(kr.r) + if err != nil { + return 0, false + } + kr.prev += delta + kr.read++ + return kr.prev, true +} + +// SeekTo positions the reader near the target k-mer using the sparse .kdx index. +// After SeekTo, the reader is positioned so that the next call to Next() +// returns the k-mer immediately after the indexed entry at or before target. +// +// If the reader has no index, or the target is before the current position, +// SeekTo does nothing (linear scan continues from current position). +func (kr *KdiReader) SeekTo(target uint64) error { + if kr.index == nil { + return nil + } + + // If we've already passed the target, we can't seek backwards + if kr.started && kr.prev >= target { + return nil + } + + offset, skipCount, ok := kr.index.FindOffset(target) + if !ok { + return nil + } + + // skipCount is the number of k-mers consumed at the indexed position. + // The index was recorded AFTER writing the k-mer at position skipCount-1 + // (since count%stride==0 after incrementing count). So the actual number + // of k-mers consumed is skipCount (the entry's kmer is the last one + // before the offset). + + // Only seek if it would skip significant work + if kr.started && skipCount <= kr.read { + return nil + } + + // The index entry stores (kmer_value, byte_offset_after_that_kmer). + // skipCount = (entryIdx+1)*stride, so entryIdx = skipCount/stride - 1 + // We seek to that offset, set prev = indexedKmer, and the next Next() + // call will read the delta-varint of the following k-mer. 
+ entryIdx := int(skipCount)/kr.index.stride - 1 + if entryIdx < 0 || entryIdx >= len(kr.index.entries) { + return nil + } + indexedKmer := kr.index.entries[entryIdx].kmer + + if _, err := kr.file.Seek(int64(offset), io.SeekStart); err != nil { + return fmt.Errorf("kdi: seek: %w", err) + } + kr.r.Reset(kr.file) + + kr.prev = indexedKmer + kr.started = true + kr.read = skipCount + + return nil +} + +// Count returns the total number of k-mers in this partition. +func (kr *KdiReader) Count() uint64 { + return kr.count +} + +// Remaining returns how many k-mers have not been read yet. +func (kr *KdiReader) Remaining() uint64 { + return kr.count - kr.read +} + +// Close closes the underlying file. +func (kr *KdiReader) Close() error { + return kr.file.Close() +} diff --git a/pkg/obikmer/kdi_test.go b/pkg/obikmer/kdi_test.go new file mode 100644 index 0000000..5b94092 --- /dev/null +++ b/pkg/obikmer/kdi_test.go @@ -0,0 +1,255 @@ +package obikmer + +import ( + "os" + "path/filepath" + "sort" + "testing" +) + +func TestKdiRoundTrip(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "test.kdi") + + // Sorted k-mer values + kmers := []uint64{10, 20, 30, 100, 200, 500, 10000, 1 << 40, 1<<62 - 1} + + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + for _, v := range kmers { + if err := w.Write(v); err != nil { + t.Fatal(err) + } + } + if w.Count() != uint64(len(kmers)) { + t.Fatalf("writer count: got %d, want %d", w.Count(), len(kmers)) + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Read back + r, err := NewKdiReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + if r.Count() != uint64(len(kmers)) { + t.Fatalf("reader count: got %d, want %d", r.Count(), len(kmers)) + } + + for i, expected := range kmers { + got, ok := r.Next() + if !ok { + t.Fatalf("unexpected EOF at index %d", i) + } + if got != expected { + t.Fatalf("kmer %d: got %d, want %d", i, got, expected) + } + } + + _, ok := r.Next() + if ok { + 
t.Fatal("expected EOF after all k-mers") + } +} + +func TestKdiEmpty(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "empty.kdi") + + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + r, err := NewKdiReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + if r.Count() != 0 { + t.Fatalf("expected count 0, got %d", r.Count()) + } + + _, ok := r.Next() + if ok { + t.Fatal("expected no k-mers in empty file") + } +} + +func TestKdiSingleValue(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "single.kdi") + + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + if err := w.Write(42); err != nil { + t.Fatal(err) + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + r, err := NewKdiReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + if r.Count() != 1 { + t.Fatalf("expected count 1, got %d", r.Count()) + } + + v, ok := r.Next() + if !ok { + t.Fatal("expected one k-mer") + } + if v != 42 { + t.Fatalf("got %d, want 42", v) + } +} + +func TestKdiFileSize(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "size.kdi") + + // Write: magic(4) + count(8) + first(8) = 20 bytes + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + if err := w.Write(0); err != nil { + t.Fatal(err) + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + info, err := os.Stat(path) + if err != nil { + t.Fatal(err) + } + // magic(4) + count(8) + first(8) = 20 + if info.Size() != 20 { + t.Fatalf("file size: got %d, want 20", info.Size()) + } +} + +func TestKdiDeltaCompression(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "delta.kdi") + + // Dense consecutive values should compress well + n := 10000 + kmers := make([]uint64, n) + for i := range kmers { + kmers[i] = uint64(i * 2) // even numbers + } + + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + for _, v := 
range kmers { + if err := w.Write(v); err != nil { + t.Fatal(err) + } + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Each delta is 2, encoded as 1 byte varint + // Total: magic(4) + count(8) + first(8) + (n-1)*1 = 20 + 9999 bytes + info, err := os.Stat(path) + if err != nil { + t.Fatal(err) + } + expected := int64(20 + n - 1) + if info.Size() != expected { + t.Fatalf("file size: got %d, want %d", info.Size(), expected) + } + + // Verify round-trip + r, err := NewKdiReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + for i, expected := range kmers { + got, ok := r.Next() + if !ok { + t.Fatalf("unexpected EOF at index %d", i) + } + if got != expected { + t.Fatalf("kmer %d: got %d, want %d", i, got, expected) + } + } +} + +func TestKdiFromRealKmers(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "real.kdi") + + // Extract k-mers from a sequence, sort, dedup, write to KDI + seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT") + k := 15 + + var kmers []uint64 + for kmer := range IterCanonicalKmers(seq, k) { + kmers = append(kmers, kmer) + } + sort.Slice(kmers, func(i, j int) bool { return kmers[i] < kmers[j] }) + // Dedup + deduped := kmers[:0] + for i, v := range kmers { + if i == 0 || v != kmers[i-1] { + deduped = append(deduped, v) + } + } + + w, err := NewKdiWriter(path) + if err != nil { + t.Fatal(err) + } + for _, v := range deduped { + if err := w.Write(v); err != nil { + t.Fatal(err) + } + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Read back and verify + r, err := NewKdiReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + if r.Count() != uint64(len(deduped)) { + t.Fatalf("count: got %d, want %d", r.Count(), len(deduped)) + } + + for i, expected := range deduped { + got, ok := r.Next() + if !ok { + t.Fatalf("unexpected EOF at index %d", i) + } + if got != expected { + t.Fatalf("kmer %d: got %d, want %d", i, got, expected) + } + } +} diff --git 
a/pkg/obikmer/kdi_writer.go b/pkg/obikmer/kdi_writer.go new file mode 100644 index 0000000..325e745 --- /dev/null +++ b/pkg/obikmer/kdi_writer.go @@ -0,0 +1,151 @@ +package obikmer + +import ( + "bufio" + "encoding/binary" + "os" +) + +// KDI file magic bytes: "KDI\x01" +var kdiMagic = [4]byte{'K', 'D', 'I', 0x01} + +// kdiHeaderSize is the size of the KDI header: magic(4) + count(8) = 12 bytes. +const kdiHeaderSize = 12 + +// KdiWriter writes a sorted sequence of uint64 k-mers to a .kdi file +// using delta-varint encoding. +// +// Format: +// +// [magic: 4 bytes "KDI\x01"] +// [count: uint64 LE] number of k-mers +// [first: uint64 LE] first k-mer (absolute value) +// [delta_1: varint] arr[1] - arr[0] +// [delta_2: varint] arr[2] - arr[1] +// ... +// +// The caller must write k-mers in strictly increasing order. +// +// On Close(), a companion .kdx sparse index file is written alongside +// the .kdi file for fast random access. +type KdiWriter struct { + w *bufio.Writer + file *os.File + count uint64 + prev uint64 + first bool + path string + bytesWritten uint64 // bytes written after header (data section offset) + indexEntries []kdxEntry // sparse index entries collected during writes +} + +// NewKdiWriter creates a new KdiWriter writing to the given file path. +// The header (magic + count placeholder) is written immediately. +// Count is patched on Close(). +func NewKdiWriter(path string) (*KdiWriter, error) { + f, err := os.Create(path) + if err != nil { + return nil, err + } + w := bufio.NewWriterSize(f, 65536) + + // Write magic + if _, err := w.Write(kdiMagic[:]); err != nil { + f.Close() + return nil, err + } + // Write placeholder for count (will be patched on Close) + var countBuf [8]byte + if _, err := w.Write(countBuf[:]); err != nil { + f.Close() + return nil, err + } + + return &KdiWriter{ + w: w, + file: f, + first: true, + path: path, + bytesWritten: 0, + indexEntries: make([]kdxEntry, 0, 256), + }, nil +} + +// Write adds a k-mer to the file. 
K-mers must be written in strictly +// increasing order. +func (kw *KdiWriter) Write(kmer uint64) error { + if kw.first { + // Write first value as absolute uint64 LE + var buf [8]byte + binary.LittleEndian.PutUint64(buf[:], kmer) + if _, err := kw.w.Write(buf[:]); err != nil { + return err + } + kw.bytesWritten += 8 + kw.prev = kmer + kw.first = false + } else { + delta := kmer - kw.prev + n, err := EncodeVarint(kw.w, delta) + if err != nil { + return err + } + kw.bytesWritten += uint64(n) + kw.prev = kmer + } + kw.count++ + + // Record sparse index entry every defaultKdxStride k-mers. + // The offset recorded is AFTER writing this k-mer, so it points to + // where the next k-mer's data will start. SeekTo uses this: it seeks + // to the recorded offset, sets prev = indexedKmer, and Next() reads + // the delta of the following k-mer. + if kw.count%defaultKdxStride == 0 { + kw.indexEntries = append(kw.indexEntries, kdxEntry{ + kmer: kmer, + offset: kdiHeaderSize + kw.bytesWritten, + }) + } + + return nil +} + +// Count returns the number of k-mers written so far. +func (kw *KdiWriter) Count() uint64 { + return kw.count +} + +// Close flushes buffered data, patches the count in the header, +// writes the companion .kdx index file, and closes the file. 
+func (kw *KdiWriter) Close() error { + if err := kw.w.Flush(); err != nil { + kw.file.Close() + return err + } + + // Patch count at offset 4 (after magic) + if _, err := kw.file.Seek(4, 0); err != nil { + kw.file.Close() + return err + } + var countBuf [8]byte + binary.LittleEndian.PutUint64(countBuf[:], kw.count) + if _, err := kw.file.Write(countBuf[:]); err != nil { + kw.file.Close() + return err + } + + if err := kw.file.Close(); err != nil { + return err + } + + // Write .kdx index file if there are entries to index + if len(kw.indexEntries) > 0 { + kdxPath := KdxPathForKdi(kw.path) + if err := WriteKdxIndex(kdxPath, defaultKdxStride, kw.indexEntries); err != nil { + return err + } + } + + return nil +} diff --git a/pkg/obikmer/kdx.go b/pkg/obikmer/kdx.go new file mode 100644 index 0000000..19b7dd9 --- /dev/null +++ b/pkg/obikmer/kdx.go @@ -0,0 +1,170 @@ +package obikmer + +import ( + "encoding/binary" + "fmt" + "io" + "os" + "sort" + "strings" +) + +// KDX file magic bytes: "KDX\x01" +var kdxMagic = [4]byte{'K', 'D', 'X', 0x01} + +// defaultKdxStride is the number of k-mers between consecutive index entries. +const defaultKdxStride = 4096 + +// kdxEntry is a single entry in the sparse index: the absolute k-mer value +// and the byte offset in the corresponding .kdi file where that k-mer is stored. +type kdxEntry struct { + kmer uint64 + offset uint64 // absolute byte offset in .kdi file +} + +// KdxIndex is a sparse, in-memory index for a .kdi file. +// It stores one entry every `stride` k-mers, enabling O(log N / stride) +// binary search followed by at most `stride` linear scan steps. +type KdxIndex struct { + stride int + entries []kdxEntry +} + +// LoadKdxIndex reads a .kdx file into memory. +// Returns (nil, nil) if the file does not exist (graceful degradation). 
+func LoadKdxIndex(path string) (*KdxIndex, error) { + f, err := os.Open(path) + if err != nil { + if os.IsNotExist(err) { + return nil, nil + } + return nil, err + } + defer f.Close() + + // Read magic + var magic [4]byte + if _, err := io.ReadFull(f, magic[:]); err != nil { + return nil, fmt.Errorf("kdx: read magic: %w", err) + } + if magic != kdxMagic { + return nil, fmt.Errorf("kdx: bad magic %v", magic) + } + + // Read stride (uint32 LE) + var buf4 [4]byte + if _, err := io.ReadFull(f, buf4[:]); err != nil { + return nil, fmt.Errorf("kdx: read stride: %w", err) + } + stride := int(binary.LittleEndian.Uint32(buf4[:])) + + // Read count (uint32 LE) + if _, err := io.ReadFull(f, buf4[:]); err != nil { + return nil, fmt.Errorf("kdx: read count: %w", err) + } + count := int(binary.LittleEndian.Uint32(buf4[:])) + + // Read entries + entries := make([]kdxEntry, count) + var buf16 [16]byte + for i := 0; i < count; i++ { + if _, err := io.ReadFull(f, buf16[:]); err != nil { + return nil, fmt.Errorf("kdx: read entry %d: %w", i, err) + } + entries[i] = kdxEntry{ + kmer: binary.LittleEndian.Uint64(buf16[0:8]), + offset: binary.LittleEndian.Uint64(buf16[8:16]), + } + } + + return &KdxIndex{ + stride: stride, + entries: entries, + }, nil +} + +// FindOffset locates the best starting point in the .kdi file to scan for +// the target k-mer. It returns: +// - offset: the byte offset in the .kdi file to seek to (positioned after +// the indexed k-mer, ready to read the next delta) +// - skipCount: the number of k-mers already consumed at that offset +// (to set the reader's internal counter) +// - ok: true if the index provides a useful starting point +// +// Index entries are recorded at k-mer count positions stride, 2*stride, etc. +// Entry i corresponds to the k-mer written at count = (i+1)*stride. 
+func (idx *KdxIndex) FindOffset(target uint64) (offset uint64, skipCount uint64, ok bool) { + if idx == nil || len(idx.entries) == 0 { + return 0, 0, false + } + + // Binary search: find the largest entry with kmer <= target + i := sort.Search(len(idx.entries), func(i int) bool { + return idx.entries[i].kmer > target + }) + // i is the first entry with kmer > target, so i-1 is the last with kmer <= target + if i == 0 { + // Target is before the first index entry. + // No useful jump point — caller should scan from the beginning. + return 0, 0, false + } + + i-- // largest entry with kmer <= target + // Entry i was recorded after writing k-mer at count = (i+1)*stride + skipCount = uint64(i+1) * uint64(idx.stride) + return idx.entries[i].offset, skipCount, true +} + +// Stride returns the stride of this index. +func (idx *KdxIndex) Stride() int { + return idx.stride +} + +// Len returns the number of entries in this index. +func (idx *KdxIndex) Len() int { + return len(idx.entries) +} + +// WriteKdxIndex writes a .kdx file from a slice of entries. +func WriteKdxIndex(path string, stride int, entries []kdxEntry) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer f.Close() + + // Magic + if _, err := f.Write(kdxMagic[:]); err != nil { + return err + } + + // Stride (uint32 LE) + var buf4 [4]byte + binary.LittleEndian.PutUint32(buf4[:], uint32(stride)) + if _, err := f.Write(buf4[:]); err != nil { + return err + } + + // Count (uint32 LE) + binary.LittleEndian.PutUint32(buf4[:], uint32(len(entries))) + if _, err := f.Write(buf4[:]); err != nil { + return err + } + + // Entries + var buf16 [16]byte + for _, e := range entries { + binary.LittleEndian.PutUint64(buf16[0:8], e.kmer) + binary.LittleEndian.PutUint64(buf16[8:16], e.offset) + if _, err := f.Write(buf16[:]); err != nil { + return err + } + } + + return nil +} + +// KdxPathForKdi returns the .kdx path corresponding to a .kdi path. 
+func KdxPathForKdi(kdiPath string) string { + return strings.TrimSuffix(kdiPath, ".kdi") + ".kdx" +} diff --git a/pkg/obikmer/kmer_match.go b/pkg/obikmer/kmer_match.go new file mode 100644 index 0000000..14e303d --- /dev/null +++ b/pkg/obikmer/kmer_match.go @@ -0,0 +1,256 @@ +package obikmer + +import ( + "cmp" + "slices" + "sync" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) + +// QueryEntry represents a canonical k-mer to look up, together with +// metadata to trace the result back to the originating sequence and position. +type QueryEntry struct { + Kmer uint64 // canonical k-mer value + SeqIdx int // index within the batch + Pos int // 1-based position in the sequence +} + +// MatchResult holds matched positions for each sequence in a batch. +// results[i] contains the sorted matched positions for sequence i. +type MatchResult [][]int + +// PreparedQueries holds pre-computed query buckets along with the number +// of sequences they were built from. This is used by the accumulation +// pipeline to merge queries from multiple batches. +type PreparedQueries struct { + Buckets [][]QueryEntry // queries[partition], each sorted by Kmer + NSeqs int // number of sequences that produced these queries + NKmers int // total number of k-mer entries across all partitions +} + +// MergeQueries merges src into dst, offsetting all SeqIdx values in src +// by dst.NSeqs. Both dst and src must have the same number of partitions. +// After merging, src should not be reused. +// +// Each partition's entries are merged in sorted order (merge-sort of two +// already-sorted slices). 
+func MergeQueries(dst, src *PreparedQueries) { + for p := range dst.Buckets { + if len(src.Buckets[p]) == 0 { + continue + } + + offset := dst.NSeqs + srcB := src.Buckets[p] + + // Offset SeqIdx in src entries + for i := range srcB { + srcB[i].SeqIdx += offset + } + + if len(dst.Buckets[p]) == 0 { + dst.Buckets[p] = srcB + continue + } + + // Merge two sorted slices + dstB := dst.Buckets[p] + merged := make([]QueryEntry, 0, len(dstB)+len(srcB)) + i, j := 0, 0 + for i < len(dstB) && j < len(srcB) { + if dstB[i].Kmer <= srcB[j].Kmer { + merged = append(merged, dstB[i]) + i++ + } else { + merged = append(merged, srcB[j]) + j++ + } + } + merged = append(merged, dstB[i:]...) + merged = append(merged, srcB[j:]...) + dst.Buckets[p] = merged + } + dst.NSeqs += src.NSeqs + dst.NKmers += src.NKmers +} + +// PrepareQueries extracts all canonical k-mers from a batch of sequences +// and groups them by partition using super-kmer minimizers. +// +// Returns a PreparedQueries with sorted per-partition buckets. 
+func (ksg *KmerSetGroup) PrepareQueries(sequences []*obiseq.BioSequence) *PreparedQueries { + P := ksg.partitions + k := ksg.k + m := ksg.m + + // Pre-allocate partition buckets + buckets := make([][]QueryEntry, P) + for i := range buckets { + buckets[i] = make([]QueryEntry, 0, 64) + } + + totalKmers := 0 + for seqIdx, seq := range sequences { + bseq := seq.Sequence() + if len(bseq) < k { + continue + } + + // Iterate super-kmers to get minimizer → partition mapping + for sk := range IterSuperKmers(bseq, k, m) { + partition := int(sk.Minimizer % uint64(P)) + + // Iterate canonical k-mers within this super-kmer + skSeq := sk.Sequence + if len(skSeq) < k { + continue + } + + localPos := 0 + for kmer := range IterCanonicalKmers(skSeq, k) { + buckets[partition] = append(buckets[partition], QueryEntry{ + Kmer: kmer, + SeqIdx: seqIdx, + Pos: sk.Start + localPos + 1, + }) + localPos++ + totalKmers++ + } + } + } + + // Sort each bucket by k-mer value for merge-scan + for p := range buckets { + slices.SortFunc(buckets[p], func(a, b QueryEntry) int { + return cmp.Compare(a.Kmer, b.Kmer) + }) + } + + return &PreparedQueries{ + Buckets: buckets, + NSeqs: len(sequences), + NKmers: totalKmers, + } +} + +// MatchBatch looks up pre-sorted queries against one set of the index. +// Partitions are processed in parallel. For each partition, a merge-scan +// compares the sorted queries against the sorted KDI stream. +// +// Returns a MatchResult where result[i] contains sorted matched positions +// for sequence i. +func (ksg *KmerSetGroup) MatchBatch(setIndex int, pq *PreparedQueries) MatchResult { + P := ksg.partitions + + // Pre-allocated per-sequence results and mutexes. + // Each partition goroutine appends to results[seqIdx] with mus[seqIdx] held. + // Contention is low: a sequence's k-mers span many partitions, but each + // partition processes its queries sequentially and the critical section is tiny. 
+ results := make([][]int, pq.NSeqs) + mus := make([]sync.Mutex, pq.NSeqs) + + var wg sync.WaitGroup + + for p := 0; p < P; p++ { + if len(pq.Buckets[p]) == 0 { + continue + } + wg.Add(1) + go func(part int) { + defer wg.Done() + ksg.matchPartition(setIndex, part, pq.Buckets[part], results, mus) + }(p) + } + + wg.Wait() + + // Sort positions within each sequence + for i := range results { + if len(results[i]) > 1 { + slices.Sort(results[i]) + } + } + + return MatchResult(results) +} + +// matchPartition processes one partition: opens the KDI reader (with index), +// seeks to the first query, then merge-scans queries against the KDI stream. +func (ksg *KmerSetGroup) matchPartition( + setIndex int, + partIndex int, + queries []QueryEntry, // sorted by Kmer + results [][]int, + mus []sync.Mutex, +) { + r, err := NewKdiIndexedReader(ksg.partitionPath(setIndex, partIndex)) + if err != nil { + return + } + defer r.Close() + + if r.Count() == 0 || len(queries) == 0 { + return + } + + // Seek to the first query's neighborhood + if err := r.SeekTo(queries[0].Kmer); err != nil { + return + } + + // Read first kmer from the stream after seek + currentKmer, ok := r.Next() + if !ok { + return + } + + qi := 0 // query index + + for qi < len(queries) { + q := queries[qi] + + // If the next query is far ahead, re-seek instead of linear scan. + // Only seek if we'd skip more k-mers than the index stride, + // otherwise linear scan through the buffer is faster than a syscall. 
+ if r.index != nil && q.Kmer > currentKmer && r.Remaining() > uint64(r.index.stride) { + _, skipCount, found := r.index.FindOffset(q.Kmer) + if found && skipCount > r.read+uint64(r.index.stride) { + if err := r.SeekTo(q.Kmer); err == nil { + nextKmer, nextOk := r.Next() + if !nextOk { + return + } + currentKmer = nextKmer + ok = true + } + } + } + + // Advance KDI stream until >= query kmer + for currentKmer < q.Kmer { + currentKmer, ok = r.Next() + if !ok { + return // KDI exhausted + } + } + + if currentKmer == q.Kmer { + // Match! Record all queries with this same k-mer value + matchedKmer := q.Kmer + for qi < len(queries) && queries[qi].Kmer == matchedKmer { + idx := queries[qi].SeqIdx + mus[idx].Lock() + results[idx] = append(results[idx], queries[qi].Pos) + mus[idx].Unlock() + qi++ + } + } else { + // currentKmer > q.Kmer: skip all queries with this kmer value + skippedKmer := q.Kmer + for qi < len(queries) && queries[qi].Kmer == skippedKmer { + qi++ + } + } + } +} diff --git a/pkg/obikmer/kmer_set.go b/pkg/obikmer/kmer_set.go deleted file mode 100644 index f295072..0000000 --- a/pkg/obikmer/kmer_set.go +++ /dev/null @@ -1,217 +0,0 @@ -package obikmer - -import ( - "fmt" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "github.com/RoaringBitmap/roaring/roaring64" -) - -// KmerSet wraps a set of k-mers stored in a Roaring Bitmap -// Provides utility methods for manipulating k-mer sets -type KmerSet struct { - id string // Unique identifier of the KmerSet - k int // Size of k-mers (immutable) - bitmap *roaring64.Bitmap // Bitmap containing the k-mers - Metadata map[string]interface{} // User metadata (key=atomic value) -} - -// NewKmerSet creates a new empty KmerSet -func NewKmerSet(k int) *KmerSet { - return &KmerSet{ - k: k, - bitmap: roaring64.New(), - Metadata: make(map[string]interface{}), - } -} - -// NewKmerSetFromBitmap creates a KmerSet from an existing bitmap -func NewKmerSetFromBitmap(k int, bitmap *roaring64.Bitmap) *KmerSet { 
- return &KmerSet{ - k: k, - bitmap: bitmap, - Metadata: make(map[string]interface{}), - } -} - -// K returns the size of k-mers (immutable) -func (ks *KmerSet) K() int { - return ks.k -} - -// AddKmerCode adds an encoded k-mer to the set -func (ks *KmerSet) AddKmerCode(kmer uint64) { - ks.bitmap.Add(kmer) -} - -// AddCanonicalKmerCode adds an encoded canonical k-mer to the set -func (ks *KmerSet) AddCanonicalKmerCode(kmer uint64) { - canonical := CanonicalKmer(kmer, ks.k) - ks.bitmap.Add(canonical) -} - -// AddKmer adds a k-mer to the set by encoding the sequence -// The sequence must have exactly k nucleotides -// Zero-allocation: encodes directly without creating an intermediate slice -func (ks *KmerSet) AddKmer(seq []byte) { - kmer := EncodeKmer(seq, ks.k) - ks.bitmap.Add(kmer) -} - -// AddCanonicalKmer adds a canonical k-mer to the set by encoding the sequence -// The sequence must have exactly k nucleotides -// Zero-allocation: encodes directly in canonical form without creating an intermediate slice -func (ks *KmerSet) AddCanonicalKmer(seq []byte) { - canonical := EncodeCanonicalKmer(seq, ks.k) - ks.bitmap.Add(canonical) -} - -// AddSequence adds all k-mers from a sequence to the set -// Uses an iterator to avoid allocating an intermediate vector -func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) { - rawSeq := seq.Sequence() - for canonical := range IterCanonicalKmers(rawSeq, ks.k) { - ks.bitmap.Add(canonical) - } -} - -// AddSequences adds all k-mers from multiple sequences in batch -func (ks *KmerSet) AddSequences(sequences *obiseq.BioSequenceSlice) { - for _, seq := range *sequences { - ks.AddSequence(seq) - } -} - -// Contains checks if a k-mer is in the set -func (ks *KmerSet) Contains(kmer uint64) bool { - return ks.bitmap.Contains(kmer) -} - -// Len returns the number of k-mers in the set -func (ks *KmerSet) Len() uint64 { - return ks.bitmap.GetCardinality() -} - -// MemoryUsage returns memory usage in bytes -func (ks *KmerSet) MemoryUsage() 
uint64 { - return ks.bitmap.GetSizeInBytes() -} - -// Clear empties the set -func (ks *KmerSet) Clear() { - ks.bitmap.Clear() -} - -// Copy creates a copy of the set (consistent with BioSequence.Copy) -func (ks *KmerSet) Copy() *KmerSet { - // Copy metadata - metadata := make(map[string]interface{}, len(ks.Metadata)) - for k, v := range ks.Metadata { - metadata[k] = v - } - - return &KmerSet{ - id: ks.id, - k: ks.k, - bitmap: ks.bitmap.Clone(), - Metadata: metadata, - } -} - -// Id returns the identifier of the KmerSet (consistent with BioSequence.Id) -func (ks *KmerSet) Id() string { - return ks.id -} - -// SetId sets the identifier of the KmerSet (consistent with BioSequence.SetId) -func (ks *KmerSet) SetId(id string) { - ks.id = id -} - -// Union returns the union of this set with another -func (ks *KmerSet) Union(other *KmerSet) *KmerSet { - if ks.k != other.k { - panic(fmt.Sprintf("Cannot union KmerSets with different k values: %d vs %d", ks.k, other.k)) - } - result := ks.bitmap.Clone() - result.Or(other.bitmap) - return NewKmerSetFromBitmap(ks.k, result) -} - -// Intersect returns the intersection of this set with another -func (ks *KmerSet) Intersect(other *KmerSet) *KmerSet { - if ks.k != other.k { - panic(fmt.Sprintf("Cannot intersect KmerSets with different k values: %d vs %d", ks.k, other.k)) - } - result := ks.bitmap.Clone() - result.And(other.bitmap) - return NewKmerSetFromBitmap(ks.k, result) -} - -// Difference returns the difference of this set with another (this - other) -func (ks *KmerSet) Difference(other *KmerSet) *KmerSet { - if ks.k != other.k { - panic(fmt.Sprintf("Cannot subtract KmerSets with different k values: %d vs %d", ks.k, other.k)) - } - result := ks.bitmap.Clone() - result.AndNot(other.bitmap) - return NewKmerSetFromBitmap(ks.k, result) -} - -// JaccardDistance computes the Jaccard distance between two KmerSets. -// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A ∪ B|) -// where A and B are the two sets. 
-// -// Returns: -// - 0.0 when sets are identical (distance = 0, similarity = 1) -// - 1.0 when sets are completely disjoint (distance = 1, similarity = 0) -// - 1.0 when both sets are empty (by convention) -// -// Time complexity: O(|A| + |B|) for Roaring Bitmap operations -// Space complexity: O(1) as operations are done in-place on temporary bitmaps -func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 { - if ks.k != other.k { - panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k)) - } - - // Compute intersection cardinality - intersectionCard := ks.bitmap.AndCardinality(other.bitmap) - - // Compute union cardinality - unionCard := ks.bitmap.OrCardinality(other.bitmap) - - // If union is empty, both sets are empty - return 1.0 by convention - if unionCard == 0 { - return 1.0 - } - - // Jaccard similarity = |A ∩ B| / |A ∪ B| - similarity := float64(intersectionCard) / float64(unionCard) - - // Jaccard distance = 1 - similarity - return 1.0 - similarity -} - -// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets. 
-// The Jaccard similarity is defined as: |A ∩ B| / |A ∪ B| -// -// Returns: -// - 1.0 when sets are identical (maximum similarity) -// - 0.0 when sets are completely disjoint (no similarity) -// - 0.0 when both sets are empty (by convention) -// -// Time complexity: O(|A| + |B|) for Roaring Bitmap operations -// Space complexity: O(1) as operations are done in-place on temporary bitmaps -func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 { - return 1.0 - ks.JaccardDistance(other) -} - -// Iterator returns an iterator over all k-mers in the set -func (ks *KmerSet) Iterator() roaring64.IntIterable64 { - return ks.bitmap.Iterator() -} - -// Bitmap returns the underlying bitmap (for compatibility) -func (ks *KmerSet) Bitmap() *roaring64.Bitmap { - return ks.bitmap -} diff --git a/pkg/obikmer/kmer_set_attributes.go b/pkg/obikmer/kmer_set_attributes.go deleted file mode 100644 index 82151f8..0000000 --- a/pkg/obikmer/kmer_set_attributes.go +++ /dev/null @@ -1,362 +0,0 @@ -package obikmer - -import ( - "fmt" - "strconv" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" -) - -// ================================== -// KMER SET ATTRIBUTE API -// Mimic BioSequence attribute API from obiseq/attributes.go -// ================================== - -// HasAttribute vérifie si une clé d'attribut existe -func (ks *KmerSet) HasAttribute(key string) bool { - _, ok := ks.Metadata[key] - return ok -} - -// GetAttribute récupère la valeur d'un attribut -// Cas particuliers: "id" utilise Id(), "k" utilise K() -func (ks *KmerSet) GetAttribute(key string) (interface{}, bool) { - switch key { - case "id": - return ks.Id(), true - case "k": - return ks.K(), true - default: - value, ok := ks.Metadata[key] - return value, ok - } -} - -// SetAttribute sets the value of an attribute -// Cas particuliers: "id" utilise SetId(), "k" est immutable (panique) -func (ks *KmerSet) SetAttribute(key string, value interface{}) { - switch key { - case "id": - if id, ok := 
value.(string); ok { - ks.SetId(id) - } else { - panic(fmt.Sprintf("id must be a string, got %T", value)) - } - case "k": - panic("k is immutable and cannot be modified via SetAttribute") - default: - ks.Metadata[key] = value - } -} - -// DeleteAttribute supprime un attribut -func (ks *KmerSet) DeleteAttribute(key string) { - delete(ks.Metadata, key) -} - -// RemoveAttribute supprime un attribut (alias de DeleteAttribute) -func (ks *KmerSet) RemoveAttribute(key string) { - ks.DeleteAttribute(key) -} - -// RenameAttribute renomme un attribut -func (ks *KmerSet) RenameAttribute(newName, oldName string) { - if value, ok := ks.Metadata[oldName]; ok { - ks.Metadata[newName] = value - delete(ks.Metadata, oldName) - } -} - -// GetIntAttribute récupère un attribut en tant qu'entier -func (ks *KmerSet) GetIntAttribute(key string) (int, bool) { - value, ok := ks.Metadata[key] - if !ok { - return 0, false - } - - switch v := value.(type) { - case int: - return v, true - case int64: - return int(v), true - case float64: - return int(v), true - case string: - if i, err := strconv.Atoi(v); err == nil { - return i, true - } - } - return 0, false -} - -// GetFloatAttribute récupère un attribut en tant que float64 -func (ks *KmerSet) GetFloatAttribute(key string) (float64, bool) { - value, ok := ks.Metadata[key] - if !ok { - return 0, false - } - - switch v := value.(type) { - case float64: - return v, true - case float32: - return float64(v), true - case int: - return float64(v), true - case int64: - return float64(v), true - case string: - if f, err := strconv.ParseFloat(v, 64); err == nil { - return f, true - } - } - return 0, false -} - -// GetNumericAttribute récupère un attribut numérique (alias de GetFloatAttribute) -func (ks *KmerSet) GetNumericAttribute(key string) (float64, bool) { - return ks.GetFloatAttribute(key) -} - -// GetStringAttribute récupère un attribut en tant que chaîne -func (ks *KmerSet) GetStringAttribute(key string) (string, bool) { - value, ok := 
ks.Metadata[key] - if !ok { - return "", false - } - - switch v := value.(type) { - case string: - return v, true - default: - return fmt.Sprintf("%v", v), true - } -} - -// GetBoolAttribute récupère un attribut en tant que booléen -func (ks *KmerSet) GetBoolAttribute(key string) (bool, bool) { - value, ok := ks.Metadata[key] - if !ok { - return false, false - } - - switch v := value.(type) { - case bool: - return v, true - case int: - return v != 0, true - case string: - if b, err := strconv.ParseBool(v); err == nil { - return b, true - } - } - return false, false -} - -// AttributeKeys returns the set of attribute keys -func (ks *KmerSet) AttributeKeys() obiutils.Set[string] { - keys := obiutils.MakeSet[string]() - for key := range ks.Metadata { - keys.Add(key) - } - return keys -} - -// Keys returns the set of attribute keys (alias of AttributeKeys) -func (ks *KmerSet) Keys() obiutils.Set[string] { - return ks.AttributeKeys() -} - -// ================================== -// KMER SET GROUP ATTRIBUTE API -// Métadonnées du groupe + accès via Get() pour les sets individuels -// ================================== - -// HasAttribute vérifie si une clé d'attribut existe pour le groupe -func (ksg *KmerSetGroup) HasAttribute(key string) bool { - _, ok := ksg.Metadata[key] - return ok -} - -// GetAttribute récupère la valeur d'un attribut du groupe -// Cas particuliers: "id" utilise Id(), "k" utilise K() -func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool) { - switch key { - case "id": - return ksg.Id(), true - case "k": - return ksg.K(), true - default: - value, ok := ksg.Metadata[key] - return value, ok - } -} - -// SetAttribute sets the value of an attribute du groupe -// Cas particuliers: "id" utilise SetId(), "k" est immutable (panique) -func (ksg *KmerSetGroup) SetAttribute(key string, value interface{}) { - switch key { - case "id": - if id, ok := value.(string); ok { - ksg.SetId(id) - } else { - panic(fmt.Sprintf("id must be a string, got %T", 
value)) - } - case "k": - panic("k is immutable and cannot be modified via SetAttribute") - default: - ksg.Metadata[key] = value - } -} - -// DeleteAttribute supprime un attribut du groupe -func (ksg *KmerSetGroup) DeleteAttribute(key string) { - delete(ksg.Metadata, key) -} - -// RemoveAttribute supprime un attribut du groupe (alias) -func (ksg *KmerSetGroup) RemoveAttribute(key string) { - ksg.DeleteAttribute(key) -} - -// RenameAttribute renomme un attribut du groupe -func (ksg *KmerSetGroup) RenameAttribute(newName, oldName string) { - if value, ok := ksg.Metadata[oldName]; ok { - ksg.Metadata[newName] = value - delete(ksg.Metadata, oldName) - } -} - -// GetIntAttribute récupère un attribut entier du groupe -func (ksg *KmerSetGroup) GetIntAttribute(key string) (int, bool) { - value, ok := ksg.GetAttribute(key) - if !ok { - return 0, false - } - - switch v := value.(type) { - case int: - return v, true - case int64: - return int(v), true - case float64: - return int(v), true - case string: - if i, err := strconv.Atoi(v); err == nil { - return i, true - } - } - return 0, false -} - -// GetFloatAttribute récupère un attribut float64 du groupe -func (ksg *KmerSetGroup) GetFloatAttribute(key string) (float64, bool) { - value, ok := ksg.GetAttribute(key) - if !ok { - return 0, false - } - - switch v := value.(type) { - case float64: - return v, true - case float32: - return float64(v), true - case int: - return float64(v), true - case int64: - return float64(v), true - case string: - if f, err := strconv.ParseFloat(v, 64); err == nil { - return f, true - } - } - return 0, false -} - -// GetNumericAttribute récupère un attribut numérique du groupe -func (ksg *KmerSetGroup) GetNumericAttribute(key string) (float64, bool) { - return ksg.GetFloatAttribute(key) -} - -// GetStringAttribute récupère un attribut chaîne du groupe -func (ksg *KmerSetGroup) GetStringAttribute(key string) (string, bool) { - value, ok := ksg.GetAttribute(key) - if !ok { - return "", false - } - - 
switch v := value.(type) { - case string: - return v, true - default: - return fmt.Sprintf("%v", v), true - } -} - -// GetBoolAttribute récupère un attribut booléen du groupe -func (ksg *KmerSetGroup) GetBoolAttribute(key string) (bool, bool) { - value, ok := ksg.GetAttribute(key) - if !ok { - return false, false - } - - switch v := value.(type) { - case bool: - return v, true - case int: - return v != 0, true - case string: - if b, err := strconv.ParseBool(v); err == nil { - return b, true - } - } - return false, false -} - -// AttributeKeys returns the set of attribute keys du groupe -func (ksg *KmerSetGroup) AttributeKeys() obiutils.Set[string] { - keys := obiutils.MakeSet[string]() - for key := range ksg.Metadata { - keys.Add(key) - } - return keys -} - -// Keys returns the set of group attribute keys (alias) -func (ksg *KmerSetGroup) Keys() obiutils.Set[string] { - return ksg.AttributeKeys() -} - -// ================================== -// MÉTHODES POUR ACCÉDER AUX ATTRIBUTS DES SETS INDIVIDUELS VIA Get() -// Architecture zero-copy: ksg.Get(i).SetAttribute(...) 
-// ================================== - -// Exemple d'utilisation: -// Pour accéder aux métadonnées d'un KmerSet individuel dans un groupe: -// ks := ksg.Get(0) -// ks.SetAttribute("level", 1) -// hasLevel := ks.HasAttribute("level") -// -// Pour les métadonnées du groupe: -// ksg.SetAttribute("name", "FrequencyFilter") -// name, ok := ksg.GetStringAttribute("name") - -// AllAttributeKeys returns all unique attribute keys of the group AND all its sets -func (ksg *KmerSetGroup) AllAttributeKeys() obiutils.Set[string] { - keys := obiutils.MakeSet[string]() - - // Ajouter les clés du groupe - for key := range ksg.Metadata { - keys.Add(key) - } - - // Ajouter les clés de chaque set - for _, ks := range ksg.sets { - for key := range ks.Metadata { - keys.Add(key) - } - } - - return keys -} diff --git a/pkg/obikmer/kmer_set_builder.go b/pkg/obikmer/kmer_set_builder.go new file mode 100644 index 0000000..c58efa6 --- /dev/null +++ b/pkg/obikmer/kmer_set_builder.go @@ -0,0 +1,702 @@ +package obikmer + +import ( + "fmt" + "math" + "os" + "path/filepath" + "slices" + "sync" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "github.com/schollz/progressbar/v3" +) + +// BuilderOption is a functional option for KmerSetGroupBuilder. +type BuilderOption func(*builderConfig) + +type builderConfig struct { + minFreq int // 0 means no frequency filtering (simple dedup) + maxFreq int // 0 means no upper bound + saveFreqTopN int // >0 means save the N most frequent k-mers per set to CSV + entropyThreshold float64 // >0 means filter k-mers with entropy <= threshold + entropyLevelMax int // max sub-word size for entropy (typically 6) +} + +// WithMinFrequency activates frequency filtering mode. +// Only k-mers seen >= minFreq times are kept in the final index. 
+func WithMinFrequency(minFreq int) BuilderOption { + return func(c *builderConfig) { + c.minFreq = minFreq + } +} + +// WithMaxFrequency sets the upper frequency bound. +// Only k-mers seen <= maxFreq times are kept in the final index. +func WithMaxFrequency(maxFreq int) BuilderOption { + return func(c *builderConfig) { + c.maxFreq = maxFreq + } +} + +// WithSaveFreqKmers saves the N most frequent k-mers per set to a CSV file +// (top_kmers.csv in each set directory). +func WithSaveFreqKmers(n int) BuilderOption { + return func(c *builderConfig) { + c.saveFreqTopN = n + } +} + +// WithEntropyFilter activates entropy-based low-complexity filtering. +// K-mers with entropy <= threshold are discarded during finalization. +// levelMax is the maximum sub-word size for entropy computation (typically 6). +func WithEntropyFilter(threshold float64, levelMax int) BuilderOption { + return func(c *builderConfig) { + c.entropyThreshold = threshold + c.entropyLevelMax = levelMax + } +} + +// KmerSetGroupBuilder constructs a KmerSetGroup on disk. +// During construction, super-kmers are written to temporary .skm files +// partitioned by minimizer. On Close(), each partition is finalized +// (sort, dedup, optional frequency filter) into .kdi files. +type KmerSetGroupBuilder struct { + dir string + k int + m int + n int // number of NEW sets being built + P int // number of partitions + startIndex int // first set index (0 for new groups, existingN for appends) + config builderConfig + existing *KmerSetGroup // non-nil when appending to existing group + writers [][]*SkmWriter // [setIndex][partIndex] (local index 0..n-1) + mu [][]sync.Mutex // per-writer mutex for concurrent access + closed bool +} + +// NewKmerSetGroupBuilder creates a builder for a new KmerSetGroup. 
+// +// Parameters: +// - directory: destination directory (created if necessary) +// - k: k-mer size (1-31) +// - m: minimizer size (-1 for auto = ceil(k/2.5)) +// - n: number of sets in the group +// - P: number of partitions (-1 for auto) +// - options: optional builder options (e.g. WithMinFrequency) +func NewKmerSetGroupBuilder(directory string, k, m, n, P int, + options ...BuilderOption) (*KmerSetGroupBuilder, error) { + + if k < 2 || k > 31 { + return nil, fmt.Errorf("obikmer: k must be between 2 and 31, got %d", k) + } + if n < 1 { + return nil, fmt.Errorf("obikmer: n must be >= 1, got %d", n) + } + + // Auto minimizer size + if m < 0 { + m = int(math.Ceil(float64(k) / 2.5)) + } + if m < 1 { + m = 1 + } + if m >= k { + m = k - 1 + } + + // Auto partition count + if P < 0 { + // Use 4^m as the maximum, capped at a reasonable value + maxP := 1 << (2 * m) // 4^m + P = maxP + if P > 4096 { + P = 4096 + } + if P < 64 { + P = 64 + } + } + + // Apply options + var config builderConfig + for _, opt := range options { + opt(&config) + } + + // Create build directory structure + buildDir := filepath.Join(directory, ".build") + for s := 0; s < n; s++ { + setDir := filepath.Join(buildDir, fmt.Sprintf("set_%d", s)) + if err := os.MkdirAll(setDir, 0755); err != nil { + return nil, fmt.Errorf("obikmer: create build dir: %w", err) + } + } + + // Create SKM writers + writers := make([][]*SkmWriter, n) + mutexes := make([][]sync.Mutex, n) + for s := 0; s < n; s++ { + writers[s] = make([]*SkmWriter, P) + mutexes[s] = make([]sync.Mutex, P) + for p := 0; p < P; p++ { + path := filepath.Join(buildDir, fmt.Sprintf("set_%d", s), + fmt.Sprintf("part_%04d.skm", p)) + w, err := NewSkmWriter(path) + if err != nil { + // Close already-created writers + for ss := 0; ss <= s; ss++ { + for pp := 0; pp < P; pp++ { + if writers[ss][pp] != nil { + writers[ss][pp].Close() + } + } + } + return nil, fmt.Errorf("obikmer: create skm writer: %w", err) + } + writers[s][p] = w + } + } + + return 
&KmerSetGroupBuilder{ + dir: directory, + k: k, + m: m, + n: n, + P: P, + startIndex: 0, + config: config, + writers: writers, + mu: mutexes, + }, nil +} + +// AppendKmerSetGroupBuilder opens an existing KmerSetGroup and creates +// a builder that adds n new sets starting from the existing set count. +// The k, m, and partitions are inherited from the existing group. +func AppendKmerSetGroupBuilder(directory string, n int, options ...BuilderOption) (*KmerSetGroupBuilder, error) { + existing, err := OpenKmerSetGroup(directory) + if err != nil { + return nil, fmt.Errorf("obikmer: open existing group: %w", err) + } + + if n < 1 { + return nil, fmt.Errorf("obikmer: n must be >= 1, got %d", n) + } + + k := existing.K() + m := existing.M() + P := existing.Partitions() + startIndex := existing.Size() + + var config builderConfig + for _, opt := range options { + opt(&config) + } + + // Create build directory structure for new sets + buildDir := filepath.Join(directory, ".build") + for s := 0; s < n; s++ { + setDir := filepath.Join(buildDir, fmt.Sprintf("set_%d", s)) + if err := os.MkdirAll(setDir, 0755); err != nil { + return nil, fmt.Errorf("obikmer: create build dir: %w", err) + } + } + + // Create SKM writers for new sets + writers := make([][]*SkmWriter, n) + mutexes := make([][]sync.Mutex, n) + for s := 0; s < n; s++ { + writers[s] = make([]*SkmWriter, P) + mutexes[s] = make([]sync.Mutex, P) + for p := 0; p < P; p++ { + path := filepath.Join(buildDir, fmt.Sprintf("set_%d", s), + fmt.Sprintf("part_%04d.skm", p)) + w, err := NewSkmWriter(path) + if err != nil { + for ss := 0; ss <= s; ss++ { + for pp := 0; pp < P; pp++ { + if writers[ss][pp] != nil { + writers[ss][pp].Close() + } + } + } + return nil, fmt.Errorf("obikmer: create skm writer: %w", err) + } + writers[s][p] = w + } + } + + return &KmerSetGroupBuilder{ + dir: directory, + k: k, + m: m, + n: n, + P: P, + startIndex: startIndex, + config: config, + existing: existing, + writers: writers, + mu: mutexes, + }, 
nil +} + +// StartIndex returns the first global set index for the new sets being built. +// For new groups this is 0; for appends it is the existing group's Size(). +func (b *KmerSetGroupBuilder) StartIndex() int { + return b.startIndex +} + +// AddSequence extracts super-kmers from a sequence and writes them +// to the appropriate partition files for the given set. +func (b *KmerSetGroupBuilder) AddSequence(setIndex int, seq *obiseq.BioSequence) { + if setIndex < 0 || setIndex >= b.n { + return + } + rawSeq := seq.Sequence() + if len(rawSeq) < b.k { + return + } + for sk := range IterSuperKmers(rawSeq, b.k, b.m) { + part := int(sk.Minimizer % uint64(b.P)) + b.mu[setIndex][part].Lock() + b.writers[setIndex][part].Write(sk) + b.mu[setIndex][part].Unlock() + } +} + +// AddSuperKmer writes a single super-kmer to the appropriate partition. +func (b *KmerSetGroupBuilder) AddSuperKmer(setIndex int, sk SuperKmer) { + if setIndex < 0 || setIndex >= b.n { + return + } + part := int(sk.Minimizer % uint64(b.P)) + b.mu[setIndex][part].Lock() + b.writers[setIndex][part].Write(sk) + b.mu[setIndex][part].Unlock() +} + +// Close finalizes the construction: +// 1. Flush and close all SKM writers +// 2. For each partition of each set (in parallel): +// - Load super-kmers from .skm +// - Extract canonical k-mers +// - Sort and deduplicate (count if frequency filter) +// - Write .kdi file +// 3. Write metadata.toml +// 4. Remove .build/ directory +// +// Returns the finalized KmerSetGroup in read-only mode. +func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) { + if b.closed { + return nil, fmt.Errorf("obikmer: builder already closed") + } + b.closed = true + + // 1. Close all SKM writers + for s := 0; s < b.n; s++ { + for p := 0; p < b.P; p++ { + if err := b.writers[s][p].Close(); err != nil { + return nil, fmt.Errorf("obikmer: close skm writer set=%d part=%d: %w", s, p, err) + } + } + } + + // 2. 
Create output directory structure for new sets + for s := 0; s < b.n; s++ { + globalIdx := b.startIndex + s + setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx)) + if err := os.MkdirAll(setDir, 0755); err != nil { + return nil, fmt.Errorf("obikmer: create set dir: %w", err) + } + } + + // ===================================================================== + // 2-stage pipeline: readers (pure I/O) → workers (CPU + write) + // + // - nReaders goroutines read .skm files (pure I/O, fast) + // - nWorkers goroutines extract k-mers, sort, dedup, filter, write .kdi + // + // One unbuffered channel between stages. Readers are truly I/O-bound + // (small files, buffered reads), workers are CPU-bound and stay busy. + // ===================================================================== + totalJobs := b.n * b.P + + counts := make([][]uint64, b.n) + spectra := make([][]map[int]uint64, b.n) + var topKmers [][]*TopNKmers + for s := 0; s < b.n; s++ { + counts[s] = make([]uint64, b.P) + spectra[s] = make([]map[int]uint64, b.P) + } + if b.config.saveFreqTopN > 0 { + topKmers = make([][]*TopNKmers, b.n) + for s := 0; s < b.n; s++ { + topKmers[s] = make([]*TopNKmers, b.P) + } + } + + nCPU := obidefault.ParallelWorkers() + + // Stage sizing + nWorkers := nCPU // CPU-bound: one per core + nReaders := nCPU / 4 // pure I/O: few goroutines suffice + if nReaders < 2 { + nReaders = 2 + } + if nReaders > 4 { + nReaders = 4 + } + if nWorkers > totalJobs { + nWorkers = totalJobs + } + if nReaders > totalJobs { + nReaders = totalJobs + } + + var bar *progressbar.ProgressBar + if obidefault.ProgressBar() { + pbopt := []progressbar.Option{ + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowCount(), + progressbar.OptionShowIts(), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription("[Finalizing partitions]"), + } + bar = progressbar.NewOptions(totalJobs, pbopt...) 
+ } + + // --- Channel types --- + type partitionData struct { + setIdx int + partIdx int + skmers []SuperKmer // raw super-kmers from I/O stage + } + + type readJob struct { + setIdx int + partIdx int + } + + dataCh := make(chan *partitionData) // unbuffered + readJobs := make(chan readJob, totalJobs) + + var errMu sync.Mutex + var firstErr error + + // Fill job queue (buffered, all jobs pre-loaded) + for s := 0; s < b.n; s++ { + for p := 0; p < b.P; p++ { + readJobs <- readJob{s, p} + } + } + close(readJobs) + + // --- Stage 1: Readers (pure I/O) --- + var readWg sync.WaitGroup + for w := 0; w < nReaders; w++ { + readWg.Add(1) + go func() { + defer readWg.Done() + for rj := range readJobs { + skmers, err := b.loadPartitionRaw(rj.setIdx, rj.partIdx) + if err != nil { + errMu.Lock() + if firstErr == nil { + firstErr = err + } + errMu.Unlock() + } + dataCh <- &partitionData{rj.setIdx, rj.partIdx, skmers} + } + }() + } + + go func() { + readWg.Wait() + close(dataCh) + }() + + // --- Stage 2: Workers (CPU: extract k-mers + sort/filter + write .kdi) --- + var workWg sync.WaitGroup + for w := 0; w < nWorkers; w++ { + workWg.Add(1) + go func() { + defer workWg.Done() + for pd := range dataCh { + // CPU: extract canonical k-mers from super-kmers + kmers := extractCanonicalKmers(pd.skmers, b.k) + pd.skmers = nil // allow GC of raw super-kmers + + // CPU: sort, dedup, filter + filtered, spectrum, topN := b.sortFilterPartition(kmers) + kmers = nil // allow GC of unsorted data + + // I/O: write .kdi file + globalIdx := b.startIndex + pd.setIdx + kdiPath := filepath.Join(b.dir, + fmt.Sprintf("set_%d", globalIdx), + fmt.Sprintf("part_%04d.kdi", pd.partIdx)) + + n, err := b.writePartitionKdi(kdiPath, filtered) + if err != nil { + errMu.Lock() + if firstErr == nil { + firstErr = err + } + errMu.Unlock() + } + counts[pd.setIdx][pd.partIdx] = n + spectra[pd.setIdx][pd.partIdx] = spectrum + if topKmers != nil { + topKmers[pd.setIdx][pd.partIdx] = topN + } + if bar != nil { + 
bar.Add(1) + } + } + }() + } + + workWg.Wait() + + if bar != nil { + fmt.Fprintln(os.Stderr) + } + + if firstErr != nil { + return nil, firstErr + } + + // Aggregate per-partition spectra into per-set spectra and write spectrum.bin + for s := 0; s < b.n; s++ { + globalIdx := b.startIndex + s + setSpectrum := make(map[int]uint64) + for p := 0; p < b.P; p++ { + if spectra[s][p] != nil { + MergeSpectraMaps(setSpectrum, spectra[s][p]) + } + } + if len(setSpectrum) > 0 { + specPath := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx), "spectrum.bin") + if err := WriteSpectrum(specPath, MapToSpectrum(setSpectrum)); err != nil { + return nil, fmt.Errorf("obikmer: write spectrum set=%d: %w", globalIdx, err) + } + } + } + + // Aggregate per-partition top-N k-mers and write CSV + if topKmers != nil { + for s := 0; s < b.n; s++ { + globalIdx := b.startIndex + s + merged := NewTopNKmers(b.config.saveFreqTopN) + for p := 0; p < b.P; p++ { + merged.MergeTopN(topKmers[s][p]) + } + results := merged.Results() + if len(results) > 0 { + csvPath := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx), "top_kmers.csv") + if err := WriteTopKmersCSV(csvPath, results, b.k); err != nil { + return nil, fmt.Errorf("obikmer: write top kmers set=%d: %w", globalIdx, err) + } + } + } + } + + // 3. Build KmerSetGroup and write metadata + newCounts := make([]uint64, b.n) + for s := 0; s < b.n; s++ { + for p := 0; p < b.P; p++ { + newCounts[s] += counts[s][p] + } + } + + var ksg *KmerSetGroup + + if b.existing != nil { + // Append mode: extend existing group + ksg = b.existing + ksg.n += b.n + ksg.setsIDs = append(ksg.setsIDs, make([]string, b.n)...) + ksg.counts = append(ksg.counts, newCounts...) + newMeta := make([]map[string]interface{}, b.n) + for i := range newMeta { + newMeta[i] = make(map[string]interface{}) + } + ksg.setsMetadata = append(ksg.setsMetadata, newMeta...) 
+ } else { + // New group + setsIDs := make([]string, b.n) + setsMetadata := make([]map[string]interface{}, b.n) + for i := range setsMetadata { + setsMetadata[i] = make(map[string]interface{}) + } + ksg = &KmerSetGroup{ + path: b.dir, + k: b.k, + m: b.m, + partitions: b.P, + n: b.n, + setsIDs: setsIDs, + counts: newCounts, + setsMetadata: setsMetadata, + Metadata: make(map[string]interface{}), + } + } + + if err := ksg.saveMetadata(); err != nil { + return nil, fmt.Errorf("obikmer: write metadata: %w", err) + } + + // 4. Remove .build/ directory + buildDir := filepath.Join(b.dir, ".build") + os.RemoveAll(buildDir) + + return ksg, nil +} + +// loadPartitionRaw reads a .skm file and returns raw super-kmers. +// This is pure I/O — no k-mer extraction is done here. +// Returns nil (not an error) if the .skm file is empty or missing. +func (b *KmerSetGroupBuilder) loadPartitionRaw(setIdx, partIdx int) ([]SuperKmer, error) { + skmPath := filepath.Join(b.dir, ".build", + fmt.Sprintf("set_%d", setIdx), + fmt.Sprintf("part_%04d.skm", partIdx)) + + fi, err := os.Stat(skmPath) + if err != nil { + return nil, nil // empty partition, not an error + } + + reader, err := NewSkmReader(skmPath) + if err != nil { + return nil, nil + } + + // Estimate capacity from file size. Each super-kmer record is + // 2 bytes (length) + packed bases (~k/4 bytes), so roughly + // (2 + k/4) bytes per super-kmer on average. + avgRecordSize := 2 + b.k/4 + if avgRecordSize < 4 { + avgRecordSize = 4 + } + estCount := int(fi.Size()) / avgRecordSize + + skmers := make([]SuperKmer, 0, estCount) + for { + sk, ok := reader.Next() + if !ok { + break + } + skmers = append(skmers, sk) + } + reader.Close() + + return skmers, nil +} + +// extractCanonicalKmers extracts all canonical k-mers from a slice of super-kmers. +// This is CPU-bound work (sliding-window forward/reverse complement). 
+func extractCanonicalKmers(skmers []SuperKmer, k int) []uint64 { + // Pre-compute total capacity to avoid repeated slice growth. + // Each super-kmer of length L yields L-k+1 canonical k-mers. + total := 0 + for i := range skmers { + n := len(skmers[i].Sequence) - k + 1 + if n > 0 { + total += n + } + } + + kmers := make([]uint64, 0, total) + for _, sk := range skmers { + for kmer := range IterCanonicalKmers(sk.Sequence, k) { + kmers = append(kmers, kmer) + } + } + return kmers +} + +// sortFilterPartition sorts, deduplicates, and filters k-mers in memory (CPU-bound). +// Returns the filtered sorted slice, frequency spectrum, and optional top-N. +func (b *KmerSetGroupBuilder) sortFilterPartition(kmers []uint64) ([]uint64, map[int]uint64, *TopNKmers) { + if len(kmers) == 0 { + return nil, nil, nil + } + + // Sort (CPU-bound) — slices.Sort avoids reflection overhead of sort.Slice + slices.Sort(kmers) + + minFreq := b.config.minFreq + if minFreq <= 0 { + minFreq = 1 // simple dedup + } + maxFreq := b.config.maxFreq + + // Prepare entropy filter if requested + var entropyFilter *KmerEntropyFilter + if b.config.entropyThreshold > 0 && b.config.entropyLevelMax > 0 { + entropyFilter = NewKmerEntropyFilter(b.k, b.config.entropyLevelMax, b.config.entropyThreshold) + } + + // Prepare top-N collector if requested + var topN *TopNKmers + if b.config.saveFreqTopN > 0 { + topN = NewTopNKmers(b.config.saveFreqTopN) + } + + // Linear scan: count consecutive identical values, filter, accumulate spectrum + partSpectrum := make(map[int]uint64) + filtered := make([]uint64, 0, len(kmers)/2) + + i := 0 + for i < len(kmers) { + val := kmers[i] + c := 1 + for i+c < len(kmers) && kmers[i+c] == val { + c++ + } + partSpectrum[c]++ + if topN != nil { + topN.Add(val, c) + } + if c >= minFreq && (maxFreq <= 0 || c <= maxFreq) { + if entropyFilter == nil || entropyFilter.Accept(val) { + filtered = append(filtered, val) + } + } + i += c + } + + return filtered, partSpectrum, topN +} + +// 
writePartitionKdi writes a sorted slice of k-mers to a .kdi file (I/O-bound). +// Returns the number of k-mers written. +func (b *KmerSetGroupBuilder) writePartitionKdi(kdiPath string, kmers []uint64) (uint64, error) { + w, err := NewKdiWriter(kdiPath) + if err != nil { + return 0, err + } + + for _, val := range kmers { + if err := w.Write(val); err != nil { + w.Close() + return 0, err + } + } + + n := w.Count() + return n, w.Close() +} + +func (b *KmerSetGroupBuilder) writeEmptyKdi(path string, count *uint64) error { + w, err := NewKdiWriter(path) + if err != nil { + return err + } + *count = 0 + return w.Close() +} diff --git a/pkg/obikmer/kmer_set_builder_test.go b/pkg/obikmer/kmer_set_builder_test.go new file mode 100644 index 0000000..47b47d6 --- /dev/null +++ b/pkg/obikmer/kmer_set_builder_test.go @@ -0,0 +1,278 @@ +package obikmer + +import ( + "sort" + "testing" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) + +func TestBuilderBasic(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64) + if err != nil { + t.Fatal(err) + } + + seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "") + builder.AddSequence(0, seq) + + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + if ksg.K() != 15 { + t.Fatalf("K() = %d, want 15", ksg.K()) + } + if ksg.M() != 7 { + t.Fatalf("M() = %d, want 7", ksg.M()) + } + if ksg.Partitions() != 64 { + t.Fatalf("Partitions() = %d, want 64", ksg.Partitions()) + } + if ksg.Size() != 1 { + t.Fatalf("Size() = %d, want 1", ksg.Size()) + } + if ksg.Len(0) == 0 { + t.Fatal("Len(0) = 0, expected some k-mers") + } + + // Verify k-mers match what we'd compute directly + var expected []uint64 + for kmer := range IterCanonicalKmers(seq.Sequence(), 15) { + expected = append(expected, kmer) + } + sort.Slice(expected, func(i, j int) bool { return expected[i] < expected[j] }) + // Dedup + deduped := expected[:0] + for i, v := range expected { 
+ if i == 0 || v != expected[i-1] { + deduped = append(deduped, v) + } + } + + if ksg.Len(0) != uint64(len(deduped)) { + t.Fatalf("Len(0) = %d, expected %d unique k-mers", ksg.Len(0), len(deduped)) + } + + // Check iterator + var fromIter []uint64 + for kmer := range ksg.Iterator(0) { + fromIter = append(fromIter, kmer) + } + // The iterator does a k-way merge so should be sorted + for i := 1; i < len(fromIter); i++ { + if fromIter[i] <= fromIter[i-1] { + t.Fatalf("iterator not sorted at %d: %d <= %d", i, fromIter[i], fromIter[i-1]) + } + } + if len(fromIter) != len(deduped) { + t.Fatalf("iterator yielded %d k-mers, expected %d", len(fromIter), len(deduped)) + } + for i, v := range fromIter { + if v != deduped[i] { + t.Fatalf("iterator kmer %d: got %d, want %d", i, v, deduped[i]) + } + } +} + +func TestBuilderMultipleSequences(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64) + if err != nil { + t.Fatal(err) + } + + seqs := []string{ + "ACGTACGTACGTACGTACGTACGTACGT", + "TTTTTTTTTTTTTTTTTTTTTTTTT", + "GGGGGGGGGGGGGGGGGGGGGGGG", + } + for _, s := range seqs { + seq := obiseq.NewBioSequence("", []byte(s), "") + builder.AddSequence(0, seq) + } + + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + if ksg.Len(0) == 0 { + t.Fatal("expected k-mers after multiple sequences") + } +} + +func TestBuilderFrequencyFilter(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64, + WithMinFrequency(3)) + if err != nil { + t.Fatal(err) + } + + // Add same sequence 3 times — all k-mers should survive freq=3 + seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "") + for i := 0; i < 3; i++ { + builder.AddSequence(0, seq) + } + + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + // All k-mers appear exactly 3 times → all should survive + var expected []uint64 + for kmer := range IterCanonicalKmers(seq.Sequence(), 15) { + expected = 
append(expected, kmer) + } + sort.Slice(expected, func(i, j int) bool { return expected[i] < expected[j] }) + deduped := expected[:0] + for i, v := range expected { + if i == 0 || v != expected[i-1] { + deduped = append(deduped, v) + } + } + + if ksg.Len(0) != uint64(len(deduped)) { + t.Fatalf("Len(0) = %d, expected %d (all k-mers at freq=3)", ksg.Len(0), len(deduped)) + } +} + +func TestBuilderFrequencyFilterRejects(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64, + WithMinFrequency(5)) + if err != nil { + t.Fatal(err) + } + + // Use a non-repetitive sequence so each canonical k-mer appears once per pass. + // Adding it twice gives freq=2 per kmer, which is < minFreq=5 → all rejected. + seq := obiseq.NewBioSequence("test", + []byte("ACGATCGATCTAGCTAGCTGATCGATCGATCG"), "") + builder.AddSequence(0, seq) + builder.AddSequence(0, seq) + + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + if ksg.Len(0) != 0 { + t.Fatalf("Len(0) = %d, expected 0 (all k-mers at freq=2 < minFreq=5)", ksg.Len(0)) + } +} + +func TestBuilderMultipleSets(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 3, 64) + if err != nil { + t.Fatal(err) + } + + seqs := []string{ + "ACGTACGTACGTACGTACGTACGTACGT", + "TTTTTTTTTTTTTTTTTTTTTTTTT", + "GGGGGGGGGGGGGGGGGGGGGGGG", + } + for i, s := range seqs { + seq := obiseq.NewBioSequence("", []byte(s), "") + builder.AddSequence(i, seq) + } + + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + if ksg.Size() != 3 { + t.Fatalf("Size() = %d, want 3", ksg.Size()) + } + for s := 0; s < 3; s++ { + if ksg.Len(s) == 0 { + t.Fatalf("Len(%d) = 0, expected some k-mers", s) + } + } +} + +func TestBuilderOpenRoundTrip(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64) + if err != nil { + t.Fatal(err) + } + + seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "") + 
builder.AddSequence(0, seq) + + ksg1, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + // Reopen + ksg2, err := OpenKmerSetGroup(dir) + if err != nil { + t.Fatal(err) + } + + if ksg2.K() != ksg1.K() { + t.Fatalf("K mismatch: %d vs %d", ksg2.K(), ksg1.K()) + } + if ksg2.M() != ksg1.M() { + t.Fatalf("M mismatch: %d vs %d", ksg2.M(), ksg1.M()) + } + if ksg2.Partitions() != ksg1.Partitions() { + t.Fatalf("Partitions mismatch: %d vs %d", ksg2.Partitions(), ksg1.Partitions()) + } + if ksg2.Len(0) != ksg1.Len(0) { + t.Fatalf("Len mismatch: %d vs %d", ksg2.Len(0), ksg1.Len(0)) + } +} + +func TestBuilderAttributes(t *testing.T) { + dir := t.TempDir() + + builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64) + if err != nil { + t.Fatal(err) + } + + seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "") + builder.AddSequence(0, seq) + + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + + ksg.SetId("my_index") + ksg.SetAttribute("organism", "test") + ksg.SaveMetadata() + + // Reopen and check + ksg2, err := OpenKmerSetGroup(dir) + if err != nil { + t.Fatal(err) + } + + if ksg2.Id() != "my_index" { + t.Fatalf("Id() = %q, want %q", ksg2.Id(), "my_index") + } + if !ksg2.HasAttribute("organism") { + t.Fatal("expected 'organism' attribute") + } + v, _ := ksg2.GetAttribute("organism") + if v != "test" { + t.Fatalf("organism = %v, want 'test'", v) + } +} diff --git a/pkg/obikmer/kmer_set_disk.go b/pkg/obikmer/kmer_set_disk.go new file mode 100644 index 0000000..7397fcf --- /dev/null +++ b/pkg/obikmer/kmer_set_disk.go @@ -0,0 +1,944 @@ +package obikmer + +import ( + "fmt" + "io" + "iter" + "os" + "path" + "path/filepath" + "sort" + "sync" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist" + "github.com/pelletier/go-toml/v2" +) + +// MetadataFormat represents the metadata serialization format. 
// Currently only TOML is used for disk-based indices, but the type
// is kept for backward compatibility with CLI options.
type MetadataFormat int

// Known metadata formats. FormatTOML is the zero value.
const (
	FormatTOML MetadataFormat = iota
	FormatYAML
	FormatJSON
)

// String returns the file extension associated with the format.
// Any value other than FormatYAML or FormatJSON yields "toml".
func (f MetadataFormat) String() string {
	if f == FormatYAML {
		return "yaml"
	}
	if f == FormatJSON {
		return "json"
	}
	return "toml"
}

// KmerSetGroup describes a directory holding N k-mer sets that share the
// same k, m, and partition count P.
//
// A KmerSetGroup whose Size() is 1 plays the role of a single KmerSet.
type KmerSetGroup struct {
	path         string                   // root directory
	id           string                   // user-assigned identifier
	k            int                      // k-mer size
	m            int                      // minimizer size
	partitions   int                      // number of partitions P
	n            int                      // number of sets N
	setsIDs      []string                 // IDs of individual sets
	counts       []uint64                 // total k-mer count per set (sum over partitions)
	setsMetadata []map[string]interface{} // per-set user metadata
	Metadata     map[string]interface{}   // group-level user metadata
}

// diskMetadata mirrors KmerSetGroup in the shape written to and read from
// metadata.toml; the struct tags give the TOML key of each field.
type diskMetadata struct {
	ID           string                   `toml:"id,omitempty"`
	K            int                      `toml:"k"`
	M            int                      `toml:"m"`
	Partitions   int                      `toml:"partitions"`
	Type         string                   `toml:"type"`
	Size         int                      `toml:"size"`
	SetsIDs      []string                 `toml:"sets_ids,omitempty"`
	Counts       []uint64                 `toml:"counts,omitempty"`
	SetsMetadata []map[string]interface{} `toml:"sets_metadata,omitempty"`
	UserMetadata map[string]interface{}   `toml:"user_metadata,omitempty"`
}

// OpenKmerSetGroup opens a finalized index directory in read-only mode.
+func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) { + metaPath := filepath.Join(directory, "metadata.toml") + f, err := os.Open(metaPath) + if err != nil { + return nil, fmt.Errorf("obikmer: open metadata: %w", err) + } + defer f.Close() + + var meta diskMetadata + if err := toml.NewDecoder(f).Decode(&meta); err != nil { + return nil, fmt.Errorf("obikmer: decode metadata: %w", err) + } + + ksg := &KmerSetGroup{ + path: directory, + id: meta.ID, + k: meta.K, + m: meta.M, + partitions: meta.Partitions, + n: meta.Size, + setsIDs: meta.SetsIDs, + counts: meta.Counts, + setsMetadata: meta.SetsMetadata, + Metadata: meta.UserMetadata, + } + if ksg.Metadata == nil { + ksg.Metadata = make(map[string]interface{}) + } + if ksg.setsIDs == nil { + ksg.setsIDs = make([]string, ksg.n) + } + if ksg.setsMetadata == nil { + ksg.setsMetadata = make([]map[string]interface{}, ksg.n) + for i := range ksg.setsMetadata { + ksg.setsMetadata[i] = make(map[string]interface{}) + } + } + if ksg.counts == nil { + // Compute counts by scanning partitions + ksg.counts = make([]uint64, ksg.n) + for s := 0; s < ksg.n; s++ { + for p := 0; p < ksg.partitions; p++ { + path := ksg.partitionPath(s, p) + r, err := NewKdiReader(path) + if err != nil { + continue + } + ksg.counts[s] += r.Count() + r.Close() + } + } + } + + return ksg, nil +} + +// NewFilteredKmerSetGroup creates a KmerSetGroup from pre-computed data. +// Used by the filter command to construct a new group after filtering partitions. +func NewFilteredKmerSetGroup( + directory string, k, m, partitions, n int, + setsIDs []string, counts []uint64, + setsMetadata []map[string]interface{}, +) (*KmerSetGroup, error) { + ksg := &KmerSetGroup{ + path: directory, + k: k, + m: m, + partitions: partitions, + n: n, + setsIDs: setsIDs, + counts: counts, + setsMetadata: setsMetadata, + Metadata: make(map[string]interface{}), + } + return ksg, nil +} + +// SaveMetadata writes the metadata.toml file. 
This is useful after +// modifying attributes or IDs on an already-finalized index. +func (ksg *KmerSetGroup) SaveMetadata() error { + return ksg.saveMetadata() +} + +// saveMetadata writes the metadata.toml file (internal). +func (ksg *KmerSetGroup) saveMetadata() error { + meta := diskMetadata{ + ID: ksg.id, + K: ksg.k, + M: ksg.m, + Partitions: ksg.partitions, + Type: "KmerSetGroup", + Size: ksg.n, + SetsIDs: ksg.setsIDs, + Counts: ksg.counts, + SetsMetadata: ksg.setsMetadata, + UserMetadata: ksg.Metadata, + } + + metaPath := filepath.Join(ksg.path, "metadata.toml") + f, err := os.Create(metaPath) + if err != nil { + return err + } + defer f.Close() + + return toml.NewEncoder(f).Encode(meta) +} + +// partitionPath returns the file path for partition p of set s. +func (ksg *KmerSetGroup) partitionPath(setIndex, partIndex int) string { + return filepath.Join(ksg.path, fmt.Sprintf("set_%d", setIndex), + fmt.Sprintf("part_%04d.kdi", partIndex)) +} + +// Path returns the root directory of the index. +func (ksg *KmerSetGroup) Path() string { + return ksg.path +} + +// K returns the k-mer size. +func (ksg *KmerSetGroup) K() int { + return ksg.k +} + +// M returns the minimizer size. +func (ksg *KmerSetGroup) M() int { + return ksg.m +} + +// Partitions returns the number of partitions P. +func (ksg *KmerSetGroup) Partitions() int { + return ksg.partitions +} + +// Size returns the number of sets N. +func (ksg *KmerSetGroup) Size() int { + return ksg.n +} + +// Id returns the group identifier. +func (ksg *KmerSetGroup) Id() string { + return ksg.id +} + +// SetId sets the group identifier and persists the change. +func (ksg *KmerSetGroup) SetId(id string) { + ksg.id = id +} + +// Len returns the total number of k-mers. +// Without argument: total across all sets. +// With argument setIndex: count for that specific set. 
+func (ksg *KmerSetGroup) Len(setIndex ...int) uint64 { + if len(setIndex) == 0 { + var total uint64 + for _, c := range ksg.counts { + total += c + } + return total + } + idx := setIndex[0] + if idx < 0 || idx >= ksg.n { + return 0 + } + return ksg.counts[idx] +} + +// Contains checks if a k-mer is present in the specified set. +// Uses the .kdx sparse index (if available) for fast seeking within +// each partition, then a short linear scan of at most `stride` entries. +// All partitions are searched in parallel since the k-mer's partition +// is not known without its minimizer context. +func (ksg *KmerSetGroup) Contains(setIndex int, kmer uint64) bool { + if setIndex < 0 || setIndex >= ksg.n { + return false + } + + type result struct { + found bool + } + ch := make(chan result, ksg.partitions) + + for p := 0; p < ksg.partitions; p++ { + go func(part int) { + r, err := NewKdiIndexedReader(ksg.partitionPath(setIndex, part)) + if err != nil { + ch <- result{false} + return + } + defer r.Close() + + // Use index to jump near the target + if err := r.SeekTo(kmer); err != nil { + ch <- result{false} + return + } + + // Linear scan from the seek position + for { + v, ok := r.Next() + if !ok { + ch <- result{false} + return + } + if v == kmer { + ch <- result{true} + return + } + if v > kmer { + ch <- result{false} + return + } + } + }(p) + } + + for i := 0; i < ksg.partitions; i++ { + res := <-ch + if res.found { + // Drain remaining goroutines + go func() { + for j := i + 1; j < ksg.partitions; j++ { + <-ch + } + }() + return true + } + } + return false +} + +// Iterator returns an iterator over all k-mers in the specified set, +// in sorted order within each partition. Since partitions are independent, +// to get a globally sorted stream, use iteratorSorted. 
+func (ksg *KmerSetGroup) Iterator(setIndex int) iter.Seq[uint64] { + return func(yield func(uint64) bool) { + if setIndex < 0 || setIndex >= ksg.n { + return + } + + // Open all partition readers and merge them + readers := make([]*KdiReader, 0, ksg.partitions) + for p := 0; p < ksg.partitions; p++ { + r, err := NewKdiReader(ksg.partitionPath(setIndex, p)) + if err != nil { + continue + } + if r.Count() > 0 { + readers = append(readers, r) + } else { + r.Close() + } + } + + if len(readers) == 0 { + return + } + + m := NewKWayMerge(readers) + defer m.Close() + + for { + kmer, _, ok := m.Next() + if !ok { + return + } + if !yield(kmer) { + return + } + } + } +} + +// ============================== +// Attribute API (compatible with old API) +// ============================== + +// HasAttribute checks if a metadata key exists. +func (ksg *KmerSetGroup) HasAttribute(key string) bool { + _, ok := ksg.Metadata[key] + return ok +} + +// GetAttribute returns the value of an attribute. +func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool) { + switch key { + case "id": + return ksg.Id(), true + case "k": + return ksg.K(), true + default: + value, ok := ksg.Metadata[key] + return value, ok + } +} + +// SetAttribute sets a metadata attribute. +func (ksg *KmerSetGroup) SetAttribute(key string, value interface{}) { + switch key { + case "id": + if id, ok := value.(string); ok { + ksg.SetId(id) + } else { + panic(fmt.Sprintf("id must be a string, got %T", value)) + } + case "k": + panic("k is immutable") + default: + ksg.Metadata[key] = value + } +} + +// DeleteAttribute removes a metadata attribute. +func (ksg *KmerSetGroup) DeleteAttribute(key string) { + delete(ksg.Metadata, key) +} + +// GetIntAttribute returns an attribute as int. 
+func (ksg *KmerSetGroup) GetIntAttribute(key string) (int, bool) { + v, ok := ksg.GetAttribute(key) + if !ok { + return 0, false + } + switch val := v.(type) { + case int: + return val, true + case int64: + return int(val), true + case float64: + return int(val), true + } + return 0, false +} + +// GetStringAttribute returns an attribute as string. +func (ksg *KmerSetGroup) GetStringAttribute(key string) (string, bool) { + v, ok := ksg.GetAttribute(key) + if !ok { + return "", false + } + if s, ok := v.(string); ok { + return s, true + } + return fmt.Sprintf("%v", v), true +} + +// ============================== +// Jaccard metrics (streaming, disk-based) +// ============================== + +// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix +// for all sets in the group. Operates partition by partition in streaming. +func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix { + n := ksg.n + labels := make([]string, n) + for i := 0; i < n; i++ { + if i < len(ksg.setsIDs) && ksg.setsIDs[i] != "" { + labels[i] = ksg.setsIDs[i] + } else { + labels[i] = fmt.Sprintf("set_%d", i) + } + } + + dm := obidist.NewDistMatrixWithLabels(labels) + + // Accumulate intersection and union counts + intersections := make([][]uint64, n) + unions := make([][]uint64, n) + for i := 0; i < n; i++ { + intersections[i] = make([]uint64, n) + unions[i] = make([]uint64, n) + } + + // Process partition by partition + var mu sync.Mutex + var wg sync.WaitGroup + + for p := 0; p < ksg.partitions; p++ { + wg.Add(1) + go func(part int) { + defer wg.Done() + + // Open all set readers for this partition + readers := make([]*KdiReader, n) + for s := 0; s < n; s++ { + r, err := NewKdiReader(ksg.partitionPath(s, part)) + if err != nil { + continue + } + readers[s] = r + } + defer func() { + for _, r := range readers { + if r != nil { + r.Close() + } + } + }() + + // Merge all N readers to count intersections and unions + activeReaders := make([]*KdiReader, 0, n) + 
activeIndices := make([]int, 0, n) + for i, r := range readers { + if r != nil && r.Count() > 0 { + activeReaders = append(activeReaders, r) + activeIndices = append(activeIndices, i) + } + } + if len(activeReaders) == 0 { + return + } + + merge := NewKWayMerge(activeReaders) + // Don't close merge here since readers are managed above + // We only want to iterate + + // We need per-set presence tracking, so we use a custom merge + // Rebuild with a direct approach + merge.Close() // close the merge (which closes readers) + + // Reopen readers for custom merge + for s := 0; s < n; s++ { + readers[s] = nil + r, err := NewKdiReader(ksg.partitionPath(s, part)) + if err != nil { + continue + } + if r.Count() > 0 { + readers[s] = r + } else { + r.Close() + } + } + + // Custom k-way merge that tracks which sets contain each kmer + type entry struct { + val uint64 + setIdx int + } + + // Use a simpler approach: read all values for this partition into memory + // for each set, then do a merge + setKmers := make([][]uint64, n) + for s := 0; s < n; s++ { + if readers[s] == nil { + continue + } + kmers := make([]uint64, 0, readers[s].Count()) + for { + v, ok := readers[s].Next() + if !ok { + break + } + kmers = append(kmers, v) + } + setKmers[s] = kmers + readers[s].Close() + readers[s] = nil + } + + // Count pairwise intersections using sorted merge + // For each pair (i,j), count kmers present in both + localInter := make([][]uint64, n) + localUnion := make([][]uint64, n) + for i := 0; i < n; i++ { + localInter[i] = make([]uint64, n) + localUnion[i] = make([]uint64, n) + } + + for i := 0; i < n; i++ { + localUnion[i][i] = uint64(len(setKmers[i])) + for j := i + 1; j < n; j++ { + a, b := setKmers[i], setKmers[j] + var inter uint64 + ai, bi := 0, 0 + for ai < len(a) && bi < len(b) { + if a[ai] == b[bi] { + inter++ + ai++ + bi++ + } else if a[ai] < b[bi] { + ai++ + } else { + bi++ + } + } + localInter[i][j] = inter + localUnion[i][j] = uint64(len(a)) + uint64(len(b)) - inter + 
} + } + + mu.Lock() + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + intersections[i][j] += localInter[i][j] + unions[i][j] += localUnion[i][j] + } + } + mu.Unlock() + }(p) + } + wg.Wait() + + // Compute distances from accumulated counts + for i := 0; i < n-1; i++ { + for j := i + 1; j < n; j++ { + u := unions[i][j] + if u == 0 { + dm.Set(i, j, 1.0) + } else { + dm.Set(i, j, 1.0-float64(intersections[i][j])/float64(u)) + } + } + } + + return dm +} + +// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix. +func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix { + n := ksg.n + labels := make([]string, n) + for i := 0; i < n; i++ { + if i < len(ksg.setsIDs) && ksg.setsIDs[i] != "" { + labels[i] = ksg.setsIDs[i] + } else { + labels[i] = fmt.Sprintf("set_%d", i) + } + } + + // Reuse distance computation + dm := ksg.JaccardDistanceMatrix() + sm := obidist.NewSimilarityMatrixWithLabels(labels) + + for i := 0; i < n-1; i++ { + for j := i + 1; j < n; j++ { + sm.Set(i, j, 1.0-dm.Get(i, j)) + } + } + + return sm +} + +// ============================== +// Set ID accessors +// ============================== + +// SetsIDs returns a copy of the per-set string identifiers. +func (ksg *KmerSetGroup) SetsIDs() []string { + out := make([]string, len(ksg.setsIDs)) + copy(out, ksg.setsIDs) + return out +} + +// SetIDOf returns the string ID of the set at the given index. +// Returns "" if index is out of range. +func (ksg *KmerSetGroup) SetIDOf(index int) string { + if index < 0 || index >= ksg.n { + return "" + } + return ksg.setsIDs[index] +} + +// SetSetID sets the string ID of the set at the given index. +func (ksg *KmerSetGroup) SetSetID(index int, id string) { + if index >= 0 && index < ksg.n { + ksg.setsIDs[index] = id + } +} + +// IndexOfSetID returns the numeric index for a set ID, or -1 if not found. 
+func (ksg *KmerSetGroup) IndexOfSetID(id string) int { + for i, sid := range ksg.setsIDs { + if sid == id { + return i + } + } + return -1 +} + +// MatchSetIDs resolves glob patterns against set IDs and returns matching +// indices sorted in ascending order. Uses path.Match for pattern matching +// (supports *, ?, [...] patterns). Returns error if a pattern is malformed. +func (ksg *KmerSetGroup) MatchSetIDs(patterns []string) ([]int, error) { + seen := make(map[int]bool) + for _, pattern := range patterns { + for i, sid := range ksg.setsIDs { + matched, err := path.Match(pattern, sid) + if err != nil { + return nil, fmt.Errorf("obikmer: invalid glob pattern %q: %w", pattern, err) + } + if matched { + seen[i] = true + } + } + } + result := make([]int, 0, len(seen)) + for idx := range seen { + result = append(result, idx) + } + sort.Ints(result) + return result, nil +} + +// ============================== +// Per-set metadata accessors +// ============================== + +// GetSetMetadata returns the value of a per-set metadata key. +func (ksg *KmerSetGroup) GetSetMetadata(setIndex int, key string) (interface{}, bool) { + if setIndex < 0 || setIndex >= ksg.n { + return nil, false + } + v, ok := ksg.setsMetadata[setIndex][key] + return v, ok +} + +// SetSetMetadata sets a per-set metadata attribute. +func (ksg *KmerSetGroup) SetSetMetadata(setIndex int, key string, value interface{}) { + if setIndex < 0 || setIndex >= ksg.n { + return + } + if ksg.setsMetadata[setIndex] == nil { + ksg.setsMetadata[setIndex] = make(map[string]interface{}) + } + ksg.setsMetadata[setIndex][key] = value +} + +// DeleteSetMetadata removes a per-set metadata attribute. +func (ksg *KmerSetGroup) DeleteSetMetadata(setIndex int, key string) { + if setIndex < 0 || setIndex >= ksg.n { + return + } + delete(ksg.setsMetadata[setIndex], key) +} + +// AllSetMetadata returns a copy of all metadata for a given set. 
+func (ksg *KmerSetGroup) AllSetMetadata(setIndex int) map[string]interface{} { + if setIndex < 0 || setIndex >= ksg.n { + return nil + } + out := make(map[string]interface{}, len(ksg.setsMetadata[setIndex])) + for k, v := range ksg.setsMetadata[setIndex] { + out[k] = v + } + return out +} + +// ============================== +// Exported partition path and compatibility +// ============================== + +// PartitionPath returns the file path for partition partIndex of set setIndex. +func (ksg *KmerSetGroup) PartitionPath(setIndex, partIndex int) string { + return ksg.partitionPath(setIndex, partIndex) +} + +// SpectrumPath returns the path to the spectrum.bin file for the given set. +func (ksg *KmerSetGroup) SpectrumPath(setIndex int) string { + return filepath.Join(ksg.path, fmt.Sprintf("set_%d", setIndex), "spectrum.bin") +} + +// Spectrum reads the k-mer frequency spectrum for the given set. +// Returns nil, nil if no spectrum file exists. +func (ksg *KmerSetGroup) Spectrum(setIndex int) (*KmerSpectrum, error) { + path := ksg.SpectrumPath(setIndex) + if _, err := os.Stat(path); os.IsNotExist(err) { + return nil, nil + } + return ReadSpectrum(path) +} + +// IsCompatibleWith returns true if the other group has the same k, m, and partitions. +func (ksg *KmerSetGroup) IsCompatibleWith(other *KmerSetGroup) bool { + return ksg.k == other.k && ksg.m == other.m && ksg.partitions == other.partitions +} + +// ============================== +// Set management operations +// ============================== + +// NewEmptyCompatible creates an empty KmerSetGroup at destDir with the same +// k, m, and partitions as this group. The destination must not already exist. 
+func (ksg *KmerSetGroup) NewEmptyCompatible(destDir string) (*KmerSetGroup, error) { + if err := os.MkdirAll(destDir, 0755); err != nil { + return nil, fmt.Errorf("obikmer: create directory: %w", err) + } + + dest := &KmerSetGroup{ + path: destDir, + k: ksg.k, + m: ksg.m, + partitions: ksg.partitions, + n: 0, + setsIDs: []string{}, + counts: []uint64{}, + setsMetadata: []map[string]interface{}{}, + Metadata: make(map[string]interface{}), + } + + if err := dest.saveMetadata(); err != nil { + return nil, fmt.Errorf("obikmer: write metadata: %w", err) + } + + return dest, nil +} + +// RemoveSetByID removes the set with the given ID from the group. +// It deletes the set directory, renumbers all subsequent sets, and +// updates the metadata on disk. +func (ksg *KmerSetGroup) RemoveSetByID(id string) error { + idx := ksg.IndexOfSetID(id) + if idx < 0 { + return fmt.Errorf("obikmer: set ID %q not found", id) + } + + // Delete the set directory + setDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", idx)) + if err := os.RemoveAll(setDir); err != nil { + return fmt.Errorf("obikmer: remove set directory: %w", err) + } + + // Renumber subsequent sets + for i := idx + 1; i < ksg.n; i++ { + oldDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i)) + newDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i-1)) + if err := os.Rename(oldDir, newDir); err != nil { + return fmt.Errorf("obikmer: rename set_%d to set_%d: %w", i, i-1, err) + } + } + + // Update slices + ksg.setsIDs = append(ksg.setsIDs[:idx], ksg.setsIDs[idx+1:]...) + ksg.counts = append(ksg.counts[:idx], ksg.counts[idx+1:]...) + ksg.setsMetadata = append(ksg.setsMetadata[:idx], ksg.setsMetadata[idx+1:]...) + ksg.n-- + + return ksg.saveMetadata() +} + +// CopySetsByIDTo copies sets identified by their IDs into a KmerSetGroup +// at destDir. If destDir does not exist, a new compatible empty group is +// created. If it exists, compatibility (k, m, partitions) is checked. 
+// If a set ID already exists in the destination, an error is returned +// unless force is true (in which case the existing set is replaced). +// Per-set metadata travels with the set. +func (ksg *KmerSetGroup) CopySetsByIDTo(ids []string, destDir string, force bool) (*KmerSetGroup, error) { + // Resolve source IDs to indices + srcIndices := make([]int, len(ids)) + for i, id := range ids { + idx := ksg.IndexOfSetID(id) + if idx < 0 { + return nil, fmt.Errorf("obikmer: source set ID %q not found", id) + } + srcIndices[i] = idx + } + + // Open or create destination + var dest *KmerSetGroup + metaPath := filepath.Join(destDir, "metadata.toml") + if _, err := os.Stat(metaPath); err == nil { + // Destination exists + dest, err = OpenKmerSetGroup(destDir) + if err != nil { + return nil, fmt.Errorf("obikmer: open destination: %w", err) + } + if !ksg.IsCompatibleWith(dest) { + return nil, fmt.Errorf("obikmer: incompatible groups: source (k=%d, m=%d, P=%d) vs dest (k=%d, m=%d, P=%d)", + ksg.k, ksg.m, ksg.partitions, dest.k, dest.m, dest.partitions) + } + } else { + // Create new destination + var err error + dest, err = ksg.NewEmptyCompatible(destDir) + if err != nil { + return nil, err + } + } + + // Copy each set + for i, srcIdx := range srcIndices { + srcID := ids[i] + + // Check for ID conflict in destination + existingIdx := dest.IndexOfSetID(srcID) + if existingIdx >= 0 { + if !force { + return nil, fmt.Errorf("obikmer: set ID %q already exists in destination (use force to replace)", srcID) + } + // Force: remove existing set in destination + if err := dest.RemoveSetByID(srcID); err != nil { + return nil, fmt.Errorf("obikmer: remove existing set %q in destination: %w", srcID, err) + } + } + + // Destination set index = current dest size + destIdx := dest.n + + // Create destination set directory + destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", destIdx)) + if err := os.MkdirAll(destSetDir, 0755); err != nil { + return nil, fmt.Errorf("obikmer: create dest 
set dir: %w", err) + } + + // Copy all partition files and their .kdx indices + for p := 0; p < ksg.partitions; p++ { + srcPath := ksg.partitionPath(srcIdx, p) + destPath := dest.partitionPath(destIdx, p) + if err := copyFile(srcPath, destPath); err != nil { + return nil, fmt.Errorf("obikmer: copy partition %d of set %q: %w", p, srcID, err) + } + // Copy .kdx index if it exists + srcKdx := KdxPathForKdi(srcPath) + if _, err := os.Stat(srcKdx); err == nil { + destKdx := KdxPathForKdi(destPath) + if err := copyFile(srcKdx, destKdx); err != nil { + return nil, fmt.Errorf("obikmer: copy index %d of set %q: %w", p, srcID, err) + } + } + } + + // Copy spectrum.bin if it exists + srcSpecPath := ksg.SpectrumPath(srcIdx) + if _, err := os.Stat(srcSpecPath); err == nil { + destSpecPath := filepath.Join(destSetDir, "spectrum.bin") + if err := copyFile(srcSpecPath, destSpecPath); err != nil { + return nil, fmt.Errorf("obikmer: copy spectrum of set %q: %w", srcID, err) + } + } + + // Update destination metadata + dest.setsIDs = append(dest.setsIDs, srcID) + dest.counts = append(dest.counts, ksg.counts[srcIdx]) + + // Copy per-set metadata + srcMeta := ksg.AllSetMetadata(srcIdx) + if srcMeta == nil { + srcMeta = make(map[string]interface{}) + } + dest.setsMetadata = append(dest.setsMetadata, srcMeta) + dest.n++ + } + + if err := dest.saveMetadata(); err != nil { + return nil, fmt.Errorf("obikmer: save destination metadata: %w", err) + } + + return dest, nil +} + +// copyFile copies a file from src to dst. 
+func copyFile(src, dst string) error { + in, err := os.Open(src) + if err != nil { + return err + } + defer in.Close() + + out, err := os.Create(dst) + if err != nil { + return err + } + defer out.Close() + + if _, err := io.Copy(out, in); err != nil { + return err + } + + return out.Close() +} diff --git a/pkg/obikmer/kmer_set_disk_ops.go b/pkg/obikmer/kmer_set_disk_ops.go new file mode 100644 index 0000000..4c96624 --- /dev/null +++ b/pkg/obikmer/kmer_set_disk_ops.go @@ -0,0 +1,568 @@ +package obikmer + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + "sync" +) + +// Union computes the union of all sets in the group, producing a new +// singleton KmerSetGroup on disk. A k-mer is in the result if it +// appears in any set. +func (ksg *KmerSetGroup) Union(outputDir string) (*KmerSetGroup, error) { + return ksg.quorumOp(outputDir, 1, ksg.n) +} + +// Intersect computes the intersection of all sets, producing a new +// singleton KmerSetGroup on disk. A k-mer is in the result if it +// appears in every set. +func (ksg *KmerSetGroup) Intersect(outputDir string) (*KmerSetGroup, error) { + return ksg.quorumOp(outputDir, ksg.n, ksg.n) +} + +// Difference computes set_0 minus the union of all other sets. +func (ksg *KmerSetGroup) Difference(outputDir string) (*KmerSetGroup, error) { + return ksg.differenceOp(outputDir) +} + +// QuorumAtLeast returns k-mers present in at least q sets. +func (ksg *KmerSetGroup) QuorumAtLeast(q int, outputDir string) (*KmerSetGroup, error) { + return ksg.quorumOp(outputDir, q, ksg.n) +} + +// QuorumExactly returns k-mers present in exactly q sets. +func (ksg *KmerSetGroup) QuorumExactly(q int, outputDir string) (*KmerSetGroup, error) { + return ksg.quorumOp(outputDir, q, q) +} + +// QuorumAtMost returns k-mers present in at most q sets. 
+func (ksg *KmerSetGroup) QuorumAtMost(q int, outputDir string) (*KmerSetGroup, error) { + return ksg.quorumOp(outputDir, 1, q) +} + +// UnionWith merges this group with another, producing a new KmerSetGroup +// whose set_i is the union of this.set_i and other.set_i. +// Both groups must have the same k, m, P, and N. +func (ksg *KmerSetGroup) UnionWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error) { + if err := ksg.checkCompatible(other); err != nil { + return nil, err + } + return ksg.pairwiseOp(other, outputDir, mergeUnion) +} + +// IntersectWith merges this group with another, producing a new KmerSetGroup +// whose set_i is the intersection of this.set_i and other.set_i. +func (ksg *KmerSetGroup) IntersectWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error) { + if err := ksg.checkCompatible(other); err != nil { + return nil, err + } + return ksg.pairwiseOp(other, outputDir, mergeIntersect) +} + +// ============================== +// Internal implementation +// ============================== + +func (ksg *KmerSetGroup) checkCompatible(other *KmerSetGroup) error { + if ksg.k != other.k { + return fmt.Errorf("obikmer: incompatible k: %d vs %d", ksg.k, other.k) + } + if ksg.m != other.m { + return fmt.Errorf("obikmer: incompatible m: %d vs %d", ksg.m, other.m) + } + if ksg.partitions != other.partitions { + return fmt.Errorf("obikmer: incompatible partitions: %d vs %d", ksg.partitions, other.partitions) + } + if ksg.n != other.n { + return fmt.Errorf("obikmer: incompatible size: %d vs %d", ksg.n, other.n) + } + return nil +} + +// quorumOp processes all N sets partition by partition. +// For each partition, it opens N KdiReaders and does a k-way merge. +// A kmer is written to the result if minQ <= count <= maxQ. 
+func (ksg *KmerSetGroup) quorumOp(outputDir string, minQ, maxQ int) (*KmerSetGroup, error) { + if minQ < 1 { + minQ = 1 + } + if maxQ > ksg.n { + maxQ = ksg.n + } + + // Create output structure + setDir := filepath.Join(outputDir, "set_0") + if err := os.MkdirAll(setDir, 0755); err != nil { + return nil, err + } + + counts := make([]uint64, ksg.partitions) + + nWorkers := runtime.NumCPU() + if nWorkers > ksg.partitions { + nWorkers = ksg.partitions + } + + jobs := make(chan int, ksg.partitions) + var wg sync.WaitGroup + var errMu sync.Mutex + var firstErr error + + for w := 0; w < nWorkers; w++ { + wg.Add(1) + go func() { + defer wg.Done() + for p := range jobs { + c, err := ksg.quorumPartition(p, setDir, minQ, maxQ) + if err != nil { + errMu.Lock() + if firstErr == nil { + firstErr = err + } + errMu.Unlock() + return + } + counts[p] = c + } + }() + } + + for p := 0; p < ksg.partitions; p++ { + jobs <- p + } + close(jobs) + wg.Wait() + + if firstErr != nil { + return nil, firstErr + } + + var totalCount uint64 + for _, c := range counts { + totalCount += c + } + + result := &KmerSetGroup{ + path: outputDir, + k: ksg.k, + m: ksg.m, + partitions: ksg.partitions, + n: 1, + setsIDs: []string{""}, + counts: []uint64{totalCount}, + Metadata: make(map[string]interface{}), + } + + if err := result.saveMetadata(); err != nil { + return nil, err + } + + return result, nil +} + +// quorumPartition processes a single partition for quorum filtering. 
+func (ksg *KmerSetGroup) quorumPartition(partIdx int, outSetDir string, minQ, maxQ int) (uint64, error) { + // Open readers for all sets + readers := make([]*KdiReader, 0, ksg.n) + for s := 0; s < ksg.n; s++ { + r, err := NewKdiReader(ksg.partitionPath(s, partIdx)) + if err != nil { + // Close already-opened readers + for _, rr := range readers { + rr.Close() + } + return 0, err + } + if r.Count() > 0 { + readers = append(readers, r) + } else { + r.Close() + } + } + + outPath := filepath.Join(outSetDir, fmt.Sprintf("part_%04d.kdi", partIdx)) + + if len(readers) == 0 { + // Write empty KDI + w, err := NewKdiWriter(outPath) + if err != nil { + return 0, err + } + return 0, w.Close() + } + + merge := NewKWayMerge(readers) + // merge.Close() will close readers + + w, err := NewKdiWriter(outPath) + if err != nil { + merge.Close() + return 0, err + } + + for { + kmer, count, ok := merge.Next() + if !ok { + break + } + if count >= minQ && count <= maxQ { + if err := w.Write(kmer); err != nil { + merge.Close() + w.Close() + return 0, err + } + } + } + + merge.Close() + cnt := w.Count() + return cnt, w.Close() +} + +// differenceOp computes set_0 minus the union of all other sets. 
+func (ksg *KmerSetGroup) differenceOp(outputDir string) (*KmerSetGroup, error) { + if ksg.n < 1 { + return nil, fmt.Errorf("obikmer: difference requires at least 1 set") + } + + setDir := filepath.Join(outputDir, "set_0") + if err := os.MkdirAll(setDir, 0755); err != nil { + return nil, err + } + + counts := make([]uint64, ksg.partitions) + + nWorkers := runtime.NumCPU() + if nWorkers > ksg.partitions { + nWorkers = ksg.partitions + } + + jobs := make(chan int, ksg.partitions) + var wg sync.WaitGroup + var errMu sync.Mutex + var firstErr error + + for w := 0; w < nWorkers; w++ { + wg.Add(1) + go func() { + defer wg.Done() + for p := range jobs { + c, err := ksg.differencePartition(p, setDir) + if err != nil { + errMu.Lock() + if firstErr == nil { + firstErr = err + } + errMu.Unlock() + return + } + counts[p] = c + } + }() + } + + for p := 0; p < ksg.partitions; p++ { + jobs <- p + } + close(jobs) + wg.Wait() + + if firstErr != nil { + return nil, firstErr + } + + var totalCount uint64 + for _, c := range counts { + totalCount += c + } + + result := &KmerSetGroup{ + path: outputDir, + k: ksg.k, + m: ksg.m, + partitions: ksg.partitions, + n: 1, + setsIDs: []string{""}, + counts: []uint64{totalCount}, + Metadata: make(map[string]interface{}), + } + + if err := result.saveMetadata(); err != nil { + return nil, err + } + + return result, nil +} + +// differencePartition computes set_0 - union(set_1..set_{n-1}) for one partition. 
+func (ksg *KmerSetGroup) differencePartition(partIdx int, outSetDir string) (uint64, error) { + outPath := filepath.Join(outSetDir, fmt.Sprintf("part_%04d.kdi", partIdx)) + + // Open set_0 reader + r0, err := NewKdiReader(ksg.partitionPath(0, partIdx)) + if err != nil { + return 0, err + } + + if r0.Count() == 0 { + r0.Close() + w, err := NewKdiWriter(outPath) + if err != nil { + return 0, err + } + return 0, w.Close() + } + + // Open readers for the other sets and merge them + var otherReaders []*KdiReader + for s := 1; s < ksg.n; s++ { + r, err := NewKdiReader(ksg.partitionPath(s, partIdx)) + if err != nil { + r0.Close() + for _, rr := range otherReaders { + rr.Close() + } + return 0, err + } + if r.Count() > 0 { + otherReaders = append(otherReaders, r) + } else { + r.Close() + } + } + + w, err := NewKdiWriter(outPath) + if err != nil { + r0.Close() + for _, rr := range otherReaders { + rr.Close() + } + return 0, err + } + + if len(otherReaders) == 0 { + // No other sets — copy set_0 + for { + v, ok := r0.Next() + if !ok { + break + } + if err := w.Write(v); err != nil { + r0.Close() + w.Close() + return 0, err + } + } + r0.Close() + cnt := w.Count() + return cnt, w.Close() + } + + // Merge other sets to get the "subtraction" stream + otherMerge := NewKWayMerge(otherReaders) + + // Streaming difference: advance both streams + v0, ok0 := r0.Next() + vo, _, oko := otherMerge.Next() + + for ok0 { + if !oko || v0 < vo { + // v0 not in others → emit + if err := w.Write(v0); err != nil { + r0.Close() + otherMerge.Close() + w.Close() + return 0, err + } + v0, ok0 = r0.Next() + } else if v0 == vo { + // v0 in others → skip + v0, ok0 = r0.Next() + vo, _, oko = otherMerge.Next() + } else { + // vo < v0 → advance others + vo, _, oko = otherMerge.Next() + } + } + + r0.Close() + otherMerge.Close() + cnt := w.Count() + return cnt, w.Close() +} + +// mergeMode defines how to combine two values during pairwise operations. 
+type mergeMode int + +const ( + mergeUnion mergeMode = iota // emit if in either + mergeIntersect // emit if in both +) + +// pairwiseOp applies a merge operation between corresponding sets of two groups. +func (ksg *KmerSetGroup) pairwiseOp(other *KmerSetGroup, outputDir string, mode mergeMode) (*KmerSetGroup, error) { + for s := 0; s < ksg.n; s++ { + setDir := filepath.Join(outputDir, fmt.Sprintf("set_%d", s)) + if err := os.MkdirAll(setDir, 0755); err != nil { + return nil, err + } + } + + counts := make([][]uint64, ksg.n) + for s := 0; s < ksg.n; s++ { + counts[s] = make([]uint64, ksg.partitions) + } + + nWorkers := runtime.NumCPU() + if nWorkers > ksg.partitions { + nWorkers = ksg.partitions + } + + type job struct { + setIdx int + partIdx int + } + jobs := make(chan job, ksg.n*ksg.partitions) + var wg sync.WaitGroup + var errMu sync.Mutex + var firstErr error + + for w := 0; w < nWorkers; w++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := range jobs { + c, err := pairwiseMergePartition( + ksg.partitionPath(j.setIdx, j.partIdx), + other.partitionPath(j.setIdx, j.partIdx), + filepath.Join(outputDir, fmt.Sprintf("set_%d", j.setIdx), + fmt.Sprintf("part_%04d.kdi", j.partIdx)), + mode, + ) + if err != nil { + errMu.Lock() + if firstErr == nil { + firstErr = err + } + errMu.Unlock() + return + } + counts[j.setIdx][j.partIdx] = c + } + }() + } + + for s := 0; s < ksg.n; s++ { + for p := 0; p < ksg.partitions; p++ { + jobs <- job{s, p} + } + } + close(jobs) + wg.Wait() + + if firstErr != nil { + return nil, firstErr + } + + totalCounts := make([]uint64, ksg.n) + setsIDs := make([]string, ksg.n) + for s := 0; s < ksg.n; s++ { + for p := 0; p < ksg.partitions; p++ { + totalCounts[s] += counts[s][p] + } + } + + result := &KmerSetGroup{ + path: outputDir, + k: ksg.k, + m: ksg.m, + partitions: ksg.partitions, + n: ksg.n, + setsIDs: setsIDs, + counts: totalCounts, + Metadata: make(map[string]interface{}), + } + + if err := result.saveMetadata(); err != nil { + 
return nil, err + } + + return result, nil +} + +// pairwiseMergePartition merges two KDI files (sorted streams) with the given mode. +func pairwiseMergePartition(pathA, pathB, outPath string, mode mergeMode) (uint64, error) { + rA, err := NewKdiReader(pathA) + if err != nil { + return 0, err + } + rB, err := NewKdiReader(pathB) + if err != nil { + rA.Close() + return 0, err + } + + w, err := NewKdiWriter(outPath) + if err != nil { + rA.Close() + rB.Close() + return 0, err + } + + cnt, mergeErr := doPairwiseMerge(rA, rB, w, mode) + rA.Close() + rB.Close() + closeErr := w.Close() + if mergeErr != nil { + return 0, mergeErr + } + return cnt, closeErr +} + +func doPairwiseMerge(rA, rB *KdiReader, w *KdiWriter, mode mergeMode) (uint64, error) { + vA, okA := rA.Next() + vB, okB := rB.Next() + + for okA && okB { + if vA == vB { + if err := w.Write(vA); err != nil { + return 0, err + } + vA, okA = rA.Next() + vB, okB = rB.Next() + } else if vA < vB { + if mode == mergeUnion { + if err := w.Write(vA); err != nil { + return 0, err + } + } + vA, okA = rA.Next() + } else { + if mode == mergeUnion { + if err := w.Write(vB); err != nil { + return 0, err + } + } + vB, okB = rB.Next() + } + } + + if mode == mergeUnion { + for okA { + if err := w.Write(vA); err != nil { + return 0, err + } + vA, okA = rA.Next() + } + for okB { + if err := w.Write(vB); err != nil { + return 0, err + } + vB, okB = rB.Next() + } + } + + return w.Count(), nil +} diff --git a/pkg/obikmer/kmer_set_disk_ops_test.go b/pkg/obikmer/kmer_set_disk_ops_test.go new file mode 100644 index 0000000..1ca3cfd --- /dev/null +++ b/pkg/obikmer/kmer_set_disk_ops_test.go @@ -0,0 +1,251 @@ +package obikmer + +import ( + "path/filepath" + "testing" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) + +// buildGroupFromSeqs creates a KmerSetGroup with one set per sequence. 
+func buildGroupFromSeqs(t *testing.T, dir string, k, m int, seqs []string) *KmerSetGroup { + t.Helper() + n := len(seqs) + builder, err := NewKmerSetGroupBuilder(dir, k, m, n, 64) + if err != nil { + t.Fatal(err) + } + for i, s := range seqs { + seq := obiseq.NewBioSequence("", []byte(s), "") + builder.AddSequence(i, seq) + } + ksg, err := builder.Close() + if err != nil { + t.Fatal(err) + } + return ksg +} + +func collectKmers(t *testing.T, ksg *KmerSetGroup, setIdx int) []uint64 { + t.Helper() + var result []uint64 + for kmer := range ksg.Iterator(setIdx) { + result = append(result, kmer) + } + return result +} + +func TestDiskOpsUnion(t *testing.T) { + dir := t.TempDir() + indexDir := filepath.Join(dir, "index") + outDir := filepath.Join(dir, "union") + + // Two sequences with some overlap + seqs := []string{ + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", + "CTAGCTAGCTGATCGATCGATCGTTTAAACCC", + } + ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs) + + result, err := ksg.Union(outDir) + if err != nil { + t.Fatal(err) + } + + // Union should have at least as many k-mers as each individual set + unionLen := result.Len(0) + if unionLen == 0 { + t.Fatal("union is empty") + } + if unionLen < ksg.Len(0) || unionLen < ksg.Len(1) { + t.Fatalf("union (%d) smaller than an input set (%d, %d)", unionLen, ksg.Len(0), ksg.Len(1)) + } + + // Union should not exceed the sum of both sets + if unionLen > ksg.Len(0)+ksg.Len(1) { + t.Fatalf("union (%d) larger than sum of sets (%d)", unionLen, ksg.Len(0)+ksg.Len(1)) + } +} + +func TestDiskOpsIntersect(t *testing.T) { + dir := t.TempDir() + indexDir := filepath.Join(dir, "index") + outDir := filepath.Join(dir, "intersect") + + // Two sequences with some shared k-mers + seqs := []string{ + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", + "CTAGCTAGCTGATCGATCGATCGTTTAAACCC", + } + ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs) + + result, err := ksg.Intersect(outDir) + if err != nil { + t.Fatal(err) + } + + interLen := result.Len(0) + // 
Intersection should not be bigger than any individual set + if interLen > ksg.Len(0) || interLen > ksg.Len(1) { + t.Fatalf("intersection (%d) larger than input sets (%d, %d)", interLen, ksg.Len(0), ksg.Len(1)) + } +} + +func TestDiskOpsDifference(t *testing.T) { + dir := t.TempDir() + indexDir := filepath.Join(dir, "index") + outDir := filepath.Join(dir, "diff") + + seqs := []string{ + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", + "CTAGCTAGCTGATCGATCGATCGTTTAAACCC", + } + ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs) + + result, err := ksg.Difference(outDir) + if err != nil { + t.Fatal(err) + } + + diffLen := result.Len(0) + // Difference = set_0 - set_1, so should be <= set_0 + if diffLen > ksg.Len(0) { + t.Fatalf("difference (%d) larger than set_0 (%d)", diffLen, ksg.Len(0)) + } +} + +func TestDiskOpsConsistency(t *testing.T) { + dir := t.TempDir() + indexDir := filepath.Join(dir, "index") + + seqs := []string{ + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", + "CTAGCTAGCTGATCGATCGATCGTTTAAACCC", + } + ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs) + + unionResult, err := ksg.Union(filepath.Join(dir, "union")) + if err != nil { + t.Fatal(err) + } + interResult, err := ksg.Intersect(filepath.Join(dir, "intersect")) + if err != nil { + t.Fatal(err) + } + diffResult, err := ksg.Difference(filepath.Join(dir, "diff")) + if err != nil { + t.Fatal(err) + } + + unionLen := unionResult.Len(0) + interLen := interResult.Len(0) + diffLen := diffResult.Len(0) + + // |A ∪ B| = |A| + |B| - |A ∩ B| + expectedUnion := ksg.Len(0) + ksg.Len(1) - interLen + if unionLen != expectedUnion { + t.Fatalf("|A∪B|=%d, expected |A|+|B|-|A∩B|=%d+%d-%d=%d", + unionLen, ksg.Len(0), ksg.Len(1), interLen, expectedUnion) + } + + // |A \ B| = |A| - |A ∩ B| + expectedDiff := ksg.Len(0) - interLen + if diffLen != expectedDiff { + t.Fatalf("|A\\B|=%d, expected |A|-|A∩B|=%d-%d=%d", + diffLen, ksg.Len(0), interLen, expectedDiff) + } +} + +func TestDiskOpsQuorum(t *testing.T) { + dir := t.TempDir() + indexDir := 
filepath.Join(dir, "index") + + // Three sets + seqs := []string{ + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", + "CTAGCTAGCTGATCGATCGATCGTTTAAACCC", + "GATCGATCGATCGAAATTTCCCGGG", + } + ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs) + + // QuorumAtLeast(1) = Union + q1, err := ksg.QuorumAtLeast(1, filepath.Join(dir, "q1")) + if err != nil { + t.Fatal(err) + } + union, err := ksg.Union(filepath.Join(dir, "union")) + if err != nil { + t.Fatal(err) + } + if q1.Len(0) != union.Len(0) { + t.Fatalf("QuorumAtLeast(1)=%d != Union=%d", q1.Len(0), union.Len(0)) + } + + // QuorumAtLeast(3) = Intersect + q3, err := ksg.QuorumAtLeast(3, filepath.Join(dir, "q3")) + if err != nil { + t.Fatal(err) + } + inter, err := ksg.Intersect(filepath.Join(dir, "inter")) + if err != nil { + t.Fatal(err) + } + if q3.Len(0) != inter.Len(0) { + t.Fatalf("QuorumAtLeast(3)=%d != Intersect=%d", q3.Len(0), inter.Len(0)) + } + + // QuorumAtLeast(2) should be between Intersect and Union + q2, err := ksg.QuorumAtLeast(2, filepath.Join(dir, "q2")) + if err != nil { + t.Fatal(err) + } + if q2.Len(0) < q3.Len(0) || q2.Len(0) > q1.Len(0) { + t.Fatalf("QuorumAtLeast(2)=%d not between intersect=%d and union=%d", + q2.Len(0), q3.Len(0), q1.Len(0)) + } +} + +func TestDiskOpsJaccard(t *testing.T) { + dir := t.TempDir() + indexDir := filepath.Join(dir, "index") + + seqs := []string{ + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", + "ACGATCGATCTAGCTAGCTGATCGATCGATCG", // identical to first + "TTTTTTTTTTTTTTTTTTTTTTTTT", // completely different + } + ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs) + + dm := ksg.JaccardDistanceMatrix() + if dm == nil { + t.Fatal("JaccardDistanceMatrix returned nil") + } + + // Identical sets should have distance 0 + d01 := dm.Get(0, 1) + if d01 != 0.0 { + t.Fatalf("distance(0,1) = %f, expected 0.0 for identical sets", d01) + } + + // Completely different sets should have distance 1.0 + d02 := dm.Get(0, 2) + if d02 != 1.0 { + t.Fatalf("distance(0,2) = %f, expected 1.0 for disjoint sets", 
d02) + } + + // Similarity matrix + sm := ksg.JaccardSimilarityMatrix() + if sm == nil { + t.Fatal("JaccardSimilarityMatrix returned nil") + } + + s01 := sm.Get(0, 1) + if s01 != 1.0 { + t.Fatalf("similarity(0,1) = %f, expected 1.0 for identical sets", s01) + } + + s02 := sm.Get(0, 2) + if s02 != 0.0 { + t.Fatalf("similarity(0,2) = %f, expected 0.0 for disjoint sets", s02) + } +} diff --git a/pkg/obikmer/kmer_set_group.go b/pkg/obikmer/kmer_set_group.go deleted file mode 100644 index c008665..0000000 --- a/pkg/obikmer/kmer_set_group.go +++ /dev/null @@ -1,339 +0,0 @@ -package obikmer - -import ( - "fmt" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" -) - -// KmerSetGroup represents a vector of KmerSet -// Used to manage multiple k-mer sets (for example, by frequency level) -type KmerSetGroup struct { - id string // Unique identifier of the KmerSetGroup - k int // Size of k-mers (immutable) - sets []*KmerSet // Vector of KmerSet - Metadata map[string]interface{} // Group metadata (not individual sets) -} - -// NewKmerSetGroup creates a new group of n KmerSets -func NewKmerSetGroup(k int, n int) *KmerSetGroup { - if n < 1 { - panic("KmerSetGroup size must be >= 1") - } - - sets := make([]*KmerSet, n) - for i := range sets { - sets[i] = NewKmerSet(k) - } - - return &KmerSetGroup{ - k: k, - sets: sets, - Metadata: make(map[string]interface{}), - } -} - -// K returns the size of k-mers (immutable) -func (ksg *KmerSetGroup) K() int { - return ksg.k -} - -// Size returns the number of KmerSet in the group -func (ksg *KmerSetGroup) Size() int { - return len(ksg.sets) -} - -// Get returns the KmerSet at the given index -// Returns nil if the index is invalid -func (ksg *KmerSetGroup) Get(index int) *KmerSet { - if index < 0 || index >= len(ksg.sets) { - return nil - } - return ksg.sets[index] -} - -// Set replaces the KmerSet at the given index -// Panics if the index is invalid or if k 
does not match -func (ksg *KmerSetGroup) Set(index int, ks *KmerSet) { - if index < 0 || index >= len(ksg.sets) { - panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets))) - } - if ks.k != ksg.k { - panic(fmt.Sprintf("KmerSet k mismatch: expected %d, got %d", ksg.k, ks.k)) - } - ksg.sets[index] = ks -} - -// Len returns the number of k-mers in a specific KmerSet -// Without argument: returns the number of k-mers in the last KmerSet -// With argument index: returns the number of k-mers in the KmerSet at this index -func (ksg *KmerSetGroup) Len(index ...int) uint64 { - if len(index) == 0 { - // Without argument: last KmerSet - return ksg.sets[len(ksg.sets)-1].Len() - } - - // With argument: specific KmerSet - idx := index[0] - if idx < 0 || idx >= len(ksg.sets) { - return 0 - } - return ksg.sets[idx].Len() -} - -// MemoryUsage returns the total memory usage in bytes -func (ksg *KmerSetGroup) MemoryUsage() uint64 { - total := uint64(0) - for _, ks := range ksg.sets { - total += ks.MemoryUsage() - } - return total -} - -// Clear empties all KmerSet in the group -func (ksg *KmerSetGroup) Clear() { - for _, ks := range ksg.sets { - ks.Clear() - } -} - -// Copy creates a complete copy of the group (consistent with BioSequence.Copy) -func (ksg *KmerSetGroup) Copy() *KmerSetGroup { - copiedSets := make([]*KmerSet, len(ksg.sets)) - for i, ks := range ksg.sets { - copiedSets[i] = ks.Copy() // Copy each KmerSet with its metadata - } - - // Copy group metadata - groupMetadata := make(map[string]interface{}, len(ksg.Metadata)) - for k, v := range ksg.Metadata { - groupMetadata[k] = v - } - - return &KmerSetGroup{ - id: ksg.id, - k: ksg.k, - sets: copiedSets, - Metadata: groupMetadata, - } -} - -// Id returns the identifier of the KmerSetGroup (consistent with BioSequence.Id) -func (ksg *KmerSetGroup) Id() string { - return ksg.id -} - -// SetId sets the identifier of the KmerSetGroup (consistent with BioSequence.SetId) -func (ksg *KmerSetGroup) SetId(id 
string) { - ksg.id = id -} - -// AddSequence adds all k-mers from a sequence to a specific KmerSet -func (ksg *KmerSetGroup) AddSequence(seq *obiseq.BioSequence, index int) { - if index < 0 || index >= len(ksg.sets) { - panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets))) - } - ksg.sets[index].AddSequence(seq) -} - -// AddSequences adds all k-mers from multiple sequences to a specific KmerSet -func (ksg *KmerSetGroup) AddSequences(sequences *obiseq.BioSequenceSlice, index int) { - if index < 0 || index >= len(ksg.sets) { - panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets))) - } - ksg.sets[index].AddSequences(sequences) -} - -// Union returns the union of all KmerSet in the group -// Optimization: starts from the largest set to minimize operations -func (ksg *KmerSetGroup) Union() *KmerSet { - if len(ksg.sets) == 0 { - return NewKmerSet(ksg.k) - } - - if len(ksg.sets) == 1 { - return ksg.sets[0].Copy() - } - - // Find the index of the largest set (the one with the most k-mers) - maxIdx := 0 - maxCard := ksg.sets[0].Len() - for i := 1; i < len(ksg.sets); i++ { - card := ksg.sets[i].Len() - if card > maxCard { - maxCard = card - maxIdx = i - } - } - - // Copy the largest set and perform unions in-place - result := ksg.sets[maxIdx].bitmap.Clone() - for i := 0; i < len(ksg.sets); i++ { - if i != maxIdx { - result.Or(ksg.sets[i].bitmap) - } - } - - return NewKmerSetFromBitmap(ksg.k, result) -} - -// Intersect returns the intersection of all KmerSet in the group -// Optimization: starts from the smallest set to minimize operations -func (ksg *KmerSetGroup) Intersect() *KmerSet { - if len(ksg.sets) == 0 { - return NewKmerSet(ksg.k) - } - - if len(ksg.sets) == 1 { - return ksg.sets[0].Copy() - } - - // Find the index of the smallest set (the one with the fewest k-mers) - minIdx := 0 - minCard := ksg.sets[0].Len() - for i := 1; i < len(ksg.sets); i++ { - card := ksg.sets[i].Len() - if card < minCard { - minCard = card - 
minIdx = i - } - } - - // Copy the smallest set and perform intersections in-place - result := ksg.sets[minIdx].bitmap.Clone() - for i := 0; i < len(ksg.sets); i++ { - if i != minIdx { - result.And(ksg.sets[i].bitmap) - } - } - - return NewKmerSetFromBitmap(ksg.k, result) -} - -// Stats returns statistics for each KmerSet in the group -type KmerSetGroupStats struct { - K int - Size int // Number of KmerSet - TotalBytes uint64 // Total memory used - Sets []KmerSetStats // Stats of each KmerSet -} - -type KmerSetStats struct { - Index int // Index of the KmerSet in the group - Len uint64 // Number of k-mers - SizeBytes uint64 // Size in bytes -} - -func (ksg *KmerSetGroup) Stats() KmerSetGroupStats { - stats := KmerSetGroupStats{ - K: ksg.k, - Size: len(ksg.sets), - Sets: make([]KmerSetStats, len(ksg.sets)), - } - - for i, ks := range ksg.sets { - sizeBytes := ks.MemoryUsage() - stats.Sets[i] = KmerSetStats{ - Index: i, - Len: ks.Len(), - SizeBytes: sizeBytes, - } - stats.TotalBytes += sizeBytes - } - - return stats -} - -func (ksgs KmerSetGroupStats) String() string { - result := fmt.Sprintf(`KmerSetGroup Statistics (k=%d, size=%d): - Total memory: %.2f MB - -Set breakdown: -`, ksgs.K, ksgs.Size, float64(ksgs.TotalBytes)/1024/1024) - - for _, set := range ksgs.Sets { - result += fmt.Sprintf(" Set[%d]: %d k-mers (%.2f MB)\n", - set.Index, - set.Len, - float64(set.SizeBytes)/1024/1024) - } - - return result -} - -// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix for all KmerSets in the group. -// Returns a triangular distance matrix where element (i, j) represents the Jaccard distance -// between set i and set j. -// -// The Jaccard distance is: 1 - (|A ∩ B| / |A ∪ B|) -// -// The matrix labels are set to the IDs of the individual KmerSets if available, -// otherwise they are set to "set_0", "set_1", etc. 
-// -// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets -// Space complexity: O(n²) for the distance matrix -func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix { - n := len(ksg.sets) - - // Create labels from set IDs - labels := make([]string, n) - for i, ks := range ksg.sets { - if ks.Id() != "" { - labels[i] = ks.Id() - } else { - labels[i] = fmt.Sprintf("set_%d", i) - } - } - - dm := obidist.NewDistMatrixWithLabels(labels) - - // Compute pairwise distances - for i := 0; i < n-1; i++ { - for j := i + 1; j < n; j++ { - distance := ksg.sets[i].JaccardDistance(ksg.sets[j]) - dm.Set(i, j, distance) - } - } - - return dm -} - -// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix for all KmerSets in the group. -// Returns a similarity matrix where element (i, j) represents the Jaccard similarity -// between set i and set j. -// -// The Jaccard similarity is: |A ∩ B| / |A ∪ B| -// -// The diagonal is 1.0 (similarity of a set to itself). -// -// The matrix labels are set to the IDs of the individual KmerSets if available, -// otherwise they are set to "set_0", "set_1", etc. 
-// -// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets -// Space complexity: O(n²) for the similarity matrix -func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix { - n := len(ksg.sets) - - // Create labels from set IDs - labels := make([]string, n) - for i, ks := range ksg.sets { - if ks.Id() != "" { - labels[i] = ks.Id() - } else { - labels[i] = fmt.Sprintf("set_%d", i) - } - } - - sm := obidist.NewSimilarityMatrixWithLabels(labels) - - // Compute pairwise similarities - for i := 0; i < n-1; i++ { - for j := i + 1; j < n; j++ { - similarity := ksg.sets[i].JaccardSimilarity(ksg.sets[j]) - sm.Set(i, j, similarity) - } - } - - return sm -} diff --git a/pkg/obikmer/kmer_set_group_jaccard_test.go b/pkg/obikmer/kmer_set_group_jaccard_test.go deleted file mode 100644 index 1e17d02..0000000 --- a/pkg/obikmer/kmer_set_group_jaccard_test.go +++ /dev/null @@ -1,231 +0,0 @@ -package obikmer - -import ( - "math" - "testing" -) - -func TestKmerSetGroupJaccardDistanceMatrix(t *testing.T) { - ksg := NewKmerSetGroup(5, 3) - - // Set 0: {1, 2, 3} - ksg.Get(0).AddKmerCode(1) - ksg.Get(0).AddKmerCode(2) - ksg.Get(0).AddKmerCode(3) - ksg.Get(0).SetId("set_A") - - // Set 1: {2, 3, 4} - ksg.Get(1).AddKmerCode(2) - ksg.Get(1).AddKmerCode(3) - ksg.Get(1).AddKmerCode(4) - ksg.Get(1).SetId("set_B") - - // Set 2: {5, 6, 7} - ksg.Get(2).AddKmerCode(5) - ksg.Get(2).AddKmerCode(6) - ksg.Get(2).AddKmerCode(7) - ksg.Get(2).SetId("set_C") - - dm := ksg.JaccardDistanceMatrix() - - // Check labels - if dm.GetLabel(0) != "set_A" { - t.Errorf("Expected label 'set_A' at index 0, got '%s'", dm.GetLabel(0)) - } - if dm.GetLabel(1) != "set_B" { - t.Errorf("Expected label 'set_B' at index 1, got '%s'", dm.GetLabel(1)) - } - if dm.GetLabel(2) != "set_C" { - t.Errorf("Expected label 'set_C' at index 2, got '%s'", dm.GetLabel(2)) - } - - // Check distances - // Distance(0, 1): - // Intersection: {2, 3} -> 2 elements - // Union: {1, 2, 3, 4} -> 4 elements - // Similarity: 
2/4 = 0.5 - // Distance: 1 - 0.5 = 0.5 - expectedDist01 := 0.5 - actualDist01 := dm.Get(0, 1) - if math.Abs(actualDist01-expectedDist01) > 1e-10 { - t.Errorf("Distance(0, 1): expected %f, got %f", expectedDist01, actualDist01) - } - - // Distance(0, 2): - // Intersection: {} -> 0 elements - // Union: {1, 2, 3, 5, 6, 7} -> 6 elements - // Similarity: 0/6 = 0 - // Distance: 1 - 0 = 1.0 - expectedDist02 := 1.0 - actualDist02 := dm.Get(0, 2) - if math.Abs(actualDist02-expectedDist02) > 1e-10 { - t.Errorf("Distance(0, 2): expected %f, got %f", expectedDist02, actualDist02) - } - - // Distance(1, 2): - // Intersection: {} -> 0 elements - // Union: {2, 3, 4, 5, 6, 7} -> 6 elements - // Similarity: 0/6 = 0 - // Distance: 1 - 0 = 1.0 - expectedDist12 := 1.0 - actualDist12 := dm.Get(1, 2) - if math.Abs(actualDist12-expectedDist12) > 1e-10 { - t.Errorf("Distance(1, 2): expected %f, got %f", expectedDist12, actualDist12) - } - - // Check symmetry - if dm.Get(0, 1) != dm.Get(1, 0) { - t.Errorf("Matrix not symmetric: Get(0, 1) = %f, Get(1, 0) = %f", - dm.Get(0, 1), dm.Get(1, 0)) - } - - // Check diagonal - if dm.Get(0, 0) != 0.0 { - t.Errorf("Diagonal should be 0, got %f", dm.Get(0, 0)) - } - if dm.Get(1, 1) != 0.0 { - t.Errorf("Diagonal should be 0, got %f", dm.Get(1, 1)) - } - if dm.Get(2, 2) != 0.0 { - t.Errorf("Diagonal should be 0, got %f", dm.Get(2, 2)) - } -} - -func TestKmerSetGroupJaccardSimilarityMatrix(t *testing.T) { - ksg := NewKmerSetGroup(5, 3) - - // Set 0: {1, 2, 3} - ksg.Get(0).AddKmerCode(1) - ksg.Get(0).AddKmerCode(2) - ksg.Get(0).AddKmerCode(3) - - // Set 1: {2, 3, 4} - ksg.Get(1).AddKmerCode(2) - ksg.Get(1).AddKmerCode(3) - ksg.Get(1).AddKmerCode(4) - - // Set 2: {1, 2, 3} (same as set 0) - ksg.Get(2).AddKmerCode(1) - ksg.Get(2).AddKmerCode(2) - ksg.Get(2).AddKmerCode(3) - - sm := ksg.JaccardSimilarityMatrix() - - // Check similarities - // Similarity(0, 1): 0.5 (as calculated above) - expectedSim01 := 0.5 - actualSim01 := sm.Get(0, 1) - if 
math.Abs(actualSim01-expectedSim01) > 1e-10 { - t.Errorf("Similarity(0, 1): expected %f, got %f", expectedSim01, actualSim01) - } - - // Similarity(0, 2): 1.0 (identical sets) - expectedSim02 := 1.0 - actualSim02 := sm.Get(0, 2) - if math.Abs(actualSim02-expectedSim02) > 1e-10 { - t.Errorf("Similarity(0, 2): expected %f, got %f", expectedSim02, actualSim02) - } - - // Similarity(1, 2): 0.5 - // Intersection: {2, 3} -> 2 - // Union: {1, 2, 3, 4} -> 4 - // Similarity: 2/4 = 0.5 - expectedSim12 := 0.5 - actualSim12 := sm.Get(1, 2) - if math.Abs(actualSim12-expectedSim12) > 1e-10 { - t.Errorf("Similarity(1, 2): expected %f, got %f", expectedSim12, actualSim12) - } - - // Check diagonal (similarity to self = 1.0) - if sm.Get(0, 0) != 1.0 { - t.Errorf("Diagonal should be 1.0, got %f", sm.Get(0, 0)) - } - if sm.Get(1, 1) != 1.0 { - t.Errorf("Diagonal should be 1.0, got %f", sm.Get(1, 1)) - } - if sm.Get(2, 2) != 1.0 { - t.Errorf("Diagonal should be 1.0, got %f", sm.Get(2, 2)) - } -} - -func TestKmerSetGroupJaccardMatricesRelation(t *testing.T) { - ksg := NewKmerSetGroup(5, 4) - - // Create different sets - ksg.Get(0).AddKmerCode(1) - ksg.Get(0).AddKmerCode(2) - - ksg.Get(1).AddKmerCode(2) - ksg.Get(1).AddKmerCode(3) - - ksg.Get(2).AddKmerCode(1) - ksg.Get(2).AddKmerCode(2) - ksg.Get(2).AddKmerCode(3) - - ksg.Get(3).AddKmerCode(10) - ksg.Get(3).AddKmerCode(20) - - dm := ksg.JaccardDistanceMatrix() - sm := ksg.JaccardSimilarityMatrix() - - // For all pairs (including diagonal), distance + similarity should equal 1.0 - for i := 0; i < 4; i++ { - for j := 0; j < 4; j++ { - distance := dm.Get(i, j) - similarity := sm.Get(i, j) - sum := distance + similarity - - if math.Abs(sum-1.0) > 1e-10 { - t.Errorf("At (%d, %d): distance %f + similarity %f = %f, expected 1.0", - i, j, distance, similarity, sum) - } - } - } -} - -func TestKmerSetGroupJaccardMatrixLabels(t *testing.T) { - ksg := NewKmerSetGroup(5, 3) - - // Don't set IDs - should use default labels - 
ksg.Get(0).AddKmerCode(1) - ksg.Get(1).AddKmerCode(2) - ksg.Get(2).AddKmerCode(3) - - dm := ksg.JaccardDistanceMatrix() - - // Check default labels - if dm.GetLabel(0) != "set_0" { - t.Errorf("Expected default label 'set_0', got '%s'", dm.GetLabel(0)) - } - if dm.GetLabel(1) != "set_1" { - t.Errorf("Expected default label 'set_1', got '%s'", dm.GetLabel(1)) - } - if dm.GetLabel(2) != "set_2" { - t.Errorf("Expected default label 'set_2', got '%s'", dm.GetLabel(2)) - } -} - -func TestKmerSetGroupJaccardMatrixSize(t *testing.T) { - ksg := NewKmerSetGroup(5, 5) - - for i := 0; i < 5; i++ { - ksg.Get(i).AddKmerCode(uint64(i)) - } - - dm := ksg.JaccardDistanceMatrix() - - if dm.Size() != 5 { - t.Errorf("Expected matrix size 5, got %d", dm.Size()) - } - - // All sets are disjoint, so all distances should be 1.0 - for i := 0; i < 5; i++ { - for j := i + 1; j < 5; j++ { - dist := dm.Get(i, j) - if math.Abs(dist-1.0) > 1e-10 { - t.Errorf("Expected distance 1.0 for disjoint sets (%d, %d), got %f", - i, j, dist) - } - } - } -} diff --git a/pkg/obikmer/kmer_set_group_quorum.go b/pkg/obikmer/kmer_set_group_quorum.go deleted file mode 100644 index 4f21f95..0000000 --- a/pkg/obikmer/kmer_set_group_quorum.go +++ /dev/null @@ -1,235 +0,0 @@ -package obikmer - -import ( - "container/heap" - - "github.com/RoaringBitmap/roaring/roaring64" -) - -// heapItem represents an element in the min-heap for k-way merge -type heapItem struct { - value uint64 - idx int -} - -// kmerMinHeap implements heap.Interface for k-way merge algorithm -type kmerMinHeap []heapItem - -func (h kmerMinHeap) Len() int { return len(h) } -func (h kmerMinHeap) Less(i, j int) bool { return h[i].value < h[j].value } -func (h kmerMinHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } - -func (h *kmerMinHeap) Push(x interface{}) { - *h = append(*h, x.(heapItem)) -} - -func (h *kmerMinHeap) Pop() interface{} { - old := *h - n := len(old) - x := old[n-1] - *h = old[0 : n-1] - return x -} - -// QuorumAtLeast returns k-mers 
present in at least q sets -// -// Algorithm: K-way merge with min-heap counting -// -// The algorithm processes all k-mers in sorted order using a min-heap: -// -// 1. Initialize one iterator per non-empty set -// 2. Build a min-heap of (value, set_index) pairs, one per iterator -// 3. While heap is not empty: -// a. Extract the minimum value v from heap -// b. Pop ALL heap items with value == v (counting occurrences) -// c. If count >= q, add v to result -// d. Advance each popped iterator and re-insert into heap if valid -// -// This ensures each unique k-mer is counted exactly once across all sets. -// -// Time complexity: O(M log N) -// - M = sum of all set cardinalities (total k-mer occurrences) -// - N = number of sets -// - Each k-mer occurrence is inserted/extracted from heap once: O(M) operations -// - Each heap operation costs O(log N) -// -// Space complexity: O(N) -// - Heap contains at most N elements (one per set iterator) -// - Output bitmap size depends on quorum result -// -// Special cases (optimized): -// - q <= 0: returns empty set -// - q == 1: delegates to Union() (native OR operations) -// - q == n: delegates to Intersect() (native AND operations) -// - q > n: returns empty set (impossible to satisfy) -func (ksg *KmerSetGroup) QuorumAtLeast(q int) *KmerSet { - n := len(ksg.sets) - - // Edge cases - if q <= 0 || n == 0 { - return NewKmerSet(ksg.k) - } - if q > n { - return NewKmerSet(ksg.k) - } - if q == 1 { - return ksg.Union() - } - if q == n { - return ksg.Intersect() - } - - // Initialize iterators for all non-empty sets - iterators := make([]roaring64.IntIterable64, 0, n) - iterIndices := make([]int, 0, n) - - for i, set := range ksg.sets { - if set.Len() > 0 { - iter := set.bitmap.Iterator() - if iter.HasNext() { - iterators = append(iterators, iter) - iterIndices = append(iterIndices, i) - } - } - } - - if len(iterators) == 0 { - return NewKmerSet(ksg.k) - } - - // Initialize heap with first value from each iterator - h := 
make(kmerMinHeap, len(iterators)) - for i, iter := range iterators { - h[i] = heapItem{value: iter.Next(), idx: i} - } - heap.Init(&h) - - // Result bitmap - result := roaring64.New() - - // K-way merge with counting - for len(h) > 0 { - minVal := h[0].value - count := 0 - activeIndices := make([]int, 0, len(h)) - - // Pop all elements with same value (count occurrences) - for len(h) > 0 && h[0].value == minVal { - item := heap.Pop(&h).(heapItem) - count++ - activeIndices = append(activeIndices, item.idx) - } - - // Add to result if quorum reached - if count >= q { - result.Add(minVal) - } - - // Advance iterators and re-insert into heap - for _, iterIdx := range activeIndices { - if iterators[iterIdx].HasNext() { - heap.Push(&h, heapItem{ - value: iterators[iterIdx].Next(), - idx: iterIdx, - }) - } - } - } - - return NewKmerSetFromBitmap(ksg.k, result) -} - -// QuorumAtMost returns k-mers present in at most q sets -// -// Algorithm: Uses the mathematical identity -// AtMost(q) = Union() - AtLeast(q+1) -// -// Proof: -// - Union() contains all k-mers present in at least 1 set -// - AtLeast(q+1) contains all k-mers present in q+1 or more sets -// - Their difference contains only k-mers present in at most q sets -// -// Implementation: -// 1. Compute U = Union() -// 2. Compute A = QuorumAtLeast(q+1) -// 3. 
Return U - A using bitmap AndNot operation -// -// Time complexity: O(M log N) -// - Union(): O(M) with native OR operations -// - QuorumAtLeast(q+1): O(M log N) -// - AndNot: O(|U|) where |U| <= M -// - Total: O(M log N) -// -// Space complexity: O(N) -// - Inherited from QuorumAtLeast heap -// -// Special cases: -// - q <= 0: returns empty set -// - q >= n: returns Union() (all k-mers are in at most n sets) -func (ksg *KmerSetGroup) QuorumAtMost(q int) *KmerSet { - n := len(ksg.sets) - - // Edge cases - if q <= 0 { - return NewKmerSet(ksg.k) - } - if q >= n { - return ksg.Union() - } - - // Compute Union() - AtLeast(q+1) - union := ksg.Union() - atLeastQ1 := ksg.QuorumAtLeast(q + 1) - - // Difference: elements in union but not in atLeastQ1 - result := union.bitmap.Clone() - result.AndNot(atLeastQ1.bitmap) - - return NewKmerSetFromBitmap(ksg.k, result) -} - -// QuorumExactly returns k-mers present in exactly q sets -// -// Algorithm: Uses the mathematical identity -// Exactly(q) = AtLeast(q) - AtLeast(q+1) -// -// Proof: -// - AtLeast(q) contains all k-mers present in q or more sets -// - AtLeast(q+1) contains all k-mers present in q+1 or more sets -// - Their difference contains only k-mers present in exactly q sets -// -// Implementation: -// 1. Compute A = QuorumAtLeast(q) -// 2. Compute B = QuorumAtLeast(q+1) -// 3. 
Return A - B using bitmap AndNot operation -// -// Time complexity: O(M log N) -// - Two calls to QuorumAtLeast: 2 * O(M log N) -// - One AndNot operation: O(|A|) where |A| <= M -// - Total: O(M log N) since AndNot is dominated by merge operations -// -// Space complexity: O(N) -// - Inherited from QuorumAtLeast heap -// - Two temporary bitmaps for intermediate results -// -// Special cases: -// - q <= 0: returns empty set -// - q > n: returns empty set (impossible to have k-mer in more than n sets) -func (ksg *KmerSetGroup) QuorumExactly(q int) *KmerSet { - n := len(ksg.sets) - - // Edge cases - if q <= 0 || q > n { - return NewKmerSet(ksg.k) - } - - // Compute AtLeast(q) - AtLeast(q+1) - aq := ksg.QuorumAtLeast(q) - aq1 := ksg.QuorumAtLeast(q + 1) - - // Difference: elements in aq but not in aq1 - result := aq.bitmap.Clone() - result.AndNot(aq1.bitmap) - - return NewKmerSetFromBitmap(ksg.k, result) -} diff --git a/pkg/obikmer/kmer_set_group_quorum_test.go b/pkg/obikmer/kmer_set_group_quorum_test.go deleted file mode 100644 index ab11319..0000000 --- a/pkg/obikmer/kmer_set_group_quorum_test.go +++ /dev/null @@ -1,395 +0,0 @@ -package obikmer - -import ( - "testing" -) - -// TestQuorumAtLeastEdgeCases tests edge cases for QuorumAtLeast -func TestQuorumAtLeastEdgeCases(t *testing.T) { - k := 5 - - // Test group with all empty sets - emptyGroup := NewKmerSetGroup(k, 3) - result := emptyGroup.QuorumAtLeast(1) - if result.Len() != 0 { - t.Errorf("Empty sets: expected 0 k-mers, got %d", result.Len()) - } - - // Test q <= 0 - group := NewKmerSetGroup(k, 3) - result = group.QuorumAtLeast(0) - if result.Len() != 0 { - t.Errorf("q=0: expected 0 k-mers, got %d", result.Len()) - } - - result = group.QuorumAtLeast(-1) - if result.Len() != 0 { - t.Errorf("q=-1: expected 0 k-mers, got %d", result.Len()) - } - - // Test q > n - group.Get(0).AddKmerCode(1) - result = group.QuorumAtLeast(10) - if result.Len() != 0 { - t.Errorf("q>n: expected 0 k-mers, got %d", result.Len()) - } -} 
- -// TestQuorumAtLeastQ1 tests q=1 (should equal Union) -func TestQuorumAtLeastQ1(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 3) - - // Add different k-mers to each set - group.Get(0).AddKmerCode(1) - group.Get(0).AddKmerCode(2) - group.Get(1).AddKmerCode(2) - group.Get(1).AddKmerCode(3) - group.Get(2).AddKmerCode(3) - group.Get(2).AddKmerCode(4) - - quorum := group.QuorumAtLeast(1) - union := group.Union() - - if quorum.Len() != union.Len() { - t.Errorf("QuorumAtLeast(1) length %d != Union length %d", quorum.Len(), union.Len()) - } - - // Check all elements match - for kmer := uint64(1); kmer <= 4; kmer++ { - if quorum.Contains(kmer) != union.Contains(kmer) { - t.Errorf("Mismatch for k-mer %d", kmer) - } - } -} - -// TestQuorumAtLeastQN tests q=n (should equal Intersect) -func TestQuorumAtLeastQN(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 3) - - // Add some common k-mers and some unique - for i := 0; i < 3; i++ { - group.Get(i).AddKmerCode(10) // common to all - group.Get(i).AddKmerCode(20) // common to all - } - group.Get(0).AddKmerCode(1) // unique to set 0 - group.Get(1).AddKmerCode(2) // unique to set 1 - - quorum := group.QuorumAtLeast(3) - intersect := group.Intersect() - - if quorum.Len() != intersect.Len() { - t.Errorf("QuorumAtLeast(n) length %d != Intersect length %d", quorum.Len(), intersect.Len()) - } - - if quorum.Len() != 2 { - t.Errorf("Expected 2 common k-mers, got %d", quorum.Len()) - } - - if !quorum.Contains(10) || !quorum.Contains(20) { - t.Error("Missing common k-mers") - } - - if quorum.Contains(1) || quorum.Contains(2) { - t.Error("Unique k-mers should not be in result") - } -} - -// TestQuorumAtLeastGeneral tests general quorum values -func TestQuorumAtLeastGeneral(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 5) - - // Setup: k-mer i appears in i sets (for i=1..5) - // k-mer 1: in set 0 - // k-mer 2: in sets 0,1 - // k-mer 3: in sets 0,1,2 - // k-mer 4: in sets 0,1,2,3 - // k-mer 5: in sets 0,1,2,3,4 (all) 
- - for kmer := uint64(1); kmer <= 5; kmer++ { - for setIdx := 0; setIdx < int(kmer); setIdx++ { - group.Get(setIdx).AddKmerCode(kmer) - } - } - - tests := []struct { - q int - expected map[uint64]bool - }{ - {1, map[uint64]bool{1: true, 2: true, 3: true, 4: true, 5: true}}, - {2, map[uint64]bool{2: true, 3: true, 4: true, 5: true}}, - {3, map[uint64]bool{3: true, 4: true, 5: true}}, - {4, map[uint64]bool{4: true, 5: true}}, - {5, map[uint64]bool{5: true}}, - } - - for _, tt := range tests { - result := group.QuorumAtLeast(tt.q) - - if result.Len() != uint64(len(tt.expected)) { - t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len()) - } - - for kmer := uint64(1); kmer <= 5; kmer++ { - shouldContain := tt.expected[kmer] - doesContain := result.Contains(kmer) - if shouldContain != doesContain { - t.Errorf("q=%d, k-mer=%d: expected contains=%v, got %v", tt.q, kmer, shouldContain, doesContain) - } - } - } -} - -// TestQuorumExactlyBasic tests QuorumExactly basic functionality -func TestQuorumExactlyBasic(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 5) - - // Setup: k-mer i appears in exactly i sets - for kmer := uint64(1); kmer <= 5; kmer++ { - for setIdx := 0; setIdx < int(kmer); setIdx++ { - group.Get(setIdx).AddKmerCode(kmer) - } - } - - tests := []struct { - q int - expected []uint64 - }{ - {1, []uint64{1}}, - {2, []uint64{2}}, - {3, []uint64{3}}, - {4, []uint64{4}}, - {5, []uint64{5}}, - } - - for _, tt := range tests { - result := group.QuorumExactly(tt.q) - - if result.Len() != uint64(len(tt.expected)) { - t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len()) - } - - for _, kmer := range tt.expected { - if !result.Contains(kmer) { - t.Errorf("q=%d: missing k-mer %d", tt.q, kmer) - } - } - } -} - -// TestQuorumIdentity tests the mathematical identity: Exactly(q) = AtLeast(q) - AtLeast(q+1) -func TestQuorumIdentity(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 4) - - // Add random 
distribution - group.Get(0).AddKmerCode(1) - group.Get(0).AddKmerCode(2) - group.Get(0).AddKmerCode(3) - - group.Get(1).AddKmerCode(2) - group.Get(1).AddKmerCode(3) - group.Get(1).AddKmerCode(4) - - group.Get(2).AddKmerCode(3) - group.Get(2).AddKmerCode(4) - - group.Get(3).AddKmerCode(4) - - for q := 1; q <= 4; q++ { - exactly := group.QuorumExactly(q) - atLeast := group.QuorumAtLeast(q) - atLeastPlus1 := group.QuorumAtLeast(q + 1) - - // Verify: every element in exactly(q) is in atLeast(q) - iter := exactly.Iterator() - for iter.HasNext() { - kmer := iter.Next() - if !atLeast.Contains(kmer) { - t.Errorf("q=%d: k-mer %d in Exactly but not in AtLeast", q, kmer) - } - if atLeastPlus1.Contains(kmer) { - t.Errorf("q=%d: k-mer %d in Exactly but also in AtLeast(q+1)", q, kmer) - } - } - } -} - -// TestQuorumDisjointSets tests quorum on completely disjoint sets -func TestQuorumDisjointSets(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 3) - - // Each set has unique k-mers - group.Get(0).AddKmerCode(1) - group.Get(1).AddKmerCode(2) - group.Get(2).AddKmerCode(3) - - // q=1 should give all - result := group.QuorumAtLeast(1) - if result.Len() != 3 { - t.Errorf("Disjoint sets q=1: expected 3, got %d", result.Len()) - } - - // q=2 should give none - result = group.QuorumAtLeast(2) - if result.Len() != 0 { - t.Errorf("Disjoint sets q=2: expected 0, got %d", result.Len()) - } -} - -// TestQuorumIdenticalSets tests quorum on identical sets -func TestQuorumIdenticalSets(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 3) - - // All sets have same k-mers - for i := 0; i < 3; i++ { - group.Get(i).AddKmerCode(10) - group.Get(i).AddKmerCode(20) - group.Get(i).AddKmerCode(30) - } - - // Any q <= n should give all k-mers - for q := 1; q <= 3; q++ { - result := group.QuorumAtLeast(q) - if result.Len() != 3 { - t.Errorf("Identical sets q=%d: expected 3, got %d", q, result.Len()) - } - } -} - -// TestQuorumLargeNumbers tests with large k-mer values -func 
TestQuorumLargeNumbers(t *testing.T) { - k := 21 - group := NewKmerSetGroup(k, 3) - - // Use large uint64 values (actual k-mer encodings) - largeKmers := []uint64{ - 0x1234567890ABCDEF, - 0xFEDCBA0987654321, - 0xAAAAAAAAAAAAAAAA, - } - - // Add to multiple sets - for i := 0; i < 3; i++ { - for j := 0; j <= i; j++ { - group.Get(j).AddKmerCode(largeKmers[i]) - } - } - - result := group.QuorumAtLeast(2) - if result.Len() != 2 { - t.Errorf("Large numbers q=2: expected 2, got %d", result.Len()) - } - - if !result.Contains(largeKmers[1]) || !result.Contains(largeKmers[2]) { - t.Error("Large numbers: wrong k-mers in result") - } -} - -// TestQuorumAtMostBasic tests QuorumAtMost basic functionality -func TestQuorumAtMostBasic(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 5) - - // Setup: k-mer i appears in exactly i sets - for kmer := uint64(1); kmer <= 5; kmer++ { - for setIdx := 0; setIdx < int(kmer); setIdx++ { - group.Get(setIdx).AddKmerCode(kmer) - } - } - - tests := []struct { - q int - expected []uint64 - }{ - {0, []uint64{}}, // at most 0: none - {1, []uint64{1}}, // at most 1: only k-mer 1 - {2, []uint64{1, 2}}, // at most 2: k-mers 1,2 - {3, []uint64{1, 2, 3}}, // at most 3: k-mers 1,2,3 - {4, []uint64{1, 2, 3, 4}}, // at most 4: k-mers 1,2,3,4 - {5, []uint64{1, 2, 3, 4, 5}}, // at most 5: all k-mers - {10, []uint64{1, 2, 3, 4, 5}}, // at most 10: all k-mers - } - - for _, tt := range tests { - result := group.QuorumAtMost(tt.q) - - if result.Len() != uint64(len(tt.expected)) { - t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len()) - } - - for _, kmer := range tt.expected { - if !result.Contains(kmer) { - t.Errorf("q=%d: missing k-mer %d", tt.q, kmer) - } - } - } -} - -// TestQuorumComplementIdentity tests that AtLeast and AtMost are complementary -func TestQuorumComplementIdentity(t *testing.T) { - k := 5 - group := NewKmerSetGroup(k, 4) - - // Add random distribution - group.Get(0).AddKmerCode(1) - 
group.Get(0).AddKmerCode(2) - group.Get(0).AddKmerCode(3) - - group.Get(1).AddKmerCode(2) - group.Get(1).AddKmerCode(3) - group.Get(1).AddKmerCode(4) - - group.Get(2).AddKmerCode(3) - group.Get(2).AddKmerCode(4) - - group.Get(3).AddKmerCode(4) - - union := group.Union() - - for q := 1; q < 4; q++ { - atMost := group.QuorumAtMost(q) - atLeast := group.QuorumAtLeast(q + 1) - - // Verify: AtMost(q) ∪ AtLeast(q+1) = Union() - combined := atMost.Union(atLeast) - - if combined.Len() != union.Len() { - t.Errorf("q=%d: AtMost(q) ∪ AtLeast(q+1) has %d k-mers, Union has %d", - q, combined.Len(), union.Len()) - } - - // Verify: AtMost(q) ∩ AtLeast(q+1) = ∅ - overlap := atMost.Intersect(atLeast) - if overlap.Len() != 0 { - t.Errorf("q=%d: AtMost(q) and AtLeast(q+1) overlap with %d k-mers", - q, overlap.Len()) - } - } -} - -// BenchmarkQuorumAtLeast benchmarks quorum operations -func BenchmarkQuorumAtLeast(b *testing.B) { - k := 21 - n := 10 - group := NewKmerSetGroup(k, n) - - // Populate with realistic data - for i := 0; i < n; i++ { - for j := uint64(0); j < 10000; j++ { - if (j % uint64(n)) <= uint64(i) { - group.Get(i).AddKmerCode(j) - } - } - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - _ = group.QuorumAtLeast(5) - } -} diff --git a/pkg/obikmer/kmer_set_persistence.go b/pkg/obikmer/kmer_set_persistence.go deleted file mode 100644 index 3bdc2ae..0000000 --- a/pkg/obikmer/kmer_set_persistence.go +++ /dev/null @@ -1,376 +0,0 @@ -package obikmer - -import ( - "encoding/json" - "fmt" - "os" - "path/filepath" - "strings" - - "github.com/pelletier/go-toml/v2" - "gopkg.in/yaml.v3" -) - -// MetadataFormat represents the metadata serialization format -type MetadataFormat int - -const ( - FormatTOML MetadataFormat = iota - FormatYAML - FormatJSON -) - -// String returns the file extension for the format -func (f MetadataFormat) String() string { - switch f { - case FormatTOML: - return "toml" - case FormatYAML: - return "yaml" - case FormatJSON: - return "json" - default: - 
return "toml" - } -} - -// KmerSetMetadata contient les métadonnées d'un KmerSet ou KmerSetGroup -type KmerSetMetadata struct { - ID string `toml:"id,omitempty" yaml:"id,omitempty" json:"id,omitempty"` // Identifiant unique - K int `toml:"k" yaml:"k" json:"k"` // Taille des k-mers - Type string `toml:"type" yaml:"type" json:"type"` // "KmerSet" ou "KmerSetGroup" - Size int `toml:"size" yaml:"size" json:"size"` // 1 pour KmerSet, n pour KmerSetGroup - Files []string `toml:"files" yaml:"files" json:"files"` // Liste des fichiers .roaring - SetsIDs []string `toml:"sets_ids,omitempty" yaml:"sets_ids,omitempty" json:"sets_ids,omitempty"` // IDs des KmerSet individuels - UserMetadata map[string]interface{} `toml:"user_metadata,omitempty" yaml:"user_metadata,omitempty" json:"user_metadata,omitempty"` // Métadonnées KmerSet ou KmerSetGroup - SetsMetadata []map[string]interface{} `toml:"sets_metadata,omitempty" yaml:"sets_metadata,omitempty" json:"sets_metadata,omitempty"` // Métadonnées des KmerSet individuels dans un KmerSetGroup -} - -// SaveKmerSet sauvegarde un KmerSet dans un répertoire -// Format: directory/metadata.{toml,yaml,json} + directory/set_0.roaring -func (ks *KmerSet) Save(directory string, format MetadataFormat) error { - // Créer le répertoire si nécessaire - if err := os.MkdirAll(directory, 0755); err != nil { - return fmt.Errorf("failed to create directory %s: %w", directory, err) - } - - // Métadonnées - metadata := KmerSetMetadata{ - ID: ks.id, - K: ks.k, - Type: "KmerSet", - Size: 1, - Files: []string{"set_0.roaring"}, - UserMetadata: ks.Metadata, // Sauvegarder les métadonnées utilisateur - } - - // Sauvegarder les métadonnées - if err := saveMetadata(filepath.Join(directory, "metadata."+format.String()), metadata, format); err != nil { - return err - } - - // Sauvegarder le bitmap - bitmapPath := filepath.Join(directory, "set_0.roaring") - file, err := os.Create(bitmapPath) - if err != nil { - return fmt.Errorf("failed to create bitmap file %s: 
%w", bitmapPath, err) - } - defer file.Close() - - if _, err := ks.bitmap.WriteTo(file); err != nil { - return fmt.Errorf("failed to write bitmap: %w", err) - } - - return nil -} - -// LoadKmerSet charge un KmerSet depuis un répertoire -func LoadKmerSet(directory string) (*KmerSet, error) { - // Lire les métadonnées (essayer tous les formats) - metadata, err := loadMetadata(directory) - if err != nil { - return nil, err - } - - // Vérifier le type - if metadata.Type != "KmerSet" { - return nil, fmt.Errorf("invalid type: expected KmerSet, got %s", metadata.Type) - } - - // Vérifier qu'il n'y a qu'un seul fichier - if metadata.Size != 1 || len(metadata.Files) != 1 { - return nil, fmt.Errorf("KmerSet must have exactly 1 bitmap file, got %d", len(metadata.Files)) - } - - // Charger le bitmap - bitmapPath := filepath.Join(directory, metadata.Files[0]) - file, err := os.Open(bitmapPath) - if err != nil { - return nil, fmt.Errorf("failed to open bitmap file %s: %w", bitmapPath, err) - } - defer file.Close() - - ks := NewKmerSet(metadata.K) - - // Charger l'ID - ks.id = metadata.ID - - // Charger les métadonnées utilisateur - if metadata.UserMetadata != nil { - ks.Metadata = metadata.UserMetadata - } - - if _, err := ks.bitmap.ReadFrom(file); err != nil { - return nil, fmt.Errorf("failed to read bitmap: %w", err) - } - - return ks, nil -} - -// SaveKmerSetGroup sauvegarde un KmerSetGroup dans un répertoire -// Format: directory/metadata.{toml,yaml,json} + directory/set_0.roaring, set_1.roaring, ... 
-func (ksg *KmerSetGroup) Save(directory string, format MetadataFormat) error { - // Créer le répertoire si nécessaire - if err := os.MkdirAll(directory, 0755); err != nil { - return fmt.Errorf("failed to create directory %s: %w", directory, err) - } - - // Métadonnées - files := make([]string, len(ksg.sets)) - for i := range ksg.sets { - files[i] = fmt.Sprintf("set_%d.roaring", i) - } - - // Collecter les IDs et métadonnées de chaque KmerSet individuel - setsIDs := make([]string, len(ksg.sets)) - setsMetadata := make([]map[string]interface{}, len(ksg.sets)) - for i, ks := range ksg.sets { - setsIDs[i] = ks.id - setsMetadata[i] = ks.Metadata - } - - metadata := KmerSetMetadata{ - ID: ksg.id, - K: ksg.k, - Type: "KmerSetGroup", - Size: len(ksg.sets), - Files: files, - SetsIDs: setsIDs, // IDs de chaque set - UserMetadata: ksg.Metadata, // Métadonnées du groupe - SetsMetadata: setsMetadata, // Métadonnées de chaque set - } - - // Sauvegarder les métadonnées - if err := saveMetadata(filepath.Join(directory, "metadata."+format.String()), metadata, format); err != nil { - return err - } - - // Sauvegarder chaque bitmap - for i, ks := range ksg.sets { - bitmapPath := filepath.Join(directory, files[i]) - file, err := os.Create(bitmapPath) - if err != nil { - return fmt.Errorf("failed to create bitmap file %s: %w", bitmapPath, err) - } - - if _, err := ks.bitmap.WriteTo(file); err != nil { - file.Close() - return fmt.Errorf("failed to write bitmap %d: %w", i, err) - } - file.Close() - } - - return nil -} - -// LoadKmerSetGroup charge un KmerSetGroup depuis un répertoire -func LoadKmerSetGroup(directory string) (*KmerSetGroup, error) { - // Lire les métadonnées (essayer tous les formats) - metadata, err := loadMetadata(directory) - if err != nil { - return nil, err - } - - // Vérifier le type - if metadata.Type != "KmerSetGroup" { - return nil, fmt.Errorf("invalid type: expected KmerSetGroup, got %s", metadata.Type) - } - - // Vérifier la cohérence - if metadata.Size != 
len(metadata.Files) { - return nil, fmt.Errorf("size mismatch: size=%d but %d files listed", metadata.Size, len(metadata.Files)) - } - - // Créer le groupe - ksg := NewKmerSetGroup(metadata.K, metadata.Size) - - // Charger l'ID du groupe - ksg.id = metadata.ID - - // Charger les métadonnées du groupe - if metadata.UserMetadata != nil { - ksg.Metadata = metadata.UserMetadata - } - - // Charger les IDs de chaque KmerSet - if metadata.SetsIDs != nil && len(metadata.SetsIDs) == metadata.Size { - for i := range ksg.sets { - ksg.sets[i].id = metadata.SetsIDs[i] - } - } - - // Charger les métadonnées de chaque KmerSet individuel - if metadata.SetsMetadata != nil { - if len(metadata.SetsMetadata) != metadata.Size { - return nil, fmt.Errorf("sets metadata size mismatch: expected %d, got %d", metadata.Size, len(metadata.SetsMetadata)) - } - for i := range ksg.sets { - ksg.sets[i].Metadata = metadata.SetsMetadata[i] - } - } - - // Charger chaque bitmap - for i, filename := range metadata.Files { - bitmapPath := filepath.Join(directory, filename) - file, err := os.Open(bitmapPath) - if err != nil { - return nil, fmt.Errorf("failed to open bitmap file %s: %w", bitmapPath, err) - } - - if _, err := ksg.sets[i].bitmap.ReadFrom(file); err != nil { - file.Close() - return nil, fmt.Errorf("failed to read bitmap %d: %w", i, err) - } - file.Close() - } - - return ksg, nil -} - -// saveMetadata sauvegarde les métadonnées dans le format spécifié -func saveMetadata(path string, metadata KmerSetMetadata, format MetadataFormat) error { - file, err := os.Create(path) - if err != nil { - return fmt.Errorf("failed to create metadata file %s: %w", path, err) - } - defer file.Close() - - var encoder interface{ Encode(interface{}) error } - - switch format { - case FormatTOML: - encoder = toml.NewEncoder(file) - case FormatYAML: - encoder = yaml.NewEncoder(file) - case FormatJSON: - jsonEncoder := json.NewEncoder(file) - jsonEncoder.SetIndent("", " ") - encoder = jsonEncoder - default: - return 
fmt.Errorf("unsupported format: %v", format) - } - - if err := encoder.Encode(metadata); err != nil { - return fmt.Errorf("failed to encode metadata: %w", err) - } - - return nil -} - -// loadMetadata charge les métadonnées depuis un répertoire -// Essaie tous les formats (TOML, YAML, JSON) dans l'ordre -func loadMetadata(directory string) (*KmerSetMetadata, error) { - formats := []MetadataFormat{FormatTOML, FormatYAML, FormatJSON} - - var lastErr error - for _, format := range formats { - path := filepath.Join(directory, "metadata."+format.String()) - - // Vérifier si le fichier existe - if _, err := os.Stat(path); os.IsNotExist(err) { - continue - } - - metadata, err := loadMetadataFromFile(path, format) - if err != nil { - lastErr = err - continue - } - return metadata, nil - } - - if lastErr != nil { - return nil, fmt.Errorf("failed to load metadata: %w", lastErr) - } - return nil, fmt.Errorf("no metadata file found in %s (tried .toml, .yaml, .json)", directory) -} - -// loadMetadataFromFile charge les métadonnées depuis un fichier spécifique -func loadMetadataFromFile(path string, format MetadataFormat) (*KmerSetMetadata, error) { - file, err := os.Open(path) - if err != nil { - return nil, fmt.Errorf("failed to open metadata file %s: %w", path, err) - } - defer file.Close() - - var metadata KmerSetMetadata - var decoder interface{ Decode(interface{}) error } - - switch format { - case FormatTOML: - decoder = toml.NewDecoder(file) - case FormatYAML: - decoder = yaml.NewDecoder(file) - case FormatJSON: - decoder = json.NewDecoder(file) - default: - return nil, fmt.Errorf("unsupported format: %v", format) - } - - if err := decoder.Decode(&metadata); err != nil { - return nil, fmt.Errorf("failed to decode metadata: %w", err) - } - - return &metadata, nil -} - -// DetectFormat détecte le format des métadonnées dans un répertoire -func DetectFormat(directory string) (MetadataFormat, error) { - formats := []MetadataFormat{FormatTOML, FormatYAML, FormatJSON} - - for 
_, format := range formats { - path := filepath.Join(directory, "metadata."+format.String()) - if _, err := os.Stat(path); err == nil { - return format, nil - } - } - - return FormatTOML, fmt.Errorf("no metadata file found in %s", directory) -} - -// IsKmerSetDirectory vérifie si un répertoire contient un KmerSet ou KmerSetGroup -func IsKmerSetDirectory(directory string) (bool, string, error) { - metadata, err := loadMetadata(directory) - if err != nil { - return false, "", err - } - - return true, metadata.Type, nil -} - -// ListBitmapFiles liste tous les fichiers .roaring dans un répertoire -func ListBitmapFiles(directory string) ([]string, error) { - entries, err := os.ReadDir(directory) - if err != nil { - return nil, fmt.Errorf("failed to read directory %s: %w", directory, err) - } - - var files []string - for _, entry := range entries { - if !entry.IsDir() && strings.HasSuffix(entry.Name(), ".roaring") { - files = append(files, entry.Name()) - } - } - - return files, nil -} diff --git a/pkg/obikmer/kmer_set_test.go b/pkg/obikmer/kmer_set_test.go deleted file mode 100644 index 77144c7..0000000 --- a/pkg/obikmer/kmer_set_test.go +++ /dev/null @@ -1,272 +0,0 @@ -package obikmer - -import ( - "math" - "testing" -) - -func TestJaccardDistanceIdentical(t *testing.T) { - ks1 := NewKmerSet(5) - ks1.AddKmerCode(100) - ks1.AddKmerCode(200) - ks1.AddKmerCode(300) - - ks2 := NewKmerSet(5) - ks2.AddKmerCode(100) - ks2.AddKmerCode(200) - ks2.AddKmerCode(300) - - distance := ks1.JaccardDistance(ks2) - similarity := ks1.JaccardSimilarity(ks2) - - if distance != 0.0 { - t.Errorf("Expected distance 0.0 for identical sets, got %f", distance) - } - - if similarity != 1.0 { - t.Errorf("Expected similarity 1.0 for identical sets, got %f", similarity) - } -} - -func TestJaccardDistanceDisjoint(t *testing.T) { - ks1 := NewKmerSet(5) - ks1.AddKmerCode(100) - ks1.AddKmerCode(200) - ks1.AddKmerCode(300) - - ks2 := NewKmerSet(5) - ks2.AddKmerCode(400) - ks2.AddKmerCode(500) - 
ks2.AddKmerCode(600) - - distance := ks1.JaccardDistance(ks2) - similarity := ks1.JaccardSimilarity(ks2) - - if distance != 1.0 { - t.Errorf("Expected distance 1.0 for disjoint sets, got %f", distance) - } - - if similarity != 0.0 { - t.Errorf("Expected similarity 0.0 for disjoint sets, got %f", similarity) - } -} - -func TestJaccardDistancePartialOverlap(t *testing.T) { - // Set 1: {1, 2, 3} - ks1 := NewKmerSet(5) - ks1.AddKmerCode(1) - ks1.AddKmerCode(2) - ks1.AddKmerCode(3) - - // Set 2: {2, 3, 4} - ks2 := NewKmerSet(5) - ks2.AddKmerCode(2) - ks2.AddKmerCode(3) - ks2.AddKmerCode(4) - - // Intersection: {2, 3} -> cardinality = 2 - // Union: {1, 2, 3, 4} -> cardinality = 4 - // Similarity = 2/4 = 0.5 - // Distance = 1 - 0.5 = 0.5 - - distance := ks1.JaccardDistance(ks2) - similarity := ks1.JaccardSimilarity(ks2) - - expectedDistance := 0.5 - expectedSimilarity := 0.5 - - if math.Abs(distance-expectedDistance) > 1e-10 { - t.Errorf("Expected distance %f, got %f", expectedDistance, distance) - } - - if math.Abs(similarity-expectedSimilarity) > 1e-10 { - t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity) - } -} - -func TestJaccardDistanceOneSubsetOfOther(t *testing.T) { - // Set 1: {1, 2} - ks1 := NewKmerSet(5) - ks1.AddKmerCode(1) - ks1.AddKmerCode(2) - - // Set 2: {1, 2, 3, 4} - ks2 := NewKmerSet(5) - ks2.AddKmerCode(1) - ks2.AddKmerCode(2) - ks2.AddKmerCode(3) - ks2.AddKmerCode(4) - - // Intersection: {1, 2} -> cardinality = 2 - // Union: {1, 2, 3, 4} -> cardinality = 4 - // Similarity = 2/4 = 0.5 - // Distance = 1 - 0.5 = 0.5 - - distance := ks1.JaccardDistance(ks2) - similarity := ks1.JaccardSimilarity(ks2) - - expectedDistance := 0.5 - expectedSimilarity := 0.5 - - if math.Abs(distance-expectedDistance) > 1e-10 { - t.Errorf("Expected distance %f, got %f", expectedDistance, distance) - } - - if math.Abs(similarity-expectedSimilarity) > 1e-10 { - t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity) - } -} - -func 
TestJaccardDistanceEmptySets(t *testing.T) { - ks1 := NewKmerSet(5) - ks2 := NewKmerSet(5) - - distance := ks1.JaccardDistance(ks2) - similarity := ks1.JaccardSimilarity(ks2) - - // By convention, distance = 1.0 for empty sets - if distance != 1.0 { - t.Errorf("Expected distance 1.0 for empty sets, got %f", distance) - } - - if similarity != 0.0 { - t.Errorf("Expected similarity 0.0 for empty sets, got %f", similarity) - } -} - -func TestJaccardDistanceOneEmpty(t *testing.T) { - ks1 := NewKmerSet(5) - ks1.AddKmerCode(1) - ks1.AddKmerCode(2) - ks1.AddKmerCode(3) - - ks2 := NewKmerSet(5) - - distance := ks1.JaccardDistance(ks2) - similarity := ks1.JaccardSimilarity(ks2) - - // Intersection: {} -> cardinality = 0 - // Union: {1, 2, 3} -> cardinality = 3 - // Similarity = 0/3 = 0.0 - // Distance = 1.0 - - if distance != 1.0 { - t.Errorf("Expected distance 1.0 when one set is empty, got %f", distance) - } - - if similarity != 0.0 { - t.Errorf("Expected similarity 0.0 when one set is empty, got %f", similarity) - } -} - -func TestJaccardDistanceDifferentK(t *testing.T) { - ks1 := NewKmerSet(5) - ks1.AddKmerCode(1) - - ks2 := NewKmerSet(7) - ks2.AddKmerCode(1) - - defer func() { - if r := recover(); r == nil { - t.Errorf("Expected panic when computing Jaccard distance with different k values") - } - }() - - _ = ks1.JaccardDistance(ks2) -} - -func TestJaccardDistanceSimilarityRelation(t *testing.T) { - // Test that distance + similarity = 1.0 for all cases - testCases := []struct { - name string - ks1 *KmerSet - ks2 *KmerSet - }{ - { - name: "partial overlap", - ks1: func() *KmerSet { - ks := NewKmerSet(5) - ks.AddKmerCode(1) - ks.AddKmerCode(2) - ks.AddKmerCode(3) - return ks - }(), - ks2: func() *KmerSet { - ks := NewKmerSet(5) - ks.AddKmerCode(2) - ks.AddKmerCode(3) - ks.AddKmerCode(4) - ks.AddKmerCode(5) - return ks - }(), - }, - { - name: "identical", - ks1: func() *KmerSet { - ks := NewKmerSet(5) - ks.AddKmerCode(10) - ks.AddKmerCode(20) - return ks - }(), - ks2: 
func() *KmerSet { - ks := NewKmerSet(5) - ks.AddKmerCode(10) - ks.AddKmerCode(20) - return ks - }(), - }, - { - name: "disjoint", - ks1: func() *KmerSet { - ks := NewKmerSet(5) - ks.AddKmerCode(1) - return ks - }(), - ks2: func() *KmerSet { - ks := NewKmerSet(5) - ks.AddKmerCode(100) - return ks - }(), - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - distance := tc.ks1.JaccardDistance(tc.ks2) - similarity := tc.ks1.JaccardSimilarity(tc.ks2) - - sum := distance + similarity - - if math.Abs(sum-1.0) > 1e-10 { - t.Errorf("Expected distance + similarity = 1.0, got %f + %f = %f", - distance, similarity, sum) - } - }) - } -} - -func TestJaccardDistanceSymmetry(t *testing.T) { - ks1 := NewKmerSet(5) - ks1.AddKmerCode(1) - ks1.AddKmerCode(2) - ks1.AddKmerCode(3) - - ks2 := NewKmerSet(5) - ks2.AddKmerCode(2) - ks2.AddKmerCode(3) - ks2.AddKmerCode(4) - - distance1 := ks1.JaccardDistance(ks2) - distance2 := ks2.JaccardDistance(ks1) - - similarity1 := ks1.JaccardSimilarity(ks2) - similarity2 := ks2.JaccardSimilarity(ks1) - - if math.Abs(distance1-distance2) > 1e-10 { - t.Errorf("Jaccard distance not symmetric: %f vs %f", distance1, distance2) - } - - if math.Abs(similarity1-similarity2) > 1e-10 { - t.Errorf("Jaccard similarity not symmetric: %f vs %f", similarity1, similarity2) - } -} diff --git a/pkg/obikmer/minimizer_utils.go b/pkg/obikmer/minimizer_utils.go new file mode 100644 index 0000000..8221eb3 --- /dev/null +++ b/pkg/obikmer/minimizer_utils.go @@ -0,0 +1,47 @@ +package obikmer + +import ( + "math" + + log "github.com/sirupsen/logrus" +) + +// DefaultMinimizerSize returns ceil(k / 2.5) as a reasonable default minimizer size. +func DefaultMinimizerSize(k int) int { + m := int(math.Ceil(float64(k) / 2.5)) + if m < 1 { + m = 1 + } + if m >= k { + m = k - 1 + } + return m +} + +// MinMinimizerSize returns the minimum m such that 4^m >= nworkers, +// i.e. ceil(log(nworkers) / log(4)). 
+func MinMinimizerSize(nworkers int) int { + if nworkers <= 1 { + return 1 + } + return int(math.Ceil(math.Log(float64(nworkers)) / math.Log(4))) +} + +// ValidateMinimizerSize checks and adjusts the minimizer size to satisfy constraints: +// - m >= ceil(log(nworkers)/log(4)) +// - 1 <= m < k +func ValidateMinimizerSize(m, k, nworkers int) int { + minM := MinMinimizerSize(nworkers) + if m < minM { + log.Warnf("Minimizer size %d too small for %d workers (4^%d = %d < %d), adjusting to %d", + m, nworkers, m, 1<<(2*m), nworkers, minM) + m = minM + } + if m < 1 { + m = 1 + } + if m >= k { + m = k - 1 + } + return m +} diff --git a/pkg/obikmer/skm_reader.go b/pkg/obikmer/skm_reader.go new file mode 100644 index 0000000..64ef6f9 --- /dev/null +++ b/pkg/obikmer/skm_reader.go @@ -0,0 +1,67 @@ +package obikmer + +import ( + "bufio" + "encoding/binary" + "io" + "os" +) + +// decode2bit maps 2-bit codes back to nucleotide bytes. +var decode2bit = [4]byte{'a', 'c', 'g', 't'} + +// SkmReader reads super-kmers from a binary .skm file. +type SkmReader struct { + r *bufio.Reader + file *os.File +} + +// NewSkmReader opens a .skm file for reading. +func NewSkmReader(path string) (*SkmReader, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + return &SkmReader{ + r: bufio.NewReaderSize(f, 65536), + file: f, + }, nil +} + +// Next reads the next super-kmer from the file. +// Returns the SuperKmer and true, or a zero SuperKmer and false at EOF. 
+func (sr *SkmReader) Next() (SuperKmer, bool) { + // Read length + var lenbuf [2]byte + if _, err := io.ReadFull(sr.r, lenbuf[:]); err != nil { + return SuperKmer{}, false + } + seqLen := int(binary.LittleEndian.Uint16(lenbuf[:])) + + // Read packed bytes + nBytes := (seqLen + 3) / 4 + packed := make([]byte, nBytes) + if _, err := io.ReadFull(sr.r, packed); err != nil { + return SuperKmer{}, false + } + + // Decode to nucleotide bytes + seq := make([]byte, seqLen) + for i := 0; i < seqLen; i++ { + byteIdx := i / 4 + bitPos := uint(6 - (i%4)*2) + code := (packed[byteIdx] >> bitPos) & 0x03 + seq[i] = decode2bit[code] + } + + return SuperKmer{ + Sequence: seq, + Start: 0, + End: seqLen, + }, true +} + +// Close closes the underlying file. +func (sr *SkmReader) Close() error { + return sr.file.Close() +} diff --git a/pkg/obikmer/skm_test.go b/pkg/obikmer/skm_test.go new file mode 100644 index 0000000..7bc4734 --- /dev/null +++ b/pkg/obikmer/skm_test.go @@ -0,0 +1,176 @@ +package obikmer + +import ( + "os" + "path/filepath" + "testing" +) + +func TestSkmRoundTrip(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "test.skm") + + // Create super-kmers from a known sequence + seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT") + k := 21 + m := 9 + superKmers := ExtractSuperKmers(seq, k, m, nil) + if len(superKmers) == 0 { + t.Fatal("no super-kmers extracted") + } + + // Write + w, err := NewSkmWriter(path) + if err != nil { + t.Fatal(err) + } + for _, sk := range superKmers { + if err := w.Write(sk); err != nil { + t.Fatal(err) + } + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Read back + r, err := NewSkmReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + idx := 0 + for { + sk, ok := r.Next() + if !ok { + break + } + if idx >= len(superKmers) { + t.Fatal("read more super-kmers than written") + } + expected := superKmers[idx] + if len(sk.Sequence) != len(expected.Sequence) { + t.Fatalf("super-kmer %d: length 
mismatch: got %d, want %d", + idx, len(sk.Sequence), len(expected.Sequence)) + } + // Compare nucleotide-by-nucleotide (case insensitive since decode produces lowercase) + for j := range sk.Sequence { + got := sk.Sequence[j] | 0x20 + want := expected.Sequence[j] | 0x20 + if got != want { + t.Fatalf("super-kmer %d pos %d: got %c, want %c", idx, j, got, want) + } + } + idx++ + } + if idx != len(superKmers) { + t.Fatalf("read %d super-kmers, want %d", idx, len(superKmers)) + } +} + +func TestSkmEmptyFile(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "empty.skm") + + // Write nothing + w, err := NewSkmWriter(path) + if err != nil { + t.Fatal(err) + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Read back + r, err := NewSkmReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + _, ok := r.Next() + if ok { + t.Fatal("expected no super-kmers in empty file") + } +} + +func TestSkmSingleBase(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "single.skm") + + // Test with sequences of various lengths to check padding + sequences := [][]byte{ + []byte("A"), + []byte("AC"), + []byte("ACG"), + []byte("ACGT"), + []byte("ACGTA"), + } + + w, err := NewSkmWriter(path) + if err != nil { + t.Fatal(err) + } + for _, seq := range sequences { + sk := SuperKmer{Sequence: seq} + if err := w.Write(sk); err != nil { + t.Fatal(err) + } + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + r, err := NewSkmReader(path) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + for i, expected := range sequences { + sk, ok := r.Next() + if !ok { + t.Fatalf("expected super-kmer %d, got EOF", i) + } + if len(sk.Sequence) != len(expected) { + t.Fatalf("sk %d: length %d, want %d", i, len(sk.Sequence), len(expected)) + } + for j := range sk.Sequence { + got := sk.Sequence[j] | 0x20 + want := expected[j] | 0x20 + if got != want { + t.Fatalf("sk %d pos %d: got %c, want %c", i, j, got, want) + } + } + } +} + +func 
TestSkmFileSize(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "size.skm") + + // Write a sequence of known length + seq := []byte("ACGTACGTAC") // 10 bases + sk := SuperKmer{Sequence: seq} + + w, err := NewSkmWriter(path) + if err != nil { + t.Fatal(err) + } + if err := w.Write(sk); err != nil { + t.Fatal(err) + } + if err := w.Close(); err != nil { + t.Fatal(err) + } + + // Expected: 2 bytes (length) + ceil(10/4)=3 bytes (data) = 5 bytes + info, err := os.Stat(path) + if err != nil { + t.Fatal(err) + } + if info.Size() != 5 { + t.Fatalf("file size: got %d, want 5", info.Size()) + } +} diff --git a/pkg/obikmer/skm_writer.go b/pkg/obikmer/skm_writer.go new file mode 100644 index 0000000..1123d2e --- /dev/null +++ b/pkg/obikmer/skm_writer.go @@ -0,0 +1,74 @@ +package obikmer + +import ( + "bufio" + "encoding/binary" + "os" +) + +// SkmWriter writes super-kmers to a binary .skm file. +// +// Format per super-kmer: +// +// [len: uint16 LE] length of the super-kmer in bases +// [data: ceil(len/4) bytes] sequence encoded 2 bits/base, packed +// +// Nucleotide encoding: A=00, C=01, G=10, T=11. +// The last byte is zero-padded on the low bits if len%4 != 0. +type SkmWriter struct { + w *bufio.Writer + file *os.File +} + +// NewSkmWriter creates a new SkmWriter writing to the given file path. +func NewSkmWriter(path string) (*SkmWriter, error) { + f, err := os.Create(path) + if err != nil { + return nil, err + } + return &SkmWriter{ + w: bufio.NewWriterSize(f, 65536), + file: f, + }, nil +} + +// Write encodes a SuperKmer to the .skm file. +// The sequence bytes are packed 2 bits per base. 
+func (sw *SkmWriter) Write(sk SuperKmer) error { + seq := sk.Sequence + seqLen := uint16(len(seq)) + + // Write length + var lenbuf [2]byte + binary.LittleEndian.PutUint16(lenbuf[:], seqLen) + if _, err := sw.w.Write(lenbuf[:]); err != nil { + return err + } + + // Encode and write packed sequence (2 bits/base) + nBytes := (int(seqLen) + 3) / 4 + for i := 0; i < nBytes; i++ { + var packed byte + for j := 0; j < 4; j++ { + pos := i*4 + j + packed <<= 2 + if pos < int(seqLen) { + packed |= __single_base_code__[seq[pos]&31] + } + } + if err := sw.w.WriteByte(packed); err != nil { + return err + } + } + + return nil +} + +// Close flushes buffered data and closes the underlying file. +func (sw *SkmWriter) Close() error { + if err := sw.w.Flush(); err != nil { + sw.file.Close() + return err + } + return sw.file.Close() +} diff --git a/pkg/obikmer/spectrum.go b/pkg/obikmer/spectrum.go new file mode 100644 index 0000000..c5b5733 --- /dev/null +++ b/pkg/obikmer/spectrum.go @@ -0,0 +1,253 @@ +package obikmer + +import ( + "bufio" + "container/heap" + "encoding/csv" + "fmt" + "os" + "sort" + "strconv" +) + +// KSP file magic bytes: "KSP\x01" (K-mer SPectrum v1) +var kspMagic = [4]byte{'K', 'S', 'P', 0x01} + +// SpectrumEntry represents one entry in a k-mer frequency spectrum. +type SpectrumEntry struct { + Frequency int // how many times a k-mer was observed + Count uint64 // how many distinct k-mers have this frequency +} + +// KmerSpectrum represents the frequency distribution of k-mers. +// Entries are sorted by Frequency in ascending order and only include +// non-zero counts. +type KmerSpectrum struct { + Entries []SpectrumEntry +} + +// MaxFrequency returns the highest frequency in the spectrum, or 0 if empty. +func (s *KmerSpectrum) MaxFrequency() int { + if len(s.Entries) == 0 { + return 0 + } + return s.Entries[len(s.Entries)-1].Frequency +} + +// ToMap converts a KmerSpectrum back to a map for easy lookup. 
+func (s *KmerSpectrum) ToMap() map[int]uint64 { + m := make(map[int]uint64, len(s.Entries)) + for _, e := range s.Entries { + m[e.Frequency] = e.Count + } + return m +} + +// MapToSpectrum converts a map[int]uint64 to a sorted KmerSpectrum. +func MapToSpectrum(m map[int]uint64) *KmerSpectrum { + entries := make([]SpectrumEntry, 0, len(m)) + for freq, count := range m { + if count > 0 { + entries = append(entries, SpectrumEntry{Frequency: freq, Count: count}) + } + } + sort.Slice(entries, func(i, j int) bool { + return entries[i].Frequency < entries[j].Frequency + }) + return &KmerSpectrum{Entries: entries} +} + +// MergeSpectraMaps adds all entries from b into a. +func MergeSpectraMaps(a, b map[int]uint64) { + for freq, count := range b { + a[freq] += count + } +} + +// WriteSpectrum writes a KmerSpectrum to a binary file. +// +// Format: +// +// [magic: 4 bytes "KSP\x01"] +// [n_entries: varint] +// For each entry (sorted by frequency ascending): +// [frequency: varint] +// [count: varint] +func WriteSpectrum(path string, spectrum *KmerSpectrum) error { + f, err := os.Create(path) + if err != nil { + return fmt.Errorf("create spectrum file: %w", err) + } + w := bufio.NewWriterSize(f, 65536) + + // Magic + if _, err := w.Write(kspMagic[:]); err != nil { + f.Close() + return err + } + + // Number of entries + if _, err := EncodeVarint(w, uint64(len(spectrum.Entries))); err != nil { + f.Close() + return err + } + + // Entries + for _, e := range spectrum.Entries { + if _, err := EncodeVarint(w, uint64(e.Frequency)); err != nil { + f.Close() + return err + } + if _, err := EncodeVarint(w, e.Count); err != nil { + f.Close() + return err + } + } + + if err := w.Flush(); err != nil { + f.Close() + return err + } + return f.Close() +} + +// ReadSpectrum reads a KmerSpectrum from a binary file. 
+func ReadSpectrum(path string) (*KmerSpectrum, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + r := bufio.NewReaderSize(f, 65536) + + // Check magic + var magic [4]byte + if _, err := r.Read(magic[:]); err != nil { + return nil, fmt.Errorf("read spectrum magic: %w", err) + } + if magic != kspMagic { + return nil, fmt.Errorf("invalid spectrum file magic: %v", magic) + } + + // Number of entries + nEntries, err := DecodeVarint(r) + if err != nil { + return nil, fmt.Errorf("read spectrum entry count: %w", err) + } + + entries := make([]SpectrumEntry, nEntries) + for i := uint64(0); i < nEntries; i++ { + freq, err := DecodeVarint(r) + if err != nil { + return nil, fmt.Errorf("read spectrum freq at entry %d: %w", i, err) + } + count, err := DecodeVarint(r) + if err != nil { + return nil, fmt.Errorf("read spectrum count at entry %d: %w", i, err) + } + entries[i] = SpectrumEntry{ + Frequency: int(freq), + Count: count, + } + } + + return &KmerSpectrum{Entries: entries}, nil +} + +// KmerFreq associates a k-mer (encoded as uint64) with its observed frequency. +type KmerFreq struct { + Kmer uint64 + Freq int +} + +// kmerFreqHeap is a min-heap of KmerFreq ordered by Freq (lowest first). +// Used to maintain a top-N most frequent k-mers set. +type kmerFreqHeap []KmerFreq + +func (h kmerFreqHeap) Len() int { return len(h) } +func (h kmerFreqHeap) Less(i, j int) bool { return h[i].Freq < h[j].Freq } +func (h kmerFreqHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } +func (h *kmerFreqHeap) Push(x interface{}) { *h = append(*h, x.(KmerFreq)) } +func (h *kmerFreqHeap) Pop() interface{} { + old := *h + n := len(old) + x := old[n-1] + *h = old[:n-1] + return x +} + +// TopNKmers maintains a collection of the N most frequent k-mers +// using a min-heap. Thread-safe usage requires external synchronization. +type TopNKmers struct { + n int + h kmerFreqHeap +} + +// NewTopNKmers creates a new top-N collector. 
+func NewTopNKmers(n int) *TopNKmers { + return &TopNKmers{ + n: n, + h: make(kmerFreqHeap, 0, n+1), + } +} + +// Add considers a k-mer with the given frequency for inclusion in the top-N. +func (t *TopNKmers) Add(kmer uint64, freq int) { + if t.n <= 0 { + return + } + if len(t.h) < t.n { + heap.Push(&t.h, KmerFreq{Kmer: kmer, Freq: freq}) + } else if freq > t.h[0].Freq { + t.h[0] = KmerFreq{Kmer: kmer, Freq: freq} + heap.Fix(&t.h, 0) + } +} + +// Results returns the collected k-mers sorted by frequency descending. +func (t *TopNKmers) Results() []KmerFreq { + result := make([]KmerFreq, len(t.h)) + copy(result, t.h) + sort.Slice(result, func(i, j int) bool { + return result[i].Freq > result[j].Freq + }) + return result +} + +// MergeTopN merges another TopNKmers into this one. +func (t *TopNKmers) MergeTopN(other *TopNKmers) { + if other == nil { + return + } + for _, kf := range other.h { + t.Add(kf.Kmer, kf.Freq) + } +} + +// WriteTopKmersCSV writes the top k-mers to a CSV file. +// Columns: sequence, frequency +func WriteTopKmersCSV(path string, topKmers []KmerFreq, k int) error { + f, err := os.Create(path) + if err != nil { + return fmt.Errorf("create top-kmers file: %w", err) + } + defer f.Close() + + w := csv.NewWriter(f) + defer w.Flush() + + if err := w.Write([]string{"sequence", "frequency"}); err != nil { + return err + } + + buf := make([]byte, k) + for _, kf := range topKmers { + seq := DecodeKmer(kf.Kmer, k, buf) + if err := w.Write([]string{string(seq), strconv.Itoa(kf.Freq)}); err != nil { + return err + } + } + + return nil +} diff --git a/pkg/obikmer/varint.go b/pkg/obikmer/varint.go new file mode 100644 index 0000000..cae6475 --- /dev/null +++ b/pkg/obikmer/varint.go @@ -0,0 +1,53 @@ +package obikmer + +import "io" + +// EncodeVarint writes a uint64 value as a variable-length integer to w. +// Uses 7 bits per byte with the high bit as a continuation flag +// (identical to protobuf unsigned varint encoding). 
+// Returns the number of bytes written. +func EncodeVarint(w io.Writer, v uint64) (int, error) { + var buf [10]byte // max 10 bytes for uint64 varint + n := 0 + for v >= 0x80 { + buf[n] = byte(v) | 0x80 + v >>= 7 + n++ + } + buf[n] = byte(v) + n++ + return w.Write(buf[:n]) +} + +// DecodeVarint reads a variable-length encoded uint64 from r. +// Returns the decoded value and any error encountered. +func DecodeVarint(r io.Reader) (uint64, error) { + var val uint64 + var shift uint + var buf [1]byte + + for { + if _, err := io.ReadFull(r, buf[:]); err != nil { + return 0, err + } + b := buf[0] + val |= uint64(b&0x7F) << shift + if b < 0x80 { + return val, nil + } + shift += 7 + if shift >= 70 { + return 0, io.ErrUnexpectedEOF + } + } +} + +// VarintLen returns the number of bytes needed to encode v as a varint. +func VarintLen(v uint64) int { + n := 1 + for v >= 0x80 { + v >>= 7 + n++ + } + return n +} diff --git a/pkg/obikmer/varint_test.go b/pkg/obikmer/varint_test.go new file mode 100644 index 0000000..ebe466b --- /dev/null +++ b/pkg/obikmer/varint_test.go @@ -0,0 +1,82 @@ +package obikmer + +import ( + "bytes" + "testing" +) + +func TestVarintRoundTrip(t *testing.T) { + values := []uint64{ + 0, 1, 127, 128, 255, 256, + 16383, 16384, + 1<<21 - 1, 1 << 21, + 1<<28 - 1, 1 << 28, + 1<<35 - 1, 1 << 35, + 1<<42 - 1, 1 << 42, + 1<<49 - 1, 1 << 49, + 1<<56 - 1, 1 << 56, + 1<<63 - 1, 1 << 63, + ^uint64(0), // max uint64 + } + + for _, v := range values { + var buf bytes.Buffer + n, err := EncodeVarint(&buf, v) + if err != nil { + t.Fatalf("EncodeVarint(%d): %v", v, err) + } + if n != VarintLen(v) { + t.Fatalf("EncodeVarint(%d): wrote %d bytes, VarintLen says %d", v, n, VarintLen(v)) + } + + decoded, err := DecodeVarint(&buf) + if err != nil { + t.Fatalf("DecodeVarint for %d: %v", v, err) + } + if decoded != v { + t.Fatalf("roundtrip failed: encoded %d, decoded %d", v, decoded) + } + } +} + +func TestVarintLen(t *testing.T) { + tests := []struct { + value uint64 + expected 
int + }{ + {0, 1}, + {127, 1}, + {128, 2}, + {16383, 2}, + {16384, 3}, + {^uint64(0), 10}, + } + + for _, tc := range tests { + got := VarintLen(tc.value) + if got != tc.expected { + t.Errorf("VarintLen(%d) = %d, want %d", tc.value, got, tc.expected) + } + } +} + +func TestVarintSequence(t *testing.T) { + var buf bytes.Buffer + values := []uint64{0, 42, 1000000, ^uint64(0), 1} + + for _, v := range values { + if _, err := EncodeVarint(&buf, v); err != nil { + t.Fatalf("EncodeVarint(%d): %v", v, err) + } + } + + for _, expected := range values { + got, err := DecodeVarint(&buf) + if err != nil { + t.Fatalf("DecodeVarint: %v", err) + } + if got != expected { + t.Errorf("got %d, want %d", got, expected) + } + } +} diff --git a/pkg/obioptions/options.go b/pkg/obioptions/options.go index aaf45f9..5109ac3 100644 --- a/pkg/obioptions/options.go +++ b/pkg/obioptions/options.go @@ -26,16 +26,11 @@ var __defaut_taxonomy_mutex__ sync.Mutex type ArgumentParser func([]string) (*getoptions.GetOpt, []string) -func GenerateOptionParser(program string, - documentation string, - optionset ...func(*getoptions.GetOpt)) ArgumentParser { - - options := getoptions.New() - options.Self(program, documentation) - options.SetMode(getoptions.Bundling) - options.SetUnknownMode(getoptions.Fail) - options.Bool("help", false, options.Alias("h", "?")) - +// RegisterGlobalOptions registers the global options shared by all obitools +// commands onto the given GetOpt instance. It does NOT register --help, +// which must be handled by the caller (either as a Bool option or via +// HelpCommand for subcommand-based parsers). +func RegisterGlobalOptions(options *getoptions.GetOpt) { options.Bool("version", false, options.Description("Prints the version and exits.")) @@ -46,17 +41,10 @@ func GenerateOptionParser(program string, options.BoolVar(&_Pprof, "pprof", false, options.Description("Enable pprof server. 
Look at the log for details.")) - // options.IntVar(&_ParallelWorkers, "workers", _ParallelWorkers, - // options.Alias("w"), - // options.Description("Number of parallele threads computing the result")) - options.IntVar(obidefault.MaxCPUPtr(), "max-cpu", obidefault.MaxCPU(), options.GetEnv("OBIMAXCPU"), options.Description("Number of parallele threads computing the result")) - // options.BoolVar(&_Pprof, "force-one-cpu", false, - // options.Description("Force to use only one cpu core for parallel processing")) - options.IntVar(&_PprofMudex, "pprof-mutex", _PprofMudex, options.GetEnv("OBIPPROFMUTEX"), options.Description("Enable profiling of mutex lock.")) @@ -77,119 +65,119 @@ func GenerateOptionParser(program string, options.GetEnv("OBIWARNING"), options.Description("Stop printing of the warning message"), ) +} + +// ProcessParsedOptions handles the post-parse logic common to all obitools +// commands: help, version, debug, pprof, taxonomy, cpu configuration, etc. +// It receives the GetOpt instance and the parse error (if any). +func ProcessParsedOptions(options *getoptions.GetOpt, parseErr error) { + // Note: "help" may not be registered as a Bool (e.g. when using HelpCommand + // for subcommand-based parsers). Only check if it won't panic. + // We use a recover guard to be safe. 
+ func() { + defer func() { recover() }() + if options.Called("help") { + fmt.Fprint(os.Stderr, options.Help()) + os.Exit(0) + } + }() + + if options.Called("version") { + fmt.Fprintf(os.Stderr, "OBITools %s\n", VersionString()) + os.Exit(0) + } + + if options.Called("taxonomy") { + __defaut_taxonomy_mutex__.Lock() + defer __defaut_taxonomy_mutex__.Unlock() + taxonomy, err := obiformats.LoadTaxonomy( + obidefault.SelectedTaxonomy(), + !obidefault.AreAlternativeNamesSelected(), + SeqAsTaxa(), + ) + + if err != nil { + log.Fatalf("Cannot load default taxonomy: %v", err) + } + + taxonomy.SetAsDefault() + } + + log.SetLevel(log.InfoLevel) + if options.Called("debug") { + log.SetLevel(log.DebugLevel) + log.Debugln("Switch to debug level logging") + } + + if options.Called("pprof") { + url := "localhost:6060" + go http.ListenAndServe(url, nil) + log.Infof("Start a pprof server at address %s/debug/pprof", url) + log.Info("Profil can be followed running concurrently the command :") + log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/profile?seconds=30'") + } + + if options.Called("pprof-mutex") { + url := "localhost:6060" + go http.ListenAndServe(url, nil) + runtime.SetMutexProfileFraction(_PprofMudex) + log.Infof("Start a pprof server at address %s/debug/pprof", url) + log.Info("Profil can be followed running concurrently the command :") + log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/mutex'") + } + + if options.Called("pprof-goroutine") { + url := "localhost:6060" + go http.ListenAndServe(url, nil) + runtime.SetBlockProfileRate(_PprofGoroutine) + log.Infof("Start a pprof server at address %s/debug/pprof", url) + log.Info("Profil can be followed running concurrently the command :") + log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'") + } + + // Handle user errors + if parseErr != nil { + fmt.Fprintf(os.Stderr, "ERROR: %s\n\n", parseErr) + fmt.Fprint(os.Stderr, 
options.Help(getoptions.HelpSynopsis)) + os.Exit(1) + } + + runtime.GOMAXPROCS(obidefault.MaxCPU()) + + if options.Called("max-cpu") { + log.Printf("CPU number limited to %d", obidefault.MaxCPU()) + } + + if options.Called("no-singleton") { + log.Printf("No singleton option set") + } + + log.Printf("Number of workers set %d", obidefault.ParallelWorkers()) + + if options.Called("solexa") { + obidefault.SetReadQualitiesShift(64) + } +} + +func GenerateOptionParser(program string, + documentation string, + optionset ...func(*getoptions.GetOpt)) ArgumentParser { + + options := getoptions.New() + options.Self(program, documentation) + options.SetMode(getoptions.Bundling) + options.SetUnknownMode(getoptions.Fail) + options.Bool("help", false, options.Alias("h", "?")) + + RegisterGlobalOptions(options) for _, o := range optionset { o(options) } return func(args []string) (*getoptions.GetOpt, []string) { - remaining, err := options.Parse(args[1:]) - - if options.Called("help") { - fmt.Fprint(os.Stderr, options.Help()) - os.Exit(0) - } - - if options.Called("version") { - fmt.Fprintf(os.Stderr, "OBITools %s\n", VersionString()) - os.Exit(0) - } - - if options.Called("taxonomy") { - __defaut_taxonomy_mutex__.Lock() - defer __defaut_taxonomy_mutex__.Unlock() - taxonomy, err := obiformats.LoadTaxonomy( - obidefault.SelectedTaxonomy(), - !obidefault.AreAlternativeNamesSelected(), - SeqAsTaxa(), - ) - - if err != nil { - log.Fatalf("Cannot load default taxonomy: %v", err) - - } - - taxonomy.SetAsDefault() - } - - log.SetLevel(log.InfoLevel) - if options.Called("debug") { - log.SetLevel(log.DebugLevel) - log.Debugln("Switch to debug level logging") - } - - if options.Called("pprof") { - url := "localhost:6060" - go http.ListenAndServe(url, nil) - log.Infof("Start a pprof server at address %s/debug/pprof", url) - log.Info("Profil can be followed running concurrently the command :") - log.Info(" go tool pprof -http=127.0.0.1:8080 
'http://localhost:6060/debug/pprof/profile?seconds=30'") - } - - if options.Called("pprof-mutex") { - url := "localhost:6060" - go http.ListenAndServe(url, nil) - runtime.SetMutexProfileFraction(_PprofMudex) - log.Infof("Start a pprof server at address %s/debug/pprof", url) - log.Info("Profil can be followed running concurrently the command :") - log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/mutex'") - } - - if options.Called("pprof-goroutine") { - url := "localhost:6060" - go http.ListenAndServe(url, nil) - runtime.SetBlockProfileRate(_PprofGoroutine) - log.Infof("Start a pprof server at address %s/debug/pprof", url) - log.Info("Profil can be followed running concurrently the command :") - log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'") - } - - // Handle user errors - if err != nil { - fmt.Fprintf(os.Stderr, "ERROR: %s\n\n", err) - fmt.Fprint(os.Stderr, options.Help(getoptions.HelpSynopsis)) - os.Exit(1) - } - - // // Setup the maximum number of CPU usable by the program - // if obidefault.MaxCPU() == 1 { - // log.Warn("Limitating the Maximum number of CPU to 1 is not recommanded") - // log.Warn("The number of CPU requested has been set to 2") - // obidefault.SetMaxCPU(2) - // } - - // if options.Called("force-one-cpu") { - // log.Warn("Limitating the Maximum number of CPU to 1 is not recommanded") - // log.Warn("The number of CPU has been forced to 1") - // log.Warn("This can lead to unexpected behavior") - // obidefault.SetMaxCPU(1) - // } - - runtime.GOMAXPROCS(obidefault.MaxCPU()) - - // if options.Called("max-cpu") || options.Called("force-one-cpu") { - // log.Printf("CPU number limited to %d", obidefault.MaxCPU()) - // } - - if options.Called("max-cpu") { - log.Printf("CPU number limited to %d", obidefault.MaxCPU()) - } - - if options.Called("no-singleton") { - log.Printf("No singleton option set") - } - - log.Printf("Number of workers set %d", obidefault.ParallelWorkers()) - - // 
if options.Called("workers") { - - // } - - if options.Called("solexa") { - obidefault.SetReadQualitiesShift(64) - } - + ProcessParsedOptions(options, err) return options, remaining } } diff --git a/pkg/obioptions/subcommand.go b/pkg/obioptions/subcommand.go new file mode 100644 index 0000000..bd55d3a --- /dev/null +++ b/pkg/obioptions/subcommand.go @@ -0,0 +1,43 @@ +package obioptions + +import ( + "github.com/DavidGamba/go-getoptions" +) + +// GenerateSubcommandParser creates an option parser that supports subcommands +// via go-getoptions' NewCommand/SetCommandFn/Dispatch API. +// +// The setup function receives the root *GetOpt and should register subcommands +// using opt.NewCommand(). Global options (--debug, --max-cpu, etc.) are +// registered before setup is called and are inherited by all subcommands. +// +// Returns the root *GetOpt (needed for Dispatch) and an ArgumentParser +// that handles parsing and post-parse processing. +func GenerateSubcommandParser( + program string, + documentation string, + setup func(opt *getoptions.GetOpt), +) (*getoptions.GetOpt, ArgumentParser) { + + options := getoptions.New() + options.Self(program, documentation) + options.SetMode(getoptions.Bundling) + options.SetUnknownMode(getoptions.Fail) + + // Register global options (inherited by all subcommands) + RegisterGlobalOptions(options) + + // Let the caller register subcommands + setup(options) + + // Add automatic help subcommand (must be after all commands) + options.HelpCommand("help", options.Description("Show help for a command")) + + parser := func(args []string) (*getoptions.GetOpt, []string) { + remaining, err := options.Parse(args[1:]) + ProcessParsedOptions(options, err) + return options, remaining + } + + return options, parser +} diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 489bd84..70a87f4 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -3,7 +3,7 @@ package obioptions // Version is automatically updated 
by the Makefile from version.txt // The patch number (third digit) is incremented on each push to the repository -var _Version = "Release 4.4.12" +var _Version = "Release 4.4.13" // Version returns the version of the obitools package. // diff --git a/pkg/obitools/obik/cp.go b/pkg/obitools/obik/cp.go new file mode 100644 index 0000000..ada0e61 --- /dev/null +++ b/pkg/obitools/obik/cp.go @@ -0,0 +1,55 @@ +package obik + +import ( + "context" + "fmt" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "github.com/DavidGamba/go-getoptions" +) + +func runCp(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + if len(args) < 2 { + return fmt.Errorf("usage: obik cp [--set PATTERN]... [--force] ") + } + + srcDir := args[0] + destDir := args[1] + + ksg, err := obikmer.OpenKmerSetGroup(srcDir) + if err != nil { + return fmt.Errorf("failed to open source kmer index: %w", err) + } + + // Resolve set patterns + patterns := CLISetPatterns() + var ids []string + if len(patterns) > 0 { + indices, err := ksg.MatchSetIDs(patterns) + if err != nil { + return err + } + if len(indices) == 0 { + return fmt.Errorf("no sets match the given patterns") + } + ids = make([]string, len(indices)) + for i, idx := range indices { + ids[i] = ksg.SetIDOf(idx) + } + } else { + // Copy all sets + ids = ksg.SetsIDs() + } + + log.Infof("Copying %d set(s) from %s to %s", len(ids), srcDir, destDir) + + dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce()) + if err != nil { + return err + } + + log.Infof("Destination now has %d set(s)", dest.Size()) + return nil +} diff --git a/pkg/obitools/obik/filter.go b/pkg/obitools/obik/filter.go new file mode 100644 index 0000000..ee8aad6 --- /dev/null +++ b/pkg/obitools/obik/filter.go @@ -0,0 +1,344 @@ +package obik + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "sync/atomic" + + "github.com/schollz/progressbar/v3" + log "github.com/sirupsen/logrus" + + 
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "github.com/DavidGamba/go-getoptions" +) + +// KmerFilter is a predicate applied to individual k-mers during filtering. +// Returns true if the k-mer should be kept. +type KmerFilter func(kmer uint64) bool + +// KmerFilterFactory creates a new KmerFilter instance. +// Each goroutine should call the factory to get its own filter, +// since some filters (e.g. KmerEntropyFilter) are not thread-safe. +type KmerFilterFactory func() KmerFilter + +// chainFilterFactories combines multiple KmerFilterFactory into one. +// The resulting factory creates a filter that accepts a k-mer only +// if all individual filters accept it. +func chainFilterFactories(factories []KmerFilterFactory) KmerFilterFactory { + switch len(factories) { + case 0: + return func() KmerFilter { return func(uint64) bool { return true } } + case 1: + return factories[0] + default: + return func() KmerFilter { + filters := make([]KmerFilter, len(factories)) + for i, f := range factories { + filters[i] = f() + } + return func(kmer uint64) bool { + for _, f := range filters { + if !f(kmer) { + return false + } + } + return true + } + } + } +} + +// runFilter implements the "obik filter" subcommand. +// It reads an existing kmer index, applies a chain of filters, +// and writes a new filtered index. +func runFilter(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + if len(args) < 1 { + return fmt.Errorf("usage: obik filter [options] --out ") + } + + srcDir := args[0] + destDir := CLIOutputDirectory() + if destDir == "" || destDir == "-" { + return fmt.Errorf("--out option is required and must specify a destination directory") + } + + // Open source index + src, err := obikmer.OpenKmerSetGroup(srcDir) + if err != nil { + return fmt.Errorf("failed to open source index: %w", err) + } + + k := src.K() + + // Build filter factory chain from CLI options. 
+ // Factories are used so each goroutine creates its own filter instance, + // since some filters (e.g. KmerEntropyFilter) have mutable state. + var factories []KmerFilterFactory + var filterDescriptions []string + + // Entropy filter + entropyThreshold := CLIIndexEntropyThreshold() + entropySize := CLIIndexEntropySize() + if entropyThreshold > 0 { + factories = append(factories, func() KmerFilter { + ef := obikmer.NewKmerEntropyFilter(k, entropySize, entropyThreshold) + return ef.Accept + }) + filterDescriptions = append(filterDescriptions, + fmt.Sprintf("entropy(threshold=%.4f, level-max=%d)", entropyThreshold, entropySize)) + } + + // Future filters will be added here, e.g.: + // quorumFilter, frequencyFilter, ... + + if len(factories) == 0 { + return fmt.Errorf("no filter specified; use --entropy-filter or other filter options") + } + + filterFactory := chainFilterFactories(factories) + + // Resolve set selection (default: all sets) + patterns := CLISetPatterns() + var setIndices []int + if len(patterns) > 0 { + setIndices, err = src.MatchSetIDs(patterns) + if err != nil { + return fmt.Errorf("failed to match set patterns: %w", err) + } + if len(setIndices) == 0 { + return fmt.Errorf("no sets match the given patterns") + } + } else { + setIndices = make([]int, src.Size()) + for i := range setIndices { + setIndices[i] = i + } + } + + log.Infof("Filtering %d set(s) from %s with: %s", + len(setIndices), srcDir, strings.Join(filterDescriptions, " + ")) + + // Create destination directory + if err := os.MkdirAll(destDir, 0755); err != nil { + return fmt.Errorf("failed to create destination: %w", err) + } + + P := src.Partitions() + + // Progress bar for partition filtering + totalPartitions := len(setIndices) * P + var bar *progressbar.ProgressBar + if obidefault.ProgressBar() { + pbopt := []progressbar.Option{ + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowCount(), + progressbar.OptionShowIts(), + 
progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription("[Filtering partitions]"), + } + bar = progressbar.NewOptions(totalPartitions, pbopt...) + } + + // Process each selected set + newCounts := make([]uint64, len(setIndices)) + + for si, srcIdx := range setIndices { + setID := src.SetIDOf(srcIdx) + if setID == "" { + setID = fmt.Sprintf("set_%d", srcIdx) + } + + destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", si)) + if err := os.MkdirAll(destSetDir, 0755); err != nil { + return fmt.Errorf("failed to create set directory: %w", err) + } + + // Process partitions in parallel + nWorkers := obidefault.ParallelWorkers() + if nWorkers > P { + nWorkers = P + } + + var totalKept atomic.Uint64 + var totalProcessed atomic.Uint64 + + type job struct { + partIdx int + } + + jobs := make(chan job, P) + var wg sync.WaitGroup + var errMu sync.Mutex + var firstErr error + + for w := 0; w < nWorkers; w++ { + wg.Add(1) + go func() { + defer wg.Done() + // Each goroutine gets its own filter instance + workerFilter := filterFactory() + for j := range jobs { + kept, processed, err := filterPartition( + src.PartitionPath(srcIdx, j.partIdx), + filepath.Join(destSetDir, fmt.Sprintf("part_%04d.kdi", j.partIdx)), + workerFilter, + ) + if err != nil { + errMu.Lock() + if firstErr == nil { + firstErr = err + } + errMu.Unlock() + return + } + totalKept.Add(kept) + totalProcessed.Add(processed) + if bar != nil { + bar.Add(1) + } + } + }() + } + + for p := 0; p < P; p++ { + jobs <- job{p} + } + close(jobs) + wg.Wait() + + if firstErr != nil { + return fmt.Errorf("failed to filter set %q: %w", setID, firstErr) + } + + kept := totalKept.Load() + processed := totalProcessed.Load() + newCounts[si] = kept + log.Infof("Set %q: %d/%d k-mers kept (%.1f%% removed)", + setID, kept, processed, + 100.0*float64(processed-kept)/float64(max(processed, 1))) + + // Copy spectrum.bin if it exists + srcSpecPath := src.SpectrumPath(srcIdx) + if _, err := os.Stat(srcSpecPath); err == 
nil { + destSpecPath := filepath.Join(destSetDir, "spectrum.bin") + if err := copyFileHelper(srcSpecPath, destSpecPath); err != nil { + log.Warnf("Could not copy spectrum for set %q: %v", setID, err) + } + } + } + + if bar != nil { + fmt.Fprintln(os.Stderr) + } + + // Build destination metadata + setsIDs := make([]string, len(setIndices)) + setsMetadata := make([]map[string]interface{}, len(setIndices)) + for i, srcIdx := range setIndices { + setsIDs[i] = src.SetIDOf(srcIdx) + setsMetadata[i] = src.AllSetMetadata(srcIdx) + if setsMetadata[i] == nil { + setsMetadata[i] = make(map[string]interface{}) + } + } + + // Write metadata for the filtered index + dest, err := obikmer.NewFilteredKmerSetGroup( + destDir, k, src.M(), P, + len(setIndices), setsIDs, newCounts, setsMetadata, + ) + if err != nil { + return fmt.Errorf("failed to create filtered metadata: %w", err) + } + + // Copy group-level metadata and record applied filters + for key, value := range src.Metadata { + dest.SetAttribute(key, value) + } + if entropyThreshold > 0 { + dest.SetAttribute("entropy_filter", entropyThreshold) + dest.SetAttribute("entropy_filter_size", entropySize) + } + dest.SetAttribute("filtered_from", srcDir) + + if err := dest.SaveMetadata(); err != nil { + return fmt.Errorf("failed to save metadata: %w", err) + } + + log.Info("Done.") + return nil +} + +// filterPartition reads a single .kdi partition, applies the filter predicate, +// and writes the accepted k-mers to a new .kdi file. +// Returns (kept, processed, error). 
+func filterPartition(srcPath, destPath string, accept KmerFilter) (uint64, uint64, error) { + reader, err := obikmer.NewKdiReader(srcPath) + if err != nil { + // Empty partition — write empty KDI + w, err2 := obikmer.NewKdiWriter(destPath) + if err2 != nil { + return 0, 0, err2 + } + return 0, 0, w.Close() + } + defer reader.Close() + + w, err := obikmer.NewKdiWriter(destPath) + if err != nil { + return 0, 0, err + } + + var kept, processed uint64 + for { + kmer, ok := reader.Next() + if !ok { + break + } + processed++ + if accept(kmer) { + if err := w.Write(kmer); err != nil { + w.Close() + return 0, 0, err + } + kept++ + } + } + + return kept, processed, w.Close() +} + +// copyFileHelper copies a file (used for spectrum.bin etc.) +func copyFileHelper(src, dst string) error { + in, err := os.Open(src) + if err != nil { + return err + } + defer in.Close() + + out, err := os.Create(dst) + if err != nil { + return err + } + defer out.Close() + + buf := make([]byte, 32*1024) + for { + n, readErr := in.Read(buf) + if n > 0 { + if _, writeErr := out.Write(buf[:n]); writeErr != nil { + return writeErr + } + } + if readErr != nil { + break + } + } + return out.Close() +} diff --git a/pkg/obitools/obik/index.go b/pkg/obitools/obik/index.go new file mode 100644 index 0000000..71605c2 --- /dev/null +++ b/pkg/obitools/obik/index.go @@ -0,0 +1,154 @@ +package obik + +import ( + "context" + "fmt" + "os" + "path/filepath" + "sync" + "sync/atomic" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + outDir := CLIOutputDirectory() + if outDir == "" || outDir == "-" { + return 
fmt.Errorf("--out option is required and must specify a directory path") + } + + k := CLIKmerSize() + if k < 2 || k > 31 { + return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k) + } + + m := CLIMinimizerSize() + + minOcc := CLIMinOccurrence() + if minOcc < 1 { + return fmt.Errorf("invalid min-occurrence: %d (must be >= 1)", minOcc) + } + + maxOcc := CLIMaxOccurrence() + + entropyThreshold := CLIIndexEntropyThreshold() + entropySize := CLIIndexEntropySize() + + // Build options + var opts []obikmer.BuilderOption + if minOcc > 1 { + opts = append(opts, obikmer.WithMinFrequency(minOcc)) + } + if maxOcc > 0 { + opts = append(opts, obikmer.WithMaxFrequency(maxOcc)) + } + if topN := CLISaveFreqKmer(); topN > 0 { + opts = append(opts, obikmer.WithSaveFreqKmers(topN)) + } + if entropyThreshold > 0 { + opts = append(opts, obikmer.WithEntropyFilter(entropyThreshold, entropySize)) + } + + // Determine whether to append to existing group or create new + var builder *obikmer.KmerSetGroupBuilder + var err error + metaPath := filepath.Join(outDir, "metadata.toml") + if _, statErr := os.Stat(metaPath); statErr == nil { + // Existing group: append + log.Infof("Appending to existing kmer index at %s", outDir) + builder, err = obikmer.AppendKmerSetGroupBuilder(outDir, 1, opts...) + if err != nil { + return fmt.Errorf("failed to open existing kmer index for appending: %w", err) + } + } else { + // New group + if maxOcc > 0 { + log.Infof("Creating new kmer index: k=%d, m=%d, occurrence=[%d,%d]", k, m, minOcc, maxOcc) + } else { + log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc) + } + builder, err = obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...) + if err != nil { + return fmt.Errorf("failed to create kmer index builder: %w", err) + } + } + + // Read and process sequences in parallel + sequences, err := obiconvert.CLIReadBioSequences(args...) 
+ if err != nil { + return fmt.Errorf("failed to open sequence files: %w", err) + } + + nworkers := obidefault.ParallelWorkers() + var seqCount atomic.Int64 + var wg sync.WaitGroup + + consumer := func(iter obiiter.IBioSequence) { + defer wg.Done() + for iter.Next() { + batch := iter.Get() + for _, seq := range batch.Slice() { + builder.AddSequence(0, seq) + seqCount.Add(1) + } + } + } + + for i := 1; i < nworkers; i++ { + wg.Add(1) + go consumer(sequences.Split()) + } + wg.Add(1) + go consumer(sequences) + wg.Wait() + + log.Infof("Processed %d sequences", seqCount.Load()) + + // Finalize + ksg, err := builder.Close() + if err != nil { + return fmt.Errorf("failed to finalize kmer index: %w", err) + } + + // Apply index-id to the new set + newSetIdx := builder.StartIndex() + if id := CLIIndexId(); id != "" { + ksg.SetSetID(newSetIdx, id) + } + + // Apply group-level tags (-S) + for key, value := range CLISetTag() { + ksg.SetAttribute(key, value) + } + + // Apply per-set tags (-T) to the new set + for key, value := range _setMetaTags { + ksg.SetSetMetadata(newSetIdx, key, value) + } + + if minOcc > 1 { + ksg.SetAttribute("min_occurrence", minOcc) + } + if maxOcc > 0 { + ksg.SetAttribute("max_occurrence", maxOcc) + } + + if entropyThreshold > 0 { + ksg.SetAttribute("entropy_filter", entropyThreshold) + ksg.SetAttribute("entropy_filter_size", entropySize) + } + + if err := ksg.SaveMetadata(); err != nil { + return fmt.Errorf("failed to save metadata: %w", err) + } + + log.Infof("Index contains %d k-mers for set %d in %s", ksg.Len(newSetIdx), newSetIdx, outDir) + log.Info("Done.") + return nil +} diff --git a/pkg/obitools/obilowmask/obilowmask.go b/pkg/obitools/obik/lowmask.go similarity index 72% rename from pkg/obitools/obilowmask/obilowmask.go rename to pkg/obitools/obik/lowmask.go index 893cab2..35c1505 100644 --- a/pkg/obitools/obilowmask/obilowmask.go +++ b/pkg/obitools/obik/lowmask.go @@ -1,39 +1,22 @@ -package obilowmask +package obik import ( + "context" "fmt" 
"math" + log "github.com/sirupsen/logrus" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" + "github.com/DavidGamba/go-getoptions" ) -// MaskingMode defines how to handle low-complexity regions -type MaskingMode int - -const ( - Mask MaskingMode = iota // Mask mode: replace low-complexity regions with masked characters - Split // Split mode: split sequence into high-complexity fragments - Extract -) - -// LowMaskWorker creates a worker to mask low-complexity regions in DNA sequences. -// -// Algorithm principle: -// Calculate the normalized entropy of each k-mer at different scales (wordSize = 1 to level_max). -// K-mers with entropy below the threshold are masked. -// -// Parameters: -// - kmer_size: size of the sliding window for entropy calculation -// - level_max: maximum word size used for entropy calculation (finest scale) -// - threshold: normalized entropy threshold below which masking occurs (between 0 and 1) -// - mode: Mask (masking) or Split (splitting) -// - maskChar: character used for masking (typically 'n' or 'N') -// -// Returns: a SeqWorker function that can be applied to each sequence -func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker { +// lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences. 
+func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte, keepShorter bool) obiseq.SeqWorker { nLogN := make([]float64, kmer_size+1) for i := 1; i <= kmer_size; i++ { @@ -87,6 +70,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking data[i] = deque[0].value } } + emaxValues := make([]float64, level_max+1) logNwords := make([]float64, level_max+1) for ws := 1; ws <= level_max; ws++ { @@ -259,11 +243,14 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking } if inlow && !masked { if fromlow >= 0 { - frg, err := sequence.Subsequence(fromlow, i, false) - if err != nil { - return nil, err + frgLen := i - fromlow + if keepShorter || frgLen >= kmer_size { + frg, err := sequence.Subsequence(fromlow, i, false) + if err != nil { + return nil, err + } + rep.Push(frg) } - rep.Push(frg) } inlow = false fromlow = -1 @@ -271,11 +258,14 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking } if inlow && fromlow >= 0 { - frg, err := sequence.Subsequence(fromlow, len(maskPosition), false) - if err != nil { - return nil, err + frgLen := len(maskPosition) - fromlow + if keepShorter || frgLen >= kmer_size { + frg, err := sequence.Subsequence(fromlow, len(maskPosition), false) + if err != nil { + return nil, err + } + rep.Push(frg) } - rep.Push(frg) } return *rep, nil @@ -293,11 +283,14 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking } if inhigh && masked { if fromhigh >= 0 { - frg, err := sequence.Subsequence(fromhigh, i, false) - if err != nil { - return nil, err + frgLen := i - fromhigh + if keepShorter || frgLen >= kmer_size { + frg, err := sequence.Subsequence(fromhigh, i, false) + if err != nil { + return nil, err + } + rep.Push(frg) } - rep.Push(frg) } inhigh = false fromhigh = -1 @@ -305,11 +298,14 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking } if inhigh && fromhigh >= 0 
{ - frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false) - if err != nil { - return nil, err + frgLen := len(maskPosition) - fromhigh + if keepShorter || frgLen >= kmer_size { + frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false) + if err != nil { + return nil, err + } + rep.Push(frg) } - rep.Push(frg) } return *rep, nil @@ -322,14 +318,22 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking for i := range remove { remove[i] = true } - return applyMaskMode(sequence, remove, maskChar) + switch mode { + case MaskMode: + return applyMaskMode(sequence, remove, maskChar) + case SplitMode: + return selectunmasked(sequence, remove) + case ExtractMode: + return selectMasked(sequence, remove) + } + return nil, fmt.Errorf("unknown mode %d", mode) } bseq := sequence.Sequence() maskPositions := maskAmbiguities(bseq) - mask := make([]int, len(bseq)) + maskFlags := make([]int, len(bseq)) entropies := make([]float64, len(bseq)) for i := range entropies { entropies[i] = 4.0 @@ -343,7 +347,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking for i := range bseq { v := level_max - mask[i] = v + maskFlags[i] = v } for ws := level_max - 1; ws > 0; ws-- { @@ -351,7 +355,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking for i, e2 := range entropies2 { if e2 < entropies[i] { entropies[i] = e2 - mask[i] = ws + maskFlags[i] = ws } } } @@ -367,39 +371,49 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking remove[i] = e <= threshold } - sequence.SetAttribute("mask", mask) + sequence.SetAttribute("mask", maskFlags) sequence.SetAttribute("Entropies", entropies) switch mode { - case Mask: + case MaskMode: return applyMaskMode(sequence, remove, maskChar) - case Split: + case SplitMode: return selectunmasked(sequence, remove) - case Extract: + case ExtractMode: return selectMasked(sequence, remove) } - return nil, fmt.Errorf("Unknown 
mode %d", mode) + return nil, fmt.Errorf("unknown mode %d", mode) } return masking } -// CLISequenceEntropyMasker creates an iterator that applies entropy masking -// to all sequences in an input iterator. -// -// Uses command-line parameters to configure the worker. -func CLISequenceEntropyMasker(iterator obiiter.IBioSequence) obiiter.IBioSequence { - var newIter obiiter.IBioSequence +// runLowmask implements the "obik lowmask" subcommand. +// It masks low-complexity regions in DNA sequences using entropy-based detection. +func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + kmerSize := CLIKmerSize() + levelMax := CLIEntropySize() + threshold := CLIEntropyThreshold() + mode := CLIMaskingMode() + maskChar := CLIMaskingChar() - worker := LowMaskWorker( - CLIKmerSize(), - CLILevelMax(), - CLIThreshold(), - CLIMaskingMode(), - CLIMaskingChar(), - ) + log.Printf("Low-complexity masking: kmer-size=%d, entropy-size=%d, threshold=%.4f", kmerSize, levelMax, threshold) - newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers()) + sequences, err := obiconvert.CLIReadBioSequences(args...) 
+ if err != nil { + return fmt.Errorf("failed to open sequence files: %w", err) + } - return newIter.FilterEmpty() + worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar, CLIKeepShorter()) + + masked := sequences.MakeIWorker( + worker, + false, + obidefault.ParallelWorkers(), + ).FilterEmpty() + + obiconvert.CLIWriteBioSequences(masked, true) + obiutils.WaitForLastPipe() + + return nil } diff --git a/pkg/obitools/obik/ls.go b/pkg/obitools/obik/ls.go new file mode 100644 index 0000000..7a06f43 --- /dev/null +++ b/pkg/obitools/obik/ls.go @@ -0,0 +1,96 @@ +package obik + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "github.com/DavidGamba/go-getoptions" + "gopkg.in/yaml.v3" +) + +type setEntry struct { + Index int `json:"index" yaml:"index"` + ID string `json:"id" yaml:"id"` + Count uint64 `json:"count" yaml:"count"` +} + +func runLs(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + if len(args) < 1 { + return fmt.Errorf("usage: obik ls [options] ") + } + + ksg, err := obikmer.OpenKmerSetGroup(args[0]) + if err != nil { + return fmt.Errorf("failed to open kmer index: %w", err) + } + + // Determine which sets to show + patterns := CLISetPatterns() + var indices []int + if len(patterns) > 0 { + indices, err = ksg.MatchSetIDs(patterns) + if err != nil { + return err + } + } else { + indices = make([]int, ksg.Size()) + for i := range indices { + indices[i] = i + } + } + + entries := make([]setEntry, len(indices)) + for i, idx := range indices { + entries[i] = setEntry{ + Index: idx, + ID: ksg.SetIDOf(idx), + Count: ksg.Len(idx), + } + } + + format := CLIOutFormat() + switch format { + case "json": + return outputLsJSON(entries) + case "yaml": + return outputLsYAML(entries) + case "csv": + return outputLsCSV(entries) + default: + return outputLsCSV(entries) + } +} + +func outputLsCSV(entries []setEntry) error { + fmt.Println("index,id,count") + for _, e 
:= range entries { + // Escape commas in ID if needed + id := e.ID + if strings.ContainsAny(id, ",\"") { + id = "\"" + strings.ReplaceAll(id, "\"", "\"\"") + "\"" + } + fmt.Printf("%d,%s,%d\n", e.Index, id, e.Count) + } + return nil +} + +func outputLsJSON(entries []setEntry) error { + data, err := json.MarshalIndent(entries, "", " ") + if err != nil { + return err + } + fmt.Println(string(data)) + return nil +} + +func outputLsYAML(entries []setEntry) error { + data, err := yaml.Marshal(entries) + if err != nil { + return err + } + fmt.Print(string(data)) + return nil +} diff --git a/pkg/obitools/obik/match.go b/pkg/obitools/obik/match.go new file mode 100644 index 0000000..2ff51bb --- /dev/null +++ b/pkg/obitools/obik/match.go @@ -0,0 +1,221 @@ +package obik + +import ( + "context" + "fmt" + "sync" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" + "github.com/DavidGamba/go-getoptions" +) + +// defaultMatchQueryThreshold is the minimum number of k-mer entries to +// accumulate before launching a MatchBatch. Larger values amortize the +// cost of opening .kdi files across more query k-mers. +const defaultMatchQueryThreshold = 10_000_000 + +// preparedBatch pairs a batch with its pre-computed queries. +type preparedBatch struct { + batch obiiter.BioSequenceBatch + seqs []*obiseq.BioSequence + queries *obikmer.PreparedQueries +} + +// accumulatedWork holds multiple prepared batches whose queries have been +// merged into a single PreparedQueries. 
The flat seqs slice allows +// MatchBatch results (indexed by merged SeqIdx) to be mapped back to +// the original sequences. +type accumulatedWork struct { + batches []obiiter.BioSequenceBatch // original batches in order + seqs []*obiseq.BioSequence // flat: seqs from all batches concatenated + queries *obikmer.PreparedQueries // merged queries with rebased SeqIdx +} + +// runMatch implements the "obik match" subcommand. +// +// Pipeline architecture (no shared mutable state between stages): +// +// [input batches] +// │ Split across nCPU goroutines +// ▼ +// PrepareQueries (CPU, parallel) +// │ preparedCh +// ▼ +// Accumulate & MergeQueries (1 goroutine) +// │ matchCh — fires when totalKmers >= threshold +// ▼ +// MatchBatch + annotate (1 goroutine, internal parallelism per partition) +// │ +// ▼ +// [output batches] +func runMatch(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + indexDir := CLIIndexDirectory() + + // Open the k-mer index + ksg, err := obikmer.OpenKmerSetGroup(indexDir) + if err != nil { + return fmt.Errorf("failed to open kmer index: %w", err) + } + + log.Infof("Opened index: k=%d, m=%d, %d partitions, %d set(s)", + ksg.K(), ksg.M(), ksg.Partitions(), ksg.Size()) + + // Resolve which sets to match against + patterns := CLISetPatterns() + var setIndices []int + if len(patterns) > 0 { + setIndices, err = ksg.MatchSetIDs(patterns) + if err != nil { + return fmt.Errorf("failed to match set patterns: %w", err) + } + if len(setIndices) == 0 { + return fmt.Errorf("no sets match the given patterns") + } + } else { + setIndices = make([]int, ksg.Size()) + for i := range setIndices { + setIndices[i] = i + } + } + + for _, idx := range setIndices { + id := ksg.SetIDOf(idx) + if id == "" { + id = fmt.Sprintf("set_%d", idx) + } + log.Infof("Matching against set %d (%s): %d k-mers", idx, id, ksg.Len(idx)) + } + + // Read input sequences + sequences, err := obiconvert.CLIReadBioSequences(args...) 
+ if err != nil { + return fmt.Errorf("failed to open sequence files: %w", err) + } + + nworkers := obidefault.ParallelWorkers() + + // --- Stage 1: Prepare queries in parallel --- + preparedCh := make(chan preparedBatch, nworkers) + + var prepWg sync.WaitGroup + preparer := func(iter obiiter.IBioSequence) { + defer prepWg.Done() + for iter.Next() { + batch := iter.Get() + slice := batch.Slice() + + seqs := make([]*obiseq.BioSequence, len(slice)) + for i, s := range slice { + seqs[i] = s + } + + pq := ksg.PrepareQueries(seqs) + + preparedCh <- preparedBatch{ + batch: batch, + seqs: seqs, + queries: pq, + } + } + } + + for i := 1; i < nworkers; i++ { + prepWg.Add(1) + go preparer(sequences.Split()) + } + prepWg.Add(1) + go preparer(sequences) + + go func() { + prepWg.Wait() + close(preparedCh) + }() + + // --- Stage 2: Accumulate & merge queries --- + matchCh := make(chan *accumulatedWork, 2) + + go func() { + defer close(matchCh) + + var acc *accumulatedWork + + for pb := range preparedCh { + if acc == nil { + acc = &accumulatedWork{ + batches: []obiiter.BioSequenceBatch{pb.batch}, + seqs: pb.seqs, + queries: pb.queries, + } + } else { + // Merge this batch's queries into the accumulator + obikmer.MergeQueries(acc.queries, pb.queries) + acc.batches = append(acc.batches, pb.batch) + acc.seqs = append(acc.seqs, pb.seqs...) 
+ } + + // Flush when we exceed the threshold + if acc.queries.NKmers >= defaultMatchQueryThreshold { + matchCh <- acc + acc = nil + } + } + + // Flush remaining + if acc != nil { + matchCh <- acc + } + }() + + // --- Stage 3: Match & annotate --- + output := obiiter.MakeIBioSequence() + if sequences.IsPaired() { + output.MarkAsPaired() + } + + output.Add(1) + go func() { + defer output.Done() + + for work := range matchCh { + // Match against each selected set + for _, setIdx := range setIndices { + result := ksg.MatchBatch(setIdx, work.queries) + + setID := ksg.SetIDOf(setIdx) + if setID == "" { + setID = fmt.Sprintf("set_%d", setIdx) + } + attrName := "kmer_matched_" + setID + + for seqIdx, positions := range result { + if len(positions) > 0 { + work.seqs[seqIdx].SetAttribute(attrName, positions) + } + } + } + + // Push annotated batches to output + for _, b := range work.batches { + output.Push(b) + } + + // Help GC + work.seqs = nil + work.queries = nil + } + }() + + go output.WaitAndClose() + + obiconvert.CLIWriteBioSequences(output, true) + obiutils.WaitForLastPipe() + + return nil +} diff --git a/pkg/obitools/obik/mv.go b/pkg/obitools/obik/mv.go new file mode 100644 index 0000000..6aa2dfd --- /dev/null +++ b/pkg/obitools/obik/mv.go @@ -0,0 +1,63 @@ +package obik + +import ( + "context" + "fmt" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "github.com/DavidGamba/go-getoptions" +) + +func runMv(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + if len(args) < 2 { + return fmt.Errorf("usage: obik mv [--set PATTERN]... 
[--force] ") + } + + srcDir := args[0] + destDir := args[1] + + ksg, err := obikmer.OpenKmerSetGroup(srcDir) + if err != nil { + return fmt.Errorf("failed to open source kmer index: %w", err) + } + + // Resolve set patterns + patterns := CLISetPatterns() + var ids []string + if len(patterns) > 0 { + indices, err := ksg.MatchSetIDs(patterns) + if err != nil { + return err + } + if len(indices) == 0 { + return fmt.Errorf("no sets match the given patterns") + } + ids = make([]string, len(indices)) + for i, idx := range indices { + ids[i] = ksg.SetIDOf(idx) + } + } else { + // Move all sets + ids = ksg.SetsIDs() + } + + log.Infof("Moving %d set(s) from %s to %s", len(ids), srcDir, destDir) + + // Copy first + dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce()) + if err != nil { + return err + } + + // Remove from source (in reverse order to avoid renumbering issues) + for i := len(ids) - 1; i >= 0; i-- { + if err := ksg.RemoveSetByID(ids[i]); err != nil { + return fmt.Errorf("failed to remove set %q from source after copy: %w", ids[i], err) + } + } + + log.Infof("Destination now has %d set(s), source has %d set(s)", dest.Size(), ksg.Size()) + return nil +} diff --git a/pkg/obitools/obik/obik.go b/pkg/obitools/obik/obik.go new file mode 100644 index 0000000..28441d4 --- /dev/null +++ b/pkg/obitools/obik/obik.go @@ -0,0 +1,85 @@ +package obik + +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +// OptionSet registers all obik subcommands on the root GetOpt. 
+func OptionSet(opt *getoptions.GetOpt) { + // index: build or extend a kmer index from sequence files + indexCmd := opt.NewCommand("index", "Build a disk-based kmer index from sequence files") + obiconvert.InputOptionSet(indexCmd) + obiconvert.OutputModeOptionSet(indexCmd, false) + KmerIndexOptionSet(indexCmd) + indexCmd.StringMapVar(&_setMetaTags, "tag", 1, 1, + indexCmd.Alias("T"), + indexCmd.ArgName("KEY=VALUE"), + indexCmd.Description("Per-set metadata tag (repeatable).")) + indexCmd.SetCommandFn(runIndex) + + // ls: list sets in a kmer index + lsCmd := opt.NewCommand("ls", "List sets in a kmer index") + OutputFormatOptionSet(lsCmd) + SetSelectionOptionSet(lsCmd) + lsCmd.SetCommandFn(runLs) + + // summary: detailed statistics + summaryCmd := opt.NewCommand("summary", "Show detailed statistics of a kmer index") + OutputFormatOptionSet(summaryCmd) + summaryCmd.BoolVar(&_jaccard, "jaccard", false, + summaryCmd.Description("Compute and display pairwise Jaccard distance matrix.")) + summaryCmd.SetCommandFn(runSummary) + + // cp: copy sets between indices + cpCmd := opt.NewCommand("cp", "Copy sets between kmer indices") + SetSelectionOptionSet(cpCmd) + ForceOptionSet(cpCmd) + cpCmd.SetCommandFn(runCp) + + // mv: move sets between indices + mvCmd := opt.NewCommand("mv", "Move sets between kmer indices") + SetSelectionOptionSet(mvCmd) + ForceOptionSet(mvCmd) + mvCmd.SetCommandFn(runMv) + + // rm: remove sets from an index + rmCmd := opt.NewCommand("rm", "Remove sets from a kmer index") + SetSelectionOptionSet(rmCmd) + rmCmd.SetCommandFn(runRm) + + // spectrum: output k-mer frequency spectrum as CSV + spectrumCmd := opt.NewCommand("spectrum", "Output k-mer frequency spectrum as CSV") + SetSelectionOptionSet(spectrumCmd) + obiconvert.OutputModeOptionSet(spectrumCmd, false) + spectrumCmd.SetCommandFn(runSpectrum) + + // super: extract super k-mers from sequences + superCmd := opt.NewCommand("super", "Extract super k-mers from sequence files") + 
obiconvert.InputOptionSet(superCmd) + obiconvert.OutputOptionSet(superCmd) + SuperKmerOptionSet(superCmd) + superCmd.SetCommandFn(runSuper) + + // lowmask: mask low-complexity regions + lowmaskCmd := opt.NewCommand("lowmask", "Mask low-complexity regions in sequences using entropy") + obiconvert.InputOptionSet(lowmaskCmd) + obiconvert.OutputOptionSet(lowmaskCmd) + LowMaskOptionSet(lowmaskCmd) + lowmaskCmd.SetCommandFn(runLowmask) + + // match: annotate sequences with k-mer match positions from an index + matchCmd := opt.NewCommand("match", "Annotate sequences with k-mer match positions from an index") + IndexDirectoryOptionSet(matchCmd) + obiconvert.InputOptionSet(matchCmd) + obiconvert.OutputOptionSet(matchCmd) + SetSelectionOptionSet(matchCmd) + matchCmd.SetCommandFn(runMatch) + + // filter: filter an index to remove low-complexity k-mers + filterCmd := opt.NewCommand("filter", "Filter a kmer index to remove low-complexity k-mers") + obiconvert.OutputModeOptionSet(filterCmd, false) + EntropyFilterOptionSet(filterCmd) + SetSelectionOptionSet(filterCmd) + filterCmd.SetCommandFn(runFilter) +} diff --git a/pkg/obitools/obik/options.go b/pkg/obitools/obik/options.go new file mode 100644 index 0000000..5ad6d1d --- /dev/null +++ b/pkg/obitools/obik/options.go @@ -0,0 +1,360 @@ +package obik + +import ( + "strings" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +// MaskingMode defines how to handle low-complexity regions +type MaskingMode int + +const ( + MaskMode MaskingMode = iota // Replace low-complexity regions with masked characters + SplitMode // Split sequence into high-complexity fragments + ExtractMode // Extract low-complexity fragments +) + +// Output format flags +var _jsonOutput bool +var _csvOutput bool +var 
_yamlOutput bool + +// Set selection flags +var _setPatterns []string + +// Force flag +var _force bool + +// Jaccard flag +var _jaccard bool + +// Per-set tags for index subcommand +var _setMetaTags = make(map[string]string, 0) + +// ============================== +// Shared kmer options (used by index, super, lowmask) +// ============================== + +var _kmerSize = 31 +var _minimizerSize = -1 // -1 means auto: ceil(k / 2.5) + +// KmerSizeOptionSet registers --kmer-size / -k. +// Shared by index, super, and lowmask subcommands. +func KmerSizeOptionSet(options *getoptions.GetOpt) { + options.IntVar(&_kmerSize, "kmer-size", _kmerSize, + options.Alias("k"), + options.Description("Size of k-mers (must be between 2 and 31).")) +} + +// MinimizerOptionSet registers --minimizer-size / -m. +// Shared by index and super subcommands. +func MinimizerOptionSet(options *getoptions.GetOpt) { + options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize, + options.Alias("m"), + options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5)).")) +} + +// ============================== +// Lowmask-specific options +// ============================== + +var _entropySize = 6 +var _entropyThreshold = 0.5 +var _splitMode = false +var _extractMode = false +var _maskingChar = "." +var _keepShorter = false + +// LowMaskOptionSet registers options specific to low-complexity masking. 
+func LowMaskOptionSet(options *getoptions.GetOpt) { + KmerSizeOptionSet(options) + + options.IntVar(&_entropySize, "entropy-size", _entropySize, + options.Description("Maximum word size considered for entropy estimate.")) + + options.Float64Var(&_entropyThreshold, "threshold", _entropyThreshold, + options.Description("Entropy threshold below which a kmer is masked (0 to 1).")) + + options.BoolVar(&_splitMode, "extract-high", _splitMode, + options.Description("Extract only high-complexity regions.")) + + options.BoolVar(&_extractMode, "extract-low", _extractMode, + options.Description("Extract only low-complexity regions.")) + + options.StringVar(&_maskingChar, "masking-char", _maskingChar, + options.Description("Character used to mask low complexity regions.")) + + options.BoolVar(&_keepShorter, "keep-shorter", _keepShorter, + options.Description("Keep fragments shorter than kmer-size in split/extract mode.")) +} + +// ============================== +// Index-specific options +// ============================== + +var _indexId = "" +var _metadataFormat = "toml" +var _setTag = make(map[string]string, 0) +var _minOccurrence = 1 +var _maxOccurrence = 0 +var _saveFullFilter = false +var _saveFreqKmer = 0 +var _indexEntropyThreshold = 0.0 +var _indexEntropySize = 6 + +// KmerIndexOptionSet defines every option related to kmer index building. 
+func KmerIndexOptionSet(options *getoptions.GetOpt) { + KmerSizeOptionSet(options) + MinimizerOptionSet(options) + + options.StringVar(&_indexId, "index-id", _indexId, + options.Description("Identifier for the kmer index.")) + + options.StringVar(&_metadataFormat, "metadata-format", _metadataFormat, + options.Description("Format for metadata file (toml, yaml, json).")) + + options.StringMapVar(&_setTag, "set-tag", 1, 1, + options.Alias("S"), + options.ArgName("KEY=VALUE"), + options.Description("Adds a group-level metadata attribute KEY with value VALUE.")) + + options.IntVar(&_minOccurrence, "min-occurrence", _minOccurrence, + options.Description("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all).")) + + options.IntVar(&_maxOccurrence, "max-occurrence", _maxOccurrence, + options.Description("Maximum number of occurrences for a k-mer to be kept (default 0 = no upper bound).")) + + options.BoolVar(&_saveFullFilter, "save-full-filter", _saveFullFilter, + options.Description("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index.")) + + options.IntVar(&_saveFreqKmer, "save-freq-kmer", _saveFreqKmer, + options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv).")) + + options.Float64Var(&_indexEntropyThreshold, "entropy-filter", _indexEntropyThreshold, + options.Description("Filter low-complexity k-mers with entropy <= threshold (0 = disabled).")) + + options.IntVar(&_indexEntropySize, "entropy-filter-size", _indexEntropySize, + options.Description("Maximum word size for entropy filter computation (default 6).")) +} + +// EntropyFilterOptionSet registers entropy filter options for commands +// that process existing indices (e.g. filter). 
+func EntropyFilterOptionSet(options *getoptions.GetOpt) { + options.Float64Var(&_indexEntropyThreshold, "entropy-filter", _indexEntropyThreshold, + options.Description("Filter low-complexity k-mers with entropy <= threshold (0 = disabled).")) + + options.IntVar(&_indexEntropySize, "entropy-filter-size", _indexEntropySize, + options.Description("Maximum word size for entropy filter computation (default 6).")) +} + +// ============================== +// Super kmer options +// ============================== + +// SuperKmerOptionSet registers options specific to super k-mer extraction. +func SuperKmerOptionSet(options *getoptions.GetOpt) { + KmerSizeOptionSet(options) + MinimizerOptionSet(options) +} + +// CLIKmerSize returns the k-mer size. +func CLIKmerSize() int { + return _kmerSize +} + +// CLIMinimizerSize returns the effective minimizer size. +func CLIMinimizerSize() int { + m := _minimizerSize + if m < 0 { + m = obikmer.DefaultMinimizerSize(_kmerSize) + } + nworkers := obidefault.ParallelWorkers() + m = obikmer.ValidateMinimizerSize(m, _kmerSize, nworkers) + return m +} + +// CLIIndexId returns the index identifier. +func CLIIndexId() string { + return _indexId +} + +// CLIMetadataFormat returns the metadata format. +func CLIMetadataFormat() obikmer.MetadataFormat { + switch strings.ToLower(_metadataFormat) { + case "toml": + return obikmer.FormatTOML + case "yaml": + return obikmer.FormatYAML + case "json": + return obikmer.FormatJSON + default: + log.Warnf("Unknown metadata format %q, defaulting to TOML", _metadataFormat) + return obikmer.FormatTOML + } +} + +// CLISetTag returns the group-level metadata key=value pairs. +func CLISetTag() map[string]string { + return _setTag +} + +// CLIMinOccurrence returns the minimum occurrence threshold. +func CLIMinOccurrence() int { + return _minOccurrence +} + +// CLIMaxOccurrence returns the maximum occurrence threshold (0 = no upper bound). 
+func CLIMaxOccurrence() int { + return _maxOccurrence +} + +// CLISaveFullFilter returns whether to save the full frequency filter. +func CLISaveFullFilter() bool { + return _saveFullFilter +} + +// CLISaveFreqKmer returns the number of top frequent k-mers to save (0 = disabled). +func CLISaveFreqKmer() int { + return _saveFreqKmer +} + +// CLIOutputDirectory returns the output directory path. +func CLIOutputDirectory() string { + return obiconvert.CLIOutPutFileName() +} + +// SetKmerSize sets the k-mer size (for testing). +func SetKmerSize(k int) { + _kmerSize = k +} + +// SetMinimizerSize sets the minimizer size (for testing). +func SetMinimizerSize(m int) { + _minimizerSize = m +} + +// SetMinOccurrence sets the minimum occurrence (for testing). +func SetMinOccurrence(n int) { + _minOccurrence = n +} + +// CLIMaskingMode returns the masking mode from CLI flags. +func CLIMaskingMode() MaskingMode { + switch { + case _extractMode: + return ExtractMode + case _splitMode: + return SplitMode + default: + return MaskMode + } +} + +// CLIMaskingChar returns the masking character, validated. +func CLIMaskingChar() byte { + mask := strings.TrimSpace(_maskingChar) + if len(mask) != 1 { + log.Fatalf("--masking-char option accepts a single character, not %s", mask) + } + return []byte(mask)[0] +} + +// CLIEntropySize returns the entropy word size. +func CLIEntropySize() int { + return _entropySize +} + +// CLIEntropyThreshold returns the entropy threshold. +func CLIEntropyThreshold() float64 { + return _entropyThreshold +} + +// CLIKeepShorter returns whether to keep short fragments. +func CLIKeepShorter() bool { + return _keepShorter +} + +// ============================== +// Match-specific options +// ============================== + +var _indexDirectory = "" + +// IndexDirectoryOptionSet registers --index / -i (mandatory directory for match). 
+func IndexDirectoryOptionSet(options *getoptions.GetOpt) { + options.StringVar(&_indexDirectory, "index", _indexDirectory, + options.Alias("i"), + options.Required(), + options.ArgName("DIRECTORY"), + options.Description("Path to the kmer index directory.")) +} + +// CLIIndexDirectory returns the --index directory path. +func CLIIndexDirectory() string { + return _indexDirectory +} + +// CLIIndexEntropyThreshold returns the entropy filter threshold for index building (0 = disabled). +func CLIIndexEntropyThreshold() float64 { + return _indexEntropyThreshold +} + +// CLIIndexEntropySize returns the entropy filter word size for index building. +func CLIIndexEntropySize() int { + return _indexEntropySize +} + +// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output. +func OutputFormatOptionSet(options *getoptions.GetOpt) { + options.BoolVar(&_jsonOutput, "json-output", false, + options.Description("Print results as JSON.")) + options.BoolVar(&_csvOutput, "csv-output", false, + options.Description("Print results as CSV.")) + options.BoolVar(&_yamlOutput, "yaml-output", false, + options.Description("Print results as YAML.")) +} + +// CLIOutFormat returns the selected output format: "json", "csv", "yaml", or "text". +func CLIOutFormat() string { + if _jsonOutput { + return "json" + } + if _csvOutput { + return "csv" + } + if _yamlOutput { + return "yaml" + } + return "text" +} + +// SetSelectionOptionSet registers --set (repeatable). +func SetSelectionOptionSet(options *getoptions.GetOpt) { + options.StringSliceVar(&_setPatterns, "set", 1, 1, + options.Alias("s"), + options.ArgName("PATTERN"), + options.Description("Set ID or glob pattern (repeatable, supports *, ?, [...]).")) +} + +// CLISetPatterns returns the --set patterns provided by the user. +func CLISetPatterns() []string { + return _setPatterns +} + +// ForceOptionSet registers --force / -f. 
+func ForceOptionSet(options *getoptions.GetOpt) { + options.BoolVar(&_force, "force", false, + options.Alias("f"), + options.Description("Force operation even if set ID already exists in destination.")) +} + +// CLIForce returns whether --force was specified. +func CLIForce() bool { + return _force +} diff --git a/pkg/obitools/obik/rm.go b/pkg/obitools/obik/rm.go new file mode 100644 index 0000000..77f0f85 --- /dev/null +++ b/pkg/obitools/obik/rm.go @@ -0,0 +1,56 @@ +package obik + +import ( + "context" + "fmt" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "github.com/DavidGamba/go-getoptions" +) + +func runRm(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + if len(args) < 1 { + return fmt.Errorf("usage: obik rm --set PATTERN [--set PATTERN]... ") + } + + patterns := CLISetPatterns() + if len(patterns) == 0 { + return fmt.Errorf("--set is required (specify which sets to remove)") + } + + indexDir := args[0] + + ksg, err := obikmer.OpenKmerSetGroup(indexDir) + if err != nil { + return fmt.Errorf("failed to open kmer index: %w", err) + } + + indices, err := ksg.MatchSetIDs(patterns) + if err != nil { + return err + } + if len(indices) == 0 { + return fmt.Errorf("no sets match the given patterns") + } + + // Collect IDs before removal (indices shift as we remove) + ids := make([]string, len(indices)) + for i, idx := range indices { + ids[i] = ksg.SetIDOf(idx) + } + + log.Infof("Removing %d set(s) from %s", len(ids), indexDir) + + // Remove in reverse order to avoid renumbering issues + for i := len(ids) - 1; i >= 0; i-- { + if err := ksg.RemoveSetByID(ids[i]); err != nil { + return fmt.Errorf("failed to remove set %q: %w", ids[i], err) + } + log.Infof("Removed set %q", ids[i]) + } + + log.Infof("Index now has %d set(s)", ksg.Size()) + return nil +} diff --git a/pkg/obitools/obik/spectrum.go b/pkg/obitools/obik/spectrum.go new file mode 100644 index 0000000..a226b4f --- /dev/null +++ 
b/pkg/obitools/obik/spectrum.go @@ -0,0 +1,121 @@ +package obik + +import ( + "context" + "encoding/csv" + "fmt" + "os" + "strconv" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +// runSpectrum implements the "obik spectrum" subcommand. +// It outputs k-mer frequency spectra as CSV with one column per set. +func runSpectrum(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + if len(args) < 1 { + return fmt.Errorf("usage: obik spectrum [options] ") + } + + ksg, err := obikmer.OpenKmerSetGroup(args[0]) + if err != nil { + return fmt.Errorf("failed to open kmer index: %w", err) + } + + // Determine which sets to include + patterns := CLISetPatterns() + var indices []int + if len(patterns) > 0 { + indices, err = ksg.MatchSetIDs(patterns) + if err != nil { + return fmt.Errorf("failed to match set patterns: %w", err) + } + if len(indices) == 0 { + return fmt.Errorf("no sets match the given patterns") + } + } else { + // All sets + indices = make([]int, ksg.Size()) + for i := range indices { + indices[i] = i + } + } + + // Read spectra for selected sets + spectraMaps := make([]map[int]uint64, len(indices)) + maxFreq := 0 + for i, idx := range indices { + spectrum, err := ksg.Spectrum(idx) + if err != nil { + return fmt.Errorf("failed to read spectrum for set %d: %w", idx, err) + } + if spectrum == nil { + log.Warnf("No spectrum data for set %d (%s)", idx, ksg.SetIDOf(idx)) + spectraMaps[i] = make(map[int]uint64) + continue + } + spectraMaps[i] = spectrum.ToMap() + if mf := spectrum.MaxFrequency(); mf > maxFreq { + maxFreq = mf + } + } + + if maxFreq == 0 { + return fmt.Errorf("no spectrum data found in any selected set") + } + + // Determine output destination + outFile := obiconvert.CLIOutPutFileName() + var w *csv.Writer + if outFile == "" || outFile == "-" { + w = 
csv.NewWriter(os.Stdout) + } else { + f, err := os.Create(outFile) + if err != nil { + return fmt.Errorf("failed to create output file: %w", err) + } + defer f.Close() + w = csv.NewWriter(f) + } + defer w.Flush() + + // Build header: frequency, set_id_1, set_id_2, ... + header := make([]string, 1+len(indices)) + header[0] = "frequency" + for i, idx := range indices { + id := ksg.SetIDOf(idx) + if id == "" { + id = fmt.Sprintf("set_%d", idx) + } + header[i+1] = id + } + if err := w.Write(header); err != nil { + return err + } + + // Write rows for each frequency from 1 to maxFreq + record := make([]string, 1+len(indices)) + for freq := 1; freq <= maxFreq; freq++ { + record[0] = strconv.Itoa(freq) + hasData := false + for i := range indices { + count := spectraMaps[i][freq] + record[i+1] = strconv.FormatUint(count, 10) + if count > 0 { + hasData = true + } + } + // Only write rows where at least one set has a non-zero count + if hasData { + if err := w.Write(record); err != nil { + return err + } + } + } + + return nil +} diff --git a/pkg/obitools/obik/summary.go b/pkg/obitools/obik/summary.go new file mode 100644 index 0000000..3cd20de --- /dev/null +++ b/pkg/obitools/obik/summary.go @@ -0,0 +1,148 @@ +package obik + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "github.com/DavidGamba/go-getoptions" + "gopkg.in/yaml.v3" +) + +type setSummary struct { + Index int `json:"index" yaml:"index"` + ID string `json:"id" yaml:"id"` + Count uint64 `json:"count" yaml:"count"` + DiskSize int64 `json:"disk_bytes" yaml:"disk_bytes"` + Metadata map[string]interface{} `json:"metadata,omitempty" yaml:"metadata,omitempty"` +} + +type groupSummary struct { + Path string `json:"path" yaml:"path"` + ID string `json:"id,omitempty" yaml:"id,omitempty"` + K int `json:"k" yaml:"k"` + M int `json:"m" yaml:"m"` + Partitions int `json:"partitions" yaml:"partitions"` + TotalSets int 
`json:"total_sets" yaml:"total_sets"` + TotalKmers uint64 `json:"total_kmers" yaml:"total_kmers"` + TotalDisk int64 `json:"total_disk_bytes" yaml:"total_disk_bytes"` + Metadata map[string]interface{} `json:"metadata,omitempty" yaml:"metadata,omitempty"` + Sets []setSummary `json:"sets" yaml:"sets"` + Jaccard [][]float64 `json:"jaccard,omitempty" yaml:"jaccard,omitempty"` +} + +func runSummary(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + if len(args) < 1 { + return fmt.Errorf("usage: obik summary [options] ") + } + + ksg, err := obikmer.OpenKmerSetGroup(args[0]) + if err != nil { + return fmt.Errorf("failed to open kmer index: %w", err) + } + + summary := groupSummary{ + Path: ksg.Path(), + ID: ksg.Id(), + K: ksg.K(), + M: ksg.M(), + Partitions: ksg.Partitions(), + TotalSets: ksg.Size(), + TotalKmers: ksg.Len(), + Metadata: ksg.Metadata, + Sets: make([]setSummary, ksg.Size()), + } + + var totalDisk int64 + for i := 0; i < ksg.Size(); i++ { + diskSize := computeSetDiskSize(ksg, i) + totalDisk += diskSize + summary.Sets[i] = setSummary{ + Index: i, + ID: ksg.SetIDOf(i), + Count: ksg.Len(i), + DiskSize: diskSize, + Metadata: ksg.AllSetMetadata(i), + } + } + summary.TotalDisk = totalDisk + + // Jaccard matrix + if _jaccard && ksg.Size() > 1 { + dm := ksg.JaccardDistanceMatrix() + n := ksg.Size() + matrix := make([][]float64, n) + for i := 0; i < n; i++ { + matrix[i] = make([]float64, n) + for j := 0; j < n; j++ { + if i == j { + matrix[i][j] = 0 + } else { + matrix[i][j] = dm.Get(i, j) + } + } + } + summary.Jaccard = matrix + } + + format := CLIOutFormat() + switch format { + case "json": + return outputSummaryJSON(summary) + case "yaml": + return outputSummaryYAML(summary) + case "csv": + return outputSummaryCSV(summary) + default: + return outputSummaryJSON(summary) + } +} + +func computeSetDiskSize(ksg *obikmer.KmerSetGroup, setIndex int) int64 { + var total int64 + for p := 0; p < ksg.Partitions(); p++ { + path := ksg.PartitionPath(setIndex, 
p) + info, err := os.Stat(path) + if err != nil { + continue + } + total += info.Size() + } + // Also count the set directory entry itself + setDir := filepath.Join(ksg.Path(), fmt.Sprintf("set_%d", setIndex)) + entries, err := os.ReadDir(setDir) + if err == nil { + // We already counted .kdi files above; this is just for completeness + _ = entries + } + return total +} + +func outputSummaryJSON(summary groupSummary) error { + data, err := json.MarshalIndent(summary, "", " ") + if err != nil { + return err + } + fmt.Println(string(data)) + return nil +} + +func outputSummaryYAML(summary groupSummary) error { + data, err := yaml.Marshal(summary) + if err != nil { + return err + } + fmt.Print(string(data)) + return nil +} + +func outputSummaryCSV(summary groupSummary) error { + fmt.Println("index,id,count,disk_bytes") + for _, s := range summary.Sets { + fmt.Printf("%d,%s,%d,%d\n", s.Index, s.ID, s.Count, s.DiskSize) + } + return nil +} diff --git a/pkg/obitools/obik/super.go b/pkg/obitools/obik/super.go new file mode 100644 index 0000000..86d0d69 --- /dev/null +++ b/pkg/obitools/obik/super.go @@ -0,0 +1,49 @@ +package obik + +import ( + "context" + "fmt" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" + "github.com/DavidGamba/go-getoptions" +) + +// runSuper implements the "obik super" subcommand. +// It extracts super k-mers from DNA sequences. 
+func runSuper(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + k := CLIKmerSize() + m := CLIMinimizerSize() + + if k < 2 || k > 31 { + return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k) + } + + if m < 1 || m >= k { + return fmt.Errorf("invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1) + } + + log.Printf("Extracting super k-mers with k=%d, m=%d", k, m) + + sequences, err := obiconvert.CLIReadBioSequences(args...) + if err != nil { + return fmt.Errorf("failed to open sequence files: %w", err) + } + + worker := obikmer.SuperKmerWorker(k, m) + + superkmers := sequences.MakeIWorker( + worker, + false, + obidefault.ParallelWorkers(), + ) + + obiconvert.CLIWriteBioSequences(superkmers, true) + obiutils.WaitForLastPipe() + + return nil +} diff --git a/pkg/obitools/obilowmask/entropy.qmd b/pkg/obitools/obilowmask/entropy.qmd deleted file mode 100644 index e851a57..0000000 --- a/pkg/obitools/obilowmask/entropy.qmd +++ /dev/null @@ -1,332 +0,0 @@ -```{r} -library(tidyverse) -``` - -```{r} -x <- sample(1:4096, 29, replace=TRUE) -``` - -```{r} -emax <- function(lseq,word_size) { - nword = lseq - word_size + 1 - nalpha = 4^word_size - - if (nalpha < nword) { - cov = nword %/% nalpha - remains = nword %% nalpha - f1 = cov/nword - f2 = (cov+1)/nword - print(c(nalpha - remains,f1,remains,f2)) - e = -(nalpha - remains) * f1 * log(f1) - - remains * f2 * log(f2) - } else { - e = log(nword) - } - - e -} -``` - -```{r} -ec <- function(data,kmer_size) { - table <- table(data) - s <- sum(table) - e <- sum(table * log(table))/s - ed <- log(s) - e - - em <- emax(s+kmer_size-1,kmer_size) - - ed/em -} -``` - -```{r} -ef <- function(data,kmer_size) { - table <- table(data) - s <- sum(table) - f <- table / s - - f <- as.numeric(f) - f <- f[f > 0] - - em <- emax(s+kmer_size-1,kmer_size) - ed <- -sum(f * log(f)) - - print(c(ed,em,ed/em)) - - ed/em -} -``` - -```{r} -okmer <- function(data,kmer_size) { - 
str_sub(data,1:(nchar(data)-kmer_size+1)) %>% - str_sub(1,kmer_size) -} -``` - -```{r} -# Normalisation circulaire: retourne le plus petit k-mer par rotation circulaire -normalize_circular <- function(kmer) { - if (nchar(kmer) == 0) return(kmer) - - canonical <- kmer - n <- nchar(kmer) - - # Tester toutes les rotations circulaires - for (i in 2:n) { - rotated <- paste0(str_sub(kmer, i, n), str_sub(kmer, 1, i-1)) - if (rotated < canonical) { - canonical <- rotated - } - } - - canonical -} -``` - -```{r} -# Fonction totient d'Euler: compte le nombre d'entiers de 1 à n coprimes avec n -euler_totient <- function(n) { - if (n <= 0) return(0) - - result <- n - p <- 2 - - # Traiter tous les facteurs premiers - while (p * p <= n) { - if (n %% p == 0) { - # Retirer toutes les occurrences de p - while (n %% p == 0) { - n <- n %/% p - } - # Appliquer la formule: φ(n) = n * (1 - 1/p) - result <- result - result %/% p - } - p <- p + 1 - } - - # Si n est toujours > 1, alors c'est un facteur premier - if (n > 1) { - result <- result - result %/% n - } - - result -} -``` - -```{r} -# Retourne tous les diviseurs de n -divisors <- function(n) { - if (n <= 0) return(integer(0)) - - divs <- c() - i <- 1 - while (i * i <= n) { - if (n %% i == 0) { - divs <- c(divs, i) - if (i != n %/% i) { - divs <- c(divs, n %/% i) - } - } - i <- i + 1 - } - - sort(divs) -} -``` - -```{r} -# Compte le nombre de colliers (necklaces) distincts de longueur n -# sur un alphabet de taille a en utilisant la formule de Moreau: -# N(n, a) = (1/n) * Σ φ(d) * a^(n/d) -# où la somme est sur tous les diviseurs d de n, et φ est la fonction totient d'Euler -necklace_count <- function(n, alphabet_size) { - if (n <= 0) return(0) - - divs <- divisors(n) - sum_val <- 0 - - for (d in divs) { - # Calculer alphabet_size^(n/d) - power <- alphabet_size^(n %/% d) - sum_val <- sum_val + euler_totient(d) * power - } - - sum_val %/% n -} -``` - -```{r} -# Nombre de classes d'équivalence pour les k-mers normalisés -# Utilise la 
formule exacte de Moreau pour compter les colliers (necklaces) -n_normalized_kmers <- function(kmer_size) { - # Valeurs exactes pré-calculées pour k=1 à 6 - if (kmer_size == 1) return(4) - if (kmer_size == 2) return(10) - if (kmer_size == 3) return(24) - if (kmer_size == 4) return(70) - if (kmer_size == 5) return(208) - if (kmer_size == 6) return(700) - - # Pour k > 6, utiliser la formule de Moreau (exacte) - # Alphabet ADN a 4 bases - necklace_count(kmer_size, 4) -} -``` - -```{r} -# Entropie maximale pour k-mers normalisés -enmax <- function(lseq, word_size) { - nword = lseq - word_size + 1 - nalpha = n_normalized_kmers(word_size) - - if (nalpha < nword) { - cov = nword %/% nalpha - remains = nword %% nalpha - f1 = cov/nword - f2 = (cov+1)/nword - e = -(nalpha - remains) * f1 * log(f1) - - remains * f2 * log(f2) - } else { - e = log(nword) - } - - e -} -``` - -```{r} -# Entropie normalisée avec normalisation circulaire des k-mers -ecn <- function(data, kmer_size) { - # Normaliser tous les k-mers - normalized_data <- sapply(data, normalize_circular) - - # Calculer la table des fréquences - table <- table(normalized_data) - s <- sum(table) - e <- sum(table * log(table))/s - ed <- log(s) - e - - # Entropie maximale avec normalisation - em <- enmax(s + kmer_size - 1, kmer_size) - - ed/em -} -``` - -```{r} -k<-'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' -ec(okmer(k,1),1) -ec(okmer(k,2),2) -ec(okmer(k,3),3) -ec(okmer(k,4),4) -``` - -```{r} -k<-'atatatatatatatatatatatatatatata' -ef(okmer(k,1),1) -ef(okmer(k,2),2) -ef(okmer(k,3),3) -ef(okmer(k,4),4) -``` - -```{r} -k<-'aaaaaaaaaaaaaaaattttttttttttttt' -ef(okmer(k,1),1) -ef(okmer(k,2),2) -ef(okmer(k,3),3) -ef(okmer(k,4),4) -``` - -```{r} -k<-'atgatgatgatgatgatgatgatgatgatga' -ef(okmer(k,1),1) -ef(okmer(k,2),2) -ef(okmer(k,3),3) -ef(okmer(k,4),4) -``` - -```{r} -k<-'atcgatcgatcgatcgatcgatcgatcgact' -ecn(okmer(k,1),1) -ecn(okmer(k,2),2) -ecn(okmer(k,3),3) -ecn(okmer(k,4),4) -``` - -```{r} 
-k<-paste(sample(rep(c("a","c","g","t"),8),31),collapse="") -k <- "actatggcaagtcgtaaccgcgcttatcagg" -ecn(okmer(k,1),1) -ecn(okmer(k,2),2) -ecn(okmer(k,3),3) -ecn(okmer(k,4),4) -``` - -aattaaaaaaacaagataaaataatattttt - -```{r} -k<-'aattaaaaaaacaagataaaataatattttt' -ecn(okmer(k,1),1) -ecn(okmer(k,2),2) -ecn(okmer(k,3),3) -ecn(okmer(k,4),4) -``` - -atg tga gat ,,,, - -cat tca atc - -tgatgatgatgatgatgatgatgatgatg - -## Tests de normalisation circulaire - -```{r} -# Test de la fonction de normalisation -normalize_circular("ca") # devrait donner "ac" -normalize_circular("tgca") # devrait donner "atgc" -normalize_circular("acgt") # devrait donner "acgt" -``` - -```{r} -# Comparaison ec vs ecn sur une séquence répétitive -# Les k-mers "atg", "tga", "gat" sont équivalents par rotation -k <- 'atgatgatgatgatgatgatgatgatgatga' -cat("Séquence:", k, "\n") -cat("ec(k,3) =", ec(okmer(k,3),3), "\n") -cat("ecn(k,3) =", ecn(okmer(k,3),3), "\n") -``` - -```{r} -# Comparaison sur séquence aléatoire -k <- "actatggcaagtcgtaaccgcgcttatcagg" -cat("Séquence:", k, "\n") -cat("Sans normalisation:\n") -cat(" ec(k,2) =", ec(okmer(k,2),2), "\n") -cat(" ec(k,3) =", ec(okmer(k,3),3), "\n") -cat(" ec(k,4) =", ec(okmer(k,4),4), "\n") -cat("Avec normalisation circulaire:\n") -cat(" ecn(k,2) =", ecn(okmer(k,2),2), "\n") -cat(" ecn(k,3) =", ecn(okmer(k,3),3), "\n") -cat(" ecn(k,4) =", ecn(okmer(k,4),4), "\n") -``` - -```{r} - -sequence <- "ttcatcactcagcaatcctgaatgatGAGAGCTTTTTTTTTTTATATATATATATATGTATATGTATGAAATACACTtatgctccgtttgtttcgccgtaa" -re <- 
rev(c(0.8108602271901116,0.8108602271901116,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.7800272339058549,0.7800272339058549,0.7751610144606091,0.7751610144606091,0.7751610144606091,0.764858185548322,0.7325526601302021,0.7137620699527615,0.6789199521982864,0.6584536373623372,0.634002687184193,0.6075290415873623,0.5785545803330997,0.5785545803330997,0.5503220289212184,0.5315314387437778,0.4966893209893028,0.46077361820145696,0.42388221293245526,0.4009547969713408,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.390029016052137,0.42781461756157363,0.45192285937059073,0.47238917420654,0.47238917420654,0.47238917420654,0.5092805794755417,0.5451962822633876,0.5800384000178626,0.602395141014297,0.6046146614886381,0.6046146614886381,0.6119084258128231,0.6119084258128231,0.6214217106113492,0.6424704346756562,0.6482381543085467,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6828444721058894,0.6950205907027562,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.7208976112999935)) - -di <- 
c(0.7208976112999935,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6950205907027562,0.6828444721058894,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6482381543085467,0.6424704346756562,0.6214217106113492,0.6119084258128231,0.6119084258128231,0.6046146614886382,0.6046146614886382,0.6023951410142971,0.5800384000178627,0.5451962822633876,0.5092805794755418,0.47238917420654003,0.47238917420654003,0.47238917420654003,0.4519228593705908,0.4278146175615737,0.39002901605213713,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677883,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.40095479697134073,0.42388221293245526,0.46077361820145696,0.4966893209893028,0.5315314387437778,0.5503220289212184,0.5785545803330997,0.5785545803330997,0.6075290415873625,0.6340026871841933,0.6584536373623374,0.6789199521982866,0.7137620699527616,0.7325526601302023,0.7648581855483221,0.7751610144606093,0.7751610144606093,0.7751610144606093,0.7800272339058549,0.7800272339058549,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8108602271901116,0.8108602271901116) - -ebidir <- tibble(direct=di,reverse=re) %>% - mutate(position = 
1:length(re), - nucleotide = str_sub(sequence,position,position)) - -ebidir %>% - ggplot(aes(x=position,y=direct)) + - geom_line() + - scale_x_continuous(breaks = ebidir$position, labels = ebidir$nucleotide) + - ylim(0,1)+ - geom_hline(yintercept=0.5, col = "red", linetype = "dashed") -``` \ No newline at end of file diff --git a/pkg/obitools/obilowmask/obilowmask_test.go b/pkg/obitools/obilowmask/obilowmask_test.go deleted file mode 100644 index 4afd764..0000000 --- a/pkg/obitools/obilowmask/obilowmask_test.go +++ /dev/null @@ -1,40 +0,0 @@ -package obilowmask - -import ( - "testing" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" -) - -func TestLowMaskWorker(t *testing.T) { - worker := LowMaskWorker(31, 6, 0.3, Mask, 'n') - - seq := obiseq.NewBioSequence("test", []byte("acgtacgtacgtacgtacgtacgtacgtacgt"), "test") - result, err := worker(seq) - if err != nil { - t.Fatalf("Worker failed: %v", err) - } - - if result.Len() != 1 { - t.Fatalf("Expected 1 sequence, got %d", result.Len()) - } - - resultSeq := result[0] - if resultSeq.Len() != 32 { - t.Fatalf("Expected sequence length 32, got %d", resultSeq.Len()) - } -} - -func TestLowMaskWorkerWithAmbiguity(t *testing.T) { - worker := LowMaskWorker(31, 6, 0.3, Mask, 'n') - - seq := obiseq.NewBioSequence("test", []byte("acgtNcgtacgtacgtacgtacgtacgtacgt"), "test") - result, err := worker(seq) - if err != nil { - t.Fatalf("Worker failed: %v", err) - } - - if result.Len() != 1 { - t.Fatalf("Expected 1 sequence, got %d", result.Len()) - } -} diff --git a/pkg/obitools/obilowmask/options.go b/pkg/obitools/obilowmask/options.go deleted file mode 100644 index 30c1408..0000000 --- a/pkg/obitools/obilowmask/options.go +++ /dev/null @@ -1,81 +0,0 @@ -package obilowmask - -import ( - "strings" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "github.com/DavidGamba/go-getoptions" - - log "github.com/sirupsen/logrus" -) - -var __kmer_size__ = 31 -var __level_max__ = 6 -var 
__threshold__ = 0.5 -var __split_mode__ = false -var __low_mode__ = false -var __mask__ = "." - -func LowMaskOptionSet(options *getoptions.GetOpt) { - - options.IntVar(&__kmer_size__, "kmer-size", __kmer_size__, - options.Description("Size of the kmer considered to estimate entropy."), - ) - - options.IntVar(&__level_max__, "entropy_size", __level_max__, - options.Description("Maximum word size considered for entropy estimate"), - ) - - options.Float64Var(&__threshold__, "threshold", __threshold__, - options.Description("entropy theshold used to mask a kmer"), - ) - - options.BoolVar(&__split_mode__, "split-mode", __split_mode__, - options.Description("in split mode, input sequences are splitted to remove masked regions"), - ) - - options.BoolVar(&__low_mode__, "low-mode", __low_mode__, - options.Description("in split mode, input sequences are splitted to remove masked regions"), - ) - - options.StringVar(&__mask__, "masking-char", __mask__, - options.Description("Character used to mask low complexity region"), - ) -} - -func OptionSet(options *getoptions.GetOpt) { - LowMaskOptionSet(options) - obiconvert.InputOptionSet(options) - obiconvert.OutputOptionSet(options) -} - -func CLIKmerSize() int { - return __kmer_size__ -} - -func CLILevelMax() int { - return __level_max__ -} - -func CLIThreshold() float64 { - return __threshold__ -} - -func CLIMaskingMode() MaskingMode { - switch { - case __low_mode__: - return Extract - case __split_mode__: - return Split - default: - return Mask - } -} - -func CLIMaskingChar() byte { - mask := strings.TrimSpace(__mask__) - if len(mask) != 1 { - log.Fatalf("--masking-char option accept a single character, not %s", mask) - } - return []byte(mask)[0] -} diff --git a/pkg/obitools/obisuperkmer/obisuperkmer.go b/pkg/obitools/obisuperkmer/obisuperkmer.go deleted file mode 100644 index c332564..0000000 --- a/pkg/obitools/obisuperkmer/obisuperkmer.go +++ /dev/null @@ -1,10 +0,0 @@ -// obisuperkmer function utility package. 
-// -// The obitools/obisuperkmer package contains every -// function specifically required by the obisuperkmer utility. -// -// The obisuperkmer command extracts super k-mers from DNA sequences. -// A super k-mer is a maximal subsequence where all consecutive k-mers -// share the same minimizer. This decomposition is useful for efficient -// k-mer indexing and analysis in bioinformatics applications. -package obisuperkmer diff --git a/pkg/obitools/obisuperkmer/options.go b/pkg/obitools/obisuperkmer/options.go deleted file mode 100644 index 2f25a8e..0000000 --- a/pkg/obitools/obisuperkmer/options.go +++ /dev/null @@ -1,69 +0,0 @@ -package obisuperkmer - -import ( - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "github.com/DavidGamba/go-getoptions" -) - -// Private variables for storing option values -var _KmerSize = 31 -var _MinimizerSize = 13 - -// SuperKmerOptionSet defines every option related to super k-mer extraction. -// -// The function adds to a CLI every option proposed to the user -// to tune the parameters of the super k-mer extraction algorithm. -// -// Parameters: -// - options: is a pointer to a getoptions.GetOpt instance normally -// produced by the obioptions.GenerateOptionParser function. -func SuperKmerOptionSet(options *getoptions.GetOpt) { - options.IntVar(&_KmerSize, "kmer-size", _KmerSize, - options.Alias("k"), - options.Description("Size of k-mers (must be between m+1 and 31).")) - - options.IntVar(&_MinimizerSize, "minimizer-size", _MinimizerSize, - options.Alias("m"), - options.Description("Size of minimizers (must be between 1 and k-1).")) -} - -// OptionSet adds to the basic option set every option declared for -// the obisuperkmer command. -// -// It takes a pointer to a GetOpt struct as its parameter and does not return anything. 
-func OptionSet(options *getoptions.GetOpt) { - obiconvert.OptionSet(false)(options) - SuperKmerOptionSet(options) -} - -// CLIKmerSize returns the k-mer size to use for super k-mer extraction. -// -// It does not take any parameters. -// It returns an integer representing the k-mer size. -func CLIKmerSize() int { - return _KmerSize -} - -// SetKmerSize sets the k-mer size for super k-mer extraction. -// -// Parameters: -// - k: the k-mer size (must be between m+1 and 31). -func SetKmerSize(k int) { - _KmerSize = k -} - -// CLIMinimizerSize returns the minimizer size to use for super k-mer extraction. -// -// It does not take any parameters. -// It returns an integer representing the minimizer size. -func CLIMinimizerSize() int { - return _MinimizerSize -} - -// SetMinimizerSize sets the minimizer size for super k-mer extraction. -// -// Parameters: -// - m: the minimizer size (must be between 1 and k-1). -func SetMinimizerSize(m int) { - _MinimizerSize = m -} diff --git a/pkg/obitools/obisuperkmer/superkmer.go b/pkg/obitools/obisuperkmer/superkmer.go deleted file mode 100644 index d4bcf2e..0000000 --- a/pkg/obitools/obisuperkmer/superkmer.go +++ /dev/null @@ -1,59 +0,0 @@ -package obisuperkmer - -import ( - log "github.com/sirupsen/logrus" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" -) - -// CLIExtractSuperKmers extracts super k-mers from an iterator of BioSequences. -// -// This function takes an iterator of BioSequence objects, extracts super k-mers -// from each sequence using the k-mer and minimizer sizes specified by CLI options, -// and returns a new iterator yielding the extracted super k-mers as BioSequence objects. -// -// Each super k-mer is a maximal subsequence where all consecutive k-mers share -// the same minimizer. 
The resulting BioSequences contain metadata including: -// - minimizer_value: the canonical minimizer value -// - minimizer_seq: the DNA sequence of the minimizer -// - k: the k-mer size used -// - m: the minimizer size used -// - start: starting position in the original sequence -// - end: ending position in the original sequence -// - parent_id: ID of the parent sequence -// -// Parameters: -// - iterator: an iterator yielding BioSequence objects to process. -// -// Returns: -// - An iterator yielding BioSequence objects representing super k-mers. -func CLIExtractSuperKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence { - // Get k-mer and minimizer sizes from CLI options - k := CLIKmerSize() - m := CLIMinimizerSize() - - // Validate parameters - if m < 1 || m >= k { - log.Fatalf("Invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1) - } - - if k < 2 || k > 31 { - log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k) - } - - log.Printf("Extracting super k-mers with k=%d, m=%d", k, m) - - // Create the worker for super k-mer extraction - worker := obikmer.SuperKmerWorker(k, m) - - // Apply the worker to the iterator with parallel processing - newIter := iterator.MakeIWorker( - worker, - false, // don't merge results - obidefault.ParallelWorkers(), - ) - - return newIter -} diff --git a/pkg/obitools/obisuperkmer/superkmer_test.go b/pkg/obitools/obisuperkmer/superkmer_test.go deleted file mode 100644 index 2d94019..0000000 --- a/pkg/obitools/obisuperkmer/superkmer_test.go +++ /dev/null @@ -1,149 +0,0 @@ -package obisuperkmer - -import ( - "testing" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" -) - -func TestCLIExtractSuperKmers(t *testing.T) { - // Create a test sequence - testSeq := obiseq.NewBioSequence( - "test_seq", - []byte("ACGTACGTACGTACGTACGTACGTACGTACGT"), - "", - ) - - // Create a batch with the test sequence - batch := 
obiseq.NewBioSequenceBatch() - batch.Add(testSeq) - - // Create an iterator from the batch - iterator := obiiter.MakeBioSequenceBatchChannel(1) - go func() { - iterator.Push(batch) - iterator.Close() - }() - - // Set test parameters - SetKmerSize(15) - SetMinimizerSize(7) - - // Extract super k-mers - result := CLIExtractSuperKmers(iterator) - - // Count the number of super k-mers - count := 0 - for result.Next() { - batch := result.Get() - for _, sk := range batch.Slice() { - count++ - - // Verify that the super k-mer has the expected attributes - if !sk.HasAttribute("minimizer_value") { - t.Error("Super k-mer missing 'minimizer_value' attribute") - } - if !sk.HasAttribute("minimizer_seq") { - t.Error("Super k-mer missing 'minimizer_seq' attribute") - } - if !sk.HasAttribute("k") { - t.Error("Super k-mer missing 'k' attribute") - } - if !sk.HasAttribute("m") { - t.Error("Super k-mer missing 'm' attribute") - } - if !sk.HasAttribute("start") { - t.Error("Super k-mer missing 'start' attribute") - } - if !sk.HasAttribute("end") { - t.Error("Super k-mer missing 'end' attribute") - } - if !sk.HasAttribute("parent_id") { - t.Error("Super k-mer missing 'parent_id' attribute") - } - - // Verify attribute values - k, _ := sk.GetIntAttribute("k") - m, _ := sk.GetIntAttribute("m") - - if k != 15 { - t.Errorf("Expected k=15, got k=%d", k) - } - if m != 7 { - t.Errorf("Expected m=7, got m=%d", m) - } - - parentID, _ := sk.GetStringAttribute("parent_id") - if parentID != "test_seq" { - t.Errorf("Expected parent_id='test_seq', got '%s'", parentID) - } - } - } - - if count == 0 { - t.Error("No super k-mers were extracted") - } - - t.Logf("Extracted %d super k-mers from test sequence", count) -} - -func TestOptionGettersAndSetters(t *testing.T) { - // Test initial values - if CLIKmerSize() != 21 { - t.Errorf("Expected default k-mer size 21, got %d", CLIKmerSize()) - } - if CLIMinimizerSize() != 11 { - t.Errorf("Expected default minimizer size 11, got %d", CLIMinimizerSize()) - } - 
- // Test setters - SetKmerSize(25) - SetMinimizerSize(13) - - if CLIKmerSize() != 25 { - t.Errorf("SetKmerSize failed: expected 25, got %d", CLIKmerSize()) - } - if CLIMinimizerSize() != 13 { - t.Errorf("SetMinimizerSize failed: expected 13, got %d", CLIMinimizerSize()) - } - - // Reset to defaults - SetKmerSize(21) - SetMinimizerSize(11) -} - -func BenchmarkCLIExtractSuperKmers(b *testing.B) { - // Create a longer test sequence - longSeq := make([]byte, 1000) - bases := []byte{'A', 'C', 'G', 'T'} - for i := range longSeq { - longSeq[i] = bases[i%4] - } - - testSeq := obiseq.NewBioSequence("bench_seq", longSeq, "") - - // Set parameters - SetKmerSize(21) - SetMinimizerSize(11) - - b.ResetTimer() - - for i := 0; i < b.N; i++ { - batch := obiseq.NewBioSequenceBatch() - batch.Add(testSeq) - - iterator := obiiter.MakeBioSequenceBatchChannel(1) - go func() { - iterator.Push(batch) - iterator.Close() - }() - - result := CLIExtractSuperKmers(iterator) - - // Consume the iterator - for result.Next() { - result.Get() - } - } -} diff --git a/version.txt b/version.txt index 64d67ff..0208476 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -4.4.12 +4.4.13