Merge pull request #101 from metabarcoding/push-klzowrsmmnyv

Dynamic Batch Flushing and Build Improvements
2026-03-25 13:30:52 +00:00 · 2026-03-16 22:29:29 +01:00 · 2026-03-16 22:06:51 +01:00 · 2026-03-16 22:06:44 +01:00 · 2026-03-14 12:21:34 +01:00 · 2026-03-14 11:59:15 +01:00
114 changed files with 12941 additions and 5029 deletions
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -16,7 +16,7 @@ jobs:
      - name: Setup Go
        uses: actions/setup-go@v5
        with:
-          go-version: "1.23"
+          go-version: "1.26"
      - name: Checkout obitools4 project
        uses: actions/checkout@v4
      - name: Run tests
@@ -32,12 +32,11 @@ jobs:
            goos: linux
            goarch: amd64
            output_name: linux_amd64
-          - os: ubuntu-latest
+          - os: ubuntu-24.04-arm
            goos: linux
            goarch: arm64
            output_name: linux_arm64
-            cross_compile: true
-          - os: macos-latest
+          - os: macos-15-intel
            goos: darwin
            goarch: amd64
            output_name: darwin_amd64
@@ -55,7 +54,7 @@ jobs:
      - name: Setup Go
        uses: actions/setup-go@v5
        with:
-          go-version: "1.23"
+          go-version: "1.26"

      - name: Extract version from tag
        id: get_version
@@ -63,25 +62,38 @@ jobs:
          TAG=${GITHUB_REF#refs/tags/Release_}
          echo "version=$TAG" >> $GITHUB_OUTPUT

-      - name: Install cross-compilation tools (Linux ARM64 only)
-        if: matrix.cross_compile
+      - name: Install build tools (macOS)
+        if: runner.os == 'macOS'
        run: |
-          sudo apt-get update
-          sudo apt-get install -y gcc-aarch64-linux-gnu
+          # Ensure Xcode Command Line Tools are installed
+          xcode-select --install 2>/dev/null || true
+          xcode-select -p

-      - name: Build binaries
+      - name: Build binaries (Linux)
+        if: runner.os == 'Linux'
+        env:
+          VERSION: ${{ steps.get_version.outputs.version }}
+        run: |
+          docker run --rm \
+            -v "$(pwd):/src" \
+            -w /src \
+            -e VERSION="${VERSION}" \
+            golang:1.26-alpine \
+            sh -c "apk add --no-cache gcc musl-dev zlib-dev zlib-static make && \
+                   make LDFLAGS='-linkmode=external -extldflags=-static' obitools"
+          mkdir -p artifacts
+          tar -czf artifacts/obitools4_${VERSION}_${{ matrix.output_name }}.tar.gz -C build .
+
+      - name: Build binaries (macOS)
+        if: runner.os == 'macOS'
        env:
          GOOS: ${{ matrix.goos }}
          GOARCH: ${{ matrix.goarch }}
-          CC: ${{ matrix.cross_compile && 'aarch64-linux-gnu-gcc' || '' }}
          VERSION: ${{ steps.get_version.outputs.version }}
        run: |
          make obitools
          mkdir -p artifacts
-          cd build
-          for binary in *; do
-            tar -czf ../artifacts/${binary}_${VERSION}_${{ matrix.output_name }}.tar.gz ${binary}
-          done
+          tar -czf artifacts/obitools4_${VERSION}_${{ matrix.output_name }}.tar.gz -C build .

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@@ -139,29 +151,29 @@ jobs:
          echo "" >> release_notes.md
          echo "## Installation" >> release_notes.md
          echo "" >> release_notes.md
-          echo "Download the appropriate binary for your system and extract it:" >> release_notes.md
+          echo "Download the appropriate archive for your system and extract it:" >> release_notes.md
          echo "" >> release_notes.md
          echo "### Linux (AMD64)" >> release_notes.md
          echo '```bash' >> release_notes.md
-          echo "tar -xzf <tool>_${VERSION}_linux_amd64.tar.gz" >> release_notes.md
+          echo "tar -xzf obitools4_${VERSION}_linux_amd64.tar.gz" >> release_notes.md
          echo '```' >> release_notes.md
          echo "" >> release_notes.md
          echo "### Linux (ARM64)" >> release_notes.md
          echo '```bash' >> release_notes.md
-          echo "tar -xzf <tool>_${VERSION}_linux_arm64.tar.gz" >> release_notes.md
+          echo "tar -xzf obitools4_${VERSION}_linux_arm64.tar.gz" >> release_notes.md
          echo '```' >> release_notes.md
          echo "" >> release_notes.md
          echo "### macOS (Intel)" >> release_notes.md
          echo '```bash' >> release_notes.md
-          echo "tar -xzf <tool>_${VERSION}_darwin_amd64.tar.gz" >> release_notes.md
+          echo "tar -xzf obitools4_${VERSION}_darwin_amd64.tar.gz" >> release_notes.md
          echo '```' >> release_notes.md
          echo "" >> release_notes.md
          echo "### macOS (Apple Silicon)" >> release_notes.md
          echo '```bash' >> release_notes.md
-          echo "tar -xzf <tool>_${VERSION}_darwin_arm64.tar.gz" >> release_notes.md
+          echo "tar -xzf obitools4_${VERSION}_darwin_arm64.tar.gz" >> release_notes.md
          echo '```' >> release_notes.md
          echo "" >> release_notes.md
-          echo "Available tools: Replace \`<tool>\` with one of the obitools commands." >> release_notes.md
+          echo "All OBITools4 binaries are included in each archive." >> release_notes.md

      - name: Create GitHub Release
        uses: softprops/action-gh-release@v1
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@
 **/*.tgz
 **/*.yaml
 **/*.csv
+**/*.pb.gz
 xx

 .rhistory
@@ -31,3 +32,6 @@ LLM/**
 *_files

 entropy.html
+bug_id.txt
+obilowmask_ref
+test_*
--- a/115
+++ b/115
@@ -2,9 +2,17 @@
 #export GOBIN=$(GOPATH)/bin
 #export PATH=$(GOBIN):$(shell echo $${PATH})

+.DEFAULT_GOAL := all
+
+GREEN  := \033[0;32m
+YELLOW := \033[0;33m
+BLUE   := \033[0;34m
+NC     := \033[0m
+
 GOFLAGS=
+LDFLAGS=
 GOCMD=go
-GOBUILD=$(GOCMD) build $(GOFLAGS)
+GOBUILD=$(GOCMD) build $(GOFLAGS) $(if $(LDFLAGS),-ldflags="$(LDFLAGS)")
 GOGENERATE=$(GOCMD) generate
 GOCLEAN=$(GOCMD) clean
 GOTEST=$(GOCMD) test
@@ -43,7 +51,7 @@ $(OBITOOLS_PREFIX)$(notdir $(1)): $(BUILD_DIR) $(1) pkg/obioptions/version.go
 	@echo -n - Building obitool $(notdir $(1))...
 	@$(GOBUILD)  -o $(BUILD_DIR)/$(OBITOOLS_PREFIX)$(notdir $(1)) ./$(1) \
 	             2> $(OBITOOLS_PREFIX)$(notdir $(1)).log \
-				 || cat $(OBITOOLS_PREFIX)$(notdir $(1)).log
+				 || { cat $(OBITOOLS_PREFIX)$(notdir $(1)).log; rm -f $(OBITOOLS_PREFIX)$(notdir $(1)).log; exit 1; }
 	@rm -f $(OBITOOLS_PREFIX)$(notdir $(1)).log
 	@echo Done.
 endef
@@ -60,6 +68,28 @@ endif

 OUTPUT:=$(shell mktemp)

+help:
+	@printf "$(GREEN)OBITools4 Makefile$(NC)\n\n"
+	@printf "$(BLUE)Main targets:$(NC)\n"
+	@printf "  %-20s %s\n" "all"          "Build all obitools (default)"
+	@printf "  %-20s %s\n" "obitools"     "Build all obitools binaries to build/"
+	@printf "  %-20s %s\n" "test"         "Run Go unit tests"
+	@printf "  %-20s %s\n" "obitests"     "Run integration tests (obitests/)"
+	@printf "  %-20s %s\n" "bump-version" "Increment patch version (or set with VERSION=x.y.z)"
+	@printf "  %-20s %s\n" "update-deps"  "Update all Go dependencies"
+	@printf "\n$(BLUE)Jujutsu workflow:$(NC)\n"
+	@printf "  %-20s %s\n" "jjnew"        "Document current commit and start a new one"
+	@printf "  %-20s %s\n" "jjpush"       "Release: describe, bump, generate notes, push PR, tag (VERSION=x.y.z optional)"
+	@printf "  %-20s %s\n" "jjfetch"      "Fetch latest commits from origin"
+	@printf "\n$(BLUE)Required tools:$(NC)\n"
+	@printf "  %-20s " "go";      command -v go      >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(go version)" || printf "$(YELLOW)✗ not found$(NC)\n"
+	@printf "  %-20s " "git";     command -v git     >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(git --version)" || printf "$(YELLOW)✗ not found$(NC)\n"
+	@printf "  %-20s " "jj";      command -v jj      >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jj --version)" || printf "$(YELLOW)✗ not found$(NC)\n"
+	@printf "  %-20s " "gh";      command -v gh      >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(gh --version | head -1)" || printf "$(YELLOW)✗ not found$(NC)  (brew install gh)\n"
+	@printf "\n$(BLUE)Optional tools (release notes generation):$(NC)\n"
+	@printf "  %-20s " "aichat";  command -v aichat  >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(aichat --version)" || printf "$(YELLOW)✗ not found$(NC)  (https://github.com/sigoden/aichat)\n"
+	@printf "  %-20s " "jq";      command -v jq      >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jq --version)" || printf "$(YELLOW)✗ not found$(NC)  (brew install jq)\n"
+
 all: install-githook obitools

 obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS))
@@ -106,8 +136,12 @@ pkg/obioptions/version.go: version.txt .FORCE
 	@rm -f $(OUTPUT)

 bump-version:
-	@echo "Incrementing version..."
 	@current=$$(cat version.txt); \
+	if [ -n "$(VERSION)" ]; then \
+		new_version="$(VERSION)"; \
+		echo "Setting version to $$new_version (was $$current)"; \
+	else \
+		echo "Incrementing version..."; \
 		echo "  Current version: $$current"; \
 		major=$$(echo $$current | cut -d. -f1); \
 		minor=$$(echo $$current | cut -d. -f2); \
@@ -115,6 +149,7 @@ bump-version:
 		new_patch=$$((patch + 1)); \
 		new_version="$$major.$$minor.$$new_patch"; \
 		echo "  New version: $$new_version"; \
+	fi; \
 	echo "$$new_version" > version.txt
 	@echo "✓ Version updated in version.txt"
 	@$(MAKE) pkg/obioptions/version.go
@@ -128,21 +163,77 @@ jjnew:
 	@echo "$(GREEN)✓ New commit created$(NC)"

 jjpush:
-	@echo "$(YELLOW)→ Pushing commit to repository...$(NC)"
+	@$(MAKE) jjpush-describe
+	@$(MAKE) jjpush-bump
+	@$(MAKE) jjpush-notes
+	@$(MAKE) jjpush-push
+	@$(MAKE) jjpush-tag
+	@echo "$(GREEN)✓ Release complete$(NC)"
+
+jjpush-describe:
 	@echo "$(BLUE)→ Documenting current commit...$(NC)"
 	@jj auto-describe
+
+jjpush-bump:
 	@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
 	@jj new
 	@$(MAKE) bump-version
-	@echo "$(BLUE)→ Documenting version bump commit...$(NC)"
-	@jj auto-describe
+
+jjpush-notes:
+	@version=$$(cat version.txt); \
+	echo "$(BLUE)→ Generating release notes for version $$version...$(NC)"; \
+	release_title="Release $$version"; \
+	release_body=""; \
+	if command -v aichat >/dev/null 2>&1; then \
+		previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' 2>/dev/null); \
+		if [ -z "$$previous_tag" ]; then \
+			echo "$(YELLOW)⚠ No previous Release tag found, skipping release notes$(NC)"; \
+		else \
+			raw_output=$$(git log --format="%h %B" "$$previous_tag..HEAD" | \
+				aichat \
+				"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}" 2>/dev/null) || true; \
+			if [ -n "$$raw_output" ]; then \
+				notes=$$(printf '%s\n' "$$raw_output" | python3 tools/json2md.py 2>/dev/null); \
+				if [ -n "$$notes" ]; then \
+					release_title=$$(echo "$$notes" | head -1); \
+					release_body=$$(echo "$$notes" | tail -n +3); \
+				else \
+					echo "$(YELLOW)⚠ JSON parsing failed, using default release message$(NC)"; \
+				fi; \
+			fi; \
+		fi; \
+	fi; \
+	printf '%s' "$$release_title" > /tmp/obitools4-release-title.txt; \
+	printf '%s' "$$release_body" > /tmp/obitools4-release-body.txt; \
+	echo "$(BLUE)→ Setting release notes as commit description...$(NC)"; \
+	jj desc -m "$$release_title"$$'\n\n'"$$release_body"
+
+jjpush-push:
+	@echo "$(BLUE)→ Pushing commits...$(NC)"
+	@jj git push --change @
+	@echo "$(BLUE)→ Creating/updating PR...$(NC)"
+	@release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$(cat version.txt)"); \
+	release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \
+	branch=$$(jj log -r @ --no-graph -T 'bookmarks.map(|b| b.name()).join("\n")' 2>/dev/null | head -1); \
+	if [ -n "$$branch" ] && command -v gh >/dev/null 2>&1; then \
+		gh pr create --title "$$release_title" --body "$$release_body" --base master --head "$$branch" 2>/dev/null \
+		|| gh pr edit "$$branch" --title "$$release_title" --body "$$release_body" 2>/dev/null \
+		|| echo "$(YELLOW)⚠ Could not create/update PR$(NC)"; \
+	fi
+
+jjpush-tag:
 	@version=$$(cat version.txt); \
 	tag_name="Release_$$version"; \
-	echo "$(BLUE)→ Pushing commits and creating tag $$tag_name...$(NC)"; \
-	jj git push --change @; \
-	git tag -a "$$tag_name" -m "Release $$version" 2>/dev/null || echo "Tag $$tag_name already exists"; \
-	git push origin "$$tag_name" 2>/dev/null || echo "Tag already pushed"
-	@echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)"
+	release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$version"); \
+	release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \
+	install_section=$$'\n## Installation\n\n### Pre-built binaries\n\nDownload the appropriate archive for your system from the\n[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_'"$$version"')\nand extract it:\n\n#### Linux (AMD64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_amd64.tar.gz\n```\n\n#### Linux (ARM64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_arm64.tar.gz\n```\n\n#### macOS (Intel)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_amd64.tar.gz\n```\n\n#### macOS (Apple Silicon)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_arm64.tar.gz\n```\n\nAll OBITools4 binaries are included in each archive.\n\n### From source\n\nYou can also compile and install OBITools4 directly from source using the\ninstallation script:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version '"$$version"'\n```\n\nBy default binaries are installed in `/usr/local/bin`. Use `--install-dir` to\nchange the destination and `--obitools-prefix` to add a prefix to command names:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\\n  bash -s -- --version '"$$version"' --install-dir ~/local --obitools-prefix k\n```\n'; \
+	release_message="$$release_title"$$'\n\n'"$$release_body$$install_section"; \
+	echo "$(BLUE)→ Creating tag $$tag_name...$(NC)"; \
+	commit_hash=$$(jj log -r @ --no-graph -T 'commit_id' 2>/dev/null); \
+	git tag -a "$$tag_name" $${commit_hash:+"$$commit_hash"} -m "$$release_message" 2>/dev/null || echo "$(YELLOW)⚠ Tag $$tag_name already exists$(NC)"; \
+	echo "$(BLUE)→ Pushing tag $$tag_name...$(NC)"; \
+	git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)"; \
+	rm -f /tmp/obitools4-release-title.txt /tmp/obitools4-release-body.txt

 jjfetch:
 	@echo "$(YELLOW)→ Pulling latest commits...$(NC)"
@@ -150,5 +241,5 @@ jjfetch:
 	@jj new master@origin
 	@echo "$(GREEN)✓ Latest commits pulled$(NC)"

-.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch bump-version .FORCE
+.PHONY: all obitools update-deps obitests githubtests help jjnew jjpush jjpush-describe jjpush-bump jjpush-notes jjpush-push jjpush-tag jjfetch bump-version .FORCE
 .FORCE:
--- a/README.md
+++ b/README.md
@@ -16,12 +16,17 @@ The easiest way to run it is to copy and paste the following command into your t
 curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash
 ```

-By default, the script installs the *OBITools* commands and other associated files into the `/usr/local` directory.
-The names of the commands in the new *OBITools4* are mostly identical to those in *OBITools2*.
-Therefore, installing the new *OBITools* may hide or delete the old ones. If you want both versions to be 
-available on your system, the installation script offers two options:
+By default, the script installs the latest version of the *OBITools* commands and other associated files into the `/usr/local` directory.

+### Installation Options

+The installation script offers several options:
+
+>  -l, --list              List all available versions and exit.
+> 
+>  -v, --version           Install a specific version (e.g., `-v 4.4.3`).
+>                          By default, the latest version is installed.
+> 
 >  -i, --install-dir       Directory where obitools are installed 
 >                          (as example use `/usr/local` not `/usr/local/bin`).
 > 
@@ -29,15 +34,36 @@ available on your system, the installation script offers two options:
 >                          want to have several versions of obitools at the
 >                          same time on your system (as example `-p g` will produce
 >                          `gobigrep` command instead of `obigrep`).
+>
+>  -j, --jobs              Number of parallel jobs used for compilation
+>                          (default: 1). Increase this value to speed up
+>                          compilation on multi-core systems (e.g., `-j 4`).

-You can use these options by following the installation command:
+### Examples

+List all available versions:
+```{bash}
+curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --list
+```
+
+Install a specific version:
+```{bash}
+curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version 4.4.3
+```
+
+Install in a custom directory with command prefix:
 ```{bash}
 curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \
      bash -s -- --install-dir test_install --obitools-prefix k
 ```

-In this case, the binaries will be installed in the `test_install` directory and all command names will be prefixed with the letter `k`. Thus, `obigrep` will be named `kobigrep`.
+In this last example, the binaries will be installed in the `test_install` directory and all command names will be prefixed with the letter `k`. Thus, `obigrep` will be named `kobigrep`.
+
+### Note on Version Compatibility
+
+The names of the commands in the new *OBITools4* are mostly identical to those in *OBITools2*.
+Therefore, installing the new *OBITools* may hide or delete the old ones. If you want both versions to be 
+available on your system, use the `--install-dir` and `--obitools-prefix` options as shown above.

 ## Continuing the analysis...

--- a/blackboard/Prospective/kmer_disk_index_plan.md
+++ b/blackboard/Prospective/kmer_disk_index_plan.md
@@ -0,0 +1,508 @@
+# Plan de refonte du package obikmer : index disk-based par partitions minimizer
+
+## Constat
+
+Les roaring64 bitmaps ne sont pas adaptés au stockage de 10^10 k-mers
+(k=31) dispersés sur un espace de 2^62. L'overhead structurel (containers
+roaring par high key 32 bits) dépasse la taille des données elles-mêmes,
+et les opérations `Or()` entre bitmaps fragmentés ne terminent pas en
+temps raisonnable.
+
+## Principe de la nouvelle architecture
+
+Un `KmerSet` est un ensemble trié de k-mers canoniques (uint64) stocké
+sur disque, partitionné par minimizer. Chaque partition est un fichier
+binaire contenant des uint64 triés, compressés par delta-varint.
+
+Un `KmerSetGroup` est un répertoire contenant N ensembles partitionnés
+de la même façon (même k, même m, même P).
+
+Un `KmerSet` est un `KmerSetGroup` de taille 1 (singleton).
+
+Les opérations ensemblistes se font partition par partition, en merge
+streaming, sans charger l'index complet en mémoire.
+
+## Cycle de vie d'un index
+
+L'index passe par trois phases distinctes :
+
+1. **Phase de construction (mutable)** : on ouvre un index, on y ajoute
+   des séquences. Pour chaque séquence, les super-kmers sont extraits
+   et écrits de manière compacte (2 bits/base) dans le fichier
+   temporaire de partition correspondant (`minimizer % P`). Les
+   super-kmers sont une représentation compressée naturelle des k-mers
+   chevauchants : un super-kmer de longueur L encode L-k+1 k-mers en
+   ne stockant que ~L/4 bytes au lieu de (L-k+1) × 8 bytes.
+
+2. **Phase de clôture (optimisation)** : on ferme l'index, ce qui
+   déclenche le traitement **partition par partition** (indépendant,
+   parallélisable) :
+   - Charger les super-kmers de la partition
+   - En extraire tous les k-mers canoniques
+   - Trier le tableau de k-mers
+   - Dédupliquer (et compter si FrequencyFilter)
+   - Delta-encoder et écrire le fichier .kdi final
+   Après clôture, l'index est statique et immuable.
+
+3. **Phase de lecture (immuable)** : opérations ensemblistes,
+   Jaccard, Quorum, Contains, itération. Toutes en streaming.
+
+---
+
+## Format sur disque
+
+### Index finalisé
+
+```
+index_dir/
+  metadata.toml
+  set_0/
+    part_0000.kdi
+    part_0001.kdi
+    ...
+    part_{P-1}.kdi
+  set_1/
+    part_0000.kdi
+    ...
+  ...
+  set_{N-1}/
+    ...
+```
+
+### Fichiers temporaires pendant la construction
+
+```
+index_dir/
+  .build/
+    set_0/
+      part_0000.skm          # super-kmers encodés 2 bits/base
+      part_0001.skm
+      ...
+    set_1/
+      ...
+```
+
+Le répertoire `.build/` est supprimé après Close().
+
+### metadata.toml
+
+```toml
+id = "mon_index"
+k = 31
+m = 13
+partitions = 1024
+type = "KmerSetGroup"       # ou "KmerSet" (N=1)
+size = 3                    # nombre de sets (N)
+sets_ids = ["genome_A", "genome_B", "genome_C"]
+
+[user_metadata]
+organism = "Triticum aestivum"
+
+[sets_metadata]
+# métadonnées individuelles par set si nécessaire
+```
+
+### Fichier .kdi (Kmer Delta Index)
+
+Format binaire :
+
+```
+[magic: 4 bytes "KDI\x01"]
+[count: uint64 little-endian]       # nombre de k-mers dans cette partition
+[first: uint64 little-endian]       # premier k-mer (valeur absolue)
+[delta_1: varint]                   # arr[1] - arr[0]
+[delta_2: varint]                   # arr[2] - arr[1]
+...
+[delta_{count-1}: varint]           # arr[count-1] - arr[count-2]
+```
+
+Varint : encodage non signé, 7 bits utiles par byte, le bit de poids
+fort servant de bit de continuation (identique au varint protobuf).
+
+Fichier vide (partition sans k-mer) : magic + count=0.
+
+### Fichier .skm (Super-Kmer temporaire)
+
+Format binaire, séquence de super-kmers encodés :
+
+```
+[len: uint16 little-endian]         # longueur du super-kmer en bases
+[sequence: ceil(len/4) bytes]       # séquence encodée 2 bits/base, packed
+...
+```
+
+**Compression par rapport au stockage de k-mers bruts** :
+
+Un super-kmer de longueur L contient L-k+1 k-mers.
+- Stockage super-kmer : 2 + ceil(L/4) bytes
+- Stockage k-mers bruts : (L-k+1) × 8 bytes
+
+Exemple avec k=31, super-kmer typique L=50 :
+- Super-kmer : 2 + 13 = 15 bytes → encode 20 k-mers
+- K-mers bruts : 20 × 8 = 160 bytes
+- **Facteur de compression : ~10×**
+
+Pour un génome de 10 Gbases (~10^10 k-mers bruts) :
+- K-mers bruts : ~80 Go par set temporaire
+- Super-kmers : **~8 Go** par set temporaire
+
+Avec FrequencyFilter et couverture 30× :
+- K-mers bruts : ~2.4 To
+- Super-kmers : **~240 Go**
+
+---
+
+## FrequencyFilter
+
+Le FrequencyFilter n'est plus un type de données séparé. C'est un
+**mode de construction** du builder. Le résultat est un KmerSetGroup
+standard.
+
+### Principe
+
+Pendant la construction, tous les super-kmers sont écrits dans les
+fichiers temporaires .skm, y compris les doublons (chaque occurrence
+de chaque séquence est écrite).
+
+Pendant Close(), pour chaque partition :
+1. Charger tous les super-kmers de la partition
+2. Extraire tous les k-mers canoniques dans un tableau []uint64
+3. Trier le tableau
+4. Parcourir linéairement : les k-mers identiques sont consécutifs
+5. Compter les occurrences de chaque k-mer
+6. Si count >= minFreq → écrire dans le .kdi final (une seule fois)
+7. Sinon → ignorer
+
+### Dimensionnement
+
+Pour un génome de 10 Gbases avec couverture 30× :
+- N_brut ≈ 3×10^11 k-mers bruts
+- Espace temporaire .skm ≈ 240 Go (compressé super-kmer)
+- RAM par partition pendant Close() :
+  Avec P=1024 : ~3×10^8 k-mers/partition × 8 = **~2.4 Go**
+  Avec P=4096 : ~7.3×10^7 k-mers/partition × 8 = **~600 Mo**
+
+Le choix de P détermine le compromis nombre de fichiers vs RAM par
+partition.
+
+### Sans FrequencyFilter (déduplication simple)
+
+Pour de la déduplication simple (chaque k-mer écrit une fois), le
+builder peut dédupliquer au niveau des buffers en RAM avant flush.
+Cela réduit significativement l'espace temporaire car les doublons
+au sein d'un même buffer (provenant de séquences proches) sont
+éliminés immédiatement.
+
+---
+
+## API publique visée
+
+### Structures
+
+```go
+// KmerSetGroup est l'entité de base.
+// Un KmerSet est un KmerSetGroup avec Size() == 1.
+type KmerSetGroup struct {
+    // champs internes : path, k, m, P, N, metadata, état
+}
+
+// KmerSetGroupBuilder construit un KmerSetGroup mutable.
+type KmerSetGroupBuilder struct {
+    // champs internes : buffers I/O par partition et par set,
+    // fichiers temporaires .skm, paramètres (minFreq, etc.)
+}
+```
+
+### Construction
+
+```go
+// NewKmerSetGroupBuilder crée un builder pour un nouveau KmerSetGroup.
+//   directory : répertoire de destination
+//   k : taille des k-mers (1-31)
+//   m : taille des minimizers (-1 pour auto = ceil(k/2.5))
+//   n : nombre de sets dans le groupe
+//   P : nombre de partitions (-1 pour auto)
+//   options : options de construction (FrequencyFilter, etc.)
+func NewKmerSetGroupBuilder(directory string, k, m, n, P int,
+    options ...BuilderOption) (*KmerSetGroupBuilder, error)
+
+// WithMinFrequency active le mode FrequencyFilter.
+// Seuls les k-mers vus >= minFreq fois sont conservés dans l'index
+// final. Les super-kmers sont écrits avec leurs doublons pendant
+// la construction ; le comptage exact se fait au Close().
+func WithMinFrequency(minFreq int) BuilderOption
+
+// AddSequence extrait les super-kmers d'une séquence et les écrit
+// dans les fichiers temporaires de partition du set setIndex.
+func (b *KmerSetGroupBuilder) AddSequence(setIndex int, seq *obiseq.BioSequence)
+
+// AddSuperKmer écrit un super-kmer dans le fichier temporaire de
+// sa partition pour le set setIndex.
+func (b *KmerSetGroupBuilder) AddSuperKmer(setIndex int, sk SuperKmer)
+
+// Close finalise la construction :
+//   - flush des buffers d'écriture
+//   - pour chaque partition de chaque set (parallélisable) :
+//     - charger les super-kmers depuis le .skm
+//     - extraire les k-mers canoniques
+//     - trier, dédupliquer (compter si freq filter)
+//     - delta-encoder et écrire le .kdi
+//   - écrire metadata.toml
+//   - supprimer le répertoire .build/
+// Retourne le KmerSetGroup en lecture seule.
+func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error)
+```
+
+### Lecture et opérations
+
+```go
+// OpenKmerSetGroup ouvre un index finalisé en lecture seule.
+func OpenKmerSetGroup(directory string) (*KmerSetGroup, error)
+
+// --- Métadonnées (API existante inchangée ; M() et Partitions() sont nouveaux) ---
+func (ksg *KmerSetGroup) K() int
+func (ksg *KmerSetGroup) M() int          // nouveau : taille du minimizer
+func (ksg *KmerSetGroup) Partitions() int  // nouveau : nombre de partitions
+func (ksg *KmerSetGroup) Size() int
+func (ksg *KmerSetGroup) Id() string
+func (ksg *KmerSetGroup) SetId(id string)
+func (ksg *KmerSetGroup) HasAttribute(key string) bool
+func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool)
+func (ksg *KmerSetGroup) SetAttribute(key string, value interface{})
+// ... etc (toute l'API attributs actuelle est conservée)
+
+// --- Opérations ensemblistes ---
+// Toutes produisent un nouveau KmerSetGroup singleton sur disque.
+// Opèrent partition par partition en streaming.
+
+func (ksg *KmerSetGroup) Union(outputDir string) (*KmerSetGroup, error)
+func (ksg *KmerSetGroup) Intersect(outputDir string) (*KmerSetGroup, error)
+func (ksg *KmerSetGroup) Difference(outputDir string) (*KmerSetGroup, error)
+func (ksg *KmerSetGroup) QuorumAtLeast(q int, outputDir string) (*KmerSetGroup, error)
+func (ksg *KmerSetGroup) QuorumExactly(q int, outputDir string) (*KmerSetGroup, error)
+func (ksg *KmerSetGroup) QuorumAtMost(q int, outputDir string) (*KmerSetGroup, error)
+
+// --- Opérations entre deux KmerSetGroups ---
+// Les deux groupes doivent avoir les mêmes k, m, P.
+
+func (ksg *KmerSetGroup) UnionWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error)
+func (ksg *KmerSetGroup) IntersectWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error)
+
+// --- Métriques (résultat en mémoire, pas de sortie disque) ---
+
+func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix
+func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix
+
+// --- Accès individuel ---
+
+func (ksg *KmerSetGroup) Len(setIndex ...int) uint64
+func (ksg *KmerSetGroup) Contains(setIndex int, kmer uint64) bool
+func (ksg *KmerSetGroup) Iterator(setIndex int) iter.Seq[uint64]
+```
+
+---
+
+## Implémentation interne
+
+### Primitives bas niveau
+
+**`varint.go`** : encode/decode varint uint64
+
+```go
+func EncodeVarint(w io.Writer, v uint64) (int, error)
+func DecodeVarint(r io.Reader) (uint64, error)
+```
+
+### Format .kdi
+
+**`kdi_writer.go`** : écriture d'un fichier .kdi à partir d'un flux
+trié de uint64 (delta-encode au vol).
+
+```go
+type KdiWriter struct { ... }
+func NewKdiWriter(path string) (*KdiWriter, error)
+func (w *KdiWriter) Write(kmer uint64) error
+func (w *KdiWriter) Close() error
+```
+
+**`kdi_reader.go`** : lecture streaming d'un fichier .kdi (décode
+les deltas au vol).
+
+```go
+type KdiReader struct { ... }
+func NewKdiReader(path string) (*KdiReader, error)
+func (r *KdiReader) Next() (uint64, bool)
+func (r *KdiReader) Count() uint64
+func (r *KdiReader) Close() error
+```
+
+### Format .skm
+
+**`skm_writer.go`** : écriture de super-kmers encodés 2 bits/base.
+
+```go
+type SkmWriter struct { ... }
+func NewSkmWriter(path string) (*SkmWriter, error)
+func (w *SkmWriter) Write(sk SuperKmer) error
+func (w *SkmWriter) Close() error
+```
+
+**`skm_reader.go`** : lecture de super-kmers depuis un fichier .skm.
+
+```go
+type SkmReader struct { ... }
+func NewSkmReader(path string) (*SkmReader, error)
+func (r *SkmReader) Next() (SuperKmer, bool)
+func (r *SkmReader) Close() error
+```
+
+### Merge streaming
+
+**`kdi_merge.go`** : k-way merge de plusieurs flux triés.
+
+```go
+type KWayMerge struct { ... }
+func NewKWayMerge(readers []*KdiReader) *KWayMerge
+func (m *KWayMerge) Next() (kmer uint64, count int, ok bool)
+func (m *KWayMerge) Close() error
+```
+
+### Builder
+
+**`kmer_set_builder.go`** : construction d'un KmerSetGroup.
+
+Le builder gère :
+- P × N écrivains .skm bufferisés (un par partition × set)
+- À la clôture : traitement partition par partition
+  (parallélisable sur plusieurs cores)
+
+Gestion mémoire des buffers d'écriture :
+- Chaque SkmWriter a un buffer I/O de taille raisonnable (~64 Ko)
+- Avec P=1024 et N=1 : 1024 × 64 Ko = 64 Mo de buffers
+- Avec P=1024 et N=10 : 640 Mo de buffers
+- Pas de buffer de k-mers en RAM : tout est écrit sur disque
+  immédiatement via les super-kmers
+
+RAM pendant Close() (tri d'une partition) :
+- Charger les super-kmers → extraire les k-mers → tableau []uint64
+- Avec P=1024 et 10^10 k-mers/set : ~10^7 k-mers/partition × 8 = ~80 Mo
+- Avec FrequencyFilter (doublons) et couverture 30× :
+  ~3×10^8/partition × 8 = ~2.4 Go (ajustable via P)
+
+### Structure disk-based
+
+**`kmer_set_disk.go`** : KmerSetGroup en lecture seule.
+
+**`kmer_set_disk_ops.go`** : opérations ensemblistes par merge
+streaming partition par partition.
+
+---
+
+## Ce qui change par rapport à l'API actuelle
+
+### Changements de sémantique
+
+| Aspect | Ancien (roaring) | Nouveau (disk-based) |
+|---|---|---|
+| Stockage | En mémoire (roaring64.Bitmap) | Sur disque (.kdi delta-encoded) |
+| Temporaire construction | En mémoire | Super-kmers sur disque (.skm 2 bits/base) |
+| Mutabilité | Mutable à tout moment | Builder → Close() → immuable |
+| Opérations ensemblistes | Résultat en mémoire | Résultat sur disque (nouveau répertoire) |
+| Contains | O(1) roaring lookup | O(log n) recherche binaire sur .kdi |
+| Itération | Roaring iterator | Streaming décodage delta-varint |
+
+### API conservée (signatures identiques ou quasi-identiques)
+
+- `KmerSetGroup` : `K()`, `Size()`, `Id()`, `SetId()`
+- Toute l'API attributs
+- `JaccardDistanceMatrix()`, `JaccardSimilarityMatrix()`
+- `Len()`, `Contains()`
+
+### API modifiée
+
+- `Union()`, `Intersect()`, etc. : ajout du paramètre `outputDir`
+- `QuorumAtLeast()`, etc. : idem
+- Construction : `NewKmerSetGroupBuilder()` + `AddSequence()` + `Close()`
+  au lieu de manipulation directe
+
+### API supprimée
+
+- `KmerSet` comme type distinct (remplacé par KmerSetGroup singleton)
+- `FrequencyFilter` comme type distinct (mode du Builder)
+- Tout accès direct à `roaring64.Bitmap`
+- `KmerSet.Copy()` (copie de répertoire à la place)
+- `KmerSet.Union()`, `.Intersect()`, `.Difference()` (deviennent méthodes
+  de KmerSetGroup avec outputDir)
+
+---
+
+## Fichiers à créer / modifier dans pkg/obikmer
+
+### Nouveaux fichiers
+
+| Fichier | Contenu |
+|---|---|
+| `varint.go` | Encode/Decode varint uint64 |
+| `kdi_writer.go` | Écrivain de fichiers .kdi (delta-encoded) |
+| `kdi_reader.go` | Lecteur streaming de fichiers .kdi |
+| `skm_writer.go` | Écrivain de super-kmers encodés 2 bits/base |
+| `skm_reader.go` | Lecteur de super-kmers depuis .skm |
+| `kdi_merge.go` | K-way merge streaming de flux triés |
+| `kmer_set_builder.go` | KmerSetGroupBuilder (construction) |
+| `kmer_set_disk.go` | KmerSetGroup disk-based (lecture, métadonnées) |
+| `kmer_set_disk_ops.go` | Opérations ensemblistes streaming |
+
+### Fichiers à supprimer
+
+| Fichier | Raison |
+|---|---|
+| `kmer_set.go` | Remplacé par kmer_set_disk.go |
+| `kmer_set_group.go` | Idem |
+| `kmer_set_attributes.go` | Intégré dans kmer_set_disk.go |
+| `kmer_set_persistence.go` | L'index est nativement sur disque |
+| `kmer_set_group_quorum.go` | Intégré dans kmer_set_disk_ops.go |
+| `frequency_filter.go` | Mode du Builder, plus de type séparé |
+| `kmer_index_builder.go` | Remplacé par kmer_set_builder.go |
+
+### Fichiers conservés tels quels
+
+| Fichier | Contenu |
+|---|---|
+| `encodekmer.go` | Encodage/décodage k-mers |
+| `superkmer.go` | Structure SuperKmer |
+| `superkmer_iter.go` | IterSuperKmers, IterCanonicalKmers |
+| `encodefourmer.go` | Encode4mer |
+| `counting.go` | Count4Mer |
+| `kmermap.go` | KmerMap (usage indépendant) |
+| `debruijn.go` | Graphe de de Bruijn |
+
+---
+
+## Ordre d'implémentation
+
+1. `varint.go` + tests
+2. `skm_writer.go` + `skm_reader.go` + tests
+3. `kdi_writer.go` + `kdi_reader.go` + tests
+4. `kdi_merge.go` + tests
+5. `kmer_set_builder.go` + tests (construction + Close)
+6. `kmer_set_disk.go` (structure, métadonnées, Open)
+7. `kmer_set_disk_ops.go` + tests (Union, Intersect, Quorum, Jaccard)
+8. Adaptation de `pkg/obitools/obikindex/`
+9. Suppression des anciens fichiers roaring
+10. Adaptation des tests existants
+
+Chaque étape est testable indépendamment.
+
+---
+
+## Dépendances externes
+
+### Supprimées
+
+- `github.com/RoaringBitmap/roaring` : plus nécessaire pour les
+  index k-mers (vérifier si d'autres packages l'utilisent encore)
+
+### Ajoutées
+
+- Aucune. Varint, delta-encoding, merge, encodage 2 bits/base :
+  tout est implémentable en Go standard.
--- a/blackboard/Prospective/large_sequence_parsing.md
+++ b/blackboard/Prospective/large_sequence_parsing.md
@@ -0,0 +1,264 @@
+# Optimisation du parsing des grandes séquences
+
+## Contexte
+
+OBITools4 doit pouvoir traiter des séquences de taille chromosomique (plusieurs Gbp), notamment
+issues de fichiers GenBank/EMBL (assemblages de génomes) ou de fichiers FASTA convertis depuis
+ces formats.
+
+## Architecture actuelle
+
+### Pipeline de lecture (`pkg/obiformats/`)
+
+```
+ReadFileChunk (goroutine)
+    → ChannelFileChunk
+    → N × _ParseGenbankFile / _ParseFastaFile (goroutines)
+    → IBioSequence
+```
+
+`ReadFileChunk` (`file_chunk_read.go`) lit le fichier par morceaux via une chaîne de
+`PieceOfChunk` (rope). Chaque nœud fait `fileChunkSize` bytes :
+
+- GenBank/EMBL : 128 MB (`1024*1024*128`)
+- FASTA/FASTQ  : 1 MB (`1024*1024`)
+
+La chaîne est accumulée jusqu'à trouver la fin du dernier enregistrement complet (splitter),
+puis `Pack()` est appelé pour fusionner tous les nœuds en un seul buffer contigu. Ce buffer
+est transmis au parseur via `FileChunk.Raw *bytes.Buffer`.
+
+### Parseur GenBank (`genbank_read.go`)
+
+`GenbankChunkParser` reçoit un `io.Reader` sur le buffer packé, lit ligne par ligne via
+`bufio.NewReader` (buffer 4096 bytes), et pour chaque ligne de la section `ORIGIN` :
+
+```go
+line = string(bline)                        // allocation par ligne
+cleanline := strings.TrimSpace(line)        // allocation
+parts := strings.SplitN(cleanline, " ", 7) // allocation []string + substrings
+for i := 1; i < lparts; i++ {
+    seqBytes.WriteString(parts[i])
+}
+```
+
+Point positif : `seqBytes` est pré-alloué grâce à `lseq` extrait de la ligne `LOCUS`.
+
+### Parseur FASTA (`fastaseq_read.go`)
+
+`FastaChunkParser` lit **octet par octet** via `scanner.ReadByte()`. Pour 3 Gbp :
+3 milliards d'appels. `seqBytes` est un `bytes.Buffer{}` sans pré-allocation.
+
+## Problème principal
+
+Pour une séquence de plusieurs Gbp, `Pack()` fusionne une chaîne de ~N nœuds de 128 MB en
+un seul buffer contigu. C'est une allocation de N × 128 MB suivie d'une copie de toutes les
+données. Bien que l'implémentation de `Pack()` soit efficace (libère les nœuds au fur et à
+mesure via `slices.Grow`), la copie est inévitable avec l'architecture actuelle.
+
+De plus, le parseur GenBank produit des dizaines de millions d'allocations temporaires pour
+parser la section `ORIGIN` (une par ligne).
+
+## Invariant clé découvert
+
+**Si la rope a plus d'un nœud, le premier nœud seul ne se termine pas sur une frontière
+d'enregistrement** (pas de `//\n` en fin de `piece1`).
+
+Preuve par construction dans `ReadFileChunk` :
+- `splitter` est appelé dès le premier nœud (ligne 157)
+- Si `end >= 0` → frontière trouvée dans 128 MB → boucle interne sautée → rope à 1 nœud
+- Si `end < 0` → boucle interne ajoute des nœuds → rope à ≥ 2 nœuds
+
+Corollaire : si rope à 1 nœud, `Pack()` ne fait rien (aucun nœud suivant).
+
+**Attention** : rope à ≥ 2 nœuds ne signifie pas qu'il n'y a qu'une seule séquence dans
+la rope. La rope packée peut contenir plusieurs enregistrements complets. Exemple : records
+de 80 MB → `nextpieces` (48 MB de reste) + nouveau nœud (128 MB) = rope à 2 nœuds
+contenant 2 records complets + début d'un troisième.
+
+L'invariant dit seulement que `piece1` seul est incomplet — pas que la rope entière
+ne contient qu'un seul record.
+
+**Invariant : le dernier FileChunk envoyé finit sur une frontière d'enregistrement.**
+
+Deux chemins dans `ReadFileChunk` :
+
+1. **Chemin normal** (`end >= 0` via `splitter`) : le buffer est explicitement tronqué à
+   `end` (ligne 200 : `pieces.data = pieces.data[:end]`). Frontière garantie par construction
+   pour tous les formats. ✓
+
+2. **Chemin EOF** (`end < 0`, `end = pieces.Len()`) : tout le reste du fichier est envoyé.
+   - **GenBank/EMBL** : présuppose fichier bien formé (se termine par `//\n`). Le parseur
+     lève un `log.Fatalf` sur tout état inattendu — filet de sécurité suffisant. ✓
+   - **FASTQ** : présupposé, vérifié par le parseur. ✓
+   - **FASTA** : garanti par le format lui-même (fin d'enregistrement = EOF ou `>`). ✓
+
+**Hypothèse de travail adoptée** : les fichiers d'entrée sont bien formés. Dans le pire cas,
+le parseur lèvera une erreur explicite. Il n'y a pas de risque de corruption silencieuse.
+
+## Piste d'optimisation : se dispenser de Pack()
+
+### Idée centrale
+
+Au lieu de fusionner la rope avant de la passer au parseur, **parser directement la rope
+nœud par nœud**, et **écrire la séquence compactée in-place dans le premier nœud**.
+
+Pourquoi c'est sûr :
+- Le header (LOCUS, DEFINITION, SOURCE, FEATURES) est **petit** et traité en premier
+- La séquence (ORIGIN) est **à la fin** du record
+- Au moment d'écrire la séquence depuis l'offset 0 de `piece1`, le pointeur de lecture
+  est profond dans la rope (offset >> 0) → jamais de collision
+- La séquence compactée est toujours plus courte que les données brutes
+
+### Pré-allocation
+
+Pour GenBank/EMBL : `lseq` est connu dès la ligne `LOCUS`/`ID` (première ligne, dans
+`piece1`). On peut faire `slices.Grow(piece1.data, lseq)` dès ce moment.
+
+Pour FASTA : pas de taille garantie dans le header, mais `rope.Len()` donne un majorant.
+On peut utiliser `rope.Len() / 2` comme estimation initiale.
+
+### Gestion des jonctions entre nœuds
+
+Une ligne peut chevaucher deux nœuds (rare avec 128 MB, mais possible). Solution : carry
+buffer de ~128 bytes pour les quelques bytes en fin de nœud.
+
+### Cas FASTA/FASTQ multi-séquences
+
+Un FileChunk peut contenir N séquences (notamment FASTA/FASTQ courts). Dans ce cas
+l'écriture in-place dans `piece1` n'est pas applicable directement — on écrase des données
+nécessaires aux séquences suivantes.
+
+Stratégie par cas :
+- **Rope à 1 nœud** (record ≤ `fileChunkSize` : 128 MB en GenBank/EMBL, 1 MB en FASTA/FASTQ) : `Pack()` est trivial (no-op), parseur actuel OK
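+- **Rope à ≥ 2 nœuds** : par l'invariant, `piece1` seul ne se termine pas sur une frontière
+  d'enregistrement ; si la rope ne contient qu'une seule grande séquence → in-place applicable
+  (attention : la rope peut aussi contenir plusieurs records complets, cas à détecter avant
+  toute écriture in-place)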
+
+### Format d'une ligne séquence GenBank (Après ORIGIN)
+
+```
+/^ *[0-9]+( [nuc]{10}){0,5} [nuc]{1,10}/
+```
+
+### Format d'une ligne séquence EMBL (Après SQ)
+
+La ligne SQ contient aussi la taille de la séquence.
+
+```
+/^ *( [nuc]{10}){0,5} [nuc]{1,10} *[0-9]+/
+```
+
+Compactage in-place sur `bline` ([]byte brut, sans conversion `string`), pour le format
+GenBank (numéro de position en début de ligne) :
+
+```go
+w := 0
+i := 0
+for i < len(bline) && bline[i] == ' '  { i++ }   // skip indentation
+for i < len(bline) && bline[i] <= '9'  { i++ }   // skip position number
+for ; i < len(bline); i++ {
+    if bline[i] != ' ' {
+        bline[w] = bline[i]
+        w++
+    }
+}
+// écrire bline[:w] directement dans piece1.data[seqOffset:]
+```
+
+## Changements nécessaires
+
+1. **`FileChunk`** : exposer la rope `*PieceOfChunk` non-packée en plus (ou à la place)
+   de `Raw *bytes.Buffer`
+2. **`GenbankChunkParser` / `EmblChunkParser`** : accepter `*PieceOfChunk`, parser la
+   rope séquentiellement avec carry buffer pour les jonctions
+3. **`FastaChunkParser`** : idem, avec in-place conditionnel selon taille de la rope
+4. **`ReadFileChunk`** : ne pas appeler `Pack()` avant envoi sur le channel (ou version
+   alternative `ReadFileChunkRope`)
+
+## Fichiers concernés
+
+- `pkg/obiformats/file_chunk_read.go` — structure rope, `ReadFileChunk`
+- `pkg/obiformats/genbank_read.go` — `GenbankChunkParser`, `_ParseGenbankFile`
+- `pkg/obiformats/embl_read.go` — `EmblChunkParser`, `ReadEMBL`
+- `pkg/obiformats/fastaseq_read.go` — `FastaChunkParser`, `_ParseFastaFile`
+- `pkg/obiformats/fastqseq_read.go` — parseur FASTQ (même structure)
+
+## Plan d'implémentation : parseur GenBank sur rope
+
+### Contexte
+
+Baseline mesurée : `obiconvert gbpln640.seq.gz` → 49s real, 42s user, 29s sys, **57 GB RSS**.
+Le sys élevé indique des allocations massives. Deux causes :
+1. `Pack()` : fusionne toute la rope (N × 128 MB) en un buffer contigu avant de parser
+2. Parser ORIGIN : `string(bline)` + `TrimSpace` + `SplitN` × millions de lignes
+
+### 1. `gbRopeScanner`
+
+Struct de lecture ligne par ligne sur la rope, sans allocation heap :
+
+```go
+type gbRopeScanner struct {
+    current *PieceOfChunk
+    pos     int
+    carry   [256]byte  // stack-allocated, max GenBank line = 80 chars
+    carryN  int
+}
+```
+
+`ReadLine()` :
+- Cherche `\n` dans `current.data[pos:]` via `bytes.IndexByte`
+- Si trouvé sans carry : retourne slice direct du node (zéro alloc)
+- Si trouvé avec carry : copie dans carry buffer, retourne `carry[:n]`
+- Si non trouvé : copie le reste dans carry, avance au node suivant, recommence
+- EOF : retourne `carry[:carryN]` puis nil
+
+`extractSequence(dest []byte, UtoT bool) int` :
+- Scan direct des bytes pour section ORIGIN, sans passer par ReadLine
+- Machine d'états : lineStart → skip espaces/digits → copier nucléotides dans dest
+- Stop sur `//` en début de ligne
+- Zéro allocation, UtoT inline
+
+### 2. `GenbankChunkParserRope`
+
+```go
+func GenbankChunkParserRope(source string, rope *PieceOfChunk,
+    withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error)
+```
+
+- Même machine d'états que `GenbankChunkParser`, sur `[]byte` (`bytes.HasPrefix`)
+- LOCUS : extrait `id` et `lseq` par scan direct (remplace `_seqlenght_rx`)
+- FEATURES / default inFeature : taxid extrait par scan de `/db_xref="taxon:`
+  dans la source feature ; `featBytes` rempli seulement si `withFeatureTable=true`
+- DEFINITION : toujours conservée
+- ORIGIN : `dest = make([]byte, 0, lseq+20)` puis `s.extractSequence(dest, UtoT)`
+
+### 3. Modifications `_ParseGenbankFile` et `ReadGenbank`
+
+`_ParseGenbankFile` utilise `chunk.Rope` :
+```go
+sequences, err := GenbankChunkParserRope(chunk.Source, chunk.Rope, ...)
+```
+
+`ReadGenbank` passe `pack=false` :
+```go
+entry_channel := ReadFileChunk(..., false)
+```
+
+### 4. Ce qui NE change pas
+
+- `GenbankChunkParser` reste (référence, tests)
+- `ReadFileChunk`, `Pack()`, autres parseurs (EMBL, FASTA, FASTQ) : inchangés
+
+### 5. Gains attendus
+
+- **RSS** : pic ≈ 128 MB × workers (au lieu de N × 128 MB)
+- **Temps sys** : élimination des mmap/munmap pour les gros buffers
+- **Temps user** : ~50M allocations éliminées
+
+### 6. Vérification
+
+```bash
+/usr/local/go/bin/go build ./...
+diff <(obiconvert gbpln640.seq.gz) gbpln640.reference.fasta
+cd bugs/genbank && ./benchmark.sh gbpln640.seq.gz
+```
+
+Cible : RSS < 1 GB, temps comparable ou meilleur.
--- a/blackboard/architechture/architecture-commande-obitools.md
+++ b/blackboard/architechture/architecture-commande-obitools.md
@@ -0,0 +1,735 @@
+# Architecture d'une commande OBITools
+
+## Vue d'ensemble
+
+Une commande OBITools suit une architecture modulaire et standardisée qui sépare clairement les responsabilités entre :
+- Le package de la commande dans `pkg/obitools/<nom_commande>/`
+- L'exécutable dans `cmd/obitools/<nom_commande>/`
+
+Cette architecture favorise la réutilisabilité du code, la testabilité et la cohérence entre les différentes commandes de la suite OBITools.
+
+## Structure du projet
+
+```
+obitools4/
+├── pkg/obitools/
+│   ├── obiconvert/          # Commande de conversion (base pour toutes)
+│   │   ├── obiconvert.go    # Fonctions vides (pas d'implémentation)
+│   │   ├── options.go       # Définition des options CLI
+│   │   ├── sequence_reader.go  # Lecture des séquences
+│   │   └── sequence_writer.go  # Écriture des séquences
+│   ├── obiuniq/             # Commande de déréplication
+│   │   ├── obiuniq.go       # (fichier vide)
+│   │   ├── options.go       # Options spécifiques à obiuniq
+│   │   └── unique.go        # Implémentation du traitement
+│   ├── obipairing/          # Assemblage de lectures paired-end
+│   ├── obisummary/          # Résumé de fichiers de séquences
+│   └── obimicrosat/         # Détection de microsatellites
+└── cmd/obitools/
+    ├── obiconvert/
+    │   └── main.go          # Point d'entrée de la commande
+    ├── obiuniq/
+    │   └── main.go
+    ├── obipairing/
+    │   └── main.go
+    ├── obisummary/
+    │   └── main.go
+    └── obimicrosat/
+        └── main.go
+```
+
+## Composants de l'architecture
+
+### 1. Package `pkg/obitools/<commande>/`
+
+Chaque commande possède son propre package dans `pkg/obitools/` qui contient l'implémentation complète de la logique métier. Ce package est structuré en plusieurs fichiers :
+
+#### a) `options.go` - Gestion des options CLI
+
+Ce fichier définit :
+- Les **variables globales** privées (préfixées par `_`) stockant les valeurs des options
+- La fonction **`OptionSet()`** qui configure toutes les options pour la commande
+- Les fonctions **`CLI*()`** qui retournent les valeurs des options (getters)
+- Les fonctions **`Set*()`** qui permettent de définir les options programmatiquement (setters)
+
+**Exemple (obiuniq/options.go) :**
+
+```go
+package obiuniq
+
+import (
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+    "github.com/DavidGamba/go-getoptions"
+)
+
+// Variables globales privées pour stocker les options
+var _StatsOn = make([]string, 0, 10)
+var _Keys = make([]string, 0, 10)
+var _InMemory = false
+var _chunks = 100
+
+// Configuration des options spécifiques à la commande
+func UniqueOptionSet(options *getoptions.GetOpt) {
+    options.StringSliceVar(&_StatsOn, "merge", 1, 1,
+        options.Alias("m"),
+        options.ArgName("KEY"),
+        options.Description("Adds a merged attribute..."))
+    
+    options.BoolVar(&_InMemory, "in-memory", _InMemory,
+        options.Description("Use memory instead of disk..."))
+    
+    options.IntVar(&_chunks, "chunk-count", _chunks,
+        options.Description("In how many chunks..."))
+}
+
+// OptionSet combine les options de base + les options spécifiques
+func OptionSet(options *getoptions.GetOpt) {
+    obiconvert.OptionSet(false)(options)  // Options de base
+    UniqueOptionSet(options)              // Options spécifiques
+}
+
+// Getters pour accéder aux valeurs des options
+func CLIStatsOn() []string {
+    return _StatsOn
+}
+
+func CLIUniqueInMemory() bool {
+    return _InMemory
+}
+
+// Setters pour définir les options programmatiquement
+func SetUniqueInMemory(inMemory bool) {
+    _InMemory = inMemory
+}
+```
+
+**Convention de nommage :**
+- Variables privées : `_NomOption` (underscore préfixe)
+- Getters : `CLINomOption()` (préfixe CLI)
+- Setters : `SetNomOption()` (préfixe Set)
+
+#### b) Fichier(s) d'implémentation
+
+Un ou plusieurs fichiers contenant la logique métier de la commande :
+
+**Exemple (obiuniq/unique.go) :**
+
+```go
+package obiuniq
+
+import (
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obichunk"
+)
+
+// Fonction CLI principale qui orchestre le traitement
+func CLIUnique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
+    // Récupération des options via les getters CLI*()
+    options := make([]obichunk.WithOption, 0, 30)
+    
+    options = append(options,
+        obichunk.OptionBatchCount(CLINumberOfChunks()),
+    )
+    
+    if CLIUniqueInMemory() {
+        options = append(options, obichunk.OptionSortOnMemory())
+    } else {
+        options = append(options, obichunk.OptionSortOnDisk())
+    }
+    
+    // Appel de la fonction de traitement réelle
+    iUnique, err := obichunk.IUniqueSequence(sequences, options...)
+    
+    if err != nil {
+        log.Fatal(err)
+    }
+    
+    return iUnique
+}
+```
+
+**Autres exemples d'implémentation :**
+
+- **obimicrosat/microsat.go** : Contient `MakeMicrosatWorker()` et `CLIAnnotateMicrosat()`
+- **obisummary/obisummary.go** : Contient `ISummary()` et les structures de données
+
+#### c) Fichiers utilitaires (optionnel)
+
+Certaines commandes ont des fichiers additionnels pour des fonctionnalités spécifiques.
+
+**Exemple (obipairing/options.go) :**
+
+```go
+// Fonction spéciale pour créer un itérateur de séquences pairées
+func CLIPairedSequence() (obiiter.IBioSequence, error) {
+    forward, err := obiconvert.CLIReadBioSequences(_ForwardFile)
+    if err != nil {
+        return obiiter.NilIBioSequence, err
+    }
+    
+    reverse, err := obiconvert.CLIReadBioSequences(_ReverseFile)
+    if err != nil {
+        return obiiter.NilIBioSequence, err
+    }
+    
+    paired := forward.PairTo(reverse)
+    return paired, nil
+}
+```
+
+### 2. Package `obiconvert` - La base commune
+
+Le package `obiconvert` est spécial car il fournit les fonctionnalités de base utilisées par toutes les autres commandes :
+
+#### Fonctionnalités fournies :
+
+1. **Lecture de séquences** (`sequence_reader.go`)
+   - `CLIReadBioSequences()` : lecture depuis fichiers ou stdin
+   - Support de multiples formats (FASTA, FASTQ, EMBL, GenBank, etc.)
+   - Gestion des fichiers multiples
+   - Barre de progression optionnelle
+
+2. **Écriture de séquences** (`sequence_writer.go`)
+   - `CLIWriteBioSequences()` : écriture vers fichiers ou stdout
+   - Support de multiples formats
+   - Gestion des lectures pairées
+   - Compression optionnelle
+
+3. **Options communes** (`options.go`)
+   - Options d'entrée (format, skip, etc.)
+   - Options de sortie (format, fichier, compression)
+   - Options de mode (barre de progression, etc.)
+
+#### Utilisation par les autres commandes :
+
+Toutes les commandes incluent les options de `obiconvert` via :
+
+```go
+func OptionSet(options *getoptions.GetOpt) {
+    obiconvert.OptionSet(false)(options)  // false = pas de fichiers pairés
+    MaCommandeOptionSet(options)          // Options spécifiques
+}
+```
+
+### 3. Exécutable `cmd/obitools/<commande>/main.go`
+
+Le fichier `main.go` de chaque commande est volontairement **minimaliste** et suit toujours le même pattern :
+
+```go
+package main
+
+import (
+    "os"
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/macommande"
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
+)
+
+func main() {
+    // 1. Configuration optionnelle de paramètres par défaut
+    obidefault.SetBatchSize(10)
+    
+    // 2. Génération du parser d'options
+    optionParser := obioptions.GenerateOptionParser(
+        "macommande",                    // Nom de la commande
+        "description de la commande",    // Description
+        macommande.OptionSet)            // Fonction de configuration des options
+    
+    // 3. Parsing des arguments
+    _, args := optionParser(os.Args)
+    
+    // 4. Lecture des séquences d'entrée
+    sequences, err := obiconvert.CLIReadBioSequences(args...)
+    obiconvert.OpenSequenceDataErrorMessage(args, err)
+    
+    // 5. Traitement spécifique de la commande
+    resultat := macommande.CLITraitement(sequences)
+    
+    // 6. Écriture des résultats
+    obiconvert.CLIWriteBioSequences(resultat, true)
+    
+    // 7. Attente de la fin du pipeline
+    obiutils.WaitForLastPipe()
+}
+```
+
+## Patterns architecturaux
+
+### Pattern 1 : Pipeline de traitement de séquences
+
+La plupart des commandes suivent ce pattern :
+
+```
+Lecture → Traitement → Écriture
+```
+
+**Exemples :**
+- **obiconvert** : Lecture → Écriture (conversion de format)
+- **obiuniq** : Lecture → Déréplication → Écriture
+- **obimicrosat** : Lecture → Annotation → Filtrage → Écriture
+
+### Pattern 2 : Traitement avec entrées multiples
+
+Certaines commandes acceptent plusieurs fichiers d'entrée :
+
+**obipairing** :
+```
+Lecture Forward + Lecture Reverse → Pairing → Assemblage → Écriture
+```
+
+### Pattern 3 : Traitement sans écriture de séquences
+
+**obisummary** : produit un résumé JSON/YAML au lieu de séquences
+
+```go
+func main() {
+    // ... parsing options et lecture ...
+    
+    summary := obisummary.ISummary(fs, obisummary.CLIMapSummary())
+    
+    // Formatage et affichage direct
+    if obisummary.CLIOutFormat() == "json" {
+        output, _ := json.MarshalIndent(summary, "", "  ")
+        fmt.Print(string(output))
+    } else {
+        output, _ := yaml.Marshal(summary)
+        fmt.Print(string(output))
+    }
+}
+```
+
+### Pattern 4 : Utilisation de Workers
+
+Les commandes qui transforment des séquences utilisent souvent le pattern Worker :
+
+```go
+// Création d'un worker
+worker := MakeMicrosatWorker(
+    CLIMinUnitLength(),
+    CLIMaxUnitLength(),
+    // ... autres paramètres
+)
+
+// Application du worker sur l'itérateur
+newIter = iterator.MakeIWorker(
+    worker, 
+    false,                              // merge results
+    obidefault.ParallelWorkers()        // parallélisation
+)
+```
+
+## Étapes d'implémentation d'une nouvelle commande
+
+### Étape 1 : Créer le package dans `pkg/obitools/`
+
+```bash
+mkdir -p pkg/obitools/macommande
+```
+
+### Étape 2 : Créer `options.go`
+
+```go
+package macommande
+
+import (
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+    "github.com/DavidGamba/go-getoptions"
+)
+
+// Variables privées pour les options
+var _MonOption = "valeur_par_defaut"
+
+// Configuration des options spécifiques
+func MaCommandeOptionSet(options *getoptions.GetOpt) {
+    options.StringVar(&_MonOption, "mon-option", _MonOption,
+        options.Alias("o"),
+        options.Description("Description de l'option"))
+}
+
+// OptionSet combine options de base + spécifiques
+func OptionSet(options *getoptions.GetOpt) {
+    obiconvert.OptionSet(false)(options)  // false si pas de fichiers pairés
+    MaCommandeOptionSet(options)
+}
+
+// Getters
+func CLIMonOption() string {
+    return _MonOption
+}
+
+// Setters
+func SetMonOption(value string) {
+    _MonOption = value
+}
+```
+
+### Étape 3 : Créer le fichier d'implémentation
+
+Créer `macommande.go` (ou un nom plus descriptif) :
+
+```go
+package macommande
+
+import (
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+)
+
+// Fonction de traitement principale
+func CLIMaCommande(sequences obiiter.IBioSequence) obiiter.IBioSequence {
+    // Récupération des options
+    option := CLIMonOption()
+    
+    // Implémentation du traitement
+    // ...
+    
+    return resultat
+}
+```
+
+### Étape 4 : Créer l'exécutable dans `cmd/obitools/`
+
+```bash
+mkdir -p cmd/obitools/macommande
+```
+
+Créer `main.go` :
+
+```go
+package main
+
+import (
+    "os"
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/macommande"
+    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
+)
+
+func main() {
+    // Parser d'options
+    optionParser := obioptions.GenerateOptionParser(
+        "macommande",
+        "Description courte de ma commande",
+        macommande.OptionSet)
+    
+    _, args := optionParser(os.Args)
+    
+    // Lecture
+    sequences, err := obiconvert.CLIReadBioSequences(args...)
+    obiconvert.OpenSequenceDataErrorMessage(args, err)
+    
+    // Traitement
+    resultat := macommande.CLIMaCommande(sequences)
+    
+    // Écriture
+    obiconvert.CLIWriteBioSequences(resultat, true)
+    
+    // Attente
+    obiutils.WaitForLastPipe()
+}
+```
+
+### Étape 5 : Configurations optionnelles
+
+Dans `main.go`, avant le parsing des options, on peut configurer :
+
+```go
+// Taille des batchs de séquences
+obidefault.SetBatchSize(10)
+
+// Nombre de workers en lecture (strict)
+obidefault.SetStrictReadWorker(2)
+
+// Nombre de workers en écriture
+obidefault.SetStrictWriteWorker(2)
+
+// Désactiver la lecture des qualités
+obidefault.SetReadQualities(false)
+```
+
+### Étape 6 : Gestion des erreurs
+
+Utiliser les fonctions utilitaires pour les messages d'erreur cohérents :
+
+```go
+// Pour les erreurs d'ouverture de fichiers
+obiconvert.OpenSequenceDataErrorMessage(args, err)
+
+// Pour les erreurs générales
+if err != nil {
+    log.Errorf("Message d'erreur: %v", err)
+    os.Exit(1)
+}
+```
+
+### Étape 7 : Tests et debugging (optionnel)
+
+Des commentaires dans le code montrent comment activer le profiling :
+
+```go
+// go tool pprof -http=":8000" ./macommande ./cpu.pprof
+// f, err := os.Create("cpu.pprof")
+// if err != nil {
+//     log.Fatal(err)
+// }
+// pprof.StartCPUProfile(f)
+// defer pprof.StopCPUProfile()
+
+// go tool trace cpu.trace
+// ftrace, err := os.Create("cpu.trace")
+// if err != nil {
+//     log.Fatal(err)
+// }
+// trace.Start(ftrace)
+// defer trace.Stop()
+```
+
+## Bonnes pratiques observées
+
+### 1. Séparation des responsabilités
+
+- **`main.go`** : orchestration minimale
+- **`options.go`** : définition et gestion des options
+- **Fichiers d'implémentation** : logique métier
+
+### 2. Convention de nommage cohérente
+
+- Variables d'options : `_NomOption`
+- Getters CLI : `CLINomOption()`
+- Setters : `SetNomOption()`
+- Fonctions de traitement CLI : `CLITraitement()`
+
+### 3. Réutilisation du code
+
+- Toutes les commandes réutilisent `obiconvert` pour l'I/O
+- Les options communes sont partagées
+- Les fonctions utilitaires sont centralisées
+
+### 4. Configuration par défaut
+
+Les valeurs par défaut sont :
+- Définies lors de l'initialisation des variables
+- Modifiables via les options CLI
+- Modifiables programmatiquement via les setters
+
+### 5. Gestion des formats
+
+Support automatique de multiples formats :
+- FASTA / FASTQ (avec compression gzip)
+- EMBL / GenBank
+- ecoPCR
+- CSV
+- JSON (avec différents formats d'en-têtes)
+
+### 6. Parallélisation
+
+Les commandes utilisent les workers parallèles via :
+- `obidefault.ParallelWorkers()`
+- `obidefault.SetStrictReadWorker(n)`
+- `obidefault.SetStrictWriteWorker(n)`
+
+### 7. Logging cohérent
+
+Utilisation de `logrus` pour tous les logs :
+```go
+log.Printf("Message informatif")
+log.Errorf("Message d'erreur: %v", err)
+log.Fatal(err)  // Arrêt du programme
+```
+
+## Dépendances principales
+
+### Packages internes OBITools
+
+- `pkg/obidefault` : valeurs par défaut et configuration globale
+- `pkg/obioptions` : génération du parser d'options
+- `pkg/obiiter` : itérateurs de séquences biologiques
+- `pkg/obiseq` : structures et fonctions pour séquences biologiques
+- `pkg/obiformats` : lecture/écriture de différents formats
+- `pkg/obiutils` : fonctions utilitaires diverses
+- `pkg/obichunk` : traitement par chunks (pour dereplication, etc.)
+
+### Packages externes
+
+- `github.com/DavidGamba/go-getoptions` : parsing des options CLI
+- `github.com/sirupsen/logrus` : logging structuré
+- `gopkg.in/yaml.v3` : encodage/décodage YAML
+- `github.com/dlclark/regexp2` : expressions régulières avancées
+
+## Cas spéciaux
+
+### Commande avec fichiers pairés (obipairing)
+
+```go
+func OptionSet(options *getoptions.GetOpt) {
+    obiconvert.OutputOptionSet(options)
+    obiconvert.InputOptionSet(options)
+    PairingOptionSet(options)  // Options spécifiques au pairing
+}
+
+func CLIPairedSequence() (obiiter.IBioSequence, error) {
+    forward, err := obiconvert.CLIReadBioSequences(_ForwardFile)
+    // ...
+    reverse, err := obiconvert.CLIReadBioSequences(_ReverseFile)
+    // ...
+    paired := forward.PairTo(reverse)
+    return paired, nil
+}
+```
+
+Dans `main.go` :
+```go
+pairs, err := obipairing.CLIPairedSequence()  // Lecture spéciale
+if err != nil {
+    log.Errorf("Cannot open file (%v)", err)
+    os.Exit(1)
+}
+
+paired := obipairing.IAssemblePESequencesBatch(
+    pairs,
+    obipairing.CLIGapPenality(),
+    // ... autres paramètres
+)
+```
+
+### Commande sans sortie de séquences (obisummary)
+
+Au lieu de `obiconvert.CLIWriteBioSequences()`, affichage direct :
+
+```go
+summary := obisummary.ISummary(fs, obisummary.CLIMapSummary())
+
+if obisummary.CLIOutFormat() == "json" {
+    output, _ := json.MarshalIndent(summary, "", "  ")
+    fmt.Print(string(output))
+} else {
+    output, _ := yaml.Marshal(summary)
+    fmt.Print(string(output))
+}
+fmt.Printf("\n")
+```
+
+### Commande avec Workers personnalisés (obimicrosat)
+
+```go
+func CLIAnnotateMicrosat(iterator obiiter.IBioSequence) obiiter.IBioSequence {
+    // Création du worker
+    worker := MakeMicrosatWorker(
+        CLIMinUnitLength(),
+        CLIMaxUnitLength(),
+        CLIMinUnitCount(),
+        CLIMinLength(),
+        CLIMinFlankLength(),
+        CLIReoriented(),
+    )
+    
+    // Application du worker
+    newIter := iterator.MakeIWorker(
+        worker, 
+        false,                           // pas de merge
+        obidefault.ParallelWorkers(),    // parallélisation
+    )
+    
+    return newIter.FilterEmpty()  // Filtrage des résultats vides
+}
+```
+
+## Diagramme de flux d'exécution
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                      cmd/obitools/macommande/main.go        │
+└─────────────────────────────────────────────────────────────┘
+                              │
+                              ▼
+┌─────────────────────────────────────────────────────────────┐
+│  1. Génération du parser d'options                          │
+│     obioptions.GenerateOptionParser(                        │
+│         "macommande",                                       │
+│         "description",                                      │
+│         macommande.OptionSet)                               │
+└─────────────────────────────────────────────────────────────┘
+                              │
+                              ▼
+┌─────────────────────────────────────────────────────────────┐
+│  pkg/obitools/macommande/options.go                         │
+│  ┌─────────────────────────────────────────────────────┐   │
+│  │ func OptionSet(options *getoptions.GetOpt)          │   │
+│  │   obiconvert.OptionSet(false)(options) ───────────┐ │   │
+│  │   MaCommandeOptionSet(options)                    │ │   │
+│  └───────────────────────────────────────────────────┼─┘   │
+└────────────────────────────────────────────────────────┼─────┘
+                              │                         │
+                              │                         │
+                ┌─────────────┘                         │
+                │                                       │
+                ▼                                       ▼
+┌─────────────────────────────────┐  ┌───────────────────────────────┐
+│ 2. Parsing des arguments        │  │ pkg/obitools/obiconvert/      │
+│    _, args := optionParser(...) │  │    options.go                 │
+└─────────────────────────────────┘  │  - InputOptionSet()           │
+                │                     │  - OutputOptionSet()          │
+                ▼                     │  - PairedFilesOptionSet()     │
+┌─────────────────────────────────┐  └───────────────────────────────┘
+│ 3. Lecture des séquences        │
+│    CLIReadBioSequences(args)    │
+└─────────────────────────────────┘
+                │
+                ▼
+┌─────────────────────────────────────────────────────────────┐
+│ pkg/obitools/obiconvert/sequence_reader.go                  │
+│  - ExpandListOfFiles()                                      │
+│  - ReadSequencesFromFile() / ReadSequencesFromStdin()       │
+│  - Support: FASTA, FASTQ, EMBL, GenBank, ecoPCR, CSV        │
+└─────────────────────────────────────────────────────────────┘
+                │
+                ▼ obiiter.IBioSequence
+┌─────────────────────────────────────────────────────────────┐
+│ 4. Traitement spécifique                                    │
+│    macommande.CLITraitement(sequences)                      │
+└─────────────────────────────────────────────────────────────┘
+                │
+                ▼
+┌─────────────────────────────────────────────────────────────┐
+│ pkg/obitools/macommande/<implementation>.go                 │
+│  - Récupération des options via CLI*() getters             │
+│  - Application de la logique métier                         │
+│  - Retour d'un nouvel iterator                              │
+└─────────────────────────────────────────────────────────────┘
+                │
+                ▼ obiiter.IBioSequence
+┌─────────────────────────────────────────────────────────────┐
+│ 5. Écriture des résultats                                   │
+│    CLIWriteBioSequences(resultat, true)                     │
+└─────────────────────────────────────────────────────────────┘
+                │
+                ▼
+┌─────────────────────────────────────────────────────────────┐
+│ pkg/obitools/obiconvert/sequence_writer.go                  │
+│  - WriteSequencesToFile() / WriteSequencesToStdout()        │
+│  - Support: FASTA, FASTQ, JSON                              │
+│  - Gestion des lectures pairées                             │
+│  - Compression optionnelle                                  │
+└─────────────────────────────────────────────────────────────┘
+                │
+                ▼
+┌─────────────────────────────────────────────────────────────┐
+│ 6. Attente de fin du pipeline                               │
+│    obiutils.WaitForLastPipe()                               │
+└─────────────────────────────────────────────────────────────┘
+```
+
+## Conclusion
+
+L'architecture des commandes OBITools est conçue pour :
+
+1. **Maximiser la réutilisation** : `obiconvert` fournit les fonctionnalités communes
+2. **Simplifier l'ajout de nouvelles commandes** : pattern standardisé et minimaliste
+3. **Faciliter la maintenance** : séparation claire des responsabilités
+4. **Garantir la cohérence** : conventions de nommage et structure uniforme
+5. **Optimiser les performances** : parallélisation intégrée et traitement par batch
+
+Cette architecture modulaire permet de créer rapidement de nouvelles commandes tout en maintenant une qualité et une cohérence élevées dans toute la suite OBITools.
--- a/blackboard/architechture/definition-superkmer.md
+++ b/blackboard/architechture/definition-superkmer.md
@@ -0,0 +1,99 @@
+# Définition du super k-mer
+
+## Définition
+
+Un **super k-mer** est une **sous-séquence MAXIMALE** d'une séquence dans laquelle **tous les k-mers consécutifs partagent le même minimiseur**.
+
+### Termes
+
+- **k-mer** : sous-séquence de longueur k
+- **minimiseur** : le plus petit m-mer canonique parmi tous les m-mers d'un k-mer
+- **k-mers consécutifs** : k-mers aux positions i et i+1 (chevauchement de k-1 nucléotides)
+- **MAXIMALE** : ne peut être étendue ni à gauche ni à droite
+
+## RÈGLES ABSOLUES
+
+### RÈGLE 1 : Longueur minimum = k
+
+Un super k-mer contient au minimum k nucléotides.
+
+```
+longueur(super-kmer) >= k
+```
+
+### RÈGLE 2 : Chevauchement obligatoire = k-1
+
+Deux super-kmers consécutifs se chevauchent d'EXACTEMENT k-1 nucléotides.
+
+```
+SK1.End - SK2.Start = k - 1
+```
+
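+### RÈGLE 3 : Unicité du minimiseur (séquence → minimiseur)
+
+Une séquence de super k-mer a UN et UN SEUL minimiseur (la réciproque est fausse : un même minimiseur peut être porté par plusieurs super k-mers distincts).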
+
+```
+Même séquence → Même minimiseur (TOUJOURS)
+```
+
+**Si vous observez la même séquence avec deux minimiseurs différents, c'est un BUG.**
+
+### RÈGLE 4 : Tous les k-mers partagent le minimiseur
+
+TOUS les k-mers contenus dans un super k-mer ont le même minimiseur.
+
+```
+∀ k-mer K dans SK : minimiseur(K) = SK.minimizer
+```
+
+### RÈGLE 5 : Maximalité
+
+Un super k-mer ne peut pas être étendu sans changer de minimiseur (ou sans sortir de la séquence).
+
+- Si on ajoute un nucléotide à gauche : le nouveau k-mer a un minimiseur différent
+- Si on ajoute un nucléotide à droite : le nouveau k-mer a un minimiseur différent
+
+## VIOLATIONS INTERDITES
+
+❌ **Super k-mer de longueur < k**
+❌ **Chevauchement ≠ k-1 entre consécutifs**
+❌ **Même séquence avec minimiseurs différents**
+❌ **K-mer dans le super k-mer avec minimiseur différent**
+❌ **Super k-mer extensible (non-maximal)**
+
+## CONSÉQUENCES PRATIQUES
+
+### Pour l'extraction
+
+L'algorithme doit :
+1. Calculer le minimiseur de chaque k-mer
+2. Découper quand le minimiseur change
+3. Assigner au super k-mer le minimiseur commun à tous ses k-mers
+4. Garantir que chaque super k-mer contient au moins k nucléotides
+5. Garantir le chevauchement de k-1 entre consécutifs
+
+### Pour la validation
+
+Si après déduplication (obiuniq) on observe :
+```
+Séquence: ACGT...
+Minimiseurs: {M1, M2}  // plusieurs minimiseurs
+```
+
+C'est la PREUVE d'un bug : l'algorithme a produit cette séquence avec des minimiseurs différents, ce qui viole la RÈGLE 3.
+
+## DIAGNOSTIC DU BUG
+
+**Bug observé** : Même séquence avec minimiseurs différents après obiuniq
+
+**Cause possible** : L'algorithme assigne le mauvais minimiseur OU découpe mal les super-kmers
+
+**Ce que le bug NE PEUT PAS être** :
+- Un problème d'obiuniq (révèle le bug, ne le crée pas)
+- Un problème de chevauchement légitime (k-1 est correct)
+
+**Ce que le bug DOIT être** :
+- Minimiseur mal calculé ou mal assigné
+- Découpage incorrect (mauvais endPos)
+- Copie incorrecte des données
--- a/blackboard/architechture/guide-redaction-obitest.md
+++ b/blackboard/architechture/guide-redaction-obitest.md
@@ -0,0 +1,316 @@
+# Guide de rédaction d'un obitest
+
+## Règles essentielles
+
+1. **Données < 1 KB** - Fichiers de test très petits
+2. **Exécution < 10 sec** - Tests rapides pour CI/CD
+3. **Auto-contenu** - Pas de dépendances externes
+4. **Auto-nettoyage** - Pas de fichiers résiduels
+
+## Structure minimale
+
+```
+obitests/obitools/<commande>/
+├── test.sh          # Script exécutable
+└── data.fasta       # Données minimales (optionnel)
+```
+
+## Template de test.sh
+
+```bash
+#!/bin/bash
+
+TEST_NAME=<commande>
+CMD=<commande>
+
+TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
+OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
+export PATH="${OBITOOLS_DIR}:${PATH}"
+
+MCMD="$(echo "${CMD:0:4}" | tr '[:lower:]' '[:upper:]')$(echo "${CMD:4}" | tr '[:upper:]' '[:lower:]')"
+
+TMPDIR="$(mktemp -d)"
+ntest=0
+success=0
+failed=0
+
+cleanup() {
+    echo "========================================" 1>&2
+    echo "## Results of the $TEST_NAME tests:" 1>&2
+    echo 1>&2
+    echo "- $ntest tests run" 1>&2
+    echo "- $success successfully completed" 1>&2
+    echo "- $failed failed tests" 1>&2
+    echo 1>&2
+    echo "Cleaning up the temporary directory..." 1>&2
+    echo 1>&2
+    echo "========================================" 1>&2
+
+    rm -rf "$TMPDIR"
+
+    if [ $failed -gt 0 ]; then
+       log "$TEST_NAME tests failed" 
+        log
+        log
+       exit 1
+    fi
+
+    log
+    log
+    exit 0
+}
+
+log() {
+    echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
+}
+
+log "Testing $TEST_NAME..." 
+log "Test directory is $TEST_DIR" 
+log "obitools directory is $OBITOOLS_DIR" 
+log "Temporary directory is $TMPDIR" 
+log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
+
+########## TESTS ##########
+
+# Test 1: Help (OBLIGATOIRE)
+((ntest++))
+if $CMD -h > "${TMPDIR}/help.txt" 2>&1 
+then
+    log "$MCMD: printing help OK" 
+    ((success++))
+else
+    log "$MCMD: printing help failed" 
+    ((failed++))
+fi
+
+# Ajoutez vos tests ici...
+
+###########################
+
+cleanup
+```
+
+## Pattern de test
+
+```bash
+((ntest++))
+if commande args > "${TMPDIR}/output.txt" 2>&1
+then
+    log "$MCMD: description OK" 
+    ((success++))
+else
+    log "$MCMD: description failed"
+    ((failed++))
+fi
+```
+
+## Tests courants
+
+### Exécution basique
+```bash
+((ntest++))
+if $CMD "${TEST_DIR}/input.fasta" > "${TMPDIR}/output.fasta" 2>&1
+then
+    log "$MCMD: basic execution OK" 
+    ((success++))
+else
+    log "$MCMD: basic execution failed"
+    ((failed++))
+fi
+```
+
+### Sortie non vide
+```bash
+((ntest++))
+if [ -s "${TMPDIR}/output.fasta" ]
+then
+    log "$MCMD: output not empty OK"
+    ((success++))
+else
+    log "$MCMD: output empty - failed"
+    ((failed++))
+fi
+```
+
+### Comptage
+```bash
+((ntest++))
+count=$(grep -c "^>" "${TMPDIR}/output.fasta")
+if [ "$count" -gt 0 ]
+then
+    log "$MCMD: extracted $count sequences OK"
+    ((success++))
+else
+    log "$MCMD: no sequences - failed"
+    ((failed++))
+fi
+```
+
+### Présence de contenu
+```bash
+((ntest++))
+if grep -q "expected_string" "${TMPDIR}/output.fasta"
+then
+    log "$MCMD: expected content found OK"
+    ((success++))
+else
+    log "$MCMD: content not found - failed"
+    ((failed++))
+fi
+```
+
+### Comparaison avec référence
+```bash
+((ntest++))
+if diff "${TEST_DIR}/expected.fasta" "${TMPDIR}/output.fasta" > /dev/null
+then
+    log "$MCMD: matches reference OK"
+    ((success++))
+else
+    log "$MCMD: differs from reference - failed"
+    ((failed++))
+fi
+```
+
+### Test avec options
+```bash
+((ntest++))
+if $CMD --opt value "${TEST_DIR}/input.fasta" > "${TMPDIR}/out.fasta" 2>&1
+then
+    log "$MCMD: with option OK" 
+    ((success++))
+else
+    log "$MCMD: with option failed"
+    ((failed++))
+fi
+```
+
+## Variables importantes
+
+- **TEST_DIR** - Répertoire du test (données d'entrée)
+- **TMPDIR** - Répertoire temporaire (sorties)
+- **CMD** - Nom de la commande
+- **MCMD** - Nom formaté pour les logs
+
+## Règles d'or
+
+✅ **Entrées** → `${TEST_DIR}/`
+✅ **Sorties** → `${TMPDIR}/`
+✅ **Toujours rediriger** → `> file 2>&1`
+✅ **Incrémenter ntest** → Avant chaque test
+✅ **Messages clairs** → Descriptions explicites
+
+❌ **Pas de chemins en dur**
+❌ **Pas de /tmp direct**
+❌ **Pas de sortie vers TEST_DIR**
+❌ **Pas de commandes sans redirection**
+
+## Données de test
+
+Créer un fichier minimal (< 500 bytes) :
+
+```fasta
+>seq1
+ACGTACGTACGTACGT
+>seq2
+AAAACCCCGGGGTTTT
+>seq3
+ATCGATCGATCGATCG
+```
+
+## Création rapide
+
+```bash
+# 1. Créer le répertoire
+mkdir -p obitests/obitools/<commande>
+cd obitests/obitools/<commande>
+
+# 2. Créer les données de test
+cat > test_data.fasta << 'EOF'
+>seq1
+ACGTACGTACGTACGT
+>seq2
+AAAACCCCGGGGTTTT
+EOF
+
+# 3. Copier le template dans test.sh
+# 4. Adapter le TEST_NAME et CMD
+# 5. Ajouter les tests
+# 6. Rendre exécutable
+chmod +x test.sh
+
+# 7. Tester
+./test.sh
+```
+
+## Checklist
+
+- [ ] `test.sh` exécutable (`chmod +x`)
+- [ ] Test d'aide inclus
+- [ ] Données < 1 KB
+- [ ] Sorties vers `${TMPDIR}/`
+- [ ] Entrées depuis `${TEST_DIR}/`
+- [ ] Redirections `2>&1`
+- [ ] Messages clairs
+- [ ] Testé localement
+- [ ] Exit code 0 si succès
+
+## Debug
+
+Conserver TMPDIR pour inspection :
+```bash
+cleanup() {
+    echo "Temporary directory: $TMPDIR" 1>&2
+    # rm -rf "$TMPDIR"  # Commenté
+    ...
+}
+```
+
+Mode verbose :
+```bash
+set -x  # Au début du script
+```
+
+## Exemples
+
+**Simple (1 test)** - obimicrosat
+```bash
+# Juste l'aide
+```
+
+**Moyen (4-5 tests)** - obisuperkmer
+```bash
+# Aide + exécution + validation sortie + contenu
+```
+
+**Complet (7+ tests)** - obiuniq
+```bash
+# Aide + exécution + comparaison CSV + options + multiples cas
+```
+
+## Commandes utiles
+
+```bash
+# Compter séquences
+grep -c "^>" file.fasta
+
+# Fichier non vide
+[ -s file ]
+
+# Comparer
+diff file1 file2 > /dev/null
+
+# Comparer compressés
+zdiff file1.gz file2.gz
+
+# Compter bases
+grep -v "^>" file | tr -d '\n' | wc -c
+```
+
+## Ce qu'il faut retenir
+
+Un bon test est **COURT**, **RAPIDE** et **SIMPLE** :
+- 3-10 tests maximum
+- Données < 1 KB
+- Exécution < 10 secondes
+- Pattern standard respecté
--- a/blackboard/architechture/obisuperkmer-implementation.md
+++ b/blackboard/architechture/obisuperkmer-implementation.md
@@ -0,0 +1,268 @@
+# Implémentation de la commande obisuperkmer
+
+## Vue d'ensemble
+
+La commande `obisuperkmer` a été implémentée en suivant l'architecture standard des commandes OBITools décrite dans `architecture-commande-obitools.md`. Cette commande permet d'extraire les super k-mers de fichiers de séquences biologiques.
+
+## Qu'est-ce qu'un super k-mer ?
+
+Un super k-mer est une sous-séquence maximale dans laquelle tous les k-mers consécutifs partagent le même minimiseur. Cette décomposition est utile pour :
+- L'indexation efficace de k-mers
+- La réduction de la redondance dans les analyses
+- L'optimisation de la mémoire pour les structures de données de k-mers
+
+## Structure de l'implémentation
+
+### 1. Package `pkg/obitools/obisuperkmer/`
+
+Le package contient trois fichiers :
+
+#### `obisuperkmer.go`
+Documentation du package avec une description de son rôle.
+
+#### `options.go`
+Définit les options de ligne de commande :
+
+```go
+var _KmerSize = 21          // Taille des k-mers (par défaut 21)
+var _MinimizerSize = 11     // Taille des minimiseurs (par défaut 11)
+```
+
+**Options CLI disponibles :**
+- `--kmer-size` / `-k` : Taille des k-mers (entre m+1 et 31)
+- `--minimizer-size` / `-m` : Taille des minimiseurs (entre 1 et k-1)
+
+**Fonctions d'accès :**
+- `CLIKmerSize()` : retourne la taille des k-mers
+- `CLIMinimizerSize()` : retourne la taille des minimiseurs
+- `SetKmerSize(k int)` : définit la taille des k-mers
+- `SetMinimizerSize(m int)` : définit la taille des minimiseurs
+
+#### `superkmer.go`
+Implémente la logique de traitement :
+
+```go
+func CLIExtractSuperKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence
+```
+
+Cette fonction :
+1. Récupère les paramètres k et m depuis les options CLI
+2. Valide les paramètres (m < k, k <= 31, etc.)
+3. Crée un worker utilisant `obikmer.SuperKmerWorker(k, m)`
+4. Applique le worker en parallèle sur l'itérateur de séquences
+5. Retourne un itérateur de super k-mers
+
+### 2. Exécutable `cmd/obitools/obisuperkmer/main.go`
+
+L'exécutable suit le pattern standard minimal :
+
+```go
+func main() {
+    // 1. Génération du parser d'options
+    optionParser := obioptions.GenerateOptionParser(
+        "obisuperkmer",
+        "extract super k-mers from sequence files",
+        obisuperkmer.OptionSet)
+    
+    // 2. Parsing des arguments
+    _, args := optionParser(os.Args)
+    
+    // 3. Lecture des séquences
+    sequences, err := obiconvert.CLIReadBioSequences(args...)
+    obiconvert.OpenSequenceDataErrorMessage(args, err)
+    
+    // 4. Extraction des super k-mers
+    superkmers := obisuperkmer.CLIExtractSuperKmers(sequences)
+    
+    // 5. Écriture des résultats
+    obiconvert.CLIWriteBioSequences(superkmers, true)
+    
+    // 6. Attente de la fin du pipeline
+    obiutils.WaitForLastPipe()
+}
+```
+
+## Utilisation du package `obikmer`
+
+L'implémentation s'appuie sur le package `obikmer` qui fournit :
+
+### `SuperKmerWorker(k int, m int) obiseq.SeqWorker`
+
+Crée un worker qui :
+- Extrait les super k-mers d'une BioSequence
+- Retourne une slice de BioSequence, une par super k-mer
+- Chaque super k-mer contient les attributs suivants :
+
+```go
+// Métadonnées ajoutées à chaque super k-mer :
+{
+    "minimizer_value": uint64,  // Valeur canonique du minimiseur
+    "minimizer_seq": string,    // Séquence ADN du minimiseur
+    "k": int,                   // Taille des k-mers utilisée
+    "m": int,                   // Taille des minimiseurs utilisée
+    "start": int,               // Position de début (0-indexé)
+    "end": int,                 // Position de fin (exclusif)
+    "parent_id": string,        // ID de la séquence parente
+}
+```
+
+### Algorithme sous-jacent
+
+Le package `obikmer` utilise :
+- `IterSuperKmers(seq []byte, k int, m int)` : itérateur sur les super k-mers
+- Une deque monotone pour suivre les minimiseurs dans une fenêtre glissante
+- Complexité temporelle : O(n) où n est la longueur de la séquence
+- Complexité spatiale : O(k-m+1) pour la deque
+
+## Exemple d'utilisation
+
+### Ligne de commande
+
+```bash
+# Extraction avec paramètres par défaut (k=21, m=11)
+obisuperkmer sequences.fasta > superkmers.fasta
+
+# Spécifier les tailles de k-mers et minimiseurs
+obisuperkmer -k 25 -m 13 sequences.fasta -o superkmers.fasta
+
+# Avec plusieurs fichiers d'entrée
+obisuperkmer --kmer-size 31 --minimizer-size 15 file1.fasta file2.fasta > output.fasta
+
+# Format FASTQ en entrée, FASTA en sortie
+obisuperkmer sequences.fastq --fasta-output -o superkmers.fasta
+
+# Avec compression
+obisuperkmer sequences.fasta -o superkmers.fasta.gz --compress
+```
+
+### Exemple de sortie
+
+Pour une séquence d'entrée :
+```
+>seq1
+ACGTACGTACGTACGTACGTACGT
+```
+
+La sortie contiendra plusieurs super k-mers (exemple schématique : les valeurs sont illustratives et ne proviennent pas d'une exécution réelle ; avec k=21, chaque super k-mer réel compte au moins 21 nucléotides) :
+```
+>seq1_superkmer_0_15 {"minimizer_value":123456,"minimizer_seq":"acgtacgt","k":21,"m":11,"start":0,"end":15,"parent_id":"seq1"}
+ACGTACGTACGTACG
+>seq1_superkmer_8_24 {"minimizer_value":789012,"minimizer_seq":"gtacgtac","k":21,"m":11,"start":8,"end":24,"parent_id":"seq1"}
+ACGTACGTACGTACGT
+```
+
+## Options héritées de `obiconvert`
+
+La commande hérite de toutes les options standard d'OBITools :
+
+### Options d'entrée
+- `--fasta` : forcer le format FASTA
+- `--fastq` : forcer le format FASTQ
+- `--ecopcr` : format ecoPCR
+- `--embl` : format EMBL
+- `--genbank` : format GenBank
+- `--input-json-header` : en-têtes JSON
+- `--input-OBI-header` : en-têtes OBI
+
+### Options de sortie
+- `--out` / `-o` : fichier de sortie (défaut : stdout)
+- `--fasta-output` : sortie en format FASTA
+- `--fastq-output` : sortie en format FASTQ
+- `--json-output` : sortie en format JSON
+- `--output-json-header` : en-têtes JSON en sortie
+- `--output-OBI-header` / `-O` : en-têtes OBI en sortie
+- `--compress` / `-Z` : compression gzip
+- `--skip-empty` : ignorer les séquences vides
+- `--no-progressbar` : désactiver la barre de progression
+
+## Compilation
+
+Pour compiler la commande :
+
+```bash
+cd /chemin/vers/obitools4
+go build -o bin/obisuperkmer ./cmd/obitools/obisuperkmer/
+```
+
+## Tests
+
+Pour tester la commande :
+
+```bash
+# Créer un fichier de test
+echo -e ">test\nACGTACGTACGTACGTACGTACGTACGTACGT" > test.fasta
+
+# Exécuter obisuperkmer
+obisuperkmer test.fasta
+
+# Vérifier avec des paramètres différents
+obisuperkmer -k 15 -m 7 test.fasta
+```
+
+## Validation des paramètres
+
+La commande valide automatiquement :
+- `1 <= m < k` : le minimiseur doit être plus petit que le k-mer
+- `2 <= k <= 31` : contrainte du codage sur 64 bits
+- `len(sequence) >= k` : la séquence doit être assez longue
+
+En cas de paramètres invalides, la commande affiche une erreur explicite et s'arrête.
+
+## Intégration avec le pipeline OBITools
+
+La commande s'intègre naturellement dans les pipelines OBITools :
+
+```bash
+# Pipeline complet d'analyse
+obiconvert sequences.fastq --fasta-output | \
+  obisuperkmer -k 21 -m 11 | \
+  obiuniq | \
+  obigrep -p "minimizer_value>1000" > filtered_superkmers.fasta
+```
+
+## Parallélisation
+
+La commande utilise automatiquement :
+- `obidefault.ParallelWorkers()` pour le traitement parallèle
+- Les workers sont distribués sur les séquences d'entrée
+- La parallélisation est transparente pour l'utilisateur
+
+## Conformité avec l'architecture OBITools
+
+L'implémentation respecte tous les principes de l'architecture :
+
+✅ Séparation des responsabilités (package + commande)
+✅ Convention de nommage cohérente (CLI*, Set*, _variables)
+✅ Réutilisation de `obiconvert` pour l'I/O
+✅ Options standard partagées
+✅ Pattern Worker pour le traitement
+✅ Validation des paramètres
+✅ Logging avec `logrus`
+✅ Gestion d'erreurs cohérente
+✅ Documentation complète
+
+## Fichiers créés
+
+```
+pkg/obitools/obisuperkmer/
+├── obisuperkmer.go      # Documentation du package
+├── options.go           # Définition des options CLI
+└── superkmer.go         # Implémentation du traitement
+
+cmd/obitools/obisuperkmer/
+└── main.go              # Point d'entrée de la commande
+```
+
+## Prochaines étapes
+
+1. **Compilation** : Compiler la commande avec `go build`
+2. **Tests unitaires** : Créer des tests dans `pkg/obitools/obisuperkmer/superkmer_test.go`
+3. **Documentation utilisateur** : Ajouter la documentation de la commande
+4. **Intégration CI/CD** : Ajouter aux tests d'intégration
+5. **Benchmarks** : Mesurer les performances sur différents jeux de données
+
+## Références
+
+- Architecture des commandes OBITools : `architecture-commande-obitools.md`
+- Package `obikmer` : `pkg/obikmer/`
+- Tests du package : `pkg/obikmer/superkmer_iter_test.go`
--- a/blackboard/architechture/obisuperkmer-tests.md
+++ b/blackboard/architechture/obisuperkmer-tests.md
@@ -0,0 +1,440 @@
+# Tests automatisés pour obisuperkmer
+
+## Vue d'ensemble
+
+Des tests automatisés ont été créés pour la commande `obisuperkmer` dans le répertoire `obitests/obitools/obisuperkmer/`. Ces tests suivent le pattern standard utilisé par toutes les commandes OBITools et sont conçus pour être exécutés dans un environnement CI/CD.
+
+## Fichiers créés
+
+```
+obitests/obitools/obisuperkmer/
+├── test.sh                    # Script de test principal (6.7 KB)
+├── test_sequences.fasta       # Données de test (117 bytes)
+└── README.md                  # Documentation (4.1 KB)
+```
+
+### Taille totale : ~11 KB
+
+Cette taille minimale est idéale pour un dépôt Git et des tests CI/CD rapides.
+
+## Jeu de données de test
+
+### Fichier : `test_sequences.fasta` (117 bytes)
+
+Le fichier contient 3 séquences de 32 nucléotides chacune :
+
+```fasta
+>seq1
+ACGTACGTACGTACGTACGTACGTACGTACGT
+>seq2
+AAAACCCCGGGGTTTTAAAACCCCGGGGTTTT
+>seq3
+ATCGATCGATCGATCGATCGATCGATCGATCG
+```
+
+#### Justification du choix
+
+1. **seq1** : Motif répétitif simple (ACGT)
+   - Teste l'extraction de super k-mers sur une séquence avec faible complexité
+   - Les minimiseurs devraient être assez réguliers
+
+2. **seq2** : Blocs homopolymères
+   - Teste le comportement avec des régions de très faible complexité
+   - Les minimiseurs varieront entre les blocs A, C, G et T
+
+3. **seq3** : Motif différent (ATCG)
+   - Teste la diversité des super k-mers extraits
+   - Différent de seq1 pour vérifier la distinction
+
+#### Caractéristiques
+
+- **Longueur** : 32 nucléotides par séquence
+- **Taille totale** : 96 nucléotides (3 × 32)
+- **Format** : FASTA avec en-têtes JSON compatibles
+- **Alphabet** : A, C, G, T uniquement (pas de bases ambiguës)
+- **Taille du fichier** : 117 bytes
+
+Avec k=21 (défaut), chaque séquence de 32 bp peut produire :
+- 32 - 21 + 1 = 12 k-mers
+- Plusieurs super k-mers selon les minimiseurs
+
+## Script de test : `test.sh`
+
+### Structure
+
+Le script suit le pattern standard OBITools :
+
+```bash
+#!/bin/bash
+
+TEST_NAME=obisuperkmer
+CMD=obisuperkmer
+
+# Variables et fonctions standard
+TEST_DIR="..."
+OBITOOLS_DIR="..."
+TMPDIR="$(mktemp -d)"
+ntest=0
+success=0
+failed=0
+
+cleanup() { ... }
+log() { ... }
+
+# Tests (12 au total)
+# ...
+
+cleanup
+```
+
+### Tests implémentés
+
+#### 1. Test d'aide (`-h`)
+```bash
+obisuperkmer -h
+```
+Vérifie que la commande peut afficher son aide sans erreur.
+
+#### 2. Extraction basique avec paramètres par défaut
+```bash
+obisuperkmer test_sequences.fasta > output_default.fasta
+```
+Teste l'exécution avec k=21, m=11 (défaut).
+
+#### 3. Vérification de sortie non vide
+```bash
+[ -s output_default.fasta ]
+```
+S'assure que la commande produit un résultat.
+
+#### 4. Comptage des super k-mers
+```bash
+grep -c "^>" output_default.fasta
+```
+Vérifie qu'au moins un super k-mer a été extrait.
+
+#### 5. Présence des métadonnées
+```bash
+grep -q "minimizer_value" output_default.fasta
+grep -q "minimizer_seq" output_default.fasta
+grep -q "parent_id" output_default.fasta
+```
+Vérifie que les attributs requis sont présents.
+
+#### 6. Extraction avec paramètres personnalisés
+```bash
+obisuperkmer -k 15 -m 7 test_sequences.fasta > output_k15_m7.fasta
+```
+Teste la configuration de k et m.
+
+#### 7. Validation des paramètres personnalisés
+```bash
+grep -q '"k":15' output_k15_m7.fasta
+grep -q '"m":7' output_k15_m7.fasta
+```
+Vérifie que les paramètres sont correctement enregistrés.
+
+#### 8. Format de sortie FASTA
+```bash
+obisuperkmer --fasta-output test_sequences.fasta > output_fasta.fasta
+```
+Teste l'option de format explicite.
+
+#### 9. Vérification des IDs
+```bash
+grep "^>" output_default.fasta | grep -q "superkmer"
+```
+S'assure que les IDs contiennent "superkmer".
+
+#### 10. Préservation des IDs parents
+```bash
+grep -q "seq1" output_default.fasta
+grep -q "seq2" output_default.fasta
+grep -q "seq3" output_default.fasta
+```
+Vérifie que les IDs des séquences parentes sont préservés.
+
+#### 11. Option de fichier de sortie (`-o`)
+```bash
+obisuperkmer -o output_file.fasta test_sequences.fasta
+```
+Teste la redirection vers un fichier.
+
+#### 12. Vérification de création du fichier
+```bash
+[ -s output_file.fasta ]
+```
+S'assure que le fichier a été créé.
+
+#### 13. Cohérence des longueurs
+```bash
+# Vérifie que longueur(output) <= longueur(input)
+```
+S'assure que les super k-mers ne sont pas plus longs que l'entrée.
+
+### Compteurs
+
+- **ntest** : Nombre de tests exécutés
+- **success** : Nombre de tests réussis
+- **failed** : Nombre de tests échoués
+
+### Sortie du script
+
+#### En cas de succès
+```
+========================================
+## Results of the obisuperkmer tests:
+
+- 12 tests run
+- 12 successfully completed
+- 0 failed tests
+
+Cleaning up the temporary directory...
+
+========================================
+```
+
+Exit code : **0**
+
+#### En cas d'échec
+```
+========================================
+## Results of the obisuperkmer tests:
+
+- 12 tests run
+- 10 successfully completed
+- 2 failed tests
+
+Cleaning up the temporary directory...
+
+========================================
+```
+
+Exit code : **1**
+
+## Intégration CI/CD
+
+### Exécution automatique
+
+Le script est conçu pour être exécuté automatiquement dans un pipeline CI/CD :
+
+1. Le build produit l'exécutable dans `build/obisuperkmer`
+2. Le script de test ajoute `build/` au PATH
+3. Les tests s'exécutent
+4. Le code de retour indique le succès (0) ou l'échec (1)
+
+### Exemple de configuration CI/CD
+
+```yaml
+# .github/workflows/test.yml ou équivalent
+test-obisuperkmer:
+  runs-on: ubuntu-latest
+  steps:
+    - uses: actions/checkout@v4
+    - name: Build obitools
+      run: make build
+    - name: Test obisuperkmer
+      run: ./obitests/obitools/obisuperkmer/test.sh
+```
+
+### Avantages
+
+✅ **Rapidité** : Données de test minimales (117 bytes)
+✅ **Fiabilité** : Tests reproductibles
+✅ **Isolation** : Utilisation d'un répertoire temporaire
+✅ **Nettoyage automatique** : Pas de fichiers résiduels
+✅ **Logging** : Messages horodatés et détaillés
+✅ **Compatibilité** : Pattern standard OBITools
+
+## Exécution locale
+
+### Prérequis
+
+1. Compiler obisuperkmer :
+   ```bash
+   cd /chemin/vers/obitools4
+   go build -o build/obisuperkmer ./cmd/obitools/obisuperkmer/
+   ```
+
+2. Se placer dans le répertoire de test :
+   ```bash
+   cd obitests/obitools/obisuperkmer
+   ```
+
+3. Exécuter le script :
+   ```bash
+   ./test.sh
+   ```
+
+### Exemple de sortie
+
+```
+[obisuperkmer @ Fri Feb  7 13:00:00 CET 2026] Testing obisuperkmer...
+[obisuperkmer @ Fri Feb  7 13:00:00 CET 2026] Test directory is /path/to/obitests/obitools/obisuperkmer
+[obisuperkmer @ Fri Feb  7 13:00:00 CET 2026] obitools directory is /path/to/build
+[obisuperkmer @ Fri Feb  7 13:00:00 CET 2026] Temporary directory is /tmp/tmp.abc123
+[obisuperkmer @ Fri Feb  7 13:00:00 CET 2026] files: README.md test.sh test_sequences.fasta
+[obisuperkmer @ Fri Feb  7 13:00:01 CET 2026] OBISuperkmer: printing help OK
+[obisuperkmer @ Fri Feb  7 13:00:02 CET 2026] OBISuperkmer: basic extraction with default parameters OK
+[obisuperkmer @ Fri Feb  7 13:00:02 CET 2026] OBISuperkmer: output file is not empty OK
+[obisuperkmer @ Fri Feb  7 13:00:02 CET 2026] OBISuperkmer: extracted 8 super k-mers OK
+[obisuperkmer @ Fri Feb  7 13:00:02 CET 2026] OBISuperkmer: super k-mers contain required metadata OK
+[obisuperkmer @ Fri Feb  7 13:00:03 CET 2026] OBISuperkmer: extraction with custom k=15, m=7 OK
+[obisuperkmer @ Fri Feb  7 13:00:03 CET 2026] OBISuperkmer: custom parameters correctly set in metadata OK
+[obisuperkmer @ Fri Feb  7 13:00:03 CET 2026] OBISuperkmer: FASTA output format OK
+[obisuperkmer @ Fri Feb  7 13:00:03 CET 2026] OBISuperkmer: super k-mer IDs contain 'superkmer' OK
+[obisuperkmer @ Fri Feb  7 13:00:03 CET 2026] OBISuperkmer: parent sequence IDs preserved OK
+[obisuperkmer @ Fri Feb  7 13:00:04 CET 2026] OBISuperkmer: output to file with -o option OK
+[obisuperkmer @ Fri Feb  7 13:00:04 CET 2026] OBISuperkmer: output file created with -o option OK
+[obisuperkmer @ Fri Feb  7 13:00:04 CET 2026] OBISuperkmer: super k-mer total length <= input length OK
+========================================
+## Results of the obisuperkmer tests:
+
+- 12 tests run
+- 12 successfully completed
+- 0 failed tests
+
+Cleaning up the temporary directory...
+
+========================================
+```
+
+## Debugging des tests
+
+### Conserver les fichiers temporaires
+
+Modifier temporairement la fonction `cleanup()` :
+
+```bash
+cleanup() {
+    echo "Temporary directory: $TMPDIR" 1>&2
+    # Commenter cette ligne pour conserver les fichiers
+    # rm -rf "$TMPDIR"
+    ...
+}
+```
+
+### Activer le mode verbose
+
+Ajouter au début du script :
+
+```bash
+set -x  # Active l'affichage de toutes les commandes
+```
+
+### Tester une seule commande
+
+Extraire et exécuter manuellement :
+
+```bash
+export TEST_DIR=/chemin/vers/obitests/obitools/obisuperkmer
+export TMPDIR=$(mktemp -d)
+obisuperkmer "${TEST_DIR}/test_sequences.fasta" > "${TMPDIR}/output.fasta"
+cat "${TMPDIR}/output.fasta"
+```
+
+## Ajout de nouveaux tests
+
+Pour ajouter un test supplémentaire :
+
+1. Incrémenter le compteur `ntest`
+2. Écrire la condition de test
+3. Logger le succès ou l'échec
+4. Incrémenter le bon compteur
+
+```bash
+((ntest++))
+if ma_nouvelle_commande_de_test
+then
+    log "Description du test: OK" 
+    ((success++))
+else
+    log "Description du test: failed"
+    ((failed++))
+fi
+```
+
+## Comparaison avec d'autres tests
+
+### Taille des données de test
+
+| Commande | Taille des données | Nombre de fichiers |
+|----------|-------------------|-------------------|
+| obiconvert | 925 KB | 1 fichier |
+| obiuniq | ~600 bytes | 4 fichiers |
+| obimicrosat | 0 bytes | 0 fichiers (génère à la volée) |
+| **obisuperkmer** | **117 bytes** | **1 fichier** |
+
+Notre test `obisuperkmer` est parmi les plus légers, ce qui est optimal pour CI/CD.
+
+### Nombre de tests
+
+| Commande | Nombre de tests |
+|----------|----------------|
+| obiconvert | 3 tests |
+| obiuniq | 7 tests |
+| obimicrosat | 1 test |
+| **obisuperkmer** | **12 tests** |
+
+Notre test `obisuperkmer` offre une couverture complète avec 12 tests différents.
+
+## Couverture de test
+
+Les tests couvrent :
+
+✅ Affichage de l'aide  
+✅ Exécution basique  
+✅ Paramètres par défaut (k=21, m=11)  
+✅ Paramètres personnalisés (k=15, m=7)  
+✅ Formats de sortie (FASTA)  
+✅ Redirection vers fichier (`-o`)  
+✅ Présence des métadonnées  
+✅ Validation des IDs  
+✅ Préservation des IDs parents  
+✅ Cohérence des longueurs  
+✅ Production de résultats non vides  
+
+## Maintenance
+
+### Mise à jour des tests
+
+Si l'implémentation de `obisuperkmer` change :
+
+1. Vérifier que les tests existants passent toujours
+2. Ajouter de nouveaux tests pour les nouvelles fonctionnalités
+3. Mettre à jour `README.md` si nécessaire
+4. Documenter les changements
+
+### Vérification régulière
+
+Exécuter périodiquement :
+
+```bash
+cd obitests/obitools/obisuperkmer
+./test.sh
+```
+
+Ou via l'ensemble des tests :
+
+```bash
+cd obitests
+for dir in obitools/*/; do
+    if [ -f "$dir/test.sh" ]; then
+        echo "Testing $(basename $dir)..."
+        (cd "$dir" && ./test.sh) || echo "FAILED: $(basename $dir)"
+    fi
+done
+```
+
+## Conclusion
+
+Les tests pour `obisuperkmer` sont :
+
+- ✅ **Complets** : 12 tests couvrant toutes les fonctionnalités principales
+- ✅ **Légers** : 117 bytes de données de test
+- ✅ **Rapides** : Exécution en quelques secondes
+- ✅ **Fiables** : Pattern éprouvé utilisé par toutes les commandes OBITools
+- ✅ **Maintenables** : Structure claire et documentée
+- ✅ **CI/CD ready** : Code de retour approprié et nettoyage automatique
+
+Ils garantissent que la commande fonctionne correctement à chaque commit et facilitent la détection précoce des régressions.
--- a/cmd/obitools/obik/main.go
+++ b/cmd/obitools/obik/main.go
@@ -0,0 +1,34 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"os"
+
+	log "github.com/sirupsen/logrus"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obik"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+func main() { // main is the entry point of the obik command: it parses the command line and dispatches to a subcommand.
+	defer obiseq.LogBioSeqStatus() // NOTE(review): deferred calls are skipped on the os.Exit and log.Fatalf paths below
+
+	opt, parser := obioptions.GenerateSubcommandParser( // build the option set and the command-line parser for obik
+		"obik",
+		"Manage disk-based kmer indices",
+		obik.OptionSet, // option definitions supplied by the obik package
+	)
+
+	_, remaining := parser(os.Args) // first return value is ignored; remaining is handed to Dispatch
+
+	err := opt.Dispatch(context.Background(), remaining) // presumably runs the subcommand selected by the remaining arguments
+	if err != nil {
+		if errors.Is(err, getoptions.ErrorHelpCalled) { // help was requested: exit successfully instead of reporting an error
+			os.Exit(0)
+		}
+		log.Fatalf("Error: %v", err) // any other dispatch error is fatal
+	}
+}
--- a/cmd/obitools/obilowmask/main.go
+++ b/cmd/obitools/obilowmask/main.go
@@ -1,47 +0,0 @@
-package main
-
-import (
-	"os"
-
-	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
-	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
-	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
-	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obilowmask"
-	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
-)
-
-func main() {
-
-	defer obiseq.LogBioSeqStatus()
-
-	// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
-	// f, err := os.Create("cpu.pprof")
-	// if err != nil {
-	// 	log.Fatal(err)
-	// }
-	// pprof.StartCPUProfile(f)
-	// defer pprof.StopCPUProfile()
-
-	// go tool trace cpu.trace
-	// ftrace, err := os.Create("cpu.trace")
-	// if err != nil {
-	// 	log.Fatal(err)
-	// }
-	// trace.Start(ftrace)
-	// defer trace.Stop()
-
-	optionParser := obioptions.GenerateOptionParser(
-		"obimicrosat",
-		"looks for microsatellites sequences in a sequence file",
-		obilowmask.OptionSet)
-
-	_, args := optionParser(os.Args)
-
-	sequences, err := obiconvert.CLIReadBioSequences(args...)
-	obiconvert.OpenSequenceDataErrorMessage(args, err)
-
-	selected := obilowmask.CLISequenceEntropyMasker(sequences)
-	obiconvert.CLIWriteBioSequences(selected, true)
-	obiutils.WaitForLastPipe()
-
-}
--- a/1
+++ b/1
--- a/go.mod
+++ b/go.mod
@@ -1,56 +1,50 @@
 module git.metabarcoding.org/obitools/obitools4/obitools4

-go 1.23.4
-
-toolchain go1.24.2
+go 1.26.1

 require (
-	github.com/DavidGamba/go-getoptions v0.28.0
-	github.com/PaesslerAG/gval v1.2.2
+	github.com/DavidGamba/go-getoptions v0.33.0
+	github.com/PaesslerAG/gval v1.2.4
 	github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df
 	github.com/buger/jsonparser v1.1.1
 	github.com/chen3feng/stl4go v0.1.1
-	github.com/dlclark/regexp2 v1.11.4
-	github.com/goccy/go-json v0.10.3
+	github.com/dlclark/regexp2 v1.11.5
+	github.com/goccy/go-json v0.10.6
 	github.com/klauspost/pgzip v1.2.6
 	github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58
+	github.com/pelletier/go-toml/v2 v2.2.4
 	github.com/rrethy/ahocorasick v1.0.0
-	github.com/schollz/progressbar/v3 v3.13.1
-	github.com/sirupsen/logrus v1.9.3
-	github.com/stretchr/testify v1.8.4
+	github.com/schollz/progressbar/v3 v3.19.0
+	github.com/sirupsen/logrus v1.9.4
+	github.com/stretchr/testify v1.10.0
 	github.com/tevino/abool/v2 v2.1.0
 	github.com/yuin/gopher-lua v1.1.1
-	golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa
-	gonum.org/v1/gonum v0.14.0
+	golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90
+	gonum.org/v1/gonum v0.17.0
 	gopkg.in/yaml.v3 v3.0.1
 	scientificgo.org/special v0.0.0
 )

 require (
-	github.com/RoaringBitmap/roaring v1.9.4 // indirect
-	github.com/bits-and-blooms/bitset v1.12.0 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
-	github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect
+	github.com/goombaio/orderedmap v0.0.0-20180925151256-3da0e2f905f9 // indirect
 	github.com/kr/pretty v0.3.1 // indirect
 	github.com/kr/text v0.2.0 // indirect
-	github.com/mschoch/smat v0.2.0 // indirect
-	github.com/pelletier/go-toml/v2 v2.2.4 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/rogpeppe/go-internal v1.12.0 // indirect
 )

 require (
 	github.com/dsnet/compress v0.0.1
-	github.com/gabriel-vasile/mimetype v1.4.3
+	github.com/gabriel-vasile/mimetype v1.4.13
 	github.com/goombaio/orderedset v0.0.0-20180925151225-8e67b20a9b77
-	github.com/klauspost/compress v1.17.2
-	github.com/mattn/go-runewidth v0.0.15 // indirect
+	github.com/klauspost/compress v1.18.4
+	github.com/mattn/go-runewidth v0.0.21 // indirect
 	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
-	github.com/rivo/uniseg v0.4.4 // indirect
-	github.com/shopspring/decimal v1.3.1 // indirect
-	github.com/ulikunitz/xz v0.5.11
-	golang.org/x/net v0.35.0 // indirect
-	golang.org/x/sys v0.30.0 // indirect
-	golang.org/x/term v0.29.0 // indirect
+	github.com/rivo/uniseg v0.4.7 // indirect
+	github.com/shopspring/decimal v1.4.0 // indirect
+	github.com/ulikunitz/xz v0.5.15
+	golang.org/x/sys v0.42.0 // indirect
+	golang.org/x/term v0.41.0 // indirect
 	gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c
 )
--- a/go.sum
+++ b/go.sum
@@ -1,40 +1,41 @@
-github.com/DavidGamba/go-getoptions v0.28.0 h1:18wgEvfZdrlfIhVDGEBO3Dl0fkOyXqXLa0tLMCKxM1c=
-github.com/DavidGamba/go-getoptions v0.28.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84=
-github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E=
-github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
+github.com/DavidGamba/go-getoptions v0.33.0 h1:8xCPH87Yy5avYenygyHVlqqm8RpymH0YFe4a7IWlarE=
+github.com/DavidGamba/go-getoptions v0.33.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84=
+github.com/PaesslerAG/gval v1.2.4 h1:rhX7MpjJlcxYwL2eTTYIOBUyEKZ+A96T9vQySWkVUiU=
+github.com/PaesslerAG/gval v1.2.4/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
 github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
 github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
-github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv2QzDdQ=
-github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
 github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0=
 github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM=
-github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
-github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
 github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
 github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
 github.com/chen3feng/stl4go v0.1.1 h1:0L1+mDw7pomftKDruM23f1mA7miavOj6C6MZeadzN2Q=
 github.com/chen3feng/stl4go v0.1.1/go.mod h1:5ml3psLgETJjRJnMbPE+JiHLrCpt+Ajc2weeTECXzWU=
+github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM=
+github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY=
+github.com/clipperhouse/uax29/v2 v2.2.0 h1:ChwIKnQN3kcZteTXMgb1wztSgaU+ZemkgWdohwgs8tY=
+github.com/clipperhouse/uax29/v2 v2.2.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM=
 github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
-github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
-github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
+github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZQ=
+github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
 github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q=
 github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo=
 github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
-github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
-github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk=
-github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA=
-github.com/goccy/go-json v0.10.3/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
-github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 h1:SajEQ6tktpF9SRIuzbiPOX9AEZZ53Bvw0k9Mzrts8Lg=
+github.com/gabriel-vasile/mimetype v1.4.13 h1:46nXokslUBsAJE/wMsp5gtO500a4F3Nkz9Ufpk2AcUM=
+github.com/gabriel-vasile/mimetype v1.4.13/go.mod h1:d+9Oxyo1wTzWdyVUPMmXFvp4F9tea18J8ufA774AB3s=
+github.com/goccy/go-json v0.10.6 h1:p8HrPJzOakx/mn/bQtjgNjdTcN+/S6FcG2CTtQOrHVU=
+github.com/goccy/go-json v0.10.6/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
+github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419/go.mod h1:YKu81H3RSd1cFh0d7NhvUoTtUC9IY/vBX0WUQb1/o4Y=
+github.com/goombaio/orderedmap v0.0.0-20180925151256-3da0e2f905f9 h1:vFjPvFavIiDY71bQ9HIxPQBANvNl1SmFC4fgg5xRkho=
+github.com/goombaio/orderedmap v0.0.0-20180925151256-3da0e2f905f9/go.mod h1:YKu81H3RSd1cFh0d7NhvUoTtUC9IY/vBX0WUQb1/o4Y=
 github.com/goombaio/orderedset v0.0.0-20180925151225-8e67b20a9b77 h1:4dvq1tGHn1Y9KSRY0OZ24Khki4+4U+ZrA//YYsdUlJU=
 github.com/goombaio/orderedset v0.0.0-20180925151225-8e67b20a9b77/go.mod h1:HPelMYpOyy0XvglpBbmZ3krZpwaHmszj/vQNlnETPTM=
-github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw=
 github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
-github.com/klauspost/compress v1.17.2 h1:RlWWUY/Dr4fL8qk9YG7DTZ7PDgME2V4csBXA8L/ixi4=
-github.com/klauspost/compress v1.17.2/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
+github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c=
+github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
 github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
 github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU=
 github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
@@ -45,14 +46,10 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
-github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
-github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
-github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U=
-github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
+github.com/mattn/go-runewidth v0.0.21 h1:jJKAZiQH+2mIinzCJIaIG9Be1+0NR+5sz/lYEEjdM8w=
+github.com/mattn/go-runewidth v0.0.21/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
 github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
 github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
-github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
-github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
 github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
@@ -60,50 +57,40 @@ github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8
 github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
-github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
-github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
+github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
+github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
 github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
 github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
 github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
 github.com/rrethy/ahocorasick v1.0.0 h1:YKkCB+E5PXc0xmLfMrWbfNht8vG9Re97IHSWZk/Lk8E=
 github.com/rrethy/ahocorasick v1.0.0/go.mod h1:nq8oScE7Vy1rOppoQxpQiiDmPHuKCuk9rXrNcxUV3R0=
-github.com/schollz/progressbar/v3 v3.13.1 h1:o8rySDYiQ59Mwzy2FELeHY5ZARXZTVJC7iHD6PEFUiE=
-github.com/schollz/progressbar/v3 v3.13.1/go.mod h1:xvrbki8kfT1fzWzBT/UZd9L6GA+jdL7HAgq2RFnO6fQ=
-github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
+github.com/schollz/progressbar/v3 v3.19.0 h1:Ea18xuIRQXLAUidVDox3AbwfUhD0/1IvohyTutOIFoc=
+github.com/schollz/progressbar/v3 v3.19.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec=
 github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
-github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
-github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
-github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
-github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
-github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
-github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k=
+github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME=
+github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w=
+github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g=
+github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
+github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
 github.com/tevino/abool/v2 v2.1.0 h1:7w+Vf9f/5gmKT4m4qkayb33/92M+Um45F2BkHOR+L/c=
 github.com/tevino/abool/v2 v2.1.0/go.mod h1:+Lmlqk6bHDWHqN1cbxqhwEAwMPXgc8I1SDEamtseuXY=
 github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8=
-github.com/ulikunitz/xz v0.5.11 h1:kpFauv27b6ynzBNT/Xy+1k+fK4WswhN/6PN5WhFAGw8=
-github.com/ulikunitz/xz v0.5.11/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
+github.com/ulikunitz/xz v0.5.15 h1:9DNdB5s+SgV3bQ2ApL10xRc35ck0DuIX/isZvIk+ubY=
+github.com/ulikunitz/xz v0.5.15/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
 github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M=
 github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
-golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ=
-golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE=
-golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
-golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
-golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
-golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U=
-golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU=
-golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s=
-gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0=
-gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU=
+golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90 h1:jiDhWWeC7jfWqR9c/uplMOqJ0sbNlNWv0UkzE0vX1MA=
+golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90/go.mod h1:xE1HEv6b+1SCZ5/uscMRjUBKtIxworgEcEi+/n9NQDQ=
+golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
+golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
+golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU=
+golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A=
+gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4=
+gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
-gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 scientificgo.org/special v0.0.0 h1:P6WJkECo6tgtvZAEfNXl+KEB9ReAatjKAeX8U07mjSc=
--- a/go.work
+++ b/go.work
@@ -1,5 +1,3 @@
-go 1.23.4
-
-toolchain go1.24.2
+go 1.26.1

 use .
--- a/go.work.sum
+++ b/go.work.sum
@@ -52,6 +52,8 @@ golang.org/x/image v0.6.0/go.mod h1:MXLdDR43H7cDJq5GEGXEVeeNhPgi+YYEQ2pC1byI1x0=
 golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY=
 golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
 golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
+golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
 golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 h1:uVc8UZUe6tr40fFVnUP5Oj+veunVezqYl9z7DYw9xzw=
 golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
 golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
--- a/install_obitools.sh
+++ b/install_obitools.sh
@@ -1,27 +1,58 @@
 #!/bin/bash

-INSTALL_DIR="/usr/local"
-OBITOOLS_PREFIX=""
-# default values
+# Default values
 URL="https://go.dev/dl/"
-OBIURL4="https://github.com/metabarcoding/obitools4/archive/refs/heads/master.zip"
+GITHUB_REPO="https://github.com/metabarcoding/obitools4"
 INSTALL_DIR="/usr/local"
 OBITOOLS_PREFIX=""
+VERSION=""
+LIST_VERSIONS=false
+JOBS=1

-# help message
+# Help message
 function display_help {
  echo "Usage: $0 [OPTIONS]"
  echo ""
  echo "Options:"
  echo "  -i, --install-dir       Directory where obitools are installed "
-  echo "                          (as example use /usr/local not /usr/local/bin)."
+  echo "                          (e.g., use /usr/local not /usr/local/bin)."
  echo "  -p, --obitools-prefix   Prefix added to the obitools command names if you"
  echo "                          want to have several versions of obitools at the"
-  echo "                          same time on your system (as example -p g will produce "
+  echo "                          same time on your system (e.g., -p g will produce "
  echo "                          gobigrep command instead of obigrep)."
+  echo "  -v, --version           Install a specific version (e.g., 4.4.8)."
+  echo "                          If not specified, installs the latest version."
+  echo "  -j, --jobs              Number of parallel jobs for compilation (default: 1)."
+  echo "  -l, --list              List all available versions and exit."
  echo "  -h, --help              Display this help message."
+  echo ""
+  echo "Examples:"
+  echo "  $0                      # Install latest version"
+  echo "  $0 -l                   # List available versions"
+  echo "  $0 -v 4.4.8             # Install specific version"
+  echo "  $0 -i /opt/local        # Install to custom directory"
 }

+# List released versions, newest first (asks for 100 per page; the API default is 30)
+function list_versions {
+  echo "Fetching available versions..." 1>&2
+  echo ""
+  # sed -n ... p prints only tags shaped like Release_<version>; other lines are dropped
+  curl -s "https://api.github.com/repos/metabarcoding/obitools4/releases?per_page=100" \
+    | sed -nE 's/.*"tag_name": "Release_([0-9.]+)".*/\1/p' \
+    | sort -V -r
+}
+
+# Print the highest Release_<version> tag found in the GitHub releases (empty if none)
+function get_latest_version {
+  # sed -n ... p keeps only well-formed Release_<version> tags, so an unrelated
+  # tag line can never be picked up as the "latest version"
+  curl -s "https://api.github.com/repos/metabarcoding/obitools4/releases?per_page=100" \
+    | sed -nE 's/.*"tag_name": "Release_([0-9.]+)".*/\1/p' \
+    | sort -V -r | head -1
+}
+
+# Parse command line arguments
 while [ "$#" -gt 0 ]; do
  case "$1" in
    -i|--install-dir)
@@ -32,28 +63,61 @@ while [ "$#" -gt 0 ]; do
      OBITOOLS_PREFIX="$2"
      shift 2
      ;;
+    -v|--version)
+      VERSION="${2?Error: option $1 requires a value}"  # abort instead of looping forever when the value is missing
+      shift 2
+      ;;
+    -j|--jobs)
+      JOBS="${2?Error: option $1 requires a value}"  # same guard: a failed 'shift 2' would never consume $1
+      shift 2
+      ;;
+    -l|--list)
+      LIST_VERSIONS=true
+      shift
+      ;;
    -h|--help)
-      display_help  1>&2 
+      display_help
      exit 0
      ;;
    *)
      echo "Error: Unsupported option $1" 1>&2
+      display_help 1>&2
      exit 1
      ;;
  esac
 done

-# the directory from where the script is run
+# --list: print the available releases on stdout and stop before installing
+if [[ "$LIST_VERSIONS" == true ]]; then
+  echo "Available OBITools4 versions:"
+  echo "=============================="
+  list_versions
+  exit 0
+fi
+
+# Use the requested version, or look up the latest release when none was given
+if [[ -n "$VERSION" ]]; then
+  echo "Installing version: $VERSION" 1>&2
+else
+  echo "Fetching latest version..." 1>&2
+  VERSION=$(get_latest_version)
+  if [[ -z "$VERSION" ]]; then
+    echo "Error: Could not determine latest version" 1>&2
+    exit 1
+  fi
+  echo "Latest version: $VERSION" 1>&2
+fi
+
+# Source archive URL for the Release_<version> tag
+OBIURL4="${GITHUB_REPO}/archive/refs/tags/Release_${VERSION}.zip"
+
+# The directory from where the script is run
 DIR="$(pwd)"

-# the temp directory used, within $DIR
-# omit the -p parameter to create a temporal directory in the default location
-# WORK_DIR=$(mktemp -d -p "$DIR"  "obitools4.XXXXXX" 2> /dev/null || \
-#            mktemp -d -t "$DIR"  "obitools4.XXXXXX")
-
+# Create temporary directory
 WORK_DIR=$(mktemp -d "obitools4.XXXXXX")

-# check if tmp dir was created
+# Check if tmp dir was created
 if [[ ! "$WORK_DIR" || ! -d "$WORK_DIR" ]]; then
  echo "Could not create temp dir" 1>&2
  exit 1
@@ -63,10 +127,16 @@ mkdir -p "${WORK_DIR}/cache" \
  || (echo "Cannot create ${WORK_DIR}/cache directory" 1>&2
      exit 1)

-
-mkdir -p "${INSTALL_DIR}/bin" 2> /dev/null \
-  || (echo "Please enter your password for installing obitools in ${INSTALL_DIR}"  1>&2
-      sudo mkdir -p "${INSTALL_DIR}/bin")
+# Create installation directory
+if ! mkdir -p "${INSTALL_DIR}/bin" 2>/dev/null; then
+  if [ ! -w "$(dirname "${INSTALL_DIR}")" ] && [ ! -w "${INSTALL_DIR}" ]; then
+    echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
+    sudo mkdir -p "${INSTALL_DIR}/bin"
+  else
+    echo "Error: Could not create ${INSTALL_DIR}/bin (check path or disk space)" 1>&2
+    exit 1
+  fi
+fi

 if [[ ! -d "${INSTALL_DIR}/bin" ]]; then
  echo "Could not create ${INSTALL_DIR}/bin directory for installing obitools" 1>&2
@@ -75,12 +145,18 @@ fi

 INSTALL_DIR="$(cd ${INSTALL_DIR} && pwd)"

+echo "================================" 1>&2
+echo "OBITools4 Installation" 1>&2
+echo "================================" 1>&2
+echo "VERSION=$VERSION" 1>&2
 echo "WORK_DIR=$WORK_DIR" 1>&2
 echo "INSTALL_DIR=$INSTALL_DIR" 1>&2
 echo "OBITOOLS_PREFIX=$OBITOOLS_PREFIX" 1>&2
+echo "================================" 1>&2

-pushd "$WORK_DIR"|| exit
+pushd "$WORK_DIR" > /dev/null || exit

+# Detect OS and architecture
 OS=$(uname -a | awk '{print $1}')
 ARCH=$(uname -m)

@@ -92,7 +168,9 @@ if [[ "$ARCH" == "aarch64" ]] ; then
    ARCH="arm64"
 fi

-GOFILE=$(curl "$URL" \
+# Download and install Go
+echo "Downloading Go..." 1>&2
+GOFILE=$(curl -s "$URL" \
            | grep 'class="download"' \
            | grep "\.tar\.gz" \
            | sed -E 's@^.*/dl/(go[1-9].+\.tar\.gz)".*$@\1@' \
@@ -100,44 +178,86 @@ GOFILE=$(curl "$URL" \
            | grep -i "$ARCH" \
            | head -1)

-GOURL=$(curl "${URL}${GOFILE}" \
+GOURL=$(curl -s "${URL}${GOFILE}" \
        | sed -E 's@^.*href="(.*\.tar\.gz)".*$@\1@')

-echo "Install GO from : $GOURL" 1>&2
+echo "Installing Go from: $GOURL" 1>&2

-curl "$GOURL" \
-    | tar zxf -
+curl --progress-bar "$GOURL" | tar zxf -

-PATH="$(pwd)/go/bin:$PATH"
+export GOROOT="$(pwd)/go"
+PATH="${GOROOT}/bin:$PATH"
 export PATH
-GOPATH="$(pwd)/go"
-export GOPATH
-
+export GOPATH="$(pwd)/gopath"
 export GOCACHE="$(pwd)/cache"
-echo "GOCACHE=$GOCACHE" 1>&2@
-mkdir -p "$GOCACHE"
+export GOTOOLCHAIN=local

+echo "GOROOT=$GOROOT" 1>&2
+echo "GOCACHE=$GOCACHE" 1>&2
+mkdir -p "$GOPATH" "$GOCACHE"

-curl -L "$OBIURL4" > master.zip
-unzip master.zip
+# Download OBITools4 source
+echo "Downloading OBITools4 v${VERSION}..." 1>&2
+echo "Source URL: $OBIURL4" 1>&2

-echo "Install OBITOOLS from : $OBIURL4"
-
-cd obitools4-master || exit
-mkdir vendor
-
-if [[ -z "$OBITOOLS_PREFIX" ]] ; then
-  make GOFLAGS="-buildvcs=false" 
-else
-  make GOFLAGS="-buildvcs=false" OBITOOLS_PREFIX="${OBITOOLS_PREFIX}"
+if ! curl --progress-bar -f -L "$OBIURL4" > obitools4.zip; then  # -f: an HTTP error (e.g. 404 for an unknown tag) fails instead of saving the error page
+  echo "Error: Could not download OBITools4 version ${VERSION}" 1>&2
+  echo "Please check that this version exists with: $0 --list" 1>&2
+  exit 1
 fi

-(cp build/* "${INSTALL_DIR}/bin" 2> /dev/null) \
-   || (echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 
-       sudo cp build/* "${INSTALL_DIR}/bin")
+unzip -q obitools4.zip

-popd || exit
+# Find the extracted directory
+OBITOOLS_DIR=$(ls -d obitools4-* 2>/dev/null | head -1)

+if [ -z "$OBITOOLS_DIR" ] || [ ! -d "$OBITOOLS_DIR" ]; then
+  echo "Error: Could not find extracted OBITools4 directory" 1>&2
+  exit 1
+fi
+
+echo "Building OBITools4..." 1>&2
+cd "$OBITOOLS_DIR" || exit
+mkdir -p vendor
+
+# Build with or without prefix; stop on failure so a broken build is never installed
+if [[ -z "$OBITOOLS_PREFIX" ]] ; then
+  make -j"${JOBS}" obitools GOFLAGS="-buildvcs=false" || { echo "Error: OBITools4 build failed" 1>&2; exit 1; }
+else
+  make -j"${JOBS}" obitools GOFLAGS="-buildvcs=false" OBITOOLS_PREFIX="${OBITOOLS_PREFIX}" || { echo "Error: OBITools4 build failed" 1>&2; exit 1; }
+fi
+
+# Install binaries; retry with sudo only when the target directory is not writable
+echo "Installing binaries to ${INSTALL_DIR}/bin..." 1>&2
+if ! cp build/* "${INSTALL_DIR}/bin" 2>/dev/null; then
+  if [ ! -w "${INSTALL_DIR}/bin" ]; then
+    echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
+    sudo cp build/* "${INSTALL_DIR}/bin" || { echo "Error: Could not copy binaries to ${INSTALL_DIR}/bin" 1>&2; exit 1; }
+  else
+    echo "Error: Could not copy binaries to ${INSTALL_DIR}/bin" 1>&2
+    echo "  Source files: $(ls build/ 2>/dev/null || echo 'none found')" 1>&2
+    echo "" 1>&2
+    echo "The build directory has been preserved for manual recovery:" 1>&2
+    echo "  $(pwd)/build/" 1>&2
+    echo "You can install manually with:" 1>&2
+    echo "  cp $(pwd)/build/* ${INSTALL_DIR}/bin/" 1>&2
+    popd > /dev/null || true
+    exit 1
+  fi
+fi
+
+popd > /dev/null || exit
+
+# Cleanup
+echo "Cleaning up..." 1>&2
 chmod -R +w "$WORK_DIR"
 rm -rf "$WORK_DIR"

+echo "" 1>&2
+echo "================================" 1>&2
+echo "OBITools4 v${VERSION} installed successfully!" 1>&2
+echo "Binaries location: ${INSTALL_DIR}/bin" 1>&2
+if [[ -n "$OBITOOLS_PREFIX" ]] ; then
+  echo "Command prefix: ${OBITOOLS_PREFIX}" 1>&2
+fi
+echo "================================" 1>&2
--- a/kmer_roaring_index/FREQUENCY_FILTER_FINAL.md
+++ b/kmer_roaring_index/FREQUENCY_FILTER_FINAL.md
@@ -1,292 +0,0 @@
-# Filtre de Fréquence avec v Niveaux de Roaring Bitmaps
-
-## Algorithme
-
-```go
-Pour chaque k-mer rencontré dans les données:
-    c = 0
-    tant que (k-mer ∈ index[c] ET c < v):
-        c++
-    
-    si c < v:
-        index[c].insert(k-mer)
-```
-
-**Résultat** : `index[v-1]` contient les k-mers vus **≥ v fois**
-
---
-
-## Exemple d'exécution (v=3)
-
-```
-Données:
-  Read1: kmer X
-  Read2: kmer X
-  Read3: kmer X  (X vu 3 fois)
-  Read4: kmer Y
-  Read5: kmer Y  (Y vu 2 fois)
-  Read6: kmer Z  (Z vu 1 fois)
-
-Exécution:
-
-Read1 (X):
-  c=0: X ∉ index[0] → index[0].add(X)
-  État: index[0]={X}, index[1]={}, index[2]={}
-
-Read2 (X):
-  c=0: X ∈ index[0] → c=1
-  c=1: X ∉ index[1] → index[1].add(X)
-  État: index[0]={X}, index[1]={X}, index[2]={}
-
-Read3 (X):
-  c=0: X ∈ index[0] → c=1
-  c=1: X ∈ index[1] → c=2
-  c=2: X ∉ index[2] → index[2].add(X)
-  État: index[0]={X}, index[1]={X}, index[2]={X}
-
-Read4 (Y):
-  c=0: Y ∉ index[0] → index[0].add(Y)
-  État: index[0]={X,Y}, index[1]={X}, index[2]={X}
-
-Read5 (Y):
-  c=0: Y ∈ index[0] → c=1
-  c=1: Y ∉ index[1] → index[1].add(Y)
-  État: index[0]={X,Y}, index[1]={X,Y}, index[2]={X}
-
-Read6 (Z):
-  c=0: Z ∉ index[0] → index[0].add(Z)
-  État: index[0]={X,Y,Z}, index[1]={X,Y}, index[2]={X}
-
-Résultat final:
-  index[0] (freq≥1): {X, Y, Z}
-  index[1] (freq≥2): {X, Y}
-  index[2] (freq≥3): {X}  ← K-mers filtrés ✓
-```
-
---
-
-## Utilisation
-
-```go
-// Créer le filtre
-filter := obikmer.NewFrequencyFilter(31, 3) // k=31, minFreq=3
-
-// Ajouter les séquences
-for _, read := range reads {
-    filter.AddSequence(read)
-}
-
-// Récupérer les k-mers filtrés (freq ≥ 3)
-filtered := filter.GetFilteredSet("filtered")
-fmt.Printf("K-mers de qualité: %d\n", filtered.Cardinality())
-
-// Statistiques
-stats := filter.Stats()
-fmt.Println(stats.String())
-```
-
---
-
-## Performance
-
-### Complexité
-
-**Par k-mer** :
- Lookups : Moyenne ~v/2, pire cas v
- Insertions : 1 Add
- **Pas de Remove** ✅
-
-**Total pour n k-mers** :
- Temps : O(n × v/2)
- Mémoire : O(unique_kmers × v × 2 bytes)
-
-### Early exit pour distribution skewed
-
-Avec distribution typique (séquençage) :
-```
-80% singletons → 1 lookup (early exit)
-15% freq 2-3   → 2-3 lookups
-5% freq ≥4     → jusqu'à v lookups
-
-Moyenne réelle : ~2 lookups/kmer (au lieu de v/2)
-```
-
---
-
-## Mémoire
-
-### Pour 10^8 k-mers uniques
-
-| v (minFreq) | Nombre bitmaps | Mémoire | vs map simple |
-|-------------|----------------|---------|---------------|
-| v=2 | 2 | ~400 MB | 6x moins |
-| v=3 | 3 | ~600 MB | 4x moins |
-| v=5 | 5 | ~1 GB | 2.4x moins |
-| v=10 | 10 | ~2 GB | 1.2x moins |
-| v=20 | 20 | ~4 GB | ~égal |
-
-**Note** : Avec distribution skewed (beaucoup de singletons), la mémoire réelle est bien plus faible car les niveaux hauts ont peu d'éléments.
-
-### Exemple réaliste (séquençage)
-
-Pour 10^8 k-mers totaux, v=3 :
-```
-Distribution:
-  80% singletons  → 80M dans index[0]
-  15% freq 2-3    → 15M dans index[1]
-  5% freq ≥3      → 5M dans index[2]
-
-Mémoire:
-  index[0]: 80M × 2 bytes = 160 MB
-  index[1]: 15M × 2 bytes = 30 MB
-  index[2]: 5M × 2 bytes = 10 MB
-  Total: ~200 MB ✅
-
-vs map simple: 80M × 24 bytes = ~2 GB
-Réduction: 10x
-```
-
---
-
-## Comparaison des approches
-
-| Approche | Mémoire (10^8 kmers) | Passes | Lookups/kmer | Quand utiliser |
-|----------|----------------------|--------|--------------|----------------|
-| **v-Bitmaps** | **200-600 MB** | **1** | **~2 (avg)** | **Standard** ✅ |
-| Map simple | 2.4 GB | 1 | 1 | Si RAM illimitée |
-| Multi-pass | 400 MB | v | v | Si I/O pas cher |
-
---
-
-## Avantages de v-Bitmaps
-
-✅ **Une seule passe** sur les données  
-✅ **Mémoire optimale** avec Roaring bitmaps  
-✅ **Pas de Remove** (seulement Contains + Add)  
-✅ **Early exit** efficace sur singletons  
-✅ **Scalable** jusqu'à v~10-20  
-✅ **Simple** à implémenter et comprendre  
-
---
-
-## Cas d'usage typiques
-
-### 1. Éliminer erreurs de séquençage
-
-```go
-filter := obikmer.NewFrequencyFilter(31, 3)
-
-// Traiter FASTQ
-for read := range StreamFastq("sample.fastq") {
-    filter.AddSequence(read)
-}
-
-// K-mers de qualité (pas d'erreurs)
-cleaned := filter.GetFilteredSet("cleaned")
-```
-
-**Résultat** : Élimine 70-80% des k-mers (erreurs)
-
-### 2. Assemblage de génome
-
-```go
-filter := obikmer.NewFrequencyFilter(31, 2)
-
-// Filtrer avant l'assemblage
-for read := range reads {
-    filter.AddSequence(read)
-}
-
-solidKmers := filter.GetFilteredSet("solid")
-// Utiliser solidKmers pour le graphe de Bruijn
-```
-
-### 3. Comparaison de génomes
-
-```go
-collection := obikmer.NewKmerSetCollection(31)
-
-for _, genome := range genomes {
-    filter := obikmer.NewFrequencyFilter(31, 3)
-    filter.AddSequences(genome.Reads)
-    
-    cleaned := filter.GetFilteredSet(genome.ID)
-    collection.Add(cleaned)
-}
-
-// Analyses comparatives sur k-mers de qualité
-matrix := collection.ParallelPairwiseJaccard(8)
-```
-
---
-
-## Limites
-
-**Pour v > 20** :
- Trop de lookups (v lookups/kmer)
- Mémoire importante (v × 200MB pour 10^8 kmers)
-
-**Solutions alternatives pour v > 20** :
- Utiliser map simple (9 bytes/kmer) si RAM disponible
- Algorithme différent (sketch, probabiliste)
-
---
-
-## Optimisations possibles
-
-### 1. Parallélisation
-
-```go
-// Traiter plusieurs fichiers en parallèle
-filters := make([]*FrequencyFilter, numFiles)
-
-var wg sync.WaitGroup
-for i, file := range files {
-    wg.Add(1)
-    go func(idx int, f string) {
-        defer wg.Done()
-        filters[idx] = ProcessFile(f, k, minFreq)
-    }(i, file)
-}
-wg.Wait()
-
-// Merger les résultats
-merged := MergeFilters(filters)
-```
-
-### 2. Streaming avec seuil adaptatif
-
-```go
-// Commencer avec v=5, réduire progressivement
-filter := obikmer.NewFrequencyFilter(31, 5)
-
-// ... traitement ...
-
-// Si trop de mémoire, réduire à v=3
-if filter.MemoryUsage() > threshold {
-    filter = ConvertToLowerThreshold(filter, 3)
-}
-```
-
---
-
-## Récapitulatif final
-
-**Pour filtrer les k-mers par fréquence ≥ v :**
-
-1. **Créer** : `filter := NewFrequencyFilter(k, v)`
-2. **Traiter** : `filter.AddSequence(read)` pour chaque read
-3. **Résultat** : `filtered := filter.GetFilteredSet(id)`
-
-**Mémoire** : ~2v MB par million de k-mers uniques  
-**Temps** : Une seule passe, ~2 lookups/kmer en moyenne  
-**Optimal pour** : v ≤ 20, distribution skewed (séquençage)  
-
---
-
-## Code fourni
-
-1. **frequency_filter.go** - Implémentation complète
-2. **examples_frequency_filter_final.go** - Exemples d'utilisation
-
-**Tout est prêt à utiliser !** 🚀
--- a/kmer_roaring_index/examples_frequency_filter_final.go
+++ b/kmer_roaring_index/examples_frequency_filter_final.go
@@ -1,320 +0,0 @@
-package main
-
-import (
-	"fmt"
-	"obikmer"
-)
-
-func main() {
-	// ==========================================
-	// EXEMPLE 1 : Utilisation basique
-	// ==========================================
-	fmt.Println("=== EXEMPLE 1 : Utilisation basique ===\n")
-
-	k := 31
-	minFreq := 3 // Garder les k-mers vus ≥3 fois
-
-	// Créer le filtre
-	filter := obikmer.NewFrequencyFilter(k, minFreq)
-
-	// Simuler des séquences avec différentes fréquences
-	sequences := [][]byte{
-		[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X
-		[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X (freq=2)
-		[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X (freq=3) ✓
-		[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Kmer Y
-		[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Kmer Y (freq=2) ✗
-		[]byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // Kmer Z (freq=1) ✗
-	}
-
-	fmt.Printf("Traitement de %d séquences...\n", len(sequences))
-	for _, seq := range sequences {
-		filter.AddSequence(seq)
-	}
-
-	// Récupérer les k-mers filtrés
-	filtered := filter.GetFilteredSet("filtered")
-	fmt.Printf("\nK-mers avec freq ≥ %d: %d\n", minFreq, filtered.Cardinality())
-
-	// Statistiques
-	stats := filter.Stats()
-	fmt.Println("\n" + stats.String())
-
-	// ==========================================
-	// EXEMPLE 2 : Vérifier les niveaux
-	// ==========================================
-	fmt.Println("\n=== EXEMPLE 2 : Inspection des niveaux ===\n")
-
-	// Vérifier chaque niveau
-	for level := 0; level < minFreq; level++ {
-		levelSet := filter.GetKmersAtLevel(level)
-		fmt.Printf("Niveau %d (freq≥%d): %d k-mers\n",
-			level+1, level+1, levelSet.Cardinality())
-	}
-
-	// ==========================================
-	// EXEMPLE 3 : Données réalistes
-	// ==========================================
-	fmt.Println("\n=== EXEMPLE 3 : Simulation données séquençage ===\n")
-
-	filter2 := obikmer.NewFrequencyFilter(31, 3)
-
-	// Simuler un dataset réaliste :
-	// - 1000 reads
-	// - 80% contiennent des erreurs (singletons)
-	// - 15% vrais k-mers à basse fréquence
-	// - 5% vrais k-mers à haute fréquence
-
-	// Vraie séquence répétée
-	trueSeq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG")
-	for i := 0; i < 50; i++ {
-		filter2.AddSequence(trueSeq)
-	}
-
-	// Séquence à fréquence moyenne
-	mediumSeq := []byte("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC")
-	for i := 0; i < 5; i++ {
-		filter2.AddSequence(mediumSeq)
-	}
-
-	// Erreurs de séquençage (singletons)
-	for i := 0; i < 100; i++ {
-		errorSeq := []byte(fmt.Sprintf("TTTTTTTTTTTTTTTTTTTTTTTTTTTT%03d", i))
-		filter2.AddSequence(errorSeq)
-	}
-
-	stats2 := filter2.Stats()
-	fmt.Println(stats2.String())
-
-	fmt.Println("Distribution attendue:")
-	fmt.Println("  - Beaucoup de singletons (erreurs)")
-	fmt.Println("  - Peu de k-mers à haute fréquence (signal)")
-	fmt.Println("  → Filtrage efficace !")
-
-	// ==========================================
-	// EXEMPLE 4 : Tester différents seuils
-	// ==========================================
-	fmt.Println("\n=== EXEMPLE 4 : Comparaison de seuils ===\n")
-
-	testSeqs := [][]byte{
-		[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
-		[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
-		[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
-		[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
-		[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // freq=5
-		[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"),
-		[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"),
-		[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // freq=3
-		[]byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // freq=1
-	}
-
-	for _, minFreq := range []int{2, 3, 5} {
-		f := obikmer.NewFrequencyFilter(31, minFreq)
-		f.AddSequences(testSeqs)
-
-		fmt.Printf("minFreq=%d: %d k-mers retenus (%.2f MB)\n",
-			minFreq,
-			f.Cardinality(),
-			float64(f.MemoryUsage())/1024/1024)
-	}
-
-	// ==========================================
-	// EXEMPLE 5 : Comparaison mémoire
-	// ==========================================
-	fmt.Println("\n=== EXEMPLE 5 : Comparaison mémoire ===\n")
-
-	filter3 := obikmer.NewFrequencyFilter(31, 3)
-
-	// Simuler 10000 séquences
-	for i := 0; i < 10000; i++ {
-		seq := make([]byte, 100)
-		for j := range seq {
-			seq[j] = "ACGT"[(i+j)%4]
-		}
-		filter3.AddSequence(seq)
-	}
-
-	fmt.Println(filter3.CompareWithSimpleMap())
-
-	// ==========================================
-	// EXEMPLE 6 : Workflow complet
-	// ==========================================
-	fmt.Println("\n=== EXEMPLE 6 : Workflow complet ===\n")
-
-	fmt.Println("1. Créer le filtre")
-	finalFilter := obikmer.NewFrequencyFilter(31, 3)
-
-	fmt.Println("2. Traiter les données (simulation)")
-	// En pratique : lire depuis FASTQ
-	// for read := range ReadFastq("data.fastq") {
-	//     finalFilter.AddSequence(read)
-	// }
-
-	// Simulation
-	for i := 0; i < 1000; i++ {
-		seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG")
-		finalFilter.AddSequence(seq)
-	}
-
-	fmt.Println("3. Récupérer les k-mers filtrés")
-	result := finalFilter.GetFilteredSet("final")
-
-	fmt.Println("4. Utiliser le résultat")
-	fmt.Printf("   K-mers de qualité: %d\n", result.Cardinality())
-	fmt.Printf("   Mémoire utilisée: %.2f MB\n", float64(finalFilter.MemoryUsage())/1024/1024)
-
-	fmt.Println("5. Sauvegarder (optionnel)")
-	// result.Save("filtered_kmers.bin")
-
-	// ==========================================
-	// EXEMPLE 7 : Vérification individuelle
-	// ==========================================
-	fmt.Println("\n=== EXEMPLE 7 : Vérification de k-mers spécifiques ===\n")
-
-	checkFilter := obikmer.NewFrequencyFilter(31, 3)
-
-	testSeq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG")
-	for i := 0; i < 5; i++ {
-		checkFilter.AddSequence(testSeq)
-	}
-
-	var kmers []uint64
-	kmers = obikmer.EncodeKmers(testSeq, 31, &kmers)
-
-	if len(kmers) > 0 {
-		testKmer := kmers[0]
-
-		fmt.Printf("K-mer test: 0x%016X\n", testKmer)
-		fmt.Printf("  Présent dans filtre: %v\n", checkFilter.Contains(testKmer))
-		fmt.Printf("  Fréquence approx: %d\n", checkFilter.GetFrequency(testKmer))
-	}
-
-	// ==========================================
-	// EXEMPLE 8 : Intégration avec collection
-	// ==========================================
-	fmt.Println("\n=== EXEMPLE 8 : Intégration avec KmerSetCollection ===\n")
-
-	// Créer une collection de génomes filtrés
-	collection := obikmer.NewKmerSetCollection(31)
-
-	genomes := map[string][][]byte{
-		"Genome1": {
-			[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
-			[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
-			[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
-			[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Erreur
-		},
-		"Genome2": {
-			[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
-			[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
-			[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
-			[]byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // Erreur
-		},
-	}
-
-	for id, sequences := range genomes {
-		// Filtrer chaque génome
-		genomeFilter := obikmer.NewFrequencyFilter(31, 3)
-		genomeFilter.AddSequences(sequences)
-
-		// Ajouter à la collection
-		filteredSet := genomeFilter.GetFilteredSet(id)
-		collection.Add(filteredSet)
-
-		fmt.Printf("%s: %d k-mers de qualité\n", id, filteredSet.Cardinality())
-	}
-
-	// Analyser la collection
-	fmt.Println("\nAnalyse comparative:")
-	collectionStats := collection.ComputeStats()
-	fmt.Printf("  Core genome: %d k-mers\n", collectionStats.CoreSize)
-	fmt.Printf("  Pan genome: %d k-mers\n", collectionStats.PanGenomeSize)
-
-	// ==========================================
-	// RÉSUMÉ
-	// ==========================================
-	fmt.Println("\n=== RÉSUMÉ ===\n")
-	fmt.Println("Le FrequencyFilter permet de:")
-	fmt.Println("  ✓ Filtrer les k-mers par fréquence minimale")
-	fmt.Println("  ✓ Utiliser une mémoire optimale avec Roaring bitmaps")
-	fmt.Println("  ✓ Une seule passe sur les données")
-	fmt.Println("  ✓ Éliminer efficacement les erreurs de séquençage")
-	fmt.Println("")
-	fmt.Println("Workflow typique:")
-	fmt.Println("  1. filter := NewFrequencyFilter(k, minFreq)")
-	fmt.Println("  2. for each sequence: filter.AddSequence(seq)")
-	fmt.Println("  3. filtered := filter.GetFilteredSet(id)")
-	fmt.Println("  4. Utiliser filtered dans vos analyses")
-}
-
-// ==================================
-// FONCTION HELPER POUR BENCHMARKS
-// ==================================
-
-func BenchmarkFrequencyFilter() {
-	k := 31
-	minFreq := 3
-
-	// Test avec différentes tailles
-	sizes := []int{1000, 10000, 100000}
-
-	fmt.Println("\n=== BENCHMARK ===\n")
-
-	for _, size := range sizes {
-		filter := obikmer.NewFrequencyFilter(k, minFreq)
-
-		// Générer des séquences
-		for i := 0; i < size; i++ {
-			seq := make([]byte, 100)
-			for j := range seq {
-				seq[j] = "ACGT"[(i+j)%4]
-			}
-			filter.AddSequence(seq)
-		}
-
-		fmt.Printf("Size=%d reads:\n", size)
-		fmt.Printf("  Filtered k-mers: %d\n", filter.Cardinality())
-		fmt.Printf("  Memory: %.2f MB\n", float64(filter.MemoryUsage())/1024/1024)
-		fmt.Println()
-	}
-}
-
-// ==================================
-// FONCTION POUR DONNÉES RÉELLES
-// ==================================
-
-func ProcessRealData() {
-	// Exemple pour traiter de vraies données FASTQ
-
-	k := 31
-	minFreq := 3
-
-	filter := obikmer.NewFrequencyFilter(k, minFreq)
-
-	// Pseudo-code pour lire un FASTQ
-	/*
-	fastqFile := "sample.fastq"
-	reader := NewFastqReader(fastqFile)
-
-	for reader.HasNext() {
-		read := reader.Next()
-		filter.AddSequence(read.Sequence)
-	}
-
-	// Récupérer le résultat
-	filtered := filter.GetFilteredSet("sample_filtered")
-	filtered.Save("sample_filtered_kmers.bin")
-
-	// Stats
-	stats := filter.Stats()
-	fmt.Println(stats.String())
-	*/
-
-	fmt.Println("Workflow pour données réelles:")
-	fmt.Println("  1. Créer le filtre avec minFreq approprié (2-5 typique)")
-	fmt.Println("  2. Stream les reads depuis FASTQ")
-	fmt.Println("  3. Récupérer les k-mers filtrés")
-	fmt.Println("  4. Utiliser pour assemblage/comparaison/etc.")
-
-	_ = filter // unused
-}
--- a/logs_60535302930.zip
+++ b/logs_60535302930.zip
--- a/obitests/obitools/obisuperkmer/README.md
+++ b/obitests/obitools/obisuperkmer/README.md
@@ -0,0 +1,148 @@
+# Tests pour obisuperkmer
+
+## Description
+
+Ce répertoire contient les tests automatisés pour la commande `obisuperkmer`.
+
+## Fichiers
+
+- `test.sh` : Script de test principal (exécutable)
+- `test_sequences.fasta` : Jeu de données de test minimal (3 séquences courtes)
+- `README.md` : Ce fichier
+
+## Jeu de données de test
+
+Le fichier `test_sequences.fasta` contient 3 séquences de 32 nucléotides chacune :
+
+1. **seq1** : Répétition du motif ACGT (séquence régulière)
+2. **seq2** : Alternance de blocs homopolymères (AAAA, CCCC, GGGG, TTTT)
+3. **seq3** : Répétition du motif ATCG (différent de seq1)
+
+Ces séquences sont volontairement courtes pour :
+- Minimiser la taille du dépôt Git
+- Accélérer l'exécution des tests en CI/CD
+- Tester différents cas d'extraction de super k-mers
+
+## Tests effectués
+
+Le script `test.sh` effectue 13 tests :
+
+### Test 1 : Affichage de l'aide
+Vérifie que `obisuperkmer -h` s'exécute correctement.
+
+### Test 2 : Extraction basique avec paramètres par défaut
+Exécute `obisuperkmer` avec k=21, m=11 (valeurs par défaut).
+
+### Test 3 : Vérification du fichier de sortie non vide
+S'assure que la commande produit une sortie.
+
+### Test 4 : Comptage des super k-mers extraits
+Vérifie qu'au moins un super k-mer a été extrait.
+
+### Test 5 : Présence des métadonnées requises
+Vérifie que chaque super k-mer contient :
+- `minimizer_value`
+- `minimizer_seq`
+- `parent_id`
+
+### Test 6 : Extraction avec paramètres personnalisés
+Teste avec k=15 et m=7.
+
+### Test 7 : Vérification des paramètres dans les métadonnées
+S'assure que les valeurs k=15 et m=7 sont présentes dans la sortie.
+
+### Test 8 : Format de sortie FASTA explicite
+Teste l'option `--fasta-output`.
+
+### Test 9 : Vérification des IDs des super k-mers
+S'assure que tous les IDs contiennent "superkmer".
+
+### Test 10 : Préservation des IDs parents
+Vérifie que seq1, seq2 et seq3 apparaissent dans la sortie.
+
+### Test 11 : Option -o pour fichier de sortie
+Teste la redirection vers un fichier avec `-o`.
+
+### Test 12 : Vérification de la création du fichier avec -o
+S'assure que le fichier de sortie a été créé.
+
+### Test 13 : Longueur minimale des super k-mers
+Vérifie que chaque super k-mer extrait a une longueur supérieure ou égale à k (le script compare la plus petite longueur observée à 31, la valeur de k qu'il suppose par défaut).
+
+## Exécution des tests
+
+### Localement
+
+```bash
+cd /chemin/vers/obitools4/obitests/obitools/obisuperkmer
+./test.sh
+```
+
+### En CI/CD
+
+Les tests sont automatiquement exécutés lors de chaque commit via le système CI/CD configuré pour le projet.
+
+### Prérequis
+
+- La commande `obisuperkmer` doit être compilée et disponible dans `../../build/`
+- Les dépendances système : bash, grep, etc.
+
+## Structure du script de test
+
+Le script suit le pattern standard utilisé par tous les tests OBITools :
+
+1. **En-tête** : Définition du nom du test et de la commande
+2. **Variables** : Configuration des chemins et compteurs
+3. **Fonction cleanup()** : Affiche les résultats et nettoie le répertoire temporaire
+4. **Fonction log()** : Affiche les messages horodatés
+5. **Tests** : Série de tests avec incrémentation des compteurs
+6. **Appel cleanup()** : Nettoyage et sortie avec code de retour approprié
+
+## Format de sortie
+
+Chaque test affiche :
+```
+[obisuperkmer @ date] message
+```
+
+En fin d'exécution :
+```
+========================================
+## Results of the obisuperkmer tests:
+
+- 13 tests run
+- 13 successfully completed
+- 0 failed tests
+
+Cleaning up the temporary directory...
+
+========================================
+```
+
+## Codes de retour
+
+- **0** : Tous les tests ont réussi
+- **1** : Au moins un test a échoué
+
+## Ajout de nouveaux tests
+
+Pour ajouter un nouveau test, suivre le pattern :
+
+```bash
+((ntest++))
+if commande_test arguments
+then
+    log "Description: OK" 
+    ((success++))
+else
+    log "Description: failed"
+    ((failed++))
+fi
+```
+
+## Notes
+
+- Les fichiers temporaires sont créés dans `$TMPDIR` (créé par mktemp)
+- Les fichiers de données sont dans `$TEST_DIR`
+- La commande testée doit être dans `$OBITOOLS_DIR` (../../build/)
+- Le répertoire temporaire est automatiquement nettoyé à la fin
--- a/obitests/obitools/obisuperkmer/test.sh
+++ b/obitests/obitools/obisuperkmer/test.sh
@@ -0,0 +1,232 @@
+#!/bin/bash
+
+#
+# Here give the name of the test series
+#
+
+TEST_NAME=obik-super
+CMD=obik
+
+######
+#
+# Some variable and function definitions: please don't change them
+#
+######
+TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
+OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
+export PATH="${OBITOOLS_DIR}:${PATH}"
+
+MCMD="OBIk-super"
+
+TMPDIR="$(mktemp -d)"
+ntest=0
+success=0
+failed=0
+
+# cleanup prints the test summary (number of tests run, succeeded and failed)
+# on stderr, removes the temporary directory and terminates the script.
+# Exit status: 1 when at least one test failed, 0 otherwise.
+# Reads the globals TEST_NAME, ntest, success, failed and TMPDIR.
+cleanup() {
+    echo "========================================" 1>&2
+    echo "## Results of the $TEST_NAME tests:" 1>&2
+
+    echo 1>&2
+    echo "- $ntest tests run" 1>&2
+    echo "- $success successfully completed" 1>&2
+    echo "- $failed failed tests" 1>&2
+    echo 1>&2
+    echo "Cleaning up the temporary directory..." 1>&2
+    echo 1>&2
+    echo "========================================" 1>&2
+
+    rm -rf "$TMPDIR"  # Remove the temporary directory
+
+    # Any failed test makes the whole script fail.
+    if [ $failed -gt 0 ]; then
+       log "$TEST_NAME tests failed"
+        log
+        log
+       exit 1
+    fi
+
+    log
+    log
+
+    exit 0
+}
+
+# log writes its arguments to stderr, prefixed with the test name and the
+# current date. `echo -e` is used, so backslash escapes in the message are
+# interpreted.
+log() {
+    echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
+}
+
+log "Testing $TEST_NAME..."
+log "Test directory is $TEST_DIR"
+log "obitools directory is $OBITOOLS_DIR"
+log "Temporary directory is $TMPDIR"
+log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
+
+######################################################################
+####
+#### Below are the tests
+####
+######################################################################
+
+((ntest++))
+if $CMD super -h > "${TMPDIR}/help.txt" 2>&1
+then
+    log "$MCMD: printing help OK"
+    ((success++))
+else
+    log "$MCMD: printing help failed"
+    ((failed++))
+fi
+
+# Test 1: Basic super k-mer extraction with default parameters
+((ntest++))
+if $CMD super "${TEST_DIR}/test_sequences.fasta" \
+    > "${TMPDIR}/output_default.fasta" 2>&1
+then
+    log "$MCMD: basic extraction with default parameters OK"
+    ((success++))
+else
+    log "$MCMD: basic extraction with default parameters failed"
+    ((failed++))
+fi
+
+# Test 2: Verify output is not empty
+((ntest++))
+if [ -s "${TMPDIR}/output_default.fasta" ]
+then
+    log "$MCMD: output file is not empty OK"
+    ((success++))
+else
+    log "$MCMD: output file is empty - failed"
+    ((failed++))
+fi
+
+# Test 3: Count number of super k-mers extracted (should be > 0)
+((ntest++))
+num_sequences=$(grep -c "^>" "${TMPDIR}/output_default.fasta")
+if [ "$num_sequences" -gt 0 ]
+then
+    log "$MCMD: extracted $num_sequences super k-mers OK"
+    ((success++))
+else
+    log "$MCMD: no super k-mers extracted - failed"
+    ((failed++))
+fi
+
+# Test 4: Verify super k-mers have required metadata attributes
+((ntest++))
+if grep -q "minimizer_value" "${TMPDIR}/output_default.fasta" && \
+   grep -q "minimizer_seq" "${TMPDIR}/output_default.fasta" && \
+   grep -q "parent_id" "${TMPDIR}/output_default.fasta"
+then
+    log "$MCMD: super k-mers contain required metadata OK"
+    ((success++))
+else
+    log "$MCMD: super k-mers missing metadata - failed"
+    ((failed++))
+fi
+
+# Test 5: Extract super k-mers with custom k and m parameters
+((ntest++))
+if $CMD super -k 15 -m 7 "${TEST_DIR}/test_sequences.fasta" \
+    > "${TMPDIR}/output_k15_m7.fasta" 2>&1
+then
+    log "$MCMD: extraction with custom k=15, m=7 OK"
+    ((success++))
+else
+    log "$MCMD: extraction with custom k=15, m=7 failed"
+    ((failed++))
+fi
+
+# Test 6: Verify custom parameters in output metadata
+((ntest++))
+if grep -q '"k":15' "${TMPDIR}/output_k15_m7.fasta" && \
+   grep -q '"m":7' "${TMPDIR}/output_k15_m7.fasta"
+then
+    log "$MCMD: custom parameters correctly set in metadata OK"
+    ((success++))
+else
+    log "$MCMD: custom parameters not in metadata - failed"
+    ((failed++))
+fi
+
+# Test 7: Test with different output format (FASTA output explicitly)
+((ntest++))
+if $CMD super --fasta-output -k 21 -m 11 \
+    "${TEST_DIR}/test_sequences.fasta" \
+    > "${TMPDIR}/output_fasta.fasta" 2>&1
+then
+    log "$MCMD: FASTA output format OK"
+    ((success++))
+else
+    log "$MCMD: FASTA output format failed"
+    ((failed++))
+fi
+
+# Test 8: Verify that at least one super k-mer header line contains 'superkmer'
+((ntest++))
+if grep "^>" "${TMPDIR}/output_default.fasta" | grep -q "superkmer"
+then
+    log "$MCMD: super k-mer IDs contain 'superkmer' OK"
+    ((success++))
+else
+    log "$MCMD: super k-mer IDs missing 'superkmer' - failed"
+    ((failed++))
+fi
+
+# Test 9: Verify parent sequence IDs are preserved
+((ntest++))
+if grep -q "seq1" "${TMPDIR}/output_default.fasta" && \
+   grep -q "seq2" "${TMPDIR}/output_default.fasta" && \
+   grep -q "seq3" "${TMPDIR}/output_default.fasta"
+then
+    log "$MCMD: parent sequence IDs preserved OK"
+    ((success++))
+else
+    log "$MCMD: parent sequence IDs not preserved - failed"
+    ((failed++))
+fi
+
+# Test 10: Test with output file option
+((ntest++))
+if $CMD super -o "${TMPDIR}/output_file.fasta" \
+    "${TEST_DIR}/test_sequences.fasta" 2>&1
+then
+    log "$MCMD: output to file with -o option OK"
+    ((success++))
+else
+    log "$MCMD: output to file with -o option failed"
+    ((failed++))
+fi
+
+# Test 11: Verify output file was created with -o option
+((ntest++))
+if [ -s "${TMPDIR}/output_file.fasta" ]
+then
+    log "$MCMD: output file created with -o option OK"
+    ((success++))
+else
+    log "$MCMD: output file not created with -o option - failed"
+    ((failed++))
+fi
+
+# Test 12: Verify each super k-mer length is >= k (default k=31)
+((ntest++))
+min_len=$(grep -v "^>" "${TMPDIR}/output_default.fasta" | awk '{print length}' | sort -n | head -1)
+
+if [ "$min_len" -ge 31 ]
+then
+    log "$MCMD: all super k-mers have length >= k OK"
+    ((success++))
+else
+    log "$MCMD: some super k-mers shorter than k ($min_len < 31) - failed"
+    ((failed++))
+fi
+
+#########################################
+#
+# At the end of the tests
+# the cleanup function is called
+#
+#########################################
+
+cleanup
--- a/obitests/obitools/obisuperkmer/test_sequences.fasta
+++ b/obitests/obitools/obisuperkmer/test_sequences.fasta
@@ -0,0 +1,6 @@
+>seq1
+ACGTACGTACGTACGTACGTACGTACGTACGT
+>seq2
+AAAACCCCGGGGTTTTAAAACCCCGGGGTTTT
+>seq3
+ATCGATCGATCGATCGATCGATCGATCGATCG
--- a/obitests/obitools/obiuniq/test.sh
+++ b/obitests/obitools/obiuniq/test.sh
@@ -195,6 +195,59 @@ else
    ((failed++))
 fi

+##
+## Test merge attributes consistency between in-memory and on-disk paths
+## This test catches the bug where the shared classifier in the on-disk
+## dereplication path caused incorrect merged attributes.
+##
+
+((ntest++))
+if obiuniq -m a -m b --in-memory \
+    "${TEST_DIR}/touniq.fasta" \
+    > "${TMPDIR}/touniq_u_merge_mem.fasta" 2>/dev/null
+then
+    log "OBIUniq merge in-memory: running OK"
+    ((success++))
+else
+    log "OBIUniq merge in-memory: running failed"
+    ((failed++))
+fi
+
+((ntest++))
+if obiuniq -m a -m b --chunk-count 4 \
+    "${TEST_DIR}/touniq.fasta" \
+    > "${TMPDIR}/touniq_u_merge_disk.fasta" 2>/dev/null
+then
+    log "OBIUniq merge on-disk: running OK"
+    ((success++))
+else
+    log "OBIUniq merge on-disk: running failed"
+    ((failed++))
+fi
+
+# Extract sorted annotations (JSON attributes) from both outputs
+# to compare merge results independently of sequence ordering
+grep '^>' "${TMPDIR}/touniq_u_merge_mem.fasta" \
+| sed 's/^>seq[0-9]* //' \
+| sort \
+> "${TMPDIR}/touniq_u_merge_mem.json"
+
+grep '^>' "${TMPDIR}/touniq_u_merge_disk.fasta" \
+| sed 's/^>seq[0-9]* //' \
+| sort \
+> "${TMPDIR}/touniq_u_merge_disk.json"
+
+((ntest++))
+if diff "${TMPDIR}/touniq_u_merge_mem.json" \
+        "${TMPDIR}/touniq_u_merge_disk.json" > /dev/null
+then
+    log "OBIUniq merge on-disk vs in-memory: result OK"
+    ((success++))
+else
+    log "OBIUniq merge on-disk vs in-memory: result failed"
+    ((failed++))
+fi
+
 #########################################
 #
 # At the end of the tests
--- a/pkg/obichunk/chunk_on_disk.go
+++ b/pkg/obichunk/chunk_on_disk.go
@@ -110,6 +110,7 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
 	log.Infof("Data splitted over %d batches", nbatch)

 	go func() {
+		localClassifier := uniqueClassifier.Clone()

 		for order, file := range fileNames {
 			iseq, err := obiformats.ReadSequencesFromFile(file)
@@ -121,7 +122,7 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
 			if dereplicate {
 				u := make(map[string]*obiseq.BioSequence)
 				var source string
-				uniqueClassifier.Reset()
+				localClassifier.Reset()

 				for iseq.Next() {
 					batch := iseq.Get()
@@ -129,8 +130,8 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,

 					for _, seq := range batch.Slice() {
 						// Use composite key: sequence + categories
-						code := uniqueClassifier.Code(seq)
-						key := uniqueClassifier.Value(code)
+						code := localClassifier.Code(seq)
+						key := localClassifier.Value(code)
 						prev, ok := u[key]
 						if ok {
 							prev.Merge(seq, na, true, statsOn)
--- a/pkg/obicorazick/worker.go
+++ b/pkg/obicorazick/worker.go
@@ -27,6 +27,8 @@ func AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker {
 	npar := min(obidefault.ParallelWorkers(), nmatcher)
 	mutex.Add(npar)

+	var bar *progressbar.ProgressBar
+	if obidefault.ProgressBar() {
 		pbopt := make([]progressbar.Option, 0, 5)
 		pbopt = append(pbopt,
 			progressbar.OptionSetWriter(os.Stderr),
@@ -36,14 +38,16 @@ func AhoCorazickWorker(slot string, patterns []string) obiseq.SeqWorker {
 			progressbar.OptionSetDescription("Building AhoCorasick matcher..."),
 		)

-	bar := progressbar.NewOptions(nmatcher, pbopt...)
-	bar.Add(0)
+		bar = progressbar.NewOptions(nmatcher, pbopt...)
+	}

 	builder := func() {
 		for i := range ieme  {
 		matchers[i] = ahocorasick.CompileStrings(patterns[i*sizebatch:min((i+1)*sizebatch,len(patterns))])
+		if bar != nil {
 			bar.Add(1)
 		}
+		}
 		mutex.Done()
 	}

--- a/pkg/obidefault/batch.go
+++ b/pkg/obidefault/batch.go
@@ -1,6 +1,12 @@
 package obidefault

-var _BatchSize = 2000
+// _BatchSize is the minimum number of sequences per batch (floor).
+// Used as the minSeqs argument to RebatchBySize.
+var _BatchSize = 1
+
+// _BatchSizeMax is the maximum number of sequences per batch (ceiling).
+// A batch is flushed when this count is reached regardless of memory usage.
+var _BatchSizeMax = 2000

 // SetBatchSize sets the size of the sequence batches.
 //
@@ -24,3 +30,42 @@ func BatchSize() int {
 func BatchSizePtr() *int {
 	return &_BatchSize
 }
+
+// BatchSizeMax returns the maximum number of sequences per batch.
+func BatchSizeMax() int {
+	return _BatchSizeMax
+}
+
+// BatchSizeMaxPtr returns a pointer to the package-level maximum batch size,
+// so that the value can be modified in place by the caller.
+func BatchSizeMaxPtr() *int {
+	return &_BatchSizeMax
+}
+
+// _BatchMem holds the maximum cumulative memory (in bytes) per batch when
+// memory-based batching is requested. A value of 0 disables memory-based
+// batching and falls back to count-based batching.
+var _BatchMem = 128 * 1024 * 1024 // 128 MB default; set to 0 to disable
+
+// _BatchMemStr holds the textual form of the memory budget; it is empty by
+// default.
+var _BatchMemStr = ""
+
+// SetBatchMem sets the memory budget per batch in bytes.
+func SetBatchMem(n int) {
+	_BatchMem = n
+}
+
+// BatchMem returns the current memory budget per batch in bytes.
+// A value of 0 means memory-based batching is disabled.
+func BatchMem() int {
+	return _BatchMem
+}
+
+// BatchMemPtr returns a pointer to the package-level memory budget, so that
+// the value can be modified in place by the caller.
+func BatchMemPtr() *int {
+	return &_BatchMem
+}
+
+// BatchMemStr returns the raw --batch-mem string value as provided on the CLI.
+func BatchMemStr() string {
+	return _BatchMemStr
+}
+
+// BatchMemStrPtr returns a pointer to the package-level raw memory budget
+// string, so that the value can be modified in place by the caller.
+func BatchMemStrPtr() *string {
+	return &_BatchMemStr
+}
--- a/pkg/obidefault/progressbar.go
+++ b/pkg/obidefault/progressbar.go
@@ -0,0 +1,19 @@
+package obidefault
+
+// __no_progress_bar__ is the package-level switch that disables progress bars
+// when true. It is false by default, i.e. progress bars are enabled.
+var __no_progress_bar__ = false
+
+// ProgressBar reports whether progress bars are enabled, i.e. the negation of
+// the no-progress-bar switch.
+func ProgressBar() bool {
+	return !__no_progress_bar__
+}
+
+// NoProgressBar reports whether progress bars are disabled.
+func NoProgressBar() bool {
+	return __no_progress_bar__
+}
+
+// SetNoProgressBar disables progress bars when b is true and enables them
+// when b is false.
+func SetNoProgressBar(b bool) {
+	__no_progress_bar__ = b
+}
+
+// NoProgressBarPtr returns a pointer to the no-progress-bar switch, so that
+// the value can be modified in place by the caller.
+func NoProgressBarPtr() *bool {
+	return &__no_progress_bar__
+}
--- a/pkg/obiformats/embl_read.go
+++ b/pkg/obiformats/embl_read.go
@@ -161,6 +161,149 @@ func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obise
 	return parser
 }

+// extractEmblSeq reads the sequence section of an EMBL record straight from
+// the rope and appends its residues to dest.
+//
+// Lines are consumed with ReadLine until the rope is exhausted or a line
+// beginning with "//" (the record terminator) has been read; that terminator
+// line is consumed as well. On each sequence line only ASCII letters are
+// kept, which drops the leading blanks, the blanks between groups of bases
+// and the trailing position counter. Letters are lower-cased and, when UtoT
+// is true, 'u' is rewritten as 't'.
+//
+// It returns dest extended with the collected residues.
+func (s *ropeScanner) extractEmblSeq(dest []byte, UtoT bool) []byte {
+	for line := s.ReadLine(); line != nil; line = s.ReadLine() {
+		if bytes.HasPrefix(line, []byte("//")) {
+			return dest
+		}
+
+		for _, c := range line {
+			switch {
+			case c >= 'A' && c <= 'Z':
+				// Fold upper case to lower case.
+				c += 'a' - 'A'
+			case c < 'a' || c > 'z':
+				// Neither upper nor lower case letter: not a residue.
+				continue
+			}
+			if UtoT && c == 'u' {
+				c = 't'
+			}
+			dest = append(dest, c)
+		}
+	}
+	return dest
+}
+
+// EmblChunkParserRope parses an EMBL chunk directly from a rope without Pack().
+//
+// The rope is read line by line. For every record the parser collects the
+// identifier (text of the "ID" line up to the first ';'), the organism name
+// ("OS" line), the description ("DE" lines joined with single spaces), the
+// taxid found in a `/db_xref="taxon:N"` feature qualifier and, when
+// withFeatureTable is true, the raw "FH"/"FT" lines. A record is emitted
+// either when its sequence section is met (first line starting with five
+// spaces, read up to the closing "//") or, for records without sequence
+// data, when "//" is met while an identifier is pending.
+//
+// Parameters:
+//   - source: value passed to SetSource on every sequence produced.
+//   - rope: the chunk to parse.
+//   - withFeatureTable: when true the feature table text is attached to the
+//     sequences through SetFeatures.
+//   - UtoT: when true 'u' residues are rewritten as 't'.
+//
+// Every sequence receives the annotations "scientific_name" and "taxid";
+// taxid is 1 when the record carries no parsable taxon cross-reference.
+// The returned error is always nil in this implementation.
+func EmblChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
+	// Feature-table qualifier line carrying the taxid of the record.
+	taxonPrefix := []byte(`FT                   /db_xref="taxon:`)
+
+	scanner := newRopeScanner(rope)
+	sequences := obiseq.MakeBioSequenceSlice(100)[:0]
+
+	// Per-record state, cleared by reset once a record is complete.
+	var id string
+	var scientificName string
+	defBytes := make([]byte, 0, 256)
+	featBytes := make([]byte, 0, 1024)
+	// Start from the same default that reset restores, so that the first
+	// record of a chunk is handled like every following one.
+	taxid := 1
+
+	// emit turns the pending record state into a sequence owning seqData.
+	emit := func(seqData []byte) {
+		seq := obiseq.NewBioSequenceOwning(id, seqData, string(defBytes))
+		seq.SetSource(source)
+		if withFeatureTable {
+			// featBytes is truncated and reused for the next record; hand
+			// over a private copy in case SetFeatures retains the slice
+			// (its implementation is not visible from here).
+			seq.SetFeatures(bytes.Clone(featBytes))
+		}
+		annot := seq.Annotations()
+		annot["scientific_name"] = scientificName
+		annot["taxid"] = taxid
+		sequences = append(sequences, seq)
+	}
+
+	// reset clears the per-record state while keeping the buffers' capacity.
+	reset := func() {
+		id = ""
+		scientificName = ""
+		defBytes = defBytes[:0]
+		featBytes = featBytes[:0]
+		taxid = 1
+	}
+
+	for line := scanner.ReadLine(); line != nil; line = scanner.ReadLine() {
+		switch {
+		case bytes.HasPrefix(line, []byte("ID   ")):
+			id = string(bytes.SplitN(line[5:], []byte(";"), 2)[0])
+		case bytes.HasPrefix(line, []byte("OS   ")):
+			scientificName = string(bytes.TrimSpace(line[5:]))
+		case bytes.HasPrefix(line, []byte("DE   ")):
+			if len(defBytes) > 0 {
+				defBytes = append(defBytes, ' ')
+			}
+			defBytes = append(defBytes, bytes.TrimSpace(line[5:])...)
+		case withFeatureTable && bytes.HasPrefix(line, []byte("FH   ")):
+			featBytes = append(featBytes, line...)
+		case withFeatureTable && bytes.Equal(line, []byte("FH")):
+			featBytes = append(featBytes, '\n')
+			featBytes = append(featBytes, line...)
+		case bytes.HasPrefix(line, []byte("FT   ")):
+			if withFeatureTable {
+				featBytes = append(featBytes, '\n')
+				featBytes = append(featBytes, line...)
+			}
+			if bytes.HasPrefix(line, taxonPrefix) {
+				rest := line[len(taxonPrefix):]
+				end := bytes.IndexByte(rest, '"')
+				if end > 0 {
+					// A malformed number leaves taxid at 0, as Atoi's
+					// result is stored regardless of the error.
+					taxid, _ = strconv.Atoi(string(rest[:end]))
+				}
+			}
+		case bytes.HasPrefix(line, []byte("     ")):
+			// First line of the sequence section. ReadLine has already
+			// consumed it, so its residues are collected here; the helper
+			// then reads the following lines up to and including "//".
+			seqDest := make([]byte, 0, 4096)
+			for _, b := range line {
+				if b >= 'A' && b <= 'Z' {
+					b += 'a' - 'A'
+				}
+				if UtoT && b == 'u' {
+					b = 't'
+				}
+				if b >= 'a' && b <= 'z' {
+					seqDest = append(seqDest, b)
+				}
+			}
+			seqDest = scanner.extractEmblSeq(seqDest, UtoT)
+
+			emit(seqDest)
+			reset()
+
+		case bytes.Equal(line, []byte("//")):
+			// Record ended without SQ/sequence section (e.g. WGS entries).
+			if id != "" {
+				emit([]byte{})
+			}
+			reset()
+		}
+	}
+
+	return sequences, nil
+}
+
 func _ParseEmblFile(
 	input ChannelFileChunk,
 	out obiiter.IBioSequence,
@@ -171,7 +314,14 @@ func _ParseEmblFile(

 	for chunks := range input {
 		order := chunks.Order
-		sequences, err := parser(chunks.Source, chunks.Raw)
+		var sequences obiseq.BioSequenceSlice
+		var err error
+
+		if chunks.Rope != nil {
+			sequences, err = EmblChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
+		} else {
+			sequences, err = parser(chunks.Source, chunks.Raw)
+		}

 		if err != nil {
 			log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err)
@@ -196,6 +346,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
 		1024*1024*128,
 		EndOfLastFlatFileEntry,
 		"\nID   ",
+		false,
 	)

 	newIter := obiiter.MakeIBioSequence()
--- a/pkg/obiformats/fastaseq_read.go
+++ b/pkg/obiformats/fastaseq_read.go
@@ -209,28 +209,121 @@ func FastaChunkParser(UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlic
 	return parser
 }

+// extractFastaSeq copies the sequence part of a FASTA record from the rope
+// into dest. Line breaks ('\n', '\r') and blanks (' ', '\t') are dropped,
+// upper-case ASCII letters are lower-cased and, when UtoT is set, 'u' is
+// rewritten to 't'. Every other byte is appended unchanged.
+//
+// Scanning stops on a '>' that is the first byte of a line: the scanner is
+// left positioned on that '>' and (dest, true) is returned. When the rope is
+// exhausted first, (dest, false) is returned.
+func (s *ropeScanner) extractFastaSeq(dest []byte, UtoT bool) ([]byte, bool) {
+	atLineStart := true
+
+	for node := s.current; node != nil; node = s.current {
+		chunk := node.data[s.pos:]
+		for off := 0; off < len(chunk); off++ {
+			c := chunk[off]
+			switch {
+			case atLineStart && c == '>':
+				// off < len(chunk), so the new position is still inside node.
+				s.pos += off
+				return dest, true
+			case c == '\n', c == '\r':
+				atLineStart = true
+			case c == ' ', c == '\t':
+				atLineStart = false
+			default:
+				atLineStart = false
+				if 'A' <= c && c <= 'Z' {
+					c += 'a' - 'A'
+				}
+				if UtoT && c == 'u' {
+					c = 't'
+				}
+				dest = append(dest, c)
+			}
+		}
+		s.current = node.Next()
+		s.pos = 0
+	}
+
+	return dest, false
+}
+
+// FastaChunkParserRope parses a FASTA chunk directly from the rope without Pack().
+//
+// Lines are skipped until one starting with '>' is found. The header is split
+// at the first space or tab, whichever comes first: the left part is the
+// sequence id, the right part (trimmed) is the definition. The sequence body
+// is then read with extractFastaSeq up to the next record or the end of the
+// rope. Each sequence is tagged with source.
+//
+// An empty sequence body terminates the program through log.Fatalf. The
+// returned error is always nil.
+func FastaChunkParserRope(source string, rope *PieceOfChunk, UtoT bool) (obiseq.BioSequenceSlice, error) {
+	scanner := newRopeScanner(rope)
+	sequences := obiseq.MakeBioSequenceSlice(100)[:0]
+
+	for {
+		bline := scanner.ReadLine()
+		if bline == nil {
+			break
+		}
+		if len(bline) == 0 || bline[0] != '>' {
+			continue
+		}
+
+		// Parse header: ">id definition". The id ends at the first blank,
+		// be it a space or a tab; looking for a space first would swallow a
+		// tab-separated definition into the id.
+		// bline is only valid until the next scanner call, hence the copies.
+		header := bline[1:]
+		var id string
+		var definition string
+		if sp := bytes.IndexAny(header, " \t"); sp < 0 {
+			id = string(header)
+		} else {
+			id = string(header[:sp])
+			definition = string(bytes.TrimSpace(header[sp+1:]))
+		}
+
+		seqDest := make([]byte, 0, 4096)
+		var hasMore bool
+		seqDest, hasMore = scanner.extractFastaSeq(seqDest, UtoT)
+
+		if len(seqDest) == 0 {
+			log.Fatalf("%s [%s]: sequence is empty", source, id)
+		}
+
+		seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
+		seq.SetSource(source)
+		sequences = append(sequences, seq)
+
+		if !hasMore {
+			break
+		}
+	}
+
+	return sequences, nil
+}
+
 func _ParseFastaFile(
 	input ChannelFileChunk,
 	out obiiter.IBioSequence,
 	UtoT bool,
 ) {
-
 	parser := FastaChunkParser(UtoT)

 	for chunks := range input {
-		sequences, err := parser(chunks.Source, chunks.Raw)
-		// obilog.Warnf("Chunck(%d:%d) -%d- ", chunks.Order, l, sequences.Len())
+		var sequences obiseq.BioSequenceSlice
+		var err error
+
+		if chunks.Rope != nil {
+			sequences, err = FastaChunkParserRope(chunks.Source, chunks.Rope, UtoT)
+		} else {
+			sequences, err = parser(chunks.Source, chunks.Raw)
+		}

 		if err != nil {
 			log.Fatalf("File %s : Cannot parse the fasta file : %v", chunks.Source, err)
 		}

 		out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences))
-
 	}

 	out.Done()
-
 }

 func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
@@ -245,6 +338,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
 		1024*1024,
 		EndOfLastFastaEntry,
 		"\n>",
+		false,
 	)

 	for i := 0; i < nworker; i++ {
--- a/pkg/obiformats/fastqseq_read.go
+++ b/pkg/obiformats/fastqseq_read.go
@@ -303,6 +303,80 @@ func FastqChunkParser(quality_shift byte, with_quality bool, UtoT bool) func(str
 	return parser
 }

+// FastqChunkParserRope parses a FASTQ chunk directly from a rope without Pack().
+//
+// Records are read as four consecutive lines: "@id [definition]", the
+// sequence, the "+" separator (content ignored) and the quality string.
+// Lines that do not start with '@' where a header is expected are skipped.
+// Sequence letters are lower-cased and, when UtoT is set, 'u' becomes 't'.
+// When with_quality is set, quality_shift is subtracted from every quality
+// byte before the qualities are attached to the sequence.
+//
+// Malformed input terminates the program through log.Fatalf: a missing or
+// empty sequence line and, when with_quality is set, a missing quality line
+// or a quality string whose length differs from the sequence length. The
+// returned error is always nil.
+func FastqChunkParserRope(source string, rope *PieceOfChunk, quality_shift byte, with_quality, UtoT bool) (obiseq.BioSequenceSlice, error) {
+	scanner := newRopeScanner(rope)
+	sequences := obiseq.MakeBioSequenceSlice(100)[:0]
+
+	for {
+		// Line 1: @id [definition]
+		hline := scanner.ReadLine()
+		if hline == nil {
+			break
+		}
+		if len(hline) == 0 || hline[0] != '@' {
+			continue
+		}
+
+		// The id ends at the first blank, space or tab, whichever comes
+		// first. hline is only valid until the next ReadLine, hence the
+		// string copies.
+		header := hline[1:]
+		var id string
+		var definition string
+		if sp := bytes.IndexAny(header, " \t"); sp < 0 {
+			id = string(header)
+		} else {
+			id = string(header[:sp])
+			definition = string(bytes.TrimSpace(header[sp+1:]))
+		}
+
+		// Line 2: sequence
+		sline := scanner.ReadLine()
+		if sline == nil {
+			log.Fatalf("@%s[%s]: unexpected EOF after header", id, source)
+		}
+		seqDest := make([]byte, len(sline))
+		for i, b := range sline {
+			if b >= 'A' && b <= 'Z' {
+				b += 'a' - 'A'
+			}
+			if UtoT && b == 'u' {
+				b = 't'
+			}
+			seqDest[i] = b
+		}
+		if len(seqDest) == 0 {
+			log.Fatalf("@%s[%s]: sequence is empty", id, source)
+		}
+
+		// Line 3: + (skip)
+		scanner.ReadLine()
+
+		// Line 4: quality
+		qline := scanner.ReadLine()
+
+		seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
+		seq.SetSource(source)
+
+		if with_quality {
+			// A record without a usable quality string must not be
+			// emitted silently when the caller asked for qualities.
+			if qline == nil {
+				log.Fatalf("@%s[%s]: unexpected EOF before quality line", id, source)
+			}
+			if len(qline) != len(seqDest) {
+				log.Fatalf("@%s[%s]: sequence length (%d) and quality length (%d) differ",
+					id, source, len(seqDest), len(qline))
+			}
+			qDest := make([]byte, len(qline))
+			for i, q := range qline {
+				qDest[i] = q - quality_shift
+			}
+			seq.TakeQualities(qDest)
+		}
+
+		sequences = append(sequences, seq)
+	}
+
+	return sequences, nil
+}
+
 func _ParseFastqFile(
 	input ChannelFileChunk,
 	out obiiter.IBioSequence,
@@ -313,7 +387,14 @@ func _ParseFastqFile(
 	parser := FastqChunkParser(quality_shift, with_quality, UtoT)

 	for chunks := range input {
-		sequences, err := parser(chunks.Source, chunks.Raw)
+		var sequences obiseq.BioSequenceSlice
+		var err error
+
+		if chunks.Rope != nil {
+			sequences, err = FastqChunkParserRope(chunks.Source, chunks.Rope, quality_shift, with_quality, UtoT)
+		} else {
+			sequences, err = parser(chunks.Source, chunks.Raw)
+		}

 		if err != nil {
 			log.Fatalf("File %s : Cannot parse the fastq file : %v", chunks.Source, err)
@@ -339,6 +420,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
 		1024*1024,
 		EndOfLastFastqEntry,
 		"\n@",
+		false,
 	)

 	for i := 0; i < nworker; i++ {
--- a/pkg/obiformats/fastseq_json_header.go
+++ b/pkg/obiformats/fastseq_json_header.go
@@ -296,7 +296,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {

 			case strings.HasSuffix(skey, "_taxid"):
 				if dataType == jsonparser.Number || dataType == jsonparser.String {
-					rank, _ := obiutils.SplitInTwo(skey, '_')
+					rank := skey[:len(skey)-len("_taxid")]

 					taxid := string(value)
 					sequence.SetTaxid(taxid, rank)
--- a/pkg/obiformats/fastseq_write_fasta.go
+++ b/pkg/obiformats/fastseq_write_fasta.go
@@ -77,45 +77,47 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string {
 //
 // It returns a byte array containing the formatted sequences.
 func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
-	// Create a buffer to store the formatted sequences
 	var bs bytes.Buffer

 	lt := 0
-
 	for _, seq := range batch.Slice() {
 		lt += seq.Len()
 	}

-	// Iterate over each sequence in the batch
+	// Pre-allocate: sequence data + newlines every 60 chars + ~100 bytes header per sequence
+	bs.Grow(lt + lt/60 + 100*batch.Len() + 1)
+
 	log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
-	first := true
+
 	for _, seq := range batch.Slice() {
-		// Check if the sequence is empty
 		if seq.Len() > 0 {
-			// Format the sequence using the provided formater function
-			formattedSeq := FormatFasta(seq, formater)
-
-			if first {
-				bs.Grow(lt + (len(formattedSeq)-seq.Len())*batch.Len()*5/4)
-				first = false
-			}
-
-			// Append the formatted sequence to the buffer
-			bs.WriteString(formattedSeq)
+			// Write header directly into bs — no intermediate string
+			bs.WriteByte('>')
+			bs.WriteString(seq.Id())
+			bs.WriteByte(' ')
+			bs.WriteString(formater(seq))
 			bs.WriteByte('\n')
+
+			// Write folded sequence directly into bs — no copies
+			s := seq.Sequence()
+			l := len(s)
+			for i := 0; i < l; i += 60 {
+				to := i + 60
+				if to > l {
+					to = l
+				}
+				bs.Write(s[i:to])
+				bs.WriteByte('\n')
+			}
 		} else {
-			// Handle empty sequences
 			if skipEmpty {
-				// Skip empty sequences if skipEmpty is true
 				obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id())
 			} else {
-				// Terminate the program if skipEmpty is false
 				log.Fatalf("Sequence %s is empty", seq.Id())
 			}
 		}
 	}

-	// Return the byte array representation of the buffer
 	return &bs
 }

--- a/pkg/obiformats/file_chunk_read.go
+++ b/pkg/obiformats/file_chunk_read.go
@@ -16,6 +16,7 @@ type SeqFileChunkParser func(string, io.Reader) (obiseq.BioSequenceSlice, error)
 type FileChunk struct {
 	Source string
 	Raw    *bytes.Buffer
+	Rope   *PieceOfChunk
 	Order  int
 }

@@ -97,11 +98,17 @@ func (piece *PieceOfChunk) IsLast() bool {
 	return piece.next == nil
 }

+// FileChunk builds the FileChunk value sent to the parsers for this rope.
+// The receiver is first replaced by piece.Head(). When pack is true, Pack()
+// is called and Raw wraps the resulting piece.data; otherwise Raw stays nil.
+// Rope is always set to the head piece, and Source/Order are copied from the
+// arguments.
-func (piece *PieceOfChunk) FileChunk(source string, order int) FileChunk {
+func (piece *PieceOfChunk) FileChunk(source string, order int, pack bool) FileChunk {
+	piece = piece.Head()
+	var raw *bytes.Buffer
+	if pack {
 		piece.Pack()
+		raw = bytes.NewBuffer(piece.data)
+	}
 	return FileChunk{
 		Source: source,
-		Raw:    bytes.NewBuffer(piece.data),
+		Raw:    raw,
+		Rope:   piece,
 		Order:  order,
 	}
 }
@@ -133,7 +140,8 @@ func ReadFileChunk(
 	reader io.Reader,
 	fileChunkSize int,
 	splitter LastSeqRecord,
-	probe string) ChannelFileChunk {
+	probe string,
+	pack bool) ChannelFileChunk {

 	chunk_channel := make(ChannelFileChunk)

@@ -205,7 +213,7 @@ func ReadFileChunk(

 				if len(pieces.data) > 0 {
 					// obilog.Warnf("chuck %d :Read %d bytes from file %s", i, io.Len(), source)
-					chunk_channel <- pieces.FileChunk(source, i)
+					chunk_channel <- pieces.FileChunk(source, i, pack)
 					i++
 				}

@@ -222,7 +230,7 @@ func ReadFileChunk(

 		// Send the last chunk to the channel
 		if pieces.Len() > 0 {
-			chunk_channel <- pieces.FileChunk(source, i)
+			chunk_channel <- pieces.FileChunk(source, i, pack)
 		}

 		// Close the readers channel when the end of the file is reached
--- a/pkg/obiformats/genbank_read.go
+++ b/pkg/obiformats/genbank_read.go
@@ -29,6 +29,265 @@ const (

 var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")

+// extractSequence scans the ORIGIN section byte-by-byte directly on the rope,
+// appending compacted bases to dest. Returns the extended slice.
+// Stops and returns when "//" is found at the start of a line.
+// The scanner is left positioned after the "//" line.
+//
+// On each line, leading spaces and digits (the position counter) are skipped;
+// after the first other byte, every byte except ' ', '\r' and '\n' is
+// appended. When UtoT is true, 'u' is replaced by 't'. If the rope ends
+// before a line starting with '/', whatever was collected is returned.
+func (s *ropeScanner) extractSequence(dest []byte, UtoT bool) []byte {
+	lineStart := true
+	skipDigits := true
+
+	for s.current != nil {
+		data := s.current.data[s.pos:]
+		for i, b := range data {
+			if lineStart {
+				// Only the first byte of the line is tested: a leading '/'
+				// is taken as the record terminator.
+				if b == '/' {
+					// End-of-record marker "//"
+					s.pos += i + 1
+					if s.pos >= len(s.current.data) {
+						s.current = s.current.Next()
+						s.pos = 0
+					}
+					// Consume the rest of the terminator line.
+					s.skipToNewline()
+					return dest
+				}
+				lineStart = false
+				skipDigits = true
+			}
+			switch {
+			case b == '\n':
+				lineStart = true
+			case b == '\r':
+				// skip
+			case skipDigits:
+				// Still inside the leading blanks/counter of the line: the
+				// first byte that is neither a space nor a digit is a base.
+				if b != ' ' && (b < '0' || b > '9') {
+					skipDigits = false
+					if UtoT && b == 'u' {
+						b = 't'
+					}
+					dest = append(dest, b)
+				}
+			case b != ' ':
+				if UtoT && b == 'u' {
+					b = 't'
+				}
+				dest = append(dest, b)
+			}
+		}
+		// Node exhausted without finding the terminator: go on with the next one.
+		s.current = s.current.Next()
+		s.pos = 0
+	}
+	return dest
+}
+
+// parseLseqFromLocus reads the sequence length announced on a LOCUS line.
+// Expected layout: "LOCUS       <id> <length> bp ...": the id starts at
+// byte offset 12, is followed by one or more spaces, then by the decimal
+// length.
+// It returns -1 when the line is too short, when no digit follows the id, or
+// when the number cannot be converted to an int.
+func parseLseqFromLocus(line []byte) int {
+	const idOffset = 12
+
+	if len(line) <= idOffset {
+		return -1
+	}
+
+	// Step over the identifier, then over the blanks that follow it.
+	pos := idOffset
+	for ; pos < len(line) && line[pos] != ' '; pos++ {
+	}
+	for ; pos < len(line) && line[pos] == ' '; pos++ {
+	}
+
+	// Delimit the run of decimal digits.
+	end := pos
+	for ; end < len(line) && line[end] >= '0' && line[end] <= '9'; end++ {
+	}
+	if end == pos {
+		return -1
+	}
+
+	if length, err := strconv.Atoi(string(line[pos:end])); err == nil {
+		return length
+	}
+	return -1
+}
+
+// Prefix constants for GenBank section headers (byte slices for zero-alloc comparison).
+// The keyword prefixes LOCUS, DEFINITION, SOURCE and FEATURES are padded with
+// spaces to 12 bytes, the offset at which the parser slices the line content.
+var (
+	gbPfxLocus      = []byte("LOCUS       ")
+	gbPfxDefinition = []byte("DEFINITION  ")
+	gbPfxContinue   = []byte("            ") // 12 blanks: continuation of a DEFINITION
+	gbPfxSource     = []byte("SOURCE      ")
+	gbPfxFeatures   = []byte("FEATURES    ")
+	gbPfxOrigin     = []byte("ORIGIN")
+	gbPfxContig     = []byte("CONTIG")
+	gbPfxEnd        = []byte("//") // compared with bytes.Equal against the whole line
+	gbPfxDbXref     = []byte(`                     /db_xref="taxon:`) // feature qualifier holding the taxid
+)
+
+// GenbankChunkParserRope parses a GenBank FileChunk directly from the rope
+// (PieceOfChunk linked list) without calling Pack(). This eliminates the large
+// contiguous allocation required for chromosomal-scale sequences.
+//
+// The rope is read line by line through a small state machine
+// (inHeader -> inEntry -> inDefinition/inFeature -> inContig). For each record:
+//   - LOCUS gives the id and the announced length, used to pre-size the
+//     sequence buffer;
+//   - DEFINITION and its continuation lines are joined with single spaces;
+//   - SOURCE gives the "scientific_name" annotation;
+//   - the FEATURES section is kept verbatim when withFeatureTable is true,
+//     and a /db_xref="taxon:N" qualifier gives the "taxid" annotation
+//     (default 1);
+//   - ORIGIN hands the scanner over to extractSequence, which consumes the
+//     bases and the closing "//";
+//   - records with a CONTIG section and no ORIGIN are emitted on "//" with
+//     an empty sequence.
+//
+// All per-record values are reset once a record has been emitted, so that a
+// record lacking a field never inherits it from the previous one. Keywords
+// met in an unexpected state terminate the program through log.Fatalf. The
+// returned error is always nil.
+func GenbankChunkParserRope(source string, rope *PieceOfChunk,
+	withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
+
+	state := inHeader
+	scanner := newRopeScanner(rope)
+	sequences := obiseq.MakeBioSequenceSlice(100)[:0]
+
+	id := ""
+	lseq := -1
+	scientificName := ""
+	defBytes := new(bytes.Buffer)
+	featBytes := new(bytes.Buffer)
+	var seqDest []byte
+	taxid := 1
+	nl := 0
+
+	// emit turns the values accumulated for the current record into a
+	// sequence, appends it to the result and resets the per-record state.
+	emit := func() {
+		if id == "" {
+			log.Warn("Empty id when parsing genbank file")
+		}
+		sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
+		sequence.SetSource(source)
+		if withFeatureTable {
+			sequence.SetFeatures(featBytes.Bytes())
+		}
+		annot := sequence.Annotations()
+		annot["scientific_name"] = scientificName
+		annot["taxid"] = taxid
+		sequences = append(sequences, sequence)
+
+		id = ""
+		scientificName = ""
+		defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
+		featBytes = new(bytes.Buffer)
+		nl = 0
+		taxid = 1
+		seqDest = nil
+		state = inHeader
+	}
+
+	for bline := scanner.ReadLine(); bline != nil; bline = scanner.ReadLine() {
+		nl++
+		// A line closing a multi-line DEFINITION must be examined a second
+		// time with the restored state, hence the inner loop.
+		processed := false
+		for !processed {
+			switch {
+
+			case bytes.HasPrefix(bline, gbPfxLocus):
+				if state != inHeader {
+					log.Fatalf("Line %d - Unexpected state %d while reading LOCUS: %s", nl, state, bline)
+				}
+				rest := bline[12:]
+				sp := bytes.IndexByte(rest, ' ')
+				if sp < 0 {
+					id = string(rest)
+				} else {
+					id = string(rest[:sp])
+				}
+				lseq = parseLseqFromLocus(bline)
+				cap0 := lseq + 20
+				if cap0 < 1024 {
+					cap0 = 1024
+				}
+				seqDest = make([]byte, 0, cap0)
+				state = inEntry
+				processed = true
+
+			case bytes.HasPrefix(bline, gbPfxDefinition):
+				if state != inEntry {
+					log.Fatalf("Line %d - Unexpected state %d while reading DEFINITION: %s", nl, state, bline)
+				}
+				defBytes.Write(bytes.TrimSpace(bline[12:]))
+				state = inDefinition
+				processed = true
+
+			case state == inDefinition:
+				if bytes.HasPrefix(bline, gbPfxContinue) {
+					defBytes.WriteByte(' ')
+					defBytes.Write(bytes.TrimSpace(bline[12:]))
+					processed = true
+				} else {
+					// End of the definition: re-dispatch the same line.
+					state = inEntry
+				}
+
+			case bytes.HasPrefix(bline, gbPfxSource):
+				if state != inEntry {
+					log.Fatalf("Line %d - Unexpected state %d while reading SOURCE: %s", nl, state, bline)
+				}
+				scientificName = string(bytes.TrimSpace(bline[12:]))
+				processed = true
+
+			case bytes.HasPrefix(bline, gbPfxFeatures):
+				if state != inEntry {
+					log.Fatalf("Line %d - Unexpected state %d while reading FEATURES: %s", nl, state, bline)
+				}
+				if withFeatureTable {
+					featBytes.Write(bline)
+				}
+				state = inFeature
+				processed = true
+
+			case bytes.HasPrefix(bline, gbPfxOrigin):
+				if state != inFeature && state != inContig {
+					log.Fatalf("Line %d - Unexpected state %d while reading ORIGIN: %s", nl, state, bline)
+				}
+				// Use fast byte-scan to extract sequence and consume through "//"
+				seqDest = scanner.extractSequence(seqDest, UtoT)
+				emit()
+				processed = true
+
+			case bytes.HasPrefix(bline, gbPfxContig):
+				if state != inFeature && state != inContig {
+					log.Fatalf("Line %d - Unexpected state %d while reading CONTIG: %s", nl, state, bline)
+				}
+				state = inContig
+				processed = true
+
+			case bytes.Equal(bline, gbPfxEnd):
+				// Reached for CONTIG records (no ORIGIN section)
+				if state != inContig {
+					log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
+				}
+				emit()
+				processed = true
+
+			default:
+				switch state {
+				case inFeature:
+					if withFeatureTable {
+						featBytes.WriteByte('\n')
+						featBytes.Write(bline)
+					}
+					if bytes.HasPrefix(bline, gbPfxDbXref) {
+						rest := bline[len(gbPfxDbXref):]
+						if q := bytes.IndexByte(rest, '"'); q >= 0 {
+							// Keep the current taxid when the value is
+							// not a valid integer instead of zeroing it.
+							if v, err := strconv.Atoi(string(rest[:q])); err == nil {
+								taxid = v
+							}
+						}
+					}
+					processed = true
+				case inHeader, inEntry, inContig:
+					processed = true
+				default:
+					log.Fatalf("Unexpected state %d while reading: %s", state, bline)
+				}
+			}
+		}
+	}
+
+	return sequences, nil
+}
+
 func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
 	return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
 		state := inHeader
@@ -125,13 +384,10 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
 					if state != inSequence && state != inContig {
 						log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
 					}
-					// log.Debugln("Total lines := ", nl)
 					if id == "" {
 						log.Warn("Empty id when parsing genbank file")
 					}

-					// log.Debugf("End of sequence %s: %dbp ", id, seqBytes.Len())
-
 					sequence := obiseq.NewBioSequence(id,
 						seqBytes.Bytes(),
 						defBytes.String())
@@ -144,9 +400,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
 					annot := sequence.Annotations()
 					annot["scientific_name"] = scientificName
 					annot["taxid"] = taxid
-					// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))
-					// log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(),
-					//	sequence.Len(), seqBytes.Len())

 					sequences = append(sequences, sequence)

@@ -159,12 +412,11 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
 					processed = true

 				case state == inSequence:
-					// log.Debugf("Chunk %d : Genbank: line %d, state = %d : %s", chunks.order, nl, state, line)
-
 					sl++
-					parts := strings.SplitN(line[10:], " ", 6)
+					cleanline := strings.TrimSpace(line)
+					parts := strings.SplitN(cleanline, " ", 7)
 					lparts := len(parts)
-					for i := 0; i < lparts; i++ {
+					for i := 1; i < lparts; i++ {
 						if UtoT {
 							parts[i] = strings.ReplaceAll(parts[i], "u", "t")
 						}
@@ -197,6 +449,7 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob

 		}

+		_ = sl
 		return sequences, nil
 	}
 }
@@ -205,10 +458,16 @@ func _ParseGenbankFile(input ChannelFileChunk,
 	out obiiter.IBioSequence,
 	withFeatureTable, UtoT bool) {

-	parser := GenbankChunkParser(withFeatureTable, UtoT)
-
 	for chunks := range input {
-		sequences, err := parser(chunks.Source, chunks.Raw)
+		var sequences obiseq.BioSequenceSlice
+		var err error
+
+		if chunks.Rope != nil {
+			sequences, err = GenbankChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
+		} else {
+			parser := GenbankChunkParser(withFeatureTable, UtoT)
+			sequences, err = parser(chunks.Source, chunks.Raw)
+		}

 		if err != nil {
 			log.Fatalf("File %s : Cannot parse the genbank file : %v", chunks.Source, err)
@@ -224,7 +483,6 @@ func _ParseGenbankFile(input ChannelFileChunk,

 func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
 	opt := MakeOptions(options)
-	// entry_channel := make(chan _FileChunk)

 	entry_channel := ReadFileChunk(
 		opt.Source(),
@@ -232,13 +490,13 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
 		1024*1024*128,
 		EndOfLastFlatFileEntry,
 		"\nLOCUS       ",
+		false, // do not pack: rope-based parser avoids contiguous allocation
 	)

 	newIter := obiiter.MakeIBioSequence()

 	nworkers := opt.ParallelWorkers()

-	// for j := 0; j < opt.ParallelWorkers(); j++ {
 	for j := 0; j < nworkers; j++ {
 		newIter.Add(1)
 		go _ParseGenbankFile(
@@ -249,8 +507,6 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
 		)
 	}

-	// go _ReadFlatFileChunk(reader, entry_channel)
-
 	go func() {
 		newIter.WaitAndClose()
 		log.Debug("End of the genbank file ", opt.Source())
--- a/pkg/obiformats/rope_scanner.go
+++ b/pkg/obiformats/rope_scanner.go
@@ -0,0 +1,77 @@
+package obiformats
+
+import "bytes"
+
+// ropeScanner reads lines from a PieceOfChunk rope.
+// The carry buffer handles lines that span two rope nodes; it grows as needed.
+type ropeScanner struct {
+	current *PieceOfChunk // node being read; nil once the rope is exhausted
+	pos     int           // offset of the next unread byte in current.data
+	carry   []byte        // start of a line collected from already consumed nodes
+}
+
+// newRopeScanner returns a scanner positioned on the first byte of rope,
+// with an empty carry buffer.
+func newRopeScanner(rope *PieceOfChunk) *ropeScanner {
+	scanner := new(ropeScanner)
+	scanner.current = rope
+	return scanner
+}
+
+// ReadLine returns the next line without the trailing \n (or \r\n).
+// Returns nil at end of rope. The returned slice aliases carry[] or the node
+// data and is valid only until the next ReadLine call.
+//
+// A last line that is not terminated by \n is returned as well, and a
+// trailing \r is stripped from it like from any other line. An empty line
+// yields an empty, non-nil slice, so that nil always means end of rope.
+func (s *ropeScanner) ReadLine() []byte {
+	for s.current != nil {
+		data := s.current.data[s.pos:]
+		idx := bytes.IndexByte(data, '\n')
+
+		if idx < 0 {
+			// No \n in this node: accumulate into carry and advance
+			s.carry = append(s.carry, data...)
+			s.current = s.current.Next()
+			s.pos = 0
+			continue
+		}
+
+		line := data[:idx]
+		if len(s.carry) > 0 {
+			// The line started in a previous node: complete it in carry and
+			// hand carry out, keeping its storage for the next spanning line.
+			s.carry = append(s.carry, line...)
+			line = s.carry
+			s.carry = s.carry[:0]
+		}
+
+		s.pos += idx + 1
+		if s.pos >= len(s.current.data) {
+			s.current = s.current.Next()
+			s.pos = 0
+		}
+		return trimTrailingCR(line)
+	}
+
+	// End of rope: flush an unterminated last line, if any.
+	if len(s.carry) == 0 {
+		return nil
+	}
+	line := s.carry
+	s.carry = s.carry[:0]
+	return trimTrailingCR(line)
+}
+
+// trimTrailingCR removes a single trailing '\r' from line, if present.
+func trimTrailingCR(line []byte) []byte {
+	if n := len(line); n > 0 && line[n-1] == '\r' {
+		return line[:n-1]
+	}
+	return line
+}
+
+// skipToNewline moves the scanner just after the next '\n' found from the
+// current position, walking over as many rope nodes as needed. When no '\n'
+// is left, the scanner ends up at the end of the rope (current == nil).
+// The carry buffer is not touched.
+func (s *ropeScanner) skipToNewline() {
+	for ; s.current != nil; s.current, s.pos = s.current.Next(), 0 {
+		nl := bytes.IndexByte(s.current.data[s.pos:], '\n')
+		if nl < 0 {
+			continue
+		}
+
+		s.pos += nl + 1
+		if s.pos >= len(s.current.data) {
+			// The '\n' was the last byte of the node.
+			s.current = s.current.Next()
+			s.pos = 0
+		}
+		return
+	}
+}
--- a/pkg/obiiter/batchiterator.go
+++ b/pkg/obiiter/batchiterator.go
@@ -444,6 +444,67 @@ func (iterator IBioSequence) Rebatch(size int) IBioSequence {
 	return newIter
 }

+// RebatchBySize reorganises the stream into batches bounded by two independent
+// upper limits: maxBytes (max cumulative estimated memory, as reported by
+// MemorySize) and maxCount (max number of sequences). A batch is flushed as
+// soon as either limit would be exceeded. A single sequence larger than
+// maxBytes is always emitted alone.
+// Passing 0 (or a negative value) for a limit disables that constraint; if
+// both are disabled it falls back to Rebatch(obidefault.BatchSizeMax()).
+//
+// Input batches are consumed in order (SortBatches) and output batches are
+// numbered from 0. Each output batch carries the source of the input batch
+// most recently read when it is flushed. The paired status of the receiver
+// is propagated to the returned iterator.
+func (iterator IBioSequence) RebatchBySize(maxBytes int, maxCount int) IBioSequence {
+	if maxBytes <= 0 && maxCount <= 0 {
+		return iterator.Rebatch(obidefault.BatchSizeMax())
+	}
+
+	newIter := MakeIBioSequence()
+
+	// Read the paired status before any goroutine is started, so that the
+	// receiver is never read and written concurrently.
+	if iterator.IsPaired() {
+		newIter.MarkAsPaired()
+	}
+
+	newIter.Add(1)
+
+	go func() {
+		newIter.WaitAndClose()
+	}()
+
+	go func() {
+		order := 0
+		// A local variable is used instead of reassigning the captured
+		// receiver, which the calling goroutine may still be reading.
+		sorted := iterator.SortBatches()
+		buffer := obiseq.MakeBioSequenceSlice()
+		bufBytes := 0
+		source := ""
+
+		flush := func() {
+			if len(buffer) > 0 {
+				newIter.Push(MakeBioSequenceBatch(source, order, buffer))
+				order++
+				buffer = obiseq.MakeBioSequenceSlice()
+				bufBytes = 0
+			}
+		}
+
+		for sorted.Next() {
+			seqs := sorted.Get()
+			source = seqs.Source()
+			for _, s := range seqs.Slice() {
+				sz := s.MemorySize()
+				countFull := maxCount > 0 && len(buffer) >= maxCount
+				// The len(buffer) > 0 guard lets an oversized sequence
+				// through as a batch of its own.
+				memFull := maxBytes > 0 && bufBytes+sz > maxBytes && len(buffer) > 0
+				if countFull || memFull {
+					flush()
+				}
+				buffer = append(buffer, s)
+				bufBytes += sz
+			}
+		}
+		flush()
+
+		newIter.Done()
+	}()
+
+	return newIter
+}
+
 func (iterator IBioSequence) FilterEmpty() IBioSequence {

 	newIter := MakeIBioSequence()
@@ -638,7 +699,7 @@ func (iterator IBioSequence) FilterOn(predicate obiseq.SequencePredicate,
 		trueIter.MarkAsPaired()
 	}

-	return trueIter.Rebatch(size)
+	return trueIter.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
 }

 func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
@@ -694,7 +755,7 @@ func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
 		trueIter.MarkAsPaired()
 	}

-	return trueIter.Rebatch(size)
+	return trueIter.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
 }

 // Load all sequences availables from an IBioSequenceBatch iterator into
--- a/pkg/obiiter/distribute.go
+++ b/pkg/obiiter/distribute.go
@@ -57,34 +57,21 @@ func (dist *IDistribute) Classifier() *obiseq.BioSequenceClassifier {
 }

 // Distribute organizes the biosequences from the iterator into batches
-// based on the provided classifier and batch sizes. It returns an
-// IDistribute instance that manages the distribution of the sequences.
+// based on the provided classifier. It returns an IDistribute instance
+// that manages the distribution of the sequences.
 //
-// Parameters:
-//   - class: A pointer to a BioSequenceClassifier used to classify
-//     the biosequences during distribution.
-//   - sizes: Optional integer values specifying the batch size. If
-//     no sizes are provided, a default batch size of 5000 is used.
-//
-// Returns:
-// An IDistribute instance that contains the outputs of the
-// classified biosequences, a channel for new data notifications,
-// and the classifier used for distribution. The method operates
-// asynchronously, processing the sequences in separate goroutines.
-// It ensures that the outputs are closed and cleaned up once
-// processing is complete.
-func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, sizes ...int) IDistribute {
-	batchsize := obidefault.BatchSize()
+// Batches are flushed when either BatchSizeMax() sequences or BatchMem()
+// bytes are accumulated per key, mirroring the RebatchBySize strategy.
+func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier) IDistribute {
+	maxCount := obidefault.BatchSizeMax()
+	maxBytes := obidefault.BatchMem()

 	outputs := make(map[int]IBioSequence, 100)
 	slices := make(map[int]*obiseq.BioSequenceSlice, 100)
+	bufBytes := make(map[int]int, 100)
 	orders := make(map[int]int, 100)
 	news := make(chan int)

-	if len(sizes) > 0 {
-		batchsize = sizes[0]
-	}
-
 	jobDone := sync.WaitGroup{}
 	lock := sync.Mutex{}

@@ -115,6 +102,7 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz
 					slice = &s
 					slices[key] = slice
 					orders[key] = 0
+					bufBytes[key] = 0

 					lock.Lock()
 					outputs[key] = MakeIBioSequence()
@@ -123,14 +111,20 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz
 					news <- key
 				}

-				*slice = append(*slice, s)
-
-				if len(*slice) == batchsize {
+				sz := s.MemorySize()
+				countFull := maxCount > 0 && len(*slice) >= maxCount
+				memFull := maxBytes > 0 && bufBytes[key]+sz > maxBytes && len(*slice) > 0
+				if countFull || memFull {
 					outputs[key].Push(MakeBioSequenceBatch(source, orders[key], *slice))
 					orders[key]++
 					s := obiseq.MakeBioSequenceSlice()
 					slices[key] = &s
+					slice = &s
+					bufBytes[key] = 0
 				}
+
+				*slice = append(*slice, s)
+				bufBytes[key] += sz
 			}
 		}

--- a/pkg/obiiter/fragment.go
+++ b/pkg/obiiter/fragment.go
@@ -3,6 +3,7 @@ package obiiter
 import (
 	log "github.com/sirupsen/logrus"

+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
 )

@@ -70,7 +71,7 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
 		}
 		go f(iterator)

-		return newiter.SortBatches().Rebatch(size)
+		return newiter.SortBatches().RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
 	}

 	return ifrg
--- a/pkg/obiiter/speed.go
+++ b/pkg/obiiter/speed.go
@@ -5,18 +5,30 @@ import (
 	"os"
 	"time"

+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
 	"github.com/schollz/progressbar/v3"
 )

 func (iterator IBioSequence) Speed(message string, size ...int) IBioSequence {

-	// If the STDERR is redicted and doesn't end up to a terminal
+	// If the progress bar is disabled via --no-progressbar option
+	if !obidefault.ProgressBar() {
+		return iterator
+	}
+
+	// If the STDERR is redirected and doesn't end up to a terminal
 	// No progress bar is printed.
 	o, _ := os.Stderr.Stat()
 	if (o.Mode() & os.ModeCharDevice) != os.ModeCharDevice {
 		return iterator
 	}

+	// If stdout is piped, no progress bar is printed.
+	oo, _ := os.Stdout.Stat()
+	if (oo.Mode() & os.ModeNamedPipe) == os.ModeNamedPipe {
+		return iterator
+	}
+
 	newIter := MakeIBioSequence()

 	newIter.Add(1)
--- a/pkg/obikmer/encodekmer.go
+++ b/pkg/obikmer/encodekmer.go
@@ -447,141 +447,6 @@ func IterCanonicalKmers(seq []byte, k int) iter.Seq[uint64] {
 		}
 	}
 }
-
-// SuperKmer represents a maximal subsequence where all consecutive k-mers
-// share the same minimizer. A minimizer is the smallest canonical m-mer
-// among the (k-m+1) m-mers contained in a k-mer.
-type SuperKmer struct {
-	Minimizer uint64 // The canonical minimizer value (normalized m-mer)
-	Start     int    // Starting position in the original sequence (0-indexed)
-	End       int    // Ending position (exclusive, like Go slice notation)
-	Sequence  []byte // The actual DNA subsequence [Start:End]
-}
-
-// dequeItem represents an element in the monotone deque used for
-// tracking minimizers in a sliding window.
-type dequeItem struct {
-	position  int    // Position of the m-mer in the sequence
-	canonical uint64 // Canonical (normalized) m-mer value
-}
-
-// ExtractSuperKmers extracts super k-mers from a DNA sequence.
-// A super k-mer is a maximal subsequence where all consecutive k-mers
-// share the same minimizer. The minimizer of a k-mer is the smallest
-// canonical m-mer among its (k-m+1) constituent m-mers.
-//
-// The algorithm uses:
-// - Simultaneous forward/reverse m-mer encoding for O(1) canonical m-mer computation
-// - Monotone deque for O(1) amortized minimizer tracking per position
-//
-// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
-// available for error markers if needed.
-//
-// Parameters:
-//   - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
-//   - k: k-mer size (must be between m+1 and 31)
-//   - m: minimizer size (must be between 1 and k-1)
-//   - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
-//
-// Returns:
-//   - slice of SuperKmer structs representing maximal subsequences
-//   - nil if parameters are invalid or sequence is too short
-//
-// Time complexity: O(n) where n is the sequence length
-// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results
-func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer {
-	if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
-		return nil
-	}
-
-	var result []SuperKmer
-	if buffer == nil {
-		estimatedSize := len(seq) / k
-		if estimatedSize < 1 {
-			estimatedSize = 1
-		}
-		result = make([]SuperKmer, 0, estimatedSize)
-	} else {
-		result = (*buffer)[:0]
-	}
-
-	deque := make([]dequeItem, 0, k-m+1)
-
-	mMask := uint64(1)<<(m*2) - 1
-	rcShift := uint((m - 1) * 2)
-
-	var fwdMmer, rvcMmer uint64
-	for i := 0; i < m-1 && i < len(seq); i++ {
-		code := uint64(__single_base_code__[seq[i]&31])
-		fwdMmer = (fwdMmer << 2) | code
-		rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
-	}
-
-	superKmerStart := 0
-	var currentMinimizer uint64
-	firstKmer := true
-
-	for pos := m - 1; pos < len(seq); pos++ {
-		code := uint64(__single_base_code__[seq[pos]&31])
-		fwdMmer = ((fwdMmer << 2) | code) & mMask
-		rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
-
-		canonical := fwdMmer
-		if rvcMmer < fwdMmer {
-			canonical = rvcMmer
-		}
-
-		mmerPos := pos - m + 1
-
-		if pos >= k-1 {
-			windowStart := pos - k + 1
-			for len(deque) > 0 && deque[0].position < windowStart {
-				deque = deque[1:]
-			}
-		}
-
-		for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical {
-			deque = deque[:len(deque)-1]
-		}
-
-		deque = append(deque, dequeItem{position: mmerPos, canonical: canonical})
-
-		if pos >= k-1 {
-			newMinimizer := deque[0].canonical
-			kmerStart := pos - k + 1
-
-			if firstKmer {
-				currentMinimizer = newMinimizer
-				firstKmer = false
-			} else if newMinimizer != currentMinimizer {
-				endPos := kmerStart + k - 1
-				superKmer := SuperKmer{
-					Minimizer: currentMinimizer,
-					Start:     superKmerStart,
-					End:       endPos,
-					Sequence:  seq[superKmerStart:endPos],
-				}
-				result = append(result, superKmer)
-
-				superKmerStart = kmerStart
-				currentMinimizer = newMinimizer
-			}
-		}
-	}
-
-	if !firstKmer {
-		superKmer := SuperKmer{
-			Minimizer: currentMinimizer,
-			Start:     superKmerStart,
-			End:       len(seq),
-			Sequence:  seq[superKmerStart:],
-		}
-		result = append(result, superKmer)
-	}
-
-	return result
-}
-
 // ReverseComplement computes the reverse complement of an encoded k-mer.
 // The k-mer is encoded with 2 bits per nucleotide (A=00, C=01, G=10, T=11).
 // The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11.
--- a/pkg/obikmer/entropy.go
+++ b/pkg/obikmer/entropy.go
@@ -0,0 +1,281 @@
+package obikmer
+
+import "math"
+
+// KmerEntropy computes the entropy of a single encoded k-mer.
+//
+// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer
+// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
+// normalizes them by circular canonical form, counts their frequencies, and
+// computes Shannon entropy normalized by the maximum possible entropy.
+// The returned value is the minimum over all word sizes, rounded to 4 decimals.
+//
+// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
+// while a value close to 1 indicates high complexity.
+//
+// Parameters:
+//   - kmer: the encoded k-mer (2 bits per base)
+//   - k: the k-mer size (decoded into a 32-byte buffer; k < 1 yields 1.0)
+//   - levelMax: maximum sub-word size (typically 6); values >= k are lowered to k-1
+//
+// Returns:
+//   - minimum normalized entropy across word sizes 1..levelMax, or 1.0 when none applies
+func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
+	if k < 1 || levelMax < 1 {
+		return 1.0
+	}
+	if levelMax >= k {
+		levelMax = k - 1
+	}
+	if levelMax < 1 { // only reachable when k == 1
+		return 1.0
+	}
+
+	// Decode k-mer to DNA sequence
+	var seqBuf [32]byte
+	seq := DecodeKmer(kmer, k, seqBuf[:])
+
+	// Pre-compute nLogN lookup (same as lowmask): nLogN[i] = i*ln(i)
+	nLogN := make([]float64, k+1)
+	for i := 1; i <= k; i++ {
+		nLogN[i] = float64(i) * math.Log(float64(i))
+	}
+
+	// Build circular-canonical normalization tables per word size (index 0 unused)
+	normTables := make([][]int, levelMax+1)
+	for ws := 1; ws <= levelMax; ws++ {
+		size := 1 << (ws * 2)
+		normTables[ws] = make([]int, size)
+		for code := 0; code < size; code++ {
+			normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
+		}
+	}
+
+	minEntropy := math.MaxFloat64
+
+	for ws := 1; ws <= levelMax; ws++ {
+		nwords := k - ws + 1
+		if nwords < 1 {
+			continue
+		}
+
+		// Count circular-canonical sub-word frequencies
+		tableSize := 1 << (ws * 2)
+		table := make([]int, tableSize)
+		mask := (1 << (ws * 2)) - 1 // keeps the low 2*ws bits, i.e. the last ws bases
+
+		wordIndex := 0
+		for i := 0; i < ws-1; i++ {
+			wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
+		}
+
+		for i, j := 0, ws-1; j < k; i, j = i+1, j+1 { // i is unused in the body; j is the last base of the word
+			wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
+			normWord := normTables[ws][wordIndex]
+			table[normWord]++
+		}
+
+		// Compute Shannon entropy from the counts: ln(N) - sum(n*ln(n))/N
+		floatNwords := float64(nwords)
+		logNwords := math.Log(floatNwords)
+
+		var sumNLogN float64
+		for j := 0; j < tableSize; j++ {
+			n := table[j]
+			if n > 0 {
+				sumNLogN += nLogN[n]
+			}
+		}
+
+		// Compute emax (maximum possible entropy for this word size)
+		na := CanonicalCircularKmerCount(ws)
+		var emax float64
+		if nwords < na {
+			emax = math.Log(float64(nwords))
+		} else {
+			cov := nwords / na
+			remains := nwords - (na * cov)
+			f1 := float64(cov) / floatNwords
+			f2 := float64(cov+1) / floatNwords
+			emax = -(float64(na-remains)*f1*math.Log(f1) +
+				float64(remains)*f2*math.Log(f2))
+		}
+
+		if emax <= 0 { // guard the division below against a zero or negative normalizer
+			continue
+		}
+
+		entropy := (logNwords - sumNLogN/floatNwords) / emax
+		if entropy < 0 {
+			entropy = 0
+		}
+
+		if entropy < minEntropy {
+			minEntropy = entropy
+		}
+	}
+
+	if minEntropy == math.MaxFloat64 {
+		return 1.0
+	}
+
+	return math.Round(minEntropy*10000) / 10000
+}
+
+// KmerEntropyFilter is a reusable entropy filter for batch processing.
+// It pre-computes normalization tables and lookup values to avoid repeated
+// allocation across millions of k-mers.
+//
+// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
+// Each goroutine must create its own instance via NewKmerEntropyFilter.
+type KmerEntropyFilter struct {
+	k          int       // k-mer size
+	levelMax   int       // largest sub-word size, at most k-1 and at least 1
+	threshold  float64   // Accept rejects k-mers with entropy <= threshold
+	nLogN      []float64 // nLogN[i] = i*ln(i) for i in 1..k
+	normTables [][]int   // normTables[ws][code] = NormalizeCircular(code, ws)
+	emaxValues []float64 // emaxValues[ws] = entropy normalizer for word size ws
+	logNwords  []float64 // logNwords[ws] = ln(k-ws+1)
+	// Pre-allocated frequency tables reused across Entropy() calls.
+	// One per word size (index 0 unused). Reset to zero before each use.
+	freqTables [][]int
+}
+
+// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
+//
+// Parameters:
+//   - k: the k-mer size (not validated here)
+//   - levelMax: maximum sub-word size (typically 6); lowered to k-1 if >= k, then raised to 1 if < 1
+//   - threshold: entropy threshold (k-mers with entropy <= threshold are rejected)
+func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
+	if levelMax >= k {
+		levelMax = k - 1
+	}
+	if levelMax < 1 {
+		levelMax = 1
+	}
+
+	nLogN := make([]float64, k+1)
+	for i := 1; i <= k; i++ {
+		nLogN[i] = float64(i) * math.Log(float64(i))
+	}
+
+	normTables := make([][]int, levelMax+1)
+	for ws := 1; ws <= levelMax; ws++ {
+		size := 1 << (ws * 2)
+		normTables[ws] = make([]int, size)
+		for code := 0; code < size; code++ {
+			normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
+		}
+	}
+
+	emaxValues := make([]float64, levelMax+1)
+	logNwords := make([]float64, levelMax+1)
+	for ws := 1; ws <= levelMax; ws++ {
+		nw := k - ws + 1
+		na := CanonicalCircularKmerCount(ws)
+		if nw < na {
+			logNwords[ws] = math.Log(float64(nw))
+			emaxValues[ws] = math.Log(float64(nw)) // entropy of nw all-distinct words
+		} else {
+			cov := nw / na // integer division; remains (below) is nw mod na
+			remains := nw - (na * cov)
+			f1 := float64(cov) / float64(nw)
+			f2 := float64(cov+1) / float64(nw)
+			logNwords[ws] = math.Log(float64(nw))
+			emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
+				float64(remains)*f2*math.Log(f2))
+		}
+	}
+
+	// Pre-allocate frequency tables per word size (4^ws counters each)
+	freqTables := make([][]int, levelMax+1)
+	for ws := 1; ws <= levelMax; ws++ {
+		freqTables[ws] = make([]int, 1<<(ws*2))
+	}
+
+	return &KmerEntropyFilter{
+		k:          k,
+		levelMax:   levelMax,
+		threshold:  threshold,
+		nLogN:      nLogN,
+		normTables: normTables,
+		emaxValues: emaxValues,
+		logNwords:  logNwords,
+		freqTables: freqTables,
+	}
+}
+
+// Accept reports whether the k-mer's entropy is strictly above the threshold.
+// Low-complexity k-mers (entropy <= threshold) are rejected.
+func (ef *KmerEntropyFilter) Accept(kmer uint64) bool {
+	return ef.Entropy(kmer) > ef.threshold
+}
+
+// Entropy returns the minimum normalized entropy of kmer over word sizes 1..levelMax, rounded to 4 decimals (1.0 if no size applies).
+func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
+	k := ef.k
+
+	// Decode k-mer to DNA sequence
+	var seqBuf [32]byte
+	seq := DecodeKmer(kmer, k, seqBuf[:])
+
+	minEntropy := math.MaxFloat64
+
+	for ws := 1; ws <= ef.levelMax; ws++ {
+		nwords := k - ws + 1
+		if nwords < 1 {
+			continue
+		}
+
+		emax := ef.emaxValues[ws]
+		if emax <= 0 { // guard the division below against a zero or negative normalizer
+			continue
+		}
+
+		// Count circular-canonical sub-word frequencies
+		tableSize := 1 << (ws * 2)
+		table := ef.freqTables[ws]
+		clear(table) // reset to zero: the table is shared across calls (hence not goroutine-safe)
+		mask := (1 << (ws * 2)) - 1
+		normTable := ef.normTables[ws]
+
+		wordIndex := 0
+		for i := 0; i < ws-1; i++ {
+			wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
+		}
+
+		for i, j := 0, ws-1; j < k; i, j = i+1, j+1 { // i is unused in the body; j is the last base of the word
+			wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
+			normWord := normTable[wordIndex]
+			table[normWord]++
+		}
+
+		// Compute Shannon entropy from the counts: ln(N) - sum(n*ln(n))/N
+		floatNwords := float64(nwords)
+		logNwords := ef.logNwords[ws]
+
+		var sumNLogN float64
+		for j := 0; j < tableSize; j++ {
+			n := table[j]
+			if n > 0 {
+				sumNLogN += ef.nLogN[n]
+			}
+		}
+
+		entropy := (logNwords - sumNLogN/floatNwords) / emax
+		if entropy < 0 {
+			entropy = 0
+		}
+
+		if entropy < minEntropy {
+			minEntropy = entropy
+		}
+	}
+
+	if minEntropy == math.MaxFloat64 {
+		return 1.0
+	}
+
+	return math.Round(minEntropy*10000) / 10000
+}
--- a/pkg/obikmer/frequency_filter.go
+++ b/pkg/obikmer/frequency_filter.go
@@ -1,310 +0,0 @@
-package obikmer
-
-import (
-	"fmt"
-
-	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
-)
-
-// FrequencyFilter filters k-mers by minimum frequency
-// Specialization of KmerSetGroup where index[i] contains k-mers seen at least i+1 times
-type FrequencyFilter struct {
-	*KmerSetGroup          // Group of KmerSet (one per frequency level)
-	MinFreq       int      // v - minimum required frequency
-}
-
-// NewFrequencyFilter creates a new frequency filter
-// minFreq: minimum number d'occurrences required (v)
-func NewFrequencyFilter(k, minFreq int) *FrequencyFilter {
-	ff := &FrequencyFilter{
-		KmerSetGroup: NewKmerSetGroup(k, minFreq),
-		MinFreq:      minFreq,
-	}
-
-	// Initialize group metadata
-	ff.SetAttribute("type", "FrequencyFilter")
-	ff.SetAttribute("min_freq", minFreq)
-
-	// Initialize metadata for each level
-	for i := 0; i < minFreq; i++ {
-		level := ff.Get(i)
-		level.SetAttribute("level", i)
-		level.SetAttribute("min_occurrences", i+1)
-		level.SetId(fmt.Sprintf("level_%d", i))
-	}
-
-	return ff
-}
-
-// AddSequence adds all k-mers from a sequence to the filter
-// Uses an iterator to avoid allocating an intermediate vector
-func (ff *FrequencyFilter) AddSequence(seq *obiseq.BioSequence) {
-	rawSeq := seq.Sequence()
-	for canonical := range IterCanonicalKmers(rawSeq, ff.K()) {
-		ff.AddKmerCode(canonical)
-	}
-}
-
-// AddKmerCode adds an encoded k-mer to the filter (main algorithm)
-func (ff *FrequencyFilter) AddKmerCode(kmer uint64) {
-	// Find the current level of the k-mer
-	c := 0
-	for c < ff.MinFreq && ff.Get(c).Contains(kmer) {
-		c++
-	}
-
-	// Add to next level (if not yet at maximum)
-	if c < ff.MinFreq {
-		ff.Get(c).AddKmerCode(kmer)
-	}
-}
-
-// AddCanonicalKmerCode adds an encoded canonical k-mer to the filter
-func (ff *FrequencyFilter) AddCanonicalKmerCode(kmer uint64) {
-	canonical := CanonicalKmer(kmer, ff.K())
-	ff.AddKmerCode(canonical)
-}
-
-// AddKmer adds a k-mer to the filter by encoding the sequence
-// The sequence must have exactly k nucleotides
-// Zero-allocation: encodes directly without creating an intermediate slice
-func (ff *FrequencyFilter) AddKmer(seq []byte) {
-	kmer := EncodeKmer(seq, ff.K())
-	ff.AddKmerCode(kmer)
-}
-
-// AddCanonicalKmer adds a canonical k-mer to the filter by encoding the sequence
-// The sequence must have exactly k nucleotides
-// Zero-allocation: encodes directly in canonical form without creating an intermediate slice
-func (ff *FrequencyFilter) AddCanonicalKmer(seq []byte) {
-	canonical := EncodeCanonicalKmer(seq, ff.K())
-	ff.AddKmerCode(canonical)
-}
-
-// GetFilteredSet returns a KmerSet of k-mers with frequency ≥ minFreq
-func (ff *FrequencyFilter) GetFilteredSet() *KmerSet {
-	// Filtered k-mers are in the last level
-	return ff.Get(ff.MinFreq - 1).Copy()
-}
-
-// GetKmersAtLevel returns a KmerSet of k-mers seen at least (level+1) times
-// level doit être dans [0, minFreq-1]
-func (ff *FrequencyFilter) GetKmersAtLevel(level int) *KmerSet {
-	ks := ff.Get(level)
-	if ks == nil {
-		return NewKmerSet(ff.K())
-	}
-	return ks.Copy()
-}
-
-// Stats returns statistics on frequency levels
-func (ff *FrequencyFilter) Stats() FrequencyFilterStats {
-	stats := FrequencyFilterStats{
-		MinFreq: ff.MinFreq,
-		Levels:  make([]LevelStats, ff.MinFreq),
-	}
-
-	for i := 0; i < ff.MinFreq; i++ {
-		ks := ff.Get(i)
-		card := ks.Len()
-		sizeBytes := ks.MemoryUsage()
-
-		stats.Levels[i] = LevelStats{
-			Level:       i + 1, // Level 1 = freq ≥ 1
-			Cardinality: card,
-			SizeBytes:   sizeBytes,
-		}
-
-		stats.TotalBytes += sizeBytes
-	}
-
-	// The last level contains the result
-	stats.FilteredKmers = stats.Levels[ff.MinFreq-1].Cardinality
-
-	return stats
-}
-
-// FrequencyFilterStats contains the filter statistics
-type FrequencyFilterStats struct {
-	MinFreq       int
-	FilteredKmers uint64      // K-mers with freq ≥ minFreq
-	TotalBytes    uint64      // Total memory used
-	Levels        []LevelStats
-}
-
-// LevelStats contains the stats of a level
-type LevelStats struct {
-	Level       int    // freq ≥ Level
-	Cardinality uint64 // Number of k-mers
-	SizeBytes   uint64 // Size in bytes
-}
-
-func (ffs FrequencyFilterStats) String() string {
-	result := fmt.Sprintf(`Frequency Filter Statistics (minFreq=%d):
-  Filtered k-mers (freq≥%d): %d
-  Total memory: %.2f MB
-
-Level breakdown:
-`, ffs.MinFreq, ffs.MinFreq, ffs.FilteredKmers, float64(ffs.TotalBytes)/1024/1024)
-
-	for _, level := range ffs.Levels {
-		result += fmt.Sprintf("  freq≥%d: %d k-mers (%.2f MB)\n",
-			level.Level,
-			level.Cardinality,
-			float64(level.SizeBytes)/1024/1024)
-	}
-
-	return result
-}
-
-// Clear libère la mémoire de tous les niveaux
-// (héritée de KmerSetGroup mais redéfinie pour clarté)
-func (ff *FrequencyFilter) Clear() {
-	ff.KmerSetGroup.Clear()
-}
-
-// ==================================
-// BATCH PROCESSING
-// ==================================
-
-// AddSequences adds multiple sequences in batch
-func (ff *FrequencyFilter) AddSequences(sequences *obiseq.BioSequenceSlice) {
-	for _, seq := range *sequences {
-		ff.AddSequence(seq)
-	}
-}
-
-// ==================================
-// PERSISTANCE
-// ==================================
-
-// Save sauvegarde le FrequencyFilter dans un répertoire
-// Utilise le format de sérialisation du KmerSetGroup sous-jacent
-// Les métadonnées incluent le type "FrequencyFilter" et min_freq
-//
-// Format:
-//   - directory/metadata.{toml,yaml,json} - métadonnées du filtre
-//   - directory/set_0.roaring - k-mers vus ≥1 fois
-//   - directory/set_1.roaring - k-mers vus ≥2 fois
-//   - ...
-//   - directory/set_{minFreq-1}.roaring - k-mers vus ≥minFreq fois
-//
-// Parameters:
-//   - directory: répertoire de destination
-//   - format: format des métadonnées (FormatTOML, FormatYAML, FormatJSON)
-//
-// Example:
-//
-//	err := ff.Save("./my_filter", obikmer.FormatTOML)
-func (ff *FrequencyFilter) Save(directory string, format MetadataFormat) error {
-	// Déléguer à KmerSetGroup qui gère déjà tout
-	return ff.KmerSetGroup.Save(directory, format)
-}
-
-// LoadFrequencyFilter charge un FrequencyFilter depuis un répertoire
-// Vérifie que les métadonnées correspondent à un FrequencyFilter
-//
-// Parameters:
-//   - directory: répertoire source
-//
-// Returns:
-//   - *FrequencyFilter: le filtre chargé
-//   - error: erreur si le chargement échoue ou si ce n'est pas un FrequencyFilter
-//
-// Example:
-//
-//	ff, err := obikmer.LoadFrequencyFilter("./my_filter")
-func LoadFrequencyFilter(directory string) (*FrequencyFilter, error) {
-	// Charger le KmerSetGroup
-	ksg, err := LoadKmerSetGroup(directory)
-	if err != nil {
-		return nil, err
-	}
-
-	// Vérifier que c'est bien un FrequencyFilter
-	if typeAttr, ok := ksg.GetAttribute("type"); !ok || typeAttr != "FrequencyFilter" {
-		return nil, fmt.Errorf("loaded data is not a FrequencyFilter (type=%v)", typeAttr)
-	}
-
-	// Récupérer min_freq
-	minFreqAttr, ok := ksg.GetIntAttribute("min_freq")
-	if !ok {
-		return nil, fmt.Errorf("FrequencyFilter missing min_freq attribute")
-	}
-
-	// Créer le FrequencyFilter
-	ff := &FrequencyFilter{
-		KmerSetGroup: ksg,
-		MinFreq:      minFreqAttr,
-	}
-
-	return ff, nil
-}
-
-// ==================================
-// UTILITAIRES
-// ==================================
-
-// Contains vérifie si un k-mer a atteint la fréquence minimale
-func (ff *FrequencyFilter) Contains(kmer uint64) bool {
-	canonical := CanonicalKmer(kmer, ff.K())
-	return ff.Get(ff.MinFreq - 1).Contains(canonical)
-}
-
-// GetFrequency returns the approximate frequency of a k-mer
-// Retourne le niveau maximum atteint (freq ≥ niveau)
-func (ff *FrequencyFilter) GetFrequency(kmer uint64) int {
-	canonical := CanonicalKmer(kmer, ff.K())
-
-	freq := 0
-	for i := 0; i < ff.MinFreq; i++ {
-		if ff.Get(i).Contains(canonical) {
-			freq = i + 1
-		} else {
-			break
-		}
-	}
-
-	return freq
-}
-
-// Len returns the number of filtered k-mers or at a specific level
-// Without argument: returns the number of k-mers with freq ≥ minFreq (last level)
-// With argument level: returns the number of k-mers with freq ≥ (level+1)
-// Exemple: Len() pour les k-mers filtrés, Len(2) pour freq ≥ 3
-// (héritée de KmerSetGroup mais redéfinie pour la documentation)
-func (ff *FrequencyFilter) Len(level ...int) uint64 {
-	return ff.KmerSetGroup.Len(level...)
-}
-
-// MemoryUsage returns memory usage in bytes
-// (héritée de KmerSetGroup mais redéfinie pour clarté)
-func (ff *FrequencyFilter) MemoryUsage() uint64 {
-	return ff.KmerSetGroup.MemoryUsage()
-}
-
-// ==================================
-// COMPARAISON AVEC D'AUTRES APPROCHES
-// ==================================
-
-// CompareWithSimpleMap compare la mémoire avec une simple map
-func (ff *FrequencyFilter) CompareWithSimpleMap() string {
-	totalKmers := ff.Get(0).Len()
-
-	simpleMapBytes := totalKmers * 24 // ~24 bytes par entrée
-	roaringBytes := ff.MemoryUsage()
-
-	reduction := float64(simpleMapBytes) / float64(roaringBytes)
-
-	return fmt.Sprintf(`Memory Comparison for %d k-mers:
-  Simple map[uint64]uint32: %.2f MB
-  Roaring filter (v=%d):    %.2f MB
-  Reduction:                %.1fx
-`,
-		totalKmers,
-		float64(simpleMapBytes)/1024/1024,
-		ff.MinFreq,
-		float64(roaringBytes)/1024/1024,
-		reduction,
-	)
-}
--- a/pkg/obikmer/kdi_merge.go
+++ b/pkg/obikmer/kdi_merge.go
@@ -0,0 +1,86 @@
+package obikmer
+
+import "container/heap"
+
+// mergeItem is one pending value in the min-heap used by the k-way merge.
+type mergeItem struct {
+	value uint64 // k-mer value most recently read from the reader
+	idx   int    // index of the reader that produced this value
+}
+
+// mergeHeap is a min-heap of mergeItems ordered by value; it implements heap.Interface.
+type mergeHeap []mergeItem
+
+func (h mergeHeap) Len() int            { return len(h) }
+func (h mergeHeap) Less(i, j int) bool  { return h[i].value < h[j].value }
+func (h mergeHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
+func (h *mergeHeap) Push(x interface{}) { *h = append(*h, x.(mergeItem)) }
+func (h *mergeHeap) Pop() interface{} {
+	old := *h
+	n := len(old)
+	x := old[n-1] // container/heap expects Pop to remove and return element Len()-1
+	*h = old[:n-1]
+	return x
+}
+
+// KWayMerge performs a k-way merge of multiple sorted KdiReader streams.
+// For each distinct k-mer value, it reports the value and how many times it
+// was read across the input streams (count).
+type KWayMerge struct {
+	h       mergeHeap    // min-heap holding at most one pending value per reader
+	readers []*KdiReader // input streams; heap items refer to them by index
+}
+
+// NewKWayMerge creates a k-way merge from multiple KdiReaders.
+// Each reader must yield ascending values; one value is read from each reader immediately.
+func NewKWayMerge(readers []*KdiReader) *KWayMerge {
+	m := &KWayMerge{
+		h:       make(mergeHeap, 0, len(readers)),
+		readers: readers,
+	}
+
+	// Initialize heap with first value from each reader; readers yielding nothing are left out
+	for i, r := range readers {
+		if v, ok := r.Next(); ok {
+			m.h = append(m.h, mergeItem{value: v, idx: i})
+		}
+	}
+	heap.Init(&m.h)
+
+	return m
+}
+
+// Next returns the next smallest k-mer value, the number of heap entries that
+// carried this value (count; one per reader if no stream repeats a value), and true.
+// Returns (0, 0, false) when all streams are exhausted.
+func (m *KWayMerge) Next() (kmer uint64, count int, ok bool) {
+	if len(m.h) == 0 {
+		return 0, 0, false
+	}
+
+	minVal := m.h[0].value
+	count = 0
+
+	// Pop all items with the same value
+	for len(m.h) > 0 && m.h[0].value == minVal {
+		item := heap.Pop(&m.h).(mergeItem)
+		count++
+		// Advance that reader; this inner ok shadows the named result
+		if v, ok := m.readers[item.idx].Next(); ok {
+			heap.Push(&m.h, mergeItem{value: v, idx: item.idx})
+		}
+	}
+
+	return minVal, count, true
+}
+
+// Close closes every underlying reader and returns the first error encountered, if any.
+func (m *KWayMerge) Close() error {
+	var firstErr error
+	for _, r := range m.readers {
+		if err := r.Close(); err != nil && firstErr == nil {
+			firstErr = err
+		}
+	}
+	return firstErr
+}
--- a/pkg/obikmer/kdi_merge_test.go
+++ b/pkg/obikmer/kdi_merge_test.go
@@ -0,0 +1,159 @@
+package obikmer
+
+import (
+	"path/filepath"
+	"testing"
+)
+
+// writeKdi is a test helper: it writes kmers (in the given order) to dir/name as a .kdi file, fails the test on any error, and returns the path.
+func writeKdi(t *testing.T, dir, name string, kmers []uint64) string {
+	t.Helper()
+	path := filepath.Join(dir, name)
+	w, err := NewKdiWriter(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, v := range kmers {
+		if err := w.Write(v); err != nil {
+			t.Fatal(err)
+		}
+	}
+	if err := w.Close(); err != nil {
+		t.Fatal(err)
+	}
+	return path
+}
+
+func TestKWayMergeBasic(t *testing.T) {
+	dir := t.TempDir()
+
+	// Three sorted streams; the values 3 and 7 appear in all three
+	p1 := writeKdi(t, dir, "a.kdi", []uint64{1, 3, 5, 7})
+	p2 := writeKdi(t, dir, "b.kdi", []uint64{2, 3, 6, 7})
+	p3 := writeKdi(t, dir, "c.kdi", []uint64{3, 4, 7, 8})
+
+	r1, _ := NewKdiReader(p1) // NOTE(review): open errors are ignored; a failure would panic on the nil reader below
+	r2, _ := NewKdiReader(p2)
+	r3, _ := NewKdiReader(p3)
+
+	m := NewKWayMerge([]*KdiReader{r1, r2, r3})
+	defer m.Close()
+
+	type result struct {
+		kmer  uint64
+		count int
+	}
+	var results []result
+	for {
+		kmer, count, ok := m.Next()
+		if !ok {
+			break
+		}
+		results = append(results, result{kmer, count})
+	}
+
+	expected := []result{
+		{1, 1}, {2, 1}, {3, 3}, {4, 1}, {5, 1}, {6, 1}, {7, 3}, {8, 1},
+	}
+	if len(results) != len(expected) {
+		t.Fatalf("got %d results, want %d", len(results), len(expected))
+	}
+	for i, exp := range expected {
+		if results[i] != exp {
+			t.Errorf("result %d: got %+v, want %+v", i, results[i], exp)
+		}
+	}
+}
+
+func TestKWayMergeSingleStream(t *testing.T) {
+	dir := t.TempDir()
+	p := writeKdi(t, dir, "a.kdi", []uint64{10, 20, 30})
+
+	r, _ := NewKdiReader(p) // NOTE(review): open error is ignored; a failure would panic on the nil reader below
+	m := NewKWayMerge([]*KdiReader{r})
+	defer m.Close()
+
+	vals := []uint64{10, 20, 30}
+	for _, expected := range vals {
+		kmer, count, ok := m.Next()
+		if !ok {
+			t.Fatal("unexpected EOF")
+		}
+		if kmer != expected || count != 1 { // a single stream must pass through unchanged with count 1
+			t.Fatalf("got (%d, %d), want (%d, 1)", kmer, count, expected)
+		}
+	}
+	_, _, ok := m.Next()
+	if ok {
+		t.Fatal("expected EOF")
+	}
+}
+
+func TestKWayMergeEmpty(t *testing.T) {
+	dir := t.TempDir()
+
+	p1 := writeKdi(t, dir, "a.kdi", nil) // nil slice: a file holding zero k-mers
+	p2 := writeKdi(t, dir, "b.kdi", nil)
+
+	r1, _ := NewKdiReader(p1) // NOTE(review): open errors are ignored; a failure would panic on the nil reader below
+	r2, _ := NewKdiReader(p2)
+
+	m := NewKWayMerge([]*KdiReader{r1, r2})
+	defer m.Close()
+
+	_, _, ok := m.Next()
+	if ok {
+		t.Fatal("expected no results from empty streams")
+	}
+}
+
+func TestKWayMergeDisjoint(t *testing.T) {
+	dir := t.TempDir()
+
+	p1 := writeKdi(t, dir, "a.kdi", []uint64{1, 2, 3}) // the two streams share no value
+	p2 := writeKdi(t, dir, "b.kdi", []uint64{10, 20, 30})
+
+	r1, _ := NewKdiReader(p1) // NOTE(review): open errors are ignored; a failure would panic on the nil reader below
+	r2, _ := NewKdiReader(p2)
+
+	m := NewKWayMerge([]*KdiReader{r1, r2})
+	defer m.Close()
+
+	expected := []uint64{1, 2, 3, 10, 20, 30}
+	for _, exp := range expected {
+		kmer, count, ok := m.Next()
+		if !ok {
+			t.Fatal("unexpected EOF")
+		}
+		if kmer != exp || count != 1 {
+			t.Fatalf("got (%d, %d), want (%d, 1)", kmer, count, exp)
+		}
+	}
+}
+
+func TestKWayMergeAllSame(t *testing.T) {
+	dir := t.TempDir()
+
+	p1 := writeKdi(t, dir, "a.kdi", []uint64{42}) // every stream holds the same single value
+	p2 := writeKdi(t, dir, "b.kdi", []uint64{42})
+	p3 := writeKdi(t, dir, "c.kdi", []uint64{42})
+
+	r1, _ := NewKdiReader(p1) // NOTE(review): open errors are ignored; a failure would panic on the nil reader below
+	r2, _ := NewKdiReader(p2)
+	r3, _ := NewKdiReader(p3)
+
+	m := NewKWayMerge([]*KdiReader{r1, r2, r3})
+	defer m.Close()
+
+	kmer, count, ok := m.Next()
+	if !ok {
+		t.Fatal("expected one result")
+	}
+	if kmer != 42 || count != 3 { // the value must be reported once, with count 3
+		t.Fatalf("got (%d, %d), want (42, 3)", kmer, count)
+	}
+	_, _, ok = m.Next()
+	if ok {
+		t.Fatal("expected EOF")
+	}
+}
--- a/pkg/obikmer/kdi_reader.go
+++ b/pkg/obikmer/kdi_reader.go
@@ -0,0 +1,170 @@
+package obikmer
+
+import (
+	"bufio"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"os"
+)
+
+// KdiReader reads k-mers from a .kdi file using streaming delta-varint decoding.
+type KdiReader struct {
+	r       *bufio.Reader
+	file    *os.File
+	count   uint64    // total number of k-mers, as read from the file header
+	read    uint64    // number of k-mers already consumed (or skipped by SeekTo)
+	prev    uint64    // last decoded value; each delta is added to it
+	started bool      // whether first value has been read
+	index   *KdxIndex // optional sparse index for seeking; nil when absent
+}
+
+// NewKdiReader opens a .kdi file for streaming reading (no index, so SeekTo is a no-op).
+func NewKdiReader(path string) (*KdiReader, error) {
+	return openKdiReader(path, nil)
+}
+
+// NewKdiIndexedReader opens a .kdi file with its companion .kdx index
+// loaded for fast seeking. If the index cannot be loaded (an error, or a nil
+// index is returned), it silently falls back to sequential reading.
+func NewKdiIndexedReader(path string) (*KdiReader, error) {
+	kdxPath := KdxPathForKdi(path)
+	idx, err := LoadKdxIndex(kdxPath)
+	if err != nil {
+		// Index load failed — the error is dropped and we fall back to non-indexed
+		return openKdiReader(path, nil)
+	}
+	// idx may be nil if file does not exist — that's fine
+	return openKdiReader(path, idx)
+}
+
+func openKdiReader(path string, idx *KdxIndex) (*KdiReader, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	r := bufio.NewReaderSize(f, 65536) // 64 KiB read buffer
+
+	// Read and verify the 4-byte magic; the file is closed on every error path
+	var magic [4]byte
+	if _, err := io.ReadFull(r, magic[:]); err != nil {
+		f.Close()
+		return nil, fmt.Errorf("kdi: read magic: %w", err)
+	}
+	if magic != kdiMagic {
+		f.Close()
+		return nil, fmt.Errorf("kdi: bad magic %v", magic)
+	}
+
+	// Read the k-mer count: 8 bytes, little-endian
+	var countBuf [8]byte
+	if _, err := io.ReadFull(r, countBuf[:]); err != nil {
+		f.Close()
+		return nil, fmt.Errorf("kdi: read count: %w", err)
+	}
+	count := binary.LittleEndian.Uint64(countBuf[:])
+
+	return &KdiReader{
+		r:     r,
+		file:  f,
+		count: count,
+		index: idx,
+	}, nil
+}
+
+// Next returns the next k-mer and true, or (0, false) when exhausted or on a read/decode error.
+func (kr *KdiReader) Next() (uint64, bool) {
+	if kr.read >= kr.count {
+		return 0, false
+	}
+
+	if !kr.started {
+		// Read first value as absolute uint64 LE
+		var buf [8]byte
+		if _, err := io.ReadFull(kr.r, buf[:]); err != nil {
+			return 0, false
+		}
+		kr.prev = binary.LittleEndian.Uint64(buf[:])
+		kr.started = true
+		kr.read++
+		return kr.prev, true
+	}
+
+	// Read delta varint (the error itself is dropped; callers only see false)
+	delta, err := DecodeVarint(kr.r)
+	if err != nil {
+		return 0, false
+	}
+	kr.prev += delta // every value after the first is stored as a delta from its predecessor
+	kr.read++
+	return kr.prev, true
+}
+
+// SeekTo positions the reader near the target k-mer using the sparse .kdx index.
+// After SeekTo, the reader is positioned so that the next call to Next()
+// returns the k-mer immediately after the indexed entry at or before target.
+//
+// If the reader has no index, the target is at or before the last value read,
+// or the index offers no entry ahead of the current position, SeekTo does nothing.
+func (kr *KdiReader) SeekTo(target uint64) error {
+	if kr.index == nil {
+		return nil
+	}
+
+	// If we've already reached or passed the target, do nothing (no backward seeks)
+	if kr.started && kr.prev >= target {
+		return nil
+	}
+
+	offset, skipCount, ok := kr.index.FindOffset(target)
+	if !ok {
+		return nil
+	}
+
+	// skipCount is the number of k-mers consumed at the indexed position.
+	// The index was recorded AFTER writing the k-mer at position skipCount-1
+	// (since count%stride==0 after incrementing count). So the actual number
+	// of k-mers consumed is skipCount (the entry's kmer is the last one
+	// before the offset).
+
+	// Only seek if the indexed position is ahead of what has already been read
+	if kr.started && skipCount <= kr.read {
+		return nil
+	}
+
+	// The index entry stores (kmer_value, byte_offset_after_that_kmer).
+	// skipCount = (entryIdx+1)*stride, so entryIdx = skipCount/stride - 1
+	// We seek to that offset, set prev = indexedKmer, and the next Next()
+	// call will read the delta-varint of the following k-mer.
+	entryIdx := int(skipCount)/kr.index.stride - 1
+	if entryIdx < 0 || entryIdx >= len(kr.index.entries) {
+		return nil
+	}
+	indexedKmer := kr.index.entries[entryIdx].kmer
+
+	if _, err := kr.file.Seek(int64(offset), io.SeekStart); err != nil {
+		return fmt.Errorf("kdi: seek: %w", err)
+	}
+	kr.r.Reset(kr.file) // drop bytes buffered from the old file position
+
+	kr.prev = indexedKmer
+	kr.started = true
+	kr.read = skipCount
+
+	return nil
+}
+
+// Count returns the total number of k-mers declared in the file header.
+func (kr *KdiReader) Count() uint64 {
+	return kr.count
+}
+
+// Remaining returns how many k-mers are left: the total count minus those read or skipped.
+func (kr *KdiReader) Remaining() uint64 {
+	return kr.count - kr.read
+}
+
+// Close closes the underlying file and returns the error from that close, if any.
+func (kr *KdiReader) Close() error {
+	return kr.file.Close()
+}
--- a/pkg/obikmer/kdi_test.go
+++ b/pkg/obikmer/kdi_test.go
@@ -0,0 +1,255 @@
+package obikmer
+
+import (
+	"os"
+	"path/filepath"
+	"sort"
+	"testing"
+)
+
+// TestKdiRoundTrip writes a short, strictly increasing list of k-mers with
+// small, medium and very large gaps, then checks that reading the file
+// back yields the same count and the same values in order, followed by EOF.
+func TestKdiRoundTrip(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "test.kdi")
+
+	// Sorted k-mer values
+	want := []uint64{10, 20, 30, 100, 200, 500, 10000, 1 << 40, 1<<62 - 1}
+
+	writer, err := NewKdiWriter(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for idx := range want {
+		if err := writer.Write(want[idx]); err != nil {
+			t.Fatal(err)
+		}
+	}
+	if n := writer.Count(); n != uint64(len(want)) {
+		t.Fatalf("writer count: got %d, want %d", n, len(want))
+	}
+	if err := writer.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	// Read back
+	reader, err := NewKdiReader(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer reader.Close()
+
+	if n := reader.Count(); n != uint64(len(want)) {
+		t.Fatalf("reader count: got %d, want %d", n, len(want))
+	}
+
+	for idx, expected := range want {
+		got, more := reader.Next()
+		if !more {
+			t.Fatalf("unexpected EOF at index %d", idx)
+		}
+		if got != expected {
+			t.Fatalf("kmer %d: got %d, want %d", idx, got, expected)
+		}
+	}
+
+	// One extra read must report exhaustion.
+	if _, more := reader.Next(); more {
+		t.Fatal("expected EOF after all k-mers")
+	}
+}
+
+// TestKdiEmpty checks that a file closed without a single Write reads back
+// with a count of zero and yields no k-mer.
+func TestKdiEmpty(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "empty.kdi")
+
+	writer, err := NewKdiWriter(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Close straight away: nothing is written besides the header.
+	if err = writer.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	reader, err := NewKdiReader(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer reader.Close()
+
+	if n := reader.Count(); n != 0 {
+		t.Fatalf("expected count 0, got %d", n)
+	}
+
+	if _, more := reader.Next(); more {
+		t.Fatal("expected no k-mers in empty file")
+	}
+}
+
+// TestKdiSingleValue checks the one-element case: only the absolute first
+// value is stored, and it must come back unchanged with a count of one.
+func TestKdiSingleValue(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "single.kdi")
+
+	writer, err := NewKdiWriter(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err = writer.Write(42); err != nil {
+		t.Fatal(err)
+	}
+	if err = writer.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	reader, err := NewKdiReader(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer reader.Close()
+
+	if n := reader.Count(); n != 1 {
+		t.Fatalf("expected count 1, got %d", n)
+	}
+
+	got, more := reader.Next()
+	switch {
+	case !more:
+		t.Fatal("expected one k-mer")
+	case got != 42:
+		t.Fatalf("got %d, want 42", got)
+	}
+}
+
+// TestKdiFileSize pins the on-disk layout for a file holding one k-mer:
+// 4 bytes of magic, 8 bytes of count and 8 bytes for the first value.
+func TestKdiFileSize(t *testing.T) {
+	// magic(4) + count(8) + first(8) = 20 bytes
+	const wantSize int64 = 4 + 8 + 8
+
+	path := filepath.Join(t.TempDir(), "size.kdi")
+
+	writer, err := NewKdiWriter(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err = writer.Write(0); err != nil {
+		t.Fatal(err)
+	}
+	if err = writer.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	info, err := os.Stat(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got := info.Size(); got != wantSize {
+		t.Fatalf("file size: got %d, want 20", got)
+	}
+}
+
+// TestKdiDeltaCompression writes 10000 even numbers, so every delta is 2
+// and fits in a one-byte varint. It checks the resulting file size and
+// then that all values read back unchanged.
+func TestKdiDeltaCompression(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "delta.kdi")
+
+	// Dense consecutive values should compress well
+	const total = 10000
+	values := make([]uint64, 0, total)
+	for i := 0; i < total; i++ {
+		values = append(values, uint64(i*2)) // even numbers
+	}
+
+	writer, err := NewKdiWriter(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for idx := range values {
+		if err := writer.Write(values[idx]); err != nil {
+			t.Fatal(err)
+		}
+	}
+	if err := writer.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	// Each delta is 2, encoded as 1 byte varint
+	// Total: magic(4) + count(8) + first(8) + (n-1)*1 = 20 + 9999 bytes
+	info, err := os.Stat(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	wantSize := int64(20 + total - 1)
+	if got := info.Size(); got != wantSize {
+		t.Fatalf("file size: got %d, want %d", got, wantSize)
+	}
+
+	// Verify round-trip
+	reader, err := NewKdiReader(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer reader.Close()
+
+	for idx, want := range values {
+		got, more := reader.Next()
+		if !more {
+			t.Fatalf("unexpected EOF at index %d", idx)
+		}
+		if got != want {
+			t.Fatalf("kmer %d: got %d, want %d", idx, got, want)
+		}
+	}
+}
+
+// TestKdiFromRealKmers feeds the writer with canonical 15-mers taken from
+// a short repetitive DNA sequence (sorted and de-duplicated first) and
+// checks that the reader reports the same count and the same values.
+func TestKdiFromRealKmers(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "real.kdi")
+
+	// Extract k-mers from a sequence, sort, dedup, write to KDI
+	const k = 15
+	dna := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
+
+	var sorted []uint64
+	for code := range IterCanonicalKmers(dna, k) {
+		sorted = append(sorted, code)
+	}
+	sort.Slice(sorted, func(a, b int) bool { return sorted[a] < sorted[b] })
+
+	// In-place dedup: the write cursor never runs ahead of the read
+	// cursor, so comparing against sorted[idx-1] stays valid.
+	unique := sorted[:0]
+	for idx, v := range sorted {
+		if idx > 0 && v == sorted[idx-1] {
+			continue
+		}
+		unique = append(unique, v)
+	}
+
+	writer, err := NewKdiWriter(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for idx := range unique {
+		if err := writer.Write(unique[idx]); err != nil {
+			t.Fatal(err)
+		}
+	}
+	if err := writer.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	// Read back and verify
+	reader, err := NewKdiReader(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer reader.Close()
+
+	if n := reader.Count(); n != uint64(len(unique)) {
+		t.Fatalf("count: got %d, want %d", n, len(unique))
+	}
+
+	for idx, want := range unique {
+		got, more := reader.Next()
+		if !more {
+			t.Fatalf("unexpected EOF at index %d", idx)
+		}
+		if got != want {
+			t.Fatalf("kmer %d: got %d, want %d", idx, got, want)
+		}
+	}
+}
--- a/pkg/obikmer/kdi_writer.go
+++ b/pkg/obikmer/kdi_writer.go
@@ -0,0 +1,151 @@
+package obikmer
+
+import (
+	"bufio"
+	"encoding/binary"
+	"os"
+)
+
+// kdiMagic is the 4-byte signature written at the very start of a .kdi
+// file: the ASCII letters "KDI" followed by the byte 0x01.
+var kdiMagic = [4]byte{'K', 'D', 'I', 0x01}
+
+// kdiHeaderSize is the size of the KDI header: magic(4) + count(8) = 12 bytes.
+// The writer adds it to the number of data bytes written so far to obtain
+// the absolute file offsets stored in the sparse index.
+const kdiHeaderSize = 12
+
+// KdiWriter writes a sorted sequence of uint64 k-mers to a .kdi file
+// using delta-varint encoding.
+//
+// Format:
+//
+//	[magic: 4 bytes "KDI\x01"]
+//	[count: uint64 LE]        number of k-mers
+//	[first: uint64 LE]        first k-mer (absolute value)
+//	[delta_1: varint]          arr[1] - arr[0]
+//	[delta_2: varint]          arr[2] - arr[1]
+//	...
+//
+// The caller must write k-mers in strictly increasing order; Write does
+// not check this.
+//
+// On Close(), a companion .kdx sparse index file is written alongside
+// the .kdi file for fast random access, provided at least one index
+// entry was collected.
+type KdiWriter struct {
+	w            *bufio.Writer // buffered writer wrapping file
+	file         *os.File      // underlying .kdi file; the count is patched into it on Close
+	count        uint64        // number of k-mers written so far
+	prev         uint64        // last k-mer written, the base for the next delta
+	first        bool          // true until the first k-mer has been written
+	path         string        // path of the .kdi file, used to derive the .kdx path
+	bytesWritten uint64        // bytes written after header (data section offset)
+	indexEntries []kdxEntry    // sparse index entries collected during writes
+}
+
+// NewKdiWriter creates (or truncates) the file at path and returns a
+// KdiWriter ready to receive k-mers. The header is emitted right away:
+// the magic bytes followed by an all-zero 8-byte slot that Close() later
+// overwrites with the real k-mer count.
+func NewKdiWriter(path string) (*KdiWriter, error) {
+	file, err := os.Create(path)
+	if err != nil {
+		return nil, err
+	}
+
+	kw := &KdiWriter{
+		w:            bufio.NewWriterSize(file, 65536),
+		file:         file,
+		first:        true,
+		path:         path,
+		indexEntries: make([]kdxEntry, 0, 256),
+	}
+
+	// Header pieces in file order: magic, then the count placeholder.
+	var placeholder [8]byte
+	for _, piece := range [][]byte{kdiMagic[:], placeholder[:]} {
+		if _, err := kw.w.Write(piece); err != nil {
+			file.Close()
+			return nil, err
+		}
+	}
+
+	return kw, nil
+}
+
+// Write appends one k-mer to the file. The first k-mer is stored as an
+// absolute little-endian uint64; every later one is stored as the varint
+// of its difference with the previous k-mer. K-mers must be supplied in
+// strictly increasing order (this is not verified here).
+//
+// If the underlying write fails, the error is returned and the writer's
+// counters are left untouched.
+func (kw *KdiWriter) Write(kmer uint64) error {
+	var emitted uint64
+	if !kw.first {
+		n, err := EncodeVarint(kw.w, kmer-kw.prev)
+		if err != nil {
+			return err
+		}
+		emitted = uint64(n)
+	} else {
+		var abs [8]byte
+		binary.LittleEndian.PutUint64(abs[:], kmer)
+		if _, err := kw.w.Write(abs[:]); err != nil {
+			return err
+		}
+		emitted = 8
+		kw.first = false
+	}
+
+	kw.bytesWritten += emitted
+	kw.prev = kmer
+	kw.count++
+
+	// Only every defaultKdxStride-th k-mer gets a sparse index entry.
+	if kw.count%defaultKdxStride != 0 {
+		return nil
+	}
+
+	// The offset is taken AFTER this k-mer was written, so it designates
+	// the start of the next k-mer's delta. A reader seeking there must use
+	// this k-mer as the base value for the delta it reads next.
+	kw.indexEntries = append(kw.indexEntries, kdxEntry{
+		kmer:   kmer,
+		offset: kdiHeaderSize + kw.bytesWritten,
+	})
+
+	return nil
+}
+
+// Count returns the number of k-mers written so far, i.e. the number of
+// Write calls that completed without error.
+func (kw *KdiWriter) Count() uint64 {
+	return kw.count
+}
+
+// Close flushes buffered data, patches the count in the header, closes
+// the file, and then brings the companion .kdx index in line with what
+// was written:
+//
+//   - if index entries were collected, the .kdx file is (re)written;
+//   - otherwise any .kdx file already present at the companion path is
+//     removed, because it would describe an older .kdi and its offsets
+//     would not match the data just written.
+//
+// The first error encountered is returned. The .kdi file is closed in all
+// cases.
+func (kw *KdiWriter) Close() error {
+	if err := kw.w.Flush(); err != nil {
+		kw.file.Close()
+		return err
+	}
+
+	// Patch the real count over the placeholder at offset 4 (after magic).
+	var countBuf [8]byte
+	binary.LittleEndian.PutUint64(countBuf[:], kw.count)
+	if _, err := kw.file.WriteAt(countBuf[:], 4); err != nil {
+		kw.file.Close()
+		return err
+	}
+
+	if err := kw.file.Close(); err != nil {
+		return err
+	}
+
+	kdxPath := KdxPathForKdi(kw.path)
+	if len(kw.indexEntries) == 0 {
+		// Nothing to index. A missing .kdx is the expected state here, so
+		// "does not exist" is not an error.
+		if err := os.Remove(kdxPath); err != nil && !os.IsNotExist(err) {
+			return err
+		}
+		return nil
+	}
+
+	return WriteKdxIndex(kdxPath, defaultKdxStride, kw.indexEntries)
+}
--- a/pkg/obikmer/kdx.go
+++ b/pkg/obikmer/kdx.go
@@ -0,0 +1,170 @@
+package obikmer
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+	"os"
+	"sort"
+	"strings"
+)
+
+// kdxMagic is the 4-byte signature written at the very start of a .kdx
+// file: the ASCII letters "KDX" followed by the byte 0x01.
+var kdxMagic = [4]byte{'K', 'D', 'X', 0x01}
+
+// defaultKdxStride is the number of k-mers between consecutive index entries.
+const defaultKdxStride = 4096
+
+// kdxEntry is a single entry in the sparse index: the absolute value of an
+// indexed k-mer and the byte offset in the corresponding .kdi file just
+// AFTER that k-mer, i.e. where the delta of the following k-mer begins.
+type kdxEntry struct {
+	kmer   uint64 // absolute value of the indexed k-mer
+	offset uint64 // absolute byte offset in .kdi file
+}
+
+// KdxIndex is a sparse, in-memory index for a .kdi file.
+// It stores one entry every `stride` k-mers, enabling an O(log(N/stride))
+// binary search over the entries followed by a linear scan of the .kdi
+// stream from the selected entry.
+type KdxIndex struct {
+	stride  int        // number of k-mers between two consecutive entries
+	entries []kdxEntry // index entries, searched by k-mer value in FindOffset
+}
+
+// LoadKdxIndex reads a .kdx file into memory.
+// Returns (nil, nil) if the file does not exist (graceful degradation).
+//
+// The header is validated before anything is allocated:
+//   - the magic bytes must match;
+//   - the stride must be positive (it is later used as a divisor);
+//   - the declared number of entries must fit in the bytes actually
+//     present in the file, so a corrupt count cannot trigger a huge
+//     allocation.
+//
+// All entries are then read with a single call rather than one read per
+// 16-byte entry.
+func LoadKdxIndex(path string) (*KdxIndex, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, nil
+		}
+		return nil, err
+	}
+	defer f.Close()
+
+	// On-disk sizes: magic(4) + stride(4) + count(4), then 16 bytes per entry.
+	const (
+		headerSize = 12
+		entrySize  = 16
+	)
+
+	// Read magic
+	var magic [4]byte
+	if _, err := io.ReadFull(f, magic[:]); err != nil {
+		return nil, fmt.Errorf("kdx: read magic: %w", err)
+	}
+	if magic != kdxMagic {
+		return nil, fmt.Errorf("kdx: bad magic %v", magic)
+	}
+
+	// Read stride (uint32 LE)
+	var buf4 [4]byte
+	if _, err := io.ReadFull(f, buf4[:]); err != nil {
+		return nil, fmt.Errorf("kdx: read stride: %w", err)
+	}
+	stride := int(binary.LittleEndian.Uint32(buf4[:]))
+	if stride <= 0 {
+		return nil, fmt.Errorf("kdx: invalid stride %d", stride)
+	}
+
+	// Read count (uint32 LE)
+	if _, err := io.ReadFull(f, buf4[:]); err != nil {
+		return nil, fmt.Errorf("kdx: read count: %w", err)
+	}
+	count := binary.LittleEndian.Uint32(buf4[:])
+
+	// Check the declared count against the real file size before allocating.
+	info, err := f.Stat()
+	if err != nil {
+		return nil, fmt.Errorf("kdx: stat: %w", err)
+	}
+	payload := uint64(count) * entrySize
+	if avail := info.Size() - headerSize; avail < 0 || uint64(avail) < payload {
+		return nil, fmt.Errorf("kdx: truncated index: %d entries declared, %d bytes available", count, avail)
+	}
+
+	// Read entries in one go, then decode them from memory.
+	raw := make([]byte, payload)
+	if _, err := io.ReadFull(f, raw); err != nil {
+		return nil, fmt.Errorf("kdx: read entries: %w", err)
+	}
+	entries := make([]kdxEntry, count)
+	for i := range entries {
+		rec := raw[i*entrySize:]
+		entries[i] = kdxEntry{
+			kmer:   binary.LittleEndian.Uint64(rec[0:8]),
+			offset: binary.LittleEndian.Uint64(rec[8:16]),
+		}
+	}
+
+	return &KdxIndex{
+		stride:  stride,
+		entries: entries,
+	}, nil
+}
+
+// FindOffset picks the index entry from which a scan for target should
+// start: the last entry whose k-mer is less than or equal to target.
+//
+// Results:
+//   - offset: byte offset in the .kdi file stored in that entry (just
+//     past the indexed k-mer, where the next delta begins)
+//   - skipCount: number of k-mers already consumed at that offset; entry i
+//     corresponds to count (i+1)*stride
+//   - ok: false when the receiver is nil, the index is empty, or target is
+//     smaller than the first indexed k-mer; the caller then has no jump
+//     point and should scan from the beginning
+func (idx *KdxIndex) FindOffset(target uint64) (offset uint64, skipCount uint64, ok bool) {
+	if idx == nil {
+		return 0, 0, false
+	}
+
+	// upper is the position of the first entry strictly greater than
+	// target (len(entries) if there is none).
+	upper := sort.Search(len(idx.entries), func(j int) bool {
+		return target < idx.entries[j].kmer
+	})
+	if upper == 0 {
+		// Empty index, or target lies before the first index entry.
+		return 0, 0, false
+	}
+
+	// Entry upper-1 is the largest one with kmer <= target; it was recorded
+	// after writing the k-mer at count = upper*stride.
+	chosen := idx.entries[upper-1]
+	return chosen.offset, uint64(upper) * uint64(idx.stride), true
+}
+
+// Stride returns the stride of this index, i.e. the number of k-mers
+// between two consecutive entries. The receiver must not be nil.
+func (idx *KdxIndex) Stride() int {
+	return idx.stride
+}
+
+// Len returns the number of entries in this index. The receiver must not
+// be nil.
+func (idx *KdxIndex) Len() int {
+	return len(idx.entries)
+}
+
+// WriteKdxIndex writes a .kdx file from a slice of entries.
+//
+// Layout: magic (4 bytes), stride (uint32 LE), entry count (uint32 LE),
+// then for each entry the k-mer and its offset as two uint64 LE values.
+//
+// stride must be positive and, like the number of entries, must fit in a
+// uint32; otherwise an error is returned and no file is created. The
+// whole index is serialized in memory and written with a single call, and
+// an error reported when closing the file is returned to the caller so a
+// failed write-back is not silently lost.
+func WriteKdxIndex(path string, stride int, entries []kdxEntry) (err error) {
+	const (
+		headerSize = 12
+		entrySize  = 16
+		maxUint32  = 0xFFFFFFFF
+	)
+
+	if stride <= 0 || int64(stride) > maxUint32 {
+		return fmt.Errorf("kdx: invalid stride %d", stride)
+	}
+	if uint64(len(entries)) > maxUint32 {
+		return fmt.Errorf("kdx: too many index entries (%d)", len(entries))
+	}
+
+	f, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		// Keep the first error; only surface the close error if the
+		// write itself succeeded.
+		if cerr := f.Close(); err == nil {
+			err = cerr
+		}
+	}()
+
+	buf := make([]byte, headerSize+entrySize*len(entries))
+	copy(buf[0:4], kdxMagic[:])
+	binary.LittleEndian.PutUint32(buf[4:8], uint32(stride))
+	binary.LittleEndian.PutUint32(buf[8:12], uint32(len(entries)))
+
+	for i := range entries {
+		rec := buf[headerSize+i*entrySize:]
+		binary.LittleEndian.PutUint64(rec[0:8], entries[i].kmer)
+		binary.LittleEndian.PutUint64(rec[8:16], entries[i].offset)
+	}
+
+	_, err = f.Write(buf)
+	return err
+}
+
+// KdxPathForKdi derives the companion index path of a .kdi file: a
+// trailing ".kdi" is dropped if present, then ".kdx" is appended.
+func KdxPathForKdi(kdiPath string) string {
+	stem := strings.TrimSuffix(kdiPath, ".kdi")
+	return stem + ".kdx"
+}
--- a/pkg/obikmer/kmer_match.go
+++ b/pkg/obikmer/kmer_match.go
@@ -0,0 +1,256 @@
+package obikmer
+
+import (
+	"cmp"
+	"slices"
+	"sync"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+)
+
+// QueryEntry represents a canonical k-mer to look up, together with
+// metadata to trace the result back to the originating sequence and position.
+// Query buckets are slices of QueryEntry kept sorted by Kmer.
+type QueryEntry struct {
+	Kmer   uint64 // canonical k-mer value
+	SeqIdx int    // index within the batch
+	Pos    int    // 1-based position in the sequence
+}
+
+// MatchResult holds matched positions for each sequence in a batch.
+// Element i contains the sorted matched positions for sequence i; it is
+// nil when no k-mer of that sequence matched.
+type MatchResult [][]int
+
+// PreparedQueries holds pre-computed query buckets along with the number
+// of sequences they were built from. This is used by the accumulation
+// pipeline to merge queries from multiple batches.
+//
+// Buckets is indexed by partition number; NSeqs also serves as the
+// SeqIdx offset applied to entries merged in later by MergeQueries.
+type PreparedQueries struct {
+	Buckets [][]QueryEntry // queries[partition], each sorted by Kmer
+	NSeqs   int            // number of sequences that produced these queries
+	NKmers  int            // total number of k-mer entries across all partitions
+}
+
+// MergeQueries folds the queries of src into dst.
+//
+// Every SeqIdx found in src is shifted by the number of sequences dst held
+// before the merge, so indices from both batches stay distinct. The shift
+// is applied to src's entries in place, and a src bucket may end up being
+// adopted as-is by dst, hence src must not be reused afterwards.
+//
+// dst and src must have the same number of partitions. Within each
+// partition the two sorted buckets are combined with a linear two-way
+// merge; on equal k-mers the entries of dst come first.
+func MergeQueries(dst, src *PreparedQueries) {
+	// dst.NSeqs only changes after the loop, so the shift is constant.
+	shift := dst.NSeqs
+
+	for p := range dst.Buckets {
+		incoming := src.Buckets[p]
+		if len(incoming) == 0 {
+			continue
+		}
+
+		// Re-base sequence indices of the incoming entries.
+		for n := range incoming {
+			incoming[n].SeqIdx += shift
+		}
+
+		existing := dst.Buckets[p]
+		if len(existing) == 0 {
+			// Nothing to interleave with: adopt the incoming bucket.
+			dst.Buckets[p] = incoming
+			continue
+		}
+
+		// Two-way merge of the sorted buckets.
+		combined := make([]QueryEntry, 0, len(existing)+len(incoming))
+		a, b := 0, 0
+		for a < len(existing) && b < len(incoming) {
+			if incoming[b].Kmer < existing[a].Kmer {
+				combined = append(combined, incoming[b])
+				b++
+				continue
+			}
+			combined = append(combined, existing[a])
+			a++
+		}
+		combined = append(combined, existing[a:]...)
+		combined = append(combined, incoming[b:]...)
+		dst.Buckets[p] = combined
+	}
+
+	dst.NSeqs += src.NSeqs
+	dst.NKmers += src.NKmers
+}
+
+// PrepareQueries turns a batch of sequences into per-partition query
+// buckets. Each sequence is cut into super-kmers; the minimizer of a
+// super-kmer, taken modulo the number of partitions, selects the bucket
+// that receives all canonical k-mers of that super-kmer. Sequences and
+// super-kmers shorter than k contribute nothing.
+//
+// The returned PreparedQueries has every bucket sorted by k-mer value,
+// NSeqs set to len(sequences) and NKmers set to the number of entries
+// produced.
+func (ksg *KmerSetGroup) PrepareQueries(sequences []*obiseq.BioSequence) *PreparedQueries {
+	nParts := ksg.partitions
+	kSize, mSize := ksg.k, ksg.m
+
+	// One bucket per partition, each with a small initial capacity.
+	perPart := make([][]QueryEntry, nParts)
+	for p := range perPart {
+		perPart[p] = make([]QueryEntry, 0, 64)
+	}
+
+	produced := 0
+	for seqIdx, seq := range sequences {
+		nuc := seq.Sequence()
+		if len(nuc) < kSize {
+			continue
+		}
+
+		// Super-kmers give the minimizer → partition mapping.
+		for sk := range IterSuperKmers(nuc, kSize, mSize) {
+			target := int(sk.Minimizer % uint64(nParts))
+
+			if len(sk.Sequence) < kSize {
+				continue
+			}
+
+			// pos is the 1-based position, in the full sequence, of the
+			// k-mer currently being emitted.
+			pos := sk.Start + 1
+			for code := range IterCanonicalKmers(sk.Sequence, kSize) {
+				perPart[target] = append(perPart[target], QueryEntry{
+					Kmer:   code,
+					SeqIdx: seqIdx,
+					Pos:    pos,
+				})
+				pos++
+				produced++
+			}
+		}
+	}
+
+	// The merge-scan against the KDI stream needs each bucket sorted by k-mer.
+	byKmer := func(x, y QueryEntry) int {
+		return cmp.Compare(x.Kmer, y.Kmer)
+	}
+	for p := range perPart {
+		slices.SortFunc(perPart[p], byKmer)
+	}
+
+	return &PreparedQueries{
+		Buckets: perPart,
+		NSeqs:   len(sequences),
+		NKmers:  produced,
+	}
+}
+
+// MatchBatch looks up prepared queries in one set of the index.
+//
+// One goroutine is started per partition that has at least one query;
+// each runs matchPartition on its bucket. All goroutines write into a
+// shared per-sequence result table, with one mutex per sequence guarding
+// the corresponding slot. Once every goroutine has finished, the
+// positions collected for each sequence are sorted.
+//
+// The returned MatchResult has pq.NSeqs elements; element i lists, in
+// increasing order, the matched positions of sequence i.
+func (ksg *KmerSetGroup) MatchBatch(setIndex int, pq *PreparedQueries) MatchResult {
+	hits := make([][]int, pq.NSeqs)
+	locks := make([]sync.Mutex, pq.NSeqs)
+
+	var pending sync.WaitGroup
+	for p := 0; p < ksg.partitions; p++ {
+		bucket := pq.Buckets[p]
+		if len(bucket) == 0 {
+			continue
+		}
+		pending.Add(1)
+		go func(part int, queries []QueryEntry) {
+			defer pending.Done()
+			ksg.matchPartition(setIndex, part, queries, hits, locks)
+		}(p, bucket)
+	}
+	pending.Wait()
+
+	// Partitions report matches in no particular order across each other,
+	// so positions are put in order here. Sorting a slice of length 0 or 1
+	// leaves it as it is.
+	for _, positions := range hits {
+		slices.Sort(positions)
+	}
+
+	return MatchResult(hits)
+}
+
+// matchPartition processes one partition: opens the KDI reader (with index),
+// seeks to the first query, then merge-scans queries against the KDI stream.
+//
+// queries must be sorted by Kmer. For every query whose k-mer is found in
+// the stream, its Pos is appended to results[SeqIdx] while mus[SeqIdx] is
+// held, since other partitions may append to the same sequence slot.
+//
+// The function returns nothing: if the reader cannot be opened, a seek
+// fails, or the stream ends, it simply returns and the remaining queries
+// are left unmatched.
+func (ksg *KmerSetGroup) matchPartition(
+	setIndex int,
+	partIndex int,
+	queries []QueryEntry, // sorted by Kmer
+	results [][]int,
+	mus []sync.Mutex,
+) {
+	r, err := NewKdiIndexedReader(ksg.partitionPath(setIndex, partIndex))
+	if err != nil {
+		// The error is not reported; this partition yields no matches.
+		return
+	}
+	defer r.Close()
+
+	if r.Count() == 0 || len(queries) == 0 {
+		return
+	}
+
+	// Seek to the first query's neighborhood
+	if err := r.SeekTo(queries[0].Kmer); err != nil {
+		return
+	}
+
+	// Read first kmer from the stream after seek
+	currentKmer, ok := r.Next()
+	if !ok {
+		return
+	}
+
+	qi := 0 // query index
+
+	for qi < len(queries) {
+		q := queries[qi]
+
+		// If the next query is far ahead, re-seek instead of linear scan.
+		// Only seek if we'd skip more k-mers than the index stride,
+		// otherwise linear scan through the buffer is faster than a syscall.
+		if r.index != nil && q.Kmer > currentKmer && r.Remaining() > uint64(r.index.stride) {
+			_, skipCount, found := r.index.FindOffset(q.Kmer)
+			if found && skipCount > r.read+uint64(r.index.stride) {
+				if err := r.SeekTo(q.Kmer); err == nil {
+					// currentKmer is below q.Kmer here, so replacing it
+					// with the next value read cannot skip a match.
+					nextKmer, nextOk := r.Next()
+					if !nextOk {
+						return
+					}
+					currentKmer = nextKmer
+					// ok is reassigned before any later read of it.
+					ok = true
+				}
+			}
+		}
+
+		// Advance KDI stream until >= query kmer
+		for currentKmer < q.Kmer {
+			currentKmer, ok = r.Next()
+			if !ok {
+				return // KDI exhausted
+			}
+		}
+
+		if currentKmer == q.Kmer {
+			// Match! Record all queries with this same k-mer value
+			matchedKmer := q.Kmer
+			for qi < len(queries) && queries[qi].Kmer == matchedKmer {
+				idx := queries[qi].SeqIdx
+				mus[idx].Lock()
+				results[idx] = append(results[idx], queries[qi].Pos)
+				mus[idx].Unlock()
+				qi++
+			}
+		} else {
+			// currentKmer > q.Kmer: skip all queries with this kmer value
+			skippedKmer := q.Kmer
+			for qi < len(queries) && queries[qi].Kmer == skippedKmer {
+				qi++
+			}
+		}
+	}
+}
--- a/pkg/obikmer/kmer_set.go
+++ b/pkg/obikmer/kmer_set.go
@@ -1,217 +0,0 @@
-package obikmer
-
-import (
-	"fmt"
-
-	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
-	"github.com/RoaringBitmap/roaring/roaring64"
-)
-
-// KmerSet wraps a set of k-mers stored in a Roaring Bitmap
-// Provides utility methods for manipulating k-mer sets
-type KmerSet struct {
-	id       string                 // Unique identifier of the KmerSet
-	k        int                    // Size of k-mers (immutable)
-	bitmap   *roaring64.Bitmap      // Bitmap containing the k-mers
-	Metadata map[string]interface{} // User metadata (key=atomic value)
-}
-
-// NewKmerSet creates a new empty KmerSet
-func NewKmerSet(k int) *KmerSet {
-	return &KmerSet{
-		k:        k,
-		bitmap:   roaring64.New(),
-		Metadata: make(map[string]interface{}),
-	}
-}
-
-// NewKmerSetFromBitmap creates a KmerSet from an existing bitmap
-func NewKmerSetFromBitmap(k int, bitmap *roaring64.Bitmap) *KmerSet {
-	return &KmerSet{
-		k:        k,
-		bitmap:   bitmap,
-		Metadata: make(map[string]interface{}),
-	}
-}
-
-// K returns the size of k-mers (immutable)
-func (ks *KmerSet) K() int {
-	return ks.k
-}
-
-// AddKmerCode adds an encoded k-mer to the set
-func (ks *KmerSet) AddKmerCode(kmer uint64) {
-	ks.bitmap.Add(kmer)
-}
-
-// AddCanonicalKmerCode adds an encoded canonical k-mer to the set
-func (ks *KmerSet) AddCanonicalKmerCode(kmer uint64) {
-	canonical := CanonicalKmer(kmer, ks.k)
-	ks.bitmap.Add(canonical)
-}
-
-// AddKmer adds a k-mer to the set by encoding the sequence
-// The sequence must have exactly k nucleotides
-// Zero-allocation: encodes directly without creating an intermediate slice
-func (ks *KmerSet) AddKmer(seq []byte) {
-	kmer := EncodeKmer(seq, ks.k)
-	ks.bitmap.Add(kmer)
-}
-
-// AddCanonicalKmer adds a canonical k-mer to the set by encoding the sequence
-// The sequence must have exactly k nucleotides
-// Zero-allocation: encodes directly in canonical form without creating an intermediate slice
-func (ks *KmerSet) AddCanonicalKmer(seq []byte) {
-	canonical := EncodeCanonicalKmer(seq, ks.k)
-	ks.bitmap.Add(canonical)
-}
-
-// AddSequence adds all k-mers from a sequence to the set
-// Uses an iterator to avoid allocating an intermediate vector
-func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) {
-	rawSeq := seq.Sequence()
-	for canonical := range IterCanonicalKmers(rawSeq, ks.k) {
-		ks.bitmap.Add(canonical)
-	}
-}
-
-// AddSequences adds all k-mers from multiple sequences in batch
-func (ks *KmerSet) AddSequences(sequences *obiseq.BioSequenceSlice) {
-	for _, seq := range *sequences {
-		ks.AddSequence(seq)
-	}
-}
-
-// Contains checks if a k-mer is in the set
-func (ks *KmerSet) Contains(kmer uint64) bool {
-	return ks.bitmap.Contains(kmer)
-}
-
-// Len returns the number of k-mers in the set
-func (ks *KmerSet) Len() uint64 {
-	return ks.bitmap.GetCardinality()
-}
-
-// MemoryUsage returns memory usage in bytes
-func (ks *KmerSet) MemoryUsage() uint64 {
-	return ks.bitmap.GetSizeInBytes()
-}
-
-// Clear empties the set
-func (ks *KmerSet) Clear() {
-	ks.bitmap.Clear()
-}
-
-// Copy creates a copy of the set (consistent with BioSequence.Copy)
-func (ks *KmerSet) Copy() *KmerSet {
-	// Copy metadata
-	metadata := make(map[string]interface{}, len(ks.Metadata))
-	for k, v := range ks.Metadata {
-		metadata[k] = v
-	}
-
-	return &KmerSet{
-		id:       ks.id,
-		k:        ks.k,
-		bitmap:   ks.bitmap.Clone(),
-		Metadata: metadata,
-	}
-}
-
-// Id returns the identifier of the KmerSet (consistent with BioSequence.Id)
-func (ks *KmerSet) Id() string {
-	return ks.id
-}
-
-// SetId sets the identifier of the KmerSet (consistent with BioSequence.SetId)
-func (ks *KmerSet) SetId(id string) {
-	ks.id = id
-}
-
-// Union returns the union of this set with another
-func (ks *KmerSet) Union(other *KmerSet) *KmerSet {
-	if ks.k != other.k {
-		panic(fmt.Sprintf("Cannot union KmerSets with different k values: %d vs %d", ks.k, other.k))
-	}
-	result := ks.bitmap.Clone()
-	result.Or(other.bitmap)
-	return NewKmerSetFromBitmap(ks.k, result)
-}
-
-// Intersect returns the intersection of this set with another
-func (ks *KmerSet) Intersect(other *KmerSet) *KmerSet {
-	if ks.k != other.k {
-		panic(fmt.Sprintf("Cannot intersect KmerSets with different k values: %d vs %d", ks.k, other.k))
-	}
-	result := ks.bitmap.Clone()
-	result.And(other.bitmap)
-	return NewKmerSetFromBitmap(ks.k, result)
-}
-
-// Difference returns the difference of this set with another (this - other)
-func (ks *KmerSet) Difference(other *KmerSet) *KmerSet {
-	if ks.k != other.k {
-		panic(fmt.Sprintf("Cannot subtract KmerSets with different k values: %d vs %d", ks.k, other.k))
-	}
-	result := ks.bitmap.Clone()
-	result.AndNot(other.bitmap)
-	return NewKmerSetFromBitmap(ks.k, result)
-}
-
-// JaccardDistance computes the Jaccard distance between two KmerSets.
-// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A ∪ B|)
-// where A and B are the two sets.
-//
-// Returns:
-//   - 0.0 when sets are identical (distance = 0, similarity = 1)
-//   - 1.0 when sets are completely disjoint (distance = 1, similarity = 0)
-//   - 1.0 when both sets are empty (by convention)
-//
-// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
-// Space complexity: O(1) as operations are done in-place on temporary bitmaps
-func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 {
-	if ks.k != other.k {
-		panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k))
-	}
-
-	// Compute intersection cardinality
-	intersectionCard := ks.bitmap.AndCardinality(other.bitmap)
-
-	// Compute union cardinality
-	unionCard := ks.bitmap.OrCardinality(other.bitmap)
-
-	// If union is empty, both sets are empty - return 1.0 by convention
-	if unionCard == 0 {
-		return 1.0
-	}
-
-	// Jaccard similarity = |A ∩ B| / |A ∪ B|
-	similarity := float64(intersectionCard) / float64(unionCard)
-
-	// Jaccard distance = 1 - similarity
-	return 1.0 - similarity
-}
-
-// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets.
-// The Jaccard similarity is defined as: |A ∩ B| / |A ∪ B|
-//
-// Returns:
-//   - 1.0 when sets are identical (maximum similarity)
-//   - 0.0 when sets are completely disjoint (no similarity)
-//   - 0.0 when both sets are empty (by convention)
-//
-// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
-// Space complexity: O(1) as operations are done in-place on temporary bitmaps
-func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 {
-	return 1.0 - ks.JaccardDistance(other)
-}
-
-// Iterator returns an iterator over all k-mers in the set
-func (ks *KmerSet) Iterator() roaring64.IntIterable64 {
-	return ks.bitmap.Iterator()
-}
-
-// Bitmap returns the underlying bitmap (for compatibility)
-func (ks *KmerSet) Bitmap() *roaring64.Bitmap {
-	return ks.bitmap
-}
--- a/pkg/obikmer/kmer_set_attributes.go
+++ b/pkg/obikmer/kmer_set_attributes.go
@@ -1,362 +0,0 @@
-package obikmer
-
-import (
-	"fmt"
-	"strconv"
-
-	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
-)
-
-// ==================================
-// KMER SET ATTRIBUTE API
-// Mimic BioSequence attribute API from obiseq/attributes.go
-// ==================================
-
-// HasAttribute vérifie si une clé d'attribut existe
-func (ks *KmerSet) HasAttribute(key string) bool {
-	_, ok := ks.Metadata[key]
-	return ok
-}
-
-// GetAttribute récupère la valeur d'un attribut
-// Cas particuliers: "id" utilise Id(), "k" utilise K()
-func (ks *KmerSet) GetAttribute(key string) (interface{}, bool) {
-	switch key {
-	case "id":
-		return ks.Id(), true
-	case "k":
-		return ks.K(), true
-	default:
-		value, ok := ks.Metadata[key]
-		return value, ok
-	}
-}
-
-// SetAttribute sets the value of an attribute
-// Cas particuliers: "id" utilise SetId(), "k" est immutable (panique)
-func (ks *KmerSet) SetAttribute(key string, value interface{}) {
-	switch key {
-	case "id":
-		if id, ok := value.(string); ok {
-			ks.SetId(id)
-		} else {
-			panic(fmt.Sprintf("id must be a string, got %T", value))
-		}
-	case "k":
-		panic("k is immutable and cannot be modified via SetAttribute")
-	default:
-		ks.Metadata[key] = value
-	}
-}
-
-// DeleteAttribute supprime un attribut
-func (ks *KmerSet) DeleteAttribute(key string) {
-	delete(ks.Metadata, key)
-}
-
-// RemoveAttribute supprime un attribut (alias de DeleteAttribute)
-func (ks *KmerSet) RemoveAttribute(key string) {
-	ks.DeleteAttribute(key)
-}
-
-// RenameAttribute renomme un attribut
-func (ks *KmerSet) RenameAttribute(newName, oldName string) {
-	if value, ok := ks.Metadata[oldName]; ok {
-		ks.Metadata[newName] = value
-		delete(ks.Metadata, oldName)
-	}
-}
-
-// GetIntAttribute récupère un attribut en tant qu'entier
-func (ks *KmerSet) GetIntAttribute(key string) (int, bool) {
-	value, ok := ks.Metadata[key]
-	if !ok {
-		return 0, false
-	}
-
-	switch v := value.(type) {
-	case int:
-		return v, true
-	case int64:
-		return int(v), true
-	case float64:
-		return int(v), true
-	case string:
-		if i, err := strconv.Atoi(v); err == nil {
-			return i, true
-		}
-	}
-	return 0, false
-}
-
-// GetFloatAttribute récupère un attribut en tant que float64
-func (ks *KmerSet) GetFloatAttribute(key string) (float64, bool) {
-	value, ok := ks.Metadata[key]
-	if !ok {
-		return 0, false
-	}
-
-	switch v := value.(type) {
-	case float64:
-		return v, true
-	case float32:
-		return float64(v), true
-	case int:
-		return float64(v), true
-	case int64:
-		return float64(v), true
-	case string:
-		if f, err := strconv.ParseFloat(v, 64); err == nil {
-			return f, true
-		}
-	}
-	return 0, false
-}
-
-// GetNumericAttribute récupère un attribut numérique (alias de GetFloatAttribute)
-func (ks *KmerSet) GetNumericAttribute(key string) (float64, bool) {
-	return ks.GetFloatAttribute(key)
-}
-
-// GetStringAttribute récupère un attribut en tant que chaîne
-func (ks *KmerSet) GetStringAttribute(key string) (string, bool) {
-	value, ok := ks.Metadata[key]
-	if !ok {
-		return "", false
-	}
-
-	switch v := value.(type) {
-	case string:
-		return v, true
-	default:
-		return fmt.Sprintf("%v", v), true
-	}
-}
-
-// GetBoolAttribute récupère un attribut en tant que booléen
-func (ks *KmerSet) GetBoolAttribute(key string) (bool, bool) {
-	value, ok := ks.Metadata[key]
-	if !ok {
-		return false, false
-	}
-
-	switch v := value.(type) {
-	case bool:
-		return v, true
-	case int:
-		return v != 0, true
-	case string:
-		if b, err := strconv.ParseBool(v); err == nil {
-			return b, true
-		}
-	}
-	return false, false
-}
-
-// AttributeKeys returns the set of attribute keys
-func (ks *KmerSet) AttributeKeys() obiutils.Set[string] {
-	keys := obiutils.MakeSet[string]()
-	for key := range ks.Metadata {
-		keys.Add(key)
-	}
-	return keys
-}
-
-// Keys returns the set of attribute keys (alias of AttributeKeys)
-func (ks *KmerSet) Keys() obiutils.Set[string] {
-	return ks.AttributeKeys()
-}
-
-// ==================================
-// KMER SET GROUP ATTRIBUTE API
-// Métadonnées du groupe + accès via Get() pour les sets individuels
-// ==================================
-
-// HasAttribute vérifie si une clé d'attribut existe pour le groupe
-func (ksg *KmerSetGroup) HasAttribute(key string) bool {
-	_, ok := ksg.Metadata[key]
-	return ok
-}
-
-// GetAttribute récupère la valeur d'un attribut du groupe
-// Cas particuliers: "id" utilise Id(), "k" utilise K()
-func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool) {
-	switch key {
-	case "id":
-		return ksg.Id(), true
-	case "k":
-		return ksg.K(), true
-	default:
-		value, ok := ksg.Metadata[key]
-		return value, ok
-	}
-}
-
-// SetAttribute sets the value of an attribute du groupe
-// Cas particuliers: "id" utilise SetId(), "k" est immutable (panique)
-func (ksg *KmerSetGroup) SetAttribute(key string, value interface{}) {
-	switch key {
-	case "id":
-		if id, ok := value.(string); ok {
-			ksg.SetId(id)
-		} else {
-			panic(fmt.Sprintf("id must be a string, got %T", value))
-		}
-	case "k":
-		panic("k is immutable and cannot be modified via SetAttribute")
-	default:
-		ksg.Metadata[key] = value
-	}
-}
-
-// DeleteAttribute supprime un attribut du groupe
-func (ksg *KmerSetGroup) DeleteAttribute(key string) {
-	delete(ksg.Metadata, key)
-}
-
-// RemoveAttribute supprime un attribut du groupe (alias)
-func (ksg *KmerSetGroup) RemoveAttribute(key string) {
-	ksg.DeleteAttribute(key)
-}
-
-// RenameAttribute renomme un attribut du groupe
-func (ksg *KmerSetGroup) RenameAttribute(newName, oldName string) {
-	if value, ok := ksg.Metadata[oldName]; ok {
-		ksg.Metadata[newName] = value
-		delete(ksg.Metadata, oldName)
-	}
-}
-
-// GetIntAttribute récupère un attribut entier du groupe
-func (ksg *KmerSetGroup) GetIntAttribute(key string) (int, bool) {
-	value, ok := ksg.GetAttribute(key)
-	if !ok {
-		return 0, false
-	}
-
-	switch v := value.(type) {
-	case int:
-		return v, true
-	case int64:
-		return int(v), true
-	case float64:
-		return int(v), true
-	case string:
-		if i, err := strconv.Atoi(v); err == nil {
-			return i, true
-		}
-	}
-	return 0, false
-}
-
-// GetFloatAttribute récupère un attribut float64 du groupe
-func (ksg *KmerSetGroup) GetFloatAttribute(key string) (float64, bool) {
-	value, ok := ksg.GetAttribute(key)
-	if !ok {
-		return 0, false
-	}
-
-	switch v := value.(type) {
-	case float64:
-		return v, true
-	case float32:
-		return float64(v), true
-	case int:
-		return float64(v), true
-	case int64:
-		return float64(v), true
-	case string:
-		if f, err := strconv.ParseFloat(v, 64); err == nil {
-			return f, true
-		}
-	}
-	return 0, false
-}
-
-// GetNumericAttribute récupère un attribut numérique du groupe
-func (ksg *KmerSetGroup) GetNumericAttribute(key string) (float64, bool) {
-	return ksg.GetFloatAttribute(key)
-}
-
-// GetStringAttribute récupère un attribut chaîne du groupe
-func (ksg *KmerSetGroup) GetStringAttribute(key string) (string, bool) {
-	value, ok := ksg.GetAttribute(key)
-	if !ok {
-		return "", false
-	}
-
-	switch v := value.(type) {
-	case string:
-		return v, true
-	default:
-		return fmt.Sprintf("%v", v), true
-	}
-}
-
-// GetBoolAttribute récupère un attribut booléen du groupe
-func (ksg *KmerSetGroup) GetBoolAttribute(key string) (bool, bool) {
-	value, ok := ksg.GetAttribute(key)
-	if !ok {
-		return false, false
-	}
-
-	switch v := value.(type) {
-	case bool:
-		return v, true
-	case int:
-		return v != 0, true
-	case string:
-		if b, err := strconv.ParseBool(v); err == nil {
-			return b, true
-		}
-	}
-	return false, false
-}
-
-// AttributeKeys returns the set of attribute keys du groupe
-func (ksg *KmerSetGroup) AttributeKeys() obiutils.Set[string] {
-	keys := obiutils.MakeSet[string]()
-	for key := range ksg.Metadata {
-		keys.Add(key)
-	}
-	return keys
-}
-
-// Keys returns the set of group attribute keys (alias)
-func (ksg *KmerSetGroup) Keys() obiutils.Set[string] {
-	return ksg.AttributeKeys()
-}
-
-// ==================================
-// MÉTHODES POUR ACCÉDER AUX ATTRIBUTS DES SETS INDIVIDUELS VIA Get()
-// Architecture zero-copy: ksg.Get(i).SetAttribute(...)
-// ==================================
-
-// Exemple d'utilisation:
-// Pour accéder aux métadonnées d'un KmerSet individuel dans un groupe:
-//   ks := ksg.Get(0)
-//   ks.SetAttribute("level", 1)
-//   hasLevel := ks.HasAttribute("level")
-//
-// Pour les métadonnées du groupe:
-//   ksg.SetAttribute("name", "FrequencyFilter")
-//   name, ok := ksg.GetStringAttribute("name")
-
-// AllAttributeKeys returns all unique attribute keys of the group AND all its sets
-func (ksg *KmerSetGroup) AllAttributeKeys() obiutils.Set[string] {
-	keys := obiutils.MakeSet[string]()
-
-	// Ajouter les clés du groupe
-	for key := range ksg.Metadata {
-		keys.Add(key)
-	}
-
-	// Ajouter les clés de chaque set
-	for _, ks := range ksg.sets {
-		for key := range ks.Metadata {
-			keys.Add(key)
-		}
-	}
-
-	return keys
-}
--- a/pkg/obikmer/kmer_set_builder.go
+++ b/pkg/obikmer/kmer_set_builder.go
@@ -0,0 +1,702 @@
+package obikmer
+
+import (
+	"fmt"
+	"math"
+	"os"
+	"path/filepath"
+	"slices"
+	"sync"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+	"github.com/schollz/progressbar/v3"
+)
+
+// BuilderOption is a functional option for KmerSetGroupBuilder.
+type BuilderOption func(*builderConfig)
+
+type builderConfig struct {
+	minFreq          int     // <= 0 means no minimum frequency (simple dedup)
+	maxFreq          int     // <= 0 means no upper bound
+	saveFreqTopN     int     // >0 means save the N most frequent k-mers per set to CSV
+	entropyThreshold float64 // >0 enables the entropy filter (entropyLevelMax must be >0 too)
+	entropyLevelMax  int     // max sub-word size for entropy (typically 6)
+}
+
+// WithMinFrequency returns an option that sets the lower frequency bound:
+// at finalization a k-mer is kept only if it was seen at least minFreq
+// times. Values <= 0 leave plain deduplication in place, i.e. every
+// distinct k-mer passes the lower bound.
+func WithMinFrequency(minFreq int) BuilderOption {
+	return func(cfg *builderConfig) { cfg.minFreq = minFreq }
+}
+
+// WithMaxFrequency returns an option that sets the upper frequency bound:
+// at finalization a k-mer is kept only if it was seen at most maxFreq
+// times. Values <= 0 disable the upper bound, so no k-mer is rejected
+// for being too frequent.
+func WithMaxFrequency(maxFreq int) BuilderOption {
+	return func(cfg *builderConfig) { cfg.maxFreq = maxFreq }
+}
+
+// WithSaveFreqKmers returns an option asking the builder to record, for
+// each set, its n most frequent k-mers in a CSV file (top_kmers.csv in the
+// set directory). The file is written only when n > 0 and the set yields
+// at least one k-mer.
+func WithSaveFreqKmers(n int) BuilderOption {
+	return func(cfg *builderConfig) { cfg.saveFreqTopN = n }
+}
+
+// WithEntropyFilter returns an option enabling the entropy-based
+// low-complexity filter applied during finalization. threshold is the
+// entropy cut-off and levelMax the maximum sub-word size used to compute
+// the entropy (typically 6); the filter runs only when both are > 0.
+func WithEntropyFilter(threshold float64, levelMax int) BuilderOption {
+	return func(cfg *builderConfig) {
+		cfg.entropyThreshold, cfg.entropyLevelMax = threshold, levelMax
+	}
+}
+
+// KmerSetGroupBuilder constructs a KmerSetGroup on disk.
+// During construction, super-kmers are written to temporary .skm files
+// partitioned by minimizer. On Close(), each partition is finalized
+// (sort, dedup, optional frequency and entropy filters) into .kdi files.
+type KmerSetGroupBuilder struct {
+	dir        string
+	k          int
+	m          int
+	n          int // number of NEW sets being built
+	P          int // number of partitions
+	startIndex int // first set index (0 for new groups, existingN for appends)
+	config     builderConfig
+	existing   *KmerSetGroup  // non-nil when appending to existing group
+	writers    [][]*SkmWriter // [setIndex][partIndex] (local index 0..n-1)
+	mu         [][]sync.Mutex // one mutex per writer, indexed like writers
+	closed     bool           // set by Close; a second Close returns an error
+}
+
+// NewKmerSetGroupBuilder creates a builder for a new KmerSetGroup.
+//
+// Parameters:
+//   - directory: destination directory (created if necessary)
+//   - k: k-mer size (2-31)
+//   - m: minimizer size (negative for auto = ceil(k/2.5)); the value is
+//     then clamped to [1, k-1]
+//   - n: number of sets in the group (>= 1)
+//   - P: number of partitions (<= 0 for auto = 4^m clamped to [64, 4096])
+//   - options: optional builder options (e.g. WithMinFrequency)
+//
+// One .skm writer is opened per (set, partition) pair under
+// directory/.build/set_<s>/, so n*P writers exist until Close is called;
+// Close also removes the .build directory. An error is returned when k or
+// n is out of range, or when a build directory or a writer cannot be
+// created; in the latter case the writers opened so far are closed before
+// returning.
+func NewKmerSetGroupBuilder(directory string, k, m, n, P int,
+	options ...BuilderOption) (*KmerSetGroupBuilder, error) {
+
+	if k < 2 || k > 31 {
+		return nil, fmt.Errorf("obikmer: k must be between 2 and 31, got %d", k)
+	}
+	if n < 1 {
+		return nil, fmt.Errorf("obikmer: n must be >= 1, got %d", n)
+	}
+
+	// Auto minimizer size, then clamp into [1, k-1]
+	if m < 0 {
+		m = int(math.Ceil(float64(k) / 2.5))
+	}
+	m = max(m, 1)
+	m = min(m, k-1)
+
+	// Auto partition count: 4^m clamped to [64, 4096]. P == 0 is treated as
+	// "auto" too: a zero partition count would make AddSequence and
+	// AddSuperKmer panic on the division in Minimizer % P.
+	if P <= 0 {
+		P = 1 << (2 * m) // 4^m
+		P = min(max(P, 64), 4096)
+	}
+
+	// Apply options
+	var config builderConfig
+	for _, opt := range options {
+		opt(&config)
+	}
+
+	// Create build directory structure
+	buildDir := filepath.Join(directory, ".build")
+	for s := 0; s < n; s++ {
+		setDir := filepath.Join(buildDir, fmt.Sprintf("set_%d", s))
+		if err := os.MkdirAll(setDir, 0755); err != nil {
+			return nil, fmt.Errorf("obikmer: create build dir: %w", err)
+		}
+	}
+
+	// Create SKM writers
+	writers := make([][]*SkmWriter, n)
+	mutexes := make([][]sync.Mutex, n)
+	for s := 0; s < n; s++ {
+		writers[s] = make([]*SkmWriter, P)
+		mutexes[s] = make([]sync.Mutex, P)
+		for p := 0; p < P; p++ {
+			path := filepath.Join(buildDir, fmt.Sprintf("set_%d", s),
+				fmt.Sprintf("part_%04d.skm", p))
+			w, err := NewSkmWriter(path)
+			if err != nil {
+				// Close already-created writers; their own close errors are
+				// ignored because the creation error is the one reported.
+				for _, row := range writers[:s+1] {
+					for _, opened := range row {
+						if opened != nil {
+							opened.Close()
+						}
+					}
+				}
+				return nil, fmt.Errorf("obikmer: create skm writer: %w", err)
+			}
+			writers[s][p] = w
+		}
+	}
+
+	return &KmerSetGroupBuilder{
+		dir:        directory,
+		k:          k,
+		m:          m,
+		n:          n,
+		P:          P,
+		startIndex: 0,
+		config:     config,
+		writers:    writers,
+		mu:         mutexes,
+	}, nil
+}
+
+// AppendKmerSetGroupBuilder opens the KmerSetGroup stored in directory and
+// returns a builder that adds n new sets after the ones it already holds.
+// k, m and the partition count are taken from the existing group.
+func AppendKmerSetGroupBuilder(directory string, n int, options ...BuilderOption) (*KmerSetGroupBuilder, error) {
+	group, err := OpenKmerSetGroup(directory)
+	if err != nil {
+		return nil, fmt.Errorf("obikmer: open existing group: %w", err)
+	}
+
+	if n < 1 {
+		return nil, fmt.Errorf("obikmer: n must be >= 1, got %d", n)
+	}
+
+	// Geometry inherited from the group; new sets are numbered after its last one.
+	kmerSize, minimizerSize := group.K(), group.M()
+	nParts, firstNew := group.Partitions(), group.Size()
+
+	var cfg builderConfig
+	for _, apply := range options {
+		apply(&cfg)
+	}
+
+	// Temporary files use local set indices (set_0 .. set_<n-1>) under .build/.
+	tmpRoot := filepath.Join(directory, ".build")
+	for local := 0; local < n; local++ {
+		tmpSet := filepath.Join(tmpRoot, fmt.Sprintf("set_%d", local))
+		if err := os.MkdirAll(tmpSet, 0755); err != nil {
+			return nil, fmt.Errorf("obikmer: create build dir: %w", err)
+		}
+	}
+
+	// One writer and one mutex per (new set, partition).
+	skmWriters := make([][]*SkmWriter, n)
+	locks := make([][]sync.Mutex, n)
+	for local := 0; local < n; local++ {
+		skmWriters[local] = make([]*SkmWriter, nParts)
+		locks[local] = make([]sync.Mutex, nParts)
+		for part := 0; part < nParts; part++ {
+			skmPath := filepath.Join(tmpRoot, fmt.Sprintf("set_%d", local),
+				fmt.Sprintf("part_%04d.skm", part))
+			opened, err := NewSkmWriter(skmPath)
+			if err != nil {
+				// Release every writer opened so far before giving up.
+				for _, row := range skmWriters[:local+1] {
+					for _, prev := range row {
+						if prev != nil {
+							prev.Close()
+						}
+					}
+				}
+				return nil, fmt.Errorf("obikmer: create skm writer: %w", err)
+			}
+			skmWriters[local][part] = opened
+		}
+	}
+
+	return &KmerSetGroupBuilder{
+		dir:        directory,
+		k:          kmerSize,
+		m:          minimizerSize,
+		n:          n,
+		P:          nParts,
+		startIndex: firstNew,
+		config:     cfg,
+		existing:   group,
+		writers:    skmWriters,
+		mu:         locks,
+	}, nil
+}
+
+// StartIndex returns the global index of the first set handled by this builder:
+// 0 for a builder made by NewKmerSetGroupBuilder, the existing Size() for appends.
+func (b *KmerSetGroupBuilder) StartIndex() int {
+	return b.startIndex
+}
+
+// AddSequence splits seq into super-kmers (k-mer size b.k, minimizer size b.m)
+// and hands each of them to AddSuperKmer, which appends it to the partition
+// file selected by its minimizer. A setIndex outside [0, b.n) or a sequence
+// shorter than k makes the call a no-op.
+func (b *KmerSetGroupBuilder) AddSequence(setIndex int, seq *obiseq.BioSequence) {
+	if setIndex < 0 || setIndex >= b.n {
+		return
+	}
+	bases := seq.Sequence()
+	if len(bases) < b.k {
+		return
+	}
+	// AddSuperKmer selects the partition and locks its writer.
+	for sk := range IterSuperKmers(bases, b.k, b.m) {
+		b.AddSuperKmer(setIndex, sk)
+	}
+}
+
+// AddSuperKmer appends sk to partition sk.Minimizer % P of set setIndex (no-op if setIndex is out of range).
+func (b *KmerSetGroupBuilder) AddSuperKmer(setIndex int, sk SuperKmer) {
+	if setIndex < 0 || setIndex >= b.n {
+		return
+	}
+	slot := int(sk.Minimizer % uint64(b.P))
+	b.mu[setIndex][slot].Lock()
+	b.writers[setIndex][slot].Write(sk)
+	b.mu[setIndex][slot].Unlock()
+}
+
+// Close finalizes the construction:
+//  1. Flush and close all SKM writers
+//  2. For each partition of each set (in parallel):
+//     - Load super-kmers from .skm
+//     - Extract canonical k-mers
+//     - Sort and deduplicate (count if frequency filter)
+//     - Write .kdi file
+//  3. Write metadata.toml
+//  4. Remove .build/ directory
+//
+// Returns the finalized KmerSetGroup in read-only mode.
+//
+// Close can be called only once; a second call returns an error. When an
+// error is returned, .build/ and any output already written are left on disk.
+func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
+	if b.closed {
+		return nil, fmt.Errorf("obikmer: builder already closed")
+	}
+	b.closed = true
+
+	// 1. Close all SKM writers. Every writer is closed even when one of them
+	// fails, so that no file handle leaks; only the first error is reported.
+	var closeErr error
+	for s := 0; s < b.n; s++ {
+		for p := 0; p < b.P; p++ {
+			if err := b.writers[s][p].Close(); err != nil && closeErr == nil {
+				closeErr = fmt.Errorf("obikmer: close skm writer set=%d part=%d: %w", s, p, err)
+			}
+		}
+	}
+	if closeErr != nil {
+		return nil, closeErr
+	}
+
+	// 2. Create output directory structure for new sets
+	for s := 0; s < b.n; s++ {
+		globalIdx := b.startIndex + s
+		setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx))
+		if err := os.MkdirAll(setDir, 0755); err != nil {
+			return nil, fmt.Errorf("obikmer: create set dir: %w", err)
+		}
+	}
+
+	// =====================================================================
+	// 2-stage pipeline: readers (pure I/O) → workers (CPU + write)
+	//
+	// - nReaders goroutines read .skm files (pure I/O, fast)
+	// - nWorkers goroutines extract k-mers, sort, dedup, filter, write .kdi
+	//
+	// One unbuffered channel between stages. Readers are truly I/O-bound
+	// (small files, buffered reads), workers are CPU-bound and stay busy.
+	// =====================================================================
+	totalJobs := b.n * b.P
+
+	counts := make([][]uint64, b.n)
+	spectra := make([][]map[int]uint64, b.n)
+	var topKmers [][]*TopNKmers
+	for s := 0; s < b.n; s++ {
+		counts[s] = make([]uint64, b.P)
+		spectra[s] = make([]map[int]uint64, b.P)
+	}
+	if b.config.saveFreqTopN > 0 {
+		topKmers = make([][]*TopNKmers, b.n)
+		for s := 0; s < b.n; s++ {
+			topKmers[s] = make([]*TopNKmers, b.P)
+		}
+	}
+
+	nCPU := obidefault.ParallelWorkers()
+
+	// Stage sizing: never more goroutines than jobs
+	// - workers are CPU-bound: one per core
+	// - readers are pure I/O: 2 to 4 goroutines suffice
+	nWorkers := min(nCPU, totalJobs)
+	nReaders := min(max(nCPU/4, 2), 4, totalJobs)
+
+	var bar *progressbar.ProgressBar
+	if obidefault.ProgressBar() {
+		pbopt := []progressbar.Option{
+			progressbar.OptionSetWriter(os.Stderr),
+			progressbar.OptionSetWidth(15),
+			progressbar.OptionShowCount(),
+			progressbar.OptionShowIts(),
+			progressbar.OptionSetPredictTime(true),
+			progressbar.OptionSetDescription("[Finalizing partitions]"),
+		}
+		bar = progressbar.NewOptions(totalJobs, pbopt...)
+	}
+
+	// --- Channel types ---
+	type partitionData struct {
+		setIdx  int
+		partIdx int
+		skmers  []SuperKmer // raw super-kmers from I/O stage
+	}
+
+	type readJob struct {
+		setIdx  int
+		partIdx int
+	}
+
+	dataCh := make(chan *partitionData) // unbuffered
+	readJobs := make(chan readJob, totalJobs)
+
+	var errMu sync.Mutex
+	var firstErr error
+
+	// Fill job queue (buffered, all jobs pre-loaded)
+	for s := 0; s < b.n; s++ {
+		for p := 0; p < b.P; p++ {
+			readJobs <- readJob{s, p}
+		}
+	}
+	close(readJobs)
+
+	// --- Stage 1: Readers (pure I/O) ---
+	var readWg sync.WaitGroup
+	for w := 0; w < nReaders; w++ {
+		readWg.Add(1)
+		go func() {
+			defer readWg.Done()
+			for rj := range readJobs {
+				skmers, err := b.loadPartitionRaw(rj.setIdx, rj.partIdx)
+				if err != nil {
+					errMu.Lock()
+					if firstErr == nil {
+						firstErr = err
+					}
+					errMu.Unlock()
+				}
+				// Always forwarded (skmers may be nil) so the pipeline keeps draining
+				dataCh <- &partitionData{rj.setIdx, rj.partIdx, skmers}
+			}
+		}()
+	}
+
+	go func() {
+		readWg.Wait()
+		close(dataCh)
+	}()
+
+	// --- Stage 2: Workers (CPU: extract k-mers + sort/filter + write .kdi) ---
+	var workWg sync.WaitGroup
+	for w := 0; w < nWorkers; w++ {
+		workWg.Add(1)
+		go func() {
+			defer workWg.Done()
+			for pd := range dataCh {
+				// CPU: extract canonical k-mers from super-kmers
+				kmers := extractCanonicalKmers(pd.skmers, b.k)
+				pd.skmers = nil // allow GC of raw super-kmers
+
+				// CPU: sort, dedup, filter
+				filtered, spectrum, topN := b.sortFilterPartition(kmers)
+				kmers = nil // allow GC of unsorted data
+
+				// I/O: write .kdi file
+				globalIdx := b.startIndex + pd.setIdx
+				kdiPath := filepath.Join(b.dir,
+					fmt.Sprintf("set_%d", globalIdx),
+					fmt.Sprintf("part_%04d.kdi", pd.partIdx))
+
+				n, err := b.writePartitionKdi(kdiPath, filtered)
+				if err != nil {
+					errMu.Lock()
+					if firstErr == nil {
+						firstErr = err
+					}
+					errMu.Unlock()
+				}
+				counts[pd.setIdx][pd.partIdx] = n
+				spectra[pd.setIdx][pd.partIdx] = spectrum
+				if topKmers != nil {
+					topKmers[pd.setIdx][pd.partIdx] = topN
+				}
+				if bar != nil {
+					bar.Add(1)
+				}
+			}
+		}()
+	}
+
+	workWg.Wait()
+
+	if bar != nil {
+		fmt.Fprintln(os.Stderr)
+	}
+
+	if firstErr != nil {
+		return nil, firstErr
+	}
+
+	// Aggregate per-partition spectra into per-set spectra and write spectrum.bin
+	for s := 0; s < b.n; s++ {
+		globalIdx := b.startIndex + s
+		setSpectrum := make(map[int]uint64)
+		for p := 0; p < b.P; p++ {
+			if spectra[s][p] != nil {
+				MergeSpectraMaps(setSpectrum, spectra[s][p])
+			}
+		}
+		if len(setSpectrum) > 0 {
+			specPath := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx), "spectrum.bin")
+			if err := WriteSpectrum(specPath, MapToSpectrum(setSpectrum)); err != nil {
+				return nil, fmt.Errorf("obikmer: write spectrum set=%d: %w", globalIdx, err)
+			}
+		}
+	}
+
+	// Aggregate per-partition top-N k-mers and write CSV
+	if topKmers != nil {
+		for s := 0; s < b.n; s++ {
+			globalIdx := b.startIndex + s
+			merged := NewTopNKmers(b.config.saveFreqTopN)
+			for p := 0; p < b.P; p++ {
+				merged.MergeTopN(topKmers[s][p])
+			}
+			results := merged.Results()
+			if len(results) > 0 {
+				csvPath := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx), "top_kmers.csv")
+				if err := WriteTopKmersCSV(csvPath, results, b.k); err != nil {
+					return nil, fmt.Errorf("obikmer: write top kmers set=%d: %w", globalIdx, err)
+				}
+			}
+		}
+	}
+
+	// 3. Build KmerSetGroup and write metadata
+	newCounts := make([]uint64, b.n)
+	for s := 0; s < b.n; s++ {
+		for p := 0; p < b.P; p++ {
+			newCounts[s] += counts[s][p]
+		}
+	}
+
+	var ksg *KmerSetGroup
+
+	if b.existing != nil {
+		// Append mode: extend existing group
+		ksg = b.existing
+		ksg.n += b.n
+		ksg.setsIDs = append(ksg.setsIDs, make([]string, b.n)...)
+		ksg.counts = append(ksg.counts, newCounts...)
+		newMeta := make([]map[string]interface{}, b.n)
+		for i := range newMeta {
+			newMeta[i] = make(map[string]interface{})
+		}
+		ksg.setsMetadata = append(ksg.setsMetadata, newMeta...)
+	} else {
+		// New group
+		setsIDs := make([]string, b.n)
+		setsMetadata := make([]map[string]interface{}, b.n)
+		for i := range setsMetadata {
+			setsMetadata[i] = make(map[string]interface{})
+		}
+		ksg = &KmerSetGroup{
+			path:         b.dir,
+			k:            b.k,
+			m:            b.m,
+			partitions:   b.P,
+			n:            b.n,
+			setsIDs:      setsIDs,
+			counts:       newCounts,
+			setsMetadata: setsMetadata,
+			Metadata:     make(map[string]interface{}),
+		}
+	}
+
+	if err := ksg.saveMetadata(); err != nil {
+		return nil, fmt.Errorf("obikmer: write metadata: %w", err)
+	}
+
+	// 4. Remove .build/ directory (best effort: the index and its metadata
+	// are already complete, so a cleanup failure is deliberately ignored)
+	buildDir := filepath.Join(b.dir, ".build")
+	_ = os.RemoveAll(buildDir)
+
+	return ksg, nil
+}
+
+// loadPartitionRaw reads a .skm file and returns raw super-kmers.
+// This is pure I/O — no k-mer extraction is done here.
+// Returns nil (not an error) if the .skm file is missing; any other failure
+// to stat or open the file is reported instead of being mistaken for an
+// empty partition, which would silently drop its k-mers from the index.
+func (b *KmerSetGroupBuilder) loadPartitionRaw(setIdx, partIdx int) ([]SuperKmer, error) {
+	skmPath := filepath.Join(b.dir, ".build",
+		fmt.Sprintf("set_%d", setIdx),
+		fmt.Sprintf("part_%04d.skm", partIdx))
+
+	fi, err := os.Stat(skmPath)
+	if os.IsNotExist(err) {
+		return nil, nil // missing file: empty partition, not an error
+	}
+	if err != nil {
+		return nil, fmt.Errorf("obikmer: stat skm file: %w", err)
+	}
+
+	reader, err := NewSkmReader(skmPath)
+	if err != nil {
+		return nil, fmt.Errorf("obikmer: open skm file %s: %w", skmPath, err)
+	}
+	defer reader.Close()
+
+	// Capacity hint from the file size. Each super-kmer record is
+	// 2 bytes (length) + packed bases (~k/4 bytes), so roughly
+	// (2 + k/4) bytes per super-kmer on average, floored at 4.
+	avgRecordSize := max(2+b.k/4, 4)
+	skmers := make([]SuperKmer, 0, int(fi.Size())/avgRecordSize)
+	for {
+		sk, ok := reader.Next()
+		if !ok {
+			break
+		}
+		skmers = append(skmers, sk)
+	}
+
+	return skmers, nil
+}
+
+// extractCanonicalKmers returns the canonical k-mers produced by
+// IterCanonicalKmers for every super-kmer of skmers, in slice order and
+// with duplicates kept. This is the CPU-bound step of the finalization.
+func extractCanonicalKmers(skmers []SuperKmer, k int) []uint64 {
+	// Capacity hint computed up front to avoid repeated slice growth:
+	// a super-kmer of length L is expected to yield L-k+1 k-mers, and
+	// super-kmers shorter than k contribute nothing.
+	capacity := 0
+	for _, sk := range skmers {
+		capacity += max(len(sk.Sequence)-k+1, 0)
+	}
+
+	out := make([]uint64, 0, capacity)
+	// Append the k-mers super-kmer by super-kmer.
+	for i := range skmers {
+		for kmer := range IterCanonicalKmers(skmers[i].Sequence, k) {
+			out = append(out, kmer)
+		}
+	}
+	return out
+}
+
+// sortFilterPartition turns the raw k-mers of one partition into its final
+// content. kmers is sorted in place, then each run of equal values is counted;
+// a value is kept when its count lies within [minFreq, maxFreq] (a bound <= 0
+// is disabled) and, if an entropy filter is configured, when that filter
+// accepts it. Returns the kept values in ascending order, the spectrum
+// (count → number of distinct k-mers, over all runs) and the top-N collector
+// (nil unless saveFreqTopN > 0). An empty input yields three nils.
+func (b *KmerSetGroupBuilder) sortFilterPartition(kmers []uint64) ([]uint64, map[int]uint64, *TopNKmers) {
+	if len(kmers) == 0 {
+		return nil, nil, nil
+	}
+
+	// Sort so that identical k-mers become adjacent
+	slices.Sort(kmers)
+
+	lo := max(b.config.minFreq, 1) // 1 = plain deduplication
+	hi := b.config.maxFreq         // <= 0 = no upper bound
+
+	var lowComplexity *KmerEntropyFilter
+	if b.config.entropyThreshold > 0 && b.config.entropyLevelMax > 0 {
+		lowComplexity = NewKmerEntropyFilter(b.k, b.config.entropyLevelMax, b.config.entropyThreshold)
+	}
+
+	var top *TopNKmers
+	if b.config.saveFreqTopN > 0 {
+		top = NewTopNKmers(b.config.saveFreqTopN)
+	}
+
+	spectrum := make(map[int]uint64)
+	kept := make([]uint64, 0, len(kmers)/2)
+
+	// Walk the sorted slice run by run: [start, end) holds one distinct value.
+	for start, end := 0, 0; start < len(kmers); start = end {
+		val := kmers[start]
+		for end = start + 1; end < len(kmers) && kmers[end] == val; end++ {
+		}
+		count := end - start
+
+		spectrum[count]++
+		if top != nil {
+			top.Add(val, count)
+		}
+
+		if count < lo || (hi > 0 && count > hi) {
+			continue
+		}
+		if lowComplexity == nil || lowComplexity.Accept(val) {
+			kept = append(kept, val)
+		}
+	}
+
+	return kept, spectrum, top
+}
+
+// writePartitionKdi creates the .kdi file at kdiPath and writes kmers to it
+// in slice order (I/O-bound). It returns the count reported by the writer.
+func (b *KmerSetGroupBuilder) writePartitionKdi(kdiPath string, kmers []uint64) (uint64, error) {
+	out, err := NewKdiWriter(kdiPath)
+	if err != nil {
+		return 0, err
+	}
+
+	for i := range kmers {
+		if err := out.Write(kmers[i]); err != nil {
+			out.Close()
+			return 0, err
+		}
+	}
+	written := out.Count()
+	return written, out.Close()
+}
+
+// writeEmptyKdi creates a .kdi file containing no k-mer and sets *count to 0.
+func (b *KmerSetGroupBuilder) writeEmptyKdi(path string, count *uint64) error {
+	out, err := NewKdiWriter(path)
+	if err != nil {
+		return err
+	}
+	*count = 0
+	return out.Close()
+}
--- a/pkg/obikmer/kmer_set_builder_test.go
+++ b/pkg/obikmer/kmer_set_builder_test.go
@@ -0,0 +1,278 @@
+package obikmer
+
+import (
+	"sort"
+	"testing"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+)
+
+// TestBuilderBasic builds a single-set index from one sequence and checks
+// the group parameters, the k-mer count and the content of the iterator
+// against the canonical k-mers computed directly from the sequence.
+func TestBuilderBasic(t *testing.T) {
+	dir := t.TempDir()
+
+	builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "")
+	builder.AddSequence(0, seq)
+
+	ksg, err := builder.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Group-level parameters must echo the builder arguments.
+	if got := ksg.K(); got != 15 {
+		t.Fatalf("K() = %d, want 15", got)
+	}
+	if got := ksg.M(); got != 7 {
+		t.Fatalf("M() = %d, want 7", got)
+	}
+	if got := ksg.Partitions(); got != 64 {
+		t.Fatalf("Partitions() = %d, want 64", got)
+	}
+	if got := ksg.Size(); got != 1 {
+		t.Fatalf("Size() = %d, want 1", got)
+	}
+	if ksg.Len(0) == 0 {
+		t.Fatal("Len(0) = 0, expected some k-mers")
+	}
+
+	// Reference: canonical k-mers of the sequence, sorted and deduplicated.
+	var all []uint64
+	for kmer := range IterCanonicalKmers(seq.Sequence(), 15) {
+		all = append(all, kmer)
+	}
+	sort.Slice(all, func(a, b int) bool { return all[a] < all[b] })
+	unique := make([]uint64, 0, len(all))
+	for _, v := range all {
+		if n := len(unique); n == 0 || unique[n-1] != v {
+			unique = append(unique, v)
+		}
+	}
+
+	if got := ksg.Len(0); got != uint64(len(unique)) {
+		t.Fatalf("Len(0) = %d, expected %d unique k-mers", got, len(unique))
+	}
+
+	// Drain the iterator.
+	var streamed []uint64
+	for kmer := range ksg.Iterator(0) {
+		streamed = append(streamed, kmer)
+	}
+
+	// The iterator does a k-way merge, so it must be strictly increasing.
+	for pos := 1; pos < len(streamed); pos++ {
+		prev, cur := streamed[pos-1], streamed[pos]
+		if cur <= prev {
+			t.Fatalf("iterator not sorted at %d: %d <= %d", pos, cur, prev)
+		}
+	}
+	if len(streamed) != len(unique) {
+		t.Fatalf("iterator yielded %d k-mers, expected %d", len(streamed), len(unique))
+	}
+	for pos := range streamed {
+		if streamed[pos] != unique[pos] {
+			t.Fatalf("iterator kmer %d: got %d, want %d", pos, streamed[pos], unique[pos])
+		}
+	}
+}
+
+// TestBuilderMultipleSequences feeds several sequences into the same set
+// and checks that the finalized set is not empty.
+func TestBuilderMultipleSequences(t *testing.T) {
+	dir := t.TempDir()
+
+	builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	inputs := []string{
+		"ACGTACGTACGTACGTACGTACGTACGT",
+		"TTTTTTTTTTTTTTTTTTTTTTTTT",
+		"GGGGGGGGGGGGGGGGGGGGGGGG",
+	}
+	// Every sequence goes into set 0.
+	for idx := range inputs {
+		builder.AddSequence(0, obiseq.NewBioSequence("", []byte(inputs[idx]), ""))
+	}
+
+	ksg, err := builder.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if n := ksg.Len(0); n == 0 {
+		t.Fatal("expected k-mers after multiple sequences")
+	}
+}
+
+// TestBuilderFrequencyFilter checks that k-mers reaching the minimum
+// frequency are all kept by the builder.
+func TestBuilderFrequencyFilter(t *testing.T) {
+	dir := t.TempDir()
+
+	builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64,
+		WithMinFrequency(3))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// The same sequence is added three times, so every one of its k-mers
+	// is seen at least 3 times and must pass the minFreq=3 filter.
+	seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "")
+	for pass := 0; pass < 3; pass++ {
+		builder.AddSequence(0, seq)
+	}
+
+	ksg, err := builder.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Reference: the distinct canonical k-mers of the sequence.
+	var all []uint64
+	for kmer := range IterCanonicalKmers(seq.Sequence(), 15) {
+		all = append(all, kmer)
+	}
+	sort.Slice(all, func(a, b int) bool { return all[a] < all[b] })
+	unique := make([]uint64, 0, len(all))
+	for _, v := range all {
+		if n := len(unique); n == 0 || unique[n-1] != v {
+			unique = append(unique, v)
+		}
+	}
+
+	if got := ksg.Len(0); got != uint64(len(unique)) {
+		t.Fatalf("Len(0) = %d, expected %d (all k-mers at freq=3)", got, len(unique))
+	}
+}
+
+// TestBuilderFrequencyFilterRejects checks that k-mers seen fewer times
+// than the minimum frequency are all discarded.
+func TestBuilderFrequencyFilterRejects(t *testing.T) {
+	dir := t.TempDir()
+
+	builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64,
+		WithMinFrequency(5))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// The sequence is meant to be non-repetitive, so that each canonical
+	// k-mer appears once per pass. Two passes give freq=2 per k-mer, which
+	// is below minFreq=5: everything must be rejected.
+	seq := obiseq.NewBioSequence("test",
+		[]byte("ACGATCGATCTAGCTAGCTGATCGATCGATCG"), "")
+	for pass := 0; pass < 2; pass++ {
+		builder.AddSequence(0, seq)
+	}
+
+	ksg, err := builder.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if got := ksg.Len(0); got != 0 {
+		t.Fatalf("Len(0) = %d, expected 0 (all k-mers at freq=2 < minFreq=5)", got)
+	}
+}
+
+func TestBuilderMultipleSets(t *testing.T) {
+	dir := t.TempDir()
+
+	builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 3, 64)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	seqs := []string{
+		"ACGTACGTACGTACGTACGTACGTACGT",
+		"TTTTTTTTTTTTTTTTTTTTTTTTT",
+		"GGGGGGGGGGGGGGGGGGGGGGGG",
+	}
+	for i, s := range seqs {
+		seq := obiseq.NewBioSequence("", []byte(s), "")
+		builder.AddSequence(i, seq)
+	}
+
+	ksg, err := builder.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if ksg.Size() != 3 {
+		t.Fatalf("Size() = %d, want 3", ksg.Size())
+	}
+	for s := 0; s < 3; s++ {
+		if ksg.Len(s) == 0 {
+			t.Fatalf("Len(%d) = 0, expected some k-mers", s)
+		}
+	}
+}
+
+// TestBuilderOpenRoundTrip checks that an index reopened from disk with
+// OpenKmerSetGroup reports the same parameters and k-mer count as the
+// group returned by the builder.
+func TestBuilderOpenRoundTrip(t *testing.T) {
+	dir := t.TempDir()
+
+	builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	builder.AddSequence(0,
+		obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), ""))
+
+	built, err := builder.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Load the same directory back from its on-disk metadata.
+	reopened, err := OpenKmerSetGroup(dir)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if got, want := reopened.K(), built.K(); got != want {
+		t.Fatalf("K mismatch: %d vs %d", got, want)
+	}
+	if got, want := reopened.M(), built.M(); got != want {
+		t.Fatalf("M mismatch: %d vs %d", got, want)
+	}
+	if got, want := reopened.Partitions(), built.Partitions(); got != want {
+		t.Fatalf("Partitions mismatch: %d vs %d", got, want)
+	}
+	if got, want := reopened.Len(0), built.Len(0); got != want {
+		t.Fatalf("Len mismatch: %d vs %d", got, want)
+	}
+}
+
+// TestBuilderAttributes checks that the group identifier and a user
+// attribute survive a SaveMetadata / OpenKmerSetGroup round trip.
+func TestBuilderAttributes(t *testing.T) {
+	dir := t.TempDir()
+
+	builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "")
+	builder.AddSequence(0, seq)
+
+	ksg, err := builder.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	ksg.SetId("my_index")
+	ksg.SetAttribute("organism", "test")
+	// A failed save must fail the test here rather than surface later as a
+	// confusing attribute mismatch on the reopened group.
+	if err := ksg.SaveMetadata(); err != nil {
+		t.Fatalf("SaveMetadata: %v", err)
+	}
+
+	// Reopen and check
+	ksg2, err := OpenKmerSetGroup(dir)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if ksg2.Id() != "my_index" {
+		t.Fatalf("Id() = %q, want %q", ksg2.Id(), "my_index")
+	}
+	if !ksg2.HasAttribute("organism") {
+		t.Fatal("expected 'organism' attribute")
+	}
+	v, ok := ksg2.GetAttribute("organism")
+	if !ok || v != "test" {
+		t.Fatalf("organism = %v, want 'test'", v)
+	}
+}
--- a/pkg/obikmer/kmer_set_disk.go
+++ b/pkg/obikmer/kmer_set_disk.go
@@ -0,0 +1,944 @@
+package obikmer
+
+import (
+	"fmt"
+	"io"
+	"iter"
+	"os"
+	"path"
+	"path/filepath"
+	"sort"
+	"sync"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
+	"github.com/pelletier/go-toml/v2"
+)
+
+// MetadataFormat represents the metadata serialization format.
+// Currently only TOML is used for disk-based indices, but the type
+// is kept for backward compatibility with CLI options.
+type MetadataFormat int
+
+// Known metadata formats. FormatTOML is the zero value of MetadataFormat.
+const (
+	FormatTOML MetadataFormat = iota // String() returns "toml"
+	FormatYAML                       // String() returns "yaml"
+	FormatJSON                       // String() returns "json"
+)
+
+// String returns the file extension associated with the format: "yaml"
+// for FormatYAML, "json" for FormatJSON, and "toml" for FormatTOML as well
+// as for any unknown value.
+func (f MetadataFormat) String() string {
+	if f == FormatYAML {
+		return "yaml"
+	}
+	if f == FormatJSON {
+		return "json"
+	}
+	// FormatTOML and every unrecognized value.
+	return "toml"
+}
+
+// KmerSetGroup is a disk-based collection of N k-mer sets sharing the same
+// k, m, and partition count P. After construction (via KmerSetGroupBuilder),
+// it is immutable and all operations are streaming (partition by partition).
+//
+// A KmerSetGroup with Size()==1 is effectively a KmerSet (singleton).
+//
+// On disk, the group lives in a root directory holding a metadata.toml file
+// and one set_<i> sub-directory per set, each containing the
+// part_<pppp>.kdi partition files of that set.
+type KmerSetGroup struct {
+	path         string                   // root directory
+	id           string                   // user-assigned identifier
+	k            int                      // k-mer size
+	m            int                      // minimizer size
+	partitions   int                      // number of partitions P
+	n            int                      // number of sets N
+	setsIDs      []string                 // IDs of individual sets
+	counts       []uint64                 // total k-mer count per set (sum over partitions)
+	setsMetadata []map[string]interface{} // per-set user metadata
+	Metadata     map[string]interface{}   // group-level user metadata
+}
+
+// diskMetadata is the TOML-serializable structure for metadata.toml.
+// It mirrors the fields of KmerSetGroup; fields tagged omitempty are left
+// out of the file when empty and are rebuilt by OpenKmerSetGroup.
+type diskMetadata struct {
+	ID           string                   `toml:"id,omitempty"`            // group identifier
+	K            int                      `toml:"k"`                       // k-mer size
+	M            int                      `toml:"m"`                       // minimizer size
+	Partitions   int                      `toml:"partitions"`              // number of partitions P
+	Type         string                   `toml:"type"`                    // written as "KmerSetGroup" by saveMetadata
+	Size         int                      `toml:"size"`                    // number of sets N
+	SetsIDs      []string                 `toml:"sets_ids,omitempty"`      // per-set identifiers
+	Counts       []uint64                 `toml:"counts,omitempty"`        // per-set k-mer counts
+	SetsMetadata []map[string]interface{} `toml:"sets_metadata,omitempty"` // per-set user metadata
+	UserMetadata map[string]interface{}   `toml:"user_metadata,omitempty"` // group-level user metadata
+}
+
+// OpenKmerSetGroup opens a finalized index directory in read-only mode.
+//
+// It decodes <directory>/metadata.toml and rebuilds the in-memory group.
+// Optional sections missing from the file are reconstructed:
+//   - group metadata defaults to an empty map;
+//   - set IDs default to empty strings;
+//   - per-set metadata defaults to empty maps;
+//   - per-set counts are recomputed by summing the Count() of every
+//     partition file that can be opened (unreadable partitions are skipped).
+//
+// Per-set slices shorter than the declared size are padded (or, for
+// counts, recomputed) so that accessors indexing them by set number cannot
+// go out of range. An error is returned when the metadata file cannot be
+// opened or decoded, or when it declares a negative size or partition count.
+func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
+	metaPath := filepath.Join(directory, "metadata.toml")
+	f, err := os.Open(metaPath)
+	if err != nil {
+		return nil, fmt.Errorf("obikmer: open metadata: %w", err)
+	}
+	defer f.Close()
+
+	var meta diskMetadata
+	if err := toml.NewDecoder(f).Decode(&meta); err != nil {
+		return nil, fmt.Errorf("obikmer: decode metadata: %w", err)
+	}
+	// Negative values would make the allocations below (and the channel
+	// allocation in Contains) panic; report the corrupt file instead.
+	if meta.Size < 0 || meta.Partitions < 0 {
+		return nil, fmt.Errorf("obikmer: invalid metadata in %s: size=%d, partitions=%d",
+			metaPath, meta.Size, meta.Partitions)
+	}
+
+	ksg := &KmerSetGroup{
+		path:         directory,
+		id:           meta.ID,
+		k:            meta.K,
+		m:            meta.M,
+		partitions:   meta.Partitions,
+		n:            meta.Size,
+		setsIDs:      meta.SetsIDs,
+		counts:       meta.Counts,
+		setsMetadata: meta.SetsMetadata,
+		Metadata:     meta.UserMetadata,
+	}
+	if ksg.Metadata == nil {
+		ksg.Metadata = make(map[string]interface{})
+	}
+
+	if ksg.setsIDs == nil {
+		ksg.setsIDs = make([]string, ksg.n)
+	}
+	// Pad a truncated ID list so that setsIDs[i] is valid for every set.
+	for len(ksg.setsIDs) < ksg.n {
+		ksg.setsIDs = append(ksg.setsIDs, "")
+	}
+
+	if ksg.setsMetadata == nil {
+		ksg.setsMetadata = make([]map[string]interface{}, 0, ksg.n)
+	}
+	// Same for per-set metadata: one map per set, created on demand.
+	for len(ksg.setsMetadata) < ksg.n {
+		ksg.setsMetadata = append(ksg.setsMetadata, make(map[string]interface{}))
+	}
+
+	if ksg.counts == nil || len(ksg.counts) < ksg.n {
+		// Compute counts by scanning partitions
+		ksg.counts = make([]uint64, ksg.n)
+		for s := 0; s < ksg.n; s++ {
+			for p := 0; p < ksg.partitions; p++ {
+				r, err := NewKdiReader(ksg.partitionPath(s, p))
+				if err != nil {
+					// Best effort: a partition that cannot be opened
+					// contributes nothing to the count.
+					continue
+				}
+				ksg.counts[s] += r.Count()
+				r.Close()
+			}
+		}
+	}
+
+	return ksg, nil
+}
+
+// NewFilteredKmerSetGroup wraps pre-computed data into a KmerSetGroup.
+// Used by the filter command to construct a new group after filtering
+// partitions.
+//
+// The slices are stored as given (not copied), the group-level metadata
+// starts empty, nothing is written to disk, and the returned error is
+// always nil.
+func NewFilteredKmerSetGroup(
+	directory string, k, m, partitions, n int,
+	setsIDs []string, counts []uint64,
+	setsMetadata []map[string]interface{},
+) (*KmerSetGroup, error) {
+	return &KmerSetGroup{
+		path:         directory,
+		k:            k,
+		m:            m,
+		partitions:   partitions,
+		n:            n,
+		counts:       counts,
+		setsIDs:      setsIDs,
+		setsMetadata: setsMetadata,
+		Metadata:     map[string]interface{}{},
+	}, nil
+}
+
+// SaveMetadata writes the metadata.toml file. This is useful after
+// modifying attributes or IDs on an already-finalized index, since the
+// setters only change the in-memory group. It is the exported wrapper
+// around saveMetadata and returns its error.
+func (ksg *KmerSetGroup) SaveMetadata() error {
+	return ksg.saveMetadata()
+}
+
+// saveMetadata writes the metadata.toml file (internal).
+//
+// The file <path>/metadata.toml is created (truncating any previous
+// content) and filled with the TOML encoding of the group fields. The
+// first error among create, encode and close is returned.
+func (ksg *KmerSetGroup) saveMetadata() error {
+	meta := diskMetadata{
+		ID:           ksg.id,
+		K:            ksg.k,
+		M:            ksg.m,
+		Partitions:   ksg.partitions,
+		Type:         "KmerSetGroup",
+		Size:         ksg.n,
+		SetsIDs:      ksg.setsIDs,
+		Counts:       ksg.counts,
+		SetsMetadata: ksg.setsMetadata,
+		UserMetadata: ksg.Metadata,
+	}
+
+	metaPath := filepath.Join(ksg.path, "metadata.toml")
+	f, err := os.Create(metaPath)
+	if err != nil {
+		return err
+	}
+
+	if err := toml.NewEncoder(f).Encode(meta); err != nil {
+		// The encoding error is the one worth reporting.
+		f.Close()
+		return err
+	}
+
+	// For a written file, Close can report a delayed write failure, so its
+	// error must not be dropped.
+	return f.Close()
+}
+
+// partitionPath builds the location of the .kdi file holding partition
+// partIndex of set setIndex: <root>/set_<setIndex>/part_<partIndex>.kdi,
+// the partition number being zero-padded to four digits.
+func (ksg *KmerSetGroup) partitionPath(setIndex, partIndex int) string {
+	setDir := fmt.Sprintf("set_%d", setIndex)
+	fileName := fmt.Sprintf("part_%04d.kdi", partIndex)
+	return filepath.Join(ksg.path, setDir, fileName)
+}
+
+// Path returns the root directory of the index, i.e. the directory that
+// holds metadata.toml and the set_<i> sub-directories.
+func (ksg *KmerSetGroup) Path() string {
+	return ksg.path
+}
+
+// K returns the k-mer size shared by all sets of the group.
+func (ksg *KmerSetGroup) K() int {
+	return ksg.k
+}
+
+// M returns the minimizer size shared by all sets of the group.
+func (ksg *KmerSetGroup) M() int {
+	return ksg.m
+}
+
+// Partitions returns the number of partitions P, i.e. the number of
+// part_<pppp>.kdi files expected in each set directory.
+func (ksg *KmerSetGroup) Partitions() int {
+	return ksg.partitions
+}
+
+// Size returns the number of sets N held by the group.
+func (ksg *KmerSetGroup) Size() int {
+	return ksg.n
+}
+
+// Id returns the group identifier (empty if none was assigned).
+func (ksg *KmerSetGroup) Id() string {
+	return ksg.id
+}
+
+// SetId sets the group identifier in memory only. The change is not
+// written to disk: call SaveMetadata afterwards to persist it.
+func (ksg *KmerSetGroup) SetId(id string) {
+	ksg.id = id
+}
+
+// Len reports a number of k-mers taken from the per-set counts.
+//
+// Called without argument, it returns the sum over all sets. Called with
+// a set index, it returns the count of that set, or 0 when the index is
+// out of range. Only the first argument is considered.
+func (ksg *KmerSetGroup) Len(setIndex ...int) uint64 {
+	if len(setIndex) > 0 {
+		if set := setIndex[0]; set >= 0 && set < ksg.n {
+			return ksg.counts[set]
+		}
+		return 0
+	}
+
+	var sum uint64
+	for i := range ksg.counts {
+		sum += ksg.counts[i]
+	}
+	return sum
+}
+
+// Contains checks if a k-mer is present in the specified set.
+// Uses the .kdx sparse index (if available) for fast seeking within
+// each partition, then a short linear scan of at most `stride` entries.
+// All partitions are searched in parallel since the k-mer's partition
+// is not known without its minimizer context.
+func (ksg *KmerSetGroup) Contains(setIndex int, kmer uint64) bool {
+	if setIndex < 0 || setIndex >= ksg.n {
+		return false
+	}
+
+	type result struct {
+		found bool
+	}
+	ch := make(chan result, ksg.partitions)
+
+	for p := 0; p < ksg.partitions; p++ {
+		go func(part int) {
+			r, err := NewKdiIndexedReader(ksg.partitionPath(setIndex, part))
+			if err != nil {
+				ch <- result{false}
+				return
+			}
+			defer r.Close()
+
+			// Use index to jump near the target
+			if err := r.SeekTo(kmer); err != nil {
+				ch <- result{false}
+				return
+			}
+
+			// Linear scan from the seek position
+			for {
+				v, ok := r.Next()
+				if !ok {
+					ch <- result{false}
+					return
+				}
+				if v == kmer {
+					ch <- result{true}
+					return
+				}
+				if v > kmer {
+					ch <- result{false}
+					return
+				}
+			}
+		}(p)
+	}
+
+	for i := 0; i < ksg.partitions; i++ {
+		res := <-ch
+		if res.found {
+			// Drain remaining goroutines
+			go func() {
+				for j := i + 1; j < ksg.partitions; j++ {
+					<-ch
+				}
+			}()
+			return true
+		}
+	}
+	return false
+}
+
+// Iterator returns an iterator over all k-mers of the specified set.
+//
+// A reader is opened on every partition of the set; partitions that cannot
+// be opened or that are empty are skipped. The remaining readers are
+// combined with NewKWayMerge and the k-mers are yielded in the order
+// produced by that merge (the package tests expect a strictly increasing
+// sequence). The readers are released through the merge's Close when the
+// iteration ends or is interrupted by the consumer.
+//
+// An out-of-range setIndex yields nothing.
+func (ksg *KmerSetGroup) Iterator(setIndex int) iter.Seq[uint64] {
+	return func(yield func(uint64) bool) {
+		if setIndex < 0 || setIndex >= ksg.n {
+			return
+		}
+
+		// Collect one reader per usable partition.
+		sources := make([]*KdiReader, 0, ksg.partitions)
+		for part := 0; part < ksg.partitions; part++ {
+			reader, err := NewKdiReader(ksg.partitionPath(setIndex, part))
+			switch {
+			case err != nil:
+				// Unreadable partition: ignored.
+			case reader.Count() == 0:
+				reader.Close()
+			default:
+				sources = append(sources, reader)
+			}
+		}
+		if len(sources) == 0 {
+			return
+		}
+
+		merged := NewKWayMerge(sources)
+		defer merged.Close()
+
+		for kmer, _, more := merged.Next(); more; kmer, _, more = merged.Next() {
+			if !yield(kmer) {
+				return
+			}
+		}
+	}
+}
+
+// ==============================
+// Attribute API (compatible with old API)
+// ==============================
+
+// HasAttribute checks if a metadata key exists in the group-level user
+// metadata. Unlike GetAttribute, it does not treat "id" and "k" specially:
+// only keys stored in Metadata are reported.
+func (ksg *KmerSetGroup) HasAttribute(key string) bool {
+	_, ok := ksg.Metadata[key]
+	return ok
+}
+
+// GetAttribute looks up an attribute of the group.
+//
+// The keys "id" and "k" are served from the group itself (identifier and
+// k-mer size) and are always reported as present. Any other key is read
+// from the group-level user metadata, the boolean telling whether it
+// exists.
+func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool) {
+	if key == "id" {
+		return ksg.Id(), true
+	}
+	if key == "k" {
+		return ksg.K(), true
+	}
+	stored, found := ksg.Metadata[key]
+	return stored, found
+}
+
+// SetAttribute sets an attribute of the group, in memory only.
+//
+// The key "id" updates the group identifier and requires a string value;
+// any other type panics. The key "k" is immutable and always panics. Every
+// other key is stored in the group-level user metadata.
+func (ksg *KmerSetGroup) SetAttribute(key string, value interface{}) {
+	if key == "k" {
+		panic("k is immutable")
+	}
+	if key != "id" {
+		ksg.Metadata[key] = value
+		return
+	}
+
+	id, isString := value.(string)
+	if !isString {
+		panic(fmt.Sprintf("id must be a string, got %T", value))
+	}
+	ksg.SetId(id)
+}
+
+// DeleteAttribute removes a key from the group-level user metadata, in
+// memory only. Deleting a missing key is a no-op.
+func (ksg *KmerSetGroup) DeleteAttribute(key string) {
+	delete(ksg.Metadata, key)
+}
+
+// GetIntAttribute returns an attribute as int.
+func (ksg *KmerSetGroup) GetIntAttribute(key string) (int, bool) {
+	v, ok := ksg.GetAttribute(key)
+	if !ok {
+		return 0, false
+	}
+	switch val := v.(type) {
+	case int:
+		return val, true
+	case int64:
+		return int(val), true
+	case float64:
+		return int(val), true
+	}
+	return 0, false
+}
+
+// GetStringAttribute returns an attribute as a string.
+//
+// String values are returned unchanged; any other value is formatted with
+// the %v verb. The boolean is false only when the attribute is missing.
+func (ksg *KmerSetGroup) GetStringAttribute(key string) (string, bool) {
+	raw, found := ksg.GetAttribute(key)
+	if !found {
+		return "", false
+	}
+	text, isString := raw.(string)
+	if !isString {
+		text = fmt.Sprintf("%v", raw)
+	}
+	return text, true
+}
+
+// ==============================
+// Jaccard metrics (streaming, disk-based)
+// ==============================
+
+// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix
+// for all sets in the group.
+//
+// Partitions are processed independently, one goroutine per partition.
+// For a given partition, the k-mers of every set are read into memory
+// and, for each pair of sets (i, j) with i < j, the number of shared k-mers
+// is counted with a linear merge of the two lists. This relies on each
+// partition file yielding its k-mers in strictly increasing order. The
+// per-partition intersection and union sizes are then summed over all
+// partitions.
+//
+// The distance stored for a pair is 1 - |A∩B| / |A∪B|, or 1.0 when the
+// union is empty. Only the cells (i, j) with i < j are set explicitly.
+// Rows are labelled with the set IDs, falling back to "set_<i>" for sets
+// without an ID. A partition file that cannot be opened is treated as
+// empty.
+func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix {
+	n := ksg.n
+	labels := make([]string, n)
+	for i := 0; i < n; i++ {
+		if i < len(ksg.setsIDs) && ksg.setsIDs[i] != "" {
+			labels[i] = ksg.setsIDs[i]
+		} else {
+			labels[i] = fmt.Sprintf("set_%d", i)
+		}
+	}
+
+	dm := obidist.NewDistMatrixWithLabels(labels)
+
+	// Accumulate intersection and union counts (upper triangle only)
+	intersections := make([][]uint64, n)
+	unions := make([][]uint64, n)
+	for i := 0; i < n; i++ {
+		intersections[i] = make([]uint64, n)
+		unions[i] = make([]uint64, n)
+	}
+
+	// loadPartition reads all k-mers of one partition of one set. Each file
+	// is opened once and closed as soon as it has been read.
+	loadPartition := func(set, part int) []uint64 {
+		r, err := NewKdiReader(ksg.partitionPath(set, part))
+		if err != nil {
+			return nil
+		}
+		defer r.Close()
+
+		if r.Count() == 0 {
+			return nil
+		}
+		kmers := make([]uint64, 0, r.Count())
+		for {
+			v, ok := r.Next()
+			if !ok {
+				break
+			}
+			kmers = append(kmers, v)
+		}
+		return kmers
+	}
+
+	// sharedCount counts the values present in both sorted lists.
+	sharedCount := func(a, b []uint64) uint64 {
+		var shared uint64
+		ai, bi := 0, 0
+		for ai < len(a) && bi < len(b) {
+			switch {
+			case a[ai] == b[bi]:
+				shared++
+				ai++
+				bi++
+			case a[ai] < b[bi]:
+				ai++
+			default:
+				bi++
+			}
+		}
+		return shared
+	}
+
+	// Process partition by partition
+	var mu sync.Mutex
+	var wg sync.WaitGroup
+
+	for p := 0; p < ksg.partitions; p++ {
+		wg.Add(1)
+		go func(part int) {
+			defer wg.Done()
+
+			// Read this partition for every set
+			setKmers := make([][]uint64, n)
+			for s := 0; s < n; s++ {
+				setKmers[s] = loadPartition(s, part)
+			}
+
+			// Count pairwise intersections and unions outside the lock
+			localInter := make([][]uint64, n)
+			localUnion := make([][]uint64, n)
+			for i := 0; i < n; i++ {
+				localInter[i] = make([]uint64, n)
+				localUnion[i] = make([]uint64, n)
+			}
+			for i := 0; i < n; i++ {
+				for j := i + 1; j < n; j++ {
+					a, b := setKmers[i], setKmers[j]
+					inter := sharedCount(a, b)
+					localInter[i][j] = inter
+					localUnion[i][j] = uint64(len(a)) + uint64(len(b)) - inter
+				}
+			}
+
+			// Merge the partition totals into the shared accumulators
+			mu.Lock()
+			for i := 0; i < n; i++ {
+				for j := i + 1; j < n; j++ {
+					intersections[i][j] += localInter[i][j]
+					unions[i][j] += localUnion[i][j]
+				}
+			}
+			mu.Unlock()
+		}(p)
+	}
+	wg.Wait()
+
+	// Compute distances from accumulated counts
+	for i := 0; i < n-1; i++ {
+		for j := i + 1; j < n; j++ {
+			u := unions[i][j]
+			if u == 0 {
+				dm.Set(i, j, 1.0)
+			} else {
+				dm.Set(i, j, 1.0-float64(intersections[i][j])/float64(u))
+			}
+		}
+	}
+
+	return dm
+}
+
+// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix.
+//
+// The similarities are derived from JaccardDistanceMatrix: every cell
+// (i, j) with i < j receives 1 minus the corresponding distance. Rows are
+// labelled with the set IDs, falling back to "set_<i>" for sets without an
+// ID.
+func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
+	size := ksg.n
+
+	names := make([]string, size)
+	for idx := range names {
+		if idx >= len(ksg.setsIDs) || ksg.setsIDs[idx] == "" {
+			names[idx] = fmt.Sprintf("set_%d", idx)
+			continue
+		}
+		names[idx] = ksg.setsIDs[idx]
+	}
+
+	// Reuse distance computation
+	distances := ksg.JaccardDistanceMatrix()
+	similarities := obidist.NewSimilarityMatrixWithLabels(names)
+
+	// Fill the upper triangle, column by column.
+	for j := 1; j < size; j++ {
+		for i := 0; i < j; i++ {
+			similarities.Set(i, j, 1.0-distances.Get(i, j))
+		}
+	}
+
+	return similarities
+}
+
+// ==============================
+// Set ID accessors
+// ==============================
+
+// SetsIDs returns a copy of the per-set string identifiers, in set order.
+// Modifying the returned slice does not affect the group.
+func (ksg *KmerSetGroup) SetsIDs() []string {
+	out := make([]string, len(ksg.setsIDs))
+	copy(out, ksg.setsIDs)
+	return out
+}
+
+// SetIDOf gives the string ID of the set stored at the given index, or ""
+// when the index does not designate a set of the group.
+func (ksg *KmerSetGroup) SetIDOf(index int) string {
+	if index >= 0 && index < ksg.n {
+		return ksg.setsIDs[index]
+	}
+	return ""
+}
+
+// SetSetID assigns a string ID to the set stored at the given index, in
+// memory only. An out-of-range index is silently ignored.
+func (ksg *KmerSetGroup) SetSetID(index int, id string) {
+	if index < 0 || index >= ksg.n {
+		return
+	}
+	ksg.setsIDs[index] = id
+}
+
+// IndexOfSetID looks for the first set whose ID equals id and returns its
+// numeric index, or -1 when no set carries that ID.
+func (ksg *KmerSetGroup) IndexOfSetID(id string) int {
+	for pos := 0; pos < len(ksg.setsIDs); pos++ {
+		if ksg.setsIDs[pos] == id {
+			return pos
+		}
+	}
+	return -1
+}
+
+// MatchSetIDs resolves glob patterns against the set IDs.
+//
+// Every pattern is tested against every set ID with path.Match (which
+// supports *, ? and [...] patterns). The result lists, in ascending order
+// and without duplicates, the indices of the sets matched by at least one
+// pattern. An error is returned as soon as path.Match reports a malformed
+// pattern.
+func (ksg *KmerSetGroup) MatchSetIDs(patterns []string) ([]int, error) {
+	hits := map[int]bool{}
+	for _, glob := range patterns {
+		for idx := range ksg.setsIDs {
+			ok, err := path.Match(glob, ksg.setsIDs[idx])
+			if err != nil {
+				return nil, fmt.Errorf("obikmer: invalid glob pattern %q: %w", glob, err)
+			}
+			if !ok {
+				continue
+			}
+			hits[idx] = true
+		}
+	}
+
+	// Map iteration order is random: sort before returning.
+	indices := make([]int, 0, len(hits))
+	for idx := range hits {
+		indices = append(indices, idx)
+	}
+	sort.Ints(indices)
+	return indices, nil
+}
+
+// ==============================
+// Per-set metadata accessors
+// ==============================
+
+// GetSetMetadata reads one key of the metadata attached to a set. The
+// boolean is false when the key is absent or when setIndex is out of
+// range.
+func (ksg *KmerSetGroup) GetSetMetadata(setIndex int, key string) (interface{}, bool) {
+	if setIndex >= 0 && setIndex < ksg.n {
+		value, found := ksg.setsMetadata[setIndex][key]
+		return value, found
+	}
+	return nil, false
+}
+
+// SetSetMetadata stores one key of the metadata attached to a set, in
+// memory only. The metadata map of the set is created on first use. An
+// out-of-range setIndex is silently ignored.
+func (ksg *KmerSetGroup) SetSetMetadata(setIndex int, key string, value interface{}) {
+	if setIndex < 0 || setIndex >= ksg.n {
+		return
+	}
+	attrs := ksg.setsMetadata[setIndex]
+	if attrs == nil {
+		attrs = make(map[string]interface{})
+		ksg.setsMetadata[setIndex] = attrs
+	}
+	attrs[key] = value
+}
+
+// DeleteSetMetadata drops one key from the metadata attached to a set, in
+// memory only. A missing key or an out-of-range setIndex is a no-op.
+func (ksg *KmerSetGroup) DeleteSetMetadata(setIndex int, key string) {
+	if setIndex >= 0 && setIndex < ksg.n {
+		delete(ksg.setsMetadata[setIndex], key)
+	}
+}
+
+// AllSetMetadata returns a shallow copy of the metadata attached to a set:
+// the map is new, the values are shared with the group. It returns nil
+// when setIndex is out of range.
+func (ksg *KmerSetGroup) AllSetMetadata(setIndex int) map[string]interface{} {
+	if setIndex < 0 || setIndex >= ksg.n {
+		return nil
+	}
+	source := ksg.setsMetadata[setIndex]
+	clone := make(map[string]interface{}, len(source))
+	for name := range source {
+		clone[name] = source[name]
+	}
+	return clone
+}
+
+// ==============================
+// Exported partition path and compatibility
+// ==============================
+
+// PartitionPath returns the file path for partition partIndex of set setIndex.
+// It is the exported wrapper around partitionPath; the indices are not
+// validated and the file is not checked for existence.
+func (ksg *KmerSetGroup) PartitionPath(setIndex, partIndex int) string {
+	return ksg.partitionPath(setIndex, partIndex)
+}
+
+// SpectrumPath returns the path to the spectrum.bin file for the given set,
+// i.e. <root>/set_<setIndex>/spectrum.bin. The index is not validated and
+// the file is not checked for existence.
+func (ksg *KmerSetGroup) SpectrumPath(setIndex int) string {
+	return filepath.Join(ksg.path, fmt.Sprintf("set_%d", setIndex), "spectrum.bin")
+}
+
+// Spectrum loads the k-mer frequency spectrum stored for the given set.
+//
+// It returns nil, nil when the spectrum file does not exist. In every
+// other case (including a failed stat for another reason) the result of
+// ReadSpectrum on the spectrum path is returned.
+func (ksg *KmerSetGroup) Spectrum(setIndex int) (*KmerSpectrum, error) {
+	spectrumFile := ksg.SpectrumPath(setIndex)
+	_, statErr := os.Stat(spectrumFile)
+	if os.IsNotExist(statErr) {
+		return nil, nil
+	}
+	return ReadSpectrum(spectrumFile)
+}
+
+// IsCompatibleWith tells whether the other group shares the k-mer size,
+// the minimizer size and the partition count of this group. The number of
+// sets is not compared.
+func (ksg *KmerSetGroup) IsCompatibleWith(other *KmerSetGroup) bool {
+	if ksg.k != other.k {
+		return false
+	}
+	if ksg.m != other.m {
+		return false
+	}
+	return ksg.partitions == other.partitions
+}
+
+// ==============================
+// Set management operations
+// ==============================
+
+// NewEmptyCompatible creates an empty KmerSetGroup (zero sets) at destDir
+// with the same k, m, and partitions as this group, and writes its
+// metadata.toml.
+//
+// The destination is meant not to exist yet. This is not enforced: the
+// directory is created with MkdirAll and an existing metadata.toml would be
+// overwritten.
+func (ksg *KmerSetGroup) NewEmptyCompatible(destDir string) (*KmerSetGroup, error) {
+	empty := &KmerSetGroup{
+		path:         destDir,
+		k:            ksg.k,
+		m:            ksg.m,
+		partitions:   ksg.partitions,
+		n:            0,
+		setsIDs:      []string{},
+		counts:       []uint64{},
+		setsMetadata: []map[string]interface{}{},
+		Metadata:     make(map[string]interface{}),
+	}
+
+	mkdirErr := os.MkdirAll(destDir, 0755)
+	if mkdirErr != nil {
+		return nil, fmt.Errorf("obikmer: create directory: %w", mkdirErr)
+	}
+
+	saveErr := empty.saveMetadata()
+	if saveErr != nil {
+		return nil, fmt.Errorf("obikmer: write metadata: %w", saveErr)
+	}
+
+	return empty, nil
+}
+
+// RemoveSetByID removes the set with the given ID from the group.
+// It deletes the set directory, renumbers all subsequent sets, and
+// updates the metadata on disk.
+//
+// An error is returned when the ID is unknown or when a filesystem
+// operation fails. The steps are not atomic: if a removal or a rename
+// fails, the function returns before touching the in-memory slices and
+// the metadata file, so the directories already processed no longer match
+// the recorded state.
+func (ksg *KmerSetGroup) RemoveSetByID(id string) error {
+	idx := ksg.IndexOfSetID(id)
+	if idx < 0 {
+		return fmt.Errorf("obikmer: set ID %q not found", id)
+	}
+
+	// Delete the set directory
+	setDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", idx))
+	if err := os.RemoveAll(setDir); err != nil {
+		return fmt.Errorf("obikmer: remove set directory: %w", err)
+	}
+
+	// Renumber subsequent sets: set_<i> becomes set_<i-1>, in ascending
+	// order so that each target name has just been freed.
+	for i := idx + 1; i < ksg.n; i++ {
+		oldDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i))
+		newDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i-1))
+		if err := os.Rename(oldDir, newDir); err != nil {
+			return fmt.Errorf("obikmer: rename set_%d to set_%d: %w", i, i-1, err)
+		}
+	}
+
+	// Update slices (entries after idx are shifted down in place)
+	ksg.setsIDs = append(ksg.setsIDs[:idx], ksg.setsIDs[idx+1:]...)
+	ksg.counts = append(ksg.counts[:idx], ksg.counts[idx+1:]...)
+	ksg.setsMetadata = append(ksg.setsMetadata[:idx], ksg.setsMetadata[idx+1:]...)
+	ksg.n--
+
+	return ksg.saveMetadata()
+}
+
+// CopySetsByIDTo copies sets identified by their IDs into a KmerSetGroup
+// at destDir. If destDir does not exist, a new compatible empty group is
+// created. If it exists, compatibility (k, m, partitions) is checked.
+// If a set ID already exists in the destination, an error is returned
+// unless force is true (in which case the existing set is replaced).
+// Per-set metadata travels with the set.
+//
+// Each copied set is appended at the end of the destination, so a set
+// replaced with force moves to the last position. The destination
+// metadata.toml is written once, after all sets have been copied: when an
+// error interrupts the copy, files already copied stay on disk but are not
+// recorded in the metadata written by this call.
+//
+// It returns the destination group, or an error when a source ID is
+// unknown, the destination is incompatible, an ID conflict is not forced,
+// or a filesystem operation fails.
+func (ksg *KmerSetGroup) CopySetsByIDTo(ids []string, destDir string, force bool) (*KmerSetGroup, error) {
+	// Resolve source IDs to indices
+	srcIndices := make([]int, len(ids))
+	for i, id := range ids {
+		idx := ksg.IndexOfSetID(id)
+		if idx < 0 {
+			return nil, fmt.Errorf("obikmer: source set ID %q not found", id)
+		}
+		srcIndices[i] = idx
+	}
+
+	// Open or create destination. The destination is considered existing
+	// when its metadata.toml can be stat'ed; any stat error (not only
+	// "does not exist") leads to the creation branch.
+	var dest *KmerSetGroup
+	metaPath := filepath.Join(destDir, "metadata.toml")
+	if _, err := os.Stat(metaPath); err == nil {
+		// Destination exists
+		dest, err = OpenKmerSetGroup(destDir)
+		if err != nil {
+			return nil, fmt.Errorf("obikmer: open destination: %w", err)
+		}
+		if !ksg.IsCompatibleWith(dest) {
+			return nil, fmt.Errorf("obikmer: incompatible groups: source (k=%d, m=%d, P=%d) vs dest (k=%d, m=%d, P=%d)",
+				ksg.k, ksg.m, ksg.partitions, dest.k, dest.m, dest.partitions)
+		}
+	} else {
+		// Create new destination
+		var err error
+		dest, err = ksg.NewEmptyCompatible(destDir)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	// Copy each set
+	for i, srcIdx := range srcIndices {
+		srcID := ids[i]
+
+		// Check for ID conflict in destination
+		existingIdx := dest.IndexOfSetID(srcID)
+		if existingIdx >= 0 {
+			if !force {
+				return nil, fmt.Errorf("obikmer: set ID %q already exists in destination (use force to replace)", srcID)
+			}
+			// Force: remove existing set in destination (this renumbers
+			// the destination sets and rewrites its metadata)
+			if err := dest.RemoveSetByID(srcID); err != nil {
+				return nil, fmt.Errorf("obikmer: remove existing set %q in destination: %w", srcID, err)
+			}
+		}
+
+		// Destination set index = current dest size
+		destIdx := dest.n
+
+		// Create destination set directory
+		destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", destIdx))
+		if err := os.MkdirAll(destSetDir, 0755); err != nil {
+			return nil, fmt.Errorf("obikmer: create dest set dir: %w", err)
+		}
+
+		// Copy all partition files and their .kdx indices. Every partition
+		// file of the source set must exist; a missing one aborts the copy.
+		for p := 0; p < ksg.partitions; p++ {
+			srcPath := ksg.partitionPath(srcIdx, p)
+			destPath := dest.partitionPath(destIdx, p)
+			if err := copyFile(srcPath, destPath); err != nil {
+				return nil, fmt.Errorf("obikmer: copy partition %d of set %q: %w", p, srcID, err)
+			}
+			// Copy .kdx index if it exists
+			srcKdx := KdxPathForKdi(srcPath)
+			if _, err := os.Stat(srcKdx); err == nil {
+				destKdx := KdxPathForKdi(destPath)
+				if err := copyFile(srcKdx, destKdx); err != nil {
+					return nil, fmt.Errorf("obikmer: copy index %d of set %q: %w", p, srcID, err)
+				}
+			}
+		}
+
+		// Copy spectrum.bin if it exists
+		srcSpecPath := ksg.SpectrumPath(srcIdx)
+		if _, err := os.Stat(srcSpecPath); err == nil {
+			destSpecPath := filepath.Join(destSetDir, "spectrum.bin")
+			if err := copyFile(srcSpecPath, destSpecPath); err != nil {
+				return nil, fmt.Errorf("obikmer: copy spectrum of set %q: %w", srcID, err)
+			}
+		}
+
+		// Update destination metadata
+		dest.setsIDs = append(dest.setsIDs, srcID)
+		dest.counts = append(dest.counts, ksg.counts[srcIdx])
+
+		// Copy per-set metadata (AllSetMetadata returns a fresh map, so the
+		// destination does not share its map with the source)
+		srcMeta := ksg.AllSetMetadata(srcIdx)
+		if srcMeta == nil {
+			srcMeta = make(map[string]interface{})
+		}
+		dest.setsMetadata = append(dest.setsMetadata, srcMeta)
+		dest.n++
+	}
+
+	if err := dest.saveMetadata(); err != nil {
+		return nil, fmt.Errorf("obikmer: save destination metadata: %w", err)
+	}
+
+	return dest, nil
+}
+
+// copyFile duplicates the content of the file src into the file dst.
+// The destination is created, or truncated when it already exists. The
+// first error among open, create, copy and close is returned.
+func copyFile(src, dst string) error {
+	source, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer source.Close()
+
+	target, err := os.Create(dst)
+	if err != nil {
+		return err
+	}
+	// Safety net for the error path; on success the explicit Close below
+	// runs first and reports its error.
+	defer target.Close()
+
+	_, copyErr := io.Copy(target, source)
+	if copyErr == nil {
+		return target.Close()
+	}
+	return copyErr
+}
--- a/pkg/obikmer/kmer_set_disk_ops.go
+++ b/pkg/obikmer/kmer_set_disk_ops.go
@@ -0,0 +1,568 @@
+package obikmer
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+	"sync"
+)
+
+// Union writes, under outputDir, a new single-set KmerSetGroup containing
+// every k-mer that occurs in at least one set of the group. It is the
+// quorum operation with the window [1, n].
+func (ksg *KmerSetGroup) Union(outputDir string) (*KmerSetGroup, error) {
+	const anySet = 1
+	everySet := ksg.n
+	return ksg.quorumOp(outputDir, anySet, everySet)
+}
+
+// Intersect writes, under outputDir, a new single-set KmerSetGroup
+// containing only the k-mers that occur in all n sets of the group. It is
+// the quorum operation with the window [n, n].
+func (ksg *KmerSetGroup) Intersect(outputDir string) (*KmerSetGroup, error) {
+	everySet := ksg.n
+	return ksg.quorumOp(outputDir, everySet, everySet)
+}
+
+// Difference writes, under outputDir, the k-mers of set_0 that appear in
+// none of the other sets of the group. The work is done by differenceOp.
+func (ksg *KmerSetGroup) Difference(outputDir string) (*KmerSetGroup, error) {
+	result, err := ksg.differenceOp(outputDir)
+	return result, err
+}
+
+// QuorumAtLeast writes, under outputDir, the k-mers found in q or more sets
+// of the group (quorum window [q, n]). quorumOp raises a q below 1 to 1.
+func (ksg *KmerSetGroup) QuorumAtLeast(q int, outputDir string) (*KmerSetGroup, error) {
+	upper := ksg.n
+	return ksg.quorumOp(outputDir, q, upper)
+}
+
+// QuorumExactly writes, under outputDir, the k-mers found in precisely q
+// sets of the group (quorum window [q, q]).
+func (ksg *KmerSetGroup) QuorumExactly(q int, outputDir string) (*KmerSetGroup, error) {
+	lower, upper := q, q
+	return ksg.quorumOp(outputDir, lower, upper)
+}
+
+// QuorumAtMost writes, under outputDir, the k-mers found in no more than q
+// sets of the group. The lower bound of the window is 1, so only k-mers
+// that occur in at least one set are considered (quorum window [1, q]).
+func (ksg *KmerSetGroup) QuorumAtMost(q int, outputDir string) (*KmerSetGroup, error) {
+	const lower = 1
+	return ksg.quorumOp(outputDir, lower, q)
+}
+
+// UnionWith combines this group with other set by set: in the resulting
+// KmerSetGroup, written under outputDir, set_i holds the union of
+// this.set_i and other.set_i.
+//
+// The two groups must agree on k, m, the number of partitions and the
+// number of sets; otherwise the error from checkCompatible is returned and
+// nothing is written.
+func (ksg *KmerSetGroup) UnionWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error) {
+	err := ksg.checkCompatible(other)
+	if err != nil {
+		return nil, err
+	}
+	return ksg.pairwiseOp(other, outputDir, mergeUnion)
+}
+
+// IntersectWith combines this group with other set by set: in the resulting
+// KmerSetGroup, written under outputDir, set_i holds the intersection of
+// this.set_i and other.set_i.
+//
+// The two groups must pass checkCompatible; otherwise its error is returned
+// and nothing is written.
+func (ksg *KmerSetGroup) IntersectWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error) {
+	err := ksg.checkCompatible(other)
+	if err != nil {
+		return nil, err
+	}
+	return ksg.pairwiseOp(other, outputDir, mergeIntersect)
+}
+
+// ==============================
+// Internal implementation
+// ==============================
+
+// checkCompatible reports whether other can be combined set by set with
+// ksg. It compares, in order, k, m, the partition count and the number of
+// sets, and returns an error describing the first mismatch, or nil when
+// all four agree.
+func (ksg *KmerSetGroup) checkCompatible(other *KmerSetGroup) error {
+	switch {
+	case ksg.k != other.k:
+		return fmt.Errorf("obikmer: incompatible k: %d vs %d", ksg.k, other.k)
+	case ksg.m != other.m:
+		return fmt.Errorf("obikmer: incompatible m: %d vs %d", ksg.m, other.m)
+	case ksg.partitions != other.partitions:
+		return fmt.Errorf("obikmer: incompatible partitions: %d vs %d", ksg.partitions, other.partitions)
+	case ksg.n != other.n:
+		return fmt.Errorf("obikmer: incompatible size: %d vs %d", ksg.n, other.n)
+	}
+	return nil
+}
+
+// quorumOp runs a quorum filter over all n sets, one partition at a time,
+// and stores the outcome as a single-set group under outputDir/set_0.
+//
+// minQ is raised to 1 and maxQ lowered to n when they fall outside that
+// range. Partitions are handed to at most runtime.NumCPU() workers, each
+// calling quorumPartition. A worker stops at its first error; the first
+// error recorded by any worker is returned once all workers have finished.
+// On success the per-partition counts are summed and the group metadata is
+// saved before the new group is returned.
+func (ksg *KmerSetGroup) quorumOp(outputDir string, minQ, maxQ int) (*KmerSetGroup, error) {
+	// Clamp the quorum window to [1, n].
+	if minQ < 1 {
+		minQ = 1
+	}
+	if maxQ > ksg.n {
+		maxQ = ksg.n
+	}
+
+	// The result is a singleton group: everything goes into set_0.
+	outSetDir := filepath.Join(outputDir, "set_0")
+	if err := os.MkdirAll(outSetDir, 0755); err != nil {
+		return nil, err
+	}
+
+	perPartition := make([]uint64, ksg.partitions)
+
+	workers := runtime.NumCPU()
+	if ksg.partitions < workers {
+		workers = ksg.partitions
+	}
+
+	// The queue can hold every partition index, so filling it below never
+	// blocks even when workers exit early on error.
+	queue := make(chan int, ksg.partitions)
+	var (
+		group   sync.WaitGroup
+		errLock sync.Mutex
+		failure error
+	)
+
+	worker := func() {
+		defer group.Done()
+		for part := range queue {
+			kept, err := ksg.quorumPartition(part, outSetDir, minQ, maxQ)
+			if err == nil {
+				// Each partition index is handled by a single worker,
+				// so this slot is written only once.
+				perPartition[part] = kept
+				continue
+			}
+			errLock.Lock()
+			if failure == nil {
+				failure = err
+			}
+			errLock.Unlock()
+			return
+		}
+	}
+
+	for i := 0; i < workers; i++ {
+		group.Add(1)
+		go worker()
+	}
+
+	for part := 0; part < ksg.partitions; part++ {
+		queue <- part
+	}
+	close(queue)
+	group.Wait()
+
+	if failure != nil {
+		return nil, failure
+	}
+
+	total := uint64(0)
+	for i := range perPartition {
+		total += perPartition[i]
+	}
+
+	out := &KmerSetGroup{
+		path:       outputDir,
+		k:          ksg.k,
+		m:          ksg.m,
+		partitions: ksg.partitions,
+		n:          1,
+		setsIDs:    []string{""},
+		counts:     []uint64{total},
+		Metadata:   make(map[string]interface{}),
+	}
+
+	if err := out.saveMetadata(); err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+// quorumPartition applies the quorum filter to partition partIdx.
+//
+// It opens one KdiReader per set, immediately closing those that report a
+// zero count, merges the remaining ones with NewKWayMerge and writes to
+// outSetDir/part_NNNN.kdi every k-mer whose merge count lies in
+// [minQ, maxQ]. When no reader is retained, an empty KDI file is written.
+// The returned value is the writer's count, paired with the error from
+// closing the writer.
+func (ksg *KmerSetGroup) quorumPartition(partIdx int, outSetDir string, minQ, maxQ int) (uint64, error) {
+	kept := make([]*KdiReader, 0, ksg.n)
+	for setIdx := 0; setIdx < ksg.n; setIdx++ {
+		reader, err := NewKdiReader(ksg.partitionPath(setIdx, partIdx))
+		if err != nil {
+			// Release the readers opened so far before bailing out.
+			for _, opened := range kept {
+				opened.Close()
+			}
+			return 0, err
+		}
+		if reader.Count() > 0 {
+			kept = append(kept, reader)
+			continue
+		}
+		// This set contributes nothing to the partition.
+		reader.Close()
+	}
+
+	target := filepath.Join(outSetDir, fmt.Sprintf("part_%04d.kdi", partIdx))
+
+	if len(kept) == 0 {
+		// Still produce a (empty) KDI file for this partition.
+		empty, err := NewKdiWriter(target)
+		if err != nil {
+			return 0, err
+		}
+		return 0, empty.Close()
+	}
+
+	// From here on, merged.Close() is relied upon to close the readers.
+	merged := NewKWayMerge(kept)
+
+	out, err := NewKdiWriter(target)
+	if err != nil {
+		merged.Close()
+		return 0, err
+	}
+
+	for kmer, seen, more := merged.Next(); more; kmer, seen, more = merged.Next() {
+		if seen < minQ || seen > maxQ {
+			continue
+		}
+		if err := out.Write(kmer); err != nil {
+			merged.Close()
+			out.Close()
+			return 0, err
+		}
+	}
+
+	merged.Close()
+	// Read the count before closing the writer.
+	written := out.Count()
+	return written, out.Close()
+}
+
+// differenceOp computes set_0 minus the union of every other set and
+// stores the outcome as a single-set group under outputDir/set_0.
+//
+// It fails when the group has no set. Partitions are handed to at most
+// runtime.NumCPU() workers, each calling differencePartition. A worker
+// stops at its first error; the first error recorded by any worker is
+// returned once all workers have finished. On success the per-partition
+// counts are summed and the group metadata is saved.
+func (ksg *KmerSetGroup) differenceOp(outputDir string) (*KmerSetGroup, error) {
+	if ksg.n < 1 {
+		return nil, fmt.Errorf("obikmer: difference requires at least 1 set")
+	}
+
+	outSetDir := filepath.Join(outputDir, "set_0")
+	if err := os.MkdirAll(outSetDir, 0755); err != nil {
+		return nil, err
+	}
+
+	perPartition := make([]uint64, ksg.partitions)
+
+	workers := runtime.NumCPU()
+	if ksg.partitions < workers {
+		workers = ksg.partitions
+	}
+
+	// The queue can hold every partition index, so filling it below never
+	// blocks even when workers exit early on error.
+	queue := make(chan int, ksg.partitions)
+	var (
+		group   sync.WaitGroup
+		errLock sync.Mutex
+		failure error
+	)
+
+	worker := func() {
+		defer group.Done()
+		for part := range queue {
+			kept, err := ksg.differencePartition(part, outSetDir)
+			if err == nil {
+				// Each partition index is handled by a single worker,
+				// so this slot is written only once.
+				perPartition[part] = kept
+				continue
+			}
+			errLock.Lock()
+			if failure == nil {
+				failure = err
+			}
+			errLock.Unlock()
+			return
+		}
+	}
+
+	for i := 0; i < workers; i++ {
+		group.Add(1)
+		go worker()
+	}
+
+	for part := 0; part < ksg.partitions; part++ {
+		queue <- part
+	}
+	close(queue)
+	group.Wait()
+
+	if failure != nil {
+		return nil, failure
+	}
+
+	total := uint64(0)
+	for i := range perPartition {
+		total += perPartition[i]
+	}
+
+	out := &KmerSetGroup{
+		path:       outputDir,
+		k:          ksg.k,
+		m:          ksg.m,
+		partitions: ksg.partitions,
+		n:          1,
+		setsIDs:    []string{""},
+		counts:     []uint64{total},
+		Metadata:   make(map[string]interface{}),
+	}
+
+	if err := out.saveMetadata(); err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+// differencePartition computes set_0 - union(set_1..set_{n-1}) for the
+// partition partIdx and writes it to outSetDir/part_NNNN.kdi.
+//
+// Three cases are handled:
+//   - set_0 reports a zero count: an empty KDI file is written;
+//   - no other set has k-mers in this partition: set_0 is copied as is;
+//   - otherwise set_0 is streamed against the k-way merge of the other
+//     sets, and only values absent from that merge are written. The
+//     comparison logic relies on both streams yielding values in
+//     increasing order.
+//
+// The returned value is the writer's count, paired with the error from
+// closing the writer.
+func (ksg *KmerSetGroup) differencePartition(partIdx int, outSetDir string) (uint64, error) {
+	target := filepath.Join(outSetDir, fmt.Sprintf("part_%04d.kdi", partIdx))
+
+	base, err := NewKdiReader(ksg.partitionPath(0, partIdx))
+	if err != nil {
+		return 0, err
+	}
+
+	if base.Count() == 0 {
+		// Nothing to subtract from: emit an empty partition.
+		base.Close()
+		empty, err := NewKdiWriter(target)
+		if err != nil {
+			return 0, err
+		}
+		return 0, empty.Close()
+	}
+
+	// Readers of the sets to subtract; only non-empty ones are retained.
+	var subtrahends []*KdiReader
+	closeInputs := func() {
+		base.Close()
+		for _, r := range subtrahends {
+			r.Close()
+		}
+	}
+
+	for setIdx := 1; setIdx < ksg.n; setIdx++ {
+		r, err := NewKdiReader(ksg.partitionPath(setIdx, partIdx))
+		if err != nil {
+			closeInputs()
+			return 0, err
+		}
+		if r.Count() > 0 {
+			subtrahends = append(subtrahends, r)
+			continue
+		}
+		r.Close()
+	}
+
+	out, err := NewKdiWriter(target)
+	if err != nil {
+		closeInputs()
+		return 0, err
+	}
+
+	if len(subtrahends) == 0 {
+		// Nothing to remove: the result is a plain copy of set_0.
+		for v, more := base.Next(); more; v, more = base.Next() {
+			if err := out.Write(v); err != nil {
+				base.Close()
+				out.Close()
+				return 0, err
+			}
+		}
+		base.Close()
+		written := out.Count()
+		return written, out.Close()
+	}
+
+	// The merge of the other sets is the stream of values to remove.
+	others := NewKWayMerge(subtrahends)
+
+	cur, more := base.Next()
+	sub, _, subMore := others.Next()
+
+	for more {
+		switch {
+		case !subMore || cur < sub:
+			// cur cannot appear in the other sets any more: keep it.
+			if err := out.Write(cur); err != nil {
+				base.Close()
+				others.Close()
+				out.Close()
+				return 0, err
+			}
+			cur, more = base.Next()
+		case cur == sub:
+			// cur is present in another set: drop it, move both streams.
+			cur, more = base.Next()
+			sub, _, subMore = others.Next()
+		default:
+			// sub < cur: the subtraction stream is behind, catch up.
+			sub, _, subMore = others.Next()
+		}
+	}
+
+	base.Close()
+	others.Close()
+	written := out.Count()
+	return written, out.Close()
+}
+
+// mergeMode selects which values a pairwise merge of two sorted KDI
+// streams writes to its output. Values present in both streams are written
+// in either mode; the mode decides what happens to values present in only
+// one of them.
+type mergeMode int
+
+const (
+	mergeUnion     mergeMode = iota // emit if in either
+	mergeIntersect                  // emit if in both
+)
+
+// pairwiseOp merges ksg and other set by set, writing the result under
+// outputDir as set_0 .. set_{n-1}.
+//
+// One task is queued per (set, partition) pair and handled by at most
+// runtime.NumCPU() workers (never more than the partition count), each
+// calling pairwiseMergePartition on the two matching partition files with
+// the given mode. A worker stops at its first error; the first error
+// recorded by any worker is returned once all workers have finished. On
+// success the per-partition counts are summed per set and the metadata of
+// the new group is saved.
+//
+// NOTE(review): the set IDs of the result are left as empty strings —
+// confirm whether the IDs of the receiver should be carried over.
+func (ksg *KmerSetGroup) pairwiseOp(other *KmerSetGroup, outputDir string, mode mergeMode) (*KmerSetGroup, error) {
+	perSet := make([][]uint64, ksg.n)
+	for set := range perSet {
+		dir := filepath.Join(outputDir, fmt.Sprintf("set_%d", set))
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			return nil, err
+		}
+		perSet[set] = make([]uint64, ksg.partitions)
+	}
+
+	workers := runtime.NumCPU()
+	if ksg.partitions < workers {
+		workers = ksg.partitions
+	}
+
+	type task struct {
+		set  int
+		part int
+	}
+
+	// The queue can hold every task, so filling it below never blocks even
+	// when workers exit early on error.
+	queue := make(chan task, ksg.n*ksg.partitions)
+	var (
+		group   sync.WaitGroup
+		errLock sync.Mutex
+		failure error
+	)
+
+	worker := func() {
+		defer group.Done()
+		for tk := range queue {
+			setName := fmt.Sprintf("set_%d", tk.set)
+			partName := fmt.Sprintf("part_%04d.kdi", tk.part)
+			written, err := pairwiseMergePartition(
+				ksg.partitionPath(tk.set, tk.part),
+				other.partitionPath(tk.set, tk.part),
+				filepath.Join(outputDir, setName, partName),
+				mode,
+			)
+			if err == nil {
+				// Each (set, partition) slot is written by one worker only.
+				perSet[tk.set][tk.part] = written
+				continue
+			}
+			errLock.Lock()
+			if failure == nil {
+				failure = err
+			}
+			errLock.Unlock()
+			return
+		}
+	}
+
+	for i := 0; i < workers; i++ {
+		group.Add(1)
+		go worker()
+	}
+
+	for set := 0; set < ksg.n; set++ {
+		for part := 0; part < ksg.partitions; part++ {
+			queue <- task{set: set, part: part}
+		}
+	}
+	close(queue)
+	group.Wait()
+
+	if failure != nil {
+		return nil, failure
+	}
+
+	totals := make([]uint64, ksg.n)
+	ids := make([]string, ksg.n)
+	for set, parts := range perSet {
+		for _, c := range parts {
+			totals[set] += c
+		}
+	}
+
+	out := &KmerSetGroup{
+		path:       outputDir,
+		k:          ksg.k,
+		m:          ksg.m,
+		partitions: ksg.partitions,
+		n:          ksg.n,
+		setsIDs:    ids,
+		counts:     totals,
+		Metadata:   make(map[string]interface{}),
+	}
+
+	if err := out.saveMetadata(); err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+// pairwiseMergePartition merges the two KDI files pathA and pathB into
+// outPath according to mode and returns the number of values written.
+//
+// Both readers and the writer are always closed before returning. A merge
+// error takes precedence over an error from closing the writer; in that
+// case the returned count is 0.
+func pairwiseMergePartition(pathA, pathB, outPath string, mode mergeMode) (uint64, error) {
+	left, err := NewKdiReader(pathA)
+	if err != nil {
+		return 0, err
+	}
+
+	right, err := NewKdiReader(pathB)
+	if err != nil {
+		left.Close()
+		return 0, err
+	}
+
+	out, err := NewKdiWriter(outPath)
+	if err != nil {
+		left.Close()
+		right.Close()
+		return 0, err
+	}
+
+	written, mergeErr := doPairwiseMerge(left, right, out, mode)
+	left.Close()
+	right.Close()
+	// The writer is closed even when the merge failed.
+	if closeErr := out.Close(); mergeErr == nil {
+		return written, closeErr
+	}
+	return 0, mergeErr
+}
+
+// doPairwiseMerge walks rA and rB in lockstep and writes the merged stream
+// to w. The comparison logic relies on both readers yielding values in
+// increasing order.
+//
+// A value present in both readers is always written once. A value present
+// in only one reader is written only when mode is mergeUnion. The function
+// returns w.Count() on success, or 0 and the first write error. It does
+// not close the readers or the writer.
+func doPairwiseMerge(rA, rB *KdiReader, w *KdiWriter, mode mergeMode) (uint64, error) {
+	keepUnshared := mode == mergeUnion
+
+	a, moreA := rA.Next()
+	b, moreB := rB.Next()
+
+	for moreA && moreB {
+		switch {
+		case a == b:
+			// Shared value: emitted in both modes, both sides advance.
+			if err := w.Write(a); err != nil {
+				return 0, err
+			}
+			a, moreA = rA.Next()
+			b, moreB = rB.Next()
+		case a < b:
+			if keepUnshared {
+				if err := w.Write(a); err != nil {
+					return 0, err
+				}
+			}
+			a, moreA = rA.Next()
+		default:
+			if keepUnshared {
+				if err := w.Write(b); err != nil {
+					return 0, err
+				}
+			}
+			b, moreB = rB.Next()
+		}
+	}
+
+	// At most one side still has values; they matter only for a union.
+	if keepUnshared {
+		for ; moreA; a, moreA = rA.Next() {
+			if err := w.Write(a); err != nil {
+				return 0, err
+			}
+		}
+		for ; moreB; b, moreB = rB.Next() {
+			if err := w.Write(b); err != nil {
+				return 0, err
+			}
+		}
+	}
+
+	return w.Count(), nil
+}
--- a/pkg/obikmer/kmer_set_disk_ops_test.go
+++ b/pkg/obikmer/kmer_set_disk_ops_test.go
@@ -0,0 +1,251 @@
+package obikmer
+
+import (
+	"path/filepath"
+	"testing"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+)
+
+// buildGroupFromSeqs builds, in dir, a KmerSetGroup with one set per entry
+// of seqs: sequence i is added to set i. The test is aborted if the builder
+// cannot be created or closed. The trailing 64 is passed through to
+// NewKmerSetGroupBuilder (presumably the partition count — confirm).
+func buildGroupFromSeqs(t *testing.T, dir string, k, m int, seqs []string) *KmerSetGroup {
+	t.Helper()
+
+	builder, err := NewKmerSetGroupBuilder(dir, k, m, len(seqs), 64)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	for setIdx := range seqs {
+		bioSeq := obiseq.NewBioSequence("", []byte(seqs[setIdx]), "")
+		builder.AddSequence(setIdx, bioSeq)
+	}
+
+	group, err := builder.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+	return group
+}
+
+// collectKmers gathers every k-mer yielded by ksg.Iterator(setIdx) into a
+// slice, in iteration order. The result is nil when nothing is yielded.
+func collectKmers(t *testing.T, ksg *KmerSetGroup, setIdx int) []uint64 {
+	t.Helper()
+
+	kmers := []uint64(nil)
+	source := ksg.Iterator(setIdx)
+	for kmer := range source {
+		kmers = append(kmers, kmer)
+	}
+	return kmers
+}
+
+// TestDiskOpsUnion checks the size bounds of a two-set union: it must be
+// non-empty, at least as large as each input and no larger than their sum.
+func TestDiskOpsUnion(t *testing.T) {
+	root := t.TempDir()
+
+	// Two sequences with some overlap
+	ksg := buildGroupFromSeqs(t, filepath.Join(root, "index"), 15, 7, []string{
+		"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
+		"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
+	})
+
+	result, err := ksg.Union(filepath.Join(root, "union"))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	lenA, lenB, unionLen := ksg.Len(0), ksg.Len(1), result.Len(0)
+	switch {
+	case unionLen == 0:
+		t.Fatal("union is empty")
+	case unionLen < lenA || unionLen < lenB:
+		// A union can never be smaller than one of its operands.
+		t.Fatalf("union (%d) smaller than an input set (%d, %d)", unionLen, lenA, lenB)
+	case unionLen > lenA+lenB:
+		// Nor can it hold more than both operands together.
+		t.Fatalf("union (%d) larger than sum of sets (%d)", unionLen, lenA+lenB)
+	}
+}
+
+// TestDiskOpsIntersect checks that the intersection of two sets is never
+// larger than either input set.
+func TestDiskOpsIntersect(t *testing.T) {
+	root := t.TempDir()
+
+	// Two sequences with some shared k-mers
+	ksg := buildGroupFromSeqs(t, filepath.Join(root, "index"), 15, 7, []string{
+		"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
+		"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
+	})
+
+	result, err := ksg.Intersect(filepath.Join(root, "intersect"))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	lenA, lenB, interLen := ksg.Len(0), ksg.Len(1), result.Len(0)
+	if interLen > lenA || interLen > lenB {
+		t.Fatalf("intersection (%d) larger than input sets (%d, %d)", interLen, lenA, lenB)
+	}
+}
+
+// TestDiskOpsDifference checks that set_0 minus set_1 is never larger than
+// set_0 itself.
+func TestDiskOpsDifference(t *testing.T) {
+	root := t.TempDir()
+
+	ksg := buildGroupFromSeqs(t, filepath.Join(root, "index"), 15, 7, []string{
+		"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
+		"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
+	})
+
+	result, err := ksg.Difference(filepath.Join(root, "diff"))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Removing elements from set_0 cannot make it grow.
+	lenA, diffLen := ksg.Len(0), result.Len(0)
+	if diffLen > lenA {
+		t.Fatalf("difference (%d) larger than set_0 (%d)", diffLen, lenA)
+	}
+}
+
+// TestDiskOpsConsistency cross-checks Union, Intersect and Difference on
+// the same two sets using the cardinality identities
+// |A ∪ B| = |A| + |B| - |A ∩ B| and |A \ B| = |A| - |A ∩ B|.
+func TestDiskOpsConsistency(t *testing.T) {
+	root := t.TempDir()
+
+	ksg := buildGroupFromSeqs(t, filepath.Join(root, "index"), 15, 7, []string{
+		"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
+		"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
+	})
+
+	unionResult, err := ksg.Union(filepath.Join(root, "union"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	interResult, err := ksg.Intersect(filepath.Join(root, "intersect"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	diffResult, err := ksg.Difference(filepath.Join(root, "diff"))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	lenA, lenB := ksg.Len(0), ksg.Len(1)
+	unionLen, interLen, diffLen := unionResult.Len(0), interResult.Len(0), diffResult.Len(0)
+
+	// |A ∪ B| = |A| + |B| - |A ∩ B|
+	if expectedUnion := lenA + lenB - interLen; unionLen != expectedUnion {
+		t.Fatalf("|A∪B|=%d, expected |A|+|B|-|A∩B|=%d+%d-%d=%d",
+			unionLen, lenA, lenB, interLen, expectedUnion)
+	}
+
+	// |A \ B| = |A| - |A ∩ B|
+	if expectedDiff := lenA - interLen; diffLen != expectedDiff {
+		t.Fatalf("|A\\B|=%d, expected |A|-|A∩B|=%d-%d=%d",
+			diffLen, lenA, interLen, expectedDiff)
+	}
+}
+
+// TestDiskOpsQuorum checks, on three sets, that QuorumAtLeast(1) matches
+// Union, QuorumAtLeast(3) matches Intersect, and QuorumAtLeast(2) lies
+// between the two in size.
+func TestDiskOpsQuorum(t *testing.T) {
+	root := t.TempDir()
+	sub := func(name string) string { return filepath.Join(root, name) }
+
+	// Three sets
+	ksg := buildGroupFromSeqs(t, sub("index"), 15, 7, []string{
+		"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
+		"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
+		"GATCGATCGATCGAAATTTCCCGGG",
+	})
+
+	// QuorumAtLeast(1) = Union
+	q1, err := ksg.QuorumAtLeast(1, sub("q1"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	union, err := ksg.Union(sub("union"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got, want := q1.Len(0), union.Len(0); got != want {
+		t.Fatalf("QuorumAtLeast(1)=%d != Union=%d", got, want)
+	}
+
+	// QuorumAtLeast(3) = Intersect
+	q3, err := ksg.QuorumAtLeast(3, sub("q3"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	inter, err := ksg.Intersect(sub("inter"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got, want := q3.Len(0), inter.Len(0); got != want {
+		t.Fatalf("QuorumAtLeast(3)=%d != Intersect=%d", got, want)
+	}
+
+	// QuorumAtLeast(2) should be between Intersect and Union
+	q2, err := ksg.QuorumAtLeast(2, sub("q2"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	low, mid, high := q3.Len(0), q2.Len(0), q1.Len(0)
+	if mid < low || mid > high {
+		t.Fatalf("QuorumAtLeast(2)=%d not between intersect=%d and union=%d",
+			mid, low, high)
+	}
+}
+
+// TestDiskOpsJaccard checks the Jaccard distance and similarity matrices on
+// three sets: two built from the same sequence and one built from a
+// sequence sharing no k-mer with them. Identical sets must be at distance
+// 0 / similarity 1, disjoint sets at distance 1 / similarity 0.
+func TestDiskOpsJaccard(t *testing.T) {
+	root := t.TempDir()
+
+	ksg := buildGroupFromSeqs(t, filepath.Join(root, "index"), 15, 7, []string{
+		"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
+		"ACGATCGATCTAGCTAGCTGATCGATCGATCG", // identical to first
+		"TTTTTTTTTTTTTTTTTTTTTTTTT",        // completely different
+	})
+
+	distances := ksg.JaccardDistanceMatrix()
+	if distances == nil {
+		t.Fatal("JaccardDistanceMatrix returned nil")
+	}
+
+	// Identical sets should have distance 0
+	if d := distances.Get(0, 1); d != 0.0 {
+		t.Fatalf("distance(0,1) = %f, expected 0.0 for identical sets", d)
+	}
+
+	// Completely different sets should have distance 1.0
+	if d := distances.Get(0, 2); d != 1.0 {
+		t.Fatalf("distance(0,2) = %f, expected 1.0 for disjoint sets", d)
+	}
+
+	// Similarity matrix
+	similarities := ksg.JaccardSimilarityMatrix()
+	if similarities == nil {
+		t.Fatal("JaccardSimilarityMatrix returned nil")
+	}
+
+	if s := similarities.Get(0, 1); s != 1.0 {
+		t.Fatalf("similarity(0,1) = %f, expected 1.0 for identical sets", s)
+	}
+
+	if s := similarities.Get(0, 2); s != 0.0 {
+		t.Fatalf("similarity(0,2) = %f, expected 0.0 for disjoint sets", s)
+	}
+}
--- a/pkg/obikmer/kmer_set_group.go
+++ b/pkg/obikmer/kmer_set_group.go
@@ -1,339 +0,0 @@
-package obikmer
-
-import (
-	"fmt"
-
-	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
-	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
-)
-
-// KmerSetGroup represents a vector of KmerSet
-// Used to manage multiple k-mer sets (for example, by frequency level)
-type KmerSetGroup struct {
-	id       string                 // Unique identifier of the KmerSetGroup
-	k        int                    // Size of k-mers (immutable)
-	sets     []*KmerSet             // Vector of KmerSet
-	Metadata map[string]interface{} // Group metadata (not individual sets)
-}
-
-// NewKmerSetGroup creates a new group of n KmerSets
-func NewKmerSetGroup(k int, n int) *KmerSetGroup {
-	if n < 1 {
-		panic("KmerSetGroup size must be >= 1")
-	}
-
-	sets := make([]*KmerSet, n)
-	for i := range sets {
-		sets[i] = NewKmerSet(k)
-	}
-
-	return &KmerSetGroup{
-		k:        k,
-		sets:     sets,
-		Metadata: make(map[string]interface{}),
-	}
-}
-
-// K returns the size of k-mers (immutable)
-func (ksg *KmerSetGroup) K() int {
-	return ksg.k
-}
-
-// Size returns the number of KmerSet in the group
-func (ksg *KmerSetGroup) Size() int {
-	return len(ksg.sets)
-}
-
-// Get returns the KmerSet at the given index
-// Returns nil if the index is invalid
-func (ksg *KmerSetGroup) Get(index int) *KmerSet {
-	if index < 0 || index >= len(ksg.sets) {
-		return nil
-	}
-	return ksg.sets[index]
-}
-
-// Set replaces the KmerSet at the given index
-// Panics if the index is invalid or if k does not match
-func (ksg *KmerSetGroup) Set(index int, ks *KmerSet) {
-	if index < 0 || index >= len(ksg.sets) {
-		panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
-	}
-	if ks.k != ksg.k {
-		panic(fmt.Sprintf("KmerSet k mismatch: expected %d, got %d", ksg.k, ks.k))
-	}
-	ksg.sets[index] = ks
-}
-
-// Len returns the number of k-mers in a specific KmerSet
-// Without argument: returns the number of k-mers in the last KmerSet
-// With argument index: returns the number of k-mers in the KmerSet at this index
-func (ksg *KmerSetGroup) Len(index ...int) uint64 {
-	if len(index) == 0 {
-		// Without argument: last KmerSet
-		return ksg.sets[len(ksg.sets)-1].Len()
-	}
-
-	// With argument: specific KmerSet
-	idx := index[0]
-	if idx < 0 || idx >= len(ksg.sets) {
-		return 0
-	}
-	return ksg.sets[idx].Len()
-}
-
-// MemoryUsage returns the total memory usage in bytes
-func (ksg *KmerSetGroup) MemoryUsage() uint64 {
-	total := uint64(0)
-	for _, ks := range ksg.sets {
-		total += ks.MemoryUsage()
-	}
-	return total
-}
-
-// Clear empties all KmerSet in the group
-func (ksg *KmerSetGroup) Clear() {
-	for _, ks := range ksg.sets {
-		ks.Clear()
-	}
-}
-
-// Copy creates a complete copy of the group (consistent with BioSequence.Copy)
-func (ksg *KmerSetGroup) Copy() *KmerSetGroup {
-	copiedSets := make([]*KmerSet, len(ksg.sets))
-	for i, ks := range ksg.sets {
-		copiedSets[i] = ks.Copy() // Copy each KmerSet with its metadata
-	}
-
-	// Copy group metadata
-	groupMetadata := make(map[string]interface{}, len(ksg.Metadata))
-	for k, v := range ksg.Metadata {
-		groupMetadata[k] = v
-	}
-
-	return &KmerSetGroup{
-		id:       ksg.id,
-		k:        ksg.k,
-		sets:     copiedSets,
-		Metadata: groupMetadata,
-	}
-}
-
-// Id returns the identifier of the KmerSetGroup (consistent with BioSequence.Id)
-func (ksg *KmerSetGroup) Id() string {
-	return ksg.id
-}
-
-// SetId sets the identifier of the KmerSetGroup (consistent with BioSequence.SetId)
-func (ksg *KmerSetGroup) SetId(id string) {
-	ksg.id = id
-}
-
-// AddSequence adds all k-mers from a sequence to a specific KmerSet
-func (ksg *KmerSetGroup) AddSequence(seq *obiseq.BioSequence, index int) {
-	if index < 0 || index >= len(ksg.sets) {
-		panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
-	}
-	ksg.sets[index].AddSequence(seq)
-}
-
-// AddSequences adds all k-mers from multiple sequences to a specific KmerSet
-func (ksg *KmerSetGroup) AddSequences(sequences *obiseq.BioSequenceSlice, index int) {
-	if index < 0 || index >= len(ksg.sets) {
-		panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
-	}
-	ksg.sets[index].AddSequences(sequences)
-}
-
-// Union returns the union of all KmerSet in the group
-// Optimization: starts from the largest set to minimize operations
-func (ksg *KmerSetGroup) Union() *KmerSet {
-	if len(ksg.sets) == 0 {
-		return NewKmerSet(ksg.k)
-	}
-
-	if len(ksg.sets) == 1 {
-		return ksg.sets[0].Copy()
-	}
-
-	// Find the index of the largest set (the one with the most k-mers)
-	maxIdx := 0
-	maxCard := ksg.sets[0].Len()
-	for i := 1; i < len(ksg.sets); i++ {
-		card := ksg.sets[i].Len()
-		if card > maxCard {
-			maxCard = card
-			maxIdx = i
-		}
-	}
-
-	// Copy the largest set and perform unions in-place
-	result := ksg.sets[maxIdx].bitmap.Clone()
-	for i := 0; i < len(ksg.sets); i++ {
-		if i != maxIdx {
-			result.Or(ksg.sets[i].bitmap)
-		}
-	}
-
-	return NewKmerSetFromBitmap(ksg.k, result)
-}
-
-// Intersect returns the intersection of all KmerSet in the group
-// Optimization: starts from the smallest set to minimize operations
-func (ksg *KmerSetGroup) Intersect() *KmerSet {
-	if len(ksg.sets) == 0 {
-		return NewKmerSet(ksg.k)
-	}
-
-	if len(ksg.sets) == 1 {
-		return ksg.sets[0].Copy()
-	}
-
-	// Find the index of the smallest set (the one with the fewest k-mers)
-	minIdx := 0
-	minCard := ksg.sets[0].Len()
-	for i := 1; i < len(ksg.sets); i++ {
-		card := ksg.sets[i].Len()
-		if card < minCard {
-			minCard = card
-			minIdx = i
-		}
-	}
-
-	// Copy the smallest set and perform intersections in-place
-	result := ksg.sets[minIdx].bitmap.Clone()
-	for i := 0; i < len(ksg.sets); i++ {
-		if i != minIdx {
-			result.And(ksg.sets[i].bitmap)
-		}
-	}
-
-	return NewKmerSetFromBitmap(ksg.k, result)
-}
-
-// Stats returns statistics for each KmerSet in the group
-type KmerSetGroupStats struct {
-	K          int
-	Size       int              // Number of KmerSet
-	TotalBytes uint64           // Total memory used
-	Sets       []KmerSetStats   // Stats of each KmerSet
-}
-
-type KmerSetStats struct {
-	Index     int    // Index of the KmerSet in the group
-	Len       uint64 // Number of k-mers
-	SizeBytes uint64 // Size in bytes
-}
-
-func (ksg *KmerSetGroup) Stats() KmerSetGroupStats {
-	stats := KmerSetGroupStats{
-		K:    ksg.k,
-		Size: len(ksg.sets),
-		Sets: make([]KmerSetStats, len(ksg.sets)),
-	}
-
-	for i, ks := range ksg.sets {
-		sizeBytes := ks.MemoryUsage()
-		stats.Sets[i] = KmerSetStats{
-			Index:     i,
-			Len:       ks.Len(),
-			SizeBytes: sizeBytes,
-		}
-		stats.TotalBytes += sizeBytes
-	}
-
-	return stats
-}
-
-func (ksgs KmerSetGroupStats) String() string {
-	result := fmt.Sprintf(`KmerSetGroup Statistics (k=%d, size=%d):
-  Total memory: %.2f MB
-
-Set breakdown:
-`, ksgs.K, ksgs.Size, float64(ksgs.TotalBytes)/1024/1024)
-
-	for _, set := range ksgs.Sets {
-		result += fmt.Sprintf("  Set[%d]: %d k-mers (%.2f MB)\n",
-			set.Index,
-			set.Len,
-			float64(set.SizeBytes)/1024/1024)
-	}
-
-	return result
-}
-
-// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix for all KmerSets in the group.
-// Returns a triangular distance matrix where element (i, j) represents the Jaccard distance
-// between set i and set j.
-//
-// The Jaccard distance is: 1 - (|A ∩ B| / |A ∪ B|)
-//
-// The matrix labels are set to the IDs of the individual KmerSets if available,
-// otherwise they are set to "set_0", "set_1", etc.
-//
-// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
-// Space complexity: O(n²) for the distance matrix
-func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix {
-	n := len(ksg.sets)
-
-	// Create labels from set IDs
-	labels := make([]string, n)
-	for i, ks := range ksg.sets {
-		if ks.Id() != "" {
-			labels[i] = ks.Id()
-		} else {
-			labels[i] = fmt.Sprintf("set_%d", i)
-		}
-	}
-
-	dm := obidist.NewDistMatrixWithLabels(labels)
-
-	// Compute pairwise distances
-	for i := 0; i < n-1; i++ {
-		for j := i + 1; j < n; j++ {
-			distance := ksg.sets[i].JaccardDistance(ksg.sets[j])
-			dm.Set(i, j, distance)
-		}
-	}
-
-	return dm
-}
-
-// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix for all KmerSets in the group.
-// Returns a similarity matrix where element (i, j) represents the Jaccard similarity
-// between set i and set j.
-//
-// The Jaccard similarity is: |A ∩ B| / |A ∪ B|
-//
-// The diagonal is 1.0 (similarity of a set to itself).
-//
-// The matrix labels are set to the IDs of the individual KmerSets if available,
-// otherwise they are set to "set_0", "set_1", etc.
-//
-// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
-// Space complexity: O(n²) for the similarity matrix
-func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
-	n := len(ksg.sets)
-
-	// Create labels from set IDs
-	labels := make([]string, n)
-	for i, ks := range ksg.sets {
-		if ks.Id() != "" {
-			labels[i] = ks.Id()
-		} else {
-			labels[i] = fmt.Sprintf("set_%d", i)
-		}
-	}
-
-	sm := obidist.NewSimilarityMatrixWithLabels(labels)
-
-	// Compute pairwise similarities
-	for i := 0; i < n-1; i++ {
-		for j := i + 1; j < n; j++ {
-			similarity := ksg.sets[i].JaccardSimilarity(ksg.sets[j])
-			sm.Set(i, j, similarity)
-		}
-	}
-
-	return sm
-}
--- a/pkg/obikmer/kmer_set_group_jaccard_test.go
+++ b/pkg/obikmer/kmer_set_group_jaccard_test.go
@@ -1,231 +0,0 @@
-package obikmer
-
-import (
-	"math"
-	"testing"
-)
-
-func TestKmerSetGroupJaccardDistanceMatrix(t *testing.T) {
-	ksg := NewKmerSetGroup(5, 3)
-
-	// Set 0: {1, 2, 3}
-	ksg.Get(0).AddKmerCode(1)
-	ksg.Get(0).AddKmerCode(2)
-	ksg.Get(0).AddKmerCode(3)
-	ksg.Get(0).SetId("set_A")
-
-	// Set 1: {2, 3, 4}
-	ksg.Get(1).AddKmerCode(2)
-	ksg.Get(1).AddKmerCode(3)
-	ksg.Get(1).AddKmerCode(4)
-	ksg.Get(1).SetId("set_B")
-
-	// Set 2: {5, 6, 7}
-	ksg.Get(2).AddKmerCode(5)
-	ksg.Get(2).AddKmerCode(6)
-	ksg.Get(2).AddKmerCode(7)
-	ksg.Get(2).SetId("set_C")
-
-	dm := ksg.JaccardDistanceMatrix()
-
-	// Check labels
-	if dm.GetLabel(0) != "set_A" {
-		t.Errorf("Expected label 'set_A' at index 0, got '%s'", dm.GetLabel(0))
-	}
-	if dm.GetLabel(1) != "set_B" {
-		t.Errorf("Expected label 'set_B' at index 1, got '%s'", dm.GetLabel(1))
-	}
-	if dm.GetLabel(2) != "set_C" {
-		t.Errorf("Expected label 'set_C' at index 2, got '%s'", dm.GetLabel(2))
-	}
-
-	// Check distances
-	// Distance(0, 1):
-	// Intersection: {2, 3} -> 2 elements
-	// Union: {1, 2, 3, 4} -> 4 elements
-	// Similarity: 2/4 = 0.5
-	// Distance: 1 - 0.5 = 0.5
-	expectedDist01 := 0.5
-	actualDist01 := dm.Get(0, 1)
-	if math.Abs(actualDist01-expectedDist01) > 1e-10 {
-		t.Errorf("Distance(0, 1): expected %f, got %f", expectedDist01, actualDist01)
-	}
-
-	// Distance(0, 2):
-	// Intersection: {} -> 0 elements
-	// Union: {1, 2, 3, 5, 6, 7} -> 6 elements
-	// Similarity: 0/6 = 0
-	// Distance: 1 - 0 = 1.0
-	expectedDist02 := 1.0
-	actualDist02 := dm.Get(0, 2)
-	if math.Abs(actualDist02-expectedDist02) > 1e-10 {
-		t.Errorf("Distance(0, 2): expected %f, got %f", expectedDist02, actualDist02)
-	}
-
-	// Distance(1, 2):
-	// Intersection: {} -> 0 elements
-	// Union: {2, 3, 4, 5, 6, 7} -> 6 elements
-	// Similarity: 0/6 = 0
-	// Distance: 1 - 0 = 1.0
-	expectedDist12 := 1.0
-	actualDist12 := dm.Get(1, 2)
-	if math.Abs(actualDist12-expectedDist12) > 1e-10 {
-		t.Errorf("Distance(1, 2): expected %f, got %f", expectedDist12, actualDist12)
-	}
-
-	// Check symmetry
-	if dm.Get(0, 1) != dm.Get(1, 0) {
-		t.Errorf("Matrix not symmetric: Get(0, 1) = %f, Get(1, 0) = %f",
-			dm.Get(0, 1), dm.Get(1, 0))
-	}
-
-	// Check diagonal
-	if dm.Get(0, 0) != 0.0 {
-		t.Errorf("Diagonal should be 0, got %f", dm.Get(0, 0))
-	}
-	if dm.Get(1, 1) != 0.0 {
-		t.Errorf("Diagonal should be 0, got %f", dm.Get(1, 1))
-	}
-	if dm.Get(2, 2) != 0.0 {
-		t.Errorf("Diagonal should be 0, got %f", dm.Get(2, 2))
-	}
-}
-
-func TestKmerSetGroupJaccardSimilarityMatrix(t *testing.T) {
-	ksg := NewKmerSetGroup(5, 3)
-
-	// Set 0: {1, 2, 3}
-	ksg.Get(0).AddKmerCode(1)
-	ksg.Get(0).AddKmerCode(2)
-	ksg.Get(0).AddKmerCode(3)
-
-	// Set 1: {2, 3, 4}
-	ksg.Get(1).AddKmerCode(2)
-	ksg.Get(1).AddKmerCode(3)
-	ksg.Get(1).AddKmerCode(4)
-
-	// Set 2: {1, 2, 3} (same as set 0)
-	ksg.Get(2).AddKmerCode(1)
-	ksg.Get(2).AddKmerCode(2)
-	ksg.Get(2).AddKmerCode(3)
-
-	sm := ksg.JaccardSimilarityMatrix()
-
-	// Check similarities
-	// Similarity(0, 1): 0.5 (as calculated above)
-	expectedSim01 := 0.5
-	actualSim01 := sm.Get(0, 1)
-	if math.Abs(actualSim01-expectedSim01) > 1e-10 {
-		t.Errorf("Similarity(0, 1): expected %f, got %f", expectedSim01, actualSim01)
-	}
-
-	// Similarity(0, 2): 1.0 (identical sets)
-	expectedSim02 := 1.0
-	actualSim02 := sm.Get(0, 2)
-	if math.Abs(actualSim02-expectedSim02) > 1e-10 {
-		t.Errorf("Similarity(0, 2): expected %f, got %f", expectedSim02, actualSim02)
-	}
-
-	// Similarity(1, 2): 0.5
-	// Intersection: {2, 3} -> 2
-	// Union: {1, 2, 3, 4} -> 4
-	// Similarity: 2/4 = 0.5
-	expectedSim12 := 0.5
-	actualSim12 := sm.Get(1, 2)
-	if math.Abs(actualSim12-expectedSim12) > 1e-10 {
-		t.Errorf("Similarity(1, 2): expected %f, got %f", expectedSim12, actualSim12)
-	}
-
-	// Check diagonal (similarity to self = 1.0)
-	if sm.Get(0, 0) != 1.0 {
-		t.Errorf("Diagonal should be 1.0, got %f", sm.Get(0, 0))
-	}
-	if sm.Get(1, 1) != 1.0 {
-		t.Errorf("Diagonal should be 1.0, got %f", sm.Get(1, 1))
-	}
-	if sm.Get(2, 2) != 1.0 {
-		t.Errorf("Diagonal should be 1.0, got %f", sm.Get(2, 2))
-	}
-}
-
-func TestKmerSetGroupJaccardMatricesRelation(t *testing.T) {
-	ksg := NewKmerSetGroup(5, 4)
-
-	// Create different sets
-	ksg.Get(0).AddKmerCode(1)
-	ksg.Get(0).AddKmerCode(2)
-
-	ksg.Get(1).AddKmerCode(2)
-	ksg.Get(1).AddKmerCode(3)
-
-	ksg.Get(2).AddKmerCode(1)
-	ksg.Get(2).AddKmerCode(2)
-	ksg.Get(2).AddKmerCode(3)
-
-	ksg.Get(3).AddKmerCode(10)
-	ksg.Get(3).AddKmerCode(20)
-
-	dm := ksg.JaccardDistanceMatrix()
-	sm := ksg.JaccardSimilarityMatrix()
-
-	// For all pairs (including diagonal), distance + similarity should equal 1.0
-	for i := 0; i < 4; i++ {
-		for j := 0; j < 4; j++ {
-			distance := dm.Get(i, j)
-			similarity := sm.Get(i, j)
-			sum := distance + similarity
-
-			if math.Abs(sum-1.0) > 1e-10 {
-				t.Errorf("At (%d, %d): distance %f + similarity %f = %f, expected 1.0",
-					i, j, distance, similarity, sum)
-			}
-		}
-	}
-}
-
-func TestKmerSetGroupJaccardMatrixLabels(t *testing.T) {
-	ksg := NewKmerSetGroup(5, 3)
-
-	// Don't set IDs - should use default labels
-	ksg.Get(0).AddKmerCode(1)
-	ksg.Get(1).AddKmerCode(2)
-	ksg.Get(2).AddKmerCode(3)
-
-	dm := ksg.JaccardDistanceMatrix()
-
-	// Check default labels
-	if dm.GetLabel(0) != "set_0" {
-		t.Errorf("Expected default label 'set_0', got '%s'", dm.GetLabel(0))
-	}
-	if dm.GetLabel(1) != "set_1" {
-		t.Errorf("Expected default label 'set_1', got '%s'", dm.GetLabel(1))
-	}
-	if dm.GetLabel(2) != "set_2" {
-		t.Errorf("Expected default label 'set_2', got '%s'", dm.GetLabel(2))
-	}
-}
-
-func TestKmerSetGroupJaccardMatrixSize(t *testing.T) {
-	ksg := NewKmerSetGroup(5, 5)
-
-	for i := 0; i < 5; i++ {
-		ksg.Get(i).AddKmerCode(uint64(i))
-	}
-
-	dm := ksg.JaccardDistanceMatrix()
-
-	if dm.Size() != 5 {
-		t.Errorf("Expected matrix size 5, got %d", dm.Size())
-	}
-
-	// All sets are disjoint, so all distances should be 1.0
-	for i := 0; i < 5; i++ {
-		for j := i + 1; j < 5; j++ {
-			dist := dm.Get(i, j)
-			if math.Abs(dist-1.0) > 1e-10 {
-				t.Errorf("Expected distance 1.0 for disjoint sets (%d, %d), got %f",
-					i, j, dist)
-			}
-		}
-	}
-}
--- a/pkg/obikmer/kmer_set_group_quorum.go
+++ b/pkg/obikmer/kmer_set_group_quorum.go
@@ -1,235 +0,0 @@
-package obikmer
-
-import (
-	"container/heap"
-
-	"github.com/RoaringBitmap/roaring/roaring64"
-)
-
-// heapItem represents an element in the min-heap for k-way merge
-type heapItem struct {
-	value uint64
-	idx   int
-}
-
-// kmerMinHeap implements heap.Interface for k-way merge algorithm
-type kmerMinHeap []heapItem
-
-func (h kmerMinHeap) Len() int           { return len(h) }
-func (h kmerMinHeap) Less(i, j int) bool { return h[i].value < h[j].value }
-func (h kmerMinHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
-
-func (h *kmerMinHeap) Push(x interface{}) {
-	*h = append(*h, x.(heapItem))
-}
-
-func (h *kmerMinHeap) Pop() interface{} {
-	old := *h
-	n := len(old)
-	x := old[n-1]
-	*h = old[0 : n-1]
-	return x
-}
-
-// QuorumAtLeast returns k-mers present in at least q sets
-//
-// Algorithm: K-way merge with min-heap counting
-//
-// The algorithm processes all k-mers in sorted order using a min-heap:
-//
-//  1. Initialize one iterator per non-empty set
-//  2. Build a min-heap of (value, set_index) pairs, one per iterator
-//  3. While heap is not empty:
-//     a. Extract the minimum value v from heap
-//     b. Pop ALL heap items with value == v (counting occurrences)
-//     c. If count >= q, add v to result
-//     d. Advance each popped iterator and re-insert into heap if valid
-//
-// This ensures each unique k-mer is counted exactly once across all sets.
-//
-// Time complexity: O(M log N)
-//   - M = sum of all set cardinalities (total k-mer occurrences)
-//   - N = number of sets
-//   - Each k-mer occurrence is inserted/extracted from heap once: O(M) operations
-//   - Each heap operation costs O(log N)
-//
-// Space complexity: O(N)
-//   - Heap contains at most N elements (one per set iterator)
-//   - Output bitmap size depends on quorum result
-//
-// Special cases (optimized):
-//   - q <= 0: returns empty set
-//   - q == 1: delegates to Union() (native OR operations)
-//   - q == n: delegates to Intersect() (native AND operations)
-//   - q > n: returns empty set (impossible to satisfy)
-func (ksg *KmerSetGroup) QuorumAtLeast(q int) *KmerSet {
-	n := len(ksg.sets)
-
-	// Edge cases
-	if q <= 0 || n == 0 {
-		return NewKmerSet(ksg.k)
-	}
-	if q > n {
-		return NewKmerSet(ksg.k)
-	}
-	if q == 1 {
-		return ksg.Union()
-	}
-	if q == n {
-		return ksg.Intersect()
-	}
-
-	// Initialize iterators for all non-empty sets
-	iterators := make([]roaring64.IntIterable64, 0, n)
-	iterIndices := make([]int, 0, n)
-
-	for i, set := range ksg.sets {
-		if set.Len() > 0 {
-			iter := set.bitmap.Iterator()
-			if iter.HasNext() {
-				iterators = append(iterators, iter)
-				iterIndices = append(iterIndices, i)
-			}
-		}
-	}
-
-	if len(iterators) == 0 {
-		return NewKmerSet(ksg.k)
-	}
-
-	// Initialize heap with first value from each iterator
-	h := make(kmerMinHeap, len(iterators))
-	for i, iter := range iterators {
-		h[i] = heapItem{value: iter.Next(), idx: i}
-	}
-	heap.Init(&h)
-
-	// Result bitmap
-	result := roaring64.New()
-
-	// K-way merge with counting
-	for len(h) > 0 {
-		minVal := h[0].value
-		count := 0
-		activeIndices := make([]int, 0, len(h))
-
-		// Pop all elements with same value (count occurrences)
-		for len(h) > 0 && h[0].value == minVal {
-			item := heap.Pop(&h).(heapItem)
-			count++
-			activeIndices = append(activeIndices, item.idx)
-		}
-
-		// Add to result if quorum reached
-		if count >= q {
-			result.Add(minVal)
-		}
-
-		// Advance iterators and re-insert into heap
-		for _, iterIdx := range activeIndices {
-			if iterators[iterIdx].HasNext() {
-				heap.Push(&h, heapItem{
-					value: iterators[iterIdx].Next(),
-					idx:   iterIdx,
-				})
-			}
-		}
-	}
-
-	return NewKmerSetFromBitmap(ksg.k, result)
-}
-
-// QuorumAtMost returns k-mers present in at most q sets
-//
-// Algorithm: Uses the mathematical identity
-//   AtMost(q) = Union() - AtLeast(q+1)
-//
-// Proof:
-//   - Union() contains all k-mers present in at least 1 set
-//   - AtLeast(q+1) contains all k-mers present in q+1 or more sets
-//   - Their difference contains only k-mers present in at most q sets
-//
-// Implementation:
-//  1. Compute U = Union()
-//  2. Compute A = QuorumAtLeast(q+1)
-//  3. Return U - A using bitmap AndNot operation
-//
-// Time complexity: O(M log N)
-//   - Union(): O(M) with native OR operations
-//   - QuorumAtLeast(q+1): O(M log N)
-//   - AndNot: O(|U|) where |U| <= M
-//   - Total: O(M log N)
-//
-// Space complexity: O(N)
-//   - Inherited from QuorumAtLeast heap
-//
-// Special cases:
-//   - q <= 0: returns empty set
-//   - q >= n: returns Union() (all k-mers are in at most n sets)
-func (ksg *KmerSetGroup) QuorumAtMost(q int) *KmerSet {
-	n := len(ksg.sets)
-
-	// Edge cases
-	if q <= 0 {
-		return NewKmerSet(ksg.k)
-	}
-	if q >= n {
-		return ksg.Union()
-	}
-
-	// Compute Union() - AtLeast(q+1)
-	union := ksg.Union()
-	atLeastQ1 := ksg.QuorumAtLeast(q + 1)
-
-	// Difference: elements in union but not in atLeastQ1
-	result := union.bitmap.Clone()
-	result.AndNot(atLeastQ1.bitmap)
-
-	return NewKmerSetFromBitmap(ksg.k, result)
-}
-
-// QuorumExactly returns k-mers present in exactly q sets
-//
-// Algorithm: Uses the mathematical identity
-//   Exactly(q) = AtLeast(q) - AtLeast(q+1)
-//
-// Proof:
-//   - AtLeast(q) contains all k-mers present in q or more sets
-//   - AtLeast(q+1) contains all k-mers present in q+1 or more sets
-//   - Their difference contains only k-mers present in exactly q sets
-//
-// Implementation:
-//  1. Compute A = QuorumAtLeast(q)
-//  2. Compute B = QuorumAtLeast(q+1)
-//  3. Return A - B using bitmap AndNot operation
-//
-// Time complexity: O(M log N)
-//   - Two calls to QuorumAtLeast: 2 * O(M log N)
-//   - One AndNot operation: O(|A|) where |A| <= M
-//   - Total: O(M log N) since AndNot is dominated by merge operations
-//
-// Space complexity: O(N)
-//   - Inherited from QuorumAtLeast heap
-//   - Two temporary bitmaps for intermediate results
-//
-// Special cases:
-//   - q <= 0: returns empty set
-//   - q > n: returns empty set (impossible to have k-mer in more than n sets)
-func (ksg *KmerSetGroup) QuorumExactly(q int) *KmerSet {
-	n := len(ksg.sets)
-
-	// Edge cases
-	if q <= 0 || q > n {
-		return NewKmerSet(ksg.k)
-	}
-
-	// Compute AtLeast(q) - AtLeast(q+1)
-	aq := ksg.QuorumAtLeast(q)
-	aq1 := ksg.QuorumAtLeast(q + 1)
-
-	// Difference: elements in aq but not in aq1
-	result := aq.bitmap.Clone()
-	result.AndNot(aq1.bitmap)
-
-	return NewKmerSetFromBitmap(ksg.k, result)
-}
--- a/pkg/obikmer/kmer_set_group_quorum_test.go
+++ b/pkg/obikmer/kmer_set_group_quorum_test.go
@@ -1,395 +0,0 @@
-package obikmer
-
-import (
-	"testing"
-)
-
-// TestQuorumAtLeastEdgeCases tests edge cases for QuorumAtLeast
-func TestQuorumAtLeastEdgeCases(t *testing.T) {
-	k := 5
-
-	// Test group with all empty sets
-	emptyGroup := NewKmerSetGroup(k, 3)
-	result := emptyGroup.QuorumAtLeast(1)
-	if result.Len() != 0 {
-		t.Errorf("Empty sets: expected 0 k-mers, got %d", result.Len())
-	}
-
-	// Test q <= 0
-	group := NewKmerSetGroup(k, 3)
-	result = group.QuorumAtLeast(0)
-	if result.Len() != 0 {
-		t.Errorf("q=0: expected 0 k-mers, got %d", result.Len())
-	}
-
-	result = group.QuorumAtLeast(-1)
-	if result.Len() != 0 {
-		t.Errorf("q=-1: expected 0 k-mers, got %d", result.Len())
-	}
-
-	// Test q > n
-	group.Get(0).AddKmerCode(1)
-	result = group.QuorumAtLeast(10)
-	if result.Len() != 0 {
-		t.Errorf("q>n: expected 0 k-mers, got %d", result.Len())
-	}
-}
-
-// TestQuorumAtLeastQ1 tests q=1 (should equal Union)
-func TestQuorumAtLeastQ1(t *testing.T) {
-	k := 5
-	group := NewKmerSetGroup(k, 3)
-
-	// Add different k-mers to each set
-	group.Get(0).AddKmerCode(1)
-	group.Get(0).AddKmerCode(2)
-	group.Get(1).AddKmerCode(2)
-	group.Get(1).AddKmerCode(3)
-	group.Get(2).AddKmerCode(3)
-	group.Get(2).AddKmerCode(4)
-
-	quorum := group.QuorumAtLeast(1)
-	union := group.Union()
-
-	if quorum.Len() != union.Len() {
-		t.Errorf("QuorumAtLeast(1) length %d != Union length %d", quorum.Len(), union.Len())
-	}
-
-	// Check all elements match
-	for kmer := uint64(1); kmer <= 4; kmer++ {
-		if quorum.Contains(kmer) != union.Contains(kmer) {
-			t.Errorf("Mismatch for k-mer %d", kmer)
-		}
-	}
-}
-
-// TestQuorumAtLeastQN tests q=n (should equal Intersect)
-func TestQuorumAtLeastQN(t *testing.T) {
-	k := 5
-	group := NewKmerSetGroup(k, 3)
-
-	// Add some common k-mers and some unique
-	for i := 0; i < 3; i++ {
-		group.Get(i).AddKmerCode(10) // common to all
-		group.Get(i).AddKmerCode(20) // common to all
-	}
-	group.Get(0).AddKmerCode(1) // unique to set 0
-	group.Get(1).AddKmerCode(2) // unique to set 1
-
-	quorum := group.QuorumAtLeast(3)
-	intersect := group.Intersect()
-
-	if quorum.Len() != intersect.Len() {
-		t.Errorf("QuorumAtLeast(n) length %d != Intersect length %d", quorum.Len(), intersect.Len())
-	}
-
-	if quorum.Len() != 2 {
-		t.Errorf("Expected 2 common k-mers, got %d", quorum.Len())
-	}
-
-	if !quorum.Contains(10) || !quorum.Contains(20) {
-		t.Error("Missing common k-mers")
-	}
-
-	if quorum.Contains(1) || quorum.Contains(2) {
-		t.Error("Unique k-mers should not be in result")
-	}
-}
-
-// TestQuorumAtLeastGeneral tests general quorum values
-func TestQuorumAtLeastGeneral(t *testing.T) {
-	k := 5
-	group := NewKmerSetGroup(k, 5)
-
-	// Setup: k-mer i appears in i sets (for i=1..5)
-	// k-mer 1: in set 0
-	// k-mer 2: in sets 0,1
-	// k-mer 3: in sets 0,1,2
-	// k-mer 4: in sets 0,1,2,3
-	// k-mer 5: in sets 0,1,2,3,4 (all)
-
-	for kmer := uint64(1); kmer <= 5; kmer++ {
-		for setIdx := 0; setIdx < int(kmer); setIdx++ {
-			group.Get(setIdx).AddKmerCode(kmer)
-		}
-	}
-
-	tests := []struct {
-		q        int
-		expected map[uint64]bool
-	}{
-		{1, map[uint64]bool{1: true, 2: true, 3: true, 4: true, 5: true}},
-		{2, map[uint64]bool{2: true, 3: true, 4: true, 5: true}},
-		{3, map[uint64]bool{3: true, 4: true, 5: true}},
-		{4, map[uint64]bool{4: true, 5: true}},
-		{5, map[uint64]bool{5: true}},
-	}
-
-	for _, tt := range tests {
-		result := group.QuorumAtLeast(tt.q)
-
-		if result.Len() != uint64(len(tt.expected)) {
-			t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len())
-		}
-
-		for kmer := uint64(1); kmer <= 5; kmer++ {
-			shouldContain := tt.expected[kmer]
-			doesContain := result.Contains(kmer)
-			if shouldContain != doesContain {
-				t.Errorf("q=%d, k-mer=%d: expected contains=%v, got %v", tt.q, kmer, shouldContain, doesContain)
-			}
-		}
-	}
-}
-
-// TestQuorumExactlyBasic tests QuorumExactly basic functionality
-func TestQuorumExactlyBasic(t *testing.T) {
-	k := 5
-	group := NewKmerSetGroup(k, 5)
-
-	// Setup: k-mer i appears in exactly i sets
-	for kmer := uint64(1); kmer <= 5; kmer++ {
-		for setIdx := 0; setIdx < int(kmer); setIdx++ {
-			group.Get(setIdx).AddKmerCode(kmer)
-		}
-	}
-
-	tests := []struct {
-		q        int
-		expected []uint64
-	}{
-		{1, []uint64{1}},
-		{2, []uint64{2}},
-		{3, []uint64{3}},
-		{4, []uint64{4}},
-		{5, []uint64{5}},
-	}
-
-	for _, tt := range tests {
-		result := group.QuorumExactly(tt.q)
-
-		if result.Len() != uint64(len(tt.expected)) {
-			t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len())
-		}
-
-		for _, kmer := range tt.expected {
-			if !result.Contains(kmer) {
-				t.Errorf("q=%d: missing k-mer %d", tt.q, kmer)
-			}
-		}
-	}
-}
-
-// TestQuorumIdentity tests the mathematical identity: Exactly(q) = AtLeast(q) - AtLeast(q+1)
-func TestQuorumIdentity(t *testing.T) {
-	k := 5
-	group := NewKmerSetGroup(k, 4)
-
-	// Add random distribution
-	group.Get(0).AddKmerCode(1)
-	group.Get(0).AddKmerCode(2)
-	group.Get(0).AddKmerCode(3)
-
-	group.Get(1).AddKmerCode(2)
-	group.Get(1).AddKmerCode(3)
-	group.Get(1).AddKmerCode(4)
-
-	group.Get(2).AddKmerCode(3)
-	group.Get(2).AddKmerCode(4)
-
-	group.Get(3).AddKmerCode(4)
-
-	for q := 1; q <= 4; q++ {
-		exactly := group.QuorumExactly(q)
-		atLeast := group.QuorumAtLeast(q)
-		atLeastPlus1 := group.QuorumAtLeast(q + 1)
-
-		// Verify: every element in exactly(q) is in atLeast(q)
-		iter := exactly.Iterator()
-		for iter.HasNext() {
-			kmer := iter.Next()
-			if !atLeast.Contains(kmer) {
-				t.Errorf("q=%d: k-mer %d in Exactly but not in AtLeast", q, kmer)
-			}
-			if atLeastPlus1.Contains(kmer) {
-				t.Errorf("q=%d: k-mer %d in Exactly but also in AtLeast(q+1)", q, kmer)
-			}
-		}
-	}
-}
-
-// TestQuorumDisjointSets tests quorum on completely disjoint sets
-func TestQuorumDisjointSets(t *testing.T) {
-	k := 5
-	group := NewKmerSetGroup(k, 3)
-
-	// Each set has unique k-mers
-	group.Get(0).AddKmerCode(1)
-	group.Get(1).AddKmerCode(2)
-	group.Get(2).AddKmerCode(3)
-
-	// q=1 should give all
-	result := group.QuorumAtLeast(1)
-	if result.Len() != 3 {
-		t.Errorf("Disjoint sets q=1: expected 3, got %d", result.Len())
-	}
-
-	// q=2 should give none
-	result = group.QuorumAtLeast(2)
-	if result.Len() != 0 {
-		t.Errorf("Disjoint sets q=2: expected 0, got %d", result.Len())
-	}
-}
-
-// TestQuorumIdenticalSets tests quorum on identical sets
-func TestQuorumIdenticalSets(t *testing.T) {
-	k := 5
-	group := NewKmerSetGroup(k, 3)
-
-	// All sets have same k-mers
-	for i := 0; i < 3; i++ {
-		group.Get(i).AddKmerCode(10)
-		group.Get(i).AddKmerCode(20)
-		group.Get(i).AddKmerCode(30)
-	}
-
-	// Any q <= n should give all k-mers
-	for q := 1; q <= 3; q++ {
-		result := group.QuorumAtLeast(q)
-		if result.Len() != 3 {
-			t.Errorf("Identical sets q=%d: expected 3, got %d", q, result.Len())
-		}
-	}
-}
-
-// TestQuorumLargeNumbers tests with large k-mer values
-func TestQuorumLargeNumbers(t *testing.T) {
-	k := 21
-	group := NewKmerSetGroup(k, 3)
-
-	// Use large uint64 values (actual k-mer encodings)
-	largeKmers := []uint64{
-		0x1234567890ABCDEF,
-		0xFEDCBA0987654321,
-		0xAAAAAAAAAAAAAAAA,
-	}
-
-	// Add to multiple sets
-	for i := 0; i < 3; i++ {
-		for j := 0; j <= i; j++ {
-			group.Get(j).AddKmerCode(largeKmers[i])
-		}
-	}
-
-	result := group.QuorumAtLeast(2)
-	if result.Len() != 2 {
-		t.Errorf("Large numbers q=2: expected 2, got %d", result.Len())
-	}
-
-	if !result.Contains(largeKmers[1]) || !result.Contains(largeKmers[2]) {
-		t.Error("Large numbers: wrong k-mers in result")
-	}
-}
-
-// TestQuorumAtMostBasic tests QuorumAtMost basic functionality
-func TestQuorumAtMostBasic(t *testing.T) {
-	k := 5
-	group := NewKmerSetGroup(k, 5)
-
-	// Setup: k-mer i appears in exactly i sets
-	for kmer := uint64(1); kmer <= 5; kmer++ {
-		for setIdx := 0; setIdx < int(kmer); setIdx++ {
-			group.Get(setIdx).AddKmerCode(kmer)
-		}
-	}
-
-	tests := []struct {
-		q        int
-		expected []uint64
-	}{
-		{0, []uint64{}},                          // at most 0: none
-		{1, []uint64{1}},                         // at most 1: only k-mer 1
-		{2, []uint64{1, 2}},                      // at most 2: k-mers 1,2
-		{3, []uint64{1, 2, 3}},                   // at most 3: k-mers 1,2,3
-		{4, []uint64{1, 2, 3, 4}},                // at most 4: k-mers 1,2,3,4
-		{5, []uint64{1, 2, 3, 4, 5}},             // at most 5: all k-mers
-		{10, []uint64{1, 2, 3, 4, 5}},            // at most 10: all k-mers
-	}
-
-	for _, tt := range tests {
-		result := group.QuorumAtMost(tt.q)
-
-		if result.Len() != uint64(len(tt.expected)) {
-			t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len())
-		}
-
-		for _, kmer := range tt.expected {
-			if !result.Contains(kmer) {
-				t.Errorf("q=%d: missing k-mer %d", tt.q, kmer)
-			}
-		}
-	}
-}
-
-// TestQuorumComplementIdentity tests that AtLeast and AtMost are complementary
-func TestQuorumComplementIdentity(t *testing.T) {
-	k := 5
-	group := NewKmerSetGroup(k, 4)
-
-	// Add random distribution
-	group.Get(0).AddKmerCode(1)
-	group.Get(0).AddKmerCode(2)
-	group.Get(0).AddKmerCode(3)
-
-	group.Get(1).AddKmerCode(2)
-	group.Get(1).AddKmerCode(3)
-	group.Get(1).AddKmerCode(4)
-
-	group.Get(2).AddKmerCode(3)
-	group.Get(2).AddKmerCode(4)
-
-	group.Get(3).AddKmerCode(4)
-
-	union := group.Union()
-
-	for q := 1; q < 4; q++ {
-		atMost := group.QuorumAtMost(q)
-		atLeast := group.QuorumAtLeast(q + 1)
-
-		// Verify: AtMost(q) ∪ AtLeast(q+1) = Union()
-		combined := atMost.Union(atLeast)
-
-		if combined.Len() != union.Len() {
-			t.Errorf("q=%d: AtMost(q) ∪ AtLeast(q+1) has %d k-mers, Union has %d",
-				q, combined.Len(), union.Len())
-		}
-
-		// Verify: AtMost(q) ∩ AtLeast(q+1) = ∅
-		overlap := atMost.Intersect(atLeast)
-		if overlap.Len() != 0 {
-			t.Errorf("q=%d: AtMost(q) and AtLeast(q+1) overlap with %d k-mers",
-				q, overlap.Len())
-		}
-	}
-}
-
-// BenchmarkQuorumAtLeast benchmarks quorum operations
-func BenchmarkQuorumAtLeast(b *testing.B) {
-	k := 21
-	n := 10
-	group := NewKmerSetGroup(k, n)
-
-	// Populate with realistic data
-	for i := 0; i < n; i++ {
-		for j := uint64(0); j < 10000; j++ {
-			if (j % uint64(n)) <= uint64(i) {
-				group.Get(i).AddKmerCode(j)
-			}
-		}
-	}
-
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		_ = group.QuorumAtLeast(5)
-	}
-}
--- a/pkg/obikmer/kmer_set_persistence.go
+++ b/pkg/obikmer/kmer_set_persistence.go
@@ -1,376 +0,0 @@
-package obikmer
-
-import (
-	"encoding/json"
-	"fmt"
-	"os"
-	"path/filepath"
-	"strings"
-
-	"github.com/pelletier/go-toml/v2"
-	"gopkg.in/yaml.v3"
-)
-
-// MetadataFormat represents the metadata serialization format
-type MetadataFormat int
-
-const (
-	FormatTOML MetadataFormat = iota
-	FormatYAML
-	FormatJSON
-)
-
-// String returns the file extension for the format
-func (f MetadataFormat) String() string {
-	switch f {
-	case FormatTOML:
-		return "toml"
-	case FormatYAML:
-		return "yaml"
-	case FormatJSON:
-		return "json"
-	default:
-		return "toml"
-	}
-}
-
-// KmerSetMetadata contient les métadonnées d'un KmerSet ou KmerSetGroup
-type KmerSetMetadata struct {
-	ID            string                   `toml:"id,omitempty" yaml:"id,omitempty" json:"id,omitempty"`                        // Identifiant unique
-	K             int                      `toml:"k" yaml:"k" json:"k"`                                                         // Taille des k-mers
-	Type          string                   `toml:"type" yaml:"type" json:"type"`                                                // "KmerSet" ou "KmerSetGroup"
-	Size          int                      `toml:"size" yaml:"size" json:"size"`                                                // 1 pour KmerSet, n pour KmerSetGroup
-	Files         []string                 `toml:"files" yaml:"files" json:"files"`                                             // Liste des fichiers .roaring
-	SetsIDs       []string                 `toml:"sets_ids,omitempty" yaml:"sets_ids,omitempty" json:"sets_ids,omitempty"`      // IDs des KmerSet individuels
-	UserMetadata  map[string]interface{}   `toml:"user_metadata,omitempty" yaml:"user_metadata,omitempty" json:"user_metadata,omitempty"`         // Métadonnées KmerSet ou KmerSetGroup
-	SetsMetadata  []map[string]interface{} `toml:"sets_metadata,omitempty" yaml:"sets_metadata,omitempty" json:"sets_metadata,omitempty"`         // Métadonnées des KmerSet individuels dans un KmerSetGroup
-}
-
-// SaveKmerSet sauvegarde un KmerSet dans un répertoire
-// Format: directory/metadata.{toml,yaml,json} + directory/set_0.roaring
-func (ks *KmerSet) Save(directory string, format MetadataFormat) error {
-	// Créer le répertoire si nécessaire
-	if err := os.MkdirAll(directory, 0755); err != nil {
-		return fmt.Errorf("failed to create directory %s: %w", directory, err)
-	}
-
-	// Métadonnées
-	metadata := KmerSetMetadata{
-		ID:           ks.id,
-		K:            ks.k,
-		Type:         "KmerSet",
-		Size:         1,
-		Files:        []string{"set_0.roaring"},
-		UserMetadata: ks.Metadata, // Sauvegarder les métadonnées utilisateur
-	}
-
-	// Sauvegarder les métadonnées
-	if err := saveMetadata(filepath.Join(directory, "metadata."+format.String()), metadata, format); err != nil {
-		return err
-	}
-
-	// Sauvegarder le bitmap
-	bitmapPath := filepath.Join(directory, "set_0.roaring")
-	file, err := os.Create(bitmapPath)
-	if err != nil {
-		return fmt.Errorf("failed to create bitmap file %s: %w", bitmapPath, err)
-	}
-	defer file.Close()
-
-	if _, err := ks.bitmap.WriteTo(file); err != nil {
-		return fmt.Errorf("failed to write bitmap: %w", err)
-	}
-
-	return nil
-}
-
-// LoadKmerSet charge un KmerSet depuis un répertoire
-func LoadKmerSet(directory string) (*KmerSet, error) {
-	// Lire les métadonnées (essayer tous les formats)
-	metadata, err := loadMetadata(directory)
-	if err != nil {
-		return nil, err
-	}
-
-	// Vérifier le type
-	if metadata.Type != "KmerSet" {
-		return nil, fmt.Errorf("invalid type: expected KmerSet, got %s", metadata.Type)
-	}
-
-	// Vérifier qu'il n'y a qu'un seul fichier
-	if metadata.Size != 1 || len(metadata.Files) != 1 {
-		return nil, fmt.Errorf("KmerSet must have exactly 1 bitmap file, got %d", len(metadata.Files))
-	}
-
-	// Charger le bitmap
-	bitmapPath := filepath.Join(directory, metadata.Files[0])
-	file, err := os.Open(bitmapPath)
-	if err != nil {
-		return nil, fmt.Errorf("failed to open bitmap file %s: %w", bitmapPath, err)
-	}
-	defer file.Close()
-
-	ks := NewKmerSet(metadata.K)
-
-	// Charger l'ID
-	ks.id = metadata.ID
-
-	// Charger les métadonnées utilisateur
-	if metadata.UserMetadata != nil {
-		ks.Metadata = metadata.UserMetadata
-	}
-
-	if _, err := ks.bitmap.ReadFrom(file); err != nil {
-		return nil, fmt.Errorf("failed to read bitmap: %w", err)
-	}
-
-	return ks, nil
-}
-
-// SaveKmerSetGroup sauvegarde un KmerSetGroup dans un répertoire
-// Format: directory/metadata.{toml,yaml,json} + directory/set_0.roaring, set_1.roaring, ...
-func (ksg *KmerSetGroup) Save(directory string, format MetadataFormat) error {
-	// Créer le répertoire si nécessaire
-	if err := os.MkdirAll(directory, 0755); err != nil {
-		return fmt.Errorf("failed to create directory %s: %w", directory, err)
-	}
-
-	// Métadonnées
-	files := make([]string, len(ksg.sets))
-	for i := range ksg.sets {
-		files[i] = fmt.Sprintf("set_%d.roaring", i)
-	}
-
-	// Collecter les IDs et métadonnées de chaque KmerSet individuel
-	setsIDs := make([]string, len(ksg.sets))
-	setsMetadata := make([]map[string]interface{}, len(ksg.sets))
-	for i, ks := range ksg.sets {
-		setsIDs[i] = ks.id
-		setsMetadata[i] = ks.Metadata
-	}
-
-	metadata := KmerSetMetadata{
-		ID:           ksg.id,
-		K:            ksg.k,
-		Type:         "KmerSetGroup",
-		Size:         len(ksg.sets),
-		Files:        files,
-		SetsIDs:      setsIDs,          // IDs de chaque set
-		UserMetadata: ksg.Metadata,     // Métadonnées du groupe
-		SetsMetadata: setsMetadata,     // Métadonnées de chaque set
-	}
-
-	// Sauvegarder les métadonnées
-	if err := saveMetadata(filepath.Join(directory, "metadata."+format.String()), metadata, format); err != nil {
-		return err
-	}
-
-	// Sauvegarder chaque bitmap
-	for i, ks := range ksg.sets {
-		bitmapPath := filepath.Join(directory, files[i])
-		file, err := os.Create(bitmapPath)
-		if err != nil {
-			return fmt.Errorf("failed to create bitmap file %s: %w", bitmapPath, err)
-		}
-
-		if _, err := ks.bitmap.WriteTo(file); err != nil {
-			file.Close()
-			return fmt.Errorf("failed to write bitmap %d: %w", i, err)
-		}
-		file.Close()
-	}
-
-	return nil
-}
-
-// LoadKmerSetGroup charge un KmerSetGroup depuis un répertoire
-func LoadKmerSetGroup(directory string) (*KmerSetGroup, error) {
-	// Lire les métadonnées (essayer tous les formats)
-	metadata, err := loadMetadata(directory)
-	if err != nil {
-		return nil, err
-	}
-
-	// Vérifier le type
-	if metadata.Type != "KmerSetGroup" {
-		return nil, fmt.Errorf("invalid type: expected KmerSetGroup, got %s", metadata.Type)
-	}
-
-	// Vérifier la cohérence
-	if metadata.Size != len(metadata.Files) {
-		return nil, fmt.Errorf("size mismatch: size=%d but %d files listed", metadata.Size, len(metadata.Files))
-	}
-
-	// Créer le groupe
-	ksg := NewKmerSetGroup(metadata.K, metadata.Size)
-
-	// Charger l'ID du groupe
-	ksg.id = metadata.ID
-
-	// Charger les métadonnées du groupe
-	if metadata.UserMetadata != nil {
-		ksg.Metadata = metadata.UserMetadata
-	}
-
-	// Charger les IDs de chaque KmerSet
-	if metadata.SetsIDs != nil && len(metadata.SetsIDs) == metadata.Size {
-		for i := range ksg.sets {
-			ksg.sets[i].id = metadata.SetsIDs[i]
-		}
-	}
-
-	// Charger les métadonnées de chaque KmerSet individuel
-	if metadata.SetsMetadata != nil {
-		if len(metadata.SetsMetadata) != metadata.Size {
-			return nil, fmt.Errorf("sets metadata size mismatch: expected %d, got %d", metadata.Size, len(metadata.SetsMetadata))
-		}
-		for i := range ksg.sets {
-			ksg.sets[i].Metadata = metadata.SetsMetadata[i]
-		}
-	}
-
-	// Charger chaque bitmap
-	for i, filename := range metadata.Files {
-		bitmapPath := filepath.Join(directory, filename)
-		file, err := os.Open(bitmapPath)
-		if err != nil {
-			return nil, fmt.Errorf("failed to open bitmap file %s: %w", bitmapPath, err)
-		}
-
-		if _, err := ksg.sets[i].bitmap.ReadFrom(file); err != nil {
-			file.Close()
-			return nil, fmt.Errorf("failed to read bitmap %d: %w", i, err)
-		}
-		file.Close()
-	}
-
-	return ksg, nil
-}
-
-// saveMetadata sauvegarde les métadonnées dans le format spécifié
-func saveMetadata(path string, metadata KmerSetMetadata, format MetadataFormat) error {
-	file, err := os.Create(path)
-	if err != nil {
-		return fmt.Errorf("failed to create metadata file %s: %w", path, err)
-	}
-	defer file.Close()
-
-	var encoder interface{ Encode(interface{}) error }
-
-	switch format {
-	case FormatTOML:
-		encoder = toml.NewEncoder(file)
-	case FormatYAML:
-		encoder = yaml.NewEncoder(file)
-	case FormatJSON:
-		jsonEncoder := json.NewEncoder(file)
-		jsonEncoder.SetIndent("", "  ")
-		encoder = jsonEncoder
-	default:
-		return fmt.Errorf("unsupported format: %v", format)
-	}
-
-	if err := encoder.Encode(metadata); err != nil {
-		return fmt.Errorf("failed to encode metadata: %w", err)
-	}
-
-	return nil
-}
-
-// loadMetadata charge les métadonnées depuis un répertoire
-// Essaie tous les formats (TOML, YAML, JSON) dans l'ordre
-func loadMetadata(directory string) (*KmerSetMetadata, error) {
-	formats := []MetadataFormat{FormatTOML, FormatYAML, FormatJSON}
-
-	var lastErr error
-	for _, format := range formats {
-		path := filepath.Join(directory, "metadata."+format.String())
-
-		// Vérifier si le fichier existe
-		if _, err := os.Stat(path); os.IsNotExist(err) {
-			continue
-		}
-
-		metadata, err := loadMetadataFromFile(path, format)
-		if err != nil {
-			lastErr = err
-			continue
-		}
-		return metadata, nil
-	}
-
-	if lastErr != nil {
-		return nil, fmt.Errorf("failed to load metadata: %w", lastErr)
-	}
-	return nil, fmt.Errorf("no metadata file found in %s (tried .toml, .yaml, .json)", directory)
-}
-
-// loadMetadataFromFile charge les métadonnées depuis un fichier spécifique
-func loadMetadataFromFile(path string, format MetadataFormat) (*KmerSetMetadata, error) {
-	file, err := os.Open(path)
-	if err != nil {
-		return nil, fmt.Errorf("failed to open metadata file %s: %w", path, err)
-	}
-	defer file.Close()
-
-	var metadata KmerSetMetadata
-	var decoder interface{ Decode(interface{}) error }
-
-	switch format {
-	case FormatTOML:
-		decoder = toml.NewDecoder(file)
-	case FormatYAML:
-		decoder = yaml.NewDecoder(file)
-	case FormatJSON:
-		decoder = json.NewDecoder(file)
-	default:
-		return nil, fmt.Errorf("unsupported format: %v", format)
-	}
-
-	if err := decoder.Decode(&metadata); err != nil {
-		return nil, fmt.Errorf("failed to decode metadata: %w", err)
-	}
-
-	return &metadata, nil
-}
-
-// DetectFormat détecte le format des métadonnées dans un répertoire
-func DetectFormat(directory string) (MetadataFormat, error) {
-	formats := []MetadataFormat{FormatTOML, FormatYAML, FormatJSON}
-
-	for _, format := range formats {
-		path := filepath.Join(directory, "metadata."+format.String())
-		if _, err := os.Stat(path); err == nil {
-			return format, nil
-		}
-	}
-
-	return FormatTOML, fmt.Errorf("no metadata file found in %s", directory)
-}
-
-// IsKmerSetDirectory vérifie si un répertoire contient un KmerSet ou KmerSetGroup
-func IsKmerSetDirectory(directory string) (bool, string, error) {
-	metadata, err := loadMetadata(directory)
-	if err != nil {
-		return false, "", err
-	}
-
-	return true, metadata.Type, nil
-}
-
-// ListBitmapFiles liste tous les fichiers .roaring dans un répertoire
-func ListBitmapFiles(directory string) ([]string, error) {
-	entries, err := os.ReadDir(directory)
-	if err != nil {
-		return nil, fmt.Errorf("failed to read directory %s: %w", directory, err)
-	}
-
-	var files []string
-	for _, entry := range entries {
-		if !entry.IsDir() && strings.HasSuffix(entry.Name(), ".roaring") {
-			files = append(files, entry.Name())
-		}
-	}
-
-	return files, nil
-}
--- a/pkg/obikmer/kmer_set_test.go
+++ b/pkg/obikmer/kmer_set_test.go
@@ -1,272 +0,0 @@
-package obikmer
-
-import (
-	"math"
-	"testing"
-)
-
-func TestJaccardDistanceIdentical(t *testing.T) {
-	ks1 := NewKmerSet(5)
-	ks1.AddKmerCode(100)
-	ks1.AddKmerCode(200)
-	ks1.AddKmerCode(300)
-
-	ks2 := NewKmerSet(5)
-	ks2.AddKmerCode(100)
-	ks2.AddKmerCode(200)
-	ks2.AddKmerCode(300)
-
-	distance := ks1.JaccardDistance(ks2)
-	similarity := ks1.JaccardSimilarity(ks2)
-
-	if distance != 0.0 {
-		t.Errorf("Expected distance 0.0 for identical sets, got %f", distance)
-	}
-
-	if similarity != 1.0 {
-		t.Errorf("Expected similarity 1.0 for identical sets, got %f", similarity)
-	}
-}
-
-func TestJaccardDistanceDisjoint(t *testing.T) {
-	ks1 := NewKmerSet(5)
-	ks1.AddKmerCode(100)
-	ks1.AddKmerCode(200)
-	ks1.AddKmerCode(300)
-
-	ks2 := NewKmerSet(5)
-	ks2.AddKmerCode(400)
-	ks2.AddKmerCode(500)
-	ks2.AddKmerCode(600)
-
-	distance := ks1.JaccardDistance(ks2)
-	similarity := ks1.JaccardSimilarity(ks2)
-
-	if distance != 1.0 {
-		t.Errorf("Expected distance 1.0 for disjoint sets, got %f", distance)
-	}
-
-	if similarity != 0.0 {
-		t.Errorf("Expected similarity 0.0 for disjoint sets, got %f", similarity)
-	}
-}
-
-func TestJaccardDistancePartialOverlap(t *testing.T) {
-	// Set 1: {1, 2, 3}
-	ks1 := NewKmerSet(5)
-	ks1.AddKmerCode(1)
-	ks1.AddKmerCode(2)
-	ks1.AddKmerCode(3)
-
-	// Set 2: {2, 3, 4}
-	ks2 := NewKmerSet(5)
-	ks2.AddKmerCode(2)
-	ks2.AddKmerCode(3)
-	ks2.AddKmerCode(4)
-
-	// Intersection: {2, 3} -> cardinality = 2
-	// Union: {1, 2, 3, 4} -> cardinality = 4
-	// Similarity = 2/4 = 0.5
-	// Distance = 1 - 0.5 = 0.5
-
-	distance := ks1.JaccardDistance(ks2)
-	similarity := ks1.JaccardSimilarity(ks2)
-
-	expectedDistance := 0.5
-	expectedSimilarity := 0.5
-
-	if math.Abs(distance-expectedDistance) > 1e-10 {
-		t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
-	}
-
-	if math.Abs(similarity-expectedSimilarity) > 1e-10 {
-		t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
-	}
-}
-
-func TestJaccardDistanceOneSubsetOfOther(t *testing.T) {
-	// Set 1: {1, 2}
-	ks1 := NewKmerSet(5)
-	ks1.AddKmerCode(1)
-	ks1.AddKmerCode(2)
-
-	// Set 2: {1, 2, 3, 4}
-	ks2 := NewKmerSet(5)
-	ks2.AddKmerCode(1)
-	ks2.AddKmerCode(2)
-	ks2.AddKmerCode(3)
-	ks2.AddKmerCode(4)
-
-	// Intersection: {1, 2} -> cardinality = 2
-	// Union: {1, 2, 3, 4} -> cardinality = 4
-	// Similarity = 2/4 = 0.5
-	// Distance = 1 - 0.5 = 0.5
-
-	distance := ks1.JaccardDistance(ks2)
-	similarity := ks1.JaccardSimilarity(ks2)
-
-	expectedDistance := 0.5
-	expectedSimilarity := 0.5
-
-	if math.Abs(distance-expectedDistance) > 1e-10 {
-		t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
-	}
-
-	if math.Abs(similarity-expectedSimilarity) > 1e-10 {
-		t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
-	}
-}
-
-func TestJaccardDistanceEmptySets(t *testing.T) {
-	ks1 := NewKmerSet(5)
-	ks2 := NewKmerSet(5)
-
-	distance := ks1.JaccardDistance(ks2)
-	similarity := ks1.JaccardSimilarity(ks2)
-
-	// By convention, distance = 1.0 for empty sets
-	if distance != 1.0 {
-		t.Errorf("Expected distance 1.0 for empty sets, got %f", distance)
-	}
-
-	if similarity != 0.0 {
-		t.Errorf("Expected similarity 0.0 for empty sets, got %f", similarity)
-	}
-}
-
-func TestJaccardDistanceOneEmpty(t *testing.T) {
-	ks1 := NewKmerSet(5)
-	ks1.AddKmerCode(1)
-	ks1.AddKmerCode(2)
-	ks1.AddKmerCode(3)
-
-	ks2 := NewKmerSet(5)
-
-	distance := ks1.JaccardDistance(ks2)
-	similarity := ks1.JaccardSimilarity(ks2)
-
-	// Intersection: {} -> cardinality = 0
-	// Union: {1, 2, 3} -> cardinality = 3
-	// Similarity = 0/3 = 0.0
-	// Distance = 1.0
-
-	if distance != 1.0 {
-		t.Errorf("Expected distance 1.0 when one set is empty, got %f", distance)
-	}
-
-	if similarity != 0.0 {
-		t.Errorf("Expected similarity 0.0 when one set is empty, got %f", similarity)
-	}
-}
-
-func TestJaccardDistanceDifferentK(t *testing.T) {
-	ks1 := NewKmerSet(5)
-	ks1.AddKmerCode(1)
-
-	ks2 := NewKmerSet(7)
-	ks2.AddKmerCode(1)
-
-	defer func() {
-		if r := recover(); r == nil {
-			t.Errorf("Expected panic when computing Jaccard distance with different k values")
-		}
-	}()
-
-	_ = ks1.JaccardDistance(ks2)
-}
-
-func TestJaccardDistanceSimilarityRelation(t *testing.T) {
-	// Test that distance + similarity = 1.0 for all cases
-	testCases := []struct {
-		name string
-		ks1  *KmerSet
-		ks2  *KmerSet
-	}{
-		{
-			name: "partial overlap",
-			ks1: func() *KmerSet {
-				ks := NewKmerSet(5)
-				ks.AddKmerCode(1)
-				ks.AddKmerCode(2)
-				ks.AddKmerCode(3)
-				return ks
-			}(),
-			ks2: func() *KmerSet {
-				ks := NewKmerSet(5)
-				ks.AddKmerCode(2)
-				ks.AddKmerCode(3)
-				ks.AddKmerCode(4)
-				ks.AddKmerCode(5)
-				return ks
-			}(),
-		},
-		{
-			name: "identical",
-			ks1: func() *KmerSet {
-				ks := NewKmerSet(5)
-				ks.AddKmerCode(10)
-				ks.AddKmerCode(20)
-				return ks
-			}(),
-			ks2: func() *KmerSet {
-				ks := NewKmerSet(5)
-				ks.AddKmerCode(10)
-				ks.AddKmerCode(20)
-				return ks
-			}(),
-		},
-		{
-			name: "disjoint",
-			ks1: func() *KmerSet {
-				ks := NewKmerSet(5)
-				ks.AddKmerCode(1)
-				return ks
-			}(),
-			ks2: func() *KmerSet {
-				ks := NewKmerSet(5)
-				ks.AddKmerCode(100)
-				return ks
-			}(),
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			distance := tc.ks1.JaccardDistance(tc.ks2)
-			similarity := tc.ks1.JaccardSimilarity(tc.ks2)
-
-			sum := distance + similarity
-
-			if math.Abs(sum-1.0) > 1e-10 {
-				t.Errorf("Expected distance + similarity = 1.0, got %f + %f = %f",
-					distance, similarity, sum)
-			}
-		})
-	}
-}
-
-func TestJaccardDistanceSymmetry(t *testing.T) {
-	ks1 := NewKmerSet(5)
-	ks1.AddKmerCode(1)
-	ks1.AddKmerCode(2)
-	ks1.AddKmerCode(3)
-
-	ks2 := NewKmerSet(5)
-	ks2.AddKmerCode(2)
-	ks2.AddKmerCode(3)
-	ks2.AddKmerCode(4)
-
-	distance1 := ks1.JaccardDistance(ks2)
-	distance2 := ks2.JaccardDistance(ks1)
-
-	similarity1 := ks1.JaccardSimilarity(ks2)
-	similarity2 := ks2.JaccardSimilarity(ks1)
-
-	if math.Abs(distance1-distance2) > 1e-10 {
-		t.Errorf("Jaccard distance not symmetric: %f vs %f", distance1, distance2)
-	}
-
-	if math.Abs(similarity1-similarity2) > 1e-10 {
-		t.Errorf("Jaccard similarity not symmetric: %f vs %f", similarity1, similarity2)
-	}
-}
--- a/pkg/obikmer/kmermap.go
+++ b/pkg/obikmer/kmermap.go
@@ -5,6 +5,7 @@ import (
 	"sort"
 	"unsafe"

+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obilog"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
@@ -267,6 +268,8 @@ func NewKmerMap[T obifp.FPUint[T]](
 	}

 	n := len(sequences)
+	var bar *progressbar.ProgressBar
+	if obidefault.ProgressBar() {
 		pbopt := make([]progressbar.Option, 0, 5)
 		pbopt = append(pbopt,
 			progressbar.OptionSetWriter(os.Stderr),
@@ -276,11 +279,12 @@ func NewKmerMap[T obifp.FPUint[T]](
 			progressbar.OptionSetDescription("Indexing kmers"),
 		)

-	bar := progressbar.NewOptions(n, pbopt...)
+		bar = progressbar.NewOptions(n, pbopt...)
+	}

 	for i, sequence := range sequences {
 		kmap.Push(sequence, maxoccurs)
-		if i%100 == 0 {
+		if bar != nil && i%100 == 0 {
 			bar.Add(100)
 		}
 	}
--- a/pkg/obikmer/minimizer_utils.go
+++ b/pkg/obikmer/minimizer_utils.go
@@ -0,0 +1,47 @@
+package obikmer
+
+import (
+	"math"
+
+	log "github.com/sirupsen/logrus"
+)
+
+// DefaultMinimizerSize returns ceil(k / 2.5) as a default minimizer size, raised to at least 1 and then capped at k-1 (so the result is below 1 when k <= 1).
+func DefaultMinimizerSize(k int) int {
+	m := int(math.Ceil(float64(k) / 2.5))
+	if m < 1 {
+		m = 1
+	}
+	if m >= k {
+		m = k - 1
+	}
+	return m
+}
+
+// MinMinimizerSize returns the smallest m >= 1 such that 4^m >= nworkers, computed with exact integer shifts rather than a rounded ceil(log(nworkers) / log(4)).
+func MinMinimizerSize(nworkers int) int {
+	m := 1
+	for n := (max(nworkers, 1) - 1) >> 2; n > 0; n >>= 2 {
+		m++ // n > 0 means nworkers > 4^m, so one more base is needed
+	}
+	return m
+}
+
+// ValidateMinimizerSize adjusts the minimizer size m to satisfy the constraints:
+// - m >= MinMinimizerSize(nworkers), i.e. 4^m >= nworkers (a warning is logged when m is raised)
+// - 1 <= m < k
+func ValidateMinimizerSize(m, k, nworkers int) int {
+	minM := MinMinimizerSize(nworkers)
+	if m < minM {
+		// A negative shift count panics in Go, so 4^m is only computed for m >= 0 (reported as 0 otherwise).
+		capacity := 0
+		if m >= 0 {
+			capacity = 1 << (2 * m)
+		}
+		log.Warnf("Minimizer size %d too small for %d workers (4^%d = %d < %d), adjusting to %d",
+			m, nworkers, m, capacity, nworkers, minM)
+		m = minM
+	}
+	// Same clamping order as before: the lower bound 1 first, then the upper bound k-1.
+	return min(max(m, 1), k-1)
+}
--- a/pkg/obikmer/skm_reader.go
+++ b/pkg/obikmer/skm_reader.go
@@ -0,0 +1,67 @@
+package obikmer
+
+import (
+	"bufio"
+	"encoding/binary"
+	"io"
+	"os"
+)
+
+// decode2bit maps 2-bit codes back to nucleotide bytes.
+var decode2bit = [4]byte{'a', 'c', 'g', 't'}
+
+// SkmReader reads super-kmers from a binary .skm file.
+type SkmReader struct {
+	r    *bufio.Reader
+	file *os.File
+}
+
+// NewSkmReader opens a .skm file for reading.
+func NewSkmReader(path string) (*SkmReader, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	return &SkmReader{
+		r:    bufio.NewReaderSize(f, 65536),
+		file: f,
+	}, nil
+}
+
+// Next reads the next super-kmer record; Minimizer is left at zero and Start/End are set to 0 and the record length.
+// It returns a zero SuperKmer and false at EOF, and also on any read error or truncated record (the cases are not distinguished).
+func (sr *SkmReader) Next() (SuperKmer, bool) {
+	// Read length (uint16, little-endian)
+	var lenbuf [2]byte
+	if _, err := io.ReadFull(sr.r, lenbuf[:]); err != nil {
+		return SuperKmer{}, false
+	}
+	seqLen := int(binary.LittleEndian.Uint16(lenbuf[:]))
+
+	// Read packed bytes: four bases per byte, rounded up
+	nBytes := (seqLen + 3) / 4
+	packed := make([]byte, nBytes)
+	if _, err := io.ReadFull(sr.r, packed); err != nil {
+		return SuperKmer{}, false
+	}
+
+	// Decode to lowercase nucleotide bytes; base i sits in bits 7-6, 5-4, 3-2 or 1-0 of byte i/4
+	seq := make([]byte, seqLen)
+	for i := 0; i < seqLen; i++ {
+		byteIdx := i / 4
+		bitPos := uint(6 - (i%4)*2)
+		code := (packed[byteIdx] >> bitPos) & 0x03
+		seq[i] = decode2bit[code]
+	}
+
+	return SuperKmer{
+		Sequence: seq,
+		Start:    0,
+		End:      seqLen,
+	}, true
+}
+
+// Close closes the underlying file.
+func (sr *SkmReader) Close() error {
+	return sr.file.Close()
+}
--- a/pkg/obikmer/skm_test.go
+++ b/pkg/obikmer/skm_test.go
@@ -0,0 +1,176 @@
+package obikmer
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+func TestSkmRoundTrip(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "test.skm")
+
+	// Create super-kmers from a known sequence
+	seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
+	k := 21
+	m := 9
+	superKmers := ExtractSuperKmers(seq, k, m, nil)
+	if len(superKmers) == 0 {
+		t.Fatal("no super-kmers extracted")
+	}
+
+	// Write
+	w, err := NewSkmWriter(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, sk := range superKmers {
+		if err := w.Write(sk); err != nil {
+			t.Fatal(err)
+		}
+	}
+	if err := w.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	// Read back
+	r, err := NewSkmReader(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer r.Close()
+
+	idx := 0
+	for {
+		sk, ok := r.Next()
+		if !ok {
+			break
+		}
+		if idx >= len(superKmers) {
+			t.Fatal("read more super-kmers than written")
+		}
+		expected := superKmers[idx]
+		if len(sk.Sequence) != len(expected.Sequence) {
+			t.Fatalf("super-kmer %d: length mismatch: got %d, want %d",
+				idx, len(sk.Sequence), len(expected.Sequence))
+		}
+		// Compare nucleotide-by-nucleotide (case insensitive since decode produces lowercase)
+		for j := range sk.Sequence {
+			got := sk.Sequence[j] | 0x20
+			want := expected.Sequence[j] | 0x20
+			if got != want {
+				t.Fatalf("super-kmer %d pos %d: got %c, want %c", idx, j, got, want)
+			}
+		}
+		idx++
+	}
+	if idx != len(superKmers) {
+		t.Fatalf("read %d super-kmers, want %d", idx, len(superKmers))
+	}
+}
+
+func TestSkmEmptyFile(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "empty.skm")
+
+	// Write nothing
+	w, err := NewSkmWriter(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := w.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	// Read back
+	r, err := NewSkmReader(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer r.Close()
+
+	_, ok := r.Next()
+	if ok {
+		t.Fatal("expected no super-kmers in empty file")
+	}
+}
+
+func TestSkmSingleBase(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "single.skm")
+
+	// Test with sequences of various lengths to check padding
+	sequences := [][]byte{
+		[]byte("A"),
+		[]byte("AC"),
+		[]byte("ACG"),
+		[]byte("ACGT"),
+		[]byte("ACGTA"),
+	}
+
+	w, err := NewSkmWriter(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, seq := range sequences {
+		sk := SuperKmer{Sequence: seq}
+		if err := w.Write(sk); err != nil {
+			t.Fatal(err)
+		}
+	}
+	if err := w.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	r, err := NewSkmReader(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer r.Close()
+
+	for i, expected := range sequences {
+		sk, ok := r.Next()
+		if !ok {
+			t.Fatalf("expected super-kmer %d, got EOF", i)
+		}
+		if len(sk.Sequence) != len(expected) {
+			t.Fatalf("sk %d: length %d, want %d", i, len(sk.Sequence), len(expected))
+		}
+		for j := range sk.Sequence {
+			got := sk.Sequence[j] | 0x20
+			want := expected[j] | 0x20
+			if got != want {
+				t.Fatalf("sk %d pos %d: got %c, want %c", i, j, got, want)
+			}
+		}
+	}
+}
+
+func TestSkmFileSize(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "size.skm")
+
+	// Write a sequence of known length
+	seq := []byte("ACGTACGTAC") // 10 bases
+	sk := SuperKmer{Sequence: seq}
+
+	w, err := NewSkmWriter(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := w.Write(sk); err != nil {
+		t.Fatal(err)
+	}
+	if err := w.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	// Expected: 2 bytes (length) + ceil(10/4)=3 bytes (data) = 5 bytes
+	info, err := os.Stat(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if info.Size() != 5 {
+		t.Fatalf("file size: got %d, want 5", info.Size())
+	}
+}
--- a/pkg/obikmer/skm_writer.go
+++ b/pkg/obikmer/skm_writer.go
@@ -0,0 +1,74 @@
+package obikmer
+
+import (
+	"bufio"
+	"encoding/binary"
+	"os"
+)
+
+// SkmWriter writes super-kmers to a binary .skm file.
+//
+// Format per super-kmer:
+//
+//	[len: uint16 LE]          length of the super-kmer in bases
+//	[data: ceil(len/4) bytes] sequence encoded 2 bits/base, packed
+//
+// Nucleotide encoding: A=00, C=01, G=10, T=11.
+// The last byte is zero-padded on the low bits if len%4 != 0.
+type SkmWriter struct {
+	w    *bufio.Writer
+	file *os.File
+}
+
+// NewSkmWriter creates a new SkmWriter writing to the given file path.
+func NewSkmWriter(path string) (*SkmWriter, error) {
+	f, err := os.Create(path)
+	if err != nil {
+		return nil, err
+	}
+	return &SkmWriter{
+		w:    bufio.NewWriterSize(f, 65536),
+		file: f,
+	}, nil
+}
+
+// Write appends one super-kmer record: a uint16 little-endian base count followed
+// by the bases packed 2 bits each, first base in the high bits of each byte.
+// Sequences longer than 65535 bases do not fit the length field and are rejected.
+func (sw *SkmWriter) Write(sk SuperKmer) error {
+	seq := sk.Sequence
+	if len(seq) > 0xFFFF {
+		// uint16(len(seq)) would wrap around and silently drop all but len%65536 bases.
+		return &os.PathError{Op: "write super-kmer", Path: sw.file.Name(), Err: os.ErrInvalid}
+	}
+
+	var lenbuf [2]byte
+	binary.LittleEndian.PutUint16(lenbuf[:], uint16(len(seq)))
+	if _, err := sw.w.Write(lenbuf[:]); err != nil {
+		return err
+	}
+
+	// Pack four bases per byte; a short final group is left-aligned (low bits stay zero).
+	for start := 0; start < len(seq); start += 4 {
+		var packed byte
+		for j := 0; j < 4; j++ {
+			packed <<= 2
+			if pos := start + j; pos < len(seq) {
+				packed |= __single_base_code__[seq[pos]&31]
+			}
+		}
+		if err := sw.w.WriteByte(packed); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Close flushes buffered data and closes the underlying file.
+func (sw *SkmWriter) Close() error {
+	if err := sw.w.Flush(); err != nil {
+		sw.file.Close()
+		return err
+	}
+	return sw.file.Close()
+}
--- a/pkg/obikmer/spectrum.go
+++ b/pkg/obikmer/spectrum.go
@@ -0,0 +1,253 @@
+package obikmer
+
+import (
+	"bufio"
+	"container/heap"
+	"encoding/csv"
+	"fmt"
+	"os"
+	"sort"
+	"strconv"
+)
+
+// KSP file magic bytes: "KSP\x01" (K-mer SPectrum v1)
+var kspMagic = [4]byte{'K', 'S', 'P', 0x01}
+
+// SpectrumEntry represents one entry in a k-mer frequency spectrum.
+type SpectrumEntry struct {
+	Frequency int    // how many times a k-mer was observed
+	Count     uint64 // how many distinct k-mers have this frequency
+}
+
+// KmerSpectrum represents the frequency distribution of k-mers.
+// Entries are sorted by Frequency in ascending order and only include
+// non-zero counts.
+type KmerSpectrum struct {
+	Entries []SpectrumEntry
+}
+
+// MaxFrequency returns the highest frequency in the spectrum, or 0 if empty.
+func (s *KmerSpectrum) MaxFrequency() int {
+	if len(s.Entries) == 0 {
+		return 0
+	}
+	return s.Entries[len(s.Entries)-1].Frequency
+}
+
+// ToMap converts a KmerSpectrum back to a map for easy lookup.
+func (s *KmerSpectrum) ToMap() map[int]uint64 {
+	m := make(map[int]uint64, len(s.Entries))
+	for _, e := range s.Entries {
+		m[e.Frequency] = e.Count
+	}
+	return m
+}
+
+// MapToSpectrum converts a map[int]uint64 to a sorted KmerSpectrum.
+func MapToSpectrum(m map[int]uint64) *KmerSpectrum {
+	entries := make([]SpectrumEntry, 0, len(m))
+	for freq, count := range m {
+		if count > 0 {
+			entries = append(entries, SpectrumEntry{Frequency: freq, Count: count})
+		}
+	}
+	sort.Slice(entries, func(i, j int) bool {
+		return entries[i].Frequency < entries[j].Frequency
+	})
+	return &KmerSpectrum{Entries: entries}
+}
+
+// MergeSpectraMaps adds the counts of b into a in place, frequency by frequency; a must be non-nil when b is non-empty (writing to a nil map panics).
+func MergeSpectraMaps(a, b map[int]uint64) {
+	for freq, count := range b {
+		a[freq] += count
+	}
+}
+
+// WriteSpectrum writes a KmerSpectrum to a binary file.
+//
+// Format:
+//
+//	[magic: 4 bytes "KSP\x01"]
+//	[n_entries: varint]
+//	For each entry (sorted by frequency ascending):
+//	  [frequency: varint]
+//	  [count: varint]
+func WriteSpectrum(path string, spectrum *KmerSpectrum) error {
+	f, err := os.Create(path)
+	if err != nil {
+		return fmt.Errorf("create spectrum file: %w", err)
+	}
+	w := bufio.NewWriterSize(f, 65536)
+
+	// Magic
+	if _, err := w.Write(kspMagic[:]); err != nil {
+		f.Close()
+		return err
+	}
+
+	// Number of entries
+	if _, err := EncodeVarint(w, uint64(len(spectrum.Entries))); err != nil {
+		f.Close()
+		return err
+	}
+
+	// Entries
+	for _, e := range spectrum.Entries {
+		if _, err := EncodeVarint(w, uint64(e.Frequency)); err != nil {
+			f.Close()
+			return err
+		}
+		if _, err := EncodeVarint(w, e.Count); err != nil {
+			f.Close()
+			return err
+		}
+	}
+
+	if err := w.Flush(); err != nil {
+		f.Close()
+		return err
+	}
+	return f.Close()
+}
+
+// ReadSpectrum reads a KmerSpectrum from a binary file in the format produced by WriteSpectrum.
+func ReadSpectrum(path string) (*KmerSpectrum, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	r := bufio.NewReaderSize(f, 65536)
+
+	// Check magic; byte-wise reads report a truncated header, which a single Read may not.
+	var magic [4]byte
+	for i := range magic {
+		if magic[i], err = r.ReadByte(); err != nil {
+			return nil, fmt.Errorf("read spectrum magic: %w", err)
+		}
+	}
+	if magic != kspMagic {
+		return nil, fmt.Errorf("invalid spectrum file magic: %v", magic)
+	}
+
+	// Number of entries
+	nEntries, err := DecodeVarint(r)
+	if err != nil {
+		return nil, fmt.Errorf("read spectrum entry count: %w", err)
+	}
+
+	// nEntries comes from the file: cap the preallocation so a corrupt count cannot panic make.
+	entries := make([]SpectrumEntry, 0, min(nEntries, 1<<16))
+	for i := uint64(0); i < nEntries; i++ {
+		freq, err := DecodeVarint(r)
+		if err != nil {
+			return nil, fmt.Errorf("read spectrum freq at entry %d: %w", i, err)
+		}
+		count, err := DecodeVarint(r)
+		if err != nil {
+			return nil, fmt.Errorf("read spectrum count at entry %d: %w", i, err)
+		}
+		entries = append(entries, SpectrumEntry{Frequency: int(freq), Count: count})
+	}
+
+	return &KmerSpectrum{Entries: entries}, nil
+}
+
+// KmerFreq associates a k-mer (encoded as uint64) with its observed frequency.
+type KmerFreq struct {
+	Kmer uint64
+	Freq int
+}
+
+// kmerFreqHeap is a min-heap of KmerFreq ordered by Freq (lowest first).
+// Used to maintain a top-N most frequent k-mers set.
+type kmerFreqHeap []KmerFreq
+
+func (h kmerFreqHeap) Len() int            { return len(h) }
+func (h kmerFreqHeap) Less(i, j int) bool  { return h[i].Freq < h[j].Freq }
+func (h kmerFreqHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
+func (h *kmerFreqHeap) Push(x interface{}) { *h = append(*h, x.(KmerFreq)) }
+func (h *kmerFreqHeap) Pop() interface{} {
+	old := *h
+	n := len(old)
+	x := old[n-1]
+	*h = old[:n-1]
+	return x
+}
+
+// TopNKmers maintains a collection of the N most frequent k-mers
+// using a min-heap. Thread-safe usage requires external synchronization.
+type TopNKmers struct {
+	n int
+	h kmerFreqHeap
+}
+
+// NewTopNKmers creates a new top-N collector; with n <= 0 the collector stays empty because Add ignores every k-mer.
+func NewTopNKmers(n int) *TopNKmers {
+	return &TopNKmers{
+		n: n,
+		h: make(kmerFreqHeap, 0, max(n, 0)), // Add never holds more than n items; a negative capacity would panic
+	}
+}
+
+// Add considers a k-mer with the given frequency for inclusion in the top-N.
+func (t *TopNKmers) Add(kmer uint64, freq int) {
+	if t.n <= 0 {
+		return
+	}
+	if len(t.h) < t.n {
+		heap.Push(&t.h, KmerFreq{Kmer: kmer, Freq: freq})
+	} else if freq > t.h[0].Freq {
+		t.h[0] = KmerFreq{Kmer: kmer, Freq: freq}
+		heap.Fix(&t.h, 0)
+	}
+}
+
+// Results returns the collected k-mers sorted by frequency descending.
+func (t *TopNKmers) Results() []KmerFreq {
+	result := make([]KmerFreq, len(t.h))
+	copy(result, t.h)
+	sort.Slice(result, func(i, j int) bool {
+		return result[i].Freq > result[j].Freq
+	})
+	return result
+}
+
+// MergeTopN merges another TopNKmers into this one.
+func (t *TopNKmers) MergeTopN(other *TopNKmers) {
+	if other == nil {
+		return
+	}
+	for _, kf := range other.h {
+		t.Add(kf.Kmer, kf.Freq)
+	}
+}
+
+// WriteTopKmersCSV writes the top k-mers to a CSV file and reports write, flush and close errors.
+// Columns: sequence, frequency
+func WriteTopKmersCSV(path string, topKmers []KmerFreq, k int) error {
+	f, err := os.Create(path)
+	if err != nil {
+		return fmt.Errorf("create top-kmers file: %w", err)
+	}
+	w := csv.NewWriter(f)
+	err = w.Write([]string{"sequence", "frequency"})
+	buf := make([]byte, k)
+	for i := 0; err == nil && i < len(topKmers); i++ {
+		seq := DecodeKmer(topKmers[i].Kmer, k, buf)
+		err = w.Write([]string{string(seq), strconv.Itoa(topKmers[i].Freq)})
+	}
+
+	// csv.Writer buffers its output: flush explicitly and check Error, otherwise a failed
+	// write to disk goes unnoticed; Close can also report a late failure. The first error wins.
+	w.Flush()
+	if err == nil {
+		err = w.Error()
+	}
+	if cerr := f.Close(); err == nil {
+		err = cerr
+	}
+	return err
+}
--- a/pkg/obikmer/superkmer.go
+++ b/pkg/obikmer/superkmer.go
@@ -0,0 +1,59 @@
+package obikmer
+
+// SuperKmer represents a maximal subsequence where all consecutive k-mers
+// share the same minimizer.
+type SuperKmer struct {
+	Minimizer uint64 // The canonical minimizer value (normalized m-mer)
+	Start     int    // Starting position in the original sequence (0-indexed)
+	End       int    // Ending position (exclusive, like Go slice notation)
+	Sequence  []byte // The actual DNA subsequence [Start:End]
+}
+
+// dequeItem represents an element in the monotone deque used for
+// tracking minimizers in a sliding window.
+type dequeItem struct {
+	position  int    // Position of the m-mer in the sequence
+	canonical uint64 // Canonical (normalized) m-mer value
+}
+
+// ExtractSuperKmers extracts super k-mers from a DNA sequence.
+// A super k-mer is a maximal subsequence where all consecutive k-mers
+// share the same minimizer. The minimizer of a k-mer is the smallest
+// canonical m-mer among its (k-m+1) constituent m-mers.
+//
+// This function collects the values yielded by IterSuperKmers into a slice.
+//
+// Parameters:
+//   - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
+//   - k: k-mer size (must be between m+1 and 31)
+//   - m: minimizer size (must be between 1 and k-1)
+//   - buffer: optional result storage. If nil, a new slice is created; otherwise (*buffer)[:0] is appended to (old contents are overwritten) and *buffer itself is not updated.
+//
+// Returns:
+//   - slice of SuperKmer structs; each Sequence is a sub-slice of seq, not a copy
+//   - nil if parameters are invalid or sequence is too short
+//
+// Time complexity: O(n) where n is the sequence length
+// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results
+func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer {
+	if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
+		return nil
+	}
+
+	var result []SuperKmer
+	if buffer == nil {
+		estimatedSize := len(seq) / k
+		if estimatedSize < 1 {
+			estimatedSize = 1
+		}
+		result = make([]SuperKmer, 0, estimatedSize)
+	} else {
+		result = (*buffer)[:0]
+	}
+
+	for sk := range IterSuperKmers(seq, k, m) {
+		result = append(result, sk)
+	}
+
+	return result
+}
--- a/pkg/obikmer/superkmer_iter.go
+++ b/pkg/obikmer/superkmer_iter.go
@@ -0,0 +1,215 @@
+package obikmer
+
+import (
+	"fmt"
+	"iter"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+)
+
+// IterSuperKmers returns an iterator over super k-mers extracted from a DNA sequence.
+// ExtractSuperKmers collects the values of this iterator; here super k-mers are yielded one at a time.
+//
+// Parameters:
+//   - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
+//   - k: k-mer size (must be between m+1 and 31)
+//   - m: minimizer size (must be between 1 and k-1)
+//
+// Returns:
+//   - An iterator of SuperKmer values whose Sequence is a sub-slice of seq (no copy); it yields nothing when the parameters are invalid or len(seq) < k
+//
+// Example:
+//
+//	for sk := range IterSuperKmers(sequence, 21, 11) {
+//	    fmt.Printf("SuperKmer at %d-%d with minimizer %d\n", sk.Start, sk.End, sk.Minimizer)
+//	}
+func IterSuperKmers(seq []byte, k int, m int) iter.Seq[SuperKmer] {
+	return func(yield func(SuperKmer) bool) {
+		if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
+			return
+		}
+		// Deque of candidate m-mers kept in increasing canonical order; once a k-mer is complete its front is the minimizer.
+		deque := make([]dequeItem, 0, k-m+1)
+		// mMask keeps the low 2*m bits of the forward m-mer; rcShift is the bit offset of the top base slot of an m-mer.
+		mMask := uint64(1)<<(m*2) - 1
+		rcShift := uint((m - 1) * 2)
+		// Prime the forward and reverse m-mers with the first m-1 bases (code ^ 3 flips both bits of a base code).
+		var fwdMmer, rvcMmer uint64
+		for i := 0; i < m-1 && i < len(seq); i++ {
+			code := uint64(__single_base_code__[seq[i]&31])
+			fwdMmer = (fwdMmer << 2) | code
+			rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
+		}
+
+		superKmerStart := 0
+		var currentMinimizer uint64
+		firstKmer := true
+		// Each iteration completes the m-mer that ends at pos.
+		for pos := m - 1; pos < len(seq); pos++ {
+			code := uint64(__single_base_code__[seq[pos]&31])
+			fwdMmer = ((fwdMmer << 2) | code) & mMask
+			rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
+			// The canonical m-mer is the smaller of the forward and reverse codes.
+			canonical := fwdMmer
+			if rvcMmer < fwdMmer {
+				canonical = rvcMmer
+			}
+
+			mmerPos := pos - m + 1
+			// Once a full k-mer exists, evict m-mers that start before its first base.
+			if pos >= k-1 {
+				windowStart := pos - k + 1
+				for len(deque) > 0 && deque[0].position < windowStart {
+					deque = deque[1:]
+				}
+			}
+			// Pop back entries that are not smaller: the new m-mer stays in the window longer than they do.
+			for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical {
+				deque = deque[:len(deque)-1]
+			}
+
+			deque = append(deque, dequeItem{position: mmerPos, canonical: canonical})
+			// The front of the deque is now the minimizer of the k-mer ending at pos.
+			if pos >= k-1 {
+				newMinimizer := deque[0].canonical
+				kmerStart := pos - k + 1
+
+				if firstKmer {
+					currentMinimizer = newMinimizer
+					firstKmer = false
+				} else if newMinimizer != currentMinimizer {
+					endPos := kmerStart + k - 1 // == pos: the base that completed the new k-mer is excluded
+					superKmer := SuperKmer{
+						Minimizer: currentMinimizer,
+						Start:     superKmerStart,
+						End:       endPos,
+						Sequence:  seq[superKmerStart:endPos],
+					}
+					if !yield(superKmer) {
+						return
+					}
+					// The k-mer that changed the minimizer starts the next super k-mer.
+					superKmerStart = kmerStart
+					currentMinimizer = newMinimizer
+				}
+			}
+		}
+		// Emit the last super k-mer, which extends to the end of seq.
+		if !firstKmer && len(seq[superKmerStart:]) >= k {
+			superKmer := SuperKmer{
+				Minimizer: currentMinimizer,
+				Start:     superKmerStart,
+				End:       len(seq),
+				Sequence:  seq[superKmerStart:],
+			}
+			yield(superKmer)
+		}
+	}
+}
+
+// ToBioSequence converts a SuperKmer to a BioSequence with metadata.
+//
+// The resulting BioSequence contains:
+//   - ID: "{parentID}_superkmer_{start}_{end}"
+//   - Sequence: the actual DNA subsequence
+//   - Attributes:
+//   - "minimizer_value" (uint64): the canonical minimizer value
+//   - "minimizer_seq" (string): the DNA sequence of the minimizer
+//   - "k" (int): the k-mer size
+//   - "m" (int): the minimizer size
+//   - "start" (int): starting position in original sequence
+//   - "end" (int): ending position in original sequence
+//   - "parent_id" (string): ID of the parent sequence
+//
+// Parameters:
+//   - k: k-mer size used for extraction
+//   - m: minimizer size used for extraction
+//   - parentID: ID of the parent sequence
+//   - parentSource: source field from the parent sequence
+//
+// Returns:
+//   - *obiseq.BioSequence: A new BioSequence representing this super k-mer
+func (sk *SuperKmer) ToBioSequence(k int, m int, parentID string, parentSource string) *obiseq.BioSequence {
+	// Create ID for the super-kmer
+	var id string
+	if parentID != "" {
+		id = fmt.Sprintf("%s_superkmer_%d_%d", parentID, sk.Start, sk.End)
+	} else {
+		id = fmt.Sprintf("superkmer_%d_%d", sk.Start, sk.End)
+	}
+
+	// Create the BioSequence
+	seq := obiseq.NewBioSequence(id, sk.Sequence, "")
+
+	// Copy source from parent
+	if parentSource != "" {
+		seq.SetSource(parentSource)
+	}
+
+	// Set attributes
+	seq.SetAttribute("minimizer_value", sk.Minimizer)
+
+	// Decode the minimizer to get its DNA sequence
+	minimizerSeq := DecodeKmer(sk.Minimizer, m, nil)
+	seq.SetAttribute("minimizer_seq", string(minimizerSeq))
+
+	seq.SetAttribute("k", k)
+	seq.SetAttribute("m", m)
+	seq.SetAttribute("start", sk.Start)
+	seq.SetAttribute("end", sk.End)
+
+	if parentID != "" {
+		seq.SetAttribute("parent_id", parentID)
+	}
+
+	return seq
+}
+
+// SuperKmerWorker creates a SeqWorker that extracts super k-mers from a BioSequence
+// and returns them as a slice of BioSequence objects.
+//
+// The worker copies the source field from the parent sequence to all extracted super k-mers.
+//
+// Parameters:
+//   - k: k-mer size (must be between m+1 and 31)
+//   - m: minimizer size (must be between 1 and k-1)
+//
+// Returns:
+//   - SeqWorker: A worker function that can be used in obiiter pipelines
+//
+// Example:
+//
+//	worker := SuperKmerWorker(21, 11)
+//	iterator := iterator.MakeIWorker(worker, false)
+func SuperKmerWorker(k int, m int) obiseq.SeqWorker {
+	return func(seq *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
+		if seq == nil {
+			return obiseq.BioSequenceSlice{}, nil
+		}
+
+		// Validate parameters
+		if m < 1 || m >= k || k < 2 || k > 31 {
+			return obiseq.BioSequenceSlice{}, fmt.Errorf(
+				"invalid parameters: k=%d, m=%d (need 1 <= m < k <= 31)",
+				k, m)
+		}
+
+		sequence := seq.Sequence()
+		if len(sequence) < k {
+			return obiseq.BioSequenceSlice{}, nil
+		}
+
+		parentID := seq.Id()
+		parentSource := seq.Source()
+
+		// Extract super k-mers and convert to BioSequences
+		result := make(obiseq.BioSequenceSlice, 0)
+
+		for sk := range IterSuperKmers(sequence, k, m) {
+			bioSeq := sk.ToBioSequence(k, m, parentID, parentSource)
+			result = append(result, bioSeq)
+		}
+
+		return result, nil
+	}
+}
--- a/pkg/obikmer/superkmer_iter_test.go
+++ b/pkg/obikmer/superkmer_iter_test.go
@@ -0,0 +1,198 @@
+package obikmer
+
+import (
+	"testing"
+)
+
+// TestIterSuperKmers checks that every super k-mer yielded by IterSuperKmers
+// lies inside the source sequence and that its Sequence field equals the
+// [Start, End) window of that source. It also requires that at least one
+// super k-mer is produced.
+func TestIterSuperKmers(t *testing.T) {
+	seq := []byte("ACGTACGTGGGGAAAA")
+	k := 5
+	m := 3
+
+	count := 0
+	for sk := range IterSuperKmers(seq, k, m) {
+		count++
+		t.Logf("SuperKmer %d: Minimizer=%d, Start=%d, End=%d, Seq=%s",
+			count, sk.Minimizer, sk.Start, sk.End, string(sk.Sequence))
+
+		// Verify sequence boundaries. On failure the content check is skipped:
+		// slicing seq with out-of-range or inverted bounds would panic and
+		// abort the test run instead of reporting the error.
+		if sk.Start < 0 || sk.End > len(seq) || sk.Start > sk.End {
+			t.Errorf("Invalid boundaries: Start=%d, End=%d, seqLen=%d",
+				sk.Start, sk.End, len(seq))
+			continue
+		}
+
+		// Verify sequence content
+		if string(sk.Sequence) != string(seq[sk.Start:sk.End]) {
+			t.Errorf("Sequence mismatch: expected %s, got %s",
+				string(seq[sk.Start:sk.End]), string(sk.Sequence))
+		}
+	}
+
+	if count == 0 {
+		t.Error("No super k-mers extracted")
+	}
+
+	t.Logf("Total super k-mers extracted: %d", count)
+}
+
+// TestIterSuperKmersVsSlice ensures that the iterator API (IterSuperKmers)
+// and the slice API (ExtractSuperKmers) report the same super k-mers, in the
+// same order, for the same input.
+func TestIterSuperKmersVsSlice(t *testing.T) {
+	const (
+		k = 7
+		m = 4
+	)
+	seq := []byte("ACGTACGTGGGGAAAAACGTACGT")
+
+	// Reference result from the slice-returning implementation.
+	fromSlice := ExtractSuperKmers(seq, k, m, nil)
+
+	// Same extraction, collected from the iterator.
+	var fromIter []SuperKmer
+	for sk := range IterSuperKmers(seq, k, m) {
+		fromIter = append(fromIter, sk)
+	}
+
+	if len(fromSlice) != len(fromIter) {
+		t.Errorf("Different number of super k-mers: slice=%d, iter=%d",
+			len(fromSlice), len(fromIter))
+	}
+
+	// Compare pairwise over the common prefix so that a count mismatch
+	// does not hide differences in the elements both sides produced.
+	for i := range min(len(fromSlice), len(fromIter)) {
+		want, got := fromSlice[i], fromIter[i]
+
+		if want.Minimizer != got.Minimizer {
+			t.Errorf("SuperKmer %d: different minimizers: slice=%d, iter=%d",
+				i, want.Minimizer, got.Minimizer)
+		}
+
+		if want.Start != got.Start || want.End != got.End {
+			t.Errorf("SuperKmer %d: different boundaries: slice=[%d:%d], iter=[%d:%d]",
+				i, want.Start, want.End, got.Start, got.End)
+		}
+
+		if string(want.Sequence) != string(got.Sequence) {
+			t.Errorf("SuperKmer %d: different sequences: slice=%s, iter=%s",
+				i, string(want.Sequence), string(got.Sequence))
+		}
+	}
+}
+
+// TestSuperKmerMinimizerBijection checks the defining property of super
+// k-mers: one super k-mer sequence maps to exactly one minimizer.
+// For every test sequence it verifies that:
+// 1. Every k-mer contained in a super k-mer has that super k-mer's minimizer
+// 2. Two identical super k-mer sequences carry the same minimizer
+func TestSuperKmerMinimizerBijection(t *testing.T) {
+	testCases := []struct {
+		name string
+		seq  []byte
+		k    int
+		m    int
+	}{
+		{"simple sequence", []byte("ACGTACGTACGTACGTACGTACGTACGTACGT"), 21, 11},
+		{"homopolymer blocks", []byte("AAAACCCCGGGGTTTTAAAACCCCGGGGTTTT"), 21, 11},
+		{"complex sequence", []byte("ATCGATCGATCGATCGATCGATCGATCGATCG"), 15, 7},
+		{"longer sequence", []byte("ACGTACGTGGGGAAAAACGTACGTTTTTCCCCACGTACGT"), 13, 7},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// First minimizer observed for each distinct super k-mer sequence.
+			firstSeen := make(map[string]uint64)
+
+			for sk := range IterSuperKmers(tc.seq, tc.k, tc.m) {
+				seqStr := string(sk.Sequence)
+
+				known, seen := firstSeen[seqStr]
+				if !seen {
+					firstSeen[seqStr] = sk.Minimizer
+				} else if known != sk.Minimizer {
+					t.Errorf("BIJECTION VIOLATION: sequence %s has two different minimizers:\n"+
+						"  First: %d\n"+
+						"  Second: %d\n"+
+						"  This violates the super k-mer definition!",
+						seqStr, known, sk.Minimizer)
+				}
+
+				// Slide a k-wide window over the super k-mer; the loop bound
+				// makes the body a no-op when the sequence is shorter than k.
+				for start := 0; start+tc.k <= len(sk.Sequence); start++ {
+					window := sk.Sequence[start : start+tc.k]
+					got := findMinimizer(window, tc.k, tc.m)
+					if got == sk.Minimizer {
+						continue
+					}
+					t.Errorf("K-mer at position %d in super k-mer has different minimizer:\n"+
+						"  K-mer: %s\n"+
+						"  Expected minimizer: %d\n"+
+						"  Actual minimizer: %d\n"+
+						"  Super k-mer: %s",
+						start, string(window), sk.Minimizer, got, seqStr)
+				}
+			}
+		})
+	}
+}
+
+// findMinimizer is a test-only reference implementation: it returns the
+// smallest canonical m-mer (minimum of the forward m-mer and its reverse
+// complement, 2 bits per base) found in kmer. It returns 0 when kmer is not
+// exactly k bytes long.
+func findMinimizer(kmer []byte, k int, m int) uint64 {
+	if len(kmer) != k {
+		return 0
+	}
+
+	mask := uint64(1)<<(m*2) - 1
+	topShift := uint((m - 1) * 2)
+
+	var forward, reverse uint64
+
+	// push rolls both m-mers forward by one base: the forward strand gains
+	// the code in its low bits, the reverse strand gains its complement
+	// (code ^ 3) in the top 2-bit slot. Masking the forward value is a
+	// no-op while fewer than m bases have been consumed.
+	push := func(base byte) {
+		code := uint64(__single_base_code__[base&31])
+		forward = ((forward << 2) | code) & mask
+		reverse = (reverse >> 2) | ((code ^ 3) << topShift)
+	}
+
+	// Prime the window with the first m-1 bases.
+	for i := 0; i < m-1 && i < len(kmer); i++ {
+		push(kmer[i])
+	}
+
+	// From here on each new base completes an m-mer.
+	best := ^uint64(0) // max uint64
+	for i := m - 1; i < len(kmer); i++ {
+		push(kmer[i])
+		best = min(best, forward, reverse)
+	}
+
+	return best
+}
+
+// Note: Tests for ToBioSequence and SuperKmerWorker are in a separate
+// integration test package to avoid circular dependencies between
+// obikmer and obiseq packages.
--- a/pkg/obikmer/varint.go
+++ b/pkg/obikmer/varint.go
@@ -0,0 +1,53 @@
+package obikmer
+
+import "io"
+
+// EncodeVarint writes v to w as a variable-length integer.
+// Each output byte carries 7 bits of the value, least-significant group
+// first; the high bit is set on every byte except the last one
+// (identical to protobuf unsigned varint encoding).
+// It returns the result of the single w.Write call: the number of bytes
+// written and any write error.
+func EncodeVarint(w io.Writer, v uint64) (int, error) {
+	var scratch [10]byte // a uint64 never needs more than 10 groups of 7 bits
+	encoded := scratch[:0]
+	for ; v >= 0x80; v >>= 7 {
+		encoded = append(encoded, byte(v)|0x80)
+	}
+	encoded = append(encoded, byte(v))
+	return w.Write(encoded)
+}
+
+// DecodeVarint reads a variable-length encoded uint64 from r
+// (the format produced by EncodeVarint).
+//
+// Returns the decoded value and any error encountered:
+//   - io.EOF when r is exhausted before the first byte, i.e. a clean end
+//     of stream;
+//   - io.ErrUnexpectedEOF when r ends in the middle of a value, or when the
+//     encoding does not fit in 64 bits (more than 10 bytes, or a 10th byte
+//     carrying more than the single remaining bit);
+//   - any other error reported by r.
+//
+// On error the returned value is 0.
+func DecodeVarint(r io.Reader) (uint64, error) {
+	const maxVarintBytes = 10 // ceil(64 / 7): longest encoding of a uint64
+
+	var val uint64
+	var buf [1]byte
+
+	for i := 0; i < maxVarintBytes; i++ {
+		if _, err := io.ReadFull(r, buf[:]); err != nil {
+			// A continuation byte promised more data: running out of input
+			// here is a truncated value, not a clean end of stream.
+			if i > 0 && err == io.EOF {
+				err = io.ErrUnexpectedEOF
+			}
+			return 0, err
+		}
+		b := buf[0]
+		// Only bit 63 is left for the 10th byte. A larger payload or a
+		// continuation flag would silently drop bits, so reject it.
+		if i == maxVarintBytes-1 && b > 1 {
+			return 0, io.ErrUnexpectedEOF
+		}
+		val |= uint64(b&0x7F) << (7 * uint(i))
+		if b < 0x80 {
+			return val, nil
+		}
+	}
+
+	// Not reachable: the 10th iteration always returns above.
+	return 0, io.ErrUnexpectedEOF
+}
+
+// VarintLen reports how many bytes EncodeVarint emits for v:
+// one byte, plus one more for every further non-empty group of 7 bits.
+func VarintLen(v uint64) int {
+	size := 1
+	for rest := v >> 7; rest != 0; rest >>= 7 {
+		size++
+	}
+	return size
+}
--- a/pkg/obikmer/varint_test.go
+++ b/pkg/obikmer/varint_test.go
@@ -0,0 +1,82 @@
+package obikmer
+
+import (
+	"bytes"
+	"testing"
+)
+
+// TestVarintRoundTrip encodes and decodes values sitting on both sides of
+// every 7-bit group boundary and checks that the value survives the round
+// trip and that the byte count agrees with VarintLen.
+func TestVarintRoundTrip(t *testing.T) {
+	values := []uint64{0, 1, 127, 128, 255, 256, 16383, 16384}
+	// 2^s - 1 and 2^s for s = 21, 28, ..., 63.
+	for shift := 21; shift <= 63; shift += 7 {
+		boundary := uint64(1) << shift
+		values = append(values, boundary-1, boundary)
+	}
+	values = append(values, ^uint64(0)) // max uint64
+
+	for _, v := range values {
+		var buf bytes.Buffer
+
+		written, err := EncodeVarint(&buf, v)
+		switch {
+		case err != nil:
+			t.Fatalf("EncodeVarint(%d): %v", v, err)
+		case written != VarintLen(v):
+			t.Fatalf("EncodeVarint(%d): wrote %d bytes, VarintLen says %d", v, written, VarintLen(v))
+		}
+
+		decoded, err := DecodeVarint(&buf)
+		switch {
+		case err != nil:
+			t.Fatalf("DecodeVarint for %d: %v", v, err)
+		case decoded != v:
+			t.Fatalf("roundtrip failed: encoded %d, decoded %d", v, decoded)
+		}
+	}
+}
+
+// TestVarintLen pins the encoded size reported by VarintLen at the first
+// group boundaries and at the maximum uint64 value.
+func TestVarintLen(t *testing.T) {
+	cases := []struct {
+		input uint64
+		want  int
+	}{
+		{input: 0, want: 1},
+		{input: 127, want: 1},
+		{input: 128, want: 2},
+		{input: 16383, want: 2},
+		{input: 16384, want: 3},
+		{input: ^uint64(0), want: 10},
+	}
+
+	for _, c := range cases {
+		if got := VarintLen(c.input); got != c.want {
+			t.Errorf("VarintLen(%d) = %d, want %d", c.input, got, c.want)
+		}
+	}
+}
+
+// TestVarintSequence writes several varints back to back into one buffer and
+// checks that they decode in order, i.e. that each value is self-delimiting.
+func TestVarintSequence(t *testing.T) {
+	values := []uint64{0, 42, 1000000, ^uint64(0), 1}
+
+	var stream bytes.Buffer
+	for i := range values {
+		_, err := EncodeVarint(&stream, values[i])
+		if err != nil {
+			t.Fatalf("EncodeVarint(%d): %v", values[i], err)
+		}
+	}
+
+	for i := range values {
+		got, err := DecodeVarint(&stream)
+		if err != nil {
+			t.Fatalf("DecodeVarint: %v", err)
+		}
+		if want := values[i]; got != want {
+			t.Errorf("got %d, want %d", got, want)
+		}
+	}
+}
--- a/pkg/obilua/obiseqslice.go
+++ b/pkg/obilua/obiseqslice.go
@@ -31,7 +31,8 @@ func obiseqslice2Lua(interpreter *lua.LState,
 }

 func newObiSeqSlice(luaState *lua.LState) int {
-	seqslice := obiseq.NewBioSequenceSlice()
+	capacity := luaState.OptInt(1, 0)
+	seqslice := obiseq.NewBioSequenceSlice(capacity)
 	luaState.Push(obiseqslice2Lua(luaState, seqslice))
 	return 1
 }
--- a/pkg/obioptions/options.go
+++ b/pkg/obioptions/options.go
@@ -8,6 +8,7 @@ import (

 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
 	log "github.com/sirupsen/logrus"

 	"github.com/DavidGamba/go-getoptions"
@@ -26,16 +27,11 @@ var __defaut_taxonomy_mutex__ sync.Mutex

 type ArgumentParser func([]string) (*getoptions.GetOpt, []string)

-func GenerateOptionParser(program string,
-	documentation string,
-	optionset ...func(*getoptions.GetOpt)) ArgumentParser {
-
-	options := getoptions.New()
-	options.Self(program, documentation)
-	options.SetMode(getoptions.Bundling)
-	options.SetUnknownMode(getoptions.Fail)
-	options.Bool("help", false, options.Alias("h", "?"))
-
+// RegisterGlobalOptions registers the global options shared by all obitools
+// commands onto the given GetOpt instance. It does NOT register --help,
+// which must be handled by the caller (either as a Bool option or via
+// HelpCommand for subcommand-based parsers).
+func RegisterGlobalOptions(options *getoptions.GetOpt) {
 	options.Bool("version", false,
 		options.Description("Prints the version and exits."))

@@ -46,17 +42,10 @@ func GenerateOptionParser(program string,
 	options.BoolVar(&_Pprof, "pprof", false,
 		options.Description("Enable pprof server. Look at the log for details."))

-	// options.IntVar(&_ParallelWorkers, "workers", _ParallelWorkers,
-	// 	options.Alias("w"),
-	// 	options.Description("Number of parallele threads computing the result"))
-
 	options.IntVar(obidefault.MaxCPUPtr(), "max-cpu", obidefault.MaxCPU(),
 		options.GetEnv("OBIMAXCPU"),
 		options.Description("Number of parallele threads computing the result"))

-	// options.BoolVar(&_Pprof, "force-one-cpu", false,
-	// 	options.Description("Force to use only one cpu core for parallel processing"))
-
 	options.IntVar(&_PprofMudex, "pprof-mutex", _PprofMudex,
 		options.GetEnv("OBIPPROFMUTEX"),
 		options.Description("Enable profiling of mutex lock."))
@@ -67,7 +56,15 @@ func GenerateOptionParser(program string,

 	options.IntVar(obidefault.BatchSizePtr(), "batch-size", obidefault.BatchSize(),
 		options.GetEnv("OBIBATCHSIZE"),
-		options.Description("Number of sequence per batch for paralelle processing"))
+		options.Description("Minimum number of sequences per batch (floor, default 1)"))
+
+	options.IntVar(obidefault.BatchSizeMaxPtr(), "batch-size-max", obidefault.BatchSizeMax(),
+		options.GetEnv("OBIBATCHSIZEMAX"),
+		options.Description("Maximum number of sequences per batch (ceiling, default 2000)"))
+
+	options.StringVar(obidefault.BatchMemStrPtr(), "batch-mem", "",
+		options.GetEnv("OBIBATCHMEM"),
+		options.Description("Maximum memory per batch (e.g. 128K, 64M, 1G; default: 128M). Set to 0 to disable."))

 	options.Bool("solexa", false,
 		options.GetEnv("OBISOLEXA"),
@@ -77,19 +74,22 @@ func GenerateOptionParser(program string,
 		options.GetEnv("OBIWARNING"),
 		options.Description("Stop printing of the warning message"),
 	)
+}

-	for _, o := range optionset {
-		o(options)
-	}
-
-	return func(args []string) (*getoptions.GetOpt, []string) {
-
-		remaining, err := options.Parse(args[1:])
-
+// ProcessParsedOptions handles the post-parse logic common to all obitools
+// commands: help, version, debug, pprof, taxonomy, cpu configuration, etc.
+// It receives the GetOpt instance and the parse error (if any).
+func ProcessParsedOptions(options *getoptions.GetOpt, parseErr error) {
+	// Note: "help" may not be registered as a Bool (e.g. when using HelpCommand
+	// for subcommand-based parsers). Only check if it won't panic.
+	// We use a recover guard to be safe.
+	func() {
+		defer func() { recover() }()
 		if options.Called("help") {
 			fmt.Fprint(os.Stderr, options.Help())
 			os.Exit(0)
 		}
+	}()

 	if options.Called("version") {
 		fmt.Fprintf(os.Stderr, "OBITools %s\n", VersionString())
@@ -107,7 +107,6 @@ func GenerateOptionParser(program string,

 		if err != nil {
 			log.Fatalf("Cannot load default taxonomy: %v", err)
-
 		}

 		taxonomy.SetAsDefault()
@@ -146,32 +145,14 @@ func GenerateOptionParser(program string,
 	}

 	// Handle user errors
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "ERROR: %s\n\n", err)
+	if parseErr != nil {
+		fmt.Fprintf(os.Stderr, "ERROR: %s\n\n", parseErr)
 		fmt.Fprint(os.Stderr, options.Help(getoptions.HelpSynopsis))
 		os.Exit(1)
 	}

-		// // Setup the maximum number of CPU usable by the program
-		// if obidefault.MaxCPU() == 1 {
-		// 	log.Warn("Limitating the Maximum number of CPU to 1 is not recommanded")
-		// 	log.Warn("The number of CPU requested has been set to 2")
-		// 	obidefault.SetMaxCPU(2)
-		// }
-
-		// if options.Called("force-one-cpu") {
-		// 	log.Warn("Limitating the Maximum number of CPU to 1 is not recommanded")
-		// 	log.Warn("The number of CPU has been forced to 1")
-		// 	log.Warn("This can lead to unexpected behavior")
-		// 	obidefault.SetMaxCPU(1)
-		// }
-
 	runtime.GOMAXPROCS(obidefault.MaxCPU())

-		// if options.Called("max-cpu") || options.Called("force-one-cpu") {
-		// 	log.Printf("CPU number limited to %d", obidefault.MaxCPU())
-		// }
-
 	if options.Called("max-cpu") {
 		log.Printf("CPU number limited to %d", obidefault.MaxCPU())
 	}
@@ -182,14 +163,39 @@ func GenerateOptionParser(program string,

 	log.Printf("Number of workers set %d", obidefault.ParallelWorkers())

-		// if options.Called("workers") {
-
-		// }
-
 	if options.Called("solexa") {
 		obidefault.SetReadQualitiesShift(64)
 	}

+	if options.Called("batch-mem") {
+		n, err := obiutils.ParseMemSize(obidefault.BatchMemStr())
+		if err != nil {
+			log.Fatalf("Invalid --batch-mem value %q: %v", obidefault.BatchMemStr(), err)
+		}
+		obidefault.SetBatchMem(n)
+		log.Printf("Memory-based batching enabled: %s per batch", obidefault.BatchMemStr())
+	}
+}
+
+func GenerateOptionParser(program string,
+	documentation string,
+	optionset ...func(*getoptions.GetOpt)) ArgumentParser {
+
+	options := getoptions.New()
+	options.Self(program, documentation)
+	options.SetMode(getoptions.Bundling)
+	options.SetUnknownMode(getoptions.Fail)
+	options.Bool("help", false, options.Alias("h", "?"))
+
+	RegisterGlobalOptions(options)
+
+	for _, o := range optionset {
+		o(options)
+	}
+
+	return func(args []string) (*getoptions.GetOpt, []string) {
+		remaining, err := options.Parse(args[1:])
+		ProcessParsedOptions(options, err)
 		return options, remaining
 	}
 }
--- a/pkg/obioptions/subcommand.go
+++ b/pkg/obioptions/subcommand.go
@@ -0,0 +1,43 @@
+package obioptions
+
+import (
+	"github.com/DavidGamba/go-getoptions"
+)
+
+// GenerateSubcommandParser builds an option parser for programs organised in
+// subcommands (go-getoptions' NewCommand/SetCommandFn/Dispatch API).
+//
+// The root parser is configured in this order:
+//  1. the global options are registered through RegisterGlobalOptions, so
+//     they exist before any subcommand is declared;
+//  2. setup is called with the root *GetOpt and is expected to declare the
+//     subcommands with opt.NewCommand();
+//  3. a "help" subcommand is added last, after all other commands.
+//
+// It returns the root *GetOpt (needed for Dispatch) together with an
+// ArgumentParser that parses args[1:] and then runs ProcessParsedOptions
+// on the outcome.
+func GenerateSubcommandParser(
+	program string,
+	documentation string,
+	setup func(opt *getoptions.GetOpt),
+) (*getoptions.GetOpt, ArgumentParser) {
+
+	root := getoptions.New()
+	root.Self(program, documentation)
+	root.SetMode(getoptions.Bundling)
+	root.SetUnknownMode(getoptions.Fail)
+
+	// Global options first, then the caller's subcommands.
+	RegisterGlobalOptions(root)
+	setup(root)
+
+	// The automatic help subcommand must come after all commands.
+	root.HelpCommand("help", root.Description("Show help for a command"))
+
+	return root, func(args []string) (*getoptions.GetOpt, []string) {
+		remaining, parseErr := root.Parse(args[1:])
+		ProcessParsedOptions(root, parseErr)
+		return root, remaining
+	}
+}
--- a/pkg/obioptions/version.go
+++ b/pkg/obioptions/version.go
@@ -3,7 +3,7 @@ package obioptions
 // Version is automatically updated by the Makefile from version.txt
 // The patch number (third digit) is incremented on each push to the repository

-var _Version = "Release 4.4.7"
+var _Version = "Release 4.4.29"

 // Version returns the version of the obitools package.
 //
--- a/pkg/obiseq/biosequence.go
+++ b/pkg/obiseq/biosequence.go
@@ -120,6 +120,19 @@ func NewBioSequence(id string,
 	return bs
 }

+// NewBioSequenceOwning builds a BioSequence that adopts the given sequence
+// slice instead of copying it (see TakeSequence). The slice belongs to the
+// returned BioSequence afterwards: the caller must not use it after this
+// call. Intended for slices allocated specifically for this sequence.
+func NewBioSequenceOwning(id string,
+	sequence []byte,
+	definition string) *BioSequence {
+	owned := NewEmptyBioSequence(0)
+	owned.SetId(id)
+	owned.TakeSequence(sequence)
+	owned.SetDefinition(definition)
+	return owned
+}
+
 // NewBioSequenceWithQualities creates a new BioSequence object with the given id, sequence, definition, and qualities.
 //
 // Parameters:
@@ -260,6 +273,28 @@ func (s *BioSequence) Len() int {
 	return len(s.sequence)
 }

+// MemorySize estimates, in bytes, the memory held by the BioSequence.
+// The estimate adds up a fixed per-object overhead, the capacity of the
+// sequence, quality and feature buffers, the length of the id and source
+// strings, and a flat cost per annotation entry. Capacities rather than
+// lengths are used for the buffers, which keeps the figure on the
+// conservative side for memory-based batching decisions.
+// A nil receiver has size 0.
+func (s *BioSequence) MemorySize() int {
+	if s == nil {
+		return 0
+	}
+
+	const (
+		// fixed struct overhead (strings, pointers, mutex pointer)
+		structOverhead = 128
+		// rough annotation estimate: each key+value pair ~64 bytes on average
+		perAnnotation = 64
+	)
+
+	buffers := cap(s.sequence) + cap(s.qualities) + cap(s.feature)
+	labels := len(s.id) + len(s.source)
+
+	return structOverhead + buffers + labels + len(s.annotations)*perAnnotation
+}
+
 // HasQualities checks if the BioSequence has sequence qualitiy scores.
 //
 // This function does not have any parameters.
@@ -444,6 +479,12 @@ func (s *BioSequence) SetSequence(sequence []byte) {
 	s.sequence = obiutils.InPlaceToLower(CopySlice(sequence))
 }

+// TakeSequence adopts the given slice as the sequence without copying it;
+// the bytes are lowercased in place by obiutils.InPlaceToLower.
+// The caller must not use the slice after this call.
+func (s *BioSequence) TakeSequence(sequence []byte) {
+	lowered := obiutils.InPlaceToLower(sequence)
+	s.sequence = lowered
+}
+
 func (s *BioSequence) HasValidSequence() bool {
 	for _, c := range s.sequence {
 		if !((c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '[' || c == ']') {
@@ -461,6 +502,15 @@ func (s *BioSequence) SetQualities(qualities Quality) {
 	s.qualities = CopySlice(qualities)
 }

+// TakeQualities stores the slice directly without copying.
+// The caller must not use the slice after this call.
+//
+// A previously held quality slice is handed to RecycleSlice through a local
+// copy of the slice header. RecycleSlice may keep the pointer it receives
+// (it can put it into the slice pool), so passing the address of the
+// s.qualities field itself would leave that retained pointer aliasing the
+// qualities installed here.
+func (s *BioSequence) TakeQualities(qualities Quality) {
+	previous := s.qualities
+	s.qualities = qualities
+	if previous != nil {
+		RecycleSlice(&previous)
+	}
+}
+
 // A method that appends a byte slice to the qualities of the BioSequence.
 func (s *BioSequence) WriteQualities(data []byte) (int, error) {
 	s.qualities = append(s.qualities, data...)
--- a/pkg/obiseq/biosequenceslice.go
+++ b/pkg/obiseq/biosequenceslice.go
@@ -195,7 +195,7 @@ func (s *BioSequenceSlice) ExtractTaxonomy(taxonomy *obitax.Taxonomy, seqAsTaxa
 				return nil, fmt.Errorf("sequence %v has no path", s.Id())
 			}
 			last := path[len(path)-1]
-			taxname, _ := obiutils.SplitInTwo(last, ':')
+			taxname, _ := obiutils.LeftSplitInTwo(last, ':')
 			if idx, ok := s.GetIntAttribute("seq_number"); !ok {
 				return nil, errors.New("sequences are not numbered")
 			} else {
--- a/pkg/obiseq/pool.go
+++ b/pkg/obiseq/pool.go
@@ -1,13 +1,20 @@
 package obiseq

 import (
+	"runtime"
 	"sync"
+	"sync/atomic"

 	log "github.com/sirupsen/logrus"

 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
 )

+const _LargeSliceThreshold = 100 * 1024        // 100 kb — below: leave to GC, above: trigger explicit GC
+const _GCBytesBudget = int64(256 * 1024 * 1024) // trigger GC every 256 MB of large discards
+
+var _largeSliceDiscardedBytes = atomic.Int64{}
+
 var _BioSequenceByteSlicePool = sync.Pool{
 	New: func() interface{} {
 		bs := make([]byte, 0, 300)
@@ -34,6 +41,13 @@ func RecycleSlice(s *[]byte) {
 		}
 		if cap(*s) <= 1024 {
 			_BioSequenceByteSlicePool.Put(s)
+		} else if cap(*s) >= _LargeSliceThreshold {
+			n := int64(cap(*s))
+			*s = nil
+			prev := _largeSliceDiscardedBytes.Load()
+			if _largeSliceDiscardedBytes.Add(n)/_GCBytesBudget > prev/_GCBytesBudget {
+				runtime.GC()
+			}
 		}
 	}
 }
--- a/pkg/obiseq/worker.go
+++ b/pkg/obiseq/worker.go
@@ -104,11 +104,11 @@ func SeqToSliceWorker(worker SeqWorker,
 			for _, s := range input {
 				r, err := worker(s)
 				if err == nil {
-					for _, rs := range r {
-						if i == len(output) {
-							output = slices.Grow(output, cap(output))
+					if i+len(r) > cap(output) {
+						output = slices.Grow(output[:i], len(r))
 						output = output[:cap(output)]
 					}
+					for _, rs := range r {
 						output[i] = rs
 						i++
 					}
--- a/pkg/obitax/taxid.go
+++ b/pkg/obitax/taxid.go
@@ -31,7 +31,7 @@ func NewTaxidFactory(code string, alphabet obiutils.AsciiSet) *TaxidFactory {
 // It extracts the relevant part of the string after the first colon (':') if present.
 func (f *TaxidFactory) FromString(taxid string) (Taxid, error) {
 	taxid = obiutils.AsciiSpaceSet.TrimLeft(taxid)
-	part1, part2 := obiutils.SplitInTwo(taxid, ':')
+	part1, part2 := obiutils.LeftSplitInTwo(taxid, ':')
 	if len(part2) == 0 {
 		taxid = part1
 	} else {
--- a/pkg/obitools/obiclean/graph.go
+++ b/pkg/obitools/obiclean/graph.go
@@ -13,6 +13,7 @@ import (
 	log "github.com/sirupsen/logrus"

 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
 	"github.com/schollz/progressbar/v3"
 )
@@ -63,12 +64,14 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
 		fmt.Println(err)
 	}

-	destfile, err := obiutils.CompressStream(file, true, true)
+	destfile, err := obiutils.CompressStream(file, compressed, true)
 	if err != nil {
 		fmt.Println(err)
 	}
 	defer destfile.Close()

+	var bar *progressbar.ProgressBar
+	if obidefault.ProgressBar() {
 		pbopt := make([]progressbar.Option, 0, 5)
 		pbopt = append(pbopt,
 			progressbar.OptionSetWriter(os.Stderr),
@@ -77,8 +80,8 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
 			progressbar.OptionSetPredictTime(true),
 			progressbar.OptionSetDescription("[Save CSV stat ratio file]"),
 		)
-
-	bar := progressbar.NewOptions(len(data), pbopt...)
+		bar = progressbar.NewOptions(len(data), pbopt...)
+	}

 	fmt.Fprintln(destfile, "Sample,Origin_id,Origin_status,Origin,Mutant,Origin_Weight,Mutant_Weight,Origin_Count,Mutant_Count,Position,Origin_length,A,C,G,T")
 	for code, dist := range data {
@@ -101,8 +104,10 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
 				ratio.T,
 			)
 		}
+		if bar != nil {
 			bar.Add(1)
 		}
+	}
 }

 // It takes a slice of sequences, a sample name and a statistical threshold and returns a string
@@ -181,6 +186,8 @@ func SaveGMLGraphs(dirname string,
 		}
 	}

+	var bar *progressbar.ProgressBar
+	if obidefault.ProgressBar() {
 		pbopt := make([]progressbar.Option, 0, 5)
 		pbopt = append(pbopt,
 			progressbar.OptionSetWriter(os.Stderr),
@@ -189,8 +196,8 @@ func SaveGMLGraphs(dirname string,
 			progressbar.OptionSetPredictTime(true),
 			progressbar.OptionSetDescription("[Save GML Graph files]"),
 		)
-
-	bar := progressbar.NewOptions(len(samples), pbopt...)
+		bar = progressbar.NewOptions(len(samples), pbopt...)
+	}

 	for name, seqs := range samples {

@@ -204,8 +211,10 @@ func SaveGMLGraphs(dirname string,
 		file.WriteString(Gml(seqs, name, statThreshold))
 		file.Close()

+		if bar != nil {
 			bar.Add(1)
 		}
+	}

 }

@@ -495,6 +504,8 @@ func BuildSeqGraph(samples map[string]*[]*seqPCR,
 		npairs += nseq * (nseq - 1) / 2
 	}

+	var bar *progressbar.ProgressBar
+	if obidefault.ProgressBar() {
 		pbopt := make([]progressbar.Option, 0, 5)
 		pbopt = append(pbopt,
 			progressbar.OptionSetWriter(os.Stderr),
@@ -503,16 +514,19 @@ func BuildSeqGraph(samples map[string]*[]*seqPCR,
 			progressbar.OptionSetPredictTime(true),
 			progressbar.OptionSetDescription("[One error graph]"),
 		)
+		bar = progressbar.NewOptions(npairs, pbopt...)
+	}

-	bar := progressbar.NewOptions(npairs, pbopt...)
 	for _, seqs := range samples {
 		np := buildSamplePairs(seqs, workers)
-
+		if bar != nil {
 			bar.Add(np)
 		}
+	}

 	if maxError > 1 {
-		pbopt = make([]progressbar.Option, 0, 5)
+		if obidefault.ProgressBar() {
+			pbopt := make([]progressbar.Option, 0, 5)
 			pbopt = append(pbopt,
 				progressbar.OptionSetWriter(os.Stderr),
 				progressbar.OptionSetWidth(15),
@@ -520,12 +534,14 @@ func BuildSeqGraph(samples map[string]*[]*seqPCR,
 				progressbar.OptionSetPredictTime(true),
 				progressbar.OptionSetDescription("[Adds multiple errors]"),
 			)
-
 			bar = progressbar.NewOptions(npairs, pbopt...)
+		}

 		for _, seqs := range samples {
 			np := extendSimilarityGraph(seqs, maxError, workers)
+			if bar != nil {
 				bar.Add(np)
 			}
 		}
+	}
 }
--- a/pkg/obitools/obiconvert/options.go
+++ b/pkg/obitools/obiconvert/options.go
@@ -31,7 +31,6 @@ var __output_in_json__ = false
 var __output_fastjson_format__ = false
 var __output_fastobi_format__ = false

-var __no_progress_bar__ = false
 var __skip_empty__ = false
 var __skip_on_error__ = false

@@ -82,7 +81,7 @@ func InputOptionSet(options *getoptions.GetOpt) {
 }

 func OutputModeOptionSet(options *getoptions.GetOpt, compressed bool) {
-	options.BoolVar(&__no_progress_bar__, "no-progressbar", false,
+	options.BoolVar(obidefault.NoProgressBarPtr(), "no-progressbar", obidefault.NoProgressBar(),
 		options.Description("Disable the progress bar printing"))

 	if compressed {
@@ -224,13 +223,16 @@ func CLIAnalyzeOnly() int {

 func CLIProgressBar() bool {
 	// If the output is not a terminal, then we do not display the progress bar
-	o, _ := os.Stderr.Stat()
-	onTerminal := (o.Mode() & os.ModeCharDevice) == os.ModeCharDevice
+	oe, _ := os.Stderr.Stat()
+	onTerminal := (oe.Mode() & os.ModeCharDevice) == os.ModeCharDevice
 	if !onTerminal {
 		log.Info("Stderr is redirected, progress bar disabled")
 	}

-	return onTerminal && !__no_progress_bar__
+	oo, _ := os.Stdout.Stat()
+	toPipe := (oo.Mode() & os.ModeNamedPipe) == os.ModeNamedPipe
+
+	return onTerminal && !toPipe && obidefault.ProgressBar()
 }

 func CLIOutPutFileName() string {
--- a/pkg/obitools/obiconvert/sequence_reader.go
+++ b/pkg/obitools/obiconvert/sequence_reader.go
@@ -68,6 +68,8 @@ func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
 						strings.HasSuffix(path, "seq.gz") ||
 						strings.HasSuffix(path, "gb") ||
 						strings.HasSuffix(path, "gb.gz") ||
+						strings.HasSuffix(path, "gbff") ||
+						strings.HasSuffix(path, "gbff.gz") ||
 						strings.HasSuffix(path, "dat") ||
 						strings.HasSuffix(path, "dat.gz") ||
 						strings.HasSuffix(path, "ecopcr") ||
@@ -204,15 +206,15 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
 					iterator = iterator.PairTo(ip)
 				}
 			} else {
-				iterator = obiiter.NilIBioSequence
+				return obiiter.NilIBioSequence, fmt.Errorf("no sequence files found in the provided paths")
 			}
 		}

 	}

-	if CLIProgressBar() {
 	iterator = iterator.Speed("Reading sequences")
-	}
+
+	iterator = iterator.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())

 	return iterator, nil
 }
--- a/pkg/obitools/obicsv/obicsv.go
+++ b/pkg/obitools/obicsv/obicsv.go
@@ -12,9 +12,7 @@ import (
 func CLIWriteSequenceCSV(iterator obiiter.IBioSequence,
 	terminalAction bool, filenames ...string) *obiitercsv.ICSVRecord {

-	if obiconvert.CLIProgressBar() {
 	iterator = iterator.Speed("Writing CSV")
-	}

 	opts := make([]WithOption, 0, 10)

--- a/pkg/obitools/obidistribute/distribute.go
+++ b/pkg/obitools/obidistribute/distribute.go
@@ -46,8 +46,7 @@ func CLIDistributeSequence(sequences obiiter.IBioSequence) {
 		formater = obiformats.WriteSequencesToFile
 	}

-	dispatcher := sequences.Distribute(CLISequenceClassifier(),
-		obidefault.BatchSize())
+	dispatcher := sequences.Distribute(CLISequenceClassifier())

 	obiformats.WriterDispatcher(CLIFileNamePattern(),
 		dispatcher, formater, opts...,
--- a/pkg/obitools/obik/cp.go
+++ b/pkg/obitools/obik/cp.go
@@ -0,0 +1,55 @@
+package obik
+
+import (
+	"context"
+	"fmt"
+
+	log "github.com/sirupsen/logrus"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+// runCp implements the "obik cp" subcommand: it opens the kmer index found
+// in the first positional argument and copies sets from it into the index
+// directory given as second positional argument.
+//
+// The sets to copy are those matching the patterns returned by
+// CLISetPatterns(); when no pattern is given, every set of the source index
+// is copied. CLIForce() is forwarded to the copy operation.
+//
+// It returns an error when fewer than two positional arguments are given,
+// when the source index cannot be opened, when pattern matching fails or
+// selects no set, or when the copy itself fails.
+func runCp(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
+	if len(args) < 2 {
+		return fmt.Errorf("usage: obik cp [--set PATTERN]... [--force] <source_index> <dest_index>")
+	}
+	srcDir, destDir := args[0], args[1]
+
+	ksg, err := obikmer.OpenKmerSetGroup(srcDir)
+	if err != nil {
+		return fmt.Errorf("failed to open source kmer index: %w", err)
+	}
+
+	// Resolve the identifiers of the sets to copy.
+	var ids []string
+	if patterns := CLISetPatterns(); len(patterns) == 0 {
+		// No pattern: copy all sets
+		ids = ksg.SetsIDs()
+	} else {
+		indices, err := ksg.MatchSetIDs(patterns)
+		switch {
+		case err != nil:
+			return err
+		case len(indices) == 0:
+			return fmt.Errorf("no sets match the given patterns")
+		}
+		ids = make([]string, 0, len(indices))
+		for _, idx := range indices {
+			ids = append(ids, ksg.SetIDOf(idx))
+		}
+	}
+
+	log.Infof("Copying %d set(s) from %s to %s", len(ids), srcDir, destDir)
+
+	dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce())
+	if err != nil {
+		return err
+	}
+
+	log.Infof("Destination now has %d set(s)", dest.Size())
+	return nil
+}
--- a/pkg/obitools/obik/filter.go
+++ b/pkg/obitools/obik/filter.go
@@ -0,0 +1,344 @@
+package obik
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"sync/atomic"
+
+	"github.com/schollz/progressbar/v3"
+	log "github.com/sirupsen/logrus"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+// KmerFilter is a predicate applied to individual k-mers during filtering.
+// It receives the k-mer as a uint64 and returns true if the k-mer should
+// be kept.
+type KmerFilter func(kmer uint64) bool
+
+// KmerFilterFactory creates a new KmerFilter instance.
+// Each goroutine should call the factory to get its own filter,
+// since some filters (e.g. KmerEntropyFilter) are not thread-safe.
+type KmerFilterFactory func() KmerFilter
+
+// chainFilterFactories folds several KmerFilterFactory values into one.
+//
+// With no factory the result builds filters that accept every k-mer; with
+// exactly one factory that factory is returned unchanged. Otherwise each
+// call of the returned factory instantiates one filter per input factory
+// and yields a predicate that accepts a k-mer only when all of them do,
+// stopping at the first rejection.
+func chainFilterFactories(factories []KmerFilterFactory) KmerFilterFactory {
+	if len(factories) == 0 {
+		return func() KmerFilter {
+			return func(uint64) bool { return true }
+		}
+	}
+	if len(factories) == 1 {
+		return factories[0]
+	}
+
+	return func() KmerFilter {
+		// Fresh filter instances for every call of the factory.
+		chain := make([]KmerFilter, 0, len(factories))
+		for _, newFilter := range factories {
+			chain = append(chain, newFilter())
+		}
+		return func(kmer uint64) bool {
+			for i := range chain {
+				if !chain[i](kmer) {
+					return false
+				}
+			}
+			return true
+		}
+	}
+}
+
+// runFilter implements the "obik filter" subcommand.
+// It reads an existing kmer index, applies a chain of filters,
+// and writes a new filtered index.
+//
+// args[0] is the source index directory; the destination directory comes
+// from CLIOutputDirectory(). ctx and opt are not used by this function.
+//
+// An error is returned when the source argument or the output directory is
+// missing, when no filter option is active, when the patterns from
+// CLISetPatterns() match no set, or when opening the source, creating a
+// directory, filtering a partition or writing the metadata fails.
+func runFilter(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
+	if len(args) < 1 {
+		return fmt.Errorf("usage: obik filter [options] <source_index> --out <dest_index>")
+	}
+
+	srcDir := args[0]
+	destDir := CLIOutputDirectory()
+	if destDir == "" || destDir == "-" {
+		return fmt.Errorf("--out option is required and must specify a destination directory")
+	}
+
+	// Open source index
+	src, err := obikmer.OpenKmerSetGroup(srcDir)
+	if err != nil {
+		return fmt.Errorf("failed to open source index: %w", err)
+	}
+
+	k := src.K()
+
+	// Build filter factory chain from CLI options.
+	// Factories are used so each goroutine creates its own filter instance,
+	// since some filters (e.g. KmerEntropyFilter) have mutable state.
+	var factories []KmerFilterFactory
+	var filterDescriptions []string
+
+	// Entropy filter (only enabled for a strictly positive threshold)
+	entropyThreshold := CLIIndexEntropyThreshold()
+	entropySize := CLIIndexEntropySize()
+	if entropyThreshold > 0 {
+		factories = append(factories, func() KmerFilter {
+			ef := obikmer.NewKmerEntropyFilter(k, entropySize, entropyThreshold)
+			return ef.Accept
+		})
+		filterDescriptions = append(filterDescriptions,
+			fmt.Sprintf("entropy(threshold=%.4f, level-max=%d)", entropyThreshold, entropySize))
+	}
+
+	// Future filters will be added here, e.g.:
+	// quorumFilter, frequencyFilter, ...
+
+	if len(factories) == 0 {
+		return fmt.Errorf("no filter specified; use --entropy-filter or other filter options")
+	}
+
+	filterFactory := chainFilterFactories(factories)
+
+	// Resolve set selection (default: all sets)
+	patterns := CLISetPatterns()
+	var setIndices []int
+	if len(patterns) > 0 {
+		setIndices, err = src.MatchSetIDs(patterns)
+		if err != nil {
+			return fmt.Errorf("failed to match set patterns: %w", err)
+		}
+		if len(setIndices) == 0 {
+			return fmt.Errorf("no sets match the given patterns")
+		}
+	} else {
+		setIndices = make([]int, src.Size())
+		for i := range setIndices {
+			setIndices[i] = i
+		}
+	}
+
+	log.Infof("Filtering %d set(s) from %s with: %s",
+		len(setIndices), srcDir, strings.Join(filterDescriptions, " + "))
+
+	// Create destination directory
+	if err := os.MkdirAll(destDir, 0755); err != nil {
+		return fmt.Errorf("failed to create destination: %w", err)
+	}
+
+	P := src.Partitions()
+
+	// Progress bar for partition filtering: one tick per (set, partition) pair.
+	// bar stays nil when obidefault.ProgressBar() is false.
+	totalPartitions := len(setIndices) * P
+	var bar *progressbar.ProgressBar
+	if obidefault.ProgressBar() {
+		pbopt := []progressbar.Option{
+			progressbar.OptionSetWriter(os.Stderr),
+			progressbar.OptionSetWidth(15),
+			progressbar.OptionShowCount(),
+			progressbar.OptionShowIts(),
+			progressbar.OptionSetPredictTime(true),
+			progressbar.OptionSetDescription("[Filtering partitions]"),
+		}
+		bar = progressbar.NewOptions(totalPartitions, pbopt...)
+	}
+
+	// Process each selected set; newCounts[si] receives the number of k-mers
+	// kept for the si-th selected set.
+	newCounts := make([]uint64, len(setIndices))
+
+	for si, srcIdx := range setIndices {
+		// setID is only used for messages; unnamed sets get a synthetic label.
+		setID := src.SetIDOf(srcIdx)
+		if setID == "" {
+			setID = fmt.Sprintf("set_%d", srcIdx)
+		}
+
+		// Destination sets are numbered by their position in the selection
+		// (si), not by their index in the source (srcIdx).
+		destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", si))
+		if err := os.MkdirAll(destSetDir, 0755); err != nil {
+			return fmt.Errorf("failed to create set directory: %w", err)
+		}
+
+		// Process partitions in parallel (never more workers than partitions)
+		nWorkers := obidefault.ParallelWorkers()
+		if nWorkers > P {
+			nWorkers = P
+		}
+
+		var totalKept atomic.Uint64
+		var totalProcessed atomic.Uint64
+
+		type job struct {
+			partIdx int
+		}
+
+		// The channel is buffered with capacity P, so the P sends below never
+		// block even when workers stop early after an error.
+		jobs := make(chan job, P)
+		var wg sync.WaitGroup
+		var errMu sync.Mutex
+		var firstErr error
+
+		for w := 0; w < nWorkers; w++ {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				// Each goroutine gets its own filter instance
+				workerFilter := filterFactory()
+				for j := range jobs {
+					kept, processed, err := filterPartition(
+						src.PartitionPath(srcIdx, j.partIdx),
+						filepath.Join(destSetDir, fmt.Sprintf("part_%04d.kdi", j.partIdx)),
+						workerFilter,
+					)
+					if err != nil {
+						// Remember only the first error, then stop this worker.
+						// The other workers keep consuming the remaining jobs.
+						errMu.Lock()
+						if firstErr == nil {
+							firstErr = err
+						}
+						errMu.Unlock()
+						return
+					}
+					totalKept.Add(kept)
+					totalProcessed.Add(processed)
+					if bar != nil {
+						bar.Add(1)
+					}
+				}
+			}()
+		}
+
+		for p := 0; p < P; p++ {
+			jobs <- job{p}
+		}
+		close(jobs)
+		wg.Wait()
+
+		// firstErr is read only after wg.Wait(), i.e. once all workers are done.
+		if firstErr != nil {
+			return fmt.Errorf("failed to filter set %q: %w", setID, firstErr)
+		}
+
+		kept := totalKept.Load()
+		processed := totalProcessed.Load()
+		newCounts[si] = kept
+		// max(processed, 1) avoids a division by zero for an empty set.
+		log.Infof("Set %q: %d/%d k-mers kept (%.1f%% removed)",
+			setID, kept, processed,
+			100.0*float64(processed-kept)/float64(max(processed, 1)))
+
+		// Copy spectrum.bin if it exists. This is best effort: a failed copy
+		// is logged as a warning and does not abort the command.
+		srcSpecPath := src.SpectrumPath(srcIdx)
+		if _, err := os.Stat(srcSpecPath); err == nil {
+			destSpecPath := filepath.Join(destSetDir, "spectrum.bin")
+			if err := copyFileHelper(srcSpecPath, destSpecPath); err != nil {
+				log.Warnf("Could not copy spectrum for set %q: %v", setID, err)
+			}
+		}
+	}
+
+	// NOTE(review): presumably terminates the progress bar line on stderr — confirm.
+	if bar != nil {
+		fmt.Fprintln(os.Stderr)
+	}
+
+	// Build destination metadata: IDs and per-set metadata are carried over
+	// from the source, in selection order.
+	setsIDs := make([]string, len(setIndices))
+	setsMetadata := make([]map[string]interface{}, len(setIndices))
+	for i, srcIdx := range setIndices {
+		setsIDs[i] = src.SetIDOf(srcIdx)
+		setsMetadata[i] = src.AllSetMetadata(srcIdx)
+		if setsMetadata[i] == nil {
+			setsMetadata[i] = make(map[string]interface{})
+		}
+	}
+
+	// Write metadata for the filtered index
+	dest, err := obikmer.NewFilteredKmerSetGroup(
+		destDir, k, src.M(), P,
+		len(setIndices), setsIDs, newCounts, setsMetadata,
+	)
+	if err != nil {
+		return fmt.Errorf("failed to create filtered metadata: %w", err)
+	}
+
+	// Copy group-level metadata and record applied filters
+	for key, value := range src.Metadata {
+		dest.SetAttribute(key, value)
+	}
+	if entropyThreshold > 0 {
+		dest.SetAttribute("entropy_filter", entropyThreshold)
+		dest.SetAttribute("entropy_filter_size", entropySize)
+	}
+	dest.SetAttribute("filtered_from", srcDir)
+
+	if err := dest.SaveMetadata(); err != nil {
+		return fmt.Errorf("failed to save metadata: %w", err)
+	}
+
+	log.Info("Done.")
+	return nil
+}
+
+// filterPartition streams the k-mers of one source .kdi partition through
+// accept and writes the accepted ones to a new .kdi file at destPath.
+//
+// When the source partition cannot be opened it is treated as empty: an
+// empty destination file is written and (0, 0) is returned together with
+// the result of closing that file.
+//
+// Returns (kept, processed, error); both counters are 0 when a write fails.
+func filterPartition(srcPath, destPath string, accept KmerFilter) (uint64, uint64, error) {
+	reader, openErr := obikmer.NewKdiReader(srcPath)
+	if openErr != nil {
+		// Empty partition — write empty KDI
+		empty, err := obikmer.NewKdiWriter(destPath)
+		if err != nil {
+			return 0, 0, err
+		}
+		return 0, 0, empty.Close()
+	}
+	defer reader.Close()
+
+	writer, err := obikmer.NewKdiWriter(destPath)
+	if err != nil {
+		return 0, 0, err
+	}
+
+	var nKept, nSeen uint64
+	for kmer, more := reader.Next(); more; kmer, more = reader.Next() {
+		nSeen++
+		if !accept(kmer) {
+			continue
+		}
+		if err := writer.Write(kmer); err != nil {
+			writer.Close()
+			return 0, 0, err
+		}
+		nKept++
+	}
+
+	return nKept, nSeen, writer.Close()
+}
+
+// copyFileHelper copies the file at src to dst (used for spectrum.bin etc.).
+//
+// dst is created, or truncated if it already exists. The function returns
+// the first error met while opening src, creating dst, copying the data or
+// closing dst. In particular a read failure in the middle of the copy is
+// reported to the caller instead of silently leaving a truncated dst.
+func copyFileHelper(src, dst string) error {
+	in, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer in.Close()
+
+	out, err := os.Create(dst)
+	if err != nil {
+		return err
+	}
+
+	// io.Copy stops at io.EOF without reporting it and returns any other
+	// read or write error.
+	if _, err := io.Copy(out, in); err != nil {
+		// The copy already failed; the close error would add nothing.
+		out.Close()
+		return err
+	}
+
+	// Closing is part of the write: report its error, and close only once.
+	return out.Close()
+}
--- a/pkg/obitools/obik/index.go
+++ b/pkg/obitools/obik/index.go
@@ -0,0 +1,154 @@
+package obik
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sync"
+	"sync/atomic"
+
+	log "github.com/sirupsen/logrus"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+// runIndex implements the "obik index" subcommand: it feeds the sequences
+// read from args into a kmer index stored in the directory given by
+// CLIOutputDirectory().
+//
+// When <outDir>/metadata.toml can be stat'ed the index is opened for
+// appending, otherwise a new index is created. ctx and opt are not used by
+// this function.
+//
+// An error is returned for a missing output directory, a k-mer size outside
+// [2, 31], a min-occurrence below 1, or when creating the builder, opening
+// the sequence files, finalizing the index or saving the metadata fails.
+func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
+	outDir := CLIOutputDirectory()
+	if outDir == "" || outDir == "-" {
+		return fmt.Errorf("--out option is required and must specify a directory path")
+	}
+
+	k := CLIKmerSize()
+	if k < 2 || k > 31 {
+		return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)
+	}
+
+	m := CLIMinimizerSize()
+
+	minOcc := CLIMinOccurrence()
+	if minOcc < 1 {
+		return fmt.Errorf("invalid min-occurrence: %d (must be >= 1)", minOcc)
+	}
+
+	maxOcc := CLIMaxOccurrence()
+
+	entropyThreshold := CLIIndexEntropyThreshold()
+	entropySize := CLIIndexEntropySize()
+
+	// Build options: each option is only added when its CLI value departs
+	// from the "disabled" value tested below.
+	var opts []obikmer.BuilderOption
+	if minOcc > 1 {
+		opts = append(opts, obikmer.WithMinFrequency(minOcc))
+	}
+	if maxOcc > 0 {
+		opts = append(opts, obikmer.WithMaxFrequency(maxOcc))
+	}
+	if topN := CLISaveFreqKmer(); topN > 0 {
+		opts = append(opts, obikmer.WithSaveFreqKmers(topN))
+	}
+	if entropyThreshold > 0 {
+		opts = append(opts, obikmer.WithEntropyFilter(entropyThreshold, entropySize))
+	}
+
+	// Determine whether to append to existing group or create new
+	var builder *obikmer.KmerSetGroupBuilder
+	var err error
+	metaPath := filepath.Join(outDir, "metadata.toml")
+	if _, statErr := os.Stat(metaPath); statErr == nil {
+		// Existing group: append. Note that k and m are not forwarded to the
+		// builder in this branch.
+		log.Infof("Appending to existing kmer index at %s", outDir)
+		builder, err = obikmer.AppendKmerSetGroupBuilder(outDir, 1, opts...)
+		if err != nil {
+			return fmt.Errorf("failed to open existing kmer index for appending: %w", err)
+		}
+	} else {
+		// New group (any stat error, not only "does not exist", leads here)
+		if maxOcc > 0 {
+			log.Infof("Creating new kmer index: k=%d, m=%d, occurrence=[%d,%d]", k, m, minOcc, maxOcc)
+		} else {
+			log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
+		}
+		// NOTE(review): the literals 1 and -1 are presumably the number of
+		// sets and a "use default" partition count — confirm against obikmer.
+		builder, err = obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...)
+		if err != nil {
+			return fmt.Errorf("failed to create kmer index builder: %w", err)
+		}
+	}
+
+	// Read and process sequences in parallel
+	sequences, err := obiconvert.CLIReadBioSequences(args...)
+	if err != nil {
+		return fmt.Errorf("failed to open sequence files: %w", err)
+	}
+
+	nworkers := obidefault.ParallelWorkers()
+	var seqCount atomic.Int64
+	var wg sync.WaitGroup
+
+	// consumer drains one iterator and adds every sequence to set 0 of the
+	// builder; builder.AddSequence is called concurrently from all consumers.
+	consumer := func(iter obiiter.IBioSequence) {
+		defer wg.Done()
+		for iter.Next() {
+			batch := iter.Get()
+			for _, seq := range batch.Slice() {
+				builder.AddSequence(0, seq)
+				seqCount.Add(1)
+			}
+		}
+	}
+
+	// nworkers-1 consumers run on Split() iterators and one more on the
+	// original iterator, so at least one consumer always runs.
+	for i := 1; i < nworkers; i++ {
+		wg.Add(1)
+		go consumer(sequences.Split())
+	}
+	wg.Add(1)
+	go consumer(sequences)
+	wg.Wait()
+
+	log.Infof("Processed %d sequences", seqCount.Load())
+
+	// Finalize
+	ksg, err := builder.Close()
+	if err != nil {
+		return fmt.Errorf("failed to finalize kmer index: %w", err)
+	}
+
+	// Apply index-id to the new set
+	newSetIdx := builder.StartIndex()
+	if id := CLIIndexId(); id != "" {
+		ksg.SetSetID(newSetIdx, id)
+	}
+
+	// Apply group-level tags (-S)
+	for key, value := range CLISetTag() {
+		ksg.SetAttribute(key, value)
+	}
+
+	// Apply per-set tags (-T) to the new set
+	// (_setMetaTags is declared outside this function)
+	for key, value := range _setMetaTags {
+		ksg.SetSetMetadata(newSetIdx, key, value)
+	}
+
+	// Record the active build parameters as group-level attributes.
+	if minOcc > 1 {
+		ksg.SetAttribute("min_occurrence", minOcc)
+	}
+	if maxOcc > 0 {
+		ksg.SetAttribute("max_occurrence", maxOcc)
+	}
+
+	if entropyThreshold > 0 {
+		ksg.SetAttribute("entropy_filter", entropyThreshold)
+		ksg.SetAttribute("entropy_filter_size", entropySize)
+	}
+
+	if err := ksg.SaveMetadata(); err != nil {
+		return fmt.Errorf("failed to save metadata: %w", err)
+	}
+
+	log.Infof("Index contains %d k-mers for set %d in %s", ksg.Len(newSetIdx), newSetIdx, outDir)
+	log.Info("Done.")
+	return nil
+}
--- a/pkg/obitools/obik/lowmask.go
+++ b/pkg/obitools/obik/lowmask.go
@@ -0,0 +1,419 @@
+package obik
+
+import (
+	"context"
+	"fmt"
+	"math"
+
+	log "github.com/sirupsen/logrus"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+// lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
+//
+// Parameters:
+//   - kmer_size: length of the window over which word entropy is computed.
+//   - level_max: largest word size considered; word sizes level_max down to 1
+//     are all evaluated and the lowest normalized entropy is kept per position.
+//   - threshold: positions whose entropy is <= threshold are flagged.
+//   - mode: MaskMode replaces flagged positions with maskChar, SplitMode
+//     returns the unflagged fragments, ExtractMode returns the flagged ones.
+//   - maskChar: replacement byte used in MaskMode.
+//   - keepShorter: when false, fragments shorter than kmer_size are dropped
+//     in SplitMode and ExtractMode.
+//
+// The returned worker sets the "mask" and "Entropies" attributes on the input
+// sequence, or "obilowmask_error" when the sequence is shorter than kmer_size.
+// Nucleotides are compared against lowercase 'a', 'c', 'g', 't' only; any
+// other byte is handled as an ambiguity.
+func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte, keepShorter bool) obiseq.SeqWorker {
+
+	// nLogN[i] caches i*ln(i) for i in 1..kmer_size (nLogN[0] stays 0).
+	nLogN := make([]float64, kmer_size+1)
+	for i := 1; i <= kmer_size; i++ {
+		nLogN[i] = float64(i) * math.Log(float64(i))
+	}
+
+	// normTables[ws][code] holds obikmer.NormalizeCircular(code, ws) for every
+	// word code of size ws (2 bits per nucleotide, hence 4^ws entries).
+	normTables := make([][]int, level_max+1)
+	for ws := 1; ws <= level_max; ws++ {
+		size := 1 << (ws * 2)
+		normTables[ws] = make([]int, size)
+		for code := 0; code < size; code++ {
+			normTables[ws][code] = int(obikmer.NormalizeCircular(uint64(code), ws))
+		}
+	}
+
+	// pair is a deque entry of slidingMin: a value and its position in data.
+	type pair struct {
+		index int
+		value float64
+	}
+
+	// slidingMin replaces, in place, data[i] by the minimum of the (at most)
+	// `window` values ending at position i. When the window covers the whole
+	// slice every element becomes the global minimum.
+	slidingMin := func(data []float64, window int) {
+		if len(data) == 0 || window <= 0 {
+			return
+		}
+		if window >= len(data) {
+			minVal := data[0]
+			for i := 1; i < len(data); i++ {
+				if data[i] < minVal {
+					minVal = data[i]
+				}
+			}
+			for i := range data {
+				data[i] = minVal
+			}
+			return
+		}
+
+		// Deque of candidates with increasing values; its front is the
+		// minimum of the current window.
+		deque := make([]pair, 0, window)
+
+		for i, v := range data {
+			// Drop candidates that left the window.
+			for len(deque) > 0 && deque[0].index <= i-window {
+				deque = deque[1:]
+			}
+
+			// Drop candidates that can no longer be the minimum.
+			for len(deque) > 0 && deque[len(deque)-1].value >= v {
+				deque = deque[:len(deque)-1]
+			}
+
+			deque = append(deque, pair{index: i, value: v})
+
+			data[i] = deque[0].value
+		}
+	}
+
+	// For each word size ws:
+	//   logNwords[ws]  = ln(nw), nw being the number of words of size ws in a
+	//                    window of kmer_size nucleotides;
+	//   emaxValues[ws] = entropy of nw words spread as evenly as possible over
+	//                    na = CanonicalCircularKmerCount(ws) classes, used as
+	//                    the normalization divisor in computeEntropies.
+	emaxValues := make([]float64, level_max+1)
+	logNwords := make([]float64, level_max+1)
+	for ws := 1; ws <= level_max; ws++ {
+		nw := kmer_size - ws + 1
+		na := obikmer.CanonicalCircularKmerCount(ws)
+		if nw < na {
+			// Fewer words than classes: every word can be distinct.
+			logNwords[ws] = math.Log(float64(nw))
+			emaxValues[ws] = math.Log(float64(nw))
+		} else {
+			// `remains` classes hold cov+1 words, the others hold cov words.
+			cov := nw / na
+			remains := nw - (na * cov)
+			f1 := float64(cov) / float64(nw)
+			f2 := float64(cov+1) / float64(nw)
+			logNwords[ws] = math.Log(float64(nw))
+			emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
+				float64(remains)*f2*math.Log(f2))
+		}
+	}
+
+	// maskAmbiguities returns a slice as long as sequence holding -1 at every
+	// position j in [i-kmer_size+1, i] (clamped at 0) for each byte i that is
+	// not 'a', 'c', 'g' or 't', and 0 elsewhere. A -1 therefore marks a window
+	// start whose kmer_size-long window contains an ambiguous byte.
+	maskAmbiguities := func(sequence []byte) []int {
+		maskPositions := make([]int, len(sequence))
+		for i, nuc := range sequence {
+			if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
+				end := max(0, i-kmer_size+1)
+				for j := i; j >= end; j-- {
+					maskPositions[j] = -1
+				}
+			}
+		}
+		return maskPositions
+	}
+
+	// cleanTable zeroes the first `over` entries of table.
+	cleanTable := func(table []int, over int) {
+		for i := 0; i < over; i++ {
+			table[i] = 0
+		}
+	}
+
+	// computeEntropies fills entropies with the normalized entropy of words of
+	// size wordSize, computed over every window of kmer_size nucleotides and
+	// stored at the window start, then spreads each value with a sliding
+	// minimum of width kmer_size.
+	//
+	// table (word counts) and words (normalized word codes) are scratch
+	// buffers supplied by the caller; normTable is normTables[wordSize].
+	computeEntropies := func(sequence []byte,
+		maskPositions []int,
+		entropies []float64,
+		table []int,
+		words []int,
+		wordSize int,
+		normTable []int) {
+
+		lseq := len(sequence)
+		tableSize := 1 << (wordSize * 2)
+		nwords := kmer_size - wordSize + 1
+		float_nwords := float64(nwords)
+		log_nwords := logNwords[wordSize]
+		entropyMax := emaxValues[wordSize]
+
+		cleanTable(table, tableSize)
+
+		// Sentinel value for positions that receive no entropy below.
+		// Index 0 is not reset by this loop.
+		for i := 1; i < lseq; i++ {
+			entropies[i] = 6
+		}
+		end := lseq - wordSize + 1
+
+		mask := (1 << (wordSize * 2)) - 1
+
+		// Pre-load the first wordSize-1 nucleotides of the rolling word code.
+		word_index := 0
+		for i := 0; i < wordSize-1; i++ {
+			word_index = (word_index << 2) + int(obikmer.EncodeNucleotide(sequence[i]))
+		}
+
+		// words[i] is the normalized code of the word starting at position i.
+		for i, j := 0, wordSize-1; i < end; i, j = i+1, j+1 {
+			word_index = ((word_index << 2) & mask) + int(obikmer.EncodeNucleotide(sequence[j]))
+			words[i] = normTable[word_index]
+		}
+
+		// s counts the words accumulated in table since the last reset;
+		// sum_n_logn is the running sum of n*ln(n) over the word counts.
+		s := 0
+		sum_n_logn := 0.0
+		entropy := 1.0
+		cleaned := true
+
+		for i := range end {
+			s++
+
+			switch {
+			case s < nwords:
+				// Window not full yet: only count the word.
+				cleaned = false
+				table[words[i]]++
+
+			case i >= (nwords-1) && maskPositions[i-nwords+1] < 0:
+				// The window starting at i-nwords+1 is flagged by
+				// maskPositions: store 4.0 and restart the accumulation.
+				entropies[i-nwords+1] = 4.0
+				if !cleaned {
+					cleanTable(table, tableSize)
+				}
+				cleaned = true
+				s = 0
+				sum_n_logn = 0.0
+
+			case s == nwords:
+				// First full window: compute the sum from the whole table.
+				cleaned = false
+				table[words[i]]++
+
+				sum_n_logn = 0
+				for j := range tableSize {
+					n := float64(table[j])
+					if n > 0 {
+						sum_n_logn += nLogN[int(n)]
+					}
+				}
+				entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
+
+			case s > nwords:
+				// Sliding window: one word enters, one leaves; update the
+				// sum incrementally from the two affected counts.
+				cleaned = false
+
+				new_word := words[i]
+				old_word := words[i-nwords]
+
+				if old_word != new_word {
+					table[new_word]++
+					table[old_word]--
+
+					n_old := float64(table[old_word])
+					n_new := float64(table[new_word])
+
+					sum_n_logn -= nLogN[int(n_old+1)]
+					if n_old > 0 {
+						sum_n_logn += nLogN[int(n_old)]
+					}
+					if n_new > 0 {
+						sum_n_logn += nLogN[int(n_new)]
+					}
+					if n_new > 1 {
+						sum_n_logn -= nLogN[int(n_new-1)]
+					}
+				}
+
+				entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
+			}
+
+			// Store the entropy at the start of the current window, clamped
+			// at 0 and rounded to 4 decimals.
+			if s >= nwords && maskPositions[i-nwords+1] >= 0 {
+				if entropy < 0 {
+					entropy = 0
+				}
+				entropy = math.Round(entropy*10000) / 10000
+				entropies[i-nwords+1] = entropy
+			}
+		}
+
+		slidingMin(entropies, kmer_size)
+	}
+
+	// applyMaskMode returns a copy of sequence in which every position flagged
+	// in maskPositions is replaced by the mask byte.
+	applyMaskMode := func(sequence *obiseq.BioSequence, maskPositions []bool, mask byte) (obiseq.BioSequenceSlice, error) {
+		seqCopy := sequence.Copy()
+		sequenceBytes := seqCopy.Sequence()
+
+		for i := range sequenceBytes {
+			if maskPositions[i] {
+				sequenceBytes[i] = mask
+			}
+		}
+
+		return obiseq.BioSequenceSlice{seqCopy}, nil
+	}
+
+	// selectMasked returns one subsequence per maximal run of flagged
+	// positions. Runs shorter than kmer_size are dropped unless keepShorter
+	// is set.
+	selectMasked := func(sequence *obiseq.BioSequence, maskPosition []bool) (obiseq.BioSequenceSlice, error) {
+		rep := obiseq.NewBioSequenceSlice()
+
+		inlow := false
+		fromlow := -1
+		for i, masked := range maskPosition {
+			if masked && !inlow {
+				fromlow = i
+				inlow = true
+			}
+			if inlow && !masked {
+				if fromlow >= 0 {
+					frgLen := i - fromlow
+					if keepShorter || frgLen >= kmer_size {
+						frg, err := sequence.Subsequence(fromlow, i, false)
+						if err != nil {
+							return nil, err
+						}
+						rep.Push(frg)
+					}
+				}
+				inlow = false
+				fromlow = -1
+			}
+		}
+
+		// Run still open at the end of the sequence.
+		if inlow && fromlow >= 0 {
+			frgLen := len(maskPosition) - fromlow
+			if keepShorter || frgLen >= kmer_size {
+				frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
+				if err != nil {
+					return nil, err
+				}
+				rep.Push(frg)
+			}
+		}
+
+		return *rep, nil
+	}
+
+	// selectunmasked is the counterpart of selectMasked: it returns one
+	// subsequence per maximal run of unflagged positions, with the same
+	// length rule.
+	selectunmasked := func(sequence *obiseq.BioSequence, maskPosition []bool) (obiseq.BioSequenceSlice, error) {
+		rep := obiseq.NewBioSequenceSlice()
+
+		inhigh := false
+		fromhigh := -1
+		for i, masked := range maskPosition {
+			if !masked && !inhigh {
+				fromhigh = i
+				inhigh = true
+			}
+			if inhigh && masked {
+				if fromhigh >= 0 {
+					frgLen := i - fromhigh
+					if keepShorter || frgLen >= kmer_size {
+						frg, err := sequence.Subsequence(fromhigh, i, false)
+						if err != nil {
+							return nil, err
+						}
+						rep.Push(frg)
+					}
+				}
+				inhigh = false
+				fromhigh = -1
+			}
+		}
+
+		// Run still open at the end of the sequence.
+		if inhigh && fromhigh >= 0 {
+			frgLen := len(maskPosition) - fromhigh
+			if keepShorter || frgLen >= kmer_size {
+				frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
+				if err != nil {
+					return nil, err
+				}
+				rep.Push(frg)
+			}
+		}
+
+		return *rep, nil
+	}
+
+	// masking is the returned worker: it computes per-position entropies,
+	// flags the positions at or below threshold and applies the selected mode.
+	masking := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
+		if sequence.Len() < kmer_size {
+			// Too short for a single window: flag every position.
+			sequence.SetAttribute("obilowmask_error", "Sequence too short")
+			remove := make([]bool, sequence.Len())
+			for i := range remove {
+				remove[i] = true
+			}
+			switch mode {
+			case MaskMode:
+				return applyMaskMode(sequence, remove, maskChar)
+			case SplitMode:
+				return selectunmasked(sequence, remove)
+			case ExtractMode:
+				return selectMasked(sequence, remove)
+			}
+			return nil, fmt.Errorf("unknown mode %d", mode)
+		}
+
+		bseq := sequence.Sequence()
+
+		maskPositions := maskAmbiguities(bseq)
+
+		// maskFlags[i] records the word size that produced entropies[i].
+		maskFlags := make([]int, len(bseq))
+		entropies := make([]float64, len(bseq))
+		for i := range entropies {
+			entropies[i] = 4.0
+		}
+
+		// Scratch buffers shared by all word sizes; freqs is sized for the
+		// largest one.
+		freqs := make([]int, 1<<(2*level_max))
+		words := make([]int, len(bseq))
+		entropies2 := make([]float64, len(bseq))
+
+		computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max, normTables[level_max])
+
+		for i := range bseq {
+			v := level_max
+			maskFlags[i] = v
+		}
+
+		// Smaller word sizes: keep, per position, the lowest entropy seen.
+		for ws := level_max - 1; ws > 0; ws-- {
+			computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws, normTables[ws])
+			for i, e2 := range entropies2 {
+				if e2 < entropies[i] {
+					entropies[i] = e2
+					maskFlags[i] = ws
+				}
+			}
+		}
+
+		// Ambiguous positions get entropy 0.
+		for i, nuc := range bseq {
+			if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
+				entropies[i] = 0
+			}
+		}
+
+		remove := make([]bool, len(entropies))
+		for i, e := range entropies {
+			remove[i] = e <= threshold
+		}
+
+		sequence.SetAttribute("mask", maskFlags)
+		sequence.SetAttribute("Entropies", entropies)
+
+		switch mode {
+		case MaskMode:
+			return applyMaskMode(sequence, remove, maskChar)
+		case SplitMode:
+			return selectunmasked(sequence, remove)
+		case ExtractMode:
+			return selectMasked(sequence, remove)
+		}
+		return nil, fmt.Errorf("unknown mode %d", mode)
+	}
+
+	return masking
+}
+
+// runLowmask implements the "obik lowmask" subcommand.
+//
+// It reads the sequences named by args, runs the lowMaskWorker built from
+// the CLI settings over them with obidefault.ParallelWorkers() workers,
+// drops empty results and writes what remains. ctx and opt are not used by
+// this function. The only error returned is a failure to open the input.
+func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
+	k := CLIKmerSize()
+	wordMax := CLIEntropySize()
+	cutoff := CLIEntropyThreshold()
+	maskMode := CLIMaskingMode()
+	fill := CLIMaskingChar()
+
+	log.Printf("Low-complexity masking: kmer-size=%d, entropy-size=%d, threshold=%.4f", k, wordMax, cutoff)
+
+	input, err := obiconvert.CLIReadBioSequences(args...)
+	if err != nil {
+		return fmt.Errorf("failed to open sequence files: %w", err)
+	}
+
+	masker := lowMaskWorker(k, wordMax, cutoff, maskMode, fill, CLIKeepShorter())
+
+	// Apply the worker in parallel and discard batches left empty.
+	output := input.MakeIWorker(masker, false, obidefault.ParallelWorkers()).FilterEmpty()
+
+	obiconvert.CLIWriteBioSequences(output, true)
+	obiutils.WaitForLastPipe()
+
+	return nil
+}
--- a/pkg/obitools/obik/ls.go
+++ b/pkg/obitools/obik/ls.go
@@ -0,0 +1,96 @@
+package obik
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
+	"github.com/DavidGamba/go-getoptions"
+	"gopkg.in/yaml.v3"
+)
+
+// setEntry is one row of the "obik ls" listing: a set of the kmer index
+// with its position, its identifier and its k-mer count. The struct tags
+// give the field names used in the JSON and YAML outputs.
+type setEntry struct {
+	Index int    `json:"index" yaml:"index"`
+	ID    string `json:"id" yaml:"id"`
+	Count uint64 `json:"count" yaml:"count"`
+}
+
+// runLs implements the "obik ls" subcommand: it lists the sets of the kmer
+// index stored in args[0] on standard output.
+//
+// Only the sets matching the patterns from CLISetPatterns() are listed, or
+// all sets when no pattern is given. The output format is selected by
+// CLIOutFormat(): "json", "yaml", and CSV for "csv" or any other value.
+// ctx and opt are not used by this function.
+func runLs(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
+	if len(args) < 1 {
+		return fmt.Errorf("usage: obik ls [options] <index_directory>")
+	}
+
+	group, err := obikmer.OpenKmerSetGroup(args[0])
+	if err != nil {
+		return fmt.Errorf("failed to open kmer index: %w", err)
+	}
+
+	// Determine which sets to show
+	var shown []int
+	if patterns := CLISetPatterns(); len(patterns) == 0 {
+		shown = make([]int, group.Size())
+		for pos := range shown {
+			shown[pos] = pos
+		}
+	} else {
+		shown, err = group.MatchSetIDs(patterns)
+		if err != nil {
+			return err
+		}
+	}
+
+	rows := make([]setEntry, 0, len(shown))
+	for _, setIdx := range shown {
+		rows = append(rows, setEntry{
+			Index: setIdx,
+			ID:    group.SetIDOf(setIdx),
+			Count: group.Len(setIdx),
+		})
+	}
+
+	switch CLIOutFormat() {
+	case "json":
+		return outputLsJSON(rows)
+	case "yaml":
+		return outputLsYAML(rows)
+	default:
+		// "csv" and every unrecognized format
+		return outputLsCSV(rows)
+	}
+}
+
+// outputLsCSV prints entries on standard output as CSV, preceded by the
+// header line "index,id,count".
+//
+// An ID is enclosed in double quotes, with embedded double quotes doubled,
+// when it contains a comma, a double quote, a carriage return or a line
+// feed; otherwise a line break inside an ID would split the record in two.
+// The function always returns nil.
+func outputLsCSV(entries []setEntry) error {
+	fmt.Println("index,id,count")
+	for _, e := range entries {
+		// Quote the ID when it holds a character that is special in CSV.
+		id := e.ID
+		if strings.ContainsAny(id, ",\"\r\n") {
+			id = "\"" + strings.ReplaceAll(id, "\"", "\"\"") + "\""
+		}
+		fmt.Printf("%d,%s,%d\n", e.Index, id, e.Count)
+	}
+	return nil
+}
+
+// outputLsJSON prints entries on standard output as a JSON array indented
+// with two spaces, followed by a newline. It returns the marshalling error,
+// if any, in which case nothing is printed.
+func outputLsJSON(entries []setEntry) error {
+	encoded, err := json.MarshalIndent(entries, "", "  ")
+	if err == nil {
+		fmt.Println(string(encoded))
+	}
+	return err
+}
+
+// outputLsYAML prints entries on standard output in YAML, exactly as
+// produced by yaml.Marshal. It returns the marshalling error, if any, in
+// which case nothing is printed.
+func outputLsYAML(entries []setEntry) error {
+	encoded, err := yaml.Marshal(entries)
+	if err == nil {
+		fmt.Print(string(encoded))
+	}
+	return err
+}
--- a/pkg/obitools/obik/match.go
+++ b/pkg/obitools/obik/match.go
@@ -0,0 +1,221 @@
+package obik
+
+import (
+	"context"
+	"fmt"
+	"sync"
+
+	log "github.com/sirupsen/logrus"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+// defaultMatchQueryThreshold is the minimum number of k-mer entries to
+// accumulate before launching a MatchBatch. Larger values amortize the
+// cost of opening .kdi files across more query k-mers.
+// runMatch compares it with the NKmers field of the merged queries.
+const defaultMatchQueryThreshold = 10_000_000
+
+// preparedBatch pairs a batch with its pre-computed queries.
+// It is the message sent from the query preparation stage of runMatch to
+// the accumulation stage.
+type preparedBatch struct {
+	// batch is the input batch as read from the iterator.
+	batch   obiiter.BioSequenceBatch
+	// seqs lists the sequences of batch, in batch order.
+	seqs    []*obiseq.BioSequence
+	// queries is the result of PrepareQueries on seqs.
+	queries *obikmer.PreparedQueries
+}
+
+// accumulatedWork holds multiple prepared batches whose queries have been
+// merged into a single PreparedQueries. The flat seqs slice allows
+// MatchBatch results (indexed by merged SeqIdx) to be mapped back to
+// the original sequences.
+// It is the message sent from the accumulation stage of runMatch to the
+// matching stage.
+type accumulatedWork struct {
+	batches []obiiter.BioSequenceBatch // original batches in order
+	seqs    []*obiseq.BioSequence      // flat: seqs from all batches concatenated
+	queries *obikmer.PreparedQueries   // merged queries with rebased SeqIdx
+}
+
+// runMatch implements the "obik match" subcommand.
+//
+// It annotates every input sequence with the result of matching its k-mers
+// against the selected sets of the index found in CLIIndexDirectory(). For
+// each set, sequences with at least one match receive the attribute
+// "kmer_matched_<setID>". ctx and opt are not used by this function.
+//
+// Pipeline architecture (no shared mutable state between stages):
+//
+//	[input batches]
+//	     │  Split across nCPU goroutines
+//	     ▼
+//	PrepareQueries (CPU, parallel)
+//	     │  preparedCh
+//	     ▼
+//	Accumulate & MergeQueries (1 goroutine)
+//	     │  matchCh — fires when totalKmers >= threshold
+//	     ▼
+//	MatchBatch + annotate (1 goroutine, internal parallelism per partition)
+//	     │
+//	     ▼
+//	[output batches]
+//
+// An error is returned when the index or the sequence files cannot be
+// opened, or when the set patterns are invalid or match nothing.
+func runMatch(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
+	indexDir := CLIIndexDirectory()
+
+	// Open the k-mer index
+	ksg, err := obikmer.OpenKmerSetGroup(indexDir)
+	if err != nil {
+		return fmt.Errorf("failed to open kmer index: %w", err)
+	}
+
+	log.Infof("Opened index: k=%d, m=%d, %d partitions, %d set(s)",
+		ksg.K(), ksg.M(), ksg.Partitions(), ksg.Size())
+
+	// Resolve which sets to match against (default: all sets)
+	patterns := CLISetPatterns()
+	var setIndices []int
+	if len(patterns) > 0 {
+		setIndices, err = ksg.MatchSetIDs(patterns)
+		if err != nil {
+			return fmt.Errorf("failed to match set patterns: %w", err)
+		}
+		if len(setIndices) == 0 {
+			return fmt.Errorf("no sets match the given patterns")
+		}
+	} else {
+		setIndices = make([]int, ksg.Size())
+		for i := range setIndices {
+			setIndices[i] = i
+		}
+	}
+
+	// Log the selected sets; unnamed sets are shown as "set_<index>".
+	for _, idx := range setIndices {
+		id := ksg.SetIDOf(idx)
+		if id == "" {
+			id = fmt.Sprintf("set_%d", idx)
+		}
+		log.Infof("Matching against set %d (%s): %d k-mers", idx, id, ksg.Len(idx))
+	}
+
+	// Read input sequences
+	sequences, err := obiconvert.CLIReadBioSequences(args...)
+	if err != nil {
+		return fmt.Errorf("failed to open sequence files: %w", err)
+	}
+
+	nworkers := obidefault.ParallelWorkers()
+
+	// --- Stage 1: Prepare queries in parallel ---
+	preparedCh := make(chan preparedBatch, nworkers)
+
+	var prepWg sync.WaitGroup
+	// preparer drains one iterator and sends one preparedBatch per batch.
+	preparer := func(iter obiiter.IBioSequence) {
+		defer prepWg.Done()
+		for iter.Next() {
+			batch := iter.Get()
+			slice := batch.Slice()
+
+			// Element-wise copy of the batch content into a plain slice.
+			seqs := make([]*obiseq.BioSequence, len(slice))
+			for i, s := range slice {
+				seqs[i] = s
+			}
+
+			pq := ksg.PrepareQueries(seqs)
+
+			preparedCh <- preparedBatch{
+				batch:   batch,
+				seqs:    seqs,
+				queries: pq,
+			}
+		}
+	}
+
+	// nworkers-1 preparers run on Split() iterators and one more on the
+	// original iterator, so at least one preparer always runs.
+	for i := 1; i < nworkers; i++ {
+		prepWg.Add(1)
+		go preparer(sequences.Split())
+	}
+	prepWg.Add(1)
+	go preparer(sequences)
+
+	// Close preparedCh once every preparer is done so stage 2 terminates.
+	go func() {
+		prepWg.Wait()
+		close(preparedCh)
+	}()
+
+	// --- Stage 2: Accumulate & merge queries ---
+	matchCh := make(chan *accumulatedWork, 2)
+
+	go func() {
+		defer close(matchCh)
+
+		// acc is nil between two flushes.
+		var acc *accumulatedWork
+
+		for pb := range preparedCh {
+			if acc == nil {
+				// First batch of a new accumulation: adopt its slices as is.
+				acc = &accumulatedWork{
+					batches: []obiiter.BioSequenceBatch{pb.batch},
+					seqs:    pb.seqs,
+					queries: pb.queries,
+				}
+			} else {
+				// Merge this batch's queries into the accumulator
+				obikmer.MergeQueries(acc.queries, pb.queries)
+				acc.batches = append(acc.batches, pb.batch)
+				acc.seqs = append(acc.seqs, pb.seqs...)
+			}
+
+			// Flush when we exceed the threshold
+			if acc.queries.NKmers >= defaultMatchQueryThreshold {
+				matchCh <- acc
+				acc = nil
+			}
+		}
+
+		// Flush remaining
+		if acc != nil {
+			matchCh <- acc
+		}
+	}()
+
+	// --- Stage 3: Match & annotate ---
+	output := obiiter.MakeIBioSequence()
+	if sequences.IsPaired() {
+		output.MarkAsPaired()
+	}
+
+	output.Add(1)
+	go func() {
+		defer output.Done()
+
+		for work := range matchCh {
+			// Match against each selected set
+			for _, setIdx := range setIndices {
+				result := ksg.MatchBatch(setIdx, work.queries)
+
+				setID := ksg.SetIDOf(setIdx)
+				if setID == "" {
+					setID = fmt.Sprintf("set_%d", setIdx)
+				}
+				attrName := "kmer_matched_" + setID
+
+				// Only sequences with at least one match are annotated.
+				for seqIdx, positions := range result {
+					if len(positions) > 0 {
+						work.seqs[seqIdx].SetAttribute(attrName, positions)
+					}
+				}
+			}
+
+			// Push annotated batches to output
+			for _, b := range work.batches {
+				output.Push(b)
+			}
+
+			// Help GC
+			work.seqs = nil
+			work.queries = nil
+		}
+	}()
+
+	go output.WaitAndClose()
+
+	obiconvert.CLIWriteBioSequences(output, true)
+	obiutils.WaitForLastPipe()
+
+	return nil
+}
--- a/pkg/obitools/obik/mv.go
+++ b/pkg/obitools/obik/mv.go
@@ -0,0 +1,63 @@
+package obik
+
+import (
+	"context"
+	"fmt"
+
+	log "github.com/sirupsen/logrus"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+// runMv implements the "mv" subcommand: it copies the selected sets from the
+// source index (args[0]) into the destination index (args[1]) and then removes
+// them from the source.
+//
+// When --set patterns were given, only the matching sets are moved; otherwise
+// every set of the source index is moved. ctx and opt are not used here.
+// It returns an error when fewer than two positional arguments are given, when
+// the source index cannot be opened, when the patterns match nothing, or when
+// the copy or one of the removals fails.
+func runMv(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
+	if len(args) < 2 {
+		return fmt.Errorf("usage: obik mv [--set PATTERN]... [--force] <source_index> <dest_index>")
+	}
+
+	source, target := args[0], args[1]
+
+	group, err := obikmer.OpenKmerSetGroup(source)
+	if err != nil {
+		return fmt.Errorf("failed to open source kmer index: %w", err)
+	}
+
+	// Work out which set IDs have to be moved.
+	var setIDs []string
+	if selectors := CLISetPatterns(); len(selectors) == 0 {
+		// No selection on the command line: move every set.
+		setIDs = group.SetsIDs()
+	} else {
+		matched, err := group.MatchSetIDs(selectors)
+		if err != nil {
+			return err
+		}
+		if len(matched) == 0 {
+			return fmt.Errorf("no sets match the given patterns")
+		}
+		setIDs = make([]string, 0, len(matched))
+		for _, setIdx := range matched {
+			setIDs = append(setIDs, group.SetIDOf(setIdx))
+		}
+	}
+
+	log.Infof("Moving %d set(s) from %s to %s", len(setIDs), source, target)
+
+	// Copy first: a copy error returns before anything is removed from the source.
+	moved, err := group.CopySetsByIDTo(setIDs, target, CLIForce())
+	if err != nil {
+		return err
+	}
+
+	// Drop the copied sets from the source, last one first.
+	for n := len(setIDs); n > 0; n-- {
+		id := setIDs[n-1]
+		if err := group.RemoveSetByID(id); err != nil {
+			return fmt.Errorf("failed to remove set %q from source after copy: %w", id, err)
+		}
+	}
+
+	log.Infof("Destination now has %d set(s), source has %d set(s)", moved.Size(), group.Size())
+	return nil
+}
--- a/pkg/obitools/obik/obik.go
+++ b/pkg/obitools/obik/obik.go
@@ -0,0 +1,85 @@
+package obik
+
+import (
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+// OptionSet registers all obik subcommands on the root GetOpt.
+//
+// Each subcommand is created with opt.NewCommand, receives its option sets and
+// is bound to its run* handler. Commands are registered in this order:
+// index, ls, summary, cp, mv, rm, spectrum, super, lowmask, match, filter.
+func OptionSet(opt *getoptions.GetOpt) {
+	// index — build or extend a kmer index from sequence files.
+	cmd := opt.NewCommand("index", "Build a disk-based kmer index from sequence files")
+	obiconvert.InputOptionSet(cmd)
+	obiconvert.OutputModeOptionSet(cmd, false)
+	KmerIndexOptionSet(cmd)
+	cmd.StringMapVar(&_setMetaTags, "tag", 1, 1,
+		cmd.Alias("T"),
+		cmd.ArgName("KEY=VALUE"),
+		cmd.Description("Per-set metadata tag (repeatable)."))
+	cmd.SetCommandFn(runIndex)
+
+	// ls — list the sets of a kmer index.
+	cmd = opt.NewCommand("ls", "List sets in a kmer index")
+	OutputFormatOptionSet(cmd)
+	SetSelectionOptionSet(cmd)
+	cmd.SetCommandFn(runLs)
+
+	// summary — detailed statistics, optionally with a Jaccard matrix.
+	cmd = opt.NewCommand("summary", "Show detailed statistics of a kmer index")
+	OutputFormatOptionSet(cmd)
+	cmd.BoolVar(&_jaccard, "jaccard", false,
+		cmd.Description("Compute and display pairwise Jaccard distance matrix."))
+	cmd.SetCommandFn(runSummary)
+
+	// cp — copy sets between indices.
+	cmd = opt.NewCommand("cp", "Copy sets between kmer indices")
+	SetSelectionOptionSet(cmd)
+	ForceOptionSet(cmd)
+	cmd.SetCommandFn(runCp)
+
+	// mv — move sets between indices.
+	cmd = opt.NewCommand("mv", "Move sets between kmer indices")
+	SetSelectionOptionSet(cmd)
+	ForceOptionSet(cmd)
+	cmd.SetCommandFn(runMv)
+
+	// rm — remove sets from an index.
+	cmd = opt.NewCommand("rm", "Remove sets from a kmer index")
+	SetSelectionOptionSet(cmd)
+	cmd.SetCommandFn(runRm)
+
+	// spectrum — k-mer frequency spectrum as CSV.
+	cmd = opt.NewCommand("spectrum", "Output k-mer frequency spectrum as CSV")
+	SetSelectionOptionSet(cmd)
+	obiconvert.OutputModeOptionSet(cmd, false)
+	cmd.SetCommandFn(runSpectrum)
+
+	// super — super k-mer extraction from sequences.
+	cmd = opt.NewCommand("super", "Extract super k-mers from sequence files")
+	obiconvert.InputOptionSet(cmd)
+	obiconvert.OutputOptionSet(cmd)
+	SuperKmerOptionSet(cmd)
+	cmd.SetCommandFn(runSuper)
+
+	// lowmask — low-complexity masking.
+	cmd = opt.NewCommand("lowmask", "Mask low-complexity regions in sequences using entropy")
+	obiconvert.InputOptionSet(cmd)
+	obiconvert.OutputOptionSet(cmd)
+	LowMaskOptionSet(cmd)
+	cmd.SetCommandFn(runLowmask)
+
+	// match — annotate sequences with k-mer match positions from an index.
+	cmd = opt.NewCommand("match", "Annotate sequences with k-mer match positions from an index")
+	IndexDirectoryOptionSet(cmd)
+	obiconvert.InputOptionSet(cmd)
+	obiconvert.OutputOptionSet(cmd)
+	SetSelectionOptionSet(cmd)
+	cmd.SetCommandFn(runMatch)
+
+	// filter — remove low-complexity k-mers from an existing index.
+	cmd = opt.NewCommand("filter", "Filter a kmer index to remove low-complexity k-mers")
+	obiconvert.OutputModeOptionSet(cmd, false)
+	EntropyFilterOptionSet(cmd)
+	SetSelectionOptionSet(cmd)
+	cmd.SetCommandFn(runFilter)
+}
--- a/pkg/obitools/obik/options.go
+++ b/pkg/obitools/obik/options.go
@@ -0,0 +1,360 @@
+package obik
+
+import (
+	"strings"
+
+	log "github.com/sirupsen/logrus"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+// MaskingMode defines how to handle low-complexity regions.
+// The mode in effect is derived from the CLI flags by CLIMaskingMode.
+type MaskingMode int
+
+// Available masking modes. MaskMode is the zero value and is what
+// CLIMaskingMode returns when neither extraction flag is set.
+const (
+	MaskMode    MaskingMode = iota // Replace low-complexity regions with masked characters
+	SplitMode                      // Split sequence into high-complexity fragments
+	ExtractMode                    // Extract low-complexity fragments
+)
+
+// Package-level storage for flags shared by several subcommands.
+var (
+	// Output format selectors (--json-output, --csv-output, --yaml-output).
+	_jsonOutput bool
+	_csvOutput  bool
+	_yamlOutput bool
+
+	// _setPatterns holds the --set selection patterns.
+	_setPatterns []string
+
+	// _force stores the --force flag.
+	_force bool
+
+	// _jaccard stores the --jaccard flag of the summary subcommand.
+	_jaccard bool
+
+	// _setMetaTags holds the per-set tags of the index subcommand (--tag).
+	_setMetaTags = make(map[string]string)
+)
+
+// ==============================
+// Shared kmer options (used by index, super, lowmask)
+// ==============================
+
+var (
+	// _kmerSize is the k-mer size bound to --kmer-size; it defaults to 31.
+	_kmerSize = 31
+	// _minimizerSize is bound to --minimizer-size.
+	// -1 means auto: ceil(k / 2.5)
+	_minimizerSize = -1
+)
+
+// KmerSizeOptionSet registers --kmer-size / -k.
+// Shared by index, super, and lowmask subcommands.
+func KmerSizeOptionSet(options *getoptions.GetOpt) {
+	options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
+		options.Alias("k"),
+		options.Description("Size of k-mers (must be between 2 and 31)."))
+}
+
+// MinimizerOptionSet registers --minimizer-size / -m.
+// Shared by index and super subcommands.
+func MinimizerOptionSet(options *getoptions.GetOpt) {
+	options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize,
+		options.Alias("m"),
+		options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5))."))
+}
+
+// ==============================
+// Lowmask-specific options
+// ==============================
+
+// Storage and defaults for the lowmask options registered by LowMaskOptionSet.
+var (
+	_entropySize      = 6     // --entropy-size
+	_entropyThreshold = 0.5   // --threshold
+	_splitMode        = false // --extract-high
+	_extractMode      = false // --extract-low
+	_maskingChar      = "."   // --masking-char
+	_keepShorter      = false // --keep-shorter
+)
+
+// LowMaskOptionSet registers the options of the lowmask subcommand:
+// the shared --kmer-size option followed by the entropy, extraction,
+// masking character and short-fragment options.
+func LowMaskOptionSet(options *getoptions.GetOpt) {
+	KmerSizeOptionSet(options)
+
+	// Entropy estimation parameters.
+	options.IntVar(
+		&_entropySize, "entropy-size", _entropySize,
+		options.Description("Maximum word size considered for entropy estimate."),
+	)
+	options.Float64Var(
+		&_entropyThreshold, "threshold", _entropyThreshold,
+		options.Description("Entropy threshold below which a kmer is masked (0 to 1)."),
+	)
+
+	// Output mode: --extract-high feeds _splitMode, --extract-low feeds _extractMode.
+	options.BoolVar(
+		&_splitMode, "extract-high", _splitMode,
+		options.Description("Extract only high-complexity regions."),
+	)
+	options.BoolVar(
+		&_extractMode, "extract-low", _extractMode,
+		options.Description("Extract only low-complexity regions."),
+	)
+
+	// Masking character and handling of short fragments.
+	options.StringVar(
+		&_maskingChar, "masking-char", _maskingChar,
+		options.Description("Character used to mask low complexity regions."),
+	)
+	options.BoolVar(
+		&_keepShorter, "keep-shorter", _keepShorter,
+		options.Description("Keep fragments shorter than kmer-size in split/extract mode."),
+	)
+}
+
+// ==============================
+// Index-specific options
+// ==============================
+
+// Storage and defaults for the index-building options registered by
+// KmerIndexOptionSet (the entropy pair is also bound by EntropyFilterOptionSet).
+var (
+	_indexId               = ""                      // --index-id
+	_metadataFormat        = "toml"                  // --metadata-format
+	_setTag                = make(map[string]string) // --set-tag
+	_minOccurrence         = 1                       // --min-occurrence
+	_maxOccurrence         = 0                       // --max-occurrence
+	_saveFullFilter        = false                   // --save-full-filter
+	_saveFreqKmer          = 0                       // --save-freq-kmer
+	_indexEntropyThreshold = 0.0                     // --entropy-filter
+	_indexEntropySize      = 6                       // --entropy-filter-size
+)
+
+// KmerIndexOptionSet defines every option related to kmer index building:
+// the shared k-mer and minimizer sizes, index identity and metadata,
+// occurrence thresholds, and the entropy filter.
+func KmerIndexOptionSet(options *getoptions.GetOpt) {
+	KmerSizeOptionSet(options)
+	MinimizerOptionSet(options)
+
+	// Index identity and metadata.
+	options.StringVar(
+		&_indexId, "index-id", _indexId,
+		options.Description("Identifier for the kmer index."),
+	)
+	options.StringVar(
+		&_metadataFormat, "metadata-format", _metadataFormat,
+		options.Description("Format for metadata file (toml, yaml, json)."),
+	)
+	options.StringMapVar(
+		&_setTag, "set-tag", 1, 1,
+		options.Alias("S"),
+		options.ArgName("KEY=VALUE"),
+		options.Description("Adds a group-level metadata attribute KEY with value VALUE."),
+	)
+
+	// Occurrence-based filtering and related outputs.
+	options.IntVar(
+		&_minOccurrence, "min-occurrence", _minOccurrence,
+		options.Description("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all)."),
+	)
+	options.IntVar(
+		&_maxOccurrence, "max-occurrence", _maxOccurrence,
+		options.Description("Maximum number of occurrences for a k-mer to be kept (default 0 = no upper bound)."),
+	)
+	options.BoolVar(
+		&_saveFullFilter, "save-full-filter", _saveFullFilter,
+		options.Description("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index."),
+	)
+	options.IntVar(
+		&_saveFreqKmer, "save-freq-kmer", _saveFreqKmer,
+		options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv)."),
+	)
+
+	// --entropy-filter and --entropy-filter-size are registered last, through
+	// the option set that the filter subcommand also uses.
+	EntropyFilterOptionSet(options)
+}
+
+// EntropyFilterOptionSet registers entropy filter options for commands
+// that process existing indices (e.g. filter).
+func EntropyFilterOptionSet(options *getoptions.GetOpt) {
+	options.Float64Var(&_indexEntropyThreshold, "entropy-filter", _indexEntropyThreshold,
+		options.Description("Filter low-complexity k-mers with entropy <= threshold (0 = disabled)."))
+
+	options.IntVar(&_indexEntropySize, "entropy-filter-size", _indexEntropySize,
+		options.Description("Maximum word size for entropy filter computation (default 6)."))
+}
+
+// ==============================
+// Super kmer options
+// ==============================
+
+// SuperKmerOptionSet registers options specific to super k-mer extraction.
+// It only delegates to the shared option sets, adding --kmer-size / -k and
+// --minimizer-size / -m to the command.
+func SuperKmerOptionSet(options *getoptions.GetOpt) {
+	KmerSizeOptionSet(options)
+	MinimizerOptionSet(options)
+}
+
+// CLIKmerSize returns the k-mer size, i.e. the value bound to --kmer-size / -k
+// (31 unless changed on the command line or through SetKmerSize).
+func CLIKmerSize() int {
+	return _kmerSize
+}
+
+// CLIMinimizerSize returns the effective minimizer size.
+//
+// A negative configured value is replaced by obikmer.DefaultMinimizerSize for
+// the current k-mer size. The result is then passed through
+// obikmer.ValidateMinimizerSize together with the k-mer size and the number
+// of parallel workers, and the validated value is returned.
+func CLIMinimizerSize() int {
+	size := _minimizerSize
+	if size < 0 {
+		size = obikmer.DefaultMinimizerSize(_kmerSize)
+	}
+	return obikmer.ValidateMinimizerSize(size, _kmerSize, obidefault.ParallelWorkers())
+}
+
+// CLIIndexId returns the index identifier given with --index-id
+// (the empty string when the option was not used).
+func CLIIndexId() string {
+	return _indexId
+}
+
+// CLIMetadataFormat returns the metadata format selected with
+// --metadata-format. The comparison is case-insensitive; any value other than
+// "toml", "yaml" or "json" logs a warning and falls back to TOML.
+func CLIMetadataFormat() obikmer.MetadataFormat {
+	requested := strings.ToLower(_metadataFormat)
+
+	if requested == "yaml" {
+		return obikmer.FormatYAML
+	}
+	if requested == "json" {
+		return obikmer.FormatJSON
+	}
+	if requested != "toml" {
+		// Unrecognised value: warn with the original spelling, then use TOML.
+		log.Warnf("Unknown metadata format %q, defaulting to TOML", _metadataFormat)
+	}
+	return obikmer.FormatTOML
+}
+
+// CLISetTag returns the group-level metadata key=value pairs collected from
+// --set-tag / -S. The package-level map itself is returned, not a copy.
+func CLISetTag() map[string]string {
+	return _setTag
+}
+
+// CLIMinOccurrence returns the minimum occurrence threshold, i.e. the value
+// bound to --min-occurrence (1 by default).
+func CLIMinOccurrence() int {
+	return _minOccurrence
+}
+
+// CLIMaxOccurrence returns the maximum occurrence threshold bound to
+// --max-occurrence (0, the default, means no upper bound).
+func CLIMaxOccurrence() int {
+	return _maxOccurrence
+}
+
+// CLISaveFullFilter returns whether to save the full frequency filter,
+// i.e. whether --save-full-filter was set.
+func CLISaveFullFilter() bool {
+	return _saveFullFilter
+}
+
+// CLISaveFreqKmer returns the number of top frequent k-mers to save, as given
+// with --save-freq-kmer (0, the default, means disabled).
+func CLISaveFreqKmer() int {
+	return _saveFreqKmer
+}
+
+// CLIOutputDirectory returns the output directory path.
+// The value is whatever obiconvert.CLIOutPutFileName reports.
+func CLIOutputDirectory() string {
+	return obiconvert.CLIOutPutFileName()
+}
+
+// SetKmerSize sets the k-mer size (for testing).
+// It overwrites the same package-level value that --kmer-size writes to;
+// no validation is performed here.
+func SetKmerSize(k int) {
+	_kmerSize = k
+}
+
+// SetMinimizerSize sets the minimizer size (for testing).
+// It overwrites the same package-level value that --minimizer-size writes to;
+// a negative value makes CLIMinimizerSize fall back to the automatic size.
+func SetMinimizerSize(m int) {
+	_minimizerSize = m
+}
+
+// SetMinOccurrence sets the minimum occurrence (for testing).
+// It overwrites the same package-level value that --min-occurrence writes to.
+func SetMinOccurrence(n int) {
+	_minOccurrence = n
+}
+
+// CLIMaskingMode returns the masking mode from CLI flags.
+// --extract-low takes precedence over --extract-high; when neither flag is
+// set the result is MaskMode.
+func CLIMaskingMode() MaskingMode {
+	if _extractMode {
+		return ExtractMode
+	}
+	if _splitMode {
+		return SplitMode
+	}
+	return MaskMode
+}
+
+// CLIMaskingChar returns the masking character, validated.
+//
+// The --masking-char value is trimmed of surrounding white space and must
+// then be exactly one byte long; otherwise the program terminates through
+// log.Fatalf. The offending value is reported quoted and untrimmed so that
+// empty or white-space-only arguments remain visible in the message.
+func CLIMaskingChar() byte {
+	mask := strings.TrimSpace(_maskingChar)
+	if len(mask) != 1 {
+		log.Fatalf("--masking-char option accepts a single character, not %q", _maskingChar)
+	}
+	// Indexing a string yields its byte directly; no []byte copy is needed.
+	return mask[0]
+}
+
+// CLIEntropySize returns the entropy word size, i.e. the value bound to
+// --entropy-size by LowMaskOptionSet (6 by default).
+func CLIEntropySize() int {
+	return _entropySize
+}
+
+// CLIEntropyThreshold returns the entropy threshold, i.e. the value bound to
+// --threshold by LowMaskOptionSet (0.5 by default).
+func CLIEntropyThreshold() float64 {
+	return _entropyThreshold
+}
+
+// CLIKeepShorter returns whether to keep short fragments, i.e. whether
+// --keep-shorter was set.
+func CLIKeepShorter() bool {
+	return _keepShorter
+}
+
+// ==============================
+// Match-specific options
+// ==============================
+
+// _indexDirectory stores the value of --index / -i; it is empty until parsed.
+var _indexDirectory string
+
+// IndexDirectoryOptionSet registers --index / -i (mandatory directory for match).
+func IndexDirectoryOptionSet(options *getoptions.GetOpt) {
+	options.StringVar(&_indexDirectory, "index", _indexDirectory,
+		options.Alias("i"),
+		options.Required(),
+		options.ArgName("DIRECTORY"),
+		options.Description("Path to the kmer index directory."))
+}
+
+// CLIIndexDirectory returns the --index directory path
+// (the value bound to --index / -i by IndexDirectoryOptionSet).
+func CLIIndexDirectory() string {
+	return _indexDirectory
+}
+
+// CLIIndexEntropyThreshold returns the entropy filter threshold (0 = disabled).
+// The value is bound to --entropy-filter, which is registered both for index
+// building and by EntropyFilterOptionSet.
+func CLIIndexEntropyThreshold() float64 {
+	return _indexEntropyThreshold
+}
+
+// CLIIndexEntropySize returns the entropy filter word size (6 by default).
+// The value is bound to --entropy-filter-size, which is registered both for
+// index building and by EntropyFilterOptionSet.
+func CLIIndexEntropySize() int {
+	return _indexEntropySize
+}
+
+// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output.
+// Each flag is a boolean defaulting to false; CLIOutFormat turns the three
+// flags into a single format name.
+func OutputFormatOptionSet(options *getoptions.GetOpt) {
+	formats := []struct {
+		target *bool
+		name   string
+		help   string
+	}{
+		{&_jsonOutput, "json-output", "Print results as JSON."},
+		{&_csvOutput, "csv-output", "Print results as CSV."},
+		{&_yamlOutput, "yaml-output", "Print results as YAML."},
+	}
+
+	// Registration follows the table order: JSON, CSV, YAML.
+	for _, f := range formats {
+		options.BoolVar(f.target, f.name, false, options.Description(f.help))
+	}
+}
+
+// CLIOutFormat returns the selected output format: "json", "csv", "yaml", or "text".
+// When several format flags are set, JSON wins over CSV, which wins over
+// YAML; "text" is returned when none is set.
+func CLIOutFormat() string {
+	switch {
+	case _jsonOutput:
+		return "json"
+	case _csvOutput:
+		return "csv"
+	case _yamlOutput:
+		return "yaml"
+	default:
+		return "text"
+	}
+}
+
+// SetSelectionOptionSet registers --set <glob_pattern> (repeatable).
+func SetSelectionOptionSet(options *getoptions.GetOpt) {
+	options.StringSliceVar(&_setPatterns, "set", 1, 1,
+		options.Alias("s"),
+		options.ArgName("PATTERN"),
+		options.Description("Set ID or glob pattern (repeatable, supports *, ?, [...])."))
+}
+
+// CLISetPatterns returns the --set patterns provided by the user.
+// The package-level slice is returned as is; it is empty when --set / -s
+// was never given.
+func CLISetPatterns() []string {
+	return _setPatterns
+}
+
+// ForceOptionSet registers --force / -f.
+func ForceOptionSet(options *getoptions.GetOpt) {
+	options.BoolVar(&_force, "force", false,
+		options.Alias("f"),
+		options.Description("Force operation even if set ID already exists in destination."))
+}
+
+// CLIForce returns whether --force was specified
+// (the flag registered by ForceOptionSet, false by default).
+func CLIForce() bool {
+	return _force
+}
--- a/pkg/obitools/obik/rm.go
+++ b/pkg/obitools/obik/rm.go
@@ -0,0 +1,56 @@
+package obik
+
+import (
+	"context"
+	"fmt"
+
+	log "github.com/sirupsen/logrus"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+// runRm implements the "rm" subcommand: it removes from the index directory
+// args[0] every set whose ID matches one of the --set patterns.
+//
+// At least one --set pattern is mandatory. ctx and opt are not used here.
+// It returns an error when the index directory argument or the patterns are
+// missing, when the index cannot be opened, when no set matches, or when a
+// removal fails.
+func runRm(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
+	if len(args) < 1 {
+		return fmt.Errorf("usage: obik rm --set PATTERN [--set PATTERN]... <index_directory>")
+	}
+
+	selectors := CLISetPatterns()
+	if len(selectors) == 0 {
+		return fmt.Errorf("--set is required (specify which sets to remove)")
+	}
+
+	directory := args[0]
+
+	group, err := obikmer.OpenKmerSetGroup(directory)
+	if err != nil {
+		return fmt.Errorf("failed to open kmer index: %w", err)
+	}
+
+	matched, err := group.MatchSetIDs(selectors)
+	if err != nil {
+		return err
+	}
+	if len(matched) == 0 {
+		return fmt.Errorf("no sets match the given patterns")
+	}
+
+	// Resolve indices to IDs up front: indices shift once removal starts.
+	setIDs := make([]string, 0, len(matched))
+	for _, setIdx := range matched {
+		setIDs = append(setIDs, group.SetIDOf(setIdx))
+	}
+
+	log.Infof("Removing %d set(s) from %s", len(setIDs), directory)
+
+	// Remove the sets last one first.
+	for n := len(setIDs); n > 0; n-- {
+		id := setIDs[n-1]
+		if err := group.RemoveSetByID(id); err != nil {
+			return fmt.Errorf("failed to remove set %q: %w", id, err)
+		}
+		log.Infof("Removed set %q", id)
+	}
+
+	log.Infof("Index now has %d set(s)", group.Size())
+	return nil
+}
--- a/Show More
+++ b/Show More