mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Compare commits
64 Commits
Release_4.
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f92f285417 | ||
|
|
a786b58ed3 | ||
|
|
a2b26712b2 | ||
|
|
1599abc9ad | ||
|
|
af213ab446 | ||
|
|
a60184c115 | ||
|
|
585b024bf0 | ||
|
|
afc9ffda85 | ||
|
|
fdd972bbd2 | ||
|
|
76f595e1fe | ||
|
|
1e1e5443e3 | ||
|
|
15d1f1fd80 | ||
|
|
8df2cbe22f | ||
|
|
58d685926b | ||
|
|
e9f24426df | ||
|
|
2f7be10b5d | ||
|
|
43125f9f5e | ||
|
|
c23368e929 | ||
|
|
6cb5a81685 | ||
|
|
94b0887069 | ||
|
|
c188580aac | ||
|
|
1e1f575d1c | ||
|
|
40769bf827 | ||
|
|
74e6fcaf83 | ||
|
|
30ec8b1b63 | ||
|
|
cdc72c5346 | ||
|
|
82a9972be7 | ||
|
|
ff6e515b2a | ||
|
|
cd0c525f50 | ||
|
|
abe935aa18 | ||
|
|
8dd32dc1bf | ||
|
|
6ee8750635 | ||
|
|
8c318c480e | ||
|
|
09fbc217d3 | ||
|
|
3d2e205722 | ||
|
|
623116ab13 | ||
|
|
1e4509cb63 | ||
|
|
b33d7705a8 | ||
|
|
1342c83db6 | ||
|
|
b246025907 | ||
|
|
761e0dbed3 | ||
|
|
a7ea47624b | ||
|
|
61e346658e | ||
|
|
1ba1294b11 | ||
|
|
b2476fffcb | ||
|
|
b05404721e | ||
|
|
c57e788459 | ||
|
|
1cecf23978 | ||
|
|
4c824ef9b7 | ||
|
|
1ce5da9bee | ||
|
|
dc23d9de9a | ||
|
|
aa9d7bbf72 | ||
|
|
db22d20d0a | ||
|
|
7c05bdb01c | ||
|
|
b6542c4523 | ||
|
|
ac41dd8a22 | ||
|
|
bebbbbfe7d | ||
|
|
c6e04265f1 | ||
|
|
9babcc0fae | ||
|
|
e775f7e256 | ||
|
|
f2937af1ad | ||
|
|
56c1f4180c | ||
|
|
f78543ee75 | ||
|
|
a016ad5b8a |
23
.github/workflows/release.yml
vendored
23
.github/workflows/release.yml
vendored
@@ -16,7 +16,7 @@ jobs:
|
|||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
uses: actions/setup-go@v5
|
uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: "1.23"
|
go-version: "1.26"
|
||||||
- name: Checkout obitools4 project
|
- name: Checkout obitools4 project
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
@@ -54,7 +54,7 @@ jobs:
|
|||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
uses: actions/setup-go@v5
|
uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: "1.23"
|
go-version: "1.26"
|
||||||
|
|
||||||
- name: Extract version from tag
|
- name: Extract version from tag
|
||||||
id: get_version
|
id: get_version
|
||||||
@@ -69,7 +69,23 @@ jobs:
|
|||||||
xcode-select --install 2>/dev/null || true
|
xcode-select --install 2>/dev/null || true
|
||||||
xcode-select -p
|
xcode-select -p
|
||||||
|
|
||||||
- name: Build binaries
|
- name: Build binaries (Linux)
|
||||||
|
if: runner.os == 'Linux'
|
||||||
|
env:
|
||||||
|
VERSION: ${{ steps.get_version.outputs.version }}
|
||||||
|
run: |
|
||||||
|
docker run --rm \
|
||||||
|
-v "$(pwd):/src" \
|
||||||
|
-w /src \
|
||||||
|
-e VERSION="${VERSION}" \
|
||||||
|
golang:1.26-alpine \
|
||||||
|
sh -c "apk add --no-cache gcc musl-dev zlib-dev zlib-static make && \
|
||||||
|
make LDFLAGS='-linkmode=external -extldflags=-static' obitools"
|
||||||
|
mkdir -p artifacts
|
||||||
|
tar -czf artifacts/obitools4_${VERSION}_${{ matrix.output_name }}.tar.gz -C build .
|
||||||
|
|
||||||
|
- name: Build binaries (macOS)
|
||||||
|
if: runner.os == 'macOS'
|
||||||
env:
|
env:
|
||||||
GOOS: ${{ matrix.goos }}
|
GOOS: ${{ matrix.goos }}
|
||||||
GOARCH: ${{ matrix.goarch }}
|
GOARCH: ${{ matrix.goarch }}
|
||||||
@@ -77,7 +93,6 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
make obitools
|
make obitools
|
||||||
mkdir -p artifacts
|
mkdir -p artifacts
|
||||||
# Create a single tar.gz with all binaries for this platform
|
|
||||||
tar -czf artifacts/obitools4_${VERSION}_${{ matrix.output_name }}.tar.gz -C build .
|
tar -czf artifacts/obitools4_${VERSION}_${{ matrix.output_name }}.tar.gz -C build .
|
||||||
|
|
||||||
- name: Upload artifacts
|
- name: Upload artifacts
|
||||||
|
|||||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -16,6 +16,7 @@
|
|||||||
**/*.tgz
|
**/*.tgz
|
||||||
**/*.yaml
|
**/*.yaml
|
||||||
**/*.csv
|
**/*.csv
|
||||||
|
**/*.pb.gz
|
||||||
xx
|
xx
|
||||||
|
|
||||||
.rhistory
|
.rhistory
|
||||||
@@ -33,3 +34,4 @@ LLM/**
|
|||||||
entropy.html
|
entropy.html
|
||||||
bug_id.txt
|
bug_id.txt
|
||||||
obilowmask_ref
|
obilowmask_ref
|
||||||
|
test_*
|
||||||
|
|||||||
148
Makefile
148
Makefile
@@ -2,9 +2,17 @@
|
|||||||
#export GOBIN=$(GOPATH)/bin
|
#export GOBIN=$(GOPATH)/bin
|
||||||
#export PATH=$(GOBIN):$(shell echo $${PATH})
|
#export PATH=$(GOBIN):$(shell echo $${PATH})
|
||||||
|
|
||||||
|
.DEFAULT_GOAL := all
|
||||||
|
|
||||||
|
GREEN := \033[0;32m
|
||||||
|
YELLOW := \033[0;33m
|
||||||
|
BLUE := \033[0;34m
|
||||||
|
NC := \033[0m
|
||||||
|
|
||||||
GOFLAGS=
|
GOFLAGS=
|
||||||
|
LDFLAGS=
|
||||||
GOCMD=go
|
GOCMD=go
|
||||||
GOBUILD=$(GOCMD) build $(GOFLAGS)
|
GOBUILD=$(GOCMD) build $(GOFLAGS) $(if $(LDFLAGS),-ldflags="$(LDFLAGS)")
|
||||||
GOGENERATE=$(GOCMD) generate
|
GOGENERATE=$(GOCMD) generate
|
||||||
GOCLEAN=$(GOCMD) clean
|
GOCLEAN=$(GOCMD) clean
|
||||||
GOTEST=$(GOCMD) test
|
GOTEST=$(GOCMD) test
|
||||||
@@ -43,7 +51,7 @@ $(OBITOOLS_PREFIX)$(notdir $(1)): $(BUILD_DIR) $(1) pkg/obioptions/version.go
|
|||||||
@echo -n - Building obitool $(notdir $(1))...
|
@echo -n - Building obitool $(notdir $(1))...
|
||||||
@$(GOBUILD) -o $(BUILD_DIR)/$(OBITOOLS_PREFIX)$(notdir $(1)) ./$(1) \
|
@$(GOBUILD) -o $(BUILD_DIR)/$(OBITOOLS_PREFIX)$(notdir $(1)) ./$(1) \
|
||||||
2> $(OBITOOLS_PREFIX)$(notdir $(1)).log \
|
2> $(OBITOOLS_PREFIX)$(notdir $(1)).log \
|
||||||
|| cat $(OBITOOLS_PREFIX)$(notdir $(1)).log
|
|| { cat $(OBITOOLS_PREFIX)$(notdir $(1)).log; rm -f $(OBITOOLS_PREFIX)$(notdir $(1)).log; exit 1; }
|
||||||
@rm -f $(OBITOOLS_PREFIX)$(notdir $(1)).log
|
@rm -f $(OBITOOLS_PREFIX)$(notdir $(1)).log
|
||||||
@echo Done.
|
@echo Done.
|
||||||
endef
|
endef
|
||||||
@@ -60,6 +68,28 @@ endif
|
|||||||
|
|
||||||
OUTPUT:=$(shell mktemp)
|
OUTPUT:=$(shell mktemp)
|
||||||
|
|
||||||
|
help:
|
||||||
|
@printf "$(GREEN)OBITools4 Makefile$(NC)\n\n"
|
||||||
|
@printf "$(BLUE)Main targets:$(NC)\n"
|
||||||
|
@printf " %-20s %s\n" "all" "Build all obitools (default)"
|
||||||
|
@printf " %-20s %s\n" "obitools" "Build all obitools binaries to build/"
|
||||||
|
@printf " %-20s %s\n" "test" "Run Go unit tests"
|
||||||
|
@printf " %-20s %s\n" "obitests" "Run integration tests (obitests/)"
|
||||||
|
@printf " %-20s %s\n" "bump-version" "Increment patch version (or set with VERSION=x.y.z)"
|
||||||
|
@printf " %-20s %s\n" "update-deps" "Update all Go dependencies"
|
||||||
|
@printf "\n$(BLUE)Jujutsu workflow:$(NC)\n"
|
||||||
|
@printf " %-20s %s\n" "jjnew" "Document current commit and start a new one"
|
||||||
|
@printf " %-20s %s\n" "jjpush" "Release: describe, bump, generate notes, push PR, tag (VERSION=x.y.z optional)"
|
||||||
|
@printf " %-20s %s\n" "jjfetch" "Fetch latest commits from origin"
|
||||||
|
@printf "\n$(BLUE)Required tools:$(NC)\n"
|
||||||
|
@printf " %-20s " "go"; command -v go >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(go version)" || printf "$(YELLOW)✗ not found$(NC)\n"
|
||||||
|
@printf " %-20s " "git"; command -v git >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(git --version)" || printf "$(YELLOW)✗ not found$(NC)\n"
|
||||||
|
@printf " %-20s " "jj"; command -v jj >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jj --version)" || printf "$(YELLOW)✗ not found$(NC)\n"
|
||||||
|
@printf " %-20s " "gh"; command -v gh >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(gh --version | head -1)" || printf "$(YELLOW)✗ not found$(NC) (brew install gh)\n"
|
||||||
|
@printf "\n$(BLUE)Optional tools (release notes generation):$(NC)\n"
|
||||||
|
@printf " %-20s " "aichat"; command -v aichat >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(aichat --version)" || printf "$(YELLOW)✗ not found$(NC) (https://github.com/sigoden/aichat)\n"
|
||||||
|
@printf " %-20s " "jq"; command -v jq >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jq --version)" || printf "$(YELLOW)✗ not found$(NC) (brew install jq)\n"
|
||||||
|
|
||||||
all: install-githook obitools
|
all: install-githook obitools
|
||||||
|
|
||||||
obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS))
|
obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS))
|
||||||
@@ -106,15 +136,20 @@ pkg/obioptions/version.go: version.txt .FORCE
|
|||||||
@rm -f $(OUTPUT)
|
@rm -f $(OUTPUT)
|
||||||
|
|
||||||
bump-version:
|
bump-version:
|
||||||
@echo "Incrementing version..."
|
|
||||||
@current=$$(cat version.txt); \
|
@current=$$(cat version.txt); \
|
||||||
echo " Current version: $$current"; \
|
if [ -n "$(VERSION)" ]; then \
|
||||||
major=$$(echo $$current | cut -d. -f1); \
|
new_version="$(VERSION)"; \
|
||||||
minor=$$(echo $$current | cut -d. -f2); \
|
echo "Setting version to $$new_version (was $$current)"; \
|
||||||
patch=$$(echo $$current | cut -d. -f3); \
|
else \
|
||||||
new_patch=$$((patch + 1)); \
|
echo "Incrementing version..."; \
|
||||||
new_version="$$major.$$minor.$$new_patch"; \
|
echo " Current version: $$current"; \
|
||||||
echo " New version: $$new_version"; \
|
major=$$(echo $$current | cut -d. -f1); \
|
||||||
|
minor=$$(echo $$current | cut -d. -f2); \
|
||||||
|
patch=$$(echo $$current | cut -d. -f3); \
|
||||||
|
new_patch=$$((patch + 1)); \
|
||||||
|
new_version="$$major.$$minor.$$new_patch"; \
|
||||||
|
echo " New version: $$new_version"; \
|
||||||
|
fi; \
|
||||||
echo "$$new_version" > version.txt
|
echo "$$new_version" > version.txt
|
||||||
@echo "✓ Version updated in version.txt"
|
@echo "✓ Version updated in version.txt"
|
||||||
@$(MAKE) pkg/obioptions/version.go
|
@$(MAKE) pkg/obioptions/version.go
|
||||||
@@ -128,40 +163,77 @@ jjnew:
|
|||||||
@echo "$(GREEN)✓ New commit created$(NC)"
|
@echo "$(GREEN)✓ New commit created$(NC)"
|
||||||
|
|
||||||
jjpush:
|
jjpush:
|
||||||
@echo "$(YELLOW)→ Pushing commit to repository...$(NC)"
|
@$(MAKE) jjpush-describe
|
||||||
|
@$(MAKE) jjpush-bump
|
||||||
|
@$(MAKE) jjpush-notes
|
||||||
|
@$(MAKE) jjpush-push
|
||||||
|
@$(MAKE) jjpush-tag
|
||||||
|
@echo "$(GREEN)✓ Release complete$(NC)"
|
||||||
|
|
||||||
|
jjpush-describe:
|
||||||
@echo "$(BLUE)→ Documenting current commit...$(NC)"
|
@echo "$(BLUE)→ Documenting current commit...$(NC)"
|
||||||
@jj auto-describe
|
@jj auto-describe
|
||||||
|
|
||||||
|
jjpush-bump:
|
||||||
@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
|
@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
|
||||||
@jj new
|
@jj new
|
||||||
@previous_version=$$(cat version.txt); \
|
@$(MAKE) bump-version
|
||||||
$(MAKE) bump-version; \
|
|
||||||
version=$$(cat version.txt); \
|
jjpush-notes:
|
||||||
tag_name="Release_$$version"; \
|
@version=$$(cat version.txt); \
|
||||||
previous_tag="Release_$$previous_version"; \
|
echo "$(BLUE)→ Generating release notes for version $$version...$(NC)"; \
|
||||||
echo "$(BLUE)→ Documenting version bump commit...$(NC)"; \
|
release_title="Release $$version"; \
|
||||||
jj auto-describe; \
|
release_body=""; \
|
||||||
echo "$(BLUE)→ Generating release notes from $$previous_tag to current commit...$(NC)"; \
|
if command -v aichat >/dev/null 2>&1; then \
|
||||||
if command -v orla >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then \
|
previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' 2>/dev/null); \
|
||||||
release_json=$$(ORLA_MAX_TOOL_CALLS=50 jj log -r "$$previous_tag::@" -T 'commit_id.short() ++ " " ++ description' | \
|
if [ -z "$$previous_tag" ]; then \
|
||||||
orla agent -m ollama:qwen3-coder-next:latest \
|
echo "$(YELLOW)⚠ No previous Release tag found, skipping release notes$(NC)"; \
|
||||||
"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}"); \
|
|
||||||
release_json=$$(echo "$$release_json" | sed -n '/^{/,/^}/p'); \
|
|
||||||
release_title=$$(echo "$$release_json" | jq -r '.title // empty') ; \
|
|
||||||
release_body=$$(echo "$$release_json" | jq -r '.body // empty') ; \
|
|
||||||
if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \
|
|
||||||
release_message="$$release_title"$$'\n\n'"$$release_body"; \
|
|
||||||
else \
|
else \
|
||||||
echo "$(YELLOW)⚠ JSON parsing failed, falling back to raw output$(NC)"; \
|
raw_output=$$(git log --format="%h %B" "$$previous_tag..HEAD" | \
|
||||||
release_message="Release $$version"$$'\n\n'"$$release_json"; \
|
aichat \
|
||||||
|
"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}" 2>/dev/null) || true; \
|
||||||
|
if [ -n "$$raw_output" ]; then \
|
||||||
|
notes=$$(printf '%s\n' "$$raw_output" | python3 tools/json2md.py 2>/dev/null); \
|
||||||
|
if [ -n "$$notes" ]; then \
|
||||||
|
release_title=$$(echo "$$notes" | head -1); \
|
||||||
|
release_body=$$(echo "$$notes" | tail -n +3); \
|
||||||
|
else \
|
||||||
|
echo "$(YELLOW)⚠ JSON parsing failed, using default release message$(NC)"; \
|
||||||
|
fi; \
|
||||||
|
fi; \
|
||||||
fi; \
|
fi; \
|
||||||
else \
|
|
||||||
release_message="Release $$version"; \
|
|
||||||
fi; \
|
fi; \
|
||||||
echo "$(BLUE)→ Pushing commits and creating tag $$tag_name...$(NC)"; \
|
printf '%s' "$$release_title" > /tmp/obitools4-release-title.txt; \
|
||||||
jj git push --change @; \
|
printf '%s' "$$release_body" > /tmp/obitools4-release-body.txt; \
|
||||||
git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "Tag $$tag_name already exists"; \
|
echo "$(BLUE)→ Setting release notes as commit description...$(NC)"; \
|
||||||
git push origin "$$tag_name" 2>/dev/null || echo "Tag already pushed"
|
jj desc -m "$$release_title"$$'\n\n'"$$release_body"
|
||||||
@echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)"
|
|
||||||
|
jjpush-push:
|
||||||
|
@echo "$(BLUE)→ Pushing commits...$(NC)"
|
||||||
|
@jj git push --change @
|
||||||
|
@echo "$(BLUE)→ Creating/updating PR...$(NC)"
|
||||||
|
@release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$(cat version.txt)"); \
|
||||||
|
release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \
|
||||||
|
branch=$$(jj log -r @ --no-graph -T 'bookmarks.map(|b| b.name()).join("\n")' 2>/dev/null | head -1); \
|
||||||
|
if [ -n "$$branch" ] && command -v gh >/dev/null 2>&1; then \
|
||||||
|
gh pr create --title "$$release_title" --body "$$release_body" --base master --head "$$branch" 2>/dev/null \
|
||||||
|
|| gh pr edit "$$branch" --title "$$release_title" --body "$$release_body" 2>/dev/null \
|
||||||
|
|| echo "$(YELLOW)⚠ Could not create/update PR$(NC)"; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
jjpush-tag:
|
||||||
|
@version=$$(cat version.txt); \
|
||||||
|
tag_name="Release_$$version"; \
|
||||||
|
release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$version"); \
|
||||||
|
release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \
|
||||||
|
install_section=$$'\n## Installation\n\n### Pre-built binaries\n\nDownload the appropriate archive for your system from the\n[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_'"$$version"')\nand extract it:\n\n#### Linux (AMD64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_amd64.tar.gz\n```\n\n#### Linux (ARM64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_arm64.tar.gz\n```\n\n#### macOS (Intel)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_amd64.tar.gz\n```\n\n#### macOS (Apple Silicon)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_arm64.tar.gz\n```\n\nAll OBITools4 binaries are included in each archive.\n\n### From source\n\nYou can also compile and install OBITools4 directly from source using the\ninstallation script:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version '"$$version"'\n```\n\nBy default binaries are installed in `/usr/local/bin`. Use `--install-dir` to\nchange the destination and `--obitools-prefix` to add a prefix to command names:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\\n bash -s -- --version '"$$version"' --install-dir ~/local --obitools-prefix k\n```\n'; \
|
||||||
|
release_message="$$release_title"$$'\n\n'"$$release_body$$install_section"; \
|
||||||
|
echo "$(BLUE)→ Creating tag $$tag_name...$(NC)"; \
|
||||||
|
commit_hash=$$(jj log -r @ --no-graph -T 'commit_id' 2>/dev/null); \
|
||||||
|
git tag -a "$$tag_name" $${commit_hash:+"$$commit_hash"} -m "$$release_message" 2>/dev/null || echo "$(YELLOW)⚠ Tag $$tag_name already exists$(NC)"; \
|
||||||
|
echo "$(BLUE)→ Pushing tag $$tag_name...$(NC)"; \
|
||||||
|
git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)"; \
|
||||||
|
rm -f /tmp/obitools4-release-title.txt /tmp/obitools4-release-body.txt
|
||||||
|
|
||||||
jjfetch:
|
jjfetch:
|
||||||
@echo "$(YELLOW)→ Pulling latest commits...$(NC)"
|
@echo "$(YELLOW)→ Pulling latest commits...$(NC)"
|
||||||
@@ -169,5 +241,5 @@ jjfetch:
|
|||||||
@jj new master@origin
|
@jj new master@origin
|
||||||
@echo "$(GREEN)✓ Latest commits pulled$(NC)"
|
@echo "$(GREEN)✓ Latest commits pulled$(NC)"
|
||||||
|
|
||||||
.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch bump-version .FORCE
|
.PHONY: all obitools update-deps obitests githubtests help jjnew jjpush jjpush-describe jjpush-bump jjpush-notes jjpush-push jjpush-tag jjfetch bump-version .FORCE
|
||||||
.FORCE:
|
.FORCE:
|
||||||
|
|||||||
@@ -32,8 +32,12 @@ The installation script offers several options:
|
|||||||
>
|
>
|
||||||
> -p, --obitools-prefix Prefix added to the obitools command names if you
|
> -p, --obitools-prefix Prefix added to the obitools command names if you
|
||||||
> want to have several versions of obitools at the
|
> want to have several versions of obitools at the
|
||||||
> same time on your system (as example `-p g` will produce
|
> same time on your system (as example `-p g` will produce
|
||||||
> `gobigrep` command instead of `obigrep`).
|
> `gobigrep` command instead of `obigrep`).
|
||||||
|
>
|
||||||
|
> -j, --jobs Number of parallel jobs used for compilation
|
||||||
|
> (default: 1). Increase this value to speed up
|
||||||
|
> compilation on multi-core systems (e.g., `-j 4`).
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
|
|
||||||
|
|||||||
508
blackboard/Prospective/kmer_disk_index_plan.md
Normal file
508
blackboard/Prospective/kmer_disk_index_plan.md
Normal file
@@ -0,0 +1,508 @@
|
|||||||
|
# Plan de refonte du package obikmer : index disk-based par partitions minimizer
|
||||||
|
|
||||||
|
## Constat
|
||||||
|
|
||||||
|
Les roaring64 bitmaps ne sont pas adaptés au stockage de 10^10 k-mers
|
||||||
|
(k=31) dispersés sur un espace de 2^62. L'overhead structurel (containers
|
||||||
|
roaring par high key 32 bits) dépasse la taille des données elles-mêmes,
|
||||||
|
et les opérations `Or()` entre bitmaps fragmentés ne terminent pas en
|
||||||
|
temps raisonnable.
|
||||||
|
|
||||||
|
## Principe de la nouvelle architecture
|
||||||
|
|
||||||
|
Un `KmerSet` est un ensemble trié de k-mers canoniques (uint64) stocké
|
||||||
|
sur disque, partitionné par minimizer. Chaque partition est un fichier
|
||||||
|
binaire contenant des uint64 triés, compressés par delta-varint.
|
||||||
|
|
||||||
|
Un `KmerSetGroup` est un répertoire contenant N ensembles partitionnés
|
||||||
|
de la même façon (même k, même m, même P).
|
||||||
|
|
||||||
|
Un `KmerSet` est un `KmerSetGroup` de taille 1 (singleton).
|
||||||
|
|
||||||
|
Les opérations ensemblistes se font partition par partition, en merge
|
||||||
|
streaming, sans charger l'index complet en mémoire.
|
||||||
|
|
||||||
|
## Cycle de vie d'un index
|
||||||
|
|
||||||
|
L'index a deux phases distinctes :
|
||||||
|
|
||||||
|
1. **Phase de construction (mutable)** : on ouvre un index, on y ajoute
|
||||||
|
des séquences. Pour chaque séquence, les super-kmers sont extraits
|
||||||
|
et écrits de manière compacte (2 bits/base) dans le fichier
|
||||||
|
temporaire de partition correspondant (`minimizer % P`). Les
|
||||||
|
super-kmers sont une représentation compressée naturelle des k-mers
|
||||||
|
chevauchants : un super-kmer de longueur L encode L-k+1 k-mers en
|
||||||
|
ne stockant que ~L/4 bytes au lieu de (L-k+1) × 8 bytes.
|
||||||
|
|
||||||
|
2. **Phase de clôture (optimisation)** : on ferme l'index, ce qui
|
||||||
|
déclenche le traitement **partition par partition** (indépendant,
|
||||||
|
parallélisable) :
|
||||||
|
- Charger les super-kmers de la partition
|
||||||
|
- En extraire tous les k-mers canoniques
|
||||||
|
- Trier le tableau de k-mers
|
||||||
|
- Dédupliquer (et compter si FrequencyFilter)
|
||||||
|
- Delta-encoder et écrire le fichier .kdi final
|
||||||
|
Après clôture, l'index est statique et immuable.
|
||||||
|
|
||||||
|
3. **Phase de lecture (immutable)** : opérations ensemblistes,
|
||||||
|
Jaccard, Quorum, Contains, itération. Toutes en streaming.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Format sur disque
|
||||||
|
|
||||||
|
### Index finalisé
|
||||||
|
|
||||||
|
```
|
||||||
|
index_dir/
|
||||||
|
metadata.toml
|
||||||
|
set_0/
|
||||||
|
part_0000.kdi
|
||||||
|
part_0001.kdi
|
||||||
|
...
|
||||||
|
part_{P-1}.kdi
|
||||||
|
set_1/
|
||||||
|
part_0000.kdi
|
||||||
|
...
|
||||||
|
...
|
||||||
|
set_{N-1}/
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
### Fichiers temporaires pendant la construction
|
||||||
|
|
||||||
|
```
|
||||||
|
index_dir/
|
||||||
|
.build/
|
||||||
|
set_0/
|
||||||
|
part_0000.skm # super-kmers encodés 2 bits/base
|
||||||
|
part_0001.skm
|
||||||
|
...
|
||||||
|
set_1/
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
Le répertoire `.build/` est supprimé après Close().
|
||||||
|
|
||||||
|
### metadata.toml
|
||||||
|
|
||||||
|
```toml
|
||||||
|
id = "mon_index"
|
||||||
|
k = 31
|
||||||
|
m = 13
|
||||||
|
partitions = 1024
|
||||||
|
type = "KmerSetGroup" # ou "KmerSet" (N=1)
|
||||||
|
size = 3 # nombre de sets (N)
|
||||||
|
sets_ids = ["genome_A", "genome_B", "genome_C"]
|
||||||
|
|
||||||
|
[user_metadata]
|
||||||
|
organism = "Triticum aestivum"
|
||||||
|
|
||||||
|
[sets_metadata]
|
||||||
|
# métadonnées individuelles par set si nécessaire
|
||||||
|
```
|
||||||
|
|
||||||
|
### Fichier .kdi (Kmer Delta Index)
|
||||||
|
|
||||||
|
Format binaire :
|
||||||
|
|
||||||
|
```
|
||||||
|
[magic: 4 bytes "KDI\x01"]
|
||||||
|
[count: uint64 little-endian] # nombre de k-mers dans cette partition
|
||||||
|
[first: uint64 little-endian] # premier k-mer (valeur absolue)
|
||||||
|
[delta_1: varint] # arr[1] - arr[0]
|
||||||
|
[delta_2: varint] # arr[2] - arr[1]
|
||||||
|
...
|
||||||
|
[delta_{count-1}: varint] # arr[count-1] - arr[count-2]
|
||||||
|
```
|
||||||
|
|
||||||
|
Varint : encoding unsigned, 7 bits utiles par byte, bit de poids fort
|
||||||
|
= continuation (identique au varint protobuf).
|
||||||
|
|
||||||
|
Fichier vide (partition sans k-mer) : magic + count=0.
|
||||||
|
|
||||||
|
### Fichier .skm (Super-Kmer temporaire)
|
||||||
|
|
||||||
|
Format binaire, séquence de super-kmers encodés :
|
||||||
|
|
||||||
|
```
|
||||||
|
[len: uint16 little-endian] # longueur du super-kmer en bases
|
||||||
|
[sequence: ceil(len/4) bytes] # séquence encodée 2 bits/base, packed
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Compression par rapport au stockage de k-mers bruts** :
|
||||||
|
|
||||||
|
Un super-kmer de longueur L contient L-k+1 k-mers.
|
||||||
|
- Stockage super-kmer : 2 + ceil(L/4) bytes
|
||||||
|
- Stockage k-mers bruts : (L-k+1) × 8 bytes
|
||||||
|
|
||||||
|
Exemple avec k=31, super-kmer typique L=50 :
|
||||||
|
- Super-kmer : 2 + 13 = 15 bytes → encode 20 k-mers
|
||||||
|
- K-mers bruts : 20 × 8 = 160 bytes
|
||||||
|
- **Facteur de compression : ~10×**
|
||||||
|
|
||||||
|
Pour un génome de 10 Gbases (~10^10 k-mers bruts) :
|
||||||
|
- K-mers bruts : ~80 Go par set temporaire
|
||||||
|
- Super-kmers : **~8 Go** par set temporaire
|
||||||
|
|
||||||
|
Avec FrequencyFilter et couverture 30× :
|
||||||
|
- K-mers bruts : ~2.4 To
|
||||||
|
- Super-kmers : **~240 Go**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## FrequencyFilter
|
||||||
|
|
||||||
|
Le FrequencyFilter n'est plus un type de données séparé. C'est un
|
||||||
|
**mode de construction** du builder. Le résultat est un KmerSetGroup
|
||||||
|
standard.
|
||||||
|
|
||||||
|
### Principe
|
||||||
|
|
||||||
|
Pendant la construction, tous les super-kmers sont écrits dans les
|
||||||
|
fichiers temporaires .skm, y compris les doublons (chaque occurrence
|
||||||
|
de chaque séquence est écrite).
|
||||||
|
|
||||||
|
Pendant Close(), pour chaque partition :
|
||||||
|
1. Charger tous les super-kmers de la partition
|
||||||
|
2. Extraire tous les k-mers canoniques dans un tableau []uint64
|
||||||
|
3. Trier le tableau
|
||||||
|
4. Parcourir linéairement : les k-mers identiques sont consécutifs
|
||||||
|
5. Compter les occurrences de chaque k-mer
|
||||||
|
6. Si count >= minFreq → écrire dans le .kdi final (une seule fois)
|
||||||
|
7. Sinon → ignorer
|
||||||
|
|
||||||
|
### Dimensionnement
|
||||||
|
|
||||||
|
Pour un génome de 10 Gbases avec couverture 30× :
|
||||||
|
- N_brut ≈ 3×10^11 k-mers bruts
|
||||||
|
- Espace temporaire .skm ≈ 240 Go (compressé super-kmer)
|
||||||
|
- RAM par partition pendant Close() :
|
||||||
|
Avec P=1024 : ~3×10^8 k-mers/partition × 8 = **~2.4 Go**
|
||||||
|
Avec P=4096 : ~7.3×10^7 k-mers/partition × 8 = **~600 Mo**
|
||||||
|
|
||||||
|
Le choix de P détermine le compromis nombre de fichiers vs RAM par
|
||||||
|
partition.
|
||||||
|
|
||||||
|
### Sans FrequencyFilter (déduplication simple)
|
||||||
|
|
||||||
|
Pour de la déduplication simple (chaque k-mer écrit une fois), le
|
||||||
|
builder peut dédupliquer au niveau des buffers en RAM avant flush.
|
||||||
|
Cela réduit significativement l'espace temporaire car les doublons
|
||||||
|
au sein d'un même buffer (provenant de séquences proches) sont
|
||||||
|
éliminés immédiatement.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## API publique visée
|
||||||
|
|
||||||
|
### Structures
|
||||||
|
|
||||||
|
```go
|
||||||
|
// KmerSetGroup est l'entité de base.
|
||||||
|
// Un KmerSet est un KmerSetGroup avec Size() == 1.
|
||||||
|
type KmerSetGroup struct {
|
||||||
|
// champs internes : path, k, m, P, N, metadata, état
|
||||||
|
}
|
||||||
|
|
||||||
|
// KmerSetGroupBuilder construit un KmerSetGroup mutable.
|
||||||
|
type KmerSetGroupBuilder struct {
|
||||||
|
// champs internes : buffers I/O par partition et par set,
|
||||||
|
// fichiers temporaires .skm, paramètres (minFreq, etc.)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Construction
|
||||||
|
|
||||||
|
```go
|
||||||
|
// NewKmerSetGroupBuilder crée un builder pour un nouveau KmerSetGroup.
|
||||||
|
// directory : répertoire de destination
|
||||||
|
// k : taille des k-mers (1-31)
|
||||||
|
// m : taille des minimizers (-1 pour auto = ceil(k/2.5))
|
||||||
|
// n : nombre de sets dans le groupe
|
||||||
|
// P : nombre de partitions (-1 pour auto)
|
||||||
|
// options : options de construction (FrequencyFilter, etc.)
|
||||||
|
func NewKmerSetGroupBuilder(directory string, k, m, n, P int,
|
||||||
|
options ...BuilderOption) (*KmerSetGroupBuilder, error)
|
||||||
|
|
||||||
|
// WithMinFrequency active le mode FrequencyFilter.
|
||||||
|
// Seuls les k-mers vus >= minFreq fois sont conservés dans l'index
|
||||||
|
// final. Les super-kmers sont écrits avec leurs doublons pendant
|
||||||
|
// la construction ; le comptage exact se fait au Close().
|
||||||
|
func WithMinFrequency(minFreq int) BuilderOption
|
||||||
|
|
||||||
|
// AddSequence extrait les super-kmers d'une séquence et les écrit
|
||||||
|
// dans les fichiers temporaires de partition du set i.
|
||||||
|
func (b *KmerSetGroupBuilder) AddSequence(setIndex int, seq *obiseq.BioSequence)
|
||||||
|
|
||||||
|
// AddSuperKmer écrit un super-kmer dans le fichier temporaire de
|
||||||
|
// sa partition pour le set i.
|
||||||
|
func (b *KmerSetGroupBuilder) AddSuperKmer(setIndex int, sk SuperKmer)
|
||||||
|
|
||||||
|
// Close finalise la construction :
|
||||||
|
// - flush des buffers d'écriture
|
||||||
|
// - pour chaque partition de chaque set (parallélisable) :
|
||||||
|
// - charger les super-kmers depuis le .skm
|
||||||
|
// - extraire les k-mers canoniques
|
||||||
|
// - trier, dédupliquer (compter si freq filter)
|
||||||
|
// - delta-encoder et écrire le .kdi
|
||||||
|
// - écrire metadata.toml
|
||||||
|
// - supprimer le répertoire .build/
|
||||||
|
// Retourne le KmerSetGroup en lecture seule.
|
||||||
|
func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Lecture et opérations
|
||||||
|
|
||||||
|
```go
|
||||||
|
// OpenKmerSetGroup ouvre un index finalisé en lecture seule.
|
||||||
|
func OpenKmerSetGroup(directory string) (*KmerSetGroup, error)
|
||||||
|
|
||||||
|
// --- Métadonnées (API inchangée) ---
|
||||||
|
func (ksg *KmerSetGroup) K() int
|
||||||
|
func (ksg *KmerSetGroup) M() int // nouveau : taille du minimizer
|
||||||
|
func (ksg *KmerSetGroup) Partitions() int // nouveau : nombre de partitions
|
||||||
|
func (ksg *KmerSetGroup) Size() int
|
||||||
|
func (ksg *KmerSetGroup) Id() string
|
||||||
|
func (ksg *KmerSetGroup) SetId(id string)
|
||||||
|
func (ksg *KmerSetGroup) HasAttribute(key string) bool
|
||||||
|
func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool)
|
||||||
|
func (ksg *KmerSetGroup) SetAttribute(key string, value interface{})
|
||||||
|
// ... etc (toute l'API attributs actuelle est conservée)
|
||||||
|
|
||||||
|
// --- Opérations ensemblistes ---
|
||||||
|
// Toutes produisent un nouveau KmerSetGroup singleton sur disque.
|
||||||
|
// Opèrent partition par partition en streaming.
|
||||||
|
|
||||||
|
func (ksg *KmerSetGroup) Union(outputDir string) (*KmerSetGroup, error)
|
||||||
|
func (ksg *KmerSetGroup) Intersect(outputDir string) (*KmerSetGroup, error)
|
||||||
|
func (ksg *KmerSetGroup) Difference(outputDir string) (*KmerSetGroup, error)
|
||||||
|
func (ksg *KmerSetGroup) QuorumAtLeast(q int, outputDir string) (*KmerSetGroup, error)
|
||||||
|
func (ksg *KmerSetGroup) QuorumExactly(q int, outputDir string) (*KmerSetGroup, error)
|
||||||
|
func (ksg *KmerSetGroup) QuorumAtMost(q int, outputDir string) (*KmerSetGroup, error)
|
||||||
|
|
||||||
|
// --- Opérations entre deux KmerSetGroups ---
|
||||||
|
// Les deux groupes doivent avoir les mêmes k, m, P.
|
||||||
|
|
||||||
|
func (ksg *KmerSetGroup) UnionWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error)
|
||||||
|
func (ksg *KmerSetGroup) IntersectWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error)
|
||||||
|
|
||||||
|
// --- Métriques (résultat en mémoire, pas de sortie disque) ---
|
||||||
|
|
||||||
|
func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix
|
||||||
|
func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix
|
||||||
|
|
||||||
|
// --- Accès individuel ---
|
||||||
|
|
||||||
|
func (ksg *KmerSetGroup) Len(setIndex ...int) uint64
|
||||||
|
func (ksg *KmerSetGroup) Contains(setIndex int, kmer uint64) bool
|
||||||
|
func (ksg *KmerSetGroup) Iterator(setIndex int) iter.Seq[uint64]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implémentation interne
|
||||||
|
|
||||||
|
### Primitives bas niveau
|
||||||
|
|
||||||
|
**`varint.go`** : encode/decode varint uint64
|
||||||
|
|
||||||
|
```go
|
||||||
|
func EncodeVarint(w io.Writer, v uint64) (int, error)
|
||||||
|
func DecodeVarint(r io.Reader) (uint64, error)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Format .kdi
|
||||||
|
|
||||||
|
**`kdi_writer.go`** : écriture d'un fichier .kdi à partir d'un flux
|
||||||
|
trié de uint64 (delta-encode au vol).
|
||||||
|
|
||||||
|
```go
|
||||||
|
type KdiWriter struct { ... }
|
||||||
|
func NewKdiWriter(path string) (*KdiWriter, error)
|
||||||
|
func (w *KdiWriter) Write(kmer uint64) error
|
||||||
|
func (w *KdiWriter) Close() error
|
||||||
|
```
|
||||||
|
|
||||||
|
**`kdi_reader.go`** : lecture streaming d'un fichier .kdi (décode
|
||||||
|
les deltas au vol).
|
||||||
|
|
||||||
|
```go
|
||||||
|
type KdiReader struct { ... }
|
||||||
|
func NewKdiReader(path string) (*KdiReader, error)
|
||||||
|
func (r *KdiReader) Next() (uint64, bool)
|
||||||
|
func (r *KdiReader) Count() uint64
|
||||||
|
func (r *KdiReader) Close() error
|
||||||
|
```
|
||||||
|
|
||||||
|
### Format .skm
|
||||||
|
|
||||||
|
**`skm_writer.go`** : écriture de super-kmers encodés 2 bits/base.
|
||||||
|
|
||||||
|
```go
|
||||||
|
type SkmWriter struct { ... }
|
||||||
|
func NewSkmWriter(path string) (*SkmWriter, error)
|
||||||
|
func (w *SkmWriter) Write(sk SuperKmer) error
|
||||||
|
func (w *SkmWriter) Close() error
|
||||||
|
```
|
||||||
|
|
||||||
|
**`skm_reader.go`** : lecture de super-kmers depuis un fichier .skm.
|
||||||
|
|
||||||
|
```go
|
||||||
|
type SkmReader struct { ... }
|
||||||
|
func NewSkmReader(path string) (*SkmReader, error)
|
||||||
|
func (r *SkmReader) Next() (SuperKmer, bool)
|
||||||
|
func (r *SkmReader) Close() error
|
||||||
|
```
|
||||||
|
|
||||||
|
### Merge streaming
|
||||||
|
|
||||||
|
**`kdi_merge.go`** : k-way merge de plusieurs flux triés.
|
||||||
|
|
||||||
|
```go
|
||||||
|
type KWayMerge struct { ... }
|
||||||
|
func NewKWayMerge(readers []*KdiReader) *KWayMerge
|
||||||
|
func (m *KWayMerge) Next() (kmer uint64, count int, ok bool)
|
||||||
|
func (m *KWayMerge) Close() error
|
||||||
|
```
|
||||||
|
|
||||||
|
### Builder
|
||||||
|
|
||||||
|
**`kmer_set_builder.go`** : construction d'un KmerSetGroup.
|
||||||
|
|
||||||
|
Le builder gère :
|
||||||
|
- P × N écrivains .skm bufferisés (un par partition × set)
|
||||||
|
- À la clôture : traitement partition par partition
|
||||||
|
(parallélisable sur plusieurs cores)
|
||||||
|
|
||||||
|
Gestion mémoire des buffers d'écriture :
|
||||||
|
- Chaque SkmWriter a un buffer I/O de taille raisonnable (~64 Ko)
|
||||||
|
- Avec P=1024 et N=1 : 1024 × 64 Ko = 64 Mo de buffers
|
||||||
|
- Avec P=1024 et N=10 : 640 Mo de buffers
|
||||||
|
- Pas de buffer de k-mers en RAM : tout est écrit sur disque
|
||||||
|
immédiatement via les super-kmers
|
||||||
|
|
||||||
|
RAM pendant Close() (tri d'une partition) :
|
||||||
|
- Charger les super-kmers → extraire les k-mers → tableau []uint64
|
||||||
|
- Avec P=1024 et 10^10 k-mers/set : ~10^7 k-mers/partition × 8 = ~80 Mo
|
||||||
|
- Avec FrequencyFilter (doublons) et couverture 30× :
|
||||||
|
~3×10^8/partition × 8 = ~2.4 Go (ajustable via P)
|
||||||
|
|
||||||
|
### Structure disk-based
|
||||||
|
|
||||||
|
**`kmer_set_disk.go`** : KmerSetGroup en lecture seule.
|
||||||
|
|
||||||
|
**`kmer_set_disk_ops.go`** : opérations ensemblistes par merge
|
||||||
|
streaming partition par partition.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Ce qui change par rapport à l'API actuelle
|
||||||
|
|
||||||
|
### Changements de sémantique
|
||||||
|
|
||||||
|
| Aspect | Ancien (roaring) | Nouveau (disk-based) |
|
||||||
|
|---|---|---|
|
||||||
|
| Stockage | En mémoire (roaring64.Bitmap) | Sur disque (.kdi delta-encoded) |
|
||||||
|
| Temporaire construction | En mémoire | Super-kmers sur disque (.skm 2 bits/base) |
|
||||||
|
| Mutabilité | Mutable à tout moment | Builder → Close() → immutable |
|
||||||
|
| Opérations ensemblistes | Résultat en mémoire | Résultat sur disque (nouveau répertoire) |
|
||||||
|
| Contains | O(1) roaring lookup | O(log n) recherche binaire sur .kdi |
|
||||||
|
| Itération | Roaring iterator | Streaming décodage delta-varint |
|
||||||
|
|
||||||
|
### API conservée (signatures identiques ou quasi-identiques)
|
||||||
|
|
||||||
|
- `KmerSetGroup` : `K()`, `Size()`, `Id()`, `SetId()`
|
||||||
|
- Toute l'API attributs
|
||||||
|
- `JaccardDistanceMatrix()`, `JaccardSimilarityMatrix()`
|
||||||
|
- `Len()`, `Contains()`
|
||||||
|
|
||||||
|
### API modifiée
|
||||||
|
|
||||||
|
- `Union()`, `Intersect()`, etc. : ajout du paramètre `outputDir`
|
||||||
|
- `QuorumAtLeast()`, etc. : idem
|
||||||
|
- Construction : `NewKmerSetGroupBuilder()` + `AddSequence()` + `Close()`
|
||||||
|
au lieu de manipulation directe
|
||||||
|
|
||||||
|
### API supprimée
|
||||||
|
|
||||||
|
- `KmerSet` comme type distinct (remplacé par KmerSetGroup singleton)
|
||||||
|
- `FrequencyFilter` comme type distinct (mode du Builder)
|
||||||
|
- Tout accès direct à `roaring64.Bitmap`
|
||||||
|
- `KmerSet.Copy()` (copie de répertoire à la place)
|
||||||
|
- `KmerSet.Union()`, `.Intersect()`, `.Difference()` (deviennent méthodes
|
||||||
|
de KmerSetGroup avec outputDir)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Fichiers à créer / modifier dans pkg/obikmer
|
||||||
|
|
||||||
|
### Nouveaux fichiers
|
||||||
|
|
||||||
|
| Fichier | Contenu |
|
||||||
|
|---|---|
|
||||||
|
| `varint.go` | Encode/Decode varint uint64 |
|
||||||
|
| `kdi_writer.go` | Écrivain de fichiers .kdi (delta-encoded) |
|
||||||
|
| `kdi_reader.go` | Lecteur streaming de fichiers .kdi |
|
||||||
|
| `skm_writer.go` | Écrivain de super-kmers encodés 2 bits/base |
|
||||||
|
| `skm_reader.go` | Lecteur de super-kmers depuis .skm |
|
||||||
|
| `kdi_merge.go` | K-way merge streaming de flux triés |
|
||||||
|
| `kmer_set_builder.go` | KmerSetGroupBuilder (construction) |
|
||||||
|
| `kmer_set_disk.go` | KmerSetGroup disk-based (lecture, métadonnées) |
|
||||||
|
| `kmer_set_disk_ops.go` | Opérations ensemblistes streaming |
|
||||||
|
|
||||||
|
### Fichiers à supprimer
|
||||||
|
|
||||||
|
| Fichier | Raison |
|
||||||
|
|---|---|
|
||||||
|
| `kmer_set.go` | Remplacé par kmer_set_disk.go |
|
||||||
|
| `kmer_set_group.go` | Idem |
|
||||||
|
| `kmer_set_attributes.go` | Intégré dans kmer_set_disk.go |
|
||||||
|
| `kmer_set_persistence.go` | L'index est nativement sur disque |
|
||||||
|
| `kmer_set_group_quorum.go` | Intégré dans kmer_set_disk_ops.go |
|
||||||
|
| `frequency_filter.go` | Mode du Builder, plus de type séparé |
|
||||||
|
| `kmer_index_builder.go` | Remplacé par kmer_set_builder.go |
|
||||||
|
|
||||||
|
### Fichiers conservés tels quels
|
||||||
|
|
||||||
|
| Fichier | Contenu |
|
||||||
|
|---|---|
|
||||||
|
| `encodekmer.go` | Encodage/décodage k-mers |
|
||||||
|
| `superkmer.go` | Structure SuperKmer |
|
||||||
|
| `superkmer_iter.go` | IterSuperKmers, IterCanonicalKmers |
|
||||||
|
| `encodefourmer.go` | Encode4mer |
|
||||||
|
| `counting.go` | Count4Mer |
|
||||||
|
| `kmermap.go` | KmerMap (usage indépendant) |
|
||||||
|
| `debruijn.go` | Graphe de de Bruijn |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Ordre d'implémentation
|
||||||
|
|
||||||
|
1. `varint.go` + tests
|
||||||
|
2. `skm_writer.go` + `skm_reader.go` + tests
|
||||||
|
3. `kdi_writer.go` + `kdi_reader.go` + tests
|
||||||
|
4. `kdi_merge.go` + tests
|
||||||
|
5. `kmer_set_builder.go` + tests (construction + Close)
|
||||||
|
6. `kmer_set_disk.go` (structure, métadonnées, Open)
|
||||||
|
7. `kmer_set_disk_ops.go` + tests (Union, Intersect, Quorum, Jaccard)
|
||||||
|
8. Adaptation de `pkg/obitools/obikindex/`
|
||||||
|
9. Suppression des anciens fichiers roaring
|
||||||
|
10. Adaptation des tests existants
|
||||||
|
|
||||||
|
Chaque étape est testable indépendamment.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dépendances externes
|
||||||
|
|
||||||
|
### Supprimées
|
||||||
|
|
||||||
|
- `github.com/RoaringBitmap/roaring` : plus nécessaire pour les
|
||||||
|
index k-mers (vérifier si d'autres packages l'utilisent encore)
|
||||||
|
|
||||||
|
### Ajoutées
|
||||||
|
|
||||||
|
- Aucune. Varint, delta-encoding, merge, encodage 2 bits/base :
|
||||||
|
tout est implémentable en Go standard.
|
||||||
264
blackboard/Prospective/large_sequence_parsing.md
Normal file
264
blackboard/Prospective/large_sequence_parsing.md
Normal file
@@ -0,0 +1,264 @@
|
|||||||
|
# Optimisation du parsing des grandes séquences
|
||||||
|
|
||||||
|
## Contexte
|
||||||
|
|
||||||
|
OBITools4 doit pouvoir traiter des séquences de taille chromosomique (plusieurs Gbp), notamment
|
||||||
|
issues de fichiers GenBank/EMBL (assemblages de génomes) ou de fichiers FASTA convertis depuis
|
||||||
|
ces formats.
|
||||||
|
|
||||||
|
## Architecture actuelle
|
||||||
|
|
||||||
|
### Pipeline de lecture (`pkg/obiformats/`)
|
||||||
|
|
||||||
|
```
|
||||||
|
ReadFileChunk (goroutine)
|
||||||
|
→ ChannelFileChunk
|
||||||
|
→ N × _ParseGenbankFile / _ParseFastaFile (goroutines)
|
||||||
|
→ IBioSequence
|
||||||
|
```
|
||||||
|
|
||||||
|
`ReadFileChunk` (`file_chunk_read.go`) lit le fichier par morceaux via une chaîne de
|
||||||
|
`PieceOfChunk` (rope). Chaque nœud fait `fileChunkSize` bytes :
|
||||||
|
|
||||||
|
- GenBank/EMBL : 128 MB (`1024*1024*128`)
|
||||||
|
- FASTA/FASTQ : 1 MB (`1024*1024`)
|
||||||
|
|
||||||
|
La chaîne est accumulée jusqu'à trouver la fin du dernier enregistrement complet (splitter),
|
||||||
|
puis `Pack()` est appelé pour fusionner tous les nœuds en un seul buffer contigu. Ce buffer
|
||||||
|
est transmis au parseur via `FileChunk.Raw *bytes.Buffer`.
|
||||||
|
|
||||||
|
### Parseur GenBank (`genbank_read.go`)
|
||||||
|
|
||||||
|
`GenbankChunkParser` reçoit un `io.Reader` sur le buffer packé, lit ligne par ligne via
|
||||||
|
`bufio.NewReader` (buffer 4096 bytes), et pour chaque ligne de la section `ORIGIN` :
|
||||||
|
|
||||||
|
```go
|
||||||
|
line = string(bline) // allocation par ligne
|
||||||
|
cleanline := strings.TrimSpace(line) // allocation
|
||||||
|
parts := strings.SplitN(cleanline, " ", 7) // allocation []string + substrings
|
||||||
|
for i := 1; i < lparts; i++ {
|
||||||
|
seqBytes.WriteString(parts[i])
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Point positif : `seqBytes` est pré-alloué grâce à `lseq` extrait de la ligne `LOCUS`.
|
||||||
|
|
||||||
|
### Parseur FASTA (`fastaseq_read.go`)
|
||||||
|
|
||||||
|
`FastaChunkParser` lit **octet par octet** via `scanner.ReadByte()`. Pour 3 Gbp :
|
||||||
|
3 milliards d'appels. `seqBytes` est un `bytes.Buffer{}` sans pré-allocation.
|
||||||
|
|
||||||
|
## Problème principal
|
||||||
|
|
||||||
|
Pour une séquence de plusieurs Gbp, `Pack()` fusionne une chaîne de ~N nœuds de 128 MB en
|
||||||
|
un seul buffer contigu. C'est une allocation de N × 128 MB suivie d'une copie de toutes les
|
||||||
|
données. Bien que l'implémentation de `Pack()` soit efficace (libère les nœuds au fur et à
|
||||||
|
mesure via `slices.Grow`), la copie est inévitable avec l'architecture actuelle.
|
||||||
|
|
||||||
|
De plus, le parseur GenBank produit des dizaines de millions d'allocations temporaires pour
|
||||||
|
parser la section `ORIGIN` (une par ligne).
|
||||||
|
|
||||||
|
## Invariant clé découvert
|
||||||
|
|
||||||
|
**Si la rope a plus d'un nœud, le premier nœud seul ne se termine pas sur une frontière
|
||||||
|
d'enregistrement** (pas de `//\n` en fin de `piece1`).
|
||||||
|
|
||||||
|
Preuve par construction dans `ReadFileChunk` :
|
||||||
|
- `splitter` est appelé dès le premier nœud (ligne 157)
|
||||||
|
- Si `end >= 0` → frontière trouvée dans 128 MB → boucle interne sautée → rope à 1 nœud
|
||||||
|
- Si `end < 0` → boucle interne ajoute des nœuds → rope à ≥ 2 nœuds
|
||||||
|
|
||||||
|
Corollaire : si rope à 1 nœud, `Pack()` ne fait rien (aucun nœud suivant).
|
||||||
|
|
||||||
|
**Attention** : rope à ≥ 2 nœuds ne signifie pas qu'il n'y a qu'une seule séquence dans
|
||||||
|
la rope. La rope packée peut contenir plusieurs enregistrements complets. Exemple : records
|
||||||
|
de 80 MB → `nextpieces` (48 MB de reste) + nouveau nœud (128 MB) = rope à 2 nœuds
|
||||||
|
contenant 2 records complets + début d'un troisième.
|
||||||
|
|
||||||
|
L'invariant dit seulement que `piece1` seul est incomplet — pas que la rope entière
|
||||||
|
ne contient qu'un seul record.
|
||||||
|
|
||||||
|
**Invariant : le dernier FileChunk envoyé finit sur une frontière d'enregistrement.**
|
||||||
|
|
||||||
|
Deux chemins dans `ReadFileChunk` :
|
||||||
|
|
||||||
|
1. **Chemin normal** (`end >= 0` via `splitter`) : le buffer est explicitement tronqué à
|
||||||
|
`end` (ligne 200 : `pieces.data = pieces.data[:end]`). Frontière garantie par construction
|
||||||
|
pour tous les formats. ✓
|
||||||
|
|
||||||
|
2. **Chemin EOF** (`end < 0`, `end = pieces.Len()`) : tout le reste du fichier est envoyé.
|
||||||
|
- **GenBank/EMBL** : présuppose fichier bien formé (se termine par `//\n`). Le parseur
|
||||||
|
lève un `log.Fatalf` sur tout état inattendu — filet de sécurité suffisant. ✓
|
||||||
|
- **FASTQ** : présupposé, vérifié par le parseur. ✓
|
||||||
|
- **FASTA** : garanti par le format lui-même (fin d'enregistrement = EOF ou `>`). ✓
|
||||||
|
|
||||||
|
**Hypothèse de travail adoptée** : les fichiers d'entrée sont bien formés. Dans le pire cas,
|
||||||
|
le parseur lèvera une erreur explicite. Il n'y a pas de risque de corruption silencieuse.
|
||||||
|
|
||||||
|
## Piste d'optimisation : se dispenser de Pack()
|
||||||
|
|
||||||
|
### Idée centrale
|
||||||
|
|
||||||
|
Au lieu de fusionner la rope avant de la passer au parseur, **parser directement la rope
|
||||||
|
nœud par nœud**, et **écrire la séquence compactée in-place dans le premier nœud**.
|
||||||
|
|
||||||
|
Pourquoi c'est sûr :
|
||||||
|
- Le header (LOCUS, DEFINITION, SOURCE, FEATURES) est **petit** et traité en premier
|
||||||
|
- La séquence (ORIGIN) est **à la fin** du record
|
||||||
|
- Au moment d'écrire la séquence depuis l'offset 0 de `piece1`, le pointeur de lecture
|
||||||
|
est profond dans la rope (offset >> 0) → jamais de collision
|
||||||
|
- La séquence compactée est toujours plus courte que les données brutes
|
||||||
|
|
||||||
|
### Pré-allocation
|
||||||
|
|
||||||
|
Pour GenBank/EMBL : `lseq` est connu dès la ligne `LOCUS`/`ID` (première ligne, dans
|
||||||
|
`piece1`). On peut faire `slices.Grow(piece1.data, lseq)` dès ce moment.
|
||||||
|
|
||||||
|
Pour FASTA : pas de taille garantie dans le header, mais `rope.Len()` donne un majorant.
|
||||||
|
On peut utiliser `rope.Len() / 2` comme estimation initiale.
|
||||||
|
|
||||||
|
### Gestion des jonctions entre nœuds
|
||||||
|
|
||||||
|
Une ligne peut chevaucher deux nœuds (rare avec 128 MB, mais possible). Solution : carry
|
||||||
|
buffer de ~128 bytes pour les quelques bytes en fin de nœud.
|
||||||
|
|
||||||
|
### Cas FASTA/FASTQ multi-séquences
|
||||||
|
|
||||||
|
Un FileChunk peut contenir N séquences (notamment FASTA/FASTQ courts). Dans ce cas
|
||||||
|
l'écriture in-place dans `piece1` n'est pas applicable directement — on écrase des données
|
||||||
|
nécessaires aux séquences suivantes.
|
||||||
|
|
||||||
|
Stratégie par cas :
|
||||||
|
- **Rope à 1 nœud** (record ≤ 128 MB) : `Pack()` est trivial (no-op), parseur actuel OK
|
||||||
|
- **Rope à ≥ 2 nœuds** : par l'invariant, `piece1` ne contient pas de record complet →
|
||||||
|
une seule grande séquence → in-place applicable
|
||||||
|
|
||||||
|
### Format d'une ligne séquence GenBank (Après ORIGIN)
|
||||||
|
|
||||||
|
```
|
||||||
|
/^ *[0-9]+( [nuc]{10}){0,5} [nuc]{1,10}/
|
||||||
|
```
|
||||||
|
|
||||||
|
### Format d'une ligne séquence GenBank (Après SQ)
|
||||||
|
|
||||||
|
La ligne SQ contient aussi la taille de la séquence
|
||||||
|
|
||||||
|
```
|
||||||
|
/^ *( [nuc]{10}){0,5} [nuc]{1,10} *[0-9]+/
|
||||||
|
```
|
||||||
|
|
||||||
|
Compactage in-place sur `bline` ([]byte brut, sans conversion `string`) :
|
||||||
|
|
||||||
|
```go
|
||||||
|
w := 0
|
||||||
|
i := 0
|
||||||
|
for i < len(bline) && bline[i] == ' ' { i++ } // skip indentation
|
||||||
|
for i < len(bline) && bline[i] <= '9' { i++ } // skip position number
|
||||||
|
for ; i < len(bline); i++ {
|
||||||
|
if bline[i] != ' ' {
|
||||||
|
bline[w] = bline[i]
|
||||||
|
w++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// écrire bline[:w] directement dans piece1.data[seqOffset:]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Changements nécessaires
|
||||||
|
|
||||||
|
1. **`FileChunk`** : exposer la rope `*PieceOfChunk` non-packée en plus (ou à la place)
|
||||||
|
de `Raw *bytes.Buffer`
|
||||||
|
2. **`GenbankChunkParser` / `EmblChunkParser`** : accepter `*PieceOfChunk`, parser la
|
||||||
|
rope séquentiellement avec carry buffer pour les jonctions
|
||||||
|
3. **`FastaChunkParser`** : idem, avec in-place conditionnel selon taille de la rope
|
||||||
|
4. **`ReadFileChunk`** : ne pas appeler `Pack()` avant envoi sur le channel (ou version
|
||||||
|
alternative `ReadFileChunkRope`)
|
||||||
|
|
||||||
|
## Fichiers concernés
|
||||||
|
|
||||||
|
- `pkg/obiformats/file_chunk_read.go` — structure rope, `ReadFileChunk`
|
||||||
|
- `pkg/obiformats/genbank_read.go` — `GenbankChunkParser`, `_ParseGenbankFile`
|
||||||
|
- `pkg/obiformats/embl_read.go` — `EmblChunkParser`, `ReadEMBL`
|
||||||
|
- `pkg/obiformats/fastaseq_read.go` — `FastaChunkParser`, `_ParseFastaFile`
|
||||||
|
- `pkg/obiformats/fastqseq_read.go` — parseur FASTQ (même structure)
|
||||||
|
|
||||||
|
## Plan d'implémentation : parseur GenBank sur rope
|
||||||
|
|
||||||
|
### Contexte
|
||||||
|
|
||||||
|
Baseline mesurée : `obiconvert gbpln640.seq.gz` → 49s real, 42s user, 29s sys, **57 GB RSS**.
|
||||||
|
Le sys élevé indique des allocations massives. Deux causes :
|
||||||
|
1. `Pack()` : fusionne toute la rope (N × 128 MB) en un buffer contigu avant de parser
|
||||||
|
2. Parser ORIGIN : `string(bline)` + `TrimSpace` + `SplitN` × millions de lignes
|
||||||
|
|
||||||
|
### 1. `gbRopeScanner`
|
||||||
|
|
||||||
|
Struct de lecture ligne par ligne sur la rope, sans allocation heap :
|
||||||
|
|
||||||
|
```go
|
||||||
|
type gbRopeScanner struct {
|
||||||
|
current *PieceOfChunk
|
||||||
|
pos int
|
||||||
|
carry [256]byte // stack-allocated, max GenBank line = 80 chars
|
||||||
|
carryN int
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`ReadLine()` :
|
||||||
|
- Cherche `\n` dans `current.data[pos:]` via `bytes.IndexByte`
|
||||||
|
- Si trouvé sans carry : retourne slice direct du node (zéro alloc)
|
||||||
|
- Si trouvé avec carry : copie dans carry buffer, retourne `carry[:n]`
|
||||||
|
- Si non trouvé : copie le reste dans carry, avance au node suivant, recommence
|
||||||
|
- EOF : retourne `carry[:carryN]` puis nil
|
||||||
|
|
||||||
|
`extractSequence(dest []byte, UtoT bool) int` :
|
||||||
|
- Scan direct des bytes pour section ORIGIN, sans passer par ReadLine
|
||||||
|
- Machine d'états : lineStart → skip espaces/digits → copier nucléotides dans dest
|
||||||
|
- Stop sur `//` en début de ligne
|
||||||
|
- Zéro allocation, UtoT inline
|
||||||
|
|
||||||
|
### 2. `GenbankChunkParserRope`
|
||||||
|
|
||||||
|
```go
|
||||||
|
func GenbankChunkParserRope(source string, rope *PieceOfChunk,
|
||||||
|
withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error)
|
||||||
|
```
|
||||||
|
|
||||||
|
- Même machine d'états que `GenbankChunkParser`, sur `[]byte` (`bytes.HasPrefix`)
|
||||||
|
- LOCUS : extrait `id` et `lseq` par scan direct (remplace `_seqlenght_rx`)
|
||||||
|
- FEATURES / default inFeature : taxid extrait par scan de `/db_xref="taxon:`
|
||||||
|
dans la source feature ; `featBytes` rempli seulement si `withFeatureTable=true`
|
||||||
|
- DEFINITION : toujours conservée
|
||||||
|
- ORIGIN : `dest = make([]byte, 0, lseq+20)` puis `s.extractSequence(dest, UtoT)`
|
||||||
|
|
||||||
|
### 3. Modifications `_ParseGenbankFile` et `ReadGenbank`
|
||||||
|
|
||||||
|
`_ParseGenbankFile` utilise `chunk.Rope` :
|
||||||
|
```go
|
||||||
|
sequences, err := GenbankChunkParserRope(chunk.Source, chunk.Rope, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
`ReadGenbank` passe `pack=false` :
|
||||||
|
```go
|
||||||
|
entry_channel := ReadFileChunk(..., false)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Ce qui NE change pas
|
||||||
|
|
||||||
|
- `GenbankChunkParser` reste (référence, tests)
|
||||||
|
- `ReadFileChunk`, `Pack()`, autres parseurs (EMBL, FASTA, FASTQ) : inchangés
|
||||||
|
|
||||||
|
### 5. Gains attendus
|
||||||
|
|
||||||
|
- **RSS** : pic ≈ 128 MB × workers (au lieu de N × 128 MB)
|
||||||
|
- **Temps sys** : élimination des mmap/munmap pour les gros buffers
|
||||||
|
- **Temps user** : ~50M allocations éliminées
|
||||||
|
|
||||||
|
### 6. Vérification
|
||||||
|
|
||||||
|
```bash
|
||||||
|
/usr/local/go/bin/go build ./...
|
||||||
|
diff <(obiconvert gbpln640.seq.gz) gbpln640.reference.fasta
|
||||||
|
cd bugs/genbank && ./benchmark.sh gbpln640.seq.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
Cible : RSS < 1 GB, temps comparable ou meilleur.
|
||||||
34
cmd/obitools/obik/main.go
Normal file
34
cmd/obitools/obik/main.go
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obik"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
defer obiseq.LogBioSeqStatus()
|
||||||
|
|
||||||
|
opt, parser := obioptions.GenerateSubcommandParser(
|
||||||
|
"obik",
|
||||||
|
"Manage disk-based kmer indices",
|
||||||
|
obik.OptionSet,
|
||||||
|
)
|
||||||
|
|
||||||
|
_, remaining := parser(os.Args)
|
||||||
|
|
||||||
|
err := opt.Dispatch(context.Background(), remaining)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, getoptions.ErrorHelpCalled) {
|
||||||
|
os.Exit(0)
|
||||||
|
}
|
||||||
|
log.Fatalf("Error: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,47 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"os"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obilowmask"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
|
||||||
)
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
|
|
||||||
defer obiseq.LogBioSeqStatus()
|
|
||||||
|
|
||||||
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
|
|
||||||
// f, err := os.Create("cpu.pprof")
|
|
||||||
// if err != nil {
|
|
||||||
// log.Fatal(err)
|
|
||||||
// }
|
|
||||||
// pprof.StartCPUProfile(f)
|
|
||||||
// defer pprof.StopCPUProfile()
|
|
||||||
|
|
||||||
// go tool trace cpu.trace
|
|
||||||
// ftrace, err := os.Create("cpu.trace")
|
|
||||||
// if err != nil {
|
|
||||||
// log.Fatal(err)
|
|
||||||
// }
|
|
||||||
// trace.Start(ftrace)
|
|
||||||
// defer trace.Stop()
|
|
||||||
|
|
||||||
optionParser := obioptions.GenerateOptionParser(
|
|
||||||
"obimicrosat",
|
|
||||||
"looks for microsatellites sequences in a sequence file",
|
|
||||||
obilowmask.OptionSet)
|
|
||||||
|
|
||||||
_, args := optionParser(os.Args)
|
|
||||||
|
|
||||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
|
||||||
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
|
||||||
|
|
||||||
selected := obilowmask.CLISequenceEntropyMasker(sequences)
|
|
||||||
obiconvert.CLIWriteBioSequences(selected, true)
|
|
||||||
obiutils.WaitForLastPipe()
|
|
||||||
|
|
||||||
}
|
|
||||||
@@ -1,34 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"os"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obisuperkmer"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
|
||||||
)
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
// Generate option parser
|
|
||||||
optionParser := obioptions.GenerateOptionParser(
|
|
||||||
"obisuperkmer",
|
|
||||||
"extract super k-mers from sequence files",
|
|
||||||
obisuperkmer.OptionSet)
|
|
||||||
|
|
||||||
// Parse command-line arguments
|
|
||||||
_, args := optionParser(os.Args)
|
|
||||||
|
|
||||||
// Read input sequences
|
|
||||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
|
||||||
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
|
||||||
|
|
||||||
// Extract super k-mers
|
|
||||||
superkmers := obisuperkmer.CLIExtractSuperKmers(sequences)
|
|
||||||
|
|
||||||
// Write output sequences
|
|
||||||
obiconvert.CLIWriteBioSequences(superkmers, true)
|
|
||||||
|
|
||||||
// Wait for pipeline completion
|
|
||||||
obiutils.WaitForLastPipe()
|
|
||||||
}
|
|
||||||
46
go.mod
46
go.mod
@@ -1,56 +1,50 @@
|
|||||||
module git.metabarcoding.org/obitools/obitools4/obitools4
|
module git.metabarcoding.org/obitools/obitools4/obitools4
|
||||||
|
|
||||||
go 1.23.4
|
go 1.26.1
|
||||||
|
|
||||||
toolchain go1.24.2
|
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/DavidGamba/go-getoptions v0.28.0
|
github.com/DavidGamba/go-getoptions v0.33.0
|
||||||
github.com/PaesslerAG/gval v1.2.2
|
github.com/PaesslerAG/gval v1.2.4
|
||||||
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df
|
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df
|
||||||
github.com/buger/jsonparser v1.1.1
|
github.com/buger/jsonparser v1.1.1
|
||||||
github.com/chen3feng/stl4go v0.1.1
|
github.com/chen3feng/stl4go v0.1.1
|
||||||
github.com/dlclark/regexp2 v1.11.4
|
github.com/dlclark/regexp2 v1.11.5
|
||||||
github.com/goccy/go-json v0.10.3
|
github.com/goccy/go-json v0.10.6
|
||||||
github.com/klauspost/pgzip v1.2.6
|
github.com/klauspost/pgzip v1.2.6
|
||||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58
|
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58
|
||||||
|
github.com/pelletier/go-toml/v2 v2.2.4
|
||||||
github.com/rrethy/ahocorasick v1.0.0
|
github.com/rrethy/ahocorasick v1.0.0
|
||||||
github.com/schollz/progressbar/v3 v3.13.1
|
github.com/schollz/progressbar/v3 v3.19.0
|
||||||
github.com/sirupsen/logrus v1.9.3
|
github.com/sirupsen/logrus v1.9.4
|
||||||
github.com/stretchr/testify v1.8.4
|
github.com/stretchr/testify v1.10.0
|
||||||
github.com/tevino/abool/v2 v2.1.0
|
github.com/tevino/abool/v2 v2.1.0
|
||||||
github.com/yuin/gopher-lua v1.1.1
|
github.com/yuin/gopher-lua v1.1.1
|
||||||
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa
|
golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90
|
||||||
gonum.org/v1/gonum v0.14.0
|
gonum.org/v1/gonum v0.17.0
|
||||||
gopkg.in/yaml.v3 v3.0.1
|
gopkg.in/yaml.v3 v3.0.1
|
||||||
scientificgo.org/special v0.0.0
|
scientificgo.org/special v0.0.0
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/RoaringBitmap/roaring v1.9.4 // indirect
|
|
||||||
github.com/bits-and-blooms/bitset v1.12.0 // indirect
|
|
||||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||||
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect
|
github.com/goombaio/orderedmap v0.0.0-20180925151256-3da0e2f905f9 // indirect
|
||||||
github.com/kr/pretty v0.3.1 // indirect
|
github.com/kr/pretty v0.3.1 // indirect
|
||||||
github.com/kr/text v0.2.0 // indirect
|
github.com/kr/text v0.2.0 // indirect
|
||||||
github.com/mschoch/smat v0.2.0 // indirect
|
|
||||||
github.com/pelletier/go-toml/v2 v2.2.4 // indirect
|
|
||||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||||
github.com/rogpeppe/go-internal v1.12.0 // indirect
|
github.com/rogpeppe/go-internal v1.12.0 // indirect
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/dsnet/compress v0.0.1
|
github.com/dsnet/compress v0.0.1
|
||||||
github.com/gabriel-vasile/mimetype v1.4.3
|
github.com/gabriel-vasile/mimetype v1.4.13
|
||||||
github.com/goombaio/orderedset v0.0.0-20180925151225-8e67b20a9b77
|
github.com/goombaio/orderedset v0.0.0-20180925151225-8e67b20a9b77
|
||||||
github.com/klauspost/compress v1.17.2
|
github.com/klauspost/compress v1.18.4
|
||||||
github.com/mattn/go-runewidth v0.0.15 // indirect
|
github.com/mattn/go-runewidth v0.0.21 // indirect
|
||||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
|
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
|
||||||
github.com/rivo/uniseg v0.4.4 // indirect
|
github.com/rivo/uniseg v0.4.7 // indirect
|
||||||
github.com/shopspring/decimal v1.3.1 // indirect
|
github.com/shopspring/decimal v1.4.0 // indirect
|
||||||
github.com/ulikunitz/xz v0.5.11
|
github.com/ulikunitz/xz v0.5.15
|
||||||
golang.org/x/net v0.35.0 // indirect
|
golang.org/x/sys v0.42.0 // indirect
|
||||||
golang.org/x/sys v0.30.0 // indirect
|
golang.org/x/term v0.41.0 // indirect
|
||||||
golang.org/x/term v0.29.0 // indirect
|
|
||||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c
|
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c
|
||||||
)
|
)
|
||||||
|
|||||||
97
go.sum
97
go.sum
@@ -1,40 +1,41 @@
|
|||||||
github.com/DavidGamba/go-getoptions v0.28.0 h1:18wgEvfZdrlfIhVDGEBO3Dl0fkOyXqXLa0tLMCKxM1c=
|
github.com/DavidGamba/go-getoptions v0.33.0 h1:8xCPH87Yy5avYenygyHVlqqm8RpymH0YFe4a7IWlarE=
|
||||||
github.com/DavidGamba/go-getoptions v0.28.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84=
|
github.com/DavidGamba/go-getoptions v0.33.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84=
|
||||||
github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E=
|
github.com/PaesslerAG/gval v1.2.4 h1:rhX7MpjJlcxYwL2eTTYIOBUyEKZ+A96T9vQySWkVUiU=
|
||||||
github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
|
github.com/PaesslerAG/gval v1.2.4/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
|
||||||
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
|
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
|
||||||
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
|
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
|
||||||
github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv2QzDdQ=
|
|
||||||
github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
|
|
||||||
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0=
|
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0=
|
||||||
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM=
|
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM=
|
||||||
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
|
|
||||||
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
|
||||||
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
|
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
|
||||||
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
|
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
|
||||||
github.com/chen3feng/stl4go v0.1.1 h1:0L1+mDw7pomftKDruM23f1mA7miavOj6C6MZeadzN2Q=
|
github.com/chen3feng/stl4go v0.1.1 h1:0L1+mDw7pomftKDruM23f1mA7miavOj6C6MZeadzN2Q=
|
||||||
github.com/chen3feng/stl4go v0.1.1/go.mod h1:5ml3psLgETJjRJnMbPE+JiHLrCpt+Ajc2weeTECXzWU=
|
github.com/chen3feng/stl4go v0.1.1/go.mod h1:5ml3psLgETJjRJnMbPE+JiHLrCpt+Ajc2weeTECXzWU=
|
||||||
|
github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM=
|
||||||
|
github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY=
|
||||||
|
github.com/clipperhouse/uax29/v2 v2.2.0 h1:ChwIKnQN3kcZteTXMgb1wztSgaU+ZemkgWdohwgs8tY=
|
||||||
|
github.com/clipperhouse/uax29/v2 v2.2.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM=
|
||||||
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
|
||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
|
github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZQ=
|
||||||
github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
|
github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
|
||||||
github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q=
|
github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q=
|
||||||
github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo=
|
github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo=
|
||||||
github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
|
github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
|
||||||
github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
|
github.com/gabriel-vasile/mimetype v1.4.13 h1:46nXokslUBsAJE/wMsp5gtO500a4F3Nkz9Ufpk2AcUM=
|
||||||
github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk=
|
github.com/gabriel-vasile/mimetype v1.4.13/go.mod h1:d+9Oxyo1wTzWdyVUPMmXFvp4F9tea18J8ufA774AB3s=
|
||||||
github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA=
|
github.com/goccy/go-json v0.10.6 h1:p8HrPJzOakx/mn/bQtjgNjdTcN+/S6FcG2CTtQOrHVU=
|
||||||
github.com/goccy/go-json v0.10.3/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
|
github.com/goccy/go-json v0.10.6/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
|
||||||
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 h1:SajEQ6tktpF9SRIuzbiPOX9AEZZ53Bvw0k9Mzrts8Lg=
|
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||||
|
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||||
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419/go.mod h1:YKu81H3RSd1cFh0d7NhvUoTtUC9IY/vBX0WUQb1/o4Y=
|
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419/go.mod h1:YKu81H3RSd1cFh0d7NhvUoTtUC9IY/vBX0WUQb1/o4Y=
|
||||||
|
github.com/goombaio/orderedmap v0.0.0-20180925151256-3da0e2f905f9 h1:vFjPvFavIiDY71bQ9HIxPQBANvNl1SmFC4fgg5xRkho=
|
||||||
|
github.com/goombaio/orderedmap v0.0.0-20180925151256-3da0e2f905f9/go.mod h1:YKu81H3RSd1cFh0d7NhvUoTtUC9IY/vBX0WUQb1/o4Y=
|
||||||
github.com/goombaio/orderedset v0.0.0-20180925151225-8e67b20a9b77 h1:4dvq1tGHn1Y9KSRY0OZ24Khki4+4U+ZrA//YYsdUlJU=
|
github.com/goombaio/orderedset v0.0.0-20180925151225-8e67b20a9b77 h1:4dvq1tGHn1Y9KSRY0OZ24Khki4+4U+ZrA//YYsdUlJU=
|
||||||
github.com/goombaio/orderedset v0.0.0-20180925151225-8e67b20a9b77/go.mod h1:HPelMYpOyy0XvglpBbmZ3krZpwaHmszj/vQNlnETPTM=
|
github.com/goombaio/orderedset v0.0.0-20180925151225-8e67b20a9b77/go.mod h1:HPelMYpOyy0XvglpBbmZ3krZpwaHmszj/vQNlnETPTM=
|
||||||
github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw=
|
|
||||||
github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
|
github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
|
||||||
github.com/klauspost/compress v1.17.2 h1:RlWWUY/Dr4fL8qk9YG7DTZ7PDgME2V4csBXA8L/ixi4=
|
github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c=
|
||||||
github.com/klauspost/compress v1.17.2/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
|
github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
||||||
github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
|
github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
|
||||||
github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU=
|
github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU=
|
||||||
github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
|
github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
|
||||||
@@ -45,14 +46,10 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
|||||||
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
||||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||||
github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
github.com/mattn/go-runewidth v0.0.21 h1:jJKAZiQH+2mIinzCJIaIG9Be1+0NR+5sz/lYEEjdM8w=
|
||||||
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
github.com/mattn/go-runewidth v0.0.21/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
|
||||||
github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U=
|
|
||||||
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
|
||||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
|
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
|
||||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
|
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
|
||||||
github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
|
|
||||||
github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
|
|
||||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
|
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
|
||||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
|
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
|
||||||
github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
|
github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
|
||||||
@@ -60,50 +57,40 @@ github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8
|
|||||||
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
|
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
|
||||||
github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
|
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||||
github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
|
||||||
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
|
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
|
||||||
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
|
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
|
||||||
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
|
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
|
||||||
github.com/rrethy/ahocorasick v1.0.0 h1:YKkCB+E5PXc0xmLfMrWbfNht8vG9Re97IHSWZk/Lk8E=
|
github.com/rrethy/ahocorasick v1.0.0 h1:YKkCB+E5PXc0xmLfMrWbfNht8vG9Re97IHSWZk/Lk8E=
|
||||||
github.com/rrethy/ahocorasick v1.0.0/go.mod h1:nq8oScE7Vy1rOppoQxpQiiDmPHuKCuk9rXrNcxUV3R0=
|
github.com/rrethy/ahocorasick v1.0.0/go.mod h1:nq8oScE7Vy1rOppoQxpQiiDmPHuKCuk9rXrNcxUV3R0=
|
||||||
github.com/schollz/progressbar/v3 v3.13.1 h1:o8rySDYiQ59Mwzy2FELeHY5ZARXZTVJC7iHD6PEFUiE=
|
github.com/schollz/progressbar/v3 v3.19.0 h1:Ea18xuIRQXLAUidVDox3AbwfUhD0/1IvohyTutOIFoc=
|
||||||
github.com/schollz/progressbar/v3 v3.13.1/go.mod h1:xvrbki8kfT1fzWzBT/UZd9L6GA+jdL7HAgq2RFnO6fQ=
|
github.com/schollz/progressbar/v3 v3.19.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec=
|
||||||
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
|
|
||||||
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
|
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
|
||||||
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
|
github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k=
|
||||||
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
|
github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME=
|
||||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w=
|
||||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g=
|
||||||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
|
||||||
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
|
||||||
github.com/tevino/abool/v2 v2.1.0 h1:7w+Vf9f/5gmKT4m4qkayb33/92M+Um45F2BkHOR+L/c=
|
github.com/tevino/abool/v2 v2.1.0 h1:7w+Vf9f/5gmKT4m4qkayb33/92M+Um45F2BkHOR+L/c=
|
||||||
github.com/tevino/abool/v2 v2.1.0/go.mod h1:+Lmlqk6bHDWHqN1cbxqhwEAwMPXgc8I1SDEamtseuXY=
|
github.com/tevino/abool/v2 v2.1.0/go.mod h1:+Lmlqk6bHDWHqN1cbxqhwEAwMPXgc8I1SDEamtseuXY=
|
||||||
github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8=
|
github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8=
|
||||||
github.com/ulikunitz/xz v0.5.11 h1:kpFauv27b6ynzBNT/Xy+1k+fK4WswhN/6PN5WhFAGw8=
|
github.com/ulikunitz/xz v0.5.15 h1:9DNdB5s+SgV3bQ2ApL10xRc35ck0DuIX/isZvIk+ubY=
|
||||||
github.com/ulikunitz/xz v0.5.11/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
|
github.com/ulikunitz/xz v0.5.15/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
|
||||||
github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M=
|
github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M=
|
||||||
github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
|
github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
|
||||||
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ=
|
golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90 h1:jiDhWWeC7jfWqR9c/uplMOqJ0sbNlNWv0UkzE0vX1MA=
|
||||||
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE=
|
golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90/go.mod h1:xE1HEv6b+1SCZ5/uscMRjUBKtIxworgEcEi+/n9NQDQ=
|
||||||
golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
|
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||||
golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
|
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU=
|
||||||
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A=
|
||||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4=
|
||||||
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
|
gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E=
|
||||||
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
|
||||||
golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U=
|
|
||||||
golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU=
|
|
||||||
golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s=
|
|
||||||
gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0=
|
|
||||||
gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU=
|
|
||||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
scientificgo.org/special v0.0.0 h1:P6WJkECo6tgtvZAEfNXl+KEB9ReAatjKAeX8U07mjSc=
|
scientificgo.org/special v0.0.0 h1:P6WJkECo6tgtvZAEfNXl+KEB9ReAatjKAeX8U07mjSc=
|
||||||
|
|||||||
@@ -52,6 +52,8 @@ golang.org/x/image v0.6.0/go.mod h1:MXLdDR43H7cDJq5GEGXEVeeNhPgi+YYEQ2pC1byI1x0=
|
|||||||
golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY=
|
golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY=
|
||||||
golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||||
golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||||
|
golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
|
||||||
|
golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
|
||||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 h1:uVc8UZUe6tr40fFVnUP5Oj+veunVezqYl9z7DYw9xzw=
|
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 h1:uVc8UZUe6tr40fFVnUP5Oj+veunVezqYl9z7DYw9xzw=
|
||||||
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
|
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
|
||||||
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
|
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ INSTALL_DIR="/usr/local"
|
|||||||
OBITOOLS_PREFIX=""
|
OBITOOLS_PREFIX=""
|
||||||
VERSION=""
|
VERSION=""
|
||||||
LIST_VERSIONS=false
|
LIST_VERSIONS=false
|
||||||
|
JOBS=1
|
||||||
|
|
||||||
# Help message
|
# Help message
|
||||||
function display_help {
|
function display_help {
|
||||||
@@ -21,6 +22,7 @@ function display_help {
|
|||||||
echo " gobigrep command instead of obigrep)."
|
echo " gobigrep command instead of obigrep)."
|
||||||
echo " -v, --version Install a specific version (e.g., 4.4.8)."
|
echo " -v, --version Install a specific version (e.g., 4.4.8)."
|
||||||
echo " If not specified, installs the latest version."
|
echo " If not specified, installs the latest version."
|
||||||
|
echo " -j, --jobs Number of parallel jobs for compilation (default: 1)."
|
||||||
echo " -l, --list List all available versions and exit."
|
echo " -l, --list List all available versions and exit."
|
||||||
echo " -h, --help Display this help message."
|
echo " -h, --help Display this help message."
|
||||||
echo ""
|
echo ""
|
||||||
@@ -65,6 +67,10 @@ while [ "$#" -gt 0 ]; do
|
|||||||
VERSION="$2"
|
VERSION="$2"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
|
-j|--jobs)
|
||||||
|
JOBS="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
-l|--list)
|
-l|--list)
|
||||||
LIST_VERSIONS=true
|
LIST_VERSIONS=true
|
||||||
shift
|
shift
|
||||||
@@ -122,9 +128,15 @@ mkdir -p "${WORK_DIR}/cache" \
|
|||||||
exit 1)
|
exit 1)
|
||||||
|
|
||||||
# Create installation directory
|
# Create installation directory
|
||||||
mkdir -p "${INSTALL_DIR}/bin" 2> /dev/null \
|
if ! mkdir -p "${INSTALL_DIR}/bin" 2>/dev/null; then
|
||||||
|| (echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
|
if [ ! -w "$(dirname "${INSTALL_DIR}")" ] && [ ! -w "${INSTALL_DIR}" ]; then
|
||||||
sudo mkdir -p "${INSTALL_DIR}/bin")
|
echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
|
||||||
|
sudo mkdir -p "${INSTALL_DIR}/bin"
|
||||||
|
else
|
||||||
|
echo "Error: Could not create ${INSTALL_DIR}/bin (check path or disk space)" 1>&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ ! -d "${INSTALL_DIR}/bin" ]]; then
|
if [[ ! -d "${INSTALL_DIR}/bin" ]]; then
|
||||||
echo "Could not create ${INSTALL_DIR}/bin directory for installing obitools" 1>&2
|
echo "Could not create ${INSTALL_DIR}/bin directory for installing obitools" 1>&2
|
||||||
@@ -171,22 +183,24 @@ GOURL=$(curl -s "${URL}${GOFILE}" \
|
|||||||
|
|
||||||
echo "Installing Go from: $GOURL" 1>&2
|
echo "Installing Go from: $GOURL" 1>&2
|
||||||
|
|
||||||
curl -s "$GOURL" | tar zxf -
|
curl --progress-bar "$GOURL" | tar zxf -
|
||||||
|
|
||||||
PATH="$(pwd)/go/bin:$PATH"
|
export GOROOT="$(pwd)/go"
|
||||||
|
PATH="${GOROOT}/bin:$PATH"
|
||||||
export PATH
|
export PATH
|
||||||
GOPATH="$(pwd)/go"
|
export GOPATH="$(pwd)/gopath"
|
||||||
export GOPATH
|
|
||||||
export GOCACHE="$(pwd)/cache"
|
export GOCACHE="$(pwd)/cache"
|
||||||
|
export GOTOOLCHAIN=local
|
||||||
|
|
||||||
|
echo "GOROOT=$GOROOT" 1>&2
|
||||||
echo "GOCACHE=$GOCACHE" 1>&2
|
echo "GOCACHE=$GOCACHE" 1>&2
|
||||||
mkdir -p "$GOCACHE"
|
mkdir -p "$GOPATH" "$GOCACHE"
|
||||||
|
|
||||||
# Download OBITools4 source
|
# Download OBITools4 source
|
||||||
echo "Downloading OBITools4 v${VERSION}..." 1>&2
|
echo "Downloading OBITools4 v${VERSION}..." 1>&2
|
||||||
echo "Source URL: $OBIURL4" 1>&2
|
echo "Source URL: $OBIURL4" 1>&2
|
||||||
|
|
||||||
if ! curl -sL "$OBIURL4" > obitools4.zip; then
|
if ! curl --progress-bar -L "$OBIURL4" > obitools4.zip; then
|
||||||
echo "Error: Could not download OBITools4 version ${VERSION}" 1>&2
|
echo "Error: Could not download OBITools4 version ${VERSION}" 1>&2
|
||||||
echo "Please check that this version exists with: $0 --list" 1>&2
|
echo "Please check that this version exists with: $0 --list" 1>&2
|
||||||
exit 1
|
exit 1
|
||||||
@@ -208,16 +222,29 @@ mkdir -p vendor
|
|||||||
|
|
||||||
# Build with or without prefix
|
# Build with or without prefix
|
||||||
if [[ -z "$OBITOOLS_PREFIX" ]] ; then
|
if [[ -z "$OBITOOLS_PREFIX" ]] ; then
|
||||||
make GOFLAGS="-buildvcs=false"
|
make -j"${JOBS}" obitools GOFLAGS="-buildvcs=false"
|
||||||
else
|
else
|
||||||
make GOFLAGS="-buildvcs=false" OBITOOLS_PREFIX="${OBITOOLS_PREFIX}"
|
make -j"${JOBS}" obitools GOFLAGS="-buildvcs=false" OBITOOLS_PREFIX="${OBITOOLS_PREFIX}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Install binaries
|
# Install binaries
|
||||||
echo "Installing binaries to ${INSTALL_DIR}/bin..." 1>&2
|
echo "Installing binaries to ${INSTALL_DIR}/bin..." 1>&2
|
||||||
(cp build/* "${INSTALL_DIR}/bin" 2> /dev/null) \
|
if ! cp build/* "${INSTALL_DIR}/bin" 2>/dev/null; then
|
||||||
|| (echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
|
if [ ! -w "${INSTALL_DIR}/bin" ]; then
|
||||||
sudo cp build/* "${INSTALL_DIR}/bin")
|
echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
|
||||||
|
sudo cp build/* "${INSTALL_DIR}/bin"
|
||||||
|
else
|
||||||
|
echo "Error: Could not copy binaries to ${INSTALL_DIR}/bin" 1>&2
|
||||||
|
echo " Source files: $(ls build/ 2>/dev/null || echo 'none found')" 1>&2
|
||||||
|
echo "" 1>&2
|
||||||
|
echo "The build directory has been preserved for manual recovery:" 1>&2
|
||||||
|
echo " $(pwd)/build/" 1>&2
|
||||||
|
echo "You can install manually with:" 1>&2
|
||||||
|
echo " cp $(pwd)/build/* ${INSTALL_DIR}/bin/" 1>&2
|
||||||
|
popd > /dev/null || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
popd > /dev/null || exit
|
popd > /dev/null || exit
|
||||||
|
|
||||||
|
|||||||
@@ -1,292 +0,0 @@
|
|||||||
# Filtre de Fréquence avec v Niveaux de Roaring Bitmaps
|
|
||||||
|
|
||||||
## Algorithme
|
|
||||||
|
|
||||||
```go
|
|
||||||
Pour chaque k-mer rencontré dans les données:
|
|
||||||
c = 0
|
|
||||||
tant que (k-mer ∈ index[c] ET c < v):
|
|
||||||
c++
|
|
||||||
|
|
||||||
si c < v:
|
|
||||||
index[c].insert(k-mer)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Résultat** : `index[v-1]` contient les k-mers vus **≥ v fois**
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Exemple d'exécution (v=3)
|
|
||||||
|
|
||||||
```
|
|
||||||
Données:
|
|
||||||
Read1: kmer X
|
|
||||||
Read2: kmer X
|
|
||||||
Read3: kmer X (X vu 3 fois)
|
|
||||||
Read4: kmer Y
|
|
||||||
Read5: kmer Y (Y vu 2 fois)
|
|
||||||
Read6: kmer Z (Z vu 1 fois)
|
|
||||||
|
|
||||||
Exécution:
|
|
||||||
|
|
||||||
Read1 (X):
|
|
||||||
c=0: X ∉ index[0] → index[0].add(X)
|
|
||||||
État: index[0]={X}, index[1]={}, index[2]={}
|
|
||||||
|
|
||||||
Read2 (X):
|
|
||||||
c=0: X ∈ index[0] → c=1
|
|
||||||
c=1: X ∉ index[1] → index[1].add(X)
|
|
||||||
État: index[0]={X}, index[1]={X}, index[2]={}
|
|
||||||
|
|
||||||
Read3 (X):
|
|
||||||
c=0: X ∈ index[0] → c=1
|
|
||||||
c=1: X ∈ index[1] → c=2
|
|
||||||
c=2: X ∉ index[2] → index[2].add(X)
|
|
||||||
État: index[0]={X}, index[1]={X}, index[2]={X}
|
|
||||||
|
|
||||||
Read4 (Y):
|
|
||||||
c=0: Y ∉ index[0] → index[0].add(Y)
|
|
||||||
État: index[0]={X,Y}, index[1]={X}, index[2]={X}
|
|
||||||
|
|
||||||
Read5 (Y):
|
|
||||||
c=0: Y ∈ index[0] → c=1
|
|
||||||
c=1: Y ∉ index[1] → index[1].add(Y)
|
|
||||||
État: index[0]={X,Y}, index[1]={X,Y}, index[2]={X}
|
|
||||||
|
|
||||||
Read6 (Z):
|
|
||||||
c=0: Z ∉ index[0] → index[0].add(Z)
|
|
||||||
État: index[0]={X,Y,Z}, index[1]={X,Y}, index[2]={X}
|
|
||||||
|
|
||||||
Résultat final:
|
|
||||||
index[0] (freq≥1): {X, Y, Z}
|
|
||||||
index[1] (freq≥2): {X, Y}
|
|
||||||
index[2] (freq≥3): {X} ← K-mers filtrés ✓
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Utilisation
|
|
||||||
|
|
||||||
```go
|
|
||||||
// Créer le filtre
|
|
||||||
filter := obikmer.NewFrequencyFilter(31, 3) // k=31, minFreq=3
|
|
||||||
|
|
||||||
// Ajouter les séquences
|
|
||||||
for _, read := range reads {
|
|
||||||
filter.AddSequence(read)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Récupérer les k-mers filtrés (freq ≥ 3)
|
|
||||||
filtered := filter.GetFilteredSet("filtered")
|
|
||||||
fmt.Printf("K-mers de qualité: %d\n", filtered.Cardinality())
|
|
||||||
|
|
||||||
// Statistiques
|
|
||||||
stats := filter.Stats()
|
|
||||||
fmt.Println(stats.String())
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Performance
|
|
||||||
|
|
||||||
### Complexité
|
|
||||||
|
|
||||||
**Par k-mer** :
|
|
||||||
- Lookups : Moyenne ~v/2, pire cas v
|
|
||||||
- Insertions : 1 Add
|
|
||||||
- **Pas de Remove** ✅
|
|
||||||
|
|
||||||
**Total pour n k-mers** :
|
|
||||||
- Temps : O(n × v/2)
|
|
||||||
- Mémoire : O(unique_kmers × v × 2 bytes)
|
|
||||||
|
|
||||||
### Early exit pour distribution skewed
|
|
||||||
|
|
||||||
Avec distribution typique (séquençage) :
|
|
||||||
```
|
|
||||||
80% singletons → 1 lookup (early exit)
|
|
||||||
15% freq 2-3 → 2-3 lookups
|
|
||||||
5% freq ≥4 → jusqu'à v lookups
|
|
||||||
|
|
||||||
Moyenne réelle : ~2 lookups/kmer (au lieu de v/2)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Mémoire
|
|
||||||
|
|
||||||
### Pour 10^8 k-mers uniques
|
|
||||||
|
|
||||||
| v (minFreq) | Nombre bitmaps | Mémoire | vs map simple |
|
|
||||||
|-------------|----------------|---------|---------------|
|
|
||||||
| v=2 | 2 | ~400 MB | 6x moins |
|
|
||||||
| v=3 | 3 | ~600 MB | 4x moins |
|
|
||||||
| v=5 | 5 | ~1 GB | 2.4x moins |
|
|
||||||
| v=10 | 10 | ~2 GB | 1.2x moins |
|
|
||||||
| v=20 | 20 | ~4 GB | ~égal |
|
|
||||||
|
|
||||||
**Note** : Avec distribution skewed (beaucoup de singletons), la mémoire réelle est bien plus faible car les niveaux hauts ont peu d'éléments.
|
|
||||||
|
|
||||||
### Exemple réaliste (séquençage)
|
|
||||||
|
|
||||||
Pour 10^8 k-mers totaux, v=3 :
|
|
||||||
```
|
|
||||||
Distribution:
|
|
||||||
80% singletons → 80M dans index[0]
|
|
||||||
15% freq 2-3 → 15M dans index[1]
|
|
||||||
5% freq ≥3 → 5M dans index[2]
|
|
||||||
|
|
||||||
Mémoire:
|
|
||||||
index[0]: 80M × 2 bytes = 160 MB
|
|
||||||
index[1]: 15M × 2 bytes = 30 MB
|
|
||||||
index[2]: 5M × 2 bytes = 10 MB
|
|
||||||
Total: ~200 MB ✅
|
|
||||||
|
|
||||||
vs map simple: 80M × 24 bytes = ~2 GB
|
|
||||||
Réduction: 10x
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Comparaison des approches
|
|
||||||
|
|
||||||
| Approche | Mémoire (10^8 kmers) | Passes | Lookups/kmer | Quand utiliser |
|
|
||||||
|----------|----------------------|--------|--------------|----------------|
|
|
||||||
| **v-Bitmaps** | **200-600 MB** | **1** | **~2 (avg)** | **Standard** ✅ |
|
|
||||||
| Map simple | 2.4 GB | 1 | 1 | Si RAM illimitée |
|
|
||||||
| Multi-pass | 400 MB | v | v | Si I/O pas cher |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Avantages de v-Bitmaps
|
|
||||||
|
|
||||||
✅ **Une seule passe** sur les données
|
|
||||||
✅ **Mémoire optimale** avec Roaring bitmaps
|
|
||||||
✅ **Pas de Remove** (seulement Contains + Add)
|
|
||||||
✅ **Early exit** efficace sur singletons
|
|
||||||
✅ **Scalable** jusqu'à v~10-20
|
|
||||||
✅ **Simple** à implémenter et comprendre
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Cas d'usage typiques
|
|
||||||
|
|
||||||
### 1. Éliminer erreurs de séquençage
|
|
||||||
|
|
||||||
```go
|
|
||||||
filter := obikmer.NewFrequencyFilter(31, 3)
|
|
||||||
|
|
||||||
// Traiter FASTQ
|
|
||||||
for read := range StreamFastq("sample.fastq") {
|
|
||||||
filter.AddSequence(read)
|
|
||||||
}
|
|
||||||
|
|
||||||
// K-mers de qualité (pas d'erreurs)
|
|
||||||
cleaned := filter.GetFilteredSet("cleaned")
|
|
||||||
```
|
|
||||||
|
|
||||||
**Résultat** : Élimine 70-80% des k-mers (erreurs)
|
|
||||||
|
|
||||||
### 2. Assemblage de génome
|
|
||||||
|
|
||||||
```go
|
|
||||||
filter := obikmer.NewFrequencyFilter(31, 2)
|
|
||||||
|
|
||||||
// Filtrer avant l'assemblage
|
|
||||||
for read := range reads {
|
|
||||||
filter.AddSequence(read)
|
|
||||||
}
|
|
||||||
|
|
||||||
solidKmers := filter.GetFilteredSet("solid")
|
|
||||||
// Utiliser solidKmers pour le graphe de Bruijn
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Comparaison de génomes
|
|
||||||
|
|
||||||
```go
|
|
||||||
collection := obikmer.NewKmerSetCollection(31)
|
|
||||||
|
|
||||||
for _, genome := range genomes {
|
|
||||||
filter := obikmer.NewFrequencyFilter(31, 3)
|
|
||||||
filter.AddSequences(genome.Reads)
|
|
||||||
|
|
||||||
cleaned := filter.GetFilteredSet(genome.ID)
|
|
||||||
collection.Add(cleaned)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Analyses comparatives sur k-mers de qualité
|
|
||||||
matrix := collection.ParallelPairwiseJaccard(8)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Limites
|
|
||||||
|
|
||||||
**Pour v > 20** :
|
|
||||||
- Trop de lookups (v lookups/kmer)
|
|
||||||
- Mémoire importante (v × 200MB pour 10^8 kmers)
|
|
||||||
|
|
||||||
**Solutions alternatives pour v > 20** :
|
|
||||||
- Utiliser map simple (9 bytes/kmer) si RAM disponible
|
|
||||||
- Algorithme différent (sketch, probabiliste)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Optimisations possibles
|
|
||||||
|
|
||||||
### 1. Parallélisation
|
|
||||||
|
|
||||||
```go
|
|
||||||
// Traiter plusieurs fichiers en parallèle
|
|
||||||
filters := make([]*FrequencyFilter, numFiles)
|
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
|
||||||
for i, file := range files {
|
|
||||||
wg.Add(1)
|
|
||||||
go func(idx int, f string) {
|
|
||||||
defer wg.Done()
|
|
||||||
filters[idx] = ProcessFile(f, k, minFreq)
|
|
||||||
}(i, file)
|
|
||||||
}
|
|
||||||
wg.Wait()
|
|
||||||
|
|
||||||
// Merger les résultats
|
|
||||||
merged := MergeFilters(filters)
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Streaming avec seuil adaptatif
|
|
||||||
|
|
||||||
```go
|
|
||||||
// Commencer avec v=5, réduire progressivement
|
|
||||||
filter := obikmer.NewFrequencyFilter(31, 5)
|
|
||||||
|
|
||||||
// ... traitement ...
|
|
||||||
|
|
||||||
// Si trop de mémoire, réduire à v=3
|
|
||||||
if filter.MemoryUsage() > threshold {
|
|
||||||
filter = ConvertToLowerThreshold(filter, 3)
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Récapitulatif final
|
|
||||||
|
|
||||||
**Pour filtrer les k-mers par fréquence ≥ v :**
|
|
||||||
|
|
||||||
1. **Créer** : `filter := NewFrequencyFilter(k, v)`
|
|
||||||
2. **Traiter** : `filter.AddSequence(read)` pour chaque read
|
|
||||||
3. **Résultat** : `filtered := filter.GetFilteredSet(id)`
|
|
||||||
|
|
||||||
**Mémoire** : ~2v MB par million de k-mers uniques
|
|
||||||
**Temps** : Une seule passe, ~2 lookups/kmer en moyenne
|
|
||||||
**Optimal pour** : v ≤ 20, distribution skewed (séquençage)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Code fourni
|
|
||||||
|
|
||||||
1. **frequency_filter.go** - Implémentation complète
|
|
||||||
2. **examples_frequency_filter_final.go** - Exemples d'utilisation
|
|
||||||
|
|
||||||
**Tout est prêt à utiliser !** 🚀
|
|
||||||
@@ -1,320 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"obikmer"
|
|
||||||
)
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
// ==========================================
|
|
||||||
// EXEMPLE 1 : Utilisation basique
|
|
||||||
// ==========================================
|
|
||||||
fmt.Println("=== EXEMPLE 1 : Utilisation basique ===\n")
|
|
||||||
|
|
||||||
k := 31
|
|
||||||
minFreq := 3 // Garder les k-mers vus ≥3 fois
|
|
||||||
|
|
||||||
// Créer le filtre
|
|
||||||
filter := obikmer.NewFrequencyFilter(k, minFreq)
|
|
||||||
|
|
||||||
// Simuler des séquences avec différentes fréquences
|
|
||||||
sequences := [][]byte{
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X (freq=2)
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // Kmer X (freq=3) ✓
|
|
||||||
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Kmer Y
|
|
||||||
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Kmer Y (freq=2) ✗
|
|
||||||
[]byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // Kmer Z (freq=1) ✗
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Printf("Traitement de %d séquences...\n", len(sequences))
|
|
||||||
for _, seq := range sequences {
|
|
||||||
filter.AddSequence(seq)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Récupérer les k-mers filtrés
|
|
||||||
filtered := filter.GetFilteredSet("filtered")
|
|
||||||
fmt.Printf("\nK-mers avec freq ≥ %d: %d\n", minFreq, filtered.Cardinality())
|
|
||||||
|
|
||||||
// Statistiques
|
|
||||||
stats := filter.Stats()
|
|
||||||
fmt.Println("\n" + stats.String())
|
|
||||||
|
|
||||||
// ==========================================
|
|
||||||
// EXEMPLE 2 : Vérifier les niveaux
|
|
||||||
// ==========================================
|
|
||||||
fmt.Println("\n=== EXEMPLE 2 : Inspection des niveaux ===\n")
|
|
||||||
|
|
||||||
// Vérifier chaque niveau
|
|
||||||
for level := 0; level < minFreq; level++ {
|
|
||||||
levelSet := filter.GetKmersAtLevel(level)
|
|
||||||
fmt.Printf("Niveau %d (freq≥%d): %d k-mers\n",
|
|
||||||
level+1, level+1, levelSet.Cardinality())
|
|
||||||
}
|
|
||||||
|
|
||||||
// ==========================================
|
|
||||||
// EXEMPLE 3 : Données réalistes
|
|
||||||
// ==========================================
|
|
||||||
fmt.Println("\n=== EXEMPLE 3 : Simulation données séquençage ===\n")
|
|
||||||
|
|
||||||
filter2 := obikmer.NewFrequencyFilter(31, 3)
|
|
||||||
|
|
||||||
// Simuler un dataset réaliste :
|
|
||||||
// - 1000 reads
|
|
||||||
// - 80% contiennent des erreurs (singletons)
|
|
||||||
// - 15% vrais k-mers à basse fréquence
|
|
||||||
// - 5% vrais k-mers à haute fréquence
|
|
||||||
|
|
||||||
// Vraie séquence répétée
|
|
||||||
trueSeq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG")
|
|
||||||
for i := 0; i < 50; i++ {
|
|
||||||
filter2.AddSequence(trueSeq)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Séquence à fréquence moyenne
|
|
||||||
mediumSeq := []byte("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC")
|
|
||||||
for i := 0; i < 5; i++ {
|
|
||||||
filter2.AddSequence(mediumSeq)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Erreurs de séquençage (singletons)
|
|
||||||
for i := 0; i < 100; i++ {
|
|
||||||
errorSeq := []byte(fmt.Sprintf("TTTTTTTTTTTTTTTTTTTTTTTTTTTT%03d", i))
|
|
||||||
filter2.AddSequence(errorSeq)
|
|
||||||
}
|
|
||||||
|
|
||||||
stats2 := filter2.Stats()
|
|
||||||
fmt.Println(stats2.String())
|
|
||||||
|
|
||||||
fmt.Println("Distribution attendue:")
|
|
||||||
fmt.Println(" - Beaucoup de singletons (erreurs)")
|
|
||||||
fmt.Println(" - Peu de k-mers à haute fréquence (signal)")
|
|
||||||
fmt.Println(" → Filtrage efficace !")
|
|
||||||
|
|
||||||
// ==========================================
|
|
||||||
// EXEMPLE 4 : Tester différents seuils
|
|
||||||
// ==========================================
|
|
||||||
fmt.Println("\n=== EXEMPLE 4 : Comparaison de seuils ===\n")
|
|
||||||
|
|
||||||
testSeqs := [][]byte{
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"), // freq=5
|
|
||||||
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"),
|
|
||||||
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"),
|
|
||||||
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // freq=3
|
|
||||||
[]byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // freq=1
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, minFreq := range []int{2, 3, 5} {
|
|
||||||
f := obikmer.NewFrequencyFilter(31, minFreq)
|
|
||||||
f.AddSequences(testSeqs)
|
|
||||||
|
|
||||||
fmt.Printf("minFreq=%d: %d k-mers retenus (%.2f MB)\n",
|
|
||||||
minFreq,
|
|
||||||
f.Cardinality(),
|
|
||||||
float64(f.MemoryUsage())/1024/1024)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ==========================================
|
|
||||||
// EXEMPLE 5 : Comparaison mémoire
|
|
||||||
// ==========================================
|
|
||||||
fmt.Println("\n=== EXEMPLE 5 : Comparaison mémoire ===\n")
|
|
||||||
|
|
||||||
filter3 := obikmer.NewFrequencyFilter(31, 3)
|
|
||||||
|
|
||||||
// Simuler 10000 séquences
|
|
||||||
for i := 0; i < 10000; i++ {
|
|
||||||
seq := make([]byte, 100)
|
|
||||||
for j := range seq {
|
|
||||||
seq[j] = "ACGT"[(i+j)%4]
|
|
||||||
}
|
|
||||||
filter3.AddSequence(seq)
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Println(filter3.CompareWithSimpleMap())
|
|
||||||
|
|
||||||
// ==========================================
|
|
||||||
// EXEMPLE 6 : Workflow complet
|
|
||||||
// ==========================================
|
|
||||||
fmt.Println("\n=== EXEMPLE 6 : Workflow complet ===\n")
|
|
||||||
|
|
||||||
fmt.Println("1. Créer le filtre")
|
|
||||||
finalFilter := obikmer.NewFrequencyFilter(31, 3)
|
|
||||||
|
|
||||||
fmt.Println("2. Traiter les données (simulation)")
|
|
||||||
// En pratique : lire depuis FASTQ
|
|
||||||
// for read := range ReadFastq("data.fastq") {
|
|
||||||
// finalFilter.AddSequence(read)
|
|
||||||
// }
|
|
||||||
|
|
||||||
// Simulation
|
|
||||||
for i := 0; i < 1000; i++ {
|
|
||||||
seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG")
|
|
||||||
finalFilter.AddSequence(seq)
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Println("3. Récupérer les k-mers filtrés")
|
|
||||||
result := finalFilter.GetFilteredSet("final")
|
|
||||||
|
|
||||||
fmt.Println("4. Utiliser le résultat")
|
|
||||||
fmt.Printf(" K-mers de qualité: %d\n", result.Cardinality())
|
|
||||||
fmt.Printf(" Mémoire utilisée: %.2f MB\n", float64(finalFilter.MemoryUsage())/1024/1024)
|
|
||||||
|
|
||||||
fmt.Println("5. Sauvegarder (optionnel)")
|
|
||||||
// result.Save("filtered_kmers.bin")
|
|
||||||
|
|
||||||
// ==========================================
|
|
||||||
// EXEMPLE 7 : Vérification individuelle
|
|
||||||
// ==========================================
|
|
||||||
fmt.Println("\n=== EXEMPLE 7 : Vérification de k-mers spécifiques ===\n")
|
|
||||||
|
|
||||||
checkFilter := obikmer.NewFrequencyFilter(31, 3)
|
|
||||||
|
|
||||||
testSeq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG")
|
|
||||||
for i := 0; i < 5; i++ {
|
|
||||||
checkFilter.AddSequence(testSeq)
|
|
||||||
}
|
|
||||||
|
|
||||||
var kmers []uint64
|
|
||||||
kmers = obikmer.EncodeKmers(testSeq, 31, &kmers)
|
|
||||||
|
|
||||||
if len(kmers) > 0 {
|
|
||||||
testKmer := kmers[0]
|
|
||||||
|
|
||||||
fmt.Printf("K-mer test: 0x%016X\n", testKmer)
|
|
||||||
fmt.Printf(" Présent dans filtre: %v\n", checkFilter.Contains(testKmer))
|
|
||||||
fmt.Printf(" Fréquence approx: %d\n", checkFilter.GetFrequency(testKmer))
|
|
||||||
}
|
|
||||||
|
|
||||||
// ==========================================
|
|
||||||
// EXEMPLE 8 : Intégration avec collection
|
|
||||||
// ==========================================
|
|
||||||
fmt.Println("\n=== EXEMPLE 8 : Intégration avec KmerSetCollection ===\n")
|
|
||||||
|
|
||||||
// Créer une collection de génomes filtrés
|
|
||||||
collection := obikmer.NewKmerSetCollection(31)
|
|
||||||
|
|
||||||
genomes := map[string][][]byte{
|
|
||||||
"Genome1": {
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
|
||||||
[]byte("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"), // Erreur
|
|
||||||
},
|
|
||||||
"Genome2": {
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACG"),
|
|
||||||
[]byte("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"), // Erreur
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for id, sequences := range genomes {
|
|
||||||
// Filtrer chaque génome
|
|
||||||
genomeFilter := obikmer.NewFrequencyFilter(31, 3)
|
|
||||||
genomeFilter.AddSequences(sequences)
|
|
||||||
|
|
||||||
// Ajouter à la collection
|
|
||||||
filteredSet := genomeFilter.GetFilteredSet(id)
|
|
||||||
collection.Add(filteredSet)
|
|
||||||
|
|
||||||
fmt.Printf("%s: %d k-mers de qualité\n", id, filteredSet.Cardinality())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Analyser la collection
|
|
||||||
fmt.Println("\nAnalyse comparative:")
|
|
||||||
collectionStats := collection.ComputeStats()
|
|
||||||
fmt.Printf(" Core genome: %d k-mers\n", collectionStats.CoreSize)
|
|
||||||
fmt.Printf(" Pan genome: %d k-mers\n", collectionStats.PanGenomeSize)
|
|
||||||
|
|
||||||
// ==========================================
|
|
||||||
// RÉSUMÉ
|
|
||||||
// ==========================================
|
|
||||||
fmt.Println("\n=== RÉSUMÉ ===\n")
|
|
||||||
fmt.Println("Le FrequencyFilter permet de:")
|
|
||||||
fmt.Println(" ✓ Filtrer les k-mers par fréquence minimale")
|
|
||||||
fmt.Println(" ✓ Utiliser une mémoire optimale avec Roaring bitmaps")
|
|
||||||
fmt.Println(" ✓ Une seule passe sur les données")
|
|
||||||
fmt.Println(" ✓ Éliminer efficacement les erreurs de séquençage")
|
|
||||||
fmt.Println("")
|
|
||||||
fmt.Println("Workflow typique:")
|
|
||||||
fmt.Println(" 1. filter := NewFrequencyFilter(k, minFreq)")
|
|
||||||
fmt.Println(" 2. for each sequence: filter.AddSequence(seq)")
|
|
||||||
fmt.Println(" 3. filtered := filter.GetFilteredSet(id)")
|
|
||||||
fmt.Println(" 4. Utiliser filtered dans vos analyses")
|
|
||||||
}
|
|
||||||
|
|
||||||
// ==================================
|
|
||||||
// FONCTION HELPER POUR BENCHMARKS
|
|
||||||
// ==================================
|
|
||||||
|
|
||||||
func BenchmarkFrequencyFilter() {
|
|
||||||
k := 31
|
|
||||||
minFreq := 3
|
|
||||||
|
|
||||||
// Test avec différentes tailles
|
|
||||||
sizes := []int{1000, 10000, 100000}
|
|
||||||
|
|
||||||
fmt.Println("\n=== BENCHMARK ===\n")
|
|
||||||
|
|
||||||
for _, size := range sizes {
|
|
||||||
filter := obikmer.NewFrequencyFilter(k, minFreq)
|
|
||||||
|
|
||||||
// Générer des séquences
|
|
||||||
for i := 0; i < size; i++ {
|
|
||||||
seq := make([]byte, 100)
|
|
||||||
for j := range seq {
|
|
||||||
seq[j] = "ACGT"[(i+j)%4]
|
|
||||||
}
|
|
||||||
filter.AddSequence(seq)
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Printf("Size=%d reads:\n", size)
|
|
||||||
fmt.Printf(" Filtered k-mers: %d\n", filter.Cardinality())
|
|
||||||
fmt.Printf(" Memory: %.2f MB\n", float64(filter.MemoryUsage())/1024/1024)
|
|
||||||
fmt.Println()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ==================================
|
|
||||||
// FONCTION POUR DONNÉES RÉELLES
|
|
||||||
// ==================================
|
|
||||||
|
|
||||||
func ProcessRealData() {
|
|
||||||
// Exemple pour traiter de vraies données FASTQ
|
|
||||||
|
|
||||||
k := 31
|
|
||||||
minFreq := 3
|
|
||||||
|
|
||||||
filter := obikmer.NewFrequencyFilter(k, minFreq)
|
|
||||||
|
|
||||||
// Pseudo-code pour lire un FASTQ
|
|
||||||
/*
|
|
||||||
fastqFile := "sample.fastq"
|
|
||||||
reader := NewFastqReader(fastqFile)
|
|
||||||
|
|
||||||
for reader.HasNext() {
|
|
||||||
read := reader.Next()
|
|
||||||
filter.AddSequence(read.Sequence)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Récupérer le résultat
|
|
||||||
filtered := filter.GetFilteredSet("sample_filtered")
|
|
||||||
filtered.Save("sample_filtered_kmers.bin")
|
|
||||||
|
|
||||||
// Stats
|
|
||||||
stats := filter.Stats()
|
|
||||||
fmt.Println(stats.String())
|
|
||||||
*/
|
|
||||||
|
|
||||||
fmt.Println("Workflow pour données réelles:")
|
|
||||||
fmt.Println(" 1. Créer le filtre avec minFreq approprié (2-5 typique)")
|
|
||||||
fmt.Println(" 2. Stream les reads depuis FASTQ")
|
|
||||||
fmt.Println(" 3. Récupérer les k-mers filtrés")
|
|
||||||
fmt.Println(" 4. Utiliser pour assemblage/comparaison/etc.")
|
|
||||||
|
|
||||||
_ = filter // unused
|
|
||||||
}
|
|
||||||
BIN
logs_60535302930.zip
Normal file
BIN
logs_60535302930.zip
Normal file
Binary file not shown.
@@ -4,8 +4,8 @@
|
|||||||
# Here give the name of the test serie
|
# Here give the name of the test serie
|
||||||
#
|
#
|
||||||
|
|
||||||
TEST_NAME=obisuperkmer
|
TEST_NAME=obik-super
|
||||||
CMD=obisuperkmer
|
CMD=obik
|
||||||
|
|
||||||
######
|
######
|
||||||
#
|
#
|
||||||
@@ -16,7 +16,7 @@ TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
|
|||||||
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
|
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
|
||||||
export PATH="${OBITOOLS_DIR}:${PATH}"
|
export PATH="${OBITOOLS_DIR}:${PATH}"
|
||||||
|
|
||||||
MCMD="$(echo "${CMD:0:4}" | tr '[:lower:]' '[:upper:]')$(echo "${CMD:4}" | tr '[:upper:]' '[:lower:]')"
|
MCMD="OBIk-super"
|
||||||
|
|
||||||
TMPDIR="$(mktemp -d)"
|
TMPDIR="$(mktemp -d)"
|
||||||
ntest=0
|
ntest=0
|
||||||
@@ -65,31 +65,10 @@ log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
|||||||
####
|
####
|
||||||
#### Below are the tests
|
#### Below are the tests
|
||||||
####
|
####
|
||||||
#### Before each test :
|
|
||||||
#### - increment the variable ntest
|
|
||||||
####
|
|
||||||
#### Run the command as the condition of an if / then /else
|
|
||||||
#### - The command must return 0 on success
|
|
||||||
#### - The command must return an exit code different from 0 on failure
|
|
||||||
#### - The datafiles are stored in the same directory than the test script
|
|
||||||
#### - The test script directory is stored in the TEST_DIR variable
|
|
||||||
#### - If result files have to be produced they must be stored
|
|
||||||
#### in the temporary directory (TMPDIR variable)
|
|
||||||
####
|
|
||||||
#### then clause is executed on success of the command
|
|
||||||
#### - Write a success message using the log function
|
|
||||||
#### - increment the variable success
|
|
||||||
####
|
|
||||||
#### else clause is executed on failure of the command
|
|
||||||
#### - Write a failure message using the log function
|
|
||||||
#### - increment the variable failed
|
|
||||||
####
|
|
||||||
######################################################################
|
######################################################################
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
((ntest++))
|
((ntest++))
|
||||||
if $CMD -h > "${TMPDIR}/help.txt" 2>&1
|
if $CMD super -h > "${TMPDIR}/help.txt" 2>&1
|
||||||
then
|
then
|
||||||
log "$MCMD: printing help OK"
|
log "$MCMD: printing help OK"
|
||||||
((success++))
|
((success++))
|
||||||
@@ -100,7 +79,7 @@ fi
|
|||||||
|
|
||||||
# Test 1: Basic super k-mer extraction with default parameters
|
# Test 1: Basic super k-mer extraction with default parameters
|
||||||
((ntest++))
|
((ntest++))
|
||||||
if obisuperkmer "${TEST_DIR}/test_sequences.fasta" \
|
if $CMD super "${TEST_DIR}/test_sequences.fasta" \
|
||||||
> "${TMPDIR}/output_default.fasta" 2>&1
|
> "${TMPDIR}/output_default.fasta" 2>&1
|
||||||
then
|
then
|
||||||
log "$MCMD: basic extraction with default parameters OK"
|
log "$MCMD: basic extraction with default parameters OK"
|
||||||
@@ -148,7 +127,7 @@ fi
|
|||||||
|
|
||||||
# Test 5: Extract super k-mers with custom k and m parameters
|
# Test 5: Extract super k-mers with custom k and m parameters
|
||||||
((ntest++))
|
((ntest++))
|
||||||
if obisuperkmer -k 15 -m 7 "${TEST_DIR}/test_sequences.fasta" \
|
if $CMD super -k 15 -m 7 "${TEST_DIR}/test_sequences.fasta" \
|
||||||
> "${TMPDIR}/output_k15_m7.fasta" 2>&1
|
> "${TMPDIR}/output_k15_m7.fasta" 2>&1
|
||||||
then
|
then
|
||||||
log "$MCMD: extraction with custom k=15, m=7 OK"
|
log "$MCMD: extraction with custom k=15, m=7 OK"
|
||||||
@@ -172,7 +151,7 @@ fi
|
|||||||
|
|
||||||
# Test 7: Test with different output format (FASTA output explicitly)
|
# Test 7: Test with different output format (FASTA output explicitly)
|
||||||
((ntest++))
|
((ntest++))
|
||||||
if obisuperkmer --fasta-output -k 21 -m 11 \
|
if $CMD super --fasta-output -k 21 -m 11 \
|
||||||
"${TEST_DIR}/test_sequences.fasta" \
|
"${TEST_DIR}/test_sequences.fasta" \
|
||||||
> "${TMPDIR}/output_fasta.fasta" 2>&1
|
> "${TMPDIR}/output_fasta.fasta" 2>&1
|
||||||
then
|
then
|
||||||
@@ -209,7 +188,7 @@ fi
|
|||||||
|
|
||||||
# Test 10: Test with output file option
|
# Test 10: Test with output file option
|
||||||
((ntest++))
|
((ntest++))
|
||||||
if obisuperkmer -o "${TMPDIR}/output_file.fasta" \
|
if $CMD super -o "${TMPDIR}/output_file.fasta" \
|
||||||
"${TEST_DIR}/test_sequences.fasta" 2>&1
|
"${TEST_DIR}/test_sequences.fasta" 2>&1
|
||||||
then
|
then
|
||||||
log "$MCMD: output to file with -o option OK"
|
log "$MCMD: output to file with -o option OK"
|
||||||
|
|||||||
@@ -1,6 +1,12 @@
|
|||||||
package obidefault
|
package obidefault
|
||||||
|
|
||||||
var _BatchSize = 2000
|
// _BatchSize is the minimum number of sequences per batch (floor).
|
||||||
|
// Used as the minSeqs argument to RebatchBySize.
|
||||||
|
var _BatchSize = 1
|
||||||
|
|
||||||
|
// _BatchSizeMax is the maximum number of sequences per batch (ceiling).
|
||||||
|
// A batch is flushed when this count is reached regardless of memory usage.
|
||||||
|
var _BatchSizeMax = 2000
|
||||||
|
|
||||||
// SetBatchSize sets the size of the sequence batches.
|
// SetBatchSize sets the size of the sequence batches.
|
||||||
//
|
//
|
||||||
@@ -24,3 +30,42 @@ func BatchSize() int {
|
|||||||
func BatchSizePtr() *int {
|
func BatchSizePtr() *int {
|
||||||
return &_BatchSize
|
return &_BatchSize
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// BatchSizeMax returns the maximum number of sequences per batch.
|
||||||
|
func BatchSizeMax() int {
|
||||||
|
return _BatchSizeMax
|
||||||
|
}
|
||||||
|
|
||||||
|
func BatchSizeMaxPtr() *int {
|
||||||
|
return &_BatchSizeMax
|
||||||
|
}
|
||||||
|
|
||||||
|
// _BatchMem holds the maximum cumulative memory (in bytes) per batch when
|
||||||
|
// memory-based batching is requested. A value of 0 disables memory-based
|
||||||
|
// batching and falls back to count-based batching.
|
||||||
|
var _BatchMem = 128 * 1024 * 1024 // 128 MB default; set to 0 to disable
|
||||||
|
var _BatchMemStr = ""
|
||||||
|
|
||||||
|
// SetBatchMem sets the memory budget per batch in bytes.
|
||||||
|
func SetBatchMem(n int) {
|
||||||
|
_BatchMem = n
|
||||||
|
}
|
||||||
|
|
||||||
|
// BatchMem returns the current memory budget per batch in bytes.
|
||||||
|
// A value of 0 means memory-based batching is disabled.
|
||||||
|
func BatchMem() int {
|
||||||
|
return _BatchMem
|
||||||
|
}
|
||||||
|
|
||||||
|
func BatchMemPtr() *int {
|
||||||
|
return &_BatchMem
|
||||||
|
}
|
||||||
|
|
||||||
|
// BatchMemStr returns the raw --batch-mem string value as provided on the CLI.
|
||||||
|
func BatchMemStr() string {
|
||||||
|
return _BatchMemStr
|
||||||
|
}
|
||||||
|
|
||||||
|
func BatchMemStrPtr() *string {
|
||||||
|
return &_BatchMemStr
|
||||||
|
}
|
||||||
|
|||||||
@@ -161,6 +161,149 @@ func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obise
|
|||||||
return parser
|
return parser
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// extractEmblSeq scans the sequence section of an EMBL record directly on the
|
||||||
|
// rope. EMBL sequence lines start with 5 spaces followed by bases in groups of
|
||||||
|
// 10, separated by spaces, with a position number at the end. The section ends
|
||||||
|
// with "//".
|
||||||
|
func (s *ropeScanner) extractEmblSeq(dest []byte, UtoT bool) []byte {
|
||||||
|
// We use ReadLine and scan each line for bases (skip digits, spaces, newlines).
|
||||||
|
for {
|
||||||
|
line := s.ReadLine()
|
||||||
|
if line == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if len(line) >= 2 && line[0] == '/' && line[1] == '/' {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// Lines start with 5 spaces; bases follow separated by single spaces.
|
||||||
|
// Digits at the end are the position counter — skip them.
|
||||||
|
// Simplest: take every byte that is a letter.
|
||||||
|
for _, b := range line {
|
||||||
|
if b >= 'A' && b <= 'Z' {
|
||||||
|
b += 'a' - 'A'
|
||||||
|
}
|
||||||
|
if UtoT && b == 'u' {
|
||||||
|
b = 't'
|
||||||
|
}
|
||||||
|
if b >= 'a' && b <= 'z' {
|
||||||
|
dest = append(dest, b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return dest
|
||||||
|
}
|
||||||
|
|
||||||
|
// EmblChunkParserRope parses an EMBL chunk directly from a rope without Pack().
|
||||||
|
func EmblChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||||
|
scanner := newRopeScanner(rope)
|
||||||
|
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||||
|
|
||||||
|
var id string
|
||||||
|
var scientificName string
|
||||||
|
defBytes := make([]byte, 0, 256)
|
||||||
|
featBytes := make([]byte, 0, 1024)
|
||||||
|
var taxid int
|
||||||
|
inSeq := false
|
||||||
|
|
||||||
|
for {
|
||||||
|
line := scanner.ReadLine()
|
||||||
|
if line == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
if inSeq {
|
||||||
|
// Should not happen — extractEmblSeq consumed up to "//"
|
||||||
|
inSeq = false
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case bytes.HasPrefix(line, []byte("ID ")):
|
||||||
|
id = string(bytes.SplitN(line[5:], []byte(";"), 2)[0])
|
||||||
|
case bytes.HasPrefix(line, []byte("OS ")):
|
||||||
|
scientificName = string(bytes.TrimSpace(line[5:]))
|
||||||
|
case bytes.HasPrefix(line, []byte("DE ")):
|
||||||
|
if len(defBytes) > 0 {
|
||||||
|
defBytes = append(defBytes, ' ')
|
||||||
|
}
|
||||||
|
defBytes = append(defBytes, bytes.TrimSpace(line[5:])...)
|
||||||
|
case withFeatureTable && bytes.HasPrefix(line, []byte("FH ")):
|
||||||
|
featBytes = append(featBytes, line...)
|
||||||
|
case withFeatureTable && bytes.Equal(line, []byte("FH")):
|
||||||
|
featBytes = append(featBytes, '\n')
|
||||||
|
featBytes = append(featBytes, line...)
|
||||||
|
case bytes.HasPrefix(line, []byte("FT ")):
|
||||||
|
if withFeatureTable {
|
||||||
|
featBytes = append(featBytes, '\n')
|
||||||
|
featBytes = append(featBytes, line...)
|
||||||
|
}
|
||||||
|
if bytes.HasPrefix(line, []byte(`FT /db_xref="taxon:`)) {
|
||||||
|
rest := line[37:]
|
||||||
|
end := bytes.IndexByte(rest, '"')
|
||||||
|
if end > 0 {
|
||||||
|
taxid, _ = strconv.Atoi(string(rest[:end]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case bytes.HasPrefix(line, []byte(" ")):
|
||||||
|
// First sequence line: extract all bases via extractEmblSeq,
|
||||||
|
// which also consumes this line's remaining content.
|
||||||
|
// But ReadLine already consumed this line — we need to process it
|
||||||
|
// plus subsequent lines. Process this line inline then call helper.
|
||||||
|
seqDest := make([]byte, 0, 4096)
|
||||||
|
for _, b := range line {
|
||||||
|
if b >= 'A' && b <= 'Z' {
|
||||||
|
b += 'a' - 'A'
|
||||||
|
}
|
||||||
|
if UtoT && b == 'u' {
|
||||||
|
b = 't'
|
||||||
|
}
|
||||||
|
if b >= 'a' && b <= 'z' {
|
||||||
|
seqDest = append(seqDest, b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
seqDest = scanner.extractEmblSeq(seqDest, UtoT)
|
||||||
|
|
||||||
|
seq := obiseq.NewBioSequenceOwning(id, seqDest, string(defBytes))
|
||||||
|
seq.SetSource(source)
|
||||||
|
if withFeatureTable {
|
||||||
|
seq.SetFeatures(featBytes)
|
||||||
|
}
|
||||||
|
annot := seq.Annotations()
|
||||||
|
annot["scientific_name"] = scientificName
|
||||||
|
annot["taxid"] = taxid
|
||||||
|
sequences = append(sequences, seq)
|
||||||
|
|
||||||
|
// Reset state
|
||||||
|
id = ""
|
||||||
|
scientificName = ""
|
||||||
|
defBytes = defBytes[:0]
|
||||||
|
featBytes = featBytes[:0]
|
||||||
|
taxid = 1
|
||||||
|
|
||||||
|
case bytes.Equal(line, []byte("//")):
|
||||||
|
// record ended without SQ/sequence section (e.g. WGS entries)
|
||||||
|
if id != "" {
|
||||||
|
seq := obiseq.NewBioSequenceOwning(id, []byte{}, string(defBytes))
|
||||||
|
seq.SetSource(source)
|
||||||
|
if withFeatureTable {
|
||||||
|
seq.SetFeatures(featBytes)
|
||||||
|
}
|
||||||
|
annot := seq.Annotations()
|
||||||
|
annot["scientific_name"] = scientificName
|
||||||
|
annot["taxid"] = taxid
|
||||||
|
sequences = append(sequences, seq)
|
||||||
|
}
|
||||||
|
id = ""
|
||||||
|
scientificName = ""
|
||||||
|
defBytes = defBytes[:0]
|
||||||
|
featBytes = featBytes[:0]
|
||||||
|
taxid = 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sequences, nil
|
||||||
|
}
|
||||||
|
|
||||||
func _ParseEmblFile(
|
func _ParseEmblFile(
|
||||||
input ChannelFileChunk,
|
input ChannelFileChunk,
|
||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
@@ -171,7 +314,14 @@ func _ParseEmblFile(
|
|||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
order := chunks.Order
|
order := chunks.Order
|
||||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
var sequences obiseq.BioSequenceSlice
|
||||||
|
var err error
|
||||||
|
|
||||||
|
if chunks.Rope != nil {
|
||||||
|
sequences, err = EmblChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
|
||||||
|
} else {
|
||||||
|
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err)
|
log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err)
|
||||||
@@ -196,6 +346,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
|
|||||||
1024*1024*128,
|
1024*1024*128,
|
||||||
EndOfLastFlatFileEntry,
|
EndOfLastFlatFileEntry,
|
||||||
"\nID ",
|
"\nID ",
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
|
|
||||||
newIter := obiiter.MakeIBioSequence()
|
newIter := obiiter.MakeIBioSequence()
|
||||||
|
|||||||
@@ -209,28 +209,121 @@ func FastaChunkParser(UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlic
|
|||||||
return parser
|
return parser
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// extractFastaSeq scans sequence bytes from the rope directly into dest,
|
||||||
|
// appending valid nucleotide characters and skipping whitespace.
|
||||||
|
// Stops when '>' is found at the start of a line (next record) or at EOF.
|
||||||
|
// Returns (dest with appended bases, hasMore).
|
||||||
|
// hasMore=true means scanner is now positioned at '>' of the next record.
|
||||||
|
func (s *ropeScanner) extractFastaSeq(dest []byte, UtoT bool) ([]byte, bool) {
|
||||||
|
lineStart := true
|
||||||
|
|
||||||
|
for s.current != nil {
|
||||||
|
data := s.current.data[s.pos:]
|
||||||
|
for i, b := range data {
|
||||||
|
if lineStart && b == '>' {
|
||||||
|
s.pos += i
|
||||||
|
if s.pos >= len(s.current.data) {
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
return dest, true
|
||||||
|
}
|
||||||
|
if b == '\n' || b == '\r' {
|
||||||
|
lineStart = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
lineStart = false
|
||||||
|
if b == ' ' || b == '\t' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if b >= 'A' && b <= 'Z' {
|
||||||
|
b += 'a' - 'A'
|
||||||
|
}
|
||||||
|
if UtoT && b == 'u' {
|
||||||
|
b = 't'
|
||||||
|
}
|
||||||
|
dest = append(dest, b)
|
||||||
|
}
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
return dest, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// FastaChunkParserRope parses a FASTA chunk directly from the rope without Pack().
|
||||||
|
func FastaChunkParserRope(source string, rope *PieceOfChunk, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||||
|
scanner := newRopeScanner(rope)
|
||||||
|
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||||
|
|
||||||
|
for {
|
||||||
|
bline := scanner.ReadLine()
|
||||||
|
if bline == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if len(bline) == 0 || bline[0] != '>' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse header: ">id definition"
|
||||||
|
header := bline[1:]
|
||||||
|
var id string
|
||||||
|
var definition string
|
||||||
|
sp := bytes.IndexByte(header, ' ')
|
||||||
|
if sp < 0 {
|
||||||
|
sp = bytes.IndexByte(header, '\t')
|
||||||
|
}
|
||||||
|
if sp < 0 {
|
||||||
|
id = string(header)
|
||||||
|
} else {
|
||||||
|
id = string(header[:sp])
|
||||||
|
definition = string(bytes.TrimSpace(header[sp+1:]))
|
||||||
|
}
|
||||||
|
|
||||||
|
seqDest := make([]byte, 0, 4096)
|
||||||
|
var hasMore bool
|
||||||
|
seqDest, hasMore = scanner.extractFastaSeq(seqDest, UtoT)
|
||||||
|
|
||||||
|
if len(seqDest) == 0 {
|
||||||
|
log.Fatalf("%s [%s]: sequence is empty", source, id)
|
||||||
|
}
|
||||||
|
|
||||||
|
seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
|
||||||
|
seq.SetSource(source)
|
||||||
|
sequences = append(sequences, seq)
|
||||||
|
|
||||||
|
if !hasMore {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sequences, nil
|
||||||
|
}
|
||||||
|
|
||||||
func _ParseFastaFile(
|
func _ParseFastaFile(
|
||||||
input ChannelFileChunk,
|
input ChannelFileChunk,
|
||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
UtoT bool,
|
UtoT bool,
|
||||||
) {
|
) {
|
||||||
|
|
||||||
parser := FastaChunkParser(UtoT)
|
parser := FastaChunkParser(UtoT)
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
var sequences obiseq.BioSequenceSlice
|
||||||
// obilog.Warnf("Chunck(%d:%d) -%d- ", chunks.Order, l, sequences.Len())
|
var err error
|
||||||
|
|
||||||
|
if chunks.Rope != nil {
|
||||||
|
sequences, err = FastaChunkParserRope(chunks.Source, chunks.Rope, UtoT)
|
||||||
|
} else {
|
||||||
|
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("File %s : Cannot parse the fasta file : %v", chunks.Source, err)
|
log.Fatalf("File %s : Cannot parse the fasta file : %v", chunks.Source, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences))
|
out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
out.Done()
|
out.Done()
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
@@ -245,6 +338,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
|||||||
1024*1024,
|
1024*1024,
|
||||||
EndOfLastFastaEntry,
|
EndOfLastFastaEntry,
|
||||||
"\n>",
|
"\n>",
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
|
|
||||||
for i := 0; i < nworker; i++ {
|
for i := 0; i < nworker; i++ {
|
||||||
|
|||||||
@@ -303,6 +303,80 @@ func FastqChunkParser(quality_shift byte, with_quality bool, UtoT bool) func(str
|
|||||||
return parser
|
return parser
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FastqChunkParserRope parses a FASTQ chunk directly from a rope without Pack().
|
||||||
|
func FastqChunkParserRope(source string, rope *PieceOfChunk, quality_shift byte, with_quality, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||||
|
scanner := newRopeScanner(rope)
|
||||||
|
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||||
|
|
||||||
|
for {
|
||||||
|
// Line 1: @id [definition]
|
||||||
|
hline := scanner.ReadLine()
|
||||||
|
if hline == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if len(hline) == 0 || hline[0] != '@' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
header := hline[1:]
|
||||||
|
var id string
|
||||||
|
var definition string
|
||||||
|
sp := bytes.IndexByte(header, ' ')
|
||||||
|
if sp < 0 {
|
||||||
|
sp = bytes.IndexByte(header, '\t')
|
||||||
|
}
|
||||||
|
if sp < 0 {
|
||||||
|
id = string(header)
|
||||||
|
} else {
|
||||||
|
id = string(header[:sp])
|
||||||
|
definition = string(bytes.TrimSpace(header[sp+1:]))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Line 2: sequence
|
||||||
|
sline := scanner.ReadLine()
|
||||||
|
if sline == nil {
|
||||||
|
log.Fatalf("@%s[%s]: unexpected EOF after header", id, source)
|
||||||
|
}
|
||||||
|
seqDest := make([]byte, len(sline))
|
||||||
|
w := 0
|
||||||
|
for _, b := range sline {
|
||||||
|
if b >= 'A' && b <= 'Z' {
|
||||||
|
b += 'a' - 'A'
|
||||||
|
}
|
||||||
|
if UtoT && b == 'u' {
|
||||||
|
b = 't'
|
||||||
|
}
|
||||||
|
seqDest[w] = b
|
||||||
|
w++
|
||||||
|
}
|
||||||
|
seqDest = seqDest[:w]
|
||||||
|
if len(seqDest) == 0 {
|
||||||
|
log.Fatalf("@%s[%s]: sequence is empty", id, source)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Line 3: + (skip)
|
||||||
|
scanner.ReadLine()
|
||||||
|
|
||||||
|
// Line 4: quality
|
||||||
|
qline := scanner.ReadLine()
|
||||||
|
|
||||||
|
seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
|
||||||
|
seq.SetSource(source)
|
||||||
|
|
||||||
|
if with_quality && qline != nil {
|
||||||
|
qDest := make([]byte, len(qline))
|
||||||
|
copy(qDest, qline)
|
||||||
|
for i := range qDest {
|
||||||
|
qDest[i] -= quality_shift
|
||||||
|
}
|
||||||
|
seq.TakeQualities(qDest)
|
||||||
|
}
|
||||||
|
|
||||||
|
sequences = append(sequences, seq)
|
||||||
|
}
|
||||||
|
|
||||||
|
return sequences, nil
|
||||||
|
}
|
||||||
|
|
||||||
func _ParseFastqFile(
|
func _ParseFastqFile(
|
||||||
input ChannelFileChunk,
|
input ChannelFileChunk,
|
||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
@@ -313,7 +387,14 @@ func _ParseFastqFile(
|
|||||||
parser := FastqChunkParser(quality_shift, with_quality, UtoT)
|
parser := FastqChunkParser(quality_shift, with_quality, UtoT)
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
var sequences obiseq.BioSequenceSlice
|
||||||
|
var err error
|
||||||
|
|
||||||
|
if chunks.Rope != nil {
|
||||||
|
sequences, err = FastqChunkParserRope(chunks.Source, chunks.Rope, quality_shift, with_quality, UtoT)
|
||||||
|
} else {
|
||||||
|
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("File %s : Cannot parse the fastq file : %v", chunks.Source, err)
|
log.Fatalf("File %s : Cannot parse the fastq file : %v", chunks.Source, err)
|
||||||
@@ -339,6 +420,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
|||||||
1024*1024,
|
1024*1024,
|
||||||
EndOfLastFastqEntry,
|
EndOfLastFastqEntry,
|
||||||
"\n@",
|
"\n@",
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
|
|
||||||
for i := 0; i < nworker; i++ {
|
for i := 0; i < nworker; i++ {
|
||||||
|
|||||||
@@ -296,7 +296,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
|||||||
|
|
||||||
case strings.HasSuffix(skey, "_taxid"):
|
case strings.HasSuffix(skey, "_taxid"):
|
||||||
if dataType == jsonparser.Number || dataType == jsonparser.String {
|
if dataType == jsonparser.Number || dataType == jsonparser.String {
|
||||||
rank, _ := obiutils.SplitInTwo(skey, '_')
|
rank := skey[:len(skey)-len("_taxid")]
|
||||||
|
|
||||||
taxid := string(value)
|
taxid := string(value)
|
||||||
sequence.SetTaxid(taxid, rank)
|
sequence.SetTaxid(taxid, rank)
|
||||||
|
|||||||
@@ -77,45 +77,47 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string {
|
|||||||
//
|
//
|
||||||
// It returns a byte array containing the formatted sequences.
|
// It returns a byte array containing the formatted sequences.
|
||||||
func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
|
func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
|
||||||
// Create a buffer to store the formatted sequences
|
|
||||||
var bs bytes.Buffer
|
var bs bytes.Buffer
|
||||||
|
|
||||||
lt := 0
|
lt := 0
|
||||||
|
|
||||||
for _, seq := range batch.Slice() {
|
for _, seq := range batch.Slice() {
|
||||||
lt += seq.Len()
|
lt += seq.Len()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Iterate over each sequence in the batch
|
// Pre-allocate: sequence data + newlines every 60 chars + ~100 bytes header per sequence
|
||||||
|
bs.Grow(lt + lt/60 + 100*batch.Len() + 1)
|
||||||
|
|
||||||
log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
|
log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
|
||||||
first := true
|
|
||||||
for _, seq := range batch.Slice() {
|
for _, seq := range batch.Slice() {
|
||||||
// Check if the sequence is empty
|
|
||||||
if seq.Len() > 0 {
|
if seq.Len() > 0 {
|
||||||
// Format the sequence using the provided formater function
|
// Write header directly into bs — no intermediate string
|
||||||
formattedSeq := FormatFasta(seq, formater)
|
bs.WriteByte('>')
|
||||||
|
bs.WriteString(seq.Id())
|
||||||
if first {
|
bs.WriteByte(' ')
|
||||||
bs.Grow(lt + (len(formattedSeq)-seq.Len())*batch.Len()*5/4)
|
bs.WriteString(formater(seq))
|
||||||
first = false
|
|
||||||
}
|
|
||||||
|
|
||||||
// Append the formatted sequence to the buffer
|
|
||||||
bs.WriteString(formattedSeq)
|
|
||||||
bs.WriteByte('\n')
|
bs.WriteByte('\n')
|
||||||
|
|
||||||
|
// Write folded sequence directly into bs — no copies
|
||||||
|
s := seq.Sequence()
|
||||||
|
l := len(s)
|
||||||
|
for i := 0; i < l; i += 60 {
|
||||||
|
to := i + 60
|
||||||
|
if to > l {
|
||||||
|
to = l
|
||||||
|
}
|
||||||
|
bs.Write(s[i:to])
|
||||||
|
bs.WriteByte('\n')
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// Handle empty sequences
|
|
||||||
if skipEmpty {
|
if skipEmpty {
|
||||||
// Skip empty sequences if skipEmpty is true
|
|
||||||
obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id())
|
obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id())
|
||||||
} else {
|
} else {
|
||||||
// Terminate the program if skipEmpty is false
|
|
||||||
log.Fatalf("Sequence %s is empty", seq.Id())
|
log.Fatalf("Sequence %s is empty", seq.Id())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the byte array representation of the buffer
|
|
||||||
return &bs
|
return &bs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ type SeqFileChunkParser func(string, io.Reader) (obiseq.BioSequenceSlice, error)
|
|||||||
type FileChunk struct {
|
type FileChunk struct {
|
||||||
Source string
|
Source string
|
||||||
Raw *bytes.Buffer
|
Raw *bytes.Buffer
|
||||||
|
Rope *PieceOfChunk
|
||||||
Order int
|
Order int
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -97,11 +98,17 @@ func (piece *PieceOfChunk) IsLast() bool {
|
|||||||
return piece.next == nil
|
return piece.next == nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (piece *PieceOfChunk) FileChunk(source string, order int) FileChunk {
|
func (piece *PieceOfChunk) FileChunk(source string, order int, pack bool) FileChunk {
|
||||||
piece.Pack()
|
piece = piece.Head()
|
||||||
|
var raw *bytes.Buffer
|
||||||
|
if pack {
|
||||||
|
piece.Pack()
|
||||||
|
raw = bytes.NewBuffer(piece.data)
|
||||||
|
}
|
||||||
return FileChunk{
|
return FileChunk{
|
||||||
Source: source,
|
Source: source,
|
||||||
Raw: bytes.NewBuffer(piece.data),
|
Raw: raw,
|
||||||
|
Rope: piece,
|
||||||
Order: order,
|
Order: order,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -133,7 +140,8 @@ func ReadFileChunk(
|
|||||||
reader io.Reader,
|
reader io.Reader,
|
||||||
fileChunkSize int,
|
fileChunkSize int,
|
||||||
splitter LastSeqRecord,
|
splitter LastSeqRecord,
|
||||||
probe string) ChannelFileChunk {
|
probe string,
|
||||||
|
pack bool) ChannelFileChunk {
|
||||||
|
|
||||||
chunk_channel := make(ChannelFileChunk)
|
chunk_channel := make(ChannelFileChunk)
|
||||||
|
|
||||||
@@ -205,7 +213,7 @@ func ReadFileChunk(
|
|||||||
|
|
||||||
if len(pieces.data) > 0 {
|
if len(pieces.data) > 0 {
|
||||||
// obilog.Warnf("chuck %d :Read %d bytes from file %s", i, io.Len(), source)
|
// obilog.Warnf("chuck %d :Read %d bytes from file %s", i, io.Len(), source)
|
||||||
chunk_channel <- pieces.FileChunk(source, i)
|
chunk_channel <- pieces.FileChunk(source, i, pack)
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -222,7 +230,7 @@ func ReadFileChunk(
|
|||||||
|
|
||||||
// Send the last chunk to the channel
|
// Send the last chunk to the channel
|
||||||
if pieces.Len() > 0 {
|
if pieces.Len() > 0 {
|
||||||
chunk_channel <- pieces.FileChunk(source, i)
|
chunk_channel <- pieces.FileChunk(source, i, pack)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close the readers channel when the end of the file is reached
|
// Close the readers channel when the end of the file is reached
|
||||||
|
|||||||
@@ -29,6 +29,265 @@ const (
|
|||||||
|
|
||||||
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
|
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
|
||||||
|
|
||||||
|
// extractSequence scans the ORIGIN section byte-by-byte directly on the rope,
|
||||||
|
// appending compacted bases to dest. Returns the extended slice.
|
||||||
|
// Stops and returns when "//" is found at the start of a line.
|
||||||
|
// The scanner is left positioned after the "//" line.
|
||||||
|
func (s *ropeScanner) extractSequence(dest []byte, UtoT bool) []byte {
|
||||||
|
lineStart := true
|
||||||
|
skipDigits := true
|
||||||
|
|
||||||
|
for s.current != nil {
|
||||||
|
data := s.current.data[s.pos:]
|
||||||
|
for i, b := range data {
|
||||||
|
if lineStart {
|
||||||
|
if b == '/' {
|
||||||
|
// End-of-record marker "//"
|
||||||
|
s.pos += i + 1
|
||||||
|
if s.pos >= len(s.current.data) {
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
s.skipToNewline()
|
||||||
|
return dest
|
||||||
|
}
|
||||||
|
lineStart = false
|
||||||
|
skipDigits = true
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case b == '\n':
|
||||||
|
lineStart = true
|
||||||
|
case b == '\r':
|
||||||
|
// skip
|
||||||
|
case skipDigits:
|
||||||
|
if b != ' ' && (b < '0' || b > '9') {
|
||||||
|
skipDigits = false
|
||||||
|
if UtoT && b == 'u' {
|
||||||
|
b = 't'
|
||||||
|
}
|
||||||
|
dest = append(dest, b)
|
||||||
|
}
|
||||||
|
case b != ' ':
|
||||||
|
if UtoT && b == 'u' {
|
||||||
|
b = 't'
|
||||||
|
}
|
||||||
|
dest = append(dest, b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
return dest
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseLseqFromLocus extracts the declared sequence length from a LOCUS line.
|
||||||
|
// Format: "LOCUS <id> <length> bp ..."
|
||||||
|
// Returns -1 if not found or parse error.
|
||||||
|
func parseLseqFromLocus(line []byte) int {
|
||||||
|
if len(line) < 13 {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
i := 12
|
||||||
|
for i < len(line) && line[i] != ' ' {
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
for i < len(line) && line[i] == ' ' {
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
start := i
|
||||||
|
for i < len(line) && line[i] >= '0' && line[i] <= '9' {
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
if i == start {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
n, err := strconv.Atoi(string(line[start:i]))
|
||||||
|
if err != nil {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prefix constants for GenBank section headers (byte slices for zero-alloc comparison).
|
||||||
|
var (
|
||||||
|
gbPfxLocus = []byte("LOCUS ")
|
||||||
|
gbPfxDefinition = []byte("DEFINITION ")
|
||||||
|
gbPfxContinue = []byte(" ")
|
||||||
|
gbPfxSource = []byte("SOURCE ")
|
||||||
|
gbPfxFeatures = []byte("FEATURES ")
|
||||||
|
gbPfxOrigin = []byte("ORIGIN")
|
||||||
|
gbPfxContig = []byte("CONTIG")
|
||||||
|
gbPfxEnd = []byte("//")
|
||||||
|
gbPfxDbXref = []byte(` /db_xref="taxon:`)
|
||||||
|
)
|
||||||
|
|
||||||
|
// GenbankChunkParserRope parses a GenBank FileChunk directly from the rope
|
||||||
|
// (PieceOfChunk linked list) without calling Pack(). This eliminates the large
|
||||||
|
// contiguous allocation required for chromosomal-scale sequences.
|
||||||
|
func GenbankChunkParserRope(source string, rope *PieceOfChunk,
|
||||||
|
withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||||
|
|
||||||
|
state := inHeader
|
||||||
|
scanner := newRopeScanner(rope)
|
||||||
|
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||||
|
|
||||||
|
id := ""
|
||||||
|
lseq := -1
|
||||||
|
scientificName := ""
|
||||||
|
defBytes := new(bytes.Buffer)
|
||||||
|
featBytes := new(bytes.Buffer)
|
||||||
|
var seqDest []byte
|
||||||
|
taxid := 1
|
||||||
|
nl := 0
|
||||||
|
|
||||||
|
for bline := scanner.ReadLine(); bline != nil; bline = scanner.ReadLine() {
|
||||||
|
nl++
|
||||||
|
processed := false
|
||||||
|
for !processed {
|
||||||
|
switch {
|
||||||
|
|
||||||
|
case bytes.HasPrefix(bline, gbPfxLocus):
|
||||||
|
if state != inHeader {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading LOCUS: %s", nl, state, bline)
|
||||||
|
}
|
||||||
|
rest := bline[12:]
|
||||||
|
sp := bytes.IndexByte(rest, ' ')
|
||||||
|
if sp < 0 {
|
||||||
|
id = string(rest)
|
||||||
|
} else {
|
||||||
|
id = string(rest[:sp])
|
||||||
|
}
|
||||||
|
lseq = parseLseqFromLocus(bline)
|
||||||
|
cap0 := lseq + 20
|
||||||
|
if cap0 < 1024 {
|
||||||
|
cap0 = 1024
|
||||||
|
}
|
||||||
|
seqDest = make([]byte, 0, cap0)
|
||||||
|
state = inEntry
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
case bytes.HasPrefix(bline, gbPfxDefinition):
|
||||||
|
if state != inEntry {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading DEFINITION: %s", nl, state, bline)
|
||||||
|
}
|
||||||
|
defBytes.Write(bytes.TrimSpace(bline[12:]))
|
||||||
|
state = inDefinition
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
case state == inDefinition:
|
||||||
|
if bytes.HasPrefix(bline, gbPfxContinue) {
|
||||||
|
defBytes.WriteByte(' ')
|
||||||
|
defBytes.Write(bytes.TrimSpace(bline[12:]))
|
||||||
|
processed = true
|
||||||
|
} else {
|
||||||
|
state = inEntry
|
||||||
|
}
|
||||||
|
|
||||||
|
case bytes.HasPrefix(bline, gbPfxSource):
|
||||||
|
if state != inEntry {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading SOURCE: %s", nl, state, bline)
|
||||||
|
}
|
||||||
|
scientificName = string(bytes.TrimSpace(bline[12:]))
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
case bytes.HasPrefix(bline, gbPfxFeatures):
|
||||||
|
if state != inEntry {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading FEATURES: %s", nl, state, bline)
|
||||||
|
}
|
||||||
|
if withFeatureTable {
|
||||||
|
featBytes.Write(bline)
|
||||||
|
}
|
||||||
|
state = inFeature
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
case bytes.HasPrefix(bline, gbPfxOrigin):
|
||||||
|
if state != inFeature && state != inContig {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading ORIGIN: %s", nl, state, bline)
|
||||||
|
}
|
||||||
|
// Use fast byte-scan to extract sequence and consume through "//"
|
||||||
|
seqDest = scanner.extractSequence(seqDest, UtoT)
|
||||||
|
// Emit record
|
||||||
|
if id == "" {
|
||||||
|
log.Warn("Empty id when parsing genbank file")
|
||||||
|
}
|
||||||
|
sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
|
||||||
|
sequence.SetSource(source)
|
||||||
|
if withFeatureTable {
|
||||||
|
sequence.SetFeatures(featBytes.Bytes())
|
||||||
|
}
|
||||||
|
annot := sequence.Annotations()
|
||||||
|
annot["scientific_name"] = scientificName
|
||||||
|
annot["taxid"] = taxid
|
||||||
|
sequences = append(sequences, sequence)
|
||||||
|
|
||||||
|
defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
|
||||||
|
featBytes = new(bytes.Buffer)
|
||||||
|
nl = 0
|
||||||
|
taxid = 1
|
||||||
|
seqDest = nil
|
||||||
|
state = inHeader
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
case bytes.HasPrefix(bline, gbPfxContig):
|
||||||
|
if state != inFeature && state != inContig {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading CONTIG: %s", nl, state, bline)
|
||||||
|
}
|
||||||
|
state = inContig
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
case bytes.Equal(bline, gbPfxEnd):
|
||||||
|
// Reached for CONTIG records (no ORIGIN section)
|
||||||
|
if state != inContig {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
|
||||||
|
}
|
||||||
|
if id == "" {
|
||||||
|
log.Warn("Empty id when parsing genbank file")
|
||||||
|
}
|
||||||
|
sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
|
||||||
|
sequence.SetSource(source)
|
||||||
|
if withFeatureTable {
|
||||||
|
sequence.SetFeatures(featBytes.Bytes())
|
||||||
|
}
|
||||||
|
annot := sequence.Annotations()
|
||||||
|
annot["scientific_name"] = scientificName
|
||||||
|
annot["taxid"] = taxid
|
||||||
|
sequences = append(sequences, sequence)
|
||||||
|
|
||||||
|
defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
|
||||||
|
featBytes = new(bytes.Buffer)
|
||||||
|
nl = 0
|
||||||
|
taxid = 1
|
||||||
|
seqDest = nil
|
||||||
|
state = inHeader
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
default:
|
||||||
|
switch state {
|
||||||
|
case inFeature:
|
||||||
|
if withFeatureTable {
|
||||||
|
featBytes.WriteByte('\n')
|
||||||
|
featBytes.Write(bline)
|
||||||
|
}
|
||||||
|
if bytes.HasPrefix(bline, gbPfxDbXref) {
|
||||||
|
rest := bline[len(gbPfxDbXref):]
|
||||||
|
q := bytes.IndexByte(rest, '"')
|
||||||
|
if q >= 0 {
|
||||||
|
taxid, _ = strconv.Atoi(string(rest[:q]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
processed = true
|
||||||
|
case inHeader, inEntry, inContig:
|
||||||
|
processed = true
|
||||||
|
default:
|
||||||
|
log.Fatalf("Unexpected state %d while reading: %s", state, bline)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sequences, nil
|
||||||
|
}
|
||||||
|
|
||||||
func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||||
return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||||
state := inHeader
|
state := inHeader
|
||||||
@@ -125,13 +384,10 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
|
|||||||
if state != inSequence && state != inContig {
|
if state != inSequence && state != inContig {
|
||||||
log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
|
log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
|
||||||
}
|
}
|
||||||
// log.Debugln("Total lines := ", nl)
|
|
||||||
if id == "" {
|
if id == "" {
|
||||||
log.Warn("Empty id when parsing genbank file")
|
log.Warn("Empty id when parsing genbank file")
|
||||||
}
|
}
|
||||||
|
|
||||||
// log.Debugf("End of sequence %s: %dbp ", id, seqBytes.Len())
|
|
||||||
|
|
||||||
sequence := obiseq.NewBioSequence(id,
|
sequence := obiseq.NewBioSequence(id,
|
||||||
seqBytes.Bytes(),
|
seqBytes.Bytes(),
|
||||||
defBytes.String())
|
defBytes.String())
|
||||||
@@ -144,9 +400,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
|
|||||||
annot := sequence.Annotations()
|
annot := sequence.Annotations()
|
||||||
annot["scientific_name"] = scientificName
|
annot["scientific_name"] = scientificName
|
||||||
annot["taxid"] = taxid
|
annot["taxid"] = taxid
|
||||||
// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))
|
|
||||||
// log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(),
|
|
||||||
// sequence.Len(), seqBytes.Len())
|
|
||||||
|
|
||||||
sequences = append(sequences, sequence)
|
sequences = append(sequences, sequence)
|
||||||
|
|
||||||
@@ -159,12 +412,11 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
|
|||||||
processed = true
|
processed = true
|
||||||
|
|
||||||
case state == inSequence:
|
case state == inSequence:
|
||||||
// log.Debugf("Chunk %d : Genbank: line %d, state = %d : %s", chunks.order, nl, state, line)
|
|
||||||
|
|
||||||
sl++
|
sl++
|
||||||
parts := strings.SplitN(line[10:], " ", 6)
|
cleanline := strings.TrimSpace(line)
|
||||||
|
parts := strings.SplitN(cleanline, " ", 7)
|
||||||
lparts := len(parts)
|
lparts := len(parts)
|
||||||
for i := 0; i < lparts; i++ {
|
for i := 1; i < lparts; i++ {
|
||||||
if UtoT {
|
if UtoT {
|
||||||
parts[i] = strings.ReplaceAll(parts[i], "u", "t")
|
parts[i] = strings.ReplaceAll(parts[i], "u", "t")
|
||||||
}
|
}
|
||||||
@@ -197,6 +449,7 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_ = sl
|
||||||
return sequences, nil
|
return sequences, nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -205,10 +458,16 @@ func _ParseGenbankFile(input ChannelFileChunk,
|
|||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
withFeatureTable, UtoT bool) {
|
withFeatureTable, UtoT bool) {
|
||||||
|
|
||||||
parser := GenbankChunkParser(withFeatureTable, UtoT)
|
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
var sequences obiseq.BioSequenceSlice
|
||||||
|
var err error
|
||||||
|
|
||||||
|
if chunks.Rope != nil {
|
||||||
|
sequences, err = GenbankChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
|
||||||
|
} else {
|
||||||
|
parser := GenbankChunkParser(withFeatureTable, UtoT)
|
||||||
|
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("File %s : Cannot parse the genbank file : %v", chunks.Source, err)
|
log.Fatalf("File %s : Cannot parse the genbank file : %v", chunks.Source, err)
|
||||||
@@ -224,7 +483,6 @@ func _ParseGenbankFile(input ChannelFileChunk,
|
|||||||
|
|
||||||
func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
opt := MakeOptions(options)
|
opt := MakeOptions(options)
|
||||||
// entry_channel := make(chan _FileChunk)
|
|
||||||
|
|
||||||
entry_channel := ReadFileChunk(
|
entry_channel := ReadFileChunk(
|
||||||
opt.Source(),
|
opt.Source(),
|
||||||
@@ -232,13 +490,13 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
|
|||||||
1024*1024*128,
|
1024*1024*128,
|
||||||
EndOfLastFlatFileEntry,
|
EndOfLastFlatFileEntry,
|
||||||
"\nLOCUS ",
|
"\nLOCUS ",
|
||||||
|
false, // do not pack: rope-based parser avoids contiguous allocation
|
||||||
)
|
)
|
||||||
|
|
||||||
newIter := obiiter.MakeIBioSequence()
|
newIter := obiiter.MakeIBioSequence()
|
||||||
|
|
||||||
nworkers := opt.ParallelWorkers()
|
nworkers := opt.ParallelWorkers()
|
||||||
|
|
||||||
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
|
||||||
for j := 0; j < nworkers; j++ {
|
for j := 0; j < nworkers; j++ {
|
||||||
newIter.Add(1)
|
newIter.Add(1)
|
||||||
go _ParseGenbankFile(
|
go _ParseGenbankFile(
|
||||||
@@ -249,8 +507,6 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
// go _ReadFlatFileChunk(reader, entry_channel)
|
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
newIter.WaitAndClose()
|
newIter.WaitAndClose()
|
||||||
log.Debug("End of the genbank file ", opt.Source())
|
log.Debug("End of the genbank file ", opt.Source())
|
||||||
|
|||||||
77
pkg/obiformats/rope_scanner.go
Normal file
77
pkg/obiformats/rope_scanner.go
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
package obiformats
|
||||||
|
|
||||||
|
import "bytes"
|
||||||
|
|
||||||
|
// ropeScanner reads lines from a PieceOfChunk rope.
|
||||||
|
// The carry buffer handles lines that span two rope nodes; it grows as needed.
|
||||||
|
type ropeScanner struct {
|
||||||
|
current *PieceOfChunk
|
||||||
|
pos int
|
||||||
|
carry []byte
|
||||||
|
}
|
||||||
|
|
||||||
|
func newRopeScanner(rope *PieceOfChunk) *ropeScanner {
|
||||||
|
return &ropeScanner{current: rope}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReadLine returns the next line without the trailing \n (or \r\n).
|
||||||
|
// Returns nil at end of rope. The returned slice aliases carry[] or the node
|
||||||
|
// data and is valid only until the next ReadLine call.
|
||||||
|
func (s *ropeScanner) ReadLine() []byte {
|
||||||
|
for {
|
||||||
|
if s.current == nil {
|
||||||
|
if len(s.carry) > 0 {
|
||||||
|
line := s.carry
|
||||||
|
s.carry = s.carry[:0]
|
||||||
|
return line
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
data := s.current.data[s.pos:]
|
||||||
|
idx := bytes.IndexByte(data, '\n')
|
||||||
|
|
||||||
|
if idx >= 0 {
|
||||||
|
var line []byte
|
||||||
|
if len(s.carry) == 0 {
|
||||||
|
line = data[:idx]
|
||||||
|
} else {
|
||||||
|
s.carry = append(s.carry, data[:idx]...)
|
||||||
|
line = s.carry
|
||||||
|
s.carry = s.carry[:0]
|
||||||
|
}
|
||||||
|
s.pos += idx + 1
|
||||||
|
if s.pos >= len(s.current.data) {
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
if len(line) > 0 && line[len(line)-1] == '\r' {
|
||||||
|
line = line[:len(line)-1]
|
||||||
|
}
|
||||||
|
return line
|
||||||
|
}
|
||||||
|
|
||||||
|
// No \n in this node: accumulate into carry and advance
|
||||||
|
s.carry = append(s.carry, data...)
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// skipToNewline advances the scanner past the next '\n'.
|
||||||
|
func (s *ropeScanner) skipToNewline() {
|
||||||
|
for s.current != nil {
|
||||||
|
data := s.current.data[s.pos:]
|
||||||
|
idx := bytes.IndexByte(data, '\n')
|
||||||
|
if idx >= 0 {
|
||||||
|
s.pos += idx + 1
|
||||||
|
if s.pos >= len(s.current.data) {
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -444,6 +444,67 @@ func (iterator IBioSequence) Rebatch(size int) IBioSequence {
|
|||||||
return newIter
|
return newIter
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RebatchBySize reorganises the stream into batches bounded by two independent
|
||||||
|
// upper limits: maxCount (max number of sequences) and maxBytes (max cumulative
|
||||||
|
// estimated memory). A batch is flushed as soon as either limit would be
|
||||||
|
// exceeded. A single sequence larger than maxBytes is always emitted alone.
|
||||||
|
// Passing 0 for a limit disables that constraint; if both are 0 it falls back
|
||||||
|
// to Rebatch(obidefault.BatchSizeMax()).
|
||||||
|
func (iterator IBioSequence) RebatchBySize(maxBytes int, maxCount int) IBioSequence {
|
||||||
|
if maxBytes <= 0 && maxCount <= 0 {
|
||||||
|
return iterator.Rebatch(obidefault.BatchSizeMax())
|
||||||
|
}
|
||||||
|
|
||||||
|
newIter := MakeIBioSequence()
|
||||||
|
|
||||||
|
newIter.Add(1)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
newIter.WaitAndClose()
|
||||||
|
}()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
order := 0
|
||||||
|
iterator = iterator.SortBatches()
|
||||||
|
buffer := obiseq.MakeBioSequenceSlice()
|
||||||
|
bufBytes := 0
|
||||||
|
source := ""
|
||||||
|
|
||||||
|
flush := func() {
|
||||||
|
if len(buffer) > 0 {
|
||||||
|
newIter.Push(MakeBioSequenceBatch(source, order, buffer))
|
||||||
|
order++
|
||||||
|
buffer = obiseq.MakeBioSequenceSlice()
|
||||||
|
bufBytes = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for iterator.Next() {
|
||||||
|
seqs := iterator.Get()
|
||||||
|
source = seqs.Source()
|
||||||
|
for _, s := range seqs.Slice() {
|
||||||
|
sz := s.MemorySize()
|
||||||
|
countFull := maxCount > 0 && len(buffer) >= maxCount
|
||||||
|
memFull := maxBytes > 0 && bufBytes+sz > maxBytes && len(buffer) > 0
|
||||||
|
if countFull || memFull {
|
||||||
|
flush()
|
||||||
|
}
|
||||||
|
buffer = append(buffer, s)
|
||||||
|
bufBytes += sz
|
||||||
|
}
|
||||||
|
}
|
||||||
|
flush()
|
||||||
|
|
||||||
|
newIter.Done()
|
||||||
|
}()
|
||||||
|
|
||||||
|
if iterator.IsPaired() {
|
||||||
|
newIter.MarkAsPaired()
|
||||||
|
}
|
||||||
|
|
||||||
|
return newIter
|
||||||
|
}
|
||||||
|
|
||||||
func (iterator IBioSequence) FilterEmpty() IBioSequence {
|
func (iterator IBioSequence) FilterEmpty() IBioSequence {
|
||||||
|
|
||||||
newIter := MakeIBioSequence()
|
newIter := MakeIBioSequence()
|
||||||
@@ -638,7 +699,7 @@ func (iterator IBioSequence) FilterOn(predicate obiseq.SequencePredicate,
|
|||||||
trueIter.MarkAsPaired()
|
trueIter.MarkAsPaired()
|
||||||
}
|
}
|
||||||
|
|
||||||
return trueIter.Rebatch(size)
|
return trueIter.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
|
||||||
}
|
}
|
||||||
|
|
||||||
func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
|
func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
|
||||||
@@ -694,7 +755,7 @@ func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
|
|||||||
trueIter.MarkAsPaired()
|
trueIter.MarkAsPaired()
|
||||||
}
|
}
|
||||||
|
|
||||||
return trueIter.Rebatch(size)
|
return trueIter.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load all sequences availables from an IBioSequenceBatch iterator into
|
// Load all sequences availables from an IBioSequenceBatch iterator into
|
||||||
|
|||||||
@@ -57,34 +57,21 @@ func (dist *IDistribute) Classifier() *obiseq.BioSequenceClassifier {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Distribute organizes the biosequences from the iterator into batches
|
// Distribute organizes the biosequences from the iterator into batches
|
||||||
// based on the provided classifier and batch sizes. It returns an
|
// based on the provided classifier. It returns an IDistribute instance
|
||||||
// IDistribute instance that manages the distribution of the sequences.
|
// that manages the distribution of the sequences.
|
||||||
//
|
//
|
||||||
// Parameters:
|
// Batches are flushed when either BatchSizeMax() sequences or BatchMem()
|
||||||
// - class: A pointer to a BioSequenceClassifier used to classify
|
// bytes are accumulated per key, mirroring the RebatchBySize strategy.
|
||||||
// the biosequences during distribution.
|
func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier) IDistribute {
|
||||||
// - sizes: Optional integer values specifying the batch size. If
|
maxCount := obidefault.BatchSizeMax()
|
||||||
// no sizes are provided, a default batch size of 5000 is used.
|
maxBytes := obidefault.BatchMem()
|
||||||
//
|
|
||||||
// Returns:
|
|
||||||
// An IDistribute instance that contains the outputs of the
|
|
||||||
// classified biosequences, a channel for new data notifications,
|
|
||||||
// and the classifier used for distribution. The method operates
|
|
||||||
// asynchronously, processing the sequences in separate goroutines.
|
|
||||||
// It ensures that the outputs are closed and cleaned up once
|
|
||||||
// processing is complete.
|
|
||||||
func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, sizes ...int) IDistribute {
|
|
||||||
batchsize := obidefault.BatchSize()
|
|
||||||
|
|
||||||
outputs := make(map[int]IBioSequence, 100)
|
outputs := make(map[int]IBioSequence, 100)
|
||||||
slices := make(map[int]*obiseq.BioSequenceSlice, 100)
|
slices := make(map[int]*obiseq.BioSequenceSlice, 100)
|
||||||
|
bufBytes := make(map[int]int, 100)
|
||||||
orders := make(map[int]int, 100)
|
orders := make(map[int]int, 100)
|
||||||
news := make(chan int)
|
news := make(chan int)
|
||||||
|
|
||||||
if len(sizes) > 0 {
|
|
||||||
batchsize = sizes[0]
|
|
||||||
}
|
|
||||||
|
|
||||||
jobDone := sync.WaitGroup{}
|
jobDone := sync.WaitGroup{}
|
||||||
lock := sync.Mutex{}
|
lock := sync.Mutex{}
|
||||||
|
|
||||||
@@ -115,6 +102,7 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz
|
|||||||
slice = &s
|
slice = &s
|
||||||
slices[key] = slice
|
slices[key] = slice
|
||||||
orders[key] = 0
|
orders[key] = 0
|
||||||
|
bufBytes[key] = 0
|
||||||
|
|
||||||
lock.Lock()
|
lock.Lock()
|
||||||
outputs[key] = MakeIBioSequence()
|
outputs[key] = MakeIBioSequence()
|
||||||
@@ -123,14 +111,20 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz
|
|||||||
news <- key
|
news <- key
|
||||||
}
|
}
|
||||||
|
|
||||||
*slice = append(*slice, s)
|
sz := s.MemorySize()
|
||||||
|
countFull := maxCount > 0 && len(*slice) >= maxCount
|
||||||
if len(*slice) == batchsize {
|
memFull := maxBytes > 0 && bufBytes[key]+sz > maxBytes && len(*slice) > 0
|
||||||
|
if countFull || memFull {
|
||||||
outputs[key].Push(MakeBioSequenceBatch(source, orders[key], *slice))
|
outputs[key].Push(MakeBioSequenceBatch(source, orders[key], *slice))
|
||||||
orders[key]++
|
orders[key]++
|
||||||
s := obiseq.MakeBioSequenceSlice()
|
s := obiseq.MakeBioSequenceSlice()
|
||||||
slices[key] = &s
|
slices[key] = &s
|
||||||
|
slice = &s
|
||||||
|
bufBytes[key] = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
*slice = append(*slice, s)
|
||||||
|
bufBytes[key] += sz
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package obiiter
|
|||||||
import (
|
import (
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -70,7 +71,7 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
|
|||||||
}
|
}
|
||||||
go f(iterator)
|
go f(iterator)
|
||||||
|
|
||||||
return newiter.SortBatches().Rebatch(size)
|
return newiter.SortBatches().RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
|
||||||
}
|
}
|
||||||
|
|
||||||
return ifrg
|
return ifrg
|
||||||
|
|||||||
281
pkg/obikmer/entropy.go
Normal file
281
pkg/obikmer/entropy.go
Normal file
@@ -0,0 +1,281 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import "math"
|
||||||
|
|
||||||
|
// KmerEntropy computes the entropy of a single encoded k-mer.
|
||||||
|
//
|
||||||
|
// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer
|
||||||
|
// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
|
||||||
|
// normalizes them by circular canonical form, counts their frequencies, and
|
||||||
|
// computes Shannon entropy normalized by the maximum possible entropy.
|
||||||
|
// The returned value is the minimum entropy across all word sizes.
|
||||||
|
//
|
||||||
|
// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
|
||||||
|
// while a value close to 1 indicates high complexity.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - kmer: the encoded k-mer (2 bits per base)
|
||||||
|
// - k: the k-mer size
|
||||||
|
// - levelMax: maximum sub-word size for entropy (typically 6)
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - minimum normalized entropy across all word sizes 1..levelMax
|
||||||
|
func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
|
||||||
|
if k < 1 || levelMax < 1 {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
if levelMax >= k {
|
||||||
|
levelMax = k - 1
|
||||||
|
}
|
||||||
|
if levelMax < 1 {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode k-mer to DNA sequence
|
||||||
|
var seqBuf [32]byte
|
||||||
|
seq := DecodeKmer(kmer, k, seqBuf[:])
|
||||||
|
|
||||||
|
// Pre-compute nLogN lookup (same as lowmask)
|
||||||
|
nLogN := make([]float64, k+1)
|
||||||
|
for i := 1; i <= k; i++ {
|
||||||
|
nLogN[i] = float64(i) * math.Log(float64(i))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build circular-canonical normalization tables per word size
|
||||||
|
normTables := make([][]int, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
size := 1 << (ws * 2)
|
||||||
|
normTables[ws] = make([]int, size)
|
||||||
|
for code := 0; code < size; code++ {
|
||||||
|
normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
minEntropy := math.MaxFloat64
|
||||||
|
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
nwords := k - ws + 1
|
||||||
|
if nwords < 1 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count circular-canonical sub-word frequencies
|
||||||
|
tableSize := 1 << (ws * 2)
|
||||||
|
table := make([]int, tableSize)
|
||||||
|
mask := (1 << (ws * 2)) - 1
|
||||||
|
|
||||||
|
wordIndex := 0
|
||||||
|
for i := 0; i < ws-1; i++ {
|
||||||
|
wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
|
||||||
|
wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
|
||||||
|
normWord := normTables[ws][wordIndex]
|
||||||
|
table[normWord]++
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute Shannon entropy
|
||||||
|
floatNwords := float64(nwords)
|
||||||
|
logNwords := math.Log(floatNwords)
|
||||||
|
|
||||||
|
var sumNLogN float64
|
||||||
|
for j := 0; j < tableSize; j++ {
|
||||||
|
n := table[j]
|
||||||
|
if n > 0 {
|
||||||
|
sumNLogN += nLogN[n]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute emax (maximum possible entropy for this word size)
|
||||||
|
na := CanonicalCircularKmerCount(ws)
|
||||||
|
var emax float64
|
||||||
|
if nwords < na {
|
||||||
|
emax = math.Log(float64(nwords))
|
||||||
|
} else {
|
||||||
|
cov := nwords / na
|
||||||
|
remains := nwords - (na * cov)
|
||||||
|
f1 := float64(cov) / floatNwords
|
||||||
|
f2 := float64(cov+1) / floatNwords
|
||||||
|
emax = -(float64(na-remains)*f1*math.Log(f1) +
|
||||||
|
float64(remains)*f2*math.Log(f2))
|
||||||
|
}
|
||||||
|
|
||||||
|
if emax <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
entropy := (logNwords - sumNLogN/floatNwords) / emax
|
||||||
|
if entropy < 0 {
|
||||||
|
entropy = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
if entropy < minEntropy {
|
||||||
|
minEntropy = entropy
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if minEntropy == math.MaxFloat64 {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
|
||||||
|
return math.Round(minEntropy*10000) / 10000
|
||||||
|
}
|
||||||
|
|
||||||
|
// KmerEntropyFilter is a reusable entropy filter for batch processing.
|
||||||
|
// It pre-computes normalization tables and lookup values to avoid repeated
|
||||||
|
// allocation across millions of k-mers.
|
||||||
|
//
|
||||||
|
// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
|
||||||
|
// Each goroutine must create its own instance via NewKmerEntropyFilter.
|
||||||
|
type KmerEntropyFilter struct {
|
||||||
|
k int
|
||||||
|
levelMax int
|
||||||
|
threshold float64
|
||||||
|
nLogN []float64
|
||||||
|
normTables [][]int
|
||||||
|
emaxValues []float64
|
||||||
|
logNwords []float64
|
||||||
|
// Pre-allocated frequency tables reused across Entropy() calls.
|
||||||
|
// One per word size (index 0 unused). Reset to zero before each use.
|
||||||
|
freqTables [][]int
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - k: the k-mer size
|
||||||
|
// - levelMax: maximum sub-word size for entropy (typically 6)
|
||||||
|
// - threshold: entropy threshold (k-mers with entropy <= threshold are rejected)
|
||||||
|
func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
|
||||||
|
if levelMax >= k {
|
||||||
|
levelMax = k - 1
|
||||||
|
}
|
||||||
|
if levelMax < 1 {
|
||||||
|
levelMax = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
nLogN := make([]float64, k+1)
|
||||||
|
for i := 1; i <= k; i++ {
|
||||||
|
nLogN[i] = float64(i) * math.Log(float64(i))
|
||||||
|
}
|
||||||
|
|
||||||
|
normTables := make([][]int, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
size := 1 << (ws * 2)
|
||||||
|
normTables[ws] = make([]int, size)
|
||||||
|
for code := 0; code < size; code++ {
|
||||||
|
normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
emaxValues := make([]float64, levelMax+1)
|
||||||
|
logNwords := make([]float64, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
nw := k - ws + 1
|
||||||
|
na := CanonicalCircularKmerCount(ws)
|
||||||
|
if nw < na {
|
||||||
|
logNwords[ws] = math.Log(float64(nw))
|
||||||
|
emaxValues[ws] = math.Log(float64(nw))
|
||||||
|
} else {
|
||||||
|
cov := nw / na
|
||||||
|
remains := nw - (na * cov)
|
||||||
|
f1 := float64(cov) / float64(nw)
|
||||||
|
f2 := float64(cov+1) / float64(nw)
|
||||||
|
logNwords[ws] = math.Log(float64(nw))
|
||||||
|
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
|
||||||
|
float64(remains)*f2*math.Log(f2))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pre-allocate frequency tables per word size
|
||||||
|
freqTables := make([][]int, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
freqTables[ws] = make([]int, 1<<(ws*2))
|
||||||
|
}
|
||||||
|
|
||||||
|
return &KmerEntropyFilter{
|
||||||
|
k: k,
|
||||||
|
levelMax: levelMax,
|
||||||
|
threshold: threshold,
|
||||||
|
nLogN: nLogN,
|
||||||
|
normTables: normTables,
|
||||||
|
emaxValues: emaxValues,
|
||||||
|
logNwords: logNwords,
|
||||||
|
freqTables: freqTables,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Accept returns true if the k-mer has entropy strictly above the threshold.
|
||||||
|
// Low-complexity k-mers (entropy <= threshold) are rejected.
|
||||||
|
func (ef *KmerEntropyFilter) Accept(kmer uint64) bool {
|
||||||
|
return ef.Entropy(kmer) > ef.threshold
|
||||||
|
}
|
||||||
|
|
||||||
|
// Entropy computes the entropy for a single k-mer using pre-computed tables.
|
||||||
|
func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
|
||||||
|
k := ef.k
|
||||||
|
|
||||||
|
// Decode k-mer to DNA sequence
|
||||||
|
var seqBuf [32]byte
|
||||||
|
seq := DecodeKmer(kmer, k, seqBuf[:])
|
||||||
|
|
||||||
|
minEntropy := math.MaxFloat64
|
||||||
|
|
||||||
|
for ws := 1; ws <= ef.levelMax; ws++ {
|
||||||
|
nwords := k - ws + 1
|
||||||
|
if nwords < 1 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
emax := ef.emaxValues[ws]
|
||||||
|
if emax <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count circular-canonical sub-word frequencies
|
||||||
|
tableSize := 1 << (ws * 2)
|
||||||
|
table := ef.freqTables[ws]
|
||||||
|
clear(table) // reset to zero
|
||||||
|
mask := (1 << (ws * 2)) - 1
|
||||||
|
normTable := ef.normTables[ws]
|
||||||
|
|
||||||
|
wordIndex := 0
|
||||||
|
for i := 0; i < ws-1; i++ {
|
||||||
|
wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
|
||||||
|
wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
|
||||||
|
normWord := normTable[wordIndex]
|
||||||
|
table[normWord]++
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute Shannon entropy
|
||||||
|
floatNwords := float64(nwords)
|
||||||
|
logNwords := ef.logNwords[ws]
|
||||||
|
|
||||||
|
var sumNLogN float64
|
||||||
|
for j := 0; j < tableSize; j++ {
|
||||||
|
n := table[j]
|
||||||
|
if n > 0 {
|
||||||
|
sumNLogN += ef.nLogN[n]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
entropy := (logNwords - sumNLogN/floatNwords) / emax
|
||||||
|
if entropy < 0 {
|
||||||
|
entropy = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
if entropy < minEntropy {
|
||||||
|
minEntropy = entropy
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if minEntropy == math.MaxFloat64 {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
|
||||||
|
return math.Round(minEntropy*10000) / 10000
|
||||||
|
}
|
||||||
@@ -1,310 +0,0 @@
|
|||||||
package obikmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
||||||
)
|
|
||||||
|
|
||||||
// FrequencyFilter filters k-mers by minimum frequency
|
|
||||||
// Specialization of KmerSetGroup where index[i] contains k-mers seen at least i+1 times
|
|
||||||
type FrequencyFilter struct {
|
|
||||||
*KmerSetGroup // Group of KmerSet (one per frequency level)
|
|
||||||
MinFreq int // v - minimum required frequency
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewFrequencyFilter creates a new frequency filter
|
|
||||||
// minFreq: minimum number d'occurrences required (v)
|
|
||||||
func NewFrequencyFilter(k, minFreq int) *FrequencyFilter {
|
|
||||||
ff := &FrequencyFilter{
|
|
||||||
KmerSetGroup: NewKmerSetGroup(k, minFreq),
|
|
||||||
MinFreq: minFreq,
|
|
||||||
}
|
|
||||||
|
|
||||||
// Initialize group metadata
|
|
||||||
ff.SetAttribute("type", "FrequencyFilter")
|
|
||||||
ff.SetAttribute("min_freq", minFreq)
|
|
||||||
|
|
||||||
// Initialize metadata for each level
|
|
||||||
for i := 0; i < minFreq; i++ {
|
|
||||||
level := ff.Get(i)
|
|
||||||
level.SetAttribute("level", i)
|
|
||||||
level.SetAttribute("min_occurrences", i+1)
|
|
||||||
level.SetId(fmt.Sprintf("level_%d", i))
|
|
||||||
}
|
|
||||||
|
|
||||||
return ff
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddSequence adds all k-mers from a sequence to the filter
|
|
||||||
// Uses an iterator to avoid allocating an intermediate vector
|
|
||||||
func (ff *FrequencyFilter) AddSequence(seq *obiseq.BioSequence) {
|
|
||||||
rawSeq := seq.Sequence()
|
|
||||||
for canonical := range IterCanonicalKmers(rawSeq, ff.K()) {
|
|
||||||
ff.AddKmerCode(canonical)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddKmerCode adds an encoded k-mer to the filter (main algorithm)
|
|
||||||
func (ff *FrequencyFilter) AddKmerCode(kmer uint64) {
|
|
||||||
// Find the current level of the k-mer
|
|
||||||
c := 0
|
|
||||||
for c < ff.MinFreq && ff.Get(c).Contains(kmer) {
|
|
||||||
c++
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add to next level (if not yet at maximum)
|
|
||||||
if c < ff.MinFreq {
|
|
||||||
ff.Get(c).AddKmerCode(kmer)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddCanonicalKmerCode adds an encoded canonical k-mer to the filter
|
|
||||||
func (ff *FrequencyFilter) AddCanonicalKmerCode(kmer uint64) {
|
|
||||||
canonical := CanonicalKmer(kmer, ff.K())
|
|
||||||
ff.AddKmerCode(canonical)
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddKmer adds a k-mer to the filter by encoding the sequence
|
|
||||||
// The sequence must have exactly k nucleotides
|
|
||||||
// Zero-allocation: encodes directly without creating an intermediate slice
|
|
||||||
func (ff *FrequencyFilter) AddKmer(seq []byte) {
|
|
||||||
kmer := EncodeKmer(seq, ff.K())
|
|
||||||
ff.AddKmerCode(kmer)
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddCanonicalKmer adds a canonical k-mer to the filter by encoding the sequence
|
|
||||||
// The sequence must have exactly k nucleotides
|
|
||||||
// Zero-allocation: encodes directly in canonical form without creating an intermediate slice
|
|
||||||
func (ff *FrequencyFilter) AddCanonicalKmer(seq []byte) {
|
|
||||||
canonical := EncodeCanonicalKmer(seq, ff.K())
|
|
||||||
ff.AddKmerCode(canonical)
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetFilteredSet returns a KmerSet of k-mers with frequency ≥ minFreq
|
|
||||||
func (ff *FrequencyFilter) GetFilteredSet() *KmerSet {
|
|
||||||
// Filtered k-mers are in the last level
|
|
||||||
return ff.Get(ff.MinFreq - 1).Copy()
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetKmersAtLevel returns a KmerSet of k-mers seen at least (level+1) times
|
|
||||||
// level doit être dans [0, minFreq-1]
|
|
||||||
func (ff *FrequencyFilter) GetKmersAtLevel(level int) *KmerSet {
|
|
||||||
ks := ff.Get(level)
|
|
||||||
if ks == nil {
|
|
||||||
return NewKmerSet(ff.K())
|
|
||||||
}
|
|
||||||
return ks.Copy()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stats returns statistics on frequency levels
|
|
||||||
func (ff *FrequencyFilter) Stats() FrequencyFilterStats {
|
|
||||||
stats := FrequencyFilterStats{
|
|
||||||
MinFreq: ff.MinFreq,
|
|
||||||
Levels: make([]LevelStats, ff.MinFreq),
|
|
||||||
}
|
|
||||||
|
|
||||||
for i := 0; i < ff.MinFreq; i++ {
|
|
||||||
ks := ff.Get(i)
|
|
||||||
card := ks.Len()
|
|
||||||
sizeBytes := ks.MemoryUsage()
|
|
||||||
|
|
||||||
stats.Levels[i] = LevelStats{
|
|
||||||
Level: i + 1, // Level 1 = freq ≥ 1
|
|
||||||
Cardinality: card,
|
|
||||||
SizeBytes: sizeBytes,
|
|
||||||
}
|
|
||||||
|
|
||||||
stats.TotalBytes += sizeBytes
|
|
||||||
}
|
|
||||||
|
|
||||||
// The last level contains the result
|
|
||||||
stats.FilteredKmers = stats.Levels[ff.MinFreq-1].Cardinality
|
|
||||||
|
|
||||||
return stats
|
|
||||||
}
|
|
||||||
|
|
||||||
// FrequencyFilterStats contains the filter statistics
|
|
||||||
type FrequencyFilterStats struct {
|
|
||||||
MinFreq int
|
|
||||||
FilteredKmers uint64 // K-mers with freq ≥ minFreq
|
|
||||||
TotalBytes uint64 // Total memory used
|
|
||||||
Levels []LevelStats
|
|
||||||
}
|
|
||||||
|
|
||||||
// LevelStats contains the stats of a level
|
|
||||||
type LevelStats struct {
|
|
||||||
Level int // freq ≥ Level
|
|
||||||
Cardinality uint64 // Number of k-mers
|
|
||||||
SizeBytes uint64 // Size in bytes
|
|
||||||
}
|
|
||||||
|
|
||||||
func (ffs FrequencyFilterStats) String() string {
|
|
||||||
result := fmt.Sprintf(`Frequency Filter Statistics (minFreq=%d):
|
|
||||||
Filtered k-mers (freq≥%d): %d
|
|
||||||
Total memory: %.2f MB
|
|
||||||
|
|
||||||
Level breakdown:
|
|
||||||
`, ffs.MinFreq, ffs.MinFreq, ffs.FilteredKmers, float64(ffs.TotalBytes)/1024/1024)
|
|
||||||
|
|
||||||
for _, level := range ffs.Levels {
|
|
||||||
result += fmt.Sprintf(" freq≥%d: %d k-mers (%.2f MB)\n",
|
|
||||||
level.Level,
|
|
||||||
level.Cardinality,
|
|
||||||
float64(level.SizeBytes)/1024/1024)
|
|
||||||
}
|
|
||||||
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clear libère la mémoire de tous les niveaux
|
|
||||||
// (héritée de KmerSetGroup mais redéfinie pour clarté)
|
|
||||||
func (ff *FrequencyFilter) Clear() {
|
|
||||||
ff.KmerSetGroup.Clear()
|
|
||||||
}
|
|
||||||
|
|
||||||
// ==================================
|
|
||||||
// BATCH PROCESSING
|
|
||||||
// ==================================
|
|
||||||
|
|
||||||
// AddSequences adds multiple sequences in batch
|
|
||||||
func (ff *FrequencyFilter) AddSequences(sequences *obiseq.BioSequenceSlice) {
|
|
||||||
for _, seq := range *sequences {
|
|
||||||
ff.AddSequence(seq)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ==================================
|
|
||||||
// PERSISTANCE
|
|
||||||
// ==================================
|
|
||||||
|
|
||||||
// Save sauvegarde le FrequencyFilter dans un répertoire
|
|
||||||
// Utilise le format de sérialisation du KmerSetGroup sous-jacent
|
|
||||||
// Les métadonnées incluent le type "FrequencyFilter" et min_freq
|
|
||||||
//
|
|
||||||
// Format:
|
|
||||||
// - directory/metadata.{toml,yaml,json} - métadonnées du filtre
|
|
||||||
// - directory/set_0.roaring - k-mers vus ≥1 fois
|
|
||||||
// - directory/set_1.roaring - k-mers vus ≥2 fois
|
|
||||||
// - ...
|
|
||||||
// - directory/set_{minFreq-1}.roaring - k-mers vus ≥minFreq fois
|
|
||||||
//
|
|
||||||
// Parameters:
|
|
||||||
// - directory: répertoire de destination
|
|
||||||
// - format: format des métadonnées (FormatTOML, FormatYAML, FormatJSON)
|
|
||||||
//
|
|
||||||
// Example:
|
|
||||||
//
|
|
||||||
// err := ff.Save("./my_filter", obikmer.FormatTOML)
|
|
||||||
func (ff *FrequencyFilter) Save(directory string, format MetadataFormat) error {
|
|
||||||
// Déléguer à KmerSetGroup qui gère déjà tout
|
|
||||||
return ff.KmerSetGroup.Save(directory, format)
|
|
||||||
}
|
|
||||||
|
|
||||||
// LoadFrequencyFilter charge un FrequencyFilter depuis un répertoire
|
|
||||||
// Vérifie que les métadonnées correspondent à un FrequencyFilter
|
|
||||||
//
|
|
||||||
// Parameters:
|
|
||||||
// - directory: répertoire source
|
|
||||||
//
|
|
||||||
// Returns:
|
|
||||||
// - *FrequencyFilter: le filtre chargé
|
|
||||||
// - error: erreur si le chargement échoue ou si ce n'est pas un FrequencyFilter
|
|
||||||
//
|
|
||||||
// Example:
|
|
||||||
//
|
|
||||||
// ff, err := obikmer.LoadFrequencyFilter("./my_filter")
|
|
||||||
func LoadFrequencyFilter(directory string) (*FrequencyFilter, error) {
|
|
||||||
// Charger le KmerSetGroup
|
|
||||||
ksg, err := LoadKmerSetGroup(directory)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Vérifier que c'est bien un FrequencyFilter
|
|
||||||
if typeAttr, ok := ksg.GetAttribute("type"); !ok || typeAttr != "FrequencyFilter" {
|
|
||||||
return nil, fmt.Errorf("loaded data is not a FrequencyFilter (type=%v)", typeAttr)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Récupérer min_freq
|
|
||||||
minFreqAttr, ok := ksg.GetIntAttribute("min_freq")
|
|
||||||
if !ok {
|
|
||||||
return nil, fmt.Errorf("FrequencyFilter missing min_freq attribute")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Créer le FrequencyFilter
|
|
||||||
ff := &FrequencyFilter{
|
|
||||||
KmerSetGroup: ksg,
|
|
||||||
MinFreq: minFreqAttr,
|
|
||||||
}
|
|
||||||
|
|
||||||
return ff, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// ==================================
|
|
||||||
// UTILITAIRES
|
|
||||||
// ==================================
|
|
||||||
|
|
||||||
// Contains vérifie si un k-mer a atteint la fréquence minimale
|
|
||||||
func (ff *FrequencyFilter) Contains(kmer uint64) bool {
|
|
||||||
canonical := CanonicalKmer(kmer, ff.K())
|
|
||||||
return ff.Get(ff.MinFreq - 1).Contains(canonical)
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetFrequency returns the approximate frequency of a k-mer
|
|
||||||
// Retourne le niveau maximum atteint (freq ≥ niveau)
|
|
||||||
func (ff *FrequencyFilter) GetFrequency(kmer uint64) int {
|
|
||||||
canonical := CanonicalKmer(kmer, ff.K())
|
|
||||||
|
|
||||||
freq := 0
|
|
||||||
for i := 0; i < ff.MinFreq; i++ {
|
|
||||||
if ff.Get(i).Contains(canonical) {
|
|
||||||
freq = i + 1
|
|
||||||
} else {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return freq
|
|
||||||
}
|
|
||||||
|
|
||||||
// Len returns the number of filtered k-mers or at a specific level
|
|
||||||
// Without argument: returns the number of k-mers with freq ≥ minFreq (last level)
|
|
||||||
// With argument level: returns the number of k-mers with freq ≥ (level+1)
|
|
||||||
// Exemple: Len() pour les k-mers filtrés, Len(2) pour freq ≥ 3
|
|
||||||
// (héritée de KmerSetGroup mais redéfinie pour la documentation)
|
|
||||||
func (ff *FrequencyFilter) Len(level ...int) uint64 {
|
|
||||||
return ff.KmerSetGroup.Len(level...)
|
|
||||||
}
|
|
||||||
|
|
||||||
// MemoryUsage returns memory usage in bytes
|
|
||||||
// (héritée de KmerSetGroup mais redéfinie pour clarté)
|
|
||||||
func (ff *FrequencyFilter) MemoryUsage() uint64 {
|
|
||||||
return ff.KmerSetGroup.MemoryUsage()
|
|
||||||
}
|
|
||||||
|
|
||||||
// ==================================
|
|
||||||
// COMPARAISON AVEC D'AUTRES APPROCHES
|
|
||||||
// ==================================
|
|
||||||
|
|
||||||
// CompareWithSimpleMap compare la mémoire avec une simple map
|
|
||||||
func (ff *FrequencyFilter) CompareWithSimpleMap() string {
|
|
||||||
totalKmers := ff.Get(0).Len()
|
|
||||||
|
|
||||||
simpleMapBytes := totalKmers * 24 // ~24 bytes par entrée
|
|
||||||
roaringBytes := ff.MemoryUsage()
|
|
||||||
|
|
||||||
reduction := float64(simpleMapBytes) / float64(roaringBytes)
|
|
||||||
|
|
||||||
return fmt.Sprintf(`Memory Comparison for %d k-mers:
|
|
||||||
Simple map[uint64]uint32: %.2f MB
|
|
||||||
Roaring filter (v=%d): %.2f MB
|
|
||||||
Reduction: %.1fx
|
|
||||||
`,
|
|
||||||
totalKmers,
|
|
||||||
float64(simpleMapBytes)/1024/1024,
|
|
||||||
ff.MinFreq,
|
|
||||||
float64(roaringBytes)/1024/1024,
|
|
||||||
reduction,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
86
pkg/obikmer/kdi_merge.go
Normal file
86
pkg/obikmer/kdi_merge.go
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import "container/heap"
|
||||||
|
|
||||||
|
// mergeItem represents an element in the min-heap for k-way merge.
|
||||||
|
type mergeItem struct {
|
||||||
|
value uint64
|
||||||
|
idx int // index of the reader that produced this value
|
||||||
|
}
|
||||||
|
|
||||||
|
// mergeHeap implements heap.Interface for k-way merge.
|
||||||
|
type mergeHeap []mergeItem
|
||||||
|
|
||||||
|
func (h mergeHeap) Len() int { return len(h) }
|
||||||
|
func (h mergeHeap) Less(i, j int) bool { return h[i].value < h[j].value }
|
||||||
|
func (h mergeHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
|
||||||
|
func (h *mergeHeap) Push(x interface{}) { *h = append(*h, x.(mergeItem)) }
|
||||||
|
func (h *mergeHeap) Pop() interface{} {
|
||||||
|
old := *h
|
||||||
|
n := len(old)
|
||||||
|
x := old[n-1]
|
||||||
|
*h = old[:n-1]
|
||||||
|
return x
|
||||||
|
}
|
||||||
|
|
||||||
|
// KWayMerge performs a k-way merge of multiple sorted KdiReader streams.
|
||||||
|
// For each unique k-mer value, it reports the value and the number of
|
||||||
|
// input streams that contained it (count).
|
||||||
|
type KWayMerge struct {
|
||||||
|
h mergeHeap
|
||||||
|
readers []*KdiReader
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewKWayMerge creates a k-way merge from multiple KdiReaders.
|
||||||
|
// Each reader must produce values in sorted (ascending) order.
|
||||||
|
func NewKWayMerge(readers []*KdiReader) *KWayMerge {
|
||||||
|
m := &KWayMerge{
|
||||||
|
h: make(mergeHeap, 0, len(readers)),
|
||||||
|
readers: readers,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize heap with first value from each reader
|
||||||
|
for i, r := range readers {
|
||||||
|
if v, ok := r.Next(); ok {
|
||||||
|
m.h = append(m.h, mergeItem{value: v, idx: i})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
heap.Init(&m.h)
|
||||||
|
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
// Next returns the next smallest k-mer value, the number of readers
|
||||||
|
// that contained this value (count), and true.
|
||||||
|
// Returns (0, 0, false) when all streams are exhausted.
|
||||||
|
func (m *KWayMerge) Next() (kmer uint64, count int, ok bool) {
|
||||||
|
if len(m.h) == 0 {
|
||||||
|
return 0, 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
minVal := m.h[0].value
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
// Pop all items with the same value
|
||||||
|
for len(m.h) > 0 && m.h[0].value == minVal {
|
||||||
|
item := heap.Pop(&m.h).(mergeItem)
|
||||||
|
count++
|
||||||
|
// Advance that reader
|
||||||
|
if v, ok := m.readers[item.idx].Next(); ok {
|
||||||
|
heap.Push(&m.h, mergeItem{value: v, idx: item.idx})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return minVal, count, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close closes all underlying readers.
|
||||||
|
func (m *KWayMerge) Close() error {
|
||||||
|
var firstErr error
|
||||||
|
for _, r := range m.readers {
|
||||||
|
if err := r.Close(); err != nil && firstErr == nil {
|
||||||
|
firstErr = err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return firstErr
|
||||||
|
}
|
||||||
159
pkg/obikmer/kdi_merge_test.go
Normal file
159
pkg/obikmer/kdi_merge_test.go
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// writeKdi is a helper that writes sorted kmers to a .kdi file.
|
||||||
|
func writeKdi(t *testing.T, dir, name string, kmers []uint64) string {
|
||||||
|
t.Helper()
|
||||||
|
path := filepath.Join(dir, name)
|
||||||
|
w, err := NewKdiWriter(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, v := range kmers {
|
||||||
|
if err := w.Write(v); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := w.Close(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
return path
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKWayMergeBasic(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
// Three sorted streams
|
||||||
|
p1 := writeKdi(t, dir, "a.kdi", []uint64{1, 3, 5, 7})
|
||||||
|
p2 := writeKdi(t, dir, "b.kdi", []uint64{2, 3, 6, 7})
|
||||||
|
p3 := writeKdi(t, dir, "c.kdi", []uint64{3, 4, 7, 8})
|
||||||
|
|
||||||
|
r1, _ := NewKdiReader(p1)
|
||||||
|
r2, _ := NewKdiReader(p2)
|
||||||
|
r3, _ := NewKdiReader(p3)
|
||||||
|
|
||||||
|
m := NewKWayMerge([]*KdiReader{r1, r2, r3})
|
||||||
|
defer m.Close()
|
||||||
|
|
||||||
|
type result struct {
|
||||||
|
kmer uint64
|
||||||
|
count int
|
||||||
|
}
|
||||||
|
var results []result
|
||||||
|
for {
|
||||||
|
kmer, count, ok := m.Next()
|
||||||
|
if !ok {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
results = append(results, result{kmer, count})
|
||||||
|
}
|
||||||
|
|
||||||
|
expected := []result{
|
||||||
|
{1, 1}, {2, 1}, {3, 3}, {4, 1}, {5, 1}, {6, 1}, {7, 3}, {8, 1},
|
||||||
|
}
|
||||||
|
if len(results) != len(expected) {
|
||||||
|
t.Fatalf("got %d results, want %d", len(results), len(expected))
|
||||||
|
}
|
||||||
|
for i, exp := range expected {
|
||||||
|
if results[i] != exp {
|
||||||
|
t.Errorf("result %d: got %+v, want %+v", i, results[i], exp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKWayMergeSingleStream(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
p := writeKdi(t, dir, "a.kdi", []uint64{10, 20, 30})
|
||||||
|
|
||||||
|
r, _ := NewKdiReader(p)
|
||||||
|
m := NewKWayMerge([]*KdiReader{r})
|
||||||
|
defer m.Close()
|
||||||
|
|
||||||
|
vals := []uint64{10, 20, 30}
|
||||||
|
for _, expected := range vals {
|
||||||
|
kmer, count, ok := m.Next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("unexpected EOF")
|
||||||
|
}
|
||||||
|
if kmer != expected || count != 1 {
|
||||||
|
t.Fatalf("got (%d, %d), want (%d, 1)", kmer, count, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_, _, ok := m.Next()
|
||||||
|
if ok {
|
||||||
|
t.Fatal("expected EOF")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKWayMergeEmpty(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
p1 := writeKdi(t, dir, "a.kdi", nil)
|
||||||
|
p2 := writeKdi(t, dir, "b.kdi", nil)
|
||||||
|
|
||||||
|
r1, _ := NewKdiReader(p1)
|
||||||
|
r2, _ := NewKdiReader(p2)
|
||||||
|
|
||||||
|
m := NewKWayMerge([]*KdiReader{r1, r2})
|
||||||
|
defer m.Close()
|
||||||
|
|
||||||
|
_, _, ok := m.Next()
|
||||||
|
if ok {
|
||||||
|
t.Fatal("expected no results from empty streams")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKWayMergeDisjoint(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
p1 := writeKdi(t, dir, "a.kdi", []uint64{1, 2, 3})
|
||||||
|
p2 := writeKdi(t, dir, "b.kdi", []uint64{10, 20, 30})
|
||||||
|
|
||||||
|
r1, _ := NewKdiReader(p1)
|
||||||
|
r2, _ := NewKdiReader(p2)
|
||||||
|
|
||||||
|
m := NewKWayMerge([]*KdiReader{r1, r2})
|
||||||
|
defer m.Close()
|
||||||
|
|
||||||
|
expected := []uint64{1, 2, 3, 10, 20, 30}
|
||||||
|
for _, exp := range expected {
|
||||||
|
kmer, count, ok := m.Next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("unexpected EOF")
|
||||||
|
}
|
||||||
|
if kmer != exp || count != 1 {
|
||||||
|
t.Fatalf("got (%d, %d), want (%d, 1)", kmer, count, exp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKWayMergeAllSame(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
p1 := writeKdi(t, dir, "a.kdi", []uint64{42})
|
||||||
|
p2 := writeKdi(t, dir, "b.kdi", []uint64{42})
|
||||||
|
p3 := writeKdi(t, dir, "c.kdi", []uint64{42})
|
||||||
|
|
||||||
|
r1, _ := NewKdiReader(p1)
|
||||||
|
r2, _ := NewKdiReader(p2)
|
||||||
|
r3, _ := NewKdiReader(p3)
|
||||||
|
|
||||||
|
m := NewKWayMerge([]*KdiReader{r1, r2, r3})
|
||||||
|
defer m.Close()
|
||||||
|
|
||||||
|
kmer, count, ok := m.Next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected one result")
|
||||||
|
}
|
||||||
|
if kmer != 42 || count != 3 {
|
||||||
|
t.Fatalf("got (%d, %d), want (42, 3)", kmer, count)
|
||||||
|
}
|
||||||
|
_, _, ok = m.Next()
|
||||||
|
if ok {
|
||||||
|
t.Fatal("expected EOF")
|
||||||
|
}
|
||||||
|
}
|
||||||
170
pkg/obikmer/kdi_reader.go
Normal file
170
pkg/obikmer/kdi_reader.go
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"encoding/binary"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
)
|
||||||
|
|
||||||
|
// KdiReader reads k-mers from a .kdi file using streaming delta-varint decoding.
// Values are decoded lazily: the first k-mer is an absolute uint64, each
// subsequent one is reconstructed as prev + varint delta.
// The struct holds mutable decoding state and performs no locking, so a
// single KdiReader must not be shared between goroutines without external
// synchronization.
type KdiReader struct {
	r       *bufio.Reader
	file    *os.File
	count   uint64     // total number of k-mers
	read    uint64     // number of k-mers already consumed
	prev    uint64     // last decoded value
	started bool       // whether first value has been read
	index   *KdxIndex  // optional sparse index for seeking; nil disables SeekTo
}
|
||||||
|
|
||||||
|
// NewKdiReader opens a .kdi file for streaming reading (no index).
// Use NewKdiIndexedReader instead when SeekTo support is wanted.
func NewKdiReader(path string) (*KdiReader, error) {
	return openKdiReader(path, nil)
}
|
||||||
|
|
||||||
|
// NewKdiIndexedReader opens a .kdi file with its companion .kdx index
|
||||||
|
// loaded for fast seeking. If the .kdx file does not exist, it gracefully
|
||||||
|
// falls back to sequential reading.
|
||||||
|
func NewKdiIndexedReader(path string) (*KdiReader, error) {
|
||||||
|
kdxPath := KdxPathForKdi(path)
|
||||||
|
idx, err := LoadKdxIndex(kdxPath)
|
||||||
|
if err != nil {
|
||||||
|
// Index load failed — fall back to non-indexed
|
||||||
|
return openKdiReader(path, nil)
|
||||||
|
}
|
||||||
|
// idx may be nil if file does not exist — that's fine
|
||||||
|
return openKdiReader(path, idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
func openKdiReader(path string, idx *KdxIndex) (*KdiReader, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
r := bufio.NewReaderSize(f, 65536)
|
||||||
|
|
||||||
|
// Read and verify magic
|
||||||
|
var magic [4]byte
|
||||||
|
if _, err := io.ReadFull(r, magic[:]); err != nil {
|
||||||
|
f.Close()
|
||||||
|
return nil, fmt.Errorf("kdi: read magic: %w", err)
|
||||||
|
}
|
||||||
|
if magic != kdiMagic {
|
||||||
|
f.Close()
|
||||||
|
return nil, fmt.Errorf("kdi: bad magic %v", magic)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read count
|
||||||
|
var countBuf [8]byte
|
||||||
|
if _, err := io.ReadFull(r, countBuf[:]); err != nil {
|
||||||
|
f.Close()
|
||||||
|
return nil, fmt.Errorf("kdi: read count: %w", err)
|
||||||
|
}
|
||||||
|
count := binary.LittleEndian.Uint64(countBuf[:])
|
||||||
|
|
||||||
|
return &KdiReader{
|
||||||
|
r: r,
|
||||||
|
file: f,
|
||||||
|
count: count,
|
||||||
|
index: idx,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Next returns the next k-mer and true, or (0, false) when exhausted.
|
||||||
|
func (kr *KdiReader) Next() (uint64, bool) {
|
||||||
|
if kr.read >= kr.count {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
if !kr.started {
|
||||||
|
// Read first value as absolute uint64 LE
|
||||||
|
var buf [8]byte
|
||||||
|
if _, err := io.ReadFull(kr.r, buf[:]); err != nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
kr.prev = binary.LittleEndian.Uint64(buf[:])
|
||||||
|
kr.started = true
|
||||||
|
kr.read++
|
||||||
|
return kr.prev, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read delta varint
|
||||||
|
delta, err := DecodeVarint(kr.r)
|
||||||
|
if err != nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
kr.prev += delta
|
||||||
|
kr.read++
|
||||||
|
return kr.prev, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// SeekTo positions the reader near the target k-mer using the sparse .kdx index.
// After SeekTo, the reader is positioned so that the next call to Next()
// returns the k-mer immediately after the indexed entry at or before target.
//
// If the reader has no index, or the target is before the current position,
// SeekTo does nothing (linear scan continues from current position).
// SeekTo never returns an error in the "no useful jump" cases; only an
// actual file-seek failure is reported.
func (kr *KdiReader) SeekTo(target uint64) error {
	// No index loaded: sequential scanning is the only option.
	if kr.index == nil {
		return nil
	}

	// The stream is forward-only: if we've already reached or passed the
	// target value, there is nothing useful to do.
	if kr.started && kr.prev >= target {
		return nil
	}

	offset, skipCount, ok := kr.index.FindOffset(target)
	if !ok {
		// Target precedes the first index entry — keep scanning linearly.
		return nil
	}

	// skipCount is the number of k-mers consumed at the indexed position.
	// The index was recorded AFTER writing the k-mer at position skipCount-1
	// (since count%stride==0 after incrementing count). So the actual number
	// of k-mers consumed is skipCount (the entry's kmer is the last one
	// before the offset).

	// Only seek if it would actually skip past our current position;
	// seeking backwards (or to where we already are) would be wasted work.
	if kr.started && skipCount <= kr.read {
		return nil
	}

	// The index entry stores (kmer_value, byte_offset_after_that_kmer).
	// skipCount = (entryIdx+1)*stride, so entryIdx = skipCount/stride - 1.
	// We seek to that offset, set prev = indexedKmer, and the next Next()
	// call will read the delta-varint of the following k-mer.
	entryIdx := int(skipCount)/kr.index.stride - 1
	if entryIdx < 0 || entryIdx >= len(kr.index.entries) {
		// Defensive bounds check against a corrupt/mismatched index.
		return nil
	}
	indexedKmer := kr.index.entries[entryIdx].kmer

	if _, err := kr.file.Seek(int64(offset), io.SeekStart); err != nil {
		return fmt.Errorf("kdi: seek: %w", err)
	}
	// Discard any buffered bytes that belong to the pre-seek position.
	kr.r.Reset(kr.file)

	// Re-establish decoder state as if we had linearly read skipCount k-mers.
	kr.prev = indexedKmer
	kr.started = true
	kr.read = skipCount

	return nil
}
|
||||||
|
|
||||||
|
// Count returns the total number of k-mers in this partition,
// as recorded in the file header.
func (kr *KdiReader) Count() uint64 {
	return kr.count
}

// Remaining returns how many k-mers have not been read yet.
// It decreases by one on every successful Next() call.
func (kr *KdiReader) Remaining() uint64 {
	return kr.count - kr.read
}
|
||||||
|
|
||||||
|
// Close closes the underlying file. The reader must not be used afterwards.
func (kr *KdiReader) Close() error {
	return kr.file.Close()
}
|
||||||
255
pkg/obikmer/kdi_test.go
Normal file
255
pkg/obikmer/kdi_test.go
Normal file
@@ -0,0 +1,255 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestKdiRoundTrip(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "test.kdi")
|
||||||
|
|
||||||
|
// Sorted k-mer values
|
||||||
|
kmers := []uint64{10, 20, 30, 100, 200, 500, 10000, 1 << 40, 1<<62 - 1}
|
||||||
|
|
||||||
|
w, err := NewKdiWriter(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, v := range kmers {
|
||||||
|
if err := w.Write(v); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if w.Count() != uint64(len(kmers)) {
|
||||||
|
t.Fatalf("writer count: got %d, want %d", w.Count(), len(kmers))
|
||||||
|
}
|
||||||
|
if err := w.Close(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read back
|
||||||
|
r, err := NewKdiReader(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer r.Close()
|
||||||
|
|
||||||
|
if r.Count() != uint64(len(kmers)) {
|
||||||
|
t.Fatalf("reader count: got %d, want %d", r.Count(), len(kmers))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, expected := range kmers {
|
||||||
|
got, ok := r.Next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("unexpected EOF at index %d", i)
|
||||||
|
}
|
||||||
|
if got != expected {
|
||||||
|
t.Fatalf("kmer %d: got %d, want %d", i, got, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
_, ok := r.Next()
|
||||||
|
if ok {
|
||||||
|
t.Fatal("expected EOF after all k-mers")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKdiEmpty(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "empty.kdi")
|
||||||
|
|
||||||
|
w, err := NewKdiWriter(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := w.Close(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
r, err := NewKdiReader(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer r.Close()
|
||||||
|
|
||||||
|
if r.Count() != 0 {
|
||||||
|
t.Fatalf("expected count 0, got %d", r.Count())
|
||||||
|
}
|
||||||
|
|
||||||
|
_, ok := r.Next()
|
||||||
|
if ok {
|
||||||
|
t.Fatal("expected no k-mers in empty file")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKdiSingleValue(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "single.kdi")
|
||||||
|
|
||||||
|
w, err := NewKdiWriter(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := w.Write(42); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := w.Close(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
r, err := NewKdiReader(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer r.Close()
|
||||||
|
|
||||||
|
if r.Count() != 1 {
|
||||||
|
t.Fatalf("expected count 1, got %d", r.Count())
|
||||||
|
}
|
||||||
|
|
||||||
|
v, ok := r.Next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected one k-mer")
|
||||||
|
}
|
||||||
|
if v != 42 {
|
||||||
|
t.Fatalf("got %d, want 42", v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKdiFileSize(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "size.kdi")
|
||||||
|
|
||||||
|
// Write: magic(4) + count(8) + first(8) = 20 bytes
|
||||||
|
w, err := NewKdiWriter(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := w.Write(0); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := w.Close(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
info, err := os.Stat(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
// magic(4) + count(8) + first(8) = 20
|
||||||
|
if info.Size() != 20 {
|
||||||
|
t.Fatalf("file size: got %d, want 20", info.Size())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKdiDeltaCompression(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "delta.kdi")
|
||||||
|
|
||||||
|
// Dense consecutive values should compress well
|
||||||
|
n := 10000
|
||||||
|
kmers := make([]uint64, n)
|
||||||
|
for i := range kmers {
|
||||||
|
kmers[i] = uint64(i * 2) // even numbers
|
||||||
|
}
|
||||||
|
|
||||||
|
w, err := NewKdiWriter(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, v := range kmers {
|
||||||
|
if err := w.Write(v); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := w.Close(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Each delta is 2, encoded as 1 byte varint
|
||||||
|
// Total: magic(4) + count(8) + first(8) + (n-1)*1 = 20 + 9999 bytes
|
||||||
|
info, err := os.Stat(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
expected := int64(20 + n - 1)
|
||||||
|
if info.Size() != expected {
|
||||||
|
t.Fatalf("file size: got %d, want %d", info.Size(), expected)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify round-trip
|
||||||
|
r, err := NewKdiReader(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer r.Close()
|
||||||
|
|
||||||
|
for i, expected := range kmers {
|
||||||
|
got, ok := r.Next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("unexpected EOF at index %d", i)
|
||||||
|
}
|
||||||
|
if got != expected {
|
||||||
|
t.Fatalf("kmer %d: got %d, want %d", i, got, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKdiFromRealKmers(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "real.kdi")
|
||||||
|
|
||||||
|
// Extract k-mers from a sequence, sort, dedup, write to KDI
|
||||||
|
seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
|
||||||
|
k := 15
|
||||||
|
|
||||||
|
var kmers []uint64
|
||||||
|
for kmer := range IterCanonicalKmers(seq, k) {
|
||||||
|
kmers = append(kmers, kmer)
|
||||||
|
}
|
||||||
|
sort.Slice(kmers, func(i, j int) bool { return kmers[i] < kmers[j] })
|
||||||
|
// Dedup
|
||||||
|
deduped := kmers[:0]
|
||||||
|
for i, v := range kmers {
|
||||||
|
if i == 0 || v != kmers[i-1] {
|
||||||
|
deduped = append(deduped, v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
w, err := NewKdiWriter(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, v := range deduped {
|
||||||
|
if err := w.Write(v); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := w.Close(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read back and verify
|
||||||
|
r, err := NewKdiReader(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer r.Close()
|
||||||
|
|
||||||
|
if r.Count() != uint64(len(deduped)) {
|
||||||
|
t.Fatalf("count: got %d, want %d", r.Count(), len(deduped))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, expected := range deduped {
|
||||||
|
got, ok := r.Next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("unexpected EOF at index %d", i)
|
||||||
|
}
|
||||||
|
if got != expected {
|
||||||
|
t.Fatalf("kmer %d: got %d, want %d", i, got, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
151
pkg/obikmer/kdi_writer.go
Normal file
151
pkg/obikmer/kdi_writer.go
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
	"bufio"
	"encoding/binary"
	"fmt"
	"os"
)
|
||||||
|
|
||||||
|
// KDI file magic bytes: "KDI\x01"
var kdiMagic = [4]byte{'K', 'D', 'I', 0x01}

// kdiHeaderSize is the size of the KDI header: magic(4) + count(8) = 12 bytes.
// Data-section byte offsets recorded in the .kdx index are computed relative
// to this header (see KdiWriter.Write).
const kdiHeaderSize = 12
|
||||||
|
|
||||||
|
// KdiWriter writes a sorted sequence of uint64 k-mers to a .kdi file
// using delta-varint encoding.
//
// Format:
//
//	[magic: 4 bytes "KDI\x01"]
//	[count: uint64 LE]   number of k-mers
//	[first: uint64 LE]   first k-mer (absolute value)
//	[delta_1: varint]    arr[1] - arr[0]
//	[delta_2: varint]    arr[2] - arr[1]
//	...
//
// The caller must write k-mers in strictly increasing order.
//
// On Close(), a companion .kdx sparse index file is written alongside
// the .kdi file for fast random access.
type KdiWriter struct {
	w            *bufio.Writer
	file         *os.File
	count        uint64     // number of k-mers written so far
	prev         uint64     // last k-mer written; basis for the next delta
	first        bool       // true until the first (absolute) value is written
	path         string     // destination .kdi path; used to derive the .kdx path
	bytesWritten uint64     // bytes written after header (data section offset)
	indexEntries []kdxEntry // sparse index entries collected during writes
}
|
||||||
|
|
||||||
|
// NewKdiWriter creates a new KdiWriter writing to the given file path.
|
||||||
|
// The header (magic + count placeholder) is written immediately.
|
||||||
|
// Count is patched on Close().
|
||||||
|
func NewKdiWriter(path string) (*KdiWriter, error) {
|
||||||
|
f, err := os.Create(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
w := bufio.NewWriterSize(f, 65536)
|
||||||
|
|
||||||
|
// Write magic
|
||||||
|
if _, err := w.Write(kdiMagic[:]); err != nil {
|
||||||
|
f.Close()
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
// Write placeholder for count (will be patched on Close)
|
||||||
|
var countBuf [8]byte
|
||||||
|
if _, err := w.Write(countBuf[:]); err != nil {
|
||||||
|
f.Close()
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return &KdiWriter{
|
||||||
|
w: w,
|
||||||
|
file: f,
|
||||||
|
first: true,
|
||||||
|
path: path,
|
||||||
|
bytesWritten: 0,
|
||||||
|
indexEntries: make([]kdxEntry, 0, 256),
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write adds a k-mer to the file. K-mers must be written in strictly
|
||||||
|
// increasing order.
|
||||||
|
func (kw *KdiWriter) Write(kmer uint64) error {
|
||||||
|
if kw.first {
|
||||||
|
// Write first value as absolute uint64 LE
|
||||||
|
var buf [8]byte
|
||||||
|
binary.LittleEndian.PutUint64(buf[:], kmer)
|
||||||
|
if _, err := kw.w.Write(buf[:]); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
kw.bytesWritten += 8
|
||||||
|
kw.prev = kmer
|
||||||
|
kw.first = false
|
||||||
|
} else {
|
||||||
|
delta := kmer - kw.prev
|
||||||
|
n, err := EncodeVarint(kw.w, delta)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
kw.bytesWritten += uint64(n)
|
||||||
|
kw.prev = kmer
|
||||||
|
}
|
||||||
|
kw.count++
|
||||||
|
|
||||||
|
// Record sparse index entry every defaultKdxStride k-mers.
|
||||||
|
// The offset recorded is AFTER writing this k-mer, so it points to
|
||||||
|
// where the next k-mer's data will start. SeekTo uses this: it seeks
|
||||||
|
// to the recorded offset, sets prev = indexedKmer, and Next() reads
|
||||||
|
// the delta of the following k-mer.
|
||||||
|
if kw.count%defaultKdxStride == 0 {
|
||||||
|
kw.indexEntries = append(kw.indexEntries, kdxEntry{
|
||||||
|
kmer: kmer,
|
||||||
|
offset: kdiHeaderSize + kw.bytesWritten,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count returns the number of k-mers written so far.
// This is the value that Close() patches into the file header.
func (kw *KdiWriter) Count() uint64 {
	return kw.count
}
|
||||||
|
|
||||||
|
// Close flushes buffered data, patches the count in the header,
|
||||||
|
// writes the companion .kdx index file, and closes the file.
|
||||||
|
func (kw *KdiWriter) Close() error {
|
||||||
|
if err := kw.w.Flush(); err != nil {
|
||||||
|
kw.file.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Patch count at offset 4 (after magic)
|
||||||
|
if _, err := kw.file.Seek(4, 0); err != nil {
|
||||||
|
kw.file.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
var countBuf [8]byte
|
||||||
|
binary.LittleEndian.PutUint64(countBuf[:], kw.count)
|
||||||
|
if _, err := kw.file.Write(countBuf[:]); err != nil {
|
||||||
|
kw.file.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := kw.file.Close(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write .kdx index file if there are entries to index
|
||||||
|
if len(kw.indexEntries) > 0 {
|
||||||
|
kdxPath := KdxPathForKdi(kw.path)
|
||||||
|
if err := WriteKdxIndex(kdxPath, defaultKdxStride, kw.indexEntries); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
170
pkg/obikmer/kdx.go
Normal file
170
pkg/obikmer/kdx.go
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/binary"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// KDX file magic bytes: "KDX\x01"
var kdxMagic = [4]byte{'K', 'D', 'X', 0x01}

// defaultKdxStride is the number of k-mers between consecutive index entries.
// A larger stride shrinks the index at the cost of a longer linear scan
// after each seek.
const defaultKdxStride = 4096

// kdxEntry is a single entry in the sparse index: the absolute k-mer value
// and the byte offset in the corresponding .kdi file where that k-mer is stored.
// The offset is recorded just past the entry's k-mer, ready for decoding the
// next delta (see KdiWriter.Write).
type kdxEntry struct {
	kmer   uint64
	offset uint64 // absolute byte offset in .kdi file
}
|
||||||
|
|
||||||
|
// KdxIndex is a sparse, in-memory index for a .kdi file.
// It stores one entry every `stride` k-mers, enabling O(log N / stride)
// binary search followed by at most `stride` linear scan steps.
type KdxIndex struct {
	stride  int        // k-mers between consecutive entries
	entries []kdxEntry // in ascending kmer order (written as k-mers increase)
}
|
||||||
|
|
||||||
|
// LoadKdxIndex reads a .kdx file into memory.
|
||||||
|
// Returns (nil, nil) if the file does not exist (graceful degradation).
|
||||||
|
func LoadKdxIndex(path string) (*KdxIndex, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
// Read magic
|
||||||
|
var magic [4]byte
|
||||||
|
if _, err := io.ReadFull(f, magic[:]); err != nil {
|
||||||
|
return nil, fmt.Errorf("kdx: read magic: %w", err)
|
||||||
|
}
|
||||||
|
if magic != kdxMagic {
|
||||||
|
return nil, fmt.Errorf("kdx: bad magic %v", magic)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read stride (uint32 LE)
|
||||||
|
var buf4 [4]byte
|
||||||
|
if _, err := io.ReadFull(f, buf4[:]); err != nil {
|
||||||
|
return nil, fmt.Errorf("kdx: read stride: %w", err)
|
||||||
|
}
|
||||||
|
stride := int(binary.LittleEndian.Uint32(buf4[:]))
|
||||||
|
|
||||||
|
// Read count (uint32 LE)
|
||||||
|
if _, err := io.ReadFull(f, buf4[:]); err != nil {
|
||||||
|
return nil, fmt.Errorf("kdx: read count: %w", err)
|
||||||
|
}
|
||||||
|
count := int(binary.LittleEndian.Uint32(buf4[:]))
|
||||||
|
|
||||||
|
// Read entries
|
||||||
|
entries := make([]kdxEntry, count)
|
||||||
|
var buf16 [16]byte
|
||||||
|
for i := 0; i < count; i++ {
|
||||||
|
if _, err := io.ReadFull(f, buf16[:]); err != nil {
|
||||||
|
return nil, fmt.Errorf("kdx: read entry %d: %w", i, err)
|
||||||
|
}
|
||||||
|
entries[i] = kdxEntry{
|
||||||
|
kmer: binary.LittleEndian.Uint64(buf16[0:8]),
|
||||||
|
offset: binary.LittleEndian.Uint64(buf16[8:16]),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &KdxIndex{
|
||||||
|
stride: stride,
|
||||||
|
entries: entries,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// FindOffset locates the best starting point in the .kdi file to scan for
// the target k-mer. It returns:
//   - offset: the byte offset in the .kdi file to seek to (positioned after
//     the indexed k-mer, ready to read the next delta)
//   - skipCount: the number of k-mers already consumed at that offset
//     (to set the reader's internal counter)
//   - ok: true if the index provides a useful starting point
//
// Index entries are recorded at k-mer count positions stride, 2*stride, etc.
// Entry i corresponds to the k-mer written at count = (i+1)*stride.
//
// Safe to call on a nil receiver (returns ok == false).
func (idx *KdxIndex) FindOffset(target uint64) (offset uint64, skipCount uint64, ok bool) {
	if idx == nil || len(idx.entries) == 0 {
		return 0, 0, false
	}

	// Binary search: find the largest entry with kmer <= target.
	// sort.Search yields the first index whose predicate is true, i.e.
	// the first entry with kmer > target.
	i := sort.Search(len(idx.entries), func(i int) bool {
		return idx.entries[i].kmer > target
	})
	// i is the first entry with kmer > target, so i-1 is the last with kmer <= target.
	if i == 0 {
		// Target is before the first index entry.
		// No useful jump point — caller should scan from the beginning.
		return 0, 0, false
	}

	i-- // largest entry with kmer <= target
	// Entry i was recorded after writing the k-mer at count = (i+1)*stride,
	// so exactly that many k-mers precede (and include) the indexed one.
	skipCount = uint64(i+1) * uint64(idx.stride)
	return idx.entries[i].offset, skipCount, true
}
|
||||||
|
|
||||||
|
// Stride returns the stride of this index: the number of k-mers between
// consecutive index entries in the underlying .kdi file.
func (idx *KdxIndex) Stride() int {
	return idx.stride
}

// Len returns the number of entries in this index.
func (idx *KdxIndex) Len() int {
	return len(idx.entries)
}
|
||||||
|
|
||||||
|
// WriteKdxIndex writes a .kdx file from a slice of entries.
|
||||||
|
func WriteKdxIndex(path string, stride int, entries []kdxEntry) error {
|
||||||
|
f, err := os.Create(path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
// Magic
|
||||||
|
if _, err := f.Write(kdxMagic[:]); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stride (uint32 LE)
|
||||||
|
var buf4 [4]byte
|
||||||
|
binary.LittleEndian.PutUint32(buf4[:], uint32(stride))
|
||||||
|
if _, err := f.Write(buf4[:]); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count (uint32 LE)
|
||||||
|
binary.LittleEndian.PutUint32(buf4[:], uint32(len(entries)))
|
||||||
|
if _, err := f.Write(buf4[:]); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Entries
|
||||||
|
var buf16 [16]byte
|
||||||
|
for _, e := range entries {
|
||||||
|
binary.LittleEndian.PutUint64(buf16[0:8], e.kmer)
|
||||||
|
binary.LittleEndian.PutUint64(buf16[8:16], e.offset)
|
||||||
|
if _, err := f.Write(buf16[:]); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// KdxPathForKdi returns the .kdx path corresponding to a .kdi path.
// A missing ".kdi" suffix is tolerated: ".kdx" is simply appended.
func KdxPathForKdi(kdiPath string) string {
	const ext = ".kdi"
	base := kdiPath
	if strings.HasSuffix(base, ext) {
		base = base[:len(base)-len(ext)]
	}
	return base + ".kdx"
}
|
||||||
256
pkg/obikmer/kmer_match.go
Normal file
256
pkg/obikmer/kmer_match.go
Normal file
@@ -0,0 +1,256 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"cmp"
|
||||||
|
"slices"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// QueryEntry represents a canonical k-mer to look up, together with
// metadata to trace the result back to the originating sequence and position.
type QueryEntry struct {
	Kmer   uint64 // canonical k-mer value
	SeqIdx int    // index within the batch
	Pos    int    // 1-based position in the sequence
}
|
||||||
|
|
||||||
|
// MatchResult holds matched positions for each sequence in a batch.
// results[i] contains the sorted matched positions for sequence i;
// it is indexed by the same SeqIdx carried in QueryEntry.
type MatchResult [][]int
|
||||||
|
|
||||||
|
// PreparedQueries holds pre-computed query buckets along with the number
|
||||||
|
// of sequences they were built from. This is used by the accumulation
|
||||||
|
// pipeline to merge queries from multiple batches.
|
||||||
|
type PreparedQueries struct {
|
||||||
|
Buckets [][]QueryEntry // queries[partition], each sorted by Kmer
|
||||||
|
NSeqs int // number of sequences that produced these queries
|
||||||
|
NKmers int // total number of k-mer entries across all partitions
|
||||||
|
}
|
||||||
|
|
||||||
|
// MergeQueries merges src into dst, offsetting all SeqIdx values in src
|
||||||
|
// by dst.NSeqs. Both dst and src must have the same number of partitions.
|
||||||
|
// After merging, src should not be reused.
|
||||||
|
//
|
||||||
|
// Each partition's entries are merged in sorted order (merge-sort of two
|
||||||
|
// already-sorted slices).
|
||||||
|
func MergeQueries(dst, src *PreparedQueries) {
|
||||||
|
for p := range dst.Buckets {
|
||||||
|
if len(src.Buckets[p]) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
offset := dst.NSeqs
|
||||||
|
srcB := src.Buckets[p]
|
||||||
|
|
||||||
|
// Offset SeqIdx in src entries
|
||||||
|
for i := range srcB {
|
||||||
|
srcB[i].SeqIdx += offset
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(dst.Buckets[p]) == 0 {
|
||||||
|
dst.Buckets[p] = srcB
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Merge two sorted slices
|
||||||
|
dstB := dst.Buckets[p]
|
||||||
|
merged := make([]QueryEntry, 0, len(dstB)+len(srcB))
|
||||||
|
i, j := 0, 0
|
||||||
|
for i < len(dstB) && j < len(srcB) {
|
||||||
|
if dstB[i].Kmer <= srcB[j].Kmer {
|
||||||
|
merged = append(merged, dstB[i])
|
||||||
|
i++
|
||||||
|
} else {
|
||||||
|
merged = append(merged, srcB[j])
|
||||||
|
j++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
merged = append(merged, dstB[i:]...)
|
||||||
|
merged = append(merged, srcB[j:]...)
|
||||||
|
dst.Buckets[p] = merged
|
||||||
|
}
|
||||||
|
dst.NSeqs += src.NSeqs
|
||||||
|
dst.NKmers += src.NKmers
|
||||||
|
}
|
||||||
|
|
||||||
|
// PrepareQueries extracts all canonical k-mers from a batch of sequences
|
||||||
|
// and groups them by partition using super-kmer minimizers.
|
||||||
|
//
|
||||||
|
// Returns a PreparedQueries with sorted per-partition buckets.
|
||||||
|
func (ksg *KmerSetGroup) PrepareQueries(sequences []*obiseq.BioSequence) *PreparedQueries {
|
||||||
|
P := ksg.partitions
|
||||||
|
k := ksg.k
|
||||||
|
m := ksg.m
|
||||||
|
|
||||||
|
// Pre-allocate partition buckets
|
||||||
|
buckets := make([][]QueryEntry, P)
|
||||||
|
for i := range buckets {
|
||||||
|
buckets[i] = make([]QueryEntry, 0, 64)
|
||||||
|
}
|
||||||
|
|
||||||
|
totalKmers := 0
|
||||||
|
for seqIdx, seq := range sequences {
|
||||||
|
bseq := seq.Sequence()
|
||||||
|
if len(bseq) < k {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Iterate super-kmers to get minimizer → partition mapping
|
||||||
|
for sk := range IterSuperKmers(bseq, k, m) {
|
||||||
|
partition := int(sk.Minimizer % uint64(P))
|
||||||
|
|
||||||
|
// Iterate canonical k-mers within this super-kmer
|
||||||
|
skSeq := sk.Sequence
|
||||||
|
if len(skSeq) < k {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
localPos := 0
|
||||||
|
for kmer := range IterCanonicalKmers(skSeq, k) {
|
||||||
|
buckets[partition] = append(buckets[partition], QueryEntry{
|
||||||
|
Kmer: kmer,
|
||||||
|
SeqIdx: seqIdx,
|
||||||
|
Pos: sk.Start + localPos + 1,
|
||||||
|
})
|
||||||
|
localPos++
|
||||||
|
totalKmers++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort each bucket by k-mer value for merge-scan
|
||||||
|
for p := range buckets {
|
||||||
|
slices.SortFunc(buckets[p], func(a, b QueryEntry) int {
|
||||||
|
return cmp.Compare(a.Kmer, b.Kmer)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
return &PreparedQueries{
|
||||||
|
Buckets: buckets,
|
||||||
|
NSeqs: len(sequences),
|
||||||
|
NKmers: totalKmers,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// MatchBatch looks up pre-sorted queries against one set of the index.
|
||||||
|
// Partitions are processed in parallel. For each partition, a merge-scan
|
||||||
|
// compares the sorted queries against the sorted KDI stream.
|
||||||
|
//
|
||||||
|
// Returns a MatchResult where result[i] contains sorted matched positions
|
||||||
|
// for sequence i.
|
||||||
|
func (ksg *KmerSetGroup) MatchBatch(setIndex int, pq *PreparedQueries) MatchResult {
|
||||||
|
P := ksg.partitions
|
||||||
|
|
||||||
|
// Pre-allocated per-sequence results and mutexes.
|
||||||
|
// Each partition goroutine appends to results[seqIdx] with mus[seqIdx] held.
|
||||||
|
// Contention is low: a sequence's k-mers span many partitions, but each
|
||||||
|
// partition processes its queries sequentially and the critical section is tiny.
|
||||||
|
results := make([][]int, pq.NSeqs)
|
||||||
|
mus := make([]sync.Mutex, pq.NSeqs)
|
||||||
|
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
|
for p := 0; p < P; p++ {
|
||||||
|
if len(pq.Buckets[p]) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
wg.Add(1)
|
||||||
|
go func(part int) {
|
||||||
|
defer wg.Done()
|
||||||
|
ksg.matchPartition(setIndex, part, pq.Buckets[part], results, mus)
|
||||||
|
}(p)
|
||||||
|
}
|
||||||
|
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
// Sort positions within each sequence
|
||||||
|
for i := range results {
|
||||||
|
if len(results[i]) > 1 {
|
||||||
|
slices.Sort(results[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return MatchResult(results)
|
||||||
|
}
|
||||||
|
|
||||||
|
// matchPartition processes one partition: opens the KDI reader (with index),
|
||||||
|
// seeks to the first query, then merge-scans queries against the KDI stream.
|
||||||
|
func (ksg *KmerSetGroup) matchPartition(
|
||||||
|
setIndex int,
|
||||||
|
partIndex int,
|
||||||
|
queries []QueryEntry, // sorted by Kmer
|
||||||
|
results [][]int,
|
||||||
|
mus []sync.Mutex,
|
||||||
|
) {
|
||||||
|
r, err := NewKdiIndexedReader(ksg.partitionPath(setIndex, partIndex))
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer r.Close()
|
||||||
|
|
||||||
|
if r.Count() == 0 || len(queries) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Seek to the first query's neighborhood
|
||||||
|
if err := r.SeekTo(queries[0].Kmer); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read first kmer from the stream after seek
|
||||||
|
currentKmer, ok := r.Next()
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
qi := 0 // query index
|
||||||
|
|
||||||
|
for qi < len(queries) {
|
||||||
|
q := queries[qi]
|
||||||
|
|
||||||
|
// If the next query is far ahead, re-seek instead of linear scan.
|
||||||
|
// Only seek if we'd skip more k-mers than the index stride,
|
||||||
|
// otherwise linear scan through the buffer is faster than a syscall.
|
||||||
|
if r.index != nil && q.Kmer > currentKmer && r.Remaining() > uint64(r.index.stride) {
|
||||||
|
_, skipCount, found := r.index.FindOffset(q.Kmer)
|
||||||
|
if found && skipCount > r.read+uint64(r.index.stride) {
|
||||||
|
if err := r.SeekTo(q.Kmer); err == nil {
|
||||||
|
nextKmer, nextOk := r.Next()
|
||||||
|
if !nextOk {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
currentKmer = nextKmer
|
||||||
|
ok = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Advance KDI stream until >= query kmer
|
||||||
|
for currentKmer < q.Kmer {
|
||||||
|
currentKmer, ok = r.Next()
|
||||||
|
if !ok {
|
||||||
|
return // KDI exhausted
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if currentKmer == q.Kmer {
|
||||||
|
// Match! Record all queries with this same k-mer value
|
||||||
|
matchedKmer := q.Kmer
|
||||||
|
for qi < len(queries) && queries[qi].Kmer == matchedKmer {
|
||||||
|
idx := queries[qi].SeqIdx
|
||||||
|
mus[idx].Lock()
|
||||||
|
results[idx] = append(results[idx], queries[qi].Pos)
|
||||||
|
mus[idx].Unlock()
|
||||||
|
qi++
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// currentKmer > q.Kmer: skip all queries with this kmer value
|
||||||
|
skippedKmer := q.Kmer
|
||||||
|
for qi < len(queries) && queries[qi].Kmer == skippedKmer {
|
||||||
|
qi++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,217 +0,0 @@
|
|||||||
package obikmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
||||||
"github.com/RoaringBitmap/roaring/roaring64"
|
|
||||||
)
|
|
||||||
|
|
||||||
// KmerSet wraps a set of k-mers stored in a Roaring Bitmap
|
|
||||||
// Provides utility methods for manipulating k-mer sets
|
|
||||||
type KmerSet struct {
|
|
||||||
id string // Unique identifier of the KmerSet
|
|
||||||
k int // Size of k-mers (immutable)
|
|
||||||
bitmap *roaring64.Bitmap // Bitmap containing the k-mers
|
|
||||||
Metadata map[string]interface{} // User metadata (key=atomic value)
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewKmerSet creates a new empty KmerSet
|
|
||||||
func NewKmerSet(k int) *KmerSet {
|
|
||||||
return &KmerSet{
|
|
||||||
k: k,
|
|
||||||
bitmap: roaring64.New(),
|
|
||||||
Metadata: make(map[string]interface{}),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewKmerSetFromBitmap creates a KmerSet from an existing bitmap
|
|
||||||
func NewKmerSetFromBitmap(k int, bitmap *roaring64.Bitmap) *KmerSet {
|
|
||||||
return &KmerSet{
|
|
||||||
k: k,
|
|
||||||
bitmap: bitmap,
|
|
||||||
Metadata: make(map[string]interface{}),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// K returns the size of k-mers (immutable)
|
|
||||||
func (ks *KmerSet) K() int {
|
|
||||||
return ks.k
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddKmerCode adds an encoded k-mer to the set
|
|
||||||
func (ks *KmerSet) AddKmerCode(kmer uint64) {
|
|
||||||
ks.bitmap.Add(kmer)
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddCanonicalKmerCode adds an encoded canonical k-mer to the set
|
|
||||||
func (ks *KmerSet) AddCanonicalKmerCode(kmer uint64) {
|
|
||||||
canonical := CanonicalKmer(kmer, ks.k)
|
|
||||||
ks.bitmap.Add(canonical)
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddKmer adds a k-mer to the set by encoding the sequence
|
|
||||||
// The sequence must have exactly k nucleotides
|
|
||||||
// Zero-allocation: encodes directly without creating an intermediate slice
|
|
||||||
func (ks *KmerSet) AddKmer(seq []byte) {
|
|
||||||
kmer := EncodeKmer(seq, ks.k)
|
|
||||||
ks.bitmap.Add(kmer)
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddCanonicalKmer adds a canonical k-mer to the set by encoding the sequence
|
|
||||||
// The sequence must have exactly k nucleotides
|
|
||||||
// Zero-allocation: encodes directly in canonical form without creating an intermediate slice
|
|
||||||
func (ks *KmerSet) AddCanonicalKmer(seq []byte) {
|
|
||||||
canonical := EncodeCanonicalKmer(seq, ks.k)
|
|
||||||
ks.bitmap.Add(canonical)
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddSequence adds all k-mers from a sequence to the set
|
|
||||||
// Uses an iterator to avoid allocating an intermediate vector
|
|
||||||
func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) {
|
|
||||||
rawSeq := seq.Sequence()
|
|
||||||
for canonical := range IterCanonicalKmers(rawSeq, ks.k) {
|
|
||||||
ks.bitmap.Add(canonical)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddSequences adds all k-mers from multiple sequences in batch
|
|
||||||
func (ks *KmerSet) AddSequences(sequences *obiseq.BioSequenceSlice) {
|
|
||||||
for _, seq := range *sequences {
|
|
||||||
ks.AddSequence(seq)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Contains checks if a k-mer is in the set
|
|
||||||
func (ks *KmerSet) Contains(kmer uint64) bool {
|
|
||||||
return ks.bitmap.Contains(kmer)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Len returns the number of k-mers in the set
|
|
||||||
func (ks *KmerSet) Len() uint64 {
|
|
||||||
return ks.bitmap.GetCardinality()
|
|
||||||
}
|
|
||||||
|
|
||||||
// MemoryUsage returns memory usage in bytes
|
|
||||||
func (ks *KmerSet) MemoryUsage() uint64 {
|
|
||||||
return ks.bitmap.GetSizeInBytes()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clear empties the set
|
|
||||||
func (ks *KmerSet) Clear() {
|
|
||||||
ks.bitmap.Clear()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Copy creates a copy of the set (consistent with BioSequence.Copy)
|
|
||||||
func (ks *KmerSet) Copy() *KmerSet {
|
|
||||||
// Copy metadata
|
|
||||||
metadata := make(map[string]interface{}, len(ks.Metadata))
|
|
||||||
for k, v := range ks.Metadata {
|
|
||||||
metadata[k] = v
|
|
||||||
}
|
|
||||||
|
|
||||||
return &KmerSet{
|
|
||||||
id: ks.id,
|
|
||||||
k: ks.k,
|
|
||||||
bitmap: ks.bitmap.Clone(),
|
|
||||||
Metadata: metadata,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Id returns the identifier of the KmerSet (consistent with BioSequence.Id)
|
|
||||||
func (ks *KmerSet) Id() string {
|
|
||||||
return ks.id
|
|
||||||
}
|
|
||||||
|
|
||||||
// SetId sets the identifier of the KmerSet (consistent with BioSequence.SetId)
|
|
||||||
func (ks *KmerSet) SetId(id string) {
|
|
||||||
ks.id = id
|
|
||||||
}
|
|
||||||
|
|
||||||
// Union returns the union of this set with another
|
|
||||||
func (ks *KmerSet) Union(other *KmerSet) *KmerSet {
|
|
||||||
if ks.k != other.k {
|
|
||||||
panic(fmt.Sprintf("Cannot union KmerSets with different k values: %d vs %d", ks.k, other.k))
|
|
||||||
}
|
|
||||||
result := ks.bitmap.Clone()
|
|
||||||
result.Or(other.bitmap)
|
|
||||||
return NewKmerSetFromBitmap(ks.k, result)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Intersect returns the intersection of this set with another
|
|
||||||
func (ks *KmerSet) Intersect(other *KmerSet) *KmerSet {
|
|
||||||
if ks.k != other.k {
|
|
||||||
panic(fmt.Sprintf("Cannot intersect KmerSets with different k values: %d vs %d", ks.k, other.k))
|
|
||||||
}
|
|
||||||
result := ks.bitmap.Clone()
|
|
||||||
result.And(other.bitmap)
|
|
||||||
return NewKmerSetFromBitmap(ks.k, result)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Difference returns the difference of this set with another (this - other)
|
|
||||||
func (ks *KmerSet) Difference(other *KmerSet) *KmerSet {
|
|
||||||
if ks.k != other.k {
|
|
||||||
panic(fmt.Sprintf("Cannot subtract KmerSets with different k values: %d vs %d", ks.k, other.k))
|
|
||||||
}
|
|
||||||
result := ks.bitmap.Clone()
|
|
||||||
result.AndNot(other.bitmap)
|
|
||||||
return NewKmerSetFromBitmap(ks.k, result)
|
|
||||||
}
|
|
||||||
|
|
||||||
// JaccardDistance computes the Jaccard distance between two KmerSets.
|
|
||||||
// The Jaccard distance is defined as: 1 - (|A ∩ B| / |A ∪ B|)
|
|
||||||
// where A and B are the two sets.
|
|
||||||
//
|
|
||||||
// Returns:
|
|
||||||
// - 0.0 when sets are identical (distance = 0, similarity = 1)
|
|
||||||
// - 1.0 when sets are completely disjoint (distance = 1, similarity = 0)
|
|
||||||
// - 1.0 when both sets are empty (by convention)
|
|
||||||
//
|
|
||||||
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
|
|
||||||
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
|
|
||||||
func (ks *KmerSet) JaccardDistance(other *KmerSet) float64 {
|
|
||||||
if ks.k != other.k {
|
|
||||||
panic(fmt.Sprintf("Cannot compute Jaccard distance between KmerSets with different k values: %d vs %d", ks.k, other.k))
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute intersection cardinality
|
|
||||||
intersectionCard := ks.bitmap.AndCardinality(other.bitmap)
|
|
||||||
|
|
||||||
// Compute union cardinality
|
|
||||||
unionCard := ks.bitmap.OrCardinality(other.bitmap)
|
|
||||||
|
|
||||||
// If union is empty, both sets are empty - return 1.0 by convention
|
|
||||||
if unionCard == 0 {
|
|
||||||
return 1.0
|
|
||||||
}
|
|
||||||
|
|
||||||
// Jaccard similarity = |A ∩ B| / |A ∪ B|
|
|
||||||
similarity := float64(intersectionCard) / float64(unionCard)
|
|
||||||
|
|
||||||
// Jaccard distance = 1 - similarity
|
|
||||||
return 1.0 - similarity
|
|
||||||
}
|
|
||||||
|
|
||||||
// JaccardSimilarity computes the Jaccard similarity coefficient between two KmerSets.
|
|
||||||
// The Jaccard similarity is defined as: |A ∩ B| / |A ∪ B|
|
|
||||||
//
|
|
||||||
// Returns:
|
|
||||||
// - 1.0 when sets are identical (maximum similarity)
|
|
||||||
// - 0.0 when sets are completely disjoint (no similarity)
|
|
||||||
// - 0.0 when both sets are empty (by convention)
|
|
||||||
//
|
|
||||||
// Time complexity: O(|A| + |B|) for Roaring Bitmap operations
|
|
||||||
// Space complexity: O(1) as operations are done in-place on temporary bitmaps
|
|
||||||
func (ks *KmerSet) JaccardSimilarity(other *KmerSet) float64 {
|
|
||||||
return 1.0 - ks.JaccardDistance(other)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Iterator returns an iterator over all k-mers in the set
|
|
||||||
func (ks *KmerSet) Iterator() roaring64.IntIterable64 {
|
|
||||||
return ks.bitmap.Iterator()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bitmap returns the underlying bitmap (for compatibility)
|
|
||||||
func (ks *KmerSet) Bitmap() *roaring64.Bitmap {
|
|
||||||
return ks.bitmap
|
|
||||||
}
|
|
||||||
@@ -1,362 +0,0 @@
|
|||||||
package obikmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"strconv"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
|
||||||
)
|
|
||||||
|
|
||||||
// ==================================
|
|
||||||
// KMER SET ATTRIBUTE API
|
|
||||||
// Mimic BioSequence attribute API from obiseq/attributes.go
|
|
||||||
// ==================================
|
|
||||||
|
|
||||||
// HasAttribute vérifie si une clé d'attribut existe
|
|
||||||
func (ks *KmerSet) HasAttribute(key string) bool {
|
|
||||||
_, ok := ks.Metadata[key]
|
|
||||||
return ok
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetAttribute récupère la valeur d'un attribut
|
|
||||||
// Cas particuliers: "id" utilise Id(), "k" utilise K()
|
|
||||||
func (ks *KmerSet) GetAttribute(key string) (interface{}, bool) {
|
|
||||||
switch key {
|
|
||||||
case "id":
|
|
||||||
return ks.Id(), true
|
|
||||||
case "k":
|
|
||||||
return ks.K(), true
|
|
||||||
default:
|
|
||||||
value, ok := ks.Metadata[key]
|
|
||||||
return value, ok
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// SetAttribute sets the value of an attribute
|
|
||||||
// Cas particuliers: "id" utilise SetId(), "k" est immutable (panique)
|
|
||||||
func (ks *KmerSet) SetAttribute(key string, value interface{}) {
|
|
||||||
switch key {
|
|
||||||
case "id":
|
|
||||||
if id, ok := value.(string); ok {
|
|
||||||
ks.SetId(id)
|
|
||||||
} else {
|
|
||||||
panic(fmt.Sprintf("id must be a string, got %T", value))
|
|
||||||
}
|
|
||||||
case "k":
|
|
||||||
panic("k is immutable and cannot be modified via SetAttribute")
|
|
||||||
default:
|
|
||||||
ks.Metadata[key] = value
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// DeleteAttribute supprime un attribut
|
|
||||||
func (ks *KmerSet) DeleteAttribute(key string) {
|
|
||||||
delete(ks.Metadata, key)
|
|
||||||
}
|
|
||||||
|
|
||||||
// RemoveAttribute supprime un attribut (alias de DeleteAttribute)
|
|
||||||
func (ks *KmerSet) RemoveAttribute(key string) {
|
|
||||||
ks.DeleteAttribute(key)
|
|
||||||
}
|
|
||||||
|
|
||||||
// RenameAttribute renomme un attribut
|
|
||||||
func (ks *KmerSet) RenameAttribute(newName, oldName string) {
|
|
||||||
if value, ok := ks.Metadata[oldName]; ok {
|
|
||||||
ks.Metadata[newName] = value
|
|
||||||
delete(ks.Metadata, oldName)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetIntAttribute récupère un attribut en tant qu'entier
|
|
||||||
func (ks *KmerSet) GetIntAttribute(key string) (int, bool) {
|
|
||||||
value, ok := ks.Metadata[key]
|
|
||||||
if !ok {
|
|
||||||
return 0, false
|
|
||||||
}
|
|
||||||
|
|
||||||
switch v := value.(type) {
|
|
||||||
case int:
|
|
||||||
return v, true
|
|
||||||
case int64:
|
|
||||||
return int(v), true
|
|
||||||
case float64:
|
|
||||||
return int(v), true
|
|
||||||
case string:
|
|
||||||
if i, err := strconv.Atoi(v); err == nil {
|
|
||||||
return i, true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0, false
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetFloatAttribute récupère un attribut en tant que float64
|
|
||||||
func (ks *KmerSet) GetFloatAttribute(key string) (float64, bool) {
|
|
||||||
value, ok := ks.Metadata[key]
|
|
||||||
if !ok {
|
|
||||||
return 0, false
|
|
||||||
}
|
|
||||||
|
|
||||||
switch v := value.(type) {
|
|
||||||
case float64:
|
|
||||||
return v, true
|
|
||||||
case float32:
|
|
||||||
return float64(v), true
|
|
||||||
case int:
|
|
||||||
return float64(v), true
|
|
||||||
case int64:
|
|
||||||
return float64(v), true
|
|
||||||
case string:
|
|
||||||
if f, err := strconv.ParseFloat(v, 64); err == nil {
|
|
||||||
return f, true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0, false
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetNumericAttribute récupère un attribut numérique (alias de GetFloatAttribute)
|
|
||||||
func (ks *KmerSet) GetNumericAttribute(key string) (float64, bool) {
|
|
||||||
return ks.GetFloatAttribute(key)
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetStringAttribute récupère un attribut en tant que chaîne
|
|
||||||
func (ks *KmerSet) GetStringAttribute(key string) (string, bool) {
|
|
||||||
value, ok := ks.Metadata[key]
|
|
||||||
if !ok {
|
|
||||||
return "", false
|
|
||||||
}
|
|
||||||
|
|
||||||
switch v := value.(type) {
|
|
||||||
case string:
|
|
||||||
return v, true
|
|
||||||
default:
|
|
||||||
return fmt.Sprintf("%v", v), true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetBoolAttribute récupère un attribut en tant que booléen
|
|
||||||
func (ks *KmerSet) GetBoolAttribute(key string) (bool, bool) {
|
|
||||||
value, ok := ks.Metadata[key]
|
|
||||||
if !ok {
|
|
||||||
return false, false
|
|
||||||
}
|
|
||||||
|
|
||||||
switch v := value.(type) {
|
|
||||||
case bool:
|
|
||||||
return v, true
|
|
||||||
case int:
|
|
||||||
return v != 0, true
|
|
||||||
case string:
|
|
||||||
if b, err := strconv.ParseBool(v); err == nil {
|
|
||||||
return b, true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false, false
|
|
||||||
}
|
|
||||||
|
|
||||||
// AttributeKeys returns the set of attribute keys
|
|
||||||
func (ks *KmerSet) AttributeKeys() obiutils.Set[string] {
|
|
||||||
keys := obiutils.MakeSet[string]()
|
|
||||||
for key := range ks.Metadata {
|
|
||||||
keys.Add(key)
|
|
||||||
}
|
|
||||||
return keys
|
|
||||||
}
|
|
||||||
|
|
||||||
// Keys returns the set of attribute keys (alias of AttributeKeys)
|
|
||||||
func (ks *KmerSet) Keys() obiutils.Set[string] {
|
|
||||||
return ks.AttributeKeys()
|
|
||||||
}
|
|
||||||
|
|
||||||
// ==================================
|
|
||||||
// KMER SET GROUP ATTRIBUTE API
|
|
||||||
// Métadonnées du groupe + accès via Get() pour les sets individuels
|
|
||||||
// ==================================
|
|
||||||
|
|
||||||
// HasAttribute vérifie si une clé d'attribut existe pour le groupe
|
|
||||||
func (ksg *KmerSetGroup) HasAttribute(key string) bool {
|
|
||||||
_, ok := ksg.Metadata[key]
|
|
||||||
return ok
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetAttribute récupère la valeur d'un attribut du groupe
|
|
||||||
// Cas particuliers: "id" utilise Id(), "k" utilise K()
|
|
||||||
func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool) {
|
|
||||||
switch key {
|
|
||||||
case "id":
|
|
||||||
return ksg.Id(), true
|
|
||||||
case "k":
|
|
||||||
return ksg.K(), true
|
|
||||||
default:
|
|
||||||
value, ok := ksg.Metadata[key]
|
|
||||||
return value, ok
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// SetAttribute sets the value of an attribute du groupe
|
|
||||||
// Cas particuliers: "id" utilise SetId(), "k" est immutable (panique)
|
|
||||||
func (ksg *KmerSetGroup) SetAttribute(key string, value interface{}) {
|
|
||||||
switch key {
|
|
||||||
case "id":
|
|
||||||
if id, ok := value.(string); ok {
|
|
||||||
ksg.SetId(id)
|
|
||||||
} else {
|
|
||||||
panic(fmt.Sprintf("id must be a string, got %T", value))
|
|
||||||
}
|
|
||||||
case "k":
|
|
||||||
panic("k is immutable and cannot be modified via SetAttribute")
|
|
||||||
default:
|
|
||||||
ksg.Metadata[key] = value
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// DeleteAttribute supprime un attribut du groupe
|
|
||||||
func (ksg *KmerSetGroup) DeleteAttribute(key string) {
|
|
||||||
delete(ksg.Metadata, key)
|
|
||||||
}
|
|
||||||
|
|
||||||
// RemoveAttribute supprime un attribut du groupe (alias)
|
|
||||||
func (ksg *KmerSetGroup) RemoveAttribute(key string) {
|
|
||||||
ksg.DeleteAttribute(key)
|
|
||||||
}
|
|
||||||
|
|
||||||
// RenameAttribute renomme un attribut du groupe
|
|
||||||
func (ksg *KmerSetGroup) RenameAttribute(newName, oldName string) {
|
|
||||||
if value, ok := ksg.Metadata[oldName]; ok {
|
|
||||||
ksg.Metadata[newName] = value
|
|
||||||
delete(ksg.Metadata, oldName)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetIntAttribute récupère un attribut entier du groupe
|
|
||||||
func (ksg *KmerSetGroup) GetIntAttribute(key string) (int, bool) {
|
|
||||||
value, ok := ksg.GetAttribute(key)
|
|
||||||
if !ok {
|
|
||||||
return 0, false
|
|
||||||
}
|
|
||||||
|
|
||||||
switch v := value.(type) {
|
|
||||||
case int:
|
|
||||||
return v, true
|
|
||||||
case int64:
|
|
||||||
return int(v), true
|
|
||||||
case float64:
|
|
||||||
return int(v), true
|
|
||||||
case string:
|
|
||||||
if i, err := strconv.Atoi(v); err == nil {
|
|
||||||
return i, true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0, false
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetFloatAttribute récupère un attribut float64 du groupe
|
|
||||||
func (ksg *KmerSetGroup) GetFloatAttribute(key string) (float64, bool) {
|
|
||||||
value, ok := ksg.GetAttribute(key)
|
|
||||||
if !ok {
|
|
||||||
return 0, false
|
|
||||||
}
|
|
||||||
|
|
||||||
switch v := value.(type) {
|
|
||||||
case float64:
|
|
||||||
return v, true
|
|
||||||
case float32:
|
|
||||||
return float64(v), true
|
|
||||||
case int:
|
|
||||||
return float64(v), true
|
|
||||||
case int64:
|
|
||||||
return float64(v), true
|
|
||||||
case string:
|
|
||||||
if f, err := strconv.ParseFloat(v, 64); err == nil {
|
|
||||||
return f, true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0, false
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetNumericAttribute récupère un attribut numérique du groupe
|
|
||||||
func (ksg *KmerSetGroup) GetNumericAttribute(key string) (float64, bool) {
|
|
||||||
return ksg.GetFloatAttribute(key)
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetStringAttribute récupère un attribut chaîne du groupe
|
|
||||||
func (ksg *KmerSetGroup) GetStringAttribute(key string) (string, bool) {
|
|
||||||
value, ok := ksg.GetAttribute(key)
|
|
||||||
if !ok {
|
|
||||||
return "", false
|
|
||||||
}
|
|
||||||
|
|
||||||
switch v := value.(type) {
|
|
||||||
case string:
|
|
||||||
return v, true
|
|
||||||
default:
|
|
||||||
return fmt.Sprintf("%v", v), true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetBoolAttribute récupère un attribut booléen du groupe
|
|
||||||
func (ksg *KmerSetGroup) GetBoolAttribute(key string) (bool, bool) {
|
|
||||||
value, ok := ksg.GetAttribute(key)
|
|
||||||
if !ok {
|
|
||||||
return false, false
|
|
||||||
}
|
|
||||||
|
|
||||||
switch v := value.(type) {
|
|
||||||
case bool:
|
|
||||||
return v, true
|
|
||||||
case int:
|
|
||||||
return v != 0, true
|
|
||||||
case string:
|
|
||||||
if b, err := strconv.ParseBool(v); err == nil {
|
|
||||||
return b, true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false, false
|
|
||||||
}
|
|
||||||
|
|
||||||
// AttributeKeys returns the set of attribute keys du groupe
|
|
||||||
func (ksg *KmerSetGroup) AttributeKeys() obiutils.Set[string] {
|
|
||||||
keys := obiutils.MakeSet[string]()
|
|
||||||
for key := range ksg.Metadata {
|
|
||||||
keys.Add(key)
|
|
||||||
}
|
|
||||||
return keys
|
|
||||||
}
|
|
||||||
|
|
||||||
// Keys returns the set of group attribute keys (alias)
|
|
||||||
func (ksg *KmerSetGroup) Keys() obiutils.Set[string] {
|
|
||||||
return ksg.AttributeKeys()
|
|
||||||
}
|
|
||||||
|
|
||||||
// ==================================
|
|
||||||
// MÉTHODES POUR ACCÉDER AUX ATTRIBUTS DES SETS INDIVIDUELS VIA Get()
|
|
||||||
// Architecture zero-copy: ksg.Get(i).SetAttribute(...)
|
|
||||||
// ==================================
|
|
||||||
|
|
||||||
// Exemple d'utilisation:
|
|
||||||
// Pour accéder aux métadonnées d'un KmerSet individuel dans un groupe:
|
|
||||||
// ks := ksg.Get(0)
|
|
||||||
// ks.SetAttribute("level", 1)
|
|
||||||
// hasLevel := ks.HasAttribute("level")
|
|
||||||
//
|
|
||||||
// Pour les métadonnées du groupe:
|
|
||||||
// ksg.SetAttribute("name", "FrequencyFilter")
|
|
||||||
// name, ok := ksg.GetStringAttribute("name")
|
|
||||||
|
|
||||||
// AllAttributeKeys returns all unique attribute keys of the group AND all its sets
|
|
||||||
func (ksg *KmerSetGroup) AllAttributeKeys() obiutils.Set[string] {
|
|
||||||
keys := obiutils.MakeSet[string]()
|
|
||||||
|
|
||||||
// Ajouter les clés du groupe
|
|
||||||
for key := range ksg.Metadata {
|
|
||||||
keys.Add(key)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ajouter les clés de chaque set
|
|
||||||
for _, ks := range ksg.sets {
|
|
||||||
for key := range ks.Metadata {
|
|
||||||
keys.Add(key)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return keys
|
|
||||||
}
|
|
||||||
702
pkg/obikmer/kmer_set_builder.go
Normal file
702
pkg/obikmer/kmer_set_builder.go
Normal file
@@ -0,0 +1,702 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"slices"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
"github.com/schollz/progressbar/v3"
|
||||||
|
)
|
||||||
|
|
||||||
|
// BuilderOption is a functional option for KmerSetGroupBuilder.
|
||||||
|
type BuilderOption func(*builderConfig)
|
||||||
|
|
||||||
|
type builderConfig struct {
|
||||||
|
minFreq int // 0 means no frequency filtering (simple dedup)
|
||||||
|
maxFreq int // 0 means no upper bound
|
||||||
|
saveFreqTopN int // >0 means save the N most frequent k-mers per set to CSV
|
||||||
|
entropyThreshold float64 // >0 means filter k-mers with entropy <= threshold
|
||||||
|
entropyLevelMax int // max sub-word size for entropy (typically 6)
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithMinFrequency activates frequency filtering mode.
|
||||||
|
// Only k-mers seen >= minFreq times are kept in the final index.
|
||||||
|
func WithMinFrequency(minFreq int) BuilderOption {
|
||||||
|
return func(c *builderConfig) {
|
||||||
|
c.minFreq = minFreq
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithMaxFrequency sets the upper frequency bound.
|
||||||
|
// Only k-mers seen <= maxFreq times are kept in the final index.
|
||||||
|
func WithMaxFrequency(maxFreq int) BuilderOption {
|
||||||
|
return func(c *builderConfig) {
|
||||||
|
c.maxFreq = maxFreq
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithSaveFreqKmers saves the N most frequent k-mers per set to a CSV file
|
||||||
|
// (top_kmers.csv in each set directory).
|
||||||
|
func WithSaveFreqKmers(n int) BuilderOption {
|
||||||
|
return func(c *builderConfig) {
|
||||||
|
c.saveFreqTopN = n
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithEntropyFilter activates entropy-based low-complexity filtering.
|
||||||
|
// K-mers with entropy <= threshold are discarded during finalization.
|
||||||
|
// levelMax is the maximum sub-word size for entropy computation (typically 6).
|
||||||
|
func WithEntropyFilter(threshold float64, levelMax int) BuilderOption {
|
||||||
|
return func(c *builderConfig) {
|
||||||
|
c.entropyThreshold = threshold
|
||||||
|
c.entropyLevelMax = levelMax
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// KmerSetGroupBuilder constructs a KmerSetGroup on disk.
// During construction, super-kmers are written to temporary .skm files
// partitioned by minimizer. On Close(), each partition is finalized
// (sort, dedup, optional frequency filter) into .kdi files.
//
// AddSequence/AddSuperKmer take the per-writer mutexes in mu, so the
// builder supports concurrent producers.
type KmerSetGroupBuilder struct {
	dir        string
	k          int
	m          int
	n          int // number of NEW sets being built
	P          int // number of partitions
	startIndex int // first set index (0 for new groups, existingN for appends)
	config     builderConfig
	existing   *KmerSetGroup  // non-nil when appending to existing group
	writers    [][]*SkmWriter // [setIndex][partIndex] (local index 0..n-1)
	mu         [][]sync.Mutex // per-writer mutex for concurrent access
	closed     bool           // set by Close(); guards against double-finalize
}
// NewKmerSetGroupBuilder creates a builder for a new KmerSetGroup.
//
// Parameters:
//   - directory: destination directory (created if necessary)
//   - k: k-mer size (2-31; values outside this range are rejected)
//   - m: minimizer size (-1 for auto = ceil(k/2.5))
//   - n: number of sets in the group
//   - P: number of partitions (-1 for auto)
//   - options: optional builder options (e.g. WithMinFrequency)
func NewKmerSetGroupBuilder(directory string, k, m, n, P int,
	options ...BuilderOption) (*KmerSetGroupBuilder, error) {

	if k < 2 || k > 31 {
		return nil, fmt.Errorf("obikmer: k must be between 2 and 31, got %d", k)
	}
	if n < 1 {
		return nil, fmt.Errorf("obikmer: n must be >= 1, got %d", n)
	}

	// Auto minimizer size: ceil(k/2.5), then clamped into [1, k-1].
	if m < 0 {
		m = int(math.Ceil(float64(k) / 2.5))
	}
	if m < 1 {
		m = 1
	}
	if m >= k {
		m = k - 1
	}

	// Auto partition count
	if P < 0 {
		// Use 4^m as the maximum (number of distinct minimizers),
		// capped at a reasonable value and floored at 64.
		maxP := 1 << (2 * m) // 4^m
		P = maxP
		if P > 4096 {
			P = 4096
		}
		if P < 64 {
			P = 64
		}
	}

	// Apply options
	var config builderConfig
	for _, opt := range options {
		opt(&config)
	}

	// Create build directory structure: one .build/set_<s> directory per set.
	buildDir := filepath.Join(directory, ".build")
	for s := 0; s < n; s++ {
		setDir := filepath.Join(buildDir, fmt.Sprintf("set_%d", s))
		if err := os.MkdirAll(setDir, 0755); err != nil {
			return nil, fmt.Errorf("obikmer: create build dir: %w", err)
		}
	}

	// Create one SKM writer (and its mutex) per (set, partition) pair.
	writers := make([][]*SkmWriter, n)
	mutexes := make([][]sync.Mutex, n)
	for s := 0; s < n; s++ {
		writers[s] = make([]*SkmWriter, P)
		mutexes[s] = make([]sync.Mutex, P)
		for p := 0; p < P; p++ {
			path := filepath.Join(buildDir, fmt.Sprintf("set_%d", s),
				fmt.Sprintf("part_%04d.skm", p))
			w, err := NewSkmWriter(path)
			if err != nil {
				// Close already-created writers so no file descriptors
				// leak on partial initialization.
				for ss := 0; ss <= s; ss++ {
					for pp := 0; pp < P; pp++ {
						if writers[ss][pp] != nil {
							writers[ss][pp].Close()
						}
					}
				}
				return nil, fmt.Errorf("obikmer: create skm writer: %w", err)
			}
			writers[s][p] = w
		}
	}

	return &KmerSetGroupBuilder{
		dir:        directory,
		k:          k,
		m:          m,
		n:          n,
		P:          P,
		startIndex: 0,
		config:     config,
		writers:    writers,
		mu:         mutexes,
	}, nil
}
// AppendKmerSetGroupBuilder opens an existing KmerSetGroup and creates
// a builder that adds n new sets starting from the existing set count.
// The k, m, and partitions are inherited from the existing group.
//
// NOTE(review): on the n < 1 error path the freshly opened group is not
// released — confirm whether KmerSetGroup holds resources that need
// explicit closing.
func AppendKmerSetGroupBuilder(directory string, n int, options ...BuilderOption) (*KmerSetGroupBuilder, error) {
	existing, err := OpenKmerSetGroup(directory)
	if err != nil {
		return nil, fmt.Errorf("obikmer: open existing group: %w", err)
	}

	if n < 1 {
		return nil, fmt.Errorf("obikmer: n must be >= 1, got %d", n)
	}

	// Inherit geometry from the existing group; the new sets are numbered
	// from the current set count onward.
	k := existing.K()
	m := existing.M()
	P := existing.Partitions()
	startIndex := existing.Size()

	var config builderConfig
	for _, opt := range options {
		opt(&config)
	}

	// Create build directory structure for new sets.
	// Build directories use LOCAL indices (0..n-1); the final set
	// directories written by Close() use GLOBAL indices.
	buildDir := filepath.Join(directory, ".build")
	for s := 0; s < n; s++ {
		setDir := filepath.Join(buildDir, fmt.Sprintf("set_%d", s))
		if err := os.MkdirAll(setDir, 0755); err != nil {
			return nil, fmt.Errorf("obikmer: create build dir: %w", err)
		}
	}

	// Create SKM writers for new sets
	writers := make([][]*SkmWriter, n)
	mutexes := make([][]sync.Mutex, n)
	for s := 0; s < n; s++ {
		writers[s] = make([]*SkmWriter, P)
		mutexes[s] = make([]sync.Mutex, P)
		for p := 0; p < P; p++ {
			path := filepath.Join(buildDir, fmt.Sprintf("set_%d", s),
				fmt.Sprintf("part_%04d.skm", p))
			w, err := NewSkmWriter(path)
			if err != nil {
				// Close already-created writers to avoid descriptor leaks.
				for ss := 0; ss <= s; ss++ {
					for pp := 0; pp < P; pp++ {
						if writers[ss][pp] != nil {
							writers[ss][pp].Close()
						}
					}
				}
				return nil, fmt.Errorf("obikmer: create skm writer: %w", err)
			}
			writers[s][p] = w
		}
	}

	return &KmerSetGroupBuilder{
		dir:        directory,
		k:          k,
		m:          m,
		n:          n,
		P:          P,
		startIndex: startIndex,
		config:     config,
		existing:   existing,
		writers:    writers,
		mu:         mutexes,
	}, nil
}
// StartIndex returns the first global set index for the new sets being built.
|
||||||
|
// For new groups this is 0; for appends it is the existing group's Size().
|
||||||
|
func (b *KmerSetGroupBuilder) StartIndex() int {
|
||||||
|
return b.startIndex
|
||||||
|
}
|
||||||
|
|
||||||
|
// AddSequence extracts super-kmers from a sequence and writes them
|
||||||
|
// to the appropriate partition files for the given set.
|
||||||
|
func (b *KmerSetGroupBuilder) AddSequence(setIndex int, seq *obiseq.BioSequence) {
|
||||||
|
if setIndex < 0 || setIndex >= b.n {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
rawSeq := seq.Sequence()
|
||||||
|
if len(rawSeq) < b.k {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for sk := range IterSuperKmers(rawSeq, b.k, b.m) {
|
||||||
|
part := int(sk.Minimizer % uint64(b.P))
|
||||||
|
b.mu[setIndex][part].Lock()
|
||||||
|
b.writers[setIndex][part].Write(sk)
|
||||||
|
b.mu[setIndex][part].Unlock()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// AddSuperKmer writes a single super-kmer to the appropriate partition.
|
||||||
|
func (b *KmerSetGroupBuilder) AddSuperKmer(setIndex int, sk SuperKmer) {
|
||||||
|
if setIndex < 0 || setIndex >= b.n {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
part := int(sk.Minimizer % uint64(b.P))
|
||||||
|
b.mu[setIndex][part].Lock()
|
||||||
|
b.writers[setIndex][part].Write(sk)
|
||||||
|
b.mu[setIndex][part].Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close finalizes the construction:
//  1. Flush and close all SKM writers
//  2. For each partition of each set (in parallel):
//     - Load super-kmers from .skm
//     - Extract canonical k-mers
//     - Sort and deduplicate (count if frequency filter)
//     - Write .kdi file
//  3. Write metadata.toml
//  4. Remove .build/ directory
//
// Returns the finalized KmerSetGroup in read-only mode.
// Close may be called only once; subsequent calls return an error.
func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
	if b.closed {
		return nil, fmt.Errorf("obikmer: builder already closed")
	}
	b.closed = true

	// 1. Close all SKM writers so every super-kmer is flushed to disk
	// before the readers below reopen the files.
	for s := 0; s < b.n; s++ {
		for p := 0; p < b.P; p++ {
			if err := b.writers[s][p].Close(); err != nil {
				return nil, fmt.Errorf("obikmer: close skm writer set=%d part=%d: %w", s, p, err)
			}
		}
	}

	// 2. Create output directory structure for new sets
	// (final directories use GLOBAL set indices).
	for s := 0; s < b.n; s++ {
		globalIdx := b.startIndex + s
		setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx))
		if err := os.MkdirAll(setDir, 0755); err != nil {
			return nil, fmt.Errorf("obikmer: create set dir: %w", err)
		}
	}

	// =====================================================================
	// 2-stage pipeline: readers (pure I/O) → workers (CPU + write)
	//
	// - nReaders goroutines read .skm files (pure I/O, fast)
	// - nWorkers goroutines extract k-mers, sort, dedup, filter, write .kdi
	//
	// One unbuffered channel between stages. Readers are truly I/O-bound
	// (small files, buffered reads), workers are CPU-bound and stay busy.
	// =====================================================================
	totalJobs := b.n * b.P

	// Per-(set, partition) result cells. Each worker writes to a distinct
	// cell, so these slices need no locking.
	counts := make([][]uint64, b.n)
	spectra := make([][]map[int]uint64, b.n)
	var topKmers [][]*TopNKmers
	for s := 0; s < b.n; s++ {
		counts[s] = make([]uint64, b.P)
		spectra[s] = make([]map[int]uint64, b.P)
	}
	if b.config.saveFreqTopN > 0 {
		topKmers = make([][]*TopNKmers, b.n)
		for s := 0; s < b.n; s++ {
			topKmers[s] = make([]*TopNKmers, b.P)
		}
	}

	nCPU := obidefault.ParallelWorkers()

	// Stage sizing
	nWorkers := nCPU     // CPU-bound: one per core
	nReaders := nCPU / 4 // pure I/O: few goroutines suffice
	if nReaders < 2 {
		nReaders = 2
	}
	if nReaders > 4 {
		nReaders = 4
	}
	if nWorkers > totalJobs {
		nWorkers = totalJobs
	}
	if nReaders > totalJobs {
		nReaders = totalJobs
	}

	var bar *progressbar.ProgressBar
	if obidefault.ProgressBar() {
		pbopt := []progressbar.Option{
			progressbar.OptionSetWriter(os.Stderr),
			progressbar.OptionSetWidth(15),
			progressbar.OptionShowCount(),
			progressbar.OptionShowIts(),
			progressbar.OptionSetPredictTime(true),
			progressbar.OptionSetDescription("[Finalizing partitions]"),
		}
		bar = progressbar.NewOptions(totalJobs, pbopt...)
	}

	// --- Channel types ---
	type partitionData struct {
		setIdx  int
		partIdx int
		skmers  []SuperKmer // raw super-kmers from I/O stage
	}

	type readJob struct {
		setIdx  int
		partIdx int
	}

	dataCh := make(chan *partitionData) // unbuffered
	readJobs := make(chan readJob, totalJobs)

	// firstErr records only the first failure; later ones are dropped.
	var errMu sync.Mutex
	var firstErr error

	// Fill job queue (buffered, all jobs pre-loaded)
	for s := 0; s < b.n; s++ {
		for p := 0; p < b.P; p++ {
			readJobs <- readJob{s, p}
		}
	}
	close(readJobs)

	// --- Stage 1: Readers (pure I/O) ---
	var readWg sync.WaitGroup
	for w := 0; w < nReaders; w++ {
		readWg.Add(1)
		go func() {
			defer readWg.Done()
			for rj := range readJobs {
				skmers, err := b.loadPartitionRaw(rj.setIdx, rj.partIdx)
				if err != nil {
					errMu.Lock()
					if firstErr == nil {
						firstErr = err
					}
					errMu.Unlock()
				}
				// Forward even on error so every job reaches stage 2
				// and the per-partition bookkeeping stays complete.
				dataCh <- &partitionData{rj.setIdx, rj.partIdx, skmers}
			}
		}()
	}

	// Close the data channel once every reader has finished, which in
	// turn lets the worker range-loops terminate.
	go func() {
		readWg.Wait()
		close(dataCh)
	}()

	// --- Stage 2: Workers (CPU: extract k-mers + sort/filter + write .kdi) ---
	var workWg sync.WaitGroup
	for w := 0; w < nWorkers; w++ {
		workWg.Add(1)
		go func() {
			defer workWg.Done()
			for pd := range dataCh {
				// CPU: extract canonical k-mers from super-kmers
				kmers := extractCanonicalKmers(pd.skmers, b.k)
				pd.skmers = nil // allow GC of raw super-kmers

				// CPU: sort, dedup, filter
				filtered, spectrum, topN := b.sortFilterPartition(kmers)
				kmers = nil // allow GC of unsorted data

				// I/O: write .kdi file
				globalIdx := b.startIndex + pd.setIdx
				kdiPath := filepath.Join(b.dir,
					fmt.Sprintf("set_%d", globalIdx),
					fmt.Sprintf("part_%04d.kdi", pd.partIdx))

				n, err := b.writePartitionKdi(kdiPath, filtered)
				if err != nil {
					errMu.Lock()
					if firstErr == nil {
						firstErr = err
					}
					errMu.Unlock()
				}
				counts[pd.setIdx][pd.partIdx] = n
				spectra[pd.setIdx][pd.partIdx] = spectrum
				if topKmers != nil {
					topKmers[pd.setIdx][pd.partIdx] = topN
				}
				if bar != nil {
					bar.Add(1)
				}
			}
		}()
	}

	workWg.Wait()

	if bar != nil {
		fmt.Fprintln(os.Stderr)
	}

	if firstErr != nil {
		return nil, firstErr
	}

	// Aggregate per-partition spectra into per-set spectra and write spectrum.bin
	for s := 0; s < b.n; s++ {
		globalIdx := b.startIndex + s
		setSpectrum := make(map[int]uint64)
		for p := 0; p < b.P; p++ {
			if spectra[s][p] != nil {
				MergeSpectraMaps(setSpectrum, spectra[s][p])
			}
		}
		if len(setSpectrum) > 0 {
			specPath := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx), "spectrum.bin")
			if err := WriteSpectrum(specPath, MapToSpectrum(setSpectrum)); err != nil {
				return nil, fmt.Errorf("obikmer: write spectrum set=%d: %w", globalIdx, err)
			}
		}
	}

	// Aggregate per-partition top-N k-mers and write CSV
	if topKmers != nil {
		for s := 0; s < b.n; s++ {
			globalIdx := b.startIndex + s
			merged := NewTopNKmers(b.config.saveFreqTopN)
			for p := 0; p < b.P; p++ {
				merged.MergeTopN(topKmers[s][p])
			}
			results := merged.Results()
			if len(results) > 0 {
				csvPath := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx), "top_kmers.csv")
				if err := WriteTopKmersCSV(csvPath, results, b.k); err != nil {
					return nil, fmt.Errorf("obikmer: write top kmers set=%d: %w", globalIdx, err)
				}
			}
		}
	}

	// 3. Build KmerSetGroup and write metadata
	newCounts := make([]uint64, b.n)
	for s := 0; s < b.n; s++ {
		for p := 0; p < b.P; p++ {
			newCounts[s] += counts[s][p]
		}
	}

	var ksg *KmerSetGroup

	if b.existing != nil {
		// Append mode: extend existing group in place with the new sets.
		ksg = b.existing
		ksg.n += b.n
		ksg.setsIDs = append(ksg.setsIDs, make([]string, b.n)...)
		ksg.counts = append(ksg.counts, newCounts...)
		newMeta := make([]map[string]interface{}, b.n)
		for i := range newMeta {
			newMeta[i] = make(map[string]interface{})
		}
		ksg.setsMetadata = append(ksg.setsMetadata, newMeta...)
	} else {
		// New group
		setsIDs := make([]string, b.n)
		setsMetadata := make([]map[string]interface{}, b.n)
		for i := range setsMetadata {
			setsMetadata[i] = make(map[string]interface{})
		}
		ksg = &KmerSetGroup{
			path:         b.dir,
			k:            b.k,
			m:            b.m,
			partitions:   b.P,
			n:            b.n,
			setsIDs:      setsIDs,
			counts:       newCounts,
			setsMetadata: setsMetadata,
			Metadata:     make(map[string]interface{}),
		}
	}

	if err := ksg.saveMetadata(); err != nil {
		return nil, fmt.Errorf("obikmer: write metadata: %w", err)
	}

	// 4. Remove .build/ directory (best-effort; a failure here is ignored).
	buildDir := filepath.Join(b.dir, ".build")
	os.RemoveAll(buildDir)

	return ksg, nil
}
// loadPartitionRaw reads a .skm file and returns raw super-kmers.
|
||||||
|
// This is pure I/O — no k-mer extraction is done here.
|
||||||
|
// Returns nil (not an error) if the .skm file is empty or missing.
|
||||||
|
func (b *KmerSetGroupBuilder) loadPartitionRaw(setIdx, partIdx int) ([]SuperKmer, error) {
|
||||||
|
skmPath := filepath.Join(b.dir, ".build",
|
||||||
|
fmt.Sprintf("set_%d", setIdx),
|
||||||
|
fmt.Sprintf("part_%04d.skm", partIdx))
|
||||||
|
|
||||||
|
fi, err := os.Stat(skmPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil // empty partition, not an error
|
||||||
|
}
|
||||||
|
|
||||||
|
reader, err := NewSkmReader(skmPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Estimate capacity from file size. Each super-kmer record is
|
||||||
|
// 2 bytes (length) + packed bases (~k/4 bytes), so roughly
|
||||||
|
// (2 + k/4) bytes per super-kmer on average.
|
||||||
|
avgRecordSize := 2 + b.k/4
|
||||||
|
if avgRecordSize < 4 {
|
||||||
|
avgRecordSize = 4
|
||||||
|
}
|
||||||
|
estCount := int(fi.Size()) / avgRecordSize
|
||||||
|
|
||||||
|
skmers := make([]SuperKmer, 0, estCount)
|
||||||
|
for {
|
||||||
|
sk, ok := reader.Next()
|
||||||
|
if !ok {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
skmers = append(skmers, sk)
|
||||||
|
}
|
||||||
|
reader.Close()
|
||||||
|
|
||||||
|
return skmers, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractCanonicalKmers extracts all canonical k-mers from a slice of super-kmers.
|
||||||
|
// This is CPU-bound work (sliding-window forward/reverse complement).
|
||||||
|
func extractCanonicalKmers(skmers []SuperKmer, k int) []uint64 {
|
||||||
|
// Pre-compute total capacity to avoid repeated slice growth.
|
||||||
|
// Each super-kmer of length L yields L-k+1 canonical k-mers.
|
||||||
|
total := 0
|
||||||
|
for i := range skmers {
|
||||||
|
n := len(skmers[i].Sequence) - k + 1
|
||||||
|
if n > 0 {
|
||||||
|
total += n
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
kmers := make([]uint64, 0, total)
|
||||||
|
for _, sk := range skmers {
|
||||||
|
for kmer := range IterCanonicalKmers(sk.Sequence, k) {
|
||||||
|
kmers = append(kmers, kmer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return kmers
|
||||||
|
}
|
||||||
|
|
||||||
|
// sortFilterPartition sorts, deduplicates, and filters k-mers in memory (CPU-bound).
|
||||||
|
// Returns the filtered sorted slice, frequency spectrum, and optional top-N.
|
||||||
|
func (b *KmerSetGroupBuilder) sortFilterPartition(kmers []uint64) ([]uint64, map[int]uint64, *TopNKmers) {
|
||||||
|
if len(kmers) == 0 {
|
||||||
|
return nil, nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort (CPU-bound) — slices.Sort avoids reflection overhead of sort.Slice
|
||||||
|
slices.Sort(kmers)
|
||||||
|
|
||||||
|
minFreq := b.config.minFreq
|
||||||
|
if minFreq <= 0 {
|
||||||
|
minFreq = 1 // simple dedup
|
||||||
|
}
|
||||||
|
maxFreq := b.config.maxFreq
|
||||||
|
|
||||||
|
// Prepare entropy filter if requested
|
||||||
|
var entropyFilter *KmerEntropyFilter
|
||||||
|
if b.config.entropyThreshold > 0 && b.config.entropyLevelMax > 0 {
|
||||||
|
entropyFilter = NewKmerEntropyFilter(b.k, b.config.entropyLevelMax, b.config.entropyThreshold)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prepare top-N collector if requested
|
||||||
|
var topN *TopNKmers
|
||||||
|
if b.config.saveFreqTopN > 0 {
|
||||||
|
topN = NewTopNKmers(b.config.saveFreqTopN)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Linear scan: count consecutive identical values, filter, accumulate spectrum
|
||||||
|
partSpectrum := make(map[int]uint64)
|
||||||
|
filtered := make([]uint64, 0, len(kmers)/2)
|
||||||
|
|
||||||
|
i := 0
|
||||||
|
for i < len(kmers) {
|
||||||
|
val := kmers[i]
|
||||||
|
c := 1
|
||||||
|
for i+c < len(kmers) && kmers[i+c] == val {
|
||||||
|
c++
|
||||||
|
}
|
||||||
|
partSpectrum[c]++
|
||||||
|
if topN != nil {
|
||||||
|
topN.Add(val, c)
|
||||||
|
}
|
||||||
|
if c >= minFreq && (maxFreq <= 0 || c <= maxFreq) {
|
||||||
|
if entropyFilter == nil || entropyFilter.Accept(val) {
|
||||||
|
filtered = append(filtered, val)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i += c
|
||||||
|
}
|
||||||
|
|
||||||
|
return filtered, partSpectrum, topN
|
||||||
|
}
|
||||||
|
|
||||||
|
// writePartitionKdi writes a sorted slice of k-mers to a .kdi file (I/O-bound).
|
||||||
|
// Returns the number of k-mers written.
|
||||||
|
func (b *KmerSetGroupBuilder) writePartitionKdi(kdiPath string, kmers []uint64) (uint64, error) {
|
||||||
|
w, err := NewKdiWriter(kdiPath)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, val := range kmers {
|
||||||
|
if err := w.Write(val); err != nil {
|
||||||
|
w.Close()
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
n := w.Count()
|
||||||
|
return n, w.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *KmerSetGroupBuilder) writeEmptyKdi(path string, count *uint64) error {
|
||||||
|
w, err := NewKdiWriter(path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
*count = 0
|
||||||
|
return w.Close()
|
||||||
|
}
|
||||||
278
pkg/obikmer/kmer_set_builder_test.go
Normal file
278
pkg/obikmer/kmer_set_builder_test.go
Normal file
@@ -0,0 +1,278 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sort"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestBuilderBasic(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "")
|
||||||
|
builder.AddSequence(0, seq)
|
||||||
|
|
||||||
|
ksg, err := builder.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if ksg.K() != 15 {
|
||||||
|
t.Fatalf("K() = %d, want 15", ksg.K())
|
||||||
|
}
|
||||||
|
if ksg.M() != 7 {
|
||||||
|
t.Fatalf("M() = %d, want 7", ksg.M())
|
||||||
|
}
|
||||||
|
if ksg.Partitions() != 64 {
|
||||||
|
t.Fatalf("Partitions() = %d, want 64", ksg.Partitions())
|
||||||
|
}
|
||||||
|
if ksg.Size() != 1 {
|
||||||
|
t.Fatalf("Size() = %d, want 1", ksg.Size())
|
||||||
|
}
|
||||||
|
if ksg.Len(0) == 0 {
|
||||||
|
t.Fatal("Len(0) = 0, expected some k-mers")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify k-mers match what we'd compute directly
|
||||||
|
var expected []uint64
|
||||||
|
for kmer := range IterCanonicalKmers(seq.Sequence(), 15) {
|
||||||
|
expected = append(expected, kmer)
|
||||||
|
}
|
||||||
|
sort.Slice(expected, func(i, j int) bool { return expected[i] < expected[j] })
|
||||||
|
// Dedup
|
||||||
|
deduped := expected[:0]
|
||||||
|
for i, v := range expected {
|
||||||
|
if i == 0 || v != expected[i-1] {
|
||||||
|
deduped = append(deduped, v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ksg.Len(0) != uint64(len(deduped)) {
|
||||||
|
t.Fatalf("Len(0) = %d, expected %d unique k-mers", ksg.Len(0), len(deduped))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check iterator
|
||||||
|
var fromIter []uint64
|
||||||
|
for kmer := range ksg.Iterator(0) {
|
||||||
|
fromIter = append(fromIter, kmer)
|
||||||
|
}
|
||||||
|
// The iterator does a k-way merge so should be sorted
|
||||||
|
for i := 1; i < len(fromIter); i++ {
|
||||||
|
if fromIter[i] <= fromIter[i-1] {
|
||||||
|
t.Fatalf("iterator not sorted at %d: %d <= %d", i, fromIter[i], fromIter[i-1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(fromIter) != len(deduped) {
|
||||||
|
t.Fatalf("iterator yielded %d k-mers, expected %d", len(fromIter), len(deduped))
|
||||||
|
}
|
||||||
|
for i, v := range fromIter {
|
||||||
|
if v != deduped[i] {
|
||||||
|
t.Fatalf("iterator kmer %d: got %d, want %d", i, v, deduped[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuilderMultipleSequences(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
seqs := []string{
|
||||||
|
"ACGTACGTACGTACGTACGTACGTACGT",
|
||||||
|
"TTTTTTTTTTTTTTTTTTTTTTTTT",
|
||||||
|
"GGGGGGGGGGGGGGGGGGGGGGGG",
|
||||||
|
}
|
||||||
|
for _, s := range seqs {
|
||||||
|
seq := obiseq.NewBioSequence("", []byte(s), "")
|
||||||
|
builder.AddSequence(0, seq)
|
||||||
|
}
|
||||||
|
|
||||||
|
ksg, err := builder.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if ksg.Len(0) == 0 {
|
||||||
|
t.Fatal("expected k-mers after multiple sequences")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuilderFrequencyFilter(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64,
|
||||||
|
WithMinFrequency(3))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add same sequence 3 times — all k-mers should survive freq=3
|
||||||
|
seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "")
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
builder.AddSequence(0, seq)
|
||||||
|
}
|
||||||
|
|
||||||
|
ksg, err := builder.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// All k-mers appear exactly 3 times → all should survive
|
||||||
|
var expected []uint64
|
||||||
|
for kmer := range IterCanonicalKmers(seq.Sequence(), 15) {
|
||||||
|
expected = append(expected, kmer)
|
||||||
|
}
|
||||||
|
sort.Slice(expected, func(i, j int) bool { return expected[i] < expected[j] })
|
||||||
|
deduped := expected[:0]
|
||||||
|
for i, v := range expected {
|
||||||
|
if i == 0 || v != expected[i-1] {
|
||||||
|
deduped = append(deduped, v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ksg.Len(0) != uint64(len(deduped)) {
|
||||||
|
t.Fatalf("Len(0) = %d, expected %d (all k-mers at freq=3)", ksg.Len(0), len(deduped))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuilderFrequencyFilterRejects(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64,
|
||||||
|
WithMinFrequency(5))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use a non-repetitive sequence so each canonical k-mer appears once per pass.
|
||||||
|
// Adding it twice gives freq=2 per kmer, which is < minFreq=5 → all rejected.
|
||||||
|
seq := obiseq.NewBioSequence("test",
|
||||||
|
[]byte("ACGATCGATCTAGCTAGCTGATCGATCGATCG"), "")
|
||||||
|
builder.AddSequence(0, seq)
|
||||||
|
builder.AddSequence(0, seq)
|
||||||
|
|
||||||
|
ksg, err := builder.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if ksg.Len(0) != 0 {
|
||||||
|
t.Fatalf("Len(0) = %d, expected 0 (all k-mers at freq=2 < minFreq=5)", ksg.Len(0))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuilderMultipleSets(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 3, 64)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
seqs := []string{
|
||||||
|
"ACGTACGTACGTACGTACGTACGTACGT",
|
||||||
|
"TTTTTTTTTTTTTTTTTTTTTTTTT",
|
||||||
|
"GGGGGGGGGGGGGGGGGGGGGGGG",
|
||||||
|
}
|
||||||
|
for i, s := range seqs {
|
||||||
|
seq := obiseq.NewBioSequence("", []byte(s), "")
|
||||||
|
builder.AddSequence(i, seq)
|
||||||
|
}
|
||||||
|
|
||||||
|
ksg, err := builder.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if ksg.Size() != 3 {
|
||||||
|
t.Fatalf("Size() = %d, want 3", ksg.Size())
|
||||||
|
}
|
||||||
|
for s := 0; s < 3; s++ {
|
||||||
|
if ksg.Len(s) == 0 {
|
||||||
|
t.Fatalf("Len(%d) = 0, expected some k-mers", s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuilderOpenRoundTrip(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "")
|
||||||
|
builder.AddSequence(0, seq)
|
||||||
|
|
||||||
|
ksg1, err := builder.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reopen
|
||||||
|
ksg2, err := OpenKmerSetGroup(dir)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if ksg2.K() != ksg1.K() {
|
||||||
|
t.Fatalf("K mismatch: %d vs %d", ksg2.K(), ksg1.K())
|
||||||
|
}
|
||||||
|
if ksg2.M() != ksg1.M() {
|
||||||
|
t.Fatalf("M mismatch: %d vs %d", ksg2.M(), ksg1.M())
|
||||||
|
}
|
||||||
|
if ksg2.Partitions() != ksg1.Partitions() {
|
||||||
|
t.Fatalf("Partitions mismatch: %d vs %d", ksg2.Partitions(), ksg1.Partitions())
|
||||||
|
}
|
||||||
|
if ksg2.Len(0) != ksg1.Len(0) {
|
||||||
|
t.Fatalf("Len mismatch: %d vs %d", ksg2.Len(0), ksg1.Len(0))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuilderAttributes(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
|
||||||
|
builder, err := NewKmerSetGroupBuilder(dir, 15, 7, 1, 64)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
seq := obiseq.NewBioSequence("test", []byte("ACGTACGTACGTACGTACGTACGTACGT"), "")
|
||||||
|
builder.AddSequence(0, seq)
|
||||||
|
|
||||||
|
ksg, err := builder.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
ksg.SetId("my_index")
|
||||||
|
ksg.SetAttribute("organism", "test")
|
||||||
|
ksg.SaveMetadata()
|
||||||
|
|
||||||
|
// Reopen and check
|
||||||
|
ksg2, err := OpenKmerSetGroup(dir)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if ksg2.Id() != "my_index" {
|
||||||
|
t.Fatalf("Id() = %q, want %q", ksg2.Id(), "my_index")
|
||||||
|
}
|
||||||
|
if !ksg2.HasAttribute("organism") {
|
||||||
|
t.Fatal("expected 'organism' attribute")
|
||||||
|
}
|
||||||
|
v, _ := ksg2.GetAttribute("organism")
|
||||||
|
if v != "test" {
|
||||||
|
t.Fatalf("organism = %v, want 'test'", v)
|
||||||
|
}
|
||||||
|
}
|
||||||
944
pkg/obikmer/kmer_set_disk.go
Normal file
944
pkg/obikmer/kmer_set_disk.go
Normal file
@@ -0,0 +1,944 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"iter"
|
||||||
|
"os"
|
||||||
|
"path"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
|
||||||
|
"github.com/pelletier/go-toml/v2"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MetadataFormat identifies the serialization format used for index
// metadata. Disk-based indices currently always use TOML; the other
// values are kept for backward compatibility with CLI options.
type MetadataFormat int

const (
	FormatTOML MetadataFormat = iota
	FormatYAML
	FormatJSON
)

// String returns the file extension associated with the format.
// Any unknown value falls back to "toml".
func (f MetadataFormat) String() string {
	if f == FormatYAML {
		return "yaml"
	}
	if f == FormatJSON {
		return "json"
	}
	return "toml"
}
|
||||||
|
|
||||||
|
// KmerSetGroup is a disk-based collection of N k-mer sets sharing the same
// k, m, and partition count P. After construction (via KmerSetGroupBuilder),
// it is immutable and all operations are streaming (partition by partition).
//
// A KmerSetGroup with Size()==1 is effectively a KmerSet (singleton).
//
// On disk, each set lives under <path>/set_<i>/ with one part_<pppp>.kdi
// file per partition (see partitionPath), and the whole group is described
// by <path>/metadata.toml (see diskMetadata).
type KmerSetGroup struct {
	path string // root directory of the on-disk index
	id string // user-assigned identifier of the whole group
	k int // k-mer size
	m int // minimizer size
	partitions int // number of partitions P
	n int // number of sets N
	setsIDs []string // IDs of individual sets (length n)
	counts []uint64 // total k-mer count per set (sum over partitions)
	setsMetadata []map[string]interface{} // per-set user metadata (length n)
	Metadata map[string]interface{} // group-level user metadata (exported; serialized as user_metadata)
}
|
||||||
|
|
||||||
|
// diskMetadata is the TOML-serializable structure for metadata.toml.
// Optional fields are tagged omitempty; OpenKmerSetGroup fills in
// defaults (empty slices/maps, recomputed counts) when they are absent.
type diskMetadata struct {
	ID string `toml:"id,omitempty"` // group identifier
	K int `toml:"k"` // k-mer size
	M int `toml:"m"` // minimizer size
	Partitions int `toml:"partitions"` // partition count P
	Type string `toml:"type"` // always "KmerSetGroup" when written by saveMetadata
	Size int `toml:"size"` // number of sets N
	SetsIDs []string `toml:"sets_ids,omitempty"` // per-set identifiers
	Counts []uint64 `toml:"counts,omitempty"` // per-set k-mer counts
	SetsMetadata []map[string]interface{} `toml:"sets_metadata,omitempty"` // per-set user metadata
	UserMetadata map[string]interface{} `toml:"user_metadata,omitempty"` // group-level user metadata
}
|
||||||
|
|
||||||
|
// OpenKmerSetGroup opens a finalized index directory in read-only mode.
// It decodes <directory>/metadata.toml, replaces absent optional fields
// with empty structures, and — when per-set counts are missing from the
// metadata — recomputes them by scanning every partition file.
func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
	metaPath := filepath.Join(directory, "metadata.toml")
	f, err := os.Open(metaPath)
	if err != nil {
		return nil, fmt.Errorf("obikmer: open metadata: %w", err)
	}
	defer f.Close()

	var meta diskMetadata
	if err := toml.NewDecoder(f).Decode(&meta); err != nil {
		return nil, fmt.Errorf("obikmer: decode metadata: %w", err)
	}

	ksg := &KmerSetGroup{
		path:         directory,
		id:           meta.ID,
		k:            meta.K,
		m:            meta.M,
		partitions:   meta.Partitions,
		n:            meta.Size,
		setsIDs:      meta.SetsIDs,
		counts:       meta.Counts,
		setsMetadata: meta.SetsMetadata,
		Metadata:     meta.UserMetadata,
	}
	// Optional sections may be omitted from the file (omitempty tags):
	// normalize nils to empty structures so accessors never nil-deref.
	if ksg.Metadata == nil {
		ksg.Metadata = make(map[string]interface{})
	}
	if ksg.setsIDs == nil {
		ksg.setsIDs = make([]string, ksg.n)
	}
	if ksg.setsMetadata == nil {
		ksg.setsMetadata = make([]map[string]interface{}, ksg.n)
		for i := range ksg.setsMetadata {
			ksg.setsMetadata[i] = make(map[string]interface{})
		}
	}
	if ksg.counts == nil {
		// Compute counts by scanning partitions. A partition that cannot
		// be opened is skipped (treated as empty) rather than failing.
		ksg.counts = make([]uint64, ksg.n)
		for s := 0; s < ksg.n; s++ {
			for p := 0; p < ksg.partitions; p++ {
				path := ksg.partitionPath(s, p)
				r, err := NewKdiReader(path)
				if err != nil {
					continue
				}
				ksg.counts[s] += r.Count()
				r.Close()
			}
		}
	}

	return ksg, nil
}
|
||||||
|
|
||||||
|
// NewFilteredKmerSetGroup creates a KmerSetGroup from pre-computed data.
|
||||||
|
// Used by the filter command to construct a new group after filtering partitions.
|
||||||
|
func NewFilteredKmerSetGroup(
|
||||||
|
directory string, k, m, partitions, n int,
|
||||||
|
setsIDs []string, counts []uint64,
|
||||||
|
setsMetadata []map[string]interface{},
|
||||||
|
) (*KmerSetGroup, error) {
|
||||||
|
ksg := &KmerSetGroup{
|
||||||
|
path: directory,
|
||||||
|
k: k,
|
||||||
|
m: m,
|
||||||
|
partitions: partitions,
|
||||||
|
n: n,
|
||||||
|
setsIDs: setsIDs,
|
||||||
|
counts: counts,
|
||||||
|
setsMetadata: setsMetadata,
|
||||||
|
Metadata: make(map[string]interface{}),
|
||||||
|
}
|
||||||
|
return ksg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// SaveMetadata writes the metadata.toml file. This is useful after
|
||||||
|
// modifying attributes or IDs on an already-finalized index.
|
||||||
|
func (ksg *KmerSetGroup) SaveMetadata() error {
|
||||||
|
return ksg.saveMetadata()
|
||||||
|
}
|
||||||
|
|
||||||
|
// saveMetadata writes the metadata.toml file (internal).
|
||||||
|
func (ksg *KmerSetGroup) saveMetadata() error {
|
||||||
|
meta := diskMetadata{
|
||||||
|
ID: ksg.id,
|
||||||
|
K: ksg.k,
|
||||||
|
M: ksg.m,
|
||||||
|
Partitions: ksg.partitions,
|
||||||
|
Type: "KmerSetGroup",
|
||||||
|
Size: ksg.n,
|
||||||
|
SetsIDs: ksg.setsIDs,
|
||||||
|
Counts: ksg.counts,
|
||||||
|
SetsMetadata: ksg.setsMetadata,
|
||||||
|
UserMetadata: ksg.Metadata,
|
||||||
|
}
|
||||||
|
|
||||||
|
metaPath := filepath.Join(ksg.path, "metadata.toml")
|
||||||
|
f, err := os.Create(metaPath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
return toml.NewEncoder(f).Encode(meta)
|
||||||
|
}
|
||||||
|
|
||||||
|
// partitionPath returns the file path for partition p of set s.
|
||||||
|
func (ksg *KmerSetGroup) partitionPath(setIndex, partIndex int) string {
|
||||||
|
return filepath.Join(ksg.path, fmt.Sprintf("set_%d", setIndex),
|
||||||
|
fmt.Sprintf("part_%04d.kdi", partIndex))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Path returns the root directory of the index.
|
||||||
|
func (ksg *KmerSetGroup) Path() string {
|
||||||
|
return ksg.path
|
||||||
|
}
|
||||||
|
|
||||||
|
// K returns the k-mer size.
|
||||||
|
func (ksg *KmerSetGroup) K() int {
|
||||||
|
return ksg.k
|
||||||
|
}
|
||||||
|
|
||||||
|
// M returns the minimizer size.
|
||||||
|
func (ksg *KmerSetGroup) M() int {
|
||||||
|
return ksg.m
|
||||||
|
}
|
||||||
|
|
||||||
|
// Partitions returns the number of partitions P.
|
||||||
|
func (ksg *KmerSetGroup) Partitions() int {
|
||||||
|
return ksg.partitions
|
||||||
|
}
|
||||||
|
|
||||||
|
// Size returns the number of sets N.
|
||||||
|
func (ksg *KmerSetGroup) Size() int {
|
||||||
|
return ksg.n
|
||||||
|
}
|
||||||
|
|
||||||
|
// Id returns the group identifier.
|
||||||
|
func (ksg *KmerSetGroup) Id() string {
|
||||||
|
return ksg.id
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetId sets the group identifier in memory only; it does NOT write
// metadata.toml. Call SaveMetadata afterwards to persist the change
// (the original comment incorrectly claimed the change was persisted).
func (ksg *KmerSetGroup) SetId(id string) {
	ksg.id = id
}
|
||||||
|
|
||||||
|
// Len returns the total number of k-mers.
|
||||||
|
// Without argument: total across all sets.
|
||||||
|
// With argument setIndex: count for that specific set.
|
||||||
|
func (ksg *KmerSetGroup) Len(setIndex ...int) uint64 {
|
||||||
|
if len(setIndex) == 0 {
|
||||||
|
var total uint64
|
||||||
|
for _, c := range ksg.counts {
|
||||||
|
total += c
|
||||||
|
}
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
idx := setIndex[0]
|
||||||
|
if idx < 0 || idx >= ksg.n {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return ksg.counts[idx]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Contains checks if a k-mer is present in the specified set.
// Uses the .kdx sparse index (if available) for fast seeking within
// each partition, then a short linear scan of at most `stride` entries.
// All partitions are searched in parallel since the k-mer's partition
// is not known without its minimizer context.
// A partition that cannot be opened or seeked simply reports "not
// found"; an out-of-range set index returns false immediately.
func (ksg *KmerSetGroup) Contains(setIndex int, kmer uint64) bool {
	if setIndex < 0 || setIndex >= ksg.n {
		return false
	}

	// One result per partition; the channel is buffered to the partition
	// count so no sender can block even if the receiver exits early.
	type result struct {
		found bool
	}
	ch := make(chan result, ksg.partitions)

	for p := 0; p < ksg.partitions; p++ {
		go func(part int) {
			r, err := NewKdiIndexedReader(ksg.partitionPath(setIndex, part))
			if err != nil {
				ch <- result{false}
				return
			}
			defer r.Close()

			// Use index to jump near the target
			if err := r.SeekTo(kmer); err != nil {
				ch <- result{false}
				return
			}

			// Linear scan from the seek position. Entries are in
			// ascending order, so the scan stops as soon as a value
			// exceeds the target.
			for {
				v, ok := r.Next()
				if !ok {
					ch <- result{false}
					return
				}
				if v == kmer {
					ch <- result{true}
					return
				}
				if v > kmer {
					ch <- result{false}
					return
				}
			}
		}(p)
	}

	for i := 0; i < ksg.partitions; i++ {
		res := <-ch
		if res.found {
			// Early exit on the first hit: a detached goroutine drains
			// the remaining results so their senders complete.
			go func() {
				for j := i + 1; j < ksg.partitions; j++ {
					<-ch
				}
			}()
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// Iterator returns an iterator over all k-mers in the specified set,
|
||||||
|
// in sorted order within each partition. Since partitions are independent,
|
||||||
|
// to get a globally sorted stream, use iteratorSorted.
|
||||||
|
func (ksg *KmerSetGroup) Iterator(setIndex int) iter.Seq[uint64] {
|
||||||
|
return func(yield func(uint64) bool) {
|
||||||
|
if setIndex < 0 || setIndex >= ksg.n {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Open all partition readers and merge them
|
||||||
|
readers := make([]*KdiReader, 0, ksg.partitions)
|
||||||
|
for p := 0; p < ksg.partitions; p++ {
|
||||||
|
r, err := NewKdiReader(ksg.partitionPath(setIndex, p))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if r.Count() > 0 {
|
||||||
|
readers = append(readers, r)
|
||||||
|
} else {
|
||||||
|
r.Close()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(readers) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
m := NewKWayMerge(readers)
|
||||||
|
defer m.Close()
|
||||||
|
|
||||||
|
for {
|
||||||
|
kmer, _, ok := m.Next()
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !yield(kmer) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==============================
|
||||||
|
// Attribute API (compatible with old API)
|
||||||
|
// ==============================
|
||||||
|
|
||||||
|
// HasAttribute checks if a metadata key exists.
|
||||||
|
func (ksg *KmerSetGroup) HasAttribute(key string) bool {
|
||||||
|
_, ok := ksg.Metadata[key]
|
||||||
|
return ok
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAttribute returns the value of an attribute.
|
||||||
|
func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool) {
|
||||||
|
switch key {
|
||||||
|
case "id":
|
||||||
|
return ksg.Id(), true
|
||||||
|
case "k":
|
||||||
|
return ksg.K(), true
|
||||||
|
default:
|
||||||
|
value, ok := ksg.Metadata[key]
|
||||||
|
return value, ok
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetAttribute sets a metadata attribute.
|
||||||
|
func (ksg *KmerSetGroup) SetAttribute(key string, value interface{}) {
|
||||||
|
switch key {
|
||||||
|
case "id":
|
||||||
|
if id, ok := value.(string); ok {
|
||||||
|
ksg.SetId(id)
|
||||||
|
} else {
|
||||||
|
panic(fmt.Sprintf("id must be a string, got %T", value))
|
||||||
|
}
|
||||||
|
case "k":
|
||||||
|
panic("k is immutable")
|
||||||
|
default:
|
||||||
|
ksg.Metadata[key] = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteAttribute removes a metadata attribute.
|
||||||
|
func (ksg *KmerSetGroup) DeleteAttribute(key string) {
|
||||||
|
delete(ksg.Metadata, key)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetIntAttribute returns an attribute as int.
|
||||||
|
func (ksg *KmerSetGroup) GetIntAttribute(key string) (int, bool) {
|
||||||
|
v, ok := ksg.GetAttribute(key)
|
||||||
|
if !ok {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
switch val := v.(type) {
|
||||||
|
case int:
|
||||||
|
return val, true
|
||||||
|
case int64:
|
||||||
|
return int(val), true
|
||||||
|
case float64:
|
||||||
|
return int(val), true
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetStringAttribute returns an attribute as string.
|
||||||
|
func (ksg *KmerSetGroup) GetStringAttribute(key string) (string, bool) {
|
||||||
|
v, ok := ksg.GetAttribute(key)
|
||||||
|
if !ok {
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
|
if s, ok := v.(string); ok {
|
||||||
|
return s, true
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%v", v), true
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==============================
|
||||||
|
// Jaccard metrics (streaming, disk-based)
|
||||||
|
// ==============================
|
||||||
|
|
||||||
|
// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix
|
||||||
|
// for all sets in the group. Operates partition by partition in streaming.
|
||||||
|
func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix {
|
||||||
|
n := ksg.n
|
||||||
|
labels := make([]string, n)
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
if i < len(ksg.setsIDs) && ksg.setsIDs[i] != "" {
|
||||||
|
labels[i] = ksg.setsIDs[i]
|
||||||
|
} else {
|
||||||
|
labels[i] = fmt.Sprintf("set_%d", i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dm := obidist.NewDistMatrixWithLabels(labels)
|
||||||
|
|
||||||
|
// Accumulate intersection and union counts
|
||||||
|
intersections := make([][]uint64, n)
|
||||||
|
unions := make([][]uint64, n)
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
intersections[i] = make([]uint64, n)
|
||||||
|
unions[i] = make([]uint64, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process partition by partition
|
||||||
|
var mu sync.Mutex
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
|
for p := 0; p < ksg.partitions; p++ {
|
||||||
|
wg.Add(1)
|
||||||
|
go func(part int) {
|
||||||
|
defer wg.Done()
|
||||||
|
|
||||||
|
// Open all set readers for this partition
|
||||||
|
readers := make([]*KdiReader, n)
|
||||||
|
for s := 0; s < n; s++ {
|
||||||
|
r, err := NewKdiReader(ksg.partitionPath(s, part))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
readers[s] = r
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
for _, r := range readers {
|
||||||
|
if r != nil {
|
||||||
|
r.Close()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Merge all N readers to count intersections and unions
|
||||||
|
activeReaders := make([]*KdiReader, 0, n)
|
||||||
|
activeIndices := make([]int, 0, n)
|
||||||
|
for i, r := range readers {
|
||||||
|
if r != nil && r.Count() > 0 {
|
||||||
|
activeReaders = append(activeReaders, r)
|
||||||
|
activeIndices = append(activeIndices, i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(activeReaders) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
merge := NewKWayMerge(activeReaders)
|
||||||
|
// Don't close merge here since readers are managed above
|
||||||
|
// We only want to iterate
|
||||||
|
|
||||||
|
// We need per-set presence tracking, so we use a custom merge
|
||||||
|
// Rebuild with a direct approach
|
||||||
|
merge.Close() // close the merge (which closes readers)
|
||||||
|
|
||||||
|
// Reopen readers for custom merge
|
||||||
|
for s := 0; s < n; s++ {
|
||||||
|
readers[s] = nil
|
||||||
|
r, err := NewKdiReader(ksg.partitionPath(s, part))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if r.Count() > 0 {
|
||||||
|
readers[s] = r
|
||||||
|
} else {
|
||||||
|
r.Close()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Custom k-way merge that tracks which sets contain each kmer
|
||||||
|
type entry struct {
|
||||||
|
val uint64
|
||||||
|
setIdx int
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use a simpler approach: read all values for this partition into memory
|
||||||
|
// for each set, then do a merge
|
||||||
|
setKmers := make([][]uint64, n)
|
||||||
|
for s := 0; s < n; s++ {
|
||||||
|
if readers[s] == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
kmers := make([]uint64, 0, readers[s].Count())
|
||||||
|
for {
|
||||||
|
v, ok := readers[s].Next()
|
||||||
|
if !ok {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
kmers = append(kmers, v)
|
||||||
|
}
|
||||||
|
setKmers[s] = kmers
|
||||||
|
readers[s].Close()
|
||||||
|
readers[s] = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count pairwise intersections using sorted merge
|
||||||
|
// For each pair (i,j), count kmers present in both
|
||||||
|
localInter := make([][]uint64, n)
|
||||||
|
localUnion := make([][]uint64, n)
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
localInter[i] = make([]uint64, n)
|
||||||
|
localUnion[i] = make([]uint64, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
localUnion[i][i] = uint64(len(setKmers[i]))
|
||||||
|
for j := i + 1; j < n; j++ {
|
||||||
|
a, b := setKmers[i], setKmers[j]
|
||||||
|
var inter uint64
|
||||||
|
ai, bi := 0, 0
|
||||||
|
for ai < len(a) && bi < len(b) {
|
||||||
|
if a[ai] == b[bi] {
|
||||||
|
inter++
|
||||||
|
ai++
|
||||||
|
bi++
|
||||||
|
} else if a[ai] < b[bi] {
|
||||||
|
ai++
|
||||||
|
} else {
|
||||||
|
bi++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
localInter[i][j] = inter
|
||||||
|
localUnion[i][j] = uint64(len(a)) + uint64(len(b)) - inter
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
mu.Lock()
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
for j := i; j < n; j++ {
|
||||||
|
intersections[i][j] += localInter[i][j]
|
||||||
|
unions[i][j] += localUnion[i][j]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mu.Unlock()
|
||||||
|
}(p)
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
// Compute distances from accumulated counts
|
||||||
|
for i := 0; i < n-1; i++ {
|
||||||
|
for j := i + 1; j < n; j++ {
|
||||||
|
u := unions[i][j]
|
||||||
|
if u == 0 {
|
||||||
|
dm.Set(i, j, 1.0)
|
||||||
|
} else {
|
||||||
|
dm.Set(i, j, 1.0-float64(intersections[i][j])/float64(u))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return dm
|
||||||
|
}
|
||||||
|
|
||||||
|
// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix.
|
||||||
|
func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
|
||||||
|
n := ksg.n
|
||||||
|
labels := make([]string, n)
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
if i < len(ksg.setsIDs) && ksg.setsIDs[i] != "" {
|
||||||
|
labels[i] = ksg.setsIDs[i]
|
||||||
|
} else {
|
||||||
|
labels[i] = fmt.Sprintf("set_%d", i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reuse distance computation
|
||||||
|
dm := ksg.JaccardDistanceMatrix()
|
||||||
|
sm := obidist.NewSimilarityMatrixWithLabels(labels)
|
||||||
|
|
||||||
|
for i := 0; i < n-1; i++ {
|
||||||
|
for j := i + 1; j < n; j++ {
|
||||||
|
sm.Set(i, j, 1.0-dm.Get(i, j))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sm
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==============================
|
||||||
|
// Set ID accessors
|
||||||
|
// ==============================
|
||||||
|
|
||||||
|
// SetsIDs returns a copy of the per-set string identifiers.
|
||||||
|
func (ksg *KmerSetGroup) SetsIDs() []string {
|
||||||
|
out := make([]string, len(ksg.setsIDs))
|
||||||
|
copy(out, ksg.setsIDs)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetIDOf returns the string ID of the set at the given index.
|
||||||
|
// Returns "" if index is out of range.
|
||||||
|
func (ksg *KmerSetGroup) SetIDOf(index int) string {
|
||||||
|
if index < 0 || index >= ksg.n {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return ksg.setsIDs[index]
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetSetID sets the string ID of the set at the given index.
|
||||||
|
func (ksg *KmerSetGroup) SetSetID(index int, id string) {
|
||||||
|
if index >= 0 && index < ksg.n {
|
||||||
|
ksg.setsIDs[index] = id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// IndexOfSetID returns the numeric index for a set ID, or -1 if not found.
|
||||||
|
func (ksg *KmerSetGroup) IndexOfSetID(id string) int {
|
||||||
|
for i, sid := range ksg.setsIDs {
|
||||||
|
if sid == id {
|
||||||
|
return i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
// MatchSetIDs resolves glob patterns against set IDs and returns matching
|
||||||
|
// indices sorted in ascending order. Uses path.Match for pattern matching
|
||||||
|
// (supports *, ?, [...] patterns). Returns error if a pattern is malformed.
|
||||||
|
func (ksg *KmerSetGroup) MatchSetIDs(patterns []string) ([]int, error) {
|
||||||
|
seen := make(map[int]bool)
|
||||||
|
for _, pattern := range patterns {
|
||||||
|
for i, sid := range ksg.setsIDs {
|
||||||
|
matched, err := path.Match(pattern, sid)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("obikmer: invalid glob pattern %q: %w", pattern, err)
|
||||||
|
}
|
||||||
|
if matched {
|
||||||
|
seen[i] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result := make([]int, 0, len(seen))
|
||||||
|
for idx := range seen {
|
||||||
|
result = append(result, idx)
|
||||||
|
}
|
||||||
|
sort.Ints(result)
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==============================
|
||||||
|
// Per-set metadata accessors
|
||||||
|
// ==============================
|
||||||
|
|
||||||
|
// GetSetMetadata returns the value of a per-set metadata key.
|
||||||
|
func (ksg *KmerSetGroup) GetSetMetadata(setIndex int, key string) (interface{}, bool) {
|
||||||
|
if setIndex < 0 || setIndex >= ksg.n {
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
v, ok := ksg.setsMetadata[setIndex][key]
|
||||||
|
return v, ok
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetSetMetadata sets a per-set metadata attribute.
|
||||||
|
func (ksg *KmerSetGroup) SetSetMetadata(setIndex int, key string, value interface{}) {
|
||||||
|
if setIndex < 0 || setIndex >= ksg.n {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if ksg.setsMetadata[setIndex] == nil {
|
||||||
|
ksg.setsMetadata[setIndex] = make(map[string]interface{})
|
||||||
|
}
|
||||||
|
ksg.setsMetadata[setIndex][key] = value
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteSetMetadata removes a per-set metadata attribute.
|
||||||
|
func (ksg *KmerSetGroup) DeleteSetMetadata(setIndex int, key string) {
|
||||||
|
if setIndex < 0 || setIndex >= ksg.n {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
delete(ksg.setsMetadata[setIndex], key)
|
||||||
|
}
|
||||||
|
|
||||||
|
// AllSetMetadata returns a copy of all metadata for a given set.
|
||||||
|
func (ksg *KmerSetGroup) AllSetMetadata(setIndex int) map[string]interface{} {
|
||||||
|
if setIndex < 0 || setIndex >= ksg.n {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
out := make(map[string]interface{}, len(ksg.setsMetadata[setIndex]))
|
||||||
|
for k, v := range ksg.setsMetadata[setIndex] {
|
||||||
|
out[k] = v
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==============================
|
||||||
|
// Exported partition path and compatibility
|
||||||
|
// ==============================
|
||||||
|
|
||||||
|
// PartitionPath returns the file path for partition partIndex of set setIndex.
|
||||||
|
func (ksg *KmerSetGroup) PartitionPath(setIndex, partIndex int) string {
|
||||||
|
return ksg.partitionPath(setIndex, partIndex)
|
||||||
|
}
|
||||||
|
|
||||||
|
// SpectrumPath returns the path to the spectrum.bin file for the given set.
|
||||||
|
func (ksg *KmerSetGroup) SpectrumPath(setIndex int) string {
|
||||||
|
return filepath.Join(ksg.path, fmt.Sprintf("set_%d", setIndex), "spectrum.bin")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Spectrum reads the k-mer frequency spectrum for the given set.
|
||||||
|
// Returns nil, nil if no spectrum file exists.
|
||||||
|
func (ksg *KmerSetGroup) Spectrum(setIndex int) (*KmerSpectrum, error) {
|
||||||
|
path := ksg.SpectrumPath(setIndex)
|
||||||
|
if _, err := os.Stat(path); os.IsNotExist(err) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
return ReadSpectrum(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsCompatibleWith returns true if the other group has the same k, m, and partitions.
|
||||||
|
func (ksg *KmerSetGroup) IsCompatibleWith(other *KmerSetGroup) bool {
|
||||||
|
return ksg.k == other.k && ksg.m == other.m && ksg.partitions == other.partitions
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==============================
|
||||||
|
// Set management operations
|
||||||
|
// ==============================
|
||||||
|
|
||||||
|
// NewEmptyCompatible creates an empty KmerSetGroup at destDir with the
// same k, m, and partitions as this group, and writes its metadata.toml.
// NOTE(review): the original comment claimed the destination must not
// already exist, but os.MkdirAll succeeds on an existing directory, so
// an existing destDir is silently reused and its metadata.toml
// overwritten — confirm whether an existence check is intended.
func (ksg *KmerSetGroup) NewEmptyCompatible(destDir string) (*KmerSetGroup, error) {
	if err := os.MkdirAll(destDir, 0755); err != nil {
		return nil, fmt.Errorf("obikmer: create directory: %w", err)
	}

	// Copy only the structural parameters; the new group starts with
	// zero sets and empty metadata.
	dest := &KmerSetGroup{
		path:         destDir,
		k:            ksg.k,
		m:            ksg.m,
		partitions:   ksg.partitions,
		n:            0,
		setsIDs:      []string{},
		counts:       []uint64{},
		setsMetadata: []map[string]interface{}{},
		Metadata:     make(map[string]interface{}),
	}

	if err := dest.saveMetadata(); err != nil {
		return nil, fmt.Errorf("obikmer: write metadata: %w", err)
	}

	return dest, nil
}
|
||||||
|
|
||||||
|
// RemoveSetByID removes the set with the given ID from the group.
// It deletes the set directory, renames all subsequent set directories
// (set_<i> -> set_<i-1>) so numbering stays dense, drops the set from
// the in-memory slices, and rewrites metadata.toml.
// NOTE(review): the operation is not crash-safe — a failure partway
// through the renames leaves directories and metadata inconsistent.
func (ksg *KmerSetGroup) RemoveSetByID(id string) error {
	idx := ksg.IndexOfSetID(id)
	if idx < 0 {
		return fmt.Errorf("obikmer: set ID %q not found", id)
	}

	// Delete the set directory
	setDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", idx))
	if err := os.RemoveAll(setDir); err != nil {
		return fmt.Errorf("obikmer: remove set directory: %w", err)
	}

	// Renumber subsequent sets
	for i := idx + 1; i < ksg.n; i++ {
		oldDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i))
		newDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i-1))
		if err := os.Rename(oldDir, newDir); err != nil {
			return fmt.Errorf("obikmer: rename set_%d to set_%d: %w", i, i-1, err)
		}
	}

	// Update slices (IDs, counts, per-set metadata) and the set count.
	ksg.setsIDs = append(ksg.setsIDs[:idx], ksg.setsIDs[idx+1:]...)
	ksg.counts = append(ksg.counts[:idx], ksg.counts[idx+1:]...)
	ksg.setsMetadata = append(ksg.setsMetadata[:idx], ksg.setsMetadata[idx+1:]...)
	ksg.n--

	return ksg.saveMetadata()
}
|
||||||
|
|
||||||
|
// CopySetsByIDTo copies sets identified by their IDs into a KmerSetGroup
// at destDir. If destDir does not exist, a new compatible empty group is
// created. If it exists, compatibility (k, m, partitions) is checked.
// If a set ID already exists in the destination, an error is returned
// unless force is true (in which case the existing set is replaced).
// Per-set metadata travels with the set.
// The destination group (with its metadata already saved) is returned
// on success; on any error the destination may have been partially
// modified.
func (ksg *KmerSetGroup) CopySetsByIDTo(ids []string, destDir string, force bool) (*KmerSetGroup, error) {
	// Resolve source IDs to indices up front so unknown IDs fail before
	// anything is written.
	srcIndices := make([]int, len(ids))
	for i, id := range ids {
		idx := ksg.IndexOfSetID(id)
		if idx < 0 {
			return nil, fmt.Errorf("obikmer: source set ID %q not found", id)
		}
		srcIndices[i] = idx
	}

	// Open or create destination: the presence of metadata.toml decides
	// whether destDir already holds a group.
	var dest *KmerSetGroup
	metaPath := filepath.Join(destDir, "metadata.toml")
	if _, err := os.Stat(metaPath); err == nil {
		// Destination exists
		dest, err = OpenKmerSetGroup(destDir)
		if err != nil {
			return nil, fmt.Errorf("obikmer: open destination: %w", err)
		}
		if !ksg.IsCompatibleWith(dest) {
			return nil, fmt.Errorf("obikmer: incompatible groups: source (k=%d, m=%d, P=%d) vs dest (k=%d, m=%d, P=%d)",
				ksg.k, ksg.m, ksg.partitions, dest.k, dest.m, dest.partitions)
		}
	} else {
		// Create new destination
		var err error
		dest, err = ksg.NewEmptyCompatible(destDir)
		if err != nil {
			return nil, err
		}
	}

	// Copy each set
	for i, srcIdx := range srcIndices {
		srcID := ids[i]

		// Check for ID conflict in destination
		existingIdx := dest.IndexOfSetID(srcID)
		if existingIdx >= 0 {
			if !force {
				return nil, fmt.Errorf("obikmer: set ID %q already exists in destination (use force to replace)", srcID)
			}
			// Force: remove existing set in destination
			if err := dest.RemoveSetByID(srcID); err != nil {
				return nil, fmt.Errorf("obikmer: remove existing set %q in destination: %w", srcID, err)
			}
		}

		// Destination set index = current dest size
		destIdx := dest.n

		// Create destination set directory
		destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", destIdx))
		if err := os.MkdirAll(destSetDir, 0755); err != nil {
			return nil, fmt.Errorf("obikmer: create dest set dir: %w", err)
		}

		// Copy all partition files and their .kdx indices
		for p := 0; p < ksg.partitions; p++ {
			srcPath := ksg.partitionPath(srcIdx, p)
			destPath := dest.partitionPath(destIdx, p)
			if err := copyFile(srcPath, destPath); err != nil {
				return nil, fmt.Errorf("obikmer: copy partition %d of set %q: %w", p, srcID, err)
			}
			// Copy .kdx index if it exists
			srcKdx := KdxPathForKdi(srcPath)
			if _, err := os.Stat(srcKdx); err == nil {
				destKdx := KdxPathForKdi(destPath)
				if err := copyFile(srcKdx, destKdx); err != nil {
					return nil, fmt.Errorf("obikmer: copy index %d of set %q: %w", p, srcID, err)
				}
			}
		}

		// Copy spectrum.bin if it exists
		srcSpecPath := ksg.SpectrumPath(srcIdx)
		if _, err := os.Stat(srcSpecPath); err == nil {
			destSpecPath := filepath.Join(destSetDir, "spectrum.bin")
			if err := copyFile(srcSpecPath, destSpecPath); err != nil {
				return nil, fmt.Errorf("obikmer: copy spectrum of set %q: %w", srcID, err)
			}
		}

		// Update destination metadata (ID, count, per-set metadata).
		dest.setsIDs = append(dest.setsIDs, srcID)
		dest.counts = append(dest.counts, ksg.counts[srcIdx])

		// Copy per-set metadata (AllSetMetadata returns a shallow copy,
		// so source and destination do not share the map).
		srcMeta := ksg.AllSetMetadata(srcIdx)
		if srcMeta == nil {
			srcMeta = make(map[string]interface{})
		}
		dest.setsMetadata = append(dest.setsMetadata, srcMeta)
		dest.n++
	}

	if err := dest.saveMetadata(); err != nil {
		return nil, fmt.Errorf("obikmer: save destination metadata: %w", err)
	}

	return dest, nil
}
|
||||||
|
|
||||||
|
// copyFile copies a file from src to dst.
|
||||||
|
func copyFile(src, dst string) error {
|
||||||
|
in, err := os.Open(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer in.Close()
|
||||||
|
|
||||||
|
out, err := os.Create(dst)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer out.Close()
|
||||||
|
|
||||||
|
if _, err := io.Copy(out, in); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return out.Close()
|
||||||
|
}
|
||||||
568
pkg/obikmer/kmer_set_disk_ops.go
Normal file
568
pkg/obikmer/kmer_set_disk_ops.go
Normal file
@@ -0,0 +1,568 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"sync"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Union computes the union of all sets in the group, producing a new
|
||||||
|
// singleton KmerSetGroup on disk. A k-mer is in the result if it
|
||||||
|
// appears in any set.
|
||||||
|
func (ksg *KmerSetGroup) Union(outputDir string) (*KmerSetGroup, error) {
|
||||||
|
return ksg.quorumOp(outputDir, 1, ksg.n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Intersect computes the intersection of all sets, producing a new
|
||||||
|
// singleton KmerSetGroup on disk. A k-mer is in the result if it
|
||||||
|
// appears in every set.
|
||||||
|
func (ksg *KmerSetGroup) Intersect(outputDir string) (*KmerSetGroup, error) {
|
||||||
|
return ksg.quorumOp(outputDir, ksg.n, ksg.n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Difference computes set_0 minus the union of all other sets.
|
||||||
|
func (ksg *KmerSetGroup) Difference(outputDir string) (*KmerSetGroup, error) {
|
||||||
|
return ksg.differenceOp(outputDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
// QuorumAtLeast returns k-mers present in at least q sets.
|
||||||
|
func (ksg *KmerSetGroup) QuorumAtLeast(q int, outputDir string) (*KmerSetGroup, error) {
|
||||||
|
return ksg.quorumOp(outputDir, q, ksg.n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// QuorumExactly returns k-mers present in exactly q sets.
|
||||||
|
func (ksg *KmerSetGroup) QuorumExactly(q int, outputDir string) (*KmerSetGroup, error) {
|
||||||
|
return ksg.quorumOp(outputDir, q, q)
|
||||||
|
}
|
||||||
|
|
||||||
|
// QuorumAtMost returns k-mers present in at most q sets.
|
||||||
|
func (ksg *KmerSetGroup) QuorumAtMost(q int, outputDir string) (*KmerSetGroup, error) {
|
||||||
|
return ksg.quorumOp(outputDir, 1, q)
|
||||||
|
}
|
||||||
|
|
||||||
|
// UnionWith merges this group with another, producing a new KmerSetGroup
|
||||||
|
// whose set_i is the union of this.set_i and other.set_i.
|
||||||
|
// Both groups must have the same k, m, P, and N.
|
||||||
|
func (ksg *KmerSetGroup) UnionWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error) {
|
||||||
|
if err := ksg.checkCompatible(other); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return ksg.pairwiseOp(other, outputDir, mergeUnion)
|
||||||
|
}
|
||||||
|
|
||||||
|
// IntersectWith merges this group with another, producing a new KmerSetGroup
|
||||||
|
// whose set_i is the intersection of this.set_i and other.set_i.
|
||||||
|
func (ksg *KmerSetGroup) IntersectWith(other *KmerSetGroup, outputDir string) (*KmerSetGroup, error) {
|
||||||
|
if err := ksg.checkCompatible(other); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return ksg.pairwiseOp(other, outputDir, mergeIntersect)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==============================
|
||||||
|
// Internal implementation
|
||||||
|
// ==============================
|
||||||
|
|
||||||
|
func (ksg *KmerSetGroup) checkCompatible(other *KmerSetGroup) error {
|
||||||
|
if ksg.k != other.k {
|
||||||
|
return fmt.Errorf("obikmer: incompatible k: %d vs %d", ksg.k, other.k)
|
||||||
|
}
|
||||||
|
if ksg.m != other.m {
|
||||||
|
return fmt.Errorf("obikmer: incompatible m: %d vs %d", ksg.m, other.m)
|
||||||
|
}
|
||||||
|
if ksg.partitions != other.partitions {
|
||||||
|
return fmt.Errorf("obikmer: incompatible partitions: %d vs %d", ksg.partitions, other.partitions)
|
||||||
|
}
|
||||||
|
if ksg.n != other.n {
|
||||||
|
return fmt.Errorf("obikmer: incompatible size: %d vs %d", ksg.n, other.n)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// quorumOp processes all N sets partition by partition.
|
||||||
|
// For each partition, it opens N KdiReaders and does a k-way merge.
|
||||||
|
// A kmer is written to the result if minQ <= count <= maxQ.
|
||||||
|
func (ksg *KmerSetGroup) quorumOp(outputDir string, minQ, maxQ int) (*KmerSetGroup, error) {
|
||||||
|
if minQ < 1 {
|
||||||
|
minQ = 1
|
||||||
|
}
|
||||||
|
if maxQ > ksg.n {
|
||||||
|
maxQ = ksg.n
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create output structure
|
||||||
|
setDir := filepath.Join(outputDir, "set_0")
|
||||||
|
if err := os.MkdirAll(setDir, 0755); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
counts := make([]uint64, ksg.partitions)
|
||||||
|
|
||||||
|
nWorkers := runtime.NumCPU()
|
||||||
|
if nWorkers > ksg.partitions {
|
||||||
|
nWorkers = ksg.partitions
|
||||||
|
}
|
||||||
|
|
||||||
|
jobs := make(chan int, ksg.partitions)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
var errMu sync.Mutex
|
||||||
|
var firstErr error
|
||||||
|
|
||||||
|
for w := 0; w < nWorkers; w++ {
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
for p := range jobs {
|
||||||
|
c, err := ksg.quorumPartition(p, setDir, minQ, maxQ)
|
||||||
|
if err != nil {
|
||||||
|
errMu.Lock()
|
||||||
|
if firstErr == nil {
|
||||||
|
firstErr = err
|
||||||
|
}
|
||||||
|
errMu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
counts[p] = c
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
for p := 0; p < ksg.partitions; p++ {
|
||||||
|
jobs <- p
|
||||||
|
}
|
||||||
|
close(jobs)
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
if firstErr != nil {
|
||||||
|
return nil, firstErr
|
||||||
|
}
|
||||||
|
|
||||||
|
var totalCount uint64
|
||||||
|
for _, c := range counts {
|
||||||
|
totalCount += c
|
||||||
|
}
|
||||||
|
|
||||||
|
result := &KmerSetGroup{
|
||||||
|
path: outputDir,
|
||||||
|
k: ksg.k,
|
||||||
|
m: ksg.m,
|
||||||
|
partitions: ksg.partitions,
|
||||||
|
n: 1,
|
||||||
|
setsIDs: []string{""},
|
||||||
|
counts: []uint64{totalCount},
|
||||||
|
Metadata: make(map[string]interface{}),
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := result.saveMetadata(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// quorumPartition processes a single partition for quorum filtering.
|
||||||
|
func (ksg *KmerSetGroup) quorumPartition(partIdx int, outSetDir string, minQ, maxQ int) (uint64, error) {
|
||||||
|
// Open readers for all sets
|
||||||
|
readers := make([]*KdiReader, 0, ksg.n)
|
||||||
|
for s := 0; s < ksg.n; s++ {
|
||||||
|
r, err := NewKdiReader(ksg.partitionPath(s, partIdx))
|
||||||
|
if err != nil {
|
||||||
|
// Close already-opened readers
|
||||||
|
for _, rr := range readers {
|
||||||
|
rr.Close()
|
||||||
|
}
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
if r.Count() > 0 {
|
||||||
|
readers = append(readers, r)
|
||||||
|
} else {
|
||||||
|
r.Close()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
outPath := filepath.Join(outSetDir, fmt.Sprintf("part_%04d.kdi", partIdx))
|
||||||
|
|
||||||
|
if len(readers) == 0 {
|
||||||
|
// Write empty KDI
|
||||||
|
w, err := NewKdiWriter(outPath)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
return 0, w.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
merge := NewKWayMerge(readers)
|
||||||
|
// merge.Close() will close readers
|
||||||
|
|
||||||
|
w, err := NewKdiWriter(outPath)
|
||||||
|
if err != nil {
|
||||||
|
merge.Close()
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
kmer, count, ok := merge.Next()
|
||||||
|
if !ok {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if count >= minQ && count <= maxQ {
|
||||||
|
if err := w.Write(kmer); err != nil {
|
||||||
|
merge.Close()
|
||||||
|
w.Close()
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
merge.Close()
|
||||||
|
cnt := w.Count()
|
||||||
|
return cnt, w.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// differenceOp computes set_0 minus the union of all other sets.
|
||||||
|
func (ksg *KmerSetGroup) differenceOp(outputDir string) (*KmerSetGroup, error) {
|
||||||
|
if ksg.n < 1 {
|
||||||
|
return nil, fmt.Errorf("obikmer: difference requires at least 1 set")
|
||||||
|
}
|
||||||
|
|
||||||
|
setDir := filepath.Join(outputDir, "set_0")
|
||||||
|
if err := os.MkdirAll(setDir, 0755); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
counts := make([]uint64, ksg.partitions)
|
||||||
|
|
||||||
|
nWorkers := runtime.NumCPU()
|
||||||
|
if nWorkers > ksg.partitions {
|
||||||
|
nWorkers = ksg.partitions
|
||||||
|
}
|
||||||
|
|
||||||
|
jobs := make(chan int, ksg.partitions)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
var errMu sync.Mutex
|
||||||
|
var firstErr error
|
||||||
|
|
||||||
|
for w := 0; w < nWorkers; w++ {
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
for p := range jobs {
|
||||||
|
c, err := ksg.differencePartition(p, setDir)
|
||||||
|
if err != nil {
|
||||||
|
errMu.Lock()
|
||||||
|
if firstErr == nil {
|
||||||
|
firstErr = err
|
||||||
|
}
|
||||||
|
errMu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
counts[p] = c
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
for p := 0; p < ksg.partitions; p++ {
|
||||||
|
jobs <- p
|
||||||
|
}
|
||||||
|
close(jobs)
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
if firstErr != nil {
|
||||||
|
return nil, firstErr
|
||||||
|
}
|
||||||
|
|
||||||
|
var totalCount uint64
|
||||||
|
for _, c := range counts {
|
||||||
|
totalCount += c
|
||||||
|
}
|
||||||
|
|
||||||
|
result := &KmerSetGroup{
|
||||||
|
path: outputDir,
|
||||||
|
k: ksg.k,
|
||||||
|
m: ksg.m,
|
||||||
|
partitions: ksg.partitions,
|
||||||
|
n: 1,
|
||||||
|
setsIDs: []string{""},
|
||||||
|
counts: []uint64{totalCount},
|
||||||
|
Metadata: make(map[string]interface{}),
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := result.saveMetadata(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// differencePartition computes set_0 - union(set_1..set_{n-1}) for one partition.
|
||||||
|
func (ksg *KmerSetGroup) differencePartition(partIdx int, outSetDir string) (uint64, error) {
|
||||||
|
outPath := filepath.Join(outSetDir, fmt.Sprintf("part_%04d.kdi", partIdx))
|
||||||
|
|
||||||
|
// Open set_0 reader
|
||||||
|
r0, err := NewKdiReader(ksg.partitionPath(0, partIdx))
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if r0.Count() == 0 {
|
||||||
|
r0.Close()
|
||||||
|
w, err := NewKdiWriter(outPath)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
return 0, w.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Open readers for the other sets and merge them
|
||||||
|
var otherReaders []*KdiReader
|
||||||
|
for s := 1; s < ksg.n; s++ {
|
||||||
|
r, err := NewKdiReader(ksg.partitionPath(s, partIdx))
|
||||||
|
if err != nil {
|
||||||
|
r0.Close()
|
||||||
|
for _, rr := range otherReaders {
|
||||||
|
rr.Close()
|
||||||
|
}
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
if r.Count() > 0 {
|
||||||
|
otherReaders = append(otherReaders, r)
|
||||||
|
} else {
|
||||||
|
r.Close()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
w, err := NewKdiWriter(outPath)
|
||||||
|
if err != nil {
|
||||||
|
r0.Close()
|
||||||
|
for _, rr := range otherReaders {
|
||||||
|
rr.Close()
|
||||||
|
}
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(otherReaders) == 0 {
|
||||||
|
// No other sets — copy set_0
|
||||||
|
for {
|
||||||
|
v, ok := r0.Next()
|
||||||
|
if !ok {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if err := w.Write(v); err != nil {
|
||||||
|
r0.Close()
|
||||||
|
w.Close()
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
r0.Close()
|
||||||
|
cnt := w.Count()
|
||||||
|
return cnt, w.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Merge other sets to get the "subtraction" stream
|
||||||
|
otherMerge := NewKWayMerge(otherReaders)
|
||||||
|
|
||||||
|
// Streaming difference: advance both streams
|
||||||
|
v0, ok0 := r0.Next()
|
||||||
|
vo, _, oko := otherMerge.Next()
|
||||||
|
|
||||||
|
for ok0 {
|
||||||
|
if !oko || v0 < vo {
|
||||||
|
// v0 not in others → emit
|
||||||
|
if err := w.Write(v0); err != nil {
|
||||||
|
r0.Close()
|
||||||
|
otherMerge.Close()
|
||||||
|
w.Close()
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
v0, ok0 = r0.Next()
|
||||||
|
} else if v0 == vo {
|
||||||
|
// v0 in others → skip
|
||||||
|
v0, ok0 = r0.Next()
|
||||||
|
vo, _, oko = otherMerge.Next()
|
||||||
|
} else {
|
||||||
|
// vo < v0 → advance others
|
||||||
|
vo, _, oko = otherMerge.Next()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
r0.Close()
|
||||||
|
otherMerge.Close()
|
||||||
|
cnt := w.Count()
|
||||||
|
return cnt, w.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// mergeMode defines how to combine two values during pairwise operations.
|
||||||
|
type mergeMode int
|
||||||
|
|
||||||
|
const (
|
||||||
|
mergeUnion mergeMode = iota // emit if in either
|
||||||
|
mergeIntersect // emit if in both
|
||||||
|
)
|
||||||
|
|
||||||
|
// pairwiseOp applies a merge operation between corresponding sets of two groups.
|
||||||
|
func (ksg *KmerSetGroup) pairwiseOp(other *KmerSetGroup, outputDir string, mode mergeMode) (*KmerSetGroup, error) {
|
||||||
|
for s := 0; s < ksg.n; s++ {
|
||||||
|
setDir := filepath.Join(outputDir, fmt.Sprintf("set_%d", s))
|
||||||
|
if err := os.MkdirAll(setDir, 0755); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
counts := make([][]uint64, ksg.n)
|
||||||
|
for s := 0; s < ksg.n; s++ {
|
||||||
|
counts[s] = make([]uint64, ksg.partitions)
|
||||||
|
}
|
||||||
|
|
||||||
|
nWorkers := runtime.NumCPU()
|
||||||
|
if nWorkers > ksg.partitions {
|
||||||
|
nWorkers = ksg.partitions
|
||||||
|
}
|
||||||
|
|
||||||
|
type job struct {
|
||||||
|
setIdx int
|
||||||
|
partIdx int
|
||||||
|
}
|
||||||
|
jobs := make(chan job, ksg.n*ksg.partitions)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
var errMu sync.Mutex
|
||||||
|
var firstErr error
|
||||||
|
|
||||||
|
for w := 0; w < nWorkers; w++ {
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
for j := range jobs {
|
||||||
|
c, err := pairwiseMergePartition(
|
||||||
|
ksg.partitionPath(j.setIdx, j.partIdx),
|
||||||
|
other.partitionPath(j.setIdx, j.partIdx),
|
||||||
|
filepath.Join(outputDir, fmt.Sprintf("set_%d", j.setIdx),
|
||||||
|
fmt.Sprintf("part_%04d.kdi", j.partIdx)),
|
||||||
|
mode,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
errMu.Lock()
|
||||||
|
if firstErr == nil {
|
||||||
|
firstErr = err
|
||||||
|
}
|
||||||
|
errMu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
counts[j.setIdx][j.partIdx] = c
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
for s := 0; s < ksg.n; s++ {
|
||||||
|
for p := 0; p < ksg.partitions; p++ {
|
||||||
|
jobs <- job{s, p}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close(jobs)
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
if firstErr != nil {
|
||||||
|
return nil, firstErr
|
||||||
|
}
|
||||||
|
|
||||||
|
totalCounts := make([]uint64, ksg.n)
|
||||||
|
setsIDs := make([]string, ksg.n)
|
||||||
|
for s := 0; s < ksg.n; s++ {
|
||||||
|
for p := 0; p < ksg.partitions; p++ {
|
||||||
|
totalCounts[s] += counts[s][p]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
result := &KmerSetGroup{
|
||||||
|
path: outputDir,
|
||||||
|
k: ksg.k,
|
||||||
|
m: ksg.m,
|
||||||
|
partitions: ksg.partitions,
|
||||||
|
n: ksg.n,
|
||||||
|
setsIDs: setsIDs,
|
||||||
|
counts: totalCounts,
|
||||||
|
Metadata: make(map[string]interface{}),
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := result.saveMetadata(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// pairwiseMergePartition merges two KDI files (sorted streams) with the given mode.
|
||||||
|
func pairwiseMergePartition(pathA, pathB, outPath string, mode mergeMode) (uint64, error) {
|
||||||
|
rA, err := NewKdiReader(pathA)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
rB, err := NewKdiReader(pathB)
|
||||||
|
if err != nil {
|
||||||
|
rA.Close()
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
w, err := NewKdiWriter(outPath)
|
||||||
|
if err != nil {
|
||||||
|
rA.Close()
|
||||||
|
rB.Close()
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
cnt, mergeErr := doPairwiseMerge(rA, rB, w, mode)
|
||||||
|
rA.Close()
|
||||||
|
rB.Close()
|
||||||
|
closeErr := w.Close()
|
||||||
|
if mergeErr != nil {
|
||||||
|
return 0, mergeErr
|
||||||
|
}
|
||||||
|
return cnt, closeErr
|
||||||
|
}
|
||||||
|
|
||||||
|
func doPairwiseMerge(rA, rB *KdiReader, w *KdiWriter, mode mergeMode) (uint64, error) {
|
||||||
|
vA, okA := rA.Next()
|
||||||
|
vB, okB := rB.Next()
|
||||||
|
|
||||||
|
for okA && okB {
|
||||||
|
if vA == vB {
|
||||||
|
if err := w.Write(vA); err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
vA, okA = rA.Next()
|
||||||
|
vB, okB = rB.Next()
|
||||||
|
} else if vA < vB {
|
||||||
|
if mode == mergeUnion {
|
||||||
|
if err := w.Write(vA); err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
vA, okA = rA.Next()
|
||||||
|
} else {
|
||||||
|
if mode == mergeUnion {
|
||||||
|
if err := w.Write(vB); err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
vB, okB = rB.Next()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if mode == mergeUnion {
|
||||||
|
for okA {
|
||||||
|
if err := w.Write(vA); err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
vA, okA = rA.Next()
|
||||||
|
}
|
||||||
|
for okB {
|
||||||
|
if err := w.Write(vB); err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
vB, okB = rB.Next()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return w.Count(), nil
|
||||||
|
}
|
||||||
251
pkg/obikmer/kmer_set_disk_ops_test.go
Normal file
251
pkg/obikmer/kmer_set_disk_ops_test.go
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// buildGroupFromSeqs creates a KmerSetGroup with one set per sequence.
|
||||||
|
func buildGroupFromSeqs(t *testing.T, dir string, k, m int, seqs []string) *KmerSetGroup {
|
||||||
|
t.Helper()
|
||||||
|
n := len(seqs)
|
||||||
|
builder, err := NewKmerSetGroupBuilder(dir, k, m, n, 64)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for i, s := range seqs {
|
||||||
|
seq := obiseq.NewBioSequence("", []byte(s), "")
|
||||||
|
builder.AddSequence(i, seq)
|
||||||
|
}
|
||||||
|
ksg, err := builder.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
return ksg
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectKmers(t *testing.T, ksg *KmerSetGroup, setIdx int) []uint64 {
|
||||||
|
t.Helper()
|
||||||
|
var result []uint64
|
||||||
|
for kmer := range ksg.Iterator(setIdx) {
|
||||||
|
result = append(result, kmer)
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiskOpsUnion(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
indexDir := filepath.Join(dir, "index")
|
||||||
|
outDir := filepath.Join(dir, "union")
|
||||||
|
|
||||||
|
// Two sequences with some overlap
|
||||||
|
seqs := []string{
|
||||||
|
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
|
||||||
|
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
|
||||||
|
}
|
||||||
|
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
|
||||||
|
|
||||||
|
result, err := ksg.Union(outDir)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Union should have at least as many k-mers as each individual set
|
||||||
|
unionLen := result.Len(0)
|
||||||
|
if unionLen == 0 {
|
||||||
|
t.Fatal("union is empty")
|
||||||
|
}
|
||||||
|
if unionLen < ksg.Len(0) || unionLen < ksg.Len(1) {
|
||||||
|
t.Fatalf("union (%d) smaller than an input set (%d, %d)", unionLen, ksg.Len(0), ksg.Len(1))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Union should not exceed the sum of both sets
|
||||||
|
if unionLen > ksg.Len(0)+ksg.Len(1) {
|
||||||
|
t.Fatalf("union (%d) larger than sum of sets (%d)", unionLen, ksg.Len(0)+ksg.Len(1))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiskOpsIntersect(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
indexDir := filepath.Join(dir, "index")
|
||||||
|
outDir := filepath.Join(dir, "intersect")
|
||||||
|
|
||||||
|
// Two sequences with some shared k-mers
|
||||||
|
seqs := []string{
|
||||||
|
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
|
||||||
|
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
|
||||||
|
}
|
||||||
|
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
|
||||||
|
|
||||||
|
result, err := ksg.Intersect(outDir)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
interLen := result.Len(0)
|
||||||
|
// Intersection should not be bigger than any individual set
|
||||||
|
if interLen > ksg.Len(0) || interLen > ksg.Len(1) {
|
||||||
|
t.Fatalf("intersection (%d) larger than input sets (%d, %d)", interLen, ksg.Len(0), ksg.Len(1))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiskOpsDifference(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
indexDir := filepath.Join(dir, "index")
|
||||||
|
outDir := filepath.Join(dir, "diff")
|
||||||
|
|
||||||
|
seqs := []string{
|
||||||
|
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
|
||||||
|
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
|
||||||
|
}
|
||||||
|
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
|
||||||
|
|
||||||
|
result, err := ksg.Difference(outDir)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
diffLen := result.Len(0)
|
||||||
|
// Difference = set_0 - set_1, so should be <= set_0
|
||||||
|
if diffLen > ksg.Len(0) {
|
||||||
|
t.Fatalf("difference (%d) larger than set_0 (%d)", diffLen, ksg.Len(0))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiskOpsConsistency(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
indexDir := filepath.Join(dir, "index")
|
||||||
|
|
||||||
|
seqs := []string{
|
||||||
|
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
|
||||||
|
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
|
||||||
|
}
|
||||||
|
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
|
||||||
|
|
||||||
|
unionResult, err := ksg.Union(filepath.Join(dir, "union"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
interResult, err := ksg.Intersect(filepath.Join(dir, "intersect"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
diffResult, err := ksg.Difference(filepath.Join(dir, "diff"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
unionLen := unionResult.Len(0)
|
||||||
|
interLen := interResult.Len(0)
|
||||||
|
diffLen := diffResult.Len(0)
|
||||||
|
|
||||||
|
// |A ∪ B| = |A| + |B| - |A ∩ B|
|
||||||
|
expectedUnion := ksg.Len(0) + ksg.Len(1) - interLen
|
||||||
|
if unionLen != expectedUnion {
|
||||||
|
t.Fatalf("|A∪B|=%d, expected |A|+|B|-|A∩B|=%d+%d-%d=%d",
|
||||||
|
unionLen, ksg.Len(0), ksg.Len(1), interLen, expectedUnion)
|
||||||
|
}
|
||||||
|
|
||||||
|
// |A \ B| = |A| - |A ∩ B|
|
||||||
|
expectedDiff := ksg.Len(0) - interLen
|
||||||
|
if diffLen != expectedDiff {
|
||||||
|
t.Fatalf("|A\\B|=%d, expected |A|-|A∩B|=%d-%d=%d",
|
||||||
|
diffLen, ksg.Len(0), interLen, expectedDiff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiskOpsQuorum(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
indexDir := filepath.Join(dir, "index")
|
||||||
|
|
||||||
|
// Three sets
|
||||||
|
seqs := []string{
|
||||||
|
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
|
||||||
|
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
|
||||||
|
"GATCGATCGATCGAAATTTCCCGGG",
|
||||||
|
}
|
||||||
|
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
|
||||||
|
|
||||||
|
// QuorumAtLeast(1) = Union
|
||||||
|
q1, err := ksg.QuorumAtLeast(1, filepath.Join(dir, "q1"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
union, err := ksg.Union(filepath.Join(dir, "union"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if q1.Len(0) != union.Len(0) {
|
||||||
|
t.Fatalf("QuorumAtLeast(1)=%d != Union=%d", q1.Len(0), union.Len(0))
|
||||||
|
}
|
||||||
|
|
||||||
|
// QuorumAtLeast(3) = Intersect
|
||||||
|
q3, err := ksg.QuorumAtLeast(3, filepath.Join(dir, "q3"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
inter, err := ksg.Intersect(filepath.Join(dir, "inter"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if q3.Len(0) != inter.Len(0) {
|
||||||
|
t.Fatalf("QuorumAtLeast(3)=%d != Intersect=%d", q3.Len(0), inter.Len(0))
|
||||||
|
}
|
||||||
|
|
||||||
|
// QuorumAtLeast(2) should be between Intersect and Union
|
||||||
|
q2, err := ksg.QuorumAtLeast(2, filepath.Join(dir, "q2"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if q2.Len(0) < q3.Len(0) || q2.Len(0) > q1.Len(0) {
|
||||||
|
t.Fatalf("QuorumAtLeast(2)=%d not between intersect=%d and union=%d",
|
||||||
|
q2.Len(0), q3.Len(0), q1.Len(0))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiskOpsJaccard(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
indexDir := filepath.Join(dir, "index")
|
||||||
|
|
||||||
|
seqs := []string{
|
||||||
|
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
|
||||||
|
"ACGATCGATCTAGCTAGCTGATCGATCGATCG", // identical to first
|
||||||
|
"TTTTTTTTTTTTTTTTTTTTTTTTT", // completely different
|
||||||
|
}
|
||||||
|
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
|
||||||
|
|
||||||
|
dm := ksg.JaccardDistanceMatrix()
|
||||||
|
if dm == nil {
|
||||||
|
t.Fatal("JaccardDistanceMatrix returned nil")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Identical sets should have distance 0
|
||||||
|
d01 := dm.Get(0, 1)
|
||||||
|
if d01 != 0.0 {
|
||||||
|
t.Fatalf("distance(0,1) = %f, expected 0.0 for identical sets", d01)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Completely different sets should have distance 1.0
|
||||||
|
d02 := dm.Get(0, 2)
|
||||||
|
if d02 != 1.0 {
|
||||||
|
t.Fatalf("distance(0,2) = %f, expected 1.0 for disjoint sets", d02)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Similarity matrix
|
||||||
|
sm := ksg.JaccardSimilarityMatrix()
|
||||||
|
if sm == nil {
|
||||||
|
t.Fatal("JaccardSimilarityMatrix returned nil")
|
||||||
|
}
|
||||||
|
|
||||||
|
s01 := sm.Get(0, 1)
|
||||||
|
if s01 != 1.0 {
|
||||||
|
t.Fatalf("similarity(0,1) = %f, expected 1.0 for identical sets", s01)
|
||||||
|
}
|
||||||
|
|
||||||
|
s02 := sm.Get(0, 2)
|
||||||
|
if s02 != 0.0 {
|
||||||
|
t.Fatalf("similarity(0,2) = %f, expected 0.0 for disjoint sets", s02)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,339 +0,0 @@
|
|||||||
package obikmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
||||||
)
|
|
||||||
|
|
||||||
// KmerSetGroup represents a vector of KmerSet
|
|
||||||
// Used to manage multiple k-mer sets (for example, by frequency level)
|
|
||||||
type KmerSetGroup struct {
|
|
||||||
id string // Unique identifier of the KmerSetGroup
|
|
||||||
k int // Size of k-mers (immutable)
|
|
||||||
sets []*KmerSet // Vector of KmerSet
|
|
||||||
Metadata map[string]interface{} // Group metadata (not individual sets)
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewKmerSetGroup creates a new group of n KmerSets
|
|
||||||
func NewKmerSetGroup(k int, n int) *KmerSetGroup {
|
|
||||||
if n < 1 {
|
|
||||||
panic("KmerSetGroup size must be >= 1")
|
|
||||||
}
|
|
||||||
|
|
||||||
sets := make([]*KmerSet, n)
|
|
||||||
for i := range sets {
|
|
||||||
sets[i] = NewKmerSet(k)
|
|
||||||
}
|
|
||||||
|
|
||||||
return &KmerSetGroup{
|
|
||||||
k: k,
|
|
||||||
sets: sets,
|
|
||||||
Metadata: make(map[string]interface{}),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// K returns the size of k-mers (immutable)
|
|
||||||
func (ksg *KmerSetGroup) K() int {
|
|
||||||
return ksg.k
|
|
||||||
}
|
|
||||||
|
|
||||||
// Size returns the number of KmerSet in the group
|
|
||||||
func (ksg *KmerSetGroup) Size() int {
|
|
||||||
return len(ksg.sets)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get returns the KmerSet at the given index
|
|
||||||
// Returns nil if the index is invalid
|
|
||||||
func (ksg *KmerSetGroup) Get(index int) *KmerSet {
|
|
||||||
if index < 0 || index >= len(ksg.sets) {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return ksg.sets[index]
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set replaces the KmerSet at the given index
|
|
||||||
// Panics if the index is invalid or if k does not match
|
|
||||||
func (ksg *KmerSetGroup) Set(index int, ks *KmerSet) {
|
|
||||||
if index < 0 || index >= len(ksg.sets) {
|
|
||||||
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
|
||||||
}
|
|
||||||
if ks.k != ksg.k {
|
|
||||||
panic(fmt.Sprintf("KmerSet k mismatch: expected %d, got %d", ksg.k, ks.k))
|
|
||||||
}
|
|
||||||
ksg.sets[index] = ks
|
|
||||||
}
|
|
||||||
|
|
||||||
// Len returns the number of k-mers in a specific KmerSet
|
|
||||||
// Without argument: returns the number of k-mers in the last KmerSet
|
|
||||||
// With argument index: returns the number of k-mers in the KmerSet at this index
|
|
||||||
func (ksg *KmerSetGroup) Len(index ...int) uint64 {
|
|
||||||
if len(index) == 0 {
|
|
||||||
// Without argument: last KmerSet
|
|
||||||
return ksg.sets[len(ksg.sets)-1].Len()
|
|
||||||
}
|
|
||||||
|
|
||||||
// With argument: specific KmerSet
|
|
||||||
idx := index[0]
|
|
||||||
if idx < 0 || idx >= len(ksg.sets) {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
return ksg.sets[idx].Len()
|
|
||||||
}
|
|
||||||
|
|
||||||
// MemoryUsage returns the total memory usage in bytes
|
|
||||||
func (ksg *KmerSetGroup) MemoryUsage() uint64 {
|
|
||||||
total := uint64(0)
|
|
||||||
for _, ks := range ksg.sets {
|
|
||||||
total += ks.MemoryUsage()
|
|
||||||
}
|
|
||||||
return total
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clear empties all KmerSet in the group
|
|
||||||
func (ksg *KmerSetGroup) Clear() {
|
|
||||||
for _, ks := range ksg.sets {
|
|
||||||
ks.Clear()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Copy creates a complete copy of the group (consistent with BioSequence.Copy)
|
|
||||||
func (ksg *KmerSetGroup) Copy() *KmerSetGroup {
|
|
||||||
copiedSets := make([]*KmerSet, len(ksg.sets))
|
|
||||||
for i, ks := range ksg.sets {
|
|
||||||
copiedSets[i] = ks.Copy() // Copy each KmerSet with its metadata
|
|
||||||
}
|
|
||||||
|
|
||||||
// Copy group metadata
|
|
||||||
groupMetadata := make(map[string]interface{}, len(ksg.Metadata))
|
|
||||||
for k, v := range ksg.Metadata {
|
|
||||||
groupMetadata[k] = v
|
|
||||||
}
|
|
||||||
|
|
||||||
return &KmerSetGroup{
|
|
||||||
id: ksg.id,
|
|
||||||
k: ksg.k,
|
|
||||||
sets: copiedSets,
|
|
||||||
Metadata: groupMetadata,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Id returns the identifier of the KmerSetGroup (consistent with BioSequence.Id)
|
|
||||||
func (ksg *KmerSetGroup) Id() string {
|
|
||||||
return ksg.id
|
|
||||||
}
|
|
||||||
|
|
||||||
// SetId sets the identifier of the KmerSetGroup (consistent with BioSequence.SetId)
|
|
||||||
func (ksg *KmerSetGroup) SetId(id string) {
|
|
||||||
ksg.id = id
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddSequence adds all k-mers from a sequence to a specific KmerSet
|
|
||||||
func (ksg *KmerSetGroup) AddSequence(seq *obiseq.BioSequence, index int) {
|
|
||||||
if index < 0 || index >= len(ksg.sets) {
|
|
||||||
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
|
||||||
}
|
|
||||||
ksg.sets[index].AddSequence(seq)
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddSequences adds all k-mers from multiple sequences to a specific KmerSet
|
|
||||||
func (ksg *KmerSetGroup) AddSequences(sequences *obiseq.BioSequenceSlice, index int) {
|
|
||||||
if index < 0 || index >= len(ksg.sets) {
|
|
||||||
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
|
||||||
}
|
|
||||||
ksg.sets[index].AddSequences(sequences)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Union returns the union of all KmerSet in the group
|
|
||||||
// Optimization: starts from the largest set to minimize operations
|
|
||||||
func (ksg *KmerSetGroup) Union() *KmerSet {
|
|
||||||
if len(ksg.sets) == 0 {
|
|
||||||
return NewKmerSet(ksg.k)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(ksg.sets) == 1 {
|
|
||||||
return ksg.sets[0].Copy()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find the index of the largest set (the one with the most k-mers)
|
|
||||||
maxIdx := 0
|
|
||||||
maxCard := ksg.sets[0].Len()
|
|
||||||
for i := 1; i < len(ksg.sets); i++ {
|
|
||||||
card := ksg.sets[i].Len()
|
|
||||||
if card > maxCard {
|
|
||||||
maxCard = card
|
|
||||||
maxIdx = i
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Copy the largest set and perform unions in-place
|
|
||||||
result := ksg.sets[maxIdx].bitmap.Clone()
|
|
||||||
for i := 0; i < len(ksg.sets); i++ {
|
|
||||||
if i != maxIdx {
|
|
||||||
result.Or(ksg.sets[i].bitmap)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return NewKmerSetFromBitmap(ksg.k, result)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Intersect returns the intersection of all KmerSet in the group
|
|
||||||
// Optimization: starts from the smallest set to minimize operations
|
|
||||||
func (ksg *KmerSetGroup) Intersect() *KmerSet {
|
|
||||||
if len(ksg.sets) == 0 {
|
|
||||||
return NewKmerSet(ksg.k)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(ksg.sets) == 1 {
|
|
||||||
return ksg.sets[0].Copy()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find the index of the smallest set (the one with the fewest k-mers)
|
|
||||||
minIdx := 0
|
|
||||||
minCard := ksg.sets[0].Len()
|
|
||||||
for i := 1; i < len(ksg.sets); i++ {
|
|
||||||
card := ksg.sets[i].Len()
|
|
||||||
if card < minCard {
|
|
||||||
minCard = card
|
|
||||||
minIdx = i
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Copy the smallest set and perform intersections in-place
|
|
||||||
result := ksg.sets[minIdx].bitmap.Clone()
|
|
||||||
for i := 0; i < len(ksg.sets); i++ {
|
|
||||||
if i != minIdx {
|
|
||||||
result.And(ksg.sets[i].bitmap)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return NewKmerSetFromBitmap(ksg.k, result)
|
|
||||||
}
|
|
||||||
|
|
||||||
// KmerSetGroupStats aggregates the statistics of a whole KmerSetGroup,
// as produced by KmerSetGroup.Stats().
type KmerSetGroupStats struct {
	K          int            // k-mer length shared by every set of the group
	Size       int            // Number of KmerSet
	TotalBytes uint64         // Total memory used
	Sets       []KmerSetStats // Stats of each KmerSet
}
|
|
||||||
|
|
||||||
// KmerSetStats holds the per-set figures reported by
// KmerSetGroup.Stats().
type KmerSetStats struct {
	Index     int    // Index of the KmerSet in the group
	Len       uint64 // Number of k-mers
	SizeBytes uint64 // Size in bytes
}
|
|
||||||
|
|
||||||
func (ksg *KmerSetGroup) Stats() KmerSetGroupStats {
|
|
||||||
stats := KmerSetGroupStats{
|
|
||||||
K: ksg.k,
|
|
||||||
Size: len(ksg.sets),
|
|
||||||
Sets: make([]KmerSetStats, len(ksg.sets)),
|
|
||||||
}
|
|
||||||
|
|
||||||
for i, ks := range ksg.sets {
|
|
||||||
sizeBytes := ks.MemoryUsage()
|
|
||||||
stats.Sets[i] = KmerSetStats{
|
|
||||||
Index: i,
|
|
||||||
Len: ks.Len(),
|
|
||||||
SizeBytes: sizeBytes,
|
|
||||||
}
|
|
||||||
stats.TotalBytes += sizeBytes
|
|
||||||
}
|
|
||||||
|
|
||||||
return stats
|
|
||||||
}
|
|
||||||
|
|
||||||
// String implements fmt.Stringer: it renders the group statistics as a
// multi-line, human-readable report (total memory in MB, then one line
// per set with its index, cardinality and footprint).
//
// NOTE(review): the whitespace inside the string literals below may
// have been mangled by extraction — verify indentation against the
// original file before relying on exact output formatting.
func (ksgs KmerSetGroupStats) String() string {
	// Header with the group-wide figures (bytes converted to MB).
	result := fmt.Sprintf(`KmerSetGroup Statistics (k=%d, size=%d):
Total memory: %.2f MB

Set breakdown:
`, ksgs.K, ksgs.Size, float64(ksgs.TotalBytes)/1024/1024)

	// One report line per member set.
	for _, set := range ksgs.Sets {
		result += fmt.Sprintf(" Set[%d]: %d k-mers (%.2f MB)\n",
			set.Index,
			set.Len,
			float64(set.SizeBytes)/1024/1024)
	}

	return result
}
|
|
||||||
|
|
||||||
// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix for all KmerSets in the group.
|
|
||||||
// Returns a triangular distance matrix where element (i, j) represents the Jaccard distance
|
|
||||||
// between set i and set j.
|
|
||||||
//
|
|
||||||
// The Jaccard distance is: 1 - (|A ∩ B| / |A ∪ B|)
|
|
||||||
//
|
|
||||||
// The matrix labels are set to the IDs of the individual KmerSets if available,
|
|
||||||
// otherwise they are set to "set_0", "set_1", etc.
|
|
||||||
//
|
|
||||||
// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
|
|
||||||
// Space complexity: O(n²) for the distance matrix
|
|
||||||
func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix {
|
|
||||||
n := len(ksg.sets)
|
|
||||||
|
|
||||||
// Create labels from set IDs
|
|
||||||
labels := make([]string, n)
|
|
||||||
for i, ks := range ksg.sets {
|
|
||||||
if ks.Id() != "" {
|
|
||||||
labels[i] = ks.Id()
|
|
||||||
} else {
|
|
||||||
labels[i] = fmt.Sprintf("set_%d", i)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
dm := obidist.NewDistMatrixWithLabels(labels)
|
|
||||||
|
|
||||||
// Compute pairwise distances
|
|
||||||
for i := 0; i < n-1; i++ {
|
|
||||||
for j := i + 1; j < n; j++ {
|
|
||||||
distance := ksg.sets[i].JaccardDistance(ksg.sets[j])
|
|
||||||
dm.Set(i, j, distance)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return dm
|
|
||||||
}
|
|
||||||
|
|
||||||
// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix for all KmerSets in the group.
|
|
||||||
// Returns a similarity matrix where element (i, j) represents the Jaccard similarity
|
|
||||||
// between set i and set j.
|
|
||||||
//
|
|
||||||
// The Jaccard similarity is: |A ∩ B| / |A ∪ B|
|
|
||||||
//
|
|
||||||
// The diagonal is 1.0 (similarity of a set to itself).
|
|
||||||
//
|
|
||||||
// The matrix labels are set to the IDs of the individual KmerSets if available,
|
|
||||||
// otherwise they are set to "set_0", "set_1", etc.
|
|
||||||
//
|
|
||||||
// Time complexity: O(n² × (|A| + |B|)) where n is the number of sets
|
|
||||||
// Space complexity: O(n²) for the similarity matrix
|
|
||||||
func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
|
|
||||||
n := len(ksg.sets)
|
|
||||||
|
|
||||||
// Create labels from set IDs
|
|
||||||
labels := make([]string, n)
|
|
||||||
for i, ks := range ksg.sets {
|
|
||||||
if ks.Id() != "" {
|
|
||||||
labels[i] = ks.Id()
|
|
||||||
} else {
|
|
||||||
labels[i] = fmt.Sprintf("set_%d", i)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
sm := obidist.NewSimilarityMatrixWithLabels(labels)
|
|
||||||
|
|
||||||
// Compute pairwise similarities
|
|
||||||
for i := 0; i < n-1; i++ {
|
|
||||||
for j := i + 1; j < n; j++ {
|
|
||||||
similarity := ksg.sets[i].JaccardSimilarity(ksg.sets[j])
|
|
||||||
sm.Set(i, j, similarity)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return sm
|
|
||||||
}
|
|
||||||
@@ -1,231 +0,0 @@
|
|||||||
package obikmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
"math"
|
|
||||||
"testing"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestKmerSetGroupJaccardDistanceMatrix(t *testing.T) {
|
|
||||||
ksg := NewKmerSetGroup(5, 3)
|
|
||||||
|
|
||||||
// Set 0: {1, 2, 3}
|
|
||||||
ksg.Get(0).AddKmerCode(1)
|
|
||||||
ksg.Get(0).AddKmerCode(2)
|
|
||||||
ksg.Get(0).AddKmerCode(3)
|
|
||||||
ksg.Get(0).SetId("set_A")
|
|
||||||
|
|
||||||
// Set 1: {2, 3, 4}
|
|
||||||
ksg.Get(1).AddKmerCode(2)
|
|
||||||
ksg.Get(1).AddKmerCode(3)
|
|
||||||
ksg.Get(1).AddKmerCode(4)
|
|
||||||
ksg.Get(1).SetId("set_B")
|
|
||||||
|
|
||||||
// Set 2: {5, 6, 7}
|
|
||||||
ksg.Get(2).AddKmerCode(5)
|
|
||||||
ksg.Get(2).AddKmerCode(6)
|
|
||||||
ksg.Get(2).AddKmerCode(7)
|
|
||||||
ksg.Get(2).SetId("set_C")
|
|
||||||
|
|
||||||
dm := ksg.JaccardDistanceMatrix()
|
|
||||||
|
|
||||||
// Check labels
|
|
||||||
if dm.GetLabel(0) != "set_A" {
|
|
||||||
t.Errorf("Expected label 'set_A' at index 0, got '%s'", dm.GetLabel(0))
|
|
||||||
}
|
|
||||||
if dm.GetLabel(1) != "set_B" {
|
|
||||||
t.Errorf("Expected label 'set_B' at index 1, got '%s'", dm.GetLabel(1))
|
|
||||||
}
|
|
||||||
if dm.GetLabel(2) != "set_C" {
|
|
||||||
t.Errorf("Expected label 'set_C' at index 2, got '%s'", dm.GetLabel(2))
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check distances
|
|
||||||
// Distance(0, 1):
|
|
||||||
// Intersection: {2, 3} -> 2 elements
|
|
||||||
// Union: {1, 2, 3, 4} -> 4 elements
|
|
||||||
// Similarity: 2/4 = 0.5
|
|
||||||
// Distance: 1 - 0.5 = 0.5
|
|
||||||
expectedDist01 := 0.5
|
|
||||||
actualDist01 := dm.Get(0, 1)
|
|
||||||
if math.Abs(actualDist01-expectedDist01) > 1e-10 {
|
|
||||||
t.Errorf("Distance(0, 1): expected %f, got %f", expectedDist01, actualDist01)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Distance(0, 2):
|
|
||||||
// Intersection: {} -> 0 elements
|
|
||||||
// Union: {1, 2, 3, 5, 6, 7} -> 6 elements
|
|
||||||
// Similarity: 0/6 = 0
|
|
||||||
// Distance: 1 - 0 = 1.0
|
|
||||||
expectedDist02 := 1.0
|
|
||||||
actualDist02 := dm.Get(0, 2)
|
|
||||||
if math.Abs(actualDist02-expectedDist02) > 1e-10 {
|
|
||||||
t.Errorf("Distance(0, 2): expected %f, got %f", expectedDist02, actualDist02)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Distance(1, 2):
|
|
||||||
// Intersection: {} -> 0 elements
|
|
||||||
// Union: {2, 3, 4, 5, 6, 7} -> 6 elements
|
|
||||||
// Similarity: 0/6 = 0
|
|
||||||
// Distance: 1 - 0 = 1.0
|
|
||||||
expectedDist12 := 1.0
|
|
||||||
actualDist12 := dm.Get(1, 2)
|
|
||||||
if math.Abs(actualDist12-expectedDist12) > 1e-10 {
|
|
||||||
t.Errorf("Distance(1, 2): expected %f, got %f", expectedDist12, actualDist12)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check symmetry
|
|
||||||
if dm.Get(0, 1) != dm.Get(1, 0) {
|
|
||||||
t.Errorf("Matrix not symmetric: Get(0, 1) = %f, Get(1, 0) = %f",
|
|
||||||
dm.Get(0, 1), dm.Get(1, 0))
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check diagonal
|
|
||||||
if dm.Get(0, 0) != 0.0 {
|
|
||||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(0, 0))
|
|
||||||
}
|
|
||||||
if dm.Get(1, 1) != 0.0 {
|
|
||||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(1, 1))
|
|
||||||
}
|
|
||||||
if dm.Get(2, 2) != 0.0 {
|
|
||||||
t.Errorf("Diagonal should be 0, got %f", dm.Get(2, 2))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestKmerSetGroupJaccardSimilarityMatrix(t *testing.T) {
|
|
||||||
ksg := NewKmerSetGroup(5, 3)
|
|
||||||
|
|
||||||
// Set 0: {1, 2, 3}
|
|
||||||
ksg.Get(0).AddKmerCode(1)
|
|
||||||
ksg.Get(0).AddKmerCode(2)
|
|
||||||
ksg.Get(0).AddKmerCode(3)
|
|
||||||
|
|
||||||
// Set 1: {2, 3, 4}
|
|
||||||
ksg.Get(1).AddKmerCode(2)
|
|
||||||
ksg.Get(1).AddKmerCode(3)
|
|
||||||
ksg.Get(1).AddKmerCode(4)
|
|
||||||
|
|
||||||
// Set 2: {1, 2, 3} (same as set 0)
|
|
||||||
ksg.Get(2).AddKmerCode(1)
|
|
||||||
ksg.Get(2).AddKmerCode(2)
|
|
||||||
ksg.Get(2).AddKmerCode(3)
|
|
||||||
|
|
||||||
sm := ksg.JaccardSimilarityMatrix()
|
|
||||||
|
|
||||||
// Check similarities
|
|
||||||
// Similarity(0, 1): 0.5 (as calculated above)
|
|
||||||
expectedSim01 := 0.5
|
|
||||||
actualSim01 := sm.Get(0, 1)
|
|
||||||
if math.Abs(actualSim01-expectedSim01) > 1e-10 {
|
|
||||||
t.Errorf("Similarity(0, 1): expected %f, got %f", expectedSim01, actualSim01)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Similarity(0, 2): 1.0 (identical sets)
|
|
||||||
expectedSim02 := 1.0
|
|
||||||
actualSim02 := sm.Get(0, 2)
|
|
||||||
if math.Abs(actualSim02-expectedSim02) > 1e-10 {
|
|
||||||
t.Errorf("Similarity(0, 2): expected %f, got %f", expectedSim02, actualSim02)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Similarity(1, 2): 0.5
|
|
||||||
// Intersection: {2, 3} -> 2
|
|
||||||
// Union: {1, 2, 3, 4} -> 4
|
|
||||||
// Similarity: 2/4 = 0.5
|
|
||||||
expectedSim12 := 0.5
|
|
||||||
actualSim12 := sm.Get(1, 2)
|
|
||||||
if math.Abs(actualSim12-expectedSim12) > 1e-10 {
|
|
||||||
t.Errorf("Similarity(1, 2): expected %f, got %f", expectedSim12, actualSim12)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check diagonal (similarity to self = 1.0)
|
|
||||||
if sm.Get(0, 0) != 1.0 {
|
|
||||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(0, 0))
|
|
||||||
}
|
|
||||||
if sm.Get(1, 1) != 1.0 {
|
|
||||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(1, 1))
|
|
||||||
}
|
|
||||||
if sm.Get(2, 2) != 1.0 {
|
|
||||||
t.Errorf("Diagonal should be 1.0, got %f", sm.Get(2, 2))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestKmerSetGroupJaccardMatricesRelation(t *testing.T) {
|
|
||||||
ksg := NewKmerSetGroup(5, 4)
|
|
||||||
|
|
||||||
// Create different sets
|
|
||||||
ksg.Get(0).AddKmerCode(1)
|
|
||||||
ksg.Get(0).AddKmerCode(2)
|
|
||||||
|
|
||||||
ksg.Get(1).AddKmerCode(2)
|
|
||||||
ksg.Get(1).AddKmerCode(3)
|
|
||||||
|
|
||||||
ksg.Get(2).AddKmerCode(1)
|
|
||||||
ksg.Get(2).AddKmerCode(2)
|
|
||||||
ksg.Get(2).AddKmerCode(3)
|
|
||||||
|
|
||||||
ksg.Get(3).AddKmerCode(10)
|
|
||||||
ksg.Get(3).AddKmerCode(20)
|
|
||||||
|
|
||||||
dm := ksg.JaccardDistanceMatrix()
|
|
||||||
sm := ksg.JaccardSimilarityMatrix()
|
|
||||||
|
|
||||||
// For all pairs (including diagonal), distance + similarity should equal 1.0
|
|
||||||
for i := 0; i < 4; i++ {
|
|
||||||
for j := 0; j < 4; j++ {
|
|
||||||
distance := dm.Get(i, j)
|
|
||||||
similarity := sm.Get(i, j)
|
|
||||||
sum := distance + similarity
|
|
||||||
|
|
||||||
if math.Abs(sum-1.0) > 1e-10 {
|
|
||||||
t.Errorf("At (%d, %d): distance %f + similarity %f = %f, expected 1.0",
|
|
||||||
i, j, distance, similarity, sum)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestKmerSetGroupJaccardMatrixLabels(t *testing.T) {
|
|
||||||
ksg := NewKmerSetGroup(5, 3)
|
|
||||||
|
|
||||||
// Don't set IDs - should use default labels
|
|
||||||
ksg.Get(0).AddKmerCode(1)
|
|
||||||
ksg.Get(1).AddKmerCode(2)
|
|
||||||
ksg.Get(2).AddKmerCode(3)
|
|
||||||
|
|
||||||
dm := ksg.JaccardDistanceMatrix()
|
|
||||||
|
|
||||||
// Check default labels
|
|
||||||
if dm.GetLabel(0) != "set_0" {
|
|
||||||
t.Errorf("Expected default label 'set_0', got '%s'", dm.GetLabel(0))
|
|
||||||
}
|
|
||||||
if dm.GetLabel(1) != "set_1" {
|
|
||||||
t.Errorf("Expected default label 'set_1', got '%s'", dm.GetLabel(1))
|
|
||||||
}
|
|
||||||
if dm.GetLabel(2) != "set_2" {
|
|
||||||
t.Errorf("Expected default label 'set_2', got '%s'", dm.GetLabel(2))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestKmerSetGroupJaccardMatrixSize(t *testing.T) {
|
|
||||||
ksg := NewKmerSetGroup(5, 5)
|
|
||||||
|
|
||||||
for i := 0; i < 5; i++ {
|
|
||||||
ksg.Get(i).AddKmerCode(uint64(i))
|
|
||||||
}
|
|
||||||
|
|
||||||
dm := ksg.JaccardDistanceMatrix()
|
|
||||||
|
|
||||||
if dm.Size() != 5 {
|
|
||||||
t.Errorf("Expected matrix size 5, got %d", dm.Size())
|
|
||||||
}
|
|
||||||
|
|
||||||
// All sets are disjoint, so all distances should be 1.0
|
|
||||||
for i := 0; i < 5; i++ {
|
|
||||||
for j := i + 1; j < 5; j++ {
|
|
||||||
dist := dm.Get(i, j)
|
|
||||||
if math.Abs(dist-1.0) > 1e-10 {
|
|
||||||
t.Errorf("Expected distance 1.0 for disjoint sets (%d, %d), got %f",
|
|
||||||
i, j, dist)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,235 +0,0 @@
|
|||||||
package obikmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
"container/heap"
|
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring/roaring64"
|
|
||||||
)
|
|
||||||
|
|
||||||
// heapItem is one entry of the min-heap used by the k-way merge:
// a k-mer code (value) tagged with the iterator (idx) it came from.
type heapItem struct {
	value uint64 // current k-mer code produced by iterator idx
	idx   int    // position of the owning iterator in the iterators slice
}
|
|
||||||
|
|
||||||
// kmerMinHeap implements heap.Interface so that container/heap keeps
// the smallest pending k-mer code at index 0 during the k-way merge.
type kmerMinHeap []heapItem

func (h kmerMinHeap) Len() int           { return len(h) }
func (h kmerMinHeap) Less(i, j int) bool { return h[i].value < h[j].value }
func (h kmerMinHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }

// Push appends x (expected to be a heapItem) to the backing slice;
// heap.Push restores the heap invariant afterwards.
func (h *kmerMinHeap) Push(x interface{}) {
	*h = append(*h, x.(heapItem))
}

// Pop removes and returns the last element of the backing slice;
// heap.Pop has already moved the minimum there before calling this.
func (h *kmerMinHeap) Pop() interface{} {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[0 : n-1]
	return x
}
|
|
||||||
|
|
||||||
// QuorumAtLeast returns k-mers present in at least q sets
|
|
||||||
//
|
|
||||||
// Algorithm: K-way merge with min-heap counting
|
|
||||||
//
|
|
||||||
// The algorithm processes all k-mers in sorted order using a min-heap:
|
|
||||||
//
|
|
||||||
// 1. Initialize one iterator per non-empty set
|
|
||||||
// 2. Build a min-heap of (value, set_index) pairs, one per iterator
|
|
||||||
// 3. While heap is not empty:
|
|
||||||
// a. Extract the minimum value v from heap
|
|
||||||
// b. Pop ALL heap items with value == v (counting occurrences)
|
|
||||||
// c. If count >= q, add v to result
|
|
||||||
// d. Advance each popped iterator and re-insert into heap if valid
|
|
||||||
//
|
|
||||||
// This ensures each unique k-mer is counted exactly once across all sets.
|
|
||||||
//
|
|
||||||
// Time complexity: O(M log N)
|
|
||||||
// - M = sum of all set cardinalities (total k-mer occurrences)
|
|
||||||
// - N = number of sets
|
|
||||||
// - Each k-mer occurrence is inserted/extracted from heap once: O(M) operations
|
|
||||||
// - Each heap operation costs O(log N)
|
|
||||||
//
|
|
||||||
// Space complexity: O(N)
|
|
||||||
// - Heap contains at most N elements (one per set iterator)
|
|
||||||
// - Output bitmap size depends on quorum result
|
|
||||||
//
|
|
||||||
// Special cases (optimized):
|
|
||||||
// - q <= 0: returns empty set
|
|
||||||
// - q == 1: delegates to Union() (native OR operations)
|
|
||||||
// - q == n: delegates to Intersect() (native AND operations)
|
|
||||||
// - q > n: returns empty set (impossible to satisfy)
|
|
||||||
func (ksg *KmerSetGroup) QuorumAtLeast(q int) *KmerSet {
|
|
||||||
n := len(ksg.sets)
|
|
||||||
|
|
||||||
// Edge cases
|
|
||||||
if q <= 0 || n == 0 {
|
|
||||||
return NewKmerSet(ksg.k)
|
|
||||||
}
|
|
||||||
if q > n {
|
|
||||||
return NewKmerSet(ksg.k)
|
|
||||||
}
|
|
||||||
if q == 1 {
|
|
||||||
return ksg.Union()
|
|
||||||
}
|
|
||||||
if q == n {
|
|
||||||
return ksg.Intersect()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Initialize iterators for all non-empty sets
|
|
||||||
iterators := make([]roaring64.IntIterable64, 0, n)
|
|
||||||
iterIndices := make([]int, 0, n)
|
|
||||||
|
|
||||||
for i, set := range ksg.sets {
|
|
||||||
if set.Len() > 0 {
|
|
||||||
iter := set.bitmap.Iterator()
|
|
||||||
if iter.HasNext() {
|
|
||||||
iterators = append(iterators, iter)
|
|
||||||
iterIndices = append(iterIndices, i)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(iterators) == 0 {
|
|
||||||
return NewKmerSet(ksg.k)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Initialize heap with first value from each iterator
|
|
||||||
h := make(kmerMinHeap, len(iterators))
|
|
||||||
for i, iter := range iterators {
|
|
||||||
h[i] = heapItem{value: iter.Next(), idx: i}
|
|
||||||
}
|
|
||||||
heap.Init(&h)
|
|
||||||
|
|
||||||
// Result bitmap
|
|
||||||
result := roaring64.New()
|
|
||||||
|
|
||||||
// K-way merge with counting
|
|
||||||
for len(h) > 0 {
|
|
||||||
minVal := h[0].value
|
|
||||||
count := 0
|
|
||||||
activeIndices := make([]int, 0, len(h))
|
|
||||||
|
|
||||||
// Pop all elements with same value (count occurrences)
|
|
||||||
for len(h) > 0 && h[0].value == minVal {
|
|
||||||
item := heap.Pop(&h).(heapItem)
|
|
||||||
count++
|
|
||||||
activeIndices = append(activeIndices, item.idx)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add to result if quorum reached
|
|
||||||
if count >= q {
|
|
||||||
result.Add(minVal)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Advance iterators and re-insert into heap
|
|
||||||
for _, iterIdx := range activeIndices {
|
|
||||||
if iterators[iterIdx].HasNext() {
|
|
||||||
heap.Push(&h, heapItem{
|
|
||||||
value: iterators[iterIdx].Next(),
|
|
||||||
idx: iterIdx,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return NewKmerSetFromBitmap(ksg.k, result)
|
|
||||||
}
|
|
||||||
|
|
||||||
// QuorumAtMost returns k-mers present in at most q sets
|
|
||||||
//
|
|
||||||
// Algorithm: Uses the mathematical identity
|
|
||||||
// AtMost(q) = Union() - AtLeast(q+1)
|
|
||||||
//
|
|
||||||
// Proof:
|
|
||||||
// - Union() contains all k-mers present in at least 1 set
|
|
||||||
// - AtLeast(q+1) contains all k-mers present in q+1 or more sets
|
|
||||||
// - Their difference contains only k-mers present in at most q sets
|
|
||||||
//
|
|
||||||
// Implementation:
|
|
||||||
// 1. Compute U = Union()
|
|
||||||
// 2. Compute A = QuorumAtLeast(q+1)
|
|
||||||
// 3. Return U - A using bitmap AndNot operation
|
|
||||||
//
|
|
||||||
// Time complexity: O(M log N)
|
|
||||||
// - Union(): O(M) with native OR operations
|
|
||||||
// - QuorumAtLeast(q+1): O(M log N)
|
|
||||||
// - AndNot: O(|U|) where |U| <= M
|
|
||||||
// - Total: O(M log N)
|
|
||||||
//
|
|
||||||
// Space complexity: O(N)
|
|
||||||
// - Inherited from QuorumAtLeast heap
|
|
||||||
//
|
|
||||||
// Special cases:
|
|
||||||
// - q <= 0: returns empty set
|
|
||||||
// - q >= n: returns Union() (all k-mers are in at most n sets)
|
|
||||||
func (ksg *KmerSetGroup) QuorumAtMost(q int) *KmerSet {
|
|
||||||
n := len(ksg.sets)
|
|
||||||
|
|
||||||
// Edge cases
|
|
||||||
if q <= 0 {
|
|
||||||
return NewKmerSet(ksg.k)
|
|
||||||
}
|
|
||||||
if q >= n {
|
|
||||||
return ksg.Union()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute Union() - AtLeast(q+1)
|
|
||||||
union := ksg.Union()
|
|
||||||
atLeastQ1 := ksg.QuorumAtLeast(q + 1)
|
|
||||||
|
|
||||||
// Difference: elements in union but not in atLeastQ1
|
|
||||||
result := union.bitmap.Clone()
|
|
||||||
result.AndNot(atLeastQ1.bitmap)
|
|
||||||
|
|
||||||
return NewKmerSetFromBitmap(ksg.k, result)
|
|
||||||
}
|
|
||||||
|
|
||||||
// QuorumExactly returns k-mers present in exactly q sets
|
|
||||||
//
|
|
||||||
// Algorithm: Uses the mathematical identity
|
|
||||||
// Exactly(q) = AtLeast(q) - AtLeast(q+1)
|
|
||||||
//
|
|
||||||
// Proof:
|
|
||||||
// - AtLeast(q) contains all k-mers present in q or more sets
|
|
||||||
// - AtLeast(q+1) contains all k-mers present in q+1 or more sets
|
|
||||||
// - Their difference contains only k-mers present in exactly q sets
|
|
||||||
//
|
|
||||||
// Implementation:
|
|
||||||
// 1. Compute A = QuorumAtLeast(q)
|
|
||||||
// 2. Compute B = QuorumAtLeast(q+1)
|
|
||||||
// 3. Return A - B using bitmap AndNot operation
|
|
||||||
//
|
|
||||||
// Time complexity: O(M log N)
|
|
||||||
// - Two calls to QuorumAtLeast: 2 * O(M log N)
|
|
||||||
// - One AndNot operation: O(|A|) where |A| <= M
|
|
||||||
// - Total: O(M log N) since AndNot is dominated by merge operations
|
|
||||||
//
|
|
||||||
// Space complexity: O(N)
|
|
||||||
// - Inherited from QuorumAtLeast heap
|
|
||||||
// - Two temporary bitmaps for intermediate results
|
|
||||||
//
|
|
||||||
// Special cases:
|
|
||||||
// - q <= 0: returns empty set
|
|
||||||
// - q > n: returns empty set (impossible to have k-mer in more than n sets)
|
|
||||||
func (ksg *KmerSetGroup) QuorumExactly(q int) *KmerSet {
|
|
||||||
n := len(ksg.sets)
|
|
||||||
|
|
||||||
// Edge cases
|
|
||||||
if q <= 0 || q > n {
|
|
||||||
return NewKmerSet(ksg.k)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute AtLeast(q) - AtLeast(q+1)
|
|
||||||
aq := ksg.QuorumAtLeast(q)
|
|
||||||
aq1 := ksg.QuorumAtLeast(q + 1)
|
|
||||||
|
|
||||||
// Difference: elements in aq but not in aq1
|
|
||||||
result := aq.bitmap.Clone()
|
|
||||||
result.AndNot(aq1.bitmap)
|
|
||||||
|
|
||||||
return NewKmerSetFromBitmap(ksg.k, result)
|
|
||||||
}
|
|
||||||
@@ -1,395 +0,0 @@
|
|||||||
package obikmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
"testing"
|
|
||||||
)
|
|
||||||
|
|
||||||
// TestQuorumAtLeastEdgeCases tests edge cases for QuorumAtLeast
|
|
||||||
func TestQuorumAtLeastEdgeCases(t *testing.T) {
|
|
||||||
k := 5
|
|
||||||
|
|
||||||
// Test group with all empty sets
|
|
||||||
emptyGroup := NewKmerSetGroup(k, 3)
|
|
||||||
result := emptyGroup.QuorumAtLeast(1)
|
|
||||||
if result.Len() != 0 {
|
|
||||||
t.Errorf("Empty sets: expected 0 k-mers, got %d", result.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test q <= 0
|
|
||||||
group := NewKmerSetGroup(k, 3)
|
|
||||||
result = group.QuorumAtLeast(0)
|
|
||||||
if result.Len() != 0 {
|
|
||||||
t.Errorf("q=0: expected 0 k-mers, got %d", result.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
result = group.QuorumAtLeast(-1)
|
|
||||||
if result.Len() != 0 {
|
|
||||||
t.Errorf("q=-1: expected 0 k-mers, got %d", result.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test q > n
|
|
||||||
group.Get(0).AddKmerCode(1)
|
|
||||||
result = group.QuorumAtLeast(10)
|
|
||||||
if result.Len() != 0 {
|
|
||||||
t.Errorf("q>n: expected 0 k-mers, got %d", result.Len())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestQuorumAtLeastQ1 tests q=1 (should equal Union)
|
|
||||||
func TestQuorumAtLeastQ1(t *testing.T) {
|
|
||||||
k := 5
|
|
||||||
group := NewKmerSetGroup(k, 3)
|
|
||||||
|
|
||||||
// Add different k-mers to each set
|
|
||||||
group.Get(0).AddKmerCode(1)
|
|
||||||
group.Get(0).AddKmerCode(2)
|
|
||||||
group.Get(1).AddKmerCode(2)
|
|
||||||
group.Get(1).AddKmerCode(3)
|
|
||||||
group.Get(2).AddKmerCode(3)
|
|
||||||
group.Get(2).AddKmerCode(4)
|
|
||||||
|
|
||||||
quorum := group.QuorumAtLeast(1)
|
|
||||||
union := group.Union()
|
|
||||||
|
|
||||||
if quorum.Len() != union.Len() {
|
|
||||||
t.Errorf("QuorumAtLeast(1) length %d != Union length %d", quorum.Len(), union.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check all elements match
|
|
||||||
for kmer := uint64(1); kmer <= 4; kmer++ {
|
|
||||||
if quorum.Contains(kmer) != union.Contains(kmer) {
|
|
||||||
t.Errorf("Mismatch for k-mer %d", kmer)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestQuorumAtLeastQN tests q=n (should equal Intersect)
|
|
||||||
func TestQuorumAtLeastQN(t *testing.T) {
|
|
||||||
k := 5
|
|
||||||
group := NewKmerSetGroup(k, 3)
|
|
||||||
|
|
||||||
// Add some common k-mers and some unique
|
|
||||||
for i := 0; i < 3; i++ {
|
|
||||||
group.Get(i).AddKmerCode(10) // common to all
|
|
||||||
group.Get(i).AddKmerCode(20) // common to all
|
|
||||||
}
|
|
||||||
group.Get(0).AddKmerCode(1) // unique to set 0
|
|
||||||
group.Get(1).AddKmerCode(2) // unique to set 1
|
|
||||||
|
|
||||||
quorum := group.QuorumAtLeast(3)
|
|
||||||
intersect := group.Intersect()
|
|
||||||
|
|
||||||
if quorum.Len() != intersect.Len() {
|
|
||||||
t.Errorf("QuorumAtLeast(n) length %d != Intersect length %d", quorum.Len(), intersect.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
if quorum.Len() != 2 {
|
|
||||||
t.Errorf("Expected 2 common k-mers, got %d", quorum.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
if !quorum.Contains(10) || !quorum.Contains(20) {
|
|
||||||
t.Error("Missing common k-mers")
|
|
||||||
}
|
|
||||||
|
|
||||||
if quorum.Contains(1) || quorum.Contains(2) {
|
|
||||||
t.Error("Unique k-mers should not be in result")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestQuorumAtLeastGeneral tests general quorum values
|
|
||||||
func TestQuorumAtLeastGeneral(t *testing.T) {
|
|
||||||
k := 5
|
|
||||||
group := NewKmerSetGroup(k, 5)
|
|
||||||
|
|
||||||
// Setup: k-mer i appears in i sets (for i=1..5)
|
|
||||||
// k-mer 1: in set 0
|
|
||||||
// k-mer 2: in sets 0,1
|
|
||||||
// k-mer 3: in sets 0,1,2
|
|
||||||
// k-mer 4: in sets 0,1,2,3
|
|
||||||
// k-mer 5: in sets 0,1,2,3,4 (all)
|
|
||||||
|
|
||||||
for kmer := uint64(1); kmer <= 5; kmer++ {
|
|
||||||
for setIdx := 0; setIdx < int(kmer); setIdx++ {
|
|
||||||
group.Get(setIdx).AddKmerCode(kmer)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
tests := []struct {
|
|
||||||
q int
|
|
||||||
expected map[uint64]bool
|
|
||||||
}{
|
|
||||||
{1, map[uint64]bool{1: true, 2: true, 3: true, 4: true, 5: true}},
|
|
||||||
{2, map[uint64]bool{2: true, 3: true, 4: true, 5: true}},
|
|
||||||
{3, map[uint64]bool{3: true, 4: true, 5: true}},
|
|
||||||
{4, map[uint64]bool{4: true, 5: true}},
|
|
||||||
{5, map[uint64]bool{5: true}},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
result := group.QuorumAtLeast(tt.q)
|
|
||||||
|
|
||||||
if result.Len() != uint64(len(tt.expected)) {
|
|
||||||
t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
for kmer := uint64(1); kmer <= 5; kmer++ {
|
|
||||||
shouldContain := tt.expected[kmer]
|
|
||||||
doesContain := result.Contains(kmer)
|
|
||||||
if shouldContain != doesContain {
|
|
||||||
t.Errorf("q=%d, k-mer=%d: expected contains=%v, got %v", tt.q, kmer, shouldContain, doesContain)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestQuorumExactlyBasic tests QuorumExactly basic functionality
|
|
||||||
func TestQuorumExactlyBasic(t *testing.T) {
|
|
||||||
k := 5
|
|
||||||
group := NewKmerSetGroup(k, 5)
|
|
||||||
|
|
||||||
// Setup: k-mer i appears in exactly i sets
|
|
||||||
for kmer := uint64(1); kmer <= 5; kmer++ {
|
|
||||||
for setIdx := 0; setIdx < int(kmer); setIdx++ {
|
|
||||||
group.Get(setIdx).AddKmerCode(kmer)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
tests := []struct {
|
|
||||||
q int
|
|
||||||
expected []uint64
|
|
||||||
}{
|
|
||||||
{1, []uint64{1}},
|
|
||||||
{2, []uint64{2}},
|
|
||||||
{3, []uint64{3}},
|
|
||||||
{4, []uint64{4}},
|
|
||||||
{5, []uint64{5}},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
result := group.QuorumExactly(tt.q)
|
|
||||||
|
|
||||||
if result.Len() != uint64(len(tt.expected)) {
|
|
||||||
t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, kmer := range tt.expected {
|
|
||||||
if !result.Contains(kmer) {
|
|
||||||
t.Errorf("q=%d: missing k-mer %d", tt.q, kmer)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestQuorumIdentity tests the mathematical identity: Exactly(q) = AtLeast(q) - AtLeast(q+1)
|
|
||||||
func TestQuorumIdentity(t *testing.T) {
|
|
||||||
k := 5
|
|
||||||
group := NewKmerSetGroup(k, 4)
|
|
||||||
|
|
||||||
// Add random distribution
|
|
||||||
group.Get(0).AddKmerCode(1)
|
|
||||||
group.Get(0).AddKmerCode(2)
|
|
||||||
group.Get(0).AddKmerCode(3)
|
|
||||||
|
|
||||||
group.Get(1).AddKmerCode(2)
|
|
||||||
group.Get(1).AddKmerCode(3)
|
|
||||||
group.Get(1).AddKmerCode(4)
|
|
||||||
|
|
||||||
group.Get(2).AddKmerCode(3)
|
|
||||||
group.Get(2).AddKmerCode(4)
|
|
||||||
|
|
||||||
group.Get(3).AddKmerCode(4)
|
|
||||||
|
|
||||||
for q := 1; q <= 4; q++ {
|
|
||||||
exactly := group.QuorumExactly(q)
|
|
||||||
atLeast := group.QuorumAtLeast(q)
|
|
||||||
atLeastPlus1 := group.QuorumAtLeast(q + 1)
|
|
||||||
|
|
||||||
// Verify: every element in exactly(q) is in atLeast(q)
|
|
||||||
iter := exactly.Iterator()
|
|
||||||
for iter.HasNext() {
|
|
||||||
kmer := iter.Next()
|
|
||||||
if !atLeast.Contains(kmer) {
|
|
||||||
t.Errorf("q=%d: k-mer %d in Exactly but not in AtLeast", q, kmer)
|
|
||||||
}
|
|
||||||
if atLeastPlus1.Contains(kmer) {
|
|
||||||
t.Errorf("q=%d: k-mer %d in Exactly but also in AtLeast(q+1)", q, kmer)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestQuorumDisjointSets tests quorum on completely disjoint sets
|
|
||||||
func TestQuorumDisjointSets(t *testing.T) {
|
|
||||||
k := 5
|
|
||||||
group := NewKmerSetGroup(k, 3)
|
|
||||||
|
|
||||||
// Each set has unique k-mers
|
|
||||||
group.Get(0).AddKmerCode(1)
|
|
||||||
group.Get(1).AddKmerCode(2)
|
|
||||||
group.Get(2).AddKmerCode(3)
|
|
||||||
|
|
||||||
// q=1 should give all
|
|
||||||
result := group.QuorumAtLeast(1)
|
|
||||||
if result.Len() != 3 {
|
|
||||||
t.Errorf("Disjoint sets q=1: expected 3, got %d", result.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
// q=2 should give none
|
|
||||||
result = group.QuorumAtLeast(2)
|
|
||||||
if result.Len() != 0 {
|
|
||||||
t.Errorf("Disjoint sets q=2: expected 0, got %d", result.Len())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestQuorumIdenticalSets tests quorum on identical sets
|
|
||||||
func TestQuorumIdenticalSets(t *testing.T) {
|
|
||||||
k := 5
|
|
||||||
group := NewKmerSetGroup(k, 3)
|
|
||||||
|
|
||||||
// All sets have same k-mers
|
|
||||||
for i := 0; i < 3; i++ {
|
|
||||||
group.Get(i).AddKmerCode(10)
|
|
||||||
group.Get(i).AddKmerCode(20)
|
|
||||||
group.Get(i).AddKmerCode(30)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Any q <= n should give all k-mers
|
|
||||||
for q := 1; q <= 3; q++ {
|
|
||||||
result := group.QuorumAtLeast(q)
|
|
||||||
if result.Len() != 3 {
|
|
||||||
t.Errorf("Identical sets q=%d: expected 3, got %d", q, result.Len())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestQuorumLargeNumbers tests with large k-mer values
|
|
||||||
func TestQuorumLargeNumbers(t *testing.T) {
|
|
||||||
k := 21
|
|
||||||
group := NewKmerSetGroup(k, 3)
|
|
||||||
|
|
||||||
// Use large uint64 values (actual k-mer encodings)
|
|
||||||
largeKmers := []uint64{
|
|
||||||
0x1234567890ABCDEF,
|
|
||||||
0xFEDCBA0987654321,
|
|
||||||
0xAAAAAAAAAAAAAAAA,
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add to multiple sets
|
|
||||||
for i := 0; i < 3; i++ {
|
|
||||||
for j := 0; j <= i; j++ {
|
|
||||||
group.Get(j).AddKmerCode(largeKmers[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
result := group.QuorumAtLeast(2)
|
|
||||||
if result.Len() != 2 {
|
|
||||||
t.Errorf("Large numbers q=2: expected 2, got %d", result.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
if !result.Contains(largeKmers[1]) || !result.Contains(largeKmers[2]) {
|
|
||||||
t.Error("Large numbers: wrong k-mers in result")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestQuorumAtMostBasic tests QuorumAtMost basic functionality
|
|
||||||
func TestQuorumAtMostBasic(t *testing.T) {
|
|
||||||
k := 5
|
|
||||||
group := NewKmerSetGroup(k, 5)
|
|
||||||
|
|
||||||
// Setup: k-mer i appears in exactly i sets
|
|
||||||
for kmer := uint64(1); kmer <= 5; kmer++ {
|
|
||||||
for setIdx := 0; setIdx < int(kmer); setIdx++ {
|
|
||||||
group.Get(setIdx).AddKmerCode(kmer)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
tests := []struct {
|
|
||||||
q int
|
|
||||||
expected []uint64
|
|
||||||
}{
|
|
||||||
{0, []uint64{}}, // at most 0: none
|
|
||||||
{1, []uint64{1}}, // at most 1: only k-mer 1
|
|
||||||
{2, []uint64{1, 2}}, // at most 2: k-mers 1,2
|
|
||||||
{3, []uint64{1, 2, 3}}, // at most 3: k-mers 1,2,3
|
|
||||||
{4, []uint64{1, 2, 3, 4}}, // at most 4: k-mers 1,2,3,4
|
|
||||||
{5, []uint64{1, 2, 3, 4, 5}}, // at most 5: all k-mers
|
|
||||||
{10, []uint64{1, 2, 3, 4, 5}}, // at most 10: all k-mers
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
result := group.QuorumAtMost(tt.q)
|
|
||||||
|
|
||||||
if result.Len() != uint64(len(tt.expected)) {
|
|
||||||
t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, kmer := range tt.expected {
|
|
||||||
if !result.Contains(kmer) {
|
|
||||||
t.Errorf("q=%d: missing k-mer %d", tt.q, kmer)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestQuorumComplementIdentity tests that AtLeast and AtMost are complementary
|
|
||||||
func TestQuorumComplementIdentity(t *testing.T) {
|
|
||||||
k := 5
|
|
||||||
group := NewKmerSetGroup(k, 4)
|
|
||||||
|
|
||||||
// Add random distribution
|
|
||||||
group.Get(0).AddKmerCode(1)
|
|
||||||
group.Get(0).AddKmerCode(2)
|
|
||||||
group.Get(0).AddKmerCode(3)
|
|
||||||
|
|
||||||
group.Get(1).AddKmerCode(2)
|
|
||||||
group.Get(1).AddKmerCode(3)
|
|
||||||
group.Get(1).AddKmerCode(4)
|
|
||||||
|
|
||||||
group.Get(2).AddKmerCode(3)
|
|
||||||
group.Get(2).AddKmerCode(4)
|
|
||||||
|
|
||||||
group.Get(3).AddKmerCode(4)
|
|
||||||
|
|
||||||
union := group.Union()
|
|
||||||
|
|
||||||
for q := 1; q < 4; q++ {
|
|
||||||
atMost := group.QuorumAtMost(q)
|
|
||||||
atLeast := group.QuorumAtLeast(q + 1)
|
|
||||||
|
|
||||||
// Verify: AtMost(q) ∪ AtLeast(q+1) = Union()
|
|
||||||
combined := atMost.Union(atLeast)
|
|
||||||
|
|
||||||
if combined.Len() != union.Len() {
|
|
||||||
t.Errorf("q=%d: AtMost(q) ∪ AtLeast(q+1) has %d k-mers, Union has %d",
|
|
||||||
q, combined.Len(), union.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Verify: AtMost(q) ∩ AtLeast(q+1) = ∅
|
|
||||||
overlap := atMost.Intersect(atLeast)
|
|
||||||
if overlap.Len() != 0 {
|
|
||||||
t.Errorf("q=%d: AtMost(q) and AtLeast(q+1) overlap with %d k-mers",
|
|
||||||
q, overlap.Len())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// BenchmarkQuorumAtLeast benchmarks quorum operations
|
|
||||||
func BenchmarkQuorumAtLeast(b *testing.B) {
|
|
||||||
k := 21
|
|
||||||
n := 10
|
|
||||||
group := NewKmerSetGroup(k, n)
|
|
||||||
|
|
||||||
// Populate with realistic data
|
|
||||||
for i := 0; i < n; i++ {
|
|
||||||
for j := uint64(0); j < 10000; j++ {
|
|
||||||
if (j % uint64(n)) <= uint64(i) {
|
|
||||||
group.Get(i).AddKmerCode(j)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
b.ResetTimer()
|
|
||||||
for i := 0; i < b.N; i++ {
|
|
||||||
_ = group.QuorumAtLeast(5)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,376 +0,0 @@
|
|||||||
package obikmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
"encoding/json"
|
|
||||||
"fmt"
|
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/pelletier/go-toml/v2"
|
|
||||||
"gopkg.in/yaml.v3"
|
|
||||||
)
|
|
||||||
|
|
||||||
// MetadataFormat represents the metadata serialization format
|
|
||||||
type MetadataFormat int
|
|
||||||
|
|
||||||
const (
|
|
||||||
FormatTOML MetadataFormat = iota
|
|
||||||
FormatYAML
|
|
||||||
FormatJSON
|
|
||||||
)
|
|
||||||
|
|
||||||
// String returns the file extension for the format
|
|
||||||
func (f MetadataFormat) String() string {
|
|
||||||
switch f {
|
|
||||||
case FormatTOML:
|
|
||||||
return "toml"
|
|
||||||
case FormatYAML:
|
|
||||||
return "yaml"
|
|
||||||
case FormatJSON:
|
|
||||||
return "json"
|
|
||||||
default:
|
|
||||||
return "toml"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// KmerSetMetadata contient les métadonnées d'un KmerSet ou KmerSetGroup
|
|
||||||
type KmerSetMetadata struct {
|
|
||||||
ID string `toml:"id,omitempty" yaml:"id,omitempty" json:"id,omitempty"` // Identifiant unique
|
|
||||||
K int `toml:"k" yaml:"k" json:"k"` // Taille des k-mers
|
|
||||||
Type string `toml:"type" yaml:"type" json:"type"` // "KmerSet" ou "KmerSetGroup"
|
|
||||||
Size int `toml:"size" yaml:"size" json:"size"` // 1 pour KmerSet, n pour KmerSetGroup
|
|
||||||
Files []string `toml:"files" yaml:"files" json:"files"` // Liste des fichiers .roaring
|
|
||||||
SetsIDs []string `toml:"sets_ids,omitempty" yaml:"sets_ids,omitempty" json:"sets_ids,omitempty"` // IDs des KmerSet individuels
|
|
||||||
UserMetadata map[string]interface{} `toml:"user_metadata,omitempty" yaml:"user_metadata,omitempty" json:"user_metadata,omitempty"` // Métadonnées KmerSet ou KmerSetGroup
|
|
||||||
SetsMetadata []map[string]interface{} `toml:"sets_metadata,omitempty" yaml:"sets_metadata,omitempty" json:"sets_metadata,omitempty"` // Métadonnées des KmerSet individuels dans un KmerSetGroup
|
|
||||||
}
|
|
||||||
|
|
||||||
// SaveKmerSet sauvegarde un KmerSet dans un répertoire
|
|
||||||
// Format: directory/metadata.{toml,yaml,json} + directory/set_0.roaring
|
|
||||||
func (ks *KmerSet) Save(directory string, format MetadataFormat) error {
|
|
||||||
// Créer le répertoire si nécessaire
|
|
||||||
if err := os.MkdirAll(directory, 0755); err != nil {
|
|
||||||
return fmt.Errorf("failed to create directory %s: %w", directory, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Métadonnées
|
|
||||||
metadata := KmerSetMetadata{
|
|
||||||
ID: ks.id,
|
|
||||||
K: ks.k,
|
|
||||||
Type: "KmerSet",
|
|
||||||
Size: 1,
|
|
||||||
Files: []string{"set_0.roaring"},
|
|
||||||
UserMetadata: ks.Metadata, // Sauvegarder les métadonnées utilisateur
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sauvegarder les métadonnées
|
|
||||||
if err := saveMetadata(filepath.Join(directory, "metadata."+format.String()), metadata, format); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sauvegarder le bitmap
|
|
||||||
bitmapPath := filepath.Join(directory, "set_0.roaring")
|
|
||||||
file, err := os.Create(bitmapPath)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to create bitmap file %s: %w", bitmapPath, err)
|
|
||||||
}
|
|
||||||
defer file.Close()
|
|
||||||
|
|
||||||
if _, err := ks.bitmap.WriteTo(file); err != nil {
|
|
||||||
return fmt.Errorf("failed to write bitmap: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// LoadKmerSet charge un KmerSet depuis un répertoire
|
|
||||||
func LoadKmerSet(directory string) (*KmerSet, error) {
|
|
||||||
// Lire les métadonnées (essayer tous les formats)
|
|
||||||
metadata, err := loadMetadata(directory)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Vérifier le type
|
|
||||||
if metadata.Type != "KmerSet" {
|
|
||||||
return nil, fmt.Errorf("invalid type: expected KmerSet, got %s", metadata.Type)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Vérifier qu'il n'y a qu'un seul fichier
|
|
||||||
if metadata.Size != 1 || len(metadata.Files) != 1 {
|
|
||||||
return nil, fmt.Errorf("KmerSet must have exactly 1 bitmap file, got %d", len(metadata.Files))
|
|
||||||
}
|
|
||||||
|
|
||||||
// Charger le bitmap
|
|
||||||
bitmapPath := filepath.Join(directory, metadata.Files[0])
|
|
||||||
file, err := os.Open(bitmapPath)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to open bitmap file %s: %w", bitmapPath, err)
|
|
||||||
}
|
|
||||||
defer file.Close()
|
|
||||||
|
|
||||||
ks := NewKmerSet(metadata.K)
|
|
||||||
|
|
||||||
// Charger l'ID
|
|
||||||
ks.id = metadata.ID
|
|
||||||
|
|
||||||
// Charger les métadonnées utilisateur
|
|
||||||
if metadata.UserMetadata != nil {
|
|
||||||
ks.Metadata = metadata.UserMetadata
|
|
||||||
}
|
|
||||||
|
|
||||||
if _, err := ks.bitmap.ReadFrom(file); err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to read bitmap: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return ks, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// SaveKmerSetGroup sauvegarde un KmerSetGroup dans un répertoire
|
|
||||||
// Format: directory/metadata.{toml,yaml,json} + directory/set_0.roaring, set_1.roaring, ...
|
|
||||||
func (ksg *KmerSetGroup) Save(directory string, format MetadataFormat) error {
|
|
||||||
// Créer le répertoire si nécessaire
|
|
||||||
if err := os.MkdirAll(directory, 0755); err != nil {
|
|
||||||
return fmt.Errorf("failed to create directory %s: %w", directory, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Métadonnées
|
|
||||||
files := make([]string, len(ksg.sets))
|
|
||||||
for i := range ksg.sets {
|
|
||||||
files[i] = fmt.Sprintf("set_%d.roaring", i)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collecter les IDs et métadonnées de chaque KmerSet individuel
|
|
||||||
setsIDs := make([]string, len(ksg.sets))
|
|
||||||
setsMetadata := make([]map[string]interface{}, len(ksg.sets))
|
|
||||||
for i, ks := range ksg.sets {
|
|
||||||
setsIDs[i] = ks.id
|
|
||||||
setsMetadata[i] = ks.Metadata
|
|
||||||
}
|
|
||||||
|
|
||||||
metadata := KmerSetMetadata{
|
|
||||||
ID: ksg.id,
|
|
||||||
K: ksg.k,
|
|
||||||
Type: "KmerSetGroup",
|
|
||||||
Size: len(ksg.sets),
|
|
||||||
Files: files,
|
|
||||||
SetsIDs: setsIDs, // IDs de chaque set
|
|
||||||
UserMetadata: ksg.Metadata, // Métadonnées du groupe
|
|
||||||
SetsMetadata: setsMetadata, // Métadonnées de chaque set
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sauvegarder les métadonnées
|
|
||||||
if err := saveMetadata(filepath.Join(directory, "metadata."+format.String()), metadata, format); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sauvegarder chaque bitmap
|
|
||||||
for i, ks := range ksg.sets {
|
|
||||||
bitmapPath := filepath.Join(directory, files[i])
|
|
||||||
file, err := os.Create(bitmapPath)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to create bitmap file %s: %w", bitmapPath, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if _, err := ks.bitmap.WriteTo(file); err != nil {
|
|
||||||
file.Close()
|
|
||||||
return fmt.Errorf("failed to write bitmap %d: %w", i, err)
|
|
||||||
}
|
|
||||||
file.Close()
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// LoadKmerSetGroup charge un KmerSetGroup depuis un répertoire
|
|
||||||
func LoadKmerSetGroup(directory string) (*KmerSetGroup, error) {
|
|
||||||
// Lire les métadonnées (essayer tous les formats)
|
|
||||||
metadata, err := loadMetadata(directory)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Vérifier le type
|
|
||||||
if metadata.Type != "KmerSetGroup" {
|
|
||||||
return nil, fmt.Errorf("invalid type: expected KmerSetGroup, got %s", metadata.Type)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Vérifier la cohérence
|
|
||||||
if metadata.Size != len(metadata.Files) {
|
|
||||||
return nil, fmt.Errorf("size mismatch: size=%d but %d files listed", metadata.Size, len(metadata.Files))
|
|
||||||
}
|
|
||||||
|
|
||||||
// Créer le groupe
|
|
||||||
ksg := NewKmerSetGroup(metadata.K, metadata.Size)
|
|
||||||
|
|
||||||
// Charger l'ID du groupe
|
|
||||||
ksg.id = metadata.ID
|
|
||||||
|
|
||||||
// Charger les métadonnées du groupe
|
|
||||||
if metadata.UserMetadata != nil {
|
|
||||||
ksg.Metadata = metadata.UserMetadata
|
|
||||||
}
|
|
||||||
|
|
||||||
// Charger les IDs de chaque KmerSet
|
|
||||||
if metadata.SetsIDs != nil && len(metadata.SetsIDs) == metadata.Size {
|
|
||||||
for i := range ksg.sets {
|
|
||||||
ksg.sets[i].id = metadata.SetsIDs[i]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Charger les métadonnées de chaque KmerSet individuel
|
|
||||||
if metadata.SetsMetadata != nil {
|
|
||||||
if len(metadata.SetsMetadata) != metadata.Size {
|
|
||||||
return nil, fmt.Errorf("sets metadata size mismatch: expected %d, got %d", metadata.Size, len(metadata.SetsMetadata))
|
|
||||||
}
|
|
||||||
for i := range ksg.sets {
|
|
||||||
ksg.sets[i].Metadata = metadata.SetsMetadata[i]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Charger chaque bitmap
|
|
||||||
for i, filename := range metadata.Files {
|
|
||||||
bitmapPath := filepath.Join(directory, filename)
|
|
||||||
file, err := os.Open(bitmapPath)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to open bitmap file %s: %w", bitmapPath, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if _, err := ksg.sets[i].bitmap.ReadFrom(file); err != nil {
|
|
||||||
file.Close()
|
|
||||||
return nil, fmt.Errorf("failed to read bitmap %d: %w", i, err)
|
|
||||||
}
|
|
||||||
file.Close()
|
|
||||||
}
|
|
||||||
|
|
||||||
return ksg, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// saveMetadata sauvegarde les métadonnées dans le format spécifié
|
|
||||||
func saveMetadata(path string, metadata KmerSetMetadata, format MetadataFormat) error {
|
|
||||||
file, err := os.Create(path)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to create metadata file %s: %w", path, err)
|
|
||||||
}
|
|
||||||
defer file.Close()
|
|
||||||
|
|
||||||
var encoder interface{ Encode(interface{}) error }
|
|
||||||
|
|
||||||
switch format {
|
|
||||||
case FormatTOML:
|
|
||||||
encoder = toml.NewEncoder(file)
|
|
||||||
case FormatYAML:
|
|
||||||
encoder = yaml.NewEncoder(file)
|
|
||||||
case FormatJSON:
|
|
||||||
jsonEncoder := json.NewEncoder(file)
|
|
||||||
jsonEncoder.SetIndent("", " ")
|
|
||||||
encoder = jsonEncoder
|
|
||||||
default:
|
|
||||||
return fmt.Errorf("unsupported format: %v", format)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := encoder.Encode(metadata); err != nil {
|
|
||||||
return fmt.Errorf("failed to encode metadata: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// loadMetadata charge les métadonnées depuis un répertoire
|
|
||||||
// Essaie tous les formats (TOML, YAML, JSON) dans l'ordre
|
|
||||||
func loadMetadata(directory string) (*KmerSetMetadata, error) {
|
|
||||||
formats := []MetadataFormat{FormatTOML, FormatYAML, FormatJSON}
|
|
||||||
|
|
||||||
var lastErr error
|
|
||||||
for _, format := range formats {
|
|
||||||
path := filepath.Join(directory, "metadata."+format.String())
|
|
||||||
|
|
||||||
// Vérifier si le fichier existe
|
|
||||||
if _, err := os.Stat(path); os.IsNotExist(err) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
metadata, err := loadMetadataFromFile(path, format)
|
|
||||||
if err != nil {
|
|
||||||
lastErr = err
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
return metadata, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if lastErr != nil {
|
|
||||||
return nil, fmt.Errorf("failed to load metadata: %w", lastErr)
|
|
||||||
}
|
|
||||||
return nil, fmt.Errorf("no metadata file found in %s (tried .toml, .yaml, .json)", directory)
|
|
||||||
}
|
|
||||||
|
|
||||||
// loadMetadataFromFile charge les métadonnées depuis un fichier spécifique
|
|
||||||
func loadMetadataFromFile(path string, format MetadataFormat) (*KmerSetMetadata, error) {
|
|
||||||
file, err := os.Open(path)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to open metadata file %s: %w", path, err)
|
|
||||||
}
|
|
||||||
defer file.Close()
|
|
||||||
|
|
||||||
var metadata KmerSetMetadata
|
|
||||||
var decoder interface{ Decode(interface{}) error }
|
|
||||||
|
|
||||||
switch format {
|
|
||||||
case FormatTOML:
|
|
||||||
decoder = toml.NewDecoder(file)
|
|
||||||
case FormatYAML:
|
|
||||||
decoder = yaml.NewDecoder(file)
|
|
||||||
case FormatJSON:
|
|
||||||
decoder = json.NewDecoder(file)
|
|
||||||
default:
|
|
||||||
return nil, fmt.Errorf("unsupported format: %v", format)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := decoder.Decode(&metadata); err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to decode metadata: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return &metadata, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// DetectFormat détecte le format des métadonnées dans un répertoire
|
|
||||||
func DetectFormat(directory string) (MetadataFormat, error) {
|
|
||||||
formats := []MetadataFormat{FormatTOML, FormatYAML, FormatJSON}
|
|
||||||
|
|
||||||
for _, format := range formats {
|
|
||||||
path := filepath.Join(directory, "metadata."+format.String())
|
|
||||||
if _, err := os.Stat(path); err == nil {
|
|
||||||
return format, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return FormatTOML, fmt.Errorf("no metadata file found in %s", directory)
|
|
||||||
}
|
|
||||||
|
|
||||||
// IsKmerSetDirectory vérifie si un répertoire contient un KmerSet ou KmerSetGroup
|
|
||||||
func IsKmerSetDirectory(directory string) (bool, string, error) {
|
|
||||||
metadata, err := loadMetadata(directory)
|
|
||||||
if err != nil {
|
|
||||||
return false, "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
return true, metadata.Type, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// ListBitmapFiles liste tous les fichiers .roaring dans un répertoire
|
|
||||||
func ListBitmapFiles(directory string) ([]string, error) {
|
|
||||||
entries, err := os.ReadDir(directory)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to read directory %s: %w", directory, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
var files []string
|
|
||||||
for _, entry := range entries {
|
|
||||||
if !entry.IsDir() && strings.HasSuffix(entry.Name(), ".roaring") {
|
|
||||||
files = append(files, entry.Name())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return files, nil
|
|
||||||
}
|
|
||||||
@@ -1,272 +0,0 @@
|
|||||||
package obikmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
"math"
|
|
||||||
"testing"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestJaccardDistanceIdentical(t *testing.T) {
|
|
||||||
ks1 := NewKmerSet(5)
|
|
||||||
ks1.AddKmerCode(100)
|
|
||||||
ks1.AddKmerCode(200)
|
|
||||||
ks1.AddKmerCode(300)
|
|
||||||
|
|
||||||
ks2 := NewKmerSet(5)
|
|
||||||
ks2.AddKmerCode(100)
|
|
||||||
ks2.AddKmerCode(200)
|
|
||||||
ks2.AddKmerCode(300)
|
|
||||||
|
|
||||||
distance := ks1.JaccardDistance(ks2)
|
|
||||||
similarity := ks1.JaccardSimilarity(ks2)
|
|
||||||
|
|
||||||
if distance != 0.0 {
|
|
||||||
t.Errorf("Expected distance 0.0 for identical sets, got %f", distance)
|
|
||||||
}
|
|
||||||
|
|
||||||
if similarity != 1.0 {
|
|
||||||
t.Errorf("Expected similarity 1.0 for identical sets, got %f", similarity)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestJaccardDistanceDisjoint(t *testing.T) {
|
|
||||||
ks1 := NewKmerSet(5)
|
|
||||||
ks1.AddKmerCode(100)
|
|
||||||
ks1.AddKmerCode(200)
|
|
||||||
ks1.AddKmerCode(300)
|
|
||||||
|
|
||||||
ks2 := NewKmerSet(5)
|
|
||||||
ks2.AddKmerCode(400)
|
|
||||||
ks2.AddKmerCode(500)
|
|
||||||
ks2.AddKmerCode(600)
|
|
||||||
|
|
||||||
distance := ks1.JaccardDistance(ks2)
|
|
||||||
similarity := ks1.JaccardSimilarity(ks2)
|
|
||||||
|
|
||||||
if distance != 1.0 {
|
|
||||||
t.Errorf("Expected distance 1.0 for disjoint sets, got %f", distance)
|
|
||||||
}
|
|
||||||
|
|
||||||
if similarity != 0.0 {
|
|
||||||
t.Errorf("Expected similarity 0.0 for disjoint sets, got %f", similarity)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestJaccardDistancePartialOverlap(t *testing.T) {
|
|
||||||
// Set 1: {1, 2, 3}
|
|
||||||
ks1 := NewKmerSet(5)
|
|
||||||
ks1.AddKmerCode(1)
|
|
||||||
ks1.AddKmerCode(2)
|
|
||||||
ks1.AddKmerCode(3)
|
|
||||||
|
|
||||||
// Set 2: {2, 3, 4}
|
|
||||||
ks2 := NewKmerSet(5)
|
|
||||||
ks2.AddKmerCode(2)
|
|
||||||
ks2.AddKmerCode(3)
|
|
||||||
ks2.AddKmerCode(4)
|
|
||||||
|
|
||||||
// Intersection: {2, 3} -> cardinality = 2
|
|
||||||
// Union: {1, 2, 3, 4} -> cardinality = 4
|
|
||||||
// Similarity = 2/4 = 0.5
|
|
||||||
// Distance = 1 - 0.5 = 0.5
|
|
||||||
|
|
||||||
distance := ks1.JaccardDistance(ks2)
|
|
||||||
similarity := ks1.JaccardSimilarity(ks2)
|
|
||||||
|
|
||||||
expectedDistance := 0.5
|
|
||||||
expectedSimilarity := 0.5
|
|
||||||
|
|
||||||
if math.Abs(distance-expectedDistance) > 1e-10 {
|
|
||||||
t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
|
|
||||||
}
|
|
||||||
|
|
||||||
if math.Abs(similarity-expectedSimilarity) > 1e-10 {
|
|
||||||
t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestJaccardDistanceOneSubsetOfOther(t *testing.T) {
|
|
||||||
// Set 1: {1, 2}
|
|
||||||
ks1 := NewKmerSet(5)
|
|
||||||
ks1.AddKmerCode(1)
|
|
||||||
ks1.AddKmerCode(2)
|
|
||||||
|
|
||||||
// Set 2: {1, 2, 3, 4}
|
|
||||||
ks2 := NewKmerSet(5)
|
|
||||||
ks2.AddKmerCode(1)
|
|
||||||
ks2.AddKmerCode(2)
|
|
||||||
ks2.AddKmerCode(3)
|
|
||||||
ks2.AddKmerCode(4)
|
|
||||||
|
|
||||||
// Intersection: {1, 2} -> cardinality = 2
|
|
||||||
// Union: {1, 2, 3, 4} -> cardinality = 4
|
|
||||||
// Similarity = 2/4 = 0.5
|
|
||||||
// Distance = 1 - 0.5 = 0.5
|
|
||||||
|
|
||||||
distance := ks1.JaccardDistance(ks2)
|
|
||||||
similarity := ks1.JaccardSimilarity(ks2)
|
|
||||||
|
|
||||||
expectedDistance := 0.5
|
|
||||||
expectedSimilarity := 0.5
|
|
||||||
|
|
||||||
if math.Abs(distance-expectedDistance) > 1e-10 {
|
|
||||||
t.Errorf("Expected distance %f, got %f", expectedDistance, distance)
|
|
||||||
}
|
|
||||||
|
|
||||||
if math.Abs(similarity-expectedSimilarity) > 1e-10 {
|
|
||||||
t.Errorf("Expected similarity %f, got %f", expectedSimilarity, similarity)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestJaccardDistanceEmptySets(t *testing.T) {
|
|
||||||
ks1 := NewKmerSet(5)
|
|
||||||
ks2 := NewKmerSet(5)
|
|
||||||
|
|
||||||
distance := ks1.JaccardDistance(ks2)
|
|
||||||
similarity := ks1.JaccardSimilarity(ks2)
|
|
||||||
|
|
||||||
// By convention, distance = 1.0 for empty sets
|
|
||||||
if distance != 1.0 {
|
|
||||||
t.Errorf("Expected distance 1.0 for empty sets, got %f", distance)
|
|
||||||
}
|
|
||||||
|
|
||||||
if similarity != 0.0 {
|
|
||||||
t.Errorf("Expected similarity 0.0 for empty sets, got %f", similarity)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestJaccardDistanceOneEmpty(t *testing.T) {
|
|
||||||
ks1 := NewKmerSet(5)
|
|
||||||
ks1.AddKmerCode(1)
|
|
||||||
ks1.AddKmerCode(2)
|
|
||||||
ks1.AddKmerCode(3)
|
|
||||||
|
|
||||||
ks2 := NewKmerSet(5)
|
|
||||||
|
|
||||||
distance := ks1.JaccardDistance(ks2)
|
|
||||||
similarity := ks1.JaccardSimilarity(ks2)
|
|
||||||
|
|
||||||
// Intersection: {} -> cardinality = 0
|
|
||||||
// Union: {1, 2, 3} -> cardinality = 3
|
|
||||||
// Similarity = 0/3 = 0.0
|
|
||||||
// Distance = 1.0
|
|
||||||
|
|
||||||
if distance != 1.0 {
|
|
||||||
t.Errorf("Expected distance 1.0 when one set is empty, got %f", distance)
|
|
||||||
}
|
|
||||||
|
|
||||||
if similarity != 0.0 {
|
|
||||||
t.Errorf("Expected similarity 0.0 when one set is empty, got %f", similarity)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestJaccardDistanceDifferentK(t *testing.T) {
|
|
||||||
ks1 := NewKmerSet(5)
|
|
||||||
ks1.AddKmerCode(1)
|
|
||||||
|
|
||||||
ks2 := NewKmerSet(7)
|
|
||||||
ks2.AddKmerCode(1)
|
|
||||||
|
|
||||||
defer func() {
|
|
||||||
if r := recover(); r == nil {
|
|
||||||
t.Errorf("Expected panic when computing Jaccard distance with different k values")
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
_ = ks1.JaccardDistance(ks2)
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestJaccardDistanceSimilarityRelation(t *testing.T) {
|
|
||||||
// Test that distance + similarity = 1.0 for all cases
|
|
||||||
testCases := []struct {
|
|
||||||
name string
|
|
||||||
ks1 *KmerSet
|
|
||||||
ks2 *KmerSet
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
name: "partial overlap",
|
|
||||||
ks1: func() *KmerSet {
|
|
||||||
ks := NewKmerSet(5)
|
|
||||||
ks.AddKmerCode(1)
|
|
||||||
ks.AddKmerCode(2)
|
|
||||||
ks.AddKmerCode(3)
|
|
||||||
return ks
|
|
||||||
}(),
|
|
||||||
ks2: func() *KmerSet {
|
|
||||||
ks := NewKmerSet(5)
|
|
||||||
ks.AddKmerCode(2)
|
|
||||||
ks.AddKmerCode(3)
|
|
||||||
ks.AddKmerCode(4)
|
|
||||||
ks.AddKmerCode(5)
|
|
||||||
return ks
|
|
||||||
}(),
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "identical",
|
|
||||||
ks1: func() *KmerSet {
|
|
||||||
ks := NewKmerSet(5)
|
|
||||||
ks.AddKmerCode(10)
|
|
||||||
ks.AddKmerCode(20)
|
|
||||||
return ks
|
|
||||||
}(),
|
|
||||||
ks2: func() *KmerSet {
|
|
||||||
ks := NewKmerSet(5)
|
|
||||||
ks.AddKmerCode(10)
|
|
||||||
ks.AddKmerCode(20)
|
|
||||||
return ks
|
|
||||||
}(),
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "disjoint",
|
|
||||||
ks1: func() *KmerSet {
|
|
||||||
ks := NewKmerSet(5)
|
|
||||||
ks.AddKmerCode(1)
|
|
||||||
return ks
|
|
||||||
}(),
|
|
||||||
ks2: func() *KmerSet {
|
|
||||||
ks := NewKmerSet(5)
|
|
||||||
ks.AddKmerCode(100)
|
|
||||||
return ks
|
|
||||||
}(),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tc := range testCases {
|
|
||||||
t.Run(tc.name, func(t *testing.T) {
|
|
||||||
distance := tc.ks1.JaccardDistance(tc.ks2)
|
|
||||||
similarity := tc.ks1.JaccardSimilarity(tc.ks2)
|
|
||||||
|
|
||||||
sum := distance + similarity
|
|
||||||
|
|
||||||
if math.Abs(sum-1.0) > 1e-10 {
|
|
||||||
t.Errorf("Expected distance + similarity = 1.0, got %f + %f = %f",
|
|
||||||
distance, similarity, sum)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestJaccardDistanceSymmetry(t *testing.T) {
|
|
||||||
ks1 := NewKmerSet(5)
|
|
||||||
ks1.AddKmerCode(1)
|
|
||||||
ks1.AddKmerCode(2)
|
|
||||||
ks1.AddKmerCode(3)
|
|
||||||
|
|
||||||
ks2 := NewKmerSet(5)
|
|
||||||
ks2.AddKmerCode(2)
|
|
||||||
ks2.AddKmerCode(3)
|
|
||||||
ks2.AddKmerCode(4)
|
|
||||||
|
|
||||||
distance1 := ks1.JaccardDistance(ks2)
|
|
||||||
distance2 := ks2.JaccardDistance(ks1)
|
|
||||||
|
|
||||||
similarity1 := ks1.JaccardSimilarity(ks2)
|
|
||||||
similarity2 := ks2.JaccardSimilarity(ks1)
|
|
||||||
|
|
||||||
if math.Abs(distance1-distance2) > 1e-10 {
|
|
||||||
t.Errorf("Jaccard distance not symmetric: %f vs %f", distance1, distance2)
|
|
||||||
}
|
|
||||||
|
|
||||||
if math.Abs(similarity1-similarity2) > 1e-10 {
|
|
||||||
t.Errorf("Jaccard similarity not symmetric: %f vs %f", similarity1, similarity2)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
47
pkg/obikmer/minimizer_utils.go
Normal file
47
pkg/obikmer/minimizer_utils.go
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
|
// DefaultMinimizerSize returns ceil(k / 2.5) as a reasonable default minimizer size.
|
||||||
|
func DefaultMinimizerSize(k int) int {
|
||||||
|
m := int(math.Ceil(float64(k) / 2.5))
|
||||||
|
if m < 1 {
|
||||||
|
m = 1
|
||||||
|
}
|
||||||
|
if m >= k {
|
||||||
|
m = k - 1
|
||||||
|
}
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
// MinMinimizerSize returns the minimum m such that 4^m >= nworkers,
|
||||||
|
// i.e. ceil(log(nworkers) / log(4)).
|
||||||
|
func MinMinimizerSize(nworkers int) int {
|
||||||
|
if nworkers <= 1 {
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return int(math.Ceil(math.Log(float64(nworkers)) / math.Log(4)))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ValidateMinimizerSize checks and adjusts the minimizer size to satisfy constraints:
|
||||||
|
// - m >= ceil(log(nworkers)/log(4))
|
||||||
|
// - 1 <= m < k
|
||||||
|
func ValidateMinimizerSize(m, k, nworkers int) int {
|
||||||
|
minM := MinMinimizerSize(nworkers)
|
||||||
|
if m < minM {
|
||||||
|
log.Warnf("Minimizer size %d too small for %d workers (4^%d = %d < %d), adjusting to %d",
|
||||||
|
m, nworkers, m, 1<<(2*m), nworkers, minM)
|
||||||
|
m = minM
|
||||||
|
}
|
||||||
|
if m < 1 {
|
||||||
|
m = 1
|
||||||
|
}
|
||||||
|
if m >= k {
|
||||||
|
m = k - 1
|
||||||
|
}
|
||||||
|
return m
|
||||||
|
}
|
||||||
67
pkg/obikmer/skm_reader.go
Normal file
67
pkg/obikmer/skm_reader.go
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"encoding/binary"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
)
|
||||||
|
|
||||||
|
// decode2bit maps 2-bit codes back to nucleotide bytes.
|
||||||
|
var decode2bit = [4]byte{'a', 'c', 'g', 't'}
|
||||||
|
|
||||||
|
// SkmReader reads super-kmers from a binary .skm file.
|
||||||
|
type SkmReader struct {
|
||||||
|
r *bufio.Reader
|
||||||
|
file *os.File
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewSkmReader opens a .skm file for reading.
|
||||||
|
func NewSkmReader(path string) (*SkmReader, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &SkmReader{
|
||||||
|
r: bufio.NewReaderSize(f, 65536),
|
||||||
|
file: f,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Next reads the next super-kmer from the file.
|
||||||
|
// Returns the SuperKmer and true, or a zero SuperKmer and false at EOF.
|
||||||
|
func (sr *SkmReader) Next() (SuperKmer, bool) {
|
||||||
|
// Read length
|
||||||
|
var lenbuf [2]byte
|
||||||
|
if _, err := io.ReadFull(sr.r, lenbuf[:]); err != nil {
|
||||||
|
return SuperKmer{}, false
|
||||||
|
}
|
||||||
|
seqLen := int(binary.LittleEndian.Uint16(lenbuf[:]))
|
||||||
|
|
||||||
|
// Read packed bytes
|
||||||
|
nBytes := (seqLen + 3) / 4
|
||||||
|
packed := make([]byte, nBytes)
|
||||||
|
if _, err := io.ReadFull(sr.r, packed); err != nil {
|
||||||
|
return SuperKmer{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode to nucleotide bytes
|
||||||
|
seq := make([]byte, seqLen)
|
||||||
|
for i := 0; i < seqLen; i++ {
|
||||||
|
byteIdx := i / 4
|
||||||
|
bitPos := uint(6 - (i%4)*2)
|
||||||
|
code := (packed[byteIdx] >> bitPos) & 0x03
|
||||||
|
seq[i] = decode2bit[code]
|
||||||
|
}
|
||||||
|
|
||||||
|
return SuperKmer{
|
||||||
|
Sequence: seq,
|
||||||
|
Start: 0,
|
||||||
|
End: seqLen,
|
||||||
|
}, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close closes the underlying file.
|
||||||
|
func (sr *SkmReader) Close() error {
|
||||||
|
return sr.file.Close()
|
||||||
|
}
|
||||||
176
pkg/obikmer/skm_test.go
Normal file
176
pkg/obikmer/skm_test.go
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestSkmRoundTrip writes the super-kmers extracted from a known sequence to
// a .skm file and verifies that reading the file back yields the same
// sequences, byte for byte (case-insensitively, since decoding emits
// lowercase nucleotides).
func TestSkmRoundTrip(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "test.skm")

	// Create super-kmers from a known sequence
	seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
	k := 21
	m := 9
	superKmers := ExtractSuperKmers(seq, k, m, nil)
	if len(superKmers) == 0 {
		t.Fatal("no super-kmers extracted")
	}

	// Write
	w, err := NewSkmWriter(path)
	if err != nil {
		t.Fatal(err)
	}
	for _, sk := range superKmers {
		if err := w.Write(sk); err != nil {
			t.Fatal(err)
		}
	}
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	// Read back
	r, err := NewSkmReader(path)
	if err != nil {
		t.Fatal(err)
	}
	defer r.Close()

	idx := 0
	for {
		sk, ok := r.Next()
		if !ok {
			break
		}
		if idx >= len(superKmers) {
			t.Fatal("read more super-kmers than written")
		}
		expected := superKmers[idx]
		if len(sk.Sequence) != len(expected.Sequence) {
			t.Fatalf("super-kmer %d: length mismatch: got %d, want %d",
				idx, len(sk.Sequence), len(expected.Sequence))
		}
		// Compare nucleotide-by-nucleotide (case insensitive since decode produces lowercase)
		for j := range sk.Sequence {
			// OR-ing with 0x20 lowercases ASCII letters, making the
			// comparison case-insensitive.
			got := sk.Sequence[j] | 0x20
			want := expected.Sequence[j] | 0x20
			if got != want {
				t.Fatalf("super-kmer %d pos %d: got %c, want %c", idx, j, got, want)
			}
		}
		idx++
	}
	if idx != len(superKmers) {
		t.Fatalf("read %d super-kmers, want %d", idx, len(superKmers))
	}
}
|
||||||
|
|
||||||
|
// TestSkmEmptyFile verifies that a .skm file written with no records is read
// back as an immediate EOF (Next returns false on the first call).
func TestSkmEmptyFile(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "empty.skm")

	// Write nothing
	w, err := NewSkmWriter(path)
	if err != nil {
		t.Fatal(err)
	}
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	// Read back
	r, err := NewSkmReader(path)
	if err != nil {
		t.Fatal(err)
	}
	defer r.Close()

	_, ok := r.Next()
	if ok {
		t.Fatal("expected no super-kmers in empty file")
	}
}
|
||||||
|
|
||||||
|
// TestSkmSingleBase round-trips sequences of lengths 1..5 to exercise every
// possible amount of zero-padding in the last packed byte (len % 4 = 1..0).
func TestSkmSingleBase(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "single.skm")

	// Test with sequences of various lengths to check padding
	sequences := [][]byte{
		[]byte("A"),
		[]byte("AC"),
		[]byte("ACG"),
		[]byte("ACGT"),
		[]byte("ACGTA"),
	}

	w, err := NewSkmWriter(path)
	if err != nil {
		t.Fatal(err)
	}
	for _, seq := range sequences {
		sk := SuperKmer{Sequence: seq}
		if err := w.Write(sk); err != nil {
			t.Fatal(err)
		}
	}
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	r, err := NewSkmReader(path)
	if err != nil {
		t.Fatal(err)
	}
	defer r.Close()

	for i, expected := range sequences {
		sk, ok := r.Next()
		if !ok {
			t.Fatalf("expected super-kmer %d, got EOF", i)
		}
		if len(sk.Sequence) != len(expected) {
			t.Fatalf("sk %d: length %d, want %d", i, len(sk.Sequence), len(expected))
		}
		for j := range sk.Sequence {
			// Case-insensitive compare: the reader decodes to lowercase.
			got := sk.Sequence[j] | 0x20
			want := expected[j] | 0x20
			if got != want {
				t.Fatalf("sk %d pos %d: got %c, want %c", i, j, got, want)
			}
		}
	}
}
|
||||||
|
|
||||||
|
// TestSkmFileSize pins the on-disk record size: a 10-base super-kmer must
// occupy exactly 2 bytes of length header plus ceil(10/4) = 3 packed bytes.
func TestSkmFileSize(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "size.skm")

	// Write a sequence of known length
	seq := []byte("ACGTACGTAC") // 10 bases
	sk := SuperKmer{Sequence: seq}

	w, err := NewSkmWriter(path)
	if err != nil {
		t.Fatal(err)
	}
	if err := w.Write(sk); err != nil {
		t.Fatal(err)
	}
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	// Expected: 2 bytes (length) + ceil(10/4)=3 bytes (data) = 5 bytes
	info, err := os.Stat(path)
	if err != nil {
		t.Fatal(err)
	}
	if info.Size() != 5 {
		t.Fatalf("file size: got %d, want 5", info.Size())
	}
}
|
||||||
74
pkg/obikmer/skm_writer.go
Normal file
74
pkg/obikmer/skm_writer.go
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
	"bufio"
	"encoding/binary"
	"fmt"
	"os"
)
|
||||||
|
|
||||||
|
// SkmWriter writes super-kmers to a binary .skm file.
|
||||||
|
//
|
||||||
|
// Format per super-kmer:
|
||||||
|
//
|
||||||
|
// [len: uint16 LE] length of the super-kmer in bases
|
||||||
|
// [data: ceil(len/4) bytes] sequence encoded 2 bits/base, packed
|
||||||
|
//
|
||||||
|
// Nucleotide encoding: A=00, C=01, G=10, T=11.
|
||||||
|
// The last byte is zero-padded on the low bits if len%4 != 0.
|
||||||
|
type SkmWriter struct {
|
||||||
|
w *bufio.Writer
|
||||||
|
file *os.File
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewSkmWriter creates a new SkmWriter writing to the given file path.
|
||||||
|
func NewSkmWriter(path string) (*SkmWriter, error) {
|
||||||
|
f, err := os.Create(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &SkmWriter{
|
||||||
|
w: bufio.NewWriterSize(f, 65536),
|
||||||
|
file: f,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write encodes a SuperKmer to the .skm file.
|
||||||
|
// The sequence bytes are packed 2 bits per base.
|
||||||
|
func (sw *SkmWriter) Write(sk SuperKmer) error {
|
||||||
|
seq := sk.Sequence
|
||||||
|
seqLen := uint16(len(seq))
|
||||||
|
|
||||||
|
// Write length
|
||||||
|
var lenbuf [2]byte
|
||||||
|
binary.LittleEndian.PutUint16(lenbuf[:], seqLen)
|
||||||
|
if _, err := sw.w.Write(lenbuf[:]); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Encode and write packed sequence (2 bits/base)
|
||||||
|
nBytes := (int(seqLen) + 3) / 4
|
||||||
|
for i := 0; i < nBytes; i++ {
|
||||||
|
var packed byte
|
||||||
|
for j := 0; j < 4; j++ {
|
||||||
|
pos := i*4 + j
|
||||||
|
packed <<= 2
|
||||||
|
if pos < int(seqLen) {
|
||||||
|
packed |= __single_base_code__[seq[pos]&31]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := sw.w.WriteByte(packed); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close flushes buffered data and closes the underlying file.
|
||||||
|
func (sw *SkmWriter) Close() error {
|
||||||
|
if err := sw.w.Flush(); err != nil {
|
||||||
|
sw.file.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return sw.file.Close()
|
||||||
|
}
|
||||||
253
pkg/obikmer/spectrum.go
Normal file
253
pkg/obikmer/spectrum.go
Normal file
@@ -0,0 +1,253 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"container/heap"
|
||||||
|
"encoding/csv"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
)
|
||||||
|
|
||||||
|
// kspMagic is the KSP file magic: "KSP\x01" (K-mer SPectrum v1).
var kspMagic = [4]byte{'K', 'S', 'P', 0x01}

// SpectrumEntry represents one entry in a k-mer frequency spectrum.
type SpectrumEntry struct {
	Frequency int    // how many times a k-mer was observed
	Count     uint64 // how many distinct k-mers have this frequency
}

// KmerSpectrum represents the frequency distribution of k-mers.
// Entries are sorted by Frequency in ascending order and only include
// non-zero counts.
type KmerSpectrum struct {
	Entries []SpectrumEntry
}

// MaxFrequency returns the highest frequency in the spectrum, or 0 if empty.
func (s *KmerSpectrum) MaxFrequency() int {
	n := len(s.Entries)
	if n == 0 {
		return 0
	}
	// Entries are sorted ascending, so the last one carries the maximum.
	return s.Entries[n-1].Frequency
}

// ToMap converts a KmerSpectrum back to a map for easy lookup.
func (s *KmerSpectrum) ToMap() map[int]uint64 {
	lookup := make(map[int]uint64, len(s.Entries))
	for _, entry := range s.Entries {
		lookup[entry.Frequency] = entry.Count
	}
	return lookup
}

// MapToSpectrum converts a map[int]uint64 to a sorted KmerSpectrum,
// dropping zero counts.
func MapToSpectrum(m map[int]uint64) *KmerSpectrum {
	collected := make([]SpectrumEntry, 0, len(m))
	for freq, count := range m {
		if count == 0 {
			continue
		}
		collected = append(collected, SpectrumEntry{Frequency: freq, Count: count})
	}
	sort.Slice(collected, func(a, b int) bool {
		return collected[a].Frequency < collected[b].Frequency
	})
	return &KmerSpectrum{Entries: collected}
}
|
||||||
|
|
||||||
|
// MergeSpectraMaps accumulates every frequency count from b into a,
// mutating a in place. b is left untouched.
func MergeSpectraMaps(a, b map[int]uint64) {
	for frequency, count := range b {
		a[frequency] = a[frequency] + count
	}
}
|
||||||
|
|
||||||
|
// WriteSpectrum writes a KmerSpectrum to a binary file.
|
||||||
|
//
|
||||||
|
// Format:
|
||||||
|
//
|
||||||
|
// [magic: 4 bytes "KSP\x01"]
|
||||||
|
// [n_entries: varint]
|
||||||
|
// For each entry (sorted by frequency ascending):
|
||||||
|
// [frequency: varint]
|
||||||
|
// [count: varint]
|
||||||
|
func WriteSpectrum(path string, spectrum *KmerSpectrum) error {
|
||||||
|
f, err := os.Create(path)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("create spectrum file: %w", err)
|
||||||
|
}
|
||||||
|
w := bufio.NewWriterSize(f, 65536)
|
||||||
|
|
||||||
|
// Magic
|
||||||
|
if _, err := w.Write(kspMagic[:]); err != nil {
|
||||||
|
f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Number of entries
|
||||||
|
if _, err := EncodeVarint(w, uint64(len(spectrum.Entries))); err != nil {
|
||||||
|
f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Entries
|
||||||
|
for _, e := range spectrum.Entries {
|
||||||
|
if _, err := EncodeVarint(w, uint64(e.Frequency)); err != nil {
|
||||||
|
f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if _, err := EncodeVarint(w, e.Count); err != nil {
|
||||||
|
f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := w.Flush(); err != nil {
|
||||||
|
f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return f.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReadSpectrum reads a KmerSpectrum from a binary file.
|
||||||
|
func ReadSpectrum(path string) (*KmerSpectrum, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
r := bufio.NewReaderSize(f, 65536)
|
||||||
|
|
||||||
|
// Check magic
|
||||||
|
var magic [4]byte
|
||||||
|
if _, err := r.Read(magic[:]); err != nil {
|
||||||
|
return nil, fmt.Errorf("read spectrum magic: %w", err)
|
||||||
|
}
|
||||||
|
if magic != kspMagic {
|
||||||
|
return nil, fmt.Errorf("invalid spectrum file magic: %v", magic)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Number of entries
|
||||||
|
nEntries, err := DecodeVarint(r)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("read spectrum entry count: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
entries := make([]SpectrumEntry, nEntries)
|
||||||
|
for i := uint64(0); i < nEntries; i++ {
|
||||||
|
freq, err := DecodeVarint(r)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("read spectrum freq at entry %d: %w", i, err)
|
||||||
|
}
|
||||||
|
count, err := DecodeVarint(r)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("read spectrum count at entry %d: %w", i, err)
|
||||||
|
}
|
||||||
|
entries[i] = SpectrumEntry{
|
||||||
|
Frequency: int(freq),
|
||||||
|
Count: count,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &KmerSpectrum{Entries: entries}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// KmerFreq associates a k-mer (encoded as uint64) with its observed frequency.
type KmerFreq struct {
	Kmer uint64
	Freq int
}

// kmerFreqHeap is a min-heap of KmerFreq ordered by Freq (lowest first).
// Keeping the minimum at the root makes evicting the least frequent k-mer
// from a top-N set an O(log n) operation.
type kmerFreqHeap []KmerFreq

func (h kmerFreqHeap) Len() int            { return len(h) }
func (h kmerFreqHeap) Less(i, j int) bool  { return h[i].Freq < h[j].Freq }
func (h kmerFreqHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *kmerFreqHeap) Push(x interface{}) { *h = append(*h, x.(KmerFreq)) }
func (h *kmerFreqHeap) Pop() interface{} {
	items := *h
	last := len(items) - 1
	popped := items[last]
	*h = items[:last]
	return popped
}

// TopNKmers maintains a collection of the N most frequent k-mers
// using a min-heap. Thread-safe usage requires external synchronization.
type TopNKmers struct {
	n int          // capacity: keep at most n k-mers
	h kmerFreqHeap // min-heap; root is the least frequent retained k-mer
}

// NewTopNKmers creates a new top-N collector retaining at most n k-mers.
func NewTopNKmers(n int) *TopNKmers {
	return &TopNKmers{
		n: n,
		h: make(kmerFreqHeap, 0, n+1),
	}
}

// Add considers a k-mer with the given frequency for inclusion in the top-N.
// While fewer than n k-mers are held it is always kept; afterwards it
// replaces the current minimum only when strictly more frequent.
func (t *TopNKmers) Add(kmer uint64, freq int) {
	if t.n <= 0 {
		return
	}
	candidate := KmerFreq{Kmer: kmer, Freq: freq}
	switch {
	case len(t.h) < t.n:
		heap.Push(&t.h, candidate)
	case freq > t.h[0].Freq:
		// Overwrite the root in place and re-heapify: cheaper than a
		// Pop followed by a Push.
		t.h[0] = candidate
		heap.Fix(&t.h, 0)
	}
}

// Results returns the collected k-mers sorted by frequency descending.
// The internal heap is left untouched.
func (t *TopNKmers) Results() []KmerFreq {
	snapshot := append(make([]KmerFreq, 0, len(t.h)), t.h...)
	sort.Slice(snapshot, func(a, b int) bool {
		return snapshot[a].Freq > snapshot[b].Freq
	})
	return snapshot
}

// MergeTopN merges another TopNKmers into this one. A nil argument is a no-op.
func (t *TopNKmers) MergeTopN(other *TopNKmers) {
	if other == nil {
		return
	}
	for _, entry := range other.h {
		t.Add(entry.Kmer, entry.Freq)
	}
}
|
||||||
|
|
||||||
|
// WriteTopKmersCSV writes the top k-mers to a CSV file.
|
||||||
|
// Columns: sequence, frequency
|
||||||
|
func WriteTopKmersCSV(path string, topKmers []KmerFreq, k int) error {
|
||||||
|
f, err := os.Create(path)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("create top-kmers file: %w", err)
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
w := csv.NewWriter(f)
|
||||||
|
defer w.Flush()
|
||||||
|
|
||||||
|
if err := w.Write([]string{"sequence", "frequency"}); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
buf := make([]byte, k)
|
||||||
|
for _, kf := range topKmers {
|
||||||
|
seq := DecodeKmer(kf.Kmer, k, buf)
|
||||||
|
if err := w.Write([]string{string(seq), strconv.Itoa(kf.Freq)}); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
53
pkg/obikmer/varint.go
Normal file
53
pkg/obikmer/varint.go
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import "io"
|
||||||
|
|
||||||
|
// EncodeVarint writes a uint64 value as a variable-length integer to w.
|
||||||
|
// Uses 7 bits per byte with the high bit as a continuation flag
|
||||||
|
// (identical to protobuf unsigned varint encoding).
|
||||||
|
// Returns the number of bytes written.
|
||||||
|
func EncodeVarint(w io.Writer, v uint64) (int, error) {
|
||||||
|
var buf [10]byte // max 10 bytes for uint64 varint
|
||||||
|
n := 0
|
||||||
|
for v >= 0x80 {
|
||||||
|
buf[n] = byte(v) | 0x80
|
||||||
|
v >>= 7
|
||||||
|
n++
|
||||||
|
}
|
||||||
|
buf[n] = byte(v)
|
||||||
|
n++
|
||||||
|
return w.Write(buf[:n])
|
||||||
|
}
|
||||||
|
|
||||||
|
// DecodeVarint reads a variable-length encoded uint64 from r.
|
||||||
|
// Returns the decoded value and any error encountered.
|
||||||
|
func DecodeVarint(r io.Reader) (uint64, error) {
|
||||||
|
var val uint64
|
||||||
|
var shift uint
|
||||||
|
var buf [1]byte
|
||||||
|
|
||||||
|
for {
|
||||||
|
if _, err := io.ReadFull(r, buf[:]); err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
b := buf[0]
|
||||||
|
val |= uint64(b&0x7F) << shift
|
||||||
|
if b < 0x80 {
|
||||||
|
return val, nil
|
||||||
|
}
|
||||||
|
shift += 7
|
||||||
|
if shift >= 70 {
|
||||||
|
return 0, io.ErrUnexpectedEOF
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// VarintLen returns the number of bytes needed to encode v as a varint.
|
||||||
|
func VarintLen(v uint64) int {
|
||||||
|
n := 1
|
||||||
|
for v >= 0x80 {
|
||||||
|
v >>= 7
|
||||||
|
n++
|
||||||
|
}
|
||||||
|
return n
|
||||||
|
}
|
||||||
82
pkg/obikmer/varint_test.go
Normal file
82
pkg/obikmer/varint_test.go
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestVarintRoundTrip encodes and decodes values straddling every 7-bit
// group boundary (2^7, 2^14, ..., 2^63) plus the extremes, checking both the
// round-trip value and that the written byte count matches VarintLen.
func TestVarintRoundTrip(t *testing.T) {
	values := []uint64{
		0, 1, 127, 128, 255, 256,
		16383, 16384,
		1<<21 - 1, 1 << 21,
		1<<28 - 1, 1 << 28,
		1<<35 - 1, 1 << 35,
		1<<42 - 1, 1 << 42,
		1<<49 - 1, 1 << 49,
		1<<56 - 1, 1 << 56,
		1<<63 - 1, 1 << 63,
		^uint64(0), // max uint64
	}

	for _, v := range values {
		var buf bytes.Buffer
		n, err := EncodeVarint(&buf, v)
		if err != nil {
			t.Fatalf("EncodeVarint(%d): %v", v, err)
		}
		if n != VarintLen(v) {
			t.Fatalf("EncodeVarint(%d): wrote %d bytes, VarintLen says %d", v, n, VarintLen(v))
		}

		decoded, err := DecodeVarint(&buf)
		if err != nil {
			t.Fatalf("DecodeVarint for %d: %v", v, err)
		}
		if decoded != v {
			t.Fatalf("roundtrip failed: encoded %d, decoded %d", v, decoded)
		}
	}
}
|
||||||
|
|
||||||
|
// TestVarintLen pins the byte-length function at the 7-bit group boundaries
// (127/128, 16383/16384) and at the 10-byte maximum for ^uint64(0).
func TestVarintLen(t *testing.T) {
	tests := []struct {
		value    uint64
		expected int
	}{
		{0, 1},
		{127, 1},
		{128, 2},
		{16383, 2},
		{16384, 3},
		{^uint64(0), 10},
	}

	for _, tc := range tests {
		got := VarintLen(tc.value)
		if got != tc.expected {
			t.Errorf("VarintLen(%d) = %d, want %d", tc.value, got, tc.expected)
		}
	}
}
|
||||||
|
|
||||||
|
// TestVarintSequence checks that several varints written back-to-back into
// one buffer decode in order — i.e. each decode consumes exactly the bytes
// its encode produced, with no over- or under-read.
func TestVarintSequence(t *testing.T) {
	var buf bytes.Buffer
	values := []uint64{0, 42, 1000000, ^uint64(0), 1}

	for _, v := range values {
		if _, err := EncodeVarint(&buf, v); err != nil {
			t.Fatalf("EncodeVarint(%d): %v", v, err)
		}
	}

	for _, expected := range values {
		got, err := DecodeVarint(&buf)
		if err != nil {
			t.Fatalf("DecodeVarint: %v", err)
		}
		if got != expected {
			t.Errorf("got %d, want %d", got, expected)
		}
	}
}
|
||||||
@@ -31,7 +31,8 @@ func obiseqslice2Lua(interpreter *lua.LState,
|
|||||||
}
|
}
|
||||||
|
|
||||||
func newObiSeqSlice(luaState *lua.LState) int {
|
func newObiSeqSlice(luaState *lua.LState) int {
|
||||||
seqslice := obiseq.NewBioSequenceSlice()
|
capacity := luaState.OptInt(1, 0)
|
||||||
|
seqslice := obiseq.NewBioSequenceSlice(capacity)
|
||||||
luaState.Push(obiseqslice2Lua(luaState, seqslice))
|
luaState.Push(obiseqslice2Lua(luaState, seqslice))
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import (
|
|||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
"github.com/DavidGamba/go-getoptions"
|
"github.com/DavidGamba/go-getoptions"
|
||||||
@@ -26,16 +27,11 @@ var __defaut_taxonomy_mutex__ sync.Mutex
|
|||||||
|
|
||||||
type ArgumentParser func([]string) (*getoptions.GetOpt, []string)
|
type ArgumentParser func([]string) (*getoptions.GetOpt, []string)
|
||||||
|
|
||||||
func GenerateOptionParser(program string,
|
// RegisterGlobalOptions registers the global options shared by all obitools
|
||||||
documentation string,
|
// commands onto the given GetOpt instance. It does NOT register --help,
|
||||||
optionset ...func(*getoptions.GetOpt)) ArgumentParser {
|
// which must be handled by the caller (either as a Bool option or via
|
||||||
|
// HelpCommand for subcommand-based parsers).
|
||||||
options := getoptions.New()
|
func RegisterGlobalOptions(options *getoptions.GetOpt) {
|
||||||
options.Self(program, documentation)
|
|
||||||
options.SetMode(getoptions.Bundling)
|
|
||||||
options.SetUnknownMode(getoptions.Fail)
|
|
||||||
options.Bool("help", false, options.Alias("h", "?"))
|
|
||||||
|
|
||||||
options.Bool("version", false,
|
options.Bool("version", false,
|
||||||
options.Description("Prints the version and exits."))
|
options.Description("Prints the version and exits."))
|
||||||
|
|
||||||
@@ -46,17 +42,10 @@ func GenerateOptionParser(program string,
|
|||||||
options.BoolVar(&_Pprof, "pprof", false,
|
options.BoolVar(&_Pprof, "pprof", false,
|
||||||
options.Description("Enable pprof server. Look at the log for details."))
|
options.Description("Enable pprof server. Look at the log for details."))
|
||||||
|
|
||||||
// options.IntVar(&_ParallelWorkers, "workers", _ParallelWorkers,
|
|
||||||
// options.Alias("w"),
|
|
||||||
// options.Description("Number of parallele threads computing the result"))
|
|
||||||
|
|
||||||
options.IntVar(obidefault.MaxCPUPtr(), "max-cpu", obidefault.MaxCPU(),
|
options.IntVar(obidefault.MaxCPUPtr(), "max-cpu", obidefault.MaxCPU(),
|
||||||
options.GetEnv("OBIMAXCPU"),
|
options.GetEnv("OBIMAXCPU"),
|
||||||
options.Description("Number of parallele threads computing the result"))
|
options.Description("Number of parallele threads computing the result"))
|
||||||
|
|
||||||
// options.BoolVar(&_Pprof, "force-one-cpu", false,
|
|
||||||
// options.Description("Force to use only one cpu core for parallel processing"))
|
|
||||||
|
|
||||||
options.IntVar(&_PprofMudex, "pprof-mutex", _PprofMudex,
|
options.IntVar(&_PprofMudex, "pprof-mutex", _PprofMudex,
|
||||||
options.GetEnv("OBIPPROFMUTEX"),
|
options.GetEnv("OBIPPROFMUTEX"),
|
||||||
options.Description("Enable profiling of mutex lock."))
|
options.Description("Enable profiling of mutex lock."))
|
||||||
@@ -67,7 +56,15 @@ func GenerateOptionParser(program string,
|
|||||||
|
|
||||||
options.IntVar(obidefault.BatchSizePtr(), "batch-size", obidefault.BatchSize(),
|
options.IntVar(obidefault.BatchSizePtr(), "batch-size", obidefault.BatchSize(),
|
||||||
options.GetEnv("OBIBATCHSIZE"),
|
options.GetEnv("OBIBATCHSIZE"),
|
||||||
options.Description("Number of sequence per batch for paralelle processing"))
|
options.Description("Minimum number of sequences per batch (floor, default 1)"))
|
||||||
|
|
||||||
|
options.IntVar(obidefault.BatchSizeMaxPtr(), "batch-size-max", obidefault.BatchSizeMax(),
|
||||||
|
options.GetEnv("OBIBATCHSIZEMAX"),
|
||||||
|
options.Description("Maximum number of sequences per batch (ceiling, default 2000)"))
|
||||||
|
|
||||||
|
options.StringVar(obidefault.BatchMemStrPtr(), "batch-mem", "",
|
||||||
|
options.GetEnv("OBIBATCHMEM"),
|
||||||
|
options.Description("Maximum memory per batch (e.g. 128K, 64M, 1G; default: 128M). Set to 0 to disable."))
|
||||||
|
|
||||||
options.Bool("solexa", false,
|
options.Bool("solexa", false,
|
||||||
options.GetEnv("OBISOLEXA"),
|
options.GetEnv("OBISOLEXA"),
|
||||||
@@ -77,119 +74,128 @@ func GenerateOptionParser(program string,
|
|||||||
options.GetEnv("OBIWARNING"),
|
options.GetEnv("OBIWARNING"),
|
||||||
options.Description("Stop printing of the warning message"),
|
options.Description("Stop printing of the warning message"),
|
||||||
)
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProcessParsedOptions handles the post-parse logic common to all obitools
// commands: help, version, debug, pprof, taxonomy, cpu configuration, etc.
// It receives the GetOpt instance and the parse error (if any).
//
// Side effects: may call os.Exit (help/version/parse error), may start a
// pprof HTTP server, sets the global log level, loads the default taxonomy,
// and calls runtime.GOMAXPROCS.
func ProcessParsedOptions(options *getoptions.GetOpt, parseErr error) {
	// Note: "help" may not be registered as a Bool (e.g. when using HelpCommand
	// for subcommand-based parsers). Only check if it won't panic.
	// We use a recover guard to be safe.
	func() {
		defer func() { recover() }()
		if options.Called("help") {
			fmt.Fprint(os.Stderr, options.Help())
			os.Exit(0)
		}
	}()

	if options.Called("version") {
		fmt.Fprintf(os.Stderr, "OBITools %s\n", VersionString())
		os.Exit(0)
	}

	if options.Called("taxonomy") {
		// Serialize default-taxonomy installation across goroutines.
		__defaut_taxonomy_mutex__.Lock()
		defer __defaut_taxonomy_mutex__.Unlock()
		taxonomy, err := obiformats.LoadTaxonomy(
			obidefault.SelectedTaxonomy(),
			!obidefault.AreAlternativeNamesSelected(),
			SeqAsTaxa(),
		)

		if err != nil {
			log.Fatalf("Cannot load default taxonomy: %v", err)
		}

		taxonomy.SetAsDefault()
	}

	// Default to Info; --debug raises verbosity.
	log.SetLevel(log.InfoLevel)
	if options.Called("debug") {
		log.SetLevel(log.DebugLevel)
		log.Debugln("Switch to debug level logging")
	}

	if options.Called("pprof") {
		url := "localhost:6060"
		// Best-effort: the server error (e.g. port in use) is ignored.
		go http.ListenAndServe(url, nil)
		log.Infof("Start a pprof server at address %s/debug/pprof", url)
		log.Info("Profil can be followed running concurrently the command :")
		log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/profile?seconds=30'")
	}

	if options.Called("pprof-mutex") {
		url := "localhost:6060"
		go http.ListenAndServe(url, nil)
		runtime.SetMutexProfileFraction(_PprofMudex)
		log.Infof("Start a pprof server at address %s/debug/pprof", url)
		log.Info("Profil can be followed running concurrently the command :")
		log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/mutex'")
	}

	if options.Called("pprof-goroutine") {
		url := "localhost:6060"
		go http.ListenAndServe(url, nil)
		runtime.SetBlockProfileRate(_PprofGoroutine)
		log.Infof("Start a pprof server at address %s/debug/pprof", url)
		log.Info("Profil can be followed running concurrently the command :")
		log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'")
	}

	// Handle user errors
	if parseErr != nil {
		fmt.Fprintf(os.Stderr, "ERROR: %s\n\n", parseErr)
		fmt.Fprint(os.Stderr, options.Help(getoptions.HelpSynopsis))
		os.Exit(1)
	}

	runtime.GOMAXPROCS(obidefault.MaxCPU())

	if options.Called("max-cpu") {
		log.Printf("CPU number limited to %d", obidefault.MaxCPU())
	}

	if options.Called("no-singleton") {
		log.Printf("No singleton option set")
	}

	log.Printf("Number of workers set %d", obidefault.ParallelWorkers())

	if options.Called("solexa") {
		// Solexa FASTQ files encode qualities with an offset of 64 instead of 33.
		obidefault.SetReadQualitiesShift(64)
	}

	if options.Called("batch-mem") {
		n, err := obiutils.ParseMemSize(obidefault.BatchMemStr())
		if err != nil {
			log.Fatalf("Invalid --batch-mem value %q: %v", obidefault.BatchMemStr(), err)
		}
		obidefault.SetBatchMem(n)
		log.Printf("Memory-based batching enabled: %s per batch", obidefault.BatchMemStr())
	}
}
|
||||||
|
|
||||||
|
func GenerateOptionParser(program string,
|
||||||
|
documentation string,
|
||||||
|
optionset ...func(*getoptions.GetOpt)) ArgumentParser {
|
||||||
|
|
||||||
|
options := getoptions.New()
|
||||||
|
options.Self(program, documentation)
|
||||||
|
options.SetMode(getoptions.Bundling)
|
||||||
|
options.SetUnknownMode(getoptions.Fail)
|
||||||
|
options.Bool("help", false, options.Alias("h", "?"))
|
||||||
|
|
||||||
|
RegisterGlobalOptions(options)
|
||||||
|
|
||||||
for _, o := range optionset {
|
for _, o := range optionset {
|
||||||
o(options)
|
o(options)
|
||||||
}
|
}
|
||||||
|
|
||||||
return func(args []string) (*getoptions.GetOpt, []string) {
|
return func(args []string) (*getoptions.GetOpt, []string) {
|
||||||
|
|
||||||
remaining, err := options.Parse(args[1:])
|
remaining, err := options.Parse(args[1:])
|
||||||
|
ProcessParsedOptions(options, err)
|
||||||
if options.Called("help") {
|
|
||||||
fmt.Fprint(os.Stderr, options.Help())
|
|
||||||
os.Exit(0)
|
|
||||||
}
|
|
||||||
|
|
||||||
if options.Called("version") {
|
|
||||||
fmt.Fprintf(os.Stderr, "OBITools %s\n", VersionString())
|
|
||||||
os.Exit(0)
|
|
||||||
}
|
|
||||||
|
|
||||||
if options.Called("taxonomy") {
|
|
||||||
__defaut_taxonomy_mutex__.Lock()
|
|
||||||
defer __defaut_taxonomy_mutex__.Unlock()
|
|
||||||
taxonomy, err := obiformats.LoadTaxonomy(
|
|
||||||
obidefault.SelectedTaxonomy(),
|
|
||||||
!obidefault.AreAlternativeNamesSelected(),
|
|
||||||
SeqAsTaxa(),
|
|
||||||
)
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
log.Fatalf("Cannot load default taxonomy: %v", err)
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
taxonomy.SetAsDefault()
|
|
||||||
}
|
|
||||||
|
|
||||||
log.SetLevel(log.InfoLevel)
|
|
||||||
if options.Called("debug") {
|
|
||||||
log.SetLevel(log.DebugLevel)
|
|
||||||
log.Debugln("Switch to debug level logging")
|
|
||||||
}
|
|
||||||
|
|
||||||
if options.Called("pprof") {
|
|
||||||
url := "localhost:6060"
|
|
||||||
go http.ListenAndServe(url, nil)
|
|
||||||
log.Infof("Start a pprof server at address %s/debug/pprof", url)
|
|
||||||
log.Info("Profil can be followed running concurrently the command :")
|
|
||||||
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/profile?seconds=30'")
|
|
||||||
}
|
|
||||||
|
|
||||||
if options.Called("pprof-mutex") {
|
|
||||||
url := "localhost:6060"
|
|
||||||
go http.ListenAndServe(url, nil)
|
|
||||||
runtime.SetMutexProfileFraction(_PprofMudex)
|
|
||||||
log.Infof("Start a pprof server at address %s/debug/pprof", url)
|
|
||||||
log.Info("Profil can be followed running concurrently the command :")
|
|
||||||
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/mutex'")
|
|
||||||
}
|
|
||||||
|
|
||||||
if options.Called("pprof-goroutine") {
|
|
||||||
url := "localhost:6060"
|
|
||||||
go http.ListenAndServe(url, nil)
|
|
||||||
runtime.SetBlockProfileRate(_PprofGoroutine)
|
|
||||||
log.Infof("Start a pprof server at address %s/debug/pprof", url)
|
|
||||||
log.Info("Profil can be followed running concurrently the command :")
|
|
||||||
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Handle user errors
|
|
||||||
if err != nil {
|
|
||||||
fmt.Fprintf(os.Stderr, "ERROR: %s\n\n", err)
|
|
||||||
fmt.Fprint(os.Stderr, options.Help(getoptions.HelpSynopsis))
|
|
||||||
os.Exit(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
// // Setup the maximum number of CPU usable by the program
|
|
||||||
// if obidefault.MaxCPU() == 1 {
|
|
||||||
// log.Warn("Limitating the Maximum number of CPU to 1 is not recommanded")
|
|
||||||
// log.Warn("The number of CPU requested has been set to 2")
|
|
||||||
// obidefault.SetMaxCPU(2)
|
|
||||||
// }
|
|
||||||
|
|
||||||
// if options.Called("force-one-cpu") {
|
|
||||||
// log.Warn("Limitating the Maximum number of CPU to 1 is not recommanded")
|
|
||||||
// log.Warn("The number of CPU has been forced to 1")
|
|
||||||
// log.Warn("This can lead to unexpected behavior")
|
|
||||||
// obidefault.SetMaxCPU(1)
|
|
||||||
// }
|
|
||||||
|
|
||||||
runtime.GOMAXPROCS(obidefault.MaxCPU())
|
|
||||||
|
|
||||||
// if options.Called("max-cpu") || options.Called("force-one-cpu") {
|
|
||||||
// log.Printf("CPU number limited to %d", obidefault.MaxCPU())
|
|
||||||
// }
|
|
||||||
|
|
||||||
if options.Called("max-cpu") {
|
|
||||||
log.Printf("CPU number limited to %d", obidefault.MaxCPU())
|
|
||||||
}
|
|
||||||
|
|
||||||
if options.Called("no-singleton") {
|
|
||||||
log.Printf("No singleton option set")
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Printf("Number of workers set %d", obidefault.ParallelWorkers())
|
|
||||||
|
|
||||||
// if options.Called("workers") {
|
|
||||||
|
|
||||||
// }
|
|
||||||
|
|
||||||
if options.Called("solexa") {
|
|
||||||
obidefault.SetReadQualitiesShift(64)
|
|
||||||
}
|
|
||||||
|
|
||||||
return options, remaining
|
return options, remaining
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
43
pkg/obioptions/subcommand.go
Normal file
43
pkg/obioptions/subcommand.go
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
package obioptions
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GenerateSubcommandParser creates an option parser that supports subcommands
|
||||||
|
// via go-getoptions' NewCommand/SetCommandFn/Dispatch API.
|
||||||
|
//
|
||||||
|
// The setup function receives the root *GetOpt and should register subcommands
|
||||||
|
// using opt.NewCommand(). Global options (--debug, --max-cpu, etc.) are
|
||||||
|
// registered before setup is called and are inherited by all subcommands.
|
||||||
|
//
|
||||||
|
// Returns the root *GetOpt (needed for Dispatch) and an ArgumentParser
|
||||||
|
// that handles parsing and post-parse processing.
|
||||||
|
func GenerateSubcommandParser(
|
||||||
|
program string,
|
||||||
|
documentation string,
|
||||||
|
setup func(opt *getoptions.GetOpt),
|
||||||
|
) (*getoptions.GetOpt, ArgumentParser) {
|
||||||
|
|
||||||
|
options := getoptions.New()
|
||||||
|
options.Self(program, documentation)
|
||||||
|
options.SetMode(getoptions.Bundling)
|
||||||
|
options.SetUnknownMode(getoptions.Fail)
|
||||||
|
|
||||||
|
// Register global options (inherited by all subcommands)
|
||||||
|
RegisterGlobalOptions(options)
|
||||||
|
|
||||||
|
// Let the caller register subcommands
|
||||||
|
setup(options)
|
||||||
|
|
||||||
|
// Add automatic help subcommand (must be after all commands)
|
||||||
|
options.HelpCommand("help", options.Description("Show help for a command"))
|
||||||
|
|
||||||
|
parser := func(args []string) (*getoptions.GetOpt, []string) {
|
||||||
|
remaining, err := options.Parse(args[1:])
|
||||||
|
ProcessParsedOptions(options, err)
|
||||||
|
return options, remaining
|
||||||
|
}
|
||||||
|
|
||||||
|
return options, parser
|
||||||
|
}
|
||||||
@@ -3,7 +3,7 @@ package obioptions
|
|||||||
// Version is automatically updated by the Makefile from version.txt
|
// Version is automatically updated by the Makefile from version.txt
|
||||||
// The patch number (third digit) is incremented on each push to the repository
|
// The patch number (third digit) is incremented on each push to the repository
|
||||||
|
|
||||||
var _Version = "Release 4.4.12"
|
var _Version = "Release 4.4.29"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -120,6 +120,19 @@ func NewBioSequence(id string,
|
|||||||
return bs
|
return bs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NewBioSequenceOwning creates a BioSequence taking ownership of the sequence
|
||||||
|
// slice without copying it. The caller must not use the slice after this call.
|
||||||
|
// Use this when the slice was allocated specifically for this sequence.
|
||||||
|
func NewBioSequenceOwning(id string,
|
||||||
|
sequence []byte,
|
||||||
|
definition string) *BioSequence {
|
||||||
|
bs := NewEmptyBioSequence(0)
|
||||||
|
bs.SetId(id)
|
||||||
|
bs.TakeSequence(sequence)
|
||||||
|
bs.SetDefinition(definition)
|
||||||
|
return bs
|
||||||
|
}
|
||||||
|
|
||||||
// NewBioSequenceWithQualities creates a new BioSequence object with the given id, sequence, definition, and qualities.
|
// NewBioSequenceWithQualities creates a new BioSequence object with the given id, sequence, definition, and qualities.
|
||||||
//
|
//
|
||||||
// Parameters:
|
// Parameters:
|
||||||
@@ -260,6 +273,28 @@ func (s *BioSequence) Len() int {
|
|||||||
return len(s.sequence)
|
return len(s.sequence)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MemorySize returns an estimate of the memory footprint of the BioSequence
|
||||||
|
// in bytes. It accounts for the sequence, quality scores, feature data,
|
||||||
|
// annotations, and fixed struct overhead. The estimate is conservative
|
||||||
|
// (cap rather than len for byte slices) so it is suitable for memory-based
|
||||||
|
// batching decisions.
|
||||||
|
func (s *BioSequence) MemorySize() int {
|
||||||
|
if s == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
// fixed struct overhead (strings, pointers, mutex pointer)
|
||||||
|
const overhead = 128
|
||||||
|
n := overhead
|
||||||
|
n += cap(s.sequence)
|
||||||
|
n += cap(s.qualities)
|
||||||
|
n += cap(s.feature)
|
||||||
|
n += len(s.id)
|
||||||
|
n += len(s.source)
|
||||||
|
// rough annotation estimate: each key+value pair ~64 bytes on average
|
||||||
|
n += len(s.annotations) * 64
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
|
||||||
// HasQualities checks if the BioSequence has sequence qualitiy scores.
|
// HasQualities checks if the BioSequence has sequence qualitiy scores.
|
||||||
//
|
//
|
||||||
// This function does not have any parameters.
|
// This function does not have any parameters.
|
||||||
@@ -444,6 +479,12 @@ func (s *BioSequence) SetSequence(sequence []byte) {
|
|||||||
s.sequence = obiutils.InPlaceToLower(CopySlice(sequence))
|
s.sequence = obiutils.InPlaceToLower(CopySlice(sequence))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TakeSequence stores the slice directly without copying, then lowercases in-place.
|
||||||
|
// The caller must not use the slice after this call.
|
||||||
|
func (s *BioSequence) TakeSequence(sequence []byte) {
|
||||||
|
s.sequence = obiutils.InPlaceToLower(sequence)
|
||||||
|
}
|
||||||
|
|
||||||
func (s *BioSequence) HasValidSequence() bool {
|
func (s *BioSequence) HasValidSequence() bool {
|
||||||
for _, c := range s.sequence {
|
for _, c := range s.sequence {
|
||||||
if !((c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '[' || c == ']') {
|
if !((c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '[' || c == ']') {
|
||||||
@@ -461,6 +502,15 @@ func (s *BioSequence) SetQualities(qualities Quality) {
|
|||||||
s.qualities = CopySlice(qualities)
|
s.qualities = CopySlice(qualities)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TakeQualities stores the slice directly without copying.
|
||||||
|
// The caller must not use the slice after this call.
|
||||||
|
func (s *BioSequence) TakeQualities(qualities Quality) {
|
||||||
|
if s.qualities != nil {
|
||||||
|
RecycleSlice(&s.qualities)
|
||||||
|
}
|
||||||
|
s.qualities = qualities
|
||||||
|
}
|
||||||
|
|
||||||
// A method that appends a byte slice to the qualities of the BioSequence.
|
// A method that appends a byte slice to the qualities of the BioSequence.
|
||||||
func (s *BioSequence) WriteQualities(data []byte) (int, error) {
|
func (s *BioSequence) WriteQualities(data []byte) (int, error) {
|
||||||
s.qualities = append(s.qualities, data...)
|
s.qualities = append(s.qualities, data...)
|
||||||
|
|||||||
@@ -195,7 +195,7 @@ func (s *BioSequenceSlice) ExtractTaxonomy(taxonomy *obitax.Taxonomy, seqAsTaxa
|
|||||||
return nil, fmt.Errorf("sequence %v has no path", s.Id())
|
return nil, fmt.Errorf("sequence %v has no path", s.Id())
|
||||||
}
|
}
|
||||||
last := path[len(path)-1]
|
last := path[len(path)-1]
|
||||||
taxname, _ := obiutils.SplitInTwo(last, ':')
|
taxname, _ := obiutils.LeftSplitInTwo(last, ':')
|
||||||
if idx, ok := s.GetIntAttribute("seq_number"); !ok {
|
if idx, ok := s.GetIntAttribute("seq_number"); !ok {
|
||||||
return nil, errors.New("sequences are not numbered")
|
return nil, errors.New("sequences are not numbered")
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -1,13 +1,20 @@
|
|||||||
package obiseq
|
package obiseq
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"runtime"
|
||||||
"sync"
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const _LargeSliceThreshold = 100 * 1024 // 100 kb — below: leave to GC, above: trigger explicit GC
|
||||||
|
const _GCBytesBudget = int64(256 * 1024 * 1024) // trigger GC every 256 MB of large discards
|
||||||
|
|
||||||
|
var _largeSliceDiscardedBytes = atomic.Int64{}
|
||||||
|
|
||||||
var _BioSequenceByteSlicePool = sync.Pool{
|
var _BioSequenceByteSlicePool = sync.Pool{
|
||||||
New: func() interface{} {
|
New: func() interface{} {
|
||||||
bs := make([]byte, 0, 300)
|
bs := make([]byte, 0, 300)
|
||||||
@@ -34,6 +41,13 @@ func RecycleSlice(s *[]byte) {
|
|||||||
}
|
}
|
||||||
if cap(*s) <= 1024 {
|
if cap(*s) <= 1024 {
|
||||||
_BioSequenceByteSlicePool.Put(s)
|
_BioSequenceByteSlicePool.Put(s)
|
||||||
|
} else if cap(*s) >= _LargeSliceThreshold {
|
||||||
|
n := int64(cap(*s))
|
||||||
|
*s = nil
|
||||||
|
prev := _largeSliceDiscardedBytes.Load()
|
||||||
|
if _largeSliceDiscardedBytes.Add(n)/_GCBytesBudget > prev/_GCBytesBudget {
|
||||||
|
runtime.GC()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -104,11 +104,11 @@ func SeqToSliceWorker(worker SeqWorker,
|
|||||||
for _, s := range input {
|
for _, s := range input {
|
||||||
r, err := worker(s)
|
r, err := worker(s)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
if i+len(r) > cap(output) {
|
||||||
|
output = slices.Grow(output[:i], len(r))
|
||||||
|
output = output[:cap(output)]
|
||||||
|
}
|
||||||
for _, rs := range r {
|
for _, rs := range r {
|
||||||
if i == len(output) {
|
|
||||||
output = slices.Grow(output, cap(output))
|
|
||||||
output = output[:cap(output)]
|
|
||||||
}
|
|
||||||
output[i] = rs
|
output[i] = rs
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ func NewTaxidFactory(code string, alphabet obiutils.AsciiSet) *TaxidFactory {
|
|||||||
// It extracts the relevant part of the string after the first colon (':') if present.
|
// It extracts the relevant part of the string after the first colon (':') if present.
|
||||||
func (f *TaxidFactory) FromString(taxid string) (Taxid, error) {
|
func (f *TaxidFactory) FromString(taxid string) (Taxid, error) {
|
||||||
taxid = obiutils.AsciiSpaceSet.TrimLeft(taxid)
|
taxid = obiutils.AsciiSpaceSet.TrimLeft(taxid)
|
||||||
part1, part2 := obiutils.SplitInTwo(taxid, ':')
|
part1, part2 := obiutils.LeftSplitInTwo(taxid, ':')
|
||||||
if len(part2) == 0 {
|
if len(part2) == 0 {
|
||||||
taxid = part1
|
taxid = part1
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
|
|||||||
fmt.Println(err)
|
fmt.Println(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
destfile, err := obiutils.CompressStream(file, true, true)
|
destfile, err := obiutils.CompressStream(file, compressed, true)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Println(err)
|
fmt.Println(err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -68,6 +68,8 @@ func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
|
|||||||
strings.HasSuffix(path, "seq.gz") ||
|
strings.HasSuffix(path, "seq.gz") ||
|
||||||
strings.HasSuffix(path, "gb") ||
|
strings.HasSuffix(path, "gb") ||
|
||||||
strings.HasSuffix(path, "gb.gz") ||
|
strings.HasSuffix(path, "gb.gz") ||
|
||||||
|
strings.HasSuffix(path, "gbff") ||
|
||||||
|
strings.HasSuffix(path, "gbff.gz") ||
|
||||||
strings.HasSuffix(path, "dat") ||
|
strings.HasSuffix(path, "dat") ||
|
||||||
strings.HasSuffix(path, "dat.gz") ||
|
strings.HasSuffix(path, "dat.gz") ||
|
||||||
strings.HasSuffix(path, "ecopcr") ||
|
strings.HasSuffix(path, "ecopcr") ||
|
||||||
@@ -204,7 +206,7 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
|||||||
iterator = iterator.PairTo(ip)
|
iterator = iterator.PairTo(ip)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
iterator = obiiter.NilIBioSequence
|
return obiiter.NilIBioSequence, fmt.Errorf("no sequence files found in the provided paths")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -212,6 +214,8 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
|||||||
|
|
||||||
iterator = iterator.Speed("Reading sequences")
|
iterator = iterator.Speed("Reading sequences")
|
||||||
|
|
||||||
|
iterator = iterator.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
|
||||||
|
|
||||||
return iterator, nil
|
return iterator, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -46,8 +46,7 @@ func CLIDistributeSequence(sequences obiiter.IBioSequence) {
|
|||||||
formater = obiformats.WriteSequencesToFile
|
formater = obiformats.WriteSequencesToFile
|
||||||
}
|
}
|
||||||
|
|
||||||
dispatcher := sequences.Distribute(CLISequenceClassifier(),
|
dispatcher := sequences.Distribute(CLISequenceClassifier())
|
||||||
obidefault.BatchSize())
|
|
||||||
|
|
||||||
obiformats.WriterDispatcher(CLIFileNamePattern(),
|
obiformats.WriterDispatcher(CLIFileNamePattern(),
|
||||||
dispatcher, formater, opts...,
|
dispatcher, formater, opts...,
|
||||||
|
|||||||
55
pkg/obitools/obik/cp.go
Normal file
55
pkg/obitools/obik/cp.go
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
func runCp(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||||
|
if len(args) < 2 {
|
||||||
|
return fmt.Errorf("usage: obik cp [--set PATTERN]... [--force] <source_index> <dest_index>")
|
||||||
|
}
|
||||||
|
|
||||||
|
srcDir := args[0]
|
||||||
|
destDir := args[1]
|
||||||
|
|
||||||
|
ksg, err := obikmer.OpenKmerSetGroup(srcDir)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open source kmer index: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resolve set patterns
|
||||||
|
patterns := CLISetPatterns()
|
||||||
|
var ids []string
|
||||||
|
if len(patterns) > 0 {
|
||||||
|
indices, err := ksg.MatchSetIDs(patterns)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if len(indices) == 0 {
|
||||||
|
return fmt.Errorf("no sets match the given patterns")
|
||||||
|
}
|
||||||
|
ids = make([]string, len(indices))
|
||||||
|
for i, idx := range indices {
|
||||||
|
ids[i] = ksg.SetIDOf(idx)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Copy all sets
|
||||||
|
ids = ksg.SetsIDs()
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("Copying %d set(s) from %s to %s", len(ids), srcDir, destDir)
|
||||||
|
|
||||||
|
dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce())
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("Destination now has %d set(s)", dest.Size())
|
||||||
|
return nil
|
||||||
|
}
|
||||||
344
pkg/obitools/obik/filter.go
Normal file
344
pkg/obitools/obik/filter.go
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
|
||||||
|
"github.com/schollz/progressbar/v3"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// KmerFilter is a predicate applied to individual k-mers during filtering.
|
||||||
|
// Returns true if the k-mer should be kept.
|
||||||
|
type KmerFilter func(kmer uint64) bool
|
||||||
|
|
||||||
|
// KmerFilterFactory creates a new KmerFilter instance.
|
||||||
|
// Each goroutine should call the factory to get its own filter,
|
||||||
|
// since some filters (e.g. KmerEntropyFilter) are not thread-safe.
|
||||||
|
type KmerFilterFactory func() KmerFilter
|
||||||
|
|
||||||
|
// chainFilterFactories combines multiple KmerFilterFactory into one.
|
||||||
|
// The resulting factory creates a filter that accepts a k-mer only
|
||||||
|
// if all individual filters accept it.
|
||||||
|
func chainFilterFactories(factories []KmerFilterFactory) KmerFilterFactory {
|
||||||
|
switch len(factories) {
|
||||||
|
case 0:
|
||||||
|
return func() KmerFilter { return func(uint64) bool { return true } }
|
||||||
|
case 1:
|
||||||
|
return factories[0]
|
||||||
|
default:
|
||||||
|
return func() KmerFilter {
|
||||||
|
filters := make([]KmerFilter, len(factories))
|
||||||
|
for i, f := range factories {
|
||||||
|
filters[i] = f()
|
||||||
|
}
|
||||||
|
return func(kmer uint64) bool {
|
||||||
|
for _, f := range filters {
|
||||||
|
if !f(kmer) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// runFilter implements the "obik filter" subcommand.
|
||||||
|
// It reads an existing kmer index, applies a chain of filters,
|
||||||
|
// and writes a new filtered index.
|
||||||
|
func runFilter(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||||
|
if len(args) < 1 {
|
||||||
|
return fmt.Errorf("usage: obik filter [options] <source_index> --out <dest_index>")
|
||||||
|
}
|
||||||
|
|
||||||
|
srcDir := args[0]
|
||||||
|
destDir := CLIOutputDirectory()
|
||||||
|
if destDir == "" || destDir == "-" {
|
||||||
|
return fmt.Errorf("--out option is required and must specify a destination directory")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Open source index
|
||||||
|
src, err := obikmer.OpenKmerSetGroup(srcDir)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open source index: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
k := src.K()
|
||||||
|
|
||||||
|
// Build filter factory chain from CLI options.
|
||||||
|
// Factories are used so each goroutine creates its own filter instance,
|
||||||
|
// since some filters (e.g. KmerEntropyFilter) have mutable state.
|
||||||
|
var factories []KmerFilterFactory
|
||||||
|
var filterDescriptions []string
|
||||||
|
|
||||||
|
// Entropy filter
|
||||||
|
entropyThreshold := CLIIndexEntropyThreshold()
|
||||||
|
entropySize := CLIIndexEntropySize()
|
||||||
|
if entropyThreshold > 0 {
|
||||||
|
factories = append(factories, func() KmerFilter {
|
||||||
|
ef := obikmer.NewKmerEntropyFilter(k, entropySize, entropyThreshold)
|
||||||
|
return ef.Accept
|
||||||
|
})
|
||||||
|
filterDescriptions = append(filterDescriptions,
|
||||||
|
fmt.Sprintf("entropy(threshold=%.4f, level-max=%d)", entropyThreshold, entropySize))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Future filters will be added here, e.g.:
|
||||||
|
// quorumFilter, frequencyFilter, ...
|
||||||
|
|
||||||
|
if len(factories) == 0 {
|
||||||
|
return fmt.Errorf("no filter specified; use --entropy-filter or other filter options")
|
||||||
|
}
|
||||||
|
|
||||||
|
filterFactory := chainFilterFactories(factories)
|
||||||
|
|
||||||
|
// Resolve set selection (default: all sets)
|
||||||
|
patterns := CLISetPatterns()
|
||||||
|
var setIndices []int
|
||||||
|
if len(patterns) > 0 {
|
||||||
|
setIndices, err = src.MatchSetIDs(patterns)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to match set patterns: %w", err)
|
||||||
|
}
|
||||||
|
if len(setIndices) == 0 {
|
||||||
|
return fmt.Errorf("no sets match the given patterns")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
setIndices = make([]int, src.Size())
|
||||||
|
for i := range setIndices {
|
||||||
|
setIndices[i] = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("Filtering %d set(s) from %s with: %s",
|
||||||
|
len(setIndices), srcDir, strings.Join(filterDescriptions, " + "))
|
||||||
|
|
||||||
|
// Create destination directory
|
||||||
|
if err := os.MkdirAll(destDir, 0755); err != nil {
|
||||||
|
return fmt.Errorf("failed to create destination: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
P := src.Partitions()
|
||||||
|
|
||||||
|
// Progress bar for partition filtering
|
||||||
|
totalPartitions := len(setIndices) * P
|
||||||
|
var bar *progressbar.ProgressBar
|
||||||
|
if obidefault.ProgressBar() {
|
||||||
|
pbopt := []progressbar.Option{
|
||||||
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
|
progressbar.OptionSetWidth(15),
|
||||||
|
progressbar.OptionShowCount(),
|
||||||
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetPredictTime(true),
|
||||||
|
progressbar.OptionSetDescription("[Filtering partitions]"),
|
||||||
|
}
|
||||||
|
bar = progressbar.NewOptions(totalPartitions, pbopt...)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process each selected set
|
||||||
|
newCounts := make([]uint64, len(setIndices))
|
||||||
|
|
||||||
|
for si, srcIdx := range setIndices {
|
||||||
|
setID := src.SetIDOf(srcIdx)
|
||||||
|
if setID == "" {
|
||||||
|
setID = fmt.Sprintf("set_%d", srcIdx)
|
||||||
|
}
|
||||||
|
|
||||||
|
destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", si))
|
||||||
|
if err := os.MkdirAll(destSetDir, 0755); err != nil {
|
||||||
|
return fmt.Errorf("failed to create set directory: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process partitions in parallel
|
||||||
|
nWorkers := obidefault.ParallelWorkers()
|
||||||
|
if nWorkers > P {
|
||||||
|
nWorkers = P
|
||||||
|
}
|
||||||
|
|
||||||
|
var totalKept atomic.Uint64
|
||||||
|
var totalProcessed atomic.Uint64
|
||||||
|
|
||||||
|
type job struct {
|
||||||
|
partIdx int
|
||||||
|
}
|
||||||
|
|
||||||
|
jobs := make(chan job, P)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
var errMu sync.Mutex
|
||||||
|
var firstErr error
|
||||||
|
|
||||||
|
for w := 0; w < nWorkers; w++ {
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
// Each goroutine gets its own filter instance
|
||||||
|
workerFilter := filterFactory()
|
||||||
|
for j := range jobs {
|
||||||
|
kept, processed, err := filterPartition(
|
||||||
|
src.PartitionPath(srcIdx, j.partIdx),
|
||||||
|
filepath.Join(destSetDir, fmt.Sprintf("part_%04d.kdi", j.partIdx)),
|
||||||
|
workerFilter,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
errMu.Lock()
|
||||||
|
if firstErr == nil {
|
||||||
|
firstErr = err
|
||||||
|
}
|
||||||
|
errMu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
totalKept.Add(kept)
|
||||||
|
totalProcessed.Add(processed)
|
||||||
|
if bar != nil {
|
||||||
|
bar.Add(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
for p := 0; p < P; p++ {
|
||||||
|
jobs <- job{p}
|
||||||
|
}
|
||||||
|
close(jobs)
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
if firstErr != nil {
|
||||||
|
return fmt.Errorf("failed to filter set %q: %w", setID, firstErr)
|
||||||
|
}
|
||||||
|
|
||||||
|
kept := totalKept.Load()
|
||||||
|
processed := totalProcessed.Load()
|
||||||
|
newCounts[si] = kept
|
||||||
|
log.Infof("Set %q: %d/%d k-mers kept (%.1f%% removed)",
|
||||||
|
setID, kept, processed,
|
||||||
|
100.0*float64(processed-kept)/float64(max(processed, 1)))
|
||||||
|
|
||||||
|
// Copy spectrum.bin if it exists
|
||||||
|
srcSpecPath := src.SpectrumPath(srcIdx)
|
||||||
|
if _, err := os.Stat(srcSpecPath); err == nil {
|
||||||
|
destSpecPath := filepath.Join(destSetDir, "spectrum.bin")
|
||||||
|
if err := copyFileHelper(srcSpecPath, destSpecPath); err != nil {
|
||||||
|
log.Warnf("Could not copy spectrum for set %q: %v", setID, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if bar != nil {
|
||||||
|
fmt.Fprintln(os.Stderr)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build destination metadata
|
||||||
|
setsIDs := make([]string, len(setIndices))
|
||||||
|
setsMetadata := make([]map[string]interface{}, len(setIndices))
|
||||||
|
for i, srcIdx := range setIndices {
|
||||||
|
setsIDs[i] = src.SetIDOf(srcIdx)
|
||||||
|
setsMetadata[i] = src.AllSetMetadata(srcIdx)
|
||||||
|
if setsMetadata[i] == nil {
|
||||||
|
setsMetadata[i] = make(map[string]interface{})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write metadata for the filtered index
|
||||||
|
dest, err := obikmer.NewFilteredKmerSetGroup(
|
||||||
|
destDir, k, src.M(), P,
|
||||||
|
len(setIndices), setsIDs, newCounts, setsMetadata,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to create filtered metadata: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy group-level metadata and record applied filters
|
||||||
|
for key, value := range src.Metadata {
|
||||||
|
dest.SetAttribute(key, value)
|
||||||
|
}
|
||||||
|
if entropyThreshold > 0 {
|
||||||
|
dest.SetAttribute("entropy_filter", entropyThreshold)
|
||||||
|
dest.SetAttribute("entropy_filter_size", entropySize)
|
||||||
|
}
|
||||||
|
dest.SetAttribute("filtered_from", srcDir)
|
||||||
|
|
||||||
|
if err := dest.SaveMetadata(); err != nil {
|
||||||
|
return fmt.Errorf("failed to save metadata: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Info("Done.")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// filterPartition reads a single .kdi partition, applies the filter predicate,
|
||||||
|
// and writes the accepted k-mers to a new .kdi file.
|
||||||
|
// Returns (kept, processed, error).
|
||||||
|
func filterPartition(srcPath, destPath string, accept KmerFilter) (uint64, uint64, error) {
|
||||||
|
reader, err := obikmer.NewKdiReader(srcPath)
|
||||||
|
if err != nil {
|
||||||
|
// Empty partition — write empty KDI
|
||||||
|
w, err2 := obikmer.NewKdiWriter(destPath)
|
||||||
|
if err2 != nil {
|
||||||
|
return 0, 0, err2
|
||||||
|
}
|
||||||
|
return 0, 0, w.Close()
|
||||||
|
}
|
||||||
|
defer reader.Close()
|
||||||
|
|
||||||
|
w, err := obikmer.NewKdiWriter(destPath)
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var kept, processed uint64
|
||||||
|
for {
|
||||||
|
kmer, ok := reader.Next()
|
||||||
|
if !ok {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
processed++
|
||||||
|
if accept(kmer) {
|
||||||
|
if err := w.Write(kmer); err != nil {
|
||||||
|
w.Close()
|
||||||
|
return 0, 0, err
|
||||||
|
}
|
||||||
|
kept++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return kept, processed, w.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// copyFileHelper copies a file (used for spectrum.bin etc.)
|
||||||
|
func copyFileHelper(src, dst string) error {
|
||||||
|
in, err := os.Open(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer in.Close()
|
||||||
|
|
||||||
|
out, err := os.Create(dst)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer out.Close()
|
||||||
|
|
||||||
|
buf := make([]byte, 32*1024)
|
||||||
|
for {
|
||||||
|
n, readErr := in.Read(buf)
|
||||||
|
if n > 0 {
|
||||||
|
if _, writeErr := out.Write(buf[:n]); writeErr != nil {
|
||||||
|
return writeErr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if readErr != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out.Close()
|
||||||
|
}
|
||||||
154
pkg/obitools/obik/index.go
Normal file
154
pkg/obitools/obik/index.go
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||||
|
outDir := CLIOutputDirectory()
|
||||||
|
if outDir == "" || outDir == "-" {
|
||||||
|
return fmt.Errorf("--out option is required and must specify a directory path")
|
||||||
|
}
|
||||||
|
|
||||||
|
k := CLIKmerSize()
|
||||||
|
if k < 2 || k > 31 {
|
||||||
|
return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)
|
||||||
|
}
|
||||||
|
|
||||||
|
m := CLIMinimizerSize()
|
||||||
|
|
||||||
|
minOcc := CLIMinOccurrence()
|
||||||
|
if minOcc < 1 {
|
||||||
|
return fmt.Errorf("invalid min-occurrence: %d (must be >= 1)", minOcc)
|
||||||
|
}
|
||||||
|
|
||||||
|
maxOcc := CLIMaxOccurrence()
|
||||||
|
|
||||||
|
entropyThreshold := CLIIndexEntropyThreshold()
|
||||||
|
entropySize := CLIIndexEntropySize()
|
||||||
|
|
||||||
|
// Build options
|
||||||
|
var opts []obikmer.BuilderOption
|
||||||
|
if minOcc > 1 {
|
||||||
|
opts = append(opts, obikmer.WithMinFrequency(minOcc))
|
||||||
|
}
|
||||||
|
if maxOcc > 0 {
|
||||||
|
opts = append(opts, obikmer.WithMaxFrequency(maxOcc))
|
||||||
|
}
|
||||||
|
if topN := CLISaveFreqKmer(); topN > 0 {
|
||||||
|
opts = append(opts, obikmer.WithSaveFreqKmers(topN))
|
||||||
|
}
|
||||||
|
if entropyThreshold > 0 {
|
||||||
|
opts = append(opts, obikmer.WithEntropyFilter(entropyThreshold, entropySize))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine whether to append to existing group or create new
|
||||||
|
var builder *obikmer.KmerSetGroupBuilder
|
||||||
|
var err error
|
||||||
|
metaPath := filepath.Join(outDir, "metadata.toml")
|
||||||
|
if _, statErr := os.Stat(metaPath); statErr == nil {
|
||||||
|
// Existing group: append
|
||||||
|
log.Infof("Appending to existing kmer index at %s", outDir)
|
||||||
|
builder, err = obikmer.AppendKmerSetGroupBuilder(outDir, 1, opts...)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open existing kmer index for appending: %w", err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// New group
|
||||||
|
if maxOcc > 0 {
|
||||||
|
log.Infof("Creating new kmer index: k=%d, m=%d, occurrence=[%d,%d]", k, m, minOcc, maxOcc)
|
||||||
|
} else {
|
||||||
|
log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
|
||||||
|
}
|
||||||
|
builder, err = obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to create kmer index builder: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read and process sequences in parallel
|
||||||
|
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open sequence files: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
nworkers := obidefault.ParallelWorkers()
|
||||||
|
var seqCount atomic.Int64
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
|
consumer := func(iter obiiter.IBioSequence) {
|
||||||
|
defer wg.Done()
|
||||||
|
for iter.Next() {
|
||||||
|
batch := iter.Get()
|
||||||
|
for _, seq := range batch.Slice() {
|
||||||
|
builder.AddSequence(0, seq)
|
||||||
|
seqCount.Add(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 1; i < nworkers; i++ {
|
||||||
|
wg.Add(1)
|
||||||
|
go consumer(sequences.Split())
|
||||||
|
}
|
||||||
|
wg.Add(1)
|
||||||
|
go consumer(sequences)
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
log.Infof("Processed %d sequences", seqCount.Load())
|
||||||
|
|
||||||
|
// Finalize
|
||||||
|
ksg, err := builder.Close()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to finalize kmer index: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply index-id to the new set
|
||||||
|
newSetIdx := builder.StartIndex()
|
||||||
|
if id := CLIIndexId(); id != "" {
|
||||||
|
ksg.SetSetID(newSetIdx, id)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply group-level tags (-S)
|
||||||
|
for key, value := range CLISetTag() {
|
||||||
|
ksg.SetAttribute(key, value)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply per-set tags (-T) to the new set
|
||||||
|
for key, value := range _setMetaTags {
|
||||||
|
ksg.SetSetMetadata(newSetIdx, key, value)
|
||||||
|
}
|
||||||
|
|
||||||
|
if minOcc > 1 {
|
||||||
|
ksg.SetAttribute("min_occurrence", minOcc)
|
||||||
|
}
|
||||||
|
if maxOcc > 0 {
|
||||||
|
ksg.SetAttribute("max_occurrence", maxOcc)
|
||||||
|
}
|
||||||
|
|
||||||
|
if entropyThreshold > 0 {
|
||||||
|
ksg.SetAttribute("entropy_filter", entropyThreshold)
|
||||||
|
ksg.SetAttribute("entropy_filter_size", entropySize)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := ksg.SaveMetadata(); err != nil {
|
||||||
|
return fmt.Errorf("failed to save metadata: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("Index contains %d k-mers for set %d in %s", ksg.Len(newSetIdx), newSetIdx, outDir)
|
||||||
|
log.Info("Done.")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
@@ -1,39 +1,22 @@
|
|||||||
package obilowmask
|
package obik
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
)
|
)
|
||||||
|
|
||||||
// MaskingMode defines how to handle low-complexity regions
|
// lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
|
||||||
type MaskingMode int
|
func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte, keepShorter bool) obiseq.SeqWorker {
|
||||||
|
|
||||||
const (
|
|
||||||
Mask MaskingMode = iota // Mask mode: replace low-complexity regions with masked characters
|
|
||||||
Split // Split mode: split sequence into high-complexity fragments
|
|
||||||
Extract
|
|
||||||
)
|
|
||||||
|
|
||||||
// LowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
|
|
||||||
//
|
|
||||||
// Algorithm principle:
|
|
||||||
// Calculate the normalized entropy of each k-mer at different scales (wordSize = 1 to level_max).
|
|
||||||
// K-mers with entropy below the threshold are masked.
|
|
||||||
//
|
|
||||||
// Parameters:
|
|
||||||
// - kmer_size: size of the sliding window for entropy calculation
|
|
||||||
// - level_max: maximum word size used for entropy calculation (finest scale)
|
|
||||||
// - threshold: normalized entropy threshold below which masking occurs (between 0 and 1)
|
|
||||||
// - mode: Mask (masking) or Split (splitting)
|
|
||||||
// - maskChar: character used for masking (typically 'n' or 'N')
|
|
||||||
//
|
|
||||||
// Returns: a SeqWorker function that can be applied to each sequence
|
|
||||||
func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
|
|
||||||
|
|
||||||
nLogN := make([]float64, kmer_size+1)
|
nLogN := make([]float64, kmer_size+1)
|
||||||
for i := 1; i <= kmer_size; i++ {
|
for i := 1; i <= kmer_size; i++ {
|
||||||
@@ -87,6 +70,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
data[i] = deque[0].value
|
data[i] = deque[0].value
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
emaxValues := make([]float64, level_max+1)
|
emaxValues := make([]float64, level_max+1)
|
||||||
logNwords := make([]float64, level_max+1)
|
logNwords := make([]float64, level_max+1)
|
||||||
for ws := 1; ws <= level_max; ws++ {
|
for ws := 1; ws <= level_max; ws++ {
|
||||||
@@ -259,11 +243,14 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
}
|
}
|
||||||
if inlow && !masked {
|
if inlow && !masked {
|
||||||
if fromlow >= 0 {
|
if fromlow >= 0 {
|
||||||
frg, err := sequence.Subsequence(fromlow, i, false)
|
frgLen := i - fromlow
|
||||||
if err != nil {
|
if keepShorter || frgLen >= kmer_size {
|
||||||
return nil, err
|
frg, err := sequence.Subsequence(fromlow, i, false)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rep.Push(frg)
|
||||||
}
|
}
|
||||||
rep.Push(frg)
|
|
||||||
}
|
}
|
||||||
inlow = false
|
inlow = false
|
||||||
fromlow = -1
|
fromlow = -1
|
||||||
@@ -271,11 +258,14 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
}
|
}
|
||||||
|
|
||||||
if inlow && fromlow >= 0 {
|
if inlow && fromlow >= 0 {
|
||||||
frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
|
frgLen := len(maskPosition) - fromlow
|
||||||
if err != nil {
|
if keepShorter || frgLen >= kmer_size {
|
||||||
return nil, err
|
frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rep.Push(frg)
|
||||||
}
|
}
|
||||||
rep.Push(frg)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return *rep, nil
|
return *rep, nil
|
||||||
@@ -293,11 +283,14 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
}
|
}
|
||||||
if inhigh && masked {
|
if inhigh && masked {
|
||||||
if fromhigh >= 0 {
|
if fromhigh >= 0 {
|
||||||
frg, err := sequence.Subsequence(fromhigh, i, false)
|
frgLen := i - fromhigh
|
||||||
if err != nil {
|
if keepShorter || frgLen >= kmer_size {
|
||||||
return nil, err
|
frg, err := sequence.Subsequence(fromhigh, i, false)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rep.Push(frg)
|
||||||
}
|
}
|
||||||
rep.Push(frg)
|
|
||||||
}
|
}
|
||||||
inhigh = false
|
inhigh = false
|
||||||
fromhigh = -1
|
fromhigh = -1
|
||||||
@@ -305,11 +298,14 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
}
|
}
|
||||||
|
|
||||||
if inhigh && fromhigh >= 0 {
|
if inhigh && fromhigh >= 0 {
|
||||||
frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
|
frgLen := len(maskPosition) - fromhigh
|
||||||
if err != nil {
|
if keepShorter || frgLen >= kmer_size {
|
||||||
return nil, err
|
frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rep.Push(frg)
|
||||||
}
|
}
|
||||||
rep.Push(frg)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return *rep, nil
|
return *rep, nil
|
||||||
@@ -322,14 +318,22 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
for i := range remove {
|
for i := range remove {
|
||||||
remove[i] = true
|
remove[i] = true
|
||||||
}
|
}
|
||||||
return applyMaskMode(sequence, remove, maskChar)
|
switch mode {
|
||||||
|
case MaskMode:
|
||||||
|
return applyMaskMode(sequence, remove, maskChar)
|
||||||
|
case SplitMode:
|
||||||
|
return selectunmasked(sequence, remove)
|
||||||
|
case ExtractMode:
|
||||||
|
return selectMasked(sequence, remove)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("unknown mode %d", mode)
|
||||||
}
|
}
|
||||||
|
|
||||||
bseq := sequence.Sequence()
|
bseq := sequence.Sequence()
|
||||||
|
|
||||||
maskPositions := maskAmbiguities(bseq)
|
maskPositions := maskAmbiguities(bseq)
|
||||||
|
|
||||||
mask := make([]int, len(bseq))
|
maskFlags := make([]int, len(bseq))
|
||||||
entropies := make([]float64, len(bseq))
|
entropies := make([]float64, len(bseq))
|
||||||
for i := range entropies {
|
for i := range entropies {
|
||||||
entropies[i] = 4.0
|
entropies[i] = 4.0
|
||||||
@@ -343,7 +347,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
|
|
||||||
for i := range bseq {
|
for i := range bseq {
|
||||||
v := level_max
|
v := level_max
|
||||||
mask[i] = v
|
maskFlags[i] = v
|
||||||
}
|
}
|
||||||
|
|
||||||
for ws := level_max - 1; ws > 0; ws-- {
|
for ws := level_max - 1; ws > 0; ws-- {
|
||||||
@@ -351,7 +355,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
for i, e2 := range entropies2 {
|
for i, e2 := range entropies2 {
|
||||||
if e2 < entropies[i] {
|
if e2 < entropies[i] {
|
||||||
entropies[i] = e2
|
entropies[i] = e2
|
||||||
mask[i] = ws
|
maskFlags[i] = ws
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -367,39 +371,49 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
remove[i] = e <= threshold
|
remove[i] = e <= threshold
|
||||||
}
|
}
|
||||||
|
|
||||||
sequence.SetAttribute("mask", mask)
|
sequence.SetAttribute("mask", maskFlags)
|
||||||
sequence.SetAttribute("Entropies", entropies)
|
sequence.SetAttribute("Entropies", entropies)
|
||||||
|
|
||||||
switch mode {
|
switch mode {
|
||||||
case Mask:
|
case MaskMode:
|
||||||
return applyMaskMode(sequence, remove, maskChar)
|
return applyMaskMode(sequence, remove, maskChar)
|
||||||
case Split:
|
case SplitMode:
|
||||||
return selectunmasked(sequence, remove)
|
return selectunmasked(sequence, remove)
|
||||||
case Extract:
|
case ExtractMode:
|
||||||
return selectMasked(sequence, remove)
|
return selectMasked(sequence, remove)
|
||||||
}
|
}
|
||||||
return nil, fmt.Errorf("Unknown mode %d", mode)
|
return nil, fmt.Errorf("unknown mode %d", mode)
|
||||||
}
|
}
|
||||||
|
|
||||||
return masking
|
return masking
|
||||||
}
|
}
|
||||||
|
|
||||||
// CLISequenceEntropyMasker creates an iterator that applies entropy masking
|
// runLowmask implements the "obik lowmask" subcommand.
|
||||||
// to all sequences in an input iterator.
|
// It masks low-complexity regions in DNA sequences using entropy-based detection.
|
||||||
//
|
func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||||
// Uses command-line parameters to configure the worker.
|
kmerSize := CLIKmerSize()
|
||||||
func CLISequenceEntropyMasker(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
levelMax := CLIEntropySize()
|
||||||
var newIter obiiter.IBioSequence
|
threshold := CLIEntropyThreshold()
|
||||||
|
mode := CLIMaskingMode()
|
||||||
|
maskChar := CLIMaskingChar()
|
||||||
|
|
||||||
worker := LowMaskWorker(
|
log.Printf("Low-complexity masking: kmer-size=%d, entropy-size=%d, threshold=%.4f", kmerSize, levelMax, threshold)
|
||||||
CLIKmerSize(),
|
|
||||||
CLILevelMax(),
|
|
||||||
CLIThreshold(),
|
|
||||||
CLIMaskingMode(),
|
|
||||||
CLIMaskingChar(),
|
|
||||||
)
|
|
||||||
|
|
||||||
newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers())
|
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open sequence files: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
return newIter.FilterEmpty()
|
worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar, CLIKeepShorter())
|
||||||
|
|
||||||
|
masked := sequences.MakeIWorker(
|
||||||
|
worker,
|
||||||
|
false,
|
||||||
|
obidefault.ParallelWorkers(),
|
||||||
|
).FilterEmpty()
|
||||||
|
|
||||||
|
obiconvert.CLIWriteBioSequences(masked, true)
|
||||||
|
obiutils.WaitForLastPipe()
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
96
pkg/obitools/obik/ls.go
Normal file
96
pkg/obitools/obik/ls.go
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
"gopkg.in/yaml.v3"
|
||||||
|
)
|
||||||
|
|
||||||
|
type setEntry struct {
|
||||||
|
Index int `json:"index" yaml:"index"`
|
||||||
|
ID string `json:"id" yaml:"id"`
|
||||||
|
Count uint64 `json:"count" yaml:"count"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func runLs(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||||
|
if len(args) < 1 {
|
||||||
|
return fmt.Errorf("usage: obik ls [options] <index_directory>")
|
||||||
|
}
|
||||||
|
|
||||||
|
ksg, err := obikmer.OpenKmerSetGroup(args[0])
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open kmer index: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine which sets to show
|
||||||
|
patterns := CLISetPatterns()
|
||||||
|
var indices []int
|
||||||
|
if len(patterns) > 0 {
|
||||||
|
indices, err = ksg.MatchSetIDs(patterns)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
indices = make([]int, ksg.Size())
|
||||||
|
for i := range indices {
|
||||||
|
indices[i] = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
entries := make([]setEntry, len(indices))
|
||||||
|
for i, idx := range indices {
|
||||||
|
entries[i] = setEntry{
|
||||||
|
Index: idx,
|
||||||
|
ID: ksg.SetIDOf(idx),
|
||||||
|
Count: ksg.Len(idx),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
format := CLIOutFormat()
|
||||||
|
switch format {
|
||||||
|
case "json":
|
||||||
|
return outputLsJSON(entries)
|
||||||
|
case "yaml":
|
||||||
|
return outputLsYAML(entries)
|
||||||
|
case "csv":
|
||||||
|
return outputLsCSV(entries)
|
||||||
|
default:
|
||||||
|
return outputLsCSV(entries)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func outputLsCSV(entries []setEntry) error {
|
||||||
|
fmt.Println("index,id,count")
|
||||||
|
for _, e := range entries {
|
||||||
|
// Escape commas in ID if needed
|
||||||
|
id := e.ID
|
||||||
|
if strings.ContainsAny(id, ",\"") {
|
||||||
|
id = "\"" + strings.ReplaceAll(id, "\"", "\"\"") + "\""
|
||||||
|
}
|
||||||
|
fmt.Printf("%d,%s,%d\n", e.Index, id, e.Count)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func outputLsJSON(entries []setEntry) error {
|
||||||
|
data, err := json.MarshalIndent(entries, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
fmt.Println(string(data))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func outputLsYAML(entries []setEntry) error {
|
||||||
|
data, err := yaml.Marshal(entries)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
fmt.Print(string(data))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
221
pkg/obitools/obik/match.go
Normal file
221
pkg/obitools/obik/match.go
Normal file
@@ -0,0 +1,221 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// defaultMatchQueryThreshold is the minimum number of k-mer entries to
|
||||||
|
// accumulate before launching a MatchBatch. Larger values amortize the
|
||||||
|
// cost of opening .kdi files across more query k-mers.
|
||||||
|
const defaultMatchQueryThreshold = 10_000_000
|
||||||
|
|
||||||
|
// preparedBatch pairs a batch with its pre-computed queries.
|
||||||
|
type preparedBatch struct {
|
||||||
|
batch obiiter.BioSequenceBatch
|
||||||
|
seqs []*obiseq.BioSequence
|
||||||
|
queries *obikmer.PreparedQueries
|
||||||
|
}
|
||||||
|
|
||||||
|
// accumulatedWork holds multiple prepared batches whose queries have been
|
||||||
|
// merged into a single PreparedQueries. The flat seqs slice allows
|
||||||
|
// MatchBatch results (indexed by merged SeqIdx) to be mapped back to
|
||||||
|
// the original sequences.
|
||||||
|
type accumulatedWork struct {
|
||||||
|
batches []obiiter.BioSequenceBatch // original batches in order
|
||||||
|
seqs []*obiseq.BioSequence // flat: seqs from all batches concatenated
|
||||||
|
queries *obikmer.PreparedQueries // merged queries with rebased SeqIdx
|
||||||
|
}
|
||||||
|
|
||||||
|
// runMatch implements the "obik match" subcommand.
|
||||||
|
//
|
||||||
|
// Pipeline architecture (no shared mutable state between stages):
|
||||||
|
//
|
||||||
|
// [input batches]
|
||||||
|
// │ Split across nCPU goroutines
|
||||||
|
// ▼
|
||||||
|
// PrepareQueries (CPU, parallel)
|
||||||
|
// │ preparedCh
|
||||||
|
// ▼
|
||||||
|
// Accumulate & MergeQueries (1 goroutine)
|
||||||
|
// │ matchCh — fires when totalKmers >= threshold
|
||||||
|
// ▼
|
||||||
|
// MatchBatch + annotate (1 goroutine, internal parallelism per partition)
|
||||||
|
// │
|
||||||
|
// ▼
|
||||||
|
// [output batches]
|
||||||
|
func runMatch(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||||
|
indexDir := CLIIndexDirectory()
|
||||||
|
|
||||||
|
// Open the k-mer index
|
||||||
|
ksg, err := obikmer.OpenKmerSetGroup(indexDir)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open kmer index: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("Opened index: k=%d, m=%d, %d partitions, %d set(s)",
|
||||||
|
ksg.K(), ksg.M(), ksg.Partitions(), ksg.Size())
|
||||||
|
|
||||||
|
// Resolve which sets to match against
|
||||||
|
patterns := CLISetPatterns()
|
||||||
|
var setIndices []int
|
||||||
|
if len(patterns) > 0 {
|
||||||
|
setIndices, err = ksg.MatchSetIDs(patterns)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to match set patterns: %w", err)
|
||||||
|
}
|
||||||
|
if len(setIndices) == 0 {
|
||||||
|
return fmt.Errorf("no sets match the given patterns")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
setIndices = make([]int, ksg.Size())
|
||||||
|
for i := range setIndices {
|
||||||
|
setIndices[i] = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, idx := range setIndices {
|
||||||
|
id := ksg.SetIDOf(idx)
|
||||||
|
if id == "" {
|
||||||
|
id = fmt.Sprintf("set_%d", idx)
|
||||||
|
}
|
||||||
|
log.Infof("Matching against set %d (%s): %d k-mers", idx, id, ksg.Len(idx))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read input sequences
|
||||||
|
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open sequence files: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
nworkers := obidefault.ParallelWorkers()
|
||||||
|
|
||||||
|
// --- Stage 1: Prepare queries in parallel ---
|
||||||
|
preparedCh := make(chan preparedBatch, nworkers)
|
||||||
|
|
||||||
|
var prepWg sync.WaitGroup
|
||||||
|
preparer := func(iter obiiter.IBioSequence) {
|
||||||
|
defer prepWg.Done()
|
||||||
|
for iter.Next() {
|
||||||
|
batch := iter.Get()
|
||||||
|
slice := batch.Slice()
|
||||||
|
|
||||||
|
seqs := make([]*obiseq.BioSequence, len(slice))
|
||||||
|
for i, s := range slice {
|
||||||
|
seqs[i] = s
|
||||||
|
}
|
||||||
|
|
||||||
|
pq := ksg.PrepareQueries(seqs)
|
||||||
|
|
||||||
|
preparedCh <- preparedBatch{
|
||||||
|
batch: batch,
|
||||||
|
seqs: seqs,
|
||||||
|
queries: pq,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 1; i < nworkers; i++ {
|
||||||
|
prepWg.Add(1)
|
||||||
|
go preparer(sequences.Split())
|
||||||
|
}
|
||||||
|
prepWg.Add(1)
|
||||||
|
go preparer(sequences)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
prepWg.Wait()
|
||||||
|
close(preparedCh)
|
||||||
|
}()
|
||||||
|
|
||||||
|
// --- Stage 2: Accumulate & merge queries ---
|
||||||
|
matchCh := make(chan *accumulatedWork, 2)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
defer close(matchCh)
|
||||||
|
|
||||||
|
var acc *accumulatedWork
|
||||||
|
|
||||||
|
for pb := range preparedCh {
|
||||||
|
if acc == nil {
|
||||||
|
acc = &accumulatedWork{
|
||||||
|
batches: []obiiter.BioSequenceBatch{pb.batch},
|
||||||
|
seqs: pb.seqs,
|
||||||
|
queries: pb.queries,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Merge this batch's queries into the accumulator
|
||||||
|
obikmer.MergeQueries(acc.queries, pb.queries)
|
||||||
|
acc.batches = append(acc.batches, pb.batch)
|
||||||
|
acc.seqs = append(acc.seqs, pb.seqs...)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Flush when we exceed the threshold
|
||||||
|
if acc.queries.NKmers >= defaultMatchQueryThreshold {
|
||||||
|
matchCh <- acc
|
||||||
|
acc = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Flush remaining
|
||||||
|
if acc != nil {
|
||||||
|
matchCh <- acc
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// --- Stage 3: Match & annotate ---
|
||||||
|
output := obiiter.MakeIBioSequence()
|
||||||
|
if sequences.IsPaired() {
|
||||||
|
output.MarkAsPaired()
|
||||||
|
}
|
||||||
|
|
||||||
|
output.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer output.Done()
|
||||||
|
|
||||||
|
for work := range matchCh {
|
||||||
|
// Match against each selected set
|
||||||
|
for _, setIdx := range setIndices {
|
||||||
|
result := ksg.MatchBatch(setIdx, work.queries)
|
||||||
|
|
||||||
|
setID := ksg.SetIDOf(setIdx)
|
||||||
|
if setID == "" {
|
||||||
|
setID = fmt.Sprintf("set_%d", setIdx)
|
||||||
|
}
|
||||||
|
attrName := "kmer_matched_" + setID
|
||||||
|
|
||||||
|
for seqIdx, positions := range result {
|
||||||
|
if len(positions) > 0 {
|
||||||
|
work.seqs[seqIdx].SetAttribute(attrName, positions)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Push annotated batches to output
|
||||||
|
for _, b := range work.batches {
|
||||||
|
output.Push(b)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Help GC
|
||||||
|
work.seqs = nil
|
||||||
|
work.queries = nil
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
go output.WaitAndClose()
|
||||||
|
|
||||||
|
obiconvert.CLIWriteBioSequences(output, true)
|
||||||
|
obiutils.WaitForLastPipe()
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
63
pkg/obitools/obik/mv.go
Normal file
63
pkg/obitools/obik/mv.go
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
func runMv(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||||
|
if len(args) < 2 {
|
||||||
|
return fmt.Errorf("usage: obik mv [--set PATTERN]... [--force] <source_index> <dest_index>")
|
||||||
|
}
|
||||||
|
|
||||||
|
srcDir := args[0]
|
||||||
|
destDir := args[1]
|
||||||
|
|
||||||
|
ksg, err := obikmer.OpenKmerSetGroup(srcDir)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open source kmer index: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resolve set patterns
|
||||||
|
patterns := CLISetPatterns()
|
||||||
|
var ids []string
|
||||||
|
if len(patterns) > 0 {
|
||||||
|
indices, err := ksg.MatchSetIDs(patterns)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if len(indices) == 0 {
|
||||||
|
return fmt.Errorf("no sets match the given patterns")
|
||||||
|
}
|
||||||
|
ids = make([]string, len(indices))
|
||||||
|
for i, idx := range indices {
|
||||||
|
ids[i] = ksg.SetIDOf(idx)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Move all sets
|
||||||
|
ids = ksg.SetsIDs()
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("Moving %d set(s) from %s to %s", len(ids), srcDir, destDir)
|
||||||
|
|
||||||
|
// Copy first
|
||||||
|
dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce())
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove from source (in reverse order to avoid renumbering issues)
|
||||||
|
for i := len(ids) - 1; i >= 0; i-- {
|
||||||
|
if err := ksg.RemoveSetByID(ids[i]); err != nil {
|
||||||
|
return fmt.Errorf("failed to remove set %q from source after copy: %w", ids[i], err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("Destination now has %d set(s), source has %d set(s)", dest.Size(), ksg.Size())
|
||||||
|
return nil
|
||||||
|
}
|
||||||
85
pkg/obitools/obik/obik.go
Normal file
85
pkg/obitools/obik/obik.go
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// OptionSet registers all obik subcommands on the root GetOpt.
|
||||||
|
func OptionSet(opt *getoptions.GetOpt) {
|
||||||
|
// index: build or extend a kmer index from sequence files
|
||||||
|
indexCmd := opt.NewCommand("index", "Build a disk-based kmer index from sequence files")
|
||||||
|
obiconvert.InputOptionSet(indexCmd)
|
||||||
|
obiconvert.OutputModeOptionSet(indexCmd, false)
|
||||||
|
KmerIndexOptionSet(indexCmd)
|
||||||
|
indexCmd.StringMapVar(&_setMetaTags, "tag", 1, 1,
|
||||||
|
indexCmd.Alias("T"),
|
||||||
|
indexCmd.ArgName("KEY=VALUE"),
|
||||||
|
indexCmd.Description("Per-set metadata tag (repeatable)."))
|
||||||
|
indexCmd.SetCommandFn(runIndex)
|
||||||
|
|
||||||
|
// ls: list sets in a kmer index
|
||||||
|
lsCmd := opt.NewCommand("ls", "List sets in a kmer index")
|
||||||
|
OutputFormatOptionSet(lsCmd)
|
||||||
|
SetSelectionOptionSet(lsCmd)
|
||||||
|
lsCmd.SetCommandFn(runLs)
|
||||||
|
|
||||||
|
// summary: detailed statistics
|
||||||
|
summaryCmd := opt.NewCommand("summary", "Show detailed statistics of a kmer index")
|
||||||
|
OutputFormatOptionSet(summaryCmd)
|
||||||
|
summaryCmd.BoolVar(&_jaccard, "jaccard", false,
|
||||||
|
summaryCmd.Description("Compute and display pairwise Jaccard distance matrix."))
|
||||||
|
summaryCmd.SetCommandFn(runSummary)
|
||||||
|
|
||||||
|
// cp: copy sets between indices
|
||||||
|
cpCmd := opt.NewCommand("cp", "Copy sets between kmer indices")
|
||||||
|
SetSelectionOptionSet(cpCmd)
|
||||||
|
ForceOptionSet(cpCmd)
|
||||||
|
cpCmd.SetCommandFn(runCp)
|
||||||
|
|
||||||
|
// mv: move sets between indices
|
||||||
|
mvCmd := opt.NewCommand("mv", "Move sets between kmer indices")
|
||||||
|
SetSelectionOptionSet(mvCmd)
|
||||||
|
ForceOptionSet(mvCmd)
|
||||||
|
mvCmd.SetCommandFn(runMv)
|
||||||
|
|
||||||
|
// rm: remove sets from an index
|
||||||
|
rmCmd := opt.NewCommand("rm", "Remove sets from a kmer index")
|
||||||
|
SetSelectionOptionSet(rmCmd)
|
||||||
|
rmCmd.SetCommandFn(runRm)
|
||||||
|
|
||||||
|
// spectrum: output k-mer frequency spectrum as CSV
|
||||||
|
spectrumCmd := opt.NewCommand("spectrum", "Output k-mer frequency spectrum as CSV")
|
||||||
|
SetSelectionOptionSet(spectrumCmd)
|
||||||
|
obiconvert.OutputModeOptionSet(spectrumCmd, false)
|
||||||
|
spectrumCmd.SetCommandFn(runSpectrum)
|
||||||
|
|
||||||
|
// super: extract super k-mers from sequences
|
||||||
|
superCmd := opt.NewCommand("super", "Extract super k-mers from sequence files")
|
||||||
|
obiconvert.InputOptionSet(superCmd)
|
||||||
|
obiconvert.OutputOptionSet(superCmd)
|
||||||
|
SuperKmerOptionSet(superCmd)
|
||||||
|
superCmd.SetCommandFn(runSuper)
|
||||||
|
|
||||||
|
// lowmask: mask low-complexity regions
|
||||||
|
lowmaskCmd := opt.NewCommand("lowmask", "Mask low-complexity regions in sequences using entropy")
|
||||||
|
obiconvert.InputOptionSet(lowmaskCmd)
|
||||||
|
obiconvert.OutputOptionSet(lowmaskCmd)
|
||||||
|
LowMaskOptionSet(lowmaskCmd)
|
||||||
|
lowmaskCmd.SetCommandFn(runLowmask)
|
||||||
|
|
||||||
|
// match: annotate sequences with k-mer match positions from an index
|
||||||
|
matchCmd := opt.NewCommand("match", "Annotate sequences with k-mer match positions from an index")
|
||||||
|
IndexDirectoryOptionSet(matchCmd)
|
||||||
|
obiconvert.InputOptionSet(matchCmd)
|
||||||
|
obiconvert.OutputOptionSet(matchCmd)
|
||||||
|
SetSelectionOptionSet(matchCmd)
|
||||||
|
matchCmd.SetCommandFn(runMatch)
|
||||||
|
|
||||||
|
// filter: filter an index to remove low-complexity k-mers
|
||||||
|
filterCmd := opt.NewCommand("filter", "Filter a kmer index to remove low-complexity k-mers")
|
||||||
|
obiconvert.OutputModeOptionSet(filterCmd, false)
|
||||||
|
EntropyFilterOptionSet(filterCmd)
|
||||||
|
SetSelectionOptionSet(filterCmd)
|
||||||
|
filterCmd.SetCommandFn(runFilter)
|
||||||
|
}
|
||||||
360
pkg/obitools/obik/options.go
Normal file
360
pkg/obitools/obik/options.go
Normal file
@@ -0,0 +1,360 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MaskingMode defines how to handle low-complexity regions
|
||||||
|
type MaskingMode int
|
||||||
|
|
||||||
|
const (
|
||||||
|
MaskMode MaskingMode = iota // Replace low-complexity regions with masked characters
|
||||||
|
SplitMode // Split sequence into high-complexity fragments
|
||||||
|
ExtractMode // Extract low-complexity fragments
|
||||||
|
)
|
||||||
|
|
||||||
|
// Output format flags
|
||||||
|
var _jsonOutput bool
|
||||||
|
var _csvOutput bool
|
||||||
|
var _yamlOutput bool
|
||||||
|
|
||||||
|
// Set selection flags
|
||||||
|
var _setPatterns []string
|
||||||
|
|
||||||
|
// Force flag
|
||||||
|
var _force bool
|
||||||
|
|
||||||
|
// Jaccard flag
|
||||||
|
var _jaccard bool
|
||||||
|
|
||||||
|
// Per-set tags for index subcommand
|
||||||
|
var _setMetaTags = make(map[string]string, 0)
|
||||||
|
|
||||||
|
// ==============================
|
||||||
|
// Shared kmer options (used by index, super, lowmask)
|
||||||
|
// ==============================
|
||||||
|
|
||||||
|
var _kmerSize = 31
|
||||||
|
var _minimizerSize = -1 // -1 means auto: ceil(k / 2.5)
|
||||||
|
|
||||||
|
// KmerSizeOptionSet registers --kmer-size / -k.
|
||||||
|
// Shared by index, super, and lowmask subcommands.
|
||||||
|
func KmerSizeOptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
|
||||||
|
options.Alias("k"),
|
||||||
|
options.Description("Size of k-mers (must be between 2 and 31)."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// MinimizerOptionSet registers --minimizer-size / -m.
|
||||||
|
// Shared by index and super subcommands.
|
||||||
|
func MinimizerOptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize,
|
||||||
|
options.Alias("m"),
|
||||||
|
options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5))."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==============================
|
||||||
|
// Lowmask-specific options
|
||||||
|
// ==============================
|
||||||
|
|
||||||
|
var _entropySize = 6
|
||||||
|
var _entropyThreshold = 0.5
|
||||||
|
var _splitMode = false
|
||||||
|
var _extractMode = false
|
||||||
|
var _maskingChar = "."
|
||||||
|
var _keepShorter = false
|
||||||
|
|
||||||
|
// LowMaskOptionSet registers options specific to low-complexity masking.
|
||||||
|
func LowMaskOptionSet(options *getoptions.GetOpt) {
|
||||||
|
KmerSizeOptionSet(options)
|
||||||
|
|
||||||
|
options.IntVar(&_entropySize, "entropy-size", _entropySize,
|
||||||
|
options.Description("Maximum word size considered for entropy estimate."))
|
||||||
|
|
||||||
|
options.Float64Var(&_entropyThreshold, "threshold", _entropyThreshold,
|
||||||
|
options.Description("Entropy threshold below which a kmer is masked (0 to 1)."))
|
||||||
|
|
||||||
|
options.BoolVar(&_splitMode, "extract-high", _splitMode,
|
||||||
|
options.Description("Extract only high-complexity regions."))
|
||||||
|
|
||||||
|
options.BoolVar(&_extractMode, "extract-low", _extractMode,
|
||||||
|
options.Description("Extract only low-complexity regions."))
|
||||||
|
|
||||||
|
options.StringVar(&_maskingChar, "masking-char", _maskingChar,
|
||||||
|
options.Description("Character used to mask low complexity regions."))
|
||||||
|
|
||||||
|
options.BoolVar(&_keepShorter, "keep-shorter", _keepShorter,
|
||||||
|
options.Description("Keep fragments shorter than kmer-size in split/extract mode."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==============================
|
||||||
|
// Index-specific options
|
||||||
|
// ==============================
|
||||||
|
|
||||||
|
var _indexId = ""
|
||||||
|
var _metadataFormat = "toml"
|
||||||
|
var _setTag = make(map[string]string, 0)
|
||||||
|
var _minOccurrence = 1
|
||||||
|
var _maxOccurrence = 0
|
||||||
|
var _saveFullFilter = false
|
||||||
|
var _saveFreqKmer = 0
|
||||||
|
var _indexEntropyThreshold = 0.0
|
||||||
|
var _indexEntropySize = 6
|
||||||
|
|
||||||
|
// KmerIndexOptionSet defines every option related to kmer index building.
|
||||||
|
func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
||||||
|
KmerSizeOptionSet(options)
|
||||||
|
MinimizerOptionSet(options)
|
||||||
|
|
||||||
|
options.StringVar(&_indexId, "index-id", _indexId,
|
||||||
|
options.Description("Identifier for the kmer index."))
|
||||||
|
|
||||||
|
options.StringVar(&_metadataFormat, "metadata-format", _metadataFormat,
|
||||||
|
options.Description("Format for metadata file (toml, yaml, json)."))
|
||||||
|
|
||||||
|
options.StringMapVar(&_setTag, "set-tag", 1, 1,
|
||||||
|
options.Alias("S"),
|
||||||
|
options.ArgName("KEY=VALUE"),
|
||||||
|
options.Description("Adds a group-level metadata attribute KEY with value VALUE."))
|
||||||
|
|
||||||
|
options.IntVar(&_minOccurrence, "min-occurrence", _minOccurrence,
|
||||||
|
options.Description("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all)."))
|
||||||
|
|
||||||
|
options.IntVar(&_maxOccurrence, "max-occurrence", _maxOccurrence,
|
||||||
|
options.Description("Maximum number of occurrences for a k-mer to be kept (default 0 = no upper bound)."))
|
||||||
|
|
||||||
|
options.BoolVar(&_saveFullFilter, "save-full-filter", _saveFullFilter,
|
||||||
|
options.Description("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index."))
|
||||||
|
|
||||||
|
options.IntVar(&_saveFreqKmer, "save-freq-kmer", _saveFreqKmer,
|
||||||
|
options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv)."))
|
||||||
|
|
||||||
|
options.Float64Var(&_indexEntropyThreshold, "entropy-filter", _indexEntropyThreshold,
|
||||||
|
options.Description("Filter low-complexity k-mers with entropy <= threshold (0 = disabled)."))
|
||||||
|
|
||||||
|
options.IntVar(&_indexEntropySize, "entropy-filter-size", _indexEntropySize,
|
||||||
|
options.Description("Maximum word size for entropy filter computation (default 6)."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// EntropyFilterOptionSet registers entropy filter options for commands
|
||||||
|
// that process existing indices (e.g. filter).
|
||||||
|
func EntropyFilterOptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.Float64Var(&_indexEntropyThreshold, "entropy-filter", _indexEntropyThreshold,
|
||||||
|
options.Description("Filter low-complexity k-mers with entropy <= threshold (0 = disabled)."))
|
||||||
|
|
||||||
|
options.IntVar(&_indexEntropySize, "entropy-filter-size", _indexEntropySize,
|
||||||
|
options.Description("Maximum word size for entropy filter computation (default 6)."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==============================
|
||||||
|
// Super kmer options
|
||||||
|
// ==============================
|
||||||
|
|
||||||
|
// SuperKmerOptionSet registers options specific to super k-mer extraction.
|
||||||
|
func SuperKmerOptionSet(options *getoptions.GetOpt) {
|
||||||
|
KmerSizeOptionSet(options)
|
||||||
|
MinimizerOptionSet(options)
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIKmerSize returns the k-mer size.
|
||||||
|
func CLIKmerSize() int {
|
||||||
|
return _kmerSize
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIMinimizerSize returns the effective minimizer size.
|
||||||
|
func CLIMinimizerSize() int {
|
||||||
|
m := _minimizerSize
|
||||||
|
if m < 0 {
|
||||||
|
m = obikmer.DefaultMinimizerSize(_kmerSize)
|
||||||
|
}
|
||||||
|
nworkers := obidefault.ParallelWorkers()
|
||||||
|
m = obikmer.ValidateMinimizerSize(m, _kmerSize, nworkers)
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIIndexId returns the index identifier.
|
||||||
|
func CLIIndexId() string {
|
||||||
|
return _indexId
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIMetadataFormat returns the metadata format.
|
||||||
|
func CLIMetadataFormat() obikmer.MetadataFormat {
|
||||||
|
switch strings.ToLower(_metadataFormat) {
|
||||||
|
case "toml":
|
||||||
|
return obikmer.FormatTOML
|
||||||
|
case "yaml":
|
||||||
|
return obikmer.FormatYAML
|
||||||
|
case "json":
|
||||||
|
return obikmer.FormatJSON
|
||||||
|
default:
|
||||||
|
log.Warnf("Unknown metadata format %q, defaulting to TOML", _metadataFormat)
|
||||||
|
return obikmer.FormatTOML
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLISetTag returns the group-level metadata key=value pairs.
|
||||||
|
func CLISetTag() map[string]string {
|
||||||
|
return _setTag
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIMinOccurrence returns the minimum occurrence threshold.
|
||||||
|
func CLIMinOccurrence() int {
|
||||||
|
return _minOccurrence
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIMaxOccurrence returns the maximum occurrence threshold (0 = no upper bound).
|
||||||
|
func CLIMaxOccurrence() int {
|
||||||
|
return _maxOccurrence
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLISaveFullFilter returns whether to save the full frequency filter.
|
||||||
|
func CLISaveFullFilter() bool {
|
||||||
|
return _saveFullFilter
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLISaveFreqKmer returns the number of top frequent k-mers to save (0 = disabled).
|
||||||
|
func CLISaveFreqKmer() int {
|
||||||
|
return _saveFreqKmer
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIOutputDirectory returns the output directory path.
|
||||||
|
func CLIOutputDirectory() string {
|
||||||
|
return obiconvert.CLIOutPutFileName()
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetKmerSize sets the k-mer size (for testing).
|
||||||
|
func SetKmerSize(k int) {
|
||||||
|
_kmerSize = k
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetMinimizerSize sets the minimizer size (for testing).
|
||||||
|
func SetMinimizerSize(m int) {
|
||||||
|
_minimizerSize = m
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetMinOccurrence sets the minimum occurrence (for testing).
|
||||||
|
func SetMinOccurrence(n int) {
|
||||||
|
_minOccurrence = n
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIMaskingMode returns the masking mode from CLI flags.
|
||||||
|
func CLIMaskingMode() MaskingMode {
|
||||||
|
switch {
|
||||||
|
case _extractMode:
|
||||||
|
return ExtractMode
|
||||||
|
case _splitMode:
|
||||||
|
return SplitMode
|
||||||
|
default:
|
||||||
|
return MaskMode
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIMaskingChar returns the masking character, validated.
|
||||||
|
func CLIMaskingChar() byte {
|
||||||
|
mask := strings.TrimSpace(_maskingChar)
|
||||||
|
if len(mask) != 1 {
|
||||||
|
log.Fatalf("--masking-char option accepts a single character, not %s", mask)
|
||||||
|
}
|
||||||
|
return []byte(mask)[0]
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIEntropySize returns the entropy word size.
|
||||||
|
func CLIEntropySize() int {
|
||||||
|
return _entropySize
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIEntropyThreshold returns the entropy threshold.
|
||||||
|
func CLIEntropyThreshold() float64 {
|
||||||
|
return _entropyThreshold
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIKeepShorter returns whether to keep short fragments.
|
||||||
|
func CLIKeepShorter() bool {
|
||||||
|
return _keepShorter
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==============================
|
||||||
|
// Match-specific options
|
||||||
|
// ==============================
|
||||||
|
|
||||||
|
var _indexDirectory = ""
|
||||||
|
|
||||||
|
// IndexDirectoryOptionSet registers --index / -i (mandatory directory for match).
|
||||||
|
func IndexDirectoryOptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.StringVar(&_indexDirectory, "index", _indexDirectory,
|
||||||
|
options.Alias("i"),
|
||||||
|
options.Required(),
|
||||||
|
options.ArgName("DIRECTORY"),
|
||||||
|
options.Description("Path to the kmer index directory."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIIndexDirectory returns the --index directory path.
|
||||||
|
func CLIIndexDirectory() string {
|
||||||
|
return _indexDirectory
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIIndexEntropyThreshold returns the entropy filter threshold for index building (0 = disabled).
|
||||||
|
func CLIIndexEntropyThreshold() float64 {
|
||||||
|
return _indexEntropyThreshold
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIIndexEntropySize returns the entropy filter word size for index building.
|
||||||
|
func CLIIndexEntropySize() int {
|
||||||
|
return _indexEntropySize
|
||||||
|
}
|
||||||
|
|
||||||
|
// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output.
|
||||||
|
func OutputFormatOptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.BoolVar(&_jsonOutput, "json-output", false,
|
||||||
|
options.Description("Print results as JSON."))
|
||||||
|
options.BoolVar(&_csvOutput, "csv-output", false,
|
||||||
|
options.Description("Print results as CSV."))
|
||||||
|
options.BoolVar(&_yamlOutput, "yaml-output", false,
|
||||||
|
options.Description("Print results as YAML."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIOutFormat returns the selected output format: "json", "csv", "yaml", or "text".
|
||||||
|
func CLIOutFormat() string {
|
||||||
|
if _jsonOutput {
|
||||||
|
return "json"
|
||||||
|
}
|
||||||
|
if _csvOutput {
|
||||||
|
return "csv"
|
||||||
|
}
|
||||||
|
if _yamlOutput {
|
||||||
|
return "yaml"
|
||||||
|
}
|
||||||
|
return "text"
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetSelectionOptionSet registers --set <glob_pattern> (repeatable).
|
||||||
|
func SetSelectionOptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.StringSliceVar(&_setPatterns, "set", 1, 1,
|
||||||
|
options.Alias("s"),
|
||||||
|
options.ArgName("PATTERN"),
|
||||||
|
options.Description("Set ID or glob pattern (repeatable, supports *, ?, [...])."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLISetPatterns returns the --set patterns provided by the user.
|
||||||
|
func CLISetPatterns() []string {
|
||||||
|
return _setPatterns
|
||||||
|
}
|
||||||
|
|
||||||
|
// ForceOptionSet registers --force / -f.
|
||||||
|
func ForceOptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.BoolVar(&_force, "force", false,
|
||||||
|
options.Alias("f"),
|
||||||
|
options.Description("Force operation even if set ID already exists in destination."))
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLIForce returns whether --force was specified.
|
||||||
|
func CLIForce() bool {
|
||||||
|
return _force
|
||||||
|
}
|
||||||
56
pkg/obitools/obik/rm.go
Normal file
56
pkg/obitools/obik/rm.go
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
func runRm(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||||
|
if len(args) < 1 {
|
||||||
|
return fmt.Errorf("usage: obik rm --set PATTERN [--set PATTERN]... <index_directory>")
|
||||||
|
}
|
||||||
|
|
||||||
|
patterns := CLISetPatterns()
|
||||||
|
if len(patterns) == 0 {
|
||||||
|
return fmt.Errorf("--set is required (specify which sets to remove)")
|
||||||
|
}
|
||||||
|
|
||||||
|
indexDir := args[0]
|
||||||
|
|
||||||
|
ksg, err := obikmer.OpenKmerSetGroup(indexDir)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open kmer index: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
indices, err := ksg.MatchSetIDs(patterns)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if len(indices) == 0 {
|
||||||
|
return fmt.Errorf("no sets match the given patterns")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect IDs before removal (indices shift as we remove)
|
||||||
|
ids := make([]string, len(indices))
|
||||||
|
for i, idx := range indices {
|
||||||
|
ids[i] = ksg.SetIDOf(idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("Removing %d set(s) from %s", len(ids), indexDir)
|
||||||
|
|
||||||
|
// Remove in reverse order to avoid renumbering issues
|
||||||
|
for i := len(ids) - 1; i >= 0; i-- {
|
||||||
|
if err := ksg.RemoveSetByID(ids[i]); err != nil {
|
||||||
|
return fmt.Errorf("failed to remove set %q: %w", ids[i], err)
|
||||||
|
}
|
||||||
|
log.Infof("Removed set %q", ids[i])
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("Index now has %d set(s)", ksg.Size())
|
||||||
|
return nil
|
||||||
|
}
|
||||||
121
pkg/obitools/obik/spectrum.go
Normal file
121
pkg/obitools/obik/spectrum.go
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/csv"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// runSpectrum implements the "obik spectrum" subcommand.
|
||||||
|
// It outputs k-mer frequency spectra as CSV with one column per set.
|
||||||
|
func runSpectrum(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||||
|
if len(args) < 1 {
|
||||||
|
return fmt.Errorf("usage: obik spectrum [options] <index_directory>")
|
||||||
|
}
|
||||||
|
|
||||||
|
ksg, err := obikmer.OpenKmerSetGroup(args[0])
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open kmer index: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine which sets to include
|
||||||
|
patterns := CLISetPatterns()
|
||||||
|
var indices []int
|
||||||
|
if len(patterns) > 0 {
|
||||||
|
indices, err = ksg.MatchSetIDs(patterns)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to match set patterns: %w", err)
|
||||||
|
}
|
||||||
|
if len(indices) == 0 {
|
||||||
|
return fmt.Errorf("no sets match the given patterns")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// All sets
|
||||||
|
indices = make([]int, ksg.Size())
|
||||||
|
for i := range indices {
|
||||||
|
indices[i] = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read spectra for selected sets
|
||||||
|
spectraMaps := make([]map[int]uint64, len(indices))
|
||||||
|
maxFreq := 0
|
||||||
|
for i, idx := range indices {
|
||||||
|
spectrum, err := ksg.Spectrum(idx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to read spectrum for set %d: %w", idx, err)
|
||||||
|
}
|
||||||
|
if spectrum == nil {
|
||||||
|
log.Warnf("No spectrum data for set %d (%s)", idx, ksg.SetIDOf(idx))
|
||||||
|
spectraMaps[i] = make(map[int]uint64)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
spectraMaps[i] = spectrum.ToMap()
|
||||||
|
if mf := spectrum.MaxFrequency(); mf > maxFreq {
|
||||||
|
maxFreq = mf
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if maxFreq == 0 {
|
||||||
|
return fmt.Errorf("no spectrum data found in any selected set")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine output destination
|
||||||
|
outFile := obiconvert.CLIOutPutFileName()
|
||||||
|
var w *csv.Writer
|
||||||
|
if outFile == "" || outFile == "-" {
|
||||||
|
w = csv.NewWriter(os.Stdout)
|
||||||
|
} else {
|
||||||
|
f, err := os.Create(outFile)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to create output file: %w", err)
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
w = csv.NewWriter(f)
|
||||||
|
}
|
||||||
|
defer w.Flush()
|
||||||
|
|
||||||
|
// Build header: frequency, set_id_1, set_id_2, ...
|
||||||
|
header := make([]string, 1+len(indices))
|
||||||
|
header[0] = "frequency"
|
||||||
|
for i, idx := range indices {
|
||||||
|
id := ksg.SetIDOf(idx)
|
||||||
|
if id == "" {
|
||||||
|
id = fmt.Sprintf("set_%d", idx)
|
||||||
|
}
|
||||||
|
header[i+1] = id
|
||||||
|
}
|
||||||
|
if err := w.Write(header); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write rows for each frequency from 1 to maxFreq
|
||||||
|
record := make([]string, 1+len(indices))
|
||||||
|
for freq := 1; freq <= maxFreq; freq++ {
|
||||||
|
record[0] = strconv.Itoa(freq)
|
||||||
|
hasData := false
|
||||||
|
for i := range indices {
|
||||||
|
count := spectraMaps[i][freq]
|
||||||
|
record[i+1] = strconv.FormatUint(count, 10)
|
||||||
|
if count > 0 {
|
||||||
|
hasData = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Only write rows where at least one set has a non-zero count
|
||||||
|
if hasData {
|
||||||
|
if err := w.Write(record); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
148
pkg/obitools/obik/summary.go
Normal file
148
pkg/obitools/obik/summary.go
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
"gopkg.in/yaml.v3"
|
||||||
|
)
|
||||||
|
|
||||||
|
type setSummary struct {
|
||||||
|
Index int `json:"index" yaml:"index"`
|
||||||
|
ID string `json:"id" yaml:"id"`
|
||||||
|
Count uint64 `json:"count" yaml:"count"`
|
||||||
|
DiskSize int64 `json:"disk_bytes" yaml:"disk_bytes"`
|
||||||
|
Metadata map[string]interface{} `json:"metadata,omitempty" yaml:"metadata,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type groupSummary struct {
|
||||||
|
Path string `json:"path" yaml:"path"`
|
||||||
|
ID string `json:"id,omitempty" yaml:"id,omitempty"`
|
||||||
|
K int `json:"k" yaml:"k"`
|
||||||
|
M int `json:"m" yaml:"m"`
|
||||||
|
Partitions int `json:"partitions" yaml:"partitions"`
|
||||||
|
TotalSets int `json:"total_sets" yaml:"total_sets"`
|
||||||
|
TotalKmers uint64 `json:"total_kmers" yaml:"total_kmers"`
|
||||||
|
TotalDisk int64 `json:"total_disk_bytes" yaml:"total_disk_bytes"`
|
||||||
|
Metadata map[string]interface{} `json:"metadata,omitempty" yaml:"metadata,omitempty"`
|
||||||
|
Sets []setSummary `json:"sets" yaml:"sets"`
|
||||||
|
Jaccard [][]float64 `json:"jaccard,omitempty" yaml:"jaccard,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func runSummary(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||||
|
if len(args) < 1 {
|
||||||
|
return fmt.Errorf("usage: obik summary [options] <index_directory>")
|
||||||
|
}
|
||||||
|
|
||||||
|
ksg, err := obikmer.OpenKmerSetGroup(args[0])
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open kmer index: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
summary := groupSummary{
|
||||||
|
Path: ksg.Path(),
|
||||||
|
ID: ksg.Id(),
|
||||||
|
K: ksg.K(),
|
||||||
|
M: ksg.M(),
|
||||||
|
Partitions: ksg.Partitions(),
|
||||||
|
TotalSets: ksg.Size(),
|
||||||
|
TotalKmers: ksg.Len(),
|
||||||
|
Metadata: ksg.Metadata,
|
||||||
|
Sets: make([]setSummary, ksg.Size()),
|
||||||
|
}
|
||||||
|
|
||||||
|
var totalDisk int64
|
||||||
|
for i := 0; i < ksg.Size(); i++ {
|
||||||
|
diskSize := computeSetDiskSize(ksg, i)
|
||||||
|
totalDisk += diskSize
|
||||||
|
summary.Sets[i] = setSummary{
|
||||||
|
Index: i,
|
||||||
|
ID: ksg.SetIDOf(i),
|
||||||
|
Count: ksg.Len(i),
|
||||||
|
DiskSize: diskSize,
|
||||||
|
Metadata: ksg.AllSetMetadata(i),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
summary.TotalDisk = totalDisk
|
||||||
|
|
||||||
|
// Jaccard matrix
|
||||||
|
if _jaccard && ksg.Size() > 1 {
|
||||||
|
dm := ksg.JaccardDistanceMatrix()
|
||||||
|
n := ksg.Size()
|
||||||
|
matrix := make([][]float64, n)
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
matrix[i] = make([]float64, n)
|
||||||
|
for j := 0; j < n; j++ {
|
||||||
|
if i == j {
|
||||||
|
matrix[i][j] = 0
|
||||||
|
} else {
|
||||||
|
matrix[i][j] = dm.Get(i, j)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
summary.Jaccard = matrix
|
||||||
|
}
|
||||||
|
|
||||||
|
format := CLIOutFormat()
|
||||||
|
switch format {
|
||||||
|
case "json":
|
||||||
|
return outputSummaryJSON(summary)
|
||||||
|
case "yaml":
|
||||||
|
return outputSummaryYAML(summary)
|
||||||
|
case "csv":
|
||||||
|
return outputSummaryCSV(summary)
|
||||||
|
default:
|
||||||
|
return outputSummaryJSON(summary)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func computeSetDiskSize(ksg *obikmer.KmerSetGroup, setIndex int) int64 {
|
||||||
|
var total int64
|
||||||
|
for p := 0; p < ksg.Partitions(); p++ {
|
||||||
|
path := ksg.PartitionPath(setIndex, p)
|
||||||
|
info, err := os.Stat(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
total += info.Size()
|
||||||
|
}
|
||||||
|
// Also count the set directory entry itself
|
||||||
|
setDir := filepath.Join(ksg.Path(), fmt.Sprintf("set_%d", setIndex))
|
||||||
|
entries, err := os.ReadDir(setDir)
|
||||||
|
if err == nil {
|
||||||
|
// We already counted .kdi files above; this is just for completeness
|
||||||
|
_ = entries
|
||||||
|
}
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
func outputSummaryJSON(summary groupSummary) error {
|
||||||
|
data, err := json.MarshalIndent(summary, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
fmt.Println(string(data))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func outputSummaryYAML(summary groupSummary) error {
|
||||||
|
data, err := yaml.Marshal(summary)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
fmt.Print(string(data))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func outputSummaryCSV(summary groupSummary) error {
|
||||||
|
fmt.Println("index,id,count,disk_bytes")
|
||||||
|
for _, s := range summary.Sets {
|
||||||
|
fmt.Printf("%d,%s,%d,%d\n", s.Index, s.ID, s.Count, s.DiskSize)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
49
pkg/obitools/obik/super.go
Normal file
49
pkg/obitools/obik/super.go
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
package obik
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// runSuper implements the "obik super" subcommand.
|
||||||
|
// It extracts super k-mers from DNA sequences.
|
||||||
|
func runSuper(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||||
|
k := CLIKmerSize()
|
||||||
|
m := CLIMinimizerSize()
|
||||||
|
|
||||||
|
if k < 2 || k > 31 {
|
||||||
|
return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)
|
||||||
|
}
|
||||||
|
|
||||||
|
if m < 1 || m >= k {
|
||||||
|
return fmt.Errorf("invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Printf("Extracting super k-mers with k=%d, m=%d", k, m)
|
||||||
|
|
||||||
|
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open sequence files: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
worker := obikmer.SuperKmerWorker(k, m)
|
||||||
|
|
||||||
|
superkmers := sequences.MakeIWorker(
|
||||||
|
worker,
|
||||||
|
false,
|
||||||
|
obidefault.ParallelWorkers(),
|
||||||
|
)
|
||||||
|
|
||||||
|
obiconvert.CLIWriteBioSequences(superkmers, true)
|
||||||
|
obiutils.WaitForLastPipe()
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
@@ -1,332 +0,0 @@
|
|||||||
```{r}
|
|
||||||
library(tidyverse)
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
x <- sample(1:4096, 29, replace=TRUE)
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
emax <- function(lseq,word_size) {
|
|
||||||
nword = lseq - word_size + 1
|
|
||||||
nalpha = 4^word_size
|
|
||||||
|
|
||||||
if (nalpha < nword) {
|
|
||||||
cov = nword %/% nalpha
|
|
||||||
remains = nword %% nalpha
|
|
||||||
f1 = cov/nword
|
|
||||||
f2 = (cov+1)/nword
|
|
||||||
print(c(nalpha - remains,f1,remains,f2))
|
|
||||||
e = -(nalpha - remains) * f1 * log(f1) -
|
|
||||||
remains * f2 * log(f2)
|
|
||||||
} else {
|
|
||||||
e = log(nword)
|
|
||||||
}
|
|
||||||
|
|
||||||
e
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
ec <- function(data,kmer_size) {
|
|
||||||
table <- table(data)
|
|
||||||
s <- sum(table)
|
|
||||||
e <- sum(table * log(table))/s
|
|
||||||
ed <- log(s) - e
|
|
||||||
|
|
||||||
em <- emax(s+kmer_size-1,kmer_size)
|
|
||||||
|
|
||||||
ed/em
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
ef <- function(data,kmer_size) {
|
|
||||||
table <- table(data)
|
|
||||||
s <- sum(table)
|
|
||||||
f <- table / s
|
|
||||||
|
|
||||||
f <- as.numeric(f)
|
|
||||||
f <- f[f > 0]
|
|
||||||
|
|
||||||
em <- emax(s+kmer_size-1,kmer_size)
|
|
||||||
ed <- -sum(f * log(f))
|
|
||||||
|
|
||||||
print(c(ed,em,ed/em))
|
|
||||||
|
|
||||||
ed/em
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
okmer <- function(data,kmer_size) {
|
|
||||||
str_sub(data,1:(nchar(data)-kmer_size+1)) %>%
|
|
||||||
str_sub(1,kmer_size)
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
# Normalisation circulaire: retourne le plus petit k-mer par rotation circulaire
|
|
||||||
normalize_circular <- function(kmer) {
|
|
||||||
if (nchar(kmer) == 0) return(kmer)
|
|
||||||
|
|
||||||
canonical <- kmer
|
|
||||||
n <- nchar(kmer)
|
|
||||||
|
|
||||||
# Tester toutes les rotations circulaires
|
|
||||||
for (i in 2:n) {
|
|
||||||
rotated <- paste0(str_sub(kmer, i, n), str_sub(kmer, 1, i-1))
|
|
||||||
if (rotated < canonical) {
|
|
||||||
canonical <- rotated
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
canonical
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
# Fonction totient d'Euler: compte le nombre d'entiers de 1 à n coprimes avec n
|
|
||||||
euler_totient <- function(n) {
|
|
||||||
if (n <= 0) return(0)
|
|
||||||
|
|
||||||
result <- n
|
|
||||||
p <- 2
|
|
||||||
|
|
||||||
# Traiter tous les facteurs premiers
|
|
||||||
while (p * p <= n) {
|
|
||||||
if (n %% p == 0) {
|
|
||||||
# Retirer toutes les occurrences de p
|
|
||||||
while (n %% p == 0) {
|
|
||||||
n <- n %/% p
|
|
||||||
}
|
|
||||||
# Appliquer la formule: φ(n) = n * (1 - 1/p)
|
|
||||||
result <- result - result %/% p
|
|
||||||
}
|
|
||||||
p <- p + 1
|
|
||||||
}
|
|
||||||
|
|
||||||
# Si n est toujours > 1, alors c'est un facteur premier
|
|
||||||
if (n > 1) {
|
|
||||||
result <- result - result %/% n
|
|
||||||
}
|
|
||||||
|
|
||||||
result
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
# Retourne tous les diviseurs de n
|
|
||||||
divisors <- function(n) {
|
|
||||||
if (n <= 0) return(integer(0))
|
|
||||||
|
|
||||||
divs <- c()
|
|
||||||
i <- 1
|
|
||||||
while (i * i <= n) {
|
|
||||||
if (n %% i == 0) {
|
|
||||||
divs <- c(divs, i)
|
|
||||||
if (i != n %/% i) {
|
|
||||||
divs <- c(divs, n %/% i)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
i <- i + 1
|
|
||||||
}
|
|
||||||
|
|
||||||
sort(divs)
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
# Compte le nombre de colliers (necklaces) distincts de longueur n
|
|
||||||
# sur un alphabet de taille a en utilisant la formule de Moreau:
|
|
||||||
# N(n, a) = (1/n) * Σ φ(d) * a^(n/d)
|
|
||||||
# où la somme est sur tous les diviseurs d de n, et φ est la fonction totient d'Euler
|
|
||||||
necklace_count <- function(n, alphabet_size) {
|
|
||||||
if (n <= 0) return(0)
|
|
||||||
|
|
||||||
divs <- divisors(n)
|
|
||||||
sum_val <- 0
|
|
||||||
|
|
||||||
for (d in divs) {
|
|
||||||
# Calculer alphabet_size^(n/d)
|
|
||||||
power <- alphabet_size^(n %/% d)
|
|
||||||
sum_val <- sum_val + euler_totient(d) * power
|
|
||||||
}
|
|
||||||
|
|
||||||
sum_val %/% n
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
# Nombre de classes d'équivalence pour les k-mers normalisés
|
|
||||||
# Utilise la formule exacte de Moreau pour compter les colliers (necklaces)
|
|
||||||
n_normalized_kmers <- function(kmer_size) {
|
|
||||||
# Valeurs exactes pré-calculées pour k=1 à 6
|
|
||||||
if (kmer_size == 1) return(4)
|
|
||||||
if (kmer_size == 2) return(10)
|
|
||||||
if (kmer_size == 3) return(24)
|
|
||||||
if (kmer_size == 4) return(70)
|
|
||||||
if (kmer_size == 5) return(208)
|
|
||||||
if (kmer_size == 6) return(700)
|
|
||||||
|
|
||||||
# Pour k > 6, utiliser la formule de Moreau (exacte)
|
|
||||||
# Alphabet ADN a 4 bases
|
|
||||||
necklace_count(kmer_size, 4)
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
# Entropie maximale pour k-mers normalisés
|
|
||||||
enmax <- function(lseq, word_size) {
|
|
||||||
nword = lseq - word_size + 1
|
|
||||||
nalpha = n_normalized_kmers(word_size)
|
|
||||||
|
|
||||||
if (nalpha < nword) {
|
|
||||||
cov = nword %/% nalpha
|
|
||||||
remains = nword %% nalpha
|
|
||||||
f1 = cov/nword
|
|
||||||
f2 = (cov+1)/nword
|
|
||||||
e = -(nalpha - remains) * f1 * log(f1) -
|
|
||||||
remains * f2 * log(f2)
|
|
||||||
} else {
|
|
||||||
e = log(nword)
|
|
||||||
}
|
|
||||||
|
|
||||||
e
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
# Entropie normalisée avec normalisation circulaire des k-mers
|
|
||||||
ecn <- function(data, kmer_size) {
|
|
||||||
# Normaliser tous les k-mers
|
|
||||||
normalized_data <- sapply(data, normalize_circular)
|
|
||||||
|
|
||||||
# Calculer la table des fréquences
|
|
||||||
table <- table(normalized_data)
|
|
||||||
s <- sum(table)
|
|
||||||
e <- sum(table * log(table))/s
|
|
||||||
ed <- log(s) - e
|
|
||||||
|
|
||||||
# Entropie maximale avec normalisation
|
|
||||||
em <- enmax(s + kmer_size - 1, kmer_size)
|
|
||||||
|
|
||||||
ed/em
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
k<-'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
|
|
||||||
ec(okmer(k,1),1)
|
|
||||||
ec(okmer(k,2),2)
|
|
||||||
ec(okmer(k,3),3)
|
|
||||||
ec(okmer(k,4),4)
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
k<-'atatatatatatatatatatatatatatata'
|
|
||||||
ef(okmer(k,1),1)
|
|
||||||
ef(okmer(k,2),2)
|
|
||||||
ef(okmer(k,3),3)
|
|
||||||
ef(okmer(k,4),4)
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
k<-'aaaaaaaaaaaaaaaattttttttttttttt'
|
|
||||||
ef(okmer(k,1),1)
|
|
||||||
ef(okmer(k,2),2)
|
|
||||||
ef(okmer(k,3),3)
|
|
||||||
ef(okmer(k,4),4)
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
k<-'atgatgatgatgatgatgatgatgatgatga'
|
|
||||||
ef(okmer(k,1),1)
|
|
||||||
ef(okmer(k,2),2)
|
|
||||||
ef(okmer(k,3),3)
|
|
||||||
ef(okmer(k,4),4)
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
k<-'atcgatcgatcgatcgatcgatcgatcgact'
|
|
||||||
ecn(okmer(k,1),1)
|
|
||||||
ecn(okmer(k,2),2)
|
|
||||||
ecn(okmer(k,3),3)
|
|
||||||
ecn(okmer(k,4),4)
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
k<-paste(sample(rep(c("a","c","g","t"),8),31),collapse="")
|
|
||||||
k <- "actatggcaagtcgtaaccgcgcttatcagg"
|
|
||||||
ecn(okmer(k,1),1)
|
|
||||||
ecn(okmer(k,2),2)
|
|
||||||
ecn(okmer(k,3),3)
|
|
||||||
ecn(okmer(k,4),4)
|
|
||||||
```
|
|
||||||
|
|
||||||
aattaaaaaaacaagataaaataatattttt
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
k<-'aattaaaaaaacaagataaaataatattttt'
|
|
||||||
ecn(okmer(k,1),1)
|
|
||||||
ecn(okmer(k,2),2)
|
|
||||||
ecn(okmer(k,3),3)
|
|
||||||
ecn(okmer(k,4),4)
|
|
||||||
```
|
|
||||||
|
|
||||||
atg tga gat ,,,,
|
|
||||||
|
|
||||||
cat tca atc
|
|
||||||
|
|
||||||
tgatgatgatgatgatgatgatgatgatg
|
|
||||||
|
|
||||||
## Tests de normalisation circulaire
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
# Test de la fonction de normalisation
|
|
||||||
normalize_circular("ca") # devrait donner "ac"
|
|
||||||
normalize_circular("tgca") # devrait donner "atgc"
|
|
||||||
normalize_circular("acgt") # devrait donner "acgt"
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
# Comparaison ec vs ecn sur une séquence répétitive
|
|
||||||
# Les k-mers "atg", "tga", "gat" sont équivalents par rotation
|
|
||||||
k <- 'atgatgatgatgatgatgatgatgatgatga'
|
|
||||||
cat("Séquence:", k, "\n")
|
|
||||||
cat("ec(k,3) =", ec(okmer(k,3),3), "\n")
|
|
||||||
cat("ecn(k,3) =", ecn(okmer(k,3),3), "\n")
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
# Comparaison sur séquence aléatoire
|
|
||||||
k <- "actatggcaagtcgtaaccgcgcttatcagg"
|
|
||||||
cat("Séquence:", k, "\n")
|
|
||||||
cat("Sans normalisation:\n")
|
|
||||||
cat(" ec(k,2) =", ec(okmer(k,2),2), "\n")
|
|
||||||
cat(" ec(k,3) =", ec(okmer(k,3),3), "\n")
|
|
||||||
cat(" ec(k,4) =", ec(okmer(k,4),4), "\n")
|
|
||||||
cat("Avec normalisation circulaire:\n")
|
|
||||||
cat(" ecn(k,2) =", ecn(okmer(k,2),2), "\n")
|
|
||||||
cat(" ecn(k,3) =", ecn(okmer(k,3),3), "\n")
|
|
||||||
cat(" ecn(k,4) =", ecn(okmer(k,4),4), "\n")
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
|
|
||||||
sequence <- "ttcatcactcagcaatcctgaatgatGAGAGCTTTTTTTTTTTATATATATATATATGTATATGTATGAAATACACTtatgctccgtttgtttcgccgtaa"
|
|
||||||
re <- rev(c(0.8108602271901116,0.8108602271901116,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.7800272339058549,0.7800272339058549,0.7751610144606091,0.7751610144606091,0.7751610144606091,0.764858185548322,0.7325526601302021,0.7137620699527615,0.6789199521982864,0.6584536373623372,0.634002687184193,0.6075290415873623,0.5785545803330997,0.5785545803330997,0.5503220289212184,0.5315314387437778,0.4966893209893028,0.46077361820145696,0.42388221293245526,0.4009547969713408,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.390029016052137,0.42781461756157363,0.45192285937059073,0.47238917420654,0.47238917420654,0.47238917420654,0.5092805794755417,0.5451962822633876,0.5800384000178626,0.602395141014297,0.6046146614886381,0.6046146614886381,0.6119084258128231,0.6119084258128231,0.6214217106113492,0.6424704346756562,0.6482381543085467,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6828444721058894,0.6950205907027562,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.7208976112999935))
|
|
||||||
|
|
||||||
di <- c(0.7208976112999935,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6950205907027562,0.6828444721058894,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6482381543085467,0.6424704346756562,0.6214217106113492,0.6119084258128231,0.6119084258128231,0.6046146614886382,0.6046146614886382,0.6023951410142971,0.5800384000178627,0.5451962822633876,0.5092805794755418,0.47238917420654003,0.47238917420654003,0.47238917420654003,0.4519228593705908,0.4278146175615737,0.39002901605213713,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677883,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.40095479697134073,0.42388221293245526,0.46077361820145696,0.4966893209893028,0.5315314387437778,0.5503220289212184,0.5785545803330997,0.5785545803330997,0.6075290415873625,0.6340026871841933,0.6584536373623374,0.6789199521982866,0.7137620699527616,0.7325526601302023,0.7648581855483221,0.7751610144606093,0.7751610144606093,0.7751610144606093,0.7800272339058549,0.7800272339058549,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8108602271901116,0.8108602271901116)
|
|
||||||
|
|
||||||
ebidir <- tibble(direct=di,reverse=re) %>%
|
|
||||||
mutate(position = 1:length(re),
|
|
||||||
nucleotide = str_sub(sequence,position,position))
|
|
||||||
|
|
||||||
ebidir %>%
|
|
||||||
ggplot(aes(x=position,y=direct)) +
|
|
||||||
geom_line() +
|
|
||||||
scale_x_continuous(breaks = ebidir$position, labels = ebidir$nucleotide) +
|
|
||||||
ylim(0,1)+
|
|
||||||
geom_hline(yintercept=0.5, col = "red", linetype = "dashed")
|
|
||||||
```
|
|
||||||
@@ -1,40 +0,0 @@
|
|||||||
package obilowmask
|
|
||||||
|
|
||||||
import (
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestLowMaskWorker(t *testing.T) {
|
|
||||||
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
|
|
||||||
|
|
||||||
seq := obiseq.NewBioSequence("test", []byte("acgtacgtacgtacgtacgtacgtacgtacgt"), "test")
|
|
||||||
result, err := worker(seq)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Worker failed: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if result.Len() != 1 {
|
|
||||||
t.Fatalf("Expected 1 sequence, got %d", result.Len())
|
|
||||||
}
|
|
||||||
|
|
||||||
resultSeq := result[0]
|
|
||||||
if resultSeq.Len() != 32 {
|
|
||||||
t.Fatalf("Expected sequence length 32, got %d", resultSeq.Len())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestLowMaskWorkerWithAmbiguity(t *testing.T) {
|
|
||||||
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
|
|
||||||
|
|
||||||
seq := obiseq.NewBioSequence("test", []byte("acgtNcgtacgtacgtacgtacgtacgtacgt"), "test")
|
|
||||||
result, err := worker(seq)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Worker failed: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if result.Len() != 1 {
|
|
||||||
t.Fatalf("Expected 1 sequence, got %d", result.Len())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,81 +0,0 @@
|
|||||||
package obilowmask
|
|
||||||
|
|
||||||
import (
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
|
||||||
"github.com/DavidGamba/go-getoptions"
|
|
||||||
|
|
||||||
log "github.com/sirupsen/logrus"
|
|
||||||
)
|
|
||||||
|
|
||||||
var __kmer_size__ = 31
|
|
||||||
var __level_max__ = 6
|
|
||||||
var __threshold__ = 0.5
|
|
||||||
var __split_mode__ = false
|
|
||||||
var __low_mode__ = false
|
|
||||||
var __mask__ = "."
|
|
||||||
|
|
||||||
func LowMaskOptionSet(options *getoptions.GetOpt) {
|
|
||||||
|
|
||||||
options.IntVar(&__kmer_size__, "kmer-size", __kmer_size__,
|
|
||||||
options.Description("Size of the kmer considered to estimate entropy."),
|
|
||||||
)
|
|
||||||
|
|
||||||
options.IntVar(&__level_max__, "entropy_size", __level_max__,
|
|
||||||
options.Description("Maximum word size considered for entropy estimate"),
|
|
||||||
)
|
|
||||||
|
|
||||||
options.Float64Var(&__threshold__, "threshold", __threshold__,
|
|
||||||
options.Description("entropy theshold used to mask a kmer"),
|
|
||||||
)
|
|
||||||
|
|
||||||
options.BoolVar(&__split_mode__, "split-mode", __split_mode__,
|
|
||||||
options.Description("in split mode, input sequences are splitted to remove masked regions"),
|
|
||||||
)
|
|
||||||
|
|
||||||
options.BoolVar(&__low_mode__, "low-mode", __low_mode__,
|
|
||||||
options.Description("in split mode, input sequences are splitted to remove masked regions"),
|
|
||||||
)
|
|
||||||
|
|
||||||
options.StringVar(&__mask__, "masking-char", __mask__,
|
|
||||||
options.Description("Character used to mask low complexity region"),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
func OptionSet(options *getoptions.GetOpt) {
|
|
||||||
LowMaskOptionSet(options)
|
|
||||||
obiconvert.InputOptionSet(options)
|
|
||||||
obiconvert.OutputOptionSet(options)
|
|
||||||
}
|
|
||||||
|
|
||||||
func CLIKmerSize() int {
|
|
||||||
return __kmer_size__
|
|
||||||
}
|
|
||||||
|
|
||||||
func CLILevelMax() int {
|
|
||||||
return __level_max__
|
|
||||||
}
|
|
||||||
|
|
||||||
func CLIThreshold() float64 {
|
|
||||||
return __threshold__
|
|
||||||
}
|
|
||||||
|
|
||||||
func CLIMaskingMode() MaskingMode {
|
|
||||||
switch {
|
|
||||||
case __low_mode__:
|
|
||||||
return Extract
|
|
||||||
case __split_mode__:
|
|
||||||
return Split
|
|
||||||
default:
|
|
||||||
return Mask
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func CLIMaskingChar() byte {
|
|
||||||
mask := strings.TrimSpace(__mask__)
|
|
||||||
if len(mask) != 1 {
|
|
||||||
log.Fatalf("--masking-char option accept a single character, not %s", mask)
|
|
||||||
}
|
|
||||||
return []byte(mask)[0]
|
|
||||||
}
|
|
||||||
@@ -291,5 +291,5 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
go f()
|
go f()
|
||||||
}
|
}
|
||||||
|
|
||||||
return indexed.Rebatch(obidefault.BatchSize())
|
return indexed.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,10 +0,0 @@
|
|||||||
// obisuperkmer function utility package.
|
|
||||||
//
|
|
||||||
// The obitools/obisuperkmer package contains every
|
|
||||||
// function specifically required by the obisuperkmer utility.
|
|
||||||
//
|
|
||||||
// The obisuperkmer command extracts super k-mers from DNA sequences.
|
|
||||||
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
|
||||||
// share the same minimizer. This decomposition is useful for efficient
|
|
||||||
// k-mer indexing and analysis in bioinformatics applications.
|
|
||||||
package obisuperkmer
|
|
||||||
@@ -1,69 +0,0 @@
|
|||||||
package obisuperkmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
|
||||||
"github.com/DavidGamba/go-getoptions"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Private variables for storing option values
|
|
||||||
var _KmerSize = 31
|
|
||||||
var _MinimizerSize = 13
|
|
||||||
|
|
||||||
// SuperKmerOptionSet defines every option related to super k-mer extraction.
|
|
||||||
//
|
|
||||||
// The function adds to a CLI every option proposed to the user
|
|
||||||
// to tune the parameters of the super k-mer extraction algorithm.
|
|
||||||
//
|
|
||||||
// Parameters:
|
|
||||||
// - options: is a pointer to a getoptions.GetOpt instance normally
|
|
||||||
// produced by the obioptions.GenerateOptionParser function.
|
|
||||||
func SuperKmerOptionSet(options *getoptions.GetOpt) {
|
|
||||||
options.IntVar(&_KmerSize, "kmer-size", _KmerSize,
|
|
||||||
options.Alias("k"),
|
|
||||||
options.Description("Size of k-mers (must be between m+1 and 31)."))
|
|
||||||
|
|
||||||
options.IntVar(&_MinimizerSize, "minimizer-size", _MinimizerSize,
|
|
||||||
options.Alias("m"),
|
|
||||||
options.Description("Size of minimizers (must be between 1 and k-1)."))
|
|
||||||
}
|
|
||||||
|
|
||||||
// OptionSet adds to the basic option set every option declared for
|
|
||||||
// the obisuperkmer command.
|
|
||||||
//
|
|
||||||
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
|
|
||||||
func OptionSet(options *getoptions.GetOpt) {
|
|
||||||
obiconvert.OptionSet(false)(options)
|
|
||||||
SuperKmerOptionSet(options)
|
|
||||||
}
|
|
||||||
|
|
||||||
// CLIKmerSize returns the k-mer size to use for super k-mer extraction.
|
|
||||||
//
|
|
||||||
// It does not take any parameters.
|
|
||||||
// It returns an integer representing the k-mer size.
|
|
||||||
func CLIKmerSize() int {
|
|
||||||
return _KmerSize
|
|
||||||
}
|
|
||||||
|
|
||||||
// SetKmerSize sets the k-mer size for super k-mer extraction.
|
|
||||||
//
|
|
||||||
// Parameters:
|
|
||||||
// - k: the k-mer size (must be between m+1 and 31).
|
|
||||||
func SetKmerSize(k int) {
|
|
||||||
_KmerSize = k
|
|
||||||
}
|
|
||||||
|
|
||||||
// CLIMinimizerSize returns the minimizer size to use for super k-mer extraction.
|
|
||||||
//
|
|
||||||
// It does not take any parameters.
|
|
||||||
// It returns an integer representing the minimizer size.
|
|
||||||
func CLIMinimizerSize() int {
|
|
||||||
return _MinimizerSize
|
|
||||||
}
|
|
||||||
|
|
||||||
// SetMinimizerSize sets the minimizer size for super k-mer extraction.
|
|
||||||
//
|
|
||||||
// Parameters:
|
|
||||||
// - m: the minimizer size (must be between 1 and k-1).
|
|
||||||
func SetMinimizerSize(m int) {
|
|
||||||
_MinimizerSize = m
|
|
||||||
}
|
|
||||||
@@ -1,59 +0,0 @@
|
|||||||
package obisuperkmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
log "github.com/sirupsen/logrus"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
|
||||||
)
|
|
||||||
|
|
||||||
// CLIExtractSuperKmers extracts super k-mers from an iterator of BioSequences.
|
|
||||||
//
|
|
||||||
// This function takes an iterator of BioSequence objects, extracts super k-mers
|
|
||||||
// from each sequence using the k-mer and minimizer sizes specified by CLI options,
|
|
||||||
// and returns a new iterator yielding the extracted super k-mers as BioSequence objects.
|
|
||||||
//
|
|
||||||
// Each super k-mer is a maximal subsequence where all consecutive k-mers share
|
|
||||||
// the same minimizer. The resulting BioSequences contain metadata including:
|
|
||||||
// - minimizer_value: the canonical minimizer value
|
|
||||||
// - minimizer_seq: the DNA sequence of the minimizer
|
|
||||||
// - k: the k-mer size used
|
|
||||||
// - m: the minimizer size used
|
|
||||||
// - start: starting position in the original sequence
|
|
||||||
// - end: ending position in the original sequence
|
|
||||||
// - parent_id: ID of the parent sequence
|
|
||||||
//
|
|
||||||
// Parameters:
|
|
||||||
// - iterator: an iterator yielding BioSequence objects to process.
|
|
||||||
//
|
|
||||||
// Returns:
|
|
||||||
// - An iterator yielding BioSequence objects representing super k-mers.
|
|
||||||
func CLIExtractSuperKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
|
||||||
// Get k-mer and minimizer sizes from CLI options
|
|
||||||
k := CLIKmerSize()
|
|
||||||
m := CLIMinimizerSize()
|
|
||||||
|
|
||||||
// Validate parameters
|
|
||||||
if m < 1 || m >= k {
|
|
||||||
log.Fatalf("Invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1)
|
|
||||||
}
|
|
||||||
|
|
||||||
if k < 2 || k > 31 {
|
|
||||||
log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k)
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Printf("Extracting super k-mers with k=%d, m=%d", k, m)
|
|
||||||
|
|
||||||
// Create the worker for super k-mer extraction
|
|
||||||
worker := obikmer.SuperKmerWorker(k, m)
|
|
||||||
|
|
||||||
// Apply the worker to the iterator with parallel processing
|
|
||||||
newIter := iterator.MakeIWorker(
|
|
||||||
worker,
|
|
||||||
false, // don't merge results
|
|
||||||
obidefault.ParallelWorkers(),
|
|
||||||
)
|
|
||||||
|
|
||||||
return newIter
|
|
||||||
}
|
|
||||||
@@ -1,149 +0,0 @@
|
|||||||
package obisuperkmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestCLIExtractSuperKmers(t *testing.T) {
|
|
||||||
// Create a test sequence
|
|
||||||
testSeq := obiseq.NewBioSequence(
|
|
||||||
"test_seq",
|
|
||||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACGT"),
|
|
||||||
"",
|
|
||||||
)
|
|
||||||
|
|
||||||
// Create a batch with the test sequence
|
|
||||||
batch := obiseq.NewBioSequenceBatch()
|
|
||||||
batch.Add(testSeq)
|
|
||||||
|
|
||||||
// Create an iterator from the batch
|
|
||||||
iterator := obiiter.MakeBioSequenceBatchChannel(1)
|
|
||||||
go func() {
|
|
||||||
iterator.Push(batch)
|
|
||||||
iterator.Close()
|
|
||||||
}()
|
|
||||||
|
|
||||||
// Set test parameters
|
|
||||||
SetKmerSize(15)
|
|
||||||
SetMinimizerSize(7)
|
|
||||||
|
|
||||||
// Extract super k-mers
|
|
||||||
result := CLIExtractSuperKmers(iterator)
|
|
||||||
|
|
||||||
// Count the number of super k-mers
|
|
||||||
count := 0
|
|
||||||
for result.Next() {
|
|
||||||
batch := result.Get()
|
|
||||||
for _, sk := range batch.Slice() {
|
|
||||||
count++
|
|
||||||
|
|
||||||
// Verify that the super k-mer has the expected attributes
|
|
||||||
if !sk.HasAttribute("minimizer_value") {
|
|
||||||
t.Error("Super k-mer missing 'minimizer_value' attribute")
|
|
||||||
}
|
|
||||||
if !sk.HasAttribute("minimizer_seq") {
|
|
||||||
t.Error("Super k-mer missing 'minimizer_seq' attribute")
|
|
||||||
}
|
|
||||||
if !sk.HasAttribute("k") {
|
|
||||||
t.Error("Super k-mer missing 'k' attribute")
|
|
||||||
}
|
|
||||||
if !sk.HasAttribute("m") {
|
|
||||||
t.Error("Super k-mer missing 'm' attribute")
|
|
||||||
}
|
|
||||||
if !sk.HasAttribute("start") {
|
|
||||||
t.Error("Super k-mer missing 'start' attribute")
|
|
||||||
}
|
|
||||||
if !sk.HasAttribute("end") {
|
|
||||||
t.Error("Super k-mer missing 'end' attribute")
|
|
||||||
}
|
|
||||||
if !sk.HasAttribute("parent_id") {
|
|
||||||
t.Error("Super k-mer missing 'parent_id' attribute")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Verify attribute values
|
|
||||||
k, _ := sk.GetIntAttribute("k")
|
|
||||||
m, _ := sk.GetIntAttribute("m")
|
|
||||||
|
|
||||||
if k != 15 {
|
|
||||||
t.Errorf("Expected k=15, got k=%d", k)
|
|
||||||
}
|
|
||||||
if m != 7 {
|
|
||||||
t.Errorf("Expected m=7, got m=%d", m)
|
|
||||||
}
|
|
||||||
|
|
||||||
parentID, _ := sk.GetStringAttribute("parent_id")
|
|
||||||
if parentID != "test_seq" {
|
|
||||||
t.Errorf("Expected parent_id='test_seq', got '%s'", parentID)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if count == 0 {
|
|
||||||
t.Error("No super k-mers were extracted")
|
|
||||||
}
|
|
||||||
|
|
||||||
t.Logf("Extracted %d super k-mers from test sequence", count)
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestOptionGettersAndSetters(t *testing.T) {
|
|
||||||
// Test initial values
|
|
||||||
if CLIKmerSize() != 21 {
|
|
||||||
t.Errorf("Expected default k-mer size 21, got %d", CLIKmerSize())
|
|
||||||
}
|
|
||||||
if CLIMinimizerSize() != 11 {
|
|
||||||
t.Errorf("Expected default minimizer size 11, got %d", CLIMinimizerSize())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test setters
|
|
||||||
SetKmerSize(25)
|
|
||||||
SetMinimizerSize(13)
|
|
||||||
|
|
||||||
if CLIKmerSize() != 25 {
|
|
||||||
t.Errorf("SetKmerSize failed: expected 25, got %d", CLIKmerSize())
|
|
||||||
}
|
|
||||||
if CLIMinimizerSize() != 13 {
|
|
||||||
t.Errorf("SetMinimizerSize failed: expected 13, got %d", CLIMinimizerSize())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reset to defaults
|
|
||||||
SetKmerSize(21)
|
|
||||||
SetMinimizerSize(11)
|
|
||||||
}
|
|
||||||
|
|
||||||
func BenchmarkCLIExtractSuperKmers(b *testing.B) {
|
|
||||||
// Create a longer test sequence
|
|
||||||
longSeq := make([]byte, 1000)
|
|
||||||
bases := []byte{'A', 'C', 'G', 'T'}
|
|
||||||
for i := range longSeq {
|
|
||||||
longSeq[i] = bases[i%4]
|
|
||||||
}
|
|
||||||
|
|
||||||
testSeq := obiseq.NewBioSequence("bench_seq", longSeq, "")
|
|
||||||
|
|
||||||
// Set parameters
|
|
||||||
SetKmerSize(21)
|
|
||||||
SetMinimizerSize(11)
|
|
||||||
|
|
||||||
b.ResetTimer()
|
|
||||||
|
|
||||||
for i := 0; i < b.N; i++ {
|
|
||||||
batch := obiseq.NewBioSequenceBatch()
|
|
||||||
batch.Add(testSeq)
|
|
||||||
|
|
||||||
iterator := obiiter.MakeBioSequenceBatchChannel(1)
|
|
||||||
go func() {
|
|
||||||
iterator.Push(batch)
|
|
||||||
iterator.Close()
|
|
||||||
}()
|
|
||||||
|
|
||||||
result := CLIExtractSuperKmers(iterator)
|
|
||||||
|
|
||||||
// Consume the iterator
|
|
||||||
for result.Next() {
|
|
||||||
result.Get()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
85
pkg/obiutils/memsize.go
Normal file
85
pkg/obiutils/memsize.go
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
package obiutils
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"unicode"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ParseMemSize parses a human-readable memory size string and returns the
|
||||||
|
// equivalent number of bytes. The value is a number optionally followed by a
|
||||||
|
// unit suffix (case-insensitive):
|
||||||
|
//
|
||||||
|
// B or (no suffix) — bytes
|
||||||
|
// K or KB — kibibytes (1 024)
|
||||||
|
// M or MB — mebibytes (1 048 576)
|
||||||
|
// G or GB — gibibytes (1 073 741 824)
|
||||||
|
// T or TB — tebibytes (1 099 511 627 776)
|
||||||
|
//
|
||||||
|
// Examples: "512", "128K", "128k", "64M", "1G", "2GB"
|
||||||
|
func ParseMemSize(s string) (int, error) {
|
||||||
|
s = strings.TrimSpace(s)
|
||||||
|
if s == "" {
|
||||||
|
return 0, fmt.Errorf("empty memory size string")
|
||||||
|
}
|
||||||
|
|
||||||
|
// split numeric prefix from unit suffix
|
||||||
|
i := 0
|
||||||
|
for i < len(s) && (unicode.IsDigit(rune(s[i])) || s[i] == '.') {
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
numStr := s[:i]
|
||||||
|
unit := strings.ToUpper(strings.TrimSpace(s[i:]))
|
||||||
|
// strip trailing 'B' from two-letter units (KB→K, MB→M …)
|
||||||
|
if len(unit) == 2 && unit[1] == 'B' {
|
||||||
|
unit = unit[:1]
|
||||||
|
}
|
||||||
|
|
||||||
|
val, err := strconv.ParseFloat(numStr, 64)
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("invalid memory size %q: %w", s, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var multiplier float64
|
||||||
|
switch unit {
|
||||||
|
case "", "B":
|
||||||
|
multiplier = 1
|
||||||
|
case "K":
|
||||||
|
multiplier = 1024
|
||||||
|
case "M":
|
||||||
|
multiplier = 1024 * 1024
|
||||||
|
case "G":
|
||||||
|
multiplier = 1024 * 1024 * 1024
|
||||||
|
case "T":
|
||||||
|
multiplier = 1024 * 1024 * 1024 * 1024
|
||||||
|
default:
|
||||||
|
return 0, fmt.Errorf("unknown memory unit %q in %q", unit, s)
|
||||||
|
}
|
||||||
|
|
||||||
|
return int(val * multiplier), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// FormatMemSize formats a byte count as a human-readable string with the
|
||||||
|
// largest unit that produces a value ≥ 1 (e.g. 1536 → "1.5K").
|
||||||
|
func FormatMemSize(n int) string {
|
||||||
|
units := []struct {
|
||||||
|
suffix string
|
||||||
|
size int
|
||||||
|
}{
|
||||||
|
{"T", 1024 * 1024 * 1024 * 1024},
|
||||||
|
{"G", 1024 * 1024 * 1024},
|
||||||
|
{"M", 1024 * 1024},
|
||||||
|
{"K", 1024},
|
||||||
|
}
|
||||||
|
for _, u := range units {
|
||||||
|
if n >= u.size {
|
||||||
|
v := float64(n) / float64(u.size)
|
||||||
|
if v == float64(int(v)) {
|
||||||
|
return fmt.Sprintf("%d%s", int(v), u.suffix)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%.1f%s", v, u.suffix)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%dB", n)
|
||||||
|
}
|
||||||
@@ -144,7 +144,7 @@ func (r *AsciiSet) TrimLeft(s string) string {
|
|||||||
return s[i:]
|
return s[i:]
|
||||||
}
|
}
|
||||||
|
|
||||||
func SplitInTwo(s string, sep byte) (string, string) {
|
func LeftSplitInTwo(s string, sep byte) (string, string) {
|
||||||
i := 0
|
i := 0
|
||||||
for ; i < len(s); i++ {
|
for ; i < len(s); i++ {
|
||||||
c := s[i]
|
c := s[i]
|
||||||
@@ -157,3 +157,17 @@ func SplitInTwo(s string, sep byte) (string, string) {
|
|||||||
}
|
}
|
||||||
return s[:i], s[i+1:]
|
return s[:i], s[i+1:]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RightSplitInTwo splits s at the LAST occurrence of sep, returning the text
// before and after it (the separator itself is dropped). When sep does not
// occur in s, the whole string is returned as the left part and the right
// part is empty.
func RightSplitInTwo(s string, sep byte) (string, string) {
	i := len(s) - 1
	for ; i >= 0; i-- {
		if s[i] == sep {
			break
		}
	}
	// FIX: when sep is absent the loop exits with i == -1. The original
	// guard compared i against len(s), which can never be true after a
	// backward scan, so s[:i] panicked with a negative slice bound.
	if i < 0 {
		return s, ""
	}
	return s[:i], s[i+1:]
}
|
||||||
|
|||||||
294
release_notes.sh
Executable file
294
release_notes.sh
Executable file
@@ -0,0 +1,294 @@
|
|||||||
|
#!/bin/bash

# Generate GitHub-compatible release notes for an OBITools4 version.
#
# Usage:
#   ./release_notes.sh              # latest version
#   ./release_notes.sh -v 4.4.15    # specific version
#   ./release_notes.sh -l           # list available versions
#   ./release_notes.sh -r           # raw commit list (no LLM)
#   ./release_notes.sh -c -v 4.4.16 # show LLM context for a version

# Repository whose tags/commits are queried.
GITHUB_REPO="metabarcoding/obitools4"
# Base URL for every GitHub REST API call below.
GITHUB_API="https://api.github.com/repos/${GITHUB_REPO}"
# Target version; left empty it triggers pre-release mode (local HEAD
# compared against the latest published tag).
VERSION=""
# Mode flags, toggled by the option-parsing loop further down.
LIST_VERSIONS=false
RAW_MODE=false
CONTEXT_MODE=false
# Default model handed to `orla` for summarization (-m/--model overrides).
LLM_MODEL="ollama:qwen3-coder-next:latest"
|
||||||
|
|
||||||
|
# ── Helpers ──────────────────────────────────────────────────────────────

# Print an error message on stderr and abort the script with status 1.
die() {
    echo "Error: $*" >&2
    exit 1
}

# Given a semantic version x.y.z, print x.y.(z+1).
next_patch() {
    local v="$1"
    local major minor patch
    major=$(cut -d. -f1 <<< "$v")
    minor=$(cut -d. -f2 <<< "$v")
    patch=$(cut -d. -f3 <<< "$v")
    echo "${major}.${minor}.$(( patch + 1 ))"
}
|
||||||
|
|
||||||
|
# Strip "pre-" prefix to get the bare version number for installation section
bare_version() {
    local v="$1"
    printf '%s\n' "${v#pre-}"
}
|
||||||
|
|
||||||
|
# Emit the Markdown "## Installation" section for version $1 on stdout.
# A leading "pre-" prefix on $1 is stripped so pre-release notes still
# point at the eventual release artefact names. Backticks inside the
# heredoc are escaped because the delimiter is unquoted (so ${v} expands).
installation_section() {
    local v
    v=$(bare_version "$1")
    cat <<INSTALL_EOF

## Installation

### Pre-built binaries

Download the appropriate archive for your system from the
[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_${v})
and extract it:

#### Linux (AMD64)
\`\`\`bash
tar -xzf obitools4_${v}_linux_amd64.tar.gz
\`\`\`

#### Linux (ARM64)
\`\`\`bash
tar -xzf obitools4_${v}_linux_arm64.tar.gz
\`\`\`

#### macOS (Intel)
\`\`\`bash
tar -xzf obitools4_${v}_darwin_amd64.tar.gz
\`\`\`

#### macOS (Apple Silicon)
\`\`\`bash
tar -xzf obitools4_${v}_darwin_arm64.tar.gz
\`\`\`

All OBITools4 binaries are included in each archive.

### From source

You can also compile and install OBITools4 directly from source using the
installation script:

\`\`\`bash
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version ${v}
\`\`\`

By default binaries are installed in \`/usr/local/bin\`. Use \`--install-dir\` to
change the destination and \`--obitools-prefix\` to add a prefix to command names:

\`\`\`bash
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\
bash -s -- --version ${v} --install-dir ~/local --obitools-prefix k
\`\`\`
INSTALL_EOF
}
|
||||||
|
|
||||||
|
# Print usage information for this script on stdout.
display_help() {
    cat <<EOF
Usage: $(basename "$0") [OPTIONS]

Generate GitHub-compatible Markdown release notes for an OBITools4 version.

Options:
  -v, --version VERSION  Target version (e.g., 4.4.15). Default: latest.
  -l, --list             List all available versions and exit.
  -r, --raw              Output raw commit list without LLM summarization.
  -c, --context          Show the exact context (commits + prompt) sent to the LLM.
  -m, --model MODEL      LLM model for orla (default: $LLM_MODEL).
  -h, --help             Display this help message.

Examples:
  $(basename "$0")               # release notes for the latest version
  $(basename "$0") -v 4.4.15     # release notes for a specific version
  $(basename "$0") -l            # list versions
  $(basename "$0") -r -v 4.4.15  # raw commit log for a version
  $(basename "$0") -c -v 4.4.16  # show LLM context for a version
EOF
}
|
||||||
|
|
||||||
|
# Fetch all Release tags from GitHub API (sorted newest first)
# Emits one bare version number per line (e.g. "4.4.15") on stdout.
# NOTE(review): the JSON is parsed with grep/sed, which assumes the API
# pretty-prints each `"tag_name": "..."` on its own line with exactly one
# space after the colon — confirm, or switch to jq (required later anyway).
fetch_versions() {
    curl -sf "${GITHUB_API}/releases" \
        | grep '"tag_name":' \
        | sed -E 's/.*"tag_name": "Release_([0-9.]+)".*/\1/' \
        | sort -V -r
}
|
||||||
|
|
||||||
|
# ── Parse arguments ──────────────────────────────────────────────────────

while [ "$#" -gt 0 ]; do
    case "$1" in
        -v|--version) VERSION="$2"; shift 2 ;;
        -l|--list) LIST_VERSIONS=true; shift ;;
        -r|--raw) RAW_MODE=true; shift ;;
        -c|--context) CONTEXT_MODE=true; shift ;;
        -m|--model) LLM_MODEL="$2"; shift 2 ;;
        -h|--help) display_help; exit 0 ;;
        *) die "Unsupported option: $1" ;;
    esac
done

# ── List mode ────────────────────────────────────────────────────────────

# Headers go to stderr so stdout stays machine-readable (one version/line).
if [ "$LIST_VERSIONS" = true ]; then
    echo "Available OBITools4 versions:" >&2
    echo "==============================" >&2
    fetch_versions
    exit 0
fi
|
||||||
|
|
||||||
|
# ── Resolve versions ─────────────────────────────────────────────────────
# Determines previous_tag / VERSION / commit_list for the two modes:
# pre-release (no -v: local HEAD vs latest published tag) and published
# release (-v X.Y.Z: between two GitHub tags via the REST API).

all_versions=$(fetch_versions)
[ -z "$all_versions" ] && die "Could not fetch versions from GitHub"

# FIX: latest_version was read below but never assigned anywhere in the
# script, so pre-release mode always built the non-existent tag
# "Release_" and computed a bogus next version. fetch_versions sorts
# newest-first, so the latest version is the first line.
latest_version=$(echo "$all_versions" | head -n 1)

if [ -z "$VERSION" ]; then
    # ── Pre-release mode: local HEAD vs latest GitHub tag ──────────────────
    PRE_RELEASE=true
    previous_tag="Release_${latest_version}"
    VERSION="pre-$(next_patch "$latest_version")"

    echo "Pre-release mode: $previous_tag -> HEAD (as $VERSION)" >&2

    # Need to be in a git repo
    if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
        die "Not inside a git repository. Pre-release mode requires a local git repo."
    fi

    # Check that the previous tag exists locally; fetch tags once if not.
    if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then
        echo "Tag $previous_tag not found locally, fetching..." >&2
        git fetch --tags 2>/dev/null || true
        if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then
            die "Tag $previous_tag not found locally or remotely"
        fi
    fi

    # Get local commits from the tag to HEAD (full messages)
    commit_list=$(git log --format="%h %B" "${previous_tag}..HEAD" 2>/dev/null)

    if [ -z "$commit_list" ]; then
        die "No local commits found since $previous_tag"
    fi
else
    # ── Published release mode: between two GitHub tags ────────────────────
    PRE_RELEASE=false
    tag_name="Release_${VERSION}"

    # Verify the requested version exists
    if ! echo "$all_versions" | grep -qx "$VERSION"; then
        die "Version $VERSION not found. Use -l to list available versions."
    fi

    # Find the previous version: the line right after VERSION in the
    # newest-first list. grep -A1 | tail -1 yields VERSION itself when it
    # is the oldest entry, hence the equality check below.
    previous_version=$(echo "$all_versions" | grep -A1 -x "$VERSION" | tail -1)

    if [ "$previous_version" = "$VERSION" ] || [ -z "$previous_version" ]; then
        previous_tag=""
        echo "No previous version found -- will include all commits for $tag_name" >&2
    else
        previous_tag="Release_${previous_version}"
        echo "Generating notes: $previous_tag -> $tag_name" >&2
    fi

    # Fetch commit messages between tags via GitHub compare API; with no
    # previous tag, fall back to the last 50 commits reachable from the tag.
    if [ -n "$previous_tag" ]; then
        commits_json=$(curl -sf "${GITHUB_API}/compare/${previous_tag}...${tag_name}")
        if [ -z "$commits_json" ]; then
            die "Could not fetch commit comparison from GitHub"
        fi
        commit_list=$(echo "$commits_json" \
            | jq -r '.commits[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null)
    else
        commits_json=$(curl -sf "${GITHUB_API}/commits?sha=${tag_name}&per_page=50")
        if [ -z "$commits_json" ]; then
            die "Could not fetch commits from GitHub"
        fi
        commit_list=$(echo "$commits_json" \
            | jq -r '.[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null)
    fi

    if [ -z "$commit_list" ]; then
        die "No commits found between $previous_tag and $tag_name"
    fi
fi
|
||||||
|
|
||||||
|
# ── LLM prompt (shared by context mode and summarization) ────────────────

LLM_PROMPT="Summarize the following commits into a GitHub release note for version ${VERSION}. \
Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping \
that is irrelevant to end users. Describe each user-facing change precisely without exposing \
code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this \
exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}"

# ── Raw mode: just output the commit list ────────────────────────────────

if [ "$RAW_MODE" = true ]; then
    echo "# Release ${VERSION}"
    echo ""
    echo "## Commits"
    echo ""
    echo "$commit_list" | while IFS= read -r line; do
        echo "- ${line}"
    done
    installation_section "$VERSION"
    exit 0
fi

# ── Context mode: show what would be sent to the LLM ────────────────────

if [ "$CONTEXT_MODE" = true ]; then
    echo "=== LLM Model ==="
    echo "$LLM_MODEL"
    echo ""
    echo "=== Prompt ==="
    echo "$LLM_PROMPT"
    echo ""
    echo "=== Stdin (commit list) ==="
    echo "$commit_list"
    exit 0
fi

# ── LLM summarization ───────────────────────────────────────────────────

if ! command -v orla >/dev/null 2>&1; then
    die "orla is required for LLM summarization. Use -r for raw output."
fi

if ! command -v jq >/dev/null 2>&1; then
    die "jq is required for JSON parsing. Use -r for raw output."
fi

echo "Summarizing with LLM ($LLM_MODEL)..." >&2

# `|| true` keeps set-e-style callers alive; an empty result is handled below.
raw_output=$(echo "$commit_list" | \
    ORLA_MAX_TOOL_CALLS=50 orla agent -m "$LLM_MODEL" \
    "$LLM_PROMPT" \
    2>/dev/null) || true

if [ -z "$raw_output" ]; then
    echo "Warning: LLM returned empty output, falling back to raw mode" >&2
    # NOTE(review): in pre-release mode VERSION is "pre-X.Y.Z"; the
    # re-exec'ed -v path validates versions against published tags and
    # would reject it — confirm this fallback works for pre-releases.
    exec "$0" -r -v "$VERSION"
fi

# Sanitize: extract JSON object, strip control characters
sanitized=$(echo "$raw_output" | sed -n '/^{/,/^}/p' | tr -d '\000-\011\013-\014\016-\037')

release_title=$(echo "$sanitized" | jq -r '.title // empty' 2>/dev/null)
release_body=$(echo "$sanitized" | jq -r '.body // empty' 2>/dev/null)

if [ -n "$release_title" ] && [ -n "$release_body" ]; then
    echo "# ${release_title}"
    echo ""
    echo "$release_body"
    installation_section "$VERSION"
else
    echo "Warning: JSON parsing failed, falling back to raw mode" >&2
    exec "$0" -r -v "$VERSION"
fi
|
||||||
36
tools/json2md.py
Executable file
36
tools/json2md.py
Executable file
@@ -0,0 +1,36 @@
|
|||||||
|
#!/usr/bin/env python3
"""
Read potentially malformed JSON from stdin (aichat output), extract title and
body, and print them as plain text: title on first line, blank line, then body.
Exits with 1 on failure (no output).
"""

import sys
import json
import re


def _parse_json(s):
    """Parse s as JSON; on failure, retry with raw newlines escaped.

    The retry rescues payloads whose string values contain literal
    newlines (invalid JSON, common in LLM output). Returns the decoded
    object, or None when both attempts fail.
    """
    try:
        return json.loads(s)
    except Exception:
        # Escape every unescaped newline and try once more. NOTE: this
        # also touches newlines between tokens, so pretty-printed JSON
        # still fails here — same limitation as the original fallback.
        try:
            return json.loads(re.sub(r'(?<!\\)\n', r'\\n', s))
        except Exception:
            return None


def extract(text):
    """Pull title/body out of the first {...} span found in text.

    Returns "title\\n\\nbody" on success, or None when no JSON object is
    present, parsing fails, or either field is missing/empty.
    """
    m = re.search(r'\{.*\}', text, re.DOTALL)
    if not m:
        return None
    obj = _parse_json(m.group())
    if obj is None:
        return None
    title = obj.get('title', '').strip()
    body = obj.get('body', '').strip()
    if not title or not body:
        return None
    return f"{title}\n\n{body}"


if __name__ == "__main__":
    # Guarded so importing this module (e.g. from tests) does not block
    # on stdin; CLI behavior is unchanged from the original flat script.
    result = extract(sys.stdin.read())
    if result is None:
        sys.exit(1)
    print(result)
|
||||||
@@ -1 +1 @@
|
|||||||
4.4.12
|
4.4.29
|
||||||
|
|||||||
Reference in New Issue
Block a user