Memory-aware Batching and Static Linux Builds

### Memory-Aware Batching - Replaced single batch size limits with configurable min/max bounds and memory limits for more precise control over resource usage. - Added `--batch-mem` CLI option to enable adaptive batching based on estimated sequence memory footprint (e.g., 128K, 64M, 1G). - Introduced `RebatchBySize()` with explicit support for both byte and count limits, flushing when either threshold is exceeded. - Implemented conservative memory estimation via `BioSequence.MemorySize()` and enhanced garbage collection to trigger explicit cleanup after large batch discards. - Updated internal batching logic across `batchiterator.go`, `fragment.go`, and `obirefidx.go` to consistently use default memory (128 MB) and size (min: 1, max: 2000) bounds. ### Linux Build Enhancements - Enabled static linking for Linux binaries using musl, producing portable, self-contained executables without external dependencies. ### Notes - This release consolidates and improves batching behavior introduced in 4.4.20, with no breaking changes to the public API. - All user-facing batching behavior is now governed by consistent memory and count constraints, improving predictability and stability during large dataset processing.
Replace Rebatch with RebatchBySize using default batch parameters
2026-03-25 13:30:52 +00:00 · 2026-03-13 15:16:41 +01:00 · 2026-03-13 15:16:33 +01:00 · 2026-03-13 15:07:35 +01:00 · 2026-03-13 14:54:21 +01:00 · 2026-03-13 14:26:31 +01:00
32 changed files with 1786 additions and 139 deletions
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -62,6 +62,12 @@ jobs:
          TAG=${GITHUB_REF#refs/tags/Release_}
          echo "version=$TAG" >> $GITHUB_OUTPUT

+      - name: Install build tools (Linux)
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt-get update -q
+          sudo apt-get install -y musl-tools
+
      - name: Install build tools (macOS)
        if: runner.os == 'macOS'
        run: |
@@ -74,8 +80,13 @@ jobs:
          GOOS: ${{ matrix.goos }}
          GOARCH: ${{ matrix.goarch }}
          VERSION: ${{ steps.get_version.outputs.version }}
+          CC: ${{ matrix.goos == 'linux' && 'musl-gcc' || '' }}
        run: |
-          make obitools
+          if [ "$GOOS" = "linux" ]; then
+            make LDFLAGS='-linkmode=external -extldflags=-static' obitools
+          else
+            make obitools
+          fi
          mkdir -p artifacts
          # Create a single tar.gz with all binaries for this platform
          tar -czf artifacts/obitools4_${VERSION}_${{ matrix.output_name }}.tar.gz -C build .
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@
 **/*.tgz
 **/*.yaml
 **/*.csv
+**/*.pb.gz
 xx

 .rhistory
--- a/145
+++ b/145
@@ -2,9 +2,17 @@
 #export GOBIN=$(GOPATH)/bin
 #export PATH=$(GOBIN):$(shell echo $${PATH})

+.DEFAULT_GOAL := all
+
+GREEN  := \033[0;32m
+YELLOW := \033[0;33m
+BLUE   := \033[0;34m
+NC     := \033[0m
+
 GOFLAGS=
+LDFLAGS=
 GOCMD=go
-GOBUILD=$(GOCMD) build $(GOFLAGS)
+GOBUILD=$(GOCMD) build $(GOFLAGS) $(if $(LDFLAGS),-ldflags='$(LDFLAGS)')
 GOGENERATE=$(GOCMD) generate
 GOCLEAN=$(GOCMD) clean
 GOTEST=$(GOCMD) test
@@ -60,6 +68,28 @@ endif

 OUTPUT:=$(shell mktemp)

+help:
+	@printf "$(GREEN)OBITools4 Makefile$(NC)\n\n"
+	@printf "$(BLUE)Main targets:$(NC)\n"
+	@printf "  %-20s %s\n" "all"          "Build all obitools (default)"
+	@printf "  %-20s %s\n" "obitools"     "Build all obitools binaries to build/"
+	@printf "  %-20s %s\n" "test"         "Run Go unit tests"
+	@printf "  %-20s %s\n" "obitests"     "Run integration tests (obitests/)"
+	@printf "  %-20s %s\n" "bump-version" "Increment patch version (or set with VERSION=x.y.z)"
+	@printf "  %-20s %s\n" "update-deps"  "Update all Go dependencies"
+	@printf "\n$(BLUE)Jujutsu workflow:$(NC)\n"
+	@printf "  %-20s %s\n" "jjnew"        "Document current commit and start a new one"
+	@printf "  %-20s %s\n" "jjpush"       "Release: describe, bump, generate notes, push PR, tag (VERSION=x.y.z optional)"
+	@printf "  %-20s %s\n" "jjfetch"      "Fetch latest commits from origin"
+	@printf "\n$(BLUE)Required tools:$(NC)\n"
+	@printf "  %-20s " "go";      command -v go      >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(go version)" || printf "$(YELLOW)✗ not found$(NC)\n"
+	@printf "  %-20s " "git";     command -v git     >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(git --version)" || printf "$(YELLOW)✗ not found$(NC)\n"
+	@printf "  %-20s " "jj";      command -v jj      >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jj --version)" || printf "$(YELLOW)✗ not found$(NC)\n"
+	@printf "  %-20s " "gh";      command -v gh      >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(gh --version | head -1)" || printf "$(YELLOW)✗ not found$(NC)  (brew install gh)\n"
+	@printf "\n$(BLUE)Optional tools (release notes generation):$(NC)\n"
+	@printf "  %-20s " "aichat";  command -v aichat  >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(aichat --version)" || printf "$(YELLOW)✗ not found$(NC)  (https://github.com/sigoden/aichat)\n"
+	@printf "  %-20s " "jq";      command -v jq      >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jq --version)" || printf "$(YELLOW)✗ not found$(NC)  (brew install jq)\n"
+
 all: install-githook obitools

 obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS))
@@ -106,15 +136,20 @@ pkg/obioptions/version.go: version.txt .FORCE
 	@rm -f $(OUTPUT)

 bump-version:
-	@echo "Incrementing version..."
 	@current=$$(cat version.txt); \
-	echo "  Current version: $$current"; \
-	major=$$(echo $$current | cut -d. -f1); \
-	minor=$$(echo $$current | cut -d. -f2); \
-	patch=$$(echo $$current | cut -d. -f3); \
-	new_patch=$$((patch + 1)); \
-	new_version="$$major.$$minor.$$new_patch"; \
-	echo "  New version: $$new_version"; \
+	if [ -n "$(VERSION)" ]; then \
+		new_version="$(VERSION)"; \
+		echo "Setting version to $$new_version (was $$current)"; \
+	else \
+		echo "Incrementing version..."; \
+		echo "  Current version: $$current"; \
+		major=$$(echo $$current | cut -d. -f1); \
+		minor=$$(echo $$current | cut -d. -f2); \
+		patch=$$(echo $$current | cut -d. -f3); \
+		new_patch=$$((patch + 1)); \
+		new_version="$$major.$$minor.$$new_patch"; \
+		echo "  New version: $$new_version"; \
+	fi; \
 	echo "$$new_version" > version.txt
 	@echo "✓ Version updated in version.txt"
 	@$(MAKE) pkg/obioptions/version.go
@@ -128,40 +163,76 @@ jjnew:
 	@echo "$(GREEN)✓ New commit created$(NC)"

 jjpush:
-	@echo "$(YELLOW)→ Pushing commit to repository...$(NC)"
+	@$(MAKE) jjpush-describe
+	@$(MAKE) jjpush-bump
+	@$(MAKE) jjpush-notes
+	@$(MAKE) jjpush-push
+	@$(MAKE) jjpush-tag
+	@echo "$(GREEN)✓ Release complete$(NC)"
+
+jjpush-describe:
 	@echo "$(BLUE)→ Documenting current commit...$(NC)"
 	@jj auto-describe
+
+jjpush-bump:
 	@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
 	@jj new
-	@previous_version=$$(cat version.txt); \
-	$(MAKE) bump-version; \
-	version=$$(cat version.txt); \
-	tag_name="Release_$$version"; \
-	previous_tag="Release_$$previous_version"; \
-	echo "$(BLUE)→ Documenting version bump commit...$(NC)"; \
-	jj auto-describe; \
-	echo "$(BLUE)→ Generating release notes from $$previous_tag to current commit...$(NC)"; \
-	if command -v orla >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then \
-		release_json=$$(ORLA_MAX_TOOL_CALLS=50 jj log -r "$$previous_tag::@" -T 'commit_id.short() ++ " " ++ description' | \
-			orla agent -m ollama:qwen3-coder-next:latest \
-			"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}"); \
-		release_json=$$(echo "$$release_json" | sed -n '/^{/,/^}/p'); \
-		release_title=$$(echo "$$release_json" | jq -r '.title // empty') ; \
-		release_body=$$(echo "$$release_json" | jq -r '.body // empty') ; \
-		if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \
-			release_message="$$release_title"$$'\n\n'"$$release_body"; \
+	@$(MAKE) bump-version
+
+jjpush-notes:
+	@version=$$(cat version.txt); \
+	echo "$(BLUE)→ Generating release notes for version $$version...$(NC)"; \
+	release_title="Release $$version"; \
+	release_body=""; \
+	if command -v aichat >/dev/null 2>&1; then \
+		previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' 2>/dev/null); \
+		if [ -z "$$previous_tag" ]; then \
+			echo "$(YELLOW)⚠ No previous Release tag found, skipping release notes$(NC)"; \
 		else \
-			echo "$(YELLOW)⚠ JSON parsing failed, falling back to raw output$(NC)"; \
-			release_message="Release $$version"$$'\n\n'"$$release_json"; \
+			raw_output=$$(git log --format="%h %B" "$$previous_tag..HEAD" | \
+				aichat \
+				"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}" 2>/dev/null) || true; \
+			if [ -n "$$raw_output" ]; then \
+				notes=$$(printf '%s\n' "$$raw_output" | python3 tools/json2md.py 2>/dev/null); \
+				if [ -n "$$notes" ]; then \
+					release_title=$$(echo "$$notes" | head -1); \
+					release_body=$$(echo "$$notes" | tail -n +3); \
+				else \
+					echo "$(YELLOW)⚠ JSON parsing failed, using default release message$(NC)"; \
+				fi; \
+			fi; \
 		fi; \
-	else \
-		release_message="Release $$version"; \
 	fi; \
-	echo "$(BLUE)→ Pushing commits and creating tag $$tag_name...$(NC)"; \
-	jj git push --change @; \
-	git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "Tag $$tag_name already exists"; \
-	git push origin "$$tag_name" 2>/dev/null || echo "Tag already pushed"
-	@echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)"
+	printf '%s' "$$release_title" > /tmp/obitools4-release-title.txt; \
+	printf '%s' "$$release_body" > /tmp/obitools4-release-body.txt; \
+	echo "$(BLUE)→ Setting release notes as commit description...$(NC)"; \
+	jj desc -m "$$release_title"$$'\n\n'"$$release_body"
+
+jjpush-push:
+	@echo "$(BLUE)→ Pushing commits...$(NC)"
+	@jj git push --change @
+	@echo "$(BLUE)→ Creating/updating PR...$(NC)"
+	@release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$(cat version.txt)"); \
+	release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \
+	branch=$$(jj log -r @ --no-graph -T 'bookmarks.map(|b| b.name()).join("\n")' 2>/dev/null | head -1); \
+	if [ -n "$$branch" ] && command -v gh >/dev/null 2>&1; then \
+		gh pr create --title "$$release_title" --body "$$release_body" --base master --head "$$branch" 2>/dev/null \
+		|| gh pr edit "$$branch" --title "$$release_title" --body "$$release_body" 2>/dev/null \
+		|| echo "$(YELLOW)⚠ Could not create/update PR$(NC)"; \
+	fi
+
+jjpush-tag:
+	@version=$$(cat version.txt); \
+	tag_name="Release_$$version"; \
+	release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$version"); \
+	release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \
+	install_section=$$'\n## Installation\n\n### Pre-built binaries\n\nDownload the appropriate archive for your system from the\n[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_'"$$version"')\nand extract it:\n\n#### Linux (AMD64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_amd64.tar.gz\n```\n\n#### Linux (ARM64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_arm64.tar.gz\n```\n\n#### macOS (Intel)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_amd64.tar.gz\n```\n\n#### macOS (Apple Silicon)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_arm64.tar.gz\n```\n\nAll OBITools4 binaries are included in each archive.\n\n### From source\n\nYou can also compile and install OBITools4 directly from source using the\ninstallation script:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version '"$$version"'\n```\n\nBy default binaries are installed in `/usr/local/bin`. Use `--install-dir` to\nchange the destination and `--obitools-prefix` to add a prefix to command names:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\\n  bash -s -- --version '"$$version"' --install-dir ~/local --obitools-prefix k\n```\n'; \
+	release_message="$$release_title"$$'\n\n'"$$release_body$$install_section"; \
+	echo "$(BLUE)→ Creating tag $$tag_name...$(NC)"; \
+	git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "$(YELLOW)⚠ Tag $$tag_name already exists$(NC)"; \
+	echo "$(BLUE)→ Pushing tag $$tag_name...$(NC)"; \
+	git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)"; \
+	rm -f /tmp/obitools4-release-title.txt /tmp/obitools4-release-body.txt

 jjfetch:
 	@echo "$(YELLOW)→ Pulling latest commits...$(NC)"
@@ -169,5 +240,5 @@ jjfetch:
 	@jj new master@origin
 	@echo "$(GREEN)✓ Latest commits pulled$(NC)"

-.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch bump-version .FORCE
+.PHONY: all obitools update-deps obitests githubtests help jjnew jjpush jjpush-describe jjpush-bump jjpush-notes jjpush-push jjpush-tag jjfetch bump-version .FORCE
 .FORCE:
--- a/README.md
+++ b/README.md
@@ -32,8 +32,12 @@ The installation script offers several options:
 > 
 >  -p, --obitools-prefix   Prefix added to the obitools command names if you
 >                          want to have several versions of obitools at the
->                          same time on your system (as example `-p g` will produce 
+>                          same time on your system (as example `-p g` will produce
 >                          `gobigrep` command instead of `obigrep`).
+>
+>  -j, --jobs              Number of parallel jobs used for compilation
+>                          (default: 1). Increase this value to speed up
+>                          compilation on multi-core systems (e.g., `-j 4`).

 ### Examples

--- a/blackboard/Prospective/large_sequence_parsing.md
+++ b/blackboard/Prospective/large_sequence_parsing.md
@@ -0,0 +1,264 @@
+# Optimisation du parsing des grandes séquences
+
+## Contexte
+
+OBITools4 doit pouvoir traiter des séquences de taille chromosomique (plusieurs Gbp), notamment
+issues de fichiers GenBank/EMBL (assemblages de génomes) ou de fichiers FASTA convertis depuis
+ces formats.
+
+## Architecture actuelle
+
+### Pipeline de lecture (`pkg/obiformats/`)
+
+```
+ReadFileChunk (goroutine)
+    → ChannelFileChunk
+    → N × _ParseGenbankFile / _ParseFastaFile (goroutines)
+    → IBioSequence
+```
+
+`ReadFileChunk` (`file_chunk_read.go`) lit le fichier par morceaux via une chaîne de
+`PieceOfChunk` (rope). Chaque nœud fait `fileChunkSize` bytes :
+
+- GenBank/EMBL : 128 MB (`1024*1024*128`)
+- FASTA/FASTQ  : 1 MB (`1024*1024`)
+
+La chaîne est accumulée jusqu'à trouver la fin du dernier enregistrement complet (splitter),
+puis `Pack()` est appelé pour fusionner tous les nœuds en un seul buffer contigu. Ce buffer
+est transmis au parseur via `FileChunk.Raw *bytes.Buffer`.
+
+### Parseur GenBank (`genbank_read.go`)
+
+`GenbankChunkParser` reçoit un `io.Reader` sur le buffer packé, lit ligne par ligne via
+`bufio.NewReader` (buffer 4096 bytes), et pour chaque ligne de la section `ORIGIN` :
+
+```go
+line = string(bline)                        // allocation par ligne
+cleanline := strings.TrimSpace(line)        // allocation
+parts := strings.SplitN(cleanline, " ", 7) // allocation []string + substrings
+for i := 1; i < lparts; i++ {
+    seqBytes.WriteString(parts[i])
+}
+```
+
+Point positif : `seqBytes` est pré-alloué grâce à `lseq` extrait de la ligne `LOCUS`.
+
+### Parseur FASTA (`fastaseq_read.go`)
+
+`FastaChunkParser` lit **octet par octet** via `scanner.ReadByte()`. Pour 3 Gbp :
+3 milliards d'appels. `seqBytes` est un `bytes.Buffer{}` sans pré-allocation.
+
+## Problème principal
+
+Pour une séquence de plusieurs Gbp, `Pack()` fusionne une chaîne de ~N nœuds de 128 MB en
+un seul buffer contigu. C'est une allocation de N × 128 MB suivie d'une copie de toutes les
+données. Bien que l'implémentation de `Pack()` soit efficace (libère les nœuds au fur et à
+mesure via `slices.Grow`), la copie est inévitable avec l'architecture actuelle.
+
+De plus, le parseur GenBank produit des dizaines de millions d'allocations temporaires pour
+parser la section `ORIGIN` (une par ligne).
+
+## Invariant clé découvert
+
+**Si la rope a plus d'un nœud, le premier nœud seul ne se termine pas sur une frontière
+d'enregistrement** (pas de `//\n` en fin de `piece1`).
+
+Preuve par construction dans `ReadFileChunk` :
+- `splitter` est appelé dès le premier nœud (ligne 157)
+- Si `end >= 0` → frontière trouvée dans 128 MB → boucle interne sautée → rope à 1 nœud
+- Si `end < 0` → boucle interne ajoute des nœuds → rope à ≥ 2 nœuds
+
+Corollaire : si rope à 1 nœud, `Pack()` ne fait rien (aucun nœud suivant).
+
+**Attention** : rope à ≥ 2 nœuds ne signifie pas qu'il n'y a qu'une seule séquence dans
+la rope. La rope packée peut contenir plusieurs enregistrements complets. Exemple : records
+de 80 MB → `nextpieces` (48 MB de reste) + nouveau nœud (128 MB) = rope à 2 nœuds
+contenant 2 records complets + début d'un troisième.
+
+L'invariant dit seulement que `piece1` seul est incomplet — pas que la rope entière
+ne contient qu'un seul record.
+
+**Invariant : le dernier FileChunk envoyé finit sur une frontière d'enregistrement.**
+
+Deux chemins dans `ReadFileChunk` :
+
+1. **Chemin normal** (`end >= 0` via `splitter`) : le buffer est explicitement tronqué à
+   `end` (ligne 200 : `pieces.data = pieces.data[:end]`). Frontière garantie par construction
+   pour tous les formats. ✓
+
+2. **Chemin EOF** (`end < 0`, `end = pieces.Len()`) : tout le reste du fichier est envoyé.
+   - **GenBank/EMBL** : présuppose fichier bien formé (se termine par `//\n`). Le parseur
+     lève un `log.Fatalf` sur tout état inattendu — filet de sécurité suffisant. ✓
+   - **FASTQ** : présupposé, vérifié par le parseur. ✓
+   - **FASTA** : garanti par le format lui-même (fin d'enregistrement = EOF ou `>`). ✓
+
+**Hypothèse de travail adoptée** : les fichiers d'entrée sont bien formés. Dans le pire cas,
+le parseur lèvera une erreur explicite. Il n'y a pas de risque de corruption silencieuse.
+
+## Piste d'optimisation : se dispenser de Pack()
+
+### Idée centrale
+
+Au lieu de fusionner la rope avant de la passer au parseur, **parser directement la rope
+nœud par nœud**, et **écrire la séquence compactée in-place dans le premier nœud**.
+
+Pourquoi c'est sûr :
+- Le header (LOCUS, DEFINITION, SOURCE, FEATURES) est **petit** et traité en premier
+- La séquence (ORIGIN) est **à la fin** du record
+- Au moment d'écrire la séquence depuis l'offset 0 de `piece1`, le pointeur de lecture
+  est profond dans la rope (offset >> 0) → jamais de collision
+- La séquence compactée est toujours plus courte que les données brutes
+
+### Pré-allocation
+
+Pour GenBank/EMBL : `lseq` est connu dès la ligne `LOCUS`/`ID` (première ligne, dans
+`piece1`). On peut faire `slices.Grow(piece1.data, lseq)` dès ce moment.
+
+Pour FASTA : pas de taille garantie dans le header, mais `rope.Len()` donne un majorant.
+On peut utiliser `rope.Len() / 2` comme estimation initiale.
+
+### Gestion des jonctions entre nœuds
+
+Une ligne peut chevaucher deux nœuds (rare avec 128 MB, mais possible). Solution : carry
+buffer de ~128 bytes pour les quelques bytes en fin de nœud.
+
+### Cas FASTA/FASTQ multi-séquences
+
+Un FileChunk peut contenir N séquences (notamment FASTA/FASTQ courts). Dans ce cas
+l'écriture in-place dans `piece1` n'est pas applicable directement — on écrase des données
+nécessaires aux séquences suivantes.
+
+Stratégie par cas :
+- **Rope à 1 nœud** (record ≤ 128 MB) : `Pack()` est trivial (no-op), parseur actuel OK
+- **Rope à ≥ 2 nœuds** : par l'invariant, `piece1` ne contient pas de record complet →
+  une seule grande séquence → in-place applicable
+
+### Format d'une ligne séquence GenBank (Après ORIGIN)
+
+```
+/^ *[0-9]+( [nuc]{10}){0,5} [nuc]{1,10}/
+```
+
+### Format d'une ligne séquence GenBank (Après SQ)
+
+La ligne SQ contient aussi la taille de la séquence
+
+```
+/^ *( [nuc]{10}){0,5} [nuc]{1,10} *[0-9]+/
+```
+
+Compactage in-place sur `bline` ([]byte brut, sans conversion `string`) :
+
+```go
+w := 0
+i := 0
+for i < len(bline) && bline[i] == ' '  { i++ }   // skip indentation
+for i < len(bline) && bline[i] <= '9'  { i++ }   // skip position number
+for ; i < len(bline); i++ {
+    if bline[i] != ' ' {
+        bline[w] = bline[i]
+        w++
+    }
+}
+// écrire bline[:w] directement dans piece1.data[seqOffset:]
+```
+
+## Changements nécessaires
+
+1. **`FileChunk`** : exposer la rope `*PieceOfChunk` non-packée en plus (ou à la place)
+   de `Raw *bytes.Buffer`
+2. **`GenbankChunkParser` / `EmblChunkParser`** : accepter `*PieceOfChunk`, parser la
+   rope séquentiellement avec carry buffer pour les jonctions
+3. **`FastaChunkParser`** : idem, avec in-place conditionnel selon taille de la rope
+4. **`ReadFileChunk`** : ne pas appeler `Pack()` avant envoi sur le channel (ou version
+   alternative `ReadFileChunkRope`)
+
+## Fichiers concernés
+
+- `pkg/obiformats/file_chunk_read.go` — structure rope, `ReadFileChunk`
+- `pkg/obiformats/genbank_read.go` — `GenbankChunkParser`, `_ParseGenbankFile`
+- `pkg/obiformats/embl_read.go` — `EmblChunkParser`, `ReadEMBL`
+- `pkg/obiformats/fastaseq_read.go` — `FastaChunkParser`, `_ParseFastaFile`
+- `pkg/obiformats/fastqseq_read.go` — parseur FASTQ (même structure)
+
+## Plan d'implémentation : parseur GenBank sur rope
+
+### Contexte
+
+Baseline mesurée : `obiconvert gbpln640.seq.gz` → 49s real, 42s user, 29s sys, **57 GB RSS**.
+Le sys élevé indique des allocations massives. Deux causes :
+1. `Pack()` : fusionne toute la rope (N × 128 MB) en un buffer contigu avant de parser
+2. Parser ORIGIN : `string(bline)` + `TrimSpace` + `SplitN` × millions de lignes
+
+### 1. `gbRopeScanner`
+
+Struct de lecture ligne par ligne sur la rope, sans allocation heap :
+
+```go
+type gbRopeScanner struct {
+    current *PieceOfChunk
+    pos     int
+    carry   [256]byte  // stack-allocated, max GenBank line = 80 chars
+    carryN  int
+}
+```
+
+`ReadLine()` :
+- Cherche `\n` dans `current.data[pos:]` via `bytes.IndexByte`
+- Si trouvé sans carry : retourne slice direct du node (zéro alloc)
+- Si trouvé avec carry : copie dans carry buffer, retourne `carry[:n]`
+- Si non trouvé : copie le reste dans carry, avance au node suivant, recommence
+- EOF : retourne `carry[:carryN]` puis nil
+
+`extractSequence(dest []byte, UtoT bool) int` :
+- Scan direct des bytes pour section ORIGIN, sans passer par ReadLine
+- Machine d'états : lineStart → skip espaces/digits → copier nucléotides dans dest
+- Stop sur `//` en début de ligne
+- Zéro allocation, UtoT inline
+
+### 2. `GenbankChunkParserRope`
+
+```go
+func GenbankChunkParserRope(source string, rope *PieceOfChunk,
+    withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error)
+```
+
+- Même machine d'états que `GenbankChunkParser`, sur `[]byte` (`bytes.HasPrefix`)
+- LOCUS : extrait `id` et `lseq` par scan direct (remplace `_seqlenght_rx`)
+- FEATURES / default inFeature : taxid extrait par scan de `/db_xref="taxon:`
+  dans la source feature ; `featBytes` rempli seulement si `withFeatureTable=true`
+- DEFINITION : toujours conservée
+- ORIGIN : `dest = make([]byte, 0, lseq+20)` puis `s.extractSequence(dest, UtoT)`
+
+### 3. Modifications `_ParseGenbankFile` et `ReadGenbank`
+
+`_ParseGenbankFile` utilise `chunk.Rope` :
+```go
+sequences, err := GenbankChunkParserRope(chunk.Source, chunk.Rope, ...)
+```
+
+`ReadGenbank` passe `pack=false` :
+```go
+entry_channel := ReadFileChunk(..., false)
+```
+
+### 4. Ce qui NE change pas
+
+- `GenbankChunkParser` reste (référence, tests)
+- `ReadFileChunk`, `Pack()`, autres parseurs (EMBL, FASTA, FASTQ) : inchangés
+
+### 5. Gains attendus
+
+- **RSS** : pic ≈ 128 MB × workers (au lieu de N × 128 MB)
+- **Temps sys** : élimination des mmap/munmap pour les gros buffers
+- **Temps user** : ~50M allocations éliminées
+
+### 6. Vérification
+
+```bash
+/usr/local/go/bin/go build ./...
+diff <(obiconvert gbpln640.seq.gz) gbpln640.reference.fasta
+cd bugs/genbank && ./benchmark.sh gbpln640.seq.gz
+```
+
+Cible : RSS < 1 GB, temps comparable ou meilleur.
--- a/install_obitools.sh
+++ b/install_obitools.sh
@@ -7,6 +7,7 @@ INSTALL_DIR="/usr/local"
 OBITOOLS_PREFIX=""
 VERSION=""
 LIST_VERSIONS=false
+JOBS=1

 # Help message
 function display_help {
@@ -21,6 +22,7 @@ function display_help {
  echo "                          gobigrep command instead of obigrep)."
  echo "  -v, --version           Install a specific version (e.g., 4.4.8)."
  echo "                          If not specified, installs the latest version."
+  echo "  -j, --jobs              Number of parallel jobs for compilation (default: 1)."
  echo "  -l, --list              List all available versions and exit."
  echo "  -h, --help              Display this help message."
  echo ""
@@ -65,6 +67,10 @@ while [ "$#" -gt 0 ]; do
      VERSION="$2"
      shift 2
      ;;
+    -j|--jobs)
+      JOBS="$2"
+      shift 2
+      ;;
    -l|--list)
      LIST_VERSIONS=true
      shift
@@ -122,9 +128,15 @@ mkdir -p "${WORK_DIR}/cache" \
      exit 1)

 # Create installation directory
-mkdir -p "${INSTALL_DIR}/bin" 2> /dev/null \
-  || (echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
-      sudo mkdir -p "${INSTALL_DIR}/bin")
+if ! mkdir -p "${INSTALL_DIR}/bin" 2>/dev/null; then
+  if [ ! -w "$(dirname "${INSTALL_DIR}")" ] && [ ! -w "${INSTALL_DIR}" ]; then
+    echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
+    sudo mkdir -p "${INSTALL_DIR}/bin"
+  else
+    echo "Error: Could not create ${INSTALL_DIR}/bin (check path or disk space)" 1>&2
+    exit 1
+  fi
+fi

 if [[ ! -d "${INSTALL_DIR}/bin" ]]; then
  echo "Could not create ${INSTALL_DIR}/bin directory for installing obitools" 1>&2
@@ -208,16 +220,29 @@ mkdir -p vendor

 # Build with or without prefix
 if [[ -z "$OBITOOLS_PREFIX" ]] ; then
-  make GOFLAGS="-buildvcs=false"
+  make -j"${JOBS}" obitools GOFLAGS="-buildvcs=false"
 else
-  make GOFLAGS="-buildvcs=false" OBITOOLS_PREFIX="${OBITOOLS_PREFIX}"
+  make -j"${JOBS}" obitools GOFLAGS="-buildvcs=false" OBITOOLS_PREFIX="${OBITOOLS_PREFIX}"
 fi

 # Install binaries
 echo "Installing binaries to ${INSTALL_DIR}/bin..." 1>&2
-(cp build/* "${INSTALL_DIR}/bin" 2> /dev/null) \
-   || (echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
-       sudo cp build/* "${INSTALL_DIR}/bin")
+if ! cp build/* "${INSTALL_DIR}/bin" 2>/dev/null; then
+  if [ ! -w "${INSTALL_DIR}/bin" ]; then
+    echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
+    sudo cp build/* "${INSTALL_DIR}/bin"
+  else
+    echo "Error: Could not copy binaries to ${INSTALL_DIR}/bin" 1>&2
+    echo "  Source files: $(ls build/ 2>/dev/null || echo 'none found')" 1>&2
+    echo "" 1>&2
+    echo "The build directory has been preserved for manual recovery:" 1>&2
+    echo "  $(pwd)/build/" 1>&2
+    echo "You can install manually with:" 1>&2
+    echo "  cp $(pwd)/build/* ${INSTALL_DIR}/bin/" 1>&2
+    popd > /dev/null || true
+    exit 1
+  fi
+fi

 popd > /dev/null || exit

--- a/obitests/obitools/obisuperkmer/test.sh
+++ b/obitests/obitools/obisuperkmer/test.sh
@@ -4,8 +4,8 @@
 # Here give the name of the test serie
 #

-TEST_NAME=obisuperkmer
-CMD=obisuperkmer
+TEST_NAME=obik-super
+CMD=obik

 ######
 #
@@ -16,7 +16,7 @@ TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
 OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
 export PATH="${OBITOOLS_DIR}:${PATH}"

-MCMD="$(echo "${CMD:0:4}" | tr '[:lower:]' '[:upper:]')$(echo "${CMD:4}" | tr '[:upper:]' '[:lower:]')"
+MCMD="OBIk-super"

 TMPDIR="$(mktemp -d)"
 ntest=0
@@ -65,31 +65,10 @@ log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
 ####
 #### Below are the tests
 ####
-#### Before each test :
-####  - increment the variable ntest
-####
-#### Run the command as the condition of an if / then /else
-####  - The command must return 0 on success
-####  - The command must return an exit code different from 0 on failure
-####  - The datafiles are stored in the same directory than the test script
-####  - The test script directory is stored in the TEST_DIR variable
-####  - If result files have to be produced they must be stored
-####    in the temporary directory (TMPDIR variable)
-####
-#### then clause is executed on success of the command
-####  - Write a success message using the log function
-####  - increment the variable success
-####
-#### else clause is executed on failure of the command
-####  - Write a failure message using the log function
-####  - increment the variable failed
-####
 ######################################################################

-
-
 ((ntest++))
-if $CMD -h > "${TMPDIR}/help.txt" 2>&1
+if $CMD super -h > "${TMPDIR}/help.txt" 2>&1
 then
    log "$MCMD: printing help OK"
    ((success++))
@@ -100,7 +79,7 @@ fi

 # Test 1: Basic super k-mer extraction with default parameters
 ((ntest++))
-if obisuperkmer "${TEST_DIR}/test_sequences.fasta" \
+if $CMD super "${TEST_DIR}/test_sequences.fasta" \
    > "${TMPDIR}/output_default.fasta" 2>&1
 then
    log "$MCMD: basic extraction with default parameters OK"
@@ -148,7 +127,7 @@ fi

 # Test 5: Extract super k-mers with custom k and m parameters
 ((ntest++))
-if obisuperkmer -k 15 -m 7 "${TEST_DIR}/test_sequences.fasta" \
+if $CMD super -k 15 -m 7 "${TEST_DIR}/test_sequences.fasta" \
    > "${TMPDIR}/output_k15_m7.fasta" 2>&1
 then
    log "$MCMD: extraction with custom k=15, m=7 OK"
@@ -172,7 +151,7 @@ fi

 # Test 7: Test with different output format (FASTA output explicitly)
 ((ntest++))
-if obisuperkmer --fasta-output -k 21 -m 11 \
+if $CMD super --fasta-output -k 21 -m 11 \
    "${TEST_DIR}/test_sequences.fasta" \
    > "${TMPDIR}/output_fasta.fasta" 2>&1
 then
@@ -209,7 +188,7 @@ fi

 # Test 10: Test with output file option
 ((ntest++))
-if obisuperkmer -o "${TMPDIR}/output_file.fasta" \
+if $CMD super -o "${TMPDIR}/output_file.fasta" \
    "${TEST_DIR}/test_sequences.fasta" 2>&1
 then
    log "$MCMD: output to file with -o option OK"
--- a/pkg/obidefault/batch.go
+++ b/pkg/obidefault/batch.go
@@ -1,6 +1,12 @@
 package obidefault

-var _BatchSize = 2000
+// _BatchSize is the minimum number of sequences per batch (floor).
+// Used as the minSeqs argument to RebatchBySize.
+var _BatchSize = 1
+
+// _BatchSizeMax is the maximum number of sequences per batch (ceiling).
+// A batch is flushed when this count is reached regardless of memory usage.
+var _BatchSizeMax = 2000

 // SetBatchSize sets the size of the sequence batches.
 //
@@ -24,3 +30,42 @@ func BatchSize() int {
 func BatchSizePtr() *int {
 	return &_BatchSize
 }
+
+// BatchSizeMax returns the maximum number of sequences per batch.
+func BatchSizeMax() int {
+	return _BatchSizeMax
+}
+
+func BatchSizeMaxPtr() *int {
+	return &_BatchSizeMax
+}
+
+// _BatchMem holds the maximum cumulative memory (in bytes) per batch when
+// memory-based batching is requested. A value of 0 disables memory-based
+// batching and falls back to count-based batching.
+var _BatchMem = 128 * 1024 * 1024 // 128 MB default; set to 0 to disable
+var _BatchMemStr = ""
+
+// SetBatchMem sets the memory budget per batch in bytes.
+func SetBatchMem(n int) {
+	_BatchMem = n
+}
+
+// BatchMem returns the current memory budget per batch in bytes.
+// A value of 0 means memory-based batching is disabled.
+func BatchMem() int {
+	return _BatchMem
+}
+
+func BatchMemPtr() *int {
+	return &_BatchMem
+}
+
+// BatchMemStr returns the raw --batch-mem string value as provided on the CLI.
+func BatchMemStr() string {
+	return _BatchMemStr
+}
+
+func BatchMemStrPtr() *string {
+	return &_BatchMemStr
+}
--- a/pkg/obiformats/embl_read.go
+++ b/pkg/obiformats/embl_read.go
@@ -161,6 +161,149 @@ func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obise
 	return parser
 }

+// extractEmblSeq scans the sequence section of an EMBL record directly on the
+// rope. EMBL sequence lines start with 5 spaces followed by bases in groups of
+// 10, separated by spaces, with a position number at the end. The section ends
+// with "//".
+func (s *ropeScanner) extractEmblSeq(dest []byte, UtoT bool) []byte {
+	// We use ReadLine and scan each line for bases (skip digits, spaces, newlines).
+	for {
+		line := s.ReadLine()
+		if line == nil {
+			break
+		}
+		if len(line) >= 2 && line[0] == '/' && line[1] == '/' {
+			break
+		}
+		// Lines start with 5 spaces; bases follow separated by single spaces.
+		// Digits at the end are the position counter — skip them.
+		// Simplest: take every byte that is a letter.
+		for _, b := range line {
+			if b >= 'A' && b <= 'Z' {
+				b += 'a' - 'A'
+			}
+			if UtoT && b == 'u' {
+				b = 't'
+			}
+			if b >= 'a' && b <= 'z' {
+				dest = append(dest, b)
+			}
+		}
+	}
+	return dest
+}
+
+// EmblChunkParserRope parses an EMBL chunk directly from a rope without Pack().
+func EmblChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
+	scanner := newRopeScanner(rope)
+	sequences := obiseq.MakeBioSequenceSlice(100)[:0]
+
+	var id string
+	var scientificName string
+	defBytes := make([]byte, 0, 256)
+	featBytes := make([]byte, 0, 1024)
+	var taxid int
+	inSeq := false
+
+	for {
+		line := scanner.ReadLine()
+		if line == nil {
+			break
+		}
+
+		if inSeq {
+			// Should not happen — extractEmblSeq consumed up to "//"
+			inSeq = false
+			continue
+		}
+
+		switch {
+		case bytes.HasPrefix(line, []byte("ID   ")):
+			id = string(bytes.SplitN(line[5:], []byte(";"), 2)[0])
+		case bytes.HasPrefix(line, []byte("OS   ")):
+			scientificName = string(bytes.TrimSpace(line[5:]))
+		case bytes.HasPrefix(line, []byte("DE   ")):
+			if len(defBytes) > 0 {
+				defBytes = append(defBytes, ' ')
+			}
+			defBytes = append(defBytes, bytes.TrimSpace(line[5:])...)
+		case withFeatureTable && bytes.HasPrefix(line, []byte("FH   ")):
+			featBytes = append(featBytes, line...)
+		case withFeatureTable && bytes.Equal(line, []byte("FH")):
+			featBytes = append(featBytes, '\n')
+			featBytes = append(featBytes, line...)
+		case bytes.HasPrefix(line, []byte("FT   ")):
+			if withFeatureTable {
+				featBytes = append(featBytes, '\n')
+				featBytes = append(featBytes, line...)
+			}
+			if bytes.HasPrefix(line, []byte(`FT                   /db_xref="taxon:`)) {
+				rest := line[37:]
+				end := bytes.IndexByte(rest, '"')
+				if end > 0 {
+					taxid, _ = strconv.Atoi(string(rest[:end]))
+				}
+			}
+		case bytes.HasPrefix(line, []byte("     ")):
+			// First sequence line: extract all bases via extractEmblSeq,
+			// which also consumes this line's remaining content.
+			// But ReadLine already consumed this line — we need to process it
+			// plus subsequent lines. Process this line inline then call helper.
+			seqDest := make([]byte, 0, 4096)
+			for _, b := range line {
+				if b >= 'A' && b <= 'Z' {
+					b += 'a' - 'A'
+				}
+				if UtoT && b == 'u' {
+					b = 't'
+				}
+				if b >= 'a' && b <= 'z' {
+					seqDest = append(seqDest, b)
+				}
+			}
+			seqDest = scanner.extractEmblSeq(seqDest, UtoT)
+
+			seq := obiseq.NewBioSequenceOwning(id, seqDest, string(defBytes))
+			seq.SetSource(source)
+			if withFeatureTable {
+				seq.SetFeatures(featBytes)
+			}
+			annot := seq.Annotations()
+			annot["scientific_name"] = scientificName
+			annot["taxid"] = taxid
+			sequences = append(sequences, seq)
+
+			// Reset state
+			id = ""
+			scientificName = ""
+			defBytes = defBytes[:0]
+			featBytes = featBytes[:0]
+			taxid = 1
+
+		case bytes.Equal(line, []byte("//")):
+			// record ended without SQ/sequence section (e.g. WGS entries)
+			if id != "" {
+				seq := obiseq.NewBioSequenceOwning(id, []byte{}, string(defBytes))
+				seq.SetSource(source)
+				if withFeatureTable {
+					seq.SetFeatures(featBytes)
+				}
+				annot := seq.Annotations()
+				annot["scientific_name"] = scientificName
+				annot["taxid"] = taxid
+				sequences = append(sequences, seq)
+			}
+			id = ""
+			scientificName = ""
+			defBytes = defBytes[:0]
+			featBytes = featBytes[:0]
+			taxid = 1
+		}
+	}
+
+	return sequences, nil
+}
+
 func _ParseEmblFile(
 	input ChannelFileChunk,
 	out obiiter.IBioSequence,
@@ -171,7 +314,14 @@ func _ParseEmblFile(

 	for chunks := range input {
 		order := chunks.Order
-		sequences, err := parser(chunks.Source, chunks.Raw)
+		var sequences obiseq.BioSequenceSlice
+		var err error
+
+		if chunks.Rope != nil {
+			sequences, err = EmblChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
+		} else {
+			sequences, err = parser(chunks.Source, chunks.Raw)
+		}

 		if err != nil {
 			log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err)
@@ -196,6 +346,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
 		1024*1024*128,
 		EndOfLastFlatFileEntry,
 		"\nID   ",
+		false,
 	)

 	newIter := obiiter.MakeIBioSequence()
--- a/pkg/obiformats/fastaseq_read.go
+++ b/pkg/obiformats/fastaseq_read.go
@@ -209,28 +209,121 @@ func FastaChunkParser(UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlic
 	return parser
 }

+// extractFastaSeq scans sequence bytes from the rope directly into dest,
+// appending valid nucleotide characters and skipping whitespace.
+// Stops when '>' is found at the start of a line (next record) or at EOF.
+// Returns (dest with appended bases, hasMore).
+// hasMore=true means scanner is now positioned at '>' of the next record.
+func (s *ropeScanner) extractFastaSeq(dest []byte, UtoT bool) ([]byte, bool) {
+	lineStart := true
+
+	for s.current != nil {
+		data := s.current.data[s.pos:]
+		for i, b := range data {
+			if lineStart && b == '>' {
+				s.pos += i
+				if s.pos >= len(s.current.data) {
+					s.current = s.current.Next()
+					s.pos = 0
+				}
+				return dest, true
+			}
+			if b == '\n' || b == '\r' {
+				lineStart = true
+				continue
+			}
+			lineStart = false
+			if b == ' ' || b == '\t' {
+				continue
+			}
+			if b >= 'A' && b <= 'Z' {
+				b += 'a' - 'A'
+			}
+			if UtoT && b == 'u' {
+				b = 't'
+			}
+			dest = append(dest, b)
+		}
+		s.current = s.current.Next()
+		s.pos = 0
+	}
+	return dest, false
+}
+
+// FastaChunkParserRope parses a FASTA chunk directly from the rope without Pack().
+func FastaChunkParserRope(source string, rope *PieceOfChunk, UtoT bool) (obiseq.BioSequenceSlice, error) {
+	scanner := newRopeScanner(rope)
+	sequences := obiseq.MakeBioSequenceSlice(100)[:0]
+
+	for {
+		bline := scanner.ReadLine()
+		if bline == nil {
+			break
+		}
+		if len(bline) == 0 || bline[0] != '>' {
+			continue
+		}
+
+		// Parse header: ">id definition"
+		header := bline[1:]
+		var id string
+		var definition string
+		sp := bytes.IndexByte(header, ' ')
+		if sp < 0 {
+			sp = bytes.IndexByte(header, '\t')
+		}
+		if sp < 0 {
+			id = string(header)
+		} else {
+			id = string(header[:sp])
+			definition = string(bytes.TrimSpace(header[sp+1:]))
+		}
+
+		seqDest := make([]byte, 0, 4096)
+		var hasMore bool
+		seqDest, hasMore = scanner.extractFastaSeq(seqDest, UtoT)
+
+		if len(seqDest) == 0 {
+			log.Fatalf("%s [%s]: sequence is empty", source, id)
+		}
+
+		seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
+		seq.SetSource(source)
+		sequences = append(sequences, seq)
+
+		if !hasMore {
+			break
+		}
+	}
+
+	return sequences, nil
+}
+
 func _ParseFastaFile(
 	input ChannelFileChunk,
 	out obiiter.IBioSequence,
 	UtoT bool,
 ) {
-
 	parser := FastaChunkParser(UtoT)

 	for chunks := range input {
-		sequences, err := parser(chunks.Source, chunks.Raw)
-		// obilog.Warnf("Chunck(%d:%d) -%d- ", chunks.Order, l, sequences.Len())
+		var sequences obiseq.BioSequenceSlice
+		var err error
+
+		if chunks.Rope != nil {
+			sequences, err = FastaChunkParserRope(chunks.Source, chunks.Rope, UtoT)
+		} else {
+			sequences, err = parser(chunks.Source, chunks.Raw)
+		}

 		if err != nil {
 			log.Fatalf("File %s : Cannot parse the fasta file : %v", chunks.Source, err)
 		}

 		out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences))
-
 	}

 	out.Done()
-
 }

 func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
@@ -245,6 +338,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
 		1024*1024,
 		EndOfLastFastaEntry,
 		"\n>",
+		false,
 	)

 	for i := 0; i < nworker; i++ {
--- a/pkg/obiformats/fastqseq_read.go
+++ b/pkg/obiformats/fastqseq_read.go
@@ -303,6 +303,80 @@ func FastqChunkParser(quality_shift byte, with_quality bool, UtoT bool) func(str
 	return parser
 }

+// FastqChunkParserRope parses a FASTQ chunk directly from a rope without Pack().
+func FastqChunkParserRope(source string, rope *PieceOfChunk, quality_shift byte, with_quality, UtoT bool) (obiseq.BioSequenceSlice, error) {
+	scanner := newRopeScanner(rope)
+	sequences := obiseq.MakeBioSequenceSlice(100)[:0]
+
+	for {
+		// Line 1: @id [definition]
+		hline := scanner.ReadLine()
+		if hline == nil {
+			break
+		}
+		if len(hline) == 0 || hline[0] != '@' {
+			continue
+		}
+		header := hline[1:]
+		var id string
+		var definition string
+		sp := bytes.IndexByte(header, ' ')
+		if sp < 0 {
+			sp = bytes.IndexByte(header, '\t')
+		}
+		if sp < 0 {
+			id = string(header)
+		} else {
+			id = string(header[:sp])
+			definition = string(bytes.TrimSpace(header[sp+1:]))
+		}
+
+		// Line 2: sequence
+		sline := scanner.ReadLine()
+		if sline == nil {
+			log.Fatalf("@%s[%s]: unexpected EOF after header", id, source)
+		}
+		seqDest := make([]byte, len(sline))
+		w := 0
+		for _, b := range sline {
+			if b >= 'A' && b <= 'Z' {
+				b += 'a' - 'A'
+			}
+			if UtoT && b == 'u' {
+				b = 't'
+			}
+			seqDest[w] = b
+			w++
+		}
+		seqDest = seqDest[:w]
+		if len(seqDest) == 0 {
+			log.Fatalf("@%s[%s]: sequence is empty", id, source)
+		}
+
+		// Line 3: + (skip)
+		scanner.ReadLine()
+
+		// Line 4: quality
+		qline := scanner.ReadLine()
+
+		seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
+		seq.SetSource(source)
+
+		if with_quality && qline != nil {
+			qDest := make([]byte, len(qline))
+			copy(qDest, qline)
+			for i := range qDest {
+				qDest[i] -= quality_shift
+			}
+			seq.TakeQualities(qDest)
+		}
+
+		sequences = append(sequences, seq)
+	}
+
+	return sequences, nil
+}
+
 func _ParseFastqFile(
 	input ChannelFileChunk,
 	out obiiter.IBioSequence,
@@ -313,7 +387,14 @@ func _ParseFastqFile(
 	parser := FastqChunkParser(quality_shift, with_quality, UtoT)

 	for chunks := range input {
-		sequences, err := parser(chunks.Source, chunks.Raw)
+		var sequences obiseq.BioSequenceSlice
+		var err error
+
+		if chunks.Rope != nil {
+			sequences, err = FastqChunkParserRope(chunks.Source, chunks.Rope, quality_shift, with_quality, UtoT)
+		} else {
+			sequences, err = parser(chunks.Source, chunks.Raw)
+		}

 		if err != nil {
 			log.Fatalf("File %s : Cannot parse the fastq file : %v", chunks.Source, err)
@@ -339,6 +420,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
 		1024*1024,
 		EndOfLastFastqEntry,
 		"\n@",
+		false,
 	)

 	for i := 0; i < nworker; i++ {
--- a/pkg/obiformats/fastseq_json_header.go
+++ b/pkg/obiformats/fastseq_json_header.go
@@ -296,7 +296,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {

 			case strings.HasSuffix(skey, "_taxid"):
 				if dataType == jsonparser.Number || dataType == jsonparser.String {
-					rank, _ := obiutils.SplitInTwo(skey, '_')
+					rank := skey[:len(skey)-len("_taxid")]

 					taxid := string(value)
 					sequence.SetTaxid(taxid, rank)
--- a/pkg/obiformats/fastseq_write_fasta.go
+++ b/pkg/obiformats/fastseq_write_fasta.go
@@ -77,45 +77,47 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string {
 //
 // It returns a byte array containing the formatted sequences.
 func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
-	// Create a buffer to store the formatted sequences
 	var bs bytes.Buffer

 	lt := 0
-
 	for _, seq := range batch.Slice() {
 		lt += seq.Len()
 	}

-	// Iterate over each sequence in the batch
+	// Pre-allocate: sequence data + newlines every 60 chars + ~100 bytes header per sequence
+	bs.Grow(lt + lt/60 + 100*batch.Len() + 1)
+
 	log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
-	first := true
+
 	for _, seq := range batch.Slice() {
-		// Check if the sequence is empty
 		if seq.Len() > 0 {
-			// Format the sequence using the provided formater function
-			formattedSeq := FormatFasta(seq, formater)
-
-			if first {
-				bs.Grow(lt + (len(formattedSeq)-seq.Len())*batch.Len()*5/4)
-				first = false
-			}
-
-			// Append the formatted sequence to the buffer
-			bs.WriteString(formattedSeq)
+			// Write header directly into bs — no intermediate string
+			bs.WriteByte('>')
+			bs.WriteString(seq.Id())
+			bs.WriteByte(' ')
+			bs.WriteString(formater(seq))
 			bs.WriteByte('\n')
+
+			// Write folded sequence directly into bs — no copies
+			s := seq.Sequence()
+			l := len(s)
+			for i := 0; i < l; i += 60 {
+				to := i + 60
+				if to > l {
+					to = l
+				}
+				bs.Write(s[i:to])
+				bs.WriteByte('\n')
+			}
 		} else {
-			// Handle empty sequences
 			if skipEmpty {
-				// Skip empty sequences if skipEmpty is true
 				obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id())
 			} else {
-				// Terminate the program if skipEmpty is false
 				log.Fatalf("Sequence %s is empty", seq.Id())
 			}
 		}
 	}

-	// Return the byte array representation of the buffer
 	return &bs
 }

--- a/pkg/obiformats/file_chunk_read.go
+++ b/pkg/obiformats/file_chunk_read.go
@@ -16,6 +16,7 @@ type SeqFileChunkParser func(string, io.Reader) (obiseq.BioSequenceSlice, error)
 type FileChunk struct {
 	Source string
 	Raw    *bytes.Buffer
+	Rope   *PieceOfChunk
 	Order  int
 }

@@ -97,11 +98,17 @@ func (piece *PieceOfChunk) IsLast() bool {
 	return piece.next == nil
 }

-func (piece *PieceOfChunk) FileChunk(source string, order int) FileChunk {
-	piece.Pack()
+func (piece *PieceOfChunk) FileChunk(source string, order int, pack bool) FileChunk {
+	piece = piece.Head()
+	var raw *bytes.Buffer
+	if pack {
+		piece.Pack()
+		raw = bytes.NewBuffer(piece.data)
+	}
 	return FileChunk{
 		Source: source,
-		Raw:    bytes.NewBuffer(piece.data),
+		Raw:    raw,
+		Rope:   piece,
 		Order:  order,
 	}
 }
@@ -133,7 +140,8 @@ func ReadFileChunk(
 	reader io.Reader,
 	fileChunkSize int,
 	splitter LastSeqRecord,
-	probe string) ChannelFileChunk {
+	probe string,
+	pack bool) ChannelFileChunk {

 	chunk_channel := make(ChannelFileChunk)

@@ -205,7 +213,7 @@ func ReadFileChunk(

 				if len(pieces.data) > 0 {
 					// obilog.Warnf("chuck %d :Read %d bytes from file %s", i, io.Len(), source)
-					chunk_channel <- pieces.FileChunk(source, i)
+					chunk_channel <- pieces.FileChunk(source, i, pack)
 					i++
 				}

@@ -222,7 +230,7 @@ func ReadFileChunk(

 		// Send the last chunk to the channel
 		if pieces.Len() > 0 {
-			chunk_channel <- pieces.FileChunk(source, i)
+			chunk_channel <- pieces.FileChunk(source, i, pack)
 		}

 		// Close the readers channel when the end of the file is reached
--- a/pkg/obiformats/genbank_read.go
+++ b/pkg/obiformats/genbank_read.go
@@ -29,6 +29,265 @@ const (

 var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")

+// extractSequence scans the ORIGIN section byte-by-byte directly on the rope,
+// appending compacted bases to dest. Returns the extended slice.
+// Stops and returns when "//" is found at the start of a line.
+// The scanner is left positioned after the "//" line.
+func (s *ropeScanner) extractSequence(dest []byte, UtoT bool) []byte {
+	lineStart := true
+	skipDigits := true
+
+	for s.current != nil {
+		data := s.current.data[s.pos:]
+		for i, b := range data {
+			if lineStart {
+				if b == '/' {
+					// End-of-record marker "//"
+					s.pos += i + 1
+					if s.pos >= len(s.current.data) {
+						s.current = s.current.Next()
+						s.pos = 0
+					}
+					s.skipToNewline()
+					return dest
+				}
+				lineStart = false
+				skipDigits = true
+			}
+			switch {
+			case b == '\n':
+				lineStart = true
+			case b == '\r':
+				// skip
+			case skipDigits:
+				if b != ' ' && (b < '0' || b > '9') {
+					skipDigits = false
+					if UtoT && b == 'u' {
+						b = 't'
+					}
+					dest = append(dest, b)
+				}
+			case b != ' ':
+				if UtoT && b == 'u' {
+					b = 't'
+				}
+				dest = append(dest, b)
+			}
+		}
+		s.current = s.current.Next()
+		s.pos = 0
+	}
+	return dest
+}
+
+// parseLseqFromLocus extracts the declared sequence length from a LOCUS line.
+// Format: "LOCUS       <id> <length> bp ..."
+// Returns -1 if not found or parse error.
+func parseLseqFromLocus(line []byte) int {
+	if len(line) < 13 {
+		return -1
+	}
+	i := 12
+	for i < len(line) && line[i] != ' ' {
+		i++
+	}
+	for i < len(line) && line[i] == ' ' {
+		i++
+	}
+	start := i
+	for i < len(line) && line[i] >= '0' && line[i] <= '9' {
+		i++
+	}
+	if i == start {
+		return -1
+	}
+	n, err := strconv.Atoi(string(line[start:i]))
+	if err != nil {
+		return -1
+	}
+	return n
+}
+
+// Prefix constants for GenBank section headers (byte slices for zero-alloc comparison).
+var (
+	gbPfxLocus      = []byte("LOCUS       ")
+	gbPfxDefinition = []byte("DEFINITION  ")
+	gbPfxContinue   = []byte("            ")
+	gbPfxSource     = []byte("SOURCE      ")
+	gbPfxFeatures   = []byte("FEATURES    ")
+	gbPfxOrigin     = []byte("ORIGIN")
+	gbPfxContig     = []byte("CONTIG")
+	gbPfxEnd        = []byte("//")
+	gbPfxDbXref     = []byte(`                     /db_xref="taxon:`)
+)
+
+// GenbankChunkParserRope parses a GenBank FileChunk directly from the rope
+// (PieceOfChunk linked list) without calling Pack(). This eliminates the large
+// contiguous allocation required for chromosomal-scale sequences.
+func GenbankChunkParserRope(source string, rope *PieceOfChunk,
+	withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
+
+	state := inHeader
+	scanner := newRopeScanner(rope)
+	sequences := obiseq.MakeBioSequenceSlice(100)[:0]
+
+	id := ""
+	lseq := -1
+	scientificName := ""
+	defBytes := new(bytes.Buffer)
+	featBytes := new(bytes.Buffer)
+	var seqDest []byte
+	taxid := 1
+	nl := 0
+
+	for bline := scanner.ReadLine(); bline != nil; bline = scanner.ReadLine() {
+		nl++
+		processed := false
+		for !processed {
+			switch {
+
+			case bytes.HasPrefix(bline, gbPfxLocus):
+				if state != inHeader {
+					log.Fatalf("Line %d - Unexpected state %d while reading LOCUS: %s", nl, state, bline)
+				}
+				rest := bline[12:]
+				sp := bytes.IndexByte(rest, ' ')
+				if sp < 0 {
+					id = string(rest)
+				} else {
+					id = string(rest[:sp])
+				}
+				lseq = parseLseqFromLocus(bline)
+				cap0 := lseq + 20
+				if cap0 < 1024 {
+					cap0 = 1024
+				}
+				seqDest = make([]byte, 0, cap0)
+				state = inEntry
+				processed = true
+
+			case bytes.HasPrefix(bline, gbPfxDefinition):
+				if state != inEntry {
+					log.Fatalf("Line %d - Unexpected state %d while reading DEFINITION: %s", nl, state, bline)
+				}
+				defBytes.Write(bytes.TrimSpace(bline[12:]))
+				state = inDefinition
+				processed = true
+
+			case state == inDefinition:
+				if bytes.HasPrefix(bline, gbPfxContinue) {
+					defBytes.WriteByte(' ')
+					defBytes.Write(bytes.TrimSpace(bline[12:]))
+					processed = true
+				} else {
+					state = inEntry
+				}
+
+			case bytes.HasPrefix(bline, gbPfxSource):
+				if state != inEntry {
+					log.Fatalf("Line %d - Unexpected state %d while reading SOURCE: %s", nl, state, bline)
+				}
+				scientificName = string(bytes.TrimSpace(bline[12:]))
+				processed = true
+
+			case bytes.HasPrefix(bline, gbPfxFeatures):
+				if state != inEntry {
+					log.Fatalf("Line %d - Unexpected state %d while reading FEATURES: %s", nl, state, bline)
+				}
+				if withFeatureTable {
+					featBytes.Write(bline)
+				}
+				state = inFeature
+				processed = true
+
+			case bytes.HasPrefix(bline, gbPfxOrigin):
+				if state != inFeature && state != inContig {
+					log.Fatalf("Line %d - Unexpected state %d while reading ORIGIN: %s", nl, state, bline)
+				}
+				// Use fast byte-scan to extract sequence and consume through "//"
+				seqDest = scanner.extractSequence(seqDest, UtoT)
+				// Emit record
+				if id == "" {
+					log.Warn("Empty id when parsing genbank file")
+				}
+				sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
+				sequence.SetSource(source)
+				if withFeatureTable {
+					sequence.SetFeatures(featBytes.Bytes())
+				}
+				annot := sequence.Annotations()
+				annot["scientific_name"] = scientificName
+				annot["taxid"] = taxid
+				sequences = append(sequences, sequence)
+
+				defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
+				featBytes = new(bytes.Buffer)
+				nl = 0
+				taxid = 1
+				seqDest = nil
+				state = inHeader
+				processed = true
+
+			case bytes.HasPrefix(bline, gbPfxContig):
+				if state != inFeature && state != inContig {
+					log.Fatalf("Line %d - Unexpected state %d while reading CONTIG: %s", nl, state, bline)
+				}
+				state = inContig
+				processed = true
+
+			case bytes.Equal(bline, gbPfxEnd):
+				// Reached for CONTIG records (no ORIGIN section)
+				if state != inContig {
+					log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
+				}
+				if id == "" {
+					log.Warn("Empty id when parsing genbank file")
+				}
+				sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
+				sequence.SetSource(source)
+				if withFeatureTable {
+					sequence.SetFeatures(featBytes.Bytes())
+				}
+				annot := sequence.Annotations()
+				annot["scientific_name"] = scientificName
+				annot["taxid"] = taxid
+				sequences = append(sequences, sequence)
+
+				defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
+				featBytes = new(bytes.Buffer)
+				nl = 0
+				taxid = 1
+				seqDest = nil
+				state = inHeader
+				processed = true
+
+			default:
+				switch state {
+				case inFeature:
+					if withFeatureTable {
+						featBytes.WriteByte('\n')
+						featBytes.Write(bline)
+					}
+					if bytes.HasPrefix(bline, gbPfxDbXref) {
+						rest := bline[len(gbPfxDbXref):]
+						q := bytes.IndexByte(rest, '"')
+						if q >= 0 {
+							taxid, _ = strconv.Atoi(string(rest[:q]))
+						}
+					}
+					processed = true
+				case inHeader, inEntry, inContig:
+					processed = true
+				default:
+					log.Fatalf("Unexpected state %d while reading: %s", state, bline)
+				}
+			}
+		}
+	}
+
+	return sequences, nil
+}
+
 func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
 	return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
 		state := inHeader
@@ -125,13 +384,10 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
 					if state != inSequence && state != inContig {
 						log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
 					}
-					// log.Debugln("Total lines := ", nl)
 					if id == "" {
 						log.Warn("Empty id when parsing genbank file")
 					}

-					// log.Debugf("End of sequence %s: %dbp ", id, seqBytes.Len())
-
 					sequence := obiseq.NewBioSequence(id,
 						seqBytes.Bytes(),
 						defBytes.String())
@@ -144,9 +400,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
 					annot := sequence.Annotations()
 					annot["scientific_name"] = scientificName
 					annot["taxid"] = taxid
-					// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))
-					// log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(),
-					//	sequence.Len(), seqBytes.Len())

 					sequences = append(sequences, sequence)

@@ -159,12 +412,11 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
 					processed = true

 				case state == inSequence:
-					// log.Debugf("Chunk %d : Genbank: line %d, state = %d : %s", chunks.order, nl, state, line)
-
 					sl++
-					parts := strings.SplitN(line[10:], " ", 6)
+					cleanline := strings.TrimSpace(line)
+					parts := strings.SplitN(cleanline, " ", 7)
 					lparts := len(parts)
-					for i := 0; i < lparts; i++ {
+					for i := 1; i < lparts; i++ {
 						if UtoT {
 							parts[i] = strings.ReplaceAll(parts[i], "u", "t")
 						}
@@ -197,6 +449,7 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob

 		}

+		_ = sl
 		return sequences, nil
 	}
 }
@@ -205,10 +458,16 @@ func _ParseGenbankFile(input ChannelFileChunk,
 	out obiiter.IBioSequence,
 	withFeatureTable, UtoT bool) {

-	parser := GenbankChunkParser(withFeatureTable, UtoT)
-
 	for chunks := range input {
-		sequences, err := parser(chunks.Source, chunks.Raw)
+		var sequences obiseq.BioSequenceSlice
+		var err error
+
+		if chunks.Rope != nil {
+			sequences, err = GenbankChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
+		} else {
+			parser := GenbankChunkParser(withFeatureTable, UtoT)
+			sequences, err = parser(chunks.Source, chunks.Raw)
+		}

 		if err != nil {
 			log.Fatalf("File %s : Cannot parse the genbank file : %v", chunks.Source, err)
@@ -224,7 +483,6 @@ func _ParseGenbankFile(input ChannelFileChunk,

 func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
 	opt := MakeOptions(options)
-	// entry_channel := make(chan _FileChunk)

 	entry_channel := ReadFileChunk(
 		opt.Source(),
@@ -232,13 +490,13 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
 		1024*1024*128,
 		EndOfLastFlatFileEntry,
 		"\nLOCUS       ",
+		false, // do not pack: rope-based parser avoids contiguous allocation
 	)

 	newIter := obiiter.MakeIBioSequence()

 	nworkers := opt.ParallelWorkers()

-	// for j := 0; j < opt.ParallelWorkers(); j++ {
 	for j := 0; j < nworkers; j++ {
 		newIter.Add(1)
 		go _ParseGenbankFile(
@@ -249,8 +507,6 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
 		)
 	}

-	// go _ReadFlatFileChunk(reader, entry_channel)
-
 	go func() {
 		newIter.WaitAndClose()
 		log.Debug("End of the genbank file ", opt.Source())
--- a/pkg/obiformats/rope_scanner.go
+++ b/pkg/obiformats/rope_scanner.go
@@ -0,0 +1,77 @@
+package obiformats
+
+import "bytes"
+
+// ropeScanner reads lines from a PieceOfChunk rope.
+// The carry buffer handles lines that span two rope nodes; it grows as needed.
+type ropeScanner struct {
+	current *PieceOfChunk
+	pos     int
+	carry   []byte
+}
+
+func newRopeScanner(rope *PieceOfChunk) *ropeScanner {
+	return &ropeScanner{current: rope}
+}
+
+// ReadLine returns the next line without the trailing \n (or \r\n).
+// Returns nil at end of rope. The returned slice aliases carry[] or the node
+// data and is valid only until the next ReadLine call.
+func (s *ropeScanner) ReadLine() []byte {
+	for {
+		if s.current == nil {
+			if len(s.carry) > 0 {
+				line := s.carry
+				s.carry = s.carry[:0]
+				return line
+			}
+			return nil
+		}
+
+		data := s.current.data[s.pos:]
+		idx := bytes.IndexByte(data, '\n')
+
+		if idx >= 0 {
+			var line []byte
+			if len(s.carry) == 0 {
+				line = data[:idx]
+			} else {
+				s.carry = append(s.carry, data[:idx]...)
+				line = s.carry
+				s.carry = s.carry[:0]
+			}
+			s.pos += idx + 1
+			if s.pos >= len(s.current.data) {
+				s.current = s.current.Next()
+				s.pos = 0
+			}
+			if len(line) > 0 && line[len(line)-1] == '\r' {
+				line = line[:len(line)-1]
+			}
+			return line
+		}
+
+		// No \n in this node: accumulate into carry and advance
+		s.carry = append(s.carry, data...)
+		s.current = s.current.Next()
+		s.pos = 0
+	}
+}
+
+// skipToNewline advances the scanner past the next '\n'.
+func (s *ropeScanner) skipToNewline() {
+	for s.current != nil {
+		data := s.current.data[s.pos:]
+		idx := bytes.IndexByte(data, '\n')
+		if idx >= 0 {
+			s.pos += idx + 1
+			if s.pos >= len(s.current.data) {
+				s.current = s.current.Next()
+				s.pos = 0
+			}
+			return
+		}
+		s.current = s.current.Next()
+		s.pos = 0
+	}
+}
--- a/pkg/obiiter/batchiterator.go
+++ b/pkg/obiiter/batchiterator.go
@@ -444,6 +444,67 @@ func (iterator IBioSequence) Rebatch(size int) IBioSequence {
 	return newIter
 }

+// RebatchBySize reorganises the stream into batches bounded by two independent
+// upper limits: maxCount (max number of sequences) and maxBytes (max cumulative
+// estimated memory). A batch is flushed as soon as either limit would be
+// exceeded. A single sequence larger than maxBytes is always emitted alone.
+// Passing 0 for a limit disables that constraint; if both are 0 it falls back
+// to Rebatch(obidefault.BatchSizeMax()).
+func (iterator IBioSequence) RebatchBySize(maxBytes int, maxCount int) IBioSequence {
+	if maxBytes <= 0 && maxCount <= 0 {
+		return iterator.Rebatch(obidefault.BatchSizeMax())
+	}
+
+	newIter := MakeIBioSequence()
+
+	newIter.Add(1)
+
+	go func() {
+		newIter.WaitAndClose()
+	}()
+
+	go func() {
+		order := 0
+		iterator = iterator.SortBatches()
+		buffer := obiseq.MakeBioSequenceSlice()
+		bufBytes := 0
+		source := ""
+
+		flush := func() {
+			if len(buffer) > 0 {
+				newIter.Push(MakeBioSequenceBatch(source, order, buffer))
+				order++
+				buffer = obiseq.MakeBioSequenceSlice()
+				bufBytes = 0
+			}
+		}
+
+		for iterator.Next() {
+			seqs := iterator.Get()
+			source = seqs.Source()
+			for _, s := range seqs.Slice() {
+				sz := s.MemorySize()
+				countFull := maxCount > 0 && len(buffer) >= maxCount
+				memFull := maxBytes > 0 && bufBytes+sz > maxBytes && len(buffer) > 0
+				if countFull || memFull {
+					flush()
+				}
+				buffer = append(buffer, s)
+				bufBytes += sz
+			}
+		}
+		flush()
+
+		newIter.Done()
+	}()
+
+	if iterator.IsPaired() {
+		newIter.MarkAsPaired()
+	}
+
+	return newIter
+}
+
 func (iterator IBioSequence) FilterEmpty() IBioSequence {

 	newIter := MakeIBioSequence()
@@ -638,7 +699,7 @@ func (iterator IBioSequence) FilterOn(predicate obiseq.SequencePredicate,
 		trueIter.MarkAsPaired()
 	}

-	return trueIter.Rebatch(size)
+	return trueIter.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
 }

 func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
@@ -694,7 +755,7 @@ func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
 		trueIter.MarkAsPaired()
 	}

-	return trueIter.Rebatch(size)
+	return trueIter.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
 }

 // Load all sequences availables from an IBioSequenceBatch iterator into
--- a/pkg/obiiter/fragment.go
+++ b/pkg/obiiter/fragment.go
@@ -3,6 +3,7 @@ package obiiter
 import (
 	log "github.com/sirupsen/logrus"

+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
 )

@@ -70,7 +71,7 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
 		}
 		go f(iterator)

-		return newiter.SortBatches().Rebatch(size)
+		return newiter.SortBatches().RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
 	}

 	return ifrg
--- a/pkg/obioptions/options.go
+++ b/pkg/obioptions/options.go
@@ -8,6 +8,7 @@ import (

 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
 	log "github.com/sirupsen/logrus"

 	"github.com/DavidGamba/go-getoptions"
@@ -55,7 +56,15 @@ func RegisterGlobalOptions(options *getoptions.GetOpt) {

 	options.IntVar(obidefault.BatchSizePtr(), "batch-size", obidefault.BatchSize(),
 		options.GetEnv("OBIBATCHSIZE"),
-		options.Description("Number of sequence per batch for paralelle processing"))
+		options.Description("Minimum number of sequences per batch (floor, default 1)"))
+
+	options.IntVar(obidefault.BatchSizeMaxPtr(), "batch-size-max", obidefault.BatchSizeMax(),
+		options.GetEnv("OBIBATCHSIZEMAX"),
+		options.Description("Maximum number of sequences per batch (ceiling, default 2000)"))
+
+	options.StringVar(obidefault.BatchMemStrPtr(), "batch-mem", "",
+		options.GetEnv("OBIBATCHMEM"),
+		options.Description("Maximum memory per batch (e.g. 128K, 64M, 1G; default: 128M). Set to 0 to disable."))

 	options.Bool("solexa", false,
 		options.GetEnv("OBISOLEXA"),
@@ -157,6 +166,15 @@ func ProcessParsedOptions(options *getoptions.GetOpt, parseErr error) {
 	if options.Called("solexa") {
 		obidefault.SetReadQualitiesShift(64)
 	}
+
+	if options.Called("batch-mem") {
+		n, err := obiutils.ParseMemSize(obidefault.BatchMemStr())
+		if err != nil {
+			log.Fatalf("Invalid --batch-mem value %q: %v", obidefault.BatchMemStr(), err)
+		}
+		obidefault.SetBatchMem(n)
+		log.Printf("Memory-based batching enabled: %s per batch", obidefault.BatchMemStr())
+	}
 }

 func GenerateOptionParser(program string,
--- a/pkg/obioptions/version.go
+++ b/pkg/obioptions/version.go
@@ -3,7 +3,7 @@ package obioptions
 // Version is automatically updated by the Makefile from version.txt
 // The patch number (third digit) is incremented on each push to the repository

-var _Version = "Release 4.4.12"
+var _Version = "Release 4.4.22"

 // Version returns the version of the obitools package.
 //
--- a/pkg/obiseq/biosequence.go
+++ b/pkg/obiseq/biosequence.go
@@ -120,6 +120,19 @@ func NewBioSequence(id string,
 	return bs
 }

+// NewBioSequenceOwning creates a BioSequence taking ownership of the sequence
+// slice without copying it. The caller must not use the slice after this call.
+// Use this when the slice was allocated specifically for this sequence.
+func NewBioSequenceOwning(id string,
+	sequence []byte,
+	definition string) *BioSequence {
+	bs := NewEmptyBioSequence(0)
+	bs.SetId(id)
+	bs.TakeSequence(sequence)
+	bs.SetDefinition(definition)
+	return bs
+}
+
 // NewBioSequenceWithQualities creates a new BioSequence object with the given id, sequence, definition, and qualities.
 //
 // Parameters:
@@ -260,6 +273,28 @@ func (s *BioSequence) Len() int {
 	return len(s.sequence)
 }

+// MemorySize returns an estimate of the memory footprint of the BioSequence
+// in bytes. It accounts for the sequence, quality scores, feature data,
+// annotations, and fixed struct overhead. The estimate is conservative
+// (cap rather than len for byte slices) so it is suitable for memory-based
+// batching decisions.
+func (s *BioSequence) MemorySize() int {
+	if s == nil {
+		return 0
+	}
+	// fixed struct overhead (strings, pointers, mutex pointer)
+	const overhead = 128
+	n := overhead
+	n += cap(s.sequence)
+	n += cap(s.qualities)
+	n += cap(s.feature)
+	n += len(s.id)
+	n += len(s.source)
+	// rough annotation estimate: each key+value pair ~64 bytes on average
+	n += len(s.annotations) * 64
+	return n
+}
+
 // HasQualities checks if the BioSequence has sequence qualitiy scores.
 //
 // This function does not have any parameters.
@@ -444,6 +479,12 @@ func (s *BioSequence) SetSequence(sequence []byte) {
 	s.sequence = obiutils.InPlaceToLower(CopySlice(sequence))
 }

+// TakeSequence stores the slice directly without copying, then lowercases in-place.
+// The caller must not use the slice after this call.
+func (s *BioSequence) TakeSequence(sequence []byte) {
+	s.sequence = obiutils.InPlaceToLower(sequence)
+}
+
 func (s *BioSequence) HasValidSequence() bool {
 	for _, c := range s.sequence {
 		if !((c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '[' || c == ']') {
@@ -461,6 +502,15 @@ func (s *BioSequence) SetQualities(qualities Quality) {
 	s.qualities = CopySlice(qualities)
 }

+// TakeQualities stores the slice directly without copying.
+// The caller must not use the slice after this call.
+func (s *BioSequence) TakeQualities(qualities Quality) {
+	if s.qualities != nil {
+		RecycleSlice(&s.qualities)
+	}
+	s.qualities = qualities
+}
+
 // A method that appends a byte slice to the qualities of the BioSequence.
 func (s *BioSequence) WriteQualities(data []byte) (int, error) {
 	s.qualities = append(s.qualities, data...)
--- a/pkg/obiseq/biosequenceslice.go
+++ b/pkg/obiseq/biosequenceslice.go
@@ -195,7 +195,7 @@ func (s *BioSequenceSlice) ExtractTaxonomy(taxonomy *obitax.Taxonomy, seqAsTaxa
 				return nil, fmt.Errorf("sequence %v has no path", s.Id())
 			}
 			last := path[len(path)-1]
-			taxname, _ := obiutils.SplitInTwo(last, ':')
+			taxname, _ := obiutils.LeftSplitInTwo(last, ':')
 			if idx, ok := s.GetIntAttribute("seq_number"); !ok {
 				return nil, errors.New("sequences are not numbered")
 			} else {
--- a/pkg/obiseq/pool.go
+++ b/pkg/obiseq/pool.go
@@ -1,13 +1,20 @@
 package obiseq

 import (
+	"runtime"
 	"sync"
+	"sync/atomic"

 	log "github.com/sirupsen/logrus"

 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
 )

+const _LargeSliceThreshold = 100 * 1024        // 100 kb — below: leave to GC, above: trigger explicit GC
+const _GCBytesBudget = int64(256 * 1024 * 1024) // trigger GC every 256 MB of large discards
+
+var _largeSliceDiscardedBytes = atomic.Int64{}
+
 var _BioSequenceByteSlicePool = sync.Pool{
 	New: func() interface{} {
 		bs := make([]byte, 0, 300)
@@ -34,6 +41,13 @@ func RecycleSlice(s *[]byte) {
 		}
 		if cap(*s) <= 1024 {
 			_BioSequenceByteSlicePool.Put(s)
+		} else if cap(*s) >= _LargeSliceThreshold {
+			n := int64(cap(*s))
+			*s = nil
+			prev := _largeSliceDiscardedBytes.Load()
+			if _largeSliceDiscardedBytes.Add(n)/_GCBytesBudget > prev/_GCBytesBudget {
+				runtime.GC()
+			}
 		}
 	}
 }
--- a/pkg/obitax/taxid.go
+++ b/pkg/obitax/taxid.go
@@ -31,7 +31,7 @@ func NewTaxidFactory(code string, alphabet obiutils.AsciiSet) *TaxidFactory {
 // It extracts the relevant part of the string after the first colon (':') if present.
 func (f *TaxidFactory) FromString(taxid string) (Taxid, error) {
 	taxid = obiutils.AsciiSpaceSet.TrimLeft(taxid)
-	part1, part2 := obiutils.SplitInTwo(taxid, ':')
+	part1, part2 := obiutils.LeftSplitInTwo(taxid, ':')
 	if len(part2) == 0 {
 		taxid = part1
 	} else {
--- a/pkg/obitools/obiclean/graph.go
+++ b/pkg/obitools/obiclean/graph.go
@@ -64,7 +64,7 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
 		fmt.Println(err)
 	}

-	destfile, err := obiutils.CompressStream(file, true, true)
+	destfile, err := obiutils.CompressStream(file, compressed, true)
 	if err != nil {
 		fmt.Println(err)
 	}
--- a/pkg/obitools/obiconvert/sequence_reader.go
+++ b/pkg/obitools/obiconvert/sequence_reader.go
@@ -68,6 +68,8 @@ func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
 						strings.HasSuffix(path, "seq.gz") ||
 						strings.HasSuffix(path, "gb") ||
 						strings.HasSuffix(path, "gb.gz") ||
+						strings.HasSuffix(path, "gbff") ||
+						strings.HasSuffix(path, "gbff.gz") ||
 						strings.HasSuffix(path, "dat") ||
 						strings.HasSuffix(path, "dat.gz") ||
 						strings.HasSuffix(path, "ecopcr") ||
@@ -204,7 +206,7 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
 					iterator = iterator.PairTo(ip)
 				}
 			} else {
-				iterator = obiiter.NilIBioSequence
+				return obiiter.NilIBioSequence, fmt.Errorf("no sequence files found in the provided paths")
 			}
 		}

@@ -212,6 +214,8 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {

 	iterator = iterator.Speed("Reading sequences")

+	iterator = iterator.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
+
 	return iterator, nil
 }

--- a/pkg/obitools/obirefidx/obirefidx.go
+++ b/pkg/obitools/obirefidx/obirefidx.go
@@ -291,5 +291,5 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
 		go f()
 	}

-	return indexed.Rebatch(obidefault.BatchSize())
+	return indexed.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
 }
--- a/pkg/obiutils/memsize.go
+++ b/pkg/obiutils/memsize.go
@@ -0,0 +1,85 @@
+package obiutils
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"unicode"
+)
+
+// ParseMemSize parses a human-readable memory size string and returns the
+// equivalent number of bytes. The value is a number optionally followed by a
+// unit suffix (case-insensitive):
+//
+//	B  or (no suffix) — bytes
+//	K  or KB           — kibibytes  (1 024)
+//	M  or MB           — mebibytes  (1 048 576)
+//	G  or GB           — gibibytes  (1 073 741 824)
+//	T  or TB           — tebibytes  (1 099 511 627 776)
+//
+// Examples: "512", "128K", "128k", "64M", "1G", "2GB"
+func ParseMemSize(s string) (int, error) {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		return 0, fmt.Errorf("empty memory size string")
+	}
+
+	// split numeric prefix from unit suffix
+	i := 0
+	for i < len(s) && (unicode.IsDigit(rune(s[i])) || s[i] == '.') {
+		i++
+	}
+	numStr := s[:i]
+	unit := strings.ToUpper(strings.TrimSpace(s[i:]))
+	// strip trailing 'B' from two-letter units (KB→K, MB→M …)
+	if len(unit) == 2 && unit[1] == 'B' {
+		unit = unit[:1]
+	}
+
+	val, err := strconv.ParseFloat(numStr, 64)
+	if err != nil {
+		return 0, fmt.Errorf("invalid memory size %q: %w", s, err)
+	}
+
+	var multiplier float64
+	switch unit {
+	case "", "B":
+		multiplier = 1
+	case "K":
+		multiplier = 1024
+	case "M":
+		multiplier = 1024 * 1024
+	case "G":
+		multiplier = 1024 * 1024 * 1024
+	case "T":
+		multiplier = 1024 * 1024 * 1024 * 1024
+	default:
+		return 0, fmt.Errorf("unknown memory unit %q in %q", unit, s)
+	}
+
+	return int(val * multiplier), nil
+}
+
+// FormatMemSize formats a byte count as a human-readable string with the
+// largest unit that produces a value ≥ 1 (e.g. 1536 → "1.5K").
+func FormatMemSize(n int) string {
+	units := []struct {
+		suffix string
+		size   int
+	}{
+		{"T", 1024 * 1024 * 1024 * 1024},
+		{"G", 1024 * 1024 * 1024},
+		{"M", 1024 * 1024},
+		{"K", 1024},
+	}
+	for _, u := range units {
+		if n >= u.size {
+			v := float64(n) / float64(u.size)
+			if v == float64(int(v)) {
+				return fmt.Sprintf("%d%s", int(v), u.suffix)
+			}
+			return fmt.Sprintf("%.1f%s", v, u.suffix)
+		}
+	}
+	return fmt.Sprintf("%dB", n)
+}
--- a/pkg/obiutils/strings.go
+++ b/pkg/obiutils/strings.go
@@ -144,7 +144,7 @@ func (r *AsciiSet) TrimLeft(s string) string {
 	return s[i:]
 }

-func SplitInTwo(s string, sep byte) (string, string) {
+func LeftSplitInTwo(s string, sep byte) (string, string) {
 	i := 0
 	for ; i < len(s); i++ {
 		c := s[i]
@@ -157,3 +157,17 @@ func SplitInTwo(s string, sep byte) (string, string) {
 	}
 	return s[:i], s[i+1:]
 }
+
+func RightSplitInTwo(s string, sep byte) (string, string) {
+	i := len(s) - 1
+	for ; i >= 0; i-- {
+		c := s[i]
+		if c == sep {
+			break
+		}
+	}
+	if i == len(s) {
+		return s, ""
+	}
+	return s[:i], s[i+1:]
+}
--- a/release_notes.sh
+++ b/release_notes.sh
@@ -0,0 +1,294 @@
+#!/bin/bash
+
+# Generate GitHub-compatible release notes for an OBITools4 version.
+#
+# Usage:
+#   ./release_notes.sh                 # latest version
+#   ./release_notes.sh -v 4.4.15       # specific version
+#   ./release_notes.sh -l              # list available versions
+#   ./release_notes.sh -r              # raw commit list (no LLM)
+#   ./release_notes.sh -c -v 4.4.16   # show LLM context for a version
+
+GITHUB_REPO="metabarcoding/obitools4"
+GITHUB_API="https://api.github.com/repos/${GITHUB_REPO}"
+VERSION=""
+LIST_VERSIONS=false
+RAW_MODE=false
+CONTEXT_MODE=false
+LLM_MODEL="ollama:qwen3-coder-next:latest"
+
+# ── Helpers ──────────────────────────────────────────────────────────────
+
+die() { echo "Error: $*" >&2; exit 1; }
+
+next_patch() {
+  local v="$1"
+  local major minor patch
+  major=$(echo "$v" | cut -d. -f1)
+  minor=$(echo "$v" | cut -d. -f2)
+  patch=$(echo "$v" | cut -d. -f3)
+  echo "${major}.${minor}.$(( patch + 1 ))"
+}
+
+# Strip "pre-" prefix to get the bare version number for installation section
+bare_version() {
+  echo "$1" | sed 's/^pre-//'
+}
+
+installation_section() {
+  local v
+  v=$(bare_version "$1")
+  cat <<INSTALL_EOF
+
+## Installation
+
+### Pre-built binaries
+
+Download the appropriate archive for your system from the
+[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_${v})
+and extract it:
+
+#### Linux (AMD64)
+\`\`\`bash
+tar -xzf obitools4_${v}_linux_amd64.tar.gz
+\`\`\`
+
+#### Linux (ARM64)
+\`\`\`bash
+tar -xzf obitools4_${v}_linux_arm64.tar.gz
+\`\`\`
+
+#### macOS (Intel)
+\`\`\`bash
+tar -xzf obitools4_${v}_darwin_amd64.tar.gz
+\`\`\`
+
+#### macOS (Apple Silicon)
+\`\`\`bash
+tar -xzf obitools4_${v}_darwin_arm64.tar.gz
+\`\`\`
+
+All OBITools4 binaries are included in each archive.
+
+### From source
+
+You can also compile and install OBITools4 directly from source using the
+installation script:
+
+\`\`\`bash
+curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version ${v}
+\`\`\`
+
+By default binaries are installed in \`/usr/local/bin\`. Use \`--install-dir\` to
+change the destination and \`--obitools-prefix\` to add a prefix to command names:
+
+\`\`\`bash
+curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\
+  bash -s -- --version ${v} --install-dir ~/local --obitools-prefix k
+\`\`\`
+INSTALL_EOF
+}
+
+display_help() {
+  cat <<EOF
+Usage: $(basename "$0") [OPTIONS]
+
+Generate GitHub-compatible Markdown release notes for an OBITools4 version.
+
+Options:
+  -v, --version VERSION   Target version (e.g., 4.4.15). Default: latest.
+  -l, --list              List all available versions and exit.
+  -r, --raw               Output raw commit list without LLM summarization.
+  -c, --context           Show the exact context (commits + prompt) sent to the LLM.
+  -m, --model MODEL       LLM model for orla (default: $LLM_MODEL).
+  -h, --help              Display this help message.
+
+Examples:
+  $(basename "$0")                  # release notes for the latest version
+  $(basename "$0") -v 4.4.15       # release notes for a specific version
+  $(basename "$0") -l              # list versions
+  $(basename "$0") -r -v 4.4.15    # raw commit log for a version
+  $(basename "$0") -c -v 4.4.16    # show LLM context for a version
+EOF
+}
+
+# Fetch all Release tags from GitHub API (sorted newest first)
+fetch_versions() {
+  curl -sf "${GITHUB_API}/releases" \
+    | grep '"tag_name":' \
+    | sed -E 's/.*"tag_name": "Release_([0-9.]+)".*/\1/' \
+    | sort -V -r
+}
+
+# ── Parse arguments ──────────────────────────────────────────────────────
+
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    -v|--version)  VERSION="$2"; shift 2 ;;
+    -l|--list)     LIST_VERSIONS=true; shift ;;
+    -r|--raw)      RAW_MODE=true; shift ;;
+    -c|--context)  CONTEXT_MODE=true; shift ;;
+    -m|--model)    LLM_MODEL="$2"; shift 2 ;;
+    -h|--help)     display_help; exit 0 ;;
+    *)             die "Unsupported option: $1" ;;
+  esac
+done
+
+# ── List mode ────────────────────────────────────────────────────────────
+
+if [ "$LIST_VERSIONS" = true ]; then
+  echo "Available OBITools4 versions:" >&2
+  echo "==============================" >&2
+  fetch_versions
+  exit 0
+fi
+
+# ── Resolve versions ─────────────────────────────────────────────────────
+
+all_versions=$(fetch_versions)
+[ -z "$all_versions" ] && die "Could not fetch versions from GitHub"
+
+if [ -z "$VERSION" ]; then
+   # ── Pre-release mode: local HEAD vs latest GitHub tag ──────────────────
+   PRE_RELEASE=true
+   previous_tag="Release_${latest_version}"
+   VERSION="pre-$(next_patch "$latest_version")"
+
+   echo "Pre-release mode: $previous_tag -> HEAD (as $VERSION)" >&2
+
+   # Need to be in a git repo
+   if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
+     die "Not inside a git repository. Pre-release mode requires a local git repo."
+   fi
+
+   # Check that the previous tag exists locally
+   if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then
+     echo "Tag $previous_tag not found locally, fetching..." >&2
+     git fetch --tags 2>/dev/null || true
+     if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then
+       die "Tag $previous_tag not found locally or remotely"
+     fi
+   fi
+
+  # Get local commits from the tag to HEAD (full messages)
+  commit_list=$(git log --format="%h %B" "${previous_tag}..HEAD" 2>/dev/null)
+
+   if [ -z "$commit_list" ]; then
+     die "No local commits found since $previous_tag"
+   fi
+ else
+   # ── Published release mode: between two GitHub tags ────────────────────
+   PRE_RELEASE=false
+   tag_name="Release_${VERSION}"
+
+   # Verify the requested version exists
+   if ! echo "$all_versions" | grep -qx "$VERSION"; then
+     die "Version $VERSION not found. Use -l to list available versions."
+   fi
+
+   # Find the previous version
+   previous_version=$(echo "$all_versions" | grep -A1 -x "$VERSION" | tail -1)
+
+   if [ "$previous_version" = "$VERSION" ] || [ -z "$previous_version" ]; then
+     previous_tag=""
+     echo "No previous version found -- will include all commits for $tag_name" >&2
+   else
+     previous_tag="Release_${previous_version}"
+     echo "Generating notes: $previous_tag -> $tag_name" >&2
+   fi
+
+   # Fetch commit messages between tags via GitHub compare API
+   if [ -n "$previous_tag" ]; then
+     commits_json=$(curl -sf "${GITHUB_API}/compare/${previous_tag}...${tag_name}")
+     if [ -z "$commits_json" ]; then
+       die "Could not fetch commit comparison from GitHub"
+     fi
+     commit_list=$(echo "$commits_json" \
+      | jq -r '.commits[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null)
+   else
+     commits_json=$(curl -sf "${GITHUB_API}/commits?sha=${tag_name}&per_page=50")
+     if [ -z "$commits_json" ]; then
+       die "Could not fetch commits from GitHub"
+     fi
+     commit_list=$(echo "$commits_json" \
+      | jq -r '.[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null)
+   fi
+
+   if [ -z "$commit_list" ]; then
+     die "No commits found between $previous_tag and $tag_name"
+   fi
+fi
+
+# ── LLM prompt (shared by context mode and summarization) ────────────────
+
+LLM_PROMPT="Summarize the following commits into a GitHub release note for version ${VERSION}. \
+Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping \
+that is irrelevant to end users. Describe each user-facing change precisely without exposing \
+code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this \
+exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}"
+
+# ── Raw mode: just output the commit list ────────────────────────────────
+
+if [ "$RAW_MODE" = true ]; then
+  echo "# Release ${VERSION}"
+  echo ""
+  echo "## Commits"
+  echo ""
+  echo "$commit_list" | while IFS= read -r line; do
+    echo "- ${line}"
+  done
+  installation_section "$VERSION"
+  exit 0
+fi
+
+# ── Context mode: show what would be sent to the LLM ────────────────────
+
+if [ "$CONTEXT_MODE" = true ]; then
+  echo "=== LLM Model ==="
+  echo "$LLM_MODEL"
+  echo ""
+  echo "=== Prompt ==="
+  echo "$LLM_PROMPT"
+  echo ""
+  echo "=== Stdin (commit list) ==="
+  echo "$commit_list"
+  exit 0
+fi
+
+# ── LLM summarization ───────────────────────────────────────────────────
+
+if ! command -v orla >/dev/null 2>&1; then
+  die "orla is required for LLM summarization. Use -r for raw output."
+fi
+
+if ! command -v jq >/dev/null 2>&1; then
+  die "jq is required for JSON parsing. Use -r for raw output."
+fi
+
+echo "Summarizing with LLM ($LLM_MODEL)..." >&2
+
+raw_output=$(echo "$commit_list" | \
+  ORLA_MAX_TOOL_CALLS=50 orla agent -m "$LLM_MODEL" \
+  "$LLM_PROMPT" \
+  2>/dev/null) || true
+
+if [ -z "$raw_output" ]; then
+  echo "Warning: LLM returned empty output, falling back to raw mode" >&2
+  exec "$0" -r -v "$VERSION"
+fi
+
+# Sanitize: extract JSON object, strip control characters
+sanitized=$(echo "$raw_output" | sed -n '/^{/,/^}/p' | tr -d '\000-\011\013-\014\016-\037')
+
+release_title=$(echo "$sanitized" | jq -r '.title // empty' 2>/dev/null)
+release_body=$(echo "$sanitized" | jq -r '.body // empty' 2>/dev/null)
+
+if [ -n "$release_title" ] && [ -n "$release_body" ]; then
+  echo "# ${release_title}"
+  echo ""
+  echo "$release_body"
+  installation_section "$VERSION"
+else
+  echo "Warning: JSON parsing failed, falling back to raw mode" >&2
+  exec "$0" -r -v "$VERSION"
+fi
--- a/tools/json2md.py
+++ b/tools/json2md.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+"""
+Read potentially malformed JSON from stdin (aichat output), extract title and
+body, and print them as plain text: title on first line, blank line, then body.
+Exits with 1 on failure (no output).
+"""
+
+import sys
+import json
+import re
+
+text = sys.stdin.read()
+
+m = re.search(r'\{.*\}', text, re.DOTALL)
+if not m:
+    sys.exit(1)
+
+s = m.group()
+obj = None
+
+try:
+    obj = json.loads(s)
+except Exception:
+    s2 = re.sub(r'(?<!\\)\n', r'\\n', s)
+    try:
+        obj = json.loads(s2)
+    except Exception:
+        sys.exit(1)
+
+title = obj.get('title', '').strip()
+body  = obj.get('body', '').strip()
+
+if not title or not body:
+    sys.exit(1)
+
+print(f"{title}\n\n{body}")
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-4.4.12
+4.4.22
Author	SHA1	Message	Date
Eric Coissac	94b0887069	Memory-aware Batching and Static Linux Builds ### Memory-Aware Batching - Replaced single batch size limits with configurable min/max bounds and memory limits for more precise control over resource usage. - Added `--batch-mem` CLI option to enable adaptive batching based on estimated sequence memory footprint (e.g., 128K, 64M, 1G). - Introduced `RebatchBySize()` with explicit support for both byte and count limits, flushing when either threshold is exceeded. - Implemented conservative memory estimation via `BioSequence.MemorySize()` and enhanced garbage collection to trigger explicit cleanup after large batch discards. - Updated internal batching logic across `batchiterator.go`, `fragment.go`, and `obirefidx.go` to consistently use default memory (128 MB) and size (min: 1, max: 2000) bounds. ### Linux Build Enhancements - Enabled static linking for Linux binaries using musl, producing portable, self-contained executables without external dependencies. ### Notes - This release consolidates and improves batching behavior introduced in 4.4.20, with no breaking changes to the public API. - All user-facing batching behavior is now governed by consistent memory and count constraints, improving predictability and stability during large dataset processing.	2026-03-13 15:16:41 +01:00
Eric Coissac	c188580aac	Replace Rebatch with RebatchBySize using default batch parameters Replace calls to Rebatch(size) with RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax()) in batchiterator.go, fragment.go, and obirefidx.go to ensure consistent use of default memory and size limits for batch rebatching.	2026-03-13 15:16:33 +01:00
Eric Coissac	1e1f575d1c	refactor: replace single batch size with min/max bounds and memory limits Introduce separate _BatchSize (min) and _BatchSizeMax (max) constants to replace the single _BatchSize variable. Update RebatchBySize to accept both maxBytes and maxCount parameters, flushing when either limit is exceeded. Set default batch size min to 1, max to 2000, and memory limit to 128 MB. Update CLI options and sequence_reader.go accordingly.	2026-03-13 15:07:35 +01:00
Eric Coissac	40769bf827	Add memory-based batching support Implement memory-aware batch sizing with --batch-mem CLI option, enabling adaptive batching based on estimated sequence memory footprint. Key changes: - Added _BatchMem and related getters/setters in pkg/obidefault - Implemented RebatchBySize() in pkg/obiter for memory-constrained batching - Added BioSequence.MemorySize() for conservative memory estimation - Integrated batch-mem option in pkg/obioptions with human-readable size parsing (e.g., 128K, 64M, 1G) - Added obiutils.ParseMemSize/FormatMemSize for unit conversion - Enhanced pool GC in pkg/obiseq/pool.go to trigger explicit GC for large slice discards - Updated sequence_reader.go to apply memory-based rebatching when enabled	2026-03-13 14:54:21 +01:00
Eric Coissac	74e6fcaf83	feat: add static linking for Linux builds using musl Enable static linking for Linux binaries by installing musl-tools and passing appropriate LDFLAGS during build. This ensures portable, self-contained executables for Linux targets.	2026-03-13 14:26:31 +01:00
coissac	30ec8b1b63	Merge pull request #92 from metabarcoding/push-mvpuxnxoyypu 4.4.21: Parallel builds, robust installation, and rope-based parsing enhancements	2026-03-13 12:00:32 +01:00
Eric Coissac	cdc72c5346	4.4.21: Parallel builds, robust installation, and rope-based parsing enhancements This release introduces significant improvements to build reliability and performance, alongside key parsing enhancements for sequence data. ### Build & Installation Improvements - Added support for parallel compilation via `-j/--jobs` option in both the Makefile and install script, enabling faster builds on multi-core systems. The default remains single-threaded for safety. - Enhanced Makefile with `.DEFAULT_GOAL := all` for consistent behavior and a documented `help` target. - Replaced fragile file operations with robust error handling, clear diagnostics, and automatic preservation of the build directory on copy failures to aid recovery. ### Rope-Based Parsing Enhancements (from 4.4.20) - Introduced direct rope-based parsers for FASTA, EMBL, and FASTQ formats, improving memory efficiency for large files. - Added U→T conversion support during sequence extraction and more reliable line ending detection. - Unified rope scanning logic under a new `ropeScanner` for better maintainability. - Added `TakeQualities()` method to BioSequence for more efficient handling of quality data. ### Bug Fixes (from 4.4.20) - Fixed `CompressStream` to correctly respect the `compressed` variable. - Replaced ambiguous string splitting utilities with precise left/right split variants (`LeftSplitInTwo`, `RightSplitInTwo`). ### Release Tooling (from 4.4.20) - Streamlined release process with modular targets (`jjpush-notes`, `jjpush-push`, `jjpush-tag`) and AI-assisted note generation via `aichat`. - Improved versioning support via the `VERSION` environment variable in `bump-version`. - Switched PR submission from raw `jj git push` to `stakk` for consistency and reliability. Note: This release incorporates key enhancements from 4.4.20 that impact end users, while focusing on build robustness and performance gains.	2026-03-13 11:59:32 +01:00
Eric Coissac	82a9972be7	Add parallel compilation support and improve Makefile/install script robustness - Add .DEFAULT_GOAL := all to Makefile for consistent default target - Document -j/--jobs option in README.md to allow parallel compilation - Add JOBS variable and -j/--jobs argument to install script (default: 1) - Replace fragile mkdir/cp commands with robust error handling and clear diagnostics - Add build directory preservation on copy failure for manual recovery - Pass -j option to make during compilation to enable parallel builds	2026-03-13 11:59:20 +01:00
coissac	ff6e515b2a	Merge pull request #91 from metabarcoding/push-uotrstkymowq 4.4.20: Rope-based parsing, improved release tooling, and bug fixes	2026-03-12 20:15:33 +01:00
Eric Coissac	cd0c525f50	4.4.20: Rope-based parsing, improved release tooling, and bug fixes ### Enhancements - Rope-based parsing: Added direct rope parsing for FASTA, EMBL, and FASTQ formats via `FastaChunkParserRope`, `EmblChunkParserRope`, and `FastqChunkParserRope`. Sequence extraction now supports U→T conversion and improved line ending detection. - Rope scanner refactoring: Unified rope scanning logic under a new `ropeScanner`, improving maintainability and consistency. - Sequence handling: Added `TakeQualities()` method to BioSequence for more efficient quality data handling. ### Bug Fixes - Compression behavior: Fixed `CompressStream` to correctly use the `compressed` variable instead of a hardcoded boolean. - String splitting: Replaced ambiguous `SplitInTwo` calls with precise `LeftSplitInTwo` or `RightSplitInTwo`, and added dedicated right-split utility. ### Tooling & Workflow Improvements - Makefile enhancements: Added colored terminal output, a `help` target for documenting all targets, and improved release workflow automation. - Release process: Refactored `jjpush` into modular targets (`jjpush-notes`, `jjpush-push`, `jjpush-tag`), replaced `orla` with `aichat` for AI-assisted release notes, and introduced robust JSON parsing using Python. Release notes are now generated and stored in temp files for tag creation. - Versioning: `bump-version` now supports the VERSION environment variable for manual version setting. - Submission: Switched from raw `jj git push` to `stakk` for PR submission. ### Internal Notes - Installation instructions are now included in release tags. - Fixed-size carry buffer replaced with dynamic slice for arbitrarily long line support without extra allocations.	2026-03-12 20:14:11 +01:00
Eric Coissac	abe935aa18	Add help target, colorize output, and improve release workflow - Add colored terminal output support (GREEN, YELLOW, BLUE, NC) - Introduce `help` target to document all Makefile targets - Enhance `bump-version` to accept VERSION env var for manual version setting - Refactor jjpush: split into modular targets (jjpush-notes, jjpush-push, jjpush-tag) - Replace orla with aichat for AI-powered release notes generation - Add robust JSON parsing using Python for release notes extraction - Use stakk for PR submission (replacing raw `jj git push`) - Generate and store release notes in temp files for tag creation - Add installation instructions to release tags - Update .PHONY with new targets 4.4.20: Rope-based parsing, improved release tooling, and bug fixes ### Enhancements - Rope-based parsing: Added direct rope parsing for FASTA, EMBL, and FASTQ formats via `FastaChunkParserRope`, `EmblChunkParserRope`, and `FastqChunkRope` functions, eliminating unnecessary memory allocation via Pack(). Sequence extraction now supports U→T conversion and improved line ending detection. - Rope scanner refactoring: Unified rope scanning logic under a new `ropeScanner`, improving maintainability and consistency across parsers. - Sequence handling: Added `TakeQualities()` method to BioSequence for more efficient quality data handling. ### Bug Fixes - Compression behavior: Fixed CompressStream to correctly use the `compressed` variable instead of a hardcoded boolean. - String splitting: Replaced ambiguous `SplitInTwo` calls with precise `LeftSplitInTwo` or `RightSplitInTwo`, and added dedicated right-split utility. ### Tooling & Workflow Improvements - Makefile enhancements: Added colored terminal output, a `help` target for documenting all targets, and improved release workflow automation. - Release process: Refactored `jjpush` into modular targets (`jjpush-notes`, `jjpush-push`, `jjpush-tag`), replaced `orla` with `aichat` for AI-assisted release notes, and introduced robust JSON parsing using Python. Release notes are now generated and stored in temp files for tag creation. - Versioning: `bump-version` now supports the VERSION environment variable for manual version setting. - Submission: Switched from raw `jj git push` to `stakk` for PR submission. ### Internal Notes - Installation instructions are now included in release tags. - Fixed-size carry buffer replaced with dynamic slice for arbitrarily long line support without extra allocations.	2026-03-12 20:14:11 +01:00
Eric Coissac	8dd32dc1bf	Fix CompressStream call to use compressed variable Replace hardcoded boolean with the `compressed` variable in CompressStream call to ensure correct compression behavior.	2026-03-12 18:48:22 +01:00
Eric Coissac	6ee8750635	Replace SplitInTwo with LeftSplitInTwo/RightSplitInTwo for precise splitting Replace SplitInTwo calls with LeftSplitInTwo or RightSplitInTwo depending on the intended split direction. In fastseq_json_header.go, extract rank from suffix without splitting; in biosequenceslice.go and taxid.go, use LeftSplitInTwo to split from the left; add RightSplitInTwo utility function for splitting from the right.	2026-03-12 18:41:28 +01:00
Eric Coissac	8c318c480e	replace fixed-size carry buffer with dynamic slice Replace the fixed [256]byte carry buffer with a dynamic []byte slice to support arbitrarily long lines without heap allocation during accumulation. Update all carry buffer handling logic to use len(s.carry) and append instead of fixed-size copy operations.	2026-03-11 20:44:45 +01:00
Eric Coissac	09fbc217d3	Add EMBL rope parsing support and improve sequence extraction Introduce EmblChunkParserRope function to parse EMBL chunks directly from a rope without using Pack(). Add extractEmblSeq helper to scan sequence sections and handle U to T conversion. Update parser logic to use rope-based parsing when available, and fix feature table handling for WGS entries.	2026-03-10 17:02:14 +01:00
Eric Coissac	3d2e205722	Refactor rope scanner and add FASTQ rope parser This commit refactors the rope scanner implementation by renaming gbRopeScanner to ropeScanner and extracting the common functionality into a new file. It also introduces a new FastqChunkParserRope function that parses FASTQ chunks directly from a rope without Pack(), enabling more efficient memory usage. The existing parsers are updated to use the new rope-based parser when available. The BioSequence type is enhanced with a TakeQualities method for more efficient quality data handling.	2026-03-10 16:47:03 +01:00
Eric Coissac	623116ab13	Add rope-based FASTA parsing and improve sequence handling Introduce FastaChunkParserRope for direct rope-based FASTA parsing, enhance sequence extraction with whitespace skipping and U->T conversion, and update parser logic to support both rope and raw data sources. - Added extractFastaSeq function to scan sequence bytes directly from rope - Implemented FastaChunkParserRope for rope-based parsing - Modified _ParseFastaFile to use rope when available - Updated sequence handling to support U->T conversion - Fixed line ending detection for FASTA parsing	2026-03-10 16:34:33 +01:00
coissac	1e4509cb63	Merge pull request #90 from metabarcoding/push-uzpqqoqvpnxw Push uzpqqoqvpnxw	2026-03-10 15:53:08 +01:00
Eric Coissac	b33d7705a8	Bump version to 4.4.19 Update version from 4.4.18 to 4.4.19 in both version.txt and pkg/obioptions/version.go	2026-03-10 15:51:36 +01:00
Eric Coissac	1342c83db6	Use NewBioSequenceOwning to avoid unnecessary sequence copying Replace NewBioSequence with NewBioSequenceOwning in genbank_read.go to take ownership of sequence slices without copying, improving performance. Update biosequence.go to add the new TakeSequence method and NewBioSequenceOwning constructor.	2026-03-10 15:51:35 +01:00
Eric Coissac	b246025907	Optimize Fasta batch formatting Optimize FormatFastaBatch to pre-allocate buffer and write sequences directly without intermediate strings, improving performance and memory usage.	2026-03-10 15:43:59 +01:00
Eric Coissac	761e0dbed3	Implémentation d'un parseur GenBank utilisant rope pour réduire l'usage de mémoire Ajout d'un parseur GenBank basé sur rope pour réduire l'usage de mémoire (RSS) et les allocations heap. - Ajout de `gbRopeScanner` pour lire les lignes sans allocation heap - Implémentation de `GenbankChunkParserRope` qui utilise rope au lieu de `Pack()` - Modification de `_ParseGenbankFile` et `ReadGenbank` pour utiliser le nouveau parseur - Réduction du RSS attendue de 57 GB à ~128 MB × workers - Conservation de l'ancien parseur pour compatibilité et tests Réduction significative des allocations (~50M) et temps sys, avec un temps user comparable ou meilleur.	2026-03-10 15:35:36 +01:00
Eric Coissac	a7ea47624b	Optimisation du parsing des grandes séquences Implémente une optimisation du parsing des grandes séquences en évitant l'allocation de mémoire inutile lors de la fusion des chunks. Ajoute un support pour le parsing direct de la structure rope, ce qui permet de réduire les allocations et d'améliorer les performances lors du traitement de fichiers GenBank/EMBL et FASTA/FASTQ de plusieurs Gbp. Les parseurs sont mis à jour pour utiliser la rope non-packée et le nouveau mécanisme d'écriture in-place pour les séquences GenBank.	2026-03-10 14:20:21 +01:00
Eric Coissac	61e346658e	Refactor jjpush workflow and enhance release notes generation Split the jjpush target into multiple sub-targets (jjpush-describe, jjpush-bump, jjpush-push, jjpush-tag) for better modularity and control. Enhance release notes generation by: - Using git log with full commit messages instead of GitHub API for pre-release mode - Adding robust JSON parsing with fallbacks for release notes - Including detailed installation instructions in release notes - Supporting both pre-release and published release modes Update release_notes.sh to handle pre-release mode, improve commit message fetching, and add installation section to release notes. Add .PHONY declarations for new sub-targets.	2026-03-10 11:09:19 +01:00
coissac	1ba1294b11	Merge pull request #89 from metabarcoding/push-uoqxkozlonwx Push uoqxkozlonwx	2026-02-20 11:42:40 +01:00
Eric Coissac	b2476fffcb	Bump version to 4.4.18 Update version from 4.4.17 to 4.4.18 in version.txt and corresponding Go variable _Version.	2026-02-20 11:40:43 +01:00
Eric Coissac	b05404721e	Bump version to 4.4.16 Update version from 4.4.15 to 4.4.16 in version.go and version.txt files.	2026-02-20 11:40:40 +01:00
Eric Coissac	c57e788459	Fix GenBank parsing and add release notes script This commit fixes an issue in the GenBank parser where empty parts were being included in the parsed data. It also introduces a new script `release_notes.sh` to automate the generation of GitHub-compatible release notes for OBITools4 versions, including support for LLM summarization and various output modes.	2026-02-20 11:37:51 +01:00
coissac	1cecf23978	Merge pull request #86 from metabarcoding/push-oulwykrpwxuz Push oulwykrpwxuz	2026-02-11 06:34:05 +01:00
Eric Coissac	4c824ef9b7	Bump version to 4.4.15 Update version from 4.4.14 to 4.4.15 in version.txt and pkg/obioptions/version.go	2026-02-11 06:31:11 +01:00
Eric Coissac	1ce5da9bee	Support new sequence file formats and improve error handling Add support for .gbff and .gbff.gz file extensions in sequence reader. Update the logic to return an error instead of using NilIBioSequence when no sequence files are found, improving the error handling and user feedback.	2026-02-11 06:31:10 +01:00
coissac	dc23d9de9a	Merge pull request #85 from metabarcoding/push-smturnsrozkp Push smturnsrozkp	2026-02-10 22:19:22 +01:00
Eric Coissac	aa9d7bbf72	Bump version to 4.4.14 Update version number from 4.4.13 to 4.4.14 in both version.go and version.txt files.	2026-02-10 22:17:23 +01:00
Eric Coissac	db22d20d0a	Rename obisuperkmer test script to obik-super and update command references Update test script name from obisuperkmer to obik-super and adjust all command references accordingly. - Changed TEST_NAME from 'obisuperkmer' to 'obik-super' - Changed CMD from 'obisuperkmer' to 'obik' - Updated MCMD to 'OBIk-super' - Modified command calls to use '$CMD super' instead of direct command names - Updated help test to use '$CMD super -h' - Updated all test cases to use the new command format	2026-02-10 22:17:22 +01:00
coissac	7c05bdb01c	Merge pull request #84 from metabarcoding/push-uxvowwlxkrlq Push uxvowwlxkrlq	2026-02-10 22:12:18 +01:00
Eric Coissac	b6542c4523	Bump version to 4.4.13 Update version from 4.4.12 to 4.4.13 in version.txt and pkg/obioptions/version.go	2026-02-10 22:10:38 +01:00
@@ -1 +1 @@
 .4.12
 .4.22