Merge pull request #90 from metabarcoding/push-uzpqqoqvpnxw

Push uzpqqoqvpnxw
2026-05-09 15:40:40 +00:00 · 2026-03-10 15:53:08 +01:00
parent 1ba1294b11 b33d7705a8
commit 1e4509cb63
13 changed files with 846 additions and 104 deletions
@@ -16,6 +16,7 @@
 **/*.tgz
 **/*.yaml
 **/*.csv
+**/*.pb.gz
 xx

 .rhistory
@@ -128,40 +128,58 @@ jjnew:
 	@echo "$(GREEN)✓ New commit created$(NC)"

 jjpush:
-	@echo "$(YELLOW)→ Pushing commit to repository...$(NC)"
+	@$(MAKE) jjpush-describe
+	@$(MAKE) jjpush-bump
+	@$(MAKE) jjpush-push
+	@$(MAKE) jjpush-tag
+	@echo "$(GREEN)✓ Release complete$(NC)"
+
+jjpush-describe:
 	@echo "$(BLUE)→ Documenting current commit...$(NC)"
 	@jj auto-describe
+
+jjpush-bump:
 	@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
 	@jj new
-	@previous_version=$$(cat version.txt); \
-	$(MAKE) bump-version; \
-	version=$$(cat version.txt); \
+	@$(MAKE) bump-version
+	@echo "$(BLUE)→ Documenting version bump commit...$(NC)"
+	@jj auto-describe
+
+jjpush-push:
+	@echo "$(BLUE)→ Pushing commits...$(NC)"
+	@jj git push --change @
+
+jjpush-tag:
+	@version=$$(cat version.txt); \
 	tag_name="Release_$$version"; \
-	previous_tag="Release_$$previous_version"; \
-	echo "$(BLUE)→ Documenting version bump commit...$(NC)"; \
-	jj auto-describe; \
-	echo "$(BLUE)→ Generating release notes from $$previous_tag to current commit...$(NC)"; \
+	echo "$(BLUE)→ Generating release notes for $$tag_name...$(NC)"; \
+	release_message="Release $$version"; \
 	if command -v orla >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then \
-		release_json=$$(ORLA_MAX_TOOL_CALLS=50 jj log -r "$$previous_tag::@" -T 'commit_id.short() ++ " " ++ description' | \
-			orla agent -m ollama:qwen3-coder-next:latest \
-			"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}"); \
-		release_json=$$(echo "$$release_json" | sed -n '/^{/,/^}/p'); \
-		release_title=$$(echo "$$release_json" | jq -r '.title // empty') ; \
-		release_body=$$(echo "$$release_json" | jq -r '.body // empty') ; \
+		previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' HEAD^ 2>/dev/null); \
+		if [ -z "$$previous_tag" ]; then \
+			echo "$(YELLOW)⚠ No previous Release tag found, skipping release notes$(NC)"; \
+		else \
+			raw_output=$$(git log --format="%h %B" "$$previous_tag..HEAD" | \
+				ORLA_MAX_TOOL_CALLS=50 orla agent -m ollama:qwen3-coder-next:latest \
+				"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}" 2>/dev/null) || true; \
+			if [ -n "$$raw_output" ]; then \
+				sanitized=$$(echo "$$raw_output" | sed -n '/^{/,/^}/p' | tr -d '\000-\011\013-\014\016-\037'); \
+				release_title=$$(echo "$$sanitized" | jq -r '.title // empty' 2>/dev/null) ; \
+				release_body=$$(echo "$$sanitized" | jq -r '.body // empty' 2>/dev/null) ; \
 				if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \
 					release_message="$$release_title"$$'\n\n'"$$release_body"; \
 				else \
-			echo "$(YELLOW)⚠ JSON parsing failed, falling back to raw output$(NC)"; \
-			release_message="Release $$version"$$'\n\n'"$$release_json"; \
+					echo "$(YELLOW)⚠ JSON parsing failed, using default release message$(NC)"; \
 				fi; \
-	else \
-		release_message="Release $$version"; \
 			fi; \
-	echo "$(BLUE)→ Pushing commits and creating tag $$tag_name...$(NC)"; \
-	jj git push --change @; \
-	git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "Tag $$tag_name already exists"; \
-	git push origin "$$tag_name" 2>/dev/null || echo "Tag already pushed"
-	@echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)"
+		fi; \
+	fi; \
+	install_section=$$'\n## Installation\n\n### Pre-built binaries\n\nDownload the appropriate archive for your system from the\n[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_'"$$version"')\nand extract it:\n\n#### Linux (AMD64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_amd64.tar.gz\n```\n\n#### Linux (ARM64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_arm64.tar.gz\n```\n\n#### macOS (Intel)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_amd64.tar.gz\n```\n\n#### macOS (Apple Silicon)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_arm64.tar.gz\n```\n\nAll OBITools4 binaries are included in each archive.\n\n### From source\n\nYou can also compile and install OBITools4 directly from source using the\ninstallation script:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version '"$$version"'\n```\n\nBy default binaries are installed in `/usr/local/bin`. Use `--install-dir` to\nchange the destination and `--obitools-prefix` to add a prefix to command names:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\\n  bash -s -- --version '"$$version"' --install-dir ~/local --obitools-prefix k\n```\n'; \
+	release_message="$$release_message$$install_section"; \
+	echo "$(BLUE)→ Creating tag $$tag_name...$(NC)"; \
+	git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "$(YELLOW)⚠ Tag $$tag_name already exists$(NC)"; \
+	echo "$(BLUE)→ Pushing tag $$tag_name...$(NC)"; \
+	git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)"

 jjfetch:
 	@echo "$(YELLOW)→ Pulling latest commits...$(NC)"
@@ -169,5 +187,5 @@ jjfetch:
 	@jj new master@origin
 	@echo "$(GREEN)✓ Latest commits pulled$(NC)"

-.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch bump-version .FORCE
+.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjpush-describe jjpush-bump jjpush-push jjpush-tag jjfetch bump-version .FORCE
 .FORCE:
@@ -0,0 +1,264 @@
+# Optimisation du parsing des grandes séquences
+
+## Contexte
+
+OBITools4 doit pouvoir traiter des séquences de taille chromosomique (plusieurs Gbp), notamment
+issues de fichiers GenBank/EMBL (assemblages de génomes) ou de fichiers FASTA convertis depuis
+ces formats.
+
+## Architecture actuelle
+
+### Pipeline de lecture (`pkg/obiformats/`)
+
+```
+ReadFileChunk (goroutine)
+    → ChannelFileChunk
+    → N × _ParseGenbankFile / _ParseFastaFile (goroutines)
+    → IBioSequence
+```
+
+`ReadFileChunk` (`file_chunk_read.go`) lit le fichier par morceaux via une chaîne de
+`PieceOfChunk` (rope). Chaque nœud fait `fileChunkSize` bytes :
+
+- GenBank/EMBL : 128 MB (`1024*1024*128`)
+- FASTA/FASTQ  : 1 MB (`1024*1024`)
+
+La chaîne est accumulée jusqu'à trouver la fin du dernier enregistrement complet (splitter),
+puis `Pack()` est appelé pour fusionner tous les nœuds en un seul buffer contigu. Ce buffer
+est transmis au parseur via `FileChunk.Raw *bytes.Buffer`.
+
+### Parseur GenBank (`genbank_read.go`)
+
+`GenbankChunkParser` reçoit un `io.Reader` sur le buffer packé, lit ligne par ligne via
+`bufio.NewReader` (buffer 4096 bytes), et pour chaque ligne de la section `ORIGIN` :
+
+```go
+line = string(bline)                        // allocation par ligne
+cleanline := strings.TrimSpace(line)        // allocation
+parts := strings.SplitN(cleanline, " ", 7) // allocation []string + substrings
+for i := 1; i < lparts; i++ {
+    seqBytes.WriteString(parts[i])
+}
+```
+
+Point positif : `seqBytes` est pré-alloué grâce à `lseq` extrait de la ligne `LOCUS`.
+
+### Parseur FASTA (`fastaseq_read.go`)
+
+`FastaChunkParser` lit **octet par octet** via `scanner.ReadByte()`. Pour 3 Gbp :
+3 milliards d'appels. `seqBytes` est un `bytes.Buffer{}` sans pré-allocation.
+
+## Problème principal
+
+Pour une séquence de plusieurs Gbp, `Pack()` fusionne une chaîne de ~N nœuds de 128 MB en
+un seul buffer contigu. C'est une allocation de N × 128 MB suivie d'une copie de toutes les
+données. Bien que l'implémentation de `Pack()` soit efficace (libère les nœuds au fur et à
+mesure via `slices.Grow`), la copie est inévitable avec l'architecture actuelle.
+
+De plus, le parseur GenBank produit des dizaines de millions d'allocations temporaires pour
+parser la section `ORIGIN` (une par ligne).
+
+## Invariant clé découvert
+
+**Si la rope a plus d'un nœud, le premier nœud seul ne se termine pas sur une frontière
+d'enregistrement** (pas de `//\n` en fin de `piece1`).
+
+Preuve par construction dans `ReadFileChunk` :
+- `splitter` est appelé dès le premier nœud (ligne 157)
+- Si `end >= 0` → frontière trouvée dans 128 MB → boucle interne sautée → rope à 1 nœud
+- Si `end < 0` → boucle interne ajoute des nœuds → rope à ≥ 2 nœuds
+
+Corollaire : si rope à 1 nœud, `Pack()` ne fait rien (aucun nœud suivant).
+
+**Attention** : rope à ≥ 2 nœuds ne signifie pas qu'il n'y a qu'une seule séquence dans
+la rope. La rope packée peut contenir plusieurs enregistrements complets. Exemple : records
+de 80 MB → `nextpieces` (48 MB de reste) + nouveau nœud (128 MB) = rope à 2 nœuds
+contenant 2 records complets + début d'un troisième.
+
+L'invariant dit seulement que `piece1` seul est incomplet — pas que la rope entière
+ne contient qu'un seul record.
+
+**Invariant : le dernier FileChunk envoyé finit sur une frontière d'enregistrement.**
+
+Deux chemins dans `ReadFileChunk` :
+
+1. **Chemin normal** (`end >= 0` via `splitter`) : le buffer est explicitement tronqué à
+   `end` (ligne 200 : `pieces.data = pieces.data[:end]`). Frontière garantie par construction
+   pour tous les formats. ✓
+
+2. **Chemin EOF** (`end < 0`, `end = pieces.Len()`) : tout le reste du fichier est envoyé.
+   - **GenBank/EMBL** : présuppose fichier bien formé (se termine par `//\n`). Le parseur
+     lève un `log.Fatalf` sur tout état inattendu — filet de sécurité suffisant. ✓
+   - **FASTQ** : présupposé, vérifié par le parseur. ✓
+   - **FASTA** : garanti par le format lui-même (fin d'enregistrement = EOF ou `>`). ✓
+
+**Hypothèse de travail adoptée** : les fichiers d'entrée sont bien formés. Dans le pire cas,
+le parseur lèvera une erreur explicite. Il n'y a pas de risque de corruption silencieuse.
+
+## Piste d'optimisation : se dispenser de Pack()
+
+### Idée centrale
+
+Au lieu de fusionner la rope avant de la passer au parseur, **parser directement la rope
+nœud par nœud**, et **écrire la séquence compactée in-place dans le premier nœud**.
+
+Pourquoi c'est sûr :
+- Le header (LOCUS, DEFINITION, SOURCE, FEATURES) est **petit** et traité en premier
+- La séquence (ORIGIN) est **à la fin** du record
+- Au moment d'écrire la séquence depuis l'offset 0 de `piece1`, le pointeur de lecture
+  est profond dans la rope (offset >> 0) → jamais de collision
+- La séquence compactée est toujours plus courte que les données brutes
+
+### Pré-allocation
+
+Pour GenBank/EMBL : `lseq` est connu dès la ligne `LOCUS`/`ID` (première ligne, dans
+`piece1`). On peut faire `slices.Grow(piece1.data, lseq)` dès ce moment.
+
+Pour FASTA : pas de taille garantie dans le header, mais `rope.Len()` donne un majorant.
+On peut utiliser `rope.Len() / 2` comme estimation initiale.
+
+### Gestion des jonctions entre nœuds
+
+Une ligne peut chevaucher deux nœuds (rare avec 128 MB, mais possible). Solution : carry
+buffer de ~128 bytes pour les quelques bytes en fin de nœud.
+
+### Cas FASTA/FASTQ multi-séquences
+
+Un FileChunk peut contenir N séquences (notamment FASTA/FASTQ courts). Dans ce cas
+l'écriture in-place dans `piece1` n'est pas applicable directement — on écrase des données
+nécessaires aux séquences suivantes.
+
+Stratégie par cas :
+- **Rope à 1 nœud** (record ≤ 128 MB) : `Pack()` est trivial (no-op), parseur actuel OK
+- **Rope à ≥ 2 nœuds** : par l'invariant, `piece1` ne contient pas de record complet →
+  une seule grande séquence → in-place applicable
+
+### Format d'une ligne séquence GenBank (Après ORIGIN)
+
+```
+/^ *[0-9]+( [nuc]{10}){0,5} [nuc]{1,10}/
+```
+
+### Format d'une ligne séquence GenBank (Après SQ)
+
+La ligne SQ contient aussi la taille de la séquence
+
+```
+/^ *( [nuc]{10}){0,5} [nuc]{1,10} *[0-9]+/
+```
+
+Compactage in-place sur `bline` ([]byte brut, sans conversion `string`) :
+
+```go
+w := 0
+i := 0
+for i < len(bline) && bline[i] == ' '  { i++ }   // skip indentation
+for i < len(bline) && bline[i] <= '9'  { i++ }   // skip position number
+for ; i < len(bline); i++ {
+    if bline[i] != ' ' {
+        bline[w] = bline[i]
+        w++
+    }
+}
+// écrire bline[:w] directement dans piece1.data[seqOffset:]
+```
+
+## Changements nécessaires
+
+1. **`FileChunk`** : exposer la rope `*PieceOfChunk` non-packée en plus (ou à la place)
+   de `Raw *bytes.Buffer`
+2. **`GenbankChunkParser` / `EmblChunkParser`** : accepter `*PieceOfChunk`, parser la
+   rope séquentiellement avec carry buffer pour les jonctions
+3. **`FastaChunkParser`** : idem, avec in-place conditionnel selon taille de la rope
+4. **`ReadFileChunk`** : ne pas appeler `Pack()` avant envoi sur le channel (ou version
+   alternative `ReadFileChunkRope`)
+
+## Fichiers concernés
+
+- `pkg/obiformats/file_chunk_read.go` — structure rope, `ReadFileChunk`
+- `pkg/obiformats/genbank_read.go` — `GenbankChunkParser`, `_ParseGenbankFile`
+- `pkg/obiformats/embl_read.go` — `EmblChunkParser`, `ReadEMBL`
+- `pkg/obiformats/fastaseq_read.go` — `FastaChunkParser`, `_ParseFastaFile`
+- `pkg/obiformats/fastqseq_read.go` — parseur FASTQ (même structure)
+
+## Plan d'implémentation : parseur GenBank sur rope
+
+### Contexte
+
+Baseline mesurée : `obiconvert gbpln640.seq.gz` → 49s real, 42s user, 29s sys, **57 GB RSS**.
+Le sys élevé indique des allocations massives. Deux causes :
+1. `Pack()` : fusionne toute la rope (N × 128 MB) en un buffer contigu avant de parser
+2. Parser ORIGIN : `string(bline)` + `TrimSpace` + `SplitN` × millions de lignes
+
+### 1. `gbRopeScanner`
+
+Struct de lecture ligne par ligne sur la rope, sans allocation heap :
+
+```go
+type gbRopeScanner struct {
+    current *PieceOfChunk
+    pos     int
+    carry   [256]byte  // stack-allocated, max GenBank line = 80 chars
+    carryN  int
+}
+```
+
+`ReadLine()` :
+- Cherche `\n` dans `current.data[pos:]` via `bytes.IndexByte`
+- Si trouvé sans carry : retourne slice direct du node (zéro alloc)
+- Si trouvé avec carry : copie dans carry buffer, retourne `carry[:n]`
+- Si non trouvé : copie le reste dans carry, avance au node suivant, recommence
+- EOF : retourne `carry[:carryN]` puis nil
+
+`extractSequence(dest []byte, UtoT bool) int` :
+- Scan direct des bytes pour section ORIGIN, sans passer par ReadLine
+- Machine d'états : lineStart → skip espaces/digits → copier nucléotides dans dest
+- Stop sur `//` en début de ligne
+- Zéro allocation, UtoT inline
+
+### 2. `GenbankChunkParserRope`
+
+```go
+func GenbankChunkParserRope(source string, rope *PieceOfChunk,
+    withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error)
+```
+
+- Même machine d'états que `GenbankChunkParser`, sur `[]byte` (`bytes.HasPrefix`)
+- LOCUS : extrait `id` et `lseq` par scan direct (remplace `_seqlenght_rx`)
+- FEATURES / default inFeature : taxid extrait par scan de `/db_xref="taxon:`
+  dans la source feature ; `featBytes` rempli seulement si `withFeatureTable=true`
+- DEFINITION : toujours conservée
+- ORIGIN : `dest = make([]byte, 0, lseq+20)` puis `s.extractSequence(dest, UtoT)`
+
+### 3. Modifications `_ParseGenbankFile` et `ReadGenbank`
+
+`_ParseGenbankFile` utilise `chunk.Rope` :
+```go
+sequences, err := GenbankChunkParserRope(chunk.Source, chunk.Rope, ...)
+```
+
+`ReadGenbank` passe `pack=false` :
+```go
+entry_channel := ReadFileChunk(..., false)
+```
+
+### 4. Ce qui NE change pas
+
+- `GenbankChunkParser` reste (référence, tests)
+- `ReadFileChunk`, `Pack()`, autres parseurs (EMBL, FASTA, FASTQ) : inchangés
+
+### 5. Gains attendus
+
+- **RSS** : pic ≈ 128 MB × workers (au lieu de N × 128 MB)
+- **Temps sys** : élimination des mmap/munmap pour les gros buffers
+- **Temps user** : ~50M allocations éliminées
+
+### 6. Vérification
+
+```bash
+/usr/local/go/bin/go build ./...
+diff <(obiconvert gbpln640.seq.gz) gbpln640.reference.fasta
+cd bugs/genbank && ./benchmark.sh gbpln640.seq.gz
+```
+
+Cible : RSS < 1 GB, temps comparable ou meilleur.
@@ -196,6 +196,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
 		1024*1024*128,
 		EndOfLastFlatFileEntry,
 		"\nID   ",
+		true,
 	)

 	newIter := obiiter.MakeIBioSequence()
@@ -245,6 +245,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
 		1024*1024,
 		EndOfLastFastaEntry,
 		"\n>",
+		true,
 	)

 	for i := 0; i < nworker; i++ {
@@ -339,6 +339,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
 		1024*1024,
 		EndOfLastFastqEntry,
 		"\n@",
+		true,
 	)

 	for i := 0; i < nworker; i++ {
@@ -77,45 +77,47 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string {
 //
 // It returns a byte array containing the formatted sequences.
 func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
-	// Create a buffer to store the formatted sequences
 	var bs bytes.Buffer

 	lt := 0
-
 	for _, seq := range batch.Slice() {
 		lt += seq.Len()
 	}

-	// Iterate over each sequence in the batch
+	// Pre-allocate: sequence data + newlines every 60 chars + ~100 bytes header per sequence
+	bs.Grow(lt + lt/60 + 100*batch.Len() + 1)
+
 	log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
-	first := true
+
 	for _, seq := range batch.Slice() {
-		// Check if the sequence is empty
 		if seq.Len() > 0 {
-			// Format the sequence using the provided formater function
-			formattedSeq := FormatFasta(seq, formater)
-
-			if first {
-				bs.Grow(lt + (len(formattedSeq)-seq.Len())*batch.Len()*5/4)
-				first = false
-			}
-
-			// Append the formatted sequence to the buffer
-			bs.WriteString(formattedSeq)
+			// Write header directly into bs — no intermediate string
+			bs.WriteByte('>')
+			bs.WriteString(seq.Id())
+			bs.WriteByte(' ')
+			bs.WriteString(formater(seq))
 			bs.WriteByte('\n')
+
+			// Write folded sequence directly into bs — no copies
+			s := seq.Sequence()
+			l := len(s)
+			for i := 0; i < l; i += 60 {
+				to := i + 60
+				if to > l {
+					to = l
+				}
+				bs.Write(s[i:to])
+				bs.WriteByte('\n')
+			}
 		} else {
-			// Handle empty sequences
 			if skipEmpty {
-				// Skip empty sequences if skipEmpty is true
 				obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id())
 			} else {
-				// Terminate the program if skipEmpty is false
 				log.Fatalf("Sequence %s is empty", seq.Id())
 			}
 		}
 	}

-	// Return the byte array representation of the buffer
 	return &bs
 }

@@ -16,6 +16,7 @@ type SeqFileChunkParser func(string, io.Reader) (obiseq.BioSequenceSlice, error)
 type FileChunk struct {
 	Source string
 	Raw    *bytes.Buffer
+	Rope   *PieceOfChunk
 	Order  int
 }

@@ -97,11 +98,17 @@ func (piece *PieceOfChunk) IsLast() bool {
 	return piece.next == nil
 }

-func (piece *PieceOfChunk) FileChunk(source string, order int) FileChunk {
+func (piece *PieceOfChunk) FileChunk(source string, order int, pack bool) FileChunk {
+	piece = piece.Head()
+	var raw *bytes.Buffer
+	if pack {
 		piece.Pack()
+		raw = bytes.NewBuffer(piece.data)
+	}
 	return FileChunk{
 		Source: source,
-		Raw:    bytes.NewBuffer(piece.data),
+		Raw:    raw,
+		Rope:   piece,
 		Order:  order,
 	}
 }
@@ -133,7 +140,8 @@ func ReadFileChunk(
 	reader io.Reader,
 	fileChunkSize int,
 	splitter LastSeqRecord,
-	probe string) ChannelFileChunk {
+	probe string,
+	pack bool) ChannelFileChunk {

 	chunk_channel := make(ChannelFileChunk)

@@ -205,7 +213,7 @@ func ReadFileChunk(

 				if len(pieces.data) > 0 {
 					// obilog.Warnf("chuck %d :Read %d bytes from file %s", i, io.Len(), source)
-					chunk_channel <- pieces.FileChunk(source, i)
+					chunk_channel <- pieces.FileChunk(source, i, pack)
 					i++
 				}

@@ -222,7 +230,7 @@ func ReadFileChunk(

 		// Send the last chunk to the channel
 		if pieces.Len() > 0 {
-			chunk_channel <- pieces.FileChunk(source, i)
+			chunk_channel <- pieces.FileChunk(source, i, pack)
 		}

 		// Close the readers channel when the end of the file is reached
@@ -29,6 +29,342 @@ const (

 var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")

+// gbRopeScanner reads lines from a PieceOfChunk rope without heap allocation.
+// The carry buffer (stack) handles lines that span two rope nodes.
+type gbRopeScanner struct {
+	current *PieceOfChunk
+	pos     int
+	carry   [256]byte // max GenBank line = 80 chars; 256 gives ample margin
+	carryN  int
+}
+
+func newGbRopeScanner(rope *PieceOfChunk) *gbRopeScanner {
+	return &gbRopeScanner{current: rope}
+}
+
+// ReadLine returns the next line without the trailing \n (or \r\n).
+// Returns nil at end of rope. The returned slice aliases carry[] or the node
+// data and is valid only until the next ReadLine call.
+func (s *gbRopeScanner) ReadLine() []byte {
+	for {
+		if s.current == nil {
+			if s.carryN > 0 {
+				n := s.carryN
+				s.carryN = 0
+				return s.carry[:n]
+			}
+			return nil
+		}
+
+		data := s.current.data[s.pos:]
+		idx := bytes.IndexByte(data, '\n')
+
+		if idx >= 0 {
+			var line []byte
+			if s.carryN == 0 {
+				line = data[:idx]
+			} else {
+				n := copy(s.carry[s.carryN:], data[:idx])
+				s.carryN += n
+				line = s.carry[:s.carryN]
+				s.carryN = 0
+			}
+			s.pos += idx + 1
+			if s.pos >= len(s.current.data) {
+				s.current = s.current.Next()
+				s.pos = 0
+			}
+			if len(line) > 0 && line[len(line)-1] == '\r' {
+				line = line[:len(line)-1]
+			}
+			return line
+		}
+
+		// No \n in this node: accumulate into carry and advance
+		n := copy(s.carry[s.carryN:], data)
+		s.carryN += n
+		s.current = s.current.Next()
+		s.pos = 0
+	}
+}
+
+// extractSequence scans the ORIGIN section byte-by-byte directly on the rope,
+// appending compacted bases to dest. Returns the extended slice.
+// Stops and returns when "//" is found at the start of a line.
+// The scanner is left positioned after the "//" line.
+func (s *gbRopeScanner) extractSequence(dest []byte, UtoT bool) []byte {
+	lineStart := true
+	skipDigits := true
+
+	for s.current != nil {
+		data := s.current.data[s.pos:]
+		for i, b := range data {
+			if lineStart {
+				if b == '/' {
+					// End-of-record marker "//"
+					s.pos += i + 1
+					if s.pos >= len(s.current.data) {
+						s.current = s.current.Next()
+						s.pos = 0
+					}
+					s.skipToNewline()
+					return dest
+				}
+				lineStart = false
+				skipDigits = true
+			}
+			switch {
+			case b == '\n':
+				lineStart = true
+			case b == '\r':
+				// skip
+			case skipDigits:
+				if b != ' ' && (b < '0' || b > '9') {
+					skipDigits = false
+					if UtoT && b == 'u' {
+						b = 't'
+					}
+					dest = append(dest, b)
+				}
+			case b != ' ':
+				if UtoT && b == 'u' {
+					b = 't'
+				}
+				dest = append(dest, b)
+			}
+		}
+		s.current = s.current.Next()
+		s.pos = 0
+	}
+	return dest
+}
+
+// skipToNewline advances the scanner past the next '\n'.
+func (s *gbRopeScanner) skipToNewline() {
+	for s.current != nil {
+		data := s.current.data[s.pos:]
+		idx := bytes.IndexByte(data, '\n')
+		if idx >= 0 {
+			s.pos += idx + 1
+			if s.pos >= len(s.current.data) {
+				s.current = s.current.Next()
+				s.pos = 0
+			}
+			return
+		}
+		s.current = s.current.Next()
+		s.pos = 0
+	}
+}
+
+// parseLseqFromLocus extracts the declared sequence length from a LOCUS line.
+// Format: "LOCUS       <id> <length> bp ..."
+// Returns -1 if not found or parse error.
+func parseLseqFromLocus(line []byte) int {
+	if len(line) < 13 {
+		return -1
+	}
+	i := 12
+	for i < len(line) && line[i] != ' ' {
+		i++
+	}
+	for i < len(line) && line[i] == ' ' {
+		i++
+	}
+	start := i
+	for i < len(line) && line[i] >= '0' && line[i] <= '9' {
+		i++
+	}
+	if i == start {
+		return -1
+	}
+	n, err := strconv.Atoi(string(line[start:i]))
+	if err != nil {
+		return -1
+	}
+	return n
+}
+
+// Prefix constants for GenBank section headers (byte slices for zero-alloc comparison).
+var (
+	gbPfxLocus      = []byte("LOCUS       ")
+	gbPfxDefinition = []byte("DEFINITION  ")
+	gbPfxContinue   = []byte("            ")
+	gbPfxSource     = []byte("SOURCE      ")
+	gbPfxFeatures   = []byte("FEATURES    ")
+	gbPfxOrigin     = []byte("ORIGIN")
+	gbPfxContig     = []byte("CONTIG")
+	gbPfxEnd        = []byte("//")
+	gbPfxDbXref     = []byte(`                     /db_xref="taxon:`)
+)
+
+// GenbankChunkParserRope parses a GenBank FileChunk directly from the rope
+// (PieceOfChunk linked list) without calling Pack(). This eliminates the large
+// contiguous allocation required for chromosomal-scale sequences.
+func GenbankChunkParserRope(source string, rope *PieceOfChunk,
+	withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
+
+	state := inHeader
+	scanner := newGbRopeScanner(rope)
+	sequences := obiseq.MakeBioSequenceSlice(100)[:0]
+
+	id := ""
+	lseq := -1
+	scientificName := ""
+	defBytes := new(bytes.Buffer)
+	featBytes := new(bytes.Buffer)
+	var seqDest []byte
+	taxid := 1
+	nl := 0
+
+	for bline := scanner.ReadLine(); bline != nil; bline = scanner.ReadLine() {
+		nl++
+		processed := false
+		for !processed {
+			switch {
+
+			case bytes.HasPrefix(bline, gbPfxLocus):
+				if state != inHeader {
+					log.Fatalf("Line %d - Unexpected state %d while reading LOCUS: %s", nl, state, bline)
+				}
+				rest := bline[12:]
+				sp := bytes.IndexByte(rest, ' ')
+				if sp < 0 {
+					id = string(rest)
+				} else {
+					id = string(rest[:sp])
+				}
+				lseq = parseLseqFromLocus(bline)
+				cap0 := lseq + 20
+				if cap0 < 1024 {
+					cap0 = 1024
+				}
+				seqDest = make([]byte, 0, cap0)
+				state = inEntry
+				processed = true
+
+			case bytes.HasPrefix(bline, gbPfxDefinition):
+				if state != inEntry {
+					log.Fatalf("Line %d - Unexpected state %d while reading DEFINITION: %s", nl, state, bline)
+				}
+				defBytes.Write(bytes.TrimSpace(bline[12:]))
+				state = inDefinition
+				processed = true
+
+			case state == inDefinition:
+				if bytes.HasPrefix(bline, gbPfxContinue) {
+					defBytes.WriteByte(' ')
+					defBytes.Write(bytes.TrimSpace(bline[12:]))
+					processed = true
+				} else {
+					state = inEntry
+				}
+
+			case bytes.HasPrefix(bline, gbPfxSource):
+				if state != inEntry {
+					log.Fatalf("Line %d - Unexpected state %d while reading SOURCE: %s", nl, state, bline)
+				}
+				scientificName = string(bytes.TrimSpace(bline[12:]))
+				processed = true
+
+			case bytes.HasPrefix(bline, gbPfxFeatures):
+				if state != inEntry {
+					log.Fatalf("Line %d - Unexpected state %d while reading FEATURES: %s", nl, state, bline)
+				}
+				if withFeatureTable {
+					featBytes.Write(bline)
+				}
+				state = inFeature
+				processed = true
+
+			case bytes.HasPrefix(bline, gbPfxOrigin):
+				if state != inFeature && state != inContig {
+					log.Fatalf("Line %d - Unexpected state %d while reading ORIGIN: %s", nl, state, bline)
+				}
+				// Use fast byte-scan to extract sequence and consume through "//"
+				seqDest = scanner.extractSequence(seqDest, UtoT)
+				// Emit record
+				if id == "" {
+					log.Warn("Empty id when parsing genbank file")
+				}
+				sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
+				sequence.SetSource(source)
+				if withFeatureTable {
+					sequence.SetFeatures(featBytes.Bytes())
+				}
+				annot := sequence.Annotations()
+				annot["scientific_name"] = scientificName
+				annot["taxid"] = taxid
+				sequences = append(sequences, sequence)
+
+				defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
+				featBytes = new(bytes.Buffer)
+				nl = 0
+				taxid = 1
+				seqDest = nil
+				state = inHeader
+				processed = true
+
+			case bytes.HasPrefix(bline, gbPfxContig):
+				if state != inFeature && state != inContig {
+					log.Fatalf("Line %d - Unexpected state %d while reading CONTIG: %s", nl, state, bline)
+				}
+				state = inContig
+				processed = true
+
+			case bytes.Equal(bline, gbPfxEnd):
+				// Reached for CONTIG records (no ORIGIN section)
+				if state != inContig {
+					log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
+				}
+				if id == "" {
+					log.Warn("Empty id when parsing genbank file")
+				}
+				sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
+				sequence.SetSource(source)
+				if withFeatureTable {
+					sequence.SetFeatures(featBytes.Bytes())
+				}
+				annot := sequence.Annotations()
+				annot["scientific_name"] = scientificName
+				annot["taxid"] = taxid
+				sequences = append(sequences, sequence)
+
+				defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
+				featBytes = new(bytes.Buffer)
+				nl = 0
+				taxid = 1
+				seqDest = nil
+				state = inHeader
+				processed = true
+
+			default:
+				switch state {
+				case inFeature:
+					if withFeatureTable {
+						featBytes.WriteByte('\n')
+						featBytes.Write(bline)
+					}
+					if bytes.HasPrefix(bline, gbPfxDbXref) {
+						rest := bline[len(gbPfxDbXref):]
+						q := bytes.IndexByte(rest, '"')
+						if q >= 0 {
+							taxid, _ = strconv.Atoi(string(rest[:q]))
+						}
+					}
+					processed = true
+				case inHeader, inEntry, inContig:
+					processed = true
+				default:
+					log.Fatalf("Unexpected state %d while reading: %s", state, bline)
+				}
+			}
+		}
+	}
+
+	return sequences, nil
+}
+
 func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
 	return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
 		state := inHeader
@@ -125,13 +461,10 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
 					if state != inSequence && state != inContig {
 						log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
 					}
-					// log.Debugln("Total lines := ", nl)
 					if id == "" {
 						log.Warn("Empty id when parsing genbank file")
 					}

-					// log.Debugf("End of sequence %s: %dbp ", id, seqBytes.Len())
-
 					sequence := obiseq.NewBioSequence(id,
 						seqBytes.Bytes(),
 						defBytes.String())
@@ -144,9 +477,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
 					annot := sequence.Annotations()
 					annot["scientific_name"] = scientificName
 					annot["taxid"] = taxid
-					// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))
-					// log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(),
-					//	sequence.Len(), seqBytes.Len())

 					sequences = append(sequences, sequence)

@@ -159,8 +489,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
 					processed = true

 				case state == inSequence:
-					// log.Debugf("Chunk %d : Genbank: line %d, state = %d : %s", chunks.order, nl, state, line)
-
 					sl++
 					cleanline := strings.TrimSpace(line)
 					parts := strings.SplitN(cleanline, " ", 7)
@@ -198,6 +526,7 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob

 		}

+		_ = sl
 		return sequences, nil
 	}
 }
@@ -206,10 +535,16 @@ func _ParseGenbankFile(input ChannelFileChunk,
 	out obiiter.IBioSequence,
 	withFeatureTable, UtoT bool) {

-	parser := GenbankChunkParser(withFeatureTable, UtoT)
-
 	for chunks := range input {
-		sequences, err := parser(chunks.Source, chunks.Raw)
+		var sequences obiseq.BioSequenceSlice
+		var err error
+
+		if chunks.Rope != nil {
+			sequences, err = GenbankChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
+		} else {
+			parser := GenbankChunkParser(withFeatureTable, UtoT)
+			sequences, err = parser(chunks.Source, chunks.Raw)
+		}

 		if err != nil {
 			log.Fatalf("File %s : Cannot parse the genbank file : %v", chunks.Source, err)
@@ -225,7 +560,6 @@ func _ParseGenbankFile(input ChannelFileChunk,

 func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
 	opt := MakeOptions(options)
-	// entry_channel := make(chan _FileChunk)

 	entry_channel := ReadFileChunk(
 		opt.Source(),
@@ -233,13 +567,13 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
 		1024*1024*128,
 		EndOfLastFlatFileEntry,
 		"\nLOCUS       ",
+		false, // do not pack: rope-based parser avoids contiguous allocation
 	)

 	newIter := obiiter.MakeIBioSequence()

 	nworkers := opt.ParallelWorkers()

-	// for j := 0; j < opt.ParallelWorkers(); j++ {
 	for j := 0; j < nworkers; j++ {
 		newIter.Add(1)
 		go _ParseGenbankFile(
@@ -250,8 +584,6 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
 		)
 	}

-	// go _ReadFlatFileChunk(reader, entry_channel)
-
 	go func() {
 		newIter.WaitAndClose()
 		log.Debug("End of the genbank file ", opt.Source())
@@ -3,7 +3,7 @@ package obioptions
 // Version is automatically updated by the Makefile from version.txt
 // The patch number (third digit) is incremented on each push to the repository

-var _Version = "Release 4.4.18"
+var _Version = "Release 4.4.19"

 // Version returns the version of the obitools package.
 //
@@ -120,6 +120,19 @@ func NewBioSequence(id string,
 	return bs
 }

+// NewBioSequenceOwning creates a BioSequence taking ownership of the sequence
+// slice without copying it. The caller must not use the slice after this call.
+// Use this when the slice was allocated specifically for this sequence.
+func NewBioSequenceOwning(id string,
+	sequence []byte,
+	definition string) *BioSequence {
+	bs := NewEmptyBioSequence(0)
+	bs.SetId(id)
+	bs.TakeSequence(sequence)
+	bs.SetDefinition(definition)
+	return bs
+}
+
 // NewBioSequenceWithQualities creates a new BioSequence object with the given id, sequence, definition, and qualities.
 //
 // Parameters:
@@ -444,6 +457,12 @@ func (s *BioSequence) SetSequence(sequence []byte) {
 	s.sequence = obiutils.InPlaceToLower(CopySlice(sequence))
 }

+// TakeSequence stores the slice directly without copying, then lowercases in-place.
+// The caller must not use the slice after this call.
+func (s *BioSequence) TakeSequence(sequence []byte) {
+	s.sequence = obiutils.InPlaceToLower(sequence)
+}
+
 func (s *BioSequence) HasValidSequence() bool {
 	for _, c := range s.sequence {
 		if !((c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '[' || c == ']') {
@@ -21,6 +21,74 @@ LLM_MODEL="ollama:qwen3-coder-next:latest"

 die() { echo "Error: $*" >&2; exit 1; }

+next_patch() {
+  local v="$1"
+  local major minor patch
+  major=$(echo "$v" | cut -d. -f1)
+  minor=$(echo "$v" | cut -d. -f2)
+  patch=$(echo "$v" | cut -d. -f3)
+  echo "${major}.${minor}.$(( patch + 1 ))"
+}
+
+# Strip "pre-" prefix to get the bare version number for installation section
+bare_version() {
+  echo "$1" | sed 's/^pre-//'
+}
+
+installation_section() {
+  local v
+  v=$(bare_version "$1")
+  cat <<INSTALL_EOF
+
+## Installation
+
+### Pre-built binaries
+
+Download the appropriate archive for your system from the
+[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_${v})
+and extract it:
+
+#### Linux (AMD64)
+\`\`\`bash
+tar -xzf obitools4_${v}_linux_amd64.tar.gz
+\`\`\`
+
+#### Linux (ARM64)
+\`\`\`bash
+tar -xzf obitools4_${v}_linux_arm64.tar.gz
+\`\`\`
+
+#### macOS (Intel)
+\`\`\`bash
+tar -xzf obitools4_${v}_darwin_amd64.tar.gz
+\`\`\`
+
+#### macOS (Apple Silicon)
+\`\`\`bash
+tar -xzf obitools4_${v}_darwin_arm64.tar.gz
+\`\`\`
+
+All OBITools4 binaries are included in each archive.
+
+### From source
+
+You can also compile and install OBITools4 directly from source using the
+installation script:
+
+\`\`\`bash
+curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version ${v}
+\`\`\`
+
+By default binaries are installed in \`/usr/local/bin\`. Use \`--install-dir\` to
+change the destination and \`--obitools-prefix\` to add a prefix to command names:
+
+\`\`\`bash
+curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\
+  bash -s -- --version ${v} --install-dir ~/local --obitools-prefix k
+\`\`\`
+INSTALL_EOF
+}
+
 display_help() {
  cat <<EOF
 Usage: $(basename "$0") [OPTIONS]
@@ -81,10 +149,36 @@ all_versions=$(fetch_versions)
 [ -z "$all_versions" ] && die "Could not fetch versions from GitHub"

 if [ -z "$VERSION" ]; then
-  VERSION=$(echo "$all_versions" | head -1)
-  echo "Using latest version: $VERSION" >&2
+   # ── Pre-release mode: local HEAD vs latest GitHub tag ──────────────────
+   PRE_RELEASE=true
+   previous_tag="Release_${latest_version}"
+   VERSION="pre-$(next_patch "$latest_version")"
+
+   echo "Pre-release mode: $previous_tag -> HEAD (as $VERSION)" >&2
+
+   # Need to be in a git repo
+   if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
+     die "Not inside a git repository. Pre-release mode requires a local git repo."
   fi

+   # Check that the previous tag exists locally
+   if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then
+     echo "Tag $previous_tag not found locally, fetching..." >&2
+     git fetch --tags 2>/dev/null || true
+     if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then
+       die "Tag $previous_tag not found locally or remotely"
+     fi
+   fi
+
+  # Get local commits from the tag to HEAD (full messages)
+  commit_list=$(git log --format="%h %B" "${previous_tag}..HEAD" 2>/dev/null)
+
+   if [ -z "$commit_list" ]; then
+     die "No local commits found since $previous_tag"
+   fi
+ else
+   # ── Published release mode: between two GitHub tags ────────────────────
+   PRE_RELEASE=false
   tag_name="Release_${VERSION}"

   # Verify the requested version exists
@@ -92,7 +186,7 @@ if ! echo "$all_versions" | grep -qx "$VERSION"; then
     die "Version $VERSION not found. Use -l to list available versions."
   fi

-# Find the previous version (the one right after in the sorted-descending list)
+   # Find the previous version
   previous_version=$(echo "$all_versions" | grep -A1 -x "$VERSION" | tail -1)

   if [ "$previous_version" = "$VERSION" ] || [ -z "$previous_version" ]; then
@@ -103,28 +197,27 @@ else
     echo "Generating notes: $previous_tag -> $tag_name" >&2
   fi

-# ── Fetch commit messages between tags via GitHub compare API ────────────
-
+   # Fetch commit messages between tags via GitHub compare API
   if [ -n "$previous_tag" ]; then
     commits_json=$(curl -sf "${GITHUB_API}/compare/${previous_tag}...${tag_name}")
     if [ -z "$commits_json" ]; then
       die "Could not fetch commit comparison from GitHub"
     fi
     commit_list=$(echo "$commits_json" \
-    | jq -r '.commits[] | (.sha[:8] + " " + (.commit.message | split("\n")[0]))' 2>/dev/null)
+      | jq -r '.commits[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null)
   else
-  # First release: get commits up to this tag
     commits_json=$(curl -sf "${GITHUB_API}/commits?sha=${tag_name}&per_page=50")
     if [ -z "$commits_json" ]; then
       die "Could not fetch commits from GitHub"
     fi
     commit_list=$(echo "$commits_json" \
-    | jq -r '.[] | (.sha[:8] + " " + (.commit.message | split("\n")[0]))' 2>/dev/null)
+      | jq -r '.[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null)
   fi

   if [ -z "$commit_list" ]; then
     die "No commits found between $previous_tag and $tag_name"
   fi
+fi

 # ── LLM prompt (shared by context mode and summarization) ────────────────

@@ -144,6 +237,7 @@ if [ "$RAW_MODE" = true ]; then
  echo "$commit_list" | while IFS= read -r line; do
    echo "- ${line}"
  done
+  installation_section "$VERSION"
  exit 0
 fi

@@ -193,6 +287,7 @@ if [ -n "$release_title" ] && [ -n "$release_body" ]; then
  echo "# ${release_title}"
  echo ""
  echo "$release_body"
+  installation_section "$VERSION"
 else
  echo "Warning: JSON parsing failed, falling back to raw mode" >&2
  exec "$0" -r -v "$VERSION"
@@ -1 +1 @@
-4.4.18
+4.4.19
@@ -1 +1 @@
 .4.18
 .4.19