From 61e346658e9211f5d2d09dd46fef382004803ccd Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 20 Feb 2026 11:52:34 +0100 Subject: [PATCH 1/6] Refactor jjpush workflow and enhance release notes generation Split the jjpush target into multiple sub-targets (jjpush-describe, jjpush-bump, jjpush-push, jjpush-tag) for better modularity and control. Enhance release notes generation by: - Using git log with full commit messages instead of GitHub API for pre-release mode - Adding robust JSON parsing with fallbacks for release notes - Including detailed installation instructions in release notes - Supporting both pre-release and published release modes Update release_notes.sh to handle pre-release mode, improve commit message fetching, and add installation section to release notes. Add .PHONY declarations for new sub-targets. --- Makefile | 70 ++++++++++++-------- release_notes.sh | 167 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 175 insertions(+), 62 deletions(-) diff --git a/Makefile b/Makefile index 36780d9..afc3772 100644 --- a/Makefile +++ b/Makefile @@ -128,40 +128,58 @@ jjnew: @echo "$(GREEN)✓ New commit created$(NC)" jjpush: - @echo "$(YELLOW)→ Pushing commit to repository...$(NC)" + @$(MAKE) jjpush-describe + @$(MAKE) jjpush-bump + @$(MAKE) jjpush-push + @$(MAKE) jjpush-tag + @echo "$(GREEN)✓ Release complete$(NC)" + +jjpush-describe: @echo "$(BLUE)→ Documenting current commit...$(NC)" @jj auto-describe + +jjpush-bump: @echo "$(BLUE)→ Creating new commit for version bump...$(NC)" @jj new - @previous_version=$$(cat version.txt); \ - $(MAKE) bump-version; \ - version=$$(cat version.txt); \ + @$(MAKE) bump-version + @echo "$(BLUE)→ Documenting version bump commit...$(NC)" + @jj auto-describe + +jjpush-push: + @echo "$(BLUE)→ Pushing commits...$(NC)" + @jj git push --change @ + +jjpush-tag: + @version=$$(cat version.txt); \ tag_name="Release_$$version"; \ - previous_tag="Release_$$previous_version"; \ - echo "$(BLUE)→ Documenting version bump commit...$(NC)"; \ - jj auto-describe; \ - echo "$(BLUE)→ Generating release notes from $$previous_tag to current commit...$(NC)"; \ + echo "$(BLUE)→ Generating release notes for $$tag_name...$(NC)"; \ + release_message="Release $$version"; \ if command -v orla >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then \ - release_json=$$(ORLA_MAX_TOOL_CALLS=50 jj log -r "$$previous_tag::@" -T 'commit_id.short() ++ " " ++ description' | \ - orla agent -m ollama:qwen3-coder-next:latest \ - "Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"\", \"body\": \"\"}"); \ - release_json=$$(echo "$$release_json" | sed -n '/^{/,/^}/p'); \ - release_title=$$(echo "$$release_json" | jq -r '.title // empty') ; \ - release_body=$$(echo "$$release_json" | jq -r '.body // empty') ; \ - if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \ - release_message="$$release_title"$$'\n\n'"$$release_body"; \ + previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' HEAD^ 2>/dev/null); \ + if [ -z "$$previous_tag" ]; then \ + echo "$(YELLOW)⚠ No previous Release tag found, skipping release notes$(NC)"; \ else \ - echo "$(YELLOW)⚠ JSON parsing failed, falling back to raw output$(NC)"; \ - release_message="Release $$version"$$'\n\n'"$$release_json"; \ + raw_output=$$(git log --format="%h %B" "$$previous_tag..HEAD" | \ + ORLA_MAX_TOOL_CALLS=50 orla agent -m ollama:qwen3-coder-next:latest \ + "Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"\", \"body\": \"\"}" 2>/dev/null) || true; \ + if [ -n "$$raw_output" ]; then \ + sanitized=$$(echo "$$raw_output" | sed -n '/^{/,/^}/p' | tr -d '\000-\011\013-\014\016-\037'); \ + release_title=$$(echo "$$sanitized" | jq -r '.title // empty' 2>/dev/null) ; \ + release_body=$$(echo "$$sanitized" | jq -r '.body // empty' 2>/dev/null) ; \ + if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \ + release_message="$$release_title"$$'\n\n'"$$release_body"; \ + else \ + echo "$(YELLOW)⚠ JSON parsing failed, using default release message$(NC)"; \ + fi; \ + fi; \ fi; \ - else \ - release_message="Release $$version"; \ fi; \ - echo "$(BLUE)→ Pushing commits and creating tag $$tag_name...$(NC)"; \ - jj git push --change @; \ - git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "Tag $$tag_name already exists"; \ - git push origin "$$tag_name" 2>/dev/null || echo "Tag already pushed" - @echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)" + install_section=$$'\n## Installation\n\n### Pre-built binaries\n\nDownload the appropriate archive for your system from the\n[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_'"$$version"')\nand extract it:\n\n#### Linux (AMD64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_amd64.tar.gz\n```\n\n#### Linux (ARM64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_arm64.tar.gz\n```\n\n#### macOS (Intel)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_amd64.tar.gz\n```\n\n#### macOS (Apple Silicon)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_arm64.tar.gz\n```\n\nAll OBITools4 binaries are included in each archive.\n\n### From source\n\nYou can also compile and install OBITools4 directly from source using the\ninstallation script:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version '"$$version"'\n```\n\nBy default binaries are installed in `/usr/local/bin`. Use `--install-dir` to\nchange the destination and `--obitools-prefix` to add a prefix to command names:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\\n bash -s -- --version '"$$version"' --install-dir ~/local --obitools-prefix k\n```\n'; \ + release_message="$$release_message$$install_section"; \ + echo "$(BLUE)→ Creating tag $$tag_name...$(NC)"; \ + git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "$(YELLOW)⚠ Tag $$tag_name already exists$(NC)"; \ + echo "$(BLUE)→ Pushing tag $$tag_name...$(NC)"; \ + git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)" jjfetch: @echo "$(YELLOW)→ Pulling latest commits...$(NC)" @@ -169,5 +187,5 @@ jjfetch: @jj new master@origin @echo "$(GREEN)✓ Latest commits pulled$(NC)" -.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch bump-version .FORCE +.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjpush-describe jjpush-bump jjpush-push jjpush-tag jjfetch bump-version .FORCE .FORCE: diff --git a/release_notes.sh b/release_notes.sh index 32de401..10c05e0 100755 --- a/release_notes.sh +++ b/release_notes.sh @@ -21,6 +21,74 @@ LLM_MODEL="ollama:qwen3-coder-next:latest" die() { echo "Error: $*" >&2; exit 1; } +next_patch() { + local v="$1" + local major minor patch + major=$(echo "$v" | cut -d. -f1) + minor=$(echo "$v" | cut -d. -f2) + patch=$(echo "$v" | cut -d. -f3) + echo "${major}.${minor}.$(( patch + 1 ))" +} + +# Strip "pre-" prefix to get the bare version number for installation section +bare_version() { + echo "$1" | sed 's/^pre-//' +} + +installation_section() { + local v + v=$(bare_version "$1") + cat <&2 -fi + # ── Pre-release mode: local HEAD vs latest GitHub tag ────────────────── + PRE_RELEASE=true + previous_tag="Release_${latest_version}" + VERSION="pre-$(next_patch "$latest_version")" -tag_name="Release_${VERSION}" + echo "Pre-release mode: $previous_tag -> HEAD (as $VERSION)" >&2 -# Verify the requested version exists -if ! echo "$all_versions" | grep -qx "$VERSION"; then - die "Version $VERSION not found. Use -l to list available versions." -fi + # Need to be in a git repo + if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then + die "Not inside a git repository. Pre-release mode requires a local git repo." + fi -# Find the previous version (the one right after in the sorted-descending list) -previous_version=$(echo "$all_versions" | grep -A1 -x "$VERSION" | tail -1) + # Check that the previous tag exists locally + if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then + echo "Tag $previous_tag not found locally, fetching..." >&2 + git fetch --tags 2>/dev/null || true + if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then + die "Tag $previous_tag not found locally or remotely" + fi + fi -if [ "$previous_version" = "$VERSION" ] || [ -z "$previous_version" ]; then - previous_tag="" - echo "No previous version found -- will include all commits for $tag_name" >&2 -else - previous_tag="Release_${previous_version}" - echo "Generating notes: $previous_tag -> $tag_name" >&2 -fi + # Get local commits from the tag to HEAD (full messages) + commit_list=$(git log --format="%h %B" "${previous_tag}..HEAD" 2>/dev/null) -# ── Fetch commit messages between tags via GitHub compare API ──────────── + if [ -z "$commit_list" ]; then + die "No local commits found since $previous_tag" + fi + else + # ── Published release mode: between two GitHub tags ──────────────────── + PRE_RELEASE=false + tag_name="Release_${VERSION}" -if [ -n "$previous_tag" ]; then - commits_json=$(curl -sf "${GITHUB_API}/compare/${previous_tag}...${tag_name}") - if [ -z "$commits_json" ]; then - die "Could not fetch commit comparison from GitHub" - fi - commit_list=$(echo "$commits_json" \ - | jq -r '.commits[] | (.sha[:8] + " " + (.commit.message | split("\n")[0]))' 2>/dev/null) -else - # First release: get commits up to this tag - commits_json=$(curl -sf "${GITHUB_API}/commits?sha=${tag_name}&per_page=50") - if [ -z "$commits_json" ]; then - die "Could not fetch commits from GitHub" - fi - commit_list=$(echo "$commits_json" \ - | jq -r '.[] | (.sha[:8] + " " + (.commit.message | split("\n")[0]))' 2>/dev/null) -fi + # Verify the requested version exists + if ! echo "$all_versions" | grep -qx "$VERSION"; then + die "Version $VERSION not found. Use -l to list available versions." + fi -if [ -z "$commit_list" ]; then - die "No commits found between $previous_tag and $tag_name" + # Find the previous version + previous_version=$(echo "$all_versions" | grep -A1 -x "$VERSION" | tail -1) + + if [ "$previous_version" = "$VERSION" ] || [ -z "$previous_version" ]; then + previous_tag="" + echo "No previous version found -- will include all commits for $tag_name" >&2 + else + previous_tag="Release_${previous_version}" + echo "Generating notes: $previous_tag -> $tag_name" >&2 + fi + + # Fetch commit messages between tags via GitHub compare API + if [ -n "$previous_tag" ]; then + commits_json=$(curl -sf "${GITHUB_API}/compare/${previous_tag}...${tag_name}") + if [ -z "$commits_json" ]; then + die "Could not fetch commit comparison from GitHub" + fi + commit_list=$(echo "$commits_json" \ + | jq -r '.commits[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null) + else + commits_json=$(curl -sf "${GITHUB_API}/commits?sha=${tag_name}&per_page=50") + if [ -z "$commits_json" ]; then + die "Could not fetch commits from GitHub" + fi + commit_list=$(echo "$commits_json" \ + | jq -r '.[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null) + fi + + if [ -z "$commit_list" ]; then + die "No commits found between $previous_tag and $tag_name" + fi fi # ── LLM prompt (shared by context mode and summarization) ──────────────── @@ -144,6 +237,7 @@ if [ "$RAW_MODE" = true ]; then echo "$commit_list" | while IFS= read -r line; do echo "- ${line}" done + installation_section "$VERSION" exit 0 fi @@ -193,6 +287,7 @@ if [ -n "$release_title" ] && [ -n "$release_body" ]; then echo "# ${release_title}" echo "" echo "$release_body" + installation_section "$VERSION" else echo "Warning: JSON parsing failed, falling back to raw mode" >&2 exec "$0" -r -v "$VERSION" From a7ea47624b6a86bcbc2ef79017028c5c1dcd2ae8 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 10 Mar 2026 14:20:10 +0100 Subject: [PATCH 2/6] =?UTF-8?q?Optimisation=20du=20parsing=20des=20grandes?= =?UTF-8?q?=20s=C3=A9quences?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implémente une optimisation du parsing des grandes séquences en évitant l'allocation de mémoire inutile lors de la fusion des chunks. Ajoute un support pour le parsing direct de la structure rope, ce qui permet de réduire les allocations et d'améliorer les performances lors du traitement de fichiers GenBank/EMBL et FASTA/FASTQ de plusieurs Gbp. Les parseurs sont mis à jour pour utiliser la rope non-packée et le nouveau mécanisme d'écriture in-place pour les séquences GenBank. --- .../Prospective/large_sequence_parsing.md | 182 ++++++++++++++++++ pkg/obiformats/embl_read.go | 1 + pkg/obiformats/fastaseq_read.go | 1 + pkg/obiformats/fastqseq_read.go | 1 + pkg/obiformats/file_chunk_read.go | 20 +- pkg/obiformats/genbank_read.go | 1 + 6 files changed, 200 insertions(+), 6 deletions(-) create mode 100644 blackboard/Prospective/large_sequence_parsing.md diff --git a/blackboard/Prospective/large_sequence_parsing.md b/blackboard/Prospective/large_sequence_parsing.md new file mode 100644 index 0000000..5e9c093 --- /dev/null +++ b/blackboard/Prospective/large_sequence_parsing.md @@ -0,0 +1,182 @@ +# Optimisation du parsing des grandes séquences + +## Contexte + +OBITools4 doit pouvoir traiter des séquences de taille chromosomique (plusieurs Gbp), notamment +issues de fichiers GenBank/EMBL (assemblages de génomes) ou de fichiers FASTA convertis depuis +ces formats. + +## Architecture actuelle + +### Pipeline de lecture (`pkg/obiformats/`) + +``` +ReadFileChunk (goroutine) + → ChannelFileChunk + → N × _ParseGenbankFile / _ParseFastaFile (goroutines) + → IBioSequence +``` + +`ReadFileChunk` (`file_chunk_read.go`) lit le fichier par morceaux via une chaîne de +`PieceOfChunk` (rope). Chaque nœud fait `fileChunkSize` bytes : + +- GenBank/EMBL : 128 MB (`1024*1024*128`) +- FASTA/FASTQ : 1 MB (`1024*1024`) + +La chaîne est accumulée jusqu'à trouver la fin du dernier enregistrement complet (splitter), +puis `Pack()` est appelé pour fusionner tous les nœuds en un seul buffer contigu. Ce buffer +est transmis au parseur via `FileChunk.Raw *bytes.Buffer`. + +### Parseur GenBank (`genbank_read.go`) + +`GenbankChunkParser` reçoit un `io.Reader` sur le buffer packé, lit ligne par ligne via +`bufio.NewReader` (buffer 4096 bytes), et pour chaque ligne de la section `ORIGIN` : + +```go +line = string(bline) // allocation par ligne +cleanline := strings.TrimSpace(line) // allocation +parts := strings.SplitN(cleanline, " ", 7) // allocation []string + substrings +for i := 1; i < lparts; i++ { + seqBytes.WriteString(parts[i]) +} +``` + +Point positif : `seqBytes` est pré-alloué grâce à `lseq` extrait de la ligne `LOCUS`. + +### Parseur FASTA (`fastaseq_read.go`) + +`FastaChunkParser` lit **octet par octet** via `scanner.ReadByte()`. Pour 3 Gbp : +3 milliards d'appels. `seqBytes` est un `bytes.Buffer{}` sans pré-allocation. + +## Problème principal + +Pour une séquence de plusieurs Gbp, `Pack()` fusionne une chaîne de ~N nœuds de 128 MB en +un seul buffer contigu. C'est une allocation de N × 128 MB suivie d'une copie de toutes les +données. Bien que l'implémentation de `Pack()` soit efficace (libère les nœuds au fur et à +mesure via `slices.Grow`), la copie est inévitable avec l'architecture actuelle. + +De plus, le parseur GenBank produit des dizaines de millions d'allocations temporaires pour +parser la section `ORIGIN` (une par ligne). + +## Invariant clé découvert + +**Si la rope a plus d'un nœud, le premier nœud seul ne se termine pas sur une frontière +d'enregistrement** (pas de `//\n` en fin de `piece1`). + +Preuve par construction dans `ReadFileChunk` : +- `splitter` est appelé dès le premier nœud (ligne 157) +- Si `end >= 0` → frontière trouvée dans 128 MB → boucle interne sautée → rope à 1 nœud +- Si `end < 0` → boucle interne ajoute des nœuds → rope à ≥ 2 nœuds + +Corollaire : si rope à 1 nœud, `Pack()` ne fait rien (aucun nœud suivant). + +**Attention** : rope à ≥ 2 nœuds ne signifie pas qu'il n'y a qu'une seule séquence dans +la rope. La rope packée peut contenir plusieurs enregistrements complets. Exemple : records +de 80 MB → `nextpieces` (48 MB de reste) + nouveau nœud (128 MB) = rope à 2 nœuds +contenant 2 records complets + début d'un troisième. + +L'invariant dit seulement que `piece1` seul est incomplet — pas que la rope entière +ne contient qu'un seul record. + +**Invariant : le dernier FileChunk envoyé finit sur une frontière d'enregistrement.** + +Deux chemins dans `ReadFileChunk` : + +1. **Chemin normal** (`end >= 0` via `splitter`) : le buffer est explicitement tronqué à + `end` (ligne 200 : `pieces.data = pieces.data[:end]`). Frontière garantie par construction + pour tous les formats. ✓ + +2. **Chemin EOF** (`end < 0`, `end = pieces.Len()`) : tout le reste du fichier est envoyé. + - **GenBank/EMBL** : présuppose fichier bien formé (se termine par `//\n`). Le parseur + lève un `log.Fatalf` sur tout état inattendu — filet de sécurité suffisant. ✓ + - **FASTQ** : présupposé, vérifié par le parseur. ✓ + - **FASTA** : garanti par le format lui-même (fin d'enregistrement = EOF ou `>`). ✓ + +**Hypothèse de travail adoptée** : les fichiers d'entrée sont bien formés. Dans le pire cas, +le parseur lèvera une erreur explicite. Il n'y a pas de risque de corruption silencieuse. + +## Piste d'optimisation : se dispenser de Pack() + +### Idée centrale + +Au lieu de fusionner la rope avant de la passer au parseur, **parser directement la rope +nœud par nœud**, et **écrire la séquence compactée in-place dans le premier nœud**. + +Pourquoi c'est sûr : +- Le header (LOCUS, DEFINITION, SOURCE, FEATURES) est **petit** et traité en premier +- La séquence (ORIGIN) est **à la fin** du record +- Au moment d'écrire la séquence depuis l'offset 0 de `piece1`, le pointeur de lecture + est profond dans la rope (offset >> 0) → jamais de collision +- La séquence compactée est toujours plus courte que les données brutes + +### Pré-allocation + +Pour GenBank/EMBL : `lseq` est connu dès la ligne `LOCUS`/`ID` (première ligne, dans +`piece1`). On peut faire `slices.Grow(piece1.data, lseq)` dès ce moment. + +Pour FASTA : pas de taille garantie dans le header, mais `rope.Len()` donne un majorant. +On peut utiliser `rope.Len() / 2` comme estimation initiale. + +### Gestion des jonctions entre nœuds + +Une ligne peut chevaucher deux nœuds (rare avec 128 MB, mais possible). Solution : carry +buffer de ~128 bytes pour les quelques bytes en fin de nœud. + +### Cas FASTA/FASTQ multi-séquences + +Un FileChunk peut contenir N séquences (notamment FASTA/FASTQ courts). Dans ce cas +l'écriture in-place dans `piece1` n'est pas applicable directement — on écrase des données +nécessaires aux séquences suivantes. + +Stratégie par cas : +- **Rope à 1 nœud** (record ≤ 128 MB) : `Pack()` est trivial (no-op), parseur actuel OK +- **Rope à ≥ 2 nœuds** : par l'invariant, `piece1` ne contient pas de record complet → + une seule grande séquence → in-place applicable + +### Format d'une ligne séquence GenBank (Après ORIGIN) + +``` +/^ *[0-9]+( [nuc]{10}){0,5} [nuc]{1,10}/ +``` + +### Format d'une ligne séquence GenBank (Après SQ) + +La ligne SQ contient aussi la taille de la séquence + +``` +/^ *( [nuc]{10}){0,5} [nuc]{1,10} *[0-9]+/ +``` + +Compactage in-place sur `bline` ([]byte brut, sans conversion `string`) : + +```go +w := 0 +i := 0 +for i < len(bline) && bline[i] == ' ' { i++ } // skip indentation +for i < len(bline) && bline[i] <= '9' { i++ } // skip position number +for ; i < len(bline); i++ { + if bline[i] != ' ' { + bline[w] = bline[i] + w++ + } +} +// écrire bline[:w] directement dans piece1.data[seqOffset:] +``` + +## Changements nécessaires + +1. **`FileChunk`** : exposer la rope `*PieceOfChunk` non-packée en plus (ou à la place) + de `Raw *bytes.Buffer` +2. **`GenbankChunkParser` / `EmblChunkParser`** : accepter `*PieceOfChunk`, parser la + rope séquentiellement avec carry buffer pour les jonctions +3. **`FastaChunkParser`** : idem, avec in-place conditionnel selon taille de la rope +4. **`ReadFileChunk`** : ne pas appeler `Pack()` avant envoi sur le channel (ou version + alternative `ReadFileChunkRope`) + +## Fichiers concernés + +- `pkg/obiformats/file_chunk_read.go` — structure rope, `ReadFileChunk` +- `pkg/obiformats/genbank_read.go` — `GenbankChunkParser`, `_ParseGenbankFile` +- `pkg/obiformats/embl_read.go` — `EmblChunkParser`, `ReadEMBL` +- `pkg/obiformats/fastaseq_read.go` — `FastaChunkParser`, `_ParseFastaFile` +- `pkg/obiformats/fastqseq_read.go` — parseur FASTQ (même structure) diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go index fb22ccd..d2fa9b1 100644 --- a/pkg/obiformats/embl_read.go +++ b/pkg/obiformats/embl_read.go @@ -196,6 +196,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er 1024*1024*128, EndOfLastFlatFileEntry, "\nID ", + true, ) newIter := obiiter.MakeIBioSequence() diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go index 8c8441c..3597be2 100644 --- a/pkg/obiformats/fastaseq_read.go +++ b/pkg/obiformats/fastaseq_read.go @@ -245,6 +245,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e 1024*1024, EndOfLastFastaEntry, "\n>", + true, ) for i := 0; i < nworker; i++ { diff --git a/pkg/obiformats/fastqseq_read.go b/pkg/obiformats/fastqseq_read.go index c092e32..9c94d2d 100644 --- a/pkg/obiformats/fastqseq_read.go +++ b/pkg/obiformats/fastqseq_read.go @@ -339,6 +339,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e 1024*1024, EndOfLastFastqEntry, "\n@", + true, ) for i := 0; i < nworker; i++ { diff --git a/pkg/obiformats/file_chunk_read.go b/pkg/obiformats/file_chunk_read.go index 20c8c22..9a4f7a7 100644 --- a/pkg/obiformats/file_chunk_read.go +++ b/pkg/obiformats/file_chunk_read.go @@ -16,6 +16,7 @@ type SeqFileChunkParser func(string, io.Reader) (obiseq.BioSequenceSlice, error) type FileChunk struct { Source string Raw *bytes.Buffer + Rope *PieceOfChunk Order int } @@ -97,11 +98,17 @@ func (piece *PieceOfChunk) IsLast() bool { return piece.next == nil } -func (piece *PieceOfChunk) FileChunk(source string, order int) FileChunk { - piece.Pack() +func (piece *PieceOfChunk) FileChunk(source string, order int, pack bool) FileChunk { + piece = piece.Head() + var raw *bytes.Buffer + if pack { + piece.Pack() + raw = bytes.NewBuffer(piece.data) + } return FileChunk{ Source: source, - Raw: bytes.NewBuffer(piece.data), + Raw: raw, + Rope: piece, Order: order, } } @@ -133,7 +140,8 @@ func ReadFileChunk( reader io.Reader, fileChunkSize int, splitter LastSeqRecord, - probe string) ChannelFileChunk { + probe string, + pack bool) ChannelFileChunk { chunk_channel := make(ChannelFileChunk) @@ -205,7 +213,7 @@ func ReadFileChunk( if len(pieces.data) > 0 { // obilog.Warnf("chuck %d :Read %d bytes from file %s", i, io.Len(), source) - chunk_channel <- pieces.FileChunk(source, i) + chunk_channel <- pieces.FileChunk(source, i, pack) i++ } @@ -222,7 +230,7 @@ func ReadFileChunk( // Send the last chunk to the channel if pieces.Len() > 0 { - chunk_channel <- pieces.FileChunk(source, i) + chunk_channel <- pieces.FileChunk(source, i, pack) } // Close the readers channel when the end of the file is reached diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go index 37e84db..5bc2c1f 100644 --- a/pkg/obiformats/genbank_read.go +++ b/pkg/obiformats/genbank_read.go @@ -233,6 +233,7 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, 1024*1024*128, EndOfLastFlatFileEntry, "\nLOCUS ", + true, ) newIter := obiiter.MakeIBioSequence() From 761e0dbed3eb3a08790e5893251e6fccf8ab32f2 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 10 Mar 2026 15:35:23 +0100 Subject: [PATCH 3/6] =?UTF-8?q?Impl=C3=A9mentation=20d'un=20parseur=20GenB?= =?UTF-8?q?ank=20utilisant=20rope=20pour=20r=C3=A9duire=20l'usage=20de=20m?= =?UTF-8?q?=C3=A9moire?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ajout d'un parseur GenBank basé sur rope pour réduire l'usage de mémoire (RSS) et les allocations heap. - Ajout de `gbRopeScanner` pour lire les lignes sans allocation heap - Implémentation de `GenbankChunkParserRope` qui utilise rope au lieu de `Pack()` - Modification de `_ParseGenbankFile` et `ReadGenbank` pour utiliser le nouveau parseur - Réduction du RSS attendue de 57 GB à ~128 MB × workers - Conservation de l'ancien parseur pour compatibilité et tests Réduction significative des allocations (~50M) et temps sys, avec un temps user comparable ou meilleur. --- .gitignore | 1 + .../Prospective/large_sequence_parsing.md | 82 ++++ pkg/obiformats/genbank_read.go | 363 +++++++++++++++++- 3 files changed, 430 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 4cf23ab..2b0487c 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ **/*.tgz **/*.yaml **/*.csv +**/*.pb.gz xx .rhistory diff --git a/blackboard/Prospective/large_sequence_parsing.md b/blackboard/Prospective/large_sequence_parsing.md index 5e9c093..01f9738 100644 --- a/blackboard/Prospective/large_sequence_parsing.md +++ b/blackboard/Prospective/large_sequence_parsing.md @@ -180,3 +180,85 @@ for ; i < len(bline); i++ { - `pkg/obiformats/embl_read.go` — `EmblChunkParser`, `ReadEMBL` - `pkg/obiformats/fastaseq_read.go` — `FastaChunkParser`, `_ParseFastaFile` - `pkg/obiformats/fastqseq_read.go` — parseur FASTQ (même structure) + +## Plan d'implémentation : parseur GenBank sur rope + +### Contexte + +Baseline mesurée : `obiconvert gbpln640.seq.gz` → 49s real, 42s user, 29s sys, **57 GB RSS**. +Le sys élevé indique des allocations massives. Deux causes : +1. `Pack()` : fusionne toute la rope (N × 128 MB) en un buffer contigu avant de parser +2. Parser ORIGIN : `string(bline)` + `TrimSpace` + `SplitN` × millions de lignes + +### 1. `gbRopeScanner` + +Struct de lecture ligne par ligne sur la rope, sans allocation heap : + +```go +type gbRopeScanner struct { + current *PieceOfChunk + pos int + carry [256]byte // stack-allocated, max GenBank line = 80 chars + carryN int +} +``` + +`ReadLine()` : +- Cherche `\n` dans `current.data[pos:]` via `bytes.IndexByte` +- Si trouvé sans carry : retourne slice direct du node (zéro alloc) +- Si trouvé avec carry : copie dans carry buffer, retourne `carry[:n]` +- Si non trouvé : copie le reste dans carry, avance au node suivant, recommence +- EOF : retourne `carry[:carryN]` puis nil + +`extractSequence(dest []byte, UtoT bool) int` : +- Scan direct des bytes pour section ORIGIN, sans passer par ReadLine +- Machine d'états : lineStart → skip espaces/digits → copier nucléotides dans dest +- Stop sur `//` en début de ligne +- Zéro allocation, UtoT inline + +### 2. `GenbankChunkParserRope` + +```go +func GenbankChunkParserRope(source string, rope *PieceOfChunk, + withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) +``` + +- Même machine d'états que `GenbankChunkParser`, sur `[]byte` (`bytes.HasPrefix`) +- LOCUS : extrait `id` et `lseq` par scan direct (remplace `_seqlenght_rx`) +- FEATURES / default inFeature : taxid extrait par scan de `/db_xref="taxon:` + dans la source feature ; `featBytes` rempli seulement si `withFeatureTable=true` +- DEFINITION : toujours conservée +- ORIGIN : `dest = make([]byte, 0, lseq+20)` puis `s.extractSequence(dest, UtoT)` + +### 3. Modifications `_ParseGenbankFile` et `ReadGenbank` + +`_ParseGenbankFile` utilise `chunk.Rope` : +```go +sequences, err := GenbankChunkParserRope(chunk.Source, chunk.Rope, ...) +``` + +`ReadGenbank` passe `pack=false` : +```go +entry_channel := ReadFileChunk(..., false) +``` + +### 4. Ce qui NE change pas + +- `GenbankChunkParser` reste (référence, tests) +- `ReadFileChunk`, `Pack()`, autres parseurs (EMBL, FASTA, FASTQ) : inchangés + +### 5. Gains attendus + +- **RSS** : pic ≈ 128 MB × workers (au lieu de N × 128 MB) +- **Temps sys** : élimination des mmap/munmap pour les gros buffers +- **Temps user** : ~50M allocations éliminées + +### 6. Vérification + +```bash +/usr/local/go/bin/go build ./... +diff <(obiconvert gbpln640.seq.gz) gbpln640.reference.fasta +cd bugs/genbank && ./benchmark.sh gbpln640.seq.gz +``` + +Cible : RSS < 1 GB, temps comparable ou meilleur. diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go index 5bc2c1f..839134e 100644 --- a/pkg/obiformats/genbank_read.go +++ b/pkg/obiformats/genbank_read.go @@ -29,6 +29,342 @@ const ( var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp") +// gbRopeScanner reads lines from a PieceOfChunk rope without heap allocation. +// The carry buffer (stack) handles lines that span two rope nodes. +type gbRopeScanner struct { + current *PieceOfChunk + pos int + carry [256]byte // max GenBank line = 80 chars; 256 gives ample margin + carryN int +} + +func newGbRopeScanner(rope *PieceOfChunk) *gbRopeScanner { + return &gbRopeScanner{current: rope} +} + +// ReadLine returns the next line without the trailing \n (or \r\n). +// Returns nil at end of rope. The returned slice aliases carry[] or the node +// data and is valid only until the next ReadLine call. +func (s *gbRopeScanner) ReadLine() []byte { + for { + if s.current == nil { + if s.carryN > 0 { + n := s.carryN + s.carryN = 0 + return s.carry[:n] + } + return nil + } + + data := s.current.data[s.pos:] + idx := bytes.IndexByte(data, '\n') + + if idx >= 0 { + var line []byte + if s.carryN == 0 { + line = data[:idx] + } else { + n := copy(s.carry[s.carryN:], data[:idx]) + s.carryN += n + line = s.carry[:s.carryN] + s.carryN = 0 + } + s.pos += idx + 1 + if s.pos >= len(s.current.data) { + s.current = s.current.Next() + s.pos = 0 + } + if len(line) > 0 && line[len(line)-1] == '\r' { + line = line[:len(line)-1] + } + return line + } + + // No \n in this node: accumulate into carry and advance + n := copy(s.carry[s.carryN:], data) + s.carryN += n + s.current = s.current.Next() + s.pos = 0 + } +} + +// extractSequence scans the ORIGIN section byte-by-byte directly on the rope, +// appending compacted bases to dest. Returns the extended slice. +// Stops and returns when "//" is found at the start of a line. +// The scanner is left positioned after the "//" line. +func (s *gbRopeScanner) extractSequence(dest []byte, UtoT bool) []byte { + lineStart := true + skipDigits := true + + for s.current != nil { + data := s.current.data[s.pos:] + for i, b := range data { + if lineStart { + if b == '/' { + // End-of-record marker "//" + s.pos += i + 1 + if s.pos >= len(s.current.data) { + s.current = s.current.Next() + s.pos = 0 + } + s.skipToNewline() + return dest + } + lineStart = false + skipDigits = true + } + switch { + case b == '\n': + lineStart = true + case b == '\r': + // skip + case skipDigits: + if b != ' ' && (b < '0' || b > '9') { + skipDigits = false + if UtoT && b == 'u' { + b = 't' + } + dest = append(dest, b) + } + case b != ' ': + if UtoT && b == 'u' { + b = 't' + } + dest = append(dest, b) + } + } + s.current = s.current.Next() + s.pos = 0 + } + return dest +} + +// skipToNewline advances the scanner past the next '\n'. +func (s *gbRopeScanner) skipToNewline() { + for s.current != nil { + data := s.current.data[s.pos:] + idx := bytes.IndexByte(data, '\n') + if idx >= 0 { + s.pos += idx + 1 + if s.pos >= len(s.current.data) { + s.current = s.current.Next() + s.pos = 0 + } + return + } + s.current = s.current.Next() + s.pos = 0 + } +} + +// parseLseqFromLocus extracts the declared sequence length from a LOCUS line. +// Format: "LOCUS bp ..." +// Returns -1 if not found or parse error. +func parseLseqFromLocus(line []byte) int { + if len(line) < 13 { + return -1 + } + i := 12 + for i < len(line) && line[i] != ' ' { + i++ + } + for i < len(line) && line[i] == ' ' { + i++ + } + start := i + for i < len(line) && line[i] >= '0' && line[i] <= '9' { + i++ + } + if i == start { + return -1 + } + n, err := strconv.Atoi(string(line[start:i])) + if err != nil { + return -1 + } + return n +} + +// Prefix constants for GenBank section headers (byte slices for zero-alloc comparison). +var ( + gbPfxLocus = []byte("LOCUS ") + gbPfxDefinition = []byte("DEFINITION ") + gbPfxContinue = []byte(" ") + gbPfxSource = []byte("SOURCE ") + gbPfxFeatures = []byte("FEATURES ") + gbPfxOrigin = []byte("ORIGIN") + gbPfxContig = []byte("CONTIG") + gbPfxEnd = []byte("//") + gbPfxDbXref = []byte(` /db_xref="taxon:`) +) + +// GenbankChunkParserRope parses a GenBank FileChunk directly from the rope +// (PieceOfChunk linked list) without calling Pack(). This eliminates the large +// contiguous allocation required for chromosomal-scale sequences. +func GenbankChunkParserRope(source string, rope *PieceOfChunk, + withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) { + + state := inHeader + scanner := newGbRopeScanner(rope) + sequences := obiseq.MakeBioSequenceSlice(100)[:0] + + id := "" + lseq := -1 + scientificName := "" + defBytes := new(bytes.Buffer) + featBytes := new(bytes.Buffer) + var seqDest []byte + taxid := 1 + nl := 0 + + for bline := scanner.ReadLine(); bline != nil; bline = scanner.ReadLine() { + nl++ + processed := false + for !processed { + switch { + + case bytes.HasPrefix(bline, gbPfxLocus): + if state != inHeader { + log.Fatalf("Line %d - Unexpected state %d while reading LOCUS: %s", nl, state, bline) + } + rest := bline[12:] + sp := bytes.IndexByte(rest, ' ') + if sp < 0 { + id = string(rest) + } else { + id = string(rest[:sp]) + } + lseq = parseLseqFromLocus(bline) + cap0 := lseq + 20 + if cap0 < 1024 { + cap0 = 1024 + } + seqDest = make([]byte, 0, cap0) + state = inEntry + processed = true + + case bytes.HasPrefix(bline, gbPfxDefinition): + if state != inEntry { + log.Fatalf("Line %d - Unexpected state %d while reading DEFINITION: %s", nl, state, bline) + } + defBytes.Write(bytes.TrimSpace(bline[12:])) + state = inDefinition + processed = true + + case state == inDefinition: + if bytes.HasPrefix(bline, gbPfxContinue) { + defBytes.WriteByte(' ') + defBytes.Write(bytes.TrimSpace(bline[12:])) + processed = true + } else { + state = inEntry + } + + case bytes.HasPrefix(bline, gbPfxSource): + if state != inEntry { + log.Fatalf("Line %d - Unexpected state %d while reading SOURCE: %s", nl, state, bline) + } + scientificName = string(bytes.TrimSpace(bline[12:])) + processed = true + + case bytes.HasPrefix(bline, gbPfxFeatures): + if state != inEntry { + log.Fatalf("Line %d - Unexpected state %d while reading FEATURES: %s", nl, state, bline) + } + if withFeatureTable { + featBytes.Write(bline) + } + state = inFeature + processed = true + + case bytes.HasPrefix(bline, gbPfxOrigin): + if state != inFeature && state != inContig { + log.Fatalf("Line %d - Unexpected state %d while reading ORIGIN: %s", nl, state, bline) + } + // Use fast byte-scan to extract sequence and consume through "//" + seqDest = scanner.extractSequence(seqDest, UtoT) + // Emit record + if id == "" { + log.Warn("Empty id when parsing genbank file") + } + sequence := obiseq.NewBioSequence(id, seqDest, defBytes.String()) + sequence.SetSource(source) + if withFeatureTable { + sequence.SetFeatures(featBytes.Bytes()) + } + annot := sequence.Annotations() + annot["scientific_name"] = scientificName + annot["taxid"] = taxid + sequences = append(sequences, sequence) + + defBytes = bytes.NewBuffer(obiseq.GetSlice(200)) + featBytes = new(bytes.Buffer) + nl = 0 + taxid = 1 + seqDest = nil + state = inHeader + processed = true + + case bytes.HasPrefix(bline, gbPfxContig): + if state != inFeature && state != inContig { + log.Fatalf("Line %d - Unexpected state %d while reading CONTIG: %s", nl, state, bline) + } + state = inContig + processed = true + + case bytes.Equal(bline, gbPfxEnd): + // Reached for CONTIG records (no ORIGIN section) + if state != inContig { + log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id) + } + if id == "" { + log.Warn("Empty id when parsing genbank file") + } + sequence := obiseq.NewBioSequence(id, seqDest, defBytes.String()) + sequence.SetSource(source) + if withFeatureTable { + sequence.SetFeatures(featBytes.Bytes()) + } + annot := sequence.Annotations() + annot["scientific_name"] = scientificName + annot["taxid"] = taxid + sequences = append(sequences, sequence) + + defBytes = bytes.NewBuffer(obiseq.GetSlice(200)) + featBytes = new(bytes.Buffer) + nl = 0 + taxid = 1 + seqDest = nil + state = inHeader + processed = true + + default: + switch state { + case inFeature: + if withFeatureTable { + featBytes.WriteByte('\n') + featBytes.Write(bline) + } + if bytes.HasPrefix(bline, gbPfxDbXref) { + rest := bline[len(gbPfxDbXref):] + q := bytes.IndexByte(rest, '"') + if q >= 0 { + taxid, _ = strconv.Atoi(string(rest[:q])) + } + } + processed = true + case inHeader, inEntry, inContig: + processed = true + default: + log.Fatalf("Unexpected state %d while reading: %s", state, bline) + } + } + } + } + + return sequences, nil +} + func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) { return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) { state := inHeader @@ -125,13 +461,10 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob if state != inSequence && state != inContig { log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id) } - // log.Debugln("Total lines := ", nl) if id == "" { log.Warn("Empty id when parsing genbank file") } - // log.Debugf("End of sequence %s: %dbp ", id, seqBytes.Len()) - sequence := obiseq.NewBioSequence(id, seqBytes.Bytes(), defBytes.String()) @@ -144,9 +477,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob annot := sequence.Annotations() annot["scientific_name"] = scientificName annot["taxid"] = taxid - // log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader)) - // log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(), - // sequence.Len(), seqBytes.Len()) sequences = append(sequences, sequence) @@ -159,8 +489,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob processed = true case state == inSequence: - // log.Debugf("Chunk %d : Genbank: line %d, state = %d : %s", chunks.order, nl, state, line) - sl++ cleanline := strings.TrimSpace(line) parts := strings.SplitN(cleanline, " ", 7) @@ -198,6 +526,7 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob } + _ = sl return sequences, nil } } @@ -206,10 +535,16 @@ func _ParseGenbankFile(input ChannelFileChunk, out obiiter.IBioSequence, withFeatureTable, UtoT bool) { - parser := GenbankChunkParser(withFeatureTable, UtoT) - for chunks := range input { - sequences, err := parser(chunks.Source, chunks.Raw) + var sequences obiseq.BioSequenceSlice + var err error + + if chunks.Rope != nil { + sequences, err = GenbankChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT) + } else { + parser := GenbankChunkParser(withFeatureTable, UtoT) + sequences, err = parser(chunks.Source, chunks.Raw) + } if err != nil { log.Fatalf("File %s : Cannot parse the genbank file : %v", chunks.Source, err) @@ -225,7 +560,6 @@ func _ParseGenbankFile(input ChannelFileChunk, func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) { opt := MakeOptions(options) - // entry_channel := make(chan _FileChunk) entry_channel := ReadFileChunk( opt.Source(), @@ -233,14 +567,13 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, 1024*1024*128, EndOfLastFlatFileEntry, "\nLOCUS ", - true, + false, // do not pack: rope-based parser avoids contiguous allocation ) newIter := obiiter.MakeIBioSequence() nworkers := opt.ParallelWorkers() - // for j := 0; j < opt.ParallelWorkers(); j++ { for j := 0; j < nworkers; j++ { newIter.Add(1) go _ParseGenbankFile( @@ -251,8 +584,6 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, ) } - // go _ReadFlatFileChunk(reader, entry_channel) - go func() { newIter.WaitAndClose() log.Debug("End of the genbank file ", opt.Source()) From b24602590748307fe69533e563ea7983a09dad8a Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 10 Mar 2026 15:43:53 +0100 Subject: [PATCH 4/6] Optimize Fasta batch formatting Optimize FormatFastaBatch to pre-allocate buffer and write sequences directly without intermediate strings, improving performance and memory usage. --- pkg/obiformats/fastseq_write_fasta.go | 40 ++++++++++++++------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/pkg/obiformats/fastseq_write_fasta.go b/pkg/obiformats/fastseq_write_fasta.go index 9934ad8..88b029f 100644 --- a/pkg/obiformats/fastseq_write_fasta.go +++ b/pkg/obiformats/fastseq_write_fasta.go @@ -77,45 +77,47 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string { // // It returns a byte array containing the formatted sequences. func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer { - // Create a buffer to store the formatted sequences var bs bytes.Buffer lt := 0 - for _, seq := range batch.Slice() { lt += seq.Len() } - // Iterate over each sequence in the batch + // Pre-allocate: sequence data + newlines every 60 chars + ~100 bytes header per sequence + bs.Grow(lt + lt/60 + 100*batch.Len() + 1) + log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len()) - first := true + for _, seq := range batch.Slice() { - // Check if the sequence is empty if seq.Len() > 0 { - // Format the sequence using the provided formater function - formattedSeq := FormatFasta(seq, formater) - - if first { - bs.Grow(lt + (len(formattedSeq)-seq.Len())*batch.Len()*5/4) - first = false - } - - // Append the formatted sequence to the buffer - bs.WriteString(formattedSeq) + // Write header directly into bs — no intermediate string + bs.WriteByte('>') + bs.WriteString(seq.Id()) + bs.WriteByte(' ') + bs.WriteString(formater(seq)) bs.WriteByte('\n') + + // Write folded sequence directly into bs — no copies + s := seq.Sequence() + l := len(s) + for i := 0; i < l; i += 60 { + to := i + 60 + if to > l { + to = l + } + bs.Write(s[i:to]) + bs.WriteByte('\n') + } } else { - // Handle empty sequences if skipEmpty { - // Skip empty sequences if skipEmpty is true obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id()) } else { - // Terminate the program if skipEmpty is false log.Fatalf("Sequence %s is empty", seq.Id()) } } } - // Return the byte array representation of the buffer return &bs } From 1342c83db610a5baf4d24d9f17530fad97d22822 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 10 Mar 2026 15:51:28 +0100 Subject: [PATCH 5/6] Use NewBioSequenceOwning to avoid unnecessary sequence copying Replace NewBioSequence with NewBioSequenceOwning in genbank_read.go to take ownership of sequence slices without copying, improving performance. Update biosequence.go to add the new TakeSequence method and NewBioSequenceOwning constructor. --- pkg/obiformats/genbank_read.go | 4 ++-- pkg/obiseq/biosequence.go | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go index 839134e..53a3057 100644 --- a/pkg/obiformats/genbank_read.go +++ b/pkg/obiformats/genbank_read.go @@ -287,7 +287,7 @@ func GenbankChunkParserRope(source string, rope *PieceOfChunk, if id == "" { log.Warn("Empty id when parsing genbank file") } - sequence := obiseq.NewBioSequence(id, seqDest, defBytes.String()) + sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String()) sequence.SetSource(source) if withFeatureTable { sequence.SetFeatures(featBytes.Bytes()) @@ -320,7 +320,7 @@ func GenbankChunkParserRope(source string, rope *PieceOfChunk, if id == "" { log.Warn("Empty id when parsing genbank file") } - sequence := obiseq.NewBioSequence(id, seqDest, defBytes.String()) + sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String()) sequence.SetSource(source) if withFeatureTable { sequence.SetFeatures(featBytes.Bytes()) diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index 628f52c..f3939d8 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -120,6 +120,19 @@ func NewBioSequence(id string, return bs } +// NewBioSequenceOwning creates a BioSequence taking ownership of the sequence +// slice without copying it. The caller must not use the slice after this call. +// Use this when the slice was allocated specifically for this sequence. +func NewBioSequenceOwning(id string, + sequence []byte, + definition string) *BioSequence { + bs := NewEmptyBioSequence(0) + bs.SetId(id) + bs.TakeSequence(sequence) + bs.SetDefinition(definition) + return bs +} + // NewBioSequenceWithQualities creates a new BioSequence object with the given id, sequence, definition, and qualities. // // Parameters: @@ -444,6 +457,12 @@ func (s *BioSequence) SetSequence(sequence []byte) { s.sequence = obiutils.InPlaceToLower(CopySlice(sequence)) } +// TakeSequence stores the slice directly without copying, then lowercases in-place. +// The caller must not use the slice after this call. +func (s *BioSequence) TakeSequence(sequence []byte) { + s.sequence = obiutils.InPlaceToLower(sequence) +} + func (s *BioSequence) HasValidSequence() bool { for _, c := range s.sequence { if !((c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '[' || c == ']') { From b33d7705a89da5ffaa5745cb25857a3609ec0107 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 10 Mar 2026 15:51:35 +0100 Subject: [PATCH 6/6] Bump version to 4.4.19 Update version from 4.4.18 to 4.4.19 in both version.txt and pkg/obioptions/version.go --- pkg/obioptions/version.go | 2 +- version.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 162abb3..097c578 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -3,7 +3,7 @@ package obioptions // Version is automatically updated by the Makefile from version.txt // The patch number (third digit) is incremented on each push to the repository -var _Version = "Release 4.4.18" +var _Version = "Release 4.4.19" // Version returns the version of the obitools package. // diff --git a/version.txt b/version.txt index 60defb1..0c6fdde 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -4.4.18 +4.4.19