diff --git a/.gitignore b/.gitignore index 4cf23ab..2b0487c 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ **/*.tgz **/*.yaml **/*.csv +**/*.pb.gz xx .rhistory diff --git a/Makefile b/Makefile index 36780d9..afc3772 100644 --- a/Makefile +++ b/Makefile @@ -128,40 +128,58 @@ jjnew: @echo "$(GREEN)✓ New commit created$(NC)" jjpush: - @echo "$(YELLOW)→ Pushing commit to repository...$(NC)" + @$(MAKE) jjpush-describe + @$(MAKE) jjpush-bump + @$(MAKE) jjpush-push + @$(MAKE) jjpush-tag + @echo "$(GREEN)✓ Release complete$(NC)" + +jjpush-describe: @echo "$(BLUE)→ Documenting current commit...$(NC)" @jj auto-describe + +jjpush-bump: @echo "$(BLUE)→ Creating new commit for version bump...$(NC)" @jj new - @previous_version=$$(cat version.txt); \ - $(MAKE) bump-version; \ - version=$$(cat version.txt); \ + @$(MAKE) bump-version + @echo "$(BLUE)→ Documenting version bump commit...$(NC)" + @jj auto-describe + +jjpush-push: + @echo "$(BLUE)→ Pushing commits...$(NC)" + @jj git push --change @ + +jjpush-tag: + @version=$$(cat version.txt); \ tag_name="Release_$$version"; \ - previous_tag="Release_$$previous_version"; \ - echo "$(BLUE)→ Documenting version bump commit...$(NC)"; \ - jj auto-describe; \ - echo "$(BLUE)→ Generating release notes from $$previous_tag to current commit...$(NC)"; \ + echo "$(BLUE)→ Generating release notes for $$tag_name...$(NC)"; \ + release_message="Release $$version"; \ if command -v orla >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then \ - release_json=$$(ORLA_MAX_TOOL_CALLS=50 jj log -r "$$previous_tag::@" -T 'commit_id.short() ++ " " ++ description' | \ - orla agent -m ollama:qwen3-coder-next:latest \ - "Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"\", \"body\": \"\"}"); \ - release_json=$$(echo "$$release_json" | sed -n '/^{/,/^}/p'); \ - release_title=$$(echo "$$release_json" | jq -r '.title // empty') ; \ - release_body=$$(echo "$$release_json" | jq -r '.body // empty') ; \ - if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \ - release_message="$$release_title"$$'\n\n'"$$release_body"; \ + previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' HEAD^ 2>/dev/null); \ + if [ -z "$$previous_tag" ]; then \ + echo "$(YELLOW)⚠ No previous Release tag found, skipping release notes$(NC)"; \ else \ - echo "$(YELLOW)⚠ JSON parsing failed, falling back to raw output$(NC)"; \ - release_message="Release $$version"$$'\n\n'"$$release_json"; \ + raw_output=$$(git log --format="%h %B" "$$previous_tag..HEAD" | \ + ORLA_MAX_TOOL_CALLS=50 orla agent -m ollama:qwen3-coder-next:latest \ + "Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"\", \"body\": \"\"}" 2>/dev/null) || true; \ + if [ -n "$$raw_output" ]; then \ + sanitized=$$(echo "$$raw_output" | sed -n '/^{/,/^}/p' | tr -d '\000-\011\013-\014\016-\037'); \ + release_title=$$(echo "$$sanitized" | jq -r '.title // empty' 2>/dev/null) ; \ + release_body=$$(echo "$$sanitized" | jq -r '.body // empty' 2>/dev/null) ; \ + if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \ + release_message="$$release_title"$$'\n\n'"$$release_body"; \ + else \ + echo "$(YELLOW)⚠ JSON parsing failed, using default release message$(NC)"; \ + fi; \ + fi; \ fi; \ - else \ - release_message="Release $$version"; \ fi; \ - echo "$(BLUE)→ Pushing commits and creating tag $$tag_name...$(NC)"; \ - jj git push --change @; \ - git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "Tag $$tag_name already exists"; \ - git push origin "$$tag_name" 2>/dev/null || echo "Tag already pushed" - @echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)" + install_section=$$'\n## Installation\n\n### Pre-built binaries\n\nDownload the appropriate archive for your system from the\n[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_'"$$version"')\nand extract it:\n\n#### Linux (AMD64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_amd64.tar.gz\n```\n\n#### Linux (ARM64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_arm64.tar.gz\n```\n\n#### macOS (Intel)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_amd64.tar.gz\n```\n\n#### macOS (Apple Silicon)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_arm64.tar.gz\n```\n\nAll OBITools4 binaries are included in each archive.\n\n### From source\n\nYou can also compile and install OBITools4 directly from source using the\ninstallation script:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version '"$$version"'\n```\n\nBy default binaries are installed in `/usr/local/bin`. Use `--install-dir` to\nchange the destination and `--obitools-prefix` to add a prefix to command names:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\\n bash -s -- --version '"$$version"' --install-dir ~/local --obitools-prefix k\n```\n'; \ + release_message="$$release_message$$install_section"; \ + echo "$(BLUE)→ Creating tag $$tag_name...$(NC)"; \ + git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "$(YELLOW)⚠ Tag $$tag_name already exists$(NC)"; \ + echo "$(BLUE)→ Pushing tag $$tag_name...$(NC)"; \ + git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)" jjfetch: @echo "$(YELLOW)→ Pulling latest commits...$(NC)" @@ -169,5 +187,5 @@ jjfetch: @jj new master@origin @echo "$(GREEN)✓ Latest commits pulled$(NC)" -.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch bump-version .FORCE +.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjpush-describe jjpush-bump jjpush-push jjpush-tag jjfetch bump-version .FORCE .FORCE: diff --git a/blackboard/Prospective/large_sequence_parsing.md b/blackboard/Prospective/large_sequence_parsing.md new file mode 100644 index 0000000..01f9738 --- /dev/null +++ b/blackboard/Prospective/large_sequence_parsing.md @@ -0,0 +1,264 @@ +# Optimisation du parsing des grandes séquences + +## Contexte + +OBITools4 doit pouvoir traiter des séquences de taille chromosomique (plusieurs Gbp), notamment +issues de fichiers GenBank/EMBL (assemblages de génomes) ou de fichiers FASTA convertis depuis +ces formats. + +## Architecture actuelle + +### Pipeline de lecture (`pkg/obiformats/`) + +``` +ReadFileChunk (goroutine) + → ChannelFileChunk + → N × _ParseGenbankFile / _ParseFastaFile (goroutines) + → IBioSequence +``` + +`ReadFileChunk` (`file_chunk_read.go`) lit le fichier par morceaux via une chaîne de +`PieceOfChunk` (rope). Chaque nœud fait `fileChunkSize` bytes : + +- GenBank/EMBL : 128 MB (`1024*1024*128`) +- FASTA/FASTQ : 1 MB (`1024*1024`) + +La chaîne est accumulée jusqu'à trouver la fin du dernier enregistrement complet (splitter), +puis `Pack()` est appelé pour fusionner tous les nœuds en un seul buffer contigu. Ce buffer +est transmis au parseur via `FileChunk.Raw *bytes.Buffer`. + +### Parseur GenBank (`genbank_read.go`) + +`GenbankChunkParser` reçoit un `io.Reader` sur le buffer packé, lit ligne par ligne via +`bufio.NewReader` (buffer 4096 bytes), et pour chaque ligne de la section `ORIGIN` : + +```go +line = string(bline) // allocation par ligne +cleanline := strings.TrimSpace(line) // allocation +parts := strings.SplitN(cleanline, " ", 7) // allocation []string + substrings +for i := 1; i < lparts; i++ { + seqBytes.WriteString(parts[i]) +} +``` + +Point positif : `seqBytes` est pré-alloué grâce à `lseq` extrait de la ligne `LOCUS`. + +### Parseur FASTA (`fastaseq_read.go`) + +`FastaChunkParser` lit **octet par octet** via `scanner.ReadByte()`. Pour 3 Gbp : +3 milliards d'appels. `seqBytes` est un `bytes.Buffer{}` sans pré-allocation. + +## Problème principal + +Pour une séquence de plusieurs Gbp, `Pack()` fusionne une chaîne de ~N nœuds de 128 MB en +un seul buffer contigu. C'est une allocation de N × 128 MB suivie d'une copie de toutes les +données. Bien que l'implémentation de `Pack()` soit efficace (libère les nœuds au fur et à +mesure via `slices.Grow`), la copie est inévitable avec l'architecture actuelle. + +De plus, le parseur GenBank produit des dizaines de millions d'allocations temporaires pour +parser la section `ORIGIN` (une par ligne). + +## Invariant clé découvert + +**Si la rope a plus d'un nœud, le premier nœud seul ne se termine pas sur une frontière +d'enregistrement** (pas de `//\n` en fin de `piece1`). + +Preuve par construction dans `ReadFileChunk` : +- `splitter` est appelé dès le premier nœud (ligne 157) +- Si `end >= 0` → frontière trouvée dans 128 MB → boucle interne sautée → rope à 1 nœud +- Si `end < 0` → boucle interne ajoute des nœuds → rope à ≥ 2 nœuds + +Corollaire : si rope à 1 nœud, `Pack()` ne fait rien (aucun nœud suivant). + +**Attention** : rope à ≥ 2 nœuds ne signifie pas qu'il n'y a qu'une seule séquence dans +la rope. La rope packée peut contenir plusieurs enregistrements complets. Exemple : records +de 80 MB → `nextpieces` (48 MB de reste) + nouveau nœud (128 MB) = rope à 2 nœuds +contenant 2 records complets + début d'un troisième. + +L'invariant dit seulement que `piece1` seul est incomplet — pas que la rope entière +ne contient qu'un seul record. + +**Invariant : le dernier FileChunk envoyé finit sur une frontière d'enregistrement.** + +Deux chemins dans `ReadFileChunk` : + +1. **Chemin normal** (`end >= 0` via `splitter`) : le buffer est explicitement tronqué à + `end` (ligne 200 : `pieces.data = pieces.data[:end]`). Frontière garantie par construction + pour tous les formats. ✓ + +2. **Chemin EOF** (`end < 0`, `end = pieces.Len()`) : tout le reste du fichier est envoyé. + - **GenBank/EMBL** : présuppose fichier bien formé (se termine par `//\n`). Le parseur + lève un `log.Fatalf` sur tout état inattendu — filet de sécurité suffisant. ✓ + - **FASTQ** : présupposé, vérifié par le parseur. ✓ + - **FASTA** : garanti par le format lui-même (fin d'enregistrement = EOF ou `>`). ✓ + +**Hypothèse de travail adoptée** : les fichiers d'entrée sont bien formés. Dans le pire cas, +le parseur lèvera une erreur explicite. Il n'y a pas de risque de corruption silencieuse. + +## Piste d'optimisation : se dispenser de Pack() + +### Idée centrale + +Au lieu de fusionner la rope avant de la passer au parseur, **parser directement la rope +nœud par nœud**, et **écrire la séquence compactée in-place dans le premier nœud**. + +Pourquoi c'est sûr : +- Le header (LOCUS, DEFINITION, SOURCE, FEATURES) est **petit** et traité en premier +- La séquence (ORIGIN) est **à la fin** du record +- Au moment d'écrire la séquence depuis l'offset 0 de `piece1`, le pointeur de lecture + est profond dans la rope (offset >> 0) → jamais de collision +- La séquence compactée est toujours plus courte que les données brutes + +### Pré-allocation + +Pour GenBank/EMBL : `lseq` est connu dès la ligne `LOCUS`/`ID` (première ligne, dans +`piece1`). On peut faire `slices.Grow(piece1.data, lseq)` dès ce moment. + +Pour FASTA : pas de taille garantie dans le header, mais `rope.Len()` donne un majorant. +On peut utiliser `rope.Len() / 2` comme estimation initiale. + +### Gestion des jonctions entre nœuds + +Une ligne peut chevaucher deux nœuds (rare avec 128 MB, mais possible). Solution : carry +buffer de ~128 bytes pour les quelques bytes en fin de nœud. + +### Cas FASTA/FASTQ multi-séquences + +Un FileChunk peut contenir N séquences (notamment FASTA/FASTQ courts). Dans ce cas +l'écriture in-place dans `piece1` n'est pas applicable directement — on écrase des données +nécessaires aux séquences suivantes. + +Stratégie par cas : +- **Rope à 1 nœud** (record ≤ 128 MB) : `Pack()` est trivial (no-op), parseur actuel OK +- **Rope à ≥ 2 nœuds** : par l'invariant, `piece1` ne contient pas de record complet → + une seule grande séquence → in-place applicable + +### Format d'une ligne séquence GenBank (Après ORIGIN) + +``` +/^ *[0-9]+( [nuc]{10}){0,5} [nuc]{1,10}/ +``` + +### Format d'une ligne séquence GenBank (Après SQ) + +La ligne SQ contient aussi la taille de la séquence + +``` +/^ *( [nuc]{10}){0,5} [nuc]{1,10} *[0-9]+/ +``` + +Compactage in-place sur `bline` ([]byte brut, sans conversion `string`) : + +```go +w := 0 +i := 0 +for i < len(bline) && bline[i] == ' ' { i++ } // skip indentation +for i < len(bline) && bline[i] <= '9' { i++ } // skip position number +for ; i < len(bline); i++ { + if bline[i] != ' ' { + bline[w] = bline[i] + w++ + } +} +// écrire bline[:w] directement dans piece1.data[seqOffset:] +``` + +## Changements nécessaires + +1. **`FileChunk`** : exposer la rope `*PieceOfChunk` non-packée en plus (ou à la place) + de `Raw *bytes.Buffer` +2. **`GenbankChunkParser` / `EmblChunkParser`** : accepter `*PieceOfChunk`, parser la + rope séquentiellement avec carry buffer pour les jonctions +3. **`FastaChunkParser`** : idem, avec in-place conditionnel selon taille de la rope +4. **`ReadFileChunk`** : ne pas appeler `Pack()` avant envoi sur le channel (ou version + alternative `ReadFileChunkRope`) + +## Fichiers concernés + +- `pkg/obiformats/file_chunk_read.go` — structure rope, `ReadFileChunk` +- `pkg/obiformats/genbank_read.go` — `GenbankChunkParser`, `_ParseGenbankFile` +- `pkg/obiformats/embl_read.go` — `EmblChunkParser`, `ReadEMBL` +- `pkg/obiformats/fastaseq_read.go` — `FastaChunkParser`, `_ParseFastaFile` +- `pkg/obiformats/fastqseq_read.go` — parseur FASTQ (même structure) + +## Plan d'implémentation : parseur GenBank sur rope + +### Contexte + +Baseline mesurée : `obiconvert gbpln640.seq.gz` → 49s real, 42s user, 29s sys, **57 GB RSS**. +Le sys élevé indique des allocations massives. Deux causes : +1. `Pack()` : fusionne toute la rope (N × 128 MB) en un buffer contigu avant de parser +2. Parser ORIGIN : `string(bline)` + `TrimSpace` + `SplitN` × millions de lignes + +### 1. `gbRopeScanner` + +Struct de lecture ligne par ligne sur la rope, sans allocation heap : + +```go +type gbRopeScanner struct { + current *PieceOfChunk + pos int + carry [256]byte // stack-allocated, max GenBank line = 80 chars + carryN int +} +``` + +`ReadLine()` : +- Cherche `\n` dans `current.data[pos:]` via `bytes.IndexByte` +- Si trouvé sans carry : retourne slice direct du node (zéro alloc) +- Si trouvé avec carry : copie dans carry buffer, retourne `carry[:n]` +- Si non trouvé : copie le reste dans carry, avance au node suivant, recommence +- EOF : retourne `carry[:carryN]` puis nil + +`extractSequence(dest []byte, UtoT bool) int` : +- Scan direct des bytes pour section ORIGIN, sans passer par ReadLine +- Machine d'états : lineStart → skip espaces/digits → copier nucléotides dans dest +- Stop sur `//` en début de ligne +- Zéro allocation, UtoT inline + +### 2. `GenbankChunkParserRope` + +```go +func GenbankChunkParserRope(source string, rope *PieceOfChunk, + withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) +``` + +- Même machine d'états que `GenbankChunkParser`, sur `[]byte` (`bytes.HasPrefix`) +- LOCUS : extrait `id` et `lseq` par scan direct (remplace `_seqlenght_rx`) +- FEATURES / default inFeature : taxid extrait par scan de `/db_xref="taxon:` + dans la source feature ; `featBytes` rempli seulement si `withFeatureTable=true` +- DEFINITION : toujours conservée +- ORIGIN : `dest = make([]byte, 0, lseq+20)` puis `s.extractSequence(dest, UtoT)` + +### 3. Modifications `_ParseGenbankFile` et `ReadGenbank` + +`_ParseGenbankFile` utilise `chunk.Rope` : +```go +sequences, err := GenbankChunkParserRope(chunk.Source, chunk.Rope, ...) +``` + +`ReadGenbank` passe `pack=false` : +```go +entry_channel := ReadFileChunk(..., false) +``` + +### 4. Ce qui NE change pas + +- `GenbankChunkParser` reste (référence, tests) +- `ReadFileChunk`, `Pack()`, autres parseurs (EMBL, FASTA, FASTQ) : inchangés + +### 5. Gains attendus + +- **RSS** : pic ≈ 128 MB × workers (au lieu de N × 128 MB) +- **Temps sys** : élimination des mmap/munmap pour les gros buffers +- **Temps user** : ~50M allocations éliminées + +### 6. Vérification + +```bash +/usr/local/go/bin/go build ./... +diff <(obiconvert gbpln640.seq.gz) gbpln640.reference.fasta +cd bugs/genbank && ./benchmark.sh gbpln640.seq.gz +``` + +Cible : RSS < 1 GB, temps comparable ou meilleur. diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go index fb22ccd..d2fa9b1 100644 --- a/pkg/obiformats/embl_read.go +++ b/pkg/obiformats/embl_read.go @@ -196,6 +196,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er 1024*1024*128, EndOfLastFlatFileEntry, "\nID ", + true, ) newIter := obiiter.MakeIBioSequence() diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go index 8c8441c..3597be2 100644 --- a/pkg/obiformats/fastaseq_read.go +++ b/pkg/obiformats/fastaseq_read.go @@ -245,6 +245,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e 1024*1024, EndOfLastFastaEntry, "\n>", + true, ) for i := 0; i < nworker; i++ { diff --git a/pkg/obiformats/fastqseq_read.go b/pkg/obiformats/fastqseq_read.go index c092e32..9c94d2d 100644 --- a/pkg/obiformats/fastqseq_read.go +++ b/pkg/obiformats/fastqseq_read.go @@ -339,6 +339,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e 1024*1024, EndOfLastFastqEntry, "\n@", + true, ) for i := 0; i < nworker; i++ { diff --git a/pkg/obiformats/fastseq_write_fasta.go b/pkg/obiformats/fastseq_write_fasta.go index 9934ad8..88b029f 100644 --- a/pkg/obiformats/fastseq_write_fasta.go +++ b/pkg/obiformats/fastseq_write_fasta.go @@ -77,45 +77,47 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string { // // It returns a byte array containing the formatted sequences. func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer { - // Create a buffer to store the formatted sequences var bs bytes.Buffer lt := 0 - for _, seq := range batch.Slice() { lt += seq.Len() } - // Iterate over each sequence in the batch + // Pre-allocate: sequence data + newlines every 60 chars + ~100 bytes header per sequence + bs.Grow(lt + lt/60 + 100*batch.Len() + 1) + log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len()) - first := true + for _, seq := range batch.Slice() { - // Check if the sequence is empty if seq.Len() > 0 { - // Format the sequence using the provided formater function - formattedSeq := FormatFasta(seq, formater) - - if first { - bs.Grow(lt + (len(formattedSeq)-seq.Len())*batch.Len()*5/4) - first = false - } - - // Append the formatted sequence to the buffer - bs.WriteString(formattedSeq) + // Write header directly into bs — no intermediate string + bs.WriteByte('>') + bs.WriteString(seq.Id()) + bs.WriteByte(' ') + bs.WriteString(formater(seq)) bs.WriteByte('\n') + + // Write folded sequence directly into bs — no copies + s := seq.Sequence() + l := len(s) + for i := 0; i < l; i += 60 { + to := i + 60 + if to > l { + to = l + } + bs.Write(s[i:to]) + bs.WriteByte('\n') + } } else { - // Handle empty sequences if skipEmpty { - // Skip empty sequences if skipEmpty is true obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id()) } else { - // Terminate the program if skipEmpty is false log.Fatalf("Sequence %s is empty", seq.Id()) } } } - // Return the byte array representation of the buffer return &bs } diff --git a/pkg/obiformats/file_chunk_read.go b/pkg/obiformats/file_chunk_read.go index 20c8c22..9a4f7a7 100644 --- a/pkg/obiformats/file_chunk_read.go +++ b/pkg/obiformats/file_chunk_read.go @@ -16,6 +16,7 @@ type SeqFileChunkParser func(string, io.Reader) (obiseq.BioSequenceSlice, error) type FileChunk struct { Source string Raw *bytes.Buffer + Rope *PieceOfChunk Order int } @@ -97,11 +98,17 @@ func (piece *PieceOfChunk) IsLast() bool { return piece.next == nil } -func (piece *PieceOfChunk) FileChunk(source string, order int) FileChunk { - piece.Pack() +func (piece *PieceOfChunk) FileChunk(source string, order int, pack bool) FileChunk { + piece = piece.Head() + var raw *bytes.Buffer + if pack { + piece.Pack() + raw = bytes.NewBuffer(piece.data) + } return FileChunk{ Source: source, - Raw: bytes.NewBuffer(piece.data), + Raw: raw, + Rope: piece, Order: order, } } @@ -133,7 +140,8 @@ func ReadFileChunk( reader io.Reader, fileChunkSize int, splitter LastSeqRecord, - probe string) ChannelFileChunk { + probe string, + pack bool) ChannelFileChunk { chunk_channel := make(ChannelFileChunk) @@ -205,7 +213,7 @@ func ReadFileChunk( if len(pieces.data) > 0 { // obilog.Warnf("chuck %d :Read %d bytes from file %s", i, io.Len(), source) - chunk_channel <- pieces.FileChunk(source, i) + chunk_channel <- pieces.FileChunk(source, i, pack) i++ } @@ -222,7 +230,7 @@ func ReadFileChunk( // Send the last chunk to the channel if pieces.Len() > 0 { - chunk_channel <- pieces.FileChunk(source, i) + chunk_channel <- pieces.FileChunk(source, i, pack) } // Close the readers channel when the end of the file is reached diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go index 37e84db..53a3057 100644 --- a/pkg/obiformats/genbank_read.go +++ b/pkg/obiformats/genbank_read.go @@ -29,6 +29,342 @@ const ( var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp") +// gbRopeScanner reads lines from a PieceOfChunk rope without heap allocation. +// The carry buffer (stack) handles lines that span two rope nodes. +type gbRopeScanner struct { + current *PieceOfChunk + pos int + carry [256]byte // max GenBank line = 80 chars; 256 gives ample margin + carryN int +} + +func newGbRopeScanner(rope *PieceOfChunk) *gbRopeScanner { + return &gbRopeScanner{current: rope} +} + +// ReadLine returns the next line without the trailing \n (or \r\n). +// Returns nil at end of rope. The returned slice aliases carry[] or the node +// data and is valid only until the next ReadLine call. +func (s *gbRopeScanner) ReadLine() []byte { + for { + if s.current == nil { + if s.carryN > 0 { + n := s.carryN + s.carryN = 0 + return s.carry[:n] + } + return nil + } + + data := s.current.data[s.pos:] + idx := bytes.IndexByte(data, '\n') + + if idx >= 0 { + var line []byte + if s.carryN == 0 { + line = data[:idx] + } else { + n := copy(s.carry[s.carryN:], data[:idx]) + s.carryN += n + line = s.carry[:s.carryN] + s.carryN = 0 + } + s.pos += idx + 1 + if s.pos >= len(s.current.data) { + s.current = s.current.Next() + s.pos = 0 + } + if len(line) > 0 && line[len(line)-1] == '\r' { + line = line[:len(line)-1] + } + return line + } + + // No \n in this node: accumulate into carry and advance + n := copy(s.carry[s.carryN:], data) + s.carryN += n + s.current = s.current.Next() + s.pos = 0 + } +} + +// extractSequence scans the ORIGIN section byte-by-byte directly on the rope, +// appending compacted bases to dest. Returns the extended slice. +// Stops and returns when "//" is found at the start of a line. +// The scanner is left positioned after the "//" line. +func (s *gbRopeScanner) extractSequence(dest []byte, UtoT bool) []byte { + lineStart := true + skipDigits := true + + for s.current != nil { + data := s.current.data[s.pos:] + for i, b := range data { + if lineStart { + if b == '/' { + // End-of-record marker "//" + s.pos += i + 1 + if s.pos >= len(s.current.data) { + s.current = s.current.Next() + s.pos = 0 + } + s.skipToNewline() + return dest + } + lineStart = false + skipDigits = true + } + switch { + case b == '\n': + lineStart = true + case b == '\r': + // skip + case skipDigits: + if b != ' ' && (b < '0' || b > '9') { + skipDigits = false + if UtoT && b == 'u' { + b = 't' + } + dest = append(dest, b) + } + case b != ' ': + if UtoT && b == 'u' { + b = 't' + } + dest = append(dest, b) + } + } + s.current = s.current.Next() + s.pos = 0 + } + return dest +} + +// skipToNewline advances the scanner past the next '\n'. +func (s *gbRopeScanner) skipToNewline() { + for s.current != nil { + data := s.current.data[s.pos:] + idx := bytes.IndexByte(data, '\n') + if idx >= 0 { + s.pos += idx + 1 + if s.pos >= len(s.current.data) { + s.current = s.current.Next() + s.pos = 0 + } + return + } + s.current = s.current.Next() + s.pos = 0 + } +} + +// parseLseqFromLocus extracts the declared sequence length from a LOCUS line. +// Format: "LOCUS bp ..." +// Returns -1 if not found or parse error. +func parseLseqFromLocus(line []byte) int { + if len(line) < 13 { + return -1 + } + i := 12 + for i < len(line) && line[i] != ' ' { + i++ + } + for i < len(line) && line[i] == ' ' { + i++ + } + start := i + for i < len(line) && line[i] >= '0' && line[i] <= '9' { + i++ + } + if i == start { + return -1 + } + n, err := strconv.Atoi(string(line[start:i])) + if err != nil { + return -1 + } + return n +} + +// Prefix constants for GenBank section headers (byte slices for zero-alloc comparison). +var ( + gbPfxLocus = []byte("LOCUS ") + gbPfxDefinition = []byte("DEFINITION ") + gbPfxContinue = []byte(" ") + gbPfxSource = []byte("SOURCE ") + gbPfxFeatures = []byte("FEATURES ") + gbPfxOrigin = []byte("ORIGIN") + gbPfxContig = []byte("CONTIG") + gbPfxEnd = []byte("//") + gbPfxDbXref = []byte(` /db_xref="taxon:`) +) + +// GenbankChunkParserRope parses a GenBank FileChunk directly from the rope +// (PieceOfChunk linked list) without calling Pack(). This eliminates the large +// contiguous allocation required for chromosomal-scale sequences. +func GenbankChunkParserRope(source string, rope *PieceOfChunk, + withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) { + + state := inHeader + scanner := newGbRopeScanner(rope) + sequences := obiseq.MakeBioSequenceSlice(100)[:0] + + id := "" + lseq := -1 + scientificName := "" + defBytes := new(bytes.Buffer) + featBytes := new(bytes.Buffer) + var seqDest []byte + taxid := 1 + nl := 0 + + for bline := scanner.ReadLine(); bline != nil; bline = scanner.ReadLine() { + nl++ + processed := false + for !processed { + switch { + + case bytes.HasPrefix(bline, gbPfxLocus): + if state != inHeader { + log.Fatalf("Line %d - Unexpected state %d while reading LOCUS: %s", nl, state, bline) + } + rest := bline[12:] + sp := bytes.IndexByte(rest, ' ') + if sp < 0 { + id = string(rest) + } else { + id = string(rest[:sp]) + } + lseq = parseLseqFromLocus(bline) + cap0 := lseq + 20 + if cap0 < 1024 { + cap0 = 1024 + } + seqDest = make([]byte, 0, cap0) + state = inEntry + processed = true + + case bytes.HasPrefix(bline, gbPfxDefinition): + if state != inEntry { + log.Fatalf("Line %d - Unexpected state %d while reading DEFINITION: %s", nl, state, bline) + } + defBytes.Write(bytes.TrimSpace(bline[12:])) + state = inDefinition + processed = true + + case state == inDefinition: + if bytes.HasPrefix(bline, gbPfxContinue) { + defBytes.WriteByte(' ') + defBytes.Write(bytes.TrimSpace(bline[12:])) + processed = true + } else { + state = inEntry + } + + case bytes.HasPrefix(bline, gbPfxSource): + if state != inEntry { + log.Fatalf("Line %d - Unexpected state %d while reading SOURCE: %s", nl, state, bline) + } + scientificName = string(bytes.TrimSpace(bline[12:])) + processed = true + + case bytes.HasPrefix(bline, gbPfxFeatures): + if state != inEntry { + log.Fatalf("Line %d - Unexpected state %d while reading FEATURES: %s", nl, state, bline) + } + if withFeatureTable { + featBytes.Write(bline) + } + state = inFeature + processed = true + + case bytes.HasPrefix(bline, gbPfxOrigin): + if state != inFeature && state != inContig { + log.Fatalf("Line %d - Unexpected state %d while reading ORIGIN: %s", nl, state, bline) + } + // Use fast byte-scan to extract sequence and consume through "//" + seqDest = scanner.extractSequence(seqDest, UtoT) + // Emit record + if id == "" { + log.Warn("Empty id when parsing genbank file") + } + sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String()) + sequence.SetSource(source) + if withFeatureTable { + sequence.SetFeatures(featBytes.Bytes()) + } + annot := sequence.Annotations() + annot["scientific_name"] = scientificName + annot["taxid"] = taxid + sequences = append(sequences, sequence) + + defBytes = bytes.NewBuffer(obiseq.GetSlice(200)) + featBytes = new(bytes.Buffer) + nl = 0 + taxid = 1 + seqDest = nil + state = inHeader + processed = true + + case bytes.HasPrefix(bline, gbPfxContig): + if state != inFeature && state != inContig { + log.Fatalf("Line %d - Unexpected state %d while reading CONTIG: %s", nl, state, bline) + } + state = inContig + processed = true + + case bytes.Equal(bline, gbPfxEnd): + // Reached for CONTIG records (no ORIGIN section) + if state != inContig { + log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id) + } + if id == "" { + log.Warn("Empty id when parsing genbank file") + } + sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String()) + sequence.SetSource(source) + if withFeatureTable { + sequence.SetFeatures(featBytes.Bytes()) + } + annot := sequence.Annotations() + annot["scientific_name"] = scientificName + annot["taxid"] = taxid + sequences = append(sequences, sequence) + + defBytes = bytes.NewBuffer(obiseq.GetSlice(200)) + featBytes = new(bytes.Buffer) + nl = 0 + taxid = 1 + seqDest = nil + state = inHeader + processed = true + + default: + switch state { + case inFeature: + if withFeatureTable { + featBytes.WriteByte('\n') + featBytes.Write(bline) + } + if bytes.HasPrefix(bline, gbPfxDbXref) { + rest := bline[len(gbPfxDbXref):] + q := bytes.IndexByte(rest, '"') + if q >= 0 { + taxid, _ = strconv.Atoi(string(rest[:q])) + } + } + processed = true + case inHeader, inEntry, inContig: + processed = true + default: + log.Fatalf("Unexpected state %d while reading: %s", state, bline) + } + } + } + } + + return sequences, nil +} + func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) { return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) { state := inHeader @@ -125,13 +461,10 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob if state != inSequence && state != inContig { log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id) } - // log.Debugln("Total lines := ", nl) if id == "" { log.Warn("Empty id when parsing genbank file") } - // log.Debugf("End of sequence %s: %dbp ", id, seqBytes.Len()) - sequence := obiseq.NewBioSequence(id, seqBytes.Bytes(), defBytes.String()) @@ -144,9 +477,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob annot := sequence.Annotations() annot["scientific_name"] = scientificName annot["taxid"] = taxid - // log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader)) - // log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(), - // sequence.Len(), seqBytes.Len()) sequences = append(sequences, sequence) @@ -159,8 +489,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob processed = true case state == inSequence: - // log.Debugf("Chunk %d : Genbank: line %d, state = %d : %s", chunks.order, nl, state, line) - sl++ cleanline := strings.TrimSpace(line) parts := strings.SplitN(cleanline, " ", 7) @@ -198,6 +526,7 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob } + _ = sl return sequences, nil } } @@ -206,10 +535,16 @@ func _ParseGenbankFile(input ChannelFileChunk, out obiiter.IBioSequence, withFeatureTable, UtoT bool) { - parser := GenbankChunkParser(withFeatureTable, UtoT) - for chunks := range input { - sequences, err := parser(chunks.Source, chunks.Raw) + var sequences obiseq.BioSequenceSlice + var err error + + if chunks.Rope != nil { + sequences, err = GenbankChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT) + } else { + parser := GenbankChunkParser(withFeatureTable, UtoT) + sequences, err = parser(chunks.Source, chunks.Raw) + } if err != nil { log.Fatalf("File %s : Cannot parse the genbank file : %v", chunks.Source, err) @@ -225,7 +560,6 @@ func _ParseGenbankFile(input ChannelFileChunk, func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) { opt := MakeOptions(options) - // entry_channel := make(chan _FileChunk) entry_channel := ReadFileChunk( opt.Source(), @@ -233,13 +567,13 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, 1024*1024*128, EndOfLastFlatFileEntry, "\nLOCUS ", + false, // do not pack: rope-based parser avoids contiguous allocation ) newIter := obiiter.MakeIBioSequence() nworkers := opt.ParallelWorkers() - // for j := 0; j < opt.ParallelWorkers(); j++ { for j := 0; j < nworkers; j++ { newIter.Add(1) go _ParseGenbankFile( @@ -250,8 +584,6 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, ) } - // go _ReadFlatFileChunk(reader, entry_channel) - go func() { newIter.WaitAndClose() log.Debug("End of the genbank file ", opt.Source()) diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 162abb3..097c578 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -3,7 +3,7 @@ package obioptions // Version is automatically updated by the Makefile from version.txt // The patch number (third digit) is incremented on each push to the repository -var _Version = "Release 4.4.18" +var _Version = "Release 4.4.19" // Version returns the version of the obitools package. // diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index 628f52c..f3939d8 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -120,6 +120,19 @@ func NewBioSequence(id string, return bs } +// NewBioSequenceOwning creates a BioSequence taking ownership of the sequence +// slice without copying it. The caller must not use the slice after this call. +// Use this when the slice was allocated specifically for this sequence. +func NewBioSequenceOwning(id string, + sequence []byte, + definition string) *BioSequence { + bs := NewEmptyBioSequence(0) + bs.SetId(id) + bs.TakeSequence(sequence) + bs.SetDefinition(definition) + return bs +} + // NewBioSequenceWithQualities creates a new BioSequence object with the given id, sequence, definition, and qualities. // // Parameters: @@ -444,6 +457,12 @@ func (s *BioSequence) SetSequence(sequence []byte) { s.sequence = obiutils.InPlaceToLower(CopySlice(sequence)) } +// TakeSequence stores the slice directly without copying, then lowercases in-place. +// The caller must not use the slice after this call. +func (s *BioSequence) TakeSequence(sequence []byte) { + s.sequence = obiutils.InPlaceToLower(sequence) +} + func (s *BioSequence) HasValidSequence() bool { for _, c := range s.sequence { if !((c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '[' || c == ']') { diff --git a/release_notes.sh b/release_notes.sh index 32de401..10c05e0 100755 --- a/release_notes.sh +++ b/release_notes.sh @@ -21,6 +21,74 @@ LLM_MODEL="ollama:qwen3-coder-next:latest" die() { echo "Error: $*" >&2; exit 1; } +next_patch() { + local v="$1" + local major minor patch + major=$(echo "$v" | cut -d. -f1) + minor=$(echo "$v" | cut -d. -f2) + patch=$(echo "$v" | cut -d. -f3) + echo "${major}.${minor}.$(( patch + 1 ))" +} + +# Strip "pre-" prefix to get the bare version number for installation section +bare_version() { + echo "$1" | sed 's/^pre-//' +} + +installation_section() { + local v + v=$(bare_version "$1") + cat <&2 -fi + # ── Pre-release mode: local HEAD vs latest GitHub tag ────────────────── + PRE_RELEASE=true + previous_tag="Release_${latest_version}" + VERSION="pre-$(next_patch "$latest_version")" -tag_name="Release_${VERSION}" + echo "Pre-release mode: $previous_tag -> HEAD (as $VERSION)" >&2 -# Verify the requested version exists -if ! echo "$all_versions" | grep -qx "$VERSION"; then - die "Version $VERSION not found. Use -l to list available versions." -fi + # Need to be in a git repo + if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then + die "Not inside a git repository. Pre-release mode requires a local git repo." + fi -# Find the previous version (the one right after in the sorted-descending list) -previous_version=$(echo "$all_versions" | grep -A1 -x "$VERSION" | tail -1) + # Check that the previous tag exists locally + if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then + echo "Tag $previous_tag not found locally, fetching..." >&2 + git fetch --tags 2>/dev/null || true + if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then + die "Tag $previous_tag not found locally or remotely" + fi + fi -if [ "$previous_version" = "$VERSION" ] || [ -z "$previous_version" ]; then - previous_tag="" - echo "No previous version found -- will include all commits for $tag_name" >&2 -else - previous_tag="Release_${previous_version}" - echo "Generating notes: $previous_tag -> $tag_name" >&2 -fi + # Get local commits from the tag to HEAD (full messages) + commit_list=$(git log --format="%h %B" "${previous_tag}..HEAD" 2>/dev/null) -# ── Fetch commit messages between tags via GitHub compare API ──────────── + if [ -z "$commit_list" ]; then + die "No local commits found since $previous_tag" + fi + else + # ── Published release mode: between two GitHub tags ──────────────────── + PRE_RELEASE=false + tag_name="Release_${VERSION}" -if [ -n "$previous_tag" ]; then - commits_json=$(curl -sf "${GITHUB_API}/compare/${previous_tag}...${tag_name}") - if [ -z "$commits_json" ]; then - die "Could not fetch commit comparison from GitHub" - fi - commit_list=$(echo "$commits_json" \ - | jq -r '.commits[] | (.sha[:8] + " " + (.commit.message | split("\n")[0]))' 2>/dev/null) -else - # First release: get commits up to this tag - commits_json=$(curl -sf "${GITHUB_API}/commits?sha=${tag_name}&per_page=50") - if [ -z "$commits_json" ]; then - die "Could not fetch commits from GitHub" - fi - commit_list=$(echo "$commits_json" \ - | jq -r '.[] | (.sha[:8] + " " + (.commit.message | split("\n")[0]))' 2>/dev/null) -fi + # Verify the requested version exists + if ! echo "$all_versions" | grep -qx "$VERSION"; then + die "Version $VERSION not found. Use -l to list available versions." + fi -if [ -z "$commit_list" ]; then - die "No commits found between $previous_tag and $tag_name" + # Find the previous version + previous_version=$(echo "$all_versions" | grep -A1 -x "$VERSION" | tail -1) + + if [ "$previous_version" = "$VERSION" ] || [ -z "$previous_version" ]; then + previous_tag="" + echo "No previous version found -- will include all commits for $tag_name" >&2 + else + previous_tag="Release_${previous_version}" + echo "Generating notes: $previous_tag -> $tag_name" >&2 + fi + + # Fetch commit messages between tags via GitHub compare API + if [ -n "$previous_tag" ]; then + commits_json=$(curl -sf "${GITHUB_API}/compare/${previous_tag}...${tag_name}") + if [ -z "$commits_json" ]; then + die "Could not fetch commit comparison from GitHub" + fi + commit_list=$(echo "$commits_json" \ + | jq -r '.commits[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null) + else + commits_json=$(curl -sf "${GITHUB_API}/commits?sha=${tag_name}&per_page=50") + if [ -z "$commits_json" ]; then + die "Could not fetch commits from GitHub" + fi + commit_list=$(echo "$commits_json" \ + | jq -r '.[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null) + fi + + if [ -z "$commit_list" ]; then + die "No commits found between $previous_tag and $tag_name" + fi fi # ── LLM prompt (shared by context mode and summarization) ──────────────── @@ -144,6 +237,7 @@ if [ "$RAW_MODE" = true ]; then echo "$commit_list" | while IFS= read -r line; do echo "- ${line}" done + installation_section "$VERSION" exit 0 fi @@ -193,6 +287,7 @@ if [ -n "$release_title" ] && [ -n "$release_body" ]; then echo "# ${release_title}" echo "" echo "$release_body" + installation_section "$VERSION" else echo "Warning: JSON parsing failed, falling back to raw mode" >&2 exec "$0" -r -v "$VERSION" diff --git a/version.txt b/version.txt index 60defb1..0c6fdde 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -4.4.18 +4.4.19