Merge pull request #90 from metabarcoding/push-uzpqqoqvpnxw

Push uzpqqoqvpnxw
This commit is contained in:
coissac
2026-03-10 15:53:08 +01:00
committed by GitHub
13 changed files with 846 additions and 104 deletions

1
.gitignore vendored
View File

@@ -16,6 +16,7 @@
**/*.tgz
**/*.yaml
**/*.csv
**/*.pb.gz
xx
.rhistory

View File

@@ -128,40 +128,58 @@ jjnew:
@echo "$(GREEN)✓ New commit created$(NC)"
jjpush:
@echo "$(YELLOW)→ Pushing commit to repository...$(NC)"
@$(MAKE) jjpush-describe
@$(MAKE) jjpush-bump
@$(MAKE) jjpush-push
@$(MAKE) jjpush-tag
@echo "$(GREEN)✓ Release complete$(NC)"
jjpush-describe:
@echo "$(BLUE)→ Documenting current commit...$(NC)"
@jj auto-describe
jjpush-bump:
@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
@jj new
@previous_version=$$(cat version.txt); \
$(MAKE) bump-version; \
version=$$(cat version.txt); \
@$(MAKE) bump-version
@echo "$(BLUE)→ Documenting version bump commit...$(NC)"
@jj auto-describe
jjpush-push:
@echo "$(BLUE)→ Pushing commits...$(NC)"
@jj git push --change @
jjpush-tag:
@version=$$(cat version.txt); \
tag_name="Release_$$version"; \
previous_tag="Release_$$previous_version"; \
echo "$(BLUE)→ Documenting version bump commit...$(NC)"; \
jj auto-describe; \
echo "$(BLUE)→ Generating release notes from $$previous_tag to current commit...$(NC)"; \
echo "$(BLUE)→ Generating release notes for $$tag_name...$(NC)"; \
release_message="Release $$version"; \
if command -v orla >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then \
release_json=$$(ORLA_MAX_TOOL_CALLS=50 jj log -r "$$previous_tag::@" -T 'commit_id.short() ++ " " ++ description' | \
orla agent -m ollama:qwen3-coder-next:latest \
"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}"); \
release_json=$$(echo "$$release_json" | sed -n '/^{/,/^}/p'); \
release_title=$$(echo "$$release_json" | jq -r '.title // empty') ; \
release_body=$$(echo "$$release_json" | jq -r '.body // empty') ; \
previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' HEAD^ 2>/dev/null); \
if [ -z "$$previous_tag" ]; then \
echo "$(YELLOW)⚠ No previous Release tag found, skipping release notes$(NC)"; \
else \
raw_output=$$(git log --format="%h %B" "$$previous_tag..HEAD" | \
ORLA_MAX_TOOL_CALLS=50 orla agent -m ollama:qwen3-coder-next:latest \
"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}" 2>/dev/null) || true; \
if [ -n "$$raw_output" ]; then \
sanitized=$$(echo "$$raw_output" | sed -n '/^{/,/^}/p' | tr -d '\000-\011\013-\014\016-\037'); \
release_title=$$(echo "$$sanitized" | jq -r '.title // empty' 2>/dev/null) ; \
release_body=$$(echo "$$sanitized" | jq -r '.body // empty' 2>/dev/null) ; \
if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \
release_message="$$release_title"$$'\n\n'"$$release_body"; \
else \
echo "$(YELLOW)⚠ JSON parsing failed, falling back to raw output$(NC)"; \
release_message="Release $$version"$$'\n\n'"$$release_json"; \
echo "$(YELLOW)⚠ JSON parsing failed, using default release message$(NC)"; \
fi; \
else \
release_message="Release $$version"; \
fi; \
echo "$(BLUE)→ Pushing commits and creating tag $$tag_name...$(NC)"; \
jj git push --change @; \
git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "Tag $$tag_name already exists"; \
git push origin "$$tag_name" 2>/dev/null || echo "Tag already pushed"
@echo "$(GREEN) Commits and tag pushed to repository$(NC)"
fi; \
fi; \
install_section=$$'\n## Installation\n\n### Pre-built binaries\n\nDownload the appropriate archive for your system from the\n[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_'"$$version"')\nand extract it:\n\n#### Linux (AMD64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_amd64.tar.gz\n```\n\n#### Linux (ARM64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_arm64.tar.gz\n```\n\n#### macOS (Intel)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_amd64.tar.gz\n```\n\n#### macOS (Apple Silicon)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_arm64.tar.gz\n```\n\nAll OBITools4 binaries are included in each archive.\n\n### From source\n\nYou can also compile and install OBITools4 directly from source using the\ninstallation script:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version '"$$version"'\n```\n\nBy default binaries are installed in `/usr/local/bin`. Use `--install-dir` to\nchange the destination and `--obitools-prefix` to add a prefix to command names:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\\n bash -s -- --version '"$$version"' --install-dir ~/local --obitools-prefix k\n```\n'; \
release_message="$$release_message$$install_section"; \
echo "$(BLUE) Creating tag $$tag_name...$(NC)"; \
git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "$(YELLOW)⚠ Tag $$tag_name already exists$(NC)"; \
echo "$(BLUE)→ Pushing tag $$tag_name...$(NC)"; \
git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)"
jjfetch:
@echo "$(YELLOW)→ Pulling latest commits...$(NC)"
@@ -169,5 +187,5 @@ jjfetch:
@jj new master@origin
@echo "$(GREEN)✓ Latest commits pulled$(NC)"
.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch bump-version .FORCE
.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjpush-describe jjpush-bump jjpush-push jjpush-tag jjfetch bump-version .FORCE
.FORCE:

View File

@@ -0,0 +1,264 @@
# Optimisation du parsing des grandes séquences
## Contexte
OBITools4 doit pouvoir traiter des séquences de taille chromosomique (plusieurs Gbp), notamment
issues de fichiers GenBank/EMBL (assemblages de génomes) ou de fichiers FASTA convertis depuis
ces formats.
## Architecture actuelle
### Pipeline de lecture (`pkg/obiformats/`)
```
ReadFileChunk (goroutine)
→ ChannelFileChunk
→ N × _ParseGenbankFile / _ParseFastaFile (goroutines)
→ IBioSequence
```
`ReadFileChunk` (`file_chunk_read.go`) lit le fichier par morceaux via une chaîne de
`PieceOfChunk` (rope). Chaque nœud fait `fileChunkSize` bytes :
- GenBank/EMBL : 128 MB (`1024*1024*128`)
- FASTA/FASTQ : 1 MB (`1024*1024`)
La chaîne est accumulée jusqu'à trouver la fin du dernier enregistrement complet (splitter),
puis `Pack()` est appelé pour fusionner tous les nœuds en un seul buffer contigu. Ce buffer
est transmis au parseur via `FileChunk.Raw *bytes.Buffer`.
### Parseur GenBank (`genbank_read.go`)
`GenbankChunkParser` reçoit un `io.Reader` sur le buffer packé, lit ligne par ligne via
`bufio.NewReader` (buffer 4096 bytes), et pour chaque ligne de la section `ORIGIN` :
```go
line = string(bline) // allocation par ligne
cleanline := strings.TrimSpace(line) // allocation
parts := strings.SplitN(cleanline, " ", 7) // allocation []string + substrings
for i := 1; i < lparts; i++ {
seqBytes.WriteString(parts[i])
}
```
Point positif : `seqBytes` est pré-alloué grâce à `lseq` extrait de la ligne `LOCUS`.
### Parseur FASTA (`fastaseq_read.go`)
`FastaChunkParser` lit **octet par octet** via `scanner.ReadByte()`. Pour 3 Gbp :
3 milliards d'appels. `seqBytes` est un `bytes.Buffer{}` sans pré-allocation.
## Problème principal
Pour une séquence de plusieurs Gbp, `Pack()` fusionne une chaîne de ~N nœuds de 128 MB en
un seul buffer contigu. C'est une allocation de N × 128 MB suivie d'une copie de toutes les
données. Bien que l'implémentation de `Pack()` soit efficace (libère les nœuds au fur et à
mesure via `slices.Grow`), la copie est inévitable avec l'architecture actuelle.
De plus, le parseur GenBank produit des dizaines de millions d'allocations temporaires pour
parser la section `ORIGIN` (une par ligne).
## Invariant clé découvert
**Si la rope a plus d'un nœud, le premier nœud seul ne se termine pas sur une frontière
d'enregistrement** (pas de `//\n` en fin de `piece1`).
Preuve par construction dans `ReadFileChunk` :
- `splitter` est appelé dès le premier nœud (ligne 157)
- Si `end >= 0` → frontière trouvée dans 128 MB → boucle interne sautée → rope à 1 nœud
- Si `end < 0` → boucle interne ajoute des nœuds → rope à ≥ 2 nœuds
Corollaire : si rope à 1 nœud, `Pack()` ne fait rien (aucun nœud suivant).
**Attention** : rope à ≥ 2 nœuds ne signifie pas qu'il n'y a qu'une seule séquence dans
la rope. La rope packée peut contenir plusieurs enregistrements complets. Exemple : records
de 80 MB → `nextpieces` (48 MB de reste) + nouveau nœud (128 MB) = rope à 2 nœuds
contenant 2 records complets + début d'un troisième.
L'invariant dit seulement que `piece1` seul est incomplet — pas que la rope entière
ne contient qu'un seul record.
**Invariant : le dernier FileChunk envoyé finit sur une frontière d'enregistrement.**
Deux chemins dans `ReadFileChunk` :
1. **Chemin normal** (`end >= 0` via `splitter`) : le buffer est explicitement tronqué à
`end` (ligne 200 : `pieces.data = pieces.data[:end]`). Frontière garantie par construction
pour tous les formats. ✓
2. **Chemin EOF** (`end < 0`, `end = pieces.Len()`) : tout le reste du fichier est envoyé.
- **GenBank/EMBL** : présuppose fichier bien formé (se termine par `//\n`). Le parseur
lève un `log.Fatalf` sur tout état inattendu — filet de sécurité suffisant. ✓
- **FASTQ** : présupposé, vérifié par le parseur. ✓
- **FASTA** : garanti par le format lui-même (fin d'enregistrement = EOF ou `>`). ✓
**Hypothèse de travail adoptée** : les fichiers d'entrée sont bien formés. Dans le pire cas,
le parseur lèvera une erreur explicite. Il n'y a pas de risque de corruption silencieuse.
## Piste d'optimisation : se dispenser de Pack()
### Idée centrale
Au lieu de fusionner la rope avant de la passer au parseur, **parser directement la rope
nœud par nœud**, et **écrire la séquence compactée in-place dans le premier nœud**.
Pourquoi c'est sûr :
- Le header (LOCUS, DEFINITION, SOURCE, FEATURES) est **petit** et traité en premier
- La séquence (ORIGIN) est **à la fin** du record
- Au moment d'écrire la séquence depuis l'offset 0 de `piece1`, le pointeur de lecture
est profond dans la rope (offset >> 0) → jamais de collision
- La séquence compactée est toujours plus courte que les données brutes
### Pré-allocation
Pour GenBank/EMBL : `lseq` est connu dès la ligne `LOCUS`/`ID` (première ligne, dans
`piece1`). On peut faire `slices.Grow(piece1.data, lseq)` dès ce moment.
Pour FASTA : pas de taille garantie dans le header, mais `rope.Len()` donne un majorant.
On peut utiliser `rope.Len() / 2` comme estimation initiale.
### Gestion des jonctions entre nœuds
Une ligne peut chevaucher deux nœuds (rare avec 128 MB, mais possible). Solution : carry
buffer de ~128 bytes pour les quelques bytes en fin de nœud.
### Cas FASTA/FASTQ multi-séquences
Un FileChunk peut contenir N séquences (notamment FASTA/FASTQ courts). Dans ce cas
l'écriture in-place dans `piece1` n'est pas applicable directement — on écrase des données
nécessaires aux séquences suivantes.
Stratégie par cas :
- **Rope à 1 nœud** (record ≤ 128 MB) : `Pack()` est trivial (no-op), parseur actuel OK
- **Rope à ≥ 2 nœuds** : par l'invariant, `piece1` ne contient pas de record complet →
une seule grande séquence → in-place applicable
### Format d'une ligne séquence GenBank (Après ORIGIN)
```
/^ *[0-9]+( [nuc]{10}){0,5} [nuc]{1,10}/
```
### Format d'une ligne séquence EMBL (Après SQ)
La ligne SQ contient aussi la taille de la séquence
```
/^ *( [nuc]{10}){0,5} [nuc]{1,10} *[0-9]+/
```
Compactage in-place sur `bline` ([]byte brut, sans conversion `string`) :
```go
w := 0
i := 0
for i < len(bline) && bline[i] == ' ' { i++ } // skip indentation
for i < len(bline) && bline[i] <= '9' { i++ } // skip position number
for ; i < len(bline); i++ {
if bline[i] != ' ' {
bline[w] = bline[i]
w++
}
}
// écrire bline[:w] directement dans piece1.data[seqOffset:]
```
## Changements nécessaires
1. **`FileChunk`** : exposer la rope `*PieceOfChunk` non-packée en plus (ou à la place)
de `Raw *bytes.Buffer`
2. **`GenbankChunkParser` / `EmblChunkParser`** : accepter `*PieceOfChunk`, parser la
rope séquentiellement avec carry buffer pour les jonctions
3. **`FastaChunkParser`** : idem, avec in-place conditionnel selon taille de la rope
4. **`ReadFileChunk`** : ne pas appeler `Pack()` avant envoi sur le channel (ou version
alternative `ReadFileChunkRope`)
## Fichiers concernés
- `pkg/obiformats/file_chunk_read.go` — structure rope, `ReadFileChunk`
- `pkg/obiformats/genbank_read.go` — `GenbankChunkParser`, `_ParseGenbankFile`
- `pkg/obiformats/embl_read.go` — `EmblChunkParser`, `ReadEMBL`
- `pkg/obiformats/fastaseq_read.go` — `FastaChunkParser`, `_ParseFastaFile`
- `pkg/obiformats/fastqseq_read.go` — parseur FASTQ (même structure)
## Plan d'implémentation : parseur GenBank sur rope
### Contexte
Baseline mesurée : `obiconvert gbpln640.seq.gz` → 49s real, 42s user, 29s sys, **57 GB RSS**.
Le sys élevé indique des allocations massives. Deux causes :
1. `Pack()` : fusionne toute la rope (N × 128 MB) en un buffer contigu avant de parser
2. Parser ORIGIN : `string(bline)` + `TrimSpace` + `SplitN` × millions de lignes
### 1. `gbRopeScanner`
Struct de lecture ligne par ligne sur la rope, sans allocation heap :
```go
type gbRopeScanner struct {
current *PieceOfChunk
pos int
carry [256]byte // stack-allocated, max GenBank line = 80 chars
carryN int
}
```
`ReadLine()` :
- Cherche `\n` dans `current.data[pos:]` via `bytes.IndexByte`
- Si trouvé sans carry : retourne slice direct du node (zéro alloc)
- Si trouvé avec carry : copie dans carry buffer, retourne `carry[:n]`
- Si non trouvé : copie le reste dans carry, avance au node suivant, recommence
- EOF : retourne `carry[:carryN]` puis nil
`extractSequence(dest []byte, UtoT bool) int` :
- Scan direct des bytes pour section ORIGIN, sans passer par ReadLine
- Machine d'états : lineStart → skip espaces/digits → copier nucléotides dans dest
- Stop sur `//` en début de ligne
- Zéro allocation, UtoT inline
### 2. `GenbankChunkParserRope`
```go
func GenbankChunkParserRope(source string, rope *PieceOfChunk,
withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error)
```
- Même machine d'états que `GenbankChunkParser`, sur `[]byte` (`bytes.HasPrefix`)
- LOCUS : extrait `id` et `lseq` par scan direct (remplace `_seqlenght_rx`)
- FEATURES / default inFeature : taxid extrait par scan de `/db_xref="taxon:`
dans la source feature ; `featBytes` rempli seulement si `withFeatureTable=true`
- DEFINITION : toujours conservée
- ORIGIN : `dest = make([]byte, 0, lseq+20)` puis `s.extractSequence(dest, UtoT)`
### 3. Modifications `_ParseGenbankFile` et `ReadGenbank`
`_ParseGenbankFile` utilise `chunk.Rope` :
```go
sequences, err := GenbankChunkParserRope(chunk.Source, chunk.Rope, ...)
```
`ReadGenbank` passe `pack=false` :
```go
entry_channel := ReadFileChunk(..., false)
```
### 4. Ce qui NE change pas
- `GenbankChunkParser` reste (référence, tests)
- `ReadFileChunk`, `Pack()`, autres parseurs (EMBL, FASTA, FASTQ) : inchangés
### 5. Gains attendus
- **RSS** : pic ≈ 128 MB × workers (au lieu de N × 128 MB)
- **Temps sys** : élimination des mmap/munmap pour les gros buffers
- **Temps user** : ~50M allocations éliminées
### 6. Vérification
```bash
/usr/local/go/bin/go build ./...
diff <(obiconvert gbpln640.seq.gz) gbpln640.reference.fasta
cd bugs/genbank && ./benchmark.sh gbpln640.seq.gz
```
Cible : RSS < 1 GB, temps comparable ou meilleur.

View File

@@ -196,6 +196,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
1024*1024*128,
EndOfLastFlatFileEntry,
"\nID ",
true,
)
newIter := obiiter.MakeIBioSequence()

View File

@@ -245,6 +245,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
1024*1024,
EndOfLastFastaEntry,
"\n>",
true,
)
for i := 0; i < nworker; i++ {

View File

@@ -339,6 +339,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
1024*1024,
EndOfLastFastqEntry,
"\n@",
true,
)
for i := 0; i < nworker; i++ {

View File

@@ -77,45 +77,47 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string {
//
// It returns a byte array containing the formatted sequences.
func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
// Create a buffer to store the formatted sequences
var bs bytes.Buffer
lt := 0
for _, seq := range batch.Slice() {
lt += seq.Len()
}
// Iterate over each sequence in the batch
// Pre-allocate: sequence data + newlines every 60 chars + ~100 bytes header per sequence
bs.Grow(lt + lt/60 + 100*batch.Len() + 1)
log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
first := true
for _, seq := range batch.Slice() {
// Check if the sequence is empty
if seq.Len() > 0 {
// Format the sequence using the provided formater function
formattedSeq := FormatFasta(seq, formater)
if first {
bs.Grow(lt + (len(formattedSeq)-seq.Len())*batch.Len()*5/4)
first = false
}
// Append the formatted sequence to the buffer
bs.WriteString(formattedSeq)
// Write header directly into bs — no intermediate string
bs.WriteByte('>')
bs.WriteString(seq.Id())
bs.WriteByte(' ')
bs.WriteString(formater(seq))
bs.WriteByte('\n')
// Write folded sequence directly into bs — no copies
s := seq.Sequence()
l := len(s)
for i := 0; i < l; i += 60 {
to := i + 60
if to > l {
to = l
}
bs.Write(s[i:to])
bs.WriteByte('\n')
}
} else {
// Handle empty sequences
if skipEmpty {
// Skip empty sequences if skipEmpty is true
obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id())
} else {
// Terminate the program if skipEmpty is false
log.Fatalf("Sequence %s is empty", seq.Id())
}
}
}
// Return the byte array representation of the buffer
return &bs
}

View File

@@ -16,6 +16,7 @@ type SeqFileChunkParser func(string, io.Reader) (obiseq.BioSequenceSlice, error)
type FileChunk struct {
Source string
Raw *bytes.Buffer
Rope *PieceOfChunk
Order int
}
@@ -97,11 +98,17 @@ func (piece *PieceOfChunk) IsLast() bool {
return piece.next == nil
}
func (piece *PieceOfChunk) FileChunk(source string, order int) FileChunk {
func (piece *PieceOfChunk) FileChunk(source string, order int, pack bool) FileChunk {
piece = piece.Head()
var raw *bytes.Buffer
if pack {
piece.Pack()
raw = bytes.NewBuffer(piece.data)
}
return FileChunk{
Source: source,
Raw: bytes.NewBuffer(piece.data),
Raw: raw,
Rope: piece,
Order: order,
}
}
@@ -133,7 +140,8 @@ func ReadFileChunk(
reader io.Reader,
fileChunkSize int,
splitter LastSeqRecord,
probe string) ChannelFileChunk {
probe string,
pack bool) ChannelFileChunk {
chunk_channel := make(ChannelFileChunk)
@@ -205,7 +213,7 @@ func ReadFileChunk(
if len(pieces.data) > 0 {
// obilog.Warnf("chuck %d :Read %d bytes from file %s", i, io.Len(), source)
chunk_channel <- pieces.FileChunk(source, i)
chunk_channel <- pieces.FileChunk(source, i, pack)
i++
}
@@ -222,7 +230,7 @@ func ReadFileChunk(
// Send the last chunk to the channel
if pieces.Len() > 0 {
chunk_channel <- pieces.FileChunk(source, i)
chunk_channel <- pieces.FileChunk(source, i, pack)
}
// Close the readers channel when the end of the file is reached

View File

@@ -29,6 +29,342 @@ const (
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
// gbRopeScanner reads lines from a PieceOfChunk rope without heap allocation.
// The carry buffer (stack) handles lines that span two rope nodes.
type gbRopeScanner struct {
	current *PieceOfChunk // node currently being read; nil once the rope is exhausted
	pos     int           // read offset inside current.data
	carry   [256]byte     // max GenBank line = 80 chars; 256 gives ample margin
	carryN  int           // number of valid bytes currently buffered in carry
}

// newGbRopeScanner returns a scanner positioned at the first byte of the rope.
func newGbRopeScanner(rope *PieceOfChunk) *gbRopeScanner {
	return &gbRopeScanner{current: rope}
}
// ReadLine returns the next line without the trailing \n (or \r\n).
// Returns nil at end of rope. The returned slice aliases carry[] or the node
// data and is valid only until the next ReadLine call.
//
// NOTE(review): copy() into the fixed 256-byte carry buffer silently
// truncates any line longer than the remaining carry capacity. Well-formed
// GenBank lines are <= 80 bytes so this should never trigger — confirm that
// only the flat-file parsers route data through this scanner.
func (s *gbRopeScanner) ReadLine() []byte {
	for {
		// End of rope: flush a pending partial line, then signal EOF with nil.
		if s.current == nil {
			if s.carryN > 0 {
				n := s.carryN
				s.carryN = 0
				return s.carry[:n]
			}
			return nil
		}
		data := s.current.data[s.pos:]
		idx := bytes.IndexByte(data, '\n')
		if idx >= 0 {
			var line []byte
			if s.carryN == 0 {
				// Whole line lies inside the current node: zero-copy slice.
				line = data[:idx]
			} else {
				// Line started in a previous node: finish it in the carry buffer.
				n := copy(s.carry[s.carryN:], data[:idx])
				s.carryN += n
				line = s.carry[:s.carryN]
				s.carryN = 0
			}
			s.pos += idx + 1
			// Step to the next node once the current one is fully consumed.
			if s.pos >= len(s.current.data) {
				s.current = s.current.Next()
				s.pos = 0
			}
			// Strip a trailing '\r' so CRLF input behaves like LF.
			if len(line) > 0 && line[len(line)-1] == '\r' {
				line = line[:len(line)-1]
			}
			return line
		}
		// No \n in this node: accumulate into carry and advance
		n := copy(s.carry[s.carryN:], data)
		s.carryN += n
		s.current = s.current.Next()
		s.pos = 0
	}
}
// extractSequence scans the ORIGIN section byte-by-byte directly on the rope,
// appending compacted bases to dest. Returns the extended slice.
// Stops and returns when "//" is found at the start of a line.
// The scanner is left positioned after the "//" line.
//
// Sequence lines have the shape "  <position> acgtacgtac ..."; the leading
// blanks and the position number are dropped (skipDigits phase), every other
// non-blank byte is treated as a base.
//
// NOTE(review): UtoT only rewrites lowercase 'u'; an uppercase 'U' passes
// through unchanged here and is lowercased to 'u' later — confirm ORIGIN
// sections are always lowercase in practice.
func (s *gbRopeScanner) extractSequence(dest []byte, UtoT bool) []byte {
	lineStart := true
	skipDigits := true
	for s.current != nil {
		data := s.current.data[s.pos:]
		for i, b := range data {
			if lineStart {
				if b == '/' {
					// End-of-record marker "//": consume the remainder of
					// that line and hand control back to the caller.
					s.pos += i + 1
					if s.pos >= len(s.current.data) {
						s.current = s.current.Next()
						s.pos = 0
					}
					s.skipToNewline()
					return dest
				}
				lineStart = false
				skipDigits = true
			}
			switch {
			case b == '\n':
				lineStart = true
			case b == '\r':
				// skip (tolerate CRLF input)
			case skipDigits:
				// Still inside the "  <position>" prefix: ignore blanks and
				// digits; the first other byte is the first base of the line.
				if b != ' ' && (b < '0' || b > '9') {
					skipDigits = false
					if UtoT && b == 'u' {
						b = 't'
					}
					dest = append(dest, b)
				}
			case b != ' ':
				if UtoT && b == 'u' {
					b = 't'
				}
				dest = append(dest, b)
			}
		}
		// Node exhausted mid-line: lineStart/skipDigits state carries over
		// to the next node, so split lines are handled transparently.
		s.current = s.current.Next()
		s.pos = 0
	}
	return dest
}
// skipToNewline advances the scanner just past the next '\n', crossing node
// boundaries as needed. If no newline remains, the scanner ends up at the
// end of the rope (current == nil).
func (s *gbRopeScanner) skipToNewline() {
	for s.current != nil {
		nl := bytes.IndexByte(s.current.data[s.pos:], '\n')
		if nl < 0 {
			// No newline in this node: continue with the next one.
			s.current = s.current.Next()
			s.pos = 0
			continue
		}
		s.pos += nl + 1
		// Hop to the next node when the current one is fully consumed.
		if s.pos >= len(s.current.data) {
			s.current = s.current.Next()
			s.pos = 0
		}
		return
	}
}
// parseLseqFromLocus extracts the declared sequence length from a LOCUS line.
// Layout: "LOCUS       <id>  <length> bp ..." — the id field starts at
// column 12; the length is the first digit run after it.
// Returns -1 if the line is too short or no length can be parsed.
func parseLseqFromLocus(line []byte) int {
	if len(line) < 13 {
		return -1
	}
	rest := line[12:]
	// Step over the identifier token (everything up to the first blank).
	sp := 0
	for sp < len(rest) && rest[sp] != ' ' {
		sp++
	}
	// Step over the run of blanks separating the id from the length.
	for sp < len(rest) && rest[sp] == ' ' {
		sp++
	}
	// Collect the digit run that forms the declared length.
	end := sp
	for end < len(rest) && '0' <= rest[end] && rest[end] <= '9' {
		end++
	}
	if end == sp {
		return -1
	}
	n, err := strconv.Atoi(string(rest[sp:end]))
	if err != nil {
		return -1
	}
	return n
}
// Prefix constants for GenBank section headers (byte slices for zero-alloc comparison).
// NOTE(review): the keyword prefixes pad the keyword out to the fixed field
// width of the flat-file format; gbPfxContinue matches the indentation of a
// continuation line. Confirm the padding widths survived any whitespace
// normalisation if this file was copied from a rendered view.
var (
	gbPfxLocus = []byte("LOCUS ") // record header: id + declared length
	gbPfxDefinition = []byte("DEFINITION ") // free-text description, may continue
	gbPfxContinue = []byte(" ") // continuation line of a multi-line field
	gbPfxSource = []byte("SOURCE ") // organism / scientific name
	gbPfxFeatures = []byte("FEATURES ") // start of the feature table
	gbPfxOrigin = []byte("ORIGIN") // start of the sequence data section
	gbPfxContig = []byte("CONTIG") // CONTIG record: no ORIGIN section follows
	gbPfxEnd = []byte("//") // end-of-record marker
	gbPfxDbXref = []byte(` /db_xref="taxon:`) // feature qualifier carrying the taxid
)
// GenbankChunkParserRope parses a GenBank FileChunk directly from the rope
// (PieceOfChunk linked list) without calling Pack(). This eliminates the large
// contiguous allocation required for chromosomal-scale sequences.
//
// The state machine mirrors GenbankChunkParser: inHeader → inEntry →
// (inDefinition) → inFeature → ORIGIN or CONTIG → record emission.
// Emitted sequences take ownership of their data slice
// (obiseq.NewBioSequenceOwning), so seqDest is never reused.
//
// NOTE(review): bline[12:] panics on a DEFINITION/SOURCE/LOCUS line shorter
// than 12 bytes — well-formed GenBank pads these fields to column 12; confirm
// inputs are validated upstream. Also note that id and scientificName are NOT
// reset between records: a record missing its LOCUS/SOURCE line would inherit
// the previous record's values.
func GenbankChunkParserRope(source string, rope *PieceOfChunk,
	withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
	state := inHeader
	scanner := newGbRopeScanner(rope)
	sequences := obiseq.MakeBioSequenceSlice(100)[:0]
	id := ""               // accession taken from the LOCUS line
	lseq := -1             // declared sequence length (-1 when unknown)
	scientificName := ""   // taken from the SOURCE line
	defBytes := new(bytes.Buffer)  // accumulated DEFINITION text
	featBytes := new(bytes.Buffer) // raw feature table (only if withFeatureTable)
	var seqDest []byte             // destination for the compacted bases
	taxid := 1                     // default taxid until /db_xref="taxon:..." is seen
	nl := 0                        // line counter within the record, for diagnostics
	for bline := scanner.ReadLine(); bline != nil; bline = scanner.ReadLine() {
		nl++
		processed := false
		// Inner loop lets a case re-dispatch the same line after a state
		// change (e.g. a non-continuation line ends the DEFINITION block).
		for !processed {
			switch {
			case bytes.HasPrefix(bline, gbPfxLocus):
				if state != inHeader {
					log.Fatalf("Line %d - Unexpected state %d while reading LOCUS: %s", nl, state, bline)
				}
				// Id is the first blank-delimited token after column 12.
				rest := bline[12:]
				sp := bytes.IndexByte(rest, ' ')
				if sp < 0 {
					id = string(rest)
				} else {
					id = string(rest[:sp])
				}
				// Pre-size the sequence buffer from the declared length
				// (+20 slack; 1 KiB floor when the length is missing).
				lseq = parseLseqFromLocus(bline)
				cap0 := lseq + 20
				if cap0 < 1024 {
					cap0 = 1024
				}
				seqDest = make([]byte, 0, cap0)
				state = inEntry
				processed = true
			case bytes.HasPrefix(bline, gbPfxDefinition):
				if state != inEntry {
					log.Fatalf("Line %d - Unexpected state %d while reading DEFINITION: %s", nl, state, bline)
				}
				defBytes.Write(bytes.TrimSpace(bline[12:]))
				state = inDefinition
				processed = true
			case state == inDefinition:
				// Continuation lines extend the definition; anything else
				// falls back to inEntry and is re-dispatched unprocessed.
				if bytes.HasPrefix(bline, gbPfxContinue) {
					defBytes.WriteByte(' ')
					defBytes.Write(bytes.TrimSpace(bline[12:]))
					processed = true
				} else {
					state = inEntry
				}
			case bytes.HasPrefix(bline, gbPfxSource):
				if state != inEntry {
					log.Fatalf("Line %d - Unexpected state %d while reading SOURCE: %s", nl, state, bline)
				}
				scientificName = string(bytes.TrimSpace(bline[12:]))
				processed = true
			case bytes.HasPrefix(bline, gbPfxFeatures):
				if state != inEntry {
					log.Fatalf("Line %d - Unexpected state %d while reading FEATURES: %s", nl, state, bline)
				}
				if withFeatureTable {
					featBytes.Write(bline)
				}
				state = inFeature
				processed = true
			case bytes.HasPrefix(bline, gbPfxOrigin):
				if state != inFeature && state != inContig {
					log.Fatalf("Line %d - Unexpected state %d while reading ORIGIN: %s", nl, state, bline)
				}
				// Use fast byte-scan to extract sequence and consume through "//"
				seqDest = scanner.extractSequence(seqDest, UtoT)
				// Emit record
				if id == "" {
					log.Warn("Empty id when parsing genbank file")
				}
				// seqDest ownership transfers to the sequence: no copy.
				sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
				sequence.SetSource(source)
				if withFeatureTable {
					sequence.SetFeatures(featBytes.Bytes())
				}
				annot := sequence.Annotations()
				annot["scientific_name"] = scientificName
				annot["taxid"] = taxid
				sequences = append(sequences, sequence)
				// Reset per-record accumulators for the next entry.
				defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
				featBytes = new(bytes.Buffer)
				nl = 0
				taxid = 1
				seqDest = nil
				state = inHeader
				processed = true
			case bytes.HasPrefix(bline, gbPfxContig):
				if state != inFeature && state != inContig {
					log.Fatalf("Line %d - Unexpected state %d while reading CONTIG: %s", nl, state, bline)
				}
				state = inContig
				processed = true
			case bytes.Equal(bline, gbPfxEnd):
				// Reached for CONTIG records (no ORIGIN section)
				if state != inContig {
					log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
				}
				if id == "" {
					log.Warn("Empty id when parsing genbank file")
				}
				// seqDest is nil here: CONTIG records carry no sequence data.
				sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
				sequence.SetSource(source)
				if withFeatureTable {
					sequence.SetFeatures(featBytes.Bytes())
				}
				annot := sequence.Annotations()
				annot["scientific_name"] = scientificName
				annot["taxid"] = taxid
				sequences = append(sequences, sequence)
				defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
				featBytes = new(bytes.Buffer)
				nl = 0
				taxid = 1
				seqDest = nil
				state = inHeader
				processed = true
			default:
				switch state {
				case inFeature:
					if withFeatureTable {
						featBytes.WriteByte('\n')
						featBytes.Write(bline)
					}
					// Extract taxid from the /db_xref="taxon:NNN" qualifier.
					// NOTE(review): Atoi error is discarded — a malformed
					// taxon id silently sets taxid to 0.
					if bytes.HasPrefix(bline, gbPfxDbXref) {
						rest := bline[len(gbPfxDbXref):]
						q := bytes.IndexByte(rest, '"')
						if q >= 0 {
							taxid, _ = strconv.Atoi(string(rest[:q]))
						}
					}
					processed = true
				case inHeader, inEntry, inContig:
					// Lines in these states carry no data we need: skip.
					processed = true
				default:
					log.Fatalf("Unexpected state %d while reading: %s", state, bline)
				}
			}
		}
	}
	return sequences, nil
}
func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
state := inHeader
@@ -125,13 +461,10 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
if state != inSequence && state != inContig {
log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
}
// log.Debugln("Total lines := ", nl)
if id == "" {
log.Warn("Empty id when parsing genbank file")
}
// log.Debugf("End of sequence %s: %dbp ", id, seqBytes.Len())
sequence := obiseq.NewBioSequence(id,
seqBytes.Bytes(),
defBytes.String())
@@ -144,9 +477,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
annot := sequence.Annotations()
annot["scientific_name"] = scientificName
annot["taxid"] = taxid
// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))
// log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(),
// sequence.Len(), seqBytes.Len())
sequences = append(sequences, sequence)
@@ -159,8 +489,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
processed = true
case state == inSequence:
// log.Debugf("Chunk %d : Genbank: line %d, state = %d : %s", chunks.order, nl, state, line)
sl++
cleanline := strings.TrimSpace(line)
parts := strings.SplitN(cleanline, " ", 7)
@@ -198,6 +526,7 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
}
_ = sl
return sequences, nil
}
}
@@ -206,10 +535,16 @@ func _ParseGenbankFile(input ChannelFileChunk,
out obiiter.IBioSequence,
withFeatureTable, UtoT bool) {
parser := GenbankChunkParser(withFeatureTable, UtoT)
for chunks := range input {
sequences, err := parser(chunks.Source, chunks.Raw)
var sequences obiseq.BioSequenceSlice
var err error
if chunks.Rope != nil {
sequences, err = GenbankChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
} else {
parser := GenbankChunkParser(withFeatureTable, UtoT)
sequences, err = parser(chunks.Source, chunks.Raw)
}
if err != nil {
log.Fatalf("File %s : Cannot parse the genbank file : %v", chunks.Source, err)
@@ -225,7 +560,6 @@ func _ParseGenbankFile(input ChannelFileChunk,
func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
opt := MakeOptions(options)
// entry_channel := make(chan _FileChunk)
entry_channel := ReadFileChunk(
opt.Source(),
@@ -233,13 +567,13 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
1024*1024*128,
EndOfLastFlatFileEntry,
"\nLOCUS ",
false, // do not pack: rope-based parser avoids contiguous allocation
)
newIter := obiiter.MakeIBioSequence()
nworkers := opt.ParallelWorkers()
// for j := 0; j < opt.ParallelWorkers(); j++ {
for j := 0; j < nworkers; j++ {
newIter.Add(1)
go _ParseGenbankFile(
@@ -250,8 +584,6 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
)
}
// go _ReadFlatFileChunk(reader, entry_channel)
go func() {
newIter.WaitAndClose()
log.Debug("End of the genbank file ", opt.Source())

View File

@@ -3,7 +3,7 @@ package obioptions
// Version is automatically updated by the Makefile from version.txt
// The patch number (third digit) is incremented on each push to the repository
var _Version = "Release 4.4.18"
var _Version = "Release 4.4.19"
// Version returns the version of the obitools package.
//

View File

@@ -120,6 +120,19 @@ func NewBioSequence(id string,
return bs
}
// NewBioSequenceOwning creates a BioSequence taking ownership of the sequence
// slice without copying it. The caller must not use the slice after this call.
// Use this when the slice was allocated specifically for this sequence.
func NewBioSequenceOwning(id string,
sequence []byte,
definition string) *BioSequence {
bs := NewEmptyBioSequence(0)
bs.SetId(id)
bs.TakeSequence(sequence)
bs.SetDefinition(definition)
return bs
}
// NewBioSequenceWithQualities creates a new BioSequence object with the given id, sequence, definition, and qualities.
//
// Parameters:
@@ -444,6 +457,12 @@ func (s *BioSequence) SetSequence(sequence []byte) {
s.sequence = obiutils.InPlaceToLower(CopySlice(sequence))
}
// TakeSequence stores the slice directly without copying, then lowercases in-place.
// The caller must not use the slice after this call.
//
// Unlike SetSequence, which duplicates its argument via CopySlice before
// lowercasing, this method hands ownership of the backing array to the
// BioSequence, avoiding one allocation when the slice was built
// specifically for this sequence (e.g. by NewBioSequenceOwning).
func (s *BioSequence) TakeSequence(sequence []byte) {
	s.sequence = obiutils.InPlaceToLower(sequence)
}
func (s *BioSequence) HasValidSequence() bool {
for _, c := range s.sequence {
if !((c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '[' || c == ']') {

View File

@@ -21,6 +21,74 @@ LLM_MODEL="ollama:qwen3-coder-next:latest"
# die MESSAGE... -- write "Error: MESSAGE" to stderr and abort with status 1.
die() {
    echo "Error: $*" >&2
    exit 1
}
# next_patch VERSION -- print VERSION with its third (patch) component
# incremented by one, e.g. "4.4.18" -> "4.4.19".
next_patch() {
    local v="$1"
    # Split on '.' with parameter expansion instead of spawning `cut` three times.
    local major="${v%%.*}"
    local rest="${v#*.}"
    local minor="${rest%%.*}"
    local patch="${rest#*.}"
    patch="${patch%%.*}"   # keep only the third field, like `cut -d. -f3`
    echo "${major}.${minor}.$(( patch + 1 ))"
}
# bare_version VERSION -- print VERSION with any leading "pre-" prefix removed,
# yielding the bare version number used in the installation section.
bare_version() {
    # ${1#pre-} drops a literal leading "pre-" (same as sed 's/^pre-//'),
    # without forking a pipeline.
    printf '%s\n' "${1#pre-}"
}
# installation_section VERSION -- print the markdown "## Installation" section
# appended to the generated release notes. VERSION may carry a "pre-" prefix;
# it is stripped so download links and install commands reference the bare
# version number of the published release.
installation_section() {
    local v
    # Strip any "pre-" prefix (pre-release mode) before building URLs.
    v=$(bare_version "$1")
    # Unquoted heredoc delimiter: ${v} below is expanded by the shell;
    # backticks and backslashes are escaped so they reach the markdown
    # output literally.
    cat <<INSTALL_EOF
## Installation
### Pre-built binaries
Download the appropriate archive for your system from the
[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_${v})
and extract it:
#### Linux (AMD64)
\`\`\`bash
tar -xzf obitools4_${v}_linux_amd64.tar.gz
\`\`\`
#### Linux (ARM64)
\`\`\`bash
tar -xzf obitools4_${v}_linux_arm64.tar.gz
\`\`\`
#### macOS (Intel)
\`\`\`bash
tar -xzf obitools4_${v}_darwin_amd64.tar.gz
\`\`\`
#### macOS (Apple Silicon)
\`\`\`bash
tar -xzf obitools4_${v}_darwin_arm64.tar.gz
\`\`\`
All OBITools4 binaries are included in each archive.
### From source
You can also compile and install OBITools4 directly from source using the
installation script:
\`\`\`bash
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version ${v}
\`\`\`
By default binaries are installed in \`/usr/local/bin\`. Use \`--install-dir\` to
change the destination and \`--obitools-prefix\` to add a prefix to command names:
\`\`\`bash
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\
bash -s -- --version ${v} --install-dir ~/local --obitools-prefix k
\`\`\`
INSTALL_EOF
}
display_help() {
cat <<EOF
Usage: $(basename "$0") [OPTIONS]
@@ -81,49 +149,74 @@ all_versions=$(fetch_versions)
[ -z "$all_versions" ] && die "Could not fetch versions from GitHub"
if [ -z "$VERSION" ]; then
VERSION=$(echo "$all_versions" | head -1)
echo "Using latest version: $VERSION" >&2
fi
# ── Pre-release mode: local HEAD vs latest GitHub tag ──────────────────
PRE_RELEASE=true
previous_tag="Release_${latest_version}"
VERSION="pre-$(next_patch "$latest_version")"
tag_name="Release_${VERSION}"
echo "Pre-release mode: $previous_tag -> HEAD (as $VERSION)" >&2
# Verify the requested version exists
if ! echo "$all_versions" | grep -qx "$VERSION"; then
# Need to be in a git repo
if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
die "Not inside a git repository. Pre-release mode requires a local git repo."
fi
# Check that the previous tag exists locally
if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then
echo "Tag $previous_tag not found locally, fetching..." >&2
git fetch --tags 2>/dev/null || true
if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then
die "Tag $previous_tag not found locally or remotely"
fi
fi
# Get local commits from the tag to HEAD (full messages)
commit_list=$(git log --format="%h %B" "${previous_tag}..HEAD" 2>/dev/null)
if [ -z "$commit_list" ]; then
die "No local commits found since $previous_tag"
fi
else
# ── Published release mode: between two GitHub tags ────────────────────
PRE_RELEASE=false
tag_name="Release_${VERSION}"
# Verify the requested version exists
if ! echo "$all_versions" | grep -qx "$VERSION"; then
die "Version $VERSION not found. Use -l to list available versions."
fi
fi
# Find the previous version (the one right after in the sorted-descending list)
previous_version=$(echo "$all_versions" | grep -A1 -x "$VERSION" | tail -1)
# Find the previous version
previous_version=$(echo "$all_versions" | grep -A1 -x "$VERSION" | tail -1)
if [ "$previous_version" = "$VERSION" ] || [ -z "$previous_version" ]; then
if [ "$previous_version" = "$VERSION" ] || [ -z "$previous_version" ]; then
previous_tag=""
echo "No previous version found -- will include all commits for $tag_name" >&2
else
else
previous_tag="Release_${previous_version}"
echo "Generating notes: $previous_tag -> $tag_name" >&2
fi
fi
# ── Fetch commit messages between tags via GitHub compare API ────────────
if [ -n "$previous_tag" ]; then
# Fetch commit messages between tags via GitHub compare API
if [ -n "$previous_tag" ]; then
commits_json=$(curl -sf "${GITHUB_API}/compare/${previous_tag}...${tag_name}")
if [ -z "$commits_json" ]; then
die "Could not fetch commit comparison from GitHub"
fi
commit_list=$(echo "$commits_json" \
| jq -r '.commits[] | (.sha[:8] + " " + (.commit.message | split("\n")[0]))' 2>/dev/null)
else
# First release: get commits up to this tag
| jq -r '.commits[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null)
else
commits_json=$(curl -sf "${GITHUB_API}/commits?sha=${tag_name}&per_page=50")
if [ -z "$commits_json" ]; then
die "Could not fetch commits from GitHub"
fi
commit_list=$(echo "$commits_json" \
| jq -r '.[] | (.sha[:8] + " " + (.commit.message | split("\n")[0]))' 2>/dev/null)
fi
| jq -r '.[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null)
fi
if [ -z "$commit_list" ]; then
if [ -z "$commit_list" ]; then
die "No commits found between $previous_tag and $tag_name"
fi
fi
# ── LLM prompt (shared by context mode and summarization) ────────────────
@@ -144,6 +237,7 @@ if [ "$RAW_MODE" = true ]; then
echo "$commit_list" | while IFS= read -r line; do
echo "- ${line}"
done
installation_section "$VERSION"
exit 0
fi
@@ -193,6 +287,7 @@ if [ -n "$release_title" ] && [ -n "$release_body" ]; then
echo "# ${release_title}"
echo ""
echo "$release_body"
installation_section "$VERSION"
else
echo "Warning: JSON parsing failed, falling back to raw mode" >&2
exec "$0" -r -v "$VERSION"

View File

@@ -1 +1 @@
4.4.18
4.4.19