mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 21:40:52 +00:00
Compare commits
27 Commits
Release_4.
...
push-mvpux
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cdc72c5346 | ||
|
|
82a9972be7 | ||
|
|
ff6e515b2a | ||
|
|
cd0c525f50 | ||
|
|
abe935aa18 | ||
|
|
8dd32dc1bf | ||
|
|
6ee8750635 | ||
|
|
8c318c480e | ||
|
|
09fbc217d3 | ||
|
|
3d2e205722 | ||
|
|
623116ab13 | ||
|
|
1e4509cb63 | ||
|
|
b33d7705a8 | ||
|
|
1342c83db6 | ||
|
|
b246025907 | ||
|
|
761e0dbed3 | ||
|
|
a7ea47624b | ||
|
|
61e346658e | ||
|
|
1ba1294b11 | ||
|
|
b2476fffcb | ||
|
|
b05404721e | ||
|
|
c57e788459 | ||
|
|
1cecf23978 | ||
|
|
4c824ef9b7 | ||
|
|
1ce5da9bee | ||
|
|
dc23d9de9a | ||
|
|
aa9d7bbf72 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -16,6 +16,7 @@
|
|||||||
**/*.tgz
|
**/*.tgz
|
||||||
**/*.yaml
|
**/*.yaml
|
||||||
**/*.csv
|
**/*.csv
|
||||||
|
**/*.pb.gz
|
||||||
xx
|
xx
|
||||||
|
|
||||||
.rhistory
|
.rhistory
|
||||||
|
|||||||
142
Makefile
142
Makefile
@@ -2,6 +2,13 @@
|
|||||||
#export GOBIN=$(GOPATH)/bin
|
#export GOBIN=$(GOPATH)/bin
|
||||||
#export PATH=$(GOBIN):$(shell echo $${PATH})
|
#export PATH=$(GOBIN):$(shell echo $${PATH})
|
||||||
|
|
||||||
|
.DEFAULT_GOAL := all
|
||||||
|
|
||||||
|
GREEN := \033[0;32m
|
||||||
|
YELLOW := \033[0;33m
|
||||||
|
BLUE := \033[0;34m
|
||||||
|
NC := \033[0m
|
||||||
|
|
||||||
GOFLAGS=
|
GOFLAGS=
|
||||||
GOCMD=go
|
GOCMD=go
|
||||||
GOBUILD=$(GOCMD) build $(GOFLAGS)
|
GOBUILD=$(GOCMD) build $(GOFLAGS)
|
||||||
@@ -60,6 +67,28 @@ endif
|
|||||||
|
|
||||||
OUTPUT:=$(shell mktemp)
|
OUTPUT:=$(shell mktemp)
|
||||||
|
|
||||||
|
help:
|
||||||
|
@printf "$(GREEN)OBITools4 Makefile$(NC)\n\n"
|
||||||
|
@printf "$(BLUE)Main targets:$(NC)\n"
|
||||||
|
@printf " %-20s %s\n" "all" "Build all obitools (default)"
|
||||||
|
@printf " %-20s %s\n" "obitools" "Build all obitools binaries to build/"
|
||||||
|
@printf " %-20s %s\n" "test" "Run Go unit tests"
|
||||||
|
@printf " %-20s %s\n" "obitests" "Run integration tests (obitests/)"
|
||||||
|
@printf " %-20s %s\n" "bump-version" "Increment patch version (or set with VERSION=x.y.z)"
|
||||||
|
@printf " %-20s %s\n" "update-deps" "Update all Go dependencies"
|
||||||
|
@printf "\n$(BLUE)Jujutsu workflow:$(NC)\n"
|
||||||
|
@printf " %-20s %s\n" "jjnew" "Document current commit and start a new one"
|
||||||
|
@printf " %-20s %s\n" "jjpush" "Release: describe, bump, generate notes, push PR, tag (VERSION=x.y.z optional)"
|
||||||
|
@printf " %-20s %s\n" "jjfetch" "Fetch latest commits from origin"
|
||||||
|
@printf "\n$(BLUE)Required tools:$(NC)\n"
|
||||||
|
@printf " %-20s " "go"; command -v go >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(go version)" || printf "$(YELLOW)✗ not found$(NC)\n"
|
||||||
|
@printf " %-20s " "git"; command -v git >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(git --version)" || printf "$(YELLOW)✗ not found$(NC)\n"
|
||||||
|
@printf " %-20s " "jj"; command -v jj >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jj --version)" || printf "$(YELLOW)✗ not found$(NC)\n"
|
||||||
|
@printf " %-20s " "gh"; command -v gh >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(gh --version | head -1)" || printf "$(YELLOW)✗ not found$(NC) (brew install gh)\n"
|
||||||
|
@printf "\n$(BLUE)Optional tools (release notes generation):$(NC)\n"
|
||||||
|
@printf " %-20s " "aichat"; command -v aichat >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(aichat --version)" || printf "$(YELLOW)✗ not found$(NC) (https://github.com/sigoden/aichat)\n"
|
||||||
|
@printf " %-20s " "jq"; command -v jq >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jq --version)" || printf "$(YELLOW)✗ not found$(NC) (brew install jq)\n"
|
||||||
|
|
||||||
all: install-githook obitools
|
all: install-githook obitools
|
||||||
|
|
||||||
obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS))
|
obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS))
|
||||||
@@ -106,15 +135,20 @@ pkg/obioptions/version.go: version.txt .FORCE
|
|||||||
@rm -f $(OUTPUT)
|
@rm -f $(OUTPUT)
|
||||||
|
|
||||||
bump-version:
|
bump-version:
|
||||||
@echo "Incrementing version..."
|
|
||||||
@current=$$(cat version.txt); \
|
@current=$$(cat version.txt); \
|
||||||
echo " Current version: $$current"; \
|
if [ -n "$(VERSION)" ]; then \
|
||||||
major=$$(echo $$current | cut -d. -f1); \
|
new_version="$(VERSION)"; \
|
||||||
minor=$$(echo $$current | cut -d. -f2); \
|
echo "Setting version to $$new_version (was $$current)"; \
|
||||||
patch=$$(echo $$current | cut -d. -f3); \
|
else \
|
||||||
new_patch=$$((patch + 1)); \
|
echo "Incrementing version..."; \
|
||||||
new_version="$$major.$$minor.$$new_patch"; \
|
echo " Current version: $$current"; \
|
||||||
echo " New version: $$new_version"; \
|
major=$$(echo $$current | cut -d. -f1); \
|
||||||
|
minor=$$(echo $$current | cut -d. -f2); \
|
||||||
|
patch=$$(echo $$current | cut -d. -f3); \
|
||||||
|
new_patch=$$((patch + 1)); \
|
||||||
|
new_version="$$major.$$minor.$$new_patch"; \
|
||||||
|
echo " New version: $$new_version"; \
|
||||||
|
fi; \
|
||||||
echo "$$new_version" > version.txt
|
echo "$$new_version" > version.txt
|
||||||
@echo "✓ Version updated in version.txt"
|
@echo "✓ Version updated in version.txt"
|
||||||
@$(MAKE) pkg/obioptions/version.go
|
@$(MAKE) pkg/obioptions/version.go
|
||||||
@@ -128,40 +162,76 @@ jjnew:
|
|||||||
@echo "$(GREEN)✓ New commit created$(NC)"
|
@echo "$(GREEN)✓ New commit created$(NC)"
|
||||||
|
|
||||||
jjpush:
|
jjpush:
|
||||||
@echo "$(YELLOW)→ Pushing commit to repository...$(NC)"
|
@$(MAKE) jjpush-describe
|
||||||
|
@$(MAKE) jjpush-bump
|
||||||
|
@$(MAKE) jjpush-notes
|
||||||
|
@$(MAKE) jjpush-push
|
||||||
|
@$(MAKE) jjpush-tag
|
||||||
|
@echo "$(GREEN)✓ Release complete$(NC)"
|
||||||
|
|
||||||
|
jjpush-describe:
|
||||||
@echo "$(BLUE)→ Documenting current commit...$(NC)"
|
@echo "$(BLUE)→ Documenting current commit...$(NC)"
|
||||||
@jj auto-describe
|
@jj auto-describe
|
||||||
|
|
||||||
|
jjpush-bump:
|
||||||
@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
|
@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
|
||||||
@jj new
|
@jj new
|
||||||
@previous_version=$$(cat version.txt); \
|
@$(MAKE) bump-version
|
||||||
$(MAKE) bump-version; \
|
|
||||||
version=$$(cat version.txt); \
|
jjpush-notes:
|
||||||
tag_name="Release_$$version"; \
|
@version=$$(cat version.txt); \
|
||||||
previous_tag="Release_$$previous_version"; \
|
echo "$(BLUE)→ Generating release notes for version $$version...$(NC)"; \
|
||||||
echo "$(BLUE)→ Documenting version bump commit...$(NC)"; \
|
release_title="Release $$version"; \
|
||||||
jj auto-describe; \
|
release_body=""; \
|
||||||
echo "$(BLUE)→ Generating release notes from $$previous_tag to current commit...$(NC)"; \
|
if command -v aichat >/dev/null 2>&1; then \
|
||||||
if command -v orla >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then \
|
previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' 2>/dev/null); \
|
||||||
release_json=$$(ORLA_MAX_TOOL_CALLS=50 jj log -r "$$previous_tag::@" -T 'commit_id.short() ++ " " ++ description' | \
|
if [ -z "$$previous_tag" ]; then \
|
||||||
orla agent -m ollama:qwen3-coder-next:latest \
|
echo "$(YELLOW)⚠ No previous Release tag found, skipping release notes$(NC)"; \
|
||||||
"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}"); \
|
|
||||||
release_json=$$(echo "$$release_json" | sed -n '/^{/,/^}/p'); \
|
|
||||||
release_title=$$(echo "$$release_json" | jq -r '.title // empty') ; \
|
|
||||||
release_body=$$(echo "$$release_json" | jq -r '.body // empty') ; \
|
|
||||||
if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \
|
|
||||||
release_message="$$release_title"$$'\n\n'"$$release_body"; \
|
|
||||||
else \
|
else \
|
||||||
echo "$(YELLOW)⚠ JSON parsing failed, falling back to raw output$(NC)"; \
|
raw_output=$$(git log --format="%h %B" "$$previous_tag..HEAD" | \
|
||||||
release_message="Release $$version"$$'\n\n'"$$release_json"; \
|
aichat \
|
||||||
|
"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}" 2>/dev/null) || true; \
|
||||||
|
if [ -n "$$raw_output" ]; then \
|
||||||
|
notes=$$(printf '%s\n' "$$raw_output" | python3 tools/json2md.py 2>/dev/null); \
|
||||||
|
if [ -n "$$notes" ]; then \
|
||||||
|
release_title=$$(echo "$$notes" | head -1); \
|
||||||
|
release_body=$$(echo "$$notes" | tail -n +3); \
|
||||||
|
else \
|
||||||
|
echo "$(YELLOW)⚠ JSON parsing failed, using default release message$(NC)"; \
|
||||||
|
fi; \
|
||||||
|
fi; \
|
||||||
fi; \
|
fi; \
|
||||||
else \
|
|
||||||
release_message="Release $$version"; \
|
|
||||||
fi; \
|
fi; \
|
||||||
echo "$(BLUE)→ Pushing commits and creating tag $$tag_name...$(NC)"; \
|
printf '%s' "$$release_title" > /tmp/obitools4-release-title.txt; \
|
||||||
jj git push --change @; \
|
printf '%s' "$$release_body" > /tmp/obitools4-release-body.txt; \
|
||||||
git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "Tag $$tag_name already exists"; \
|
echo "$(BLUE)→ Setting release notes as commit description...$(NC)"; \
|
||||||
git push origin "$$tag_name" 2>/dev/null || echo "Tag already pushed"
|
jj desc -m "$$release_title"$$'\n\n'"$$release_body"
|
||||||
@echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)"
|
|
||||||
|
jjpush-push:
|
||||||
|
@echo "$(BLUE)→ Pushing commits...$(NC)"
|
||||||
|
@jj git push --change @
|
||||||
|
@echo "$(BLUE)→ Creating/updating PR...$(NC)"
|
||||||
|
@release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$(cat version.txt)"); \
|
||||||
|
release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \
|
||||||
|
branch=$$(jj log -r @ --no-graph -T 'bookmarks.map(|b| b.name()).join("\n")' 2>/dev/null | head -1); \
|
||||||
|
if [ -n "$$branch" ] && command -v gh >/dev/null 2>&1; then \
|
||||||
|
gh pr create --title "$$release_title" --body "$$release_body" --base master --head "$$branch" 2>/dev/null \
|
||||||
|
|| gh pr edit "$$branch" --title "$$release_title" --body "$$release_body" 2>/dev/null \
|
||||||
|
|| echo "$(YELLOW)⚠ Could not create/update PR$(NC)"; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
jjpush-tag:
|
||||||
|
@version=$$(cat version.txt); \
|
||||||
|
tag_name="Release_$$version"; \
|
||||||
|
release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$version"); \
|
||||||
|
release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \
|
||||||
|
install_section=$$'\n## Installation\n\n### Pre-built binaries\n\nDownload the appropriate archive for your system from the\n[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_'"$$version"')\nand extract it:\n\n#### Linux (AMD64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_amd64.tar.gz\n```\n\n#### Linux (ARM64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_arm64.tar.gz\n```\n\n#### macOS (Intel)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_amd64.tar.gz\n```\n\n#### macOS (Apple Silicon)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_arm64.tar.gz\n```\n\nAll OBITools4 binaries are included in each archive.\n\n### From source\n\nYou can also compile and install OBITools4 directly from source using the\ninstallation script:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version '"$$version"'\n```\n\nBy default binaries are installed in `/usr/local/bin`. Use `--install-dir` to\nchange the destination and `--obitools-prefix` to add a prefix to command names:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\\n bash -s -- --version '"$$version"' --install-dir ~/local --obitools-prefix k\n```\n'; \
|
||||||
|
release_message="$$release_title"$$'\n\n'"$$release_body$$install_section"; \
|
||||||
|
echo "$(BLUE)→ Creating tag $$tag_name...$(NC)"; \
|
||||||
|
git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "$(YELLOW)⚠ Tag $$tag_name already exists$(NC)"; \
|
||||||
|
echo "$(BLUE)→ Pushing tag $$tag_name...$(NC)"; \
|
||||||
|
git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)"; \
|
||||||
|
rm -f /tmp/obitools4-release-title.txt /tmp/obitools4-release-body.txt
|
||||||
|
|
||||||
jjfetch:
|
jjfetch:
|
||||||
@echo "$(YELLOW)→ Pulling latest commits...$(NC)"
|
@echo "$(YELLOW)→ Pulling latest commits...$(NC)"
|
||||||
@@ -169,5 +239,5 @@ jjfetch:
|
|||||||
@jj new master@origin
|
@jj new master@origin
|
||||||
@echo "$(GREEN)✓ Latest commits pulled$(NC)"
|
@echo "$(GREEN)✓ Latest commits pulled$(NC)"
|
||||||
|
|
||||||
.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch bump-version .FORCE
|
.PHONY: all obitools update-deps obitests githubtests help jjnew jjpush jjpush-describe jjpush-bump jjpush-notes jjpush-push jjpush-tag jjfetch bump-version .FORCE
|
||||||
.FORCE:
|
.FORCE:
|
||||||
|
|||||||
@@ -32,8 +32,12 @@ The installation script offers several options:
|
|||||||
>
|
>
|
||||||
> -p, --obitools-prefix Prefix added to the obitools command names if you
|
> -p, --obitools-prefix Prefix added to the obitools command names if you
|
||||||
> want to have several versions of obitools at the
|
> want to have several versions of obitools at the
|
||||||
> same time on your system (as example `-p g` will produce
|
> same time on your system (as example `-p g` will produce
|
||||||
> `gobigrep` command instead of `obigrep`).
|
> `gobigrep` command instead of `obigrep`).
|
||||||
|
>
|
||||||
|
> -j, --jobs Number of parallel jobs used for compilation
|
||||||
|
> (default: 1). Increase this value to speed up
|
||||||
|
> compilation on multi-core systems (e.g., `-j 4`).
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
|
|
||||||
|
|||||||
264
blackboard/Prospective/large_sequence_parsing.md
Normal file
264
blackboard/Prospective/large_sequence_parsing.md
Normal file
@@ -0,0 +1,264 @@
|
|||||||
|
# Optimisation du parsing des grandes séquences
|
||||||
|
|
||||||
|
## Contexte
|
||||||
|
|
||||||
|
OBITools4 doit pouvoir traiter des séquences de taille chromosomique (plusieurs Gbp), notamment
|
||||||
|
issues de fichiers GenBank/EMBL (assemblages de génomes) ou de fichiers FASTA convertis depuis
|
||||||
|
ces formats.
|
||||||
|
|
||||||
|
## Architecture actuelle
|
||||||
|
|
||||||
|
### Pipeline de lecture (`pkg/obiformats/`)
|
||||||
|
|
||||||
|
```
|
||||||
|
ReadFileChunk (goroutine)
|
||||||
|
→ ChannelFileChunk
|
||||||
|
→ N × _ParseGenbankFile / _ParseFastaFile (goroutines)
|
||||||
|
→ IBioSequence
|
||||||
|
```
|
||||||
|
|
||||||
|
`ReadFileChunk` (`file_chunk_read.go`) lit le fichier par morceaux via une chaîne de
|
||||||
|
`PieceOfChunk` (rope). Chaque nœud fait `fileChunkSize` bytes :
|
||||||
|
|
||||||
|
- GenBank/EMBL : 128 MB (`1024*1024*128`)
|
||||||
|
- FASTA/FASTQ : 1 MB (`1024*1024`)
|
||||||
|
|
||||||
|
La chaîne est accumulée jusqu'à trouver la fin du dernier enregistrement complet (splitter),
|
||||||
|
puis `Pack()` est appelé pour fusionner tous les nœuds en un seul buffer contigu. Ce buffer
|
||||||
|
est transmis au parseur via `FileChunk.Raw *bytes.Buffer`.
|
||||||
|
|
||||||
|
### Parseur GenBank (`genbank_read.go`)
|
||||||
|
|
||||||
|
`GenbankChunkParser` reçoit un `io.Reader` sur le buffer packé, lit ligne par ligne via
|
||||||
|
`bufio.NewReader` (buffer 4096 bytes), et pour chaque ligne de la section `ORIGIN` :
|
||||||
|
|
||||||
|
```go
|
||||||
|
line = string(bline) // allocation par ligne
|
||||||
|
cleanline := strings.TrimSpace(line) // allocation
|
||||||
|
parts := strings.SplitN(cleanline, " ", 7) // allocation []string + substrings
|
||||||
|
for i := 1; i < lparts; i++ {
|
||||||
|
seqBytes.WriteString(parts[i])
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Point positif : `seqBytes` est pré-alloué grâce à `lseq` extrait de la ligne `LOCUS`.
|
||||||
|
|
||||||
|
### Parseur FASTA (`fastaseq_read.go`)
|
||||||
|
|
||||||
|
`FastaChunkParser` lit **octet par octet** via `scanner.ReadByte()`. Pour 3 Gbp :
|
||||||
|
3 milliards d'appels. `seqBytes` est un `bytes.Buffer{}` sans pré-allocation.
|
||||||
|
|
||||||
|
## Problème principal
|
||||||
|
|
||||||
|
Pour une séquence de plusieurs Gbp, `Pack()` fusionne une chaîne de ~N nœuds de 128 MB en
|
||||||
|
un seul buffer contigu. C'est une allocation de N × 128 MB suivie d'une copie de toutes les
|
||||||
|
données. Bien que l'implémentation de `Pack()` soit efficace (libère les nœuds au fur et à
|
||||||
|
mesure via `slices.Grow`), la copie est inévitable avec l'architecture actuelle.
|
||||||
|
|
||||||
|
De plus, le parseur GenBank produit des dizaines de millions d'allocations temporaires pour
|
||||||
|
parser la section `ORIGIN` (une par ligne).
|
||||||
|
|
||||||
|
## Invariant clé découvert
|
||||||
|
|
||||||
|
**Si la rope a plus d'un nœud, le premier nœud seul ne se termine pas sur une frontière
|
||||||
|
d'enregistrement** (pas de `//\n` en fin de `piece1`).
|
||||||
|
|
||||||
|
Preuve par construction dans `ReadFileChunk` :
|
||||||
|
- `splitter` est appelé dès le premier nœud (ligne 157)
|
||||||
|
- Si `end >= 0` → frontière trouvée dans 128 MB → boucle interne sautée → rope à 1 nœud
|
||||||
|
- Si `end < 0` → boucle interne ajoute des nœuds → rope à ≥ 2 nœuds
|
||||||
|
|
||||||
|
Corollaire : si rope à 1 nœud, `Pack()` ne fait rien (aucun nœud suivant).
|
||||||
|
|
||||||
|
**Attention** : rope à ≥ 2 nœuds ne signifie pas qu'il n'y a qu'une seule séquence dans
|
||||||
|
la rope. La rope packée peut contenir plusieurs enregistrements complets. Exemple : records
|
||||||
|
de 80 MB → `nextpieces` (48 MB de reste) + nouveau nœud (128 MB) = rope à 2 nœuds
|
||||||
|
contenant 2 records complets + début d'un troisième.
|
||||||
|
|
||||||
|
L'invariant dit seulement que `piece1` seul est incomplet — pas que la rope entière
|
||||||
|
ne contient qu'un seul record.
|
||||||
|
|
||||||
|
**Invariant : le dernier FileChunk envoyé finit sur une frontière d'enregistrement.**
|
||||||
|
|
||||||
|
Deux chemins dans `ReadFileChunk` :
|
||||||
|
|
||||||
|
1. **Chemin normal** (`end >= 0` via `splitter`) : le buffer est explicitement tronqué à
|
||||||
|
`end` (ligne 200 : `pieces.data = pieces.data[:end]`). Frontière garantie par construction
|
||||||
|
pour tous les formats. ✓
|
||||||
|
|
||||||
|
2. **Chemin EOF** (`end < 0`, `end = pieces.Len()`) : tout le reste du fichier est envoyé.
|
||||||
|
- **GenBank/EMBL** : présuppose fichier bien formé (se termine par `//\n`). Le parseur
|
||||||
|
lève un `log.Fatalf` sur tout état inattendu — filet de sécurité suffisant. ✓
|
||||||
|
- **FASTQ** : présupposé, vérifié par le parseur. ✓
|
||||||
|
- **FASTA** : garanti par le format lui-même (fin d'enregistrement = EOF ou `>`). ✓
|
||||||
|
|
||||||
|
**Hypothèse de travail adoptée** : les fichiers d'entrée sont bien formés. Dans le pire cas,
|
||||||
|
le parseur lèvera une erreur explicite. Il n'y a pas de risque de corruption silencieuse.
|
||||||
|
|
||||||
|
## Piste d'optimisation : se dispenser de Pack()
|
||||||
|
|
||||||
|
### Idée centrale
|
||||||
|
|
||||||
|
Au lieu de fusionner la rope avant de la passer au parseur, **parser directement la rope
|
||||||
|
nœud par nœud**, et **écrire la séquence compactée in-place dans le premier nœud**.
|
||||||
|
|
||||||
|
Pourquoi c'est sûr :
|
||||||
|
- Le header (LOCUS, DEFINITION, SOURCE, FEATURES) est **petit** et traité en premier
|
||||||
|
- La séquence (ORIGIN) est **à la fin** du record
|
||||||
|
- Au moment d'écrire la séquence depuis l'offset 0 de `piece1`, le pointeur de lecture
|
||||||
|
est profond dans la rope (offset >> 0) → jamais de collision
|
||||||
|
- La séquence compactée est toujours plus courte que les données brutes
|
||||||
|
|
||||||
|
### Pré-allocation
|
||||||
|
|
||||||
|
Pour GenBank/EMBL : `lseq` est connu dès la ligne `LOCUS`/`ID` (première ligne, dans
|
||||||
|
`piece1`). On peut faire `slices.Grow(piece1.data, lseq)` dès ce moment.
|
||||||
|
|
||||||
|
Pour FASTA : pas de taille garantie dans le header, mais `rope.Len()` donne un majorant.
|
||||||
|
On peut utiliser `rope.Len() / 2` comme estimation initiale.
|
||||||
|
|
||||||
|
### Gestion des jonctions entre nœuds
|
||||||
|
|
||||||
|
Une ligne peut chevaucher deux nœuds (rare avec 128 MB, mais possible). Solution : carry
|
||||||
|
buffer de ~128 bytes pour les quelques bytes en fin de nœud.
|
||||||
|
|
||||||
|
### Cas FASTA/FASTQ multi-séquences
|
||||||
|
|
||||||
|
Un FileChunk peut contenir N séquences (notamment FASTA/FASTQ courts). Dans ce cas
|
||||||
|
l'écriture in-place dans `piece1` n'est pas applicable directement — on écrase des données
|
||||||
|
nécessaires aux séquences suivantes.
|
||||||
|
|
||||||
|
Stratégie par cas :
|
||||||
|
- **Rope à 1 nœud** (record ≤ 128 MB) : `Pack()` est trivial (no-op), parseur actuel OK
|
||||||
|
- **Rope à ≥ 2 nœuds** : par l'invariant, `piece1` ne contient pas de record complet →
|
||||||
|
une seule grande séquence → in-place applicable
|
||||||
|
|
||||||
|
### Format d'une ligne séquence GenBank (Après ORIGIN)
|
||||||
|
|
||||||
|
```
|
||||||
|
/^ *[0-9]+( [nuc]{10}){0,5} [nuc]{1,10}/
|
||||||
|
```
|
||||||
|
|
||||||
|
### Format d'une ligne séquence GenBank (Après SQ)
|
||||||
|
|
||||||
|
La ligne SQ contient aussi la taille de la séquence
|
||||||
|
|
||||||
|
```
|
||||||
|
/^ *( [nuc]{10}){0,5} [nuc]{1,10} *[0-9]+/
|
||||||
|
```
|
||||||
|
|
||||||
|
Compactage in-place sur `bline` ([]byte brut, sans conversion `string`) :
|
||||||
|
|
||||||
|
```go
|
||||||
|
w := 0
|
||||||
|
i := 0
|
||||||
|
for i < len(bline) && bline[i] == ' ' { i++ } // skip indentation
|
||||||
|
for i < len(bline) && bline[i] <= '9' { i++ } // skip position number
|
||||||
|
for ; i < len(bline); i++ {
|
||||||
|
if bline[i] != ' ' {
|
||||||
|
bline[w] = bline[i]
|
||||||
|
w++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// écrire bline[:w] directement dans piece1.data[seqOffset:]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Changements nécessaires
|
||||||
|
|
||||||
|
1. **`FileChunk`** : exposer la rope `*PieceOfChunk` non-packée en plus (ou à la place)
|
||||||
|
de `Raw *bytes.Buffer`
|
||||||
|
2. **`GenbankChunkParser` / `EmblChunkParser`** : accepter `*PieceOfChunk`, parser la
|
||||||
|
rope séquentiellement avec carry buffer pour les jonctions
|
||||||
|
3. **`FastaChunkParser`** : idem, avec in-place conditionnel selon taille de la rope
|
||||||
|
4. **`ReadFileChunk`** : ne pas appeler `Pack()` avant envoi sur le channel (ou version
|
||||||
|
alternative `ReadFileChunkRope`)
|
||||||
|
|
||||||
|
## Fichiers concernés
|
||||||
|
|
||||||
|
- `pkg/obiformats/file_chunk_read.go` — structure rope, `ReadFileChunk`
|
||||||
|
- `pkg/obiformats/genbank_read.go` — `GenbankChunkParser`, `_ParseGenbankFile`
|
||||||
|
- `pkg/obiformats/embl_read.go` — `EmblChunkParser`, `ReadEMBL`
|
||||||
|
- `pkg/obiformats/fastaseq_read.go` — `FastaChunkParser`, `_ParseFastaFile`
|
||||||
|
- `pkg/obiformats/fastqseq_read.go` — parseur FASTQ (même structure)
|
||||||
|
|
||||||
|
## Plan d'implémentation : parseur GenBank sur rope
|
||||||
|
|
||||||
|
### Contexte
|
||||||
|
|
||||||
|
Baseline mesurée : `obiconvert gbpln640.seq.gz` → 49s real, 42s user, 29s sys, **57 GB RSS**.
|
||||||
|
Le sys élevé indique des allocations massives. Deux causes :
|
||||||
|
1. `Pack()` : fusionne toute la rope (N × 128 MB) en un buffer contigu avant de parser
|
||||||
|
2. Parser ORIGIN : `string(bline)` + `TrimSpace` + `SplitN` × millions de lignes
|
||||||
|
|
||||||
|
### 1. `gbRopeScanner`
|
||||||
|
|
||||||
|
Struct de lecture ligne par ligne sur la rope, sans allocation heap :
|
||||||
|
|
||||||
|
```go
|
||||||
|
type gbRopeScanner struct {
|
||||||
|
current *PieceOfChunk
|
||||||
|
pos int
|
||||||
|
carry [256]byte // stack-allocated, max GenBank line = 80 chars
|
||||||
|
carryN int
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`ReadLine()` :
|
||||||
|
- Cherche `\n` dans `current.data[pos:]` via `bytes.IndexByte`
|
||||||
|
- Si trouvé sans carry : retourne slice direct du node (zéro alloc)
|
||||||
|
- Si trouvé avec carry : copie dans carry buffer, retourne `carry[:n]`
|
||||||
|
- Si non trouvé : copie le reste dans carry, avance au node suivant, recommence
|
||||||
|
- EOF : retourne `carry[:carryN]` puis nil
|
||||||
|
|
||||||
|
`extractSequence(dest []byte, UtoT bool) int` :
|
||||||
|
- Scan direct des bytes pour section ORIGIN, sans passer par ReadLine
|
||||||
|
- Machine d'états : lineStart → skip espaces/digits → copier nucléotides dans dest
|
||||||
|
- Stop sur `//` en début de ligne
|
||||||
|
- Zéro allocation, UtoT inline
|
||||||
|
|
||||||
|
### 2. `GenbankChunkParserRope`
|
||||||
|
|
||||||
|
```go
|
||||||
|
func GenbankChunkParserRope(source string, rope *PieceOfChunk,
|
||||||
|
withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error)
|
||||||
|
```
|
||||||
|
|
||||||
|
- Même machine d'états que `GenbankChunkParser`, sur `[]byte` (`bytes.HasPrefix`)
|
||||||
|
- LOCUS : extrait `id` et `lseq` par scan direct (remplace `_seqlenght_rx`)
|
||||||
|
- FEATURES / default inFeature : taxid extrait par scan de `/db_xref="taxon:`
|
||||||
|
dans la source feature ; `featBytes` rempli seulement si `withFeatureTable=true`
|
||||||
|
- DEFINITION : toujours conservée
|
||||||
|
- ORIGIN : `dest = make([]byte, 0, lseq+20)` puis `s.extractSequence(dest, UtoT)`
|
||||||
|
|
||||||
|
### 3. Modifications `_ParseGenbankFile` et `ReadGenbank`
|
||||||
|
|
||||||
|
`_ParseGenbankFile` utilise `chunk.Rope` :
|
||||||
|
```go
|
||||||
|
sequences, err := GenbankChunkParserRope(chunk.Source, chunk.Rope, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
`ReadGenbank` passe `pack=false` :
|
||||||
|
```go
|
||||||
|
entry_channel := ReadFileChunk(..., false)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Ce qui NE change pas
|
||||||
|
|
||||||
|
- `GenbankChunkParser` reste (référence, tests)
|
||||||
|
- `ReadFileChunk`, `Pack()`, autres parseurs (EMBL, FASTA, FASTQ) : inchangés
|
||||||
|
|
||||||
|
### 5. Gains attendus
|
||||||
|
|
||||||
|
- **RSS** : pic ≈ 128 MB × workers (au lieu de N × 128 MB)
|
||||||
|
- **Temps sys** : élimination des mmap/munmap pour les gros buffers
|
||||||
|
- **Temps user** : ~50M allocations éliminées
|
||||||
|
|
||||||
|
### 6. Vérification
|
||||||
|
|
||||||
|
```bash
|
||||||
|
/usr/local/go/bin/go build ./...
|
||||||
|
diff <(obiconvert gbpln640.seq.gz) gbpln640.reference.fasta
|
||||||
|
cd bugs/genbank && ./benchmark.sh gbpln640.seq.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
Cible : RSS < 1 GB, temps comparable ou meilleur.
|
||||||
@@ -7,6 +7,7 @@ INSTALL_DIR="/usr/local"
|
|||||||
OBITOOLS_PREFIX=""
|
OBITOOLS_PREFIX=""
|
||||||
VERSION=""
|
VERSION=""
|
||||||
LIST_VERSIONS=false
|
LIST_VERSIONS=false
|
||||||
|
JOBS=1
|
||||||
|
|
||||||
# Help message
|
# Help message
|
||||||
function display_help {
|
function display_help {
|
||||||
@@ -21,6 +22,7 @@ function display_help {
|
|||||||
echo " gobigrep command instead of obigrep)."
|
echo " gobigrep command instead of obigrep)."
|
||||||
echo " -v, --version Install a specific version (e.g., 4.4.8)."
|
echo " -v, --version Install a specific version (e.g., 4.4.8)."
|
||||||
echo " If not specified, installs the latest version."
|
echo " If not specified, installs the latest version."
|
||||||
|
echo " -j, --jobs Number of parallel jobs for compilation (default: 1)."
|
||||||
echo " -l, --list List all available versions and exit."
|
echo " -l, --list List all available versions and exit."
|
||||||
echo " -h, --help Display this help message."
|
echo " -h, --help Display this help message."
|
||||||
echo ""
|
echo ""
|
||||||
@@ -65,6 +67,10 @@ while [ "$#" -gt 0 ]; do
|
|||||||
VERSION="$2"
|
VERSION="$2"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
|
-j|--jobs)
|
||||||
|
JOBS="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
-l|--list)
|
-l|--list)
|
||||||
LIST_VERSIONS=true
|
LIST_VERSIONS=true
|
||||||
shift
|
shift
|
||||||
@@ -122,9 +128,15 @@ mkdir -p "${WORK_DIR}/cache" \
|
|||||||
exit 1)
|
exit 1)
|
||||||
|
|
||||||
# Create installation directory
|
# Create installation directory
|
||||||
mkdir -p "${INSTALL_DIR}/bin" 2> /dev/null \
|
if ! mkdir -p "${INSTALL_DIR}/bin" 2>/dev/null; then
|
||||||
|| (echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
|
if [ ! -w "$(dirname "${INSTALL_DIR}")" ] && [ ! -w "${INSTALL_DIR}" ]; then
|
||||||
sudo mkdir -p "${INSTALL_DIR}/bin")
|
echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
|
||||||
|
sudo mkdir -p "${INSTALL_DIR}/bin"
|
||||||
|
else
|
||||||
|
echo "Error: Could not create ${INSTALL_DIR}/bin (check path or disk space)" 1>&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ ! -d "${INSTALL_DIR}/bin" ]]; then
|
if [[ ! -d "${INSTALL_DIR}/bin" ]]; then
|
||||||
echo "Could not create ${INSTALL_DIR}/bin directory for installing obitools" 1>&2
|
echo "Could not create ${INSTALL_DIR}/bin directory for installing obitools" 1>&2
|
||||||
@@ -208,16 +220,29 @@ mkdir -p vendor
|
|||||||
|
|
||||||
# Build with or without prefix
|
# Build with or without prefix
|
||||||
if [[ -z "$OBITOOLS_PREFIX" ]] ; then
|
if [[ -z "$OBITOOLS_PREFIX" ]] ; then
|
||||||
make GOFLAGS="-buildvcs=false"
|
make -j"${JOBS}" obitools GOFLAGS="-buildvcs=false"
|
||||||
else
|
else
|
||||||
make GOFLAGS="-buildvcs=false" OBITOOLS_PREFIX="${OBITOOLS_PREFIX}"
|
make -j"${JOBS}" obitools GOFLAGS="-buildvcs=false" OBITOOLS_PREFIX="${OBITOOLS_PREFIX}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Install binaries
|
# Install binaries
|
||||||
echo "Installing binaries to ${INSTALL_DIR}/bin..." 1>&2
|
echo "Installing binaries to ${INSTALL_DIR}/bin..." 1>&2
|
||||||
(cp build/* "${INSTALL_DIR}/bin" 2> /dev/null) \
|
if ! cp build/* "${INSTALL_DIR}/bin" 2>/dev/null; then
|
||||||
|| (echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
|
if [ ! -w "${INSTALL_DIR}/bin" ]; then
|
||||||
sudo cp build/* "${INSTALL_DIR}/bin")
|
echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
|
||||||
|
sudo cp build/* "${INSTALL_DIR}/bin"
|
||||||
|
else
|
||||||
|
echo "Error: Could not copy binaries to ${INSTALL_DIR}/bin" 1>&2
|
||||||
|
echo " Source files: $(ls build/ 2>/dev/null || echo 'none found')" 1>&2
|
||||||
|
echo "" 1>&2
|
||||||
|
echo "The build directory has been preserved for manual recovery:" 1>&2
|
||||||
|
echo " $(pwd)/build/" 1>&2
|
||||||
|
echo "You can install manually with:" 1>&2
|
||||||
|
echo " cp $(pwd)/build/* ${INSTALL_DIR}/bin/" 1>&2
|
||||||
|
popd > /dev/null || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
popd > /dev/null || exit
|
popd > /dev/null || exit
|
||||||
|
|
||||||
|
|||||||
@@ -161,6 +161,149 @@ func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obise
|
|||||||
return parser
|
return parser
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// extractEmblSeq scans the sequence section of an EMBL record directly on the
|
||||||
|
// rope. EMBL sequence lines start with 5 spaces followed by bases in groups of
|
||||||
|
// 10, separated by spaces, with a position number at the end. The section ends
|
||||||
|
// with "//".
|
||||||
|
func (s *ropeScanner) extractEmblSeq(dest []byte, UtoT bool) []byte {
|
||||||
|
// We use ReadLine and scan each line for bases (skip digits, spaces, newlines).
|
||||||
|
for {
|
||||||
|
line := s.ReadLine()
|
||||||
|
if line == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if len(line) >= 2 && line[0] == '/' && line[1] == '/' {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// Lines start with 5 spaces; bases follow separated by single spaces.
|
||||||
|
// Digits at the end are the position counter — skip them.
|
||||||
|
// Simplest: take every byte that is a letter.
|
||||||
|
for _, b := range line {
|
||||||
|
if b >= 'A' && b <= 'Z' {
|
||||||
|
b += 'a' - 'A'
|
||||||
|
}
|
||||||
|
if UtoT && b == 'u' {
|
||||||
|
b = 't'
|
||||||
|
}
|
||||||
|
if b >= 'a' && b <= 'z' {
|
||||||
|
dest = append(dest, b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return dest
|
||||||
|
}
|
||||||
|
|
||||||
|
// EmblChunkParserRope parses an EMBL chunk directly from a rope without Pack().
|
||||||
|
func EmblChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||||
|
scanner := newRopeScanner(rope)
|
||||||
|
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||||
|
|
||||||
|
var id string
|
||||||
|
var scientificName string
|
||||||
|
defBytes := make([]byte, 0, 256)
|
||||||
|
featBytes := make([]byte, 0, 1024)
|
||||||
|
var taxid int
|
||||||
|
inSeq := false
|
||||||
|
|
||||||
|
for {
|
||||||
|
line := scanner.ReadLine()
|
||||||
|
if line == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
if inSeq {
|
||||||
|
// Should not happen — extractEmblSeq consumed up to "//"
|
||||||
|
inSeq = false
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case bytes.HasPrefix(line, []byte("ID ")):
|
||||||
|
id = string(bytes.SplitN(line[5:], []byte(";"), 2)[0])
|
||||||
|
case bytes.HasPrefix(line, []byte("OS ")):
|
||||||
|
scientificName = string(bytes.TrimSpace(line[5:]))
|
||||||
|
case bytes.HasPrefix(line, []byte("DE ")):
|
||||||
|
if len(defBytes) > 0 {
|
||||||
|
defBytes = append(defBytes, ' ')
|
||||||
|
}
|
||||||
|
defBytes = append(defBytes, bytes.TrimSpace(line[5:])...)
|
||||||
|
case withFeatureTable && bytes.HasPrefix(line, []byte("FH ")):
|
||||||
|
featBytes = append(featBytes, line...)
|
||||||
|
case withFeatureTable && bytes.Equal(line, []byte("FH")):
|
||||||
|
featBytes = append(featBytes, '\n')
|
||||||
|
featBytes = append(featBytes, line...)
|
||||||
|
case bytes.HasPrefix(line, []byte("FT ")):
|
||||||
|
if withFeatureTable {
|
||||||
|
featBytes = append(featBytes, '\n')
|
||||||
|
featBytes = append(featBytes, line...)
|
||||||
|
}
|
||||||
|
if bytes.HasPrefix(line, []byte(`FT /db_xref="taxon:`)) {
|
||||||
|
rest := line[37:]
|
||||||
|
end := bytes.IndexByte(rest, '"')
|
||||||
|
if end > 0 {
|
||||||
|
taxid, _ = strconv.Atoi(string(rest[:end]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case bytes.HasPrefix(line, []byte(" ")):
|
||||||
|
// First sequence line: extract all bases via extractEmblSeq,
|
||||||
|
// which also consumes this line's remaining content.
|
||||||
|
// But ReadLine already consumed this line — we need to process it
|
||||||
|
// plus subsequent lines. Process this line inline then call helper.
|
||||||
|
seqDest := make([]byte, 0, 4096)
|
||||||
|
for _, b := range line {
|
||||||
|
if b >= 'A' && b <= 'Z' {
|
||||||
|
b += 'a' - 'A'
|
||||||
|
}
|
||||||
|
if UtoT && b == 'u' {
|
||||||
|
b = 't'
|
||||||
|
}
|
||||||
|
if b >= 'a' && b <= 'z' {
|
||||||
|
seqDest = append(seqDest, b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
seqDest = scanner.extractEmblSeq(seqDest, UtoT)
|
||||||
|
|
||||||
|
seq := obiseq.NewBioSequenceOwning(id, seqDest, string(defBytes))
|
||||||
|
seq.SetSource(source)
|
||||||
|
if withFeatureTable {
|
||||||
|
seq.SetFeatures(featBytes)
|
||||||
|
}
|
||||||
|
annot := seq.Annotations()
|
||||||
|
annot["scientific_name"] = scientificName
|
||||||
|
annot["taxid"] = taxid
|
||||||
|
sequences = append(sequences, seq)
|
||||||
|
|
||||||
|
// Reset state
|
||||||
|
id = ""
|
||||||
|
scientificName = ""
|
||||||
|
defBytes = defBytes[:0]
|
||||||
|
featBytes = featBytes[:0]
|
||||||
|
taxid = 1
|
||||||
|
|
||||||
|
case bytes.Equal(line, []byte("//")):
|
||||||
|
// record ended without SQ/sequence section (e.g. WGS entries)
|
||||||
|
if id != "" {
|
||||||
|
seq := obiseq.NewBioSequenceOwning(id, []byte{}, string(defBytes))
|
||||||
|
seq.SetSource(source)
|
||||||
|
if withFeatureTable {
|
||||||
|
seq.SetFeatures(featBytes)
|
||||||
|
}
|
||||||
|
annot := seq.Annotations()
|
||||||
|
annot["scientific_name"] = scientificName
|
||||||
|
annot["taxid"] = taxid
|
||||||
|
sequences = append(sequences, seq)
|
||||||
|
}
|
||||||
|
id = ""
|
||||||
|
scientificName = ""
|
||||||
|
defBytes = defBytes[:0]
|
||||||
|
featBytes = featBytes[:0]
|
||||||
|
taxid = 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sequences, nil
|
||||||
|
}
|
||||||
|
|
||||||
func _ParseEmblFile(
|
func _ParseEmblFile(
|
||||||
input ChannelFileChunk,
|
input ChannelFileChunk,
|
||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
@@ -171,7 +314,14 @@ func _ParseEmblFile(
|
|||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
order := chunks.Order
|
order := chunks.Order
|
||||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
var sequences obiseq.BioSequenceSlice
|
||||||
|
var err error
|
||||||
|
|
||||||
|
if chunks.Rope != nil {
|
||||||
|
sequences, err = EmblChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
|
||||||
|
} else {
|
||||||
|
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err)
|
log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err)
|
||||||
@@ -196,6 +346,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
|
|||||||
1024*1024*128,
|
1024*1024*128,
|
||||||
EndOfLastFlatFileEntry,
|
EndOfLastFlatFileEntry,
|
||||||
"\nID ",
|
"\nID ",
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
|
|
||||||
newIter := obiiter.MakeIBioSequence()
|
newIter := obiiter.MakeIBioSequence()
|
||||||
|
|||||||
@@ -209,28 +209,121 @@ func FastaChunkParser(UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlic
|
|||||||
return parser
|
return parser
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// extractFastaSeq scans sequence bytes from the rope directly into dest,
|
||||||
|
// appending valid nucleotide characters and skipping whitespace.
|
||||||
|
// Stops when '>' is found at the start of a line (next record) or at EOF.
|
||||||
|
// Returns (dest with appended bases, hasMore).
|
||||||
|
// hasMore=true means scanner is now positioned at '>' of the next record.
|
||||||
|
func (s *ropeScanner) extractFastaSeq(dest []byte, UtoT bool) ([]byte, bool) {
|
||||||
|
lineStart := true
|
||||||
|
|
||||||
|
for s.current != nil {
|
||||||
|
data := s.current.data[s.pos:]
|
||||||
|
for i, b := range data {
|
||||||
|
if lineStart && b == '>' {
|
||||||
|
s.pos += i
|
||||||
|
if s.pos >= len(s.current.data) {
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
return dest, true
|
||||||
|
}
|
||||||
|
if b == '\n' || b == '\r' {
|
||||||
|
lineStart = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
lineStart = false
|
||||||
|
if b == ' ' || b == '\t' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if b >= 'A' && b <= 'Z' {
|
||||||
|
b += 'a' - 'A'
|
||||||
|
}
|
||||||
|
if UtoT && b == 'u' {
|
||||||
|
b = 't'
|
||||||
|
}
|
||||||
|
dest = append(dest, b)
|
||||||
|
}
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
return dest, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// FastaChunkParserRope parses a FASTA chunk directly from the rope without Pack().
|
||||||
|
func FastaChunkParserRope(source string, rope *PieceOfChunk, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||||
|
scanner := newRopeScanner(rope)
|
||||||
|
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||||
|
|
||||||
|
for {
|
||||||
|
bline := scanner.ReadLine()
|
||||||
|
if bline == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if len(bline) == 0 || bline[0] != '>' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse header: ">id definition"
|
||||||
|
header := bline[1:]
|
||||||
|
var id string
|
||||||
|
var definition string
|
||||||
|
sp := bytes.IndexByte(header, ' ')
|
||||||
|
if sp < 0 {
|
||||||
|
sp = bytes.IndexByte(header, '\t')
|
||||||
|
}
|
||||||
|
if sp < 0 {
|
||||||
|
id = string(header)
|
||||||
|
} else {
|
||||||
|
id = string(header[:sp])
|
||||||
|
definition = string(bytes.TrimSpace(header[sp+1:]))
|
||||||
|
}
|
||||||
|
|
||||||
|
seqDest := make([]byte, 0, 4096)
|
||||||
|
var hasMore bool
|
||||||
|
seqDest, hasMore = scanner.extractFastaSeq(seqDest, UtoT)
|
||||||
|
|
||||||
|
if len(seqDest) == 0 {
|
||||||
|
log.Fatalf("%s [%s]: sequence is empty", source, id)
|
||||||
|
}
|
||||||
|
|
||||||
|
seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
|
||||||
|
seq.SetSource(source)
|
||||||
|
sequences = append(sequences, seq)
|
||||||
|
|
||||||
|
if !hasMore {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sequences, nil
|
||||||
|
}
|
||||||
|
|
||||||
func _ParseFastaFile(
|
func _ParseFastaFile(
|
||||||
input ChannelFileChunk,
|
input ChannelFileChunk,
|
||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
UtoT bool,
|
UtoT bool,
|
||||||
) {
|
) {
|
||||||
|
|
||||||
parser := FastaChunkParser(UtoT)
|
parser := FastaChunkParser(UtoT)
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
var sequences obiseq.BioSequenceSlice
|
||||||
// obilog.Warnf("Chunck(%d:%d) -%d- ", chunks.Order, l, sequences.Len())
|
var err error
|
||||||
|
|
||||||
|
if chunks.Rope != nil {
|
||||||
|
sequences, err = FastaChunkParserRope(chunks.Source, chunks.Rope, UtoT)
|
||||||
|
} else {
|
||||||
|
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("File %s : Cannot parse the fasta file : %v", chunks.Source, err)
|
log.Fatalf("File %s : Cannot parse the fasta file : %v", chunks.Source, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences))
|
out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
out.Done()
|
out.Done()
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
@@ -245,6 +338,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
|||||||
1024*1024,
|
1024*1024,
|
||||||
EndOfLastFastaEntry,
|
EndOfLastFastaEntry,
|
||||||
"\n>",
|
"\n>",
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
|
|
||||||
for i := 0; i < nworker; i++ {
|
for i := 0; i < nworker; i++ {
|
||||||
|
|||||||
@@ -303,6 +303,80 @@ func FastqChunkParser(quality_shift byte, with_quality bool, UtoT bool) func(str
|
|||||||
return parser
|
return parser
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FastqChunkParserRope parses a FASTQ chunk directly from a rope without Pack().
|
||||||
|
func FastqChunkParserRope(source string, rope *PieceOfChunk, quality_shift byte, with_quality, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||||
|
scanner := newRopeScanner(rope)
|
||||||
|
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||||
|
|
||||||
|
for {
|
||||||
|
// Line 1: @id [definition]
|
||||||
|
hline := scanner.ReadLine()
|
||||||
|
if hline == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if len(hline) == 0 || hline[0] != '@' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
header := hline[1:]
|
||||||
|
var id string
|
||||||
|
var definition string
|
||||||
|
sp := bytes.IndexByte(header, ' ')
|
||||||
|
if sp < 0 {
|
||||||
|
sp = bytes.IndexByte(header, '\t')
|
||||||
|
}
|
||||||
|
if sp < 0 {
|
||||||
|
id = string(header)
|
||||||
|
} else {
|
||||||
|
id = string(header[:sp])
|
||||||
|
definition = string(bytes.TrimSpace(header[sp+1:]))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Line 2: sequence
|
||||||
|
sline := scanner.ReadLine()
|
||||||
|
if sline == nil {
|
||||||
|
log.Fatalf("@%s[%s]: unexpected EOF after header", id, source)
|
||||||
|
}
|
||||||
|
seqDest := make([]byte, len(sline))
|
||||||
|
w := 0
|
||||||
|
for _, b := range sline {
|
||||||
|
if b >= 'A' && b <= 'Z' {
|
||||||
|
b += 'a' - 'A'
|
||||||
|
}
|
||||||
|
if UtoT && b == 'u' {
|
||||||
|
b = 't'
|
||||||
|
}
|
||||||
|
seqDest[w] = b
|
||||||
|
w++
|
||||||
|
}
|
||||||
|
seqDest = seqDest[:w]
|
||||||
|
if len(seqDest) == 0 {
|
||||||
|
log.Fatalf("@%s[%s]: sequence is empty", id, source)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Line 3: + (skip)
|
||||||
|
scanner.ReadLine()
|
||||||
|
|
||||||
|
// Line 4: quality
|
||||||
|
qline := scanner.ReadLine()
|
||||||
|
|
||||||
|
seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
|
||||||
|
seq.SetSource(source)
|
||||||
|
|
||||||
|
if with_quality && qline != nil {
|
||||||
|
qDest := make([]byte, len(qline))
|
||||||
|
copy(qDest, qline)
|
||||||
|
for i := range qDest {
|
||||||
|
qDest[i] -= quality_shift
|
||||||
|
}
|
||||||
|
seq.TakeQualities(qDest)
|
||||||
|
}
|
||||||
|
|
||||||
|
sequences = append(sequences, seq)
|
||||||
|
}
|
||||||
|
|
||||||
|
return sequences, nil
|
||||||
|
}
|
||||||
|
|
||||||
func _ParseFastqFile(
|
func _ParseFastqFile(
|
||||||
input ChannelFileChunk,
|
input ChannelFileChunk,
|
||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
@@ -313,7 +387,14 @@ func _ParseFastqFile(
|
|||||||
parser := FastqChunkParser(quality_shift, with_quality, UtoT)
|
parser := FastqChunkParser(quality_shift, with_quality, UtoT)
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
var sequences obiseq.BioSequenceSlice
|
||||||
|
var err error
|
||||||
|
|
||||||
|
if chunks.Rope != nil {
|
||||||
|
sequences, err = FastqChunkParserRope(chunks.Source, chunks.Rope, quality_shift, with_quality, UtoT)
|
||||||
|
} else {
|
||||||
|
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("File %s : Cannot parse the fastq file : %v", chunks.Source, err)
|
log.Fatalf("File %s : Cannot parse the fastq file : %v", chunks.Source, err)
|
||||||
@@ -339,6 +420,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
|||||||
1024*1024,
|
1024*1024,
|
||||||
EndOfLastFastqEntry,
|
EndOfLastFastqEntry,
|
||||||
"\n@",
|
"\n@",
|
||||||
|
false,
|
||||||
)
|
)
|
||||||
|
|
||||||
for i := 0; i < nworker; i++ {
|
for i := 0; i < nworker; i++ {
|
||||||
|
|||||||
@@ -296,7 +296,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
|||||||
|
|
||||||
case strings.HasSuffix(skey, "_taxid"):
|
case strings.HasSuffix(skey, "_taxid"):
|
||||||
if dataType == jsonparser.Number || dataType == jsonparser.String {
|
if dataType == jsonparser.Number || dataType == jsonparser.String {
|
||||||
rank, _ := obiutils.SplitInTwo(skey, '_')
|
rank := skey[:len(skey)-len("_taxid")]
|
||||||
|
|
||||||
taxid := string(value)
|
taxid := string(value)
|
||||||
sequence.SetTaxid(taxid, rank)
|
sequence.SetTaxid(taxid, rank)
|
||||||
|
|||||||
@@ -77,45 +77,47 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string {
|
|||||||
//
|
//
|
||||||
// It returns a byte array containing the formatted sequences.
|
// It returns a byte array containing the formatted sequences.
|
||||||
func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
|
func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
|
||||||
// Create a buffer to store the formatted sequences
|
|
||||||
var bs bytes.Buffer
|
var bs bytes.Buffer
|
||||||
|
|
||||||
lt := 0
|
lt := 0
|
||||||
|
|
||||||
for _, seq := range batch.Slice() {
|
for _, seq := range batch.Slice() {
|
||||||
lt += seq.Len()
|
lt += seq.Len()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Iterate over each sequence in the batch
|
// Pre-allocate: sequence data + newlines every 60 chars + ~100 bytes header per sequence
|
||||||
|
bs.Grow(lt + lt/60 + 100*batch.Len() + 1)
|
||||||
|
|
||||||
log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
|
log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
|
||||||
first := true
|
|
||||||
for _, seq := range batch.Slice() {
|
for _, seq := range batch.Slice() {
|
||||||
// Check if the sequence is empty
|
|
||||||
if seq.Len() > 0 {
|
if seq.Len() > 0 {
|
||||||
// Format the sequence using the provided formater function
|
// Write header directly into bs — no intermediate string
|
||||||
formattedSeq := FormatFasta(seq, formater)
|
bs.WriteByte('>')
|
||||||
|
bs.WriteString(seq.Id())
|
||||||
if first {
|
bs.WriteByte(' ')
|
||||||
bs.Grow(lt + (len(formattedSeq)-seq.Len())*batch.Len()*5/4)
|
bs.WriteString(formater(seq))
|
||||||
first = false
|
|
||||||
}
|
|
||||||
|
|
||||||
// Append the formatted sequence to the buffer
|
|
||||||
bs.WriteString(formattedSeq)
|
|
||||||
bs.WriteByte('\n')
|
bs.WriteByte('\n')
|
||||||
|
|
||||||
|
// Write folded sequence directly into bs — no copies
|
||||||
|
s := seq.Sequence()
|
||||||
|
l := len(s)
|
||||||
|
for i := 0; i < l; i += 60 {
|
||||||
|
to := i + 60
|
||||||
|
if to > l {
|
||||||
|
to = l
|
||||||
|
}
|
||||||
|
bs.Write(s[i:to])
|
||||||
|
bs.WriteByte('\n')
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// Handle empty sequences
|
|
||||||
if skipEmpty {
|
if skipEmpty {
|
||||||
// Skip empty sequences if skipEmpty is true
|
|
||||||
obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id())
|
obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id())
|
||||||
} else {
|
} else {
|
||||||
// Terminate the program if skipEmpty is false
|
|
||||||
log.Fatalf("Sequence %s is empty", seq.Id())
|
log.Fatalf("Sequence %s is empty", seq.Id())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the byte array representation of the buffer
|
|
||||||
return &bs
|
return &bs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ type SeqFileChunkParser func(string, io.Reader) (obiseq.BioSequenceSlice, error)
|
|||||||
type FileChunk struct {
|
type FileChunk struct {
|
||||||
Source string
|
Source string
|
||||||
Raw *bytes.Buffer
|
Raw *bytes.Buffer
|
||||||
|
Rope *PieceOfChunk
|
||||||
Order int
|
Order int
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -97,11 +98,17 @@ func (piece *PieceOfChunk) IsLast() bool {
|
|||||||
return piece.next == nil
|
return piece.next == nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (piece *PieceOfChunk) FileChunk(source string, order int) FileChunk {
|
func (piece *PieceOfChunk) FileChunk(source string, order int, pack bool) FileChunk {
|
||||||
piece.Pack()
|
piece = piece.Head()
|
||||||
|
var raw *bytes.Buffer
|
||||||
|
if pack {
|
||||||
|
piece.Pack()
|
||||||
|
raw = bytes.NewBuffer(piece.data)
|
||||||
|
}
|
||||||
return FileChunk{
|
return FileChunk{
|
||||||
Source: source,
|
Source: source,
|
||||||
Raw: bytes.NewBuffer(piece.data),
|
Raw: raw,
|
||||||
|
Rope: piece,
|
||||||
Order: order,
|
Order: order,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -133,7 +140,8 @@ func ReadFileChunk(
|
|||||||
reader io.Reader,
|
reader io.Reader,
|
||||||
fileChunkSize int,
|
fileChunkSize int,
|
||||||
splitter LastSeqRecord,
|
splitter LastSeqRecord,
|
||||||
probe string) ChannelFileChunk {
|
probe string,
|
||||||
|
pack bool) ChannelFileChunk {
|
||||||
|
|
||||||
chunk_channel := make(ChannelFileChunk)
|
chunk_channel := make(ChannelFileChunk)
|
||||||
|
|
||||||
@@ -205,7 +213,7 @@ func ReadFileChunk(
|
|||||||
|
|
||||||
if len(pieces.data) > 0 {
|
if len(pieces.data) > 0 {
|
||||||
// obilog.Warnf("chuck %d :Read %d bytes from file %s", i, io.Len(), source)
|
// obilog.Warnf("chuck %d :Read %d bytes from file %s", i, io.Len(), source)
|
||||||
chunk_channel <- pieces.FileChunk(source, i)
|
chunk_channel <- pieces.FileChunk(source, i, pack)
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -222,7 +230,7 @@ func ReadFileChunk(
|
|||||||
|
|
||||||
// Send the last chunk to the channel
|
// Send the last chunk to the channel
|
||||||
if pieces.Len() > 0 {
|
if pieces.Len() > 0 {
|
||||||
chunk_channel <- pieces.FileChunk(source, i)
|
chunk_channel <- pieces.FileChunk(source, i, pack)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close the readers channel when the end of the file is reached
|
// Close the readers channel when the end of the file is reached
|
||||||
|
|||||||
@@ -29,6 +29,265 @@ const (
|
|||||||
|
|
||||||
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
|
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
|
||||||
|
|
||||||
|
// extractSequence scans the ORIGIN section byte-by-byte directly on the rope,
|
||||||
|
// appending compacted bases to dest. Returns the extended slice.
|
||||||
|
// Stops and returns when "//" is found at the start of a line.
|
||||||
|
// The scanner is left positioned after the "//" line.
|
||||||
|
func (s *ropeScanner) extractSequence(dest []byte, UtoT bool) []byte {
|
||||||
|
lineStart := true
|
||||||
|
skipDigits := true
|
||||||
|
|
||||||
|
for s.current != nil {
|
||||||
|
data := s.current.data[s.pos:]
|
||||||
|
for i, b := range data {
|
||||||
|
if lineStart {
|
||||||
|
if b == '/' {
|
||||||
|
// End-of-record marker "//"
|
||||||
|
s.pos += i + 1
|
||||||
|
if s.pos >= len(s.current.data) {
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
s.skipToNewline()
|
||||||
|
return dest
|
||||||
|
}
|
||||||
|
lineStart = false
|
||||||
|
skipDigits = true
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case b == '\n':
|
||||||
|
lineStart = true
|
||||||
|
case b == '\r':
|
||||||
|
// skip
|
||||||
|
case skipDigits:
|
||||||
|
if b != ' ' && (b < '0' || b > '9') {
|
||||||
|
skipDigits = false
|
||||||
|
if UtoT && b == 'u' {
|
||||||
|
b = 't'
|
||||||
|
}
|
||||||
|
dest = append(dest, b)
|
||||||
|
}
|
||||||
|
case b != ' ':
|
||||||
|
if UtoT && b == 'u' {
|
||||||
|
b = 't'
|
||||||
|
}
|
||||||
|
dest = append(dest, b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
return dest
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseLseqFromLocus extracts the declared sequence length from a LOCUS line.
|
||||||
|
// Format: "LOCUS <id> <length> bp ..."
|
||||||
|
// Returns -1 if not found or parse error.
|
||||||
|
func parseLseqFromLocus(line []byte) int {
|
||||||
|
if len(line) < 13 {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
i := 12
|
||||||
|
for i < len(line) && line[i] != ' ' {
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
for i < len(line) && line[i] == ' ' {
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
start := i
|
||||||
|
for i < len(line) && line[i] >= '0' && line[i] <= '9' {
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
if i == start {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
n, err := strconv.Atoi(string(line[start:i]))
|
||||||
|
if err != nil {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prefix constants for GenBank section headers (byte slices for zero-alloc comparison).
|
||||||
|
var (
|
||||||
|
gbPfxLocus = []byte("LOCUS ")
|
||||||
|
gbPfxDefinition = []byte("DEFINITION ")
|
||||||
|
gbPfxContinue = []byte(" ")
|
||||||
|
gbPfxSource = []byte("SOURCE ")
|
||||||
|
gbPfxFeatures = []byte("FEATURES ")
|
||||||
|
gbPfxOrigin = []byte("ORIGIN")
|
||||||
|
gbPfxContig = []byte("CONTIG")
|
||||||
|
gbPfxEnd = []byte("//")
|
||||||
|
gbPfxDbXref = []byte(` /db_xref="taxon:`)
|
||||||
|
)
|
||||||
|
|
||||||
|
// GenbankChunkParserRope parses a GenBank FileChunk directly from the rope
|
||||||
|
// (PieceOfChunk linked list) without calling Pack(). This eliminates the large
|
||||||
|
// contiguous allocation required for chromosomal-scale sequences.
|
||||||
|
func GenbankChunkParserRope(source string, rope *PieceOfChunk,
|
||||||
|
withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||||
|
|
||||||
|
state := inHeader
|
||||||
|
scanner := newRopeScanner(rope)
|
||||||
|
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||||
|
|
||||||
|
id := ""
|
||||||
|
lseq := -1
|
||||||
|
scientificName := ""
|
||||||
|
defBytes := new(bytes.Buffer)
|
||||||
|
featBytes := new(bytes.Buffer)
|
||||||
|
var seqDest []byte
|
||||||
|
taxid := 1
|
||||||
|
nl := 0
|
||||||
|
|
||||||
|
for bline := scanner.ReadLine(); bline != nil; bline = scanner.ReadLine() {
|
||||||
|
nl++
|
||||||
|
processed := false
|
||||||
|
for !processed {
|
||||||
|
switch {
|
||||||
|
|
||||||
|
case bytes.HasPrefix(bline, gbPfxLocus):
|
||||||
|
if state != inHeader {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading LOCUS: %s", nl, state, bline)
|
||||||
|
}
|
||||||
|
rest := bline[12:]
|
||||||
|
sp := bytes.IndexByte(rest, ' ')
|
||||||
|
if sp < 0 {
|
||||||
|
id = string(rest)
|
||||||
|
} else {
|
||||||
|
id = string(rest[:sp])
|
||||||
|
}
|
||||||
|
lseq = parseLseqFromLocus(bline)
|
||||||
|
cap0 := lseq + 20
|
||||||
|
if cap0 < 1024 {
|
||||||
|
cap0 = 1024
|
||||||
|
}
|
||||||
|
seqDest = make([]byte, 0, cap0)
|
||||||
|
state = inEntry
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
case bytes.HasPrefix(bline, gbPfxDefinition):
|
||||||
|
if state != inEntry {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading DEFINITION: %s", nl, state, bline)
|
||||||
|
}
|
||||||
|
defBytes.Write(bytes.TrimSpace(bline[12:]))
|
||||||
|
state = inDefinition
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
case state == inDefinition:
|
||||||
|
if bytes.HasPrefix(bline, gbPfxContinue) {
|
||||||
|
defBytes.WriteByte(' ')
|
||||||
|
defBytes.Write(bytes.TrimSpace(bline[12:]))
|
||||||
|
processed = true
|
||||||
|
} else {
|
||||||
|
state = inEntry
|
||||||
|
}
|
||||||
|
|
||||||
|
case bytes.HasPrefix(bline, gbPfxSource):
|
||||||
|
if state != inEntry {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading SOURCE: %s", nl, state, bline)
|
||||||
|
}
|
||||||
|
scientificName = string(bytes.TrimSpace(bline[12:]))
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
case bytes.HasPrefix(bline, gbPfxFeatures):
|
||||||
|
if state != inEntry {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading FEATURES: %s", nl, state, bline)
|
||||||
|
}
|
||||||
|
if withFeatureTable {
|
||||||
|
featBytes.Write(bline)
|
||||||
|
}
|
||||||
|
state = inFeature
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
case bytes.HasPrefix(bline, gbPfxOrigin):
|
||||||
|
if state != inFeature && state != inContig {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading ORIGIN: %s", nl, state, bline)
|
||||||
|
}
|
||||||
|
// Use fast byte-scan to extract sequence and consume through "//"
|
||||||
|
seqDest = scanner.extractSequence(seqDest, UtoT)
|
||||||
|
// Emit record
|
||||||
|
if id == "" {
|
||||||
|
log.Warn("Empty id when parsing genbank file")
|
||||||
|
}
|
||||||
|
sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
|
||||||
|
sequence.SetSource(source)
|
||||||
|
if withFeatureTable {
|
||||||
|
sequence.SetFeatures(featBytes.Bytes())
|
||||||
|
}
|
||||||
|
annot := sequence.Annotations()
|
||||||
|
annot["scientific_name"] = scientificName
|
||||||
|
annot["taxid"] = taxid
|
||||||
|
sequences = append(sequences, sequence)
|
||||||
|
|
||||||
|
defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
|
||||||
|
featBytes = new(bytes.Buffer)
|
||||||
|
nl = 0
|
||||||
|
taxid = 1
|
||||||
|
seqDest = nil
|
||||||
|
state = inHeader
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
case bytes.HasPrefix(bline, gbPfxContig):
|
||||||
|
if state != inFeature && state != inContig {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading CONTIG: %s", nl, state, bline)
|
||||||
|
}
|
||||||
|
state = inContig
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
case bytes.Equal(bline, gbPfxEnd):
|
||||||
|
// Reached for CONTIG records (no ORIGIN section)
|
||||||
|
if state != inContig {
|
||||||
|
log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
|
||||||
|
}
|
||||||
|
if id == "" {
|
||||||
|
log.Warn("Empty id when parsing genbank file")
|
||||||
|
}
|
||||||
|
sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
|
||||||
|
sequence.SetSource(source)
|
||||||
|
if withFeatureTable {
|
||||||
|
sequence.SetFeatures(featBytes.Bytes())
|
||||||
|
}
|
||||||
|
annot := sequence.Annotations()
|
||||||
|
annot["scientific_name"] = scientificName
|
||||||
|
annot["taxid"] = taxid
|
||||||
|
sequences = append(sequences, sequence)
|
||||||
|
|
||||||
|
defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
|
||||||
|
featBytes = new(bytes.Buffer)
|
||||||
|
nl = 0
|
||||||
|
taxid = 1
|
||||||
|
seqDest = nil
|
||||||
|
state = inHeader
|
||||||
|
processed = true
|
||||||
|
|
||||||
|
default:
|
||||||
|
switch state {
|
||||||
|
case inFeature:
|
||||||
|
if withFeatureTable {
|
||||||
|
featBytes.WriteByte('\n')
|
||||||
|
featBytes.Write(bline)
|
||||||
|
}
|
||||||
|
if bytes.HasPrefix(bline, gbPfxDbXref) {
|
||||||
|
rest := bline[len(gbPfxDbXref):]
|
||||||
|
q := bytes.IndexByte(rest, '"')
|
||||||
|
if q >= 0 {
|
||||||
|
taxid, _ = strconv.Atoi(string(rest[:q]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
processed = true
|
||||||
|
case inHeader, inEntry, inContig:
|
||||||
|
processed = true
|
||||||
|
default:
|
||||||
|
log.Fatalf("Unexpected state %d while reading: %s", state, bline)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sequences, nil
|
||||||
|
}
|
||||||
|
|
||||||
func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||||
return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||||
state := inHeader
|
state := inHeader
|
||||||
@@ -125,13 +384,10 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
|
|||||||
if state != inSequence && state != inContig {
|
if state != inSequence && state != inContig {
|
||||||
log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
|
log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
|
||||||
}
|
}
|
||||||
// log.Debugln("Total lines := ", nl)
|
|
||||||
if id == "" {
|
if id == "" {
|
||||||
log.Warn("Empty id when parsing genbank file")
|
log.Warn("Empty id when parsing genbank file")
|
||||||
}
|
}
|
||||||
|
|
||||||
// log.Debugf("End of sequence %s: %dbp ", id, seqBytes.Len())
|
|
||||||
|
|
||||||
sequence := obiseq.NewBioSequence(id,
|
sequence := obiseq.NewBioSequence(id,
|
||||||
seqBytes.Bytes(),
|
seqBytes.Bytes(),
|
||||||
defBytes.String())
|
defBytes.String())
|
||||||
@@ -144,9 +400,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
|
|||||||
annot := sequence.Annotations()
|
annot := sequence.Annotations()
|
||||||
annot["scientific_name"] = scientificName
|
annot["scientific_name"] = scientificName
|
||||||
annot["taxid"] = taxid
|
annot["taxid"] = taxid
|
||||||
// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))
|
|
||||||
// log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(),
|
|
||||||
// sequence.Len(), seqBytes.Len())
|
|
||||||
|
|
||||||
sequences = append(sequences, sequence)
|
sequences = append(sequences, sequence)
|
||||||
|
|
||||||
@@ -159,12 +412,11 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
|
|||||||
processed = true
|
processed = true
|
||||||
|
|
||||||
case state == inSequence:
|
case state == inSequence:
|
||||||
// log.Debugf("Chunk %d : Genbank: line %d, state = %d : %s", chunks.order, nl, state, line)
|
|
||||||
|
|
||||||
sl++
|
sl++
|
||||||
parts := strings.SplitN(line[10:], " ", 6)
|
cleanline := strings.TrimSpace(line)
|
||||||
|
parts := strings.SplitN(cleanline, " ", 7)
|
||||||
lparts := len(parts)
|
lparts := len(parts)
|
||||||
for i := 0; i < lparts; i++ {
|
for i := 1; i < lparts; i++ {
|
||||||
if UtoT {
|
if UtoT {
|
||||||
parts[i] = strings.ReplaceAll(parts[i], "u", "t")
|
parts[i] = strings.ReplaceAll(parts[i], "u", "t")
|
||||||
}
|
}
|
||||||
@@ -197,6 +449,7 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_ = sl
|
||||||
return sequences, nil
|
return sequences, nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -205,10 +458,16 @@ func _ParseGenbankFile(input ChannelFileChunk,
|
|||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
withFeatureTable, UtoT bool) {
|
withFeatureTable, UtoT bool) {
|
||||||
|
|
||||||
parser := GenbankChunkParser(withFeatureTable, UtoT)
|
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
var sequences obiseq.BioSequenceSlice
|
||||||
|
var err error
|
||||||
|
|
||||||
|
if chunks.Rope != nil {
|
||||||
|
sequences, err = GenbankChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
|
||||||
|
} else {
|
||||||
|
parser := GenbankChunkParser(withFeatureTable, UtoT)
|
||||||
|
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("File %s : Cannot parse the genbank file : %v", chunks.Source, err)
|
log.Fatalf("File %s : Cannot parse the genbank file : %v", chunks.Source, err)
|
||||||
@@ -224,7 +483,6 @@ func _ParseGenbankFile(input ChannelFileChunk,
|
|||||||
|
|
||||||
func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
opt := MakeOptions(options)
|
opt := MakeOptions(options)
|
||||||
// entry_channel := make(chan _FileChunk)
|
|
||||||
|
|
||||||
entry_channel := ReadFileChunk(
|
entry_channel := ReadFileChunk(
|
||||||
opt.Source(),
|
opt.Source(),
|
||||||
@@ -232,13 +490,13 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
|
|||||||
1024*1024*128,
|
1024*1024*128,
|
||||||
EndOfLastFlatFileEntry,
|
EndOfLastFlatFileEntry,
|
||||||
"\nLOCUS ",
|
"\nLOCUS ",
|
||||||
|
false, // do not pack: rope-based parser avoids contiguous allocation
|
||||||
)
|
)
|
||||||
|
|
||||||
newIter := obiiter.MakeIBioSequence()
|
newIter := obiiter.MakeIBioSequence()
|
||||||
|
|
||||||
nworkers := opt.ParallelWorkers()
|
nworkers := opt.ParallelWorkers()
|
||||||
|
|
||||||
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
|
||||||
for j := 0; j < nworkers; j++ {
|
for j := 0; j < nworkers; j++ {
|
||||||
newIter.Add(1)
|
newIter.Add(1)
|
||||||
go _ParseGenbankFile(
|
go _ParseGenbankFile(
|
||||||
@@ -249,8 +507,6 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
// go _ReadFlatFileChunk(reader, entry_channel)
|
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
newIter.WaitAndClose()
|
newIter.WaitAndClose()
|
||||||
log.Debug("End of the genbank file ", opt.Source())
|
log.Debug("End of the genbank file ", opt.Source())
|
||||||
|
|||||||
77
pkg/obiformats/rope_scanner.go
Normal file
77
pkg/obiformats/rope_scanner.go
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
package obiformats
|
||||||
|
|
||||||
|
import "bytes"
|
||||||
|
|
||||||
|
// ropeScanner reads lines from a PieceOfChunk rope.
|
||||||
|
// The carry buffer handles lines that span two rope nodes; it grows as needed.
|
||||||
|
type ropeScanner struct {
|
||||||
|
current *PieceOfChunk
|
||||||
|
pos int
|
||||||
|
carry []byte
|
||||||
|
}
|
||||||
|
|
||||||
|
func newRopeScanner(rope *PieceOfChunk) *ropeScanner {
|
||||||
|
return &ropeScanner{current: rope}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReadLine returns the next line without the trailing \n (or \r\n).
|
||||||
|
// Returns nil at end of rope. The returned slice aliases carry[] or the node
|
||||||
|
// data and is valid only until the next ReadLine call.
|
||||||
|
func (s *ropeScanner) ReadLine() []byte {
|
||||||
|
for {
|
||||||
|
if s.current == nil {
|
||||||
|
if len(s.carry) > 0 {
|
||||||
|
line := s.carry
|
||||||
|
s.carry = s.carry[:0]
|
||||||
|
return line
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
data := s.current.data[s.pos:]
|
||||||
|
idx := bytes.IndexByte(data, '\n')
|
||||||
|
|
||||||
|
if idx >= 0 {
|
||||||
|
var line []byte
|
||||||
|
if len(s.carry) == 0 {
|
||||||
|
line = data[:idx]
|
||||||
|
} else {
|
||||||
|
s.carry = append(s.carry, data[:idx]...)
|
||||||
|
line = s.carry
|
||||||
|
s.carry = s.carry[:0]
|
||||||
|
}
|
||||||
|
s.pos += idx + 1
|
||||||
|
if s.pos >= len(s.current.data) {
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
if len(line) > 0 && line[len(line)-1] == '\r' {
|
||||||
|
line = line[:len(line)-1]
|
||||||
|
}
|
||||||
|
return line
|
||||||
|
}
|
||||||
|
|
||||||
|
// No \n in this node: accumulate into carry and advance
|
||||||
|
s.carry = append(s.carry, data...)
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// skipToNewline advances the scanner past the next '\n'.
|
||||||
|
func (s *ropeScanner) skipToNewline() {
|
||||||
|
for s.current != nil {
|
||||||
|
data := s.current.data[s.pos:]
|
||||||
|
idx := bytes.IndexByte(data, '\n')
|
||||||
|
if idx >= 0 {
|
||||||
|
s.pos += idx + 1
|
||||||
|
if s.pos >= len(s.current.data) {
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.current = s.current.Next()
|
||||||
|
s.pos = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -3,7 +3,7 @@ package obioptions
|
|||||||
// Version is automatically updated by the Makefile from version.txt
|
// Version is automatically updated by the Makefile from version.txt
|
||||||
// The patch number (third digit) is incremented on each push to the repository
|
// The patch number (third digit) is incremented on each push to the repository
|
||||||
|
|
||||||
var _Version = "Release 4.4.13"
|
var _Version = "Release 4.4.21"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -120,6 +120,19 @@ func NewBioSequence(id string,
|
|||||||
return bs
|
return bs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NewBioSequenceOwning creates a BioSequence taking ownership of the sequence
|
||||||
|
// slice without copying it. The caller must not use the slice after this call.
|
||||||
|
// Use this when the slice was allocated specifically for this sequence.
|
||||||
|
func NewBioSequenceOwning(id string,
|
||||||
|
sequence []byte,
|
||||||
|
definition string) *BioSequence {
|
||||||
|
bs := NewEmptyBioSequence(0)
|
||||||
|
bs.SetId(id)
|
||||||
|
bs.TakeSequence(sequence)
|
||||||
|
bs.SetDefinition(definition)
|
||||||
|
return bs
|
||||||
|
}
|
||||||
|
|
||||||
// NewBioSequenceWithQualities creates a new BioSequence object with the given id, sequence, definition, and qualities.
|
// NewBioSequenceWithQualities creates a new BioSequence object with the given id, sequence, definition, and qualities.
|
||||||
//
|
//
|
||||||
// Parameters:
|
// Parameters:
|
||||||
@@ -444,6 +457,12 @@ func (s *BioSequence) SetSequence(sequence []byte) {
|
|||||||
s.sequence = obiutils.InPlaceToLower(CopySlice(sequence))
|
s.sequence = obiutils.InPlaceToLower(CopySlice(sequence))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TakeSequence stores the slice directly without copying, then lowercases in-place.
|
||||||
|
// The caller must not use the slice after this call.
|
||||||
|
func (s *BioSequence) TakeSequence(sequence []byte) {
|
||||||
|
s.sequence = obiutils.InPlaceToLower(sequence)
|
||||||
|
}
|
||||||
|
|
||||||
func (s *BioSequence) HasValidSequence() bool {
|
func (s *BioSequence) HasValidSequence() bool {
|
||||||
for _, c := range s.sequence {
|
for _, c := range s.sequence {
|
||||||
if !((c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '[' || c == ']') {
|
if !((c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '[' || c == ']') {
|
||||||
@@ -461,6 +480,15 @@ func (s *BioSequence) SetQualities(qualities Quality) {
|
|||||||
s.qualities = CopySlice(qualities)
|
s.qualities = CopySlice(qualities)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TakeQualities stores the slice directly without copying.
|
||||||
|
// The caller must not use the slice after this call.
|
||||||
|
func (s *BioSequence) TakeQualities(qualities Quality) {
|
||||||
|
if s.qualities != nil {
|
||||||
|
RecycleSlice(&s.qualities)
|
||||||
|
}
|
||||||
|
s.qualities = qualities
|
||||||
|
}
|
||||||
|
|
||||||
// A method that appends a byte slice to the qualities of the BioSequence.
|
// A method that appends a byte slice to the qualities of the BioSequence.
|
||||||
func (s *BioSequence) WriteQualities(data []byte) (int, error) {
|
func (s *BioSequence) WriteQualities(data []byte) (int, error) {
|
||||||
s.qualities = append(s.qualities, data...)
|
s.qualities = append(s.qualities, data...)
|
||||||
|
|||||||
@@ -195,7 +195,7 @@ func (s *BioSequenceSlice) ExtractTaxonomy(taxonomy *obitax.Taxonomy, seqAsTaxa
|
|||||||
return nil, fmt.Errorf("sequence %v has no path", s.Id())
|
return nil, fmt.Errorf("sequence %v has no path", s.Id())
|
||||||
}
|
}
|
||||||
last := path[len(path)-1]
|
last := path[len(path)-1]
|
||||||
taxname, _ := obiutils.SplitInTwo(last, ':')
|
taxname, _ := obiutils.LeftSplitInTwo(last, ':')
|
||||||
if idx, ok := s.GetIntAttribute("seq_number"); !ok {
|
if idx, ok := s.GetIntAttribute("seq_number"); !ok {
|
||||||
return nil, errors.New("sequences are not numbered")
|
return nil, errors.New("sequences are not numbered")
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ func NewTaxidFactory(code string, alphabet obiutils.AsciiSet) *TaxidFactory {
|
|||||||
// It extracts the relevant part of the string after the first colon (':') if present.
|
// It extracts the relevant part of the string after the first colon (':') if present.
|
||||||
func (f *TaxidFactory) FromString(taxid string) (Taxid, error) {
|
func (f *TaxidFactory) FromString(taxid string) (Taxid, error) {
|
||||||
taxid = obiutils.AsciiSpaceSet.TrimLeft(taxid)
|
taxid = obiutils.AsciiSpaceSet.TrimLeft(taxid)
|
||||||
part1, part2 := obiutils.SplitInTwo(taxid, ':')
|
part1, part2 := obiutils.LeftSplitInTwo(taxid, ':')
|
||||||
if len(part2) == 0 {
|
if len(part2) == 0 {
|
||||||
taxid = part1
|
taxid = part1
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
|
|||||||
fmt.Println(err)
|
fmt.Println(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
destfile, err := obiutils.CompressStream(file, true, true)
|
destfile, err := obiutils.CompressStream(file, compressed, true)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Println(err)
|
fmt.Println(err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -68,6 +68,8 @@ func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
|
|||||||
strings.HasSuffix(path, "seq.gz") ||
|
strings.HasSuffix(path, "seq.gz") ||
|
||||||
strings.HasSuffix(path, "gb") ||
|
strings.HasSuffix(path, "gb") ||
|
||||||
strings.HasSuffix(path, "gb.gz") ||
|
strings.HasSuffix(path, "gb.gz") ||
|
||||||
|
strings.HasSuffix(path, "gbff") ||
|
||||||
|
strings.HasSuffix(path, "gbff.gz") ||
|
||||||
strings.HasSuffix(path, "dat") ||
|
strings.HasSuffix(path, "dat") ||
|
||||||
strings.HasSuffix(path, "dat.gz") ||
|
strings.HasSuffix(path, "dat.gz") ||
|
||||||
strings.HasSuffix(path, "ecopcr") ||
|
strings.HasSuffix(path, "ecopcr") ||
|
||||||
@@ -204,7 +206,7 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
|||||||
iterator = iterator.PairTo(ip)
|
iterator = iterator.PairTo(ip)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
iterator = obiiter.NilIBioSequence
|
return obiiter.NilIBioSequence, fmt.Errorf("no sequence files found in the provided paths")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -144,7 +144,7 @@ func (r *AsciiSet) TrimLeft(s string) string {
|
|||||||
return s[i:]
|
return s[i:]
|
||||||
}
|
}
|
||||||
|
|
||||||
func SplitInTwo(s string, sep byte) (string, string) {
|
func LeftSplitInTwo(s string, sep byte) (string, string) {
|
||||||
i := 0
|
i := 0
|
||||||
for ; i < len(s); i++ {
|
for ; i < len(s); i++ {
|
||||||
c := s[i]
|
c := s[i]
|
||||||
@@ -157,3 +157,17 @@ func SplitInTwo(s string, sep byte) (string, string) {
|
|||||||
}
|
}
|
||||||
return s[:i], s[i+1:]
|
return s[:i], s[i+1:]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func RightSplitInTwo(s string, sep byte) (string, string) {
|
||||||
|
i := len(s) - 1
|
||||||
|
for ; i >= 0; i-- {
|
||||||
|
c := s[i]
|
||||||
|
if c == sep {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if i == len(s) {
|
||||||
|
return s, ""
|
||||||
|
}
|
||||||
|
return s[:i], s[i+1:]
|
||||||
|
}
|
||||||
|
|||||||
294
release_notes.sh
Executable file
294
release_notes.sh
Executable file
@@ -0,0 +1,294 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Generate GitHub-compatible release notes for an OBITools4 version.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# ./release_notes.sh # latest version
|
||||||
|
# ./release_notes.sh -v 4.4.15 # specific version
|
||||||
|
# ./release_notes.sh -l # list available versions
|
||||||
|
# ./release_notes.sh -r # raw commit list (no LLM)
|
||||||
|
# ./release_notes.sh -c -v 4.4.16 # show LLM context for a version
|
||||||
|
|
||||||
|
GITHUB_REPO="metabarcoding/obitools4"
|
||||||
|
GITHUB_API="https://api.github.com/repos/${GITHUB_REPO}"
|
||||||
|
VERSION=""
|
||||||
|
LIST_VERSIONS=false
|
||||||
|
RAW_MODE=false
|
||||||
|
CONTEXT_MODE=false
|
||||||
|
LLM_MODEL="ollama:qwen3-coder-next:latest"
|
||||||
|
|
||||||
|
# ── Helpers ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
die() { echo "Error: $*" >&2; exit 1; }
|
||||||
|
|
||||||
|
next_patch() {
|
||||||
|
local v="$1"
|
||||||
|
local major minor patch
|
||||||
|
major=$(echo "$v" | cut -d. -f1)
|
||||||
|
minor=$(echo "$v" | cut -d. -f2)
|
||||||
|
patch=$(echo "$v" | cut -d. -f3)
|
||||||
|
echo "${major}.${minor}.$(( patch + 1 ))"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Strip "pre-" prefix to get the bare version number for installation section
|
||||||
|
bare_version() {
|
||||||
|
echo "$1" | sed 's/^pre-//'
|
||||||
|
}
|
||||||
|
|
||||||
|
installation_section() {
|
||||||
|
local v
|
||||||
|
v=$(bare_version "$1")
|
||||||
|
cat <<INSTALL_EOF
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
### Pre-built binaries
|
||||||
|
|
||||||
|
Download the appropriate archive for your system from the
|
||||||
|
[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_${v})
|
||||||
|
and extract it:
|
||||||
|
|
||||||
|
#### Linux (AMD64)
|
||||||
|
\`\`\`bash
|
||||||
|
tar -xzf obitools4_${v}_linux_amd64.tar.gz
|
||||||
|
\`\`\`
|
||||||
|
|
||||||
|
#### Linux (ARM64)
|
||||||
|
\`\`\`bash
|
||||||
|
tar -xzf obitools4_${v}_linux_arm64.tar.gz
|
||||||
|
\`\`\`
|
||||||
|
|
||||||
|
#### macOS (Intel)
|
||||||
|
\`\`\`bash
|
||||||
|
tar -xzf obitools4_${v}_darwin_amd64.tar.gz
|
||||||
|
\`\`\`
|
||||||
|
|
||||||
|
#### macOS (Apple Silicon)
|
||||||
|
\`\`\`bash
|
||||||
|
tar -xzf obitools4_${v}_darwin_arm64.tar.gz
|
||||||
|
\`\`\`
|
||||||
|
|
||||||
|
All OBITools4 binaries are included in each archive.
|
||||||
|
|
||||||
|
### From source
|
||||||
|
|
||||||
|
You can also compile and install OBITools4 directly from source using the
|
||||||
|
installation script:
|
||||||
|
|
||||||
|
\`\`\`bash
|
||||||
|
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version ${v}
|
||||||
|
\`\`\`
|
||||||
|
|
||||||
|
By default binaries are installed in \`/usr/local/bin\`. Use \`--install-dir\` to
|
||||||
|
change the destination and \`--obitools-prefix\` to add a prefix to command names:
|
||||||
|
|
||||||
|
\`\`\`bash
|
||||||
|
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\
|
||||||
|
bash -s -- --version ${v} --install-dir ~/local --obitools-prefix k
|
||||||
|
\`\`\`
|
||||||
|
INSTALL_EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
display_help() {
|
||||||
|
cat <<EOF
|
||||||
|
Usage: $(basename "$0") [OPTIONS]
|
||||||
|
|
||||||
|
Generate GitHub-compatible Markdown release notes for an OBITools4 version.
|
||||||
|
|
||||||
|
Options:
|
||||||
|
-v, --version VERSION Target version (e.g., 4.4.15). Default: latest.
|
||||||
|
-l, --list List all available versions and exit.
|
||||||
|
-r, --raw Output raw commit list without LLM summarization.
|
||||||
|
-c, --context Show the exact context (commits + prompt) sent to the LLM.
|
||||||
|
-m, --model MODEL LLM model for orla (default: $LLM_MODEL).
|
||||||
|
-h, --help Display this help message.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
$(basename "$0") # release notes for the latest version
|
||||||
|
$(basename "$0") -v 4.4.15 # release notes for a specific version
|
||||||
|
$(basename "$0") -l # list versions
|
||||||
|
$(basename "$0") -r -v 4.4.15 # raw commit log for a version
|
||||||
|
$(basename "$0") -c -v 4.4.16 # show LLM context for a version
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
# Fetch all Release tags from GitHub API (sorted newest first)
|
||||||
|
fetch_versions() {
|
||||||
|
curl -sf "${GITHUB_API}/releases" \
|
||||||
|
| grep '"tag_name":' \
|
||||||
|
| sed -E 's/.*"tag_name": "Release_([0-9.]+)".*/\1/' \
|
||||||
|
| sort -V -r
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Parse arguments ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
while [ "$#" -gt 0 ]; do
|
||||||
|
case "$1" in
|
||||||
|
-v|--version) VERSION="$2"; shift 2 ;;
|
||||||
|
-l|--list) LIST_VERSIONS=true; shift ;;
|
||||||
|
-r|--raw) RAW_MODE=true; shift ;;
|
||||||
|
-c|--context) CONTEXT_MODE=true; shift ;;
|
||||||
|
-m|--model) LLM_MODEL="$2"; shift 2 ;;
|
||||||
|
-h|--help) display_help; exit 0 ;;
|
||||||
|
*) die "Unsupported option: $1" ;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
# ── List mode ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
if [ "$LIST_VERSIONS" = true ]; then
|
||||||
|
echo "Available OBITools4 versions:" >&2
|
||||||
|
echo "==============================" >&2
|
||||||
|
fetch_versions
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Resolve versions ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
all_versions=$(fetch_versions)
|
||||||
|
[ -z "$all_versions" ] && die "Could not fetch versions from GitHub"
|
||||||
|
|
||||||
|
if [ -z "$VERSION" ]; then
|
||||||
|
# ── Pre-release mode: local HEAD vs latest GitHub tag ──────────────────
|
||||||
|
PRE_RELEASE=true
|
||||||
|
previous_tag="Release_${latest_version}"
|
||||||
|
VERSION="pre-$(next_patch "$latest_version")"
|
||||||
|
|
||||||
|
echo "Pre-release mode: $previous_tag -> HEAD (as $VERSION)" >&2
|
||||||
|
|
||||||
|
# Need to be in a git repo
|
||||||
|
if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
|
||||||
|
die "Not inside a git repository. Pre-release mode requires a local git repo."
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check that the previous tag exists locally
|
||||||
|
if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then
|
||||||
|
echo "Tag $previous_tag not found locally, fetching..." >&2
|
||||||
|
git fetch --tags 2>/dev/null || true
|
||||||
|
if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then
|
||||||
|
die "Tag $previous_tag not found locally or remotely"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Get local commits from the tag to HEAD (full messages)
|
||||||
|
commit_list=$(git log --format="%h %B" "${previous_tag}..HEAD" 2>/dev/null)
|
||||||
|
|
||||||
|
if [ -z "$commit_list" ]; then
|
||||||
|
die "No local commits found since $previous_tag"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
# ── Published release mode: between two GitHub tags ────────────────────
|
||||||
|
PRE_RELEASE=false
|
||||||
|
tag_name="Release_${VERSION}"
|
||||||
|
|
||||||
|
# Verify the requested version exists
|
||||||
|
if ! echo "$all_versions" | grep -qx "$VERSION"; then
|
||||||
|
die "Version $VERSION not found. Use -l to list available versions."
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Find the previous version
|
||||||
|
previous_version=$(echo "$all_versions" | grep -A1 -x "$VERSION" | tail -1)
|
||||||
|
|
||||||
|
if [ "$previous_version" = "$VERSION" ] || [ -z "$previous_version" ]; then
|
||||||
|
previous_tag=""
|
||||||
|
echo "No previous version found -- will include all commits for $tag_name" >&2
|
||||||
|
else
|
||||||
|
previous_tag="Release_${previous_version}"
|
||||||
|
echo "Generating notes: $previous_tag -> $tag_name" >&2
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Fetch commit messages between tags via GitHub compare API
|
||||||
|
if [ -n "$previous_tag" ]; then
|
||||||
|
commits_json=$(curl -sf "${GITHUB_API}/compare/${previous_tag}...${tag_name}")
|
||||||
|
if [ -z "$commits_json" ]; then
|
||||||
|
die "Could not fetch commit comparison from GitHub"
|
||||||
|
fi
|
||||||
|
commit_list=$(echo "$commits_json" \
|
||||||
|
| jq -r '.commits[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null)
|
||||||
|
else
|
||||||
|
commits_json=$(curl -sf "${GITHUB_API}/commits?sha=${tag_name}&per_page=50")
|
||||||
|
if [ -z "$commits_json" ]; then
|
||||||
|
die "Could not fetch commits from GitHub"
|
||||||
|
fi
|
||||||
|
commit_list=$(echo "$commits_json" \
|
||||||
|
| jq -r '.[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null)
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$commit_list" ]; then
|
||||||
|
die "No commits found between $previous_tag and $tag_name"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── LLM prompt (shared by context mode and summarization) ────────────────
|
||||||
|
|
||||||
|
LLM_PROMPT="Summarize the following commits into a GitHub release note for version ${VERSION}. \
|
||||||
|
Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping \
|
||||||
|
that is irrelevant to end users. Describe each user-facing change precisely without exposing \
|
||||||
|
code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this \
|
||||||
|
exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}"
|
||||||
|
|
||||||
|
# ── Raw mode: just output the commit list ────────────────────────────────
|
||||||
|
|
||||||
|
if [ "$RAW_MODE" = true ]; then
|
||||||
|
echo "# Release ${VERSION}"
|
||||||
|
echo ""
|
||||||
|
echo "## Commits"
|
||||||
|
echo ""
|
||||||
|
echo "$commit_list" | while IFS= read -r line; do
|
||||||
|
echo "- ${line}"
|
||||||
|
done
|
||||||
|
installation_section "$VERSION"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Context mode: show what would be sent to the LLM ────────────────────
|
||||||
|
|
||||||
|
if [ "$CONTEXT_MODE" = true ]; then
|
||||||
|
echo "=== LLM Model ==="
|
||||||
|
echo "$LLM_MODEL"
|
||||||
|
echo ""
|
||||||
|
echo "=== Prompt ==="
|
||||||
|
echo "$LLM_PROMPT"
|
||||||
|
echo ""
|
||||||
|
echo "=== Stdin (commit list) ==="
|
||||||
|
echo "$commit_list"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── LLM summarization ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
if ! command -v orla >/dev/null 2>&1; then
|
||||||
|
die "orla is required for LLM summarization. Use -r for raw output."
|
||||||
|
fi
|
||||||
|
|
||||||
|
if ! command -v jq >/dev/null 2>&1; then
|
||||||
|
die "jq is required for JSON parsing. Use -r for raw output."
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Summarizing with LLM ($LLM_MODEL)..." >&2
|
||||||
|
|
||||||
|
raw_output=$(echo "$commit_list" | \
|
||||||
|
ORLA_MAX_TOOL_CALLS=50 orla agent -m "$LLM_MODEL" \
|
||||||
|
"$LLM_PROMPT" \
|
||||||
|
2>/dev/null) || true
|
||||||
|
|
||||||
|
if [ -z "$raw_output" ]; then
|
||||||
|
echo "Warning: LLM returned empty output, falling back to raw mode" >&2
|
||||||
|
exec "$0" -r -v "$VERSION"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Sanitize: extract JSON object, strip control characters
|
||||||
|
sanitized=$(echo "$raw_output" | sed -n '/^{/,/^}/p' | tr -d '\000-\011\013-\014\016-\037')
|
||||||
|
|
||||||
|
release_title=$(echo "$sanitized" | jq -r '.title // empty' 2>/dev/null)
|
||||||
|
release_body=$(echo "$sanitized" | jq -r '.body // empty' 2>/dev/null)
|
||||||
|
|
||||||
|
if [ -n "$release_title" ] && [ -n "$release_body" ]; then
|
||||||
|
echo "# ${release_title}"
|
||||||
|
echo ""
|
||||||
|
echo "$release_body"
|
||||||
|
installation_section "$VERSION"
|
||||||
|
else
|
||||||
|
echo "Warning: JSON parsing failed, falling back to raw mode" >&2
|
||||||
|
exec "$0" -r -v "$VERSION"
|
||||||
|
fi
|
||||||
36
tools/json2md.py
Executable file
36
tools/json2md.py
Executable file
@@ -0,0 +1,36 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Read potentially malformed JSON from stdin (aichat output), extract title and
|
||||||
|
body, and print them as plain text: title on first line, blank line, then body.
|
||||||
|
Exits with 1 on failure (no output).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
text = sys.stdin.read()
|
||||||
|
|
||||||
|
m = re.search(r'\{.*\}', text, re.DOTALL)
|
||||||
|
if not m:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
s = m.group()
|
||||||
|
obj = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
obj = json.loads(s)
|
||||||
|
except Exception:
|
||||||
|
s2 = re.sub(r'(?<!\\)\n', r'\\n', s)
|
||||||
|
try:
|
||||||
|
obj = json.loads(s2)
|
||||||
|
except Exception:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
title = obj.get('title', '').strip()
|
||||||
|
body = obj.get('body', '').strip()
|
||||||
|
|
||||||
|
if not title or not body:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
print(f"{title}\n\n{body}")
|
||||||
@@ -1 +1 @@
|
|||||||
4.4.13
|
4.4.21
|
||||||
|
|||||||
Reference in New Issue
Block a user