mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-05-01 12:30:39 +00:00
Compare commits
90 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 449544bd63 | |||
| 434d2e5930 | |||
| 7cb02ded69 | |||
| 6d469bd711 | |||
| 3d8e4a3a4e | |||
| 07d04a6967 | |||
| 03f251c365 | |||
| 5714fa6cd3 | |||
| f101625771 | |||
| 4359b52eaf | |||
| da0c8b6f28 | |||
| 841e5c9e2a | |||
| e298daeef9 | |||
| d9e6f67a6e | |||
| f036c7fa96 | |||
| e33665e716 | |||
| c955a614ca | |||
| f19065261e | |||
| 3e349e92e1 | |||
| a4ce24a418 | |||
| 960ad1531d | |||
| 137f49d1d1 | |||
| 083a92e13d | |||
| 67683435e8 | |||
| f32b29db4f | |||
| 10f49fe64b | |||
| d257917748 | |||
| fec078c04c | |||
| a92393dd51 | |||
| 7e76698490 | |||
| 64b0b32f61 | |||
| c8e6a218cb | |||
| 8c7017a99d | |||
| c7816973a6 | |||
| 670edc1958 | |||
| f92f285417 | |||
| a786b58ed3 | |||
| a2b26712b2 | |||
| 1599abc9ad | |||
| af213ab446 | |||
| a60184c115 | |||
| 585b024bf0 | |||
| afc9ffda85 | |||
| fdd972bbd2 | |||
| 76f595e1fe | |||
| 1e1e5443e3 | |||
| 15d1f1fd80 | |||
| 8df2cbe22f | |||
| 58d685926b | |||
| e9f24426df | |||
| 2f7be10b5d | |||
| 43125f9f5e | |||
| c23368e929 | |||
| 6cb5a81685 | |||
| 94b0887069 | |||
| c188580aac | |||
| 1e1f575d1c | |||
| 40769bf827 | |||
| 74e6fcaf83 | |||
| 30ec8b1b63 | |||
| cdc72c5346 | |||
| 82a9972be7 | |||
| ff6e515b2a | |||
| cd0c525f50 | |||
| abe935aa18 | |||
| 8dd32dc1bf | |||
| 6ee8750635 | |||
| 8c318c480e | |||
| 09fbc217d3 | |||
| 3d2e205722 | |||
| 623116ab13 | |||
| 1e4509cb63 | |||
| b33d7705a8 | |||
| 1342c83db6 | |||
| b246025907 | |||
| 761e0dbed3 | |||
| a7ea47624b | |||
| 61e346658e | |||
| 1ba1294b11 | |||
| b2476fffcb | |||
| b05404721e | |||
| c57e788459 | |||
| 1cecf23978 | |||
| 4c824ef9b7 | |||
| 1ce5da9bee | |||
| dc23d9de9a | |||
| aa9d7bbf72 | |||
| db22d20d0a | |||
| 7c05bdb01c | |||
| b6542c4523 |
@@ -10,10 +10,10 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.23'
|
||||
- name: Checkout obitools4 project
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v5
|
||||
- name: Run tests
|
||||
run: make githubtests
|
||||
|
||||
@@ -16,9 +16,9 @@ jobs:
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: "1.23"
|
||||
go-version: "1.26"
|
||||
- name: Checkout obitools4 project
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v5
|
||||
- name: Run tests
|
||||
run: make githubtests
|
||||
|
||||
@@ -49,12 +49,12 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v5
|
||||
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: "1.23"
|
||||
go-version: "1.26"
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
@@ -69,7 +69,23 @@ jobs:
|
||||
xcode-select --install 2>/dev/null || true
|
||||
xcode-select -p
|
||||
|
||||
- name: Build binaries
|
||||
- name: Build binaries (Linux)
|
||||
if: runner.os == 'Linux'
|
||||
env:
|
||||
VERSION: ${{ steps.get_version.outputs.version }}
|
||||
run: |
|
||||
docker run --rm \
|
||||
-v "$(pwd):/src" \
|
||||
-w /src \
|
||||
-e VERSION="${VERSION}" \
|
||||
golang:1.26-alpine \
|
||||
sh -c "apk add --no-cache gcc musl-dev zlib-dev zlib-static make && \
|
||||
make LDFLAGS='-linkmode=external -extldflags=-static' obitools"
|
||||
mkdir -p artifacts
|
||||
tar -czf artifacts/obitools4_${VERSION}_${{ matrix.output_name }}.tar.gz -C build .
|
||||
|
||||
- name: Build binaries (macOS)
|
||||
if: runner.os == 'macOS'
|
||||
env:
|
||||
GOOS: ${{ matrix.goos }}
|
||||
GOARCH: ${{ matrix.goarch }}
|
||||
@@ -77,7 +93,6 @@ jobs:
|
||||
run: |
|
||||
make obitools
|
||||
mkdir -p artifacts
|
||||
# Create a single tar.gz with all binaries for this platform
|
||||
tar -czf artifacts/obitools4_${VERSION}_${{ matrix.output_name }}.tar.gz -C build .
|
||||
|
||||
- name: Upload artifacts
|
||||
@@ -92,7 +107,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v5
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
|
||||
+2
-1
@@ -16,13 +16,14 @@
|
||||
**/*.tgz
|
||||
**/*.yaml
|
||||
**/*.csv
|
||||
**/*.pb.gz
|
||||
xx
|
||||
|
||||
.rhistory
|
||||
/.vscode
|
||||
/build
|
||||
/bugs
|
||||
|
||||
autodoc
|
||||
/ncbitaxo
|
||||
|
||||
!/obitests/**
|
||||
|
||||
@@ -2,9 +2,17 @@
|
||||
#export GOBIN=$(GOPATH)/bin
|
||||
#export PATH=$(GOBIN):$(shell echo $${PATH})
|
||||
|
||||
.DEFAULT_GOAL := all
|
||||
|
||||
GREEN := \033[0;32m
|
||||
YELLOW := \033[0;33m
|
||||
BLUE := \033[0;34m
|
||||
NC := \033[0m
|
||||
|
||||
GOFLAGS=
|
||||
LDFLAGS=
|
||||
GOCMD=go
|
||||
GOBUILD=$(GOCMD) build $(GOFLAGS)
|
||||
GOBUILD=$(GOCMD) build $(GOFLAGS) $(if $(LDFLAGS),-ldflags="$(LDFLAGS)")
|
||||
GOGENERATE=$(GOCMD) generate
|
||||
GOCLEAN=$(GOCMD) clean
|
||||
GOTEST=$(GOCMD) test
|
||||
@@ -43,7 +51,7 @@ $(OBITOOLS_PREFIX)$(notdir $(1)): $(BUILD_DIR) $(1) pkg/obioptions/version.go
|
||||
@echo -n - Building obitool $(notdir $(1))...
|
||||
@$(GOBUILD) -o $(BUILD_DIR)/$(OBITOOLS_PREFIX)$(notdir $(1)) ./$(1) \
|
||||
2> $(OBITOOLS_PREFIX)$(notdir $(1)).log \
|
||||
|| cat $(OBITOOLS_PREFIX)$(notdir $(1)).log
|
||||
|| { cat $(OBITOOLS_PREFIX)$(notdir $(1)).log; rm -f $(OBITOOLS_PREFIX)$(notdir $(1)).log; exit 1; }
|
||||
@rm -f $(OBITOOLS_PREFIX)$(notdir $(1)).log
|
||||
@echo Done.
|
||||
endef
|
||||
@@ -60,6 +68,28 @@ endif
|
||||
|
||||
OUTPUT:=$(shell mktemp)
|
||||
|
||||
help:
|
||||
@printf "$(GREEN)OBITools4 Makefile$(NC)\n\n"
|
||||
@printf "$(BLUE)Main targets:$(NC)\n"
|
||||
@printf " %-20s %s\n" "all" "Build all obitools (default)"
|
||||
@printf " %-20s %s\n" "obitools" "Build all obitools binaries to build/"
|
||||
@printf " %-20s %s\n" "test" "Run Go unit tests"
|
||||
@printf " %-20s %s\n" "obitests" "Run integration tests (obitests/)"
|
||||
@printf " %-20s %s\n" "bump-version" "Increment patch version (or set with VERSION=x.y.z)"
|
||||
@printf " %-20s %s\n" "update-deps" "Update all Go dependencies"
|
||||
@printf "\n$(BLUE)Jujutsu workflow:$(NC)\n"
|
||||
@printf " %-20s %s\n" "jjnew" "Document current commit and start a new one"
|
||||
@printf " %-20s %s\n" "jjpush" "Release: describe, bump, generate notes, push PR, tag (VERSION=x.y.z optional)"
|
||||
@printf " %-20s %s\n" "jjfetch" "Fetch latest commits from origin"
|
||||
@printf "\n$(BLUE)Required tools:$(NC)\n"
|
||||
@printf " %-20s " "go"; command -v go >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(go version)" || printf "$(YELLOW)✗ not found$(NC)\n"
|
||||
@printf " %-20s " "git"; command -v git >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(git --version)" || printf "$(YELLOW)✗ not found$(NC)\n"
|
||||
@printf " %-20s " "jj"; command -v jj >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jj --version)" || printf "$(YELLOW)✗ not found$(NC)\n"
|
||||
@printf " %-20s " "gh"; command -v gh >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(gh --version | head -1)" || printf "$(YELLOW)✗ not found$(NC) (brew install gh)\n"
|
||||
@printf "\n$(BLUE)Optional tools (release notes generation):$(NC)\n"
|
||||
@printf " %-20s " "aichat"; command -v aichat >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(aichat --version)" || printf "$(YELLOW)✗ not found$(NC) (https://github.com/sigoden/aichat)\n"
|
||||
@printf " %-20s " "jq"; command -v jq >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jq --version)" || printf "$(YELLOW)✗ not found$(NC) (brew install jq)\n"
|
||||
|
||||
all: install-githook obitools
|
||||
|
||||
obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS))
|
||||
@@ -106,15 +136,20 @@ pkg/obioptions/version.go: version.txt .FORCE
|
||||
@rm -f $(OUTPUT)
|
||||
|
||||
bump-version:
|
||||
@echo "Incrementing version..."
|
||||
@current=$$(cat version.txt); \
|
||||
echo " Current version: $$current"; \
|
||||
major=$$(echo $$current | cut -d. -f1); \
|
||||
minor=$$(echo $$current | cut -d. -f2); \
|
||||
patch=$$(echo $$current | cut -d. -f3); \
|
||||
new_patch=$$((patch + 1)); \
|
||||
new_version="$$major.$$minor.$$new_patch"; \
|
||||
echo " New version: $$new_version"; \
|
||||
if [ -n "$(VERSION)" ]; then \
|
||||
new_version="$(VERSION)"; \
|
||||
echo "Setting version to $$new_version (was $$current)"; \
|
||||
else \
|
||||
echo "Incrementing version..."; \
|
||||
echo " Current version: $$current"; \
|
||||
major=$$(echo $$current | cut -d. -f1); \
|
||||
minor=$$(echo $$current | cut -d. -f2); \
|
||||
patch=$$(echo $$current | cut -d. -f3); \
|
||||
new_patch=$$((patch + 1)); \
|
||||
new_version="$$major.$$minor.$$new_patch"; \
|
||||
echo " New version: $$new_version"; \
|
||||
fi; \
|
||||
echo "$$new_version" > version.txt
|
||||
@echo "✓ Version updated in version.txt"
|
||||
@$(MAKE) pkg/obioptions/version.go
|
||||
@@ -128,40 +163,77 @@ jjnew:
|
||||
@echo "$(GREEN)✓ New commit created$(NC)"
|
||||
|
||||
jjpush:
|
||||
@echo "$(YELLOW)→ Pushing commit to repository...$(NC)"
|
||||
@$(MAKE) jjpush-describe
|
||||
@$(MAKE) jjpush-bump
|
||||
@$(MAKE) jjpush-notes
|
||||
@$(MAKE) jjpush-push
|
||||
@$(MAKE) jjpush-tag
|
||||
@echo "$(GREEN)✓ Release complete$(NC)"
|
||||
|
||||
jjpush-describe:
|
||||
@echo "$(BLUE)→ Documenting current commit...$(NC)"
|
||||
@jj auto-describe
|
||||
|
||||
jjpush-bump:
|
||||
@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
|
||||
@jj new
|
||||
@previous_version=$$(cat version.txt); \
|
||||
$(MAKE) bump-version; \
|
||||
version=$$(cat version.txt); \
|
||||
tag_name="Release_$$version"; \
|
||||
previous_tag="Release_$$previous_version"; \
|
||||
echo "$(BLUE)→ Documenting version bump commit...$(NC)"; \
|
||||
jj auto-describe; \
|
||||
echo "$(BLUE)→ Generating release notes from $$previous_tag to current commit...$(NC)"; \
|
||||
if command -v orla >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then \
|
||||
release_json=$$(ORLA_MAX_TOOL_CALLS=50 jj log -r "$$previous_tag::@" -T 'commit_id.short() ++ " " ++ description' | \
|
||||
orla agent -m ollama:qwen3-coder-next:latest \
|
||||
"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}"); \
|
||||
release_json=$$(echo "$$release_json" | sed -n '/^{/,/^}/p'); \
|
||||
release_title=$$(echo "$$release_json" | jq -r '.title // empty') ; \
|
||||
release_body=$$(echo "$$release_json" | jq -r '.body // empty') ; \
|
||||
if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \
|
||||
release_message="$$release_title"$$'\n\n'"$$release_body"; \
|
||||
@$(MAKE) bump-version
|
||||
|
||||
jjpush-notes:
|
||||
@version=$$(cat version.txt); \
|
||||
echo "$(BLUE)→ Generating release notes for version $$version...$(NC)"; \
|
||||
release_title="Release $$version"; \
|
||||
release_body=""; \
|
||||
if command -v aichat >/dev/null 2>&1; then \
|
||||
previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' 2>/dev/null); \
|
||||
if [ -z "$$previous_tag" ]; then \
|
||||
echo "$(YELLOW)⚠ No previous Release tag found, skipping release notes$(NC)"; \
|
||||
else \
|
||||
echo "$(YELLOW)⚠ JSON parsing failed, falling back to raw output$(NC)"; \
|
||||
release_message="Release $$version"$$'\n\n'"$$release_json"; \
|
||||
raw_output=$$(git log --format="%h %B" "$$previous_tag..HEAD" | \
|
||||
aichat \
|
||||
"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}" 2>/dev/null) || true; \
|
||||
if [ -n "$$raw_output" ]; then \
|
||||
notes=$$(printf '%s\n' "$$raw_output" | python3 tools/json2md.py 2>/dev/null); \
|
||||
if [ -n "$$notes" ]; then \
|
||||
release_title=$$(echo "$$notes" | head -1); \
|
||||
release_body=$$(echo "$$notes" | tail -n +3); \
|
||||
else \
|
||||
echo "$(YELLOW)⚠ JSON parsing failed, using default release message$(NC)"; \
|
||||
fi; \
|
||||
fi; \
|
||||
fi; \
|
||||
else \
|
||||
release_message="Release $$version"; \
|
||||
fi; \
|
||||
echo "$(BLUE)→ Pushing commits and creating tag $$tag_name...$(NC)"; \
|
||||
jj git push --change @; \
|
||||
git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "Tag $$tag_name already exists"; \
|
||||
git push origin "$$tag_name" 2>/dev/null || echo "Tag already pushed"
|
||||
@echo "$(GREEN)✓ Commits and tag pushed to repository$(NC)"
|
||||
printf '%s' "$$release_title" > /tmp/obitools4-release-title.txt; \
|
||||
printf '%s' "$$release_body" > /tmp/obitools4-release-body.txt; \
|
||||
echo "$(BLUE)→ Setting release notes as commit description...$(NC)"; \
|
||||
jj desc -m "$$release_title"$$'\n\n'"$$release_body"
|
||||
|
||||
jjpush-push:
|
||||
@echo "$(BLUE)→ Pushing commits...$(NC)"
|
||||
@jj git push --change @
|
||||
@echo "$(BLUE)→ Creating/updating PR...$(NC)"
|
||||
@release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$(cat version.txt)"); \
|
||||
release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \
|
||||
branch=$$(jj log -r @ --no-graph -T 'bookmarks.map(|b| b.name()).join("\n")' 2>/dev/null | head -1); \
|
||||
if [ -n "$$branch" ] && command -v gh >/dev/null 2>&1; then \
|
||||
gh pr create --title "$$release_title" --body "$$release_body" --base master --head "$$branch" 2>/dev/null \
|
||||
|| gh pr edit "$$branch" --title "$$release_title" --body "$$release_body" 2>/dev/null \
|
||||
|| echo "$(YELLOW)⚠ Could not create/update PR$(NC)"; \
|
||||
fi
|
||||
|
||||
jjpush-tag:
|
||||
@version=$$(cat version.txt); \
|
||||
tag_name="Release_$$version"; \
|
||||
release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$version"); \
|
||||
release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \
|
||||
install_section=$$'\n## Installation\n\n### Pre-built binaries\n\nDownload the appropriate archive for your system from the\n[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_'"$$version"')\nand extract it:\n\n#### Linux (AMD64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_amd64.tar.gz\n```\n\n#### Linux (ARM64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_arm64.tar.gz\n```\n\n#### macOS (Intel)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_amd64.tar.gz\n```\n\n#### macOS (Apple Silicon)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_arm64.tar.gz\n```\n\nAll OBITools4 binaries are included in each archive.\n\n### From source\n\nYou can also compile and install OBITools4 directly from source using the\ninstallation script:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version '"$$version"'\n```\n\nBy default binaries are installed in `/usr/local/bin`. Use `--install-dir` to\nchange the destination and `--obitools-prefix` to add a prefix to command names:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\\n bash -s -- --version '"$$version"' --install-dir ~/local --obitools-prefix k\n```\n'; \
|
||||
release_message="$$release_title"$$'\n\n'"$$release_body$$install_section"; \
|
||||
echo "$(BLUE)→ Creating tag $$tag_name...$(NC)"; \
|
||||
commit_hash=$$(jj log -r @ --no-graph -T 'commit_id' 2>/dev/null); \
|
||||
git tag -a "$$tag_name" $${commit_hash:+"$$commit_hash"} -m "$$release_message" 2>/dev/null || echo "$(YELLOW)⚠ Tag $$tag_name already exists$(NC)"; \
|
||||
echo "$(BLUE)→ Pushing tag $$tag_name...$(NC)"; \
|
||||
git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)"; \
|
||||
rm -f /tmp/obitools4-release-title.txt /tmp/obitools4-release-body.txt
|
||||
|
||||
jjfetch:
|
||||
@echo "$(YELLOW)→ Pulling latest commits...$(NC)"
|
||||
@@ -169,5 +241,5 @@ jjfetch:
|
||||
@jj new master@origin
|
||||
@echo "$(GREEN)✓ Latest commits pulled$(NC)"
|
||||
|
||||
.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch bump-version .FORCE
|
||||
.PHONY: all obitools update-deps obitests githubtests help jjnew jjpush jjpush-describe jjpush-bump jjpush-notes jjpush-push jjpush-tag jjfetch bump-version .FORCE
|
||||
.FORCE:
|
||||
|
||||
@@ -32,8 +32,12 @@ The installation script offers several options:
|
||||
>
|
||||
> -p, --obitools-prefix Prefix added to the obitools command names if you
|
||||
> want to have several versions of obitools at the
|
||||
> same time on your system (as example `-p g` will produce
|
||||
> same time on your system (as example `-p g` will produce
|
||||
> `gobigrep` command instead of `obigrep`).
|
||||
>
|
||||
> -j, --jobs Number of parallel jobs used for compilation
|
||||
> (default: 1). Increase this value to speed up
|
||||
> compilation on multi-core systems (e.g., `-j 4`).
|
||||
|
||||
### Examples
|
||||
|
||||
|
||||
@@ -0,0 +1,264 @@
|
||||
# Optimisation du parsing des grandes séquences
|
||||
|
||||
## Contexte
|
||||
|
||||
OBITools4 doit pouvoir traiter des séquences de taille chromosomique (plusieurs Gbp), notamment
|
||||
issues de fichiers GenBank/EMBL (assemblages de génomes) ou de fichiers FASTA convertis depuis
|
||||
ces formats.
|
||||
|
||||
## Architecture actuelle
|
||||
|
||||
### Pipeline de lecture (`pkg/obiformats/`)
|
||||
|
||||
```
|
||||
ReadFileChunk (goroutine)
|
||||
→ ChannelFileChunk
|
||||
→ N × _ParseGenbankFile / _ParseFastaFile (goroutines)
|
||||
→ IBioSequence
|
||||
```
|
||||
|
||||
`ReadFileChunk` (`file_chunk_read.go`) lit le fichier par morceaux via une chaîne de
|
||||
`PieceOfChunk` (rope). Chaque nœud fait `fileChunkSize` bytes :
|
||||
|
||||
- GenBank/EMBL : 128 MB (`1024*1024*128`)
|
||||
- FASTA/FASTQ : 1 MB (`1024*1024`)
|
||||
|
||||
La chaîne est accumulée jusqu'à trouver la fin du dernier enregistrement complet (splitter),
|
||||
puis `Pack()` est appelé pour fusionner tous les nœuds en un seul buffer contigu. Ce buffer
|
||||
est transmis au parseur via `FileChunk.Raw *bytes.Buffer`.
|
||||
|
||||
### Parseur GenBank (`genbank_read.go`)
|
||||
|
||||
`GenbankChunkParser` reçoit un `io.Reader` sur le buffer packé, lit ligne par ligne via
|
||||
`bufio.NewReader` (buffer 4096 bytes), et pour chaque ligne de la section `ORIGIN` :
|
||||
|
||||
```go
|
||||
line = string(bline) // allocation par ligne
|
||||
cleanline := strings.TrimSpace(line) // allocation
|
||||
parts := strings.SplitN(cleanline, " ", 7) // allocation []string + substrings
|
||||
for i := 1; i < lparts; i++ {
|
||||
seqBytes.WriteString(parts[i])
|
||||
}
|
||||
```
|
||||
|
||||
Point positif : `seqBytes` est pré-alloué grâce à `lseq` extrait de la ligne `LOCUS`.
|
||||
|
||||
### Parseur FASTA (`fastaseq_read.go`)
|
||||
|
||||
`FastaChunkParser` lit **octet par octet** via `scanner.ReadByte()`. Pour 3 Gbp :
|
||||
3 milliards d'appels. `seqBytes` est un `bytes.Buffer{}` sans pré-allocation.
|
||||
|
||||
## Problème principal
|
||||
|
||||
Pour une séquence de plusieurs Gbp, `Pack()` fusionne une chaîne de ~N nœuds de 128 MB en
|
||||
un seul buffer contigu. C'est une allocation de N × 128 MB suivie d'une copie de toutes les
|
||||
données. Bien que l'implémentation de `Pack()` soit efficace (libère les nœuds au fur et à
|
||||
mesure via `slices.Grow`), la copie est inévitable avec l'architecture actuelle.
|
||||
|
||||
De plus, le parseur GenBank produit des dizaines de millions d'allocations temporaires pour
|
||||
parser la section `ORIGIN` (une par ligne).
|
||||
|
||||
## Invariant clé découvert
|
||||
|
||||
**Si la rope a plus d'un nœud, le premier nœud seul ne se termine pas sur une frontière
|
||||
d'enregistrement** (pas de `//\n` en fin de `piece1`).
|
||||
|
||||
Preuve par construction dans `ReadFileChunk` :
|
||||
- `splitter` est appelé dès le premier nœud (ligne 157)
|
||||
- Si `end >= 0` → frontière trouvée dans 128 MB → boucle interne sautée → rope à 1 nœud
|
||||
- Si `end < 0` → boucle interne ajoute des nœuds → rope à ≥ 2 nœuds
|
||||
|
||||
Corollaire : si rope à 1 nœud, `Pack()` ne fait rien (aucun nœud suivant).
|
||||
|
||||
**Attention** : rope à ≥ 2 nœuds ne signifie pas qu'il n'y a qu'une seule séquence dans
|
||||
la rope. La rope packée peut contenir plusieurs enregistrements complets. Exemple : records
|
||||
de 80 MB → `nextpieces` (48 MB de reste) + nouveau nœud (128 MB) = rope à 2 nœuds
|
||||
contenant 2 records complets + début d'un troisième.
|
||||
|
||||
L'invariant dit seulement que `piece1` seul est incomplet — pas que la rope entière
|
||||
ne contient qu'un seul record.
|
||||
|
||||
**Invariant : le dernier FileChunk envoyé finit sur une frontière d'enregistrement.**
|
||||
|
||||
Deux chemins dans `ReadFileChunk` :
|
||||
|
||||
1. **Chemin normal** (`end >= 0` via `splitter`) : le buffer est explicitement tronqué à
|
||||
`end` (ligne 200 : `pieces.data = pieces.data[:end]`). Frontière garantie par construction
|
||||
pour tous les formats. ✓
|
||||
|
||||
2. **Chemin EOF** (`end < 0`, `end = pieces.Len()`) : tout le reste du fichier est envoyé.
|
||||
- **GenBank/EMBL** : présuppose fichier bien formé (se termine par `//\n`). Le parseur
|
||||
lève un `log.Fatalf` sur tout état inattendu — filet de sécurité suffisant. ✓
|
||||
- **FASTQ** : présupposé, vérifié par le parseur. ✓
|
||||
- **FASTA** : garanti par le format lui-même (fin d'enregistrement = EOF ou `>`). ✓
|
||||
|
||||
**Hypothèse de travail adoptée** : les fichiers d'entrée sont bien formés. Dans le pire cas,
|
||||
le parseur lèvera une erreur explicite. Il n'y a pas de risque de corruption silencieuse.
|
||||
|
||||
## Piste d'optimisation : se dispenser de Pack()
|
||||
|
||||
### Idée centrale
|
||||
|
||||
Au lieu de fusionner la rope avant de la passer au parseur, **parser directement la rope
|
||||
nœud par nœud**, et **écrire la séquence compactée in-place dans le premier nœud**.
|
||||
|
||||
Pourquoi c'est sûr :
|
||||
- Le header (LOCUS, DEFINITION, SOURCE, FEATURES) est **petit** et traité en premier
|
||||
- La séquence (ORIGIN) est **à la fin** du record
|
||||
- Au moment d'écrire la séquence depuis l'offset 0 de `piece1`, le pointeur de lecture
|
||||
est profond dans la rope (offset >> 0) → jamais de collision
|
||||
- La séquence compactée est toujours plus courte que les données brutes
|
||||
|
||||
### Pré-allocation
|
||||
|
||||
Pour GenBank/EMBL : `lseq` est connu dès la ligne `LOCUS`/`ID` (première ligne, dans
|
||||
`piece1`). On peut faire `slices.Grow(piece1.data, lseq)` dès ce moment.
|
||||
|
||||
Pour FASTA : pas de taille garantie dans le header, mais `rope.Len()` donne un majorant.
|
||||
On peut utiliser `rope.Len() / 2` comme estimation initiale.
|
||||
|
||||
### Gestion des jonctions entre nœuds
|
||||
|
||||
Une ligne peut chevaucher deux nœuds (rare avec 128 MB, mais possible). Solution : carry
|
||||
buffer de ~128 bytes pour les quelques bytes en fin de nœud.
|
||||
|
||||
### Cas FASTA/FASTQ multi-séquences
|
||||
|
||||
Un FileChunk peut contenir N séquences (notamment FASTA/FASTQ courts). Dans ce cas
|
||||
l'écriture in-place dans `piece1` n'est pas applicable directement — on écrase des données
|
||||
nécessaires aux séquences suivantes.
|
||||
|
||||
Stratégie par cas :
|
||||
- **Rope à 1 nœud** (record ≤ 128 MB) : `Pack()` est trivial (no-op), parseur actuel OK
|
||||
- **Rope à ≥ 2 nœuds** : par l'invariant, `piece1` ne contient pas de record complet →
|
||||
une seule grande séquence → in-place applicable
|
||||
|
||||
### Format d'une ligne séquence GenBank (Après ORIGIN)
|
||||
|
||||
```
|
||||
/^ *[0-9]+( [nuc]{10}){0,5} [nuc]{1,10}/
|
||||
```
|
||||
|
||||
### Format d'une ligne séquence GenBank (Après SQ)
|
||||
|
||||
La ligne SQ contient aussi la taille de la séquence
|
||||
|
||||
```
|
||||
/^ *( [nuc]{10}){0,5} [nuc]{1,10} *[0-9]+/
|
||||
```
|
||||
|
||||
Compactage in-place sur `bline` ([]byte brut, sans conversion `string`) :
|
||||
|
||||
```go
|
||||
w := 0
|
||||
i := 0
|
||||
for i < len(bline) && bline[i] == ' ' { i++ } // skip indentation
|
||||
for i < len(bline) && bline[i] <= '9' { i++ } // skip position number
|
||||
for ; i < len(bline); i++ {
|
||||
if bline[i] != ' ' {
|
||||
bline[w] = bline[i]
|
||||
w++
|
||||
}
|
||||
}
|
||||
// écrire bline[:w] directement dans piece1.data[seqOffset:]
|
||||
```
|
||||
|
||||
## Changements nécessaires
|
||||
|
||||
1. **`FileChunk`** : exposer la rope `*PieceOfChunk` non-packée en plus (ou à la place)
|
||||
de `Raw *bytes.Buffer`
|
||||
2. **`GenbankChunkParser` / `EmblChunkParser`** : accepter `*PieceOfChunk`, parser la
|
||||
rope séquentiellement avec carry buffer pour les jonctions
|
||||
3. **`FastaChunkParser`** : idem, avec in-place conditionnel selon taille de la rope
|
||||
4. **`ReadFileChunk`** : ne pas appeler `Pack()` avant envoi sur le channel (ou version
|
||||
alternative `ReadFileChunkRope`)
|
||||
|
||||
## Fichiers concernés
|
||||
|
||||
- `pkg/obiformats/file_chunk_read.go` — structure rope, `ReadFileChunk`
|
||||
- `pkg/obiformats/genbank_read.go` — `GenbankChunkParser`, `_ParseGenbankFile`
|
||||
- `pkg/obiformats/embl_read.go` — `EmblChunkParser`, `ReadEMBL`
|
||||
- `pkg/obiformats/fastaseq_read.go` — `FastaChunkParser`, `_ParseFastaFile`
|
||||
- `pkg/obiformats/fastqseq_read.go` — parseur FASTQ (même structure)
|
||||
|
||||
## Plan d'implémentation : parseur GenBank sur rope
|
||||
|
||||
### Contexte
|
||||
|
||||
Baseline mesurée : `obiconvert gbpln640.seq.gz` → 49s real, 42s user, 29s sys, **57 GB RSS**.
|
||||
Le sys élevé indique des allocations massives. Deux causes :
|
||||
1. `Pack()` : fusionne toute la rope (N × 128 MB) en un buffer contigu avant de parser
|
||||
2. Parser ORIGIN : `string(bline)` + `TrimSpace` + `SplitN` × millions de lignes
|
||||
|
||||
### 1. `gbRopeScanner`
|
||||
|
||||
Struct de lecture ligne par ligne sur la rope, sans allocation heap :
|
||||
|
||||
```go
|
||||
type gbRopeScanner struct {
|
||||
current *PieceOfChunk
|
||||
pos int
|
||||
carry [256]byte // stack-allocated, max GenBank line = 80 chars
|
||||
carryN int
|
||||
}
|
||||
```
|
||||
|
||||
`ReadLine()` :
|
||||
- Cherche `\n` dans `current.data[pos:]` via `bytes.IndexByte`
|
||||
- Si trouvé sans carry : retourne slice direct du node (zéro alloc)
|
||||
- Si trouvé avec carry : copie dans carry buffer, retourne `carry[:n]`
|
||||
- Si non trouvé : copie le reste dans carry, avance au node suivant, recommence
|
||||
- EOF : retourne `carry[:carryN]` puis nil
|
||||
|
||||
`extractSequence(dest []byte, UtoT bool) int` :
|
||||
- Scan direct des bytes pour section ORIGIN, sans passer par ReadLine
|
||||
- Machine d'états : lineStart → skip espaces/digits → copier nucléotides dans dest
|
||||
- Stop sur `//` en début de ligne
|
||||
- Zéro allocation, UtoT inline
|
||||
|
||||
### 2. `GenbankChunkParserRope`
|
||||
|
||||
```go
|
||||
func GenbankChunkParserRope(source string, rope *PieceOfChunk,
|
||||
withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error)
|
||||
```
|
||||
|
||||
- Même machine d'états que `GenbankChunkParser`, sur `[]byte` (`bytes.HasPrefix`)
|
||||
- LOCUS : extrait `id` et `lseq` par scan direct (remplace `_seqlenght_rx`)
|
||||
- FEATURES / default inFeature : taxid extrait par scan de `/db_xref="taxon:`
|
||||
dans la source feature ; `featBytes` rempli seulement si `withFeatureTable=true`
|
||||
- DEFINITION : toujours conservée
|
||||
- ORIGIN : `dest = make([]byte, 0, lseq+20)` puis `s.extractSequence(dest, UtoT)`
|
||||
|
||||
### 3. Modifications `_ParseGenbankFile` et `ReadGenbank`
|
||||
|
||||
`_ParseGenbankFile` utilise `chunk.Rope` :
|
||||
```go
|
||||
sequences, err := GenbankChunkParserRope(chunk.Source, chunk.Rope, ...)
|
||||
```
|
||||
|
||||
`ReadGenbank` passe `pack=false` :
|
||||
```go
|
||||
entry_channel := ReadFileChunk(..., false)
|
||||
```
|
||||
|
||||
### 4. Ce qui NE change pas
|
||||
|
||||
- `GenbankChunkParser` reste (référence, tests)
|
||||
- `ReadFileChunk`, `Pack()`, autres parseurs (EMBL, FASTA, FASTQ) : inchangés
|
||||
|
||||
### 5. Gains attendus
|
||||
|
||||
- **RSS** : pic ≈ 128 MB × workers (au lieu de N × 128 MB)
|
||||
- **Temps sys** : élimination des mmap/munmap pour les gros buffers
|
||||
- **Temps user** : ~50M allocations éliminées
|
||||
|
||||
### 6. Vérification
|
||||
|
||||
```bash
|
||||
/usr/local/go/bin/go build ./...
|
||||
diff <(obiconvert gbpln640.seq.gz) gbpln640.reference.fasta
|
||||
cd bugs/genbank && ./benchmark.sh gbpln640.seq.gz
|
||||
```
|
||||
|
||||
Cible : RSS < 1 GB, temps comparable ou meilleur.
|
||||
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"people": [
|
||||
"Software",
|
||||
"Agreement",
|
||||
"Module"
|
||||
],
|
||||
"projects": [
|
||||
"Code"
|
||||
]
|
||||
}
|
||||
@@ -1,35 +1,33 @@
|
||||
module git.metabarcoding.org/obitools/obitools4/obitools4
|
||||
|
||||
go 1.23.4
|
||||
|
||||
toolchain go1.24.2
|
||||
go 1.26.1
|
||||
|
||||
require (
|
||||
github.com/DavidGamba/go-getoptions v0.28.0
|
||||
github.com/PaesslerAG/gval v1.2.2
|
||||
github.com/DavidGamba/go-getoptions v0.33.0
|
||||
github.com/PaesslerAG/gval v1.2.4
|
||||
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df
|
||||
github.com/buger/jsonparser v1.1.1
|
||||
github.com/buger/jsonparser v1.1.2
|
||||
github.com/chen3feng/stl4go v0.1.1
|
||||
github.com/dlclark/regexp2 v1.11.4
|
||||
github.com/goccy/go-json v0.10.3
|
||||
github.com/dlclark/regexp2 v1.11.5
|
||||
github.com/goccy/go-json v0.10.6
|
||||
github.com/klauspost/pgzip v1.2.6
|
||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58
|
||||
github.com/pelletier/go-toml/v2 v2.2.4
|
||||
github.com/rrethy/ahocorasick v1.0.0
|
||||
github.com/schollz/progressbar/v3 v3.13.1
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
github.com/stretchr/testify v1.8.4
|
||||
github.com/schollz/progressbar/v3 v3.19.0
|
||||
github.com/sirupsen/logrus v1.9.4
|
||||
github.com/stretchr/testify v1.10.0
|
||||
github.com/tevino/abool/v2 v2.1.0
|
||||
github.com/yuin/gopher-lua v1.1.1
|
||||
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa
|
||||
gonum.org/v1/gonum v0.14.0
|
||||
golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90
|
||||
gonum.org/v1/gonum v0.17.0
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
scientificgo.org/special v0.0.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect
|
||||
github.com/goombaio/orderedmap v0.0.0-20180925151256-3da0e2f905f9 // indirect
|
||||
github.com/kr/pretty v0.3.1 // indirect
|
||||
github.com/kr/text v0.2.0 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
@@ -38,16 +36,15 @@ require (
|
||||
|
||||
require (
|
||||
github.com/dsnet/compress v0.0.1
|
||||
github.com/gabriel-vasile/mimetype v1.4.3
|
||||
github.com/gabriel-vasile/mimetype v1.4.13
|
||||
github.com/goombaio/orderedset v0.0.0-20180925151225-8e67b20a9b77
|
||||
github.com/klauspost/compress v1.17.2
|
||||
github.com/mattn/go-runewidth v0.0.15 // indirect
|
||||
github.com/klauspost/compress v1.18.4
|
||||
github.com/mattn/go-runewidth v0.0.21 // indirect
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
|
||||
github.com/rivo/uniseg v0.4.4 // indirect
|
||||
github.com/shopspring/decimal v1.3.1 // indirect
|
||||
github.com/ulikunitz/xz v0.5.11
|
||||
golang.org/x/net v0.35.0 // indirect
|
||||
golang.org/x/sys v0.30.0 // indirect
|
||||
golang.org/x/term v0.29.0 // indirect
|
||||
github.com/rivo/uniseg v0.4.7 // indirect
|
||||
github.com/shopspring/decimal v1.4.0 // indirect
|
||||
github.com/ulikunitz/xz v0.5.15
|
||||
golang.org/x/sys v0.42.0 // indirect
|
||||
golang.org/x/term v0.41.0 // indirect
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c
|
||||
)
|
||||
|
||||
@@ -1,36 +1,41 @@
|
||||
github.com/DavidGamba/go-getoptions v0.28.0 h1:18wgEvfZdrlfIhVDGEBO3Dl0fkOyXqXLa0tLMCKxM1c=
|
||||
github.com/DavidGamba/go-getoptions v0.28.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84=
|
||||
github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E=
|
||||
github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
|
||||
github.com/DavidGamba/go-getoptions v0.33.0 h1:8xCPH87Yy5avYenygyHVlqqm8RpymH0YFe4a7IWlarE=
|
||||
github.com/DavidGamba/go-getoptions v0.33.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84=
|
||||
github.com/PaesslerAG/gval v1.2.4 h1:rhX7MpjJlcxYwL2eTTYIOBUyEKZ+A96T9vQySWkVUiU=
|
||||
github.com/PaesslerAG/gval v1.2.4/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
|
||||
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
|
||||
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
|
||||
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0=
|
||||
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM=
|
||||
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
|
||||
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
|
||||
github.com/buger/jsonparser v1.1.2 h1:frqHqw7otoVbk5M8LlE/L7HTnIq2v9RX6EJ48i9AxJk=
|
||||
github.com/buger/jsonparser v1.1.2/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
|
||||
github.com/chen3feng/stl4go v0.1.1 h1:0L1+mDw7pomftKDruM23f1mA7miavOj6C6MZeadzN2Q=
|
||||
github.com/chen3feng/stl4go v0.1.1/go.mod h1:5ml3psLgETJjRJnMbPE+JiHLrCpt+Ajc2weeTECXzWU=
|
||||
github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM=
|
||||
github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY=
|
||||
github.com/clipperhouse/uax29/v2 v2.2.0 h1:ChwIKnQN3kcZteTXMgb1wztSgaU+ZemkgWdohwgs8tY=
|
||||
github.com/clipperhouse/uax29/v2 v2.2.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM=
|
||||
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
|
||||
github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
|
||||
github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZQ=
|
||||
github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
|
||||
github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q=
|
||||
github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo=
|
||||
github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
|
||||
github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
|
||||
github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk=
|
||||
github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA=
|
||||
github.com/goccy/go-json v0.10.3/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
|
||||
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 h1:SajEQ6tktpF9SRIuzbiPOX9AEZZ53Bvw0k9Mzrts8Lg=
|
||||
github.com/gabriel-vasile/mimetype v1.4.13 h1:46nXokslUBsAJE/wMsp5gtO500a4F3Nkz9Ufpk2AcUM=
|
||||
github.com/gabriel-vasile/mimetype v1.4.13/go.mod h1:d+9Oxyo1wTzWdyVUPMmXFvp4F9tea18J8ufA774AB3s=
|
||||
github.com/goccy/go-json v0.10.6 h1:p8HrPJzOakx/mn/bQtjgNjdTcN+/S6FcG2CTtQOrHVU=
|
||||
github.com/goccy/go-json v0.10.6/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
|
||||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419/go.mod h1:YKu81H3RSd1cFh0d7NhvUoTtUC9IY/vBX0WUQb1/o4Y=
|
||||
github.com/goombaio/orderedmap v0.0.0-20180925151256-3da0e2f905f9 h1:vFjPvFavIiDY71bQ9HIxPQBANvNl1SmFC4fgg5xRkho=
|
||||
github.com/goombaio/orderedmap v0.0.0-20180925151256-3da0e2f905f9/go.mod h1:YKu81H3RSd1cFh0d7NhvUoTtUC9IY/vBX0WUQb1/o4Y=
|
||||
github.com/goombaio/orderedset v0.0.0-20180925151225-8e67b20a9b77 h1:4dvq1tGHn1Y9KSRY0OZ24Khki4+4U+ZrA//YYsdUlJU=
|
||||
github.com/goombaio/orderedset v0.0.0-20180925151225-8e67b20a9b77/go.mod h1:HPelMYpOyy0XvglpBbmZ3krZpwaHmszj/vQNlnETPTM=
|
||||
github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw=
|
||||
github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
|
||||
github.com/klauspost/compress v1.17.2 h1:RlWWUY/Dr4fL8qk9YG7DTZ7PDgME2V4csBXA8L/ixi4=
|
||||
github.com/klauspost/compress v1.17.2/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
|
||||
github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c=
|
||||
github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
||||
github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
|
||||
github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU=
|
||||
github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
|
||||
@@ -41,10 +46,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
||||
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U=
|
||||
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
github.com/mattn/go-runewidth v0.0.21 h1:jJKAZiQH+2mIinzCJIaIG9Be1+0NR+5sz/lYEEjdM8w=
|
||||
github.com/mattn/go-runewidth v0.0.21/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
|
||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
|
||||
@@ -54,50 +57,40 @@ github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8
|
||||
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
|
||||
github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
|
||||
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
|
||||
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
|
||||
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
|
||||
github.com/rrethy/ahocorasick v1.0.0 h1:YKkCB+E5PXc0xmLfMrWbfNht8vG9Re97IHSWZk/Lk8E=
|
||||
github.com/rrethy/ahocorasick v1.0.0/go.mod h1:nq8oScE7Vy1rOppoQxpQiiDmPHuKCuk9rXrNcxUV3R0=
|
||||
github.com/schollz/progressbar/v3 v3.13.1 h1:o8rySDYiQ59Mwzy2FELeHY5ZARXZTVJC7iHD6PEFUiE=
|
||||
github.com/schollz/progressbar/v3 v3.13.1/go.mod h1:xvrbki8kfT1fzWzBT/UZd9L6GA+jdL7HAgq2RFnO6fQ=
|
||||
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
|
||||
github.com/schollz/progressbar/v3 v3.19.0 h1:Ea18xuIRQXLAUidVDox3AbwfUhD0/1IvohyTutOIFoc=
|
||||
github.com/schollz/progressbar/v3 v3.19.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec=
|
||||
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
|
||||
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
|
||||
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k=
|
||||
github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME=
|
||||
github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w=
|
||||
github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g=
|
||||
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
|
||||
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/tevino/abool/v2 v2.1.0 h1:7w+Vf9f/5gmKT4m4qkayb33/92M+Um45F2BkHOR+L/c=
|
||||
github.com/tevino/abool/v2 v2.1.0/go.mod h1:+Lmlqk6bHDWHqN1cbxqhwEAwMPXgc8I1SDEamtseuXY=
|
||||
github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8=
|
||||
github.com/ulikunitz/xz v0.5.11 h1:kpFauv27b6ynzBNT/Xy+1k+fK4WswhN/6PN5WhFAGw8=
|
||||
github.com/ulikunitz/xz v0.5.11/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
|
||||
github.com/ulikunitz/xz v0.5.15 h1:9DNdB5s+SgV3bQ2ApL10xRc35ck0DuIX/isZvIk+ubY=
|
||||
github.com/ulikunitz/xz v0.5.15/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
|
||||
github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M=
|
||||
github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
|
||||
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ=
|
||||
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE=
|
||||
golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
|
||||
golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
|
||||
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
|
||||
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U=
|
||||
golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU=
|
||||
golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s=
|
||||
gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0=
|
||||
gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU=
|
||||
golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90 h1:jiDhWWeC7jfWqR9c/uplMOqJ0sbNlNWv0UkzE0vX1MA=
|
||||
golang.org/x/exp v0.0.0-20260312153236-7ab1446f8b90/go.mod h1:xE1HEv6b+1SCZ5/uscMRjUBKtIxworgEcEi+/n9NQDQ=
|
||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU=
|
||||
golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A=
|
||||
gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4=
|
||||
gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
scientificgo.org/special v0.0.0 h1:P6WJkECo6tgtvZAEfNXl+KEB9ReAatjKAeX8U07mjSc=
|
||||
|
||||
@@ -52,6 +52,8 @@ golang.org/x/image v0.6.0/go.mod h1:MXLdDR43H7cDJq5GEGXEVeeNhPgi+YYEQ2pC1byI1x0=
|
||||
golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY=
|
||||
golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
|
||||
golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 h1:uVc8UZUe6tr40fFVnUP5Oj+veunVezqYl9z7DYw9xzw=
|
||||
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
|
||||
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
|
||||
|
||||
+41
-14
@@ -7,6 +7,7 @@ INSTALL_DIR="/usr/local"
|
||||
OBITOOLS_PREFIX=""
|
||||
VERSION=""
|
||||
LIST_VERSIONS=false
|
||||
JOBS=1
|
||||
|
||||
# Help message
|
||||
function display_help {
|
||||
@@ -21,6 +22,7 @@ function display_help {
|
||||
echo " gobigrep command instead of obigrep)."
|
||||
echo " -v, --version Install a specific version (e.g., 4.4.8)."
|
||||
echo " If not specified, installs the latest version."
|
||||
echo " -j, --jobs Number of parallel jobs for compilation (default: 1)."
|
||||
echo " -l, --list List all available versions and exit."
|
||||
echo " -h, --help Display this help message."
|
||||
echo ""
|
||||
@@ -65,6 +67,10 @@ while [ "$#" -gt 0 ]; do
|
||||
VERSION="$2"
|
||||
shift 2
|
||||
;;
|
||||
-j|--jobs)
|
||||
JOBS="$2"
|
||||
shift 2
|
||||
;;
|
||||
-l|--list)
|
||||
LIST_VERSIONS=true
|
||||
shift
|
||||
@@ -122,9 +128,15 @@ mkdir -p "${WORK_DIR}/cache" \
|
||||
exit 1)
|
||||
|
||||
# Create installation directory
|
||||
mkdir -p "${INSTALL_DIR}/bin" 2> /dev/null \
|
||||
|| (echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
|
||||
sudo mkdir -p "${INSTALL_DIR}/bin")
|
||||
if ! mkdir -p "${INSTALL_DIR}/bin" 2>/dev/null; then
|
||||
if [ ! -w "$(dirname "${INSTALL_DIR}")" ] && [ ! -w "${INSTALL_DIR}" ]; then
|
||||
echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
|
||||
sudo mkdir -p "${INSTALL_DIR}/bin"
|
||||
else
|
||||
echo "Error: Could not create ${INSTALL_DIR}/bin (check path or disk space)" 1>&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ ! -d "${INSTALL_DIR}/bin" ]]; then
|
||||
echo "Could not create ${INSTALL_DIR}/bin directory for installing obitools" 1>&2
|
||||
@@ -171,22 +183,24 @@ GOURL=$(curl -s "${URL}${GOFILE}" \
|
||||
|
||||
echo "Installing Go from: $GOURL" 1>&2
|
||||
|
||||
curl -s "$GOURL" | tar zxf -
|
||||
curl --progress-bar "$GOURL" | tar zxf -
|
||||
|
||||
PATH="$(pwd)/go/bin:$PATH"
|
||||
export GOROOT="$(pwd)/go"
|
||||
PATH="${GOROOT}/bin:$PATH"
|
||||
export PATH
|
||||
GOPATH="$(pwd)/go"
|
||||
export GOPATH
|
||||
export GOPATH="$(pwd)/gopath"
|
||||
export GOCACHE="$(pwd)/cache"
|
||||
export GOTOOLCHAIN=local
|
||||
|
||||
echo "GOROOT=$GOROOT" 1>&2
|
||||
echo "GOCACHE=$GOCACHE" 1>&2
|
||||
mkdir -p "$GOCACHE"
|
||||
mkdir -p "$GOPATH" "$GOCACHE"
|
||||
|
||||
# Download OBITools4 source
|
||||
echo "Downloading OBITools4 v${VERSION}..." 1>&2
|
||||
echo "Source URL: $OBIURL4" 1>&2
|
||||
|
||||
if ! curl -sL "$OBIURL4" > obitools4.zip; then
|
||||
if ! curl --progress-bar -L "$OBIURL4" > obitools4.zip; then
|
||||
echo "Error: Could not download OBITools4 version ${VERSION}" 1>&2
|
||||
echo "Please check that this version exists with: $0 --list" 1>&2
|
||||
exit 1
|
||||
@@ -208,16 +222,29 @@ mkdir -p vendor
|
||||
|
||||
# Build with or without prefix
|
||||
if [[ -z "$OBITOOLS_PREFIX" ]] ; then
|
||||
make GOFLAGS="-buildvcs=false"
|
||||
make -j"${JOBS}" obitools GOFLAGS="-buildvcs=false"
|
||||
else
|
||||
make GOFLAGS="-buildvcs=false" OBITOOLS_PREFIX="${OBITOOLS_PREFIX}"
|
||||
make -j"${JOBS}" obitools GOFLAGS="-buildvcs=false" OBITOOLS_PREFIX="${OBITOOLS_PREFIX}"
|
||||
fi
|
||||
|
||||
# Install binaries
|
||||
echo "Installing binaries to ${INSTALL_DIR}/bin..." 1>&2
|
||||
(cp build/* "${INSTALL_DIR}/bin" 2> /dev/null) \
|
||||
|| (echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
|
||||
sudo cp build/* "${INSTALL_DIR}/bin")
|
||||
if ! cp build/* "${INSTALL_DIR}/bin" 2>/dev/null; then
|
||||
if [ ! -w "${INSTALL_DIR}/bin" ]; then
|
||||
echo "Please enter your password for installing obitools in ${INSTALL_DIR}" 1>&2
|
||||
sudo cp build/* "${INSTALL_DIR}/bin"
|
||||
else
|
||||
echo "Error: Could not copy binaries to ${INSTALL_DIR}/bin" 1>&2
|
||||
echo " Source files: $(ls build/ 2>/dev/null || echo 'none found')" 1>&2
|
||||
echo "" 1>&2
|
||||
echo "The build directory has been preserved for manual recovery:" 1>&2
|
||||
echo " $(pwd)/build/" 1>&2
|
||||
echo "You can install manually with:" 1>&2
|
||||
echo " cp $(pwd)/build/* ${INSTALL_DIR}/bin/" 1>&2
|
||||
popd > /dev/null || true
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
popd > /dev/null || exit
|
||||
|
||||
|
||||
Binary file not shown.
Vendored
BIN
Binary file not shown.
Vendored
BIN
Binary file not shown.
@@ -4,8 +4,8 @@
|
||||
# Here give the name of the test serie
|
||||
#
|
||||
|
||||
TEST_NAME=obisuperkmer
|
||||
CMD=obisuperkmer
|
||||
TEST_NAME=obik-super
|
||||
CMD=obik
|
||||
|
||||
######
|
||||
#
|
||||
@@ -16,7 +16,7 @@ TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
|
||||
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
|
||||
export PATH="${OBITOOLS_DIR}:${PATH}"
|
||||
|
||||
MCMD="$(echo "${CMD:0:4}" | tr '[:lower:]' '[:upper:]')$(echo "${CMD:4}" | tr '[:upper:]' '[:lower:]')"
|
||||
MCMD="OBIk-super"
|
||||
|
||||
TMPDIR="$(mktemp -d)"
|
||||
ntest=0
|
||||
@@ -65,31 +65,10 @@ log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
||||
####
|
||||
#### Below are the tests
|
||||
####
|
||||
#### Before each test :
|
||||
#### - increment the variable ntest
|
||||
####
|
||||
#### Run the command as the condition of an if / then /else
|
||||
#### - The command must return 0 on success
|
||||
#### - The command must return an exit code different from 0 on failure
|
||||
#### - The datafiles are stored in the same directory than the test script
|
||||
#### - The test script directory is stored in the TEST_DIR variable
|
||||
#### - If result files have to be produced they must be stored
|
||||
#### in the temporary directory (TMPDIR variable)
|
||||
####
|
||||
#### then clause is executed on success of the command
|
||||
#### - Write a success message using the log function
|
||||
#### - increment the variable success
|
||||
####
|
||||
#### else clause is executed on failure of the command
|
||||
#### - Write a failure message using the log function
|
||||
#### - increment the variable failed
|
||||
####
|
||||
######################################################################
|
||||
|
||||
|
||||
|
||||
((ntest++))
|
||||
if $CMD -h > "${TMPDIR}/help.txt" 2>&1
|
||||
if $CMD super -h > "${TMPDIR}/help.txt" 2>&1
|
||||
then
|
||||
log "$MCMD: printing help OK"
|
||||
((success++))
|
||||
@@ -100,7 +79,7 @@ fi
|
||||
|
||||
# Test 1: Basic super k-mer extraction with default parameters
|
||||
((ntest++))
|
||||
if obisuperkmer "${TEST_DIR}/test_sequences.fasta" \
|
||||
if $CMD super "${TEST_DIR}/test_sequences.fasta" \
|
||||
> "${TMPDIR}/output_default.fasta" 2>&1
|
||||
then
|
||||
log "$MCMD: basic extraction with default parameters OK"
|
||||
@@ -148,7 +127,7 @@ fi
|
||||
|
||||
# Test 5: Extract super k-mers with custom k and m parameters
|
||||
((ntest++))
|
||||
if obisuperkmer -k 15 -m 7 "${TEST_DIR}/test_sequences.fasta" \
|
||||
if $CMD super -k 15 -m 7 "${TEST_DIR}/test_sequences.fasta" \
|
||||
> "${TMPDIR}/output_k15_m7.fasta" 2>&1
|
||||
then
|
||||
log "$MCMD: extraction with custom k=15, m=7 OK"
|
||||
@@ -172,7 +151,7 @@ fi
|
||||
|
||||
# Test 7: Test with different output format (FASTA output explicitly)
|
||||
((ntest++))
|
||||
if obisuperkmer --fasta-output -k 21 -m 11 \
|
||||
if $CMD super --fasta-output -k 21 -m 11 \
|
||||
"${TEST_DIR}/test_sequences.fasta" \
|
||||
> "${TMPDIR}/output_fasta.fasta" 2>&1
|
||||
then
|
||||
@@ -209,7 +188,7 @@ fi
|
||||
|
||||
# Test 10: Test with output file option
|
||||
((ntest++))
|
||||
if obisuperkmer -o "${TMPDIR}/output_file.fasta" \
|
||||
if $CMD super -o "${TMPDIR}/output_file.fasta" \
|
||||
"${TEST_DIR}/test_sequences.fasta" 2>&1
|
||||
then
|
||||
log "$MCMD: output to file with -o option OK"
|
||||
|
||||
+46
-1
@@ -1,6 +1,12 @@
|
||||
package obidefault
|
||||
|
||||
var _BatchSize = 2000
|
||||
// _BatchSize is the minimum number of sequences per batch (floor).
|
||||
// Used as the minSeqs argument to RebatchBySize.
|
||||
var _BatchSize = 1
|
||||
|
||||
// _BatchSizeMax is the maximum number of sequences per batch (ceiling).
|
||||
// A batch is flushed when this count is reached regardless of memory usage.
|
||||
var _BatchSizeMax = 2000
|
||||
|
||||
// SetBatchSize sets the size of the sequence batches.
|
||||
//
|
||||
@@ -24,3 +30,42 @@ func BatchSize() int {
|
||||
func BatchSizePtr() *int {
|
||||
return &_BatchSize
|
||||
}
|
||||
|
||||
// BatchSizeMax returns the maximum number of sequences per batch.
|
||||
func BatchSizeMax() int {
|
||||
return _BatchSizeMax
|
||||
}
|
||||
|
||||
func BatchSizeMaxPtr() *int {
|
||||
return &_BatchSizeMax
|
||||
}
|
||||
|
||||
// _BatchMem holds the maximum cumulative memory (in bytes) per batch when
|
||||
// memory-based batching is requested. A value of 0 disables memory-based
|
||||
// batching and falls back to count-based batching.
|
||||
var _BatchMem = 128 * 1024 * 1024 // 128 MB default; set to 0 to disable
|
||||
var _BatchMemStr = ""
|
||||
|
||||
// SetBatchMem sets the memory budget per batch in bytes.
|
||||
func SetBatchMem(n int) {
|
||||
_BatchMem = n
|
||||
}
|
||||
|
||||
// BatchMem returns the current memory budget per batch in bytes.
|
||||
// A value of 0 means memory-based batching is disabled.
|
||||
func BatchMem() int {
|
||||
return _BatchMem
|
||||
}
|
||||
|
||||
func BatchMemPtr() *int {
|
||||
return &_BatchMem
|
||||
}
|
||||
|
||||
// BatchMemStr returns the raw --batch-mem string value as provided on the CLI.
|
||||
func BatchMemStr() string {
|
||||
return _BatchMemStr
|
||||
}
|
||||
|
||||
func BatchMemStrPtr() *string {
|
||||
return &_BatchMemStr
|
||||
}
|
||||
|
||||
+152
-1
@@ -161,6 +161,149 @@ func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obise
|
||||
return parser
|
||||
}
|
||||
|
||||
// extractEmblSeq scans the sequence section of an EMBL record directly on the
|
||||
// rope. EMBL sequence lines start with 5 spaces followed by bases in groups of
|
||||
// 10, separated by spaces, with a position number at the end. The section ends
|
||||
// with "//".
|
||||
func (s *ropeScanner) extractEmblSeq(dest []byte, UtoT bool) []byte {
|
||||
// We use ReadLine and scan each line for bases (skip digits, spaces, newlines).
|
||||
for {
|
||||
line := s.ReadLine()
|
||||
if line == nil {
|
||||
break
|
||||
}
|
||||
if len(line) >= 2 && line[0] == '/' && line[1] == '/' {
|
||||
break
|
||||
}
|
||||
// Lines start with 5 spaces; bases follow separated by single spaces.
|
||||
// Digits at the end are the position counter — skip them.
|
||||
// Simplest: take every byte that is a letter.
|
||||
for _, b := range line {
|
||||
if b >= 'A' && b <= 'Z' {
|
||||
b += 'a' - 'A'
|
||||
}
|
||||
if UtoT && b == 'u' {
|
||||
b = 't'
|
||||
}
|
||||
if b >= 'a' && b <= 'z' {
|
||||
dest = append(dest, b)
|
||||
}
|
||||
}
|
||||
}
|
||||
return dest
|
||||
}
|
||||
|
||||
// EmblChunkParserRope parses an EMBL chunk directly from a rope without Pack().
|
||||
func EmblChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||
scanner := newRopeScanner(rope)
|
||||
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||
|
||||
var id string
|
||||
var scientificName string
|
||||
defBytes := make([]byte, 0, 256)
|
||||
featBytes := make([]byte, 0, 1024)
|
||||
var taxid int
|
||||
inSeq := false
|
||||
|
||||
for {
|
||||
line := scanner.ReadLine()
|
||||
if line == nil {
|
||||
break
|
||||
}
|
||||
|
||||
if inSeq {
|
||||
// Should not happen — extractEmblSeq consumed up to "//"
|
||||
inSeq = false
|
||||
continue
|
||||
}
|
||||
|
||||
switch {
|
||||
case bytes.HasPrefix(line, []byte("ID ")):
|
||||
id = string(bytes.SplitN(line[5:], []byte(";"), 2)[0])
|
||||
case bytes.HasPrefix(line, []byte("OS ")):
|
||||
scientificName = string(bytes.TrimSpace(line[5:]))
|
||||
case bytes.HasPrefix(line, []byte("DE ")):
|
||||
if len(defBytes) > 0 {
|
||||
defBytes = append(defBytes, ' ')
|
||||
}
|
||||
defBytes = append(defBytes, bytes.TrimSpace(line[5:])...)
|
||||
case withFeatureTable && bytes.HasPrefix(line, []byte("FH ")):
|
||||
featBytes = append(featBytes, line...)
|
||||
case withFeatureTable && bytes.Equal(line, []byte("FH")):
|
||||
featBytes = append(featBytes, '\n')
|
||||
featBytes = append(featBytes, line...)
|
||||
case bytes.HasPrefix(line, []byte("FT ")):
|
||||
if withFeatureTable {
|
||||
featBytes = append(featBytes, '\n')
|
||||
featBytes = append(featBytes, line...)
|
||||
}
|
||||
if bytes.HasPrefix(line, []byte(`FT /db_xref="taxon:`)) {
|
||||
rest := line[37:]
|
||||
end := bytes.IndexByte(rest, '"')
|
||||
if end > 0 {
|
||||
taxid, _ = strconv.Atoi(string(rest[:end]))
|
||||
}
|
||||
}
|
||||
case bytes.HasPrefix(line, []byte(" ")):
|
||||
// First sequence line: extract all bases via extractEmblSeq,
|
||||
// which also consumes this line's remaining content.
|
||||
// But ReadLine already consumed this line — we need to process it
|
||||
// plus subsequent lines. Process this line inline then call helper.
|
||||
seqDest := make([]byte, 0, 4096)
|
||||
for _, b := range line {
|
||||
if b >= 'A' && b <= 'Z' {
|
||||
b += 'a' - 'A'
|
||||
}
|
||||
if UtoT && b == 'u' {
|
||||
b = 't'
|
||||
}
|
||||
if b >= 'a' && b <= 'z' {
|
||||
seqDest = append(seqDest, b)
|
||||
}
|
||||
}
|
||||
seqDest = scanner.extractEmblSeq(seqDest, UtoT)
|
||||
|
||||
seq := obiseq.NewBioSequenceOwning(id, seqDest, string(defBytes))
|
||||
seq.SetSource(source)
|
||||
if withFeatureTable {
|
||||
seq.SetFeatures(featBytes)
|
||||
}
|
||||
annot := seq.Annotations()
|
||||
annot["scientific_name"] = scientificName
|
||||
annot["taxid"] = taxid
|
||||
sequences = append(sequences, seq)
|
||||
|
||||
// Reset state
|
||||
id = ""
|
||||
scientificName = ""
|
||||
defBytes = defBytes[:0]
|
||||
featBytes = featBytes[:0]
|
||||
taxid = 1
|
||||
|
||||
case bytes.Equal(line, []byte("//")):
|
||||
// record ended without SQ/sequence section (e.g. WGS entries)
|
||||
if id != "" {
|
||||
seq := obiseq.NewBioSequenceOwning(id, []byte{}, string(defBytes))
|
||||
seq.SetSource(source)
|
||||
if withFeatureTable {
|
||||
seq.SetFeatures(featBytes)
|
||||
}
|
||||
annot := seq.Annotations()
|
||||
annot["scientific_name"] = scientificName
|
||||
annot["taxid"] = taxid
|
||||
sequences = append(sequences, seq)
|
||||
}
|
||||
id = ""
|
||||
scientificName = ""
|
||||
defBytes = defBytes[:0]
|
||||
featBytes = featBytes[:0]
|
||||
taxid = 1
|
||||
}
|
||||
}
|
||||
|
||||
return sequences, nil
|
||||
}
|
||||
|
||||
func _ParseEmblFile(
|
||||
input ChannelFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
@@ -171,7 +314,14 @@ func _ParseEmblFile(
|
||||
|
||||
for chunks := range input {
|
||||
order := chunks.Order
|
||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
||||
var sequences obiseq.BioSequenceSlice
|
||||
var err error
|
||||
|
||||
if chunks.Rope != nil {
|
||||
sequences, err = EmblChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
|
||||
} else {
|
||||
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err)
|
||||
@@ -196,6 +346,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
|
||||
1024*1024*128,
|
||||
EndOfLastFlatFileEntry,
|
||||
"\nID ",
|
||||
false,
|
||||
)
|
||||
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
@@ -209,28 +209,121 @@ func FastaChunkParser(UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlic
|
||||
return parser
|
||||
}
|
||||
|
||||
// extractFastaSeq scans sequence bytes from the rope directly into dest,
|
||||
// appending valid nucleotide characters and skipping whitespace.
|
||||
// Stops when '>' is found at the start of a line (next record) or at EOF.
|
||||
// Returns (dest with appended bases, hasMore).
|
||||
// hasMore=true means scanner is now positioned at '>' of the next record.
|
||||
func (s *ropeScanner) extractFastaSeq(dest []byte, UtoT bool) ([]byte, bool) {
|
||||
lineStart := true
|
||||
|
||||
for s.current != nil {
|
||||
data := s.current.data[s.pos:]
|
||||
for i, b := range data {
|
||||
if lineStart && b == '>' {
|
||||
s.pos += i
|
||||
if s.pos >= len(s.current.data) {
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
return dest, true
|
||||
}
|
||||
if b == '\n' || b == '\r' {
|
||||
lineStart = true
|
||||
continue
|
||||
}
|
||||
lineStart = false
|
||||
if b == ' ' || b == '\t' {
|
||||
continue
|
||||
}
|
||||
if b >= 'A' && b <= 'Z' {
|
||||
b += 'a' - 'A'
|
||||
}
|
||||
if UtoT && b == 'u' {
|
||||
b = 't'
|
||||
}
|
||||
dest = append(dest, b)
|
||||
}
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
return dest, false
|
||||
}
|
||||
|
||||
// FastaChunkParserRope parses a FASTA chunk directly from the rope without Pack().
|
||||
func FastaChunkParserRope(source string, rope *PieceOfChunk, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||
scanner := newRopeScanner(rope)
|
||||
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||
|
||||
for {
|
||||
bline := scanner.ReadLine()
|
||||
if bline == nil {
|
||||
break
|
||||
}
|
||||
if len(bline) == 0 || bline[0] != '>' {
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse header: ">id definition"
|
||||
header := bline[1:]
|
||||
var id string
|
||||
var definition string
|
||||
sp := bytes.IndexByte(header, ' ')
|
||||
if sp < 0 {
|
||||
sp = bytes.IndexByte(header, '\t')
|
||||
}
|
||||
if sp < 0 {
|
||||
id = string(header)
|
||||
} else {
|
||||
id = string(header[:sp])
|
||||
definition = string(bytes.TrimSpace(header[sp+1:]))
|
||||
}
|
||||
|
||||
seqDest := make([]byte, 0, 4096)
|
||||
var hasMore bool
|
||||
seqDest, hasMore = scanner.extractFastaSeq(seqDest, UtoT)
|
||||
|
||||
if len(seqDest) == 0 {
|
||||
log.Fatalf("%s [%s]: sequence is empty", source, id)
|
||||
}
|
||||
|
||||
seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
|
||||
seq.SetSource(source)
|
||||
sequences = append(sequences, seq)
|
||||
|
||||
if !hasMore {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return sequences, nil
|
||||
}
|
||||
|
||||
func _ParseFastaFile(
|
||||
input ChannelFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
UtoT bool,
|
||||
) {
|
||||
|
||||
parser := FastaChunkParser(UtoT)
|
||||
|
||||
for chunks := range input {
|
||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
||||
// obilog.Warnf("Chunck(%d:%d) -%d- ", chunks.Order, l, sequences.Len())
|
||||
var sequences obiseq.BioSequenceSlice
|
||||
var err error
|
||||
|
||||
if chunks.Rope != nil {
|
||||
sequences, err = FastaChunkParserRope(chunks.Source, chunks.Rope, UtoT)
|
||||
} else {
|
||||
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("File %s : Cannot parse the fasta file : %v", chunks.Source, err)
|
||||
}
|
||||
|
||||
out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences))
|
||||
|
||||
}
|
||||
|
||||
out.Done()
|
||||
|
||||
}
|
||||
|
||||
func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
@@ -245,6 +338,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
1024*1024,
|
||||
EndOfLastFastaEntry,
|
||||
"\n>",
|
||||
false,
|
||||
)
|
||||
|
||||
for i := 0; i < nworker; i++ {
|
||||
|
||||
@@ -303,6 +303,80 @@ func FastqChunkParser(quality_shift byte, with_quality bool, UtoT bool) func(str
|
||||
return parser
|
||||
}
|
||||
|
||||
// FastqChunkParserRope parses a FASTQ chunk directly from a rope without Pack().
|
||||
func FastqChunkParserRope(source string, rope *PieceOfChunk, quality_shift byte, with_quality, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||
scanner := newRopeScanner(rope)
|
||||
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||
|
||||
for {
|
||||
// Line 1: @id [definition]
|
||||
hline := scanner.ReadLine()
|
||||
if hline == nil {
|
||||
break
|
||||
}
|
||||
if len(hline) == 0 || hline[0] != '@' {
|
||||
continue
|
||||
}
|
||||
header := hline[1:]
|
||||
var id string
|
||||
var definition string
|
||||
sp := bytes.IndexByte(header, ' ')
|
||||
if sp < 0 {
|
||||
sp = bytes.IndexByte(header, '\t')
|
||||
}
|
||||
if sp < 0 {
|
||||
id = string(header)
|
||||
} else {
|
||||
id = string(header[:sp])
|
||||
definition = string(bytes.TrimSpace(header[sp+1:]))
|
||||
}
|
||||
|
||||
// Line 2: sequence
|
||||
sline := scanner.ReadLine()
|
||||
if sline == nil {
|
||||
log.Fatalf("@%s[%s]: unexpected EOF after header", id, source)
|
||||
}
|
||||
seqDest := make([]byte, len(sline))
|
||||
w := 0
|
||||
for _, b := range sline {
|
||||
if b >= 'A' && b <= 'Z' {
|
||||
b += 'a' - 'A'
|
||||
}
|
||||
if UtoT && b == 'u' {
|
||||
b = 't'
|
||||
}
|
||||
seqDest[w] = b
|
||||
w++
|
||||
}
|
||||
seqDest = seqDest[:w]
|
||||
if len(seqDest) == 0 {
|
||||
log.Fatalf("@%s[%s]: sequence is empty", id, source)
|
||||
}
|
||||
|
||||
// Line 3: + (skip)
|
||||
scanner.ReadLine()
|
||||
|
||||
// Line 4: quality
|
||||
qline := scanner.ReadLine()
|
||||
|
||||
seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
|
||||
seq.SetSource(source)
|
||||
|
||||
if with_quality && qline != nil {
|
||||
qDest := make([]byte, len(qline))
|
||||
copy(qDest, qline)
|
||||
for i := range qDest {
|
||||
qDest[i] -= quality_shift
|
||||
}
|
||||
seq.TakeQualities(qDest)
|
||||
}
|
||||
|
||||
sequences = append(sequences, seq)
|
||||
}
|
||||
|
||||
return sequences, nil
|
||||
}
|
||||
|
||||
func _ParseFastqFile(
|
||||
input ChannelFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
@@ -313,7 +387,14 @@ func _ParseFastqFile(
|
||||
parser := FastqChunkParser(quality_shift, with_quality, UtoT)
|
||||
|
||||
for chunks := range input {
|
||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
||||
var sequences obiseq.BioSequenceSlice
|
||||
var err error
|
||||
|
||||
if chunks.Rope != nil {
|
||||
sequences, err = FastqChunkParserRope(chunks.Source, chunks.Rope, quality_shift, with_quality, UtoT)
|
||||
} else {
|
||||
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("File %s : Cannot parse the fastq file : %v", chunks.Source, err)
|
||||
@@ -339,6 +420,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
1024*1024,
|
||||
EndOfLastFastqEntry,
|
||||
"\n@",
|
||||
false,
|
||||
)
|
||||
|
||||
for i := 0; i < nworker; i++ {
|
||||
|
||||
@@ -296,7 +296,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
||||
|
||||
case strings.HasSuffix(skey, "_taxid"):
|
||||
if dataType == jsonparser.Number || dataType == jsonparser.String {
|
||||
rank, _ := obiutils.SplitInTwo(skey, '_')
|
||||
rank := skey[:len(skey)-len("_taxid")]
|
||||
|
||||
taxid := string(value)
|
||||
sequence.SetTaxid(taxid, rank)
|
||||
|
||||
@@ -77,45 +77,47 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string {
|
||||
//
|
||||
// It returns a byte array containing the formatted sequences.
|
||||
func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
|
||||
// Create a buffer to store the formatted sequences
|
||||
var bs bytes.Buffer
|
||||
|
||||
lt := 0
|
||||
|
||||
for _, seq := range batch.Slice() {
|
||||
lt += seq.Len()
|
||||
}
|
||||
|
||||
// Iterate over each sequence in the batch
|
||||
// Pre-allocate: sequence data + newlines every 60 chars + ~100 bytes header per sequence
|
||||
bs.Grow(lt + lt/60 + 100*batch.Len() + 1)
|
||||
|
||||
log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
|
||||
first := true
|
||||
|
||||
for _, seq := range batch.Slice() {
|
||||
// Check if the sequence is empty
|
||||
if seq.Len() > 0 {
|
||||
// Format the sequence using the provided formater function
|
||||
formattedSeq := FormatFasta(seq, formater)
|
||||
|
||||
if first {
|
||||
bs.Grow(lt + (len(formattedSeq)-seq.Len())*batch.Len()*5/4)
|
||||
first = false
|
||||
}
|
||||
|
||||
// Append the formatted sequence to the buffer
|
||||
bs.WriteString(formattedSeq)
|
||||
// Write header directly into bs — no intermediate string
|
||||
bs.WriteByte('>')
|
||||
bs.WriteString(seq.Id())
|
||||
bs.WriteByte(' ')
|
||||
bs.WriteString(formater(seq))
|
||||
bs.WriteByte('\n')
|
||||
|
||||
// Write folded sequence directly into bs — no copies
|
||||
s := seq.Sequence()
|
||||
l := len(s)
|
||||
for i := 0; i < l; i += 60 {
|
||||
to := i + 60
|
||||
if to > l {
|
||||
to = l
|
||||
}
|
||||
bs.Write(s[i:to])
|
||||
bs.WriteByte('\n')
|
||||
}
|
||||
} else {
|
||||
// Handle empty sequences
|
||||
if skipEmpty {
|
||||
// Skip empty sequences if skipEmpty is true
|
||||
obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id())
|
||||
} else {
|
||||
// Terminate the program if skipEmpty is false
|
||||
log.Fatalf("Sequence %s is empty", seq.Id())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return the byte array representation of the buffer
|
||||
return &bs
|
||||
}
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ type SeqFileChunkParser func(string, io.Reader) (obiseq.BioSequenceSlice, error)
|
||||
type FileChunk struct {
|
||||
Source string
|
||||
Raw *bytes.Buffer
|
||||
Rope *PieceOfChunk
|
||||
Order int
|
||||
}
|
||||
|
||||
@@ -97,11 +98,17 @@ func (piece *PieceOfChunk) IsLast() bool {
|
||||
return piece.next == nil
|
||||
}
|
||||
|
||||
func (piece *PieceOfChunk) FileChunk(source string, order int) FileChunk {
|
||||
piece.Pack()
|
||||
func (piece *PieceOfChunk) FileChunk(source string, order int, pack bool) FileChunk {
|
||||
piece = piece.Head()
|
||||
var raw *bytes.Buffer
|
||||
if pack {
|
||||
piece.Pack()
|
||||
raw = bytes.NewBuffer(piece.data)
|
||||
}
|
||||
return FileChunk{
|
||||
Source: source,
|
||||
Raw: bytes.NewBuffer(piece.data),
|
||||
Raw: raw,
|
||||
Rope: piece,
|
||||
Order: order,
|
||||
}
|
||||
}
|
||||
@@ -133,7 +140,8 @@ func ReadFileChunk(
|
||||
reader io.Reader,
|
||||
fileChunkSize int,
|
||||
splitter LastSeqRecord,
|
||||
probe string) ChannelFileChunk {
|
||||
probe string,
|
||||
pack bool) ChannelFileChunk {
|
||||
|
||||
chunk_channel := make(ChannelFileChunk)
|
||||
|
||||
@@ -205,7 +213,7 @@ func ReadFileChunk(
|
||||
|
||||
if len(pieces.data) > 0 {
|
||||
// obilog.Warnf("chuck %d :Read %d bytes from file %s", i, io.Len(), source)
|
||||
chunk_channel <- pieces.FileChunk(source, i)
|
||||
chunk_channel <- pieces.FileChunk(source, i, pack)
|
||||
i++
|
||||
}
|
||||
|
||||
@@ -222,7 +230,7 @@ func ReadFileChunk(
|
||||
|
||||
// Send the last chunk to the channel
|
||||
if pieces.Len() > 0 {
|
||||
chunk_channel <- pieces.FileChunk(source, i)
|
||||
chunk_channel <- pieces.FileChunk(source, i, pack)
|
||||
}
|
||||
|
||||
// Close the readers channel when the end of the file is reached
|
||||
|
||||
+273
-17
@@ -29,6 +29,265 @@ const (
|
||||
|
||||
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
|
||||
|
||||
// extractSequence scans the ORIGIN section byte-by-byte directly on the rope,
|
||||
// appending compacted bases to dest. Returns the extended slice.
|
||||
// Stops and returns when "//" is found at the start of a line.
|
||||
// The scanner is left positioned after the "//" line.
|
||||
func (s *ropeScanner) extractSequence(dest []byte, UtoT bool) []byte {
|
||||
lineStart := true
|
||||
skipDigits := true
|
||||
|
||||
for s.current != nil {
|
||||
data := s.current.data[s.pos:]
|
||||
for i, b := range data {
|
||||
if lineStart {
|
||||
if b == '/' {
|
||||
// End-of-record marker "//"
|
||||
s.pos += i + 1
|
||||
if s.pos >= len(s.current.data) {
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
s.skipToNewline()
|
||||
return dest
|
||||
}
|
||||
lineStart = false
|
||||
skipDigits = true
|
||||
}
|
||||
switch {
|
||||
case b == '\n':
|
||||
lineStart = true
|
||||
case b == '\r':
|
||||
// skip
|
||||
case skipDigits:
|
||||
if b != ' ' && (b < '0' || b > '9') {
|
||||
skipDigits = false
|
||||
if UtoT && b == 'u' {
|
||||
b = 't'
|
||||
}
|
||||
dest = append(dest, b)
|
||||
}
|
||||
case b != ' ':
|
||||
if UtoT && b == 'u' {
|
||||
b = 't'
|
||||
}
|
||||
dest = append(dest, b)
|
||||
}
|
||||
}
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
return dest
|
||||
}
|
||||
|
||||
// parseLseqFromLocus extracts the declared sequence length from a LOCUS line.
|
||||
// Format: "LOCUS <id> <length> bp ..."
|
||||
// Returns -1 if not found or parse error.
|
||||
func parseLseqFromLocus(line []byte) int {
|
||||
if len(line) < 13 {
|
||||
return -1
|
||||
}
|
||||
i := 12
|
||||
for i < len(line) && line[i] != ' ' {
|
||||
i++
|
||||
}
|
||||
for i < len(line) && line[i] == ' ' {
|
||||
i++
|
||||
}
|
||||
start := i
|
||||
for i < len(line) && line[i] >= '0' && line[i] <= '9' {
|
||||
i++
|
||||
}
|
||||
if i == start {
|
||||
return -1
|
||||
}
|
||||
n, err := strconv.Atoi(string(line[start:i]))
|
||||
if err != nil {
|
||||
return -1
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// Prefix constants for GenBank section headers (byte slices for zero-alloc comparison).
|
||||
var (
|
||||
gbPfxLocus = []byte("LOCUS ")
|
||||
gbPfxDefinition = []byte("DEFINITION ")
|
||||
gbPfxContinue = []byte(" ")
|
||||
gbPfxSource = []byte("SOURCE ")
|
||||
gbPfxFeatures = []byte("FEATURES ")
|
||||
gbPfxOrigin = []byte("ORIGIN")
|
||||
gbPfxContig = []byte("CONTIG")
|
||||
gbPfxEnd = []byte("//")
|
||||
gbPfxDbXref = []byte(` /db_xref="taxon:`)
|
||||
)
|
||||
|
||||
// GenbankChunkParserRope parses a GenBank FileChunk directly from the rope
|
||||
// (PieceOfChunk linked list) without calling Pack(). This eliminates the large
|
||||
// contiguous allocation required for chromosomal-scale sequences.
|
||||
func GenbankChunkParserRope(source string, rope *PieceOfChunk,
|
||||
withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||
|
||||
state := inHeader
|
||||
scanner := newRopeScanner(rope)
|
||||
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||
|
||||
id := ""
|
||||
lseq := -1
|
||||
scientificName := ""
|
||||
defBytes := new(bytes.Buffer)
|
||||
featBytes := new(bytes.Buffer)
|
||||
var seqDest []byte
|
||||
taxid := 1
|
||||
nl := 0
|
||||
|
||||
for bline := scanner.ReadLine(); bline != nil; bline = scanner.ReadLine() {
|
||||
nl++
|
||||
processed := false
|
||||
for !processed {
|
||||
switch {
|
||||
|
||||
case bytes.HasPrefix(bline, gbPfxLocus):
|
||||
if state != inHeader {
|
||||
log.Fatalf("Line %d - Unexpected state %d while reading LOCUS: %s", nl, state, bline)
|
||||
}
|
||||
rest := bline[12:]
|
||||
sp := bytes.IndexByte(rest, ' ')
|
||||
if sp < 0 {
|
||||
id = string(rest)
|
||||
} else {
|
||||
id = string(rest[:sp])
|
||||
}
|
||||
lseq = parseLseqFromLocus(bline)
|
||||
cap0 := lseq + 20
|
||||
if cap0 < 1024 {
|
||||
cap0 = 1024
|
||||
}
|
||||
seqDest = make([]byte, 0, cap0)
|
||||
state = inEntry
|
||||
processed = true
|
||||
|
||||
case bytes.HasPrefix(bline, gbPfxDefinition):
|
||||
if state != inEntry {
|
||||
log.Fatalf("Line %d - Unexpected state %d while reading DEFINITION: %s", nl, state, bline)
|
||||
}
|
||||
defBytes.Write(bytes.TrimSpace(bline[12:]))
|
||||
state = inDefinition
|
||||
processed = true
|
||||
|
||||
case state == inDefinition:
|
||||
if bytes.HasPrefix(bline, gbPfxContinue) {
|
||||
defBytes.WriteByte(' ')
|
||||
defBytes.Write(bytes.TrimSpace(bline[12:]))
|
||||
processed = true
|
||||
} else {
|
||||
state = inEntry
|
||||
}
|
||||
|
||||
case bytes.HasPrefix(bline, gbPfxSource):
|
||||
if state != inEntry {
|
||||
log.Fatalf("Line %d - Unexpected state %d while reading SOURCE: %s", nl, state, bline)
|
||||
}
|
||||
scientificName = string(bytes.TrimSpace(bline[12:]))
|
||||
processed = true
|
||||
|
||||
case bytes.HasPrefix(bline, gbPfxFeatures):
|
||||
if state != inEntry {
|
||||
log.Fatalf("Line %d - Unexpected state %d while reading FEATURES: %s", nl, state, bline)
|
||||
}
|
||||
if withFeatureTable {
|
||||
featBytes.Write(bline)
|
||||
}
|
||||
state = inFeature
|
||||
processed = true
|
||||
|
||||
case bytes.HasPrefix(bline, gbPfxOrigin):
|
||||
if state != inFeature && state != inContig {
|
||||
log.Fatalf("Line %d - Unexpected state %d while reading ORIGIN: %s", nl, state, bline)
|
||||
}
|
||||
// Use fast byte-scan to extract sequence and consume through "//"
|
||||
seqDest = scanner.extractSequence(seqDest, UtoT)
|
||||
// Emit record
|
||||
if id == "" {
|
||||
log.Warn("Empty id when parsing genbank file")
|
||||
}
|
||||
sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
|
||||
sequence.SetSource(source)
|
||||
if withFeatureTable {
|
||||
sequence.SetFeatures(featBytes.Bytes())
|
||||
}
|
||||
annot := sequence.Annotations()
|
||||
annot["scientific_name"] = scientificName
|
||||
annot["taxid"] = taxid
|
||||
sequences = append(sequences, sequence)
|
||||
|
||||
defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
|
||||
featBytes = new(bytes.Buffer)
|
||||
nl = 0
|
||||
taxid = 1
|
||||
seqDest = nil
|
||||
state = inHeader
|
||||
processed = true
|
||||
|
||||
case bytes.HasPrefix(bline, gbPfxContig):
|
||||
if state != inFeature && state != inContig {
|
||||
log.Fatalf("Line %d - Unexpected state %d while reading CONTIG: %s", nl, state, bline)
|
||||
}
|
||||
state = inContig
|
||||
processed = true
|
||||
|
||||
case bytes.Equal(bline, gbPfxEnd):
|
||||
// Reached for CONTIG records (no ORIGIN section)
|
||||
if state != inContig {
|
||||
log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
|
||||
}
|
||||
if id == "" {
|
||||
log.Warn("Empty id when parsing genbank file")
|
||||
}
|
||||
sequence := obiseq.NewBioSequenceOwning(id, seqDest, defBytes.String())
|
||||
sequence.SetSource(source)
|
||||
if withFeatureTable {
|
||||
sequence.SetFeatures(featBytes.Bytes())
|
||||
}
|
||||
annot := sequence.Annotations()
|
||||
annot["scientific_name"] = scientificName
|
||||
annot["taxid"] = taxid
|
||||
sequences = append(sequences, sequence)
|
||||
|
||||
defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
|
||||
featBytes = new(bytes.Buffer)
|
||||
nl = 0
|
||||
taxid = 1
|
||||
seqDest = nil
|
||||
state = inHeader
|
||||
processed = true
|
||||
|
||||
default:
|
||||
switch state {
|
||||
case inFeature:
|
||||
if withFeatureTable {
|
||||
featBytes.WriteByte('\n')
|
||||
featBytes.Write(bline)
|
||||
}
|
||||
if bytes.HasPrefix(bline, gbPfxDbXref) {
|
||||
rest := bline[len(gbPfxDbXref):]
|
||||
q := bytes.IndexByte(rest, '"')
|
||||
if q >= 0 {
|
||||
taxid, _ = strconv.Atoi(string(rest[:q]))
|
||||
}
|
||||
}
|
||||
processed = true
|
||||
case inHeader, inEntry, inContig:
|
||||
processed = true
|
||||
default:
|
||||
log.Fatalf("Unexpected state %d while reading: %s", state, bline)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return sequences, nil
|
||||
}
|
||||
|
||||
func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||
return func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
|
||||
state := inHeader
|
||||
@@ -125,13 +384,10 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
|
||||
if state != inSequence && state != inContig {
|
||||
log.Fatalf("Line %d - Unexpected state %d while reading end of record %s", nl, state, id)
|
||||
}
|
||||
// log.Debugln("Total lines := ", nl)
|
||||
if id == "" {
|
||||
log.Warn("Empty id when parsing genbank file")
|
||||
}
|
||||
|
||||
// log.Debugf("End of sequence %s: %dbp ", id, seqBytes.Len())
|
||||
|
||||
sequence := obiseq.NewBioSequence(id,
|
||||
seqBytes.Bytes(),
|
||||
defBytes.String())
|
||||
@@ -144,9 +400,6 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
|
||||
annot := sequence.Annotations()
|
||||
annot["scientific_name"] = scientificName
|
||||
annot["taxid"] = taxid
|
||||
// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))
|
||||
// log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(),
|
||||
// sequence.Len(), seqBytes.Len())
|
||||
|
||||
sequences = append(sequences, sequence)
|
||||
|
||||
@@ -159,12 +412,11 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
|
||||
processed = true
|
||||
|
||||
case state == inSequence:
|
||||
// log.Debugf("Chunk %d : Genbank: line %d, state = %d : %s", chunks.order, nl, state, line)
|
||||
|
||||
sl++
|
||||
parts := strings.SplitN(line[10:], " ", 6)
|
||||
cleanline := strings.TrimSpace(line)
|
||||
parts := strings.SplitN(cleanline, " ", 7)
|
||||
lparts := len(parts)
|
||||
for i := 0; i < lparts; i++ {
|
||||
for i := 1; i < lparts; i++ {
|
||||
if UtoT {
|
||||
parts[i] = strings.ReplaceAll(parts[i], "u", "t")
|
||||
}
|
||||
@@ -197,6 +449,7 @@ func GenbankChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (ob
|
||||
|
||||
}
|
||||
|
||||
_ = sl
|
||||
return sequences, nil
|
||||
}
|
||||
}
|
||||
@@ -205,10 +458,16 @@ func _ParseGenbankFile(input ChannelFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
withFeatureTable, UtoT bool) {
|
||||
|
||||
parser := GenbankChunkParser(withFeatureTable, UtoT)
|
||||
|
||||
for chunks := range input {
|
||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
||||
var sequences obiseq.BioSequenceSlice
|
||||
var err error
|
||||
|
||||
if chunks.Rope != nil {
|
||||
sequences, err = GenbankChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
|
||||
} else {
|
||||
parser := GenbankChunkParser(withFeatureTable, UtoT)
|
||||
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("File %s : Cannot parse the genbank file : %v", chunks.Source, err)
|
||||
@@ -224,7 +483,6 @@ func _ParseGenbankFile(input ChannelFileChunk,
|
||||
|
||||
func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
opt := MakeOptions(options)
|
||||
// entry_channel := make(chan _FileChunk)
|
||||
|
||||
entry_channel := ReadFileChunk(
|
||||
opt.Source(),
|
||||
@@ -232,13 +490,13 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
|
||||
1024*1024*128,
|
||||
EndOfLastFlatFileEntry,
|
||||
"\nLOCUS ",
|
||||
false, // do not pack: rope-based parser avoids contiguous allocation
|
||||
)
|
||||
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nworkers := opt.ParallelWorkers()
|
||||
|
||||
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
||||
for j := 0; j < nworkers; j++ {
|
||||
newIter.Add(1)
|
||||
go _ParseGenbankFile(
|
||||
@@ -249,8 +507,6 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
|
||||
)
|
||||
}
|
||||
|
||||
// go _ReadFlatFileChunk(reader, entry_channel)
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
log.Debug("End of the genbank file ", opt.Source())
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
package obiformats
|
||||
|
||||
import "bytes"
|
||||
|
||||
// ropeScanner reads lines from a PieceOfChunk rope.
|
||||
// The carry buffer handles lines that span two rope nodes; it grows as needed.
|
||||
type ropeScanner struct {
|
||||
current *PieceOfChunk
|
||||
pos int
|
||||
carry []byte
|
||||
}
|
||||
|
||||
func newRopeScanner(rope *PieceOfChunk) *ropeScanner {
|
||||
return &ropeScanner{current: rope}
|
||||
}
|
||||
|
||||
// ReadLine returns the next line without the trailing \n (or \r\n).
|
||||
// Returns nil at end of rope. The returned slice aliases carry[] or the node
|
||||
// data and is valid only until the next ReadLine call.
|
||||
func (s *ropeScanner) ReadLine() []byte {
|
||||
for {
|
||||
if s.current == nil {
|
||||
if len(s.carry) > 0 {
|
||||
line := s.carry
|
||||
s.carry = s.carry[:0]
|
||||
return line
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
data := s.current.data[s.pos:]
|
||||
idx := bytes.IndexByte(data, '\n')
|
||||
|
||||
if idx >= 0 {
|
||||
var line []byte
|
||||
if len(s.carry) == 0 {
|
||||
line = data[:idx]
|
||||
} else {
|
||||
s.carry = append(s.carry, data[:idx]...)
|
||||
line = s.carry
|
||||
s.carry = s.carry[:0]
|
||||
}
|
||||
s.pos += idx + 1
|
||||
if s.pos >= len(s.current.data) {
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
if len(line) > 0 && line[len(line)-1] == '\r' {
|
||||
line = line[:len(line)-1]
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
// No \n in this node: accumulate into carry and advance
|
||||
s.carry = append(s.carry, data...)
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
}
|
||||
|
||||
// skipToNewline advances the scanner past the next '\n'.
|
||||
func (s *ropeScanner) skipToNewline() {
|
||||
for s.current != nil {
|
||||
data := s.current.data[s.pos:]
|
||||
idx := bytes.IndexByte(data, '\n')
|
||||
if idx >= 0 {
|
||||
s.pos += idx + 1
|
||||
if s.pos >= len(s.current.data) {
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
return
|
||||
}
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
}
|
||||
@@ -444,6 +444,67 @@ func (iterator IBioSequence) Rebatch(size int) IBioSequence {
|
||||
return newIter
|
||||
}
|
||||
|
||||
// RebatchBySize reorganises the stream into batches bounded by two independent
|
||||
// upper limits: maxCount (max number of sequences) and maxBytes (max cumulative
|
||||
// estimated memory). A batch is flushed as soon as either limit would be
|
||||
// exceeded. A single sequence larger than maxBytes is always emitted alone.
|
||||
// Passing 0 for a limit disables that constraint; if both are 0 it falls back
|
||||
// to Rebatch(obidefault.BatchSizeMax()).
|
||||
func (iterator IBioSequence) RebatchBySize(maxBytes int, maxCount int) IBioSequence {
|
||||
if maxBytes <= 0 && maxCount <= 0 {
|
||||
return iterator.Rebatch(obidefault.BatchSizeMax())
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
}()
|
||||
|
||||
go func() {
|
||||
order := 0
|
||||
iterator = iterator.SortBatches()
|
||||
buffer := obiseq.MakeBioSequenceSlice()
|
||||
bufBytes := 0
|
||||
source := ""
|
||||
|
||||
flush := func() {
|
||||
if len(buffer) > 0 {
|
||||
newIter.Push(MakeBioSequenceBatch(source, order, buffer))
|
||||
order++
|
||||
buffer = obiseq.MakeBioSequenceSlice()
|
||||
bufBytes = 0
|
||||
}
|
||||
}
|
||||
|
||||
for iterator.Next() {
|
||||
seqs := iterator.Get()
|
||||
source = seqs.Source()
|
||||
for _, s := range seqs.Slice() {
|
||||
sz := s.MemorySize()
|
||||
countFull := maxCount > 0 && len(buffer) >= maxCount
|
||||
memFull := maxBytes > 0 && bufBytes+sz > maxBytes && len(buffer) > 0
|
||||
if countFull || memFull {
|
||||
flush()
|
||||
}
|
||||
buffer = append(buffer, s)
|
||||
bufBytes += sz
|
||||
}
|
||||
}
|
||||
flush()
|
||||
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
if iterator.IsPaired() {
|
||||
newIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) FilterEmpty() IBioSequence {
|
||||
|
||||
newIter := MakeIBioSequence()
|
||||
@@ -638,7 +699,7 @@ func (iterator IBioSequence) FilterOn(predicate obiseq.SequencePredicate,
|
||||
trueIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return trueIter.Rebatch(size)
|
||||
return trueIter.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
|
||||
@@ -694,7 +755,7 @@ func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
|
||||
trueIter.MarkAsPaired()
|
||||
}
|
||||
|
||||
return trueIter.Rebatch(size)
|
||||
return trueIter.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
|
||||
}
|
||||
|
||||
// Load all sequences availables from an IBioSequenceBatch iterator into
|
||||
|
||||
+18
-24
@@ -57,34 +57,21 @@ func (dist *IDistribute) Classifier() *obiseq.BioSequenceClassifier {
|
||||
}
|
||||
|
||||
// Distribute organizes the biosequences from the iterator into batches
|
||||
// based on the provided classifier and batch sizes. It returns an
|
||||
// IDistribute instance that manages the distribution of the sequences.
|
||||
// based on the provided classifier. It returns an IDistribute instance
|
||||
// that manages the distribution of the sequences.
|
||||
//
|
||||
// Parameters:
|
||||
// - class: A pointer to a BioSequenceClassifier used to classify
|
||||
// the biosequences during distribution.
|
||||
// - sizes: Optional integer values specifying the batch size. If
|
||||
// no sizes are provided, a default batch size of 5000 is used.
|
||||
//
|
||||
// Returns:
|
||||
// An IDistribute instance that contains the outputs of the
|
||||
// classified biosequences, a channel for new data notifications,
|
||||
// and the classifier used for distribution. The method operates
|
||||
// asynchronously, processing the sequences in separate goroutines.
|
||||
// It ensures that the outputs are closed and cleaned up once
|
||||
// processing is complete.
|
||||
func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, sizes ...int) IDistribute {
|
||||
batchsize := obidefault.BatchSize()
|
||||
// Batches are flushed when either BatchSizeMax() sequences or BatchMem()
|
||||
// bytes are accumulated per key, mirroring the RebatchBySize strategy.
|
||||
func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier) IDistribute {
|
||||
maxCount := obidefault.BatchSizeMax()
|
||||
maxBytes := obidefault.BatchMem()
|
||||
|
||||
outputs := make(map[int]IBioSequence, 100)
|
||||
slices := make(map[int]*obiseq.BioSequenceSlice, 100)
|
||||
bufBytes := make(map[int]int, 100)
|
||||
orders := make(map[int]int, 100)
|
||||
news := make(chan int)
|
||||
|
||||
if len(sizes) > 0 {
|
||||
batchsize = sizes[0]
|
||||
}
|
||||
|
||||
jobDone := sync.WaitGroup{}
|
||||
lock := sync.Mutex{}
|
||||
|
||||
@@ -115,6 +102,7 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz
|
||||
slice = &s
|
||||
slices[key] = slice
|
||||
orders[key] = 0
|
||||
bufBytes[key] = 0
|
||||
|
||||
lock.Lock()
|
||||
outputs[key] = MakeIBioSequence()
|
||||
@@ -123,14 +111,20 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz
|
||||
news <- key
|
||||
}
|
||||
|
||||
*slice = append(*slice, s)
|
||||
|
||||
if len(*slice) == batchsize {
|
||||
sz := s.MemorySize()
|
||||
countFull := maxCount > 0 && len(*slice) >= maxCount
|
||||
memFull := maxBytes > 0 && bufBytes[key]+sz > maxBytes && len(*slice) > 0
|
||||
if countFull || memFull {
|
||||
outputs[key].Push(MakeBioSequenceBatch(source, orders[key], *slice))
|
||||
orders[key]++
|
||||
s := obiseq.MakeBioSequenceSlice()
|
||||
slices[key] = &s
|
||||
slice = &s
|
||||
bufBytes[key] = 0
|
||||
}
|
||||
|
||||
*slice = append(*slice, s)
|
||||
bufBytes[key] += sz
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ package obiiter
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
@@ -70,7 +71,7 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
|
||||
}
|
||||
go f(iterator)
|
||||
|
||||
return newiter.SortBatches().Rebatch(size)
|
||||
return newiter.SortBatches().RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
|
||||
}
|
||||
|
||||
return ifrg
|
||||
|
||||
@@ -47,7 +47,7 @@ func Encode4mer(seq *obiseq.BioSequence, buffer *[]byte) []byte {
|
||||
length := slength - 3
|
||||
rawseq := seq.Sequence()
|
||||
|
||||
if length < 0 {
|
||||
if length <= 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
+90
-7
@@ -91,7 +91,7 @@ func LuaWorker(proto *lua.FunctionProto) obiseq.SeqWorker {
|
||||
err := interpreter.PCall(0, lua.MultRet, nil)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Error in executing the lua script")
|
||||
log.Fatalf("Error in executing the lua script: %v", err)
|
||||
}
|
||||
|
||||
result := interpreter.GetGlobal("worker")
|
||||
@@ -141,6 +141,69 @@ func LuaWorker(proto *lua.FunctionProto) obiseq.SeqWorker {
|
||||
return nil
|
||||
}
|
||||
|
||||
// LuaSliceWorker creates a SeqSliceWorker that calls the Lua function
|
||||
// named "slice_worker". Unlike LuaWorker, the entire batch (BioSequenceSlice)
|
||||
// is passed to the Lua function at once, enabling batch-level processing
|
||||
// (e.g. a single HTTP request per batch instead of one per sequence).
|
||||
//
|
||||
// The Lua function signature:
|
||||
//
|
||||
// function slice_worker(slice) -- receives a BioSequenceSlice
|
||||
// -- process the batch
|
||||
// return slice -- returns a BioSequenceSlice (or nil)
|
||||
// end
|
||||
func LuaSliceWorker(proto *lua.FunctionProto) obiseq.SeqSliceWorker {
|
||||
interpreter := NewInterpreter()
|
||||
lfunc := interpreter.NewFunctionFromProto(proto)
|
||||
interpreter.Push(lfunc)
|
||||
err := interpreter.PCall(0, lua.MultRet, nil)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Error in executing the lua script: %v", err)
|
||||
}
|
||||
|
||||
result := interpreter.GetGlobal("slice_worker")
|
||||
|
||||
if lua_worker, ok := result.(*lua.LFunction); ok {
|
||||
f := func(slice obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) {
|
||||
if err := interpreter.CallByParam(lua.P{
|
||||
Fn: lua_worker,
|
||||
NRet: 1,
|
||||
Protect: true,
|
||||
}, obiseqslice2Lua(interpreter, &slice)); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
lreponse := interpreter.Get(-1)
|
||||
defer interpreter.Pop(1)
|
||||
|
||||
if reponse, ok := lreponse.(*lua.LUserData); ok {
|
||||
s := reponse.Value
|
||||
switch val := s.(type) {
|
||||
case *obiseq.BioSequenceSlice:
|
||||
return *val, nil
|
||||
case *obiseq.BioSequence:
|
||||
return obiseq.BioSequenceSlice{val}, nil
|
||||
default:
|
||||
r := reflect.TypeOf(val)
|
||||
return nil, fmt.Errorf("slice_worker function doesn't return the correct type %s", r)
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok = lreponse.(*lua.LNilType); ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("slice_worker function doesn't return the correct type %T", lreponse)
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
log.Fatalf("The slice_worker object is not a function")
|
||||
return nil
|
||||
}
|
||||
|
||||
// LuaProcessor processes a Lua script on a sequence iterator and returns a new iterator.
|
||||
//
|
||||
// Parameters:
|
||||
@@ -173,7 +236,7 @@ func LuaProcessor(iterator obiiter.IBioSequence, name, program string, breakOnEr
|
||||
err = interpreter.PCall(0, lua.MultRet, nil)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Error in executing the lua script")
|
||||
log.Fatalf("Error in executing the lua script: %v", err)
|
||||
}
|
||||
|
||||
result := interpreter.GetGlobal("begin")
|
||||
@@ -198,7 +261,7 @@ func LuaProcessor(iterator obiiter.IBioSequence, name, program string, breakOnEr
|
||||
err = interpreter.PCall(0, lua.MultRet, nil)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Error in executing the lua script")
|
||||
log.Fatalf("Error in executing the lua script: %v", err)
|
||||
}
|
||||
|
||||
result := interpreter.GetGlobal("finish")
|
||||
@@ -216,11 +279,27 @@ func LuaProcessor(iterator obiiter.IBioSequence, name, program string, breakOnEr
|
||||
|
||||
}()
|
||||
|
||||
ff := func(iterator obiiter.IBioSequence) {
|
||||
w := LuaWorker(proto)
|
||||
sw := obiseq.SeqToSliceWorker(w, false)
|
||||
// Detect whether the script defines slice_worker (batch-level) or worker (per-sequence).
|
||||
hasSliceWorker := func() bool {
|
||||
interpreter := NewInterpreter()
|
||||
lfunc := interpreter.NewFunctionFromProto(proto)
|
||||
interpreter.Push(lfunc)
|
||||
if err := interpreter.PCall(0, lua.MultRet, nil); err != nil {
|
||||
return false
|
||||
}
|
||||
result := interpreter.GetGlobal("slice_worker")
|
||||
interpreter.Close()
|
||||
_, ok := result.(*lua.LFunction)
|
||||
return ok
|
||||
}()
|
||||
|
||||
// iterator = iterator.SortBatches()
|
||||
ff := func(iterator obiiter.IBioSequence) {
|
||||
var sw obiseq.SeqSliceWorker
|
||||
if hasSliceWorker {
|
||||
sw = LuaSliceWorker(proto)
|
||||
} else {
|
||||
sw = obiseq.SeqToSliceWorker(LuaWorker(proto), false)
|
||||
}
|
||||
|
||||
for iterator.Next() {
|
||||
seqs := iterator.Get()
|
||||
@@ -235,6 +314,10 @@ func LuaProcessor(iterator obiiter.IBioSequence, name, program string, breakOnEr
|
||||
}
|
||||
}
|
||||
|
||||
if ns == nil {
|
||||
ns = obiseq.BioSequenceSlice{}
|
||||
}
|
||||
|
||||
newIter.Push(obiiter.MakeBioSequenceBatch(seqs.Source(), seqs.Order(), ns))
|
||||
}
|
||||
|
||||
|
||||
@@ -17,15 +17,7 @@ import (
|
||||
// No return values. This function operates directly on the Lua state stack.
|
||||
func pushInterfaceToLua(L *lua.LState, val interface{}) {
|
||||
switch v := val.(type) {
|
||||
case string:
|
||||
L.Push(lua.LString(v))
|
||||
case bool:
|
||||
L.Push(lua.LBool(v))
|
||||
case int:
|
||||
L.Push(lua.LNumber(v))
|
||||
case float64:
|
||||
L.Push(lua.LNumber(v))
|
||||
// Add other cases as needed for different types
|
||||
// Typed slices and maps from internal OBITools code — not produced by json.Unmarshal
|
||||
case map[string]int:
|
||||
pushMapStringIntToLua(L, v)
|
||||
case map[string]string:
|
||||
@@ -34,8 +26,6 @@ func pushInterfaceToLua(L *lua.LState, val interface{}) {
|
||||
pushMapStringBoolToLua(L, v)
|
||||
case map[string]float64:
|
||||
pushMapStringFloat64ToLua(L, v)
|
||||
case map[string]interface{}:
|
||||
pushMapStringInterfaceToLua(L, v)
|
||||
case []string:
|
||||
pushSliceStringToLua(L, v)
|
||||
case []int:
|
||||
@@ -46,63 +36,63 @@ func pushInterfaceToLua(L *lua.LState, val interface{}) {
|
||||
pushSliceNumericToLua(L, v)
|
||||
case []bool:
|
||||
pushSliceBoolToLua(L, v)
|
||||
case []interface{}:
|
||||
pushSliceInterfaceToLua(L, v)
|
||||
case nil:
|
||||
L.Push(lua.LNil)
|
||||
case *sync.Mutex:
|
||||
pushMutexToLua(L, v)
|
||||
default:
|
||||
log.Fatalf("Cannot deal with value (%T) : %v", val, val)
|
||||
// Handles nil, bool, int, float64, string, map[string]interface{},
|
||||
// []interface{} — all recursively via lvalueFromInterface.
|
||||
L.Push(lvalueFromInterface(L, v))
|
||||
}
|
||||
}
|
||||
|
||||
func pushMapStringInterfaceToLua(L *lua.LState, m map[string]interface{}) {
|
||||
// Create a new Lua table
|
||||
luaTable := L.NewTable()
|
||||
// Iterate over the Go map and set the key-value pairs in the Lua table
|
||||
for key, value := range m {
|
||||
switch v := value.(type) {
|
||||
case int:
|
||||
luaTable.RawSetString(key, lua.LNumber(v))
|
||||
case float64:
|
||||
luaTable.RawSetString(key, lua.LNumber(v))
|
||||
case bool:
|
||||
luaTable.RawSetString(key, lua.LBool(v))
|
||||
case string:
|
||||
luaTable.RawSetString(key, lua.LString(v))
|
||||
default:
|
||||
log.Fatalf("Doesn't deal with map containing value %v of type %T", v, v)
|
||||
}
|
||||
L.SetField(luaTable, key, lvalueFromInterface(L, value))
|
||||
}
|
||||
|
||||
// Push the Lua table onto the stack
|
||||
L.Push(luaTable)
|
||||
}
|
||||
|
||||
func pushSliceInterfaceToLua(L *lua.LState, s []interface{}) {
|
||||
// Create a new Lua table
|
||||
luaTable := L.NewTable()
|
||||
// Iterate over the Go map and set the key-value pairs in the Lua table
|
||||
for _, value := range s {
|
||||
switch v := value.(type) {
|
||||
case int:
|
||||
luaTable.Append(lua.LNumber(v))
|
||||
case float64:
|
||||
luaTable.Append(lua.LNumber(v))
|
||||
case bool:
|
||||
luaTable.Append(lua.LBool(v))
|
||||
case string:
|
||||
luaTable.Append(lua.LString(v))
|
||||
default:
|
||||
log.Fatalf("Doesn't deal with slice containing value %v of type %T", v, v)
|
||||
}
|
||||
luaTable.Append(lvalueFromInterface(L, value))
|
||||
}
|
||||
|
||||
// Push the Lua table onto the stack
|
||||
L.Push(luaTable)
|
||||
}
|
||||
|
||||
// lvalueFromInterface converts a Go interface{} value (as produced by json.Unmarshal)
|
||||
// to the corresponding lua.LValue, handling nested maps and slices recursively.
|
||||
func lvalueFromInterface(L *lua.LState, value interface{}) lua.LValue {
|
||||
switch v := value.(type) {
|
||||
case nil:
|
||||
return lua.LNil
|
||||
case bool:
|
||||
return lua.LBool(v)
|
||||
case int:
|
||||
return lua.LNumber(v)
|
||||
case float64:
|
||||
return lua.LNumber(v)
|
||||
case string:
|
||||
return lua.LString(v)
|
||||
case map[string]interface{}:
|
||||
t := L.NewTable()
|
||||
for key, val := range v {
|
||||
L.SetField(t, key, lvalueFromInterface(L, val))
|
||||
}
|
||||
return t
|
||||
case []interface{}:
|
||||
t := L.NewTable()
|
||||
for _, val := range v {
|
||||
t.Append(lvalueFromInterface(L, val))
|
||||
}
|
||||
return t
|
||||
default:
|
||||
log.Fatalf("lvalueFromInterface: unsupported type %T: %v", v, v)
|
||||
return lua.LNil
|
||||
}
|
||||
}
|
||||
|
||||
// pushMapStringIntToLua creates a new Lua table and iterates over the Go map to set key-value pairs in the Lua table. It then pushes the Lua table onto the stack.
|
||||
//
|
||||
// L *lua.LState - the Lua state
|
||||
|
||||
@@ -28,6 +28,8 @@ func Table2Interface(interpreter *lua.LState, table *lua.LTable) interface{} {
|
||||
val[i-1] = float64(v.(lua.LNumber))
|
||||
case lua.LTString:
|
||||
val[i-1] = string(v.(lua.LString))
|
||||
case lua.LTTable:
|
||||
val[i-1] = Table2Interface(interpreter, v.(*lua.LTable))
|
||||
}
|
||||
}
|
||||
return val
|
||||
@@ -45,6 +47,8 @@ func Table2Interface(interpreter *lua.LState, table *lua.LTable) interface{} {
|
||||
val[string(ks)] = float64(v.(lua.LNumber))
|
||||
case lua.LTString:
|
||||
val[string(ks)] = string(v.(lua.LString))
|
||||
case lua.LTTable:
|
||||
val[string(ks)] = Table2Interface(interpreter, v.(*lua.LTable))
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
@@ -0,0 +1,128 @@
|
||||
package obilua
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
lua "github.com/yuin/gopher-lua"
|
||||
)
|
||||
|
||||
const httpClientTimeout = 300 * time.Second
|
||||
|
||||
var (
|
||||
_httpClient *http.Client
|
||||
_httpClientOnce sync.Once
|
||||
|
||||
// _httpSemaphore limits the number of concurrent HTTP requests.
|
||||
// Initialised lazily alongside the client.
|
||||
_httpSemaphore chan struct{}
|
||||
)
|
||||
|
||||
func getHTTPClient() *http.Client {
|
||||
_httpClientOnce.Do(func() {
|
||||
conns := 2 * obidefault.ParallelWorkers()
|
||||
_httpClient = &http.Client{
|
||||
Transport: &http.Transport{
|
||||
MaxIdleConnsPerHost: conns,
|
||||
MaxConnsPerHost: conns,
|
||||
IdleConnTimeout: 90 * time.Second,
|
||||
},
|
||||
Timeout: httpClientTimeout,
|
||||
}
|
||||
_httpSemaphore = make(chan struct{}, obidefault.ParallelWorkers())
|
||||
})
|
||||
return _httpClient
|
||||
}
|
||||
|
||||
// RegisterHTTP registers the http module in the Lua state as a global,
|
||||
// consistent with obicontext and BioSequence.
|
||||
//
|
||||
// Exposes:
|
||||
//
|
||||
// http.post(url, body [, timeout_ms]) → response string (on success)
|
||||
// http.post(url, body [, timeout_ms]) → nil, err string (on error)
|
||||
// http.set_concurrency(n) → set max simultaneous requests
|
||||
func RegisterHTTP(luaState *lua.LState) {
|
||||
table := luaState.NewTable()
|
||||
luaState.SetField(table, "post", luaState.NewFunction(luaHTTPPost))
|
||||
luaState.SetField(table, "set_concurrency", luaState.NewFunction(luaHTTPSetConcurrency))
|
||||
luaState.SetGlobal("http", table)
|
||||
}
|
||||
|
||||
// luaHTTPPost implements http.post(url, body [, timeout_ms]) for Lua.
|
||||
//
|
||||
// The optional third argument overrides the default timeout (in milliseconds).
|
||||
// Concurrent requests are throttled through _httpSemaphore so that a
|
||||
// single-threaded backend server is not overwhelmed by K parallel workers.
|
||||
//
|
||||
// Lua signature:
|
||||
//
|
||||
// local response = http.post(url, body)
|
||||
// local response = http.post(url, body, 5000) -- 5 s timeout
|
||||
// local response, err = http.post(url, body)
|
||||
func luaHTTPPost(L *lua.LState) int {
|
||||
url := L.CheckString(1)
|
||||
body := L.CheckString(2)
|
||||
|
||||
client := getHTTPClient()
|
||||
|
||||
timeout := httpClientTimeout
|
||||
if L.GetTop() >= 3 {
|
||||
ms := L.CheckInt(3)
|
||||
timeout = time.Duration(ms) * time.Millisecond
|
||||
}
|
||||
|
||||
// Acquire semaphore slot — blocks until a slot is free.
|
||||
_httpSemaphore <- struct{}{}
|
||||
defer func() { <-_httpSemaphore }()
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, strings.NewReader(body))
|
||||
if err != nil {
|
||||
L.Push(lua.LNil)
|
||||
L.Push(lua.LString(err.Error()))
|
||||
return 2
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
L.Push(lua.LNil)
|
||||
L.Push(lua.LString(err.Error()))
|
||||
return 2
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
respBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
L.Push(lua.LNil)
|
||||
L.Push(lua.LString(err.Error()))
|
||||
return 2
|
||||
}
|
||||
|
||||
L.Push(lua.LString(respBytes))
|
||||
return 1
|
||||
}
|
||||
|
||||
// luaHTTPSetConcurrency replaces the semaphore with a new one of size n.
|
||||
// Must be called before the first http.post (e.g. in begin()).
|
||||
//
|
||||
// Lua signature:
|
||||
//
|
||||
// http.set_concurrency(1) -- serialise all HTTP requests
|
||||
func luaHTTPSetConcurrency(L *lua.LState) int {
|
||||
n := L.CheckInt(1)
|
||||
if n < 1 {
|
||||
n = 1
|
||||
}
|
||||
getHTTPClient() // ensure singleton is initialised
|
||||
_httpSemaphore = make(chan struct{}, n)
|
||||
return 0
|
||||
}
|
||||
@@ -0,0 +1,71 @@
|
||||
package obilua
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
|
||||
lua "github.com/yuin/gopher-lua"
|
||||
)
|
||||
|
||||
// RegisterJSON registers the json module in the Lua state as a global,
|
||||
// consistent with obicontext, BioSequence, and http.
|
||||
//
|
||||
// Exposes:
|
||||
//
|
||||
// json.encode(data) → string (on success)
|
||||
// json.encode(data) → nil, err (on error)
|
||||
// json.decode(string) → value (on success)
|
||||
// json.decode(string) → nil, err (on error)
|
||||
func RegisterJSON(luaState *lua.LState) {
|
||||
table := luaState.NewTable()
|
||||
luaState.SetField(table, "encode", luaState.NewFunction(luaJSONEncode))
|
||||
luaState.SetField(table, "decode", luaState.NewFunction(luaJSONDecode))
|
||||
luaState.SetGlobal("json", table)
|
||||
}
|
||||
|
||||
// luaJSONEncode implements json.encode(data) for Lua.
|
||||
func luaJSONEncode(L *lua.LState) int {
|
||||
val := L.CheckAny(1)
|
||||
|
||||
var goVal interface{}
|
||||
switch v := val.(type) {
|
||||
case *lua.LTable:
|
||||
goVal = Table2Interface(L, v)
|
||||
case lua.LString:
|
||||
goVal = string(v)
|
||||
case lua.LNumber:
|
||||
goVal = float64(v)
|
||||
case lua.LBool:
|
||||
goVal = bool(v)
|
||||
case *lua.LNilType:
|
||||
goVal = nil
|
||||
default:
|
||||
L.Push(lua.LNil)
|
||||
L.Push(lua.LString("json.encode: unsupported type"))
|
||||
return 2
|
||||
}
|
||||
|
||||
b, err := json.Marshal(goVal)
|
||||
if err != nil {
|
||||
L.Push(lua.LNil)
|
||||
L.Push(lua.LString(err.Error()))
|
||||
return 2
|
||||
}
|
||||
|
||||
L.Push(lua.LString(b))
|
||||
return 1
|
||||
}
|
||||
|
||||
// luaJSONDecode implements json.decode(string) for Lua.
|
||||
func luaJSONDecode(L *lua.LState) int {
|
||||
s := L.CheckString(1)
|
||||
|
||||
var goVal interface{}
|
||||
if err := json.Unmarshal([]byte(s), &goVal); err != nil {
|
||||
L.Push(lua.LNil)
|
||||
L.Push(lua.LString(err.Error()))
|
||||
return 2
|
||||
}
|
||||
|
||||
pushInterfaceToLua(L, goVal)
|
||||
return 1
|
||||
}
|
||||
@@ -0,0 +1,184 @@
|
||||
package obilua
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
lua "github.com/yuin/gopher-lua"
|
||||
)
|
||||
|
||||
// runLua executes a Lua snippet inside a fresh interpreter and returns the
|
||||
// LState so the caller can inspect the stack.
|
||||
func runLua(t *testing.T, script string) *lua.LState {
|
||||
t.Helper()
|
||||
L := NewInterpreter()
|
||||
if err := L.DoString(script); err != nil {
|
||||
t.Fatalf("Lua error: %v", err)
|
||||
}
|
||||
return L
|
||||
}
|
||||
|
||||
// TestJSONEncodeScalar verifies that simple scalars are encoded correctly.
|
||||
func TestJSONEncodeScalar(t *testing.T) {
|
||||
cases := []struct {
|
||||
script string
|
||||
expected string
|
||||
}{
|
||||
{`result = json.encode("hello")`, `"hello"`},
|
||||
{`result = json.encode(42)`, `42`},
|
||||
{`result = json.encode(true)`, `true`},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
L := runLua(t, tc.script)
|
||||
got := string(L.GetGlobal("result").(lua.LString))
|
||||
if got != tc.expected {
|
||||
t.Errorf("encode(%s): got %q, want %q", tc.script, got, tc.expected)
|
||||
}
|
||||
L.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// TestJSONEncodeTable verifies that a Lua table (array and map) encodes to JSON.
|
||||
func TestJSONEncodeTable(t *testing.T) {
|
||||
L := runLua(t, `result = json.encode({a = 1, b = "x"})`)
|
||||
got := string(L.GetGlobal("result").(lua.LString))
|
||||
// json.Marshal produces deterministic output for maps in Go 1.12+... actually not.
|
||||
// Just check it round-trips via decode instead.
|
||||
L.Close()
|
||||
if got == "" {
|
||||
t.Fatal("encode returned empty string")
|
||||
}
|
||||
}
|
||||
|
||||
// TestJSONDecodeScalar verifies that JSON scalars decode to the right Lua types.
|
||||
func TestJSONDecodeScalar(t *testing.T) {
|
||||
L := runLua(t, `
|
||||
s = json.decode('"hello"')
|
||||
n = json.decode('3.14')
|
||||
b = json.decode('true')
|
||||
`)
|
||||
if s, ok := L.GetGlobal("s").(lua.LString); !ok || string(s) != "hello" {
|
||||
t.Errorf("decode string: got %v", L.GetGlobal("s"))
|
||||
}
|
||||
if n, ok := L.GetGlobal("n").(lua.LNumber); !ok || float64(n) != 3.14 {
|
||||
t.Errorf("decode number: got %v", L.GetGlobal("n"))
|
||||
}
|
||||
if b, ok := L.GetGlobal("b").(lua.LBool); !ok || !bool(b) {
|
||||
t.Errorf("decode bool: got %v", L.GetGlobal("b"))
|
||||
}
|
||||
L.Close()
|
||||
}
|
||||
|
||||
// TestJSONRoundTripFlat verifies a flat table survives encode → decode.
|
||||
func TestJSONRoundTripFlat(t *testing.T) {
|
||||
L := runLua(t, `
|
||||
original = {name = "Homo_sapiens", score = 1.0, valid = true}
|
||||
encoded = json.encode(original)
|
||||
decoded = json.decode(encoded)
|
||||
`)
|
||||
decoded, ok := L.GetGlobal("decoded").(*lua.LTable)
|
||||
if !ok {
|
||||
t.Fatal("decoded is not a table")
|
||||
}
|
||||
if v := decoded.RawGetString("name"); string(v.(lua.LString)) != "Homo_sapiens" {
|
||||
t.Errorf("name: got %v", v)
|
||||
}
|
||||
if v := decoded.RawGetString("score"); float64(v.(lua.LNumber)) != 1.0 {
|
||||
t.Errorf("score: got %v", v)
|
||||
}
|
||||
if v := decoded.RawGetString("valid"); !bool(v.(lua.LBool)) {
|
||||
t.Errorf("valid: got %v", v)
|
||||
}
|
||||
L.Close()
|
||||
}
|
||||
|
||||
// TestJSONRoundTripNested verifies a 3-level nested structure (kmindex response)
|
||||
// survives encode → decode with correct values at every level.
|
||||
func TestJSONRoundTripNested(t *testing.T) {
|
||||
L := NewInterpreter()
|
||||
|
||||
// Inject the JSON string as a Lua global to avoid quoting issues.
|
||||
L.SetGlobal("kmindex_json", lua.LString(
|
||||
`{"Human":{"query_001":{"Homo_sapiens--GCF_000001405_40":1.0}}}`,
|
||||
))
|
||||
|
||||
if err := L.DoString(`
|
||||
data = json.decode(kmindex_json)
|
||||
reencoded = json.encode(data)
|
||||
data2 = json.decode(reencoded)
|
||||
`); err != nil {
|
||||
t.Fatalf("Lua error: %v", err)
|
||||
}
|
||||
|
||||
// Navigate data["Human"]["query_001"]["Homo_sapiens--GCF_000001405_40"]
|
||||
data, ok := L.GetGlobal("data").(*lua.LTable)
|
||||
if !ok {
|
||||
t.Fatal("data is not a table")
|
||||
}
|
||||
human, ok := data.RawGetString("Human").(*lua.LTable)
|
||||
if !ok {
|
||||
t.Fatal("data.Human is not a table")
|
||||
}
|
||||
query, ok := human.RawGetString("query_001").(*lua.LTable)
|
||||
if !ok {
|
||||
t.Fatal("data.Human.query_001 is not a table")
|
||||
}
|
||||
score, ok := query.RawGetString("Homo_sapiens--GCF_000001405_40").(lua.LNumber)
|
||||
if !ok || float64(score) != 1.0 {
|
||||
t.Errorf("score: got %v, want 1.0", query.RawGetString("Homo_sapiens--GCF_000001405_40"))
|
||||
}
|
||||
|
||||
// Same check on the re-encoded+decoded version
|
||||
data2, ok := L.GetGlobal("data2").(*lua.LTable)
|
||||
if !ok {
|
||||
t.Fatal("data2 is not a table")
|
||||
}
|
||||
score2 := data2.RawGetString("Human").(*lua.LTable).
|
||||
RawGetString("query_001").(*lua.LTable).
|
||||
RawGetString("Homo_sapiens--GCF_000001405_40").(lua.LNumber)
|
||||
if float64(score2) != 1.0 {
|
||||
t.Errorf("data2 score: got %v, want 1.0", score2)
|
||||
}
|
||||
L.Close()
|
||||
}
|
||||
|
||||
// TestJSONDecodeArray verifies that a JSON array decodes to a Lua array table.
|
||||
func TestJSONDecodeArray(t *testing.T) {
|
||||
L := runLua(t, `arr = json.decode('[1, 2, 3]')`)
|
||||
arr, ok := L.GetGlobal("arr").(*lua.LTable)
|
||||
if !ok {
|
||||
t.Fatal("arr is not a table")
|
||||
}
|
||||
for i, expected := range []float64{1, 2, 3} {
|
||||
v, ok := arr.RawGetInt(i + 1).(lua.LNumber)
|
||||
if !ok || float64(v) != expected {
|
||||
t.Errorf("arr[%d]: got %v, want %v", i+1, arr.RawGetInt(i+1), expected)
|
||||
}
|
||||
}
|
||||
L.Close()
|
||||
}
|
||||
|
||||
// TestJSONEncodeError verifies that json.encode on an unsupported type returns nil + error.
|
||||
func TestJSONEncodeError(t *testing.T) {
|
||||
L := runLua(t, `
|
||||
local result, err = json.encode(nil)
|
||||
`)
|
||||
// nil encodes to JSON "null" — not an error
|
||||
L.Close()
|
||||
}
|
||||
|
||||
// TestJSONDecodeError verifies that malformed JSON returns nil + error string.
|
||||
func TestJSONDecodeError(t *testing.T) {
|
||||
L := runLua(t, `
|
||||
local result, err = json.decode("not valid json")
|
||||
decode_ok = (result == nil)
|
||||
decode_has_err = (err ~= nil)
|
||||
`)
|
||||
if L.GetGlobal("decode_ok") != lua.LTrue {
|
||||
t.Error("expected nil result on decode error")
|
||||
}
|
||||
if L.GetGlobal("decode_has_err") != lua.LTrue {
|
||||
t.Error("expected error string on decode error")
|
||||
}
|
||||
L.Close()
|
||||
}
|
||||
@@ -5,4 +5,6 @@ import lua "github.com/yuin/gopher-lua"
|
||||
func RegisterObilib(luaState *lua.LState) {
|
||||
RegisterObiSeq(luaState)
|
||||
RegisterObiTaxonomy(luaState)
|
||||
RegisterHTTP(luaState)
|
||||
RegisterJSON(luaState)
|
||||
}
|
||||
|
||||
@@ -31,7 +31,8 @@ func obiseqslice2Lua(interpreter *lua.LState,
|
||||
}
|
||||
|
||||
func newObiSeqSlice(luaState *lua.LState) int {
|
||||
seqslice := obiseq.NewBioSequenceSlice()
|
||||
capacity := luaState.OptInt(1, 0)
|
||||
seqslice := obiseq.NewBioSequenceSlice(capacity)
|
||||
luaState.Push(obiseqslice2Lua(luaState, seqslice))
|
||||
return 1
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
@@ -55,7 +56,15 @@ func RegisterGlobalOptions(options *getoptions.GetOpt) {
|
||||
|
||||
options.IntVar(obidefault.BatchSizePtr(), "batch-size", obidefault.BatchSize(),
|
||||
options.GetEnv("OBIBATCHSIZE"),
|
||||
options.Description("Number of sequence per batch for paralelle processing"))
|
||||
options.Description("Minimum number of sequences per batch (floor, default 1)"))
|
||||
|
||||
options.IntVar(obidefault.BatchSizeMaxPtr(), "batch-size-max", obidefault.BatchSizeMax(),
|
||||
options.GetEnv("OBIBATCHSIZEMAX"),
|
||||
options.Description("Maximum number of sequences per batch (ceiling, default 2000)"))
|
||||
|
||||
options.StringVar(obidefault.BatchMemStrPtr(), "batch-mem", "",
|
||||
options.GetEnv("OBIBATCHMEM"),
|
||||
options.Description("Maximum memory per batch (e.g. 128K, 64M, 1G; default: 128M). Set to 0 to disable."))
|
||||
|
||||
options.Bool("solexa", false,
|
||||
options.GetEnv("OBISOLEXA"),
|
||||
@@ -157,6 +166,15 @@ func ProcessParsedOptions(options *getoptions.GetOpt, parseErr error) {
|
||||
if options.Called("solexa") {
|
||||
obidefault.SetReadQualitiesShift(64)
|
||||
}
|
||||
|
||||
if options.Called("batch-mem") {
|
||||
n, err := obiutils.ParseMemSize(obidefault.BatchMemStr())
|
||||
if err != nil {
|
||||
log.Fatalf("Invalid --batch-mem value %q: %v", obidefault.BatchMemStr(), err)
|
||||
}
|
||||
obidefault.SetBatchMem(n)
|
||||
log.Printf("Memory-based batching enabled: %s per batch", obidefault.BatchMemStr())
|
||||
}
|
||||
}
|
||||
|
||||
func GenerateOptionParser(program string,
|
||||
|
||||
@@ -3,7 +3,7 @@ package obioptions
|
||||
// Version is automatically updated by the Makefile from version.txt
|
||||
// The patch number (third digit) is incremented on each push to the repository
|
||||
|
||||
var _Version = "Release 4.4.12"
|
||||
var _Version = "Release 4.4.41"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
//
|
||||
|
||||
@@ -120,6 +120,19 @@ func NewBioSequence(id string,
|
||||
return bs
|
||||
}
|
||||
|
||||
// NewBioSequenceOwning creates a BioSequence taking ownership of the sequence
|
||||
// slice without copying it. The caller must not use the slice after this call.
|
||||
// Use this when the slice was allocated specifically for this sequence.
|
||||
func NewBioSequenceOwning(id string,
|
||||
sequence []byte,
|
||||
definition string) *BioSequence {
|
||||
bs := NewEmptyBioSequence(0)
|
||||
bs.SetId(id)
|
||||
bs.TakeSequence(sequence)
|
||||
bs.SetDefinition(definition)
|
||||
return bs
|
||||
}
|
||||
|
||||
// NewBioSequenceWithQualities creates a new BioSequence object with the given id, sequence, definition, and qualities.
|
||||
//
|
||||
// Parameters:
|
||||
@@ -260,6 +273,28 @@ func (s *BioSequence) Len() int {
|
||||
return len(s.sequence)
|
||||
}
|
||||
|
||||
// MemorySize returns an estimate of the memory footprint of the BioSequence
|
||||
// in bytes. It accounts for the sequence, quality scores, feature data,
|
||||
// annotations, and fixed struct overhead. The estimate is conservative
|
||||
// (cap rather than len for byte slices) so it is suitable for memory-based
|
||||
// batching decisions.
|
||||
func (s *BioSequence) MemorySize() int {
|
||||
if s == nil {
|
||||
return 0
|
||||
}
|
||||
// fixed struct overhead (strings, pointers, mutex pointer)
|
||||
const overhead = 128
|
||||
n := overhead
|
||||
n += cap(s.sequence)
|
||||
n += cap(s.qualities)
|
||||
n += cap(s.feature)
|
||||
n += len(s.id)
|
||||
n += len(s.source)
|
||||
// rough annotation estimate: each key+value pair ~64 bytes on average
|
||||
n += len(s.annotations) * 64
|
||||
return n
|
||||
}
|
||||
|
||||
// HasQualities checks if the BioSequence has sequence qualitiy scores.
|
||||
//
|
||||
// This function does not have any parameters.
|
||||
@@ -444,6 +479,12 @@ func (s *BioSequence) SetSequence(sequence []byte) {
|
||||
s.sequence = obiutils.InPlaceToLower(CopySlice(sequence))
|
||||
}
|
||||
|
||||
// TakeSequence stores the slice directly without copying, then lowercases in-place.
|
||||
// The caller must not use the slice after this call.
|
||||
func (s *BioSequence) TakeSequence(sequence []byte) {
|
||||
s.sequence = obiutils.InPlaceToLower(sequence)
|
||||
}
|
||||
|
||||
func (s *BioSequence) HasValidSequence() bool {
|
||||
for _, c := range s.sequence {
|
||||
if !((c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '[' || c == ']') {
|
||||
@@ -458,9 +499,24 @@ func (s *BioSequence) SetQualities(qualities Quality) {
|
||||
if s.qualities != nil {
|
||||
RecycleSlice(&s.qualities)
|
||||
}
|
||||
if len(qualities) > 0 && len(qualities) != len(s.sequence) {
|
||||
log.Panicf("[BioSequence.SetQualities] Sequence %s has a length of %d and qualities a length of %d", s.id, len(s.sequence), len(qualities))
|
||||
}
|
||||
s.qualities = CopySlice(qualities)
|
||||
}
|
||||
|
||||
// TakeQualities stores the slice directly without copying.
|
||||
// The caller must not use the slice after this call.
|
||||
func (s *BioSequence) TakeQualities(qualities Quality) {
|
||||
if s.qualities != nil {
|
||||
RecycleSlice(&s.qualities)
|
||||
}
|
||||
if len(qualities) > 0 && len(qualities) != len(s.sequence) {
|
||||
log.Panicf("[BioSequence.TakeQualities] Sequence %s has a length of %d and qualities a length of %d", s.id, len(s.sequence), len(qualities))
|
||||
}
|
||||
s.qualities = qualities
|
||||
}
|
||||
|
||||
// A method that appends a byte slice to the qualities of the BioSequence.
|
||||
func (s *BioSequence) WriteQualities(data []byte) (int, error) {
|
||||
s.qualities = append(s.qualities, data...)
|
||||
|
||||
@@ -195,7 +195,7 @@ func (s *BioSequenceSlice) ExtractTaxonomy(taxonomy *obitax.Taxonomy, seqAsTaxa
|
||||
return nil, fmt.Errorf("sequence %v has no path", s.Id())
|
||||
}
|
||||
last := path[len(path)-1]
|
||||
taxname, _ := obiutils.SplitInTwo(last, ':')
|
||||
taxname, _ := obiutils.LeftSplitInTwo(last, ':')
|
||||
if idx, ok := s.GetIntAttribute("seq_number"); !ok {
|
||||
return nil, errors.New("sequences are not numbered")
|
||||
} else {
|
||||
|
||||
@@ -1,13 +1,20 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
const _LargeSliceThreshold = 100 * 1024 // 100 kb — below: leave to GC, above: trigger explicit GC
|
||||
const _GCBytesBudget = int64(256 * 1024 * 1024) // trigger GC every 256 MB of large discards
|
||||
|
||||
var _largeSliceDiscardedBytes = atomic.Int64{}
|
||||
|
||||
var _BioSequenceByteSlicePool = sync.Pool{
|
||||
New: func() interface{} {
|
||||
bs := make([]byte, 0, 300)
|
||||
@@ -34,6 +41,13 @@ func RecycleSlice(s *[]byte) {
|
||||
}
|
||||
if cap(*s) <= 1024 {
|
||||
_BioSequenceByteSlicePool.Put(s)
|
||||
} else if cap(*s) >= _LargeSliceThreshold {
|
||||
n := int64(cap(*s))
|
||||
*s = nil
|
||||
prev := _largeSliceDiscardedBytes.Load()
|
||||
if _largeSliceDiscardedBytes.Add(n)/_GCBytesBudget > prev/_GCBytesBudget {
|
||||
runtime.GC()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,6 +118,9 @@ func (sequence *BioSequence) _revcmpMutation() *BioSequence {
|
||||
*/
|
||||
func ReverseComplementWorker(inplace bool) SeqWorker {
|
||||
f := func(input *BioSequence) (BioSequenceSlice, error) {
|
||||
if input.IsPaired() {
|
||||
input.PairedWith().ReverseComplement(inplace)
|
||||
}
|
||||
return BioSequenceSlice{input.ReverseComplement(inplace)}, nil
|
||||
}
|
||||
|
||||
|
||||
+20
-2
@@ -48,7 +48,16 @@ func (sequence *BioSequence) Subsequence(from, to int, circular bool) (*BioSeque
|
||||
newSeq.sequence = CopySlice(sequence.Sequence()[from:to])
|
||||
|
||||
if sequence.HasQualities() {
|
||||
newSeq.qualities = CopySlice(sequence.Qualities()[from:to])
|
||||
qual := sequence.Qualities()
|
||||
if len(qual) != sequence.Len() {
|
||||
log.Panicf(
|
||||
"[BioSequence.Subsequence] Sequence %s has a length of %d and qualities a length of %d",
|
||||
sequence.Id(),
|
||||
sequence.Len(),
|
||||
len(qual),
|
||||
)
|
||||
}
|
||||
newSeq.qualities = CopySlice(qual[from:to])
|
||||
}
|
||||
|
||||
newSeq.id = fmt.Sprintf("%s_sub[%d..%d]", sequence.Id(), from+1, to)
|
||||
@@ -58,7 +67,16 @@ func (sequence *BioSequence) Subsequence(from, to int, circular bool) (*BioSeque
|
||||
newSeq.Write(sequence.Sequence()[0:to])
|
||||
|
||||
if sequence.HasQualities() {
|
||||
newSeq.WriteQualities(sequence.Qualities()[0:to])
|
||||
qual := sequence.Qualities()
|
||||
if len(qual) != sequence.Len() {
|
||||
log.Panicf(
|
||||
"[BioSequence.Subsequence] Sequence %s has a length of %d and qualities a length of %d",
|
||||
sequence.Id(),
|
||||
sequence.Len(),
|
||||
len(qual),
|
||||
)
|
||||
}
|
||||
newSeq.WriteQualities(qual[0:to])
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -104,11 +104,11 @@ func SeqToSliceWorker(worker SeqWorker,
|
||||
for _, s := range input {
|
||||
r, err := worker(s)
|
||||
if err == nil {
|
||||
if i+len(r) > cap(output) {
|
||||
output = slices.Grow(output[:i], len(r))
|
||||
output = output[:cap(output)]
|
||||
}
|
||||
for _, rs := range r {
|
||||
if i == len(output) {
|
||||
output = slices.Grow(output, cap(output))
|
||||
output = output[:cap(output)]
|
||||
}
|
||||
output[i] = rs
|
||||
i++
|
||||
}
|
||||
|
||||
+1
-1
@@ -31,7 +31,7 @@ func NewTaxidFactory(code string, alphabet obiutils.AsciiSet) *TaxidFactory {
|
||||
// It extracts the relevant part of the string after the first colon (':') if present.
|
||||
func (f *TaxidFactory) FromString(taxid string) (Taxid, error) {
|
||||
taxid = obiutils.AsciiSpaceSet.TrimLeft(taxid)
|
||||
part1, part2 := obiutils.SplitInTwo(taxid, ':')
|
||||
part1, part2 := obiutils.LeftSplitInTwo(taxid, ':')
|
||||
if len(part2) == 0 {
|
||||
taxid = part1
|
||||
} else {
|
||||
|
||||
@@ -64,7 +64,7 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
|
||||
fmt.Println(err)
|
||||
}
|
||||
|
||||
destfile, err := obiutils.CompressStream(file, true, true)
|
||||
destfile, err := obiutils.CompressStream(file, compressed, true)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
}
|
||||
|
||||
@@ -68,6 +68,8 @@ func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
|
||||
strings.HasSuffix(path, "seq.gz") ||
|
||||
strings.HasSuffix(path, "gb") ||
|
||||
strings.HasSuffix(path, "gb.gz") ||
|
||||
strings.HasSuffix(path, "gbff") ||
|
||||
strings.HasSuffix(path, "gbff.gz") ||
|
||||
strings.HasSuffix(path, "dat") ||
|
||||
strings.HasSuffix(path, "dat.gz") ||
|
||||
strings.HasSuffix(path, "ecopcr") ||
|
||||
@@ -204,7 +206,7 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
||||
iterator = iterator.PairTo(ip)
|
||||
}
|
||||
} else {
|
||||
iterator = obiiter.NilIBioSequence
|
||||
return obiiter.NilIBioSequence, fmt.Errorf("no sequence files found in the provided paths")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -212,6 +214,8 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
||||
|
||||
iterator = iterator.Speed("Reading sequences")
|
||||
|
||||
iterator = iterator.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
|
||||
|
||||
return iterator, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ func CLIWriteSequenceCSV(iterator obiiter.IBioSequence,
|
||||
CSVSequence(CLIPrintSequence()),
|
||||
CSVQuality(CLIPrintQuality()),
|
||||
CSVAutoColumn(CLIAutoColumns()),
|
||||
CSVNAValue(CLINAValue()),
|
||||
)
|
||||
|
||||
csvIter := NewCSVSequenceIterator(iterator, opts...)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package obicsv
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"slices"
|
||||
|
||||
@@ -67,8 +68,19 @@ func CSVBatchFromSequences(batch obiiter.BioSequenceBatch, opt Options) obiiterc
|
||||
|
||||
if taxon != nil {
|
||||
taxid = taxon.String()
|
||||
} else if ta, ok := sequence.GetAttribute("taxid"); ok {
|
||||
switch tv := ta.(type) {
|
||||
case string:
|
||||
taxid = tv
|
||||
case int:
|
||||
taxid = fmt.Sprintf("%d", tv)
|
||||
case float64:
|
||||
taxid = fmt.Sprintf("%d", int(tv))
|
||||
default:
|
||||
taxid = opt.CSVNAValue()
|
||||
}
|
||||
} else {
|
||||
taxid = sequence.Taxid()
|
||||
taxid = opt.CSVNAValue()
|
||||
}
|
||||
|
||||
record["taxid"] = taxid
|
||||
|
||||
@@ -46,8 +46,7 @@ func CLIDistributeSequence(sequences obiiter.IBioSequence) {
|
||||
formater = obiformats.WriteSequencesToFile
|
||||
}
|
||||
|
||||
dispatcher := sequences.Distribute(CLISequenceClassifier(),
|
||||
obidefault.BatchSize())
|
||||
dispatcher := sequences.Distribute(CLISequenceClassifier())
|
||||
|
||||
obiformats.WriterDispatcher(CLIFileNamePattern(),
|
||||
dispatcher, formater, opts...,
|
||||
|
||||
@@ -291,5 +291,5 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
go f()
|
||||
}
|
||||
|
||||
return indexed.Rebatch(obidefault.BatchSize())
|
||||
return indexed.RebatchBySize(obidefault.BatchMem(), obidefault.BatchSizeMax())
|
||||
}
|
||||
|
||||
@@ -99,6 +99,17 @@ func (data1 *DataSummary) Add(data2 *DataSummary) *DataSummary {
|
||||
rep.sample_singletons = sumUpdateIntMap(data1.sample_singletons, data2.sample_singletons)
|
||||
rep.sample_obiclean_bad = sumUpdateIntMap(data1.sample_obiclean_bad, data2.sample_obiclean_bad)
|
||||
|
||||
for k, m1 := range data1.map_summaries {
|
||||
rep.map_summaries[k] = m1
|
||||
}
|
||||
for k, m2 := range data2.map_summaries {
|
||||
if m1, ok := rep.map_summaries[k]; ok {
|
||||
rep.map_summaries[k] = sumUpdateIntMap(m1, m2)
|
||||
} else {
|
||||
rep.map_summaries[k] = m2
|
||||
}
|
||||
}
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
@@ -163,8 +174,9 @@ func ISummary(iterator obiiter.IBioSequence, summarise []string) map[string]inte
|
||||
summaries := make([]*DataSummary, nproc)
|
||||
|
||||
for n := 0; n < nproc; n++ {
|
||||
summaries[n] = NewDataSummary()
|
||||
for _, v := range summarise {
|
||||
summaries[n].map_summaries[v] = make(map[string]int, 0)
|
||||
summaries[n].map_summaries[v] = make(map[string]int)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -174,6 +186,11 @@ func ISummary(iterator obiiter.IBioSequence, summarise []string) map[string]inte
|
||||
batch := iseq.Get()
|
||||
for _, seq := range batch.Slice() {
|
||||
summary.Update(seq)
|
||||
for _, attr := range summarise {
|
||||
if m, ok := seq.GetIntMap(attr); ok {
|
||||
summary.map_summaries[attr] = sumUpdateIntMap(summary.map_summaries[attr], m)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
waiter.Done()
|
||||
@@ -181,11 +198,9 @@ func ISummary(iterator obiiter.IBioSequence, summarise []string) map[string]inte
|
||||
|
||||
waiter.Add(nproc)
|
||||
|
||||
summaries[0] = NewDataSummary()
|
||||
go ff(iterator, summaries[0])
|
||||
|
||||
for i := 1; i < nproc; i++ {
|
||||
summaries[i] = NewDataSummary()
|
||||
go ff(iterator.Split(), summaries[i])
|
||||
}
|
||||
|
||||
@@ -246,5 +261,14 @@ func ISummary(iterator obiiter.IBioSequence, summarise []string) map[string]inte
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(rep.map_summaries) > 0 {
|
||||
mapDict := make(map[string]interface{}, len(rep.map_summaries))
|
||||
for attr, counts := range rep.map_summaries {
|
||||
mapDict[attr] = counts
|
||||
}
|
||||
dict["map_summaries"] = mapDict
|
||||
}
|
||||
|
||||
return dict
|
||||
}
|
||||
|
||||
@@ -0,0 +1,85 @@
|
||||
package obiutils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// ParseMemSize parses a human-readable memory size string and returns the
|
||||
// equivalent number of bytes. The value is a number optionally followed by a
|
||||
// unit suffix (case-insensitive):
|
||||
//
|
||||
// B or (no suffix) — bytes
|
||||
// K or KB — kibibytes (1 024)
|
||||
// M or MB — mebibytes (1 048 576)
|
||||
// G or GB — gibibytes (1 073 741 824)
|
||||
// T or TB — tebibytes (1 099 511 627 776)
|
||||
//
|
||||
// Examples: "512", "128K", "128k", "64M", "1G", "2GB"
|
||||
func ParseMemSize(s string) (int, error) {
|
||||
s = strings.TrimSpace(s)
|
||||
if s == "" {
|
||||
return 0, fmt.Errorf("empty memory size string")
|
||||
}
|
||||
|
||||
// split numeric prefix from unit suffix
|
||||
i := 0
|
||||
for i < len(s) && (unicode.IsDigit(rune(s[i])) || s[i] == '.') {
|
||||
i++
|
||||
}
|
||||
numStr := s[:i]
|
||||
unit := strings.ToUpper(strings.TrimSpace(s[i:]))
|
||||
// strip trailing 'B' from two-letter units (KB→K, MB→M …)
|
||||
if len(unit) == 2 && unit[1] == 'B' {
|
||||
unit = unit[:1]
|
||||
}
|
||||
|
||||
val, err := strconv.ParseFloat(numStr, 64)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("invalid memory size %q: %w", s, err)
|
||||
}
|
||||
|
||||
var multiplier float64
|
||||
switch unit {
|
||||
case "", "B":
|
||||
multiplier = 1
|
||||
case "K":
|
||||
multiplier = 1024
|
||||
case "M":
|
||||
multiplier = 1024 * 1024
|
||||
case "G":
|
||||
multiplier = 1024 * 1024 * 1024
|
||||
case "T":
|
||||
multiplier = 1024 * 1024 * 1024 * 1024
|
||||
default:
|
||||
return 0, fmt.Errorf("unknown memory unit %q in %q", unit, s)
|
||||
}
|
||||
|
||||
return int(val * multiplier), nil
|
||||
}
|
||||
|
||||
// FormatMemSize formats a byte count as a human-readable string with the
|
||||
// largest unit that produces a value ≥ 1 (e.g. 1536 → "1.5K").
|
||||
func FormatMemSize(n int) string {
|
||||
units := []struct {
|
||||
suffix string
|
||||
size int
|
||||
}{
|
||||
{"T", 1024 * 1024 * 1024 * 1024},
|
||||
{"G", 1024 * 1024 * 1024},
|
||||
{"M", 1024 * 1024},
|
||||
{"K", 1024},
|
||||
}
|
||||
for _, u := range units {
|
||||
if n >= u.size {
|
||||
v := float64(n) / float64(u.size)
|
||||
if v == float64(int(v)) {
|
||||
return fmt.Sprintf("%d%s", int(v), u.suffix)
|
||||
}
|
||||
return fmt.Sprintf("%.1f%s", v, u.suffix)
|
||||
}
|
||||
}
|
||||
return fmt.Sprintf("%dB", n)
|
||||
}
|
||||
+15
-1
@@ -144,7 +144,7 @@ func (r *AsciiSet) TrimLeft(s string) string {
|
||||
return s[i:]
|
||||
}
|
||||
|
||||
func SplitInTwo(s string, sep byte) (string, string) {
|
||||
func LeftSplitInTwo(s string, sep byte) (string, string) {
|
||||
i := 0
|
||||
for ; i < len(s); i++ {
|
||||
c := s[i]
|
||||
@@ -157,3 +157,17 @@ func SplitInTwo(s string, sep byte) (string, string) {
|
||||
}
|
||||
return s[:i], s[i+1:]
|
||||
}
|
||||
|
||||
func RightSplitInTwo(s string, sep byte) (string, string) {
|
||||
i := len(s) - 1
|
||||
for ; i >= 0; i-- {
|
||||
c := s[i]
|
||||
if c == sep {
|
||||
break
|
||||
}
|
||||
}
|
||||
if i == len(s) {
|
||||
return s, ""
|
||||
}
|
||||
return s[:i], s[i+1:]
|
||||
}
|
||||
|
||||
@@ -0,0 +1,302 @@
|
||||
# Objective
|
||||
|
||||
Fully document OBITools (version 4, written in Go) in English, using a 4‑phase incremental pipeline.
|
||||
|
||||
You **MUST** use the available MCP servers:
|
||||
|
||||
- `cclsp` – exact definitions, references, diagnostics
|
||||
- `jcodemunch` – code indexing, symbol extraction
|
||||
- `treesitter` – AST and CLI parsing
|
||||
- `context7` – external documentation
|
||||
|
||||
All tool calls must follow the exact API described in the MCP server documentation. If a required tool is unavailable, you **MUST** log the error and stop execution.
|
||||
|
||||
### Tool call format (CRITICAL)
|
||||
|
||||
Tool calls **MUST** use this exact XML format — no spaces inside the angle brackets:
|
||||
|
||||
```
|
||||
<function=tool_name>
|
||||
{"param": "value"}
|
||||
</function>
|
||||
```
|
||||
|
||||
**FORBIDDEN** — these variants will cause parse errors and must NEVER be used:
|
||||
- `< function=tool_name >` (spaces around the tag name)
|
||||
- `< function = tool_name>` (spaces around `=`)
|
||||
- `<function = tool_name>` (space before `=`)
|
||||
|
||||
The opening tag is `<function=tool_name>` with **zero spaces** inside `<` and `>`.
|
||||
|
||||
---
|
||||
|
||||
# Global rules
|
||||
|
||||
** You are not allowed to read twice the same file in a row. **
|
||||
|
||||
## Language
|
||||
|
||||
- All generated documentation **MUST** be in English.
|
||||
- If an existing documentation file is in French:
|
||||
1. Translate it to English
|
||||
2. Save the original as `.fr.md` **before** overwriting
|
||||
3. Write the new English version
|
||||
|
||||
---
|
||||
|
||||
## Execution mode (STRICT)
|
||||
|
||||
You are operating in **STRICT TOOL MODE**:
|
||||
|
||||
- If a file must be written, you **MUST** use the `Shell` tool.
|
||||
- You **MUST NOT** read entire directory listings into memory.
|
||||
- You **MUST** work with **one item at a time** using a simple text file as a task queue.
|
||||
|
||||
### Reading files before writing
|
||||
|
||||
- **Before writing to an existing documentation file**, you must first read it using the `Read` tool.
|
||||
- **When documenting a single Go source file**, you only need to read that one file (plus up to 4-5 helper files if needed for context).
|
||||
- Do NOT read the entire codebase - only what is necessary to document the current file.
|
||||
|
||||
---
|
||||
|
||||
### Rules
|
||||
|
||||
- Always write the **full** file (no partial updates).
|
||||
- Paths are relative to the project root; directories are created implicitly.
|
||||
- Content must be valid UTF‑8; use `\n` line endings.
|
||||
- Do **not** wrap content in backticks.
|
||||
|
||||
---
|
||||
|
||||
## Progress tracking: task queue files
|
||||
|
||||
We use **line‑oriented task files** to avoid loading large lists into memory. Each phase has its own task file:
|
||||
|
||||
- `docs/todo/phase1.txt` – list of Go files (one per line) to document.
|
||||
- `docs/todo/phase1bis.txt` – same list, but after phase1 is done.
|
||||
- `docs/todo/phase2.txt` – list of packages.
|
||||
- `docs/todo/phase3.txt` – list of tools.
|
||||
|
||||
**How it works:**
|
||||
|
||||
1. At the start of a phase, if the task file does not exist, it is created by scanning the codebase once (Phase 0 or Phase X init).
|
||||
2. **Each run of the LLM processes only the first line of the task file.**
|
||||
3. After processing the item (success or permanent failure), the line is removed from the task file.
|
||||
- On success, the line is deleted (no extra sentinel file needed).
|
||||
- On transient failure (retry < 3), we keep the line but increment a retry counter stored in a separate file.
|
||||
- On permanent failure (retry ≥ 3), we move the line to a `failed.txt` file and log the error.
|
||||
4. The LLM then exits (or continues if the task file is still non‑empty, but it must never load more than one line).
|
||||
|
||||
This way, the LLM’s context never holds more than a single task at a time.
|
||||
|
||||
### Retry mechanism
|
||||
|
||||
For each item (e.g., `internal/align/align.go`), we maintain a retry counter in:
|
||||
|
||||
- `docs/retry/phase1/internal/align/align.go.count`
|
||||
|
||||
If the file does not exist, retries = 0.
|
||||
Each time processing fails, we increment the counter (write the new number).
|
||||
If after increment the counter < 3, we keep the line in the task file.
|
||||
If counter reaches 3, we **remove the line from the task file**, add it to `docs/failed/phase1/internal/align/align.go.failed` (just a marker), and log the error.
|
||||
|
||||
---
|
||||
|
||||
## Documentation quality requirements (CRITICAL)
|
||||
|
||||
Documentation MUST NOT be superficial. For each documented element (file, function, struct, package):
|
||||
|
||||
### You MUST explain:
|
||||
|
||||
- what it does
|
||||
- why it exists (context, problem solved)
|
||||
- how it is used
|
||||
- assumptions and preconditions
|
||||
- possible edge cases
|
||||
|
||||
### Forbidden patterns
|
||||
|
||||
- Vague phrases like “This function handles…”, “Utility for…”, “Helper function…”.
|
||||
- Generic descriptions that could apply to any project.
|
||||
|
||||
### Required content per element type
|
||||
|
||||
- Functions:
|
||||
- Purpose
|
||||
- Parameter meaning
|
||||
- Return values
|
||||
- Notable behaviour (panic conditions, side effects, concurrency)
|
||||
- Structs:
|
||||
- Role in the system
|
||||
- Meaning of key fields
|
||||
- Files:
|
||||
- Role within the package
|
||||
- Interactions with other files
|
||||
|
||||
### Anti‑generic rule
|
||||
|
||||
If the description could apply to any project, it is INVALID. You MUST include domain‑specific context (bioinformatics, sequence processing, etc.) and concrete behaviour.
|
||||
|
||||
### Quality validation
|
||||
|
||||
Before marking an item as done (i.e., creating the .done sentinel), you MUST perform a self‑validation:
|
||||
|
||||
- Check that all required sections are present.
|
||||
- Verify that no forbidden patterns remain.
|
||||
|
||||
If validation fails, increment the retry counter and keep the item pending.
|
||||
|
||||
|
||||
---
|
||||
|
||||
# Directory structure
|
||||
|
||||
```
|
||||
docs/
|
||||
todo/ # task queues
|
||||
phase1.txt
|
||||
phase1bis.txt
|
||||
phase2.txt
|
||||
phase3.txt
|
||||
retry/ # retry counters
|
||||
phase1/ # mirrors file structure
|
||||
internal/align/align.go.count
|
||||
phase1bis/
|
||||
phase2/
|
||||
phase3/
|
||||
failed/ # permanent failure markers
|
||||
phase1/
|
||||
internal/align/align.go.failed
|
||||
phase1bis/
|
||||
phase2/
|
||||
phase3/
|
||||
phase1/ # actual documentation
|
||||
<relative_path>/<file>.go.md
|
||||
phase2/
|
||||
<package>.md
|
||||
phase3/
|
||||
<tool>.md
|
||||
error.log
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
# Phase 0: Initialization
|
||||
|
||||
1. Ensure required directories exist: `docs/todo`, `docs/retry`, `docs/failed`, `docs/phase1`, `docs/phase2`, `docs/phase3`.
|
||||
2. **If `docs/todo/phase1.txt` does not exist**:
|
||||
- Use `find pkg -name "*.go" ! -name "*_test.go" ! -path "*/cmd/*"` to list all Go files (excluding tests and main.go).
|
||||
- Write the list (one relative path per line, e.g., `internal/align/align.go`) to `docs/todo/phase1.txt`.
|
||||
3. Do the same for phase2 and phase3 later when those phases start.
|
||||
4. **No other state is stored.**
|
||||
|
||||
---
|
||||
|
||||
# Phase 1: File documentation
|
||||
|
||||
**Processing rule:**
|
||||
- Read the **first line** of `docs/todo/phase1.txt` (using `head -n 1`).
|
||||
- If the file is empty, Phase 1 is complete → proceed to Phase 1bis initialization.
|
||||
- Otherwise, process that single file.
|
||||
|
||||
**Processing a file:**
|
||||
|
||||
1. Let `relpath` be the line content (e.g., `internal/align/align.go`).
|
||||
2. Check if a permanent failure marker exists at `docs/failed/phase1/${relpath}.failed`. If yes, remove the line from the task file and skip (line will be deleted).
|
||||
3. If the documentation file `docs/phase1/${relpath}.go.md` exists go directly to its validation (step 6).
|
||||
4. Otherwise, generate documentation for that file (using MCP tools as before).
|
||||
5. Write the documentation to `docs/phase1/${relpath}.go.md`.
|
||||
6. Validate quality.
|
||||
7. If validation succeeds:
|
||||
- Remove the line from the task file.
|
||||
- Remove any retry counter file for this item.
|
||||
- (No sentinel needed; the removal from todo indicates completion.)
|
||||
8. If validation fails:
|
||||
- Increment retry counter:
|
||||
- If `docs/retry/phase1/${relpath}.count` does not exist, set to 1.
|
||||
- Else read it, add 1, write back.
|
||||
- If new counter >= 3:
|
||||
- Remove line from task file.
|
||||
- Create `docs/failed/phase1/${relpath}.failed`.
|
||||
- Log error.
|
||||
- If new counter < 3:
|
||||
- Keep the line in the task file (do nothing, it stays as first line for next run).
|
||||
9. **Exit** (or stop if this was a single run). The next invocation will read the first line again (same if retry, or next if removed).
|
||||
|
||||
**Important:**
|
||||
- Do **not** read more than one line.
|
||||
- Do **not** attempt to process multiple items in one run.
|
||||
- The LLM should finish after handling one item.
|
||||
|
||||
---
|
||||
|
||||
# Phase 1bis: Review and harmonization
|
||||
|
||||
When Phase 1 is complete (i.e., `docs/todo/phase1.txt` empty), we initialize `docs/todo/phase1bis.txt` with the same list of files (the ones that succeeded).
|
||||
But note: we need to know which files were successfully documented. Since we removed lines from `phase1.txt` on success, we need a record. The simplest is to reuse the same list but we can generate it by listing the existing `.go.md` files in `docs/phase1/` (since every successful file has a `.go.md`).
|
||||
Thus, Phase 1bis initialization:
|
||||
|
||||
- If `docs/todo/phase1bis.txt` does not exist, create it by listing all `.go.md` files under `docs/phase1/`, stripping the `docs/phase1/` prefix and the `.go.md` suffix, and writing the relative path (same format as phase1).
|
||||
|
||||
Then processing is identical to Phase 1, but using `docs/todo/phase1bis.txt` and output is overwriting the same `.go.md` files (with improvements). Retry counters go in `docs/retry/phase1bis/`.
|
||||
|
||||
---
|
||||
|
||||
# Phase 2: Package documentation
|
||||
|
||||
When Phase 1bis is complete (`docs/todo/phase1bis.txt` empty), initialize `docs/todo/phase2.txt`:
|
||||
|
||||
- List all packages: unique directories under `pkg/` that contain at least one `.go` file and are not tools.
|
||||
- Write each package identifier (e.g., `align`, `internal/align`) as a line.
|
||||
|
||||
Processing: read first line, generate `docs/phase2/<package>.md`, validate, remove line on success, retry logic in `docs/retry/phase2/`.
|
||||
|
||||
---
|
||||
|
||||
# Phase 3: Tool documentation
|
||||
|
||||
When Phase 2 complete, initialize `docs/todo/phase3.txt`:
|
||||
|
||||
- List all directories under `cmd/` that contain a `main.go`. Write each tool name as a line.
|
||||
|
||||
Processing: read first line, generate `docs/phase3/<tool>.md`, validate, remove line on success, retry logic in `docs/retry/phase3/`.
|
||||
|
||||
---
|
||||
|
||||
# Finalization
|
||||
|
||||
When all task files are empty and no pending phases, generate `docs/README.md` by:
|
||||
|
||||
- Listing all package docs (files in `docs/phase2/`) and linking.
|
||||
- Listing all tool docs (files in `docs/phase3/`) and linking.
|
||||
|
||||
Write using `Shell`.
|
||||
|
||||
---
|
||||
|
||||
# Execution flow summary
|
||||
|
||||
1. **Phase 0**: Create directories and initial `todo/phase1.txt` if missing. Exit.
|
||||
2. **Phase 1**:
|
||||
- If `todo/phase1.txt` exists and non‑empty → process first line.
|
||||
- Else → move to Phase 1bis initialization.
|
||||
3. **Phase 1bis**:
|
||||
- If `todo/phase1bis.txt` does not exist → create from successful phase1 docs.
|
||||
- If non‑empty → process first line.
|
||||
- Else → move to Phase 2 initialization.
|
||||
4. **Phase 2**: similar.
|
||||
5. **Phase 3**: similar.
|
||||
6. **Finalization**: generate README.
|
||||
|
||||
The LLM should be invoked repeatedly (e.g., by a scheduler) until all phases are done. Each invocation processes exactly one item.
|
||||
|
||||
---
|
||||
|
||||
# Important reminders
|
||||
|
||||
- Always call `Shell` to write files; never output content in plain text.
|
||||
- Validate quality before removing a line from the task file.
|
||||
- Log all failures to `docs/error.log` in JSON lines format.
|
||||
- If any MCP tool fails, treat as failure and increment retry counter.
|
||||
- Never read more than one line from a task file in a single run.
|
||||
Executable
+294
@@ -0,0 +1,294 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Generate GitHub-compatible release notes for an OBITools4 version.
|
||||
#
|
||||
# Usage:
|
||||
# ./release_notes.sh # latest version
|
||||
# ./release_notes.sh -v 4.4.15 # specific version
|
||||
# ./release_notes.sh -l # list available versions
|
||||
# ./release_notes.sh -r # raw commit list (no LLM)
|
||||
# ./release_notes.sh -c -v 4.4.16 # show LLM context for a version
|
||||
|
||||
GITHUB_REPO="metabarcoding/obitools4"
|
||||
GITHUB_API="https://api.github.com/repos/${GITHUB_REPO}"
|
||||
VERSION=""
|
||||
LIST_VERSIONS=false
|
||||
RAW_MODE=false
|
||||
CONTEXT_MODE=false
|
||||
LLM_MODEL="ollama:qwen3-coder-next:latest"
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
die() { echo "Error: $*" >&2; exit 1; }
|
||||
|
||||
next_patch() {
|
||||
local v="$1"
|
||||
local major minor patch
|
||||
major=$(echo "$v" | cut -d. -f1)
|
||||
minor=$(echo "$v" | cut -d. -f2)
|
||||
patch=$(echo "$v" | cut -d. -f3)
|
||||
echo "${major}.${minor}.$(( patch + 1 ))"
|
||||
}
|
||||
|
||||
# Strip "pre-" prefix to get the bare version number for installation section
|
||||
bare_version() {
|
||||
echo "$1" | sed 's/^pre-//'
|
||||
}
|
||||
|
||||
installation_section() {
|
||||
local v
|
||||
v=$(bare_version "$1")
|
||||
cat <<INSTALL_EOF
|
||||
|
||||
## Installation
|
||||
|
||||
### Pre-built binaries
|
||||
|
||||
Download the appropriate archive for your system from the
|
||||
[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_${v})
|
||||
and extract it:
|
||||
|
||||
#### Linux (AMD64)
|
||||
\`\`\`bash
|
||||
tar -xzf obitools4_${v}_linux_amd64.tar.gz
|
||||
\`\`\`
|
||||
|
||||
#### Linux (ARM64)
|
||||
\`\`\`bash
|
||||
tar -xzf obitools4_${v}_linux_arm64.tar.gz
|
||||
\`\`\`
|
||||
|
||||
#### macOS (Intel)
|
||||
\`\`\`bash
|
||||
tar -xzf obitools4_${v}_darwin_amd64.tar.gz
|
||||
\`\`\`
|
||||
|
||||
#### macOS (Apple Silicon)
|
||||
\`\`\`bash
|
||||
tar -xzf obitools4_${v}_darwin_arm64.tar.gz
|
||||
\`\`\`
|
||||
|
||||
All OBITools4 binaries are included in each archive.
|
||||
|
||||
### From source
|
||||
|
||||
You can also compile and install OBITools4 directly from source using the
|
||||
installation script:
|
||||
|
||||
\`\`\`bash
|
||||
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version ${v}
|
||||
\`\`\`
|
||||
|
||||
By default binaries are installed in \`/usr/local/bin\`. Use \`--install-dir\` to
|
||||
change the destination and \`--obitools-prefix\` to add a prefix to command names:
|
||||
|
||||
\`\`\`bash
|
||||
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\
|
||||
bash -s -- --version ${v} --install-dir ~/local --obitools-prefix k
|
||||
\`\`\`
|
||||
INSTALL_EOF
|
||||
}
|
||||
|
||||
display_help() {
|
||||
cat <<EOF
|
||||
Usage: $(basename "$0") [OPTIONS]
|
||||
|
||||
Generate GitHub-compatible Markdown release notes for an OBITools4 version.
|
||||
|
||||
Options:
|
||||
-v, --version VERSION Target version (e.g., 4.4.15). Default: latest.
|
||||
-l, --list List all available versions and exit.
|
||||
-r, --raw Output raw commit list without LLM summarization.
|
||||
-c, --context Show the exact context (commits + prompt) sent to the LLM.
|
||||
-m, --model MODEL LLM model for orla (default: $LLM_MODEL).
|
||||
-h, --help Display this help message.
|
||||
|
||||
Examples:
|
||||
$(basename "$0") # release notes for the latest version
|
||||
$(basename "$0") -v 4.4.15 # release notes for a specific version
|
||||
$(basename "$0") -l # list versions
|
||||
$(basename "$0") -r -v 4.4.15 # raw commit log for a version
|
||||
$(basename "$0") -c -v 4.4.16 # show LLM context for a version
|
||||
EOF
|
||||
}
|
||||
|
||||
# Fetch all Release tags from GitHub API (sorted newest first)
|
||||
fetch_versions() {
|
||||
curl -sf "${GITHUB_API}/releases" \
|
||||
| grep '"tag_name":' \
|
||||
| sed -E 's/.*"tag_name": "Release_([0-9.]+)".*/\1/' \
|
||||
| sort -V -r
|
||||
}
|
||||
|
||||
# ── Parse arguments ──────────────────────────────────────────────────────
|
||||
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
-v|--version) VERSION="$2"; shift 2 ;;
|
||||
-l|--list) LIST_VERSIONS=true; shift ;;
|
||||
-r|--raw) RAW_MODE=true; shift ;;
|
||||
-c|--context) CONTEXT_MODE=true; shift ;;
|
||||
-m|--model) LLM_MODEL="$2"; shift 2 ;;
|
||||
-h|--help) display_help; exit 0 ;;
|
||||
*) die "Unsupported option: $1" ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# ── List mode ────────────────────────────────────────────────────────────
|
||||
|
||||
if [ "$LIST_VERSIONS" = true ]; then
|
||||
echo "Available OBITools4 versions:" >&2
|
||||
echo "==============================" >&2
|
||||
fetch_versions
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# ── Resolve versions ─────────────────────────────────────────────────────
|
||||
|
||||
all_versions=$(fetch_versions)
|
||||
[ -z "$all_versions" ] && die "Could not fetch versions from GitHub"
|
||||
|
||||
if [ -z "$VERSION" ]; then
|
||||
# ── Pre-release mode: local HEAD vs latest GitHub tag ──────────────────
|
||||
PRE_RELEASE=true
|
||||
previous_tag="Release_${latest_version}"
|
||||
VERSION="pre-$(next_patch "$latest_version")"
|
||||
|
||||
echo "Pre-release mode: $previous_tag -> HEAD (as $VERSION)" >&2
|
||||
|
||||
# Need to be in a git repo
|
||||
if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
|
||||
die "Not inside a git repository. Pre-release mode requires a local git repo."
|
||||
fi
|
||||
|
||||
# Check that the previous tag exists locally
|
||||
if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then
|
||||
echo "Tag $previous_tag not found locally, fetching..." >&2
|
||||
git fetch --tags 2>/dev/null || true
|
||||
if ! git rev-parse "$previous_tag" >/dev/null 2>&1; then
|
||||
die "Tag $previous_tag not found locally or remotely"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Get local commits from the tag to HEAD (full messages)
|
||||
commit_list=$(git log --format="%h %B" "${previous_tag}..HEAD" 2>/dev/null)
|
||||
|
||||
if [ -z "$commit_list" ]; then
|
||||
die "No local commits found since $previous_tag"
|
||||
fi
|
||||
else
|
||||
# ── Published release mode: between two GitHub tags ────────────────────
|
||||
PRE_RELEASE=false
|
||||
tag_name="Release_${VERSION}"
|
||||
|
||||
# Verify the requested version exists
|
||||
if ! echo "$all_versions" | grep -qx "$VERSION"; then
|
||||
die "Version $VERSION not found. Use -l to list available versions."
|
||||
fi
|
||||
|
||||
# Find the previous version
|
||||
previous_version=$(echo "$all_versions" | grep -A1 -x "$VERSION" | tail -1)
|
||||
|
||||
if [ "$previous_version" = "$VERSION" ] || [ -z "$previous_version" ]; then
|
||||
previous_tag=""
|
||||
echo "No previous version found -- will include all commits for $tag_name" >&2
|
||||
else
|
||||
previous_tag="Release_${previous_version}"
|
||||
echo "Generating notes: $previous_tag -> $tag_name" >&2
|
||||
fi
|
||||
|
||||
# Fetch commit messages between tags via GitHub compare API
|
||||
if [ -n "$previous_tag" ]; then
|
||||
commits_json=$(curl -sf "${GITHUB_API}/compare/${previous_tag}...${tag_name}")
|
||||
if [ -z "$commits_json" ]; then
|
||||
die "Could not fetch commit comparison from GitHub"
|
||||
fi
|
||||
commit_list=$(echo "$commits_json" \
|
||||
| jq -r '.commits[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null)
|
||||
else
|
||||
commits_json=$(curl -sf "${GITHUB_API}/commits?sha=${tag_name}&per_page=50")
|
||||
if [ -z "$commits_json" ]; then
|
||||
die "Could not fetch commits from GitHub"
|
||||
fi
|
||||
commit_list=$(echo "$commits_json" \
|
||||
| jq -r '.[] | (.sha[:8] + " " + .commit.message)' 2>/dev/null)
|
||||
fi
|
||||
|
||||
if [ -z "$commit_list" ]; then
|
||||
die "No commits found between $previous_tag and $tag_name"
|
||||
fi
|
||||
fi
|
||||
|
||||
# ── LLM prompt (shared by context mode and summarization) ────────────────
|
||||
|
||||
LLM_PROMPT="Summarize the following commits into a GitHub release note for version ${VERSION}. \
|
||||
Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping \
|
||||
that is irrelevant to end users. Describe each user-facing change precisely without exposing \
|
||||
code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this \
|
||||
exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}"
|
||||
|
||||
# ── Raw mode: just output the commit list ────────────────────────────────
|
||||
|
||||
if [ "$RAW_MODE" = true ]; then
|
||||
echo "# Release ${VERSION}"
|
||||
echo ""
|
||||
echo "## Commits"
|
||||
echo ""
|
||||
echo "$commit_list" | while IFS= read -r line; do
|
||||
echo "- ${line}"
|
||||
done
|
||||
installation_section "$VERSION"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# ── Context mode: show what would be sent to the LLM ────────────────────
|
||||
|
||||
if [ "$CONTEXT_MODE" = true ]; then
|
||||
echo "=== LLM Model ==="
|
||||
echo "$LLM_MODEL"
|
||||
echo ""
|
||||
echo "=== Prompt ==="
|
||||
echo "$LLM_PROMPT"
|
||||
echo ""
|
||||
echo "=== Stdin (commit list) ==="
|
||||
echo "$commit_list"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# ── LLM summarization ───────────────────────────────────────────────────
|
||||
|
||||
if ! command -v orla >/dev/null 2>&1; then
|
||||
die "orla is required for LLM summarization. Use -r for raw output."
|
||||
fi
|
||||
|
||||
if ! command -v jq >/dev/null 2>&1; then
|
||||
die "jq is required for JSON parsing. Use -r for raw output."
|
||||
fi
|
||||
|
||||
echo "Summarizing with LLM ($LLM_MODEL)..." >&2
|
||||
|
||||
raw_output=$(echo "$commit_list" | \
|
||||
ORLA_MAX_TOOL_CALLS=50 orla agent -m "$LLM_MODEL" \
|
||||
"$LLM_PROMPT" \
|
||||
2>/dev/null) || true
|
||||
|
||||
if [ -z "$raw_output" ]; then
|
||||
echo "Warning: LLM returned empty output, falling back to raw mode" >&2
|
||||
exec "$0" -r -v "$VERSION"
|
||||
fi
|
||||
|
||||
# Sanitize: extract JSON object, strip control characters
|
||||
sanitized=$(echo "$raw_output" | sed -n '/^{/,/^}/p' | tr -d '\000-\011\013-\014\016-\037')
|
||||
|
||||
release_title=$(echo "$sanitized" | jq -r '.title // empty' 2>/dev/null)
|
||||
release_body=$(echo "$sanitized" | jq -r '.body // empty' 2>/dev/null)
|
||||
|
||||
if [ -n "$release_title" ] && [ -n "$release_body" ]; then
|
||||
echo "# ${release_title}"
|
||||
echo ""
|
||||
echo "$release_body"
|
||||
installation_section "$VERSION"
|
||||
else
|
||||
echo "Warning: JSON parsing failed, falling back to raw mode" >&2
|
||||
exec "$0" -r -v "$VERSION"
|
||||
fi
|
||||
@@ -0,0 +1,222 @@
|
||||
//go:build ignore
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Reference records one call site of BioSequence.SetAttribute found by
// scanning the source tree with ripgrep.
type Reference struct {
	File     string `json:"file"`     // base name of the file containing the call
	Line     int    `json:"line"`     // 1-based line number reported by rg
	Column   int    `json:"column"`   // always 0: rg -n does not report columns
	Key      string `json:"key"`      // literal attribute key passed as first argument
	Function string `json:"function"` // enclosing function; "closure" for func literals, "" if undetermined
	Context  string `json:"context"`  // trimmed source line of the call
}

// Result is the JSON document printed on stdout: the method surveyed,
// where it is defined, and every call site found.
type Result struct {
	Method     string      `json:"method"`
	Signature  string      `json:"signature"`
	Definition string      `json:"definition"`
	References []Reference `json:"references"`
	Total      int         `json:"total"` // equals len(References)
}

// basePath is the local checkout scanned by main.
// NOTE(review): hard-coded developer path — this tool only runs as-is on
// that machine; parameterize before sharing.
var basePath = "/Users/coissac/Sync/travail/__MOI__/GO/obitools4"
|
||||
|
||||
func main() {
|
||||
cmd := exec.Command("rg", "-n", `\.SetAttribute\(`, basePath+"/pkg", "--type", "go")
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error running rg: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
lines := strings.Split(string(output), "\n")
|
||||
lineRe := regexp.MustCompile(`^(.+?):(\d+):\s*(.+)$`)
|
||||
keyRe := regexp.MustCompile(`SetAttribute\("([^"]+)"`)
|
||||
templateKeyRe := regexp.MustCompile(`SetAttribute\("([^"]+)[^"]*"\s*,`)
|
||||
|
||||
var refs []Reference
|
||||
seen := make(map[string]bool)
|
||||
|
||||
for _, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
matches := lineRe.FindStringSubmatch(line)
|
||||
if matches == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
file := matches[1]
|
||||
lineNum, _ := strconv.Atoi(matches[2])
|
||||
context := strings.TrimSpace(matches[3])
|
||||
|
||||
// Skip definition
|
||||
if strings.Contains(file, "obiseq/attributes.go") && lineNum == 132 {
|
||||
continue
|
||||
}
|
||||
|
||||
// Extract key
|
||||
var key string
|
||||
if keyMatches := keyRe.FindStringSubmatch(context); keyMatches != nil {
|
||||
key = keyMatches[1]
|
||||
} else if tmplMatches := templateKeyRe.FindStringSubmatch(context); tmplMatches != nil {
|
||||
key = tmplMatches[1]
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
|
||||
// Get function name using treesitter
|
||||
funcName := getFunctionNameTreesitter(file, lineNum)
|
||||
|
||||
uniqueKey := fmt.Sprintf("%s:%d", file, lineNum)
|
||||
if seen[uniqueKey] {
|
||||
continue
|
||||
}
|
||||
seen[uniqueKey] = true
|
||||
|
||||
refs = append(refs, Reference{
|
||||
File: filepath.Base(file),
|
||||
Line: lineNum,
|
||||
Column: 0,
|
||||
Key: key,
|
||||
Function: funcName,
|
||||
Context: context,
|
||||
})
|
||||
}
|
||||
|
||||
sort.Slice(refs, func(i, j int) bool {
|
||||
if refs[i].File != refs[j].File {
|
||||
return refs[i].File < refs[j].File
|
||||
}
|
||||
return refs[i].Line < refs[j].Line
|
||||
})
|
||||
|
||||
result := Result{
|
||||
Method: "SetAttribute",
|
||||
Signature: "func (s *BioSequence) SetAttribute(key string, value interface{})",
|
||||
Definition: basePath + "/pkg/obiseq/attributes.go:132",
|
||||
References: refs,
|
||||
Total: len(refs),
|
||||
}
|
||||
|
||||
outputJSON, err := json.MarshalIndent(result, "", " ")
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error marshaling JSON: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
fmt.Println(string(outputJSON))
|
||||
}
|
||||
|
||||
// getFunctionNameTreesitter uses the treesitter_cursor_walk tool to get the containing function
|
||||
func getFunctionNameTreesitter(file string, targetLine int) string {
|
||||
// Convert to 0-based for treesitter
|
||||
row := targetLine - 1
|
||||
|
||||
// Use treesitter cursor walk to get ancestors
|
||||
cmd := exec.Command("bash", "-c",
|
||||
fmt.Sprintf(`kilo treesitter_cursor_walk --file_path %q --row %d --column 0 --max_depth 10 2>/dev/null`, file, row))
|
||||
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return findContainingFunction(file, targetLine)
|
||||
}
|
||||
|
||||
// Parse the JSON output to find function_declaration or method_declaration
|
||||
var result map[string]interface{}
|
||||
if err := json.Unmarshal(output, &result); err != nil {
|
||||
return findContainingFunction(file, targetLine)
|
||||
}
|
||||
|
||||
// Check ancestors for function declaration
|
||||
if ancestors, ok := result["ancestors"].([]interface{}); ok {
|
||||
for _, a := range ancestors {
|
||||
if anc, ok := a.(map[string]interface{}); ok {
|
||||
nodeType, _ := anc["type"].(string)
|
||||
if nodeType == "function_declaration" || nodeType == "method_declaration" {
|
||||
// Try to get the function name from children
|
||||
if children, ok := anc["children"].([]interface{}); ok {
|
||||
for _, c := range children {
|
||||
if child, ok := c.(map[string]interface{}); ok {
|
||||
childType, _ := child["type"].(string)
|
||||
if childType == "identifier" {
|
||||
if text, ok := child["text"].(string); ok {
|
||||
return text
|
||||
}
|
||||
}
|
||||
if childType == "field_identifier" {
|
||||
if text, ok := child["text"].(string); ok {
|
||||
return text
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if nodeType == "func_literal" {
|
||||
return "closure"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return findContainingFunction(file, targetLine)
|
||||
}
|
||||
|
||||
func findContainingFunction(file string, targetLine int) string {
|
||||
data, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
lines := strings.Split(string(data), "\n")
|
||||
|
||||
for i := targetLine - 1; i >= 0 && i >= targetLine-200; i-- {
|
||||
if i >= len(lines) {
|
||||
continue
|
||||
}
|
||||
line := strings.TrimSpace(lines[i])
|
||||
|
||||
if line == "}" && i > 0 {
|
||||
for j := i - 1; j >= 0 && j >= i-50; j-- {
|
||||
if j >= len(lines) {
|
||||
continue
|
||||
}
|
||||
funcLine := strings.TrimSpace(lines[j])
|
||||
if strings.HasPrefix(funcLine, "func ") {
|
||||
if match := regexp.MustCompile(`func\s+\([^)]+\)\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(`).FindStringSubmatch(funcLine); match != nil {
|
||||
return match[1]
|
||||
}
|
||||
if match := regexp.MustCompile(`func\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(`).FindStringSubmatch(funcLine); match != nil {
|
||||
return match[1]
|
||||
}
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.HasPrefix(line, "func ") {
|
||||
if match := regexp.MustCompile(`func\s+\([^)]+\)\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(`).FindStringSubmatch(line); match != nil {
|
||||
return match[1]
|
||||
}
|
||||
if match := regexp.MustCompile(`func\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(`).FindStringSubmatch(line); match != nil {
|
||||
return match[1]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
Executable
+36
@@ -0,0 +1,36 @@
|
||||
#!/bin/bash
#
# List every literal-key BioSequence.SetAttribute() call in the obitools4
# source tree, one pipe-separated record per line:
#   file|line|key|function|context
# sorted by file then line. The enclosing function is resolved with
# `kilo treesitter_cursor_walk` (func literals report as "closure",
# unresolvable sites as "unknown").

basePath="/Users/coissac/Sync/travail/__MOI__/GO/obitools4"
# NOTE(review): OUTPUT_FILE is accepted but never used — output always goes
# to stdout; either honor it or drop the parameter.
OUTPUT_FILE="${1:-/dev/stdout}"

# rg -n output is <path>:<line>:<content>. Split on the first two colons
# only: `read` assigns the remainder (embedded colons included) to the last
# variable, so source lines containing ':' no longer corrupt the fields.
# (The original suffix-stripping parse — ${line%:*} / ${line##*: } — broke
# line_num and context on any matched line that itself contained a colon.)
rg -n '\.SetAttribute\(' "$basePath/pkg" --type go | while IFS=: read -r file line_num context; do
    # Extract key (only literal strings); skip computed keys.
    key=$(printf '%s\n' "$context" | sed -n 's/.*SetAttribute("\([^"]*\)".*/\1/p')
    [ -z "$key" ] && continue

    # Get the enclosing function name via treesitter (rows are 0-based).
    func=$(kilo treesitter_cursor_walk \
        --file_path "$file" \
        --row "$((line_num - 1))" \
        --column 0 \
        --max_depth 10 2>/dev/null |
        jq -r '.ancestors[] | select(.type == "function_declaration" or .type == "method_declaration") | .children[] | select(.type == "identifier" or .type == "field_identifier") | .text' 2>/dev/null)

    # Fallback: report func literals as "closure".
    if [ -z "$func" ]; then
        func=$(kilo treesitter_cursor_walk \
            --file_path "$file" \
            --row "$((line_num - 1))" \
            --column 0 \
            --max_depth 10 2>/dev/null |
            jq -r '.ancestors[] | select(.type == "func_literal") | "closure"' 2>/dev/null)
    fi

    echo "$(basename "$file")|$line_num|$key|${func:-unknown}|$context"
done | sort -t'|' -k1,1 -k2,2n
|
||||
@@ -0,0 +1,308 @@
|
||||
{
|
||||
"obiannotate": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
],
|
||||
"(git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter.IBioSequence).NumberSequences$1": [
|
||||
"seq_number"
|
||||
]
|
||||
},
|
||||
"obiclean": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obicleandb": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetCount": [
|
||||
"count"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obicomplement": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obiconsensus": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetCount": [
|
||||
"count"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
],
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus.BuildConsensus": [
|
||||
"obiconsensus_kmer_max_occur",
|
||||
"obiconsensus_filtered_graph_size",
|
||||
"obiconsensus_full_graph_size",
|
||||
"obiconsensus_consensus",
|
||||
"obiconsensus_weight",
|
||||
"obiconsensus_seq_length",
|
||||
"obiconsensus_kmer_size"
|
||||
],
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus.MinionClusterDenoise": [
|
||||
"obiconsensus_consensus"
|
||||
],
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus.MinionDenoise$1": [
|
||||
"obiconsensus_consensus",
|
||||
"obiconsensus_weight"
|
||||
]
|
||||
},
|
||||
"obiconvert": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obicount": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obicsv": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obidemerge": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obidistribute": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obigrep": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obijoin": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obikmermatch": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obikmersimcount": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obilandmark": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetCoordinate": [
|
||||
"landmark_coord"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetOBITagGeomRefIndex": [
|
||||
"obitag_geomref_index"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
],
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obilandmark.CLISelectLandmarkSequences": [
|
||||
"landmark_id"
|
||||
]
|
||||
},
|
||||
"obimatrix": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obimicrosat": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obimultiplex": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obipairing": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence)._revcmpMutation": [
|
||||
"pairing_mismatches"
|
||||
],
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign.BuildQualityConsensus": [
|
||||
"pairing_mismatches"
|
||||
]
|
||||
},
|
||||
"obipcr": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obireffamidx": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetOBITagRefIndex": [
|
||||
"obitag_ref_index"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
],
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obirefidx.IndexFamilyDB": [
|
||||
"reffamidx_id"
|
||||
]
|
||||
},
|
||||
"obirefidx": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetOBITagRefIndex": [
|
||||
"obitag_ref_index"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obiscript": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obisplit": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obisummary": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obitag": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetPath": [
|
||||
"taxonomic_path"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
},
|
||||
"obitagpcr": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obingslibrary.NGSLibrary).ExtractMultiBarcode": [
|
||||
"obimultiplex_error",
|
||||
"obimultiplex_amplicon_rank"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence)._revcmpMutation": [
|
||||
"pairing_mismatches"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence)._subseqMutation": [
|
||||
"pairing_mismatches"
|
||||
],
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign.BuildQualityConsensus": [
|
||||
"pairing_mismatches"
|
||||
]
|
||||
},
|
||||
"obitaxonomy": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetPath": [
|
||||
"taxonomic_path"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
],
|
||||
"(git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter.IBioSequence).NumberSequences$1": [
|
||||
"seq_number"
|
||||
]
|
||||
},
|
||||
"obiuniq": {
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetCount": [
|
||||
"count"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetDefinition": [
|
||||
"definition"
|
||||
],
|
||||
"(*git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq.BioSequence).SetTaxid": [
|
||||
"taxid"
|
||||
]
|
||||
}
|
||||
}
|
||||
Executable
+36
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python3
"""
Read potentially malformed JSON from stdin (aichat output), extract title and
body, and print them as plain text: title on first line, blank line, then body.
Exits with 1 on failure (no output).
"""

import sys
import json
import re


def _extract(raw):
    """Return (title, body) parsed from ``raw``, or None on any failure.

    ``raw`` is the full stdin text; the outermost ``{...}`` span is taken
    as the candidate JSON object (aichat often wraps JSON in prose/fences).
    """
    m = re.search(r'\{.*\}', raw, re.DOTALL)
    if not m:
        return None

    s = m.group()
    try:
        obj = json.loads(s)
    except Exception:
        # Best effort: escape raw (unescaped) newlines — a common LLM
        # malformation inside string values — and retry once.
        try:
            obj = json.loads(re.sub(r'(?<!\\)\n', r'\\n', s))
        except Exception:
            return None

    # Guard against non-string values (null, numbers, nested objects):
    # the original crashed with a traceback on `.strip()` instead of
    # failing cleanly as the module docstring promises.
    title = obj.get('title')
    body = obj.get('body')
    if not isinstance(title, str) or not isinstance(body, str):
        return None

    title = title.strip()
    body = body.strip()
    if not title or not body:
        return None
    return title, body


result = _extract(sys.stdin.read())
if result is None:
    sys.exit(1)

print(f"{result[0]}\n\n{result[1]}")
|
||||
+1
-1
@@ -1 +1 @@
|
||||
4.4.12
|
||||
4.4.41
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
```markdown
|
||||
# DNA Scoring and Matching Utilities in `obialign`
|
||||
|
||||
This module provides low-level utilities for computing nucleotide alignment scores using probabilistic and bit-encoded representations.
|
||||
|
||||
- **Bit Encoding**: Nucleotides are encoded in 4-bit groups (e.g., `A=0b0001`, `C=0b0010`, etc.), enabling efficient bitwise comparison.
|
||||
- **`_MatchRatio(a, b)`**: Computes a normalized match ratio between two encoded bytes based on shared bits:
|
||||
`ratio = common_bits / (bits_in_a × bits_in_b)`.
|
||||
- **`_FourBitsCount`**: Precomputed lookup table for Hamming weight (popcount) of 4-bit values.
|
||||
- **Log-space Arithmetic**: Helper functions (`_Logaddexp`, `_Logdiffexp`, `_Log1mexp`) ensure numerical stability in probabilistic computations.
|
||||
- **Phred-scaled Quality Integration**:
|
||||
`_MatchScoreRatio(QF, QR)` derives log-odds match/mismatch scores from Phred quality values (`QF`, `QR`), modeling sequencing error probabilities.
|
||||
- **Precomputed Matrices**:
|
||||
- `_NucPartMatch[i][j]`: Match ratios for all nucleotide pairs (from 4-bit codes).
|
||||
- `_NucScorePartMatchMatch/Mismatch[i][j]`: Integer-scaled match/mismatch scores (×10) for quality pairs `(i, j)` in `[0..99]`.
|
||||
- **Thread-Safe Initialization**: `_InitDNAScoreMatrix()` ensures one-time, synchronized initialization of all scoring tables via a mutex.
|
||||
|
||||
Designed for high-performance alignment kernels where speed and numerical robustness are critical.
|
||||
```
|
||||
Reference in New Issue
Block a user