Compare commits
75 Commits
817b02cbc1
..
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 19660f8cd0 | |||
| 7b07540a69 | |||
| 89c43e28f5 | |||
| b9b2e42ad2 | |||
| ca42fdff2f | |||
| 136cd89efb | |||
| a4bbf607b7 | |||
| 9927100a1c | |||
| 527258f822 | |||
| ef62f1947e | |||
| d02316dcf6 | |||
| c323b3eaef | |||
| b77d8e9ca0 | |||
| 7c5bab3694 | |||
| fab4e0d6de | |||
| 973a3f3d6e | |||
| 1a839a295a | |||
| 2ea58703c7 | |||
| ac3ef106e7 | |||
| 469e53b6f5 | |||
| 9f1df96ea7 | |||
| 4e4cce2879 | |||
| 68b05b93c4 | |||
| 0a668cf8a6 | |||
| e6d6942e2f | |||
| bf9c9aeacb | |||
| 22a65857a1 | |||
| d16a867640 | |||
| 616050075f | |||
| e22afe9621 | |||
| bdfac71e65 | |||
| a00bb37478 | |||
| d30a4efd9b | |||
| 6baf2e64ca | |||
| c0a71a2d49 | |||
| a609c1af95 | |||
| 3d32be8a83 | |||
| c4c71dc892 | |||
| 4e625afaba | |||
| a522c0907e | |||
| c1d6f277ce | |||
| 9356be4ec0 | |||
| c694e1f2b0 | |||
| 280ca1f5a3 | |||
| 9abb2db92f | |||
| 7c1efa9cbb | |||
| 4c4524766c | |||
| 7eea71fdcd | |||
| f91c5a3f79 | |||
| fb4962c4fe | |||
| 1d38d87ff9 | |||
| 93559c3294 | |||
| 1f0d77d5bf | |||
| eeba43ac4f | |||
| 7ed7b26039 | |||
| 26de90f18d | |||
| 497d250d8a | |||
| aa98e82875 | |||
| 5ff5b04d2d | |||
| df7b400fda | |||
| d1717688d2 | |||
| cde6457eea | |||
| b6fcbc545f | |||
| 9578f991f4 | |||
| 1cd7916e06 | |||
| bc92dc4592 | |||
| a9567ad023 | |||
| 4a64718fd1 | |||
| 7a87e911b6 | |||
| 313d73838a | |||
| 175ea5bbd0 | |||
| c6ea0c53e3 | |||
| ea767376bd | |||
| f1d76f3203 | |||
| c4071eb450 |
@@ -0,0 +1,35 @@
|
|||||||
|
name: CI
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
branches: ['main']
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
working-directory: src
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install Rust
|
||||||
|
run: |
|
||||||
|
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
|
||||||
|
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
|
||||||
|
|
||||||
|
- name: Cache cargo registry
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cargo/registry
|
||||||
|
~/.cargo/git
|
||||||
|
src/target
|
||||||
|
key: ${{ runner.os }}-cargo-${{ hashFiles('src/Cargo.lock') }}
|
||||||
|
restore-keys: ${{ runner.os }}-cargo-
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
run: cargo build --release
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
run: cargo test --release
|
||||||
@@ -0,0 +1,123 @@
|
|||||||
|
name: Release
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- "v*"
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
create-release:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
outputs:
|
||||||
|
release_id: ${{ steps.create.outputs.release_id }}
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: Create Gitea release
|
||||||
|
id: create
|
||||||
|
env:
|
||||||
|
GITEA_TOKEN: ${{ secrets.GITEATOKEN }}
|
||||||
|
TAG: ${{ github.ref_name }}
|
||||||
|
run: |
|
||||||
|
sudo apt-get update -qq && sudo apt-get install -y -qq jq
|
||||||
|
body=$(git for-each-ref --format='%(contents)' "refs/tags/$TAG")
|
||||||
|
release_id=$(curl -s -X POST \
|
||||||
|
"${{ github.server_url }}/api/v1/repos/${{ github.repository }}/releases" \
|
||||||
|
-H "Authorization: token $GITEA_TOKEN" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d "{\"tag_name\":\"$TAG\",\"name\":\"$TAG\",\"body\":$(echo "$body" | jq -Rs .)}" | jq -r '.id')
|
||||||
|
echo "release_id=$release_id" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
|
build-linux-x86_64:
|
||||||
|
needs: create-release
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
working-directory: src
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install Rust + zigbuild
|
||||||
|
run: |
|
||||||
|
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
|
||||||
|
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
|
||||||
|
sudo apt-get update -qq && sudo apt-get install -y -qq jq
|
||||||
|
pip install ziglang --quiet --break-system-packages
|
||||||
|
$HOME/.cargo/bin/cargo install cargo-zigbuild
|
||||||
|
$HOME/.cargo/bin/rustup target add x86_64-unknown-linux-musl
|
||||||
|
|
||||||
|
- name: Create musl C/C++ wrappers
|
||||||
|
run: |
|
||||||
|
ZIG=$(python3 -c "import ziglang, os; print(os.path.join(os.path.dirname(ziglang.__file__), 'zig'))")
|
||||||
|
printf '#!/bin/sh\nexec "%s" cc -target x86_64-linux-musl "$@"\n' "$ZIG" | sudo tee /usr/local/bin/x86_64-linux-musl-gcc > /dev/null
|
||||||
|
printf '#!/bin/sh\nexec "%s" c++ -target x86_64-linux-musl "$@"\n' "$ZIG" | sudo tee /usr/local/bin/x86_64-linux-musl-g++ > /dev/null
|
||||||
|
sudo chmod +x /usr/local/bin/x86_64-linux-musl-gcc /usr/local/bin/x86_64-linux-musl-g++
|
||||||
|
|
||||||
|
- name: Cache cargo registry
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cargo/registry
|
||||||
|
~/.cargo/git
|
||||||
|
src/target
|
||||||
|
key: linux-musl-cargo-${{ hashFiles('src/Cargo.lock') }}
|
||||||
|
restore-keys: linux-musl-cargo-
|
||||||
|
|
||||||
|
- name: Build static binary
|
||||||
|
env:
|
||||||
|
PKG_CONFIG_ALLOW_CROSS: "1"
|
||||||
|
run: cargo zigbuild --release --target x86_64-unknown-linux-musl
|
||||||
|
|
||||||
|
- name: Prepare and upload artifact
|
||||||
|
env:
|
||||||
|
GITEA_TOKEN: ${{ secrets.GITEATOKEN }}
|
||||||
|
RELEASE_ID: ${{ needs.create-release.outputs.release_id }}
|
||||||
|
run: |
|
||||||
|
mkdir -p /tmp/dist
|
||||||
|
cp target/x86_64-unknown-linux-musl/release/obikmer /tmp/dist/obikmer-linux-x86_64
|
||||||
|
strip /tmp/dist/obikmer-linux-x86_64
|
||||||
|
curl -s -X POST \
|
||||||
|
"${{ github.server_url }}/api/v1/repos/${{ github.repository }}/releases/$RELEASE_ID/assets" \
|
||||||
|
-H "Authorization: token $GITEA_TOKEN" \
|
||||||
|
-F "attachment=@/tmp/dist/obikmer-linux-x86_64"
|
||||||
|
|
||||||
|
build-macos-arm64:
|
||||||
|
needs: create-release
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
container:
|
||||||
|
image: registry.metabarcoding.org/cibuilder/rustcrossosx:latest
|
||||||
|
credentials:
|
||||||
|
username: ${{ github.actor }}
|
||||||
|
password: ${{ secrets.REGISTRYTOKEN }}
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
working-directory: src
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Cache cargo registry
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cargo/registry
|
||||||
|
~/.cargo/git
|
||||||
|
src/target
|
||||||
|
key: macos-arm64-cargo-${{ hashFiles('src/Cargo.lock') }}
|
||||||
|
restore-keys: macos-arm64-cargo-
|
||||||
|
|
||||||
|
- name: Build macOS binary
|
||||||
|
run: cargo build --release --target aarch64-apple-darwin --no-default-features
|
||||||
|
|
||||||
|
- name: Prepare and upload artifact
|
||||||
|
env:
|
||||||
|
GITEA_TOKEN: ${{ secrets.GITEATOKEN }}
|
||||||
|
RELEASE_ID: ${{ needs.create-release.outputs.release_id }}
|
||||||
|
run: |
|
||||||
|
mkdir -p /tmp/dist
|
||||||
|
cp target/aarch64-apple-darwin/release/obikmer /tmp/dist/obikmer-macos-arm64
|
||||||
|
curl -s -X POST \
|
||||||
|
"${{ github.server_url }}/api/v1/repos/${{ github.repository }}/releases/$RELEASE_ID/assets" \
|
||||||
|
-H "Authorization: token $GITEA_TOKEN" \
|
||||||
|
-F "attachment=@/tmp/dist/obikmer-macos-arm64"
|
||||||
+12
@@ -9,3 +9,15 @@ data-stress
|
|||||||
./**/*.json
|
./**/*.json
|
||||||
*.bin
|
*.bin
|
||||||
Betula_exilis--IGA-24-33
|
Betula_exilis--IGA-24-33
|
||||||
|
benchmark/genomes
|
||||||
|
benchmark/simulated_data
|
||||||
|
benchmark/specimen_index_presence
|
||||||
|
benchmark/specimen_index_count
|
||||||
|
benchmark/global_index_presence
|
||||||
|
benchmark/global_index_count
|
||||||
|
benchmark/stats
|
||||||
|
benchmark/reference_index
|
||||||
|
benchmark/reference_dist
|
||||||
|
benchmark/obikmer_dist
|
||||||
|
benchmark/specific_index_count
|
||||||
|
benchmark/specific_index_presence
|
||||||
|
|||||||
@@ -0,0 +1,2 @@
|
|||||||
|
/cache
|
||||||
|
/project.local.yml
|
||||||
@@ -0,0 +1,133 @@
|
|||||||
|
# the name by which the project can be referenced within Serena
|
||||||
|
project_name: "obikmer"
|
||||||
|
|
||||||
|
|
||||||
|
# list of languages for which language servers are started; choose from:
|
||||||
|
# al angular ansible bash clojure
|
||||||
|
# cpp cpp_ccls crystal csharp csharp_omnisharp
|
||||||
|
# dart elixir elm erlang fortran
|
||||||
|
# fsharp go groovy haskell haxe
|
||||||
|
# hlsl html java json julia
|
||||||
|
# kotlin lean4 lua luau markdown
|
||||||
|
# matlab msl nix ocaml pascal
|
||||||
|
# perl php php_phpactor powershell python
|
||||||
|
# python_jedi python_ty r rego ruby
|
||||||
|
# ruby_solargraph rust scala scss solidity
|
||||||
|
# svelte swift systemverilog terraform toml
|
||||||
|
# typescript typescript_vts vue yaml zig
|
||||||
|
# (This list may be outdated. For the current list, see values of Language enum here:
|
||||||
|
# https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
|
||||||
|
# For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
|
||||||
|
# Note:
|
||||||
|
# - For C, use cpp
|
||||||
|
# - For JavaScript, use typescript
|
||||||
|
# - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
|
||||||
|
# - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
|
||||||
|
# - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
|
||||||
|
# - For Free Pascal/Lazarus, use pascal
|
||||||
|
# Special requirements:
|
||||||
|
# Some languages require additional setup/installations.
|
||||||
|
# See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
|
||||||
|
# When using multiple languages, the first language server that supports a given file will be used for that file.
|
||||||
|
# The first language is the default language and the respective language server will be used as a fallback.
|
||||||
|
# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
|
||||||
|
languages:
|
||||||
|
- rust
|
||||||
|
|
||||||
|
# the encoding used by text files in the project
|
||||||
|
# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
|
||||||
|
encoding: "utf-8"
|
||||||
|
|
||||||
|
# line ending convention to use when writing source files.
|
||||||
|
# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
|
||||||
|
# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
|
||||||
|
line_ending:
|
||||||
|
|
||||||
|
# The language backend to use for this project.
|
||||||
|
# If not set, the global setting from serena_config.yml is used.
|
||||||
|
# Valid values: LSP, JetBrains
|
||||||
|
# Note: the backend is fixed at startup. If a project with a different backend
|
||||||
|
# is activated post-init, an error will be returned.
|
||||||
|
language_backend:
|
||||||
|
|
||||||
|
# whether to use project's .gitignore files to ignore files
|
||||||
|
ignore_all_files_in_gitignore: true
|
||||||
|
|
||||||
|
# advanced configuration option allowing to configure language server-specific options.
|
||||||
|
# Maps the language key to the options.
|
||||||
|
# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
|
||||||
|
# No documentation on options means no options are available.
|
||||||
|
ls_specific_settings: {}
|
||||||
|
|
||||||
|
# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
|
||||||
|
# Paths can be absolute or relative to the project root.
|
||||||
|
# Each folder is registered as an LSP workspace folder, enabling language servers to discover
|
||||||
|
# symbols and references across package boundaries.
|
||||||
|
# Currently supported for: TypeScript.
|
||||||
|
# Example:
|
||||||
|
# additional_workspace_folders:
|
||||||
|
# - ../sibling-package
|
||||||
|
# - ../shared-lib
|
||||||
|
additional_workspace_folders: []
|
||||||
|
|
||||||
|
# list of additional paths to ignore in this project.
|
||||||
|
# Same syntax as gitignore, so you can use * and **.
|
||||||
|
# Note: global ignored_paths from serena_config.yml are also applied additively.
|
||||||
|
ignored_paths: []
|
||||||
|
|
||||||
|
# whether the project is in read-only mode
|
||||||
|
# If set to true, all editing tools will be disabled and attempts to use them will result in an error
|
||||||
|
# Added on 2025-04-18
|
||||||
|
read_only: false
|
||||||
|
|
||||||
|
# list of tool names to exclude.
|
||||||
|
# This extends the existing exclusions (e.g. from the global configuration)
|
||||||
|
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||||
|
excluded_tools: []
|
||||||
|
|
||||||
|
# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
|
||||||
|
# This extends the existing inclusions (e.g. from the global configuration).
|
||||||
|
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||||
|
included_optional_tools: []
|
||||||
|
|
||||||
|
# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
|
||||||
|
# This cannot be combined with non-empty excluded_tools or included_optional_tools.
|
||||||
|
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||||
|
fixed_tools: []
|
||||||
|
|
||||||
|
# list of mode names that are to be activated by default, overriding the setting in the global configuration.
|
||||||
|
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
|
||||||
|
# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
|
||||||
|
# Otherwise, this overrides the setting from the global configuration (serena_config.yml).
|
||||||
|
# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
|
||||||
|
# for this project.
|
||||||
|
# This setting can, in turn, be overridden by CLI parameters (--mode).
|
||||||
|
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
|
||||||
|
default_modes:
|
||||||
|
|
||||||
|
# list of mode names to be activated additionally for this project, e.g. ["query-projects"]
|
||||||
|
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
|
||||||
|
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
|
||||||
|
added_modes:
|
||||||
|
|
||||||
|
# initial prompt for the project. It will always be given to the LLM upon activating the project
|
||||||
|
# (contrary to the memories, which are loaded on demand).
|
||||||
|
initial_prompt: ""
|
||||||
|
|
||||||
|
# time budget (seconds) per tool call for the retrieval of additional symbol information
|
||||||
|
# such as docstrings or parameter information.
|
||||||
|
# This overrides the corresponding setting in the global configuration; see the documentation there.
|
||||||
|
# If null or missing, use the setting from the global configuration.
|
||||||
|
symbol_info_budget:
|
||||||
|
|
||||||
|
# list of regex patterns which, when matched, mark a memory entry as read‑only.
|
||||||
|
# Extends the list from the global configuration, merging the two lists.
|
||||||
|
read_only_memory_patterns: []
|
||||||
|
|
||||||
|
# list of regex patterns for memories to completely ignore.
|
||||||
|
# Matching memories will not appear in list_memories or activate_project output
|
||||||
|
# and cannot be accessed via read_memory or write_memory.
|
||||||
|
# To access ignored memory files, use the read_file tool on the raw file path.
|
||||||
|
# Extends the list from the global configuration, merging the two lists.
|
||||||
|
# Example: ["_archive/.*", "_episodes/.*"]
|
||||||
|
ignored_memory_patterns: []
|
||||||
@@ -73,3 +73,29 @@ Lors de l'ajout de nouveaux fichiers Markdown dans `docmd/`, mettre à jour la s
|
|||||||
---
|
---
|
||||||
|
|
||||||
Je continue à poser mes questions et à guider la discussion.
|
Je continue à poser mes questions et à guider la discussion.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## MCP Tools
|
||||||
|
|
||||||
|
**Règle absolue : avant tout travail de code, appeler `mcp__serena__initial_instructions` pour charger les instructions Serena.**
|
||||||
|
|
||||||
|
### Hiérarchie des outils pour ce projet Rust
|
||||||
|
|
||||||
|
**Navigation et édition de code → serena en priorité**
|
||||||
|
- Trouver un symbole, une déclaration, les implémentations d'un trait : `mcp__serena__find_symbol`, `mcp__serena__find_declaration`, `mcp__serena__find_implementations`
|
||||||
|
- Trouver les usages d'un symbole : `mcp__serena__find_referencing_symbols`
|
||||||
|
- Diagnostics LSP (erreurs de compilation) : `mcp__serena__get_diagnostics_for_file`
|
||||||
|
- Vue d'ensemble d'un fichier : `mcp__serena__get_symbols_overview`
|
||||||
|
- Modifier le corps d'une fonction/impl : `mcp__serena__replace_symbol_body`
|
||||||
|
- Ne pas utiliser `cclsp` quand serena couvre le besoin
|
||||||
|
|
||||||
|
**Analyse architecturale → jcodemunch**
|
||||||
|
- Hotspots, couplage, dead code, dépendances entre modules
|
||||||
|
- Utiliser avant de refactorer une zone critique
|
||||||
|
|
||||||
|
**Raisonnement complexe → sequential-thinking**
|
||||||
|
- Décisions d'architecture, choix d'algorithme, trade-offs non triviaux
|
||||||
|
|
||||||
|
**Documentation de crates → context7**
|
||||||
|
- Toujours consulter avant d'utiliser une API de bibliothèque externe
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ $(MKDOCS): $(VENV)/bin/activate
|
|||||||
mkdocs mkdocs-material \
|
mkdocs mkdocs-material \
|
||||||
mkdocs-mermaid2-plugin \
|
mkdocs-mermaid2-plugin \
|
||||||
mkdocs-bibtex
|
mkdocs-bibtex
|
||||||
|
$(PIP) install --quiet --upgrade InSilicoSeq
|
||||||
|
|
||||||
# ── obikmer binary ───────────────────────────────────────────────────────────
|
# ── obikmer binary ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -62,3 +63,36 @@ clean-doc:
|
|||||||
.PHONY: clean
|
.PHONY: clean
|
||||||
clean: clean-doc
|
clean: clean-doc
|
||||||
rm -rf $(VENV)
|
rm -rf $(VENV)
|
||||||
|
|
||||||
|
# ── release ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
CARGO_TOML := $(CARGO_DIR)/obikmer/Cargo.toml
|
||||||
|
|
||||||
|
.PHONY: bump-version
|
||||||
|
bump-version:
|
||||||
|
@current=$$(grep '^version = ' $(CARGO_TOML) | head -n 1 | sed 's/version = "\(.*\)"/\1/'); \
|
||||||
|
if [ -n "$(RELEASE)" ]; then \
|
||||||
|
new_version="$(RELEASE)"; \
|
||||||
|
else \
|
||||||
|
major=$$(echo $$current | cut -d. -f1); \
|
||||||
|
minor=$$(echo $$current | cut -d. -f2); \
|
||||||
|
patch=$$(echo $$current | cut -d. -f3); \
|
||||||
|
new_patch=$$((patch + 1)); \
|
||||||
|
new_version="$$major.$$minor.$$new_patch"; \
|
||||||
|
fi; \
|
||||||
|
echo "Version: $$current -> $$new_version"; \
|
||||||
|
sed -i.bak "s/^version = \"$$current\"/version = \"$$new_version\"/" $(CARGO_TOML) && \
|
||||||
|
rm $(CARGO_TOML).bak
|
||||||
|
|
||||||
|
.PHONY: release
|
||||||
|
release: bump-version
|
||||||
|
@jj auto-describe
|
||||||
|
@jj git push --change @
|
||||||
|
@new_version=$$(grep '^version = ' $(CARGO_TOML) | head -n 1 | sed 's/version = "\(.*\)"/\1/'); \
|
||||||
|
git_hash=$$(jj log -r @ --no-graph -T 'commit_id'); \
|
||||||
|
commits=$$(jj log -r 'latest(tags())..@' --no-graph -T 'description ++ "\n"' 2>/dev/null || \
|
||||||
|
jj log --no-graph -T 'description ++ "\n"' --limit 30); \
|
||||||
|
notes=$$(printf 'Write concise markdown release notes for obikmer (a Rust kmer genomics tool). Be technical and direct. Base them strictly on these commit messages:\n\n%s' "$$commits" | aichat 2>/dev/null); \
|
||||||
|
tag_msg="$${notes:-Release v$$new_version}"; \
|
||||||
|
git tag -a "v$$new_version" -m "$$tag_msg" "$$git_hash" && \
|
||||||
|
git push origin "v$$new_version"
|
||||||
|
|||||||
@@ -0,0 +1,230 @@
|
|||||||
|
# Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS
|
||||||
|
BINARY := ../src/target/release/obikmer
|
||||||
|
VENV_PY := ../.venv/bin/python3
|
||||||
|
|
||||||
|
GENOMES := $(wildcard genomes/*.fna.gz)
|
||||||
|
|
||||||
|
# SPECIMENS, SPECIES, and the full dependency graph are generated by
|
||||||
|
# make_deps.py from the genome FASTA headers — like .d files in C.
|
||||||
|
# Make rebuilds deps.mk whenever genomes/ changes and restarts.
|
||||||
|
-include deps.mk
|
||||||
|
|
||||||
|
REF_NPZS := $(SPECIMENS:%=reference_index/%.npz)
|
||||||
|
REF_DIST_CSVS := $(addprefix reference_dist/, \
|
||||||
|
shared_kmers.csv hamming_dist.csv jaccard_dist.csv \
|
||||||
|
bray_curtis_dist.csv relfreq_bray_curtis_dist.csv \
|
||||||
|
euclidean_dist.csv relfreq_euclidean_dist.csv \
|
||||||
|
hellinger_dist.csv hellinger_euclidean_dist.csv)
|
||||||
|
OBIKMER_PRESENCE_DIST := $(addprefix obikmer_dist/presence/, \
|
||||||
|
jaccard_dist.csv jaccard_shared.csv jaccard_nj.nwk \
|
||||||
|
hamming_dist.csv hamming_nj.nwk)
|
||||||
|
OBIKMER_COUNT_DIST := $(addprefix obikmer_dist/count/, \
|
||||||
|
jaccard_dist.csv jaccard_shared.csv jaccard_nj.nwk \
|
||||||
|
bray_curtis_dist.csv bray_curtis_nj.nwk \
|
||||||
|
relfreq_bray_curtis_dist.csv relfreq_bray_curtis_nj.nwk \
|
||||||
|
euclidean_dist.csv euclidean_nj.nwk \
|
||||||
|
relfreq_euclidean_dist.csv relfreq_euclidean_nj.nwk \
|
||||||
|
hellinger_dist.csv hellinger_nj.nwk \
|
||||||
|
hellinger_euclidean_dist.csv hellinger_euclidean_nj.nwk)
|
||||||
|
DIST_COMPARISON := stats/dist_comparison/summary.csv
|
||||||
|
PRESENCE_DONE := $(SPECIMENS:%=specimen_index_presence/%/index.done)
|
||||||
|
PRESENCE_STATS := $(SPECIMENS:%=stats/indexing_presence/%.stats)
|
||||||
|
COUNT_DONE := $(SPECIMENS:%=specimen_index_count/%/index.done)
|
||||||
|
COUNT_STATS := $(SPECIMENS:%=stats/indexing_count/%.stats)
|
||||||
|
VERIFY_PRESENCE_STATS := $(SPECIMENS:%=stats/verify_presence/%.stats)
|
||||||
|
VERIFY_COUNT_STATS := $(SPECIMENS:%=stats/verify_count/%.stats)
|
||||||
|
SPECIFIC_PRESENCE_DONE := $(SPECIES:%=specific_index_presence/%/index.done)
|
||||||
|
SPECIFIC_PRESENCE_STATS := $(SPECIES:%=stats/specific_kmer_presence/%.stats)
|
||||||
|
SPECIFIC_COUNT_DONE := $(SPECIES:%=specific_index_count/%/index.done)
|
||||||
|
SPECIFIC_COUNT_STATS := $(SPECIES:%=stats/specific_kmer_count/%.stats)
|
||||||
|
SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/reads_R1.fastq.gz)
|
||||||
|
|
||||||
|
.NOTPARALLEL:
|
||||||
|
|
||||||
|
.PHONY: all simulate reference reference_dist \
|
||||||
|
obikmer_dist obikmer_dist_presence obikmer_dist_count \
|
||||||
|
dist_comparison \
|
||||||
|
index_presence index_count \
|
||||||
|
aggregate_index_presence aggregate_index_count \
|
||||||
|
merge_presence merge_count \
|
||||||
|
verify_presence verify_count \
|
||||||
|
aggregate_verify_presence aggregate_verify_count \
|
||||||
|
verify_merge_presence verify_merge_count \
|
||||||
|
filter_presence filter_count \
|
||||||
|
aggregate_filter_presence aggregate_filter_count
|
||||||
|
|
||||||
|
verify_merge_presence: stats/verify_merge_presence/current.csv
|
||||||
|
verify_merge_count: stats/verify_merge_count/current.csv
|
||||||
|
|
||||||
|
all: aggregate_verify_presence aggregate_verify_count \
|
||||||
|
verify_merge_presence verify_merge_count \
|
||||||
|
aggregate_filter_presence aggregate_filter_count \
|
||||||
|
dist_comparison
|
||||||
|
|
||||||
|
# ── dependency file ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
deps.mk: $(GENOMES)
|
||||||
|
$(VENV_PY) make_deps.py $^ > $@
|
||||||
|
|
||||||
|
# ── simulation ────────────────────────────────────────────────────────────────
|
||||||
|
# Prerequisites (genome → reads) are in deps.mk; $< is the genome file.
|
||||||
|
|
||||||
|
$(SIMULATED_READS):
|
||||||
|
bash simulate_one.sh $< $(dir $@)
|
||||||
|
|
||||||
|
simulate: $(SIMULATED_READS)
|
||||||
|
|
||||||
|
# ── reference kmer sets ───────────────────────────────────────────────────────
|
||||||
|
# Prerequisites (reads → npz) are in deps.mk.
|
||||||
|
|
||||||
|
reference_index/%.npz:
|
||||||
|
bash build_reference.sh $*
|
||||||
|
|
||||||
|
reference: $(REF_NPZS)
|
||||||
|
|
||||||
|
# ── reference distance matrices ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
$(REF_DIST_CSVS) &: $(REF_NPZS) build_reference_dist.py
|
||||||
|
$(VENV_PY) build_reference_dist.py
|
||||||
|
|
||||||
|
reference_dist: $(REF_DIST_CSVS)
|
||||||
|
|
||||||
|
# ── obikmer distance (presence index) ────────────────────────────────────────
|
||||||
|
|
||||||
|
$(OBIKMER_PRESENCE_DIST) &: global_index_presence/index.done $(BINARY)
|
||||||
|
mkdir -p obikmer_dist/presence
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/presence/jaccard \
|
||||||
|
--metric jaccard --shared-kmers --nj \
|
||||||
|
global_index_presence
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/presence/hamming \
|
||||||
|
--metric hamming --nj \
|
||||||
|
global_index_presence
|
||||||
|
|
||||||
|
obikmer_dist_presence: $(OBIKMER_PRESENCE_DIST)
|
||||||
|
|
||||||
|
# ── obikmer distance (count index) ───────────────────────────────────────────
|
||||||
|
|
||||||
|
$(OBIKMER_COUNT_DIST) &: global_index_count/index.done $(BINARY)
|
||||||
|
mkdir -p obikmer_dist/count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/jaccard \
|
||||||
|
--metric jaccard --shared-kmers --nj \
|
||||||
|
global_index_count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/bray_curtis \
|
||||||
|
--metric bray-curtis --nj \
|
||||||
|
global_index_count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/relfreq_bray_curtis \
|
||||||
|
--metric relfreq-bray-curtis --nj \
|
||||||
|
global_index_count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/euclidean \
|
||||||
|
--metric euclidean --nj \
|
||||||
|
global_index_count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/relfreq_euclidean \
|
||||||
|
--metric relfreq-euclidean --nj \
|
||||||
|
global_index_count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/hellinger \
|
||||||
|
--metric hellinger --nj \
|
||||||
|
global_index_count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/hellinger_euclidean \
|
||||||
|
--metric hellinger-euclidean --nj \
|
||||||
|
global_index_count
|
||||||
|
|
||||||
|
obikmer_dist_count: $(OBIKMER_COUNT_DIST)
|
||||||
|
|
||||||
|
obikmer_dist: obikmer_dist_presence obikmer_dist_count
|
||||||
|
|
||||||
|
# ── distance comparison ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
$(DIST_COMPARISON): $(REF_DIST_CSVS) $(OBIKMER_PRESENCE_DIST) $(OBIKMER_COUNT_DIST) compare_all_dist.py
|
||||||
|
$(VENV_PY) compare_all_dist.py --out $(DIST_COMPARISON)
|
||||||
|
|
||||||
|
dist_comparison: $(DIST_COMPARISON)
|
||||||
|
|
||||||
|
# ── per-specimen indexing ─────────────────────────────────────────────────────
|
||||||
|
# Prerequisites (reads → index.done + .stats) are in deps.mk.
|
||||||
|
|
||||||
|
specimen_index_presence/%/index.done \
|
||||||
|
stats/indexing_presence/%.stats &: $(BINARY)
|
||||||
|
bash index_one_presence.sh $*
|
||||||
|
|
||||||
|
specimen_index_count/%/index.done \
|
||||||
|
stats/indexing_count/%.stats &: $(BINARY)
|
||||||
|
bash index_one_count.sh $*
|
||||||
|
|
||||||
|
index_presence: $(PRESENCE_DONE)
|
||||||
|
index_count: $(COUNT_DONE)
|
||||||
|
|
||||||
|
# ── indexing stats aggregation ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
aggregate_index_presence: $(PRESENCE_STATS)
|
||||||
|
bash aggregate_stats.sh indexing_presence
|
||||||
|
|
||||||
|
aggregate_index_count: $(COUNT_STATS)
|
||||||
|
bash aggregate_stats.sh indexing_count
|
||||||
|
|
||||||
|
# ── global merge ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY)
|
||||||
|
bash merge_presence.sh
|
||||||
|
|
||||||
|
global_index_count/index.done: $(COUNT_DONE) $(BINARY)
|
||||||
|
bash merge_count.sh
|
||||||
|
|
||||||
|
merge_presence: global_index_presence/index.done
|
||||||
|
merge_count: global_index_count/index.done
|
||||||
|
|
||||||
|
# ── per-specimen verification ─────────────────────────────────────────────────
|
||||||
|
# Prerequisites (index.done + npz → .stats) are in deps.mk.
|
||||||
|
|
||||||
|
stats/verify_presence/%.stats:
|
||||||
|
bash verify_one_presence.sh $*
|
||||||
|
|
||||||
|
stats/verify_count/%.stats:
|
||||||
|
bash verify_one_count.sh $*
|
||||||
|
|
||||||
|
verify_presence: $(VERIFY_PRESENCE_STATS)
|
||||||
|
verify_count: $(VERIFY_COUNT_STATS)
|
||||||
|
|
||||||
|
# ── verification stats aggregation ───────────────────────────────────────────
|
||||||
|
|
||||||
|
aggregate_verify_presence: $(VERIFY_PRESENCE_STATS)
|
||||||
|
bash aggregate_stats.sh verify_presence
|
||||||
|
|
||||||
|
aggregate_verify_count: $(VERIFY_COUNT_STATS)
|
||||||
|
bash aggregate_stats.sh verify_count
|
||||||
|
|
||||||
|
# ── species-specific indexes ──────────────────────────────────────────────────
|
||||||
|
# Prerequisites (global index → specific index) are in deps.mk.
|
||||||
|
|
||||||
|
specific_index_presence/%/index.done \
|
||||||
|
stats/specific_kmer_presence/%.stats &: $(BINARY)
|
||||||
|
bash filter_one_presence.sh $*
|
||||||
|
|
||||||
|
specific_index_count/%/index.done \
|
||||||
|
stats/specific_kmer_count/%.stats &: $(BINARY)
|
||||||
|
bash filter_one_count.sh $*
|
||||||
|
|
||||||
|
filter_presence: $(SPECIFIC_PRESENCE_DONE)
|
||||||
|
filter_count: $(SPECIFIC_COUNT_DONE)
|
||||||
|
|
||||||
|
aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS)
|
||||||
|
bash aggregate_stats.sh specific_kmer_presence
|
||||||
|
|
||||||
|
aggregate_filter_count: $(SPECIFIC_COUNT_STATS)
|
||||||
|
bash aggregate_stats.sh specific_kmer_count
|
||||||
|
|
||||||
|
# ── merged index verification ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done
|
||||||
|
bash verify_merge_presence.sh
|
||||||
|
|
||||||
|
stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done
|
||||||
|
bash verify_merge_count.sh
|
||||||
@@ -0,0 +1,132 @@
|
|||||||
|
# Benchmark pipeline
|
||||||
|
|
||||||
|
Requires **GNU Make ≥ 4.3** (grouped targets `&:`). On macOS use `gmake`.
|
||||||
|
|
||||||
|
```
|
||||||
|
gmake all # full pipeline
|
||||||
|
gmake simulate # simulation only
|
||||||
|
gmake reference # reference kmer sets only
|
||||||
|
```
|
||||||
|
|
||||||
|
## Pipeline overview
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
GENOMES["genomes/*.fna.gz"]
|
||||||
|
BIN["obikmer binary"]
|
||||||
|
|
||||||
|
GENOMES --> simulate
|
||||||
|
simulate --> simdata[("simulated_data/")]
|
||||||
|
|
||||||
|
simdata --> reference
|
||||||
|
reference --> refnpz[("reference_index/*.npz")]
|
||||||
|
|
||||||
|
subgraph presence ["Presence track"]
|
||||||
|
simdata --> index_presence
|
||||||
|
BIN --> index_presence
|
||||||
|
index_presence --> pres_done[("specimen_index_presence/")]
|
||||||
|
index_presence --> pres_istats[("stats/indexing_presence/")]
|
||||||
|
pres_istats --> aggregate_index_presence
|
||||||
|
|
||||||
|
pres_done --> merge_presence
|
||||||
|
BIN --> merge_presence
|
||||||
|
merge_presence --> gpres[("global_index_presence/")]
|
||||||
|
|
||||||
|
refnpz --> verify_presence
|
||||||
|
pres_done --> verify_presence
|
||||||
|
verify_presence --> vpres_stats[("stats/verify_presence/")]
|
||||||
|
vpres_stats --> aggregate_verify_presence
|
||||||
|
|
||||||
|
gpres --> filter_presence
|
||||||
|
BIN --> filter_presence
|
||||||
|
filter_presence --> spec_pres[("specific_index_presence/")]
|
||||||
|
filter_presence --> spec_pres_stats[("stats/specific_kmer_presence/")]
|
||||||
|
spec_pres_stats --> aggregate_filter_presence
|
||||||
|
|
||||||
|
refnpz --> verify_merge_presence
|
||||||
|
gpres --> verify_merge_presence
|
||||||
|
verify_merge_presence --> vmp[("stats/verify_merge_presence/")]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph count ["Count track"]
|
||||||
|
simdata --> index_count
|
||||||
|
BIN --> index_count
|
||||||
|
index_count --> count_done[("specimen_index_count/")]
|
||||||
|
index_count --> count_istats[("stats/indexing_count/")]
|
||||||
|
count_istats --> aggregate_index_count
|
||||||
|
|
||||||
|
count_done --> merge_count
|
||||||
|
BIN --> merge_count
|
||||||
|
merge_count --> gcount[("global_index_count/")]
|
||||||
|
|
||||||
|
refnpz --> verify_count
|
||||||
|
count_done --> verify_count
|
||||||
|
verify_count --> vcount_stats[("stats/verify_count/")]
|
||||||
|
vcount_stats --> aggregate_verify_count
|
||||||
|
|
||||||
|
gcount --> filter_count
|
||||||
|
BIN --> filter_count
|
||||||
|
filter_count --> spec_count[("specific_index_count/")]
|
||||||
|
filter_count --> spec_count_stats[("stats/specific_kmer_count/")]
|
||||||
|
spec_count_stats --> aggregate_filter_count
|
||||||
|
|
||||||
|
refnpz --> verify_merge_count
|
||||||
|
gcount --> verify_merge_count
|
||||||
|
verify_merge_count --> vmc[("stats/verify_merge_count/")]
|
||||||
|
end
|
||||||
|
|
||||||
|
aggregate_verify_presence --> all
|
||||||
|
aggregate_verify_count --> all
|
||||||
|
vmp --> all
|
||||||
|
vmc --> all
|
||||||
|
all -. "$(MAKE) re-eval" .-> aggregate_filter_presence
|
||||||
|
all -. "$(MAKE) re-eval" .-> aggregate_filter_count
|
||||||
|
```
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
|
||||||
|
| Target | Script | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `simulate` | `simulate.sh` | Simulate sequencing reads from the reference genomes |
|
||||||
|
| `reference` | `build_reference.sh` | Build reference kmer sets (`.npz`) from simulation truth |
|
||||||
|
| `index_presence` | `index_one_presence.sh` | Index each specimen (presence mode) |
|
||||||
|
| `index_count` | `index_one_count.sh` | Index each specimen (count mode) |
|
||||||
|
| `aggregate_index_presence` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (presence) |
|
||||||
|
| `aggregate_index_count` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (count) |
|
||||||
|
| `merge_presence` | `merge_presence.sh` | Merge all specimen presence indexes into a global index |
|
||||||
|
| `merge_count` | `merge_count.sh` | Merge all specimen count indexes into a global index |
|
||||||
|
| `verify_presence` | `verify_one_presence.sh` | Verify each specimen presence index against reference |
|
||||||
|
| `verify_count` | `verify_one_count.sh` | Verify each specimen count index against reference |
|
||||||
|
| `aggregate_verify_presence` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (presence) |
|
||||||
|
| `aggregate_verify_count` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (count) |
|
||||||
|
| `filter_presence` | `filter_one_presence.sh` | Extract species-specific presence indexes from global index |
|
||||||
|
| `filter_count` | `filter_one_count.sh` | Extract species-specific count indexes from global index |
|
||||||
|
| `aggregate_filter_presence` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (presence) |
|
||||||
|
| `aggregate_filter_count` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (count) |
|
||||||
|
| `verify_merge_presence` | `verify_merge_presence.sh` | Verify global presence index against all reference sets |
|
||||||
|
| `verify_merge_count` | `verify_merge_count.sh` | Verify global count index against all reference sets |
|
||||||
|
|
||||||
|
## Directory layout
|
||||||
|
|
||||||
|
```
|
||||||
|
benchmark/
|
||||||
|
├── genomes/ # input reference genomes (.fna.gz)
|
||||||
|
├── simulated_data/ # generated by simulate
|
||||||
|
│ └── <species>/<specimen>/
|
||||||
|
├── reference_index/ # reference kmer sets (.npz)
|
||||||
|
├── specimen_index_presence/ # per-specimen presence indexes
|
||||||
|
├── specimen_index_count/ # per-specimen count indexes
|
||||||
|
├── global_index_presence/ # merged global presence index
|
||||||
|
├── global_index_count/ # merged global count index
|
||||||
|
├── specific_index_presence/ # species-specific presence indexes
|
||||||
|
├── specific_index_count/ # species-specific count indexes
|
||||||
|
└── stats/ # all benchmark statistics
|
||||||
|
├── indexing_presence/
|
||||||
|
├── indexing_count/
|
||||||
|
├── verify_presence/
|
||||||
|
├── verify_count/
|
||||||
|
├── specific_kmer_presence/
|
||||||
|
├── specific_kmer_count/
|
||||||
|
├── verify_merge_presence/
|
||||||
|
└── verify_merge_count/
|
||||||
|
```
|
||||||
Executable
+53
@@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Usage: aggregate_stats.sh TYPE
|
||||||
|
# TYPE = indexing_presence | indexing_count | verify_presence | verify_count | specific_kmer_presence | specific_kmer_count
|
||||||
|
#
|
||||||
|
# Reads all stats/TYPE/*.stats files (one CSV data row each, no header).
|
||||||
|
# Creates a new stats/TYPE/run_NNN.csv only if any .stats file is newer than
|
||||||
|
# the most recent run CSV (idempotent when nothing changed).
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
TYPE="$1"
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"
|
||||||
|
|
||||||
|
case "${TYPE}" in
|
||||||
|
indexing_presence|indexing_count)
|
||||||
|
HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
|
||||||
|
;;
|
||||||
|
verify_presence)
|
||||||
|
HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
|
||||||
|
;;
|
||||||
|
verify_count)
|
||||||
|
HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
|
||||||
|
;;
|
||||||
|
specific_kmer_presence|specific_kmer_count)
|
||||||
|
HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "ERROR: unknown stats type '${TYPE}'" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
# Find most recent existing run CSV (empty string if none).
|
||||||
|
latest_csv=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | sort | tail -1)
|
||||||
|
|
||||||
|
# Check if any .stats file is newer than the latest run CSV.
|
||||||
|
if [[ -n "${latest_csv}" ]] && \
|
||||||
|
[[ -z "$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}" 2>/dev/null)" ]]; then
|
||||||
|
echo "[${TYPE}] stats up to date (${latest_csv})"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | wc -l | tr -d ' ')")
|
||||||
|
CSV="${STATS_DIR}/run_${run_n}.csv"
|
||||||
|
|
||||||
|
echo "${HEADER}" >"${CSV}"
|
||||||
|
|
||||||
|
# Sort .stats files by name for reproducible row order.
|
||||||
|
while IFS= read -r stats_file; do
|
||||||
|
sed "s/^/${run_n},/" "${stats_file}"
|
||||||
|
done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort) >>"${CSV}"
|
||||||
|
|
||||||
|
echo "[${TYPE}] run ${run_n} → ${CSV}"
|
||||||
Executable
+137
@@ -0,0 +1,137 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Build a reference kmer index from paired-end FASTQ reads.
|
||||||
|
|
||||||
|
Extracts canonical kmers — min(kmer, revcomp(kmer)) encoded as uint64 —
|
||||||
|
counts their abundances, and saves a sorted numpy pair (kmers, counts).
|
||||||
|
|
||||||
|
Output .npz arrays
|
||||||
|
kmers : uint64, sorted ascending — canonical kmer integers
|
||||||
|
counts : uint32, same order — raw read abundances
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import gzip
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
|
||||||
|
'a': 0, 'c': 1, 'g': 2, 't': 3}
|
||||||
|
|
||||||
|
# Lookup table: revcomp of one byte (4 bases, 8 bits).
|
||||||
|
# Precomputed once at import time.
|
||||||
|
_REVCOMP8 = [0] * 256
|
||||||
|
for _i in range(256):
|
||||||
|
_rc, _x = 0, _i
|
||||||
|
for _ in range(4):
|
||||||
|
_rc = (_rc << 2) | (3 - (_x & 3))
|
||||||
|
_x >>= 2
|
||||||
|
_REVCOMP8[_i] = _rc
|
||||||
|
del _i, _rc, _x
|
||||||
|
|
||||||
|
|
||||||
|
def revcomp_int(kmer: int, k: int) -> int:
    """Return the reverse complement of a 2-bit-per-base encoded kmer.

    The kmer is consumed from its low-order end in slices of up to 8 bits
    (4 bases). Each slice is reverse-complemented through the ``_REVCOMP8``
    table and appended to the low end of the result, so the last base of
    the input becomes the first base of the output.

    Args:
        kmer: integer encoding of the kmer (2 bits per base).
        k: number of bases encoded in ``kmer``.

    Returns:
        The integer encoding of the reverse-complemented kmer.
    """
    result = 0
    remaining = 2 * k
    while remaining > 0:
        width = 8 if remaining >= 8 else remaining
        # For a partial final slice the table still flips a full byte;
        # shifting right discards the padding bases that ended up on the
        # low side.
        flipped = _REVCOMP8[kmer & 0xFF] >> (8 - width)
        result = (result << width) | flipped
        kmer >>= width
        remaining -= width
    return result
|
||||||
|
|
||||||
|
|
||||||
|
# ── FASTQ parsing ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def iter_sequences(path: str):
    """Yield the sequence line of every record in a FASTQ file.

    Files whose name ends in ``.gz`` are opened with ``gzip``; anything
    else is opened as plain text. Records are assumed to be exactly four
    lines long (header, sequence, separator, quality); only the sequence
    line is returned, with its trailing newline removed.
    """
    open_fn = gzip.open if path.endswith('.gz') else open
    with open_fn(path, 'rt') as handle:
        header = handle.readline()
        while header:                      # empty string means end of file
            sequence = handle.readline().rstrip('\n')
            handle.readline()              # '+' separator line
            handle.readline()              # quality line
            yield sequence
            header = handle.readline()
|
||||||
|
|
||||||
|
|
||||||
|
# ── kmer counting ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def count_kmers(paths: list[str], k: int) -> dict[int, int]:
    """Count canonical kmers over every read of the given FASTQ files.

    Each read is scanned with a rolling 2-bit encoding. Any character that
    is not in ``_ENCODE`` (e.g. ``N``) resets the window, so no kmer spans
    an ambiguous base. For every complete window the smaller of the kmer
    and its reverse complement is counted.

    Args:
        paths: FASTQ files (optionally gzipped) to read.
        k: kmer length in bases.

    Returns:
        Mapping from canonical kmer integer to its number of occurrences.

    Raises:
        ValueError: if ``k`` is not in ``1..32``. Larger kmers do not fit
            the uint64 encoding the output arrays use, and a non-positive
            ``k`` would make the rolling window meaningless.
    """
    if not 1 <= k <= 32:
        raise ValueError(f'kmer size must be between 1 and 32, got {k}')

    mask = (1 << (2 * k)) - 1
    counts: dict[int, int] = defaultdict(int)
    n_reads = 0

    for path in paths:
        for seq in iter_sequences(path):
            n_reads += 1
            kmer = 0
            run = 0                         # consecutive valid bases

            for c in seq:
                b = _ENCODE.get(c)
                if b is None:               # N or unexpected character → reset
                    kmer = 0
                    run = 0
                    continue
                kmer = ((kmer << 2) | b) & mask
                run += 1
                if run >= k:
                    rc = revcomp_int(kmer, k)
                    counts[kmer if kmer <= rc else rc] += 1

            if n_reads % 100_000 == 0:
                print(f' {n_reads:,} reads processed, '
                      f'{len(counts):,} distinct kmers so far',
                      file=sys.stderr)

    print(f' {n_reads:,} reads total, {len(counts):,} distinct kmers',
          file=sys.stderr)
    return counts
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: count kmers in the given reads and save them.

    Parses the arguments, counts canonical kmers with ``count_kmers``,
    optionally drops kmers below ``--min-abundance``, and writes the sorted
    kmers and their counts to a compressed ``.npz`` file.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('reads', nargs='+', metavar='FASTQ',
                        help='Input reads (FASTQ, gzip OK)')
    parser.add_argument('-k', '--kmer-size', type=int, default=31,
                        metavar='K')
    parser.add_argument('--min-abundance', type=int, default=1,
                        metavar='N', help='Drop kmers with count < N (default 1)')
    parser.add_argument('-o', '--output', required=True,
                        metavar='FILE', help='Output .npz path')
    args = parser.parse_args()

    print(f'k={args.kmer_size} files={len(args.reads)}', file=sys.stderr)
    counts = count_kmers(args.reads, args.kmer_size)

    threshold = args.min_abundance
    if threshold > 1:
        n_before = len(counts)
        counts = {kmer: seen for kmer, seen in counts.items() if seen >= threshold}
        print(f' min-abundance={args.min_abundance}: '
              f'{n_before - len(counts):,} kmers dropped, '
              f'{len(counts):,} retained',
              file=sys.stderr)

    print(f'Sorting and saving → {args.output}', file=sys.stderr)
    # Sort once; the counts array is built in the same order as the kmers.
    ordered = sorted(counts)
    kmers_arr = np.fromiter(ordered, dtype=np.uint64, count=len(ordered))
    counts_arr = np.array([counts[kmer] for kmer in ordered], dtype=np.uint32)

    np.savez_compressed(args.output, kmers=kmers_arr, counts=counts_arr)
    print(f'Done {len(kmers_arr):,} kmers → {args.output}', file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+39
@@ -0,0 +1,39 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
SIMDATA_DIR="${SCRIPT_DIR}/simulated_data"
|
||||||
|
REF_DIR="${SCRIPT_DIR}/reference_index"
|
||||||
|
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
|
||||||
|
BUILD_PY="${SCRIPT_DIR}/build_reference.py"
|
||||||
|
|
||||||
|
KMER_SIZE="${KMER_SIZE:-31}"
|
||||||
|
MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}"
|
||||||
|
|
||||||
|
mkdir -p "${REF_DIR}"
|
||||||
|
|
||||||
|
for species_dir in "${SIMDATA_DIR}"/*/; do
|
||||||
|
[[ -d "${species_dir}" ]] || continue
|
||||||
|
species=$(basename "${species_dir}")
|
||||||
|
|
||||||
|
for strain_dir in "${species_dir}"*/; do
|
||||||
|
[[ -d "${strain_dir}" ]] || continue
|
||||||
|
strain=$(basename "${strain_dir}")
|
||||||
|
|
||||||
|
r1="${strain_dir}/reads_R1.fastq.gz"
|
||||||
|
r2="${strain_dir}/reads_R2.fastq.gz"
|
||||||
|
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
|
||||||
|
echo "SKIP ${species}--${strain}: reads not found" >&2
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
out="${REF_DIR}/${species}--${strain}.npz"
|
||||||
|
echo "[${species}--${strain}] → ${out}"
|
||||||
|
|
||||||
|
"${PYTHON}" "${BUILD_PY}" \
|
||||||
|
--kmer-size "${KMER_SIZE}" \
|
||||||
|
--min-abundance "${MIN_ABUNDANCE}" \
|
||||||
|
--output "${out}" \
|
||||||
|
"${r1}" "${r2}"
|
||||||
|
done
|
||||||
|
done
|
||||||
Executable
+226
@@ -0,0 +1,226 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Compute reference pairwise distance matrices from per-specimen .npz kmer indexes.
|
||||||
|
|
||||||
|
Reads all .npz files in reference_index/ (each containing sorted uint64 `kmers`
|
||||||
|
and uint32 `counts`), computes all distance metrics supported by `obikmer distance`,
|
||||||
|
and writes one CSV per metric to reference_dist/.
|
||||||
|
|
||||||
|
Output CSV format matches `obikmer distance --output`:
|
||||||
|
- first row: "genome", then specimen names
|
||||||
|
- subsequent rows: specimen name, then float or int values
|
||||||
|
|
||||||
|
Metrics written
|
||||||
|
jaccard_dist.csv Jaccard distance (presence/absence)
hamming_dist.csv Hamming distance (size of the symmetric difference |A Δ B|)
|
||||||
|
shared_kmers.csv Shared-kmer count matrix (intersection size)
|
||||||
|
bray_curtis_dist.csv Bray-Curtis dissimilarity (raw counts)
|
||||||
|
relfreq_bray_curtis_dist.csv Bray-Curtis on relative frequencies
|
||||||
|
euclidean_dist.csv Euclidean distance (raw counts)
|
||||||
|
relfreq_euclidean_dist.csv Euclidean distance on relative frequencies
|
||||||
|
hellinger_dist.csv Hellinger distance
|
||||||
|
hellinger_euclidean_dist.csv Euclidean distance in Hellinger space
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── pairwise helpers ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def shared_indices(a_kmers: np.ndarray, b_kmers: np.ndarray):
    """Return index arrays (idx_a, idx_b) for kmers present in both sets.

    Both arrays must be sorted uint64. Uses searchsorted: O(|B| log |A|).

    Args:
        a_kmers: sorted kmer array of the first specimen.
        b_kmers: sorted kmer array of the second specimen.

    Returns:
        ``(idx_a, idx_b)`` such that ``a_kmers[idx_a] == b_kmers[idx_b]``
        element-wise. Both are empty when either input is empty.
    """
    # With an empty `a_kmers` the clamp below would become clip(pos, 0, -1)
    # and the lookup would raise IndexError; nothing can be shared anyway.
    if len(a_kmers) == 0 or len(b_kmers) == 0:
        return np.empty(0, dtype=np.intp), np.empty(0, dtype=np.intp)

    pos = np.searchsorted(a_kmers, b_kmers)
    # searchsorted returns len(a_kmers) for values beyond the last element;
    # clamp so the equality test below can index safely.
    pos = np.clip(pos, 0, len(a_kmers) - 1)
    mask = a_kmers[pos] == b_kmers
    idx_b = np.where(mask)[0]
    idx_a = pos[idx_b]
    return idx_a, idx_b
|
||||||
|
|
||||||
|
|
||||||
|
def pairwise_stats(specimens: list[dict]) -> dict[str, np.ndarray]:
    """Compute all pairwise distance matrices at once.

    Each specimen dict has keys: name, kmers, counts. ``kmers`` must be
    sorted (see ``shared_indices``) and ``counts`` is aligned with it.

    Every metric is derived from per-specimen totals plus the counts of the
    kmers shared by a pair, so the union of the two kmer sets is never built.

    Returns:
        dict metric_name → n×n ndarray. ``shared_kmers`` is uint64, every
        other matrix is float64. Matrices are symmetric; the diagonal is
        left at zero for all of them.
    """
    n = len(specimens)

    # Per-specimen scalars. The sum uses an explicit uint64 accumulator so it
    # does not depend on the platform's default integer width.
    kmer_counts = np.array([len(s['kmers']) for s in specimens], dtype=np.uint64)
    count_sums = np.array([s['counts'].sum(dtype=np.uint64) for s in specimens],
                          dtype=np.uint64)

    # Per-specimen sum-of-squares (for the Euclidean decomposition).
    sq_sums = np.array([(s['counts'].astype(np.float64) ** 2).sum() for s in specimens])

    shared_mat = np.zeros((n, n), dtype=np.uint64)
    hamming_mat = np.zeros((n, n), dtype=np.float64)
    jaccard_mat = np.zeros((n, n), dtype=np.float64)
    bray_mat = np.zeros((n, n), dtype=np.float64)
    relfreq_bray = np.zeros((n, n), dtype=np.float64)
    euclidean_mat = np.zeros((n, n), dtype=np.float64)
    relfreq_eucl = np.zeros((n, n), dtype=np.float64)
    hellinger_mat = np.zeros((n, n), dtype=np.float64)
    hell_eucl_mat = np.zeros((n, n), dtype=np.float64)

    sq2 = np.sqrt(2.0)

    for i in range(n):
        a_km = specimens[i]['kmers']
        a_raw = specimens[i]['counts']
        sa = float(count_sums[i])
        na = int(kmer_counts[i])

        for j in range(i + 1, n):
            b_km = specimens[j]['kmers']
            b_raw = specimens[j]['counts']
            sb = float(count_sums[j])
            nb = int(kmer_counts[j])

            idx_a, idx_b = shared_indices(a_km, b_km)
            inter = len(idx_a)

            # Only the shared entries are needed as floats; converting after
            # indexing avoids a full-array copy for every pair.
            ca_sh = a_raw[idx_a].astype(np.float64)
            cb_sh = b_raw[idx_b].astype(np.float64)

            # ── Presence metrics ──────────────────────────────────────────
            union = na + nb - inter
            jac = (1.0 - inter / union) if union else 0.0
            hamming = float(na + nb - 2 * inter)          # |A Δ B|

            # ── Count metrics ─────────────────────────────────────────────
            # Bray-Curtis: 1 - 2*Σmin(a,b) / (Σa + Σb)
            sum_min = np.minimum(ca_sh, cb_sh).sum()
            denom_bc = sa + sb
            bc = (1.0 - 2.0 * sum_min / denom_bc) if denom_bc else 0.0

            # RelfreqBray: 1 - Σmin(a/sa, b/sb)   [only shared contribute]
            if sa and sb:
                rfb = 1.0 - np.minimum(ca_sh / sa, cb_sh / sb).sum()
            else:
                rfb = 0.0

            # Euclidean: √(Σa² + Σb² - 2·Σ(a·b)_shared)
            cross = (ca_sh * cb_sh).sum()
            eucl_partial = sq_sums[i] + sq_sums[j] - 2.0 * cross
            # Clamp: rounding can push the expansion slightly below zero.
            eucl = np.sqrt(max(eucl_partial, 0.0))

            # RelfreqEuclidean: √(Σ(a/sa - b/sb)²)
            #   = √(Σa²/sa² + Σb²/sb² - 2·Σ(a·b)_shared/(sa·sb))
            if sa and sb:
                rf_cross = (ca_sh / sa * (cb_sh / sb)).sum()
                rfe_partial = (sq_sums[i] / sa**2
                               + sq_sums[j] / sb**2
                               - 2.0 * rf_cross)
                rfe = np.sqrt(max(rfe_partial, 0.0))
            else:
                rfe = 0.0

            # Hellinger partial: Σ(√(a/sa) - √(b/sb))² over global universe
            #   = 2 - 2·Σ√(a·b)_shared / √(sa·sb)
            if sa and sb:
                bc_coeff = np.sqrt(ca_sh * cb_sh).sum() / np.sqrt(sa * sb)
                hell_partial = max(2.0 - 2.0 * bc_coeff, 0.0)
            else:
                hell_partial = 0.0

            hell = np.sqrt(hell_partial) / sq2
            hell_euc = np.sqrt(hell_partial)

            # ── Fill symmetric matrices ───────────────────────────────────
            for mat, val in [
                (shared_mat, inter),
                (hamming_mat, hamming),
                (jaccard_mat, jac),
                (bray_mat, bc),
                (relfreq_bray, rfb),
                (euclidean_mat, eucl),
                (relfreq_eucl, rfe),
                (hellinger_mat, hell),
                (hell_eucl_mat, hell_euc),
            ]:
                mat[i, j] = val
                mat[j, i] = val

    return {
        'shared_kmers': shared_mat,
        'hamming_dist': hamming_mat,
        'jaccard_dist': jaccard_mat,
        'bray_curtis_dist': bray_mat,
        'relfreq_bray_curtis_dist': relfreq_bray,
        'euclidean_dist': euclidean_mat,
        'relfreq_euclidean_dist': relfreq_eucl,
        'hellinger_dist': hellinger_mat,
        'hellinger_euclidean_dist': hell_eucl_mat,
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ── I/O ───────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def write_csv(path: Path, labels: list[str], mat: np.ndarray, fmt: str) -> None:
    """Write a labelled square matrix as CSV and log the destination.

    The first row is ``genome`` followed by the labels; each following row
    is a label followed by that row of ``mat`` rendered with the format
    spec ``fmt`` (e.g. ``'d'`` or ``'.6f'``).
    """
    columns = range(len(labels))
    with path.open('w') as out:
        out.write('genome,' + ','.join(labels) + '\n')
        for r, name in enumerate(labels):
            cells = [format(mat[r, c], fmt) for c in columns]
            out.write(f'{name},' + ','.join(cells) + '\n')
    print(f' → {path}', file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: load every .npz specimen and write one CSV per metric.

    Exits with status 1 when the reference directory contains no ``.npz``
    file. The output directory is created if needed, including any missing
    parent directories.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('--ref-dir', default='reference_index',
                    help='Directory with per-specimen .npz files (default: reference_index)')
    ap.add_argument('--out-dir', default='reference_dist',
                    help='Output directory for CSV files (default: reference_dist)')
    args = ap.parse_args()

    ref_dir = Path(args.ref_dir)
    out_dir = Path(args.out_dir)
    # parents=True so that a nested --out-dir works on a fresh checkout.
    out_dir.mkdir(parents=True, exist_ok=True)

    npz_files = sorted(ref_dir.glob('*.npz'))
    if not npz_files:
        print(f'ERROR: no .npz files found in {ref_dir}', file=sys.stderr)
        sys.exit(1)

    print(f'Loading {len(npz_files)} specimen(s) from {ref_dir}/', file=sys.stderr)
    specimens = []
    for f in npz_files:
        # np.load keeps the archive open until closed; pull both arrays out
        # and release the file handle before moving on.
        with np.load(f) as data:
            kmers = data['kmers']
            counts = data['counts']
        specimens.append({
            'name': f.stem,
            'kmers': kmers,
            'counts': counts,
        })
        print(f' {f.stem}: {len(kmers):,} kmers', file=sys.stderr)

    labels = [s['name'] for s in specimens]
    n = len(labels)
    print(f'\nComputing pairwise distances for {n} specimens…', file=sys.stderr)

    matrices = pairwise_stats(specimens)

    print(f'\nWriting CSVs to {out_dir}/', file=sys.stderr)
    # (metric name, format spec): the shared-kmer matrix holds integers,
    # every other metric is written with six decimals.
    outputs = [
        ('shared_kmers', 'd'),
        ('hamming_dist', '.6f'),
        ('jaccard_dist', '.6f'),
        ('bray_curtis_dist', '.6f'),
        ('relfreq_bray_curtis_dist', '.6f'),
        ('euclidean_dist', '.6f'),
        ('relfreq_euclidean_dist', '.6f'),
        ('hellinger_dist', '.6f'),
        ('hellinger_euclidean_dist', '.6f'),
    ]
    for metric, fmt in outputs:
        write_csv(out_dir / f'{metric}.csv', labels, matrices[metric], fmt)

    print('\nDone.', file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+182
@@ -0,0 +1,182 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Compare all reference distance matrices against obikmer distance outputs.
|
||||||
|
|
||||||
|
Reads from:
|
||||||
|
reference_dist/ — ground-truth matrices computed by build_reference_dist.py
|
||||||
|
obikmer_dist/ — matrices produced by `obikmer distance`
|
||||||
|
|
||||||
|
Handles label reordering: both matrices are sorted by genome label before
|
||||||
|
element-wise comparison, so column/row order differences are irrelevant.
|
||||||
|
|
||||||
|
Output: stats/dist_comparison/summary.csv
|
||||||
|
comparison,max_abs,mean_abs,rmse,n_pairs,status
|
||||||
|
"""
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── CSV loading ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_matrix(path: Path) -> tuple[list[str], np.ndarray]:
    """Load a distance-matrix CSV; return (sorted_labels, matrix_float64).

    The first CSV row is a header whose first cell is ignored and whose
    remaining cells name the columns. Every other row is a label followed
    by its values. Rows and columns are both reordered to the sorted label
    order, so two files listing the same specimens in different orders
    produce directly comparable matrices.

    Blank lines (for example a trailing empty line) are skipped.

    Raises:
        ValueError: if the file has no header row.
        KeyError: if a row label does not appear in the header.
    """
    with path.open(newline='') as fh:
        reader = csv.reader(fh)
        first = next(reader, None)
        if first is None:
            raise ValueError(f'{path}: empty distance matrix file')
        header = first[1:]                   # skip 'genome' column
        raw: dict[str, list[float]] = {}
        for row in reader:
            if not row:                      # csv yields [] for blank lines
                continue
            raw[row[0]] = [float(x) for x in row[1:]]

    label_to_col = {h: i for i, h in enumerate(header)}
    labels = sorted(raw.keys())
    n = len(labels)
    mat = np.zeros((n, n), dtype=np.float64)
    for i, ri in enumerate(labels):
        for j, cj in enumerate(labels):
            mat[i, j] = raw[ri][label_to_col[cj]]
    return labels, mat
|
||||||
|
|
||||||
|
|
||||||
|
# ── comparison ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare(label: str,
            ref_path: Path,
            obi_path: Path,
            tol: float = 1e-4) -> dict:
    """Compare one reference matrix with its obikmer counterpart.

    Only off-diagonal cells are compared.

    Args:
        label: name of the comparison, copied into the result row.
        ref_path: CSV holding the reference matrix.
        obi_path: CSV holding the matrix produced by obikmer.
        tol: largest absolute difference still reported as PASS.

    Returns:
        A summary row with keys comparison, max_abs, mean_abs, rmse,
        n_pairs and status. Status is one of:
          REF_MISSING / OBI_MISSING – an input file does not exist;
          LABEL_MISMATCH            – the two files list different genomes;
          NO_PAIRS                  – fewer than two genomes, nothing to compare;
          PASS / FAIL               – max_abs is within / above ``tol``.
        The numeric fields are empty strings unless status is PASS or FAIL
        (n_pairs is '0' for NO_PAIRS).
    """
    blank = {'max_abs': '', 'mean_abs': '', 'rmse': '', 'n_pairs': ''}

    if not ref_path.exists():
        return {'comparison': label, 'status': 'REF_MISSING', **blank}
    if not obi_path.exists():
        return {'comparison': label, 'status': 'OBI_MISSING', **blank}

    ref_labels, ref_mat = load_matrix(ref_path)
    obi_labels, obi_mat = load_matrix(obi_path)

    if ref_labels != obi_labels:
        only_ref = sorted(set(ref_labels) - set(obi_labels))
        only_obi = sorted(set(obi_labels) - set(ref_labels))
        print(f' [{label}] label mismatch — '
              f'only_ref={only_ref} only_obi={only_obi}', file=sys.stderr)
        return {'comparison': label, 'status': 'LABEL_MISMATCH', **blank}

    n = len(ref_labels)
    # Off-diagonal mask
    mask = ~np.eye(n, dtype=bool)
    diff = np.abs(ref_mat[mask] - obi_mat[mask])
    n_pairs = diff.size

    # With zero or one genome there is no off-diagonal cell; max()/mean() on
    # an empty array would raise or return NaN, so report it explicitly.
    if n_pairs == 0:
        print(f' [{label}] n=0 nothing to compare', file=sys.stderr)
        return {'comparison': label, 'status': 'NO_PAIRS',
                'max_abs': '', 'mean_abs': '', 'rmse': '', 'n_pairs': '0'}

    max_abs = float(diff.max())
    mean_abs = float(diff.mean())
    rmse = float(np.sqrt((diff ** 2).mean()))
    status = 'PASS' if max_abs <= tol else 'FAIL'

    print(f' [{label}] n={n_pairs} '
          f'max={max_abs:.3e} mean={mean_abs:.3e} rmse={rmse:.3e} {status}',
          file=sys.stderr)
    return {
        'comparison': label,
        'max_abs': f'{max_abs:.6e}',
        'mean_abs': f'{mean_abs:.6e}',
        'rmse': f'{rmse:.6e}',
        'n_pairs': str(n_pairs),
        'status': status,
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ── comparison table ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# (label, ref_csv, obikmer_csv)
|
||||||
|
# The reference jaccard/shared is presence-based, which should match both
|
||||||
|
# presence/jaccard and count/jaccard (threshold=1).
|
||||||
|
COMPARISONS = [
|
||||||
|
# ── presence index ────────────────────────────────────────────────────────
|
||||||
|
('presence/jaccard_dist',
|
||||||
|
'reference_dist/jaccard_dist.csv',
|
||||||
|
'obikmer_dist/presence/jaccard_dist.csv'),
|
||||||
|
|
||||||
|
('presence/jaccard_shared',
|
||||||
|
'reference_dist/shared_kmers.csv',
|
||||||
|
'obikmer_dist/presence/jaccard_shared.csv'),
|
||||||
|
|
||||||
|
('presence/hamming_dist',
|
||||||
|
'reference_dist/hamming_dist.csv',
|
||||||
|
'obikmer_dist/presence/hamming_dist.csv'),
|
||||||
|
|
||||||
|
# ── count index (jaccard cross-check) ─────────────────────────────────────
|
||||||
|
('count/jaccard_dist',
|
||||||
|
'reference_dist/jaccard_dist.csv',
|
||||||
|
'obikmer_dist/count/jaccard_dist.csv'),
|
||||||
|
|
||||||
|
('count/jaccard_shared',
|
||||||
|
'reference_dist/shared_kmers.csv',
|
||||||
|
'obikmer_dist/count/jaccard_shared.csv'),
|
||||||
|
|
||||||
|
# ── count index (count-based metrics) ────────────────────────────────────
|
||||||
|
('count/bray_curtis_dist',
|
||||||
|
'reference_dist/bray_curtis_dist.csv',
|
||||||
|
'obikmer_dist/count/bray_curtis_dist.csv'),
|
||||||
|
|
||||||
|
('count/relfreq_bray_curtis_dist',
|
||||||
|
'reference_dist/relfreq_bray_curtis_dist.csv',
|
||||||
|
'obikmer_dist/count/relfreq_bray_curtis_dist.csv'),
|
||||||
|
|
||||||
|
('count/euclidean_dist',
|
||||||
|
'reference_dist/euclidean_dist.csv',
|
||||||
|
'obikmer_dist/count/euclidean_dist.csv'),
|
||||||
|
|
||||||
|
('count/relfreq_euclidean_dist',
|
||||||
|
'reference_dist/relfreq_euclidean_dist.csv',
|
||||||
|
'obikmer_dist/count/relfreq_euclidean_dist.csv'),
|
||||||
|
|
||||||
|
('count/hellinger_dist',
|
||||||
|
'reference_dist/hellinger_dist.csv',
|
||||||
|
'obikmer_dist/count/hellinger_dist.csv'),
|
||||||
|
|
||||||
|
('count/hellinger_euclidean_dist',
|
||||||
|
'reference_dist/hellinger_euclidean_dist.csv',
|
||||||
|
'obikmer_dist/count/hellinger_euclidean_dist.csv'),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Run every comparison in ``COMPARISONS`` and write the summary CSV.

    Prints a PASS/FAIL/SKIP tally to stderr and exits with status 1 when at
    least one comparison has status FAIL.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--tol', type=float, default=1e-4,
                        help='Max abs diff threshold for PASS/FAIL (default 1e-4)')
    parser.add_argument('--out', default='stats/dist_comparison/summary.csv',
                        help='Output summary CSV path')
    args = parser.parse_args()

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    print(f'Comparing {len(COMPARISONS)} matrix pairs…', file=sys.stderr)
    rows = [
        compare(name, Path(ref_csv), Path(obi_csv), tol=args.tol)
        for name, ref_csv, obi_csv in COMPARISONS
    ]

    columns = ['comparison', 'max_abs', 'mean_abs', 'rmse', 'n_pairs', 'status']
    with out_path.open('w', newline='') as fh:
        writer = csv.DictWriter(fh, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)

    print(f'\n→ {out_path}', file=sys.stderr)

    # Anything that is neither PASS nor FAIL is reported as SKIP.
    statuses = [row.get('status') for row in rows]
    n_pass = statuses.count('PASS')
    n_fail = statuses.count('FAIL')
    print(f'Summary: {n_pass} PASS {n_fail} FAIL '
          f'{len(rows) - n_pass - n_fail} SKIP', file=sys.stderr)
    if n_fail:
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the comparison only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
@@ -0,0 +1,199 @@
|
|||||||
|
# Specimen identifiers in <species>--<strain> form. Each one has a block of
# prerequisite-only rules further down (simulated reads, reference index,
# presence/count specimen indexes and the verify stats).
SPECIMENS := Escherichia_coli--K-12_MG1655 Escherichia_coli--EDL933 Salmonella_enterica--LT2 Escherichia_coli--CFT073 Bacillus_subtilis--168 Salmonella_enterica--P125109 Shouchella_clausii--KSM-K16 Escherichia_coli--K-12_W3110 Klebsiella_pneumoniae--MGH_78578 Opitutus_terrae--PB90-1 Saccharolobus_islandicus--M.16.4 Acidobacterium_capsulatum--ATCC_51196 Salmonella_enterica--AKU_12601 Proteus_mirabilis--HI4320 Salmonella_enterica--CT18 Klebsiella_pneumoniae--HS11286 Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 Klebsiella_pneumoniae--ATCC_13883 Yersinia_ruckeri--YRB Candidozyma_auris--GCF_003013715.1_ASM301371v2
|
||||||
|
# Distinct species names taken from the specimens above. Each one has a pair
# of specific_index_presence / specific_index_count rules further down that
# depend on the corresponding global index.
SPECIES := Escherichia_coli Salmonella_enterica Bacillus_subtilis Shouchella_clausii Klebsiella_pneumoniae Opitutus_terrae Saccharolobus_islandicus Acidobacterium_capsulatum Proteus_mirabilis Wolbachia_endosymbiont Yersinia_ruckeri Candidozyma_auris
|
||||||
|
|
||||||
|
# Escherichia_coli--K-12_MG1655
|
||||||
|
simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz: genomes/GCF_000005845.2_ASM584v2_genomic.fna.gz
|
||||||
|
reference_index/Escherichia_coli--K-12_MG1655.npz: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done stats/indexing_presence/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Escherichia_coli--K-12_MG1655/index.done stats/indexing_count/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done
|
||||||
|
stats/verify_count/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_count/Escherichia_coli--K-12_MG1655/index.done
|
||||||
|
|
||||||
|
# Escherichia_coli--EDL933
|
||||||
|
simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz: genomes/GCF_000006665.1_ASM666v1_genomic.fna.gz
|
||||||
|
reference_index/Escherichia_coli--EDL933.npz: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Escherichia_coli--EDL933/index.done stats/indexing_presence/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Escherichia_coli--EDL933/index.done stats/indexing_count/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_presence/Escherichia_coli--EDL933/index.done
|
||||||
|
stats/verify_count/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_count/Escherichia_coli--EDL933/index.done
|
||||||
|
|
||||||
|
# Salmonella_enterica--LT2
|
||||||
|
simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz: genomes/GCF_000006945.2_ASM694v2_genomic.fna.gz
|
||||||
|
reference_index/Salmonella_enterica--LT2.npz: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Salmonella_enterica--LT2/index.done stats/indexing_presence/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Salmonella_enterica--LT2/index.done stats/indexing_count/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_presence/Salmonella_enterica--LT2/index.done
|
||||||
|
stats/verify_count/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_count/Salmonella_enterica--LT2/index.done
|
||||||
|
|
||||||
|
# Escherichia_coli--CFT073
|
||||||
|
simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz: genomes/GCF_000007445.1_ASM744v1_genomic.fna.gz
|
||||||
|
reference_index/Escherichia_coli--CFT073.npz: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Escherichia_coli--CFT073/index.done stats/indexing_presence/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Escherichia_coli--CFT073/index.done stats/indexing_count/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_presence/Escherichia_coli--CFT073/index.done
|
||||||
|
stats/verify_count/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_count/Escherichia_coli--CFT073/index.done
|
||||||
|
|
||||||
|
# Bacillus_subtilis--168
|
||||||
|
simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz: genomes/GCF_000009045.1_ASM904v1_genomic.fna.gz
|
||||||
|
reference_index/Bacillus_subtilis--168.npz: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Bacillus_subtilis--168/index.done stats/indexing_presence/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Bacillus_subtilis--168/index.done stats/indexing_count/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_presence/Bacillus_subtilis--168/index.done
|
||||||
|
stats/verify_count/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_count/Bacillus_subtilis--168/index.done
|
||||||
|
|
||||||
|
# Salmonella_enterica--P125109
|
||||||
|
simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz: genomes/GCF_000009505.1_ASM950v1_genomic.fna.gz
|
||||||
|
reference_index/Salmonella_enterica--P125109.npz: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Salmonella_enterica--P125109/index.done stats/indexing_presence/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Salmonella_enterica--P125109/index.done stats/indexing_count/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_presence/Salmonella_enterica--P125109/index.done
|
||||||
|
stats/verify_count/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_count/Salmonella_enterica--P125109/index.done
|
||||||
|
|
||||||
|
# Shouchella_clausii--KSM-K16
|
||||||
|
simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz: genomes/GCF_000009825.1_ASM982v1_genomic.fna.gz
|
||||||
|
reference_index/Shouchella_clausii--KSM-K16.npz: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Shouchella_clausii--KSM-K16/index.done stats/indexing_presence/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Shouchella_clausii--KSM-K16/index.done stats/indexing_count/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_presence/Shouchella_clausii--KSM-K16/index.done
|
||||||
|
stats/verify_count/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_count/Shouchella_clausii--KSM-K16/index.done
|
||||||
|
|
||||||
|
# Escherichia_coli--K-12_W3110
|
||||||
|
simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz: genomes/GCF_000010245.2_ASM1024v1_genomic.fna.gz
|
||||||
|
reference_index/Escherichia_coli--K-12_W3110.npz: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Escherichia_coli--K-12_W3110/index.done stats/indexing_presence/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Escherichia_coli--K-12_W3110/index.done stats/indexing_count/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_presence/Escherichia_coli--K-12_W3110/index.done
|
||||||
|
stats/verify_count/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_count/Escherichia_coli--K-12_W3110/index.done
|
||||||
|
|
||||||
|
# Klebsiella_pneumoniae--MGH_78578
|
||||||
|
simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz: genomes/GCF_000016305.1_ASM1630v1_genomic.fna.gz
|
||||||
|
reference_index/Klebsiella_pneumoniae--MGH_78578.npz: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_presence/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_count/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done
|
||||||
|
stats/verify_count/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done
|
||||||
|
|
||||||
|
# Opitutus_terrae--PB90-1
|
||||||
|
simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz: genomes/GCF_000019965.1_ASM1996v1_genomic.fna.gz
|
||||||
|
reference_index/Opitutus_terrae--PB90-1.npz: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Opitutus_terrae--PB90-1/index.done stats/indexing_presence/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Opitutus_terrae--PB90-1/index.done stats/indexing_count/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_presence/Opitutus_terrae--PB90-1/index.done
|
||||||
|
stats/verify_count/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_count/Opitutus_terrae--PB90-1/index.done
|
||||||
|
|
||||||
|
# Saccharolobus_islandicus--M.16.4
|
||||||
|
simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz: genomes/GCF_000022445.1_ASM2244v1_genomic.fna.gz
|
||||||
|
reference_index/Saccharolobus_islandicus--M.16.4.npz: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_presence/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_count/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done
|
||||||
|
stats/verify_count/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done
|
||||||
|
|
||||||
|
# Acidobacterium_capsulatum--ATCC_51196
|
||||||
|
simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz: genomes/GCF_000022565.1_ASM2256v1_genomic.fna.gz
|
||||||
|
reference_index/Acidobacterium_capsulatum--ATCC_51196.npz: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_presence/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_count/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done
|
||||||
|
stats/verify_count/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done
|
||||||
|
|
||||||
|
# Salmonella_enterica--AKU_12601
|
||||||
|
simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz: genomes/GCF_000026565.1_ASM2656v1_genomic.fna.gz
|
||||||
|
reference_index/Salmonella_enterica--AKU_12601.npz: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Salmonella_enterica--AKU_12601/index.done stats/indexing_presence/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Salmonella_enterica--AKU_12601/index.done stats/indexing_count/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_presence/Salmonella_enterica--AKU_12601/index.done
|
||||||
|
stats/verify_count/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_count/Salmonella_enterica--AKU_12601/index.done
|
||||||
|
|
||||||
|
# Proteus_mirabilis--HI4320
|
||||||
|
simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz: genomes/GCF_000069965.1_ASM6996v1_genomic.fna.gz
|
||||||
|
reference_index/Proteus_mirabilis--HI4320.npz: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Proteus_mirabilis--HI4320/index.done stats/indexing_presence/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Proteus_mirabilis--HI4320/index.done stats/indexing_count/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_presence/Proteus_mirabilis--HI4320/index.done
|
||||||
|
stats/verify_count/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_count/Proteus_mirabilis--HI4320/index.done
|
||||||
|
|
||||||
|
# Salmonella_enterica--CT18
|
||||||
|
simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz: genomes/GCF_000195995.1_ASM19599v1_genomic.fna.gz
|
||||||
|
reference_index/Salmonella_enterica--CT18.npz: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Salmonella_enterica--CT18/index.done stats/indexing_presence/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Salmonella_enterica--CT18/index.done stats/indexing_count/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_presence/Salmonella_enterica--CT18/index.done
|
||||||
|
stats/verify_count/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_count/Salmonella_enterica--CT18/index.done
|
||||||
|
|
||||||
|
# Klebsiella_pneumoniae--HS11286
|
||||||
|
simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz: genomes/GCF_000240185.1_ASM24018v2_genomic.fna.gz
|
||||||
|
reference_index/Klebsiella_pneumoniae--HS11286.npz: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_presence/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_count/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done
|
||||||
|
stats/verify_count/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done
|
||||||
|
|
||||||
|
# Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1
|
||||||
|
simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz: genomes/GCF_000306885.1_ASM30688v1_genomic.fna.gz
|
||||||
|
reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
|
||||||
|
stats/verify_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
|
||||||
|
|
||||||
|
# Klebsiella_pneumoniae--ATCC_13883
|
||||||
|
simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz: genomes/GCF_000742135.1_ASM74213v1_genomic.fna.gz
|
||||||
|
reference_index/Klebsiella_pneumoniae--ATCC_13883.npz: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_presence/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_count/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done
|
||||||
|
stats/verify_count/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done
|
||||||
|
|
||||||
|
# Yersinia_ruckeri--YRB
|
||||||
|
simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz: genomes/GCF_000834255.1_ASM83425v1_genomic.fna.gz
|
||||||
|
reference_index/Yersinia_ruckeri--YRB.npz: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Yersinia_ruckeri--YRB/index.done stats/indexing_presence/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Yersinia_ruckeri--YRB/index.done stats/indexing_count/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_presence/Yersinia_ruckeri--YRB/index.done
|
||||||
|
stats/verify_count/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_count/Yersinia_ruckeri--YRB/index.done
|
||||||
|
|
||||||
|
# Candidozyma_auris--GCF_003013715.1_ASM301371v2
|
||||||
|
simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz: genomes/GCF_003013715.1_ASM301371v2_genomic.fna.gz
|
||||||
|
reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
|
||||||
|
stats/verify_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
|
||||||
|
|
||||||
|
# Escherichia_coli
|
||||||
|
specific_index_presence/Escherichia_coli/index.done stats/specific_kmer_presence/Escherichia_coli.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Escherichia_coli/index.done stats/specific_kmer_count/Escherichia_coli.stats: global_index_count/index.done
|
||||||
|
# Salmonella_enterica
|
||||||
|
specific_index_presence/Salmonella_enterica/index.done stats/specific_kmer_presence/Salmonella_enterica.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Salmonella_enterica/index.done stats/specific_kmer_count/Salmonella_enterica.stats: global_index_count/index.done
|
||||||
|
# Bacillus_subtilis
|
||||||
|
specific_index_presence/Bacillus_subtilis/index.done stats/specific_kmer_presence/Bacillus_subtilis.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Bacillus_subtilis/index.done stats/specific_kmer_count/Bacillus_subtilis.stats: global_index_count/index.done
|
||||||
|
# Shouchella_clausii
|
||||||
|
specific_index_presence/Shouchella_clausii/index.done stats/specific_kmer_presence/Shouchella_clausii.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Shouchella_clausii/index.done stats/specific_kmer_count/Shouchella_clausii.stats: global_index_count/index.done
|
||||||
|
# Klebsiella_pneumoniae
|
||||||
|
specific_index_presence/Klebsiella_pneumoniae/index.done stats/specific_kmer_presence/Klebsiella_pneumoniae.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Klebsiella_pneumoniae/index.done stats/specific_kmer_count/Klebsiella_pneumoniae.stats: global_index_count/index.done
|
||||||
|
# Opitutus_terrae
|
||||||
|
specific_index_presence/Opitutus_terrae/index.done stats/specific_kmer_presence/Opitutus_terrae.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Opitutus_terrae/index.done stats/specific_kmer_count/Opitutus_terrae.stats: global_index_count/index.done
|
||||||
|
# Saccharolobus_islandicus
|
||||||
|
specific_index_presence/Saccharolobus_islandicus/index.done stats/specific_kmer_presence/Saccharolobus_islandicus.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Saccharolobus_islandicus/index.done stats/specific_kmer_count/Saccharolobus_islandicus.stats: global_index_count/index.done
|
||||||
|
# Acidobacterium_capsulatum
|
||||||
|
specific_index_presence/Acidobacterium_capsulatum/index.done stats/specific_kmer_presence/Acidobacterium_capsulatum.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Acidobacterium_capsulatum/index.done stats/specific_kmer_count/Acidobacterium_capsulatum.stats: global_index_count/index.done
|
||||||
|
# Proteus_mirabilis
|
||||||
|
specific_index_presence/Proteus_mirabilis/index.done stats/specific_kmer_presence/Proteus_mirabilis.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Proteus_mirabilis/index.done stats/specific_kmer_count/Proteus_mirabilis.stats: global_index_count/index.done
|
||||||
|
# Wolbachia_endosymbiont
|
||||||
|
specific_index_presence/Wolbachia_endosymbiont/index.done stats/specific_kmer_presence/Wolbachia_endosymbiont.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Wolbachia_endosymbiont/index.done stats/specific_kmer_count/Wolbachia_endosymbiont.stats: global_index_count/index.done
|
||||||
|
# Yersinia_ruckeri
|
||||||
|
specific_index_presence/Yersinia_ruckeri/index.done stats/specific_kmer_presence/Yersinia_ruckeri.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Yersinia_ruckeri/index.done stats/specific_kmer_count/Yersinia_ruckeri.stats: global_index_count/index.done
|
||||||
|
# Candidozyma_auris
|
||||||
|
specific_index_presence/Candidozyma_auris/index.done stats/specific_kmer_presence/Candidozyma_auris.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Candidozyma_auris/index.done stats/specific_kmer_count/Candidozyma_auris.stats: global_index_count/index.done
|
||||||
Executable
+48
@@ -0,0 +1,48 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Download the genome assemblies listed in `assemblies` with the `datasets`
# CLI and store every *.fna file found in each archive as
# genomes/<basename>.gz (written through `obiconvert -Z`).
#
# Tools expected on PATH: datasets, unzip, find, obiconvert.
# The per-accession archive and extraction directory are created in the
# current working directory and removed once the accession is processed.

set -euo pipefail

assemblies=(
    GCF_000005845.2
    GCF_000010245.2
    GCF_000007445.1
    GCF_000006665.1

    GCF_000006945.2
    GCF_000195995.1
    GCF_000009505.1
    GCF_000026565.1

    GCF_000016305.1
    GCF_000019965.1
    GCF_000240185.1
    GCF_000742135.1

    GCF_000069965.1
    GCF_000022565.1
    GCF_000306885.1
    GCF_003013715.1

    GCF_000009045.1
    GCF_000009825.1
    GCF_000022445.1
    GCF_000834255.1
)

mkdir -p genomes

for acc in "${assemblies[@]}"; do
    echo "Downloading ${acc}"

    datasets download genome accession "${acc}" \
        --include genome \
        --filename "${acc}.zip"

    unzip -q "${acc}.zip" -d "${acc}"

    # `IFS= read -r` keeps each path verbatim (no backslash processing, no
    # trimming); the quoted expansions prevent word splitting and globbing
    # on unusual file names.
    find "${acc}" -name "*.fna" |
        while IFS= read -r file; do
            obiconvert -Z "${file}" >"genomes/$(basename "${file}").gz"
        done

    rm -rf "${acc}" "${acc}.zip"
done
|
||||||
Executable
+108
@@ -0,0 +1,108 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: filter_one_count.sh SPECIES
# Filters global_index_count to keep only kmers specific to SPECIES,
# then selects the SPECIES column in-place.
# Outputs:
#   specific_index_count/SPECIES/index.done   (written by obikmer select)
#   stats/specific_kmer_count/SPECIES.stats   (one CSV data row, no header)
set -euo pipefail

# Fail with a usage message instead of bash's bare "unbound variable" error.
if [[ $# -lt 1 ]]; then
    echo "Usage: $(basename "$0") SPECIES" >&2
    exit 2
fi

SPECIES="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

SOURCE="${SCRIPT_DIR}/global_index_count"
OUTPUT="${SCRIPT_DIR}/specific_index_count/${SPECIES}"
STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_count"
STATS_FILE="${STATS_DIR}/${SPECIES}.stats"

mkdir -p "${STATS_DIR}"

echo "[${SPECIES}] filter (count) → ${OUTPUT}"

# stderr of each obikmer run is captured so the stats can be parsed from it
# afterwards; the temporary logs are removed when the script exits.
LOG_FILTER=$(mktemp)
LOG_SELECT=$(mktemp)
trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT

# run_logged LOGFILE COMMAND [ARG...]
# Runs COMMAND with its stderr captured in LOGFILE, replays LOGFILE on this
# script's stderr whether or not COMMAND succeeded, then returns COMMAND's
# exit status. Without the replay-on-failure, `set -e` would abort the script
# before the log is shown and the EXIT trap would delete the only copy of the
# error output.
run_logged() {
    local log="$1"
    shift
    local status=0
    "$@" 2>"${log}" || status=$?
    cat "${log}" >&2
    return "${status}"
}

run_logged "${LOG_FILTER}" "${BINARY}" filter \
    --output "${OUTPUT}" \
    --force \
    --ingroup "species=${SPECIES}" \
    --outgroup all \
    --min-frac 0.5 \
    --max-frac 1.0 \
    --max-outgroup-count 0 \
    "${SOURCE}"

run_logged "${LOG_SELECT}" "${BINARY}" select \
    --in-place \
    --group "${SPECIES}:species=${SPECIES}" \
    --group-op "${SPECIES}:any" \
    --select "${SPECIES}" \
    "${OUTPUT}"
|
||||||
|
|
||||||
|
# Parse both captured logs with an inline Python script. The script is read
# from the quoted here-document (so the shell expands nothing inside it) and
# its standard output becomes the stats file.
python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
|
||||||
|
import sys, re
|
||||||
|
|
||||||
|
# Positional arguments passed by the shell line above: the species name,
# then the paths of the filter log and of the select log.
species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
|
||||||
|
|
||||||
|
def strip_ansi(s):
    """Return ``s`` with ANSI CSI escape sequences (e.g. colour codes) removed."""
    # ESC '[' followed by parameter bytes (0x30-0x3f), intermediate bytes
    # (0x20-0x2f) and exactly one final byte (0x40-0x7e).
    csi_sequence = re.compile(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]')
    return csi_sequence.sub('', s)
|
||||||
|
|
||||||
|
def parse_wall(s):
    """Convert a wall-clock figure such as ``'250ms'`` or ``'1.5s'`` to seconds.

    Recognised unit suffixes are ``ns``, ``µs`` / ``us``, ``ms`` and ``s``.
    Anything else, including a recognised suffix whose numeric part cannot be
    parsed, yields ``0.0`` so that one odd cell in the log cannot abort the
    stats extraction.

    Args:
        s: Text of the cell; surrounding whitespace is ignored.

    Returns:
        The duration in seconds as a float, or ``0.0`` when unparseable.
    """
    s = s.strip()
    # Multi-character suffixes come first so that 'ms' / 'us' / 'ns' are not
    # mistaken for plain seconds. Both the micro sign (U+00B5) and the Greek
    # letter mu (U+03BC) are accepted for microseconds.
    units = (
        ('ns', 1e9),
        ('\u00b5s', 1e6),
        ('\u03bcs', 1e6),
        ('us', 1e6),
        ('ms', 1000.0),
        ('s', 1.0),
    )
    for suffix, per_second in units:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / per_second
            except ValueError:
                # Non-numeric prefix (e.g. '--s'): treat like an unknown value.
                return 0.0
    return 0.0
|
||||||
|
|
||||||
|
def parse_rss(s):
    """Convert a memory figure such as ``'1.5GB'`` or ``'512 KB'`` to bytes.

    Units are binary multiples (KB = 1024 bytes). Returns ``0`` when the text
    does not start with a number followed by GB, MB, KB or B.
    """
    multipliers = {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}
    found = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if found is None:
        return 0
    amount, unit = found.groups()
    return int(float(amount) * multipliers[unit])
|
||||||
|
|
||||||
|
def is_sep(s):
    """Tell whether ``s`` is a separator line: non-empty, with no ASCII letter or digit."""
    if not s:
        return False
    return re.search(r'[A-Za-z0-9]', s) is None
|
||||||
|
|
||||||
|
def parse_reporter(logfile):
    """Extract per-stage timings from the reporter table found in ``logfile``.

    The table is located by its header line (containing the words "stage"
    and "wall"), followed by a rule, the per-stage rows, a second rule and
    one total row. Returns a dict mapping each stage name (and 'TOTAL') to
    a ``(wall_seconds, rss_bytes)`` tuple; the dict is empty when no table
    is present.
    """
    stats = {}
    with open(logfile, errors='replace') as fh:
        # One shared iterator: each phase below resumes where the previous
        # one stopped, and simply does nothing once the file is exhausted.
        cleaned = (strip_ansi(raw.rstrip('\n')) for raw in fh)

        # Phase 1: skip everything up to and including the header line.
        for line in cleaned:
            if re.search(r'\bstage\b.*\bwall\b', line):
                break

        # Phase 2: skip to the rule drawn under the header.
        for line in cleaned:
            if is_sep(line.strip()):
                break

        # Phase 3: per-stage rows, up to the closing rule.
        for line in cleaned:
            text = line.strip()
            if is_sep(text):
                break
            if not text:
                continue
            cols = re.split(r' +', text)
            if len(cols) >= 4:
                stats[cols[0]] = (parse_wall(cols[1]), parse_rss(cols[3]))

        # Phase 4: the first non-blank line after the closing rule is the total.
        for line in cleaned:
            text = line.strip()
            if not text:
                continue
            cols = re.split(r' +', text)
            if len(cols) >= 3:
                total_rss = parse_rss(cols[3]) if len(cols) > 3 else 0
                stats['TOTAL'] = (parse_wall(cols[1]), total_rss)
            break
    return stats
|
||||||
|
|
||||||
|
filter_stats = parse_reporter(log_filter)
select_stats = parse_reporter(log_select)

# (reporter key, stats table) pairs in CSV column order: filter stages,
# filter total, select stage, select total.
columns = [
    ('rebuild', filter_stats),
    ('pack', filter_stats),
    ('TOTAL', filter_stats),
    ('select', select_stats),
    ('TOTAL', select_stats),
]

fields = [species]
for key, table in columns:
    wall, rss = table.get(key, ('', ''))
    # A stage missing from the table produces two empty cells.
    fields.append(f'{wall:.3f}' if isinstance(wall, float) else '')
    fields.append(str(rss))

print(','.join(fields))
|
||||||
|
PYEOF
|
||||||
Executable
+108
@@ -0,0 +1,108 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: filter_one_presence.sh SPECIES
#
# Filters global_index_presence to keep only kmers specific to SPECIES,
# then selects the SPECIES column in-place.
#
# Outputs:
#   specific_index_presence/SPECIES/index.done   (written by obikmer select)
#   stats/specific_kmer_presence/SPECIES.stats   (one CSV data row, no header)
#
# The stats row holds SPECIES followed by (wall seconds, RSS bytes) pairs for
# the filter stages "rebuild" and "pack", the filter total, the "select"
# stage and the select total.
set -euo pipefail

SPECIES="${1:?Usage: filter_one_presence.sh SPECIES}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

SOURCE="${SCRIPT_DIR}/global_index_presence"
OUTPUT="${SCRIPT_DIR}/specific_index_presence/${SPECIES}"
STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_presence"
STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
# The stats row is written here first and renamed on success, so a failed run
# never leaves an empty or partial STATS_FILE behind for Make to trust.
STATS_TMP="${STATS_FILE}.tmp.$$"

mkdir -p "${STATS_DIR}"

echo "[${SPECIES}] filter (presence) → ${OUTPUT}"

LOG_FILTER=$(mktemp)
LOG_SELECT=$(mktemp)
trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}" "${STATS_TMP}"' EXIT

# Replay a captured stderr log, then exit with the failed command's status.
# Without this, `set -e` would abort on the failing command and the EXIT trap
# would delete the only copy of its error output.
fail_with_log() {
    local status="$1" log="$2"
    cat "${log}" >&2
    exit "${status}"
}

"${BINARY}" filter \
    --output "${OUTPUT}" \
    --force \
    --ingroup "species=${SPECIES}" \
    --outgroup all \
    --min-frac 0.5 \
    --max-frac 1.0 \
    --max-outgroup-count 0 \
    "${SOURCE}" \
    2>"${LOG_FILTER}" || fail_with_log "$?" "${LOG_FILTER}"

cat "${LOG_FILTER}" >&2

"${BINARY}" select \
    --in-place \
    --group "${SPECIES}:species=${SPECIES}" \
    --group-op "${SPECIES}:any" \
    --select "${SPECIES}" \
    "${OUTPUT}" \
    2>"${LOG_SELECT}" || fail_with_log "$?" "${LOG_SELECT}"

cat "${LOG_SELECT}" >&2

python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_TMP}"
import re
import sys

species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]

ANSI_CSI = re.compile(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]')

# (suffix, divisor to seconds). Multi-character suffixes come first so that
# "ms" or "ns" is never mistaken for a bare "s".
WALL_UNITS = (('ms', 1000.0), ('us', 1e6), ('\u00b5s', 1e6), ('\u03bcs', 1e6),
              ('ns', 1e9), ('s', 1.0))
RSS_SHIFTS = {'': 0, 'K': 10, 'M': 20, 'G': 30, 'T': 40}


def strip_ansi(s):
    """Remove ANSI CSI escape sequences from s."""
    return ANSI_CSI.sub('', s)


def parse_wall(s):
    """Convert "1.25s", "340ms", "12µs" or "800ns" to seconds (0.0 if unrecognised)."""
    s = s.strip()
    for suffix, divisor in WALL_UNITS:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / divisor
            except ValueError:
                return 0.0
    return 0.0


def parse_rss(s):
    """Convert "1.5GB" or "12 MiB" to bytes, binary multiples (0 if unrecognised)."""
    m = re.match(r'([\d.]+)\s*([KMGT]?)i?B', s.strip(), re.IGNORECASE)
    if not m:
        return 0
    try:
        return int(float(m.group(1)) * (1 << RSS_SHIFTS[m.group(2).upper()]))
    except ValueError:
        return 0


def is_sep(s):
    """True for a table rule: non-empty and free of ASCII letters and digits."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)


def parse_reporter(logfile):
    """Return {stage: (wall_seconds, rss_bytes)} from the reporter table in logfile.

    Layout expected: header line containing "stage" and "wall", a rule,
    the stage rows, a second rule, then one total row stored as 'TOTAL'.
    """
    stats = {}
    state = 'scan'
    # The log is decoded as UTF-8 so a "µs" suffix survives whatever the locale.
    with open(logfile, encoding='utf-8', errors='replace') as fh:
        for raw in fh:
            line = strip_ansi(raw.rstrip('\n'))
            s = line.strip()
            if state == 'scan':
                if re.search(r'\bstage\b.*\bwall\b', line):
                    state = 'in_header'
            elif state == 'in_header':
                if is_sep(s):
                    state = 'rows'
            elif state == 'rows':
                if is_sep(s):
                    state = 'total'
                elif s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 4:
                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
            elif state == 'total':
                if s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 3:
                        stats['TOTAL'] = (parse_wall(parts[1]),
                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
                    break
    return stats


def cells(stats, key):
    """Two CSV cells (wall, rss) for key; both empty when the stage is absent."""
    wall, rss = stats.get(key, ('', ''))
    return [f'{wall:.3f}' if isinstance(wall, float) else '', str(rss)]


f = parse_reporter(log_filter)
s = parse_reporter(log_select)

row = [species]
for key, table in [('rebuild', f), ('pack', f), ('TOTAL', f), ('select', s), ('TOTAL', s)]:
    row += cells(table, key)
print(','.join(row))
PYEOF

mv "${STATS_TMP}" "${STATS_FILE}"
|
||||||
Executable
+103
@@ -0,0 +1,103 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: index_one_count.sh SPECIMEN
#   SPECIMEN = "species--strain" (Make pattern stem)
#
# Indexes the simulated paired-end reads of one specimen with kmer counts.
#
# Outputs:
#   specimen_index_count/SPECIMEN/index.done   (written by obikmer)
#   stats/indexing_count/SPECIMEN.stats        (one CSV data row, no header)
#
# The stats row holds species, strain, then (wall seconds, RSS bytes) pairs
# for the stages scatter, dereplicate, count_kmer, index and the total.
set -euo pipefail

SPECIMEN="${1:?Usage: index_one_count.sh SPECIMEN (species--strain)}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

if [[ "${SPECIMEN}" != *--* ]]; then
    echo "ERROR: SPECIMEN must look like species--strain, got '${SPECIMEN}'" >&2
    exit 1
fi

# Split on the first "--": everything before is the species, the rest the strain.
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_count"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
# The stats row is written here first and renamed on success, so a failed run
# never leaves an empty or partial STATS_FILE behind for Make to trust.
STATS_TMP="${STATS_FILE}.tmp.$$"

mkdir -p "${STATS_DIR}"

r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
    echo "ERROR: reads not found in ${READS_DIR}" >&2
    exit 1
fi

echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}" "${STATS_TMP}"' EXIT

# On failure, replay the captured stderr before exiting; otherwise `set -e`
# would abort here and the EXIT trap would delete the only copy of the error.
"${BINARY}" index \
    --output "${INDEX_PATH}" \
    --force \
    --theta 0 \
    --with-counts \
    --label "${SPECIMEN}" \
    --meta "species=${species}" \
    "${r1}" "${r2}" \
    2>"${STDERR_LOG}" || {
        status=$?
        cat "${STDERR_LOG}" >&2
        exit "${status}"
    }

cat "${STDERR_LOG}" >&2

python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_TMP}"
import re
import sys

species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']

ANSI_CSI = re.compile(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]')

# (suffix, divisor to seconds). Multi-character suffixes come first so that
# "ms" or "ns" is never mistaken for a bare "s".
WALL_UNITS = (('ms', 1000.0), ('us', 1e6), ('\u00b5s', 1e6), ('\u03bcs', 1e6),
              ('ns', 1e9), ('s', 1.0))
RSS_SHIFTS = {'': 0, 'K': 10, 'M': 20, 'G': 30, 'T': 40}


def strip_ansi(s):
    """Remove ANSI CSI escape sequences from s."""
    return ANSI_CSI.sub('', s)


def parse_wall(s):
    """Convert "1.25s", "340ms", "12µs" or "800ns" to seconds (0.0 if unrecognised)."""
    s = s.strip()
    for suffix, divisor in WALL_UNITS:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / divisor
            except ValueError:
                return 0.0
    return 0.0


def parse_rss(s):
    """Convert "1.5GB" or "12 MiB" to bytes, binary multiples (0 if unrecognised)."""
    m = re.match(r'([\d.]+)\s*([KMGT]?)i?B', s.strip(), re.IGNORECASE)
    if not m:
        return 0
    try:
        return int(float(m.group(1)) * (1 << RSS_SHIFTS[m.group(2).upper()]))
    except ValueError:
        return 0


def is_sep(s):
    """True for a table rule: non-empty and free of ASCII letters and digits."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)


def parse_reporter(path):
    """Return {stage: (wall_seconds, rss_bytes)} from the reporter table in path.

    Layout expected: header line containing "stage" and "wall", a rule,
    the stage rows, a second rule, then one total row. The total is stored
    under the fixed key 'TOTAL' whatever label the reporter prints.
    """
    stats = {}
    state = 'scan'
    # The log is decoded as UTF-8 so a "µs" suffix survives whatever the locale.
    with open(path, encoding='utf-8', errors='replace') as fh:
        for raw in fh:
            line = strip_ansi(raw.rstrip('\n'))
            s = line.strip()
            if state == 'scan':
                if re.search(r'\bstage\b.*\bwall\b', line):
                    state = 'in_header'
            elif state == 'in_header':
                if is_sep(s):
                    state = 'rows'
            elif state == 'rows':
                if is_sep(s):
                    state = 'total'
                elif s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 4:
                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
            elif state == 'total':
                if s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 3:
                        stats['TOTAL'] = (parse_wall(parts[1]),
                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
                    break
    return stats


def cells(stats, key):
    """Two CSV cells (wall, rss) for key; both empty when the stage is absent."""
    wall, rss = stats.get(key, ('', ''))
    return [f'{wall:.3f}' if isinstance(wall, float) else '', str(rss)]


stats = parse_reporter(logfile)
row = [species, strain]
for stage in STAGE_ORDER:
    row += cells(stats, stage)
row += cells(stats, 'TOTAL')
print(','.join(row))
PYEOF

mv "${STATS_TMP}" "${STATS_FILE}"
|
||||||
Executable
+102
@@ -0,0 +1,102 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: index_one_presence.sh SPECIMEN
#   SPECIMEN = "species--strain" (Make pattern stem)
#
# Indexes the simulated paired-end reads of one specimen (no --with-counts).
#
# Outputs:
#   specimen_index_presence/SPECIMEN/index.done   (written by obikmer)
#   stats/indexing_presence/SPECIMEN.stats        (one CSV data row, no header)
#
# The stats row holds species, strain, then (wall seconds, RSS bytes) pairs
# for the stages scatter, dereplicate, count_kmer, index and the total.
set -euo pipefail

SPECIMEN="${1:?Usage: index_one_presence.sh SPECIMEN (species--strain)}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

if [[ "${SPECIMEN}" != *--* ]]; then
    echo "ERROR: SPECIMEN must look like species--strain, got '${SPECIMEN}'" >&2
    exit 1
fi

# Split on the first "--": everything before is the species, the rest the strain.
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
# The stats row is written here first and renamed on success, so a failed run
# never leaves an empty or partial STATS_FILE behind for Make to trust.
STATS_TMP="${STATS_FILE}.tmp.$$"

mkdir -p "${STATS_DIR}"

r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
    echo "ERROR: reads not found in ${READS_DIR}" >&2
    exit 1
fi

echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}" "${STATS_TMP}"' EXIT

# On failure, replay the captured stderr before exiting; otherwise `set -e`
# would abort here and the EXIT trap would delete the only copy of the error.
"${BINARY}" index \
    --output "${INDEX_PATH}" \
    --force \
    --theta 0 \
    --label "${SPECIMEN}" \
    --meta "species=${species}" \
    "${r1}" "${r2}" \
    2>"${STDERR_LOG}" || {
        status=$?
        cat "${STDERR_LOG}" >&2
        exit "${status}"
    }

cat "${STDERR_LOG}" >&2

python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_TMP}"
import re
import sys

species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']

ANSI_CSI = re.compile(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]')

# (suffix, divisor to seconds). Multi-character suffixes come first so that
# "ms" or "ns" is never mistaken for a bare "s".
WALL_UNITS = (('ms', 1000.0), ('us', 1e6), ('\u00b5s', 1e6), ('\u03bcs', 1e6),
              ('ns', 1e9), ('s', 1.0))
RSS_SHIFTS = {'': 0, 'K': 10, 'M': 20, 'G': 30, 'T': 40}


def strip_ansi(s):
    """Remove ANSI CSI escape sequences from s."""
    return ANSI_CSI.sub('', s)


def parse_wall(s):
    """Convert "1.25s", "340ms", "12µs" or "800ns" to seconds (0.0 if unrecognised)."""
    s = s.strip()
    for suffix, divisor in WALL_UNITS:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / divisor
            except ValueError:
                return 0.0
    return 0.0


def parse_rss(s):
    """Convert "1.5GB" or "12 MiB" to bytes, binary multiples (0 if unrecognised)."""
    m = re.match(r'([\d.]+)\s*([KMGT]?)i?B', s.strip(), re.IGNORECASE)
    if not m:
        return 0
    try:
        return int(float(m.group(1)) * (1 << RSS_SHIFTS[m.group(2).upper()]))
    except ValueError:
        return 0


def is_sep(s):
    """True for a table rule: non-empty and free of ASCII letters and digits."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)


def parse_reporter(path):
    """Return {stage: (wall_seconds, rss_bytes)} from the reporter table in path.

    Layout expected: header line containing "stage" and "wall", a rule,
    the stage rows, a second rule, then one total row. The total is stored
    under the fixed key 'TOTAL' whatever label the reporter prints.
    """
    stats = {}
    state = 'scan'
    # The log is decoded as UTF-8 so a "µs" suffix survives whatever the locale.
    with open(path, encoding='utf-8', errors='replace') as fh:
        for raw in fh:
            line = strip_ansi(raw.rstrip('\n'))
            s = line.strip()
            if state == 'scan':
                if re.search(r'\bstage\b.*\bwall\b', line):
                    state = 'in_header'
            elif state == 'in_header':
                if is_sep(s):
                    state = 'rows'
            elif state == 'rows':
                if is_sep(s):
                    state = 'total'
                elif s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 4:
                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
            elif state == 'total':
                if s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 3:
                        stats['TOTAL'] = (parse_wall(parts[1]),
                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
                    break
    return stats


def cells(stats, key):
    """Two CSV cells (wall, rss) for key; both empty when the stage is absent."""
    wall, rss = stats.get(key, ('', ''))
    return [f'{wall:.3f}' if isinstance(wall, float) else '', str(rss)]


stats = parse_reporter(logfile)
row = [species, strain]
for stage in STAGE_ORDER:
    row += cells(stats, stage)
row += cells(stats, 'TOTAL')
print(','.join(row))
PYEOF

mv "${STATS_TMP}" "${STATS_FILE}"
|
||||||
@@ -0,0 +1,118 @@
|
|||||||
|
#!/usr/bin/env python3
"""Generate deps.mk — pure dependency declarations for the benchmark pipeline.

Like C .d files: only target: prerequisites lines, no recipes.
Recipes stay in the Makefile as generic rules.

Usage:
    make_deps.py GENOME_genomic.fna.gz [...]   print the dependency file
    make_deps.py --dir-for GENOME.fna.gz [...] print, one per line, the
                                               simulated-reads directory of
                                               each genome (used by
                                               simulate_all.sh)
"""
import gzip
import re
import sys
from pathlib import Path

# Tokens that end a strain name when scanning a sequence definition.
STOP_WORDS = {'complete', 'chromosome', 'whole', 'sequence', 'genome',
              'endosymbiont', 'of'}
STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')


def is_stop(tok):
    """Tell whether ``tok`` (case-insensitive) terminates a strain name."""
    t = tok.lower()
    return t in STOP_WORDS or any(t.startswith(p) for p in STOP_PREFIXES)


def sanitize(s):
    """Replace every character outside ``[A-Za-z0-9._-]`` by ``_`` and trim
    leading/trailing underscores, so the result is safe in Make targets and
    file names."""
    return re.sub(r'[^A-Za-z0-9._-]', '_', s).strip('_')


def collect_tokens(text):
    """Join the sanitized whitespace-separated tokens of ``text`` with ``_``,
    stopping at the first stop word. Trailing commas and periods are dropped
    from each token first."""
    parts = []
    for tok in text.split():
        tok = tok.rstrip(',.')
        if is_stop(tok):
            break
        parts.append(sanitize(tok))
    return '_'.join(filter(None, parts))


def parse_organism(defn, gcf_id):
    """Derive ``(species, strain)`` identifiers from a sequence definition.

    The species is built from the first two words. The strain comes, in
    order of preference, from a ``str. X [substr. Y]`` clause, from the words
    following ``strain``, or from what remains after the species (and an
    optional leading ``subsp.``/``serovar`` clause). ``gcf_id`` is the
    fallback whenever no strain can be extracted, and also stands in for the
    species when the definition is empty.

    A definition of fewer than two words is accepted: ``first_definition``
    can return a bare accession or a file stem.
    """
    words = defn.split()
    species = sanitize('_'.join(words[:2])) or sanitize(gcf_id)

    m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
    if m:
        strain = sanitize(m.group(1))
        if m.group(2):
            strain += '_' + sanitize(m.group(2))
        return species, strain

    m = re.search(r'\bstrain\b\s+(.*)', defn)
    if m:
        strain = collect_tokens(m.group(1))
        if strain:
            return species, strain

    # Everything after the two species words, whatever the spacing.
    remainder = ' '.join(words[2:])
    remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
    remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
    strain = collect_tokens(remainder)
    return species, strain if strain else gcf_id


def first_definition(path):
    """Return the organism definition of the first FASTA record in ``path``.

    ``path`` is a gzip-compressed FASTA file. The value of a
    ``"definition":"..."`` annotation on the first header line is preferred;
    otherwise the first word of that header is returned. When the file has
    no header line, or the header is empty, the file stem is returned.
    """
    with gzip.open(path, 'rt') as fh:
        for line in fh:
            if line.startswith('>'):
                m = re.search(r'"definition":"([^"]*)"', line)
                if m:
                    return m.group(1)
                tokens = line[1:].split()
                if tokens:
                    return tokens[0]
                break  # bare ">" header: fall back to the file stem
    return Path(path).stem


def _identify(path):
    """Return ``(species, strain)`` for the genome file at ``path``."""
    gcf_id = Path(path).name.replace('_genomic.fna.gz', '')
    return parse_organism(first_definition(path), gcf_id)


def main(argv=None):
    """Entry point; ``argv`` defaults to ``sys.argv[1:]``.

    With ``--dir-for`` as first argument, print the simulated-data directory
    of each following genome. Otherwise print the SPECIMENS/SPECIES variables
    and the dependency lines for all genomes given, in sorted path order.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    if args and args[0] == '--dir-for':
        genomes = args[1:]
        if not genomes:
            sys.exit('usage: make_deps.py --dir-for GENOME.fna.gz [...]')
        for path in genomes:
            sp, st = _identify(path)
            print(f'simulated_data/{sp}/{st}')
        return

    entries = []          # (specimen, species, sim_dir, genome_path)
    species_seen = []     # first-seen order, which fixes the SPECIES order
    specimens_seen = set()

    for path in sorted(args):
        sp, st = _identify(path)
        specimen = f'{sp}--{st}'
        if specimen in specimens_seen:
            # Two genomes sharing a specimen name would silently share targets.
            print(f'warning: duplicate specimen {specimen} ({path})',
                  file=sys.stderr)
        specimens_seen.add(specimen)
        sim_dir = f'simulated_data/{sp}/{st}'
        entries.append((specimen, sp, sim_dir, path))
        if sp not in species_seen:
            species_seen.append(sp)

    specimens = [e[0] for e in entries]
    print('SPECIMENS :=', ' '.join(specimens))
    print('SPECIES   :=', ' '.join(species_seen))

    for specimen, species, sim_dir, genome in entries:
        reads = f'{sim_dir}/reads_R1.fastq.gz'
        p_done = f'specimen_index_presence/{specimen}/index.done'
        p_stats = f'stats/indexing_presence/{specimen}.stats'
        c_done = f'specimen_index_count/{specimen}/index.done'
        c_stats = f'stats/indexing_count/{specimen}.stats'
        ref = f'reference_index/{specimen}.npz'
        vp = f'stats/verify_presence/{specimen}.stats'
        vc = f'stats/verify_count/{specimen}.stats'

        print()
        print(f'# {specimen}')
        print(f'{reads}: {genome}')
        print(f'{ref}: {reads}')
        print(f'{p_done} {p_stats}: {reads}')
        print(f'{c_done} {c_stats}: {reads}')
        print(f'{vp}: {ref} {p_done}')
        print(f'{vc}: {ref} {c_done}')

    print()
    for sp in species_seen:
        sp_done = f'specific_index_presence/{sp}/index.done'
        sp_stats = f'stats/specific_kmer_presence/{sp}.stats'
        sc_done = f'specific_index_count/{sp}/index.done'
        sc_stats = f'stats/specific_kmer_count/{sp}.stats'
        print(f'# {sp}')
        print(f'{sp_done} {sp_stats}: global_index_presence/index.done')
        print(f'{sc_done} {sc_stats}: global_index_count/index.done')


if __name__ == '__main__':
    main()
|
||||||
Executable
+103
@@ -0,0 +1,103 @@
|
|||||||
|
#!/usr/bin/env bash
# Merge every per-specimen count index found in specimen_index_count/ into
# global_index_count, and record the merge timings.
#
# Outputs:
#   global_index_count/              (written by obikmer merge)
#   stats/merge_count/run_NNN.csv    (header + one data row for this run)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_count"
OUTPUT="${SCRIPT_DIR}/global_index_count"
STATS_DIR="${SCRIPT_DIR}/stats/merge_count"

mkdir -p "${STATS_DIR}"

# parse_reporter RUN N_SOURCES LOGFILE
# Print one CSV row (RUN, N_SOURCES, then wall seconds and RSS bytes for the
# stages bootstrap, spectrums, merge_partitions, pack and the total) extracted
# from the reporter table in LOGFILE.
parse_reporter() {
    local run="$1" n_sources="$2" logfile="$3"
    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import re
import sys

run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']

ANSI_CSI = re.compile(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]')

# (suffix, divisor to seconds). Multi-character suffixes come first so that
# "ms" or "ns" is never mistaken for a bare "s".
WALL_UNITS = (('ms', 1000.0), ('us', 1e6), ('\u00b5s', 1e6), ('\u03bcs', 1e6),
              ('ns', 1e9), ('s', 1.0))
RSS_SHIFTS = {'': 0, 'K': 10, 'M': 20, 'G': 30, 'T': 40}


def strip_ansi(s):
    """Remove ANSI CSI escape sequences from s."""
    return ANSI_CSI.sub('', s)


def parse_wall(s):
    """Convert "1.25s", "340ms", "12µs" or "800ns" to seconds (0.0 if unrecognised)."""
    s = s.strip()
    for suffix, divisor in WALL_UNITS:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / divisor
            except ValueError:
                return 0.0
    return 0.0


def parse_rss(s):
    """Convert "1.5GB" or "12 MiB" to bytes, binary multiples (0 if unrecognised)."""
    m = re.match(r'([\d.]+)\s*([KMGT]?)i?B', s.strip(), re.IGNORECASE)
    if not m:
        return 0
    try:
        return int(float(m.group(1)) * (1 << RSS_SHIFTS[m.group(2).upper()]))
    except ValueError:
        return 0


def is_sep(s):
    """True for a table rule: non-empty and free of ASCII letters and digits."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)


def read_table(path):
    """Return {stage: (wall_seconds, rss_bytes)} from the reporter table in path.

    Layout expected: header line containing "stage" and "wall", a rule,
    the stage rows, a second rule, then one total row. The total is stored
    under the fixed key 'TOTAL' whatever label the reporter prints.
    """
    stats = {}
    state = 'scan'
    # The log is decoded as UTF-8 so a "µs" suffix survives whatever the locale.
    with open(path, encoding='utf-8', errors='replace') as fh:
        for raw in fh:
            line = strip_ansi(raw.rstrip('\n'))
            s = line.strip()
            if state == 'scan':
                if re.search(r'\bstage\b.*\bwall\b', line):
                    state = 'in_header'
            elif state == 'in_header':
                if is_sep(s):
                    state = 'rows'
            elif state == 'rows':
                if is_sep(s):
                    state = 'total'
                elif s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 4:
                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
            elif state == 'total':
                if s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 3:
                        stats['TOTAL'] = (parse_wall(parts[1]),
                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
                    break
    return stats


def cells(stats, key):
    """Two CSV cells (wall, rss) for key; both empty when the stage is absent."""
    wall, rss = stats.get(key, ('', ''))
    return [f'{wall:.3f}' if isinstance(wall, float) else '', str(rss)]


stats = read_table(logfile)
row = [run, n_sources]
for stage in STAGE_ORDER:
    row += cells(stats, stage)
row += cells(stats, 'TOTAL')
print(','.join(row))
PYEOF
}

# Collect the source indexes with a read loop rather than `mapfile`, which
# bash 3.2 (the stock /bin/bash on macOS) does not provide.
sources=()
while IFS= read -r dir; do
    sources+=("${dir}")
done < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)

if [[ ${#sources[@]} -eq 0 ]]; then
    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
    exit 1
fi

# Next run number: start from the number of existing run files and skip any
# number already taken, so a deleted earlier run can never cause an existing
# CSV to be overwritten.
n=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')
run_n=$(printf '%03d' "${n}")
while [[ -e "${STATS_DIR}/run_${run_n}.csv" ]]; do
    n=$((n + 1))
    run_n=$(printf '%03d' "${n}")
done
CSV="${STATS_DIR}/run_${run_n}.csv"

echo "Merging ${#sources[@]} count indexes → ${OUTPUT}"
printf '  %s\n' "${sources[@]}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# On failure, replay the captured stderr before exiting; otherwise `set -e`
# would abort here and the EXIT trap would delete the only copy of the error.
"${BINARY}" merge \
    --output "${OUTPUT}" \
    --force \
    "${sources[@]}" \
    2>"${STDERR_LOG}" || {
        status=$?
        cat "${STDERR_LOG}" >&2
        exit "${status}"
    }

cat "${STDERR_LOG}" >&2

# Build the data row first and write header + row together, so a failed run
# leaves no header-only CSV behind.
row=$(parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}")
{
    printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n'
    printf '%s\n' "${row}"
} >"${CSV}"

echo "Done. Run ${run_n} → ${CSV}"
|
||||||
Executable
+104
@@ -0,0 +1,104 @@
|
|||||||
|
#!/usr/bin/env bash
# Merge every per-specimen presence index found in specimen_index_presence/
# into global_index_presence (obikmer merge --force-presence), and record the
# merge timings.
#
# Outputs:
#   global_index_presence/              (written by obikmer merge)
#   stats/merge_presence/run_NNN.csv    (header + one data row for this run)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_presence"
OUTPUT="${SCRIPT_DIR}/global_index_presence"
STATS_DIR="${SCRIPT_DIR}/stats/merge_presence"

mkdir -p "${STATS_DIR}"

# parse_reporter RUN N_SOURCES LOGFILE
# Print one CSV row (RUN, N_SOURCES, then wall seconds and RSS bytes for the
# stages bootstrap, spectrums, merge_partitions, pack and the total) extracted
# from the reporter table in LOGFILE.
parse_reporter() {
    local run="$1" n_sources="$2" logfile="$3"
    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import re
import sys

run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']

ANSI_CSI = re.compile(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]')

# (suffix, divisor to seconds). Multi-character suffixes come first so that
# "ms" or "ns" is never mistaken for a bare "s".
WALL_UNITS = (('ms', 1000.0), ('us', 1e6), ('\u00b5s', 1e6), ('\u03bcs', 1e6),
              ('ns', 1e9), ('s', 1.0))
RSS_SHIFTS = {'': 0, 'K': 10, 'M': 20, 'G': 30, 'T': 40}


def strip_ansi(s):
    """Remove ANSI CSI escape sequences from s."""
    return ANSI_CSI.sub('', s)


def parse_wall(s):
    """Convert "1.25s", "340ms", "12µs" or "800ns" to seconds (0.0 if unrecognised)."""
    s = s.strip()
    for suffix, divisor in WALL_UNITS:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / divisor
            except ValueError:
                return 0.0
    return 0.0


def parse_rss(s):
    """Convert "1.5GB" or "12 MiB" to bytes, binary multiples (0 if unrecognised)."""
    m = re.match(r'([\d.]+)\s*([KMGT]?)i?B', s.strip(), re.IGNORECASE)
    if not m:
        return 0
    try:
        return int(float(m.group(1)) * (1 << RSS_SHIFTS[m.group(2).upper()]))
    except ValueError:
        return 0


def is_sep(s):
    """True for a table rule: non-empty and free of ASCII letters and digits."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)


def read_table(path):
    """Return {stage: (wall_seconds, rss_bytes)} from the reporter table in path.

    Layout expected: header line containing "stage" and "wall", a rule,
    the stage rows, a second rule, then one total row. The total is stored
    under the fixed key 'TOTAL' whatever label the reporter prints.
    """
    stats = {}
    state = 'scan'
    # The log is decoded as UTF-8 so a "µs" suffix survives whatever the locale.
    with open(path, encoding='utf-8', errors='replace') as fh:
        for raw in fh:
            line = strip_ansi(raw.rstrip('\n'))
            s = line.strip()
            if state == 'scan':
                if re.search(r'\bstage\b.*\bwall\b', line):
                    state = 'in_header'
            elif state == 'in_header':
                if is_sep(s):
                    state = 'rows'
            elif state == 'rows':
                if is_sep(s):
                    state = 'total'
                elif s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 4:
                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
            elif state == 'total':
                if s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 3:
                        stats['TOTAL'] = (parse_wall(parts[1]),
                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
                    break
    return stats


def cells(stats, key):
    """Two CSV cells (wall, rss) for key; both empty when the stage is absent."""
    wall, rss = stats.get(key, ('', ''))
    return [f'{wall:.3f}' if isinstance(wall, float) else '', str(rss)]


stats = read_table(logfile)
row = [run, n_sources]
for stage in STAGE_ORDER:
    row += cells(stats, stage)
row += cells(stats, 'TOTAL')
print(','.join(row))
PYEOF
}

# Collect the source indexes with a read loop rather than `mapfile`, which
# bash 3.2 (the stock /bin/bash on macOS) does not provide.
sources=()
while IFS= read -r dir; do
    sources+=("${dir}")
done < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)

if [[ ${#sources[@]} -eq 0 ]]; then
    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
    exit 1
fi

# Next run number: start from the number of existing run files and skip any
# number already taken, so a deleted earlier run can never cause an existing
# CSV to be overwritten.
n=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')
run_n=$(printf '%03d' "${n}")
while [[ -e "${STATS_DIR}/run_${run_n}.csv" ]]; do
    n=$((n + 1))
    run_n=$(printf '%03d' "${n}")
done
CSV="${STATS_DIR}/run_${run_n}.csv"

echo "Merging ${#sources[@]} presence indexes → ${OUTPUT}"
printf '  %s\n' "${sources[@]}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# On failure, replay the captured stderr before exiting; otherwise `set -e`
# would abort here and the EXIT trap would delete the only copy of the error.
"${BINARY}" merge \
    --output "${OUTPUT}" \
    --force \
    --force-presence \
    "${sources[@]}" \
    2>"${STDERR_LOG}" || {
        status=$?
        cat "${STDERR_LOG}" >&2
        exit "${status}"
    }

cat "${STDERR_LOG}" >&2

# Build the data row first and write header + row together, so a failed run
# leaves no header-only CSV behind.
row=$(parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}")
{
    printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n'
    printf '%s\n' "${row}"
} >"${CSV}"

echo "Done. Run ${run_n} → ${CSV}"
|
||||||
Executable
+12
@@ -0,0 +1,12 @@
|
|||||||
|
#!/usr/bin/env bash
# Simulate all genomes. Delegates to simulate_one.sh per genome.
# Prefer running via `gmake simulate` which handles individual dependencies.
#
# The output directory of each genome is obtained from
# `make_deps.py --dir-for GENOME`.
# NOTE(review): this requires make_deps.py to accept --dir-for — confirm.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"

# With nullglob an unmatched pattern expands to nothing, instead of being
# handed on as a literal "*.fna.gz" path.
shopt -s nullglob
genomes=("${SCRIPT_DIR}"/genomes/*.fna.gz)
shopt -u nullglob

if [[ ${#genomes[@]} -eq 0 ]]; then
    echo "ERROR: no genomes found in ${SCRIPT_DIR}/genomes" >&2
    exit 1
fi

for genome_file in "${genomes[@]}"; do
    out_dir=$("${PYTHON}" "${SCRIPT_DIR}/make_deps.py" --dir-for "${genome_file}")
    if [[ -z "${out_dir}" ]]; then
        echo "ERROR: no output directory computed for ${genome_file}" >&2
        exit 1
    fi
    # Anchor a relative directory to the script directory, where the indexing
    # scripts look for simulated_data/, whatever the current directory is.
    if [[ "${out_dir}" != /* ]]; then
        out_dir="${SCRIPT_DIR}/${out_dir}"
    fi
    bash "${SCRIPT_DIR}/simulate_one.sh" "${genome_file}" "${out_dir}"
done
|
||||||
@@ -0,0 +1,33 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: simulate_one.sh genome.fna.gz output_dir
# Simulates paired-end HiSeq reads for a single genome.
#
# The read count is chosen so that COVERAGE x genome size is reached with
# pairs of READ_LENGTH-bp reads. Output is written by `iss generate` with the
# prefix output_dir/reads. Set CPUS to override the detected CPU count.
set -euo pipefail

genome_file="${1:?Usage: simulate_one.sh genome.fna.gz output_dir}"
out_dir="${2:?Usage: simulate_one.sh genome.fna.gz output_dir}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ISS="${SCRIPT_DIR}/../.venv/bin/iss"
COVERAGE=15
READ_LENGTH=150
# macOS exposes the CPU count through sysctl, Linux through nproc.
CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"

if [[ ! -f "${genome_file}" ]]; then
    echo "ERROR: genome not found: ${genome_file}" >&2
    exit 1
fi

mkdir -p "${out_dir}"

# A private temporary directory is used rather than a file template with a
# suffix: BSD/macOS mktemp only substitutes X's at the very end of the
# template, so "obikmer_XXXXXX.fna" is not a portable unique name.
tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/obikmer_XXXXXX")
trap 'rm -rf "${tmp_dir}"' EXIT
tmp_fasta="${tmp_dir}/genome.fna"

gzip -dc "${genome_file}" > "${tmp_fasta}"

# Count sequence characters only. grep exits 1 when nothing is left after
# dropping header lines, which must not trip pipefail before the explicit
# check below.
genome_size=$({ grep -v "^>" "${tmp_fasta}" || true; } | tr -d '[:space:]' | wc -c | tr -d ' ')
if [[ "${genome_size}" -eq 0 ]]; then
    echo "ERROR: no sequence data in ${genome_file}" >&2
    exit 1
fi

# ceil(COVERAGE * genome_size / (2 * READ_LENGTH)) in integer arithmetic.
n_reads=$(( (COVERAGE * genome_size + 2 * READ_LENGTH - 1) / (2 * READ_LENGTH) ))

echo "[${out_dir}] genome=${genome_size} bp → ${n_reads} read pairs (${COVERAGE}x HiSeq)"

"${ISS}" generate \
    --genomes "${tmp_fasta}" \
    --model HiSeq \
    --n_reads "${n_reads}" \
    --cpus "${CPUS}" \
    --compress \
    --output "${out_dir}/reads"
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
genome,Candidozyma_auris--GCF_003013715.1_ASM301371v2,Acidobacterium_capsulatum--ATCC_51196,Bacillus_subtilis--168,Escherichia_coli--CFT073,Escherichia_coli--EDL933,Escherichia_coli--K-12_MG1655,Escherichia_coli--K-12_W3110,Klebsiella_pneumoniae--ATCC_13883,Klebsiella_pneumoniae--HS11286,Klebsiella_pneumoniae--MGH_78578,Opitutus_terrae--PB90-1,Proteus_mirabilis--HI4320,Saccharolobus_islandicus--M.16.4,Salmonella_enterica--AKU_12601,Salmonella_enterica--CT18,Salmonella_enterica--LT2,Salmonella_enterica--P125109,Shouchella_clausii--KSM-K16,Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1,Yersinia_ruckeri--YRB
|
||||||
|
Candidozyma_auris--GCF_003013715.1_ASM301371v2,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
|
||||||
|
Acidobacterium_capsulatum--ATCC_51196,1.000000,0.000000,0.999981,0.999990,0.999989,0.999987,0.999987,0.999990,0.999988,0.999988,0.999994,0.999989,1.000000,0.999988,0.999987,0.999987,0.999988,0.999989,0.999991,0.999987
|
||||||
|
Bacillus_subtilis--168,1.000000,0.999981,0.000000,0.999990,0.999989,0.999989,0.999989,0.999989,0.999988,0.999986,0.999995,0.999985,0.999999,0.999988,0.999987,0.999989,0.999988,0.999778,0.999993,0.999987
|
||||||
|
Escherichia_coli--CFT073,1.000000,0.999990,0.999990,0.000000,0.825741,0.807495,0.807218,0.991156,0.996855,0.997849,0.999996,0.999633,1.000000,0.993885,0.996736,0.994148,0.993821,0.999991,0.999984,0.999291
|
||||||
|
Escherichia_coli--EDL933,1.000000,0.999989,0.999989,0.825741,0.000000,0.735107,0.734775,0.996126,0.998058,0.997908,0.999997,0.999640,1.000000,0.993993,0.997126,0.994390,0.994059,0.999991,0.999986,0.999292
|
||||||
|
Escherichia_coli--K-12_MG1655,1.000000,0.999987,0.999989,0.807495,0.735107,0.000000,0.382567,0.996190,0.997747,0.997455,0.999996,0.999604,1.000000,0.993444,0.996645,0.993773,0.993431,0.999989,0.999984,0.999174
|
||||||
|
Escherichia_coli--K-12_W3110,1.000000,0.999987,0.999989,0.807218,0.734775,0.382567,0.000000,0.996220,0.997761,0.997467,0.999995,0.999604,1.000000,0.993445,0.996669,0.993769,0.993443,0.999990,0.999985,0.999165
|
||||||
|
Klebsiella_pneumoniae--ATCC_13883,1.000000,0.999990,0.999989,0.991156,0.996126,0.996190,0.996220,0.000000,0.845220,0.840545,0.999997,0.999648,1.000000,0.996177,0.998128,0.996268,0.996052,0.999990,0.999987,0.999325
|
||||||
|
Klebsiella_pneumoniae--HS11286,1.000000,0.999988,0.999988,0.996855,0.998058,0.997747,0.997761,0.845220,0.000000,0.906475,0.999996,0.999683,1.000000,0.997724,0.995697,0.997776,0.997769,0.999989,0.999979,0.999463
|
||||||
|
Klebsiella_pneumoniae--MGH_78578,1.000000,0.999988,0.999986,0.997849,0.997908,0.997455,0.997467,0.840545,0.906475,0.000000,0.999996,0.999704,1.000000,0.997928,0.995054,0.997844,0.997868,0.999990,0.999980,0.999479
|
||||||
|
Opitutus_terrae--PB90-1,1.000000,0.999994,0.999995,0.999996,0.999997,0.999996,0.999995,0.999997,0.999996,0.999996,0.000000,0.999997,0.999998,0.999996,0.999996,0.999996,0.999995,0.999997,0.999993,0.999996
|
||||||
|
Proteus_mirabilis--HI4320,1.000000,0.999989,0.999985,0.999633,0.999640,0.999604,0.999604,0.999648,0.999683,0.999704,0.999997,0.000000,1.000000,0.999604,0.999699,0.999622,0.999613,0.999987,0.999983,0.999505
|
||||||
|
Saccharolobus_islandicus--M.16.4,1.000000,1.000000,0.999999,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.999998,1.000000,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
|
||||||
|
Salmonella_enterica--AKU_12601,1.000000,0.999988,0.999988,0.993885,0.993993,0.993444,0.993445,0.996177,0.997724,0.997928,0.999996,0.999604,1.000000,0.000000,0.869238,0.682277,0.663383,0.999990,0.999985,0.999260
|
||||||
|
Salmonella_enterica--CT18,1.000000,0.999987,0.999987,0.996736,0.997126,0.996645,0.996669,0.998128,0.995697,0.995054,0.999996,0.999699,1.000000,0.869238,0.000000,0.890872,0.886148,0.999988,0.999976,0.999524
|
||||||
|
Salmonella_enterica--LT2,1.000000,0.999987,0.999989,0.994148,0.994390,0.993773,0.993769,0.996268,0.997776,0.997844,0.999996,0.999622,1.000000,0.682277,0.890872,0.000000,0.622606,0.999989,0.999985,0.999296
|
||||||
|
Salmonella_enterica--P125109,1.000000,0.999988,0.999988,0.993821,0.994059,0.993431,0.993443,0.996052,0.997769,0.997868,0.999995,0.999613,1.000000,0.663383,0.886148,0.622606,0.000000,0.999988,0.999983,0.999270
|
||||||
|
Shouchella_clausii--KSM-K16,1.000000,0.999989,0.999778,0.999991,0.999991,0.999989,0.999990,0.999990,0.999989,0.999990,0.999997,0.999987,1.000000,0.999990,0.999988,0.999989,0.999988,0.000000,0.999991,0.999988
|
||||||
|
Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1,1.000000,0.999991,0.999993,0.999984,0.999986,0.999984,0.999985,0.999987,0.999979,0.999980,0.999993,0.999983,1.000000,0.999985,0.999976,0.999985,0.999983,0.999991,0.000000,0.999983
|
||||||
|
Yersinia_ruckeri--YRB,1.000000,0.999987,0.999987,0.999291,0.999292,0.999174,0.999165,0.999325,0.999463,0.999479,0.999996,0.999505,1.000000,0.999260,0.999524,0.999296,0.999270,0.999988,0.999983,0.000000
|
||||||
|
@@ -0,0 +1 @@
|
|||||||
|
(((((((((((Candidozyma_auris--GCF_003013715.1_ASM301371v2:0.5000001881725941,Saccharolobus_islandicus--M.16.4:0.4999993211600824):0.0000023411501775538747,Opitutus_terrae--PB90-1:0.499997075187947):0.0000029791191795691675,(Acidobacterium_capsulatum--ATCC_51196:0.49999227771334689,(Bacillus_subtilis--168:0.49988797935621456,Shouchella_clausii--KSM-K16:0.49988984146059159):0.0001037210285571577):0.0000023959836053522034):0.0000034093646568700288,Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1:0.4999920159222422):0.000199555100890203,Proteus_mirabilis--HI4320:0.49979129185300427):0.00010103619067070024,Yersinia_ruckeri--YRB:0.4996806650749249):0.0013719139155004,(Klebsiella_pneumoniae--HS11286:0.43798845051648258,(Klebsiella_pneumoniae--ATCC_13883:0.41780293826821265,Klebsiella_pneumoniae--MGH_78578:0.42274184870836559):0.017586732339732737):0.0604124197073832):0.0006482538063555254,(Salmonella_enterica--CT18:0.43952894448143017,(Salmonella_enterica--AKU_12601:0.3357977326267918,(Salmonella_enterica--LT2:0.31203395843666389,Salmonella_enterica--P125109:0.31057217324861216):0.025729515856701136):0.10292985918524672):0.05825411485542886):0.08937928015651564,Escherichia_coli--CFT073:0.40806501650701029):0.0410131211869626,Escherichia_coli--EDL933:0.3681464750911808):0.1755112579711463,Escherichia_coli--K-12_MG1655:0.19129818036662728,Escherichia_coli--K-12_W3110:0.19126872019906239);
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
genome,Candidozyma_auris--GCF_003013715.1_ASM301371v2,Acidobacterium_capsulatum--ATCC_51196,Bacillus_subtilis--168,Escherichia_coli--CFT073,Escherichia_coli--EDL933,Escherichia_coli--K-12_MG1655,Escherichia_coli--K-12_W3110,Klebsiella_pneumoniae--ATCC_13883,Klebsiella_pneumoniae--HS11286,Klebsiella_pneumoniae--MGH_78578,Opitutus_terrae--PB90-1,Proteus_mirabilis--HI4320,Saccharolobus_islandicus--M.16.4,Salmonella_enterica--AKU_12601,Salmonella_enterica--CT18,Salmonella_enterica--LT2,Salmonella_enterica--P125109,Shouchella_clausii--KSM-K16,Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1,Yersinia_ruckeri--YRB
|
||||||
|
Candidozyma_auris--GCF_003013715.1_ASM301371v2,0,0,0,0,0,0,0,0,0,0,0,0,8,0,1,0,0,0,0,3
|
||||||
|
Acidobacterium_capsulatum--ATCC_51196,0,0,203,119,128,141,140,116,109,111,78,112,0,136,109,147,134,117,55,129
|
||||||
|
Bacillus_subtilis--168,0,203,0,124,132,128,123,133,109,130,66,158,6,131,112,124,135,2393,46,124
|
||||||
|
Escherichia_coli--CFT073,0,119,124,0,1966777,1998059,1999094,117743,32029,22312,63,4225,0,74946,31918,73311,76585,113,128,7854
|
||||||
|
Escherichia_coli--EDL933,0,128,132,1966777,0,2627885,2628700,52488,20134,22064,48,4202,0,74655,28602,71244,74665,112,108,7963
|
||||||
|
Escherichia_coli--K-12_MG1655,0,141,128,1998059,2627885,0,4452541,48302,21382,24602,47,4277,0,75729,30449,73622,76778,119,111,8566
|
||||||
|
Escherichia_coli--K-12_W3110,0,140,123,1999094,2628700,4452541,0,47894,21226,24470,68,4278,0,75658,30207,73614,76583,112,108,8660
|
||||||
|
Klebsiella_pneumoniae--ATCC_13883,0,116,133,117743,52488,48302,47894,0,1416091,1477759,42,4172,0,48296,18988,48144,50416,120,106,7712
|
||||||
|
Klebsiella_pneumoniae--HS11286,0,109,109,32029,20134,21382,21226,1416091,0,644063,42,2738,0,21498,29758,21606,21376,99,102,4417
|
||||||
|
Klebsiella_pneumoniae--MGH_78578,0,111,130,22312,22064,24602,24470,1477759,644063,0,42,2614,0,19948,35067,21330,20813,97,102,4374
|
||||||
|
Opitutus_terrae--PB90-1,0,78,66,63,48,47,68,42,42,42,0,43,18,57,42,53,66,39,58,43
|
||||||
|
Proteus_mirabilis--HI4320,0,112,158,4225,4202,4277,4278,4172,2738,2614,43,0,0,4254,2481,4166,4215,131,103,4704
|
||||||
|
Saccharolobus_islandicus--M.16.4,8,0,6,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0
|
||||||
|
Salmonella_enterica--AKU_12601,0,136,131,74946,74655,75729,75658,48296,21498,19948,57,4254,0,0,1047731,2857146,2951421,117,108,7643
|
||||||
|
Salmonella_enterica--CT18,1,109,112,31918,28602,30449,30207,18988,29758,35067,42,2481,0,1047731,0,917948,940297,106,106,3716
|
||||||
|
Salmonella_enterica--LT2,0,147,124,73311,71244,73622,73614,48144,21606,21330,53,4166,0,2857146,917948,0,3284800,122,108,7460
|
||||||
|
Salmonella_enterica--P125109,0,134,135,76585,74665,76778,76583,50416,21376,20813,66,4215,0,2951421,940297,3284800,0,134,124,7645
|
||||||
|
Shouchella_clausii--KSM-K16,0,117,2393,113,112,119,112,120,99,97,39,131,0,117,106,122,134,0,58,124
|
||||||
|
Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1,0,55,46,128,108,111,108,106,102,102,58,103,0,108,106,108,124,58,0,96
|
||||||
|
Yersinia_ruckeri--YRB,3,129,124,7854,7963,8566,8660,7712,4417,4374,43,4704,0,7643,3716,7460,7645,124,96,0
|
||||||
|
Executable
+181
@@ -0,0 +1,181 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Compare an obikmer count index against a reference kmer set (presence + counts).
|
||||||
|
|
||||||
|
Loads the reference .npz (sorted uint64 kmers + uint32 counts from build_reference.py),
|
||||||
|
streams `obikmer dump` from a --with-counts index, then reports:
|
||||||
|
- false negatives : kmers in reference absent from the index
|
||||||
|
- false positives : kmers in the index absent from the reference
|
||||||
|
- count mismatches: kmers present in both but with differing counts
|
||||||
|
|
||||||
|
Output to stdout: one CSV row
|
||||||
|
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
|
||||||
|
fn_pct,fp_pct,cm_pct
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# 2-bit nucleotide code: index in _DECODE is the code of the base.
_DECODE = ['A', 'C', 'G', 'T']

# Upper- and lower-case bases map to the same 2-bit code.
_ENCODE = {base: code for code, base in enumerate(_DECODE)}
_ENCODE.update({base.lower(): code for code, base in enumerate(_DECODE)})


def encode_kmer(s: str) -> int:
    """Pack a nucleotide string into an integer, 2 bits per base.

    The first base ends up in the most significant position. Any character
    that is not one of ``ACGTacgt`` raises ``KeyError``.
    """
    packed = 0
    for base in s:
        # Codes are < 4, so "* 4 +" is the same as "<< 2 |".
        packed = packed * 4 + _ENCODE[base]
    return packed


def decode_kmer(val: int, k: int) -> str:
    """Unpack the ``k`` lowest 2-bit groups of ``val`` into a nucleotide string.

    Inverse of :func:`encode_kmer` for a k-mer of length ``k``.
    """
    # Walk the 2-bit groups from the most significant base to the least.
    return ''.join(_DECODE[(val >> shift) & 3]
                   for shift in range(2 * (k - 1), -1, -2))
|
||||||
|
|
||||||
|
|
||||||
|
# ── dump parsing ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]:
    """Stream `obikmer dump` and return (kmers_sorted_uint64, counts_uint32).

    Runs ``<obikmer_bin> dump <index_dir>`` and parses its CSV output: the
    first line is treated as a header and skipped, then every row contributes
    its first field (k-mer string) and second field (integer count).

    Returns:
        Two parallel arrays ordered by encoded k-mer value: the k-mers as
        ``uint64`` and their counts as ``uint32``.

    Exits the process with status 1 if the dump command fails.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    kmers: list[int] = []
    counts: list[int] = []

    # The context manager closes the pipe and reaps the child even when a
    # malformed row makes parsing raise, so no process is left behind.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        rows = iter(proc.stdout)
        next(rows, None)  # skip the CSV header line
        for line in rows:
            row = line.rstrip('\n')
            if not row:
                # Tolerate blank lines (e.g. a trailing newline) in the dump.
                continue
            parts = row.split(',')
            kmers.append(encode_kmer(parts[0]))
            counts.append(int(parts[1]))

    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)

    # Build the (potentially very large) k-mer array once and reuse it for
    # both the sort key and the result.
    kmer_arr = np.array(kmers, dtype=np.uint64)
    order = np.argsort(kmer_arr, kind='stable')
    return kmer_arr[order], np.array(counts, dtype=np.uint32)[order]
|
||||||
|
|
||||||
|
|
||||||
|
# ── comparison ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray,
            idx_kmers: np.ndarray, idx_counts: np.ndarray,
            ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts).

    ``ref_kmers`` and ``idx_kmers`` must each be sorted and free of
    duplicates; ``ref_counts`` / ``idx_counts`` are parallel to them.

    * ``false_neg``: k-mers in the reference that are absent from the index.
    * ``false_pos``: k-mers in the index that are absent from the reference.
    * ``cm_*``: k-mers present in both arrays but with differing counts,
      together with the reference count and the index count.

    Either input may be empty.
    """
    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)

    # Locate every reference k-mer in the sorted index. A position equal to
    # len(idx_kmers) means "past the end"; those are masked out rather than
    # clipped, which also keeps an empty index from being indexed at -1.
    pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
    in_range = pos_in_idx < len(idx_kmers)
    shared_mask = np.zeros(len(ref_kmers), dtype=bool)
    shared_mask[in_range] = idx_kmers[pos_in_idx[in_range]] == ref_kmers[in_range]

    shared_ref_counts = ref_counts[shared_mask]
    shared_idx_counts = idx_counts[pos_in_idx[shared_mask]]
    mismatch_mask = shared_ref_counts != shared_idx_counts

    cm_kmers = ref_kmers[shared_mask][mismatch_mask]
    cm_ref_counts = shared_ref_counts[mismatch_mask]
    cm_idx_counts = shared_idx_counts[mismatch_mask]

    return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: compare one count index with its reference.

    With ``--header`` only the CSV header is printed. Otherwise the k-mer
    length is detected from the first dump row, the reference ``.npz`` and
    the index dump are loaded and compared, a summary is written to stderr,
    the optional ``--save-*`` files are written when non-empty, and one CSV
    data row is printed to stdout.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reference', metavar='REF_NPZ', nargs='?',
                    help='Reference .npz file')
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='obikmer index directory (built with --with-counts)')
    ap.add_argument('--obikmer', default='obikmer',
                    help='Path to obikmer binary')
    ap.add_argument('--species', default='')
    ap.add_argument('--strain', default='')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fp', metavar='FILE',
                    help='Save false-positive kmer strings to FILE')
    ap.add_argument('--save-fn', metavar='FILE',
                    help='Save false-negative kmer strings to FILE')
    ap.add_argument('--save-cm', metavar='FILE',
                    help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return

    # The positionals are optional only so that --header works on its own;
    # for a real comparison both are mandatory.
    if args.reference is None or args.index is None:
        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')

    # Detect k from the first data row of the dump (row 0 is the header).
    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
    head_rows = out1.splitlines()
    if len(head_rows) < 2:
        print(f'ERROR: cannot detect k: obikmer dump of {args.index} has no kmer rows',
              file=sys.stderr)
        sys.exit(1)
    k = len(head_rows[1].split(',')[0])

    # Load reference; the arrays are read eagerly so the archive can be closed.
    print(f'Loading reference: {args.reference}', file=sys.stderr)
    with np.load(args.reference) as npz:
        ref_kmers = npz['kmers']    # sorted uint64
        ref_counts = npz['counts']  # uint32

    # Load index
    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
    idx_kmers, idx_counts = load_index(args.obikmer, args.index)

    print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)

    false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare(
        ref_kmers, ref_counts, idx_kmers, idx_counts)

    # Percentages: FN relative to the reference, FP relative to the index,
    # count mismatches relative to the k-mers found in both.
    n_shared = len(ref_kmers) - len(false_neg)
    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
    cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0

    print(f'false negatives : {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
    print(f'false positives : {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)
    print(f'count mismatches: {len(cm_kmers):,} ({cm_pct:.4f}% of shared)',
          file=sys.stderr)

    if args.save_fn and len(false_neg):
        with open(args.save_fn, 'w') as fh:
            for v in false_neg:
                fh.write(decode_kmer(int(v), k) + '\n')

    if args.save_fp and len(false_pos):
        with open(args.save_fp, 'w') as fh:
            for v in false_pos:
                fh.write(decode_kmer(int(v), k) + '\n')

    if args.save_cm and len(cm_kmers):
        with open(args.save_cm, 'w') as fh:
            fh.write('kmer,ref_count,idx_count\n')
            for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
                fh.write(f'{decode_kmer(int(v), k)},{rc},{ic}\n')

    print(f'{args.species},{args.strain},'
          f'{len(ref_kmers)},{len(idx_kmers)},'
          f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
          f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+201
@@ -0,0 +1,201 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Verify the merged count index against all per-specimen reference sets.
|
||||||
|
|
||||||
|
Streams `obikmer dump` once on the merged index, accumulates per-specimen
|
||||||
|
kmer+count pairs from each column, then compares each against its reference .npz.
|
||||||
|
|
||||||
|
Output to stdout: one CSV row per specimen (same columns as verify_count.py)
|
||||||
|
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
|
||||||
|
fn_pct,fp_pct,cm_pct
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Position in _DECODE is the 2-bit code of the base.
_DECODE = ['A', 'C', 'G', 'T']

# Case-insensitive base -> 2-bit code table.
_ENCODE = {}
for _code, _base in enumerate(_DECODE):
    _ENCODE[_base] = _code
for _code, _base in enumerate(_DECODE):
    _ENCODE[_base.lower()] = _code
del _code, _base


def encode_kmer(s: str) -> int:
    """Encode a nucleotide string as an integer using 2 bits per base.

    The leftmost base occupies the highest bits. Characters outside
    ``ACGTacgt`` raise ``KeyError``.
    """
    value = 0
    for nucleotide in s:
        value = value * 4 + _ENCODE[nucleotide]
    return value


def decode_kmer(val: int, k: int) -> str:
    """Decode the ``k`` lowest 2-bit groups of ``val`` back into a k-mer string."""
    # Highest-order base first, so no reversal is needed afterwards.
    shifts = range(2 * (k - 1), -1, -2)
    return ''.join(_DECODE[(val >> shift) & 3] for shift in shifts)
|
||||||
|
|
||||||
|
|
||||||
|
# ── single-pass dump ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def stream_merged_dump(obikmer_bin: str, index_dir: str,
                       ) -> tuple[list[str], dict[str, tuple[list[int], list[int]]]]:
    """Stream the merged dump once.

    Runs ``<obikmer_bin> dump <index_dir>`` and reads its CSV output: the
    header's columns after the first are taken as specimen labels, and each
    following row holds a k-mer string followed by one integer per specimen.

    Returns:
        specimen_names : column labels in dump order
        per_specimen   : mapping label → (kmer_ints, counts) for entries > 0

    Exits the process with status 1 if the dump command fails.
    """
    cmd = [obikmer_bin, 'dump', index_dir]

    # The context manager closes the pipe and reaps the child even when a
    # malformed row makes parsing raise.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        header_line = proc.stdout.readline().rstrip('\n')
        cols = header_line.split(',')
        specimen_names = cols[1:]
        per_specimen: dict[str, tuple[list[int], list[int]]] = {
            name: ([], []) for name in specimen_names}

        for line in proc.stdout:
            row = line.rstrip('\n')
            if not row:
                # Tolerate blank lines (e.g. a trailing newline) in the dump.
                continue
            parts = row.split(',')
            kmer_int = encode_kmer(parts[0])
            for i, name in enumerate(specimen_names):
                count = int(parts[i + 1])
                if count > 0:
                    kmer_ints, kmer_counts = per_specimen[name]
                    kmer_ints.append(kmer_int)
                    kmer_counts.append(count)

    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)

    return specimen_names, per_specimen
|
||||||
|
|
||||||
|
|
||||||
|
# ── per-specimen comparison ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare_specimen(name: str,
|
||||||
|
kmer_list: list[int],
|
||||||
|
count_list: list[int],
|
||||||
|
ref_dir: Path,
|
||||||
|
k: int,
|
||||||
|
save_fn: Path | None,
|
||||||
|
save_fp: Path | None,
|
||||||
|
save_cm: Path | None,
|
||||||
|
) -> str:
|
||||||
|
ref_path = ref_dir / f'{name}.npz'
|
||||||
|
if not ref_path.exists():
|
||||||
|
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
|
||||||
|
return ''
|
||||||
|
|
||||||
|
species = name.split('--')[0]
|
||||||
|
strain = name[len(species) + 2:]
|
||||||
|
|
||||||
|
npz = np.load(ref_path)
|
||||||
|
ref_kmers = npz['kmers'] # sorted uint64
|
||||||
|
ref_counts = npz['counts'] # uint32
|
||||||
|
|
||||||
|
order = np.argsort(np.array(kmer_list, dtype=np.uint64), kind='stable')
|
||||||
|
idx_kmers = np.array(kmer_list, dtype=np.uint64)[order]
|
||||||
|
idx_counts = np.array(count_list, dtype=np.uint32)[order]
|
||||||
|
|
||||||
|
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
|
||||||
|
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
|
||||||
|
|
||||||
|
# Count mismatches among shared kmers
|
||||||
|
pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
|
||||||
|
pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
|
||||||
|
shared_mask = idx_kmers[pos_in_idx] == ref_kmers
|
||||||
|
mismatch_mask = ref_counts[shared_mask] != idx_counts[pos_in_idx[shared_mask]]
|
||||||
|
cm_kmers = ref_kmers[shared_mask][mismatch_mask]
|
||||||
|
cm_ref = ref_counts[shared_mask][mismatch_mask]
|
||||||
|
cm_idx = idx_counts[pos_in_idx[shared_mask]][mismatch_mask]
|
||||||
|
|
||||||
|
n_shared = int(shared_mask.sum())
|
||||||
|
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
|
||||||
|
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
|
||||||
|
cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0
|
||||||
|
|
||||||
|
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
|
||||||
|
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
|
||||||
|
f'fp={len(false_pos):,} ({fp_pct:.4f}%) '
|
||||||
|
f'cm={len(cm_kmers):,} ({cm_pct:.4f}%)',
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
|
if save_fn and len(false_neg):
|
||||||
|
fn_file = save_fn / f'{name}_fn.txt'
|
||||||
|
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
|
||||||
|
|
||||||
|
if save_fp and len(false_pos):
|
||||||
|
fp_file = save_fp / f'{name}_fp.txt'
|
||||||
|
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
|
||||||
|
|
||||||
|
if save_cm and len(cm_kmers):
|
||||||
|
cm_file = save_cm / f'{name}_cm.csv'
|
||||||
|
lines = ['kmer,ref_count,idx_count']
|
||||||
|
for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
|
||||||
|
lines.append(f'{decode_kmer(int(v), k)},{rc},{ic}')
|
||||||
|
cm_file.write_text('\n'.join(lines) + '\n')
|
||||||
|
|
||||||
|
return (f'{species},{strain},'
|
||||||
|
f'{len(ref_kmers)},{len(idx_kmers)},'
|
||||||
|
f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
|
||||||
|
f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: verify every specimen of a merged count index.

    With ``--header`` only the CSV header is printed. Otherwise the k-mer
    length is detected from the first dump row, the merged dump is streamed
    once, and one CSV row per specimen with a reference file is printed to
    stdout.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='Merged count index directory')
    ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
                    help='Directory containing per-specimen .npz reference files')
    ap.add_argument('--obikmer', default='obikmer')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fn', metavar='DIR',
                    help='Directory for false-negative kmer lists')
    ap.add_argument('--save-fp', metavar='DIR',
                    help='Directory for false-positive kmer lists')
    ap.add_argument('--save-cm', metavar='DIR',
                    help='Directory for count-mismatch CSV files')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return

    # The positionals are optional only so that --header works on its own;
    # for a real verification both are mandatory.
    if args.index is None or args.ref_dir is None:
        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')

    ref_dir = Path(args.ref_dir)
    save_fn = Path(args.save_fn) if args.save_fn else None
    save_fp = Path(args.save_fp) if args.save_fp else None
    save_cm = Path(args.save_cm) if args.save_cm else None
    for out_dir in (save_fn, save_fp, save_cm):
        if out_dir:
            out_dir.mkdir(parents=True, exist_ok=True)

    # Detect k from the first data row of the dump (row 0 is the header).
    out1 = subprocess.check_output(
        [args.obikmer, 'dump', '--head', '1', args.index],
        stderr=subprocess.DEVNULL, text=True)
    head_rows = out1.splitlines()
    if len(head_rows) < 2:
        print(f'ERROR: cannot detect k: obikmer dump of {args.index} has no kmer rows',
              file=sys.stderr)
        sys.exit(1)
    k = len(head_rows[1].split(',')[0])

    print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)

    for name in specimen_names:
        kmers, counts = per_specimen[name]
        row = compare_specimen(name, kmers, counts, ref_dir, k,
                               save_fn, save_fp, save_cm)
        if row:  # '' means the specimen was skipped (no reference file)
            print(row)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+27
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
# Verify the merged count index against the per-specimen references.
# Writes the result to stats/verify_merge_count/current.csv and archives a
# numbered copy (count_NNN.csv) next to it.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
INDEX="${SCRIPT_DIR}/global_index_count"
REF_DIR="${SCRIPT_DIR}/reference_index"
STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_count"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_merge_count.py"

mkdir -p "${STATS_DIR}"

CURRENT="${STATS_DIR}/current.csv"

# Header first, then the data rows appended to the same file.
"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"

"${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    "${INDEX}" "${REF_DIR}" \
    >>"${CURRENT}"

# Start numbering at the count of existing archives, then skip forward past
# any name already taken: if an earlier archive was deleted, the plain count
# would point at an existing file and overwrite it.
run_i=$(find "${STATS_DIR}" -maxdepth 1 -name 'count_*.csv' | wc -l | tr -d ' ')
ARCHIVE="${STATS_DIR}/count_$(printf '%03d' "${run_i}").csv"
while [[ -e "${ARCHIVE}" ]]; do
    run_i=$((run_i + 1))
    ARCHIVE="${STATS_DIR}/count_$(printf '%03d' "${run_i}").csv"
done
cp "${CURRENT}" "${ARCHIVE}"

echo "Done. Results → ${ARCHIVE}"
|
||||||
Executable
+170
@@ -0,0 +1,170 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Verify the merged presence index against all per-specimen reference sets.
|
||||||
|
|
||||||
|
Streams `obikmer dump` once on the merged index, accumulates per-specimen
|
||||||
|
kmer sets from each column, then compares each against its reference .npz.
|
||||||
|
|
||||||
|
Output to stdout: one CSV row per specimen (same columns as verify_presence.py)
|
||||||
|
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Index in _DECODE is the 2-bit code of the base.
_DECODE = ['A', 'C', 'G', 'T']

# Base -> 2-bit code, accepting both upper and lower case.
_ENCODE = dict(
    [(base, code) for code, base in enumerate(_DECODE)]
    + [(base.lower(), code) for code, base in enumerate(_DECODE)]
)


def encode_kmer(s: str) -> int:
    """Turn a nucleotide string into its 2-bit-per-base integer encoding.

    The first character lands in the most significant bits. A character
    outside ``ACGTacgt`` raises ``KeyError``.
    """
    acc = 0
    for ch in s:
        acc = acc * 4 + _ENCODE[ch]
    return acc


def decode_kmer(val: int, k: int) -> str:
    """Turn the ``k`` lowest 2-bit groups of ``val`` back into a k-mer string."""
    letters = [_DECODE[(val >> (2 * pos)) & 3] for pos in range(k)]
    # letters[0] is the last base of the k-mer, so emit them back to front.
    return ''.join(letters[::-1])
|
||||||
|
|
||||||
|
|
||||||
|
# ── single-pass dump ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def stream_merged_dump(obikmer_bin: str, index_dir: str,
                       ) -> tuple[list[str], dict[str, list[int]]]:
    """Stream the merged dump once.

    Runs ``<obikmer_bin> dump <index_dir>`` and reads its CSV output: the
    header's columns after the first are taken as specimen labels, and each
    following row holds a k-mer string followed by one integer per specimen.

    Returns:
        specimen_names : column labels in dump order (excluding 'kmer')
        per_specimen   : mapping label → list of kmer ints where presence > 0

    Exits the process with status 1 if the dump command fails.
    """
    cmd = [obikmer_bin, 'dump', index_dir]

    # The context manager closes the pipe and reaps the child even when a
    # malformed row makes parsing raise.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        header_line = proc.stdout.readline().rstrip('\n')
        cols = header_line.split(',')
        specimen_names = cols[1:]  # first col is 'kmer'
        per_specimen: dict[str, list[int]] = {name: [] for name in specimen_names}

        for line in proc.stdout:
            row = line.rstrip('\n')
            if not row:
                # Tolerate blank lines (e.g. a trailing newline) in the dump.
                continue
            parts = row.split(',')
            kmer_int = encode_kmer(parts[0])
            for i, name in enumerate(specimen_names):
                if int(parts[i + 1]) > 0:
                    per_specimen[name].append(kmer_int)

    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)

    return specimen_names, per_specimen
|
||||||
|
|
||||||
|
|
||||||
|
# ── per-specimen comparison ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare_specimen(name: str,
|
||||||
|
kmer_list: list[int],
|
||||||
|
ref_dir: Path,
|
||||||
|
k: int,
|
||||||
|
save_fn: Path | None,
|
||||||
|
save_fp: Path | None,
|
||||||
|
) -> str:
|
||||||
|
"""Compare one specimen column against its reference .npz.
|
||||||
|
|
||||||
|
Returns a CSV row string.
|
||||||
|
"""
|
||||||
|
ref_path = ref_dir / f'{name}.npz'
|
||||||
|
if not ref_path.exists():
|
||||||
|
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
|
||||||
|
return ''
|
||||||
|
|
||||||
|
species = name.split('--')[0]
|
||||||
|
strain = name[len(species) + 2:]
|
||||||
|
|
||||||
|
ref_kmers = np.load(ref_path)['kmers'] # sorted uint64
|
||||||
|
idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64)
|
||||||
|
|
||||||
|
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
|
||||||
|
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
|
||||||
|
|
||||||
|
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
|
||||||
|
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
|
||||||
|
|
||||||
|
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
|
||||||
|
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
|
||||||
|
f'fp={len(false_pos):,} ({fp_pct:.4f}%)',
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
|
if save_fn and len(false_neg):
|
||||||
|
fn_file = save_fn / f'{name}_fn.txt'
|
||||||
|
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
|
||||||
|
|
||||||
|
if save_fp and len(false_pos):
|
||||||
|
fp_file = save_fp / f'{name}_fp.txt'
|
||||||
|
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
|
||||||
|
|
||||||
|
return (f'{species},{strain},'
|
||||||
|
f'{len(ref_kmers)},{len(idx_kmers)},'
|
||||||
|
f'{len(false_neg)},{len(false_pos)},'
|
||||||
|
f'{fn_pct:.4f},{fp_pct:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: verify every specimen of a merged presence index.

    With ``--header`` only the CSV header is printed. Otherwise the k-mer
    length is detected from the first dump row, the merged dump is streamed
    once, and one CSV row per specimen with a reference file is printed to
    stdout.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='Merged presence index directory')
    ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
                    help='Directory containing per-specimen .npz reference files')
    ap.add_argument('--obikmer', default='obikmer')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fn', metavar='DIR',
                    help='Directory to save false-negative kmer lists')
    ap.add_argument('--save-fp', metavar='DIR',
                    help='Directory to save false-positive kmer lists')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,fn_pct,fp_pct')
        return

    # The positionals are optional only so that --header works on its own;
    # for a real verification both are mandatory.
    if args.index is None or args.ref_dir is None:
        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')

    ref_dir = Path(args.ref_dir)
    save_fn = Path(args.save_fn) if args.save_fn else None
    save_fp = Path(args.save_fp) if args.save_fp else None
    for out_dir in (save_fn, save_fp):
        if out_dir:
            out_dir.mkdir(parents=True, exist_ok=True)

    # Detect k from the first data row of the dump (row 0 is the header).
    out1 = subprocess.check_output(
        [args.obikmer, 'dump', '--head', '1', args.index],
        stderr=subprocess.DEVNULL, text=True)
    head_rows = out1.splitlines()
    if len(head_rows) < 2:
        print(f'ERROR: cannot detect k: obikmer dump of {args.index} has no kmer rows',
              file=sys.stderr)
        sys.exit(1)
    k = len(head_rows[1].split(',')[0])

    print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)

    for name in specimen_names:
        row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp)
        if row:  # '' means the specimen was skipped (no reference file)
            print(row)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+27
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
# Verify the merged presence index against the per-specimen references.
# Writes the result to stats/verify_merge_presence/current.csv and archives a
# numbered copy (presence_NNN.csv) next to it.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
INDEX="${SCRIPT_DIR}/global_index_presence"
REF_DIR="${SCRIPT_DIR}/reference_index"
STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_presence"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_merge_presence.py"

mkdir -p "${STATS_DIR}"

CURRENT="${STATS_DIR}/current.csv"

# Header first, then the data rows appended to the same file.
"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"

"${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    "${INDEX}" "${REF_DIR}" \
    >>"${CURRENT}"

# Start numbering at the count of existing archives, then skip forward past
# any name already taken: if an earlier archive was deleted, the plain count
# would point at an existing file and overwrite it.
run_i=$(find "${STATS_DIR}" -maxdepth 1 -name 'presence_*.csv' | wc -l | tr -d ' ')
ARCHIVE="${STATS_DIR}/presence_$(printf '%03d' "${run_i}").csv"
while [[ -e "${ARCHIVE}" ]]; do
    run_i=$((run_i + 1))
    ARCHIVE="${STATS_DIR}/presence_$(printf '%03d' "${run_i}").csv"
done
cp "${CURRENT}" "${ARCHIVE}"

echo "Done. Results → ${ARCHIVE}"
|
||||||
Executable
+30
@@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: verify_one_count.sh SPECIMEN
# SPECIMEN = "species--strain" (Make pattern stem)
# Output: stats/verify_count/SPECIMEN.stats (one CSV data row, no header)
set -euo pipefail

# Fail with a usage message instead of a bare "unbound variable" error.
if [[ $# -lt 1 ]]; then
    echo "Usage: $(basename "$0") SPECIMEN" >&2
    exit 2
fi

SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_count.py"

# Split the specimen label at the first "--".
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
INDEX_DIR="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/verify_count"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"

mkdir -p "${STATS_DIR}"

echo "[${SPECIMEN}] verifying count"

# Write to a temporary file and rename only on success, so a failed run never
# leaves an empty or partial .stats file that looks like a finished result.
TMP_FILE="${STATS_FILE}.tmp"
trap 'rm -f "${TMP_FILE}"' EXIT

"${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    --species "${species}" \
    --strain "${strain}" \
    "${REF_NPZ}" "${INDEX_DIR}" \
    >"${TMP_FILE}"

mv "${TMP_FILE}" "${STATS_FILE}"
|
||||||
Executable
+30
@@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: verify_one_presence.sh SPECIMEN
#   SPECIMEN = "species--strain"  (Make pattern stem)
# Output: stats/verify_presence/SPECIMEN.stats  (one CSV data row, no header)
#
# Runs verify_presence.py for one specimen, passing its reference .npz and its
# obikmer index directory, and stores the script's stdout as the stats file.
set -euo pipefail

# Fail with a usage message rather than bash's bare "unbound variable" error.
if [[ $# -lt 1 ]]; then
    echo "Usage: $(basename "$0") SPECIMEN   (SPECIMEN = species--strain)" >&2
    exit 2
fi

SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_presence.py"

# Split the stem at its first "--": the text before it is the species,
# the text after it is the strain.
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
INDEX_DIR="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/verify_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"

mkdir -p "${STATS_DIR}"

echo "[${SPECIMEN}] verifying presence"

# Write to a scratch file and rename only on success: redirecting straight
# into STATS_FILE would leave an empty or truncated file behind when the
# verifier fails, and that file would then look like a finished result.
TMP_FILE="${STATS_FILE}.tmp.$$"
trap 'rm -f "${TMP_FILE}"' EXIT

"${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    --species "${species}" \
    --strain  "${strain}" \
    "${REF_NPZ}" "${INDEX_DIR}" \
    >"${TMP_FILE}"

mv -f "${TMP_FILE}" "${STATS_FILE}"
|
||||||
Executable
+139
@@ -0,0 +1,139 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Compare an obikmer index against a reference kmer set (presence/absence).
|
||||||
|
|
||||||
|
Loads the reference .npz (sorted uint64 kmers built by build_reference.py),
|
||||||
|
streams the output of `obikmer dump`, encodes each kmer string to uint64,
|
||||||
|
then reports false negatives and false positives using numpy set operations.
|
||||||
|
|
||||||
|
Output to stdout: one CSV row
|
||||||
|
species, strain, ref_kmers, idx_kmers, false_neg, false_pos, fn_pct, fp_pct
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
|
||||||
|
'a': 0, 'c': 1, 'g': 2, 't': 3}
|
||||||
|
|
||||||
|
_DECODE = ['A', 'C', 'G', 'T']
|
||||||
|
|
||||||
|
|
||||||
|
def encode_kmer(s: str) -> int:
|
||||||
|
kmer = 0
|
||||||
|
for c in s:
|
||||||
|
kmer = (kmer << 2) | _ENCODE[c]
|
||||||
|
return kmer
|
||||||
|
|
||||||
|
|
||||||
|
def decode_kmer(val: int, k: int) -> str:
    """Unpack a 2-bit-per-base integer into an upper-case nucleotide string.

    Args:
        val: packed kmer, last base in the two least significant bits.
        k: number of bases to extract.

    Returns:
        A string of exactly `k` characters from 'ACGT' ('' when k is 0).
        Bits of `val` above the lowest 2*k are ignored.
    """
    alphabet = 'ACGT'  # index == 2-bit code
    bases = []
    for _ in range(k):
        # Peel bases off the low end; they come out last-to-first.
        bases.append(alphabet[val & 3])
        val >>= 2
    return ''.join(reversed(bases))
|
||||||
|
|
||||||
|
|
||||||
|
# ── dump parsing ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_index_kmers(obikmer_bin: str, index_dir: str) -> np.ndarray:
    """Stream `obikmer dump` and return a sorted uint64 array of kmer integers.

    Runs `<obikmer_bin> dump <index_dir>`, discards the first output line
    (header), and encodes the first comma-separated field of every following
    line with `encode_kmer`. The child's stderr is discarded.

    Args:
        obikmer_bin: path to (or name of) the obikmer executable.
        index_dir: index directory handed to `obikmer dump`.

    Returns:
        All encoded kmers as a uint64 array sorted in ascending order.
        Duplicate rows in the dump, if any, are kept.

    Raises:
        KeyError: from `encode_kmer` if a kmer field holds a non-ACGT character.

    Exits the process with status 1 if `obikmer dump` returns non-zero.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    kmers = []
    # The context manager closes the pipe and reaps the child even when
    # encoding raises part-way through the stream.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE,
                          stderr=subprocess.DEVNULL, text=True) as proc:
        rows = iter(proc.stdout)
        next(rows, None)  # first line is the CSV header
        for line in rows:
            # strip() drops the trailing newline, which would otherwise reach
            # encode_kmer whenever a row has no comma after the kmer.
            kmer_str = line.split(',', 1)[0].strip()
            if not kmer_str:
                continue  # blank row, nothing to encode
            kmers.append(encode_kmer(kmer_str))
    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)
    arr = np.array(kmers, dtype=np.uint64)
    arr.sort()
    return arr
|
||||||
|
|
||||||
|
|
||||||
|
# ── comparison ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare(ref: np.ndarray, idx: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Return (false_negatives, false_positives) as sorted, unique arrays.

    Args:
        ref: reference kmers.
        idx: kmers read back from the index.

    Returns:
        false_negatives: values present in `ref` but absent from `idx`.
        false_positives: values present in `idx` but absent from `ref`.

    `assume_unique` is deliberately not passed: `idx` is only sorted, never
    de-duplicated, and numpy's result is unspecified when that promise is
    broken, so a repeated kmer in a dump could silently skew the counts.
    """
    false_neg = np.setdiff1d(ref, idx)
    false_pos = np.setdiff1d(idx, ref)
    return false_neg, false_pos
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: compare one index with its reference.

    With `--header`, prints the CSV header line and returns. Otherwise loads
    the reference .npz, dumps the index through the obikmer binary, reports
    false negatives/positives on stderr, optionally saves them as kmer
    strings, and prints one CSV data row on stdout.

    Exits with status 2 (argparse usage error) when REF_NPZ or INDEX_DIR is
    missing without `--header`, and with status 1 when the index dump
    contains no kmer row to infer k from.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    # Positionals are optional at the parser level only so that `--header`
    # can be used on its own; they are checked explicitly below.
    ap.add_argument('reference', metavar='REF_NPZ', nargs='?', help='Reference .npz file')
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?', help='obikmer index directory')
    ap.add_argument('--obikmer', default='obikmer', help='Path to obikmer binary')
    ap.add_argument('--species', default='', help='Species label for CSV row')
    ap.add_argument('--strain', default='', help='Strain label for CSV row')
    ap.add_argument('--header', action='store_true', help='Print CSV header and exit')
    ap.add_argument('--save-fp', metavar='FILE',
                    help='Save false-positive kmer strings to FILE')
    ap.add_argument('--save-fn', metavar='FILE',
                    help='Save false-negative kmer strings to FILE')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,fn_pct,fp_pct')
        return

    # Without this check a missing positional reaches subprocess as None and
    # fails with an opaque TypeError.
    if args.reference is None or args.index is None:
        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')

    # Detect k from the index (one cheap call before the full dump).
    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
    head = out1.splitlines()
    if len(head) < 2:
        # Header only (or nothing at all): there is no kmer to measure.
        print(f'ERROR: obikmer dump produced no kmer rows for {args.index}',
              file=sys.stderr)
        sys.exit(1)
    k = len(head[1].split(',')[0])

    # Load reference
    print(f'Loading reference: {args.reference}', file=sys.stderr)
    npz = np.load(args.reference)
    ref_kmers = npz['kmers']  # already sorted uint64

    # Load index
    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
    idx_kmers = load_index_kmers(args.obikmer, args.index)

    print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)

    false_neg, false_pos = compare(ref_kmers, idx_kmers)

    # Percentages are relative to the set each error is drawn from; an empty
    # denominator yields 0.0 instead of a division error.
    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0

    print(f'false negatives: {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
    print(f'false positives: {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)

    # The files are written only when there is something to report.
    if args.save_fn and len(false_neg):
        with open(args.save_fn, 'w') as fh:
            for v in false_neg:
                fh.write(decode_kmer(int(v), k) + '\n')
        print(f'False negatives saved → {args.save_fn}', file=sys.stderr)

    if args.save_fp and len(false_pos):
        with open(args.save_fp, 'w') as fh:
            for v in false_pos:
                fh.write(decode_kmer(int(v), k) + '\n')
        print(f'False positives saved → {args.save_fp}', file=sys.stderr)

    print(f'{args.species},{args.strain},'
          f'{len(ref_kmers)},{len(idx_kmers)},'
          f'{len(false_neg)},{len(false_pos)},'
          f'{fn_pct:.4f},{fp_pct:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
@@ -0,0 +1,179 @@
|
|||||||
|
# NUMA-aware partition runner
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
All partition-level parallel loops in obikindex currently fall into two
|
||||||
|
categories:
|
||||||
|
|
||||||
|
**Naive Rayon** — used in `build_layers`, `pack_matrices`, `dump`, `select`,
|
||||||
|
`stats`, `rebuild`, `reindex`:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
(0..n).into_par_iter().for_each(|i| work(i));
|
||||||
|
```
|
||||||
|
|
||||||
|
Threads come from the global Rayon pool with no NUMA awareness. On
|
||||||
|
multi-socket machines this produces cross-socket memory traffic and degrades
|
||||||
|
performance super-linearly (see [NUMA-aware worker pools](numa_worker_pools.md)).
|
||||||
|
|
||||||
|
**Ad-hoc adaptive pool** — used in `merge`:
|
||||||
|
|
||||||
|
A bespoke implementation with pre-spawned workers, channel-based dispatch, and
|
||||||
|
activation control. It handles NUMA correctly but is not reusable.
|
||||||
|
|
||||||
|
Both cases should be replaced by a single generic mechanism.
|
||||||
|
|
||||||
|
## Unified model
|
||||||
|
|
||||||
|
The key insight is that **UMA is just the NUMA case with a single node**. The
|
||||||
|
runner always works the same way: one controller thread per node, each
|
||||||
|
independently managing its own workers with the same adaptive logic. The only
|
||||||
|
difference between UMA and NUMA is the number of nodes and whether workers are
|
||||||
|
pinned.
|
||||||
|
|
||||||
|
```
|
||||||
|
NUMA (k nodes)                     UMA (1 node)

controller-0    controller-1   …   controller-0
     │               │                  │
 workers[0]      workers[1]         workers[0]
  (pinned)        (pinned)        (global pool)
     └───────────────┴──────────────────┘
              shared work queue
|
||||||
|
```
|
||||||
|
|
||||||
|
On each node, the Rayon `ThreadPool` is pinned to that node's CPUs.
|
||||||
|
`pool.install()` ensures all internal Rayon calls (inside the work function)
|
||||||
|
use the node-local pool. Linux first-touch then places heap allocations in
|
||||||
|
local DRAM automatically.
|
||||||
|
|
||||||
|
On UMA the global Rayon pool is used directly — no pinning, no overhead.
|
||||||
|
|
||||||
|
## Adaptive mechanism
|
||||||
|
|
||||||
|
Each controller follows the same logic regardless of node count:
|
||||||
|
|
||||||
|
1. Pre-spawn `workers_per_node` dormant worker threads (blocked on `activate_rx`).
|
||||||
|
2. Activate the first worker immediately.
|
||||||
|
3. Loop on result channel with a `SPAWN_POLL` timeout:
|
||||||
|
- On result: call `on_done`; check whether to activate the next worker.
|
||||||
|
- On timeout: same check.
|
||||||
|
- Activation criterion: `should_spawn_worker(active, global_efficiency, prev_efficiency)`.
|
||||||
|
4. Drop `activate_tx` when done — dormant workers exit cleanly.
|
||||||
|
|
||||||
|
**Global CPU efficiency** (`CpuSample`, reads `/proc/stat` on Linux) is used by
|
||||||
|
all controllers — no per-node measurement needed. The signal is coarser than
|
||||||
|
per-node efficiency but correct in practice: if any node saturates memory
|
||||||
|
bandwidth, the global efficiency drops and all controllers stop activating
|
||||||
|
workers. Using a standard portable primitive avoids platform-specific CPU
|
||||||
|
accounting and keeps the implementation clean.
|
||||||
|
|
||||||
|
## Proposed API
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct PartitionRunner {
|
||||||
|
// One entry per NUMA node; one entry total on UMA.
|
||||||
|
nodes: Vec<NodeConfig>,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct NodeConfig {
|
||||||
|
pool: Option<Arc<rayon::ThreadPool>>, // None = global Rayon pool (UMA)
|
||||||
|
cpu_ids: Vec<usize>, // empty = no pinning (UMA)
|
||||||
|
max_workers: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartitionRunner {
|
||||||
|
/// Detect topology and build the runner.
|
||||||
|
/// Returns a single-node runner on UMA / macOS / hwloc failure.
|
||||||
|
pub fn new() -> Self;
|
||||||
|
|
||||||
|
/// Run `f(i)` for every index in `order`, collecting results.
|
||||||
|
///
|
||||||
|
/// `on_done(i, result, elapsed)` is called under an internal mutex as
|
||||||
|
/// each partition completes — use it for progress bars and aggregation.
|
||||||
|
/// The runner serialises all calls to `on_done` via an internal
|
||||||
|
/// `Arc<Mutex<C>>`, so no `Sync` bound is required on the callback.
|
||||||
|
/// `Send` is required because the Arc clone crosses thread boundaries.
|
||||||
|
///
|
||||||
|
/// Serialisation is free in practice: a partition takes seconds to
|
||||||
|
/// minutes; the callback takes microseconds. Contention is negligible.
|
||||||
|
///
|
||||||
|
/// Returns the first error from `f`, if any.
|
||||||
|
pub fn run<F, R, E, C>(
|
||||||
|
&self,
|
||||||
|
order: &[usize],
|
||||||
|
f: F,
|
||||||
|
on_done: C,
|
||||||
|
) -> Result<(), E>
|
||||||
|
where
|
||||||
|
F: Fn(usize) -> Result<R, E> + Send + Sync,
|
||||||
|
R: Send,
|
||||||
|
E: Send,
|
||||||
|
C: FnMut(usize, R, Duration) + Send; // Send required, Sync is not
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`order` is caller-supplied so each command chooses its scheduling strategy:
|
||||||
|
largest-first for `merge`, sequential for `build_layers`, etc.
|
||||||
|
|
||||||
|
## Migration examples
|
||||||
|
|
||||||
|
### merge.rs (before: ~180 lines of bespoke machinery)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let runner = PartitionRunner::new();
|
||||||
|
runner.run(
|
||||||
|
&order,
|
||||||
|
|i| dst_partition.merge_partition(i, srcs, mode, n_dst_genomes, block_bits, evidence)
|
||||||
|
.map_err(OKIError::Partition),
|
||||||
|
|i, g_len, dur| {
|
||||||
|
pb.inc(1);
|
||||||
|
debug!("partition {i}: done in {:.1}s — {g_len} new kmers", dur.as_secs_f64());
|
||||||
|
part_stats.push(PartStat { id: i, unitig_bytes: partition_sizes[i], g_len });
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
```
|
||||||
|
|
||||||
|
### index.rs build_layers (before: naive into_par_iter)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let order: Vec<usize> = (0..n).collect();
|
||||||
|
let runner = PartitionRunner::new();
|
||||||
|
runner.run(
|
||||||
|
&order,
|
||||||
|
|i| self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence, block_bits)
|
||||||
|
.map_err(OKIError::Partition),
|
||||||
|
|_, n_kmers, _| {
|
||||||
|
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||||
|
pb.inc(1);
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
```
|
||||||
|
|
||||||
|
All other sites (`pack_matrices`, `dump`, `select`, etc.) follow the same
|
||||||
|
pattern.
|
||||||
|
|
||||||
|
## Placement
|
||||||
|
|
||||||
|
`PartitionRunner` lives in `obikindex/src/numa.rs` alongside `NumaSetup`.
|
||||||
|
It depends only on standard library primitives and Rayon — no new dependencies.
|
||||||
|
|
||||||
|
A single `PartitionRunner` instance can be built once per command invocation
|
||||||
|
and reused across multiple `run()` calls (e.g. `merge` runs
|
||||||
|
`merge_partitions` then `pack_matrices`).
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- **Error handling**: `run` currently returns the first error; remaining errors
|
||||||
|
are dropped. A `Vec<E>` return would give complete diagnostics.
|
||||||
|
|
||||||
|
- **`workers_per_node` tuning**: currently `(cpus / 8).max(3).min(8)`, calibrated
|
||||||
|
for merge on BeeGFS. I/O-bound commands (`dump`, `select`) may benefit from
|
||||||
|
a higher value. A per-call override could be added to the API.
|
||||||
|
|
||||||
|
- **`on_done` ordering**: the runner serialises calls to `on_done` via an
|
||||||
|
internal `Arc<Mutex<C>>`. `Send` is required (the Arc clone crosses thread
|
||||||
|
boundaries); `Sync` is not (only one thread holds the lock at a time).
|
||||||
|
Contention is negligible because a partition takes seconds while the callback
|
||||||
|
takes microseconds. The callback is therefore simple to write (plain
|
||||||
|
`Vec::push`, plain `FnMut`) with no measurable performance cost.
|
||||||
@@ -0,0 +1,97 @@
|
|||||||
|
# NUMA-aware worker pools for merge
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
The merge command's bottleneck is `compute_degrees` in `obidebruinj`: a random pointer-chase over 20–70 M node hash maps that saturates DRAM bandwidth. When multiple partition workers run concurrently, they contend for the shared memory bus, causing super-linear slowdown (measured: 0.016 µs/node solo → 0.95 µs/node with 4–5 concurrent workers, ×60 degradation).
|
||||||
|
|
||||||
|
Modern HPC nodes are multi-socket NUMA machines (observed: 2 sockets × 4 NUMA nodes × 24 cores = 192 cores). Cross-NUMA memory traffic compounds the contention:
|
||||||
|
|
||||||
|
- Full 192-core run: ~15 min/partition (×10 worse than M3 Mac)
|
||||||
|
- `taskset` restricted to 4 NUMA nodes (96 cores): ~90 s/partition
|
||||||
|
- OAR job on 1 NUMA node (24 cores): ~80 s/partition, same throughput as 96 cores
|
||||||
|
|
||||||
|
**Conclusion**: the bottleneck is memory bandwidth per NUMA node, not core count. 24 cores on one NUMA node achieve the same throughput as 96 cores across four.
|
||||||
|
|
||||||
|
## Strategy
|
||||||
|
|
||||||
|
Run N worker groups in parallel, one per NUMA node, each with its own Rayon thread pool whose threads are pinned to the NUMA node's CPUs. Linux's first-touch policy then places graph allocations on local DRAM automatically — no explicit NUMA allocator needed.
|
||||||
|
|
||||||
|
Expected throughput: N × single-NUMA throughput. On the 8-NUMA-node HPC: 8 × ~80 s = 9–10 min total instead of >60 min with the current single-pool approach.
|
||||||
|
|
||||||
|
## Rayon thread pool isolation
|
||||||
|
|
||||||
|
Rayon provides `ThreadPool::install(|| { ... })`: any Rayon call (`par_iter`, `current_num_threads`, etc.) inside the closure uses *that* pool exclusively. Wrapping `merge_partition` in `pool.install()` redirects all downstream Rayon calls — including those in `debruijn.rs` and `partition.rs` — without touching those crates.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// worker thread, assigned to NUMA pool `pool`
|
||||||
|
pool.install(|| {
|
||||||
|
dst_partition.merge_partition(i, srcs, mode, n_dst_genomes, block_bits, evidence)
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|
||||||
|
`rayon::current_num_threads()` inside `merge_partition` will return the pool size (e.g. 24), not the global thread count — which is the right value for buffer sizing.
|
||||||
|
|
||||||
|
## Thread pinning
|
||||||
|
|
||||||
|
`ThreadPoolBuilder::spawn_handler` provides a hook executed for each thread at creation. Inside, `libc::sched_setaffinity` pins the thread to a CPU set:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let cpus: Vec<usize> = numa_node_cpus(node);   // from /sys/devices/system/node/nodeN/cpulist
rayon::ThreadPoolBuilder::new()
    .num_threads(cpus.len())
    .spawn_handler(move |thread| {
        // The handler is an `FnMut` invoked once per pool thread, so each
        // thread needs its own copy of the CPU list: moving `cpus` itself
        // into the spawned closure would not compile.
        let cpus = cpus.clone();
        // Forward the name and stack size Rayon configured for this thread.
        let mut b = std::thread::Builder::new();
        if let Some(name) = thread.name() {
            b = b.name(name.to_owned());
        }
        if let Some(stack_size) = thread.stack_size() {
            b = b.stack_size(stack_size);
        }
        b.spawn(move || {
            pin_to_cpus(&cpus);   // sched_setaffinity via libc
            thread.run()
        })?;
        Ok(())
    })
    .build()?
|
||||||
|
```
|
||||||
|
|
||||||
|
NUMA topology is read from `/sys/devices/system/node/node*/cpulist` — no `libnuma` dependency required. If the `numa` crate is linked, `numa_available()` / `numa_run_on_node()` are an alternative.
|
||||||
|
|
||||||
|
## Memory locality
|
||||||
|
|
||||||
|
Linux allocates pages on the NUMA node of the thread that first writes them (first-touch policy). Once Rayon threads are pinned to node N, all graph data built by those threads lands on node N's DRAM. No changes to the allocator, no explicit `numa_alloc_onnode` calls.
|
||||||
|
|
||||||
|
## Adaptive spawn criterion
|
||||||
|
|
||||||
|
The current criterion uses `std::thread::available_parallelism()` (returns total cores = 192) and `max_workers = n_cores / 2`. With NUMA pools:
|
||||||
|
|
||||||
|
- `n_cores` per pool = cores per NUMA node (e.g. 24)
|
||||||
|
- `max_workers` per pool = pool size / 2 (e.g. 12)
|
||||||
|
- CPU efficiency is measured per pool, not globally
|
||||||
|
|
||||||
|
Each NUMA group runs its own independent adaptive pool. Workers are distributed across NUMA groups round-robin or by workload (partition assignment can be pre-split by NUMA group index).
|
||||||
|
|
||||||
|
## Required changes
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `obikindex/src/merge.rs` | Detect NUMA topology; build N `ThreadPool`s with pinned threads; assign each pre-spawned worker to a pool; wrap `merge_partition` in `pool.install()` |
|
||||||
|
| `obikindex/src/merge.rs` | Replace `available_parallelism()` with per-NUMA core count for spawn criterion |
|
||||||
|
| `obikpartitionner/src/merge_layer.rs` | No change — `merge_partition` already works inside any Rayon context |
|
||||||
|
| `obidebruinj/src/debruijn.rs` | No change — `par_iter` and `current_num_threads` are pool-context-aware |
|
||||||
|
| `obikpartitionner/src/partition.rs` | No change — same reason |
|
||||||
|
|
||||||
|
## Platform guard
|
||||||
|
|
||||||
|
NUMA pinning is Linux-only. The fallback is the current single global pool:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
fn build_numa_pools() -> Option<Vec<rayon::ThreadPool>> { ... }
|
||||||
|
|
||||||
|
#[cfg(not(target_os = "linux"))]
|
||||||
|
fn build_numa_pools() -> Option<Vec<rayon::ThreadPool>> { None }
|
||||||
|
```
|
||||||
|
|
||||||
|
When `build_numa_pools()` returns `None` (macOS, UMA, or single-socket), `merge.rs` uses the existing code path unchanged.
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- **Partition assignment**: split partitions by NUMA group up-front (static) or use a shared queue with per-group workers stealing from a common pool? Static split is simpler; stealing is better for load balance when partitions vary widely in size.
|
||||||
|
- **Intra-NUMA adaptive criterion**: with 24 cores and ~3–5 effective workers per NUMA node, the current marginal-gain criterion needs re-tuning or can be left as-is with per-pool `n_cores = 24`.
|
||||||
|
- **I/O**: partition data (unitig files) is on a shared filesystem. With 8 concurrent NUMA groups, I/O concurrency increases 8× — need to verify the filesystem (Lustre or local SSD) can absorb it without becoming the new bottleneck.
|
||||||
@@ -0,0 +1,105 @@
|
|||||||
|
# Rebuild / filter — column-first design
|
||||||
|
|
||||||
|
## Problem with the current two-pass design
|
||||||
|
|
||||||
|
`rebuild_partition` currently makes **two full passes** over source data:
|
||||||
|
|
||||||
|
**Pass 1** — read unitigs → MPHF lookup (source) → read row (108 values) → apply filter → push kmer into `GraphDeBruijn`, **discard row**.
|
||||||
|
|
||||||
|
**Pass 2** — read unitigs again → MPHF lookup again → read row again → for each passing kmer, look up slot in new MPHF → fill column builders.
|
||||||
|
|
||||||
|
Both passes do random access into the source matrix: for each kmer, the MPHF returns a slot, then we read 108 values scattered across 108 column positions. This is cache-hostile even with a packed matrix (`.pbmx`), because the matrix is column-major: consecutive row reads jump across the file.
|
||||||
|
|
||||||
|
## Memory budget
|
||||||
|
|
||||||
|
The `keep` bitvector costs **1 bit per slot**. With 256 partitions and realistic kmer counts, each partition holds at most a few tens of millions of slots → a few MB per bitvector. Even in the absolute worst case (800 M slots), it is only 100 MB (800 M bits ÷ 8). This is negligible.
|
||||||
|
|
||||||
|
The `slot_map` option (Option B, 8–16 bytes per slot) is heavier but still bounded: at 15 M slots and 8 bytes, that is 120 MB per partition, acceptable for a single worker.
|
||||||
|
|
||||||
|
## Key observation
|
||||||
|
|
||||||
|
**The filter operates on column values, not on kmers.** A filter like `--max-outgroup-count 0` only needs to know, for each slot, whether any outgroup column is non-zero. It does not need to know which kmer occupies that slot.
|
||||||
|
|
||||||
|
This means filtering can be done as a **sequential column scan** that produces a `keep: BitVec[n_slots]` — no MPHF lookups, no kmer knowledge, perfectly cache-friendly.
|
||||||
|
|
||||||
|
## Proposed single-scan design
|
||||||
|
|
||||||
|
### Step 1 — column scan → `keep` bitvector
|
||||||
|
|
||||||
|
```
|
||||||
|
for each column c in source matrix:
|
||||||
|
read column c sequentially (one mmap range)
|
||||||
|
update keep[slot] according to filter contribution of column c
|
||||||
|
```
|
||||||
|
|
||||||
|
For `GroupQuorumFilter` with ingroup/outgroup:
|
||||||
|
- ingroup columns: count presence per slot → `ingroup_count[slot]`
|
||||||
|
- outgroup columns: `keep[slot] &= (value[slot] == 0)` (early-exit possible)
|
||||||
|
|
||||||
|
Result: `keep: BitVec` of size `n_slots`, computed with purely sequential IO.
|
||||||
|
|
||||||
|
### Step 2 — unitig scan → kept kmers + new MPHF
|
||||||
|
|
||||||
|
```
|
||||||
|
for each kmer in unitig files:
|
||||||
|
old_slot = old_MPHF(kmer)
|
||||||
|
if keep[old_slot]:
|
||||||
|
push kmer into new GraphDeBruijn
|
||||||
|
record (old_slot, kmer) ← or just old_slot in order
|
||||||
|
```
|
||||||
|
|
||||||
|
Build new MPHF from `GraphDeBruijn` via `materialize_layer`.
|
||||||
|
|
||||||
|
### Step 3 — fill new matrix
|
||||||
|
|
||||||
|
Two sub-options:
|
||||||
|
|
||||||
|
**Option A — from recorded (old_slot, kmer) pairs:**
|
||||||
|
|
||||||
|
```
|
||||||
|
for each (old_slot, kmer) in recorded list:
|
||||||
|
new_slot = new_MPHF(kmer)
|
||||||
|
for each column c:
|
||||||
|
new_matrix[new_slot, c] = old_matrix[old_slot, c]
|
||||||
|
```
|
||||||
|
|
||||||
|
Memory cost: `n_kept × (8 + 8)` bytes for `(old_slot: usize, kmer: CanonicalKmer)`.
|
||||||
|
For species-specific filters, `n_kept` is small. For unfiltered rebuild, `n_kept = n_slots`.
|
||||||
|
|
||||||
|
**Option B — column-by-column copy using old→new slot mapping:**
|
||||||
|
|
||||||
|
Precompute `slot_map: Vec<Option<usize>>` of size `n_slots`:
|
||||||
|
- For each kmer in unitig file: `slot_map[old_MPHF(kmer)] = Some(new_MPHF(kmer))`
|
||||||
|
|
||||||
|
Then for each source column:
|
||||||
|
```
|
||||||
|
read source column sequentially
|
||||||
|
for each slot where slot_map[slot] = Some(new_slot):
|
||||||
|
write value to new column at new_slot
|
||||||
|
```
|
||||||
|
|
||||||
|
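Memory cost: `n_slots × size_of::<Option<usize>>()` for the slot map — 16 bytes per source slot on 64-bit targets, or 8 bytes per slot if dropped slots are encoded with a sentinel such as `usize::MAX` instead of `Option`.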
|
||||||
|
IO pattern: sequential read of each source column → random write into new column builders.
|
||||||
|
|
||||||
|
Option B avoids storing kmer values and works uniformly regardless of filter selectivity.
|
||||||
|
|
||||||
|
## Comparison
|
||||||
|
|
||||||
|
| | Current | Proposed |
|
||||||
|
|---|---|---|
|
||||||
|
| Disk reads | 2× unitigs + 2× random matrix | 1× columns (sequential) + 1× unitigs |
|
||||||
|
| MPHF lookups (source) | 2× N_kmers | 1× N_kmers (step 2 only; the column scans need none) |
|
||||||
|
| Cache behavior | poor (random row access) | good (sequential column scan) |
|
||||||
|
| Extra memory | none | slot_map (option B) or (old_slot, kmer) list (option A) |
|
||||||
|
|
||||||
|
## Files to modify
|
||||||
|
|
||||||
|
- `src/obikpartitionner/src/rebuild_layer.rs` — `rebuild_partition` and `iter_src_layers`
|
||||||
|
- Possibly `src/obicompactvec/` — add column iterator API if not already present
|
||||||
|
- `src/obilayeredmap/` — check if per-column sequential access is exposed on `SrcLayerData`
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- Does `SrcLayerData` expose per-column sequential iteration, or only `lookup(kmer, n_genomes)` random access?
|
||||||
|
- For option B: are new column builders writable in random-slot order (i.e. `set_val(slot, value)` without sequential constraint)?
|
||||||
|
- For `GroupQuorumFilter` specifically: can the filter be decomposed into independent per-column contributions, or does it need the full row?
|
||||||
@@ -29,16 +29,23 @@ Multiple values separated by `|` are always OR-ed within the predicate.
|
|||||||
|
|
||||||
### Path matching (`~` and `!~`)
|
### Path matching (`~` and `!~`)
|
||||||
|
|
||||||
Metadata values can represent hierarchical taxonomic paths such as
|
Metadata values can represent hierarchical concept paths such as
|
||||||
`/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
|
`/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
|
||||||
|
|
||||||
- **Absolute pattern** (starts with `/`): the value must start with the pattern
|
Stored taxonomy values always start with `/` (the root of the path).
|
||||||
at a segment boundary.
|
Query patterns do **not** need to start with `/` — a leading `/` is an optional
|
||||||
`taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
|
start anchor, not a requirement.
|
||||||
`/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`.
|
|
||||||
- **Bare segment** (no leading `/`): the value must contain the pattern as an
|
| Pattern form | Semantics |
|
||||||
exact path component anywhere.
|
|---|---|
|
||||||
`taxon~Betula` matches any path that has `Betula` as one of its segments.
|
| `A/B` | contiguous sub-path A then B, anywhere in the value |
|
||||||
|
| `/A/B` | value starts with A then B |
|
||||||
|
| `A/B$` | value ends with A then B |
|
||||||
|
| `/A/B$` | value is exactly A then B |
|
||||||
|
| `A@x/B` | A with class `x` followed by B with any class |
|
||||||
|
|
||||||
|
- `taxon~/Betulaceae/Betula` matches any path that starts with `Betulaceae` then `Betula`.
|
||||||
|
- `taxon~Betula` matches any path containing `Betula` as a segment, anywhere.
|
||||||
|
|
||||||
### Missing metadata key → NA
|
### Missing metadata key → NA
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,520 @@
|
|||||||
|
# obicompactvec — Complete Reference
|
||||||
|
|
||||||
|
## Module structure
|
||||||
|
|
||||||
|
```
|
||||||
|
src/obicompactvec/src/
|
||||||
|
lib.rs public re-exports
|
||||||
|
views.rs BitSliceView<'a>, IntSliceView<'a> — zero-copy read views
|
||||||
|
traits.rs ColumnWeights, CountPartials, BitPartials (matrix aggregation)
|
||||||
|
bitvec.rs PersistentBitVec, PersistentBitVecBuilder, BitIter
|
||||||
|
reader.rs PersistentCompactIntVec (read-only)
|
||||||
|
builder.rs PersistentCompactIntVecBuilder (read-write)
|
||||||
|
tempintvec.rs TempCompactIntVec, TempCompactIntVecBuilder (temp-file-backed)
|
||||||
|
tempbitvec.rs TempBitVec, TempBitVecBuilder (temp-file-backed)
|
||||||
|
bitmatrix.rs PersistentBitMatrix, PersistentBitMatrixBuilder
|
||||||
|
intmatrix.rs PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder
|
||||||
|
colgroup.rs ColGroup, MatrixGroupOps trait
|
||||||
|
format.rs file format constants, encode/decode helpers
|
||||||
|
layer_meta.rs LayerMeta (column metadata)
|
||||||
|
meta.rs matrix metadata
|
||||||
|
```
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
graph TD
|
||||||
|
views --> bitvec
|
||||||
|
views --> builder
|
||||||
|
views --> tempbitvec
|
||||||
|
views --> tempintvec
|
||||||
|
views --> bitmatrix
|
||||||
|
views --> intmatrix
|
||||||
|
format --> reader
|
||||||
|
format --> builder
|
||||||
|
reader --> intmatrix
|
||||||
|
reader --> tempintvec
|
||||||
|
builder --> intmatrix
|
||||||
|
builder --> tempintvec
|
||||||
|
bitvec --> tempbitvec
|
||||||
|
bitvec --> bitmatrix
|
||||||
|
tempintvec --> intmatrix
|
||||||
|
tempintvec --> bitmatrix
|
||||||
|
tempbitvec --> intmatrix
|
||||||
|
tempbitvec --> bitmatrix
|
||||||
|
colgroup --> intmatrix
|
||||||
|
colgroup --> bitmatrix
|
||||||
|
layer_meta --> bitmatrix
|
||||||
|
layer_meta --> intmatrix
|
||||||
|
meta --> bitmatrix
|
||||||
|
meta --> intmatrix
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Compact int encoding
|
||||||
|
|
||||||
|
All integer vectors use the same two-tier encoding regardless of storage backend.
|
||||||
|
|
||||||
|
**Primary array** — one `u8` per slot:
|
||||||
|
|
||||||
|
- Values **0–254** are stored directly. No overhead.
|
||||||
|
- Value **255 is a sentinel**: the slot's actual value is ≥ 255 and lives in the overflow store.
|
||||||
|
|
||||||
|
**Overflow store** — maps slot index to a `u32` value ≥ 255:
|
||||||
|
|
||||||
|
- In `PersistentCompactIntVecBuilder`: a `HashMap<usize, u32>` in RAM.
|
||||||
|
- In `PersistentCompactIntVec` (reader): a sorted `[(slot: u64, value: u32)]` array in the mmap, with a sparse L1-resident index for binary search.
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart LR
|
||||||
|
slot --> P["primary[slot]: u8"]
|
||||||
|
P -->|"< 255"| V["value = byte (0–254)"]
|
||||||
|
P -->|"= 255 sentinel"| OV["overflow store"]
|
||||||
|
OV -->|"Builder"| HM["HashMap<usize, u32>\nin RAM"]
|
||||||
|
OV -->|"PersistentCompactIntVec"| SA["sorted [(slot,value)] in mmap\n+ sparse L1 index"]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key property — sentinel 255 = +∞ on `u8`:**
|
||||||
|
|
||||||
|
- `min(a, 255) = a` for all `a ≤ 254` → correct when only one side is overflow
|
||||||
|
- `max(a, 255) = 255` → correct sentinel when either side is overflow
|
||||||
|
- Only the **both-overflow** case requires reading actual values from the overflow store.
|
||||||
|
|
||||||
|
In practice, k (overflow count) ≪ n (total slots): in observed genomic data, ~0.07% of kmer slots are in overflow.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## View types
|
||||||
|
|
||||||
|
The previous trait hierarchy (`BitSlice`, `BitSliceMut`, `IntSlice`, `IntSliceMut`) has been replaced by two concrete zero-copy view structs with inherent methods. Views are **`Copy`** — passing them is free. All read operations live on these two types.
|
||||||
|
|
||||||
|
### `BitSliceView<'a>`
|
||||||
|
|
||||||
|
```rust
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub struct BitSliceView<'a> { pub(crate) words: &'a [u64], pub(crate) n: usize }
|
||||||
|
```
|
||||||
|
|
||||||
|
Bit `i` is at `words[i >> 6]` bit `i & 63` (LSB-first). Padding bits in the last word are zero.
|
||||||
|
|
||||||
|
| Method | Cost |
|
||||||
|
|---|---|
|
||||||
|
| `len()`, `is_empty()` | O(1) |
|
||||||
|
| `get(slot)` | O(1) |
|
||||||
|
| `count_ones()` | POPCNT per word, O(n/64) |
|
||||||
|
| `count_zeros()` | `n − count_ones()`, O(n/64) |
|
||||||
|
| `iter() -> BitSliceIter<'a>` | O(1) setup, O(n) iteration |
|
||||||
|
| `partial_jaccard_dist(other: BitSliceView)` | `(a&b).popcount`, `(a\|b).popcount` per word, O(n/64) |
|
||||||
|
| `jaccard_dist(other: BitSliceView)` | from partial, O(n/64) |
|
||||||
|
| `hamming_dist(other: BitSliceView)` | `(a^b).popcount` per word, O(n/64) |
|
||||||
|
|
||||||
|
`BitSliceIter<'a>`: word-level scan; one word per 64 iterations.
|
||||||
|
|
||||||
|
### `IntSliceView<'a>`
|
||||||
|
|
||||||
|
```rust
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub struct IntSliceView<'a> {
|
||||||
|
pub(crate) primary: &'a [u8],
|
||||||
|
pub(crate) overflow_raw: &'a [u8], // sorted [(slot:u64, value:u32)] entries
|
||||||
|
pub(crate) n_overflow: usize,
|
||||||
|
pub(crate) n: usize,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`overflow_raw` contains `n_overflow` entries of `OVERFLOW_ENTRY_SIZE` bytes each, sorted by slot. The sort invariant is established at `close()`/`freeze()` time.
|
||||||
|
|
||||||
|
| Method | Cost |
|
||||||
|
|---|---|
|
||||||
|
| `len()`, `is_empty()` | O(1) |
|
||||||
|
| `primary_bytes()` | O(1) |
|
||||||
|
| `overflow_entries() -> impl Iterator<(usize,u32)>` | O(n_overflow) iteration |
|
||||||
|
| `get(slot)` | O(1) primary; binary search O(log k) for overflow slots |
|
||||||
|
| `iter() -> IntSliceViewIter<'a>` | merge scan, O(n + k) |
|
||||||
|
| `sum()` | byte scan + overflow, O(n + k) |
|
||||||
|
| `count_nonzero()` | byte scan, O(n) |
|
||||||
|
| Distance methods (`bray_dist`, `euclidean_dist`, `jaccard_dist`, …) | O(n + k) |
|
||||||
|
|
||||||
|
`IntSliceViewIter<'a>`: merge scan using `overflow_pos` index. Requires sorted overflow — guaranteed by the construction lifecycle.
|
||||||
|
|
||||||
|
**Builder `view()` vs reader `view()`:** `PersistentCompactIntVecBuilder` stores overflow as an unsorted `HashMap`, not raw bytes. Its `view()` returns an `IntSliceView` with `overflow_raw = &[]` and `n_overflow = 0`. This is intentional — the view is primarily useful after `freeze()`. During building, callers that need overflow use `overflow_entries()` directly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Concrete types
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
classDiagram
|
||||||
|
class BitSliceView {
|
||||||
|
+words: &[u64]
|
||||||
|
+n: usize
|
||||||
|
+get(slot) bool
|
||||||
|
+count_ones() u64
|
||||||
|
+iter() BitSliceIter
|
||||||
|
+jaccard_dist/hamming_dist(other: BitSliceView)
|
||||||
|
}
|
||||||
|
class IntSliceView {
|
||||||
|
+primary: &[u8]
|
||||||
|
+overflow_raw: &[u8]
|
||||||
|
+n_overflow: usize
|
||||||
|
+n: usize
|
||||||
|
+get(slot) u32
|
||||||
|
+iter() IntSliceViewIter
|
||||||
|
+overflow_entries() Iterator
|
||||||
|
+bray_dist/euclidean_dist/…(other: IntSliceView)
|
||||||
|
}
|
||||||
|
class PersistentBitVec {
|
||||||
|
-mmap: Mmap
|
||||||
|
-n: usize
|
||||||
|
+view() BitSliceView
|
||||||
|
+get(slot) bool
|
||||||
|
+count_ones/zeros() u64
|
||||||
|
+iter() BitIter
|
||||||
|
+partial_jaccard_dist(&Self) (u64,u64)
|
||||||
|
+jaccard_dist/hamming_dist(&Self) …
|
||||||
|
}
|
||||||
|
class PersistentBitVecBuilder {
|
||||||
|
-mmap: MmapMut
|
||||||
|
-n: usize
|
||||||
|
+view() BitSliceView
|
||||||
|
+set(slot, bool)
|
||||||
|
+or/and/xor/not(BitSliceView)
|
||||||
|
+copy_from(BitSliceView)
|
||||||
|
+close() / finish() → PersistentBitVec
|
||||||
|
}
|
||||||
|
class PersistentCompactIntVec {
|
||||||
|
-mmap: Mmap
|
||||||
|
-n: usize
|
||||||
|
-n_overflow: usize
|
||||||
|
-step: usize
|
||||||
|
-index: Vec~(usize,usize)~
|
||||||
|
+view() IntSliceView
|
||||||
|
+get(slot) u32
|
||||||
|
+iter() Iter
|
||||||
|
+sum/count_nonzero() u64
|
||||||
|
+bray_dist/euclidean_dist/… (&Self)
|
||||||
|
}
|
||||||
|
class PersistentCompactIntVecBuilder {
|
||||||
|
-mmap: MmapMut
|
||||||
|
-n: usize
|
||||||
|
-overflow: HashMap~usize,u32~
|
||||||
|
+view() IntSliceView
|
||||||
|
+set(slot, u32) / get(slot) u32
|
||||||
|
+inc / inc_present / inc_present_fast
|
||||||
|
+inc_predicate / inc_predicate_fast
|
||||||
|
+add/min/max/diff/mask_with(…View)
|
||||||
|
+primary_bytes/primary_bytes_mut()
|
||||||
|
+close() / finish() → PersistentCompactIntVec
|
||||||
|
}
|
||||||
|
|
||||||
|
PersistentBitVec --> BitSliceView : view()
|
||||||
|
PersistentBitVecBuilder --> BitSliceView : view()
|
||||||
|
PersistentCompactIntVec --> IntSliceView : view()
|
||||||
|
PersistentCompactIntVecBuilder --> IntSliceView : view() (primary only)
|
||||||
|
PersistentBitVecBuilder --> PersistentBitVec : close() then open()
|
||||||
|
PersistentCompactIntVecBuilder --> PersistentCompactIntVec : close() then open()
|
||||||
|
```
|
||||||
|
|
||||||
|
### `PersistentBitVec` / `PersistentBitVecBuilder`
|
||||||
|
|
||||||
|
`PersistentBitVec` is the read-only type. `view()` returns a `BitSliceView<'_>` over the mmap word array. Direct inherent methods delegate to the view: `count_ones()`, `count_zeros()`, `partial_jaccard_dist(&Self)`, `jaccard_dist(&Self)`, `hamming_dist(&Self)`.
|
||||||
|
|
||||||
|
`BitIter<'a>` — exported iterator for `PersistentBitVec::iter()`:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct BitIter<'a> { pub(crate) words: &'a [u64], pub(crate) slot: usize, pub(crate) n: usize }
|
||||||
|
```
|
||||||
|
|
||||||
|
`PersistentBitVecBuilder` is the read-write type. Mutation operations accept `BitSliceView<'_>`:
|
||||||
|
|
||||||
|
| Method | Cost |
|
||||||
|
|---|---|
|
||||||
|
| `set(slot, bool)` | O(1) |
|
||||||
|
| `view() -> BitSliceView<'_>` | O(1) |
|
||||||
|
| `or/and/xor(BitSliceView)` | word-level, O(n/64), SIMD-friendly |
|
||||||
|
| `not()` | `w ^= u64::MAX` per word, re-masks last word, O(n/64) |
|
||||||
|
| `copy_from(BitSliceView)` | `copy_from_slice`, O(n/64) |
|
||||||
|
|
||||||
|
### `PersistentCompactIntVec` / `PersistentCompactIntVecBuilder`
|
||||||
|
|
||||||
|
`PersistentCompactIntVec` is the read-only type. `view()` returns an `IntSliceView<'_>` over the mmap primary and overflow arrays. Inherent `iter()` is a merge scan (`Iter` struct). Inherent `sum()` and `count_nonzero()` use fast byte-scan helpers.
|
||||||
|
|
||||||
|
`PersistentCompactIntVecBuilder` is the read-write type. Mutation methods on the builder fall into two categories:
|
||||||
|
|
||||||
|
**Point mutations:**
|
||||||
|
|
||||||
|
| Method | Note |
|
||||||
|
|---|---|
|
||||||
|
| `set(slot, u32)` | writes primary[slot] or 255+overflow |
|
||||||
|
| `get(slot) -> u32` | reads primary byte or HashMap |
|
||||||
|
| `inc(slot)` | `get` + `set`, O(1) |
|
||||||
|
|
||||||
|
**Bulk computation methods** — accept view arguments:
|
||||||
|
|
||||||
|
| Method | Semantics | Overflow |
|
||||||
|
|---|---|---|
|
||||||
|
| `inc_present(BitSliceView)` | `+= 1` at each 1-bit | via `inc`, safe for any group size |
|
||||||
|
| `inc_present_fast(BitSliceView)` | same, raw u8 `+= 1` | `debug_assert` no 255 reached |
|
||||||
|
| `inc_predicate(IntSliceView, pred)` | `+= 1` where `pred(col[s])` | two-pass, safe |
|
||||||
|
| `inc_predicate_fast(IntSliceView, pred)` | same, raw u8 | `debug_assert` no 255 reached |
|
||||||
|
| `add(IntSliceView)` | `self[s] += other[s]` | primary fast path + overflow fallback |
|
||||||
|
| `min(IntSliceView)` | byte min + both-overflow fixup | see algorithm below |
|
||||||
|
| `max(IntSliceView)` | pre-pass + byte max | see algorithm below |
|
||||||
|
| `diff(IntSliceView)` | saturating sub | self<255 hot path |
|
||||||
|
| `mask_with(BitSliceView)` | zeros slots where mask bit = 0 | O(n_zeros) |
|
||||||
|
|
||||||
|
**`inc_present_fast` / `inc_predicate_fast` invariant:** caller guarantees no counter reaches 255 during the operation (group size < 255 for `inc_present_fast`, or chunk size < 255 for `inc_predicate_fast`). Violation is caught by `debug_assert` in dev builds.
|
||||||
|
|
||||||
|
**`min` algorithm:**
|
||||||
|
|
||||||
|
Exploits 255 = +∞: byte-level min is correct unless both sides are overflow.
|
||||||
|
|
||||||
|
```
|
||||||
|
snapshot self_ov: Vec<(slot,val)>
|
||||||
|
snapshot other_ov: HashMap<slot,val>
|
||||||
|
clear_overflow()
|
||||||
|
Pass 1 — byte min, SIMD-vectorizable, O(n)
|
||||||
|
Pass 2 — both-overflow fixup, O(k_self):
|
||||||
|
for (slot, self_val) in self_ov:
|
||||||
|
if slot ∈ other_ov: set(slot, min(self_val, other_ov[slot]))
|
||||||
|
```
|
||||||
|
|
||||||
|
**`max` algorithm:**
|
||||||
|
|
||||||
|
Cannot do byte max first — `max(255, b<255)=255` overwrites self's original overflow value. Pre-pass reads self's value at other's overflow slots before the byte pass.
|
||||||
|
|
||||||
|
```
|
||||||
|
Pre-pass O(k_other): for (slot, other_val) in other.overflow_entries():
|
||||||
|
set(slot, max(self.get(slot), other_val))
|
||||||
|
Pass 1 — byte max, SIMD-vectorizable, O(n)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Matrix types
|
||||||
|
|
||||||
|
Four matrix layouts — two encodings × two formats — exposed through two types:
|
||||||
|
|
||||||
|
| | Columnar format | Packed format |
|
||||||
|
|---|---|---|
|
||||||
|
| **Bit** | `PersistentBitMatrix` (Columnar variant) | `PersistentBitMatrix` (Packed variant) |
|
||||||
|
| **Int** | `PersistentCompactIntMatrix` (Columnar variant) | `PersistentCompactIntMatrix` (Packed variant) |
|
||||||
|
|
||||||
|
Both matrix types are enums (`Columnar` / `Packed`, plus `Implicit` for the bit matrix only) behind a transparent API. `col_view(c)` returns the appropriate view directly:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// PersistentBitMatrix
|
||||||
|
pub fn col_view(&self, c: usize) -> BitSliceView<'_>
|
||||||
|
|
||||||
|
// PersistentCompactIntMatrix
|
||||||
|
pub fn col_view(&self, c: usize) -> IntSliceView<'_>
|
||||||
|
```
|
||||||
|
|
||||||
|
No wrapper enums (`BitColView`, `IntColView`): the caller receives a `Copy` view struct immediately usable with any view method or bulk builder method.
|
||||||
|
|
||||||
|
`pack_compact_int_matrix` and `pack_bit_matrix` convert columnar → packed format.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Aggregation traits (matrix level)
|
||||||
|
|
||||||
|
### ColumnWeights
|
||||||
|
|
||||||
|
```rust
|
||||||
|
trait ColumnWeights: Send + Sync {
|
||||||
|
fn col_weights(&self) -> Array1<u64>; // sum per column
|
||||||
|
fn partial_kmer_counts(&self) -> Array1<u64>; // default = col_weights()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`partial_kmer_counts` is overridden for count matrices to return `count_nonzero` per column (distinct kmers) rather than total count.
|
||||||
|
|
||||||
|
### CountPartials
|
||||||
|
|
||||||
|
Abstract required methods: `partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`, `partial_relfreq_bray`, `partial_relfreq_euclidean`, `partial_hellinger`.
|
||||||
|
|
||||||
|
**Additivity rule:** self-contained partials (`partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`) can be element-wise summed across all `(partition, layer)` pairs. Normalised partials (`partial_relfreq_*`, `partial_hellinger`) require the **global** `col_weights` (accumulated across all layers and all partitions) as parameter.
|
||||||
|
|
||||||
|
**`partial_threshold_jaccard` returns `(inter, union)`** because `union[i,j]` depends on both columns simultaneously.
|
||||||
|
|
||||||
|
Provided finalisations:
|
||||||
|
|
||||||
|
| Finalisation | Formula |
|
||||||
|
|---|---|
|
||||||
|
| `bray_dist_matrix()` | `1 − 2·partial_bray[i,j] / (w[i] + w[j])` |
|
||||||
|
| `euclidean_dist_matrix()` | `√partial_euclidean[i,j]` |
|
||||||
|
| `threshold_jaccard_dist_matrix(t)` | `1 − inter[i,j] / union[i,j]` |
|
||||||
|
| `relfreq_bray_dist_matrix()` | `1 − partial_relfreq_bray[i,j]` |
|
||||||
|
| `relfreq_euclidean_dist_matrix()` | `√partial_relfreq_euclidean[i,j]` |
|
||||||
|
| `hellinger_dist_matrix()` | `√partial_hellinger[i,j] / √2` |
|
||||||
|
| `hellinger_euclidean_dist_matrix()` | `√partial_hellinger[i,j]` |
|
||||||
|
|
||||||
|
### BitPartials
|
||||||
|
|
||||||
|
Required: `partial_jaccard() -> (Array2<u64>, Array2<u64>)`, `partial_hamming() -> Array2<u64>`. Both additive across layers and partitions.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Temp-file-backed types
|
||||||
|
|
||||||
|
**All inter-function results use temp-file-backed types** so the OS can page them out under memory pressure. This matters in practice: processing dozens of layers × hundreds of partitions in parallel would otherwise accumulate gigabytes of live anonymous memory.
|
||||||
|
|
||||||
|
### Lifecycle
|
||||||
|
|
||||||
|
```
|
||||||
|
TempCompactIntVecBuilder::new(n) → writable mmap in TempDir
|
||||||
|
↓ (inc_present_fast / inc_predicate_fast / add / mask_with / …)
|
||||||
|
.freeze() → TempCompactIntVec (read-only mmap + TempDir)
|
||||||
|
↓ (optional)
|
||||||
|
.make_persistent(path) → PersistentCompactIntVec (permanent file)
|
||||||
|
```
|
||||||
|
|
||||||
|
Same pattern for `TempBitVecBuilder` → `TempBitVec` → `PersistentBitVec`.
|
||||||
|
|
||||||
|
**Drop order**: `TempCompactIntVec { vec: PersistentCompactIntVec, _temp: TempDir }` — Rust drops fields in declaration order. `vec` (mmap) released before `_temp` (directory deleted). No explicit `drop()` needed.
|
||||||
|
|
||||||
|
### TempCompactIntVec / TempCompactIntVecBuilder
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct TempCompactIntVec {
|
||||||
|
vec: PersistentCompactIntVec,
|
||||||
|
_temp: TempDir, // dropped after vec
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) struct TempCompactIntVecBuilder {
|
||||||
|
builder: PersistentCompactIntVecBuilder,
|
||||||
|
temp: TempDir,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`TempCompactIntVec`: read access via `get(slot)`, `sum()`, `iter()`, `view() -> IntSliceView<'_>`.
|
||||||
|
|
||||||
|
`TempCompactIntVecBuilder`: full delegation to inner `PersistentCompactIntVecBuilder` — all bulk computation methods (`inc_present_fast`, `inc_predicate_fast`, `add`, `min`, `max`, `diff`, `mask_with`) are exposed as `pub(crate)`.
|
||||||
|
|
||||||
|
### TempBitVec / TempBitVecBuilder
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct TempBitVec {
|
||||||
|
vec: PersistentBitVec,
|
||||||
|
_temp: TempDir,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) struct TempBitVecBuilder {
|
||||||
|
builder: PersistentBitVecBuilder,
|
||||||
|
temp: TempDir,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`TempBitVec`: read access via `get(slot)`, `count_ones()`, `view() -> BitSliceView<'_>`, `iter()`.
|
||||||
|
|
||||||
|
`TempBitVecBuilder`: exposes `set(slot, bool)`, `or(BitSliceView)`, and:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub(crate) fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool)
|
||||||
|
```
|
||||||
|
|
||||||
|
`or_where` — two passes, no intermediate allocation:
|
||||||
|
|
||||||
|
```
|
||||||
|
Pass 1 — primary bytes, O(n):
|
||||||
|
for slot in 0..n:
|
||||||
|
b = col.primary_bytes()[slot]
|
||||||
|
if b < 255 AND pred(b as u32): self.set(slot, true)
|
||||||
|
|
||||||
|
Pass 2 — overflow, O(k):
|
||||||
|
for (slot, val) in col.overflow_entries():
|
||||||
|
if pred(val): self.set(slot, true)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Filter / Select API
|
||||||
|
|
||||||
|
### ColGroup
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct ColGroup { pub name: String, pub indices: Vec<usize> }
|
||||||
|
```
|
||||||
|
|
||||||
|
Defined **once at the index level** from column metadata. Valid in all matrices of all layers and partitions — column structure is identical across the entire hierarchy; only rows (kmer slots) are partitioned.
|
||||||
|
|
||||||
|
### Composition axis
|
||||||
|
|
||||||
|
- **Across partitions**: kmer space is partitioned → partial results **concatenated** (disjoint kmer ranges).
|
||||||
|
- **Across layers**: same kmer space, different counts → partial results **aggregated** (add, OR, etc.).
|
||||||
|
|
||||||
|
### MatrixGroupOps
|
||||||
|
|
||||||
|
Five required primitives + two default methods derived from `partial_group_presence_count`. All return temp-file-backed types.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub trait MatrixGroupOps {
|
||||||
|
// required
|
||||||
|
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32)
|
||||||
|
-> io::Result<TempCompactIntVec>;
|
||||||
|
fn partial_group_sum(&self, g: &ColGroup)
|
||||||
|
-> io::Result<TempCompactIntVec>;
|
||||||
|
fn partial_group_any(&self, g: &ColGroup, threshold: u32)
|
||||||
|
-> io::Result<TempBitVec>;
|
||||||
|
fn partial_group_min(&self, g: &ColGroup)
|
||||||
|
-> io::Result<TempCompactIntVec>;
|
||||||
|
fn partial_group_max(&self, g: &ColGroup)
|
||||||
|
-> io::Result<TempCompactIntVec>;
|
||||||
|
|
||||||
|
// defaults derived from partial_group_presence_count
|
||||||
|
fn partial_group_all(&self, g: &ColGroup, threshold: u32)
|
||||||
|
-> io::Result<TempBitVec>; // slot=1 iff count == g.indices.len()
|
||||||
|
fn partial_group_none(&self, g: &ColGroup, threshold: u32)
|
||||||
|
-> io::Result<TempBitVec>; // slot=1 iff count == 0
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Implemented for both `PersistentCompactIntMatrix` and `PersistentBitMatrix`.
|
||||||
|
|
||||||
|
For **bit matrices**: values are 0/1, so `partial_group_sum` = `partial_group_presence_count(g, 1)`; `partial_group_min` is AND (set first column then mask-with remaining); `partial_group_max` is OR via `partial_group_any` + `inc_present`.
|
||||||
|
|
||||||
|
**`partial_group_presence_count` — chunking for large groups:**
|
||||||
|
|
||||||
|
When `g.indices.len() < 255`: per-slot counts stay at most 254, below the 255 overflow sentinel. Use `inc_present_fast` (bit) or `inc_predicate_fast(col_view(c), |v| v >= threshold)` (int) — raw u8 increment, no overflow entry written.
|
||||||
|
|
||||||
|
When `g.indices.len() ≥ 255`: process in chunks of 254 columns, accumulate via `.add(chunk_frozen.view())`.
|
||||||
|
|
||||||
|
**`partial_group_min` (int matrix)**: copy first column via `.add(col_view(first))` (start from 0 ⇒ copy), then `.min(col_view(c))` for remaining.
|
||||||
|
|
||||||
|
**`partial_group_max` (int matrix)**: `.max(col_view(c))` for all columns (start from 0 ⇒ first column acts as copy).
|
||||||
|
|
||||||
|
**`partial_group_any`** uses `or_where` on `TempBitVecBuilder` (two-pass: primary bytes then overflow entries).
|
||||||
|
|
||||||
|
**`partial_group_all` / `partial_group_none`** (default): call `partial_group_presence_count`, then iterate slots to produce the bit result. O(n) extra pass, not chunked.
|
||||||
|
|
||||||
|
### add_col_from — matrix builder integration
|
||||||
|
|
||||||
|
Both matrix builders accept temp-file results directly:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// PersistentBitMatrixBuilder
|
||||||
|
fn add_col_from(&mut self, src: &TempBitVec) -> io::Result<()>
|
||||||
|
fn add_col_from_int(&mut self, src: &TempCompactIntVec) -> io::Result<()> // nonzero → 1
|
||||||
|
|
||||||
|
// PersistentCompactIntMatrixBuilder
|
||||||
|
fn add_col_from(&mut self, src: &TempCompactIntVec) -> io::Result<()>
|
||||||
|
fn add_col_from_bit(&mut self, src: &TempBitVec) -> io::Result<()> // bit → 0/1 u32
|
||||||
|
```
|
||||||
|
|
||||||
|
`add_col_from` copies the temp file to the matrix directory and increments `n_cols`; `close()` writes `meta.json` with the final column count. No separate `write_meta` step needed.
|
||||||
|
|
||||||
|
### mask_with
|
||||||
|
|
||||||
|
Direct method on `PersistentCompactIntVecBuilder` (and delegation via `TempCompactIntVecBuilder`). Zeros every slot where the corresponding mask bit is 0. Scans the mask word by word and visits only zero bits — O(n/64 + n_zeros); all-ones words are skipped with a single comparison.
|
||||||
|
|
||||||
|
```
|
||||||
|
for (w_idx, word) in mask.words():
|
||||||
|
if word == u64::MAX: continue // skip all-ones words
|
||||||
|
zeros = !word
|
||||||
|
while zeros != 0:
|
||||||
|
bit = trailing_zeros(zeros)
|
||||||
|
s = w_idx * 64 + bit
|
||||||
|
if primary[s] != 0: set(s, 0) // clears overflow entry too
|
||||||
|
zeros &= zeros − 1
|
||||||
|
```
|
||||||
|
|
||||||
|
`mask_with` is the terminal operation for Filter (retain only selected kmer slots in a count vector) and Select (positional selection without MPHF).
|
||||||
@@ -0,0 +1,143 @@
|
|||||||
|
# `obitaxonomy` — taxonomy concept paths
|
||||||
|
|
||||||
|
`obitaxonomy` is a dependency-free crate that defines a typed representation
|
||||||
|
of hierarchical concept paths (taxonomic or otherwise) stored in genome metadata.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Concept path syntax
|
||||||
|
|
||||||
|
A concept path is stored as a metadata value with the prefix `taxonomy:/`:
|
||||||
|
|
||||||
|
```
|
||||||
|
taxonomy:/Enterobacteriaceae@family/Escherichia@genus/Escherichia coli@species
|
||||||
|
```
|
||||||
|
|
||||||
|
Structure:
|
||||||
|
|
||||||
|
- The `taxonomy:/` prefix is the type discriminator. Any metadata value starting
|
||||||
|
with it is parsed as a `TaxPath`; all others remain plain strings.
|
||||||
|
- The remainder is one or more `/`-separated segments.
|
||||||
|
- Each segment is `name` or `name@rank`, where `rank` is a label for the
|
||||||
|
taxonomic level (e.g. `family`, `genus`, `species`).
|
||||||
|
- Rank annotations are **optional per segment** and can be mixed freely.
|
||||||
|
- Spaces are allowed in both names and ranks.
|
||||||
|
|
||||||
|
### Reserved character
|
||||||
|
|
||||||
|
`@` is reserved throughout the taxonomy system and may **not** appear in:
|
||||||
|
|
||||||
|
| Context | Constraint |
|
||||||
|
|---------|------------|
|
||||||
|
| Segment name | forbidden |
|
||||||
|
| Rank/class label | forbidden |
|
||||||
|
| Metadata key names | forbidden (used as `key@rank` in predicate syntax) |
|
||||||
|
|
||||||
|
`@` is freely allowed in plain-text metadata values (non-taxonomy).
|
||||||
|
|
||||||
|
### Parse errors
|
||||||
|
|
||||||
|
| Condition | Error |
|
||||||
|
|-----------|-------|
|
||||||
|
| Value does not start with `taxonomy:/` | `MissingPrefix` |
|
||||||
|
| No segments after the prefix | `EmptyPath` |
|
||||||
|
| Segment with empty name (consecutive `/`) | `EmptySegmentName` |
|
||||||
|
| Segment with trailing `@` and no rank (`name@`) | `EmptyRankName` |
|
||||||
|
| Segment with more than one `@` | `AmbiguousRank` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Public API
|
||||||
|
|
||||||
|
### `TaxSegment`
|
||||||
|
|
||||||
|
A single node: a name and an optional rank.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
seg.name() // &str
|
||||||
|
seg.rank() // Option<&str>
|
||||||
|
seg.to_string() // "name" or "name@rank"
|
||||||
|
TaxSegment::parse(s) // Result<TaxSegment, TaxError>
|
||||||
|
```
|
||||||
|
|
||||||
|
### `TaxPath`
|
||||||
|
|
||||||
|
```rust
|
||||||
|
TaxPath::parse(s) // Result<TaxPath, TaxError>
|
||||||
|
path.segments() // &[TaxSegment]
|
||||||
|
path.depth() // usize — number of segments
|
||||||
|
path.is_ancestor_of(&other) // bool — prefix match by name, ranks ignored
|
||||||
|
path.name_at_rank("genus") // Option<&str>
|
||||||
|
path.to_string() // reconstructs "taxonomy:/…"
|
||||||
|
```
|
||||||
|
|
||||||
|
`is_ancestor_of` compares segment **names** only — rank annotations are
|
||||||
|
informational and do not affect the ancestry relation.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let a: TaxPath = "taxonomy:/Enterobacteriaceae@family/Escherichia@genus".parse()?;
|
||||||
|
let b: TaxPath = "taxonomy:/Enterobacteriaceae@family/Escherichia@genus/Escherichia coli@species".parse()?;
|
||||||
|
|
||||||
|
assert!(a.is_ancestor_of(&b)); // true
|
||||||
|
assert!(b.is_ancestor_of(&a)); // false
|
||||||
|
assert!(a.is_ancestor_of(&a)); // true (equal ⇒ ancestor)
|
||||||
|
|
||||||
|
assert_eq!(b.name_at_rank("species"), Some("Escherichia coli"));
|
||||||
|
assert_eq!(b.name_at_rank("genus"), Some("Escherichia"));
|
||||||
|
assert_eq!(b.name_at_rank("order"), None);
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Integration with `GenomeInfo`
|
||||||
|
|
||||||
|
At index load time, every metadata value is inspected once:
|
||||||
|
|
||||||
|
- Starts with `taxonomy:/` → parsed into `TaxPath`, stored in `genome.taxonomy`.
|
||||||
|
- Otherwise → kept as-is in `genome.meta`.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
struct GenomeInfo {
|
||||||
|
label: String,
|
||||||
|
meta: HashMap<String, String>, // plain text metadata
|
||||||
|
taxonomy: HashMap<String, TaxPath>, // parsed taxonomy metadata
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The raw string is not duplicated. `TaxPath::to_string()` reconstructs the
|
||||||
|
original value losslessly for serialisation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Predicate operators (in `filter` / `select`)
|
||||||
|
|
||||||
|
Path predicates use the `~` / `!~` operators. The **stored value** always starts
|
||||||
|
with `/` (rooted path); the **query pattern** does not need to.
|
||||||
|
|
||||||
|
### Path pattern syntax
|
||||||
|
|
||||||
|
| Pattern | Semantics |
|
||||||
|
|---------|-----------|
|
||||||
|
| `A/B` | contiguous sub-path A then B, anywhere in the value |
|
||||||
|
| `/A/B` | value starts with A then B (start-anchored) |
|
||||||
|
| `A/B$` | value ends with A then B (end-anchored) |
|
||||||
|
| `/A/B$` | value is exactly A then B (fully anchored) |
|
||||||
|
| `A@x/B` | A with class `x` followed by B with any class |
|
||||||
|
| `A@x/B@y` | A with class `x` followed by B with class `y` |
|
||||||
|
|
||||||
|
A segment pattern without `@` matches the segment name regardless of its stored class.
|
||||||
|
|
||||||
|
### Rank-aware queries
|
||||||
|
|
||||||
|
```
|
||||||
|
key@rank=value
|
||||||
|
```
|
||||||
|
|
||||||
|
| Predicate form | Semantics |
|
||||||
|
|----------------|-----------|
|
||||||
|
| `key@rank=value` | genome's `key` has `value` at rank `rank` |
|
||||||
|
| `key@rank!=value` | genome's `key` does not have `value` at rank `rank` |
|
||||||
|
| `key@rank=v1\|v2` | value at `rank` is `v1` or `v2` |
|
||||||
|
|
||||||
|
`~` combined with `@rank` on the key (e.g. `key@genus~pattern`) is not defined
|
||||||
|
and is rejected at parse time.
|
||||||
@@ -0,0 +1,84 @@
|
|||||||
|
# Installation
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
### Rust toolchain
|
||||||
|
|
||||||
|
`obikmer` requires **Rust 1.85 or later** (edition 2024). Install or update via [rustup](https://rustup.rs):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||||
|
rustup update stable
|
||||||
|
```
|
||||||
|
|
||||||
|
### C build environment (required for hwloc)
|
||||||
|
|
||||||
|
`obikmer` embeds [hwloc](https://www.open-mpi.org/projects/hwloc/) (Hardware Locality) for NUMA-aware thread placement on multi-socket machines. hwloc is built from source at compile time via the `vendored` feature of the `hwlocality` crate. This requires a standard C build environment.
|
||||||
|
|
||||||
|
#### Linux (Debian/Ubuntu)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
apt install build-essential automake libtool autoconf pkg-config
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Linux (RHEL/Rocky/AlmaLinux)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
dnf install gcc make automake libtool autoconf pkgconfig
|
||||||
|
```
|
||||||
|
|
||||||
|
#### HPC clusters
|
||||||
|
|
||||||
|
Most HPC clusters provide these tools via the module system:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
module load gcc automake libtool autoconf
|
||||||
|
```
|
||||||
|
|
||||||
|
If in doubt, check whether `autoreconf --version` and `libtool --version` return successfully.
|
||||||
|
|
||||||
|
#### macOS
|
||||||
|
|
||||||
|
```bash
|
||||||
|
brew install automake libtool autoconf pkg-config
|
||||||
|
```
|
||||||
|
|
||||||
|
## Building
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone <repository-url>
|
||||||
|
cd obikmer/src
|
||||||
|
cargo build --release
|
||||||
|
```
|
||||||
|
|
||||||
|
The compiled binary is at `target/release/obikmer`.
|
||||||
|
|
||||||
|
### Building on HPC clusters (network filesystems)
|
||||||
|
|
||||||
|
HPC home directories are typically on a network filesystem (Lustre, NFS) optimised for large sequential reads — not for the thousands of small file operations that Cargo generates during compilation. Building directly on such a filesystem can be extremely slow (0.1% CPU utilisation, tens of minutes for what should take seconds).
|
||||||
|
|
||||||
|
**Always redirect the build directory to a local scratch disk:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CARGO_TARGET_DIR=/scratch/$USER/cargo-target cargo build --release
|
||||||
|
```
|
||||||
|
|
||||||
|
Adapt the path to the local scratch available on your cluster (`/var/tmp`, `/tmp`, `/scratch/local`, etc.). Once built, copy the binary to a permanent location:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp /scratch/$USER/cargo-target/release/obikmer ~/bin/
|
||||||
|
```
|
||||||
|
|
||||||
|
## NUMA support
|
||||||
|
|
||||||
|
NUMA-aware thread placement is active automatically on multi-socket Linux machines (detected at runtime via hwloc). No special build flag is required — the detection is built in and falls back gracefully to the single-pool adaptive strategy on:
|
||||||
|
|
||||||
|
- macOS (Apple Silicon, unified memory)
|
||||||
|
- single-socket Linux machines
|
||||||
|
- any system where hwloc reports only one NUMA node
|
||||||
|
|
||||||
|
## Verifying the installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
obikmer --help
|
||||||
|
```
|
||||||
@@ -29,6 +29,7 @@ extra_javascript:
|
|||||||
|
|
||||||
nav:
|
nav:
|
||||||
- Home: index.md
|
- Home: index.md
|
||||||
|
- Installation: installation.md
|
||||||
- Theory:
|
- Theory:
|
||||||
- Kmers and super-kmers: kmers.md
|
- Kmers and super-kmers: kmers.md
|
||||||
- DNA encoding: theory/encoding.md
|
- DNA encoding: theory/encoding.md
|
||||||
@@ -52,9 +53,12 @@ nav:
|
|||||||
- Merge parallelism & memory: implementation/merge_parallelism.md
|
- Merge parallelism & memory: implementation/merge_parallelism.md
|
||||||
- Kmer filtering: implementation/filtering.md
|
- Kmer filtering: implementation/filtering.md
|
||||||
- Select command: implementation/select.md
|
- Select command: implementation/select.md
|
||||||
|
- obitaxonomy crate: implementation/obitaxonomy.md
|
||||||
- Architecture:
|
- Architecture:
|
||||||
- Sequences: architecture/sequences/invariant.md
|
- Sequences: architecture/sequences/invariant.md
|
||||||
- Kmer index: architecture/index_architecture.md
|
- Kmer index: architecture/index_architecture.md
|
||||||
|
- NUMA-aware worker pools: architecture/numa_worker_pools.md
|
||||||
|
- NUMA-aware partition runner: architecture/numa_partition_runner.md
|
||||||
|
|
||||||
watch:
|
watch:
|
||||||
- docmd
|
- docmd
|
||||||
|
|||||||
@@ -0,0 +1,44 @@
|
|||||||
|
# La crate obicompactvector
|
||||||
|
|
||||||
|
Le code actuel est ce qu'il est. Ce n'est pas la vérité absolue, c'est un premier effort d'implémentation, rien de plus. Ci-dessous je vais décrire les objectifs et la structure qui devrait être. LA VÉRITÉ À ATTEINDRE.
|
||||||
|
|
||||||
|
La crate fournit des représentations les plus compactes possible en mémoire de matrices de comptage ou de présence de k-mers dans des génomes. Chaque colonne représente un génome, chaque ligne un k-mer. Une matrice est une collection de vecteurs où chacun des vecteurs est une colonne de la matrice.
|
||||||
|
|
||||||
|
Les matrices comme les colonnes ont vocation à être persistantes. Les données sont stockées dans des fichiers binaires. Les données sont mappées en mémoire via `mmap`.
|
||||||
|
|
||||||
|
Les structures sont par essence immuables. Il existe des représentations mutables des colonnes qui permettent leur construction. À la fin de leur construction, les colonnes sont fermées, ce qui les rend immuables.
|
||||||
|
|
||||||
|
Les matrices peuvent être représentées de deux façons :
|
||||||
|
- via un répertoire contenant une collection de fichiers colonnes
|
||||||
|
- via un fichier matrix qui est la concaténation de plusieurs fichiers colonnes.
|
||||||
|
|
||||||
|
|
||||||
|
## Les matrices de comptage
|
||||||
|
|
||||||
|
Ce sont des matrices d'entiers positifs, la plupart du temps de petites valeurs (inférieures à 255). On suppose que toutes les valeurs sont représentables sur un `u32`.
|
||||||
|
|
||||||
|
## Les matrices de présence
|
||||||
|
|
||||||
|
Ce sont des matrices de booléens représentées comme des champs de bits.
|
||||||
|
|
||||||
|
Il existe une forme implicite des vecteurs de présence, qui n'est représentée par aucun fichier et pour laquelle toutes les valeurs sont vraies.
|
||||||
|
|
||||||
|
## Représentation légère des colonnes
|
||||||
|
|
||||||
|
Les colonnes, qu'elles soient unitaires (fichier colonne) ou partie d'un fichier composite matrice, peuvent être représentées par un objet léger donnant accès à leurs valeurs ainsi qu'à la longueur du vecteur. Toutes les méthodes de calcul doivent uniquement travailler à partir de ces représentations légères unifiées des colonnes.
|
||||||
|
|
||||||
|
### Représentation légère d'un vecteur de présence
|
||||||
|
|
||||||
|
Le vecteur est représenté par
|
||||||
|
- un champ de bits encodé comme un `[u64]`
|
||||||
|
- un `usize` encodant la longueur du champ de bits
|
||||||
|
|
||||||
|
### Représentation légère d'un vecteur de comptage
|
||||||
|
|
||||||
|
Le vecteur est représenté par
|
||||||
|
- un vecteur `[u8]` encodant directement les valeurs faibles du vecteur [0,255[
|
||||||
|
La valeur 255 est une valeur sentinelle indiquant que la valeur vraie est >=255
|
||||||
|
et se trouve dans une structure d'overflow
|
||||||
|
- un itérateur de `(usize, u32)` listant les valeurs d'overflow correspondant aux valeurs
|
||||||
|
sentinelles (255) du `[u8]`
|
||||||
|
- un `usize` encodant la longueur du vecteur
|
||||||
Generated
+305
-6
@@ -128,6 +128,12 @@ version = "0.4.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2ad8689a486416c401ea15715a4694de30054248ec627edbf31f49cb64ee4086"
|
checksum = "2ad8689a486416c401ea15715a4694de30054248ec627edbf31f49cb64ee4086"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "arrayvec"
|
||||||
|
version = "0.7.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "as-slice"
|
name = "as-slice"
|
||||||
version = "0.2.1"
|
version = "0.2.1"
|
||||||
@@ -143,6 +149,15 @@ version = "1.5.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "autotools"
|
||||||
|
version = "0.2.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ef941527c41b0fc0dd48511a8154cd5fc7e29200a0ff8b7203c5d777dbc795cf"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "backtrace"
|
name = "backtrace"
|
||||||
version = "0.3.76"
|
version = "0.3.76"
|
||||||
@@ -224,6 +239,15 @@ dependencies = [
|
|||||||
"generic-array",
|
"generic-array",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "block-buffer"
|
||||||
|
version = "0.12.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa"
|
||||||
|
dependencies = [
|
||||||
|
"hybrid-array",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "block-pseudorand"
|
name = "block-pseudorand"
|
||||||
version = "0.1.2"
|
version = "0.1.2"
|
||||||
@@ -415,6 +439,15 @@ version = "1.1.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
|
checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cmake"
|
||||||
|
version = "0.1.58"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "colorchoice"
|
name = "colorchoice"
|
||||||
version = "1.0.5"
|
version = "1.0.5"
|
||||||
@@ -464,6 +497,21 @@ dependencies = [
|
|||||||
"windows-sys 0.59.0",
|
"windows-sys 0.59.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "const-oid"
|
||||||
|
version = "0.10.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "convert_case"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-segmentation",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "core-foundation-sys"
|
name = "core-foundation-sys"
|
||||||
version = "0.8.7"
|
version = "0.8.7"
|
||||||
@@ -488,6 +536,15 @@ dependencies = [
|
|||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cpufeatures"
|
||||||
|
version = "0.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crc32fast"
|
name = "crc32fast"
|
||||||
version = "1.5.0"
|
version = "1.5.0"
|
||||||
@@ -601,6 +658,15 @@ dependencies = [
|
|||||||
"typenum",
|
"typenum",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "crypto-common"
|
||||||
|
version = "0.2.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ce6e4c961d6cd6c9a86db418387425e8bdeaf05b3c8bc1411e6dca4c252f1453"
|
||||||
|
dependencies = [
|
||||||
|
"hybrid-array",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "csv"
|
name = "csv"
|
||||||
version = "1.4.0"
|
version = "1.4.0"
|
||||||
@@ -640,14 +706,48 @@ dependencies = [
|
|||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "derive_more"
|
||||||
|
version = "2.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134"
|
||||||
|
dependencies = [
|
||||||
|
"derive_more-impl",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "derive_more-impl"
|
||||||
|
version = "2.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb"
|
||||||
|
dependencies = [
|
||||||
|
"convert_case",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"rustc_version",
|
||||||
|
"syn",
|
||||||
|
"unicode-xid",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "digest"
|
name = "digest"
|
||||||
version = "0.10.7"
|
version = "0.10.7"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
|
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"block-buffer",
|
"block-buffer 0.10.4",
|
||||||
"crypto-common",
|
"crypto-common 0.1.7",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "digest"
|
||||||
|
version = "0.11.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2"
|
||||||
|
dependencies = [
|
||||||
|
"block-buffer 0.12.1",
|
||||||
|
"const-oid",
|
||||||
|
"crypto-common 0.2.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -742,6 +842,16 @@ version = "2.4.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
|
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "filetime"
|
||||||
|
version = "0.2.29"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5c287a33c7f0a620c38e641e7f60827713987b3c0f26e8ddc9462cc69cf75759"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "find-msvc-tools"
|
name = "find-msvc-tools"
|
||||||
version = "0.1.9"
|
version = "0.1.9"
|
||||||
@@ -916,6 +1026,65 @@ version = "0.5.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
|
checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "http"
|
||||||
|
version = "1.4.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425"
|
||||||
|
dependencies = [
|
||||||
|
"bytes",
|
||||||
|
"itoa",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "httparse"
|
||||||
|
version = "1.10.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hwlocality"
|
||||||
|
version = "1.0.0-alpha.12"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4c2e65a48d3b300843ac84a2fe8e166bb5a5b00f30054593bcee8157e4b465fd"
|
||||||
|
dependencies = [
|
||||||
|
"arrayvec",
|
||||||
|
"bitflags 2.11.1",
|
||||||
|
"derive_more",
|
||||||
|
"errno",
|
||||||
|
"hwlocality-sys",
|
||||||
|
"libc",
|
||||||
|
"strum",
|
||||||
|
"thiserror 2.0.18",
|
||||||
|
"windows-sys 0.61.2",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hwlocality-sys"
|
||||||
|
version = "0.7.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "10a83c43a772c1f774b806deb44891c2a9578eb33cec48aad513482e0da3d4d4"
|
||||||
|
dependencies = [
|
||||||
|
"autotools",
|
||||||
|
"cmake",
|
||||||
|
"flate2",
|
||||||
|
"libc",
|
||||||
|
"pkg-config",
|
||||||
|
"sha3",
|
||||||
|
"tar",
|
||||||
|
"ureq 3.3.0",
|
||||||
|
"windows-sys 0.61.2",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hybrid-array"
|
||||||
|
version = "0.4.12"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9155a582abd142abc056962c29e3ce5ff2ad5469f4246b537ed42c5deba857da"
|
||||||
|
dependencies = [
|
||||||
|
"typenum",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "icu_collections"
|
name = "icu_collections"
|
||||||
version = "2.2.0"
|
version = "2.2.0"
|
||||||
@@ -1145,6 +1314,16 @@ dependencies = [
|
|||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "keccak"
|
||||||
|
version = "0.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9e24a010dd405bd7ed803e5253182815b41bf2e6a80cc3bfc066658e03a198aa"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"cpufeatures 0.3.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kodama"
|
name = "kodama"
|
||||||
version = "0.2.3"
|
version = "0.2.3"
|
||||||
@@ -1508,6 +1687,7 @@ name = "obikindex"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"crossbeam-channel",
|
"crossbeam-channel",
|
||||||
|
"hwlocality",
|
||||||
"indicatif",
|
"indicatif",
|
||||||
"ndarray",
|
"ndarray",
|
||||||
"obicompactvec",
|
"obicompactvec",
|
||||||
@@ -1524,7 +1704,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "obikmer"
|
name = "obikmer"
|
||||||
version = "0.1.0"
|
version = "1.1.27"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"csv",
|
"csv",
|
||||||
@@ -1542,6 +1722,7 @@ dependencies = [
|
|||||||
"obiskbuilder",
|
"obiskbuilder",
|
||||||
"obiskio",
|
"obiskio",
|
||||||
"obisys",
|
"obisys",
|
||||||
|
"obitaxonomy",
|
||||||
"pprof",
|
"pprof",
|
||||||
"rayon",
|
"rayon",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
@@ -1636,7 +1817,7 @@ dependencies = [
|
|||||||
"regex",
|
"regex",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
"ureq",
|
"ureq 2.12.1",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1673,6 +1854,10 @@ dependencies = [
|
|||||||
"tracing",
|
"tracing",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "obitaxonomy"
|
||||||
|
version = "0.1.0"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "object"
|
name = "object"
|
||||||
version = "0.37.3"
|
version = "0.37.3"
|
||||||
@@ -2177,6 +2362,15 @@ version = "2.1.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
|
checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustc_version"
|
||||||
|
version = "0.4.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
|
||||||
|
dependencies = [
|
||||||
|
"semver",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustix"
|
name = "rustix"
|
||||||
version = "1.1.4"
|
version = "1.1.4"
|
||||||
@@ -2263,6 +2457,12 @@ dependencies = [
|
|||||||
"syn",
|
"syn",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "semver"
|
||||||
|
version = "1.0.28"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde"
|
name = "serde"
|
||||||
version = "1.0.228"
|
version = "1.0.228"
|
||||||
@@ -2313,8 +2513,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
|
checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"cpufeatures",
|
"cpufeatures 0.2.17",
|
||||||
"digest",
|
"digest 0.10.7",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "sha3"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "be176f1a57ce4e3d31c1a166222d9768de5954f811601fb7ca06fc8203905ce1"
|
||||||
|
dependencies = [
|
||||||
|
"digest 0.11.3",
|
||||||
|
"keccak",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -2375,6 +2585,27 @@ version = "0.11.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "strum"
|
||||||
|
version = "0.28.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9628de9b8791db39ceda2b119bbe13134770b56c138ec1d3af810d045c04f9bd"
|
||||||
|
dependencies = [
|
||||||
|
"strum_macros",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "strum_macros"
|
||||||
|
version = "0.28.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ab85eea0270ee17587ed4156089e10b9e6880ee688791d45a905f5b1ca36f664"
|
||||||
|
dependencies = [
|
||||||
|
"heck",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "subtle"
|
name = "subtle"
|
||||||
version = "2.6.1"
|
version = "2.6.1"
|
||||||
@@ -2470,6 +2701,17 @@ version = "1.0.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
|
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tar"
|
||||||
|
version = "0.4.46"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3f6221d9a6003c78398e3b239969f352578258df48c8eb051caadae0015bc840"
|
||||||
|
dependencies = [
|
||||||
|
"filetime",
|
||||||
|
"libc",
|
||||||
|
"xattr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tempfile"
|
name = "tempfile"
|
||||||
version = "3.27.0"
|
version = "3.27.0"
|
||||||
@@ -2645,12 +2887,24 @@ version = "1.0.24"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
|
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-segmentation"
|
||||||
|
version = "1.13.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-width"
|
name = "unicode-width"
|
||||||
version = "0.2.2"
|
version = "0.2.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
|
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-xid"
|
||||||
|
version = "0.2.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "untrusted"
|
name = "untrusted"
|
||||||
version = "0.9.0"
|
version = "0.9.0"
|
||||||
@@ -2673,6 +2927,35 @@ dependencies = [
|
|||||||
"webpki-roots 0.26.11",
|
"webpki-roots 0.26.11",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ureq"
|
||||||
|
version = "3.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0"
|
||||||
|
dependencies = [
|
||||||
|
"base64",
|
||||||
|
"flate2",
|
||||||
|
"log",
|
||||||
|
"percent-encoding",
|
||||||
|
"rustls",
|
||||||
|
"rustls-pki-types",
|
||||||
|
"ureq-proto",
|
||||||
|
"utf8-zero",
|
||||||
|
"webpki-roots 1.0.7",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ureq-proto"
|
||||||
|
version = "0.6.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c"
|
||||||
|
dependencies = [
|
||||||
|
"base64",
|
||||||
|
"http",
|
||||||
|
"httparse",
|
||||||
|
"log",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "url"
|
name = "url"
|
||||||
version = "2.5.8"
|
version = "2.5.8"
|
||||||
@@ -2685,6 +2968,12 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf8-zero"
|
||||||
|
version = "0.8.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "utf8_iter"
|
name = "utf8_iter"
|
||||||
version = "1.0.4"
|
version = "1.0.4"
|
||||||
@@ -3110,6 +3399,16 @@ dependencies = [
|
|||||||
"tap",
|
"tap",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "xattr"
|
||||||
|
version = "1.6.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"rustix",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "xxhash-rust"
|
name = "xxhash-rust"
|
||||||
version = "0.8.15"
|
version = "0.8.15"
|
||||||
|
|||||||
+1
-1
@@ -1,5 +1,5 @@
|
|||||||
[workspace]
|
[workspace]
|
||||||
resolver = "3"
|
resolver = "3"
|
||||||
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex"]
|
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex", "obitaxonomy"]
|
||||||
[profile.release]
|
[profile.release]
|
||||||
debug = 1
|
debug = 1
|
||||||
|
|||||||
@@ -7,6 +7,6 @@ edition = "2024"
|
|||||||
memmap2 = "0.9"
|
memmap2 = "0.9"
|
||||||
ndarray = "0.16"
|
ndarray = "0.16"
|
||||||
rayon = "1"
|
rayon = "1"
|
||||||
|
tempfile = "3"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
tempfile = "3"
|
|
||||||
|
|||||||
+187
-101
@@ -1,5 +1,5 @@
|
|||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{self, Write as _};
|
use std::io::{self, BufWriter, Write as _};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
@@ -7,8 +7,12 @@ use ndarray::{Array1, Array2};
|
|||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
|
|
||||||
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
||||||
|
use crate::colgroup::{ColGroup, MatrixGroupOps};
|
||||||
use crate::layer_meta::LayerMeta;
|
use crate::layer_meta::LayerMeta;
|
||||||
use crate::meta::MatrixMeta;
|
use crate::meta::MatrixMeta;
|
||||||
|
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||||
|
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||||
|
use crate::views::BitSliceView;
|
||||||
|
|
||||||
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
||||||
dir.join(format!("col_{col:06}.pbiv"))
|
dir.join(format!("col_{col:06}.pbiv"))
|
||||||
@@ -54,34 +58,11 @@ impl ColumnarBitMatrix {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
|
pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
|
||||||
let n = self.n_cols();
|
pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_jaccard_dist(self.col(j)))
|
||||||
let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
|
|
||||||
.into_par_iter()
|
|
||||||
.map(|(i, j)| {
|
|
||||||
let (inter, union) = self.col(i).partial_jaccard_dist(self.col(j));
|
|
||||||
(i, j, inter, union)
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
let mut inter_m = Array2::zeros((n, n));
|
|
||||||
let mut union_m = Array2::zeros((n, n));
|
|
||||||
for (i, j, inter, union) in results {
|
|
||||||
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
|
|
||||||
union_m[[i, j]] = union; union_m[[j, i]] = union;
|
|
||||||
}
|
|
||||||
(inter_m, union_m)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
|
pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
|
||||||
self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j)))
|
pairwise_matrix(self.n_cols(), |i, j| self.col(i).hamming_dist(self.col(j)))
|
||||||
}
|
|
||||||
|
|
||||||
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
|
|
||||||
let n = self.n_cols();
|
|
||||||
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
|
|
||||||
.into_par_iter()
|
|
||||||
.map(|(i, j)| (i, j, f(i, j)))
|
|
||||||
.collect();
|
|
||||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn append_column(dir: &Path, value_of: impl Fn(usize) -> bool) -> io::Result<()> {
|
pub(crate) fn append_column(dir: &Path, value_of: impl Fn(usize) -> bool) -> io::Result<()> {
|
||||||
@@ -147,113 +128,92 @@ impl PackedBitMatrix {
|
|||||||
}).collect()
|
}).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
|
||||||
fn col_bytes(&self, c: usize) -> &[u8] {
|
fn col_bytes(&self, c: usize) -> &[u8] {
|
||||||
let start = self.data_offsets[c];
|
let start = self.data_offsets[c];
|
||||||
let len = (self.n_rows + 7) / 8;
|
&self.mmap[start..start + self.n_rows.div_ceil(8)]
|
||||||
&self.mmap[start..start + len]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn count_ones_col(&self, c: usize) -> u64 {
|
fn col_words(&self, c: usize) -> &[u64] {
|
||||||
let bytes = self.col_bytes(c);
|
let nw = self.n_rows.div_ceil(64);
|
||||||
let full = self.n_rows / 8;
|
// SAFETY: data_offsets[c] is always 8-byte aligned.
|
||||||
let rem = self.n_rows % 8;
|
// PBMX header = 24 + n_cols×8 (multiple of 8); each PBIV blob =
|
||||||
let mut n: u64 = bytes[..full].iter().map(|b| b.count_ones() as u64).sum();
|
// 16 + nwords×8 (multiple of 8); mmap base is page-aligned.
|
||||||
if rem > 0 { n += (bytes[full] & ((1u8 << rem) - 1)).count_ones() as u64; }
|
let ptr = self.mmap[self.data_offsets[c]..].as_ptr() as *const u64;
|
||||||
n
|
unsafe { std::slice::from_raw_parts(ptr, nw) }
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_op(&self, i: usize, j: usize, and_or: bool) -> u64 {
|
pub(crate) fn col_slice(&self, c: usize) -> BitSliceView<'_> {
|
||||||
let ai = self.col_bytes(i);
|
BitSliceView::new(self.col_words(c), self.n_rows)
|
||||||
let aj = self.col_bytes(j);
|
|
||||||
let full = self.n_rows / 8;
|
|
||||||
let rem = self.n_rows % 8;
|
|
||||||
let mut n: u64 = ai[..full].iter().zip(aj[..full].iter())
|
|
||||||
.map(|(a, b)| if and_or { a & b } else { a ^ b }.count_ones() as u64)
|
|
||||||
.sum();
|
|
||||||
if rem > 0 {
|
|
||||||
let mask = (1u8 << rem) - 1;
|
|
||||||
let last = if and_or { ai[full] & aj[full] } else { ai[full] ^ aj[full] };
|
|
||||||
n += (last & mask).count_ones() as u64;
|
|
||||||
}
|
|
||||||
n
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn partial_jaccard_col(&self, i: usize, j: usize) -> (u64, u64) {
|
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
|
||||||
let ai = self.col_bytes(i);
|
PersistentBitVecBuilder::from_raw_bytes(self.col_bytes(c), self.n_rows, path)
|
||||||
let aj = self.col_bytes(j);
|
|
||||||
let full = self.n_rows / 8;
|
|
||||||
let rem = self.n_rows % 8;
|
|
||||||
let (mut inter, mut union) = ai[..full].iter().zip(aj[..full].iter())
|
|
||||||
.fold((0u64, 0u64), |(inter, union), (a, b)| {
|
|
||||||
(inter + (a & b).count_ones() as u64,
|
|
||||||
union + (a | b).count_ones() as u64)
|
|
||||||
});
|
|
||||||
if rem > 0 {
|
|
||||||
let mask = (1u8 << rem) - 1;
|
|
||||||
inter += ((ai[full] & aj[full]) & mask).count_ones() as u64;
|
|
||||||
union += ((ai[full] | aj[full]) & mask).count_ones() as u64;
|
|
||||||
}
|
|
||||||
(inter, union)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn count_ones(&self) -> Array1<u64> {
|
pub(crate) fn count_ones(&self) -> Array1<u64> {
|
||||||
Array1::from_vec(
|
Array1::from_vec(
|
||||||
(0..self.n_cols).into_par_iter().map(|c| self.count_ones_col(c)).collect()
|
(0..self.n_cols).into_par_iter()
|
||||||
|
.map(|c| self.col_slice(c).count_ones())
|
||||||
|
.collect()
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
|
pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
|
||||||
let n = self.n_cols;
|
pairwise2_matrix(self.n_cols, |i, j| {
|
||||||
let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
|
self.col_slice(i).partial_jaccard_dist(self.col_slice(j))
|
||||||
.into_par_iter()
|
})
|
||||||
.map(|(i, j)| { let (inter, union) = self.partial_jaccard_col(i, j); (i, j, inter, union) })
|
|
||||||
.collect();
|
|
||||||
let mut inter_m = Array2::zeros((n, n));
|
|
||||||
let mut union_m = Array2::zeros((n, n));
|
|
||||||
for (i, j, inter, union) in results {
|
|
||||||
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
|
|
||||||
union_m[[i, j]] = union; union_m[[j, i]] = union;
|
|
||||||
}
|
|
||||||
(inter_m, union_m)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
|
pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
|
||||||
let n = self.n_cols;
|
pairwise_matrix(self.n_cols, |i, j| {
|
||||||
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
|
self.col_slice(i).hamming_dist(self.col_slice(j))
|
||||||
.into_par_iter()
|
})
|
||||||
.map(|(i, j)| (i, j, self.pair_op(i, j, false)))
|
|
||||||
.collect();
|
|
||||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
|
/// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
|
||||||
pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
|
pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
|
||||||
|
let packed_path = dir.join("matrix.pbmx");
|
||||||
|
if packed_path.exists() {
|
||||||
|
// Matrix complete; remove any leftover column files from a killed cleanup.
|
||||||
|
if let Ok(meta) = MatrixMeta::load(dir) {
|
||||||
|
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
||||||
|
let _ = fs::remove_file(dir.join("meta.json"));
|
||||||
|
}
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
let meta = MatrixMeta::load(dir)?;
|
let meta = MatrixMeta::load(dir)?;
|
||||||
let n_cols = meta.n_cols;
|
let n_cols = meta.n_cols;
|
||||||
|
|
||||||
let col_files: Vec<Vec<u8>> = (0..n_cols)
|
// Compute offsets from file sizes — no column data loaded into RAM.
|
||||||
.map(|c| fs::read(col_path(dir, c)))
|
let col_sizes: Vec<u64> = (0..n_cols)
|
||||||
|
.map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
|
||||||
.collect::<io::Result<_>>()?;
|
.collect::<io::Result<_>>()?;
|
||||||
|
|
||||||
let header_size = PBMX_HEADER + n_cols * 8;
|
let header_size = (PBMX_HEADER + n_cols * 8) as u64;
|
||||||
let mut col_offset = header_size;
|
let mut col_offset = header_size;
|
||||||
let mut offsets = Vec::with_capacity(n_cols);
|
let mut offsets = Vec::with_capacity(n_cols);
|
||||||
for data in &col_files {
|
for &size in &col_sizes {
|
||||||
offsets.push(col_offset as u64);
|
offsets.push(col_offset);
|
||||||
col_offset += data.len();
|
col_offset += size;
|
||||||
}
|
}
|
||||||
|
|
||||||
let packed_path = dir.join("matrix.pbmx");
|
// Write to a temp file; rename atomically so a killed process never leaves
|
||||||
let mut file = File::create(&packed_path)?;
|
// a truncated matrix.pbmx that would be mistaken for a complete file.
|
||||||
file.write_all(&PBMX_MAGIC)?;
|
let tmp_path = dir.join("matrix.pbmx.tmp");
|
||||||
file.write_all(&[0u8; 4])?;
|
let mut out = BufWriter::new(File::create(&tmp_path)?);
|
||||||
file.write_all(&(meta.n as u64).to_le_bytes())?;
|
out.write_all(&PBMX_MAGIC)?;
|
||||||
file.write_all(&(n_cols as u64).to_le_bytes())?;
|
out.write_all(&[0u8; 4])?;
|
||||||
for &off in &offsets { file.write_all(&off.to_le_bytes())?; }
|
out.write_all(&(meta.n as u64).to_le_bytes())?;
|
||||||
for data in &col_files { file.write_all(data)?; }
|
out.write_all(&(n_cols as u64).to_le_bytes())?;
|
||||||
drop(file);
|
for &off in &offsets { out.write_all(&off.to_le_bytes())?; }
|
||||||
|
for c in 0..n_cols {
|
||||||
|
io::copy(&mut File::open(col_path(dir, c))?, &mut out)?;
|
||||||
|
}
|
||||||
|
out.flush()?;
|
||||||
|
drop(out);
|
||||||
|
fs::rename(&tmp_path, &packed_path)?;
|
||||||
|
|
||||||
for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
|
for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
|
||||||
fs::remove_file(dir.join("meta.json"))?;
|
fs::remove_file(dir.join("meta.json"))?;
|
||||||
@@ -326,6 +286,24 @@ impl PersistentBitMatrix {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn col_view(&self, c: usize) -> BitSliceView<'_> {
|
||||||
|
match self {
|
||||||
|
Self::Columnar(m) => m.col(c).view(),
|
||||||
|
Self::Packed(m) => m.col_slice(c),
|
||||||
|
Self::Implicit { .. } => panic!("col_view() not available on Implicit PersistentBitMatrix"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
|
||||||
|
match self {
|
||||||
|
Self::Columnar(m) => PersistentBitVecBuilder::build_from(m.col(c), path),
|
||||||
|
Self::Packed(m) => m.col_persist(c, path),
|
||||||
|
Self::Implicit { n_rows, .. } => {
|
||||||
|
PersistentBitVecBuilder::new_ones(*n_rows, path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn row(&self, slot: usize) -> Box<[bool]> {
|
pub fn row(&self, slot: usize) -> Box<[bool]> {
|
||||||
match self {
|
match self {
|
||||||
Self::Columnar(m) => m.row(slot),
|
Self::Columnar(m) => m.row(slot),
|
||||||
@@ -422,12 +400,93 @@ impl PersistentBitMatrixBuilder {
|
|||||||
PersistentBitVecBuilder::new(self.n, &path)
|
PersistentBitVecBuilder::new(self.n, &path)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn add_col_ones(&mut self) -> io::Result<PersistentBitVecBuilder> {
|
||||||
|
let path = col_path(&self.dir, self.n_cols);
|
||||||
|
self.n_cols += 1;
|
||||||
|
PersistentBitVecBuilder::new_ones(self.n, &path)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_col_from(&mut self, src: &TempBitVec) -> io::Result<()> {
|
||||||
|
src.make_persistent(&col_path(&self.dir, self.n_cols))?;
|
||||||
|
self.n_cols += 1;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_col_from_int(&mut self, src: &TempCompactIntVec) -> io::Result<()> {
|
||||||
|
let path = col_path(&self.dir, self.n_cols);
|
||||||
|
self.n_cols += 1;
|
||||||
|
let mut b = PersistentBitVecBuilder::new(self.n, &path)?;
|
||||||
|
b.or_where(src.view(), |v| v > 0);
|
||||||
|
b.close()
|
||||||
|
}
|
||||||
|
|
||||||
pub fn close(self) -> io::Result<()> {
|
pub fn close(self) -> io::Result<()> {
|
||||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
impl MatrixGroupOps for PersistentBitMatrix {
|
||||||
|
fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> io::Result<TempCompactIntVec> {
|
||||||
|
// Bit matrices store 0/1 — threshold is structurally always 1.
|
||||||
|
let n = self.n();
|
||||||
|
if g.indices.len() < 255 {
|
||||||
|
let mut builder = TempCompactIntVecBuilder::new(n)?;
|
||||||
|
for &c in &g.indices {
|
||||||
|
builder.inc_present_fast(self.col_view(c));
|
||||||
|
}
|
||||||
|
builder.freeze()
|
||||||
|
} else {
|
||||||
|
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||||
|
for chunk in g.indices.chunks(254) {
|
||||||
|
let mut chunk_b = TempCompactIntVecBuilder::new(n)?;
|
||||||
|
for &c in chunk {
|
||||||
|
chunk_b.inc_present_fast(self.col_view(c));
|
||||||
|
}
|
||||||
|
let frozen = chunk_b.freeze()?;
|
||||||
|
result.add(frozen.view());
|
||||||
|
}
|
||||||
|
result.freeze()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||||
|
// For bit matrices, sum = count of 1-bits — identical to presence_count.
|
||||||
|
self.partial_group_presence_count(g, 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> io::Result<TempBitVec> {
|
||||||
|
let n = self.n();
|
||||||
|
let mut result = TempBitVecBuilder::new(n)?;
|
||||||
|
for &c in &g.indices {
|
||||||
|
result.or(self.col_view(c));
|
||||||
|
}
|
||||||
|
result.freeze()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||||
|
// min of 0/1 values = AND: 1 only if ALL columns are 1
|
||||||
|
let n = self.n();
|
||||||
|
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||||
|
if let Some((&first, rest)) = g.indices.split_first() {
|
||||||
|
result.inc_present_fast(self.col_view(first));
|
||||||
|
for &c in rest { result.mask_with(self.col_view(c)); }
|
||||||
|
}
|
||||||
|
result.freeze()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||||
|
// max of 0/1 values = OR: 1 if any column is 1
|
||||||
|
let any = self.partial_group_any(g, 1)?;
|
||||||
|
let n = any.len();
|
||||||
|
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||||
|
result.inc_present(any.view());
|
||||||
|
result.freeze()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Shared matrix helpers (also used by intmatrix.rs) ─────────────────────────
|
||||||
|
|
||||||
fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
|
fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
|
||||||
(0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect()
|
(0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect()
|
||||||
@@ -439,3 +498,30 @@ where T: Clone + Default {
|
|||||||
for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; }
|
for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; }
|
||||||
m
|
m
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Compute a symmetric `n×n` matrix in parallel by evaluating `f(i,j)` for
|
||||||
|
/// all upper-triangle pairs. `T: Copy` avoids the `.clone()` needed for the
|
||||||
|
/// lower-triangle mirror.
|
||||||
|
pub(crate) fn pairwise_matrix<T>(n: usize, f: impl Fn(usize, usize) -> T + Sync) -> Array2<T>
|
||||||
|
where T: Copy + Default + Send {
|
||||||
|
let results: Vec<(usize, usize, T)> = upper_pairs(n)
|
||||||
|
.into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
|
||||||
|
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Same as `pairwise_matrix` but `f` returns two values that fill two
|
||||||
|
/// symmetric matrices simultaneously (e.g. intersection + union for Jaccard).
|
||||||
|
pub(crate) fn pairwise2_matrix<T>(n: usize, f: impl Fn(usize, usize) -> (T, T) + Sync) -> (Array2<T>, Array2<T>)
|
||||||
|
where T: Copy + Default + Send {
|
||||||
|
let results: Vec<(usize, usize, T, T)> = upper_pairs(n)
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|(i, j)| { let (a, b) = f(i, j); (i, j, a, b) })
|
||||||
|
.collect();
|
||||||
|
let mut m0 = Array2::from_elem((n, n), T::default());
|
||||||
|
let mut m1 = Array2::from_elem((n, n), T::default());
|
||||||
|
for (i, j, a, b) in results {
|
||||||
|
m0[[i, j]] = a; m0[[j, i]] = a;
|
||||||
|
m1[[i, j]] = b; m1[[j, i]] = b;
|
||||||
|
}
|
||||||
|
(m0, m1)
|
||||||
|
}
|
||||||
|
|||||||
+221
-179
@@ -5,29 +5,25 @@ use std::path::{Path, PathBuf};
|
|||||||
use memmap2::{Mmap, MmapMut};
|
use memmap2::{Mmap, MmapMut};
|
||||||
|
|
||||||
use crate::reader::PersistentCompactIntVec;
|
use crate::reader::PersistentCompactIntVec;
|
||||||
|
use crate::views::{BitSliceIter, BitSliceView, IntSliceView};
|
||||||
|
|
||||||
const MAGIC: [u8; 4] = *b"PBIV";
|
const MAGIC: [u8; 4] = *b"PBIV";
|
||||||
|
|
||||||
// Header: magic(4) + _pad(4) + n(8) = 16 bytes.
|
// Header: magic(4) + _pad(4) + n(8) = 16 bytes.
|
||||||
// Data starts at offset 16, which is divisible by 8 → u64-aligned
|
// Data starts at offset 16, u64-aligned (mmap base is page-aligned, 16 % 8 == 0).
|
||||||
// (mmap base is page-aligned, 16 % 8 == 0).
|
|
||||||
const HEADER_SIZE: usize = 16;
|
const HEADER_SIZE: usize = 16;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn n_words(n: usize) -> usize {
|
pub(crate) fn n_words(n: usize) -> usize { n.div_ceil(64) }
|
||||||
n.div_ceil(64)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn n_bytes_for_words(n: usize) -> usize {
|
fn n_bytes_for_words(n: usize) -> usize { n_words(n) * 8 }
|
||||||
n_words(n) * 8
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Reader ────────────────────────────────────────────────────────────────────
|
// ── PersistentBitVec ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub struct PersistentBitVec {
|
pub struct PersistentBitVec {
|
||||||
mmap: Mmap,
|
mmap: Mmap,
|
||||||
n: usize,
|
n: usize,
|
||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -35,157 +31,145 @@ impl PersistentBitVec {
|
|||||||
pub fn open(path: &Path) -> io::Result<Self> {
|
pub fn open(path: &Path) -> io::Result<Self> {
|
||||||
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
||||||
if mmap.len() < HEADER_SIZE {
|
if mmap.len() < HEADER_SIZE {
|
||||||
return Err(io::Error::new(
|
return Err(io::Error::new(io::ErrorKind::InvalidData, "PBIV file too short"));
|
||||||
io::ErrorKind::InvalidData,
|
|
||||||
"PBIV file too short",
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
if &mmap[0..4] != &MAGIC {
|
if &mmap[0..4] != &MAGIC {
|
||||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PBIV magic"));
|
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PBIV magic"));
|
||||||
}
|
}
|
||||||
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
||||||
Ok(Self {
|
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||||
mmap,
|
|
||||||
n,
|
|
||||||
path: path.to_path_buf(),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn path(&self) -> &Path {
|
pub fn path(&self) -> &Path { &self.path }
|
||||||
&self.path
|
pub fn len(&self) -> usize { self.n }
|
||||||
}
|
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||||
pub fn len(&self) -> usize {
|
|
||||||
self.n
|
|
||||||
}
|
|
||||||
pub fn is_empty(&self) -> bool {
|
|
||||||
self.n == 0
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get(&self, slot: usize) -> bool {
|
pub fn get(&self, slot: usize) -> bool {
|
||||||
(self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
|
(self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used by iter() and get(): exact byte window, no padding.
|
// SAFETY: mmap is page-aligned, HEADER_SIZE=16 divisible by 8 → u64-aligned.
|
||||||
fn data_bytes(&self) -> &[u8] {
|
|
||||||
&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n.div_ceil(8)]
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bulk word view. SAFETY: mmap is page-aligned, HEADER_SIZE=16 is divisible by 8,
|
|
||||||
// so &mmap[HEADER_SIZE] is u64-aligned. Slice length is n_words * 8 bytes.
|
|
||||||
fn data_words(&self) -> &[u64] {
|
fn data_words(&self) -> &[u64] {
|
||||||
let nw = n_words(self.n);
|
let nw = n_words(self.n);
|
||||||
let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
|
let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
|
||||||
unsafe { std::slice::from_raw_parts(ptr, nw) }
|
unsafe { std::slice::from_raw_parts(ptr, nw) }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn count_ones(&self) -> u64 {
|
pub fn view(&self) -> BitSliceView<'_> {
|
||||||
// Padding bits in the last word are 0, so no masking needed.
|
BitSliceView::new(self.data_words(), self.n)
|
||||||
self.data_words()
|
|
||||||
.iter()
|
|
||||||
.map(|w| w.count_ones() as u64)
|
|
||||||
.sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn count_zeros(&self) -> u64 {
|
pub fn words(&self) -> &[u64] { self.data_words() }
|
||||||
self.n as u64 - self.count_ones()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 {
|
pub fn count_ones(&self) -> u64 { self.view().count_ones() }
|
||||||
let (inter, union) = self.partial_jaccard_dist(other);
|
pub fn count_zeros(&self) -> u64 { self.view().count_zeros() }
|
||||||
if union == 0 {
|
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
1.0 - inter as f64 / union as f64
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn partial_jaccard_dist(&self, other: &PersistentBitVec) -> (u64, u64) {
|
pub fn partial_jaccard_dist(&self, other: &PersistentBitVec) -> (u64, u64) {
|
||||||
assert_eq!(self.n, other.n, "length mismatch");
|
self.view().partial_jaccard_dist(other.view())
|
||||||
self.data_words()
|
}
|
||||||
.iter()
|
pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 {
|
||||||
.zip(other.data_words())
|
self.view().jaccard_dist(other.view())
|
||||||
.fold((0u64, 0u64), |(i, u), (&a, &b)| {
|
|
||||||
(
|
|
||||||
i + (a & b).count_ones() as u64,
|
|
||||||
u + (a | b).count_ones() as u64,
|
|
||||||
)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn hamming_dist(&self, other: &PersistentBitVec) -> u64 {
|
pub fn hamming_dist(&self, other: &PersistentBitVec) -> u64 {
|
||||||
assert_eq!(self.n, other.n, "length mismatch");
|
self.view().hamming_dist(other.view())
|
||||||
self.data_words()
|
|
||||||
.iter()
|
|
||||||
.zip(other.data_words())
|
|
||||||
.map(|(&a, &b)| (a ^ b).count_ones() as u64)
|
|
||||||
.sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn iter(&self) -> BitIter<'_> {
|
pub fn iter(&self) -> BitIter<'_> {
|
||||||
BitIter {
|
BitIter { words: self.data_words(), slot: 0, n: self.n }
|
||||||
bytes: self.data_bytes(),
|
|
||||||
slot: 0,
|
|
||||||
n: self.n,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> IntoIterator for &'a PersistentBitVec {
|
impl<'a> IntoIterator for &'a PersistentBitVec {
|
||||||
type Item = bool;
|
type Item = bool;
|
||||||
type IntoIter = BitIter<'a>;
|
type IntoIter = BitIter<'a>;
|
||||||
fn into_iter(self) -> BitIter<'a> {
|
fn into_iter(self) -> BitIter<'a> { self.iter() }
|
||||||
self.iter()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── BitIter ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub struct BitIter<'a> {
|
pub struct BitIter<'a> {
|
||||||
bytes: &'a [u8],
|
words: &'a [u64],
|
||||||
slot: usize,
|
slot: usize,
|
||||||
n: usize,
|
n: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ExactSizeIterator for BitIter<'_> {}
|
impl ExactSizeIterator for BitIter<'_> {}
|
||||||
|
|
||||||
impl Iterator for BitIter<'_> {
|
impl Iterator for BitIter<'_> {
|
||||||
type Item = bool;
|
type Item = bool;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<bool> {
|
fn next(&mut self) -> Option<bool> {
|
||||||
if self.slot >= self.n {
|
if self.slot >= self.n { return None; }
|
||||||
return None;
|
let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0;
|
||||||
}
|
|
||||||
let v = (self.bytes[self.slot >> 3] >> (self.slot & 7)) & 1 != 0;
|
|
||||||
self.slot += 1;
|
self.slot += 1;
|
||||||
Some(v)
|
Some(v)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||||
let rem = self.n - self.slot;
|
let rem = self.n - self.slot;
|
||||||
(rem, Some(rem))
|
(rem, Some(rem))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
// ── PersistentBitVecBuilder ───────────────────────────────────────────────────
|
||||||
|
|
||||||
pub struct PersistentBitVecBuilder {
|
pub struct PersistentBitVecBuilder {
|
||||||
mmap: MmapMut,
|
mmap: MmapMut,
|
||||||
n: usize,
|
n: usize,
|
||||||
|
path: PathBuf,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PersistentBitVecBuilder {
|
impl PersistentBitVecBuilder {
|
||||||
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
||||||
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||||
let mut file = OpenOptions::new()
|
let mut file = OpenOptions::new()
|
||||||
.read(true)
|
.read(true).write(true).create(true).truncate(true)
|
||||||
.write(true)
|
|
||||||
.create(true)
|
|
||||||
.truncate(true)
|
|
||||||
.open(path)?;
|
.open(path)?;
|
||||||
file.write_all(&MAGIC)?;
|
file.write_all(&MAGIC)?;
|
||||||
file.write_all(&[0u8; 4])?; // padding
|
file.write_all(&[0u8; 4])?;
|
||||||
file.write_all(&(n as u64).to_le_bytes())?;
|
file.write_all(&(n as u64).to_le_bytes())?;
|
||||||
file.seek(SeekFrom::Start(0))?;
|
file.seek(SeekFrom::Start(0))?;
|
||||||
file.set_len(file_size as u64)?;
|
file.set_len(file_size as u64)?;
|
||||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||||
Ok(Self { mmap, n })
|
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
|
||||||
|
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||||
|
let file = OpenOptions::new()
|
||||||
|
.read(true).write(true).create(true).truncate(true)
|
||||||
|
.open(path)?;
|
||||||
|
file.set_len(file_size as u64)?;
|
||||||
|
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||||
|
mmap[0..4].copy_from_slice(&MAGIC);
|
||||||
|
mmap[8..16].copy_from_slice(&(n as u64).to_le_bytes());
|
||||||
|
mmap[HEADER_SIZE..HEADER_SIZE + bytes.len()].copy_from_slice(bytes);
|
||||||
|
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create an all-ones bit vector of length `n` at `path`.
|
||||||
|
///
|
||||||
|
/// More efficient than `new(n, path)` + `not()`: the data is written as
|
||||||
|
/// 0xFF bytes in a single sequential pass, with no intermediate all-zeros state.
|
||||||
|
pub fn new_ones(n: usize, path: &Path) -> io::Result<Self> {
|
||||||
|
let nw = n_words(n);
|
||||||
|
let file_size = HEADER_SIZE + nw * 8;
|
||||||
|
let mut file = OpenOptions::new()
|
||||||
|
.read(true).write(true).create(true).truncate(true)
|
||||||
|
.open(path)?;
|
||||||
|
file.write_all(&MAGIC)?;
|
||||||
|
file.write_all(&[0u8; 4])?;
|
||||||
|
file.write_all(&(n as u64).to_le_bytes())?;
|
||||||
|
file.write_all(&vec![0xFFu8; nw * 8])?;
|
||||||
|
file.seek(SeekFrom::Start(0))?;
|
||||||
|
file.set_len(file_size as u64)?;
|
||||||
|
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||||
|
// Clear padding bits in the last word so trailing bits are always 0.
|
||||||
|
let rem = n % 64;
|
||||||
|
if rem != 0 {
|
||||||
|
let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
|
||||||
|
let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
|
||||||
|
words[nw - 1] &= (1u64 << rem) - 1;
|
||||||
|
}
|
||||||
|
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn build_from(source: &PersistentBitVec, path: &Path) -> io::Result<Self> {
|
pub fn build_from(source: &PersistentBitVec, path: &Path) -> io::Result<Self> {
|
||||||
@@ -193,86 +177,14 @@ impl PersistentBitVecBuilder {
|
|||||||
let file = OpenOptions::new().read(true).write(true).open(path)?;
|
let file = OpenOptions::new().read(true).write(true).open(path)?;
|
||||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||||
let n = source.len();
|
let n = source.len();
|
||||||
Ok(Self { mmap, n })
|
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn len(&self) -> usize {
|
pub fn build_from_counts(source: &PersistentCompactIntVec, threshold: u32, path: &Path) -> io::Result<Self> {
|
||||||
self.n
|
|
||||||
}
|
|
||||||
pub fn is_empty(&self) -> bool {
|
|
||||||
self.n == 0
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get(&self, slot: usize) -> bool {
|
|
||||||
(self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn set(&mut self, slot: usize, value: bool) {
|
|
||||||
let byte = HEADER_SIZE + (slot >> 3);
|
|
||||||
let bit = 1u8 << (slot & 7);
|
|
||||||
if value {
|
|
||||||
self.mmap[byte] |= bit;
|
|
||||||
} else {
|
|
||||||
self.mmap[byte] &= !bit;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// SAFETY: same alignment argument as PersistentBitVec::data_words.
|
|
||||||
fn data_words_mut(&mut self) -> &mut [u64] {
|
|
||||||
let nw = n_words(self.n);
|
|
||||||
let ptr = self.mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
|
|
||||||
unsafe { std::slice::from_raw_parts_mut(ptr, nw) }
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn and(&mut self, other: &PersistentBitVec) {
|
|
||||||
assert_eq!(self.n, other.n, "length mismatch");
|
|
||||||
for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
|
|
||||||
*sw &= ow;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn or(&mut self, other: &PersistentBitVec) {
|
|
||||||
assert_eq!(self.n, other.n, "length mismatch");
|
|
||||||
for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
|
|
||||||
*sw |= ow;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn xor(&mut self, other: &PersistentBitVec) {
|
|
||||||
assert_eq!(self.n, other.n, "length mismatch");
|
|
||||||
for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
|
|
||||||
*sw ^= ow;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn not(&mut self) {
|
|
||||||
let rem = self.n % 64;
|
|
||||||
let words = self.data_words_mut();
|
|
||||||
for w in words.iter_mut() {
|
|
||||||
*w ^= u64::MAX;
|
|
||||||
}
|
|
||||||
// Zero padding bits in the last word so count_ones / jaccard remain correct.
|
|
||||||
if rem != 0 {
|
|
||||||
if let Some(last) = words.last_mut() {
|
|
||||||
*last &= (1u64 << rem) - 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Convert a count vector to a bit vector: bit set iff count >= threshold.
|
|
||||||
/// Fills u64 words directly from the count iterator — O(n), no bit-level set() overhead.
|
|
||||||
pub fn build_from_counts(
|
|
||||||
source: &PersistentCompactIntVec,
|
|
||||||
threshold: u32,
|
|
||||||
path: &Path,
|
|
||||||
) -> io::Result<Self> {
|
|
||||||
let n = source.len();
|
let n = source.len();
|
||||||
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||||
let mut file = OpenOptions::new()
|
let mut file = OpenOptions::new()
|
||||||
.read(true)
|
.read(true).write(true).create(true).truncate(true)
|
||||||
.write(true)
|
|
||||||
.create(true)
|
|
||||||
.truncate(true)
|
|
||||||
.open(path)?;
|
.open(path)?;
|
||||||
file.write_all(&MAGIC)?;
|
file.write_all(&MAGIC)?;
|
||||||
file.write_all(&[0u8; 4])?;
|
file.write_all(&[0u8; 4])?;
|
||||||
@@ -280,27 +192,157 @@ impl PersistentBitVecBuilder {
|
|||||||
file.seek(SeekFrom::Start(0))?;
|
file.seek(SeekFrom::Start(0))?;
|
||||||
file.set_len(file_size as u64)?;
|
file.set_len(file_size as u64)?;
|
||||||
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||||
|
|
||||||
{
|
{
|
||||||
let nw = n_words(n);
|
let nw = n_words(n);
|
||||||
let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
|
let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
|
||||||
let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
|
let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
|
||||||
for (slot, count) in source.iter().enumerate() {
|
for (slot, count) in source.iter().enumerate() {
|
||||||
if count >= threshold {
|
if count >= threshold { words[slot >> 6] |= 1u64 << (slot & 63); }
|
||||||
words[slot >> 6] |= 1u64 << (slot & 63);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||||
Ok(Self { mmap, n })
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Convert a count vector to a presence/absence bit vector (threshold = 1).
|
|
||||||
pub fn build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
pub fn build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
||||||
Self::build_from_counts(source, 1, path)
|
Self::build_from_counts(source, 1, path)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn close(self) -> io::Result<()> {
|
pub fn len(&self) -> usize { self.n }
|
||||||
self.mmap.flush()
|
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||||
|
|
||||||
|
pub fn get(&self, slot: usize) -> bool {
|
||||||
|
(self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set(&mut self, slot: usize, value: bool) {
|
||||||
|
let bit = 1u64 << (slot & 63);
|
||||||
|
if value { self.data_words_mut()[slot >> 6] |= bit; }
|
||||||
|
else { self.data_words_mut()[slot >> 6] &= !bit; }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn data_words(&self) -> &[u64] {
|
||||||
|
let nw = n_words(self.n);
|
||||||
|
let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
|
||||||
|
unsafe { std::slice::from_raw_parts(ptr, nw) }
|
||||||
|
}
|
||||||
|
|
||||||
|
// SAFETY: same alignment argument as PersistentBitVec::data_words.
|
||||||
|
fn data_words_mut(&mut self) -> &mut [u64] {
|
||||||
|
let nw = n_words(self.n);
|
||||||
|
let ptr = self.mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
|
||||||
|
unsafe { std::slice::from_raw_parts_mut(ptr, nw) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn view(&self) -> BitSliceView<'_> {
|
||||||
|
BitSliceView::new(self.data_words(), self.n)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Public read-only access to the raw u64 data words.
pub fn words(&self) -> &[u64] {
    self.data_words()
}
|
||||||
|
|
||||||
|
/// Overwrites every data word with the words of `src`.
///
/// # Panics
/// Panics if `src` does not have exactly `self.n` bits, or if the two word
/// slices differ in length.
pub fn copy_from(&mut self, src: BitSliceView<'_>) {
    assert_eq!(self.n, src.len(), "BitSliceView length mismatch");
    let incoming = src.words();
    self.data_words_mut().copy_from_slice(incoming);
}
|
||||||
|
|
||||||
|
/// In-place bitwise AND with `other`, word by word.
///
/// # Panics
/// Panics if `other` does not have exactly `self.n` bits.
pub fn and(&mut self, other: BitSliceView<'_>) {
    assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
    self.data_words_mut()
        .iter_mut()
        .zip(other.words())
        .for_each(|(dst, &src)| *dst &= src);
}
|
||||||
|
|
||||||
|
/// In-place bitwise OR with `other`, word by word.
///
/// # Panics
/// Panics if `other` does not have exactly `self.n` bits.
pub fn or(&mut self, other: BitSliceView<'_>) {
    assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
    self.data_words_mut()
        .iter_mut()
        .zip(other.words())
        .for_each(|(dst, &src)| *dst |= src);
}
|
||||||
|
|
||||||
|
/// In-place bitwise XOR with `other`, word by word.
///
/// # Panics
/// Panics if `other` does not have exactly `self.n` bits.
pub fn xor(&mut self, other: BitSliceView<'_>) {
    assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
    self.data_words_mut()
        .iter_mut()
        .zip(other.words())
        .for_each(|(dst, &src)| *dst ^= src);
}
|
||||||
|
|
||||||
|
pub fn not(&mut self) {
|
||||||
|
let rem = self.n % 64;
|
||||||
|
let words = self.data_words_mut();
|
||||||
|
for w in words.iter_mut() { *w ^= u64::MAX; }
|
||||||
|
if rem != 0 {
|
||||||
|
if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// OR in bits at slots where `pred(col[slot])` is true.
|
||||||
|
pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
|
||||||
|
let n = self.n;
|
||||||
|
let primary = col.primary_bytes();
|
||||||
|
let words = self.data_words_mut();
|
||||||
|
let nw = n_words(n);
|
||||||
|
for wi in 0..nw {
|
||||||
|
let base = wi * 64;
|
||||||
|
let limit = (base + 64).min(n);
|
||||||
|
let mut mask = 0u64;
|
||||||
|
for bit in 0..(limit - base) {
|
||||||
|
let b = primary[base + bit];
|
||||||
|
if b < 255 && pred(b as u32) { mask |= 1u64 << bit; }
|
||||||
|
}
|
||||||
|
words[wi] |= mask;
|
||||||
|
}
|
||||||
|
for (slot, val) in col.overflow_entries() {
|
||||||
|
if pred(val) { words[slot >> 6] |= 1u64 << (slot & 63); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clear bits at slots where `pred(col[slot])` is false.
|
||||||
|
pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
|
||||||
|
let n = self.n;
|
||||||
|
let primary = col.primary_bytes();
|
||||||
|
let words = self.data_words_mut();
|
||||||
|
let nw = n_words(n);
|
||||||
|
for wi in 0..nw {
|
||||||
|
let base = wi * 64;
|
||||||
|
let limit = (base + 64).min(n);
|
||||||
|
let mut mask = 0u64;
|
||||||
|
for bit in 0..(limit - base) {
|
||||||
|
let b = primary[base + bit];
|
||||||
|
if b < 255 && !pred(b as u32) { mask |= 1u64 << bit; }
|
||||||
|
}
|
||||||
|
words[wi] &= !mask;
|
||||||
|
}
|
||||||
|
for (slot, val) in col.overflow_entries() {
|
||||||
|
if !pred(val) { words[slot >> 6] &= !(1u64 << (slot & 63)); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Toggle bits at slots where `pred(col[slot])` is true.
|
||||||
|
pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
|
||||||
|
let n = self.n;
|
||||||
|
let primary = col.primary_bytes();
|
||||||
|
let words = self.data_words_mut();
|
||||||
|
let nw = n_words(n);
|
||||||
|
for wi in 0..nw {
|
||||||
|
let base = wi * 64;
|
||||||
|
let limit = (base + 64).min(n);
|
||||||
|
let mut mask = 0u64;
|
||||||
|
for bit in 0..(limit - base) {
|
||||||
|
let b = primary[base + bit];
|
||||||
|
if b < 255 && pred(b as u32) { mask |= 1u64 << bit; }
|
||||||
|
}
|
||||||
|
words[wi] ^= mask;
|
||||||
|
}
|
||||||
|
for (slot, val) in col.overflow_entries() {
|
||||||
|
if pred(val) { words[slot >> 6] ^= 1u64 << (slot & 63); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the iterator produced by this builder's [`BitSliceView`].
pub fn iter(&self) -> BitSliceIter<'_> {
    let view = self.view();
    view.iter()
}
|
||||||
|
|
||||||
|
/// Consumes the builder and flushes the memory map.
///
/// # Errors
/// Returns the I/O error reported by the flush, if any.
pub fn close(self) -> io::Result<()> {
    self.mmap.flush()
}
|
||||||
|
|
||||||
|
pub fn finish(self) -> io::Result<PersistentBitVec> {
|
||||||
|
let path = self.path.clone();
|
||||||
|
self.close()?;
|
||||||
|
PersistentBitVec::open(&path)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,71 +5,57 @@ use std::path::{Path, PathBuf};
|
|||||||
|
|
||||||
use memmap2::MmapMut;
|
use memmap2::MmapMut;
|
||||||
|
|
||||||
use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, finalize_pciv};
|
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, finalize_pciv, parse_overflow_entry};
|
||||||
use crate::reader::PersistentCompactIntVec;
|
use crate::reader::PersistentCompactIntVec;
|
||||||
|
use crate::views::{BitSliceView, IntSliceView};
|
||||||
|
|
||||||
pub struct PersistentCompactIntVecBuilder {
|
pub struct PersistentCompactIntVecBuilder {
|
||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
mmap: MmapMut,
|
mmap: MmapMut,
|
||||||
n: usize,
|
n: usize,
|
||||||
overflow: HashMap<usize, u32>,
|
overflow: HashMap<usize, u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PersistentCompactIntVecBuilder {
|
impl PersistentCompactIntVecBuilder {
|
||||||
/// Create a new, zero-filled PCIV at `path`. Primary is mmapped immediately.
|
|
||||||
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
||||||
let file = OpenOptions::new()
|
let file = OpenOptions::new()
|
||||||
.read(true)
|
.read(true).write(true).create(true).truncate(true)
|
||||||
.write(true)
|
|
||||||
.create(true)
|
|
||||||
.truncate(true)
|
|
||||||
.open(path)?;
|
.open(path)?;
|
||||||
file.set_len((HEADER_SIZE + n) as u64)?;
|
file.set_len((HEADER_SIZE + n) as u64)?;
|
||||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||||
Ok(Self {
|
Ok(Self { path: path.to_path_buf(), mmap, n, overflow: HashMap::new() })
|
||||||
path: path.to_path_buf(),
|
}
|
||||||
mmap,
|
|
||||||
n,
|
pub fn from_raw_primary(primary: &[u8], overflow: HashMap<usize, u32>, path: &Path) -> io::Result<Self> {
|
||||||
overflow: HashMap::new(),
|
let n = primary.len();
|
||||||
})
|
let file = OpenOptions::new()
|
||||||
|
.read(true).write(true).create(true).truncate(true)
|
||||||
|
.open(path)?;
|
||||||
|
file.set_len((HEADER_SIZE + n) as u64)?;
|
||||||
|
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||||
|
mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(primary);
|
||||||
|
Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Copy `source`'s file to `path`, mmap the primary section, load overflow into RAM.
|
|
||||||
/// Avoids iterating all n slots: the file copy is OS-level, overflow loading is O(n_overflow).
|
|
||||||
pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
||||||
fs::copy(source.path(), path)?;
|
fs::copy(source.path(), path)?;
|
||||||
|
|
||||||
let file = OpenOptions::new().read(true).write(true).open(path)?;
|
let file = OpenOptions::new().read(true).write(true).open(path)?;
|
||||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||||
|
let n = source.len();
|
||||||
let n = source.len();
|
|
||||||
let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
|
let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
|
||||||
let data_offset = HEADER_SIZE + n;
|
let data_offset = HEADER_SIZE + n;
|
||||||
|
|
||||||
let mut overflow = HashMap::with_capacity(n_overflow);
|
let mut overflow = HashMap::with_capacity(n_overflow);
|
||||||
for i in 0..n_overflow {
|
for i in 0..n_overflow {
|
||||||
let off = data_offset + i * OVERFLOW_ENTRY_SIZE;
|
let (slot, value) = parse_overflow_entry(&mmap, data_offset, i);
|
||||||
let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize;
|
|
||||||
let value = u32::from_le_bytes(mmap[off + 8..off + 12].try_into().unwrap());
|
|
||||||
overflow.insert(slot, value);
|
overflow.insert(slot, value);
|
||||||
}
|
}
|
||||||
|
Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
|
||||||
Ok(Self {
|
|
||||||
path: path.to_path_buf(),
|
|
||||||
mmap,
|
|
||||||
n,
|
|
||||||
overflow,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the value at the given slot, handling overflow if necessary.
|
|
||||||
pub fn get(&self, slot: usize) -> u32 {
|
pub fn get(&self, slot: usize) -> u32 {
|
||||||
match self.mmap[HEADER_SIZE + slot] {
|
match self.mmap[HEADER_SIZE + slot] {
|
||||||
255 => *self
|
255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
|
||||||
.overflow
|
v => v as u32,
|
||||||
.get(&slot)
|
|
||||||
.expect("sentinel without overflow entry"),
|
|
||||||
v => v as u32,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -83,61 +69,201 @@ impl PersistentCompactIntVecBuilder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn len(&self) -> usize {
|
pub fn len(&self) -> usize { self.n }
|
||||||
self.n
|
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||||
|
|
||||||
|
pub fn primary_bytes(&self) -> &[u8] { &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
|
||||||
|
pub fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
|
||||||
|
pub fn clear_overflow(&mut self) { self.overflow.clear(); }
|
||||||
|
|
||||||
|
pub fn sum(&self) -> u64 {
|
||||||
|
byte_sum(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n], self.overflow.values().copied())
|
||||||
|
}
|
||||||
|
pub fn count_nonzero(&self) -> u64 {
|
||||||
|
byte_count_nonzero(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n])
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_empty(&self) -> bool {
|
pub fn view(&self) -> IntSliceView<'_> {
|
||||||
self.n == 0
|
// Builder overflow is a HashMap, not sorted raw bytes — convert on the fly
|
||||||
|
// by collecting into a sorted vec and storing in a thread-local buffer.
|
||||||
|
// For read-back during building, just call get(slot) directly.
|
||||||
|
// view() is primarily useful AFTER freeze (on PersistentCompactIntVec).
|
||||||
|
// Here we expose it via a zero-alloc path: primary only, no overflow raw.
|
||||||
|
// Callers that need overflow_entries during building use overflow_entries().
|
||||||
|
let primary = &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n];
|
||||||
|
IntSliceView::new(primary, &[], 0, self.n)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn min(&mut self, other: &PersistentCompactIntVec) {
|
pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
self.overflow.iter().map(|(&k, &v)| (k, v))
|
||||||
for (slot, other_val) in other.iter().enumerate() {
|
}
|
||||||
if other_val < self.get(slot) {
|
|
||||||
self.set(slot, other_val);
|
pub fn inc(&mut self, slot: usize) {
|
||||||
|
let v = self.get(slot);
|
||||||
|
self.set(slot, v.saturating_add(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Computation methods ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Increment one counter per 1-bit of `col`. Safe for any group size.
|
||||||
|
pub fn inc_present(&mut self, col: BitSliceView<'_>) {
|
||||||
|
let n = self.n;
|
||||||
|
for (wi, &word) in col.words().iter().enumerate() {
|
||||||
|
if word == 0 { continue; }
|
||||||
|
let mut w = word;
|
||||||
|
while w != 0 {
|
||||||
|
let bit = w.trailing_zeros() as usize;
|
||||||
|
let slot = wi * 64 + bit;
|
||||||
|
if slot < n { self.inc(slot); }
|
||||||
|
w &= w - 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn max(&mut self, other: &PersistentCompactIntVec) {
|
/// Increment one counter per 1-bit of `col`, using raw u8 arithmetic.
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
/// Caller guarantees no counter will reach 255 (group size < 255).
|
||||||
for (slot, other_val) in other.iter().enumerate() {
|
pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
|
||||||
if other_val > self.get(slot) {
|
{
|
||||||
self.set(slot, other_val);
|
let primary = self.primary_bytes_mut();
|
||||||
|
let n = primary.len();
|
||||||
|
for (wi, &word) in col.words().iter().enumerate() {
|
||||||
|
if word == 0 { continue; }
|
||||||
|
let mut w = word;
|
||||||
|
while w != 0 {
|
||||||
|
let bit = w.trailing_zeros() as usize;
|
||||||
|
let s = wi * 64 + bit;
|
||||||
|
if s < n { primary[s] += 1; }
|
||||||
|
w &= w - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
debug_assert!(
|
||||||
|
!self.primary_bytes().contains(&255),
|
||||||
|
"sentinel 255 reached in inc_present_fast — group size must be < 255"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Two-pass: primary bytes then overflow. Increments `self[slot]` for each
|
||||||
|
/// slot where `pred(col[slot])` is true. Safe for any group size.
|
||||||
|
pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
let n = col.len();
|
||||||
|
for slot in 0..n {
|
||||||
|
let b = col.primary_bytes()[slot];
|
||||||
|
if b < 255 && pred(b as u32) {
|
||||||
|
self.inc(slot);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (slot, val) in col.overflow_entries() {
|
||||||
|
if pred(val) { self.inc(slot); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fast two-pass: raw u8 arithmetic. Caller guarantees no counter reaches 255.
|
||||||
|
pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
let n = col.len();
|
||||||
|
{
|
||||||
|
let primary = self.primary_bytes_mut();
|
||||||
|
for slot in 0..n {
|
||||||
|
let b = col.primary_bytes()[slot];
|
||||||
|
if b < 255 && pred(b as u32) {
|
||||||
|
primary[slot] += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (slot, val) in col.overflow_entries() {
|
||||||
|
if pred(val) { self.primary_bytes_mut()[slot] += 1; }
|
||||||
|
}
|
||||||
|
debug_assert!(
|
||||||
|
!self.primary_bytes().contains(&255),
|
||||||
|
"sentinel 255 reached in inc_predicate_fast — group size must be < 255"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add(&mut self, other: IntSliceView<'_>) {
|
||||||
|
let n = self.n;
|
||||||
|
for s in 0..n {
|
||||||
|
let sb = self.primary_bytes()[s];
|
||||||
|
let ob = other.primary_bytes()[s];
|
||||||
|
if sb < 255 && ob < 255 {
|
||||||
|
let sum = sb as u32 + ob as u32;
|
||||||
|
if sum < 255 { self.primary_bytes_mut()[s] = sum as u8; }
|
||||||
|
else { self.set(s, sum); }
|
||||||
|
} else {
|
||||||
|
let sv = self.get(s);
|
||||||
|
let ov = other.get(s);
|
||||||
|
self.set(s, sv + ov);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add(&mut self, other: &PersistentCompactIntVec) {
|
pub fn min(&mut self, other: IntSliceView<'_>) {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
let self_ov: Vec<(usize, u32)> = self.overflow_entries().collect();
|
||||||
for (slot, other_val) in other.iter().enumerate() {
|
let other_ov: HashMap<usize, u32> = other.overflow_entries().collect();
|
||||||
let cur = self.get(slot);
|
self.clear_overflow();
|
||||||
self.set(slot, cur.checked_add(other_val).expect("u32 overflow in add"));
|
for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
|
||||||
|
if b < *a { *a = b; }
|
||||||
|
}
|
||||||
|
for (slot, self_val) in self_ov {
|
||||||
|
if let Some(&other_val) = other_ov.get(&slot) {
|
||||||
|
self.set(slot, self_val.min(other_val));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn diff(&mut self, other: &PersistentCompactIntVec) {
|
pub fn max(&mut self, other: IntSliceView<'_>) {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
for (slot, other_val) in other.overflow_entries() {
|
||||||
for (slot, other_val) in other.iter().enumerate() {
|
let sv = self.get(slot);
|
||||||
self.set(slot, self.get(slot).saturating_sub(other_val));
|
self.set(slot, sv.max(other_val));
|
||||||
|
}
|
||||||
|
for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
|
||||||
|
if b > *a { *a = b; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Per-slot saturating subtraction: `self[slot] = self[slot] - other[slot]`,
/// clamped at 0.
///
/// Slots whose own primary byte is below the 255 sentinel are handled with
/// raw u8 arithmetic; a sentinel in `other` there means its value exceeds
/// ours, so the result is 0. Slots holding a sentinel in `self` go through
/// `get`/`set`.
///
/// # Panics
/// Panics if `other` does not have exactly `self.n` slots.
pub fn diff(&mut self, other: IntSliceView<'_>) {
    assert_eq!(self.n, other.len(), "IntSliceView length mismatch");
    let n = self.n;
    for s in 0..n {
        let sb = self.primary_bytes()[s];
        let ob = other.primary_bytes()[s];
        if sb < 255 {
            self.primary_bytes_mut()[s] = if ob < 255 { sb.saturating_sub(ob) } else { 0 };
        } else {
            let sv = self.get(s);
            let ov = if ob < 255 { u32::from(ob) } else { other.get(s) };
            self.set(s, sv.saturating_sub(ov));
        }
    }
}
|
||||||
|
|
||||||
|
pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
|
||||||
|
let n = self.n;
|
||||||
|
for (wi, &word) in mask.words().iter().enumerate() {
|
||||||
|
if word == u64::MAX { continue; }
|
||||||
|
let mut zeros = !word;
|
||||||
|
while zeros != 0 {
|
||||||
|
let bit = zeros.trailing_zeros() as usize;
|
||||||
|
let s = wi * 64 + bit;
|
||||||
|
if s < n {
|
||||||
|
let b = self.primary_bytes()[s];
|
||||||
|
if b != 0 { self.set(s, 0); }
|
||||||
|
}
|
||||||
|
zeros &= zeros - 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush the primary mmap, then write sorted overflow data + index and fix the header.
|
|
||||||
pub fn close(self) -> io::Result<()> {
|
pub fn close(self) -> io::Result<()> {
|
||||||
self.mmap.flush()?;
|
self.mmap.flush()?;
|
||||||
let Self {
|
let Self { path, mmap, n, overflow } = self;
|
||||||
path,
|
|
||||||
mmap,
|
|
||||||
n,
|
|
||||||
overflow,
|
|
||||||
} = self;
|
|
||||||
drop(mmap);
|
drop(mmap);
|
||||||
|
|
||||||
let mut entries: Vec<(usize, u32)> = overflow.into_iter().collect();
|
let mut entries: Vec<(usize, u32)> = overflow.into_iter().collect();
|
||||||
entries.sort_unstable_by_key(|&(slot, _)| slot);
|
entries.sort_unstable_by_key(|&(slot, _)| slot);
|
||||||
|
|
||||||
finalize_pciv(&path, n, &entries)
|
finalize_pciv(&path, n, &entries)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn finish(self) -> io::Result<PersistentCompactIntVec> {
|
||||||
|
let path = self.path.clone();
|
||||||
|
self.close()?;
|
||||||
|
PersistentCompactIntVec::open(&path)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,137 @@
|
|||||||
|
use std::io;
|
||||||
|
|
||||||
|
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||||
|
use crate::tempintvec::TempCompactIntVec;
|
||||||
|
|
||||||
|
// ── ColGroup ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// A named subset of columns, identified by their indices within the matrix.
///
/// Defined once at the index level; the same indices are valid across all
/// partitions and layers because the column structure (samples / genomes) is
/// identical everywhere — only the row space (kmer slots) is partitioned.
#[derive(Debug, Clone)]
pub struct ColGroup {
    /// Human-readable label of the group.
    pub name: String,
    /// Column indices that belong to the group.
    pub indices: Vec<usize>,
}

impl ColGroup {
    /// Creates a group called `name` covering the given column `indices`.
    pub fn new(name: impl Into<String>, indices: Vec<usize>) -> Self {
        Self { name: name.into(), indices }
    }
}
|
||||||
|
|
||||||
|
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Per-matrix group aggregations.
|
||||||
|
///
|
||||||
|
/// `partial_group_presence_count`, `partial_group_sum`, `partial_group_any`,
|
||||||
|
/// `partial_group_min`, `partial_group_max` are the primitives; each impl must
|
||||||
|
/// provide all five.
|
||||||
|
///
|
||||||
|
/// `partial_group_all` and `partial_group_none` have default implementations
|
||||||
|
/// derived from `partial_group_presence_count` and should rarely need overriding.
|
||||||
|
pub trait MatrixGroupOps {
|
||||||
|
/// Per-slot count of group columns whose value ≥ `threshold`.
|
||||||
|
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec>;
|
||||||
|
|
||||||
|
/// Per-slot sum of values across all group columns.
|
||||||
|
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
|
||||||
|
|
||||||
|
/// Per-slot OR: 1 if any group column has value ≥ `threshold`.
|
||||||
|
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec>;
|
||||||
|
|
||||||
|
/// Per-slot min value across all group columns (0 if group is empty).
|
||||||
|
fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
|
||||||
|
|
||||||
|
/// Per-slot max value across all group columns (0 if group is empty).
|
||||||
|
fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
|
||||||
|
|
||||||
|
/// Per-slot AND: 1 if ALL group columns have value ≥ `threshold`.
|
||||||
|
fn partial_group_all(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
|
||||||
|
let counts = self.partial_group_presence_count(g, threshold)?;
|
||||||
|
let n = counts.len();
|
||||||
|
let n_required = g.indices.len() as u32;
|
||||||
|
let mut b = TempBitVecBuilder::new(n)?;
|
||||||
|
b.or_where(counts.view(), |v| v >= n_required);
|
||||||
|
b.freeze()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Per-slot NOR: 1 if NO group column has value ≥ `threshold`.
|
||||||
|
fn partial_group_none(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
|
||||||
|
let counts = self.partial_group_presence_count(g, threshold)?;
|
||||||
|
let n = counts.len();
|
||||||
|
let mut b = TempBitVecBuilder::new(n)?;
|
||||||
|
b.or_where(counts.view(), |v| v == 0);
|
||||||
|
b.freeze()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── FilterMask — expression tree for column-based slot filters ────────────────
|
||||||
|
|
||||||
|
/// A composable filter expression that can be evaluated against a matrix
/// using only column operations (no MPHF lookup per kmer).
///
/// `threshold` semantics follow [`MatrixGroupOps::partial_group_presence_count`]:
/// a slot contributes to the count when its value is **≥ threshold**.
/// To match the row-level filter (`value > t`), callers should pass `t + 1`.
#[derive(Debug, Clone)]
pub enum FilterMask {
    /// Slot passes if count of columns in `indices` with value ≥ `threshold` is ≥ `min_count`.
    PresenceGeq {
        indices: Vec<usize>,
        threshold: u32,
        min_count: usize,
    },
    /// Slot passes if count of columns in `indices` with value ≥ `threshold` is ≤ `max_count`.
    PresenceLeq {
        indices: Vec<usize>,
        threshold: u32,
        max_count: usize,
    },
    /// Slot passes if sum of values across `indices` columns is ≥ `min_sum`.
    SumGeq {
        indices: Vec<usize>,
        min_sum: u32,
    },
    /// Slot passes if sum of values across `indices` columns is ≤ `max_sum`.
    SumLeq {
        indices: Vec<usize>,
        max_sum: u32,
    },
    /// Slot passes if it passes all sub-expressions. Empty `And` is always true.
    And(Vec<FilterMask>),
}
|
||||||
|
|
||||||
|
/// Evaluate a [`FilterMask`] against `mat`, returning a per-slot `TempBitVec`
|
||||||
|
/// where bit=1 means the slot passes the filter.
|
||||||
|
pub fn eval_filter_mask(expr: &FilterMask, mat: &dyn MatrixGroupOps, n: usize) -> io::Result<TempBitVec> {
|
||||||
|
match expr {
|
||||||
|
FilterMask::PresenceGeq { indices, threshold, min_count } => {
|
||||||
|
let g = ColGroup::new("", indices.clone());
|
||||||
|
let counts = mat.partial_group_presence_count(&g, *threshold)?;
|
||||||
|
let mut b = TempBitVecBuilder::new(n)?;
|
||||||
|
let mc = *min_count as u32;
|
||||||
|
b.or_where(counts.view(), |v| v >= mc);
|
||||||
|
b.freeze()
|
||||||
|
}
|
||||||
|
FilterMask::PresenceLeq { indices, threshold, max_count } => {
|
||||||
|
let g = ColGroup::new("", indices.clone());
|
||||||
|
let counts = mat.partial_group_presence_count(&g, *threshold)?;
|
||||||
|
let mut b = TempBitVecBuilder::new(n)?;
|
||||||
|
let mc = *max_count as u32;
|
||||||
|
b.or_where(counts.view(), |v| v <= mc);
|
||||||
|
b.freeze()
|
||||||
|
}
|
||||||
|
FilterMask::SumGeq { indices, min_sum } => {
|
||||||
|
let g = ColGroup::new("", indices.clone());
|
||||||
|
let sums = mat.partial_group_sum(&g)?;
|
||||||
|
let mut b = TempBitVecBuilder::new(n)?;
|
||||||
|
let ms = *min_sum;
|
||||||
|
b.or_where(sums.view(), |v| v >= ms);
|
||||||
|
b.freeze()
|
||||||
|
}
|
||||||
|
FilterMask::SumLeq { indices, max_sum } => {
|
||||||
|
let g = ColGroup::new("", indices.clone());
|
||||||
|
let sums = mat.partial_group_sum(&g)?;
|
||||||
|
let mut b = TempBitVecBuilder::new(n)?;
|
||||||
|
let ms = *max_sum;
|
||||||
|
b.or_where(sums.view(), |v| v <= ms);
|
||||||
|
b.freeze()
|
||||||
|
}
|
||||||
|
FilterMask::And(parts) => {
|
||||||
|
let mut b = TempBitVecBuilder::new_ones(n)?;
|
||||||
|
for part in parts {
|
||||||
|
let m = eval_filter_mask(part, mat, n)?;
|
||||||
|
b.and(m.view());
|
||||||
|
}
|
||||||
|
b.freeze()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -13,6 +13,44 @@ pub const OVERFLOW_ENTRY_SIZE: usize = 12;
|
|||||||
// Index entry: slot(u64) + pos(u64) = 16 bytes.
|
// Index entry: slot(u64) + pos(u64) = 16 bytes.
|
||||||
pub const INDEX_ENTRY_SIZE: usize = 16;
|
pub const INDEX_ENTRY_SIZE: usize = 16;
|
||||||
|
|
||||||
|
/// Sum all values in a compact-int primary byte slice, correcting for overflow sentinels.
///
/// `primary` is the raw `&[u8]` where 255 is a sentinel for large values.
/// `overflow` yields the true values (≥ 255) for each sentinel, in any order.
///
/// Every overflow entry replaces one 255 counted in the raw byte sum, so the
/// result is `raw - 255 * entries + sum(overflow)`.
#[inline]
pub(crate) fn byte_sum(primary: &[u8], overflow: impl Iterator<Item = u32>) -> u64 {
    let raw: u64 = primary.iter().copied().map(u64::from).sum();
    let mut sentinels = 0u64;
    let mut true_values = 0u64;
    for value in overflow {
        sentinels += 1;
        true_values += u64::from(value);
    }
    raw - 255 * sentinels + true_values
}
|
||||||
|
|
||||||
|
/// Count non-zero values in a compact-int primary byte slice.
///
/// Overflow sentinels (255) are always non-zero by construction, so a single
/// `b != 0` test is sufficient — no overflow map lookup needed.
#[inline]
pub(crate) fn byte_count_nonzero(primary: &[u8]) -> u64 {
    primary.iter().map(|&b| u64::from(b != 0)).sum()
}
|
||||||
|
|
||||||
|
/// Parse a single overflow entry `(slot, value)` from a byte slice.
|
||||||
|
#[inline]
|
||||||
|
pub fn parse_overflow_entry(data: &[u8], base: usize, i: usize) -> (usize, u32) {
|
||||||
|
let off = base + i * OVERFLOW_ENTRY_SIZE;
|
||||||
|
let slot = u64::from_le_bytes(data[off..off+8].try_into().unwrap()) as usize;
|
||||||
|
let value = u32::from_le_bytes(data[off+8..off+12].try_into().unwrap());
|
||||||
|
(slot, value)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse a single sparse-index entry `(slot, pos)` from a byte slice.
|
||||||
|
#[inline]
|
||||||
|
pub fn parse_index_entry(data: &[u8], base: usize, i: usize) -> (usize, usize) {
|
||||||
|
let off = base + i * INDEX_ENTRY_SIZE;
|
||||||
|
let slot = u64::from_le_bytes(data[off..off+8].try_into().unwrap()) as usize;
|
||||||
|
let pos = u64::from_le_bytes(data[off+8..off+16].try_into().unwrap()) as usize;
|
||||||
|
(slot, pos)
|
||||||
|
}
|
||||||
|
|
||||||
// Sparse index target: ≤ 32 KB in L1 cache (16 B per entry → 2048 entries).
|
// Sparse index target: ≤ 32 KB in L1 cache (16 B per entry → 2048 entries).
|
||||||
pub const L1_INDEX_ENTRIES: usize = 2048;
|
pub const L1_INDEX_ENTRIES: usize = 2048;
|
||||||
|
|
||||||
|
|||||||
+166
-225
@@ -1,16 +1,20 @@
|
|||||||
use std::cmp::Ordering;
|
|
||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{self, Write as _};
|
use std::io::{self, BufWriter, Write as _};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
use ndarray::{Array1, Array2};
|
use ndarray::{Array1, Array2};
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
|
|
||||||
|
use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
|
||||||
use crate::builder::PersistentCompactIntVecBuilder;
|
use crate::builder::PersistentCompactIntVecBuilder;
|
||||||
use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE};
|
use crate::colgroup::{ColGroup, MatrixGroupOps};
|
||||||
|
use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE};
|
||||||
use crate::meta::MatrixMeta;
|
use crate::meta::MatrixMeta;
|
||||||
use crate::reader::PersistentCompactIntVec;
|
use crate::reader::PersistentCompactIntVec;
|
||||||
|
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||||
|
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||||
|
use crate::views::IntSliceView;
|
||||||
|
|
||||||
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
||||||
dir.join(format!("col_{col:06}.pciv"))
|
dir.join(format!("col_{col:06}.pciv"))
|
||||||
@@ -41,9 +45,7 @@ impl ColumnarCompactIntMatrix {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||||
for (c, col) in self.cols.iter().enumerate() {
|
for (c, col) in self.cols.iter().enumerate() { buf[c] = col.get(slot); }
|
||||||
buf[c] = col.get(slot);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn sum(&self) -> Array1<u64> {
|
pub(crate) fn sum(&self) -> Array1<u64> {
|
||||||
@@ -63,49 +65,26 @@ impl ColumnarCompactIntMatrix {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||||
self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
|
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j)))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||||
self.pairwise(|i, j| self.col(i).partial_euclidean_dist(self.col(j)))
|
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j)))
|
||||||
}
|
}
|
||||||
|
pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
|
||||||
pub(crate) fn partial_threshold_jaccard_dist_matrix(
|
pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold))
|
||||||
&self, threshold: u32,
|
|
||||||
) -> (Array2<u64>, Array2<u64>) {
|
|
||||||
let n = self.n_cols();
|
|
||||||
let pairs = upper_pairs(n);
|
|
||||||
let results: Vec<(usize, usize, u64, u64)> = pairs
|
|
||||||
.into_par_iter()
|
|
||||||
.map(|(i, j)| {
|
|
||||||
let (inter, union) =
|
|
||||||
self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold);
|
|
||||||
(i, j, inter, union)
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
let mut inter_m = Array2::zeros((n, n));
|
|
||||||
let mut union_m = Array2::zeros((n, n));
|
|
||||||
for (i, j, inter, union) in results {
|
|
||||||
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
|
|
||||||
union_m[[i, j]] = union; union_m[[j, i]] = union;
|
|
||||||
}
|
|
||||||
(inter_m, union_m)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
self.pairwise(|i, j| {
|
pairwise_matrix(self.n_cols(), |i, j| {
|
||||||
self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
self.pairwise(|i, j| {
|
pairwise_matrix(self.n_cols(), |i, j| {
|
||||||
self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
self.pairwise(|i, j| {
|
pairwise_matrix(self.n_cols(), |i, j| {
|
||||||
self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -118,20 +97,6 @@ impl ColumnarCompactIntMatrix {
|
|||||||
meta.n_cols += 1;
|
meta.n_cols += 1;
|
||||||
meta.save(dir)
|
meta.save(dir)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
|
|
||||||
let n = self.n_cols();
|
|
||||||
let results: Vec<(usize, usize, f64)> = upper_pairs(n)
|
|
||||||
.into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
|
|
||||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
|
|
||||||
let n = self.n_cols();
|
|
||||||
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
|
|
||||||
.into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
|
|
||||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── PackedCompactIntMatrix ────────────────────────────────────────────────────
|
// ── PackedCompactIntMatrix ────────────────────────────────────────────────────
|
||||||
@@ -139,13 +104,10 @@ impl ColumnarCompactIntMatrix {
|
|||||||
const PCMX_MAGIC: [u8; 4] = *b"PCMX";
|
const PCMX_MAGIC: [u8; 4] = *b"PCMX";
|
||||||
const PCMX_HEADER: usize = 24; // magic(4) + pad(4) + n_rows(8) + n_cols(8)
|
const PCMX_HEADER: usize = 24; // magic(4) + pad(4) + n_rows(8) + n_cols(8)
|
||||||
|
|
||||||
/// Per-column metadata pre-parsed from the embedded PCIV header.
|
|
||||||
struct ColInfo {
|
struct ColInfo {
|
||||||
primary_start: usize, // absolute mmap offset to primary array
|
primary_start: usize,
|
||||||
data_offset: usize, // absolute mmap offset to overflow array
|
data_offset: usize,
|
||||||
n_overflow: usize,
|
n_overflow: usize,
|
||||||
step: usize,
|
|
||||||
index: Vec<(usize, usize)>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct PackedCompactIntMatrix {
|
pub struct PackedCompactIntMatrix {
|
||||||
@@ -171,61 +133,31 @@ impl PackedCompactIntMatrix {
|
|||||||
for c in 0..n_cols {
|
for c in 0..n_cols {
|
||||||
let off_pos = PCMX_HEADER + c * 8;
|
let off_pos = PCMX_HEADER + c * 8;
|
||||||
let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize;
|
let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize;
|
||||||
// Parse embedded PCIV header at col_base
|
let n_ov = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize;
|
||||||
let n_ov = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize;
|
let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap()) as usize;
|
||||||
let n_idx = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize;
|
|
||||||
let step = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize;
|
|
||||||
let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap()) as usize;
|
|
||||||
|
|
||||||
let primary_start = col_base + HEADER_SIZE;
|
let primary_start = col_base + HEADER_SIZE;
|
||||||
let data_offset = primary_start + n_pciv;
|
let data_offset = primary_start + n_pciv;
|
||||||
let index_offset = data_offset + n_ov * OVERFLOW_ENTRY_SIZE;
|
columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov });
|
||||||
|
|
||||||
let mut index = Vec::with_capacity(n_idx);
|
|
||||||
for i in 0..n_idx {
|
|
||||||
let ioff = index_offset + i * INDEX_ENTRY_SIZE;
|
|
||||||
let slot = u64::from_le_bytes(mmap[ioff..ioff+8].try_into().unwrap()) as usize;
|
|
||||||
let pos = u64::from_le_bytes(mmap[ioff+8..ioff+16].try_into().unwrap()) as usize;
|
|
||||||
index.push((slot, pos));
|
|
||||||
}
|
|
||||||
columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index });
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Self { mmap, n_rows, n_cols, columns })
|
Ok(Self { mmap, n_rows, n_cols, columns })
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
pub(crate) fn col_view(&self, c: usize) -> IntSliceView<'_> {
|
||||||
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
|
let ci = &self.columns[c];
|
||||||
let ci = &self.columns[col];
|
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
|
||||||
let v = self.mmap[ci.primary_start + slot];
|
let overflow_raw = &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE];
|
||||||
if v < 255 { return v as u32; }
|
IntSliceView::new(primary, overflow_raw, ci.n_overflow, self.n_rows)
|
||||||
self.overflow_get(ci, slot)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn overflow_get(&self, ci: &ColInfo, slot: usize) -> u32 {
|
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||||
let (pos_start, pos_end) = if ci.step == 0 {
|
let view = self.col_view(c);
|
||||||
(0, ci.n_overflow)
|
let overflow: std::collections::HashMap<usize, u32> = view.overflow_entries().collect();
|
||||||
} else {
|
PersistentCompactIntVecBuilder::from_raw_primary(view.primary_bytes(), overflow, path)
|
||||||
let i = ci.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
|
||||||
let start = ci.index[i].1;
|
|
||||||
let end = if i + 1 < ci.index.len() { ci.index[i+1].1 } else { ci.n_overflow };
|
|
||||||
(start, end)
|
|
||||||
};
|
|
||||||
let mut lo = pos_start;
|
|
||||||
let mut hi = pos_end;
|
|
||||||
while lo < hi {
|
|
||||||
let mid = lo + (hi - lo) / 2;
|
|
||||||
let off = ci.data_offset + mid * OVERFLOW_ENTRY_SIZE;
|
|
||||||
let stored = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
|
|
||||||
match stored.cmp(&slot) {
|
|
||||||
Ordering::Equal => return u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()),
|
|
||||||
Ordering::Less => lo = mid + 1,
|
|
||||||
Ordering::Greater => hi = mid,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
panic!("slot {slot} marked overflow but not found")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 { self.col_view(col).get(slot) }
|
||||||
|
|
||||||
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||||
for c in 0..self.n_cols { buf[c] = self.get(c, slot); }
|
for c in 0..self.n_cols { buf[c] = self.get(c, slot); }
|
||||||
}
|
}
|
||||||
@@ -236,149 +168,96 @@ impl PackedCompactIntMatrix {
|
|||||||
|
|
||||||
pub(crate) fn sum(&self) -> Array1<u64> {
|
pub(crate) fn sum(&self) -> Array1<u64> {
|
||||||
Array1::from_vec(
|
Array1::from_vec(
|
||||||
(0..self.n_cols).into_par_iter()
|
(0..self.n_cols).into_par_iter().map(|c| self.col_view(c).sum()).collect()
|
||||||
.map(|c| (0..self.n_rows).map(|s| self.get(c, s) as u64).sum())
|
|
||||||
.collect()
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||||
Array1::from_vec(
|
Array1::from_vec(
|
||||||
(0..self.n_cols).into_par_iter()
|
(0..self.n_cols).into_par_iter().map(|c| self.col_view(c).count_nonzero()).collect()
|
||||||
.map(|c| (0..self.n_rows).filter(|&s| self.get(c, s) > 0).count() as u64)
|
|
||||||
.collect()
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Pair primitives ───────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
||||||
(0..self.n_rows).map(|s| self.get(i, s).min(self.get(j, s)) as u64).sum()
|
self.col_view(i).iter().zip(self.col_view(j).iter()).map(|(a, b)| a.min(b) as u64).sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
|
fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
|
||||||
(0..self.n_rows).map(|s| {
|
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||||
let d = self.get(i, s) as f64 - self.get(j, s) as f64;
|
.map(|(a, b)| { let d = a as f64 - b as f64; d * d }).sum()
|
||||||
d * d
|
|
||||||
}).sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
|
fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
|
||||||
let (mut inter, mut union) = (0u64, 0u64);
|
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||||
for s in 0..self.n_rows {
|
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||||||
let a = self.get(i, s) >= t;
|
let ap = a >= t; let bp = b >= t;
|
||||||
let b = self.get(j, s) >= t;
|
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
|
||||||
if a && b { inter += 1; }
|
})
|
||||||
if a || b { union += 1; }
|
|
||||||
}
|
|
||||||
(inter, union)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||||
(0..self.n_rows).map(|s| {
|
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||||
(self.get(i, s) as f64 / si).min(self.get(j, s) as f64 / sj)
|
.map(|(a, b)| (a as f64 / si).min(b as f64 / sj)).sum()
|
||||||
}).sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||||
(0..self.n_rows).map(|s| {
|
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||||
let d = self.get(i, s) as f64 / si - self.get(j, s) as f64 / sj;
|
.map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d }).sum()
|
||||||
d * d
|
|
||||||
}).sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||||
(0..self.n_rows).map(|s| {
|
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||||
let d = (self.get(i, s) as f64 / si).sqrt() - (self.get(j, s) as f64 / sj).sqrt();
|
.map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d }).sum()
|
||||||
d * d
|
|
||||||
}).sum()
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Matrix methods ────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
fn pairwise<T>(&self, f: impl Fn(usize, usize) -> T + Sync) -> Array2<T>
|
|
||||||
where T: Clone + Default + Send {
|
|
||||||
let n = self.n_cols;
|
|
||||||
let results: Vec<(usize, usize, T)> = upper_pairs(n)
|
|
||||||
.into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
|
|
||||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| { let w = v.clone(); (i, j, v, w) }))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
|
|
||||||
let n = self.n_cols;
|
|
||||||
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
|
|
||||||
.into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
|
|
||||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||||
self.pairwise_u64(|i, j| self.pair_partial_bray(i, j))
|
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||||
self.pairwise(|i, j| self.pair_partial_euclidean(i, j))
|
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2<u64>, Array2<u64>) {
|
pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2<u64>, Array2<u64>) {
|
||||||
let n = self.n_cols;
|
pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t))
|
||||||
let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
|
|
||||||
.into_par_iter()
|
|
||||||
.map(|(i, j)| { let (inter, union) = self.pair_partial_threshold_jaccard(i, j, t); (i, j, inter, union) })
|
|
||||||
.collect();
|
|
||||||
let mut inter_m = Array2::zeros((n, n));
|
|
||||||
let mut union_m = Array2::zeros((n, n));
|
|
||||||
for (i, j, inter, union) in results {
|
|
||||||
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
|
|
||||||
union_m[[i, j]] = union; union_m[[j, i]] = union;
|
|
||||||
}
|
|
||||||
(inter_m, union_m)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
self.pairwise(|i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
self.pairwise(|i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
self.pairwise(|i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
||||||
pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
||||||
let meta = MatrixMeta::load(dir)?;
|
let packed_path = dir.join("matrix.pcmx");
|
||||||
|
if packed_path.exists() {
|
||||||
|
if let Ok(meta) = MatrixMeta::load(dir) {
|
||||||
|
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
||||||
|
let _ = fs::remove_file(dir.join("meta.json"));
|
||||||
|
}
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let meta = MatrixMeta::load(dir)?;
|
||||||
let n_cols = meta.n_cols;
|
let n_cols = meta.n_cols;
|
||||||
|
let col_sizes: Vec<u64> = (0..n_cols)
|
||||||
let col_files: Vec<Vec<u8>> = (0..n_cols)
|
.map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
|
||||||
.map(|c| fs::read(col_path(dir, c)))
|
|
||||||
.collect::<io::Result<_>>()?;
|
.collect::<io::Result<_>>()?;
|
||||||
|
let header_size = (PCMX_HEADER + n_cols * 8) as u64;
|
||||||
let header_size = PCMX_HEADER + n_cols * 8;
|
|
||||||
let mut col_offset = header_size;
|
let mut col_offset = header_size;
|
||||||
let mut offsets = Vec::with_capacity(n_cols);
|
let mut offsets = Vec::with_capacity(n_cols);
|
||||||
for data in &col_files {
|
for &size in &col_sizes { offsets.push(col_offset); col_offset += size; }
|
||||||
offsets.push(col_offset as u64);
|
let tmp_path = dir.join("matrix.pcmx.tmp");
|
||||||
col_offset += data.len();
|
let mut out = BufWriter::new(File::create(&tmp_path)?);
|
||||||
}
|
out.write_all(&PCMX_MAGIC)?;
|
||||||
|
out.write_all(&[0u8; 4])?;
|
||||||
let packed_path = dir.join("matrix.pcmx");
|
out.write_all(&(meta.n as u64).to_le_bytes())?;
|
||||||
let mut file = File::create(&packed_path)?;
|
out.write_all(&(n_cols as u64).to_le_bytes())?;
|
||||||
file.write_all(&PCMX_MAGIC)?;
|
for &off in &offsets { out.write_all(&off.to_le_bytes())?; }
|
||||||
file.write_all(&[0u8; 4])?;
|
for c in 0..n_cols { io::copy(&mut File::open(col_path(dir, c))?, &mut out)?; }
|
||||||
file.write_all(&(meta.n as u64).to_le_bytes())?;
|
out.flush()?;
|
||||||
file.write_all(&(n_cols as u64).to_le_bytes())?;
|
drop(out);
|
||||||
for &off in &offsets { file.write_all(&off.to_le_bytes())?; }
|
fs::rename(&tmp_path, &packed_path)?;
|
||||||
for data in &col_files { file.write_all(data)?; }
|
|
||||||
drop(file);
|
|
||||||
|
|
||||||
for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
|
for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
|
||||||
fs::remove_file(dir.join("meta.json"))?;
|
fs::remove_file(dir.join("meta.json"))?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -392,18 +271,14 @@ pub enum PersistentCompactIntMatrix {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl PersistentCompactIntMatrix {
|
impl PersistentCompactIntMatrix {
|
||||||
/// Open from `layer_dir`, auto-detecting Packed or Columnar.
|
|
||||||
pub fn open(layer_dir: &Path) -> io::Result<Self> {
|
pub fn open(layer_dir: &Path) -> io::Result<Self> {
|
||||||
let counts_dir = layer_dir.join("counts");
|
let counts_dir = layer_dir.join("counts");
|
||||||
|
|
||||||
if counts_dir.join("matrix.pcmx").exists() {
|
if counts_dir.join("matrix.pcmx").exists() {
|
||||||
return Ok(Self::Packed(PackedCompactIntMatrix::open(&counts_dir.join("matrix.pcmx"))?));
|
return Ok(Self::Packed(PackedCompactIntMatrix::open(&counts_dir.join("matrix.pcmx"))?));
|
||||||
}
|
}
|
||||||
|
|
||||||
if MatrixMeta::load(&counts_dir).is_ok() {
|
if MatrixMeta::load(&counts_dir).is_ok() {
|
||||||
return Ok(Self::Columnar(ColumnarCompactIntMatrix::open(&counts_dir)?));
|
return Ok(Self::Columnar(ColumnarCompactIntMatrix::open(&counts_dir)?));
|
||||||
}
|
}
|
||||||
|
|
||||||
Err(io::Error::new(
|
Err(io::Error::new(
|
||||||
io::ErrorKind::NotFound,
|
io::ErrorKind::NotFound,
|
||||||
format!("no count matrix found in {} — run 'obikmer upgrade'", layer_dir.display()),
|
format!("no count matrix found in {} — run 'obikmer upgrade'", layer_dir.display()),
|
||||||
@@ -413,7 +288,6 @@ impl PersistentCompactIntMatrix {
|
|||||||
pub fn n(&self) -> usize {
|
pub fn n(&self) -> usize {
|
||||||
match self { Self::Columnar(m) => m.n(), Self::Packed(m) => m.n_rows }
|
match self { Self::Columnar(m) => m.n(), Self::Packed(m) => m.n_rows }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn n_cols(&self) -> usize {
|
pub fn n_cols(&self) -> usize {
|
||||||
match self { Self::Columnar(m) => m.n_cols(), Self::Packed(m) => m.n_cols }
|
match self { Self::Columnar(m) => m.n_cols(), Self::Packed(m) => m.n_cols }
|
||||||
}
|
}
|
||||||
@@ -425,22 +299,32 @@ impl PersistentCompactIntMatrix {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn col_view(&self, c: usize) -> IntSliceView<'_> {
|
||||||
|
match self {
|
||||||
|
Self::Columnar(m) => m.col(c).view(),
|
||||||
|
Self::Packed(m) => m.col_view(c),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||||
|
match self {
|
||||||
|
Self::Columnar(m) => PersistentCompactIntVecBuilder::build_from(m.col(c), path),
|
||||||
|
Self::Packed(m) => m.col_persist(c, path),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn row(&self, slot: usize) -> Box<[u32]> {
|
pub fn row(&self, slot: usize) -> Box<[u32]> {
|
||||||
match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) }
|
match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
pub fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||||
match self { Self::Columnar(m) => m.fill_row(slot, buf), Self::Packed(m) => m.fill_row(slot, buf) }
|
match self { Self::Columnar(m) => m.fill_row(slot, buf), Self::Packed(m) => m.fill_row(slot, buf) }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn sum(&self) -> Array1<u64> {
|
pub fn sum(&self) -> Array1<u64> {
|
||||||
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
|
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn count_nonzero(&self) -> Array1<u64> {
|
pub fn count_nonzero(&self) -> Array1<u64> {
|
||||||
match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() }
|
match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||||
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
|
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
|
||||||
}
|
}
|
||||||
@@ -459,7 +343,6 @@ impl PersistentCompactIntMatrix {
|
|||||||
pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
match self { Self::Columnar(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums), Self::Packed(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums) }
|
match self { Self::Columnar(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums), Self::Packed(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums) }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
|
pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
|
||||||
ColumnarCompactIntMatrix::append_column(dir, value_of)
|
ColumnarCompactIntMatrix::append_column(dir, value_of)
|
||||||
}
|
}
|
||||||
@@ -475,12 +358,12 @@ impl ColumnWeights for PersistentCompactIntMatrix {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl CountPartials for PersistentCompactIntMatrix {
|
impl CountPartials for PersistentCompactIntMatrix {
|
||||||
fn partial_bray(&self) -> Array2<u64> { self.partial_bray_dist_matrix() }
|
fn partial_bray(&self) -> Array2<u64> { self.partial_bray_dist_matrix() }
|
||||||
fn partial_euclidean(&self) -> Array2<f64> { self.partial_euclidean_dist_matrix() }
|
fn partial_euclidean(&self) -> Array2<f64> { self.partial_euclidean_dist_matrix() }
|
||||||
fn partial_threshold_jaccard(&self, t: u32) -> (Array2<u64>, Array2<u64>) { self.partial_threshold_jaccard_dist_matrix(t) }
|
fn partial_threshold_jaccard(&self, t: u32) -> (Array2<u64>, Array2<u64>) { self.partial_threshold_jaccard_dist_matrix(t) }
|
||||||
fn partial_relfreq_bray(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_relfreq_bray_dist_matrix(g) }
|
fn partial_relfreq_bray(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_relfreq_bray_dist_matrix(g) }
|
||||||
fn partial_relfreq_euclidean(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_relfreq_euclidean_dist_matrix(g) }
|
fn partial_relfreq_euclidean(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_relfreq_euclidean_dist_matrix(g) }
|
||||||
fn partial_hellinger(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_hellinger_euclidean_dist_matrix(g) }
|
fn partial_hellinger(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_hellinger_euclidean_dist_matrix(g) }
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
// ── Builder ───────────────────────────────────────────────────────────────────
|
||||||
@@ -496,30 +379,88 @@ impl PersistentCompactIntMatrixBuilder {
|
|||||||
fs::create_dir_all(dir)?;
|
fs::create_dir_all(dir)?;
|
||||||
Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 })
|
Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn n(&self) -> usize { self.n }
|
pub fn n(&self) -> usize { self.n }
|
||||||
pub fn n_cols(&self) -> usize { self.n_cols }
|
pub fn n_cols(&self) -> usize { self.n_cols }
|
||||||
|
|
||||||
pub fn add_col(&mut self) -> io::Result<PersistentCompactIntVecBuilder> {
|
pub fn add_col(&mut self) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||||
let path = col_path(&self.dir, self.n_cols);
|
let path = col_path(&self.dir, self.n_cols);
|
||||||
self.n_cols += 1;
|
self.n_cols += 1;
|
||||||
PersistentCompactIntVecBuilder::new(self.n, &path)
|
PersistentCompactIntVecBuilder::new(self.n, &path)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn add_col_from(&mut self, src: &TempCompactIntVec) -> io::Result<()> {
|
||||||
|
src.make_persistent(&col_path(&self.dir, self.n_cols))?;
|
||||||
|
self.n_cols += 1;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_col_from_bit(&mut self, src: &TempBitVec) -> io::Result<()> {
|
||||||
|
let path = col_path(&self.dir, self.n_cols);
|
||||||
|
self.n_cols += 1;
|
||||||
|
let mut b = PersistentCompactIntVecBuilder::new(self.n, &path)?;
|
||||||
|
b.inc_present(src.view());
|
||||||
|
b.close()
|
||||||
|
}
|
||||||
|
|
||||||
pub fn close(self) -> io::Result<()> {
|
pub fn close(self) -> io::Result<()> {
|
||||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
|
impl MatrixGroupOps for PersistentCompactIntMatrix {
|
||||||
(0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect()
|
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec> {
|
||||||
}
|
let n = self.n();
|
||||||
|
if g.indices.len() < 255 {
|
||||||
|
let mut builder = TempCompactIntVecBuilder::new(n)?;
|
||||||
|
for &c in &g.indices {
|
||||||
|
builder.inc_predicate_fast(self.col_view(c), |v| v >= threshold);
|
||||||
|
}
|
||||||
|
builder.freeze()
|
||||||
|
} else {
|
||||||
|
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||||
|
for chunk in g.indices.chunks(254) {
|
||||||
|
let mut chunk_b = TempCompactIntVecBuilder::new(n)?;
|
||||||
|
for &c in chunk {
|
||||||
|
chunk_b.inc_predicate_fast(self.col_view(c), |v| v >= threshold);
|
||||||
|
}
|
||||||
|
let frozen = chunk_b.freeze()?;
|
||||||
|
result.add(frozen.view());
|
||||||
|
}
|
||||||
|
result.freeze()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn fill_symmetric<T>(n: usize, vals: impl Iterator<Item = (usize, usize, T, T)>) -> Array2<T>
|
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||||
where T: Clone + Default {
|
let n = self.n();
|
||||||
let mut m = Array2::from_elem((n, n), T::default());
|
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||||
for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; }
|
for &c in &g.indices { result.add(self.col_view(c)); }
|
||||||
m
|
result.freeze()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
|
||||||
|
let n = self.n();
|
||||||
|
let mut result = TempBitVecBuilder::new(n)?;
|
||||||
|
for &c in &g.indices {
|
||||||
|
result.or_where(self.col_view(c), |v| v >= threshold);
|
||||||
|
}
|
||||||
|
result.freeze()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||||
|
let n = self.n();
|
||||||
|
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||||
|
if let Some((&first, rest)) = g.indices.split_first() {
|
||||||
|
result.add(self.col_view(first));
|
||||||
|
for &c in rest { result.min(self.col_view(c)); }
|
||||||
|
}
|
||||||
|
result.freeze()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||||
|
let n = self.n();
|
||||||
|
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||||
|
for &c in &g.indices { result.max(self.col_view(c)); }
|
||||||
|
result.freeze()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,11 +23,6 @@ impl LayerMeta {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn parse(s: &str) -> Option<Self> {
|
fn parse(s: &str) -> Option<Self> {
|
||||||
let key = "\"n\":";
|
Some(Self { n: crate::meta::field(s, "n")? })
|
||||||
let pos = s.find(key)? + key.len();
|
|
||||||
let rest = s[pos..].trim_start();
|
|
||||||
let end = rest.find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len());
|
|
||||||
let n = rest[..end].parse().ok()?;
|
|
||||||
Some(Self { n })
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,20 +1,28 @@
|
|||||||
mod bitvec;
|
mod bitvec;
|
||||||
mod bitmatrix;
|
mod bitmatrix;
|
||||||
mod builder;
|
mod builder;
|
||||||
|
mod colgroup;
|
||||||
mod format;
|
mod format;
|
||||||
mod intmatrix;
|
mod intmatrix;
|
||||||
mod layer_meta;
|
mod layer_meta;
|
||||||
mod meta;
|
mod meta;
|
||||||
mod reader;
|
mod reader;
|
||||||
|
mod tempbitvec;
|
||||||
|
mod tempintvec;
|
||||||
|
mod views;
|
||||||
pub mod traits;
|
pub mod traits;
|
||||||
|
|
||||||
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
|
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
|
||||||
pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
|
pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
|
||||||
pub use builder::PersistentCompactIntVecBuilder;
|
pub use builder::PersistentCompactIntVecBuilder;
|
||||||
|
pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask};
|
||||||
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
|
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
|
||||||
pub use layer_meta::LayerMeta;
|
pub use layer_meta::LayerMeta;
|
||||||
pub use reader::PersistentCompactIntVec;
|
pub use reader::{PersistentCompactIntVec, Iter as CompactIntVecIter};
|
||||||
|
pub use tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||||
|
pub use tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||||
pub use traits::{BitPartials, ColumnWeights, CountPartials};
|
pub use traits::{BitPartials, ColumnWeights, CountPartials};
|
||||||
|
pub use views::{BitSliceView, BitSliceIter, IntSliceView, IntSliceViewIter};
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
#[path = "tests/mod.rs"]
|
#[path = "tests/mod.rs"]
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ fn parse(s: &str) -> Option<MatrixMeta> {
|
|||||||
Some(MatrixMeta { n: field(s, "n")?, n_cols: field(s, "n_cols")? })
|
Some(MatrixMeta { n: field(s, "n")?, n_cols: field(s, "n_cols")? })
|
||||||
}
|
}
|
||||||
|
|
||||||
fn field(s: &str, name: &str) -> Option<usize> {
|
pub(crate) fn field(s: &str, name: &str) -> Option<usize> {
|
||||||
let key = format!("\"{}\":", name);
|
let key = format!("\"{}\":", name);
|
||||||
let pos = s.find(&key)? + key.len();
|
let pos = s.find(&key)? + key.len();
|
||||||
let rest = s[pos..].trim_start();
|
let rest = s[pos..].trim_start();
|
||||||
|
|||||||
+70
-211
@@ -4,7 +4,8 @@ use std::path::{Path, PathBuf};
|
|||||||
|
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
|
|
||||||
use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE};
|
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry};
|
||||||
|
use crate::views::IntSliceView;
|
||||||
|
|
||||||
pub struct PersistentCompactIntVec {
|
pub struct PersistentCompactIntVec {
|
||||||
mmap: Mmap,
|
mmap: Mmap,
|
||||||
@@ -18,100 +19,60 @@ pub struct PersistentCompactIntVec {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl PersistentCompactIntVec {
|
impl PersistentCompactIntVec {
|
||||||
/// Opens a persistent compact int vector from the given path.
|
|
||||||
pub fn open(path: &Path) -> io::Result<Self> {
|
pub fn open(path: &Path) -> io::Result<Self> {
|
||||||
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
||||||
|
|
||||||
if mmap.len() < HEADER_SIZE {
|
if mmap.len() < HEADER_SIZE {
|
||||||
return Err(io::Error::new(
|
return Err(io::Error::new(io::ErrorKind::InvalidData, "PCIV file too short"));
|
||||||
io::ErrorKind::InvalidData,
|
|
||||||
"PCIV file too short",
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
if &mmap[0..4] != &MAGIC {
|
if &mmap[0..4] != &MAGIC {
|
||||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic"));
|
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic"));
|
||||||
}
|
}
|
||||||
|
|
||||||
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
||||||
let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
|
let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
|
||||||
let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize;
|
let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize;
|
||||||
let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize;
|
let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize;
|
||||||
|
|
||||||
let primary_offset = HEADER_SIZE;
|
let primary_offset = HEADER_SIZE;
|
||||||
let data_offset = primary_offset + n;
|
let data_offset = primary_offset + n;
|
||||||
let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE;
|
let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE;
|
||||||
|
|
||||||
let mut index = Vec::with_capacity(n_index);
|
let mut index = Vec::with_capacity(n_index);
|
||||||
for i in 0..n_index {
|
for i in 0..n_index {
|
||||||
let off = index_offset + i * INDEX_ENTRY_SIZE;
|
index.push(parse_index_entry(&mmap, index_offset, i));
|
||||||
let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize;
|
|
||||||
let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize;
|
|
||||||
index.push((slot, pos));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self { mmap, n, n_overflow, step, index, primary_offset, data_offset, path: path.to_path_buf() })
|
||||||
mmap,
|
|
||||||
n,
|
|
||||||
n_overflow,
|
|
||||||
step,
|
|
||||||
index,
|
|
||||||
primary_offset,
|
|
||||||
data_offset,
|
|
||||||
path: path.to_path_buf(),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the path of the compact int vector file.
|
pub fn path(&self) -> &Path { &self.path }
|
||||||
pub fn path(&self) -> &Path {
|
pub fn len(&self) -> usize { self.n }
|
||||||
&self.path
|
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the length of the compact int vector.
|
|
||||||
pub fn len(&self) -> usize {
|
|
||||||
self.n
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns whether the compact int vector is empty.
|
|
||||||
pub fn is_empty(&self) -> bool {
|
|
||||||
self.n == 0
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the value at the given slot.
|
|
||||||
pub fn get(&self, slot: usize) -> u32 {
|
pub fn get(&self, slot: usize) -> u32 {
|
||||||
match self.mmap[self.primary_offset + slot] {
|
match self.mmap[self.primary_offset + slot] {
|
||||||
255 => self.overflow_get(slot),
|
255 => self.overflow_get(slot),
|
||||||
v => v as u32,
|
v => v as u32,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the value at the given slot from the overflow region.
|
|
||||||
fn overflow_get(&self, slot: usize) -> u32 {
|
fn overflow_get(&self, slot: usize) -> u32 {
|
||||||
let pos_start;
|
let (pos_start, pos_end) = if self.step == 0 {
|
||||||
let pos_end;
|
(0, self.n_overflow)
|
||||||
|
|
||||||
if self.step == 0 {
|
|
||||||
pos_start = 0;
|
|
||||||
pos_end = self.n_overflow;
|
|
||||||
} else {
|
} else {
|
||||||
let i = self
|
let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
||||||
.index
|
let start = self.index[i].1;
|
||||||
.partition_point(|&(s, _)| s <= slot)
|
let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow };
|
||||||
.saturating_sub(1);
|
(start, end)
|
||||||
pos_start = self.index[i].1;
|
};
|
||||||
pos_end = if i + 1 < self.index.len() {
|
|
||||||
self.index[i + 1].1
|
|
||||||
} else {
|
|
||||||
self.n_overflow
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut lo = pos_start;
|
let mut lo = pos_start;
|
||||||
let mut hi = pos_end;
|
let mut hi = pos_end;
|
||||||
while lo < hi {
|
while lo < hi {
|
||||||
let mid = lo + (hi - lo) / 2;
|
let mid = lo + (hi - lo) / 2;
|
||||||
match self.data_slot(mid).cmp(&slot) {
|
match self.data_slot(mid).cmp(&slot) {
|
||||||
std::cmp::Ordering::Equal => return self.data_value(mid),
|
std::cmp::Ordering::Equal => return self.data_value(mid),
|
||||||
std::cmp::Ordering::Less => lo = mid + 1,
|
std::cmp::Ordering::Less => lo = mid + 1,
|
||||||
std::cmp::Ordering::Greater => hi = mid,
|
std::cmp::Ordering::Greater => hi = mid,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -119,144 +80,91 @@ impl PersistentCompactIntVec {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
/// Returns the slot at the given index in the overflow region.
|
|
||||||
fn data_slot(&self, i: usize) -> usize {
|
fn data_slot(&self, i: usize) -> usize {
|
||||||
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE;
|
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE;
|
||||||
u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize
|
u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
/// Returns the value at the given index in the overflow region.
|
|
||||||
fn data_value(&self, i: usize) -> u32 {
|
fn data_value(&self, i: usize) -> u32 {
|
||||||
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8;
|
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8;
|
||||||
u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
|
u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
|
||||||
pub fn sum(&self) -> u64 {
|
pub fn sum(&self) -> u64 {
|
||||||
self.iter().map(|v| v as u64).sum()
|
let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n];
|
||||||
|
byte_sum(primary, (0..self.n_overflow).map(|i| self.data_value(i)))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
|
||||||
pub fn count_nonzero(&self) -> u64 {
|
pub fn count_nonzero(&self) -> u64 {
|
||||||
self.iter().filter(|&v| v > 0).count() as u64
|
let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n];
|
||||||
|
byte_count_nonzero(primary)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
/// Lightweight zero-copy view — primary and overflow point into the mmap.
|
||||||
/// Returns the Bray-Curtis distance between two compact int vectors.
|
pub fn view(&self) -> IntSliceView<'_> {
|
||||||
|
let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n];
|
||||||
|
let overflow_raw = &self.mmap[self.data_offset..self.data_offset + self.n_overflow * OVERFLOW_ENTRY_SIZE];
|
||||||
|
IntSliceView::new(primary, overflow_raw, self.n_overflow, self.n)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn iter(&self) -> Iter<'_> {
|
||||||
|
Iter { pciv: self, slot: 0, overflow_pos: 0 }
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Distance methods ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
/// Returns the Bray-Curtis distance between `self` and `other`.
///
/// Computed as `1 - 2 * Σ min(a, b) / (sum(self) + sum(other))`; the result
/// is `0.0` when both vectors sum to zero.
///
/// # Panics
///
/// Panics with "length mismatch" if the vectors differ in length (checked by
/// [`partial_bray_dist`](Self::partial_bray_dist)).
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
    let shared = self.partial_bray_dist(other);
    let total = self.sum() + other.sum();
    match total {
        0 => 0.0,
        _ => 1.0 - 2.0 * shared as f64 / total as f64,
    }
}
|
||||||
|
|
||||||
/// Returns `Σ_slot min(self[slot], other[slot])` — the additive numerator of Bray-Curtis.
|
|
||||||
/// The denominator `sum_a + sum_b` is obtained from `self.sum() + other.sum()`.
|
|
||||||
pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> u64 {
|
pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> u64 {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
self.iter()
|
self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum()
|
||||||
.zip(other.iter())
|
|
||||||
.map(|(a, b)| a.min(b) as u64)
|
|
||||||
.sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the relative frequency Bray-Curtis distance between two compact int vectors.
|
|
||||||
///
|
|
||||||
/// This is a variant of [`bray_dist`] that uses relative frequencies instead of raw counts.
|
|
||||||
pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
let sum_a = self.sum() as f64;
|
let sa = self.sum() as f64;
|
||||||
let sum_b = other.sum() as f64;
|
let sb = other.sum() as f64;
|
||||||
if sum_a == 0.0 && sum_b == 0.0 {
|
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||||
return 0.0;
|
1.0 - self.partial_relfreq_bray_dist(other, sa, sb)
|
||||||
}
|
|
||||||
let sum_min = self.partial_relfreq_bray_dist(other, sum_a, sum_b);
|
|
||||||
1.0 - sum_min
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the partial relative frequency Bray-Curtis distance between two compact int vectors.
|
pub fn partial_relfreq_bray_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
|
||||||
///
|
|
||||||
/// This is used internally by [`relfreq_bray_dist`] and to easily compute the relative frequency
|
|
||||||
/// Bray-Curtis distance over a set of vector pairs.
|
|
||||||
///
|
|
||||||
/// Arguments:
|
|
||||||
/// - `other`: the other compact int vector to compare with
|
|
||||||
/// - `sum_a`: the sum of the first vector's counts
|
|
||||||
/// - `sum_b`: the sum of the second vector's counts
|
|
||||||
///
|
|
||||||
/// Returns the sum of the minimum relative frequencies at each index.
|
|
||||||
pub fn partial_relfreq_bray_dist(
|
|
||||||
&self,
|
|
||||||
other: &PersistentCompactIntVec,
|
|
||||||
sum_a: f64,
|
|
||||||
sum_b: f64,
|
|
||||||
) -> f64 {
|
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
let sum_min: f64 = self
|
self.iter().zip(other.iter())
|
||||||
.iter()
|
|
||||||
.zip(other.iter())
|
|
||||||
.map(|(a, b)| {
|
.map(|(a, b)| {
|
||||||
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
||||||
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
||||||
pa.min(pb)
|
pa.min(pb)
|
||||||
})
|
})
|
||||||
.sum();
|
.sum()
|
||||||
sum_min
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the euclidean distance between two compact int vectors.
|
|
||||||
pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
self.partial_euclidean_dist(other).sqrt()
|
self.partial_euclidean_dist(other).sqrt()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the partial euclidean distance between two compact int vectors.
|
|
||||||
///
|
|
||||||
/// This is used internally by [`euclidean_dist`] and to easily compute the euclidean distance
|
|
||||||
/// over a set of vector pairs.
|
|
||||||
///
|
|
||||||
/// The result is the sum of the squared differences between corresponding elements of the two
|
|
||||||
/// vectors.
|
|
||||||
pub fn partial_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn partial_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
self.iter()
|
self.iter().zip(other.iter())
|
||||||
.zip(other.iter())
|
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
||||||
.map(|(a, b)| {
|
|
||||||
let d = a as f64 - b as f64;
|
|
||||||
d * d
|
|
||||||
})
|
|
||||||
.sum()
|
.sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the relative frequency euclidean distance between two compact int vectors.
|
|
||||||
///
|
|
||||||
/// This is a variant of [`euclidean_dist`] that uses relative frequencies instead of raw counts.
|
|
||||||
pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
let sa = self.sum() as f64;
|
||||||
let sum_a = self.sum() as f64;
|
let sb = other.sum() as f64;
|
||||||
let sum_b = other.sum() as f64;
|
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||||
if sum_a == 0.0 && sum_b == 0.0 {
|
self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt()
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
self.partial_relfreq_euclidean_dist(other, sum_a, sum_b)
|
|
||||||
.sqrt()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the partial relative frequency euclidean distance between two compact int vectors.
|
pub fn partial_relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
|
||||||
///
|
|
||||||
/// This is used internally by [`relfreq_euclidean_dist`] and to easily compute the relative frequency
|
|
||||||
/// euclidean distance over a set of vector pairs.
|
|
||||||
pub fn partial_relfreq_euclidean_dist(
|
|
||||||
&self,
|
|
||||||
other: &PersistentCompactIntVec,
|
|
||||||
sum_a: f64,
|
|
||||||
sum_b: f64,
|
|
||||||
) -> f64 {
|
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
self.iter()
|
self.iter().zip(other.iter())
|
||||||
.zip(other.iter())
|
|
||||||
.map(|(a, b)| {
|
.map(|(a, b)| {
|
||||||
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
||||||
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
||||||
@@ -266,46 +174,19 @@ impl PersistentCompactIntVec {
|
|||||||
.sum()
|
.sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the Euclidean distance between two compact int vectors using the Hellinger transform.
|
|
||||||
///
|
|
||||||
/// The Hellinger transform is applied to the raw counts of each vector, and the result is
|
|
||||||
/// the Euclidean distance between the transformed vectors. The Hellinger transform is defined
|
|
||||||
/// as the square root of the relative frequencies.
|
|
||||||
pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
let sa = self.sum() as f64;
|
||||||
let sum_a = self.sum() as f64;
|
let sb = other.sum() as f64;
|
||||||
let sum_b = other.sum() as f64;
|
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||||
if sum_a == 0.0 && sum_b == 0.0 {
|
self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt()
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
self.partial_hellinger_euclidean_dist(other, sum_a, sum_b)
|
|
||||||
.sqrt()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the partial Hellinger Euclidean distance between two compact int vectors.
|
pub fn partial_hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
|
||||||
///
|
|
||||||
/// This is used internally by [`hellinger_euclidean_dist`] and to easily compute the Hellinger
|
|
||||||
/// Euclidean distance over a set of vector pairs.
|
|
||||||
pub fn partial_hellinger_euclidean_dist(
|
|
||||||
&self,
|
|
||||||
other: &PersistentCompactIntVec,
|
|
||||||
sum_a: f64,
|
|
||||||
sum_b: f64,
|
|
||||||
) -> f64 {
|
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
self.iter()
|
self.iter().zip(other.iter())
|
||||||
.zip(other.iter())
|
|
||||||
.map(|(a, b)| {
|
.map(|(a, b)| {
|
||||||
let pa = if sum_a > 0.0 {
|
let pa = if sum_a > 0.0 { (a as f64 / sum_a).sqrt() } else { 0.0 };
|
||||||
(a as f64 / sum_a).sqrt()
|
let pb = if sum_b > 0.0 { (b as f64 / sum_b).sqrt() } else { 0.0 };
|
||||||
} else {
|
|
||||||
0.0
|
|
||||||
};
|
|
||||||
let pb = if sum_b > 0.0 {
|
|
||||||
(b as f64 / sum_b).sqrt()
|
|
||||||
} else {
|
|
||||||
0.0
|
|
||||||
};
|
|
||||||
let d = pa - pb;
|
let d = pa - pb;
|
||||||
d * d
|
d * d
|
||||||
})
|
})
|
||||||
@@ -317,22 +198,13 @@ impl PersistentCompactIntVec {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the Jaccard distance between `self` and `other`, where values are
/// compared against `threshold` (`value >= threshold`).
///
/// Computed as `1 - intersection / union` from the pair returned by
/// [`partial_threshold_jaccard_dist`](Self::partial_threshold_jaccard_dist);
/// the result is `0.0` when the union is zero.
///
/// # Panics
///
/// Panics with "length mismatch" if the vectors differ in length.
pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 {
    match self.partial_threshold_jaccard_dist(other, threshold) {
        (_, 0) => 0.0,
        (intersection, union) => 1.0 - intersection as f64 / union as f64,
    }
}
|
||||||
|
|
||||||
pub fn partial_threshold_jaccard_dist(
|
pub fn partial_threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> (u64, u64) {
|
||||||
&self,
|
|
||||||
other: &PersistentCompactIntVec,
|
|
||||||
threshold: u32,
|
|
||||||
) -> (u64, u64) {
|
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
self.iter()
|
self.iter().zip(other.iter())
|
||||||
.zip(other.iter())
|
|
||||||
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||||||
let ap = a >= threshold;
|
let ap = a >= threshold;
|
||||||
let bp = b >= threshold;
|
let bp = b >= threshold;
|
||||||
@@ -343,23 +215,12 @@ impl PersistentCompactIntVec {
|
|||||||
/// Returns the Jaccard distance between `self` and `other`, i.e.
/// [`threshold_jaccard_dist`](Self::threshold_jaccard_dist) with a threshold
/// of 1.
pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 {
    const PRESENCE_THRESHOLD: u32 = 1;
    self.threshold_jaccard_dist(other, PRESENCE_THRESHOLD)
}
|
||||||
|
|
||||||
pub fn iter(&self) -> Iter<'_> {
|
|
||||||
Iter {
|
|
||||||
pciv: self,
|
|
||||||
slot: 0,
|
|
||||||
overflow_pos: 0,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> IntoIterator for &'a PersistentCompactIntVec {
|
impl<'a> IntoIterator for &'a PersistentCompactIntVec {
|
||||||
type Item = u32;
|
type Item = u32;
|
||||||
type IntoIter = Iter<'a>;
|
type IntoIter = Iter<'a>;
|
||||||
|
fn into_iter(self) -> Iter<'a> { self.iter() }
|
||||||
fn into_iter(self) -> Iter<'a> {
|
|
||||||
self.iter()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Iter<'a> {
|
pub struct Iter<'a> {
|
||||||
@@ -374,9 +235,7 @@ impl Iterator for Iter<'_> {
|
|||||||
type Item = u32;
|
type Item = u32;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<u32> {
|
fn next(&mut self) -> Option<u32> {
|
||||||
if self.slot >= self.pciv.n {
|
if self.slot >= self.pciv.n { return None; }
|
||||||
return None;
|
|
||||||
}
|
|
||||||
let v = self.pciv.mmap[self.pciv.primary_offset + self.slot];
|
let v = self.pciv.mmap[self.pciv.primary_offset + self.slot];
|
||||||
self.slot += 1;
|
self.slot += 1;
|
||||||
if v < 255 {
|
if v < 255 {
|
||||||
|
|||||||
@@ -0,0 +1,111 @@
|
|||||||
|
use std::io;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use tempfile::TempDir;
|
||||||
|
|
||||||
|
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
||||||
|
use crate::views::{BitSliceIter, BitSliceView, IntSliceView};
|
||||||
|
|
||||||
|
// ── TempBitVec — frozen read-only, auto-deleted on drop ──────────────────────
|
||||||
|
|
||||||
|
pub struct TempBitVec {
|
||||||
|
vec: PersistentBitVec,
|
||||||
|
// Dropped after `vec` (field order), so the mmap is released before the
|
||||||
|
// temp directory is deleted.
|
||||||
|
_temp: TempDir,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TempBitVec {
|
||||||
|
pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentBitVec> {
|
||||||
|
std::fs::copy(self.vec.path(), path)?;
|
||||||
|
PersistentBitVec::open(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.vec.len()
|
||||||
|
}
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.vec.is_empty()
|
||||||
|
}
|
||||||
|
pub fn get(&self, slot: usize) -> bool {
|
||||||
|
self.vec.get(slot)
|
||||||
|
}
|
||||||
|
pub fn count_ones(&self) -> u64 {
|
||||||
|
self.vec.count_ones()
|
||||||
|
}
|
||||||
|
pub fn view(&self) -> BitSliceView<'_> {
|
||||||
|
self.vec.view()
|
||||||
|
}
|
||||||
|
pub fn iter(&self) -> BitSliceIter<'_> {
|
||||||
|
self.view().iter()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
|
||||||
|
|
||||||
|
pub struct TempBitVecBuilder {
|
||||||
|
builder: PersistentBitVecBuilder,
|
||||||
|
temp: TempDir,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TempBitVecBuilder {
|
||||||
|
pub fn new(n: usize) -> io::Result<Self> {
|
||||||
|
let temp = TempDir::new()?;
|
||||||
|
let path = temp.path().join("data.pbiv");
|
||||||
|
let builder = PersistentBitVecBuilder::new(n, &path)?;
|
||||||
|
Ok(Self { builder, temp })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn new_ones(n: usize) -> io::Result<Self> {
|
||||||
|
let temp = TempDir::new()?;
|
||||||
|
let path = temp.path().join("data.pbiv");
|
||||||
|
let builder = PersistentBitVecBuilder::new_ones(n, &path)?;
|
||||||
|
Ok(Self { builder, temp })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn freeze(self) -> io::Result<TempBitVec> {
|
||||||
|
let Self { builder, temp } = self;
|
||||||
|
let vec = builder.finish()?;
|
||||||
|
Ok(TempBitVec { vec, _temp: temp })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set(&mut self, slot: usize, value: bool) {
|
||||||
|
self.builder.set(slot, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn view(&self) -> BitSliceView<'_> {
|
||||||
|
self.builder.view()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn or(&mut self, other: BitSliceView<'_>) {
|
||||||
|
self.builder.or(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn and(&mut self, other: BitSliceView<'_>) {
|
||||||
|
self.builder.and(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn xor(&mut self, other: BitSliceView<'_>) {
|
||||||
|
self.builder.xor(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn not(&mut self) {
|
||||||
|
self.builder.not();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn copy_from(&mut self, src: BitSliceView<'_>) {
|
||||||
|
self.builder.copy_from(src);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
self.builder.or_where(col, pred);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
self.builder.and_where(col, pred);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
self.builder.xor_where(col, pred);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,89 @@
|
|||||||
|
use std::io;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use tempfile::TempDir;
|
||||||
|
|
||||||
|
use crate::builder::PersistentCompactIntVecBuilder;
|
||||||
|
use crate::reader::PersistentCompactIntVec;
|
||||||
|
use crate::views::{BitSliceView, IntSliceView};
|
||||||
|
|
||||||
|
// ── TempCompactIntVec — frozen read-only, auto-deleted on drop ────────────────
|
||||||
|
|
||||||
|
pub struct TempCompactIntVec {
|
||||||
|
vec: PersistentCompactIntVec,
|
||||||
|
// Dropped after `vec` (field order), so the mmap is released before the
|
||||||
|
// temp directory is deleted.
|
||||||
|
_temp: TempDir,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TempCompactIntVec {
|
||||||
|
pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentCompactIntVec> {
|
||||||
|
std::fs::copy(self.vec.path(), path)?;
|
||||||
|
PersistentCompactIntVec::open(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize { self.vec.len() }
|
||||||
|
pub fn is_empty(&self) -> bool { self.vec.is_empty() }
|
||||||
|
pub fn get(&self, slot: usize) -> u32 { self.vec.get(slot) }
|
||||||
|
pub fn sum(&self) -> u64 { self.vec.sum() }
|
||||||
|
pub fn view(&self) -> IntSliceView<'_> { self.vec.view() }
|
||||||
|
pub fn iter(&self) -> crate::reader::Iter<'_> { self.vec.iter() }
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
|
||||||
|
|
||||||
|
pub struct TempCompactIntVecBuilder {
|
||||||
|
builder: PersistentCompactIntVecBuilder,
|
||||||
|
temp: TempDir,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TempCompactIntVecBuilder {
|
||||||
|
pub fn new(n: usize) -> io::Result<Self> {
|
||||||
|
let temp = TempDir::new()?;
|
||||||
|
let path = temp.path().join("data.pciv");
|
||||||
|
let builder = PersistentCompactIntVecBuilder::new(n, &path)?;
|
||||||
|
Ok(Self { builder, temp })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn freeze(self) -> io::Result<TempCompactIntVec> {
|
||||||
|
let Self { builder, temp } = self;
|
||||||
|
let vec = builder.finish()?;
|
||||||
|
Ok(TempCompactIntVec { vec, _temp: temp })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn n(&self) -> usize { self.builder.len() }
|
||||||
|
|
||||||
|
pub fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
|
||||||
|
pub fn get(&self, slot: usize) -> u32 { self.builder.get(slot) }
|
||||||
|
|
||||||
|
pub fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() }
|
||||||
|
pub fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
|
||||||
|
|
||||||
|
pub fn inc_present(&mut self, col: BitSliceView<'_>) {
|
||||||
|
self.builder.inc_present(col);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
|
||||||
|
self.builder.inc_present_fast(col);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
self.builder.inc_predicate(col, pred);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
self.builder.inc_predicate_fast(col, pred);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add(&mut self, other: IntSliceView<'_>) {
|
||||||
|
self.builder.add(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
|
||||||
|
self.builder.mask_with(mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); }
|
||||||
|
pub fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); }
|
||||||
|
pub fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
|
||||||
|
}
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder};
|
use crate::{pack_bit_matrix, PersistentBitMatrix, PersistentBitMatrixBuilder};
|
||||||
use crate::traits::BitPartials;
|
use crate::traits::BitPartials;
|
||||||
|
|
||||||
fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
|
fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
|
||||||
@@ -203,3 +203,57 @@ fn partial_hamming_matches_hamming() {
|
|||||||
let full = m.hamming_dist_matrix();
|
let full = m.hamming_dist_matrix();
|
||||||
assert_eq!(partial, full);
|
assert_eq!(partial, full);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── col_view on Packed ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Packs a two-column matrix and checks that `col_view` on the reopened
/// matrix reports the original bits, lengths and popcounts.
#[test]
fn col_view_packed_values() {
    let (dir, _) = make_matrix(&[
        &[true, false, true, true],
        &[false, true, false, true],
    ]);
    pack_bit_matrix(&dir.path().join("presence")).unwrap();
    let m = PersistentBitMatrix::open(dir.path()).unwrap();

    // col 0: [T, F, T, T]
    let v0 = m.col_view(0);
    assert_eq!(v0.len(), 4);
    // Comparing whole columns reports every mismatching bit at once, instead
    // of comparing each bool against a literal.
    let bits0: Vec<bool> = (0..4).map(|s| v0.get(s)).collect();
    assert_eq!(bits0, [true, false, true, true]);
    assert_eq!(v0.count_ones(), 3);

    // col 1: [F, T, F, T]
    let v1 = m.col_view(1);
    assert_eq!(v1.len(), 4);
    let bits1: Vec<bool> = (0..4).map(|s| v1.get(s)).collect();
    assert_eq!(bits1, [false, true, false, true]);
    assert_eq!(v1.count_ones(), 2);
}
|
||||||
|
|
||||||
|
/// Builds the same matrix twice, packs one copy, and checks that every
/// column view of the packed copy agrees with the unpacked columns in
/// length, individual bits, popcount and words.
#[test]
fn col_view_packed_matches_columnar() {
    let data: &[&[bool]] = &[
        &[true, false, true, false, true, true, false, true],
        &[false, false, true, true, false, true, true, false],
        &[true, true, true, false, false, false, true, true],
    ];
    let (dir_col, m_col) = make_matrix(data);
    let (dir_pack, _) = make_matrix(data);
    let packed_dir = dir_pack.path().join("presence");
    pack_bit_matrix(&packed_dir).unwrap();
    let m_pack = PersistentBitMatrix::open(dir_pack.path()).unwrap();

    for (c, _) in data.iter().enumerate() {
        let expected = m_col.col(c);
        let actual = m_pack.col_view(c);
        assert_eq!(actual.len(), expected.len(), "col={c} len");
        (0..expected.len()).for_each(|s| {
            assert_eq!(actual.get(s), expected.get(s), "col={c} slot={s}");
        });
        assert_eq!(actual.count_ones(), expected.count_ones(), "col={c} count_ones");
        assert_eq!(actual.words(), expected.words(), "col={c} words");
    }
    // Dropped explicitly at the end so the directory of the unpacked matrix
    // stays alive for the whole comparison.
    drop(dir_col);
}
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ fn op_and() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pbiv");
|
let path = dir.path().join("out.pbiv");
|
||||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.and(&rb);
|
b.and(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentBitVec::open(&path).unwrap();
|
let r = PersistentBitVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, false, false, false]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, false, false, false]);
|
||||||
@@ -90,7 +90,7 @@ fn op_or() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pbiv");
|
let path = dir.path().join("out.pbiv");
|
||||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.or(&rb);
|
b.or(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentBitVec::open(&path).unwrap();
|
let r = PersistentBitVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, true, true, false]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, true, true, false]);
|
||||||
@@ -103,7 +103,7 @@ fn op_xor() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pbiv");
|
let path = dir.path().join("out.pbiv");
|
||||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.xor(&rb);
|
b.xor(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentBitVec::open(&path).unwrap();
|
let r = PersistentBitVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![false, true, true, false]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![false, true, true, false]);
|
||||||
|
|||||||
@@ -0,0 +1,223 @@
|
|||||||
|
use tempfile::tempdir;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
ColGroup, MatrixGroupOps,
|
||||||
|
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||||
|
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
||||||
|
};
|
||||||
|
use crate::{PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
||||||
|
|
||||||
|
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
||||||
|
let n = cols.first().map_or(0, |c| c.len());
|
||||||
|
let dir = tempdir().unwrap();
|
||||||
|
let mut b = PersistentCompactIntMatrixBuilder::new(n, &dir.path().join("counts")).unwrap();
|
||||||
|
for &col in cols {
|
||||||
|
let mut cb = b.add_col().unwrap();
|
||||||
|
for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
|
||||||
|
cb.close().unwrap();
|
||||||
|
}
|
||||||
|
b.close().unwrap();
|
||||||
|
let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
|
||||||
|
(dir, m)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
|
||||||
|
let n = cols.first().map_or(0, |c| c.len());
|
||||||
|
let dir = tempdir().unwrap();
|
||||||
|
let presence = dir.path().join("presence");
|
||||||
|
let mut b = PersistentBitMatrixBuilder::new(n, &presence).unwrap();
|
||||||
|
for &col in cols {
|
||||||
|
let mut cb = b.add_col().unwrap();
|
||||||
|
for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
|
||||||
|
cb.close().unwrap();
|
||||||
|
}
|
||||||
|
b.close().unwrap();
|
||||||
|
let m = PersistentBitMatrix::open(dir.path()).unwrap();
|
||||||
|
(dir, m)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── IntMatrix: partial_group_sum ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Group {0, 2} over col0=[1,2,3], col1=[10,20,30], col2=[100,0,5]:
/// the slot-wise sum is expected to be [101, 2, 8] (col1 is not in the group).
#[test]
fn int_partial_group_sum_basic() {
    let columns: &[&[u32]] = &[&[1, 2, 3], &[10, 20, 30], &[100, 0, 5]];
    let (_d, m) = make_int_matrix(columns);
    let g = ColGroup::new("g", vec![0, 2]);

    let result = m.partial_group_sum(&g).unwrap();

    for (slot, &want) in [101, 2, 8].iter().enumerate() {
        assert_eq!(result.get(slot), want);
    }
}
|
||||||
|
|
||||||
|
/// Same as the basic sum test but with values above one byte:
/// col0=[300,0], col1=[200,400], group {0,1} → [500, 400], total 900.
#[test]
fn int_partial_group_sum_with_overflow() {
    let columns: &[&[u32]] = &[&[300, 0], &[200, 400]];
    let (_d, m) = make_int_matrix(columns);
    let g = ColGroup::new("g", vec![0, 1]);

    let result = m.partial_group_sum(&g).unwrap();

    for (slot, &want) in [500, 400].iter().enumerate() {
        assert_eq!(result.get(slot), want);
    }
    assert_eq!(result.sum(), 900);
}
|
||||||
|
|
||||||
|
// ── IntMatrix: partial_group_presence_count ───────────────────────────────────
|
||||||
|
|
||||||
|
/// Per-slot count of group columns whose value reaches the threshold.
///
/// col0=[5,1,0,3], col1=[2,0,4,3], col2=[0,3,1,0], threshold=2:
///   col0 → [T,F,F,T], col1 → [T,F,T,T], col2 → [F,T,F,F]
/// so group {0,1,2} is expected to give [2, 1, 1, 2].
#[test]
fn int_partial_group_presence_count() {
    let columns: &[&[u32]] = &[&[5, 1, 0, 3], &[2, 0, 4, 3], &[0, 3, 1, 0]];
    let (_d, m) = make_int_matrix(columns);
    let g = ColGroup::new("g", vec![0, 1, 2]);

    let result = m.partial_group_presence_count(&g, 2).unwrap();

    for (slot, &want) in [2, 1, 1, 2].iter().enumerate() {
        assert_eq!(result.get(slot), want);
    }
}
|
||||||
|
|
||||||
|
/// Presence count when some values are above one byte.
///
/// col0=[300,0,10], col1=[0,400,10], col2=[1,1,10], threshold=5:
///   col0 → [T,F,T], col1 → [F,T,T], col2 → [F,F,T]
/// so group {0,1,2} is expected to give [1, 1, 3].
#[test]
fn int_partial_group_presence_count_with_overflow() {
    let columns: &[&[u32]] = &[&[300, 0, 10], &[0, 400, 10], &[1, 1, 10]];
    let (_d, m) = make_int_matrix(columns);
    let g = ColGroup::new("g", vec![0, 1, 2]);

    let result = m.partial_group_presence_count(&g, 5).unwrap();

    for (slot, &want) in [1, 1, 3].iter().enumerate() {
        assert_eq!(result.get(slot), want);
    }
}
|
||||||
|
|
||||||
|
// ── IntMatrix: partial_group_any ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Per-slot "any column reaches the threshold" over an int matrix.
///
/// col0=[0,3,0,1], col1=[2,0,0,0], col2=[0,0,5,0], threshold=2:
///   col0 → [F,T,F,F], col1 → [T,F,F,F], col2 → [F,F,T,F]
/// so group {0,1,2} is expected to give [T, T, T, F].
#[test]
fn int_partial_group_any() {
    let columns: &[&[u32]] = &[&[0, 3, 0, 1], &[2, 0, 0, 0], &[0, 0, 5, 0]];
    let (_d, m) = make_int_matrix(columns);
    let g = ColGroup::new("g", vec![0, 1, 2]);

    let result = m.partial_group_any(&g, 2).unwrap();

    for (slot, &want) in [true, true, true, false].iter().enumerate() {
        assert_eq!(result.get(slot), want);
    }
}
|
||||||
|
|
||||||
|
// ── IntMatrix: mask_with ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// `mask_with` keeps the slots whose mask bit is set and zeroes the others:
/// counts [10, 20, 30, 40] with mask [T, F, T, F] → [10, 0, 30, 0].
#[test]
fn mask_with_zeros_selected_slots() {
    let dir = tempdir().unwrap();
    let v_path = dir.path().join("v.pciv");
    let m_path = dir.path().join("m.pbiv");

    let mut v = PersistentCompactIntVecBuilder::new(4, &v_path).unwrap();
    for (slot, &count) in [10, 20, 30, 40].iter().enumerate() {
        v.set(slot, count);
    }

    // Mask bits 0 and 2 are set; 1 and 3 are never written.
    let mut mask = PersistentBitVecBuilder::new(4, &m_path).unwrap();
    for slot in [0, 2] {
        mask.set(slot, true);
    }

    v.mask_with(mask.view());
    v.close().unwrap();

    let r = PersistentCompactIntVec::open(&v_path).unwrap();
    for (slot, &want) in [10, 0, 30, 0].iter().enumerate() {
        assert_eq!(r.get(slot), want);
    }
}
|
||||||
|
|
||||||
|
/// A masked-out slot holding an overflow value (500) must read back as 0 and
/// must no longer appear among the overflow entries.
#[test]
fn mask_with_overflow_slot_zeroed() {
    let dir = tempdir().unwrap();
    let v_path = dir.path().join("v.pciv");
    let m_path = dir.path().join("m.pbiv");

    let mut v = PersistentCompactIntVecBuilder::new(3, &v_path).unwrap();
    for (slot, &count) in [10, 500, 5].iter().enumerate() {
        v.set(slot, count);
    }

    // Slot 1 (the overflow value) is the only one left out of the mask.
    let mut mask = PersistentBitVecBuilder::new(3, &m_path).unwrap();
    for slot in [0, 2] {
        mask.set(slot, true);
    }

    v.mask_with(mask.view());
    v.close().unwrap();

    let r = PersistentCompactIntVec::open(&v_path).unwrap();
    for (slot, &want) in [10, 0, 5].iter().enumerate() {
        assert_eq!(r.get(slot), want);
    }
    let no_overflow_left = r.view().overflow_entries().next().is_none();
    assert!(no_overflow_left, "overflow entry for masked-out slot should be gone");
}
|
||||||
|
|
||||||
|
/// Masking with an all-ones bit vector must leave every value untouched,
/// including the one above one byte (300) and the zero.
#[test]
fn mask_with_all_ones_is_noop() {
    let dir = tempdir().unwrap();
    let v_path = dir.path().join("v.pciv");
    let m_path = dir.path().join("m.pbiv");
    let values = [300, 1, 0, 42];

    let mut v = PersistentCompactIntVecBuilder::new(4, &v_path).unwrap();
    for (slot, &count) in values.iter().enumerate() {
        v.set(slot, count);
    }

    let mask = PersistentBitVecBuilder::new_ones(4, &m_path).unwrap();
    v.mask_with(mask.view());
    v.close().unwrap();

    let r = PersistentCompactIntVec::open(&v_path).unwrap();
    for (slot, &want) in values.iter().enumerate() {
        assert_eq!(r.get(slot), want);
    }
}
|
||||||
|
|
||||||
|
// ── BitMatrix: partial_group_presence_count ───────────────────────────────────
|
||||||
|
|
||||||
|
/// Per-slot count of set bits across the group's columns.
///
/// col0=[T,F,T,F], col1=[T,T,F,F], col2=[F,T,T,F], group {0,1,2}
/// → expected counts [2, 2, 2, 0].
#[test]
fn bit_partial_group_presence_count() {
    let columns: &[&[bool]] = &[
        &[true, false, true, false],
        &[true, true, false, false],
        &[false, true, true, false],
    ];
    let (_d, m) = make_bit_matrix(columns);
    let g = ColGroup::new("g", vec![0, 1, 2]);

    let result = m.partial_group_presence_count(&g, 1).unwrap();

    for (slot, &want) in [2, 2, 2, 0].iter().enumerate() {
        assert_eq!(result.get(slot), want);
    }
}
|
||||||
|
|
||||||
|
// ── BitMatrix: partial_group_any ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Per-slot "any bit set" across the group's columns.
///
/// col0=[T,F,F], col1=[F,F,T], group {0,1} → expected [T, F, T].
#[test]
fn bit_partial_group_any() {
    let columns: &[&[bool]] = &[
        &[true, false, false],
        &[false, false, true],
    ];
    let (_d, m) = make_bit_matrix(columns);
    let g = ColGroup::new("g", vec![0, 1]);

    let result = m.partial_group_any(&g, 1).unwrap();

    for (slot, &want) in [true, false, true].iter().enumerate() {
        assert_eq!(result.get(slot), want);
    }
}
|
||||||
|
|
||||||
|
// ── Composition: partial results are additive ─────────────────────────────────
|
||||||
|
|
||||||
|
/// Two partitions covering disjoint slot ranges of the same two columns.
///
/// Global data: col0=[5,1,0,3,2], col1=[2,0,4,3,1], threshold=2.
/// Partition A holds slots 0..2, partition B holds slots 2..5. The test only
/// checks each partition's own presence counts against the expected values.
#[test]
fn int_presence_count_additive_across_split() {
    let data_a: &[&[u32]] = &[&[5, 1], &[2, 0]];
    let data_b: &[&[u32]] = &[&[0, 3, 2], &[4, 3, 1]];
    let (_da, ma) = make_int_matrix(data_a);
    let (_db, mb) = make_int_matrix(data_b);
    let g = ColGroup::new("g", vec![0, 1]);

    let pa = ma.partial_group_presence_count(&g, 2).unwrap();
    let pb = mb.partial_group_presence_count(&g, 2).unwrap();

    // Partition A: col0=[5≥2, 1<2]=[T,F], col1=[2≥2, 0<2]=[T,F] → [2, 0]
    for (slot, &want) in [2, 0].iter().enumerate() {
        assert_eq!(pa.get(slot), want);
    }

    // Partition B: col0=[0<2, 3≥2, 2≥2]=[F,T,T], col1=[4≥2, 3≥2, 1<2]=[T,T,F] → [1, 2, 1]
    for (slot, &want) in [1, 2, 1].iter().enumerate() {
        assert_eq!(pb.get(slot), want);
    }
}
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
|
use crate::{pack_compact_int_matrix, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
|
||||||
use crate::traits::CountPartials;
|
use crate::traits::CountPartials;
|
||||||
|
|
||||||
fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
||||||
@@ -243,6 +243,61 @@ fn partial_hellinger_matches_full() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds a columnar matrix containing overflow values (≥ 255), packs it,
/// reopens it and checks `col_view()` values, sum, non-zero count and
/// overflow entries column by column.
#[test]
fn col_view_packed_values() {
    /// Orders overflow entries by slot so they can be compared to a literal.
    fn by_slot(mut entries: Vec<(usize, u32)>) -> Vec<(usize, u32)> {
        entries.sort_unstable_by_key(|&(slot, _)| slot);
        entries
    }

    let (dir, _col) = make_matrix(&[&[10, 300, 500], &[200, 50, 1000]]);
    let counts_dir = dir.path().join("counts");
    pack_compact_int_matrix(&counts_dir).unwrap();
    let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();

    // Column 0 = [10, 300, 500]: slots 1 and 2 are overflow slots.
    let v0 = m.col_view(0);
    for (slot, &want) in [10, 300, 500].iter().enumerate() {
        assert_eq!(v0.get(slot), want);
    }
    assert_eq!(v0.sum(), 810);
    assert_eq!(v0.count_nonzero(), 3);
    let ov0 = by_slot(v0.overflow_entries().collect());
    assert_eq!(ov0, vec![(1, 300), (2, 500)]);

    // Column 1 = [200, 50, 1000]: only slot 2 is an overflow slot.
    let v1 = m.col_view(1);
    for (slot, &want) in [200, 50, 1000].iter().enumerate() {
        assert_eq!(v1.get(slot), want);
    }
    let ov1 = by_slot(v1.overflow_entries().collect());
    assert_eq!(ov1, vec![(2, 1000)]);
}
|
||||||
|
|
||||||
|
/// Same data written twice: one copy stays columnar, the other is packed.
/// `col_view()` on the packed copy must match `col()` on the columnar copy
/// slot by slot, and agree on the sum and on the overflow entries.
#[test]
fn col_view_packed_matches_columnar() {
    /// Orders overflow entries by slot so both sides compare independently of
    /// iteration order.
    fn by_slot(mut entries: Vec<(usize, u32)>) -> Vec<(usize, u32)> {
        entries.sort_unstable_by_key(|&(slot, _)| slot);
        entries
    }

    let data: &[&[u32]] = &[&[0, 255, 1, 300, 128], &[500, 3, 0, 700, 42]];
    let (dir_col, m_col) = make_matrix(data);

    // Built in a separate directory so packing does not touch m_col's files.
    let (dir_pack, _) = make_matrix(data);
    let counts_dir = dir_pack.path().join("counts");
    pack_compact_int_matrix(&counts_dir).unwrap();
    let m_pack = PersistentCompactIntMatrix::open(dir_pack.path()).unwrap();

    for (c, _) in data.iter().enumerate() {
        let col_ref = m_col.col(c);
        let col_view = m_pack.col_view(c);

        assert_eq!(col_view.len(), col_ref.len());
        let mut s = 0;
        while s < col_ref.len() {
            assert_eq!(col_view.get(s), col_ref.get(s), "col={c} slot={s}");
            s += 1;
        }
        assert_eq!(col_view.sum(), col_ref.sum(), "col={c} sum");

        let ov_view = by_slot(col_view.overflow_entries().collect());
        let ov_ref = by_slot(col_ref.view().overflow_entries().collect());
        assert_eq!(ov_view, ov_ref, "col={c} overflow_entries");
    }

    // Explicit drop: the columnar temp dir is held until all comparisons ran.
    drop(dir_col);
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn partial_relfreq_bray_additive_across_split() {
|
fn partial_relfreq_bray_additive_across_split() {
|
||||||
// Split rows [1,2,3,4,5] between two matrices; partial sums should add up.
|
// Split rows [1,2,3,4,5] between two matrices; partial sums should add up.
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
mod bitmatrix;
|
mod bitmatrix;
|
||||||
mod bitvec;
|
mod bitvec;
|
||||||
|
mod colgroup;
|
||||||
mod intmatrix;
|
mod intmatrix;
|
||||||
|
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
@@ -169,7 +170,7 @@ fn combine_min() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pciv");
|
let path = dir.path().join("out.pciv");
|
||||||
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.min(&rb);
|
b.min(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 100, 0, 800]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 100, 0, 800]);
|
||||||
@@ -182,7 +183,7 @@ fn combine_max() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pciv");
|
let path = dir.path().join("out.pciv");
|
||||||
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.max(&rb);
|
b.max(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![20, 300, 500, 1000]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![20, 300, 500, 1000]);
|
||||||
@@ -195,7 +196,7 @@ fn combine_add() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pciv");
|
let path = dir.path().join("out.pciv");
|
||||||
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.add(&rb);
|
b.add(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![30, 300, 5, 101]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![30, 300, 5, 101]);
|
||||||
@@ -220,7 +221,7 @@ fn combine_diff() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pciv");
|
let path = dir.path().join("out.pciv");
|
||||||
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.diff(&rb);
|
b.diff(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 700, 0, 0]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 700, 0, 0]);
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use ndarray::{Array1, Array2};
|
use ndarray::{Array1, Array2};
|
||||||
|
|
||||||
/// Column-level weight statistic — total count or presence count per column.
|
/// Column-level weight statistic — total count or presence count per column.
|
||||||
/// Additive across layers and partitions; used as denominator in normalised distances.
|
/// Additive across layers and partitions; used as denominator in normalised distances.
|
||||||
///
|
///
|
||||||
/// `partial_kmer_counts` returns the number of **distinct k-mers** present per
|
/// `partial_kmer_counts` returns the number of **distinct k-mers** present per
|
||||||
|
|||||||
@@ -0,0 +1,278 @@
|
|||||||
|
use crate::format::{byte_count_nonzero, byte_sum, parse_overflow_entry};
|
||||||
|
|
||||||
|
// ── BitSliceView ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Lightweight, copy-able read-only view over a u64 word array.
///
/// Bit `i` is in `words[i >> 6]` at position `i & 63`. Padding bits are zero;
/// `count_ones`, `count_zeros` and the distance methods operate on whole words
/// and therefore rely on that.
#[derive(Clone, Copy, Debug)]
pub struct BitSliceView<'a> {
    pub(crate) words: &'a [u64],
    pub(crate) n: usize,
}

impl<'a> BitSliceView<'a> {
    /// Wraps `words` as a view of `n` bits.
    ///
    /// In debug builds this checks that `words` holds at least `ceil(n / 64)`
    /// words, so a too-short slice is reported at construction instead of as
    /// an index panic in a later `get` / `iter` call.
    #[inline]
    pub fn new(words: &'a [u64], n: usize) -> Self {
        debug_assert!(
            words.len() >= (n >> 6) + usize::from(n & 63 != 0),
            "BitSliceView: {} words cannot hold {n} bits",
            words.len()
        );
        Self { words, n }
    }

    /// Number of bits in the view.
    pub fn len(&self) -> usize { self.n }

    /// `true` when the view holds no bits.
    pub fn is_empty(&self) -> bool { self.n == 0 }

    /// The backing words, with the lifetime of the underlying data.
    pub fn words(&self) -> &'a [u64] { self.words }

    /// Returns bit `slot`.
    ///
    /// # Panics
    /// Panics if `slot >> 6` is outside the backing words. A `slot` at or past
    /// `len()` that still falls inside the last word reads a padding bit.
    #[inline]
    pub fn get(&self, slot: usize) -> bool {
        (self.words[slot >> 6] >> (slot & 63)) & 1 != 0
    }

    /// Number of set bits, summed over all backing words.
    pub fn count_ones(&self) -> u64 {
        self.words.iter().map(|w| u64::from(w.count_ones())).sum()
    }

    /// Number of clear bits among the `len()` logical bits.
    pub fn count_zeros(&self) -> u64 { self.n as u64 - self.count_ones() }

    /// Iterates over the `len()` bits in slot order.
    pub fn iter(&self) -> BitSliceIter<'a> {
        BitSliceIter { words: self.words, slot: 0, n: self.n }
    }

    /// Returns `(|self ∧ other|, |self ∨ other|)` as bit counts.
    ///
    /// # Panics
    /// Panics if the two views have different lengths.
    pub fn partial_jaccard_dist(self, other: BitSliceView<'_>) -> (u64, u64) {
        assert_eq!(self.n, other.n, "BitSliceView length mismatch");
        self.words.iter().zip(other.words)
            .fold((0u64, 0u64), |(inter, union), (&a, &b)| {
                (
                    inter + u64::from((a & b).count_ones()),
                    union + u64::from((a | b).count_ones()),
                )
            })
    }

    /// Jaccard distance `1 - |∧| / |∨|`; `0.0` when the union is empty.
    ///
    /// # Panics
    /// Panics if the two views have different lengths.
    pub fn jaccard_dist(self, other: BitSliceView<'_>) -> f64 {
        let (inter, union) = self.partial_jaccard_dist(other);
        if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 }
    }

    /// Number of positions at which the two views differ.
    ///
    /// # Panics
    /// Panics if the two views have different lengths.
    pub fn hamming_dist(self, other: BitSliceView<'_>) -> u64 {
        assert_eq!(self.n, other.n, "BitSliceView length mismatch");
        self.words.iter().zip(other.words)
            .map(|(&a, &b)| u64::from((a ^ b).count_ones()))
            .sum()
    }
}

// ── BitSliceIter ──────────────────────────────────────────────────────────────

/// Iterator over the bits of a [`BitSliceView`], in slot order.
#[derive(Clone, Debug)]
pub struct BitSliceIter<'a> {
    words: &'a [u64],
    slot: usize,
    n: usize,
}

impl Iterator for BitSliceIter<'_> {
    type Item = bool;

    fn next(&mut self) -> Option<bool> {
        if self.slot >= self.n { return None; }
        let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0;
        self.slot += 1;
        Some(v)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // `slot` never exceeds `n`: `next` stops advancing once they are equal.
        let rem = self.n - self.slot;
        (rem, Some(rem))
    }
}

impl ExactSizeIterator for BitSliceIter<'_> {}

// Once `slot` reaches `n`, `next` keeps returning `None`.
impl std::iter::FusedIterator for BitSliceIter<'_> {}
|
||||||
|
|
||||||
|
// ── IntSliceView ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Lightweight, copy-able read-only view over a compact-int primary array plus
|
||||||
|
/// its sorted raw overflow bytes. Zero-copy: all data lives in the caller's mmap.
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub struct IntSliceView<'a> {
|
||||||
|
pub(crate) primary: &'a [u8],
|
||||||
|
pub(crate) overflow_raw: &'a [u8], // n_overflow × OVERFLOW_ENTRY_SIZE bytes, sorted by slot
|
||||||
|
pub(crate) n_overflow: usize,
|
||||||
|
pub(crate) n: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> IntSliceView<'a> {
|
||||||
|
#[inline]
|
||||||
|
pub fn new(primary: &'a [u8], overflow_raw: &'a [u8], n_overflow: usize, n: usize) -> Self {
|
||||||
|
Self { primary, overflow_raw, n_overflow, n }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize { self.n }
|
||||||
|
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||||
|
pub fn primary_bytes(&self) -> &'a [u8] { self.primary }
|
||||||
|
pub fn n_overflow(&self) -> usize { self.n_overflow }
|
||||||
|
|
||||||
|
pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + 'a {
|
||||||
|
let raw = self.overflow_raw;
|
||||||
|
let n_ov = self.n_overflow;
|
||||||
|
(0..n_ov).map(move |i| parse_overflow_entry(raw, 0, i))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// O(log n_overflow) via binary search (overflow is always sorted by slot).
|
||||||
|
pub fn get(&self, slot: usize) -> u32 {
|
||||||
|
let b = self.primary[slot];
|
||||||
|
if b < 255 { return b as u32; }
|
||||||
|
let mut lo = 0usize;
|
||||||
|
let mut hi = self.n_overflow;
|
||||||
|
while lo < hi {
|
||||||
|
let mid = lo + (hi - lo) / 2;
|
||||||
|
let (s, v) = parse_overflow_entry(self.overflow_raw, 0, mid);
|
||||||
|
match s.cmp(&slot) {
|
||||||
|
std::cmp::Ordering::Equal => return v,
|
||||||
|
std::cmp::Ordering::Less => lo = mid + 1,
|
||||||
|
std::cmp::Ordering::Greater => hi = mid,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
panic!("slot {slot} marked overflow but not found")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sequential merge scan: yields all n values in slot order.
|
||||||
|
pub fn iter(&self) -> IntSliceViewIter<'a> {
|
||||||
|
IntSliceViewIter {
|
||||||
|
primary: self.primary,
|
||||||
|
overflow_raw: self.overflow_raw,
|
||||||
|
slot: 0,
|
||||||
|
overflow_pos: 0,
|
||||||
|
n: self.n,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn sum(&self) -> u64 {
|
||||||
|
byte_sum(self.primary, self.overflow_entries().map(|(_, v)| v))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn count_nonzero(&self) -> u64 {
|
||||||
|
byte_count_nonzero(self.primary)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Distance methods ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
pub fn partial_bray_dist(self, other: IntSliceView<'_>) -> u64 {
|
||||||
|
assert_eq!(self.n, other.n, "length mismatch");
|
||||||
|
self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn bray_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
let sum_min = self.partial_bray_dist(other);
|
||||||
|
let denom = self.sum() + other.sum();
|
||||||
|
if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn partial_relfreq_bray_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
|
||||||
|
assert_eq!(self.n, other.n, "length mismatch");
|
||||||
|
self.iter().zip(other.iter())
|
||||||
|
.map(|(a, b)| {
|
||||||
|
let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 };
|
||||||
|
let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 };
|
||||||
|
pa.min(pb)
|
||||||
|
})
|
||||||
|
.sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn relfreq_bray_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
let sa = self.sum() as f64;
|
||||||
|
let sb = other.sum() as f64;
|
||||||
|
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||||
|
1.0 - self.partial_relfreq_bray_dist(other, sa, sb)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn partial_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
assert_eq!(self.n, other.n, "length mismatch");
|
||||||
|
self.iter().zip(other.iter())
|
||||||
|
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
||||||
|
.sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
self.partial_euclidean_dist(other).sqrt()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn partial_relfreq_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
|
||||||
|
assert_eq!(self.n, other.n, "length mismatch");
|
||||||
|
self.iter().zip(other.iter())
|
||||||
|
.map(|(a, b)| {
|
||||||
|
let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 };
|
||||||
|
let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 };
|
||||||
|
let d = pa - pb;
|
||||||
|
d * d
|
||||||
|
})
|
||||||
|
.sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn relfreq_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
let sa = self.sum() as f64;
|
||||||
|
let sb = other.sum() as f64;
|
||||||
|
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||||
|
self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn partial_hellinger_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
|
||||||
|
assert_eq!(self.n, other.n, "length mismatch");
|
||||||
|
self.iter().zip(other.iter())
|
||||||
|
.map(|(a, b)| {
|
||||||
|
let pa = if sa > 0.0 { (a as f64 / sa).sqrt() } else { 0.0 };
|
||||||
|
let pb = if sb > 0.0 { (b as f64 / sb).sqrt() } else { 0.0 };
|
||||||
|
let d = pa - pb;
|
||||||
|
d * d
|
||||||
|
})
|
||||||
|
.sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn hellinger_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
let sa = self.sum() as f64;
|
||||||
|
let sb = other.sum() as f64;
|
||||||
|
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||||
|
self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn hellinger_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
self.hellinger_euclidean_dist(other) / std::f64::consts::SQRT_2
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn partial_threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> (u64, u64) {
|
||||||
|
assert_eq!(self.n, other.n, "length mismatch");
|
||||||
|
self.iter().zip(other.iter())
|
||||||
|
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||||||
|
let ap = a >= threshold;
|
||||||
|
let bp = b >= threshold;
|
||||||
|
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> f64 {
|
||||||
|
let (inter, union) = self.partial_threshold_jaccard_dist(other, threshold);
|
||||||
|
if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn jaccard_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
self.threshold_jaccard_dist(other, 1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── IntSliceViewIter ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
pub struct IntSliceViewIter<'a> {
|
||||||
|
primary: &'a [u8],
|
||||||
|
overflow_raw: &'a [u8],
|
||||||
|
slot: usize,
|
||||||
|
overflow_pos: usize,
|
||||||
|
n: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for IntSliceViewIter<'_> {
|
||||||
|
type Item = u32;
|
||||||
|
fn next(&mut self) -> Option<u32> {
|
||||||
|
if self.slot >= self.n { return None; }
|
||||||
|
let v = self.primary[self.slot];
|
||||||
|
self.slot += 1;
|
||||||
|
if v < 255 {
|
||||||
|
Some(v as u32)
|
||||||
|
} else {
|
||||||
|
let (_, val) = parse_overflow_entry(self.overflow_raw, 0, self.overflow_pos);
|
||||||
|
self.overflow_pos += 1;
|
||||||
|
Some(val)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||||
|
let rem = self.n - self.slot;
|
||||||
|
(rem, Some(rem))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl ExactSizeIterator for IntSliceViewIter<'_> {}
|
||||||
@@ -3,6 +3,7 @@ use crossbeam_channel;
|
|||||||
use hashbrown::HashMap;
|
use hashbrown::HashMap;
|
||||||
use obikseq::k;
|
use obikseq::k;
|
||||||
use obikseq::{CanonicalKmer, Sequence, Unitig};
|
use obikseq::{CanonicalKmer, Sequence, Unitig};
|
||||||
|
#[cfg(not(any(test, feature = "test-utils")))]
|
||||||
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
||||||
use std::cell::RefCell;
|
use std::cell::RefCell;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
|||||||
@@ -17,3 +17,8 @@ serde = { version = "1", features = ["derive"] }
|
|||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
indicatif = "0.17"
|
indicatif = "0.17"
|
||||||
tracing = "0.1.44"
|
tracing = "0.1.44"
|
||||||
|
hwlocality = { version = "1.0.0-alpha.11", features = ["vendored"], optional = true }
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = ["numa"]
|
||||||
|
numa = ["hwlocality"]
|
||||||
|
|||||||
+29
-45
@@ -1,8 +1,6 @@
|
|||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
||||||
use std::sync::{Arc, Mutex};
|
|
||||||
|
|
||||||
use obikpartitionner::{KmerPartition, KmerSpectrum};
|
use obikpartitionner::{KmerPartition, KmerSpectrum};
|
||||||
use obilayeredmap;
|
use obilayeredmap;
|
||||||
@@ -152,31 +150,25 @@ impl KmerIndex {
|
|||||||
let with_counts = self.meta.config.with_counts;
|
let with_counts = self.meta.config.with_counts;
|
||||||
let evidence = self.meta.config.evidence.clone();
|
let evidence = self.meta.config.evidence.clone();
|
||||||
let block_bits = self.meta.config.block_bits;
|
let block_bits = self.meta.config.block_bits;
|
||||||
let total_kmers = AtomicUsize::new(0);
|
let mut total_kmers: usize = 0;
|
||||||
|
let pb = progress_bar("index", n as u64, "partitions");
|
||||||
|
|
||||||
let pb = Arc::new(Mutex::new(progress_bar("index", n as u64, "partitions")));
|
let order: Vec<usize> = (0..n).collect();
|
||||||
|
let runner = crate::numa::PartitionRunner::new();
|
||||||
(0..n).into_par_iter().for_each(|i| {
|
runner.run(
|
||||||
match self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence, block_bits) {
|
&order,
|
||||||
Ok(0) => {}
|
|i| self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence, block_bits),
|
||||||
Ok(n_kmers) => {
|
|i, n_kmers, _| {
|
||||||
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
if n_kmers > 0 {
|
||||||
let pb = pb.lock().unwrap();
|
total_kmers += n_kmers;
|
||||||
pb.inc(1);
|
pb.inc(1);
|
||||||
pb.set_message(format!("{i}: {n_kmers} kmers"));
|
pb.set_message(format!("{i}: {n_kmers} kmers"));
|
||||||
}
|
}
|
||||||
Err(e) => {
|
},
|
||||||
eprintln!("error building layer for partition {i}: {e}");
|
).map_err(OKIError::Partition)?;
|
||||||
std::process::exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
pb.lock().unwrap().finish_and_clear();
|
pb.finish_and_clear();
|
||||||
info!(
|
info!("done — {} total kmers indexed", total_kmers);
|
||||||
"done — {} total kmers indexed",
|
|
||||||
total_kmers.load(Ordering::Relaxed)
|
|
||||||
);
|
|
||||||
|
|
||||||
if !keep_intermediate {
|
if !keep_intermediate {
|
||||||
for i in 0..n {
|
for i in 0..n {
|
||||||
@@ -211,35 +203,27 @@ impl KmerIndex {
|
|||||||
use obilayeredmap::meta::PartitionMeta;
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
|
|
||||||
let n = self.n_partitions();
|
let n = self.n_partitions();
|
||||||
let errors: Vec<_> = (0..n)
|
let order: Vec<usize> = (0..n).collect();
|
||||||
.into_par_iter()
|
let pb = progress_bar("pack", n as u64, "partitions");
|
||||||
.filter_map(|i| {
|
crate::numa::PartitionRunner::new().run(
|
||||||
|
&order,
|
||||||
|
|i| -> OKIResult<()> {
|
||||||
let index_dir = self.partition.part_dir(i).join("index");
|
let index_dir = self.partition.part_dir(i).join("index");
|
||||||
if !index_dir.exists() { return None; }
|
if !index_dir.exists() { return Ok(()); }
|
||||||
let meta = match PartitionMeta::load(&index_dir) {
|
let meta = PartitionMeta::load(&index_dir)
|
||||||
Ok(m) => m,
|
.map_err(|e| OKIError::Io(std::io::Error::new(std::io::ErrorKind::Other, e.to_string())))?;
|
||||||
Err(e) => return Some(OKIError::Io(std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))),
|
|
||||||
};
|
|
||||||
for l in 0..meta.n_layers {
|
for l in 0..meta.n_layers {
|
||||||
let layer_dir = index_dir.join(format!("layer_{l}"));
|
let layer_dir = index_dir.join(format!("layer_{l}"));
|
||||||
let presence_dir = layer_dir.join("presence");
|
let presence_dir = layer_dir.join("presence");
|
||||||
let counts_dir = layer_dir.join("counts");
|
let counts_dir = layer_dir.join("counts");
|
||||||
if presence_dir.exists() {
|
if presence_dir.exists() { pack_bit_matrix(&presence_dir).map_err(OKIError::Io)?; }
|
||||||
if let Err(e) = pack_bit_matrix(&presence_dir) {
|
if counts_dir.exists() { pack_compact_int_matrix(&counts_dir).map_err(OKIError::Io)?; }
|
||||||
return Some(OKIError::Io(e));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if counts_dir.exists() {
|
|
||||||
if let Err(e) = pack_compact_int_matrix(&counts_dir) {
|
|
||||||
return Some(OKIError::Io(e));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
None
|
Ok(())
|
||||||
})
|
},
|
||||||
.collect();
|
|_, _, _| { pb.inc(1); },
|
||||||
|
)?;
|
||||||
if let Some(e) = errors.into_iter().next() { return Err(e); }
|
pb.finish_and_clear();
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ mod distance;
|
|||||||
mod dump;
|
mod dump;
|
||||||
mod index;
|
mod index;
|
||||||
mod merge;
|
mod merge;
|
||||||
|
mod numa;
|
||||||
mod rebuild;
|
mod rebuild;
|
||||||
mod reindex;
|
mod reindex;
|
||||||
mod select;
|
mod select;
|
||||||
|
|||||||
+17
-140
@@ -2,10 +2,8 @@ use std::collections::HashMap;
|
|||||||
use std::fs;
|
use std::fs;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::time::{Duration, Instant};
|
|
||||||
|
|
||||||
use crossbeam_channel::unbounded;
|
use obisys::{Reporter, Stage, progress_bar, spinner};
|
||||||
use obisys::{CpuSample, Reporter, Stage, progress_bar, spinner};
|
|
||||||
use tracing::{debug, info};
|
use tracing::{debug, info};
|
||||||
|
|
||||||
use obilayeredmap::IndexMode;
|
use obilayeredmap::IndexMode;
|
||||||
@@ -13,7 +11,7 @@ use obilayeredmap::IndexMode;
|
|||||||
use crate::error::{OKIError, OKIResult};
|
use crate::error::{OKIError, OKIResult};
|
||||||
use crate::index::KmerIndex;
|
use crate::index::KmerIndex;
|
||||||
use crate::meta::{GenomeInfo, IndexMeta};
|
use crate::meta::{GenomeInfo, IndexMeta};
|
||||||
use crate::state::IndexState;
|
use crate::state::{IndexState, SENTINEL_INDEXED};
|
||||||
|
|
||||||
pub use obikpartitionner::MergeMode;
|
pub use obikpartitionner::MergeMode;
|
||||||
|
|
||||||
@@ -223,156 +221,33 @@ impl KmerIndex {
|
|||||||
let mut order: Vec<usize> = (0..n_partitions).collect();
|
let mut order: Vec<usize> = (0..n_partitions).collect();
|
||||||
order.sort_unstable_by_key(|&i| std::cmp::Reverse(partition_sizes[i]));
|
order.sort_unstable_by_key(|&i| std::cmp::Reverse(partition_sizes[i]));
|
||||||
|
|
||||||
// ── Adaptive worker pool ──────────────────────────────────────────
|
|
||||||
// Start with 1 worker thread. After each completed partition,
|
|
||||||
// measure CPU efficiency (via getrusage delta). If efficiency is
|
|
||||||
// below the spawn threshold and more partitions remain, spawn one
|
|
||||||
// additional worker. Workers share a crossbeam channel of partition
|
|
||||||
// IDs; each reports (id, g_len, duration) on a result channel.
|
|
||||||
const SPAWN_THRESHOLD: f64 = 0.95; // spawn when >5% capacity idle
|
|
||||||
let n_cores = std::thread::available_parallelism()
|
|
||||||
.map(|n| n.get())
|
|
||||||
.unwrap_or(1);
|
|
||||||
let max_workers = (n_cores / 2).max(1);
|
|
||||||
let _ = budget_fraction; // kept in signature for CLI compatibility
|
let _ = budget_fraction; // kept in signature for CLI compatibility
|
||||||
|
|
||||||
let (part_tx, part_rx) = unbounded::<usize>();
|
|
||||||
let (result_tx, result_rx) =
|
|
||||||
unbounded::<(usize, Result<usize, obiskio::SKError>, Duration)>();
|
|
||||||
// activate_tx: controller sends () to wake the next dormant worker.
|
|
||||||
// Dropping activate_tx closes the channel; dormant workers exit.
|
|
||||||
let (activate_tx, activate_rx) = unbounded::<()>();
|
|
||||||
|
|
||||||
for &i in &order {
|
|
||||||
part_tx.send(i).ok();
|
|
||||||
}
|
|
||||||
drop(part_tx);
|
|
||||||
|
|
||||||
let mut part_stats: Vec<PartStat> = Vec::with_capacity(n_partitions);
|
|
||||||
let mut n_workers = 0usize;
|
|
||||||
let mut cpu_sample = CpuSample::now();
|
|
||||||
// Efficiency measured just before each spawn, used to assess
|
|
||||||
// whether the previous worker delivered its expected marginal gain.
|
|
||||||
let mut efficiency_at_last_spawn = 0.0f64;
|
|
||||||
|
|
||||||
// Shadow as references so closures can capture them by copy.
|
// Shadow as references so closures can capture them by copy.
|
||||||
let srcs = &srcs;
|
let srcs = &srcs;
|
||||||
let evidence = &evidence;
|
let evidence = &evidence;
|
||||||
|
|
||||||
std::thread::scope(|s| -> OKIResult<()> {
|
let runner = crate::numa::PartitionRunner::new();
|
||||||
// Pre-spawn max_workers threads; each waits for an activation
|
let mut part_stats: Vec<PartStat> = Vec::with_capacity(n_partitions);
|
||||||
// signal before consuming from part_rx.
|
|
||||||
for _ in 0..max_workers {
|
|
||||||
let prx = part_rx.clone();
|
|
||||||
let rtx = result_tx.clone();
|
|
||||||
let arx = activate_rx.clone();
|
|
||||||
s.spawn(move || {
|
|
||||||
if arx.recv().is_ok() {
|
|
||||||
for i in &prx {
|
|
||||||
let t = Instant::now();
|
|
||||||
let r = dst_partition.merge_partition(
|
|
||||||
i,
|
|
||||||
srcs,
|
|
||||||
mode,
|
|
||||||
n_dst_genomes,
|
|
||||||
block_bits,
|
|
||||||
evidence,
|
|
||||||
);
|
|
||||||
rtx.send((i, r, t.elapsed())).ok();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
drop(result_tx);
|
|
||||||
|
|
||||||
// Activate first worker immediately.
|
runner.run(
|
||||||
activate_tx.send(()).ok();
|
&order,
|
||||||
n_workers = 1;
|
|i| dst_partition.merge_partition(i, srcs, mode, n_dst_genomes, block_bits, evidence),
|
||||||
|
|i, g_len, dur| {
|
||||||
const SPAWN_POLL: Duration = Duration::from_secs(10);
|
|
||||||
|
|
||||||
let mut completed = 0usize;
|
|
||||||
while completed < n_partitions {
|
|
||||||
let result = result_rx.recv_timeout(SPAWN_POLL);
|
|
||||||
|
|
||||||
// On timeout: no partition finished yet, just check efficiency.
|
|
||||||
let (i, r, dur) = match result {
|
|
||||||
Ok(v) => v,
|
|
||||||
Err(crossbeam_channel::RecvTimeoutError::Timeout) => {
|
|
||||||
if n_workers < max_workers {
|
|
||||||
let eff = cpu_sample.cpu_efficiency(n_cores);
|
|
||||||
if eff < SPAWN_THRESHOLD {
|
|
||||||
debug!(
|
|
||||||
"activated worker {} (poll) — efficiency {:.0}%",
|
|
||||||
n_workers + 1,
|
|
||||||
eff * 100.0,
|
|
||||||
);
|
|
||||||
efficiency_at_last_spawn = eff;
|
|
||||||
activate_tx.send(()).ok();
|
|
||||||
n_workers += 1;
|
|
||||||
cpu_sample = CpuSample::now();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
Err(crossbeam_channel::RecvTimeoutError::Disconnected) => {
|
|
||||||
return Err(OKIError::Io(io::Error::new(
|
|
||||||
io::ErrorKind::UnexpectedEof,
|
|
||||||
"worker channel closed",
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let g_len = r.map_err(OKIError::Partition)?;
|
|
||||||
pb.inc(1);
|
pb.inc(1);
|
||||||
debug!(
|
debug!(
|
||||||
"partition {i}: done in {:.1}s — {} new kmers",
|
"partition {i}: done in {:.1}s — {} new kmers",
|
||||||
dur.as_secs_f64(),
|
dur.as_secs_f64(),
|
||||||
g_len
|
|
||||||
);
|
|
||||||
part_stats.push(PartStat {
|
|
||||||
id: i,
|
|
||||||
unitig_bytes: partition_sizes[i],
|
|
||||||
g_len,
|
g_len,
|
||||||
});
|
);
|
||||||
completed += 1;
|
part_stats.push(PartStat { id: i, unitig_bytes: partition_sizes[i], g_len });
|
||||||
|
},
|
||||||
if n_workers < max_workers && completed < n_partitions {
|
).map_err(OKIError::Partition)?;
|
||||||
let eff = cpu_sample.cpu_efficiency(n_cores);
|
|
||||||
// For the first spawn use SPAWN_THRESHOLD.
|
|
||||||
// For subsequent spawns: the previous worker should
|
|
||||||
// have raised efficiency by at least a quarter of the expected
|
|
||||||
// marginal gain (1/n_workers). If not, adding another
|
|
||||||
// worker won't help.
|
|
||||||
let should_spawn = if n_workers == 1 {
|
|
||||||
eff < SPAWN_THRESHOLD
|
|
||||||
} else {
|
|
||||||
let gain = eff - efficiency_at_last_spawn;
|
|
||||||
let expected = 1.0 / n_workers as f64;
|
|
||||||
gain >= expected * 0.25
|
|
||||||
};
|
|
||||||
if should_spawn {
|
|
||||||
debug!(
|
|
||||||
"activated worker {} — efficiency {:.0}%, gain vs prev {:.0}%",
|
|
||||||
n_workers + 1,
|
|
||||||
eff * 100.0,
|
|
||||||
(eff - efficiency_at_last_spawn) * 100.0,
|
|
||||||
);
|
|
||||||
efficiency_at_last_spawn = eff;
|
|
||||||
activate_tx.send(()).ok();
|
|
||||||
n_workers += 1;
|
|
||||||
cpu_sample = CpuSample::now();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Close activate_tx: dormant workers exit cleanly.
|
|
||||||
drop(activate_tx);
|
|
||||||
Ok(())
|
|
||||||
})?;
|
|
||||||
|
|
||||||
pb.finish_and_clear();
|
pb.finish_and_clear();
|
||||||
|
|
||||||
// ── Diagnostic report ─────────────────────────────────────────────
|
// ── Diagnostic report ─────────────────────────────────────────────
|
||||||
print_merge_partition_report(&part_stats, n_workers, max_workers);
|
print_merge_partition_report(&part_stats, runner.max_workers());
|
||||||
|
|
||||||
rep.push(t.stop());
|
rep.push(t.stop());
|
||||||
}
|
}
|
||||||
@@ -388,13 +263,15 @@ impl KmerIndex {
|
|||||||
rep.push(t.stop());
|
rep.push(t.stop());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fs::File::create(output.join(SENTINEL_INDEXED)).map_err(OKIError::Io)?;
|
||||||
|
|
||||||
KmerIndex::open(output)
|
KmerIndex::open(output)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Diagnostic report ─────────────────────────────────────────────────────────
|
// ── Diagnostic report ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
fn print_merge_partition_report(stats: &[PartStat], n_workers: usize, max_workers: usize) {
|
fn print_merge_partition_report(stats: &[PartStat], max_workers: usize) {
|
||||||
let total_new: usize = stats.iter().map(|s| s.g_len).sum();
|
let total_new: usize = stats.iter().map(|s| s.g_len).sum();
|
||||||
let non_empty = stats.iter().filter(|s| s.unitig_bytes > 0).count();
|
let non_empty = stats.iter().filter(|s| s.unitig_bytes > 0).count();
|
||||||
|
|
||||||
@@ -408,7 +285,7 @@ fn print_merge_partition_report(stats: &[PartStat], n_workers: usize, max_worker
|
|||||||
" {} partition(s) processed, {} total new kmers",
|
" {} partition(s) processed, {} total new kmers",
|
||||||
non_empty, total_new,
|
non_empty, total_new,
|
||||||
);
|
);
|
||||||
info!(" workers spawned: {n_workers} / {max_workers} (max)",);
|
info!(" max workers: {max_workers}");
|
||||||
|
|
||||||
// Top 8 partitions by new-kmer count
|
// Top 8 partitions by new-kmer count
|
||||||
let mut by_new: Vec<&PartStat> = stats.iter().filter(|s| s.g_len > 0).collect();
|
let mut by_new: Vec<&PartStat> = stats.iter().filter(|s| s.g_len > 0).collect();
|
||||||
|
|||||||
@@ -0,0 +1,384 @@
|
|||||||
|
// NUMA-aware partition runner via hwlocality.
|
||||||
|
//
|
||||||
|
// Detects NUMA topology using hwloc (cross-platform: Linux, macOS, etc.) and
|
||||||
|
// builds one Rayon ThreadPool per NUMA node with threads pinned to that node's
|
||||||
|
// CPUs. Linux first-touch policy then places graph allocations in local DRAM
|
||||||
|
// automatically — no explicit memory binding needed.
|
||||||
|
//
|
||||||
|
// UMA systems (single socket, Apple Silicon, etc.) are the degenerate case:
|
||||||
|
// one synthetic node containing all cores, no pool, no pinning.
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
|
use crossbeam_channel::unbounded;
|
||||||
|
#[cfg(feature = "numa")]
|
||||||
|
use hwlocality::Topology;
|
||||||
|
#[cfg(feature = "numa")]
|
||||||
|
use hwlocality::cpu::binding::CpuBindingFlags;
|
||||||
|
#[cfg(feature = "numa")]
|
||||||
|
use hwlocality::cpu::cpuset::CpuSet;
|
||||||
|
#[cfg(feature = "numa")]
|
||||||
|
use hwlocality::object::types::ObjectType;
|
||||||
|
use obisys::CpuSample;
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
// ── Public interface ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Result of topology detection by `build()`: one pool slot and one CPU list
/// per node, stored in two parallel vectors.
pub struct NumaSetup {
|
||||||
|
/// One entry per NUMA node. `None` on UMA systems (no pool, no pinning).
|
||||||
|
pub pools: Vec<Option<Arc<rayon::ThreadPool>>>,
|
||||||
|
/// CPU indices for each NUMA node, in node order.
|
||||||
|
pub cpus_per_node: Vec<Vec<usize>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NumaSetup {
|
||||||
|
/// Maximum worker slots per node (one per physical core in the node).
|
||||||
|
pub fn workers_per_node(&self) -> usize {
|
||||||
|
self.cpus_per_node
|
||||||
|
.first()
|
||||||
|
.map(|c| c.len().max(1))
|
||||||
|
.unwrap_or(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect the NUMA topology and build one pinned Rayon pool per node.
///
/// Never fails. If the topology cannot be loaded, fewer than two non-empty
/// NUMA nodes are found, or any pool fails to build, the result is a single
/// synthetic UMA node listing every available core and carrying no pool.
#[cfg(feature = "numa")]
pub fn build() -> NumaSetup {
    'numa: {
        let Ok(topology) = Topology::new() else { break 'numa };

        // CPU indices of every NUMA node that exposes a non-empty cpuset.
        let mut cpus_per_node: Vec<Vec<usize>> = Vec::new();
        for obj in topology.objects_with_type(ObjectType::NUMANode) {
            let Some(cpuset) = obj.cpuset() else { continue };
            let cpus: Vec<usize> = cpuset.iter_set().map(usize::from).collect();
            if !cpus.is_empty() {
                cpus_per_node.push(cpus);
            }
        }

        if cpus_per_node.len() <= 1 {
            break 'numa;
        }

        // All-or-nothing: one failed pool sends us to the UMA fallback.
        let maybe_pools = cpus_per_node
            .iter()
            .map(|cpus| build_pool(cpus).map(Arc::new).map(Some))
            .collect::<Option<Vec<_>>>();
        let Some(pools) = maybe_pools else { break 'numa };

        debug!(
            "NUMA topology: {} node(s), {} core(s)/node",
            cpus_per_node.len(),
            cpus_per_node.first().map_or(0, |v| v.len()),
        );
        return NumaSetup { pools, cpus_per_node };
    }

    // UMA fallback: single synthetic node, all cores, no pool.
    let core_count = match std::thread::available_parallelism() {
        Ok(parallelism) => parallelism.get(),
        Err(_) => 1,
    };
    debug!("UMA: single synthetic node, {} core(s)", core_count);
    let all_cores: Vec<usize> = (0..core_count).collect();
    NumaSetup {
        pools: vec![None],
        cpus_per_node: vec![all_cores],
    }
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "numa"))]
|
||||||
|
pub fn build() -> NumaSetup {
|
||||||
|
let n_cores = std::thread::available_parallelism()
|
||||||
|
.map(|n| n.get())
|
||||||
|
.unwrap_or(1);
|
||||||
|
debug!("UMA: single synthetic node, {} core(s)", n_cores);
|
||||||
|
NumaSetup {
|
||||||
|
pools: vec![None],
|
||||||
|
cpus_per_node: vec![(0..n_cores).collect()],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Bind the calling thread to the CPUs listed in `cpu_indices` via hwloc.
///
/// Best effort: if the topology cannot be loaded or the binding call fails,
/// the function returns silently and the thread keeps running unbound.
#[cfg(feature = "numa")]
pub fn pin_current_thread(cpu_indices: &[usize]) {
    if let Ok(topology) = Topology::new() {
        let mut target = CpuSet::new();
        for cpu in cpu_indices.iter().copied() {
            target.set(cpu);
        }
        // A failed binding is deliberately ignored.
        let _ = topology.bind_cpu(&target, CpuBindingFlags::THREAD);
    }
}
|
||||||
|
|
||||||
|
/// No-op variant compiled when the `numa` feature is disabled: the calling
/// thread is left unbound.
#[cfg(not(feature = "numa"))]
|
||||||
|
pub fn pin_current_thread(_cpu_indices: &[usize]) {}
|
||||||
|
|
||||||
|
// ── Internal helpers ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Build a Rayon pool with one thread per entry of `cpus`, each thread bound
/// to the whole `cpus` set before it starts running pool work.
///
/// Returns `None` if Rayon fails to build the pool (including a failed thread
/// spawn).
#[cfg(feature = "numa")]
fn build_pool(cpus: &[usize]) -> Option<rayon::ThreadPool> {
    let node_cpus = cpus.to_vec();
    let thread_count = node_cpus.len();

    let builder = rayon::ThreadPoolBuilder::new()
        .num_threads(thread_count)
        .spawn_handler(move |thread| {
            // Each pool thread owns its own copy of the CPU list.
            let pin_to = node_cpus.clone();
            std::thread::Builder::new()
                .spawn(move || {
                    pin_current_thread(&pin_to);
                    thread.run();
                })
                .map(|_detached| ())
        });

    builder.build().ok()
}
|
||||||
|
|
||||||
|
// ── PartitionRunner ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Per-node settings consumed by `PartitionRunner::run`.
struct NodeConfig {
|
||||||
|
/// Pool in which this node's workers run `f` (via `install`); with `None`, `f` is called directly on the worker thread.
pool: Option<Arc<rayon::ThreadPool>>,
|
||||||
|
/// CPU indices of this node; passed to `pin_current_thread` by workers that `run` decides to pin.
cpu_ids: Vec<usize>,
|
||||||
|
/// Worker slots contributed by this node; summed by `PartitionRunner::max_workers`.
max_workers: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generic NUMA-aware runner for partition-level parallel work.
|
||||||
|
///
|
||||||
|
/// Workers are distributed round-robin across NUMA nodes and pinned to their
|
||||||
|
/// node's CPUs. UMA is the degenerate case: one node, no pinning.
|
||||||
|
///
|
||||||
|
/// Workers are pre-spawned dormant and activated one by one as CPU efficiency
|
||||||
|
/// falls below `SPAWN_THRESHOLD`. This avoids over-provisioning on I/O-bound
|
||||||
|
/// or memory-bandwidth-bound workloads while saturating CPU-bound ones.
|
||||||
|
///
|
||||||
|
/// # Termination
|
||||||
|
///
|
||||||
|
/// ```text
|
||||||
|
/// drop(part_tx)        → part_rx drains → active workers leave their loop
|
||||||
|
/// completed == n_total → controller loop exits
|
||||||
|
/// drop(activate_tx)    → dormant workers exit cleanly
/// drop(reset_tx)       → timer thread exits
|
||||||
|
/// ```
|
||||||
|
pub struct PartitionRunner {
|
||||||
|
/// One configuration per node, in the order returned by `build()`.
nodes: Vec<NodeConfig>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartitionRunner {
|
||||||
|
/// Total worker slots across all nodes.
|
||||||
|
pub fn max_workers(&self) -> usize {
|
||||||
|
self.nodes.iter().map(|n| n.max_workers).sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect topology and build. Always succeeds.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
let ns = build();
|
||||||
|
let wpn = ns.workers_per_node();
|
||||||
|
debug!(
|
||||||
|
"PartitionRunner: {} node(s) × {} worker(s)/node max",
|
||||||
|
ns.pools.len(),
|
||||||
|
wpn,
|
||||||
|
);
|
||||||
|
let nodes = ns.pools
|
||||||
|
.into_iter()
|
||||||
|
.zip(ns.cpus_per_node)
|
||||||
|
.map(|(pool, cpu_ids)| NodeConfig {
|
||||||
|
pool,
|
||||||
|
cpu_ids,
|
||||||
|
max_workers: wpn,
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
Self { nodes }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run `f(i)` for every index in `order`.
|
||||||
|
///
|
||||||
|
/// Workers are pre-spawned dormant and activated adaptively. A timer thread
|
||||||
|
/// fires a CPU-efficiency check every `TIMER_SECS` seconds; each completed
|
||||||
|
/// partition resets that timer (forcing an immediate check) and also
|
||||||
|
/// triggers its own inline check. A new worker is activated whenever
|
||||||
|
/// efficiency falls below `SPAWN_THRESHOLD`.
|
||||||
|
///
|
||||||
|
/// `on_done(i, result, elapsed)` is called from the controller thread as
|
||||||
|
/// each partition completes — suitable for progress bars and result
|
||||||
|
/// aggregation.
|
||||||
|
///
|
||||||
|
/// Returns the first error produced by `f`, if any.
|
||||||
|
pub fn run<F, R, E, C>(
|
||||||
|
&self,
|
||||||
|
order: &[usize],
|
||||||
|
f: F,
|
||||||
|
mut on_done: C,
|
||||||
|
) -> Result<(), E>
|
||||||
|
where
|
||||||
|
F: Fn(usize) -> Result<R, E> + Send + Sync,
|
||||||
|
R: Send,
|
||||||
|
E: Send,
|
||||||
|
C: FnMut(usize, R, Duration) + Send,
|
||||||
|
{
|
||||||
|
let n_total = order.len();
|
||||||
|
if n_total == 0 {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
const SPAWN_THRESHOLD: f64 = 0.95;
|
||||||
|
const TIMER_SECS: u64 = 30;
|
||||||
|
|
||||||
|
let n_cores = std::thread::available_parallelism()
|
||||||
|
.map(|n| n.get())
|
||||||
|
.unwrap_or(1);
|
||||||
|
|
||||||
|
// ── Channels ──────────────────────────────────────────────────────────
|
||||||
|
let (part_tx, part_rx) = unbounded::<usize>();
|
||||||
|
let (activate_tx, activate_rx) = unbounded::<()>();
|
||||||
|
// reset_tx: controller → timer ("reset the 30 s window")
|
||||||
|
let (reset_tx, reset_rx) = unbounded::<()>();
|
||||||
|
// event_tx: workers + timer → controller (unified event stream)
|
||||||
|
let (event_tx, event_rx) = unbounded::<WorkerEvent<R, E>>();
|
||||||
|
|
||||||
|
for &i in order { part_tx.send(i).ok(); }
|
||||||
|
drop(part_tx);
|
||||||
|
|
||||||
|
let max_workers = self.max_workers();
|
||||||
|
let n_nodes = self.nodes.len();
|
||||||
|
let f = &f;
|
||||||
|
|
||||||
|
let mut first_err: Option<E> = None;
|
||||||
|
|
||||||
|
std::thread::scope(|s| {
|
||||||
|
// ── Timer thread ──────────────────────────────────────────────────
|
||||||
|
// Sends TimerTick every TIMER_SECS seconds. Resets its window each
|
||||||
|
// time reset_rx receives a message (i.e. on partition completion).
|
||||||
|
let timer_tx = event_tx.clone();
|
||||||
|
s.spawn(move || {
|
||||||
|
let period = Duration::from_secs(TIMER_SECS);
|
||||||
|
loop {
|
||||||
|
crossbeam_channel::select! {
|
||||||
|
recv(reset_rx) -> r => {
|
||||||
|
if r.is_err() { break; } // reset_tx dropped → exit
|
||||||
|
}
|
||||||
|
default(period) => {
|
||||||
|
if timer_tx.send(WorkerEvent::TimerTick).is_err() { break; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── Pre-spawn workers dormant, round-robin across NUMA nodes ──────
|
||||||
|
for w in 0..max_workers {
|
||||||
|
let node = &self.nodes[w % n_nodes];
|
||||||
|
let prx = part_rx.clone();
|
||||||
|
let etx = event_tx.clone();
|
||||||
|
let arx = activate_rx.clone();
|
||||||
|
let pool = node.pool.clone();
|
||||||
|
let cpu_ids = &node.cpu_ids;
|
||||||
|
|
||||||
|
s.spawn(move || {
|
||||||
|
if arx.recv().is_err() { return; }
|
||||||
|
if !cpu_ids.is_empty() { pin_current_thread(cpu_ids); }
|
||||||
|
for i in &prx {
|
||||||
|
let t = Instant::now();
|
||||||
|
let r = match &pool {
|
||||||
|
Some(p) => p.install(|| f(i)),
|
||||||
|
None => f(i),
|
||||||
|
};
|
||||||
|
etx.send(WorkerEvent::Completed(i, r, t.elapsed())).ok();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
// Drop controller's event_tx: event_rx closes when all workers +
|
||||||
|
// timer have exited.
|
||||||
|
drop(event_tx);
|
||||||
|
|
||||||
|
// ── Controller ────────────────────────────────────────────────────
|
||||||
|
let initial_workers = n_nodes.min(max_workers).min(n_total);
|
||||||
|
for _ in 0..initial_workers { activate_tx.send(()).ok(); }
|
||||||
|
let mut n_active = initial_workers;
|
||||||
|
let mut cpu_sample = CpuSample::now();
|
||||||
|
let mut eff_at_last_spawn = 0.0f64; // 0 = no previous spawn to evaluate
|
||||||
|
let mut completed = 0usize;
|
||||||
|
|
||||||
|
while completed < n_total {
|
||||||
|
let Ok(event) = event_rx.recv() else { break };
|
||||||
|
match event {
|
||||||
|
WorkerEvent::Completed(i, r, dur) => {
|
||||||
|
match r {
|
||||||
|
Ok(v) => on_done(i, v, dur),
|
||||||
|
Err(e) => { if first_err.is_none() { first_err = Some(e); } }
|
||||||
|
}
|
||||||
|
completed += 1;
|
||||||
|
// Reset the 30 s timer.
|
||||||
|
reset_tx.send(()).ok();
|
||||||
|
// Inline check: same logic as a timer tick.
|
||||||
|
maybe_activate(
|
||||||
|
&activate_tx, &mut n_active, max_workers,
|
||||||
|
&mut cpu_sample, &mut eff_at_last_spawn,
|
||||||
|
n_cores, SPAWN_THRESHOLD, completed, n_total,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
WorkerEvent::TimerTick => {
|
||||||
|
maybe_activate(
|
||||||
|
&activate_tx, &mut n_active, max_workers,
|
||||||
|
&mut cpu_sample, &mut eff_at_last_spawn,
|
||||||
|
n_cores, SPAWN_THRESHOLD, completed, n_total,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dormant workers exit when activate_tx closes.
|
||||||
|
drop(activate_tx);
|
||||||
|
// Timer thread exits when reset_tx closes.
|
||||||
|
drop(reset_tx);
|
||||||
|
});
|
||||||
|
|
||||||
|
match first_err {
|
||||||
|
Some(e) => Err(e),
|
||||||
|
None => Ok(()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Internal event type ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Message delivered to the controller loop of `PartitionRunner::run`.
enum WorkerEvent<R, E> {
|
||||||
|
/// A worker finished one partition: its index, the result of `f`, and the elapsed time.
Completed(usize, Result<R, E>, Duration),
|
||||||
|
/// Sent by the timer thread when its wait period expires without a reset.
TimerTick,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn maybe_activate(
|
||||||
|
activate_tx: &crossbeam_channel::Sender<()>,
|
||||||
|
n_active: &mut usize,
|
||||||
|
max_workers: usize,
|
||||||
|
cpu_sample: &mut CpuSample,
|
||||||
|
eff_at_last_spawn: &mut f64,
|
||||||
|
n_cores: usize,
|
||||||
|
threshold: f64,
|
||||||
|
completed: usize,
|
||||||
|
n_total: usize,
|
||||||
|
) {
|
||||||
|
if *n_active >= max_workers || completed >= n_total { return; }
|
||||||
|
|
||||||
|
let eff = cpu_sample.cpu_efficiency(n_cores);
|
||||||
|
if eff >= threshold { return; } // CPU already saturated
|
||||||
|
|
||||||
|
// Check that the previous activation was beneficial enough.
|
||||||
|
// Going from k-1 → k workers, the minimum acceptable speedup is (k-1+0.2)/(k-1).
|
||||||
|
// For the very first extra worker (n_active == 1, no previous spawn), skip this
|
||||||
|
// check: eff_at_last_spawn == 0 acts as the sentinel.
|
||||||
|
let last_spawn_was_beneficial = if *eff_at_last_spawn < 1e-9 || eff < 1e-9 {
|
||||||
|
true // first additional worker, or measurement too short: no prior data to evaluate
|
||||||
|
} else {
|
||||||
|
let k_new = *n_active as f64; // worker count after the last spawn
|
||||||
|
let min_gain = 0.2 / k_new;
|
||||||
|
let actual_gain = (eff - *eff_at_last_spawn) / eff;
|
||||||
|
actual_gain >= min_gain
|
||||||
|
};
|
||||||
|
|
||||||
|
if last_spawn_was_beneficial {
|
||||||
|
activate_tx.send(()).ok();
|
||||||
|
*eff_at_last_spawn = eff;
|
||||||
|
*n_active += 1;
|
||||||
|
*cpu_sample = CpuSample::now();
|
||||||
|
debug!(
|
||||||
|
"activated worker {}/{} — efficiency {:.0}%",
|
||||||
|
n_active, max_workers, eff * 100.0,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -4,7 +4,6 @@ use std::path::Path;
|
|||||||
|
|
||||||
use obikpartitionner::{KmerFilter, KmerPartition, MergeMode};
|
use obikpartitionner::{KmerFilter, KmerPartition, MergeMode};
|
||||||
use obisys::{Reporter, Stage, progress_bar};
|
use obisys::{Reporter, Stage, progress_bar};
|
||||||
use rayon::prelude::*;
|
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use crate::error::{OKIError, OKIResult};
|
use crate::error::{OKIError, OKIResult};
|
||||||
@@ -83,30 +82,25 @@ impl KmerIndex {
|
|||||||
let src_partition = &src.partition;
|
let src_partition = &src.partition;
|
||||||
let block_bits = meta.config.block_bits;
|
let block_bits = meta.config.block_bits;
|
||||||
|
|
||||||
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
let order: Vec<usize> = (0..n_partitions).collect();
|
||||||
.into_par_iter()
|
let runner = crate::numa::PartitionRunner::new();
|
||||||
.filter_map(|i| {
|
runner.run(
|
||||||
let result = dst_partition
|
&order,
|
||||||
.rebuild_partition(src_partition, i, filters, mode, n_genomes, block_bits)
|
|i| dst_partition.rebuild_partition(src_partition, i, filters, mode, n_genomes, block_bits),
|
||||||
.err();
|
|_, _, _| { pb.inc(1); },
|
||||||
pb.inc(1);
|
).map_err(OKIError::Partition)?;
|
||||||
result
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
pb.finish_and_clear();
|
pb.finish_and_clear();
|
||||||
|
|
||||||
if let Some(e) = errors.into_iter().next() {
|
|
||||||
return Err(OKIError::Partition(e));
|
|
||||||
}
|
|
||||||
|
|
||||||
rep.push(t.stop());
|
rep.push(t.stop());
|
||||||
|
|
||||||
// Write SENTINEL_INDEXED — output is ready to use.
|
// Write SENTINEL_INDEXED — output is ready to use.
|
||||||
fs::File::create(output.join(SENTINEL_INDEXED))?;
|
fs::File::create(output.join(SENTINEL_INDEXED))?;
|
||||||
|
|
||||||
let idx = KmerIndex::open(output)?;
|
let idx = KmerIndex::open(output)?;
|
||||||
|
let t_pack = Stage::start("pack");
|
||||||
idx.pack_matrices()?;
|
idx.pack_matrices()?;
|
||||||
|
rep.push(t_pack.stop());
|
||||||
Ok(idx)
|
Ok(idx)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use std::path::Path;
|
|||||||
use obilayeredmap::{IndexMode, layer::Layer};
|
use obilayeredmap::{IndexMode, layer::Layer};
|
||||||
use obilayeredmap::meta::PartitionMeta;
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
use obisys::{Reporter, Stage, progress_bar};
|
use obisys::{Reporter, Stage, progress_bar};
|
||||||
use rayon::prelude::*;
|
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use crate::error::{OKIError, OKIResult};
|
use crate::error::{OKIError, OKIResult};
|
||||||
@@ -45,25 +44,17 @@ impl KmerIndex {
|
|||||||
let t = Stage::start("reindex");
|
let t = Stage::start("reindex");
|
||||||
let pb = progress_bar("reindex", n as u64, "partitions");
|
let pb = progress_bar("reindex", n as u64, "partitions");
|
||||||
|
|
||||||
let errors: Vec<String> = (0..n)
|
let order: Vec<usize> = (0..n).collect();
|
||||||
.into_par_iter()
|
let runner = crate::numa::PartitionRunner::new();
|
||||||
.filter_map(|i| {
|
runner.run(
|
||||||
let res = reindex_partition(
|
&order,
|
||||||
&self.partition.part_dir(i).join("index"),
|
|i| reindex_partition(&self.partition.part_dir(i).join("index"), &target, block_bits)
|
||||||
&target,
|
.map_err(|e| OKIError::InvalidInput(format!("partition {i}: {e}"))),
|
||||||
block_bits,
|
|_, _, _| { pb.inc(1); },
|
||||||
);
|
)?;
|
||||||
pb.inc(1);
|
|
||||||
res.err().map(|e| format!("partition {i}: {e}"))
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
pb.finish_and_clear();
|
pb.finish_and_clear();
|
||||||
|
|
||||||
if let Some(e) = errors.into_iter().next() {
|
|
||||||
return Err(OKIError::InvalidInput(e));
|
|
||||||
}
|
|
||||||
|
|
||||||
self.meta.config.evidence = target;
|
self.meta.config.evidence = target;
|
||||||
if matches!(self.meta.config.evidence, IndexMode::Exact) {
|
if matches!(self.meta.config.evidence, IndexMode::Exact) {
|
||||||
self.meta.config.block_bits = block_bits;
|
self.meta.config.block_bits = block_bits;
|
||||||
|
|||||||
+24
-40
@@ -3,8 +3,7 @@ use std::io;
|
|||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
use obikpartitionner::{KmerPartition, OutputCol, PARTITIONS_SUBDIR};
|
use obikpartitionner::{KmerPartition, OutputCol, PARTITIONS_SUBDIR};
|
||||||
use obisys::{Stage, progress_bar};
|
use obisys::{Reporter, Stage, progress_bar};
|
||||||
use rayon::prelude::*;
|
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use crate::error::{OKIError, OKIResult};
|
use crate::error::{OKIError, OKIResult};
|
||||||
@@ -26,6 +25,7 @@ impl KmerIndex {
|
|||||||
threshold: u32,
|
threshold: u32,
|
||||||
output_presence: bool,
|
output_presence: bool,
|
||||||
force: bool,
|
force: bool,
|
||||||
|
rep: &mut Reporter,
|
||||||
) -> OKIResult<Self> {
|
) -> OKIResult<Self> {
|
||||||
let output = output.as_ref();
|
let output = output.as_ref();
|
||||||
|
|
||||||
@@ -72,31 +72,23 @@ impl KmerIndex {
|
|||||||
let pb = progress_bar("select", n_partitions as u64, "partitions");
|
let pb = progress_bar("select", n_partitions as u64, "partitions");
|
||||||
let src_partition = &src.partition;
|
let src_partition = &src.partition;
|
||||||
|
|
||||||
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
let order: Vec<usize> = (0..n_partitions).collect();
|
||||||
.into_par_iter()
|
let runner = crate::numa::PartitionRunner::new();
|
||||||
.filter_map(|i| {
|
runner.run(
|
||||||
let result = dst_partition.select_partition(
|
&order,
|
||||||
src_partition, i, specs,
|
|i| dst_partition.select_partition(src_partition, i, specs, n_src_genomes, threshold, output_presence, false),
|
||||||
n_src_genomes, threshold, output_presence,
|
|_, _, _| { pb.inc(1); },
|
||||||
false,
|
).map_err(OKIError::Partition)?;
|
||||||
);
|
|
||||||
pb.inc(1);
|
|
||||||
result.err()
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
pb.finish_and_clear();
|
pb.finish_and_clear();
|
||||||
|
rep.push(t.stop());
|
||||||
if let Some(e) = errors.into_iter().next() {
|
|
||||||
return Err(OKIError::Partition(e));
|
|
||||||
}
|
|
||||||
|
|
||||||
let _ = t.stop();
|
|
||||||
|
|
||||||
fs::File::create(output.join(SENTINEL_INDEXED))?;
|
fs::File::create(output.join(SENTINEL_INDEXED))?;
|
||||||
|
|
||||||
let idx = KmerIndex::open(output)?;
|
let idx = KmerIndex::open(output)?;
|
||||||
|
let t_pack = Stage::start("pack");
|
||||||
idx.pack_matrices()?;
|
idx.pack_matrices()?;
|
||||||
|
rep.push(t_pack.stop());
|
||||||
Ok(idx)
|
Ok(idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -108,6 +100,7 @@ impl KmerIndex {
|
|||||||
specs: &[OutputCol],
|
specs: &[OutputCol],
|
||||||
threshold: u32,
|
threshold: u32,
|
||||||
output_presence: bool,
|
output_presence: bool,
|
||||||
|
rep: &mut Reporter,
|
||||||
) -> OKIResult<()> {
|
) -> OKIResult<()> {
|
||||||
if self.state() != IndexState::Indexed {
|
if self.state() != IndexState::Indexed {
|
||||||
return Err(OKIError::NotIndexed(self.root_path.clone()));
|
return Err(OKIError::NotIndexed(self.root_path.clone()));
|
||||||
@@ -116,7 +109,6 @@ impl KmerIndex {
|
|||||||
let n_src_genomes = self.meta.genomes.len();
|
let n_src_genomes = self.meta.genomes.len();
|
||||||
let n_partitions = self.partition.n_partitions();
|
let n_partitions = self.partition.n_partitions();
|
||||||
|
|
||||||
// Open a second handle to the same path so we can borrow src and dst simultaneously.
|
|
||||||
let src_partition = KmerPartition::open_with_config(
|
let src_partition = KmerPartition::open_with_config(
|
||||||
&self.root_path,
|
&self.root_path,
|
||||||
self.meta.config.kmer_size,
|
self.meta.config.kmer_size,
|
||||||
@@ -132,35 +124,27 @@ impl KmerIndex {
|
|||||||
let t = Stage::start("select");
|
let t = Stage::start("select");
|
||||||
let pb = progress_bar("select", n_partitions as u64, "partitions");
|
let pb = progress_bar("select", n_partitions as u64, "partitions");
|
||||||
|
|
||||||
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
let partition = &self.partition;
|
||||||
.into_par_iter()
|
let order: Vec<usize> = (0..n_partitions).collect();
|
||||||
.filter_map(|i| {
|
let runner = crate::numa::PartitionRunner::new();
|
||||||
let result = self.partition.select_partition(
|
runner.run(
|
||||||
&src_partition, i, specs,
|
&order,
|
||||||
n_src_genomes, threshold, output_presence,
|
|i| partition.select_partition(&src_partition, i, specs, n_src_genomes, threshold, output_presence, true),
|
||||||
true,
|
|_, _, _| { pb.inc(1); },
|
||||||
);
|
).map_err(OKIError::Partition)?;
|
||||||
pb.inc(1);
|
|
||||||
result.err()
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
pb.finish_and_clear();
|
pb.finish_and_clear();
|
||||||
|
rep.push(t.stop());
|
||||||
|
|
||||||
if let Some(e) = errors.into_iter().next() {
|
|
||||||
return Err(OKIError::Partition(e));
|
|
||||||
}
|
|
||||||
|
|
||||||
let _ = t.stop();
|
|
||||||
|
|
||||||
// Update index.meta with new genome list and with_counts flag.
|
|
||||||
self.meta.config.with_counts = !output_presence;
|
self.meta.config.with_counts = !output_presence;
|
||||||
self.meta.genomes = specs.iter()
|
self.meta.genomes = specs.iter()
|
||||||
.map(|s| GenomeInfo::new(s.label.clone()))
|
.map(|s| GenomeInfo::new(s.label.clone()))
|
||||||
.collect();
|
.collect();
|
||||||
self.meta.write(&self.root_path)?;
|
self.meta.write(&self.root_path)?;
|
||||||
|
|
||||||
|
let t_pack = Stage::start("pack");
|
||||||
self.pack_matrices()?;
|
self.pack_matrices()?;
|
||||||
|
rep.push(t_pack.stop());
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "obikmer"
|
name = "obikmer"
|
||||||
version = "0.1.0"
|
version = "1.1.27"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
|
|
||||||
[[bin]]
|
[[bin]]
|
||||||
@@ -18,7 +18,8 @@ obikrope = { path = "../obikrope" }
|
|||||||
obikpartitionner = { path = "../obikpartitionner" }
|
obikpartitionner = { path = "../obikpartitionner" }
|
||||||
obisys = { path = "../obisys" }
|
obisys = { path = "../obisys" }
|
||||||
obiskio = { path = "../obiskio" }
|
obiskio = { path = "../obiskio" }
|
||||||
obikindex = { path = "../obikindex" }
|
obikindex = { path = "../obikindex", default-features = false }
|
||||||
|
obitaxonomy = { path = "../obitaxonomy" }
|
||||||
obilayeredmap = { path = "../obilayeredmap" }
|
obilayeredmap = { path = "../obilayeredmap" }
|
||||||
clap = { version = "4", features = ["derive"] }
|
clap = { version = "4", features = ["derive"] }
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
@@ -32,4 +33,6 @@ tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
|
|||||||
pprof = { version = "0.13", features = ["prost-codec"], optional = true }
|
pprof = { version = "0.13", features = ["prost-codec"], optional = true }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
|
default = ["numa"]
|
||||||
|
numa = ["obikindex/numa"]
|
||||||
profiling = ["dep:pprof"]
|
profiling = ["dep:pprof"]
|
||||||
|
|||||||
+2
-63
@@ -1,9 +1,9 @@
|
|||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::sync::{Arc, Condvar, Mutex};
|
|
||||||
|
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
use obiread::NucPage;
|
use obiread::NucPage;
|
||||||
use obikseq::RoutableSuperKmer;
|
use obikseq::RoutableSuperKmer;
|
||||||
|
use obipipeline::Throttled;
|
||||||
|
|
||||||
// ── Shared arguments ──────────────────────────────────────────────────────────
|
// ── Shared arguments ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -103,54 +103,10 @@ impl CommonArgs {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Open-file throttling ──────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
struct FileSlots {
|
|
||||||
count: Mutex<usize>,
|
|
||||||
condvar: Condvar,
|
|
||||||
max: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl FileSlots {
|
|
||||||
fn new(max: usize) -> Self {
|
|
||||||
Self { count: Mutex::new(0), condvar: Condvar::new(), max }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn acquire(&self) {
|
|
||||||
let mut count = self.count.lock().unwrap();
|
|
||||||
while *count >= self.max {
|
|
||||||
count = self.condvar.wait(count).unwrap();
|
|
||||||
}
|
|
||||||
*count += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn release(&self) {
|
|
||||||
let mut count = self.count.lock().unwrap();
|
|
||||||
*count -= 1;
|
|
||||||
self.condvar.notify_one();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct SlotsGuard(Arc<FileSlots>);
|
|
||||||
|
|
||||||
impl Drop for SlotsGuard {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
self.0.release();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Pipeline data carrier ─────────────────────────────────────────────────────
|
// ── Pipeline data carrier ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
/// A path bundled with an opaque guard token.
|
|
||||||
/// The guard is acquired in the source thread and dropped by the flat worker
|
|
||||||
/// once the file is fully read, releasing the open-file slot.
|
|
||||||
pub struct PathWithSlot {
|
|
||||||
pub path: PathBuf,
|
|
||||||
pub _guard: Box<dyn Send + 'static>,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub enum PipelineData {
|
pub enum PipelineData {
|
||||||
Path(PathWithSlot),
|
Path(Throttled<PathBuf>),
|
||||||
NucPage(NucPage),
|
NucPage(NucPage),
|
||||||
Batch(Vec<RoutableSuperKmer>),
|
Batch(Vec<RoutableSuperKmer>),
|
||||||
}
|
}
|
||||||
@@ -158,20 +114,3 @@ pub enum PipelineData {
|
|||||||
unsafe impl Send for PipelineData {}
|
unsafe impl Send for PipelineData {}
|
||||||
unsafe impl Sync for PipelineData {}
|
unsafe impl Sync for PipelineData {}
|
||||||
|
|
||||||
/// Wrap a path iterator so that at most `max_open` files are open simultaneously.
|
|
||||||
/// Acquisition happens in the caller's thread (the pipeline source thread),
|
|
||||||
/// never inside a worker, preventing deadlocks.
|
|
||||||
pub fn throttle_paths(
|
|
||||||
source: impl Iterator<Item = PathBuf> + Send + 'static,
|
|
||||||
max_open: usize,
|
|
||||||
) -> impl Iterator<Item = PathWithSlot> + Send + 'static {
|
|
||||||
let slots = Arc::new(FileSlots::new(max_open));
|
|
||||||
source.map(move |path| {
|
|
||||||
slots.acquire();
|
|
||||||
PathWithSlot {
|
|
||||||
path,
|
|
||||||
_guard: Box::new(SlotsGuard(Arc::clone(&slots))),
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ use std::collections::HashMap;
|
|||||||
use clap::Args;
|
use clap::Args;
|
||||||
use obikindex::GenomeInfo;
|
use obikindex::GenomeInfo;
|
||||||
use obikpartitionner::{GroupQuorumFilter, KmerFilter};
|
use obikpartitionner::{GroupQuorumFilter, KmerFilter};
|
||||||
|
use obitaxonomy::{TaxPath, TaxPattern};
|
||||||
|
|
||||||
// ── Operator ──────────────────────────────────────────────────────────────────
|
// ── Operator ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -49,7 +50,6 @@ impl MetaPred {
|
|||||||
if values.iter().any(|v| v.is_empty()) {
|
if values.iter().any(|v| v.is_empty()) {
|
||||||
return Err(format!("empty value in predicate: {s}"));
|
return Err(format!("empty value in predicate: {s}"));
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Self { key, op, values })
|
Ok(Self { key, op, values })
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -70,18 +70,15 @@ impl MetaPred {
|
|||||||
|
|
||||||
// ── Path matching ─────────────────────────────────────────────────────────────
|
// ── Path matching ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
/// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy.
|
/// True if the stored taxonomy `value` matches `pattern`.
|
||||||
///
|
///
|
||||||
/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary.
|
/// `value` must be a valid `TaxPath` (starts with `taxonomy:/`).
|
||||||
/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere.
|
/// `pattern` is a `TaxPattern` query (see `obitaxonomy::TaxPattern` for syntax).
|
||||||
|
/// Returns `false` if either fails to parse.
|
||||||
fn path_matches(value: &str, pattern: &str) -> bool {
|
fn path_matches(value: &str, pattern: &str) -> bool {
|
||||||
if pattern.starts_with('/') {
|
let Ok(path) = TaxPath::parse(value) else { return false };
|
||||||
value == pattern
|
let Ok(pat) = TaxPattern::parse(pattern) else { return false };
|
||||||
|| (value.starts_with(pattern)
|
pat.matches(&path)
|
||||||
&& value[pattern.len()..].starts_with('/'))
|
|
||||||
} else {
|
|
||||||
value.split('/').any(|seg| seg == pattern)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Three-value group evaluation ──────────────────────────────────────────────
|
// ── Three-value group evaluation ──────────────────────────────────────────────
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ use std::path::PathBuf;
|
|||||||
use clap::{Args, ValueEnum};
|
use clap::{Args, ValueEnum};
|
||||||
use obikindex::{GenomeInfo, KmerIndex};
|
use obikindex::{GenomeInfo, KmerIndex};
|
||||||
use obikpartitionner::{AggOp, OutputCol};
|
use obikpartitionner::{AggOp, OutputCol};
|
||||||
|
use obisys::Reporter;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use super::predicate::matching_genome_indices;
|
use super::predicate::matching_genome_indices;
|
||||||
@@ -229,20 +230,24 @@ pub fn run(args: SelectArgs) {
|
|||||||
if output_presence { "presence" } else { "count" },
|
if output_presence { "presence" } else { "count" },
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let mut rep = Reporter::new();
|
||||||
|
|
||||||
if args.in_place {
|
if args.in_place {
|
||||||
src.select_in_place(&specs, args.presence_threshold, output_presence)
|
src.select_in_place(&specs, args.presence_threshold, output_presence, &mut rep)
|
||||||
.unwrap_or_else(|e| {
|
.unwrap_or_else(|e| {
|
||||||
eprintln!("select error: {e}");
|
eprintln!("select error: {e}");
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
});
|
});
|
||||||
|
rep.print();
|
||||||
info!("selected in-place → {}", args.source.display());
|
info!("selected in-place → {}", args.source.display());
|
||||||
} else {
|
} else {
|
||||||
let output = args.output.unwrap();
|
let output = args.output.unwrap();
|
||||||
KmerIndex::select(&output, &src, &specs, args.presence_threshold, output_presence, args.force)
|
KmerIndex::select(&output, &src, &specs, args.presence_threshold, output_presence, args.force, &mut rep)
|
||||||
.unwrap_or_else(|e| {
|
.unwrap_or_else(|e| {
|
||||||
eprintln!("select error: {e}");
|
eprintln!("select error: {e}");
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
});
|
});
|
||||||
|
rep.print();
|
||||||
info!("selected index → {}", output.display());
|
info!("selected index → {}", output.display());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
use std::io::{self, BufWriter, Write};
|
use std::io::{self, BufWriter, Write};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
use obifastwrite::write_scatter;
|
use obifastwrite::write_scatter;
|
||||||
use obikseq::{RoutableSuperKmer, set_k, set_m};
|
use obikseq::{RoutableSuperKmer, set_k, set_m};
|
||||||
|
|
||||||
use crate::cli::{CommonArgs, PipelineData, PathWithSlot, partitions_to_bits, throttle_paths};
|
use obipipeline::{Throttled, throttle};
|
||||||
|
|
||||||
|
use crate::cli::{CommonArgs, PipelineData, partitions_to_bits};
|
||||||
|
|
||||||
#[derive(Args)]
|
#[derive(Args)]
|
||||||
pub struct SuperkmerArgs {
|
pub struct SuperkmerArgs {
|
||||||
@@ -46,14 +49,15 @@ pub fn run(args: SuperkmerArgs) {
|
|||||||
set_k(k);
|
set_k(k);
|
||||||
set_m(m);
|
set_m(m);
|
||||||
|
|
||||||
let path_source = throttle_paths(args.common.seqfile_paths(), max_open);
|
let path_source = throttle(args.common.seqfile_paths(), max_open);
|
||||||
|
|
||||||
let pipe = obipipeline::make_pipe! {
|
let pipe = obipipeline::make_pipe! {
|
||||||
PipelineData : PathWithSlot => Vec<RoutableSuperKmer>,
|
PipelineData : Throttled<PathBuf> => Vec<RoutableSuperKmer>,
|
||||||
||? {
|
||? {
|
||||||
let k = k;
|
let k = k;
|
||||||
move |pw: PathWithSlot| {
|
move |pw: Throttled<PathBuf>| {
|
||||||
let path_str = pw.path.to_str().unwrap_or("").to_owned();
|
let path_str = pw.item.to_str().unwrap_or("").to_owned();
|
||||||
|
let _guard = pw.guard;
|
||||||
obiread::open_nuc_stream(&path_str, k)
|
obiread::open_nuc_stream(&path_str, k)
|
||||||
}
|
}
|
||||||
} : Path => NucPage,
|
} : Path => NucPage,
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ use clap::{Parser, Subcommand};
|
|||||||
use tracing_subscriber::{EnvFilter, fmt};
|
use tracing_subscriber::{EnvFilter, fmt};
|
||||||
|
|
||||||
#[derive(Parser)]
|
#[derive(Parser)]
|
||||||
#[command(name = "obikmer", about = "DNA k-mer tools")]
|
#[command(name = "obikmer", about = "DNA k-mer tools", version)]
|
||||||
struct Cli {
|
struct Cli {
|
||||||
#[command(subcommand)]
|
#[command(subcommand)]
|
||||||
command: Commands,
|
command: Commands,
|
||||||
|
|||||||
@@ -6,16 +6,17 @@ use std::time::Instant;
|
|||||||
use obisys::spinner;
|
use obisys::spinner;
|
||||||
use obiread::NucPage;
|
use obiread::NucPage;
|
||||||
use obikpartitionner::KmerPartition;
|
use obikpartitionner::KmerPartition;
|
||||||
|
use obipipeline::{ThrottleGuard, Throttled, throttle};
|
||||||
use obisys::{Reporter, Stage};
|
use obisys::{Reporter, Stage};
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use crate::cli::{PipelineData, PathWithSlot, throttle_paths};
|
use crate::cli::PipelineData;
|
||||||
|
|
||||||
// ── Iterator that keeps the slot guard alive until the file is exhausted ──────
|
// ── Iterator that keeps the slot guard alive until the file is exhausted ──────
|
||||||
|
|
||||||
struct GuardedIter {
|
struct GuardedIter {
|
||||||
inner: Box<dyn Iterator<Item = NucPage> + Send>,
|
inner: Box<dyn Iterator<Item = NucPage> + Send>,
|
||||||
_guard: Box<dyn Send + 'static>,
|
_guard: ThrottleGuard,
|
||||||
flat_active: Arc<AtomicU32>,
|
flat_active: Arc<AtomicU32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -49,7 +50,7 @@ pub fn scatter(
|
|||||||
use obikseq::RoutableSuperKmer;
|
use obikseq::RoutableSuperKmer;
|
||||||
|
|
||||||
// Throttle in the source thread — never in a worker — to prevent deadlock.
|
// Throttle in the source thread — never in a worker — to prevent deadlock.
|
||||||
let throttled = throttle_paths(path_source, max_open);
|
let throttled = throttle(path_source, max_open);
|
||||||
|
|
||||||
let file_count = Arc::new(AtomicU64::new(0));
|
let file_count = Arc::new(AtomicU64::new(0));
|
||||||
let flat_active = Arc::new(AtomicU32::new(0));
|
let flat_active = Arc::new(AtomicU32::new(0));
|
||||||
@@ -57,19 +58,20 @@ pub fn scatter(
|
|||||||
|
|
||||||
let t = Stage::start("scatter");
|
let t = Stage::start("scatter");
|
||||||
let pipe = obipipeline::make_pipe! {
|
let pipe = obipipeline::make_pipe! {
|
||||||
PipelineData : PathWithSlot => Vec<RoutableSuperKmer>,
|
PipelineData : Throttled<PathBuf> => Vec<RoutableSuperKmer>,
|
||||||
||? {
|
||? {
|
||||||
let file_count = Arc::clone(&file_count);
|
let file_count = Arc::clone(&file_count);
|
||||||
let flat_active = Arc::clone(&flat_active);
|
let flat_active = Arc::clone(&flat_active);
|
||||||
let k = k;
|
let k = k;
|
||||||
move |pw: PathWithSlot| {
|
move |pw: Throttled<PathBuf>| {
|
||||||
let PathWithSlot { path, _guard } = pw;
|
let path = pw.item;
|
||||||
|
let guard = pw.guard;
|
||||||
let n = file_count.fetch_add(1, Ordering::Relaxed) + 1;
|
let n = file_count.fetch_add(1, Ordering::Relaxed) + 1;
|
||||||
info!("indexing [{}]: {}", n, path.display());
|
info!("indexing [{}]: {}", n, path.display());
|
||||||
let path_str = path.to_str().unwrap_or("").to_owned();
|
let path_str = path.to_str().unwrap_or("").to_owned();
|
||||||
flat_active.fetch_add(1, Ordering::Relaxed);
|
flat_active.fetch_add(1, Ordering::Relaxed);
|
||||||
obiread::open_nuc_stream(&path_str, k)
|
obiread::open_nuc_stream(&path_str, k)
|
||||||
.map(|iter| GuardedIter { inner: iter, _guard, flat_active: Arc::clone(&flat_active) })
|
.map(|iter| GuardedIter { inner: iter, _guard: guard, flat_active: Arc::clone(&flat_active) })
|
||||||
}
|
}
|
||||||
} : Path => NucPage,
|
} : Path => NucPage,
|
||||||
| {
|
| {
|
||||||
|
|||||||
@@ -0,0 +1,84 @@
|
|||||||
|
use std::fs;
|
||||||
|
use std::io;
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
|
use obicompactvec::{PersistentBitVecBuilder, PersistentCompactIntVecBuilder};
|
||||||
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
|
use obilayeredmap::{IndexMode, OLMError};
|
||||||
|
use obiskio::{SKError, SKResult};
|
||||||
|
|
||||||
|
// ── olm_to_sk ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Convert an `OLMError` into an `SKError`.
///
/// An `OLMError::Io` is unwrapped and re-wrapped as `SKError::Io`, keeping the
/// underlying I/O error. Every other variant becomes `SKError::InvalidData`,
/// carrying the caller-supplied `context` label and the error's `to_string()`
/// text as `detail`.
pub(crate) fn olm_to_sk(e: OLMError, context: &'static str) -> SKError {
|
||||||
|
match e {
|
||||||
|
OLMError::Io(e) => SKError::Io(e),
|
||||||
|
other => SKError::InvalidData {
|
||||||
|
context,
|
||||||
|
detail: other.to_string(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── load_meta ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Load PartitionMeta, or recover it by probing layer directories.
|
||||||
|
/// Indexes built before meta.json was introduced lack the file.
///
/// If `PartitionMeta::load(dir)` succeeds, its result is returned unchanged.
/// If it fails with an I/O error of kind `NotFound`, the metadata is rebuilt:
/// `n_layers` is the number of consecutive `layer_0`, `layer_1`, … entries that
/// exist under `dir`, `mode` is `IndexMode::default()`, and the rebuilt value is
/// saved back to `dir` before being returned.
///
/// # Errors
/// Any other load error, and any error from saving the rebuilt metadata, is
/// converted with `olm_to_sk` using `context`.
|
||||||
|
pub(crate) fn load_meta(dir: &Path, context: &'static str) -> SKResult<PartitionMeta> {
|
||||||
|
match PartitionMeta::load(dir) {
|
||||||
|
Ok(m) => Ok(m),
|
||||||
|
// Only a missing file triggers recovery; every other failure is reported below.
Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) =>
|
||||||
|
{
|
||||||
|
// Count layer_0, layer_1, … and stop at the first index that does not exist.
let mut n = 0usize;
|
||||||
|
while dir.join(format!("layer_{n}")).exists() {
|
||||||
|
n += 1;
|
||||||
|
}
|
||||||
|
// The original mode cannot be read back here, so the default is used.
let m = PartitionMeta {
|
||||||
|
n_layers: n,
|
||||||
|
mode: IndexMode::default(),
|
||||||
|
};
|
||||||
|
// Persist the rebuilt metadata in `dir`.
m.save(dir).map_err(|e| olm_to_sk(e, context))?;
|
||||||
|
Ok(m)
|
||||||
|
}
|
||||||
|
Err(e) => Err(olm_to_sk(e, context)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── path helpers ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Build the path of column `col` inside `dir` with the `.pbiv` extension.
///
/// The file name is `col_` followed by the column index zero-padded to at
/// least six digits, e.g. `col_000003.pbiv`.
/// NOTE(review): `.pbiv` is presumably the persistent bit-vector format
/// (cf. `ColBuilder::Bit`) — confirm against the writer.
pub(crate) fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
|
||||||
|
dir.join(format!("col_{col:06}.pbiv"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build the path of column `col` inside `dir` with the `.pciv` extension.
///
/// The file name is `col_` followed by the column index zero-padded to at
/// least six digits, e.g. `col_000003.pciv`.
/// NOTE(review): `.pciv` is presumably the persistent compact-int-vector format
/// (cf. `ColBuilder::Int`) — confirm against the writer.
pub(crate) fn col_path_int(dir: &Path, col: usize) -> PathBuf {
|
||||||
|
dir.join(format!("col_{col:06}.pciv"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write `meta.json` into `dir`.
///
/// The file holds a single-line JSON object `{"n":<n>,"n_cols":<n_cols>}`
/// terminated by a newline. The text is produced with `format!`, not a JSON
/// serializer; both fields are plain integers.
///
/// # Errors
/// Returns the `io::Error` reported by `fs::write`.
pub(crate) fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
|
||||||
|
fs::write(
|
||||||
|
dir.join("meta.json"),
|
||||||
|
// `{{` and `}}` are escaped braces: they emit literal `{` and `}`.
format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── ColBuilder ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Builder for one matrix column, in one of two storage flavours.
pub(crate) enum ColBuilder {
|
||||||
|
/// Column backed by a `PersistentBitVecBuilder` (one boolean per slot).
Bit(PersistentBitVecBuilder),
|
||||||
|
/// Column backed by a `PersistentCompactIntVecBuilder` (one `u32` value per slot).
Int(PersistentCompactIntVecBuilder),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ColBuilder {
|
||||||
|
/// Store `value` at `slot`.
///
/// A `Bit` column records only whether `value` is non-zero (`value > 0`);
/// an `Int` column records `value` itself.
pub(crate) fn set_val(&mut self, slot: usize, value: u32) {
|
||||||
|
match self {
|
||||||
|
ColBuilder::Bit(b) => b.set(slot, value > 0),
|
||||||
|
ColBuilder::Int(b) => b.set(slot, value),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consume the builder and close the underlying column builder.
///
/// # Errors
/// An error returned by the inner `close()` is wrapped as `SKError::Io`.
pub(crate) fn close(self) -> SKResult<()> {
|
||||||
|
match self {
|
||||||
|
ColBuilder::Bit(b) => b.close().map_err(SKError::Io),
|
||||||
|
ColBuilder::Int(b) => b.close().map_err(SKError::Io),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,9 +1,24 @@
|
|||||||
|
use obicompactvec::FilterMask;
|
||||||
|
|
||||||
/// Trait for kmer row filters.
|
/// Trait for kmer row filters.
|
||||||
///
|
///
|
||||||
/// `row` contains raw per-genome counts (or 0/1 for presence/absence data).
|
/// `row` contains raw per-genome counts (or 0/1 for presence/absence data).
|
||||||
/// `n_genomes` equals `row.len()`.
|
/// `n_genomes` equals `row.len()`.
|
||||||
pub trait KmerFilter: Send + Sync {
|
pub trait KmerFilter: Send + Sync {
|
||||||
fn passes(&self, row: &[u32], n_genomes: usize) -> bool;
|
fn passes(&self, row: &[u32], n_genomes: usize) -> bool;
|
||||||
|
|
||||||
|
/// Express this filter as a [`FilterMask`] column-operation expression.
|
||||||
|
///
|
||||||
|
/// Returns `Some(expr)` if the filter can be evaluated solely from matrix
|
||||||
|
/// column aggregates (no per-kmer row scan needed). Returns `None` if the
|
||||||
|
/// filter requires row-level inspection.
|
||||||
|
///
|
||||||
|
/// `threshold` semantics in the returned mask use `>= threshold`, matching
|
||||||
|
/// [`obicompactvec::MatrixGroupOps`]. Implementations must add 1 to any
|
||||||
|
/// row-level threshold that uses strict `>` comparison.
|
||||||
|
fn column_mask_expr(&self, _n_genomes: usize) -> Option<FilterMask> {
|
||||||
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// True when `row` passes every filter in `filters`.
|
/// True when `row` passes every filter in `filters`.
|
||||||
@@ -29,6 +44,16 @@ impl KmerFilter for MinGenomeFraction {
|
|||||||
let p = present_count(row, self.threshold);
|
let p = present_count(row, self.threshold);
|
||||||
p as f64 / n_genomes as f64 >= self.frac
|
p as f64 / n_genomes as f64 >= self.frac
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
|
||||||
|
let t = self.threshold.checked_add(1)?;
|
||||||
|
let min_count = (self.frac * n_genomes as f64).ceil() as usize;
|
||||||
|
Some(FilterMask::PresenceGeq {
|
||||||
|
indices: (0..n_genomes).collect(),
|
||||||
|
threshold: t,
|
||||||
|
min_count,
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// At most `frac` fraction of genomes contain this kmer (count > `threshold`).
|
/// At most `frac` fraction of genomes contain this kmer (count > `threshold`).
|
||||||
@@ -42,6 +67,16 @@ impl KmerFilter for MaxGenomeFraction {
|
|||||||
let p = present_count(row, self.threshold);
|
let p = present_count(row, self.threshold);
|
||||||
p as f64 / n_genomes as f64 <= self.frac
|
p as f64 / n_genomes as f64 <= self.frac
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
|
||||||
|
let t = self.threshold.checked_add(1)?;
|
||||||
|
let max_count = (self.frac * n_genomes as f64).floor() as usize;
|
||||||
|
Some(FilterMask::PresenceLeq {
|
||||||
|
indices: (0..n_genomes).collect(),
|
||||||
|
threshold: t,
|
||||||
|
max_count,
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// At least `count` genomes contain this kmer (count > `threshold`).
|
/// At least `count` genomes contain this kmer (count > `threshold`).
|
||||||
@@ -54,6 +89,15 @@ impl KmerFilter for MinGenomeCount {
|
|||||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||||
present_count(row, self.threshold) >= self.count
|
present_count(row, self.threshold) >= self.count
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Column-mask form of this filter: a `PresenceGeq` over all `n_genomes`
/// columns that requires at least `self.count` of them to reach
/// `self.threshold + 1`.
///
/// Returns `None` when `self.threshold + 1` overflows (`checked_add`).
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
|
||||||
|
// The mask compares with `>=`; presence is documented as count > threshold, so add 1.
let t = self.threshold.checked_add(1)?;
|
||||||
|
Some(FilterMask::PresenceGeq {
|
||||||
|
indices: (0..n_genomes).collect(),
|
||||||
|
threshold: t,
|
||||||
|
min_count: self.count,
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// At most `count` genomes contain this kmer (count > `threshold`).
|
/// At most `count` genomes contain this kmer (count > `threshold`).
|
||||||
@@ -66,6 +110,15 @@ impl KmerFilter for MaxGenomeCount {
|
|||||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||||
present_count(row, self.threshold) <= self.count
|
present_count(row, self.threshold) <= self.count
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Column-mask form of this filter: a `PresenceLeq` over all `n_genomes`
/// columns that allows at most `self.count` of them to reach
/// `self.threshold + 1`.
///
/// Returns `None` when `self.threshold + 1` overflows (`checked_add`).
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
|
||||||
|
// The mask compares with `>=`; presence is documented as count > threshold, so add 1.
let t = self.threshold.checked_add(1)?;
|
||||||
|
Some(FilterMask::PresenceLeq {
|
||||||
|
indices: (0..n_genomes).collect(),
|
||||||
|
threshold: t,
|
||||||
|
max_count: self.count,
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Total-count filters (count indexes only) ───────────────────────────────────
|
// ── Total-count filters (count indexes only) ───────────────────────────────────
|
||||||
@@ -79,6 +132,13 @@ impl KmerFilter for MinTotalCount {
|
|||||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||||
row.iter().sum::<u32>() >= self.total
|
row.iter().sum::<u32>() >= self.total
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Column-mask form of this filter: a `SumGeq` over all `n_genomes` columns
/// with `min_sum` set to `self.total`. Always returns `Some`.
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
|
||||||
|
Some(FilterMask::SumGeq {
|
||||||
|
indices: (0..n_genomes).collect(),
|
||||||
|
min_sum: self.total,
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sum of counts across all genomes <= `total`.
|
/// Sum of counts across all genomes <= `total`.
|
||||||
@@ -90,6 +150,13 @@ impl KmerFilter for MaxTotalCount {
|
|||||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||||
row.iter().sum::<u32>() <= self.total
|
row.iter().sum::<u32>() <= self.total
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Column-mask form of this filter: a `SumLeq` over all `n_genomes` columns
/// with `max_sum` set to `self.total`. Always returns `Some`.
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
|
||||||
|
Some(FilterMask::SumLeq {
|
||||||
|
indices: (0..n_genomes).collect(),
|
||||||
|
max_sum: self.total,
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Group-based quorum filter ─────────────────────────────────────────────────
|
// ── Group-based quorum filter ─────────────────────────────────────────────────
|
||||||
@@ -113,6 +180,37 @@ pub struct GroupQuorumFilter {
|
|||||||
pub max_outgroup_frac: f64,
|
pub max_outgroup_frac: f64,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl GroupQuorumFilter {
|
||||||
|
// Build PresenceGeq/PresenceLeq constraints for one group (ingroup or outgroup).
//
// `indices` are the columns of the group and `threshold` is passed to the masks
// unchanged. The lower bound is the stricter of `min_count` and
// `ceil(min_frac * n)`; the upper bound is the stricter of `max_count` and
// `floor(max_frac * n)`, where `n = indices.len()`. A bound that cannot reject
// anything (lower bound 0, upper bound >= n) is omitted. Resulting masks are
// appended to `parts`.
//
// NOTE(review): `ceil`/`floor` of a float product can be off by one from an
// equivalent `p / n` comparison (e.g. 0.07 * 100.0 == 7.000000000000001) —
// confirm this agrees with how `passes` evaluates the same bounds.
|
||||||
|
fn group_mask_parts(
|
||||||
|
indices: &[usize],
|
||||||
|
threshold: u32,
|
||||||
|
min_count: usize,
|
||||||
|
max_count: usize,
|
||||||
|
min_frac: f64,
|
||||||
|
max_frac: f64,
|
||||||
|
parts: &mut Vec<FilterMask>,
|
||||||
|
) {
|
||||||
|
let n = indices.len();
|
||||||
|
// Effective minimum number of present genomes in the group.
let geq = min_count.max((min_frac * n as f64).ceil() as usize);
|
||||||
|
// A minimum of 0 is always satisfied, so no mask is emitted for it.
if geq > 0 {
|
||||||
|
parts.push(FilterMask::PresenceGeq {
|
||||||
|
indices: indices.to_vec(),
|
||||||
|
threshold,
|
||||||
|
min_count: geq,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
// Effective maximum number of present genomes in the group.
let leq = max_count.min((max_frac * n as f64).floor() as usize);
|
||||||
|
// A maximum of n or more is always satisfied, so no mask is emitted for it.
if leq < n {
|
||||||
|
parts.push(FilterMask::PresenceLeq {
|
||||||
|
indices: indices.to_vec(),
|
||||||
|
threshold,
|
||||||
|
max_count: leq,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl KmerFilter for GroupQuorumFilter {
|
impl KmerFilter for GroupQuorumFilter {
|
||||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||||
if !self.ingroup_idx.is_empty() {
|
if !self.ingroup_idx.is_empty() {
|
||||||
@@ -139,4 +237,26 @@ impl KmerFilter for GroupQuorumFilter {
|
|||||||
}
|
}
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn column_mask_expr(&self, _n_genomes: usize) -> Option<FilterMask> {
|
||||||
|
let t = self.threshold.checked_add(1)?;
|
||||||
|
let mut parts: Vec<FilterMask> = Vec::new();
|
||||||
|
if !self.ingroup_idx.is_empty() {
|
||||||
|
Self::group_mask_parts(
|
||||||
|
&self.ingroup_idx, t,
|
||||||
|
self.min_count, self.max_count,
|
||||||
|
self.min_frac, self.max_frac,
|
||||||
|
&mut parts,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if !self.outgroup_idx.is_empty() {
|
||||||
|
Self::group_mask_parts(
|
||||||
|
&self.outgroup_idx, t,
|
||||||
|
self.min_outgroup_count, self.max_outgroup_count,
|
||||||
|
self.min_outgroup_frac, self.max_outgroup_frac,
|
||||||
|
&mut parts,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Some(FilterMask::And(parts))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,152 @@
|
|||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
|
use tracing::debug;
|
||||||
|
use obipipeline::{
|
||||||
|
Pipeline, PipelineError, PipelineSender, SharedFlatFn, Stage, WorkerPool,
|
||||||
|
make_sink, make_source, make_transform,
|
||||||
|
throttle,
|
||||||
|
};
|
||||||
|
|
||||||
|
use obidebruinj::GraphDeBruijn;
|
||||||
|
use obikseq::CanonicalKmer;
|
||||||
|
use obilayeredmap::{IndexMode, Layer};
|
||||||
|
use obiskio::{SKError, SKResult};
|
||||||
|
|
||||||
|
use crate::common::olm_to_sk;
|
||||||
|
|
||||||
|
// ── KmerGraphData ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
enum KmerGraphData {
|
||||||
|
File(obipipeline::Throttled<PathBuf>),
|
||||||
|
RawBatch(Vec<CanonicalKmer>),
|
||||||
|
FilteredBatch(Vec<CanonicalKmer>),
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── build_graph ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Phase 1: pipeline that reads files, filters kmers, pushes into a GraphDeBruijn.
|
||||||
|
///
|
||||||
|
/// `flat_fn(path, emit)`: opens path, iterates kmers, calls `emit(batch)` for each batch.
|
||||||
|
/// `filter(kmer) -> bool`: secondary filter applied in the Transform stage.
|
||||||
|
pub(crate) fn build_graph<I, F, G>(
|
||||||
|
file_source: I,
|
||||||
|
flat_fn: F,
|
||||||
|
filter: G,
|
||||||
|
n_workers: usize,
|
||||||
|
max_open: usize,
|
||||||
|
) -> SKResult<GraphDeBruijn>
|
||||||
|
where
|
||||||
|
I: Iterator<Item = PathBuf> + Send + 'static,
|
||||||
|
F: Fn(PathBuf, &mut dyn FnMut(Vec<CanonicalKmer>)) -> SKResult<()> + Send + Sync + 'static,
|
||||||
|
G: Fn(CanonicalKmer) -> bool + Send + Sync + 'static,
|
||||||
|
{
|
||||||
|
let capacity = 2;
|
||||||
|
|
||||||
|
let flat_fn = Arc::new(flat_fn);
|
||||||
|
let filter = Arc::new(filter);
|
||||||
|
let g_shared = Arc::new(Mutex::new(GraphDeBruijn::new()));
|
||||||
|
let g_sink = Arc::clone(&g_shared);
|
||||||
|
let err_cap: Arc<Mutex<Option<SKError>>> = Arc::new(Mutex::new(None));
|
||||||
|
let err_flat = Arc::clone(&err_cap);
|
||||||
|
|
||||||
|
let throttled = throttle(file_source, max_open);
|
||||||
|
|
||||||
|
let pipeline = Pipeline::new(
|
||||||
|
make_source!(KmerGraphData, throttled, File),
|
||||||
|
vec![
|
||||||
|
Stage::Flat(Arc::new(
|
||||||
|
move |data: KmerGraphData,
|
||||||
|
push: &PipelineSender<Result<KmerGraphData, PipelineError>>,
|
||||||
|
delta: &PipelineSender<isize>|
|
||||||
|
{
|
||||||
|
if let KmerGraphData::File(t) = data {
|
||||||
|
let path = t.item;
|
||||||
|
let _guard = t.guard; // released at end of block
|
||||||
|
let mut count: isize = 0;
|
||||||
|
let push_clone = push.clone();
|
||||||
|
let result = flat_fn(path, &mut |batch: Vec<CanonicalKmer>| {
|
||||||
|
push_clone.send(Ok(KmerGraphData::RawBatch(batch))).ok();
|
||||||
|
count += 1;
|
||||||
|
});
|
||||||
|
match result {
|
||||||
|
Ok(()) => {
|
||||||
|
delta.send(count - 1).ok();
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
*err_flat.lock().unwrap() = Some(e);
|
||||||
|
delta.send(-1).ok();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
) as SharedFlatFn<KmerGraphData>),
|
||||||
|
make_transform!(KmerGraphData, {
|
||||||
|
let filter = Arc::clone(&filter);
|
||||||
|
move |batch: Vec<CanonicalKmer>| -> Vec<CanonicalKmer> {
|
||||||
|
batch.into_iter().filter(|k| filter(*k)).collect()
|
||||||
|
}
|
||||||
|
}, RawBatch, FilteredBatch),
|
||||||
|
],
|
||||||
|
make_sink!(KmerGraphData, {
|
||||||
|
move |batch: Vec<CanonicalKmer>| {
|
||||||
|
let mut g = g_sink.lock().unwrap();
|
||||||
|
for kmer in batch {
|
||||||
|
g.push(kmer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}, FilteredBatch),
|
||||||
|
);
|
||||||
|
|
||||||
|
WorkerPool::new(pipeline, n_workers, capacity).run();
|
||||||
|
|
||||||
|
if let Some(e) = Arc::try_unwrap(err_cap)
|
||||||
|
.unwrap_or_else(|_| panic!("build_graph: err_cap not uniquely owned after pipeline"))
|
||||||
|
.into_inner()
|
||||||
|
.unwrap_or_else(|e| e.into_inner())
|
||||||
|
{
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
let g = Arc::try_unwrap(g_shared)
|
||||||
|
.unwrap_or_else(|_| panic!("build_graph: g_shared not uniquely owned after pipeline"))
|
||||||
|
.into_inner()
|
||||||
|
.unwrap_or_else(|e| e.into_inner());
|
||||||
|
|
||||||
|
Ok(g)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── write_graph_as_unitigs ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Phase 2 (write unitigs only): compute degrees, write unitigs to `layer_dir`, drop graph.
|
||||||
|
///
|
||||||
|
/// Returns n_kmers. Does NOT build the MPHF — caller does it.
|
||||||
|
pub(crate) fn write_graph_as_unitigs(g: GraphDeBruijn, layer_dir: &Path) -> SKResult<usize> {
|
||||||
|
let n_kmers = g.len();
|
||||||
|
g.compute_degrees_and_mark_starts();
|
||||||
|
std::fs::create_dir_all(layer_dir)?;
|
||||||
|
let mut uw = Layer::<()>::unitig_writer(layer_dir).map_err(|e| olm_to_sk(e, "graph pipeline"))?;
|
||||||
|
g.try_for_each_unitig(|unitig| uw.write(unitig))?;
|
||||||
|
uw.close()?;
|
||||||
|
drop(g);
|
||||||
|
Ok(n_kmers)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── materialize_layer ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Phase 2 (full): write_graph_as_unitigs + `Layer::<()>::build`.
|
||||||
|
///
|
||||||
|
/// Returns n_kmers.
|
||||||
|
pub(crate) fn materialize_layer(
|
||||||
|
g: GraphDeBruijn,
|
||||||
|
layer_dir: &Path,
|
||||||
|
block_bits: u8,
|
||||||
|
evidence: &IndexMode,
|
||||||
|
) -> SKResult<usize> {
|
||||||
|
let n = write_graph_as_unitigs(g, layer_dir)?;
|
||||||
|
debug!("materialize_layer: unitigs written ({n} kmers), building MPHF");
|
||||||
|
Layer::<()>::build(layer_dir, block_bits, evidence)
|
||||||
|
.map_err(|e| olm_to_sk(e, "graph pipeline"))?;
|
||||||
|
debug!("materialize_layer: MPHF build done");
|
||||||
|
Ok(n)
|
||||||
|
}
|
||||||
@@ -6,24 +6,16 @@ use epserde::prelude::*;
|
|||||||
use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
|
use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
|
||||||
use obidebruinj::GraphDeBruijn;
|
use obidebruinj::GraphDeBruijn;
|
||||||
use obilayeredmap::meta::PartitionMeta;
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
use obilayeredmap::{IndexMode, OLMError, layer::Layer};
|
use obilayeredmap::{IndexMode, layer::Layer};
|
||||||
use obiskio::{SKError, SKFileMeta, SKFileReader};
|
use obiskio::{SKError, SKFileMeta, SKFileReader};
|
||||||
use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
|
use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
|
||||||
|
|
||||||
|
use crate::common::olm_to_sk;
|
||||||
|
use crate::graph_pipeline::{materialize_layer, write_graph_as_unitigs};
|
||||||
use crate::partition::KmerPartition;
|
use crate::partition::KmerPartition;
|
||||||
|
|
||||||
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||||
|
|
||||||
fn olm_to_sk(e: OLMError) -> SKError {
|
|
||||||
match e {
|
|
||||||
OLMError::Io(io_err) => SKError::Io(io_err),
|
|
||||||
other => SKError::InvalidData {
|
|
||||||
context: "layer build",
|
|
||||||
detail: other.to_string(),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn remove_if_exists(path: &std::path::Path) {
|
fn remove_if_exists(path: &std::path::Path) {
|
||||||
if let Err(e) = fs::remove_file(path) {
|
if let Err(e) = fs::remove_file(path) {
|
||||||
if e.kind() != io::ErrorKind::NotFound {
|
if e.kind() != io::ErrorKind::NotFound {
|
||||||
@@ -101,18 +93,8 @@ impl KmerPartition {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let n_kmers = g.len();
|
let n_kmers = if with_counts {
|
||||||
g.compute_degrees_and_mark_starts();
|
let n = write_graph_as_unitigs(g, &layer_dir)?;
|
||||||
|
|
||||||
fs::create_dir_all(&layer_dir)?;
|
|
||||||
|
|
||||||
let mut uw = Layer::<()>::unitig_writer(&layer_dir).map_err(olm_to_sk)?;
|
|
||||||
g.try_for_each_unitig(|unitig| {
|
|
||||||
uw.write(unitig)
|
|
||||||
})?;
|
|
||||||
uw.close()?;
|
|
||||||
|
|
||||||
if with_counts {
|
|
||||||
Layer::<PersistentCompactIntMatrix>::build(
|
Layer::<PersistentCompactIntMatrix>::build(
|
||||||
&layer_dir,
|
&layer_dir,
|
||||||
block_bits,
|
block_bits,
|
||||||
@@ -122,10 +104,11 @@ impl KmerPartition {
|
|||||||
_ => 1,
|
_ => 1,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
.map_err(olm_to_sk)?;
|
.map_err(|e| olm_to_sk(e, "layer build"))?;
|
||||||
|
n
|
||||||
} else {
|
} else {
|
||||||
Layer::<()>::build(&layer_dir, block_bits, mode).map_err(olm_to_sk)?;
|
materialize_layer(g, &layer_dir, block_bits, mode)?
|
||||||
}
|
};
|
||||||
|
|
||||||
let index_dir = layer_dir.parent().expect("layer_dir has a parent");
|
let index_dir = layer_dir.parent().expect("layer_dir has a parent");
|
||||||
PartitionMeta {
|
PartitionMeta {
|
||||||
@@ -133,7 +116,7 @@ impl KmerPartition {
|
|||||||
mode: mode.clone(),
|
mode: mode.clone(),
|
||||||
}
|
}
|
||||||
.save(index_dir)
|
.save(index_dir)
|
||||||
.map_err(olm_to_sk)?;
|
.map_err(|e| olm_to_sk(e, "layer build"))?;
|
||||||
|
|
||||||
Ok(n_kmers)
|
Ok(n_kmers)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
pub mod filter;
|
pub mod filter;
|
||||||
|
mod common;
|
||||||
mod distance;
|
mod distance;
|
||||||
mod dump_layer;
|
mod dump_layer;
|
||||||
|
mod graph_pipeline;
|
||||||
mod index_layer;
|
mod index_layer;
|
||||||
mod kmer_sort;
|
mod kmer_sort;
|
||||||
mod merge_layer;
|
mod merge_layer;
|
||||||
|
|||||||
@@ -1,24 +1,26 @@
|
|||||||
use std::fs;
|
use std::fs;
|
||||||
use std::io;
|
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::sync::{Arc, Mutex};
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
use obipipeline::{
|
use obipipeline::{
|
||||||
Pipeline, PipelineError, PipelineSender, SharedFlatFn, Stage, WorkerPool,
|
Pipeline, PipelineError, PipelineSender, SharedFlatFn, Stage, WorkerPool,
|
||||||
|
ThrottleGuard, throttle,
|
||||||
make_sink, make_source, make_transform,
|
make_sink, make_source, make_transform,
|
||||||
};
|
};
|
||||||
|
|
||||||
use obicompactvec::{
|
use obicompactvec::{
|
||||||
|
MatrixGroupOps,
|
||||||
PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
|
PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
|
||||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
|
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
|
||||||
};
|
};
|
||||||
use obidebruinj::GraphDeBruijn;
|
|
||||||
use obikseq::CanonicalKmer;
|
use obikseq::CanonicalKmer;
|
||||||
use obilayeredmap::meta::PartitionMeta;
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfOnly, OLMError};
|
use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfOnly};
|
||||||
use obiskio::{SKError, SKResult, UnitigFileReader};
|
use obiskio::{SKError, SKResult, UnitigFileReader};
|
||||||
|
|
||||||
|
use crate::common::{ColBuilder, col_path_bit, col_path_int, load_meta, olm_to_sk, write_matrix_meta};
|
||||||
|
use crate::graph_pipeline::{build_graph, materialize_layer};
|
||||||
use crate::partition::KmerPartition;
|
use crate::partition::KmerPartition;
|
||||||
|
|
||||||
// ── MergeMode ─────────────────────────────────────────────────────────────────
|
// ── MergeMode ─────────────────────────────────────────────────────────────────
|
||||||
@@ -29,29 +31,6 @@ pub enum MergeMode {
|
|||||||
Count,
|
Count,
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── ColBuilder — enum dispatch to avoid trait-object boxing issues ─────────────
|
|
||||||
|
|
||||||
enum ColBuilder {
|
|
||||||
Bit(PersistentBitVecBuilder),
|
|
||||||
Int(PersistentCompactIntVecBuilder),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ColBuilder {
|
|
||||||
fn set_val(&mut self, slot: usize, value: u32) {
|
|
||||||
match self {
|
|
||||||
ColBuilder::Bit(b) => b.set(slot, value > 0),
|
|
||||||
ColBuilder::Int(b) => b.set(slot, value),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn close(self) -> SKResult<()> {
|
|
||||||
match self {
|
|
||||||
ColBuilder::Bit(b) => b.close().map_err(SKError::Io),
|
|
||||||
ColBuilder::Int(b) => b.close().map_err(SKError::Io),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── SrcLayerData — opened source matrix for pass-2 lookup ─────────────────────
|
// ── SrcLayerData — opened source matrix for pass-2 lookup ─────────────────────
|
||||||
|
|
||||||
pub(crate) enum SrcLayerData {
|
pub(crate) enum SrcLayerData {
|
||||||
@@ -65,18 +44,18 @@ impl SrcLayerData {
|
|||||||
match merge_mode {
|
match merge_mode {
|
||||||
MergeMode::Presence => {
|
MergeMode::Presence => {
|
||||||
if counts_dir.exists() && !layer_dir.join("presence").exists() {
|
if counts_dir.exists() && !layer_dir.join("presence").exists() {
|
||||||
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
let mphf = MphfOnly::open(layer_dir).map_err(|e| olm_to_sk(e, "merge"))?;
|
||||||
let mat = PersistentCompactIntMatrix::open(layer_dir).map_err(SKError::Io)?;
|
let mat = PersistentCompactIntMatrix::open(layer_dir).map_err(SKError::Io)?;
|
||||||
Ok(SrcLayerData::Count(mphf, mat))
|
Ok(SrcLayerData::Count(mphf, mat))
|
||||||
} else {
|
} else {
|
||||||
// presence dir exists, or neither exists → Implicit handled by open()
|
// presence dir exists, or neither exists → Implicit handled by open()
|
||||||
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
let mphf = MphfOnly::open(layer_dir).map_err(|e| olm_to_sk(e, "merge"))?;
|
||||||
let mat = PersistentBitMatrix::open(layer_dir).map_err(SKError::Io)?;
|
let mat = PersistentBitMatrix::open(layer_dir).map_err(SKError::Io)?;
|
||||||
Ok(SrcLayerData::Presence(mphf, mat))
|
Ok(SrcLayerData::Presence(mphf, mat))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
MergeMode::Count => {
|
MergeMode::Count => {
|
||||||
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
let mphf = MphfOnly::open(layer_dir).map_err(|e| olm_to_sk(e, "merge"))?;
|
||||||
if counts_dir.exists() {
|
if counts_dir.exists() {
|
||||||
let mat = PersistentCompactIntMatrix::open(layer_dir).map_err(SKError::Io)?;
|
let mat = PersistentCompactIntMatrix::open(layer_dir).map_err(SKError::Io)?;
|
||||||
Ok(SrcLayerData::Count(mphf, mat))
|
Ok(SrcLayerData::Count(mphf, mat))
|
||||||
@@ -100,59 +79,47 @@ impl SrcLayerData {
|
|||||||
}
|
}
|
||||||
buf
|
buf
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn n_slots(&self) -> usize {
|
||||||
|
match self {
|
||||||
|
SrcLayerData::Presence(_, mat) => mat.n(),
|
||||||
|
SrcLayerData::Count(_, mat) => mat.n(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// MPHF lookup: returns the slot index for `kmer` (kmer must be in the domain).
|
||||||
|
#[inline]
|
||||||
|
pub(crate) fn slot(&self, kmer: CanonicalKmer) -> usize {
|
||||||
|
match self {
|
||||||
|
SrcLayerData::Presence(mphf, _) => mphf.index(kmer),
|
||||||
|
SrcLayerData::Count(mphf, _) => mphf.index(kmer),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Row lookup by slot index, bypassing the MPHF.
|
||||||
|
#[inline]
|
||||||
|
pub(crate) fn fill_row_by_slot(&self, slot: usize, n_genomes: usize) -> Vec<u32> {
|
||||||
|
let mut buf = vec![0u32; n_genomes];
|
||||||
|
match self {
|
||||||
|
SrcLayerData::Presence(_, mat) => mat.fill_row(slot, &mut buf),
|
||||||
|
SrcLayerData::Count(_, mat) => mat.fill_row(slot, &mut buf),
|
||||||
|
}
|
||||||
|
buf
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Call `f` with a reference to the underlying matrix as `&dyn MatrixGroupOps`.
|
||||||
|
pub(crate) fn with_matrix<R>(&self, f: impl FnOnce(&dyn MatrixGroupOps) -> R) -> R {
|
||||||
|
match self {
|
||||||
|
SrcLayerData::Presence(_, mat) => f(mat),
|
||||||
|
SrcLayerData::Count(_, mat) => f(mat),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
const INDEX_SUBDIR: &str = "index";
|
const INDEX_SUBDIR: &str = "index";
|
||||||
|
|
||||||
/// Load PartitionMeta, or recover it by probing layer directories.
|
|
||||||
/// Indexes built before meta.json was introduced lack the file.
|
|
||||||
fn load_meta(dir: &Path) -> SKResult<PartitionMeta> {
|
|
||||||
match PartitionMeta::load(dir) {
|
|
||||||
Ok(m) => Ok(m),
|
|
||||||
Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) =>
|
|
||||||
{
|
|
||||||
let mut n = 0usize;
|
|
||||||
while dir.join(format!("layer_{n}")).exists() {
|
|
||||||
n += 1;
|
|
||||||
}
|
|
||||||
let m = PartitionMeta {
|
|
||||||
n_layers: n,
|
|
||||||
mode: IndexMode::default(),
|
|
||||||
};
|
|
||||||
m.save(dir).map_err(olm_to_sk)?;
|
|
||||||
Ok(m)
|
|
||||||
}
|
|
||||||
Err(e) => Err(olm_to_sk(e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn olm_to_sk(e: OLMError) -> SKError {
|
|
||||||
match e {
|
|
||||||
OLMError::Io(e) => SKError::Io(e),
|
|
||||||
other => SKError::InvalidData {
|
|
||||||
context: "merge",
|
|
||||||
detail: other.to_string(),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
|
|
||||||
dir.join(format!("col_{col:06}.pbiv"))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn col_path_int(dir: &Path, col: usize) -> PathBuf {
|
|
||||||
dir.join(format!("col_{col:06}.pciv"))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
|
|
||||||
fs::write(
|
|
||||||
dir.join("meta.json"),
|
|
||||||
format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── KmerPartition::merge_partition ────────────────────────────────────────────
|
// ── KmerPartition::merge_partition ────────────────────────────────────────────
|
||||||
|
|
||||||
impl KmerPartition {
|
impl KmerPartition {
|
||||||
@@ -179,8 +146,8 @@ impl KmerPartition {
|
|||||||
return Ok(0);
|
return Ok(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
load_meta(&dst_index_dir)?; // ensure meta.json exists before LayeredMap::open
|
load_meta(&dst_index_dir, "merge")?; // ensure meta.json exists before LayeredMap::open
|
||||||
let dst_map = Arc::new(LayeredMap::<()>::open(&dst_index_dir).map_err(olm_to_sk)?);
|
let dst_map = Arc::new(LayeredMap::<()>::open(&dst_index_dir).map_err(|e| olm_to_sk(e, "merge"))?);
|
||||||
let n_dst_layers = dst_map.n_layers();
|
let n_dst_layers = dst_map.n_layers();
|
||||||
let n_src_total: usize = sources.iter().map(|(_, n)| *n).sum();
|
let n_src_total: usize = sources.iter().map(|(_, n)| *n).sum();
|
||||||
|
|
||||||
@@ -190,7 +157,7 @@ impl KmerPartition {
|
|||||||
for l in 0..n_dst_layers {
|
for l in 0..n_dst_layers {
|
||||||
let layer_dir = dst_index_dir.join(format!("layer_{l}"));
|
let layer_dir = dst_index_dir.join(format!("layer_{l}"));
|
||||||
Layer::<()>::init_presence_matrix(&layer_dir, dst_map.layer(l).n())
|
Layer::<()>::init_presence_matrix(&layer_dir, dst_map.layer(l).n())
|
||||||
.map_err(olm_to_sk)?;
|
.map_err(|e| olm_to_sk(e, "merge"))?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -208,7 +175,7 @@ impl KmerPartition {
|
|||||||
if !src_index_dir.exists() {
|
if !src_index_dir.exists() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let src_meta = load_meta(&src_index_dir)?;
|
let src_meta = load_meta(&src_index_dir, "merge")?;
|
||||||
for l in 0..src_meta.n_layers {
|
for l in 0..src_meta.n_layers {
|
||||||
let p = src_index_dir.join(format!("layer_{l}")).join("unitigs.bin");
|
let p = src_index_dir.join(format!("layer_{l}")).join("unitigs.bin");
|
||||||
if p.exists() {
|
if p.exists() {
|
||||||
@@ -217,85 +184,39 @@ impl KmerPartition {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
enum Pass1Data {
|
let n_src_layers = unitig_paths.len();
|
||||||
File(PathBuf),
|
debug!("partition {i}: de Bruijn graph build start — {n_src_layers} source layer(s)");
|
||||||
Batch(Vec<CanonicalKmer>),
|
|
||||||
NewKmers(Vec<CanonicalKmer>),
|
|
||||||
}
|
|
||||||
|
|
||||||
const BATCH: usize = 4096;
|
const BATCH: usize = 4096;
|
||||||
let n_workers = std::thread::available_parallelism().map_or(4, |n| n.get());
|
let n_workers = rayon::current_num_threads().min(16).max(4);
|
||||||
let capacity = n_workers * 8;
|
// At most 2 files open simultaneously: keeps n_workers-2 workers free
|
||||||
|
// for the Transform stage. Each open file monopolises one worker for the
|
||||||
|
// full duration of its read, so this must stay well below n_workers.
|
||||||
|
let max_open = 2;
|
||||||
|
|
||||||
let dst_filter = Arc::clone(&dst_map);
|
let dst_filter = Arc::clone(&dst_map);
|
||||||
let g_shared = Arc::new(Mutex::new(GraphDeBruijn::new()));
|
|
||||||
let g_sink = Arc::clone(&g_shared);
|
|
||||||
let pass1_err: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
|
|
||||||
let err_cap = Arc::clone(&pass1_err);
|
|
||||||
|
|
||||||
let pipeline = Pipeline::new(
|
let g = build_graph(
|
||||||
make_source!(Pass1Data, unitig_paths, File),
|
unitig_paths.into_iter(),
|
||||||
vec![
|
move |path: PathBuf, emit: &mut dyn FnMut(Vec<CanonicalKmer>)| -> SKResult<()> {
|
||||||
Stage::Flat(Arc::new(
|
let reader = UnitigFileReader::open_sequential(&path)?;
|
||||||
move |data: Pass1Data,
|
let mut batch: Vec<CanonicalKmer> = Vec::with_capacity(BATCH);
|
||||||
push: &PipelineSender<Result<Pass1Data, PipelineError>>,
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
delta: &PipelineSender<isize>|
|
batch.push(kmer);
|
||||||
{
|
if batch.len() == BATCH {
|
||||||
if let Pass1Data::File(path) = data {
|
emit(std::mem::replace(&mut batch, Vec::with_capacity(BATCH)));
|
||||||
let reader = match UnitigFileReader::open_sequential(&path) {
|
|
||||||
Ok(r) => r,
|
|
||||||
Err(e) => {
|
|
||||||
*err_cap.lock().unwrap() = Some(e.to_string());
|
|
||||||
delta.send(-1).ok();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let mut batch: Vec<CanonicalKmer> = Vec::with_capacity(BATCH);
|
|
||||||
let mut count: isize = 0;
|
|
||||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
|
||||||
batch.push(kmer);
|
|
||||||
if batch.len() == BATCH {
|
|
||||||
let b = std::mem::replace(&mut batch, Vec::with_capacity(BATCH));
|
|
||||||
push.send(Ok(Pass1Data::Batch(b))).ok();
|
|
||||||
count += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !batch.is_empty() {
|
|
||||||
push.send(Ok(Pass1Data::Batch(batch))).ok();
|
|
||||||
count += 1;
|
|
||||||
}
|
|
||||||
delta.send(count - 1).ok();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
) as SharedFlatFn<Pass1Data>),
|
|
||||||
make_transform!(Pass1Data, {
|
|
||||||
move |batch: Vec<CanonicalKmer>| -> Vec<CanonicalKmer> {
|
|
||||||
batch.into_iter()
|
|
||||||
.filter(|&k| dst_filter.query(k).is_none())
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
}, Batch, NewKmers),
|
|
||||||
],
|
|
||||||
make_sink!(Pass1Data, {
|
|
||||||
move |batch: Vec<CanonicalKmer>| {
|
|
||||||
let mut g = g_sink.lock().unwrap();
|
|
||||||
for kmer in batch {
|
|
||||||
g.push(kmer);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}, NewKmers),
|
if !batch.is_empty() {
|
||||||
);
|
emit(batch);
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
move |kmer| dst_filter.query(kmer).is_none(),
|
||||||
|
n_workers,
|
||||||
|
max_open,
|
||||||
|
)?;
|
||||||
|
|
||||||
WorkerPool::new(pipeline, n_workers, capacity).run();
|
|
||||||
|
|
||||||
if let Some(msg) = Arc::try_unwrap(pass1_err).unwrap().into_inner().unwrap() {
|
|
||||||
return Err(SKError::InvalidData { context: "merge pass1", detail: msg });
|
|
||||||
}
|
|
||||||
|
|
||||||
let g = Arc::try_unwrap(g_shared)
|
|
||||||
.unwrap_or_else(|_| panic!("pass1: g_shared not uniquely owned after pipeline"))
|
|
||||||
.into_inner()
|
|
||||||
.unwrap_or_else(|e| e.into_inner());
|
|
||||||
let any_new = g.len() > 0;
|
let any_new = g.len() > 0;
|
||||||
debug!("partition {i}: de Bruijn graph done — {} new kmers", g.len());
|
debug!("partition {i}: de Bruijn graph done — {} new kmers", g.len());
|
||||||
|
|
||||||
@@ -304,25 +225,10 @@ impl KmerPartition {
|
|||||||
let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}"));
|
let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}"));
|
||||||
|
|
||||||
let n_new = if any_new {
|
let n_new = if any_new {
|
||||||
let t_deg = std::time::Instant::now();
|
|
||||||
g.compute_degrees_and_mark_starts();
|
|
||||||
debug!("partition {i}: compute_degrees in {:.3}s — {} nodes",
|
|
||||||
t_deg.elapsed().as_secs_f64(), g.len());
|
|
||||||
fs::create_dir_all(&new_layer_dir)?;
|
|
||||||
let mut uw = Layer::<()>::unitig_writer(&new_layer_dir).map_err(olm_to_sk)?;
|
|
||||||
debug!("partition {i}: unitig traversal start — {} nodes", g.len());
|
debug!("partition {i}: unitig traversal start — {} nodes", g.len());
|
||||||
g.try_for_each_unitig(|unitig| {
|
let n_nodes = materialize_layer(g, &new_layer_dir, block_bits, evidence)?;
|
||||||
uw.write(unitig)
|
|
||||||
})?;
|
|
||||||
debug!("partition {i}: unitig writer closing");
|
|
||||||
uw.close()?;
|
|
||||||
debug!("partition {i}: unitig writer closed — dropping graph ({} nodes)", g.len());
|
|
||||||
let n = g.len();
|
|
||||||
drop(g);
|
|
||||||
debug!("partition {i}: graph dropped — starting MPHF build ({n} unitigs)");
|
|
||||||
Layer::<()>::build(&new_layer_dir, block_bits, evidence).map_err(olm_to_sk)?;
|
|
||||||
debug!("partition {i}: MPHF build done");
|
debug!("partition {i}: MPHF build done");
|
||||||
n
|
n_nodes
|
||||||
} else {
|
} else {
|
||||||
drop(g);
|
drop(g);
|
||||||
0
|
0
|
||||||
@@ -330,7 +236,7 @@ impl KmerPartition {
|
|||||||
|
|
||||||
let t_open = std::time::Instant::now();
|
let t_open = std::time::Instant::now();
|
||||||
let new_mphf: Option<Arc<MphfOnly>> = if any_new {
|
let new_mphf: Option<Arc<MphfOnly>> = if any_new {
|
||||||
Some(Arc::new(MphfOnly::open(&new_layer_dir).map_err(olm_to_sk)?))
|
Some(Arc::new(MphfOnly::open(&new_layer_dir).map_err(|e| olm_to_sk(e, "merge"))?))
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
@@ -436,7 +342,7 @@ impl KmerPartition {
|
|||||||
col_offset += src_n;
|
col_offset += src_n;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let src_meta = load_meta(&src_index_dir)?;
|
let src_meta = load_meta(&src_index_dir, "merge")?;
|
||||||
for l in 0..src_meta.n_layers {
|
for l in 0..src_meta.n_layers {
|
||||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||||
if src_layer_dir.join("unitigs.bin").exists() {
|
if src_layer_dir.join("unitigs.bin").exists() {
|
||||||
@@ -448,7 +354,7 @@ impl KmerPartition {
|
|||||||
}
|
}
|
||||||
|
|
||||||
enum Pass2Data {
|
enum Pass2Data {
|
||||||
SrcLayer((usize, usize, PathBuf)),
|
SrcLayer((usize, usize, PathBuf, ThrottleGuard)),
|
||||||
RawBatch((usize, usize, Arc<SrcLayerData>, Vec<CanonicalKmer>)),
|
RawBatch((usize, usize, Arc<SrcLayerData>, Vec<CanonicalKmer>)),
|
||||||
WriteBatch(Vec<(Option<usize>, usize, usize, u32)>),
|
WriteBatch(Vec<(Option<usize>, usize, usize, u32)>),
|
||||||
}
|
}
|
||||||
@@ -470,15 +376,22 @@ impl KmerPartition {
|
|||||||
let pass2_err: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
|
let pass2_err: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
|
||||||
let err_cap2 = Arc::clone(&pass2_err);
|
let err_cap2 = Arc::clone(&pass2_err);
|
||||||
|
|
||||||
|
let capacity = 2;
|
||||||
|
let throttled_pass2 = throttle(pass2_items.into_iter(), max_open);
|
||||||
|
|
||||||
let pipeline2 = Pipeline::new(
|
let pipeline2 = Pipeline::new(
|
||||||
make_source!(Pass2Data, pass2_items, SrcLayer),
|
make_source!(Pass2Data, throttled_pass2.map(|t| {
|
||||||
|
let (col_offset, src_n, src_layer_dir) = t.item;
|
||||||
|
(col_offset, src_n, src_layer_dir, t.guard)
|
||||||
|
}), SrcLayer),
|
||||||
vec![
|
vec![
|
||||||
Stage::Flat(Arc::new(
|
Stage::Flat(Arc::new(
|
||||||
move |data: Pass2Data,
|
move |data: Pass2Data,
|
||||||
push: &PipelineSender<Result<Pass2Data, PipelineError>>,
|
push: &PipelineSender<Result<Pass2Data, PipelineError>>,
|
||||||
delta: &PipelineSender<isize>|
|
delta: &PipelineSender<isize>|
|
||||||
{
|
{
|
||||||
if let Pass2Data::SrcLayer((col_offset, src_n, src_layer_dir)) = data {
|
if let Pass2Data::SrcLayer((col_offset, src_n, src_layer_dir, _guard)) = data {
|
||||||
|
// _guard dropped at end of block, releasing the slot.
|
||||||
let reader = match UnitigFileReader::open_sequential(
|
let reader = match UnitigFileReader::open_sequential(
|
||||||
&src_layer_dir.join("unitigs.bin"),
|
&src_layer_dir.join("unitigs.bin"),
|
||||||
) {
|
) {
|
||||||
@@ -497,6 +410,7 @@ impl KmerPartition {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
const BATCH: usize = 4096;
|
||||||
let mut batch: Vec<CanonicalKmer> = Vec::with_capacity(BATCH);
|
let mut batch: Vec<CanonicalKmer> = Vec::with_capacity(BATCH);
|
||||||
let mut count: isize = 0;
|
let mut count: isize = 0;
|
||||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
@@ -598,9 +512,9 @@ impl KmerPartition {
|
|||||||
write_matrix_meta(&data_dir, n_new, n_dst_genomes + n_src_total)
|
write_matrix_meta(&data_dir, n_new, n_dst_genomes + n_src_total)
|
||||||
.map_err(SKError::Io)?;
|
.map_err(SKError::Io)?;
|
||||||
|
|
||||||
let mut part_meta = PartitionMeta::load(&dst_index_dir).map_err(olm_to_sk)?;
|
let mut part_meta = PartitionMeta::load(&dst_index_dir).map_err(|e| olm_to_sk(e, "merge"))?;
|
||||||
part_meta.n_layers = new_layer_idx + 1;
|
part_meta.n_layers = new_layer_idx + 1;
|
||||||
part_meta.save(&dst_index_dir).map_err(olm_to_sk)?;
|
part_meta.save(&dst_index_dir).map_err(|e| olm_to_sk(e, "merge"))?;
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("partition {i}: builders closed in {:.3}s", t_close.elapsed().as_secs_f64());
|
debug!("partition {i}: builders closed in {:.3}s", t_close.elapsed().as_secs_f64());
|
||||||
|
|||||||
@@ -1,97 +1,145 @@
|
|||||||
use std::fs;
|
use std::path::Path;
|
||||||
use std::io;
|
|
||||||
use std::path::{Path, PathBuf};
|
|
||||||
|
|
||||||
use obicompactvec::{
|
use obicompactvec::{
|
||||||
PersistentBitMatrixBuilder, PersistentBitVecBuilder, PersistentCompactIntMatrixBuilder,
|
FilterMask, eval_filter_mask,
|
||||||
PersistentCompactIntVecBuilder,
|
PersistentBitMatrixBuilder, PersistentBitVecBuilder,
|
||||||
|
PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
|
||||||
};
|
};
|
||||||
use obidebruinj::GraphDeBruijn;
|
use obidebruinj::GraphDeBruijn;
|
||||||
use obikseq::CanonicalKmer;
|
use obikseq::CanonicalKmer;
|
||||||
use obilayeredmap::meta::PartitionMeta;
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
use obilayeredmap::{IndexMode, Layer, MphfLayer, OLMError};
|
use obilayeredmap::{IndexMode, MphfLayer};
|
||||||
use obiskio::{SKError, SKResult, UnitigFileReader};
|
use obiskio::{SKError, SKResult, UnitigFileReader};
|
||||||
|
|
||||||
use crate::filter::{KmerFilter, passes_all};
|
use crate::common::{load_meta, olm_to_sk};
|
||||||
|
use crate::filter::KmerFilter;
|
||||||
|
use crate::graph_pipeline::materialize_layer;
|
||||||
use crate::merge_layer::{MergeMode, SrcLayerData};
|
use crate::merge_layer::{MergeMode, SrcLayerData};
|
||||||
use crate::partition::KmerPartition;
|
use crate::partition::KmerPartition;
|
||||||
|
|
||||||
const INDEX_SUBDIR: &str = "index";
|
const INDEX_SUBDIR: &str = "index";
|
||||||
|
|
||||||
fn olm_to_sk(e: OLMError) -> SKError {
|
// ── Builders — pair matrix builder + column builders for one mode ─────────────
|
||||||
match e {
|
|
||||||
OLMError::Io(e) => SKError::Io(e),
|
enum Builders {
|
||||||
other => SKError::InvalidData {
|
Presence(PersistentBitMatrixBuilder, Vec<PersistentBitVecBuilder>),
|
||||||
context: "rebuild",
|
Count(PersistentCompactIntMatrixBuilder, Vec<PersistentCompactIntVecBuilder>),
|
||||||
detail: other.to_string(),
|
}
|
||||||
},
|
|
||||||
|
impl Builders {
|
||||||
|
fn new(mode: MergeMode, n: usize, dir: &Path, n_genomes: usize) -> SKResult<Self> {
|
||||||
|
match mode {
|
||||||
|
MergeMode::Presence => {
|
||||||
|
let mut mat = PersistentBitMatrixBuilder::new(n, dir).map_err(SKError::Io)?;
|
||||||
|
let mut cols = Vec::with_capacity(n_genomes);
|
||||||
|
for _ in 0..n_genomes { cols.push(mat.add_col().map_err(SKError::Io)?); }
|
||||||
|
Ok(Builders::Presence(mat, cols))
|
||||||
|
}
|
||||||
|
MergeMode::Count => {
|
||||||
|
let mut mat = PersistentCompactIntMatrixBuilder::new(n, dir).map_err(SKError::Io)?;
|
||||||
|
let mut cols = Vec::with_capacity(n_genomes);
|
||||||
|
for _ in 0..n_genomes { cols.push(mat.add_col().map_err(SKError::Io)?); }
|
||||||
|
Ok(Builders::Count(mat, cols))
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
|
fn set_val(&mut self, col: usize, slot: usize, value: u32) {
|
||||||
dir.join(format!("col_{col:06}.pbiv"))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn col_path_int(dir: &Path, col: usize) -> PathBuf {
|
|
||||||
dir.join(format!("col_{col:06}.pciv"))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
|
|
||||||
fs::write(
|
|
||||||
dir.join("meta.json"),
|
|
||||||
format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── ColBuilder ────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
enum ColBuilder {
|
|
||||||
Bit(PersistentBitVecBuilder),
|
|
||||||
Int(PersistentCompactIntVecBuilder),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ColBuilder {
|
|
||||||
fn set_val(&mut self, slot: usize, value: u32) {
|
|
||||||
match self {
|
match self {
|
||||||
ColBuilder::Bit(b) => b.set(slot, value > 0),
|
Builders::Presence(_, cols) => cols[col].set(slot, value > 0),
|
||||||
ColBuilder::Int(b) => b.set(slot, value),
|
Builders::Count(_, cols) => cols[col].set(slot, value),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn close(self) -> SKResult<()> {
|
fn close(self) -> SKResult<()> {
|
||||||
match self {
|
match self {
|
||||||
ColBuilder::Bit(b) => b.close().map_err(SKError::Io),
|
Builders::Presence(mat, cols) => {
|
||||||
ColBuilder::Int(b) => b.close().map_err(SKError::Io),
|
for b in cols { b.close().map_err(SKError::Io)?; }
|
||||||
}
|
mat.close().map_err(SKError::Io)
|
||||||
}
|
}
|
||||||
}
|
Builders::Count(mat, cols) => {
|
||||||
|
for b in cols { b.close().map_err(SKError::Io)?; }
|
||||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
mat.close().map_err(SKError::Io)
|
||||||
|
|
||||||
fn load_meta(dir: &Path) -> SKResult<PartitionMeta> {
|
|
||||||
match PartitionMeta::load(dir) {
|
|
||||||
Ok(m) => Ok(m),
|
|
||||||
Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) =>
|
|
||||||
{
|
|
||||||
let mut n = 0usize;
|
|
||||||
while dir.join(format!("layer_{n}")).exists() {
|
|
||||||
n += 1;
|
|
||||||
}
|
}
|
||||||
let m = PartitionMeta {
|
|
||||||
n_layers: n,
|
|
||||||
mode: IndexMode::default(),
|
|
||||||
};
|
|
||||||
m.save(dir).map_err(olm_to_sk)?;
|
|
||||||
Ok(m)
|
|
||||||
}
|
}
|
||||||
Err(e) => Err(olm_to_sk(e)),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Iterate all kmers in `src_index_dir` that pass `filters`, yielding `(kmer, row)`.
|
// ── try_compute_combined_mask ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Build a per-slot `TempBitVec` mask from `filters` using column operations
|
||||||
|
/// on the source matrix — no per-kmer MPHF lookup or row read needed.
|
||||||
///
|
///
|
||||||
/// Uses [`SrcLayerData`] semantics: counts take priority over presence when
|
/// Returns `Some(mask)` when every filter in `filters` can express itself as
|
||||||
/// `mode = Count`; presence (or implicit all-ones) is used for `Presence`.
|
/// a [`FilterMask`] expression. Returns `None` when any filter requires
|
||||||
|
/// row-level inspection (fall back to `passes_all`).
|
||||||
|
fn try_compute_combined_mask(
|
||||||
|
filters: &[Box<dyn KmerFilter>],
|
||||||
|
src_data: &SrcLayerData,
|
||||||
|
n_genomes: usize,
|
||||||
|
) -> SKResult<Option<obicompactvec::TempBitVec>> {
|
||||||
|
if filters.is_empty() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
let mut exprs: Vec<FilterMask> = Vec::with_capacity(filters.len());
|
||||||
|
for f in filters {
|
||||||
|
match f.column_mask_expr(n_genomes) {
|
||||||
|
Some(expr) => exprs.push(expr),
|
||||||
|
None => return Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let combined = FilterMask::And(exprs);
|
||||||
|
let n = src_data.n_slots();
|
||||||
|
let mask = src_data
|
||||||
|
.with_matrix(|mat| eval_filter_mask(&combined, mat, n))
|
||||||
|
.map_err(SKError::Io)?;
|
||||||
|
Ok(Some(mask))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── iter_src_kmers_masked (pass 1) ────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Iterate all passing kmers in `src_index_dir`, yielding only the kmer value.
|
||||||
|
///
|
||||||
|
/// When all filters can be expressed as column operations, a per-slot mask is
|
||||||
|
/// computed once per layer and used for O(1) slot-check per kmer instead of a
|
||||||
|
/// full row read. Falls back to row-level `passes_all` otherwise.
|
||||||
|
fn iter_src_kmers_masked(
|
||||||
|
src_index_dir: &Path,
|
||||||
|
mode: MergeMode,
|
||||||
|
n_genomes: usize,
|
||||||
|
filters: &[Box<dyn KmerFilter>],
|
||||||
|
mut cb: impl FnMut(CanonicalKmer),
|
||||||
|
) -> SKResult<()> {
|
||||||
|
let src_meta = load_meta(src_index_dir, "rebuild")?;
|
||||||
|
for l in 0..src_meta.n_layers {
|
||||||
|
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||||
|
let unitigs_path = src_layer_dir.join("unitigs.bin");
|
||||||
|
if !unitigs_path.exists() { continue; }
|
||||||
|
|
||||||
|
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||||
|
let mask = try_compute_combined_mask(filters, &src_data, n_genomes)?;
|
||||||
|
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||||
|
|
||||||
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
|
let slot = src_data.slot(kmer);
|
||||||
|
let passes = match &mask {
|
||||||
|
Some(m) => m.get(slot),
|
||||||
|
None => {
|
||||||
|
let row = src_data.fill_row_by_slot(slot, n_genomes);
|
||||||
|
filters.iter().all(|f| f.passes(&row, n_genomes))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if passes { cb(kmer); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── iter_src_layers (pass 2) ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Iterate all passing kmers in `src_index_dir`, yielding `(kmer, row)`.
|
||||||
|
///
|
||||||
|
/// When the slot mask is available, skips the row read for filtered-out slots.
|
||||||
fn iter_src_layers(
|
fn iter_src_layers(
|
||||||
src_index_dir: &Path,
|
src_index_dir: &Path,
|
||||||
mode: MergeMode,
|
mode: MergeMode,
|
||||||
@@ -99,21 +147,27 @@ fn iter_src_layers(
|
|||||||
filters: &[Box<dyn KmerFilter>],
|
filters: &[Box<dyn KmerFilter>],
|
||||||
mut cb: impl FnMut(CanonicalKmer, Box<[u32]>),
|
mut cb: impl FnMut(CanonicalKmer, Box<[u32]>),
|
||||||
) -> SKResult<()> {
|
) -> SKResult<()> {
|
||||||
let src_meta = load_meta(src_index_dir)?;
|
let src_meta = load_meta(src_index_dir, "rebuild")?;
|
||||||
for l in 0..src_meta.n_layers {
|
for l in 0..src_meta.n_layers {
|
||||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||||
let unitigs_path = src_layer_dir.join("unitigs.bin");
|
let unitigs_path = src_layer_dir.join("unitigs.bin");
|
||||||
if !unitigs_path.exists() {
|
if !unitigs_path.exists() { continue; }
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
|
||||||
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||||
|
let mask = try_compute_combined_mask(filters, &src_data, n_genomes)?;
|
||||||
|
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||||
|
|
||||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
let row = src_data.lookup(kmer, n_genomes);
|
let slot = src_data.slot(kmer);
|
||||||
if passes_all(filters, &row, n_genomes) {
|
if let Some(ref m) = mask {
|
||||||
|
if !m.get(slot) { continue; }
|
||||||
|
let row = src_data.fill_row_by_slot(slot, n_genomes);
|
||||||
cb(kmer, row.into_boxed_slice());
|
cb(kmer, row.into_boxed_slice());
|
||||||
|
} else {
|
||||||
|
let row = src_data.fill_row_by_slot(slot, n_genomes);
|
||||||
|
if filters.iter().all(|f| f.passes(&row, n_genomes)) {
|
||||||
|
cb(kmer, row.into_boxed_slice());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -144,14 +198,14 @@ impl KmerPartition {
|
|||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
let src_meta = load_meta(&src_index_dir)?;
|
let src_meta = load_meta(&src_index_dir, "rebuild")?;
|
||||||
if src_meta.n_layers == 0 {
|
if src_meta.n_layers == 0 {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Pass 1: collect filtered kmers into de Bruijn graph ───────────────
|
// ── Pass 1: collect filtered kmers into de Bruijn graph ───────────────
|
||||||
let mut g = GraphDeBruijn::new();
|
let mut g = GraphDeBruijn::new();
|
||||||
iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, _row| {
|
iter_src_kmers_masked(&src_index_dir, mode, n_genomes, filters, |kmer| {
|
||||||
g.push(kmer);
|
g.push(kmer);
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
@@ -159,82 +213,40 @@ impl KmerPartition {
|
|||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
let n_new = g.len();
|
|
||||||
g.compute_degrees_and_mark_starts();
|
|
||||||
|
|
||||||
// ── Build MPHF in dst layer_0 ─────────────────────────────────────────
|
// ── Build MPHF in dst layer_0 ─────────────────────────────────────────
|
||||||
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
|
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
|
||||||
let dst_layer_dir = dst_index_dir.join("layer_0");
|
let dst_layer_dir = dst_index_dir.join("layer_0");
|
||||||
fs::create_dir_all(&dst_layer_dir)?;
|
|
||||||
|
|
||||||
let mut uw = Layer::<()>::unitig_writer(&dst_layer_dir).map_err(olm_to_sk)?;
|
let n_new = materialize_layer(g, &dst_layer_dir, block_bits, &IndexMode::Exact)?;
|
||||||
g.try_for_each_unitig(|unitig| {
|
let dst_mphf = MphfLayer::open(&dst_layer_dir, &IndexMode::Exact)
|
||||||
uw.write(unitig)
|
.map_err(|e| olm_to_sk(e, "rebuild"))?;
|
||||||
})?;
|
|
||||||
uw.close()?;
|
|
||||||
drop(g);
|
|
||||||
|
|
||||||
Layer::<()>::build(&dst_layer_dir, block_bits, &IndexMode::Exact).map_err(olm_to_sk)?;
|
|
||||||
let dst_mphf = MphfLayer::open(&dst_layer_dir, &IndexMode::Exact).map_err(olm_to_sk)?;
|
|
||||||
|
|
||||||
// ── Prepare matrix builders (one column per genome) ───────────────────
|
// ── Prepare matrix builders (one column per genome) ───────────────────
|
||||||
let data_dir = match mode {
|
let data_dir = match mode {
|
||||||
MergeMode::Presence => dst_layer_dir.join("presence"),
|
MergeMode::Presence => dst_layer_dir.join("presence"),
|
||||||
MergeMode::Count => dst_layer_dir.join("counts"),
|
MergeMode::Count => dst_layer_dir.join("counts"),
|
||||||
};
|
|
||||||
fs::create_dir_all(&data_dir)?;
|
|
||||||
|
|
||||||
let mut builders: Vec<ColBuilder> = match mode {
|
|
||||||
MergeMode::Presence => {
|
|
||||||
PersistentBitMatrixBuilder::new(n_new, &data_dir)
|
|
||||||
.map_err(SKError::Io)?
|
|
||||||
.close()
|
|
||||||
.map_err(SKError::Io)?;
|
|
||||||
(0..n_genomes)
|
|
||||||
.map(|g| -> SKResult<ColBuilder> {
|
|
||||||
let b = PersistentBitVecBuilder::new(n_new, &col_path_bit(&data_dir, g))?;
|
|
||||||
Ok(ColBuilder::Bit(b))
|
|
||||||
})
|
|
||||||
.collect::<SKResult<_>>()?
|
|
||||||
}
|
|
||||||
MergeMode::Count => {
|
|
||||||
PersistentCompactIntMatrixBuilder::new(n_new, &data_dir)
|
|
||||||
.map_err(SKError::Io)?
|
|
||||||
.close()
|
|
||||||
.map_err(SKError::Io)?;
|
|
||||||
(0..n_genomes)
|
|
||||||
.map(|g| -> SKResult<ColBuilder> {
|
|
||||||
let b = PersistentCompactIntVecBuilder::new(
|
|
||||||
n_new,
|
|
||||||
&col_path_int(&data_dir, g),
|
|
||||||
)?;
|
|
||||||
Ok(ColBuilder::Int(b))
|
|
||||||
})
|
|
||||||
.collect::<SKResult<_>>()?
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
std::fs::create_dir_all(&data_dir)?;
|
||||||
|
let mut builders = Builders::new(mode, n_new, &data_dir, n_genomes)?;
|
||||||
|
|
||||||
// ── Pass 2: fill builders ─────────────────────────────────────────────
|
// ── Pass 2: fill builders ─────────────────────────────────────────────
|
||||||
iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, row| {
|
iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, row| {
|
||||||
if let Some(slot) = dst_mphf.find(kmer) {
|
if let Some(slot) = dst_mphf.find(kmer) {
|
||||||
for (col, &value) in row.iter().enumerate() {
|
for (col, &value) in row.iter().enumerate() {
|
||||||
builders[col].set_val(slot, value);
|
builders.set_val(col, slot, value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
// ── Close builders, write metadata ────────────────────────────────────
|
// ── Close builders and write metadata ─────────────────────────────────
|
||||||
for b in builders {
|
builders.close()?;
|
||||||
b.close()?;
|
|
||||||
}
|
|
||||||
write_matrix_meta(&data_dir, n_new, n_genomes).map_err(SKError::Io)?;
|
|
||||||
|
|
||||||
PartitionMeta {
|
PartitionMeta {
|
||||||
n_layers: 1,
|
n_layers: 1,
|
||||||
mode: IndexMode::Exact,
|
mode: IndexMode::Exact,
|
||||||
}
|
}
|
||||||
.save(&dst_index_dir)
|
.save(&dst_index_dir)
|
||||||
.map_err(olm_to_sk)?;
|
.map_err(|e| olm_to_sk(e, "rebuild"))?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,8 +3,9 @@ use std::io;
|
|||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use obicompactvec::{
|
use obicompactvec::{
|
||||||
PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
|
ColGroup, MatrixGroupOps,
|
||||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
|
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||||
|
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
||||||
};
|
};
|
||||||
use obilayeredmap::meta::PartitionMeta;
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
use obilayeredmap::OLMError;
|
use obilayeredmap::OLMError;
|
||||||
@@ -40,52 +41,6 @@ pub struct OutputCol {
|
|||||||
pub op: AggOp,
|
pub op: AggOp,
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Aggregation ───────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
#[inline]
|
|
||||||
fn aggregate(op: AggOp, indices: &[usize], src_row: &[u32], threshold: u32) -> u32 {
|
|
||||||
match op {
|
|
||||||
AggOp::Any => {
|
|
||||||
if indices.iter().any(|&i| src_row[i] > threshold) { 1 } else { 0 }
|
|
||||||
}
|
|
||||||
AggOp::All => {
|
|
||||||
if indices.is_empty() { return 0; }
|
|
||||||
if indices.iter().all(|&i| src_row[i] > threshold) { 1 } else { 0 }
|
|
||||||
}
|
|
||||||
AggOp::None => {
|
|
||||||
if indices.iter().all(|&i| src_row[i] <= threshold) { 1 } else { 0 }
|
|
||||||
}
|
|
||||||
AggOp::Sum => {
|
|
||||||
indices.iter().map(|&i| src_row[i]).fold(0u32, |a, b| a.saturating_add(b))
|
|
||||||
}
|
|
||||||
AggOp::Min => indices.iter().map(|&i| src_row[i]).min().unwrap_or(0),
|
|
||||||
AggOp::Max => indices.iter().map(|&i| src_row[i]).max().unwrap_or(0),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── ColBuilder ────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
enum ColBuilder {
|
|
||||||
Bit(PersistentBitVecBuilder),
|
|
||||||
Int(PersistentCompactIntVecBuilder),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ColBuilder {
|
|
||||||
fn set_val(&mut self, slot: usize, value: u32) {
|
|
||||||
match self {
|
|
||||||
ColBuilder::Bit(b) => b.set(slot, value > 0),
|
|
||||||
ColBuilder::Int(b) => b.set(slot, value),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn close(self) -> SKResult<()> {
|
|
||||||
match self {
|
|
||||||
ColBuilder::Bit(b) => b.close().map_err(SKError::Io),
|
|
||||||
ColBuilder::Int(b) => b.close().map_err(SKError::Io),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
fn olm_to_sk(e: OLMError) -> SKError {
|
fn olm_to_sk(e: OLMError) -> SKError {
|
||||||
@@ -95,21 +50,6 @@ fn olm_to_sk(e: OLMError) -> SKError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
|
|
||||||
dir.join(format!("col_{col:06}.pbiv"))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn col_path_int(dir: &Path, col: usize) -> PathBuf {
|
|
||||||
dir.join(format!("col_{col:06}.pciv"))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
|
|
||||||
fs::write(
|
|
||||||
dir.join("meta.json"),
|
|
||||||
format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Copy all plain files (not subdirectories) from `src_dir` to `dst_dir`.
|
/// Copy all plain files (not subdirectories) from `src_dir` to `dst_dir`.
|
||||||
fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> {
|
fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> {
|
||||||
for entry in fs::read_dir(src_dir)? {
|
for entry in fs::read_dir(src_dir)? {
|
||||||
@@ -125,30 +65,64 @@ fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> {
|
|||||||
// ── fill_builders ─────────────────────────────────────────────────────────────
|
// ── fill_builders ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
fn fill_builders(
|
fn fill_builders(
|
||||||
builders: &mut [ColBuilder],
|
|
||||||
specs: &[OutputCol],
|
specs: &[OutputCol],
|
||||||
n: usize,
|
|
||||||
n_src: usize,
|
|
||||||
src_layer_dir: &Path,
|
src_layer_dir: &Path,
|
||||||
src_is_count: bool,
|
src_is_count: bool,
|
||||||
threshold: u32,
|
threshold: u32,
|
||||||
|
output_presence: bool,
|
||||||
|
mut dst_bit: Option<&mut PersistentBitMatrixBuilder>,
|
||||||
|
mut dst_int: Option<&mut PersistentCompactIntMatrixBuilder>,
|
||||||
) -> SKResult<()> {
|
) -> SKResult<()> {
|
||||||
let mut src_buf = vec![0u32; n_src];
|
|
||||||
|
|
||||||
if src_is_count {
|
if src_is_count {
|
||||||
let mat = PersistentCompactIntMatrix::open(src_layer_dir).map_err(SKError::Io)?;
|
let mat = PersistentCompactIntMatrix::open(src_layer_dir).map_err(SKError::Io)?;
|
||||||
for slot in 0..n {
|
for spec in specs {
|
||||||
mat.fill_row(slot, &mut src_buf);
|
let g = ColGroup::new(&spec.label, spec.indices.clone());
|
||||||
for (col, spec) in specs.iter().enumerate() {
|
if output_presence {
|
||||||
builders[col].set_val(slot, aggregate(spec.op, &spec.indices, &src_buf, threshold));
|
let b = dst_bit.as_deref_mut().unwrap();
|
||||||
|
match spec.op {
|
||||||
|
AggOp::Any => b.add_col_from (&mat.partial_group_any (&g, threshold).map_err(SKError::Io)?),
|
||||||
|
AggOp::All => b.add_col_from (&mat.partial_group_all (&g, threshold).map_err(SKError::Io)?),
|
||||||
|
AggOp::None => b.add_col_from (&mat.partial_group_none(&g, threshold).map_err(SKError::Io)?),
|
||||||
|
AggOp::Sum => b.add_col_from_int(&mat.partial_group_sum (&g).map_err(SKError::Io)?),
|
||||||
|
AggOp::Min => b.add_col_from_int(&mat.partial_group_min (&g).map_err(SKError::Io)?),
|
||||||
|
AggOp::Max => b.add_col_from_int(&mat.partial_group_max (&g).map_err(SKError::Io)?),
|
||||||
|
}.map_err(SKError::Io)?;
|
||||||
|
} else {
|
||||||
|
let b = dst_int.as_deref_mut().unwrap();
|
||||||
|
match spec.op {
|
||||||
|
AggOp::Sum => b.add_col_from (&mat.partial_group_sum (&g).map_err(SKError::Io)?),
|
||||||
|
AggOp::Min => b.add_col_from (&mat.partial_group_min (&g).map_err(SKError::Io)?),
|
||||||
|
AggOp::Max => b.add_col_from (&mat.partial_group_max (&g).map_err(SKError::Io)?),
|
||||||
|
AggOp::Any => b.add_col_from_bit(&mat.partial_group_any (&g, threshold).map_err(SKError::Io)?),
|
||||||
|
AggOp::All => b.add_col_from_bit(&mat.partial_group_all (&g, threshold).map_err(SKError::Io)?),
|
||||||
|
AggOp::None => b.add_col_from_bit(&mat.partial_group_none(&g, threshold).map_err(SKError::Io)?),
|
||||||
|
}.map_err(SKError::Io)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
let mat = PersistentBitMatrix::open(src_layer_dir).map_err(SKError::Io)?;
|
let mat = PersistentBitMatrix::open(src_layer_dir).map_err(SKError::Io)?;
|
||||||
for slot in 0..n {
|
for spec in specs {
|
||||||
mat.fill_row(slot, &mut src_buf);
|
let g = ColGroup::new(&spec.label, spec.indices.clone());
|
||||||
for (col, spec) in specs.iter().enumerate() {
|
if output_presence {
|
||||||
builders[col].set_val(slot, aggregate(spec.op, &spec.indices, &src_buf, threshold));
|
let b = dst_bit.as_deref_mut().unwrap();
|
||||||
|
match spec.op {
|
||||||
|
AggOp::Any => b.add_col_from (&mat.partial_group_any (&g, 1).map_err(SKError::Io)?),
|
||||||
|
AggOp::All => b.add_col_from (&mat.partial_group_all (&g, 1).map_err(SKError::Io)?),
|
||||||
|
AggOp::None => b.add_col_from (&mat.partial_group_none(&g, 1).map_err(SKError::Io)?),
|
||||||
|
AggOp::Sum => b.add_col_from_int(&mat.partial_group_sum (&g).map_err(SKError::Io)?),
|
||||||
|
AggOp::Min => b.add_col_from_int(&mat.partial_group_min (&g).map_err(SKError::Io)?),
|
||||||
|
AggOp::Max => b.add_col_from_int(&mat.partial_group_max (&g).map_err(SKError::Io)?),
|
||||||
|
}.map_err(SKError::Io)?;
|
||||||
|
} else {
|
||||||
|
let b = dst_int.as_deref_mut().unwrap();
|
||||||
|
match spec.op {
|
||||||
|
AggOp::Sum => b.add_col_from (&mat.partial_group_sum (&g).map_err(SKError::Io)?),
|
||||||
|
AggOp::Min => b.add_col_from (&mat.partial_group_min (&g).map_err(SKError::Io)?),
|
||||||
|
AggOp::Max => b.add_col_from (&mat.partial_group_max (&g).map_err(SKError::Io)?),
|
||||||
|
AggOp::Any => b.add_col_from_bit(&mat.partial_group_any (&g, 1).map_err(SKError::Io)?),
|
||||||
|
AggOp::All => b.add_col_from_bit(&mat.partial_group_all (&g, 1).map_err(SKError::Io)?),
|
||||||
|
AggOp::None => b.add_col_from_bit(&mat.partial_group_none(&g, 1).map_err(SKError::Io)?),
|
||||||
|
}.map_err(SKError::Io)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -168,7 +142,7 @@ impl KmerPartition {
|
|||||||
src: &KmerPartition,
|
src: &KmerPartition,
|
||||||
i: usize,
|
i: usize,
|
||||||
specs: &[OutputCol],
|
specs: &[OutputCol],
|
||||||
n_src_genomes: usize,
|
_n_src_genomes: usize,
|
||||||
threshold: u32,
|
threshold: u32,
|
||||||
output_presence: bool,
|
output_presence: bool,
|
||||||
in_place: bool,
|
in_place: bool,
|
||||||
@@ -188,7 +162,6 @@ impl KmerPartition {
|
|||||||
fs::create_dir_all(&dst_index_dir)?;
|
fs::create_dir_all(&dst_index_dir)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let n_out = specs.len();
|
|
||||||
let data_subdir = if output_presence { "presence" } else { "counts" };
|
let data_subdir = if output_presence { "presence" } else { "counts" };
|
||||||
|
|
||||||
for l in 0..src_meta.n_layers {
|
for l in 0..src_meta.n_layers {
|
||||||
@@ -201,7 +174,7 @@ impl KmerPartition {
|
|||||||
let presence_dir = src_layer_dir.join("presence");
|
let presence_dir = src_layer_dir.join("presence");
|
||||||
let src_is_count = counts_dir.exists() && !presence_dir.exists();
|
let src_is_count = counts_dir.exists() && !presence_dir.exists();
|
||||||
|
|
||||||
// Determine number of slots from the source matrix.
|
// Determine number of slots and detect implicit layers.
|
||||||
let n = if counts_dir.exists() {
|
let n = if counts_dir.exists() {
|
||||||
PersistentCompactIntMatrix::open(&src_layer_dir).map_err(SKError::Io)?.n()
|
PersistentCompactIntMatrix::open(&src_layer_dir).map_err(SKError::Io)?.n()
|
||||||
} else if presence_dir.exists() {
|
} else if presence_dir.exists() {
|
||||||
@@ -216,7 +189,7 @@ impl KmerPartition {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Choose the output data directory (temp name for in-place).
|
// Choose the output data directory (temp name for in-place).
|
||||||
let (dst_data_dir, final_data_dir) = if in_place {
|
let (dst_data_dir, final_data_dir): (PathBuf, PathBuf) = if in_place {
|
||||||
let tmp = dst_layer_dir.join(format!("{data_subdir}_new"));
|
let tmp = dst_layer_dir.join(format!("{data_subdir}_new"));
|
||||||
let perm = dst_layer_dir.join(data_subdir);
|
let perm = dst_layer_dir.join(data_subdir);
|
||||||
(tmp, perm)
|
(tmp, perm)
|
||||||
@@ -231,37 +204,22 @@ impl KmerPartition {
|
|||||||
}
|
}
|
||||||
fs::create_dir_all(&dst_data_dir)?;
|
fs::create_dir_all(&dst_data_dir)?;
|
||||||
|
|
||||||
// Initialise packed-format skeleton.
|
let (mut dst_bit, mut dst_int) = if output_presence {
|
||||||
if output_presence {
|
(Some(PersistentBitMatrixBuilder::new(n, &dst_data_dir).map_err(SKError::Io)?), None)
|
||||||
PersistentBitMatrixBuilder::new(n, &dst_data_dir)
|
|
||||||
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
|
|
||||||
} else {
|
} else {
|
||||||
PersistentCompactIntMatrixBuilder::new(n, &dst_data_dir)
|
(None, Some(PersistentCompactIntMatrixBuilder::new(n, &dst_data_dir).map_err(SKError::Io)?))
|
||||||
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
|
};
|
||||||
}
|
|
||||||
|
|
||||||
// Create column builders.
|
|
||||||
let mut builders: Vec<ColBuilder> = (0..n_out)
|
|
||||||
.map(|col| -> SKResult<ColBuilder> {
|
|
||||||
if output_presence {
|
|
||||||
Ok(ColBuilder::Bit(PersistentBitVecBuilder::new(
|
|
||||||
n, &col_path_bit(&dst_data_dir, col),
|
|
||||||
)?))
|
|
||||||
} else {
|
|
||||||
Ok(ColBuilder::Int(PersistentCompactIntVecBuilder::new(
|
|
||||||
n, &col_path_int(&dst_data_dir, col),
|
|
||||||
)?))
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect::<SKResult<_>>()?;
|
|
||||||
|
|
||||||
fill_builders(
|
fill_builders(
|
||||||
&mut builders, specs, n, n_src_genomes,
|
specs, &src_layer_dir, src_is_count, threshold, output_presence,
|
||||||
&src_layer_dir, src_is_count, threshold,
|
dst_bit.as_mut(), dst_int.as_mut(),
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
for b in builders { b.close()?; }
|
if output_presence {
|
||||||
write_matrix_meta(&dst_data_dir, n, n_out).map_err(SKError::Io)?;
|
dst_bit.unwrap().close().map_err(SKError::Io)?;
|
||||||
|
} else {
|
||||||
|
dst_int.unwrap().close().map_err(SKError::Io)?;
|
||||||
|
}
|
||||||
|
|
||||||
// In-place: swap old data dir for new.
|
// In-place: swap old data dir for new.
|
||||||
if in_place {
|
if in_place {
|
||||||
|
|||||||
@@ -106,11 +106,7 @@ impl Layer<()> {
|
|||||||
let presence_dir = layer_dir.join(PRESENCE_DIR);
|
let presence_dir = layer_dir.join(PRESENCE_DIR);
|
||||||
fs::create_dir_all(&presence_dir).map_err(OLMError::Io)?;
|
fs::create_dir_all(&presence_dir).map_err(OLMError::Io)?;
|
||||||
let mut mb = PersistentBitMatrixBuilder::new(n_kmers, &presence_dir).map_err(OLMError::Io)?;
|
let mut mb = PersistentBitMatrixBuilder::new(n_kmers, &presence_dir).map_err(OLMError::Io)?;
|
||||||
let mut col = mb.add_col().map_err(OLMError::Io)?;
|
mb.add_col_ones().map_err(OLMError::Io)?.close().map_err(OLMError::Io)?;
|
||||||
for slot in 0..n_kmers {
|
|
||||||
col.set(slot, true);
|
|
||||||
}
|
|
||||||
col.close().map_err(OLMError::Io)?;
|
|
||||||
mb.close().map_err(OLMError::Io)
|
mb.close().map_err(OLMError::Io)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
mod scheduler;
|
mod scheduler;
|
||||||
|
pub mod throttle;
|
||||||
|
|
||||||
pub use scheduler::Pipe;
|
pub use scheduler::Pipe;
|
||||||
pub use scheduler::PipeIter;
|
pub use scheduler::PipeIter;
|
||||||
@@ -10,6 +11,10 @@ pub use scheduler::SinkFn;
|
|||||||
pub use scheduler::SourceFn;
|
pub use scheduler::SourceFn;
|
||||||
pub use scheduler::Stage;
|
pub use scheduler::Stage;
|
||||||
pub use scheduler::WorkerPool;
|
pub use scheduler::WorkerPool;
|
||||||
|
pub use throttle::Throttle;
|
||||||
|
pub use throttle::ThrottleGuard;
|
||||||
|
pub use throttle::Throttled;
|
||||||
|
pub use throttle::throttle;
|
||||||
|
|
||||||
/// Re-export de `crossbeam_channel::Sender` utilisé dans les macros flat transform.
|
/// Re-export de `crossbeam_channel::Sender` utilisé dans les macros flat transform.
|
||||||
/// Permet aux macros `make_flat_transform!` / `make_flat_transform_fallible!` d'utiliser
|
/// Permet aux macros `make_flat_transform!` / `make_flat_transform_fallible!` d'utiliser
|
||||||
|
|||||||
@@ -0,0 +1,86 @@
|
|||||||
|
use std::sync::{Arc, Condvar, Mutex};
|
||||||
|
|
||||||
|
// ── Throttle ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Counting semaphore: limits how many items from a source are in flight
/// simultaneously through the Flat stage of a pipeline.
///
/// Slots are acquired in the source thread before an item is emitted, and
/// released when the corresponding [`ThrottleGuard`] is dropped (i.e. when the
/// Flat worker finishes processing the item). Acquisition must never happen
/// inside a worker — only in the source thread — to prevent deadlocks.
#[derive(Debug)]
pub struct Throttle {
    /// Number of slots currently held.
    count: Mutex<usize>,
    /// Notified every time a slot is released.
    condvar: Condvar,
    /// Maximum number of slots that may be held at once; always at least 1.
    max: usize,
}

impl Throttle {
    /// Creates a throttle that lets at most `max` slots be held at once.
    ///
    /// A `max` of 0 is raised to 1: with zero capacity the very first
    /// [`acquire`](Self::acquire) would wait forever, since no slot could
    /// ever be released to wake it.
    pub fn new(max: usize) -> Self {
        Self { count: Mutex::new(0), condvar: Condvar::new(), max: max.max(1) }
    }

    /// Takes one slot, blocking the calling thread until one is free.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub fn acquire(&self) {
        let held = self.count.lock().unwrap();
        // `wait_while` re-checks the predicate after every wake-up, so
        // spurious wake-ups are handled for us.
        let mut held = self
            .condvar
            .wait_while(held, |held| *held >= self.max)
            .unwrap();
        *held += 1;
    }

    /// Gives one slot back and wakes a single waiter, if any.
    fn release(&self) {
        let mut held = self.count.lock().unwrap();
        *held -= 1;
        self.condvar.notify_one();
    }
}

// ── ThrottleGuard ─────────────────────────────────────────────────────────────

/// RAII guard: releases one slot in the `Throttle` when dropped.
#[must_use = "dropping the guard immediately releases its throttle slot"]
#[derive(Debug)]
pub struct ThrottleGuard(Arc<Throttle>);

impl Drop for ThrottleGuard {
    fn drop(&mut self) {
        self.0.release();
    }
}

// ── Throttled<T> ──────────────────────────────────────────────────────────────

/// An item paired with its throttle guard.
///
/// The guard keeps a slot acquired until this value is dropped. In a Flat
/// pipeline stage, carry the guard inside the worker closure until the item
/// is fully processed, then let it drop.
#[derive(Debug)]
pub struct Throttled<T> {
    /// The wrapped source item.
    pub item: T,
    /// Holds the slot taken for `item`; dropping it frees the slot.
    pub guard: ThrottleGuard,
}

// ── throttle() ────────────────────────────────────────────────────────────────

/// Wrap `source` so that at most `max_concurrent` items are emitted before
/// earlier ones have been fully processed (i.e. their `ThrottleGuard` dropped).
///
/// Acquisition blocks the source thread until a slot is available. This must
/// be called in the source thread, never inside a pipeline worker.
///
/// A `max_concurrent` of 0 is treated as 1; otherwise the returned iterator
/// would block forever on its first item.
///
/// # Example
///
/// ```ignore
/// let throttled = obipipeline::throttle(file_paths, n_workers.saturating_sub(1));
/// // Use `throttled` as the pipeline source; carry `item.guard` through the
/// // Flat stage and let it drop when the file is fully read.
/// ```
pub fn throttle<I>(source: I, max_concurrent: usize) -> impl Iterator<Item = Throttled<I::Item>>
where
    I: Iterator,
    I::Item: Send + 'static,
{
    // Clamp here as well so the documented contract holds on its own.
    let limiter = Arc::new(Throttle::new(max_concurrent.max(1)));
    source.map(move |item| {
        // The slot is taken when the consumer pulls the item, i.e. in
        // whichever thread drives this iterator.
        limiter.acquire();
        Throttled { item, guard: ThrottleGuard(Arc::clone(&limiter)) }
    })
}
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
[package]
|
||||||
|
name = "obitaxonomy"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
@@ -0,0 +1,38 @@
|
|||||||
|
use std::fmt;
|
||||||
|
|
||||||
|
/// Errors reported when a taxonomy path or a taxonomy query pattern cannot
/// be parsed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TaxError {
    /// The stored value lacks the leading `taxonomy:/` prefix.
    MissingPrefix,
    /// Nothing follows the prefix in a stored path.
    EmptyPath,
    /// Nothing is left of a query pattern once its anchors are removed.
    EmptyPattern,
    /// A segment's name is empty (for instance two `/` in a row).
    EmptySegmentName,
    /// A segment ends in `@` without naming a rank.
    EmptyRankName { segment: String },
    /// A segment holds more than one `@`.
    AmbiguousRank { segment: String },
}

impl fmt::Display for TaxError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            // Fixed messages need no formatting machinery.
            Self::MissingPrefix => f.write_str("taxonomy path must start with \"taxonomy:/\""),
            Self::EmptyPath => f.write_str("taxonomy path has no segments"),
            Self::EmptyPattern => f.write_str("taxonomy query pattern has no segments"),
            Self::EmptySegmentName => f.write_str("segment has an empty name"),
            // The offending segment is shown in its `Debug` (quoted) form.
            Self::EmptyRankName { segment } => {
                write!(f, "segment has '@' with no rank name: {segment:?}")
            }
            Self::AmbiguousRank { segment } => {
                write!(f, "segment contains more than one '@': {segment:?}")
            }
        }
    }
}

impl std::error::Error for TaxError {}
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
mod error;
|
||||||
|
mod segment;
|
||||||
|
mod segment_pattern;
|
||||||
|
mod path;
|
||||||
|
mod pattern;
|
||||||
|
|
||||||
|
pub use error::TaxError;
|
||||||
|
pub use segment::TaxSegment;
|
||||||
|
pub use segment_pattern::SegmentPattern;
|
||||||
|
pub use path::{TaxPath, PREFIX};
|
||||||
|
pub use pattern::TaxPattern;
|
||||||
@@ -0,0 +1,82 @@
|
|||||||
|
use std::fmt;
|
||||||
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
use crate::error::TaxError;
|
||||||
|
use crate::segment::TaxSegment;
|
||||||
|
|
||||||
|
/// The prefix that marks a metadata value as a taxonomy path.
|
||||||
|
pub const PREFIX: &str = "taxonomy:/";
|
||||||
|
|
||||||
|
/// A rooted, `/`-separated taxonomy path with optional per-segment rank annotations.
|
||||||
|
///
|
||||||
|
/// Stored form: `taxonomy:/seg1@rank1/seg2/seg3@rank3`
|
||||||
|
/// The leading `taxonomy:/` is the discriminator; the remainder is one or more
|
||||||
|
/// `/`-separated segments, each of the form `name` or `name@rank`.
|
||||||
|
///
|
||||||
|
/// `@` is reserved and may not appear in segment names or rank names.
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
|
pub struct TaxPath {
|
||||||
|
segments: Vec<TaxSegment>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TaxPath {
|
||||||
|
pub fn parse(s: &str) -> Result<Self, TaxError> {
|
||||||
|
let tail = s.strip_prefix(PREFIX).ok_or(TaxError::MissingPrefix)?;
|
||||||
|
if tail.is_empty() {
|
||||||
|
return Err(TaxError::EmptyPath);
|
||||||
|
}
|
||||||
|
let segments = tail.split('/')
|
||||||
|
.map(TaxSegment::parse)
|
||||||
|
.collect::<Result<Vec<_>, _>>()?;
|
||||||
|
Ok(Self { segments })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// True if `self` is an ancestor of — or equal to — `other`.
|
||||||
|
///
|
||||||
|
/// Comparison is by segment name only; rank annotations are ignored.
|
||||||
|
/// `self` must be a prefix of `other` at segment granularity.
|
||||||
|
pub fn is_ancestor_of(&self, other: &TaxPath) -> bool {
|
||||||
|
self.segments.len() <= other.segments.len()
|
||||||
|
&& self.segments.iter().zip(other.segments.iter())
|
||||||
|
.all(|(a, b)| a.name() == b.name())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the name of the first segment whose rank equals `rank`, if any.
|
||||||
|
pub fn name_at_rank(&self, rank: &str) -> Option<&str> {
|
||||||
|
self.segments.iter()
|
||||||
|
.find(|s| s.rank() == Some(rank))
|
||||||
|
.map(|s| s.name())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// True if any segment has the given rank.
|
||||||
|
pub fn has_rank(&self, rank: &str) -> bool {
|
||||||
|
self.segments.iter().any(|s| s.rank() == Some(rank))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// True if the path contains a segment with both the given rank and name.
|
||||||
|
pub fn matches_rank(&self, rank: &str, name: &str) -> bool {
|
||||||
|
self.segments.iter().any(|s| s.rank() == Some(rank) && s.name() == name)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn segments(&self) -> &[TaxSegment] { &self.segments }
|
||||||
|
pub fn depth(&self) -> usize { self.segments.len() }
|
||||||
|
pub fn is_empty(&self) -> bool { self.segments.is_empty() }
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for TaxPath {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
write!(f, "{}", PREFIX)?;
|
||||||
|
let mut first = true;
|
||||||
|
for seg in &self.segments {
|
||||||
|
if !first { write!(f, "/")?; }
|
||||||
|
write!(f, "{seg}")?;
|
||||||
|
first = false;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for TaxPath {
|
||||||
|
type Err = TaxError;
|
||||||
|
fn from_str(s: &str) -> Result<Self, Self::Err> { Self::parse(s) }
|
||||||
|
}
|
||||||
@@ -0,0 +1,72 @@
|
|||||||
|
use crate::error::TaxError;
|
||||||
|
use crate::path::TaxPath;
|
||||||
|
use crate::segment::TaxSegment;
|
||||||
|
use crate::segment_pattern::SegmentPattern;
|
||||||
|
|
||||||
|
/// A query pattern for matching against stored `TaxPath` values.
|
||||||
|
///
|
||||||
|
/// Syntax:
|
||||||
|
///
|
||||||
|
/// | Form | Semantics |
|
||||||
|
/// |----------|-----------|
|
||||||
|
/// | `A/B` | A then B as a contiguous sub-path, anywhere in the value |
|
||||||
|
/// | `/A/B` | value starts with A then B (start-anchored) |
|
||||||
|
/// | `A/B$` | value ends with A then B (end-anchored) |
|
||||||
|
/// | `/A/B$` | value is exactly A then B (fully anchored) |
|
||||||
|
/// | `A@x/B` | A with rank `x`, followed by B with any rank |
|
||||||
|
///
|
||||||
|
/// A segment pattern without `@` matches any segment with that name regardless
|
||||||
|
/// of its stored rank.
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
|
pub struct TaxPattern {
|
||||||
|
start_anchored: bool,
|
||||||
|
end_anchored: bool,
|
||||||
|
segments: Vec<SegmentPattern>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TaxPattern {
|
||||||
|
pub fn parse(s: &str) -> Result<Self, TaxError> {
|
||||||
|
let s = s.trim();
|
||||||
|
|
||||||
|
let start_anchored = s.starts_with('/');
|
||||||
|
let s = if start_anchored { &s[1..] } else { s };
|
||||||
|
|
||||||
|
let end_anchored = s.ends_with('$');
|
||||||
|
let s = if end_anchored { &s[..s.len() - 1] } else { s };
|
||||||
|
|
||||||
|
if s.is_empty() {
|
||||||
|
return Err(TaxError::EmptyPattern);
|
||||||
|
}
|
||||||
|
|
||||||
|
let segments = s.split('/')
|
||||||
|
.map(SegmentPattern::parse)
|
||||||
|
.collect::<Result<Vec<_>, _>>()?;
|
||||||
|
|
||||||
|
Ok(Self { start_anchored, end_anchored, segments })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// True if this pattern matches `path` according to the anchor flags.
|
||||||
|
///
|
||||||
|
/// The pattern must match a contiguous run of segments in the path.
|
||||||
|
/// Start/end anchors restrict where that run may begin or end.
|
||||||
|
pub fn matches(&self, path: &TaxPath) -> bool {
|
||||||
|
let n = self.segments.len();
|
||||||
|
let m = path.depth();
|
||||||
|
|
||||||
|
if n > m { return false; }
|
||||||
|
|
||||||
|
let segs = path.segments();
|
||||||
|
match (self.start_anchored, self.end_anchored) {
|
||||||
|
(true, true) => n == m && self.window_matches(segs, 0),
|
||||||
|
(true, false) => self.window_matches(segs, 0),
|
||||||
|
(false, true) => self.window_matches(segs, m - n),
|
||||||
|
(false, false) => (0..=(m - n)).any(|i| self.window_matches(segs, i)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn window_matches(&self, segs: &[TaxSegment], start: usize) -> bool {
|
||||||
|
self.segments.iter()
|
||||||
|
.zip(segs[start..start + self.segments.len()].iter())
|
||||||
|
.all(|(pat, seg)| pat.matches(seg))
|
||||||
|
}
|
||||||
|
}
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user