Compare commits
163 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 21a20ce7ca | |||
| 2ea58703c7 | |||
| ac3ef106e7 | |||
| 469e53b6f5 | |||
| 9f1df96ea7 | |||
| 4e4cce2879 | |||
| 68b05b93c4 | |||
| 0a668cf8a6 | |||
| e6d6942e2f | |||
| bf9c9aeacb | |||
| 22a65857a1 | |||
| d16a867640 | |||
| 616050075f | |||
| e22afe9621 | |||
| bdfac71e65 | |||
| a00bb37478 | |||
| d30a4efd9b | |||
| 6baf2e64ca | |||
| c0a71a2d49 | |||
| a609c1af95 | |||
| 3d32be8a83 | |||
| c4c71dc892 | |||
| 4e625afaba | |||
| a522c0907e | |||
| c1d6f277ce | |||
| 9356be4ec0 | |||
| c694e1f2b0 | |||
| 280ca1f5a3 | |||
| 9abb2db92f | |||
| 7c1efa9cbb | |||
| 4c4524766c | |||
| 7eea71fdcd | |||
| f91c5a3f79 | |||
| fb4962c4fe | |||
| 1d38d87ff9 | |||
| 93559c3294 | |||
| 1f0d77d5bf | |||
| eeba43ac4f | |||
| 7ed7b26039 | |||
| 26de90f18d | |||
| 497d250d8a | |||
| aa98e82875 | |||
| 5ff5b04d2d | |||
| df7b400fda | |||
| d1717688d2 | |||
| cde6457eea | |||
| b6fcbc545f | |||
| 9578f991f4 | |||
| 1cd7916e06 | |||
| bc92dc4592 | |||
| a9567ad023 | |||
| 4a64718fd1 | |||
| 7a87e911b6 | |||
| 313d73838a | |||
| 175ea5bbd0 | |||
| c6ea0c53e3 | |||
| ea767376bd | |||
| f1d76f3203 | |||
| c4071eb450 | |||
| 817b02cbc1 | |||
| 547cb72d76 | |||
| 6d85387077 | |||
| fb5b53dca9 | |||
| fddf630772 | |||
| bc14346f5f | |||
| fb8c6e427c | |||
| 1f336fe496 | |||
| 5f98d2ef96 | |||
| 8b563d0804 | |||
| 7208dcbb4a | |||
| 2e69b0b7fe | |||
| 9ea1dff5d6 | |||
| b2c8373586 | |||
| ba49af6f9e | |||
| 2bc189e962 | |||
| db9c604199 | |||
| 52fd2cf801 | |||
| 97e3fb9761 | |||
| b5e027f23b | |||
| f44fe042bc | |||
| 94e0a370b3 | |||
| 970460be42 | |||
| e66adef23d | |||
| b0dab452f6 | |||
| db730e9cf6 | |||
| f65ecd19cc | |||
| 7dd8db1409 | |||
| ce45e2fbe1 | |||
| 2465cfbc4b | |||
| d626d42ec7 | |||
| 650eea43b6 | |||
| eb7805c747 | |||
| 1ec65922df | |||
| 09d9e21744 | |||
| 3f47e22083 | |||
| 03c7bb0b99 | |||
| b39eee688a | |||
| 95b3461405 | |||
| 952a21eef8 | |||
| 5c2f48535f | |||
| 27088ab810 | |||
| ea2c594c86 | |||
| d202ead385 | |||
| 249998beed | |||
| 2f29ee2240 | |||
| edd5e3f8ee | |||
| bb7adc1154 | |||
| 9306ec1c56 | |||
| 712a03a3a6 | |||
| 3e62ffe010 | |||
| a1499e6153 | |||
| 476c7a6394 | |||
| edc18b4908 | |||
| 02cb30c0ef | |||
| 4677d6f177 | |||
| 7a29ca6305 | |||
| bba5147f0f | |||
| bfe0cb4b82 | |||
| 173ac9fb42 | |||
| de1a41810a | |||
| 1661dd6b1c | |||
| 2ebc5f0d75 | |||
| 4fd0eb989f | |||
| add6d7f873 | |||
| 0350ca855b | |||
| 1e2115a1b0 | |||
| 657f964dda | |||
| 8b57c91ab7 | |||
| 728476a0a6 | |||
| 8a0b898b4b | |||
| 708b0abf9b | |||
| 3138f6382c | |||
| 86b88acb95 | |||
| be0e8f1041 | |||
| eaa52eaab5 | |||
| cfadf63bbc | |||
| a4b57a96de | |||
| 0d9be53d1f | |||
| 82ec6aa1cf | |||
| 694da5208e | |||
| 26ab165807 | |||
| dfa0b2bac2 | |||
| 9e60a711bc | |||
| 98c14aade9 | |||
| 7501b6e854 | |||
| 6f7abddeaf | |||
| 1d880fdc5f | |||
| 009a328c58 | |||
| 9d46400898 | |||
| 036d044291 | |||
| 88365e444c | |||
| da56c3e290 | |||
| b7db3a33ed | |||
| b2a52bfb37 | |||
| bc51cd9861 | |||
| 876bc0127f | |||
| 16a6b0d033 | |||
| e1dab86daf | |||
| 24afd74e2f | |||
| 8478072b78 | |||
| 4a5ab0b8c2 | |||
| 9b700ff4a4 | |||
| ca71e100ef |
@@ -0,0 +1,36 @@
|
|||||||
|
name: CI
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: ['main']
|
||||||
|
pull_request:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
working-directory: src
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install Rust
|
||||||
|
run: |
|
||||||
|
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
|
||||||
|
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
|
||||||
|
|
||||||
|
- name: Cache cargo registry
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cargo/registry
|
||||||
|
~/.cargo/git
|
||||||
|
src/target
|
||||||
|
key: ${{ runner.os }}-cargo-${{ hashFiles('src/Cargo.lock') }}
|
||||||
|
restore-keys: ${{ runner.os }}-cargo-
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
run: cargo build --release
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
run: cargo test --release
|
||||||
@@ -0,0 +1,127 @@
|
|||||||
|
name: Release
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- "v*"
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
create-release:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
outputs:
|
||||||
|
release_id: ${{ steps.create.outputs.release_id }}
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-tags: true
|
||||||
|
|
||||||
|
- name: Create Gitea release
|
||||||
|
id: create
|
||||||
|
env:
|
||||||
|
GITEA_TOKEN: ${{ secrets.GITEATOKEN }}
|
||||||
|
TAG: ${{ github.ref_name }}
|
||||||
|
run: |
|
||||||
|
sudo apt-get update -qq && sudo apt-get install -y -qq jq
|
||||||
|
body=$(git for-each-ref --format='%(contents)' "refs/tags/$TAG")
|
||||||
|
release_id=$(curl -s -X POST \
|
||||||
|
"${{ github.server_url }}/api/v1/repos/${{ github.repository }}/releases" \
|
||||||
|
-H "Authorization: token $GITEA_TOKEN" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d "{\"tag_name\":\"$TAG\",\"name\":\"$TAG\",\"body\":$(echo "$body" | jq -Rs .)}" | jq -r '.id')
|
||||||
|
echo "release_id=$release_id" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
|
build-linux-x86_64:
|
||||||
|
needs: create-release
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
working-directory: src
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install Rust + zigbuild
|
||||||
|
run: |
|
||||||
|
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
|
||||||
|
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
|
||||||
|
sudo apt-get update -qq && sudo apt-get install -y -qq jq
|
||||||
|
pip install ziglang --quiet --break-system-packages
|
||||||
|
$HOME/.cargo/bin/cargo install cargo-zigbuild
|
||||||
|
$HOME/.cargo/bin/rustup target add x86_64-unknown-linux-musl
|
||||||
|
|
||||||
|
- name: Create musl C/C++ wrappers
|
||||||
|
run: |
|
||||||
|
ZIG=$(python3 -c "import ziglang, os; print(os.path.join(os.path.dirname(ziglang.__file__), 'zig'))")
|
||||||
|
printf '#!/bin/sh\nexec "%s" cc -target x86_64-linux-musl "$@"\n' "$ZIG" | sudo tee /usr/local/bin/x86_64-linux-musl-gcc > /dev/null
|
||||||
|
printf '#!/bin/sh\nexec "%s" c++ -target x86_64-linux-musl "$@"\n' "$ZIG" | sudo tee /usr/local/bin/x86_64-linux-musl-g++ > /dev/null
|
||||||
|
sudo chmod +x /usr/local/bin/x86_64-linux-musl-gcc /usr/local/bin/x86_64-linux-musl-g++
|
||||||
|
|
||||||
|
- name: Cache cargo registry
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cargo/registry
|
||||||
|
~/.cargo/git
|
||||||
|
src/target
|
||||||
|
key: linux-musl-cargo-${{ hashFiles('src/Cargo.lock') }}
|
||||||
|
restore-keys: linux-musl-cargo-
|
||||||
|
|
||||||
|
- name: Build static binary
|
||||||
|
env:
|
||||||
|
PKG_CONFIG_ALLOW_CROSS: "1"
|
||||||
|
run: cargo zigbuild --release --target x86_64-unknown-linux-musl
|
||||||
|
|
||||||
|
- name: Prepare and upload artifact
|
||||||
|
env:
|
||||||
|
GITEA_TOKEN: ${{ secrets.GITEATOKEN }}
|
||||||
|
RELEASE_ID: ${{ needs.create-release.outputs.release_id }}
|
||||||
|
run: |
|
||||||
|
mkdir -p /tmp/dist
|
||||||
|
cp target/x86_64-unknown-linux-musl/release/obikmer /tmp/dist/obikmer-linux-x86_64
|
||||||
|
strip /tmp/dist/obikmer-linux-x86_64
|
||||||
|
curl -s -X POST \
|
||||||
|
"${{ github.server_url }}/api/v1/repos/${{ github.repository }}/releases/$RELEASE_ID/assets" \
|
||||||
|
-H "Authorization: token $GITEA_TOKEN" \
|
||||||
|
-F "attachment=@/tmp/dist/obikmer-linux-x86_64"
|
||||||
|
|
||||||
|
build-macos-arm64:
|
||||||
|
needs: create-release
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
working-directory: src
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install Rust + zigbuild
|
||||||
|
run: |
|
||||||
|
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
|
||||||
|
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
|
||||||
|
sudo apt-get update -qq && sudo apt-get install -y -qq jq
|
||||||
|
pip install ziglang --quiet --break-system-packages
|
||||||
|
$HOME/.cargo/bin/cargo install cargo-zigbuild
|
||||||
|
$HOME/.cargo/bin/rustup target add aarch64-apple-darwin
|
||||||
|
|
||||||
|
- name: Cache cargo registry
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cargo/registry
|
||||||
|
~/.cargo/git
|
||||||
|
src/target
|
||||||
|
key: macos-arm64-cargo-${{ hashFiles('src/Cargo.lock') }}
|
||||||
|
restore-keys: macos-arm64-cargo-
|
||||||
|
|
||||||
|
- name: Build macOS binary
|
||||||
|
run: cargo zigbuild --release --target aarch64-apple-darwin --no-default-features
|
||||||
|
|
||||||
|
- name: Prepare and upload artifact
|
||||||
|
env:
|
||||||
|
GITEA_TOKEN: ${{ secrets.GITEATOKEN }}
|
||||||
|
RELEASE_ID: ${{ needs.create-release.outputs.release_id }}
|
||||||
|
run: |
|
||||||
|
mkdir -p /tmp/dist
|
||||||
|
cp target/aarch64-apple-darwin/release/obikmer /tmp/dist/obikmer-macos-arm64
|
||||||
|
curl -s -X POST \
|
||||||
|
"${{ github.server_url }}/api/v1/repos/${{ github.repository }}/releases/$RELEASE_ID/assets" \
|
||||||
|
-H "Authorization: token $GITEA_TOKEN" \
|
||||||
|
-F "attachment=@/tmp/dist/obikmer-macos-arm64"
|
||||||
+12
@@ -9,3 +9,15 @@ data-stress
|
|||||||
./**/*.json
|
./**/*.json
|
||||||
*.bin
|
*.bin
|
||||||
Betula_exilis--IGA-24-33
|
Betula_exilis--IGA-24-33
|
||||||
|
benchmark/genomes
|
||||||
|
benchmark/simulated_data
|
||||||
|
benchmark/specimen_index_presence
|
||||||
|
benchmark/specimen_index_count
|
||||||
|
benchmark/global_index_presence
|
||||||
|
benchmark/global_index_count
|
||||||
|
benchmark/stats
|
||||||
|
benchmark/reference_index
|
||||||
|
benchmark/reference_dist
|
||||||
|
benchmark/obikmer_dist
|
||||||
|
benchmark/specific_index_count
|
||||||
|
benchmark/specific_index_presence
|
||||||
|
|||||||
@@ -0,0 +1,2 @@
|
|||||||
|
/cache
|
||||||
|
/project.local.yml
|
||||||
@@ -0,0 +1,133 @@
|
|||||||
|
# the name by which the project can be referenced within Serena
|
||||||
|
project_name: "obikmer"
|
||||||
|
|
||||||
|
|
||||||
|
# list of languages for which language servers are started; choose from:
|
||||||
|
# al angular ansible bash clojure
|
||||||
|
# cpp cpp_ccls crystal csharp csharp_omnisharp
|
||||||
|
# dart elixir elm erlang fortran
|
||||||
|
# fsharp go groovy haskell haxe
|
||||||
|
# hlsl html java json julia
|
||||||
|
# kotlin lean4 lua luau markdown
|
||||||
|
# matlab msl nix ocaml pascal
|
||||||
|
# perl php php_phpactor powershell python
|
||||||
|
# python_jedi python_ty r rego ruby
|
||||||
|
# ruby_solargraph rust scala scss solidity
|
||||||
|
# svelte swift systemverilog terraform toml
|
||||||
|
# typescript typescript_vts vue yaml zig
|
||||||
|
# (This list may be outdated. For the current list, see values of Language enum here:
|
||||||
|
# https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
|
||||||
|
# For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
|
||||||
|
# Note:
|
||||||
|
# - For C, use cpp
|
||||||
|
# - For JavaScript, use typescript
|
||||||
|
# - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
|
||||||
|
# - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
|
||||||
|
# - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
|
||||||
|
# - For Free Pascal/Lazarus, use pascal
|
||||||
|
# Special requirements:
|
||||||
|
# Some languages require additional setup/installations.
|
||||||
|
# See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
|
||||||
|
# When using multiple languages, the first language server that supports a given file will be used for that file.
|
||||||
|
# The first language is the default language and the respective language server will be used as a fallback.
|
||||||
|
# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
|
||||||
|
languages:
|
||||||
|
- rust
|
||||||
|
|
||||||
|
# the encoding used by text files in the project
|
||||||
|
# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
|
||||||
|
encoding: "utf-8"
|
||||||
|
|
||||||
|
# line ending convention to use when writing source files.
|
||||||
|
# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
|
||||||
|
# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
|
||||||
|
line_ending:
|
||||||
|
|
||||||
|
# The language backend to use for this project.
|
||||||
|
# If not set, the global setting from serena_config.yml is used.
|
||||||
|
# Valid values: LSP, JetBrains
|
||||||
|
# Note: the backend is fixed at startup. If a project with a different backend
|
||||||
|
# is activated post-init, an error will be returned.
|
||||||
|
language_backend:
|
||||||
|
|
||||||
|
# whether to use project's .gitignore files to ignore files
|
||||||
|
ignore_all_files_in_gitignore: true
|
||||||
|
|
||||||
|
# advanced configuration option allowing to configure language server-specific options.
|
||||||
|
# Maps the language key to the options.
|
||||||
|
# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
|
||||||
|
# No documentation on options means no options are available.
|
||||||
|
ls_specific_settings: {}
|
||||||
|
|
||||||
|
# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
|
||||||
|
# Paths can be absolute or relative to the project root.
|
||||||
|
# Each folder is registered as an LSP workspace folder, enabling language servers to discover
|
||||||
|
# symbols and references across package boundaries.
|
||||||
|
# Currently supported for: TypeScript.
|
||||||
|
# Example:
|
||||||
|
# additional_workspace_folders:
|
||||||
|
# - ../sibling-package
|
||||||
|
# - ../shared-lib
|
||||||
|
additional_workspace_folders: []
|
||||||
|
|
||||||
|
# list of additional paths to ignore in this project.
|
||||||
|
# Same syntax as gitignore, so you can use * and **.
|
||||||
|
# Note: global ignored_paths from serena_config.yml are also applied additively.
|
||||||
|
ignored_paths: []
|
||||||
|
|
||||||
|
# whether the project is in read-only mode
|
||||||
|
# If set to true, all editing tools will be disabled and attempts to use them will result in an error
|
||||||
|
# Added on 2025-04-18
|
||||||
|
read_only: false
|
||||||
|
|
||||||
|
# list of tool names to exclude.
|
||||||
|
# This extends the existing exclusions (e.g. from the global configuration)
|
||||||
|
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||||
|
excluded_tools: []
|
||||||
|
|
||||||
|
# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
|
||||||
|
# This extends the existing inclusions (e.g. from the global configuration).
|
||||||
|
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||||
|
included_optional_tools: []
|
||||||
|
|
||||||
|
# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
|
||||||
|
# This cannot be combined with non-empty excluded_tools or included_optional_tools.
|
||||||
|
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||||
|
fixed_tools: []
|
||||||
|
|
||||||
|
# list of mode names that are to be activated by default, overriding the setting in the global configuration.
|
||||||
|
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
|
||||||
|
# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
|
||||||
|
# Otherwise, this overrides the setting from the global configuration (serena_config.yml).
|
||||||
|
# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
|
||||||
|
# for this project.
|
||||||
|
# This setting can, in turn, be overridden by CLI parameters (--mode).
|
||||||
|
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
|
||||||
|
default_modes:
|
||||||
|
|
||||||
|
# list of mode names to be activated additionally for this project, e.g. ["query-projects"]
|
||||||
|
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
|
||||||
|
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
|
||||||
|
added_modes:
|
||||||
|
|
||||||
|
# initial prompt for the project. It will always be given to the LLM upon activating the project
|
||||||
|
# (contrary to the memories, which are loaded on demand).
|
||||||
|
initial_prompt: ""
|
||||||
|
|
||||||
|
# time budget (seconds) per tool call for the retrieval of additional symbol information
|
||||||
|
# such as docstrings or parameter information.
|
||||||
|
# This overrides the corresponding setting in the global configuration; see the documentation there.
|
||||||
|
# If null or missing, use the setting from the global configuration.
|
||||||
|
symbol_info_budget:
|
||||||
|
|
||||||
|
# list of regex patterns which, when matched, mark a memory entry as read‑only.
|
||||||
|
# Extends the list from the global configuration, merging the two lists.
|
||||||
|
read_only_memory_patterns: []
|
||||||
|
|
||||||
|
# list of regex patterns for memories to completely ignore.
|
||||||
|
# Matching memories will not appear in list_memories or activate_project output
|
||||||
|
# and cannot be accessed via read_memory or write_memory.
|
||||||
|
# To access ignored memory files, use the read_file tool on the raw file path.
|
||||||
|
# Extends the list from the global configuration, merging the two lists.
|
||||||
|
# Example: ["_archive/.*", "_episodes/.*"]
|
||||||
|
ignored_memory_patterns: []
|
||||||
@@ -8,6 +8,9 @@ Ne modifier aucun fichier à moins d'une demande explicite de modification. En p
|
|||||||
**Règle absolue : ne jamais substituer une dépendance ou une bibliothèque sans validation explicite.**
|
**Règle absolue : ne jamais substituer une dépendance ou une bibliothèque sans validation explicite.**
|
||||||
Si une dépendance demandée pose problème (erreur de compilation, bug, API manquante), exposer le problème et proposer des alternatives — ne jamais switcher silencieusement vers une autre bibliothèque. Le choix des dépendances est une décision d'architecture qui appartient au développeur.
|
Si une dépendance demandée pose problème (erreur de compilation, bug, API manquante), exposer le problème et proposer des alternatives — ne jamais switcher silencieusement vers une autre bibliothèque. Le choix des dépendances est une décision d'architecture qui appartient au développeur.
|
||||||
|
|
||||||
|
**Règle absolue : le code existant est une hypothèse, pas une vérité.**
|
||||||
|
Quand une nouvelle construction (type, itérateur, abstraction) rend du code historique injustifié, le signaler immédiatement et proposer de le supprimer — ne pas conserver les deux en parallèle par inertie. Le développeur demande explicitement de remettre en cause la base de code : ne pas attendre qu'il insiste.
|
||||||
|
|
||||||
Tu maintiens en **anglais**, dense et sans remplissage, les documents suivants :
|
Tu maintiens en **anglais**, dense et sans remplissage, les documents suivants :
|
||||||
- `docmd/index.md` — document de discussion de base, enrichi progressivement au fil de nos échanges ; il reflète l'état courant de la réflexion sur le projet
|
- `docmd/index.md` — document de discussion de base, enrichi progressivement au fil de nos échanges ; il reflète l'état courant de la réflexion sur le projet
|
||||||
- les autres fichiers Markdown dans `docmd/` selon leur thème respectif
|
- les autres fichiers Markdown dans `docmd/` selon leur thème respectif
|
||||||
@@ -70,3 +73,29 @@ Lors de l'ajout de nouveaux fichiers Markdown dans `docmd/`, mettre à jour la s
|
|||||||
---
|
---
|
||||||
|
|
||||||
Je continue à poser mes questions et à guider la discussion.
|
Je continue à poser mes questions et à guider la discussion.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## MCP Tools
|
||||||
|
|
||||||
|
**Règle absolue : avant tout travail de code, appeler `mcp__serena__initial_instructions` pour charger les instructions Serena.**
|
||||||
|
|
||||||
|
### Hiérarchie des outils pour ce projet Rust
|
||||||
|
|
||||||
|
**Navigation et édition de code → serena en priorité**
|
||||||
|
- Trouver un symbole, une déclaration, les implémentations d'un trait : `mcp__serena__find_symbol`, `mcp__serena__find_declaration`, `mcp__serena__find_implementations`
|
||||||
|
- Trouver les usages d'un symbole : `mcp__serena__find_referencing_symbols`
|
||||||
|
- Diagnostics LSP (erreurs de compilation) : `mcp__serena__get_diagnostics_for_file`
|
||||||
|
- Vue d'ensemble d'un fichier : `mcp__serena__get_symbols_overview`
|
||||||
|
- Modifier le corps d'une fonction/impl : `mcp__serena__replace_symbol_body`
|
||||||
|
- Ne pas utiliser `cclsp` quand serena couvre le besoin
|
||||||
|
|
||||||
|
**Analyse architecturale → jcodemunch**
|
||||||
|
- Hotspots, couplage, dead code, dépendances entre modules
|
||||||
|
- Utiliser avant de refactorer une zone critique
|
||||||
|
|
||||||
|
**Raisonnement complexe → sequential-thinking**
|
||||||
|
- Décisions d'architecture, choix d'algorithme, trade-offs non triviaux
|
||||||
|
|
||||||
|
**Documentation de crates → context7**
|
||||||
|
- Toujours consulter avant d'utiliser une API de bibliothèque externe
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ $(MKDOCS): $(VENV)/bin/activate
|
|||||||
mkdocs mkdocs-material \
|
mkdocs mkdocs-material \
|
||||||
mkdocs-mermaid2-plugin \
|
mkdocs-mermaid2-plugin \
|
||||||
mkdocs-bibtex
|
mkdocs-bibtex
|
||||||
|
$(PIP) install --quiet --upgrade InSilicoSeq
|
||||||
|
|
||||||
# ── obikmer binary ───────────────────────────────────────────────────────────
|
# ── obikmer binary ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -62,3 +63,35 @@ clean-doc:
|
|||||||
.PHONY: clean
|
.PHONY: clean
|
||||||
clean: clean-doc
|
clean: clean-doc
|
||||||
rm -rf $(VENV)
|
rm -rf $(VENV)
|
||||||
|
|
||||||
|
# ── release ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
CARGO_TOML := $(CARGO_DIR)/obikmer/Cargo.toml
|
||||||
|
|
||||||
|
.PHONY: bump-version
|
||||||
|
bump-version:
|
||||||
|
@current=$$(grep '^version = ' $(CARGO_TOML) | head -n 1 | sed 's/version = "\(.*\)"/\1/'); \
|
||||||
|
if [ -n "$(RELEASE)" ]; then \
|
||||||
|
new_version="$(RELEASE)"; \
|
||||||
|
else \
|
||||||
|
major=$$(echo $$current | cut -d. -f1); \
|
||||||
|
minor=$$(echo $$current | cut -d. -f2); \
|
||||||
|
patch=$$(echo $$current | cut -d. -f3); \
|
||||||
|
new_patch=$$((patch + 1)); \
|
||||||
|
new_version="$$major.$$minor.$$new_patch"; \
|
||||||
|
fi; \
|
||||||
|
echo "Version: $$current -> $$new_version"; \
|
||||||
|
sed -i.bak "s/^version = \"$$current\"/version = \"$$new_version\"/" $(CARGO_TOML) && \
|
||||||
|
rm $(CARGO_TOML).bak
|
||||||
|
|
||||||
|
.PHONY: release
|
||||||
|
release: bump-version
|
||||||
|
@jj auto-describe
|
||||||
|
@jj git push --change @
|
||||||
|
@new_version=$$(grep '^version = ' $(CARGO_TOML) | head -n 1 | sed 's/version = "\(.*\)"/\1/'); \
|
||||||
|
git_hash=$$(jj log -r @ --no-graph -T 'commit_id'); \
|
||||||
|
commits=$$(jj log -r 'latest(tags())..@' --no-graph -T 'description ++ "\n"' 2>/dev/null || \
|
||||||
|
jj log --no-graph -T 'description ++ "\n"' --limit 30); \
|
||||||
|
notes=$$(printf 'Write concise markdown release notes for obikmer (a Rust kmer genomics tool). Be technical and direct. Base them strictly on these commit messages:\n\n%s' "$$commits" | aichat); \
|
||||||
|
git tag -a "v$$new_version" -m "$$notes" "$$git_hash" && \
|
||||||
|
git push origin "v$$new_version"
|
||||||
|
|||||||
@@ -1 +1,123 @@
|
|||||||
toto
|
# obikmer
|
||||||
|
|
||||||
|
`obikmer` is a Rust toolkit for indexing, querying, and comparing DNA sequences
|
||||||
|
represented as sets of k-mers. It targets individual genome datasets (tens of
|
||||||
|
Gbases) with maximum efficiency in computation, memory, and disk usage.
|
||||||
|
|
||||||
|
## Key principles
|
||||||
|
|
||||||
|
**Compact k-mer encoding.** Each k-mer is stored in a `u64` at 2 bits/base.
|
||||||
|
k is odd, k ∈ [11, 31], fixed at runtime. The canonical form `min(kmer, revcomp(kmer))`
|
||||||
|
halves the effective space by collapsing both strands.
|
||||||
|
|
||||||
|
**Superkmer-based partitioning.** Sequences are decomposed into superkmers —
|
||||||
|
maximal runs of k-mers sharing the same minimizer. Superkmers route naturally to
|
||||||
|
partitions via the minimizer hash, enabling partition-parallel indexing and querying
|
||||||
|
with no cross-partition communication.
|
||||||
|
|
||||||
|
**Layered MPHF index.** Each partition holds a stack of layers. Each layer is a
|
||||||
|
minimal perfect hash function (MPHF) over the k-mers of one input genome, paired
|
||||||
|
with a per-genome presence/count matrix. Queries scatter k-mers to their partition,
|
||||||
|
probe each layer in order, and aggregate results.
|
||||||
|
|
||||||
|
**Approximate indexing (Findere).** With `-z Z`, the index stores k-mers of size
|
||||||
|
`s = k − z + 1` instead of k. At query time, results are produced at size s, then
|
||||||
|
a per-genome sliding window of size z aggregates z consecutive s-mer hits into one
|
||||||
|
confirmed k-mer answer. This reduces the false-positive rate from `1/2^b` per s-mer
|
||||||
|
to `1/2^(b·z)` per k-mer, at the cost of z−1 unconfirmed positions at each sequence
|
||||||
|
break. The aggregation window spans the full query sequence, not individual superkmers,
|
||||||
|
to avoid false negatives at superkmer boundaries.
|
||||||
|
|
||||||
|
**Multi-genome.** A single index can hold any number of genomes. Each k-mer slot
|
||||||
|
carries a per-genome count or presence vector. Distance matrices, NJ/UPGMA trees,
|
||||||
|
and classification are derived from these vectors without rebuilding the index.
|
||||||
|
|
||||||
|
## Input formats
|
||||||
|
|
||||||
|
Command Formats accepted
|
||||||
|
─────────────────── ──────────────────────────────────────────────────────────────
|
||||||
|
index, superkmer FASTA (.fa .fasta), FASTQ (.fq .fastq), GenBank flat file
|
||||||
|
(.gb .gbk .gbff), all optionally gzip-compressed.
|
||||||
|
Directories expanded recursively. Streaming stdin via -.
|
||||||
|
query FASTA, FASTQ, optionally gzip-compressed. Stdin via -.
|
||||||
|
|
||||||
|
Non-ACGT characters act as hard breaks between k-mer segments in all formats.
|
||||||
|
|
||||||
|
## Commands
|
||||||
|
|
||||||
|
Command Role
|
||||||
|
───────── ────────────────────────────────────────────────────────────────────
|
||||||
|
index Build a genome index from sequence files.
|
||||||
|
Runs scatter → dereplicate → count → layered MPHF.
|
||||||
|
Resumes automatically if interrupted.
|
||||||
|
merge Merge multiple independently built indexes into one.
|
||||||
|
Schedules partitions largest-first under a memory budget semaphore
|
||||||
|
to avoid OOM on machines with many cores. The worst partition runs
|
||||||
|
alone first to calibrate the expansion estimator; subsequent
|
||||||
|
partitions run in parallel within the budget.
|
||||||
|
--budget-fraction F fraction of available RAM to use as budget
|
||||||
|
(default 0.5; reduce if OOM persists).
|
||||||
|
filter Filter and compact an existing index: apply count thresholds,
|
||||||
|
drop layers, rewrite as a single-layer index.
|
||||||
|
reindex Convert evidence in-place across all layers:
|
||||||
|
exact (evidence.bin) ↔ approximate (fingerprint.bin).
|
||||||
|
Does not touch the MPHF or unitigs.
|
||||||
|
query Query an index with FASTA/FASTQ sequences.
|
||||||
|
Annotates each sequence with per-genome k-mer match counts
|
||||||
|
and optional per-position coverage vectors (--detail).
|
||||||
|
Parallel over sequence chunks.
|
||||||
|
distance Compute a pairwise Bray-Curtis or Jaccard distance matrix
|
||||||
|
between all indexed genomes.
|
||||||
|
Optionally outputs a Newick NJ or UPGMA tree.
|
||||||
|
annotate Add or update genome metadata (taxonomy, etc.) from a CSV
|
||||||
|
file; or dump the current metadata as CSV.
|
||||||
|
estimate Dry-run: resolve and print approximate-index parameters
|
||||||
|
(z, evidence bits b, FP rates) given any two of (b, z, fp).
|
||||||
|
Does not touch any index.
|
||||||
|
dump Dump all indexed k-mers as CSV with per-genome counts or
|
||||||
|
presence flags.
|
||||||
|
superkmer Extract superkmers from a sequence file and write to stdout.
|
||||||
|
Diagnostic / pipeline use.
|
||||||
|
unitig Dump the unitig sequences stored in a built index. Debug use.
|
||||||
|
utils Miscellaneous utilities.
|
||||||
|
--new-label NEW=OLD rename a genome label in-place.
|
||||||
|
--bits-per-kmer print MPHF / evidence / matrix size breakdown.
|
||||||
|
--stats per-genome k-mer counts as CSV.
|
||||||
|
--partition-stats partition size distribution across one or more
|
||||||
|
indexes (markdown report to stdout). Useful to
|
||||||
|
diagnose minimizer imbalance before a large merge.
|
||||||
|
--csv FILE write per-(partition, source) raw data to FILE
|
||||||
|
(used with --partition-stats).
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Build an exact index for each genome independently
|
||||||
|
obikmer index --kmer-size 31 --label genome_a genome_a.fa --output index_a/
|
||||||
|
obikmer index --kmer-size 31 --label genome_b genome_b.fa --output index_b/
|
||||||
|
|
||||||
|
# Merge into a single multi-genome index
|
||||||
|
obikmer merge --output index/ index_a/ index_b/
|
||||||
|
|
||||||
|
# Convert to approximate index (z=5, 8-bit fingerprints)
|
||||||
|
obikmer reindex --approx -z 5 --evidence-bits 8 index/
|
||||||
|
|
||||||
|
# Query reads
|
||||||
|
obikmer query index/ reads.fq.gz > annotated.fa
|
||||||
|
|
||||||
|
# Pairwise distances
|
||||||
|
obikmer distance index/ > distances.tsv
|
||||||
|
```
|
||||||
|
|
||||||
|
## Parameter constraints
|
||||||
|
|
||||||
|
Parameter Constraint
|
||||||
|
───────────────────── ──────────────
|
||||||
|
k (--kmer-size) odd, 11 ≤ k ≤ 31
|
||||||
|
m (--minimizer-size) odd, 3 ≤ m ≤ k−1
|
||||||
|
z (-z, --approx only) 1 ≤ z ≤ k−1
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
Extended architecture and implementation notes are in `docmd/`. Build with
|
||||||
|
`make doc` (requires Python + MkDocs Material).
|
||||||
|
|||||||
@@ -1,3 +1,21 @@
|
|||||||
|
## A finir dans le cadre de l'extension des index à une forme approximative
|
||||||
|
|
||||||
|
- Il faut avoir un chemin explicite pour construire en mode exact avec des méthodes qui ont ce mot exact à l'intérieur.
|
||||||
|
- pub fn find_exact (src/obilayeredmap/src/mphf_layer.rs)
|
||||||
|
- pub fn build_exact_evidence (src/obilayeredmap/src/layer.rs)
|
||||||
|
|
||||||
|
Comme elles existent actuellement pour le mode approx.
|
||||||
|
|
||||||
|
Ensuite, il faudra définir des méthodes génériques
|
||||||
|
- find()
|
||||||
|
- build_evidence()
|
||||||
|
|
||||||
|
qui utilisent la bonne version suivant le mode de l'index, de manière complètement transparente.
|
||||||
|
Avec ce système, tout le reste du code devrait être insensible au fait que l'on utilise un index exact ou approximatif.
|
||||||
|
|
||||||
|
Sauf qu'avec un index approximatif, les résultats seront approximatifs.
|
||||||
|
|
||||||
|
|
||||||
## commandes à ajouter
|
## commandes à ajouter
|
||||||
|
|
||||||
- aggregate : aggrege toutes les colonnes d'une matrice d'index en une seule colonne.
|
- aggregate : aggrege toutes les colonnes d'une matrice d'index en une seule colonne.
|
||||||
@@ -6,3 +24,37 @@
|
|||||||
--detail et --mismatch à implementer
|
--detail et --mismatch à implementer
|
||||||
|
|
||||||
- status : affiche le statut de l'index
|
- status : affiche le statut de l'index
|
||||||
|
|
||||||
|
## Problème biologique sur l'identification des contaminants
|
||||||
|
|
||||||
|
Exemple de reads problématiques:
|
||||||
|
```
|
||||||
|
>LH00534:161:22WMGWLT4:4:1101:45301:1420 {"coverage":{"gbbct":[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]},"kmer_count":117,"kmer_strict_matches":{"gbbct":117}}
|
||||||
|
GCCCCTACCGTACTCCAGCTTGGTAGTTTCCACCGCCTGTCCAGGGTTGAGCCCTGGGATTTGACGGCGGACTTAAAAAGCCACCTACAGACGCTTTACGCCCAATCATTCCGGATAACGCTTGCATCCTCTGTATTACCGCGGCTGCTGG
|
||||||
|
```
|
||||||
|
|
||||||
|
Par BLAST, ce read matche une quantité invraisemblable de génomes chloroplastiques avec un match de 100% (6554 hits pour Streptophyta)
|
||||||
|
|
||||||
|
mais aussi une quantité importante de séquences d'OTU bactériennes (uncultured bacteria, 115 hits), elles aussi avec 100% de similarité.
|
||||||
|
|
||||||
|
```
|
||||||
|
Uncultured bacterium clone Otu01032 16S ribosomal RNA gene, partial sequence
|
||||||
|
Sequence ID: KX996137.1Length: 440Number of Matches: 1
|
||||||
|
Range 1: 153 to 303GenBankGraphics
|
||||||
|
Next Match
|
||||||
|
Previous Match
|
||||||
|
Alignment statistics for match #1 Score Expect Identities Gaps Strand
|
||||||
|
273 bits(302) 2e-69 151/151(100%) 0/151(0%) Plus/Minus
|
||||||
|
|
||||||
|
Query 1 GCCCCTACCGTACTCCAGCTTGGTAGTTTCCACCGCCTGTCCAGGGTTGAGCCCTGGGAT 60
|
||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||
|
Sbjct 303 GCCCCTACCGTACTCCAGCTTGGTAGTTTCCACCGCCTGTCCAGGGTTGAGCCCTGGGAT 244
|
||||||
|
|
||||||
|
Query 61 TTGACGGCGGACTTAAAAAGCCACCTACAGACGCTTTACGCCCAATCATTCCGGATAACG 120
|
||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||
|
Sbjct 243 TTGACGGCGGACTTAAAAAGCCACCTACAGACGCTTTACGCCCAATCATTCCGGATAACG 184
|
||||||
|
|
||||||
|
Query 121 CTTGCATCCTCTGTATTACCGCGGCTGCTGG 151
|
||||||
|
|||||||||||||||||||||||||||||||
|
||||||
|
Sbjct 183 CTTGCATCCTCTGTATTACCGCGGCTGCTGG 153
|
||||||
|
```
|
||||||
|
|||||||
@@ -0,0 +1,230 @@
|
|||||||
|
# Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS
|
||||||
|
BINARY := ../src/target/release/obikmer
|
||||||
|
VENV_PY := ../.venv/bin/python3
|
||||||
|
|
||||||
|
GENOMES := $(wildcard genomes/*.fna.gz)
|
||||||
|
|
||||||
|
# SPECIMENS, SPECIES, and the full dependency graph are generated by
|
||||||
|
# make_deps.py from the genome FASTA headers — like .d files in C.
|
||||||
|
# Make rebuilds deps.mk whenever genomes/ changes and restarts.
|
||||||
|
-include deps.mk
|
||||||
|
|
||||||
|
REF_NPZS := $(SPECIMENS:%=reference_index/%.npz)
|
||||||
|
REF_DIST_CSVS := $(addprefix reference_dist/, \
|
||||||
|
shared_kmers.csv hamming_dist.csv jaccard_dist.csv \
|
||||||
|
bray_curtis_dist.csv relfreq_bray_curtis_dist.csv \
|
||||||
|
euclidean_dist.csv relfreq_euclidean_dist.csv \
|
||||||
|
hellinger_dist.csv hellinger_euclidean_dist.csv)
|
||||||
|
OBIKMER_PRESENCE_DIST := $(addprefix obikmer_dist/presence/, \
|
||||||
|
jaccard_dist.csv jaccard_shared.csv jaccard_nj.nwk \
|
||||||
|
hamming_dist.csv hamming_nj.nwk)
|
||||||
|
OBIKMER_COUNT_DIST := $(addprefix obikmer_dist/count/, \
|
||||||
|
jaccard_dist.csv jaccard_shared.csv jaccard_nj.nwk \
|
||||||
|
bray_curtis_dist.csv bray_curtis_nj.nwk \
|
||||||
|
relfreq_bray_curtis_dist.csv relfreq_bray_curtis_nj.nwk \
|
||||||
|
euclidean_dist.csv euclidean_nj.nwk \
|
||||||
|
relfreq_euclidean_dist.csv relfreq_euclidean_nj.nwk \
|
||||||
|
hellinger_dist.csv hellinger_nj.nwk \
|
||||||
|
hellinger_euclidean_dist.csv hellinger_euclidean_nj.nwk)
|
||||||
|
DIST_COMPARISON := stats/dist_comparison/summary.csv
|
||||||
|
PRESENCE_DONE := $(SPECIMENS:%=specimen_index_presence/%/index.done)
|
||||||
|
PRESENCE_STATS := $(SPECIMENS:%=stats/indexing_presence/%.stats)
|
||||||
|
COUNT_DONE := $(SPECIMENS:%=specimen_index_count/%/index.done)
|
||||||
|
COUNT_STATS := $(SPECIMENS:%=stats/indexing_count/%.stats)
|
||||||
|
VERIFY_PRESENCE_STATS := $(SPECIMENS:%=stats/verify_presence/%.stats)
|
||||||
|
VERIFY_COUNT_STATS := $(SPECIMENS:%=stats/verify_count/%.stats)
|
||||||
|
SPECIFIC_PRESENCE_DONE := $(SPECIES:%=specific_index_presence/%/index.done)
|
||||||
|
SPECIFIC_PRESENCE_STATS := $(SPECIES:%=stats/specific_kmer_presence/%.stats)
|
||||||
|
SPECIFIC_COUNT_DONE := $(SPECIES:%=specific_index_count/%/index.done)
|
||||||
|
SPECIFIC_COUNT_STATS := $(SPECIES:%=stats/specific_kmer_count/%.stats)
|
||||||
|
SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/reads_R1.fastq.gz)
|
||||||
|
|
||||||
|
.NOTPARALLEL:
|
||||||
|
|
||||||
|
.PHONY: all simulate reference reference_dist \
|
||||||
|
obikmer_dist obikmer_dist_presence obikmer_dist_count \
|
||||||
|
dist_comparison \
|
||||||
|
index_presence index_count \
|
||||||
|
aggregate_index_presence aggregate_index_count \
|
||||||
|
merge_presence merge_count \
|
||||||
|
verify_presence verify_count \
|
||||||
|
aggregate_verify_presence aggregate_verify_count \
|
||||||
|
verify_merge_presence verify_merge_count \
|
||||||
|
filter_presence filter_count \
|
||||||
|
aggregate_filter_presence aggregate_filter_count
|
||||||
|
|
||||||
|
verify_merge_presence: stats/verify_merge_presence/current.csv
|
||||||
|
verify_merge_count: stats/verify_merge_count/current.csv
|
||||||
|
|
||||||
|
all: aggregate_verify_presence aggregate_verify_count \
|
||||||
|
verify_merge_presence verify_merge_count \
|
||||||
|
aggregate_filter_presence aggregate_filter_count \
|
||||||
|
dist_comparison
|
||||||
|
|
||||||
|
# ── dependency file ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Regenerate the dependency graph from the genome files.
# The output is written to a temporary file and renamed only on success, so a
# failing make_deps.py never leaves a truncated deps.mk that Make would
# consider up to date and silently include.
deps.mk: $(GENOMES)
	$(VENV_PY) make_deps.py $^ > $@.tmp
	mv $@.tmp $@
|
||||||
|
|
||||||
|
# ── simulation ────────────────────────────────────────────────────────────────
|
||||||
|
# Prerequisites (genome → reads) are in deps.mk; $< is the genome file.
|
||||||
|
|
||||||
|
$(SIMULATED_READS):
|
||||||
|
bash simulate_one.sh $< $(dir $@)
|
||||||
|
|
||||||
|
simulate: $(SIMULATED_READS)
|
||||||
|
|
||||||
|
# ── reference kmer sets ───────────────────────────────────────────────────────
|
||||||
|
# Prerequisites (reads → npz) are in deps.mk.
|
||||||
|
|
||||||
|
reference_index/%.npz:
|
||||||
|
bash build_reference.sh $*
|
||||||
|
|
||||||
|
reference: $(REF_NPZS)
|
||||||
|
|
||||||
|
# ── reference distance matrices ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
$(REF_DIST_CSVS) &: $(REF_NPZS) build_reference_dist.py
|
||||||
|
$(VENV_PY) build_reference_dist.py
|
||||||
|
|
||||||
|
reference_dist: $(REF_DIST_CSVS)
|
||||||
|
|
||||||
|
# ── obikmer distance (presence index) ────────────────────────────────────────
|
||||||
|
|
||||||
|
$(OBIKMER_PRESENCE_DIST) &: global_index_presence/index.done $(BINARY)
|
||||||
|
mkdir -p obikmer_dist/presence
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/presence/jaccard \
|
||||||
|
--metric jaccard --shared-kmers --nj \
|
||||||
|
global_index_presence
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/presence/hamming \
|
||||||
|
--metric hamming --nj \
|
||||||
|
global_index_presence
|
||||||
|
|
||||||
|
obikmer_dist_presence: $(OBIKMER_PRESENCE_DIST)
|
||||||
|
|
||||||
|
# ── obikmer distance (count index) ───────────────────────────────────────────
|
||||||
|
|
||||||
|
$(OBIKMER_COUNT_DIST) &: global_index_count/index.done $(BINARY)
|
||||||
|
mkdir -p obikmer_dist/count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/jaccard \
|
||||||
|
--metric jaccard --shared-kmers --nj \
|
||||||
|
global_index_count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/bray_curtis \
|
||||||
|
--metric bray-curtis --nj \
|
||||||
|
global_index_count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/relfreq_bray_curtis \
|
||||||
|
--metric relfreq-bray-curtis --nj \
|
||||||
|
global_index_count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/euclidean \
|
||||||
|
--metric euclidean --nj \
|
||||||
|
global_index_count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/relfreq_euclidean \
|
||||||
|
--metric relfreq-euclidean --nj \
|
||||||
|
global_index_count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/hellinger \
|
||||||
|
--metric hellinger --nj \
|
||||||
|
global_index_count
|
||||||
|
$(BINARY) distance \
|
||||||
|
--output obikmer_dist/count/hellinger_euclidean \
|
||||||
|
--metric hellinger-euclidean --nj \
|
||||||
|
global_index_count
|
||||||
|
|
||||||
|
obikmer_dist_count: $(OBIKMER_COUNT_DIST)
|
||||||
|
|
||||||
|
obikmer_dist: obikmer_dist_presence obikmer_dist_count
|
||||||
|
|
||||||
|
# ── distance comparison ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
$(DIST_COMPARISON): $(REF_DIST_CSVS) $(OBIKMER_PRESENCE_DIST) $(OBIKMER_COUNT_DIST) compare_all_dist.py
|
||||||
|
$(VENV_PY) compare_all_dist.py --out $(DIST_COMPARISON)
|
||||||
|
|
||||||
|
dist_comparison: $(DIST_COMPARISON)
|
||||||
|
|
||||||
|
# ── per-specimen indexing ─────────────────────────────────────────────────────
|
||||||
|
# Prerequisites (reads → index.done + .stats) are in deps.mk.
|
||||||
|
|
||||||
|
specimen_index_presence/%/index.done \
|
||||||
|
stats/indexing_presence/%.stats &: $(BINARY)
|
||||||
|
bash index_one_presence.sh $*
|
||||||
|
|
||||||
|
specimen_index_count/%/index.done \
|
||||||
|
stats/indexing_count/%.stats &: $(BINARY)
|
||||||
|
bash index_one_count.sh $*
|
||||||
|
|
||||||
|
index_presence: $(PRESENCE_DONE)
|
||||||
|
index_count: $(COUNT_DONE)
|
||||||
|
|
||||||
|
# ── indexing stats aggregation ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
aggregate_index_presence: $(PRESENCE_STATS)
|
||||||
|
bash aggregate_stats.sh indexing_presence
|
||||||
|
|
||||||
|
aggregate_index_count: $(COUNT_STATS)
|
||||||
|
bash aggregate_stats.sh indexing_count
|
||||||
|
|
||||||
|
# ── global merge ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY)
|
||||||
|
bash merge_presence.sh
|
||||||
|
|
||||||
|
global_index_count/index.done: $(COUNT_DONE) $(BINARY)
|
||||||
|
bash merge_count.sh
|
||||||
|
|
||||||
|
merge_presence: global_index_presence/index.done
|
||||||
|
merge_count: global_index_count/index.done
|
||||||
|
|
||||||
|
# ── per-specimen verification ─────────────────────────────────────────────────
|
||||||
|
# Prerequisites (index.done + npz → .stats) are in deps.mk.
|
||||||
|
|
||||||
|
stats/verify_presence/%.stats:
|
||||||
|
bash verify_one_presence.sh $*
|
||||||
|
|
||||||
|
stats/verify_count/%.stats:
|
||||||
|
bash verify_one_count.sh $*
|
||||||
|
|
||||||
|
verify_presence: $(VERIFY_PRESENCE_STATS)
|
||||||
|
verify_count: $(VERIFY_COUNT_STATS)
|
||||||
|
|
||||||
|
# ── verification stats aggregation ───────────────────────────────────────────
|
||||||
|
|
||||||
|
aggregate_verify_presence: $(VERIFY_PRESENCE_STATS)
|
||||||
|
bash aggregate_stats.sh verify_presence
|
||||||
|
|
||||||
|
aggregate_verify_count: $(VERIFY_COUNT_STATS)
|
||||||
|
bash aggregate_stats.sh verify_count
|
||||||
|
|
||||||
|
# ── species-specific indexes ──────────────────────────────────────────────────
|
||||||
|
# Prerequisites (global index → specific index) are in deps.mk.
|
||||||
|
|
||||||
|
specific_index_presence/%/index.done \
|
||||||
|
stats/specific_kmer_presence/%.stats &: $(BINARY)
|
||||||
|
bash filter_one_presence.sh $*
|
||||||
|
|
||||||
|
specific_index_count/%/index.done \
|
||||||
|
stats/specific_kmer_count/%.stats &: $(BINARY)
|
||||||
|
bash filter_one_count.sh $*
|
||||||
|
|
||||||
|
filter_presence: $(SPECIFIC_PRESENCE_DONE)
|
||||||
|
filter_count: $(SPECIFIC_COUNT_DONE)
|
||||||
|
|
||||||
|
aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS)
|
||||||
|
bash aggregate_stats.sh specific_kmer_presence
|
||||||
|
|
||||||
|
aggregate_filter_count: $(SPECIFIC_COUNT_STATS)
|
||||||
|
bash aggregate_stats.sh specific_kmer_count
|
||||||
|
|
||||||
|
# ── merged index verification ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done
|
||||||
|
bash verify_merge_presence.sh
|
||||||
|
|
||||||
|
stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done
|
||||||
|
bash verify_merge_count.sh
|
||||||
@@ -0,0 +1,132 @@
|
|||||||
|
# Benchmark pipeline
|
||||||
|
|
||||||
|
Requires **GNU Make ≥ 4.3** (grouped targets `&:`). On macOS use `gmake`.
|
||||||
|
|
||||||
|
```
|
||||||
|
gmake all # full pipeline
|
||||||
|
gmake simulate # simulation only
|
||||||
|
gmake reference # reference kmer sets only
|
||||||
|
```
|
||||||
|
|
||||||
|
## Pipeline overview
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
GENOMES["genomes/*.fna.gz"]
|
||||||
|
BIN["obikmer binary"]
|
||||||
|
|
||||||
|
GENOMES --> simulate
|
||||||
|
simulate --> simdata[("simulated_data/")]
|
||||||
|
|
||||||
|
simdata --> reference
|
||||||
|
reference --> refnpz[("reference_index/*.npz")]
|
||||||
|
|
||||||
|
subgraph presence ["Presence track"]
|
||||||
|
simdata --> index_presence
|
||||||
|
BIN --> index_presence
|
||||||
|
index_presence --> pres_done[("specimen_index_presence/")]
|
||||||
|
index_presence --> pres_istats[("stats/indexing_presence/")]
|
||||||
|
pres_istats --> aggregate_index_presence
|
||||||
|
|
||||||
|
pres_done --> merge_presence
|
||||||
|
BIN --> merge_presence
|
||||||
|
merge_presence --> gpres[("global_index_presence/")]
|
||||||
|
|
||||||
|
refnpz --> verify_presence
|
||||||
|
pres_done --> verify_presence
|
||||||
|
verify_presence --> vpres_stats[("stats/verify_presence/")]
|
||||||
|
vpres_stats --> aggregate_verify_presence
|
||||||
|
|
||||||
|
gpres --> filter_presence
|
||||||
|
BIN --> filter_presence
|
||||||
|
filter_presence --> spec_pres[("specific_index_presence/")]
|
||||||
|
filter_presence --> spec_pres_stats[("stats/specific_kmer_presence/")]
|
||||||
|
spec_pres_stats --> aggregate_filter_presence
|
||||||
|
|
||||||
|
refnpz --> verify_merge_presence
|
||||||
|
gpres --> verify_merge_presence
|
||||||
|
verify_merge_presence --> vmp[("stats/verify_merge_presence/")]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph count ["Count track"]
|
||||||
|
simdata --> index_count
|
||||||
|
BIN --> index_count
|
||||||
|
index_count --> count_done[("specimen_index_count/")]
|
||||||
|
index_count --> count_istats[("stats/indexing_count/")]
|
||||||
|
count_istats --> aggregate_index_count
|
||||||
|
|
||||||
|
count_done --> merge_count
|
||||||
|
BIN --> merge_count
|
||||||
|
merge_count --> gcount[("global_index_count/")]
|
||||||
|
|
||||||
|
refnpz --> verify_count
|
||||||
|
count_done --> verify_count
|
||||||
|
verify_count --> vcount_stats[("stats/verify_count/")]
|
||||||
|
vcount_stats --> aggregate_verify_count
|
||||||
|
|
||||||
|
gcount --> filter_count
|
||||||
|
BIN --> filter_count
|
||||||
|
filter_count --> spec_count[("specific_index_count/")]
|
||||||
|
filter_count --> spec_count_stats[("stats/specific_kmer_count/")]
|
||||||
|
spec_count_stats --> aggregate_filter_count
|
||||||
|
|
||||||
|
refnpz --> verify_merge_count
|
||||||
|
gcount --> verify_merge_count
|
||||||
|
verify_merge_count --> vmc[("stats/verify_merge_count/")]
|
||||||
|
end
|
||||||
|
|
||||||
|
aggregate_verify_presence --> all
|
||||||
|
aggregate_verify_count --> all
|
||||||
|
vmp --> all
|
||||||
|
vmc --> all
|
||||||
|
all -. "$(MAKE) re-eval" .-> aggregate_filter_presence
|
||||||
|
all -. "$(MAKE) re-eval" .-> aggregate_filter_count
|
||||||
|
```
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
|
||||||
|
| Target | Script | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `simulate` | `simulate_one.sh` | Simulate sequencing reads from the reference genomes |
|
||||||
|
| `reference` | `build_reference.sh` | Build reference kmer sets (`.npz`) from simulation truth |
|
||||||
|
| `index_presence` | `index_one_presence.sh` | Index each specimen (presence mode) |
|
||||||
|
| `index_count` | `index_one_count.sh` | Index each specimen (count mode) |
|
||||||
|
| `aggregate_index_presence` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (presence) |
|
||||||
|
| `aggregate_index_count` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (count) |
|
||||||
|
| `merge_presence` | `merge_presence.sh` | Merge all specimen presence indexes into a global index |
|
||||||
|
| `merge_count` | `merge_count.sh` | Merge all specimen count indexes into a global index |
|
||||||
|
| `verify_presence` | `verify_one_presence.sh` | Verify each specimen presence index against reference |
|
||||||
|
| `verify_count` | `verify_one_count.sh` | Verify each specimen count index against reference |
|
||||||
|
| `aggregate_verify_presence` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (presence) |
|
||||||
|
| `aggregate_verify_count` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (count) |
|
||||||
|
| `filter_presence` | `filter_one_presence.sh` | Extract species-specific presence indexes from global index |
|
||||||
|
| `filter_count` | `filter_one_count.sh` | Extract species-specific count indexes from global index |
|
||||||
|
| `aggregate_filter_presence` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (presence) |
|
||||||
|
| `aggregate_filter_count` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (count) |
|
||||||
|
| `verify_merge_presence` | `verify_merge_presence.sh` | Verify global presence index against all reference sets |
|
||||||
|
| `verify_merge_count` | `verify_merge_count.sh` | Verify global count index against all reference sets |
|
||||||
|
|
||||||
|
## Directory layout
|
||||||
|
|
||||||
|
```
|
||||||
|
benchmark/
|
||||||
|
├── genomes/ # input reference genomes (.fna.gz)
|
||||||
|
├── simulated_data/ # generated by simulate
|
||||||
|
│ └── <species>/<specimen>/
|
||||||
|
├── reference_index/ # reference kmer sets (.npz)
|
||||||
|
├── specimen_index_presence/ # per-specimen presence indexes
|
||||||
|
├── specimen_index_count/ # per-specimen count indexes
|
||||||
|
├── global_index_presence/ # merged global presence index
|
||||||
|
├── global_index_count/ # merged global count index
|
||||||
|
├── specific_index_presence/ # species-specific presence indexes
|
||||||
|
├── specific_index_count/ # species-specific count indexes
|
||||||
|
└── stats/ # all benchmark statistics
|
||||||
|
├── indexing_presence/
|
||||||
|
├── indexing_count/
|
||||||
|
├── verify_presence/
|
||||||
|
├── verify_count/
|
||||||
|
├── specific_kmer_presence/
|
||||||
|
├── specific_kmer_count/
|
||||||
|
├── verify_merge_presence/
|
||||||
|
└── verify_merge_count/
|
||||||
|
```
|
||||||
Executable
+53
@@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: aggregate_stats.sh TYPE
#   TYPE = indexing_presence | indexing_count
#        | verify_presence | verify_count
#        | specific_kmer_presence | specific_kmer_count
#
# Reads all stats/TYPE/*.stats files (one CSV data row each, no header) and
# writes them, each row prefixed with the run number, to stats/TYPE/run_NNN.csv.
# A new run CSV is created only if some .stats file is newer than the
# highest-numbered existing run CSV (idempotent when nothing changed).
set -euo pipefail

if [[ $# -ne 1 ]]; then
    echo "Usage: $(basename "$0") TYPE" >&2
    exit 2
fi

TYPE="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"

# Column header of the aggregated CSV; the leading "run" column is added here,
# the remaining columns come from the per-item .stats rows.
case "${TYPE}" in
    indexing_presence|indexing_count)
        HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
        ;;
    verify_presence)
        HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
        ;;
    verify_count)
        HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
        ;;
    specific_kmer_presence|specific_kmer_count)
        HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
        ;;
    *)
        echo "ERROR: unknown stats type '${TYPE}'" >&2
        exit 1
        ;;
esac

# Without this check a missing directory makes `find` fail and, under
# `set -e -o pipefail`, the script would exit without any message.
if [[ ! -d "${STATS_DIR}" ]]; then
    echo "ERROR: stats directory '${STATS_DIR}' does not exist" >&2
    exit 1
fi

# Find the highest-numbered existing run CSV (last_n stays -1 if none).
# Numbering from the highest index instead of the file count avoids
# overwriting an existing run CSV when an older one has been deleted.
last_n=-1
latest_csv=""
while IFS= read -r csv; do
    num="${csv##*/run_}"
    num="${num%.csv}"
    [[ "${num}" =~ ^[0-9]+$ ]] || continue
    # 10# forces base 10 so zero-padded numbers are not parsed as octal.
    if (( 10#${num} > last_n )); then
        last_n=$(( 10#${num} ))
        latest_csv="${csv}"
    fi
done < <(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv')

# Nothing to do when no .stats file is newer than the latest run CSV.
if [[ -n "${latest_csv}" ]] && \
   [[ -z "$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}")" ]]; then
    echo "[${TYPE}] stats up to date (${latest_csv})"
    exit 0
fi

run_n=$(printf '%03d' "$(( last_n + 1 ))")
CSV="${STATS_DIR}/run_${run_n}.csv"

echo "${HEADER}" >"${CSV}"

# Sort .stats files by name for reproducible row order.
while IFS= read -r stats_file; do
    sed "s/^/${run_n},/" "${stats_file}"
done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort) >>"${CSV}"

echo "[${TYPE}] run ${run_n} → ${CSV}"
|
||||||
Executable
+137
@@ -0,0 +1,137 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Build a reference kmer index from paired-end FASTQ reads.
|
||||||
|
|
||||||
|
Extracts canonical kmers — min(kmer, revcomp(kmer)) encoded as uint64 —
|
||||||
|
counts their abundances, and saves a sorted numpy pair (kmers, counts).
|
||||||
|
|
||||||
|
Output .npz arrays
|
||||||
|
kmers : uint64, sorted ascending — canonical kmer integers
|
||||||
|
counts : uint32, same order — raw read abundances
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import gzip
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
|
||||||
|
'a': 0, 'c': 1, 'g': 2, 't': 3}
|
||||||
|
|
||||||
|
# Lookup table: revcomp of one byte (4 bases, 8 bits).
|
||||||
|
# Precomputed once at import time.
|
||||||
|
_REVCOMP8 = [0] * 256
|
||||||
|
for _i in range(256):
|
||||||
|
_rc, _x = 0, _i
|
||||||
|
for _ in range(4):
|
||||||
|
_rc = (_rc << 2) | (3 - (_x & 3))
|
||||||
|
_x >>= 2
|
||||||
|
_REVCOMP8[_i] = _rc
|
||||||
|
del _i, _rc, _x
|
||||||
|
|
||||||
|
|
||||||
|
def revcomp_int(kmer: int, k: int) -> int:
|
||||||
|
"""Reverse-complement of a kmer encoded as an integer (2 bits/base).
|
||||||
|
|
||||||
|
Uses byte-level lookup (4 bases at a time) for speed.
|
||||||
|
"""
|
||||||
|
rc = 0
|
||||||
|
bits_left = 2 * k
|
||||||
|
while bits_left > 0:
|
||||||
|
chunk = min(8, bits_left)
|
||||||
|
rc_byte = _REVCOMP8[kmer & 0xFF] >> (8 - chunk)
|
||||||
|
rc = (rc << chunk) | rc_byte
|
||||||
|
kmer >>= chunk
|
||||||
|
bits_left -= chunk
|
||||||
|
return rc
|
||||||
|
|
||||||
|
|
||||||
|
# ── FASTQ parsing ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def iter_sequences(path: str):
    """Yield the sequence line of every record in a FASTQ file.

    Files whose name ends in ``.gz`` are opened with gzip, all others as plain
    text. Records are assumed to span exactly four lines (header, sequence,
    separator, quality); only the trailing newline is stripped from the
    sequence line.
    """
    open_fn = gzip.open if path.endswith('.gz') else open
    with open_fn(path, 'rt') as handle:
        header = handle.readline()          # '@' header; '' means end of file
        while header:
            sequence = handle.readline().rstrip('\n')
            handle.readline()               # '+' separator
            handle.readline()               # quality string
            yield sequence
            header = handle.readline()
|
||||||
|
|
||||||
|
|
||||||
|
# ── kmer counting ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def count_kmers(paths: list[str], k: int) -> dict[int, int]:
    """Count canonical kmers over all reads of the given FASTQ files.

    Each read is scanned with a rolling 2-bit window of ``k`` bases. Any
    character missing from ``_ENCODE`` (e.g. ``N``) resets the window, so no
    counted kmer spans such a character. Every full window is counted under
    the smaller of its forward and reverse-complement encodings.

    Progress is reported on stderr every 100,000 reads and once at the end.
    Returns a mapping canonical kmer integer -> number of occurrences.
    """
    kmer_mask = (1 << (2 * k)) - 1
    tally: dict[int, int] = defaultdict(int)
    reads_seen = 0

    for fastq in paths:
        for read in iter_sequences(fastq):
            reads_seen += 1
            window = 0
            valid = 0  # consecutive valid bases currently in the window

            for base in read:
                code = _ENCODE.get(base)
                if code is not None:
                    window = ((window << 2) | code) & kmer_mask
                    valid += 1
                    if valid >= k:
                        tally[min(window, revcomp_int(window, k))] += 1
                else:
                    # N or unexpected character: start a fresh window
                    window = 0
                    valid = 0

            if reads_seen % 100_000 == 0:
                print(f' {reads_seen:,} reads processed, '
                      f'{len(tally):,} distinct kmers so far',
                      file=sys.stderr)

    print(f' {reads_seen:,} reads total, {len(tally):,} distinct kmers',
          file=sys.stderr)
    return tally
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: count canonical kmers and write them to .npz.

    Parses the arguments, counts kmers over all input FASTQ files, optionally
    drops kmers below ``--min-abundance``, and saves two parallel arrays
    (``kmers`` as uint64 sorted ascending, ``counts`` as uint32) with
    ``np.savez_compressed``. Progress messages go to stderr.

    Exits through ``argparse`` with a usage error when ``--kmer-size`` is not
    in 1..32: kmers are stored 2 bits per base in a uint64, so a larger k would
    only fail with an overflow after all reads have been counted.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reads', nargs='+', metavar='FASTQ',
                    help='Input reads (FASTQ, gzip OK)')
    ap.add_argument('-k', '--kmer-size', type=int, default=31,
                    metavar='K')
    ap.add_argument('--min-abundance', type=int, default=1,
                    metavar='N', help='Drop kmers with count < N (default 1)')
    ap.add_argument('-o', '--output', required=True,
                    metavar='FILE', help='Output .npz path')
    args = ap.parse_args()

    if not 1 <= args.kmer_size <= 32:
        ap.error(f'--kmer-size must be between 1 and 32 (got {args.kmer_size})')

    print(f'k={args.kmer_size} files={len(args.reads)}', file=sys.stderr)
    counts = count_kmers(args.reads, args.kmer_size)

    if args.min_abundance > 1:
        before = len(counts)
        counts = {k: v for k, v in counts.items() if v >= args.min_abundance}
        print(f' min-abundance={args.min_abundance}: '
              f'{before - len(counts):,} kmers dropped, '
              f'{len(counts):,} retained',
              file=sys.stderr)

    print(f'Sorting and saving → {args.output}', file=sys.stderr)
    # Sort the (kmer, count) pairs once so both arrays share the same order
    # without a second dictionary lookup per kmer.
    ordered = sorted(counts.items())
    kmers_arr = np.fromiter((kmer for kmer, _ in ordered),
                            dtype=np.uint64, count=len(ordered))
    counts_arr = np.fromiter((n for _, n in ordered),
                             dtype=np.uint32, count=len(ordered))

    np.savez_compressed(args.output, kmers=kmers_arr, counts=counts_arr)
    print(f'Done {len(kmers_arr):,} kmers → {args.output}', file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+39
@@ -0,0 +1,39 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: build_reference.sh [SPECIMEN ...]
#
# Builds reference kmer sets (reference_index/SPECIES--STRAIN.npz) from the
# simulated paired-end reads in simulated_data/SPECIES/STRAIN/.
#
# With one or more SPECIMEN arguments (of the form SPECIES--STRAIN) only those
# specimens are built, and missing reads are an error. With no argument every
# specimen found under simulated_data/ is built and specimens without reads
# are skipped.
#
# Environment: KMER_SIZE (default 31), MIN_ABUNDANCE (default 1).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SIMDATA_DIR="${SCRIPT_DIR}/simulated_data"
REF_DIR="${SCRIPT_DIR}/reference_index"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
BUILD_PY="${SCRIPT_DIR}/build_reference.py"

KMER_SIZE="${KMER_SIZE:-31}"
MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}"

mkdir -p "${REF_DIR}"

# reads_available DIR — true when both paired read files exist in DIR.
reads_available() {
    [[ -f "$1/reads_R1.fastq.gz" && -f "$1/reads_R2.fastq.gz" ]]
}

# build_one SPECIMEN DIR — build reference_index/SPECIMEN.npz from the reads in DIR.
build_one() {
    local specimen="$1" reads_dir="$2"
    local out="${REF_DIR}/${specimen}.npz"
    echo "[${specimen}] → ${out}"

    "${PYTHON}" "${BUILD_PY}" \
        --kmer-size "${KMER_SIZE}" \
        --min-abundance "${MIN_ABUNDANCE}" \
        --output "${out}" \
        "${reads_dir}/reads_R1.fastq.gz" "${reads_dir}/reads_R2.fastq.gz"
}

# Explicit specimens: SPECIES--STRAIN maps to simulated_data/SPECIES/STRAIN.
if [[ $# -gt 0 ]]; then
    for specimen in "$@"; do
        reads_dir="${SIMDATA_DIR}/${specimen//--//}"
        if ! reads_available "${reads_dir}"; then
            echo "ERROR: ${specimen}: reads not found in ${reads_dir}" >&2
            exit 1
        fi
        build_one "${specimen}" "${reads_dir}"
    done
    exit 0
fi

# No argument: scan every species/strain directory.
for species_dir in "${SIMDATA_DIR}"/*/; do
    [[ -d "${species_dir}" ]] || continue
    species=$(basename "${species_dir}")

    for strain_dir in "${species_dir}"*/; do
        [[ -d "${strain_dir}" ]] || continue
        strain=$(basename "${strain_dir}")

        if ! reads_available "${strain_dir%/}"; then
            echo "SKIP ${species}--${strain}: reads not found" >&2
            continue
        fi

        build_one "${species}--${strain}" "${strain_dir%/}"
    done
done
|
||||||
Executable
+226
@@ -0,0 +1,226 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Compute reference pairwise distance matrices from per-specimen .npz kmer indexes.
|
||||||
|
|
||||||
|
Reads all .npz files in reference_index/ (each containing sorted uint64 `kmers`
|
||||||
|
and uint32 `counts`), computes all distance metrics supported by `obikmer distance`,
|
||||||
|
and writes one CSV per metric to reference_dist/.
|
||||||
|
|
||||||
|
Output CSV format matches `obikmer distance --output`:
|
||||||
|
- first row: "genome", then specimen names
|
||||||
|
- subsequent rows: specimen name, then float or int values
|
||||||
|
|
||||||
|
Metrics written
|
||||||
|
jaccard_dist.csv Jaccard distance (presence/absence)
hamming_dist.csv Hamming distance (presence/absence)
|
||||||
|
shared_kmers.csv Shared-kmer count matrix (intersection size)
|
||||||
|
bray_curtis_dist.csv Bray-Curtis dissimilarity (raw counts)
|
||||||
|
relfreq_bray_curtis_dist.csv Bray-Curtis on relative frequencies
|
||||||
|
euclidean_dist.csv Euclidean distance (raw counts)
|
||||||
|
relfreq_euclidean_dist.csv Euclidean distance on relative frequencies
|
||||||
|
hellinger_dist.csv Hellinger distance
|
||||||
|
hellinger_euclidean_dist.csv Euclidean distance in Hellinger space
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── pairwise helpers ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def shared_indices(a_kmers: np.ndarray, b_kmers: np.ndarray):
    """Return index arrays (idx_a, idx_b) for kmers present in both sets.

    Both arrays must be sorted uint64. Uses searchsorted: O(|B| log |A|).

    ``idx_a[i]`` and ``idx_b[i]`` are the positions of the same shared kmer in
    ``a_kmers`` and ``b_kmers`` respectively. If either input is empty, two
    empty index arrays are returned.
    """
    # With an empty a_kmers the clip below would turn every position into -1
    # and the lookup would raise IndexError, so handle empty inputs up front.
    if len(a_kmers) == 0 or len(b_kmers) == 0:
        return np.empty(0, dtype=np.intp), np.empty(0, dtype=np.intp)

    pos = np.searchsorted(a_kmers, b_kmers)
    # Kmers larger than every element of a_kmers get position len(a_kmers);
    # clamp them to the last valid index (the equality test rejects them).
    pos = np.clip(pos, 0, len(a_kmers) - 1)
    mask = a_kmers[pos] == b_kmers
    idx_b = np.where(mask)[0]
    idx_a = pos[idx_b]
    return idx_a, idx_b
|
||||||
|
|
||||||
|
|
||||||
|
def pairwise_stats(specimens: list[dict]) -> dict[str, np.ndarray]:
    """Build every pairwise comparison matrix in one sweep over specimen pairs.

    Each entry of `specimens` is a dict with keys `name`, `kmers` and `counts`;
    `kmers` is handed to `shared_indices`, so it must be sorted.

    Returns a dict metric_name → n×n ndarray (uint64 for `shared_kmers`,
    float64 for everything else).  Only off-diagonal cells are filled, both
    triangles with the same value; the diagonal stays at zero.
    """
    size = len(specimens)

    # Per-specimen scalars reused by every pair.
    distinct = np.array([len(sp['kmers']) for sp in specimens], dtype=np.uint64)
    totals = np.array([sp['counts'].sum() for sp in specimens], dtype=np.uint64)
    # Σ count² per specimen, needed by the Euclidean decompositions.
    squares = np.array([(sp['counts'].astype(np.float64) ** 2).sum() for sp in specimens])

    # Output matrices, keyed by the metric name returned to the caller.
    out = {'shared_kmers': np.zeros((size, size), dtype=np.uint64)}
    for metric in ('hamming_dist',
                   'jaccard_dist',
                   'bray_curtis_dist',
                   'relfreq_bray_curtis_dist',
                   'euclidean_dist',
                   'relfreq_euclidean_dist',
                   'hellinger_dist',
                   'hellinger_euclidean_dist'):
        out[metric] = np.zeros((size, size), dtype=np.float64)

    root_two = np.sqrt(2.0)

    for row in range(size):
        kmers_a = specimens[row]['kmers']
        counts_a = specimens[row]['counts'].astype(np.float64)
        total_a = float(totals[row])
        n_a = int(distinct[row])

        for col in range(row + 1, size):
            kmers_b = specimens[col]['kmers']
            counts_b = specimens[col]['counts'].astype(np.float64)
            total_b = float(totals[col])
            n_b = int(distinct[col])

            hits_a, hits_b = shared_indices(kmers_a, kmers_b)
            n_shared = len(hits_a)

            # Counts of the shared kmers, aligned element by element.
            shared_a = counts_a[hits_a]
            shared_b = counts_b[hits_b]
            products = shared_a * shared_b

            # ── Presence metrics ──────────────────────────────────────────────
            n_union = n_a + n_b - n_shared
            pair = {
                'shared_kmers': n_shared,
                'hamming_dist': float(n_a + n_b - 2 * n_shared),  # |A Δ B|
                'jaccard_dist': (1.0 - n_shared / n_union) if n_union else 0.0,
            }

            # ── Count metrics ─────────────────────────────────────────────────

            # Bray-Curtis: 1 - 2*Σmin(a,b) / (Σa + Σb)
            min_total = np.minimum(shared_a, shared_b).sum()
            grand_total = total_a + total_b
            if grand_total:
                pair['bray_curtis_dist'] = 1.0 - 2.0 * min_total / grand_total
            else:
                pair['bray_curtis_dist'] = 0.0

            # Euclidean: √(Σa² + Σb² - 2·Σ(a·b)_shared); clamp tiny negatives
            # produced by rounding before taking the root.
            eucl_sq = squares[row] + squares[col] - 2.0 * products.sum()
            pair['euclidean_dist'] = np.sqrt(max(eucl_sq, 0.0))

            if total_a and total_b:
                rel_a = shared_a / total_a
                rel_b = shared_b / total_b

                # RelfreqBray: 1 - Σmin(a/sa, b/sb)  [only shared kmers contribute]
                pair['relfreq_bray_curtis_dist'] = 1.0 - np.minimum(rel_a, rel_b).sum()

                # RelfreqEuclidean:
                #   √(Σa²/sa² + Σb²/sb² - 2·Σ(a·b)_shared/(sa·sb))
                rel_cross = (rel_a * rel_b).sum()
                rel_sq = (squares[row] / total_a**2
                          + squares[col] / total_b**2
                          - 2.0 * rel_cross)
                pair['relfreq_euclidean_dist'] = np.sqrt(max(rel_sq, 0.0))

                # Hellinger partial: Σ(√(a/sa) - √(b/sb))²
                #   = 2 - 2·Σ√(a·b)_shared / √(sa·sb)
                affinity = np.sqrt(products).sum() / np.sqrt(total_a * total_b)
                hell_sq = max(2.0 - 2.0 * affinity, 0.0)
            else:
                # At least one specimen has a zero total count.
                pair['relfreq_bray_curtis_dist'] = 0.0
                pair['relfreq_euclidean_dist'] = 0.0
                hell_sq = 0.0

            pair['hellinger_dist'] = np.sqrt(hell_sq) / root_two
            pair['hellinger_euclidean_dist'] = np.sqrt(hell_sq)

            # ── Fill symmetric matrices ───────────────────────────────────────
            for metric, value in pair.items():
                out[metric][row, col] = value
                out[metric][col, row] = value

    return out
|
||||||
|
|
||||||
|
|
||||||
|
# ── I/O ───────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def write_csv(path: Path, labels: list[str], mat: np.ndarray, fmt: str) -> None:
    """Write a labelled square matrix to `path` as CSV and log the path on stderr.

    The header row is 'genome' followed by `labels`; every following row is a
    label followed by that row's values rendered with the format spec `fmt`.
    """
    columns = range(len(labels))
    with path.open('w') as out:
        out.write('genome,' + ','.join(labels) + '\n')
        for r, name in enumerate(labels):
            cells = [format(mat[r, c], fmt) for c in columns]
            out.write(f'{name},' + ','.join(cells) + '\n')
    print(f' → {path}', file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: load the .npz indexes, compute and write all matrices.

    Reads every ``*.npz`` file of ``--ref-dir`` (arrays ``kmers`` and ``counts``),
    computes the pairwise matrices with `pairwise_stats` and writes one CSV per
    metric into ``--out-dir``.  Exits with status 1 when no .npz file is found.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('--ref-dir', default='reference_index',
                    help='Directory with per-specimen .npz files (default: reference_index)')
    ap.add_argument('--out-dir', default='reference_dist',
                    help='Output directory for CSV files (default: reference_dist)')
    args = ap.parse_args()

    ref_dir = Path(args.ref_dir)
    out_dir = Path(args.out_dir)
    # parents=True so that a nested --out-dir (e.g. results/dist) can be created.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Sorted so that the specimen order (and hence the CSV layout) is stable.
    npz_files = sorted(ref_dir.glob('*.npz'))
    if not npz_files:
        print(f'ERROR: no .npz files found in {ref_dir}', file=sys.stderr)
        sys.exit(1)

    print(f'Loading {len(npz_files)} specimen(s) from {ref_dir}/', file=sys.stderr)
    specimens = []
    for f in npz_files:
        # Pull both arrays into memory and close the archive right away instead
        # of keeping one open file handle per specimen for the whole run.
        with np.load(f) as data:
            kmers = data['kmers']
            counts = data['counts']
        specimens.append({
            'name': f.stem,
            'kmers': kmers,
            'counts': counts,
        })
        print(f' {f.stem}: {len(kmers):,} kmers', file=sys.stderr)

    labels = [s['name'] for s in specimens]
    n = len(labels)
    print(f'\nComputing pairwise distances for {n} specimens…', file=sys.stderr)

    matrices = pairwise_stats(specimens)

    print(f'\nWriting CSVs to {out_dir}/', file=sys.stderr)
    # (metric name, format spec); the CSV file is named '<metric>.csv'.
    outputs = (
        ('shared_kmers', 'd'),
        ('hamming_dist', '.6f'),
        ('jaccard_dist', '.6f'),
        ('bray_curtis_dist', '.6f'),
        ('relfreq_bray_curtis_dist', '.6f'),
        ('euclidean_dist', '.6f'),
        ('relfreq_euclidean_dist', '.6f'),
        ('hellinger_dist', '.6f'),
        ('hellinger_euclidean_dist', '.6f'),
    )
    for metric, fmt in outputs:
        write_csv(out_dir / f'{metric}.csv', labels, matrices[metric], fmt)

    print('\nDone.', file=sys.stderr)


if __name__ == '__main__':
    main()
|
||||||
Executable
+182
@@ -0,0 +1,182 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Compare all reference distance matrices against obikmer distance outputs.
|
||||||
|
|
||||||
|
Reads from:
|
||||||
|
reference_dist/ — ground-truth matrices computed by build_reference_dist.py
|
||||||
|
obikmer_dist/ — matrices produced by `obikmer distance`
|
||||||
|
|
||||||
|
Handles label reordering: both matrices are sorted by genome label before
|
||||||
|
element-wise comparison, so column/row order differences are irrelevant.
|
||||||
|
|
||||||
|
Output: stats/dist_comparison/summary.csv
|
||||||
|
comparison,max_abs,mean_abs,rmse,n_pairs,status
|
||||||
|
"""
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── CSV loading ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_matrix(path: Path) -> tuple[list[str], np.ndarray]:
    """Load a distance-matrix CSV; return (sorted_labels, matrix_float64).

    The first CSV row is the header: its first cell is ignored and the rest are
    the column labels.  Every other row is a row label followed by its values.
    Rows and columns of the returned matrix are both ordered by the sorted row
    labels, so the original row/column order of the file does not matter.
    Blank lines are ignored.

    Raises:
        ValueError: if the file is empty, or a value cannot be parsed as float.
        KeyError: if a row label has no matching column in the header.
    """
    # newline='' is the opening mode recommended for the csv module.
    with path.open(newline='') as fh:
        reader = csv.reader(fh)
        first = next(reader, None)
        if first is None:
            raise ValueError(f'{path}: empty CSV file')
        header = first[1:]  # skip 'genome' column
        raw: dict[str, list[float]] = {}
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line (e.g. a trailing one);
                # row[0] would raise IndexError.
                continue
            raw[row[0]] = [float(x) for x in row[1:]]

    label_to_col = {h: i for i, h in enumerate(header)}
    labels = sorted(raw.keys())
    n = len(labels)
    # Column position of every sorted label, looked up once instead of n² times.
    cols = [label_to_col[name] for name in labels]
    mat = np.zeros((n, n), dtype=np.float64)
    for i, ri in enumerate(labels):
        values = raw[ri]
        for j, col in enumerate(cols):
            mat[i, j] = values[col]
    return labels, mat
|
||||||
|
|
||||||
|
|
||||||
|
# ── comparison ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare(label: str,
            ref_path: Path,
            obi_path: Path,
            tol: float = 1e-4) -> dict:
    """Compare one reference matrix with one obikmer matrix, off-diagonal only.

    Args:
        label: name of the comparison, copied into the 'comparison' field.
        ref_path: CSV of the reference matrix.
        obi_path: CSV of the matrix under test.
        tol: largest absolute difference still reported as PASS.

    Returns:
        A dict with keys comparison, max_abs, mean_abs, rmse, n_pairs, status.
        status is PASS or FAIL when the matrices were compared; otherwise it is
        REF_MISSING, OBI_MISSING, LABEL_MISMATCH or NO_PAIRS and the statistic
        fields are empty strings.
    """
    def skipped(status: str, n_pairs: str = '') -> dict:
        # Summary row for a comparison that produced no statistics.
        return {'comparison': label, 'status': status,
                'max_abs': '', 'mean_abs': '', 'rmse': '', 'n_pairs': n_pairs}

    if not ref_path.exists():
        return skipped('REF_MISSING')
    if not obi_path.exists():
        return skipped('OBI_MISSING')

    ref_labels, ref_mat = load_matrix(ref_path)
    obi_labels, obi_mat = load_matrix(obi_path)

    if ref_labels != obi_labels:
        only_ref = sorted(set(ref_labels) - set(obi_labels))
        only_obi = sorted(set(obi_labels) - set(ref_labels))
        print(f' [{label}] label mismatch — '
              f'only_ref={only_ref} only_obi={only_obi}', file=sys.stderr)
        return skipped('LABEL_MISMATCH')

    n = len(ref_labels)
    # Off-diagonal mask: the diagonal is excluded from the comparison.
    mask = ~np.eye(n, dtype=bool)
    diff = np.abs(ref_mat[mask] - obi_mat[mask])
    n_pairs = diff.size

    if n_pairs == 0:
        # Fewer than two specimens: nothing off-diagonal, and max()/mean() of
        # an empty array would raise or return NaN.
        print(f' [{label}] no off-diagonal pairs to compare', file=sys.stderr)
        return skipped('NO_PAIRS', n_pairs='0')

    max_abs = float(diff.max())
    mean_abs = float(diff.mean())
    rmse = float(np.sqrt((diff ** 2).mean()))
    # A NaN difference makes this comparison False, i.e. FAIL.
    status = 'PASS' if max_abs <= tol else 'FAIL'

    print(f' [{label}] n={n_pairs} '
          f'max={max_abs:.3e} mean={mean_abs:.3e} rmse={rmse:.3e} {status}',
          file=sys.stderr)
    return {
        'comparison': label,
        'max_abs': f'{max_abs:.6e}',
        'mean_abs': f'{mean_abs:.6e}',
        'rmse': f'{rmse:.6e}',
        'n_pairs': str(n_pairs),
        'status': status,
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ── comparison table ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Each entry is (label, ref_csv, obikmer_csv), derived from
# (index kind, obikmer metric name, reference metric name):
#   label       = '<index>/<obikmer name>'
#   ref_csv     = 'reference_dist/<reference name>.csv'
#   obikmer_csv = 'obikmer_dist/<index>/<obikmer name>.csv'
# The reference jaccard/shared matrices are presence-based, so they should match
# both presence/jaccard and count/jaccard (threshold=1).
COMPARISONS = [
    (f'{index}/{obi_name}',
     f'reference_dist/{ref_name}.csv',
     f'obikmer_dist/{index}/{obi_name}.csv')
    for index, obi_name, ref_name in [
        # ── presence index ────────────────────────────────────────────────────
        ('presence', 'jaccard_dist', 'jaccard_dist'),
        ('presence', 'jaccard_shared', 'shared_kmers'),
        ('presence', 'hamming_dist', 'hamming_dist'),
        # ── count index (jaccard cross-check) ─────────────────────────────────
        ('count', 'jaccard_dist', 'jaccard_dist'),
        ('count', 'jaccard_shared', 'shared_kmers'),
        # ── count index (count-based metrics) ─────────────────────────────────
        ('count', 'bray_curtis_dist', 'bray_curtis_dist'),
        ('count', 'relfreq_bray_curtis_dist', 'relfreq_bray_curtis_dist'),
        ('count', 'euclidean_dist', 'euclidean_dist'),
        ('count', 'relfreq_euclidean_dist', 'relfreq_euclidean_dist'),
        ('count', 'hellinger_dist', 'hellinger_dist'),
        ('count', 'hellinger_euclidean_dist', 'hellinger_euclidean_dist'),
    ]
]
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Run every comparison of COMPARISONS and write the summary CSV.

    Progress and the final PASS/FAIL/SKIP tally go to stderr.  The process
    exits with status 1 when at least one comparison is FAIL; comparisons whose
    status is neither PASS nor FAIL are counted as SKIP.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '--tol', type=float, default=1e-4,
        help='Max abs diff threshold for PASS/FAIL (default 1e-4)',
    )
    parser.add_argument(
        '--out', default='stats/dist_comparison/summary.csv',
        help='Output summary CSV path',
    )
    opts = parser.parse_args()

    summary_path = Path(opts.out)
    summary_path.parent.mkdir(parents=True, exist_ok=True)

    print(f'Comparing {len(COMPARISONS)} matrix pairs…', file=sys.stderr)
    results = [
        compare(name, Path(ref_csv), Path(obi_csv), tol=opts.tol)
        for name, ref_csv, obi_csv in COMPARISONS
    ]

    columns = ['comparison', 'max_abs', 'mean_abs', 'rmse', 'n_pairs', 'status']
    with summary_path.open('w', newline='') as fh:
        writer = csv.DictWriter(fh, fieldnames=columns)
        writer.writeheader()
        for record in results:
            writer.writerow(record)

    print(f'\n→ {summary_path}', file=sys.stderr)

    statuses = [record.get('status') for record in results]
    passed = statuses.count('PASS')
    failed = statuses.count('FAIL')
    skipped = len(results) - passed - failed
    print(f'Summary: {passed} PASS {failed} FAIL {skipped} SKIP', file=sys.stderr)
    if failed:
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||||
@@ -0,0 +1,199 @@
|
|||||||
|
SPECIMENS := Escherichia_coli--K-12_MG1655 Escherichia_coli--EDL933 Salmonella_enterica--LT2 Escherichia_coli--CFT073 Bacillus_subtilis--168 Salmonella_enterica--P125109 Shouchella_clausii--KSM-K16 Escherichia_coli--K-12_W3110 Klebsiella_pneumoniae--MGH_78578 Opitutus_terrae--PB90-1 Saccharolobus_islandicus--M.16.4 Acidobacterium_capsulatum--ATCC_51196 Salmonella_enterica--AKU_12601 Proteus_mirabilis--HI4320 Salmonella_enterica--CT18 Klebsiella_pneumoniae--HS11286 Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 Klebsiella_pneumoniae--ATCC_13883 Yersinia_ruckeri--YRB Candidozyma_auris--GCF_003013715.1_ASM301371v2
|
||||||
|
SPECIES := Escherichia_coli Salmonella_enterica Bacillus_subtilis Shouchella_clausii Klebsiella_pneumoniae Opitutus_terrae Saccharolobus_islandicus Acidobacterium_capsulatum Proteus_mirabilis Wolbachia_endosymbiont Yersinia_ruckeri Candidozyma_auris
|
||||||
|
|
||||||
|
# Escherichia_coli--K-12_MG1655
|
||||||
|
simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz: genomes/GCF_000005845.2_ASM584v2_genomic.fna.gz
|
||||||
|
reference_index/Escherichia_coli--K-12_MG1655.npz: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done stats/indexing_presence/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Escherichia_coli--K-12_MG1655/index.done stats/indexing_count/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done
|
||||||
|
stats/verify_count/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_count/Escherichia_coli--K-12_MG1655/index.done
|
||||||
|
|
||||||
|
# Escherichia_coli--EDL933
|
||||||
|
simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz: genomes/GCF_000006665.1_ASM666v1_genomic.fna.gz
|
||||||
|
reference_index/Escherichia_coli--EDL933.npz: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Escherichia_coli--EDL933/index.done stats/indexing_presence/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Escherichia_coli--EDL933/index.done stats/indexing_count/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_presence/Escherichia_coli--EDL933/index.done
|
||||||
|
stats/verify_count/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_count/Escherichia_coli--EDL933/index.done
|
||||||
|
|
||||||
|
# Salmonella_enterica--LT2
|
||||||
|
simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz: genomes/GCF_000006945.2_ASM694v2_genomic.fna.gz
|
||||||
|
reference_index/Salmonella_enterica--LT2.npz: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Salmonella_enterica--LT2/index.done stats/indexing_presence/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Salmonella_enterica--LT2/index.done stats/indexing_count/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_presence/Salmonella_enterica--LT2/index.done
|
||||||
|
stats/verify_count/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_count/Salmonella_enterica--LT2/index.done
|
||||||
|
|
||||||
|
# Escherichia_coli--CFT073
|
||||||
|
simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz: genomes/GCF_000007445.1_ASM744v1_genomic.fna.gz
|
||||||
|
reference_index/Escherichia_coli--CFT073.npz: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Escherichia_coli--CFT073/index.done stats/indexing_presence/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Escherichia_coli--CFT073/index.done stats/indexing_count/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_presence/Escherichia_coli--CFT073/index.done
|
||||||
|
stats/verify_count/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_count/Escherichia_coli--CFT073/index.done
|
||||||
|
|
||||||
|
# Bacillus_subtilis--168
|
||||||
|
simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz: genomes/GCF_000009045.1_ASM904v1_genomic.fna.gz
|
||||||
|
reference_index/Bacillus_subtilis--168.npz: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Bacillus_subtilis--168/index.done stats/indexing_presence/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Bacillus_subtilis--168/index.done stats/indexing_count/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_presence/Bacillus_subtilis--168/index.done
|
||||||
|
stats/verify_count/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_count/Bacillus_subtilis--168/index.done
|
||||||
|
|
||||||
|
# Salmonella_enterica--P125109
|
||||||
|
simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz: genomes/GCF_000009505.1_ASM950v1_genomic.fna.gz
|
||||||
|
reference_index/Salmonella_enterica--P125109.npz: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Salmonella_enterica--P125109/index.done stats/indexing_presence/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Salmonella_enterica--P125109/index.done stats/indexing_count/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_presence/Salmonella_enterica--P125109/index.done
|
||||||
|
stats/verify_count/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_count/Salmonella_enterica--P125109/index.done
|
||||||
|
|
||||||
|
# Shouchella_clausii--KSM-K16
|
||||||
|
simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz: genomes/GCF_000009825.1_ASM982v1_genomic.fna.gz
|
||||||
|
reference_index/Shouchella_clausii--KSM-K16.npz: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Shouchella_clausii--KSM-K16/index.done stats/indexing_presence/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Shouchella_clausii--KSM-K16/index.done stats/indexing_count/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_presence/Shouchella_clausii--KSM-K16/index.done
|
||||||
|
stats/verify_count/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_count/Shouchella_clausii--KSM-K16/index.done
|
||||||
|
|
||||||
|
# Escherichia_coli--K-12_W3110
|
||||||
|
simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz: genomes/GCF_000010245.2_ASM1024v1_genomic.fna.gz
|
||||||
|
reference_index/Escherichia_coli--K-12_W3110.npz: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Escherichia_coli--K-12_W3110/index.done stats/indexing_presence/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Escherichia_coli--K-12_W3110/index.done stats/indexing_count/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_presence/Escherichia_coli--K-12_W3110/index.done
|
||||||
|
stats/verify_count/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_count/Escherichia_coli--K-12_W3110/index.done
|
||||||
|
|
||||||
|
# Klebsiella_pneumoniae--MGH_78578
|
||||||
|
simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz: genomes/GCF_000016305.1_ASM1630v1_genomic.fna.gz
|
||||||
|
reference_index/Klebsiella_pneumoniae--MGH_78578.npz: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_presence/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_count/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done
|
||||||
|
stats/verify_count/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done
|
||||||
|
|
||||||
|
# Opitutus_terrae--PB90-1
|
||||||
|
simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz: genomes/GCF_000019965.1_ASM1996v1_genomic.fna.gz
|
||||||
|
reference_index/Opitutus_terrae--PB90-1.npz: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Opitutus_terrae--PB90-1/index.done stats/indexing_presence/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Opitutus_terrae--PB90-1/index.done stats/indexing_count/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_presence/Opitutus_terrae--PB90-1/index.done
|
||||||
|
stats/verify_count/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_count/Opitutus_terrae--PB90-1/index.done
|
||||||
|
|
||||||
|
# Saccharolobus_islandicus--M.16.4
|
||||||
|
simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz: genomes/GCF_000022445.1_ASM2244v1_genomic.fna.gz
|
||||||
|
reference_index/Saccharolobus_islandicus--M.16.4.npz: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_presence/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_count/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done
|
||||||
|
stats/verify_count/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done
|
||||||
|
|
||||||
|
# Acidobacterium_capsulatum--ATCC_51196
|
||||||
|
simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz: genomes/GCF_000022565.1_ASM2256v1_genomic.fna.gz
|
||||||
|
reference_index/Acidobacterium_capsulatum--ATCC_51196.npz: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_presence/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_count/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done
|
||||||
|
stats/verify_count/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done
|
||||||
|
|
||||||
|
# Salmonella_enterica--AKU_12601
|
||||||
|
simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz: genomes/GCF_000026565.1_ASM2656v1_genomic.fna.gz
|
||||||
|
reference_index/Salmonella_enterica--AKU_12601.npz: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Salmonella_enterica--AKU_12601/index.done stats/indexing_presence/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Salmonella_enterica--AKU_12601/index.done stats/indexing_count/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_presence/Salmonella_enterica--AKU_12601/index.done
|
||||||
|
stats/verify_count/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_count/Salmonella_enterica--AKU_12601/index.done
|
||||||
|
|
||||||
|
# Proteus_mirabilis--HI4320
|
||||||
|
simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz: genomes/GCF_000069965.1_ASM6996v1_genomic.fna.gz
|
||||||
|
reference_index/Proteus_mirabilis--HI4320.npz: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Proteus_mirabilis--HI4320/index.done stats/indexing_presence/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Proteus_mirabilis--HI4320/index.done stats/indexing_count/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_presence/Proteus_mirabilis--HI4320/index.done
|
||||||
|
stats/verify_count/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_count/Proteus_mirabilis--HI4320/index.done
|
||||||
|
|
||||||
|
# Salmonella_enterica--CT18
|
||||||
|
simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz: genomes/GCF_000195995.1_ASM19599v1_genomic.fna.gz
|
||||||
|
reference_index/Salmonella_enterica--CT18.npz: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Salmonella_enterica--CT18/index.done stats/indexing_presence/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Salmonella_enterica--CT18/index.done stats/indexing_count/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_presence/Salmonella_enterica--CT18/index.done
|
||||||
|
stats/verify_count/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_count/Salmonella_enterica--CT18/index.done
|
||||||
|
|
||||||
|
# Klebsiella_pneumoniae--HS11286
|
||||||
|
simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz: genomes/GCF_000240185.1_ASM24018v2_genomic.fna.gz
|
||||||
|
reference_index/Klebsiella_pneumoniae--HS11286.npz: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_presence/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_count/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done
|
||||||
|
stats/verify_count/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done
|
||||||
|
|
||||||
|
# Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1
|
||||||
|
simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz: genomes/GCF_000306885.1_ASM30688v1_genomic.fna.gz
|
||||||
|
reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
|
||||||
|
stats/verify_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
|
||||||
|
|
||||||
|
# Klebsiella_pneumoniae--ATCC_13883
|
||||||
|
simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz: genomes/GCF_000742135.1_ASM74213v1_genomic.fna.gz
|
||||||
|
reference_index/Klebsiella_pneumoniae--ATCC_13883.npz: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_presence/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_count/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done
|
||||||
|
stats/verify_count/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done
|
||||||
|
|
||||||
|
# Yersinia_ruckeri--YRB
|
||||||
|
simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz: genomes/GCF_000834255.1_ASM83425v1_genomic.fna.gz
|
||||||
|
reference_index/Yersinia_ruckeri--YRB.npz: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Yersinia_ruckeri--YRB/index.done stats/indexing_presence/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Yersinia_ruckeri--YRB/index.done stats/indexing_count/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_presence/Yersinia_ruckeri--YRB/index.done
|
||||||
|
stats/verify_count/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_count/Yersinia_ruckeri--YRB/index.done
|
||||||
|
|
||||||
|
# Candidozyma_auris--GCF_003013715.1_ASM301371v2
|
||||||
|
simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz: genomes/GCF_003013715.1_ASM301371v2_genomic.fna.gz
|
||||||
|
reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
|
||||||
|
stats/verify_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
|
||||||
|
|
||||||
|
# Escherichia_coli
|
||||||
|
specific_index_presence/Escherichia_coli/index.done stats/specific_kmer_presence/Escherichia_coli.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Escherichia_coli/index.done stats/specific_kmer_count/Escherichia_coli.stats: global_index_count/index.done
|
||||||
|
# Salmonella_enterica
|
||||||
|
specific_index_presence/Salmonella_enterica/index.done stats/specific_kmer_presence/Salmonella_enterica.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Salmonella_enterica/index.done stats/specific_kmer_count/Salmonella_enterica.stats: global_index_count/index.done
|
||||||
|
# Bacillus_subtilis
|
||||||
|
specific_index_presence/Bacillus_subtilis/index.done stats/specific_kmer_presence/Bacillus_subtilis.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Bacillus_subtilis/index.done stats/specific_kmer_count/Bacillus_subtilis.stats: global_index_count/index.done
|
||||||
|
# Shouchella_clausii
|
||||||
|
specific_index_presence/Shouchella_clausii/index.done stats/specific_kmer_presence/Shouchella_clausii.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Shouchella_clausii/index.done stats/specific_kmer_count/Shouchella_clausii.stats: global_index_count/index.done
|
||||||
|
# Klebsiella_pneumoniae
|
||||||
|
specific_index_presence/Klebsiella_pneumoniae/index.done stats/specific_kmer_presence/Klebsiella_pneumoniae.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Klebsiella_pneumoniae/index.done stats/specific_kmer_count/Klebsiella_pneumoniae.stats: global_index_count/index.done
|
||||||
|
# Opitutus_terrae
|
||||||
|
specific_index_presence/Opitutus_terrae/index.done stats/specific_kmer_presence/Opitutus_terrae.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Opitutus_terrae/index.done stats/specific_kmer_count/Opitutus_terrae.stats: global_index_count/index.done
|
||||||
|
# Saccharolobus_islandicus
|
||||||
|
specific_index_presence/Saccharolobus_islandicus/index.done stats/specific_kmer_presence/Saccharolobus_islandicus.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Saccharolobus_islandicus/index.done stats/specific_kmer_count/Saccharolobus_islandicus.stats: global_index_count/index.done
|
||||||
|
# Acidobacterium_capsulatum
|
||||||
|
specific_index_presence/Acidobacterium_capsulatum/index.done stats/specific_kmer_presence/Acidobacterium_capsulatum.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Acidobacterium_capsulatum/index.done stats/specific_kmer_count/Acidobacterium_capsulatum.stats: global_index_count/index.done
|
||||||
|
# Proteus_mirabilis
|
||||||
|
specific_index_presence/Proteus_mirabilis/index.done stats/specific_kmer_presence/Proteus_mirabilis.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Proteus_mirabilis/index.done stats/specific_kmer_count/Proteus_mirabilis.stats: global_index_count/index.done
|
||||||
|
# Wolbachia_endosymbiont
|
||||||
|
specific_index_presence/Wolbachia_endosymbiont/index.done stats/specific_kmer_presence/Wolbachia_endosymbiont.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Wolbachia_endosymbiont/index.done stats/specific_kmer_count/Wolbachia_endosymbiont.stats: global_index_count/index.done
|
||||||
|
# Yersinia_ruckeri
|
||||||
|
specific_index_presence/Yersinia_ruckeri/index.done stats/specific_kmer_presence/Yersinia_ruckeri.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Yersinia_ruckeri/index.done stats/specific_kmer_count/Yersinia_ruckeri.stats: global_index_count/index.done
|
||||||
|
# Candidozyma_auris
|
||||||
|
specific_index_presence/Candidozyma_auris/index.done stats/specific_kmer_presence/Candidozyma_auris.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Candidozyma_auris/index.done stats/specific_kmer_count/Candidozyma_auris.stats: global_index_count/index.done
|
||||||
Executable
+48
@@ -0,0 +1,48 @@
|
|||||||
|
#!/usr/bin/env bash
# Download the benchmark reference assemblies and store each genomic FASTA
# as genomes/<name>.fna.gz.
#
# For every accession listed below, the script fetches the genome package
# with the `datasets` tool, unpacks it, passes each *.fna file found inside
# through `obiconvert -Z`, and finally removes the package and the unpacked
# directory.

set -euo pipefail

assemblies=(
  GCF_000005845.2
  GCF_000010245.2
  GCF_000007445.1
  GCF_000006665.1

  GCF_000006945.2
  GCF_000195995.1
  GCF_000009505.1
  GCF_000026565.1

  GCF_000016305.1
  GCF_000019965.1
  GCF_000240185.1
  GCF_000742135.1

  GCF_000069965.1
  GCF_000022565.1
  GCF_000306885.1
  GCF_003013715.1

  GCF_000009045.1
  GCF_000009825.1
  GCF_000022445.1
  GCF_000834255.1
)

mkdir -p genomes

for acc in "${assemblies[@]}"; do
  echo "Downloading ${acc}"

  datasets download genome accession "${acc}" \
    --include genome \
    --filename "${acc}.zip"

  unzip -q "${acc}.zip" -d "${acc}"

  # NUL-delimited names + `IFS= read -r` keep paths containing spaces or
  # backslashes intact; every expansion is quoted for the same reason.
  find "${acc}" -name "*.fna" -print0 |
    while IFS= read -r -d '' file; do
      out="genomes/$(basename "${file}").gz"
      # Write to a temporary name first so that a failed conversion never
      # leaves a truncated file that looks like a finished genome.
      obiconvert -Z "${file}" >"${out}.tmp"
      mv "${out}.tmp" "${out}"
    done

  rm -rf "${acc}" "${acc}.zip"
done
|
||||||
Executable
+108
@@ -0,0 +1,108 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: filter_one_count.sh SPECIES
# Filters global_index_count to keep only kmers specific to SPECIES,
# then selects the SPECIES column in-place.
# Outputs:
#   specific_index_count/SPECIES/index.done   (written by obikmer select)
#   stats/specific_kmer_count/SPECIES.stats   (one CSV data row, no header)
# The CSV row is SPECIES followed by a (wall seconds, RSS bytes) pair for
# rebuild, pack, the filter totals row, select, and the select totals row.
set -euo pipefail

# Give a usage message instead of bash's bare "unbound variable" error.
if [[ $# -ne 1 ]]; then
  echo "Usage: $(basename "${BASH_SOURCE[0]}") SPECIES" >&2
  exit 2
fi

SPECIES="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

SOURCE="${SCRIPT_DIR}/global_index_count"
OUTPUT="${SCRIPT_DIR}/specific_index_count/${SPECIES}"
STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_count"
STATS_FILE="${STATS_DIR}/${SPECIES}.stats"

mkdir -p "${STATS_DIR}"

echo "[${SPECIES}] filter (count) → ${OUTPUT}"

LOG_FILTER=$(mktemp)
LOG_SELECT=$(mktemp)
trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT

# run_logged LOG CMD [ARGS...]
# Runs CMD with its stderr captured in LOG, then replays LOG on our stderr
# whether or not CMD succeeded, and returns CMD's exit status. Without this,
# `set -e` would abort on a failing command before its log was shown, and
# the EXIT trap would then delete the only copy of the diagnostics.
run_logged() {
  local log="$1"
  shift
  local status=0
  "$@" 2>"${log}" || status=$?
  cat "${log}" >&2
  return "${status}"
}

run_logged "${LOG_FILTER}" "${BINARY}" filter \
  --output "${OUTPUT}" \
  --force \
  --ingroup "species=${SPECIES}" \
  --outgroup all \
  --min-frac 0.5 \
  --max-frac 1.0 \
  --max-outgroup-count 0 \
  "${SOURCE}"

run_logged "${LOG_SELECT}" "${BINARY}" select \
  --in-place \
  --group "${SPECIES}:species=${SPECIES}" \
  --group-op "${SPECIES}:any" \
  --select "${SPECIES}" \
  "${OUTPUT}"

python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
import sys, re

species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]

def strip_ansi(s):
    # Remove terminal CSI escape sequences (colours etc.) from a log line.
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)

def parse_wall(s):
    # "<n>ms" or "<n>s" -> seconds as float; anything else -> 0.0.
    s = s.strip()
    if s.endswith('ms'): return float(s[:-2]) / 1000.0
    if s.endswith('s'): return float(s[:-1])
    return 0.0

def parse_rss(s):
    # "<n> GB|MB|KB|B" -> bytes as int (binary multiples); no match -> 0.
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m: return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])

def is_sep(s):
    # A separator is a non-empty line without any letter or digit.
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)

def parse_reporter(logfile):
    # Extract the timing table from a log: a header line mentioning "stage"
    # and "wall", a separator, one row per stage, a second separator, then
    # a totals row. Returns {stage: (wall_seconds, rss_bytes)}, with the
    # totals row stored under 'TOTAL'.
    stats = {}
    state = 'scan'
    with open(logfile, errors='replace') as fh:
        for raw in fh:
            line = strip_ansi(raw.rstrip('\n'))
            s = line.strip()
            if state == 'scan':
                if re.search(r'\bstage\b.*\bwall\b', line):
                    state = 'in_header'
            elif state == 'in_header':
                if is_sep(s): state = 'rows'
            elif state == 'rows':
                if is_sep(s): state = 'total'
                elif s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 4:
                        # Field 2 is read as wall time, field 4 as RSS.
                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
            elif state == 'total':
                if s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 3:
                        stats['TOTAL'] = (parse_wall(parts[1]),
                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
                    # Only the first non-empty line after the separator counts.
                    break
    return stats

f = parse_reporter(log_filter)
s = parse_reporter(log_select)

row = [species]
for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
    key = 'TOTAL' if stage.endswith('_total') else stage
    # Stages missing from the log produce two empty CSV fields.
    w, r = d.get(key, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
print(','.join(row))
PYEOF
|
||||||
Executable
+108
@@ -0,0 +1,108 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: filter_one_presence.sh SPECIES
# Filters global_index_presence to keep only kmers specific to SPECIES,
# then selects the SPECIES column in-place.
# Outputs:
#   specific_index_presence/SPECIES/index.done   (written by obikmer select)
#   stats/specific_kmer_presence/SPECIES.stats   (one CSV data row, no header)
# The CSV row is SPECIES followed by a (wall seconds, RSS bytes) pair for
# rebuild, pack, the filter totals row, select, and the select totals row.
set -euo pipefail

# Give a usage message instead of bash's bare "unbound variable" error.
if [[ $# -ne 1 ]]; then
  echo "Usage: $(basename "${BASH_SOURCE[0]}") SPECIES" >&2
  exit 2
fi

SPECIES="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

SOURCE="${SCRIPT_DIR}/global_index_presence"
OUTPUT="${SCRIPT_DIR}/specific_index_presence/${SPECIES}"
STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_presence"
STATS_FILE="${STATS_DIR}/${SPECIES}.stats"

mkdir -p "${STATS_DIR}"

echo "[${SPECIES}] filter (presence) → ${OUTPUT}"

LOG_FILTER=$(mktemp)
LOG_SELECT=$(mktemp)
trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT

# run_logged LOG CMD [ARGS...]
# Runs CMD with its stderr captured in LOG, then replays LOG on our stderr
# whether or not CMD succeeded, and returns CMD's exit status. Without this,
# `set -e` would abort on a failing command before its log was shown, and
# the EXIT trap would then delete the only copy of the diagnostics.
run_logged() {
  local log="$1"
  shift
  local status=0
  "$@" 2>"${log}" || status=$?
  cat "${log}" >&2
  return "${status}"
}

run_logged "${LOG_FILTER}" "${BINARY}" filter \
  --output "${OUTPUT}" \
  --force \
  --ingroup "species=${SPECIES}" \
  --outgroup all \
  --min-frac 0.5 \
  --max-frac 1.0 \
  --max-outgroup-count 0 \
  "${SOURCE}"

run_logged "${LOG_SELECT}" "${BINARY}" select \
  --in-place \
  --group "${SPECIES}:species=${SPECIES}" \
  --group-op "${SPECIES}:any" \
  --select "${SPECIES}" \
  "${OUTPUT}"

python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
import sys, re

species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]

def strip_ansi(s):
    # Remove terminal CSI escape sequences (colours etc.) from a log line.
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)

def parse_wall(s):
    # "<n>ms" or "<n>s" -> seconds as float; anything else -> 0.0.
    s = s.strip()
    if s.endswith('ms'): return float(s[:-2]) / 1000.0
    if s.endswith('s'): return float(s[:-1])
    return 0.0

def parse_rss(s):
    # "<n> GB|MB|KB|B" -> bytes as int (binary multiples); no match -> 0.
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m: return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])

def is_sep(s):
    # A separator is a non-empty line without any letter or digit.
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)

def parse_reporter(logfile):
    # Extract the timing table from a log: a header line mentioning "stage"
    # and "wall", a separator, one row per stage, a second separator, then
    # a totals row. Returns {stage: (wall_seconds, rss_bytes)}, with the
    # totals row stored under 'TOTAL'.
    stats = {}
    state = 'scan'
    with open(logfile, errors='replace') as fh:
        for raw in fh:
            line = strip_ansi(raw.rstrip('\n'))
            s = line.strip()
            if state == 'scan':
                if re.search(r'\bstage\b.*\bwall\b', line):
                    state = 'in_header'
            elif state == 'in_header':
                if is_sep(s): state = 'rows'
            elif state == 'rows':
                if is_sep(s): state = 'total'
                elif s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 4:
                        # Field 2 is read as wall time, field 4 as RSS.
                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
            elif state == 'total':
                if s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 3:
                        stats['TOTAL'] = (parse_wall(parts[1]),
                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
                    # Only the first non-empty line after the separator counts.
                    break
    return stats

f = parse_reporter(log_filter)
s = parse_reporter(log_select)

row = [species]
for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
    key = 'TOTAL' if stage.endswith('_total') else stage
    # Stages missing from the log produce two empty CSV fields.
    w, r = d.get(key, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
print(','.join(row))
PYEOF
|
||||||
Executable
+103
@@ -0,0 +1,103 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: index_one_count.sh SPECIMEN
# SPECIMEN = "species--strain" (Make pattern stem)
# Outputs:
#   specimen_index_count/SPECIMEN/index.done   (written by obikmer)
#   stats/indexing_count/SPECIMEN.stats        (one CSV data row, no header)
# The CSV row is species, strain, then a (wall seconds, RSS bytes) pair for
# scatter, dereplicate, count_kmer, index, and the totals row.
set -euo pipefail

# Give a usage message instead of bash's bare "unbound variable" error.
if [[ $# -ne 1 ]]; then
  echo "Usage: $(basename "${BASH_SOURCE[0]}") SPECIMEN" >&2
  exit 2
fi

SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

# Split at the first "--": species is everything before it, strain the rest.
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_count"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"

mkdir -p "${STATS_DIR}"

r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
  echo "ERROR: reads not found in ${READS_DIR}" >&2
  exit 1
fi

echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# run_logged LOG CMD [ARGS...]
# Runs CMD with its stderr captured in LOG, then replays LOG on our stderr
# whether or not CMD succeeded, and returns CMD's exit status. Without this,
# `set -e` would abort on a failing command before its log was shown, and
# the EXIT trap would then delete the only copy of the diagnostics.
run_logged() {
  local log="$1"
  shift
  local status=0
  "$@" 2>"${log}" || status=$?
  cat "${log}" >&2
  return "${status}"
}

run_logged "${STDERR_LOG}" "${BINARY}" index \
  --output "${INDEX_PATH}" \
  --force \
  --theta 0 \
  --with-counts \
  --label "${SPECIMEN}" \
  --meta "species=${species}" \
  "${r1}" "${r2}"

python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
import sys, re

species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

def strip_ansi(s):
    # Remove terminal CSI escape sequences (colours etc.) from a log line.
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)

def parse_wall(s):
    # "<n>ms" or "<n>s" -> seconds as float; anything else -> 0.0.
    s = s.strip()
    if s.endswith('ms'): return float(s[:-2]) / 1000.0
    if s.endswith('s'): return float(s[:-1])
    return 0.0

def parse_rss(s):
    # "<n> GB|MB|KB|B" -> bytes as int (binary multiples); no match -> 0.
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m: return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])

def is_sep(s):
    # A separator is a non-empty line without any letter or digit.
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)

# Timing table layout expected in the log: a header line mentioning "stage"
# and "wall", a separator, one row per stage, a second separator, then a
# totals row. Every row is stored as stats[first field] = (wall_s, rss_b).
stats = {}
state = 'scan'

with open(logfile, errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()
        if state == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s): state = 'rows'
        elif state == 'rows':
            if is_sep(s): state = 'total'
            elif s:
                parts = re.split(r' +', s)
                if len(parts) >= 4:
                    # Field 2 is read as wall time, field 4 as RSS.
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    stats[parts[0]] = (parse_wall(parts[1]),
                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
                # Only the first non-empty line after the separator counts.
                break

STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
row = [species, strain]
for stage in STAGE_ORDER:
    # Stages missing from the log produce two empty CSV fields.
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
|
||||||
Executable
+102
@@ -0,0 +1,102 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: index_one_presence.sh SPECIMEN
# SPECIMEN = "species--strain" (Make pattern stem)
# Outputs:
#   specimen_index_presence/SPECIMEN/index.done   (written by obikmer)
#   stats/indexing_presence/SPECIMEN.stats        (one CSV data row, no header)
# The CSV row is species, strain, then a (wall seconds, RSS bytes) pair for
# scatter, dereplicate, count_kmer, index, and the totals row.
set -euo pipefail

# Give a usage message instead of bash's bare "unbound variable" error.
if [[ $# -ne 1 ]]; then
  echo "Usage: $(basename "${BASH_SOURCE[0]}") SPECIMEN" >&2
  exit 2
fi

SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

# Split at the first "--": species is everything before it, strain the rest.
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"

mkdir -p "${STATS_DIR}"

r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
  echo "ERROR: reads not found in ${READS_DIR}" >&2
  exit 1
fi

echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# run_logged LOG CMD [ARGS...]
# Runs CMD with its stderr captured in LOG, then replays LOG on our stderr
# whether or not CMD succeeded, and returns CMD's exit status. Without this,
# `set -e` would abort on a failing command before its log was shown, and
# the EXIT trap would then delete the only copy of the diagnostics.
run_logged() {
  local log="$1"
  shift
  local status=0
  "$@" 2>"${log}" || status=$?
  cat "${log}" >&2
  return "${status}"
}

run_logged "${STDERR_LOG}" "${BINARY}" index \
  --output "${INDEX_PATH}" \
  --force \
  --theta 0 \
  --label "${SPECIMEN}" \
  --meta "species=${species}" \
  "${r1}" "${r2}"

python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
import sys, re

species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

def strip_ansi(s):
    # Remove terminal CSI escape sequences (colours etc.) from a log line.
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)

def parse_wall(s):
    # "<n>ms" or "<n>s" -> seconds as float; anything else -> 0.0.
    s = s.strip()
    if s.endswith('ms'): return float(s[:-2]) / 1000.0
    if s.endswith('s'): return float(s[:-1])
    return 0.0

def parse_rss(s):
    # "<n> GB|MB|KB|B" -> bytes as int (binary multiples); no match -> 0.
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m: return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])

def is_sep(s):
    # A separator is a non-empty line without any letter or digit.
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)

# Timing table layout expected in the log: a header line mentioning "stage"
# and "wall", a separator, one row per stage, a second separator, then a
# totals row. Every row is stored as stats[first field] = (wall_s, rss_b).
stats = {}
state = 'scan'

with open(logfile, errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()
        if state == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s): state = 'rows'
        elif state == 'rows':
            if is_sep(s): state = 'total'
            elif s:
                parts = re.split(r' +', s)
                if len(parts) >= 4:
                    # Field 2 is read as wall time, field 4 as RSS.
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    stats[parts[0]] = (parse_wall(parts[1]),
                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
                # Only the first non-empty line after the separator counts.
                break

STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
row = [species, strain]
for stage in STAGE_ORDER:
    # Stages missing from the log produce two empty CSV fields.
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
|
||||||
@@ -0,0 +1,118 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Generate deps.mk — pure dependency declarations for the benchmark pipeline.
|
||||||
|
|
||||||
|
Like C .d files: only target: prerequisites lines, no recipes.
|
||||||
|
Recipes stay in the Makefile as generic rules.
|
||||||
|
"""
|
||||||
|
import gzip
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Whole tokens (compared lower-cased) that end a strain name.
STOP_WORDS = {
    'complete', 'chromosome', 'whole', 'sequence', 'genome',
    'endosymbiont', 'of',
}
# Lower-cased token prefixes that end a strain name as well.
STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')


def is_stop(tok):
    """Return True when *tok* is a stop token.

    The comparison is case-insensitive: a token stops the scan when it
    equals an entry of STOP_WORDS or begins with one of STOP_PREFIXES.
    """
    lowered = tok.lower()
    if lowered in STOP_WORDS:
        return True
    # str.startswith accepts a tuple and tests every prefix in it.
    return lowered.startswith(STOP_PREFIXES)
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize(s):
    """Make *s* safe for use in a target name.

    Every character other than ASCII letters, digits, '.', '_' and '-' is
    replaced by '_'; underscores at either end of the result are removed.
    """
    replaced = re.sub(r'[^A-Za-z0-9._-]', '_', s)
    return replaced.strip('_')
|
||||||
|
|
||||||
|
|
||||||
|
def collect_tokens(text):
    """Join the leading tokens of *text* into a single '_'-separated name.

    Tokens are taken in order after trailing ',' and '.' characters are
    stripped; scanning ends at the first stop token (see is_stop). Each kept
    token is sanitized, and tokens that sanitize to an empty string are
    left out. Returns '' when nothing is kept.
    """
    kept = []
    for raw in text.split():
        word = raw.rstrip(',.')
        if is_stop(word):
            break
        cleaned = sanitize(word)
        if cleaned:
            kept.append(cleaned)
    return '_'.join(kept)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_organism(defn, gcf_id):
    """Derive a (species, strain) pair from a sequence definition line.

    Parameters:
        defn:   free-text definition, e.g.
                "Salmonella enterica subsp. enterica serovar Typhi str. CT18".
        gcf_id: fallback identifier used when no strain can be extracted.

    Returns:
        (species, strain), both sanitized. species is the first two words
        joined by '_'. strain is chosen by the first rule that yields one:
          1. "str. X" with an optional "substr. Y"  -> "X" or "X_Y";
          2. the tokens following the word "strain";
          3. the tokens following the species name, after dropping a leading
             "subsp. X" and then a leading "serovar X";
          4. gcf_id.

    A definition with fewer than two words (for instance a bare accession)
    has no binomial name to extract: the sanitized text is returned as the
    species (gcf_id if that is empty) together with gcf_id as the strain,
    rather than raising IndexError.
    """
    words = defn.split()
    if len(words) < 2:
        return sanitize('_'.join(words)) or gcf_id, gcf_id

    # Collapse whitespace so the species-stripping pattern below, which
    # expects exactly one space between the two species words, always agrees
    # with the split() used to build the species name.
    defn = ' '.join(words)
    species = sanitize(words[0] + '_' + words[1])

    m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
    if m:
        strain = sanitize(m.group(1))
        if m.group(2):
            strain += '_' + sanitize(m.group(2))
        return species, strain

    m = re.search(r'\bstrain\b\s+(.*)', defn)
    if m:
        strain = collect_tokens(m.group(1))
        if strain:
            return species, strain

    remainder = re.sub(r'^\S+ \S+\s*', '', defn)
    remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
    remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
    strain = collect_tokens(remainder)
    return species, strain if strain else gcf_id
|
||||||
|
|
||||||
|
|
||||||
|
def first_definition(path):
    """Return the definition found on the first FASTA header of *path*.

    *path* is a gzip-compressed text file. Only the first line starting with
    '>' is examined:
      * if it contains a `"definition":"..."` annotation, the quoted text is
        returned;
      * otherwise the first whitespace-delimited token after '>' is returned.

    When the file has no header line, or the header carries nothing after
    '>', the file name without its last suffix is returned instead (a bare
    '>' header used to raise IndexError).
    """
    with gzip.open(path, 'rt') as fh:
        for line in fh:
            if not line.startswith('>'):
                continue
            m = re.search(r'"definition":"([^"]*)"', line)
            if m:
                return m.group(1)
            fields = line[1:].split()
            if fields:
                return fields[0]
            # Empty header: use the same fallback as a header-less file.
            break
    return Path(path).stem
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print the dependency declarations for the genomes named on the command line.

    Genome paths are processed in sorted order. The output consists of the
    SPECIMENS and SPECIES variable assignments, one group of per-specimen
    prerequisite lines for every genome, and one group of per-species
    prerequisite lines for every distinct species in order of first
    appearance. Only "target: prerequisites" lines and comments are printed.
    """
    records = []        # (specimen, species, sim_dir, genome_path)
    species_order = []  # distinct species, first-seen order

    for genome_path in sorted(sys.argv[1:]):
        gcf_id = Path(genome_path).name.replace('_genomic.fna.gz', '')
        sp, st = parse_organism(first_definition(genome_path), gcf_id)
        records.append((f'{sp}--{st}', sp, f'simulated_data/{sp}/{st}', genome_path))
        if sp not in species_order:
            species_order.append(sp)

    print('SPECIMENS :=', ' '.join(rec[0] for rec in records))
    print('SPECIES :=', ' '.join(species_order))

    for specimen, _species, sim_dir, genome_path in records:
        reads = f'{sim_dir}/reads_R1.fastq.gz'
        ref = f'reference_index/{specimen}.npz'
        presence_done = f'specimen_index_presence/{specimen}/index.done'
        count_done = f'specimen_index_count/{specimen}/index.done'
        block = [
            '',  # blank line before each specimen section
            f'# {specimen}',
            f'{reads}: {genome_path}',
            f'{ref}: {reads}',
            f'{presence_done} stats/indexing_presence/{specimen}.stats: {reads}',
            f'{count_done} stats/indexing_count/{specimen}.stats: {reads}',
            f'stats/verify_presence/{specimen}.stats: {ref} {presence_done}',
            f'stats/verify_count/{specimen}.stats: {ref} {count_done}',
        ]
        print('\n'.join(block))

    print()
    for sp in species_order:
        block = [
            f'# {sp}',
            f'specific_index_presence/{sp}/index.done '
            f'stats/specific_kmer_presence/{sp}.stats: global_index_presence/index.done',
            f'specific_index_count/{sp}/index.done '
            f'stats/specific_kmer_count/{sp}.stats: global_index_count/index.done',
        ]
        print('\n'.join(block))


if __name__ == '__main__':
    main()
|
||||||
Executable
+103
@@ -0,0 +1,103 @@
|
|||||||
|
#!/usr/bin/env bash
# Merge all per-specimen count indexes into one global count index and record
# the per-stage wall time / RSS reported by obikmer in a numbered CSV file.
set -euo pipefail

# Every location is anchored on the directory that holds this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_count"
OUTPUT="${SCRIPT_DIR}/global_index_count"
STATS_DIR="${SCRIPT_DIR}/stats/merge_count"

mkdir -p "${STATS_DIR}"

# Run id: number of run_*.csv files already present, zero-padded to 3 digits.
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
CSV="${STATS_DIR}/run_${run_n}.csv"

# Create (or truncate) this run's CSV with the column header; the data row is
# appended once the merge has finished.
csv_header='run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b'
printf '%s\n' "${csv_header}" >"${CSV}"
|
||||||
|
|
||||||
|
# parse_reporter RUN N_SOURCES LOGFILE
#
# Read the stage table found in LOGFILE (the captured stderr of the merge) and
# print one CSV data row on stdout:
#   run,n_sources,<wall_s,rss_b> for bootstrap, spectrums, merge_partitions,
#   pack, then <wall_s,rss_b> of the TOTAL line.
# Stages that are absent from the log yield empty fields.
parse_reporter() {
    local run="$1" n_sources="$2" logfile="$3"
    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import sys, re

run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

# (suffix, units per second). Longer suffixes are listed before the plain 's'
# because every one of them also ends with 's'. The micro prefix is accepted
# as U+00B5, U+03BC or ASCII 'u'.
WALL_UNITS = (('ns', 1e9), ('\u00b5s', 1e6), ('\u03bcs', 1e6), ('us', 1e6),
              ('ms', 1000.0), ('s', 1.0))

def strip_ansi(s):
    """Remove ANSI CSI escape sequences from s."""
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)

def parse_wall(s):
    """Convert a duration such as '250ns', '12ms' or '1.5s' to seconds.

    Returns 0.0 when the unit is unknown or the number cannot be parsed,
    instead of aborting the whole report with a ValueError.
    """
    s = s.strip()
    for suffix, per_second in WALL_UNITS:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / per_second
            except ValueError:
                return 0.0
    return 0.0

def parse_rss(s):
    """Convert a size such as '1.5GB' or '512B' to a whole number of bytes (0 if unparseable)."""
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m: return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])

def is_sep(s):
    """True for a non-empty line without any letter or digit (a table rule)."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)

stats = {}
# scan -> in_header -> rows -> total: look for the 'stage ... wall' header,
# skip to the rule below it, read stage rows up to the next rule, then read
# the first non-empty line after that rule as the total.
state = 'scan'

# The log is decoded as UTF-8 explicitly so that non-ASCII unit prefixes and
# table rules do not depend on the caller's locale.
with open(logfile, encoding='utf-8', errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()

        if state == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s):
                state = 'rows'
        elif state == 'rows':
            if is_sep(s):
                state = 'total'
            elif s:
                parts = re.split(r' +', s)
                if len(parts) >= 4:
                    # column 0 = stage name, 1 = wall time, 3 = RSS
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    stats[parts[0]] = (parse_wall(parts[1]),
                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
                break

STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
row = [run, n_sources]
for stage in STAGE_ORDER:
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
}
|
||||||
|
|
||||||
|
# Every direct sub-directory of IDX_DIR is one per-specimen index; sorting
# keeps the merge order reproducible from run to run.
mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)

if [[ ${#sources[@]} -eq 0 ]]; then
    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
    exit 1
fi

echo "Merging ${#sources[@]} count indexes → ${OUTPUT}"
printf ' %s\n' "${sources[@]}"

# stderr is captured in a temporary file so the stage table can be parsed.
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# The exit status is collected by hand: with `set -e` a failing merge would
# otherwise end the script before the captured stderr is replayed, and the
# EXIT trap would then delete the only copy of the error messages.
merge_status=0
"${BINARY}" merge \
    --output "${OUTPUT}" \
    --force \
    "${sources[@]}" \
    2>"${STDERR_LOG}" || merge_status=$?

cat "${STDERR_LOG}" >&2

if [[ ${merge_status} -ne 0 ]]; then
    echo "ERROR: obikmer merge exited ${merge_status}" >&2
    exit "${merge_status}"
fi

parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"

echo "Done. Run ${run_n} → ${CSV}"
|
||||||
Executable
+104
@@ -0,0 +1,104 @@
|
|||||||
|
#!/usr/bin/env bash
# Merge all per-specimen presence indexes into one global presence index and
# record the per-stage wall time / RSS reported by obikmer in a numbered CSV file.
set -euo pipefail

# Every location is anchored on the directory that holds this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_presence"
OUTPUT="${SCRIPT_DIR}/global_index_presence"
STATS_DIR="${SCRIPT_DIR}/stats/merge_presence"

mkdir -p "${STATS_DIR}"

# Run id: number of run_*.csv files already present, zero-padded to 3 digits.
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
CSV="${STATS_DIR}/run_${run_n}.csv"

# Create (or truncate) this run's CSV with the column header; the data row is
# appended once the merge has finished.
csv_header='run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b'
printf '%s\n' "${csv_header}" >"${CSV}"
|
||||||
|
|
||||||
|
# parse_reporter RUN N_SOURCES LOGFILE
#
# Read the stage table found in LOGFILE (the captured stderr of the merge) and
# print one CSV data row on stdout:
#   run,n_sources,<wall_s,rss_b> for bootstrap, spectrums, merge_partitions,
#   pack, then <wall_s,rss_b> of the TOTAL line.
# Stages that are absent from the log yield empty fields.
parse_reporter() {
    local run="$1" n_sources="$2" logfile="$3"
    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import sys, re

run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

# (suffix, units per second). Longer suffixes are listed before the plain 's'
# because every one of them also ends with 's'. The micro prefix is accepted
# as U+00B5, U+03BC or ASCII 'u'.
WALL_UNITS = (('ns', 1e9), ('\u00b5s', 1e6), ('\u03bcs', 1e6), ('us', 1e6),
              ('ms', 1000.0), ('s', 1.0))

def strip_ansi(s):
    """Remove ANSI CSI escape sequences from s."""
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)

def parse_wall(s):
    """Convert a duration such as '250ns', '12ms' or '1.5s' to seconds.

    Returns 0.0 when the unit is unknown or the number cannot be parsed,
    instead of aborting the whole report with a ValueError.
    """
    s = s.strip()
    for suffix, per_second in WALL_UNITS:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / per_second
            except ValueError:
                return 0.0
    return 0.0

def parse_rss(s):
    """Convert a size such as '1.5GB' or '512B' to a whole number of bytes (0 if unparseable)."""
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m: return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])

def is_sep(s):
    """True for a non-empty line without any letter or digit (a table rule)."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)

stats = {}
# scan -> in_header -> rows -> total: look for the 'stage ... wall' header,
# skip to the rule below it, read stage rows up to the next rule, then read
# the first non-empty line after that rule as the total.
state = 'scan'

# The log is decoded as UTF-8 explicitly so that non-ASCII unit prefixes and
# table rules do not depend on the caller's locale.
with open(logfile, encoding='utf-8', errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()

        if state == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s):
                state = 'rows'
        elif state == 'rows':
            if is_sep(s):
                state = 'total'
            elif s:
                parts = re.split(r' +', s)
                if len(parts) >= 4:
                    # column 0 = stage name, 1 = wall time, 3 = RSS
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    stats[parts[0]] = (parse_wall(parts[1]),
                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
                break

STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
row = [run, n_sources]
for stage in STAGE_ORDER:
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
}
|
||||||
|
|
||||||
|
# Every direct sub-directory of IDX_DIR is one per-specimen index; sorting
# keeps the merge order reproducible from run to run.
mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)

if [[ ${#sources[@]} -eq 0 ]]; then
    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
    exit 1
fi

echo "Merging ${#sources[@]} presence indexes → ${OUTPUT}"
printf ' %s\n' "${sources[@]}"

# stderr is captured in a temporary file so the stage table can be parsed.
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# The exit status is collected by hand: with `set -e` a failing merge would
# otherwise end the script before the captured stderr is replayed, and the
# EXIT trap would then delete the only copy of the error messages.
merge_status=0
"${BINARY}" merge \
    --output "${OUTPUT}" \
    --force \
    --force-presence \
    "${sources[@]}" \
    2>"${STDERR_LOG}" || merge_status=$?

cat "${STDERR_LOG}" >&2

if [[ ${merge_status} -ne 0 ]]; then
    echo "ERROR: obikmer merge exited ${merge_status}" >&2
    exit "${merge_status}"
fi

parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"

echo "Done. Run ${run_n} → ${CSV}"
|
||||||
Executable
+12
@@ -0,0 +1,12 @@
|
|||||||
|
#!/usr/bin/env bash
# Simulate all genomes. Delegates to simulate_one.sh per genome.
# Prefer running via `gmake simulate` which handles individual dependencies.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Expand the glob with nullglob: by default an unmatched pattern is kept as a
# literal string and the loop would run once on a non-existent "*.fna.gz".
shopt -s nullglob
genome_files=("${SCRIPT_DIR}"/genomes/*.fna.gz)
shopt -u nullglob

if [[ ${#genome_files[@]} -eq 0 ]]; then
    echo "ERROR: no *.fna.gz genome found in ${SCRIPT_DIR}/genomes" >&2
    exit 1
fi

for genome_file in "${genome_files[@]}"; do
    # make_deps.py --dir-for prints the output directory to use for this genome.
    out_dir=$("${SCRIPT_DIR}/../.venv/bin/python3" "${SCRIPT_DIR}/make_deps.py" \
        --dir-for "${genome_file}")
    bash "${SCRIPT_DIR}/simulate_one.sh" "${genome_file}" "${out_dir}"
done
|
||||||
@@ -0,0 +1,33 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: simulate_one.sh genome.fna.gz output_dir
# Simulates paired-end HiSeq reads for a single genome.
set -euo pipefail

# Fail with a usage message rather than an "unbound variable" error.
if [[ $# -lt 2 ]]; then
    echo "Usage: $(basename "$0") genome.fna.gz output_dir" >&2
    exit 2
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ISS="${SCRIPT_DIR}/../.venv/bin/iss"
COVERAGE=15
READ_LENGTH=150
# CPUS from the environment wins; otherwise ask sysctl, then nproc, then fall back to 2.
CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"

genome_file="$1"
out_dir="$2"

mkdir -p "${out_dir}"

# The uncompressed genome lives in a private temporary directory. The template
# ends with the X's because BSD/macOS mktemp does not reliably substitute X's
# that are followed by a suffix such as ".fna", which makes concurrent runs
# collide on the same file name.
tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/obikmer_XXXXXX")
tmp_fasta="${tmp_dir}/genome.fna"
trap 'rm -rf "${tmp_dir}"' EXIT

gzip -dc "${genome_file}" > "${tmp_fasta}"

# Genome size = number of non-whitespace characters outside FASTA header lines.
# grep exits 1 when nothing but headers is found; that case is reported below
# instead of silently killing the script through pipefail.
genome_size=$({ grep -v "^>" "${tmp_fasta}" || true; } | tr -d '[:space:]' | wc -c | tr -d ' ')

if [[ "${genome_size}" -eq 0 ]]; then
    echo "ERROR: no sequence data found in ${genome_file}" >&2
    exit 1
fi

# ceil(COVERAGE * genome_size / (2 * READ_LENGTH)) in integer arithmetic.
n_reads=$(( (COVERAGE * genome_size + 2 * READ_LENGTH - 1) / (2 * READ_LENGTH) ))

echo "[${out_dir}] genome=${genome_size} bp → ${n_reads} read pairs (${COVERAGE}x HiSeq)"

"${ISS}" generate \
    --genomes "${tmp_fasta}" \
    --model HiSeq \
    --n_reads "${n_reads}" \
    --cpus "${CPUS}" \
    --compress \
    --output "${out_dir}/reads"
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
genome,Candidozyma_auris--GCF_003013715.1_ASM301371v2,Acidobacterium_capsulatum--ATCC_51196,Bacillus_subtilis--168,Escherichia_coli--CFT073,Escherichia_coli--EDL933,Escherichia_coli--K-12_MG1655,Escherichia_coli--K-12_W3110,Klebsiella_pneumoniae--ATCC_13883,Klebsiella_pneumoniae--HS11286,Klebsiella_pneumoniae--MGH_78578,Opitutus_terrae--PB90-1,Proteus_mirabilis--HI4320,Saccharolobus_islandicus--M.16.4,Salmonella_enterica--AKU_12601,Salmonella_enterica--CT18,Salmonella_enterica--LT2,Salmonella_enterica--P125109,Shouchella_clausii--KSM-K16,Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1,Yersinia_ruckeri--YRB
|
||||||
|
Candidozyma_auris--GCF_003013715.1_ASM301371v2,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
|
||||||
|
Acidobacterium_capsulatum--ATCC_51196,1.000000,0.000000,0.999981,0.999990,0.999989,0.999987,0.999987,0.999990,0.999988,0.999988,0.999994,0.999989,1.000000,0.999988,0.999987,0.999987,0.999988,0.999989,0.999991,0.999987
|
||||||
|
Bacillus_subtilis--168,1.000000,0.999981,0.000000,0.999990,0.999989,0.999989,0.999989,0.999989,0.999988,0.999986,0.999995,0.999985,0.999999,0.999988,0.999987,0.999989,0.999988,0.999778,0.999993,0.999987
|
||||||
|
Escherichia_coli--CFT073,1.000000,0.999990,0.999990,0.000000,0.825741,0.807495,0.807218,0.991156,0.996855,0.997849,0.999996,0.999633,1.000000,0.993885,0.996736,0.994148,0.993821,0.999991,0.999984,0.999291
|
||||||
|
Escherichia_coli--EDL933,1.000000,0.999989,0.999989,0.825741,0.000000,0.735107,0.734775,0.996126,0.998058,0.997908,0.999997,0.999640,1.000000,0.993993,0.997126,0.994390,0.994059,0.999991,0.999986,0.999292
|
||||||
|
Escherichia_coli--K-12_MG1655,1.000000,0.999987,0.999989,0.807495,0.735107,0.000000,0.382567,0.996190,0.997747,0.997455,0.999996,0.999604,1.000000,0.993444,0.996645,0.993773,0.993431,0.999989,0.999984,0.999174
|
||||||
|
Escherichia_coli--K-12_W3110,1.000000,0.999987,0.999989,0.807218,0.734775,0.382567,0.000000,0.996220,0.997761,0.997467,0.999995,0.999604,1.000000,0.993445,0.996669,0.993769,0.993443,0.999990,0.999985,0.999165
|
||||||
|
Klebsiella_pneumoniae--ATCC_13883,1.000000,0.999990,0.999989,0.991156,0.996126,0.996190,0.996220,0.000000,0.845220,0.840545,0.999997,0.999648,1.000000,0.996177,0.998128,0.996268,0.996052,0.999990,0.999987,0.999325
|
||||||
|
Klebsiella_pneumoniae--HS11286,1.000000,0.999988,0.999988,0.996855,0.998058,0.997747,0.997761,0.845220,0.000000,0.906475,0.999996,0.999683,1.000000,0.997724,0.995697,0.997776,0.997769,0.999989,0.999979,0.999463
|
||||||
|
Klebsiella_pneumoniae--MGH_78578,1.000000,0.999988,0.999986,0.997849,0.997908,0.997455,0.997467,0.840545,0.906475,0.000000,0.999996,0.999704,1.000000,0.997928,0.995054,0.997844,0.997868,0.999990,0.999980,0.999479
|
||||||
|
Opitutus_terrae--PB90-1,1.000000,0.999994,0.999995,0.999996,0.999997,0.999996,0.999995,0.999997,0.999996,0.999996,0.000000,0.999997,0.999998,0.999996,0.999996,0.999996,0.999995,0.999997,0.999993,0.999996
|
||||||
|
Proteus_mirabilis--HI4320,1.000000,0.999989,0.999985,0.999633,0.999640,0.999604,0.999604,0.999648,0.999683,0.999704,0.999997,0.000000,1.000000,0.999604,0.999699,0.999622,0.999613,0.999987,0.999983,0.999505
|
||||||
|
Saccharolobus_islandicus--M.16.4,1.000000,1.000000,0.999999,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.999998,1.000000,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
|
||||||
|
Salmonella_enterica--AKU_12601,1.000000,0.999988,0.999988,0.993885,0.993993,0.993444,0.993445,0.996177,0.997724,0.997928,0.999996,0.999604,1.000000,0.000000,0.869238,0.682277,0.663383,0.999990,0.999985,0.999260
|
||||||
|
Salmonella_enterica--CT18,1.000000,0.999987,0.999987,0.996736,0.997126,0.996645,0.996669,0.998128,0.995697,0.995054,0.999996,0.999699,1.000000,0.869238,0.000000,0.890872,0.886148,0.999988,0.999976,0.999524
|
||||||
|
Salmonella_enterica--LT2,1.000000,0.999987,0.999989,0.994148,0.994390,0.993773,0.993769,0.996268,0.997776,0.997844,0.999996,0.999622,1.000000,0.682277,0.890872,0.000000,0.622606,0.999989,0.999985,0.999296
|
||||||
|
Salmonella_enterica--P125109,1.000000,0.999988,0.999988,0.993821,0.994059,0.993431,0.993443,0.996052,0.997769,0.997868,0.999995,0.999613,1.000000,0.663383,0.886148,0.622606,0.000000,0.999988,0.999983,0.999270
|
||||||
|
Shouchella_clausii--KSM-K16,1.000000,0.999989,0.999778,0.999991,0.999991,0.999989,0.999990,0.999990,0.999989,0.999990,0.999997,0.999987,1.000000,0.999990,0.999988,0.999989,0.999988,0.000000,0.999991,0.999988
|
||||||
|
Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1,1.000000,0.999991,0.999993,0.999984,0.999986,0.999984,0.999985,0.999987,0.999979,0.999980,0.999993,0.999983,1.000000,0.999985,0.999976,0.999985,0.999983,0.999991,0.000000,0.999983
|
||||||
|
Yersinia_ruckeri--YRB,1.000000,0.999987,0.999987,0.999291,0.999292,0.999174,0.999165,0.999325,0.999463,0.999479,0.999996,0.999505,1.000000,0.999260,0.999524,0.999296,0.999270,0.999988,0.999983,0.000000
|
||||||
|
@@ -0,0 +1 @@
|
|||||||
|
(((((((((((Candidozyma_auris--GCF_003013715.1_ASM301371v2:0.5000001881725941,Saccharolobus_islandicus--M.16.4:0.4999993211600824):0.0000023411501775538747,Opitutus_terrae--PB90-1:0.499997075187947):0.0000029791191795691675,(Acidobacterium_capsulatum--ATCC_51196:0.49999227771334689,(Bacillus_subtilis--168:0.49988797935621456,Shouchella_clausii--KSM-K16:0.49988984146059159):0.0001037210285571577):0.0000023959836053522034):0.0000034093646568700288,Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1:0.4999920159222422):0.000199555100890203,Proteus_mirabilis--HI4320:0.49979129185300427):0.00010103619067070024,Yersinia_ruckeri--YRB:0.4996806650749249):0.0013719139155004,(Klebsiella_pneumoniae--HS11286:0.43798845051648258,(Klebsiella_pneumoniae--ATCC_13883:0.41780293826821265,Klebsiella_pneumoniae--MGH_78578:0.42274184870836559):0.017586732339732737):0.0604124197073832):0.0006482538063555254,(Salmonella_enterica--CT18:0.43952894448143017,(Salmonella_enterica--AKU_12601:0.3357977326267918,(Salmonella_enterica--LT2:0.31203395843666389,Salmonella_enterica--P125109:0.31057217324861216):0.025729515856701136):0.10292985918524672):0.05825411485542886):0.08937928015651564,Escherichia_coli--CFT073:0.40806501650701029):0.0410131211869626,Escherichia_coli--EDL933:0.3681464750911808):0.1755112579711463,Escherichia_coli--K-12_MG1655:0.19129818036662728,Escherichia_coli--K-12_W3110:0.19126872019906239);
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
genome,Candidozyma_auris--GCF_003013715.1_ASM301371v2,Acidobacterium_capsulatum--ATCC_51196,Bacillus_subtilis--168,Escherichia_coli--CFT073,Escherichia_coli--EDL933,Escherichia_coli--K-12_MG1655,Escherichia_coli--K-12_W3110,Klebsiella_pneumoniae--ATCC_13883,Klebsiella_pneumoniae--HS11286,Klebsiella_pneumoniae--MGH_78578,Opitutus_terrae--PB90-1,Proteus_mirabilis--HI4320,Saccharolobus_islandicus--M.16.4,Salmonella_enterica--AKU_12601,Salmonella_enterica--CT18,Salmonella_enterica--LT2,Salmonella_enterica--P125109,Shouchella_clausii--KSM-K16,Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1,Yersinia_ruckeri--YRB
|
||||||
|
Candidozyma_auris--GCF_003013715.1_ASM301371v2,0,0,0,0,0,0,0,0,0,0,0,0,8,0,1,0,0,0,0,3
|
||||||
|
Acidobacterium_capsulatum--ATCC_51196,0,0,203,119,128,141,140,116,109,111,78,112,0,136,109,147,134,117,55,129
|
||||||
|
Bacillus_subtilis--168,0,203,0,124,132,128,123,133,109,130,66,158,6,131,112,124,135,2393,46,124
|
||||||
|
Escherichia_coli--CFT073,0,119,124,0,1966777,1998059,1999094,117743,32029,22312,63,4225,0,74946,31918,73311,76585,113,128,7854
|
||||||
|
Escherichia_coli--EDL933,0,128,132,1966777,0,2627885,2628700,52488,20134,22064,48,4202,0,74655,28602,71244,74665,112,108,7963
|
||||||
|
Escherichia_coli--K-12_MG1655,0,141,128,1998059,2627885,0,4452541,48302,21382,24602,47,4277,0,75729,30449,73622,76778,119,111,8566
|
||||||
|
Escherichia_coli--K-12_W3110,0,140,123,1999094,2628700,4452541,0,47894,21226,24470,68,4278,0,75658,30207,73614,76583,112,108,8660
|
||||||
|
Klebsiella_pneumoniae--ATCC_13883,0,116,133,117743,52488,48302,47894,0,1416091,1477759,42,4172,0,48296,18988,48144,50416,120,106,7712
|
||||||
|
Klebsiella_pneumoniae--HS11286,0,109,109,32029,20134,21382,21226,1416091,0,644063,42,2738,0,21498,29758,21606,21376,99,102,4417
|
||||||
|
Klebsiella_pneumoniae--MGH_78578,0,111,130,22312,22064,24602,24470,1477759,644063,0,42,2614,0,19948,35067,21330,20813,97,102,4374
|
||||||
|
Opitutus_terrae--PB90-1,0,78,66,63,48,47,68,42,42,42,0,43,18,57,42,53,66,39,58,43
|
||||||
|
Proteus_mirabilis--HI4320,0,112,158,4225,4202,4277,4278,4172,2738,2614,43,0,0,4254,2481,4166,4215,131,103,4704
|
||||||
|
Saccharolobus_islandicus--M.16.4,8,0,6,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0
|
||||||
|
Salmonella_enterica--AKU_12601,0,136,131,74946,74655,75729,75658,48296,21498,19948,57,4254,0,0,1047731,2857146,2951421,117,108,7643
|
||||||
|
Salmonella_enterica--CT18,1,109,112,31918,28602,30449,30207,18988,29758,35067,42,2481,0,1047731,0,917948,940297,106,106,3716
|
||||||
|
Salmonella_enterica--LT2,0,147,124,73311,71244,73622,73614,48144,21606,21330,53,4166,0,2857146,917948,0,3284800,122,108,7460
|
||||||
|
Salmonella_enterica--P125109,0,134,135,76585,74665,76778,76583,50416,21376,20813,66,4215,0,2951421,940297,3284800,0,134,124,7645
|
||||||
|
Shouchella_clausii--KSM-K16,0,117,2393,113,112,119,112,120,99,97,39,131,0,117,106,122,134,0,58,124
|
||||||
|
Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1,0,55,46,128,108,111,108,106,102,102,58,103,0,108,106,108,124,58,0,96
|
||||||
|
Yersinia_ruckeri--YRB,3,129,124,7854,7963,8566,8660,7712,4417,4374,43,4704,0,7643,3716,7460,7645,124,96,0
|
||||||
|
Executable
+181
@@ -0,0 +1,181 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Compare an obikmer count index against a reference kmer set (presence + counts).
|
||||||
|
|
||||||
|
Loads the reference .npz (sorted uint64 kmers + uint32 counts from build_reference.py),
|
||||||
|
streams `obikmer dump` from a --with-counts index, then reports:
|
||||||
|
- false negatives : kmers in reference absent from the index
|
||||||
|
- false positives : kmers in the index absent from the reference
|
||||||
|
- count mismatches: kmers present in both but with differing counts
|
||||||
|
|
||||||
|
Output to stdout: one CSV row
|
||||||
|
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
|
||||||
|
fn_pct,fp_pct,cm_pct
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Nucleotide → 2-bit code; lower-case letters share the code of their
# upper-case counterpart.
_ENCODE = {base: code
           for alphabet in ('ACGT', 'acgt')
           for code, base in enumerate(alphabet)}

# 2-bit code → upper-case nucleotide.
_DECODE = list('ACGT')


def encode_kmer(s: str) -> int:
    """Pack nucleotide string `s` into an integer, two bits per base.

    The first base ends up in the most significant bits. A character outside
    ACGTacgt raises KeyError.
    """
    packed = 0
    for base in s:
        # Codes are < 4, so "times four plus code" equals shift-left-2 then OR.
        packed = packed * 4 + _ENCODE[base]
    return packed


def decode_kmer(val: int, k: int) -> str:
    """Unpack the `k` lowest 2-bit codes of `val` into an upper-case string.

    Codes are read from the most significant of the k positions down to the
    least significant one, which reverses what `encode_kmer` did.
    """
    return ''.join(_DECODE[(val >> (2 * position)) & 3]
                   for position in range(k - 1, -1, -1))
|
||||||
|
|
||||||
|
|
||||||
|
# ── dump parsing ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]:
    """Stream `obikmer dump` and return (kmers_sorted_uint64, counts_uint32).

    Runs `<obikmer_bin> dump <index_dir>`, skips the first (header) line and
    reads every following line as `kmer,count`. Blank lines are ignored.

    Returns the encoded kmers sorted in increasing order together with their
    counts, reordered the same way.

    Exits the process with status 1 when the dump command returns a non-zero
    exit status.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    kmers, counts = [], []
    # The context manager closes the pipe and reaps the child even when a
    # malformed line raises while parsing.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        next(proc.stdout, None)  # header line
        for line in proc.stdout:
            line = line.rstrip('\n')
            if not line:
                continue
            parts = line.split(',')
            kmers.append(encode_kmer(parts[0]))
            counts.append(int(parts[1]))
    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)
    # Convert each list once; the conversion is the costly step on large dumps.
    kmer_arr = np.array(kmers, dtype=np.uint64)
    count_arr = np.array(counts, dtype=np.uint32)
    order = np.argsort(kmer_arr, kind='stable')
    return kmer_arr[order], count_arr[order]
|
||||||
|
|
||||||
|
|
||||||
|
# ── comparison ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray,
            idx_kmers: np.ndarray, idx_counts: np.ndarray,
            ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts).

    `ref_kmers` and `idx_kmers` must be sorted and free of duplicates;
    `ref_counts` / `idx_counts` are aligned with them.

    false_neg : kmers of the reference that are absent from the index
    false_pos : kmers of the index that are absent from the reference
    cm_*      : kmers present in both arrays but with differing counts,
                with the reference count and the index count of each.

    An empty index is valid input: every reference kmer is then a false
    negative and there is no count mismatch.
    """
    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)

    # Count mismatches among shared kmers.
    if len(idx_kmers) == 0:
        # Nothing can be shared. Without this branch the clip below would get
        # an upper bound of -1 and the lookup would index an empty array.
        pos_in_idx = np.zeros(len(ref_kmers), dtype=np.intp)
        shared_mask = np.zeros(len(ref_kmers), dtype=bool)
    else:
        # Both arrays are sorted so we can use searchsorted. Positions past
        # the end are clipped to the last slot; they fail the equality test.
        pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
        pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
        shared_mask = idx_kmers[pos_in_idx] == ref_kmers

    shared_ref_counts = ref_counts[shared_mask]
    shared_idx_counts = idx_counts[pos_in_idx[shared_mask]]
    mismatch_mask = shared_ref_counts != shared_idx_counts

    cm_kmers = ref_kmers[shared_mask][mismatch_mask]
    cm_ref_counts = shared_ref_counts[mismatch_mask]
    cm_idx_counts = shared_idx_counts[mismatch_mask]

    return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point.

    With --header, print the CSV header and return. Otherwise compare the
    index in INDEX_DIR with the reference in REF_NPZ, log a summary on stderr,
    optionally save the offending kmers, and print one CSV row on stdout.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reference', metavar='REF_NPZ', nargs='?',
                    help='Reference .npz file')
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='obikmer index directory (built with --with-counts)')
    ap.add_argument('--obikmer', default='obikmer',
                    help='Path to obikmer binary')
    ap.add_argument('--species', default='')
    ap.add_argument('--strain', default='')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fp', metavar='FILE',
                    help='Save false-positive kmer strings to FILE')
    ap.add_argument('--save-fn', metavar='FILE',
                    help='Save false-negative kmer strings to FILE')
    ap.add_argument('--save-cm', metavar='FILE',
                    help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return

    # The positionals are declared optional only so that --header works on its
    # own; a real comparison needs both of them.
    if args.reference is None or args.index is None:
        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')

    # Detect k from the first kmer of the dump (line 0 is the header).
    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        print(f'ERROR: cannot detect k, no kmer in the dump of {args.index}',
              file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])

    # Load reference
    print(f'Loading reference: {args.reference}', file=sys.stderr)
    npz = np.load(args.reference)
    ref_kmers = npz['kmers']  # sorted uint64
    ref_counts = npz['counts']  # uint32

    # Load index
    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
    idx_kmers, idx_counts = load_index(args.obikmer, args.index)

    print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)

    false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare(
        ref_kmers, ref_counts, idx_kmers, idx_counts)

    # Shared kmers are the reference kmers that are not false negatives.
    n_shared = len(ref_kmers) - len(false_neg)
    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
    cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0

    print(f'false negatives : {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
    print(f'false positives : {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)
    print(f'count mismatches: {len(cm_kmers):,} ({cm_pct:.4f}% of shared)',
          file=sys.stderr)

    # Each output file is written only when there is something to report.
    if args.save_fn and len(false_neg):
        with open(args.save_fn, 'w') as fh:
            for v in false_neg:
                fh.write(decode_kmer(int(v), k) + '\n')

    if args.save_fp and len(false_pos):
        with open(args.save_fp, 'w') as fh:
            for v in false_pos:
                fh.write(decode_kmer(int(v), k) + '\n')

    if args.save_cm and len(cm_kmers):
        with open(args.save_cm, 'w') as fh:
            fh.write('kmer,ref_count,idx_count\n')
            for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
                fh.write(f'{decode_kmer(int(v), k)},{rc},{ic}\n')

    print(f'{args.species},{args.strain},'
          f'{len(ref_kmers)},{len(idx_kmers)},'
          f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
          f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+201
@@ -0,0 +1,201 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Verify the merged count index against all per-specimen reference sets.
|
||||||
|
|
||||||
|
Streams `obikmer dump` once on the merged index, accumulates per-specimen
|
||||||
|
kmer+count pairs from each column, then compares each against its reference .npz.
|
||||||
|
|
||||||
|
Output to stdout: one CSV row per specimen (same columns as verify_count.py)
|
||||||
|
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
|
||||||
|
fn_pct,fp_pct,cm_pct
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Nucleotide → 2-bit code; lower-case letters share the code of their
# upper-case counterpart.
_ENCODE = {base: code
           for alphabet in ('ACGT', 'acgt')
           for code, base in enumerate(alphabet)}

# 2-bit code → upper-case nucleotide.
_DECODE = list('ACGT')


def encode_kmer(s: str) -> int:
    """Pack nucleotide string `s` into an integer, two bits per base.

    The first base ends up in the most significant bits. A character outside
    ACGTacgt raises KeyError.
    """
    packed = 0
    for base in s:
        # Codes are < 4, so "times four plus code" equals shift-left-2 then OR.
        packed = packed * 4 + _ENCODE[base]
    return packed


def decode_kmer(val: int, k: int) -> str:
    """Unpack the `k` lowest 2-bit codes of `val` into an upper-case string.

    Codes are read from the most significant of the k positions down to the
    least significant one, which reverses what `encode_kmer` did.
    """
    return ''.join(_DECODE[(val >> (2 * position)) & 3]
                   for position in range(k - 1, -1, -1))
|
||||||
|
|
||||||
|
|
||||||
|
# ── single-pass dump ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def stream_merged_dump(obikmer_bin: str, index_dir: str,
                       ) -> tuple[list[str], dict[str, tuple[list[int], list[int]]]]:
    """Read the dump of the merged index in a single pass.

    Runs `<obikmer_bin> dump <index_dir>`. The first output line is the
    header: its first column is dropped and the remaining columns are taken
    as specimen labels. Every following line holds a kmer followed by one
    count per specimen.

    Returns:
        specimen_names : the labels, in header order
        per_specimen   : label → (encoded kmers, counts), keeping only the
                         entries whose count is greater than zero

    Exits the process with status 1 when the dump command fails.
    """
    dump = subprocess.Popen([obikmer_bin, 'dump', index_dir],
                            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                            text=True)

    specimen_names = dump.stdout.readline().rstrip('\n').split(',')[1:]
    per_specimen: dict[str, tuple[list[int], list[int]]] = {}
    for label in specimen_names:
        per_specimen[label] = ([], [])

    for record in dump.stdout:
        fields = record.rstrip('\n').split(',')
        kmer_int = encode_kmer(fields[0])
        # Field 0 is the kmer, so specimen columns start at field 1.
        for column, label in enumerate(specimen_names, start=1):
            count = int(fields[column])
            if count <= 0:
                continue
            seen_kmers, seen_counts = per_specimen[label]
            seen_kmers.append(kmer_int)
            seen_counts.append(count)

    exit_code = dump.wait()
    if exit_code != 0:
        print(f'ERROR: obikmer dump exited {exit_code}', file=sys.stderr)
        sys.exit(1)

    return specimen_names, per_specimen
|
||||||
|
|
||||||
|
|
||||||
|
# ── per-specimen comparison ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare_specimen(name: str,
|
||||||
|
kmer_list: list[int],
|
||||||
|
count_list: list[int],
|
||||||
|
ref_dir: Path,
|
||||||
|
k: int,
|
||||||
|
save_fn: Path | None,
|
||||||
|
save_fp: Path | None,
|
||||||
|
save_cm: Path | None,
|
||||||
|
) -> str:
|
||||||
|
ref_path = ref_dir / f'{name}.npz'
|
||||||
|
if not ref_path.exists():
|
||||||
|
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
|
||||||
|
return ''
|
||||||
|
|
||||||
|
species = name.split('--')[0]
|
||||||
|
strain = name[len(species) + 2:]
|
||||||
|
|
||||||
|
npz = np.load(ref_path)
|
||||||
|
ref_kmers = npz['kmers'] # sorted uint64
|
||||||
|
ref_counts = npz['counts'] # uint32
|
||||||
|
|
||||||
|
order = np.argsort(np.array(kmer_list, dtype=np.uint64), kind='stable')
|
||||||
|
idx_kmers = np.array(kmer_list, dtype=np.uint64)[order]
|
||||||
|
idx_counts = np.array(count_list, dtype=np.uint32)[order]
|
||||||
|
|
||||||
|
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
|
||||||
|
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
|
||||||
|
|
||||||
|
# Count mismatches among shared kmers
|
||||||
|
pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
|
||||||
|
pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
|
||||||
|
shared_mask = idx_kmers[pos_in_idx] == ref_kmers
|
||||||
|
mismatch_mask = ref_counts[shared_mask] != idx_counts[pos_in_idx[shared_mask]]
|
||||||
|
cm_kmers = ref_kmers[shared_mask][mismatch_mask]
|
||||||
|
cm_ref = ref_counts[shared_mask][mismatch_mask]
|
||||||
|
cm_idx = idx_counts[pos_in_idx[shared_mask]][mismatch_mask]
|
||||||
|
|
||||||
|
n_shared = int(shared_mask.sum())
|
||||||
|
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
|
||||||
|
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
|
||||||
|
cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0
|
||||||
|
|
||||||
|
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
|
||||||
|
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
|
||||||
|
f'fp={len(false_pos):,} ({fp_pct:.4f}%) '
|
||||||
|
f'cm={len(cm_kmers):,} ({cm_pct:.4f}%)',
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
|
if save_fn and len(false_neg):
|
||||||
|
fn_file = save_fn / f'{name}_fn.txt'
|
||||||
|
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
|
||||||
|
|
||||||
|
if save_fp and len(false_pos):
|
||||||
|
fp_file = save_fp / f'{name}_fp.txt'
|
||||||
|
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
|
||||||
|
|
||||||
|
if save_cm and len(cm_kmers):
|
||||||
|
cm_file = save_cm / f'{name}_cm.csv'
|
||||||
|
lines = ['kmer,ref_count,idx_count']
|
||||||
|
for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
|
||||||
|
lines.append(f'{decode_kmer(int(v), k)},{rc},{ic}')
|
||||||
|
cm_file.write_text('\n'.join(lines) + '\n')
|
||||||
|
|
||||||
|
return (f'{species},{strain},'
|
||||||
|
f'{len(ref_kmers)},{len(idx_kmers)},'
|
||||||
|
f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
|
||||||
|
f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point.

    With ``--header`` only the CSV header is printed. Otherwise the merged
    index is dumped once, and one CSV row per specimen column that has a
    reference file is printed to stdout; progress goes to stderr.

    Exits with an argparse usage error when INDEX_DIR or REF_DIR is missing
    (they are optional on the command line only so ``--header`` can be used
    alone), and with status 1 when the index dump contains no kmer row.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='Merged count index directory')
    ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
                    help='Directory containing per-specimen .npz reference files')
    ap.add_argument('--obikmer', default='obikmer')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fn', metavar='DIR',
                    help='Directory for false-negative kmer lists')
    ap.add_argument('--save-fp', metavar='DIR',
                    help='Directory for false-positive kmer lists')
    ap.add_argument('--save-cm', metavar='DIR',
                    help='Directory for count-mismatch CSV files')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return

    # Report a usage error instead of failing later on Path(None).
    if args.index is None or args.ref_dir is None:
        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')

    ref_dir = Path(args.ref_dir)
    save_fn = Path(args.save_fn) if args.save_fn else None
    save_fp = Path(args.save_fp) if args.save_fp else None
    save_cm = Path(args.save_cm) if args.save_cm else None
    for d in (save_fn, save_fp, save_cm):
        if d:
            d.mkdir(parents=True, exist_ok=True)

    # Detect k from the first data row of a one-row dump.
    out1 = subprocess.check_output(
        [args.obikmer, 'dump', '--head', '1', args.index],
        stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        print(f'ERROR: no kmer row in dump of {args.index}', file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])

    print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)

    for name in specimen_names:
        kmers, counts = per_specimen[name]
        row = compare_specimen(name, kmers, counts, ref_dir, k,
                               save_fn, save_fp, save_cm)
        if row:
            print(row)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+27
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
# Verify the merged count index against the per-specimen references.
#
# Writes the CSV header followed by the rows printed by verify_merge_count.py
# to stats/verify_merge_count/current.csv, then archives a copy of that file
# as count_NNN.csv in the same directory.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
INDEX="${SCRIPT_DIR}/global_index_count"
REF_DIR="${SCRIPT_DIR}/reference_index"
STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_count"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_merge_count.py"

mkdir -p "${STATS_DIR}"

CURRENT="${STATS_DIR}/current.csv"

"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"

"${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    "${INDEX}" "${REF_DIR}" \
    >>"${CURRENT}"

# Start from the number of existing archives, then skip past any name that is
# already taken: if an older archive was deleted, the plain count would point
# at an existing file and cp would overwrite it.
run_idx=$(find "${STATS_DIR}" -maxdepth 1 -name 'count_*.csv' | wc -l | tr -d ' ')
ARCHIVE="${STATS_DIR}/count_$(printf '%03d' "${run_idx}").csv"
while [[ -e "${ARCHIVE}" ]]; do
    run_idx=$((run_idx + 1))
    ARCHIVE="${STATS_DIR}/count_$(printf '%03d' "${run_idx}").csv"
done
cp "${CURRENT}" "${ARCHIVE}"

echo "Done. Results → ${ARCHIVE}"
|
||||||
Executable
+170
@@ -0,0 +1,170 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Verify the merged presence index against all per-specimen reference sets.
|
||||||
|
|
||||||
|
Streams `obikmer dump` once on the merged index, accumulates per-specimen
|
||||||
|
kmer sets from each column, then compares each against its reference .npz.
|
||||||
|
|
||||||
|
Output to stdout: one CSV row per specimen (same columns as verify_presence.py)
|
||||||
|
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Two-bit code of each nucleotide; lower-case letters share the codes of
# their upper-case counterparts.
_ENCODE = {base: code
           for base, code in zip('ACGTacgt', (0, 1, 2, 3, 0, 1, 2, 3))}

# Inverse table: two-bit code → upper-case nucleotide.
_DECODE = list('ACGT')


def encode_kmer(s: str) -> int:
    """Pack a kmer string into an integer, two bits per base.

    The first character ends up in the most significant position. The empty
    string encodes to 0; a character outside ``ACGTacgt`` raises ``KeyError``.
    """
    packed = 0
    for base in s:
        # Every code is below 4, so this equals (packed << 2) | code.
        packed = packed * 4 + _ENCODE[base]
    return packed


def decode_kmer(val: int, k: int) -> str:
    """Unpack an integer into an upper-case kmer string of length ``k``.

    The two lowest bits of ``val`` give the last base, the next two bits the
    base before it, and so on.
    """
    letters = [''] * k
    # Fill from the right: the low bits describe the last base of the kmer.
    for slot in range(k - 1, -1, -1):
        letters[slot] = _DECODE[val & 3]
        val >>= 2
    return ''.join(letters)
|
||||||
|
|
||||||
|
|
||||||
|
# ── single-pass dump ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def stream_merged_dump(obikmer_bin: str, index_dir: str,
                       ) -> tuple[list[str], dict[str, list[int]]]:
    """Stream the merged dump once.

    Runs ``<obikmer_bin> dump <index_dir>`` and parses its comma-separated
    standard output. The first line is a header; each following line holds a
    kmer string followed by one integer per specimen column. Empty lines are
    ignored.

    Returns:
      specimen_names : column labels in dump order (excluding 'kmer')
      per_specimen   : mapping label → list of kmer ints where presence > 0

    Exits the program with status 1 when the dump command finishes with a
    non-zero return code.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    # The context manager closes the pipe and reaps the child process even
    # when a malformed row raises part-way through the stream.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        header_line = proc.stdout.readline().rstrip('\n')
        cols = header_line.split(',')
        specimen_names = cols[1:]          # first col is 'kmer'
        per_specimen: dict[str, list[int]] = {name: [] for name in specimen_names}
        # Resolve each column's list once instead of doing a dictionary
        # lookup for every cell of every row.
        columns = [per_specimen[name] for name in specimen_names]

        for line in proc.stdout:
            line = line.rstrip('\n')
            if not line:
                continue
            parts = line.split(',')
            kmer_int = encode_kmer(parts[0])
            for i, kmers in enumerate(columns):
                # Indexing (rather than zip) keeps a short row a hard error.
                if int(parts[i + 1]) > 0:
                    kmers.append(kmer_int)

    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)

    return specimen_names, per_specimen
|
||||||
|
|
||||||
|
|
||||||
|
# ── per-specimen comparison ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare_specimen(name: str,
|
||||||
|
kmer_list: list[int],
|
||||||
|
ref_dir: Path,
|
||||||
|
k: int,
|
||||||
|
save_fn: Path | None,
|
||||||
|
save_fp: Path | None,
|
||||||
|
) -> str:
|
||||||
|
"""Compare one specimen column against its reference .npz.
|
||||||
|
|
||||||
|
Returns a CSV row string.
|
||||||
|
"""
|
||||||
|
ref_path = ref_dir / f'{name}.npz'
|
||||||
|
if not ref_path.exists():
|
||||||
|
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
|
||||||
|
return ''
|
||||||
|
|
||||||
|
species = name.split('--')[0]
|
||||||
|
strain = name[len(species) + 2:]
|
||||||
|
|
||||||
|
ref_kmers = np.load(ref_path)['kmers'] # sorted uint64
|
||||||
|
idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64)
|
||||||
|
|
||||||
|
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
|
||||||
|
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
|
||||||
|
|
||||||
|
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
|
||||||
|
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
|
||||||
|
|
||||||
|
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
|
||||||
|
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
|
||||||
|
f'fp={len(false_pos):,} ({fp_pct:.4f}%)',
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
|
if save_fn and len(false_neg):
|
||||||
|
fn_file = save_fn / f'{name}_fn.txt'
|
||||||
|
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
|
||||||
|
|
||||||
|
if save_fp and len(false_pos):
|
||||||
|
fp_file = save_fp / f'{name}_fp.txt'
|
||||||
|
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
|
||||||
|
|
||||||
|
return (f'{species},{strain},'
|
||||||
|
f'{len(ref_kmers)},{len(idx_kmers)},'
|
||||||
|
f'{len(false_neg)},{len(false_pos)},'
|
||||||
|
f'{fn_pct:.4f},{fp_pct:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point.

    With ``--header`` only the CSV header is printed. Otherwise the merged
    index is dumped once, and one CSV row per specimen column that has a
    reference file is printed to stdout; progress goes to stderr.

    Exits with an argparse usage error when INDEX_DIR or REF_DIR is missing
    (they are optional on the command line only so ``--header`` can be used
    alone), and with status 1 when the index dump contains no kmer row.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='Merged presence index directory')
    ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
                    help='Directory containing per-specimen .npz reference files')
    ap.add_argument('--obikmer', default='obikmer')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fn', metavar='DIR',
                    help='Directory to save false-negative kmer lists')
    ap.add_argument('--save-fp', metavar='DIR',
                    help='Directory to save false-positive kmer lists')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,fn_pct,fp_pct')
        return

    # Report a usage error instead of failing later on Path(None).
    if args.index is None or args.ref_dir is None:
        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')

    ref_dir = Path(args.ref_dir)
    save_fn = Path(args.save_fn) if args.save_fn else None
    save_fp = Path(args.save_fp) if args.save_fp else None
    for d in (save_fn, save_fp):
        if d:
            d.mkdir(parents=True, exist_ok=True)

    # Detect k from the first data row of a one-row dump.
    out1 = subprocess.check_output(
        [args.obikmer, 'dump', '--head', '1', args.index],
        stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        print(f'ERROR: no kmer row in dump of {args.index}', file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])

    print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)

    for name in specimen_names:
        row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp)
        if row:
            print(row)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+27
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
# Verify the merged presence index against the per-specimen references.
#
# Writes the CSV header followed by the rows printed by
# verify_merge_presence.py to stats/verify_merge_presence/current.csv, then
# archives a copy of that file as presence_NNN.csv in the same directory.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
INDEX="${SCRIPT_DIR}/global_index_presence"
REF_DIR="${SCRIPT_DIR}/reference_index"
STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_presence"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_merge_presence.py"

mkdir -p "${STATS_DIR}"

CURRENT="${STATS_DIR}/current.csv"

"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"

"${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    "${INDEX}" "${REF_DIR}" \
    >>"${CURRENT}"

# Start from the number of existing archives, then skip past any name that is
# already taken: if an older archive was deleted, the plain count would point
# at an existing file and cp would overwrite it.
run_idx=$(find "${STATS_DIR}" -maxdepth 1 -name 'presence_*.csv' | wc -l | tr -d ' ')
ARCHIVE="${STATS_DIR}/presence_$(printf '%03d' "${run_idx}").csv"
while [[ -e "${ARCHIVE}" ]]; do
    run_idx=$((run_idx + 1))
    ARCHIVE="${STATS_DIR}/presence_$(printf '%03d' "${run_idx}").csv"
done
cp "${CURRENT}" "${ARCHIVE}"

echo "Done. Results → ${ARCHIVE}"
|
||||||
Executable
+30
@@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: verify_one_count.sh SPECIMEN
#   SPECIMEN = "species--strain"  (Make pattern stem)
# Output: stats/verify_count/SPECIMEN.stats  (one CSV data row, no header)
set -euo pipefail

if [[ $# -lt 1 ]]; then
    echo "Usage: $(basename "$0") SPECIMEN" >&2
    exit 2
fi

SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_count.py"

# Split "species--strain" at the first "--".
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
INDEX_DIR="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/verify_count"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"

mkdir -p "${STATS_DIR}"

echo "[${SPECIMEN}] verifying count"

# Write to a temporary file and rename only on success, so a failed run never
# leaves an empty or partial .stats file behind (presumably the file is a Make
# target, where a truncated file would look up to date; confirm with the
# Makefile).
TMP_FILE="${STATS_FILE}.tmp.$$"
trap 'rm -f "${TMP_FILE}"' EXIT

"${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    --species "${species}" \
    --strain "${strain}" \
    "${REF_NPZ}" "${INDEX_DIR}" \
    >"${TMP_FILE}"

mv "${TMP_FILE}" "${STATS_FILE}"
|
||||||
Executable
+30
@@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: verify_one_presence.sh SPECIMEN
#   SPECIMEN = "species--strain"  (Make pattern stem)
# Output: stats/verify_presence/SPECIMEN.stats  (one CSV data row, no header)
set -euo pipefail

if [[ $# -lt 1 ]]; then
    echo "Usage: $(basename "$0") SPECIMEN" >&2
    exit 2
fi

SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_presence.py"

# Split "species--strain" at the first "--".
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
INDEX_DIR="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/verify_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"

mkdir -p "${STATS_DIR}"

echo "[${SPECIMEN}] verifying presence"

# Write to a temporary file and rename only on success, so a failed run never
# leaves an empty or partial .stats file behind (presumably the file is a Make
# target, where a truncated file would look up to date; confirm with the
# Makefile).
TMP_FILE="${STATS_FILE}.tmp.$$"
trap 'rm -f "${TMP_FILE}"' EXIT

"${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    --species "${species}" \
    --strain "${strain}" \
    "${REF_NPZ}" "${INDEX_DIR}" \
    >"${TMP_FILE}"

mv "${TMP_FILE}" "${STATS_FILE}"
|
||||||
Executable
+139
@@ -0,0 +1,139 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Compare an obikmer index against a reference kmer set (presence/absence).
|
||||||
|
|
||||||
|
Loads the reference .npz (sorted uint64 kmers built by build_reference.py),
|
||||||
|
streams the output of `obikmer dump`, encodes each kmer string to uint64,
|
||||||
|
then reports false negatives and false positives using numpy set operations.
|
||||||
|
|
||||||
|
Output to stdout: one CSV row
|
||||||
|
species, strain, ref_kmers, idx_kmers, false_neg, false_pos, fn_pct, fp_pct
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Two-bit code of each nucleotide; lower-case letters share the codes of
# their upper-case counterparts.
_ENCODE = {base: code
           for base, code in zip('ACGTacgt', (0, 1, 2, 3, 0, 1, 2, 3))}

# Inverse table: two-bit code → upper-case nucleotide.
_DECODE = list('ACGT')


def encode_kmer(s: str) -> int:
    """Pack a kmer string into an integer, two bits per base.

    The first character ends up in the most significant position. The empty
    string encodes to 0; a character outside ``ACGTacgt`` raises ``KeyError``.
    """
    packed = 0
    for base in s:
        # Every code is below 4, so this equals (packed << 2) | code.
        packed = packed * 4 + _ENCODE[base]
    return packed


def decode_kmer(val: int, k: int) -> str:
    """Unpack an integer into an upper-case kmer string of length ``k``.

    The two lowest bits of ``val`` give the last base, the next two bits the
    base before it, and so on.
    """
    letters = [''] * k
    # Fill from the right: the low bits describe the last base of the kmer.
    for slot in range(k - 1, -1, -1):
        letters[slot] = _DECODE[val & 3]
        val >>= 2
    return ''.join(letters)
|
||||||
|
|
||||||
|
|
||||||
|
# ── dump parsing ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_index_kmers(obikmer_bin: str, index_dir: str) -> np.ndarray:
    """Stream `obikmer dump` and return a sorted uint64 array of kmer integers.

    Runs ``<obikmer_bin> dump <index_dir>``, skips the first output line
    (header) and encodes the first comma-separated field of every other
    non-empty line with ``encode_kmer``.

    Exits the program with status 1 when the dump command finishes with a
    non-zero return code.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    kmers = []
    # The context manager closes the pipe and reaps the child process even
    # when an unexpected character raises part-way through the stream.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        next(proc.stdout, None)            # discard the header line
        for line in proc.stdout:
            # Strip the newline before splitting: a row without any comma
            # would otherwise carry '\n' into encode_kmer.
            kmer_str = line.rstrip('\n').split(',', 1)[0]
            if not kmer_str:
                continue
            kmers.append(encode_kmer(kmer_str))

    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)

    arr = np.array(kmers, dtype=np.uint64)
    arr.sort()
    return arr
|
||||||
|
|
||||||
|
|
||||||
|
# ── comparison ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare(ref: np.ndarray, idx: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Return (false_negatives, false_positives) as uint64 arrays.

    False negatives are the values of ``ref`` absent from ``idx``; false
    positives are the values of ``idx`` absent from ``ref``. Both inputs are
    passed to numpy with ``assume_unique=True``, i.e. they are expected to
    contain no duplicates.
    """
    return (np.setdiff1d(ref, idx, assume_unique=True),
            np.setdiff1d(idx, ref, assume_unique=True))
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point.

    With ``--header`` only the CSV header is printed. Otherwise the
    reference .npz and the index dump are loaded and compared, diagnostics go
    to stderr, optional kmer lists are written to ``--save-fn`` /
    ``--save-fp``, and one CSV row is printed to stdout.

    Exits with an argparse usage error when REF_NPZ or INDEX_DIR is missing
    (they are optional on the command line only so ``--header`` can be used
    alone), and with status 1 when the index dump contains no kmer row.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reference', metavar='REF_NPZ', nargs='?', help='Reference .npz file')
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?', help='obikmer index directory')
    ap.add_argument('--obikmer', default='obikmer', help='Path to obikmer binary')
    ap.add_argument('--species', default='', help='Species label for CSV row')
    ap.add_argument('--strain', default='', help='Strain label for CSV row')
    ap.add_argument('--header', action='store_true', help='Print CSV header and exit')
    ap.add_argument('--save-fp', metavar='FILE',
                    help='Save false-positive kmer strings to FILE')
    ap.add_argument('--save-fn', metavar='FILE',
                    help='Save false-negative kmer strings to FILE')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,fn_pct,fp_pct')
        return

    # Report a usage error instead of failing later inside subprocess/numpy.
    if args.reference is None or args.index is None:
        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')

    # Detect k from the index (one cheap call before the full dump).
    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        print(f'ERROR: no kmer row in dump of {args.index}', file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])

    # Load reference; the context manager releases the archive's file handle.
    print(f'Loading reference: {args.reference}', file=sys.stderr)
    with np.load(args.reference) as npz:
        ref_kmers = npz['kmers']  # already sorted uint64

    # Load index
    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
    idx_kmers = load_index_kmers(args.obikmer, args.index)

    print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)

    false_neg, false_pos = compare(ref_kmers, idx_kmers)

    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0

    print(f'false negatives: {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
    print(f'false positives: {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)

    if args.save_fn and len(false_neg):
        with open(args.save_fn, 'w') as fh:
            for v in false_neg:
                fh.write(decode_kmer(int(v), k) + '\n')
        print(f'False negatives saved → {args.save_fn}', file=sys.stderr)

    if args.save_fp and len(false_pos):
        with open(args.save_fp, 'w') as fh:
            for v in false_pos:
                fh.write(decode_kmer(int(v), k) + '\n')
        print(f'False positives saved → {args.save_fp}', file=sys.stderr)

    print(f'{args.species},{args.strain},'
          f'{len(ref_kmers)},{len(idx_kmers)},'
          f'{len(false_neg)},{len(false_pos)},'
          f'{fn_pct:.4f},{fp_pct:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
@@ -638,6 +638,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="/implementation/evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="/implementation/obilayeredmap/" class="md-nav__link">
|
<a href="/implementation/obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -716,6 +744,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="/implementation/merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="/implementation/rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -9,7 +9,7 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
<link rel="prev" href="../../../implementation/persistent_bit_vec/">
|
<link rel="prev" href="../../../implementation/rebuild_filter/">
|
||||||
|
|
||||||
|
|
||||||
<link rel="next" href="../../index_architecture/">
|
<link rel="next" href="../../index_architecture/">
|
||||||
@@ -647,6 +647,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../../implementation/evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../../../implementation/obilayeredmap/" class="md-nav__link">
|
<a href="../../../implementation/obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -725,6 +753,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../../implementation/merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../../implementation/rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -243,19 +243,28 @@
|
|||||||
</label>
|
</label>
|
||||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix="">
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix="">
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a class="md-nav__link" href="#output-type-rope">
|
<a class="md-nav__link" href="#two-reading-paths">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
Output type: rope
|
Two reading paths
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
</li>
|
</li>
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a class="md-nav__link" href="#allocation-policy">
|
<a class="md-nav__link" href="#record-path-chunk-reader">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
Allocation policy
|
Record path: chunk reader
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a class="md-nav__link" href="#output-type-rope">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Output type: Rope
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
@@ -347,6 +356,18 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a class="md-nav__link" href="../evidence_elimination/">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
</li>
|
</li>
|
||||||
@@ -383,6 +404,30 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a class="md-nav__link" href="../merge/">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a class="md-nav__link" href="../rebuild_filter/">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
</li>
|
</li>
|
||||||
@@ -454,19 +499,28 @@
|
|||||||
</label>
|
</label>
|
||||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix="">
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix="">
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a class="md-nav__link" href="#output-type-rope">
|
<a class="md-nav__link" href="#two-reading-paths">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
Output type: rope
|
Two reading paths
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
</li>
|
</li>
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a class="md-nav__link" href="#allocation-policy">
|
<a class="md-nav__link" href="#record-path-chunk-reader">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
Allocation policy
|
Record path: chunk reader
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a class="md-nav__link" href="#output-type-rope">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Output type: Rope
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
@@ -506,68 +560,77 @@
|
|||||||
<div class="md-content" data-md-component="content">
|
<div class="md-content" data-md-component="content">
|
||||||
<article class="md-content__inner md-typeset">
|
<article class="md-content__inner md-typeset">
|
||||||
<h1 id="chunk-reader-implementation">Chunk reader — implementation</h1>
|
<h1 id="chunk-reader-implementation">Chunk reader — implementation</h1>
|
||||||
<p>The <code>obiread</code> crate provides a streaming iterator that reads FASTA or FASTQ files in fixed-size blocks and yields self-contained chunks, each ending on a complete sequence record boundary. Chunks are consumed in parallel by downstream workers.</p>
|
<p><code>obiread</code> exposes two distinct sequence reading paths, each optimised for a different use case.</p>
|
||||||
<h2 id="output-type-rope">Output type: rope</h2>
|
<h2 id="two-reading-paths">Two reading paths</h2>
|
||||||
<p>Each chunk is a <code>Vec<Bytes></code> — a <strong>rope</strong>: a list of reference-counted byte slices that are not necessarily contiguous in memory. The consumer iterates over the slices in order.</p>
|
|
||||||
<p>Using <code>bytes::Bytes</code> means the split at the record boundary is O(1): <code>Bytes::split_to(n)</code> adjusts a reference counter, not memory. No <code>memcpy</code> in the common case.</p>
|
|
||||||
<h2 id="allocation-policy">Allocation policy</h2>
|
|
||||||
<table>
|
<table>
|
||||||
<thead>
|
<thead>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Case</th>
|
<th>Path</th>
|
||||||
<th>Cost</th>
|
<th>API</th>
|
||||||
|
<th>Output unit</th>
|
||||||
|
<th>Per-record identity</th>
|
||||||
|
<th>Use case</th>
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
<tr>
|
<tr>
|
||||||
<td>Boundary found in the current block (common)</td>
|
<td><strong>Record path</strong></td>
|
||||||
<td>zero extra allocation — <code>split_to</code> only</td>
|
<td><code>read_sequence_chunks</code> → <code>parse_chunk</code></td>
|
||||||
|
<td><code>SeqRecord</code> (id + raw sequence + normalised rope)</td>
|
||||||
|
<td>yes</td>
|
||||||
|
<td><code>query</code> — must read complete records</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>Boundary straddles multiple blocks (sequence > block size, rare)</td>
|
<td><strong>Stream path</strong></td>
|
||||||
<td>one allocation to pack the rope into a flat buffer</td>
|
<td><code>open_nuc_stream</code></td>
|
||||||
</tr>
|
<td><code>NucPage</code> (flat normalised byte buffer)</td>
|
||||||
<tr>
|
<td>no</td>
|
||||||
<td>EOF flush</td>
|
<td><code>index</code>, <code>superkmer</code> — bulk throughput</td>
|
||||||
<td>zero extra allocation</td>
|
|
||||||
</tr>
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
|
<p>The record path uses <code>Rope</code>-backed chunks and is described in detail below.
|
||||||
|
The stream path (<code>NucStream</code> / <code>NucPage</code>) is described in the scatter section of <a href="../pipeline/">pipeline</a>.</p>
|
||||||
|
<hr/>
|
||||||
|
<h2 id="record-path-chunk-reader">Record path: chunk reader</h2>
|
||||||
|
<p>The chunk reader reads FASTA or FASTQ files in fixed-size blocks and yields self-contained chunks, each ending on a complete sequence record boundary. <code>parse_chunk</code> then converts each chunk into a <code>Vec&lt;SeqRecord&gt;</code>, where each record carries its identifier, raw sequence bytes, and a normalised rope ready for superkmer building.</p>
|
||||||
|
<p>This path is mandatory for <code>query</code>, where superkmers must be tracked back to their originating sequence (id, kmer offset) for output annotation.</p>
|
||||||
|
<h2 id="output-type-rope">Output type: Rope</h2>
|
||||||
|
<p>Each chunk is a <code>Rope</code> — a segmented byte sequence: a <code>Vec</code> of blocks, where each block is a <code>Vec&lt;Cell&lt;u8&gt;&gt;</code>. The consumer iterates over the blocks via a forward or backward cursor.</p>
|
||||||
|
<p><code>Rope::split_off(pos)</code> splits at an absolute byte offset in O(log n) (binary search over block-start index). If <code>pos</code> falls inside a block, only that block is split in two via <code>Vec::split_off</code>, which moves the tail of that single block into a new <code>Vec</code>; the other blocks are handed over whole, so no bulk <code>memcpy</code> of the rope occurs.</p>
|
||||||
<h2 id="seqchunkiter">SeqChunkIter</h2>
|
<h2 id="seqchunkiter">SeqChunkIter</h2>
|
||||||
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">SeqChunkIter</span><span class="o"><</span><span class="n">R</span><span class="p">:</span><span class="w"> </span><span class="nc">Read</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="cm">/* private */</span><span class="w"> </span><span class="p">}</span>
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">SeqChunkIter</span><span class="o"><</span><span class="n">R</span><span class="p">:</span><span class="w"> </span><span class="nc">Read</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="cm">/* private */</span><span class="w"> </span><span class="p">}</span>
|
||||||
|
|
||||||
<span class="k">impl</span><span class="o"><</span><span class="n">R</span><span class="p">:</span><span class="w"> </span><span class="nc">Read</span><span class="o">></span><span class="w"> </span><span class="nb">Iterator</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">SeqChunkIter</span><span class="o"><</span><span class="n">R</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
<span class="k">impl</span><span class="o"><</span><span class="n">R</span><span class="p">:</span><span class="w"> </span><span class="nc">Read</span><span class="o">></span><span class="w"> </span><span class="nb">Iterator</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">SeqChunkIter</span><span class="o"><</span><span class="n">R</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
<span class="w"> </span><span class="k">type</span><span class="w"> </span><span class="nc">Item</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">io</span><span class="p">::</span><span class="nb">Result</span><span class="o"><</span><span class="nb">Vec</span><span class="o"><</span><span class="n">Bytes</span><span class="o">>></span><span class="p">;</span>
|
<span class="w"> </span><span class="k">type</span><span class="w"> </span><span class="nc">Item</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">io</span><span class="p">::</span><span class="nb">Result</span><span class="o"><</span><span class="n">Rope</span><span class="o">></span><span class="p">;</span>
|
||||||
<span class="p">}</span>
|
<span class="p">}</span>
|
||||||
|
|
||||||
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">fasta_chunks</span><span class="o"><</span><span class="n">R</span><span class="p">:</span><span class="w"> </span><span class="nc">Read</span><span class="o">></span><span class="p">(</span><span class="n">source</span><span class="p">:</span><span class="w"> </span><span class="nc">R</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">SeqChunkIter</span><span class="o"><</span><span class="n">R</span><span class="o">></span>
|
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">fasta_chunks</span><span class="o"><</span><span class="n">R</span><span class="p">:</span><span class="w"> </span><span class="nc">Read</span><span class="o">></span><span class="p">(</span><span class="n">source</span><span class="p">:</span><span class="w"> </span><span class="nc">R</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">SeqChunkIter</span><span class="o"><</span><span class="n">R</span><span class="o">></span>
|
||||||
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">fastq_chunks</span><span class="o"><</span><span class="n">R</span><span class="p">:</span><span class="w"> </span><span class="nc">Read</span><span class="o">></span><span class="p">(</span><span class="n">source</span><span class="p">:</span><span class="w"> </span><span class="nc">R</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">SeqChunkIter</span><span class="o"><</span><span class="n">R</span><span class="o">></span>
|
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">fastq_chunks</span><span class="o"><</span><span class="n">R</span><span class="p">:</span><span class="w"> </span><span class="nc">Read</span><span class="o">></span><span class="p">(</span><span class="n">source</span><span class="p">:</span><span class="w"> </span><span class="nc">R</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">SeqChunkIter</span><span class="o"><</span><span class="n">R</span><span class="o">></span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p><code>next()</code> loop:</p>
|
<p><code>next()</code> loop:</p>
|
||||||
<div class="highlight"><pre><span></span><code>1. read one block of block_size bytes → push onto rope
|
<div class="highlight"><pre><span></span><code>1. read one block of block_size bytes → push onto Rope
|
||||||
2. probe check: if the boundary marker ("\n>" or "\n@") is absent from the
|
2. call splitter(rope) → Option<abs_offset>
|
||||||
last block, skip the splitter (avoids a full backward scan for nothing)
|
if Some(pos):
|
||||||
3. call splitter on last block
|
tail = rope.split_off(pos) ← O(log n), may split one block
|
||||||
if found at offset n:
|
chunk = mem::replace(&mut rope, tail)
|
||||||
remainder = last_block.split_to(n) ← O(1), zero copy
|
return Some(Ok(chunk))
|
||||||
return std::mem::take(&mut self.rope) ← the chunk
|
3. if EOF and rope non-empty: return Some(Ok(rope)) as final chunk
|
||||||
4. if rope.len() > 1 (multi-block accumulation):
|
4. if EOF and rope empty: return None
|
||||||
pack rope into one flat buffer ← one alloc
|
|
||||||
retry splitter on flat buffer
|
|
||||||
5. if EOF: flush remaining rope as final chunk
|
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
|
<p>The <code>Splitter</code> function signature is <code>fn(&amp;Rope) -&gt; Option&lt;usize&gt;</code>. It returns the absolute byte offset of the start of the last complete record, or <code>None</code> if no boundary was found in the accumulated rope (more data is needed).</p>
|
||||||
<h2 id="boundary-detection-fasta">Boundary detection — FASTA</h2>
|
<h2 id="boundary-detection-fasta">Boundary detection — FASTA</h2>
|
||||||
<p>Backward scan with a 2-state machine. Searches for <code>></code> immediately preceded by <code>\n</code> or <code>\r</code>:</p>
|
<p>Backward scan with a 2-state machine. Searches (right to left) for <code>></code> followed by <code>\n</code> or <code>\r</code> (i.e., a <code>></code> that is preceded by a newline in forward order):</p>
|
||||||
<pre class="mermaid"><code>stateDiagram-v2
|
<pre class="mermaid"><code>stateDiagram-v2
|
||||||
direction LR
|
direction LR
|
||||||
[*] --> Scanning
|
[*] --> Scanning
|
||||||
Scanning --> FoundGt : '>'
|
Scanning --> FoundGt : '>'
|
||||||
FoundGt --> Scanning : other
|
FoundGt --> Scanning : other
|
||||||
FoundGt --> [*] : '\\n' / '\\r' ✓</code></pre>
|
FoundGt --> [*] : '\\n' / '\\r' ✓</code></pre>
|
||||||
<p>Returns the byte offset of the <code>></code> that starts the last complete record.</p>
|
<p>Returns the byte offset of the <code>></code> that starts the last complete record. Returns <code>None</code> if only one <code>></code> is found (cannot confirm there is a prior complete record).</p>
|
||||||
<h2 id="boundary-detection-fastq">Boundary detection — FASTQ</h2>
|
<h2 id="boundary-detection-fastq">Boundary detection — FASTQ</h2>
|
||||||
<p>FASTQ records have a rigid 4-line structure (<code>@header</code>, sequence, <code>+</code>, quality). The <code>@</code> character (ASCII 64, Phred score 31) can appear legitimately in quality lines, making any forward heuristic unreliable. The backward scanner verifies the full structural context before accepting a candidate <code>@</code>.</p>
|
<p>FASTQ records have a rigid 4-line structure (<code>@header</code>, sequence, <code>+</code>, quality). The <code>@</code> character (ASCII 64, Phred score 31) can appear legitimately in quality lines, making any forward heuristic unreliable. The backward scanner verifies the full structural context before accepting a candidate <code>@</code>.</p>
|
||||||
<p>7-state machine (port of Go's <code>EndOfLastFastqEntry</code>), scanning from <strong>right to left</strong>. Each time a <code>+</code> is found, its position is saved as <code>restart</code>; any state mismatch resets the scan to that position.</p>
|
<p>7-state machine (states 0–6), scanning from <strong>right to left</strong>. Each time a <code>+</code> is found, its position is saved as <code>restart</code>; any state mismatch resets the scan to that position.</p>
|
||||||
<pre class="mermaid"><code>stateDiagram-v2
|
<pre class="mermaid"><code>stateDiagram-v2
|
||||||
direction LR
|
direction LR
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -514,10 +514,21 @@
|
|||||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="#memory-layout" class="md-nav__link">
|
<a href="#types-and-layout" class="md-nav__link">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
Memory layout
|
Types and layout
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#global-parameters" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Global parameters
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
@@ -558,10 +569,32 @@
|
|||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="#canonical-form" class="md-nav__link">
|
<a href="#canonical-form-and-canonicalkmerof" class="md-nav__link">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
Canonical form
|
Canonical form and CanonicalKmerOf
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#sliding-window-helpers" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Sliding window helpers
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#hashing" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Hashing
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
@@ -751,6 +784,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../obilayeredmap/" class="md-nav__link">
|
<a href="../obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -829,6 +890,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
@@ -973,10 +1090,21 @@
|
|||||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="#memory-layout" class="md-nav__link">
|
<a href="#types-and-layout" class="md-nav__link">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
Memory layout
|
Types and layout
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#global-parameters" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Global parameters
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
@@ -1017,10 +1145,32 @@
|
|||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="#canonical-form" class="md-nav__link">
|
<a href="#canonical-form-and-canonicalkmerof" class="md-nav__link">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
Canonical form
|
Canonical form and CanonicalKmerOf
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#sliding-window-helpers" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Sliding window helpers
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#hashing" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Hashing
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
@@ -1045,12 +1195,43 @@
|
|||||||
|
|
||||||
|
|
||||||
<h1 id="kmer-implementation">Kmer — implementation</h1>
|
<h1 id="kmer-implementation">Kmer — implementation</h1>
|
||||||
<h2 id="memory-layout">Memory layout</h2>
|
<h2 id="types-and-layout">Types and layout</h2>
|
||||||
<p><code>Kmer</code> is a <code>#[repr(transparent)]</code> newtype over <code>u64</code>:</p>
|
<p><code>KmerOf&lt;L&gt;</code> is a <code>#[repr(transparent)]</code> newtype over <code>u64</code>, parameterized by a <code>KmerLength</code> marker:</p>
|
||||||
<div class="highlight"><pre><span></span><code><span class="cp">#[repr(transparent)]</span>
|
<div class="highlight"><pre><span></span><code><span class="cp">#[repr(transparent)]</span>
|
||||||
<span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">Kmer</span><span class="p">(</span><span class="kt">u64</span><span class="p">);</span>
|
<span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">KmerOf</span><span class="o"><</span><span class="n">L</span><span class="p">:</span><span class="w"> </span><span class="nc">KmerLength</span><span class="o">></span><span class="p">(</span><span class="kt">u64</span><span class="p">,</span><span class="w"> </span><span class="n">PhantomData</span><span class="o"><</span><span class="n">L</span><span class="o">></span><span class="p">);</span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>Nucleotides are packed 2 bits each, <strong>left-aligned</strong>, MSB-first. Nucleotide 0 occupies bits 63–62; nucleotide i occupies bits 63−2i and 62−2i. The low 64−2k bits are always zero. k is <strong>not stored</strong> — it is a parameter of every operation that needs it, and will be owned by the future collection-level indexer.</p>
|
<p>Three marker types implement <code>KmerLength</code>:</p>
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Marker</th>
|
||||||
|
<th><code>len()</code> source</th>
|
||||||
|
<th>Used for</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td><code>KLen</code></td>
|
||||||
|
<td><code>params::k()</code></td>
|
||||||
|
<td>k-mers</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>MLen</code></td>
|
||||||
|
<td><code>params::m()</code></td>
|
||||||
|
<td>minimizers</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>ConstLen<N></code></td>
|
||||||
|
<td>const generic <code>N</code></td>
|
||||||
|
<td>tests</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
<p>Public aliases:</p>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">type</span><span class="w"> </span><span class="nc">Kmer</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">KmerOf</span><span class="o"><</span><span class="n">KLen</span><span class="o">></span><span class="p">;</span><span class="w"> </span><span class="c1">// k-mer, global k</span>
|
||||||
|
<span class="k">pub</span><span class="w"> </span><span class="k">type</span><span class="w"> </span><span class="nc">Minimizer</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">CanonicalKmerOf</span><span class="o"><</span><span class="n">MLen</span><span class="o">></span><span class="p">;</span><span class="w"> </span><span class="c1">// canonical m-mer, global m</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p>Nucleotides are packed 2 bits each, <strong>left-aligned</strong>, MSB-first. Nucleotide 0 occupies bits 63–62; nucleotide i occupies bits 63−2i and 62−2i. The low 64−2·len bits are always zero. The length is <strong>not stored</strong> — every operation reads it from <code>L::len()</code>.</p>
|
||||||
<table>
|
<table>
|
||||||
<thead>
|
<thead>
|
||||||
<tr>
|
<tr>
|
||||||
@@ -1071,33 +1252,41 @@
|
|||||||
</tr>
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
|
<h2 id="global-parameters">Global parameters</h2>
|
||||||
|
<p><code>params::set_k(k)</code> / <code>params::k()</code> and <code>params::set_m(m)</code> / <code>params::m()</code> are backed by <code>OnceLock&lt;usize&gt;</code> in production (write-once, panic on conflict) and by <code>thread_local! { Cell&lt;usize&gt; }</code> in test builds (per-thread, freely writable). <code>params::init(k, m)</code> sets both in one call.</p>
|
||||||
<h2 id="encoding">Encoding</h2>
|
<h2 id="encoding">Encoding</h2>
|
||||||
<p><code>Kmer::from_ascii(ascii, k)</code> encodes the first k bytes of an ASCII slice using the shared <code>ENC</code> table (see <a href="../superkmer/#ascii-encoding-and-decoding">SuperKmer — ASCII encoding</a>):</p>
|
<p><code>KmerOf::&lt;L&gt;::from_ascii(ascii)</code> encodes the first <code>L::len()</code> bytes (written <code>k</code> in the snippet below) using the shared <code>ENC</code> table (see <a href="../superkmer/#ascii-encoding-and-decoding">SuperKmer — ASCII encoding</a>):</p>
|
||||||
<div class="highlight"><pre><span></span><code><span class="k">for</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="mi">0</span><span class="o">..</span><span class="n">k</span><span class="w"> </span><span class="p">{</span>
|
<div class="highlight"><pre><span></span><code><span class="k">for</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="mi">0</span><span class="o">..</span><span class="n">k</span><span class="w"> </span><span class="p">{</span>
|
||||||
<span class="w"> </span><span class="n">val</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="n">val</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">encode_base</span><span class="p">(</span><span class="n">ascii</span><span class="p">[</span><span class="n">i</span><span class="p">])</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="kt">u64</span><span class="p">;</span>
|
<span class="w"> </span><span class="n">val</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="n">val</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">encode_base</span><span class="p">(</span><span class="n">ascii</span><span class="p">[</span><span class="n">i</span><span class="p">])</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="kt">u64</span><span class="p">;</span>
|
||||||
<span class="p">}</span>
|
<span class="p">}</span>
|
||||||
<span class="n">Kmer</span><span class="p">(</span><span class="n">val</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="p">(</span><span class="mi">64</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="n">k</span><span class="p">))</span>
|
<span class="n">KmerOf</span><span class="p">(</span><span class="n">val</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="p">(</span><span class="mi">64</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="n">k</span><span class="p">),</span><span class="w"> </span><span class="n">PhantomData</span><span class="p">)</span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>Zero allocation — result lives on the stack.</p>
|
<p>Zero allocation — result lives on the stack.</p>
|
||||||
<h2 id="decoding">Decoding</h2>
|
<h2 id="decoding">Decoding</h2>
|
||||||
<p><code>write_ascii(k, buf)</code> appends k ASCII characters to a caller-supplied <code>Vec<u8></code> using the shared <code>DEC4</code> table: one lookup per 4 nucleotides, two partial-byte lookups for the remainder. No allocation in the hot path.</p>
|
<p><code>write_ascii(writer)</code> writes <code>L::len()</code> ASCII characters to any <code>W: Write</code> using the shared <code>DEC4</code> table: one lookup per 4 nucleotides, one partial lookup for the remainder. No allocation in the hot path.</p>
|
||||||
<p><code>to_ascii(k)</code> is a convenience wrapper that allocates and returns a <code>Vec<u8></code>; intended for tests and display only.</p>
|
<p><code>to_ascii()</code> is a convenience wrapper that allocates and returns a <code>Vec<u8></code>; intended for tests and display only.</p>
|
||||||
<h2 id="reverse-complement">Reverse complement</h2>
|
<h2 id="reverse-complement">Reverse complement</h2>
|
||||||
<p>Computed as pure arithmetic — no lookup table, no memory access:</p>
|
<p>Computed as pure arithmetic — no lookup table, no memory access:</p>
|
||||||
<div class="highlight"><pre><span></span><code><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="o">!</span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="c1">// complement</span>
|
<div class="highlight"><pre><span></span><code><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="o">!</span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="c1">// complement</span>
|
||||||
<span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="p">.</span><span class="n">swap_bytes</span><span class="p">();</span><span class="w"> </span><span class="c1">// reverse bytes</span>
|
<span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="p">.</span><span class="n">swap_bytes</span><span class="p">();</span><span class="w"> </span><span class="c1">// reverse bytes</span>
|
||||||
<span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">4</span><span class="p">)</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x0F0F0F0F0F0F0F0F</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x0F0F0F0F0F0F0F0F</span><span class="p">)</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">4</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap nibbles</span>
|
<span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">4</span><span class="p">)</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x0F0F0F0F0F0F0F0F</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x0F0F0F0F0F0F0F0F</span><span class="p">)</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">4</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap nibbles</span>
|
||||||
<span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x3333333333333333</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x3333333333333333</span><span class="p">)</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">2</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap 2-bit groups</span>
|
<span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x3333333333333333</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x3333333333333333</span><span class="p">)</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">2</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap 2-bit groups</span>
|
||||||
<span class="n">Kmer</span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="p">(</span><span class="mi">64</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="n">k</span><span class="p">))</span>
|
<span class="n">KmerOf</span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="p">(</span><span class="mi">64</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="n">k</span><span class="p">),</span><span class="w"> </span><span class="n">PhantomData</span><span class="p">)</span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>After complementing, bytes are reversed (<code>swap_bytes</code>), then nibbles, then 2-bit groups — restoring 2-bit nucleotides to their correct positions in reverse order. A final left-shift realigns to MSB. Zero allocation — result lives on the stack.</p>
|
<p>After complementing, bytes are reversed (<code>swap_bytes</code>), then nibbles, then 2-bit groups — restoring 2-bit nucleotides to their correct positions in reverse order. A final left-shift realigns to MSB. Zero allocation — result lives on the stack.</p>
|
||||||
<h2 id="canonical-form">Canonical form</h2>
|
<h2 id="canonical-form-and-canonicalkmerof">Canonical form and <code>CanonicalKmerOf</code></h2>
|
||||||
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">canonical</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">k</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Self</span><span class="w"> </span><span class="p">{</span>
|
<p><code>canonical()</code> returns a <code>CanonicalKmerOf<L></code> — a distinct newtype that carries the same <code>u64</code> layout but enforces the invariant that the stored value equals <code>min(kmer, revcomp)</code>:</p>
|
||||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">rc</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">revcomp</span><span class="p">(</span><span class="n">k</span><span class="p">);</span>
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">canonical</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">CanonicalKmerOf</span><span class="o"><</span><span class="n">L</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
<span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o"><=</span><span class="w"> </span><span class="n">rc</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="o">*</span><span class="bp">self</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">rc</span><span class="w"> </span><span class="p">}</span>
|
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">rc</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">revcomp</span><span class="p">();</span>
|
||||||
|
<span class="w"> </span><span class="n">CanonicalKmerOf</span><span class="p">(</span><span class="k">if</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o"><=</span><span class="w"> </span><span class="n">rc</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">rc</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="p">},</span><span class="w"> </span><span class="n">PhantomData</span><span class="p">)</span>
|
||||||
<span class="p">}</span>
|
<span class="p">}</span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>Lexicographic minimum of forward and reverse-complement, comparing the raw <code>u64</code> values directly (left-aligned encoding makes this equivalent to nucleotide-wise comparison). Zero allocation — result lives on the stack.</p>
|
<p>Lexicographic minimum of forward and reverse-complement, comparing the raw <code>u64</code> values directly (left-aligned encoding makes this equivalent to nucleotide-wise comparison). Zero allocation — result lives on the stack.</p>
|
||||||
|
<p><code>CanonicalKmerOf::from_raw_unchecked(raw)</code> is the only other public constructor, for trusted paths such as deserialisation.</p>
|
||||||
|
<h2 id="sliding-window-helpers">Sliding window helpers</h2>
|
||||||
|
<p><code>push_right(nuc)</code> / <code>push_left(nuc)</code> shift the window by one base in O(1). <code>is_overlapping(other)</code> checks whether the last k−1 nucleotides of <code>self</code> equal the first k−1 of <code>other</code>.</p>
|
||||||
|
<h2 id="hashing">Hashing</h2>
|
||||||
|
<p><code>hash_kmer(raw: u64) -> u64</code> computes <code>mix64(raw ^ 0x9e3779b97f4a7c15)</code>, the seeded splitmix64 finalizer. <code>CanonicalKmerOf::seq_hash()</code> delegates to <code>hash_kmer</code>.</p>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -757,6 +757,28 @@
|
|||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#evidence-modes" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Evidence modes
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#build-functions" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Build functions
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
@@ -840,6 +862,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../obilayeredmap/" class="md-nav__link">
|
<a href="../obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -918,6 +968,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
@@ -1165,6 +1271,28 @@
|
|||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#evidence-modes" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Evidence modes
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#build-functions" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Build functions
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
@@ -1226,26 +1354,26 @@
|
|||||||
<h2 id="why-two-phases-are-needed">Why two phases are needed</h2>
|
<h2 id="why-two-phases-are-needed">Why two phases are needed</h2>
|
||||||
<p>Kmer indexing per partition proceeds in two phases. The separation is necessary because the exact number of surviving unique kmers is not known until after counting and filtering low-abundance kmers.</p>
|
<p>Kmer indexing per partition proceeds in two phases. The separation is necessary because the exact number of surviving unique kmers is not known until after counting and filtering low-abundance kmers.</p>
|
||||||
<h3 id="phase-1-provisional-mphf-kmer-spectrum">Phase 1 — provisional MPHF + kmer spectrum</h3>
|
<h3 id="phase-1-provisional-mphf-kmer-spectrum">Phase 1 — provisional MPHF + kmer spectrum</h3>
|
||||||
<p>Implemented in <code>obikpartitionner::KmerPartition::count_kmer()</code>.</p>
|
<p>Implemented in <code>obikpartitionner::KmerPartition::count_kmer()</code> → <code>count_partition()</code>.</p>
|
||||||
<ol>
|
<ol>
|
||||||
<li><strong>Pass 1</strong>: read the dereplicated superkmer file; enumerate all unique canonical kmers into a <code>HashSet</code>. Exact count known after this pass.</li>
|
<li><strong>External sort</strong>: read the dereplicated superkmer file; extract the raw <code>u64</code> canonical kmer value for every kmer of every superkmer. Sort in RAM-bounded chunks (adaptive budget: 40% of available RAM ÷ n_threads, minimum 1 M kmers per chunk), then k-way merge with inline dedup. Result: <code>sorted_unique.bin</code> — a flat array of f0 distinct sorted <code>u64</code> values. Exact kmer count f0 is known at this point.</li>
|
||||||
<li><strong>Build a provisional MPHF</strong> (<code>GOFunction</code> from the <code>ph</code> crate) over the exact kmer set. Produces <code>mphf1.bin</code>.</li>
|
<li><strong>Build provisional MPHF</strong> (ptr_hash, same configuration as phase 2) over <code>sorted_unique.bin</code> using <code>new_from_par_iter</code>. Delete <code>sorted_unique.bin</code> immediately after. Persist to <code>mphf1.bin</code>.</li>
|
||||||
<li><strong>Create <code>counts1.bin</code></strong>: one zero-initialised <code>u32</code> per MPHF slot (mmap'd).</li>
|
<li><strong>Create <code>counts1.bin</code></strong>: <code>PersistentCompactIntVec</code> with f0 slots, zero-initialised.</li>
|
||||||
<li><strong>Pass 2</strong>: re-read the dereplicated file; for each kmer, query <code>mphf1.get(kmer)</code> and atomically accumulate the superkmer count into <code>counts1[slot]</code>.</li>
|
<li><strong>Accumulation pass</strong>: re-read the dereplicated superkmer file; for each kmer in each superkmer, compute <code>slot = mphf.index(kmer.raw())</code> and increment <code>counts1[slot]</code> by the superkmer's COUNT.</li>
|
||||||
<li><strong>Build kmer frequency spectrum</strong> from <code>counts1</code>: histogram <code>{count → n_kmers}</code>, totals f0 (distinct kmers) and f1 (total abundance). Written to <code>kmer_spectrum_raw.json</code> per partition, then merged globally.</li>
|
<li><strong>Build kmer frequency spectrum</strong> from <code>counts1</code>: histogram <code>{count → n_kmers}</code>, totals f0 (distinct kmers) and f1 (total abundance). Written to <code>kmer_spectrum_raw.json</code> per partition, then merged globally.</li>
|
||||||
</ol>
|
</ol>
|
||||||
<p>Files produced per partition:</p>
|
<p>Files produced per partition:</p>
|
||||||
<div class="highlight"><pre><span></span><code>part_XXXXX/
|
<div class="highlight"><pre><span></span><code>part_XXXXX/
|
||||||
mphf1.bin — GOFunction (provisional MPHF, discarded after phase 2)
|
mphf1.bin — ptr_hash provisional MPHF (discarded after phase 2)
|
||||||
counts1.bin — [u32; n_kmers] kmer counts, mmap'd
|
counts1.bin — PersistentCompactIntVec, f0 × u32 kmer counts
|
||||||
kmer_spectrum_raw.json — local frequency spectrum
|
kmer_spectrum_raw.json — local frequency spectrum
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<h3 id="phase-2-definitive-mphf">Phase 2 — definitive MPHF</h3>
|
<h3 id="phase-2-definitive-mphf">Phase 2 — definitive MPHF</h3>
|
||||||
<p>After filtering (applying a min-count threshold derived from the spectrum) and building the local De Bruijn graph + unitigs (see <a href="../pipeline/">Construction pipeline</a>), the exact filtered kmer set is available via <code>unitigs.bin</code>.</p>
|
<p>After filtering (applying a min-count threshold derived from the spectrum) and building the local De Bruijn graph + unitigs (see <a href="../pipeline/">Construction pipeline</a>), the exact filtered kmer set is available via <code>unitigs.bin</code>.</p>
|
||||||
<p><code>MphfLayer::build</code> is called on the unitig file:</p>
|
<p><code>MphfLayer::build(dir, block_bits, mode: &IndexMode, fill_slot)</code> is called on the unitig directory:</p>
|
||||||
<ol>
|
<ol>
|
||||||
<li><strong>Pass 1</strong>: iterate all canonical kmers from <code>unitigs.bin</code> in parallel, build and store <code>mphf.bin</code> (ptr_hash).</li>
|
<li><strong>Pass 1</strong> (parallel): a <code>CanonicalKmerIter</code> — clonable via <code>Arc<Mmap></code>, no file reopening — is passed directly to <code>new_from_par_iter</code> via <code>par_bridge()</code>. No <code>.idx</code> is read or created at this stage; parallelism is at partition/layer level, not within a single MPHF. Produces <code>mphf.bin</code>.</li>
|
||||||
<li><strong>Pass 2</strong>: iterate sequentially, fill <code>evidence.bin</code>, call the mode-specific <code>fill_slot</code> callback.</li>
|
<li><strong>Pass 2</strong> (sequential): iterate with <code>iter_indexed_canonical_kmers</code>; fill evidence files; call <code>fill_slot(slot, kmer)</code> callback per kmer. For Exact/Hybrid, <code>.idx</code> is written at the end of this pass — never earlier.</li>
|
||||||
</ol>
|
</ol>
|
||||||
<p><code>mphf1.bin</code> and <code>counts1.bin</code> are no longer needed after phase 2 and can be deleted.</p>
|
<p><code>mphf1.bin</code> and <code>counts1.bin</code> are no longer needed after phase 2 and can be deleted.</p>
|
||||||
<hr />
|
<hr />
|
||||||
@@ -1265,13 +1393,11 @@
|
|||||||
<p><strong>FMPH/FMPHGO</strong> (<code>ph</code> crate, Beling, ACM JEA 2023):</p>
|
<p><strong>FMPH/FMPHGO</strong> (<code>ph</code> crate, Beling, ACM JEA 2023):</p>
|
||||||
<ul>
|
<ul>
|
||||||
<li>~2.1 bits/key — most compact; good query speed; deterministic construction</li>
|
<li>~2.1 bits/key — most compact; good query speed; deterministic construction</li>
|
||||||
<li>Works well from an exact or slightly overestimated count</li>
|
<li><code>GOFunction</code> (group-oriented variant) was the original phase-1 choice; eliminated when the external sort made the exact count available at phase 1 as well</li>
|
||||||
<li><code>GOFunction</code> (group-oriented variant) is the specific type used</li>
|
|
||||||
</ul>
|
</ul>
|
||||||
<h2 id="mphf-choice-per-phase">MPHF choice per phase</h2>
|
<h2 id="mphf-choice-per-phase">MPHF choice per phase</h2>
|
||||||
<p><strong>Phase 1</strong> (provisional, discarded after spectrum computation): <code>ph::fmph::GOFunction</code>. Compact, fast to build from the exact post-dedup kmer set. Query speed is secondary — the structure is only used during pass 2 of <code>count_kmer</code>.</p>
|
<p><strong>Both phases</strong>: <strong>ptr_hash</strong>, same type alias and construction parameters. The external sort (phase 1) and the unitig index (phase 2) both provide the exact key count before MPHF construction, so ptr_hash's requirement is satisfied in both cases. Using a single MPHF implementation removes the <code>ph</code> crate dependency.</p>
|
||||||
<p><strong>Phase 2</strong> (persistent, queried repeatedly): <strong>ptr_hash</strong>. Exact key count is available from the unitig index; ptr_hash query speed (≥2.1×) and construction speed (≥3.1× over FMPH) are the decisive factors. The 2.4 bits/key overhead is acceptable.</p>
|
<p>Eliminated alternatives: boomphf has the largest space overhead and its streaming advantage is no longer needed; FMPH/GOFunction is no longer needed because the exact count is available in both phases and ptr_hash is faster at comparable compactness.</p>
|
||||||
<p>boomphf is eliminated: largest space overhead, streaming advantage does not apply.</p>
|
|
||||||
<hr />
|
<hr />
|
||||||
<h2 id="space-at-scale">Space at scale</h2>
|
<h2 id="space-at-scale">Space at scale</h2>
|
||||||
<p>For 1 024 partitions × 100 M kmers/partition (phase 2 index, after filtering):</p>
|
<p>For 1 024 partitions × 100 M kmers/partition (phase 2 index, after filtering):</p>
|
||||||
@@ -1320,9 +1446,12 @@
|
|||||||
<h3 id="layer-structure">Layer structure</h3>
|
<h3 id="layer-structure">Layer structure</h3>
|
||||||
<p>Each layer is a self-contained unit. See <a href="../obilayeredmap/">obilayeredmap</a> for the full on-disk layout. The MPHF-relevant files are:</p>
|
<p>Each layer is a self-contained unit. See <a href="../obilayeredmap/">obilayeredmap</a> for the full on-disk layout. The MPHF-relevant files are:</p>
|
||||||
<div class="highlight"><pre><span></span><code>layer_i/
|
<div class="highlight"><pre><span></span><code>layer_i/
|
||||||
unitigs.bin — packed 2-bit nucleotide sequences (kmer evidence)
|
unitigs.bin — packed 2-bit nucleotide sequences (kmer evidence source)
|
||||||
|
unitigs.bin.idx — random-access block index (block_bits controls granularity)
|
||||||
mphf.bin — ptr_hash phase-2 MPHF
|
mphf.bin — ptr_hash phase-2 MPHF
|
||||||
evidence.bin — n × u32: (chunk_id: 25 bits | rank: 7 bits) per slot
|
evidence.bin — n × (chunk_id: 25 bits | rank: 7 bits) per slot [exact mode]
|
||||||
|
fingerprint.bin — n × b-bit fingerprints per slot [approx mode]
|
||||||
|
[no layer_meta.json — mode stored once in partition-level meta.json]
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>Layers are <strong>disjoint</strong>: a canonical kmer belongs to exactly one layer. Layer 0 is built from dataset A. Adding dataset B:</p>
|
<p>Layers are <strong>disjoint</strong>: a canonical kmer belongs to exactly one layer. Layer 0 is built from dataset A. Adding dataset B:</p>
|
||||||
<ol>
|
<ol>
|
||||||
@@ -1330,17 +1459,43 @@
|
|||||||
<li>Collect kmers of B not present in any layer → set <code>B \ A</code>.</li>
|
<li>Collect kmers of B not present in any layer → set <code>B \ A</code>.</li>
|
||||||
<li>Build layer 1 from <code>B \ A</code> (dereplicate → count → De Bruijn → unitigs → <code>MphfLayer::build</code>).</li>
|
<li>Build layer 1 from <code>B \ A</code> (dereplicate → count → De Bruijn → unitigs → <code>MphfLayer::build</code>).</li>
|
||||||
</ol>
|
</ol>
|
||||||
|
<h3 id="evidence-modes">Evidence modes</h3>
|
||||||
|
<p>Three evidence modes are supported via <code>IndexMode</code>, stored once in <code>PartitionMeta</code> at partition root. There is no <code>layer_meta.json</code>.</p>
|
||||||
|
<p><strong>Exact</strong> (<code>IndexMode::Exact</code>): <code>evidence.bin</code> stores one <code>(chunk_id, rank)</code> pair per MPHF slot. Verification reconstructs the kmer and compares to the query. Zero false positives. <code>.idx</code> required at query time.</p>
|
||||||
|
<p><strong>Approx</strong> (<code>IndexMode::Approx { b, z }</code>): <code>fingerprint.bin</code> stores a b-bit hash per slot. False-positive rate 1/2^b per query; Findere z-parameter reduces window FP to ≈ 1/2^(b·z). No <code>.idx</code> written or needed.</p>
|
||||||
|
<p><strong>Hybrid</strong> (<code>IndexMode::Hybrid { b, z }</code>): both <code>fingerprint.bin</code> and <code>evidence.bin</code> + <code>.idx</code> are written. <code>find()</code> uses the fingerprint (O(1)); <code>find_strict()</code> uses exact evidence (O(1)).</p>
|
||||||
|
<h3 id="build-functions">Build functions</h3>
|
||||||
|
<div class="highlight"><pre><span></span><code>MphfLayer::build(dir, block_bits, mode: &IndexMode, fill_slot)
|
||||||
|
Pass 1: CanonicalKmerIter + par_bridge() → build mphf.bin (no .idx used)
|
||||||
|
Pass 2: sequential iter → fill evidence files + call fill_slot
|
||||||
|
.idx written last for Exact/Hybrid (query-time only)
|
||||||
|
|
||||||
|
MphfLayer::build_exact_evidence(dir, block_bits)
|
||||||
|
Post-hoc: builds evidence.bin + .idx from existing mphf.bin + unitigs.bin
|
||||||
|
Uses open_sequential(); no .idx required on entry
|
||||||
|
|
||||||
|
MphfLayer::build_approx_evidence(dir, b, z)
|
||||||
|
Post-hoc: builds fingerprint.bin from existing mphf.bin + unitigs.bin
|
||||||
|
Uses open_sequential(); never writes .idx
|
||||||
|
</code></pre></div>
|
||||||
|
<p>There is no <code>build_evidence</code> dispatch wrapper. Callers choose the appropriate post-hoc build directly.</p>
|
||||||
|
<p>In <code>obikpartitionner</code>, <code>build_index_layer</code> receives <code>block_bits: u8</code> from <code>IndexConfig::block_bits</code> and forwards it directly to <code>Layer::build</code> and <code>Layer::build_approx_evidence</code>.</p>
|
||||||
<h3 id="membership-verification">Membership verification</h3>
|
<h3 id="membership-verification">Membership verification</h3>
|
||||||
<p>ptr_hash maps any input to a valid slot — it does not natively detect absent keys. Membership is verified using the evidence entry: decode the kmer from <code>(chunk_id, rank)</code> and compare to the query. A mismatch means the kmer is absent from this layer; probe the next layer.</p>
|
<p>ptr_hash maps any input to a valid slot — it does not natively detect absent keys. Membership is verified using the evidence entry:</p>
|
||||||
|
<ul>
|
||||||
|
<li><strong>Exact</strong>: decode <code>(chunk_id, rank)</code> from <code>evidence.bin</code>; reconstruct the kmer via <code>unitigs.verify_canonical_kmer</code>; compare to query.</li>
|
||||||
|
<li><strong>Approx</strong>: compare <code>kmer.seq_hash()</code> to the b-bit fingerprint stored at the slot.</li>
|
||||||
|
</ul>
|
||||||
|
<p>A mismatch in either mode means the kmer is absent from this layer; probe the next layer.</p>
|
||||||
<h3 id="query-algorithm">Query algorithm</h3>
|
<h3 id="query-algorithm">Query algorithm</h3>
|
||||||
<div class="highlight"><pre><span></span><code>fn query(kmer) → Option<(layer_index, slot)>:
|
<div class="highlight"><pre><span></span><code>fn query(kmer) → Option<(layer_index, slot)>:
|
||||||
for (i, layer) in layers.iter().enumerate():
|
for (i, layer) in layers.iter().enumerate():
|
||||||
slot = layer.mphf.index(kmer)
|
slot = layer.mphf.index(kmer)
|
||||||
if layer.evidence.decode(slot) matches kmer:
|
if layer.evidence.matches(slot, kmer): // exact or approx dispatch
|
||||||
return Some((i, slot))
|
return Some((i, slot))
|
||||||
return None
|
return None
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>Expected probe depth: 1 for kmers in layer 0. Each probe is a ptr_hash lookup (~10 ns) plus one evidence decode.</p>
|
<p><code>MphfLayer::find</code> dispatches on <code>LayerEvidence</code> in O(1); there are no separate panicking <code>find_exact</code>/<code>find_approx</code> methods. <code>find_strict</code> always performs an exact check: O(1) for Exact/Hybrid, an O(n) sequential scan for Approx. Expected probe depth: 1 for kmers in layer 0. Each probe is a ptr_hash lookup (~10 ns) plus one evidence check.</p>
|
||||||
<h3 id="merging-layers">Merging layers</h3>
|
<h3 id="merging-layers">Merging layers</h3>
|
||||||
<p>Two layer chains can be merged by re-indexing their union through the full pipeline. This is expensive (full rebuild) but produces an optimal single-layer index. Merge is a maintenance operation, not a query-path requirement.</p>
|
<p>Two layer chains can be merged by re-indexing their union through the full pipeline. This is expensive (full rebuild) but produces an optimal single-layer index. Merge is a maintenance operation, not a query-path requirement.</p>
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -9,7 +9,7 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
<link rel="prev" href="../unitig_evidence/">
|
<link rel="prev" href="../evidence_elimination/">
|
||||||
|
|
||||||
|
|
||||||
<link rel="next" href="../persistent_compact_int_vec/">
|
<link rel="next" href="../persistent_compact_int_vec/">
|
||||||
@@ -649,6 +649,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item md-nav__item--active">
|
<li class="md-nav__item md-nav__item--active">
|
||||||
@@ -729,6 +757,17 @@
|
|||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#index-mode-homogeneity-invariant" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Index mode (homogeneity invariant)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
@@ -740,6 +779,34 @@
|
|||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="MphfLayer — autonomous kmer → slot mapping">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#query-api" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Query API
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#build-surface" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Build surface
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
@@ -751,6 +818,73 @@
|
|||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Layer\<D: LayerData> — MPHF + payload">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#build-signatures" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Build signatures
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#fingerprintvec-and-fingerprintvecwriter" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
FingerprintVec and FingerprintVecWriter
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#layeredmapd-collection-of-layers" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
LayeredMap\<D> — collection of layers
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="LayeredMap\<D> — collection of layers">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#common-methods" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Common methods
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#push_layer" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
push_layer
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
@@ -776,10 +910,10 @@
|
|||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="#evidence-encoding" class="md-nav__link">
|
<a href="#evidence-encoding-exact" class="md-nav__link">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
Evidence encoding
|
Evidence encoding (exact)
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
@@ -798,14 +932,53 @@
|
|||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="#query-path" class="md-nav__link">
|
<a href="#column-append-and-merge-support" class="md-nav__link">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
Query path
|
Column append and merge support
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Column append and merge support">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#layer-level-genome-column-append" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Layer-level genome column append
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#presence-matrix-initialisation" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Presence matrix initialisation
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#why-the-mphf-is-never-rebuilt" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Why the MPHF is never rebuilt
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
@@ -895,6 +1068,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
@@ -1058,6 +1287,17 @@
|
|||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#index-mode-homogeneity-invariant" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Index mode (homogeneity invariant)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
@@ -1069,6 +1309,34 @@
|
|||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="MphfLayer — autonomous kmer → slot mapping">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#query-api" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Query API
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#build-surface" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Build surface
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
@@ -1080,6 +1348,73 @@
|
|||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Layer\<D: LayerData> — MPHF + payload">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#build-signatures" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Build signatures
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#fingerprintvec-and-fingerprintvecwriter" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
FingerprintVec and FingerprintVecWriter
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#layeredmapd-collection-of-layers" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
LayeredMap\<D> — collection of layers
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="LayeredMap\<D> — collection of layers">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#common-methods" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Common methods
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#push_layer" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
push_layer
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
@@ -1105,10 +1440,10 @@
|
|||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="#evidence-encoding" class="md-nav__link">
|
<a href="#evidence-encoding-exact" class="md-nav__link">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
Evidence encoding
|
Evidence encoding (exact)
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
@@ -1127,14 +1462,53 @@
|
|||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="#query-path" class="md-nav__link">
|
<a href="#column-append-and-merge-support" class="md-nav__link">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
Query path
|
Column append and merge support
|
||||||
|
|
||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Column append and merge support">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#layer-level-genome-column-append" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Layer-level genome column append
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#presence-matrix-initialisation" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Presence matrix initialisation
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#why-the-mphf-is-never-rebuilt" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Why the MPHF is never rebuilt
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
@@ -1178,7 +1552,7 @@
|
|||||||
|
|
||||||
<h1 id="obilayeredmap-layered-kmer-index-crate">obilayeredmap — layered kmer index crate</h1>
|
<h1 id="obilayeredmap-layered-kmer-index-crate">obilayeredmap — layered kmer index crate</h1>
|
||||||
<h2 id="purpose">Purpose</h2>
|
<h2 id="purpose">Purpose</h2>
|
||||||
<p><code>obilayeredmap</code> implements a persistent, incrementally extensible kmer index. The index is organised in three levels: <strong>index root → partition → layer</strong>. Each layer covers a disjoint kmer set and wraps a <code>ptr_hash</code> MPHF with associated per-slot data. Adding a new dataset never rebuilds existing layers.</p>
|
<p><code>obilayeredmap</code> implements a persistent, incrementally extensible kmer index. Each layer covers a disjoint kmer set and wraps a <code>ptr_hash</code> MPHF with associated per-slot data. Adding a new dataset never rebuilds existing layers.</p>
|
||||||
<hr />
|
<hr />
|
||||||
<h2 id="three-usage-modes">Three usage modes</h2>
|
<h2 id="three-usage-modes">Three usage modes</h2>
|
||||||
<p>The MPHF + evidence infrastructure is the same for all modes. The <strong>payload</strong> varies.</p>
|
<p>The MPHF + evidence infrastructure is the same for all modes. The <strong>payload</strong> varies.</p>
|
||||||
@@ -1214,34 +1588,65 @@
|
|||||||
</table>
|
</table>
|
||||||
<p>Both <code>PersistentCompactIntMatrix</code> and <code>PersistentBitMatrix</code> come from the <code>obicompactvec</code> crate.</p>
|
<p>Both <code>PersistentCompactIntMatrix</code> and <code>PersistentBitMatrix</code> come from the <code>obicompactvec</code> crate.</p>
|
||||||
<hr />
|
<hr />
|
||||||
|
<h2 id="index-mode-homogeneity-invariant">Index mode (homogeneity invariant)</h2>
|
||||||
|
<p>A partitioned index is homogeneous: every layer within a partition shares the same mode. The mode is determined once at <code>LayeredMap::open()</code> from <code>PartitionMeta.mode</code> and passed to each <code>Layer::open()</code> — no per-layer file is read.</p>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="cp">#[derive(Serialize, Deserialize, Default)]</span>
|
||||||
|
<span class="cp">#[serde(tag = </span><span class="s">"type"</span><span class="cp">, rename_all = </span><span class="s">"snake_case"</span><span class="cp">)]</span>
|
||||||
|
<span class="k">pub</span><span class="w"> </span><span class="k">enum</span><span class="w"> </span><span class="nc">IndexMode</span><span class="w"> </span><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="cp">#[default]</span>
|
||||||
|
<span class="w"> </span><span class="n">Exact</span><span class="p">,</span>
|
||||||
|
<span class="w"> </span><span class="n">Approx</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">b</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">,</span><span class="w"> </span><span class="n">z</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="w"> </span><span class="p">},</span>
|
||||||
|
<span class="w"> </span><span class="n">Hybrid</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">b</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">,</span><span class="w"> </span><span class="n">z</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="w"> </span><span class="p">},</span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p><code>IndexMode</code> is stored once in <code>PartitionMeta</code> (<code>meta.json</code> at partition root). There is no <code>layer_meta.json</code>.</p>
|
||||||
|
<ul>
|
||||||
|
<li><strong>Exact</strong>: writes <code>evidence.bin</code> + <code>unitigs.bin.idx</code>. Zero false positives.</li>
|
||||||
|
<li><strong>Approx</strong>: writes <code>fingerprint.bin</code> only. FP rate per kmer = 1/2^b; with Findere z-parameter, z consecutive kmers must all match → effective window FP ≈ 1/2^(b·z). No <code>.idx</code> written or required.</li>
|
||||||
|
<li><strong>Hybrid</strong>: writes <code>fingerprint.bin</code> as well as <code>evidence.bin</code> + <code>unitigs.bin.idx</code>. <code>find()</code> uses the fingerprint (fast, O(1)); <code>find_strict()</code> uses exact evidence.</li>
|
||||||
|
</ul>
|
||||||
|
<hr />
|
||||||
<h2 id="mphflayer-autonomous-kmer-slot-mapping">MphfLayer — autonomous kmer → slot mapping</h2>
|
<h2 id="mphflayer-autonomous-kmer-slot-mapping">MphfLayer — autonomous kmer → slot mapping</h2>
|
||||||
<p><code>MphfLayer</code> encapsulates the MPHF + evidence + unitig spine for one layer. It is independent of any payload data.</p>
|
<p><code>MphfLayer</code> encapsulates the MPHF and evidence store for one layer. It is independent of any payload.</p>
|
||||||
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">MphfLayer</span><span class="w"> </span><span class="p">{</span>
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">MphfLayer</span><span class="w"> </span><span class="p">{</span>
|
||||||
<span class="w"> </span><span class="n">mphf</span><span class="p">:</span><span class="w"> </span><span class="nc">Mphf</span><span class="p">,</span>
|
<span class="w"> </span><span class="n">mphf</span><span class="p">:</span><span class="w"> </span><span class="nc">Mphf</span><span class="p">,</span>
|
||||||
<span class="w"> </span><span class="n">evidence</span><span class="p">:</span><span class="w"> </span><span class="nc">Evidence</span><span class="p">,</span>
|
<span class="w"> </span><span class="n">ev</span><span class="p">:</span><span class="w"> </span><span class="nc">LayerEvidence</span><span class="p">,</span><span class="w"> </span><span class="c1">// loaded at open() time</span>
|
||||||
<span class="w"> </span><span class="n">unitigs</span><span class="p">:</span><span class="w"> </span><span class="nc">UnitigFileReader</span><span class="p">,</span>
|
<span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
||||||
<span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="c1">// number of indexed kmers = number of MPHF slots</span>
|
|
||||||
<span class="p">}</span>
|
<span class="p">}</span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>Public API:</p>
|
<p><code>LayerEvidence</code> is an internal enum, not public:</p>
|
||||||
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="w"> </span><span class="n">MphfLayer</span><span class="w"> </span><span class="p">{</span>
|
<div class="highlight"><pre><span></span><code><span class="k">enum</span><span class="w"> </span><span class="nc">LayerEvidence</span><span class="w"> </span><span class="p">{</span>
|
||||||
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">open</span><span class="p">(</span><span class="n">dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="bp">Self</span><span class="o">></span>
|
<span class="w"> </span><span class="n">Exact</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">evidence</span><span class="p">:</span><span class="w"> </span><span class="nc">Evidence</span><span class="p">,</span><span class="w"> </span><span class="n">unitigs</span><span class="p">:</span><span class="w"> </span><span class="nc">UnitigFileReader</span><span class="w"> </span><span class="p">},</span>
|
||||||
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">find</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">kmer</span><span class="p">:</span><span class="w"> </span><span class="nc">CanonicalKmer</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Option</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span><span class="w"> </span><span class="c1">// Some(slot) or None</span>
|
<span class="w"> </span><span class="n">Approx</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">fingerprint</span><span class="p">:</span><span class="w"> </span><span class="nc">FingerprintVec</span><span class="p">,</span><span class="w"> </span><span class="n">unitigs_path</span><span class="p">:</span><span class="w"> </span><span class="nc">PathBuf</span><span class="w"> </span><span class="p">},</span>
|
||||||
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">n</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">usize</span>
|
<span class="w"> </span><span class="n">Hybrid</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">evidence</span><span class="p">:</span><span class="w"> </span><span class="nc">Evidence</span><span class="p">,</span><span class="w"> </span><span class="n">unitigs</span><span class="p">:</span><span class="w"> </span><span class="nc">UnitigFileReader</span><span class="p">,</span><span class="w"> </span><span class="n">fingerprint</span><span class="p">:</span><span class="w"> </span><span class="nc">FingerprintVec</span><span class="w"> </span><span class="p">},</span>
|
||||||
<span class="w"> </span><span class="nc">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">unitig_writer</span><span class="p">(</span><span class="n">dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="n">UnitigFileWriter</span><span class="o">></span>
|
|
||||||
<span class="w"> </span><span class="k">pub</span><span class="p">(</span><span class="k">crate</span><span class="p">)</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build</span><span class="p">(</span>
|
|
||||||
<span class="w"> </span><span class="n">dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span>
|
|
||||||
<span class="w"> </span><span class="n">fill_slot</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">mut</span><span class="w"> </span><span class="k">impl</span><span class="w"> </span><span class="nb">FnMut</span><span class="p">(</span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="n">CanonicalKmer</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="p">,</span>
|
|
||||||
<span class="w"> </span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
|
||||||
<span class="p">}</span>
|
<span class="p">}</span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p><code>find</code> returns <code>Some(slot)</code> only after verifying via evidence that the kmer is actually indexed. It returns <code>None</code> for absent keys (ptr_hash maps any input to a valid slot; evidence verification is the only correct-membership test).</p>
|
<p><code>MphfLayer::open(dir, mode: &IndexMode)</code> receives the mode from <code>PartitionMeta</code> — no per-layer file is read.</p>
|
||||||
<p><code>build</code> runs two sequential passes over <code>unitigs.bin</code>:</p>
|
<h3 id="query-api">Query API</h3>
|
||||||
|
<p>Two public query methods, both returning <code>Option<usize></code> (slot index):</p>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">find</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">kmer</span><span class="p">:</span><span class="w"> </span><span class="nc">CanonicalKmer</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Option</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
|
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">find_strict</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">kmer</span><span class="p">:</span><span class="w"> </span><span class="nc">CanonicalKmer</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Option</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
|
</code></pre></div>
|
||||||
|
<ul>
|
||||||
|
<li><code>find</code>: O(1) auto-dispatch. Exact → exact evidence check. Approx/Hybrid → fingerprint comparison.</li>
|
||||||
|
<li><code>find_strict</code>: always exact. Exact/Hybrid → O(1) evidence check. Approx → O(n) sequential scan (no <code>.idx</code>).</li>
|
||||||
|
</ul>
|
||||||
|
<p>There are no <code>find_exact</code>/<code>find_approx</code> methods; panicking dispatch is eliminated.</p>
|
||||||
|
<h3 id="build-surface">Build surface</h3>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="c1">// Full MPHF + evidence build (two-pass)</span>
|
||||||
|
<span class="k">pub</span><span class="p">(</span><span class="k">crate</span><span class="p">)</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build</span><span class="p">(</span><span class="n">dir</span><span class="p">,</span><span class="w"> </span><span class="n">block_bits</span><span class="p">,</span><span class="w"> </span><span class="n">mode</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">IndexMode</span><span class="p">,</span><span class="w"> </span><span class="n">fill_slot</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
|
|
||||||
|
<span class="c1">// Evidence-only post-hoc builds (MPHF already present)</span>
|
||||||
|
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build_exact_evidence</span><span class="p">(</span><span class="n">dir</span><span class="p">,</span><span class="w"> </span><span class="n">block_bits</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
|
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build_approx_evidence</span><span class="p">(</span><span class="n">dir</span><span class="p">,</span><span class="w"> </span><span class="n">b</span><span class="p">,</span><span class="w"> </span><span class="n">z</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p><code>MphfLayer::build</code> runs two passes over <code>unitigs.bin</code>:</p>
|
||||||
<ol>
|
<ol>
|
||||||
<li><strong>Pass 1</strong>: iterate all canonical kmers in parallel via rayon, construct and store <code>mphf.bin</code>. <code>new_from_par_iter</code> avoids materialising a full key <code>Vec</code>.</li>
|
<li><strong>Pass 1</strong> (parallel via rayon): a <code>CanonicalKmerIter</code> (clonable, <code>Arc<Mmap></code>, no file reopening) is passed to <code>new_from_par_iter</code> via <code>par_bridge()</code>. Produces <code>mphf.bin</code>. No <code>.idx</code> is read or created at this stage.</li>
|
||||||
<li><strong>Pass 2</strong>: iterate again sequentially, fill <code>evidence.bin</code>, call <code>fill_slot(slot, kmer)</code> once per kmer for payload population. A compact <code>n/8</code>-byte seen-bitset verifies MPHF injectivity inline.</li>
|
<li><strong>Pass 2</strong> (sequential): fill evidence files; call <code>fill_slot(slot, kmer)</code> per kmer. <code>.idx</code> is written last for Exact/Hybrid modes (query-time only).</li>
|
||||||
</ol>
|
</ol>
|
||||||
<p>For empty layers (n = 0), <code>build</code> returns <code>Ok(0)</code> immediately after creating empty <code>mphf.bin</code> and <code>evidence.bin</code>.</p>
|
<p>There is no <code>build_evidence</code> dispatch wrapper — callers invoke <code>build_exact_evidence</code> or <code>build_approx_evidence</code> directly.</p>
|
||||||
|
<p>For empty layers (n = 0), all build variants return <code>Ok(0)</code> immediately after creating empty output files.</p>
|
||||||
<hr />
|
<hr />
|
||||||
<h2 id="layerd-layerdata-mphf-payload">Layer\<D: LayerData> — MPHF + payload</h2>
|
<h2 id="layerd-layerdata-mphf-payload">Layer\<D: LayerData> — MPHF + payload</h2>
|
||||||
<p><code>Layer<D></code> pairs an <code>MphfLayer</code> with one payload store.</p>
|
<p><code>Layer<D></code> pairs an <code>MphfLayer</code> with one payload store.</p>
|
||||||
@@ -1261,7 +1666,7 @@
|
|||||||
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">data</span><span class="p">:</span><span class="w"> </span><span class="nc">T</span><span class="p">,</span>
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">data</span><span class="p">:</span><span class="w"> </span><span class="nc">T</span><span class="p">,</span>
|
||||||
<span class="p">}</span>
|
<span class="p">}</span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p><code>LayerData</code> covers the <strong>read path only</strong> (<code>open</code> + <code>read</code>). Build signatures differ between modes and are not in the trait.</p>
|
<p><code>LayerData</code> covers the <strong>read path only</strong> (<code>open</code> + <code>read</code>). Build signatures differ between modes and are not part of the trait.</p>
|
||||||
<table>
|
<table>
|
||||||
<thead>
|
<thead>
|
||||||
<tr>
|
<tr>
|
||||||
@@ -1288,28 +1693,89 @@
|
|||||||
</tr>
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
<p><strong>Build signatures:</strong></p>
|
<h3 id="build-signatures">Build signatures</h3>
|
||||||
<div class="highlight"><pre><span></span><code><span class="c1">// mode 1</span>
|
<div class="highlight"><pre><span></span><code><span class="c1">// mode 1</span>
|
||||||
<span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
<span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build</span><span class="p">(</span><span class="n">out_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build</span><span class="p">(</span><span class="n">out_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">block_bits</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">,</span><span class="w"> </span><span class="n">mode</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">IndexMode</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
<span class="p">}</span>
|
<span class="p">}</span>
|
||||||
|
|
||||||
<span class="c1">// mode 2</span>
|
<span class="c1">// mode 2</span>
|
||||||
<span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o"><</span><span class="n">PersistentCompactIntMatrix</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
<span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o"><</span><span class="n">PersistentCompactIntMatrix</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build</span><span class="p">(</span><span class="n">out_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">count_of</span><span class="p">:</span><span class="w"> </span><span class="nc">impl</span><span class="w"> </span><span class="nb">Fn</span><span class="p">(</span><span class="n">CanonicalKmer</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build</span><span class="p">(</span><span class="n">out_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">block_bits</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">,</span><span class="w"> </span><span class="n">mode</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">IndexMode</span><span class="p">,</span>
|
||||||
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build_from_map</span><span class="p">(</span><span class="n">out_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">counts</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">HashMap</span><span class="o"><</span><span class="n">CanonicalKmer</span><span class="p">,</span><span class="w"> </span><span class="kt">u32</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
<span class="w"> </span><span class="n">count_of</span><span class="p">:</span><span class="w"> </span><span class="nc">impl</span><span class="w"> </span><span class="nb">Fn</span><span class="p">(</span><span class="n">CanonicalKmer</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build_from_map</span><span class="p">(</span><span class="n">out_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">block_bits</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">,</span><span class="w"> </span><span class="n">mode</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">IndexMode</span><span class="p">,</span>
|
||||||
|
<span class="w"> </span><span class="n">counts</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">HashMap</span><span class="o"><</span><span class="n">CanonicalKmer</span><span class="p">,</span><span class="w"> </span><span class="kt">u32</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
<span class="p">}</span>
|
<span class="p">}</span>
|
||||||
|
|
||||||
<span class="c1">// mode 3</span>
|
<span class="c1">// mode 3</span>
|
||||||
<span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o"><</span><span class="n">PersistentBitMatrix</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
<span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o"><</span><span class="n">PersistentBitMatrix</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build_presence</span><span class="p">(</span>
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build_presence</span><span class="p">(</span><span class="n">out_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">block_bits</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">,</span><span class="w"> </span><span class="n">mode</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">IndexMode</span><span class="p">,</span>
|
||||||
<span class="w"> </span><span class="n">out_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span>
|
|
||||||
<span class="w"> </span><span class="n">n_genomes</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
<span class="w"> </span><span class="n">n_genomes</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
||||||
<span class="w"> </span><span class="n">present_in</span><span class="p">:</span><span class="w"> </span><span class="nc">impl</span><span class="w"> </span><span class="nb">Fn</span><span class="p">(</span><span class="n">CanonicalKmer</span><span class="p">,</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">bool</span><span class="p">,</span>
|
<span class="w"> </span><span class="n">present_in</span><span class="p">:</span><span class="w"> </span><span class="nc">impl</span><span class="w"> </span><span class="nb">Fn</span><span class="p">(</span><span class="n">CanonicalKmer</span><span class="p">,</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">bool</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
<span class="w"> </span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
|
||||||
<span class="p">}</span>
|
<span class="p">}</span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>All build impls delegate MPHF + evidence construction to <code>MphfLayer::build</code> via a mode-specific <code>fill_slot</code> callback. Mode 2 pre-reads <code>n_kmers</code> from <code>unitigs.bin</code> to size the <code>PersistentCompactIntMatrixBuilder</code> before calling <code>MphfLayer::build</code>. Mode 3 does the same for <code>PersistentBitMatrixBuilder</code>.</p>
|
<p>All build impls delegate to <code>MphfLayer::build</code> via a mode-specific <code>fill_slot</code> callback. The <code>mode</code> parameter is forwarded directly — no <code>LayerMeta</code> is written.</p>
|
||||||
|
<p>Evidence-only post-hoc builds are accessible directly on <code>Layer<D></code>:</p>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="o"><</span><span class="n">D</span><span class="p">:</span><span class="w"> </span><span class="nc">LayerData</span><span class="o">></span><span class="w"> </span><span class="n">Layer</span><span class="o"><</span><span class="n">D</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build_exact_evidence</span><span class="p">(</span><span class="n">layer_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">block_bits</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">build_approx_evidence</span><span class="p">(</span><span class="n">layer_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">b</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">,</span><span class="w"> </span><span class="n">z</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p>There is no <code>build_evidence</code> dispatch wrapper.</p>
|
||||||
|
<hr />
|
||||||
|
<h2 id="fingerprintvec-and-fingerprintvecwriter">FingerprintVec and FingerprintVecWriter</h2>
|
||||||
|
<p>Approximate evidence is stored as a packed b-bit array, one fingerprint per MPHF slot.</p>
|
||||||
|
<div class="highlight"><pre><span></span><code>fingerprint.bin format:
|
||||||
|
magic: b"FPVF" (4 bytes)
|
||||||
|
b: u8 (bits per fingerprint, 1..=64)
|
||||||
|
padding: [0u8; 3]
|
||||||
|
n: u64 LE (number of slots)
|
||||||
|
data: packed bits, ceil(n*b/8) bytes, Lsb0 order
|
||||||
|
</code></pre></div>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="w"> </span><span class="n">FingerprintVec</span><span class="w"> </span><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">open</span><span class="p">(</span><span class="n">path</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="bp">Self</span><span class="o">></span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">get</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">slot</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u64</span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">matches</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">slot</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="n">fingerprint</span><span class="p">:</span><span class="w"> </span><span class="kt">u64</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">bool</span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">n</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">usize</span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">b</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u8</span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p><code>matches(slot, fingerprint)</code> extracts the b-bit fingerprint stored at <code>slot</code> and compares it to the low b bits of the <code>fingerprint</code> argument (a hash value). It is the core operation of <code>find_approx</code>.</p>
|
||||||
|
<hr />
|
||||||
|
<h2 id="layeredmapd-collection-of-layers">LayeredMap<D> — collection of layers</h2>
|
||||||
|
<p><code>LayeredMap<D></code> wraps <code>Vec<Layer<D>></code> for a single partition directory.</p>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">LayeredMap</span><span class="o"><</span><span class="n">D</span><span class="p">:</span><span class="w"> </span><span class="nc">LayerData</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">()</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="n">root</span><span class="p">:</span><span class="w"> </span><span class="nc">PathBuf</span><span class="p">,</span>
|
||||||
|
<span class="w"> </span><span class="n">meta</span><span class="p">:</span><span class="w"> </span><span class="nc">PartitionMeta</span><span class="p">,</span>
|
||||||
|
<span class="w"> </span><span class="n">layers</span><span class="p">:</span><span class="w"> </span><span class="nb">Vec</span><span class="o"><</span><span class="n">Layer</span><span class="o"><</span><span class="n">D</span><span class="o">>></span><span class="p">,</span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p><code>PartitionMeta</code> (<code>meta.json</code> at the partition root) stores <code>n_layers</code> and the index <code>mode</code>.</p>
|
||||||
|
<h3 id="common-methods">Common methods</h3>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">open</span><span class="p">(</span><span class="n">root</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="bp">Self</span><span class="o">></span>
|
||||||
|
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">create</span><span class="p">(</span><span class="n">root</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">mode</span><span class="p">:</span><span class="w"> </span><span class="nc">IndexMode</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="bp">Self</span><span class="o">></span>
|
||||||
|
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">n_layers</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">usize</span>
|
||||||
|
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">layer</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">i</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kp">&</span><span class="nc">Layer</span><span class="o"><</span><span class="n">D</span><span class="o">></span>
|
||||||
|
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">mode</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kp">&</span><span class="nc">IndexMode</span>
|
||||||
|
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">query</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">kmer</span><span class="p">:</span><span class="w"> </span><span class="nc">CanonicalKmer</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Option</span><span class="o"><</span><span class="p">(</span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="n">Hit</span><span class="o"><</span><span class="n">D</span><span class="p">::</span><span class="n">Item</span><span class="o">></span><span class="p">)</span><span class="o">></span>
|
||||||
|
<span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">next_layer_writer</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="n">UnitigFileWriter</span><span class="o">></span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p><code>open</code> reads <code>PartitionMeta</code> once, extracts <code>mode</code>, and passes it to every <code>Layer::open</code> — no per-layer file is read. <code>create</code> stores the given mode in <code>PartitionMeta</code>.</p>
|
||||||
|
<p><code>query</code> probes layers in order and returns <code>(layer_index, Hit)</code> on the first match. Expected probe depth: 1 for kmers in layer 0.</p>
|
||||||
|
<h3 id="push_layer">push_layer</h3>
|
||||||
|
<p><code>push_layer</code> builds the next layer from a <code>unitigs.bin</code> already written via <code>next_layer_writer</code>, using <code>DEFAULT_BLOCK_BITS</code>:</p>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="c1">// mode 1</span>
|
||||||
|
<span class="k">impl</span><span class="w"> </span><span class="n">LayeredMap</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">push_layer</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
|
||||||
|
<span class="c1">// mode 2</span>
|
||||||
|
<span class="k">impl</span><span class="w"> </span><span class="n">LayeredMap</span><span class="o"><</span><span class="n">PersistentCompactIntMatrix</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">push_layer</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">count_of</span><span class="p">:</span><span class="w"> </span><span class="nc">impl</span><span class="w"> </span><span class="nb">Fn</span><span class="p">(</span><span class="n">CanonicalKmer</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">push_layer_from_map</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">counts</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">HashMap</span><span class="o"><</span><span class="n">CanonicalKmer</span><span class="p">,</span><span class="w"> </span><span class="kt">u32</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="kt">usize</span><span class="o">></span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p>Mode 3 (<code>PersistentBitMatrix</code>) has no <code>push_layer</code> on <code>LayeredMap</code>; callers build directly via <code>Layer<PersistentBitMatrix>::build_presence</code>.</p>
|
||||||
<hr />
|
<hr />
|
||||||
<h2 id="layeredstores-and-aggregation-traits">LayeredStore<S> and aggregation traits</h2>
|
<h2 id="layeredstores-and-aggregation-traits">LayeredStore<S> and aggregation traits</h2>
|
||||||
<p><code>LayeredStore<S></code> is a generic aggregation wrapper over <code>Vec<S></code>. It propagates three traits from <code>obicompactvec::traits</code> up the hierarchy via blanket impls:</p>
|
<p><code>LayeredStore<S></code> is a generic aggregation wrapper over <code>Vec<S></code>. It propagates three traits from <code>obicompactvec::traits</code> up the hierarchy via blanket impls:</p>
|
||||||
@@ -1320,11 +1786,6 @@
|
|||||||
<span class="k">impl</span><span class="o"><</span><span class="n">S</span><span class="p">:</span><span class="w"> </span><span class="nc">BitPartials</span><span class="o">></span><span class="w"> </span><span class="n">BitPartials</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">LayeredStore</span><span class="o"><</span><span class="n">S</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="c1">// element-wise Σ partials</span>
|
<span class="k">impl</span><span class="o"><</span><span class="n">S</span><span class="p">:</span><span class="w"> </span><span class="nc">BitPartials</span><span class="o">></span><span class="w"> </span><span class="n">BitPartials</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">LayeredStore</span><span class="o"><</span><span class="n">S</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="c1">// element-wise Σ partials</span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>Because blanket impls compose, <code>LayeredStore<LayeredStore<S>></code> automatically inherits all three traits when <code>S</code> does — providing the partitioned level without a separate type.</p>
|
<p>Because blanket impls compose, <code>LayeredStore<LayeredStore<S>></code> automatically inherits all three traits when <code>S</code> does — providing the partitioned level without a separate type.</p>
|
||||||
<p><strong>Aggregation hierarchy:</strong></p>
|
|
||||||
<div class="highlight"><pre><span></span><code>PersistentCompactIntMatrix implements CountPartials
|
|
||||||
LayeredStore<PersistentCompactIntMatrix> via blanket impl (one partition)
|
|
||||||
LayeredStore<LayeredStore<…>> via blanket impl (partitioned index)
|
|
||||||
</code></pre></div>
|
|
||||||
<p><strong>Leaf implementors</strong> (in <code>obicompactvec</code>):</p>
|
<p><strong>Leaf implementors</strong> (in <code>obicompactvec</code>):</p>
|
||||||
<table>
|
<table>
|
||||||
<thead>
|
<thead>
|
||||||
@@ -1344,69 +1805,77 @@ LayeredStore<LayeredStore<…>> via blanket impl
|
|||||||
</tr>
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
<p><code>PersistentCompactIntVec</code> and <code>PersistentBitVec</code> do not implement these traits — they are single-column primitives, not matrix-level aggregators.</p>
|
|
||||||
<p>See <a href="../../architecture/index_architecture/">Kmer index architecture</a> for the full trait API and the two-pass normalised-metric pattern.</p>
|
<p>See <a href="../../architecture/index_architecture/">Kmer index architecture</a> for the full trait API and the two-pass normalised-metric pattern.</p>
|
||||||
<hr />
|
<hr />
|
||||||
<h2 id="on-disk-structure">On-disk structure</h2>
|
<h2 id="on-disk-structure">On-disk structure</h2>
|
||||||
<div class="highlight"><pre><span></span><code>index_root/ ← LayeredMap (collection)
|
<div class="highlight"><pre><span></span><code>partition_root/ ← LayeredMap (one partition)
|
||||||
meta.json
|
meta.json — {"n_layers": N, "mode": {"type": "exact"|"approx"|"hybrid", ...}}
|
||||||
part_00000/ ← Partition
|
|
||||||
layer_0/ ← Layer
|
layer_0/ ← Layer
|
||||||
mphf.bin — ptr_hash MPHF (epserde format)
|
mphf.bin — ptr_hash MPHF (epserde format)
|
||||||
unitigs.bin — packed 2-bit nucleotide sequences
|
unitigs.bin — packed 2-bit nucleotide sequences
|
||||||
unitigs.bin.idx — UIDX index: n_unitigs, n_kmers, seqls[], packed_offsets[]
|
unitigs.bin.idx — UIDX index (Exact/Hybrid only; query-time, never built during MPHF construction)
|
||||||
evidence.bin — n × u32, each = (chunk_id: 25 bits | rank: 7 bits), LE
|
evidence.bin — [u32; n], LE (Exact/Hybrid only)
|
||||||
|
fingerprint.bin — packed b-bit array (Approx/Hybrid only)
|
||||||
counts/ [mode 2] PersistentCompactIntMatrix
|
counts/ [mode 2] PersistentCompactIntMatrix
|
||||||
meta.json {"n": N, "n_cols": 1}
|
meta.json
|
||||||
col_000000.pciv
|
col_000000.pciv
|
||||||
presence/ [mode 3] PersistentBitMatrix
|
presence/ [mode 3] PersistentBitMatrix
|
||||||
meta.json {"n": N, "n_cols": G}
|
meta.json
|
||||||
col_000000.pbiv
|
col_000000.pbiv …
|
||||||
…
|
|
||||||
layer_1/
|
layer_1/
|
||||||
…
|
…
|
||||||
part_00001/
|
|
||||||
…
|
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p><strong>Partition</strong> (<code>part_XXXXX/</code>): all kmers whose canonical minimiser hashes to this bucket. Partitions are independent and can be processed in parallel.</p>
|
<p>There is no <code>layer_meta.json</code>. The mode is stored once in <code>PartitionMeta</code> and is valid for all layers. <code>unitigs.bin.idx</code> is built at the end of <code>build_exact_evidence</code> — never during MPHF construction — and is consumed at query time only.</p>
|
||||||
<p><strong>Layer</strong> (<code>layer_N/</code>): one <code>MphfLayer</code> plus optional payload. Layer 0 covers dataset A; layer 1 covers kmers in B absent from A; etc. Layers within a partition are always disjoint.</p>
|
|
||||||
<hr />
|
<hr />
|
||||||
<h2 id="evidence-encoding">Evidence encoding</h2>
|
<h2 id="evidence-encoding-exact">Evidence encoding (exact)</h2>
|
||||||
<p><code>evidence.bin</code> is a flat <code>[u32; n]</code> array with no header. Each u32 encodes one slot:</p>
|
<p><code>evidence.bin</code> is a flat <code>[u32; n]</code> array with no header. Each u32 encodes one slot:</p>
|
||||||
<div class="highlight"><pre><span></span><code>bits [31:7] = chunk_id (25 bits) — index of the unitig chunk
|
<div class="highlight"><pre><span></span><code>bits [31:7] = chunk_id (25 bits) — index of the unitig chunk
|
||||||
bits [6:0] = rank (7 bits) — kmer index within the chunk (0-based)
|
bits [6:0] = rank (7 bits) — kmer index within the chunk (0-based)
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>Decoding: <code>chunk_id = raw >> 7</code>, <code>rank = raw & 0x7F</code>. Reconstructing the kmer: read k nucleotides at position <code>rank</code> within unitig <code>chunk_id</code>.</p>
|
<p><code>chunk_id = raw >> 7</code>, <code>rank = raw & 0x7F</code>. Reconstructing the kmer: read k nucleotides at position <code>rank</code> within unitig <code>chunk_id</code> (requires <code>unitigs.bin.idx</code> for random access).</p>
|
||||||
<p>For k=31, m=11, the observed maximum is ~46 kmers per chunk — well within the 127-kmer u7 capacity. The structural maximum from superkmer construction is k − m + 1 = 21 kmers/unitig; longer unitigs arise from paths spanning more than one superkmer.</p>
|
<p>For k=31, m=11, the observed maximum is ~46 kmers per chunk — well within the 127-kmer u7 capacity.</p>
|
||||||
<hr />
|
<hr />
|
||||||
<h2 id="ptr_hash-configuration">ptr_hash configuration</h2>
|
<h2 id="ptr_hash-configuration">ptr_hash configuration</h2>
|
||||||
<div class="highlight"><pre><span></span><code><span class="k">type</span><span class="w"> </span><span class="nc">Mphf</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">PtrHash</span><span class="o"><</span>
|
<div class="highlight"><pre><span></span><code><span class="k">type</span><span class="w"> </span><span class="nc">Mphf</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">PtrHash</span><span class="o"><</span>
|
||||||
<span class="w"> </span><span class="kt">u64</span><span class="p">,</span><span class="w"> </span><span class="c1">// key type: canonical kmer raw encoding</span>
|
<span class="w"> </span><span class="kt">u64</span><span class="p">,</span><span class="w"> </span><span class="c1">// key type: canonical kmer raw encoding</span>
|
||||||
<span class="w"> </span><span class="n">CubicEps</span><span class="p">,</span><span class="w"> </span><span class="c1">// bucket fn: 2.4 bits/key, λ=3.5, α=0.99</span>
|
<span class="w"> </span><span class="n">CubicEps</span><span class="p">,</span><span class="w"> </span><span class="c1">// bucket fn: 2.4 bits/key, λ=3.5, α=0.99</span>
|
||||||
<span class="w"> </span><span class="n">CachelineEfVec</span><span class="o"><</span><span class="nb">Vec</span><span class="o"><</span><span class="n">CachelineEf</span><span class="o">>></span><span class="p">,</span><span class="w"> </span><span class="c1">// remap: 11.6 bits/entry (Elias-Fano)</span>
|
<span class="w"> </span><span class="n">CachelineEfVec</span><span class="o"><</span><span class="nb">Vec</span><span class="o"><</span><span class="n">CachelineEf</span><span class="o">>></span><span class="p">,</span><span class="w"> </span><span class="c1">// remap: Elias-Fano</span>
|
||||||
<span class="w"> </span><span class="n">Xx64</span><span class="p">,</span><span class="w"> </span><span class="c1">// hasher: XXH3-64 with seed</span>
|
<span class="w"> </span><span class="n">Xx64</span><span class="p">,</span><span class="w"> </span><span class="c1">// hasher: XXH3-64 with seed</span>
|
||||||
<span class="w"> </span><span class="nb">Vec</span><span class="o"><</span><span class="kt">u8</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="c1">// pilots</span>
|
<span class="w"> </span><span class="nb">Vec</span><span class="o"><</span><span class="kt">u8</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="c1">// pilots</span>
|
||||||
<span class="o">></span><span class="p">;</span>
|
<span class="o">></span><span class="p">;</span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p><code>Xx64</code> is chosen over <code>FxHash</code> because canonical kmer raw values are left-aligned u64 with structural zeros in the low bits (42 zeros for k=11, 2 zeros for k=31), which single-multiply hashes distribute poorly.</p>
|
<p><code>Xx64</code> is chosen over <code>FxHash</code> because canonical kmer raw values are left-aligned u64 with structural zeros in the low bits (42 zeros for k=11, 2 zeros for k=31), which single-multiply hashes distribute poorly.</p>
|
||||||
<p><code>CubicEps</code> with <code>PtrHashParams::<CubicEps>::default()</code> (λ=3.5) is a balanced tradeoff: 2× slower construction than <code>Linear/λ=3.0</code>, 20% less space.</p>
|
<p><code>CubicEps</code> with <code>PtrHashParams::&lt;CubicEps&gt;::default()</code> (λ=3.5) trades 2× slower construction than <code>Linear/λ=3.0</code> for ~20% less space.</p>
|
||||||
<hr />
|
<hr />
|
||||||
<h2 id="query-path">Query path</h2>
|
<h2 id="column-append-and-merge-support">Column append and merge support</h2>
|
||||||
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">query</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">kmer</span><span class="p">:</span><span class="w"> </span><span class="nc">CanonicalKmer</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Option</span><span class="o"><</span><span class="n">Hit</span><span class="o"><</span><span class="n">D</span><span class="p">::</span><span class="n">Item</span><span class="o">>></span><span class="w"> </span><span class="p">{</span>
|
<p>These methods extend existing layers with new genome columns without touching the MPHF.</p>
|
||||||
<span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">mphf</span><span class="p">.</span><span class="n">find</span><span class="p">(</span><span class="n">kmer</span><span class="p">).</span><span class="n">map</span><span class="p">(</span><span class="o">|</span><span class="n">slot</span><span class="o">|</span><span class="w"> </span><span class="n">Hit</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">slot</span><span class="p">,</span><span class="w"> </span><span class="n">data</span><span class="p">:</span><span class="w"> </span><span class="nc">self</span><span class="p">.</span><span class="n">data</span><span class="p">.</span><span class="n">read</span><span class="p">(</span><span class="n">slot</span><span class="p">)</span><span class="w"> </span><span class="p">})</span>
|
<h3 id="layer-level-genome-column-append">Layer-level genome column append</h3>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o"><</span><span class="n">PersistentBitMatrix</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">append_genome_column</span><span class="p">(</span><span class="n">layer_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">value_of</span><span class="p">:</span><span class="w"> </span><span class="nc">impl</span><span class="w"> </span><span class="nb">Fn</span><span class="p">(</span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">bool</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="p">()</span><span class="o">></span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
|
||||||
|
<span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o"><</span><span class="n">PersistentCompactIntMatrix</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">append_genome_column</span><span class="p">(</span><span class="n">layer_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">value_of</span><span class="p">:</span><span class="w"> </span><span class="nc">impl</span><span class="w"> </span><span class="nb">Fn</span><span class="p">(</span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="p">()</span><span class="o">></span>
|
||||||
<span class="p">}</span>
|
<span class="p">}</span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p><code>MphfLayer::find</code> probes the MPHF, decodes evidence, and verifies the kmer — returning <code>Some(slot)</code> on match, <code>None</code> otherwise. <code>data.read(slot)</code> is called only on a confirmed hit.</p>
|
<p>Both delegate to the corresponding <code>PersistentBitMatrix::append_column</code> / <code>PersistentCompactIntMatrix::append_column</code>. They write a new column file (<code>col_NNNNNN.pbiv</code> / <code>col_NNNNNN.pciv</code>) and update <code>meta.json</code> to increment <code>n_cols</code>. <code>value_of</code> is called once per slot (0..n).</p>
|
||||||
<p>In <code>LayeredMap</code>, layers are probed in order; the first match wins. Expected probe depth: 1 for kmers in layer 0.</p>
|
<h3 id="presence-matrix-initialisation">Presence matrix initialisation</h3>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">init_presence_matrix</span><span class="p">(</span><span class="n">layer_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">n_kmers</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="p">()</span><span class="o">></span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p>Called on the first merge of a Presence-mode index. Creates <code>presence/</code> with <code>meta.json {"n": n_kmers, "n_cols": 1}</code> and <code>col_000000.pbiv</code> set entirely to <code>true</code>. This retroactively records genome 0 (the original source) as present in every slot, satisfying the column-count invariant before any new-source column is appended.</p>
|
||||||
|
<h3 id="why-the-mphf-is-never-rebuilt">Why the MPHF is never rebuilt</h3>
|
||||||
|
<p>The MPHF, evidence, and unitigs are built once from the kmer set of a layer and are immutable for the lifetime of that layer. Adding a genome column does not change the kmer set — it only appends a new data column indexed by the same slot numbers. The only disk writes are one new <code>.pciv</code>/<code>.pbiv</code> file and a single <code>meta.json</code> update.</p>
|
||||||
<hr />
|
<hr />
|
||||||
<h2 id="add-layer-algorithm">Add-layer algorithm</h2>
|
<h2 id="add-layer-algorithm">Add-layer algorithm</h2>
|
||||||
<p>When adding dataset B to an existing index:</p>
|
<p>When adding dataset B to an existing index:</p>
|
||||||
<ol>
|
<ol>
|
||||||
<li>For each partition, probe existing layers for kmers of B routed to that partition.</li>
|
<li>For each partition, probe existing layers for kmers of B routed to that partition.</li>
|
||||||
<li>Collect kmers absent from all layers → <code>B \ index</code>.</li>
|
<li>Collect kmers absent from all layers → <code>B \ index</code>.</li>
|
||||||
<li>Write <code>B \ index</code> to a new <code>unitigs.bin</code> via <code>MphfLayer::unitig_writer</code>.</li>
|
<li>Write <code>B \ index</code> to a new <code>unitigs.bin</code> via <code>next_layer_writer()</code>.</li>
|
||||||
<li>Call <code>Layer<D>::build</code> on the new directory.</li>
|
<li>Call <code>Layer&lt;D&gt;::build</code> (or <code>build_presence</code>) on the new layer directory.</li>
|
||||||
<li>Update <code>meta.json</code>.</li>
|
<li>Call <code>push_layer</code> (or <code>append_layer</code>) to register the new layer in <code>meta.json</code>.</li>
|
||||||
</ol>
|
</ol>
|
||||||
<p>Each partition's new layer is built independently; the operation is fully parallel across partitions.</p>
|
<p>Each partition's new layer is built independently; the operation is fully parallel across partitions.</p>
|
||||||
<hr />
|
<hr />
|
||||||
@@ -1433,11 +1902,15 @@ bits [6:0] = rank (7 bits) — kmer index within the chunk (0-based)
|
|||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td><code>memmap2 0.9</code></td>
|
<td><code>memmap2 0.9</code></td>
|
||||||
<td>mmap of evidence and payload files</td>
|
<td>mmap of evidence and fingerprint files</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>bitvec</code></td>
|
||||||
|
<td>packed b-bit fingerprint storage</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td><code>obiskio</code></td>
|
<td><code>obiskio</code></td>
|
||||||
<td>unitig file writer/reader</td>
|
<td>unitig file writer/reader + <code>.idx</code> build</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td><code>obicompactvec</code></td>
|
<td><code>obicompactvec</code></td>
|
||||||
@@ -1448,8 +1921,8 @@ bits [6:0] = rank (7 bits) — kmer index within the chunk (0-based)
|
|||||||
<td>parallel MPHF construction pass</td>
|
<td>parallel MPHF construction pass</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td><code>ndarray 0.16</code></td>
|
<td><code>serde / serde_json</code></td>
|
||||||
<td>aggregation output arrays</td>
|
<td><code>PartitionMeta</code> serialisation</td>
|
||||||
</tr>
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -662,6 +662,17 @@
|
|||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#make_pipe-dsl" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
make_pipe! DSL
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
@@ -801,6 +812,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../obilayeredmap/" class="md-nav__link">
|
<a href="../obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -879,6 +918,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
@@ -1087,6 +1182,17 @@
|
|||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#make_pipe-dsl" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
make_pipe! DSL
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
@@ -1145,7 +1251,7 @@
|
|||||||
|
|
||||||
|
|
||||||
<h1 id="obipipeline-parallel-pipeline-library">obipipeline — parallel pipeline library</h1>
|
<h1 id="obipipeline-parallel-pipeline-library">obipipeline — parallel pipeline library</h1>
|
||||||
<p><code>obipipeline</code> is a generic, multi-threaded data pipeline crate. It connects a <strong>source</strong>, a chain of <strong>transforms</strong>, and a <strong>sink</strong> via crossbeam channels, running each stage with a shared worker pool and a biased scheduler.</p>
|
<p><code>obipipeline</code> is a generic, multi-threaded data pipeline crate. It connects a <strong>source</strong>, a chain of <strong>stages</strong>, and a <strong>sink</strong> via crossbeam channels, running each stage with a shared worker pool and a biased scheduler.</p>
|
||||||
<h2 id="core-types">Core types</h2>
|
<h2 id="core-types">Core types</h2>
|
||||||
<table>
|
<table>
|
||||||
<thead>
|
<thead>
|
||||||
@@ -1158,22 +1264,33 @@
|
|||||||
<tbody>
|
<tbody>
|
||||||
<tr>
|
<tr>
|
||||||
<td><code>SourceFn<D></code></td>
|
<td><code>SourceFn<D></code></td>
|
||||||
<td><code>Box<dyn FnMut() -> Result<D, PipelineError> + Send+Sync></code></td>
|
<td><code>Box&lt;dyn FnMut() -&gt; Result&lt;D, PipelineError&gt; + Send&gt;</code></td>
|
||||||
<td>Called repeatedly; <code>FnMut</code> because it holds iterator state</td>
|
<td>Called repeatedly; <code>FnMut</code> because it holds iterator state</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td><code>SharedFn<D></code></td>
|
<td><code>SharedFn<D></code></td>
|
||||||
<td><code>Arc<dyn Fn(D) -> Result<D, PipelineError> + Send+Sync></code></td>
|
<td><code>Arc&lt;dyn Fn(D) -&gt; Result&lt;D, PipelineError&gt; + Send + Sync&gt;</code></td>
|
||||||
<td>Shared across workers via <code>Arc::clone</code> (no copy of the closure)</td>
|
<td>1→1 transform shared across workers via <code>Arc::clone</code></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>SharedFlatFn<D></code></td>
|
||||||
|
<td><code>Arc&lt;dyn Fn(D, &amp;Sender&lt;Result&lt;D, _&gt;&gt;, &amp;Sender&lt;isize&gt;) + Send + Sync&gt;</code></td>
|
||||||
|
<td>1→N transform; pushes its output items into the <code>Result</code> channel and sends an <code>isize</code> delta on the second channel</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td><code>SinkFn<D></code></td>
|
<td><code>SinkFn<D></code></td>
|
||||||
<td><code>Box<dyn Fn(D) -> Result<(), PipelineError> + Send+Sync></code></td>
|
<td><code>Box&lt;dyn Fn(D) -&gt; Result&lt;(), PipelineError&gt; + Send&gt;</code></td>
|
||||||
<td>Final consumer; returns <code>Result</code> so errors propagate back</td>
|
<td>Final consumer; returns <code>Result</code> so errors propagate back</td>
|
||||||
</tr>
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
<p><code>Pipeline<D></code> holds one <code>SourceFn</code>, a <code>Vec<SharedFn></code>, and one <code>SinkFn</code>.<br />
|
<p>Stages come in two variants:</p>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">enum</span><span class="w"> </span><span class="nc">Stage</span><span class="o"><</span><span class="n">D</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="n">Transform</span><span class="p">(</span><span class="n">SharedFn</span><span class="o"><</span><span class="n">D</span><span class="o">></span><span class="p">),</span><span class="w"> </span><span class="c1">// 1→1</span>
|
||||||
|
<span class="w"> </span><span class="n">Flat</span><span class="p">(</span><span class="n">SharedFlatFn</span><span class="o"><</span><span class="n">D</span><span class="o">></span><span class="p">),</span><span class="w"> </span><span class="c1">// 1→N</span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p><code>Pipeline&lt;D&gt;</code> holds one <code>SourceFn</code>, a <code>Vec&lt;Stage&gt;</code>, and one <code>SinkFn</code>.<br />
|
||||||
<code>WorkerPool&lt;D&gt;</code> wraps a <code>Pipeline</code> with <code>n_workers</code> and channel <code>capacity</code>.</p>
|
<code>WorkerPool&lt;D&gt;</code> wraps a <code>Pipeline</code> with <code>n_workers</code> and channel <code>capacity</code>.</p>
|
||||||
<h2 id="workerpool">WorkerPool</h2>
|
<h2 id="workerpool">WorkerPool</h2>
|
||||||
<div class="highlight"><pre><span></span><code><span class="n">WorkerPool</span><span class="p">::</span><span class="n">new</span><span class="p">(</span><span class="n">pipeline</span><span class="p">:</span><span class="w"> </span><span class="nc">Pipeline</span><span class="o"><</span><span class="n">D</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="n">n_workers</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="n">capacity</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Self</span>
|
<div class="highlight"><pre><span></span><code><span class="n">WorkerPool</span><span class="p">::</span><span class="n">new</span><span class="p">(</span><span class="n">pipeline</span><span class="p">:</span><span class="w"> </span><span class="nc">Pipeline</span><span class="o"><</span><span class="n">D</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="n">n_workers</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="n">capacity</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Self</span>
|
||||||
@@ -1193,7 +1310,7 @@
|
|||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td><code>capacity</code></td>
|
<td><code>capacity</code></td>
|
||||||
<td>Bound on every crossbeam channel in the pipeline (source output, inter-stage channels, worker input, sink input, sink error). Controls memory and back-pressure: a full channel blocks the sender until a slot frees.</td>
|
<td>Bound on every crossbeam channel in the pipeline. Controls memory and back-pressure: a full channel blocks the sender until a slot frees.</td>
|
||||||
</tr>
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
@@ -1208,7 +1325,7 @@
|
|||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>Each variant carries the concrete type for one stage's output. The macros pattern-match on this enum to route values between stages.</p>
|
<p>Each variant carries the concrete type for one stage's output. The macros pattern-match on this enum to route values between stages.</p>
|
||||||
<h2 id="macros">Macros</h2>
|
<h2 id="macros">Macros</h2>
|
||||||
<p>Six low-level macros build individual stages; one high-level macro (<code>make_pipeline!</code>) composes them.</p>
|
<p>Eight low-level macros build individual stages; one high-level macro (<code>make_pipeline!</code>) composes them.</p>
|
||||||
<h3 id="low-level">Low-level</h3>
|
<h3 id="low-level">Low-level</h3>
|
||||||
<div class="highlight"><pre><span></span><code><span class="n">make_source</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">iterator</span><span class="p">,</span><span class="w"> </span><span class="n">OutputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// iterator yields T</span>
|
<div class="highlight"><pre><span></span><code><span class="n">make_source</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">iterator</span><span class="p">,</span><span class="w"> </span><span class="n">OutputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// iterator yields T</span>
|
||||||
<span class="n">make_source_fallible</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">iterator</span><span class="p">,</span><span class="w"> </span><span class="n">OutputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// iterator yields Result<T, E></span>
|
<span class="n">make_source_fallible</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">iterator</span><span class="p">,</span><span class="w"> </span><span class="n">OutputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// iterator yields Result<T, E></span>
|
||||||
@@ -1216,6 +1333,9 @@
|
|||||||
<span class="n">make_transform</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">func</span><span class="p">,</span><span class="w"> </span><span class="n">InputVariant</span><span class="p">,</span><span class="w"> </span><span class="n">OutputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// func: T -> U</span>
|
<span class="n">make_transform</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">func</span><span class="p">,</span><span class="w"> </span><span class="n">InputVariant</span><span class="p">,</span><span class="w"> </span><span class="n">OutputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// func: T -> U</span>
|
||||||
<span class="n">make_transform_fallible</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">func</span><span class="p">,</span><span class="w"> </span><span class="n">InputVariant</span><span class="p">,</span><span class="w"> </span><span class="n">OutputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// func: T -> Result<U, E></span>
|
<span class="n">make_transform_fallible</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">func</span><span class="p">,</span><span class="w"> </span><span class="n">InputVariant</span><span class="p">,</span><span class="w"> </span><span class="n">OutputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// func: T -> Result<U, E></span>
|
||||||
|
|
||||||
|
<span class="n">make_flat_transform</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">func</span><span class="p">,</span><span class="w"> </span><span class="n">InputVariant</span><span class="p">,</span><span class="w"> </span><span class="n">OutputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// func: T -> impl IntoIterator<Item=U></span>
|
||||||
|
<span class="n">make_flat_transform_fallible</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">func</span><span class="p">,</span><span class="w"> </span><span class="n">InputVariant</span><span class="p">,</span><span class="w"> </span><span class="n">OutputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// func: T -> Result<impl IntoIterator<Item=U>, E></span>
|
||||||
|
|
||||||
<span class="n">make_sink</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">func</span><span class="p">,</span><span class="w"> </span><span class="n">InputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// func: T -> ()</span>
|
<span class="n">make_sink</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">func</span><span class="p">,</span><span class="w"> </span><span class="n">InputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// func: T -> ()</span>
|
||||||
<span class="n">make_sink_fallible</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">func</span><span class="p">,</span><span class="w"> </span><span class="n">InputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// func: T -> Result<(), E></span>
|
<span class="n">make_sink_fallible</span><span class="o">!</span><span class="p">(</span><span class="n">Enum</span><span class="p">,</span><span class="w"> </span><span class="n">func</span><span class="p">,</span><span class="w"> </span><span class="n">InputVariant</span><span class="p">)</span><span class="w"> </span><span class="c1">// func: T -> Result<(), E></span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
@@ -1224,17 +1344,31 @@
|
|||||||
<div class="highlight"><pre><span></span><code>make_pipeline! {
|
<div class="highlight"><pre><span></span><code>make_pipeline! {
|
||||||
DataEnum,
|
DataEnum,
|
||||||
source iterator => OutputVariant, // or source? for fallible
|
source iterator => OutputVariant, // or source? for fallible
|
||||||
| func: In => Out, // non-fallible transform
|
| func: In => Out, // 1→1 non-fallible transform
|
||||||
|? func: In => Out, // fallible transform
|
|? func: In => Out, // 1→1 fallible transform
|
||||||
|
|| func: In => Out, // 1→N non-fallible flat transform
|
||||||
|
||? func: In => Out, // 1→N fallible flat transform
|
||||||
sink func @ InputVariant, // or sink? for fallible
|
sink func @ InputVariant, // or sink? for fallible
|
||||||
}
|
}
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p><code>?</code> marks fallibility on source, individual transforms, or sink independently.<br />
|
<p><code>?</code> marks fallibility on source, individual transforms, or sink independently.<br />
|
||||||
Implemented as a <strong>TT muncher</strong>: the internal rule <code>@build</code> recurses over transform tokens one at a time, accumulating them into a <code>vec![]</code>, then terminates on <code>sink</code>/<code>sink?</code>.</p>
|
Implemented as a <strong>TT muncher</strong>: the internal rule <code>@build</code> recurses over transform tokens one at a time, accumulating them into a <code>vec![]</code>, then terminates on <code>sink</code>/<code>sink?</code>.</p>
|
||||||
|
<h3 id="make_pipe-dsl">make_pipe! DSL</h3>
|
||||||
|
<p><code>make_pipe!</code> builds a sourceless/sinkless <code>Pipe&lt;D, In, Out&gt;</code> — a reusable, composable stage sequence:</p>
|
||||||
|
<div class="highlight"><pre><span></span><code>make_pipe! {
|
||||||
|
DataEnum : InType => OutType,
|
||||||
|
| func: InVariant => OutVariant,
|
||||||
|
|? func: InVariant => OutVariant,
|
||||||
|
|| func: InVariant => OutVariant,
|
||||||
|
||? func: InVariant => OutVariant,
|
||||||
|
}
|
||||||
|
</code></pre></div>
|
||||||
|
<p>Two pipes compose with <code>.then(other)</code>. Apply a pipe to an iterator with <code>.apply(iter, n_workers, capacity)</code> to get a <code>PipeIter&lt;Out&gt;</code> — an iterator over the pipeline output, backed by a background <code>WorkerPool</code>. The scatter step in <code>obikmer</code> uses <code>make_pipe!</code> and <code>.apply()</code> rather than the full <code>make_pipeline!</code> / <code>WorkerPool</code> pattern.</p>
|
||||||
<h2 id="scheduler-architecture">Scheduler architecture</h2>
|
<h2 id="scheduler-architecture">Scheduler architecture</h2>
|
||||||
<div class="highlight"><pre><span></span><code>Source thread ──► [source_rx] ──► Scheduler ──► [worker_tx] ──► Workers (×N)
|
<div class="highlight"><pre><span></span><code>Source thread ──► [source_rx] ──► Scheduler ──► [worker_tx] ──► Workers (×N)
|
||||||
▲ │
|
▲ │
|
||||||
[stage_rxs] ────────┘◄──────────────────────────────┘
|
[stage_rxs] ────────┘◄──────────────────────────────┘
|
||||||
|
[flat_delta_rx] ──► Scheduler (in_flight adjustment)
|
||||||
│
|
│
|
||||||
[sink_err_rx] ← errors from sink (highest priority)
|
[sink_err_rx] ← errors from sink (highest priority)
|
||||||
│
|
│
|
||||||
@@ -1242,20 +1376,20 @@ Implemented as a <strong>TT muncher</strong>: the internal rule <code>@build</co
|
|||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>The scheduler is a single thread running a biased <code>Select</code> over all input channels. Priority order (highest first):</p>
|
<p>The scheduler is a single thread running a biased <code>Select</code> over all input channels. Priority order (highest first):</p>
|
||||||
<div class="highlight"><pre><span></span><code>index 0 sink_err_rx abort on sink error
|
<div class="highlight"><pre><span></span><code>index 0 sink_err_rx abort on sink error
|
||||||
index 1 stage_rxs[N-1] drain last stage first
|
index 1 flat_delta_rx adjust in_flight before dispatching
|
||||||
...
|
index 2..=n+1 stage_rxs[n-1..0] drain last stage first
|
||||||
index N stage_rxs[0]
|
index n+2 source_rx pull new data last
|
||||||
index N+1 source_rx pull new data last
|
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>This back-pressure-friendly ordering ensures downstream stages are drained before new items enter the pipeline.</p>
|
<p>This back-pressure-friendly ordering ensures downstream stages are drained before new items enter the pipeline.</p>
|
||||||
<p><strong>Workers</strong> are generic: each receives <code>(data, SharedFn, result_tx)</code> and calls <code>f(data)</code>, sending the result to the provided channel. The scheduler decides which transform to apply and where to route the result.</p>
|
<p><strong>Workers</strong> are generic: each receives a <code>WorkerTask</code> — either <code>Transform(data, stage_idx)</code> or <code>Flat(data, stage_idx)</code>. For <code>Transform</code>, the worker calls <code>f(data)</code> and sends the result to <code>stage_txs[stage_idx]</code>. For <code>Flat</code>, the worker calls <code>f(data, &push_tx, &delta_tx)</code>: the closure pushes N items into <code>push_tx</code> then sends <code>N-1</code> to <code>delta_tx</code>. The scheduler uses the delta to adjust <code>in_flight</code> without knowing N in advance.</p>
|
||||||
<p><strong>Termination</strong> uses an <code>in_flight</code> counter:</p>
|
<p><strong>Termination</strong> uses an <code>in_flight: isize</code> counter and a <code>flat_workers_active: usize</code> counter:</p>
|
||||||
<ul>
|
<ul>
|
||||||
<li>incremented when an item is dispatched from source to workers</li>
|
<li><code>in_flight</code> incremented when an item is dispatched from source to workers</li>
|
||||||
<li>decremented when the item exits the last stage</li>
|
<li><code>in_flight</code> decremented when the item exits the last stage to the sink</li>
|
||||||
<li>the loop exits only when <code>source_done && in_flight == 0</code></li>
|
<li><code>flat_workers_active</code> incremented when a <code>Flat</code> task is dispatched, decremented when the delta arrives</li>
|
||||||
|
<li>the loop exits only when <code>source_done && in_flight == 0 && flat_workers_active == 0</code></li>
|
||||||
</ul>
|
</ul>
|
||||||
<p>This guarantees all in-flight items complete before <code>join()</code>.</p>
|
<p>This guarantees all in-flight items complete (including all N outputs of a flat stage) before <code>join()</code>.</p>
|
||||||
<h2 id="error-handling">Error handling</h2>
|
<h2 id="error-handling">Error handling</h2>
|
||||||
<p><code>PipelineError</code> has four variants:</p>
|
<p><code>PipelineError</code> has four variants:</p>
|
||||||
<table>
|
<table>
|
||||||
@@ -1279,7 +1413,7 @@ index N+1 source_rx pull new data last
|
|||||||
<td>Internal routing error</td>
|
<td>Internal routing error</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td><code>StepError(Box<dyn Error>)</code></td>
|
<td><code>StepError(Box&lt;dyn Error + Send + Sync&gt;)</code></td>
|
||||||
<td>Error from user code (wrapped by <code>make_*_fallible!</code>)</td>
|
<td>Error from user code (wrapped by <code>make_*_fallible!</code>)</td>
|
||||||
</tr>
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -12,7 +12,7 @@
|
|||||||
<link rel="prev" href="../persistent_compact_int_vec/">
|
<link rel="prev" href="../persistent_compact_int_vec/">
|
||||||
|
|
||||||
|
|
||||||
<link rel="next" href="../../architecture/sequences/invariant/">
|
<link rel="next" href="../merge/">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -649,6 +649,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../obilayeredmap/" class="md-nav__link">
|
<a href="../obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -1002,6 +1030,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -649,6 +649,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../obilayeredmap/" class="md-nav__link">
|
<a href="../obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -985,6 +1013,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -773,6 +773,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../obilayeredmap/" class="md-nav__link">
|
<a href="../obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -851,6 +879,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
@@ -1104,7 +1188,9 @@
|
|||||||
<li><strong>error valley</strong> → suggests min_count (typically the local minimum between the error peak and the coverage peak)</li>
|
<li><strong>error valley</strong> → suggests min_count (typically the local minimum between the error peak and the coverage peak)</li>
|
||||||
</ul>
|
</ul>
|
||||||
<h2 id="phase-1-scatter">Phase 1 — Scatter</h2>
|
<h2 id="phase-1-scatter">Phase 1 — Scatter</h2>
|
||||||
<p>Single streaming pass over raw input files (FASTA/FASTQ, gzip). FASTQ quality scores are ignored. For each read:</p>
|
<p>Single streaming pass over raw input files (FASTA/FASTQ, gzip). FASTQ quality scores are ignored.</p>
|
||||||
|
<p>Input files are read via <code>open_nuc_stream</code>, which opens and decompresses the file, auto-detects the format (FASTA / FASTQ / GenBank), and yields a sequence of <code>NucPage</code> buffers. Each <code>NucPage</code> is a flat 64 KB buffer of normalised bytes (<code>ACGT</code> + <code>\x00</code> separators), carrying a k−1 byte overlap from the preceding page so that no k-mer is lost at page boundaries. Per-record identity (sequence id, raw bytes) is not preserved; this is intentional — the scatter phase only needs normalised bases to produce superkmers.</p>
|
||||||
|
<p>For each read fragment within a page:</p>
|
||||||
<ol>
|
<ol>
|
||||||
<li><strong>Ambiguous base filter</strong>: cut at any non-ACGT base; discard fragments shorter than k.</li>
|
<li><strong>Ambiguous base filter</strong>: cut at any non-ACGT base; discard fragments shorter than k.</li>
|
||||||
<li><strong>Entropy filter</strong>: scan each fragment with a sliding window of size k. When the kmer <span class="arithmatex">\(K_i = S[i \mathinner{..} i+k-1]\)</span> ended by nucleotide <span class="arithmatex">\(S[j]\)</span> (with <span class="arithmatex">\(j = i+k-1\)</span>) has entropy below threshold <span class="arithmatex">\(\theta\)</span>, emit the current segment and start a new one (see algorithm below). <span class="arithmatex">\(K_i\)</span> belongs to neither segment, and no valid kmer is lost.</li>
|
<li><strong>Entropy filter</strong>: scan each fragment with a sliding window of size k. When the kmer <span class="arithmatex">\(K_i = S[i \mathinner{..} i+k-1]\)</span> ended by nucleotide <span class="arithmatex">\(S[j]\)</span> (with <span class="arithmatex">\(j = i+k-1\)</span>) has entropy below threshold <span class="arithmatex">\(\theta\)</span>, emit the current segment and start a new one (see algorithm below). <span class="arithmatex">\(K_i\)</span> belongs to neither segment, and no valid kmer is lost.</li>
|
||||||
@@ -1154,8 +1240,13 @@ B ≈ 100 is tunable; RAM needed ≈ partition_size / B.</p>
|
|||||||
for each kmer in sequence:
|
for each kmer in sequence:
|
||||||
kmer_counts[canonical(kmer)] += COUNT
|
kmer_counts[canonical(kmer)] += COUNT
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>Implemented as an external sort or a temporary HashMap, depending on partition size. At the end of this phase, each distinct canonical kmer has its exact total count.</p>
|
<p>Implemented as a three-step pipeline in <code>count_partition()</code>:</p>
|
||||||
<p>Abundance filter applied here: kmers with <code>total_count < q</code> are discarded. <code>q</code> is a collection parameter (0 = keep all, including singletons for ≤1x data).</p>
|
<ol>
|
||||||
|
<li><strong>External sort</strong> (<code>kmer_sort::sort_unique_kmers</code>): read dereplicated superkmers, extract canonical kmer raw <code>u64</code> values, sort in RAM-bounded chunks (adaptive: 40% of available RAM ÷ n_threads, min 1 M kmers/chunk), k-way merge with inline dedup → <code>sorted_unique.bin</code>. f0, the number of distinct canonical kmers in the partition, is now known exactly.</li>
|
||||||
|
<li><strong>Provisional MPHF</strong> (ptr_hash): built from <code>sorted_unique.bin</code> via <code>new_from_par_iter(f0, ...)</code>. Stored to <code>mphf1.bin</code>; <code>sorted_unique.bin</code> deleted immediately.</li>
|
||||||
|
<li><strong>Accumulation pass</strong>: re-read dereplicated superkmers; for each kmer, <code>slot = mphf.index(kmer.raw())</code>, increment <code>counts1[slot]</code> by the superkmer COUNT. Stored in a <code>PersistentCompactIntVec</code> (<code>counts1.bin</code>).</li>
|
||||||
|
</ol>
|
||||||
|
<p>At the end of this phase, each distinct canonical kmer has its exact total count, and the frequency spectrum (<code>spectrums/{label}.json</code>) is written to the index root.</p>
|
||||||
<p>No pre-filter on super-kmer COUNT is possible at phase 2: a super-kmer with COUNT=1 may contain only high-abundance kmers, each present in many other super-kmers across the partition.</p>
|
<p>No pre-filter on super-kmer COUNT is possible at phase 2: a super-kmer with COUNT=1 may contain only high-abundance kmers, each present in many other super-kmers across the partition.</p>
|
||||||
<h2 id="phase-4-super-kmer-compaction">Phase 4 — Super-kmer compaction</h2>
|
<h2 id="phase-4-super-kmer-compaction">Phase 4 — Super-kmer compaction</h2>
|
||||||
<p>The valid kmer set from phase 3 is used as a mask to rewrite the super-kmer files:</p>
|
<p>The valid kmer set from phase 3 is used as a mask to rewrite the super-kmer files:</p>
|
||||||
@@ -1188,14 +1279,52 @@ branching / dead-end → unitig start or end
|
|||||||
<p>Output: <code>unitigs.bin</code> — the permanent evidence structure for the partition. Each kmer in the partition appears at exactly one (unitig_id, offset) location.</p>
|
<p>Output: <code>unitigs.bin</code> — the permanent evidence structure for the partition. Each kmer in the partition appears at exactly one (unitig_id, offset) location.</p>
|
||||||
<p><strong>Scope of local unitigs:</strong> these are unitigs of the partition's local de Bruijn graph, not global unitigs. A kmer whose k-1 successor or predecessor falls in another partition appears as a dead end locally and terminates the unitig. This does not affect correctness of verification but means partition-local unitigs cannot be directly reused for global assembly.</p>
|
<p><strong>Scope of local unitigs:</strong> these are unitigs of the partition's local de Bruijn graph, not global unitigs. A kmer whose k-1 successor or predecessor falls in another partition appears as a dead end locally and terminates the unitig. This does not affect correctness of verification but means partition-local unitigs cannot be directly reused for global assembly.</p>
|
||||||
<h2 id="phase-6-mphf-construction-and-index-finalisation">Phase 6 — MPHF construction and index finalisation</h2>
|
<h2 id="phase-6-mphf-construction-and-index-finalisation">Phase 6 — MPHF construction and index finalisation</h2>
|
||||||
<p>Built once on the definitive kmer set (all kmers in all unitigs of the partition). See <a href="../obilayeredmap/">obilayeredmap</a> and <a href="../mphf/">MPHF selection</a> for the current implementation.</p>
|
<p><code>build_index_layer</code> is called per partition (in parallel via <code>build_layers</code>) with the following parameters sourced from <code>IndexConfig</code>:</p>
|
||||||
<div class="highlight"><pre><span></span><code>kmers from unitigs → MPHF → mphf.bin
|
<ul>
|
||||||
→ evidence.bin : n × u32, each = (chunk_id: 25 bits | rank: 7 bits)
|
<li><code>block_bits</code> — from <code>IndexConfig::block_bits</code>; controls the <code>.idx</code> block size (2^block_bits unitig chunks per block) for exact evidence</li>
|
||||||
→ payload : counts/ (mode 2) or presence/ (mode 3)
|
<li><code>evidence</code> — <code>EvidenceKind::Exact</code> or <code>EvidenceKind::Approx { b, z }</code>; propagated unchanged from <code>IndexConfig::evidence</code></li>
|
||||||
|
<li><code>min_ab</code> / <code>max_ab</code> — abundance bounds applied before graph construction</li>
|
||||||
|
<li><code>with_counts</code> — whether to store kmer counts alongside set membership</li>
|
||||||
|
</ul>
|
||||||
|
<p><strong>Abundance filtering:</strong> when <code>min_ab > 1</code> or <code>max_ab.is_some()</code>, the provisional <code>mphf1.bin</code> and <code>counts1.bin</code> produced in phase 3 are memory-mapped. Each canonical kmer is accepted only if its count in <code>counts1</code> satisfies the bounds. If either file is absent, filtering is skipped (all kmers accepted).</p>
|
||||||
|
<div class="highlight"><pre><span></span><code>for each kmer in dereplicated super-kmer:
|
||||||
|
ab = counts1[mphf1.index(kmer.raw())]
|
||||||
|
if ab < min_ab || ab > max_ab: skip
|
||||||
|
graph.push(kmer)
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>The MPHF is built in two passes over <code>unitigs.bin</code>: parallel pass for <code>mphf.bin</code>, sequential pass for <code>evidence.bin</code> and payload. The exact kmer count is available from the unitig index (<code>unitigs.bin.idx</code>) before the passes begin.</p>
|
<p><strong>Graph build and unitig write:</strong></p>
|
||||||
<p><strong>Exact verification via unitig evidence:</strong></p>
|
<p>The surviving kmers are fed into <code>GraphDeBruijn</code>, which computes degrees and yields unitigs. Unitigs are written to <code>layer_0/unitigs.bin</code> via a <code>UnitigFileWriter</code>.</p>
|
||||||
<p><code>unitigs.bin</code> serves as the evidence structure. The MPHF maps every input to <code>[0, N)</code> including absent kmers — the unitig read-back (via <code>evidence.bin</code>) is the only correct membership test.</p>
|
<p><strong>MPHF and evidence build:</strong></p>
|
||||||
|
<p><code>Layer::build</code> (membership-only) or <code>Layer::&lt;PersistentCompactIntMatrix&gt;::build</code> (with counts) is called next. Internally, <code>MphfLayer::build</code> performs two passes:</p>
|
||||||
|
<ol>
|
||||||
|
<li><strong>Pass 1 (parallel):</strong> build <code>unitigs.bin.idx</code> (block size = 2^<code>block_bits</code>) then construct the MPHF from all canonical kmers in <code>unitigs.bin</code>; store to <code>mphf.bin</code>.</li>
|
||||||
|
<li><strong>Pass 2 (sequential):</strong> for each kmer in <code>unitigs.bin</code>, compute its slot and write <code>evidence.bin</code> (<code>chunk_id: 25 bits | rank: 7 bits</code> packed into a <code>u32</code>); also invoke the payload callback (<code>fill_slot</code>) to populate <code>counts/</code> if <code>with_counts</code>.</li>
|
||||||
|
</ol>
|
||||||
|
<p>After <code>Layer::build</code> completes, <code>layer_meta.json</code> records <code>EvidenceKind::Exact</code>.</p>
|
||||||
|
<p><strong>Approximate evidence override:</strong></p>
|
||||||
|
<p>If <code>evidence</code> is <code>EvidenceKind::Approx { b, z }</code>, <code>build_approx_evidence</code> is called immediately after <code>Layer::build</code>. It writes <code>fingerprint.bin</code> (b-bit hash per slot), which supersedes the exact evidence at query time (<code>evidence.bin</code> stays on disk but is no longer read), and rewrites <code>layer_meta.json</code> with <code>EvidenceKind::Approx { b, z }</code>. No <code>.idx</code> file is needed at query time in this mode.</p>
|
||||||
|
<div class="highlight"><pre><span></span><code>// Exact path → evidence.bin + unitigs.bin.idx + layer_meta.json(Exact)
|
||||||
|
// Approx path → fingerprint.bin + layer_meta.json(Approx{b,z})
|
||||||
|
// (evidence.bin left on disk but not used)
|
||||||
|
</code></pre></div>
|
||||||
|
<p><strong>Partition metadata:</strong></p>
|
||||||
|
<p>After all layer files are written, <code>PartitionMeta { n_layers: 1 }</code> is serialised to <code>index/meta.json</code> inside the partition directory. This file is required by <code>LayeredMap::open</code> for subsequent merge operations.</p>
|
||||||
|
<p><strong>File layout per partition after phase 6:</strong></p>
|
||||||
|
<div class="highlight"><pre><span></span><code>part_XXXXX/
|
||||||
|
index/
|
||||||
|
meta.json ← PartitionMeta { n_layers: 1 }
|
||||||
|
layer_0/
|
||||||
|
unitigs.bin ← permanent evidence (all modes)
|
||||||
|
unitigs.bin.idx ← block index (exact mode only)
|
||||||
|
mphf.bin ← MPHF
|
||||||
|
evidence.bin ← exact evidence (exact mode)
|
||||||
|
fingerprint.bin ← b-bit fingerprints (approx mode)
|
||||||
|
layer_meta.json ← EvidenceKind tag
|
||||||
|
counts/ ← PersistentCompactIntMatrix (with_counts only)
|
||||||
|
</code></pre></div>
|
||||||
|
<p><strong>Cleanup:</strong> unless <code>--keep-intermediate</code> is set, <code>remove_build_artifacts</code> deletes <code>dereplicated.skmer.zst</code>, <code>mphf1.bin</code>, and <code>counts1.bin</code> after all partitions are indexed.</p>
|
||||||
|
<p>See <a href="../obilayeredmap/">obilayeredmap</a> and <a href="../mphf/">MPHF selection</a> for data structure details.</p>
|
||||||
|
<p><strong>Query path (exact evidence):</strong></p>
|
||||||
<div class="highlight"><pre><span></span><code>query kmer q
|
<div class="highlight"><pre><span></span><code>query kmer q
|
||||||
→ canonical_minimizer(q) → hash → PART → part_XXXXX/
|
→ canonical_minimizer(q) → hash → PART → part_XXXXX/
|
||||||
→ MPHF(q) → slot s
|
→ MPHF(q) → slot s
|
||||||
@@ -1204,7 +1333,13 @@ branching / dead-end → unitig start or end
|
|||||||
→ match : return payload[s] ← exact hit
|
→ match : return payload[s] ← exact hit
|
||||||
→ no match: kmer absent ← MPHF collision on absent kmer
|
→ no match: kmer absent ← MPHF collision on absent kmer
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p><code>superkmers.bin.gz</code> is no longer needed at this point and can be deleted.</p>
|
<p><strong>Query path (approximate evidence):</strong></p>
|
||||||
|
<div class="highlight"><pre><span></span><code>query kmer q
|
||||||
|
→ MPHF(q) → slot s
|
||||||
|
→ fingerprint[s] matches seq_hash(q)?
|
||||||
|
→ yes : probable hit (FP rate = 1/2^b per kmer, 1/2^(b·z) per z-window)
|
||||||
|
→ no : kmer absent
|
||||||
|
</code></pre></div>
|
||||||
<div class="footnote">
|
<div class="footnote">
|
||||||
<hr />
|
<hr />
|
||||||
<ol>
|
<ol>
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -64,7 +64,7 @@
|
|||||||
<div data-md-component="skip">
|
<div data-md-component="skip">
|
||||||
|
|
||||||
|
|
||||||
<a href="#on-disk-collection-structure" class="md-skip">
|
<a href="#on-disk-index-layout" class="md-skip">
|
||||||
Skip to content
|
Skip to content
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
@@ -575,6 +575,24 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
On-disk storage
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
<a href="./" class="md-nav__link md-nav__link--active">
|
<a href="./" class="md-nav__link md-nav__link--active">
|
||||||
|
|
||||||
|
|
||||||
@@ -592,6 +610,174 @@
|
|||||||
|
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#directory-tree" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Directory tree
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#state-machine-sentinels" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
State machine (sentinels)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#indexmeta-indexmeta" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
index.meta (IndexMeta)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#layer-files" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Layer files
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Layer files">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#unitigsbin" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
unitigs.bin
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#unitigsbinidx-exact-only" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
unitigs.bin.idx (Exact only)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#mphfbin" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
mphf.bin
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#layer_metajson-layermeta" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
layer_meta.json (LayerMeta)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#evidencebin-exact" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
evidence.bin (Exact)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#fingerprintbin-approx" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
fingerprint.bin (Approx)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#counts-persistentcompactintmatrix" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
counts/ (PersistentCompactIntMatrix)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#presence-persistentbitmatrix" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
presence/ (PersistentBitMatrix)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#metajson-partitionmeta" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
meta.json (PartitionMeta)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
|
|
||||||
@@ -659,6 +845,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../obilayeredmap/" class="md-nav__link">
|
<a href="../obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -737,6 +951,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
@@ -874,6 +1144,163 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#directory-tree" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Directory tree
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#state-machine-sentinels" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
State machine (sentinels)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#indexmeta-indexmeta" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
index.meta (IndexMeta)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#layer-files" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Layer files
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Layer files">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#unitigsbin" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
unitigs.bin
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#unitigsbinidx-exact-only" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
unitigs.bin.idx (Exact only)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#mphfbin" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
mphf.bin
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#layer_metajson-layermeta" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
layer_meta.json (LayerMeta)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#evidencebin-exact" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
evidence.bin (Exact)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#fingerprintbin-approx" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
fingerprint.bin (Approx)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#counts-persistentcompactintmatrix" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
counts/ (PersistentCompactIntMatrix)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#presence-persistentbitmatrix" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
presence/ (PersistentBitMatrix)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#metajson-partitionmeta" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
meta.json (PartitionMeta)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
</nav>
|
</nav>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -889,9 +1316,131 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
<h1 id="on-disk-collection-structure">On-disk collection structure</h1>
|
<h1 id="on-disk-index-layout">On-disk index layout</h1>
|
||||||
<p>See <a href="../obilayeredmap/">obilayeredmap crate</a> for the current on-disk layout.</p>
|
<h2 id="directory-tree">Directory tree</h2>
|
||||||
<p>The index root contains one <code>part_XXXXX/</code> directory per partition, each holding one or more <code>layer_N/</code> directories. Each layer directory contains <code>mphf.bin</code>, <code>unitigs.bin</code>, <code>unitigs.bin.idx</code>, <code>evidence.bin</code>, and optionally a <code>counts/</code> or <code>presence/</code> payload directory.</p>
|
<div class="highlight"><pre><span></span><code><index_root>/
|
||||||
|
index.meta ← JSON: IndexMeta
|
||||||
|
scatter.done ← sentinel: scatter phase complete
|
||||||
|
count.done ← sentinel: dereplicate + count complete
|
||||||
|
index.done ← sentinel: MPHF index fully built
|
||||||
|
spectrums/
|
||||||
|
<label>.json ← kmer frequency spectrum per genome
|
||||||
|
partitions/
|
||||||
|
part_00000/ ← one dir per partition (zero-padded 5 digits, 0..2^n_bits−1)
|
||||||
|
index/
|
||||||
|
meta.json ← PartitionMeta { n_layers }
|
||||||
|
layer_0/
|
||||||
|
unitigs.bin ← binary unitig sequences (2-bit packed)
|
||||||
|
unitigs.bin.idx ← block-sampled offset index (exact evidence only)
|
||||||
|
mphf.bin ← serialised PtrHash MPHF
|
||||||
|
layer_meta.json ← LayerMeta { evidence: EvidenceKind }
|
||||||
|
evidence.bin ← chunk_id:rank per MPHF slot (Exact only)
|
||||||
|
fingerprint.bin ← b-bit fingerprints per MPHF slot (Approx only)
|
||||||
|
counts/ ← PersistentCompactIntMatrix (if with_counts=true)
|
||||||
|
presence/ ← PersistentBitMatrix (if presence mode, merge)
|
||||||
|
layer_1/ ← added by merge; same structure as layer_0
|
||||||
|
layer_2/ …
|
||||||
|
part_00001/ …
|
||||||
|
</code></pre></div>
|
||||||
|
<h2 id="state-machine-sentinels">State machine (sentinels)</h2>
|
||||||
|
<p>The sentinels are touched atomically at the end of each pipeline stage.
|
||||||
|
A partial run (e.g. scatter interrupted) leaves no sentinel; the state is
|
||||||
|
detected as the most advanced sentinel present.</p>
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>State</th>
|
||||||
|
<th>Sentinel present</th>
|
||||||
|
<th>Meaning</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td><code>Empty</code></td>
|
||||||
|
<td>—</td>
|
||||||
|
<td><code>index.meta</code> exists; scatter not started or interrupted</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>Scattered</code></td>
|
||||||
|
<td><code>scatter.done</code></td>
|
||||||
|
<td>All super-kmers routed to partition files</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>Counted</code></td>
|
||||||
|
<td><code>count.done</code></td>
|
||||||
|
<td>Partitions dereplicated; <code>spectrums/</code> written</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>Indexed</code></td>
|
||||||
|
<td><code>index.done</code></td>
|
||||||
|
<td>All MPHF layers built; index ready for queries</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
<h2 id="indexmeta-indexmeta">index.meta (IndexMeta)</h2>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="nt">"version"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||||||
|
<span class="w"> </span><span class="nt">"config"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="nt">"kmer_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">31</span><span class="p">,</span>
|
||||||
|
<span class="w"> </span><span class="nt">"minimizer_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">11</span><span class="p">,</span>
|
||||||
|
<span class="w"> </span><span class="nt">"n_bits"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||||||
|
<span class="w"> </span><span class="nt">"with_counts"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||||||
|
<span class="w"> </span><span class="nt">"evidence"</span><span class="p">:</span><span class="w"> </span><span class="s2">"Exact"</span><span class="p">,</span>
|
||||||
|
<span class="w"> </span><span class="nt">"block_bits"</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span>
|
||||||
|
<span class="w"> </span><span class="p">},</span>
|
||||||
|
<span class="w"> </span><span class="nt">"genomes"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
|
||||||
|
<span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="nt">"label"</span><span class="p">:</span><span class="w"> </span><span class="s2">"genome_A"</span><span class="p">,</span><span class="w"> </span><span class="nt">"meta"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="nt">"species"</span><span class="p">:</span><span class="w"> </span><span class="s2">"Homo sapiens"</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="p">}</span>
|
||||||
|
<span class="w"> </span><span class="p">]</span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p><code>n_bits</code> determines the partition count: <code>2^n_bits</code> directories under <code>partitions/</code>.</p>
|
||||||
|
<p><code>evidence</code> is either the string <code>"Exact"</code> or <code>{"Approx": {"b": 8, "z": 1}}</code>.</p>
|
||||||
|
<p><code>block_bits</code> controls the <code>.idx</code> granularity: one offset entry every <code>2^block_bits</code>
|
||||||
|
chunks. <code>block_bits=0</code> stores one entry per chunk (O(1) random access, largest <code>.idx</code>).</p>
|
||||||
|
<p><code>GenomeInfo.meta</code> is a free-form string→string map for categorical metadata (e.g.
|
||||||
|
taxonomy, sample origin). It is optional; defaults to empty.</p>
|
||||||
|
<h2 id="layer-files">Layer files</h2>
|
||||||
|
<h3 id="unitigsbin">unitigs.bin</h3>
|
||||||
|
<p>2-bit packed binary unitig sequences. Each record: 1 byte <code>seql_minus_k</code>
|
||||||
|
(nucleotide length − k), followed by <code>ceil((seql_minus_k + k) / 4)</code> bytes of
|
||||||
|
packed sequence. Long unitigs are transparently split into overlapping chunks
|
||||||
|
(k−1 nucleotide overlap) so no k-mer crosses a chunk boundary.</p>
|
||||||
|
<h3 id="unitigsbinidx-exact-only">unitigs.bin.idx (Exact only)</h3>
|
||||||
|
<p>Magic <code>UIX3</code>, little-endian header: <code>block_bits</code> (u32), <code>n_unitigs</code> (u32),
|
||||||
|
<code>n_kmers</code> (u64), then <code>ceil(n_unitigs / 2^block_bits) + 1</code> byte-offset entries
|
||||||
|
(u32 each, last entry is a sentinel past-end offset). Absent for Approx layers.</p>
|
||||||
|
<h3 id="mphfbin">mphf.bin</h3>
|
||||||
|
<p>PtrHash MPHF serialised with epserde. Maps canonical kmer (u64, left-aligned
|
||||||
|
2-bit) to a slot index in <code>[0, n_kmers)</code>.</p>
|
||||||
|
<h3 id="layer_metajson-layermeta">layer_meta.json (LayerMeta)</h3>
|
||||||
|
<p><div class="highlight"><pre><span></span><code><span class="p">{</span><span class="w"> </span><span class="nt">"evidence"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="nt">"type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"exact"</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="p">}</span>
|
||||||
|
</code></pre></div>
|
||||||
|
or
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="p">{</span><span class="w"> </span><span class="nt">"evidence"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="nt">"type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"approx"</span><span class="p">,</span><span class="w"> </span><span class="nt">"b"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span><span class="w"> </span><span class="nt">"z"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="p">}</span>
|
||||||
|
</code></pre></div></p>
|
||||||
|
<h3 id="evidencebin-exact">evidence.bin (Exact)</h3>
|
||||||
|
<p>One <code>(chunk_id: u32, rank: u8)</code> record per MPHF slot, packed. Used to verify
|
||||||
|
that the kmer mapped to a slot is actually present: <code>unitigs.bin[chunk_id][rank]</code>
|
||||||
|
is re-read and compared against the query.</p>
|
||||||
|
<h3 id="fingerprintbin-approx">fingerprint.bin (Approx)</h3>
|
||||||
|
<p><code>b</code>-bit fingerprint per MPHF slot derived from the kmer's sequence hash.
|
||||||
|
False-positive rate per query ≈ <code>1/2^b</code>. With Findere parameter <code>z ≥ 2</code>,
|
||||||
|
<code>z</code> consecutive k-mers must all match, reducing the effective FP rate to
|
||||||
|
approximately <code>W / 2^(b·z)</code> per read of length <code>L</code>
|
||||||
|
(where <code>W = L − k − z + 2</code>).</p>
|
||||||
|
<h3 id="counts-persistentcompactintmatrix">counts/ (PersistentCompactIntMatrix)</h3>
|
||||||
|
<p>Present when <code>with_counts=true</code>. One column per genome; each row holds the
|
||||||
|
per-genome k-mer count for the corresponding MPHF slot. Appended column-by-column
|
||||||
|
during indexing and merge.</p>
|
||||||
|
<h3 id="presence-persistentbitmatrix">presence/ (PersistentBitMatrix)</h3>
|
||||||
|
<p>Present when the layer was built in presence/absence mode (merge path).
|
||||||
|
One bit per genome per MPHF slot. Written during merge; never present on a
|
||||||
|
freshly indexed single-genome layer.</p>
|
||||||
|
<h2 id="metajson-partitionmeta">meta.json (PartitionMeta)</h2>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="p">{</span><span class="w"> </span><span class="nt">"n_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="p">}</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p>Records how many <code>layer_N/</code> directories exist under <code>index/</code>. Incremented by
|
||||||
|
each merge that adds a layer.</p>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -751,6 +751,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../obilayeredmap/" class="md-nav__link">
|
<a href="../obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -829,6 +857,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
@@ -1046,61 +1130,49 @@
|
|||||||
|
|
||||||
<h1 id="superkmer-implementation">SuperKmer — implementation</h1>
|
<h1 id="superkmer-implementation">SuperKmer — implementation</h1>
|
||||||
<h2 id="memory-layout">Memory layout</h2>
|
<h2 id="memory-layout">Memory layout</h2>
|
||||||
<p>A super-kmer is stored as a <strong>32-bit header</strong> followed by a <strong>byte-aligned nucleotide sequence</strong> (2 bits/base, nucleotide 0 at the MSB of the first byte):</p>
|
<p><code>SuperKmer</code> holds two separate fields:</p>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">SuperKmer</span><span class="w"> </span><span class="p">{</span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="p">(</span><span class="k">crate</span><span class="p">)</span><span class="w"> </span><span class="n">count</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">,</span>
|
||||||
|
<span class="w"> </span><span class="k">pub</span><span class="p">(</span><span class="k">crate</span><span class="p">)</span><span class="w"> </span><span class="n">inner</span><span class="p">:</span><span class="w"> </span><span class="nc">PackedSeq</span><span class="p">,</span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p><code>PackedSeq</code> stores a 2-bit packed DNA sequence as a heap-allocated <code>Box<[u8]></code> plus a <code>tail: u8</code> field:</p>
|
||||||
<table>
|
<table>
|
||||||
<thead>
|
<thead>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Field</th>
|
<th>Field</th>
|
||||||
<th>Bits</th>
|
<th>Type</th>
|
||||||
<th>Role</th>
|
<th>Role</th>
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
<tr>
|
<tr>
|
||||||
<td>COUNT</td>
|
<td><code>tail</code></td>
|
||||||
<td>24</td>
|
<td><code>u8</code></td>
|
||||||
<td>Occurrence count (≤ 16 M)</td>
|
<td>Number of valid nucleotides in the last byte: 0 encodes 4 (a full byte), while 1–3 encode themselves</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>NKMERS</td>
|
<td><code>seq</code></td>
|
||||||
<td>8</td>
|
<td><code>Box<[u8]></code></td>
|
||||||
<td>Number of kmers (= seq_length − k + 1, range 1–255)</td>
|
<td>2-bit packed bytes, nucleotide 0 at bits 7–6 of <code>seq[0]</code></td>
|
||||||
</tr>
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
<p>Bit layout (MSB to LSB): <code>[31:8] COUNT [7:0] NKMERS</code></p>
|
<p>Nucleotide length is recovered without storing it explicitly:</p>
|
||||||
<p>NKMERS is stored as a raw <code>u8</code> in <strong>kmer units</strong>, not nucleotides. The nucleotide length is recovered as <code>NKMERS + k − 1</code>. This avoids the awkward wrapping convention (<code>0 = 256</code>) that would be needed if nucleotide length were stored directly, and gains k−1 = 30 units of headroom:</p>
|
<div class="highlight"><pre><span></span><code>seql = (seq.len() - 1) * 4 + tail_count(tail)
|
||||||
<table>
|
</code></pre></div>
|
||||||
<thead>
|
<p>There is no packed header word — <code>count</code> and the sequence live in separate fields.</p>
|
||||||
<tr>
|
<p>The on-disk binary format (produced by <code>write_to_binary</code>) is:</p>
|
||||||
<th>unit</th>
|
<div class="highlight"><pre><span></span><code>[varint(count)] [u8: seql − k] [packed bytes…]
|
||||||
<th>u8 covers</th>
|
</code></pre></div>
|
||||||
<th>max nucleotides</th>
|
<p><code>seql − k</code> fits in a <code>u8</code> when <code>n_kmers = seql − k + 1 ≤ MAX_KMERS_PER_CHUNK (= 256)</code>. If a super-kmer exceeds 256 kmers, <code>write_to_binary</code> splits it into overlapping chunks (k−1 nucleotide overlap, same count per chunk), each a self-contained record readable by <code>read_from_binary</code>.</p>
|
||||||
</tr>
|
<p>The public accessors operate on the struct fields directly:</p>
|
||||||
</thead>
|
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">seql</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">inner</span><span class="p">.</span><span class="n">seql</span><span class="p">()</span><span class="w"> </span><span class="p">}</span>
|
||||||
<tbody>
|
<span class="k">fn</span><span class="w"> </span><span class="nf">count</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u32</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">count</span><span class="w"> </span><span class="p">}</span>
|
||||||
<tr>
|
<span class="k">fn</span><span class="w"> </span><span class="nf">increment</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">count</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="mi">1</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
|
||||||
<td>nucleotides</td>
|
<span class="k">fn</span><span class="w"> </span><span class="nf">add</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">count</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="n">n</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
|
||||||
<td>255 nt</td>
|
<span class="k">fn</span><span class="w"> </span><span class="nf">set_count</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">count</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">n</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
|
||||||
<td>225 kmers</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>kmers</strong></td>
|
|
||||||
<td><strong>255 kmers</strong></td>
|
|
||||||
<td><strong>285 nt</strong></td>
|
|
||||||
</tr>
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
<p>The public accessors:</p>
|
|
||||||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">n_kmers</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="p">(</span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0xFF</span><span class="p">)</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">}</span>
|
|
||||||
<span class="k">fn</span><span class="w"> </span><span class="nf">seql</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">n_kmers</span><span class="p">()</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">K</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="p">}</span>
|
|
||||||
<span class="k">fn</span><span class="w"> </span><span class="nf">count</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u32</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">8</span><span class="w"> </span><span class="p">}</span>
|
|
||||||
<span class="k">fn</span><span class="w"> </span><span class="nf">increment</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">8</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
|
|
||||||
<span class="k">fn</span><span class="w"> </span><span class="nf">add</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">8</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
|
|
||||||
<span class="k">fn</span><span class="w"> </span><span class="nf">set_count</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0xFF</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">(</span><span class="n">n</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">8</span><span class="p">);</span><span class="w"> </span><span class="p">}</span>
|
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>In practice, observed super-kmer lengths on metagenomic data (k=31) are below 55 nucleotides (≤ 25 kmers) — far from the 255-kmer cap. If a super-kmer ever exceeds 255 kmers, it is split with a k−1 nucleotide overlap, preserving all kmers without duplication (identical mechanism to partition-boundary splits).</p>
|
|
||||||
<p>The sequence is always stored in canonical form (lexicographic minimum of forward and reverse complement), with nucleotide 0 at the MSB of the first byte. The byte array can be hashed directly without any adjustment.</p>
|
|
||||||
<h2 id="ascii-encoding-and-decoding">ASCII encoding and decoding</h2>
|
<h2 id="ascii-encoding-and-decoding">ASCII encoding and decoding</h2>
|
||||||
<p>Two lookup tables handle ASCII ↔ 2-bit conversion:</p>
|
<p>Two lookup tables handle ASCII ↔ 2-bit conversion:</p>
|
||||||
<ul>
|
<ul>
|
||||||
@@ -1125,7 +1197,7 @@
|
|||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p><code>REVCOMP4</code> is 256 bytes (fits in L1 cache), computed at compile time. No endianness dependency — all operations are pure arithmetic on byte values.</p>
|
<p><code>REVCOMP4</code> is 256 bytes (fits in L1 cache), computed at compile time. No endianness dependency — all operations are pure arithmetic on byte values.</p>
|
||||||
<p><strong>Step 2 — realignment.</strong> After step 1, <code>padding = n × 8 − seql × 2</code> spurious bits (complements of the original padding A's) appear at the start of the array. They are flushed left using <code>BitSlice<u8, Msb0>::rotate_left(padding)</code> from the <code>bitvec</code> crate, which is SIMD-accelerated. The trailing <code>padding</code> bits are then zeroed:</p>
|
<p><strong>Step 2 — realignment.</strong> After step 1, <code>padding = n × 8 − seql × 2</code> spurious bits (complements of the original padding A's) appear at the start of the array. They are flushed left using <code>BitSlice<u8, Msb0>::rotate_left(padding)</code> from the <code>bitvec</code> crate, which is SIMD-accelerated. The trailing <code>padding</code> bits are then zeroed:</p>
|
||||||
<div class="highlight"><pre><span></span><code><span class="kd">let</span><span class="w"> </span><span class="n">seql</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">n_kmers</span><span class="p">()</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">k</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">1</span><span class="p">;</span>
|
<div class="highlight"><pre><span></span><code><span class="kd">let</span><span class="w"> </span><span class="n">seql</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">seql</span><span class="p">();</span>
|
||||||
<span class="n">shift</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">8</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">seql</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="c1">// number of padding bits</span>
|
<span class="n">shift</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">8</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">seql</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="c1">// number of padding bits</span>
|
||||||
<span class="n">bits</span><span class="p">.</span><span class="n">rotate_left</span><span class="p">(</span><span class="n">shift</span><span class="p">)</span>
|
<span class="n">bits</span><span class="p">.</span><span class="n">rotate_left</span><span class="p">(</span><span class="n">shift</span><span class="p">)</span>
|
||||||
<span class="n">bits</span><span class="p">[</span><span class="n">len</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">shift</span><span class="o">..</span><span class="p">].</span><span class="n">fill</span><span class="p">(</span><span class="kc">false</span><span class="p">)</span>
|
<span class="n">bits</span><span class="p">[</span><span class="n">len</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">shift</span><span class="o">..</span><span class="p">].</span><span class="n">fill</span><span class="p">(</span><span class="kc">false</span><span class="p">)</span>
|
||||||
@@ -1143,7 +1215,7 @@
|
|||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
</div>
|
</div>
|
||||||
<h2 id="minimizer-sliding-window">Minimizer sliding window</h2>
|
<h2 id="minimizer-sliding-window">Minimizer sliding window</h2>
|
||||||
<p>Super-kmers are built by <code>SuperKmerIter</code> (crate <code>obiskbuilder</code>), which maintains the current minimizer with a <strong>monotonic deque</strong> over a sliding window of W = k − m + 1 m-mer positions.</p>
|
<p>Super-kmers are built by <code>SuperKmerIter</code> (crate <code>obiskbuilder</code>), which tracks the current minimizer with a <strong>monotonic deque</strong> (<code>Ring<MmerItem, 32></code>) inside <code>RollingStat</code>, a rolling-window entropy and minimizer tracker.</p>
|
||||||
<p>Each deque entry stores:</p>
|
<p>Each deque entry stores:</p>
|
||||||
<table>
|
<table>
|
||||||
<thead>
|
<thead>
|
||||||
@@ -1167,20 +1239,11 @@
|
|||||||
<tr>
|
<tr>
|
||||||
<td><code>hash</code></td>
|
<td><code>hash</code></td>
|
||||||
<td>u64</td>
|
<td>u64</td>
|
||||||
<td><span class="arithmatex">\(H(\text{canonical})\)</span> — ordering key for random minimizer selection</td>
|
<td><code>hash_kmer(canonical << (64 − 2m))</code> — ordering key for random minimizer selection</td>
|
||||||
</tr>
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
<p>The hash <span class="arithmatex">\(H\)</span> is the seeded splitmix64 finalizer (see <a href="../../theory/minimizer/">Minimizer selection</a>):</p>
|
<p>The hash uses the seeded splitmix64 finalizer (<code>mix64(raw ^ 0x9e3779b97f4a7c15)</code>), the same function as <code>kmer::hash_kmer</code>.</p>
|
||||||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">hash_mmer</span><span class="p">(</span><span class="n">canonical</span><span class="p">:</span><span class="w"> </span><span class="kt">u64</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u64</span><span class="w"> </span><span class="p">{</span>
|
|
||||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">canonical</span><span class="w"> </span><span class="o">^</span><span class="w"> </span><span class="mh">0x9e3779b97f4a7c15</span><span class="p">;</span><span class="w"> </span><span class="c1">// seed: eliminates fixed point at 0</span>
|
|
||||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">^</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">30</span><span class="p">);</span>
|
|
||||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="p">.</span><span class="n">wrapping_mul</span><span class="p">(</span><span class="mh">0xbf58476d1ce4e5b9</span><span class="p">);</span>
|
|
||||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">^</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">27</span><span class="p">);</span>
|
|
||||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="p">.</span><span class="n">wrapping_mul</span><span class="p">(</span><span class="mh">0x94d049bb133111eb</span><span class="p">);</span>
|
|
||||||
<span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">^</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">31</span><span class="p">)</span>
|
|
||||||
<span class="p">}</span>
|
|
||||||
</code></pre></div>
|
|
||||||
<p>On each new nucleotide, once the window is full, the deque is updated:</p>
|
<p>On each new nucleotide, once the window is full, the deque is updated:</p>
|
||||||
<div class="admonition abstract">
|
<div class="admonition abstract">
|
||||||
<p class="admonition-title">Algorithm — minimizer deque update</p>
|
<p class="admonition-title">Algorithm — minimizer deque update</p>
|
||||||
@@ -1196,17 +1259,21 @@
|
|||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
</div>
|
</div>
|
||||||
<p>The front of the deque is always the current minimizer. Because the deque is maintained in strictly increasing hash order, each entry is popped at most once — O(1) amortized per nucleotide.</p>
|
<p>The front of the deque is always the current minimizer. Because the deque is maintained in strictly increasing hash order, each entry is popped at most once — O(1) amortized per nucleotide.</p>
|
||||||
<p>A super-kmer boundary is emitted when the minimizer changes: <code>deque.front.hash ≠ prev_hash</code>. The <code>canonical</code> field of the front entry is <strong>not</strong> used for boundary detection — that uses the hash alone. The canonical value is stored so that the partition key <span class="arithmatex">\(H(\text{canonical})\)</span> can be recomputed independently at routing time from the stored <code>minimizer_pos</code>, without inheriting the minimum-order-statistic bias (see <a href="../../theory/minimizer/#partition-key-independence">Minimizer selection — partition key independence</a>).</p>
|
<p>A super-kmer boundary is emitted when the minimizer changes: <code>current_minimizer != prev_minimizer</code>. <code>SuperKmerIter</code> also emits a boundary when:</p>
|
||||||
|
<ul>
|
||||||
|
<li>entropy of the current k-mer falls at or below the threshold θ (cursor retreated by k−1)</li>
|
||||||
|
<li>super-kmer length reaches 256 nucleotides (cursor retreated by k)</li>
|
||||||
|
</ul>
|
||||||
<h2 id="kmer-extraction">Kmer extraction</h2>
|
<h2 id="kmer-extraction">Kmer extraction</h2>
|
||||||
<p>A k-mer is extracted from a super-kmer with <code>SuperKmer::kmer(i, k)</code>, which returns a <code>Kmer</code> — a left-aligned <code>u64</code> newtype (see <a href="../kmer/">Kmer implementation</a>):</p>
|
<p>A k-mer is extracted from a super-kmer with <code>SuperKmer::kmer(i)</code>, which delegates to <code>PackedSeq::extract::<KLen>(i)</code> and returns a <code>Kmer</code> — a left-aligned <code>u64</code> newtype (see <a href="../kmer/">Kmer implementation</a>):</p>
|
||||||
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">kmer</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">i</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="n">k</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Result</span><span class="o"><</span><span class="n">Kmer</span><span class="p">,</span><span class="w"> </span><span class="n">KmerError</span><span class="o">></span>
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">kmer</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">i</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Result</span><span class="o"><</span><span class="n">Kmer</span><span class="p">,</span><span class="w"> </span><span class="n">KmerError</span><span class="o">></span>
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
<p>The bit slice <code>seq[i*2 .. (i+k)*2]</code> (Msb0 order) is loaded as a big-endian <code>u64</code> via <code>bitvec::load_be</code>, then left-shifted to produce the canonical left-aligned layout. One call — no loop, no allocation.</p>
|
<p>The bit slice <code>seq[i*2 .. (i+k)*2]</code> (Msb0 order) is loaded as a <code>u64</code> via <code>bitvec::load_be</code>, then left-shifted to produce the canonical left-aligned layout. One call — no loop, no allocation.</p>
|
||||||
<hr />
|
<hr />
|
||||||
<div class="admonition abstract">
|
<div class="admonition abstract">
|
||||||
<p class="admonition-title">Algorithm — Super-kmer reverse complement</p>
|
<p class="admonition-title">Algorithm — Super-kmer reverse complement</p>
|
||||||
<div class="highlight"><pre><span></span><code>procedure SuperKmerRevcomp(seq, SEQL):
|
<div class="highlight"><pre><span></span><code>procedure SuperKmerRevcomp(seq, SEQL):
|
||||||
seql ← NKMERS + k − 1 -- nucleotide length
|
seql ← nucleotide length
|
||||||
n ← ⌈seql / 4⌉ -- number of bytes
|
n ← ⌈seql / 4⌉ -- number of bytes
|
||||||
shift ← n × 8 − seql × 2 -- padding bits to flush
|
shift ← n × 8 − seql × 2 -- padding bits to flush
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
+294
-1
@@ -213,6 +213,17 @@
|
|||||||
</label>
|
</label>
|
||||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#subcommands" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Subcommands
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="#constraints" class="md-nav__link">
|
<a href="#constraints" class="md-nav__link">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
@@ -222,6 +233,28 @@
|
|||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#parameter-constraints-enforced-at-cli" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Parameter constraints (enforced at CLI)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#genome-label-constraints" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Genome label constraints
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
@@ -714,6 +747,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="implementation/evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="implementation/obilayeredmap/" class="md-nav__link">
|
<a href="implementation/obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -792,6 +853,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="implementation/merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="implementation/rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
@@ -935,6 +1052,17 @@
|
|||||||
</label>
|
</label>
|
||||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#subcommands" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Subcommands
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="#constraints" class="md-nav__link">
|
<a href="#constraints" class="md-nav__link">
|
||||||
<span class="md-ellipsis">
|
<span class="md-ellipsis">
|
||||||
@@ -944,6 +1072,28 @@
|
|||||||
</span>
|
</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#parameter-constraints-enforced-at-cli" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Parameter constraints (enforced at CLI)
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#genome-label-constraints" class="md-nav__link">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Genome label constraints
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
@@ -976,12 +1126,155 @@
|
|||||||
|
|
||||||
<h1 id="obikmer">obikmer</h1>
|
<h1 id="obikmer">obikmer</h1>
|
||||||
<p><code>obikmer</code> is a Rust tool for manipulation, counting, indexing, and set operations on DNA sequences represented as kmer sets.</p>
|
<p><code>obikmer</code> is a Rust tool for manipulation, counting, indexing, and set operations on DNA sequences represented as kmer sets.</p>
|
||||||
|
<h2 id="subcommands">Subcommands</h2>
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Subcommand</th>
|
||||||
|
<th>Purpose</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td><code>superkmer</code></td>
|
||||||
|
<td>Extract super-kmers from a sequence file and write to stdout</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>index</code></td>
|
||||||
|
<td>Build a complete genome index (scatter → dereplicate → count → layered MPHF)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>merge</code></td>
|
||||||
|
<td>Merge multiple built indexes into one</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>rebuild</code></td>
|
||||||
|
<td>Filter and compact an existing index into a new single-layer index; supports ingroup/outgroup predicates on genome metadata</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>query</code></td>
|
||||||
|
<td>Query an index with sequences and annotate matches</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>dump</code></td>
|
||||||
|
<td>Dump all indexed k-mers as CSV (kmer + per-genome counts or presence); supports the same ingroup/outgroup filtering as <code>rebuild</code></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>annotate</code></td>
|
||||||
|
<td>Add or update genome metadata from a CSV file; or dump metadata as CSV</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>distance</code></td>
|
||||||
|
<td>Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>unitig</code></td>
|
||||||
|
<td>Build a global de Bruijn graph across all partitions and enumerate its unitigs as FASTA; supports the same ingroup/outgroup filtering as <code>rebuild</code></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>estimate</code></td>
|
||||||
|
<td>Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>reindex</code></td>
|
||||||
|
<td>Convert an index's evidence in-place: exact ↔ approx</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>utils</code></td>
|
||||||
|
<td>Miscellaneous index utilities: <code>--new-label NEW=OLD</code> renames a genome label; <code>--upgrade-index</code> adds missing <code>layer_meta.json</code> to old indexes</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>pack</code></td>
|
||||||
|
<td>Pack per-column matrix files into single-file format to reduce query I/O</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
<h2 id="constraints">Constraints</h2>
|
<h2 id="constraints">Constraints</h2>
|
||||||
<ul>
|
<ul>
|
||||||
<li>Target scale: individual genome datasets, tens of Gbases</li>
|
<li>Target scale: individual genome datasets, tens of Gbases</li>
|
||||||
<li>Maximum efficiency in computation, memory, and disk usage</li>
|
<li>Maximum efficiency in computation, memory, and disk usage</li>
|
||||||
<li>Input formats: FASTA, FASTQ, gzip, streaming stdin</li>
|
<li>k odd, k ∈ [11, 31], fixed at runtime; kmer fits in a u64 (2 bits/base)</li>
|
||||||
|
<li>Canonical form: <code>min(kmer, revcomp(kmer))</code> reduces strand-symmetric space by half</li>
|
||||||
|
<li>Input formats for <code>index</code>/<code>superkmer</code>: FASTA (<code>.fa</code>, <code>.fasta</code>), FASTQ (<code>.fq</code>, <code>.fastq</code>), GenBank flat file (<code>.gb</code>, <code>.gbk</code>, <code>.gbff</code>), all optionally gzip-compressed; directories expanded recursively; streaming stdin via <code>-</code></li>
|
||||||
|
<li>Input formats for <code>query</code>: FASTA, FASTQ, optionally gzip-compressed; streaming stdin via <code>-</code></li>
|
||||||
</ul>
|
</ul>
|
||||||
|
<h2 id="parameter-constraints-enforced-at-cli">Parameter constraints (enforced at CLI)</h2>
|
||||||
|
<p>All constraints below are checked by <code>CommonArgs::validate()</code> at the start of <code>superkmer</code> and <code>index</code>. Invalid values exit immediately with an error.</p>
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Parameter</th>
|
||||||
|
<th>Constraint</th>
|
||||||
|
<th>Reason</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td>k (<code>--kmer-size</code>)</td>
|
||||||
|
<td>odd</td>
|
||||||
|
<td>even k allows palindromic k-mers: kmer == revcomp(kmer), breaking the canonical form invariant</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>k (<code>--kmer-size</code>)</td>
|
||||||
|
<td>k ∈ [11, 31]</td>
|
||||||
|
<td>k > 32 would overflow a u64 at 2 bits/base, and k must be odd, so 31 is the largest valid value; k < 11 gives insufficient specificity</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>m (<code>--minimizer-size</code>)</td>
|
||||||
|
<td>odd</td>
|
||||||
|
<td>same palindrome argument as k</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>m (<code>--minimizer-size</code>)</td>
|
||||||
|
<td>3 ≤ m ≤ k−1</td>
|
||||||
|
<td>minimizer must be strictly shorter than the kmer</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>z (<code>-z</code>, Findere, <code>index --approx</code> only)</td>
|
||||||
|
<td>z ≤ k−1</td>
|
||||||
|
<td>effective indexed kmer size is k−z+1; z ≥ k would shrink it to 1 nucleotide or less</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
<h2 id="genome-label-constraints">Genome label constraints</h2>
|
||||||
|
<p>Genome labels are arbitrary Unicode strings with the following restrictions:</p>
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Character</th>
|
||||||
|
<th>Forbidden</th>
|
||||||
|
<th>Reason</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td><code>/</code></td>
|
||||||
|
<td>yes</td>
|
||||||
|
<td>filesystem path separator</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>=</code></td>
|
||||||
|
<td>yes</td>
|
||||||
|
<td><code>--new-label</code> parser separator</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>\0</code></td>
|
||||||
|
<td>yes</td>
|
||||||
|
<td>null byte</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>\n</code> <code>\r</code> <code>\t</code></td>
|
||||||
|
<td>yes</td>
|
||||||
|
<td>break CSV output</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>spaces</td>
|
||||||
|
<td><strong>allowed</strong></td>
|
||||||
|
<td>use shell quoting: <code>--new-label 'new label=old label'</code></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
<p>Empty labels are also rejected. Labels derived automatically from the index directory name (when <code>--label</code> is omitted) are not validated since they come from the filesystem and are already safe.</p>
|
||||||
<h2 id="priority-operations">Priority operations</h2>
|
<h2 id="priority-operations">Priority operations</h2>
|
||||||
<ul>
|
<ul>
|
||||||
<li>Kmer counting (frequencies)</li>
|
<li>Kmer counting (frequencies)</li>
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
+87
-2
@@ -746,6 +746,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../implementation/evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../implementation/obilayeredmap/" class="md-nav__link">
|
<a href="../implementation/obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -824,6 +852,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../implementation/merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../implementation/rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
@@ -1038,11 +1122,12 @@
|
|||||||
<h2 id="kmers">Kmers</h2>
|
<h2 id="kmers">Kmers</h2>
|
||||||
<p>A <strong>kmer</strong> is a DNA subsequence of fixed length k. Two constraints govern the choice of k:</p>
|
<p>A <strong>kmer</strong> is a DNA subsequence of fixed length k. Two constraints govern the choice of k:</p>
|
||||||
<ul>
|
<ul>
|
||||||
<li><strong>k ∈ [11, 31]</strong>: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word.</li>
|
<li><strong>k ∈ [11, 31]</strong>: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word (u64 at 2 bits/base requires k ≤ 32; k < 11 yields insufficient specificity).</li>
|
||||||
<li><strong>k is odd</strong>: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form <code>min(kmer, revcomp(kmer))</code> is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.</li>
|
<li><strong>k is odd</strong>: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form <code>min(kmer, revcomp(kmer))</code> is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
<p>Both constraints are <strong>enforced at CLI entry</strong> by <code>CommonArgs::validate()</code> in <code>superkmer</code> and <code>index</code>. Passing an invalid k exits immediately with an error message.</p>
|
||||||
<h2 id="super-kmers">Super-kmers</h2>
|
<h2 id="super-kmers">Super-kmers</h2>
|
||||||
<p>A <strong>super-kmer</strong> is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same <strong>canonical minimizer</strong>. The <strong>canonical minimizer</strong> of a kmer is the smallest value of <code>min(m-mer, revcomp(m-mer))</code> over all m-mers within the kmer (m < k, m odd), with the constraint that <strong>non-degenerate m-mers are always preferred</strong> over degenerate ones. A degenerate m-mer is one composed of a single repeated nucleotide (all-A, all-C, all-G, or all-T); such m-mers are selected only if no non-degenerate candidate exists in the window.</p>
|
<p>A <strong>super-kmer</strong> is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides, sharing the same <strong>canonical minimizer</strong>. The <strong>canonical minimizer</strong> of a kmer is the m-mer (m < k) whose canonical hash <code>hash_kmer(min(m-mer, revcomp(m-mer)))</code> is smallest over all m-mers in the kmer window. The hash function is a <code>mix64</code>-based bijection; selection is purely hash-ordered with no degeneracy filter. A super-kmer is capped at 256 nucleotides; a longer run is split at that boundary.</p>
|
||||||
<h3 id="canonical-super-kmers">Canonical super-kmers</h3>
|
<h3 id="canonical-super-kmers">Canonical super-kmers</h3>
|
||||||
<p>A <strong>canonical super-kmer</strong> is the lexicographic minimum of a super-kmer and its reverse complement:</p>
|
<p>A <strong>canonical super-kmer</strong> is the lexicographic minimum of a super-kmer and its reverse complement:</p>
|
||||||
<div class="highlight"><pre><span></span><code>canonical(super-kmer) = min(super-kmer, revcomp(super-kmer))
|
<div class="highlight"><pre><span></span><code>canonical(super-kmer) = min(super-kmer, revcomp(super-kmer))
|
||||||
|
|||||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -718,6 +718,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../implementation/evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
|
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -796,6 +824,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../implementation/merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../implementation/rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
@@ -1010,17 +1094,20 @@
|
|||||||
<p>The Watson-Crick complement of any base is its bitwise NOT on 2 bits: <code>complement(base) = ~base & 0b11</code>.</p>
|
<p>The Watson-Crick complement of any base is its bitwise NOT on 2 bits: <code>complement(base) = ~base & 0b11</code>.</p>
|
||||||
<h2 id="kmer-encoding">Kmer encoding</h2>
|
<h2 id="kmer-encoding">Kmer encoding</h2>
|
||||||
<p>A kmer fits in a single <code>u64</code>. Nucleotide 0 occupies bits 63–62, nucleotide i occupies bits 63−2i and 62−2i, and the low 64−2k bits are zero. Extraction of nucleotide i (0 ≤ i < k): <code>(kmer >> (62 - 2*i)) & 0b11</code>.</p>
|
<p>A kmer fits in a single <code>u64</code>. Nucleotide 0 occupies bits 63–62, nucleotide i occupies bits 63−2i and 62−2i, and the low 64−2k bits are zero. Extraction of nucleotide i (0 ≤ i < k): <code>(kmer >> (62 - 2*i)) & 0b11</code>.</p>
|
||||||
<p>Reverse complement is computed via a <strong>16-bit lookup table</strong> (65 536 entries × 2 bytes = 128 KB, fits in L2 cache) storing the reverse-complement of every 8-base chunk.</p>
|
<p>Reverse complement is computed by <strong>bit manipulation in four steps</strong>, with no lookup table:</p>
|
||||||
<div class="admonition abstract">
|
<div class="admonition abstract">
|
||||||
<p class="admonition-title">Algorithm — Kmer reverse complement</p>
|
<p class="admonition-title">Algorithm — Kmer reverse complement</p>
|
||||||
<div class="highlight"><pre><span></span><code>procedure KmerRevcomp(kmer, k):
|
<div class="highlight"><pre><span></span><code>procedure KmerRevcomp(kmer, k):
|
||||||
raw ← TABLE16[kmer & 0xFFFF] << 48
|
x ← ~kmer -- complement all bases
|
||||||
| TABLE16[(kmer >> 16) & 0xFFFF] << 32
|
x ← swap_bytes(x) -- reverse byte order
|
||||||
| TABLE16[(kmer >> 32) & 0xFFFF] << 16
|
x ← ((x >> 4) & 0x0F0F0F0F0F0F0F0F)
|
||||||
| TABLE16[(kmer >> 48) & 0xFFFF]
|
| ((x & 0x0F0F0F0F0F0F0F0F) << 4) -- swap nibbles within each byte
|
||||||
return raw << (64 - 2*k)
|
x ← ((x >> 2) & 0x3333333333333333)
|
||||||
|
| ((x & 0x3333333333333333) << 2) -- swap 2-bit pairs within each nibble
|
||||||
|
return x << (64 - 2*k) -- re-align to MSB
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
</div>
|
</div>
|
||||||
|
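<p>The three reorder passes together reverse the order of all 2-bit base codes across the 64-bit word. The bitwise NOT in the first step complements each base (A↔T, C↔G); it also turns the 64−2k zero padding bits into ones, which the reversal then moves to the high end of the word. The final left shift by 64−2k discards those padding bits and re-aligns the result to the MSB, leaving the low 64−2k bits zero.</p>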
|
||||||
<p>The <strong>canonical form</strong> is the lexicographic minimum of the kmer and its reverse complement:</p>
|
<p>The <strong>canonical form</strong> is the lexicographic minimum of the kmer and its reverse complement:</p>
|
||||||
<div class="highlight"><pre><span></span><code>canonical(kmer) = min(kmer, revcomp(kmer))
|
<div class="highlight"><pre><span></span><code>canonical(kmer) = min(kmer, revcomp(kmer))
|
||||||
</code></pre></div>
|
</code></pre></div>
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -773,6 +773,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../implementation/evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
|
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -851,6 +879,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../implementation/merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../implementation/rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
@@ -1109,7 +1193,7 @@
|
|||||||
<h2 id="final-score">Final score</h2>
|
<h2 id="final-score">Final score</h2>
|
||||||
<p>The filter computes <span class="arithmatex">\(\hat{H}(ws)\)</span> for each word size ws from 1 to ws_max and returns the <strong>minimum</strong>:</p>
|
<p>The filter computes <span class="arithmatex">\(\hat{H}(ws)\)</span> for each word size ws from 1 to ws_max and returns the <strong>minimum</strong>:</p>
|
||||||
<div class="arithmatex">\[\text{entropy}(kmer) = \min_{ws=1}^{ws_{\max}} \hat{H}(ws)\]</div>
|
<div class="arithmatex">\[\text{entropy}(kmer) = \min_{ws=1}^{ws_{\max}} \hat{H}(ws)\]</div>
|
||||||
<p>A value near 0 indicates low complexity (e.g. AAAA…); near 1 indicates high complexity. A kmer is rejected if <span class="arithmatex">\(\text{entropy}(kmer) \leq \theta\)</span>, where <span class="arithmatex">\(\theta\)</span> is a collection parameter. The minimum across word sizes ensures that any scale of repetition is detected independently: polyA is caught at ws=1, dinucleotide repeats at ws=2, etc.</p>
|
<p>A value near 0 indicates low complexity (e.g. AAAA…); near 1 indicates high complexity. A kmer is rejected if <span class="arithmatex">\(\text{entropy}(kmer) < \theta\)</span>, where <span class="arithmatex">\(\theta\)</span> is a collection parameter (default 0.7). The minimum across word sizes ensures that any scale of repetition is detected independently: polyA is caught at ws=1, dinucleotide repeats at ws=2, etc.</p>
|
||||||
<h2 id="interpretation-as-an-effective-number-of-classes">Interpretation as an effective number of classes</h2>
|
<h2 id="interpretation-as-an-effective-number-of-classes">Interpretation as an effective number of classes</h2>
|
||||||
<p><span class="arithmatex">\(H_{\text{corr}}\)</span> is a standard Shannon entropy over raw words (after unfolding the equivalence classes), so the classical perplexity interpretation holds directly: <span class="arithmatex">\(N_{\text{eff}} = e^{H_{\text{corr}}}\)</span> is the number of equiprobable classes that would yield the same entropy.</p>
|
<p><span class="arithmatex">\(H_{\text{corr}}\)</span> is a standard Shannon entropy over raw words (after unfolding the equivalence classes), so the classical perplexity interpretation holds directly: <span class="arithmatex">\(N_{\text{eff}} = e^{H_{\text{corr}}}\)</span> is the number of equiprobable classes that would yield the same entropy.</p>
|
||||||
<p>For the normalised score <span class="arithmatex">\(\hat{H}\)</span>, dividing by <span class="arithmatex">\(H_{\text{max}}\)</span> changes the logarithm base:</p>
|
<p>For the normalised score <span class="arithmatex">\(\hat{H}\)</span>, dividing by <span class="arithmatex">\(H_{\text{max}}\)</span> changes the logarithm base:</p>
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -718,6 +718,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../implementation/evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
|
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -796,6 +824,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../implementation/merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../implementation/rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -762,6 +762,34 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../implementation/evidence_elimination/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Evidence elimination (discussion)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-nav__item">
|
<li class="md-nav__item">
|
||||||
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
|
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
|
||||||
|
|
||||||
@@ -840,6 +868,62 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../implementation/merge/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Merge command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../../implementation/rebuild_filter/" class="md-nav__link">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
|
||||||
|
Kmer filtering (rebuild/dump/unitig)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</span>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
|
|
||||||
|
|||||||
@@ -2,169 +2,155 @@
|
|||||||
|
|
||||||
## Fundamental invariant
|
## Fundamental invariant
|
||||||
|
|
||||||
A given canonical kmer belongs to **exactly one partition** and **exactly one layer** within that partition. This is the property that makes all aggregation operations decomposable and parallelisable without coordination.
|
A given canonical kmer belongs to **exactly one partition** and **exactly one layer** within that partition. This property makes all aggregation operations decomposable and parallelisable without coordination.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Three-level hierarchy
|
## Three-level hierarchy
|
||||||
|
|
||||||
```
|
```
|
||||||
PartitionedIndex
|
KmerIndex (index.meta + KmerPartition)
|
||||||
├── LayeredPartition (one per minimiser bucket)
|
├── partition_0/index/ one directory per minimiser bucket
|
||||||
│ ├── MphfLayer 0 kmer → slot (immutable bijection)
|
│ ├── meta.json PartitionMeta { n_layers }
|
||||||
│ │ ├── DataStore A slot → T (e.g. counts)
|
│ ├── layer_0/
|
||||||
│ │ └── DataStore B slot → T (e.g. presence/absence, derived)
|
│ │ ├── layer_meta.json LayerMeta { evidence: EvidenceKind }
|
||||||
│ ├── MphfLayer 1
|
│ │ ├── mphf.bin PtrHash MPHF
|
||||||
│ │ └── DataStore A
|
│ │ ├── unitigs.bin unitig spine (never overwritten)
|
||||||
│ └── ...
|
│ │ ├── evidence.bin exact evidence (Exact only)
|
||||||
├── LayeredPartition
|
│ │ ├── unitigs.bin.idx block index (Exact only)
|
||||||
|
│ │ ├── fingerprint.bin fingerprints (Approx only)
|
||||||
|
│ │ ├── counts/ PersistentCompactIntMatrix (with_counts = true)
|
||||||
|
│ │ └── presence/ PersistentBitMatrix
|
||||||
|
│ └── layer_1/
|
||||||
│ └── ...
|
│ └── ...
|
||||||
|
└── partition_1/index/
|
||||||
|
└── ...
|
||||||
```
|
```
|
||||||
|
|
||||||
**PartitionedIndex**: routes queries to partitions via canonical minimiser hash. Owns the partition count and routing scheme (fixed at creation). Dispatches aggregations across partitions in parallel.
|
**KmerIndex**: root entry point. Owns `IndexMeta` (written to `index.meta`) and a `KmerPartition` that routes canonical kmers to partition directories. All partition-level operations are dispatched in parallel via rayon.
|
||||||
|
|
||||||
**LayeredPartition**: one directory per minimiser bucket. Holds a `Vec<MphfLayer>`. Each layer covers a disjoint kmer set — layer 0 is built from dataset A; layer 1 covers kmers in B absent from layer 0; and so on. Layers within a partition are always disjoint.
|
**Partition directory**: one directory per minimiser bucket. `PartitionMeta` (stored as `meta.json`) records `n_layers`. Layers within a partition cover disjoint kmer sets.
|
||||||
|
|
||||||
**MphfLayer**: the MPHF + evidence + unitig spine. Maps `kmer → slot` for its disjoint kmer set. Immutable once built. Independent of any data attached to it.
|
**Layer directory**: one `MphfLayer` plus optional data stores. `LayerMeta` (stored as `layer_meta.json`) records which `EvidenceKind` was used. The MPHF and `unitigs.bin` are immutable once built; evidence files are the only part replaced by `reindex`.
|
||||||
|
|
||||||
**DataStore**: a slot-indexed data array (e.g. `PersistentCompactIntMatrix`, `PersistentBitMatrix`). Attached to a `MphfLayer` externally. Multiple stores of different types can coexist on the same `MphfLayer`.
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## MphfLayer — autonomous mapping layer
|
## IndexConfig and IndexMeta
|
||||||
|
|
||||||
```rust
|
```rust
|
||||||
MphfLayer::find(kmer: CanonicalKmer) -> Option<usize> // slot, or None if absent
|
pub struct IndexConfig {
|
||||||
MphfLayer::n() -> usize // number of slots
|
pub kmer_size: usize,
|
||||||
MphfLayer::build(dir: &Path) -> OLMResult<(Self, usize)> // from unitigs.bin
|
pub minimizer_size: usize,
|
||||||
MphfLayer::open(dir: &Path) -> OLMResult<Self>
|
pub n_bits: usize, // log2(n_partitions)
|
||||||
|
pub with_counts: bool,
|
||||||
|
pub evidence: EvidenceKind,
|
||||||
|
pub block_bits: u8, // .idx granularity: 2^block_bits unitigs/block; 0 = one entry per unitig
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct IndexMeta {
|
||||||
|
pub version: u32,
|
||||||
|
pub config: IndexConfig,
|
||||||
|
pub genomes: Vec<GenomeInfo>, // ordered; index = genome column number
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct GenomeInfo {
|
||||||
|
pub label: String,
|
||||||
|
pub meta: HashMap<String, String>, // arbitrary categorical metadata
|
||||||
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
`find` returns `Some(slot)` only if the kmer is actually in this layer (evidence check included). Returns `None` for kmers present in other layers or absent from the index.
|
`IndexMeta` is serialised as `index.meta` (JSON). It is the authority for the ordered list of genomes and for the parameters that govern all subsequent operations on the index.
|
||||||
|
|
||||||
The MPHF (`mphf.bin`, `evidence.bin`, `unitigs.bin`) is built once and never rebuilt. All data derivation operations (count → presence, thresholding, merging) reuse the same `MphfLayer`.
|
---
|
||||||
|
|
||||||
|
## EvidenceKind
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub enum EvidenceKind {
|
||||||
|
Exact,
|
||||||
|
Approx { b: u8, z: u8 },
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Controls which files are written per layer and which query path is taken:
|
||||||
|
|
||||||
|
| Variant | Files written | False-positive rate |
|
||||||
|
|---|---|---|
|
||||||
|
| `Exact` | `evidence.bin`, `unitigs.bin.idx` | 0 |
|
||||||
|
| `Approx { b, z }` | `fingerprint.bin` | ≈ W / 2^(b·z) per read (Findere) |
|
||||||
|
|
||||||
|
`EvidenceKind` is stored both in `IndexConfig` (index-wide default, updated by `reindex`) and in each `LayerMeta` (per-layer record of what was actually built).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## MphfLayer — autonomous kmer → slot mapping
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct MphfLayer {
|
||||||
|
mphf: PtrHash<…>,
|
||||||
|
ev: LayerEvidence, // Exact { evidence, unitigs } | Approx { fingerprint }
|
||||||
|
n: usize,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
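`MphfLayer::find(kmer)` dispatches transparently to `find_exact` or `find_approx` based on the evidence loaded at `open` time (the evidence kind is read from `layer_meta.json`). It returns `Some(slot)` only if the kmer passes the evidence check — an exact verification against the unitigs for `Exact`, or a fingerprint match for `Approx`, which can yield false positives — and `None` when the kmer is absent or out-of-range.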
|
||||||
|
|
||||||
|
```
|
||||||
|
find_exact: slot = mphf(kmer); decode evidence → (chunk_id, rank); verify kmer in unitigs
|
||||||
|
find_approx: slot = mphf(kmer); check fingerprint[slot] == seq_hash(kmer)
|
||||||
|
```
|
||||||
|
|
||||||
|
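`block_bits` controls the granularity of the block index (`unitigs.bin.idx`) written alongside `evidence.bin` for `Exact` evidence: each index entry covers a block of 2^`block_bits` unitigs. At `block_bits = 0`, every unitig chunk has an index entry, giving O(1) random access; larger values trade access time for a smaller `.idx`.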
|
||||||
|
|
||||||
|
The MPHF and `unitigs.bin` are never rebuilt by any post-build operation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Layer\<D\> — MPHF + data payload
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct Layer<D: LayerData = ()> {
|
||||||
|
mphf: MphfLayer,
|
||||||
|
data: D,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`D` selects the attached data payload:
|
||||||
|
|
||||||
|
| `D` | Data directory | `Item` returned by `query` |
|
||||||
|
|---|---|---|
|
||||||
|
| `()` | — | `()` (set membership only) |
|
||||||
|
| `PersistentCompactIntMatrix` | `counts/` | `Box<[u32]>` (counts per genome) |
|
||||||
|
| `PersistentBitMatrix` | `presence/` | `Box<[bool]>` (presence per genome) |
|
||||||
|
|
||||||
|
`Layer::query(kmer)` delegates to `MphfLayer::find`, then calls `data.read(slot)` if a slot is returned. Both exact and approximate evidence are handled transparently; the caller sees only `Option<Hit<D::Item>>`.
|
||||||
|
|
||||||
|
Build-time entry points:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
Layer<()>::build(out_dir, block_bits) // set membership
|
||||||
|
Layer<PersistentCompactIntMatrix>::build(out_dir, block_bits, count_of)
|
||||||
|
Layer<PersistentBitMatrix>::build_presence(out_dir, block_bits, n_genomes, present_in)
|
||||||
|
|
||||||
|
Layer::<()>::build_evidence(layer_dir, kind, block_bits) // evidence only (reindex path)
|
||||||
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## DataStore — slot-indexed data
|
## DataStore — slot-indexed data
|
||||||
|
|
||||||
```rust
|
`PersistentCompactIntMatrix` and `PersistentBitMatrix` are slot-indexed stores. They know nothing about kmers or MPHFs.
|
||||||
trait DataStore {
|
|
||||||
type Item;
|
|
||||||
fn get(&self, slot: usize) -> Self::Item;
|
|
||||||
fn n(&self) -> usize;
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Concrete types from `obicompactvec`:
|
| Type | `Item` | Aggregation method | Use |
|
||||||
|
|
||||||
| Type | `Item` | Column stats | Use |
|
|
||||||
|---|---|---|---|
|
|---|---|---|---|
|
||||||
| `PersistentCompactIntMatrix` | `Box<[u32]>` | `sum() -> Array1<u64>` | count per sample per slot |
|
| `PersistentCompactIntMatrix` | `Box<[u32]>` | `sum() → Array1<u64>` | counts per genome per slot |
|
||||||
| `PersistentBitMatrix` | `Box<[bool]>` | `count_ones() -> Array1<u64>` | presence per sample per slot |
|
| `PersistentBitMatrix` | `Box<[bool]>` | `count_ones() → Array1<u64>` | presence per genome per slot |
|
||||||
|
|
||||||
`sum()` and `count_ones()` are the bridge between the per-matrix level and cross-layer aggregation: they give the total weight of each column within one (partition, layer) pair, which can be summed to get global column weights.
|
|
||||||
|
|
||||||
A `DataStore` knows nothing about kmers or MPHFs. It is indexed by `usize` slot only.
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Distance matrix API on DataStore types
|
## Aggregation traits — `obicompactvec::traits`
|
||||||
|
|
||||||
Both `PersistentCompactIntMatrix` and `PersistentBitMatrix` expose two families of distance matrix methods.
|
Three traits unify the aggregation API across all hierarchy levels.
|
||||||
|
|
||||||
### Full distance matrices
|
|
||||||
|
|
||||||
Compute the final `n_cols × n_cols` distance matrix from data within a single matrix. Internally parallelised over the upper triangle via rayon.
|
|
||||||
|
|
||||||
```rust
|
|
||||||
// PersistentCompactIntMatrix
|
|
||||||
fn bray_dist_matrix(&self) -> Array2<f64>
|
|
||||||
fn relfreq_bray_dist_matrix(&self) -> Array2<f64>
|
|
||||||
fn euclidean_dist_matrix(&self) -> Array2<f64>
|
|
||||||
fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64>
|
|
||||||
fn hellinger_dist_matrix(&self) -> Array2<f64>
|
|
||||||
fn jaccard_dist_matrix(&self) -> Array2<f64>
|
|
||||||
fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64>
|
|
||||||
|
|
||||||
// PersistentBitMatrix
|
|
||||||
fn jaccard_dist_matrix(&self) -> Array2<f64>
|
|
||||||
fn hamming_dist_matrix(&self) -> Array2<u64>
|
|
||||||
```
|
|
||||||
|
|
||||||
These are convenience methods. For a `LayeredDataStore` or `PartitionedDataStore` they cannot be used directly — the partial API is required.
|
|
||||||
|
|
||||||
### Partial distance matrices
|
|
||||||
|
|
||||||
Return additive components that can be summed element-wise across (partition, layer) pairs before computing the final distance. This is what makes cross-layer and cross-partition aggregation possible.
|
|
||||||
|
|
||||||
**Category 1 — self-contained partials**: additive without any external parameter.
|
|
||||||
|
|
||||||
```rust
|
|
||||||
// PersistentCompactIntMatrix
|
|
||||||
fn partial_bray_dist_matrix(&self)
|
|
||||||
-> (Array2<u64>, // sum_min[i,j]
|
|
||||||
Array1<u64>) // col_sums[k]
|
|
||||||
|
|
||||||
fn partial_euclidean_dist_matrix(&self) -> Array2<f64> // sum of squared diffs
|
|
||||||
fn partial_threshold_jaccard_dist_matrix(&self, threshold: u32)
|
|
||||||
-> (Array2<u64>, // inter[i,j]
|
|
||||||
Array2<u64>) // union[i,j]
|
|
||||||
|
|
||||||
// PersistentBitMatrix
|
|
||||||
fn partial_jaccard_dist_matrix(&self)
|
|
||||||
-> (Array2<u64>, // inter[i,j]
|
|
||||||
Array2<u64>) // union[i,j]
|
|
||||||
fn partial_hamming_dist_matrix(&self) -> Array2<u64> // differing bits
|
|
||||||
```
|
|
||||||
|
|
||||||
**Category 2 — normalised partials**: require global column sums as input, computed beforehand across all (partition, layer) pairs.
|
|
||||||
|
|
||||||
```rust
|
|
||||||
// PersistentCompactIntMatrix only
|
|
||||||
fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>)
|
|
||||||
-> Array2<f64> // Σ_slot min(a_slot/sum_i, b_slot/sum_j)
|
|
||||||
|
|
||||||
fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>)
|
|
||||||
-> Array2<f64> // Σ_slot (a_slot/sum_i - b_slot/sum_j)²
|
|
||||||
|
|
||||||
fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>)
|
|
||||||
-> Array2<f64> // Σ_slot (√(a/sum_i) - √(b/sum_j))²
|
|
||||||
```
|
|
||||||
|
|
||||||
The `col_sums` parameter must reflect the GLOBAL count across all layers and all partitions — passing a per-layer sum would give a wrong result. This constraint drives the two-pass algorithm described below.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Progressive aggregation principle
|
|
||||||
|
|
||||||
Aggregation is **hierarchical**: each level computes its contribution by aggregating from the level immediately below it. No level skips a level or collects raw data from two levels down.
|
|
||||||
|
|
||||||
```
|
|
||||||
PersistentCompactIntMatrix::col_weights() — column sums for one (partition, layer) matrix
|
|
||||||
↓ Σ across layers
|
|
||||||
LayeredStore<PersistentCompactIntMatrix>::col_weights() — column sums for one partition
|
|
||||||
↓ Σ across partitions
|
|
||||||
LayeredStore<LayeredStore<…>>::col_weights() — global column sums
|
|
||||||
```
|
|
||||||
|
|
||||||
The same cascade applies to every partial:
|
|
||||||
|
|
||||||
```
|
|
||||||
PersistentCompactIntMatrix::partial_bray() — one (partition, layer)
|
|
||||||
↓ element-wise Σ across layers
|
|
||||||
LayeredStore<PersistentCompactIntMatrix>::partial_bray() — one partition
|
|
||||||
↓ element-wise Σ across partitions
|
|
||||||
LayeredStore<LayeredStore<…>>::partial_bray() — global partial → final dist
|
|
||||||
```
|
|
||||||
|
|
||||||
Each level presents a stable trait surface to the level above; no level reaches two levels down.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Traits — `obicompactvec::traits`
|
|
||||||
|
|
||||||
Three traits unify the aggregation API across all levels of the hierarchy.
|
|
||||||
|
|
||||||
```rust
|
```rust
|
||||||
trait ColumnWeights: Send + Sync {
|
trait ColumnWeights: Send + Sync {
|
||||||
@@ -172,21 +158,16 @@ trait ColumnWeights: Send + Sync {
|
|||||||
}
|
}
|
||||||
|
|
||||||
trait CountPartials: ColumnWeights {
|
trait CountPartials: ColumnWeights {
|
||||||
// self-contained partials (additive, no parameter)
|
|
||||||
fn partial_bray(&self) -> Array2<u64>;
|
fn partial_bray(&self) -> Array2<u64>;
|
||||||
fn partial_euclidean(&self) -> Array2<f64>;
|
fn partial_euclidean(&self) -> Array2<f64>;
|
||||||
fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>);
|
fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>);
|
||||||
// normalised partials (global col_weights passed in cascade)
|
|
||||||
fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64>;
|
fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||||
fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64>;
|
fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||||
fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64>;
|
fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||||
// provided finalisation methods (default implementations)
|
// provided finalisation methods with default impls
|
||||||
fn bray_dist_matrix(&self) -> Array2<f64> { … }
|
fn bray_dist_matrix(&self) -> Array2<f64> { … }
|
||||||
fn euclidean_dist_matrix(&self) -> Array2<f64> { … }
|
|
||||||
fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> { … }
|
|
||||||
fn relfreq_bray_dist_matrix(&self) -> Array2<f64> { … }
|
fn relfreq_bray_dist_matrix(&self) -> Array2<f64> { … }
|
||||||
fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> { … }
|
// …
|
||||||
fn hellinger_dist_matrix(&self) -> Array2<f64> { … }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
trait BitPartials: ColumnWeights {
|
trait BitPartials: ColumnWeights {
|
||||||
@@ -198,55 +179,109 @@ trait BitPartials: ColumnWeights {
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
**Leaf implementors** (in `obicompactvec`):
|
Leaf implementors:
|
||||||
|
|
||||||
| Type | Traits |
|
| Type | Traits |
|
||||||
|---|---|
|
|---|---|
|
||||||
| `PersistentCompactIntMatrix` | `ColumnWeights` (via `sum()`), `CountPartials` |
|
| `PersistentCompactIntMatrix` | `ColumnWeights`, `CountPartials` |
|
||||||
| `PersistentBitMatrix` | `ColumnWeights` (via `count_ones()`), `BitPartials` |
|
| `PersistentBitMatrix` | `ColumnWeights`, `BitPartials` |
|
||||||
|
|
||||||
`PersistentCompactIntVec` and `PersistentBitVec` do **not** implement these traits — they are single-column primitives, not matrix-level aggregators.
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## `LayeredStore<S>` — `obilayeredmap`
|
## LayeredStore\<S\> — recursive aggregation wrapper
|
||||||
|
|
||||||
A single generic wrapper replaces the need for named `LayeredDataStore` and `PartitionedDataStore` types:
|
|
||||||
|
|
||||||
```rust
|
```rust
|
||||||
pub struct LayeredStore<S>(Vec<S>);
|
pub struct LayeredStore<S>(Vec<S>);
|
||||||
```
|
```
|
||||||
|
|
||||||
Three blanket impls propagate the traits up the hierarchy:
|
Three blanket impls propagate all traits up the hierarchy:
|
||||||
|
|
||||||
```rust
|
```rust
|
||||||
impl<S: ColumnWeights> ColumnWeights for LayeredStore<S> { … } // Σ across inner stores
|
impl<S: ColumnWeights> ColumnWeights for LayeredStore<S> { … }
|
||||||
impl<S: CountPartials> CountPartials for LayeredStore<S> { … } // same pattern
|
impl<S: CountPartials> CountPartials for LayeredStore<S> { … }
|
||||||
impl<S: BitPartials> BitPartials for LayeredStore<S> { … } // same pattern
|
impl<S: BitPartials> BitPartials for LayeredStore<S> { … }
|
||||||
```
|
```
|
||||||
|
|
||||||
Because the blanket impl is recursive, **`LayeredStore<LayeredStore<S>>`** automatically inherits all three traits when `S` does — no separate `PartitionedStore` type is needed:
|
This makes `LayeredStore<LayeredStore<PersistentCompactIntMatrix>>` automatically implement `CountPartials` — no separate `PartitionedStore` type is needed:
|
||||||
|
|
||||||
```
|
```
|
||||||
PersistentCompactIntMatrix implements CountPartials
|
PersistentCompactIntMatrix leaf (one layer)
|
||||||
LayeredStore<PersistentCompactIntMatrix> via blanket impl (= one partition)
|
LayeredStore<PersistentCompactIntMatrix> one partition (layers are disjoint)
|
||||||
LayeredStore<LayeredStore<…>> via blanket impl (= partitioned index)
|
LayeredStore<LayeredStore<…>> whole index (partitions are independent)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Normalised metrics — two-pass cascade
|
Normalised metrics require global column sums — computed in a two-pass cascade:
|
||||||
|
|
||||||
The normalised finalisation methods call `col_weights()` first (pass 1), then the normalised partial (pass 2). Both calls go through the same blanket impl, so the cascade is automatic:
|
|
||||||
|
|
||||||
```rust
|
```rust
|
||||||
// called on LayeredStore<LayeredStore<PersistentCompactIntMatrix>>
|
// on LayeredStore<LayeredStore<PersistentCompactIntMatrix>>
|
||||||
fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
||||||
let global = self.col_weights(); // pass 1 — progressive sum at every level
|
let global = self.col_weights(); // pass 1 — sums up hierarchy
|
||||||
let p = self.partial_relfreq_bray(&global); // pass 2 — global passed in cascade
|
let p = self.partial_relfreq_bray(&global); // pass 2 — global broadcast read-only
|
||||||
p.mapv(|v| 1.0 - v) // finalise (diagonal zeroed separately)
|
p.mapv(|v| 1.0 - v)
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
`global` is exact: each kmer belongs to exactly one `(partition, layer)` pair, so there is no double-counting across the hierarchy.
|
Because each kmer belongs to exactly one `(partition, layer)` pair, `col_weights()` has no double-counting across the hierarchy.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Progressive aggregation principle
|
||||||
|
|
||||||
|
No level reaches two levels down. Each level sums contributions from the level immediately below:
|
||||||
|
|
||||||
|
```
|
||||||
|
PersistentCompactIntMatrix::col_weights() — one (partition, layer)
|
||||||
|
↓ Σ across layers
|
||||||
|
LayeredStore<PersistentCompactIntMatrix>::col_weights() — one partition
|
||||||
|
↓ Σ across partitions
|
||||||
|
LayeredStore<LayeredStore<…>>::col_weights() — global
|
||||||
|
```
|
||||||
|
|
||||||
|
The same cascade applies to every partial method.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Multi-genome column invariant
|
||||||
|
|
||||||
|
After any merge, every layer in every partition has exactly `n_genomes` columns, where `n_genomes` is the current total in `index.meta`. This holds for both `PersistentCompactIntMatrix` and `PersistentBitMatrix`.
|
||||||
|
|
||||||
|
Maintained by three coordinated operations:
|
||||||
|
|
||||||
|
**Existing layers — column append.** `Layer::append_genome_column` appends one column to each existing layer. Slots matching the incoming genome receive its count or `true`; all other slots receive 0 or `false`.
|
||||||
|
|
||||||
|
**New layers — absent columns prepended.** When a new layer is created for kmers unique to the incoming genome, `n_existing_genomes` absent columns are prepended before the incoming genome's column, so the new layer immediately has the same column count as all other layers.
|
||||||
|
|
||||||
|
**First merge, Presence mode — `init_presence_matrix`.** The initial single-genome index has no `presence/` directory (presence is implicit). On the first merge, `Layer<()>::init_presence_matrix` materialises genome 0's presence column (all `true`) retroactively, raising the column count from 0 to 1 before appending column 1.
|
||||||
|
|
||||||
|
This invariant is the precondition for correct progressive aggregation: every level can blindly sum matrices from below because all matrices have the same shape.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Query model
|
||||||
|
|
||||||
|
### Point query
|
||||||
|
|
||||||
|
```
|
||||||
|
minimiser(kmer) → partition p
|
||||||
|
for each layer l in p:
|
||||||
|
if let Some(slot) = MphfLayer_l.find(kmer):
|
||||||
|
return data_l.read(slot)
|
||||||
|
return None
|
||||||
|
```
|
||||||
|
|
||||||
|
O(n_layers) MPHF probes worst case; O(1) expected. The result comes from exactly one `(partition, layer)`.
|
||||||
|
|
||||||
|
### Aggregation
|
||||||
|
|
||||||
|
```
|
||||||
|
result = reduce(
|
||||||
|
for p in partitions: // parallel
|
||||||
|
for l in layers(p): // parallel
|
||||||
|
partial(data_p_l)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
For normalised metrics, replace with the two-pass cascade.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -254,103 +289,25 @@ fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
|||||||
|
|
||||||
| Level | Unit | Coordination |
|
| Level | Unit | Coordination |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| Across partitions | `LayeredStore<LayeredStore<S>>` inner stores | none — fully independent |
|
| Across partitions | inner stores of `LayeredStore<LayeredStore<S>>` | none |
|
||||||
| Across layers within a partition | `LayeredStore<S>` inner stores | none — disjoint kmer sets |
|
| Across layers within a partition | inner stores of `LayeredStore<S>` | none — disjoint kmer sets |
|
||||||
| Normalised pass 1 (`col_weights`) | per inner store | none — additive |
|
| Normalised pass 1 (`col_weights`) | per inner store | none — additive |
|
||||||
| Normalised pass 2 (partial) | per inner store | `global` broadcast read-only |
|
| Normalised pass 2 (partial) | per inner store | `global` broadcast read-only |
|
||||||
| Within a matrix (distance) | upper-triangle pair `(i,j)` | none — rayon `par_iter` |
|
| Within a matrix (distance) | upper-triangle pair `(i,j)` | none — rayon `par_iter` |
|
||||||
|
|
||||||
All levels use rayon `par_iter` internally; `reduce_with` performs a parallel tree reduction.
|
---
|
||||||
|
|
||||||
|
## reindex — evidence conversion in place
|
||||||
|
|
||||||
|
`KmerIndex::reindex(target, block_bits)` converts every layer's evidence bundle to `target` without touching the MPHF or `unitigs.bin`:
|
||||||
|
|
||||||
|
- `→ Exact`: builds `evidence.bin` + `unitigs.bin.idx`; removes `fingerprint.bin`
|
||||||
|
- `→ Approx { b, z }`: builds `fingerprint.bin`; removes `evidence.bin` + `unitigs.bin.idx`
|
||||||
|
|
||||||
|
On success, `IndexConfig::evidence` and `IndexConfig::block_bits` are updated in `index.meta`. Each layer's `layer_meta.json` is also rewritten with the new `EvidenceKind`.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Query model
|
## estimate — parameter dry-run
|
||||||
|
|
||||||
### Point query — `kmer → Option<Item>`
|
`estimate` resolves approximate-evidence parameters (`z`, `b`, target FP rate) and prints the resulting effective kmer size and per-kmer / per-z-window false-positive rates without touching any index. Used to calibrate `Approx { b, z }` before building or reindexing.
|
||||||
|
|
||||||
```
|
|
||||||
minimiser(kmer) → partition p
|
|
||||||
for each layer l in p:
|
|
||||||
slot = MphfLayer_l.find(kmer)
|
|
||||||
if slot is Some:
|
|
||||||
return DataStore_l.get(slot)
|
|
||||||
return None
|
|
||||||
```
|
|
||||||
|
|
||||||
O(n_layers) MPHF probes worst case; O(1) expected. No cross-layer fusion — the result comes from exactly one (partition, layer).
|
|
||||||
|
|
||||||
### Aggregation — `→ Result`
|
|
||||||
|
|
||||||
```
|
|
||||||
result = reduce(
|
|
||||||
for p in partitions: // parallel
|
|
||||||
for l in layers(p): // parallel
|
|
||||||
partial(DataStore_p_l)
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
For normalised metrics replace with the two-pass scheme above.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## DataStore derivation
|
|
||||||
|
|
||||||
Because the `MphfLayer` is independent of its data stores, new stores can be derived from existing ones without rebuilding the MPHF:
|
|
||||||
|
|
||||||
```
|
|
||||||
// count → presence/absence, parallel across (partition, layer)
|
|
||||||
for (p, l) in all_partition_layer_pairs().par_iter():
|
|
||||||
count_store = open PersistentCompactIntMatrix at (p, l)
|
|
||||||
presence_store = PersistentBitMatrix::from_count_matrix(count_store, threshold, dir)
|
|
||||||
```
|
|
||||||
|
|
||||||
Other derivations: threshold a count matrix → binary presence matrix; union two presence matrices; merge two count matrices (saturating add, column-wise). All are local to one `(partition, layer)` pair.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Relationship to current implementation
|
|
||||||
|
|
||||||
### What is implemented
|
|
||||||
|
|
||||||
- **`obicompactvec::traits`**: `ColumnWeights`, `CountPartials`, `BitPartials` are defined and implemented on `PersistentCompactIntMatrix` and `PersistentBitMatrix`.
|
|
||||||
- **`obilayeredmap::LayeredStore<S>`**: generic wrapper with blanket impls for all three traits. `LayeredStore<LayeredStore<S>>` is the partitioned level — no separate type needed. Tests confirm that splitting data across layers and across partitions gives the same distance matrices as computing on flat combined data.
|
|
||||||
|
|
||||||
### What is not yet implemented
|
|
||||||
|
|
||||||
- `Layer<D: LayerData>` still fuses `MphfLayer` and one `DataStore`. Multiple data stores on the same MPHF are not supported.
|
|
||||||
- `LayeredMap` is a single-partition structure without distance matrix API; it does not yet use `LayeredStore`.
|
|
||||||
- No `PartitionedIndex` type for point queries with parallel partition dispatch.
|
|
||||||
|
|
||||||
### Planned refactoring
|
|
||||||
|
|
||||||
1. Extract `MphfLayer` from `Layer<D>` as an autonomous type.
|
|
||||||
2. Replace `LayerData` trait with the `DataStore` / `ColumnWeights` / `CountPartials` / `BitPartials` system.
|
|
||||||
3. Rewire `LayeredMap` to hold `LayeredStore<PersistentCompactIntMatrix>` (or bit variant) alongside the MPHF layers.
|
|
||||||
4. Implement `PartitionedIndex` using `LayeredStore<LayeredStore<S>>` for data and parallel dispatch for queries.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Multi-genome column invariant
|
|
||||||
|
|
||||||
### The invariant
|
|
||||||
|
|
||||||
After any merge operation, **every layer in every partition has exactly `n_genomes` columns**, where `n_genomes` is the current total genome count recorded in `index.meta`. This applies to both `PersistentCompactIntMatrix` (Count mode) and `PersistentBitMatrix` (Presence mode).
|
|
||||||
|
|
||||||
### How it is maintained
|
|
||||||
|
|
||||||
The invariant is established and preserved by three coordinated operations:
|
|
||||||
|
|
||||||
**1. Existing layers — column append.**
|
|
||||||
When merging source genome G into an existing index with `n_existing_genomes` genomes, one column is appended to every existing layer via `append_genome_column`. Slots that contain a kmer present in source G receive its count or `true`; all other slots receive 0 or `false`. After this step, every pre-existing layer has `n_existing_genomes + 1` columns.
|
|
||||||
|
|
||||||
**2. New layers — absent columns prepended.**
|
|
||||||
If source G introduces kmers not found in any existing layer, a new layer is created for those kmers. Before appending source G's own column, `n_existing_genomes` absent columns (all-zero or all-false) are prepended — one per genome already in the index. This ensures the new layer starts at the same column count as every other layer in the partition immediately after creation.
|
|
||||||
|
|
||||||
**3. First merge, Presence mode — `init_presence_matrix`.**
|
|
||||||
The initial single-genome index has no `presence/` directory (presence is implicit: every kmer in the index is present in genome 0). On the first merge, before appending any column for source 1, `Layer<()>::init_presence_matrix` creates `presence/col_000000.pbiv` set entirely to `true` for each existing layer. This retroactively materialises genome 0's presence column, bringing the column count from 0 to 1 so that the subsequent append for source 1 raises it to 2.
|
|
||||||
|
|
||||||
### Why the invariant is required
|
|
||||||
|
|
||||||
The `LayeredStore` aggregation traits (`col_weights`, `partial_bray`, `partial_jaccard`, etc.) sum contributions across all `(partition, layer)` pairs without any shape check. If one layer had fewer columns than others, its contribution would silently produce a malformed result — wrong column sums, wrong distance matrices, and incorrect genome-level statistics.
|
|
||||||
|
|
||||||
The invariant is the precondition that makes the progressive aggregation principle correct: each level can blindly sum matrices from the level below because all matrices have the same shape.
|
|
||||||
|
|||||||
@@ -0,0 +1,22 @@
|
|||||||
|
<!-- coverage sidecar — ne pas ajouter au nav mkdocs -->
|
||||||
|
# Coverage: architecture/index_architecture.md
|
||||||
|
|
||||||
|
## Code couvert
|
||||||
|
|
||||||
|
- `obilayeredmap/src/layer.rs` — Layer<D>, trait LayerData, modes () / PersistentCompactIntMatrix / PersistentBitMatrix
|
||||||
|
- `obilayeredmap/src/mphf_layer.rs` — MphfLayer, EvidenceKind (Exact / Approx), LayerEvidence enum
|
||||||
|
- `obilayeredmap/src/map.rs` — LayeredMap<D>
|
||||||
|
- `obilayeredmap/src/meta.rs` — LayerMeta, PartitionMeta
|
||||||
|
- `obikindex/src/meta.rs` — IndexConfig (kmer_size, n_bits, with_counts, evidence, block_bits), IndexMeta
|
||||||
|
- `obikindex/src/index.rs` — KmerIndex, build_layers
|
||||||
|
- `obicompactvec/src/` — PersistentCompactIntMatrix, PersistentBitMatrix (DataStore implementations)
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
FORT RISQUE DE DÉRIVE. Nombreux changements récents :
|
||||||
|
- Ajout de `EvidenceKind` (Exact / Approx { b, z }) dans `IndexConfig` et `LayerMeta`
|
||||||
|
- Ajout de `block_bits` dans `IndexConfig`
|
||||||
|
- `LayerEvidence` enum dans `mphf_layer.rs` remplace l'ancienne approche monolithique
|
||||||
|
- Distinction `open()` vs `open_sequential()` dans `UnitigFileReader`
|
||||||
|
- Commandes `reindex` et `estimate` ajoutées
|
||||||
|
Vérifier que la hiérarchie à 3 niveaux décrite est toujours exacte et que les nouveaux paramètres sont documentés.
|
||||||
@@ -0,0 +1,179 @@
|
|||||||
|
# NUMA-aware partition runner
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
All partition-level parallel loops in obikindex currently fall into two
|
||||||
|
categories:
|
||||||
|
|
||||||
|
**Naive Rayon** — used in `build_layers`, `pack_matrices`, `dump`, `select`,
|
||||||
|
`stats`, `rebuild`, `reindex`:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
(0..n).into_par_iter().for_each(|i| work(i));
|
||||||
|
```
|
||||||
|
|
||||||
|
Threads come from the global Rayon pool with no NUMA awareness. On
|
||||||
|
multi-socket machines this produces cross-socket memory traffic and degrades
|
||||||
|
performance super-linearly (see [NUMA-aware worker pools](numa_worker_pools.md)).
|
||||||
|
|
||||||
|
**Ad-hoc adaptive pool** — used in `merge`:
|
||||||
|
|
||||||
|
A bespoke implementation with pre-spawned workers, channel-based dispatch, and
|
||||||
|
activation control. It handles NUMA correctly but is not reusable.
|
||||||
|
|
||||||
|
Both cases should be replaced by a single generic mechanism.
|
||||||
|
|
||||||
|
## Unified model
|
||||||
|
|
||||||
|
The key insight is that **UMA is just the NUMA case with a single node**. The
|
||||||
|
runner always works the same way: one controller thread per node, each
|
||||||
|
independently managing its own workers with the same adaptive logic. The only
|
||||||
|
difference between UMA and NUMA is the number of nodes and whether workers are
|
||||||
|
pinned.
|
||||||
|
|
||||||
|
```
|
||||||
|
NUMA (k nodes) UMA (1 node)
|
||||||
|
|
||||||
|
controller-0 controller-1 … controller-0
|
||||||
|
│ │ │
|
||||||
|
workers[0] workers[1] workers[0]
|
||||||
|
(pinned) (pinned) (global pool)
|
||||||
|
└───────────────┴──────────────────┘
|
||||||
|
shared work queue
|
||||||
|
```
|
||||||
|
|
||||||
|
On each node, the Rayon `ThreadPool` is pinned to that node's CPUs.
|
||||||
|
`pool.install()` ensures all internal Rayon calls (inside the work function)
|
||||||
|
use the node-local pool. Linux first-touch then places heap allocations in
|
||||||
|
local DRAM automatically.
|
||||||
|
|
||||||
|
On UMA the global Rayon pool is used directly — no pinning, no overhead.
|
||||||
|
|
||||||
|
## Adaptive mechanism
|
||||||
|
|
||||||
|
Each controller follows the same logic regardless of node count:
|
||||||
|
|
||||||
|
1. Pre-spawn `workers_per_node` dormant worker threads (blocked on `activate_rx`).
|
||||||
|
2. Activate the first worker immediately.
|
||||||
|
3. Loop on result channel with a `SPAWN_POLL` timeout:
|
||||||
|
- On result: call `on_done`; check whether to activate the next worker.
|
||||||
|
- On timeout: same check.
|
||||||
|
- Activation criterion: `should_spawn_worker(active, global_efficiency, prev_efficiency)`.
|
||||||
|
4. Drop `activate_tx` when done — dormant workers exit cleanly.
|
||||||
|
|
||||||
|
**Global CPU efficiency** (`CpuSample`, reads `/proc/stat` on Linux) is used by
|
||||||
|
all controllers — no per-node measurement needed. The signal is coarser than
|
||||||
|
per-node efficiency but correct in practice: if any node saturates memory
|
||||||
|
bandwidth, the global efficiency drops and all controllers stop activating
|
||||||
|
workers. Using a standard portable primitive avoids platform-specific CPU
|
||||||
|
accounting and keeps the implementation clean.
|
||||||
|
|
||||||
|
## Proposed API
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct PartitionRunner {
|
||||||
|
// One entry per NUMA node; one entry total on UMA.
|
||||||
|
nodes: Vec<NodeConfig>,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct NodeConfig {
|
||||||
|
pool: Option<Arc<rayon::ThreadPool>>, // None = global Rayon pool (UMA)
|
||||||
|
cpu_ids: Vec<usize>, // empty = no pinning (UMA)
|
||||||
|
max_workers: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartitionRunner {
|
||||||
|
/// Detect topology and build the runner.
|
||||||
|
/// Returns a single-node runner on UMA / macOS / hwloc failure.
|
||||||
|
pub fn new() -> Self;
|
||||||
|
|
||||||
|
/// Run `f(i)` for every index in `order`, collecting results.
|
||||||
|
///
|
||||||
|
/// `on_done(i, result, elapsed)` is called under an internal mutex as
|
||||||
|
/// each partition completes — use it for progress bars and aggregation.
|
||||||
|
/// The runner serialises all calls to `on_done` via an internal
|
||||||
|
/// `Arc<Mutex<C>>`, so no `Sync` bound is required on the callback.
|
||||||
|
/// `Send` is required because the Arc clone crosses thread boundaries.
|
||||||
|
///
|
||||||
|
/// Serialisation is free in practice: a partition takes seconds to
|
||||||
|
/// minutes; the callback takes microseconds. Contention is negligible.
|
||||||
|
///
|
||||||
|
/// Returns the first error from `f`, if any.
|
||||||
|
pub fn run<F, R, E, C>(
|
||||||
|
&self,
|
||||||
|
order: &[usize],
|
||||||
|
f: F,
|
||||||
|
on_done: C,
|
||||||
|
) -> Result<(), E>
|
||||||
|
where
|
||||||
|
F: Fn(usize) -> Result<R, E> + Send + Sync,
|
||||||
|
R: Send,
|
||||||
|
E: Send,
|
||||||
|
C: FnMut(usize, R, Duration) + Send; // Send required, Sync is not
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`order` is caller-supplied so each command chooses its scheduling strategy:
|
||||||
|
largest-first for `merge`, sequential for `build_layers`, etc.
|
||||||
|
|
||||||
|
## Migration examples
|
||||||
|
|
||||||
|
### merge.rs (before: ~180 lines of bespoke machinery)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let runner = PartitionRunner::new();
|
||||||
|
runner.run(
|
||||||
|
&order,
|
||||||
|
|i| dst_partition.merge_partition(i, srcs, mode, n_dst_genomes, block_bits, evidence)
|
||||||
|
.map_err(OKIError::Partition),
|
||||||
|
|i, g_len, dur| {
|
||||||
|
pb.inc(1);
|
||||||
|
debug!("partition {i}: done in {:.1}s — {g_len} new kmers", dur.as_secs_f64());
|
||||||
|
part_stats.push(PartStat { id: i, unitig_bytes: partition_sizes[i], g_len });
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
```
|
||||||
|
|
||||||
|
### index.rs build_layers (before: naive into_par_iter)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let order: Vec<usize> = (0..n).collect();
|
||||||
|
let runner = PartitionRunner::new();
|
||||||
|
runner.run(
|
||||||
|
&order,
|
||||||
|
|i| self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence, block_bits)
|
||||||
|
.map_err(OKIError::Partition),
|
||||||
|
|_, n_kmers, _| {
|
||||||
|
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||||
|
pb.inc(1);
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
```
|
||||||
|
|
||||||
|
All other sites (`pack_matrices`, `dump`, `select`, etc.) follow the same
|
||||||
|
pattern.
|
||||||
|
|
||||||
|
## Placement
|
||||||
|
|
||||||
|
`PartitionRunner` lives in `obikindex/src/numa.rs` alongside `NumaSetup`.
|
||||||
|
It depends only on standard library primitives and Rayon — no new dependencies.
|
||||||
|
|
||||||
|
A single `PartitionRunner` instance can be built once per command invocation
|
||||||
|
and reused across multiple `run()` calls (e.g. `merge` runs
|
||||||
|
`merge_partitions` then `pack_matrices`).
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- **Error handling**: `run` currently returns the first error; remaining errors
|
||||||
|
are dropped. A `Vec<E>` return would give complete diagnostics.
|
||||||
|
|
||||||
|
- **`workers_per_node` tuning**: currently `(cpus / 8).max(3).min(8)`, calibrated
|
||||||
|
for merge on BeeGFS. I/O-bound commands (`dump`, `select`) may benefit from
|
||||||
|
a higher value. A per-call override could be added to the API.
|
||||||
|
|
||||||
|
- **`on_done` ordering**: the runner serialises calls to `on_done` via an
|
||||||
|
internal `Arc<Mutex<C>>`. `Send` is required (the Arc clone crosses thread
|
||||||
|
boundaries); `Sync` is not (only one thread holds the lock at a time).
|
||||||
|
Contention is negligible because a partition takes seconds while the callback
|
||||||
|
takes microseconds. The callback is therefore simple to write (plain
|
||||||
|
`Vec::push`, plain `FnMut`) with no measurable performance cost.
|
||||||
@@ -0,0 +1,97 @@
|
|||||||
|
# NUMA-aware worker pools for merge
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
The merge command's bottleneck is `compute_degrees` in `obidebruinj`: a random pointer-chase over 20–70 M node hash maps that saturates DRAM bandwidth. When multiple partition workers run concurrently, they contend for the shared memory bus, causing super-linear slowdown (measured: 0.016 µs/node solo → 0.95 µs/node with 4–5 concurrent workers, ×60 degradation).
|
||||||
|
|
||||||
|
Modern HPC nodes are multi-socket NUMA machines (observed: 2 sockets × 4 NUMA nodes × 24 cores = 192 cores). Cross-NUMA memory traffic compounds the contention:
|
||||||
|
|
||||||
|
- Full 192-core run: ~15 min/partition (×10 worse than M3 Mac)
|
||||||
|
- `taskset` restricted to 4 NUMA nodes (96 cores): ~90 s/partition
|
||||||
|
- OAR job on 1 NUMA node (24 cores): ~80 s/partition, same throughput as 96 cores
|
||||||
|
|
||||||
|
**Conclusion**: the bottleneck is memory bandwidth per NUMA node, not core count. 24 cores on one NUMA node achieve the same throughput as 96 cores across four.
|
||||||
|
|
||||||
|
## Strategy
|
||||||
|
|
||||||
|
Run N worker groups in parallel, one per NUMA node, each with its own Rayon thread pool whose threads are pinned to the NUMA node's CPUs. Linux's first-touch policy then places graph allocations on local DRAM automatically — no explicit NUMA allocator needed.
|
||||||
|
|
||||||
|
Expected throughput: N × single-NUMA throughput. On the 8-NUMA-node HPC, eight node-local groups each processing a partition in ~80 s should bring the total to 9–10 min instead of >60 min with the current single-pool approach.
|
||||||
|
|
||||||
|
## Rayon thread pool isolation
|
||||||
|
|
||||||
|
Rayon provides `ThreadPool::install(|| { ... })`: any Rayon call (`par_iter`, `current_num_threads`, etc.) inside the closure uses *that* pool exclusively. Wrapping `merge_partition` in `pool.install()` redirects all downstream Rayon calls — including those in `debruijn.rs` and `partition.rs` — without touching those crates.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// worker thread, assigned to NUMA pool `pool`
|
||||||
|
pool.install(|| {
|
||||||
|
dst_partition.merge_partition(i, srcs, mode, n_dst_genomes, block_bits, evidence)
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|
||||||
|
`rayon::current_num_threads()` inside `merge_partition` will return the pool size (e.g. 24), not the global thread count — which is the right value for buffer sizing.
|
||||||
|
|
||||||
|
## Thread pinning
|
||||||
|
|
||||||
|
`ThreadPoolBuilder::spawn_handler` provides a hook executed for each thread at creation. Inside, `libc::sched_setaffinity` pins the thread to a CPU set:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let cpus: Vec<usize> = numa_node_cpus(node); // from /sys/devices/system/node/nodeN/cpulist
|
||||||
|
rayon::ThreadPoolBuilder::new()
|
||||||
|
.num_threads(cpus.len())
|
||||||
|
.spawn_handler(move |thread| {
|
||||||
|
let b = std::thread::Builder::new();
|
||||||
|
b.spawn(move || {
|
||||||
|
pin_to_cpus(&cpus); // sched_setaffinity via libc
|
||||||
|
thread.run()
|
||||||
|
})?;
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
.build()?
|
||||||
|
```
|
||||||
|
|
||||||
|
NUMA topology is read from `/sys/devices/system/node/node*/cpulist` — no `libnuma` dependency required. If the `numa` crate is linked, `numa_available()` / `numa_run_on_node()` are an alternative.
|
||||||
|
|
||||||
|
## Memory locality
|
||||||
|
|
||||||
|
Linux allocates pages on the NUMA node of the thread that first writes them (first-touch policy). Once Rayon threads are pinned to node N, all graph data built by those threads lands on node N's DRAM. No changes to the allocator, no explicit `numa_alloc_onnode` calls.
|
||||||
|
|
||||||
|
## Adaptive spawn criterion
|
||||||
|
|
||||||
|
The current criterion uses `std::thread::available_parallelism()` (returns total cores = 192) and `max_workers = n_cores / 2`. With NUMA pools:
|
||||||
|
|
||||||
|
- `n_cores` per pool = cores per NUMA node (e.g. 24)
|
||||||
|
- `max_workers` per pool = pool size / 2 (e.g. 12)
|
||||||
|
- CPU efficiency is measured per pool, not globally
|
||||||
|
|
||||||
|
Each NUMA group runs its own independent adaptive pool. Workers are distributed across NUMA groups round-robin or by workload (partition assignment can be pre-split by NUMA group index).
|
||||||
|
|
||||||
|
## Required changes
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `obikindex/src/merge.rs` | Detect NUMA topology; build N `ThreadPool`s with pinned threads; assign each pre-spawned worker to a pool; wrap `merge_partition` in `pool.install()` |
|
||||||
|
| `obikindex/src/merge.rs` | Replace `available_parallelism()` with per-NUMA core count for spawn criterion |
|
||||||
|
| `obikpartitionner/src/merge_layer.rs` | No change — `merge_partition` already works inside any Rayon context |
|
||||||
|
| `obidebruinj/src/debruijn.rs` | No change — `par_iter` and `current_num_threads` are pool-context-aware |
|
||||||
|
| `obikpartitionner/src/partition.rs` | No change — same reason |
|
||||||
|
|
||||||
|
## Platform guard
|
||||||
|
|
||||||
|
NUMA pinning is Linux-only. The fallback is the current single global pool:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
fn build_numa_pools() -> Option<Vec<rayon::ThreadPool>> { ... }
|
||||||
|
|
||||||
|
#[cfg(not(target_os = "linux"))]
|
||||||
|
fn build_numa_pools() -> Option<Vec<rayon::ThreadPool>> { None }
|
||||||
|
```
|
||||||
|
|
||||||
|
When `build_numa_pools()` returns `None` (macOS, UMA, or single-socket), `merge.rs` uses the existing code path unchanged.
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- **Partition assignment**: split partitions by NUMA group up-front (static) or use a shared queue with per-group workers stealing from a common pool? Static split is simpler; stealing is better for load balance when partitions vary widely in size.
|
||||||
|
- **Intra-NUMA adaptive criterion**: with 24 cores and ~3–5 effective workers per NUMA node, the current marginal-gain criterion needs re-tuning or can be left as-is with per-pool `n_cores = 24`.
|
||||||
|
- **I/O**: partition data (unitig files) is on a shared filesystem. With 8 concurrent NUMA groups, I/O concurrency increases 8× — need to verify the filesystem (Lustre or local SSD) can absorb it without becoming the new bottleneck.
|
||||||
+127
-49
@@ -8,7 +8,7 @@ Given a set of query sequences, determine for each sequence how many of its k-me
|
|||||||
|
|
||||||
## Input
|
## Input
|
||||||
|
|
||||||
- Query sequences in FASTA or FASTQ format (gzip supported, streaming stdin supported).
|
- Query sequences in FASTA or FASTQ format (gzip supported, streaming stdin supported). GenBank flat files are not supported at query time (only at index time).
|
||||||
- Sequences shorter than k bases are silently skipped.
|
- Sequences shorter than k bases are silently skipped.
|
||||||
- Non-ACGT characters are handled by the superkmer decomposition layer: they act as hard breaks, producing shorter superkmers (identical to the behaviour at indexing time).
|
- Non-ACGT characters are handled by the superkmer decomposition layer: they act as hard breaks, producing shorter superkmers (identical to the behaviour at indexing time).
|
||||||
|
|
||||||
@@ -19,34 +19,111 @@ Given a set of query sequences, determine for each sequence how many of its k-me
|
|||||||
The query follows the same superkmer-based partitioning strategy used at indexing time.
|
The query follows the same superkmer-based partitioning strategy used at indexing time.
|
||||||
|
|
||||||
```
|
```
|
||||||
for each query sequence:
|
for each chunk of sequences (parallel workers via obipipeline):
|
||||||
decompose into superkmers (non-ACGT breaks, same minimiser scheme as indexing)
|
build QueryBatch: decompose all sequences into s-mers via superkmers, deduplicate
|
||||||
for each superkmer:
|
allocate seq_results[seq_idx][smer_pos] = None ← per-sequence s-mer result vectors
|
||||||
route to partition p via minimiser hash
|
split superkmers by partition via minimiser hash
|
||||||
for each kmer in the superkmer:
|
for each partition p:
|
||||||
lookup kmer in partition p (MPHF → evidence check → matrix row)
|
query_partition(p, superkmers_routed_to_p)
|
||||||
accumulate result into per-sequence accumulators
|
→ load QueryLayer(s) for p
|
||||||
emit annotated sequence
|
→ for each s-mer in each superkmer: MphfLayer::find(smer)
|
||||||
|
fill seq_results[seq_idx][kmer_offset + j] from partition results
|
||||||
|
for each sequence:
|
||||||
|
apply_findere(seq_results[seq_idx], effective_z) ← per full sequence
|
||||||
|
accumulate confirmed k-mer results into acc and cov
|
||||||
|
emit annotated sequences
|
||||||
```
|
```
|
||||||
|
|
||||||
Parallelism is **per sequence**: each worker thread handles all partitions of one sequence independently. This avoids cross-thread coordination when merging partial results and keeps memory usage proportional to the number of concurrent sequences rather than to the number of partitions.
|
Superkmers that appear more than once in the batch (same sequence or across sequences) are deduplicated: each unique `RoutableSuperKmer` is queried once per partition, and the result is broadcast to every `SKDesc` entry that references it.
|
||||||
|
|
||||||
|
**Findere requires full-sequence aggregation.** `apply_findere` is applied once per sequence on the complete s-mer result vector, after all partitions have contributed. Applying it per superkmer would produce false negatives at superkmer boundaries, where the z-window spans two superkmers.
|
||||||
|
|
||||||
|
Batches are processed in parallel via `obipipeline` workers; the `--threads` flag controls the number of worker threads.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Exact vs. approximate matching
|
## Findere z-window filter
|
||||||
|
|
||||||
### Exact (default)
|
For approximate index modes, the index physically stores s-mers of size `s = k_user − z + 1`. At query time, `set_k(s)` is in effect, so queries naturally produce s-mer results. `apply_findere` then aggregates z consecutive s-mer results into one k_user-mer answer:
|
||||||
|
|
||||||
Standard MPHF lookup followed by evidence check. O(1) per k-mer.
|
```rust
|
||||||
|
fn apply_findere(
|
||||||
|
results: &[Option<Box<[u32]>>], // N s-mer results
|
||||||
|
z: usize,
|
||||||
|
n_genomes: usize,
|
||||||
|
) -> Vec<Option<Box<[u32]>>> // N − z + 1 k_user-mer results
|
||||||
|
```
|
||||||
|
|
||||||
### 1-mismatch (`--mismatch` flag)
|
Input length N (s-mers), output length N − z + 1 (k_user-mers).
|
||||||
|
|
||||||
For each k-mer of the query, generate all `3·k` single-substitution variants. Each variant is canonicalised and looked up independently in the index. If one or more variants are found, their per-genome rows are **summed** into the result for that k-mer position.
|
For each genome g independently, a sliding window of size z scans the input. Output position i is confirmed for genome g iff all z values `results[i..i+z][g]` are nonzero (`None` counts as zero for all genomes). The scan is O(n) per genome.
|
||||||
|
|
||||||
- If a k-mer matches exactly AND one of its variants also matches (distinct k-mers in the index), both contributions are accumulated.
|
Output values come from `results[i]` (leftmost s-mer of each window); genomes not confirmed are zeroed. If all genomes are zero, the position is returned as `None`.
|
||||||
- Exact and approximate matches are tracked separately in the output (see annotation schema below).
|
|
||||||
- The superkmer routing optimisation is **not** used in 1-mismatch mode: each variant is looked up directly via its own minimiser.
|
**Short sequences**: when the s-mer count is less than z, no complete window can form — `apply_findere` returns an empty vector. K-mers from sequences shorter than k_user are not emitted.
|
||||||
- Cost: up to `3·k` MPHF probes per k-mer position vs. 1 in exact mode.
|
|
||||||
|
**Exact indexes**: `z = 1`, `apply_findere` is a passthrough (output length = input length).
|
||||||
|
|
||||||
|
### Effective z at query time
|
||||||
|
|
||||||
|
`effective_z` is resolved at the start of `run()`:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let effective_z = args.findere_z.unwrap_or_else(|| match idx.meta().config.evidence {
|
||||||
|
IndexMode::Approx { z, .. } | IndexMode::Hybrid { z, .. } => z as usize,
|
||||||
|
IndexMode::Exact => 1,
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
The `-z` CLI option overrides the index metadata value. A higher z increases stringency (lower FP, some true positives may be discarded at sequence ends); a lower z increases sensitivity.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Layer lookup: `MphfLayer::find`
|
||||||
|
|
||||||
|
`MphfLayer::open(dir, mode: &IndexMode)` receives the mode from `PartitionMeta` — no per-layer file is read. The caller (`QueryLayer`) never chooses the dispatch path: it is fixed at open time by `LayerEvidence`. See [obilayeredmap](../implementation/obilayeredmap.md) for the full `find` / `find_strict` API.
|
||||||
|
|
||||||
|
### `QueryLayer` variant selection
|
||||||
|
|
||||||
|
`QueryLayer::open` in `query_layer.rs` selects the data matrix to pair with `MphfLayer`:
|
||||||
|
|
||||||
|
| Condition | Variant | Data returned per k-mer |
|
||||||
|
|---|---|---|
|
||||||
|
| `with_counts=true` and `counts/` exists | `Count` | raw count per genome |
|
||||||
|
| `presence/` exists | `Presence` | 0/1 per genome (bit matrix) |
|
||||||
|
| only `counts/` exists | `Count` | counts used as-is |
|
||||||
|
| neither exists | `SetOnly` | 1 for every genome |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Presence / count mode at query time
|
||||||
|
|
||||||
|
The `--force-presence` flag and `--presence-threshold` control how per-genome values are accumulated, independently of what the index stores:
|
||||||
|
|
||||||
|
```
|
||||||
|
genome_totals[g] += if presence { u32::from(v >= threshold) } else { v }
|
||||||
|
```
|
||||||
|
|
||||||
|
`presence` is true when `--force-presence` is set or when the index has no counts (`!with_counts`). The default `presence_threshold` is 1, so any nonzero count counts as a match.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Coverage vectors (`--detail`)
|
||||||
|
|
||||||
|
When `--detail` is requested, a 3-D accumulator `cov[seq_idx][genome][kmer_pos]` is allocated after all partitions are queried, with dimensions derived from `n_kmers_out = n_smers − z + 1` (k_user-mer positions, not s-mer positions):
|
||||||
|
|
||||||
|
```
|
||||||
|
cov[seq_idx][g][pos] += contribution
|
||||||
|
where pos is the k_user-mer index in the filtered (post-Findere) vector
|
||||||
|
```
|
||||||
|
|
||||||
|
Coverage reflects confirmed k_user-mers only. The vectors are emitted in the JSON annotation under the key `"coverage"`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## `kmer_missing` semantics
|
||||||
|
|
||||||
|
`kmer_missing` counts k_user-mer positions where the first s-mer (`seq_results[seq_idx][pos]`) is `None` — i.e. absent from the index entirely. K-mers where the z-window fails because a later s-mer is absent or zero are not counted as missing (the first s-mer being present is used as proxy for index membership).
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -55,57 +132,58 @@ For each k-mer of the query, generate all `3·k` single-substitution variants. E
|
|||||||
Output sequences are written in **OBITools4 format**: the original sequence with a JSON annotation map in the title line.
|
Output sequences are written in **OBITools4 format**: the original sequence with a JSON annotation map in the title line.
|
||||||
|
|
||||||
```
|
```
|
||||||
>read_id {"kmer_total":150,"kmer_found":59,...}
|
>read_id {"kmer_count":59,"kmer_strict_matches":{"genome_a":42,"genome_b":7}}
|
||||||
ATCGATCG...
|
ATCGATCG...
|
||||||
```
|
```
|
||||||
|
|
||||||
Genome order in all list-valued annotations follows the genome order recorded in `index.meta`.
|
With `--detail`:
|
||||||
|
|
||||||
|
```
|
||||||
|
>read_id {"kmer_count":59,"kmer_strict_matches":{...},"coverage":{"genome_a":[0,1,2,...],...}}
|
||||||
|
ATCGATCG...
|
||||||
|
```
|
||||||
|
|
||||||
|
Genome keys follow the iteration order of `meta.genomes`.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Annotation schema
|
## Annotation schema
|
||||||
|
|
||||||
### Summary mode (default)
|
|
||||||
|
|
||||||
| Key | Type | Condition | Semantics |
|
| Key | Type | Condition | Semantics |
|
||||||
|---|---|---|---|
|
|---|---|---|---|
|
||||||
| `kmer_total` | int | always | total k-mers in the (masked) sequence |
|
| `kmer_count` | int | always | k-mers confirmed (post-Findere) with at least one genome match |
|
||||||
| `kmer_found` | int | always | k-mers with at least one match (exact or approx) |
|
| `kmer_missing` | int | `--count-missing` | k-mers absent from the index entirely (pre-Findere None) |
|
||||||
| `kmer_missing` | int | `--count-missing` | k-mers absent from the index |
|
| `kmer_strict_matches` | object | always | per-genome accumulated value (label → count or 0/1) |
|
||||||
| `kmer_match` | list[int] | always | per-genome matched k-mer count (exact + approx) |
|
| `coverage` | object | `--detail` | per-genome array of per-position contributions (label → [u32]) |
|
||||||
| `kmer_match_exact` | list[int] | `--mismatch` | per-genome exact match count |
|
|
||||||
| `kmer_match_approx` | list[int] | `--mismatch` | per-genome approx match count |
|
|
||||||
| `count_match` | list[int] | count index | per-genome sum of index counts for matched k-mers |
|
|
||||||
|
|
||||||
`kmer_match[i]` is the number of k-mer positions in the query that contribute at least one match to genome i. In 1-mismatch mode, a single k-mer position can contribute to multiple genomes if several of its variants are present in the index.
|
`kmer_count + kmer_missing` ≤ total k_user-mers in the sequence. The gap corresponds to k_user-mers whose z-window was not fully confirmed (at least one s-mer absent or zero for all genomes) but whose first s-mer was present in the index.
|
||||||
|
|
||||||
`count_match[i]` sums raw index counts across all matched k-mer positions for genome i. Only meaningful for count indexes.
|
|
||||||
|
|
||||||
### Detail mode (`--detail`)
|
|
||||||
|
|
||||||
All summary keys, plus per-position coverage vectors — one list per genome, length `len(sequence) − k + 1`:
|
|
||||||
|
|
||||||
| Key | Type | Condition | Semantics |
|
|
||||||
|---|---|---|---|
|
|
||||||
| `cov_<i>` | list[int] | `--detail` | coverage at each k-mer position for genome i; raw count (count index) or 0/1 (presence index); 0 if absent |
|
|
||||||
| `cov_exact_<i>` | list[int] | `--detail` + `--mismatch` | exact-match contribution per position |
|
|
||||||
| `cov_approx_<i>` | list[int] | `--detail` + `--mismatch` | approx-match contribution per position |
|
|
||||||
|
|
||||||
Genome indices in key names are 0-based integers matching the `index.meta` genome order. Genome labels are not used as key names to avoid issues with special characters in long or complex genome identifiers.
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## CLI
|
## CLI
|
||||||
|
|
||||||
```
|
```
|
||||||
obikmer query -i <index> [--summary | --detail] [--mismatch] [--count-missing] <query.fa>
|
obikmer query <index> [--detail] [--mismatch] [--count-missing]
|
||||||
|
[--force-presence] [--presence-threshold <n>]
|
||||||
|
[-z <z>] [-T <threads>]
|
||||||
|
<query.fa> [<query2.fa> ...]
|
||||||
```
|
```
|
||||||
|
|
||||||
`--summary` is the default; `--detail` implies `--summary` (all summary keys are always present).
|
| Option | Default | Semantics |
|
||||||
|
|---|---|---|
|
||||||
|
| `-z` / `--findere-z` | from index metadata | Override Findere z parameter |
|
||||||
|
| `--detail` | off | Emit per-position coverage vectors in JSON |
|
||||||
|
| `--count-missing` | off | Add `kmer_missing` field to JSON |
|
||||||
|
| `--force-presence` | off | Report 0/1 per genome regardless of index counts |
|
||||||
|
| `--presence-threshold` | 1 | Minimum count to declare genome present |
|
||||||
|
| `-T` / `--threads` | all CPUs | Worker threads |
|
||||||
|
|
||||||
|
`--mismatch` is accepted but currently ignored with a warning on stderr.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Future work
|
## Future work
|
||||||
|
|
||||||
- **Read classification** (`--classify`): assign each read to the genome with the highest `kmer_match` score; emit as a single annotation key.
|
- **`--mismatch`**: 1-mismatch approximate matching — generate `3·k` single-substitution variants per k-mer, look each up independently.
|
||||||
- **Whitelist / blacklist filtering**: accept or reject sequences based on whether their k-mer match score for a designated set of genomes exceeds a threshold.
|
- **Read classification** (`--classify`): assign each read to the genome with the highest match score.
|
||||||
|
- **Whitelist / blacklist filtering**: threshold-based accept/reject on per-genome match scores.
|
||||||
|
|||||||
@@ -0,0 +1,16 @@
|
|||||||
|
<!-- coverage sidecar — ne pas ajouter au nav mkdocs -->
|
||||||
|
# Coverage: architecture/query.md
|
||||||
|
|
||||||
|
## Code couvert
|
||||||
|
|
||||||
|
- `obikmer/src/cmd/query.rs` — commande query, format de sortie
|
||||||
|
- `obikpartitionner/src/query_layer.rs` — routage de la requête à travers les partitions
|
||||||
|
- `obiread/src/lib.rs` — lecture des séquences d'entrée pour la requête
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
RISQUE DE DÉRIVE. Vérifier :
|
||||||
|
- La commande `unitig` a été modifiée pour utiliser `open_sequential()` — vérifier si query est concerné
|
||||||
|
- `find_exact` / `find_approx` / `find` générique ont été ajoutés dans `MphfLayer` — le chemin de requête a changé
|
||||||
|
- Si l'index est approximatif (Approx), la requête peut produire des faux positifs : la doc le mentionne-t-elle ?
|
||||||
|
- Format de sortie CSV (`obikindex/src/csv.rs` ou équivalent) à vérifier
|
||||||
@@ -0,0 +1,105 @@
|
|||||||
|
# Rebuild / filter — column-first design
|
||||||
|
|
||||||
|
## Problem with the current two-pass design
|
||||||
|
|
||||||
|
`rebuild_partition` currently makes **two full passes** over source data:
|
||||||
|
|
||||||
|
**Pass 1** — read unitigs → MPHF lookup (source) → read row (108 values) → apply filter → push kmer into `GraphDeBruijn`, **discard row**.
|
||||||
|
|
||||||
|
**Pass 2** — read unitigs again → MPHF lookup again → read row again → for each passing kmer, look up slot in new MPHF → fill column builders.
|
||||||
|
|
||||||
|
Both passes do random access into the source matrix: for each kmer, the MPHF returns a slot, then we read 108 values scattered across 108 column positions. This is cache-hostile even with a packed matrix (`.pbmx`), because the matrix is column-major: consecutive row reads jump across the file.
|
||||||
|
|
||||||
|
## Memory budget
|
||||||
|
|
||||||
|
The `keep` bitvector costs **1 bit per slot**. With 256 partitions and realistic kmer counts, each partition holds at most a few tens of millions of slots → a few MB per bitvector. Even in the absolute worst case (800 M slots), it is only about 100 MB. This is negligible.
|
||||||
|
|
||||||
|
The `slot_map` option (Option B, 8–16 bytes per slot) is heavier but still bounded: at 15 M slots and 8 bytes, that is 120 MB per partition, acceptable for a single worker.
|
||||||
|
|
||||||
|
## Key observation
|
||||||
|
|
||||||
|
**The filter operates on column values, not on kmers.** A filter like `--max-outgroup-count 0` only needs to know, for each slot, whether any outgroup column is non-zero. It does not need to know which kmer occupies that slot.
|
||||||
|
|
||||||
|
This means filtering can be done as a **sequential column scan** that produces a `keep: BitVec[n_slots]` — no MPHF lookups, no kmer knowledge, perfectly cache-friendly.
|
||||||
|
|
||||||
|
## Proposed single-scan design
|
||||||
|
|
||||||
|
### Step 1 — column scan → `keep` bitvector
|
||||||
|
|
||||||
|
```
|
||||||
|
for each column c in source matrix:
|
||||||
|
read column c sequentially (one mmap range)
|
||||||
|
update keep[slot] according to filter contribution of column c
|
||||||
|
```
|
||||||
|
|
||||||
|
For `GroupQuorumFilter` with ingroup/outgroup:
|
||||||
|
- ingroup columns: count presence per slot → `ingroup_count[slot]`
|
||||||
|
- outgroup columns: `keep[slot] &= (value[slot] == 0)` (early-exit possible)
|
||||||
|
|
||||||
|
Result: `keep: BitVec` of size `n_slots`, computed with purely sequential IO.
|
||||||
|
|
||||||
|
### Step 2 — unitig scan → kept kmers + new MPHF
|
||||||
|
|
||||||
|
```
|
||||||
|
for each kmer in unitig files:
|
||||||
|
old_slot = old_MPHF(kmer)
|
||||||
|
if keep[old_slot]:
|
||||||
|
push kmer into new GraphDeBruijn
|
||||||
|
record (old_slot, kmer) ← or just old_slot in order
|
||||||
|
```
|
||||||
|
|
||||||
|
Build new MPHF from `GraphDeBruijn` via `materialize_layer`.
|
||||||
|
|
||||||
|
### Step 3 — fill new matrix
|
||||||
|
|
||||||
|
Two sub-options:
|
||||||
|
|
||||||
|
**Option A — from recorded (old_slot, kmer) pairs:**
|
||||||
|
|
||||||
|
```
|
||||||
|
for each (old_slot, kmer) in recorded list:
|
||||||
|
new_slot = new_MPHF(kmer)
|
||||||
|
for each column c:
|
||||||
|
new_matrix[new_slot, c] = old_matrix[old_slot, c]
|
||||||
|
```
|
||||||
|
|
||||||
|
Memory cost: `n_kept × (8 + 8)` bytes for `(old_slot: usize, kmer: CanonicalKmer)`.
|
||||||
|
For species-specific filters, `n_kept` is small. For unfiltered rebuild, `n_kept = n_slots`.
|
||||||
|
|
||||||
|
**Option B — column-by-column copy using old→new slot mapping:**
|
||||||
|
|
||||||
|
Precompute `slot_map: Vec<Option<usize>>` of size `n_slots`:
|
||||||
|
- For each kmer in unitig file: `slot_map[old_MPHF(kmer)] = Some(new_MPHF(kmer))`
|
||||||
|
|
||||||
|
Then for each source column:
|
||||||
|
```
|
||||||
|
read source column sequentially
|
||||||
|
for each slot where slot_map[slot] = Some(new_slot):
|
||||||
|
write value to new column at new_slot
|
||||||
|
```
|
||||||
|
|
||||||
|
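Memory cost: `n_slots × size_of::<Option<usize>>()` for the slot map — 16 bytes per source slot on 64-bit targets, or 8 bytes if a sentinel-encoded `usize` is used instead of `Option<usize>`.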
|
||||||
|
IO pattern: sequential read of each source column → random write into new column builders.
|
||||||
|
|
||||||
|
Option B avoids storing kmer values and works uniformly regardless of filter selectivity.
|
||||||
|
|
||||||
|
## Comparison
|
||||||
|
|
||||||
|
| | Current | Proposed |
|
||||||
|
|---|---|---|
|
||||||
|
| Disk reads | 2× unitigs + 2× random matrix | 1× columns (sequential) + 1× unitigs |
|
||||||
|
| MPHF lookups (source) | 2× N_kmers | 1× N_kmers (step 2 unitig scan); 0 during the column scan (step 1) |
|
||||||
|
| Cache behavior | poor (random row access) | good (sequential column scan) |
|
||||||
|
| Extra memory | none | slot_map (option B) or (old_slot, kmer) list (option A) |
|
||||||
|
|
||||||
|
## Files to modify
|
||||||
|
|
||||||
|
- `src/obikpartitionner/src/rebuild_layer.rs` — `rebuild_partition` and `iter_src_layers`
|
||||||
|
- Possibly `src/obicompactvec/` — add column iterator API if not already present
|
||||||
|
- `src/obilayeredmap/` — check if per-column sequential access is exposed on `SrcLayerData`
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- Does `SrcLayerData` expose per-column sequential iteration, or only `lookup(kmer, n_genomes)` random access?
|
||||||
|
- For option B: are new column builders writable in random-slot order (i.e. `set_val(slot, value)` without sequential constraint)?
|
||||||
|
- For `GroupQuorumFilter` specifically: can the filter be decomposed into independent per-column contributions, or does it need the full row?
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
<!-- coverage sidecar — ne pas ajouter au nav mkdocs -->
|
||||||
|
# Coverage: architecture/sequences/invariant.md
|
||||||
|
|
||||||
|
## Code couvert
|
||||||
|
|
||||||
|
- `obikseq/src/sequence.rs` — invariants de représentation des séquences (ACGT, longueur max)
|
||||||
|
- `obikseq/src/unitig.rs` — type Unitig, contrainte MAX_KMERS_PER_CHUNK (255 kmers par chunk)
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
Document court et stable. Vérifier que la limite de 256 nucléotides (ou 255 kmers) par chunk
|
||||||
|
est toujours la même dans `obiskio::MAX_KMERS_PER_CHUNK`.
|
||||||
@@ -1,20 +1,30 @@
|
|||||||
# Chunk reader — implementation
|
# Chunk reader — implementation
|
||||||
|
|
||||||
The `obiread` crate provides a streaming iterator that reads FASTA or FASTQ files in fixed-size blocks and yields self-contained chunks, each ending on a complete sequence record boundary. Chunks are consumed in parallel by downstream workers.
|
`obiread` exposes two distinct sequence reading paths, each optimised for a different use case.
|
||||||
|
|
||||||
## Output type: rope
|
## Two reading paths
|
||||||
|
|
||||||
Each chunk is a `Vec<Bytes>` — a **rope**: a list of reference-counted byte slices that are not necessarily contiguous in memory. The consumer iterates over the slices in order.
|
| Path | API | Output unit | Per-record identity | Use case |
|
||||||
|
|------|-----|-------------|---------------------|----------|
|
||||||
|
| **Record path** | `read_sequence_chunks` → `parse_chunk` | `SeqRecord` (id + raw sequence + normalised rope) | yes | `query` — must read complete records |
|
||||||
|
| **Stream path** | `open_nuc_stream` | `NucPage` (flat normalised byte buffer) | no | `index`, `superkmer` — bulk throughput |
|
||||||
|
|
||||||
Using `bytes::Bytes` means the split at the record boundary is O(1): `Bytes::split_to(n)` adjusts a reference counter, not memory. No `memcpy` in the common case.
|
The record path uses `Rope`-backed chunks and is described in detail below.
|
||||||
|
The stream path (`NucStream` / `NucPage`) is described in the scatter section of [pipeline](pipeline.md).
|
||||||
|
|
||||||
## Allocation policy
|
---
|
||||||
|
|
||||||
| Case | Cost |
|
## Record path: chunk reader
|
||||||
|------|------|
|
|
||||||
| Boundary found in the current block (common) | zero extra allocation — `split_to` only |
|
The chunk reader reads FASTA or FASTQ files in fixed-size blocks and yields self-contained chunks, each ending on a complete sequence record boundary. `parse_chunk` then converts each chunk into a `Vec<SeqRecord>`, where each record carries its identifier, raw sequence bytes, and a normalised rope ready for superkmer building.
|
||||||
| Boundary straddles multiple blocks (sequence > block size, rare) | one allocation to pack the rope into a flat buffer |
|
|
||||||
| EOF flush | zero extra allocation |
|
This path is mandatory for `query`, where superkmers must be tracked back to their originating sequence (id, kmer offset) for output annotation.
|
||||||
|
|
||||||
|
## Output type: Rope
|
||||||
|
|
||||||
|
Each chunk is a `Rope` — a segmented byte sequence: a `Vec` of blocks, where each block is a `Vec<Cell<u8>>`. The consumer iterates over the blocks via a forward or backward cursor.
|
||||||
|
|
||||||
|
`Rope::split_off(pos)` splits at an absolute byte offset in O(log n) (binary search over block-start index). If `pos` falls inside a block, that block is split in two via `Vec::split_off` — no `memcpy` in the common case.
|
||||||
|
|
||||||
## SeqChunkIter
|
## SeqChunkIter
|
||||||
|
|
||||||
@@ -22,7 +32,7 @@ Using `bytes::Bytes` means the split at the record boundary is O(1): `Bytes::spl
|
|||||||
pub struct SeqChunkIter<R: Read> { /* private */ }
|
pub struct SeqChunkIter<R: Read> { /* private */ }
|
||||||
|
|
||||||
impl<R: Read> Iterator for SeqChunkIter<R> {
|
impl<R: Read> Iterator for SeqChunkIter<R> {
|
||||||
type Item = io::Result<Vec<Bytes>>;
|
type Item = io::Result<Rope>;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn fasta_chunks<R: Read>(source: R) -> SeqChunkIter<R>
|
pub fn fasta_chunks<R: Read>(source: R) -> SeqChunkIter<R>
|
||||||
@@ -32,22 +42,21 @@ pub fn fastq_chunks<R: Read>(source: R) -> SeqChunkIter<R>
|
|||||||
`next()` loop:
|
`next()` loop:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
1. read one block of block_size bytes → push onto rope
|
1. read one block of block_size bytes → push onto Rope
|
||||||
2. probe check: if the boundary marker ("\n>" or "\n@") is absent from the
|
2. call splitter(rope) → Option<abs_offset>
|
||||||
last block, skip the splitter (avoids a full backward scan for nothing)
|
if Some(pos):
|
||||||
3. call splitter on last block
|
tail = rope.split_off(pos) ← O(log n), may split one block
|
||||||
if found at offset n:
|
chunk = mem::replace(&mut rope, tail)
|
||||||
remainder = last_block.split_to(n) ← O(1), zero copy
|
return Some(Ok(chunk))
|
||||||
return std::mem::take(&mut self.rope) ← the chunk
|
3. if EOF and rope non-empty: return Some(Ok(rope)) as final chunk
|
||||||
4. if rope.len() > 1 (multi-block accumulation):
|
4. if EOF and rope empty: return None
|
||||||
pack rope into one flat buffer ← one alloc
|
|
||||||
retry splitter on flat buffer
|
|
||||||
5. if EOF: flush remaining rope as final chunk
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The `Splitter` function signature is `fn(&Rope) -> Option<usize>`. It returns the absolute byte offset of the start of the last complete record, or `None` if no boundary was found in the accumulated rope (need more data).
|
||||||
|
|
||||||
## Boundary detection — FASTA
|
## Boundary detection — FASTA
|
||||||
|
|
||||||
Backward scan with a 2-state machine. Searches for `>` immediately preceded by `\n` or `\r`:
|
Backward scan with a 2-state machine. Searches (right to left) for `>` followed by `\n` or `\r` (i.e., a `>` that is preceded by a newline in forward order):
|
||||||
|
|
||||||
```mermaid
|
```mermaid
|
||||||
stateDiagram-v2
|
stateDiagram-v2
|
||||||
@@ -58,13 +67,13 @@ stateDiagram-v2
|
|||||||
FoundGt --> [*] : '\\n' / '\\r' ✓
|
FoundGt --> [*] : '\\n' / '\\r' ✓
|
||||||
```
|
```
|
||||||
|
|
||||||
Returns the byte offset of the `>` that starts the last complete record.
|
Returns the byte offset of the `>` that starts the last complete record. Returns `None` if only one `>` is found (cannot confirm there is a prior complete record).
|
||||||
|
|
||||||
## Boundary detection — FASTQ
|
## Boundary detection — FASTQ
|
||||||
|
|
||||||
FASTQ records have a rigid 4-line structure (`@header`, sequence, `+`, quality). The `@` character (ASCII 64, Phred score 31) can appear legitimately in quality lines, making any forward heuristic unreliable. The backward scanner verifies the full structural context before accepting a candidate `@`.
|
FASTQ records have a rigid 4-line structure (`@header`, sequence, `+`, quality). The `@` character (ASCII 64, Phred score 31) can appear legitimately in quality lines, making any forward heuristic unreliable. The backward scanner verifies the full structural context before accepting a candidate `@`.
|
||||||
|
|
||||||
7-state machine (port of Go's `EndOfLastFastqEntry`), scanning from **right to left**. Each time a `+` is found, its position is saved as `restart`; any state mismatch resets the scan to that position.
|
7-state machine (states 0–6), scanning from **right to left**. Each time a `+` is found, its position is saved as `restart`; any state mismatch resets the scan to that position.
|
||||||
|
|
||||||
```mermaid
|
```mermaid
|
||||||
stateDiagram-v2
|
stateDiagram-v2
|
||||||
|
|||||||
@@ -0,0 +1,12 @@
|
|||||||
|
<!-- coverage sidecar — ne pas ajouter au nav mkdocs -->
|
||||||
|
# Coverage: implementation/chunkreader.md
|
||||||
|
|
||||||
|
## Code couvert
|
||||||
|
|
||||||
|
- `obiread/src/chunk.rs` — SeqChunkIter, détection de frontières FASTA/FASTQ, state machines
|
||||||
|
- `obikrope/src/lib.rs` — type Rope (`Vec` de blocs `Vec<Cell<u8>>`), opérations zero-copy
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
Document stable (la stratégie de chunking rope ne devrait pas avoir changé).
|
||||||
|
Vérifier que le split FASTA/FASTQ reste correct si de nouveaux formats ont été ajoutés.
|
||||||
@@ -0,0 +1,181 @@
|
|||||||
|
# Approximate evidence: fingerprint-based index
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
`evidence.bin` maps each MPHF slot to the position of the k-mer that owns it,
|
||||||
|
enabling zero-FP verification. On the bacterial BCT dataset (2048 partitions,
|
||||||
|
k=31, ~33 M k-mers/partition) it accounts for 66 % of the lookup-layer footprint:
|
||||||
|
|
||||||
|
| file | size/partition | fraction |
|
||||||
|
|---|---|---|
|
||||||
|
| evidence.bin | 132 MB | 66 % |
|
||||||
|
| unitigs.bin | 58 MB | 29 % |
|
||||||
|
| mphf.bin | 10 MB | 5 % |
|
||||||
|
|
||||||
|
`evidence.bin` is a bijection from MPHF-space to unitig-position-space and
|
||||||
|
costs at minimum ⌈log₂ N⌉ bits per slot — an information-theoretic floor with
|
||||||
|
only ~22 % packing headroom. Compression is not a path to elimination.
|
||||||
|
|
||||||
|
The approximate index replaces `evidence.bin` + `unitigs.bin.idx` with a
|
||||||
|
`fingerprint.bin` file. The MPHF and `unitigs.bin` are kept unchanged. Set
|
||||||
|
operations still require an exact index; the approximate index targets query
|
||||||
|
workloads that can tolerate a bounded false-positive rate.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## The Findere model
|
||||||
|
|
||||||
|
A b-bit fingerprint stored per MPHF slot provides the discrimination that
|
||||||
|
`evidence.bin` would otherwise provide through full k-mer reconstruction.
|
||||||
|
|
||||||
|
For a foreign k-mer query, the MPHF maps it to some slot `s`. The fingerprint
|
||||||
|
stored at `s` belongs to the legitimate k-mer at that slot. The FP event is:
|
||||||
|
|
||||||
|
```
|
||||||
|
P(FP per k-mer) = 1 / 2^b
|
||||||
|
```
|
||||||
|
|
||||||
|
The Findere trick reduces the indexed k-mer size. When the user specifies k_user
|
||||||
|
and z, the index physically stores k-mers of size `s = k_user − z + 1`. At query
|
||||||
|
time, the same s-mer size is used. After collecting per-position s-mer results
|
||||||
|
over the full query sequence, a sliding window of size z aggregates z consecutive
|
||||||
|
s-mer hits into one confirmed k_user-mer hit, reducing the per-window FP rate:
|
||||||
|
|
||||||
|
```
|
||||||
|
P(FP per k_user-mer) = 1 / 2^(b·z)
|
||||||
|
```
|
||||||
|
|
||||||
|
`IndexConfig::kmer_size` stores `s = k_user − z + 1`, not k_user. Both indexing
|
||||||
|
and querying use this stored size via `set_k(idx.kmer_size())`.
|
||||||
|
|
||||||
|
Parameters b and z are stored in `layer_meta.json` (`EvidenceKind::Approx { b, z }`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## `FingerprintVec` on disk
|
||||||
|
|
||||||
|
`fingerprint.bin` layout:
|
||||||
|
|
||||||
|
```
|
||||||
|
magic: b"FPVF" (4 bytes)
|
||||||
|
b: u8 (bits per slot, 1..=64)
|
||||||
|
padding: [0u8; 3]
|
||||||
|
n: u64 LE (number of slots)
|
||||||
|
data: packed bits, ceil(n·b/8) bytes, Lsb0 order
|
||||||
|
```
|
||||||
|
|
||||||
|
`FingerprintVec` is memory-mapped. The match check against a query k-mer:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
fn matches(&self, slot: usize, fingerprint: u64) -> bool {
|
||||||
|
self.get(slot) == (fingerprint & self.mask)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`build_approx_evidence` iterates `unitigs.bin` sequentially, writes
|
||||||
|
`kmer.seq_hash()` into the slot assigned by the MPHF, then saves `fingerprint.bin`
|
||||||
|
and `layer_meta.json`. No `.idx` file is produced; random access into
|
||||||
|
`unitigs.bin` is not needed.
|
||||||
|
|
||||||
|
At query time, `find_approx` in `MphfLayer`:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let slot = self.mphf.index(&kmer.raw());
|
||||||
|
if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## `EvidenceKind` and metadata
|
||||||
|
|
||||||
|
`layer_meta.json` records which evidence bundle is present:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub enum EvidenceKind {
|
||||||
|
Exact,
|
||||||
|
Approx { b: u8, z: u8 },
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`MphfLayer::open` reads this tag and dispatches `find` to `find_exact` or
|
||||||
|
`find_approx` transparently. `find_exact` panics on an approximate layer;
|
||||||
|
`find_approx` panics on an exact layer — mode mixing is a programming error.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Parameter resolution (`resolve_approx_params`)
|
||||||
|
|
||||||
|
The identity `b·z = ⌈−log₂(fp)⌉` lets any two of (b, z, fp) derive the third.
|
||||||
|
`resolve_approx_params` implements a 2-of-3 rule with conservative ceiling
|
||||||
|
rounding:
|
||||||
|
|
||||||
|
| given | derived |
|
||||||
|
|---|---|
|
||||||
|
| b, z | fp = 1/2^(b·z) |
|
||||||
|
| z, fp | b = ⌈−log₂(fp) / z⌉ |
|
||||||
|
| b, fp | z = ⌈−log₂(fp) / b⌉ |
|
||||||
|
| z only | b = 8 (default), fp derived |
|
||||||
|
| b only | z = 1 (default), fp derived |
|
||||||
|
| fp only | b = 8 (default), z derived |
|
||||||
|
| none | b = 8, z = 1, fp = 1/256 |
|
||||||
|
|
||||||
|
When all three are given, b and z are authoritative and fp is recomputed.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## CLI flags
|
||||||
|
|
||||||
|
Both `index` and `reindex` accept the same flags:
|
||||||
|
|
||||||
|
| flag | type | meaning |
|
||||||
|
|---|---|---|
|
||||||
|
| `--approx` | bool | enable fingerprint evidence |
|
||||||
|
| `--evidence-bits` (`b`) | u8 | fingerprint bits per slot |
|
||||||
|
| `-z` | u8 | Findere z parameter |
|
||||||
|
| `--fp` | f64 | target FP rate per z-window |
|
||||||
|
| `--block-size` | usize | unitig block size for exact `.idx`; ignored in approx mode |
|
||||||
|
|
||||||
|
`--approx` must be set explicitly; `--evidence-bits`, `-z`, and `--fp` are optional and
|
||||||
|
resolved by the 2-of-3 rule. Omitting all three produces b=8, z=1.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## `reindex` command
|
||||||
|
|
||||||
|
`reindex` converts an existing index between exact and approximate evidence
|
||||||
|
in-place across all partitions and layers, running partitions in parallel via
|
||||||
|
Rayon.
|
||||||
|
|
||||||
|
Conversion to approximate (`--approx`):
|
||||||
|
|
||||||
|
- Builds `fingerprint.bin` from `unitigs.bin` + `mphf.bin`.
|
||||||
|
- Removes `evidence.bin` and `unitigs.bin.idx`.
|
||||||
|
- Updates `layer_meta.json` with `EvidenceKind::Approx { b, z }`.
|
||||||
|
|
||||||
|
Conversion to exact (default, no `--approx`):
|
||||||
|
|
||||||
|
- Builds `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`.
|
||||||
|
- Removes `fingerprint.bin`.
|
||||||
|
- Updates `layer_meta.json` with `EvidenceKind::Exact`.
|
||||||
|
|
||||||
|
The root `index.meta` is updated with the new evidence kind on success.
|
||||||
|
`mphf.bin` and `unitigs.bin` are never modified.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## `estimate` command
|
||||||
|
|
||||||
|
`estimate` is a dry-run that resolves and prints (b, z, fp) without touching
|
||||||
|
any index. It accepts the same `--evidence-bits`, `-z`, and `--fp` flags and
|
||||||
|
additionally accepts `-k` to display the effective indexed k-mer length:
|
||||||
|
|
||||||
|
```
|
||||||
|
k (user): 31
|
||||||
|
k (indexed, s=k-z+1): 27
|
||||||
|
z: 5
|
||||||
|
evidence bits (b): 8
|
||||||
|
FP per s-mer: 3.906e-3 (1/2^8)
|
||||||
|
FP per k-mer window: 9.095e-13 (1/2^(8·5))
|
||||||
|
```
|
||||||
|
|
||||||
|
Useful for choosing parameters before committing to an index build.
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
<!-- coverage sidecar — ne pas ajouter au nav mkdocs -->
|
||||||
|
# Coverage: implementation/evidence_elimination.md
|
||||||
|
|
||||||
|
## Code couvert
|
||||||
|
|
||||||
|
- `obilayeredmap/src/fingerprint.rs` — FingerprintVec, FingerprintVecWriter, stockage b bits/slot, matches()
|
||||||
|
- `obilayeredmap/src/mphf_layer.rs` — build_approx_evidence(dir, b, z), find_approx()
|
||||||
|
- `obilayeredmap/src/meta.rs` — EvidenceKind::Approx { b, z }, LayerMeta
|
||||||
|
- `obikindex/src/reindex.rs` — KmerIndex::reindex(), conversion exact↔approx en place
|
||||||
|
- `obikmer/src/cmd/reindex.rs` — CLI reindex, options --approx, -z, --evidence-bits, --fp, --block-size
|
||||||
|
- `obikmer/src/cmd/index.rs` — resolve_approx_params(), options --approx, -z, --evidence-bits, --fp
|
||||||
|
- `obikmer/src/cmd/estimate.rs` — commande estimate (dry-run des paramètres)
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
Ce document était à l'origine une discussion de design (4 approches). L'implémentation
|
||||||
|
a maintenant convergé vers l'approche fingerprint (Findere-style).
|
||||||
|
FORT RISQUE DE DÉRIVE — le contenu est probablement un mélange de design et d'implémentation :
|
||||||
|
- Le modèle FP = 1/2^(b·z) et les règles de résolution (2-of-3 parmi b, z, fp) sont implémentés
|
||||||
|
- La commande `reindex` permet la conversion a posteriori exact↔approx
|
||||||
|
- La commande `estimate` fait le dry-run des paramètres
|
||||||
|
Cette page doit être réécrite pour documenter l'implémentation Findere réelle plutôt que les alternatives abandonnées.
|
||||||
@@ -0,0 +1,279 @@
|
|||||||
|
# Kmer filtering and ingroup/outgroup predicates
|
||||||
|
|
||||||
|
The `filter`, `dump`, and `unitig` commands share the same filtering system,
|
||||||
|
implemented as a shared `FilterArgs` clap argument group embedded in each command
|
||||||
|
via `#[command(flatten)]`. Filters select k-mers based on per-genome quorum
|
||||||
|
counts, optionally restricted to **ingroup** and **outgroup** genome sets derived
|
||||||
|
from genome metadata. All rules described here apply identically to all three commands.
|
||||||
|
|
||||||
|
`filter` additionally accepts `--min-total-count` / `--max-total-count` filters
|
||||||
|
that operate on the sum of counts across all genomes.
|
||||||
|
|
||||||
|
## Predicate syntax
|
||||||
|
|
||||||
|
Each `--ingroup` and `--outgroup` flag takes a predicate of the form:
|
||||||
|
|
||||||
|
```
|
||||||
|
key OP value1|value2|…
|
||||||
|
```
|
||||||
|
|
||||||
|
| Operator | Meaning |
|
||||||
|
|----------|---------|
|
||||||
|
| `*` or `all` | wildcard — every genome matches unconditionally |
|
||||||
|
| `key=v1\|v2` | exact match — genome's `key` equals `v1` or `v2` |
|
||||||
|
| `key!=v` | negation — genome's `key` equals none of the values |
|
||||||
|
| `key~path` | path ancestry — genome's `key` is `path` or a descendant |
|
||||||
|
| `key!~path` | not a descendant |
|
||||||
|
|
||||||
|
Multiple values separated by `|` are always OR-ed within the predicate.
|
||||||
|
|
||||||
|
### Path matching (`~` and `!~`)
|
||||||
|
|
||||||
|
Metadata values can represent hierarchical concept paths such as
|
||||||
|
`/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
|
||||||
|
|
||||||
|
Stored taxonomy values always start with `/` (the root of the path).
|
||||||
|
Query patterns do **not** need to start with `/` — a leading `/` is an optional
|
||||||
|
start anchor, not a requirement.
|
||||||
|
|
||||||
|
| Pattern form | Semantics |
|
||||||
|
|---|---|
|
||||||
|
| `A/B` | contiguous sub-path A then B, anywhere in the value |
|
||||||
|
| `/A/B` | value starts with A then B |
|
||||||
|
| `A/B$` | value ends with A then B |
|
||||||
|
| `/A/B$` | value is exactly A then B |
|
||||||
|
| `A@x/B` | A with class `x` followed by B with any class |
|
||||||
|
|
||||||
|
- `taxon~/Betulaceae/Betula` matches any path that starts with `Betulaceae` then `Betula`.
|
||||||
|
- `taxon~Betula` matches any path containing `Betula` as a segment, anywhere.
|
||||||
|
|
||||||
|
### Missing metadata key → NA
|
||||||
|
|
||||||
|
If a genome does not carry the queried metadata key, the predicate returns **NA**.
|
||||||
|
NA propagates through the group evaluation logic (see below), and genomes that
|
||||||
|
cannot be classified are **ignored** in all quorum counts.
|
||||||
|
|
||||||
|
## Group semantics
|
||||||
|
|
||||||
|
### Multiple predicates
|
||||||
|
|
||||||
|
| Flag | Combination rule |
|
||||||
|
|------|-----------------|
|
||||||
|
| `--ingroup` (repeated) | **AND** — genome must satisfy all predicates |
|
||||||
|
| `--outgroup` (repeated) | **OR** — genome satisfies any predicate |
|
||||||
|
|
||||||
|
### Three-value logic
|
||||||
|
|
||||||
|
Each predicate returns `true`, `false`, or `NA` (absent key).
|
||||||
|
|
||||||
|
- AND: `false` absorbs everything; `NA` propagates unless already `false`.
|
||||||
|
- OR: `true` absorbs everything; `NA` propagates unless already `true`.
|
||||||
|
|
||||||
|
### Classification and priority
|
||||||
|
|
||||||
|
For each genome:
|
||||||
|
|
||||||
|
1. Evaluate `AND(ingroup predicates)` → `in_result`
|
||||||
|
2. Evaluate `OR(outgroup predicates)` → `out_result`
|
||||||
|
3. If `in_result = true` → **Ingroup** (ingroup wins over outgroup)
|
||||||
|
4. Else if `out_result = true` → **Outgroup**
|
||||||
|
5. Otherwise → **Uncategorized** (ignored in all quorum counts)
|
||||||
|
|
||||||
|
### Implicit groups
|
||||||
|
|
||||||
|
| `--ingroup` | `--outgroup` | Effective behaviour |
|
||||||
|
|-------------|--------------|---------------------|
|
||||||
|
| not set | not set | all genomes form the ingroup |
|
||||||
|
| set | not set | only ingroup quorum flags apply |
|
||||||
|
| not set | set | only outgroup quorum flags apply |
|
||||||
|
| set | set | both constraints apply simultaneously |
|
||||||
|
|
||||||
|
## Quorum flags
|
||||||
|
|
||||||
|
| Flag | Applies to | Meaning |
|
||||||
|
|------|-----------|---------|
|
||||||
|
| `--min-count N` | ingroup | k-mer present in at least N ingroup genomes |
|
||||||
|
| `--max-count N` | ingroup | k-mer present in at most N ingroup genomes |
|
||||||
|
| `--min-frac F` | ingroup | k-mer present in at least fraction F of ingroup genomes |
|
||||||
|
| `--max-frac F` | ingroup | k-mer present in at most fraction F of ingroup genomes |
|
||||||
|
| `--min-outgroup-count N` | outgroup | k-mer present in at least N outgroup genomes |
|
||||||
|
| `--max-outgroup-count N` | outgroup | k-mer present in at most N outgroup genomes |
|
||||||
|
| `--min-outgroup-frac F` | outgroup | k-mer present in at least fraction F of outgroup genomes |
|
||||||
|
| `--max-outgroup-frac F` | outgroup | k-mer present in at most fraction F of outgroup genomes |
|
||||||
|
| `--min-total-count N` | all genomes | sum of per-genome counts ≥ N (`filter` only) |
|
||||||
|
| `--max-total-count N` | all genomes | sum of per-genome counts ≤ N (`filter` only) |
|
||||||
|
| `--presence-threshold N` | all | per-genome count > N to be considered "present" (default 0) |
|
||||||
|
|
||||||
|
**Conditional defaults** — the defaults for `--min-frac` and `--max-outgroup-count` depend on two conditions:
|
||||||
|
whether the corresponding group was declared, **and** whether any quorum flag for that group was explicitly set.
|
||||||
|
|
||||||
|
> **Rule**: declaring a group activates the smart default **only if no quorum flag for that group is explicitly set**.
|
||||||
|
> As soon as any quorum flag for a group is present on the command line, all defaults for that group revert to no-op values.
|
||||||
|
|
||||||
|
| `--ingroup` | Any ingroup quorum flag? | `--min-frac` default |
|
||||||
|
|-------------|--------------------------|----------------------|
|
||||||
|
| not set | — | 0.0 (no-op) |
|
||||||
|
| set | no | **1.0** — all ingroup genomes must carry the k-mer |
|
||||||
|
| set | yes | 0.0 — user controls quorum explicitly |
|
||||||
|
|
||||||
|
| `--outgroup` | Any outgroup quorum flag? | `--max-outgroup-count` default |
|
||||||
|
|--------------|---------------------------|-------------------------------|
|
||||||
|
| not set | — | outgroup size (no-op) |
|
||||||
|
| set | no | **0** — no outgroup genome may carry the k-mer |
|
||||||
|
| set | yes | outgroup size — user controls quorum explicitly |
|
||||||
|
|
||||||
|
"Any ingroup quorum flag" means any of: `--min-count`, `--max-count`, `--min-frac`, `--max-frac`.
|
||||||
|
"Any outgroup quorum flag" means any of: `--min-outgroup-count`, `--max-outgroup-count`, `--min-outgroup-frac`, `--max-outgroup-frac`.
|
||||||
|
|
||||||
|
**Why this rule?** Setting any quorum flag signals explicit intent — the defaults are there to help when the user omits quorum entirely, not to interfere with deliberate constraints. Mixing implicit and explicit quorum on the same group would risk silent incoherence (e.g. `--max-count 0` with an implicit `--min-frac 1.0`).
|
||||||
|
|
||||||
|
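All other bounds default to their no-op values — 0 for minimum counts, the group size for maximum counts, 0.0 for minimum fractions, and 1.0 for maximum fractions — regardless of whether groups are declared.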
|
||||||
|
|
||||||
|
### Validation
|
||||||
|
|
||||||
|
After resolving defaults, the following are checked and cause an immediate error:
|
||||||
|
|
||||||
|
| Condition | Error |
|
||||||
|
|-----------|-------|
|
||||||
|
| `--min-count > --max-count` | incoherent bounds |
|
||||||
|
| `--min-frac > --max-frac` | incoherent bounds |
|
||||||
|
| `--min-outgroup-count > --max-outgroup-count` | incoherent bounds |
|
||||||
|
| `--min-outgroup-frac > --max-outgroup-frac` | incoherent bounds |
|
||||||
|
| any fraction outside `[0.0, 1.0]` | invalid value |
|
||||||
|
|
||||||
|
The check applies to the **effective** values (after defaults are resolved), so an explicit `--max-frac 0.5` with an implicit `--min-frac 1.0` would have been caught — but the rule above prevents that situation from arising in the first place.
|
||||||
|
|
||||||
|
Fractions are computed over the size of the classified group, not over total
|
||||||
|
genome count. An empty group (no genome classified as ingroup/outgroup) never
|
||||||
|
triggers a filter failure.
|
||||||
|
|
||||||
|
### Conservative rounding of fraction thresholds
|
||||||
|
|
||||||
|
When a fraction threshold `F` is applied to a group of size `N`, the effective
|
||||||
|
integer threshold is determined by the direction of the bound:
|
||||||
|
|
||||||
|
| Bound | Effective count | Rounding | Rationale |
|
||||||
|
|-------|----------------|----------|-----------|
|
||||||
|
| `--min-frac F` | k-mer in ≥ ⌈F·N⌉ genomes | **ceil** | stricter — when F·N is not an integer, a k-mer present in only ⌊F·N⌋ genomes does not meet the fraction |
|
||||||
|
| `--max-frac F` | k-mer in ≤ ⌊F·N⌋ genomes | **floor** | stricter — when F·N is not an integer, a k-mer present in ⌈F·N⌉ genomes already exceeds the fraction |
|
||||||
|
|
||||||
|
The same rule applies symmetrically to `--min-outgroup-frac` (ceil) and
|
||||||
|
`--max-outgroup-frac` (floor). The outgroup direction is not inverted: the
|
||||||
|
conservative rounding depends only on whether the bound is a minimum or a
|
||||||
|
maximum, not on which group it applies to.
|
||||||
|
|
||||||
|
**Example** — `--min-frac 0.5` with an ingroup of 3 genomes:
|
||||||
|
`⌈0.5 × 3⌉ = ⌈1.5⌉ = 2` → at least 2 of 3 ingroup genomes must carry the k-mer.
|
||||||
|
|
||||||
|
**Implementation note** — the filter evaluates `n / denom < min_frac` directly
|
||||||
|
(integer `n`, float comparison) rather than pre-computing `⌈F·N⌉`. This is
|
||||||
|
mathematically equivalent for integer counts: `n / N < F` ↔ `n < F·N` ↔
|
||||||
|
`n ≤ ⌈F·N⌉ − 1` ↔ `n < ⌈F·N⌉`. No explicit rounding is needed.
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
Keep k-mers specific to *Betula nana* — present in at least 2 *B. nana* genomes
|
||||||
|
and absent from every other genome in the index:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
obikmer filter src --output dst \
|
||||||
|
--ingroup "species=Betula_nana" \
|
||||||
|
--outgroup "*" \
|
||||||
|
--min-count 2 \
|
||||||
|
--max-outgroup-count 0
|
||||||
|
```
|
||||||
|
|
||||||
|
Keep k-mers found in at least 2 *Betula nana* genomes and absent from all
|
||||||
|
other *Betula*:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
obikmer filter src --output dst \
|
||||||
|
--ingroup "species=Betula_nana" \
|
||||||
|
--outgroup "genus=Betula" \
|
||||||
|
--min-count 2 \
|
||||||
|
--max-outgroup-count 0
|
||||||
|
```
|
||||||
|
|
||||||
|
Use taxonomic paths — keep k-mers present in ≥ 50 % of the *Betula* clade
|
||||||
|
and in at most 10 % of everything outside *Betulaceae*:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
obikmer filter src --output dst \
|
||||||
|
--ingroup "taxon~/Betulaceae/Betula" \
|
||||||
|
--outgroup "taxon!~/Betulaceae" \
|
||||||
|
--min-frac 0.5 \
|
||||||
|
--max-outgroup-frac 0.1
|
||||||
|
```
|
||||||
|
|
||||||
|
Multiple outgroup predicates (OR): exclude k-mers present in *Alnus* or *Carpinus*:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
obikmer filter src --output dst \
|
||||||
|
--ingroup "genus=Betula" \
|
||||||
|
--outgroup "genus=Alnus" \
|
||||||
|
--outgroup "genus=Carpinus" \
|
||||||
|
--max-outgroup-count 0
|
||||||
|
```
|
||||||
|
|
||||||
|
To dump only k-mers specific to *Betula nana*:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
obikmer dump myindex \
|
||||||
|
--ingroup "species=Betula_nana" \
|
||||||
|
--outgroup "*" \
|
||||||
|
--min-count 1 \
|
||||||
|
--max-outgroup-count 0
|
||||||
|
```
|
||||||
|
|
||||||
|
To enumerate unitigs of the *Betula*-specific subgraph:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
obikmer unitig myindex \
|
||||||
|
--ingroup "genus=Betula" \
|
||||||
|
--outgroup "*" \
|
||||||
|
--min-count 2 \
|
||||||
|
--max-outgroup-count 0
|
||||||
|
```
|
||||||
|
|
||||||
|
## Command-specific options
|
||||||
|
|
||||||
|
### `dump --head N`
|
||||||
|
|
||||||
|
Stops output after the first N k-mers that pass all active filters.
|
||||||
|
Iteration terminates immediately — subsequent partitions and layers are not scanned.
|
||||||
|
Useful for quick inspection of large indexes without loading the entire dataset.
|
||||||
|
|
||||||
|
```sh
|
||||||
|
obikmer dump myindex --head 100
|
||||||
|
obikmer dump myindex --head 20 --ingroup "species=Betula_nana" --min-count 1
|
||||||
|
```
|
||||||
|
|
||||||
|
### `distance --presence-threshold N`
|
||||||
|
|
||||||
|
When computing Jaccard distance on a **count index**, a k-mer is considered present in a genome if its count is ≥ N (default 1).
|
||||||
|
This option is independent of the `--presence-threshold` used in filtering.
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Jaccard treating kmers with count ≥ 2 as present
|
||||||
|
obikmer distance myindex --metric jaccard --presence-threshold 2
|
||||||
|
```
|
||||||
|
|
||||||
|
This parameter has no effect on presence/absence indexes (where values are already 0/1) or on metrics other than Jaccard.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
- **`obikpartitionner::filter::GroupQuorumFilter`** — implements `KmerFilter`
|
||||||
|
using pre-computed ingroup and outgroup index vectors. The heavy logic
|
||||||
|
(predicate parsing, three-value evaluation, genome classification) happens
|
||||||
|
once before any iteration; each k-mer row evaluation is a simple index
|
||||||
|
lookup and counter.
|
||||||
|
|
||||||
|
- **`obikmer::cmd::predicate::FilterArgs`** — shared `clap` argument group
|
||||||
|
embedded via `#[command(flatten)]` in `FilterArgs`, `DumpArgs`, and
|
||||||
|
`UnitigArgs`. `FilterArgs::build_filters()` returns a ready-to-use filter
|
||||||
|
list.
|
||||||
|
|
||||||
|
- **`obikpartitionner::KmerPartition::iter_partition_kmers`** — accepts
|
||||||
|
`filters: &[Box<dyn KmerFilter>]` and applies them per-kmer before invoking
|
||||||
|
the callback. `filter`, `dump`, and `unitig` all go through this single
|
||||||
|
entry point.
|
||||||
@@ -1,38 +1,57 @@
|
|||||||
# Kmer — implementation
|
# Kmer — implementation
|
||||||
|
|
||||||
## Memory layout
|
## Types and layout
|
||||||
|
|
||||||
`Kmer` is a `#[repr(transparent)]` newtype over `u64`:
|
`KmerOf<L>` is a `#[repr(transparent)]` newtype over `u64` parameterized by a `KmerLength` marker:
|
||||||
|
|
||||||
```rust
|
```rust
|
||||||
#[repr(transparent)]
|
#[repr(transparent)]
|
||||||
pub struct Kmer(u64);
|
pub struct KmerOf<L: KmerLength>(u64, PhantomData<L>);
|
||||||
```
|
```
|
||||||
|
|
||||||
Nucleotides are packed 2 bits each, **left-aligned**, MSB-first. Nucleotide 0 occupies bits 63–62; nucleotide i occupies bits 63−2i and 62−2i. The low 64−2k bits are always zero. k is **not stored** — it is a parameter of every operation that needs it, and will be owned by the future collection-level indexer.
|
Three marker types implement `KmerLength`:
|
||||||
|
|
||||||
|
| Marker | `len()` source | Used for |
|
||||||
|
|--------|---------------|---------|
|
||||||
|
| `KLen` | `params::k()` | k-mers |
|
||||||
|
| `MLen` | `params::m()` | minimizers |
|
||||||
|
| `ConstLen<N>` | const generic `N` | tests |
|
||||||
|
|
||||||
|
Public aliases:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub type Kmer = KmerOf<KLen>; // k-mer, global k
|
||||||
|
pub type Minimizer = CanonicalKmerOf<MLen>; // canonical m-mer, global m
|
||||||
|
```
|
||||||
|
|
||||||
|
Nucleotides are packed 2 bits each, **left-aligned**, MSB-first. Nucleotide 0 occupies bits 63–62; nucleotide i occupies bits 63−2i and 62−2i. The low 64−2·len bits are always zero. The length is **not stored** — every operation reads it from `L::len()`.
|
||||||
|
|
||||||
| 63–62 | 61–60 | … | 63−2(k−1)−1 to 63−2(k−1) | 63−2k down to 0 |
|
| 63–62 | 61–60 | … | 63−2(k−1)−1 to 63−2(k−1) | 63−2k down to 0 |
|
||||||
|-------|-------|---|--------------------------|-----------------|
|
|-------|-------|---|--------------------------|-----------------|
|
||||||
| nt 0 | nt 1 | … | nt k−1 | zero padding |
|
| nt 0 | nt 1 | … | nt k−1 | zero padding |
|
||||||
|
|
||||||
|
## Global parameters
|
||||||
|
|
||||||
|
`params::set_k(k)` / `params::k()` and `params::set_m(m)` / `params::m()` are backed by `OnceLock<usize>` in production (write-once, panic on conflict) and by `thread_local! { Cell<usize> }` in test builds (per-thread, freely writable). `params::init(k, m)` sets both in one call.
|
||||||
|
|
||||||
## Encoding
|
## Encoding
|
||||||
|
|
||||||
`Kmer::from_ascii(ascii, k)` encodes the first k bytes of an ASCII slice using the shared `ENC` table (see [SuperKmer — ASCII encoding](superkmer.md#ascii-encoding-and-decoding)):
|
`KmerOf::<L>::from_ascii(ascii)` encodes the first `L::len()` bytes using the shared `ENC` table (see [SuperKmer — ASCII encoding](superkmer.md#ascii-encoding-and-decoding)):
|
||||||
|
|
||||||
```rust
|
```rust
|
||||||
for i in 0..k {
|
for i in 0..k {
|
||||||
val = (val << 2) | encode_base(ascii[i]) as u64;
|
val = (val << 2) | encode_base(ascii[i]) as u64;
|
||||||
}
|
}
|
||||||
Kmer(val << (64 - 2 * k))
|
KmerOf(val << (64 - 2 * k), PhantomData)
|
||||||
```
|
```
|
||||||
|
|
||||||
Zero allocation — result lives on the stack.
|
Zero allocation — result lives on the stack.
|
||||||
|
|
||||||
## Decoding
|
## Decoding
|
||||||
|
|
||||||
`write_ascii(k, buf)` appends k ASCII characters to a caller-supplied `Vec<u8>` using the shared `DEC4` table: one lookup per 4 nucleotides, two partial-byte lookups for the remainder. No allocation in the hot path.
|
`write_ascii(writer)` writes k ASCII characters to any `W: Write` using the shared `DEC4` table: one lookup per 4 nucleotides, one partial lookup for the remainder. No allocation in the hot path.
|
||||||
|
|
||||||
`to_ascii(k)` is a convenience wrapper that allocates and returns a `Vec<u8>`; intended for tests and display only.
|
`to_ascii()` is a convenience wrapper that allocates and returns a `Vec<u8>`; intended for tests and display only.
|
||||||
|
|
||||||
## Reverse complement
|
## Reverse complement
|
||||||
|
|
||||||
@@ -43,18 +62,30 @@ let x = !self.0; /
|
|||||||
let x = x.swap_bytes(); // reverse bytes
|
let x = x.swap_bytes(); // reverse bytes
|
||||||
let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4); // swap nibbles
|
let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4); // swap nibbles
|
||||||
let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2); // swap 2-bit groups
|
let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2); // swap 2-bit groups
|
||||||
Kmer(x << (64 - 2 * k))
|
KmerOf(x << (64 - 2 * k), PhantomData)
|
||||||
```
|
```
|
||||||
|
|
||||||
After complementing, bytes are reversed (`swap_bytes`), then nibbles, then 2-bit groups — restoring 2-bit nucleotides to their correct positions in reverse order. A final left-shift realigns to MSB. Zero allocation — result lives on the stack.
|
After complementing, bytes are reversed (`swap_bytes`), then nibbles, then 2-bit groups — restoring 2-bit nucleotides to their correct positions in reverse order. A final left-shift realigns to MSB. Zero allocation — result lives on the stack.
|
||||||
|
|
||||||
## Canonical form
|
## Canonical form and `CanonicalKmerOf`
|
||||||
|
|
||||||
|
`canonical()` returns a `CanonicalKmerOf<L>` — a distinct newtype that carries the same `u64` layout but enforces the invariant that the stored value equals `min(kmer, revcomp)`:
|
||||||
|
|
||||||
```rust
|
```rust
|
||||||
pub fn canonical(&self, k: usize) -> Self {
|
pub fn canonical(&self) -> CanonicalKmerOf<L> {
|
||||||
let rc = self.revcomp(k);
|
let rc = self.revcomp();
|
||||||
if self.0 <= rc.0 { *self } else { rc }
|
CanonicalKmerOf(if self.0 <= rc.0 { self.0 } else { rc.0 }, PhantomData)
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Lexicographic minimum of forward and reverse-complement, comparing the raw `u64` values directly (left-aligned encoding makes this equivalent to nucleotide-wise comparison). Zero allocation — result lives on the stack.
|
Lexicographic minimum of forward and reverse-complement, comparing the raw `u64` values directly (left-aligned encoding makes this equivalent to nucleotide-wise comparison). Zero allocation — result lives on the stack.
|
||||||
|
|
||||||
|
`CanonicalKmerOf::from_raw_unchecked(raw)` is the only other public constructor, for trusted paths such as deserialisation.
|
||||||
|
|
||||||
|
## Sliding window helpers
|
||||||
|
|
||||||
|
`push_right(nuc)` / `push_left(nuc)` shift the window by one base in O(1). `is_overlapping(other)` checks whether the last k−1 nucleotides of `self` equal the first k−1 of `other`.
|
||||||
|
|
||||||
|
## Hashing
|
||||||
|
|
||||||
|
`hash_kmer(raw: u64) -> u64` computes `mix64(raw ^ 0x9e3779b97f4a7c15)`, the seeded splitmix64 finalizer. `CanonicalKmerOf::seq_hash()` delegates to `hash_kmer`.
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user