Push mtzqmmrlmzzx #34
@@ -0,0 +1,36 @@
|
||||
# Continuous integration: build and test the Rust code under src/ on every
# push (any branch) and on every pull request.
name: CI

on:
  push:
    branches: ['**']
  pull_request:

jobs:
  build:
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every `run` step below executes from src/, where Cargo.lock lives.
        working-directory: src
    steps:
      - uses: actions/checkout@v4

      - name: Install Rust
        run: |
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
          # Entries appended to GITHUB_PATH take effect in the following steps.
          echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"

      - name: Cache cargo registry
        uses: actions/cache@v4
        with:
          # `defaults.run.working-directory` only applies to `run` steps, so
          # the build directory is spelled out relative to the repository root.
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            src/target
          key: ${{ runner.os }}-cargo-${{ hashFiles('src/Cargo.lock') }}
          restore-keys: ${{ runner.os }}-cargo-

      - name: Build
        run: cargo build --release

      - name: Test
        run: cargo test --release
|
||||
@@ -0,0 +1,48 @@
|
||||
# Release build: on every `v*` tag, produce a statically linked (musl) Linux
# x86_64 binary and store it as a workflow artifact.
name: Release

on:
  push:
    tags:
      - 'v*'

jobs:
  build-linux-static:
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every `run` step below executes from src/, where Cargo.lock lives.
        working-directory: src
    steps:
      - uses: actions/checkout@v4

      - name: Install Rust + musl target
        run: |
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
          echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
          sudo apt-get update -qq && sudo apt-get install -y -qq musl-tools
          # GITHUB_PATH changes only apply to later steps, so rustup is called
          # through its absolute path here.
          "$HOME/.cargo/bin/rustup" target add x86_64-unknown-linux-musl

      - name: Cache cargo registry
        uses: actions/cache@v4
        with:
          # `defaults.run.working-directory` only applies to `run` steps, so
          # the build directory is spelled out relative to the repository root.
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            src/target
          key: linux-musl-cargo-${{ hashFiles('src/Cargo.lock') }}
          restore-keys: linux-musl-cargo-

      - name: Build static binary
        run: cargo build --release --target x86_64-unknown-linux-musl

      - name: Prepare artifact
        run: |
          mkdir -p /tmp/dist
          cp target/x86_64-unknown-linux-musl/release/obikmer /tmp/dist/obikmer-linux-x86_64
          strip /tmp/dist/obikmer-linux-x86_64

      # NOTE(review): upload-artifact stores a workflow artifact; it does not
      # attach the binary to a release. Confirm that publishing happens
      # elsewhere if a release asset is expected.
      - name: Upload release asset
        uses: actions/upload-artifact@v4
        with:
          name: obikmer-linux-x86_64
          path: /tmp/dist/obikmer-linux-x86_64
          if-no-files-found: error
|
||||
+10
@@ -9,3 +9,13 @@ data-stress
|
||||
./**/*.json
|
||||
*.bin
|
||||
Betula_exilis--IGA-24-33
|
||||
benchmark/genomes
|
||||
benchmark/simulated_data
|
||||
benchmark/specimen_index_presence
|
||||
benchmark/specimen_index_count
|
||||
benchmark/global_index_presence
|
||||
benchmark/global_index_count
|
||||
benchmark/stats
|
||||
benchmark/reference_index
|
||||
benchmark/specific_index_count
|
||||
benchmark/specific_index_presence
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
/cache
|
||||
/project.local.yml
|
||||
@@ -0,0 +1,133 @@
|
||||
# the name by which the project can be referenced within Serena
|
||||
project_name: "obikmer"
|
||||
|
||||
|
||||
# list of languages for which language servers are started; choose from:
|
||||
# al angular ansible bash clojure
|
||||
# cpp cpp_ccls crystal csharp csharp_omnisharp
|
||||
# dart elixir elm erlang fortran
|
||||
# fsharp go groovy haskell haxe
|
||||
# hlsl html java json julia
|
||||
# kotlin lean4 lua luau markdown
|
||||
# matlab msl nix ocaml pascal
|
||||
# perl php php_phpactor powershell python
|
||||
# python_jedi python_ty r rego ruby
|
||||
# ruby_solargraph rust scala scss solidity
|
||||
# svelte swift systemverilog terraform toml
|
||||
# typescript typescript_vts vue yaml zig
|
||||
# (This list may be outdated. For the current list, see values of Language enum here:
|
||||
# https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
|
||||
# For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
|
||||
# Note:
|
||||
# - For C, use cpp
|
||||
# - For JavaScript, use typescript
|
||||
# - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
|
||||
# - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
|
||||
# - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
|
||||
# - For Free Pascal/Lazarus, use pascal
|
||||
# Special requirements:
|
||||
# Some languages require additional setup/installations.
|
||||
# See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
|
||||
# When using multiple languages, the first language server that supports a given file will be used for that file.
|
||||
# The first language is the default language and the respective language server will be used as a fallback.
|
||||
# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
|
||||
languages:
|
||||
- rust
|
||||
|
||||
# the encoding used by text files in the project
|
||||
# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
|
||||
encoding: "utf-8"
|
||||
|
||||
# line ending convention to use when writing source files.
|
||||
# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
|
||||
# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
|
||||
line_ending:
|
||||
|
||||
# The language backend to use for this project.
|
||||
# If not set, the global setting from serena_config.yml is used.
|
||||
# Valid values: LSP, JetBrains
|
||||
# Note: the backend is fixed at startup. If a project with a different backend
|
||||
# is activated post-init, an error will be returned.
|
||||
language_backend:
|
||||
|
||||
# whether to use project's .gitignore files to ignore files
|
||||
ignore_all_files_in_gitignore: true
|
||||
|
||||
# advanced configuration option allowing to configure language server-specific options.
|
||||
# Maps the language key to the options.
|
||||
# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
|
||||
# No documentation on options means no options are available.
|
||||
ls_specific_settings: {}
|
||||
|
||||
# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
|
||||
# Paths can be absolute or relative to the project root.
|
||||
# Each folder is registered as an LSP workspace folder, enabling language servers to discover
|
||||
# symbols and references across package boundaries.
|
||||
# Currently supported for: TypeScript.
|
||||
# Example:
|
||||
# additional_workspace_folders:
|
||||
# - ../sibling-package
|
||||
# - ../shared-lib
|
||||
additional_workspace_folders: []
|
||||
|
||||
# list of additional paths to ignore in this project.
|
||||
# Same syntax as gitignore, so you can use * and **.
|
||||
# Note: global ignored_paths from serena_config.yml are also applied additively.
|
||||
ignored_paths: []
|
||||
|
||||
# whether the project is in read-only mode
|
||||
# If set to true, all editing tools will be disabled and attempts to use them will result in an error
|
||||
# Added on 2025-04-18
|
||||
read_only: false
|
||||
|
||||
# list of tool names to exclude.
|
||||
# This extends the existing exclusions (e.g. from the global configuration)
|
||||
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||
excluded_tools: []
|
||||
|
||||
# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
|
||||
# This extends the existing inclusions (e.g. from the global configuration).
|
||||
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||
included_optional_tools: []
|
||||
|
||||
# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
|
||||
# This cannot be combined with non-empty excluded_tools or included_optional_tools.
|
||||
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||
fixed_tools: []
|
||||
|
||||
# list of mode names that are to be activated by default, overriding the setting in the global configuration.
|
||||
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
|
||||
# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
|
||||
# Otherwise, this overrides the setting from the global configuration (serena_config.yml).
|
||||
# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
|
||||
# for this project.
|
||||
# This setting can, in turn, be overridden by CLI parameters (--mode).
|
||||
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
|
||||
default_modes:
|
||||
|
||||
# list of mode names to be activated additionally for this project, e.g. ["query-projects"]
|
||||
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
|
||||
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
|
||||
added_modes:
|
||||
|
||||
# initial prompt for the project. It will always be given to the LLM upon activating the project
|
||||
# (contrary to the memories, which are loaded on demand).
|
||||
initial_prompt: ""
|
||||
|
||||
# time budget (seconds) per tool call for the retrieval of additional symbol information
|
||||
# such as docstrings or parameter information.
|
||||
# This overrides the corresponding setting in the global configuration; see the documentation there.
|
||||
# If null or missing, use the setting from the global configuration.
|
||||
symbol_info_budget:
|
||||
|
||||
# list of regex patterns which, when matched, mark a memory entry as read‑only.
|
||||
# Extends the list from the global configuration, merging the two lists.
|
||||
read_only_memory_patterns: []
|
||||
|
||||
# list of regex patterns for memories to completely ignore.
|
||||
# Matching memories will not appear in list_memories or activate_project output
|
||||
# and cannot be accessed via read_memory or write_memory.
|
||||
# To access ignored memory files, use the read_file tool on the raw file path.
|
||||
# Extends the list from the global configuration, merging the two lists.
|
||||
# Example: ["_archive/.*", "_episodes/.*"]
|
||||
ignored_memory_patterns: []
|
||||
@@ -73,3 +73,29 @@ Lors de l'ajout de nouveaux fichiers Markdown dans `docmd/`, mettre à jour la s
|
||||
---
|
||||
|
||||
Je continue à poser mes questions et à guider la discussion.
|
||||
|
||||
---
|
||||
|
||||
## MCP Tools
|
||||
|
||||
**Règle absolue : avant tout travail de code, appeler `mcp__serena__initial_instructions` pour charger les instructions Serena.**
|
||||
|
||||
### Hiérarchie des outils pour ce projet Rust
|
||||
|
||||
**Navigation et édition de code → serena en priorité**
|
||||
- Trouver un symbole, une déclaration, les implémentations d'un trait : `mcp__serena__find_symbol`, `mcp__serena__find_declaration`, `mcp__serena__find_implementations`
|
||||
- Trouver les usages d'un symbole : `mcp__serena__find_referencing_symbols`
|
||||
- Diagnostics LSP (erreurs de compilation) : `mcp__serena__get_diagnostics_for_file`
|
||||
- Vue d'ensemble d'un fichier : `mcp__serena__get_symbols_overview`
|
||||
- Modifier le corps d'une fonction/impl : `mcp__serena__replace_symbol_body`
|
||||
- Ne pas utiliser `cclsp` quand serena couvre le besoin
|
||||
|
||||
**Analyse architecturale → jcodemunch**
|
||||
- Hotspots, couplage, dead code, dépendances entre modules
|
||||
- Utiliser avant de refactorer une zone critique
|
||||
|
||||
**Raisonnement complexe → sequential-thinking**
|
||||
- Décisions d'architecture, choix d'algorithme, trade-offs non triviaux
|
||||
|
||||
**Documentation de crates → context7**
|
||||
- Toujours consulter avant d'utiliser une API de bibliothèque externe
|
||||
|
||||
@@ -22,6 +22,7 @@ $(MKDOCS): $(VENV)/bin/activate
|
||||
mkdocs mkdocs-material \
|
||||
mkdocs-mermaid2-plugin \
|
||||
mkdocs-bibtex
|
||||
$(PIP) install --quiet --upgrade InSilicoSeq
|
||||
|
||||
# ── obikmer binary ───────────────────────────────────────────────────────────
|
||||
|
||||
@@ -62,3 +63,28 @@ clean-doc:
|
||||
.PHONY: clean
|
||||
clean: clean-doc
|
||||
rm -rf $(VENV)
|
||||
|
||||
# ── release ───────────────────────────────────────────────────────────────────

# Manifest whose first `version = "..."` line is rewritten by bump-version.
CARGO_TOML := $(CARGO_DIR)/obikmer/Cargo.toml

# Bump the version stored in $(CARGO_TOML).
#   make bump-version                 increments the patch component
#   make bump-version RELEASE=X.Y.Z   sets the version to X.Y.Z
# Fails when no version line can be read, instead of writing a bogus "..1".
# Dots of the current version are escaped so that sed matches them literally.
.PHONY: bump-version
bump-version:
	@current=$$(grep '^version = ' $(CARGO_TOML) | head -n 1 | sed 's/version = "\(.*\)"/\1/'); \
	if [ -z "$$current" ]; then \
		echo "ERROR: no 'version = ' line found in $(CARGO_TOML)" >&2; \
		exit 1; \
	fi; \
	if [ -n "$(RELEASE)" ]; then \
		new_version="$(RELEASE)"; \
	else \
		major=$$(echo $$current | cut -d. -f1); \
		minor=$$(echo $$current | cut -d. -f2); \
		patch=$$(echo $$current | cut -d. -f3); \
		new_patch=$$((patch + 1)); \
		new_version="$$major.$$minor.$$new_patch"; \
	fi; \
	echo "Version: $$current -> $$new_version"; \
	current_re=$$(printf '%s' "$$current" | sed 's/\./\\./g'); \
	sed -i.bak "s/^version = \"$$current_re\"/version = \"$$new_version\"/" $(CARGO_TOML) && \
	rm $(CARGO_TOML).bak

# Bump the version, then run `jj auto-describe` and push the current change.
.PHONY: release
release: bump-version
	@jj auto-describe
	@jj git push --change @
|
||||
|
||||
@@ -0,0 +1,144 @@
|
||||
# Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS
|
||||
BINARY := ../src/target/release/obikmer
|
||||
VENV_PY := ../.venv/bin/python3
|
||||
|
||||
GENOMES := $(wildcard genomes/*.fna.gz)
|
||||
|
||||
# SPECIMENS, SPECIES, and the full dependency graph are generated by
|
||||
# make_deps.py from the genome FASTA headers — like .d files in C.
|
||||
# Make rebuilds deps.mk whenever genomes/ changes and restarts.
|
||||
-include deps.mk
|
||||
|
||||
REF_NPZS := $(SPECIMENS:%=reference_index/%.npz)
|
||||
PRESENCE_DONE := $(SPECIMENS:%=specimen_index_presence/%/index.done)
|
||||
PRESENCE_STATS := $(SPECIMENS:%=stats/indexing_presence/%.stats)
|
||||
COUNT_DONE := $(SPECIMENS:%=specimen_index_count/%/index.done)
|
||||
COUNT_STATS := $(SPECIMENS:%=stats/indexing_count/%.stats)
|
||||
VERIFY_PRESENCE_STATS := $(SPECIMENS:%=stats/verify_presence/%.stats)
|
||||
VERIFY_COUNT_STATS := $(SPECIMENS:%=stats/verify_count/%.stats)
|
||||
SPECIFIC_PRESENCE_DONE := $(SPECIES:%=specific_index_presence/%/index.done)
|
||||
SPECIFIC_PRESENCE_STATS := $(SPECIES:%=stats/specific_kmer_presence/%.stats)
|
||||
SPECIFIC_COUNT_DONE := $(SPECIES:%=specific_index_count/%/index.done)
|
||||
SPECIFIC_COUNT_STATS := $(SPECIES:%=stats/specific_kmer_count/%.stats)
|
||||
SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/reads_R1.fastq.gz)
|
||||
|
||||
.NOTPARALLEL:
|
||||
|
||||
.PHONY: all simulate reference \
|
||||
index_presence index_count \
|
||||
aggregate_index_presence aggregate_index_count \
|
||||
merge_presence merge_count \
|
||||
verify_presence verify_count \
|
||||
aggregate_verify_presence aggregate_verify_count \
|
||||
verify_merge_presence verify_merge_count \
|
||||
filter_presence filter_count \
|
||||
aggregate_filter_presence aggregate_filter_count
|
||||
|
||||
verify_merge_presence: stats/verify_merge_presence/current.csv
|
||||
verify_merge_count: stats/verify_merge_count/current.csv
|
||||
|
||||
all: aggregate_verify_presence aggregate_verify_count \
|
||||
verify_merge_presence verify_merge_count \
|
||||
aggregate_filter_presence aggregate_filter_count
|
||||
|
||||
# ── dependency file ───────────────────────────────────────────────────────────
|
||||
|
||||
# Generate into a temporary file and rename on success, so that a failing
# make_deps.py never leaves a truncated deps.mk that looks up to date.
deps.mk: $(GENOMES)
	$(VENV_PY) make_deps.py $^ > $@.tmp && mv $@.tmp $@ || { rm -f $@.tmp; exit 1; }
|
||||
|
||||
# ── simulation ────────────────────────────────────────────────────────────────
|
||||
# Prerequisites (genome → reads) are in deps.mk; $< is the genome file.
|
||||
|
||||
$(SIMULATED_READS):
|
||||
bash simulate_one.sh $< $(dir $@)
|
||||
|
||||
simulate: $(SIMULATED_READS)
|
||||
|
||||
# ── reference kmer sets ───────────────────────────────────────────────────────
|
||||
# Prerequisites (reads → npz) are in deps.mk.
|
||||
|
||||
reference_index/%.npz:
|
||||
bash build_reference.sh $*
|
||||
|
||||
reference: $(REF_NPZS)
|
||||
|
||||
# ── per-specimen indexing ─────────────────────────────────────────────────────
|
||||
# Prerequisites (reads → index.done + .stats) are in deps.mk.
|
||||
|
||||
specimen_index_presence/%/index.done \
|
||||
stats/indexing_presence/%.stats &: $(BINARY)
|
||||
bash index_one_presence.sh $*
|
||||
|
||||
specimen_index_count/%/index.done \
|
||||
stats/indexing_count/%.stats &: $(BINARY)
|
||||
bash index_one_count.sh $*
|
||||
|
||||
index_presence: $(PRESENCE_DONE)
|
||||
index_count: $(COUNT_DONE)
|
||||
|
||||
# ── indexing stats aggregation ────────────────────────────────────────────────
|
||||
|
||||
aggregate_index_presence: $(PRESENCE_STATS)
|
||||
bash aggregate_stats.sh indexing_presence
|
||||
|
||||
aggregate_index_count: $(COUNT_STATS)
|
||||
bash aggregate_stats.sh indexing_count
|
||||
|
||||
# ── global merge ──────────────────────────────────────────────────────────────
|
||||
|
||||
global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY)
|
||||
bash merge_presence.sh
|
||||
|
||||
global_index_count/index.done: $(COUNT_DONE) $(BINARY)
|
||||
bash merge_count.sh
|
||||
|
||||
merge_presence: global_index_presence/index.done
|
||||
merge_count: global_index_count/index.done
|
||||
|
||||
# ── per-specimen verification ─────────────────────────────────────────────────
|
||||
# Prerequisites (index.done + npz → .stats) are in deps.mk.
|
||||
|
||||
stats/verify_presence/%.stats:
|
||||
bash verify_one_presence.sh $*
|
||||
|
||||
stats/verify_count/%.stats:
|
||||
bash verify_one_count.sh $*
|
||||
|
||||
verify_presence: $(VERIFY_PRESENCE_STATS)
|
||||
verify_count: $(VERIFY_COUNT_STATS)
|
||||
|
||||
# ── verification stats aggregation ───────────────────────────────────────────
|
||||
|
||||
aggregate_verify_presence: $(VERIFY_PRESENCE_STATS)
|
||||
bash aggregate_stats.sh verify_presence
|
||||
|
||||
aggregate_verify_count: $(VERIFY_COUNT_STATS)
|
||||
bash aggregate_stats.sh verify_count
|
||||
|
||||
# ── species-specific indexes ──────────────────────────────────────────────────
|
||||
# Prerequisites (global index → specific index) are in deps.mk.
|
||||
|
||||
specific_index_presence/%/index.done \
|
||||
stats/specific_kmer_presence/%.stats &: $(BINARY)
|
||||
bash filter_one_presence.sh $*
|
||||
|
||||
specific_index_count/%/index.done \
|
||||
stats/specific_kmer_count/%.stats &: $(BINARY)
|
||||
bash filter_one_count.sh $*
|
||||
|
||||
filter_presence: $(SPECIFIC_PRESENCE_DONE)
|
||||
filter_count: $(SPECIFIC_COUNT_DONE)
|
||||
|
||||
aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS)
|
||||
bash aggregate_stats.sh specific_kmer_presence
|
||||
|
||||
aggregate_filter_count: $(SPECIFIC_COUNT_STATS)
|
||||
bash aggregate_stats.sh specific_kmer_count
|
||||
|
||||
# ── merged index verification ─────────────────────────────────────────────────
|
||||
|
||||
stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done
|
||||
bash verify_merge_presence.sh
|
||||
|
||||
stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done
|
||||
bash verify_merge_count.sh
|
||||
@@ -0,0 +1,132 @@
|
||||
# Benchmark pipeline
|
||||
|
||||
Requires **GNU Make ≥ 4.3** (grouped targets `&:`). On macOS use `gmake`.
|
||||
|
||||
```
|
||||
gmake all # full pipeline
|
||||
gmake simulate # simulation only
|
||||
gmake reference # reference kmer sets only
|
||||
```
|
||||
|
||||
## Pipeline overview
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
GENOMES["genomes/*.fna.gz"]
|
||||
BIN["obikmer binary"]
|
||||
|
||||
GENOMES --> simulate
|
||||
simulate --> simdata[("simulated_data/")]
|
||||
|
||||
simdata --> reference
|
||||
reference --> refnpz[("reference_index/*.npz")]
|
||||
|
||||
subgraph presence ["Presence track"]
|
||||
simdata --> index_presence
|
||||
BIN --> index_presence
|
||||
index_presence --> pres_done[("specimen_index_presence/")]
|
||||
index_presence --> pres_istats[("stats/indexing_presence/")]
|
||||
pres_istats --> aggregate_index_presence
|
||||
|
||||
pres_done --> merge_presence
|
||||
BIN --> merge_presence
|
||||
merge_presence --> gpres[("global_index_presence/")]
|
||||
|
||||
refnpz --> verify_presence
|
||||
pres_done --> verify_presence
|
||||
verify_presence --> vpres_stats[("stats/verify_presence/")]
|
||||
vpres_stats --> aggregate_verify_presence
|
||||
|
||||
gpres --> filter_presence
|
||||
BIN --> filter_presence
|
||||
filter_presence --> spec_pres[("specific_index_presence/")]
|
||||
filter_presence --> spec_pres_stats[("stats/specific_kmer_presence/")]
|
||||
spec_pres_stats --> aggregate_filter_presence
|
||||
|
||||
refnpz --> verify_merge_presence
|
||||
gpres --> verify_merge_presence
|
||||
verify_merge_presence --> vmp[("stats/verify_merge_presence/")]
|
||||
end
|
||||
|
||||
subgraph count ["Count track"]
|
||||
simdata --> index_count
|
||||
BIN --> index_count
|
||||
index_count --> count_done[("specimen_index_count/")]
|
||||
index_count --> count_istats[("stats/indexing_count/")]
|
||||
count_istats --> aggregate_index_count
|
||||
|
||||
count_done --> merge_count
|
||||
BIN --> merge_count
|
||||
merge_count --> gcount[("global_index_count/")]
|
||||
|
||||
refnpz --> verify_count
|
||||
count_done --> verify_count
|
||||
verify_count --> vcount_stats[("stats/verify_count/")]
|
||||
vcount_stats --> aggregate_verify_count
|
||||
|
||||
gcount --> filter_count
|
||||
BIN --> filter_count
|
||||
filter_count --> spec_count[("specific_index_count/")]
|
||||
filter_count --> spec_count_stats[("stats/specific_kmer_count/")]
|
||||
spec_count_stats --> aggregate_filter_count
|
||||
|
||||
refnpz --> verify_merge_count
|
||||
gcount --> verify_merge_count
|
||||
verify_merge_count --> vmc[("stats/verify_merge_count/")]
|
||||
end
|
||||
|
||||
aggregate_verify_presence --> all
|
||||
aggregate_verify_count --> all
|
||||
vmp --> all
|
||||
vmc --> all
|
||||
aggregate_filter_presence --> all
|
||||
aggregate_filter_count --> all
|
||||
```
|
||||
|
||||
## Steps
|
||||
|
||||
| Target | Script | Description |
|
||||
|---|---|---|
|
||||
| `simulate` | `simulate_one.sh` | Simulate sequencing reads from the reference genomes |
|
||||
| `reference` | `build_reference.sh` | Build reference kmer sets (`.npz`) from simulation truth |
|
||||
| `index_presence` | `index_one_presence.sh` | Index each specimen (presence mode) |
|
||||
| `index_count` | `index_one_count.sh` | Index each specimen (count mode) |
|
||||
| `aggregate_index_presence` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (presence) |
|
||||
| `aggregate_index_count` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (count) |
|
||||
| `merge_presence` | `merge_presence.sh` | Merge all specimen presence indexes into a global index |
|
||||
| `merge_count` | `merge_count.sh` | Merge all specimen count indexes into a global index |
|
||||
| `verify_presence` | `verify_one_presence.sh` | Verify each specimen presence index against reference |
|
||||
| `verify_count` | `verify_one_count.sh` | Verify each specimen count index against reference |
|
||||
| `aggregate_verify_presence` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (presence) |
|
||||
| `aggregate_verify_count` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (count) |
|
||||
| `filter_presence` | `filter_one_presence.sh` | Extract species-specific presence indexes from global index |
|
||||
| `filter_count` | `filter_one_count.sh` | Extract species-specific count indexes from global index |
|
||||
| `aggregate_filter_presence` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (presence) |
|
||||
| `aggregate_filter_count` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (count) |
|
||||
| `verify_merge_presence` | `verify_merge_presence.sh` | Verify global presence index against all reference sets |
|
||||
| `verify_merge_count` | `verify_merge_count.sh` | Verify global count index against all reference sets |
|
||||
|
||||
## Directory layout
|
||||
|
||||
```
|
||||
benchmark/
|
||||
├── genomes/ # input reference genomes (.fna.gz)
|
||||
├── simulated_data/ # generated by simulate
|
||||
│ └── <species>/<specimen>/
|
||||
├── reference_index/ # reference kmer sets (.npz)
|
||||
├── specimen_index_presence/ # per-specimen presence indexes
|
||||
├── specimen_index_count/ # per-specimen count indexes
|
||||
├── global_index_presence/ # merged global presence index
|
||||
├── global_index_count/ # merged global count index
|
||||
├── specific_index_presence/ # species-specific presence indexes
|
||||
├── specific_index_count/ # species-specific count indexes
|
||||
└── stats/ # all benchmark statistics
|
||||
├── indexing_presence/
|
||||
├── indexing_count/
|
||||
├── verify_presence/
|
||||
├── verify_count/
|
||||
├── specific_kmer_presence/
|
||||
├── specific_kmer_count/
|
||||
├── verify_merge_presence/
|
||||
└── verify_merge_count/
|
||||
```
|
||||
Executable
+53
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env bash
# Usage: aggregate_stats.sh TYPE
#   TYPE = indexing_presence | indexing_count
#        | verify_presence | verify_count
#        | specific_kmer_presence | specific_kmer_count
#
# Reads all stats/TYPE/*.stats files (CSV data rows, no header) and prefixes
# every row with the run number.
# Creates a new stats/TYPE/run_NNN.csv only if any .stats file is newer than
# the most recent run CSV (idempotent when nothing changed).
# NNN is the number of run_*.csv files already present in stats/TYPE.
set -euo pipefail

if [[ $# -lt 1 ]]; then
    echo "Usage: $(basename "$0") TYPE" >&2
    exit 1
fi

TYPE="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"

# Column header of the aggregated CSV; the leading "run" column is added here.
case "${TYPE}" in
    indexing_presence|indexing_count)
        HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
        ;;
    verify_presence)
        HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
        ;;
    verify_count)
        HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
        ;;
    specific_kmer_presence|specific_kmer_count)
        HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
        ;;
    *)
        echo "ERROR: unknown stats type '${TYPE}'" >&2
        exit 1
        ;;
esac

# Without this check a missing directory makes `find` fail and, under
# `set -e -o pipefail`, aborts the script without any message.
if [[ ! -d "${STATS_DIR}" ]]; then
    echo "ERROR: stats directory '${STATS_DIR}' does not exist" >&2
    exit 1
fi

# Find most recent existing run CSV (empty string if none).
latest_csv=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | sort | tail -1)

# Check if any .stats file is newer than the latest run CSV.
if [[ -n "${latest_csv}" ]] && \
   [[ -z "$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}" 2>/dev/null)" ]]; then
    echo "[${TYPE}] stats up to date (${latest_csv})"
    exit 0
fi

run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | wc -l | tr -d ' ')")
CSV="${STATS_DIR}/run_${run_n}.csv"

echo "${HEADER}" >"${CSV}"

# Sort .stats files by name for reproducible row order.
while IFS= read -r stats_file; do
    sed "s/^/${run_n},/" "${stats_file}"
done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort) >>"${CSV}"

echo "[${TYPE}] run ${run_n} → ${CSV}"
|
||||
Executable
+137
@@ -0,0 +1,137 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build a reference kmer index from paired-end FASTQ reads.
|
||||
|
||||
Extracts canonical kmers — min(kmer, revcomp(kmer)) encoded as uint64 —
|
||||
counts their abundances, and saves a sorted numpy pair (kmers, counts).
|
||||
|
||||
Output .npz arrays
|
||||
kmers : uint64, sorted ascending — canonical kmer integers
|
||||
counts : uint32, same order — raw read abundances
|
||||
"""
|
||||
import argparse
|
||||
import gzip
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# ── encoding ────────────────────────────────────────────────────────────────
|
||||
|
||||
# Two-bit nucleotide codes; upper- and lower-case letters map to the same code.
_ENCODE = {base: code
           for code, pair in enumerate(('Aa', 'Cc', 'Gg', 'Tt'))
           for base in pair}


def _revcomp_byte(byte: int) -> int:
    """Reverse-complement the four 2-bit bases packed into one byte."""
    flipped = 0
    # Walk the bases from lowest to highest; each complemented base (3 - code)
    # is pushed in from the right, which reverses their order.
    for shift in (0, 2, 4, 6):
        flipped = (flipped << 2) | (3 - ((byte >> shift) & 3))
    return flipped


# Lookup table: reverse complement of every possible byte (4 bases, 8 bits),
# built once when the module is imported.
_REVCOMP8 = [_revcomp_byte(value) for value in range(256)]


def revcomp_int(kmer: int, k: int) -> int:
    """Return the reverse complement of an integer-encoded kmer.

    ``kmer`` holds ``k`` bases at 2 bits per base. The value is consumed one
    byte (4 bases) at a time from its low end through ``_REVCOMP8``; when fewer
    than 8 bits remain, only the top bits of the table entry are kept.
    """
    result = 0
    remaining = 2 * k
    while remaining > 0:
        width = remaining if remaining < 8 else 8
        result = (result << width) | (_REVCOMP8[kmer & 0xFF] >> (8 - width))
        kmer >>= width
        remaining -= width
    return result
|
||||
|
||||
|
||||
# ── FASTQ parsing ────────────────────────────────────────────────────────────
|
||||
|
||||
def iter_sequences(path: str):
    """Yield the sequence line of every record in a FASTQ file.

    Files whose name ends in ``.gz`` are read through ``gzip``; anything else
    is opened as plain text. Records are consumed as fixed groups of four
    lines and iteration stops when the header read returns nothing.
    """
    open_fn = gzip.open if path.endswith('.gz') else open
    with open_fn(path, 'rt') as handle:
        while handle.readline():                      # '@' header
            sequence = handle.readline().rstrip('\n')
            handle.readline()                         # '+' separator
            handle.readline()                         # quality string
            yield sequence
|
||||
|
||||
|
||||
# ── kmer counting ────────────────────────────────────────────────────────────
|
||||
|
||||
def count_kmers(paths: list[str], k: int) -> dict[int, int]:
    """Count canonical kmers over all reads of the given FASTQ files.

    Each read is scanned with a rolling 2-bit window of ``k`` bases. Any
    character missing from ``_ENCODE`` restarts the window. The key recorded
    for a window is the smaller of its integer value and that of its reverse
    complement. Progress is reported on stderr.

    Returns a mapping from canonical kmer integer to occurrence count.
    """
    mask = (1 << (2 * k)) - 1
    counts: dict[int, int] = defaultdict(int)
    n_reads = 0

    for path in paths:
        for seq in iter_sequences(path):
            n_reads += 1
            window = 0
            valid = 0        # consecutive encodable bases seen so far

            for base in seq:
                code = _ENCODE.get(base)
                if code is not None:
                    window = ((window << 2) | code) & mask
                    valid += 1
                    if valid >= k:
                        flipped = revcomp_int(window, k)
                        canonical = flipped if flipped < window else window
                        counts[canonical] += 1
                else:
                    # N or unexpected character: start a new window.
                    window = 0
                    valid = 0

            if n_reads % 100_000 == 0:
                print(f' {n_reads:,} reads processed, '
                      f'{len(counts):,} distinct kmers so far',
                      file=sys.stderr)

    print(f' {n_reads:,} reads total, {len(counts):,} distinct kmers',
          file=sys.stderr)
    return counts
|
||||
|
||||
|
||||
# ── main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Command-line entry point: count kmers in FASTQ files and save a .npz.

    Parses the arguments, counts canonical kmers with ``count_kmers``,
    optionally drops kmers below ``--min-abundance``, and writes the sorted
    ``kmers`` (uint64) and matching ``counts`` (uint32) arrays with
    ``numpy.savez_compressed``.

    Exits through ``argparse`` with a usage error when ``--kmer-size`` is
    outside 1..32, so an unusable value is rejected before any read is parsed.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reads', nargs='+', metavar='FASTQ',
                    help='Input reads (FASTQ, gzip OK)')
    ap.add_argument('-k', '--kmer-size', type=int, default=31,
                    metavar='K')
    ap.add_argument('--min-abundance', type=int, default=1,
                    metavar='N', help='Drop kmers with count < N (default 1)')
    ap.add_argument('-o', '--output', required=True,
                    metavar='FILE', help='Output .npz path')
    args = ap.parse_args()

    # Kmers are saved as uint64 at 2 bits per base, so at most 32 bases fit.
    if not 1 <= args.kmer_size <= 32:
        ap.error(f'--kmer-size must be between 1 and 32 (got {args.kmer_size})')

    print(f'k={args.kmer_size} files={len(args.reads)}', file=sys.stderr)
    counts = count_kmers(args.reads, args.kmer_size)

    if args.min_abundance > 1:
        before = len(counts)
        counts = {k: v for k, v in counts.items() if v >= args.min_abundance}
        print(f' min-abundance={args.min_abundance}: '
              f'{before - len(counts):,} kmers dropped, '
              f'{len(counts):,} retained',
              file=sys.stderr)

    print(f'Sorting and saving → {args.output}', file=sys.stderr)
    # Sort the (kmer, count) pairs once so both arrays share the same order.
    ordered = sorted(counts.items())
    kmers_arr = np.fromiter((kmer for kmer, _ in ordered),
                            dtype=np.uint64, count=len(ordered))
    counts_arr = np.array([n for _, n in ordered], dtype=np.uint32)

    np.savez_compressed(args.output, kmers=kmers_arr, counts=counts_arr)
    print(f'Done {len(kmers_arr):,} kmers → {args.output}', file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Executable
+39
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env bash
# Build one reference k-mer index (.npz) per simulated specimen.
#
# Walks simulated_data/<species>/<strain>/ next to this script and, for every
# strain directory holding both reads_R1.fastq.gz and reads_R2.fastq.gz, runs
# build_reference.py on the pair, writing
# reference_index/<species>--<strain>.npz.
#
# Environment:
#   KMER_SIZE      value passed as --kmer-size      (default 31)
#   MIN_ABUNDANCE  value passed as --min-abundance  (default 1)
set -euo pipefail

here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
sim_root="${here}/simulated_data"
index_root="${here}/reference_index"
interpreter="${here}/../.venv/bin/python3"
builder="${here}/build_reference.py"

KMER_SIZE="${KMER_SIZE:-31}"
MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}"

mkdir -p "${index_root}"

for species_path in "${sim_root}"/*/; do
    # An unmatched glob stays literal; skip anything that is not a directory.
    if [[ ! -d "${species_path}" ]]; then
        continue
    fi
    species_name=$(basename "${species_path}")

    for strain_path in "${species_path}"*/; do
        if [[ ! -d "${strain_path}" ]]; then
            continue
        fi
        strain_name=$(basename "${strain_path}")
        label="${species_name}--${strain_name}"

        fwd="${strain_path}/reads_R1.fastq.gz"
        rev="${strain_path}/reads_R2.fastq.gz"
        # Both mates are required; report and move on when either is absent.
        if ! [[ -f "${fwd}" && -f "${rev}" ]]; then
            echo "SKIP ${label}: reads not found" >&2
            continue
        fi

        npz="${index_root}/${label}.npz"
        echo "[${label}] → ${npz}"

        build_cmd=(
            "${interpreter}" "${builder}"
            --kmer-size "${KMER_SIZE}"
            --min-abundance "${MIN_ABUNDANCE}"
            --output "${npz}"
            "${fwd}" "${rev}"
        )
        "${build_cmd[@]}"
    done
done
|
||||
@@ -0,0 +1,199 @@
|
||||
# Every specimen covered by the per-specimen rules in this file, named <Species>--<Strain>.
SPECIMENS := Escherichia_coli--K-12_MG1655 Escherichia_coli--EDL933 Salmonella_enterica--LT2 Escherichia_coli--CFT073 Bacillus_subtilis--168 Salmonella_enterica--P125109 Shouchella_clausii--KSM-K16 Escherichia_coli--K-12_W3110 Klebsiella_pneumoniae--MGH_78578 Opitutus_terrae--PB90-1 Saccharolobus_islandicus--M.16.4 Acidobacterium_capsulatum--ATCC_51196 Salmonella_enterica--AKU_12601 Proteus_mirabilis--HI4320 Salmonella_enterica--CT18 Klebsiella_pneumoniae--HS11286 Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 Klebsiella_pneumoniae--ATCC_13883 Yersinia_ruckeri--YRB Candidozyma_auris--GCF_003013715.1_ASM301371v2
|
||||
# Distinct species (the part of a specimen name before `--`); each one has a pair of specific_index_* rules in this file.
SPECIES := Escherichia_coli Salmonella_enterica Bacillus_subtilis Shouchella_clausii Klebsiella_pneumoniae Opitutus_terrae Saccharolobus_islandicus Acidobacterium_capsulatum Proteus_mirabilis Wolbachia_endosymbiont Yersinia_ruckeri Candidozyma_auris
|
||||
|
||||
# Escherichia_coli--K-12_MG1655
|
||||
simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz: genomes/GCF_000005845.2_ASM584v2_genomic.fna.gz
|
||||
reference_index/Escherichia_coli--K-12_MG1655.npz: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
|
||||
specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done stats/indexing_presence/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
|
||||
specimen_index_count/Escherichia_coli--K-12_MG1655/index.done stats/indexing_count/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
|
||||
stats/verify_presence/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done
|
||||
stats/verify_count/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_count/Escherichia_coli--K-12_MG1655/index.done
|
||||
|
||||
# Escherichia_coli--EDL933
|
||||
simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz: genomes/GCF_000006665.1_ASM666v1_genomic.fna.gz
|
||||
reference_index/Escherichia_coli--EDL933.npz: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
|
||||
specimen_index_presence/Escherichia_coli--EDL933/index.done stats/indexing_presence/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
|
||||
specimen_index_count/Escherichia_coli--EDL933/index.done stats/indexing_count/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
|
||||
stats/verify_presence/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_presence/Escherichia_coli--EDL933/index.done
|
||||
stats/verify_count/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_count/Escherichia_coli--EDL933/index.done
|
||||
|
||||
# Salmonella_enterica--LT2
|
||||
simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz: genomes/GCF_000006945.2_ASM694v2_genomic.fna.gz
|
||||
reference_index/Salmonella_enterica--LT2.npz: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
|
||||
specimen_index_presence/Salmonella_enterica--LT2/index.done stats/indexing_presence/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
|
||||
specimen_index_count/Salmonella_enterica--LT2/index.done stats/indexing_count/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
|
||||
stats/verify_presence/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_presence/Salmonella_enterica--LT2/index.done
|
||||
stats/verify_count/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_count/Salmonella_enterica--LT2/index.done
|
||||
|
||||
# Escherichia_coli--CFT073
|
||||
simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz: genomes/GCF_000007445.1_ASM744v1_genomic.fna.gz
|
||||
reference_index/Escherichia_coli--CFT073.npz: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
|
||||
specimen_index_presence/Escherichia_coli--CFT073/index.done stats/indexing_presence/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
|
||||
specimen_index_count/Escherichia_coli--CFT073/index.done stats/indexing_count/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
|
||||
stats/verify_presence/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_presence/Escherichia_coli--CFT073/index.done
|
||||
stats/verify_count/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_count/Escherichia_coli--CFT073/index.done
|
||||
|
||||
# Bacillus_subtilis--168
|
||||
simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz: genomes/GCF_000009045.1_ASM904v1_genomic.fna.gz
|
||||
reference_index/Bacillus_subtilis--168.npz: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
|
||||
specimen_index_presence/Bacillus_subtilis--168/index.done stats/indexing_presence/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
|
||||
specimen_index_count/Bacillus_subtilis--168/index.done stats/indexing_count/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
|
||||
stats/verify_presence/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_presence/Bacillus_subtilis--168/index.done
|
||||
stats/verify_count/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_count/Bacillus_subtilis--168/index.done
|
||||
|
||||
# Salmonella_enterica--P125109
|
||||
simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz: genomes/GCF_000009505.1_ASM950v1_genomic.fna.gz
|
||||
reference_index/Salmonella_enterica--P125109.npz: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
|
||||
specimen_index_presence/Salmonella_enterica--P125109/index.done stats/indexing_presence/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
|
||||
specimen_index_count/Salmonella_enterica--P125109/index.done stats/indexing_count/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
|
||||
stats/verify_presence/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_presence/Salmonella_enterica--P125109/index.done
|
||||
stats/verify_count/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_count/Salmonella_enterica--P125109/index.done
|
||||
|
||||
# Shouchella_clausii--KSM-K16
|
||||
simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz: genomes/GCF_000009825.1_ASM982v1_genomic.fna.gz
|
||||
reference_index/Shouchella_clausii--KSM-K16.npz: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
|
||||
specimen_index_presence/Shouchella_clausii--KSM-K16/index.done stats/indexing_presence/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
|
||||
specimen_index_count/Shouchella_clausii--KSM-K16/index.done stats/indexing_count/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
|
||||
stats/verify_presence/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_presence/Shouchella_clausii--KSM-K16/index.done
|
||||
stats/verify_count/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_count/Shouchella_clausii--KSM-K16/index.done
|
||||
|
||||
# Escherichia_coli--K-12_W3110
|
||||
simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz: genomes/GCF_000010245.2_ASM1024v1_genomic.fna.gz
|
||||
reference_index/Escherichia_coli--K-12_W3110.npz: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
|
||||
specimen_index_presence/Escherichia_coli--K-12_W3110/index.done stats/indexing_presence/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
|
||||
specimen_index_count/Escherichia_coli--K-12_W3110/index.done stats/indexing_count/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
|
||||
stats/verify_presence/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_presence/Escherichia_coli--K-12_W3110/index.done
|
||||
stats/verify_count/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_count/Escherichia_coli--K-12_W3110/index.done
|
||||
|
||||
# Klebsiella_pneumoniae--MGH_78578
|
||||
simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz: genomes/GCF_000016305.1_ASM1630v1_genomic.fna.gz
|
||||
reference_index/Klebsiella_pneumoniae--MGH_78578.npz: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
|
||||
specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_presence/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
|
||||
specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_count/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
|
||||
stats/verify_presence/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done
|
||||
stats/verify_count/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done
|
||||
|
||||
# Opitutus_terrae--PB90-1
|
||||
simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz: genomes/GCF_000019965.1_ASM1996v1_genomic.fna.gz
|
||||
reference_index/Opitutus_terrae--PB90-1.npz: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
|
||||
specimen_index_presence/Opitutus_terrae--PB90-1/index.done stats/indexing_presence/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
|
||||
specimen_index_count/Opitutus_terrae--PB90-1/index.done stats/indexing_count/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
|
||||
stats/verify_presence/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_presence/Opitutus_terrae--PB90-1/index.done
|
||||
stats/verify_count/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_count/Opitutus_terrae--PB90-1/index.done
|
||||
|
||||
# Saccharolobus_islandicus--M.16.4
|
||||
simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz: genomes/GCF_000022445.1_ASM2244v1_genomic.fna.gz
|
||||
reference_index/Saccharolobus_islandicus--M.16.4.npz: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
|
||||
specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_presence/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
|
||||
specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_count/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
|
||||
stats/verify_presence/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done
|
||||
stats/verify_count/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done
|
||||
|
||||
# Acidobacterium_capsulatum--ATCC_51196
|
||||
simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz: genomes/GCF_000022565.1_ASM2256v1_genomic.fna.gz
|
||||
reference_index/Acidobacterium_capsulatum--ATCC_51196.npz: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
|
||||
specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_presence/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
|
||||
specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_count/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
|
||||
stats/verify_presence/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done
|
||||
stats/verify_count/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done
|
||||
|
||||
# Salmonella_enterica--AKU_12601
|
||||
simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz: genomes/GCF_000026565.1_ASM2656v1_genomic.fna.gz
|
||||
reference_index/Salmonella_enterica--AKU_12601.npz: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
|
||||
specimen_index_presence/Salmonella_enterica--AKU_12601/index.done stats/indexing_presence/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
|
||||
specimen_index_count/Salmonella_enterica--AKU_12601/index.done stats/indexing_count/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
|
||||
stats/verify_presence/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_presence/Salmonella_enterica--AKU_12601/index.done
|
||||
stats/verify_count/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_count/Salmonella_enterica--AKU_12601/index.done
|
||||
|
||||
# Proteus_mirabilis--HI4320
|
||||
simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz: genomes/GCF_000069965.1_ASM6996v1_genomic.fna.gz
|
||||
reference_index/Proteus_mirabilis--HI4320.npz: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
|
||||
specimen_index_presence/Proteus_mirabilis--HI4320/index.done stats/indexing_presence/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
|
||||
specimen_index_count/Proteus_mirabilis--HI4320/index.done stats/indexing_count/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
|
||||
stats/verify_presence/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_presence/Proteus_mirabilis--HI4320/index.done
|
||||
stats/verify_count/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_count/Proteus_mirabilis--HI4320/index.done
|
||||
|
||||
# Salmonella_enterica--CT18
|
||||
simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz: genomes/GCF_000195995.1_ASM19599v1_genomic.fna.gz
|
||||
reference_index/Salmonella_enterica--CT18.npz: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
|
||||
specimen_index_presence/Salmonella_enterica--CT18/index.done stats/indexing_presence/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
|
||||
specimen_index_count/Salmonella_enterica--CT18/index.done stats/indexing_count/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
|
||||
stats/verify_presence/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_presence/Salmonella_enterica--CT18/index.done
|
||||
stats/verify_count/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_count/Salmonella_enterica--CT18/index.done
|
||||
|
||||
# Klebsiella_pneumoniae--HS11286
|
||||
simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz: genomes/GCF_000240185.1_ASM24018v2_genomic.fna.gz
|
||||
reference_index/Klebsiella_pneumoniae--HS11286.npz: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
|
||||
specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_presence/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
|
||||
specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_count/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
|
||||
stats/verify_presence/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done
|
||||
stats/verify_count/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done
|
||||
|
||||
# Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1
|
||||
simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz: genomes/GCF_000306885.1_ASM30688v1_genomic.fna.gz
|
||||
reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
|
||||
specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
|
||||
specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
|
||||
stats/verify_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
|
||||
stats/verify_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
|
||||
|
||||
# Klebsiella_pneumoniae--ATCC_13883
|
||||
simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz: genomes/GCF_000742135.1_ASM74213v1_genomic.fna.gz
|
||||
reference_index/Klebsiella_pneumoniae--ATCC_13883.npz: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
|
||||
specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_presence/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
|
||||
specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_count/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
|
||||
stats/verify_presence/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done
|
||||
stats/verify_count/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done
|
||||
|
||||
# Yersinia_ruckeri--YRB
|
||||
simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz: genomes/GCF_000834255.1_ASM83425v1_genomic.fna.gz
|
||||
reference_index/Yersinia_ruckeri--YRB.npz: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
|
||||
specimen_index_presence/Yersinia_ruckeri--YRB/index.done stats/indexing_presence/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
|
||||
specimen_index_count/Yersinia_ruckeri--YRB/index.done stats/indexing_count/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
|
||||
stats/verify_presence/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_presence/Yersinia_ruckeri--YRB/index.done
|
||||
stats/verify_count/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_count/Yersinia_ruckeri--YRB/index.done
|
||||
|
||||
# Candidozyma_auris--GCF_003013715.1_ASM301371v2
|
||||
simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz: genomes/GCF_003013715.1_ASM301371v2_genomic.fna.gz
|
||||
reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
|
||||
specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
|
||||
specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
|
||||
stats/verify_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
|
||||
stats/verify_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
|
||||
|
||||
# Escherichia_coli
|
||||
specific_index_presence/Escherichia_coli/index.done stats/specific_kmer_presence/Escherichia_coli.stats: global_index_presence/index.done
|
||||
specific_index_count/Escherichia_coli/index.done stats/specific_kmer_count/Escherichia_coli.stats: global_index_count/index.done
|
||||
# Salmonella_enterica
|
||||
specific_index_presence/Salmonella_enterica/index.done stats/specific_kmer_presence/Salmonella_enterica.stats: global_index_presence/index.done
|
||||
specific_index_count/Salmonella_enterica/index.done stats/specific_kmer_count/Salmonella_enterica.stats: global_index_count/index.done
|
||||
# Bacillus_subtilis
|
||||
specific_index_presence/Bacillus_subtilis/index.done stats/specific_kmer_presence/Bacillus_subtilis.stats: global_index_presence/index.done
|
||||
specific_index_count/Bacillus_subtilis/index.done stats/specific_kmer_count/Bacillus_subtilis.stats: global_index_count/index.done
|
||||
# Shouchella_clausii
|
||||
specific_index_presence/Shouchella_clausii/index.done stats/specific_kmer_presence/Shouchella_clausii.stats: global_index_presence/index.done
|
||||
specific_index_count/Shouchella_clausii/index.done stats/specific_kmer_count/Shouchella_clausii.stats: global_index_count/index.done
|
||||
# Klebsiella_pneumoniae
|
||||
specific_index_presence/Klebsiella_pneumoniae/index.done stats/specific_kmer_presence/Klebsiella_pneumoniae.stats: global_index_presence/index.done
|
||||
specific_index_count/Klebsiella_pneumoniae/index.done stats/specific_kmer_count/Klebsiella_pneumoniae.stats: global_index_count/index.done
|
||||
# Opitutus_terrae
|
||||
specific_index_presence/Opitutus_terrae/index.done stats/specific_kmer_presence/Opitutus_terrae.stats: global_index_presence/index.done
|
||||
specific_index_count/Opitutus_terrae/index.done stats/specific_kmer_count/Opitutus_terrae.stats: global_index_count/index.done
|
||||
# Saccharolobus_islandicus
|
||||
specific_index_presence/Saccharolobus_islandicus/index.done stats/specific_kmer_presence/Saccharolobus_islandicus.stats: global_index_presence/index.done
|
||||
specific_index_count/Saccharolobus_islandicus/index.done stats/specific_kmer_count/Saccharolobus_islandicus.stats: global_index_count/index.done
|
||||
# Acidobacterium_capsulatum
|
||||
specific_index_presence/Acidobacterium_capsulatum/index.done stats/specific_kmer_presence/Acidobacterium_capsulatum.stats: global_index_presence/index.done
|
||||
specific_index_count/Acidobacterium_capsulatum/index.done stats/specific_kmer_count/Acidobacterium_capsulatum.stats: global_index_count/index.done
|
||||
# Proteus_mirabilis
|
||||
specific_index_presence/Proteus_mirabilis/index.done stats/specific_kmer_presence/Proteus_mirabilis.stats: global_index_presence/index.done
|
||||
specific_index_count/Proteus_mirabilis/index.done stats/specific_kmer_count/Proteus_mirabilis.stats: global_index_count/index.done
|
||||
# Wolbachia_endosymbiont
|
||||
specific_index_presence/Wolbachia_endosymbiont/index.done stats/specific_kmer_presence/Wolbachia_endosymbiont.stats: global_index_presence/index.done
|
||||
specific_index_count/Wolbachia_endosymbiont/index.done stats/specific_kmer_count/Wolbachia_endosymbiont.stats: global_index_count/index.done
|
||||
# Yersinia_ruckeri
|
||||
specific_index_presence/Yersinia_ruckeri/index.done stats/specific_kmer_presence/Yersinia_ruckeri.stats: global_index_presence/index.done
|
||||
specific_index_count/Yersinia_ruckeri/index.done stats/specific_kmer_count/Yersinia_ruckeri.stats: global_index_count/index.done
|
||||
# Candidozyma_auris
|
||||
specific_index_presence/Candidozyma_auris/index.done stats/specific_kmer_presence/Candidozyma_auris.stats: global_index_presence/index.done
|
||||
specific_index_count/Candidozyma_auris/index.done stats/specific_kmer_count/Candidozyma_auris.stats: global_index_count/index.done
|
||||
Executable
+48
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env bash
# Download each genome assembly listed below and store its sequence file(s)
# as genomes/<name>.fna.gz.
#
# For every accession: fetch a zip archive with the `datasets` command-line
# tool, unpack it into a scratch directory named after the accession, pass
# every *.fna file found there through `obiconvert -Z` (presumably
# compressed output, given the .gz suffix -- confirm), then delete the
# scratch directory and the archive.

set -euo pipefail

assemblies=(
    GCF_000005845.2
    GCF_000010245.2
    GCF_000007445.1
    GCF_000006665.1

    GCF_000006945.2
    GCF_000195995.1
    GCF_000009505.1
    GCF_000026565.1

    GCF_000016305.1
    GCF_000019965.1
    GCF_000240185.1
    GCF_000742135.1

    GCF_000069965.1
    GCF_000022565.1
    GCF_000306885.1
    GCF_003013715.1

    GCF_000009045.1
    GCF_000009825.1
    GCF_000022445.1
    GCF_000834255.1
)

mkdir -p genomes

for acc in "${assemblies[@]}"; do
    echo "Downloading ${acc}"

    datasets download genome accession "${acc}" \
        --include genome \
        --filename "${acc}.zip"

    unzip -q "${acc}.zip" -d "${acc}"

    # NUL-delimited names with `IFS= read -r` keep each path intact
    # (no backslash processing, no trimming), and the quoted expansions stop
    # word splitting and globbing on the path and on the output file name.
    find "${acc}" -name "*.fna" -print0 |
        while IFS= read -r -d '' file; do
            obiconvert -Z "${file}" >"genomes/$(basename "${file}").gz"
        done

    rm -rf "${acc}" "${acc}.zip"
done
|
||||
Executable
+108
@@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env bash
# Usage: filter_one_count.sh SPECIES
# Filters global_index_count to keep only kmers specific to SPECIES,
# then selects the SPECIES column in-place.
# Outputs:
#   specific_index_count/SPECIES/index.done    (written by obikmer select)
#   stats/specific_kmer_count/SPECIES.stats    (one CSV data row, no header)
#
# The stderr of each obikmer step is captured in a temporary log (parsed
# later in this script to build the stats row) and replayed on this script's
# stderr once the step has finished, whether it succeeded or failed.
set -euo pipefail

# Reject a missing or empty SPECIES with a usage message. An empty value
# would make OUTPUT the whole specific_index_count directory, which is then
# handed to `filter --force`.
SPECIES="${1:?usage: filter_one_count.sh SPECIES}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

SOURCE="${SCRIPT_DIR}/global_index_count"
OUTPUT="${SCRIPT_DIR}/specific_index_count/${SPECIES}"
STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_count"
STATS_FILE="${STATS_DIR}/${SPECIES}.stats"

mkdir -p "${STATS_DIR}"

echo "[${SPECIES}] filter (count) → ${OUTPUT}"

# Temporary stderr captures, removed on any exit.
LOG_FILTER=$(mktemp)
LOG_SELECT=$(mktemp)
trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT

# Step 1: filter. The exit status is held back so the captured log can be
# replayed first; letting `set -e` abort here would exit before the replay
# and the EXIT trap would delete the only copy of the error output.
filter_rc=0
"${BINARY}" filter \
    --output "${OUTPUT}" \
    --force \
    --ingroup "species=${SPECIES}" \
    --outgroup all \
    --min-frac 0.5 \
    --max-frac 1.0 \
    --max-outgroup-count 0 \
    "${SOURCE}" \
    2>"${LOG_FILTER}" || filter_rc=$?

cat "${LOG_FILTER}" >&2
if (( filter_rc != 0 )); then
    exit "${filter_rc}"
fi

# Step 2: select the SPECIES column in place, with the same log handling.
select_rc=0
"${BINARY}" select \
    --in-place \
    --group "${SPECIES}:species=${SPECIES}" \
    --group-op "${SPECIES}:any" \
    --select "${SPECIES}" \
    "${OUTPUT}" \
    2>"${LOG_SELECT}" || select_rc=$?

cat "${LOG_SELECT}" >&2
if (( select_rc != 0 )); then
    exit "${select_rc}"
fi
|
||||
|
||||
# Turn the two captured obikmer logs into one CSV row written to STATS_FILE.
# The Python program is read from the quoted heredoc (no shell expansion);
# it receives SPECIES and the two log paths as arguments.
python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
|
||||
import sys, re
|
||||
|
||||
species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
|
||||
|
||||
# Remove ANSI CSI escape sequences (ESC '[' parameters, intermediates, final byte).
def strip_ansi(s):
|
||||
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
|
||||
|
||||
# Convert a duration such as '12ms' or '3.4s' to seconds; any other suffix yields 0.0.
def parse_wall(s):
|
||||
s = s.strip()
|
||||
# 'ms' is tested before 's' because every 'ms' value also ends with 's'.
if s.endswith('ms'): return float(s[:-2]) / 1000.0
|
||||
if s.endswith('s'): return float(s[:-1])
|
||||
return 0.0
|
||||
|
||||
# Convert a size such as '1.5 GB' to an integer byte count (binary multiples); 0 when unparseable.
def parse_rss(s):
|
||||
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
|
||||
if not m: return 0
|
||||
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
|
||||
|
||||
# True for a non-empty line containing no ASCII letter or digit (a table rule line).
def is_sep(s):
|
||||
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
|
||||
|
||||
# Parse the timing table of one log and return {row name: (wall seconds, rss bytes)};
# the row after the closing rule is stored under 'TOTAL'.
# States: 'scan' (look for the header) -> 'in_header' (wait for the rule under it)
# -> 'rows' (one entry per line until the next rule) -> 'total'.
def parse_reporter(logfile):
|
||||
stats = {}
|
||||
state = 'scan'
|
||||
with open(logfile, errors='replace') as fh:
|
||||
for raw in fh:
|
||||
line = strip_ansi(raw.rstrip('\n'))
|
||||
s = line.strip()
|
||||
if state == 'scan':
|
||||
# The header is the first line with the word 'stage' followed later by the word 'wall'.
if re.search(r'\bstage\b.*\bwall\b', line):
|
||||
state = 'in_header'
|
||||
elif state == 'in_header':
|
||||
if is_sep(s): state = 'rows'
|
||||
elif state == 'rows':
|
||||
if is_sep(s): state = 'total'
|
||||
elif s:
|
||||
# NOTE(review): as written this splits on every run of spaces, so a value
# printed with an inner space (e.g. '1.2 GB') would occupy two fields --
# confirm against the reporter's real column layout.
parts = re.split(r' +', s)
|
||||
if len(parts) >= 4:
|
||||
# Field 0 is the row name, field 1 the wall time, field 3 the RSS; field 2 is not used.
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
|
||||
elif state == 'total':
|
||||
if s:
|
||||
parts = re.split(r' +', s)
|
||||
if len(parts) >= 3:
|
||||
# The RSS field is optional on this row; it defaults to 0 when absent.
stats['TOTAL'] = (parse_wall(parts[1]),
|
||||
parse_rss(parts[3]) if len(parts) > 3 else 0)
|
||||
break
|
||||
return stats
|
||||
|
||||
f = parse_reporter(log_filter)
|
||||
s = parse_reporter(log_select)
|
||||
|
||||
# Row layout: species, then a (wall, rss) pair for each of: rebuild, pack and
# the filter log's TOTAL, followed by select and the select log's TOTAL.
# A row missing from its log contributes two empty fields.
row = [species]
|
||||
for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
|
||||
key = 'TOTAL' if stage.endswith('_total') else stage
|
||||
w, r = d.get(key, ('', ''))
|
||||
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
|
||||
print(','.join(row))
|
||||
PYEOF
|
||||
Executable
+108
@@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env bash
# Usage: filter_one_presence.sh SPECIES
# Filters global_index_presence to keep only kmers specific to SPECIES,
# then selects the SPECIES column in-place.
# Outputs:
#   specific_index_presence/SPECIES/index.done   (written by obikmer select)
#   stats/specific_kmer_presence/SPECIES.stats   (one CSV data row, no header)
set -euo pipefail

# Fail with a usage message rather than bash's bare "unbound variable".
SPECIES="${1:?Usage: filter_one_presence.sh SPECIES}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

SOURCE="${SCRIPT_DIR}/global_index_presence"
OUTPUT="${SCRIPT_DIR}/specific_index_presence/${SPECIES}"
STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_presence"
STATS_FILE="${STATS_DIR}/${SPECIES}.stats"

mkdir -p "${STATS_DIR}"

echo "[${SPECIES}] filter (presence) → ${OUTPUT}"

LOG_FILTER=$(mktemp)
LOG_SELECT=$(mktemp)
trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT

# stderr is captured so the reporter table can be parsed afterwards.
# `|| status=$?` stops `set -e` from aborting before the capture is replayed;
# otherwise a failing run would exit silently and the EXIT trap would delete
# the only copy of the error message.
status=0
"${BINARY}" filter \
    --output "${OUTPUT}" \
    --force \
    --ingroup "species=${SPECIES}" \
    --outgroup all \
    --min-frac 0.5 \
    --max-frac 1.0 \
    --max-outgroup-count 0 \
    "${SOURCE}" \
    2>"${LOG_FILTER}" || status=$?

cat "${LOG_FILTER}" >&2
if (( status != 0 )); then
    exit "${status}"
fi

status=0
"${BINARY}" select \
    --in-place \
    --group "${SPECIES}:species=${SPECIES}" \
    --group-op "${SPECIES}:any" \
    --select "${SPECIES}" \
    "${OUTPUT}" \
    2>"${LOG_SELECT}" || status=$?

cat "${LOG_SELECT}" >&2
if (( status != 0 )); then
    exit "${status}"
fi
|
||||
|
||||
python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
|
||||
import sys, re
|
||||
|
||||
species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
|
||||
|
||||
def strip_ansi(s):
    """Return `s` with ANSI CSI escape sequences (ESC '[' ... final byte) removed."""
    csi_sequence = re.compile(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]')
    return csi_sequence.sub('', s)
|
||||
|
||||
def parse_wall(s):
    """Convert a wall-time cell such as '1.5s' or '250ms' into seconds.

    Recognised suffixes: ns, us (also written with U+00B5 or U+03BC), ms, s.
    An unknown unit or a non-numeric value yields 0.0 so that a single odd
    cell cannot abort the whole stats row.
    """
    s = s.strip()
    # Sub-second units are tested first: every one of them also ends in 's',
    # so the plain-seconds branch would otherwise feed e.g. '20u' to float().
    for suffix, per_second in (('ns', 1e9), ('us', 1e6), ('\u00b5s', 1e6),
                               ('\u03bcs', 1e6), ('ms', 1000.0), ('s', 1.0)):
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / per_second
            except ValueError:
                return 0.0
    return 0.0
|
||||
|
||||
def parse_rss(s):
    """Convert a memory cell such as '1.5 GB' into whole bytes; 0 when it does not parse."""
    unit_bytes = {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}
    found = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if found is None:
        return 0
    amount, unit = found.groups()
    return int(float(amount) * unit_bytes[unit])
|
||||
|
||||
def is_sep(s):
    """True when `s` is non-empty and contains no ASCII letter or digit (a table rule line)."""
    if not s:
        return False
    return re.search(r'[A-Za-z0-9]', s) is None
|
||||
|
||||
def parse_reporter(logfile):
    """Read the reporter table from `logfile` into {stage: (wall_seconds, rss_bytes)}.

    The table is located by a header line containing the words 'stage' and
    'wall'. Rows between the first and second rule lines are stage rows; the
    first non-blank line after the second rule is stored under 'TOTAL'.
    """
    collected = {}
    phase = 'scan'
    with open(logfile, errors='replace') as handle:
        for raw_line in handle:
            text = strip_ansi(raw_line.rstrip('\n'))
            stripped = text.strip()

            if phase == 'scan':
                if re.search(r'\bstage\b.*\bwall\b', text):
                    phase = 'in_header'
                continue

            if phase == 'in_header':
                if is_sep(stripped):
                    phase = 'rows'
                continue

            if phase == 'rows' and is_sep(stripped):
                phase = 'total'
                continue

            if not stripped:
                continue

            cells = re.split(r' +', stripped)
            if phase == 'rows':
                # Column 1 is the wall time, column 3 the RSS.
                if len(cells) >= 4:
                    collected[cells[0]] = (parse_wall(cells[1]), parse_rss(cells[3]))
                continue

            # phase == 'total': only the first non-blank line is considered.
            if len(cells) >= 3:
                rss = parse_rss(cells[3]) if len(cells) > 3 else 0
                collected['TOTAL'] = (parse_wall(cells[1]), rss)
            break
    return collected
|
||||
|
||||
# Build the single CSV row: species, then a (wall_s, rss_b) pair per column.
filter_stats = parse_reporter(log_filter)
select_stats = parse_reporter(log_select)

columns = (
    ('rebuild', filter_stats),
    ('pack', filter_stats),
    ('filter_total', filter_stats),
    ('select', select_stats),
    ('select_total', select_stats),
)

fields = [species]
for column, table in columns:
    # '*_total' columns read the TOTAL row of the corresponding log.
    lookup = 'TOTAL' if column.endswith('_total') else column
    wall, rss = table.get(lookup, ('', ''))
    fields.append(f'{wall:.3f}' if isinstance(wall, float) else '')
    fields.append(str(rss))
print(','.join(fields))
|
||||
PYEOF
|
||||
Executable
+103
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env bash
# Usage: index_one_count.sh SPECIMEN
#   SPECIMEN = "species--strain" (Make pattern stem)
# Outputs:
#   specimen_index_count/SPECIMEN/index.done   (written by obikmer)
#   stats/indexing_count/SPECIMEN.stats        (one CSV data row, no header)
set -euo pipefail

# Fail with a usage message rather than bash's bare "unbound variable".
SPECIMEN="${1:?Usage: index_one_count.sh SPECIMEN}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

# Split the stem at its first "--": species before it, strain after it.
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_count"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"

mkdir -p "${STATS_DIR}"

r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
    echo "ERROR: reads not found in ${READS_DIR}" >&2
    exit 1
fi

echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# stderr is captured so the reporter table can be parsed afterwards.
# `|| status=$?` stops `set -e` from aborting before the capture is replayed;
# otherwise a failing run would exit silently and the EXIT trap would delete
# the only copy of the error message.
status=0
"${BINARY}" index \
    --output "${INDEX_PATH}" \
    --force \
    --theta 0 \
    --with-counts \
    --label "${SPECIMEN}" \
    --meta "species=${species}" \
    "${r1}" "${r2}" \
    2>"${STDERR_LOG}" || status=$?

cat "${STDERR_LOG}" >&2
if (( status != 0 )); then
    exit "${status}"
fi
|
||||
|
||||
python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
|
||||
import sys, re
|
||||
|
||||
species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
|
||||
|
||||
def strip_ansi(s):
    """Return `s` with ANSI CSI escape sequences (ESC '[' ... final byte) removed."""
    csi_sequence = re.compile(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]')
    return csi_sequence.sub('', s)
|
||||
|
||||
def parse_wall(s):
    """Convert a wall-time cell such as '1.5s' or '250ms' into seconds.

    Recognised suffixes: ns, us (also written with U+00B5 or U+03BC), ms, s.
    An unknown unit or a non-numeric value yields 0.0 so that a single odd
    cell cannot abort the whole stats row.
    """
    s = s.strip()
    # Sub-second units are tested first: every one of them also ends in 's',
    # so the plain-seconds branch would otherwise feed e.g. '20u' to float().
    for suffix, per_second in (('ns', 1e9), ('us', 1e6), ('\u00b5s', 1e6),
                               ('\u03bcs', 1e6), ('ms', 1000.0), ('s', 1.0)):
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / per_second
            except ValueError:
                return 0.0
    return 0.0
|
||||
|
||||
def parse_rss(s):
    """Convert a memory cell such as '1.5 GB' into whole bytes; 0 when it does not parse."""
    unit_bytes = {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}
    found = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if found is None:
        return 0
    amount, unit = found.groups()
    return int(float(amount) * unit_bytes[unit])
|
||||
|
||||
def is_sep(s):
    """True when `s` is non-empty and contains no ASCII letter or digit (a table rule line)."""
    if not s:
        return False
    return re.search(r'[A-Za-z0-9]', s) is None
|
||||
|
||||
# Walk the captured stderr as a small state machine:
#   scan      -> look for the table header (a line with 'stage' ... 'wall')
#   in_header -> skip until the rule line under the header
#   rows      -> one stage per line until the next rule line
#   total     -> the first non-blank line after that rule is the total row
stats = {}
state = 'scan'

with open(logfile, errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()
        if state == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s): state = 'rows'
        elif state == 'rows':
            if is_sep(s): state = 'total'
            elif s:
                parts = re.split(r' +', s)
                # Column 1 is the wall time, column 3 the RSS.
                if len(parts) >= 4:
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    # Stored under a fixed key: the lookup below asks for
                    # 'TOTAL' whatever label the reporter printed on this row.
                    stats['TOTAL'] = (parse_wall(parts[1]),
                                      parse_rss(parts[3]) if len(parts) > 3 else 0)
                break

# One (wall_s, rss_b) pair per stage in this order, then the total pair.
STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
row = [species, strain]
for stage in STAGE_ORDER:
    # A stage missing from the log leaves both of its cells empty.
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
|
||||
PYEOF
|
||||
Executable
+102
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env bash
# Usage: index_one_presence.sh SPECIMEN
#   SPECIMEN = "species--strain" (Make pattern stem)
# Outputs:
#   specimen_index_presence/SPECIMEN/index.done   (written by obikmer)
#   stats/indexing_presence/SPECIMEN.stats        (one CSV data row, no header)
set -euo pipefail

# Fail with a usage message rather than bash's bare "unbound variable".
SPECIMEN="${1:?Usage: index_one_presence.sh SPECIMEN}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

# Split the stem at its first "--": species before it, strain after it.
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"

mkdir -p "${STATS_DIR}"

r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
    echo "ERROR: reads not found in ${READS_DIR}" >&2
    exit 1
fi

echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# stderr is captured so the reporter table can be parsed afterwards.
# `|| status=$?` stops `set -e` from aborting before the capture is replayed;
# otherwise a failing run would exit silently and the EXIT trap would delete
# the only copy of the error message.
status=0
"${BINARY}" index \
    --output "${INDEX_PATH}" \
    --force \
    --theta 0 \
    --label "${SPECIMEN}" \
    --meta "species=${species}" \
    "${r1}" "${r2}" \
    2>"${STDERR_LOG}" || status=$?

cat "${STDERR_LOG}" >&2
if (( status != 0 )); then
    exit "${status}"
fi
|
||||
|
||||
python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
|
||||
import sys, re
|
||||
|
||||
species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
|
||||
|
||||
def strip_ansi(s):
    """Return `s` with ANSI CSI escape sequences (ESC '[' ... final byte) removed."""
    csi_sequence = re.compile(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]')
    return csi_sequence.sub('', s)
|
||||
|
||||
def parse_wall(s):
    """Convert a wall-time cell such as '1.5s' or '250ms' into seconds.

    Recognised suffixes: ns, us (also written with U+00B5 or U+03BC), ms, s.
    An unknown unit or a non-numeric value yields 0.0 so that a single odd
    cell cannot abort the whole stats row.
    """
    s = s.strip()
    # Sub-second units are tested first: every one of them also ends in 's',
    # so the plain-seconds branch would otherwise feed e.g. '20u' to float().
    for suffix, per_second in (('ns', 1e9), ('us', 1e6), ('\u00b5s', 1e6),
                               ('\u03bcs', 1e6), ('ms', 1000.0), ('s', 1.0)):
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / per_second
            except ValueError:
                return 0.0
    return 0.0
|
||||
|
||||
def parse_rss(s):
    """Convert a memory cell such as '1.5 GB' into whole bytes; 0 when it does not parse."""
    unit_bytes = {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}
    found = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if found is None:
        return 0
    amount, unit = found.groups()
    return int(float(amount) * unit_bytes[unit])
|
||||
|
||||
def is_sep(s):
    """True when `s` is non-empty and contains no ASCII letter or digit (a table rule line)."""
    if not s:
        return False
    return re.search(r'[A-Za-z0-9]', s) is None
|
||||
|
||||
# Walk the captured stderr as a small state machine:
#   scan      -> look for the table header (a line with 'stage' ... 'wall')
#   in_header -> skip until the rule line under the header
#   rows      -> one stage per line until the next rule line
#   total     -> the first non-blank line after that rule is the total row
stats = {}
state = 'scan'

with open(logfile, errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()
        if state == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s): state = 'rows'
        elif state == 'rows':
            if is_sep(s): state = 'total'
            elif s:
                parts = re.split(r' +', s)
                # Column 1 is the wall time, column 3 the RSS.
                if len(parts) >= 4:
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    # Stored under a fixed key: the lookup below asks for
                    # 'TOTAL' whatever label the reporter printed on this row.
                    stats['TOTAL'] = (parse_wall(parts[1]),
                                      parse_rss(parts[3]) if len(parts) > 3 else 0)
                break

# One (wall_s, rss_b) pair per stage in this order, then the total pair.
STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
row = [species, strain]
for stage in STAGE_ORDER:
    # A stage missing from the log leaves both of its cells empty.
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
|
||||
PYEOF
|
||||
@@ -0,0 +1,118 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate deps.mk — pure dependency declarations for the benchmark pipeline.
|
||||
|
||||
Like C .d files: only target: prerequisites lines, no recipes.
|
||||
Recipes stay in the Makefile as generic rules.
|
||||
"""
|
||||
import gzip
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
STOP_WORDS = {'complete', 'chromosome', 'whole', 'sequence', 'genome',
|
||||
'endosymbiont', 'of'}
|
||||
STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')
|
||||
|
||||
|
||||
def is_stop(tok):
    """True when `tok`, lower-cased, is a stop word or begins with a stop prefix."""
    lowered = tok.lower()
    if lowered in STOP_WORDS:
        return True
    # str.startswith accepts the whole tuple of prefixes at once.
    return lowered.startswith(STOP_PREFIXES)
|
||||
|
||||
|
||||
def sanitize(s):
    """Replace every character outside [A-Za-z0-9._-] with '_' and trim '_' from both ends."""
    underscored = re.sub(r'[^A-Za-z0-9._-]', '_', s)
    return underscored.strip('_')
|
||||
|
||||
|
||||
def collect_tokens(text):
    """Join the sanitized words of `text` with '_' up to, not including, the first stop token.

    Trailing ',' and '.' are removed from each word before it is examined;
    words that sanitize to the empty string are left out.
    """
    kept = []
    for word in text.split():
        word = word.rstrip(',.')
        if is_stop(word):
            break
        cleaned = sanitize(word)
        if cleaned:
            kept.append(cleaned)
    return '_'.join(kept)
|
||||
|
||||
|
||||
def parse_organism(defn, gcf_id):
    """Derive a (species, strain) pair of identifiers from a definition line.

    species is the first two words of `defn`, sanitized and joined with '_'.
    strain is taken, in order of preference, from:
      1. 'str. X' optionally followed by 'substr. Y';
      2. the words after 'strain', up to the first stop token;
      3. what follows the binomial name (after dropping a leading 'subsp. X'
         and then a leading 'serovar X'), up to the first stop token;
      4. `gcf_id` when none of the above yields anything.

    A definition with fewer than two words (for example a bare accession)
    has no binomial name: its single word, or `gcf_id` if there is none,
    becomes the species and `gcf_id` the strain.
    """
    words = defn.split()
    if len(words) < 2:
        # Guard for words[1] below, which would raise IndexError here.
        species = sanitize(words[0]) if words else ''
        return species or sanitize(gcf_id), gcf_id
    species = sanitize(words[0] + '_' + words[1])

    m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
    if m:
        strain = sanitize(m.group(1))
        if m.group(2):
            strain += '_' + sanitize(m.group(2))
        return species, strain

    m = re.search(r'\bstrain\b\s+(.*)', defn)
    if m:
        strain = collect_tokens(m.group(1))
        if strain:
            return species, strain

    remainder = re.sub(r'^\S+ \S+\s*', '', defn)
    remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
    remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
    strain = collect_tokens(remainder)
    return species, strain if strain else gcf_id
|
||||
|
||||
|
||||
def first_definition(path):
    """Return a description of the first FASTA record in the gzip file `path`.

    Preference order: the non-empty value of a "definition":"..." field on
    the first header line, else the first whitespace-delimited word of that
    header, else the file's stem (also used when the file has no header).
    """
    with gzip.open(path, 'rt') as fh:
        for line in fh:
            if line.startswith('>'):
                m = re.search(r'"definition":"([^"]*)"', line)
                if m and m.group(1).strip():
                    return m.group(1)
                fields = line[1:].split()
                if fields:
                    return fields[0]
                # Header carries no usable text: fall back to the file name.
                break
    return Path(path).stem
|
||||
|
||||
|
||||
def _specimen_for(path):
    """Return (species, strain) parsed from the first FASTA header of `path`."""
    gcf_id = Path(path).name.replace('_genomic.fna.gz', '')
    return parse_organism(first_definition(path), gcf_id)


def main():
    """Write the dependency declarations for the genomes named on the command line.

    Usage:
        make_deps.py GENOME.fna.gz [GENOME.fna.gz ...]
            print SPECIMENS / SPECIES assignments and the per-specimen and
            per-species 'target: prerequisites' lines to stdout.
        make_deps.py --dir-for GENOME.fna.gz
            print only that genome's simulated-data directory. This is the
            form simulate_all.sh uses; without it '--dir-for' would be opened
            as if it were a genome file.
    """
    args = sys.argv[1:]
    if args and args[0] == '--dir-for':
        if len(args) != 2:
            print('usage: make_deps.py --dir-for GENOME.fna.gz', file=sys.stderr)
            sys.exit(2)
        sp, st = _specimen_for(args[1])
        print(f'simulated_data/{sp}/{st}')
        return

    entries = []  # (specimen, species, sim_dir, genome_path)
    species_seen = []  # first-seen order, no duplicates

    for path in sorted(args):
        sp, st = _specimen_for(path)
        specimen = f'{sp}--{st}'
        sim_dir = f'simulated_data/{sp}/{st}'
        entries.append((specimen, sp, sim_dir, path))
        if sp not in species_seen:
            species_seen.append(sp)

    specimens = [e[0] for e in entries]
    print('SPECIMENS :=', ' '.join(specimens))
    print('SPECIES :=', ' '.join(species_seen))

    for specimen, species, sim_dir, genome in entries:
        reads = f'{sim_dir}/reads_R1.fastq.gz'
        p_done = f'specimen_index_presence/{specimen}/index.done'
        p_stats = f'stats/indexing_presence/{specimen}.stats'
        c_done = f'specimen_index_count/{specimen}/index.done'
        c_stats = f'stats/indexing_count/{specimen}.stats'
        ref = f'reference_index/{specimen}.npz'
        vp = f'stats/verify_presence/{specimen}.stats'
        vc = f'stats/verify_count/{specimen}.stats'

        print()
        print(f'# {specimen}')
        print(f'{reads}: {genome}')
        print(f'{ref}: {reads}')
        print(f'{p_done} {p_stats}: {reads}')
        print(f'{c_done} {c_stats}: {reads}')
        print(f'{vp}: {ref} {p_done}')
        print(f'{vc}: {ref} {c_done}')

    print()
    for sp in species_seen:
        sp_done = f'specific_index_presence/{sp}/index.done'
        sp_stats = f'stats/specific_kmer_presence/{sp}.stats'
        sc_done = f'specific_index_count/{sp}/index.done'
        sc_stats = f'stats/specific_kmer_count/{sp}.stats'
        print(f'# {sp}')
        print(f'{sp_done} {sp_stats}: global_index_presence/index.done')
        print(f'{sc_done} {sc_stats}: global_index_count/index.done')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Executable
+103
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env bash
# Setup for the count-index merge: resolve paths, pick the next run number
# and start that run's CSV with its header line.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_count"
OUTPUT="${SCRIPT_DIR}/global_index_count"
STATS_DIR="${SCRIPT_DIR}/stats/merge_count"

mkdir -p "${STATS_DIR}"

# The run number is the count of run_*.csv files already present, zero-padded.
printf -v run_n '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')"
CSV="${STATS_DIR}/run_${run_n}.csv"

csv_header='run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b'
printf '%s\n' "${csv_header}" >"${CSV}"
|
||||
|
||||
# parse_reporter RUN N_SOURCES LOGFILE
# Prints one CSV data row on stdout: RUN, N_SOURCES, then a (wall_s, rss_b)
# pair for bootstrap, spectrums, merge_partitions, pack and the total row of
# the reporter table found in LOGFILE. Stages missing from the log give
# empty cells.
parse_reporter() {
    local run="$1" n_sources="$2" logfile="$3"
    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import sys, re

run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

def strip_ansi(s):
    """Remove ANSI CSI escape sequences."""
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)

def parse_wall(s):
    """Convert '1.5s', '250ms', '20us', '5ns' ... to seconds; 0.0 if unparseable."""
    s = s.strip()
    # Sub-second units first: each also ends in 's', and the plain-seconds
    # branch would otherwise hand e.g. '20u' to float() and crash.
    for suffix, per_second in (('ns', 1e9), ('us', 1e6), ('\u00b5s', 1e6),
                               ('\u03bcs', 1e6), ('ms', 1000.0), ('s', 1.0)):
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / per_second
            except ValueError:
                return 0.0
    return 0.0

def parse_rss(s):
    """Convert '1.5 GB' style cells to bytes; 0 if unparseable."""
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m: return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])

def is_sep(s):
    """True for a non-empty line without any ASCII letter or digit."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)

stats = {}
state = 'scan'

with open(logfile, errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()

        if state == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s):
                state = 'rows'
        elif state == 'rows':
            if is_sep(s):
                state = 'total'
            elif s:
                parts = re.split(r' +', s)
                if len(parts) >= 4:
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    # Fixed key: the lookup below asks for 'TOTAL' whatever
                    # label the reporter printed on this row.
                    stats['TOTAL'] = (parse_wall(parts[1]),
                                      parse_rss(parts[3]) if len(parts) > 3 else 0)
                break

STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
row = [run, n_sources]
for stage in STAGE_ORDER:
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
}
|
||||
|
||||
# Every first-level directory under IDX_DIR is taken as one source index.
mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)

if [[ ${#sources[@]} -eq 0 ]]; then
    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
    exit 1
fi

echo "Merging ${#sources[@]} count indexes → ${OUTPUT}"
printf ' %s\n' "${sources[@]}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# stderr is captured for parse_reporter. `|| status=$?` stops `set -e` from
# aborting before the capture is replayed; otherwise a failing merge would
# exit silently and the EXIT trap would delete the only copy of the error.
status=0
"${BINARY}" merge \
    --output "${OUTPUT}" \
    --force \
    "${sources[@]}" \
    2>"${STDERR_LOG}" || status=$?

cat "${STDERR_LOG}" >&2
if (( status != 0 )); then
    exit "${status}"
fi
parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"

echo "Done. Run ${run_n} → ${CSV}"
|
||||
Executable
+104
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env bash
# Setup for the presence-index merge: resolve paths, pick the next run number
# and start that run's CSV with its header line.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_presence"
OUTPUT="${SCRIPT_DIR}/global_index_presence"
STATS_DIR="${SCRIPT_DIR}/stats/merge_presence"

mkdir -p "${STATS_DIR}"

# The run number is the count of run_*.csv files already present, zero-padded.
printf -v run_n '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')"
CSV="${STATS_DIR}/run_${run_n}.csv"

csv_header='run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b'
printf '%s\n' "${csv_header}" >"${CSV}"
|
||||
|
||||
# parse_reporter RUN N_SOURCES LOGFILE
# Prints one CSV data row on stdout: RUN, N_SOURCES, then a (wall_s, rss_b)
# pair for bootstrap, spectrums, merge_partitions, pack and the total row of
# the reporter table found in LOGFILE. Stages missing from the log give
# empty cells.
parse_reporter() {
    local run="$1" n_sources="$2" logfile="$3"
    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import sys, re

run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

def strip_ansi(s):
    """Remove ANSI CSI escape sequences."""
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)

def parse_wall(s):
    """Convert '1.5s', '250ms', '20us', '5ns' ... to seconds; 0.0 if unparseable."""
    s = s.strip()
    # Sub-second units first: each also ends in 's', and the plain-seconds
    # branch would otherwise hand e.g. '20u' to float() and crash.
    for suffix, per_second in (('ns', 1e9), ('us', 1e6), ('\u00b5s', 1e6),
                               ('\u03bcs', 1e6), ('ms', 1000.0), ('s', 1.0)):
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / per_second
            except ValueError:
                return 0.0
    return 0.0

def parse_rss(s):
    """Convert '1.5 GB' style cells to bytes; 0 if unparseable."""
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m: return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])

def is_sep(s):
    """True for a non-empty line without any ASCII letter or digit."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)

stats = {}
state = 'scan'

with open(logfile, errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()

        if state == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s):
                state = 'rows'
        elif state == 'rows':
            if is_sep(s):
                state = 'total'
            elif s:
                parts = re.split(r' +', s)
                if len(parts) >= 4:
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    # Fixed key: the lookup below asks for 'TOTAL' whatever
                    # label the reporter printed on this row.
                    stats['TOTAL'] = (parse_wall(parts[1]),
                                      parse_rss(parts[3]) if len(parts) > 3 else 0)
                break

STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
row = [run, n_sources]
for stage in STAGE_ORDER:
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
}
|
||||
|
||||
# Every first-level directory under IDX_DIR is taken as one source index.
mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)

if [[ ${#sources[@]} -eq 0 ]]; then
    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
    exit 1
fi

echo "Merging ${#sources[@]} presence indexes → ${OUTPUT}"
printf ' %s\n' "${sources[@]}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# stderr is captured for parse_reporter. `|| status=$?` stops `set -e` from
# aborting before the capture is replayed; otherwise a failing merge would
# exit silently and the EXIT trap would delete the only copy of the error.
status=0
"${BINARY}" merge \
    --output "${OUTPUT}" \
    --force \
    --force-presence \
    "${sources[@]}" \
    2>"${STDERR_LOG}" || status=$?

cat "${STDERR_LOG}" >&2
if (( status != 0 )); then
    exit "${status}"
fi
parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"

echo "Done. Run ${run_n} → ${CSV}"
|
||||
Executable
+12
@@ -0,0 +1,12 @@
|
||||
#!/usr/bin/env bash
# Simulate all genomes. Delegates to simulate_one.sh per genome.
# Prefer running via `gmake simulate` which handles individual dependencies.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Expand the glob with nullglob so that an empty genomes/ directory gives an
# empty list rather than the literal pattern being processed as a file name.
shopt -s nullglob
genomes=("${SCRIPT_DIR}"/genomes/*.fna.gz)
shopt -u nullglob

if (( ${#genomes[@]} == 0 )); then
    echo "ERROR: no genomes found in ${SCRIPT_DIR}/genomes" >&2
    exit 1
fi

for genome_file in "${genomes[@]}"; do
    # NOTE(review): relies on make_deps.py accepting `--dir-for GENOME` and
    # printing that genome's output directory -- confirm.
    out_dir=$("${SCRIPT_DIR}/../.venv/bin/python3" "${SCRIPT_DIR}/make_deps.py" \
        --dir-for "${genome_file}")
    bash "${SCRIPT_DIR}/simulate_one.sh" "${genome_file}" "${out_dir}"
done
|
||||
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env bash
# Usage: simulate_one.sh genome.fna.gz output_dir
# Simulates paired-end HiSeq reads for a single genome.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ISS="${SCRIPT_DIR}/../.venv/bin/iss"
COVERAGE=15
READ_LENGTH=150
# CPU count: $CPUS if set, else sysctl, else nproc, else 2.
CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"

# Fail with a usage message rather than bash's bare "unbound variable".
genome_file="${1:?Usage: simulate_one.sh genome.fna.gz output_dir}"
out_dir="${2:?Usage: simulate_one.sh genome.fna.gz output_dir}"

mkdir -p "${out_dir}"

# A private temp directory holds the decompressed genome. The X's sit at the
# very end of the template because BSD/macOS mktemp does not substitute them
# when a suffix such as ".fna" follows, which would make concurrent runs
# fight over one literal file name.
tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/obikmer_XXXXXX")
trap 'rm -rf "${tmp_dir}"' EXIT
tmp_fasta="${tmp_dir}/genome.fna"

gzip -dc "${genome_file}" > "${tmp_fasta}"

# Genome size = all non-header characters with whitespace removed.
genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ')
if (( genome_size == 0 )); then
    echo "ERROR: no sequence data in ${genome_file}" >&2
    exit 1
fi
# Read pairs needed for COVERAGE: each pair contributes 2 * READ_LENGTH bases.
n_reads=$(python3 -c "import math; print(math.ceil(${COVERAGE} * ${genome_size} / (2 * ${READ_LENGTH})))")

echo "[${out_dir}] genome=${genome_size} bp → ${n_reads} read pairs (${COVERAGE}x HiSeq)"

"${ISS}" generate \
    --genomes "${tmp_fasta}" \
    --model HiSeq \
    --n_reads "${n_reads}" \
    --cpus "${CPUS}" \
    --compress \
    --output "${out_dir}/reads"
|
||||
Executable
+181
@@ -0,0 +1,181 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare an obikmer count index against a reference kmer set (presence + counts).
|
||||
|
||||
Loads the reference .npz (sorted uint64 kmers + uint32 counts from build_reference.py),
|
||||
streams `obikmer dump` from a --with-counts index, then reports:
|
||||
- false negatives : kmers in reference absent from the index
|
||||
- false positives : kmers in the index absent from the reference
|
||||
- count mismatches: kmers present in both but with differing counts
|
||||
|
||||
Output to stdout: one CSV row
|
||||
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
|
||||
fn_pct,fp_pct,cm_pct
|
||||
"""
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||
|
||||
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
|
||||
'a': 0, 'c': 1, 'g': 2, 't': 3}
|
||||
|
||||
_DECODE = ['A', 'C', 'G', 'T']
|
||||
|
||||
|
||||
def encode_kmer(s: str) -> int:
    """Pack `s` into an int, two bits per base via _ENCODE, first base most significant."""
    value = 0
    for base in s:
        # Codes are 0..3, so multiplying by 4 and adding equals shift-and-or.
        value = value * 4 + _ENCODE[base]
    return value
|
||||
|
||||
|
||||
def decode_kmer(val: int, k: int) -> str:
    """Unpack the low 2*k bits of `val` into a k-letter string (inverse of encode_kmer)."""
    letters = [''] * k
    # The lowest two bits hold the last base, so fill from the right.
    for slot in range(k - 1, -1, -1):
        letters[slot] = _DECODE[val & 3]
        val >>= 2
    return ''.join(letters)
|
||||
|
||||
|
||||
# ── dump parsing ──────────────────────────────────────────────────────────────
|
||||
|
||||
def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]:
    """Stream `obikmer dump` and return (kmers_sorted_uint64, counts_uint32).

    The first output line is treated as a header and skipped; every other
    non-blank line must start with 'KMER,COUNT'. Both arrays are ordered by
    ascending kmer value. Exits the process with status 1 if the dump
    command returns a non-zero status.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    kmers, counts = [], []
    # The context manager closes the pipe and waits for the child on exit.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        next(proc.stdout, None)  # header line
        for line in proc.stdout:
            row = line.rstrip('\n')
            if not row:
                # Tolerate blank lines instead of failing on parts[1].
                continue
            parts = row.split(',')
            kmers.append(encode_kmer(parts[0]))
            counts.append(int(parts[1]))
    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)
    # Convert each list once and reuse the array for sorting and indexing.
    kmer_arr = np.array(kmers, dtype=np.uint64)
    count_arr = np.array(counts, dtype=np.uint32)
    order = np.argsort(kmer_arr, kind='stable')
    return kmer_arr[order], count_arr[order]
|
||||
|
||||
|
||||
# ── comparison ────────────────────────────────────────────────────────────────
|
||||
|
||||
def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray,
            idx_kmers: np.ndarray, idx_counts: np.ndarray,
            ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts).

    false_neg: kmers of the reference missing from the index.
    false_pos: kmers of the index missing from the reference.
    cm_*     : kmers present in both whose counts differ, with both counts.

    Both kmer arrays must be sorted and free of duplicates. Either side may
    be empty.
    """
    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)

    # Locate each reference kmer in the sorted index. A position equal to
    # len(idx_kmers) means "greater than every index kmer"; those positions
    # are masked out rather than clipped, so an empty index is never indexed.
    pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
    in_range = pos_in_idx < len(idx_kmers)
    shared_mask = np.zeros(len(ref_kmers), dtype=bool)
    shared_mask[in_range] = idx_kmers[pos_in_idx[in_range]] == ref_kmers[in_range]

    shared_ref_counts = ref_counts[shared_mask]
    shared_idx_counts = idx_counts[pos_in_idx[shared_mask]]
    mismatch_mask = shared_ref_counts != shared_idx_counts

    cm_kmers = ref_kmers[shared_mask][mismatch_mask]
    cm_ref_counts = shared_ref_counts[mismatch_mask]
    cm_idx_counts = shared_idx_counts[mismatch_mask]

    return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts
|
||||
|
||||
|
||||
# ── main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Command-line entry point.

    With --header, print the CSV header and return. Otherwise REF_NPZ and
    INDEX_DIR are both required: detect k from the first dumped kmer, load
    the reference and the index, compare them, print a summary on stderr,
    optionally save the differing kmers, and print one CSV row on stdout.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reference', metavar='REF_NPZ', nargs='?',
                    help='Reference .npz file')
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='obikmer index directory (built with --with-counts)')
    ap.add_argument('--obikmer', default='obikmer',
                    help='Path to obikmer binary')
    ap.add_argument('--species', default='')
    ap.add_argument('--strain', default='')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fp', metavar='FILE',
                    help='Save false-positive kmer strings to FILE')
    ap.add_argument('--save-fn', metavar='FILE',
                    help='Save false-negative kmer strings to FILE')
    ap.add_argument('--save-cm', metavar='FILE',
                    help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return

    # The positionals are optional only so that --header works on its own.
    if args.reference is None or args.index is None:
        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')

    # Detect k from the length of the first dumped kmer (line 0 is the header).
    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        print(f'ERROR: no kmer in {args.index}; cannot detect k', file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])

    # Load reference
    print(f'Loading reference: {args.reference}', file=sys.stderr)
    npz = np.load(args.reference)
    ref_kmers = npz['kmers']    # sorted uint64
    ref_counts = npz['counts']  # uint32

    # Load index
    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
    idx_kmers, idx_counts = load_index(args.obikmer, args.index)

    print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)

    false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare(
        ref_kmers, ref_counts, idx_kmers, idx_counts)

    # Percentages fall back to 0.0 when their denominator is zero.
    n_shared = len(ref_kmers) - len(false_neg)
    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
    cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0

    print(f'false negatives : {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
    print(f'false positives : {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)
    print(f'count mismatches: {len(cm_kmers):,} ({cm_pct:.4f}% of shared)',
          file=sys.stderr)

    # Each --save-* file is written only when there is something to report.
    if args.save_fn and len(false_neg):
        with open(args.save_fn, 'w') as fh:
            for v in false_neg:
                fh.write(decode_kmer(int(v), k) + '\n')

    if args.save_fp and len(false_pos):
        with open(args.save_fp, 'w') as fh:
            for v in false_pos:
                fh.write(decode_kmer(int(v), k) + '\n')

    if args.save_cm and len(cm_kmers):
        with open(args.save_cm, 'w') as fh:
            fh.write('kmer,ref_count,idx_count\n')
            for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
                fh.write(f'{decode_kmer(int(v), k)},{rc},{ic}\n')

    print(f'{args.species},{args.strain},'
          f'{len(ref_kmers)},{len(idx_kmers)},'
          f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
          f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Executable
+201
@@ -0,0 +1,201 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Verify the merged count index against all per-specimen reference sets.
|
||||
|
||||
Streams `obikmer dump` once on the merged index, accumulates per-specimen
|
||||
kmer+count pairs from each column, then compares each against its reference .npz.
|
||||
|
||||
Output to stdout: one CSV row per specimen (same columns as verify_count.py)
|
||||
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
|
||||
fn_pct,fp_pct,cm_pct
|
||||
"""
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||
|
||||
# 2-bit nucleotide codes; a lower-case base shares the code of its upper-case form.
_ENCODE = {
    'A': 0, 'C': 1, 'G': 2, 'T': 3,
    'a': 0, 'c': 1, 'g': 2, 't': 3,
}

# Inverse table: 2-bit code → upper-case base.
_DECODE = ['A', 'C', 'G', 'T']


def encode_kmer(s: str) -> int:
    """Pack a nucleotide string into an integer, two bits per base.

    The first character lands in the most significant bits.  A character
    that is not a key of ``_ENCODE`` raises ``KeyError``.
    """
    packed = 0
    for base in s:
        # Codes are < 4, so "times four plus code" equals "shift by two, OR".
        packed = packed * 4 + _ENCODE[base]
    return packed


def decode_kmer(val: int, k: int) -> str:
    """Unpack the ``k`` lowest 2-bit groups of ``val`` into an upper-case string.

    Groups are emitted from the most significant to the least significant,
    i.e. the inverse of ``encode_kmer`` for a kmer of length ``k``.
    """
    return ''.join(_DECODE[(val >> (2 * pos)) & 3]
                   for pos in range(k - 1, -1, -1))
|
||||
|
||||
|
||||
# ── single-pass dump ──────────────────────────────────────────────────────────
|
||||
|
||||
def stream_merged_dump(obikmer_bin: str, index_dir: str,
                       ) -> tuple[list[str], dict[str, tuple[list[int], list[int]]]]:
    """Stream `obikmer dump` once over the merged index and split it per specimen.

    The dump is read as comma-separated text: a header row whose first column
    is the kmer column and whose remaining columns are specimen labels, then
    one row per kmer holding an integer count for every specimen column.

    Args:
        obikmer_bin: path or name of the obikmer executable.
        index_dir: merged index directory handed to `obikmer dump`.

    Returns:
        specimen_names : column labels in dump order
        per_specimen   : mapping label → (kmer_ints, counts) for entries > 0

    Exits the interpreter with status 1 when the dump command returns a
    non-zero exit code.
    """
    cmd = [obikmer_bin, 'dump', index_dir]

    # The context manager closes the pipe and reaps the child even when a row
    # fails to parse, so neither the process nor its descriptor is leaked.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        header_line = proc.stdout.readline().rstrip('\n')
        specimen_names = header_line.split(',')[1:]
        per_specimen: dict[str, tuple[list[int], list[int]]] = {
            name: ([], []) for name in specimen_names}
        # Resolve the per-column accumulators once instead of once per cell.
        columns = [per_specimen[name] for name in specimen_names]

        for line in proc.stdout:
            row = line.rstrip('\n')
            if not row:
                # A blank line carries no kmer; skip it instead of failing
                # with IndexError on the missing count fields.
                continue
            parts = row.split(',')
            kmer_int = encode_kmer(parts[0])
            for i, (kmers, counts) in enumerate(columns, start=1):
                count = int(parts[i])
                if count > 0:
                    kmers.append(kmer_int)
                    counts.append(count)

    # Popen.__exit__ has waited for the child, so returncode is set here.
    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)

    return specimen_names, per_specimen
|
||||
|
||||
|
||||
# ── per-specimen comparison ───────────────────────────────────────────────────
|
||||
|
||||
def compare_specimen(name: str,
|
||||
kmer_list: list[int],
|
||||
count_list: list[int],
|
||||
ref_dir: Path,
|
||||
k: int,
|
||||
save_fn: Path | None,
|
||||
save_fp: Path | None,
|
||||
save_cm: Path | None,
|
||||
) -> str:
|
||||
ref_path = ref_dir / f'{name}.npz'
|
||||
if not ref_path.exists():
|
||||
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
|
||||
return ''
|
||||
|
||||
species = name.split('--')[0]
|
||||
strain = name[len(species) + 2:]
|
||||
|
||||
npz = np.load(ref_path)
|
||||
ref_kmers = npz['kmers'] # sorted uint64
|
||||
ref_counts = npz['counts'] # uint32
|
||||
|
||||
order = np.argsort(np.array(kmer_list, dtype=np.uint64), kind='stable')
|
||||
idx_kmers = np.array(kmer_list, dtype=np.uint64)[order]
|
||||
idx_counts = np.array(count_list, dtype=np.uint32)[order]
|
||||
|
||||
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
|
||||
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
|
||||
|
||||
# Count mismatches among shared kmers
|
||||
pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
|
||||
pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
|
||||
shared_mask = idx_kmers[pos_in_idx] == ref_kmers
|
||||
mismatch_mask = ref_counts[shared_mask] != idx_counts[pos_in_idx[shared_mask]]
|
||||
cm_kmers = ref_kmers[shared_mask][mismatch_mask]
|
||||
cm_ref = ref_counts[shared_mask][mismatch_mask]
|
||||
cm_idx = idx_counts[pos_in_idx[shared_mask]][mismatch_mask]
|
||||
|
||||
n_shared = int(shared_mask.sum())
|
||||
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
|
||||
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
|
||||
cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0
|
||||
|
||||
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
|
||||
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
|
||||
f'fp={len(false_pos):,} ({fp_pct:.4f}%) '
|
||||
f'cm={len(cm_kmers):,} ({cm_pct:.4f}%)',
|
||||
file=sys.stderr)
|
||||
|
||||
if save_fn and len(false_neg):
|
||||
fn_file = save_fn / f'{name}_fn.txt'
|
||||
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
|
||||
|
||||
if save_fp and len(false_pos):
|
||||
fp_file = save_fp / f'{name}_fp.txt'
|
||||
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
|
||||
|
||||
if save_cm and len(cm_kmers):
|
||||
cm_file = save_cm / f'{name}_cm.csv'
|
||||
lines = ['kmer,ref_count,idx_count']
|
||||
for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
|
||||
lines.append(f'{decode_kmer(int(v), k)},{rc},{ic}')
|
||||
cm_file.write_text('\n'.join(lines) + '\n')
|
||||
|
||||
return (f'{species},{strain},'
|
||||
f'{len(ref_kmers)},{len(idx_kmers)},'
|
||||
f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
|
||||
f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
|
||||
|
||||
|
||||
# ── main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the merged count index verification.

    With ``--header`` only the CSV header is printed.  Otherwise the merged
    index is dumped once, every specimen column is compared against its
    reference, and one CSV row per specimen that has a reference is printed
    to stdout (progress goes to stderr).
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='Merged count index directory')
    ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
                    help='Directory containing per-specimen .npz reference files')
    ap.add_argument('--obikmer', default='obikmer')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fn', metavar='DIR',
                    help='Directory for false-negative kmer lists')
    ap.add_argument('--save-fp', metavar='DIR',
                    help='Directory for false-positive kmer lists')
    ap.add_argument('--save-cm', metavar='DIR',
                    help='Directory for count-mismatch CSV files')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return

    # The positionals are optional only so that --header can be used alone;
    # without it both are required, otherwise Path(None) fails with TypeError.
    if args.index is None or args.ref_dir is None:
        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')

    ref_dir = Path(args.ref_dir)
    save_fn = Path(args.save_fn) if args.save_fn else None
    save_fp = Path(args.save_fp) if args.save_fp else None
    save_cm = Path(args.save_cm) if args.save_cm else None
    for d in (save_fn, save_fp, save_cm):
        if d:
            d.mkdir(parents=True, exist_ok=True)

    # Detect k from the first data row of a one-row dump.
    out1 = subprocess.check_output(
        [args.obikmer, 'dump', '--head', '1', args.index],
        stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        # Header only (or nothing at all): there is no kmer to measure.
        print(f'ERROR: no kmer rows in dump of {args.index}; cannot detect k',
              file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])

    print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)

    for name in specimen_names:
        kmers, counts = per_specimen[name]
        row = compare_specimen(name, kmers, counts, ref_dir, k,
                               save_fn, save_fp, save_cm)
        if row:
            print(row)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Executable
+27
@@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env bash
# Verify the merged count index against the per-specimen references.
# Writes stats/verify_merge_count/current.csv (header + one row per specimen)
# and copies it to a numbered archive count_NNN.csv in the same directory.
set -euo pipefail

here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
obikmer="${here}/../src/target/release/obikmer"
index="${here}/global_index_count"
refs="${here}/reference_index"
stats="${here}/stats/verify_merge_count"
python="${here}/../.venv/bin/python3"
verifier="${here}/verify_merge_count.py"

mkdir -p "${stats}"

current="${stats}/current.csv"

# Header first, then the data rows, into a freshly truncated file.
{
    "${python}" "${verifier}" --header
    "${python}" "${verifier}" \
        --obikmer "${obikmer}" \
        "${index}" "${refs}"
} >"${current}"

# The archive number is the count of archives already present.
existing=$(find "${stats}" -maxdepth 1 -name 'count_*.csv' | wc -l | tr -d ' ')
printf -v run_n '%03d' "${existing}"
archive="${stats}/count_${run_n}.csv"
cp "${current}" "${archive}"

echo "Done. Results → ${archive}"
|
||||
Executable
+170
@@ -0,0 +1,170 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Verify the merged presence index against all per-specimen reference sets.
|
||||
|
||||
Streams `obikmer dump` once on the merged index, accumulates per-specimen
|
||||
kmer sets from each column, then compares each against its reference .npz.
|
||||
|
||||
Output to stdout: one CSV row per specimen (same columns as verify_presence.py)
|
||||
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct
|
||||
"""
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||
|
||||
# 2-bit nucleotide codes; a lower-case base shares the code of its upper-case form.
_ENCODE = {
    'A': 0, 'C': 1, 'G': 2, 'T': 3,
    'a': 0, 'c': 1, 'g': 2, 't': 3,
}

# Inverse table: 2-bit code → upper-case base.
_DECODE = ['A', 'C', 'G', 'T']


def encode_kmer(s: str) -> int:
    """Pack a nucleotide string into an integer, two bits per base.

    The first character lands in the most significant bits.  A character
    that is not a key of ``_ENCODE`` raises ``KeyError``.
    """
    packed = 0
    for base in s:
        # Codes are < 4, so "times four plus code" equals "shift by two, OR".
        packed = packed * 4 + _ENCODE[base]
    return packed


def decode_kmer(val: int, k: int) -> str:
    """Unpack the ``k`` lowest 2-bit groups of ``val`` into an upper-case string.

    Groups are emitted from the most significant to the least significant,
    i.e. the inverse of ``encode_kmer`` for a kmer of length ``k``.
    """
    return ''.join(_DECODE[(val >> (2 * pos)) & 3]
                   for pos in range(k - 1, -1, -1))
|
||||
|
||||
|
||||
# ── single-pass dump ──────────────────────────────────────────────────────────
|
||||
|
||||
def stream_merged_dump(obikmer_bin: str, index_dir: str,
                       ) -> tuple[list[str], dict[str, list[int]]]:
    """Stream `obikmer dump` once over the merged index and split it per specimen.

    The dump is read as comma-separated text: a header row whose first column
    is the kmer column and whose remaining columns are specimen labels, then
    one row per kmer holding an integer value for every specimen column.

    Args:
        obikmer_bin: path or name of the obikmer executable.
        index_dir: merged index directory handed to `obikmer dump`.

    Returns:
        specimen_names : column labels in dump order (excluding 'kmer')
        per_specimen   : mapping label → list of kmer ints where presence > 0

    Exits the interpreter with status 1 when the dump command returns a
    non-zero exit code.
    """
    cmd = [obikmer_bin, 'dump', index_dir]

    # The context manager closes the pipe and reaps the child even when a row
    # fails to parse, so neither the process nor its descriptor is leaked.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        header_line = proc.stdout.readline().rstrip('\n')
        specimen_names = header_line.split(',')[1:]   # first col is 'kmer'
        per_specimen: dict[str, list[int]] = {name: [] for name in specimen_names}
        # Resolve the per-column accumulators once instead of once per cell.
        columns = [per_specimen[name] for name in specimen_names]

        for line in proc.stdout:
            row = line.rstrip('\n')
            if not row:
                # A blank line carries no kmer; skip it instead of failing
                # with IndexError on the missing presence fields.
                continue
            parts = row.split(',')
            kmer_int = encode_kmer(parts[0])
            for i, present_kmers in enumerate(columns, start=1):
                if int(parts[i]) > 0:
                    present_kmers.append(kmer_int)

    # Popen.__exit__ has waited for the child, so returncode is set here.
    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)

    return specimen_names, per_specimen
|
||||
|
||||
|
||||
# ── per-specimen comparison ───────────────────────────────────────────────────
|
||||
|
||||
def compare_specimen(name: str,
|
||||
kmer_list: list[int],
|
||||
ref_dir: Path,
|
||||
k: int,
|
||||
save_fn: Path | None,
|
||||
save_fp: Path | None,
|
||||
) -> str:
|
||||
"""Compare one specimen column against its reference .npz.
|
||||
|
||||
Returns a CSV row string.
|
||||
"""
|
||||
ref_path = ref_dir / f'{name}.npz'
|
||||
if not ref_path.exists():
|
||||
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
|
||||
return ''
|
||||
|
||||
species = name.split('--')[0]
|
||||
strain = name[len(species) + 2:]
|
||||
|
||||
ref_kmers = np.load(ref_path)['kmers'] # sorted uint64
|
||||
idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64)
|
||||
|
||||
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
|
||||
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
|
||||
|
||||
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
|
||||
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
|
||||
|
||||
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
|
||||
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
|
||||
f'fp={len(false_pos):,} ({fp_pct:.4f}%)',
|
||||
file=sys.stderr)
|
||||
|
||||
if save_fn and len(false_neg):
|
||||
fn_file = save_fn / f'{name}_fn.txt'
|
||||
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
|
||||
|
||||
if save_fp and len(false_pos):
|
||||
fp_file = save_fp / f'{name}_fp.txt'
|
||||
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
|
||||
|
||||
return (f'{species},{strain},'
|
||||
f'{len(ref_kmers)},{len(idx_kmers)},'
|
||||
f'{len(false_neg)},{len(false_pos)},'
|
||||
f'{fn_pct:.4f},{fp_pct:.4f}')
|
||||
|
||||
|
||||
# ── main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the merged presence index verification.

    With ``--header`` only the CSV header is printed.  Otherwise the merged
    index is dumped once, every specimen column is compared against its
    reference, and one CSV row per specimen that has a reference is printed
    to stdout (progress goes to stderr).
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='Merged presence index directory')
    ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
                    help='Directory containing per-specimen .npz reference files')
    ap.add_argument('--obikmer', default='obikmer')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fn', metavar='DIR',
                    help='Directory to save false-negative kmer lists')
    ap.add_argument('--save-fp', metavar='DIR',
                    help='Directory to save false-positive kmer lists')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,fn_pct,fp_pct')
        return

    # The positionals are optional only so that --header can be used alone;
    # without it both are required, otherwise Path(None) fails with TypeError.
    if args.index is None or args.ref_dir is None:
        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')

    ref_dir = Path(args.ref_dir)
    save_fn = Path(args.save_fn) if args.save_fn else None
    save_fp = Path(args.save_fp) if args.save_fp else None
    if save_fn:
        save_fn.mkdir(parents=True, exist_ok=True)
    if save_fp:
        save_fp.mkdir(parents=True, exist_ok=True)

    # Detect k
    out1 = subprocess.check_output(
        [args.obikmer, 'dump', '--head', '1', args.index],
        stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        # Header only (or nothing at all): there is no kmer to measure.
        print(f'ERROR: no kmer rows in dump of {args.index}; cannot detect k',
              file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])

    print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)

    for name in specimen_names:
        row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp)
        if row:
            print(row)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Executable
+27
@@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env bash
# Verify the merged presence index against the per-specimen references.
# Writes stats/verify_merge_presence/current.csv (header + one row per
# specimen) and copies it to a numbered archive presence_NNN.csv alongside.
set -euo pipefail

here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
obikmer="${here}/../src/target/release/obikmer"
index="${here}/global_index_presence"
refs="${here}/reference_index"
stats="${here}/stats/verify_merge_presence"
python="${here}/../.venv/bin/python3"
verifier="${here}/verify_merge_presence.py"

mkdir -p "${stats}"

current="${stats}/current.csv"

# Header first, then the data rows, into a freshly truncated file.
{
    "${python}" "${verifier}" --header
    "${python}" "${verifier}" \
        --obikmer "${obikmer}" \
        "${index}" "${refs}"
} >"${current}"

# The archive number is the count of archives already present.
existing=$(find "${stats}" -maxdepth 1 -name 'presence_*.csv' | wc -l | tr -d ' ')
printf -v run_n '%03d' "${existing}"
archive="${stats}/presence_${run_n}.csv"
cp "${current}" "${archive}"

echo "Done. Results → ${archive}"
|
||||
Executable
+30
@@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env bash
# Usage: verify_one_count.sh SPECIMEN
#   SPECIMEN = "species--strain"  (Make pattern stem)
# Output: stats/verify_count/SPECIMEN.stats  (one CSV data row, no header)
#
# The row is written to a temporary file and renamed into place only after
# the verifier succeeds, so a failed run never leaves an empty or partial
# .stats file that a Make rule could mistake for an up-to-date target.
set -euo pipefail

# Fail with the usage text instead of a bare "unbound variable" error.
SPECIMEN="${1:?Usage: verify_one_count.sh SPECIMEN}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_count.py"

# Split the stem at its first "--".
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
INDEX_DIR="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/verify_count"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
TMP_FILE="${STATS_FILE}.tmp.$$"

mkdir -p "${STATS_DIR}"

# Remove the temporary file on any exit; after the final mv this is a no-op.
trap 'rm -f "${TMP_FILE}"' EXIT

echo "[${SPECIMEN}] verifying count"

"${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    --species "${species}" \
    --strain  "${strain}" \
    "${REF_NPZ}" "${INDEX_DIR}" \
    >"${TMP_FILE}"

mv "${TMP_FILE}" "${STATS_FILE}"
|
||||
Executable
+30
@@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env bash
# Usage: verify_one_presence.sh SPECIMEN
#   SPECIMEN = "species--strain"  (Make pattern stem)
# Output: stats/verify_presence/SPECIMEN.stats  (one CSV data row, no header)
#
# The row is written to a temporary file and renamed into place only after
# the verifier succeeds, so a failed run never leaves an empty or partial
# .stats file that a Make rule could mistake for an up-to-date target.
set -euo pipefail

# Fail with the usage text instead of a bare "unbound variable" error.
SPECIMEN="${1:?Usage: verify_one_presence.sh SPECIMEN}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_presence.py"

# Split the stem at its first "--".
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
INDEX_DIR="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/verify_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
TMP_FILE="${STATS_FILE}.tmp.$$"

mkdir -p "${STATS_DIR}"

# Remove the temporary file on any exit; after the final mv this is a no-op.
trap 'rm -f "${TMP_FILE}"' EXIT

echo "[${SPECIMEN}] verifying presence"

"${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    --species "${species}" \
    --strain  "${strain}" \
    "${REF_NPZ}" "${INDEX_DIR}" \
    >"${TMP_FILE}"

mv "${TMP_FILE}" "${STATS_FILE}"
|
||||
Executable
+139
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare an obikmer index against a reference kmer set (presence/absence).
|
||||
|
||||
Loads the reference .npz (sorted uint64 kmers built by build_reference.py),
|
||||
streams the output of `obikmer dump`, encodes each kmer string to uint64,
|
||||
then reports false negatives and false positives using numpy set operations.
|
||||
|
||||
Output to stdout: one CSV row
|
||||
species, strain, ref_kmers, idx_kmers, false_neg, false_pos, fn_pct, fp_pct
|
||||
"""
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||
|
||||
# 2-bit nucleotide codes; a lower-case base shares the code of its upper-case form.
_ENCODE = {
    'A': 0, 'C': 1, 'G': 2, 'T': 3,
    'a': 0, 'c': 1, 'g': 2, 't': 3,
}

# Inverse table: 2-bit code → upper-case base.
_DECODE = ['A', 'C', 'G', 'T']


def encode_kmer(s: str) -> int:
    """Pack a nucleotide string into an integer, two bits per base.

    The first character lands in the most significant bits.  A character
    that is not a key of ``_ENCODE`` raises ``KeyError``.
    """
    packed = 0
    for base in s:
        # Codes are < 4, so "times four plus code" equals "shift by two, OR".
        packed = packed * 4 + _ENCODE[base]
    return packed


def decode_kmer(val: int, k: int) -> str:
    """Unpack the ``k`` lowest 2-bit groups of ``val`` into an upper-case string.

    Groups are emitted from the most significant to the least significant,
    i.e. the inverse of ``encode_kmer`` for a kmer of length ``k``.
    """
    return ''.join(_DECODE[(val >> (2 * pos)) & 3]
                   for pos in range(k - 1, -1, -1))
|
||||
|
||||
|
||||
# ── dump parsing ──────────────────────────────────────────────────────────────
|
||||
|
||||
def load_index_kmers(obikmer_bin: str, index_dir: str) -> np.ndarray:
    """Stream `obikmer dump` and return a sorted uint64 array of kmer integers.

    The first output line is treated as a header and discarded; on every
    following line the text before the first comma is encoded as the kmer.

    Args:
        obikmer_bin: path or name of the obikmer executable.
        index_dir: index directory handed to `obikmer dump`.

    Exits the interpreter with status 1 when the dump command returns a
    non-zero exit code.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    kmers = []

    # The context manager closes the pipe and reaps the child even when a row
    # fails to encode, so neither the process nor its descriptor is leaked.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        proc.stdout.readline()   # header row
        for line in proc.stdout:
            # Strip the newline before splitting: on a row without a comma the
            # first field would otherwise end in '\n' and fail to encode.
            row = line.rstrip('\n')
            if not row:
                continue
            kmers.append(encode_kmer(row.split(',', 1)[0]))

    # Popen.__exit__ has waited for the child, so returncode is set here.
    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)

    arr = np.array(kmers, dtype=np.uint64)
    arr.sort()
    return arr
|
||||
|
||||
|
||||
# ── comparison ────────────────────────────────────────────────────────────────
|
||||
|
||||
def compare(ref: np.ndarray, idx: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Return ``(false_negatives, false_positives)`` for two kmer arrays.

    False negatives are the elements of ``ref`` absent from ``idx``; false
    positives are the elements of ``idx`` absent from ``ref``.  Both inputs
    are passed to numpy as duplicate-free (``assume_unique=True``).
    """
    missing_from_index, absent_from_reference = (
        np.setdiff1d(left, right, assume_unique=True)
        for left, right in ((ref, idx), (idx, ref))
    )
    return missing_from_index, absent_from_reference
|
||||
|
||||
|
||||
# ── main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Command-line entry point: compare one index with one reference kmer set.

    With ``--header`` only the CSV header is printed.  Otherwise the reference
    ``.npz`` is loaded, the index is dumped and encoded, false negatives and
    false positives are computed, optionally saved as decoded kmer lists, and
    a single CSV row is printed to stdout (progress goes to stderr).
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reference', metavar='REF_NPZ', nargs='?', help='Reference .npz file')
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?', help='obikmer index directory')
    ap.add_argument('--obikmer', default='obikmer', help='Path to obikmer binary')
    ap.add_argument('--species', default='', help='Species label for CSV row')
    ap.add_argument('--strain', default='', help='Strain label for CSV row')
    ap.add_argument('--header', action='store_true', help='Print CSV header and exit')
    ap.add_argument('--save-fp', metavar='FILE',
                    help='Save false-positive kmer strings to FILE')
    ap.add_argument('--save-fn', metavar='FILE',
                    help='Save false-negative kmer strings to FILE')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,fn_pct,fp_pct')
        return

    # The positionals are optional only so that --header can be used alone;
    # without it both are required, otherwise None reaches subprocess/np.load.
    if args.reference is None or args.index is None:
        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')

    # Detect k from the index (one cheap call before the full dump).
    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        # Header only (or nothing at all): there is no kmer to measure.
        print(f'ERROR: no kmer rows in dump of {args.index}; cannot detect k',
              file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])

    # Load reference
    print(f'Loading reference: {args.reference}', file=sys.stderr)
    # Indexing an NpzFile reads the array into memory, so the archive can be
    # closed right after the fetch.
    with np.load(args.reference) as npz:
        ref_kmers = npz['kmers']   # already sorted uint64 per the reference builder

    # Load index
    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
    idx_kmers = load_index_kmers(args.obikmer, args.index)

    print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)

    false_neg, false_pos = compare(ref_kmers, idx_kmers)

    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0

    print(f'false negatives: {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
    print(f'false positives: {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)

    if args.save_fn and len(false_neg):
        with open(args.save_fn, 'w') as fh:
            for v in false_neg:
                fh.write(decode_kmer(int(v), k) + '\n')
        print(f'False negatives saved → {args.save_fn}', file=sys.stderr)

    if args.save_fp and len(false_pos):
        with open(args.save_fp, 'w') as fh:
            for v in false_pos:
                fh.write(decode_kmer(int(v), k) + '\n')
        print(f'False positives saved → {args.save_fp}', file=sys.stderr)

    print(f'{args.species},{args.strain},'
          f'{len(ref_kmers)},{len(idx_kmers)},'
          f'{len(false_neg)},{len(false_pos)},'
          f'{fn_pct:.4f},{fp_pct:.4f}')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,105 @@
|
||||
# Rebuild / filter — column-first design
|
||||
|
||||
## Problem with the current two-pass design
|
||||
|
||||
`rebuild_partition` currently makes **two full passes** over source data:
|
||||
|
||||
**Pass 1** — read unitigs → MPHF lookup (source) → read row (108 values) → apply filter → push kmer into `GraphDeBruijn`, **discard row**.
|
||||
|
||||
**Pass 2** — read unitigs again → MPHF lookup again → read row again → for each passing kmer, look up slot in new MPHF → fill column builders.
|
||||
|
||||
Both passes do random access into the source matrix: for each kmer, the MPHF returns a slot, then we read 108 values scattered across 108 column positions. This is cache-hostile even with a packed matrix (`.pbmx`), because the matrix is column-major: consecutive row reads jump across the file.
|
||||
|
||||
## Memory budget
|
||||
|
||||
The `keep` bitvector costs **1 bit per slot**. With 256 partitions and realistic kmer counts, each partition holds at most a few tens of millions of slots → a few MB per bitvector. Even in the absolute worst case (800 M slots), it is about 100 MB (800 M bits ÷ 8). This is negligible.
|
||||
|
||||
The `slot_map` option (Option B, 8–16 bytes per slot) is heavier but still bounded: at 15 M slots and 8 bytes, that is 120 MB per partition, acceptable for a single worker.
|
||||
|
||||
## Key observation
|
||||
|
||||
**The filter operates on column values, not on kmers.** A filter like `--max-outgroup-count 0` only needs to know, for each slot, whether any outgroup column is non-zero. It does not need to know which kmer occupies that slot.
|
||||
|
||||
This means filtering can be done as a **sequential column scan** that produces a `keep: BitVec[n_slots]` — no MPHF lookups, no kmer knowledge, perfectly cache-friendly.
|
||||
|
||||
## Proposed single-scan design
|
||||
|
||||
### Step 1 — column scan → `keep` bitvector
|
||||
|
||||
```
|
||||
for each column c in source matrix:
|
||||
read column c sequentially (one mmap range)
|
||||
update keep[slot] according to filter contribution of column c
|
||||
```
|
||||
|
||||
For `GroupQuorumFilter` with ingroup/outgroup:
|
||||
- ingroup columns: count presence per slot → `ingroup_count[slot]`
|
||||
- outgroup columns: `keep[slot] &= (value[slot] == 0)` (early-exit possible)
|
||||
|
||||
Result: `keep: BitVec` of size `n_slots`, computed with purely sequential IO.
|
||||
|
||||
### Step 2 — unitig scan → kept kmers + new MPHF
|
||||
|
||||
```
|
||||
for each kmer in unitig files:
|
||||
old_slot = old_MPHF(kmer)
|
||||
if keep[old_slot]:
|
||||
push kmer into new GraphDeBruijn
|
||||
record (old_slot, kmer) ← or just old_slot in order
|
||||
```
|
||||
|
||||
Build new MPHF from `GraphDeBruijn` via `materialize_layer`.
|
||||
|
||||
### Step 3 — fill new matrix
|
||||
|
||||
Two sub-options:
|
||||
|
||||
**Option A — from recorded (old_slot, kmer) pairs:**
|
||||
|
||||
```
|
||||
for each (old_slot, kmer) in recorded list:
|
||||
new_slot = new_MPHF(kmer)
|
||||
for each column c:
|
||||
new_matrix[new_slot, c] = old_matrix[old_slot, c]
|
||||
```
|
||||
|
||||
Memory cost: `n_kept × (8 + 8)` bytes for `(old_slot: usize, kmer: CanonicalKmer)`.
|
||||
For species-specific filters, `n_kept` is small. For unfiltered rebuild, `n_kept = n_slots`.
|
||||
|
||||
**Option B — column-by-column copy using old→new slot mapping:**
|
||||
|
||||
Precompute `slot_map: Vec<Option<usize>>` of size `n_slots`:
|
||||
- For each kmer in unitig file: `slot_map[old_MPHF(kmer)] = Some(new_MPHF(kmer))`
|
||||
|
||||
Then for each source column:
|
||||
```
|
||||
read source column sequentially
|
||||
for each slot where slot_map[slot] = Some(new_slot):
|
||||
write value to new column at new_slot
|
||||
```
|
||||
|
||||
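Memory cost: `n_slots × size_of::<Option<usize>>()` for the slot map — 16 bytes per source slot on 64-bit targets with the `Vec<Option<usize>>` declared above, or 8 bytes (one `usize`) if a sentinel such as `usize::MAX` replaces the `Option`. This is the 8–16 bytes per slot quoted in the memory budget.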
|
||||
IO pattern: sequential read of each source column → random write into new column builders.
|
||||
|
||||
Option B avoids storing kmer values and works uniformly regardless of filter selectivity.
|
||||
|
||||
## Comparison
|
||||
|
||||
| | Current | Proposed |
|
||||
|---|---|---|
|
||||
| Disk reads | 2× unitigs + 2× random matrix | 1× columns (sequential) + 1× unitigs |
|
||||
| MPHF lookups (source) | 2× N_kmers | 1× N_kmers (step 2 unitig scan); 0 during the column scans (step 1 and the option B copy) |
|
||||
| Cache behavior | poor (random row access) | good (sequential column scan) |
|
||||
| Extra memory | none | slot_map (option B) or (old_slot, kmer) list (option A) |
|
||||
|
||||
## Files to modify
|
||||
|
||||
- `src/obikpartitionner/src/rebuild_layer.rs` — `rebuild_partition` and `iter_src_layers`
|
||||
- Possibly `src/obicompactvec/` — add column iterator API if not already present
|
||||
- `src/obilayeredmap/` — check if per-column sequential access is exposed on `SrcLayerData`
|
||||
|
||||
## Open questions
|
||||
|
||||
- Does `SrcLayerData` expose per-column sequential iteration, or only `lookup(kmer, n_genomes)` random access?
|
||||
- For option B: are new column builders writable in random-slot order (i.e. `set_val(slot, value)` without sequential constraint)?
|
||||
- For `GroupQuorumFilter` specifically: can the filter be decomposed into independent per-column contributions, or does it need the full row?
|
||||
@@ -29,16 +29,23 @@ Multiple values separated by `|` are always OR-ed within the predicate.
|
||||
|
||||
### Path matching (`~` and `!~`)
|
||||
|
||||
Metadata values can represent hierarchical taxonomic paths such as
|
||||
Metadata values can represent hierarchical concept paths such as
|
||||
`/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
|
||||
|
||||
- **Absolute pattern** (starts with `/`): the value must start with the pattern
|
||||
at a segment boundary.
|
||||
`taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
|
||||
`/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`.
|
||||
- **Bare segment** (no leading `/`): the value must contain the pattern as an
|
||||
exact path component anywhere.
|
||||
`taxon~Betula` matches any path that has `Betula` as one of its segments.
|
||||
Stored taxonomy values always start with `/` (the root of the path).
|
||||
Query patterns do **not** need to start with `/` — a leading `/` is an optional
|
||||
start anchor, not a requirement.
|
||||
|
||||
| Pattern form | Semantics |
|
||||
|---|---|
|
||||
| `A/B` | contiguous sub-path A then B, anywhere in the value |
|
||||
| `/A/B` | value starts with A then B |
|
||||
| `A/B$` | value ends with A then B |
|
||||
| `/A/B$` | value is exactly A then B |
|
||||
| `A@x/B` | A with class `x` followed by B with any class |
|
||||
|
||||
- `taxon~/Betulaceae/Betula` matches any path that starts with `Betulaceae` then `Betula`.
|
||||
- `taxon~Betula` matches any path containing `Betula` as a segment, anywhere.
|
||||
|
||||
### Missing metadata key → NA
|
||||
|
||||
|
||||
@@ -0,0 +1,520 @@
|
||||
# obicompactvec — Complete Reference
|
||||
|
||||
## Module structure
|
||||
|
||||
```
|
||||
src/obicompactvec/src/
|
||||
lib.rs public re-exports
|
||||
views.rs BitSliceView<'a>, IntSliceView<'a> — zero-copy read views
|
||||
traits.rs ColumnWeights, CountPartials, BitPartials (matrix aggregation)
|
||||
bitvec.rs PersistentBitVec, PersistentBitVecBuilder, BitIter
|
||||
reader.rs PersistentCompactIntVec (read-only)
|
||||
builder.rs PersistentCompactIntVecBuilder (read-write)
|
||||
tempintvec.rs TempCompactIntVec, TempCompactIntVecBuilder (temp-file-backed)
|
||||
tempbitvec.rs TempBitVec, TempBitVecBuilder (temp-file-backed)
|
||||
bitmatrix.rs PersistentBitMatrix, PersistentBitMatrixBuilder
|
||||
intmatrix.rs PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder
|
||||
colgroup.rs ColGroup, MatrixGroupOps trait
|
||||
format.rs file format constants, encode/decode helpers
|
||||
layer_meta.rs LayerMeta (column metadata)
|
||||
meta.rs matrix metadata
|
||||
```
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
views --> bitvec
|
||||
views --> builder
|
||||
views --> tempbitvec
|
||||
views --> tempintvec
|
||||
views --> bitmatrix
|
||||
views --> intmatrix
|
||||
format --> reader
|
||||
format --> builder
|
||||
reader --> intmatrix
|
||||
reader --> tempintvec
|
||||
builder --> intmatrix
|
||||
builder --> tempintvec
|
||||
bitvec --> tempbitvec
|
||||
bitvec --> bitmatrix
|
||||
tempintvec --> intmatrix
|
||||
tempintvec --> bitmatrix
|
||||
tempbitvec --> intmatrix
|
||||
tempbitvec --> bitmatrix
|
||||
colgroup --> intmatrix
|
||||
colgroup --> bitmatrix
|
||||
layer_meta --> bitmatrix
|
||||
layer_meta --> intmatrix
|
||||
meta --> bitmatrix
|
||||
meta --> intmatrix
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Compact int encoding
|
||||
|
||||
All integer vectors use the same two-tier encoding regardless of storage backend.
|
||||
|
||||
**Primary array** — one `u8` per slot:
|
||||
|
||||
- Values **0–254** are stored directly. No overhead.
|
||||
- Value **255 is a sentinel**: the slot's actual value is ≥ 255 and lives in the overflow store.
|
||||
|
||||
**Overflow store** — maps slot index to a `u32` value ≥ 255:
|
||||
|
||||
- In `PersistentCompactIntVecBuilder`: a `HashMap<usize, u32>` in RAM.
|
||||
- In `PersistentCompactIntVec` (reader): a sorted `[(slot: u64, value: u32)]` array in the mmap, with a sparse L1-resident index for binary search.
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
slot --> P["primary[slot]: u8"]
|
||||
P -->|"< 255"| V["value = byte (0–254)"]
|
||||
P -->|"= 255 sentinel"| OV["overflow store"]
|
||||
OV -->|"Builder"| HM["HashMap<usize, u32>\nin RAM"]
|
||||
OV -->|"PersistentCompactIntVec"| SA["sorted [(slot,value)] in mmap\n+ sparse L1 index"]
|
||||
```
|
||||
|
||||
**Key property — sentinel 255 = +∞ on `u8`:**
|
||||
|
||||
- `min(a, 255) = a` for all `a ≤ 254` → correct when only one side is overflow
|
||||
- `max(a, 255) = 255` → correct sentinel when either side is overflow
|
||||
- Only the **both-overflow** case requires reading actual values from the overflow store.
|
||||
|
||||
In practice, k (overflow count) ≪ n (total slots). Observed genomic data: ~0.07% of kmer slots are in overflow.
|
||||
|
||||
---
|
||||
|
||||
## View types
|
||||
|
||||
The previous trait hierarchy (`BitSlice`, `BitSliceMut`, `IntSlice`, `IntSliceMut`) has been replaced by two concrete zero-copy view structs with inherent methods. Views are **`Copy`** — passing them is free. All read operations live on these two types.
|
||||
|
||||
### `BitSliceView<'a>`
|
||||
|
||||
```rust
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct BitSliceView<'a> { pub(crate) words: &'a [u64], pub(crate) n: usize }
|
||||
```
|
||||
|
||||
Bit `i` is at `words[i >> 6]` bit `i & 63` (LSB-first). Padding bits in the last word are zero.
|
||||
|
||||
| Method | Cost |
|
||||
|---|---|
|
||||
| `len()`, `is_empty()` | O(1) |
|
||||
| `get(slot)` | O(1) |
|
||||
| `count_ones()` | POPCNT per word, O(n/64) |
|
||||
| `count_zeros()` | `n − count_ones()`, O(n/64) |
|
||||
| `iter() -> BitSliceIter<'a>` | O(1) setup, O(n) iteration |
|
||||
| `partial_jaccard_dist(other: BitSliceView)` | `(a&b).popcount`, `(a\|b).popcount` per word, O(n/64) |
|
||||
| `jaccard_dist(other: BitSliceView)` | from partial, O(n/64) |
|
||||
| `hamming_dist(other: BitSliceView)` | `(a^b).popcount` per word, O(n/64) |
|
||||
|
||||
`BitSliceIter<'a>`: word-level scan; one word per 64 iterations.
|
||||
|
||||
### `IntSliceView<'a>`
|
||||
|
||||
```rust
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct IntSliceView<'a> {
|
||||
pub(crate) primary: &'a [u8],
|
||||
pub(crate) overflow_raw: &'a [u8], // sorted [(slot:u64, value:u32)] entries
|
||||
pub(crate) n_overflow: usize,
|
||||
pub(crate) n: usize,
|
||||
}
|
||||
```
|
||||
|
||||
`overflow_raw` contains `n_overflow` entries of `OVERFLOW_ENTRY_SIZE` bytes each, sorted by slot. The sort invariant is established at `close()`/`freeze()` time.
|
||||
|
||||
| Method | Cost |
|
||||
|---|---|
|
||||
| `len()`, `is_empty()` | O(1) |
|
||||
| `primary_bytes()` | O(1) |
|
||||
| `overflow_entries() -> impl Iterator<(usize,u32)>` | O(n_overflow) iteration |
|
||||
| `get(slot)` | O(1) primary; binary search O(log k) for overflow slots |
|
||||
| `iter() -> IntSliceViewIter<'a>` | merge scan, O(n + k) |
|
||||
| `sum()` | byte scan + overflow, O(n + k) |
|
||||
| `count_nonzero()` | byte scan, O(n) |
|
||||
| Distance methods (`bray_dist`, `euclidean_dist`, `jaccard_dist`, …) | O(n + k) |
|
||||
|
||||
`IntSliceViewIter<'a>`: merge scan using `overflow_pos` index. Requires sorted overflow — guaranteed by the construction lifecycle.
|
||||
|
||||
**Builder `view()` vs reader `view()`:** `PersistentCompactIntVecBuilder` stores overflow as an unsorted `HashMap`, not raw bytes. Its `view()` returns an `IntSliceView` with `overflow_raw = &[]` and `n_overflow = 0`. This is intentional — the view is primarily useful after `freeze()`. During building, callers that need overflow use `overflow_entries()` directly.
|
||||
|
||||
---
|
||||
|
||||
## Concrete types
|
||||
|
||||
```mermaid
|
||||
classDiagram
|
||||
class BitSliceView {
|
||||
+words: &[u64]
|
||||
+n: usize
|
||||
+get(slot) bool
|
||||
+count_ones() u64
|
||||
+iter() BitSliceIter
|
||||
+jaccard_dist/hamming_dist(other: BitSliceView)
|
||||
}
|
||||
class IntSliceView {
|
||||
+primary: &[u8]
|
||||
+overflow_raw: &[u8]
|
||||
+n_overflow: usize
|
||||
+n: usize
|
||||
+get(slot) u32
|
||||
+iter() IntSliceViewIter
|
||||
+overflow_entries() Iterator
|
||||
+bray_dist/euclidean_dist/…(other: IntSliceView)
|
||||
}
|
||||
class PersistentBitVec {
|
||||
-mmap: Mmap
|
||||
-n: usize
|
||||
+view() BitSliceView
|
||||
+get(slot) bool
|
||||
+count_ones/zeros() u64
|
||||
+iter() BitIter
|
||||
+partial_jaccard_dist(&Self) (u64,u64)
|
||||
+jaccard_dist/hamming_dist(&Self) …
|
||||
}
|
||||
class PersistentBitVecBuilder {
|
||||
-mmap: MmapMut
|
||||
-n: usize
|
||||
+view() BitSliceView
|
||||
+set(slot, bool)
|
||||
+or/and/xor/not(BitSliceView)
|
||||
+copy_from(BitSliceView)
|
||||
+close() / finish() → PersistentBitVec
|
||||
}
|
||||
class PersistentCompactIntVec {
|
||||
-mmap: Mmap
|
||||
-n: usize
|
||||
-n_overflow: usize
|
||||
-step: usize
|
||||
-index: Vec~(usize,usize)~
|
||||
+view() IntSliceView
|
||||
+get(slot) u32
|
||||
+iter() Iter
|
||||
+sum/count_nonzero() u64
|
||||
+bray_dist/euclidean_dist/… (&Self)
|
||||
}
|
||||
class PersistentCompactIntVecBuilder {
|
||||
-mmap: MmapMut
|
||||
-n: usize
|
||||
-overflow: HashMap~usize,u32~
|
||||
+view() IntSliceView
|
||||
+set(slot, u32) / get(slot) u32
|
||||
+inc / inc_present / inc_present_fast
|
||||
+inc_predicate / inc_predicate_fast
|
||||
+add/min/max/diff/mask_with(…View)
|
||||
+primary_bytes/primary_bytes_mut()
|
||||
+close() / finish() → PersistentCompactIntVec
|
||||
}
|
||||
|
||||
PersistentBitVec --> BitSliceView : view()
|
||||
PersistentBitVecBuilder --> BitSliceView : view()
|
||||
PersistentCompactIntVec --> IntSliceView : view()
|
||||
PersistentCompactIntVecBuilder --> IntSliceView : view() (primary only)
|
||||
PersistentBitVecBuilder --> PersistentBitVec : close() then open()
|
||||
PersistentCompactIntVecBuilder --> PersistentCompactIntVec : close() then open()
|
||||
```
|
||||
|
||||
### `PersistentBitVec` / `PersistentBitVecBuilder`
|
||||
|
||||
`PersistentBitVec` is the read-only type. `view()` returns a `BitSliceView<'_>` over the mmap word array. Direct inherent methods delegate to the view: `count_ones()`, `count_zeros()`, `partial_jaccard_dist(&Self)`, `jaccard_dist(&Self)`, `hamming_dist(&Self)`.
|
||||
|
||||
`BitIter<'a>` — exported iterator for `PersistentBitVec::iter()`:
|
||||
|
||||
```rust
|
||||
pub struct BitIter<'a> { pub(crate) words: &'a [u64], pub(crate) slot: usize, pub(crate) n: usize }
|
||||
```
|
||||
|
||||
`PersistentBitVecBuilder` is the read-write type. Mutation operations accept `BitSliceView<'_>`:
|
||||
|
||||
| Method | Cost |
|
||||
|---|---|
|
||||
| `set(slot, bool)` | O(1) |
|
||||
| `view() -> BitSliceView<'_>` | O(1) |
|
||||
| `or/and/xor(BitSliceView)` | word-level, O(n/64), SIMD-friendly |
|
||||
| `not()` | `w ^= u64::MAX` per word, re-masks last word, O(n/64) |
|
||||
| `copy_from(BitSliceView)` | `copy_from_slice`, O(n/64) |
|
||||
|
||||
### `PersistentCompactIntVec` / `PersistentCompactIntVecBuilder`
|
||||
|
||||
`PersistentCompactIntVec` is the read-only type. `view()` returns an `IntSliceView<'_>` over the mmap primary and overflow arrays. Inherent `iter()` is a merge scan (`Iter` struct). Inherent `sum()` and `count_nonzero()` use fast byte-scan helpers.
|
||||
|
||||
`PersistentCompactIntVecBuilder` is the read-write type. Mutation methods on the builder fall into two categories:
|
||||
|
||||
**Point mutations:**
|
||||
|
||||
| Method | Note |
|
||||
|---|---|
|
||||
| `set(slot, u32)` | writes primary[slot] or 255+overflow |
|
||||
| `get(slot) -> u32` | reads primary byte or HashMap |
|
||||
| `inc(slot)` | `get` + `set`, O(1) |
|
||||
|
||||
**Bulk computation methods** — accept view arguments:
|
||||
|
||||
| Method | Semantics | Overflow |
|
||||
|---|---|---|
|
||||
| `inc_present(BitSliceView)` | `+= 1` at each 1-bit | via `inc`, safe for any group size |
|
||||
| `inc_present_fast(BitSliceView)` | same, raw u8 `+= 1` | `debug_assert` no 255 reached |
|
||||
| `inc_predicate(IntSliceView, pred)` | `+= 1` where `pred(col[s])` | two-pass, safe |
|
||||
| `inc_predicate_fast(IntSliceView, pred)` | same, raw u8 | `debug_assert` no 255 reached |
|
||||
| `add(IntSliceView)` | `self[s] += other[s]` | primary fast path + overflow fallback |
|
||||
| `min(IntSliceView)` | byte min + both-overflow fixup | see algorithm below |
|
||||
| `max(IntSliceView)` | pre-pass + byte max | see algorithm below |
|
||||
| `diff(IntSliceView)` | saturating sub | self<255 hot path |
|
||||
| `mask_with(BitSliceView)` | zeros slots where mask bit = 0 | O(n_zeros) |
|
||||
|
||||
**`inc_present_fast` / `inc_predicate_fast` invariant:** caller guarantees no counter reaches 255 during the operation (group size < 255 for `inc_present_fast`, or chunk size < 255 for `inc_predicate_fast`). Violation is caught by `debug_assert` in dev builds.
|
||||
|
||||
**`min` algorithm:**
|
||||
|
||||
Exploits 255 = +∞: byte-level min is correct unless both sides are overflow.
|
||||
|
||||
```
|
||||
snapshot self_ov: Vec<(slot,val)>
|
||||
snapshot other_ov: HashMap<slot,val>
|
||||
clear_overflow()
|
||||
Pass 1 — byte min, SIMD-vectorizable, O(n)
|
||||
Pass 2 — both-overflow fixup, O(k_self):
|
||||
for (slot, self_val) in self_ov:
|
||||
if slot ∈ other_ov: set(slot, min(self_val, other_ov[slot]))
|
||||
```
|
||||
|
||||
**`max` algorithm:**
|
||||
|
||||
Cannot do byte max first — `max(255, b<255)=255` overwrites self's original overflow value. Pre-pass reads self's value at other's overflow slots before the byte pass.
|
||||
|
||||
```
|
||||
Pre-pass O(k_other): for (slot, other_val) in other.overflow_entries():
|
||||
set(slot, max(self.get(slot), other_val))
|
||||
Pass 1 — byte max, SIMD-vectorizable, O(n)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Matrix types
|
||||
|
||||
Two matrix types (bit and int encodings), each available in two storage formats — four combinations:
|
||||
|
||||
| | Columnar format | Packed format |
|
||||
|---|---|---|
|
||||
| **Bit** | `PersistentBitMatrix` (Columnar variant) | `PersistentBitMatrix` (Packed variant) |
|
||||
| **Int** | `PersistentCompactIntMatrix` (Columnar variant) | `PersistentCompactIntMatrix` (Packed variant) |
|
||||
|
||||
Both matrix types are enums (`Columnar` / `Packed`, plus an `Implicit` variant for the bit matrix) behind a transparent API. `col_view(c)` returns the appropriate view directly:
|
||||
|
||||
```rust
|
||||
// PersistentBitMatrix
|
||||
pub fn col_view(&self, c: usize) -> BitSliceView<'_>
|
||||
|
||||
// PersistentCompactIntMatrix
|
||||
pub fn col_view(&self, c: usize) -> IntSliceView<'_>
|
||||
```
|
||||
|
||||
No wrapper enums (`BitColView`, `IntColView`): the caller receives a `Copy` view struct immediately usable with any view method or bulk builder method.
|
||||
|
||||
`pack_compact_int_matrix` and `pack_bit_matrix` convert columnar → packed format.
|
||||
|
||||
---
|
||||
|
||||
## Aggregation traits (matrix level)
|
||||
|
||||
### ColumnWeights
|
||||
|
||||
```rust
|
||||
trait ColumnWeights: Send + Sync {
|
||||
fn col_weights(&self) -> Array1<u64>; // sum per column
|
||||
fn partial_kmer_counts(&self) -> Array1<u64>; // default = col_weights()
|
||||
}
|
||||
```
|
||||
|
||||
`partial_kmer_counts` is overridden for count matrices to return `count_nonzero` per column (distinct kmers) rather than total count.
|
||||
|
||||
### CountPartials
|
||||
|
||||
Abstract required methods: `partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`, `partial_relfreq_bray`, `partial_relfreq_euclidean`, `partial_hellinger`.
|
||||
|
||||
**Additivity rule:** self-contained partials (`partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`) can be element-wise summed across all `(partition, layer)` pairs. Normalised partials (`partial_relfreq_*`, `partial_hellinger`) require the **global** `col_weights` (accumulated across all layers and all partitions) as parameter.
|
||||
|
||||
**`partial_threshold_jaccard` returns `(inter, union)`** because `union[i,j]` depends on both columns simultaneously.
|
||||
|
||||
Provided finalisations:
|
||||
|
||||
| Finalisation | Formula |
|
||||
|---|---|
|
||||
| `bray_dist_matrix()` | `1 − 2·partial_bray[i,j] / (w[i] + w[j])` |
|
||||
| `euclidean_dist_matrix()` | `√partial_euclidean[i,j]` |
|
||||
| `threshold_jaccard_dist_matrix(t)` | `1 − inter[i,j] / union[i,j]` |
|
||||
| `relfreq_bray_dist_matrix()` | `1 − partial_relfreq_bray[i,j]` |
|
||||
| `relfreq_euclidean_dist_matrix()` | `√partial_relfreq_euclidean[i,j]` |
|
||||
| `hellinger_dist_matrix()` | `√partial_hellinger[i,j] / √2` |
|
||||
| `hellinger_euclidean_dist_matrix()` | `√partial_hellinger[i,j]` |
|
||||
|
||||
### BitPartials
|
||||
|
||||
Required: `partial_jaccard() -> (Array2<u64>, Array2<u64>)`, `partial_hamming() -> Array2<u64>`. Both additive across layers and partitions.
|
||||
|
||||
---
|
||||
|
||||
## Temp-file-backed types
|
||||
|
||||
**All inter-function results use temp-file-backed types** so the OS can page them out under memory pressure. This matters in practice: processing dozens of layers × hundreds of partitions in parallel would otherwise accumulate gigabytes of live anonymous memory.
|
||||
|
||||
### Lifecycle
|
||||
|
||||
```
|
||||
TempCompactIntVecBuilder::new(n) → writable mmap in TempDir
|
||||
↓ (inc_present_fast / inc_predicate_fast / add / mask_with / …)
|
||||
.freeze() → TempCompactIntVec (read-only mmap + TempDir)
|
||||
↓ (optional)
|
||||
.make_persistent(path) → PersistentCompactIntVec (permanent file)
|
||||
```
|
||||
|
||||
Same pattern for `TempBitVecBuilder` → `TempBitVec` → `PersistentBitVec`.
|
||||
|
||||
**Drop order**: `TempCompactIntVec { vec: PersistentCompactIntVec, _temp: TempDir }` — Rust drops fields in declaration order. `vec` (mmap) released before `_temp` (directory deleted). No explicit `drop()` needed.
|
||||
|
||||
### TempCompactIntVec / TempCompactIntVecBuilder
|
||||
|
||||
```rust
|
||||
pub struct TempCompactIntVec {
|
||||
vec: PersistentCompactIntVec,
|
||||
_temp: TempDir, // dropped after vec
|
||||
}
|
||||
|
||||
pub(crate) struct TempCompactIntVecBuilder {
|
||||
builder: PersistentCompactIntVecBuilder,
|
||||
temp: TempDir,
|
||||
}
|
||||
```
|
||||
|
||||
`TempCompactIntVec`: read access via `get(slot)`, `sum()`, `iter()`, `view() -> IntSliceView<'_>`.
|
||||
|
||||
`TempCompactIntVecBuilder`: full delegation to inner `PersistentCompactIntVecBuilder` — all bulk computation methods (`inc_present_fast`, `inc_predicate_fast`, `add`, `min`, `max`, `diff`, `mask_with`) are exposed as `pub(crate)`.
|
||||
|
||||
### TempBitVec / TempBitVecBuilder
|
||||
|
||||
```rust
|
||||
pub struct TempBitVec {
|
||||
vec: PersistentBitVec,
|
||||
_temp: TempDir,
|
||||
}
|
||||
|
||||
pub(crate) struct TempBitVecBuilder {
|
||||
builder: PersistentBitVecBuilder,
|
||||
temp: TempDir,
|
||||
}
|
||||
```
|
||||
|
||||
`TempBitVec`: read access via `get(slot)`, `count_ones()`, `view() -> BitSliceView<'_>`, `iter()`.
|
||||
|
||||
`TempBitVecBuilder`: exposes `set(slot, bool)`, `or(BitSliceView)`, and:
|
||||
|
||||
```rust
|
||||
pub(crate) fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool)
|
||||
```
|
||||
|
||||
`or_where` — two passes, no intermediate allocation:
|
||||
|
||||
```
|
||||
Pass 1 — primary bytes, O(n):
|
||||
for slot in 0..n:
|
||||
b = col.primary_bytes()[slot]
|
||||
if b < 255 AND pred(b as u32): self.set(slot, true)
|
||||
|
||||
Pass 2 — overflow, O(k):
|
||||
for (slot, val) in col.overflow_entries():
|
||||
if pred(val): self.set(slot, true)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Filter / Select API
|
||||
|
||||
### ColGroup
|
||||
|
||||
```rust
|
||||
pub struct ColGroup { pub name: String, pub indices: Vec<usize> }
|
||||
```
|
||||
|
||||
Defined **once at the index level** from column metadata. Valid in all matrices of all layers and partitions — column structure is identical across the entire hierarchy; only rows (kmer slots) are partitioned.
|
||||
|
||||
### Composition axis
|
||||
|
||||
- **Across partitions**: kmer space is partitioned → partial results **concatenated** (disjoint kmer ranges).
|
||||
- **Across layers**: same kmer space, different counts → partial results **aggregated** (add, OR, etc.).
|
||||
|
||||
### MatrixGroupOps
|
||||
|
||||
Five required primitives + two default methods derived from them. All return temp-file-backed types.
|
||||
|
||||
```rust
|
||||
pub trait MatrixGroupOps {
|
||||
// required
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32)
|
||||
-> io::Result<TempCompactIntVec>;
|
||||
fn partial_group_sum(&self, g: &ColGroup)
|
||||
-> io::Result<TempCompactIntVec>;
|
||||
fn partial_group_any(&self, g: &ColGroup, threshold: u32)
|
||||
-> io::Result<TempBitVec>;
|
||||
fn partial_group_min(&self, g: &ColGroup)
|
||||
-> io::Result<TempCompactIntVec>;
|
||||
fn partial_group_max(&self, g: &ColGroup)
|
||||
-> io::Result<TempCompactIntVec>;
|
||||
|
||||
// defaults derived from partial_group_presence_count
|
||||
fn partial_group_all(&self, g: &ColGroup, threshold: u32)
|
||||
-> io::Result<TempBitVec>; // slot=1 iff count == g.indices.len()
|
||||
fn partial_group_none(&self, g: &ColGroup, threshold: u32)
|
||||
-> io::Result<TempBitVec>; // slot=1 iff count == 0
|
||||
}
|
||||
```
|
||||
|
||||
Implemented for both `PersistentCompactIntMatrix` and `PersistentBitMatrix`.
|
||||
|
||||
For **bit matrices**: values are 0/1, so `partial_group_sum` = `partial_group_presence_count(g, 1)`; `partial_group_min` is AND (set first column then mask-with remaining); `partial_group_max` is OR via `partial_group_any` + `inc_present`.
|
||||
|
||||
**`partial_group_presence_count` — chunking for large groups:**
|
||||
|
||||
When `g.indices.len() < 255`: per-slot counts stay within `u8` range. Use `inc_present_fast` (bit) or `inc_predicate_fast(col_view(c), |v| v >= threshold)` (int) — raw u8 increment, no overflow entry written.
|
||||
|
||||
When `g.indices.len() ≥ 255`: process in chunks of 254 columns, accumulate via `.add(chunk_frozen.view())`.
|
||||
|
||||
**`partial_group_min` (int matrix)**: copy first column via `.add(col_view(first))` (start from 0 ⇒ copy), then `.min(col_view(c))` for remaining.
|
||||
|
||||
**`partial_group_max` (int matrix)**: `.max(col_view(c))` for all columns (start from 0 ⇒ first column acts as copy).
|
||||
|
||||
**`partial_group_any`** uses `or_where` on `TempBitVecBuilder` (two-pass: primary bytes then overflow entries).
|
||||
|
||||
**`partial_group_all` / `partial_group_none`** (default): call `partial_group_presence_count`, then iterate slots to produce the bit result. O(n) extra pass, not chunked.
|
||||
|
||||
### add_col_from — matrix builder integration
|
||||
|
||||
Both matrix builders accept temp-file results directly:
|
||||
|
||||
```rust
|
||||
// PersistentBitMatrixBuilder
|
||||
fn add_col_from(&mut self, src: &TempBitVec) -> io::Result<()>
|
||||
fn add_col_from_int(&mut self, src: &TempCompactIntVec) -> io::Result<()> // nonzero → 1
|
||||
|
||||
// PersistentCompactIntMatrixBuilder
|
||||
fn add_col_from(&mut self, src: &TempCompactIntVec) -> io::Result<()>
|
||||
fn add_col_from_bit(&mut self, src: &TempBitVec) -> io::Result<()> // bit → 0/1 u32
|
||||
```
|
||||
|
||||
`add_col_from` copies the temp file to the matrix directory and increments `n_cols`; `close()` writes `meta.json` with the final column count. No separate `write_meta` step needed.
|
||||
|
||||
### mask_with
|
||||
|
||||
Direct method on `PersistentCompactIntVecBuilder` (and delegation via `TempCompactIntVecBuilder`). Zeros every slot where the corresponding mask bit is 0. Scans the mask word by word and visits only zero bits — O(n/64 + n_zeros); an all-ones mask costs one comparison per word and touches no slot.
|
||||
|
||||
```
|
||||
for (w_idx, word) in mask.words():
|
||||
if word == u64::MAX: continue // skip all-ones words
|
||||
zeros = !word
|
||||
while zeros != 0:
|
||||
bit = trailing_zeros(zeros)
|
||||
s = w_idx * 64 + bit
|
||||
if primary[s] != 0: set(s, 0) // clears overflow entry too
|
||||
zeros &= zeros − 1
|
||||
```
|
||||
|
||||
Terminal operation for Filter (retain only selected kmer slots in a count vector) and Select (positional selection without MPHF).
|
||||
@@ -0,0 +1,143 @@
|
||||
# `obitaxonomy` — taxonomy concept paths
|
||||
|
||||
`obitaxonomy` is a dependency-free crate that defines a typed representation
|
||||
of hierarchical concept paths (taxonomic or otherwise) stored in genome metadata.
|
||||
|
||||
---
|
||||
|
||||
## Concept path syntax
|
||||
|
||||
A concept path is stored as a metadata value with the prefix `taxonomy:/`:
|
||||
|
||||
```
|
||||
taxonomy:/Enterobacteriaceae@family/Escherichia@genus/Escherichia coli@species
|
||||
```
|
||||
|
||||
Structure:
|
||||
|
||||
- The `taxonomy:/` prefix is the type discriminator. Any metadata value starting
|
||||
with it is parsed as a `TaxPath`; all others remain plain strings.
|
||||
- The remainder is one or more `/`-separated segments.
|
||||
- Each segment is `name` or `name@rank`, where `rank` is a label for the
|
||||
taxonomic level (e.g. `family`, `genus`, `species`).
|
||||
- Rank annotations are **optional per segment** and can be mixed freely.
|
||||
- Spaces are allowed in both names and ranks.
|
||||
|
||||
### Reserved character
|
||||
|
||||
`@` is reserved throughout the taxonomy system and may **not** appear in:
|
||||
|
||||
| Context | Constraint |
|
||||
|---------|------------|
|
||||
| Segment name | forbidden |
|
||||
| Rank/class label | forbidden |
|
||||
| Metadata key names | forbidden (used as `key@rank` in predicate syntax) |
|
||||
|
||||
`@` is freely allowed in plain-text metadata values (non-taxonomy).
|
||||
|
||||
### Parse errors
|
||||
|
||||
| Condition | Error |
|
||||
|-----------|-------|
|
||||
| Value does not start with `taxonomy:/` | `MissingPrefix` |
|
||||
| No segments after the prefix | `EmptyPath` |
|
||||
| Segment with empty name (consecutive `/`) | `EmptySegmentName` |
|
||||
| Segment with trailing `@` and no rank (`name@`) | `EmptyRankName` |
|
||||
| Segment with more than one `@` | `AmbiguousRank` |
|
||||
|
||||
---
|
||||
|
||||
## Public API
|
||||
|
||||
### `TaxSegment`
|
||||
|
||||
A single node: a name and an optional rank.
|
||||
|
||||
```rust
|
||||
seg.name() // &str
|
||||
seg.rank() // Option<&str>
|
||||
seg.to_string() // "name" or "name@rank"
|
||||
TaxSegment::parse(s) // Result<TaxSegment, TaxError>
|
||||
```
|
||||
|
||||
### `TaxPath`
|
||||
|
||||
```rust
|
||||
TaxPath::parse(s) // Result<TaxPath, TaxError>
|
||||
path.segments() // &[TaxSegment]
|
||||
path.depth() // usize — number of segments
|
||||
path.is_ancestor_of(&other) // bool — prefix match by name, ranks ignored
|
||||
path.name_at_rank("genus") // Option<&str>
|
||||
path.to_string() // reconstructs "taxonomy:/…"
|
||||
```
|
||||
|
||||
`is_ancestor_of` compares segment **names** only — rank annotations are
|
||||
informational and do not affect the ancestry relation.
|
||||
|
||||
```rust
|
||||
let a: TaxPath = "taxonomy:/Enterobacteriaceae@family/Escherichia@genus".parse()?;
|
||||
let b: TaxPath = "taxonomy:/Enterobacteriaceae@family/Escherichia@genus/Escherichia coli@species".parse()?;
|
||||
|
||||
assert!(a.is_ancestor_of(&b)); // true
|
||||
assert!(b.is_ancestor_of(&a)); // false
|
||||
assert!(a.is_ancestor_of(&a)); // true (equal ⇒ ancestor)
|
||||
|
||||
assert_eq!(b.name_at_rank("species"), Some("Escherichia coli"));
|
||||
assert_eq!(b.name_at_rank("genus"), Some("Escherichia"));
|
||||
assert_eq!(b.name_at_rank("order"), None);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Integration with `GenomeInfo`
|
||||
|
||||
At index load time, every metadata value is inspected once:
|
||||
|
||||
- Starts with `taxonomy:/` → parsed into `TaxPath`, stored in `genome.taxonomy`.
|
||||
- Otherwise → kept as-is in `genome.meta`.
|
||||
|
||||
```rust
|
||||
struct GenomeInfo {
|
||||
label: String,
|
||||
meta: HashMap<String, String>, // plain text metadata
|
||||
taxonomy: HashMap<String, TaxPath>, // parsed taxonomy metadata
|
||||
}
|
||||
```
|
||||
|
||||
The raw string is not duplicated. `TaxPath::to_string()` reconstructs the
|
||||
original value losslessly for serialisation.
|
||||
|
||||
---
|
||||
|
||||
## Predicate operators (in `filter` / `select`)
|
||||
|
||||
Path predicates use the `~` / `!~` operators. The **stored value** always starts
|
||||
with `/` (rooted path); the **query pattern** does not need to.
|
||||
|
||||
### Path pattern syntax
|
||||
|
||||
| Pattern | Semantics |
|
||||
|---------|-----------|
|
||||
| `A/B` | contiguous sub-path A then B, anywhere in the value |
|
||||
| `/A/B` | value starts with A then B (start-anchored) |
|
||||
| `A/B$` | value ends with A then B (end-anchored) |
|
||||
| `/A/B$` | value is exactly A then B (fully anchored) |
|
||||
| `A@x/B` | A with class `x` followed by B with any class |
|
||||
| `A@x/B@y` | A with class `x` followed by B with class `y` |
|
||||
|
||||
A segment pattern without `@` matches the segment name regardless of its stored class.
|
||||
|
||||
### Rank-aware queries
|
||||
|
||||
```
|
||||
key@rank=value
|
||||
```
|
||||
|
||||
| Predicate form | Semantics |
|
||||
|----------------|-----------|
|
||||
| `key@rank=value` | genome's `key` has `value` at rank `rank` |
|
||||
| `key@rank!=value` | does not |
|
||||
| `key@rank=v1\|v2` | value at `rank` is `v1` or `v2` |
|
||||
|
||||
`~` combined with `@rank` on the key (e.g. `key@genus~pattern`) is not defined
|
||||
and is rejected at parse time.
|
||||
@@ -60,13 +60,13 @@ HPC home directories are typically on a network filesystem (Lustre, NFS) optimis
|
||||
**Always redirect the build directory to a local scratch disk:**
|
||||
|
||||
```bash
|
||||
CARGO_TARGET_DIR=/scratch/local/$USER/cargo-target cargo build --release
|
||||
CARGO_TARGET_DIR=/scratch/$USER/cargo-target cargo build --release
|
||||
```
|
||||
|
||||
Adapt the path to the local scratch available on your cluster (`/var/tmp`, `/tmp`, `/scratch/local`, etc.). Once built, copy the binary to a permanent location:
|
||||
|
||||
```bash
|
||||
cp /scratch/local/$USER/cargo-target/release/obikmer ~/bin/
|
||||
cp /scratch/$USER/cargo-target/release/obikmer ~/bin/
|
||||
```
|
||||
|
||||
## NUMA support
|
||||
|
||||
@@ -53,6 +53,7 @@ nav:
|
||||
- Merge parallelism & memory: implementation/merge_parallelism.md
|
||||
- Kmer filtering: implementation/filtering.md
|
||||
- Select command: implementation/select.md
|
||||
- obitaxonomy crate: implementation/obitaxonomy.md
|
||||
- Architecture:
|
||||
- Sequences: architecture/sequences/invariant.md
|
||||
- Kmer index: architecture/index_architecture.md
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
# La crate obicompactvec
|
||||
|
||||
Le code actuel est ce qu'il est. Ce n'est pas la vérité absolue, c'est un premier effort d'implémentation, rien de plus. Ci-dessous, je vais décrire les objectifs et la structure telle qu'elle devrait être : LA VÉRITÉ À ATTEINDRE.
|
||||
|
||||
La crate fournit des représentations les plus compactes possible en mémoire de matrices de comptage ou de présence de k-mers dans des génomes. Chaque colonne représente un génome, chaque ligne un k-mer. Une matrice est une collection de vecteurs où chacun des vecteurs est une colonne de la matrice.
|
||||
|
||||
Les matrices comme les colonnes ont vocation à être persistantes. Les données sont stockées dans des fichiers binaires. Les données sont mappées en mémoire via `mmap`.
|
||||
|
||||
Les structures sont par essence immuables. Il existe des représentations mutables des colonnes qui permettent leur construction. À la fin de leur construction, les colonnes sont fermées, ce qui les rend immuables.
|
||||
|
||||
Les matrices peuvent être représentées de deux façons :
|
||||
- via un répertoire contenant une collection de fichiers colonnes
|
||||
- via un fichier matrix qui est la concaténation de plusieurs fichiers colonnes.
|
||||
|
||||
|
||||
## Les matrices de comptage
|
||||
|
||||
Ce sont des matrices d'entiers positifs, la plupart du temps de petites valeurs (inférieures à 255). On suppose que toutes les valeurs sont représentables sur un `u32`.
|
||||
|
||||
## Les matrices de présence
|
||||
|
||||
Ce sont des matrices de booléens représentées comme des champs de bits.
|
||||
|
||||
Il existe une forme implicite des vecteurs de présence, qui n'est représentée par aucun fichier et pour laquelle toutes les valeurs sont vraies.
|
||||
|
||||
## Représentation légère des colonnes
|
||||
|
||||
Les colonnes, qu'elles soient unitaires (fichier colonne) ou partie d'un fichier composite matrice, peuvent être représentées par un objet léger donnant accès à leurs valeurs ainsi qu'à la longueur du vecteur. Toutes les méthodes de calcul doivent uniquement travailler à partir de ces représentations légères unifiées des colonnes.
|
||||
|
||||
### Représentation légère d'un vecteur de présence
|
||||
|
||||
Le vecteur est représenté par
|
||||
- un champ de bits encodé comme un [u64]
|
||||
- un usize encodant la longueur du champ de bits
|
||||
|
||||
### Représentation légère d'un vecteur de comptage
|
||||
|
||||
Le vecteur est représenté par
|
||||
- un vecteur [u8] encodant directement les valeurs faibles du vecteur [0,255[
|
||||
La valeur 255 est une valeur sentinelle indiquant que la valeur vraie est >= 255
|
||||
et se trouve dans une structure d'overflow
|
||||
- un itérateur de (usize,u32) listant les valeurs d'overflow correspondant aux valeurs
|
||||
sentinelles (255) du [u8]
|
||||
- un usize encodant la longueur du vecteur
|
||||
Generated
+6
-1
@@ -1704,7 +1704,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "obikmer"
|
||||
version = "0.1.0"
|
||||
version = "0.1.3"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"csv",
|
||||
@@ -1722,6 +1722,7 @@ dependencies = [
|
||||
"obiskbuilder",
|
||||
"obiskio",
|
||||
"obisys",
|
||||
"obitaxonomy",
|
||||
"pprof",
|
||||
"rayon",
|
||||
"serde_json",
|
||||
@@ -1853,6 +1854,10 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "obitaxonomy"
|
||||
version = "0.1.0"
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.37.3"
|
||||
|
||||
+1
-1
@@ -1,5 +1,5 @@
|
||||
[workspace]
|
||||
resolver = "3"
|
||||
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex"]
|
||||
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex", "obitaxonomy"]
|
||||
[profile.release]
|
||||
debug = 1
|
||||
|
||||
@@ -7,6 +7,6 @@ edition = "2024"
|
||||
memmap2 = "0.9"
|
||||
ndarray = "0.16"
|
||||
rayon = "1"
|
||||
tempfile = "3"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
|
||||
@@ -7,8 +7,12 @@ use ndarray::{Array1, Array2};
|
||||
use rayon::prelude::*;
|
||||
|
||||
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
||||
use crate::colgroup::{ColGroup, MatrixGroupOps};
|
||||
use crate::layer_meta::LayerMeta;
|
||||
use crate::meta::MatrixMeta;
|
||||
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||
use crate::views::BitSliceView;
|
||||
|
||||
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
||||
dir.join(format!("col_{col:06}.pbiv"))
|
||||
@@ -54,34 +58,11 @@ impl ColumnarBitMatrix {
|
||||
}
|
||||
|
||||
pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
|
||||
let n = self.n_cols();
|
||||
let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| {
|
||||
let (inter, union) = self.col(i).partial_jaccard_dist(self.col(j));
|
||||
(i, j, inter, union)
|
||||
})
|
||||
.collect();
|
||||
let mut inter_m = Array2::zeros((n, n));
|
||||
let mut union_m = Array2::zeros((n, n));
|
||||
for (i, j, inter, union) in results {
|
||||
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
|
||||
union_m[[i, j]] = union; union_m[[j, i]] = union;
|
||||
}
|
||||
(inter_m, union_m)
|
||||
pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_jaccard_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
|
||||
self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j)))
|
||||
}
|
||||
|
||||
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
|
||||
let n = self.n_cols();
|
||||
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| (i, j, f(i, j)))
|
||||
.collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
pairwise_matrix(self.n_cols(), |i, j| self.col(i).hamming_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn append_column(dir: &Path, value_of: impl Fn(usize) -> bool) -> io::Result<()> {
|
||||
@@ -147,84 +128,46 @@ impl PackedBitMatrix {
|
||||
}).collect()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn col_bytes(&self, c: usize) -> &[u8] {
|
||||
let start = self.data_offsets[c];
|
||||
let len = (self.n_rows + 7) / 8;
|
||||
&self.mmap[start..start + len]
|
||||
&self.mmap[start..start + self.n_rows.div_ceil(8)]
|
||||
}
|
||||
|
||||
fn count_ones_col(&self, c: usize) -> u64 {
|
||||
let bytes = self.col_bytes(c);
|
||||
let full = self.n_rows / 8;
|
||||
let rem = self.n_rows % 8;
|
||||
let mut n: u64 = bytes[..full].iter().map(|b| b.count_ones() as u64).sum();
|
||||
if rem > 0 { n += (bytes[full] & ((1u8 << rem) - 1)).count_ones() as u64; }
|
||||
n
|
||||
fn col_words(&self, c: usize) -> &[u64] {
|
||||
let nw = self.n_rows.div_ceil(64);
|
||||
// SAFETY: data_offsets[c] is always 8-byte aligned.
|
||||
// PBMX header = 24 + n_cols×8 (multiple of 8); each PBIV blob =
|
||||
// 16 + nwords×8 (multiple of 8); mmap base is page-aligned.
|
||||
let ptr = self.mmap[self.data_offsets[c]..].as_ptr() as *const u64;
|
||||
unsafe { std::slice::from_raw_parts(ptr, nw) }
|
||||
}
|
||||
|
||||
fn pair_op(&self, i: usize, j: usize, and_or: bool) -> u64 {
|
||||
let ai = self.col_bytes(i);
|
||||
let aj = self.col_bytes(j);
|
||||
let full = self.n_rows / 8;
|
||||
let rem = self.n_rows % 8;
|
||||
let mut n: u64 = ai[..full].iter().zip(aj[..full].iter())
|
||||
.map(|(a, b)| if and_or { a & b } else { a ^ b }.count_ones() as u64)
|
||||
.sum();
|
||||
if rem > 0 {
|
||||
let mask = (1u8 << rem) - 1;
|
||||
let last = if and_or { ai[full] & aj[full] } else { ai[full] ^ aj[full] };
|
||||
n += (last & mask).count_ones() as u64;
|
||||
}
|
||||
n
|
||||
pub(crate) fn col_slice(&self, c: usize) -> BitSliceView<'_> {
|
||||
BitSliceView::new(self.col_words(c), self.n_rows)
|
||||
}
|
||||
|
||||
fn partial_jaccard_col(&self, i: usize, j: usize) -> (u64, u64) {
|
||||
let ai = self.col_bytes(i);
|
||||
let aj = self.col_bytes(j);
|
||||
let full = self.n_rows / 8;
|
||||
let rem = self.n_rows % 8;
|
||||
let (mut inter, mut union) = ai[..full].iter().zip(aj[..full].iter())
|
||||
.fold((0u64, 0u64), |(inter, union), (a, b)| {
|
||||
(inter + (a & b).count_ones() as u64,
|
||||
union + (a | b).count_ones() as u64)
|
||||
});
|
||||
if rem > 0 {
|
||||
let mask = (1u8 << rem) - 1;
|
||||
inter += ((ai[full] & aj[full]) & mask).count_ones() as u64;
|
||||
union += ((ai[full] | aj[full]) & mask).count_ones() as u64;
|
||||
}
|
||||
(inter, union)
|
||||
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
|
||||
PersistentBitVecBuilder::from_raw_bytes(self.col_bytes(c), self.n_rows, path)
|
||||
}
|
||||
|
||||
pub(crate) fn count_ones(&self) -> Array1<u64> {
|
||||
Array1::from_vec(
|
||||
(0..self.n_cols).into_par_iter().map(|c| self.count_ones_col(c)).collect()
|
||||
(0..self.n_cols).into_par_iter()
|
||||
.map(|c| self.col_slice(c).count_ones())
|
||||
.collect()
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
|
||||
let n = self.n_cols;
|
||||
let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| { let (inter, union) = self.partial_jaccard_col(i, j); (i, j, inter, union) })
|
||||
.collect();
|
||||
let mut inter_m = Array2::zeros((n, n));
|
||||
let mut union_m = Array2::zeros((n, n));
|
||||
for (i, j, inter, union) in results {
|
||||
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
|
||||
union_m[[i, j]] = union; union_m[[j, i]] = union;
|
||||
}
|
||||
(inter_m, union_m)
|
||||
pairwise2_matrix(self.n_cols, |i, j| {
|
||||
self.col_slice(i).partial_jaccard_dist(self.col_slice(j))
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
|
||||
let n = self.n_cols;
|
||||
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| (i, j, self.pair_op(i, j, false)))
|
||||
.collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
pairwise_matrix(self.n_cols, |i, j| {
|
||||
self.col_slice(i).hamming_dist(self.col_slice(j))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -343,6 +286,24 @@ impl PersistentBitMatrix {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn col_view(&self, c: usize) -> BitSliceView<'_> {
|
||||
match self {
|
||||
Self::Columnar(m) => m.col(c).view(),
|
||||
Self::Packed(m) => m.col_slice(c),
|
||||
Self::Implicit { .. } => panic!("col_view() not available on Implicit PersistentBitMatrix"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
|
||||
match self {
|
||||
Self::Columnar(m) => PersistentBitVecBuilder::build_from(m.col(c), path),
|
||||
Self::Packed(m) => m.col_persist(c, path),
|
||||
Self::Implicit { n_rows, .. } => {
|
||||
PersistentBitVecBuilder::new_ones(*n_rows, path)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn row(&self, slot: usize) -> Box<[bool]> {
|
||||
match self {
|
||||
Self::Columnar(m) => m.row(slot),
|
||||
@@ -439,12 +400,93 @@ impl PersistentBitMatrixBuilder {
|
||||
PersistentBitVecBuilder::new(self.n, &path)
|
||||
}
|
||||
|
||||
pub fn add_col_ones(&mut self) -> io::Result<PersistentBitVecBuilder> {
|
||||
let path = col_path(&self.dir, self.n_cols);
|
||||
self.n_cols += 1;
|
||||
PersistentBitVecBuilder::new_ones(self.n, &path)
|
||||
}
|
||||
|
||||
pub fn add_col_from(&mut self, src: &TempBitVec) -> io::Result<()> {
|
||||
src.make_persistent(&col_path(&self.dir, self.n_cols))?;
|
||||
self.n_cols += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn add_col_from_int(&mut self, src: &TempCompactIntVec) -> io::Result<()> {
|
||||
let path = col_path(&self.dir, self.n_cols);
|
||||
self.n_cols += 1;
|
||||
let mut b = PersistentBitVecBuilder::new(self.n, &path)?;
|
||||
b.or_where(src.view(), |v| v > 0);
|
||||
b.close()
|
||||
}
|
||||
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
|
||||
|
||||
impl MatrixGroupOps for PersistentBitMatrix {
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> io::Result<TempCompactIntVec> {
|
||||
// Bit matrices store 0/1 — threshold is structurally always 1.
|
||||
let n = self.n();
|
||||
if g.indices.len() < 255 {
|
||||
let mut builder = TempCompactIntVecBuilder::new(n)?;
|
||||
for &c in &g.indices {
|
||||
builder.inc_present_fast(self.col_view(c));
|
||||
}
|
||||
builder.freeze()
|
||||
} else {
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
for chunk in g.indices.chunks(254) {
|
||||
let mut chunk_b = TempCompactIntVecBuilder::new(n)?;
|
||||
for &c in chunk {
|
||||
chunk_b.inc_present_fast(self.col_view(c));
|
||||
}
|
||||
let frozen = chunk_b.freeze()?;
|
||||
result.add(frozen.view());
|
||||
}
|
||||
result.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
// For bit matrices, sum = count of 1-bits — identical to presence_count.
|
||||
self.partial_group_presence_count(g, 1)
|
||||
}
|
||||
|
||||
fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> io::Result<TempBitVec> {
|
||||
let n = self.n();
|
||||
let mut result = TempBitVecBuilder::new(n)?;
|
||||
for &c in &g.indices {
|
||||
result.or(self.col_view(c));
|
||||
}
|
||||
result.freeze()
|
||||
}
|
||||
|
||||
fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
// min of 0/1 values = AND: 1 only if ALL columns are 1
|
||||
let n = self.n();
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
if let Some((&first, rest)) = g.indices.split_first() {
|
||||
result.inc_present_fast(self.col_view(first));
|
||||
for &c in rest { result.mask_with(self.col_view(c)); }
|
||||
}
|
||||
result.freeze()
|
||||
}
|
||||
|
||||
fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
// max of 0/1 values = OR: 1 if any column is 1
|
||||
let any = self.partial_group_any(g, 1)?;
|
||||
let n = any.len();
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
result.inc_present(any.view());
|
||||
result.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
// ── Shared matrix helpers (also used by intmatrix.rs) ─────────────────────────
|
||||
|
||||
fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
|
||||
(0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect()
|
||||
@@ -456,3 +498,30 @@ where T: Clone + Default {
|
||||
for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; }
|
||||
m
|
||||
}
|
||||
|
||||
/// Compute a symmetric `n×n` matrix in parallel by evaluating `f(i,j)` for
|
||||
/// all upper-triangle pairs. `T: Copy` avoids the `.clone()` needed for the
|
||||
/// lower-triangle mirror.
|
||||
pub(crate) fn pairwise_matrix<T>(n: usize, f: impl Fn(usize, usize) -> T + Sync) -> Array2<T>
|
||||
where T: Copy + Default + Send {
|
||||
let results: Vec<(usize, usize, T)> = upper_pairs(n)
|
||||
.into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
}
|
||||
|
||||
/// Same as `pairwise_matrix` but `f` returns two values that fill two
|
||||
/// symmetric matrices simultaneously (e.g. intersection + union for Jaccard).
|
||||
pub(crate) fn pairwise2_matrix<T>(n: usize, f: impl Fn(usize, usize) -> (T, T) + Sync) -> (Array2<T>, Array2<T>)
|
||||
where T: Copy + Default + Send {
|
||||
let results: Vec<(usize, usize, T, T)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| { let (a, b) = f(i, j); (i, j, a, b) })
|
||||
.collect();
|
||||
let mut m0 = Array2::from_elem((n, n), T::default());
|
||||
let mut m1 = Array2::from_elem((n, n), T::default());
|
||||
for (i, j, a, b) in results {
|
||||
m0[[i, j]] = a; m0[[j, i]] = a;
|
||||
m1[[i, j]] = b; m1[[j, i]] = b;
|
||||
}
|
||||
(m0, m1)
|
||||
}
|
||||
|
||||
+197
-155
@@ -5,25 +5,21 @@ use std::path::{Path, PathBuf};
|
||||
use memmap2::{Mmap, MmapMut};
|
||||
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
use crate::views::{BitSliceIter, BitSliceView, IntSliceView};
|
||||
|
||||
const MAGIC: [u8; 4] = *b"PBIV";
|
||||
|
||||
// Header: magic(4) + _pad(4) + n(8) = 16 bytes.
|
||||
// Data starts at offset 16, which is divisible by 8 → u64-aligned
|
||||
// (mmap base is page-aligned, 16 % 8 == 0).
|
||||
// Data starts at offset 16, u64-aligned (mmap base is page-aligned, 16 % 8 == 0).
|
||||
const HEADER_SIZE: usize = 16;
|
||||
|
||||
#[inline]
|
||||
fn n_words(n: usize) -> usize {
|
||||
n.div_ceil(64)
|
||||
}
|
||||
pub(crate) fn n_words(n: usize) -> usize { n.div_ceil(64) }
|
||||
|
||||
#[inline]
|
||||
fn n_bytes_for_words(n: usize) -> usize {
|
||||
n_words(n) * 8
|
||||
}
|
||||
fn n_bytes_for_words(n: usize) -> usize { n_words(n) * 8 }
|
||||
|
||||
// ── Reader ────────────────────────────────────────────────────────────────────
|
||||
// ── PersistentBitVec ──────────────────────────────────────────────────────────
|
||||
|
||||
pub struct PersistentBitVec {
|
||||
mmap: Mmap,
|
||||
@@ -35,110 +31,64 @@ impl PersistentBitVec {
|
||||
pub fn open(path: &Path) -> io::Result<Self> {
|
||||
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
||||
if mmap.len() < HEADER_SIZE {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"PBIV file too short",
|
||||
));
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "PBIV file too short"));
|
||||
}
|
||||
if &mmap[0..4] != &MAGIC {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PBIV magic"));
|
||||
}
|
||||
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
||||
Ok(Self {
|
||||
mmap,
|
||||
n,
|
||||
path: path.to_path_buf(),
|
||||
})
|
||||
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||
}
|
||||
|
||||
pub fn path(&self) -> &Path {
|
||||
&self.path
|
||||
}
|
||||
pub fn len(&self) -> usize {
|
||||
self.n
|
||||
}
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.n == 0
|
||||
}
|
||||
pub fn path(&self) -> &Path { &self.path }
|
||||
pub fn len(&self) -> usize { self.n }
|
||||
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||
|
||||
pub fn get(&self, slot: usize) -> bool {
|
||||
(self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
|
||||
}
|
||||
|
||||
// Used by iter() and get(): exact byte window, no padding.
|
||||
fn data_bytes(&self) -> &[u8] {
|
||||
&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n.div_ceil(8)]
|
||||
}
|
||||
|
||||
// Bulk word view. SAFETY: mmap is page-aligned, HEADER_SIZE=16 is divisible by 8,
|
||||
// so &mmap[HEADER_SIZE] is u64-aligned. Slice length is n_words * 8 bytes.
|
||||
// SAFETY: mmap is page-aligned, HEADER_SIZE=16 divisible by 8 → u64-aligned.
|
||||
fn data_words(&self) -> &[u64] {
|
||||
let nw = n_words(self.n);
|
||||
let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
|
||||
unsafe { std::slice::from_raw_parts(ptr, nw) }
|
||||
}
|
||||
|
||||
pub fn count_ones(&self) -> u64 {
|
||||
// Padding bits in the last word are 0, so no masking needed.
|
||||
self.data_words()
|
||||
.iter()
|
||||
.map(|w| w.count_ones() as u64)
|
||||
.sum()
|
||||
pub fn view(&self) -> BitSliceView<'_> {
|
||||
BitSliceView::new(self.data_words(), self.n)
|
||||
}
|
||||
|
||||
pub fn count_zeros(&self) -> u64 {
|
||||
self.n as u64 - self.count_ones()
|
||||
}
|
||||
pub fn words(&self) -> &[u64] { self.data_words() }
|
||||
|
||||
pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 {
|
||||
let (inter, union) = self.partial_jaccard_dist(other);
|
||||
if union == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
1.0 - inter as f64 / union as f64
|
||||
}
|
||||
pub fn count_ones(&self) -> u64 { self.view().count_ones() }
|
||||
pub fn count_zeros(&self) -> u64 { self.view().count_zeros() }
|
||||
|
||||
pub fn partial_jaccard_dist(&self, other: &PersistentBitVec) -> (u64, u64) {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
self.data_words()
|
||||
.iter()
|
||||
.zip(other.data_words())
|
||||
.fold((0u64, 0u64), |(i, u), (&a, &b)| {
|
||||
(
|
||||
i + (a & b).count_ones() as u64,
|
||||
u + (a | b).count_ones() as u64,
|
||||
)
|
||||
})
|
||||
self.view().partial_jaccard_dist(other.view())
|
||||
}
|
||||
pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 {
|
||||
self.view().jaccard_dist(other.view())
|
||||
}
|
||||
|
||||
pub fn hamming_dist(&self, other: &PersistentBitVec) -> u64 {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
self.data_words()
|
||||
.iter()
|
||||
.zip(other.data_words())
|
||||
.map(|(&a, &b)| (a ^ b).count_ones() as u64)
|
||||
.sum()
|
||||
self.view().hamming_dist(other.view())
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> BitIter<'_> {
|
||||
BitIter {
|
||||
bytes: self.data_bytes(),
|
||||
slot: 0,
|
||||
n: self.n,
|
||||
}
|
||||
BitIter { words: self.data_words(), slot: 0, n: self.n }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> IntoIterator for &'a PersistentBitVec {
|
||||
type Item = bool;
|
||||
type IntoIter = BitIter<'a>;
|
||||
fn into_iter(self) -> BitIter<'a> {
|
||||
self.iter()
|
||||
}
|
||||
fn into_iter(self) -> BitIter<'a> { self.iter() }
|
||||
}
|
||||
|
||||
// ── BitIter ───────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct BitIter<'a> {
|
||||
bytes: &'a [u8],
|
||||
words: &'a [u64],
|
||||
slot: usize,
|
||||
n: usize,
|
||||
}
|
||||
@@ -147,45 +97,79 @@ impl ExactSizeIterator for BitIter<'_> {}
|
||||
|
||||
impl Iterator for BitIter<'_> {
|
||||
type Item = bool;
|
||||
|
||||
fn next(&mut self) -> Option<bool> {
|
||||
if self.slot >= self.n {
|
||||
return None;
|
||||
}
|
||||
let v = (self.bytes[self.slot >> 3] >> (self.slot & 7)) & 1 != 0;
|
||||
if self.slot >= self.n { return None; }
|
||||
let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0;
|
||||
self.slot += 1;
|
||||
Some(v)
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let rem = self.n - self.slot;
|
||||
(rem, Some(rem))
|
||||
}
|
||||
}
|
||||
|
||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
||||
// ── PersistentBitVecBuilder ───────────────────────────────────────────────────
|
||||
|
||||
pub struct PersistentBitVecBuilder {
|
||||
mmap: MmapMut,
|
||||
n: usize,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl PersistentBitVecBuilder {
|
||||
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
||||
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||
let mut file = OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.read(true).write(true).create(true).truncate(true)
|
||||
.open(path)?;
|
||||
file.write_all(&MAGIC)?;
|
||||
file.write_all(&[0u8; 4])?; // padding
|
||||
file.write_all(&[0u8; 4])?;
|
||||
file.write_all(&(n as u64).to_le_bytes())?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
file.set_len(file_size as u64)?;
|
||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
Ok(Self { mmap, n })
|
||||
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||
}
|
||||
|
||||
pub fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
|
||||
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||
let file = OpenOptions::new()
|
||||
.read(true).write(true).create(true).truncate(true)
|
||||
.open(path)?;
|
||||
file.set_len(file_size as u64)?;
|
||||
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
mmap[0..4].copy_from_slice(&MAGIC);
|
||||
mmap[8..16].copy_from_slice(&(n as u64).to_le_bytes());
|
||||
mmap[HEADER_SIZE..HEADER_SIZE + bytes.len()].copy_from_slice(bytes);
|
||||
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||
}
|
||||
|
||||
/// Create an all-ones bit vector of length `n` at `path`.
|
||||
///
|
||||
/// More efficient than `new(n, path)` + `not()`: the data is written as
|
||||
/// 0xFF bytes in a single sequential pass, with no intermediate all-zeros state.
|
||||
pub fn new_ones(n: usize, path: &Path) -> io::Result<Self> {
|
||||
let nw = n_words(n);
|
||||
let file_size = HEADER_SIZE + nw * 8;
|
||||
let mut file = OpenOptions::new()
|
||||
.read(true).write(true).create(true).truncate(true)
|
||||
.open(path)?;
|
||||
file.write_all(&MAGIC)?;
|
||||
file.write_all(&[0u8; 4])?;
|
||||
file.write_all(&(n as u64).to_le_bytes())?;
|
||||
file.write_all(&vec![0xFFu8; nw * 8])?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
file.set_len(file_size as u64)?;
|
||||
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
// Clear padding bits in the last word so trailing bits are always 0.
|
||||
let rem = n % 64;
|
||||
if rem != 0 {
|
||||
let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
|
||||
let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
|
||||
words[nw - 1] &= (1u64 << rem) - 1;
|
||||
}
|
||||
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||
}
|
||||
|
||||
pub fn build_from(source: &PersistentBitVec, path: &Path) -> io::Result<Self> {
|
||||
@@ -193,28 +177,53 @@ impl PersistentBitVecBuilder {
|
||||
let file = OpenOptions::new().read(true).write(true).open(path)?;
|
||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
let n = source.len();
|
||||
Ok(Self { mmap, n })
|
||||
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.n
|
||||
pub fn build_from_counts(source: &PersistentCompactIntVec, threshold: u32, path: &Path) -> io::Result<Self> {
|
||||
let n = source.len();
|
||||
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||
let mut file = OpenOptions::new()
|
||||
.read(true).write(true).create(true).truncate(true)
|
||||
.open(path)?;
|
||||
file.write_all(&MAGIC)?;
|
||||
file.write_all(&[0u8; 4])?;
|
||||
file.write_all(&(n as u64).to_le_bytes())?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
file.set_len(file_size as u64)?;
|
||||
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
{
|
||||
let nw = n_words(n);
|
||||
let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
|
||||
let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
|
||||
for (slot, count) in source.iter().enumerate() {
|
||||
if count >= threshold { words[slot >> 6] |= 1u64 << (slot & 63); }
|
||||
}
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.n == 0
|
||||
}
|
||||
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||
}
|
||||
|
||||
pub fn build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
||||
Self::build_from_counts(source, 1, path)
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize { self.n }
|
||||
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||
|
||||
pub fn get(&self, slot: usize) -> bool {
|
||||
(self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
|
||||
}
|
||||
|
||||
pub fn set(&mut self, slot: usize, value: bool) {
|
||||
let byte = HEADER_SIZE + (slot >> 3);
|
||||
let bit = 1u8 << (slot & 7);
|
||||
if value {
|
||||
self.mmap[byte] |= bit;
|
||||
} else {
|
||||
self.mmap[byte] &= !bit;
|
||||
let bit = 1u64 << (slot & 63);
|
||||
if value { self.data_words_mut()[slot >> 6] |= bit; }
|
||||
else { self.data_words_mut()[slot >> 6] &= !bit; }
|
||||
}
|
||||
|
||||
fn data_words(&self) -> &[u64] {
|
||||
let nw = n_words(self.n);
|
||||
let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
|
||||
unsafe { std::slice::from_raw_parts(ptr, nw) }
|
||||
}
|
||||
|
||||
// SAFETY: same alignment argument as PersistentBitVec::data_words.
|
||||
@@ -224,83 +233,116 @@ impl PersistentBitVecBuilder {
|
||||
unsafe { std::slice::from_raw_parts_mut(ptr, nw) }
|
||||
}
|
||||
|
||||
pub fn and(&mut self, other: &PersistentBitVec) {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
|
||||
*sw &= ow;
|
||||
}
|
||||
pub fn view(&self) -> BitSliceView<'_> {
|
||||
BitSliceView::new(self.data_words(), self.n)
|
||||
}
|
||||
|
||||
pub fn or(&mut self, other: &PersistentBitVec) {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
|
||||
*sw |= ow;
|
||||
}
|
||||
pub fn words(&self) -> &[u64] { self.data_words() }
|
||||
|
||||
pub fn copy_from(&mut self, src: BitSliceView<'_>) {
|
||||
assert_eq!(self.n, src.len(), "BitSliceView length mismatch");
|
||||
self.data_words_mut().copy_from_slice(src.words());
|
||||
}
|
||||
|
||||
pub fn xor(&mut self, other: &PersistentBitVec) {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
|
||||
*sw ^= ow;
|
||||
/// In-place bitwise AND with `other`, word by word.
///
/// # Panics
/// Panics if `other` does not have the same bit length as this builder.
pub fn and(&mut self, other: BitSliceView<'_>) {
    assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
    self.data_words_mut()
        .iter_mut()
        .zip(other.words())
        .for_each(|(dst, &src)| *dst &= src);
}
|
||||
|
||||
/// In-place bitwise OR with `other`, word by word.
///
/// # Panics
/// Panics if `other` does not have the same bit length as this builder.
pub fn or(&mut self, other: BitSliceView<'_>) {
    assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
    self.data_words_mut()
        .iter_mut()
        .zip(other.words())
        .for_each(|(dst, &src)| *dst |= src);
}
|
||||
|
||||
/// In-place bitwise XOR with `other`, word by word.
///
/// # Panics
/// Panics if `other` does not have the same bit length as this builder.
pub fn xor(&mut self, other: BitSliceView<'_>) {
    assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
    self.data_words_mut()
        .iter_mut()
        .zip(other.words())
        .for_each(|(dst, &src)| *dst ^= src);
}
|
||||
|
||||
pub fn not(&mut self) {
|
||||
let rem = self.n % 64;
|
||||
let words = self.data_words_mut();
|
||||
for w in words.iter_mut() {
|
||||
*w ^= u64::MAX;
|
||||
}
|
||||
// Zero padding bits in the last word so count_ones / jaccard remain correct.
|
||||
for w in words.iter_mut() { *w ^= u64::MAX; }
|
||||
if rem != 0 {
|
||||
if let Some(last) = words.last_mut() {
|
||||
*last &= (1u64 << rem) - 1;
|
||||
}
|
||||
if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; }
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a count vector to a bit vector: bit set iff count >= threshold.
|
||||
/// Fills u64 words directly from the count iterator — O(n), no bit-level set() overhead.
|
||||
pub fn build_from_counts(
|
||||
source: &PersistentCompactIntVec,
|
||||
threshold: u32,
|
||||
path: &Path,
|
||||
) -> io::Result<Self> {
|
||||
let n = source.len();
|
||||
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||
let mut file = OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(path)?;
|
||||
file.write_all(&MAGIC)?;
|
||||
file.write_all(&[0u8; 4])?;
|
||||
file.write_all(&(n as u64).to_le_bytes())?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
file.set_len(file_size as u64)?;
|
||||
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
|
||||
{
|
||||
/// OR in bits at slots where `pred(col[slot])` is true.
|
||||
pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||
assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
|
||||
let n = self.n;
|
||||
let primary = col.primary_bytes();
|
||||
let words = self.data_words_mut();
|
||||
let nw = n_words(n);
|
||||
let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
|
||||
let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
|
||||
for (slot, count) in source.iter().enumerate() {
|
||||
if count >= threshold {
|
||||
words[slot >> 6] |= 1u64 << (slot & 63);
|
||||
for wi in 0..nw {
|
||||
let base = wi * 64;
|
||||
let limit = (base + 64).min(n);
|
||||
let mut mask = 0u64;
|
||||
for bit in 0..(limit - base) {
|
||||
let b = primary[base + bit];
|
||||
if b < 255 && pred(b as u32) { mask |= 1u64 << bit; }
|
||||
}
|
||||
words[wi] |= mask;
|
||||
}
|
||||
for (slot, val) in col.overflow_entries() {
|
||||
if pred(val) { words[slot >> 6] |= 1u64 << (slot & 63); }
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self { mmap, n })
|
||||
/// Clear bits at slots where `pred(col[slot])` is false.
|
||||
pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||
assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
|
||||
let n = self.n;
|
||||
let primary = col.primary_bytes();
|
||||
let words = self.data_words_mut();
|
||||
let nw = n_words(n);
|
||||
for wi in 0..nw {
|
||||
let base = wi * 64;
|
||||
let limit = (base + 64).min(n);
|
||||
let mut mask = 0u64;
|
||||
for bit in 0..(limit - base) {
|
||||
let b = primary[base + bit];
|
||||
if b < 255 && !pred(b as u32) { mask |= 1u64 << bit; }
|
||||
}
|
||||
words[wi] &= !mask;
|
||||
}
|
||||
for (slot, val) in col.overflow_entries() {
|
||||
if !pred(val) { words[slot >> 6] &= !(1u64 << (slot & 63)); }
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a count vector to a presence/absence bit vector (threshold = 1).
|
||||
pub fn build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
||||
Self::build_from_counts(source, 1, path)
|
||||
/// Toggle bits at slots where `pred(col[slot])` is true.
|
||||
pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||
assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
|
||||
let n = self.n;
|
||||
let primary = col.primary_bytes();
|
||||
let words = self.data_words_mut();
|
||||
let nw = n_words(n);
|
||||
for wi in 0..nw {
|
||||
let base = wi * 64;
|
||||
let limit = (base + 64).min(n);
|
||||
let mut mask = 0u64;
|
||||
for bit in 0..(limit - base) {
|
||||
let b = primary[base + bit];
|
||||
if b < 255 && pred(b as u32) { mask |= 1u64 << bit; }
|
||||
}
|
||||
words[wi] ^= mask;
|
||||
}
|
||||
for (slot, val) in col.overflow_entries() {
|
||||
if pred(val) { words[slot >> 6] ^= 1u64 << (slot & 63); }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.mmap.flush()
|
||||
pub fn iter(&self) -> BitSliceIter<'_> {
|
||||
self.view().iter()
|
||||
}
|
||||
|
||||
pub fn close(self) -> io::Result<()> { self.mmap.flush() }
|
||||
|
||||
pub fn finish(self) -> io::Result<PersistentBitVec> {
|
||||
let path = self.path.clone();
|
||||
self.close()?;
|
||||
PersistentBitVec::open(&path)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,8 +5,9 @@ use std::path::{Path, PathBuf};
|
||||
|
||||
use memmap2::MmapMut;
|
||||
|
||||
use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, finalize_pciv};
|
||||
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, finalize_pciv, parse_overflow_entry};
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
use crate::views::{BitSliceView, IntSliceView};
|
||||
|
||||
pub struct PersistentCompactIntVecBuilder {
|
||||
path: PathBuf,
|
||||
@@ -16,59 +17,44 @@ pub struct PersistentCompactIntVecBuilder {
|
||||
}
|
||||
|
||||
impl PersistentCompactIntVecBuilder {
|
||||
/// Create a new, zero-filled PCIV at `path`. Primary is mmapped immediately.
|
||||
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
||||
let file = OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.read(true).write(true).create(true).truncate(true)
|
||||
.open(path)?;
|
||||
file.set_len((HEADER_SIZE + n) as u64)?;
|
||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
Ok(Self {
|
||||
path: path.to_path_buf(),
|
||||
mmap,
|
||||
n,
|
||||
overflow: HashMap::new(),
|
||||
})
|
||||
Ok(Self { path: path.to_path_buf(), mmap, n, overflow: HashMap::new() })
|
||||
}
|
||||
|
||||
pub fn from_raw_primary(primary: &[u8], overflow: HashMap<usize, u32>, path: &Path) -> io::Result<Self> {
|
||||
let n = primary.len();
|
||||
let file = OpenOptions::new()
|
||||
.read(true).write(true).create(true).truncate(true)
|
||||
.open(path)?;
|
||||
file.set_len((HEADER_SIZE + n) as u64)?;
|
||||
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(primary);
|
||||
Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
|
||||
}
|
||||
|
||||
/// Copy `source`'s file to `path`, mmap the primary section, load overflow into RAM.
|
||||
/// Avoids iterating all n slots: the file copy is OS-level, overflow loading is O(n_overflow).
|
||||
pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
||||
fs::copy(source.path(), path)?;
|
||||
|
||||
let file = OpenOptions::new().read(true).write(true).open(path)?;
|
||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
|
||||
let n = source.len();
|
||||
let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
|
||||
let data_offset = HEADER_SIZE + n;
|
||||
|
||||
let mut overflow = HashMap::with_capacity(n_overflow);
|
||||
for i in 0..n_overflow {
|
||||
let off = data_offset + i * OVERFLOW_ENTRY_SIZE;
|
||||
let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize;
|
||||
let value = u32::from_le_bytes(mmap[off + 8..off + 12].try_into().unwrap());
|
||||
let (slot, value) = parse_overflow_entry(&mmap, data_offset, i);
|
||||
overflow.insert(slot, value);
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
path: path.to_path_buf(),
|
||||
mmap,
|
||||
n,
|
||||
overflow,
|
||||
})
|
||||
Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
|
||||
}
|
||||
|
||||
/// Get the value at the given slot, handling overflow if necessary.
|
||||
pub fn get(&self, slot: usize) -> u32 {
|
||||
match self.mmap[HEADER_SIZE + slot] {
|
||||
255 => *self
|
||||
.overflow
|
||||
.get(&slot)
|
||||
.expect("sentinel without overflow entry"),
|
||||
255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
|
||||
v => v as u32,
|
||||
}
|
||||
}
|
||||
@@ -83,61 +69,201 @@ impl PersistentCompactIntVecBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.n
|
||||
pub fn len(&self) -> usize { self.n }
|
||||
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||
|
||||
pub fn primary_bytes(&self) -> &[u8] { &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
|
||||
pub fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
|
||||
pub fn clear_overflow(&mut self) { self.overflow.clear(); }
|
||||
|
||||
pub fn sum(&self) -> u64 {
|
||||
byte_sum(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n], self.overflow.values().copied())
|
||||
}
|
||||
pub fn count_nonzero(&self) -> u64 {
|
||||
byte_count_nonzero(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n])
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.n == 0
|
||||
pub fn view(&self) -> IntSliceView<'_> {
|
||||
// Builder overflow is a HashMap, not sorted raw bytes — convert on the fly
|
||||
// by collecting into a sorted vec and storing in a thread-local buffer.
|
||||
// For read-back during building, just call get(slot) directly.
|
||||
// view() is primarily useful AFTER freeze (on PersistentCompactIntVec).
|
||||
// Here we expose it via a zero-alloc path: primary only, no overflow raw.
|
||||
// Callers that need overflow_entries during building use overflow_entries().
|
||||
let primary = &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n];
|
||||
IntSliceView::new(primary, &[], 0, self.n)
|
||||
}
|
||||
|
||||
pub fn min(&mut self, other: &PersistentCompactIntVec) {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
for (slot, other_val) in other.iter().enumerate() {
|
||||
if other_val < self.get(slot) {
|
||||
self.set(slot, other_val);
|
||||
pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
||||
self.overflow.iter().map(|(&k, &v)| (k, v))
|
||||
}
|
||||
|
||||
pub fn inc(&mut self, slot: usize) {
|
||||
let v = self.get(slot);
|
||||
self.set(slot, v.saturating_add(1));
|
||||
}
|
||||
|
||||
// ── Computation methods ───────────────────────────────────────────────────
|
||||
|
||||
/// Increment one counter per 1-bit of `col`. Safe for any group size.
|
||||
pub fn inc_present(&mut self, col: BitSliceView<'_>) {
|
||||
let n = self.n;
|
||||
for (wi, &word) in col.words().iter().enumerate() {
|
||||
if word == 0 { continue; }
|
||||
let mut w = word;
|
||||
while w != 0 {
|
||||
let bit = w.trailing_zeros() as usize;
|
||||
let slot = wi * 64 + bit;
|
||||
if slot < n { self.inc(slot); }
|
||||
w &= w - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max(&mut self, other: &PersistentCompactIntVec) {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
for (slot, other_val) in other.iter().enumerate() {
|
||||
if other_val > self.get(slot) {
|
||||
self.set(slot, other_val);
|
||||
/// Increment one counter per 1-bit of `col`, using raw u8 arithmetic.
|
||||
/// Caller guarantees no counter will reach 255 (group size < 255).
|
||||
pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
|
||||
{
|
||||
let primary = self.primary_bytes_mut();
|
||||
let n = primary.len();
|
||||
for (wi, &word) in col.words().iter().enumerate() {
|
||||
if word == 0 { continue; }
|
||||
let mut w = word;
|
||||
while w != 0 {
|
||||
let bit = w.trailing_zeros() as usize;
|
||||
let s = wi * 64 + bit;
|
||||
if s < n { primary[s] += 1; }
|
||||
w &= w - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
debug_assert!(
|
||||
!self.primary_bytes().contains(&255),
|
||||
"sentinel 255 reached in inc_present_fast — group size must be < 255"
|
||||
);
|
||||
}
|
||||
|
||||
/// Two-pass: primary bytes then overflow. Increments `self[slot]` for each
|
||||
/// slot where `pred(col[slot])` is true. Safe for any group size.
|
||||
pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||
let n = col.len();
|
||||
for slot in 0..n {
|
||||
let b = col.primary_bytes()[slot];
|
||||
if b < 255 && pred(b as u32) {
|
||||
self.inc(slot);
|
||||
}
|
||||
}
|
||||
for (slot, val) in col.overflow_entries() {
|
||||
if pred(val) { self.inc(slot); }
|
||||
}
|
||||
}
|
||||
|
||||
/// Fast two-pass: raw u8 arithmetic. Caller guarantees no counter reaches 255.
|
||||
pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||
let n = col.len();
|
||||
{
|
||||
let primary = self.primary_bytes_mut();
|
||||
for slot in 0..n {
|
||||
let b = col.primary_bytes()[slot];
|
||||
if b < 255 && pred(b as u32) {
|
||||
primary[slot] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (slot, val) in col.overflow_entries() {
|
||||
if pred(val) { self.primary_bytes_mut()[slot] += 1; }
|
||||
}
|
||||
debug_assert!(
|
||||
!self.primary_bytes().contains(&255),
|
||||
"sentinel 255 reached in inc_predicate_fast — group size must be < 255"
|
||||
);
|
||||
}
|
||||
|
||||
/// Element-wise addition: `self[s] += other[s]` for every slot of `self`.
///
/// When both operands are stored inline (primary byte below the 255
/// sentinel) the sum is computed from the bytes and written straight back
/// if it still fits below the sentinel; otherwise the result is stored
/// through `set`. When either side is an overflow value, both are read
/// through `get`.
///
/// # Panics
/// Panics with "u32 overflow in add" if a sum does not fit in `u32`,
/// instead of wrapping silently in release builds. Also panics if `other`
/// has fewer primary bytes than `self` has slots.
pub fn add(&mut self, other: IntSliceView<'_>) {
    for s in 0..self.n {
        let sb = self.primary_bytes()[s];
        let ob = other.primary_bytes()[s];
        if sb < 255 && ob < 255 {
            // Two bytes can never overflow a u32 (max 254 + 254).
            let sum = u32::from(sb) + u32::from(ob);
            if sum < 255 {
                self.primary_bytes_mut()[s] = sum as u8;
            } else {
                self.set(s, sum);
            }
        } else {
            let sv = self.get(s);
            let ov = other.get(s);
            let total = sv.checked_add(ov).expect("u32 overflow in add");
            self.set(s, total);
        }
    }
}
|
||||
|
||||
pub fn add(&mut self, other: &PersistentCompactIntVec) {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
for (slot, other_val) in other.iter().enumerate() {
|
||||
let cur = self.get(slot);
|
||||
self.set(slot, cur.checked_add(other_val).expect("u32 overflow in add"));
|
||||
pub fn min(&mut self, other: IntSliceView<'_>) {
|
||||
let self_ov: Vec<(usize, u32)> = self.overflow_entries().collect();
|
||||
let other_ov: HashMap<usize, u32> = other.overflow_entries().collect();
|
||||
self.clear_overflow();
|
||||
for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
|
||||
if b < *a { *a = b; }
|
||||
}
|
||||
for (slot, self_val) in self_ov {
|
||||
if let Some(&other_val) = other_ov.get(&slot) {
|
||||
self.set(slot, self_val.min(other_val));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn diff(&mut self, other: &PersistentCompactIntVec) {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
for (slot, other_val) in other.iter().enumerate() {
|
||||
self.set(slot, self.get(slot).saturating_sub(other_val));
|
||||
/// Element-wise maximum of `self` and `other`, stored in `self`.
///
/// The overflow entries of `other` are merged first through `get`/`set`;
/// the primary bytes are then raised byte-wise.
pub fn max(&mut self, other: IntSliceView<'_>) {
    for (slot, their_val) in other.overflow_entries() {
        let merged = self.get(slot).max(their_val);
        self.set(slot, merged);
    }

    let their_bytes = other.primary_bytes();
    for (mine, &theirs) in self.primary_bytes_mut().iter_mut().zip(their_bytes) {
        if *mine < theirs {
            *mine = theirs;
        }
    }
}
|
||||
|
||||
/// Element-wise saturating subtraction: `self[s] = self[s] - other[s]`,
/// clamped at zero, for every slot of `self`.
pub fn diff(&mut self, other: IntSliceView<'_>) {
    for s in 0..self.n {
        let mine = self.primary_bytes()[s];
        let theirs = other.primary_bytes()[s];
        if mine == 255 {
            // Our value lives in the overflow map: go through get/set.
            let lhs = self.get(s);
            let rhs = if theirs == 255 { other.get(s) } else { theirs as u32 };
            self.set(s, lhs.saturating_sub(rhs));
        } else {
            // Our value is inline. A sentinel on the other side results in 0.
            let reduced = if theirs == 255 { 0 } else { mine.saturating_sub(theirs) };
            self.primary_bytes_mut()[s] = reduced;
        }
    }
}
|
||||
|
||||
/// Zero every slot whose bit in `mask` is 0.
///
/// Only the words present in `mask` are examined, and positions at or
/// beyond `self.n` are ignored. A slot is rewritten through `set` only
/// when its primary byte is non-zero.
pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
    let limit = self.n;
    for (wi, &word) in mask.words().iter().enumerate() {
        // Walk the zero bits of the word; an all-ones word has none.
        let mut cleared = !word;
        while cleared != 0 {
            let s = wi * 64 + cleared.trailing_zeros() as usize;
            if s < limit && self.primary_bytes()[s] != 0 {
                self.set(s, 0);
            }
            cleared &= cleared - 1;
        }
    }
}
|
||||
|
||||
/// Flush the primary mmap, then write sorted overflow data + index and fix the header.
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.mmap.flush()?;
|
||||
let Self {
|
||||
path,
|
||||
mmap,
|
||||
n,
|
||||
overflow,
|
||||
} = self;
|
||||
let Self { path, mmap, n, overflow } = self;
|
||||
drop(mmap);
|
||||
|
||||
let mut entries: Vec<(usize, u32)> = overflow.into_iter().collect();
|
||||
entries.sort_unstable_by_key(|&(slot, _)| slot);
|
||||
|
||||
finalize_pciv(&path, n, &entries)
|
||||
}
|
||||
|
||||
pub fn finish(self) -> io::Result<PersistentCompactIntVec> {
|
||||
let path = self.path.clone();
|
||||
self.close()?;
|
||||
PersistentCompactIntVec::open(&path)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,137 @@
|
||||
use std::io;
|
||||
|
||||
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||
use crate::tempintvec::TempCompactIntVec;
|
||||
|
||||
// ── ColGroup ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/// A named set of column indices within the matrix.
///
/// A group is defined once at the index level. The same indices apply to
/// every partition and layer, because the column structure (samples /
/// genomes) is identical everywhere; only the row space (kmer slots) is
/// partitioned.
pub struct ColGroup {
    pub name: String,
    pub indices: Vec<usize>,
}

impl ColGroup {
    /// Build a group from anything convertible into a `String` and the list
    /// of column indices it contains.
    pub fn new(name: impl Into<String>, indices: Vec<usize>) -> Self {
        let name = name.into();
        ColGroup { name, indices }
    }
}
|
||||
|
||||
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Per-matrix group aggregations.
|
||||
///
|
||||
/// `partial_group_presence_count`, `partial_group_sum`, `partial_group_any`,
|
||||
/// `partial_group_min`, `partial_group_max` are the primitives; each impl must
|
||||
/// provide all five.
|
||||
///
|
||||
/// `partial_group_all` and `partial_group_none` have default implementations
|
||||
/// derived from `partial_group_presence_count` and should rarely need overriding.
|
||||
pub trait MatrixGroupOps {
|
||||
/// Per-slot count of group columns whose value ≥ `threshold`.
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec>;
|
||||
|
||||
/// Per-slot sum of values across all group columns.
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
|
||||
|
||||
/// Per-slot OR: 1 if any group column has value ≥ `threshold`.
|
||||
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec>;
|
||||
|
||||
/// Per-slot min value across all group columns (0 if group is empty).
|
||||
fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
|
||||
|
||||
/// Per-slot max value across all group columns (0 if group is empty).
|
||||
fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
|
||||
|
||||
/// Per-slot AND: 1 if ALL group columns have value ≥ `threshold`.
|
||||
fn partial_group_all(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
|
||||
let counts = self.partial_group_presence_count(g, threshold)?;
|
||||
let n = counts.len();
|
||||
let n_required = g.indices.len() as u32;
|
||||
let mut b = TempBitVecBuilder::new(n)?;
|
||||
b.or_where(counts.view(), |v| v >= n_required);
|
||||
b.freeze()
|
||||
}
|
||||
|
||||
/// Per-slot NOR: 1 if NO group column has value ≥ `threshold`.
|
||||
fn partial_group_none(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
|
||||
let counts = self.partial_group_presence_count(g, threshold)?;
|
||||
let n = counts.len();
|
||||
let mut b = TempBitVecBuilder::new(n)?;
|
||||
b.or_where(counts.view(), |v| v == 0);
|
||||
b.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
// ── FilterMask — expression tree for column-based slot filters ────────────────
|
||||
|
||||
/// Composable filter expression, evaluated against a matrix with column
/// operations only (no MPHF lookup per kmer).
///
/// `threshold` has the same meaning as in
/// [`MatrixGroupOps::partial_group_presence_count`]: a column counts for a
/// slot when its value is **≥ threshold**. To reproduce the row-level
/// filter (`value > t`), pass `t + 1`.
#[derive(Debug, Clone)]
pub enum FilterMask {
    /// Passes when the number of columns in `indices` with value ≥ `threshold`
    /// is ≥ `min_count`.
    PresenceGeq {
        indices: Vec<usize>,
        threshold: u32,
        min_count: usize,
    },
    /// Passes when the number of columns in `indices` with value ≥ `threshold`
    /// is ≤ `max_count`.
    PresenceLeq {
        indices: Vec<usize>,
        threshold: u32,
        max_count: usize,
    },
    /// Passes when the sum of the values of the `indices` columns is ≥ `min_sum`.
    SumGeq {
        indices: Vec<usize>,
        min_sum: u32,
    },
    /// Passes when the sum of the values of the `indices` columns is ≤ `max_sum`.
    SumLeq {
        indices: Vec<usize>,
        max_sum: u32,
    },
    /// Passes when every sub-expression passes. An empty `And` is always true.
    And(Vec<FilterMask>),
}
|
||||
|
||||
/// Evaluate a [`FilterMask`] against `mat`, returning a per-slot `TempBitVec`
|
||||
/// where bit=1 means the slot passes the filter.
|
||||
pub fn eval_filter_mask(expr: &FilterMask, mat: &dyn MatrixGroupOps, n: usize) -> io::Result<TempBitVec> {
|
||||
match expr {
|
||||
FilterMask::PresenceGeq { indices, threshold, min_count } => {
|
||||
let g = ColGroup::new("", indices.clone());
|
||||
let counts = mat.partial_group_presence_count(&g, *threshold)?;
|
||||
let mut b = TempBitVecBuilder::new(n)?;
|
||||
let mc = *min_count as u32;
|
||||
b.or_where(counts.view(), |v| v >= mc);
|
||||
b.freeze()
|
||||
}
|
||||
FilterMask::PresenceLeq { indices, threshold, max_count } => {
|
||||
let g = ColGroup::new("", indices.clone());
|
||||
let counts = mat.partial_group_presence_count(&g, *threshold)?;
|
||||
let mut b = TempBitVecBuilder::new(n)?;
|
||||
let mc = *max_count as u32;
|
||||
b.or_where(counts.view(), |v| v <= mc);
|
||||
b.freeze()
|
||||
}
|
||||
FilterMask::SumGeq { indices, min_sum } => {
|
||||
let g = ColGroup::new("", indices.clone());
|
||||
let sums = mat.partial_group_sum(&g)?;
|
||||
let mut b = TempBitVecBuilder::new(n)?;
|
||||
let ms = *min_sum;
|
||||
b.or_where(sums.view(), |v| v >= ms);
|
||||
b.freeze()
|
||||
}
|
||||
FilterMask::SumLeq { indices, max_sum } => {
|
||||
let g = ColGroup::new("", indices.clone());
|
||||
let sums = mat.partial_group_sum(&g)?;
|
||||
let mut b = TempBitVecBuilder::new(n)?;
|
||||
let ms = *max_sum;
|
||||
b.or_where(sums.view(), |v| v <= ms);
|
||||
b.freeze()
|
||||
}
|
||||
FilterMask::And(parts) => {
|
||||
let mut b = TempBitVecBuilder::new_ones(n)?;
|
||||
for part in parts {
|
||||
let m = eval_filter_mask(part, mat, n)?;
|
||||
b.and(m.view());
|
||||
}
|
||||
b.freeze()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -13,6 +13,44 @@ pub const OVERFLOW_ENTRY_SIZE: usize = 12;
|
||||
// Index entry: slot(u64) + pos(u64) = 16 bytes.
|
||||
pub const INDEX_ENTRY_SIZE: usize = 16;
|
||||
|
||||
/// Total of all values in a compact-int vector, given its primary bytes and
/// the true values of its overflow slots.
///
/// `primary` is the raw `&[u8]` in which 255 is a sentinel for large values.
/// `overflow` yields the true value (≥ 255) of each sentinel, in any order.
/// Each overflow item cancels one 255 from the byte total and contributes
/// its real value instead.
#[inline]
pub(crate) fn byte_sum(primary: &[u8], overflow: impl Iterator<Item = u32>) -> u64 {
    let mut raw = 0u64;
    for &b in primary {
        raw += b as u64;
    }

    let mut sentinels = 0u64;
    let mut true_total = 0u64;
    for v in overflow {
        sentinels += 1;
        true_total += v as u64;
    }

    raw - 255 * sentinels + true_total
}
|
||||
|
||||
/// Number of non-zero values in a compact-int primary byte slice.
///
/// An overflow sentinel (255) is non-zero by construction, so testing each
/// byte against 0 is enough; the overflow data is never consulted.
#[inline]
pub(crate) fn byte_count_nonzero(primary: &[u8]) -> u64 {
    let mut present = 0usize;
    for &b in primary {
        if b != 0 {
            present += 1;
        }
    }
    present as u64
}
|
||||
|
||||
/// Parse a single overflow entry `(slot, value)` from a byte slice.
|
||||
#[inline]
|
||||
pub fn parse_overflow_entry(data: &[u8], base: usize, i: usize) -> (usize, u32) {
|
||||
let off = base + i * OVERFLOW_ENTRY_SIZE;
|
||||
let slot = u64::from_le_bytes(data[off..off+8].try_into().unwrap()) as usize;
|
||||
let value = u32::from_le_bytes(data[off+8..off+12].try_into().unwrap());
|
||||
(slot, value)
|
||||
}
|
||||
|
||||
/// Parse a single sparse-index entry `(slot, pos)` from a byte slice.
|
||||
#[inline]
|
||||
pub fn parse_index_entry(data: &[u8], base: usize, i: usize) -> (usize, usize) {
|
||||
let off = base + i * INDEX_ENTRY_SIZE;
|
||||
let slot = u64::from_le_bytes(data[off..off+8].try_into().unwrap()) as usize;
|
||||
let pos = u64::from_le_bytes(data[off+8..off+16].try_into().unwrap()) as usize;
|
||||
(slot, pos)
|
||||
}
|
||||
|
||||
// Sparse index target: ≤ 32 KB in L1 cache (16 B per entry → 2048 entries).
|
||||
pub const L1_INDEX_ENTRIES: usize = 2048;
|
||||
|
||||
|
||||
+136
-212
@@ -1,4 +1,3 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, BufWriter, Write as _};
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -7,10 +6,15 @@ use memmap2::Mmap;
|
||||
use ndarray::{Array1, Array2};
|
||||
use rayon::prelude::*;
|
||||
|
||||
use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
|
||||
use crate::builder::PersistentCompactIntVecBuilder;
|
||||
use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE};
|
||||
use crate::colgroup::{ColGroup, MatrixGroupOps};
|
||||
use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE};
|
||||
use crate::meta::MatrixMeta;
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||
use crate::views::IntSliceView;
|
||||
|
||||
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
||||
dir.join(format!("col_{col:06}.pciv"))
|
||||
@@ -41,9 +45,7 @@ impl ColumnarCompactIntMatrix {
|
||||
}
|
||||
|
||||
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||
for (c, col) in self.cols.iter().enumerate() {
|
||||
buf[c] = col.get(slot);
|
||||
}
|
||||
for (c, col) in self.cols.iter().enumerate() { buf[c] = col.get(slot); }
|
||||
}
|
||||
|
||||
pub(crate) fn sum(&self) -> Array1<u64> {
|
||||
@@ -63,49 +65,26 @@ impl ColumnarCompactIntMatrix {
|
||||
}
|
||||
|
||||
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
|
||||
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).partial_euclidean_dist(self.col(j)))
|
||||
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_threshold_jaccard_dist_matrix(
|
||||
&self, threshold: u32,
|
||||
) -> (Array2<u64>, Array2<u64>) {
|
||||
let n = self.n_cols();
|
||||
let pairs = upper_pairs(n);
|
||||
let results: Vec<(usize, usize, u64, u64)> = pairs
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| {
|
||||
let (inter, union) =
|
||||
self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold);
|
||||
(i, j, inter, union)
|
||||
})
|
||||
.collect();
|
||||
let mut inter_m = Array2::zeros((n, n));
|
||||
let mut union_m = Array2::zeros((n, n));
|
||||
for (i, j, inter, union) in results {
|
||||
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
|
||||
union_m[[i, j]] = union; union_m[[j, i]] = union;
|
||||
pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
|
||||
pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold))
|
||||
}
|
||||
(inter_m, union_m)
|
||||
}
|
||||
|
||||
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
pairwise_matrix(self.n_cols(), |i, j| {
|
||||
self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
pairwise_matrix(self.n_cols(), |i, j| {
|
||||
self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
pairwise_matrix(self.n_cols(), |i, j| {
|
||||
self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||
})
|
||||
}
|
||||
@@ -118,20 +97,6 @@ impl ColumnarCompactIntMatrix {
|
||||
meta.n_cols += 1;
|
||||
meta.save(dir)
|
||||
}
|
||||
|
||||
fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
|
||||
let n = self.n_cols();
|
||||
let results: Vec<(usize, usize, f64)> = upper_pairs(n)
|
||||
.into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
}
|
||||
|
||||
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
|
||||
let n = self.n_cols();
|
||||
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
|
||||
.into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
}
|
||||
}
|
||||
|
||||
// ── PackedCompactIntMatrix ────────────────────────────────────────────────────
|
||||
@@ -139,13 +104,10 @@ impl ColumnarCompactIntMatrix {
|
||||
const PCMX_MAGIC: [u8; 4] = *b"PCMX";
|
||||
const PCMX_HEADER: usize = 24; // magic(4) + pad(4) + n_rows(8) + n_cols(8)
|
||||
|
||||
/// Per-column metadata pre-parsed from the embedded PCIV header.
|
||||
struct ColInfo {
|
||||
primary_start: usize, // absolute mmap offset to primary array
|
||||
data_offset: usize, // absolute mmap offset to overflow array
|
||||
primary_start: usize,
|
||||
data_offset: usize,
|
||||
n_overflow: usize,
|
||||
step: usize,
|
||||
index: Vec<(usize, usize)>,
|
||||
}
|
||||
|
||||
pub struct PackedCompactIntMatrix {
|
||||
@@ -171,61 +133,31 @@ impl PackedCompactIntMatrix {
|
||||
for c in 0..n_cols {
|
||||
let off_pos = PCMX_HEADER + c * 8;
|
||||
let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize;
|
||||
// Parse embedded PCIV header at col_base
|
||||
let n_ov = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize;
|
||||
let n_idx = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize;
|
||||
let step = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize;
|
||||
let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap()) as usize;
|
||||
|
||||
let primary_start = col_base + HEADER_SIZE;
|
||||
let data_offset = primary_start + n_pciv;
|
||||
let index_offset = data_offset + n_ov * OVERFLOW_ENTRY_SIZE;
|
||||
|
||||
let mut index = Vec::with_capacity(n_idx);
|
||||
for i in 0..n_idx {
|
||||
let ioff = index_offset + i * INDEX_ENTRY_SIZE;
|
||||
let slot = u64::from_le_bytes(mmap[ioff..ioff+8].try_into().unwrap()) as usize;
|
||||
let pos = u64::from_le_bytes(mmap[ioff+8..ioff+16].try_into().unwrap()) as usize;
|
||||
index.push((slot, pos));
|
||||
columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov });
|
||||
}
|
||||
columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index });
|
||||
}
|
||||
|
||||
Ok(Self { mmap, n_rows, n_cols, columns })
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
|
||||
let ci = &self.columns[col];
|
||||
let v = self.mmap[ci.primary_start + slot];
|
||||
if v < 255 { return v as u32; }
|
||||
self.overflow_get(ci, slot)
|
||||
pub(crate) fn col_view(&self, c: usize) -> IntSliceView<'_> {
|
||||
let ci = &self.columns[c];
|
||||
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
|
||||
let overflow_raw = &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE];
|
||||
IntSliceView::new(primary, overflow_raw, ci.n_overflow, self.n_rows)
|
||||
}
|
||||
|
||||
fn overflow_get(&self, ci: &ColInfo, slot: usize) -> u32 {
|
||||
let (pos_start, pos_end) = if ci.step == 0 {
|
||||
(0, ci.n_overflow)
|
||||
} else {
|
||||
let i = ci.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
||||
let start = ci.index[i].1;
|
||||
let end = if i + 1 < ci.index.len() { ci.index[i+1].1 } else { ci.n_overflow };
|
||||
(start, end)
|
||||
};
|
||||
let mut lo = pos_start;
|
||||
let mut hi = pos_end;
|
||||
while lo < hi {
|
||||
let mid = lo + (hi - lo) / 2;
|
||||
let off = ci.data_offset + mid * OVERFLOW_ENTRY_SIZE;
|
||||
let stored = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
|
||||
match stored.cmp(&slot) {
|
||||
Ordering::Equal => return u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()),
|
||||
Ordering::Less => lo = mid + 1,
|
||||
Ordering::Greater => hi = mid,
|
||||
}
|
||||
}
|
||||
panic!("slot {slot} marked overflow but not found")
|
||||
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||
let view = self.col_view(c);
|
||||
let overflow: std::collections::HashMap<usize, u32> = view.overflow_entries().collect();
|
||||
PersistentCompactIntVecBuilder::from_raw_primary(view.primary_bytes(), overflow, path)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 { self.col_view(col).get(slot) }
|
||||
|
||||
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||
for c in 0..self.n_cols { buf[c] = self.get(c, slot); }
|
||||
}
|
||||
@@ -236,152 +168,85 @@ impl PackedCompactIntMatrix {
|
||||
|
||||
pub(crate) fn sum(&self) -> Array1<u64> {
|
||||
Array1::from_vec(
|
||||
(0..self.n_cols).into_par_iter()
|
||||
.map(|c| (0..self.n_rows).map(|s| self.get(c, s) as u64).sum())
|
||||
.collect()
|
||||
(0..self.n_cols).into_par_iter().map(|c| self.col_view(c).sum()).collect()
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||
Array1::from_vec(
|
||||
(0..self.n_cols).into_par_iter()
|
||||
.map(|c| (0..self.n_rows).filter(|&s| self.get(c, s) > 0).count() as u64)
|
||||
.collect()
|
||||
(0..self.n_cols).into_par_iter().map(|c| self.col_view(c).count_nonzero()).collect()
|
||||
)
|
||||
}
|
||||
|
||||
// ── Pair primitives ───────────────────────────────────────────────────────
|
||||
|
||||
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
||||
(0..self.n_rows).map(|s| self.get(i, s).min(self.get(j, s)) as u64).sum()
|
||||
self.col_view(i).iter().zip(self.col_view(j).iter()).map(|(a, b)| a.min(b) as u64).sum()
|
||||
}
|
||||
|
||||
fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
|
||||
(0..self.n_rows).map(|s| {
|
||||
let d = self.get(i, s) as f64 - self.get(j, s) as f64;
|
||||
d * d
|
||||
}).sum()
|
||||
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||
.map(|(a, b)| { let d = a as f64 - b as f64; d * d }).sum()
|
||||
}
|
||||
|
||||
fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
|
||||
let (mut inter, mut union) = (0u64, 0u64);
|
||||
for s in 0..self.n_rows {
|
||||
let a = self.get(i, s) >= t;
|
||||
let b = self.get(j, s) >= t;
|
||||
if a && b { inter += 1; }
|
||||
if a || b { union += 1; }
|
||||
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||||
let ap = a >= t; let bp = b >= t;
|
||||
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
|
||||
})
|
||||
}
|
||||
(inter, union)
|
||||
}
|
||||
|
||||
fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||
(0..self.n_rows).map(|s| {
|
||||
(self.get(i, s) as f64 / si).min(self.get(j, s) as f64 / sj)
|
||||
}).sum()
|
||||
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||
.map(|(a, b)| (a as f64 / si).min(b as f64 / sj)).sum()
|
||||
}
|
||||
|
||||
fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||
(0..self.n_rows).map(|s| {
|
||||
let d = self.get(i, s) as f64 / si - self.get(j, s) as f64 / sj;
|
||||
d * d
|
||||
}).sum()
|
||||
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||
.map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d }).sum()
|
||||
}
|
||||
|
||||
fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||
(0..self.n_rows).map(|s| {
|
||||
let d = (self.get(i, s) as f64 / si).sqrt() - (self.get(j, s) as f64 / sj).sqrt();
|
||||
d * d
|
||||
}).sum()
|
||||
}
|
||||
|
||||
// ── Matrix methods ────────────────────────────────────────────────────────
|
||||
|
||||
fn pairwise<T>(&self, f: impl Fn(usize, usize) -> T + Sync) -> Array2<T>
|
||||
where T: Clone + Default + Send {
|
||||
let n = self.n_cols;
|
||||
let results: Vec<(usize, usize, T)> = upper_pairs(n)
|
||||
.into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| { let w = v.clone(); (i, j, v, w) }))
|
||||
}
|
||||
|
||||
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
|
||||
let n = self.n_cols;
|
||||
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
|
||||
.into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||
.map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d }).sum()
|
||||
}
|
||||
|
||||
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
self.pairwise_u64(|i, j| self.pair_partial_bray(i, j))
|
||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j))
|
||||
}
|
||||
|
||||
|
||||
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.pair_partial_euclidean(i, j))
|
||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2<u64>, Array2<u64>) {
|
||||
let n = self.n_cols;
|
||||
let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| { let (inter, union) = self.pair_partial_threshold_jaccard(i, j, t); (i, j, inter, union) })
|
||||
.collect();
|
||||
let mut inter_m = Array2::zeros((n, n));
|
||||
let mut union_m = Array2::zeros((n, n));
|
||||
for (i, j, inter, union) in results {
|
||||
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
|
||||
union_m[[i, j]] = union; union_m[[j, i]] = union;
|
||||
pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t))
|
||||
}
|
||||
(inter_m, union_m)
|
||||
}
|
||||
|
||||
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
||||
pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
||||
let packed_path = dir.join("matrix.pcmx");
|
||||
if packed_path.exists() {
|
||||
// Matrix complete; remove any leftover column files from a killed cleanup.
|
||||
if let Ok(meta) = MatrixMeta::load(dir) {
|
||||
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
||||
let _ = fs::remove_file(dir.join("meta.json"));
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let meta = MatrixMeta::load(dir)?;
|
||||
let n_cols = meta.n_cols;
|
||||
|
||||
// Compute offsets from file sizes — no column data loaded into RAM.
|
||||
let col_sizes: Vec<u64> = (0..n_cols)
|
||||
.map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
|
||||
.collect::<io::Result<_>>()?;
|
||||
|
||||
let header_size = (PCMX_HEADER + n_cols * 8) as u64;
|
||||
let mut col_offset = header_size;
|
||||
let mut offsets = Vec::with_capacity(n_cols);
|
||||
for &size in &col_sizes {
|
||||
offsets.push(col_offset);
|
||||
col_offset += size;
|
||||
}
|
||||
|
||||
// Write to a temp file; rename atomically so a killed process never leaves
|
||||
// a truncated matrix.pcmx that would be mistaken for a complete file.
|
||||
for &size in &col_sizes { offsets.push(col_offset); col_offset += size; }
|
||||
let tmp_path = dir.join("matrix.pcmx.tmp");
|
||||
let mut out = BufWriter::new(File::create(&tmp_path)?);
|
||||
out.write_all(&PCMX_MAGIC)?;
|
||||
@@ -389,13 +254,10 @@ pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
||||
out.write_all(&(meta.n as u64).to_le_bytes())?;
|
||||
out.write_all(&(n_cols as u64).to_le_bytes())?;
|
||||
for &off in &offsets { out.write_all(&off.to_le_bytes())?; }
|
||||
for c in 0..n_cols {
|
||||
io::copy(&mut File::open(col_path(dir, c))?, &mut out)?;
|
||||
}
|
||||
for c in 0..n_cols { io::copy(&mut File::open(col_path(dir, c))?, &mut out)?; }
|
||||
out.flush()?;
|
||||
drop(out);
|
||||
fs::rename(&tmp_path, &packed_path)?;
|
||||
|
||||
for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
|
||||
fs::remove_file(dir.join("meta.json"))?;
|
||||
Ok(())
|
||||
@@ -409,18 +271,14 @@ pub enum PersistentCompactIntMatrix {
|
||||
}
|
||||
|
||||
impl PersistentCompactIntMatrix {
|
||||
/// Open from `layer_dir`, auto-detecting Packed or Columnar.
|
||||
pub fn open(layer_dir: &Path) -> io::Result<Self> {
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
|
||||
if counts_dir.join("matrix.pcmx").exists() {
|
||||
return Ok(Self::Packed(PackedCompactIntMatrix::open(&counts_dir.join("matrix.pcmx"))?));
|
||||
}
|
||||
|
||||
if MatrixMeta::load(&counts_dir).is_ok() {
|
||||
return Ok(Self::Columnar(ColumnarCompactIntMatrix::open(&counts_dir)?));
|
||||
}
|
||||
|
||||
Err(io::Error::new(
|
||||
io::ErrorKind::NotFound,
|
||||
format!("no count matrix found in {} — run 'obikmer upgrade'", layer_dir.display()),
|
||||
@@ -430,7 +288,6 @@ impl PersistentCompactIntMatrix {
|
||||
pub fn n(&self) -> usize {
|
||||
match self { Self::Columnar(m) => m.n(), Self::Packed(m) => m.n_rows }
|
||||
}
|
||||
|
||||
pub fn n_cols(&self) -> usize {
|
||||
match self { Self::Columnar(m) => m.n_cols(), Self::Packed(m) => m.n_cols }
|
||||
}
|
||||
@@ -442,22 +299,32 @@ impl PersistentCompactIntMatrix {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn col_view(&self, c: usize) -> IntSliceView<'_> {
|
||||
match self {
|
||||
Self::Columnar(m) => m.col(c).view(),
|
||||
Self::Packed(m) => m.col_view(c),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||
match self {
|
||||
Self::Columnar(m) => PersistentCompactIntVecBuilder::build_from(m.col(c), path),
|
||||
Self::Packed(m) => m.col_persist(c, path),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn row(&self, slot: usize) -> Box<[u32]> {
|
||||
match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) }
|
||||
}
|
||||
|
||||
pub fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||
match self { Self::Columnar(m) => m.fill_row(slot, buf), Self::Packed(m) => m.fill_row(slot, buf) }
|
||||
}
|
||||
|
||||
pub fn sum(&self) -> Array1<u64> {
|
||||
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
|
||||
}
|
||||
|
||||
pub fn count_nonzero(&self) -> Array1<u64> {
|
||||
match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() }
|
||||
}
|
||||
|
||||
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
|
||||
}
|
||||
@@ -476,7 +343,6 @@ impl PersistentCompactIntMatrix {
|
||||
pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums), Self::Packed(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums) }
|
||||
}
|
||||
|
||||
pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
|
||||
ColumnarCompactIntMatrix::append_column(dir, value_of)
|
||||
}
|
||||
@@ -513,30 +379,88 @@ impl PersistentCompactIntMatrixBuilder {
|
||||
fs::create_dir_all(dir)?;
|
||||
Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 })
|
||||
}
|
||||
|
||||
/// Value of the builder's `n` field.
pub fn n(&self) -> usize {
    self.n
}
|
||||
/// Number of columns registered with the builder so far.
pub fn n_cols(&self) -> usize {
    self.n_cols
}
|
||||
|
||||
pub fn add_col(&mut self) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||
let path = col_path(&self.dir, self.n_cols);
|
||||
self.n_cols += 1;
|
||||
PersistentCompactIntVecBuilder::new(self.n, &path)
|
||||
}
|
||||
|
||||
pub fn add_col_from(&mut self, src: &TempCompactIntVec) -> io::Result<()> {
|
||||
src.make_persistent(&col_path(&self.dir, self.n_cols))?;
|
||||
self.n_cols += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn add_col_from_bit(&mut self, src: &TempBitVec) -> io::Result<()> {
|
||||
let path = col_path(&self.dir, self.n_cols);
|
||||
self.n_cols += 1;
|
||||
let mut b = PersistentCompactIntVecBuilder::new(self.n, &path)?;
|
||||
b.inc_present(src.view());
|
||||
b.close()
|
||||
}
|
||||
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Lists every index pair `(i, j)` with `i < j < n`, ordered by `i` and then
/// by `j` (the strict upper triangle of an `n × n` matrix).
fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
    let mut pairs = Vec::new();
    for i in 0..n {
        for j in (i + 1)..n {
            pairs.push((i, j));
        }
    }
    pairs
}
|
||||
impl MatrixGroupOps for PersistentCompactIntMatrix {
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec> {
|
||||
let n = self.n();
|
||||
if g.indices.len() < 255 {
|
||||
let mut builder = TempCompactIntVecBuilder::new(n)?;
|
||||
for &c in &g.indices {
|
||||
builder.inc_predicate_fast(self.col_view(c), |v| v >= threshold);
|
||||
}
|
||||
builder.freeze()
|
||||
} else {
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
for chunk in g.indices.chunks(254) {
|
||||
let mut chunk_b = TempCompactIntVecBuilder::new(n)?;
|
||||
for &c in chunk {
|
||||
chunk_b.inc_predicate_fast(self.col_view(c), |v| v >= threshold);
|
||||
}
|
||||
let frozen = chunk_b.freeze()?;
|
||||
result.add(frozen.view());
|
||||
}
|
||||
result.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_symmetric<T>(n: usize, vals: impl Iterator<Item = (usize, usize, T, T)>) -> Array2<T>
|
||||
where T: Clone + Default {
|
||||
let mut m = Array2::from_elem((n, n), T::default());
|
||||
for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; }
|
||||
m
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
let n = self.n();
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
for &c in &g.indices { result.add(self.col_view(c)); }
|
||||
result.freeze()
|
||||
}
|
||||
|
||||
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
|
||||
let n = self.n();
|
||||
let mut result = TempBitVecBuilder::new(n)?;
|
||||
for &c in &g.indices {
|
||||
result.or_where(self.col_view(c), |v| v >= threshold);
|
||||
}
|
||||
result.freeze()
|
||||
}
|
||||
|
||||
fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
let n = self.n();
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
if let Some((&first, rest)) = g.indices.split_first() {
|
||||
result.add(self.col_view(first));
|
||||
for &c in rest { result.min(self.col_view(c)); }
|
||||
}
|
||||
result.freeze()
|
||||
}
|
||||
|
||||
fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
let n = self.n();
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
for &c in &g.indices { result.max(self.col_view(c)); }
|
||||
result.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,11 +23,6 @@ impl LayerMeta {
|
||||
}
|
||||
|
||||
fn parse(s: &str) -> Option<Self> {
|
||||
let key = "\"n\":";
|
||||
let pos = s.find(key)? + key.len();
|
||||
let rest = s[pos..].trim_start();
|
||||
let end = rest.find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len());
|
||||
let n = rest[..end].parse().ok()?;
|
||||
Some(Self { n })
|
||||
Some(Self { n: crate::meta::field(s, "n")? })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,20 +1,28 @@
|
||||
mod bitvec;
|
||||
mod bitmatrix;
|
||||
mod builder;
|
||||
mod colgroup;
|
||||
mod format;
|
||||
mod intmatrix;
|
||||
mod layer_meta;
|
||||
mod meta;
|
||||
mod reader;
|
||||
mod tempbitvec;
|
||||
mod tempintvec;
|
||||
mod views;
|
||||
pub mod traits;
|
||||
|
||||
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
|
||||
pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
|
||||
pub use builder::PersistentCompactIntVecBuilder;
|
||||
pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask};
|
||||
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
|
||||
pub use layer_meta::LayerMeta;
|
||||
pub use reader::PersistentCompactIntVec;
|
||||
pub use reader::{PersistentCompactIntVec, Iter as CompactIntVecIter};
|
||||
pub use tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||
pub use tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||
pub use traits::{BitPartials, ColumnWeights, CountPartials};
|
||||
pub use views::{BitSliceView, BitSliceIter, IntSliceView, IntSliceViewIter};
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "tests/mod.rs"]
|
||||
|
||||
@@ -23,7 +23,7 @@ fn parse(s: &str) -> Option<MatrixMeta> {
|
||||
Some(MatrixMeta { n: field(s, "n")?, n_cols: field(s, "n_cols")? })
|
||||
}
|
||||
|
||||
fn field(s: &str, name: &str) -> Option<usize> {
|
||||
pub(crate) fn field(s: &str, name: &str) -> Option<usize> {
|
||||
let key = format!("\"{}\":", name);
|
||||
let pos = s.find(&key)? + key.len();
|
||||
let rest = s[pos..].trim_start();
|
||||
|
||||
+61
-202
@@ -4,7 +4,8 @@ use std::path::{Path, PathBuf};
|
||||
|
||||
use memmap2::Mmap;
|
||||
|
||||
use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE};
|
||||
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry};
|
||||
use crate::views::IntSliceView;
|
||||
|
||||
pub struct PersistentCompactIntVec {
|
||||
mmap: Mmap,
|
||||
@@ -18,15 +19,11 @@ pub struct PersistentCompactIntVec {
|
||||
}
|
||||
|
||||
impl PersistentCompactIntVec {
|
||||
/// Opens a persistent compact int vector from the given path.
|
||||
pub fn open(path: &Path) -> io::Result<Self> {
|
||||
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
||||
|
||||
if mmap.len() < HEADER_SIZE {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"PCIV file too short",
|
||||
));
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "PCIV file too short"));
|
||||
}
|
||||
if &mmap[0..4] != &MAGIC {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic"));
|
||||
@@ -43,40 +40,16 @@ impl PersistentCompactIntVec {
|
||||
|
||||
let mut index = Vec::with_capacity(n_index);
|
||||
for i in 0..n_index {
|
||||
let off = index_offset + i * INDEX_ENTRY_SIZE;
|
||||
let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize;
|
||||
let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize;
|
||||
index.push((slot, pos));
|
||||
index.push(parse_index_entry(&mmap, index_offset, i));
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
mmap,
|
||||
n,
|
||||
n_overflow,
|
||||
step,
|
||||
index,
|
||||
primary_offset,
|
||||
data_offset,
|
||||
path: path.to_path_buf(),
|
||||
})
|
||||
Ok(Self { mmap, n, n_overflow, step, index, primary_offset, data_offset, path: path.to_path_buf() })
|
||||
}
|
||||
|
||||
/// Returns the path of the compact int vector file.
|
||||
pub fn path(&self) -> &Path {
|
||||
&self.path
|
||||
}
|
||||
pub fn path(&self) -> &Path { &self.path }
|
||||
pub fn len(&self) -> usize { self.n }
|
||||
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||
|
||||
/// Returns the length of the compact int vector.
|
||||
pub fn len(&self) -> usize {
|
||||
self.n
|
||||
}
|
||||
|
||||
/// Returns whether the compact int vector is empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.n == 0
|
||||
}
|
||||
|
||||
/// Returns the value at the given slot.
|
||||
pub fn get(&self, slot: usize) -> u32 {
|
||||
match self.mmap[self.primary_offset + slot] {
|
||||
255 => self.overflow_get(slot),
|
||||
@@ -84,27 +57,15 @@ impl PersistentCompactIntVec {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the value at the given slot from the overflow region.
|
||||
fn overflow_get(&self, slot: usize) -> u32 {
|
||||
let pos_start;
|
||||
let pos_end;
|
||||
|
||||
if self.step == 0 {
|
||||
pos_start = 0;
|
||||
pos_end = self.n_overflow;
|
||||
let (pos_start, pos_end) = if self.step == 0 {
|
||||
(0, self.n_overflow)
|
||||
} else {
|
||||
let i = self
|
||||
.index
|
||||
.partition_point(|&(s, _)| s <= slot)
|
||||
.saturating_sub(1);
|
||||
pos_start = self.index[i].1;
|
||||
pos_end = if i + 1 < self.index.len() {
|
||||
self.index[i + 1].1
|
||||
} else {
|
||||
self.n_overflow
|
||||
let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
||||
let start = self.index[i].1;
|
||||
let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow };
|
||||
(start, end)
|
||||
};
|
||||
}
|
||||
|
||||
let mut lo = pos_start;
|
||||
let mut hi = pos_end;
|
||||
while lo < hi {
|
||||
@@ -119,144 +80,91 @@ impl PersistentCompactIntVec {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the slot at the given index in the overflow region.
|
||||
fn data_slot(&self, i: usize) -> usize {
|
||||
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE;
|
||||
u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the value at the given index in the overflow region.
|
||||
fn data_value(&self, i: usize) -> u32 {
|
||||
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8;
|
||||
u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn sum(&self) -> u64 {
|
||||
self.iter().map(|v| v as u64).sum()
|
||||
let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n];
|
||||
byte_sum(primary, (0..self.n_overflow).map(|i| self.data_value(i)))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn count_nonzero(&self) -> u64 {
|
||||
self.iter().filter(|&v| v > 0).count() as u64
|
||||
let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n];
|
||||
byte_count_nonzero(primary)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the Bray-Curtis distance between two compact int vectors.
|
||||
/// Lightweight zero-copy view — primary and overflow point into the mmap.
|
||||
pub fn view(&self) -> IntSliceView<'_> {
|
||||
let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n];
|
||||
let overflow_raw = &self.mmap[self.data_offset..self.data_offset + self.n_overflow * OVERFLOW_ENTRY_SIZE];
|
||||
IntSliceView::new(primary, overflow_raw, self.n_overflow, self.n)
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Iter<'_> {
|
||||
Iter { pciv: self, slot: 0, overflow_pos: 0 }
|
||||
}
|
||||
|
||||
// ── Distance methods ──────────────────────────────────────────────────────
|
||||
|
||||
/// Returns the Bray-Curtis distance between two compact int vectors:
/// `1 - 2 * Σ min(a, b) / (sum(a) + sum(b))`, or `0.0` when both sums are zero.
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
    let sum_min = self.partial_bray_dist(other);
    let denom = self.sum() + other.sum();
    if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 }
}
|
||||
|
||||
/// Returns `Σ_slot min(self[slot], other[slot])` — the additive numerator of Bray-Curtis.
|
||||
/// The denominator `sum_a + sum_b` is obtained from `self.sum() + other.sum()`.
|
||||
pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> u64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
self.iter()
|
||||
.zip(other.iter())
|
||||
.map(|(a, b)| a.min(b) as u64)
|
||||
.sum()
|
||||
self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum()
|
||||
}
|
||||
|
||||
/// Returns the relative frequency Bray-Curtis distance between two compact int vectors.
|
||||
///
|
||||
/// This is a variant of [`bray_dist`] that uses relative frequencies instead of raw counts.
|
||||
pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sum_a = self.sum() as f64;
|
||||
let sum_b = other.sum() as f64;
|
||||
if sum_a == 0.0 && sum_b == 0.0 {
|
||||
return 0.0;
|
||||
}
|
||||
let sum_min = self.partial_relfreq_bray_dist(other, sum_a, sum_b);
|
||||
1.0 - sum_min
|
||||
let sa = self.sum() as f64;
|
||||
let sb = other.sum() as f64;
|
||||
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||
1.0 - self.partial_relfreq_bray_dist(other, sa, sb)
|
||||
}
|
||||
|
||||
/// Returns the partial relative frequency Bray-Curtis distance between two compact int vectors.
|
||||
///
|
||||
/// This is used internally by [`relfreq_bray_dist`] and to easily compute the relative frequency
|
||||
/// Bray-Curtis distance over a set of vector pairs.
|
||||
///
|
||||
/// Arguments:
|
||||
/// - `other`: the other compact int vector to compare with
|
||||
/// - `sum_a`: the sum of the first vector's counts
|
||||
/// - `sum_b`: the sum of the second vector's counts
|
||||
///
|
||||
/// Returns the sum of the minimum relative frequencies at each index.
|
||||
pub fn partial_relfreq_bray_dist(
|
||||
&self,
|
||||
other: &PersistentCompactIntVec,
|
||||
sum_a: f64,
|
||||
sum_b: f64,
|
||||
) -> f64 {
|
||||
pub fn partial_relfreq_bray_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sum_min: f64 = self
|
||||
.iter()
|
||||
.zip(other.iter())
|
||||
self.iter().zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
||||
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
||||
pa.min(pb)
|
||||
})
|
||||
.sum();
|
||||
sum_min
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// Returns the euclidean distance between two compact int vectors.
|
||||
pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
self.partial_euclidean_dist(other).sqrt()
|
||||
}
|
||||
|
||||
/// Returns the partial euclidean distance between two compact int vectors.
|
||||
///
|
||||
/// This is used internally by [`euclidean_dist`] and to easily compute the euclidean distance
|
||||
/// over a set of vector pairs.
|
||||
///
|
||||
/// The result is the sum of the squared differences between corresponding elements of the two
|
||||
/// vectors.
|
||||
pub fn partial_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
self.iter()
|
||||
.zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let d = a as f64 - b as f64;
|
||||
d * d
|
||||
})
|
||||
self.iter().zip(other.iter())
|
||||
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// Returns the relative frequency euclidean distance between two compact int vectors.
|
||||
///
|
||||
/// This is a variant of [`euclidean_dist`] that uses relative frequencies instead of raw counts.
|
||||
pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sum_a = self.sum() as f64;
|
||||
let sum_b = other.sum() as f64;
|
||||
if sum_a == 0.0 && sum_b == 0.0 {
|
||||
return 0.0;
|
||||
}
|
||||
self.partial_relfreq_euclidean_dist(other, sum_a, sum_b)
|
||||
.sqrt()
|
||||
let sa = self.sum() as f64;
|
||||
let sb = other.sum() as f64;
|
||||
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||
self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt()
|
||||
}
|
||||
|
||||
/// Returns the partial relative frequency euclidean distance between two compact int vectors.
|
||||
///
|
||||
/// This is used internally by [`relfreq_euclidean_dist`] and to easily compute the relative frequency
|
||||
/// euclidean distance over a set of vector pairs.
|
||||
pub fn partial_relfreq_euclidean_dist(
|
||||
&self,
|
||||
other: &PersistentCompactIntVec,
|
||||
sum_a: f64,
|
||||
sum_b: f64,
|
||||
) -> f64 {
|
||||
pub fn partial_relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
self.iter()
|
||||
.zip(other.iter())
|
||||
self.iter().zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
||||
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
||||
@@ -266,46 +174,19 @@ impl PersistentCompactIntVec {
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// Returns the Euclidean distance between two compact int vectors using the Hellinger transform.
|
||||
///
|
||||
/// The Hellinger transform is applied to the raw counts of each vector, and the result is
|
||||
/// the Euclidean distance between the transformed vectors. The Hellinger transform is defined
|
||||
/// as the square root of the relative frequencies.
|
||||
pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sum_a = self.sum() as f64;
|
||||
let sum_b = other.sum() as f64;
|
||||
if sum_a == 0.0 && sum_b == 0.0 {
|
||||
return 0.0;
|
||||
}
|
||||
self.partial_hellinger_euclidean_dist(other, sum_a, sum_b)
|
||||
.sqrt()
|
||||
let sa = self.sum() as f64;
|
||||
let sb = other.sum() as f64;
|
||||
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||
self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt()
|
||||
}
|
||||
|
||||
/// Returns the partial Hellinger Euclidean distance between two compact int vectors.
|
||||
///
|
||||
/// This is used internally by [`hellinger_euclidean_dist`] and to easily compute the Hellinger
|
||||
/// Euclidean distance over a set of vector pairs.
|
||||
pub fn partial_hellinger_euclidean_dist(
|
||||
&self,
|
||||
other: &PersistentCompactIntVec,
|
||||
sum_a: f64,
|
||||
sum_b: f64,
|
||||
) -> f64 {
|
||||
pub fn partial_hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
self.iter()
|
||||
.zip(other.iter())
|
||||
self.iter().zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let pa = if sum_a > 0.0 {
|
||||
(a as f64 / sum_a).sqrt()
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let pb = if sum_b > 0.0 {
|
||||
(b as f64 / sum_b).sqrt()
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let pa = if sum_a > 0.0 { (a as f64 / sum_a).sqrt() } else { 0.0 };
|
||||
let pb = if sum_b > 0.0 { (b as f64 / sum_b).sqrt() } else { 0.0 };
|
||||
let d = pa - pb;
|
||||
d * d
|
||||
})
|
||||
@@ -317,22 +198,13 @@ impl PersistentCompactIntVec {
|
||||
}
|
||||
|
||||
pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let (intersection, union) = self.partial_threshold_jaccard_dist(other, threshold);
|
||||
if union == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
1.0 - intersection as f64 / union as f64
|
||||
if union == 0 { 0.0 } else { 1.0 - intersection as f64 / union as f64 }
|
||||
}
|
||||
|
||||
pub fn partial_threshold_jaccard_dist(
|
||||
&self,
|
||||
other: &PersistentCompactIntVec,
|
||||
threshold: u32,
|
||||
) -> (u64, u64) {
|
||||
pub fn partial_threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> (u64, u64) {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
self.iter()
|
||||
.zip(other.iter())
|
||||
self.iter().zip(other.iter())
|
||||
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||||
let ap = a >= threshold;
|
||||
let bp = b >= threshold;
|
||||
@@ -343,23 +215,12 @@ impl PersistentCompactIntVec {
|
||||
pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
self.threshold_jaccard_dist(other, 1)
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Iter<'_> {
|
||||
Iter {
|
||||
pciv: self,
|
||||
slot: 0,
|
||||
overflow_pos: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> IntoIterator for &'a PersistentCompactIntVec {
|
||||
type Item = u32;
|
||||
type IntoIter = Iter<'a>;
|
||||
|
||||
fn into_iter(self) -> Iter<'a> {
|
||||
self.iter()
|
||||
}
|
||||
fn into_iter(self) -> Iter<'a> { self.iter() }
|
||||
}
|
||||
|
||||
pub struct Iter<'a> {
|
||||
@@ -374,9 +235,7 @@ impl Iterator for Iter<'_> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.slot >= self.pciv.n {
|
||||
return None;
|
||||
}
|
||||
if self.slot >= self.pciv.n { return None; }
|
||||
let v = self.pciv.mmap[self.pciv.primary_offset + self.slot];
|
||||
self.slot += 1;
|
||||
if v < 255 {
|
||||
|
||||
@@ -0,0 +1,111 @@
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
|
||||
use tempfile::TempDir;
|
||||
|
||||
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
||||
use crate::views::{BitSliceIter, BitSliceView, IntSliceView};
|
||||
|
||||
// ── TempBitVec — frozen read-only, auto-deleted on drop ──────────────────────
|
||||
|
||||
/// Read-only bit vector paired with the temporary directory that holds its
/// backing file; the directory handle is kept for as long as the value lives.
pub struct TempBitVec {
|
||||
vec: PersistentBitVec,
|
||||
// Dropped after `vec` (field order), so the mmap is released before the
|
||||
// temp directory is deleted.
|
||||
_temp: TempDir,
|
||||
}
|
||||
|
||||
impl TempBitVec {
|
||||
pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentBitVec> {
|
||||
std::fs::copy(self.vec.path(), path)?;
|
||||
PersistentBitVec::open(path)
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.vec.len()
|
||||
}
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.vec.is_empty()
|
||||
}
|
||||
pub fn get(&self, slot: usize) -> bool {
|
||||
self.vec.get(slot)
|
||||
}
|
||||
pub fn count_ones(&self) -> u64 {
|
||||
self.vec.count_ones()
|
||||
}
|
||||
pub fn view(&self) -> BitSliceView<'_> {
|
||||
self.vec.view()
|
||||
}
|
||||
pub fn iter(&self) -> BitSliceIter<'_> {
|
||||
self.view().iter()
|
||||
}
|
||||
}
|
||||
|
||||
// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
|
||||
|
||||
/// Mutable bit-vector builder whose backing file lives in a temporary
/// directory owned by this value; `freeze` turns it into a `TempBitVec`.
pub struct TempBitVecBuilder {
|
||||
builder: PersistentBitVecBuilder,
|
||||
temp: TempDir,
|
||||
}
|
||||
|
||||
impl TempBitVecBuilder {
|
||||
pub fn new(n: usize) -> io::Result<Self> {
|
||||
let temp = TempDir::new()?;
|
||||
let path = temp.path().join("data.pbiv");
|
||||
let builder = PersistentBitVecBuilder::new(n, &path)?;
|
||||
Ok(Self { builder, temp })
|
||||
}
|
||||
|
||||
pub fn new_ones(n: usize) -> io::Result<Self> {
|
||||
let temp = TempDir::new()?;
|
||||
let path = temp.path().join("data.pbiv");
|
||||
let builder = PersistentBitVecBuilder::new_ones(n, &path)?;
|
||||
Ok(Self { builder, temp })
|
||||
}
|
||||
|
||||
pub fn freeze(self) -> io::Result<TempBitVec> {
|
||||
let Self { builder, temp } = self;
|
||||
let vec = builder.finish()?;
|
||||
Ok(TempBitVec { vec, _temp: temp })
|
||||
}
|
||||
|
||||
pub fn set(&mut self, slot: usize, value: bool) {
|
||||
self.builder.set(slot, value);
|
||||
}
|
||||
|
||||
pub fn view(&self) -> BitSliceView<'_> {
|
||||
self.builder.view()
|
||||
}
|
||||
|
||||
pub fn or(&mut self, other: BitSliceView<'_>) {
|
||||
self.builder.or(other);
|
||||
}
|
||||
|
||||
pub fn and(&mut self, other: BitSliceView<'_>) {
|
||||
self.builder.and(other);
|
||||
}
|
||||
|
||||
pub fn xor(&mut self, other: BitSliceView<'_>) {
|
||||
self.builder.xor(other);
|
||||
}
|
||||
|
||||
pub fn not(&mut self) {
|
||||
self.builder.not();
|
||||
}
|
||||
|
||||
pub fn copy_from(&mut self, src: BitSliceView<'_>) {
|
||||
self.builder.copy_from(src);
|
||||
}
|
||||
|
||||
pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||
self.builder.or_where(col, pred);
|
||||
}
|
||||
|
||||
pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||
self.builder.and_where(col, pred);
|
||||
}
|
||||
|
||||
pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||
self.builder.xor_where(col, pred);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,89 @@
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
|
||||
use tempfile::TempDir;
|
||||
|
||||
use crate::builder::PersistentCompactIntVecBuilder;
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
use crate::views::{BitSliceView, IntSliceView};
|
||||
|
||||
// ── TempCompactIntVec — frozen read-only, auto-deleted on drop ────────────────
|
||||
|
||||
pub struct TempCompactIntVec {
|
||||
vec: PersistentCompactIntVec,
|
||||
// Dropped after `vec` (field order), so the mmap is released before the
|
||||
// temp directory is deleted.
|
||||
_temp: TempDir,
|
||||
}
|
||||
|
||||
impl TempCompactIntVec {
|
||||
pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentCompactIntVec> {
|
||||
std::fs::copy(self.vec.path(), path)?;
|
||||
PersistentCompactIntVec::open(path)
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize { self.vec.len() }
|
||||
pub fn is_empty(&self) -> bool { self.vec.is_empty() }
|
||||
pub fn get(&self, slot: usize) -> u32 { self.vec.get(slot) }
|
||||
pub fn sum(&self) -> u64 { self.vec.sum() }
|
||||
pub fn view(&self) -> IntSliceView<'_> { self.vec.view() }
|
||||
pub fn iter(&self) -> crate::reader::Iter<'_> { self.vec.iter() }
|
||||
}
|
||||
|
||||
// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
|
||||
|
||||
pub struct TempCompactIntVecBuilder {
|
||||
builder: PersistentCompactIntVecBuilder,
|
||||
temp: TempDir,
|
||||
}
|
||||
|
||||
impl TempCompactIntVecBuilder {
|
||||
pub fn new(n: usize) -> io::Result<Self> {
|
||||
let temp = TempDir::new()?;
|
||||
let path = temp.path().join("data.pciv");
|
||||
let builder = PersistentCompactIntVecBuilder::new(n, &path)?;
|
||||
Ok(Self { builder, temp })
|
||||
}
|
||||
|
||||
pub fn freeze(self) -> io::Result<TempCompactIntVec> {
|
||||
let Self { builder, temp } = self;
|
||||
let vec = builder.finish()?;
|
||||
Ok(TempCompactIntVec { vec, _temp: temp })
|
||||
}
|
||||
|
||||
pub fn n(&self) -> usize { self.builder.len() }
|
||||
|
||||
pub fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
|
||||
pub fn get(&self, slot: usize) -> u32 { self.builder.get(slot) }
|
||||
|
||||
pub fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() }
|
||||
pub fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
|
||||
|
||||
pub fn inc_present(&mut self, col: BitSliceView<'_>) {
|
||||
self.builder.inc_present(col);
|
||||
}
|
||||
|
||||
pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
|
||||
self.builder.inc_present_fast(col);
|
||||
}
|
||||
|
||||
pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||
self.builder.inc_predicate(col, pred);
|
||||
}
|
||||
|
||||
pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||
self.builder.inc_predicate_fast(col, pred);
|
||||
}
|
||||
|
||||
pub fn add(&mut self, other: IntSliceView<'_>) {
|
||||
self.builder.add(other);
|
||||
}
|
||||
|
||||
pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
|
||||
self.builder.mask_with(mask);
|
||||
}
|
||||
|
||||
pub fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); }
|
||||
pub fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); }
|
||||
pub fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder};
|
||||
use crate::{pack_bit_matrix, PersistentBitMatrix, PersistentBitMatrixBuilder};
|
||||
use crate::traits::BitPartials;
|
||||
|
||||
fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
|
||||
@@ -203,3 +203,57 @@ fn partial_hamming_matches_hamming() {
|
||||
let full = m.hamming_dist_matrix();
|
||||
assert_eq!(partial, full);
|
||||
}
|
||||
|
||||
// ── col_view on Packed ────────────────────────────────────────────────────────
|
||||
|
||||
/// After packing, `col_view` must expose the exact bits written per column.
#[test]
fn col_view_packed_values() {
    let columns: [&[bool]; 2] = [
        &[true, false, true, true],
        &[false, true, false, true],
    ];
    let (dir, _) = make_matrix(&columns);
    pack_bit_matrix(&dir.path().join("presence")).unwrap();
    let m = PersistentBitMatrix::open(dir.path()).unwrap();

    // Column 0: [T, F, T, T]
    let first = m.col_view(0);
    assert_eq!(first.len(), 4);
    for (slot, &expected) in columns[0].iter().enumerate() {
        assert_eq!(first.get(slot), expected);
    }
    assert_eq!(first.count_ones(), 3);

    // Column 1: [F, T, F, T]
    let second = m.col_view(1);
    for (slot, &expected) in columns[1].iter().enumerate() {
        assert_eq!(second.get(slot), expected);
    }
    assert_eq!(second.count_ones(), 2);
}
|
||||
|
||||
/// `col_view` on a packed matrix must agree with `col` on an unpacked matrix
/// built from the same data (length, bits, popcount and raw words).
#[test]
fn col_view_packed_matches_columnar() {
    let data: &[&[bool]] = &[
        &[true, false, true, false, true, true, false, true],
        &[false, false, true, true, false, true, true, false],
        &[true, true, true, false, false, false, true, true],
    ];
    let (dir_col, m_col) = make_matrix(data);
    // Second copy in its own directory, so packing leaves `m_col`'s files alone.
    let (dir_pack, _) = make_matrix(data);
    pack_bit_matrix(&dir_pack.path().join("presence")).unwrap();
    let m_pack = PersistentBitMatrix::open(dir_pack.path()).unwrap();

    for (c, _) in data.iter().enumerate() {
        let columnar = m_col.col(c);
        let packed = m_pack.col_view(c);
        assert_eq!(packed.len(), columnar.len(), "col={c} len");
        let n_slots = columnar.len();
        for s in 0..n_slots {
            assert_eq!(packed.get(s), columnar.get(s), "col={c} slot={s}");
        }
        assert_eq!(packed.count_ones(), columnar.count_ones(), "col={c} count_ones");
        assert_eq!(packed.words(), columnar.words(), "col={c} words");
    }
    drop(dir_col);
}
|
||||
|
||||
@@ -77,7 +77,7 @@ fn op_and() {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("out.pbiv");
|
||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||
b.and(&rb);
|
||||
b.and(rb.view());
|
||||
b.close().unwrap();
|
||||
let r = PersistentBitVec::open(&path).unwrap();
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, false, false, false]);
|
||||
@@ -90,7 +90,7 @@ fn op_or() {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("out.pbiv");
|
||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||
b.or(&rb);
|
||||
b.or(rb.view());
|
||||
b.close().unwrap();
|
||||
let r = PersistentBitVec::open(&path).unwrap();
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, true, true, false]);
|
||||
@@ -103,7 +103,7 @@ fn op_xor() {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("out.pbiv");
|
||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||
b.xor(&rb);
|
||||
b.xor(rb.view());
|
||||
b.close().unwrap();
|
||||
let r = PersistentBitVec::open(&path).unwrap();
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![false, true, true, false]);
|
||||
|
||||
@@ -0,0 +1,223 @@
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::{
|
||||
ColGroup, MatrixGroupOps,
|
||||
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
||||
};
|
||||
use crate::{PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
||||
let n = cols.first().map_or(0, |c| c.len());
|
||||
let dir = tempdir().unwrap();
|
||||
let mut b = PersistentCompactIntMatrixBuilder::new(n, &dir.path().join("counts")).unwrap();
|
||||
for &col in cols {
|
||||
let mut cb = b.add_col().unwrap();
|
||||
for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
|
||||
cb.close().unwrap();
|
||||
}
|
||||
b.close().unwrap();
|
||||
let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
|
||||
(dir, m)
|
||||
}
|
||||
|
||||
fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
|
||||
let n = cols.first().map_or(0, |c| c.len());
|
||||
let dir = tempdir().unwrap();
|
||||
let presence = dir.path().join("presence");
|
||||
let mut b = PersistentBitMatrixBuilder::new(n, &presence).unwrap();
|
||||
for &col in cols {
|
||||
let mut cb = b.add_col().unwrap();
|
||||
for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
|
||||
cb.close().unwrap();
|
||||
}
|
||||
b.close().unwrap();
|
||||
let m = PersistentBitMatrix::open(dir.path()).unwrap();
|
||||
(dir, m)
|
||||
}
|
||||
|
||||
// ── IntMatrix: partial_group_sum ──────────────────────────────────────────────
|
||||
|
||||
/// Summing columns 0 and 2 slot-wise: [1,2,3] + [100,0,5] = [101, 2, 8]
/// (column 1 = [10,20,30] is not in the group).
#[test]
fn int_partial_group_sum_basic() {
    let (_d, m) = make_int_matrix(&[&[1, 2, 3], &[10, 20, 30], &[100, 0, 5]]);
    let group = ColGroup::new("g", vec![0, 2]);
    let sums = m.partial_group_sum(&group).unwrap();
    let observed: Vec<_> = (0..3).map(|slot| sums.get(slot)).collect();
    assert_eq!(observed, vec![101, 2, 8]);
}
|
||||
|
||||
/// Group sum with values above the one-byte range:
/// [300,0] + [200,400] = [500, 400], total 900.
#[test]
fn int_partial_group_sum_with_overflow() {
    let (_d, m) = make_int_matrix(&[&[300, 0], &[200, 400]]);
    let group = ColGroup::new("g", vec![0, 1]);
    let sums = m.partial_group_sum(&group).unwrap();
    let observed: Vec<_> = (0..2).map(|slot| sums.get(slot)).collect();
    assert_eq!(observed, vec![500, 400]);
    assert_eq!(sums.sum(), 900);
}
|
||||
|
||||
// ── IntMatrix: partial_group_presence_count ───────────────────────────────────
|
||||
|
||||
/// Presence count at threshold 2 over all three columns.
///   col0=[5,1,0,3] -> [T,F,F,T]
///   col1=[2,0,4,3] -> [T,F,T,T]
///   col2=[0,3,1,0] -> [F,T,F,F]
/// Expected per-slot counts: [2, 1, 1, 2].
#[test]
fn int_partial_group_presence_count() {
    let (_d, m) = make_int_matrix(&[&[5, 1, 0, 3], &[2, 0, 4, 3], &[0, 3, 1, 0]]);
    let group = ColGroup::new("g", vec![0, 1, 2]);
    let counts = m.partial_group_presence_count(&group, 2).unwrap();
    let observed: Vec<_> = (0..4).map(|slot| counts.get(slot)).collect();
    assert_eq!(observed, vec![2, 1, 1, 2]);
}
|
||||
|
||||
/// Presence count at threshold 5 when some values exceed the one-byte range.
///   col0=[300,0,10] -> [T,F,T]
///   col1=[0,400,10] -> [F,T,T]
///   col2=[1,1,10]   -> [F,F,T]
/// Expected per-slot counts: [1, 1, 3].
#[test]
fn int_partial_group_presence_count_with_overflow() {
    let (_d, m) = make_int_matrix(&[&[300, 0, 10], &[0, 400, 10], &[1, 1, 10]]);
    let group = ColGroup::new("g", vec![0, 1, 2]);
    let counts = m.partial_group_presence_count(&group, 5).unwrap();
    let observed: Vec<_> = (0..3).map(|slot| counts.get(slot)).collect();
    assert_eq!(observed, vec![1, 1, 3]);
}
|
||||
|
||||
// ── IntMatrix: partial_group_any ──────────────────────────────────────────────
|
||||
|
||||
/// "Any column reaches the threshold" at threshold 2.
///   col0=[0,3,0,1] -> [F,T,F,F]
///   col1=[2,0,0,0] -> [T,F,F,F]
///   col2=[0,0,5,0] -> [F,F,T,F]
/// Expected: [T, T, T, F].
#[test]
fn int_partial_group_any() {
    let (_d, m) = make_int_matrix(&[&[0, 3, 0, 1], &[2, 0, 0, 0], &[0, 0, 5, 0]]);
    let group = ColGroup::new("g", vec![0, 1, 2]);
    let any = m.partial_group_any(&group, 2).unwrap();
    let expected = [true, true, true, false];
    for (slot, &want) in expected.iter().enumerate() {
        assert_eq!(any.get(slot), want);
    }
}
|
||||
|
||||
// ── IntMatrix: mask_with ──────────────────────────────────────────────────────
|
||||
|
||||
/// Values [10, 20, 30, 40] masked with [T, F, T, F] become [10, 0, 30, 0].
#[test]
fn mask_with_zeros_selected_slots() {
    let dir = tempdir().unwrap();
    let vec_path = dir.path().join("v.pciv");
    let mut v = PersistentCompactIntVecBuilder::new(4, &vec_path).unwrap();
    for (slot, value) in [10u32, 20, 30, 40].iter().copied().enumerate() {
        v.set(slot, value);
    }
    let mut mask = PersistentBitVecBuilder::new(4, &dir.path().join("m.pbiv")).unwrap();
    for kept in [0usize, 2].iter().copied() {
        mask.set(kept, true);
    }
    v.mask_with(mask.view());
    v.close().unwrap();

    let reopened = PersistentCompactIntVec::open(&vec_path).unwrap();
    let observed: Vec<_> = (0..4).map(|slot| reopened.get(slot)).collect();
    assert_eq!(observed, vec![10, 0, 30, 0]);
}
|
||||
|
||||
/// Masking out a slot holding an overflow value (500) must zero the slot and
/// leave no overflow entry behind.
#[test]
fn mask_with_overflow_slot_zeroed() {
    let dir = tempdir().unwrap();
    let vec_path = dir.path().join("v.pciv");
    let mut v = PersistentCompactIntVecBuilder::new(3, &vec_path).unwrap();
    for (slot, value) in [10u32, 500, 5].iter().copied().enumerate() {
        v.set(slot, value);
    }
    let mut mask = PersistentBitVecBuilder::new(3, &dir.path().join("m.pbiv")).unwrap();
    // Slot 1 is left unset, i.e. masked out.
    for kept in [0usize, 2].iter().copied() {
        mask.set(kept, true);
    }
    v.mask_with(mask.view());
    v.close().unwrap();

    let reopened = PersistentCompactIntVec::open(&vec_path).unwrap();
    let observed: Vec<_> = (0..3).map(|slot| reopened.get(slot)).collect();
    assert_eq!(observed, vec![10, 0, 5]);
    let remaining_overflow: Vec<_> = reopened.view().overflow_entries().collect();
    assert!(remaining_overflow.is_empty(), "overflow entry for masked-out slot should be gone");
}
|
||||
|
||||
/// Masking with an all-ones mask must leave every value untouched.
#[test]
fn mask_with_all_ones_is_noop() {
    let dir = tempdir().unwrap();
    let vec_path = dir.path().join("v.pciv");
    let mut v = PersistentCompactIntVecBuilder::new(4, &vec_path).unwrap();
    for (slot, value) in [300u32, 1, 0, 42].iter().copied().enumerate() {
        v.set(slot, value);
    }
    let mask = PersistentBitVecBuilder::new_ones(4, &dir.path().join("m.pbiv")).unwrap();
    v.mask_with(mask.view());
    v.close().unwrap();

    let reopened = PersistentCompactIntVec::open(&vec_path).unwrap();
    let observed: Vec<_> = (0..4).map(|slot| reopened.get(slot)).collect();
    assert_eq!(observed, vec![300, 1, 0, 42]);
}
|
||||
|
||||
// ── BitMatrix: partial_group_presence_count ───────────────────────────────────
|
||||
|
||||
/// Per-slot number of set bits across the group.
///   col0=[T,F,T,F], col1=[T,T,F,F], col2=[F,T,T,F]
/// Expected counts: [2, 2, 2, 0].
#[test]
fn bit_partial_group_presence_count() {
    let columns: [&[bool]; 3] = [
        &[true, false, true, false],
        &[true, true, false, false],
        &[false, true, true, false],
    ];
    let (_d, m) = make_bit_matrix(&columns);
    let group = ColGroup::new("g", vec![0, 1, 2]);
    let counts = m.partial_group_presence_count(&group, 1).unwrap();
    let observed: Vec<_> = (0..4).map(|slot| counts.get(slot)).collect();
    assert_eq!(observed, vec![2, 2, 2, 0]);
}
|
||||
|
||||
// ── BitMatrix: partial_group_any ──────────────────────────────────────────────
|
||||
|
||||
/// Per-slot "any bit set" across the group.
///   col0=[T,F,F], col1=[F,F,T]  ->  [T, F, T]
#[test]
fn bit_partial_group_any() {
    let columns: [&[bool]; 2] = [
        &[true, false, false],
        &[false, false, true],
    ];
    let (_d, m) = make_bit_matrix(&columns);
    let group = ColGroup::new("g", vec![0, 1]);
    let any = m.partial_group_any(&group, 1).unwrap();
    let expected = [true, false, true];
    for (slot, &want) in expected.iter().enumerate() {
        assert_eq!(any.get(slot), want);
    }
}
|
||||
|
||||
// ── Composition: partial results are additive ─────────────────────────────────
|
||||
|
||||
/// Two partitions covering disjoint slot ranges of the same two columns.
/// Global data: col0=[5,1,0,3,2], col1=[2,0,4,3,1], threshold 2.
/// Partition A holds slots 0..2, partition B holds slots 2..5. The test only
/// checks that each partition yields its own expected per-slot counts.
#[test]
fn int_presence_count_additive_across_split() {
    let data_a: &[&[u32]] = &[&[5, 1], &[2, 0]];
    let data_b: &[&[u32]] = &[&[0, 3, 2], &[4, 3, 1]];
    let (_da, ma) = make_int_matrix(data_a);
    let (_db, mb) = make_int_matrix(data_b);
    let group = ColGroup::new("g", vec![0, 1]);

    let counts_a = ma.partial_group_presence_count(&group, 2).unwrap();
    let counts_b = mb.partial_group_presence_count(&group, 2).unwrap();

    // Partition A: col0 -> [T,F], col1 -> [T,F]  =>  [2, 0]
    let observed_a: Vec<_> = (0..2).map(|slot| counts_a.get(slot)).collect();
    assert_eq!(observed_a, vec![2, 0]);

    // Partition B: col0 -> [F,T,T], col1 -> [T,T,F]  =>  [1, 2, 1]
    let observed_b: Vec<_> = (0..3).map(|slot| counts_b.get(slot)).collect();
    assert_eq!(observed_b, vec![1, 2, 1]);
}
|
||||
@@ -1,6 +1,6 @@
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
|
||||
use crate::{pack_compact_int_matrix, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
|
||||
use crate::traits::CountPartials;
|
||||
|
||||
fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
||||
@@ -243,6 +243,61 @@ fn partial_hellinger_matches_full() {
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a matrix containing values of 255 and above, packs it, reopens it
/// and checks `col_view()` values, sum, non-zero count and overflow entries.
#[test]
fn col_view_packed_values() {
    let (dir, _columnar) = make_matrix(&[&[10, 300, 500], &[200, 50, 1000]]);
    pack_compact_int_matrix(&dir.path().join("counts")).unwrap();
    let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();

    // Column 0: [10, 300, 500] has two overflow slots.
    let first = m.col_view(0);
    let first_values: Vec<_> = (0..3).map(|slot| first.get(slot)).collect();
    assert_eq!(first_values, vec![10, 300, 500]);
    assert_eq!(first.sum(), 810);
    assert_eq!(first.count_nonzero(), 3);
    let mut first_overflow: Vec<(usize, u32)> = first.overflow_entries().collect();
    first_overflow.sort_unstable_by_key(|entry| entry.0);
    assert_eq!(first_overflow, vec![(1, 300), (2, 500)]);

    // Column 1: [200, 50, 1000] has one overflow slot.
    let second = m.col_view(1);
    let second_values: Vec<_> = (0..3).map(|slot| second.get(slot)).collect();
    assert_eq!(second_values, vec![200, 50, 1000]);
    let mut second_overflow: Vec<(usize, u32)> = second.overflow_entries().collect();
    second_overflow.sort_unstable_by_key(|entry| entry.0);
    assert_eq!(second_overflow, vec![(2, 1000)]);
}
|
||||
|
||||
/// `col_view()` on a packed matrix must agree with `col()` on an unpacked
/// matrix built from the same data: length, every slot, sum and overflow entries.
#[test]
fn col_view_packed_matches_columnar() {
    let data: &[&[u32]] = &[&[0, 255, 1, 300, 128], &[500, 3, 0, 700, 42]];
    let (dir_col, m_col) = make_matrix(data);
    // Second copy in its own directory, so packing leaves `m_col`'s files alone.
    let (dir_pack, _) = make_matrix(data);
    pack_compact_int_matrix(&dir_pack.path().join("counts")).unwrap();
    let m_pack = PersistentCompactIntMatrix::open(dir_pack.path()).unwrap();

    for (c, _) in data.iter().enumerate() {
        let columnar = m_col.col(c);
        let packed = m_pack.col_view(c);
        assert_eq!(packed.len(), columnar.len());
        let n_slots = columnar.len();
        for s in 0..n_slots {
            assert_eq!(packed.get(s), columnar.get(s), "col={c} slot={s}");
        }
        assert_eq!(packed.sum(), columnar.sum(), "col={c} sum");

        let mut packed_overflow: Vec<(usize, u32)> = packed.overflow_entries().collect();
        let mut columnar_overflow: Vec<(usize, u32)> = columnar.view().overflow_entries().collect();
        packed_overflow.sort_unstable_by_key(|entry| entry.0);
        columnar_overflow.sort_unstable_by_key(|entry| entry.0);
        assert_eq!(packed_overflow, columnar_overflow, "col={c} overflow_entries");
    }
    drop(dir_col);
}
|
||||
|
||||
#[test]
|
||||
fn partial_relfreq_bray_additive_across_split() {
|
||||
// Split rows [1,2,3,4,5] between two matrices; partial sums should add up.
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
mod bitmatrix;
|
||||
mod bitvec;
|
||||
mod colgroup;
|
||||
mod intmatrix;
|
||||
|
||||
use tempfile::tempdir;
|
||||
@@ -169,7 +170,7 @@ fn combine_min() {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("out.pciv");
|
||||
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
||||
b.min(&rb);
|
||||
b.min(rb.view());
|
||||
b.close().unwrap();
|
||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 100, 0, 800]);
|
||||
@@ -182,7 +183,7 @@ fn combine_max() {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("out.pciv");
|
||||
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
||||
b.max(&rb);
|
||||
b.max(rb.view());
|
||||
b.close().unwrap();
|
||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![20, 300, 500, 1000]);
|
||||
@@ -195,7 +196,7 @@ fn combine_add() {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("out.pciv");
|
||||
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
||||
b.add(&rb);
|
||||
b.add(rb.view());
|
||||
b.close().unwrap();
|
||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![30, 300, 5, 101]);
|
||||
@@ -220,7 +221,7 @@ fn combine_diff() {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("out.pciv");
|
||||
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
||||
b.diff(&rb);
|
||||
b.diff(rb.view());
|
||||
b.close().unwrap();
|
||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 700, 0, 0]);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use ndarray::{Array1, Array2};
|
||||
|
||||
/// Column-level weight statistic — total count or presence count per column.
|
||||
// ── Column-level weight statistic — total count or presence count per column.
|
||||
/// Additive across layers and partitions; used as denominator in normalised distances.
|
||||
///
|
||||
/// `partial_kmer_counts` returns the number of **distinct k-mers** present per
|
||||
|
||||
@@ -0,0 +1,278 @@
|
||||
use crate::format::{byte_count_nonzero, byte_sum, parse_overflow_entry};
|
||||
|
||||
// ── BitSliceView ──────────────────────────────────────────────────────────────
|
||||
|
||||
/// Lightweight, copy-able read-only view over a `u64` word array.
///
/// Bit `i` lives in `words[i >> 6]` at bit position `i & 63`. Bits past `n`
/// in the last word are padding and are expected to be zero; `count_ones`,
/// `count_zeros` and the distance methods scan whole words and rely on that.
#[derive(Clone, Copy)]
pub struct BitSliceView<'a> {
    /// Backing words.
    pub(crate) words: &'a [u64],
    /// Number of addressable bits.
    pub(crate) n: usize,
}

impl<'a> BitSliceView<'a> {
    /// Wraps `words` as a view over `n` bits. No validation is performed.
    #[inline]
    pub fn new(words: &'a [u64], n: usize) -> Self {
        Self { words, n }
    }

    /// Number of bits in the view.
    pub fn len(&self) -> usize {
        self.n
    }

    /// `true` when the view holds no bits.
    pub fn is_empty(&self) -> bool {
        self.n == 0
    }

    /// The backing words, with the view's full lifetime.
    pub fn words(&self) -> &'a [u64] {
        self.words
    }

    /// Returns bit `slot`.
    ///
    /// # Panics
    /// Panics if `slot >> 6` is outside `words`. In debug builds it also
    /// panics when `slot >= len()`, so reading a padding bit is caught instead
    /// of silently returning `false`.
    #[inline]
    pub fn get(&self, slot: usize) -> bool {
        debug_assert!(
            slot < self.n,
            "BitSliceView::get: slot {slot} out of range (len {})",
            self.n
        );
        let word = self.words[slot >> 6];
        (word >> (slot & 63)) & 1 != 0
    }

    /// Total number of set bits across all words.
    pub fn count_ones(&self) -> u64 {
        self.words.iter().map(|w| w.count_ones() as u64).sum()
    }

    /// `len()` minus `count_ones()`.
    pub fn count_zeros(&self) -> u64 {
        self.n as u64 - self.count_ones()
    }

    /// Iterator over the `n` bits in slot order.
    pub fn iter(&self) -> BitSliceIter<'a> {
        BitSliceIter { words: self.words, slot: 0, n: self.n }
    }

    /// Returns `(intersection, union)` popcounts of the two views.
    ///
    /// # Panics
    /// Panics if the two views have different lengths.
    pub fn partial_jaccard_dist(self, other: BitSliceView<'_>) -> (u64, u64) {
        assert_eq!(self.n, other.n, "BitSliceView length mismatch");
        let mut inter = 0u64;
        let mut union = 0u64;
        for (&a, &b) in self.words.iter().zip(other.words) {
            inter += (a & b).count_ones() as u64;
            union += (a | b).count_ones() as u64;
        }
        (inter, union)
    }

    /// Jaccard distance `1 - |A and B| / |A or B|`; `0.0` when the union is empty.
    pub fn jaccard_dist(self, other: BitSliceView<'_>) -> f64 {
        match self.partial_jaccard_dist(other) {
            (_, 0) => 0.0,
            (inter, union) => 1.0 - inter as f64 / union as f64,
        }
    }

    /// Number of positions at which the two views differ.
    ///
    /// # Panics
    /// Panics if the two views have different lengths.
    pub fn hamming_dist(self, other: BitSliceView<'_>) -> u64 {
        assert_eq!(self.n, other.n, "BitSliceView length mismatch");
        self.words
            .iter()
            .zip(other.words)
            .map(|(&a, &b)| (a ^ b).count_ones() as u64)
            .sum()
    }
}

// ── BitSliceIter ──────────────────────────────────────────────────────────────

/// Iterator over the bits of a `BitSliceView`, in slot order.
pub struct BitSliceIter<'a> {
    words: &'a [u64],
    /// Next slot to yield.
    slot: usize,
    /// One past the last slot.
    n: usize,
}

impl Iterator for BitSliceIter<'_> {
    type Item = bool;

    fn next(&mut self) -> Option<bool> {
        if self.slot >= self.n {
            return None;
        }
        let i = self.slot;
        let bit = (self.words[i >> 6] >> (i & 63)) & 1 != 0;
        self.slot = i + 1;
        Some(bit)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // `slot` never exceeds `n`, so this cannot underflow.
        let remaining = self.n - self.slot;
        (remaining, Some(remaining))
    }
}

impl ExactSizeIterator for BitSliceIter<'_> {}
|
||||
|
||||
// ── IntSliceView ──────────────────────────────────────────────────────────────
|
||||
|
||||
/// Copy-able, read-only view over a compact-int column: a one-byte-per-slot
/// primary array plus the raw bytes of its overflow table. The view borrows
/// both slices and copies nothing.
#[derive(Clone, Copy)]
pub struct IntSliceView<'a> {
    /// One byte per slot; the value 255 marks a slot whose real value is
    /// stored in the overflow table.
    pub(crate) primary: &'a [u8],
    /// `n_overflow` × OVERFLOW_ENTRY_SIZE bytes, sorted by slot.
    pub(crate) overflow_raw: &'a [u8],
    /// Number of entries in `overflow_raw`.
    pub(crate) n_overflow: usize,
    /// Number of slots.
    pub(crate) n: usize,
}
|
||||
|
||||
impl<'a> IntSliceView<'a> {
|
||||
#[inline]
|
||||
pub fn new(primary: &'a [u8], overflow_raw: &'a [u8], n_overflow: usize, n: usize) -> Self {
|
||||
Self { primary, overflow_raw, n_overflow, n }
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize { self.n }
|
||||
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||
pub fn primary_bytes(&self) -> &'a [u8] { self.primary }
|
||||
pub fn n_overflow(&self) -> usize { self.n_overflow }
|
||||
|
||||
pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + 'a {
|
||||
let raw = self.overflow_raw;
|
||||
let n_ov = self.n_overflow;
|
||||
(0..n_ov).map(move |i| parse_overflow_entry(raw, 0, i))
|
||||
}
|
||||
|
||||
/// O(log n_overflow) via binary search (overflow is always sorted by slot).
|
||||
pub fn get(&self, slot: usize) -> u32 {
|
||||
let b = self.primary[slot];
|
||||
if b < 255 { return b as u32; }
|
||||
let mut lo = 0usize;
|
||||
let mut hi = self.n_overflow;
|
||||
while lo < hi {
|
||||
let mid = lo + (hi - lo) / 2;
|
||||
let (s, v) = parse_overflow_entry(self.overflow_raw, 0, mid);
|
||||
match s.cmp(&slot) {
|
||||
std::cmp::Ordering::Equal => return v,
|
||||
std::cmp::Ordering::Less => lo = mid + 1,
|
||||
std::cmp::Ordering::Greater => hi = mid,
|
||||
}
|
||||
}
|
||||
panic!("slot {slot} marked overflow but not found")
|
||||
}
|
||||
|
||||
/// Sequential merge scan: yields all n values in slot order.
|
||||
pub fn iter(&self) -> IntSliceViewIter<'a> {
|
||||
IntSliceViewIter {
|
||||
primary: self.primary,
|
||||
overflow_raw: self.overflow_raw,
|
||||
slot: 0,
|
||||
overflow_pos: 0,
|
||||
n: self.n,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn sum(&self) -> u64 {
|
||||
byte_sum(self.primary, self.overflow_entries().map(|(_, v)| v))
|
||||
}
|
||||
|
||||
pub fn count_nonzero(&self) -> u64 {
|
||||
byte_count_nonzero(self.primary)
|
||||
}
|
||||
|
||||
// ── Distance methods ──────────────────────────────────────────────────────
|
||||
|
||||
pub fn partial_bray_dist(self, other: IntSliceView<'_>) -> u64 {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum()
|
||||
}
|
||||
|
||||
pub fn bray_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||
let sum_min = self.partial_bray_dist(other);
|
||||
let denom = self.sum() + other.sum();
|
||||
if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 }
|
||||
}
|
||||
|
||||
pub fn partial_relfreq_bray_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
self.iter().zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 };
|
||||
let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 };
|
||||
pa.min(pb)
|
||||
})
|
||||
.sum()
|
||||
}
|
||||
|
||||
pub fn relfreq_bray_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||
let sa = self.sum() as f64;
|
||||
let sb = other.sum() as f64;
|
||||
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||
1.0 - self.partial_relfreq_bray_dist(other, sa, sb)
|
||||
}
|
||||
|
||||
pub fn partial_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
self.iter().zip(other.iter())
|
||||
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
||||
.sum()
|
||||
}
|
||||
|
||||
pub fn euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||
self.partial_euclidean_dist(other).sqrt()
|
||||
}
|
||||
|
||||
pub fn partial_relfreq_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
self.iter().zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 };
|
||||
let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 };
|
||||
let d = pa - pb;
|
||||
d * d
|
||||
})
|
||||
.sum()
|
||||
}
|
||||
|
||||
pub fn relfreq_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||
let sa = self.sum() as f64;
|
||||
let sb = other.sum() as f64;
|
||||
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||
self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt()
|
||||
}
|
||||
|
||||
pub fn partial_hellinger_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
self.iter().zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let pa = if sa > 0.0 { (a as f64 / sa).sqrt() } else { 0.0 };
|
||||
let pb = if sb > 0.0 { (b as f64 / sb).sqrt() } else { 0.0 };
|
||||
let d = pa - pb;
|
||||
d * d
|
||||
})
|
||||
.sum()
|
||||
}
|
||||
|
||||
pub fn hellinger_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||
let sa = self.sum() as f64;
|
||||
let sb = other.sum() as f64;
|
||||
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||
self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt()
|
||||
}
|
||||
|
||||
pub fn hellinger_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||
self.hellinger_euclidean_dist(other) / std::f64::consts::SQRT_2
|
||||
}
|
||||
|
||||
/// Intersection and union sizes for a thresholded Jaccard comparison.
///
/// A position counts as "present" in a view when its value is `>= threshold`.
/// Returns `(intersection, union)`: the number of positions present in both
/// views and the number present in at least one.
///
/// # Panics
/// Panics with "length mismatch" when the two views have different `n`.
pub fn partial_threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> (u64, u64) {
    assert_eq!(self.n, other.n, "length mismatch");
    let mut shared = 0u64;
    let mut either = 0u64;
    for (a, b) in self.iter().zip(other.iter()) {
        let in_a = a >= threshold;
        let in_b = b >= threshold;
        if in_a && in_b {
            shared += 1;
        }
        if in_a || in_b {
            either += 1;
        }
    }
    (shared, either)
}
|
||||
|
||||
/// Jaccard distance with presence defined as `value >= threshold`.
///
/// Computes `1 - intersection / union`; an empty union yields 0.0.
pub fn threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> f64 {
    match self.partial_threshold_jaccard_dist(other, threshold) {
        (_, 0) => 0.0,
        (shared, either) => 1.0 - shared as f64 / either as f64,
    }
}
|
||||
|
||||
/// Jaccard distance where any value of at least 1 counts as present.
pub fn jaccard_dist(self, other: IntSliceView<'_>) -> f64 {
    // Presence means "value >= 1".
    const PRESENCE_THRESHOLD: u32 = 1;
    self.threshold_jaccard_dist(other, PRESENCE_THRESHOLD)
}
|
||||
}
|
||||
|
||||
// ── IntSliceViewIter ──────────────────────────────────────────────────────────
|
||||
|
||||
/// Sequential iterator yielding the `u32` values behind an `IntSliceView`.
pub struct IntSliceViewIter<'a> {
    /// One byte per slot; a byte below 255 is the value itself, 255 means the
    /// value is read from the overflow area instead.
    primary: &'a [u8],
    /// Raw bytes handed to `parse_overflow_entry` for slots whose byte is 255.
    overflow_raw: &'a [u8],
    /// Index of the next slot to yield.
    slot: usize,
    /// Index of the next overflow entry to consume.
    overflow_pos: usize,
    /// Total number of slots; iteration stops once `slot` reaches it.
    n: usize,
}
|
||||
|
||||
impl Iterator for IntSliceViewIter<'_> {
|
||||
type Item = u32;
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.slot >= self.n { return None; }
|
||||
let v = self.primary[self.slot];
|
||||
self.slot += 1;
|
||||
if v < 255 {
|
||||
Some(v as u32)
|
||||
} else {
|
||||
let (_, val) = parse_overflow_entry(self.overflow_raw, 0, self.overflow_pos);
|
||||
self.overflow_pos += 1;
|
||||
Some(val)
|
||||
}
|
||||
}
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let rem = self.n - self.slot;
|
||||
(rem, Some(rem))
|
||||
}
|
||||
}
|
||||
impl ExactSizeIterator for IntSliceViewIter<'_> {}
|
||||
@@ -3,6 +3,7 @@ use crossbeam_channel;
|
||||
use hashbrown::HashMap;
|
||||
use obikseq::k;
|
||||
use obikseq::{CanonicalKmer, Sequence, Unitig};
|
||||
#[cfg(not(any(test, feature = "test-utils")))]
|
||||
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
||||
use std::cell::RefCell;
|
||||
use std::fmt;
|
||||
|
||||
@@ -204,6 +204,7 @@ impl KmerIndex {
|
||||
|
||||
let n = self.n_partitions();
|
||||
let order: Vec<usize> = (0..n).collect();
|
||||
let pb = progress_bar("pack", n as u64, "partitions");
|
||||
crate::numa::PartitionRunner::new().run(
|
||||
&order,
|
||||
|i| -> OKIResult<()> {
|
||||
@@ -220,8 +221,10 @@ impl KmerIndex {
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
|_, _, _| {},
|
||||
)
|
||||
|_, _, _| { pb.inc(1); },
|
||||
)?;
|
||||
pb.finish_and_clear();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write a `layer_meta.json` in any layer directory that is missing one.
|
||||
|
||||
@@ -11,7 +11,7 @@ use obilayeredmap::IndexMode;
|
||||
use crate::error::{OKIError, OKIResult};
|
||||
use crate::index::KmerIndex;
|
||||
use crate::meta::{GenomeInfo, IndexMeta};
|
||||
use crate::state::IndexState;
|
||||
use crate::state::{IndexState, SENTINEL_INDEXED};
|
||||
|
||||
pub use obikpartitionner::MergeMode;
|
||||
|
||||
@@ -263,6 +263,8 @@ impl KmerIndex {
|
||||
rep.push(t.stop());
|
||||
}
|
||||
|
||||
fs::File::create(output.join(SENTINEL_INDEXED)).map_err(OKIError::Io)?;
|
||||
|
||||
KmerIndex::open(output)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,7 +98,9 @@ impl KmerIndex {
|
||||
fs::File::create(output.join(SENTINEL_INDEXED))?;
|
||||
|
||||
let idx = KmerIndex::open(output)?;
|
||||
let t_pack = Stage::start("pack");
|
||||
idx.pack_matrices()?;
|
||||
rep.push(t_pack.stop());
|
||||
Ok(idx)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::io;
|
||||
use std::path::Path;
|
||||
|
||||
use obikpartitionner::{KmerPartition, OutputCol, PARTITIONS_SUBDIR};
|
||||
use obisys::{Stage, progress_bar};
|
||||
use obisys::{Reporter, Stage, progress_bar};
|
||||
use tracing::info;
|
||||
|
||||
use crate::error::{OKIError, OKIResult};
|
||||
@@ -25,6 +25,7 @@ impl KmerIndex {
|
||||
threshold: u32,
|
||||
output_presence: bool,
|
||||
force: bool,
|
||||
rep: &mut Reporter,
|
||||
) -> OKIResult<Self> {
|
||||
let output = output.as_ref();
|
||||
|
||||
@@ -80,13 +81,14 @@ impl KmerIndex {
|
||||
).map_err(OKIError::Partition)?;
|
||||
|
||||
pb.finish_and_clear();
|
||||
|
||||
let _ = t.stop();
|
||||
rep.push(t.stop());
|
||||
|
||||
fs::File::create(output.join(SENTINEL_INDEXED))?;
|
||||
|
||||
let idx = KmerIndex::open(output)?;
|
||||
let t_pack = Stage::start("pack");
|
||||
idx.pack_matrices()?;
|
||||
rep.push(t_pack.stop());
|
||||
Ok(idx)
|
||||
}
|
||||
|
||||
@@ -98,6 +100,7 @@ impl KmerIndex {
|
||||
specs: &[OutputCol],
|
||||
threshold: u32,
|
||||
output_presence: bool,
|
||||
rep: &mut Reporter,
|
||||
) -> OKIResult<()> {
|
||||
if self.state() != IndexState::Indexed {
|
||||
return Err(OKIError::NotIndexed(self.root_path.clone()));
|
||||
@@ -106,7 +109,6 @@ impl KmerIndex {
|
||||
let n_src_genomes = self.meta.genomes.len();
|
||||
let n_partitions = self.partition.n_partitions();
|
||||
|
||||
// Open a second handle to the same path so we can borrow src and dst simultaneously.
|
||||
let src_partition = KmerPartition::open_with_config(
|
||||
&self.root_path,
|
||||
self.meta.config.kmer_size,
|
||||
@@ -132,17 +134,17 @@ impl KmerIndex {
|
||||
).map_err(OKIError::Partition)?;
|
||||
|
||||
pb.finish_and_clear();
|
||||
rep.push(t.stop());
|
||||
|
||||
let _ = t.stop();
|
||||
|
||||
// Update index.meta with new genome list and with_counts flag.
|
||||
self.meta.config.with_counts = !output_presence;
|
||||
self.meta.genomes = specs.iter()
|
||||
.map(|s| GenomeInfo::new(s.label.clone()))
|
||||
.collect();
|
||||
self.meta.write(&self.root_path)?;
|
||||
|
||||
let t_pack = Stage::start("pack");
|
||||
self.pack_matrices()?;
|
||||
rep.push(t_pack.stop());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "obikmer"
|
||||
version = "0.1.0"
|
||||
version = "0.1.3"
|
||||
edition = "2024"
|
||||
|
||||
[[bin]]
|
||||
@@ -19,6 +19,7 @@ obikpartitionner = { path = "../obikpartitionner" }
|
||||
obisys = { path = "../obisys" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
obikindex = { path = "../obikindex" }
|
||||
obitaxonomy = { path = "../obitaxonomy" }
|
||||
obilayeredmap = { path = "../obilayeredmap" }
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::collections::HashMap;
|
||||
use clap::Args;
|
||||
use obikindex::GenomeInfo;
|
||||
use obikpartitionner::{GroupQuorumFilter, KmerFilter};
|
||||
use obitaxonomy::{TaxPath, TaxPattern};
|
||||
|
||||
// ── Operator ──────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -49,7 +50,6 @@ impl MetaPred {
|
||||
if values.iter().any(|v| v.is_empty()) {
|
||||
return Err(format!("empty value in predicate: {s}"));
|
||||
}
|
||||
|
||||
Ok(Self { key, op, values })
|
||||
}
|
||||
|
||||
@@ -70,18 +70,15 @@ impl MetaPred {
|
||||
|
||||
// ── Path matching ─────────────────────────────────────────────────────────────
|
||||
|
||||
/// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy.
|
||||
/// True if the stored taxonomy `value` matches `pattern`.
|
||||
///
|
||||
/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary.
|
||||
/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere.
|
||||
/// `value` must be a valid `TaxPath` (starts with `taxonomy:/`).
|
||||
/// `pattern` is a `TaxPattern` query (see `obitaxonomy::TaxPattern` for syntax).
|
||||
/// Returns `false` if either fails to parse.
|
||||
fn path_matches(value: &str, pattern: &str) -> bool {
|
||||
if pattern.starts_with('/') {
|
||||
value == pattern
|
||||
|| (value.starts_with(pattern)
|
||||
&& value[pattern.len()..].starts_with('/'))
|
||||
} else {
|
||||
value.split('/').any(|seg| seg == pattern)
|
||||
}
|
||||
let Ok(path) = TaxPath::parse(value) else { return false };
|
||||
let Ok(pat) = TaxPattern::parse(pattern) else { return false };
|
||||
pat.matches(&path)
|
||||
}
|
||||
|
||||
// ── Three-value group evaluation ──────────────────────────────────────────────
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::path::PathBuf;
|
||||
use clap::{Args, ValueEnum};
|
||||
use obikindex::{GenomeInfo, KmerIndex};
|
||||
use obikpartitionner::{AggOp, OutputCol};
|
||||
use obisys::Reporter;
|
||||
use tracing::info;
|
||||
|
||||
use super::predicate::matching_genome_indices;
|
||||
@@ -229,20 +230,24 @@ pub fn run(args: SelectArgs) {
|
||||
if output_presence { "presence" } else { "count" },
|
||||
);
|
||||
|
||||
let mut rep = Reporter::new();
|
||||
|
||||
if args.in_place {
|
||||
src.select_in_place(&specs, args.presence_threshold, output_presence)
|
||||
src.select_in_place(&specs, args.presence_threshold, output_presence, &mut rep)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("select error: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
rep.print();
|
||||
info!("selected in-place → {}", args.source.display());
|
||||
} else {
|
||||
let output = args.output.unwrap();
|
||||
KmerIndex::select(&output, &src, &specs, args.presence_threshold, output_presence, args.force)
|
||||
KmerIndex::select(&output, &src, &specs, args.presence_threshold, output_presence, args.force, &mut rep)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("select error: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
rep.print();
|
||||
info!("selected index → {}", output.display());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ use clap::{Parser, Subcommand};
|
||||
use tracing_subscriber::{EnvFilter, fmt};
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "obikmer", about = "DNA k-mer tools")]
|
||||
#[command(name = "obikmer", about = "DNA k-mer tools", version)]
|
||||
struct Cli {
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
|
||||
@@ -1,9 +1,24 @@
|
||||
use obicompactvec::FilterMask;
|
||||
|
||||
/// Trait for kmer row filters.
|
||||
///
|
||||
/// `row` contains raw per-genome counts (or 0/1 for presence/absence data).
|
||||
/// `n_genomes` equals `row.len()`.
|
||||
pub trait KmerFilter: Send + Sync {
|
||||
fn passes(&self, row: &[u32], n_genomes: usize) -> bool;
|
||||
|
||||
/// Express this filter as a [`FilterMask`] column-operation expression.
|
||||
///
|
||||
/// Returns `Some(expr)` if the filter can be evaluated solely from matrix
|
||||
/// column aggregates (no per-kmer row scan needed). Returns `None` if the
|
||||
/// filter requires row-level inspection.
|
||||
///
|
||||
/// `threshold` semantics in the returned mask use `>= threshold`, matching
|
||||
/// [`obicompactvec::MatrixGroupOps`]. Implementations must add 1 to any
|
||||
/// row-level threshold that uses strict `>` comparison.
|
||||
fn column_mask_expr(&self, _n_genomes: usize) -> Option<FilterMask> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// True when `row` passes every filter in `filters`.
|
||||
@@ -29,6 +44,16 @@ impl KmerFilter for MinGenomeFraction {
|
||||
let p = present_count(row, self.threshold);
|
||||
p as f64 / n_genomes as f64 >= self.frac
|
||||
}
|
||||
|
||||
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
|
||||
let t = self.threshold.checked_add(1)?;
|
||||
let min_count = (self.frac * n_genomes as f64).ceil() as usize;
|
||||
Some(FilterMask::PresenceGeq {
|
||||
indices: (0..n_genomes).collect(),
|
||||
threshold: t,
|
||||
min_count,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// At most `frac` fraction of genomes contain this kmer (count > `threshold`).
|
||||
@@ -42,6 +67,16 @@ impl KmerFilter for MaxGenomeFraction {
|
||||
let p = present_count(row, self.threshold);
|
||||
p as f64 / n_genomes as f64 <= self.frac
|
||||
}
|
||||
|
||||
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
|
||||
let t = self.threshold.checked_add(1)?;
|
||||
let max_count = (self.frac * n_genomes as f64).floor() as usize;
|
||||
Some(FilterMask::PresenceLeq {
|
||||
indices: (0..n_genomes).collect(),
|
||||
threshold: t,
|
||||
max_count,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// At least `count` genomes contain this kmer (count > `threshold`).
|
||||
@@ -54,6 +89,15 @@ impl KmerFilter for MinGenomeCount {
|
||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||
present_count(row, self.threshold) >= self.count
|
||||
}
|
||||
|
||||
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
|
||||
let t = self.threshold.checked_add(1)?;
|
||||
Some(FilterMask::PresenceGeq {
|
||||
indices: (0..n_genomes).collect(),
|
||||
threshold: t,
|
||||
min_count: self.count,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// At most `count` genomes contain this kmer (count > `threshold`).
|
||||
@@ -66,6 +110,15 @@ impl KmerFilter for MaxGenomeCount {
|
||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||
present_count(row, self.threshold) <= self.count
|
||||
}
|
||||
|
||||
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
|
||||
let t = self.threshold.checked_add(1)?;
|
||||
Some(FilterMask::PresenceLeq {
|
||||
indices: (0..n_genomes).collect(),
|
||||
threshold: t,
|
||||
max_count: self.count,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ── Total-count filters (count indexes only) ───────────────────────────────────
|
||||
@@ -79,6 +132,13 @@ impl KmerFilter for MinTotalCount {
|
||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||
row.iter().sum::<u32>() >= self.total
|
||||
}
|
||||
|
||||
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
|
||||
Some(FilterMask::SumGeq {
|
||||
indices: (0..n_genomes).collect(),
|
||||
min_sum: self.total,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Sum of counts across all genomes <= `total`.
|
||||
@@ -90,6 +150,13 @@ impl KmerFilter for MaxTotalCount {
|
||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||
row.iter().sum::<u32>() <= self.total
|
||||
}
|
||||
|
||||
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
|
||||
Some(FilterMask::SumLeq {
|
||||
indices: (0..n_genomes).collect(),
|
||||
max_sum: self.total,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ── Group-based quorum filter ─────────────────────────────────────────────────
|
||||
@@ -113,6 +180,37 @@ pub struct GroupQuorumFilter {
|
||||
pub max_outgroup_frac: f64,
|
||||
}
|
||||
|
||||
impl GroupQuorumFilter {
|
||||
// Build PresenceGeq/PresenceLeq constraints for one group (ingroup or outgroup).
|
||||
fn group_mask_parts(
|
||||
indices: &[usize],
|
||||
threshold: u32,
|
||||
min_count: usize,
|
||||
max_count: usize,
|
||||
min_frac: f64,
|
||||
max_frac: f64,
|
||||
parts: &mut Vec<FilterMask>,
|
||||
) {
|
||||
let n = indices.len();
|
||||
let geq = min_count.max((min_frac * n as f64).ceil() as usize);
|
||||
if geq > 0 {
|
||||
parts.push(FilterMask::PresenceGeq {
|
||||
indices: indices.to_vec(),
|
||||
threshold,
|
||||
min_count: geq,
|
||||
});
|
||||
}
|
||||
let leq = max_count.min((max_frac * n as f64).floor() as usize);
|
||||
if leq < n {
|
||||
parts.push(FilterMask::PresenceLeq {
|
||||
indices: indices.to_vec(),
|
||||
threshold,
|
||||
max_count: leq,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl KmerFilter for GroupQuorumFilter {
|
||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||
if !self.ingroup_idx.is_empty() {
|
||||
@@ -139,4 +237,26 @@ impl KmerFilter for GroupQuorumFilter {
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
fn column_mask_expr(&self, _n_genomes: usize) -> Option<FilterMask> {
|
||||
let t = self.threshold.checked_add(1)?;
|
||||
let mut parts: Vec<FilterMask> = Vec::new();
|
||||
if !self.ingroup_idx.is_empty() {
|
||||
Self::group_mask_parts(
|
||||
&self.ingroup_idx, t,
|
||||
self.min_count, self.max_count,
|
||||
self.min_frac, self.max_frac,
|
||||
&mut parts,
|
||||
);
|
||||
}
|
||||
if !self.outgroup_idx.is_empty() {
|
||||
Self::group_mask_parts(
|
||||
&self.outgroup_idx, t,
|
||||
self.min_outgroup_count, self.max_outgroup_count,
|
||||
self.min_outgroup_frac, self.max_outgroup_frac,
|
||||
&mut parts,
|
||||
);
|
||||
}
|
||||
Some(FilterMask::And(parts))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ use obipipeline::{
|
||||
};
|
||||
|
||||
use obicompactvec::{
|
||||
MatrixGroupOps,
|
||||
PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
|
||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
|
||||
};
|
||||
@@ -78,6 +79,41 @@ impl SrcLayerData {
|
||||
}
|
||||
buf
|
||||
}
|
||||
|
||||
pub(crate) fn n_slots(&self) -> usize {
|
||||
match self {
|
||||
SrcLayerData::Presence(_, mat) => mat.n(),
|
||||
SrcLayerData::Count(_, mat) => mat.n(),
|
||||
}
|
||||
}
|
||||
|
||||
/// MPHF lookup: returns the slot index for `kmer` (kmer must be in the domain).
|
||||
#[inline]
|
||||
pub(crate) fn slot(&self, kmer: CanonicalKmer) -> usize {
|
||||
match self {
|
||||
SrcLayerData::Presence(mphf, _) => mphf.index(kmer),
|
||||
SrcLayerData::Count(mphf, _) => mphf.index(kmer),
|
||||
}
|
||||
}
|
||||
|
||||
/// Row lookup by slot index, bypassing the MPHF.
|
||||
#[inline]
|
||||
pub(crate) fn fill_row_by_slot(&self, slot: usize, n_genomes: usize) -> Vec<u32> {
|
||||
let mut buf = vec![0u32; n_genomes];
|
||||
match self {
|
||||
SrcLayerData::Presence(_, mat) => mat.fill_row(slot, &mut buf),
|
||||
SrcLayerData::Count(_, mat) => mat.fill_row(slot, &mut buf),
|
||||
}
|
||||
buf
|
||||
}
|
||||
|
||||
/// Call `f` with a reference to the underlying matrix as `&dyn MatrixGroupOps`.
|
||||
pub(crate) fn with_matrix<R>(&self, f: impl FnOnce(&dyn MatrixGroupOps) -> R) -> R {
|
||||
match self {
|
||||
SrcLayerData::Presence(_, mat) => f(mat),
|
||||
SrcLayerData::Count(_, mat) => f(mat),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
use std::path::Path;
|
||||
|
||||
use obicompactvec::{
|
||||
PersistentBitMatrixBuilder, PersistentBitVecBuilder, PersistentCompactIntMatrixBuilder,
|
||||
PersistentCompactIntVecBuilder,
|
||||
FilterMask, eval_filter_mask,
|
||||
PersistentBitMatrixBuilder, PersistentBitVecBuilder,
|
||||
PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
|
||||
};
|
||||
use obidebruinj::GraphDeBruijn;
|
||||
use obikseq::CanonicalKmer;
|
||||
@@ -10,18 +11,135 @@ use obilayeredmap::meta::PartitionMeta;
|
||||
use obilayeredmap::{IndexMode, MphfLayer};
|
||||
use obiskio::{SKError, SKResult, UnitigFileReader};
|
||||
|
||||
use crate::common::{ColBuilder, col_path_bit, col_path_int, load_meta, olm_to_sk, write_matrix_meta};
|
||||
use crate::filter::{KmerFilter, passes_all};
|
||||
use crate::common::{load_meta, olm_to_sk};
|
||||
use crate::filter::KmerFilter;
|
||||
use crate::graph_pipeline::materialize_layer;
|
||||
use crate::merge_layer::{MergeMode, SrcLayerData};
|
||||
use crate::partition::KmerPartition;
|
||||
|
||||
const INDEX_SUBDIR: &str = "index";
|
||||
|
||||
/// Iterate all kmers in `src_index_dir` that pass `filters`, yielding `(kmer, row)`.
|
||||
// ── Builders — pair matrix builder + column builders for one mode ─────────────
|
||||
|
||||
enum Builders {
|
||||
Presence(PersistentBitMatrixBuilder, Vec<PersistentBitVecBuilder>),
|
||||
Count(PersistentCompactIntMatrixBuilder, Vec<PersistentCompactIntVecBuilder>),
|
||||
}
|
||||
|
||||
impl Builders {
|
||||
fn new(mode: MergeMode, n: usize, dir: &Path, n_genomes: usize) -> SKResult<Self> {
|
||||
match mode {
|
||||
MergeMode::Presence => {
|
||||
let mut mat = PersistentBitMatrixBuilder::new(n, dir).map_err(SKError::Io)?;
|
||||
let mut cols = Vec::with_capacity(n_genomes);
|
||||
for _ in 0..n_genomes { cols.push(mat.add_col().map_err(SKError::Io)?); }
|
||||
Ok(Builders::Presence(mat, cols))
|
||||
}
|
||||
MergeMode::Count => {
|
||||
let mut mat = PersistentCompactIntMatrixBuilder::new(n, dir).map_err(SKError::Io)?;
|
||||
let mut cols = Vec::with_capacity(n_genomes);
|
||||
for _ in 0..n_genomes { cols.push(mat.add_col().map_err(SKError::Io)?); }
|
||||
Ok(Builders::Count(mat, cols))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn set_val(&mut self, col: usize, slot: usize, value: u32) {
|
||||
match self {
|
||||
Builders::Presence(_, cols) => cols[col].set(slot, value > 0),
|
||||
Builders::Count(_, cols) => cols[col].set(slot, value),
|
||||
}
|
||||
}
|
||||
|
||||
fn close(self) -> SKResult<()> {
|
||||
match self {
|
||||
Builders::Presence(mat, cols) => {
|
||||
for b in cols { b.close().map_err(SKError::Io)?; }
|
||||
mat.close().map_err(SKError::Io)
|
||||
}
|
||||
Builders::Count(mat, cols) => {
|
||||
for b in cols { b.close().map_err(SKError::Io)?; }
|
||||
mat.close().map_err(SKError::Io)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── try_compute_combined_mask ─────────────────────────────────────────────────
|
||||
|
||||
/// Build a per-slot `TempBitVec` mask from `filters` using column operations
|
||||
/// on the source matrix — no per-kmer MPHF lookup or row read needed.
|
||||
///
|
||||
/// Uses [`SrcLayerData`] semantics: counts take priority over presence when
|
||||
/// `mode = Count`; presence (or implicit all-ones) is used for `Presence`.
|
||||
/// Returns `Some(mask)` when every filter in `filters` can express itself as
|
||||
/// a [`FilterMask`] expression. Returns `None` when any filter requires
|
||||
/// row-level inspection (fall back to `passes_all`).
|
||||
fn try_compute_combined_mask(
|
||||
filters: &[Box<dyn KmerFilter>],
|
||||
src_data: &SrcLayerData,
|
||||
n_genomes: usize,
|
||||
) -> SKResult<Option<obicompactvec::TempBitVec>> {
|
||||
if filters.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
let mut exprs: Vec<FilterMask> = Vec::with_capacity(filters.len());
|
||||
for f in filters {
|
||||
match f.column_mask_expr(n_genomes) {
|
||||
Some(expr) => exprs.push(expr),
|
||||
None => return Ok(None),
|
||||
}
|
||||
}
|
||||
let combined = FilterMask::And(exprs);
|
||||
let n = src_data.n_slots();
|
||||
let mask = src_data
|
||||
.with_matrix(|mat| eval_filter_mask(&combined, mat, n))
|
||||
.map_err(SKError::Io)?;
|
||||
Ok(Some(mask))
|
||||
}
|
||||
|
||||
// ── iter_src_kmers_masked (pass 1) ────────────────────────────────────────────
|
||||
|
||||
/// Iterate all passing kmers in `src_index_dir`, yielding only the kmer value.
|
||||
///
|
||||
/// When all filters can be expressed as column operations, a per-slot mask is
|
||||
/// computed once per layer and used for O(1) slot-check per kmer instead of a
|
||||
/// full row read. Falls back to row-level `passes_all` otherwise.
|
||||
fn iter_src_kmers_masked(
|
||||
src_index_dir: &Path,
|
||||
mode: MergeMode,
|
||||
n_genomes: usize,
|
||||
filters: &[Box<dyn KmerFilter>],
|
||||
mut cb: impl FnMut(CanonicalKmer),
|
||||
) -> SKResult<()> {
|
||||
let src_meta = load_meta(src_index_dir, "rebuild")?;
|
||||
for l in 0..src_meta.n_layers {
|
||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||
let unitigs_path = src_layer_dir.join("unitigs.bin");
|
||||
if !unitigs_path.exists() { continue; }
|
||||
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||
let mask = try_compute_combined_mask(filters, &src_data, n_genomes)?;
|
||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
let slot = src_data.slot(kmer);
|
||||
let passes = match &mask {
|
||||
Some(m) => m.get(slot),
|
||||
None => {
|
||||
let row = src_data.fill_row_by_slot(slot, n_genomes);
|
||||
filters.iter().all(|f| f.passes(&row, n_genomes))
|
||||
}
|
||||
};
|
||||
if passes { cb(kmer); }
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ── iter_src_layers (pass 2) ──────────────────────────────────────────────────
|
||||
|
||||
/// Iterate all passing kmers in `src_index_dir`, yielding `(kmer, row)`.
|
||||
///
|
||||
/// When the slot mask is available, skips the row read for filtered-out slots.
|
||||
fn iter_src_layers(
|
||||
src_index_dir: &Path,
|
||||
mode: MergeMode,
|
||||
@@ -33,17 +151,23 @@ fn iter_src_layers(
|
||||
for l in 0..src_meta.n_layers {
|
||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||
let unitigs_path = src_layer_dir.join("unitigs.bin");
|
||||
if !unitigs_path.exists() {
|
||||
continue;
|
||||
}
|
||||
if !unitigs_path.exists() { continue; }
|
||||
|
||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||
let mask = try_compute_combined_mask(filters, &src_data, n_genomes)?;
|
||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
let row = src_data.lookup(kmer, n_genomes);
|
||||
if passes_all(filters, &row, n_genomes) {
|
||||
let slot = src_data.slot(kmer);
|
||||
if let Some(ref m) = mask {
|
||||
if !m.get(slot) { continue; }
|
||||
let row = src_data.fill_row_by_slot(slot, n_genomes);
|
||||
cb(kmer, row.into_boxed_slice());
|
||||
} else {
|
||||
let row = src_data.fill_row_by_slot(slot, n_genomes);
|
||||
if filters.iter().all(|f| f.passes(&row, n_genomes)) {
|
||||
cb(kmer, row.into_boxed_slice());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -81,7 +205,7 @@ impl KmerPartition {
|
||||
|
||||
// ── Pass 1: collect filtered kmers into de Bruijn graph ───────────────
|
||||
let mut g = GraphDeBruijn::new();
|
||||
iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, _row| {
|
||||
iter_src_kmers_masked(&src_index_dir, mode, n_genomes, filters, |kmer| {
|
||||
g.push(kmer);
|
||||
})?;
|
||||
|
||||
@@ -103,51 +227,19 @@ impl KmerPartition {
|
||||
MergeMode::Count => dst_layer_dir.join("counts"),
|
||||
};
|
||||
std::fs::create_dir_all(&data_dir)?;
|
||||
|
||||
let mut builders: Vec<ColBuilder> = match mode {
|
||||
MergeMode::Presence => {
|
||||
PersistentBitMatrixBuilder::new(n_new, &data_dir)
|
||||
.map_err(SKError::Io)?
|
||||
.close()
|
||||
.map_err(SKError::Io)?;
|
||||
(0..n_genomes)
|
||||
.map(|g| -> SKResult<ColBuilder> {
|
||||
let b = PersistentBitVecBuilder::new(n_new, &col_path_bit(&data_dir, g))?;
|
||||
Ok(ColBuilder::Bit(b))
|
||||
})
|
||||
.collect::<SKResult<_>>()?
|
||||
}
|
||||
MergeMode::Count => {
|
||||
PersistentCompactIntMatrixBuilder::new(n_new, &data_dir)
|
||||
.map_err(SKError::Io)?
|
||||
.close()
|
||||
.map_err(SKError::Io)?;
|
||||
(0..n_genomes)
|
||||
.map(|g| -> SKResult<ColBuilder> {
|
||||
let b = PersistentCompactIntVecBuilder::new(
|
||||
n_new,
|
||||
&col_path_int(&data_dir, g),
|
||||
)?;
|
||||
Ok(ColBuilder::Int(b))
|
||||
})
|
||||
.collect::<SKResult<_>>()?
|
||||
}
|
||||
};
|
||||
let mut builders = Builders::new(mode, n_new, &data_dir, n_genomes)?;
|
||||
|
||||
// ── Pass 2: fill builders ─────────────────────────────────────────────
|
||||
iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, row| {
|
||||
if let Some(slot) = dst_mphf.find(kmer) {
|
||||
for (col, &value) in row.iter().enumerate() {
|
||||
builders[col].set_val(slot, value);
|
||||
builders.set_val(col, slot, value);
|
||||
}
|
||||
}
|
||||
})?;
|
||||
|
||||
// ── Close builders, write metadata ────────────────────────────────────
|
||||
for b in builders {
|
||||
b.close()?;
|
||||
}
|
||||
write_matrix_meta(&data_dir, n_new, n_genomes).map_err(SKError::Io)?;
|
||||
// ── Close builders and write metadata ─────────────────────────────────
|
||||
builders.close()?;
|
||||
|
||||
PartitionMeta {
|
||||
n_layers: 1,
|
||||
|
||||
@@ -3,8 +3,9 @@ use std::io;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use obicompactvec::{
|
||||
PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
|
||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
|
||||
ColGroup, MatrixGroupOps,
|
||||
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
||||
};
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
use obilayeredmap::OLMError;
|
||||
@@ -40,52 +41,6 @@ pub struct OutputCol {
|
||||
pub op: AggOp,
|
||||
}
|
||||
|
||||
// ── Aggregation ───────────────────────────────────────────────────────────────
|
||||
|
||||
#[inline]
|
||||
fn aggregate(op: AggOp, indices: &[usize], src_row: &[u32], threshold: u32) -> u32 {
|
||||
match op {
|
||||
AggOp::Any => {
|
||||
if indices.iter().any(|&i| src_row[i] > threshold) { 1 } else { 0 }
|
||||
}
|
||||
AggOp::All => {
|
||||
if indices.is_empty() { return 0; }
|
||||
if indices.iter().all(|&i| src_row[i] > threshold) { 1 } else { 0 }
|
||||
}
|
||||
AggOp::None => {
|
||||
if indices.iter().all(|&i| src_row[i] <= threshold) { 1 } else { 0 }
|
||||
}
|
||||
AggOp::Sum => {
|
||||
indices.iter().map(|&i| src_row[i]).fold(0u32, |a, b| a.saturating_add(b))
|
||||
}
|
||||
AggOp::Min => indices.iter().map(|&i| src_row[i]).min().unwrap_or(0),
|
||||
AggOp::Max => indices.iter().map(|&i| src_row[i]).max().unwrap_or(0),
|
||||
}
|
||||
}
|
||||
|
||||
// ── ColBuilder ────────────────────────────────────────────────────────────────
|
||||
|
||||
enum ColBuilder {
|
||||
Bit(PersistentBitVecBuilder),
|
||||
Int(PersistentCompactIntVecBuilder),
|
||||
}
|
||||
|
||||
impl ColBuilder {
|
||||
fn set_val(&mut self, slot: usize, value: u32) {
|
||||
match self {
|
||||
ColBuilder::Bit(b) => b.set(slot, value > 0),
|
||||
ColBuilder::Int(b) => b.set(slot, value),
|
||||
}
|
||||
}
|
||||
|
||||
fn close(self) -> SKResult<()> {
|
||||
match self {
|
||||
ColBuilder::Bit(b) => b.close().map_err(SKError::Io),
|
||||
ColBuilder::Int(b) => b.close().map_err(SKError::Io),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
fn olm_to_sk(e: OLMError) -> SKError {
|
||||
@@ -95,21 +50,6 @@ fn olm_to_sk(e: OLMError) -> SKError {
|
||||
}
|
||||
}
|
||||
|
||||
/// Path of bit column `col` inside `dir`: `col_NNNNNN.pbiv`, with the index
/// zero-padded to six digits.
fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{col:06}.pbiv");
    dir.join(file_name)
}
|
||||
|
||||
/// Path of integer column `col` inside `dir`: `col_NNNNNN.pciv`, with the
/// index zero-padded to six digits.
fn col_path_int(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{col:06}.pciv");
    dir.join(file_name)
}
|
||||
|
||||
/// Writes `meta.json` into `dir` as a one-line JSON object holding the row
/// count `n` and the column count `n_cols`, followed by a newline.
fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
    let target = dir.join("meta.json");
    let payload = format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n");
    fs::write(target, payload)
}
|
||||
|
||||
/// Copy all plain files (not subdirectories) from `src_dir` to `dst_dir`.
|
||||
fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> {
|
||||
for entry in fs::read_dir(src_dir)? {
|
||||
@@ -125,30 +65,64 @@ fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> {
|
||||
// ── fill_builders ─────────────────────────────────────────────────────────────
|
||||
|
||||
fn fill_builders(
|
||||
builders: &mut [ColBuilder],
|
||||
specs: &[OutputCol],
|
||||
n: usize,
|
||||
n_src: usize,
|
||||
src_layer_dir: &Path,
|
||||
src_is_count: bool,
|
||||
threshold: u32,
|
||||
output_presence: bool,
|
||||
mut dst_bit: Option<&mut PersistentBitMatrixBuilder>,
|
||||
mut dst_int: Option<&mut PersistentCompactIntMatrixBuilder>,
|
||||
) -> SKResult<()> {
|
||||
let mut src_buf = vec![0u32; n_src];
|
||||
|
||||
if src_is_count {
|
||||
let mat = PersistentCompactIntMatrix::open(src_layer_dir).map_err(SKError::Io)?;
|
||||
for slot in 0..n {
|
||||
mat.fill_row(slot, &mut src_buf);
|
||||
for (col, spec) in specs.iter().enumerate() {
|
||||
builders[col].set_val(slot, aggregate(spec.op, &spec.indices, &src_buf, threshold));
|
||||
for spec in specs {
|
||||
let g = ColGroup::new(&spec.label, spec.indices.clone());
|
||||
if output_presence {
|
||||
let b = dst_bit.as_deref_mut().unwrap();
|
||||
match spec.op {
|
||||
AggOp::Any => b.add_col_from (&mat.partial_group_any (&g, threshold).map_err(SKError::Io)?),
|
||||
AggOp::All => b.add_col_from (&mat.partial_group_all (&g, threshold).map_err(SKError::Io)?),
|
||||
AggOp::None => b.add_col_from (&mat.partial_group_none(&g, threshold).map_err(SKError::Io)?),
|
||||
AggOp::Sum => b.add_col_from_int(&mat.partial_group_sum (&g).map_err(SKError::Io)?),
|
||||
AggOp::Min => b.add_col_from_int(&mat.partial_group_min (&g).map_err(SKError::Io)?),
|
||||
AggOp::Max => b.add_col_from_int(&mat.partial_group_max (&g).map_err(SKError::Io)?),
|
||||
}.map_err(SKError::Io)?;
|
||||
} else {
|
||||
let b = dst_int.as_deref_mut().unwrap();
|
||||
match spec.op {
|
||||
AggOp::Sum => b.add_col_from (&mat.partial_group_sum (&g).map_err(SKError::Io)?),
|
||||
AggOp::Min => b.add_col_from (&mat.partial_group_min (&g).map_err(SKError::Io)?),
|
||||
AggOp::Max => b.add_col_from (&mat.partial_group_max (&g).map_err(SKError::Io)?),
|
||||
AggOp::Any => b.add_col_from_bit(&mat.partial_group_any (&g, threshold).map_err(SKError::Io)?),
|
||||
AggOp::All => b.add_col_from_bit(&mat.partial_group_all (&g, threshold).map_err(SKError::Io)?),
|
||||
AggOp::None => b.add_col_from_bit(&mat.partial_group_none(&g, threshold).map_err(SKError::Io)?),
|
||||
}.map_err(SKError::Io)?;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let mat = PersistentBitMatrix::open(src_layer_dir).map_err(SKError::Io)?;
|
||||
for slot in 0..n {
|
||||
mat.fill_row(slot, &mut src_buf);
|
||||
for (col, spec) in specs.iter().enumerate() {
|
||||
builders[col].set_val(slot, aggregate(spec.op, &spec.indices, &src_buf, threshold));
|
||||
for spec in specs {
|
||||
let g = ColGroup::new(&spec.label, spec.indices.clone());
|
||||
if output_presence {
|
||||
let b = dst_bit.as_deref_mut().unwrap();
|
||||
match spec.op {
|
||||
AggOp::Any => b.add_col_from (&mat.partial_group_any (&g, 1).map_err(SKError::Io)?),
|
||||
AggOp::All => b.add_col_from (&mat.partial_group_all (&g, 1).map_err(SKError::Io)?),
|
||||
AggOp::None => b.add_col_from (&mat.partial_group_none(&g, 1).map_err(SKError::Io)?),
|
||||
AggOp::Sum => b.add_col_from_int(&mat.partial_group_sum (&g).map_err(SKError::Io)?),
|
||||
AggOp::Min => b.add_col_from_int(&mat.partial_group_min (&g).map_err(SKError::Io)?),
|
||||
AggOp::Max => b.add_col_from_int(&mat.partial_group_max (&g).map_err(SKError::Io)?),
|
||||
}.map_err(SKError::Io)?;
|
||||
} else {
|
||||
let b = dst_int.as_deref_mut().unwrap();
|
||||
match spec.op {
|
||||
AggOp::Sum => b.add_col_from (&mat.partial_group_sum (&g).map_err(SKError::Io)?),
|
||||
AggOp::Min => b.add_col_from (&mat.partial_group_min (&g).map_err(SKError::Io)?),
|
||||
AggOp::Max => b.add_col_from (&mat.partial_group_max (&g).map_err(SKError::Io)?),
|
||||
AggOp::Any => b.add_col_from_bit(&mat.partial_group_any (&g, 1).map_err(SKError::Io)?),
|
||||
AggOp::All => b.add_col_from_bit(&mat.partial_group_all (&g, 1).map_err(SKError::Io)?),
|
||||
AggOp::None => b.add_col_from_bit(&mat.partial_group_none(&g, 1).map_err(SKError::Io)?),
|
||||
}.map_err(SKError::Io)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -168,7 +142,7 @@ impl KmerPartition {
|
||||
src: &KmerPartition,
|
||||
i: usize,
|
||||
specs: &[OutputCol],
|
||||
n_src_genomes: usize,
|
||||
_n_src_genomes: usize,
|
||||
threshold: u32,
|
||||
output_presence: bool,
|
||||
in_place: bool,
|
||||
@@ -188,7 +162,6 @@ impl KmerPartition {
|
||||
fs::create_dir_all(&dst_index_dir)?;
|
||||
}
|
||||
|
||||
let n_out = specs.len();
|
||||
let data_subdir = if output_presence { "presence" } else { "counts" };
|
||||
|
||||
for l in 0..src_meta.n_layers {
|
||||
@@ -201,7 +174,7 @@ impl KmerPartition {
|
||||
let presence_dir = src_layer_dir.join("presence");
|
||||
let src_is_count = counts_dir.exists() && !presence_dir.exists();
|
||||
|
||||
// Determine number of slots from the source matrix.
|
||||
// Determine number of slots and detect implicit layers.
|
||||
let n = if counts_dir.exists() {
|
||||
PersistentCompactIntMatrix::open(&src_layer_dir).map_err(SKError::Io)?.n()
|
||||
} else if presence_dir.exists() {
|
||||
@@ -216,7 +189,7 @@ impl KmerPartition {
|
||||
};
|
||||
|
||||
// Choose the output data directory (temp name for in-place).
|
||||
let (dst_data_dir, final_data_dir) = if in_place {
|
||||
let (dst_data_dir, final_data_dir): (PathBuf, PathBuf) = if in_place {
|
||||
let tmp = dst_layer_dir.join(format!("{data_subdir}_new"));
|
||||
let perm = dst_layer_dir.join(data_subdir);
|
||||
(tmp, perm)
|
||||
@@ -231,37 +204,22 @@ impl KmerPartition {
|
||||
}
|
||||
fs::create_dir_all(&dst_data_dir)?;
|
||||
|
||||
// Initialise packed-format skeleton.
|
||||
if output_presence {
|
||||
PersistentBitMatrixBuilder::new(n, &dst_data_dir)
|
||||
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
|
||||
let (mut dst_bit, mut dst_int) = if output_presence {
|
||||
(Some(PersistentBitMatrixBuilder::new(n, &dst_data_dir).map_err(SKError::Io)?), None)
|
||||
} else {
|
||||
PersistentCompactIntMatrixBuilder::new(n, &dst_data_dir)
|
||||
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
|
||||
}
|
||||
|
||||
// Create column builders.
|
||||
let mut builders: Vec<ColBuilder> = (0..n_out)
|
||||
.map(|col| -> SKResult<ColBuilder> {
|
||||
if output_presence {
|
||||
Ok(ColBuilder::Bit(PersistentBitVecBuilder::new(
|
||||
n, &col_path_bit(&dst_data_dir, col),
|
||||
)?))
|
||||
} else {
|
||||
Ok(ColBuilder::Int(PersistentCompactIntVecBuilder::new(
|
||||
n, &col_path_int(&dst_data_dir, col),
|
||||
)?))
|
||||
}
|
||||
})
|
||||
.collect::<SKResult<_>>()?;
|
||||
(None, Some(PersistentCompactIntMatrixBuilder::new(n, &dst_data_dir).map_err(SKError::Io)?))
|
||||
};
|
||||
|
||||
fill_builders(
|
||||
&mut builders, specs, n, n_src_genomes,
|
||||
&src_layer_dir, src_is_count, threshold,
|
||||
specs, &src_layer_dir, src_is_count, threshold, output_presence,
|
||||
dst_bit.as_mut(), dst_int.as_mut(),
|
||||
)?;
|
||||
|
||||
for b in builders { b.close()?; }
|
||||
write_matrix_meta(&dst_data_dir, n, n_out).map_err(SKError::Io)?;
|
||||
if output_presence {
|
||||
dst_bit.unwrap().close().map_err(SKError::Io)?;
|
||||
} else {
|
||||
dst_int.unwrap().close().map_err(SKError::Io)?;
|
||||
}
|
||||
|
||||
// In-place: swap old data dir for new.
|
||||
if in_place {
|
||||
|
||||
@@ -106,11 +106,7 @@ impl Layer<()> {
|
||||
let presence_dir = layer_dir.join(PRESENCE_DIR);
|
||||
fs::create_dir_all(&presence_dir).map_err(OLMError::Io)?;
|
||||
let mut mb = PersistentBitMatrixBuilder::new(n_kmers, &presence_dir).map_err(OLMError::Io)?;
|
||||
let mut col = mb.add_col().map_err(OLMError::Io)?;
|
||||
for slot in 0..n_kmers {
|
||||
col.set(slot, true);
|
||||
}
|
||||
col.close().map_err(OLMError::Io)?;
|
||||
mb.add_col_ones().map_err(OLMError::Io)?.close().map_err(OLMError::Io)?;
|
||||
mb.close().map_err(OLMError::Io)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
[package]
|
||||
name = "obitaxonomy"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
@@ -0,0 +1,38 @@
|
||||
use std::fmt;
|
||||
|
||||
/// Failure modes reported when parsing taxonomy paths and query patterns.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TaxError {
    /// The stored value lacks the leading `taxonomy:/` marker.
    MissingPrefix,
    /// Nothing follows the prefix in a stored path.
    EmptyPath,
    /// Nothing is left of a query pattern once its anchors are removed.
    EmptyPattern,
    /// A segment's name is empty (for instance two `/` in a row).
    EmptySegmentName,
    /// A segment ends with `@` but gives no rank after it.
    EmptyRankName { segment: String },
    /// A segment holds two or more `@` characters.
    AmbiguousRank { segment: String },
}

impl fmt::Display for TaxError {
    /// Writes the human-readable message for each variant; the variants that
    /// carry a segment append it in `Debug` (quoted) form.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Payload-carrying variants need formatting and return immediately;
        // every other variant resolves to a fixed message written at the end.
        let fixed = match self {
            Self::EmptyRankName { segment } => {
                return write!(f, "segment has '@' with no rank name: {segment:?}");
            }
            Self::AmbiguousRank { segment } => {
                return write!(f, "segment contains more than one '@': {segment:?}");
            }
            Self::MissingPrefix => "taxonomy path must start with \"taxonomy:/\"",
            Self::EmptyPath => "taxonomy path has no segments",
            Self::EmptyPattern => "taxonomy query pattern has no segments",
            Self::EmptySegmentName => "segment has an empty name",
        };
        f.write_str(fixed)
    }
}

impl std::error::Error for TaxError {}
|
||||
@@ -0,0 +1,11 @@
|
||||
mod error;
|
||||
mod segment;
|
||||
mod segment_pattern;
|
||||
mod path;
|
||||
mod pattern;
|
||||
|
||||
pub use error::TaxError;
|
||||
pub use segment::TaxSegment;
|
||||
pub use segment_pattern::SegmentPattern;
|
||||
pub use path::{TaxPath, PREFIX};
|
||||
pub use pattern::TaxPattern;
|
||||
@@ -0,0 +1,82 @@
|
||||
use std::fmt;
|
||||
use std::str::FromStr;
|
||||
|
||||
use crate::error::TaxError;
|
||||
use crate::segment::TaxSegment;
|
||||
|
||||
/// The prefix that marks a metadata value as a taxonomy path.
|
||||
pub const PREFIX: &str = "taxonomy:/";
|
||||
|
||||
/// A rooted, `/`-separated taxonomy path with optional per-segment rank annotations.
|
||||
///
|
||||
/// Stored form: `taxonomy:/seg1@rank1/seg2/seg3@rank3`
|
||||
/// The leading `taxonomy:/` is the discriminator; the remainder is one or more
|
||||
/// `/`-separated segments, each of the form `name` or `name@rank`.
|
||||
///
|
||||
/// `@` is reserved and may not appear in segment names or rank names.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TaxPath {
|
||||
segments: Vec<TaxSegment>,
|
||||
}
|
||||
|
||||
impl TaxPath {
|
||||
pub fn parse(s: &str) -> Result<Self, TaxError> {
|
||||
let tail = s.strip_prefix(PREFIX).ok_or(TaxError::MissingPrefix)?;
|
||||
if tail.is_empty() {
|
||||
return Err(TaxError::EmptyPath);
|
||||
}
|
||||
let segments = tail.split('/')
|
||||
.map(TaxSegment::parse)
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
Ok(Self { segments })
|
||||
}
|
||||
|
||||
/// True if `self` is an ancestor of — or equal to — `other`.
|
||||
///
|
||||
/// Comparison is by segment name only; rank annotations are ignored.
|
||||
/// `self` must be a prefix of `other` at segment granularity.
|
||||
pub fn is_ancestor_of(&self, other: &TaxPath) -> bool {
|
||||
self.segments.len() <= other.segments.len()
|
||||
&& self.segments.iter().zip(other.segments.iter())
|
||||
.all(|(a, b)| a.name() == b.name())
|
||||
}
|
||||
|
||||
/// Returns the name of the first segment whose rank equals `rank`, if any.
|
||||
pub fn name_at_rank(&self, rank: &str) -> Option<&str> {
|
||||
self.segments.iter()
|
||||
.find(|s| s.rank() == Some(rank))
|
||||
.map(|s| s.name())
|
||||
}
|
||||
|
||||
/// True if any segment has the given rank.
|
||||
pub fn has_rank(&self, rank: &str) -> bool {
|
||||
self.segments.iter().any(|s| s.rank() == Some(rank))
|
||||
}
|
||||
|
||||
/// True if the path contains a segment with both the given rank and name.
|
||||
pub fn matches_rank(&self, rank: &str, name: &str) -> bool {
|
||||
self.segments.iter().any(|s| s.rank() == Some(rank) && s.name() == name)
|
||||
}
|
||||
|
||||
pub fn segments(&self) -> &[TaxSegment] { &self.segments }
|
||||
pub fn depth(&self) -> usize { self.segments.len() }
|
||||
pub fn is_empty(&self) -> bool { self.segments.is_empty() }
|
||||
}
|
||||
|
||||
impl fmt::Display for TaxPath {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", PREFIX)?;
|
||||
let mut first = true;
|
||||
for seg in &self.segments {
|
||||
if !first { write!(f, "/")?; }
|
||||
write!(f, "{seg}")?;
|
||||
first = false;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for TaxPath {
|
||||
type Err = TaxError;
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> { Self::parse(s) }
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
use crate::error::TaxError;
|
||||
use crate::path::TaxPath;
|
||||
use crate::segment::TaxSegment;
|
||||
use crate::segment_pattern::SegmentPattern;
|
||||
|
||||
/// A query pattern for matching against stored `TaxPath` values.
|
||||
///
|
||||
/// Syntax:
|
||||
///
|
||||
/// | Form | Semantics |
|
||||
/// |----------|-----------|
|
||||
/// | `A/B` | A then B as a contiguous sub-path, anywhere in the value |
|
||||
/// | `/A/B` | value starts with A then B (start-anchored) |
|
||||
/// | `A/B$` | value ends with A then B (end-anchored) |
|
||||
/// | `/A/B$` | value is exactly A then B (fully anchored) |
|
||||
/// | `A@x/B` | A with rank `x`, followed by B with any rank |
|
||||
///
|
||||
/// A segment pattern without `@` matches any segment with that name regardless
|
||||
/// of its stored rank.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TaxPattern {
|
||||
start_anchored: bool,
|
||||
end_anchored: bool,
|
||||
segments: Vec<SegmentPattern>,
|
||||
}
|
||||
|
||||
impl TaxPattern {
|
||||
pub fn parse(s: &str) -> Result<Self, TaxError> {
|
||||
let s = s.trim();
|
||||
|
||||
let start_anchored = s.starts_with('/');
|
||||
let s = if start_anchored { &s[1..] } else { s };
|
||||
|
||||
let end_anchored = s.ends_with('$');
|
||||
let s = if end_anchored { &s[..s.len() - 1] } else { s };
|
||||
|
||||
if s.is_empty() {
|
||||
return Err(TaxError::EmptyPattern);
|
||||
}
|
||||
|
||||
let segments = s.split('/')
|
||||
.map(SegmentPattern::parse)
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
Ok(Self { start_anchored, end_anchored, segments })
|
||||
}
|
||||
|
||||
/// True if this pattern matches `path` according to the anchor flags.
|
||||
///
|
||||
/// The pattern must match a contiguous run of segments in the path.
|
||||
/// Start/end anchors restrict where that run may begin or end.
|
||||
pub fn matches(&self, path: &TaxPath) -> bool {
|
||||
let n = self.segments.len();
|
||||
let m = path.depth();
|
||||
|
||||
if n > m { return false; }
|
||||
|
||||
let segs = path.segments();
|
||||
match (self.start_anchored, self.end_anchored) {
|
||||
(true, true) => n == m && self.window_matches(segs, 0),
|
||||
(true, false) => self.window_matches(segs, 0),
|
||||
(false, true) => self.window_matches(segs, m - n),
|
||||
(false, false) => (0..=(m - n)).any(|i| self.window_matches(segs, i)),
|
||||
}
|
||||
}
|
||||
|
||||
fn window_matches(&self, segs: &[TaxSegment], start: usize) -> bool {
|
||||
self.segments.iter()
|
||||
.zip(segs[start..start + self.segments.len()].iter())
|
||||
.all(|(pat, seg)| pat.matches(seg))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
use std::fmt;
|
||||
|
||||
use crate::error::TaxError;
|
||||
|
||||
/// A single node in a taxonomy path: a name and an optional rank.
|
||||
///
|
||||
/// Neither `name` nor `rank` may contain `@` (reserved separator).
|
||||
/// Serialised form: `name` or `name@rank`.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TaxSegment {
|
||||
name: String,
|
||||
rank: Option<String>,
|
||||
}
|
||||
|
||||
impl TaxSegment {
|
||||
pub fn parse(raw: &str) -> Result<Self, TaxError> {
|
||||
let parts: Vec<&str> = raw.splitn(3, '@').collect();
|
||||
|
||||
let (name_raw, rank_raw) = match parts.as_slice() {
|
||||
[name] => (*name, None),
|
||||
[name, rank] => (*name, Some(*rank)),
|
||||
_ => return Err(TaxError::AmbiguousRank { segment: raw.to_string() }),
|
||||
};
|
||||
|
||||
if name_raw.is_empty() {
|
||||
return Err(TaxError::EmptySegmentName);
|
||||
}
|
||||
|
||||
let rank = match rank_raw {
|
||||
None => None,
|
||||
Some("") => return Err(TaxError::EmptyRankName { segment: raw.to_string() }),
|
||||
Some(r) => Some(r.to_string()),
|
||||
};
|
||||
|
||||
Ok(Self { name: name_raw.to_string(), rank })
|
||||
}
|
||||
|
||||
pub fn name(&self) -> &str { &self.name }
|
||||
pub fn rank(&self) -> Option<&str> { self.rank.as_deref() }
|
||||
}
|
||||
|
||||
impl fmt::Display for TaxSegment {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match &self.rank {
|
||||
None => write!(f, "{}", self.name),
|
||||
Some(r) => write!(f, "{}@{}", self.name, r),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,41 @@
|
||||
use crate::error::TaxError;
|
||||
use crate::segment::TaxSegment;
|
||||
|
||||
/// A single segment in a query pattern: a required name and an optional rank filter.
|
||||
///
|
||||
/// If `rank` is `None`, the pattern matches any segment with the given name,
|
||||
/// regardless of its stored rank. If `rank` is `Some(r)`, both name and rank
|
||||
/// must match exactly.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct SegmentPattern {
|
||||
name: String,
|
||||
rank: Option<String>,
|
||||
}
|
||||
|
||||
impl SegmentPattern {
|
||||
pub fn parse(raw: &str) -> Result<Self, TaxError> {
|
||||
let parts: Vec<&str> = raw.splitn(3, '@').collect();
|
||||
let (name_raw, rank_raw) = match parts.as_slice() {
|
||||
[name] => (*name, None),
|
||||
[name, rank] => (*name, Some(*rank)),
|
||||
_ => return Err(TaxError::AmbiguousRank { segment: raw.to_string() }),
|
||||
};
|
||||
if name_raw.is_empty() {
|
||||
return Err(TaxError::EmptySegmentName);
|
||||
}
|
||||
let rank = match rank_raw {
|
||||
None => None,
|
||||
Some("") => return Err(TaxError::EmptyRankName { segment: raw.to_string() }),
|
||||
Some(r) => Some(r.to_string()),
|
||||
};
|
||||
Ok(Self { name: name_raw.to_string(), rank })
|
||||
}
|
||||
|
||||
/// True if this pattern matches `seg`.
|
||||
/// Name must match exactly. If a rank is specified in the pattern, the
|
||||
/// segment's rank must match; otherwise any rank (or no rank) is accepted.
|
||||
pub fn matches(&self, seg: &TaxSegment) -> bool {
|
||||
self.name == seg.name()
|
||||
&& self.rank.as_deref().map_or(true, |r| seg.rank() == Some(r))
|
||||
}
|
||||
}
|
||||
@@ -1,28 +0,0 @@
|
||||
>F1FE4776BF3E1F06 {"seq_length":51,"kmer_size":31,"minimizer_size":11,"partition":229,"minimizer":"AAAAAAAATTA"}
|
||||
GAGTATACTCATGTGAGGGTAAAAAAAATTAAGTCCCATATTGAAACATTA
|
||||
>C14BF81526DD6CB7 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":84,"minimizer":"AAAAAAATTAA"}
|
||||
AAAAAAATTAAGTCCCATATTGAAACATTAT
|
||||
>9156D79605E4AC23 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":87,"minimizer":"AAAAAATTAAG"}
|
||||
AAAAAATTAAGTCCCATATTGAAACATTATC
|
||||
>74666D1D78812D1E {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":118,"minimizer":"AAAAATTAAGT"}
|
||||
AAAAATTAAGTCCCATATTGAAACATTATCA
|
||||
>45EEFC3520FBDA9A {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":32,"minimizer":"AAAATTAAGTC"}
|
||||
AAAATTAAGTCCCATATTGAAACATTATCAC
|
||||
>5F44864B90170AF4 {"seq_length":49,"kmer_size":31,"minimizer_size":11,"partition":137,"minimizer":"AAACATTATCA"}
|
||||
AAATTAAGTCCCATATTGAAACATTATCACAAATGTGAGTTGTTAATAT
|
||||
>8D10A11C86F8EF26 {"seq_length":42,"kmer_size":31,"minimizer_size":11,"partition":26,"minimizer":"AAATGTGAGTT"}
|
||||
AACATTATCACAAATGTGAGTTGTTAATATTACATAATTGGG
|
||||
>C18F1086D0AF6E34 {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":9,"minimizer":"TGTGAGTTGTT"}
|
||||
AATGTGAGTTGTTAATATTACATAATTGGGTT
|
||||
>933477394DAF03BB {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":48,"minimizer":"TAATTGGGTTT"}
|
||||
TGTGAGTTGTTAATATTACATAATTGGGTTT
|
||||
>3CEE7E5227956042 {"seq_length":36,"kmer_size":31,"minimizer_size":11,"partition":252,"minimizer":"AATTGGGTTTT"}
|
||||
GTGAGTTGTTAATATTACATAATTGGGTTTTATGCT
|
||||
>1BAF5B8767D63D0B {"seq_length":33,"kmer_size":31,"minimizer_size":11,"partition":201,"minimizer":"AAAGGCTCCCT"}
|
||||
TGAAAGGCTCCCTAGCGTGTTAATTAATCTCCC
|
||||
>8368A897DB263C6F {"seq_length":38,"kmer_size":31,"minimizer_size":11,"partition":22,"minimizer":"CCTAGCGTGTT"}
|
||||
AAGGCTCCCTAGCGTGTTAATTAATCTCCCTGACAAGT
|
||||
>247DC82E11CF8055 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":128,"minimizer":"AATCTCCCTGA"}
|
||||
CTAGCGTGTTAATTAATCTCCCTGACAAGTAGTGT
|
||||
>11C93BBC8A5F6327 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":62,"minimizer":"CAAGTAGTGTT"}
|
||||
GTGTTAATTAATCTCCCTGACAAGTAGTGTTAGTG
|
||||
Reference in New Issue
Block a user