diff --git a/doc/404.html b/doc/404.html index 0b196ed..dbb8998 100644 --- a/doc/404.html +++ b/doc/404.html @@ -638,6 +638,34 @@ +
  • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
  • + + + + + + + + + +
  • @@ -716,6 +744,62 @@ + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + diff --git a/doc/architecture/index_architecture.refs/index.html b/doc/architecture/index_architecture.refs/index.html new file mode 100644 index 0000000..7e1aca7 --- /dev/null +++ b/doc/architecture/index_architecture.refs/index.html @@ -0,0 +1,1068 @@ + + + + + + + + + + + + + + + + + + + + + + Index architecture.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + Skip to content + + +
    +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    + +
    + + + + + + +

    Coverage: architecture/index_architecture.md

    +

    Code couvert

    +
      +
    • obilayeredmap/src/layer.rs — Layer, trait LayerData, modes () / PersistentCompactIntMatrix / PersistentBitMatrix
    • +
    • obilayeredmap/src/mphf_layer.rs — MphfLayer, EvidenceKind (Exact / Approx), LayerEvidence enum
    • +
    • obilayeredmap/src/map.rs — LayeredMap
    • +
    • obilayeredmap/src/meta.rs — LayerMeta, PartitionMeta
    • +
    • obikindex/src/meta.rs — IndexConfig (kmer_size, n_bits, with_counts, evidence, block_bits), IndexMeta
    • +
    • obikindex/src/index.rs — KmerIndex, build_layers
    • +
    • obicompactvec/src/ — PersistentCompactIntMatrix, PersistentBitMatrix (DataStore implementations)
    • +
    +

    Notes

    +

    FORT RISQUE DE DÉRIVE. Nombreux changements récents : +- Ajout de EvidenceKind (Exact / Approx { b, z }) dans IndexConfig et LayerMeta +- Ajout de block_bits dans IndexConfig +- LayerEvidence enum dans mphf_layer.rs remplace l'ancienne approche monolithique +- Distinction open() vs open_sequential() dans UnitigFileReader +- Commandes reindex et estimate ajoutées +Vérifier que la hiérarchie à 3 niveaux décrite est toujours exacte et que les nouveaux paramètres sont documentés.

    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/architecture/index_architecture/index.html b/doc/architecture/index_architecture/index.html index 24d05a5..5565517 100644 --- a/doc/architecture/index_architecture/index.html +++ b/doc/architecture/index_architecture/index.html @@ -645,6 +645,34 @@ +
  • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
  • + + + + + + + + + +
  • @@ -723,6 +751,62 @@ + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + @@ -896,10 +980,43 @@
  • - + - MphfLayer — autonomous mapping layer + IndexConfig and IndexMeta + + + + +
  • + +
  • + + + + EvidenceKind + + + + +
  • + +
  • + + + + MphfLayer — autonomous kmer → slot mapping + + + + +
  • + +
  • + + + + Layer\<D> — MPHF + data payload @@ -918,42 +1035,25 @@
  • - + - Distance matrix API on DataStore types - - - - - -
  • @@ -968,10 +1068,10 @@
  • - + - Traits — obicompactvec::traits + Multi-genome column invariant @@ -979,22 +1079,33 @@
  • - + - LayeredStore<S> — obilayeredmap + Query model -
  • - + - MphfLayer — autonomous mapping layer + IndexConfig and IndexMeta + + + + +
  • + +
  • + + + + EvidenceKind + + + + +
  • + +
  • + + + + MphfLayer — autonomous kmer → slot mapping + + + + +
  • + +
  • + + + + Layer\<D> — MPHF + data payload @@ -1204,42 +1270,25 @@
  • - + - Distance matrix API on DataStore types - - - - - -
  • @@ -1254,10 +1303,10 @@
  • - + - Traits — obicompactvec::traits + Multi-genome column invariant @@ -1265,22 +1314,33 @@
  • - + - LayeredStore<S> — obilayeredmap + Query model - @@ -973,10 +1090,21 @@
    • - + - Memory layout + Types and layout + + + + +
    • + +
    • + + + + Global parameters @@ -1017,10 +1145,32 @@
    • - + - Canonical form + Canonical form and CanonicalKmerOf + + + + +
    • + +
    • + + + + Sliding window helpers + + + + +
    • + +
    • + + + + Hashing @@ -1045,12 +1195,43 @@

      Kmer — implementation

      -

      Memory layout

      -

      Kmer is a #[repr(transparent)] newtype over u64:

      +

      Types and layout

      +

      KmerOf<L> is a #[repr(transparent)] newtype over u64 parameterized by a KmerLength marker:

      #[repr(transparent)]
      -pub struct Kmer(u64);
      +pub struct KmerOf<L: KmerLength>(u64, PhantomData<L>);
       
      -

      Nucleotides are packed 2 bits each, left-aligned, MSB-first. Nucleotide 0 occupies bits 63–62; nucleotide i occupies bits 63−2i and 62−2i. The low 64−2k bits are always zero. k is not stored — it is a parameter of every operation that needs it, and will be owned by the future collection-level indexer.

      +

      Three marker types implement KmerLength:

      + + + + + + + + + + + + + + + + + + + + + + + + + +
      Markerlen() sourceUsed for
      KLenparams::k()k-mers
      MLenparams::m()minimizers
      ConstLen<N>const generic Ntests
      +

      Public aliases:

      +
      pub type Kmer     = KmerOf<KLen>;         // k-mer, global k
      +pub type Minimizer = CanonicalKmerOf<MLen>; // canonical m-mer, global m
      +
      +

      Nucleotides are packed 2 bits each, left-aligned, MSB-first. Nucleotide 0 occupies bits 63–62; nucleotide i occupies bits 63−2i and 62−2i. The low 64−2·len bits are always zero. The length is not stored — every operation reads it from L::len().

      @@ -1071,33 +1252,41 @@
      +

      Global parameters

      +

      params::set_k(k) / params::k() and params::set_m(m) / params::m() are backed by OnceLock<usize> in production (write-once, panic on conflict) and by thread_local! { Cell<usize> } in test builds (per-thread, freely writable). params::init(k, m) sets both in one call.

      Encoding

      -

      Kmer::from_ascii(ascii, k) encodes the first k bytes of an ASCII slice using the shared ENC table (see SuperKmer — ASCII encoding):

      +

      KmerOf::<L>::from_ascii(ascii) encodes the first L::len() bytes using the shared ENC table (see SuperKmer — ASCII encoding):

      for i in 0..k {
           val = (val << 2) | encode_base(ascii[i]) as u64;
       }
      -Kmer(val << (64 - 2 * k))
      +KmerOf(val << (64 - 2 * k), PhantomData)
       

      Zero allocation — result lives on the stack.

      Decoding

      -

      write_ascii(k, buf) appends k ASCII characters to a caller-supplied Vec<u8> using the shared DEC4 table: one lookup per 4 nucleotides, two partial-byte lookups for the remainder. No allocation in the hot path.

      -

      to_ascii(k) is a convenience wrapper that allocates and returns a Vec<u8>; intended for tests and display only.

      +

      write_ascii(writer) writes k ASCII characters to any W: Write using the shared DEC4 table: one lookup per 4 nucleotides, one partial lookup for the remainder. No allocation in the hot path.

      +

      to_ascii() is a convenience wrapper that allocates and returns a Vec<u8>; intended for tests and display only.

      Reverse complement

      Computed as pure arithmetic — no lookup table, no memory access:

      let x = !self.0;                                                               // complement
       let x = x.swap_bytes();                                                        // reverse bytes
       let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4); // swap nibbles
       let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2);   // swap 2-bit groups
      -Kmer(x << (64 - 2 * k))
      +KmerOf(x << (64 - 2 * k), PhantomData)
       

      After complementing, bytes are reversed (swap_bytes), then nibbles, then 2-bit groups — restoring 2-bit nucleotides to their correct positions in reverse order. A final left-shift realigns to MSB. Zero allocation — result lives on the stack.

      -

      Canonical form

      -
      pub fn canonical(&self, k: usize) -> Self {
      -    let rc = self.revcomp(k);
      -    if self.0 <= rc.0 { *self } else { rc }
      +

      Canonical form and CanonicalKmerOf

      +

      canonical() returns a CanonicalKmerOf<L> — a distinct newtype that carries the same u64 layout but enforces the invariant that the stored value equals min(kmer, revcomp):

      +
      pub fn canonical(&self) -> CanonicalKmerOf<L> {
      +    let rc = self.revcomp();
      +    CanonicalKmerOf(if self.0 <= rc.0 { self.0 } else { rc.0 }, PhantomData)
       }
       

      Lexicographic minimum of forward and reverse-complement, comparing the raw u64 values directly (left-aligned encoding makes this equivalent to nucleotide-wise comparison). Zero allocation — result lives on the stack.

      +

      CanonicalKmerOf::from_raw_unchecked(raw) is the only other public constructor, for trusted paths such as deserialisation.

      +

      Sliding window helpers

      +

      push_right(nuc) / push_left(nuc) shift the window by one base in O(1). is_overlapping(other) checks whether the last k−1 nucleotides of self equal the first k−1 of other.

      +

      Hashing

      +

      hash_kmer(raw: u64) -> u64 computes mix64(raw ^ 0x9e3779b97f4a7c15), the seeded splitmix64 finalizer. CanonicalKmerOf::seq_hash() delegates to hash_kmer.

      diff --git a/doc/implementation/merge.refs/index.html b/doc/implementation/merge.refs/index.html new file mode 100644 index 0000000..ff3be53 --- /dev/null +++ b/doc/implementation/merge.refs/index.html @@ -0,0 +1,1065 @@ + + + + + + + + + + + + + + + + + + + + + + Merge.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + +
      + + + + + + +
      + + +
      + +
      + + + + + + +
      +
      + + + +
      +
      +
      + + + + + +
      +
      +
      + + + +
      +
      +
      + + + +
      +
      +
      + + + +
      + +
      + + + + + + +

      Coverage: implementation/merge.md

      +

      Code couvert

      +
        +
      • obikindex/src/merge.rsKmerIndex::merge(), validation de compatibilité d'évidence, validate_evidence_compat()
      • +
      • obikpartitionner/src/merge_layer.rsmerge_partition(), construction de la nouvelle layer, paramètre block_bits
      • +
      • obikpartitionner/src/rebuild_layer.rsrebuild_partition(), paramètre block_bits
      • +
      • obilayeredmap/src/layer.rsLayer::append_genome_column() (PersistentCompactIntMatrix et PersistentBitMatrix)
      • +
      • obicompactvec/src/intmatrix.rsappend_column pour PersistentCompactIntMatrix
      • +
      • obicompactvec/src/bitmatrix.rsappend_column pour PersistentBitMatrix
      • +
      +

      Notes

      +

      FORT RISQUE DE DÉRIVE. Changements récents : +- Ajout de la validation de compatibilité d'évidence : merge exact+approx → erreur (OKIError::IncompatibleEvidence) +- merge_partition reçoit maintenant block_bits: u8 +- La commande reindex a été ajoutée comme outil de conversion exact↔approx avant merge +Vérifier que la doc décrit la politique de merge mixed-evidence et le recours à reindex.

      + + + + + + + + + + + + + +
      +
      + + + +
      + +
      + + + +
      +
      +
      +
      + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/implementation/merge/index.html b/doc/implementation/merge/index.html new file mode 100644 index 0000000..1dc9363 --- /dev/null +++ b/doc/implementation/merge/index.html @@ -0,0 +1,1577 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Merge command - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + +
      + + + + + + +
      + + +
      + +
      + + + + + + +
      +
      + + + +
      +
      +
      + + + + + +
      +
      +
      + + + + + + + +
      + +
      + + + + + +

      Merge command

      +

      Purpose

      +

      obikmer merge combines multiple existing kmer indexes into a single index. The result contains all kmers from all sources, with per-genome presence/absence or count data for every genome across every layer.

      +
      +

      Modes

      +
      pub enum MergeMode { Presence, Count }
      +
      +

      Default mode is Presence. Count mode requires all source indexes to have with_counts=true; mixing count and non-count sources is rejected at validation.

      + + + + + + + + + + + + + + + + + + + + +
      ModeColumn typeConstraint
      PresencePersistentBitMatrix (one bit per genome per slot)none
      CountPersistentCompactIntMatrix (one u32 per genome per slot)all sources with_counts=true
      +
      +

      Input / output constraints

      +

      All source indexes must satisfy:

      +
        +
      • IndexState::Indexed (fully built — index.done sentinel present)
      • +
      • Same kmer_size, minimizer_size, n_partitions
      • +
      • Same evidence kind: all Exact, or all Approx with identical (b, z) parameters
      • +
      • If Count mode: all sources must have with_counts=true
      • +
      +

      --force: if the output directory already exists, it is deleted before the merge begins.

      +
      +

      Evidence compatibility

      +

      validate_evidence_compat(sources) is called before any I/O. It compares each source's EvidenceKind against sources[0]:

      +
        +
      • All Exact → accepted, output uses Exact
      • +
      • All Approx { b, z } with same (b, z) → accepted, output uses those parameters
      • +
      • Any other combination → OKIError::IncompatibleEvidence, with a message directing the user to run reindex first
      • +
      +

      Mixed exact/approx is a hard error, not a silent conversion.

      +
      fn validate_evidence_compat(sources: &[&KmerIndex]) -> OKIResult<EvidenceKind>
      +
      +
      +

      Genome label deduplication

      +

      compute_labels(sources, rename_duplicates) assigns final genome labels across all sources before any file is written. The first occurrence of a label keeps the original name. Subsequent occurrences receive .1, .2, … suffixes when rename_duplicates is true, or trigger OKIError::DuplicateGenomeLabel otherwise.

      +
      +

      Algorithm

      +

      1. Validation

      +

      Check all sources against the constraints above. Abort on any mismatch.

      +

      2. Bootstrap output from first source

      +

      Recursive file copy of sources[0]output. Immediately after the copy:

      +
        +
      • index.meta is rewritten with the final genome list (all sources, possibly renamed) and the effective evidence kind.
      • +
      • In Presence mode, any counts/ directories inherited from source_0 are removed.
      • +
      • spectrums/ from source_0 is removed and rebuilt from scratch across all sources, applying the (possibly renamed) labels.
      • +
      +

      This establishes the partition layout, all existing MPHFs, unitigs, and evidence files. The first source's genomes occupy columns 0 … n_dst_genomes - 1 in the destination.

      +

      3. For each subsequent source (parallel across partitions)

      +

      KmerPartition::merge_partition(i, sources, mode, n_dst_genomes, block_bits) is called for each partition index i. block_bits is taken from dst.meta.config.block_bits.

      +

      Each entry in sources is (&KmerPartition, n_genomes) where n_genomes is the column count that source contributes (> 1 when the source is itself a merged index).

      +

      First merge, Presence mode: when n_dst_genomes == 1, Layer::<()>::init_presence_matrix is called on every existing destination layer before any source column is appended. This creates presence/col_000000.pbiv set all-true (genome 0 is present in every slot).

      +

      Pass 1 — classify kmers

      +

      Iterate all kmers from all source partitions (via UnitigFileReader + canonical kmer iteration). For each kmer, probe the destination LayeredMap<()>:

      +
        +
      • Hit: kmer already in the destination; record for Pass 2.
      • +
      • Miss: push kmer into a GraphDeBruijn accumulator.
      • +
      +

      New layer construction

      +

      If the accumulator is non-empty, compute de Bruijn unitigs and call Layer::<()>::build(&new_layer_dir, block_bits). All kmers absent from the destination — across all sources — accumulate into a single graph, producing one new layer per merge operation (not one per source).

      +

      Pass 2 — fill column builders

      +

      For each source and each of its layers, re-iterate unitigs and look up stored values via SrcLayerData::lookup(kmer, src_n):

      +
        +
      • SrcLayerData::SetMembership — no data directory exists; every kmer returns vec![1; n_genomes]
      • +
      • SrcLayerData::Presence — reads PersistentBitMatrix from presence/
      • +
      • SrcLayerData::Count — reads PersistentCompactIntMatrix from counts/
      • +
      +

      Hits are routed to exist_builders[dst_layer][src_col]; misses are routed to new_src_builders[src_col].

      +

      Column prepending for new layers

      +

      Before source columns are written to the new layer, n_dst_genomes absent columns (all-zero / all-false) are prepended — one per genome already in the index — so the column count invariant holds immediately after layer creation.

      +

      Close and update metadata

      +

      Close all builders; update presence/meta.json or counts/meta.json with {"n": N, "n_cols": n_dst_genomes + n_src_total}; increment PartitionMeta::n_layers if a new layer was added.

      +

      4. Update index metadata

      +

      index.meta was already written during bootstrap with the complete genome list and evidence kind. No further update is needed after the partition loop.

      +
      +

      append_genome_column

      +

      Defined on two concrete specialisations of Layer<D>:

      +
      impl Layer<PersistentCompactIntMatrix> {
      +    pub fn append_genome_column(layer_dir: &Path, value_of: impl Fn(usize) -> u32) -> OLMResult<()>
      +}
      +
      +impl Layer<PersistentBitMatrix> {
      +    pub fn append_genome_column(layer_dir: &Path, value_of: impl Fn(usize) -> bool) -> OLMResult<()>
      +}
      +
      +

      Each appends one column file to the matrix subdirectory (counts/ or presence/). In merge_partition, columns are written directly via PersistentBitVecBuilder / PersistentCompactIntVecBuilder rather than through these helpers, but the invariant they enforce is the same.

      +
      +

      Column count invariant

      +

      After any merge, every layer in every partition has exactly n_genomes columns, where n_genomes is the total genome count in the index at that point.

      +

      Maintained by three mechanisms:

      +
        +
      1. Existing layers: n_src_total columns appended (one per source genome).
      2. +
      3. New layers created during merge: n_dst_genomes absent columns prepended before source columns.
      4. +
      5. First merge, Presence mode: init_presence_matrix retroactively creates presence/col_0 all-true for genome 0.
      6. +
      +

      The invariant is a precondition of LayeredStore aggregation traits: col_weights() and all partial distance methods assume every inner store has the same column count.

      +
      +

      Error variants relevant to merge

      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      VariantCondition
      OKIError::NotIndexed(path)Source not in Indexed state
      OKIError::IncompatibleConfigMismatched kmer_size, minimizer_size, or n_partitions
      OKIError::MismatchedModeCount mode but a source has with_counts=false
      OKIError::IncompatibleEvidence(msg)Mixed exact/approx or different approx (b, z)
      OKIError::DuplicateGenomeLabel(label)Duplicate label and rename_duplicates=false
      +
      +

      On-disk impact

      +

      After merging G genomes (sources_0 contributes G0, subsequent sources the rest):

      +
      partitions/
      +  part_00000/
      +    index/
      +      meta.json              ← n_layers updated if new layer added
      +      layer_0/
      +        mphf.bin             ← unchanged
      +        unitigs.bin          ← unchanged
      +        evidence.bin         ← unchanged
      +        presence/            ← created on first merge (Presence mode)
      +          meta.json            {"n": N, "n_cols": G}
      +          col_000000.pbiv      ← all-true (genome 0 … G0-1)
      +          col_000001.pbiv      ← next source
      +          ...
      +        counts/              ← extended (Count mode)
      +          meta.json            {"n": N, "n_cols": G}
      +          col_000000.pciv      ← genome 0 counts (from original build)
      +          col_000001.pciv      ← next source
      +          ...
      +      layer_N/               ← new layer (if new kmers found)
      +        mphf.bin
      +        unitigs.bin
      +        evidence.bin
      +        presence/ or counts/
      +          meta.json            {"n": N1, "n_cols": G}
      +          col_000000.pbiv      ← all-false (absent for existing genomes)
      +          ...
      +spectrums/
      +  <label>.json               ← one file per genome, rebuilt from all sources
      +index.meta                   ← complete genome list + evidence kind written at bootstrap
      +
      + + + + + + + + + + + + + +
      +
      + + + +
      + +
      + + + +
      +
      +
      +
      + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/implementation/mphf.refs/index.html b/doc/implementation/mphf.refs/index.html new file mode 100644 index 0000000..1cbb890 --- /dev/null +++ b/doc/implementation/mphf.refs/index.html @@ -0,0 +1,1062 @@ + + + + + + + + + + + + + + + + + + + + + + Mphf.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + +
      + + + + + + +
      + + +
      + +
      + + + + + + +
      +
      + + + +
      +
      +
      + + + + + +
      +
      +
      + + + +
      +
      +
      + + + +
      +
      +
      + + + +
      + +
      + + + + + + +

      Coverage: implementation/mphf.md

      +

      Code couvert

      +
        +
      • obilayeredmap/src/mphf_layer.rs — type Mphf (PtrHash + CubicEps + CachelineEfVec + Xx64), construction en 2 passes, build(), build_exact_evidence(), build_approx_evidence(), build_evidence()
      • +
      • obikpartitionner/src/index_layer.rsbuild_index_layer() avec passage de block_bits
      • +
      +

      Notes

      +

      FORT RISQUE DE DÉRIVE. Changements récents : +- build_exact_evidence(dir, block_bits)block_bits maintenant paramétrisé (défaut 0) +- build_approx_evidence(dir, b, z) — nouvelle fonction pour l'évidence fingerprint +- build_evidence(dir, kind, block_bits) — dispatch selon EvidenceKind +- Construction en 2 phases : pass 1 (Rayon parallèle) + pass 2 (callback fill_slot) +Vérifier que la doc décrit correctement les deux nouvelles routes d'évidence et le paramètre block_bits.

      + + + + + + + + + + + + + +
      +
      + + + +
      + +
      + + + +
      +
      +
      +
      + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/implementation/mphf/index.html b/doc/implementation/mphf/index.html index 56c46b0..5d4b4dd 100644 --- a/doc/implementation/mphf/index.html +++ b/doc/implementation/mphf/index.html @@ -757,6 +757,28 @@ +
    • + +
    • + + + + Evidence modes + + + + +
    • + +
    • + + + + Build functions + + + +
    • @@ -840,6 +862,34 @@ +
    • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
    • + + + + + + + + + +
    • @@ -918,6 +968,62 @@ + + + + + + +
    • + + + + + + + + Merge command + + + + + + + + +
    • + + + + + + + + + + +
    • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
    • + + + +
    @@ -1165,6 +1271,28 @@ +
  • + +
  • + + + + Evidence modes + + + + +
  • + +
  • + + + + Build functions + + + +
  • @@ -1226,26 +1354,26 @@

    Why two phases are needed

    Kmer indexing per partition proceeds in two phases. The separation is necessary because the exact number of surviving unique kmers is not known until after counting and filtering low-abundance kmers.

    Phase 1 — provisional MPHF + kmer spectrum

    -

    Implemented in obikpartitionner::KmerPartition::count_kmer().

    +

    Implemented in obikpartitionner::KmerPartition::count_kmer()count_partition().

      -
    1. Pass 1: read the dereplicated superkmer file; enumerate all unique canonical kmers into a HashSet. Exact count known after this pass.
    2. -
    3. Build a provisional MPHF (GOFunction from the ph crate) over the exact kmer set. Produces mphf1.bin.
    4. -
    5. Create counts1.bin: one zero-initialised u32 per MPHF slot (mmap'd).
    6. -
    7. Pass 2: re-read the dereplicated file; for each kmer, query mphf1.get(kmer) and atomically accumulate the superkmer count into counts1[slot].
    8. +
    9. External sort: read the dereplicated superkmer file; extract the raw u64 canonical kmer value for every kmer of every superkmer. Sort in RAM-bounded chunks (adaptive budget: 40% of available RAM ÷ n_threads, minimum 1 M kmers per chunk), then k-way merge with inline dedup. Result: sorted_unique.bin — a flat array of f0 distinct sorted u64 values. Exact kmer count f0 is known at this point.
    10. +
    11. Build provisional MPHF (ptr_hash, same configuration as phase 2) over sorted_unique.bin using new_from_par_iter. Delete sorted_unique.bin immediately after. Persist to mphf1.bin.
    12. +
    13. Create counts1.bin: PersistentCompactIntVec with f0 slots, zero-initialised.
    14. +
    15. Accumulation pass: re-read the dereplicated superkmer file; for each kmer in each superkmer, compute slot = mphf.index(kmer.raw()) and increment counts1[slot] by the superkmer's COUNT.
    16. Build kmer frequency spectrum from counts1: histogram {count → n_kmers}, totals f0 (distinct kmers) and f1 (total abundance). Written to kmer_spectrum_raw.json per partition, then merged globally.

    Files produced per partition:

    part_XXXXX/
    -  mphf1.bin               — GOFunction (provisional MPHF, discarded after phase 2)
    -  counts1.bin             — [u32; n_kmers] kmer counts, mmap'd
    +  mphf1.bin               — ptr_hash provisional MPHF (discarded after phase 2)
    +  counts1.bin             — PersistentCompactIntVec, f0 × u32 kmer counts
       kmer_spectrum_raw.json  — local frequency spectrum
     

    Phase 2 — definitive MPHF

    After filtering (applying a min-count threshold derived from the spectrum) and building the local De Bruijn graph + unitigs (see Construction pipeline), the exact filtered kmer set is available via unitigs.bin.

    -

    MphfLayer::build is called on the unitig file:

    +

    MphfLayer::build(dir, block_bits, mode: &IndexMode, fill_slot) is called on the unitig directory:

      -
    1. Pass 1: iterate all canonical kmers from unitigs.bin in parallel, build and store mphf.bin (ptr_hash).
    2. -
    3. Pass 2: iterate sequentially, fill evidence.bin, call the mode-specific fill_slot callback.
    4. +
    5. Pass 1 (parallel): a CanonicalKmerIter — clonable via Arc<Mmap>, no file reopening — is passed directly to new_from_par_iter via par_bridge(). No .idx is read or created at this stage; parallelism is at partition/layer level, not within a single MPHF. Produces mphf.bin.
    6. +
    7. Pass 2 (sequential): iterate with iter_indexed_canonical_kmers; fill evidence files; call fill_slot(slot, kmer) callback per kmer. For Exact/Hybrid, .idx is written at the end of this pass — never earlier.

    mphf1.bin and counts1.bin are no longer needed after phase 2 and can be deleted.


    @@ -1265,13 +1393,11 @@

    FMPH/FMPHGO (ph crate, Beling, ACM JEA 2023):

    • ~2.1 bits/key — most compact; good query speed; deterministic construction
    • -
    • Works well from an exact or slightly overestimated count
    • -
    • GOFunction (group-oriented variant) is the specific type used
    • +
    • GOFunction (group-oriented variant) was the original phase-1 choice; eliminated when the external sort made the exact count available at phase 1 as well

    MPHF choice per phase

    -

    Phase 1 (provisional, discarded after spectrum computation): ph::fmph::GOFunction. Compact, fast to build from the exact post-dedup kmer set. Query speed is secondary — the structure is only used during pass 2 of count_kmer.

    -

    Phase 2 (persistent, queried repeatedly): ptr_hash. Exact key count is available from the unitig index; ptr_hash query speed (≥2.1×) and construction speed (≥3.1× over FMPH) are the decisive factors. The 2.4 bits/key overhead is acceptable.

    -

    boomphf is eliminated: largest space overhead, streaming advantage does not apply.

    +

    Both phases: ptr_hash, same type alias and construction parameters. The external sort (phase 1) and the unitig index (phase 2) both provide the exact key count before MPHF construction, so ptr_hash's requirement is satisfied in both cases. Using a single MPHF implementation removes the ph crate dependency.

    +

    boomphf: eliminated — largest space overhead, streaming advantage no longer needed. FMPH/GOFunction: eliminated — exact count available, ptr_hash is faster at equivalent compactness.


    Space at scale

    For 1 024 partitions × 100 M kmers/partition (phase 2 index, after filtering):

    @@ -1320,9 +1446,12 @@

    Layer structure

    Each layer is a self-contained unit. See obilayeredmap for the full on-disk layout. The MPHF-relevant files are:

    layer_i/
    -  unitigs.bin      — packed 2-bit nucleotide sequences (kmer evidence)
    +  unitigs.bin      — packed 2-bit nucleotide sequences (kmer evidence source)
    +  unitigs.bin.idx  — random-access block index (block_bits controls granularity)
       mphf.bin         — ptr_hash phase-2 MPHF
    -  evidence.bin     — n × u32: (chunk_id: 25 bits | rank: 7 bits) per slot
    +  evidence.bin     — n × (chunk_id: 25 bits | rank: 7 bits) per slot  [exact mode]
    +  fingerprint.bin  — n × b-bit fingerprints per slot                  [approx mode]
    +  [no layer_meta.json — mode stored once in partition-level meta.json]
     

    Layers are disjoint: a canonical kmer belongs to exactly one layer. Layer 0 is built from dataset A. Adding dataset B:

      @@ -1330,17 +1459,43 @@
    1. Collect kmers of B not present in any layer → set B \ A.
    2. Build layer 1 from B \ A (dereplicate → count → De Bruijn → unitigs → MphfLayer::build).
    +

    Evidence modes

    +

    Three evidence modes are supported via IndexMode, stored once in PartitionMeta at partition root. There is no layer_meta.json.

    +

    Exact (IndexMode::Exact): evidence.bin stores one (chunk_id, rank) pair per MPHF slot. Verification reconstructs the kmer and compares to the query. Zero false positives. .idx required at query time.

    +

    Approx (IndexMode::Approx { b, z }): fingerprint.bin stores a b-bit hash per slot. False-positive rate 1/2^b per query; Findere z-parameter reduces window FP to ≈ 1/2^(b·z). No .idx written or needed.

    +

    Hybrid (IndexMode::Hybrid { b, z }): both fingerprint.bin and evidence.bin + .idx. find() uses the fingerprint (O(1)); find_strict() uses exact evidence (O(1)).

    +

    Build functions

    +
    MphfLayer::build(dir, block_bits, mode: &IndexMode, fill_slot)
    +    Pass 1: CanonicalKmerIter + par_bridge() → build mphf.bin  (no .idx used)
    +    Pass 2: sequential iter → fill evidence files + call fill_slot
    +            .idx written last for Exact/Hybrid (query-time only)
    +
    +MphfLayer::build_exact_evidence(dir, block_bits)
    +    Post-hoc: builds evidence.bin + .idx from existing mphf.bin + unitigs.bin
    +    Uses open_sequential(); no .idx required on entry
    +
    +MphfLayer::build_approx_evidence(dir, b, z)
    +    Post-hoc: builds fingerprint.bin from existing mphf.bin + unitigs.bin
    +    Uses open_sequential(); never writes .idx
    +
    +

    There is no build_evidence dispatch wrapper. Callers choose the appropriate post-hoc build directly.

    +

    In obikpartitionner, build_index_layer receives block_bits: u8 from IndexConfig::block_bits and forwards it directly to Layer::build and Layer::build_approx_evidence.

    Membership verification

    -

    ptr_hash maps any input to a valid slot — it does not natively detect absent keys. Membership is verified using the evidence entry: decode the kmer from (chunk_id, rank) and compare to the query. A mismatch means the kmer is absent from this layer; probe the next layer.

    +

    ptr_hash maps any input to a valid slot — it does not natively detect absent keys. Membership is verified using the evidence entry:

    +
      +
    • Exact: decode (chunk_id, rank) from evidence.bin; reconstruct the kmer via unitigs.verify_canonical_kmer; compare to query.
    • +
    • Approx: compare kmer.seq_hash() to the b-bit fingerprint stored at the slot.
    • +
    +

    A mismatch in either mode means the kmer is absent from this layer; probe the next layer.

    Query algorithm

    fn query(kmer) → Option<(layer_index, slot)>:
         for (i, layer) in layers.iter().enumerate():
             slot = layer.mphf.index(kmer)
    -        if layer.evidence.decode(slot) matches kmer:
    +        if layer.evidence.matches(slot, kmer):   // exact or approx dispatch
                 return Some((i, slot))
         return None
     
    -

    Expected probe depth: 1 for kmers in layer 0. Each probe is a ptr_hash lookup (~10 ns) plus one evidence decode.

    +

    MphfLayer::find dispatches on LayerEvidence at O(1) — no panicking find_exact/find_approx methods. find_strict always performs an exact check: O(1) for Exact/Hybrid, O(n) sequential scan for Approx. Expected probe depth: 1 for kmers in layer 0. Each probe is a ptr_hash lookup (~10 ns) plus one evidence check.

    Merging layers

    Two layer chains can be merged by re-indexing their union through the full pipeline. This is expensive (full rebuild) but produces an optimal single-layer index. Merge is a maintenance operation, not a query-path requirement.

    diff --git a/doc/implementation/obilayeredmap.refs/index.html b/doc/implementation/obilayeredmap.refs/index.html new file mode 100644 index 0000000..2a11dce --- /dev/null +++ b/doc/implementation/obilayeredmap.refs/index.html @@ -0,0 +1,1068 @@ + + + + + + + + + + + + + + + + + + + + + + Obilayeredmap.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + Skip to content + + +
    +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    + +
    + + + + + + +

    Coverage: implementation/obilayeredmap.md

    +

    Code couvert

    +
      +
    • obilayeredmap/src/mphf_layer.rs — MphfLayer, LayerEvidence enum (Exact/Approx), find(), find_exact(), find_approx()
    • +
    • obilayeredmap/src/layer.rs — Layer, trait LayerData, modes () / PersistentCompactIntMatrix / PersistentBitMatrix, build(), build_evidence(), append_genome_column()
    • +
    • obilayeredmap/src/map.rs — LayeredMap, push_layer(), query()
    • +
    • obilayeredmap/src/evidence.rs — Evidence, EvidenceWriter, encodage chunk_id:rank
    • +
    • obilayeredmap/src/fingerprint.rs — FingerprintVec, FingerprintVecWriter, matches()
    • +
    • obilayeredmap/src/meta.rs — LayerMeta, EvidenceKind (Exact / Approx { b, z })
    • +
    +

    Notes

    +

    FORT RISQUE DE DÉRIVE. C'est le fichier le plus affecté par les changements récents : +- EvidenceKind (Exact / Approx) est désormais un concept de premier plan — toute la sémantique de query en dépend +- LayerEvidence enum interne à MphfLayer : dispatch transparent find() → find_exact() ou find_approx() +- fingerprint.rs : module entièrement nouveau (FingerprintVec + FingerprintVecWriter) +- build_evidence() / build_exact_evidence() / build_approx_evidence() sont nouveaux +- block_bits dans les fonctions build : O(1) garanti avec le chemin chaud explicit pour block_bits=0 +- Séparation open() (accès aléatoire, requiert .idx) vs open_sequential() (itération seule) +Pratiquement toute cette page est à réécrire.

    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/implementation/obilayeredmap/index.html b/doc/implementation/obilayeredmap/index.html index 95fe20e..06fe65e 100644 --- a/doc/implementation/obilayeredmap/index.html +++ b/doc/implementation/obilayeredmap/index.html @@ -9,7 +9,7 @@ - + @@ -647,6 +647,34 @@ + + +
  • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
  • + + + + + + + + @@ -729,6 +757,17 @@ + + +
  • + + + + Index mode (homogeneity invariant) + + + +
  • @@ -740,6 +779,34 @@ + +
  • @@ -751,6 +818,73 @@ + + +
  • + +
  • + + + + FingerprintVec and FingerprintVecWriter + + + + +
  • + +
  • + + + + LayeredMap\<D> — collection of layers + + + + + +
  • @@ -776,10 +910,10 @@
  • - + - Evidence encoding + Evidence encoding (exact) @@ -798,14 +932,53 @@
  • - + - Query path + Column append and merge support + +
  • @@ -895,6 +1068,62 @@ + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + @@ -1058,6 +1287,17 @@ + + +
  • + + + + Index mode (homogeneity invariant) + + + +
  • @@ -1069,6 +1309,34 @@ + +
  • @@ -1080,6 +1348,73 @@ + + +
  • + +
  • + + + + FingerprintVec and FingerprintVecWriter + + + + +
  • + +
  • + + + + LayeredMap\<D> — collection of layers + + + + + +
  • @@ -1105,10 +1440,10 @@
  • - + - Evidence encoding + Evidence encoding (exact) @@ -1127,14 +1462,53 @@
  • - + - Query path + Column append and merge support + +
  • @@ -1178,7 +1552,7 @@

    obilayeredmap — layered kmer index crate

    Purpose

    -

    obilayeredmap implements a persistent, incrementally extensible kmer index. The index is organised in three levels: index root → partition → layer. Each layer covers a disjoint kmer set and wraps a ptr_hash MPHF with associated per-slot data. Adding a new dataset never rebuilds existing layers.

    +

    obilayeredmap implements a persistent, incrementally extensible kmer index. Each layer covers a disjoint kmer set and wraps a ptr_hash MPHF with associated per-slot data. Adding a new dataset never rebuilds existing layers.


    Three usage modes

    The MPHF + evidence infrastructure is the same for all modes. The payload varies.

    @@ -1214,34 +1588,65 @@

    Both PersistentCompactIntMatrix and PersistentBitMatrix come from the obicompactvec crate.


    +

    Index mode (homogeneity invariant)

    +

    A partitioned index is homogeneous: every layer within a partition shares the same mode. The mode is determined once at LayeredMap::open() from PartitionMeta.mode and passed to each Layer::open() — no per-layer file is read.

    +
    #[derive(Serialize, Deserialize, Default)]
    +#[serde(tag = "type", rename_all = "snake_case")]
    +pub enum IndexMode {
    +    #[default]
    +    Exact,
    +    Approx { b: u8, z: u8 },
    +    Hybrid { b: u8, z: u8 },
    +}
    +
    +

    IndexMode is stored once in PartitionMeta (meta.json at partition root). There is no layer_meta.json.

    +
      +
    • Exact: writes evidence.bin + unitigs.bin.idx. Zero false positives.
    • +
    • Approx: writes fingerprint.bin only. FP rate per kmer = 1/2^b; with Findere z-parameter, z consecutive kmers must all match → effective window FP ≈ 1/2^(b·z). No .idx written or required.
    • +
    • Hybrid: writes both fingerprint.bin and evidence.bin + .idx. find() uses the fingerprint (fast, O(1)); find_strict() uses exact evidence.
    • +
    +

    MphfLayer — autonomous kmer → slot mapping

    -

    MphfLayer encapsulates the MPHF + evidence + unitig spine for one layer. It is independent of any payload data.

    +

    MphfLayer encapsulates the MPHF and evidence store for one layer. It is independent of any payload.

    pub struct MphfLayer {
    -    mphf:     Mphf,
    -    evidence: Evidence,
    -    unitigs:  UnitigFileReader,
    -    n:        usize,   // number of indexed kmers = number of MPHF slots
    +    mphf: Mphf,
    +    ev:   LayerEvidence,   // loaded at open() time
    +    n:    usize,
     }
     
    -

    Public API:

    -
    impl MphfLayer {
    -    pub fn open(dir: &Path) -> OLMResult<Self>
    -    pub fn find(&self, kmer: CanonicalKmer) -> Option<usize>   // Some(slot) or None
    -    pub fn n(&self) -> usize
    -    pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter>
    -    pub(crate) fn build(
    -        dir: &Path,
    -        fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>,
    -    ) -> OLMResult<usize>
    +

    LayerEvidence is an internal enum, not public:

    +
    enum LayerEvidence {
    +    Exact  { evidence: Evidence, unitigs: UnitigFileReader },
    +    Approx { fingerprint: FingerprintVec, unitigs_path: PathBuf },
    +    Hybrid { evidence: Evidence, unitigs: UnitigFileReader, fingerprint: FingerprintVec },
     }
     
    -

    find returns Some(slot) only after verifying via evidence that the kmer is actually indexed. It returns None for absent keys (ptr_hash maps any input to a valid slot; evidence verification is the only correct-membership test).

    -

    build runs two sequential passes over unitigs.bin:

    +

    MphfLayer::open(dir, mode: &IndexMode) receives the mode from PartitionMeta — no per-layer file is read.

    +

    Query API

    +

    Two public query methods, both returning Option<usize> (slot index):

    +
    pub fn find(&self, kmer: CanonicalKmer) -> Option<usize>
    +pub fn find_strict(&self, kmer: CanonicalKmer) -> Option<usize>
    +
    +
      +
    • find: O(1) auto-dispatch. Exact/Hybrid → exact evidence check. Approx/Hybrid → fingerprint comparison.
    • +
    • find_strict: always exact. Exact/Hybrid → O(1) evidence check. Approx → O(n) sequential scan (no .idx).
    • +
    +

    There are no find_exact/find_approx methods; panicking dispatch is eliminated.

    +

    Build surface

    +
    // Full MPHF + evidence build (two-pass)
    +pub(crate) fn build(dir, block_bits, mode: &IndexMode, fill_slot) -> OLMResult<usize>
    +
    +// Evidence-only post-hoc builds (MPHF already present)
    +pub fn build_exact_evidence(dir, block_bits) -> OLMResult<usize>
    +pub fn build_approx_evidence(dir, b, z)      -> OLMResult<usize>
    +
    +

    MphfLayer::build runs two passes over unitigs.bin:

      -
    1. Pass 1: iterate all canonical kmers in parallel via rayon, construct and store mphf.bin. new_from_par_iter avoids materialising a full key Vec.
    2. -
    3. Pass 2: iterate again sequentially, fill evidence.bin, call fill_slot(slot, kmer) once per kmer for payload population. A compact n/8-byte seen-bitset verifies MPHF injectivity inline.
    4. +
    5. Pass 1 (parallel via rayon): a CanonicalKmerIter (clonable, Arc<Mmap>, no file reopening) is passed to new_from_par_iter via par_bridge(). Produces mphf.bin. No .idx is read or created at this stage.
    6. +
    7. Pass 2 (sequential): fill evidence files; call fill_slot(slot, kmer) per kmer. .idx is written last for Exact/Hybrid modes (query-time only).
    -

    For empty layers (n = 0), build returns Ok(0) immediately after creating empty mphf.bin and evidence.bin.

    +

    There is no build_evidence dispatch wrapper — callers invoke build_exact_evidence or build_approx_evidence directly.

    +

    For empty layers (n = 0), all build variants return Ok(0) immediately after creating empty output files.


    Layer\<D: LayerData> — MPHF + payload

    Layer<D> pairs an MphfLayer with one payload store.

    @@ -1261,7 +1666,7 @@ pub data: T, }
    -

    LayerData covers the read path only (open + read). Build signatures differ between modes and are not in the trait.

    +

    LayerData covers the read path only (open + read). Build signatures differ between modes and are not part of the trait.

    @@ -1288,28 +1693,89 @@
    -

    Build signatures:

    +

    Build signatures

    // mode 1
     impl Layer<()> {
    -    pub fn build(out_dir: &Path) -> OLMResult<usize>
    +    pub fn build(out_dir: &Path, block_bits: u8, mode: &IndexMode) -> OLMResult<usize>
     }
     
     // mode 2
     impl Layer<PersistentCompactIntMatrix> {
    -    pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize>
    -    pub fn build_from_map(out_dir: &Path, counts: &HashMap<CanonicalKmer, u32>) -> OLMResult<usize>
    +    pub fn build(out_dir: &Path, block_bits: u8, mode: &IndexMode,
    +                 count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize>
    +    pub fn build_from_map(out_dir: &Path, block_bits: u8, mode: &IndexMode,
    +                          counts: &HashMap<CanonicalKmer, u32>) -> OLMResult<usize>
     }
     
     // mode 3
     impl Layer<PersistentBitMatrix> {
    -    pub fn build_presence(
    -        out_dir: &Path,
    -        n_genomes: usize,
    -        present_in: impl Fn(CanonicalKmer, usize) -> bool,
    -    ) -> OLMResult<usize>
    +    pub fn build_presence(out_dir: &Path, block_bits: u8, mode: &IndexMode,
    +                          n_genomes: usize,
    +                          present_in: impl Fn(CanonicalKmer, usize) -> bool) -> OLMResult<usize>
     }
     
    -

    All build impls delegate MPHF + evidence construction to MphfLayer::build via a mode-specific fill_slot callback. Mode 2 pre-reads n_kmers from unitigs.bin to size the PersistentCompactIntMatrixBuilder before calling MphfLayer::build. Mode 3 does the same for PersistentBitMatrixBuilder.

    +

    All build impls delegate to MphfLayer::build via a mode-specific fill_slot callback. The mode parameter is forwarded directly — no LayerMeta is written.

    +

    Evidence-only post-hoc builds are accessible directly on Layer<D>:

    +
    impl<D: LayerData> Layer<D> {
    +    pub fn build_exact_evidence(layer_dir: &Path, block_bits: u8) -> OLMResult<usize>
    +    pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8)  -> OLMResult<usize>
    +}
    +
    +

    There is no build_evidence dispatch wrapper.

    +
    +

    FingerprintVec and FingerprintVecWriter

    +

    Approximate evidence is stored as a packed b-bit array, one fingerprint per MPHF slot.

    +
    fingerprint.bin format:
    +  magic:   b"FPVF"  (4 bytes)
    +  b:       u8       (bits per fingerprint, 1..=64)
    +  padding: [0u8; 3]
    +  n:       u64 LE   (number of slots)
    +  data:    packed bits, ceil(n*b/8) bytes, Lsb0 order
    +
    +
    impl FingerprintVec {
    +    pub fn open(path: &Path) -> OLMResult<Self>
    +    pub fn get(&self, slot: usize) -> u64
    +    pub fn matches(&self, slot: usize, fingerprint: u64) -> bool
    +    pub fn n(&self) -> usize
    +    pub fn b(&self) -> u8
    +}
    +
    +

    matches(slot, hash) extracts the b-bit fingerprint stored at slot and compares it to the low b bits of hash. It is the core operation of find_approx.

    +
    +

    LayeredMap\<D> — collection of layers

    +

    LayeredMap<D> wraps Vec<Layer<D>> for a single partition directory.

    +
    pub struct LayeredMap<D: LayerData = ()> {
    +    root:   PathBuf,
    +    meta:   PartitionMeta,
    +    layers: Vec<Layer<D>>,
    +}
    +
    +

    PartitionMeta (meta.json at the partition root) stores n_layers.

    +

    Common methods

    +
    pub fn open(root: &Path)              -> OLMResult<Self>
    +pub fn create(root: &Path, mode: IndexMode) -> OLMResult<Self>
    +pub fn n_layers(&self)                -> usize
    +pub fn layer(&self, i: usize)         -> &Layer<D>
    +pub fn mode(&self)                    -> &IndexMode
    +pub fn query(&self, kmer: CanonicalKmer) -> Option<(usize, Hit<D::Item>)>
    +pub fn next_layer_writer(&self)       -> OLMResult<UnitigFileWriter>
    +
    +

    open reads PartitionMeta once, extracts mode, and passes it to every Layer::open — no per-layer file is read. create stores the given mode in PartitionMeta.

    +

    query probes layers in order and returns (layer_index, Hit) on the first match. Expected probe depth: 1 for kmers in layer 0.

    +

    push_layer

    +

    push_layer builds the next layer from a unitigs.bin already written via next_layer_writer, using DEFAULT_BLOCK_BITS:

    +
    // mode 1
    +impl LayeredMap<()> {
    +    pub fn push_layer(&mut self) -> OLMResult<usize>
    +}
    +
    +// mode 2
    +impl LayeredMap<PersistentCompactIntMatrix> {
    +    pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize>
    +    pub fn push_layer_from_map(&mut self, counts: &HashMap<CanonicalKmer, u32>) -> OLMResult<usize>
    +}
    +
    +

    Mode 3 (PersistentBitMatrix) has no push_layer on LayeredMap; callers build directly via Layer<PersistentBitMatrix>::build_presence.


    LayeredStore\<S> and aggregation traits

    LayeredStore<S> is a generic aggregation wrapper over Vec<S>. It propagates three traits from obicompactvec::traits up the hierarchy via blanket impls:

    @@ -1320,11 +1786,6 @@ impl<S: BitPartials> BitPartials for LayeredStore<S> { } // element-wise Σ partials

    Because blanket impls compose, LayeredStore<LayeredStore<S>> automatically inherits all three traits when S does — providing the partitioned level without a separate type.

    -

    Aggregation hierarchy:

    -
    PersistentCompactIntMatrix                  implements CountPartials
    -LayeredStore<PersistentCompactIntMatrix>         via blanket impl  (one partition)
    -LayeredStore<LayeredStore<…>>                    via blanket impl  (partitioned index)
    -

    Leaf implementors (in obicompactvec):

    @@ -1344,69 +1805,77 @@ LayeredStore<LayeredStore<…>> via blanket impl
    -

    PersistentCompactIntVec and PersistentBitVec do not implement these traits — they are single-column primitives, not matrix-level aggregators.

    See Kmer index architecture for the full trait API and the two-pass normalised-metric pattern.


    On-disk structure

    -
    index_root/                        ← LayeredMap (collection)
    -  meta.json
    -  part_00000/                      ← Partition
    -    layer_0/                       ← Layer
    -      mphf.bin           — ptr_hash MPHF (epserde format)
    -      unitigs.bin        — packed 2-bit nucleotide sequences
    -      unitigs.bin.idx    — UIDX index: n_unitigs, n_kmers, seqls[], packed_offsets[]
    -      evidence.bin       — n × u32, each = (chunk_id: 25 bits | rank: 7 bits), LE
    -      counts/            [mode 2] PersistentCompactIntMatrix
    -        meta.json          {"n": N, "n_cols": 1}
    -        col_000000.pciv
    -      presence/          [mode 3] PersistentBitMatrix
    -        meta.json          {"n": N, "n_cols": G}
    -        col_000000.pbiv
    -        …
    -    layer_1/
    -      …
    -  part_00001/
    +
    partition_root/                    ← LayeredMap (one partition)
    +  meta.json                        — {"n_layers": N, "mode": {"type": "exact"|"approx"|"hybrid", ...}}
    +  layer_0/                         ← Layer
    +    mphf.bin                       — ptr_hash MPHF (epserde format)
    +    unitigs.bin                    — packed 2-bit nucleotide sequences
    +    unitigs.bin.idx                — UIDX index (Exact/Hybrid only; query-time, never built during MPHF construction)
    +    evidence.bin                   — [u32; n], LE  (Exact/Hybrid only)
    +    fingerprint.bin                — packed b-bit array  (Approx/Hybrid only)
    +    counts/                        [mode 2] PersistentCompactIntMatrix
    +      meta.json
    +      col_000000.pciv
    +    presence/                      [mode 3] PersistentBitMatrix
    +      meta.json
    +      col_000000.pbiv  …
    +  layer_1/
         …
     
    -

    Partition (part_XXXXX/): all kmers whose canonical minimiser hashes to this bucket. Partitions are independent and can be processed in parallel.

    -

    Layer (layer_N/): one MphfLayer plus optional payload. Layer 0 covers dataset A; layer 1 covers kmers in B absent from A; etc. Layers within a partition are always disjoint.

    +

    There is no layer_meta.json. The mode is stored once in PartitionMeta and is valid for all layers. unitigs.bin.idx is built at the end of build_exact_evidence — never during MPHF construction — and is consumed at query time only.


    -

    Evidence encoding

    +

    Evidence encoding (exact)

    evidence.bin is a flat [u32; n] array with no header. Each u32 encodes one slot:

    bits [31:7] = chunk_id (25 bits) — index of the unitig chunk
     bits [6:0]  = rank     (7 bits)  — kmer index within the chunk (0-based)
     
    -

    Decoding: chunk_id = raw >> 7, rank = raw & 0x7F. Reconstructing the kmer: read k nucleotides at position rank within unitig chunk_id.

    -

    For k=31, m=11, the observed maximum is ~46 kmers per chunk — well within the 127-kmer u7 capacity. The structural maximum from superkmer construction is k − m + 1 = 21 kmers/unitig; longer unitigs arise from paths spanning more than one superkmer.

    +

    chunk_id = raw >> 7, rank = raw & 0x7F. Reconstructing the kmer: read k nucleotides at position rank within unitig chunk_id (requires unitigs.bin.idx for random access).

    +

    For k=31, m=11, the observed maximum is ~46 kmers per chunk — well within the 127-kmer u7 capacity.


    ptr_hash configuration

    type Mphf = PtrHash<
         u64,                              // key type: canonical kmer raw encoding
         CubicEps,                         // bucket fn: 2.4 bits/key, λ=3.5, α=0.99
    -    CachelineEfVec<Vec<CachelineEf>>, // remap: 11.6 bits/entry (Elias-Fano)
    +    CachelineEfVec<Vec<CachelineEf>>, // remap: Elias-Fano
         Xx64,                             // hasher: XXH3-64 with seed
         Vec<u8>,                          // pilots
     >;
     

    Xx64 is chosen over FxHash because canonical kmer raw values are left-aligned u64 with structural zeros in the low bits (42 zeros for k=11, 2 zeros for k=31), which single-multiply hashes distribute poorly.

    -

    CubicEps with PtrHashParams::<CubicEps>::default() (λ=3.5) is a balanced tradeoff: 2× slower construction than Linear/λ=3.0, 20% less space.

    +

    CubicEps with PtrHashParams::<CubicEps>::default() (λ=3.5): 2× slower construction than Linear/λ=3.0, ~20% less space.


    -

    Query path

    -
    pub fn query(&self, kmer: CanonicalKmer) -> Option<Hit<D::Item>> {
    -    self.mphf.find(kmer).map(|slot| Hit { slot, data: self.data.read(slot) })
    +

    Column append and merge support

    +

    These methods extend existing layers with new genome columns without touching the MPHF.

    +

    Layer-level genome column append

    +
    impl Layer<PersistentBitMatrix> {
    +    pub fn append_genome_column(layer_dir: &Path, value_of: impl Fn(usize) -> bool) -> OLMResult<()>
    +}
    +
    +impl Layer<PersistentCompactIntMatrix> {
    +    pub fn append_genome_column(layer_dir: &Path, value_of: impl Fn(usize) -> u32) -> OLMResult<()>
     }
     
    -

    MphfLayer::find probes the MPHF, decodes evidence, and verifies the kmer — returning Some(slot) on match, None otherwise. data.read(slot) is called only on a confirmed hit.

    -

    In LayeredMap, layers are probed in order; the first match wins. Expected probe depth: 1 for kmers in layer 0.

    +

    Both delegate to the corresponding PersistentBitMatrix::append_column / PersistentCompactIntMatrix::append_column. They write a new column file (col_NNNNNN.pbiv / col_NNNNNN.pciv) and update meta.json to increment n_cols. value_of is called once per slot (0..n).

    +

    Presence matrix initialisation

    +
    impl Layer<()> {
    +    pub fn init_presence_matrix(layer_dir: &Path, n_kmers: usize) -> OLMResult<()>
    +}
    +
    +

    Called on the first merge of a Presence-mode index. Creates presence/ with meta.json {"n": n_kmers, "n_cols": 1} and col_000000.pbiv set entirely to true. This retroactively records genome 0 (the original source) as present in every slot, satisfying the column-count invariant before any new-source column is appended.

    +

    Why the MPHF is never rebuilt

    +

    The MPHF, evidence, and unitigs are built once from the kmer set of a layer and are immutable for the lifetime of that layer. Adding a genome column does not change the kmer set — it only appends a new data column indexed by the same slot numbers. The only disk writes are one new .pciv/.pbiv file and a single meta.json update.


    Add-layer algorithm

    When adding dataset B to an existing index:

    1. For each partition, probe existing layers for kmers of B routed to that partition.
    2. Collect kmers absent from all layers → B \ index.
    3. -
    4. Write B \ index to a new unitigs.bin via MphfLayer::unitig_writer.
    5. -
    6. Call Layer<D>::build on the new directory.
    7. -
    8. Update meta.json.
    9. +
    10. Write B \ index to a new unitigs.bin via next_layer_writer().
    11. +
    12. Call Layer<D>::build (or build_presence) on the new layer directory.
    13. +
    14. Call push_layer (or append_layer) to register the new layer in meta.json.

    Each partition's new layer is built independently; the operation is fully parallel across partitions.


    @@ -1433,11 +1902,15 @@ bits [6:0] = rank (7 bits) — kmer index within the chunk (0-based) memmap2 0.9 -mmap of evidence and payload files +mmap of evidence and fingerprint files + + +bitvec +packed b-bit fingerprint storage obiskio -unitig file writer/reader +unitig file writer/reader + .idx build obicompactvec @@ -1448,8 +1921,8 @@ bits [6:0] = rank (7 bits) — kmer index within the chunk (0-based) parallel MPHF construction pass -ndarray 0.16 -aggregation output arrays +serde / serde_json +PartitionMeta serialisation diff --git a/doc/implementation/obipipeline.refs/index.html b/doc/implementation/obipipeline.refs/index.html new file mode 100644 index 0000000..dd1acf0 --- /dev/null +++ b/doc/implementation/obipipeline.refs/index.html @@ -0,0 +1,1059 @@ + + + + + + + + + + + + + + + + + + + + + + Obipipeline.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    + +
    + + + + + + +

    Coverage: implementation/obipipeline.md

    +

    Code couvert

    +
      +
    • obipipeline/src/lib.rs — WorkerPool, Pipeline, macro make_pipeline!
    • +
    • obipipeline/src/scheduler.rs — Scheduler avec Select biaisé sur les entrées de canaux
    • +
    +

    Notes

    +

    Document stable (librairie générique, peu de risque de dérive). +Vérifier si obipipeline est toujours utilisé dans la phase scatter de obikpartitionner +ou s'il a été remplacé par Rayon dans certains chemins.

    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/implementation/obipipeline/index.html b/doc/implementation/obipipeline/index.html index 5018f41..8693d77 100644 --- a/doc/implementation/obipipeline/index.html +++ b/doc/implementation/obipipeline/index.html @@ -662,6 +662,17 @@ +
  • + +
  • + + + + make_pipe! DSL + + + +
  • @@ -801,6 +812,34 @@ +
  • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
  • + + + + + + + + + +
  • @@ -879,6 +918,62 @@ + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + @@ -1087,6 +1182,17 @@ + + +
  • + + + + make_pipe! DSL + + + +
  • @@ -1145,7 +1251,7 @@

    obipipeline — parallel pipeline library

    -

    obipipeline is a generic, multi-threaded data pipeline crate. It connects a source, a chain of transforms, and a sink via crossbeam channels, running each stage with a shared worker pool and a biased scheduler.

    +

    obipipeline is a generic, multi-threaded data pipeline crate. It connects a source, a chain of stages, and a sink via crossbeam channels, running each stage with a shared worker pool and a biased scheduler.

    Core types

    @@ -1158,22 +1264,33 @@ - + - - + + + + + + + - +
    SourceFn<D>Box<dyn FnMut() -> Result<D, PipelineError> + Send+Sync>Box<dyn FnMut() -> Result<D, PipelineError> + Send> Called repeatedly; FnMut because it holds iterator state
    SharedFn<D>Arc<dyn Fn(D) -> Result<D, PipelineError> + Send+Sync>Shared across workers via Arc::clone (no copy of the closure)Arc<dyn Fn(D) -> Result<D, PipelineError> + Send + Sync>1→1 transform shared across workers via Arc::clone
    SharedFlatFn<D>Arc<dyn Fn(D, &Sender<Result<D, _>>, &Sender<isize>) + Send + Sync>1→N transform; pushes items into channel, sends delta
    SinkFn<D>Box<dyn Fn(D) -> Result<(), PipelineError> + Send+Sync>Box<dyn Fn(D) -> Result<(), PipelineError> + Send> Final consumer; returns Result so errors propagate back
    -

    Pipeline<D> holds one SourceFn, a Vec<SharedFn>, and one SinkFn.
    +

    Stages come in two variants:

    +
    pub enum Stage<D> {
    +    Transform(SharedFn<D>),   // 1→1
    +    Flat(SharedFlatFn<D>),    // 1→N
    +}
    +
    +

    Pipeline<D> holds one SourceFn, a Vec<Stage>, and one SinkFn.
    WorkerPool<D> wraps a Pipeline with n_workers and channel capacity.

    WorkerPool

    WorkerPool::new(pipeline: Pipeline<D>, n_workers: usize, capacity: usize) -> Self
    @@ -1193,7 +1310,7 @@
     
     
     capacity
    -Bound on every crossbeam channel in the pipeline (source output, inter-stage channels, worker input, sink input, sink error). Controls memory and back-pressure: a full channel blocks the sender until a slot frees.
    +Bound on every crossbeam channel in the pipeline. Controls memory and back-pressure: a full channel blocks the sender until a slot frees.
     
     
     
    @@ -1208,7 +1325,7 @@
     

    Each variant carries the concrete type for one stage's output. The macros pattern-match on this enum to route values between stages.

    Macros

    -

    Six low-level macros build individual stages; one high-level macro (make_pipeline!) composes them.

    +

    Eight low-level macros build individual stages; one high-level macro (make_pipeline!) composes them.

    Low-level

    make_source!(Enum, iterator, OutputVariant)          // iterator yields T
     make_source_fallible!(Enum, iterator, OutputVariant) // iterator yields Result<T, E>
    @@ -1216,6 +1333,9 @@
     make_transform!(Enum, func, InputVariant, OutputVariant)          // func: T -> U
     make_transform_fallible!(Enum, func, InputVariant, OutputVariant) // func: T -> Result<U, E>
     
    +make_flat_transform!(Enum, func, InputVariant, OutputVariant)          // func: T -> impl IntoIterator<Item=U>
    +make_flat_transform_fallible!(Enum, func, InputVariant, OutputVariant) // func: T -> Result<impl IntoIterator<Item=U>, E>
    +
     make_sink!(Enum, func, InputVariant)           // func: T -> ()
     make_sink_fallible!(Enum, func, InputVariant)  // func: T -> Result<(), E>
     
    @@ -1224,17 +1344,31 @@
    make_pipeline! {
         DataEnum,
         source   iterator     => OutputVariant,   // or source?  for fallible
    -    | func:  In => Out,                        // non-fallible transform
    -    |? func: In => Out,                        // fallible transform
    +    |   func: In => Out,                       // 1→1 non-fallible transform
    +    |?  func: In => Out,                       // 1→1 fallible transform
    +    ||  func: In => Out,                       // 1→N non-fallible flat transform
    +    ||? func: In => Out,                       // 1→N fallible flat transform
         sink     func         @ InputVariant,      // or sink?    for fallible
     }
     

    ? marks fallibility on source, individual transforms, or sink independently.
    Implemented as a TT muncher: the internal rule @build recurses over transform tokens one at a time, accumulating them into a vec![], then terminates on sink/sink?.

    +

    make_pipe! DSL

    +

    make_pipe! builds a sourceless/sinkless Pipe<D, In, Out> — a reusable, composable stage sequence:

    +
    make_pipe! {
    +    DataEnum : InType => OutType,
    +    |   func: InVariant => OutVariant,
    +    |?  func: InVariant => OutVariant,
    +    ||  func: InVariant => OutVariant,
    +    ||? func: InVariant => OutVariant,
    +}
    +
    +

    Two pipes compose with .then(other). Apply to an iterator with .apply(iter, n_workers, capacity) to get a PipeIter<Out> — an iterator over the pipeline output, backed by a background WorkerPool. The scatter step in obikmer uses make_pipe! and .apply() rather than the full make_pipeline! / WorkerPool pattern.

    Scheduler architecture

    Source thread ──► [source_rx] ──► Scheduler ──► [worker_tx] ──► Workers (×N)
                                           ▲                               │
                       [stage_rxs] ────────┘◄──────────────────────────────┘
    +                  [flat_delta_rx] ──► Scheduler  (in_flight adjustment)
                                           │
                                   [sink_err_rx]  ← errors from sink (highest priority)
                                           │
    @@ -1242,20 +1376,20 @@ Implemented as a TT muncher: the internal rule @build

    The scheduler is a single thread running a biased Select over all input channels. Priority order (highest first):

    index 0       sink_err_rx          abort on sink error
    -index 1       stage_rxs[N-1]       drain last stage first
    -...
    -index N       stage_rxs[0]
    -index N+1     source_rx            pull new data last
    +index 1       flat_delta_rx        adjust in_flight before dispatching
    +index 2..=n+1 stage_rxs[n-1..0]   drain last stage first
    +index n+2     source_rx            pull new data last
     

    This back-pressure-friendly ordering ensures downstream stages are drained before new items enter the pipeline.

    -

    Workers are generic: each receives (data, SharedFn, result_tx) and calls f(data), sending the result to the provided channel. The scheduler decides which transform to apply and where to route the result.

    -

    Termination uses an in_flight counter:

    +

    Workers are generic: each receives a WorkerTask — either Transform(data, stage_idx) or Flat(data, stage_idx). For Transform, the worker calls f(data) and sends the result to stage_txs[stage_idx]. For Flat, the worker calls f(data, &push_tx, &delta_tx): the closure pushes N items into push_tx then sends N-1 to delta_tx. The scheduler uses the delta to adjust in_flight without knowing N in advance.

    +

    Termination uses an in_flight: isize counter and a flat_workers_active: usize counter:

      -
    • incremented when an item is dispatched from source to workers
    • -
    • decremented when the item exits the last stage
    • -
    • the loop exits only when source_done && in_flight == 0
    • +
    • in_flight incremented when an item is dispatched from source to workers
    • +
    • in_flight decremented when the item exits the last stage to the sink
    • +
    • flat_workers_active incremented when a Flat task is dispatched, decremented when the delta arrives
    • +
    • the loop exits only when source_done && in_flight == 0 && flat_workers_active == 0
    -

    This guarantees all in-flight items complete before join().

    +

    This guarantees all in-flight items complete (including all N outputs of a flat stage) before join().

    Error handling

    PipelineError has four variants:

    @@ -1279,7 +1413,7 @@ index N+1 source_rx pull new data last - + diff --git a/doc/implementation/persistent_bit_vec.refs/index.html b/doc/implementation/persistent_bit_vec.refs/index.html new file mode 100644 index 0000000..54b32a2 --- /dev/null +++ b/doc/implementation/persistent_bit_vec.refs/index.html @@ -0,0 +1,1059 @@ + + + + + + + + + + + + + + + + + + + + + + Persistent bit vec.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    + +
    + + + + + + +

    Coverage: implementation/persistent_bit_vec.md

    +

    Code couvert

    +
      +
    • obicompactvec/src/bitvec.rs — PersistentBitVec, opérations mot u64, invariant de padding
    • +
    • obicompactvec/src/bitmatrix.rs — PersistentBitMatrix, wrapper colonne-major, append_column
    • +
    • obicompactvec/src/bitmatrix.rs — PersistentBitMatrixBuilder
    • +
    +

    Notes

    +

    Document d'implémentation stable. Vérifier que PersistentBitMatrixBuilder et append_column +sont couverts (utilisés dans Layer::<PersistentBitMatrix>::build_presence et append_genome_column).

    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/implementation/persistent_bit_vec/index.html b/doc/implementation/persistent_bit_vec/index.html index e1ae9f5..c7105ac 100644 --- a/doc/implementation/persistent_bit_vec/index.html +++ b/doc/implementation/persistent_bit_vec/index.html @@ -12,7 +12,7 @@ - + @@ -649,6 +649,34 @@ +
  • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
  • + + + + + + + + + +
  • @@ -1002,6 +1030,62 @@ + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + diff --git a/doc/implementation/persistent_compact_int_vec.refs/index.html b/doc/implementation/persistent_compact_int_vec.refs/index.html new file mode 100644 index 0000000..9945fd0 --- /dev/null +++ b/doc/implementation/persistent_compact_int_vec.refs/index.html @@ -0,0 +1,1060 @@ + + + + + + + + + + + + + + + + + + + + + + Persistent compact int vec.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    + +
    + + + + + + +

    Coverage: implementation/persistent_compact_int_vec.md

    +

    Code couvert

    +
      +
    • obicompactvec/src/builder.rs — PersistentCompactIntVecBuilder, cycle de vie
    • +
    • obicompactvec/src/reader.rs — PersistentCompactIntVec, accès aléatoire et séquentiel
    • +
    • obicompactvec/src/intmatrix.rs — PersistentCompactIntMatrix, wrapper colonne-major, append_column
    • +
    • obicompactvec/src/format.rs — format de fichier (magic PCIV, header, primary u8, overflow, index)
    • +
    +

    Notes

    +

    Document d'implémentation stable. Vérifier que append_column (utilisé dans merge et reindex) +est décrit. Vérifier que PersistentCompactIntMatrixBuilder est couvert (utilisé dans layer.rs).

    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/implementation/persistent_compact_int_vec/index.html b/doc/implementation/persistent_compact_int_vec/index.html index f37e659..3f9ea66 100644 --- a/doc/implementation/persistent_compact_int_vec/index.html +++ b/doc/implementation/persistent_compact_int_vec/index.html @@ -649,6 +649,34 @@ +
  • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
  • + + + + + + + + + +
  • @@ -985,6 +1013,62 @@ + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + diff --git a/doc/implementation/pipeline.refs/index.html b/doc/implementation/pipeline.refs/index.html new file mode 100644 index 0000000..0513aa0 --- /dev/null +++ b/doc/implementation/pipeline.refs/index.html @@ -0,0 +1,1065 @@ + + + + + + + + + + + + + + + + + + + + + + Pipeline.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    + +
    + + + + + + +

    Coverage: implementation/pipeline.md

    +

    Code couvert

    +
      +
    • obikpartitionner/src/partition.rs — estimation des paramètres (phase 0)
    • +
    • obiskbuilder/src/iter.rs — scatter : filtre entropie, extraction superkmers, routage partition (phase 1)
    • +
    • obikpartitionner/src/filter.rs — déduplication bucket-sort (phase 2)
    • +
    • obikpartitionner/src/kmer_sort.rs — tri externe + agrégation de comptages (phase 3)
    • +
    • obidebruinj/src/debruijn.rs — graphe De Bruijn, extraction des unitigs (phase 5)
    • +
    • obikpartitionner/src/index_layer.rs — construction MPHF + évidence (phase 6), paramètre block_bits
    • +
    • obikindex/src/index.rsbuild_layers(), dereplicate_and_count()
    • +
    +

    Notes

    +

    RISQUE DE DÉRIVE modéré. Vérifier : +- Phase 6 : la doc mentionne-t-elle le filtre d'abondance (min_ab, max_ab) ? +- Phase 6 : block_bits passé à build_index_layer depuis IndexConfig +- Phase 6 : dispatch exact/approx selon EvidenceKind dans build_index_layer

    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/implementation/pipeline/index.html b/doc/implementation/pipeline/index.html index a38914c..91d32f0 100644 --- a/doc/implementation/pipeline/index.html +++ b/doc/implementation/pipeline/index.html @@ -773,6 +773,34 @@ +
  • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
  • + + + + + + + + + +
  • @@ -851,6 +879,62 @@ + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + @@ -1104,7 +1188,9 @@
  • error valley → suggests min_count (typically the local minimum between the error peak and the coverage peak)
  • Phase 1 — Scatter

    -

    Single streaming pass over raw input files (FASTA/FASTQ, gzip). FASTQ quality scores are ignored. For each read:

    +

    Single streaming pass over raw input files (FASTA/FASTQ, gzip). FASTQ quality scores are ignored.

    +

    Input files are read via open_nuc_stream, which opens and decompresses the file, auto-detects the format (FASTA / FASTQ / GenBank), and yields a sequence of NucPage buffers. Each NucPage is a flat 64 KB buffer of normalised bytes (ACGT + \x00 separators), carrying a k−1 byte overlap from the preceding page so that no k-mer is lost at page boundaries. Per-record identity (sequence id, raw bytes) is not preserved; this is intentional — the scatter phase only needs normalised bases to produce superkmers.

    +

    For each read fragment within a page:

    1. Ambiguous base filter: cut at any non-ACGT base; discard fragments shorter than k.
    2. Entropy filter: scan each fragment with a sliding window of size k. When the kmer \(K_i = S[i \mathinner{..} i+k-1]\) ended by nucleotide \(S[j]\) (with \(j = i+k-1\)) has entropy below threshold \(\theta\), emit the current segment and start a new one (see algorithm below). \(K_i\) belongs to neither segment, and no valid kmer is lost.
    3. @@ -1154,8 +1240,13 @@ B ≈ 100 is tunable; RAM needed ≈ partition_size / B.

      for each kmer in sequence: kmer_counts[canonical(kmer)] += COUNT -

      Implemented as an external sort or a temporary HashMap, depending on partition size. At the end of this phase, each distinct canonical kmer has its exact total count.

      -

      Abundance filter applied here: kmers with total_count < q are discarded. q is a collection parameter (0 = keep all, including singletons for ≤1x data).

      +

      Implemented as a three-step pipeline in count_partition():

      +
        +
      1. External sort (kmer_sort::sort_unique_kmers): read dereplicated superkmers, extract canonical kmer raw u64 values, sort in RAM-bounded chunks (adaptive: 40% of available RAM ÷ n_threads, min 1 M kmers/chunk), k-way merge with inline dedup → sorted_unique.bin. f0 is now known exactly.
      2. +
      3. Provisional MPHF (ptr_hash): built from sorted_unique.bin via new_from_par_iter(f0, ...). Stored to mphf1.bin; sorted_unique.bin deleted immediately.
      4. +
      5. Accumulation pass: re-read dereplicated superkmers; for each kmer, slot = mphf.index(kmer.raw()), increment counts1[slot] by the superkmer COUNT. Stored in a PersistentCompactIntVec (counts1.bin).
      6. +
      +

      At the end of this phase, each distinct canonical kmer has its exact total count, and the frequency spectrum (spectrums/{label}.json) is written to the index root.

      No pre-filter on super-kmer COUNT is possible at phase 2: a super-kmer with COUNT=1 may contain only high-abundance kmers, each present in many other super-kmers across the partition.

      Phase 4 — Super-kmer compaction

      The valid kmer set from phase 3 is used as a mask to rewrite the super-kmer files:

      @@ -1188,14 +1279,52 @@ branching / dead-end → unitig start or end

      Output: unitigs.bin — the permanent evidence structure for the partition. Each kmer in the partition appears at exactly one (unitig_id, offset) location.

      Scope of local unitigs: these are unitigs of the partition's local de Bruijn graph, not global unitigs. A kmer whose k-1 successor or predecessor falls in another partition appears as a dead end locally and terminates the unitig. This does not affect correctness of verification but means partition-local unitigs cannot be directly reused for global assembly.

      Phase 6 — MPHF construction and index finalisation

      -

      Built once on the definitive kmer set (all kmers in all unitigs of the partition). See obilayeredmap and MPHF selection for the current implementation.

      -
      kmers from unitigs → MPHF → mphf.bin
      -                   → evidence.bin : n × u32, each = (chunk_id: 25 bits | rank: 7 bits)
      -                   → payload      : counts/ (mode 2) or presence/ (mode 3)
      +

      build_index_layer is called per partition (in parallel via build_layers) with the following parameters sourced from IndexConfig:

      +
        +
      • block_bits — from IndexConfig::block_bits; controls the .idx block size (2^block_bits unitig chunks per block) for exact evidence
      • +
      • evidenceEvidenceKind::Exact or EvidenceKind::Approx { b, z }; propagated unchanged from IndexConfig::evidence
      • +
      • min_ab / max_ab — abundance bounds applied before graph construction
      • +
      • with_counts — whether to store kmer counts alongside set membership
      • +
      +

      Abundance filtering: when min_ab > 1 or max_ab.is_some(), the provisional mphf1.bin and counts1.bin produced in phase 3 are memory-mapped. Each canonical kmer is accepted only if its count in counts1 satisfies the bounds. If either file is absent, filtering is skipped (all kmers accepted).

      +
      for each kmer in dereplicated super-kmer:
      +    ab = counts1[mphf1.index(kmer.raw())]
      +    if ab < min_ab || ab > max_ab: skip
      +    graph.push(kmer)
       
      -

      The MPHF is built in two passes over unitigs.bin: parallel pass for mphf.bin, sequential pass for evidence.bin and payload. The exact kmer count is available from the unitig index (unitigs.bin.idx) before the passes begin.

      -

      Exact verification via unitig evidence:

      -

      unitigs.bin serves as the evidence structure. The MPHF maps every input to [0, N) including absent kmers — the unitig read-back (via evidence.bin) is the only correct membership test.

      +

      Graph build and unitig write:

      +

      The surviving kmers are fed into GraphDeBruijn, which computes degrees and yields unitigs. Unitigs are written to layer_0/unitigs.bin via a UnitigFileWriter.

      +

      MPHF and evidence build:

      +

      Layer::build (membership-only) or Layer::<PersistentCompactIntMatrix>::build (with counts) is called next. Internally, MphfLayer::build performs two passes:

      +
        +
      1. Pass 1 (parallel): build unitigs.bin.idx (block size = 2^block_bits) then construct the MPHF from all canonical kmers in unitigs.bin; store to mphf.bin.
      2. +
      3. Pass 2 (sequential): for each kmer in unitigs.bin, compute its slot and write evidence.bin (chunk_id: 25 bits | rank: 7 bits packed into a u32); also invoke the payload callback (fill_slot) to populate counts/ if with_counts.
      4. +
      +

      After Layer::build completes, layer_meta.json records EvidenceKind::Exact.

      +

      Approximate evidence override:

      +

      If evidence is EvidenceKind::Approx { b, z }, build_approx_evidence is called immediately after Layer::build. It overwrites the exact evidence bundle with fingerprint.bin (b-bit hash per slot) and rewrites layer_meta.json with EvidenceKind::Approx { b, z }. No .idx file is needed at query time in this mode.

      +
      // Exact path  →  evidence.bin + unitigs.bin.idx + layer_meta.json(Exact)
      +// Approx path →  fingerprint.bin + layer_meta.json(Approx{b,z})
      +//               (evidence.bin left on disk but not used)
      +
      +

      Partition metadata:

      +

      After all layer files are written, PartitionMeta { n_layers: 1 } is serialised to index/meta.json inside the partition directory. This file is required by LayeredMap::open for subsequent merge operations.

      +

      File layout per partition after phase 6:

      +
      part_XXXXX/
      +  index/
      +    meta.json                ← PartitionMeta { n_layers: 1 }
      +    layer_0/
      +      unitigs.bin            ← permanent evidence (all modes)
      +      unitigs.bin.idx        ← block index (exact mode only)
      +      mphf.bin               ← MPHF
      +      evidence.bin           ← exact evidence (exact mode)
      +      fingerprint.bin        ← b-bit fingerprints (approx mode)
      +      layer_meta.json        ← EvidenceKind tag
      +      counts/                ← PersistentCompactIntMatrix (with_counts only)
      +
      +

      Cleanup: unless --keep-intermediate is set, remove_build_artifacts deletes dereplicated.skmer.zst, mphf1.bin, and counts1.bin after all partitions are indexed.

      +

      See obilayeredmap and MPHF selection for data structure details.

      +

      Query path (exact evidence):

      query kmer q
         → canonical_minimizer(q) → hash → PART → part_XXXXX/
         → MPHF(q) → slot s
      @@ -1204,7 +1333,13 @@ branching / dead-end → unitig start or end
         → match   : return payload[s]   ← exact hit
         → no match: kmer absent         ← MPHF collision on absent kmer
       
      -

      superkmers.bin.gz is no longer needed at this point and can be deleted.

      +

      Query path (approximate evidence):

      +
      query kmer q
      +  → MPHF(q) → slot s
      +  → fingerprint[s] matches seq_hash(q)?
      +  → yes : probable hit (FP rate = 1/2^b per kmer, 1/2^(b·z) per z-window)
      +  → no  : kmer absent
      +

        diff --git a/doc/implementation/rebuild_filter/index.html b/doc/implementation/rebuild_filter/index.html new file mode 100644 index 0000000..b4abb1e --- /dev/null +++ b/doc/implementation/rebuild_filter/index.html @@ -0,0 +1,1611 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Kmer filtering (rebuild/dump/unitig) - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        + +
        + + + + + + +
        + + +
        + +
        + + + + + + +
        +
        + + + +
        +
        +
        + + + + + +
        +
        +
        + + + + + + + +
        + +
        + + + + + +

        Kmer filtering and ingroup/outgroup predicates

        +

        The rebuild, dump, and unitig commands all share the same filtering +system. Filters can select k-mers based on per-genome quorum counts, optionally +restricted to ingroup and outgroup genome sets derived from genome +metadata.

        +

        rebuild additionally accepts --min-total-count / --max-total-count filters +that operate on the sum of counts across all genomes.

        +

        Predicate syntax

        +

        Each --ingroup and --outgroup flag takes a predicate of the form:

        +
        key OP value1|value2|…
        +
        +
    Internal routing error
    StepError(Box<dyn Error>)StepError(Box<dyn Error + Send + Sync>) Error from user code (wrapped by make_*_fallible!)
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    OperatorMeaning
    * or allwildcard — every genome matches unconditionally
    key=v1\|v2exact match — genome's key equals v1 or v2
    key!=vnegation — genome's key equals none of the values
    key~pathpath ancestry — genome's key is path or a descendant
    key!~pathnot a descendant
    +

    Multiple values separated by | are always OR-ed within the predicate.

    +

    Path matching (~ and !~)

    +

    Metadata values can represent hierarchical taxonomic paths such as +/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana.

    +
      +
    • Absolute pattern (starts with /): the value must start with the pattern + at a segment boundary. + taxon~/Betulaceae/Betula matches /Betulaceae/Betula/nana and + /Betulaceae/Betula but not /Betulaceae/Betuloides/….
    • +
    • Bare segment (no leading /): the value must contain the pattern as an + exact path component anywhere. + taxon~Betula matches any path that has Betula as one of its segments.
    • +
    +

    Missing metadata key → NA

    +

    If a genome does not carry the queried metadata key, the predicate returns NA. +NA propagates through the group evaluation logic (see below), and genomes that +cannot be classified are ignored in all quorum counts.

    +

    Group semantics

    +

    Multiple predicates

    + + + + + + + + + + + + + + + + + +
    FlagCombination rule
    --ingroup (repeated)AND — genome must satisfy all predicates
    --outgroup (repeated)OR — genome satisfies any predicate
    +

    Three-value logic

    +

    Each predicate returns true, false, or NA (absent key).

    +
      +
    • AND: false absorbs everything; NA propagates unless already false.
    • +
    • OR: true absorbs everything; NA propagates unless already true.
    • +
    +

    Classification and priority

    +

    For each genome:

    +
      +
    1. Evaluate AND(ingroup predicates)in_result
    2. +
    3. Evaluate OR(outgroup predicates)out_result
    4. +
    5. If in_result = trueIngroup (ingroup wins over outgroup)
    6. +
    7. Else if out_result = trueOutgroup
    8. +
    9. Otherwise → Uncategorized (ignored in all quorum counts)
    10. +
    +

    Implicit groups

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    --ingroup--outgroupEffective behaviour
    not setnot setall genomes form the ingroup
    setnot setonly ingroup quorum flags apply
    not setsetonly outgroup quorum flags apply
    setsetboth constraints apply simultaneously
    +

    Quorum flags

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    FlagApplies toMeaning
    --min-count Ningroupk-mer present in at least N ingroup genomes
    --max-count Ningroupk-mer present in at most N ingroup genomes
    --min-frac Fingroupk-mer present in at least fraction F of ingroup genomes
    --max-frac Fingroupk-mer present in at most fraction F of ingroup genomes
    --min-outgroup-count Noutgroupk-mer present in at least N outgroup genomes
    --max-outgroup-count Noutgroupk-mer present in at most N outgroup genomes
    --min-outgroup-frac Foutgroupk-mer present in at least fraction F of outgroup genomes
    --max-outgroup-frac Foutgroupk-mer present in at most fraction F of outgroup genomes
    --min-total-count Nall genomessum of per-genome counts ≥ N (rebuild only)
    --max-total-count Nall genomessum of per-genome counts ≤ N (rebuild only)
    --presence-threshold Nallper-genome count > N to be considered "present" (default 0)
    +

    Defaults: mins = 0 (no lower bound), max counts = group size, max fracs = 1.0 +(no upper bound). A filter with all defaults is a no-op.

    +

    Fractions are computed over the size of the classified group, not over total +genome count. An empty group (no genome classified as ingroup/outgroup) never +triggers a filter failure.

    +

    Examples

    +

    Keep k-mers specific to Betula nana — present in at least 2 B. nana genomes +and absent from every other genome in the index:

    +
    obikmer rebuild src --output dst \
    +  --ingroup  "species=Betula_nana" \
    +  --outgroup "*" \
    +  --min-count 2 \
    +  --max-outgroup-count 0
    +
    +

    Keep k-mers found in at least 2 Betula nana genomes and absent from all +other Betula:

    +
    obikmer rebuild src --output dst \
    +  --ingroup  "species=Betula_nana" \
    +  --outgroup "genus=Betula" \
    +  --min-count 2 \
    +  --max-outgroup-count 0
    +
    +

    Use taxonomic paths — keep k-mers present in ≥ 50 % of the Betula clade +and in fewer than 10 % of everything outside Betulaceae:

    +
    obikmer rebuild src --output dst \
    +  --ingroup  "taxon~/Betulaceae/Betula" \
    +  --outgroup "taxon!~/Betulaceae" \
    +  --min-frac 0.5 \
    +  --max-outgroup-frac 0.1
    +
    +

    Multiple outgroup predicates (OR): exclude k-mers present in Alnus or Carpinus:

    +
    obikmer rebuild src --output dst \
    +  --ingroup  "genus=Betula" \
    +  --outgroup "genus=Alnus" \
    +  --outgroup "genus=Carpinus" \
    +  --max-outgroup-count 0
    +
    +

    The same flags work identically for dump and unitig. To dump only k-mers +specific to Betula nana:

    +
    obikmer dump myindex \
    +  --ingroup  "species=Betula_nana" \
    +  --outgroup "*" \
    +  --min-count 1 \
    +  --max-outgroup-count 0
    +
    +

    To enumerate unitigs of the Betula-specific subgraph:

    +
    obikmer unitig myindex \
    +  --ingroup  "genus=Betula" \
    +  --outgroup "*" \
    +  --min-count 2 \
    +  --max-outgroup-count 0
    +
    +

    Implementation

    +
      +
    • +

      obikpartitionner::filter::GroupQuorumFilter — implements KmerFilter + using pre-computed ingroup and outgroup index vectors. The heavy logic + (predicate parsing, three-value evaluation, genome classification) happens + once before any iteration; each k-mer row evaluation is a simple index + lookup and counter.

      +
    • +
    • +

      obikmer::cmd::predicate::FilterArgs — shared clap argument group + embedded via #[command(flatten)] in RebuildArgs, DumpArgs, and + UnitigArgs. FilterArgs::build_filters() returns a ready-to-use filter + list.

      +
    • +
    • +

      obikpartitionner::KmerPartition::iter_partition_kmers — accepts + filters: &[Box<dyn KmerFilter>] and applies them per-kmer before invoking + the callback. rebuild, dump, and unitig all go through this single + entry point.

      +
    • +
    + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/implementation/storage.refs/index.html b/doc/implementation/storage.refs/index.html new file mode 100644 index 0000000..6f3db94 --- /dev/null +++ b/doc/implementation/storage.refs/index.html @@ -0,0 +1,1064 @@ + + + + + + + + + + + + + + + + + + + + + + Storage.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    + +
    + + + + + + +

    Coverage: implementation/storage.md

    +

    Code couvert

    +
      +
    • obikindex/src/meta.rs — IndexMeta, IndexConfig (version, config, genomes)
    • +
    • obikindex/src/index.rs — layout sur disque : partitions/, index.meta
    • +
    • obilayeredmap/src/meta.rs — LayerMeta (evidence kind), PartitionMeta (n_layers)
    • +
    • obiskio/src/unitig_index.rs — fichiers unitigs.bin + unitigs.bin.idx
    • +
    +

    Notes

    +

    FORT RISQUE DE DÉRIVE. Nombreux champs ajoutés : +- IndexConfig : champs evidence (EvidenceKind) et block_bits ajoutés +- Nouveau fichier fingerprint.bin pour l'évidence approximative +- LayerMeta / layer_meta.json introduit pour stocker EvidenceKind par layer +- Structure du répertoire layer : evidence.bin vs fingerprint.bin selon le mode +Mettre à jour le schéma de layout sur disque en conséquence.

    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/implementation/storage/index.html b/doc/implementation/storage/index.html index 543a74d..d284fd8 100644 --- a/doc/implementation/storage/index.html +++ b/doc/implementation/storage/index.html @@ -64,7 +64,7 @@
    - + Skip to content @@ -575,6 +575,24 @@ + + @@ -592,6 +610,174 @@ + + + + @@ -659,6 +845,34 @@ +
  • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
  • + + + + + + + + + +
  • @@ -737,6 +951,62 @@ + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + @@ -874,6 +1144,163 @@ + + +
    @@ -889,9 +1316,131 @@ -

    On-disk collection structure

    -

    See obilayeredmap crate for the current on-disk layout.

    -

    The index root contains one part_XXXXX/ directory per partition, each holding one or more layer_N/ directories. Each layer directory contains mphf.bin, unitigs.bin, unitigs.bin.idx, evidence.bin, and optionally a counts/ or presence/ payload directory.

    +

    On-disk index layout

    +

    Directory tree

    +
    <index_root>/
    +  index.meta                  ← JSON: IndexMeta
    +  scatter.done                ← sentinel: scatter phase complete
    +  count.done                  ← sentinel: dereplicate + count complete
    +  index.done                  ← sentinel: MPHF index fully built
    +  spectrums/
    +    <label>.json              ← kmer frequency spectrum per genome
    +  partitions/
    +    part_00000/               ← one dir per partition (zero-padded 5 digits, 0..2^n_bits−1)
    +      index/
    +        meta.json             ← PartitionMeta { n_layers }
    +        layer_0/
    +          unitigs.bin         ← binary unitig sequences (2-bit packed)
    +          unitigs.bin.idx     ← block-sampled offset index (exact evidence only)
    +          mphf.bin            ← serialised PtrHash MPHF
    +          layer_meta.json     ← LayerMeta { evidence: EvidenceKind }
    +          evidence.bin        ← chunk_id:rank per MPHF slot (Exact only)
    +          fingerprint.bin     ← b-bit fingerprints per MPHF slot (Approx only)
    +          counts/             ← PersistentCompactIntMatrix (if with_counts=true)
    +          presence/           ← PersistentBitMatrix (if presence mode, merge)
    +        layer_1/              ← added by merge; same structure as layer_0
    +        layer_2/ …
    +    part_00001/ …
    +
    +

    State machine (sentinels)

    +

    The sentinels are touched atomically at the end of each pipeline stage. +A partial run (e.g. scatter interrupted) leaves no sentinel; the state is +detected as the lowest sentinel present.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    StateSentinel presentMeaning
    Emptyindex.meta exists; scatter not started or interrupted
    Scatteredscatter.doneAll super-kmers routed to partition files
    Countedcount.donePartitions dereplicated; spectrums/ written
    Indexedindex.doneAll MPHF layers built; index ready for queries
    +

    index.meta (IndexMeta)

    +
    {
    +  "version": 1,
    +  "config": {
    +    "kmer_size": 31,
    +    "minimizer_size": 11,
    +    "n_bits": 8,
    +    "with_counts": false,
    +    "evidence": "Exact",
    +    "block_bits": 0
    +  },
    +  "genomes": [
    +    { "label": "genome_A", "meta": { "species": "Homo sapiens" } }
    +  ]
    +}
    +
    +

    n_bits determines the partition count: 2^n_bits directories under partitions/.

    +

    evidence is either the string "Exact" or {"Approx": {"b": 8, "z": 1}}.

    +

    block_bits controls the .idx granularity: one offset entry every 2^block_bits +chunks. block_bits=0 stores one entry per chunk (O(1) random access, largest .idx).

    +

    GenomeInfo.meta is a free-form string→string map for categorical metadata (e.g. +taxonomy, sample origin). It is optional; defaults to empty.

    +

    Layer files

    +

    unitigs.bin

    +

    2-bit packed binary unitig sequences. Each record: 1 byte seql_minus_k +(nucleotide length − k), followed by ceil((seql_minus_k + k) / 4) bytes of +packed sequence. Long unitigs are transparently split into overlapping chunks +(k−1 nucleotide overlap) so no k-mer crosses a chunk boundary.

    +

    unitigs.bin.idx (Exact only)

    +

    Magic UIX3, little-endian header: block_bits (u32), n_unitigs (u32), +n_kmers (u64), then ceil(n_unitigs / 2^block_bits) + 1 byte-offset entries +(u32 each, last entry is a sentinel past-end offset). Absent for Approx layers.

    +

    mphf.bin

    +

    PtrHash MPHF serialised with epserde. Maps canonical kmer (u64, left-aligned +2-bit) to a slot index in [0, n_kmers).

    +

    layer_meta.json (LayerMeta)

    +

    { "evidence": { "type": "exact" } }
    +
    +or +
    { "evidence": { "type": "approx", "b": 8, "z": 1 } }
    +

    +

    evidence.bin (Exact)

    +

    One (chunk_id: u32, rank: u8) record per MPHF slot, packed. Used to verify +that the kmer mapped to a slot is actually present: unitigs.bin[chunk_id][rank] +is re-read and compared against the query.

    +

    fingerprint.bin (Approx)

    +

    b-bit fingerprint per MPHF slot derived from the kmer's sequence hash. +False-positive rate per query ≈ 1/2^b. With Findere parameter z ≥ 2, +z consecutive k-mers must all match, reducing the effective FP rate to +approximately W / 2^(b·z) per read of length L +(where W = L − k − z + 2).

    +

    counts/ (PersistentCompactIntMatrix)

    +

    Present when with_counts=true. One column per genome; each row holds the +per-genome k-mer count for the corresponding MPHF slot. Appended column-by-column +during indexing and merge.

    +

    presence/ (PersistentBitMatrix)

    +

    Present when the layer was built in presence/absence mode (merge path). +One bit per genome per MPHF slot. Written during merge; never present on a +freshly indexed single-genome layer.

    +

    meta.json (PartitionMeta)

    +
    { "n_layers": 2 }
    +
    +

    Records how many layer_N/ directories exist under index/. Incremented by +each merge that adds a layer.

    diff --git a/doc/implementation/superkmer.refs/index.html b/doc/implementation/superkmer.refs/index.html new file mode 100644 index 0000000..f5ada01 --- /dev/null +++ b/doc/implementation/superkmer.refs/index.html @@ -0,0 +1,1059 @@ + + + + + + + + + + + + + + + + + + + + + + Superkmer.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    + +
    + + + + + + +

    Coverage: implementation/superkmer.md

    +

    Code couvert

    +
      +
    • obikseq/src/superkmer.rs — layout mémoire SuperKmer (header 32 bits + séquence byte-alignée), encodage ASCII, revcomp, deque minimiseur
    • +
    • obiskbuilder/src/lib.rs — fenêtre glissante monotone pour le maintien du minimiseur
    • +
    +

    Notes

    +

    Document d'implémentation détaillé. Vérifier que le layout header (longueur, orientation, +position minimiseur) n'a pas changé. La doc mentionne un revcomp SIMD — vérifier si c'est +toujours le cas ou si l'implémentation est scalaire.

    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/implementation/superkmer/index.html b/doc/implementation/superkmer/index.html index 2537ee6..d7ee7a0 100644 --- a/doc/implementation/superkmer/index.html +++ b/doc/implementation/superkmer/index.html @@ -751,6 +751,34 @@ +
  • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
  • + + + + + + + + + +
  • @@ -829,6 +857,62 @@ + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + @@ -1046,61 +1130,49 @@

    SuperKmer — implementation

    Memory layout

    -

    A super-kmer is stored as a 32-bit header followed by a byte-aligned nucleotide sequence (2 bits/base, nucleotide 0 at the MSB of the first byte):

    +

    SuperKmer holds two separate fields:

    +
    pub struct SuperKmer {
    +    pub(crate) count: u32,
    +    pub(crate) inner: PackedSeq,
    +}
    +
    +

    PackedSeq stores a 2-bit packed DNA sequence as a heap-allocated Box<[u8]> plus a tail: u8 field:

    - + - - - + + + - - - + + +
    FieldBitsType Role
    COUNT24Occurrence count (≤ 16 M)tailu8Number of valid nucleotides in the last byte: 0 encodes 4, 1–3 are identity
    NKMERS8Number of kmers (= seq_length − k + 1, range 1–255)seqBox<[u8]>2-bit packed bytes, nucleotide 0 at bits 7–6 of seq[0]
    -

    Bit layout (MSB to LSB): [31:8] COUNT [7:0] NKMERS

    -

    NKMERS is stored as a raw u8 in kmer units, not nucleotides. The nucleotide length is recovered as NKMERS + k − 1. This avoids the awkward wrapping convention (0 = 256) that would be needed if nucleotide length were stored directly, and gains k−1 = 30 units of headroom:

    - - - - - - - - - - - - - - - - - - - - -
    unitu8 coversmax nucleotides
    nucleotides255 nt225 kmers
    kmers255 kmers285 nt
    -

    The public accessors:

    -
    fn n_kmers(&self)            -> usize { (self.0 & 0xFF) as usize }
    -fn seql(&self)               -> usize { self.n_kmers() + K - 1 }
    -fn count(&self)              -> u32   { self.0 >> 8 }
    -fn increment(&mut self)               { self.0 += 1 << 8; }
    -fn add(&mut self, n: u32)             { self.0 += n << 8; }
    -fn set_count(&mut self, n: u32)       { self.0 = (self.0 & 0xFF) | (n << 8); }
    +

    Nucleotide length is recovered without storing it explicitly:

    +
    seql = (seq.len() - 1) * 4 + tail_count(tail)
    +
    +

    There is no packed header word — count and the sequence live in separate fields.

    +

    The on-disk binary format (produced by write_to_binary) is:

    +
    [varint(count)] [u8: seql − k] [packed bytes…]
    +
    +

    seql − k fits in a u8 when n_kmers = seql − k + 1 ≤ MAX_KMERS_PER_CHUNK (= 256). If a super-kmer exceeds 256 kmers, write_to_binary splits it into overlapping chunks (k−1 nucleotide overlap, same count per chunk), each a self-contained record readable by read_from_binary.

    +

    The public accessors operate on the struct fields directly:

    +
    fn seql(&self)            -> usize { self.inner.seql() }
    +fn count(&self)           -> u32   { self.count }
    +fn increment(&mut self)            { self.count += 1; }
    +fn add(&mut self, n: u32)          { self.count += n; }
    +fn set_count(&mut self, n: u32)    { self.count = n; }
     
    -

    In practice, observed super-kmer lengths on metagenomic data (k=31) are below 55 nucleotides (≤ 25 kmers) — far from the 255-kmer cap. If a super-kmer ever exceeds 255 kmers, it is split with a k−1 nucleotide overlap, preserving all kmers without duplication (identical mechanism to partition-boundary splits).

    -

    The sequence is always stored in canonical form (lexicographic minimum of forward and reverse complement), with nucleotide 0 at the MSB of the first byte. The byte array can be hashed directly without any adjustment.

    ASCII encoding and decoding

    Two lookup tables handle ASCII ↔ 2-bit conversion:

      @@ -1125,7 +1197,7 @@

    REVCOMP4 is 256 bytes (fits in L1 cache), computed at compile time. No endianness dependency — all operations are pure arithmetic on byte values.

    Step 2 — realignment. After step 1, padding = n × 8 − seql × 2 spurious bits (complements of the original padding A's) appear at the start of the array. They are flushed left using BitSlice<u8, Msb0>::rotate_left(padding) from the bitvec crate, which is SIMD-accelerated. The trailing padding bits are then zeroed:

    -
    let seql = self.n_kmers() + k - 1;
    +
    let seql = self.seql();
     shift = n * 8 - seql * 2          // number of padding bits
     bits.rotate_left(shift)
     bits[len - shift..].fill(false)
    @@ -1143,7 +1215,7 @@
     

    Minimizer sliding window

    -

    Super-kmers are built by SuperKmerIter (crate obiskbuilder), which maintains the current minimizer with a monotonic deque over a sliding window of W = k − m + 1 m-mer positions.

    +

    Super-kmers are built by SuperKmerIter (crate obiskbuilder), which tracks the current minimizer with a monotonic deque (Ring<MmerItem, 32>) inside RollingStat, a rolling-window entropy and minimizer tracker.

    Each deque entry stores:

    @@ -1167,20 +1239,11 @@ - +
    hash u64\(H(\text{canonical})\) — ordering key for random minimizer selectionhash_kmer(canonical << (64 − 2m)) — ordering key for random minimizer selection
    -

    The hash \(H\) is the seeded splitmix64 finalizer (see Minimizer selection):

    -
    fn hash_mmer(canonical: u64) -> u64 {
    -    let x = canonical ^ 0x9e3779b97f4a7c15;   // seed: eliminates fixed point at 0
    -    let x = x ^ (x >> 30);
    -    let x = x.wrapping_mul(0xbf58476d1ce4e5b9);
    -    let x = x ^ (x >> 27);
    -    let x = x.wrapping_mul(0x94d049bb133111eb);
    -    x ^ (x >> 31)
    -}
    -
    +

    The hash uses the seeded splitmix64 finalizer (mix64(raw ^ 0x9e3779b97f4a7c15)), the same function as kmer::hash_kmer.

    On each new nucleotide, once the window is full, the deque is updated:

    Algorithm — minimizer deque update

    @@ -1196,17 +1259,21 @@

    The front of the deque is always the current minimizer. Because the deque is maintained in strictly increasing hash order, each entry is popped at most once — O(1) amortized per nucleotide.

    -

    A super-kmer boundary is emitted when the minimizer changes: deque.front.hash ≠ prev_hash. The canonical field of the front entry is not used for boundary detection — that uses the hash alone. The canonical value is stored so that the partition key \(H(\text{canonical})\) can be recomputed independently at routing time from the stored minimizer_pos, without inheriting the minimum-order-statistic bias (see Minimizer selection — partition key independence).

    +

    A super-kmer boundary is emitted when the minimizer changes: current_minimizer != prev_minimizer. SuperKmerIter also emits a boundary when:

    +
      +
    • entropy of the current k-mer falls at or below the threshold θ (cursor retreated by k−1)
    • +
    • super-kmer length reaches 256 nucleotides (cursor retreated by k)
    • +

    Kmer extraction

    -

    A k-mer is extracted from a super-kmer with SuperKmer::kmer(i, k), which returns a Kmer — a left-aligned u64 newtype (see Kmer implementation):

    -
    pub fn kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError>
    +

    A k-mer is extracted from a super-kmer with SuperKmer::kmer(i), which delegates to PackedSeq::extract::<KLen>(i) and returns a Kmer — a left-aligned u64 newtype (see Kmer implementation):

    +
    pub fn kmer(&self, i: usize) -> Result<Kmer, KmerError>
     
    -

    The bit slice seq[i*2 .. (i+k)*2] (Msb0 order) is loaded as a big-endian u64 via bitvec::load_be, then left-shifted to produce the canonical left-aligned layout. One call — no loop, no allocation.

    +

    The bit slice seq[i*2 .. (i+k)*2] (Msb0 order) is loaded as a u64 via bitvec::load_be, then left-shifted to produce the canonical left-aligned layout. One call — no loop, no allocation.


    Algorithm — Super-kmer reverse complement

    procedure SuperKmerRevcomp(seq, SEQL):
    -    seql  ← NKMERS + k − 1               -- nucleotide length
    +    seql  ← nucleotide length
         n     ← ⌈seql / 4⌉                  -- number of bytes
         shift ← n × 8 − seql × 2            -- padding bits to flush
     
    diff --git a/doc/implementation/unitig_evidence.refs/index.html b/doc/implementation/unitig_evidence.refs/index.html
    new file mode 100644
    index 0000000..af35b04
    --- /dev/null
    +++ b/doc/implementation/unitig_evidence.refs/index.html
    @@ -0,0 +1,1064 @@
    +
    +
    +
    +  
    +    
    +      
    +      
    +      
    +      
    +      
    +      
    +      
    +      
    +        
    +      
    +      
    +      
    +      
    +    
    +    
    +      
    +        Unitig evidence.refs - obikmer
    +      
    +    
    +    
    +      
    +      
    +      
    +
    +
    +    
    +    
    +      
    +    
    +    
    +      
    +        
    +        
    +        
    +        
    +        
    +      
    +    
    +    
    +    
    +    
    +      
    +
    +    
    +    
    +  
    +  
    +  
    +    
    +  
    +    
    +    
    +    
    +    
    +    
    +    
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    + +
    + + + + + + +

    Coverage: implementation/unitig_evidence.md

    +

    Code couvert

    +
      +
    • obiskio/src/unitig_index.rs — format unitigs.bin + unitigs.bin.idx, UnitigFileWriter, UnitigFileReader, build_unitig_idx(), DEFAULT_BLOCK_BITS=0, chemin chaud block_bits=0 dans chunk_start()
    • +
    • obilayeredmap/src/evidence.rs — encodage Evidence (chunk_id 25 bits | rank 7 bits), EvidenceWriter
    • +
    • obidebruinj/src/debruijn.rs — extraction unitigs, chunking à MAX_KMERS_PER_CHUNK
    • +
    +

    Notes

    +

    FORT RISQUE DE DÉRIVE. Changements récents : +- DEFAULT_BLOCK_BITS est passé de 6 à 0 (accès O(1) par défaut) +- block_bits est maintenant un paramètre runtime de build_unitig_idx() et UnitigFileWriter +- chunk_start() a un chemin chaud explicite pour block_bits=0 (accès tableau direct, 0 scan) +- open() vs open_sequential() : distinction nouvelle, importante pour la compréhension du coût +- iter_unitigs() ajouté comme alias public de iter_chunks_sequential() +Mettre à jour la description du format .idx et le modèle de coût d'accès aléatoire.

    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/implementation/unitig_evidence/index.html b/doc/implementation/unitig_evidence/index.html index 3cf38b7..9440a04 100644 --- a/doc/implementation/unitig_evidence/index.html +++ b/doc/implementation/unitig_evidence/index.html @@ -1,76 +1,166 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + Unitig evidence encoding - obikmer + + + + + + - - - - - - - - -Unitig evidence encoding - obikmer - - - - - - - - - - - +
    + +
    + + + + + +
    - +
    -
    -
    -
    -
    -
    -
    - - -
  • - -
  • + + + + + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + + + + + + + + + + + + + + + + + + + +
  • + + + + + + + -
  • + + + + - - - -
    -
    -
    -
    -
    -
    -
    +
    +
    + + + +
    +
    +
    + + + - -
  • - - - - Non-determinism of the unitig decomposition - - - - -
  • -
  • - - - - Open questions - - - -
  • - - -
    -
    -
    -
    -
    +
    +
    +
    + + + +
    + +
    + + + + +

    Unitig-based MPHF evidence encoding

    Role of unitigs in the index

    -

    The MPHF maps each canonical kmer to an integer slot, but provides no way to reconstruct the kmer from its slot. A downstream operation (query, set operation) that receives a slot index and needs the kmer sequence must be able to retrieve it. The evidence file serves this purpose: it stores the kmer sequences in compact form and provides, for each MPHF slot, a pointer to where the corresponding kmer can be decoded.

    -

    Unitigs are the natural compact representation: a run of L nucleotides encodes L − k + 1 consecutive canonical kmers. The entire kmer set of a partition can be reconstructed from its unitig FASTA file.

    -
    -

    Two encoding strategies

    -

    Strategy A — global nucleotide offset

    -

    Each MPHF slot stores a single integer: the byte offset of the kmer's first nucleotide within a packed 2-bit nucleotide array that concatenates all unitigs.

    -
    evidence[slot] = global_offset  (bits: ⌈log₂ N_nuc⌉)
    -
    -

    where N_nuc is the total number of nucleotides across all unitigs in the partition.

    -

    Decoding: read k nucleotides starting at global_offset.

    -

    Strategy B — (unitig_id, rank within unitig)

    -

    Each MPHF slot stores a pair:

    -
    evidence[slot] = (unitig_id, rank)
    +

    The MPHF maps each canonical kmer to an integer slot but provides no inverse: a slot index alone cannot reconstruct the kmer. The evidence file supplies this inverse: for each MPHF slot it stores a pointer into the unitig sequence file, from which k nucleotides can be extracted.

    +

    Unitigs are the natural compact representation: a run of L nucleotides encodes L − k + 1 consecutive canonical kmers. The entire kmer set of a partition is reconstructible from its unitig binary file.

    +
    +

    Binary file formats

    +

    unitigs.bin — sequence chunks

    +

    A sequence of binary records. Each record:

    +
    [u8: seql − k]  [ceil(seql / 4) bytes: 2-bit packed nucleotides]
     
      -
    • unitig_id : index of the unitig in the partition (0-based)
    • -
    • rank : kmer index within the unitig (0 ≤ rank < n_kmers); kmer i starts at nucleotide i, so the nucleotide offset is identical numerically but the kmer-unit interpretation is the natural one
    • +
    • seql − k (0–255): nucleotide length minus k, so seql = byte[0] + k and n_kmers = byte[0] + 1.
    • +
    • Packed nucleotides: A=00, C=01, G=10, T=11, MSB-first within each byte; last byte zero-padded.
    • +
    • Byte count for packed sequence: ceil(seql / 4).
    -

    Decoding: look up the unitig at unitig_id, then read k nucleotides starting at rank.

    -
    -

    Bit-cost analysis

    -

    Define for a partition of P kmers with average kmers-per-unitig m:

    -
      -
    • total nucleotides: \(N_{nuc} = P \cdot \left(1 + \dfrac{k-1}{m}\right)\)
    • -
    • number of unitigs: \(U = P / m\)
    • -
    -

    Strategy A

    -
    \[ -b_A = \left\lceil \log_2 N_{nuc} \right\rceil = \left\lceil \log_2 P + \log_2\!\left(1 + \frac{k-1}{m}\right) \right\rceil -\]
    -

    Strategy B

    -
    \[ -b_B = \left\lceil \log_2 U \right\rceil + \left\lceil \log_2 L_{max} \right\rceil -\]
    -

    where \(L_{max}\) is the maximum unitig length (in nucleotides). In practice \(L_{max} \ll P\), so the rank field is much cheaper than the full global offset. If unitig lengths are bounded (e.g. by partition structure), the rank field width is a small constant independent of P.

    -

    Empirical bound on unitig length

    -

    Lengths and ranks are expressed in kmer units (not nucleotides): the nucleotide length is n_kmers + k − 1, so storing n_kmers instead of seq_length saves k−1 = 30 units of headroom in the same field width.

    -

    Consequence for u8 capacity:

    - - - - - - - - - - - - - - - - - - - - -
    unitmax representablemax nucleotides
    nucleotides255 nuc225 kmers
    kmers255 kmers285 nuc
    -

    Structural maximum from superkmer construction. For k=31 and m=11, the maximum number of consecutive kmers sharing the same minimiser is k − m + 1 = 21 kmers (the minimiser traverses from position k−m to 0 as the window slides). A unitig that is a single full superkmer therefore has exactly 21 kmers. This is confirmed by a bimodal distribution in empirical data: a sharp peak at 21 kmers appears in all partitions, including the anomalous partition 145. The observed maximum is ~46 kmers (unitigs spanning more than one superkmer), well within u8 range.

    -

    On Betula nana (k=31, 256 partitions), m_u ≈ 37.9 kmers/unitig on average. The rank field (kmer index within the unitig) fits in a u8 as long as no unitig exceeds 255 kmers — guaranteed by the split strategy below and amply satisfied by empirical maximums (~46 kmers observed).

    -

    Split strategy for long unitigs

    -

    For the rare cases where a unitig exceeds 255 kmers, the unitig is split into chunks of at most 255 kmers, with a k−1 nucleotide overlap at each junction — identical to the way super-kmers are delimited at partition boundaries. Each chunk is self-contained and independently decodable.

    -
    original unitig: kmer_0 … kmer_254 | kmer_255 … kmer_N
    -                                   ↑ cut here
    -
    -chunk 1: nucleotides 0 … 284        (255 kmers)
    -chunk 2: nucleotides 255 … N+k-1    (N-255+1 kmers)
    -shared:  nucleotides 255 … 284      (k-1 = 30 nucleotides, stored in both)
    +

    Unitigs with more than MAX_KMERS_PER_CHUNK = 256 k-mers are transparently split into overlapping chunks. Each chunk has at most 256 k-mers (= seql − k + 1 ≤ 256); consecutive chunks overlap by k−1 nucleotides so no kmer is lost:

    +
    chunk 1: nucleotides [0,   MAX_KMERS_PER_CHUNK + k − 2]   (256 kmers)
    +chunk 2: nucleotides [256, end]                             (remaining kmers)
    +overlap: k−1 nucleotides shared between the two chunks
     
    -

    Cost of one split: k−1 = 30 redundant nucleotides = 60 bits. This event is rare in practice (m_u ≈ 38 for B. nana, well below the 255-kmer cap). No kmer is lost: kmer i is in chunk 1 if i < 255, in chunk 2 (at rank i−255) otherwise.

    -

    Savings from u8 length fields

    -

    Because all chunks are guaranteed ≤ 255 kmers, the per-chunk length array in the binary index is a flat u8 array — 1 byte per chunk instead of 8 bytes (usize) or 4 bytes (u32). For a partition with 4 M unitigs:

    - - - - - - - - - - - - - - - - - - - - - - - - - -
    length typebytes/chunktotal (4 M chunks)
    usize (u64)832 MB
    u32416 MB
    u814 MB
    -

    Random access to chunk i is recovered at load time by a single prefix-sum pass over the u8 array, computing a u32/u64 offset array in O(n_chunks) time and O(n_chunks × 4) bytes — paid once at open time, cached for the lifetime of the partition handle.

    -

    Bit costs for Betula nana (k=31, 256 partitions, P ≈ 10.4 M, U ≈ 275 k, m_u ≈ 37.9):

    +

    unitigs.bin.idx — block-sampled offset index

    +
    magic     : 4 bytes  = "UIX3"
    +block_bits: u32 LE   — granularity parameter (0–31)
    +n_unitigs : u32 LE   — total number of chunks in unitigs.bin
    +n_kmers   : u64 LE   — total number of kmers across all chunks
    +offsets   : [u32 LE] — byte offsets into unitigs.bin, one per 2^block_bits chunks + sentinel
    +
    +

    One offset entry is stored every 2^block_bits chunks; the array is sentinel-terminated (last entry = file size). DEFAULT_BLOCK_BITS = 0 stores one offset per chunk (exact table, no scan).

    +

    evidence.bin — per-slot MPHF evidence

    +

    A flat array of u32 values, one per MPHF slot, no header:

    +
    bits [31:7] = chunk_id  (25 bits)
    +bits  [6:0] = rank      (7 bits, 0–127)
    +
    +

    File size = n_slots × 4 bytes. chunk_id is the 0-based index of the record in unitigs.bin; rank is the position of the canonical kmer within that chunk (counting only canonical kmers). Encoding: raw = (chunk_id << 7) | (rank & 0x7F). Decoding: chunk_id = raw >> 7, rank = raw & 0x7F.

    +
    +

    Building and reading the index

    +

    build_unitig_idx(path, block_bits)

    +

    Scans unitigs.bin sequentially: for each chunk at byte offset offset, if chunk_count & mask == 0 (where mask = (1 << block_bits) − 1), appends offset as u32 to block_offsets. After the scan, appends a sentinel (= total file size), then writes the .idx file. Called after the unitig file is fully written and closed.

    +

    open(), open_sequential(), open_direct_access()

    +

    UnitigFileReader has three constructors:

    +
      +
    • open(path) — smart default: if unitigs.bin.idx exists, delegates to open_direct_access; otherwise delegates to open_sequential. Prefer this in call sites that don't require one specific mode.
    • +
    • open_sequential(path) — never reads .idx. Sequential iterators only; chunk_start(i) falls back to an O(i) mmap scan rather than panicking.
    • +
    • open_direct_access(path) — requires .idx to be present. Enables O(1) or O(2^block_bits) chunk_start(i), used by verify_canonical_kmer at query time.
    • +
    +

    CanonicalKmerIter — a clonable sequential iterator returned by UnitigFileReader::iter_canonical_kmers(). It holds an Arc<Mmap> so cloning resets the cursor to the start without reopening the file. This makes it usable with par_bridge() for parallel MPHF construction without random access.

    +

    chunk_start(i) — access modes

    +

    When .idx is loaded (open_direct_access):

    +
      +
    • block_bits = 0: single array lookup, O(1).
    • +
    • block_bits > 0: lookup block, then scan ≤ 2^block_bits records, O(2^block_bits).
    • +
    +

    When .idx is absent (open_sequential): chunk_start(i) performs an O(i) sequential mmap scan from offset 0. No panic — the function degrades gracefully. This degraded path is used by find_strict() on Approx layers (sequential scan of all canonical kmers).

    +

    Decoding a kmer from slot s

    +
    let (chunk_id, rank) = evidence.decode(s);      // u32 → (chunk_id: u32, rank: u8)
    +let kmer = unitigs.raw_kmer(chunk_id, rank);    // 2-bit packed slice → left-aligned u64
    +
    +

    Two memory accesses: one 4-byte read from evidence.bin, one packed-bit extraction from unitigs.bin via the mmap. The retrieved sequence is already canonical (only canonical kmers are inserted into the De Bruijn graph).

    +
    +

    Field widths and capacity

    - - - - - - - - - - - - - - - - - - - - - -
    fieldstrategy Astrategy B
    offset / id\(\lceil\log_2(P \cdot (1 + 30/m_u))\rceil = 25\) bits\(\lceil\log_2(U)\rceil = 19\) bits
    rank8 bits (u8, fixed)
    total25 bits27 bits
    -

    Strategy A is 2 bits cheaper. Strategy B's main advantage is locality: decoding a kmer touches one unitig's cache lines rather than an arbitrary offset in a large flat array, and the rank field doubles as a direct index into the packed nucleotide sequence without pointer arithmetic.

    -
    -

    Partition-size tradeoff

    -

    The total bits/kmer for the index (sequence + evidence + MPHF) as a function of partition size is:

    -
    \[ -\text{total} = \underbrace{2\!\left(1 + \frac{k-1}{m}\right)}_{\text{sequence}} + \underbrace{\log_2 P + \log_2\!\left(1+\frac{k-1}{m}\right)}_{\text{evidence}} + \underbrace{c_{MPHF}}_{\approx 2\text{–}4} -\]
    -

    Empirical observation: m_u is set by De Bruijn graph topology, not partition count

    -

    Measured on Betula nana (k=31, m=11), summing n_kmers and sequence counts across all partition files:

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    N partitionsm_skm_ufactor m_u/m_sknuc ratio (u/sk)
    112.1341.893.45×0.273
    1612.1338.193.15×0.376
    25612.1337.903.12×0.388
    1 02412.1337.893.12×0.389
    -
      -
    • m_sk = avg kmers/super-kmer (invariant — same dataset regardless of partition scheme)
    • -
    • m_u = avg kmers/unitig = total_n_kmers / total_unitigs, summed across all partitions
    • -
    • nuc ratio = (u_symbols + 30·u_reads) / (sk_symbols + 30·sk_reads)
    • -
    -

    X-axis in both charts: partition bits (0 = 1 partition, 10 = 1024 partitions) — each step doubles the partition count.

    -
    xychart-beta
    -    title "m_u (avg kmers/unitig) vs partition bits — B. nana k=31"
    -    x-axis "partition bits" 0 --> 10
    -    y-axis "m_u" 37 --> 43
    -    line [41.89, 40.78, 39.22, 38.52, 38.19, 38.03, 37.96, 37.92, 37.90, 37.89, 37.89]
    -
    xychart-beta
    -    title "Nucleotide storage: unitigs / super-kmers (%) vs partition bits — B. nana k=31"
    -    x-axis "partition bits" 0 --> 10
    -    y-axis "%" 25 --> 42
    -    line [27.3, 29.7, 33.9, 36.3, 37.6, 38.3, 38.6, 38.7, 38.8, 38.9, 38.9]
    -

    Key observations:

    -
      -
    1. Partition boundaries have a small but non-zero effect on m_u. Going from 1 to 1024 partitions reduces m_u by 10% (41.9 → 37.9). Within the practical range 16–1024, the variation is under 1% — m_u is effectively constant.
    2. -
    3. m_u is a property of the De Bruijn graph, not the partition scheme. The dominant factor is graph branching (heterozygosity, repeats, sequencing errors).
    4. -
    5. Unitigs provide substantial compaction over super-kmers. At 256 partitions, unitigs cover the same unique kmers using 39% of the raw nucleotide content of super-kmers (3.1× compaction factor).
    6. -
    -

    Per-partition compaction ratio (sk_symbols / u_symbols)

    -

    The ratio measures how much super-kmer kmer-slots are "shared" across different super-kmer records: a ratio of 1.35 means each unique kmer (counted once in unitigs) appears in 1.35 super-kmer kmer-slots on average.

    - - - - - - - - + + - - - - - - - - - - - - - - - - + - - - - - + + - - - - - - + + + + - - - - - - + + + +
    bitsN partitionsmedian ratiomin ratiomin partitionmin u_readsrangecapacity check (B. nana, 256 partitions)
    6641.3551.0734.5 M
    71281.3521.0374.1 M
    seql − k 82561.3501.0121453.8 M0–255max n_kmers per chunk = 256 = MAX_KMERS_PER_CHUNK
    95121.3500.9981453.6 Mrank70–127observed max ~46 kmers/chunk; structural max k−m+1 = 21
    1010241.3510.9921453.6 Mchunk_id250–33 554 431avg U ≈ 275 k chunks/partition
    -

    The median stabilises at 1.35 from 64 partitions onward (stdev = 0.027 at 256 partitions). There is one persistent outlier: partition 145 (at 256-partition resolution) is consistently anomalous across all partition depths — it contains 10–14× more super-kmers and unitigs than the average partition, with a ratio near 1.0, meaning the unitig representation provides almost no kmer deduplication. This is consistent with a highly repetitive or organellar region where the dominant minimiser belongs to a sequence that appears in many reads without forming long overlapping paths in the De Bruijn graph.

    -

    Per-partition parameters at 256 partitions (B. nana):

    +

    The rank field is 7 bits (max 127) even though chunks can contain up to 256 k-mers, because rank counts only canonical kmers within the chunk, and the canonical kmer count is at most half the total.

    +
    +

    Evidence bit-cost

    +

    Strategy B (chunk_id + rank) is the implemented strategy. For B. nana (k=31, 256 partitions, P ≈ 10.4 M unique kmers/partition, U ≈ 275 k chunks/partition, m_u ≈ 37.9 kmers/chunk):

    - + + - - + + + - - + + + - - - - - - - - - - + + +
    quantityfieldtheoretical cost value
    P (unique kmers/partition, avg)≈ 10.4 Mchunk_id⌈log₂ U⌉19 bits
    U (unitigs/partition, avg)≈ 275 krank⌈log₂ m_u⌉ (≈ fixed)6 bits
    m_u≈ 37.9
    Strategy A bits/kmer⌈log₂(P·(1+30/m_u))⌉ = 25
    Strategy B bits/kmer⌈log₂(U)⌉ + 8 = 27storedaligned u3232 bits/slot
    -

    Consequence: the partition count should be as large as memory and parallelism allow. Each doubling saves 1 bit/kmer in evidence (log₂ P decreases by 1). The sequence term 2·(1 + 30/m_u) ≈ 3.6 bits/kmer is approximately constant.

    -

    Strategy B partially decouples evidence cost from P: log₂(U) = log₂(P/m_u) grows more slowly than log₂(P) by a fixed log₂(m_u) ≈ 5 bits. Strategy B's main benefit remains locality and bounded rank width, not asymptotic compression.

    -
    -

    Implementation notes

    -

    Evidence file layout (strategy B — implemented)

    -

    evidence.bin is a flat [u32; n] array with no header:

    -
    evidence.bin: n × 4 bytes, little-endian
    -  each u32:  bits [31:7] = chunk_id (25 bits)
    -             bits [6:0]  = rank     (7 bits)
    -
    -

    File size = n × 4 bytes exactly. Decoding a slot: chunk_id = raw >> 7, rank = raw & 0x7F.

    -

    The theoretical bit cost of strategy B (19 bits id + 8 bits rank = 27 bits) is not recovered: packing into aligned u32 costs 32 bits per slot. The u32 layout is chosen for simplicity and alignment — one word per slot, no bit-addressing arithmetic.

    -

    Unitig file layout

    -

    Binary packed 2-bit nucleotide file (unitigs.bin) with a companion index (unitigs.bin.idx). The index stores: magic UIDX, n_unitigs: u32, n_kmers: u64, seqls: [u8; n_unitigs] (kmer count − 1 per chunk), and packed_offsets: [u32; n_unitigs + 1] (byte offsets into unitigs.bin, sentinel-terminated). This gives O(1) random access to any unitig and the total kmer count without scanning the sequence file.

    -

    Decoding a kmer from slot s

    -
    (chunk_id, rank) = evidence.decode(s)          // u32 → (u25, u7)
    -kmer = unitigs.raw_kmer(chunk_id, rank)        // 2-bit packed slice, k nucleotides
    -
    -

    Two memory accesses: one into evidence.bin, one into unitigs.bin. The canonical kmer is the stored sequence (by construction — only canonical kmers are inserted into the De Bruijn graph).

    -

    Field widths in practice

    -

    Rank is stored in 7 bits (0–127). On B. nana (k=31, m=11), the observed maximum unitig length is ~46 kmers/chunk — well within the 127-kmer u7 capacity. The structural maximum from superkmer construction is k − m + 1 = 21 kmers per unitig; longer paths arise across multiple superkmers. u7 is sufficient.

    -

    chunk_id is stored in 25 bits (0–33 554 431). For B. nana at 256 partitions, avg U ≈ 275 k — well within the 25-bit capacity.

    -

    Forward vs reverse complement

    -

    The De Bruijn graph stores only canonical kmers. The evidence encodes the canonical orientation. Callers that need the strand of the original kmer must compare the retrieved kmer with its revcomp at query time; this is a single 64-bit comparison.

    -
    -

    Non-determinism of the unitig decomposition

    -

    The unitig extraction is not deterministic: two runs on identical input can produce a different number of unitigs with different sequences, while covering exactly the same canonical k-mer set.

    -

    Source of non-determinism

    -

    The graph nodes are stored in a hash map whose iteration order depends on the hash seed (random per run with ahash::RandomState::new()). The start_iter first pass emits every node whose can_extend_left flag is false — which includes not only true dead-end nodes but also branch points (nodes with 2 or more left neighbours, for which unique_neighbor returns None).

    -

    When a branch point is encountered before its upstream neighbours, it claims the downstream chain and those neighbours later produce length-k degenerate unitigs. When upstream neighbours are encountered first, they extend through the branch point and consume it.

    +

    The u32 layout is chosen for alignment and simplicity; no bit-addressing arithmetic is needed.

    +

    Comparison with strategy A (global nucleotide offset): ⌈log₂(P · (1 + (k−1)/m_u))⌉ = 25 bits. Strategy A is theoretically 2 bits cheaper; strategy B's advantage is locality (decoding touches one chunk's cache lines) and a bounded, constant-width rank field independent of partition size.

    +
    +

    Unitig decomposition non-determinism

    +

    The unitig extraction from GraphDeBruijn is not deterministic: two runs on identical input can produce different unitig counts and sequences while covering exactly the same canonical kmer set.

    +

    The hash map (hashbrown::HashMap with Xxh3Builder) has run-dependent iteration order. The start_iter first pass emits every node where can_extend_left is false — this includes true dead-ends and branch points (nodes with ≥2 left neighbours). When a branch point is encountered before its upstream neighbours, it claims the downstream chain and those upstream neighbours later produce length-k degenerate unitigs. When upstream neighbours appear first, they extend through the branch point.

    Example — fork topology (k = 31):

    A → B ← C
         ↓
         D
     
    -

    All four nodes are in the graph. B has two left neighbours (A and C), so can_extend_left = false; B also has one right neighbour D, so can_extend_right = true.

    +

    B has two left neighbours, so can_extend_left = false. Two valid tilings:

    - + - - + + - - + +
    iteration orderunitigs producedunitigs count
    A first, then B, CABD · CA firstABD, C 2
    B first, then A, CBD · A · CB firstBD, A, C 3
    -

    Both tilings cover the same 4 canonical k-mers.

    -

    Pure cycles (all nodes have both extensions present) are unaffected by this: they are never emitted in the first pass and each cycle produces exactly one unitig regardless of which node the second pass starts from. Only the cycle cut point (and therefore the sequence content) varies.

    -

    Consequence for MPHF construction

    -

    The MPHF is built from the k-mer set, not from the unitig sequences themselves. Because both tilings contain the same canonical k-mers, the resulting MPHF is identical. The non-determinism is benign for this use case.

    -
    -

    Open questions

    -
      -
    • Cross-partition evidence: for set operations spanning multiple partitions, strategy B allows unitig-level operations (e.g. mark entire unitigs as present/absent) rather than kmer-level, potentially reducing the operation cost by a factor of m_u.
    • -
    -
    -
    +

    Both cover the same 4 canonical kmers. Pure cycles are unaffected: all cycle nodes have both extensions present, so none are emitted in the first pass; each cycle produces exactly one unitig regardless of entry point (only the cut point varies).

    +

    This non-determinism is benign for MPHF construction: the MPHF is built from the kmer set, which is identical across tilings.

    +
    +

    Partition-size tradeoff

    +

    Measured on B. nana (k=31, m=11), summing across all partitions:

    + + + + + + + + + + + + + + + + + + + + + + + + + +
    N partitionsm_u
    141.89
    1638.19
    25637.90
    1 02437.89
    +

    m_u is set by De Bruijn graph topology (heterozygosity, repeats, sequencing errors), not partition count. The variation from 1 to 1024 partitions is under 10%; within 16–1024 it is under 1%. Unitigs provide ~3.1× nucleotide compaction over super-kmers at 256 partitions.

    +

    Evidence cost decreases by 1 bit/kmer with each doubling of partition count (via log₂ U = log₂(P/m_u)). The sequence storage term 2 · (1 + (k−1)/m_u) ≈ 3.6 bits/kmer is approximately constant.

    +
    +

    Alternative: fingerprint evidence

    +

    evidence.bin can be replaced by fingerprint.bin at index build time (--approx) or after the fact (reindex --approx). The fingerprint stores b bits per MPHF slot (the low b bits of kmer.seq_hash()); verification becomes a single bitfield comparison instead of a unitig dereference. False-positive rate per k-mer query: 1/2^b. With the Findere z parameter, z consecutive k-mers must all match, reducing the effective window FP rate to 1/2^(b·z) while skipping z−1 of every z k-mers. No .idx file is written or read in approx mode.

    +

    See Approximate evidence (Findere fingerprint) for the full design and CLI parameters.

    + + + + + + + + + + + + + + +
    + + -
    - - -
    -
    -
    -
    - - - - + + +
    +
    +
    + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/index.html b/doc/index.html index 22a67a0..3b3e969 100644 --- a/doc/index.html +++ b/doc/index.html @@ -213,6 +213,17 @@ @@ -935,6 +1052,17 @@
      +
    • + + + + Subcommands + + + + +
    • +
    • @@ -944,6 +1072,28 @@ +
    • + +
    • + + + + Parameter constraints (enforced at CLI) + + + + +
    • + +
    • + + + + Genome label constraints + + + +
    • @@ -976,12 +1126,155 @@

      obikmer

      obikmer is a Rust tool for manipulation, counting, indexing, and set operations on DNA sequences represented as kmer sets.

      +

      Subcommands

      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      SubcommandPurpose
      superkmerExtract super-kmers from a sequence file and write to stdout
      indexBuild a complete genome index (scatter → dereplicate → count → layered MPHF)
      mergeMerge multiple built indexes into one
      rebuildFilter and compact an existing index into a new single-layer index; supports ingroup/outgroup predicates on genome metadata
      queryQuery an index with sequences and annotate matches
      dumpDump all indexed k-mers as CSV (kmer + per-genome counts or presence); supports the same ingroup/outgroup filtering as rebuild
      annotateAdd or update genome metadata from a CSV file; or dump metadata as CSV
      distanceCompute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees
      unitigBuild a global de Bruijn graph across all partitions and enumerate its unitigs as FASTA; supports the same ingroup/outgroup filtering as rebuild
      estimateEstimate approximate-index parameters (z, evidence bits, FP rates) before indexing
      reindexConvert an index's evidence in-place: exact ↔ approx
      utilsMiscellaneous index utilities: --new-label NEW=OLD renames a genome label; --upgrade-index adds missing layer_meta.json to old indexes
      packPack per-column matrix files into single-file format to reduce query I/O

      Constraints

      • Target scale: individual genome datasets, tens of Gbases
      • Maximum efficiency in computation, memory, and disk usage
      • -
      • Input formats: FASTA, FASTQ, gzip, streaming stdin
      • +
      • k odd, k ∈ [11, 31], fixed at runtime; kmer fits in a u64 (2 bits/base)
      • +
      • Canonical form: min(kmer, revcomp(kmer)) reduces strand-symmetric space by half
      • +
      • Input formats for index/superkmer: FASTA (.fa, .fasta), FASTQ (.fq, .fastq), GenBank flat file (.gb, .gbk, .gbff), all optionally gzip-compressed; directories expanded recursively; streaming stdin via -
      • +
      • Input formats for query: FASTA, FASTQ, optionally gzip-compressed; streaming stdin via -
      +

      Parameter constraints (enforced at CLI)

      +

      All constraints below are checked by CommonArgs::validate() at the start of superkmer and index. Invalid values exit immediately with an error.

      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      ParameterConstraintReason
      k (--kmer-size)oddeven k allows palindromic k-mers: kmer == revcomp(kmer), breaking the canonical form invariant
      k (--kmer-size)k ∈ [11, 31]k > 31 overflows u64 at 2 bits/base; k < 11 gives insufficient specificity
      m (--minimizer-size)oddsame palindrome argument as k
      m (--minimizer-size)3 ≤ m ≤ k−1minimizer must be strictly shorter than the kmer
      z (-z, Findere, index --approx only)z ≤ k−1effective indexed kmer size is k−z+1; z ≥ k would make it ≤ 0
      +

      Genome label constraints

      +

      Genome labels are arbitrary Unicode strings with the following restrictions:

      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      CharacterForbiddenReason
      /yesfilesystem path separator
      =yes--new-label parser separator
      \0yesnull byte
      \n \r \tyesbreak CSV output
      spacesalloweduse shell quoting: --new-label 'new label=old label'
      +

      Empty labels are also rejected. Labels derived automatically from the index directory name (when --label is omitted) are not validated since they come from the filesystem and are already safe.

      Priority operations

      @@ -1038,11 +1122,12 @@

      Kmers

      A kmer is a DNA subsequence of fixed length k. Two constraints govern the choice of k:

        -
      • k ∈ [11, 31]: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word.
      • +
      • k ∈ [11, 31]: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word (u64 at 2 bits/base requires k ≤ 32; k < 11 yields insufficient specificity).
      • k is odd: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form min(kmer, revcomp(kmer)) is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.
      +

      Both constraints are enforced at CLI entry by CommonArgs::validate() in superkmer and index. Passing an invalid k exits immediately with an error message.

      Super-kmers

      -

      A super-kmer is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same canonical minimizer. The canonical minimizer of a kmer is the smallest value of min(m-mer, revcomp(m-mer)) over all m-mers within the kmer (m < k, m odd), with the constraint that non-degenerate m-mers are always preferred over degenerate ones. A degenerate m-mer is one composed of a single repeated nucleotide (all-A, all-C, all-G, or all-T); such m-mers are selected only if no non-degenerate candidate exists in the window.

      +

      A super-kmer is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides, sharing the same canonical minimizer. The canonical minimizer of a kmer is the m-mer (m < k) whose canonical hash hash_kmer(min(m-mer, revcomp(m-mer))) is smallest over all m-mers in the kmer window. The hash function is a mix64-based bijection; selection is purely hash-ordered with no degeneracy filter. A super-kmer is capped at 256 nucleotides; a longer run is split at that boundary.

      Canonical super-kmers

      A canonical super-kmer is the lexicographic minimum of a super-kmer and its reverse complement:

      canonical(super-kmer) = min(super-kmer, revcomp(super-kmer))
      diff --git a/doc/sitemap.xml.gz b/doc/sitemap.xml.gz
      index 7a87a36..53a78b5 100644
      Binary files a/doc/sitemap.xml.gz and b/doc/sitemap.xml.gz differ
      diff --git a/doc/theory/encoding.refs/index.html b/doc/theory/encoding.refs/index.html
      new file mode 100644
      index 0000000..8f60d46
      --- /dev/null
      +++ b/doc/theory/encoding.refs/index.html
      @@ -0,0 +1,1057 @@
      +
      +
      +
      +  
      +    
      +      
      +      
      +      
      +      
      +      
      +      
      +      
      +      
      +        
      +      
      +      
      +      
      +      
      +    
      +    
      +      
      +        Encoding.refs - obikmer
      +      
      +    
      +    
      +      
      +      
      +      
      +
      +
      +    
      +    
      +      
      +    
      +    
      +      
      +        
      +        
      +        
      +        
      +        
      +      
      +    
      +    
      +    
      +    
      +      
      +
      +    
      +    
      +  
      +  
      +  
      +    
      +  
      +    
      +    
      +    
      +    
      +    
      +    
      + +
      + + + + + + +
      + + +
      + +
      + + + + + + +
      +
      + + + +
      +
      +
      + + + + + +
      +
      +
      + + + +
      +
      +
      + + + +
      +
      +
      + + + +
      + +
      + + + + + + +

      Coverage: theory/encoding.md

      +

      Code couvert

      +
        +
      • obikseq/src/kmer.rs — encodage 2 bits/base, revcomp, forme canonique
      • +
      +

      Notes

      +

      Document purement théorique. Peu de risque de dérive sauf si l'encodage interne de Kmer change. +Vérifier que la table d'encodage A=00, C=01, G=10, T=11 est toujours celle du code.

      + + + + + + + + + + + + + +
      +
      + + + +
      + +
      + + + +
      +
      +
      +
      + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/theory/encoding/index.html b/doc/theory/encoding/index.html index 6adf132..d44aeb9 100644 --- a/doc/theory/encoding/index.html +++ b/doc/theory/encoding/index.html @@ -718,6 +718,34 @@ +
    • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
    • + + + + + + + + + +
    • @@ -796,6 +824,62 @@ + + + + + + +
    • + + + + + + + + Merge command + + + + + + + + +
    • + + + + + + + + + + +
    • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
    • + + + +
    @@ -1010,17 +1094,20 @@

    The Watson-Crick complement of any base is its bitwise NOT on 2 bits: complement(base) = ~base & 0b11.

    Kmer encoding

    A kmer fits in a single u64. Nucleotide 0 occupies bits 63–62, nucleotide i occupies bits 63−2i and 62−2i, and the low 64−2k bits are zero. Extraction of nucleotide i (0 ≤ i < k): (kmer >> (62 - 2*i)) & 0b11.

    -

    Reverse complement is computed via a 16-bit lookup table (65 536 entries × 2 bytes = 128 KB, fits in L2 cache) storing the reverse-complement of every 8-base chunk.

    +

    Reverse complement is computed by bit manipulation in four steps, with no lookup table:

    Algorithm — Kmer reverse complement

    procedure KmerRevcomp(kmer, k):
    -    raw ←   TABLE16[kmer & 0xFFFF]         << 48
    -          | TABLE16[(kmer >> 16) & 0xFFFF] << 32
    -          | TABLE16[(kmer >> 32) & 0xFFFF] << 16
    -          | TABLE16[(kmer >> 48) & 0xFFFF]
    -    return raw << (64 - 2*k)
    +    x ← ~kmer                                           -- complement all bases
    +    x ← swap_bytes(x)                                   -- reverse byte order
    +    x ← ((x >> 4) & 0x0F0F0F0F0F0F0F0F)
    +       | ((x & 0x0F0F0F0F0F0F0F0F) << 4)               -- swap nibbles within each byte
    +    x ← ((x >> 2) & 0x3333333333333333)
    +       | ((x & 0x3333333333333333) << 2)                -- swap 2-bit pairs within each nibble
    +    return x << (64 - 2*k)                              -- re-align to MSB
     
    +

    The three reorder passes together reverse the order of all 2-bit base codes across the 64-bit word. The bitwise NOT in the first step complements each base (A↔T, C↔G). The final left shift clears the low 64−2k padding bits.

    The canonical form is the lexicographic minimum of the kmer and its reverse complement:

    canonical(kmer) = min(kmer, revcomp(kmer))
     
    diff --git a/doc/theory/entropy.refs/index.html b/doc/theory/entropy.refs/index.html new file mode 100644 index 0000000..7373853 --- /dev/null +++ b/doc/theory/entropy.refs/index.html @@ -0,0 +1,1058 @@ + + + + + + + + + + + + + + + + + + + + + + Entropy.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    + +
    + + + + + + +

    Coverage: theory/entropy.md

    +

    Code couvert

    +
      +
    • obiskbuilder/src/entropy_table.rs — filtre Shannon sur les kmers à basse complexité
    • +
    • obiskbuilder/src/lib.rs — application du filtre lors du scatter (phase 1)
    • +
    +

    Notes

    +

    Document théorique stable. Vérifier que les paramètres theta et level_max dans le CLI +(obikmer/src/cli.rsCommonArgs) correspondent bien à ce qui est décrit.

    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/theory/entropy/index.html b/doc/theory/entropy/index.html index 20e5a37..e1f2229 100644 --- a/doc/theory/entropy/index.html +++ b/doc/theory/entropy/index.html @@ -773,6 +773,34 @@ +
  • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
  • + + + + + + + + + +
  • @@ -851,6 +879,62 @@ + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + @@ -1109,7 +1193,7 @@

    Final score

    The filter computes \(\hat{H}(ws)\) for each word size ws from 1 to ws_max and returns the minimum:

    \[\text{entropy}(kmer) = \min_{ws=1}^{ws_{\max}} \hat{H}(ws)\]
    -

    A value near 0 indicates low complexity (e.g. AAAA…); near 1 indicates high complexity. A kmer is rejected if \(\text{entropy}(kmer) \leq \theta\), where \(\theta\) is a collection parameter. The minimum across word sizes ensures that any scale of repetition is detected independently: polyA is caught at ws=1, dinucleotide repeats at ws=2, etc.

    +

    A value near 0 indicates low complexity (e.g. AAAA…); near 1 indicates high complexity. A kmer is rejected if \(\text{entropy}(kmer) < \theta\), where \(\theta\) is a collection parameter (default 0.7). The minimum across word sizes ensures that any scale of repetition is detected independently: polyA is caught at ws=1, dinucleotide repeats at ws=2, etc.

    Interpretation as an effective number of classes

    \(H_{\text{corr}}\) is a standard Shannon entropy over raw words (after unfolding the equivalence classes), so the classical perplexity interpretation holds directly: \(N_{\text{eff}} = e^{H_{\text{corr}}}\) is the number of equiprobable classes that would yield the same entropy.

    For the normalised score \(\hat{H}\), dividing by \(H_{\text{max}}\) changes the logarithm base:

    diff --git a/doc/theory/indexing.refs/index.html b/doc/theory/indexing.refs/index.html new file mode 100644 index 0000000..94cc883 --- /dev/null +++ b/doc/theory/indexing.refs/index.html @@ -0,0 +1,1058 @@ + + + + + + + + + + + + + + + + + + + + + + Indexing.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    + +
    + + + + + + +

    Coverage: theory/indexing.md

    +

    Code couvert

    +
      +
    • obikpartitionner/src/partition.rs — routage par hash de minimiseur, choix des paramètres
    • +
    • obikpartitionner/src/lib.rs — structure KmerPartition, nombre de partitions
    • +
    +

    Notes

    +

    Vérifier que la doc mentionne bien que le nombre de partitions est une puissance de 2 +(converti par partitions_to_bits dans obikmer/src/cli.rs).

    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/theory/indexing/index.html b/doc/theory/indexing/index.html index 97513e3..fa2f09b 100644 --- a/doc/theory/indexing/index.html +++ b/doc/theory/indexing/index.html @@ -718,6 +718,34 @@ +
  • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
  • + + + + + + + + + +
  • @@ -796,6 +824,62 @@ + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + diff --git a/doc/theory/minimizer.refs/index.html b/doc/theory/minimizer.refs/index.html new file mode 100644 index 0000000..45c1a01 --- /dev/null +++ b/doc/theory/minimizer.refs/index.html @@ -0,0 +1,1058 @@ + + + + + + + + + + + + + + + + + + + + + + Minimizer.refs - obikmer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    + +
    + + + + + + +

    Coverage: theory/minimizer.md

    +

    Code couvert

    +
      +
    • obiskbuilder/src/lib.rs — sélection du minimiseur par hash seedé (splitmix64 finalizer)
    • +
    • obikseq/src/superkmer.rs — forme canonique du minimiseur, fenêtre glissante
    • +
    +

    Notes

    +

    Vérifier que la fonction de hash décrite (splitmix64 finalizer avec graine) correspond +au code actuel. Vérifier aussi que la définition de « minimiseur canonique » est toujours cohérente.

    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/theory/minimizer/index.html b/doc/theory/minimizer/index.html index 0c56933..1cbf856 100644 --- a/doc/theory/minimizer/index.html +++ b/doc/theory/minimizer/index.html @@ -762,6 +762,34 @@ +
  • + + + + + + + + Evidence elimination (discussion) + + + + + + + + +
  • + + + + + + + + + +
  • @@ -840,6 +868,62 @@ + + + + + + +
  • + + + + + + + + Merge command + + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + Kmer filtering (rebuild/dump/unitig) + + + + + + + + +
  • + + + + diff --git a/docmd/implementation/rebuild_filter.md b/docmd/implementation/rebuild_filter.md new file mode 100644 index 0000000..88bf3b2 --- /dev/null +++ b/docmd/implementation/rebuild_filter.md @@ -0,0 +1,188 @@ +# Kmer filtering and ingroup/outgroup predicates + +The `rebuild`, `dump`, and `unitig` commands all share the same filtering +system. Filters can select k-mers based on per-genome quorum counts, optionally +restricted to **ingroup** and **outgroup** genome sets derived from genome +metadata. + +`rebuild` additionally accepts `--min-total-count` / `--max-total-count` filters +that operate on the sum of counts across all genomes. + +## Predicate syntax + +Each `--ingroup` and `--outgroup` flag takes a predicate of the form: + +``` +key OP value1|value2|… +``` + +| Operator | Meaning | +|----------|---------| +| `*` or `all` | wildcard — every genome matches unconditionally | +| `key=v1\|v2` | exact match — genome's `key` equals `v1` or `v2` | +| `key!=v` | negation — genome's `key` equals none of the values | +| `key~path` | path ancestry — genome's `key` is `path` or a descendant | +| `key!~path` | not a descendant | + +Multiple values separated by `|` are always OR-ed within the predicate. + +### Path matching (`~` and `!~`) + +Metadata values can represent hierarchical taxonomic paths such as +`/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`. + +- **Absolute pattern** (starts with `/`): the value must start with the pattern + at a segment boundary. + `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and + `/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`. +- **Bare segment** (no leading `/`): the value must contain the pattern as an + exact path component anywhere. + `taxon~Betula` matches any path that has `Betula` as one of its segments. + +### Missing metadata key → NA + +If a genome does not carry the queried metadata key, the predicate returns **NA**. +NA propagates through the group evaluation logic (see below), and genomes that +cannot be classified are **ignored** in all quorum counts. + +## Group semantics + +### Multiple predicates + +| Flag | Combination rule | +|------|-----------------| +| `--ingroup` (repeated) | **AND** — genome must satisfy all predicates | +| `--outgroup` (repeated) | **OR** — genome satisfies any predicate | + +### Three-value logic + +Each predicate returns `true`, `false`, or `NA` (absent key). + +- AND: `false` absorbs everything; `NA` propagates unless already `false`. +- OR: `true` absorbs everything; `NA` propagates unless already `true`. + +### Classification and priority + +For each genome: + +1. Evaluate `AND(ingroup predicates)` → `in_result` +2. Evaluate `OR(outgroup predicates)` → `out_result` +3. If `in_result = true` → **Ingroup** (ingroup wins over outgroup) +4. Else if `out_result = true` → **Outgroup** +5. Otherwise → **Uncategorized** (ignored in all quorum counts) + +### Implicit groups + +| `--ingroup` | `--outgroup` | Effective behaviour | +|-------------|--------------|---------------------| +| not set | not set | all genomes form the ingroup | +| set | not set | only ingroup quorum flags apply | +| not set | set | only outgroup quorum flags apply | +| set | set | both constraints apply simultaneously | + +## Quorum flags + +| Flag | Applies to | Meaning | +|------|-----------|---------| +| `--min-count N` | ingroup | k-mer present in at least N ingroup genomes | +| `--max-count N` | ingroup | k-mer present in at most N ingroup genomes | +| `--min-frac F` | ingroup | k-mer present in at least fraction F of ingroup genomes | +| `--max-frac F` | ingroup | k-mer present in at most fraction F of ingroup genomes | +| `--min-outgroup-count N` | outgroup | k-mer present in at least N outgroup genomes | +| `--max-outgroup-count N` | outgroup | k-mer present in at most N outgroup genomes | +| `--min-outgroup-frac F` | outgroup | k-mer present in at least fraction F of outgroup genomes | +| `--max-outgroup-frac F` | outgroup | k-mer present in at most fraction F of outgroup genomes | +| `--min-total-count N` | all genomes | sum of per-genome counts ≥ N (`rebuild` only) | +| `--max-total-count N` | all genomes | sum of per-genome counts ≤ N (`rebuild` only) | +| `--presence-threshold N` | all | per-genome count > N to be considered "present" (default 0) | + +Defaults: mins = 0 (no lower bound), max counts = group size, max fracs = 1.0 +(no upper bound). A filter with all defaults is a no-op. + +Fractions are computed over the size of the classified group, not over total +genome count. An empty group (no genome classified as ingroup/outgroup) never +triggers a filter failure. + +## Examples + +Keep k-mers specific to *Betula nana* — present in at least 2 *B. nana* genomes +and absent from every other genome in the index: + +```sh +obikmer rebuild src --output dst \ + --ingroup "species=Betula_nana" \ + --outgroup "*" \ + --min-count 2 \ + --max-outgroup-count 0 +``` + +Keep k-mers found in at least 2 *Betula nana* genomes and absent from all +other *Betula*: + +```sh +obikmer rebuild src --output dst \ + --ingroup "species=Betula_nana" \ + --outgroup "genus=Betula" \ + --min-count 2 \ + --max-outgroup-count 0 +``` + +Use taxonomic paths — keep k-mers present in ≥ 50 % of the *Betula* clade +and in fewer than 10 % of everything outside *Betulaceae*: + +```sh +obikmer rebuild src --output dst \ + --ingroup "taxon~/Betulaceae/Betula" \ + --outgroup "taxon!~/Betulaceae" \ + --min-frac 0.5 \ + --max-outgroup-frac 0.1 +``` + +Multiple outgroup predicates (OR): exclude k-mers present in *Alnus* or *Carpinus*: + +```sh +obikmer rebuild src --output dst \ + --ingroup "genus=Betula" \ + --outgroup "genus=Alnus" \ + --outgroup "genus=Carpinus" \ + --max-outgroup-count 0 +``` + +The same flags work identically for `dump` and `unitig`. To dump only k-mers +specific to *Betula nana*: + +```sh +obikmer dump myindex \ + --ingroup "species=Betula_nana" \ + --outgroup "*" \ + --min-count 1 \ + --max-outgroup-count 0 +``` + +To enumerate unitigs of the *Betula*-specific subgraph: + +```sh +obikmer unitig myindex \ + --ingroup "genus=Betula" \ + --outgroup "*" \ + --min-count 2 \ + --max-outgroup-count 0 +``` + +## Implementation + +- **`obikpartitionner::filter::GroupQuorumFilter`** — implements `KmerFilter` + using pre-computed ingroup and outgroup index vectors. The heavy logic + (predicate parsing, three-value evaluation, genome classification) happens + once before any iteration; each k-mer row evaluation is a simple index + lookup and counter. + +- **`obikmer::cmd::predicate::FilterArgs`** — shared `clap` argument group + embedded via `#[command(flatten)]` in `RebuildArgs`, `DumpArgs`, and + `UnitigArgs`. `FilterArgs::build_filters()` returns a ready-to-use filter + list. + +- **`obikpartitionner::KmerPartition::iter_partition_kmers`** — accepts + `filters: &[Box]` and applies them per-kmer before invoking + the callback. `rebuild`, `dump`, and `unitig` all go through this single + entry point. diff --git a/docmd/index.md b/docmd/index.md index 262a6c3..5e04a48 100644 --- a/docmd/index.md +++ b/docmd/index.md @@ -9,15 +9,16 @@ | `superkmer` | Extract super-kmers from a sequence file and write to stdout | | `index` | Build a complete genome index (scatter → dereplicate → count → layered MPHF) | | `merge` | Merge multiple built indexes into one | -| `rebuild` | Filter and compact an existing index into a new single-layer index | +| `rebuild` | Filter and compact an existing index into a new single-layer index; supports ingroup/outgroup predicates on genome metadata | | `query` | Query an index with sequences and annotate matches | -| `dump` | Dump all indexed kmers as CSV (kmer + per-genome counts or presence) | +| `dump` | Dump all indexed k-mers as CSV (kmer + per-genome counts or presence); supports the same ingroup/outgroup filtering as `rebuild` | | `annotate` | Add or update genome metadata from a CSV file; or dump metadata as CSV | | `distance` | Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees | -| `unitig` | Dump unitigs from a built index to stdout (debug) | +| `unitig` | Build a global de Bruijn graph across all partitions and enumerate its unitigs as FASTA; supports the same ingroup/outgroup filtering as `rebuild` | | `estimate` | Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing | | `reindex` | Convert an index's evidence in-place: exact ↔ approx | -| `utils` | Miscellaneous index utilities: `--new-label NEW=OLD` renames a genome label in-place (NEW gets OLD's identity) | +| `utils` | Miscellaneous index utilities: `--new-label NEW=OLD` renames a genome label; `--upgrade-index` adds missing `layer_meta.json` to old indexes | +| `pack` | Pack per-column matrix files into single-file format to reduce query I/O | ## Constraints diff --git a/mkdocs.yml b/mkdocs.yml index 6d9bc3a..1900320 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -49,6 +49,7 @@ nav: - PersistentCompactIntVec: implementation/persistent_compact_int_vec.md - PersistentBitVec: implementation/persistent_bit_vec.md - Merge command: implementation/merge.md + - Kmer filtering (rebuild/dump/unitig): implementation/rebuild_filter.md - Architecture: - Sequences: architecture/sequences/invariant.md - Kmer index: architecture/index_architecture.md diff --git a/src/Cargo.lock b/src/Cargo.lock index fe3f9a1..20e91ae 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -884,6 +884,7 @@ checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash", "allocator-api2", + "rayon", ] [[package]] @@ -1488,6 +1489,7 @@ dependencies = [ "hashbrown 0.14.5", "obifastwrite", "obikseq", + "rayon", "xxhash-rust", ] @@ -1525,6 +1527,7 @@ dependencies = [ "csv", "indicatif", "kodama", + "obidebruinj", "obifastwrite", "obikindex", "obikpartitionner", diff --git a/src/obidebruinj/Cargo.toml b/src/obidebruinj/Cargo.toml index 5fb8273..82e1cad 100644 --- a/src/obidebruinj/Cargo.toml +++ b/src/obidebruinj/Cargo.toml @@ -7,8 +7,12 @@ edition = "2021" obikseq = { path = "../obikseq" } obifastwrite = { path = "../obifastwrite" } ahash = "0.8" -hashbrown = "0.14" +hashbrown = { version = "0.14", features = ["rayon"] } +rayon = "1" xxhash-rust = { version = "0.8.15", features = ["xxh3", "const_xxh3"] } +[features] +test-utils = [] + [dev-dependencies] obikseq = { path = "../obikseq", features = ["test-utils"] } diff --git a/src/obidebruinj/src/debruijn.rs b/src/obidebruinj/src/debruijn.rs index a27de2a..f96958c 100644 --- a/src/obidebruinj/src/debruijn.rs +++ b/src/obidebruinj/src/debruijn.rs @@ -3,8 +3,9 @@ use hashbrown::HashMap; use obikseq::k; use obikseq::unitig::Unitig; use obikseq::{CanonicalKmer, Kmer, Sequence}; -use std::cell::Cell; +use rayon::prelude::*; use std::fmt; +use std::sync::atomic::{AtomicU8, Ordering}; use xxhash_rust::xxh3::Xxh3Builder; // ── Types ───────────────────────────────────────────────────────────────────── @@ -140,7 +141,7 @@ impl fmt::Display for Node { // ── GraphDeBruijn ───────────────────────────────────────────────────────────── pub struct GraphDeBruijn { - nodes: FastHashMap>, + nodes: FastHashMap, } impl GraphDeBruijn { @@ -158,24 +159,34 @@ impl GraphDeBruijn { /// Insert a canonical kmer into the graph. No-op if already present. pub fn push(&mut self, kmer: CanonicalKmer) { - self.nodes - .entry(kmer) - .or_insert_with(|| Cell::new(Node::default())); + self.nodes.entry(kmer).or_insert_with(|| AtomicU8::new(0)); } /// For every node, find its unique right/left canonical neighbour (if any) /// and store the nucleotide index in the Node flags. /// - /// Single pass thanks to Cell interior mutability. + /// In production builds, runs in parallel across all nodes (each entry is + /// written by exactly one thread). In test builds, runs sequentially to + /// avoid propagating thread-local k/m values to rayon worker threads. pub fn compute_degrees(&self) { - for (&kmer, cell) in &self.nodes { + #[cfg(not(any(test, feature = "test-utils")))] + self.nodes.par_iter().for_each(|(&kmer, atomic)| { let (rc, rn) = count_neighbors(kmer.right_canonical_neighbors(), &self.nodes); let (lc, ln) = count_neighbors(kmer.left_canonical_neighbors(), &self.nodes); - - let mut node = cell.get(); + let mut node = Node(atomic.load(Ordering::Relaxed)); node.set_right(rc, rn); node.set_left(lc, ln); - cell.set(node); + atomic.store(node.0, Ordering::Relaxed); + }); + + #[cfg(any(test, feature = "test-utils"))] + for (&kmer, atomic) in &self.nodes { + let (rc, rn) = count_neighbors(kmer.right_canonical_neighbors(), &self.nodes); + let (lc, ln) = count_neighbors(kmer.left_canonical_neighbors(), &self.nodes); + let mut node = Node(atomic.load(Ordering::Relaxed)); + node.set_right(rc, rn); + node.set_left(lc, ln); + atomic.store(node.0, Ordering::Relaxed); } } @@ -206,20 +217,18 @@ impl GraphDeBruijn { } pub fn is_visited(&self, kmer: &CanonicalKmer) -> Option { - self.nodes.get(kmer).map(|cell| cell.get().is_visited()) + self.nodes.get(kmer).map(|a| Node(a.load(Ordering::Relaxed)).is_visited()) } pub fn set_visited(&self, kmer: CanonicalKmer) { - if let Some(cell) = self.nodes.get(&kmer) { - let mut node = cell.get(); - node.set_visited(); - cell.set(node); + if let Some(a) = self.nodes.get(&kmer) { + a.fetch_or(0b0000_0100, Ordering::AcqRel); } } /// Returns the single right neighbor of `kmer`, if it exists. pub fn the_single_right_neighbor(&self, kmer: CanonicalKmer) -> Option { - let node = self.nodes.get(&kmer)?.get(); + let node = Node(self.nodes.get(&kmer)?.load(Ordering::Relaxed)); if !node.can_extend_right() { return None; } @@ -229,7 +238,7 @@ impl GraphDeBruijn { /// Returns the single left neighbor of `kmer`, if it exists. pub fn the_single_left_neighbor(&self, kmer: CanonicalKmer) -> Option { - let node = self.nodes.get(&kmer)?.get(); + let node = Node(self.nodes.get(&kmer)?.load(Ordering::Relaxed)); if !node.can_extend_left() { return None; } @@ -253,7 +262,7 @@ impl GraphDeBruijn { fn next_unitig_kmer(&self, kmer: Kmer) -> Option { let canonical = kmer.canonical(); - let node = self.nodes.get(&canonical)?.get(); + let node = Node(self.nodes.get(&canonical)?.load(Ordering::Relaxed)); let direct = kmer.raw() == canonical.raw(); @@ -270,8 +279,8 @@ impl GraphDeBruijn { canonical.into_kmer().push_left(node.left_nuc()).canonical() }; - let cell = self.nodes.get(&next_c)?; - let next_node = cell.get(); + let atomic = self.nodes.get(&next_c)?; + let next_node = Node(atomic.load(Ordering::Relaxed)); if next_node.is_visited() { return None; } @@ -285,9 +294,9 @@ impl GraphDeBruijn { return None; } - let mut updated = next_node; - updated.set_visited(); - cell.set(updated); + if !try_claim(atomic) { + return None; + } Some(oriented) } @@ -311,6 +320,124 @@ impl GraphDeBruijn { }) } + /// Merge `other` into `self`. + /// + /// The caller guarantees that the two graphs have **disjoint** kmer sets — + /// this is structurally true when each graph was built from a distinct + /// minimizer partition. No conflict resolution is needed: entries are moved + /// directly without checking for duplicates. + pub fn merge(&mut self, other: Self) { + self.nodes.reserve(other.nodes.len()); + self.nodes.extend(other.nodes); + } + + /// Build one unitig starting at `start`, whose node flags are `node` (pre-claim value). + /// The caller has already atomically claimed `start`; this method claims subsequent nodes. + /// + /// The first right extension is always included (mirroring the original `start_iter` + /// behaviour). Branching checks for subsequent extensions are handled inside + /// `iter_unitig_kmers` → `next_unitig_kmer`. + fn collect_from_start(&self, start: CanonicalKmer, node: Node, k: usize) -> Unitig { + let mut nucs: Vec = (0..k).map(|i| start.nucleotide(i)).collect(); + if node.can_extend_right() { + let next_c = start.into_kmer().push_right(node.right_nuc()).canonical(); + if let Some(next_a) = self.nodes.get(&next_c) { + let old = next_a.fetch_or(0b0000_0100, Ordering::AcqRel); + if old & 0b0000_0100 == 0 { + let oriented = oriented_next(start.into_kmer(), next_c); + nucs.push(oriented.nucleotide(k - 1)); + for kmer in self.iter_unitig_kmers(oriented) { + nucs.push(kmer.nucleotide(k - 1)); + } + } + } + } + Unitig::from_nucleotides(&nucs) + } + + /// Returns `true` if `start` is a unitig start node: + /// - no unique left predecessor (`!can_extend_left`), or + /// - unique left predecessor exists but cannot extend right + /// (i.e., no chain traversal from the left can reach `start`). + fn is_start(&self, start: CanonicalKmer, node: Node) -> bool { + if !node.can_extend_left() { + return true; + } + let pred = start.into_kmer().push_left(node.left_nuc()).canonical(); + self.nodes.get(&pred) + .map(|a| !Node(a.load(Ordering::Acquire)).can_extend_right()) + .unwrap_or(false) + } + + /// Call `f` once per unitig. + /// + /// Uses the extended start definition: a node is a start if it has no unique + /// left predecessor, or if its left predecessor cannot extend right (so no chain + /// traversal from the left could ever claim it). + /// + /// Two-step execution to avoid races between start claiming and chain extension: + /// 1. Claim all starts atomically (parallel). + /// 2. Build and emit chains from claimed starts (parallel). + /// + /// Parallel in production builds; sequential in test builds. + /// Must be called before [`for_each_remaining_unitig`]. + pub fn par_for_each_chain_unitig(&self, f: impl Fn(Unitig) + Sync) { + let k = k(); + + #[cfg(not(any(test, feature = "test-utils")))] + { + // Step 1 — claim all starts in parallel. + let starts: Vec = self.nodes + .par_iter() + .filter_map(|(&start, atomic)| { + let node = Node(atomic.load(Ordering::Acquire)); + if node.is_visited() || !self.is_start(start, node) { + return None; + } + let old = atomic.fetch_or(0b0000_0100, Ordering::AcqRel); + (old & 0b0000_0100 == 0).then_some(start) + }) + .collect(); + + // Step 2 — build chains in parallel. + starts.into_par_iter().for_each(|start| { + let node = Node(self.nodes[&start].load(Ordering::Acquire)); + f(self.collect_from_start(start, node, k)); + }); + } + + #[cfg(any(test, feature = "test-utils"))] + { + let starts: Vec = self.nodes + .iter() + .filter_map(|(&start, atomic)| { + let node = Node(atomic.load(Ordering::Acquire)); + if node.is_visited() || !self.is_start(start, node) { + return None; + } + let old = atomic.fetch_or(0b0000_0100, Ordering::AcqRel); + (old & 0b0000_0100 == 0).then_some(start) + }) + .collect(); + + for start in starts { + let node = Node(self.nodes[&start].load(Ordering::Acquire)); + f(self.collect_from_start(start, node, k)); + } + } + } + + /// Call `f` for each node still unvisited after [`par_for_each_chain_unitig`]. + /// Handles true cycles and rare deeply-nested junctions. Always sequential. + pub fn for_each_remaining_unitig(&self, f: impl Fn(Unitig)) { + let k = k(); + for (&kmer, atomic) in &self.nodes { + let old = atomic.fetch_or(0b0000_0100, Ordering::AcqRel); + if old & 0b0000_0100 != 0 { continue; } + f(self.collect_from_start(kmer, Node(old), k)); + } + } + pub fn len(&self) -> usize { self.nodes.len() } @@ -323,7 +450,7 @@ impl GraphDeBruijn { // --- StartIter ----------------------------------------------------------------- struct StartIter<'a> { graph: &'a GraphDeBruijn, - nodes: hashbrown::hash_map::Iter<'a, CanonicalKmer, Cell>, + nodes: hashbrown::hash_map::Iter<'a, CanonicalKmer, AtomicU8>, suspended: Vec, in_cycle_pass: bool, } @@ -364,7 +491,7 @@ impl<'a> Iterator for StartIter<'a> { }; let node = match self.graph.nodes.get(¤t) { - Some(c) => c.get(), + Some(a) => Node(a.load(Ordering::Relaxed)), None => continue, }; if node.is_visited() { @@ -426,6 +553,12 @@ impl Iterator for UnitigIter<'_> { // ── helpers ─────────────────────────────────────────────────────────────────── +/// Atomically set the visited bit. Returns `true` iff this call claimed the node. +#[inline] +fn try_claim(atomic: &AtomicU8) -> bool { + atomic.fetch_or(0b0000_0100, Ordering::AcqRel) & 0b0000_0100 == 0 +} + fn oriented_next(from: Kmer, to: CanonicalKmer) -> Kmer { if from.is_overlapping(to.into_kmer()) { to.into_kmer() @@ -439,7 +572,7 @@ fn oriented_next(from: Kmer, to: CanonicalKmer) -> Kmer { /// zero or ≥2 existing neighbours. fn count_neighbors( neighbors: [CanonicalKmer; 4], - nodes: &FastHashMap>, + nodes: &FastHashMap, ) -> (u8, Option) { let mut count = 0u8; let mut first = None; diff --git a/src/obidebruinj/src/tests/debruijn.rs b/src/obidebruinj/src/tests/debruijn.rs index 1ba2459..4169421 100644 --- a/src/obidebruinj/src/tests/debruijn.rs +++ b/src/obidebruinj/src/tests/debruijn.rs @@ -115,7 +115,7 @@ fn unitig_roundtrip_linear() { g.compute_degrees(); println!("Les kmers:"); for (kmer, v) in g.nodes.iter() { - println!("{}: {}", String::from_utf8_lossy(&kmer.to_ascii()), v.get()); + println!("{}: {}", String::from_utf8_lossy(&kmer.to_ascii()), v.load(std::sync::atomic::Ordering::Relaxed)); } println!("Les unitig:"); diff --git a/src/obikindex/src/dump.rs b/src/obikindex/src/dump.rs index 037bb58..6e8481a 100644 --- a/src/obikindex/src/dump.rs +++ b/src/obikindex/src/dump.rs @@ -2,6 +2,7 @@ use std::io::Write; use crate::error::{OKIError, OKIResult}; use crate::index::KmerIndex; +use obikpartitionner::KmerFilter; impl KmerIndex { /// Write a CSV table of all indexed kmers to `out`. @@ -14,8 +15,13 @@ impl KmerIndex { /// /// The caller must have set the global kmer length (`obikseq::set_k`) before /// calling this method. - pub fn dump(&self, out: &mut W, force_presence: bool, debug: bool) -> OKIResult<()> { - + pub fn dump( + &self, + out: &mut W, + force_presence: bool, + debug: bool, + filters: &[Box], + ) -> OKIResult<()> { let genomes = &self.meta.genomes; let use_counts = self.meta.config.with_counts && !force_presence; let n_genomes = genomes.len().max(1); @@ -36,25 +42,21 @@ impl KmerIndex { for i in 0..n { if debug { self.partition - .iter_partition_kmers_located(i, use_counts, n_genomes, |part, layer, kmer, row| { + .iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| { let seq = String::from_utf8(kmer.to_ascii()) .unwrap_or_else(|_| "?".repeat(kmer_size)); let _ = write!(out, "{part},{layer},{seq}"); - for &v in row.iter() { - let _ = write!(out, ",{v}"); - } + for &v in row.iter() { let _ = write!(out, ",{v}"); } let _ = writeln!(out); }) .map_err(OKIError::Partition)?; } else { self.partition - .iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| { + .iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| { let seq = String::from_utf8(kmer.to_ascii()) .unwrap_or_else(|_| "?".repeat(kmer_size)); let _ = write!(out, "{seq}"); - for &v in row.iter() { - let _ = write!(out, ",{v}"); - } + for &v in row.iter() { let _ = write!(out, ",{v}"); } let _ = writeln!(out); }) .map_err(OKIError::Partition)?; diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs index b5e881e..f5f20ae 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -7,6 +7,7 @@ mod index; mod merge; mod rebuild; mod reindex; +mod stats; pub use error::{OKIError, OKIResult}; pub use distance::{DistanceMetric, DistanceOutput}; @@ -14,3 +15,4 @@ pub use index::KmerIndex; pub use merge::MergeMode; pub use meta::{validate_label, GenomeInfo, IndexConfig, IndexMeta, META_FILENAME}; pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED}; +pub use stats::IndexBitsPerKmer; diff --git a/src/obikindex/src/stats.rs b/src/obikindex/src/stats.rs new file mode 100644 index 0000000..e3dffc9 --- /dev/null +++ b/src/obikindex/src/stats.rs @@ -0,0 +1,127 @@ +use std::fs; +use std::path::Path; + +use obicompactvec::LayerMeta; +use obilayeredmap::meta::PartitionMeta; +use rayon::prelude::*; + +use crate::error::OKIResult; +use crate::index::KmerIndex; + +/// Bits per kmer broken down by index component. +pub struct IndexBitsPerKmer { + /// Total distinct k-mers across all partitions and layers. + pub n_kmers: usize, + /// Number of genomes in the index. + pub n_genomes: usize, + /// Bits used by the minimal perfect hash function (`mphf.bin`). + pub mphf: f64, + /// Bits used by the evidence files (`evidence.bin`, `unitigs.bin*`, + /// `fingerprint.bin`). + pub evidence: f64, + /// Bits used by the count/presence matrices (`counts/` and `presence/`), + /// normalised by k-mers only. + pub matrix: f64, + /// `matrix` divided by the number of genomes — intrinsic encoding + /// efficiency, independent of index size. + pub matrix_per_genome: f64, + /// Sum of mphf + evidence + matrix. + pub total: f64, +} + +// ── File-size helpers ───────────────────────────────────────────────────────── + +fn file_bytes(path: &Path) -> u64 { + fs::metadata(path).map(|m| m.len()).unwrap_or(0) +} + +fn dir_bytes(dir: &Path) -> u64 { + if !dir.exists() { return 0; } + fs::read_dir(dir) + .map(|entries| { + entries + .filter_map(|e| e.ok()) + .filter_map(|e| e.metadata().ok()) + .filter(|m| m.is_file()) + .map(|m| m.len()) + .sum() + }) + .unwrap_or(0) +} + +// ── Per-layer accounting ────────────────────────────────────────────────────── + +struct LayerBytes { + n_kmers: usize, + mphf: u64, + evidence: u64, + matrix: u64, +} + +fn layer_bytes(layer_dir: &Path) -> LayerBytes { + let n_kmers = LayerMeta::load(layer_dir).map(|m| m.n).unwrap_or(0); + + let mphf = file_bytes(&layer_dir.join("mphf.bin")); + + let evidence = file_bytes(&layer_dir.join("unitigs.bin")) + + file_bytes(&layer_dir.join("unitigs.bin.idx")) + + file_bytes(&layer_dir.join("evidence.bin")) + + file_bytes(&layer_dir.join("fingerprint.bin")); + + let matrix = dir_bytes(&layer_dir.join("counts")) + + dir_bytes(&layer_dir.join("presence")); + + LayerBytes { n_kmers, mphf, evidence, matrix } +} + +// ── KmerIndex::bits_per_kmer ────────────────────────────────────────────────── + +impl KmerIndex { + /// Compute bits-per-kmer statistics for the built index. + /// + /// File sizes are read directly from disk; kmer counts come from + /// `layer_meta.json` (no need to scan the MPHF or unitig files). + /// Computation is parallelised across partitions. + pub fn bits_per_kmer(&self) -> OKIResult { + let n = self.n_partitions(); + let n_genomes = self.meta().genomes.len().max(1); + + let (n_kmers, mphf_b, evidence_b, matrix_b) = (0..n) + .into_par_iter() + .map(|i| { + let index_dir = self.partition.part_dir(i).join("index"); + if !index_dir.exists() { return (0usize, 0u64, 0u64, 0u64); } + + let n_layers = PartitionMeta::load(&index_dir) + .map(|m| m.n_layers) + .unwrap_or(0); + + (0..n_layers).fold((0usize, 0u64, 0u64, 0u64), |acc, l| { + let lb = layer_bytes(&index_dir.join(format!("layer_{l}"))); + (acc.0 + lb.n_kmers, acc.1 + lb.mphf, acc.2 + lb.evidence, acc.3 + lb.matrix) + }) + }) + .reduce(|| (0, 0, 0, 0), |a, b| (a.0 + b.0, a.1 + b.1, a.2 + b.2, a.3 + b.3)); + + if n_kmers == 0 { + return Ok(IndexBitsPerKmer { + n_kmers: 0, n_genomes, + mphf: 0.0, evidence: 0.0, + matrix: 0.0, matrix_per_genome: 0.0, total: 0.0, + }); + } + + let bpk = |bytes: u64| bytes as f64 * 8.0 / n_kmers as f64; + let matrix = bpk(matrix_b); + + Ok(IndexBitsPerKmer { + n_kmers, + n_genomes, + mphf: bpk(mphf_b), + evidence: bpk(evidence_b), + matrix, + matrix_per_genome: matrix / n_genomes as f64, + total: bpk(mphf_b + evidence_b + matrix_b), + }) + } +} diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index cb7a7a2..2dcfb91 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -12,6 +12,7 @@ obikseq = { path = "../obikseq" } obiread = { path = "../obiread" } obiskbuilder = { path = "../obiskbuilder" } obifastwrite = { path = "../obifastwrite" } +obidebruinj = { path = "../obidebruinj" } obipipeline = { path = "../obipipeline" } obikrope = { path = "../obikrope" } obikpartitionner = { path = "../obikpartitionner" } diff --git a/src/obikmer/src/cmd/dump.rs b/src/obikmer/src/cmd/dump.rs index c1d56ed..7ec5dc7 100644 --- a/src/obikmer/src/cmd/dump.rs +++ b/src/obikmer/src/cmd/dump.rs @@ -5,6 +5,8 @@ use clap::Args; use obikindex::KmerIndex; use tracing::info; +use super::predicate::FilterArgs; + #[derive(Args)] pub struct DumpArgs { /// Index directory to dump @@ -17,6 +19,9 @@ pub struct DumpArgs { /// Prepend partition and layer columns to each row #[arg(long, default_value_t = false)] pub debug: bool, + + #[command(flatten)] + pub filter: FilterArgs, } pub fn run(args: DumpArgs) { @@ -28,13 +33,15 @@ pub fn run(args: DumpArgs) { info!( "dumping {} partitions, {} genome(s)", idx.n_partitions(), - idx.meta().genomes.len() + &idx.meta().genomes.len() ); + let filters = args.filter.build_filters(&idx.meta().genomes); + let stdout = io::stdout(); let mut out = BufWriter::new(stdout.lock()); - idx.dump(&mut out, args.force_presence, args.debug).unwrap_or_else(|e| { + idx.dump(&mut out, args.force_presence, args.debug, &filters).unwrap_or_else(|e| { eprintln!("dump error: {e}"); std::process::exit(1); }); diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index 569b543..27048f8 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,5 +1,6 @@ pub mod annotate; pub mod pack; +pub(crate) mod predicate; pub mod utils; pub mod distance; pub mod dump; diff --git a/src/obikmer/src/cmd/predicate.rs b/src/obikmer/src/cmd/predicate.rs new file mode 100644 index 0000000..e568b58 --- /dev/null +++ b/src/obikmer/src/cmd/predicate.rs @@ -0,0 +1,274 @@ +use std::collections::HashMap; + +use clap::Args; +use obikindex::GenomeInfo; +use obikpartitionner::{GroupQuorumFilter, KmerFilter}; + +// ── Operator ────────────────────────────────────────────────────────────────── + +enum PredOp { Wildcard, Eq, Ne, Matches, NotMatches } + +// ── MetaPred ────────────────────────────────────────────────────────────────── + +/// A single predicate on genome metadata: `key OP val1|val2|…` +/// +/// Operators: `=` (exact), `!=` (not equal), `~` (path ancestor), `!~` (not ancestor). +/// Multiple values separated by `|` are OR'd. +pub struct MetaPred { + key: String, + op: PredOp, + values: Vec, +} + +impl MetaPred { + /// Parse a predicate string of the form `key=v1|v2`, `key!=v`, `key~path`, `key!~path`. + /// The special values `*` and `all` (case-insensitive) match every genome. + pub fn parse(s: &str) -> Result { + let t = s.trim(); + if t == "*" || t.eq_ignore_ascii_case("all") { + return Ok(Self { key: String::new(), op: PredOp::Wildcard, values: vec![] }); + } + + let (op, key, rhs) = + if let Some(pos) = s.find("!=") { + (PredOp::Ne, &s[..pos], &s[pos+2..]) + } else if let Some(pos) = s.find("!~") { + (PredOp::NotMatches, &s[..pos], &s[pos+2..]) + } else if let Some(pos) = s.find('=') { + (PredOp::Eq, &s[..pos], &s[pos+1..]) + } else if let Some(pos) = s.find('~') { + (PredOp::Matches, &s[..pos], &s[pos+1..]) + } else { + return Err(format!("no operator found in predicate: {s}")); + }; + + let key = key.trim().to_string(); + if key.is_empty() { return Err(format!("empty key in predicate: {s}")); } + + let values: Vec = rhs.split('|').map(|v| v.trim().to_string()).collect(); + if values.iter().any(|v| v.is_empty()) { + return Err(format!("empty value in predicate: {s}")); + } + + Ok(Self { key, op, values }) + } + + /// Evaluate against one genome's metadata. + /// Returns `None` when the key is absent (NA propagation). + fn eval(&self, meta: &HashMap) -> Option { + if matches!(self.op, PredOp::Wildcard) { return Some(true); } + let value = meta.get(&self.key)?; + Some(match self.op { + PredOp::Wildcard => unreachable!(), + PredOp::Eq => self.values.iter().any(|v| v == value), + PredOp::Ne => self.values.iter().all(|v| v != value), + PredOp::Matches => self.values.iter().any(|v| path_matches(value, v)), + PredOp::NotMatches => self.values.iter().all(|v| !path_matches(value, v)), + }) + } +} + +// ── Path matching ───────────────────────────────────────────────────────────── + +/// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy. +/// +/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary. +/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere. +fn path_matches(value: &str, pattern: &str) -> bool { + if pattern.starts_with('/') { + value == pattern + || (value.starts_with(pattern) + && value[pattern.len()..].starts_with('/')) + } else { + value.split('/').any(|seg| seg == pattern) + } +} + +// ── Three-value group evaluation ────────────────────────────────────────────── + +/// AND of all predicates (ingroup semantics). +/// Short-circuits on `Some(false)`; propagates `None` if no predicate returns `false`. +fn eval_and(preds: &[MetaPred], meta: &HashMap) -> Option { + let mut has_na = false; + for pred in preds { + match pred.eval(meta) { + Some(false) => return Some(false), + Some(true) => {} + None => has_na = true, + } + } + if has_na { None } else { Some(true) } +} + +/// OR of all predicates (outgroup semantics). +/// Short-circuits on `Some(true)`; propagates `None` if no predicate returns `true`. +fn eval_or(preds: &[MetaPred], meta: &HashMap) -> Option { + let mut has_na = false; + for pred in preds { + match pred.eval(meta) { + Some(true) => return Some(true), + Some(false) => {} + None => has_na = true, + } + } + if has_na { None } else { Some(false) } +} + +// ── Genome classification ───────────────────────────────────────────────────── + +enum Membership { Ingroup, Outgroup, Uncategorized } + +fn classify( + genomes: &[GenomeInfo], + ingroup: &[MetaPred], + outgroup: &[MetaPred], +) -> Vec { + genomes.iter().map(|g| { + let in_r = if ingroup.is_empty() { None } else { eval_and(ingroup, &g.meta) }; + let out_r = if outgroup.is_empty() { None } else { eval_or(outgroup, &g.meta) }; + + // Ingroup wins over outgroup. + if in_r == Some(true) { return Membership::Ingroup; } + if out_r == Some(true) { return Membership::Outgroup; } + Membership::Uncategorized + }).collect() +} + +// ── Public constructor ──────────────────────────────────────────────────────── + +/// Build a `GroupQuorumFilter` from parsed predicates and genome metadata. +/// +/// - No groups defined: `ingroup_idx` = all genomes (implicit ingroup). +/// - `ingroup` predicates only: outgroup indices are empty. +/// - `outgroup` predicates only: ingroup indices are empty. +/// - Both defined: ingroup wins on overlap; uncategorized genomes are ignored. +/// CLI args for ingroup/outgroup filtering — embeddable in any command via `#[command(flatten)]`. +#[derive(Args)] +pub struct FilterArgs { + /// Ingroup predicate (repeatable; AND). Forms: `key=v1|v2`, `key!=v`, `key~path`, `key!~path`, `*`/`all` + #[arg(long, value_name = "PRED")] + pub ingroup: Vec, + + /// Outgroup predicate (repeatable; OR). Forms: `key=v1|v2`, `key!=v`, `key~path`, `key!~path`, `*`/`all` + #[arg(long, value_name = "PRED")] + pub outgroup: Vec, + + /// Minimum number of ingroup genomes containing the k-mer + #[arg(long)] + pub min_count: Option, + + /// Maximum number of ingroup genomes containing the k-mer + #[arg(long)] + pub max_count: Option, + + /// Minimum fraction of ingroup genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub min_frac: Option, + + /// Maximum fraction of ingroup genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub max_frac: Option, + + /// Minimum number of outgroup genomes containing the k-mer + #[arg(long)] + pub min_outgroup_count: Option, + + /// Maximum number of outgroup genomes containing the k-mer + #[arg(long)] + pub max_outgroup_count: Option, + + /// Minimum fraction of outgroup genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub min_outgroup_frac: Option, + + /// Maximum fraction of outgroup genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub max_outgroup_frac: Option, + + /// Per-genome count threshold to consider a genome as "containing" the k-mer (default 0) + #[arg(long, default_value = "0")] + pub presence_threshold: u32, +} + +impl FilterArgs { + /// Parse predicates and build a filter list ready to pass to `iter_partition_kmers`. + pub fn build_filters(&self, genomes: &[GenomeInfo]) -> Vec> { + let ingroup_preds: Vec = self.ingroup.iter() + .map(|s| MetaPred::parse(s).unwrap_or_else(|e| { + eprintln!("error in --ingroup: {e}"); + std::process::exit(1); + })) + .collect(); + let outgroup_preds: Vec = self.outgroup.iter() + .map(|s| MetaPred::parse(s).unwrap_or_else(|e| { + eprintln!("error in --outgroup: {e}"); + std::process::exit(1); + })) + .collect(); + vec![Box::new(build_group_filter( + genomes, + &ingroup_preds, + &outgroup_preds, + GroupFilterParams { + threshold: self.presence_threshold, + min_count: self.min_count, + max_count: self.max_count, + min_frac: self.min_frac, + max_frac: self.max_frac, + min_outgroup_count: self.min_outgroup_count, + max_outgroup_count: self.max_outgroup_count, + min_outgroup_frac: self.min_outgroup_frac, + max_outgroup_frac: self.max_outgroup_frac, + }, + ))] + } +} + +pub struct GroupFilterParams { + pub threshold: u32, + pub min_count: Option, + pub max_count: Option, + pub min_frac: Option, + pub max_frac: Option, + pub min_outgroup_count: Option, + pub max_outgroup_count: Option, + pub min_outgroup_frac: Option, + pub max_outgroup_frac: Option, +} + +pub fn build_group_filter( + genomes: &[GenomeInfo], + ingroup_preds: &[MetaPred], + outgroup_preds: &[MetaPred], + p: GroupFilterParams, +) -> GroupQuorumFilter { + let (ingroup_idx, outgroup_idx) = if ingroup_preds.is_empty() && outgroup_preds.is_empty() { + ((0..genomes.len()).collect(), vec![]) + } else { + let members = classify(genomes, ingroup_preds, outgroup_preds); + let in_idx: Vec = members.iter().enumerate() + .filter(|(_, m)| matches!(m, Membership::Ingroup)) + .map(|(i, _)| i).collect(); + let out_idx: Vec = members.iter().enumerate() + .filter(|(_, m)| matches!(m, Membership::Outgroup)) + .map(|(i, _)| i).collect(); + (in_idx, out_idx) + }; + + let in_size = ingroup_idx.len(); + let out_size = outgroup_idx.len(); + + GroupQuorumFilter { + ingroup_idx, + outgroup_idx, + threshold: p.threshold, + min_count: p.min_count.unwrap_or(0), + max_count: p.max_count.unwrap_or(in_size), + min_frac: p.min_frac.unwrap_or(0.0), + max_frac: p.max_frac.unwrap_or(1.0), + min_outgroup_count: p.min_outgroup_count.unwrap_or(0), + max_outgroup_count: p.max_outgroup_count.unwrap_or(out_size), + min_outgroup_frac: p.min_outgroup_frac.unwrap_or(0.0), + max_outgroup_frac: p.max_outgroup_frac.unwrap_or(1.0), + } +} diff --git a/src/obikmer/src/cmd/query.rs b/src/obikmer/src/cmd/query.rs index 10662da..f5172f7 100644 --- a/src/obikmer/src/cmd/query.rs +++ b/src/obikmer/src/cmd/query.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, VecDeque}; use std::io::{self, BufWriter, Write}; use std::path::PathBuf; use std::sync::Arc; @@ -277,10 +277,18 @@ fn process_chunk( }); } - // Findere z-window filter + accumulation — no intermediate allocations. - // One `confirmed` buffer reused across all sequences. + // Sliding window minimum — one reusable buffer and one deque per batch. + // + // win_min[pos * n_genomes + g] = min count across the z-window [pos, pos+z) + // for genome g, where "not in index" counts as 0. + // + // win_min > 0 ↔ all z consecutive kmers are in the index with count > 0 + // ↔ Findere confirmation (for z=1 this degenerates to the + // simple case with no overhead). + // + // Works uniformly for count matrices and presence/absence (0/1) matrices. let max_n_kmers = batch.n_kmers.iter().map(|&n| n as usize).max().unwrap_or(0); - let mut confirmed = vec![false; max_n_kmers * n_genomes]; + let mut win_min = vec![0u32; max_n_kmers * n_genomes]; let mut accs: Vec = (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect(); @@ -289,114 +297,74 @@ fn process_chunk( .iter() .map(|&n| { let n = n as usize; - if n >= effective_z { - n - effective_z + 1 - } else { - 0 - } + if n >= effective_z { n - effective_z + 1 } else { 0 } }) .collect(); let mut cov: Vec>> = if detail { - n_kmers_out - .iter() - .map(|&n| vec![vec![0u32; n]; n_genomes]) - .collect() + n_kmers_out.iter().map(|&n| vec![vec![0u32; n]; n_genomes]).collect() } else { Vec::new() }; - let presence = force_presence || !with_counts; + let presence = force_presence || !with_counts; let threshold = presence_threshold; - let z = effective_z; + let z = effective_z; + + // Deque reused across all (seq, genome) pairs. + let mut dq: VecDeque<(usize, u32)> = VecDeque::with_capacity(z + 1); for seq_idx in 0..n_seqs { - let n = results.n_kmers_for(seq_idx); + let n = results.n_kmers_for(seq_idx); let out_n = n_kmers_out[seq_idx]; - if out_n == 0 { - continue; + if out_n == 0 { continue; } + + let mins = &mut win_min[..out_n * n_genomes]; + mins.fill(0); + + // ── Per-genome sliding window minimum ───────────────────────────────── + for g in 0..n_genomes { + dq.clear(); + for i in 0..n { + let v_i = if results.is_in_index(seq_idx, i) { + results.val(seq_idx, i, g) + } else { + 0 + }; + // Evict elements that have left the window. + while dq.front().map_or(false, |&(f, _)| f + z <= i) { + dq.pop_front(); + } + // Maintain monotone non-decreasing back→front for minimum at front. + while dq.back().map_or(false, |&(_, v)| v >= v_i) { + dq.pop_back(); + } + dq.push_back((i, v_i)); + // Window [pos, pos+z) is complete when i = pos + z - 1. + if i + 1 >= z { + let pos = i + 1 - z; + mins[pos * n_genomes + g] = dq.front().unwrap().1; + } + } } - if z <= 1 { - // No Findere — every indexed s-mer is confirmed. - let acc = &mut accs[seq_idx]; - for pos in 0..n { + // ── Accumulate ──────────────────────────────────────────────────────── + let acc = &mut accs[seq_idx]; + for pos in 0..out_n { + let any = (0..n_genomes).any(|g| mins[pos * n_genomes + g] > 0); + if !any { if !results.is_in_index(seq_idx, pos) { acc.kmer_missing += 1; - continue; - } - acc.kmer_count += 1; - for g in 0..n_genomes { - let v = results.val(seq_idx, pos, g); - if v == 0 { - continue; - } - let c = if presence { - u32::from(v >= threshold) - } else { - v - }; - acc.genome_totals[g] += c; - if detail { - cov[seq_idx][g][pos] += c; - } } + continue; } - } else { - // Build confirmed[pos * n_genomes + g] via sliding window. - let conf = &mut confirmed[..out_n * n_genomes]; - conf.fill(false); - + acc.kmer_count += 1; for g in 0..n_genomes { - let hit = - |i: usize| results.is_in_index(seq_idx, i) && results.val(seq_idx, i, g) > 0; - - let mut cnt: u32 = (0..z).filter(|&j| hit(j)).count() as u32; - if cnt == z as u32 { - conf[g] = true; - } - - for i in 1..out_n { - if hit(i - 1) { - cnt -= 1; - } - if hit(i + z - 1) { - cnt += 1; - } - if cnt == z as u32 { - conf[i * n_genomes + g] = true; - } - } - } - - let acc = &mut accs[seq_idx]; - for pos in 0..out_n { - let any = (0..n_genomes).any(|g| conf[pos * n_genomes + g]); - if !any { - if !results.is_in_index(seq_idx, pos) { - acc.kmer_missing += 1; - } - continue; - } - acc.kmer_count += 1; - for g in 0..n_genomes { - if !conf[pos * n_genomes + g] { - continue; - } - let v = results.val(seq_idx, pos, g); - if v == 0 { - continue; - } - let c = if presence { - u32::from(v >= threshold) - } else { - v - }; - acc.genome_totals[g] += c; - if detail { - cov[seq_idx][g][pos] += c; - } - } + let v = mins[pos * n_genomes + g]; + if v == 0 { continue; } + let c = if presence { u32::from(v >= threshold) } else { v }; + acc.genome_totals[g] += c; + if detail { cov[seq_idx][g][pos] += c; } } } } diff --git a/src/obikmer/src/cmd/rebuild.rs b/src/obikmer/src/cmd/rebuild.rs index 8d24d93..594d5f4 100644 --- a/src/obikmer/src/cmd/rebuild.rs +++ b/src/obikmer/src/cmd/rebuild.rs @@ -2,13 +2,12 @@ use std::path::PathBuf; use clap::Args; use obikindex::{KmerIndex, MergeMode}; -use obikpartitionner::filter::{ - KmerFilter, MaxGenomeCount, MaxGenomeFraction, MaxTotalCount, - MinGenomeCount, MinGenomeFraction, MinTotalCount, -}; +use obikpartitionner::filter::{MaxTotalCount, MinTotalCount}; use obisys::Reporter; use tracing::info; +use super::predicate::FilterArgs; + #[derive(Args)] pub struct RebuildArgs { /// Source index directory @@ -18,21 +17,8 @@ pub struct RebuildArgs { #[arg(short, long)] pub output: PathBuf, - /// Minimum fraction of genomes containing the k-mer [0.0–1.0] - #[arg(long)] - pub min_quorum_frac: Option, - - /// Maximum fraction of genomes containing the k-mer [0.0–1.0] - #[arg(long)] - pub max_quorum_frac: Option, - - /// Minimum number of genomes containing the k-mer - #[arg(long)] - pub min_quorum_count: Option, - - /// Maximum number of genomes containing the k-mer - #[arg(long)] - pub max_quorum_count: Option, + #[command(flatten)] + pub filter: FilterArgs, /// Minimum total count across all genomes (count index only) #[arg(long)] @@ -42,10 +28,6 @@ pub struct RebuildArgs { #[arg(long)] pub max_total_count: Option, - /// Per-genome count threshold for presence in quorum filters (default 0) - #[arg(long, default_value = "0")] - pub presence_threshold: u32, - /// Output as presence/absence instead of counts #[arg(long)] pub presence: bool, @@ -61,8 +43,6 @@ pub fn run(args: RebuildArgs) { std::process::exit(1); }); - - let n_genomes = src.meta().genomes.len(); let mode = if args.presence || !src.meta().config.with_counts { MergeMode::Presence } else { @@ -71,24 +51,11 @@ pub fn run(args: RebuildArgs) { info!( "rebuild: {} genome(s), mode={:?}, source={}", - n_genomes, mode, args.source.display() + &src.meta().genomes.len(), mode, args.source.display() ); - let pt = args.presence_threshold; - let mut filters: Vec> = Vec::new(); + let mut filters = args.filter.build_filters(&src.meta().genomes); - if let Some(v) = args.min_quorum_frac { - filters.push(Box::new(MinGenomeFraction { frac: v, threshold: pt })); - } - if let Some(v) = args.max_quorum_frac { - filters.push(Box::new(MaxGenomeFraction { frac: v, threshold: pt })); - } - if let Some(v) = args.min_quorum_count { - filters.push(Box::new(MinGenomeCount { count: v, threshold: pt })); - } - if let Some(v) = args.max_quorum_count { - filters.push(Box::new(MaxGenomeCount { count: v, threshold: pt })); - } if let Some(v) = args.min_total_count { filters.push(Box::new(MinTotalCount { total: v })); } diff --git a/src/obikmer/src/cmd/unitig.rs b/src/obikmer/src/cmd/unitig.rs index 82fe6bc..79f6ed2 100644 --- a/src/obikmer/src/cmd/unitig.rs +++ b/src/obikmer/src/cmd/unitig.rs @@ -1,54 +1,101 @@ use std::io::{self, BufWriter, Write}; use std::path::PathBuf; use std::sync::Mutex; +use std::sync::atomic::{AtomicUsize, Ordering}; use clap::Args; +use obidebruinj::GraphDeBruijn; use obifastwrite::write_unitig; use obikindex::KmerIndex; -use obikseq::set_k; -use obiskio::UnitigFileReader; +use obisys::{Reporter, Stage, progress_bar, spinner}; use rayon::prelude::*; use tracing::info; +use super::predicate::FilterArgs; + #[derive(Args)] pub struct UnitigArgs { - /// Index directory produced by the `index` command + /// Index directory pub index: PathBuf, + + #[command(flatten)] + pub filter: FilterArgs, } pub fn run(args: UnitigArgs) { let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| { eprintln!("error opening index: {e}"); - std::process::exit(1) + std::process::exit(1); }); - let k = idx.kmer_size(); - set_k(k); - let n = idx.n_partitions(); - info!("dumping unitigs from {n} partitions (k={k})"); + let k = idx.kmer_size(); + let n = idx.n_partitions(); + let n_genomes = idx.meta().genomes.len().max(1); + let use_counts = idx.meta().config.with_counts; - let stdout = Mutex::new(BufWriter::new(io::stdout())); + info!("unitig: building de Bruijn graph from {n} partition(s) (k={k})"); - (0..n).into_par_iter().for_each(|i| { - let path = idx.layer_unitigs_path(i, 0); - if !path.exists() { - return; - } + let filters = args.filter.build_filters(&idx.meta().genomes); + let partition = idx.partition(); + let mut rep = Reporter::new(); - // open_sequential: works with and without .idx (approx or exact index) - let reader = UnitigFileReader::open_sequential(&path).unwrap_or_else(|e| { - eprintln!("error opening unitigs (partition {i}): {e}"); - std::process::exit(1) + // ── Phase 1 : collect filtered kmers in parallel ────────────────────────── + let pb = progress_bar("unitig", n as u64, "partitions"); + let stage = Stage::start("build graph"); + let g = (0..n) + .into_par_iter() + .fold( + GraphDeBruijn::new, + |mut local_g, i| { + partition + .iter_partition_kmers(i, use_counts, n_genomes, &filters, |kmer, _row| { + local_g.push(kmer); + }) + .unwrap_or_else(|e| { + eprintln!("error reading partition {i}: {e}"); + std::process::exit(1); + }); + pb.inc(1); + local_g + }, + ) + .reduce(GraphDeBruijn::new, |mut a, b| { a.merge(b); a }); + pb.finish_and_clear(); + rep.push(stage.stop()); + + info!("unitig: {} distinct k-mers", g.len()); + + // ── Phase 2 : compute degrees (in-memory, no progress needed) ──────────── + let stage = Stage::start("compute degrees"); + g.compute_degrees(); + rep.push(stage.stop()); + + // ── Phase 3 : enumerate unitigs and write as FASTA ─────────────────────── + let pb = spinner("unitig"); + let out = Mutex::new(BufWriter::new(io::stdout())); + let j = AtomicUsize::new(0); + + let write_unitig_fn = |unitig: obikseq::unitig::Unitig| { + let idx = j.fetch_add(1, Ordering::Relaxed); + let mut w = out.lock().unwrap(); + write_unitig(&unitig, k, 0, idx, &mut *w).unwrap_or_else(|e| { + eprintln!("write error: {e}"); + std::process::exit(1); }); - - for (j, unitig) in reader.iter_unitigs() { - let mut out = stdout.lock().unwrap(); - write_unitig(&unitig, k, i, j, &mut *out).unwrap_or_else(|e| { - eprintln!("write error: {e}"); - std::process::exit(1) - }); + if idx % 10_000 == 0 { + pb.set_message(format!("{idx} unitigs written")); } - }); + }; - stdout.into_inner().unwrap().flush().expect("flush error"); + let stage = Stage::start("chain unitigs"); + g.par_for_each_chain_unitig(&write_unitig_fn); + rep.push(stage.stop()); + + let stage = Stage::start("remaining unitigs"); + g.for_each_remaining_unitig(&write_unitig_fn); + pb.finish_and_clear(); + rep.push(stage.stop()); + + out.into_inner().unwrap().flush().expect("flush error"); + rep.print(); } diff --git a/src/obikmer/src/cmd/utils.rs b/src/obikmer/src/cmd/utils.rs index c5fa3e7..5aa564a 100644 --- a/src/obikmer/src/cmd/utils.rs +++ b/src/obikmer/src/cmd/utils.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use clap::Args; -use obikindex::{validate_label, KmerIndex}; +use obikindex::{validate_label, IndexBitsPerKmer, KmerIndex}; use tracing::info; #[derive(Args)] @@ -16,6 +16,10 @@ pub struct UtilsArgs { /// Add missing layer_meta.json files to each layer (required after upgrading from old indexes) #[arg(long)] pub upgrade_index: bool, + + /// Print bits-per-kmer statistics (MPHF, evidence, matrix, total) + #[arg(long)] + pub bits_per_kmer: bool, } pub fn run(args: UtilsArgs) { @@ -31,12 +35,35 @@ pub fn run(args: UtilsArgs) { run_upgrade_index(&args.index); } + if args.bits_per_kmer { + any = true; + run_bits_per_kmer(&args.index); + } + if !any { - eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index"); + eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer"); std::process::exit(1); } } +fn run_bits_per_kmer(index_path: &PathBuf) { + let idx = KmerIndex::open(index_path).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }); + let stats: IndexBitsPerKmer = idx.bits_per_kmer().unwrap_or_else(|e| { + eprintln!("error computing bits/kmer: {e}"); + std::process::exit(1); + }); + println!("k-mers : {}", stats.n_kmers); + println!("genomes : {}", stats.n_genomes); + println!("mphf : {:6.2} bits/kmer", stats.mphf); + println!("evidence : {:6.2} bits/kmer", stats.evidence); + println!("matrix : {:6.2} bits/kmer ({:.2} bits/kmer/genome)", + stats.matrix, stats.matrix_per_genome); + println!("total : {:6.2} bits/kmer", stats.total); +} + fn run_upgrade_index(index_path: &PathBuf) { let idx = KmerIndex::open(index_path).unwrap_or_else(|e| { eprintln!("error opening index: {e}"); diff --git a/src/obikpartitionner/src/dump_layer.rs b/src/obikpartitionner/src/dump_layer.rs index 77c7cd3..adc26ac 100644 --- a/src/obikpartitionner/src/dump_layer.rs +++ b/src/obikpartitionner/src/dump_layer.rs @@ -4,6 +4,7 @@ use obiskio::{SKError, SKResult, UnitigFileReader}; use obilayeredmap::{IndexMode, MphfLayer, OLMError}; use obilayeredmap::meta::PartitionMeta; +use crate::filter::{KmerFilter, passes_all}; use crate::partition::KmerPartition; const INDEX_SUBDIR: &str = "index"; @@ -16,18 +17,21 @@ fn olm_to_sk(e: OLMError) -> SKError { } impl KmerPartition { - /// Iterate all indexed kmers in partition `part`, calling `cb(kmer, row)` for each. + /// Iterate all indexed kmers in partition `part`, calling `cb(kmer, row)` for each + /// kmer that passes every filter in `filters`. /// /// `use_counts = true` → reads count columns (u32 values per genome). /// `use_counts = false` → reads presence columns, converted to 0/1 u32. /// /// If no data matrix exists for a layer (pure set-membership, single genome), - /// a row of `n_genomes` ones is emitted for every kmer in that layer. + /// a row of `n_genomes` ones is emitted for every kmer in that layer — unless + /// the filter rejects it, in which case the whole layer is skipped. pub fn iter_partition_kmers( &self, part: usize, use_counts: bool, n_genomes: usize, + filters: &[Box], mut cb: impl FnMut(CanonicalKmer, Box<[u32]>), ) -> SKResult<()> { let index_dir = self.part_dir(part).join(INDEX_SUBDIR); @@ -44,17 +48,20 @@ impl KmerPartition { let layer_dir = index_dir.join(format!("layer_{l}")); if !layer_dir.exists() { break; } l += 1; - let mphf = MphfLayer::open(&layer_dir, &index_mode).map_err(olm_to_sk)?; + let mphf = MphfLayer::open(&layer_dir, &index_mode).map_err(olm_to_sk)?; let reader = UnitigFileReader::open_sequential(&layer_dir.join("unitigs.bin"))?; - let counts_dir = layer_dir.join("counts"); + let counts_dir = layer_dir.join("counts"); let presence_dir = layer_dir.join("presence"); if use_counts && counts_dir.exists() { let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { if let Some(slot) = mphf.find(kmer) { - cb(kmer, mat.row(slot)); + let row = mat.row(slot); + if passes_all(filters, &row, n_genomes) { + cb(kmer, row); + } } } } else if !use_counts && presence_dir.exists() { @@ -62,15 +69,20 @@ impl KmerPartition { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { if let Some(slot) = mphf.find(kmer) { let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect(); - cb(kmer, row); + if passes_all(filters, &row, n_genomes) { + cb(kmer, row); + } } } } else { - // No data matrix: pure set-membership layer, all kmers belong to every genome. + // No data matrix: implicit presence — all values are 1. + // The filter result is identical for every kmer, so evaluate once. let all_present: Box<[u32]> = vec![1u32; n_genomes].into(); - for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { - if mphf.find(kmer).is_some() { - cb(kmer, all_present.clone()); + if passes_all(filters, &all_present, n_genomes) { + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + if mphf.find(kmer).is_some() { + cb(kmer, all_present.clone()); + } } } } @@ -79,13 +91,14 @@ impl KmerPartition { Ok(()) } - /// Like [`iter_partition_kmers`] but the callback also receives the layer index, - /// enabling debug output that identifies where each kmer was stored. + /// Like [`iter_partition_kmers`] but the callback also receives `(partition, layer)` + /// indices, enabling debug output that identifies where each kmer was stored. pub fn iter_partition_kmers_located( &self, part: usize, use_counts: bool, n_genomes: usize, + filters: &[Box], mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>), ) -> SKResult<()> { let index_dir = self.part_dir(part).join(INDEX_SUBDIR); @@ -101,7 +114,7 @@ impl KmerPartition { loop { let layer_dir = index_dir.join(format!("layer_{layer}")); if !layer_dir.exists() { break; } - let mphf = MphfLayer::open(&layer_dir, &index_mode).map_err(olm_to_sk)?; + let mphf = MphfLayer::open(&layer_dir, &index_mode).map_err(olm_to_sk)?; let reader = UnitigFileReader::open_sequential(&layer_dir.join("unitigs.bin"))?; let counts_dir = layer_dir.join("counts"); @@ -111,7 +124,10 @@ impl KmerPartition { let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { if let Some(slot) = mphf.find(kmer) { - cb(part, layer, kmer, mat.row(slot)); + let row = mat.row(slot); + if passes_all(filters, &row, n_genomes) { + cb(part, layer, kmer, row); + } } } } else if !use_counts && presence_dir.exists() { @@ -119,14 +135,18 @@ impl KmerPartition { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { if let Some(slot) = mphf.find(kmer) { let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect(); - cb(part, layer, kmer, row); + if passes_all(filters, &row, n_genomes) { + cb(part, layer, kmer, row); + } } } } else { let all_present: Box<[u32]> = vec![1u32; n_genomes].into(); - for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { - if mphf.find(kmer).is_some() { - cb(part, layer, kmer, all_present.clone()); + if passes_all(filters, &all_present, n_genomes) { + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + if mphf.find(kmer).is_some() { + cb(part, layer, kmer, all_present.clone()); + } } } } diff --git a/src/obikpartitionner/src/filter.rs b/src/obikpartitionner/src/filter.rs index f35a3e8..d5c6346 100644 --- a/src/obikpartitionner/src/filter.rs +++ b/src/obikpartitionner/src/filter.rs @@ -1,4 +1,4 @@ -/// Trait for kmer row filters used in the rebuild command. +/// Trait for kmer row filters. /// /// `row` contains raw per-genome counts (or 0/1 for presence/absence data). /// `n_genomes` equals `row.len()`. @@ -6,6 +6,12 @@ pub trait KmerFilter: Send + Sync { fn passes(&self, row: &[u32], n_genomes: usize) -> bool; } +/// True when `row` passes every filter in `filters`. +/// Returns `true` if `filters` is empty. +pub fn passes_all(filters: &[Box], row: &[u32], n_genomes: usize) -> bool { + filters.iter().all(|f| f.passes(row, n_genomes)) +} + // ── Quorum filters ───────────────────────────────────────────────────────────── fn present_count(row: &[u32], threshold: u32) -> usize { @@ -85,3 +91,52 @@ impl KmerFilter for MaxTotalCount { row.iter().sum::() <= self.total } } + +// ── Group-based quorum filter ───────────────────────────────────────────────── + +/// Quorum filter operating on pre-classified genome groups. +/// +/// `ingroup_idx` / `outgroup_idx` are column indices into the per-genome row. +/// When `ingroup_idx` is empty, no ingroup quorum is checked. +/// When `outgroup_idx` is empty, no outgroup quorum is checked. +pub struct GroupQuorumFilter { + pub ingroup_idx: Vec, + pub outgroup_idx: Vec, + pub threshold: u32, + pub min_count: usize, + pub max_count: usize, + pub min_frac: f64, + pub max_frac: f64, + pub min_outgroup_count: usize, + pub max_outgroup_count: usize, + pub min_outgroup_frac: f64, + pub max_outgroup_frac: f64, +} + +impl KmerFilter for GroupQuorumFilter { + fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { + if !self.ingroup_idx.is_empty() { + let n = self.ingroup_idx.iter() + .filter(|&&i| row.get(i).copied().unwrap_or(0) > self.threshold) + .count(); + let denom = self.ingroup_idx.len(); + if n < self.min_count { return false; } + if n > self.max_count { return false; } + let frac = n as f64 / denom as f64; + if frac < self.min_frac { return false; } + if frac > self.max_frac { return false; } + } + if !self.outgroup_idx.is_empty() { + let n = self.outgroup_idx.iter() + .filter(|&&i| row.get(i).copied().unwrap_or(0) > self.threshold) + .count(); + let denom = self.outgroup_idx.len(); + if n < self.min_outgroup_count { return false; } + if n > self.max_outgroup_count { return false; } + let frac = n as f64 / denom as f64; + if frac < self.min_outgroup_frac { return false; } + if frac > self.max_outgroup_frac { return false; } + } + true + } +} diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs index 49a81fc..b152383 100644 --- a/src/obikpartitionner/src/lib.rs +++ b/src/obikpartitionner/src/lib.rs @@ -8,6 +8,6 @@ mod partition; mod query_layer; mod rebuild_layer; -pub use filter::KmerFilter; +pub use filter::{GroupQuorumFilter, KmerFilter, passes_all}; pub use merge_layer::MergeMode; pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR}; diff --git a/src/obikpartitionner/src/rebuild_layer.rs b/src/obikpartitionner/src/rebuild_layer.rs index ed6732a..714bde8 100644 --- a/src/obikpartitionner/src/rebuild_layer.rs +++ b/src/obikpartitionner/src/rebuild_layer.rs @@ -7,11 +7,12 @@ use obicompactvec::{PersistentBitMatrixBuilder, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder}; use obidebruinj::GraphDeBruijn; +use obikseq::CanonicalKmer; use obiskio::{SKError, SKResult, UnitigFileReader}; use obilayeredmap::{IndexMode, Layer, MphfLayer, OLMError}; use obilayeredmap::meta::PartitionMeta; -use crate::filter::KmerFilter; +use crate::filter::{KmerFilter, passes_all}; use crate::merge_layer::{MergeMode, SrcLayerData}; use crate::partition::KmerPartition; @@ -75,8 +76,34 @@ fn load_meta(dir: &Path) -> SKResult { } } -fn passes_all(filters: &[Box], row: &[u32], n_genomes: usize) -> bool { - filters.iter().all(|f| f.passes(row, n_genomes)) +/// Iterate all kmers in `src_index_dir` that pass `filters`, yielding `(kmer, row)`. +/// +/// Uses [`SrcLayerData`] semantics: counts take priority over presence when +/// `mode = Count`; presence (or implicit all-ones) is used for `Presence`. +fn iter_src_layers( + src_index_dir: &Path, + mode: MergeMode, + n_genomes: usize, + filters: &[Box], + mut cb: impl FnMut(CanonicalKmer, Box<[u32]>), +) -> SKResult<()> { + let src_meta = load_meta(src_index_dir)?; + for l in 0..src_meta.n_layers { + let src_layer_dir = src_index_dir.join(format!("layer_{l}")); + let unitigs_path = src_layer_dir.join("unitigs.bin"); + if !unitigs_path.exists() { continue; } + + let reader = UnitigFileReader::open_sequential(&unitigs_path)?; + let src_data = SrcLayerData::open(&src_layer_dir, mode)?; + + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + let row = src_data.lookup(kmer, n_genomes); + if passes_all(filters, &row, n_genomes) { + cb(kmer, row.into_boxed_slice()); + } + } + } + Ok(()) } // ── KmerPartition::rebuild_partition ───────────────────────────────────────── @@ -110,22 +137,9 @@ impl KmerPartition { // ── Pass 1: collect filtered kmers into de Bruijn graph ─────────────── let mut g = GraphDeBruijn::new(); - - for l in 0..src_meta.n_layers { - let src_layer_dir = src_index_dir.join(format!("layer_{l}")); - let unitigs_path = src_layer_dir.join("unitigs.bin"); - if !unitigs_path.exists() { continue; } - - let reader = UnitigFileReader::open_sequential(&unitigs_path)?; - let src_data = SrcLayerData::open(&src_layer_dir, mode)?; - - for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { - let row = src_data.lookup(kmer, n_genomes); - if passes_all(filters, &row, n_genomes) { - g.push(kmer); - } - } - } + iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, _row| { + g.push(kmer); + })?; if g.len() == 0 { return Ok(()); @@ -176,24 +190,13 @@ impl KmerPartition { }; // ── Pass 2: fill builders ───────────────────────────────────────────── - for l in 0..src_meta.n_layers { - let src_layer_dir = src_index_dir.join(format!("layer_{l}")); - let unitigs_path = src_layer_dir.join("unitigs.bin"); - if !unitigs_path.exists() { continue; } - - let reader = UnitigFileReader::open_sequential(&unitigs_path)?; - let src_data = SrcLayerData::open(&src_layer_dir, mode)?; - - for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { - let row = src_data.lookup(kmer, n_genomes); - if !passes_all(filters, &row, n_genomes) { continue; } - if let Some(slot) = dst_mphf.find(kmer) { - for (col, &value) in row.iter().enumerate() { - builders[col].set_val(slot, value); - } + iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, row| { + if let Some(slot) = dst_mphf.find(kmer) { + for (col, &value) in row.iter().enumerate() { + builders[col].set_val(slot, value); } } - } + })?; // ── Close builders, write metadata ──────────────────────────────────── for b in builders { b.close()?; }