refactor: extract obikindex crate and remove deprecated CLI commands
Extracted core indexing logic, state tracking, and metadata management into a new `obikindex` crate. Refactored the `index` and `unitig` commands to leverage the `KmerIndex` abstraction and state-driven pipeline transitions. Removed obsolete CLI subcommands (`count`, `fasta`, `longtig`, `partition`) and their associated pipeline steps. Updated FASTA writing utilities for single-line output and deterministic identifiers, and refreshed workspace dependencies.
This commit is contained in:
@@ -1,4 +1,28 @@
|
||||
## Dans OBILayeredMap
|
||||
## Chose à vérifier suite à la commande index
|
||||
|
||||
- Est-ce que CachelineEfVec est vraiment justifier dans notre cas. vu les contraintes sur la distribution des valeurs imposées par CachelineEfVec en terme d'ordre, de grandeur et de dispersion ?
|
||||
- Il semble que le count de kmer soit stocké, ce qui doit-être une possibilité pas une obligation.
|
||||
- partition.meta ne devrait plus exister
|
||||
- les spectrums globaux devrait etre identifier par génome
|
||||
- regrouper dans un sous-dossier spectrums à la racine de l'index avec un nom basé sur le génome
|
||||
- les spectrum patiels ont-ils vocation à être conserver ?
|
||||
- l'étape de déreplication dure quasiment autant de temps que le comptage mais ne laisse aucune trace de progression à l'utilisateur
|
||||
|
||||
## commandes à ajouter
|
||||
|
||||
- merge : pour construire un index à partir d'index existants
|
||||
- deux modes : count et presence/absence. count exige que tous les index mergés soient déjà en mode count. mode presence/absence par defaut. Si passage de mode count à mode presence/absence, par defaut presence = count >= 1. Possibilité de spécifier un seuil personnalisé.
|
||||
|
||||
- filter : produit un nouvel index filtré à partir d'un index existant en verifiant que les kmer présents dans le nouvel index respectent les critères de filtrage spécifiés
|
||||
- quorum de presence en fraction-(min/max) du nombre de génomes, en nombre-(min/max) de génomes, si mode count la présence peut être défini par un seuil personnalisé minimum et maximum
|
||||
|
||||
- aggregate : aggrege toutes les colonnes d'une matrice d'index en une seule colonne.
|
||||
|
||||
- query : scan un fichier de sequences et retourne pour chaque sequence quels kmer sont présents dans l'index et dans quel genomes
|
||||
|
||||
- distance : calcule la matrice de distance entre les genomes
|
||||
- proposer une option pour chaque distance à calculer
|
||||
- un possibité de récuperer la matrice des kmer communs
|
||||
- un possibité de calculer l'arbre nj
|
||||
- les matrices sont sauvegardées en CSV
|
||||
- les arbres NJ sont sauvegardés en Newick avec les longeurs de branche
|
||||
|
||||
- dump : une table csv de l'index avec les kmer et les genomes associés en mode count ou presence/absence avec une option pour forcer le mode presence/absence meme si l'index est en mode count. Par defaut, le mode count est utilisé pour les index en mode count et le mode presence/absence pour les index en mode presence/absence.
|
||||
|
||||
Generated
+62
-658
File diff suppressed because it is too large
Load Diff
+1
-1
@@ -1,5 +1,5 @@
|
||||
[workspace]
|
||||
resolver = "3"
|
||||
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys"]
|
||||
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex"]
|
||||
[profile.release]
|
||||
debug = 1
|
||||
|
||||
@@ -1,12 +1,10 @@
|
||||
//use ahash::RandomState;
|
||||
use hashbrown::HashMap;
|
||||
use obifastwrite::write_unitig;
|
||||
use obikseq::k;
|
||||
use obikseq::unitig::Unitig;
|
||||
use obikseq::{CanonicalKmer, Kmer, Sequence};
|
||||
use std::cell::Cell;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use xxhash_rust::xxh3::Xxh3Builder;
|
||||
|
||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||
@@ -293,59 +291,6 @@ impl GraphDeBruijn {
|
||||
Some(oriented)
|
||||
}
|
||||
|
||||
fn next_longtig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
|
||||
let canonical = kmer.canonical();
|
||||
let node = self.nodes.get(&canonical)?.get();
|
||||
|
||||
let direct = kmer.raw() == canonical.raw();
|
||||
|
||||
if (direct && node.n_right_neighbours() == 0) || (!direct && node.n_left_neighbours() == 0)
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let next_c: CanonicalKmer = if direct {
|
||||
if node.can_extend_right() {
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_right(node.right_nuc())
|
||||
.canonical()
|
||||
} else {
|
||||
self.iter_right_neighbors(canonical)
|
||||
.filter(|n| !self.is_visited(n).unwrap_or(true))
|
||||
.next()?
|
||||
}
|
||||
} else {
|
||||
if node.can_extend_left() {
|
||||
canonical.into_kmer().push_left(node.left_nuc()).canonical()
|
||||
} else {
|
||||
self.iter_left_neighbors(canonical)
|
||||
.filter(|n| !self.is_visited(n).unwrap_or(true))
|
||||
.next()?
|
||||
}
|
||||
};
|
||||
|
||||
let cell = self.nodes.get(&next_c)?;
|
||||
let next_node = cell.get();
|
||||
if next_node.is_visited() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let oriented = oriented_next(kmer, next_c);
|
||||
let ndirect = oriented.raw() == next_c.raw();
|
||||
|
||||
if (ndirect && next_node.n_right_neighbours() > 1)
|
||||
|| (!ndirect && next_node.n_left_neighbours() > 1)
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut updated = next_node;
|
||||
updated.set_visited();
|
||||
cell.set(updated);
|
||||
Some(oriented)
|
||||
}
|
||||
|
||||
fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
|
||||
UnitigIter {
|
||||
graph: self,
|
||||
@@ -353,13 +298,6 @@ impl GraphDeBruijn {
|
||||
}
|
||||
}
|
||||
|
||||
fn iter_longtig_kmers(&self, start: Kmer) -> LongtigIter<'_> {
|
||||
LongtigIter {
|
||||
graph: self,
|
||||
current: Some(start),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter_unitig(&self) -> impl Iterator<Item = Unitig> + '_ {
|
||||
let k = k();
|
||||
self.start_iter().map(move |(start, first_next)| {
|
||||
@@ -373,36 +311,6 @@ impl GraphDeBruijn {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn iter_longtig(&self) -> impl Iterator<Item = Unitig> + '_ {
|
||||
let k = k();
|
||||
self.start_iter().map(move |(start, first_next)| {
|
||||
let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
|
||||
if let Some(next_c) = first_next {
|
||||
for kmer in self.iter_longtig_kmers(next_c) {
|
||||
nucs.push(kmer.nucleotide(k - 1));
|
||||
}
|
||||
}
|
||||
Unitig::from_nucleotides(&nucs)
|
||||
})
|
||||
}
|
||||
|
||||
/// Write all unitigs to `out` in FASTA format.
|
||||
///
|
||||
/// Calls [`obifastwrite::write_unitig`] for each unitig produced by
|
||||
/// [`iter_unitig`]. Stops and returns the first I/O error encountered.
|
||||
pub fn write_fasta<W: io::Write>(&self, out: &mut W, unitig: bool) -> io::Result<()> {
|
||||
if unitig {
|
||||
for unitig in self.iter_unitig() {
|
||||
write_unitig(&unitig, k(), out)?;
|
||||
}
|
||||
} else {
|
||||
for unitig in self.iter_longtig() {
|
||||
write_unitig(&unitig, k(), out)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.nodes.len()
|
||||
}
|
||||
@@ -516,23 +424,6 @@ impl Iterator for UnitigIter<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
// ── UnitigIter ────────────────────────────────────────────────────────────────
|
||||
|
||||
struct LongtigIter<'a> {
|
||||
graph: &'a GraphDeBruijn,
|
||||
current: Option<Kmer>,
|
||||
}
|
||||
|
||||
impl Iterator for LongtigIter<'_> {
|
||||
type Item = Kmer;
|
||||
|
||||
fn next(&mut self) -> Option<Kmer> {
|
||||
let current = self.current?;
|
||||
self.current = self.graph.next_longtig_kmer(current);
|
||||
Some(current)
|
||||
}
|
||||
}
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
fn oriented_next(from: Kmer, to: CanonicalKmer) -> Kmer {
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
use std::fmt;
|
||||
use std::io::{self, Write};
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
pub(crate) enum JsonVal<'a> {
|
||||
/// A JSON value that is either a number or a quoted string.
|
||||
pub enum JsonVal<'a> {
|
||||
/// Integer value, serialised without quotes.
|
||||
Num(u64),
|
||||
/// String value, serialised with double quotes.
|
||||
Str(&'a str),
|
||||
}
|
||||
|
||||
@@ -16,11 +18,8 @@ impl fmt::Display for JsonVal<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn seq_id(ascii: &[u8]) -> String {
|
||||
format!("{:016X}", xxh64(ascii, 0))
|
||||
}
|
||||
|
||||
pub(crate) fn annotation<W: Write>(
|
||||
/// Write a JSON object `{"k1":v1,"k2":v2,...}` to `writer`.
|
||||
pub fn annotation<W: Write>(
|
||||
writer: &mut W,
|
||||
fields: &[(&str, JsonVal<'_>)],
|
||||
) -> io::Result<()> {
|
||||
@@ -34,10 +33,29 @@ pub(crate) fn annotation<W: Write>(
|
||||
write!(writer, "}}")
|
||||
}
|
||||
|
||||
pub(crate) fn write_sequence<W: Write>(writer: &mut W, seq: &[u8], width: usize) -> io::Result<()> {
|
||||
for chunk in seq.chunks(width) {
|
||||
// SAFETY: seq is valid UTF-8; any contiguous slice of ASCII bytes is too
|
||||
writeln!(writer, "{}", unsafe { std::str::from_utf8_unchecked(chunk) })?;
|
||||
/// xxHash-64 of `ascii`, formatted as 16 uppercase hex digits.
|
||||
pub fn seq_id(ascii: &[u8]) -> String {
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
format!("{:016X}", xxh64(ascii, 0))
|
||||
}
|
||||
Ok(())
|
||||
|
||||
/// Write `seq` as one line of ASCII DNA, followed by a newline.
|
||||
pub fn write_sequence<W: Write>(writer: &mut W, seq: &[u8]) -> io::Result<()> {
|
||||
// SAFETY: seq is valid ASCII DNA (A/C/G/T).
|
||||
writeln!(writer, "{}", unsafe { std::str::from_utf8_unchecked(seq) })
|
||||
}
|
||||
|
||||
/// Core FASTA record writer.
|
||||
///
|
||||
/// Writes `>{id} {annotation}\n{sequence}\n`.
|
||||
pub fn write_record<W: Write>(
|
||||
seq: &[u8],
|
||||
id: &str,
|
||||
fields: &[(&str, JsonVal<'_>)],
|
||||
out: &mut W,
|
||||
) -> io::Result<()> {
|
||||
write!(out, ">{id} ")?;
|
||||
annotation(out, fields)?;
|
||||
writeln!(out)?;
|
||||
write_sequence(out, seq)
|
||||
}
|
||||
|
||||
+89
-116
@@ -1,32 +1,20 @@
|
||||
//! FASTA serialisation of [`SuperKmer`] values.
|
||||
//! FASTA serialisation for obikmer sequence types.
|
||||
//!
|
||||
//! Two functions cover the two phases of the scatter pipeline:
|
||||
//! Three public functions cover the main output cases:
|
||||
//!
|
||||
//! - [`write_scatter`]: scatter phase (before routing). The header annotation
|
||||
//! contains the minimizer sequence decoded from [`SuperKmer::minimizer_pos`].
|
||||
//! - [`write_scatter`]: super-kmers in scatter phase (minimizer annotation)
|
||||
//! - [`write_count`]: super-kmers in count phase (occurrence count annotation)
|
||||
//! - [`write_unitig`]: unitigs from the layered index (partition + index annotation)
|
||||
//!
|
||||
//! - [`write_count`]: count phase (after deduplication). The header annotation
|
||||
//! contains the occurrence count from [`SuperKmer::count`].
|
||||
//!
|
||||
//! Both functions write standard OBITools-compatible FASTA:
|
||||
//! All produce OBITools-compatible FASTA:
|
||||
//!
|
||||
//! ```text
|
||||
//! >ID {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":42,"minimizer":"CGTGCTAGATC"}
|
||||
//! GCTAGCATGCTAGCTGTAGCTGTGAGTGCTG
|
||||
//! >ID {"key":value,...}
|
||||
//! SEQUENCE
|
||||
//! ```
|
||||
//!
|
||||
//! The record identifier is the xxHash-64 of the ASCII sequence, formatted as
|
||||
//! a 16-digit uppercase hexadecimal string. xxHash-64 is collision-resistant
|
||||
//! enough for debugging identifiers (collision probability < 1e-9 for billions
|
||||
//! of distinct super-kmers).
|
||||
//!
|
||||
//! # Phase contract
|
||||
//!
|
||||
//! `write_scatter` reads [`SuperKmer::minimizer_pos`], which is only valid
|
||||
//! **before** [`SuperKmer::init_count`] is called. `write_count` reads
|
||||
//! [`SuperKmer::count`], which is only meaningful **after** `init_count`.
|
||||
//! Mixing the two functions in the wrong phase produces silently wrong output;
|
||||
//! this is enforced by pipeline structure, not by the type system.
|
||||
//! The lower-level primitive [`write_record`] and the [`JsonVal`] type are also
|
||||
//! public for callers that need custom annotations.
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
@@ -35,22 +23,15 @@ mod fasta;
|
||||
use std::io::{self, Write};
|
||||
|
||||
use obikseq::{Minimizer, SuperKmer, Unitig};
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
pub use fasta::{JsonVal, annotation, seq_id, write_record};
|
||||
|
||||
// ── public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Write one super-kmer in FASTA format — **scatter phase**.
|
||||
///
|
||||
/// The `minimizer` field in the JSON annotation contains the ASCII sequence of
|
||||
/// the minimizer, decoded from [`SuperKmer::minimizer_pos`] (scatter-phase
|
||||
/// value of the payload field).
|
||||
///
|
||||
/// # Parameters
|
||||
/// - `sk`: the super-kmer to serialise (must be in scatter phase)
|
||||
/// - `out`: destination writer
|
||||
/// - `k`: k-mer size used to build `sk`
|
||||
/// - `m`: minimizer size
|
||||
/// - `partition`: partition index computed from the minimizer hash
|
||||
/// ID is the xxHash-64 of the sequence. JSON annotation includes
|
||||
/// `seq_length`, `kmer_size`, `minimizer_size`, `partition`, `minimizer`.
|
||||
pub fn write_scatter<W: Write>(
|
||||
sk: &SuperKmer,
|
||||
out: &mut W,
|
||||
@@ -61,37 +42,26 @@ pub fn write_scatter<W: Write>(
|
||||
) -> io::Result<()> {
|
||||
let ascii = sk.to_ascii();
|
||||
let id = seq_id(&ascii);
|
||||
let seq_len = ascii.len();
|
||||
let min_seq = minimizer.to_ascii();
|
||||
|
||||
writeln!(
|
||||
let min_str = unsafe { std::str::from_utf8_unchecked(&min_seq) };
|
||||
write_record(
|
||||
&ascii,
|
||||
&id,
|
||||
&[
|
||||
("seq_length", JsonVal::Num(ascii.len() as u64)),
|
||||
("kmer_size", JsonVal::Num(k as u64)),
|
||||
("minimizer_size",JsonVal::Num(m as u64)),
|
||||
("partition", JsonVal::Num(partition as u64)),
|
||||
("minimizer", JsonVal::Str(min_str)),
|
||||
],
|
||||
out,
|
||||
">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
|
||||
\"minimizer_size\":{m},\"partition\":{partition},\
|
||||
\"minimizer\":\"{min}\"}}",
|
||||
id = id,
|
||||
seq_len = seq_len,
|
||||
k = k,
|
||||
m = m,
|
||||
partition = partition,
|
||||
min = unsafe { std::str::from_utf8_unchecked(&min_seq) },
|
||||
)?;
|
||||
out.write_all(&ascii)?;
|
||||
out.write_all(b"\n")
|
||||
)
|
||||
}
|
||||
|
||||
/// Write one super-kmer in FASTA format — **count phase**.
|
||||
///
|
||||
/// The `count` field in the JSON annotation contains the occurrence count from
|
||||
/// [`SuperKmer::count`] (count-phase value of the payload field).
|
||||
///
|
||||
/// # Parameters
|
||||
/// - `sk`: the super-kmer to serialise (must be in count phase, i.e. after
|
||||
/// [`SuperKmer::init_count`] has been called)
|
||||
/// - `out`: destination writer
|
||||
/// - `k`: k-mer size
|
||||
/// - `m`: minimizer size
|
||||
/// - `partition`: partition index
|
||||
/// ID is the xxHash-64 of the sequence. JSON annotation includes
|
||||
/// `seq_length`, `kmer_size`, `minimizer_size`, `partition`, `count`.
|
||||
pub fn write_count<W: Write>(
|
||||
sk: &SuperKmer,
|
||||
out: &mut W,
|
||||
@@ -101,52 +71,47 @@ pub fn write_count<W: Write>(
|
||||
) -> io::Result<()> {
|
||||
let ascii = sk.to_ascii();
|
||||
let id = seq_id(&ascii);
|
||||
let seq_len = ascii.len();
|
||||
let count = sk.count();
|
||||
|
||||
writeln!(
|
||||
write_record(
|
||||
&ascii,
|
||||
&id,
|
||||
&[
|
||||
("seq_length", JsonVal::Num(ascii.len() as u64)),
|
||||
("kmer_size", JsonVal::Num(k as u64)),
|
||||
("minimizer_size",JsonVal::Num(m as u64)),
|
||||
("partition", JsonVal::Num(partition as u64)),
|
||||
("count", JsonVal::Num(sk.count() as u64)),
|
||||
],
|
||||
out,
|
||||
">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
|
||||
\"minimizer_size\":{m},\"partition\":{partition},\
|
||||
\"count\":{count}}}",
|
||||
id = id,
|
||||
seq_len = seq_len,
|
||||
k = k,
|
||||
m = m,
|
||||
partition = partition,
|
||||
count = count,
|
||||
)?;
|
||||
out.write_all(&ascii)?;
|
||||
out.write_all(b"\n")
|
||||
)
|
||||
}
|
||||
|
||||
/// Write one unitig in FASTA format.
|
||||
///
|
||||
/// Header annotation (JSON):
|
||||
/// ```text
|
||||
/// >HASH {"seq_length":<seql>,"kmer_size":<k>,"n_kmers":<seql-k+1>}
|
||||
/// ```
|
||||
///
|
||||
/// `HASH` is the xxHash-64 of the ASCII sequence (16 uppercase hex digits).
|
||||
/// `n_kmers` is the number of distinct k-mers covered by this unitig.
|
||||
pub fn write_unitig<W: Write>(unitig: &Unitig, k: usize, out: &mut W) -> io::Result<()> {
|
||||
/// ID is `part_PPPPP_unitig_IIIIII` where `P` is the partition index and `I`
|
||||
/// is the unitig index within that partition. JSON annotation includes
|
||||
/// `seq_length`, `kmer_size`, `n_kmers`, `partition`, `unitig_index`.
|
||||
pub fn write_unitig<W: Write>(
|
||||
unitig: &Unitig,
|
||||
k: usize,
|
||||
partition: usize,
|
||||
index: usize,
|
||||
out: &mut W,
|
||||
) -> io::Result<()> {
|
||||
let ascii = unitig.to_ascii();
|
||||
let id = seq_id(&ascii);
|
||||
let seql = unitig.seql();
|
||||
let n_kmers = seql - k + 1;
|
||||
writeln!(
|
||||
let id = format!("part_{partition:05}_unitig_{index:06}");
|
||||
write_record(
|
||||
&ascii,
|
||||
&id,
|
||||
&[
|
||||
("seq_length", JsonVal::Num(seql as u64)),
|
||||
("kmer_size", JsonVal::Num(k as u64)),
|
||||
("n_kmers", JsonVal::Num((seql - k + 1) as u64)),
|
||||
("partition", JsonVal::Num(partition as u64)),
|
||||
("unitig_index", JsonVal::Num(index as u64)),
|
||||
],
|
||||
out,
|
||||
">{id} {{\"seq_length\":{seql},\"kmer_size\":{k},\"n_kmers\":{n_kmers}}}",
|
||||
)?;
|
||||
out.write_all(&ascii)?;
|
||||
out.write_all(b"\n")
|
||||
}
|
||||
|
||||
// ── internal helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
/// xxHash-64 of the ASCII sequence, formatted as 16 uppercase hex digits.
|
||||
fn seq_id(ascii: &[u8]) -> String {
|
||||
format!("{:016X}", xxh64(ascii, 0))
|
||||
)
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
@@ -178,9 +143,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn scatter_minimizer_decoded_from_hash() {
|
||||
// "ACG" right-aligned: A=00, C=01, G=10 → 0b000110 = 6
|
||||
// Left-aligned for m=3: shift by 64 − 2·3 = 58.
|
||||
// set_m(3) so that Minimizer::to_ascii() decodes exactly 3 bases.
|
||||
obikseq::params::set_m(3);
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let minimizer = Minimizer::from_raw_unchecked(6u64 << (64 - 2 * 3));
|
||||
@@ -230,13 +192,34 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn count_sequence_line_correct() {
|
||||
// TTTTACGT canonicalises to ACGTAAAA (revcomp is ACGTAAAA < TTTTACGT)
|
||||
let sk = make(b"TTTTACGT");
|
||||
let out = capture(|w| write_count(&sk, w, 4, 2, 0));
|
||||
let lines: Vec<&str> = out.lines().collect();
|
||||
assert_eq!(lines[1], "ACGTAAAA");
|
||||
}
|
||||
|
||||
// ── write_unitig ──────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn unitig_id_format() {
|
||||
obikseq::params::set_k(4);
|
||||
let unitig = obikseq::packed_seq::PackedSeq::from_ascii(b"ACGTACGT");
|
||||
let out = capture(|w| write_unitig(&unitig, 4, 3, 17, w));
|
||||
let id = out.lines().next().unwrap();
|
||||
assert!(id.starts_with(">part_00003_unitig_000017"), "got: {id}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unitig_annotation_fields() {
|
||||
obikseq::params::set_k(4);
|
||||
let unitig = obikseq::packed_seq::PackedSeq::from_ascii(b"ACGTACGT");
|
||||
let out = capture(|w| write_unitig(&unitig, 4, 2, 5, w));
|
||||
assert!(out.contains("\"partition\":2"));
|
||||
assert!(out.contains("\"unitig_index\":5"));
|
||||
assert!(out.contains("\"n_kmers\":5"));
|
||||
assert!(out.contains("\"kmer_size\":4"));
|
||||
}
|
||||
|
||||
// ── ID stability ──────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
@@ -260,7 +243,7 @@ mod tests {
|
||||
.next()
|
||||
.unwrap()[1..]
|
||||
.to_string();
|
||||
assert_eq!(id1, id2, "same sequence must produce same ID");
|
||||
assert_eq!(id1, id2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -269,21 +252,11 @@ mod tests {
|
||||
let sk2 = make(b"TTTTTTTT");
|
||||
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
|
||||
.lines()
|
||||
.next()
|
||||
.unwrap()
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.unwrap()[1..]
|
||||
.to_string();
|
||||
.lines().next().unwrap()
|
||||
.split_whitespace().next().unwrap()[1..].to_string();
|
||||
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
|
||||
.lines()
|
||||
.next()
|
||||
.unwrap()
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.unwrap()[1..]
|
||||
.to_string();
|
||||
.lines().next().unwrap()
|
||||
.split_whitespace().next().unwrap()[1..].to_string();
|
||||
assert_ne!(id1, id2);
|
||||
}
|
||||
|
||||
@@ -291,7 +264,7 @@ mod tests {
|
||||
fn id_is_16_hex_digits() {
|
||||
let sk = make(b"ACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)));
|
||||
let id = &out.lines().next().unwrap()[1..17]; // skip '>'
|
||||
let id = &out.lines().next().unwrap()[1..17];
|
||||
assert_eq!(id.len(), 16);
|
||||
assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
|
||||
}
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
[package]
|
||||
name = "obikindex"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
obikpartitionner = { path = "../obikpartitionner" }
|
||||
obikseq = { path = "../obikseq" }
|
||||
obisys = { path = "../obisys" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
obidebruinj = { path = "../obidebruinj" }
|
||||
obilayeredmap = { path = "../obilayeredmap" }
|
||||
obicompactvec = { path = "../obicompactvec" }
|
||||
cacheline-ef = "1.1"
|
||||
epserde = "0.8"
|
||||
ptr_hash = "1.1"
|
||||
rayon = "1"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
indicatif = "0.17"
|
||||
tracing = "0.1.44"
|
||||
@@ -0,0 +1,53 @@
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
|
||||
use obiskio::SKError;
|
||||
use obilayeredmap::OLMError;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum OKIError {
|
||||
Io(io::Error),
|
||||
Json(serde_json::Error),
|
||||
Partition(SKError),
|
||||
Layer(OLMError),
|
||||
}
|
||||
|
||||
pub type OKIResult<T> = Result<T, OKIError>;
|
||||
|
||||
impl fmt::Display for OKIError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
OKIError::Io(e) => write!(f, "I/O error: {e}"),
|
||||
OKIError::Json(e) => write!(f, "JSON error: {e}"),
|
||||
OKIError::Partition(e) => write!(f, "partition error: {e}"),
|
||||
OKIError::Layer(e) => write!(f, "layer error: {e}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for OKIError {
|
||||
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
|
||||
match self {
|
||||
OKIError::Io(e) => Some(e),
|
||||
OKIError::Json(e) => Some(e),
|
||||
OKIError::Partition(e) => Some(e),
|
||||
OKIError::Layer(e) => Some(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<io::Error> for OKIError {
|
||||
fn from(e: io::Error) -> Self { OKIError::Io(e) }
|
||||
}
|
||||
|
||||
impl From<serde_json::Error> for OKIError {
|
||||
fn from(e: serde_json::Error) -> Self { OKIError::Json(e) }
|
||||
}
|
||||
|
||||
impl From<SKError> for OKIError {
|
||||
fn from(e: SKError) -> Self { OKIError::Partition(e) }
|
||||
}
|
||||
|
||||
impl From<OLMError> for OKIError {
|
||||
fn from(e: OLMError) -> Self { OKIError::Layer(e) }
|
||||
}
|
||||
@@ -0,0 +1,301 @@
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use cacheline_ef::{CachelineEf, CachelineEfVec};
|
||||
use epserde::prelude::*;
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
|
||||
use obidebruinj::GraphDeBruijn;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obilayeredmap::layer::Layer;
|
||||
use obiskio::{SKFileMeta, SKFileReader};
|
||||
use obisys::{Reporter, Stage};
|
||||
use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
use crate::error::{OKIError, OKIResult};
|
||||
use crate::meta::{IndexConfig, IndexMeta};
|
||||
use crate::state::{IndexState, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
||||
|
||||
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||
|
||||
pub struct KmerIndex {
|
||||
root_path: PathBuf,
|
||||
meta: IndexMeta,
|
||||
partition: KmerPartition,
|
||||
}
|
||||
|
||||
impl KmerIndex {
|
||||
/// Create a new index at `path`.
|
||||
///
|
||||
/// If `genome_label` is `Some`, it is stored immediately.
|
||||
/// If `None`, the label will be derived from the first scatter input path
|
||||
/// when `mark_scattered` is called.
|
||||
pub fn create<P: AsRef<Path>>(
|
||||
path: P,
|
||||
config: IndexConfig,
|
||||
genome_label: Option<String>,
|
||||
force: bool,
|
||||
) -> OKIResult<Self> {
|
||||
let root_path = path.as_ref().to_owned();
|
||||
let partition = KmerPartition::create(
|
||||
&root_path,
|
||||
config.n_bits,
|
||||
config.kmer_size,
|
||||
config.minimizer_size,
|
||||
force,
|
||||
)?;
|
||||
let mut meta = IndexMeta::new(config);
|
||||
if let Some(label) = genome_label {
|
||||
meta.genomes.push(label);
|
||||
}
|
||||
meta.write(&root_path)?;
|
||||
Ok(Self { root_path, meta, partition })
|
||||
}
|
||||
|
||||
pub fn open<P: AsRef<Path>>(path: P) -> OKIResult<Self> {
|
||||
let root_path = path.as_ref().to_owned();
|
||||
let meta = IndexMeta::read(&root_path).map_err(OKIError::Io)?;
|
||||
let partition = KmerPartition::open(&root_path)?;
|
||||
Ok(Self { root_path, meta, partition })
|
||||
}
|
||||
|
||||
/// Return `true` if `path` contains an `index.meta` file.
|
||||
pub fn exists<P: AsRef<Path>>(path: P) -> bool {
|
||||
IndexMeta::exists(path.as_ref())
|
||||
}
|
||||
|
||||
/// Current construction state, as reported by sentinel files on disk.
|
||||
pub fn state(&self) -> IndexState {
|
||||
IndexState::detect(&self.root_path).unwrap_or(IndexState::Empty)
|
||||
}
|
||||
|
||||
pub fn meta(&self) -> &IndexMeta { &self.meta }
|
||||
pub fn kmer_size(&self) -> usize { self.meta.config.kmer_size }
|
||||
pub fn minimizer_size(&self) -> usize { self.meta.config.minimizer_size }
|
||||
pub fn n_partitions(&self) -> usize { self.partition.n_partitions() }
|
||||
|
||||
/// Expose the inner partition so the caller can run scatter into it.
|
||||
/// Call `mark_scattered` once scatter is complete.
|
||||
pub fn partition_mut(&mut self) -> &mut KmerPartition {
|
||||
&mut self.partition
|
||||
}
|
||||
|
||||
/// Mark scatter as complete and write `scatter.done`.
|
||||
///
|
||||
/// If no genome label was set at creation time, one is derived from
|
||||
/// `first_scatter_path` (filename stripped of all extensions).
|
||||
/// If `first_scatter_path` is also `None`, the label defaults to `"unknown"`.
|
||||
pub fn mark_scattered(&mut self, first_scatter_path: Option<&Path>) -> OKIResult<()> {
|
||||
if self.meta.genomes.is_empty() {
|
||||
let label = first_scatter_path
|
||||
.map(label_from_path)
|
||||
.unwrap_or_else(|| "unknown".to_string());
|
||||
self.meta.genomes.push(label);
|
||||
self.meta.write(&self.root_path)?;
|
||||
}
|
||||
touch(&self.root_path.join(SENTINEL_SCATTERED))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Dereplicate all partitions then compute kmer counts.
|
||||
///
|
||||
/// Writes `kmer_spectrum_raw.json` at the index root upon completion
|
||||
/// (this file doubles as the `Counted` sentinel).
|
||||
pub fn dereplicate_and_count(&self, rep: &mut Reporter) -> OKIResult<()> {
|
||||
let t = Stage::start("dereplicate");
|
||||
self.partition.dereplicate()?;
|
||||
rep.push(t.stop());
|
||||
|
||||
let t = Stage::start("count_kmer");
|
||||
self.partition.count_kmer()?;
|
||||
rep.push(t.stop());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Build the layered MPHF index for all partitions.
|
||||
///
|
||||
/// Default mode (`config.with_counts = false`): set membership only.
|
||||
/// With counts: count matrix per kmer.
|
||||
///
|
||||
/// Writes `index.done` upon completion.
|
||||
/// Path to the unitigs file for partition `part`, layer `layer`.
|
||||
pub fn layer_unitigs_path(&self, part: usize, layer: usize) -> PathBuf {
|
||||
self.partition.part_dir(part)
|
||||
.join("index")
|
||||
.join(format!("layer_{layer}"))
|
||||
.join("unitigs.bin")
|
||||
}
|
||||
|
||||
pub fn build_layers(
|
||||
&self,
|
||||
min_ab: u32,
|
||||
max_ab: Option<u32>,
|
||||
keep_intermediate: bool,
|
||||
rep: &mut Reporter,
|
||||
) -> OKIResult<()> {
|
||||
let n = self.partition.n_partitions();
|
||||
let t = Stage::start("index");
|
||||
let with_counts = self.meta.config.with_counts;
|
||||
let filter_active = min_ab > 1 || max_ab.is_some();
|
||||
let need_counts = filter_active || with_counts;
|
||||
let total_kmers = AtomicUsize::new(0);
|
||||
|
||||
let partition = &self.partition;
|
||||
|
||||
let pb = Arc::new(Mutex::new(
|
||||
ProgressBar::new(n as u64).with_style(
|
||||
ProgressStyle::with_template("index — [{bar:20}] {pos}/{len} | {msg}").unwrap(),
|
||||
),
|
||||
));
|
||||
|
||||
(0..n).into_par_iter().for_each(|i| {
|
||||
let part_dir = partition.part_dir(i);
|
||||
let dedup_path = part_dir.join("dereplicated.skmer.zst");
|
||||
if !dedup_path.exists() {
|
||||
return;
|
||||
}
|
||||
|
||||
let layer_dir = part_dir.join("index").join("layer_0");
|
||||
if layer_dir.join("mphf.bin").exists() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mphf1_opt: Option<Mphf> = if need_counts {
|
||||
let p = part_dir.join("mphf1.bin");
|
||||
p.exists().then(|| Mphf::load_full(&p).ok()).flatten()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let counts1_opt: Option<PersistentCompactIntVec> = if need_counts {
|
||||
let p = part_dir.join("counts1.bin");
|
||||
p.exists()
|
||||
.then(|| PersistentCompactIntVec::open(&p).ok())
|
||||
.flatten()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let mut g = GraphDeBruijn::new();
|
||||
let mut reader = SKFileReader::open(&dedup_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", dedup_path.display());
|
||||
std::process::exit(1);
|
||||
});
|
||||
for sk in reader.iter() {
|
||||
for kmer in sk.iter_canonical_kmers() {
|
||||
let accept = if filter_active {
|
||||
match (&mphf1_opt, &counts1_opt) {
|
||||
(Some(mphf), Some(counts)) => {
|
||||
let ab = counts.get(mphf.index(&kmer.raw()));
|
||||
ab >= min_ab && max_ab.map_or(true, |max| ab <= max)
|
||||
}
|
||||
_ => true,
|
||||
}
|
||||
} else {
|
||||
true
|
||||
};
|
||||
if accept {
|
||||
g.push(kmer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let n_kmers = g.len();
|
||||
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||
g.compute_degrees();
|
||||
|
||||
fs::create_dir_all(&layer_dir).unwrap_or_else(|e| {
|
||||
eprintln!("error creating {}: {e}", layer_dir.display());
|
||||
std::process::exit(1);
|
||||
});
|
||||
let mut uw = Layer::<()>::unitig_writer(&layer_dir).unwrap_or_else(|e| {
|
||||
eprintln!("error creating unitig writer (partition {i}): {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
for unitig in g.iter_unitig() {
|
||||
uw.write(&unitig).unwrap_or_else(|e| {
|
||||
eprintln!("error writing unitig (partition {i}): {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
}
|
||||
uw.close().unwrap_or_else(|e| {
|
||||
eprintln!("error closing unitig writer (partition {i}): {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
|
||||
if with_counts {
|
||||
Layer::<PersistentCompactIntMatrix>::build(&layer_dir, |kmer| {
|
||||
match (&mphf1_opt, &counts1_opt) {
|
||||
(Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
|
||||
_ => 1,
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("error building count layer (partition {i}): {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
} else {
|
||||
Layer::<()>::build(&layer_dir).unwrap_or_else(|e| {
|
||||
eprintln!("error building set layer (partition {i}): {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
}
|
||||
|
||||
let pb = pb.lock().unwrap();
|
||||
pb.inc(1);
|
||||
pb.set_message(format!("{i}: {n_kmers} kmers"));
|
||||
});
|
||||
|
||||
pb.lock().unwrap().finish_and_clear();
|
||||
info!(
|
||||
"done — {} total kmers indexed",
|
||||
total_kmers.load(Ordering::Relaxed)
|
||||
);
|
||||
|
||||
if !keep_intermediate {
|
||||
for i in 0..n {
|
||||
let part_dir = partition.part_dir(i);
|
||||
remove_if_exists(&part_dir.join("dereplicated.skmer.zst"));
|
||||
remove_if_exists(&SKFileMeta::sidecar_path(
|
||||
&part_dir.join("dereplicated.skmer.zst"),
|
||||
));
|
||||
remove_if_exists(&part_dir.join("mphf1.bin"));
|
||||
remove_if_exists(&part_dir.join("counts1.bin"));
|
||||
}
|
||||
}
|
||||
|
||||
touch(&self.root_path.join(SENTINEL_INDEXED))?;
|
||||
rep.push(t.stop());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Derive a genome label from a file path: filename stripped of all extensions.
|
||||
fn label_from_path(path: &Path) -> String {
|
||||
let name = path
|
||||
.file_name()
|
||||
.unwrap_or(path.as_os_str())
|
||||
.to_string_lossy()
|
||||
.into_owned();
|
||||
let mut s = name;
|
||||
while let Some(pos) = s.rfind('.') {
|
||||
s.truncate(pos);
|
||||
}
|
||||
if s.is_empty() { "unknown".to_string() } else { s }
|
||||
}
|
||||
|
||||
fn touch(path: &Path) -> Result<(), std::io::Error> {
|
||||
fs::File::create(path).map(|_| ())
|
||||
}
|
||||
|
||||
fn remove_if_exists(path: &Path) {
|
||||
if let Err(e) = fs::remove_file(path) {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
eprintln!("warning: could not remove {}: {e}", path.display());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
pub mod error;
|
||||
pub mod meta;
|
||||
pub mod state;
|
||||
mod index;
|
||||
|
||||
pub use error::{OKIError, OKIResult};
|
||||
pub use index::KmerIndex;
|
||||
pub use meta::{IndexConfig, IndexMeta, META_FILENAME};
|
||||
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
||||
@@ -0,0 +1,45 @@
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub const META_FILENAME: &str = "index.meta";
|
||||
const META_VERSION: u32 = 1;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexConfig {
|
||||
pub kmer_size: usize,
|
||||
pub minimizer_size: usize,
|
||||
pub n_bits: usize,
|
||||
pub with_counts: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexMeta {
|
||||
pub version: u32,
|
||||
pub config: IndexConfig,
|
||||
/// Ordered list of genome labels indexed here.
|
||||
/// Element 0 is the initial genome; subsequent entries come from merges.
|
||||
pub genomes: Vec<String>,
|
||||
}
|
||||
|
||||
impl IndexMeta {
|
||||
pub fn new(config: IndexConfig) -> Self {
|
||||
Self { version: META_VERSION, config, genomes: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn write(&self, root: &Path) -> io::Result<()> {
|
||||
let file = fs::File::create(root.join(META_FILENAME))?;
|
||||
serde_json::to_writer_pretty(file, self).map_err(io::Error::other)
|
||||
}
|
||||
|
||||
pub fn read(root: &Path) -> io::Result<Self> {
|
||||
let file = fs::File::open(root.join(META_FILENAME))?;
|
||||
serde_json::from_reader(file).map_err(io::Error::other)
|
||||
}
|
||||
|
||||
pub fn exists(root: &Path) -> bool {
|
||||
root.join(META_FILENAME).exists()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,45 @@
|
||||
use std::path::Path;
|
||||
|
||||
use crate::meta::META_FILENAME;
|
||||
|
||||
pub const SENTINEL_SCATTERED: &str = "scatter.done";
|
||||
pub const SENTINEL_COUNTED: &str = "kmer_spectrum_raw.json";
|
||||
pub const SENTINEL_INDEXED: &str = "index.done";
|
||||
|
||||
/// Progression state of a `KmerIndex`.
|
||||
///
|
||||
/// Variants are ordered: `Empty < Scattered < Counted < Indexed`.
|
||||
/// A state is reported only when its sentinel file is fully present —
|
||||
/// partial states (e.g. scatter interrupted mid-way) are not accepted.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum IndexState {
|
||||
/// `index.meta` present; scatter not yet completed.
|
||||
Empty,
|
||||
/// `scatter.done` sentinel present — all super-kmers have been routed.
|
||||
Scattered,
|
||||
/// `kmer_spectrum_raw.json` present — dereplicate + count complete.
|
||||
Counted,
|
||||
/// `index.done` sentinel present — layered MPHF index fully built.
|
||||
Indexed,
|
||||
}
|
||||
|
||||
impl IndexState {
|
||||
/// Detect the state of the index at `root`.
|
||||
///
|
||||
/// Returns `None` if `index.meta` is absent (not an obikindex directory).
|
||||
pub fn detect(root: &Path) -> Option<Self> {
|
||||
if !root.join(META_FILENAME).exists() {
|
||||
return None;
|
||||
}
|
||||
if root.join(SENTINEL_INDEXED).exists() {
|
||||
return Some(Self::Indexed);
|
||||
}
|
||||
if root.join(SENTINEL_COUNTED).exists() {
|
||||
return Some(Self::Counted);
|
||||
}
|
||||
if root.join(SENTINEL_SCATTERED).exists() {
|
||||
return Some(Self::Scattered);
|
||||
}
|
||||
Some(Self::Empty)
|
||||
}
|
||||
}
|
||||
+2
-10
@@ -13,21 +13,13 @@ obiread = { path = "../obiread" }
|
||||
obiskbuilder = { path = "../obiskbuilder" }
|
||||
obifastwrite = { path = "../obifastwrite" }
|
||||
obipipeline = { path = "../obipipeline" }
|
||||
obidebruinj = { path = "../obidebruinj" }
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
obikrope = { path = "../obikrope" }
|
||||
obikpartitionner = { path = "../obikpartitionner" }
|
||||
obisys = { path = "../obisys" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
obicompactvec = { path = "../obicompactvec" }
|
||||
obilayeredmap = { path = "../obilayeredmap" }
|
||||
niffler = "3"
|
||||
obikindex = { path = "../obikindex" }
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
rayon = "1"
|
||||
ph = "0.11"
|
||||
memmap2 = "0.9"
|
||||
epserde = "0.8"
|
||||
ptr_hash = "1.1"
|
||||
cacheline-ef = "1.1"
|
||||
indicatif = "0.17"
|
||||
tracing = "0.1.44"
|
||||
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
use clap::Args;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use std::path::PathBuf;
|
||||
use tracing::info;
|
||||
|
||||
#[derive(Args)]
|
||||
pub struct CountArgs {
|
||||
/// Partition directory produced by the `partition` command
|
||||
#[arg(short, long)]
|
||||
pub partition: PathBuf,
|
||||
}
|
||||
|
||||
pub fn run(args: CountArgs) {
|
||||
let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
|
||||
eprintln!("error: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
info!("counting kmers in {}", args.partition.display());
|
||||
kp.count_kmer().unwrap_or_else(|e| {
|
||||
eprintln!("error: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
}
|
||||
@@ -1,84 +0,0 @@
|
||||
use std::fs::File;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use clap::Args;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obifastwrite::write_count;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obiskio::SKFileReader;
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
#[derive(Args)]
|
||||
pub struct FastaArgs {
|
||||
/// Root of the k-mer partition directory (produced by the `partition` command)
|
||||
pub partition: PathBuf,
|
||||
|
||||
/// Dump dereplicated super-kmers as FASTA (→ <partition>/dereplicated.skmer.fasta.gz)
|
||||
#[arg(long)]
|
||||
pub super_kmers: bool,
|
||||
}
|
||||
|
||||
pub fn run(args: FastaArgs) {
|
||||
if !args.super_kmers {
|
||||
eprintln!("error: specify at least one output mode (--super-kmers)");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
|
||||
eprintln!("error opening partition: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
if args.super_kmers {
|
||||
dump_super_kmers(&kp, &args.partition);
|
||||
}
|
||||
}
|
||||
|
||||
fn dump_super_kmers(kp: &KmerPartition, _partition_dir: &PathBuf) {
|
||||
let k = kp.kmer_size();
|
||||
let m = kp.minimizer_size();
|
||||
let n = kp.n_partitions();
|
||||
|
||||
info!("writing {n} partition FASTA files (parallel)");
|
||||
|
||||
let total = AtomicUsize::new(0);
|
||||
|
||||
(0..n).into_par_iter().for_each(|i| {
|
||||
let part_dir = kp.part_dir(i);
|
||||
let in_path = part_dir.join("dereplicated.skmer.zst");
|
||||
if !in_path.exists() {
|
||||
return;
|
||||
}
|
||||
let out_path = part_dir.join("dereplicated.skmer.fasta.gz");
|
||||
|
||||
let file = File::create(&out_path).unwrap_or_else(|e| {
|
||||
eprintln!("error creating {}: {e}", out_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
let mut writer = niffler::send::get_writer(Box::new(file), Format::Gzip, Level::Six)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("error creating gzip writer: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
let mut reader = SKFileReader::open(&in_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", in_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
let mut count = 0usize;
|
||||
for sk in reader.iter() {
|
||||
write_count(&sk, &mut writer, k, m, i as u32).unwrap_or_else(|e| {
|
||||
eprintln!("write error: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
count += 1;
|
||||
}
|
||||
info!("partition {i}: {count} super-kmers → {}", out_path.display());
|
||||
total.fetch_add(count, Ordering::Relaxed);
|
||||
});
|
||||
|
||||
info!("wrote {} super-kmers total", total.load(Ordering::Relaxed));
|
||||
}
|
||||
@@ -1,17 +1,17 @@
|
||||
use std::path::PathBuf;
|
||||
|
||||
use clap::Args;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obikindex::{IndexConfig, IndexState, KmerIndex};
|
||||
use obikseq::{set_k, set_m};
|
||||
use obisys::Reporter;
|
||||
use tracing::info;
|
||||
|
||||
use crate::cli::CommonArgs;
|
||||
use crate::steps::{build_index, dereplicate_and_count, scatter};
|
||||
use crate::steps::scatter;
|
||||
|
||||
#[derive(Args)]
|
||||
pub struct IndexArgs {
|
||||
/// Output partition directory
|
||||
/// Output index directory
|
||||
#[arg(short, long)]
|
||||
pub output: PathBuf,
|
||||
|
||||
@@ -19,6 +19,10 @@ pub struct IndexArgs {
|
||||
#[arg(long, default_value_t = false)]
|
||||
pub force: bool,
|
||||
|
||||
/// Genome label (default: input filename without path/extension)
|
||||
#[arg(long)]
|
||||
pub label: Option<String>,
|
||||
|
||||
/// Minimum kmer abundance (inclusive)
|
||||
#[arg(long, default_value_t = 1)]
|
||||
pub min_abundance: u32,
|
||||
@@ -43,53 +47,71 @@ pub fn run(args: IndexArgs) {
|
||||
let output = args.output.clone();
|
||||
let mut rep = Reporter::new();
|
||||
|
||||
// ── Stage 1: scatter (skipped if partition already exists) ───────────────
|
||||
let kp = if output.join("partition.meta").exists() {
|
||||
info!("resuming from existing partition at {}", output.display());
|
||||
let kp = KmerPartition::open(&output).unwrap_or_else(|e| {
|
||||
eprintln!("error opening partition: {e}");
|
||||
// ── Open or create the index ─────────────────────────────────────────────
|
||||
let mut idx = if KmerIndex::exists(&output) {
|
||||
info!("resuming from existing index at {}", output.display());
|
||||
KmerIndex::open(&output).unwrap_or_else(|e| {
|
||||
eprintln!("error opening index: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
set_k(kp.kmer_size());
|
||||
set_m(kp.minimizer_size());
|
||||
kp
|
||||
})
|
||||
} else {
|
||||
let k = args.common.kmer_size;
|
||||
set_k(k);
|
||||
let m = args.common.minimizer_size;
|
||||
set_m(m);
|
||||
let theta = args.common.theta;
|
||||
let config = IndexConfig {
|
||||
kmer_size: args.common.kmer_size,
|
||||
minimizer_size: args.common.minimizer_size,
|
||||
n_bits: args.common.partition_bits,
|
||||
with_counts: args.with_counts,
|
||||
};
|
||||
KmerIndex::create(&output, config, args.label.clone(), args.force).unwrap_or_else(|e| {
|
||||
eprintln!("error creating index: {e}");
|
||||
std::process::exit(1);
|
||||
})
|
||||
};
|
||||
|
||||
set_k(idx.kmer_size());
|
||||
set_m(idx.minimizer_size());
|
||||
|
||||
// ── Stage 1: scatter ─────────────────────────────────────────────────────
|
||||
if idx.state() < IndexState::Scattered {
|
||||
let first_path = args.common.inputs.first().map(PathBuf::from);
|
||||
let k = idx.kmer_size();
|
||||
let level_max = args.common.level_max;
|
||||
let theta = args.common.theta;
|
||||
let n_workers = args.common.threads.max(1);
|
||||
|
||||
let mut kp =
|
||||
KmerPartition::create(&output, args.common.partition_bits, k, m, args.force)
|
||||
.unwrap_or_else(|e| {
|
||||
scatter(idx.partition_mut(), args.common.seqfile_paths(), k, level_max, theta, n_workers, &mut rep);
|
||||
|
||||
idx.mark_scattered(first_path.as_deref()).unwrap_or_else(|e| {
|
||||
eprintln!("error marking scatter done: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
} else {
|
||||
info!("scatter already done, skipping");
|
||||
}
|
||||
|
||||
// ── Stage 2: dereplicate + count ─────────────────────────────────────────
|
||||
if idx.state() < IndexState::Counted {
|
||||
idx.dereplicate_and_count(&mut rep).unwrap_or_else(|e| {
|
||||
eprintln!("error: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
|
||||
scatter(&mut kp, args.common.seqfile_paths(), k, level_max, theta, n_workers, &mut rep);
|
||||
kp
|
||||
};
|
||||
|
||||
|
||||
// ── Stage 2: dereplicate + count (skipped if already done) ───────────────
|
||||
if !output.join("kmer_spectrum_raw.json").exists() {
|
||||
dereplicate_and_count(&kp, &mut rep);
|
||||
} else {
|
||||
info!("kmer counts already present, skipping dereplicate + count");
|
||||
info!("dereplicate+count already done, skipping");
|
||||
}
|
||||
|
||||
// ── Stage 3: build layered index ─────────────────────────────────────────
|
||||
build_index(
|
||||
&kp,
|
||||
if idx.state() < IndexState::Indexed {
|
||||
idx.build_layers(
|
||||
args.min_abundance,
|
||||
args.max_abundance,
|
||||
args.with_counts,
|
||||
args.keep_intermediate,
|
||||
&mut rep,
|
||||
);
|
||||
).unwrap_or_else(|e| {
|
||||
eprintln!("error: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
} else {
|
||||
info!("index already built, skipping");
|
||||
}
|
||||
|
||||
rep.print();
|
||||
}
|
||||
|
||||
@@ -1,143 +0,0 @@
|
||||
use std::fs::File;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use clap::Args;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obidebruinj::GraphDeBruijn;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obikseq::set_k;
|
||||
use obiskio::SKFileReader;
|
||||
use ph::fmph::GOFunction;
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
#[derive(Args)]
|
||||
pub struct LongtigArgs {
|
||||
/// Root of the k-mer partition directory (produced by the `partition` command)
|
||||
pub partition: PathBuf,
|
||||
|
||||
/// Minimum kmer abundance (inclusive); kmers below this threshold are excluded
|
||||
#[arg(long, default_value_t = 1)]
|
||||
pub min_abundance: u32,
|
||||
|
||||
/// Maximum kmer abundance (inclusive); kmers above this threshold are excluded
|
||||
#[arg(long)]
|
||||
pub max_abundance: Option<u32>,
|
||||
}
|
||||
|
||||
pub fn run(args: LongtigArgs) {
|
||||
let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
|
||||
eprintln!("error opening partition: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
let k = kp.kmer_size();
|
||||
set_k(k);
|
||||
let n = kp.n_partitions();
|
||||
info!("building longtigs from {n} partitions (k={k}, parallel)");
|
||||
|
||||
let total_kmers = AtomicUsize::new(0);
|
||||
|
||||
(0..n).into_par_iter().for_each(|i| {
|
||||
let part_dir = kp.part_dir(i);
|
||||
let in_path = part_dir.join("dereplicated.skmer.zst");
|
||||
if !in_path.exists() {
|
||||
return;
|
||||
}
|
||||
let out_path = part_dir.join("longtig.fasta.gz");
|
||||
|
||||
let mut g = GraphDeBruijn::new();
|
||||
|
||||
let mphf_path = part_dir.join("mphf1.bin");
|
||||
let counts_path = part_dir.join("counts1.bin");
|
||||
let filter_active = (args.min_abundance > 1 || args.max_abundance.is_some())
|
||||
&& mphf_path.exists()
|
||||
&& counts_path.exists();
|
||||
|
||||
let mphf_opt: Option<GOFunction> = if filter_active {
|
||||
let mut f = File::open(&mphf_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", mphf_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
Some(GOFunction::read(&mut f).unwrap_or_else(|e| {
|
||||
eprintln!("error reading MPHF {}: {e}", mphf_path.display());
|
||||
std::process::exit(1)
|
||||
}))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let counts_mmap_opt = if filter_active {
|
||||
let cf = File::open(&counts_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", counts_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
Some(unsafe {
|
||||
memmap2::Mmap::map(&cf).unwrap_or_else(|e| {
|
||||
eprintln!("error mmapping {}: {e}", counts_path.display());
|
||||
std::process::exit(1)
|
||||
})
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let counts_slice: Option<&[u32]> = counts_mmap_opt
|
||||
.as_ref()
|
||||
.map(|m| unsafe { std::slice::from_raw_parts(m.as_ptr() as *const u32, m.len() / 4) });
|
||||
|
||||
let mut reader = SKFileReader::open(&in_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", in_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
for sk in reader.iter() {
|
||||
for kmer in sk.iter_canonical_kmers() {
|
||||
let accept = match (&mphf_opt, counts_slice) {
|
||||
(Some(mphf), Some(counts)) => {
|
||||
if let Some(slot) = mphf.get(&kmer) {
|
||||
let ab = counts[slot as usize];
|
||||
ab >= args.min_abundance
|
||||
&& args.max_abundance.map_or(true, |max| ab <= max)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
_ => true,
|
||||
};
|
||||
if accept {
|
||||
g.push(kmer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let n_kmers = g.len();
|
||||
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||
info!(
|
||||
"partition {i}/{n}: {n_kmers} canonical k-mers → {}",
|
||||
out_path.display()
|
||||
);
|
||||
|
||||
g.compute_degrees();
|
||||
|
||||
let file = File::create(&out_path).unwrap_or_else(|e| {
|
||||
eprintln!("error creating {}: {e}", out_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
let mut writer = niffler::send::get_writer(Box::new(file), Format::Gzip, Level::Six)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("error creating gzip writer: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
g.write_fasta(&mut writer, false).unwrap_or_else(|e| {
|
||||
eprintln!("write error on partition {i}: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
});
|
||||
|
||||
info!(
|
||||
"done — {} total canonical k-mers across all partitions",
|
||||
total_kmers.load(Ordering::Relaxed)
|
||||
);
|
||||
}
|
||||
@@ -1,7 +1,3 @@
|
||||
pub mod count;
|
||||
pub mod fasta;
|
||||
pub mod index;
|
||||
pub mod longtig;
|
||||
pub mod partition;
|
||||
pub mod superkmer;
|
||||
pub mod unitig;
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
use std::path::PathBuf;
|
||||
|
||||
use clap::Args;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obisys::Reporter;
|
||||
use obikseq::{set_k, set_m};
|
||||
|
||||
use crate::cli::CommonArgs;
|
||||
use crate::steps::{dereplicate_and_count, scatter};
|
||||
|
||||
#[derive(Args)]
|
||||
pub struct PartitionArgs {
|
||||
/// Output partition directory
|
||||
#[arg(short, long)]
|
||||
pub output: PathBuf,
|
||||
|
||||
/// Overwrite output directory if it already exists
|
||||
#[arg(long, default_value_t = false)]
|
||||
pub force: bool,
|
||||
|
||||
#[command(flatten)]
|
||||
pub common: CommonArgs,
|
||||
}
|
||||
|
||||
// ── Entry point ───────────────────────────────────────────────────────────────
|
||||
|
||||
pub fn run(args: PartitionArgs) {
|
||||
let k = args.common.kmer_size;
|
||||
set_k(k);
|
||||
let m = args.common.minimizer_size;
|
||||
set_m(m);
|
||||
let theta = args.common.theta;
|
||||
let level_max = args.common.level_max;
|
||||
let n_workers = args.common.threads.max(1);
|
||||
|
||||
let mut kp = KmerPartition::create(&args.output, args.common.partition_bits, k, m, args.force)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("error: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
let path_source = args.common.seqfile_paths();
|
||||
let mut rep = Reporter::new();
|
||||
|
||||
scatter(&mut kp, path_source, k, level_max, theta, n_workers, &mut rep);
|
||||
dereplicate_and_count(&kp, &mut rep);
|
||||
|
||||
rep.print();
|
||||
}
|
||||
+23
-112
@@ -1,143 +1,54 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufWriter, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Mutex;
|
||||
|
||||
use clap::Args;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obidebruinj::GraphDeBruijn;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obifastwrite::write_unitig;
|
||||
use obikindex::KmerIndex;
|
||||
use obikseq::set_k;
|
||||
use obiskio::SKFileReader;
|
||||
use ph::fmph::GOFunction;
|
||||
use obiskio::UnitigFileReader;
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
#[derive(Args)]
|
||||
pub struct UnitigArgs {
|
||||
/// Root of the k-mer partition directory (produced by the `partition` command)
|
||||
pub partition: PathBuf,
|
||||
|
||||
/// Minimum kmer abundance (inclusive); kmers below this threshold are excluded
|
||||
#[arg(long, default_value_t = 1)]
|
||||
pub min_abundance: u32,
|
||||
|
||||
/// Maximum kmer abundance (inclusive); kmers above this threshold are excluded
|
||||
#[arg(long)]
|
||||
pub max_abundance: Option<u32>,
|
||||
/// Index directory produced by the `index` command
|
||||
pub index: PathBuf,
|
||||
}
|
||||
|
||||
pub fn run(args: UnitigArgs) {
|
||||
let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
|
||||
eprintln!("error opening partition: {e}");
|
||||
let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| {
|
||||
eprintln!("error opening index: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
let k = kp.kmer_size();
|
||||
let k = idx.kmer_size();
|
||||
set_k(k);
|
||||
let n = kp.n_partitions();
|
||||
info!("building unitigs from {n} partitions (k={k}, parallel)");
|
||||
let n = idx.n_partitions();
|
||||
info!("dumping unitigs from {n} partitions (k={k})");
|
||||
|
||||
let total_kmers = AtomicUsize::new(0);
|
||||
let stdout = Mutex::new(BufWriter::new(io::stdout()));
|
||||
|
||||
(0..n).into_par_iter().for_each(|i| {
|
||||
let part_dir = kp.part_dir(i);
|
||||
let in_path = part_dir.join("dereplicated.skmer.zst");
|
||||
if !in_path.exists() {
|
||||
let path = idx.layer_unitigs_path(i, 0);
|
||||
if !path.exists() {
|
||||
return;
|
||||
}
|
||||
let out_path = part_dir.join("unitig.fasta.gz");
|
||||
|
||||
let mut g = GraphDeBruijn::new();
|
||||
|
||||
let mphf_path = part_dir.join("mphf1.bin");
|
||||
let counts_path = part_dir.join("counts1.bin");
|
||||
let filter_active = (args.min_abundance > 1 || args.max_abundance.is_some())
|
||||
&& mphf_path.exists()
|
||||
&& counts_path.exists();
|
||||
|
||||
let mphf_opt: Option<GOFunction> = if filter_active {
|
||||
let mut f = File::open(&mphf_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", mphf_path.display());
|
||||
let reader = UnitigFileReader::open(&path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening unitigs (partition {i}): {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
Some(GOFunction::read(&mut f).unwrap_or_else(|e| {
|
||||
eprintln!("error reading MPHF {}: {e}", mphf_path.display());
|
||||
std::process::exit(1)
|
||||
}))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let counts_mmap_opt = if filter_active {
|
||||
let cf = File::open(&counts_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", counts_path.display());
|
||||
for j in 0..reader.len() {
|
||||
let unitig = reader.unitig(j);
|
||||
let mut out = stdout.lock().unwrap();
|
||||
write_unitig(&unitig, k, i, j, &mut *out).unwrap_or_else(|e| {
|
||||
eprintln!("write error: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
Some(unsafe {
|
||||
memmap2::Mmap::map(&cf).unwrap_or_else(|e| {
|
||||
eprintln!("error mmapping {}: {e}", counts_path.display());
|
||||
std::process::exit(1)
|
||||
})
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let counts_slice: Option<&[u32]> = counts_mmap_opt
|
||||
.as_ref()
|
||||
.map(|m| unsafe { std::slice::from_raw_parts(m.as_ptr() as *const u32, m.len() / 4) });
|
||||
|
||||
let mut reader = SKFileReader::open(&in_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", in_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
for sk in reader.iter() {
|
||||
for kmer in sk.iter_canonical_kmers() {
|
||||
let accept = match (&mphf_opt, counts_slice) {
|
||||
(Some(mphf), Some(counts)) => {
|
||||
if let Some(slot) = mphf.get(&kmer) {
|
||||
let ab = counts[slot as usize];
|
||||
ab >= args.min_abundance
|
||||
&& args.max_abundance.map_or(true, |max| ab <= max)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
_ => true,
|
||||
};
|
||||
if accept {
|
||||
g.push(kmer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let n_kmers = g.len();
|
||||
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||
info!(
|
||||
"partition {i}/{n}: {n_kmers} canonical k-mers → {}",
|
||||
out_path.display()
|
||||
);
|
||||
|
||||
g.compute_degrees();
|
||||
|
||||
let file = File::create(&out_path).unwrap_or_else(|e| {
|
||||
eprintln!("error creating {}: {e}", out_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
let mut writer = niffler::send::get_writer(Box::new(file), Format::Gzip, Level::Six)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("error creating gzip writer: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
g.write_fasta(&mut writer, true).unwrap_or_else(|e| {
|
||||
eprintln!("write error on partition {i}: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
});
|
||||
|
||||
info!(
|
||||
"done — {} total canonical k-mers across all partitions",
|
||||
total_kmers.load(Ordering::Relaxed)
|
||||
);
|
||||
stdout.into_inner().unwrap().flush().expect("flush error");
|
||||
}
|
||||
|
||||
+4
-16
@@ -14,20 +14,12 @@ struct Cli {
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
/// Extract super-kmers from a sequence file (scatter phase)
|
||||
/// Extract super-kmers from a sequence file and write to stdout
|
||||
Superkmer(cmd::superkmer::SuperkmerArgs),
|
||||
/// Partition super-kmers on disk by minimizer
|
||||
Partition(cmd::partition::PartitionArgs),
|
||||
/// Count kmers from an existing dereplicated partition directory
|
||||
Count(cmd::count::CountArgs),
|
||||
/// Export partition data to FASTA (--super-kmers: dereplicated super-kmers)
|
||||
Fasta(cmd::fasta::FastaArgs),
|
||||
/// Build de Bruijn unitigs for all partitions and write to unitig.fasta.gz
|
||||
Unitig(cmd::unitig::UnitigArgs),
|
||||
/// Build de Bruijn longtigs for all partitions and write to longtig.fasta.gz
|
||||
Longtig(cmd::longtig::LongtigArgs),
|
||||
/// Build the complete genome index (scatter → dereplicate → count → layered MPHF)
|
||||
Index(cmd::index::IndexArgs),
|
||||
/// Dump unitigs from a built index to stdout (debug)
|
||||
Unitig(cmd::unitig::UnitigArgs),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
@@ -50,12 +42,8 @@ fn main() {
|
||||
let cli = Cli::parse();
|
||||
match cli.command {
|
||||
Commands::Superkmer(args) => cmd::superkmer::run(args),
|
||||
Commands::Partition(args) => cmd::partition::run(args),
|
||||
Commands::Count(args) => cmd::count::run(args),
|
||||
Commands::Fasta(args) => cmd::fasta::run(args),
|
||||
Commands::Unitig(args) => cmd::unitig::run(args),
|
||||
Commands::Longtig(args) => cmd::longtig::run(args),
|
||||
Commands::Index(args) => cmd::index::run(args),
|
||||
Commands::Unitig(args) => cmd::unitig::run(args),
|
||||
}
|
||||
|
||||
#[cfg(feature = "profiling")]
|
||||
|
||||
@@ -1,177 +0,0 @@
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use cacheline_ef::{CachelineEf, CachelineEfVec};
|
||||
use epserde::prelude::*;
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use obicompactvec::PersistentCompactIntMatrix;
|
||||
use obicompactvec::PersistentCompactIntVec;
|
||||
use obidebruinj::GraphDeBruijn;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obilayeredmap::layer::Layer;
|
||||
use obiskio::{SKFileMeta, SKFileReader};
|
||||
use obisys::{Reporter, Stage};
|
||||
use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||
|
||||
/// Build the layered MPHF index for all partitions in parallel.
|
||||
///
|
||||
/// Default mode (with_counts = false): set membership only (`Layer<()>`).
|
||||
/// With counts (with_counts = true): count matrix per kmer (`Layer<PersistentCompactIntMatrix>`).
|
||||
///
|
||||
/// Skips any partition whose `index/layer_0/mphf.bin` already exists (resume).
|
||||
/// Reports the "index" stage to `rep`.
|
||||
pub fn build_index(
|
||||
kp: &KmerPartition,
|
||||
min_ab: u32,
|
||||
max_ab: Option<u32>,
|
||||
with_counts: bool,
|
||||
keep_intermediate: bool,
|
||||
rep: &mut Reporter,
|
||||
) {
|
||||
let n = kp.n_partitions();
|
||||
let t = Stage::start("index");
|
||||
let total_kmers = AtomicUsize::new(0);
|
||||
let filter_active = min_ab > 1 || max_ab.is_some();
|
||||
let need_counts = filter_active || with_counts;
|
||||
|
||||
let pb = Arc::new(Mutex::new(
|
||||
ProgressBar::new(n as u64).with_style(
|
||||
ProgressStyle::with_template("index — [{bar:20}] {pos}/{len} | {msg}").unwrap(),
|
||||
),
|
||||
));
|
||||
|
||||
(0..n).into_par_iter().for_each(|i| {
|
||||
let part_dir = kp.part_dir(i);
|
||||
let dedup_path = part_dir.join("dereplicated.skmer.zst");
|
||||
if !dedup_path.exists() {
|
||||
return;
|
||||
}
|
||||
|
||||
let layer_dir = part_dir.join("index").join("layer_0");
|
||||
if layer_dir.join("mphf.bin").exists() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load partition MPHF + counts when needed for filtering or count payload
|
||||
let mphf1_opt: Option<Mphf> = if need_counts {
|
||||
let p = part_dir.join("mphf1.bin");
|
||||
p.exists().then(|| Mphf::load_full(&p).ok()).flatten()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let counts1_opt: Option<PersistentCompactIntVec> = if need_counts {
|
||||
let p = part_dir.join("counts1.bin");
|
||||
p.exists()
|
||||
.then(|| PersistentCompactIntVec::open(&p).ok())
|
||||
.flatten()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Build de Bruijn graph with optional abundance filter
|
||||
let mut g = GraphDeBruijn::new();
|
||||
let mut reader = SKFileReader::open(&dedup_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", dedup_path.display());
|
||||
std::process::exit(1);
|
||||
});
|
||||
for sk in reader.iter() {
|
||||
for kmer in sk.iter_canonical_kmers() {
|
||||
let accept = if filter_active {
|
||||
match (&mphf1_opt, &counts1_opt) {
|
||||
(Some(mphf), Some(counts)) => {
|
||||
let ab = counts.get(mphf.index(&kmer.raw()));
|
||||
ab >= min_ab && max_ab.map_or(true, |max| ab <= max)
|
||||
}
|
||||
_ => true,
|
||||
}
|
||||
} else {
|
||||
true
|
||||
};
|
||||
if accept {
|
||||
g.push(kmer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let n_kmers = g.len();
|
||||
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||
g.compute_degrees();
|
||||
|
||||
// Write unitigs to layer_0/unitigs.bin
|
||||
fs::create_dir_all(&layer_dir).unwrap_or_else(|e| {
|
||||
eprintln!("error creating {}: {e}", layer_dir.display());
|
||||
std::process::exit(1);
|
||||
});
|
||||
let mut uw = Layer::<()>::unitig_writer(&layer_dir).unwrap_or_else(|e| {
|
||||
eprintln!("error creating unitig writer (partition {i}): {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
for unitig in g.iter_unitig() {
|
||||
uw.write(&unitig).unwrap_or_else(|e| {
|
||||
eprintln!("error writing unitig (partition {i}): {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
}
|
||||
uw.close().unwrap_or_else(|e| {
|
||||
eprintln!("error closing unitig writer (partition {i}): {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
|
||||
// Build MPHF layer — mode depends on --with-counts
|
||||
if with_counts {
|
||||
Layer::<PersistentCompactIntMatrix>::build(&layer_dir, |kmer| {
|
||||
match (&mphf1_opt, &counts1_opt) {
|
||||
(Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
|
||||
_ => 1,
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("error building count layer (partition {i}): {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
} else {
|
||||
Layer::<()>::build(&layer_dir).unwrap_or_else(|e| {
|
||||
eprintln!("error building set layer (partition {i}): {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
}
|
||||
|
||||
let pb = pb.lock().unwrap();
|
||||
pb.inc(1);
|
||||
pb.set_message(format!("{i}: {n_kmers} kmers"));
|
||||
});
|
||||
|
||||
pb.lock().unwrap().finish_and_clear();
|
||||
info!(
|
||||
"done — {} total kmers indexed",
|
||||
total_kmers.load(Ordering::Relaxed)
|
||||
);
|
||||
|
||||
// ── Cleanup intermediate build files ──────────────────────────────────────
|
||||
if !keep_intermediate {
|
||||
for i in 0..n {
|
||||
let part_dir = kp.part_dir(i);
|
||||
remove_if_exists(&part_dir.join("dereplicated.skmer.zst"));
|
||||
remove_if_exists(&SKFileMeta::sidecar_path(&part_dir.join("dereplicated.skmer.zst")));
|
||||
remove_if_exists(&part_dir.join("mphf1.bin"));
|
||||
remove_if_exists(&part_dir.join("counts1.bin"));
|
||||
}
|
||||
}
|
||||
|
||||
rep.push(t.stop());
|
||||
}
|
||||
|
||||
fn remove_if_exists(path: &Path) {
|
||||
if let Err(e) = fs::remove_file(path) {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
eprintln!("warning: could not remove {}: {e}", path.display());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,13 +0,0 @@
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obisys::{Reporter, Stage};
|
||||
|
||||
/// Dereplicate then count kmers. Reports each stage to `rep`.
|
||||
pub fn dereplicate_and_count(kp: &KmerPartition, rep: &mut Reporter) {
|
||||
let t = Stage::start("dereplicate");
|
||||
kp.dereplicate().expect("dereplicate error");
|
||||
rep.push(t.stop());
|
||||
|
||||
let t = Stage::start("count_kmer");
|
||||
kp.count_kmer().expect("count kmer error");
|
||||
rep.push(t.stop());
|
||||
}
|
||||
@@ -1,7 +1,3 @@
|
||||
mod build_index;
|
||||
mod dereplicate_and_count;
|
||||
mod scatter;
|
||||
|
||||
pub use build_index::build_index;
|
||||
pub use dereplicate_and_count::dereplicate_and_count;
|
||||
pub use scatter::scatter;
|
||||
|
||||
Reference in New Issue
Block a user