Push qkpyqurltlpk #1
@@ -1,5 +1,6 @@
|
|||||||
## Choses à vérifier suite à la commande index
|
## Choses à vérifier suite à la commande index
|
||||||
|
|
||||||
|
- il faudrait lister les fichiers qui vont être indexés
|
||||||
- partition.meta ne devrait plus exister
|
- partition.meta ne devrait plus exister
|
||||||
- les spectrums globaux devraient être identifiés par génome
|
- les spectrums globaux devraient être identifiés par génome
|
||||||
- regrouper dans un sous-dossier spectrums à la racine de l'index avec un nom basé sur le génome
|
- regrouper dans un sous-dossier spectrums à la racine de l'index avec un nom basé sur le génome
|
||||||
@@ -26,3 +27,5 @@
|
|||||||
- les arbres NJ sont sauvegardés en Newick avec les longueurs de branche
|
- les arbres NJ sont sauvegardés en Newick avec les longueurs de branche
|
||||||
|
|
||||||
- dump : une table CSV de l'index avec les k-mers et les génomes associés, en mode count ou presence/absence, avec une option pour forcer le mode presence/absence même si l'index est en mode count. Par défaut, le mode count est utilisé pour les index en mode count et le mode presence/absence pour les index en mode presence/absence.
|
- dump : une table CSV de l'index avec les k-mers et les génomes associés, en mode count ou presence/absence, avec une option pour forcer le mode presence/absence même si l'index est en mode count. Par défaut, le mode count est utilisé pour les index en mode count et le mode presence/absence pour les index en mode presence/absence.
|
||||||
|
|
||||||
|
- status : affiche le statut de l'index
|
||||||
|
|||||||
Generated
+2
-7
@@ -1452,17 +1452,10 @@ dependencies = [
|
|||||||
name = "obikindex"
|
name = "obikindex"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cacheline-ef",
|
|
||||||
"epserde",
|
|
||||||
"indicatif",
|
"indicatif",
|
||||||
"obicompactvec",
|
|
||||||
"obidebruinj",
|
|
||||||
"obikpartitionner",
|
"obikpartitionner",
|
||||||
"obikseq",
|
|
||||||
"obilayeredmap",
|
|
||||||
"obiskio",
|
"obiskio",
|
||||||
"obisys",
|
"obisys",
|
||||||
"ptr_hash",
|
|
||||||
"rayon",
|
"rayon",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
@@ -1501,8 +1494,10 @@ dependencies = [
|
|||||||
"memmap2",
|
"memmap2",
|
||||||
"niffler 3.0.0",
|
"niffler 3.0.0",
|
||||||
"obicompactvec",
|
"obicompactvec",
|
||||||
|
"obidebruinj",
|
||||||
"obikrope",
|
"obikrope",
|
||||||
"obikseq",
|
"obikseq",
|
||||||
|
"obilayeredmap",
|
||||||
"obiread",
|
"obiread",
|
||||||
"obiskbuilder",
|
"obiskbuilder",
|
||||||
"obiskio",
|
"obiskio",
|
||||||
|
|||||||
@@ -5,15 +5,8 @@ edition = "2024"
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
obikpartitionner = { path = "../obikpartitionner" }
|
obikpartitionner = { path = "../obikpartitionner" }
|
||||||
obikseq = { path = "../obikseq" }
|
|
||||||
obisys = { path = "../obisys" }
|
|
||||||
obiskio = { path = "../obiskio" }
|
obiskio = { path = "../obiskio" }
|
||||||
obidebruinj = { path = "../obidebruinj" }
|
obisys = { path = "../obisys" }
|
||||||
obilayeredmap = { path = "../obilayeredmap" }
|
|
||||||
obicompactvec = { path = "../obicompactvec" }
|
|
||||||
cacheline-ef = "1.1"
|
|
||||||
epserde = "0.8"
|
|
||||||
ptr_hash = "1.1"
|
|
||||||
rayon = "1"
|
rayon = "1"
|
||||||
serde = { version = "1", features = ["derive"] }
|
serde = { version = "1", features = ["derive"] }
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
|
|||||||
@@ -2,14 +2,12 @@ use std::fmt;
|
|||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
use obiskio::SKError;
|
use obiskio::SKError;
|
||||||
use obilayeredmap::OLMError;
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum OKIError {
|
pub enum OKIError {
|
||||||
Io(io::Error),
|
Io(io::Error),
|
||||||
Json(serde_json::Error),
|
Json(serde_json::Error),
|
||||||
Partition(SKError),
|
Partition(SKError),
|
||||||
Layer(OLMError),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub type OKIResult<T> = Result<T, OKIError>;
|
pub type OKIResult<T> = Result<T, OKIError>;
|
||||||
@@ -20,7 +18,6 @@ impl fmt::Display for OKIError {
|
|||||||
OKIError::Io(e) => write!(f, "I/O error: {e}"),
|
OKIError::Io(e) => write!(f, "I/O error: {e}"),
|
||||||
OKIError::Json(e) => write!(f, "JSON error: {e}"),
|
OKIError::Json(e) => write!(f, "JSON error: {e}"),
|
||||||
OKIError::Partition(e) => write!(f, "partition error: {e}"),
|
OKIError::Partition(e) => write!(f, "partition error: {e}"),
|
||||||
OKIError::Layer(e) => write!(f, "layer error: {e}"),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -31,7 +28,6 @@ impl std::error::Error for OKIError {
|
|||||||
OKIError::Io(e) => Some(e),
|
OKIError::Io(e) => Some(e),
|
||||||
OKIError::Json(e) => Some(e),
|
OKIError::Json(e) => Some(e),
|
||||||
OKIError::Partition(e) => Some(e),
|
OKIError::Partition(e) => Some(e),
|
||||||
OKIError::Layer(e) => Some(e),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -47,7 +43,3 @@ impl From<serde_json::Error> for OKIError {
|
|||||||
impl From<SKError> for OKIError {
|
impl From<SKError> for OKIError {
|
||||||
fn from(e: SKError) -> Self { OKIError::Partition(e) }
|
fn from(e: SKError) -> Self { OKIError::Partition(e) }
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<OLMError> for OKIError {
|
|
||||||
fn from(e: OLMError) -> Self { OKIError::Layer(e) }
|
|
||||||
}
|
|
||||||
|
|||||||
+28
-139
@@ -3,16 +3,9 @@ use std::path::{Path, PathBuf};
|
|||||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
use std::sync::{Arc, Mutex};
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
use cacheline_ef::{CachelineEf, CachelineEfVec};
|
|
||||||
use epserde::prelude::*;
|
|
||||||
use indicatif::{ProgressBar, ProgressStyle};
|
use indicatif::{ProgressBar, ProgressStyle};
|
||||||
use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
|
|
||||||
use obidebruinj::GraphDeBruijn;
|
|
||||||
use obikpartitionner::KmerPartition;
|
use obikpartitionner::KmerPartition;
|
||||||
use obilayeredmap::layer::Layer;
|
|
||||||
use obiskio::{SKFileMeta, SKFileReader};
|
|
||||||
use obisys::{Reporter, Stage};
|
use obisys::{Reporter, Stage};
|
||||||
use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
|
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
@@ -20,8 +13,6 @@ use crate::error::{OKIError, OKIResult};
|
|||||||
use crate::meta::{IndexConfig, IndexMeta};
|
use crate::meta::{IndexConfig, IndexMeta};
|
||||||
use crate::state::{IndexState, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
use crate::state::{IndexState, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
||||||
|
|
||||||
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
|
||||||
|
|
||||||
pub struct KmerIndex {
|
pub struct KmerIndex {
|
||||||
root_path: PathBuf,
|
root_path: PathBuf,
|
||||||
meta: IndexMeta,
|
meta: IndexMeta,
|
||||||
@@ -59,7 +50,12 @@ impl KmerIndex {
|
|||||||
pub fn open<P: AsRef<Path>>(path: P) -> OKIResult<Self> {
|
pub fn open<P: AsRef<Path>>(path: P) -> OKIResult<Self> {
|
||||||
let root_path = path.as_ref().to_owned();
|
let root_path = path.as_ref().to_owned();
|
||||||
let meta = IndexMeta::read(&root_path).map_err(OKIError::Io)?;
|
let meta = IndexMeta::read(&root_path).map_err(OKIError::Io)?;
|
||||||
let partition = KmerPartition::open(&root_path)?;
|
let partition = KmerPartition::open_with_config(
|
||||||
|
&root_path,
|
||||||
|
meta.config.kmer_size,
|
||||||
|
meta.config.minimizer_size,
|
||||||
|
meta.config.n_bits,
|
||||||
|
)?;
|
||||||
Ok(Self { root_path, meta, partition })
|
Ok(Self { root_path, meta, partition })
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -87,13 +83,10 @@ impl KmerIndex {
|
|||||||
/// Mark scatter as complete and write `scatter.done`.
|
/// Mark scatter as complete and write `scatter.done`.
|
||||||
///
|
///
|
||||||
/// If no genome label was set at creation time, one is derived from
|
/// If no genome label was set at creation time, one is derived from
|
||||||
/// `first_scatter_path` (filename stripped of all extensions).
|
/// the index root directory name (stripped of all extensions).
|
||||||
/// If `first_scatter_path` is also `None`, the label defaults to `"unknown"`.
|
pub fn mark_scattered(&mut self) -> OKIResult<()> {
|
||||||
pub fn mark_scattered(&mut self, first_scatter_path: Option<&Path>) -> OKIResult<()> {
|
|
||||||
if self.meta.genomes.is_empty() {
|
if self.meta.genomes.is_empty() {
|
||||||
let label = first_scatter_path
|
let label = label_from_path(&self.root_path);
|
||||||
.map(label_from_path)
|
|
||||||
.unwrap_or_else(|| "unknown".to_string());
|
|
||||||
self.meta.genomes.push(label);
|
self.meta.genomes.push(label);
|
||||||
self.meta.write(&self.root_path)?;
|
self.meta.write(&self.root_path)?;
|
||||||
}
|
}
|
||||||
@@ -116,20 +109,9 @@ impl KmerIndex {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build the layered MPHF index for all partitions.
|
/// Build the layered MPHF index for all partitions in parallel.
|
||||||
///
|
|
||||||
/// Default mode (`config.with_counts = false`): set membership only.
|
|
||||||
/// With counts: count matrix per kmer.
|
|
||||||
///
|
///
|
||||||
/// Writes `index.done` upon completion.
|
/// Writes `index.done` upon completion.
|
||||||
/// Path to the unitigs file for partition `part`, layer `layer`.
|
|
||||||
pub fn layer_unitigs_path(&self, part: usize, layer: usize) -> PathBuf {
|
|
||||||
self.partition.part_dir(part)
|
|
||||||
.join("index")
|
|
||||||
.join(format!("layer_{layer}"))
|
|
||||||
.join("unitigs.bin")
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn build_layers(
|
pub fn build_layers(
|
||||||
&self,
|
&self,
|
||||||
min_ab: u32,
|
min_ab: u32,
|
||||||
@@ -140,12 +122,8 @@ impl KmerIndex {
|
|||||||
let n = self.partition.n_partitions();
|
let n = self.partition.n_partitions();
|
||||||
let t = Stage::start("index");
|
let t = Stage::start("index");
|
||||||
let with_counts = self.meta.config.with_counts;
|
let with_counts = self.meta.config.with_counts;
|
||||||
let filter_active = min_ab > 1 || max_ab.is_some();
|
|
||||||
let need_counts = filter_active || with_counts;
|
|
||||||
let total_kmers = AtomicUsize::new(0);
|
let total_kmers = AtomicUsize::new(0);
|
||||||
|
|
||||||
let partition = &self.partition;
|
|
||||||
|
|
||||||
let pb = Arc::new(Mutex::new(
|
let pb = Arc::new(Mutex::new(
|
||||||
ProgressBar::new(n as u64).with_style(
|
ProgressBar::new(n as u64).with_style(
|
||||||
ProgressStyle::with_template("index — [{bar:20}] {pos}/{len} | {msg}").unwrap(),
|
ProgressStyle::with_template("index — [{bar:20}] {pos}/{len} | {msg}").unwrap(),
|
||||||
@@ -153,101 +131,19 @@ impl KmerIndex {
|
|||||||
));
|
));
|
||||||
|
|
||||||
(0..n).into_par_iter().for_each(|i| {
|
(0..n).into_par_iter().for_each(|i| {
|
||||||
let part_dir = partition.part_dir(i);
|
match self.partition.build_index_layer(i, min_ab, max_ab, with_counts) {
|
||||||
let dedup_path = part_dir.join("dereplicated.skmer.zst");
|
Ok(0) => {}
|
||||||
if !dedup_path.exists() {
|
Ok(n_kmers) => {
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
let layer_dir = part_dir.join("index").join("layer_0");
|
|
||||||
if layer_dir.join("mphf.bin").exists() {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
let mphf1_opt: Option<Mphf> = if need_counts {
|
|
||||||
let p = part_dir.join("mphf1.bin");
|
|
||||||
p.exists().then(|| Mphf::load_full(&p).ok()).flatten()
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
let counts1_opt: Option<PersistentCompactIntVec> = if need_counts {
|
|
||||||
let p = part_dir.join("counts1.bin");
|
|
||||||
p.exists()
|
|
||||||
.then(|| PersistentCompactIntVec::open(&p).ok())
|
|
||||||
.flatten()
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut g = GraphDeBruijn::new();
|
|
||||||
let mut reader = SKFileReader::open(&dedup_path).unwrap_or_else(|e| {
|
|
||||||
eprintln!("error opening {}: {e}", dedup_path.display());
|
|
||||||
std::process::exit(1);
|
|
||||||
});
|
|
||||||
for sk in reader.iter() {
|
|
||||||
for kmer in sk.iter_canonical_kmers() {
|
|
||||||
let accept = if filter_active {
|
|
||||||
match (&mphf1_opt, &counts1_opt) {
|
|
||||||
(Some(mphf), Some(counts)) => {
|
|
||||||
let ab = counts.get(mphf.index(&kmer.raw()));
|
|
||||||
ab >= min_ab && max_ab.map_or(true, |max| ab <= max)
|
|
||||||
}
|
|
||||||
_ => true,
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
true
|
|
||||||
};
|
|
||||||
if accept {
|
|
||||||
g.push(kmer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let n_kmers = g.len();
|
|
||||||
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||||
g.compute_degrees();
|
|
||||||
|
|
||||||
fs::create_dir_all(&layer_dir).unwrap_or_else(|e| {
|
|
||||||
eprintln!("error creating {}: {e}", layer_dir.display());
|
|
||||||
std::process::exit(1);
|
|
||||||
});
|
|
||||||
let mut uw = Layer::<()>::unitig_writer(&layer_dir).unwrap_or_else(|e| {
|
|
||||||
eprintln!("error creating unitig writer (partition {i}): {e}");
|
|
||||||
std::process::exit(1);
|
|
||||||
});
|
|
||||||
for unitig in g.iter_unitig() {
|
|
||||||
uw.write(&unitig).unwrap_or_else(|e| {
|
|
||||||
eprintln!("error writing unitig (partition {i}): {e}");
|
|
||||||
std::process::exit(1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
uw.close().unwrap_or_else(|e| {
|
|
||||||
eprintln!("error closing unitig writer (partition {i}): {e}");
|
|
||||||
std::process::exit(1);
|
|
||||||
});
|
|
||||||
|
|
||||||
if with_counts {
|
|
||||||
Layer::<PersistentCompactIntMatrix>::build(&layer_dir, |kmer| {
|
|
||||||
match (&mphf1_opt, &counts1_opt) {
|
|
||||||
(Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
|
|
||||||
_ => 1,
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.unwrap_or_else(|e| {
|
|
||||||
eprintln!("error building count layer (partition {i}): {e}");
|
|
||||||
std::process::exit(1);
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
Layer::<()>::build(&layer_dir).unwrap_or_else(|e| {
|
|
||||||
eprintln!("error building set layer (partition {i}): {e}");
|
|
||||||
std::process::exit(1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let pb = pb.lock().unwrap();
|
let pb = pb.lock().unwrap();
|
||||||
pb.inc(1);
|
pb.inc(1);
|
||||||
pb.set_message(format!("{i}: {n_kmers} kmers"));
|
pb.set_message(format!("{i}: {n_kmers} kmers"));
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("error building layer for partition {i}: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
pb.lock().unwrap().finish_and_clear();
|
pb.lock().unwrap().finish_and_clear();
|
||||||
@@ -258,13 +154,7 @@ impl KmerIndex {
|
|||||||
|
|
||||||
if !keep_intermediate {
|
if !keep_intermediate {
|
||||||
for i in 0..n {
|
for i in 0..n {
|
||||||
let part_dir = partition.part_dir(i);
|
self.partition.remove_build_artifacts(i);
|
||||||
remove_if_exists(&part_dir.join("dereplicated.skmer.zst"));
|
|
||||||
remove_if_exists(&SKFileMeta::sidecar_path(
|
|
||||||
&part_dir.join("dereplicated.skmer.zst"),
|
|
||||||
));
|
|
||||||
remove_if_exists(&part_dir.join("mphf1.bin"));
|
|
||||||
remove_if_exists(&part_dir.join("counts1.bin"));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -272,9 +162,16 @@ impl KmerIndex {
|
|||||||
rep.push(t.stop());
|
rep.push(t.stop());
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Path to the unitigs file for partition `part`, layer `layer`.
|
||||||
|
pub fn layer_unitigs_path(&self, part: usize, layer: usize) -> PathBuf {
|
||||||
|
self.partition.part_dir(part)
|
||||||
|
.join("index")
|
||||||
|
.join(format!("layer_{layer}"))
|
||||||
|
.join("unitigs.bin")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Derive a genome label from a file path: filename stripped of all extensions.
|
|
||||||
fn label_from_path(path: &Path) -> String {
|
fn label_from_path(path: &Path) -> String {
|
||||||
let name = path
|
let name = path
|
||||||
.file_name()
|
.file_name()
|
||||||
@@ -291,11 +188,3 @@ fn label_from_path(path: &Path) -> String {
|
|||||||
fn touch(path: &Path) -> Result<(), std::io::Error> {
|
fn touch(path: &Path) -> Result<(), std::io::Error> {
|
||||||
fs::File::create(path).map(|_| ())
|
fs::File::create(path).map(|_| ())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn remove_if_exists(path: &Path) {
|
|
||||||
if let Err(e) = fs::remove_file(path) {
|
|
||||||
if e.kind() != std::io::ErrorKind::NotFound {
|
|
||||||
eprintln!("warning: could not remove {}: {e}", path.display());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -72,7 +72,10 @@ pub fn run(args: IndexArgs) {
|
|||||||
|
|
||||||
// ── Stage 1: scatter ─────────────────────────────────────────────────────
|
// ── Stage 1: scatter ─────────────────────────────────────────────────────
|
||||||
if idx.state() < IndexState::Scattered {
|
if idx.state() < IndexState::Scattered {
|
||||||
let first_path = args.common.inputs.first().map(PathBuf::from);
|
let paths: Vec<_> = args.common.seqfile_paths().collect();
|
||||||
|
for path in &paths {
|
||||||
|
info!("indexing: {}", path.display());
|
||||||
|
}
|
||||||
let k = idx.kmer_size();
|
let k = idx.kmer_size();
|
||||||
let level_max = args.common.level_max;
|
let level_max = args.common.level_max;
|
||||||
let theta = args.common.theta;
|
let theta = args.common.theta;
|
||||||
@@ -80,7 +83,7 @@ pub fn run(args: IndexArgs) {
|
|||||||
|
|
||||||
scatter(idx.partition_mut(), args.common.seqfile_paths(), k, level_max, theta, n_workers, &mut rep);
|
scatter(idx.partition_mut(), args.common.seqfile_paths(), k, level_max, theta, n_workers, &mut rep);
|
||||||
|
|
||||||
idx.mark_scattered(first_path.as_deref()).unwrap_or_else(|e| {
|
idx.mark_scattered().unwrap_or_else(|e| {
|
||||||
eprintln!("error marking scatter done: {e}");
|
eprintln!("error marking scatter done: {e}");
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -15,6 +15,8 @@ niffler = "3.0.0"
|
|||||||
remove_dir_all = "0.8"
|
remove_dir_all = "0.8"
|
||||||
obikseq = { path = "../obikseq" }
|
obikseq = { path = "../obikseq" }
|
||||||
obiskio = { path = "../obiskio" }
|
obiskio = { path = "../obiskio" }
|
||||||
|
obidebruinj = { path = "../obidebruinj" }
|
||||||
|
obilayeredmap = { path = "../obilayeredmap" }
|
||||||
rayon = "1"
|
rayon = "1"
|
||||||
sysinfo = "0.33"
|
sysinfo = "0.33"
|
||||||
serde = { version = "1", features = ["derive"] }
|
serde = { version = "1", features = ["derive"] }
|
||||||
|
|||||||
@@ -0,0 +1,135 @@
|
|||||||
|
use std::fs;
|
||||||
|
use std::io;
|
||||||
|
|
||||||
|
use cacheline_ef::{CachelineEf, CachelineEfVec};
|
||||||
|
use epserde::prelude::*;
|
||||||
|
use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
|
||||||
|
use obidebruinj::GraphDeBruijn;
|
||||||
|
use obilayeredmap::{OLMError, layer::Layer};
|
||||||
|
use obiskio::{SKError, SKFileMeta, SKFileReader};
|
||||||
|
use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
|
||||||
|
|
||||||
|
use crate::partition::KmerPartition;
|
||||||
|
|
||||||
|
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||||
|
|
||||||
|
fn olm_to_sk(e: OLMError) -> SKError {
|
||||||
|
match e {
|
||||||
|
OLMError::Io(io_err) => SKError::Io(io_err),
|
||||||
|
other => SKError::InvalidData { context: "layer build", detail: other.to_string() },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn remove_if_exists(path: &std::path::Path) {
|
||||||
|
if let Err(e) = fs::remove_file(path) {
|
||||||
|
if e.kind() != io::ErrorKind::NotFound {
|
||||||
|
eprintln!("warning: could not remove {}: {e}", path.display());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl KmerPartition {
|
||||||
|
/// Build the layered MPHF index for partition `i`.
|
||||||
|
///
|
||||||
|
/// Returns the number of canonical k-mers indexed, or 0 if the partition
|
||||||
|
/// has no data or its layer was already built (resume-safe).
|
||||||
|
///
|
||||||
|
/// Abundance filtering is applied when `min_ab > 1` or `max_ab.is_some()`,
|
||||||
|
/// using `mphf1.bin` + `counts1.bin` if they exist.
|
||||||
|
/// Count payload is stored iff `with_counts` is true.
|
||||||
|
pub fn build_index_layer(
|
||||||
|
&self,
|
||||||
|
i: usize,
|
||||||
|
min_ab: u32,
|
||||||
|
max_ab: Option<u32>,
|
||||||
|
with_counts: bool,
|
||||||
|
) -> Result<usize, SKError> {
|
||||||
|
let part_dir = self.part_dir(i);
|
||||||
|
let dedup_path = part_dir.join("dereplicated.skmer.zst");
|
||||||
|
if !dedup_path.exists() {
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
let layer_dir = part_dir.join("index").join("layer_0");
|
||||||
|
if layer_dir.join("mphf.bin").exists() {
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
let filter_active = min_ab > 1 || max_ab.is_some();
|
||||||
|
let need_counts = filter_active || with_counts;
|
||||||
|
|
||||||
|
let mphf1_opt: Option<Mphf> = if need_counts {
|
||||||
|
let p = part_dir.join("mphf1.bin");
|
||||||
|
p.exists().then(|| Mphf::load_full(&p).ok()).flatten()
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let counts1_opt: Option<PersistentCompactIntVec> = if need_counts {
|
||||||
|
let p = part_dir.join("counts1.bin");
|
||||||
|
p.exists()
|
||||||
|
.then(|| PersistentCompactIntVec::open(&p).ok())
|
||||||
|
.flatten()
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut g = GraphDeBruijn::new();
|
||||||
|
let mut reader = SKFileReader::open(&dedup_path)?;
|
||||||
|
for sk in reader.iter() {
|
||||||
|
for kmer in sk.iter_canonical_kmers() {
|
||||||
|
let accept = if filter_active {
|
||||||
|
match (&mphf1_opt, &counts1_opt) {
|
||||||
|
(Some(mphf), Some(counts)) => {
|
||||||
|
let ab = counts.get(mphf.index(&kmer.raw()));
|
||||||
|
ab >= min_ab && max_ab.map_or(true, |max| ab <= max)
|
||||||
|
}
|
||||||
|
_ => true,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
true
|
||||||
|
};
|
||||||
|
if accept {
|
||||||
|
g.push(kmer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let n_kmers = g.len();
|
||||||
|
g.compute_degrees();
|
||||||
|
|
||||||
|
fs::create_dir_all(&layer_dir)?;
|
||||||
|
|
||||||
|
let mut uw = Layer::<()>::unitig_writer(&layer_dir).map_err(olm_to_sk)?;
|
||||||
|
for unitig in g.iter_unitig() {
|
||||||
|
uw.write(&unitig)?;
|
||||||
|
}
|
||||||
|
uw.close()?;
|
||||||
|
|
||||||
|
if with_counts {
|
||||||
|
Layer::<PersistentCompactIntMatrix>::build(&layer_dir, |kmer| {
|
||||||
|
match (&mphf1_opt, &counts1_opt) {
|
||||||
|
(Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
|
||||||
|
_ => 1,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.map_err(olm_to_sk)?;
|
||||||
|
} else {
|
||||||
|
Layer::<()>::build(&layer_dir).map_err(olm_to_sk)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(n_kmers)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Remove intermediate build artifacts for partition `i`.
|
||||||
|
///
|
||||||
|
/// Deletes `dereplicated.skmer.zst` (+ sidecar), `mphf1.bin`, `counts1.bin`.
|
||||||
|
pub fn remove_build_artifacts(&self, i: usize) {
|
||||||
|
let part_dir = self.part_dir(i);
|
||||||
|
let dedup = part_dir.join("dereplicated.skmer.zst");
|
||||||
|
remove_if_exists(&SKFileMeta::sidecar_path(&dedup));
|
||||||
|
remove_if_exists(&dedup);
|
||||||
|
remove_if_exists(&part_dir.join("mphf1.bin"));
|
||||||
|
remove_if_exists(&part_dir.join("counts1.bin"));
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
|
mod index_layer;
|
||||||
mod kmer_sort;
|
mod kmer_sort;
|
||||||
mod partition;
|
mod partition;
|
||||||
|
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult};
|
|||||||
use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
use remove_dir_all::remove_dir_all;
|
use remove_dir_all::remove_dir_all;
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use sysinfo::System;
|
use sysinfo::System;
|
||||||
|
|
||||||
use niffler::Level;
|
use niffler::Level;
|
||||||
@@ -28,18 +27,9 @@ use crate::kmer_sort::{chunk_size_from_ram, sort_unique_kmers};
|
|||||||
|
|
||||||
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||||
|
|
||||||
const META_FILENAME: &str = "partition.meta";
|
|
||||||
const SK_EXT: &str = "skmer.zst";
|
const SK_EXT: &str = "skmer.zst";
|
||||||
pub const PARTITIONS_SUBDIR: &str = "partitions";
|
pub const PARTITIONS_SUBDIR: &str = "partitions";
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
|
||||||
struct PartitionMeta {
|
|
||||||
n_bits: usize,
|
|
||||||
kmer_size: usize,
|
|
||||||
minimizer_size: usize,
|
|
||||||
level: u32,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct KmerPartition {
|
pub struct KmerPartition {
|
||||||
root_path: PathBuf,
|
root_path: PathBuf,
|
||||||
n_partitions: usize,
|
n_partitions: usize,
|
||||||
@@ -98,11 +88,15 @@ impl KmerPartition {
|
|||||||
level,
|
level,
|
||||||
closed: false,
|
closed: false,
|
||||||
};
|
};
|
||||||
partition.write_meta(n_bits)?;
|
|
||||||
Ok(partition)
|
Ok(partition)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn open<P: AsRef<Path>>(path: P) -> SKResult<Self> {
|
pub fn open_with_config<P: AsRef<Path>>(
|
||||||
|
path: P,
|
||||||
|
kmer_size: usize,
|
||||||
|
minimizer_size: usize,
|
||||||
|
n_bits: usize,
|
||||||
|
) -> SKResult<Self> {
|
||||||
let root_path = path.as_ref().to_owned();
|
let root_path = path.as_ref().to_owned();
|
||||||
if !root_path.exists() {
|
if !root_path.exists() {
|
||||||
return Err(io::Error::new(
|
return Err(io::Error::new(
|
||||||
@@ -111,22 +105,17 @@ impl KmerPartition {
|
|||||||
)
|
)
|
||||||
.into());
|
.into());
|
||||||
}
|
}
|
||||||
let meta_path = root_path.join(META_FILENAME);
|
let n_partitions = 1usize << n_bits;
|
||||||
let meta: PartitionMeta =
|
|
||||||
serde_json::from_reader(fs::File::open(&meta_path)?).map_err(io::Error::other)?;
|
|
||||||
|
|
||||||
let level = level_from_u32(meta.level);
|
|
||||||
let n_partitions = 1usize << meta.n_bits;
|
|
||||||
let writers = (0..n_partitions).map(|_| None).collect();
|
let writers = (0..n_partitions).map(|_| None).collect();
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
root_path,
|
root_path,
|
||||||
n_partitions,
|
n_partitions,
|
||||||
partitions_mask: (1u64 << meta.n_bits) - 1,
|
partitions_mask: (1u64 << n_bits) - 1,
|
||||||
kmer_size: meta.kmer_size,
|
kmer_size,
|
||||||
minimizer_size: meta.minimizer_size,
|
minimizer_size,
|
||||||
writers,
|
writers,
|
||||||
level,
|
level: Level::One,
|
||||||
closed: true, // read-only: writing is not allowed on an opened partition
|
closed: true,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -339,18 +328,6 @@ impl KmerPartition {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_meta(&self, n_bits: usize) -> SKResult<()> {
|
|
||||||
let meta = PartitionMeta {
|
|
||||||
n_bits,
|
|
||||||
kmer_size: self.kmer_size,
|
|
||||||
minimizer_size: self.minimizer_size,
|
|
||||||
level: u32::from(self.level),
|
|
||||||
};
|
|
||||||
let f = fs::File::create(self.root_path.join(META_FILENAME))?;
|
|
||||||
serde_json::to_writer_pretty(f, &meta).map_err(|e| io::Error::other(e))?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn ensure_writer(&mut self, partition: usize) -> SKResult<&mut SKFileWriter> {
|
fn ensure_writer(&mut self, partition: usize) -> SKResult<&mut SKFileWriter> {
|
||||||
if self.writers[partition].is_none() {
|
if self.writers[partition].is_none() {
|
||||||
let dir = self.root_path.join(PARTITIONS_SUBDIR).join(format!("part_{:05}", partition));
|
let dir = self.root_path.join(PARTITIONS_SUBDIR).join(format!("part_{:05}", partition));
|
||||||
@@ -411,32 +388,6 @@ fn optimal_buckets(raw_path: &Path, available_bytes: u64) -> usize {
|
|||||||
n.next_power_of_two() as usize
|
n.next_power_of_two() as usize
|
||||||
}
|
}
|
||||||
|
|
||||||
fn level_from_u32(n: u32) -> Level {
|
|
||||||
match n {
|
|
||||||
0 => Level::Zero,
|
|
||||||
1 => Level::One,
|
|
||||||
2 => Level::Two,
|
|
||||||
3 => Level::Three,
|
|
||||||
4 => Level::Four,
|
|
||||||
5 => Level::Five,
|
|
||||||
6 => Level::Six,
|
|
||||||
7 => Level::Seven,
|
|
||||||
8 => Level::Eight,
|
|
||||||
9 => Level::Nine,
|
|
||||||
10 => Level::Ten,
|
|
||||||
11 => Level::Eleven,
|
|
||||||
12 => Level::Twelve,
|
|
||||||
13 => Level::Thirteen,
|
|
||||||
14 => Level::Fourteen,
|
|
||||||
15 => Level::Fifteen,
|
|
||||||
16 => Level::Sixteen,
|
|
||||||
17 => Level::Seventeen,
|
|
||||||
18 => Level::Eighteen,
|
|
||||||
19 => Level::Nineteen,
|
|
||||||
20 => Level::Twenty,
|
|
||||||
_ => Level::TwentyOne,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Maximum value that fits in the 24-bit COUNT field of a SuperKmer header.
|
/// Maximum value that fits in the 24-bit COUNT field of a SuperKmer header.
|
||||||
const MAX_SK_COUNT: u64 = (1 << 24) - 1;
|
const MAX_SK_COUNT: u64 = (1 << 24) - 1;
|
||||||
|
|||||||
Reference in New Issue
Block a user