mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor k-mer index management with subcommands and enhanced metadata support
This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system.

Key changes:
- Introduce the obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in k-mer set groups
- Implement the ability to append to existing k-mer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve the k-mer index summary with Jaccard distance matrix support
- Remove the deprecated obikindex and obisuperkmer commands
- Update the build process to use the new subcommand structure
This commit is contained in:
@@ -32,15 +32,17 @@ func WithMinFrequency(minFreq int) BuilderOption {
|
||||
// partitioned by minimizer. On Close(), each partition is finalized
|
||||
// (sort, dedup, optional frequency filter) into .kdi files.
|
||||
type KmerSetGroupBuilder struct {
|
||||
dir string
|
||||
k int
|
||||
m int
|
||||
n int // number of sets
|
||||
P int // number of partitions
|
||||
config builderConfig
|
||||
writers [][]*SkmWriter // [setIndex][partIndex]
|
||||
mu [][]sync.Mutex // per-writer mutex for concurrent access
|
||||
closed bool
|
||||
dir string
|
||||
k int
|
||||
m int
|
||||
n int // number of NEW sets being built
|
||||
P int // number of partitions
|
||||
startIndex int // first set index (0 for new groups, existingN for appends)
|
||||
config builderConfig
|
||||
existing *KmerSetGroup // non-nil when appending to existing group
|
||||
writers [][]*SkmWriter // [setIndex][partIndex] (local index 0..n-1)
|
||||
mu [][]sync.Mutex // per-writer mutex for concurrent access
|
||||
closed bool
|
||||
}
|
||||
|
||||
// NewKmerSetGroupBuilder creates a builder for a new KmerSetGroup.
|
||||
@@ -127,17 +129,94 @@ func NewKmerSetGroupBuilder(directory string, k, m, n, P int,
|
||||
}
|
||||
|
||||
return &KmerSetGroupBuilder{
|
||||
dir: directory,
|
||||
k: k,
|
||||
m: m,
|
||||
n: n,
|
||||
P: P,
|
||||
config: config,
|
||||
writers: writers,
|
||||
mu: mutexes,
|
||||
dir: directory,
|
||||
k: k,
|
||||
m: m,
|
||||
n: n,
|
||||
P: P,
|
||||
startIndex: 0,
|
||||
config: config,
|
||||
writers: writers,
|
||||
mu: mutexes,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// AppendKmerSetGroupBuilder opens an existing KmerSetGroup and creates
|
||||
// a builder that adds n new sets starting from the existing set count.
|
||||
// The k, m, and partitions are inherited from the existing group.
|
||||
func AppendKmerSetGroupBuilder(directory string, n int, options ...BuilderOption) (*KmerSetGroupBuilder, error) {
|
||||
existing, err := OpenKmerSetGroup(directory)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("obikmer: open existing group: %w", err)
|
||||
}
|
||||
|
||||
if n < 1 {
|
||||
return nil, fmt.Errorf("obikmer: n must be >= 1, got %d", n)
|
||||
}
|
||||
|
||||
k := existing.K()
|
||||
m := existing.M()
|
||||
P := existing.Partitions()
|
||||
startIndex := existing.Size()
|
||||
|
||||
var config builderConfig
|
||||
for _, opt := range options {
|
||||
opt(&config)
|
||||
}
|
||||
|
||||
// Create build directory structure for new sets
|
||||
buildDir := filepath.Join(directory, ".build")
|
||||
for s := 0; s < n; s++ {
|
||||
setDir := filepath.Join(buildDir, fmt.Sprintf("set_%d", s))
|
||||
if err := os.MkdirAll(setDir, 0755); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: create build dir: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Create SKM writers for new sets
|
||||
writers := make([][]*SkmWriter, n)
|
||||
mutexes := make([][]sync.Mutex, n)
|
||||
for s := 0; s < n; s++ {
|
||||
writers[s] = make([]*SkmWriter, P)
|
||||
mutexes[s] = make([]sync.Mutex, P)
|
||||
for p := 0; p < P; p++ {
|
||||
path := filepath.Join(buildDir, fmt.Sprintf("set_%d", s),
|
||||
fmt.Sprintf("part_%04d.skm", p))
|
||||
w, err := NewSkmWriter(path)
|
||||
if err != nil {
|
||||
for ss := 0; ss <= s; ss++ {
|
||||
for pp := 0; pp < P; pp++ {
|
||||
if writers[ss][pp] != nil {
|
||||
writers[ss][pp].Close()
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("obikmer: create skm writer: %w", err)
|
||||
}
|
||||
writers[s][p] = w
|
||||
}
|
||||
}
|
||||
|
||||
return &KmerSetGroupBuilder{
|
||||
dir: directory,
|
||||
k: k,
|
||||
m: m,
|
||||
n: n,
|
||||
P: P,
|
||||
startIndex: startIndex,
|
||||
config: config,
|
||||
existing: existing,
|
||||
writers: writers,
|
||||
mu: mutexes,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// StartIndex returns the first global set index for the new sets being built.
|
||||
// For new groups this is 0; for appends it is the existing group's Size().
|
||||
func (b *KmerSetGroupBuilder) StartIndex() int {
|
||||
return b.startIndex
|
||||
}
|
||||
|
||||
// AddSequence extracts super-kmers from a sequence and writes them
|
||||
// to the appropriate partition files for the given set.
|
||||
func (b *KmerSetGroupBuilder) AddSequence(setIndex int, seq *obiseq.BioSequence) {
|
||||
@@ -193,9 +272,10 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Create output directory structure
|
||||
// 2. Create output directory structure for new sets
|
||||
for s := 0; s < b.n; s++ {
|
||||
setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", s))
|
||||
globalIdx := b.startIndex + s
|
||||
setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx))
|
||||
if err := os.MkdirAll(setDir, 0755); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: create set dir: %w", err)
|
||||
}
|
||||
@@ -251,24 +331,44 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
||||
}
|
||||
|
||||
// 3. Build KmerSetGroup and write metadata
|
||||
totalCounts := make([]uint64, b.n)
|
||||
newCounts := make([]uint64, b.n)
|
||||
for s := 0; s < b.n; s++ {
|
||||
for p := 0; p < b.P; p++ {
|
||||
totalCounts[s] += counts[s][p]
|
||||
newCounts[s] += counts[s][p]
|
||||
}
|
||||
}
|
||||
|
||||
setsIDs := make([]string, b.n)
|
||||
var ksg *KmerSetGroup
|
||||
|
||||
ksg := &KmerSetGroup{
|
||||
path: b.dir,
|
||||
k: b.k,
|
||||
m: b.m,
|
||||
partitions: b.P,
|
||||
n: b.n,
|
||||
setsIDs: setsIDs,
|
||||
counts: totalCounts,
|
||||
Metadata: make(map[string]interface{}),
|
||||
if b.existing != nil {
|
||||
// Append mode: extend existing group
|
||||
ksg = b.existing
|
||||
ksg.n += b.n
|
||||
ksg.setsIDs = append(ksg.setsIDs, make([]string, b.n)...)
|
||||
ksg.counts = append(ksg.counts, newCounts...)
|
||||
newMeta := make([]map[string]interface{}, b.n)
|
||||
for i := range newMeta {
|
||||
newMeta[i] = make(map[string]interface{})
|
||||
}
|
||||
ksg.setsMetadata = append(ksg.setsMetadata, newMeta...)
|
||||
} else {
|
||||
// New group
|
||||
setsIDs := make([]string, b.n)
|
||||
setsMetadata := make([]map[string]interface{}, b.n)
|
||||
for i := range setsMetadata {
|
||||
setsMetadata[i] = make(map[string]interface{})
|
||||
}
|
||||
ksg = &KmerSetGroup{
|
||||
path: b.dir,
|
||||
k: b.k,
|
||||
m: b.m,
|
||||
partitions: b.P,
|
||||
n: b.n,
|
||||
setsIDs: setsIDs,
|
||||
counts: newCounts,
|
||||
setsMetadata: setsMetadata,
|
||||
Metadata: make(map[string]interface{}),
|
||||
}
|
||||
}
|
||||
|
||||
if err := ksg.saveMetadata(); err != nil {
|
||||
@@ -285,12 +385,14 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
||||
// finalizePartition processes a single partition: load SKM, extract k-mers,
|
||||
// sort, dedup/count, write KDI.
|
||||
func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint64) error {
|
||||
// setIdx is local (0..n-1); build dirs use local index, output dirs use global
|
||||
skmPath := filepath.Join(b.dir, ".build",
|
||||
fmt.Sprintf("set_%d", setIdx),
|
||||
fmt.Sprintf("part_%04d.skm", partIdx))
|
||||
|
||||
globalIdx := b.startIndex + setIdx
|
||||
kdiPath := filepath.Join(b.dir,
|
||||
fmt.Sprintf("set_%d", setIdx),
|
||||
fmt.Sprintf("set_%d", globalIdx),
|
||||
fmt.Sprintf("part_%04d.kdi", partIdx))
|
||||
|
||||
// Load super-kmers and extract canonical k-mers
|
||||
|
||||
@@ -2,9 +2,12 @@ package obikmer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"iter"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
|
||||
@@ -42,28 +45,30 @@ func (f MetadataFormat) String() string {
|
||||
//
|
||||
// A KmerSetGroup with Size()==1 is effectively a KmerSet (singleton).
|
||||
type KmerSetGroup struct {
	path         string                   // root directory
	id           string                   // user-assigned identifier
	k            int                      // k-mer size
	m            int                      // minimizer size
	partitions   int                      // number of partitions P
	n            int                      // number of sets N
	setsIDs      []string                 // IDs of individual sets
	counts       []uint64                 // total k-mer count per set (sum over partitions)
	setsMetadata []map[string]interface{} // per-set user metadata
	Metadata     map[string]interface{}   // group-level user metadata
}
|
||||
|
||||
// diskMetadata is the TOML-serializable structure for metadata.toml.
//
// SetsIDs, Counts and SetsMetadata are parallel per-set slices, and Size is
// the number of sets. The optional entries carry omitempty, so they may be
// absent from the file when empty.
type diskMetadata struct {
	ID           string                   `toml:"id,omitempty"`
	K            int                      `toml:"k"`
	M            int                      `toml:"m"`
	Partitions   int                      `toml:"partitions"`
	Type         string                   `toml:"type"`
	Size         int                      `toml:"size"`
	SetsIDs      []string                 `toml:"sets_ids,omitempty"`
	Counts       []uint64                 `toml:"counts,omitempty"`
	SetsMetadata []map[string]interface{} `toml:"sets_metadata,omitempty"`
	UserMetadata map[string]interface{}   `toml:"user_metadata,omitempty"`
}
|
||||
|
||||
// OpenKmerSetGroup opens a finalized index directory in read-only mode.
|
||||
@@ -81,15 +86,16 @@ func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
|
||||
}
|
||||
|
||||
ksg := &KmerSetGroup{
|
||||
path: directory,
|
||||
id: meta.ID,
|
||||
k: meta.K,
|
||||
m: meta.M,
|
||||
partitions: meta.Partitions,
|
||||
n: meta.Size,
|
||||
setsIDs: meta.SetsIDs,
|
||||
counts: meta.Counts,
|
||||
Metadata: meta.UserMetadata,
|
||||
path: directory,
|
||||
id: meta.ID,
|
||||
k: meta.K,
|
||||
m: meta.M,
|
||||
partitions: meta.Partitions,
|
||||
n: meta.Size,
|
||||
setsIDs: meta.SetsIDs,
|
||||
counts: meta.Counts,
|
||||
setsMetadata: meta.SetsMetadata,
|
||||
Metadata: meta.UserMetadata,
|
||||
}
|
||||
if ksg.Metadata == nil {
|
||||
ksg.Metadata = make(map[string]interface{})
|
||||
@@ -97,6 +103,12 @@ func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
|
||||
if ksg.setsIDs == nil {
|
||||
ksg.setsIDs = make([]string, ksg.n)
|
||||
}
|
||||
if ksg.setsMetadata == nil {
|
||||
ksg.setsMetadata = make([]map[string]interface{}, ksg.n)
|
||||
for i := range ksg.setsMetadata {
|
||||
ksg.setsMetadata[i] = make(map[string]interface{})
|
||||
}
|
||||
}
|
||||
if ksg.counts == nil {
|
||||
// Compute counts by scanning partitions
|
||||
ksg.counts = make([]uint64, ksg.n)
|
||||
@@ -133,6 +145,7 @@ func (ksg *KmerSetGroup) saveMetadata() error {
|
||||
Size: ksg.n,
|
||||
SetsIDs: ksg.setsIDs,
|
||||
Counts: ksg.counts,
|
||||
SetsMetadata: ksg.setsMetadata,
|
||||
UserMetadata: ksg.Metadata,
|
||||
}
|
||||
|
||||
@@ -578,3 +591,299 @@ func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
|
||||
|
||||
return sm
|
||||
}
|
||||
|
||||
// ==============================
|
||||
// Set ID accessors
|
||||
// ==============================
|
||||
|
||||
// SetsIDs returns a copy of the per-set string identifiers.
|
||||
func (ksg *KmerSetGroup) SetsIDs() []string {
|
||||
out := make([]string, len(ksg.setsIDs))
|
||||
copy(out, ksg.setsIDs)
|
||||
return out
|
||||
}
|
||||
|
||||
// SetIDOf returns the string ID of the set at the given index.
|
||||
// Returns "" if index is out of range.
|
||||
func (ksg *KmerSetGroup) SetIDOf(index int) string {
|
||||
if index < 0 || index >= ksg.n {
|
||||
return ""
|
||||
}
|
||||
return ksg.setsIDs[index]
|
||||
}
|
||||
|
||||
// SetSetID sets the string ID of the set at the given index.
|
||||
func (ksg *KmerSetGroup) SetSetID(index int, id string) {
|
||||
if index >= 0 && index < ksg.n {
|
||||
ksg.setsIDs[index] = id
|
||||
}
|
||||
}
|
||||
|
||||
// IndexOfSetID returns the numeric index for a set ID, or -1 if not found.
|
||||
func (ksg *KmerSetGroup) IndexOfSetID(id string) int {
|
||||
for i, sid := range ksg.setsIDs {
|
||||
if sid == id {
|
||||
return i
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// MatchSetIDs resolves glob patterns against set IDs and returns matching
|
||||
// indices sorted in ascending order. Uses path.Match for pattern matching
|
||||
// (supports *, ?, [...] patterns). Returns error if a pattern is malformed.
|
||||
func (ksg *KmerSetGroup) MatchSetIDs(patterns []string) ([]int, error) {
|
||||
seen := make(map[int]bool)
|
||||
for _, pattern := range patterns {
|
||||
for i, sid := range ksg.setsIDs {
|
||||
matched, err := path.Match(pattern, sid)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("obikmer: invalid glob pattern %q: %w", pattern, err)
|
||||
}
|
||||
if matched {
|
||||
seen[i] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
result := make([]int, 0, len(seen))
|
||||
for idx := range seen {
|
||||
result = append(result, idx)
|
||||
}
|
||||
sort.Ints(result)
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// ==============================
|
||||
// Per-set metadata accessors
|
||||
// ==============================
|
||||
|
||||
// GetSetMetadata returns the value of a per-set metadata key.
|
||||
func (ksg *KmerSetGroup) GetSetMetadata(setIndex int, key string) (interface{}, bool) {
|
||||
if setIndex < 0 || setIndex >= ksg.n {
|
||||
return nil, false
|
||||
}
|
||||
v, ok := ksg.setsMetadata[setIndex][key]
|
||||
return v, ok
|
||||
}
|
||||
|
||||
// SetSetMetadata sets a per-set metadata attribute.
|
||||
func (ksg *KmerSetGroup) SetSetMetadata(setIndex int, key string, value interface{}) {
|
||||
if setIndex < 0 || setIndex >= ksg.n {
|
||||
return
|
||||
}
|
||||
if ksg.setsMetadata[setIndex] == nil {
|
||||
ksg.setsMetadata[setIndex] = make(map[string]interface{})
|
||||
}
|
||||
ksg.setsMetadata[setIndex][key] = value
|
||||
}
|
||||
|
||||
// DeleteSetMetadata removes a per-set metadata attribute.
|
||||
func (ksg *KmerSetGroup) DeleteSetMetadata(setIndex int, key string) {
|
||||
if setIndex < 0 || setIndex >= ksg.n {
|
||||
return
|
||||
}
|
||||
delete(ksg.setsMetadata[setIndex], key)
|
||||
}
|
||||
|
||||
// AllSetMetadata returns a copy of all metadata for a given set.
|
||||
func (ksg *KmerSetGroup) AllSetMetadata(setIndex int) map[string]interface{} {
|
||||
if setIndex < 0 || setIndex >= ksg.n {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]interface{}, len(ksg.setsMetadata[setIndex]))
|
||||
for k, v := range ksg.setsMetadata[setIndex] {
|
||||
out[k] = v
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// ==============================
|
||||
// Exported partition path and compatibility
|
||||
// ==============================
|
||||
|
||||
// PartitionPath returns the file path for partition partIndex of set setIndex.
|
||||
func (ksg *KmerSetGroup) PartitionPath(setIndex, partIndex int) string {
|
||||
return ksg.partitionPath(setIndex, partIndex)
|
||||
}
|
||||
|
||||
// IsCompatibleWith returns true if the other group has the same k, m, and partitions.
|
||||
func (ksg *KmerSetGroup) IsCompatibleWith(other *KmerSetGroup) bool {
|
||||
return ksg.k == other.k && ksg.m == other.m && ksg.partitions == other.partitions
|
||||
}
|
||||
|
||||
// ==============================
|
||||
// Set management operations
|
||||
// ==============================
|
||||
|
||||
// NewEmptyCompatible creates an empty KmerSetGroup at destDir with the same
|
||||
// k, m, and partitions as this group. The destination must not already exist.
|
||||
func (ksg *KmerSetGroup) NewEmptyCompatible(destDir string) (*KmerSetGroup, error) {
|
||||
if err := os.MkdirAll(destDir, 0755); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: create directory: %w", err)
|
||||
}
|
||||
|
||||
dest := &KmerSetGroup{
|
||||
path: destDir,
|
||||
k: ksg.k,
|
||||
m: ksg.m,
|
||||
partitions: ksg.partitions,
|
||||
n: 0,
|
||||
setsIDs: []string{},
|
||||
counts: []uint64{},
|
||||
setsMetadata: []map[string]interface{}{},
|
||||
Metadata: make(map[string]interface{}),
|
||||
}
|
||||
|
||||
if err := dest.saveMetadata(); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: write metadata: %w", err)
|
||||
}
|
||||
|
||||
return dest, nil
|
||||
}
|
||||
|
||||
// RemoveSetByID removes the set with the given ID from the group.
|
||||
// It deletes the set directory, renumbers all subsequent sets, and
|
||||
// updates the metadata on disk.
|
||||
func (ksg *KmerSetGroup) RemoveSetByID(id string) error {
|
||||
idx := ksg.IndexOfSetID(id)
|
||||
if idx < 0 {
|
||||
return fmt.Errorf("obikmer: set ID %q not found", id)
|
||||
}
|
||||
|
||||
// Delete the set directory
|
||||
setDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", idx))
|
||||
if err := os.RemoveAll(setDir); err != nil {
|
||||
return fmt.Errorf("obikmer: remove set directory: %w", err)
|
||||
}
|
||||
|
||||
// Renumber subsequent sets
|
||||
for i := idx + 1; i < ksg.n; i++ {
|
||||
oldDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i))
|
||||
newDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i-1))
|
||||
if err := os.Rename(oldDir, newDir); err != nil {
|
||||
return fmt.Errorf("obikmer: rename set_%d to set_%d: %w", i, i-1, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Update slices
|
||||
ksg.setsIDs = append(ksg.setsIDs[:idx], ksg.setsIDs[idx+1:]...)
|
||||
ksg.counts = append(ksg.counts[:idx], ksg.counts[idx+1:]...)
|
||||
ksg.setsMetadata = append(ksg.setsMetadata[:idx], ksg.setsMetadata[idx+1:]...)
|
||||
ksg.n--
|
||||
|
||||
return ksg.saveMetadata()
|
||||
}
|
||||
|
||||
// CopySetsByIDTo copies sets identified by their IDs into a KmerSetGroup
|
||||
// at destDir. If destDir does not exist, a new compatible empty group is
|
||||
// created. If it exists, compatibility (k, m, partitions) is checked.
|
||||
// If a set ID already exists in the destination, an error is returned
|
||||
// unless force is true (in which case the existing set is replaced).
|
||||
// Per-set metadata travels with the set.
|
||||
func (ksg *KmerSetGroup) CopySetsByIDTo(ids []string, destDir string, force bool) (*KmerSetGroup, error) {
|
||||
// Resolve source IDs to indices
|
||||
srcIndices := make([]int, len(ids))
|
||||
for i, id := range ids {
|
||||
idx := ksg.IndexOfSetID(id)
|
||||
if idx < 0 {
|
||||
return nil, fmt.Errorf("obikmer: source set ID %q not found", id)
|
||||
}
|
||||
srcIndices[i] = idx
|
||||
}
|
||||
|
||||
// Open or create destination
|
||||
var dest *KmerSetGroup
|
||||
metaPath := filepath.Join(destDir, "metadata.toml")
|
||||
if _, err := os.Stat(metaPath); err == nil {
|
||||
// Destination exists
|
||||
dest, err = OpenKmerSetGroup(destDir)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("obikmer: open destination: %w", err)
|
||||
}
|
||||
if !ksg.IsCompatibleWith(dest) {
|
||||
return nil, fmt.Errorf("obikmer: incompatible groups: source (k=%d, m=%d, P=%d) vs dest (k=%d, m=%d, P=%d)",
|
||||
ksg.k, ksg.m, ksg.partitions, dest.k, dest.m, dest.partitions)
|
||||
}
|
||||
} else {
|
||||
// Create new destination
|
||||
var err error
|
||||
dest, err = ksg.NewEmptyCompatible(destDir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// Copy each set
|
||||
for i, srcIdx := range srcIndices {
|
||||
srcID := ids[i]
|
||||
|
||||
// Check for ID conflict in destination
|
||||
existingIdx := dest.IndexOfSetID(srcID)
|
||||
if existingIdx >= 0 {
|
||||
if !force {
|
||||
return nil, fmt.Errorf("obikmer: set ID %q already exists in destination (use force to replace)", srcID)
|
||||
}
|
||||
// Force: remove existing set in destination
|
||||
if err := dest.RemoveSetByID(srcID); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: remove existing set %q in destination: %w", srcID, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Destination set index = current dest size
|
||||
destIdx := dest.n
|
||||
|
||||
// Create destination set directory
|
||||
destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", destIdx))
|
||||
if err := os.MkdirAll(destSetDir, 0755); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: create dest set dir: %w", err)
|
||||
}
|
||||
|
||||
// Copy all partition files
|
||||
for p := 0; p < ksg.partitions; p++ {
|
||||
srcPath := ksg.partitionPath(srcIdx, p)
|
||||
destPath := dest.partitionPath(destIdx, p)
|
||||
if err := copyFile(srcPath, destPath); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: copy partition %d of set %q: %w", p, srcID, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Update destination metadata
|
||||
dest.setsIDs = append(dest.setsIDs, srcID)
|
||||
dest.counts = append(dest.counts, ksg.counts[srcIdx])
|
||||
|
||||
// Copy per-set metadata
|
||||
srcMeta := ksg.AllSetMetadata(srcIdx)
|
||||
if srcMeta == nil {
|
||||
srcMeta = make(map[string]interface{})
|
||||
}
|
||||
dest.setsMetadata = append(dest.setsMetadata, srcMeta)
|
||||
dest.n++
|
||||
}
|
||||
|
||||
if err := dest.saveMetadata(); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: save destination metadata: %w", err)
|
||||
}
|
||||
|
||||
return dest, nil
|
||||
}
|
||||
|
||||
// copyFile copies the contents of the file at src into dst.
//
// dst is created, or truncated if it already exists. The destination is
// closed exactly once and a close failure is reported, since it can mean the
// data was never flushed. If the copy or the close fails, the incomplete
// destination file is removed so that no partial file is left behind.
//
// It returns the first error encountered while opening src, creating dst,
// copying, or closing dst.
func copyFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer func() {
		if closeErr := out.Close(); err == nil {
			err = closeErr
		}
		if err != nil {
			// Best-effort cleanup: the original failure is what the caller
			// needs to see, so a removal error is deliberately dropped.
			_ = os.Remove(dst)
		}
	}()

	_, err = io.Copy(out, in)
	return err
}
|
||||
|
||||
Reference in New Issue
Block a user