2026-02-09 21:57:03 +01:00
|
|
|
package obikmer
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"fmt"
|
Refactor k-mer index management with subcommands and enhanced metadata support
This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system.
Key changes:
- Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in kmer set groups
- Implement ability to append to existing kmer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve kmer index summary with Jaccard distance matrix support
- Remove deprecated obikindex and obisuperkmer commands
- Update build process to use the new subcommand structure
2026-02-09 23:10:30 +01:00
|
|
|
"io"
|
2026-02-09 21:57:03 +01:00
|
|
|
"iter"
|
|
|
|
|
"os"
|
Refactor k-mer index management with subcommands and enhanced metadata support
This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system.
Key changes:
- Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in kmer set groups
- Implement ability to append to existing kmer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve kmer index summary with Jaccard distance matrix support
- Remove deprecated obikindex and obisuperkmer commands
- Update build process to use the new subcommand structure
2026-02-09 23:10:30 +01:00
|
|
|
"path"
|
2026-02-09 21:57:03 +01:00
|
|
|
"path/filepath"
|
Refactor k-mer index management with subcommands and enhanced metadata support
This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system.
Key changes:
- Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in kmer set groups
- Implement ability to append to existing kmer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve kmer index summary with Jaccard distance matrix support
- Remove deprecated obikindex and obisuperkmer commands
- Update build process to use the new subcommand structure
2026-02-09 23:10:30 +01:00
|
|
|
"sort"
|
2026-02-09 21:57:03 +01:00
|
|
|
"sync"
|
|
|
|
|
|
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
|
|
|
|
|
"github.com/pelletier/go-toml/v2"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// MetadataFormat identifies a metadata serialization format.
// Disk-based indices are currently written as TOML only; the type and its
// other values are retained for backward compatibility with CLI options.
type MetadataFormat int

// Known metadata formats. FormatTOML is the zero value.
const (
	FormatTOML MetadataFormat = iota
	FormatYAML
	FormatJSON
)

// String returns the file extension for the format.
// Any value other than FormatYAML or FormatJSON is reported as "toml".
func (f MetadataFormat) String() string {
	switch f {
	case FormatYAML:
		return "yaml"
	case FormatJSON:
		return "json"
	}
	return "toml"
}
|
|
|
|
|
|
|
|
|
|
// KmerSetGroup describes a disk-based collection of N k-mer sets that share
// the same k-mer size k, minimizer size m and partition count P.
//
// Groups are produced by KmerSetGroupBuilder; the k-mer data itself is
// accessed partition by partition from files below the root directory.
//
// A KmerSetGroup with Size()==1 is effectively a KmerSet (singleton).
type KmerSetGroup struct {
	// Location and identity.
	path string // root directory
	id   string // user-assigned identifier

	// Index geometry.
	k          int // k-mer size
	m          int // minimizer size
	partitions int // number of partitions P
	n          int // number of sets N

	// Per-set information, indexed by set number.
	setsIDs      []string                 // IDs of individual sets
	counts       []uint64                 // total k-mer count per set (sum over partitions)
	setsMetadata []map[string]interface{} // per-set user metadata

	// Metadata holds the group-level user metadata.
	Metadata map[string]interface{}
}
|
|
|
|
|
|
|
|
|
|
// diskMetadata mirrors the content of metadata.toml. It is the structure
// handed to the TOML encoder and decoder; field order is the order in which
// the struct is declared.
type diskMetadata struct {
	// Group identity and geometry.
	ID         string `toml:"id,omitempty"`
	K          int    `toml:"k"`
	M          int    `toml:"m"`
	Partitions int    `toml:"partitions"`
	Type       string `toml:"type"`
	Size       int    `toml:"size"`

	// Per-set information, indexed by set number.
	SetsIDs      []string                 `toml:"sets_ids,omitempty"`
	Counts       []uint64                 `toml:"counts,omitempty"`
	SetsMetadata []map[string]interface{} `toml:"sets_metadata,omitempty"`

	// Group-level user metadata.
	UserMetadata map[string]interface{} `toml:"user_metadata,omitempty"`
}
|
|
|
|
|
|
|
|
|
|
// OpenKmerSetGroup opens a finalized index directory in read-only mode.
|
|
|
|
|
func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
|
|
|
|
|
metaPath := filepath.Join(directory, "metadata.toml")
|
|
|
|
|
f, err := os.Open(metaPath)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: open metadata: %w", err)
|
|
|
|
|
}
|
|
|
|
|
defer f.Close()
|
|
|
|
|
|
|
|
|
|
var meta diskMetadata
|
|
|
|
|
if err := toml.NewDecoder(f).Decode(&meta); err != nil {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: decode metadata: %w", err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ksg := &KmerSetGroup{
|
Refactor k-mer index management with subcommands and enhanced metadata support
This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system.
Key changes:
- Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in kmer set groups
- Implement ability to append to existing kmer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve kmer index summary with Jaccard distance matrix support
- Remove deprecated obikindex and obisuperkmer commands
- Update build process to use the new subcommand structure
2026-02-09 23:10:30 +01:00
|
|
|
path: directory,
|
|
|
|
|
id: meta.ID,
|
|
|
|
|
k: meta.K,
|
|
|
|
|
m: meta.M,
|
|
|
|
|
partitions: meta.Partitions,
|
|
|
|
|
n: meta.Size,
|
|
|
|
|
setsIDs: meta.SetsIDs,
|
|
|
|
|
counts: meta.Counts,
|
|
|
|
|
setsMetadata: meta.SetsMetadata,
|
|
|
|
|
Metadata: meta.UserMetadata,
|
2026-02-09 21:57:03 +01:00
|
|
|
}
|
|
|
|
|
if ksg.Metadata == nil {
|
|
|
|
|
ksg.Metadata = make(map[string]interface{})
|
|
|
|
|
}
|
|
|
|
|
if ksg.setsIDs == nil {
|
|
|
|
|
ksg.setsIDs = make([]string, ksg.n)
|
|
|
|
|
}
|
Refactor k-mer index management with subcommands and enhanced metadata support
This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system.
Key changes:
- Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in kmer set groups
- Implement ability to append to existing kmer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve kmer index summary with Jaccard distance matrix support
- Remove deprecated obikindex and obisuperkmer commands
- Update build process to use the new subcommand structure
2026-02-09 23:10:30 +01:00
|
|
|
if ksg.setsMetadata == nil {
|
|
|
|
|
ksg.setsMetadata = make([]map[string]interface{}, ksg.n)
|
|
|
|
|
for i := range ksg.setsMetadata {
|
|
|
|
|
ksg.setsMetadata[i] = make(map[string]interface{})
|
|
|
|
|
}
|
|
|
|
|
}
|
2026-02-09 21:57:03 +01:00
|
|
|
if ksg.counts == nil {
|
|
|
|
|
// Compute counts by scanning partitions
|
|
|
|
|
ksg.counts = make([]uint64, ksg.n)
|
|
|
|
|
for s := 0; s < ksg.n; s++ {
|
|
|
|
|
for p := 0; p < ksg.partitions; p++ {
|
|
|
|
|
path := ksg.partitionPath(s, p)
|
|
|
|
|
r, err := NewKdiReader(path)
|
|
|
|
|
if err != nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
ksg.counts[s] += r.Count()
|
|
|
|
|
r.Close()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return ksg, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// SaveMetadata writes the metadata.toml file. This is useful after
|
|
|
|
|
// modifying attributes or IDs on an already-finalized index.
|
|
|
|
|
func (ksg *KmerSetGroup) SaveMetadata() error {
|
|
|
|
|
return ksg.saveMetadata()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// saveMetadata writes the metadata.toml file (internal).
|
|
|
|
|
func (ksg *KmerSetGroup) saveMetadata() error {
|
|
|
|
|
meta := diskMetadata{
|
|
|
|
|
ID: ksg.id,
|
|
|
|
|
K: ksg.k,
|
|
|
|
|
M: ksg.m,
|
|
|
|
|
Partitions: ksg.partitions,
|
|
|
|
|
Type: "KmerSetGroup",
|
|
|
|
|
Size: ksg.n,
|
|
|
|
|
SetsIDs: ksg.setsIDs,
|
|
|
|
|
Counts: ksg.counts,
|
Refactor k-mer index management with subcommands and enhanced metadata support
This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system.
Key changes:
- Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in kmer set groups
- Implement ability to append to existing kmer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve kmer index summary with Jaccard distance matrix support
- Remove deprecated obikindex and obisuperkmer commands
- Update build process to use the new subcommand structure
2026-02-09 23:10:30 +01:00
|
|
|
SetsMetadata: ksg.setsMetadata,
|
2026-02-09 21:57:03 +01:00
|
|
|
UserMetadata: ksg.Metadata,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
metaPath := filepath.Join(ksg.path, "metadata.toml")
|
|
|
|
|
f, err := os.Create(metaPath)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
defer f.Close()
|
|
|
|
|
|
|
|
|
|
return toml.NewEncoder(f).Encode(meta)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// partitionPath returns the file path for partition p of set s.
|
|
|
|
|
func (ksg *KmerSetGroup) partitionPath(setIndex, partIndex int) string {
|
|
|
|
|
return filepath.Join(ksg.path, fmt.Sprintf("set_%d", setIndex),
|
|
|
|
|
fmt.Sprintf("part_%04d.kdi", partIndex))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Path returns the root directory of the index.
|
|
|
|
|
func (ksg *KmerSetGroup) Path() string {
|
|
|
|
|
return ksg.path
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// K returns the k-mer size.
|
|
|
|
|
func (ksg *KmerSetGroup) K() int {
|
|
|
|
|
return ksg.k
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// M returns the minimizer size.
|
|
|
|
|
func (ksg *KmerSetGroup) M() int {
|
|
|
|
|
return ksg.m
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Partitions returns the number of partitions P.
|
|
|
|
|
func (ksg *KmerSetGroup) Partitions() int {
|
|
|
|
|
return ksg.partitions
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Size returns the number of sets N.
|
|
|
|
|
func (ksg *KmerSetGroup) Size() int {
|
|
|
|
|
return ksg.n
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Id returns the group identifier.
|
|
|
|
|
func (ksg *KmerSetGroup) Id() string {
|
|
|
|
|
return ksg.id
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// SetId sets the group identifier and persists the change.
|
|
|
|
|
func (ksg *KmerSetGroup) SetId(id string) {
|
|
|
|
|
ksg.id = id
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Len returns the total number of k-mers.
|
|
|
|
|
// Without argument: total across all sets.
|
|
|
|
|
// With argument setIndex: count for that specific set.
|
|
|
|
|
func (ksg *KmerSetGroup) Len(setIndex ...int) uint64 {
|
|
|
|
|
if len(setIndex) == 0 {
|
|
|
|
|
var total uint64
|
|
|
|
|
for _, c := range ksg.counts {
|
|
|
|
|
total += c
|
|
|
|
|
}
|
|
|
|
|
return total
|
|
|
|
|
}
|
|
|
|
|
idx := setIndex[0]
|
|
|
|
|
if idx < 0 || idx >= ksg.n {
|
|
|
|
|
return 0
|
|
|
|
|
}
|
|
|
|
|
return ksg.counts[idx]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Contains checks if a k-mer is present in the specified set.
|
|
|
|
|
// Uses binary search on the appropriate partition's KDI file.
|
|
|
|
|
func (ksg *KmerSetGroup) Contains(setIndex int, kmer uint64) bool {
|
|
|
|
|
if setIndex < 0 || setIndex >= ksg.n {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
// Determine partition from minimizer
|
|
|
|
|
// For a canonical k-mer, we need to find which partition it would fall into.
|
|
|
|
|
// The partition is determined by the minimizer during construction.
|
|
|
|
|
// For Contains, we must scan all partitions of this set (linear search within each).
|
|
|
|
|
// A full binary-search approach would require an index file.
|
|
|
|
|
// For now, scan the partition determined by the k-mer's minimizer.
|
|
|
|
|
// Since we don't know the minimizer, we do a linear scan of all partitions.
|
|
|
|
|
// This is O(total_kmers / P) per partition on average.
|
|
|
|
|
|
|
|
|
|
// Optimization: scan all partitions in parallel
|
|
|
|
|
type result struct {
|
|
|
|
|
found bool
|
|
|
|
|
}
|
|
|
|
|
ch := make(chan result, ksg.partitions)
|
|
|
|
|
|
|
|
|
|
for p := 0; p < ksg.partitions; p++ {
|
|
|
|
|
go func(part int) {
|
|
|
|
|
r, err := NewKdiReader(ksg.partitionPath(setIndex, part))
|
|
|
|
|
if err != nil {
|
|
|
|
|
ch <- result{false}
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
defer r.Close()
|
|
|
|
|
for {
|
|
|
|
|
v, ok := r.Next()
|
|
|
|
|
if !ok {
|
|
|
|
|
ch <- result{false}
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if v == kmer {
|
|
|
|
|
ch <- result{true}
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if v > kmer {
|
|
|
|
|
ch <- result{false}
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}(p)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for i := 0; i < ksg.partitions; i++ {
|
|
|
|
|
res := <-ch
|
|
|
|
|
if res.found {
|
|
|
|
|
// Drain remaining goroutines
|
|
|
|
|
go func() {
|
|
|
|
|
for j := i + 1; j < ksg.partitions; j++ {
|
|
|
|
|
<-ch
|
|
|
|
|
}
|
|
|
|
|
}()
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Iterator returns an iterator over all k-mers in the specified set,
|
|
|
|
|
// in sorted order within each partition. Since partitions are independent,
|
|
|
|
|
// to get a globally sorted stream, use iteratorSorted.
|
|
|
|
|
func (ksg *KmerSetGroup) Iterator(setIndex int) iter.Seq[uint64] {
|
|
|
|
|
return func(yield func(uint64) bool) {
|
|
|
|
|
if setIndex < 0 || setIndex >= ksg.n {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Open all partition readers and merge them
|
|
|
|
|
readers := make([]*KdiReader, 0, ksg.partitions)
|
|
|
|
|
for p := 0; p < ksg.partitions; p++ {
|
|
|
|
|
r, err := NewKdiReader(ksg.partitionPath(setIndex, p))
|
|
|
|
|
if err != nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if r.Count() > 0 {
|
|
|
|
|
readers = append(readers, r)
|
|
|
|
|
} else {
|
|
|
|
|
r.Close()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if len(readers) == 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
m := NewKWayMerge(readers)
|
|
|
|
|
defer m.Close()
|
|
|
|
|
|
|
|
|
|
for {
|
|
|
|
|
kmer, _, ok := m.Next()
|
|
|
|
|
if !ok {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if !yield(kmer) {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ==============================
|
|
|
|
|
// Attribute API (compatible with old API)
|
|
|
|
|
// ==============================
|
|
|
|
|
|
|
|
|
|
// HasAttribute checks if a metadata key exists.
|
|
|
|
|
func (ksg *KmerSetGroup) HasAttribute(key string) bool {
|
|
|
|
|
_, ok := ksg.Metadata[key]
|
|
|
|
|
return ok
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// GetAttribute returns the value of an attribute.
|
|
|
|
|
func (ksg *KmerSetGroup) GetAttribute(key string) (interface{}, bool) {
|
|
|
|
|
switch key {
|
|
|
|
|
case "id":
|
|
|
|
|
return ksg.Id(), true
|
|
|
|
|
case "k":
|
|
|
|
|
return ksg.K(), true
|
|
|
|
|
default:
|
|
|
|
|
value, ok := ksg.Metadata[key]
|
|
|
|
|
return value, ok
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// SetAttribute sets a metadata attribute.
|
|
|
|
|
func (ksg *KmerSetGroup) SetAttribute(key string, value interface{}) {
|
|
|
|
|
switch key {
|
|
|
|
|
case "id":
|
|
|
|
|
if id, ok := value.(string); ok {
|
|
|
|
|
ksg.SetId(id)
|
|
|
|
|
} else {
|
|
|
|
|
panic(fmt.Sprintf("id must be a string, got %T", value))
|
|
|
|
|
}
|
|
|
|
|
case "k":
|
|
|
|
|
panic("k is immutable")
|
|
|
|
|
default:
|
|
|
|
|
ksg.Metadata[key] = value
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// DeleteAttribute removes a metadata attribute.
|
|
|
|
|
func (ksg *KmerSetGroup) DeleteAttribute(key string) {
|
|
|
|
|
delete(ksg.Metadata, key)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// GetIntAttribute returns an attribute as int.
|
|
|
|
|
func (ksg *KmerSetGroup) GetIntAttribute(key string) (int, bool) {
|
|
|
|
|
v, ok := ksg.GetAttribute(key)
|
|
|
|
|
if !ok {
|
|
|
|
|
return 0, false
|
|
|
|
|
}
|
|
|
|
|
switch val := v.(type) {
|
|
|
|
|
case int:
|
|
|
|
|
return val, true
|
|
|
|
|
case int64:
|
|
|
|
|
return int(val), true
|
|
|
|
|
case float64:
|
|
|
|
|
return int(val), true
|
|
|
|
|
}
|
|
|
|
|
return 0, false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// GetStringAttribute returns an attribute as string.
|
|
|
|
|
func (ksg *KmerSetGroup) GetStringAttribute(key string) (string, bool) {
|
|
|
|
|
v, ok := ksg.GetAttribute(key)
|
|
|
|
|
if !ok {
|
|
|
|
|
return "", false
|
|
|
|
|
}
|
|
|
|
|
if s, ok := v.(string); ok {
|
|
|
|
|
return s, true
|
|
|
|
|
}
|
|
|
|
|
return fmt.Sprintf("%v", v), true
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ==============================
|
|
|
|
|
// Jaccard metrics (streaming, disk-based)
|
|
|
|
|
// ==============================
|
|
|
|
|
|
|
|
|
|
// JaccardDistanceMatrix computes a pairwise Jaccard distance matrix
|
|
|
|
|
// for all sets in the group. Operates partition by partition in streaming.
|
|
|
|
|
func (ksg *KmerSetGroup) JaccardDistanceMatrix() *obidist.DistMatrix {
|
|
|
|
|
n := ksg.n
|
|
|
|
|
labels := make([]string, n)
|
|
|
|
|
for i := 0; i < n; i++ {
|
|
|
|
|
if i < len(ksg.setsIDs) && ksg.setsIDs[i] != "" {
|
|
|
|
|
labels[i] = ksg.setsIDs[i]
|
|
|
|
|
} else {
|
|
|
|
|
labels[i] = fmt.Sprintf("set_%d", i)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
dm := obidist.NewDistMatrixWithLabels(labels)
|
|
|
|
|
|
|
|
|
|
// Accumulate intersection and union counts
|
|
|
|
|
intersections := make([][]uint64, n)
|
|
|
|
|
unions := make([][]uint64, n)
|
|
|
|
|
for i := 0; i < n; i++ {
|
|
|
|
|
intersections[i] = make([]uint64, n)
|
|
|
|
|
unions[i] = make([]uint64, n)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Process partition by partition
|
|
|
|
|
var mu sync.Mutex
|
|
|
|
|
var wg sync.WaitGroup
|
|
|
|
|
|
|
|
|
|
for p := 0; p < ksg.partitions; p++ {
|
|
|
|
|
wg.Add(1)
|
|
|
|
|
go func(part int) {
|
|
|
|
|
defer wg.Done()
|
|
|
|
|
|
|
|
|
|
// Open all set readers for this partition
|
|
|
|
|
readers := make([]*KdiReader, n)
|
|
|
|
|
for s := 0; s < n; s++ {
|
|
|
|
|
r, err := NewKdiReader(ksg.partitionPath(s, part))
|
|
|
|
|
if err != nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
readers[s] = r
|
|
|
|
|
}
|
|
|
|
|
defer func() {
|
|
|
|
|
for _, r := range readers {
|
|
|
|
|
if r != nil {
|
|
|
|
|
r.Close()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}()
|
|
|
|
|
|
|
|
|
|
// Merge all N readers to count intersections and unions
|
|
|
|
|
activeReaders := make([]*KdiReader, 0, n)
|
|
|
|
|
activeIndices := make([]int, 0, n)
|
|
|
|
|
for i, r := range readers {
|
|
|
|
|
if r != nil && r.Count() > 0 {
|
|
|
|
|
activeReaders = append(activeReaders, r)
|
|
|
|
|
activeIndices = append(activeIndices, i)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if len(activeReaders) == 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
merge := NewKWayMerge(activeReaders)
|
|
|
|
|
// Don't close merge here since readers are managed above
|
|
|
|
|
// We only want to iterate
|
|
|
|
|
|
|
|
|
|
// We need per-set presence tracking, so we use a custom merge
|
|
|
|
|
// Rebuild with a direct approach
|
|
|
|
|
merge.Close() // close the merge (which closes readers)
|
|
|
|
|
|
|
|
|
|
// Reopen readers for custom merge
|
|
|
|
|
for s := 0; s < n; s++ {
|
|
|
|
|
readers[s] = nil
|
|
|
|
|
r, err := NewKdiReader(ksg.partitionPath(s, part))
|
|
|
|
|
if err != nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if r.Count() > 0 {
|
|
|
|
|
readers[s] = r
|
|
|
|
|
} else {
|
|
|
|
|
r.Close()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Custom k-way merge that tracks which sets contain each kmer
|
|
|
|
|
type entry struct {
|
|
|
|
|
val uint64
|
|
|
|
|
setIdx int
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Use a simpler approach: read all values for this partition into memory
|
|
|
|
|
// for each set, then do a merge
|
|
|
|
|
setKmers := make([][]uint64, n)
|
|
|
|
|
for s := 0; s < n; s++ {
|
|
|
|
|
if readers[s] == nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
kmers := make([]uint64, 0, readers[s].Count())
|
|
|
|
|
for {
|
|
|
|
|
v, ok := readers[s].Next()
|
|
|
|
|
if !ok {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
kmers = append(kmers, v)
|
|
|
|
|
}
|
|
|
|
|
setKmers[s] = kmers
|
|
|
|
|
readers[s].Close()
|
|
|
|
|
readers[s] = nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Count pairwise intersections using sorted merge
|
|
|
|
|
// For each pair (i,j), count kmers present in both
|
|
|
|
|
localInter := make([][]uint64, n)
|
|
|
|
|
localUnion := make([][]uint64, n)
|
|
|
|
|
for i := 0; i < n; i++ {
|
|
|
|
|
localInter[i] = make([]uint64, n)
|
|
|
|
|
localUnion[i] = make([]uint64, n)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for i := 0; i < n; i++ {
|
|
|
|
|
localUnion[i][i] = uint64(len(setKmers[i]))
|
|
|
|
|
for j := i + 1; j < n; j++ {
|
|
|
|
|
a, b := setKmers[i], setKmers[j]
|
|
|
|
|
var inter uint64
|
|
|
|
|
ai, bi := 0, 0
|
|
|
|
|
for ai < len(a) && bi < len(b) {
|
|
|
|
|
if a[ai] == b[bi] {
|
|
|
|
|
inter++
|
|
|
|
|
ai++
|
|
|
|
|
bi++
|
|
|
|
|
} else if a[ai] < b[bi] {
|
|
|
|
|
ai++
|
|
|
|
|
} else {
|
|
|
|
|
bi++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
localInter[i][j] = inter
|
|
|
|
|
localUnion[i][j] = uint64(len(a)) + uint64(len(b)) - inter
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
mu.Lock()
|
|
|
|
|
for i := 0; i < n; i++ {
|
|
|
|
|
for j := i; j < n; j++ {
|
|
|
|
|
intersections[i][j] += localInter[i][j]
|
|
|
|
|
unions[i][j] += localUnion[i][j]
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
mu.Unlock()
|
|
|
|
|
}(p)
|
|
|
|
|
}
|
|
|
|
|
wg.Wait()
|
|
|
|
|
|
|
|
|
|
// Compute distances from accumulated counts
|
|
|
|
|
for i := 0; i < n-1; i++ {
|
|
|
|
|
for j := i + 1; j < n; j++ {
|
|
|
|
|
u := unions[i][j]
|
|
|
|
|
if u == 0 {
|
|
|
|
|
dm.Set(i, j, 1.0)
|
|
|
|
|
} else {
|
|
|
|
|
dm.Set(i, j, 1.0-float64(intersections[i][j])/float64(u))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return dm
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// JaccardSimilarityMatrix computes a pairwise Jaccard similarity matrix.
|
|
|
|
|
func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
|
|
|
|
|
n := ksg.n
|
|
|
|
|
labels := make([]string, n)
|
|
|
|
|
for i := 0; i < n; i++ {
|
|
|
|
|
if i < len(ksg.setsIDs) && ksg.setsIDs[i] != "" {
|
|
|
|
|
labels[i] = ksg.setsIDs[i]
|
|
|
|
|
} else {
|
|
|
|
|
labels[i] = fmt.Sprintf("set_%d", i)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Reuse distance computation
|
|
|
|
|
dm := ksg.JaccardDistanceMatrix()
|
|
|
|
|
sm := obidist.NewSimilarityMatrixWithLabels(labels)
|
|
|
|
|
|
|
|
|
|
for i := 0; i < n-1; i++ {
|
|
|
|
|
for j := i + 1; j < n; j++ {
|
|
|
|
|
sm.Set(i, j, 1.0-dm.Get(i, j))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return sm
|
|
|
|
|
}
|
Refactor k-mer index management with subcommands and enhanced metadata support
This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system.
Key changes:
- Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in kmer set groups
- Implement ability to append to existing kmer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve kmer index summary with Jaccard distance matrix support
- Remove deprecated obikindex and obisuperkmer commands
- Update build process to use the new subcommand structure
2026-02-09 23:10:30 +01:00
|
|
|
|
|
|
|
|
// ==============================
|
|
|
|
|
// Set ID accessors
|
|
|
|
|
// ==============================
|
|
|
|
|
|
|
|
|
|
// SetsIDs returns a copy of the per-set string identifiers.
|
|
|
|
|
func (ksg *KmerSetGroup) SetsIDs() []string {
|
|
|
|
|
out := make([]string, len(ksg.setsIDs))
|
|
|
|
|
copy(out, ksg.setsIDs)
|
|
|
|
|
return out
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// SetIDOf returns the string ID of the set at the given index.
|
|
|
|
|
// Returns "" if index is out of range.
|
|
|
|
|
func (ksg *KmerSetGroup) SetIDOf(index int) string {
|
|
|
|
|
if index < 0 || index >= ksg.n {
|
|
|
|
|
return ""
|
|
|
|
|
}
|
|
|
|
|
return ksg.setsIDs[index]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// SetSetID sets the string ID of the set at the given index.
|
|
|
|
|
func (ksg *KmerSetGroup) SetSetID(index int, id string) {
|
|
|
|
|
if index >= 0 && index < ksg.n {
|
|
|
|
|
ksg.setsIDs[index] = id
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// IndexOfSetID returns the numeric index for a set ID, or -1 if not found.
|
|
|
|
|
func (ksg *KmerSetGroup) IndexOfSetID(id string) int {
|
|
|
|
|
for i, sid := range ksg.setsIDs {
|
|
|
|
|
if sid == id {
|
|
|
|
|
return i
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return -1
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// MatchSetIDs resolves glob patterns against set IDs and returns matching
|
|
|
|
|
// indices sorted in ascending order. Uses path.Match for pattern matching
|
|
|
|
|
// (supports *, ?, [...] patterns). Returns error if a pattern is malformed.
|
|
|
|
|
func (ksg *KmerSetGroup) MatchSetIDs(patterns []string) ([]int, error) {
|
|
|
|
|
seen := make(map[int]bool)
|
|
|
|
|
for _, pattern := range patterns {
|
|
|
|
|
for i, sid := range ksg.setsIDs {
|
|
|
|
|
matched, err := path.Match(pattern, sid)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: invalid glob pattern %q: %w", pattern, err)
|
|
|
|
|
}
|
|
|
|
|
if matched {
|
|
|
|
|
seen[i] = true
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
result := make([]int, 0, len(seen))
|
|
|
|
|
for idx := range seen {
|
|
|
|
|
result = append(result, idx)
|
|
|
|
|
}
|
|
|
|
|
sort.Ints(result)
|
|
|
|
|
return result, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ==============================
|
|
|
|
|
// Per-set metadata accessors
|
|
|
|
|
// ==============================
|
|
|
|
|
|
|
|
|
|
// GetSetMetadata returns the value of a per-set metadata key.
|
|
|
|
|
func (ksg *KmerSetGroup) GetSetMetadata(setIndex int, key string) (interface{}, bool) {
|
|
|
|
|
if setIndex < 0 || setIndex >= ksg.n {
|
|
|
|
|
return nil, false
|
|
|
|
|
}
|
|
|
|
|
v, ok := ksg.setsMetadata[setIndex][key]
|
|
|
|
|
return v, ok
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// SetSetMetadata sets a per-set metadata attribute.
|
|
|
|
|
func (ksg *KmerSetGroup) SetSetMetadata(setIndex int, key string, value interface{}) {
|
|
|
|
|
if setIndex < 0 || setIndex >= ksg.n {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if ksg.setsMetadata[setIndex] == nil {
|
|
|
|
|
ksg.setsMetadata[setIndex] = make(map[string]interface{})
|
|
|
|
|
}
|
|
|
|
|
ksg.setsMetadata[setIndex][key] = value
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// DeleteSetMetadata removes a per-set metadata attribute.
|
|
|
|
|
func (ksg *KmerSetGroup) DeleteSetMetadata(setIndex int, key string) {
|
|
|
|
|
if setIndex < 0 || setIndex >= ksg.n {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
delete(ksg.setsMetadata[setIndex], key)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// AllSetMetadata returns a copy of all metadata for a given set.
|
|
|
|
|
func (ksg *KmerSetGroup) AllSetMetadata(setIndex int) map[string]interface{} {
|
|
|
|
|
if setIndex < 0 || setIndex >= ksg.n {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
out := make(map[string]interface{}, len(ksg.setsMetadata[setIndex]))
|
|
|
|
|
for k, v := range ksg.setsMetadata[setIndex] {
|
|
|
|
|
out[k] = v
|
|
|
|
|
}
|
|
|
|
|
return out
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ==============================
|
|
|
|
|
// Exported partition path and compatibility
|
|
|
|
|
// ==============================
|
|
|
|
|
|
|
|
|
|
// PartitionPath returns the file path for partition partIndex of set setIndex.
|
|
|
|
|
func (ksg *KmerSetGroup) PartitionPath(setIndex, partIndex int) string {
|
|
|
|
|
return ksg.partitionPath(setIndex, partIndex)
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-10 09:26:46 +01:00
|
|
|
// SpectrumPath returns the path to the spectrum.bin file for the given set.
|
|
|
|
|
func (ksg *KmerSetGroup) SpectrumPath(setIndex int) string {
|
|
|
|
|
return filepath.Join(ksg.path, fmt.Sprintf("set_%d", setIndex), "spectrum.bin")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Spectrum reads the k-mer frequency spectrum for the given set.
|
|
|
|
|
// Returns nil, nil if no spectrum file exists.
|
|
|
|
|
func (ksg *KmerSetGroup) Spectrum(setIndex int) (*KmerSpectrum, error) {
|
|
|
|
|
path := ksg.SpectrumPath(setIndex)
|
|
|
|
|
if _, err := os.Stat(path); os.IsNotExist(err) {
|
|
|
|
|
return nil, nil
|
|
|
|
|
}
|
|
|
|
|
return ReadSpectrum(path)
|
|
|
|
|
}
|
|
|
|
|
|
Refactor k-mer index management with subcommands and enhanced metadata support
This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system.
Key changes:
- Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in kmer set groups
- Implement ability to append to existing kmer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve kmer index summary with Jaccard distance matrix support
- Remove deprecated obikindex and obisuperkmer commands
- Update build process to use the new subcommand structure
2026-02-09 23:10:30 +01:00
|
|
|
// IsCompatibleWith returns true if the other group has the same k, m, and partitions.
|
|
|
|
|
func (ksg *KmerSetGroup) IsCompatibleWith(other *KmerSetGroup) bool {
|
|
|
|
|
return ksg.k == other.k && ksg.m == other.m && ksg.partitions == other.partitions
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ==============================
|
|
|
|
|
// Set management operations
|
|
|
|
|
// ==============================
|
|
|
|
|
|
|
|
|
|
// NewEmptyCompatible creates an empty KmerSetGroup at destDir with the same
|
|
|
|
|
// k, m, and partitions as this group. The destination must not already exist.
|
|
|
|
|
func (ksg *KmerSetGroup) NewEmptyCompatible(destDir string) (*KmerSetGroup, error) {
|
|
|
|
|
if err := os.MkdirAll(destDir, 0755); err != nil {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: create directory: %w", err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
dest := &KmerSetGroup{
|
|
|
|
|
path: destDir,
|
|
|
|
|
k: ksg.k,
|
|
|
|
|
m: ksg.m,
|
|
|
|
|
partitions: ksg.partitions,
|
|
|
|
|
n: 0,
|
|
|
|
|
setsIDs: []string{},
|
|
|
|
|
counts: []uint64{},
|
|
|
|
|
setsMetadata: []map[string]interface{}{},
|
|
|
|
|
Metadata: make(map[string]interface{}),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if err := dest.saveMetadata(); err != nil {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: write metadata: %w", err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return dest, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// RemoveSetByID removes the set with the given ID from the group.
|
|
|
|
|
// It deletes the set directory, renumbers all subsequent sets, and
|
|
|
|
|
// updates the metadata on disk.
|
|
|
|
|
func (ksg *KmerSetGroup) RemoveSetByID(id string) error {
|
|
|
|
|
idx := ksg.IndexOfSetID(id)
|
|
|
|
|
if idx < 0 {
|
|
|
|
|
return fmt.Errorf("obikmer: set ID %q not found", id)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Delete the set directory
|
|
|
|
|
setDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", idx))
|
|
|
|
|
if err := os.RemoveAll(setDir); err != nil {
|
|
|
|
|
return fmt.Errorf("obikmer: remove set directory: %w", err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Renumber subsequent sets
|
|
|
|
|
for i := idx + 1; i < ksg.n; i++ {
|
|
|
|
|
oldDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i))
|
|
|
|
|
newDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i-1))
|
|
|
|
|
if err := os.Rename(oldDir, newDir); err != nil {
|
|
|
|
|
return fmt.Errorf("obikmer: rename set_%d to set_%d: %w", i, i-1, err)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Update slices
|
|
|
|
|
ksg.setsIDs = append(ksg.setsIDs[:idx], ksg.setsIDs[idx+1:]...)
|
|
|
|
|
ksg.counts = append(ksg.counts[:idx], ksg.counts[idx+1:]...)
|
|
|
|
|
ksg.setsMetadata = append(ksg.setsMetadata[:idx], ksg.setsMetadata[idx+1:]...)
|
|
|
|
|
ksg.n--
|
|
|
|
|
|
|
|
|
|
return ksg.saveMetadata()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// CopySetsByIDTo copies sets identified by their IDs into a KmerSetGroup
|
|
|
|
|
// at destDir. If destDir does not exist, a new compatible empty group is
|
|
|
|
|
// created. If it exists, compatibility (k, m, partitions) is checked.
|
|
|
|
|
// If a set ID already exists in the destination, an error is returned
|
|
|
|
|
// unless force is true (in which case the existing set is replaced).
|
|
|
|
|
// Per-set metadata travels with the set.
|
|
|
|
|
func (ksg *KmerSetGroup) CopySetsByIDTo(ids []string, destDir string, force bool) (*KmerSetGroup, error) {
|
|
|
|
|
// Resolve source IDs to indices
|
|
|
|
|
srcIndices := make([]int, len(ids))
|
|
|
|
|
for i, id := range ids {
|
|
|
|
|
idx := ksg.IndexOfSetID(id)
|
|
|
|
|
if idx < 0 {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: source set ID %q not found", id)
|
|
|
|
|
}
|
|
|
|
|
srcIndices[i] = idx
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Open or create destination
|
|
|
|
|
var dest *KmerSetGroup
|
|
|
|
|
metaPath := filepath.Join(destDir, "metadata.toml")
|
|
|
|
|
if _, err := os.Stat(metaPath); err == nil {
|
|
|
|
|
// Destination exists
|
|
|
|
|
dest, err = OpenKmerSetGroup(destDir)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: open destination: %w", err)
|
|
|
|
|
}
|
|
|
|
|
if !ksg.IsCompatibleWith(dest) {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: incompatible groups: source (k=%d, m=%d, P=%d) vs dest (k=%d, m=%d, P=%d)",
|
|
|
|
|
ksg.k, ksg.m, ksg.partitions, dest.k, dest.m, dest.partitions)
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
// Create new destination
|
|
|
|
|
var err error
|
|
|
|
|
dest, err = ksg.NewEmptyCompatible(destDir)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Copy each set
|
|
|
|
|
for i, srcIdx := range srcIndices {
|
|
|
|
|
srcID := ids[i]
|
|
|
|
|
|
|
|
|
|
// Check for ID conflict in destination
|
|
|
|
|
existingIdx := dest.IndexOfSetID(srcID)
|
|
|
|
|
if existingIdx >= 0 {
|
|
|
|
|
if !force {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: set ID %q already exists in destination (use force to replace)", srcID)
|
|
|
|
|
}
|
|
|
|
|
// Force: remove existing set in destination
|
|
|
|
|
if err := dest.RemoveSetByID(srcID); err != nil {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: remove existing set %q in destination: %w", srcID, err)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Destination set index = current dest size
|
|
|
|
|
destIdx := dest.n
|
|
|
|
|
|
|
|
|
|
// Create destination set directory
|
|
|
|
|
destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", destIdx))
|
|
|
|
|
if err := os.MkdirAll(destSetDir, 0755); err != nil {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: create dest set dir: %w", err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Copy all partition files
|
|
|
|
|
for p := 0; p < ksg.partitions; p++ {
|
|
|
|
|
srcPath := ksg.partitionPath(srcIdx, p)
|
|
|
|
|
destPath := dest.partitionPath(destIdx, p)
|
|
|
|
|
if err := copyFile(srcPath, destPath); err != nil {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: copy partition %d of set %q: %w", p, srcID, err)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-10 09:26:46 +01:00
|
|
|
// Copy spectrum.bin if it exists
|
|
|
|
|
srcSpecPath := ksg.SpectrumPath(srcIdx)
|
|
|
|
|
if _, err := os.Stat(srcSpecPath); err == nil {
|
|
|
|
|
destSpecPath := filepath.Join(destSetDir, "spectrum.bin")
|
|
|
|
|
if err := copyFile(srcSpecPath, destSpecPath); err != nil {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: copy spectrum of set %q: %w", srcID, err)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Refactor k-mer index management with subcommands and enhanced metadata support
This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system.
Key changes:
- Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in kmer set groups
- Implement ability to append to existing kmer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve kmer index summary with Jaccard distance matrix support
- Remove deprecated obikindex and obisuperkmer commands
- Update build process to use the new subcommand structure
2026-02-09 23:10:30 +01:00
|
|
|
// Update destination metadata
|
|
|
|
|
dest.setsIDs = append(dest.setsIDs, srcID)
|
|
|
|
|
dest.counts = append(dest.counts, ksg.counts[srcIdx])
|
|
|
|
|
|
|
|
|
|
// Copy per-set metadata
|
|
|
|
|
srcMeta := ksg.AllSetMetadata(srcIdx)
|
|
|
|
|
if srcMeta == nil {
|
|
|
|
|
srcMeta = make(map[string]interface{})
|
|
|
|
|
}
|
|
|
|
|
dest.setsMetadata = append(dest.setsMetadata, srcMeta)
|
|
|
|
|
dest.n++
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if err := dest.saveMetadata(); err != nil {
|
|
|
|
|
return nil, fmt.Errorf("obikmer: save destination metadata: %w", err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return dest, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// copyFile copies the content of the file src into the file dst, creating
// dst or truncating it when it already exists.
//
// It returns an error, leaving dst untouched, when src cannot be opened or
// when src and dst designate the same file: creating dst would truncate the
// source before anything is read. If the copy itself or the final close
// fails, the incomplete dst is removed so that no truncated file is left
// behind.
func copyFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	// Refuse to copy a file onto itself. A failing Stat on dst simply means
	// that dst does not exist yet.
	if srcInfo, statErr := in.Stat(); statErr == nil {
		if dstInfo, statErr := os.Stat(dst); statErr == nil && os.SameFile(srcInfo, dstInfo) {
			return fmt.Errorf("obikmer: copy %s: source and destination are the same file", src)
		}
	}

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	// On success the file is closed exactly once, by the final return, whose
	// error is reported. This handler only runs its body on failure, to
	// release the descriptor and drop the partial output.
	defer func() {
		if err != nil {
			out.Close()
			os.Remove(dst)
		}
	}()

	if _, err = io.Copy(out, in); err != nil {
		return err
	}

	return out.Close()
}
|