From 56c1f4180c3e45a8213df318f86a2261e9642048 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 9 Feb 2026 23:10:30 +0100 Subject: [PATCH] Refactor k-mer index management with subcommands and enhanced metadata support This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system. Key changes: - Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask) - Add support for per-set metadata and ID management in kmer set groups - Implement ability to append to existing kmer index groups - Refactor option parsing to use a global options registration system - Add new commands for listing, copying, moving, and removing sets - Enhance low-complexity masking with new options and output formats - Improve kmer index summary with Jaccard distance matrix support - Remove deprecated obikindex and obisuperkmer commands - Update build process to use the new subcommand structure --- cmd/obitools/obik/main.go | 34 ++ cmd/obitools/obikindex/main.go | 23 -- cmd/obitools/obilowmask/main.go | 47 --- cmd/obitools/obisuperkmer/main.go | 34 -- pkg/obikmer/kmer_set_builder.go | 166 ++++++-- pkg/obikmer/kmer_set_disk.go | 363 ++++++++++++++++-- pkg/obioptions/options.go | 234 ++++++----- pkg/obioptions/subcommand.go | 43 +++ pkg/obitools/obik/cp.go | 55 +++ pkg/obitools/obik/index.go | 110 ++++++ .../obilowmask.go => obik/lowmask.go} | 138 ++++--- pkg/obitools/obik/ls.go | 96 +++++ pkg/obitools/obik/mv.go | 63 +++ pkg/obitools/obik/obik.go | 64 +++ pkg/obitools/{obikindex => obik}/options.go | 88 ++++- pkg/obitools/obik/rm.go | 56 +++ pkg/obitools/obik/summary.go | 148 +++++++ pkg/obitools/obik/super.go | 64 +++ pkg/obitools/obikindex/obikindex.go | 91 ----- pkg/obitools/obilowmask/entropy.qmd | 332 ---------------- pkg/obitools/obilowmask/obilowmask_test.go | 40 -- pkg/obitools/obilowmask/options.go | 81 ---- pkg/obitools/obisuperkmer/obisuperkmer.go | 10 - pkg/obitools/obisuperkmer/options.go | 69 ---- pkg/obitools/obisuperkmer/superkmer.go | 59 --- pkg/obitools/obisuperkmer/superkmer_test.go | 149 ------- 26 files changed, 1482 insertions(+), 1175 deletions(-) create mode 100644 cmd/obitools/obik/main.go delete mode 100644 cmd/obitools/obikindex/main.go delete mode 100644 cmd/obitools/obilowmask/main.go delete mode 100644 cmd/obitools/obisuperkmer/main.go create mode 100644 pkg/obioptions/subcommand.go create mode 100644 pkg/obitools/obik/cp.go create mode 100644 pkg/obitools/obik/index.go rename pkg/obitools/{obilowmask/obilowmask.go => obik/lowmask.go} (70%) create mode 100644 pkg/obitools/obik/ls.go create mode 100644 pkg/obitools/obik/mv.go create mode 100644 pkg/obitools/obik/obik.go rename pkg/obitools/{obikindex => obik}/options.go (62%) create mode 100644 pkg/obitools/obik/rm.go create mode 100644 pkg/obitools/obik/summary.go create mode 100644 pkg/obitools/obik/super.go delete mode 100644 pkg/obitools/obikindex/obikindex.go delete mode 100644 pkg/obitools/obilowmask/entropy.qmd delete mode 100644 pkg/obitools/obilowmask/obilowmask_test.go delete mode 100644 pkg/obitools/obilowmask/options.go delete mode 100644 pkg/obitools/obisuperkmer/obisuperkmer.go delete mode 100644 pkg/obitools/obisuperkmer/options.go delete mode 100644 pkg/obitools/obisuperkmer/superkmer.go delete mode 100644 pkg/obitools/obisuperkmer/superkmer_test.go diff --git a/cmd/obitools/obik/main.go b/cmd/obitools/obik/main.go new file mode 100644 index 0000000..694d1ab --- /dev/null +++ b/cmd/obitools/obik/main.go @@ -0,0 +1,34 @@ +package main + +import ( + "context" + "errors" + "os" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obik" + "github.com/DavidGamba/go-getoptions" +) + +func main() { + defer obiseq.LogBioSeqStatus() + + opt, parser := obioptions.GenerateSubcommandParser( + "obik", + "Manage disk-based kmer indices", + obik.OptionSet, + ) + + _, remaining := parser(os.Args) + + err := opt.Dispatch(context.Background(), remaining) + if err != nil { + if errors.Is(err, getoptions.ErrorHelpCalled) { + os.Exit(0) + } + log.Fatalf("Error: %v", err) + } +} diff --git a/cmd/obitools/obikindex/main.go b/cmd/obitools/obikindex/main.go deleted file mode 100644 index cb31985..0000000 --- a/cmd/obitools/obikindex/main.go +++ /dev/null @@ -1,23 +0,0 @@ -package main - -import ( - "os" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obikindex" -) - -func main() { - optionParser := obioptions.GenerateOptionParser( - "obikindex", - "builds a disk-based kmer index from sequence files", - obikindex.OptionSet) - - _, args := optionParser(os.Args) - - sequences, err := obiconvert.CLIReadBioSequences(args...) - obiconvert.OpenSequenceDataErrorMessage(args, err) - - obikindex.CLIBuildKmerIndex(sequences) -} diff --git a/cmd/obitools/obilowmask/main.go b/cmd/obitools/obilowmask/main.go deleted file mode 100644 index ec43a54..0000000 --- a/cmd/obitools/obilowmask/main.go +++ /dev/null @@ -1,47 +0,0 @@ -package main - -import ( - "os" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obilowmask" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" -) - -func main() { - - defer obiseq.LogBioSeqStatus() - - // go tool pprof -http=":8000" ./obipairing ./cpu.pprof - // f, err := os.Create("cpu.pprof") - // if err != nil { - // log.Fatal(err) - // } - // pprof.StartCPUProfile(f) - // defer pprof.StopCPUProfile() - - // go tool trace cpu.trace - // ftrace, err := os.Create("cpu.trace") - // if err != nil { - // log.Fatal(err) - // } - // trace.Start(ftrace) - // defer trace.Stop() - - optionParser := obioptions.GenerateOptionParser( - "obimicrosat", - "looks for microsatellites sequences in a sequence file", - obilowmask.OptionSet) - - _, args := optionParser(os.Args) - - sequences, err := obiconvert.CLIReadBioSequences(args...) - obiconvert.OpenSequenceDataErrorMessage(args, err) - - selected := obilowmask.CLISequenceEntropyMasker(sequences) - obiconvert.CLIWriteBioSequences(selected, true) - obiutils.WaitForLastPipe() - -} diff --git a/cmd/obitools/obisuperkmer/main.go b/cmd/obitools/obisuperkmer/main.go deleted file mode 100644 index aedaa80..0000000 --- a/cmd/obitools/obisuperkmer/main.go +++ /dev/null @@ -1,34 +0,0 @@ -package main - -import ( - "os" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obisuperkmer" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" -) - -func main() { - // Generate option parser - optionParser := obioptions.GenerateOptionParser( - "obisuperkmer", - "extract super k-mers from sequence files", - obisuperkmer.OptionSet) - - // Parse command-line arguments - _, args := optionParser(os.Args) - - // Read input sequences - sequences, err := obiconvert.CLIReadBioSequences(args...) - obiconvert.OpenSequenceDataErrorMessage(args, err) - - // Extract super k-mers - superkmers := obisuperkmer.CLIExtractSuperKmers(sequences) - - // Write output sequences - obiconvert.CLIWriteBioSequences(superkmers, true) - - // Wait for pipeline completion - obiutils.WaitForLastPipe() -} diff --git a/pkg/obikmer/kmer_set_builder.go b/pkg/obikmer/kmer_set_builder.go index b987093..10b0bf0 100644 --- a/pkg/obikmer/kmer_set_builder.go +++ b/pkg/obikmer/kmer_set_builder.go @@ -32,15 +32,17 @@ func WithMinFrequency(minFreq int) BuilderOption { // partitioned by minimizer. On Close(), each partition is finalized // (sort, dedup, optional frequency filter) into .kdi files. type KmerSetGroupBuilder struct { - dir string - k int - m int - n int // number of sets - P int // number of partitions - config builderConfig - writers [][]*SkmWriter // [setIndex][partIndex] - mu [][]sync.Mutex // per-writer mutex for concurrent access - closed bool + dir string + k int + m int + n int // number of NEW sets being built + P int // number of partitions + startIndex int // first set index (0 for new groups, existingN for appends) + config builderConfig + existing *KmerSetGroup // non-nil when appending to existing group + writers [][]*SkmWriter // [setIndex][partIndex] (local index 0..n-1) + mu [][]sync.Mutex // per-writer mutex for concurrent access + closed bool } // NewKmerSetGroupBuilder creates a builder for a new KmerSetGroup. @@ -127,17 +129,94 @@ func NewKmerSetGroupBuilder(directory string, k, m, n, P int, } return &KmerSetGroupBuilder{ - dir: directory, - k: k, - m: m, - n: n, - P: P, - config: config, - writers: writers, - mu: mutexes, + dir: directory, + k: k, + m: m, + n: n, + P: P, + startIndex: 0, + config: config, + writers: writers, + mu: mutexes, }, nil } +// AppendKmerSetGroupBuilder opens an existing KmerSetGroup and creates +// a builder that adds n new sets starting from the existing set count. +// The k, m, and partitions are inherited from the existing group. +func AppendKmerSetGroupBuilder(directory string, n int, options ...BuilderOption) (*KmerSetGroupBuilder, error) { + existing, err := OpenKmerSetGroup(directory) + if err != nil { + return nil, fmt.Errorf("obikmer: open existing group: %w", err) + } + + if n < 1 { + return nil, fmt.Errorf("obikmer: n must be >= 1, got %d", n) + } + + k := existing.K() + m := existing.M() + P := existing.Partitions() + startIndex := existing.Size() + + var config builderConfig + for _, opt := range options { + opt(&config) + } + + // Create build directory structure for new sets + buildDir := filepath.Join(directory, ".build") + for s := 0; s < n; s++ { + setDir := filepath.Join(buildDir, fmt.Sprintf("set_%d", s)) + if err := os.MkdirAll(setDir, 0755); err != nil { + return nil, fmt.Errorf("obikmer: create build dir: %w", err) + } + } + + // Create SKM writers for new sets + writers := make([][]*SkmWriter, n) + mutexes := make([][]sync.Mutex, n) + for s := 0; s < n; s++ { + writers[s] = make([]*SkmWriter, P) + mutexes[s] = make([]sync.Mutex, P) + for p := 0; p < P; p++ { + path := filepath.Join(buildDir, fmt.Sprintf("set_%d", s), + fmt.Sprintf("part_%04d.skm", p)) + w, err := NewSkmWriter(path) + if err != nil { + for ss := 0; ss <= s; ss++ { + for pp := 0; pp < P; pp++ { + if writers[ss][pp] != nil { + writers[ss][pp].Close() + } + } + } + return nil, fmt.Errorf("obikmer: create skm writer: %w", err) + } + writers[s][p] = w + } + } + + return &KmerSetGroupBuilder{ + dir: directory, + k: k, + m: m, + n: n, + P: P, + startIndex: startIndex, + config: config, + existing: existing, + writers: writers, + mu: mutexes, + }, nil +} + +// StartIndex returns the first global set index for the new sets being built. +// For new groups this is 0; for appends it is the existing group's Size(). +func (b *KmerSetGroupBuilder) StartIndex() int { + return b.startIndex +} + // AddSequence extracts super-kmers from a sequence and writes them // to the appropriate partition files for the given set. func (b *KmerSetGroupBuilder) AddSequence(setIndex int, seq *obiseq.BioSequence) { @@ -193,9 +272,10 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) { } } - // 2. Create output directory structure + // 2. Create output directory structure for new sets for s := 0; s < b.n; s++ { - setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", s)) + globalIdx := b.startIndex + s + setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx)) if err := os.MkdirAll(setDir, 0755); err != nil { return nil, fmt.Errorf("obikmer: create set dir: %w", err) } @@ -251,24 +331,44 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) { } // 3. Build KmerSetGroup and write metadata - totalCounts := make([]uint64, b.n) + newCounts := make([]uint64, b.n) for s := 0; s < b.n; s++ { for p := 0; p < b.P; p++ { - totalCounts[s] += counts[s][p] + newCounts[s] += counts[s][p] } } - setsIDs := make([]string, b.n) + var ksg *KmerSetGroup - ksg := &KmerSetGroup{ - path: b.dir, - k: b.k, - m: b.m, - partitions: b.P, - n: b.n, - setsIDs: setsIDs, - counts: totalCounts, - Metadata: make(map[string]interface{}), + if b.existing != nil { + // Append mode: extend existing group + ksg = b.existing + ksg.n += b.n + ksg.setsIDs = append(ksg.setsIDs, make([]string, b.n)...) + ksg.counts = append(ksg.counts, newCounts...) + newMeta := make([]map[string]interface{}, b.n) + for i := range newMeta { + newMeta[i] = make(map[string]interface{}) + } + ksg.setsMetadata = append(ksg.setsMetadata, newMeta...) + } else { + // New group + setsIDs := make([]string, b.n) + setsMetadata := make([]map[string]interface{}, b.n) + for i := range setsMetadata { + setsMetadata[i] = make(map[string]interface{}) + } + ksg = &KmerSetGroup{ + path: b.dir, + k: b.k, + m: b.m, + partitions: b.P, + n: b.n, + setsIDs: setsIDs, + counts: newCounts, + setsMetadata: setsMetadata, + Metadata: make(map[string]interface{}), + } } if err := ksg.saveMetadata(); err != nil { @@ -285,12 +385,14 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) { // finalizePartition processes a single partition: load SKM, extract k-mers, // sort, dedup/count, write KDI. func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint64) error { + // setIdx is local (0..n-1); build dirs use local index, output dirs use global skmPath := filepath.Join(b.dir, ".build", fmt.Sprintf("set_%d", setIdx), fmt.Sprintf("part_%04d.skm", partIdx)) + globalIdx := b.startIndex + setIdx kdiPath := filepath.Join(b.dir, - fmt.Sprintf("set_%d", setIdx), + fmt.Sprintf("set_%d", globalIdx), fmt.Sprintf("part_%04d.kdi", partIdx)) // Load super-kmers and extract canonical k-mers diff --git a/pkg/obikmer/kmer_set_disk.go b/pkg/obikmer/kmer_set_disk.go index 0ac623c..6d8f7e2 100644 --- a/pkg/obikmer/kmer_set_disk.go +++ b/pkg/obikmer/kmer_set_disk.go @@ -2,9 +2,12 @@ package obikmer import ( "fmt" + "io" "iter" "os" + "path" "path/filepath" + "sort" "sync" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist" @@ -42,28 +45,30 @@ func (f MetadataFormat) String() string { // // A KmerSetGroup with Size()==1 is effectively a KmerSet (singleton). type KmerSetGroup struct { - path string // root directory - id string // user-assigned identifier - k int // k-mer size - m int // minimizer size - partitions int // number of partitions P - n int // number of sets N - setsIDs []string // IDs of individual sets - counts []uint64 // total k-mer count per set (sum over partitions) - Metadata map[string]interface{} // group-level user metadata + path string // root directory + id string // user-assigned identifier + k int // k-mer size + m int // minimizer size + partitions int // number of partitions P + n int // number of sets N + setsIDs []string // IDs of individual sets + counts []uint64 // total k-mer count per set (sum over partitions) + setsMetadata []map[string]interface{} // per-set user metadata + Metadata map[string]interface{} // group-level user metadata } // diskMetadata is the TOML-serializable structure for metadata.toml. type diskMetadata struct { - ID string `toml:"id,omitempty"` - K int `toml:"k"` - M int `toml:"m"` - Partitions int `toml:"partitions"` - Type string `toml:"type"` - Size int `toml:"size"` - SetsIDs []string `toml:"sets_ids,omitempty"` - Counts []uint64 `toml:"counts,omitempty"` - UserMetadata map[string]interface{} `toml:"user_metadata,omitempty"` + ID string `toml:"id,omitempty"` + K int `toml:"k"` + M int `toml:"m"` + Partitions int `toml:"partitions"` + Type string `toml:"type"` + Size int `toml:"size"` + SetsIDs []string `toml:"sets_ids,omitempty"` + Counts []uint64 `toml:"counts,omitempty"` + SetsMetadata []map[string]interface{} `toml:"sets_metadata,omitempty"` + UserMetadata map[string]interface{} `toml:"user_metadata,omitempty"` } // OpenKmerSetGroup opens a finalized index directory in read-only mode. @@ -81,15 +86,16 @@ func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) { } ksg := &KmerSetGroup{ - path: directory, - id: meta.ID, - k: meta.K, - m: meta.M, - partitions: meta.Partitions, - n: meta.Size, - setsIDs: meta.SetsIDs, - counts: meta.Counts, - Metadata: meta.UserMetadata, + path: directory, + id: meta.ID, + k: meta.K, + m: meta.M, + partitions: meta.Partitions, + n: meta.Size, + setsIDs: meta.SetsIDs, + counts: meta.Counts, + setsMetadata: meta.SetsMetadata, + Metadata: meta.UserMetadata, } if ksg.Metadata == nil { ksg.Metadata = make(map[string]interface{}) @@ -97,6 +103,12 @@ func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) { if ksg.setsIDs == nil { ksg.setsIDs = make([]string, ksg.n) } + if ksg.setsMetadata == nil { + ksg.setsMetadata = make([]map[string]interface{}, ksg.n) + for i := range ksg.setsMetadata { + ksg.setsMetadata[i] = make(map[string]interface{}) + } + } if ksg.counts == nil { // Compute counts by scanning partitions ksg.counts = make([]uint64, ksg.n) @@ -133,6 +145,7 @@ func (ksg *KmerSetGroup) saveMetadata() error { Size: ksg.n, SetsIDs: ksg.setsIDs, Counts: ksg.counts, + SetsMetadata: ksg.setsMetadata, UserMetadata: ksg.Metadata, } @@ -578,3 +591,299 @@ func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix { return sm } + +// ============================== +// Set ID accessors +// ============================== + +// SetsIDs returns a copy of the per-set string identifiers. +func (ksg *KmerSetGroup) SetsIDs() []string { + out := make([]string, len(ksg.setsIDs)) + copy(out, ksg.setsIDs) + return out +} + +// SetIDOf returns the string ID of the set at the given index. +// Returns "" if index is out of range. +func (ksg *KmerSetGroup) SetIDOf(index int) string { + if index < 0 || index >= ksg.n { + return "" + } + return ksg.setsIDs[index] +} + +// SetSetID sets the string ID of the set at the given index. +func (ksg *KmerSetGroup) SetSetID(index int, id string) { + if index >= 0 && index < ksg.n { + ksg.setsIDs[index] = id + } +} + +// IndexOfSetID returns the numeric index for a set ID, or -1 if not found. +func (ksg *KmerSetGroup) IndexOfSetID(id string) int { + for i, sid := range ksg.setsIDs { + if sid == id { + return i + } + } + return -1 +} + +// MatchSetIDs resolves glob patterns against set IDs and returns matching +// indices sorted in ascending order. Uses path.Match for pattern matching +// (supports *, ?, [...] patterns). Returns error if a pattern is malformed. +func (ksg *KmerSetGroup) MatchSetIDs(patterns []string) ([]int, error) { + seen := make(map[int]bool) + for _, pattern := range patterns { + for i, sid := range ksg.setsIDs { + matched, err := path.Match(pattern, sid) + if err != nil { + return nil, fmt.Errorf("obikmer: invalid glob pattern %q: %w", pattern, err) + } + if matched { + seen[i] = true + } + } + } + result := make([]int, 0, len(seen)) + for idx := range seen { + result = append(result, idx) + } + sort.Ints(result) + return result, nil +} + +// ============================== +// Per-set metadata accessors +// ============================== + +// GetSetMetadata returns the value of a per-set metadata key. +func (ksg *KmerSetGroup) GetSetMetadata(setIndex int, key string) (interface{}, bool) { + if setIndex < 0 || setIndex >= ksg.n { + return nil, false + } + v, ok := ksg.setsMetadata[setIndex][key] + return v, ok +} + +// SetSetMetadata sets a per-set metadata attribute. +func (ksg *KmerSetGroup) SetSetMetadata(setIndex int, key string, value interface{}) { + if setIndex < 0 || setIndex >= ksg.n { + return + } + if ksg.setsMetadata[setIndex] == nil { + ksg.setsMetadata[setIndex] = make(map[string]interface{}) + } + ksg.setsMetadata[setIndex][key] = value +} + +// DeleteSetMetadata removes a per-set metadata attribute. +func (ksg *KmerSetGroup) DeleteSetMetadata(setIndex int, key string) { + if setIndex < 0 || setIndex >= ksg.n { + return + } + delete(ksg.setsMetadata[setIndex], key) +} + +// AllSetMetadata returns a copy of all metadata for a given set. +func (ksg *KmerSetGroup) AllSetMetadata(setIndex int) map[string]interface{} { + if setIndex < 0 || setIndex >= ksg.n { + return nil + } + out := make(map[string]interface{}, len(ksg.setsMetadata[setIndex])) + for k, v := range ksg.setsMetadata[setIndex] { + out[k] = v + } + return out +} + +// ============================== +// Exported partition path and compatibility +// ============================== + +// PartitionPath returns the file path for partition partIndex of set setIndex. +func (ksg *KmerSetGroup) PartitionPath(setIndex, partIndex int) string { + return ksg.partitionPath(setIndex, partIndex) +} + +// IsCompatibleWith returns true if the other group has the same k, m, and partitions. +func (ksg *KmerSetGroup) IsCompatibleWith(other *KmerSetGroup) bool { + return ksg.k == other.k && ksg.m == other.m && ksg.partitions == other.partitions +} + +// ============================== +// Set management operations +// ============================== + +// NewEmptyCompatible creates an empty KmerSetGroup at destDir with the same +// k, m, and partitions as this group. The destination must not already exist. +func (ksg *KmerSetGroup) NewEmptyCompatible(destDir string) (*KmerSetGroup, error) { + if err := os.MkdirAll(destDir, 0755); err != nil { + return nil, fmt.Errorf("obikmer: create directory: %w", err) + } + + dest := &KmerSetGroup{ + path: destDir, + k: ksg.k, + m: ksg.m, + partitions: ksg.partitions, + n: 0, + setsIDs: []string{}, + counts: []uint64{}, + setsMetadata: []map[string]interface{}{}, + Metadata: make(map[string]interface{}), + } + + if err := dest.saveMetadata(); err != nil { + return nil, fmt.Errorf("obikmer: write metadata: %w", err) + } + + return dest, nil +} + +// RemoveSetByID removes the set with the given ID from the group. +// It deletes the set directory, renumbers all subsequent sets, and +// updates the metadata on disk. +func (ksg *KmerSetGroup) RemoveSetByID(id string) error { + idx := ksg.IndexOfSetID(id) + if idx < 0 { + return fmt.Errorf("obikmer: set ID %q not found", id) + } + + // Delete the set directory + setDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", idx)) + if err := os.RemoveAll(setDir); err != nil { + return fmt.Errorf("obikmer: remove set directory: %w", err) + } + + // Renumber subsequent sets + for i := idx + 1; i < ksg.n; i++ { + oldDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i)) + newDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i-1)) + if err := os.Rename(oldDir, newDir); err != nil { + return fmt.Errorf("obikmer: rename set_%d to set_%d: %w", i, i-1, err) + } + } + + // Update slices + ksg.setsIDs = append(ksg.setsIDs[:idx], ksg.setsIDs[idx+1:]...) + ksg.counts = append(ksg.counts[:idx], ksg.counts[idx+1:]...) + ksg.setsMetadata = append(ksg.setsMetadata[:idx], ksg.setsMetadata[idx+1:]...) + ksg.n-- + + return ksg.saveMetadata() +} + +// CopySetsByIDTo copies sets identified by their IDs into a KmerSetGroup +// at destDir. If destDir does not exist, a new compatible empty group is +// created. If it exists, compatibility (k, m, partitions) is checked. +// If a set ID already exists in the destination, an error is returned +// unless force is true (in which case the existing set is replaced). +// Per-set metadata travels with the set. +func (ksg *KmerSetGroup) CopySetsByIDTo(ids []string, destDir string, force bool) (*KmerSetGroup, error) { + // Resolve source IDs to indices + srcIndices := make([]int, len(ids)) + for i, id := range ids { + idx := ksg.IndexOfSetID(id) + if idx < 0 { + return nil, fmt.Errorf("obikmer: source set ID %q not found", id) + } + srcIndices[i] = idx + } + + // Open or create destination + var dest *KmerSetGroup + metaPath := filepath.Join(destDir, "metadata.toml") + if _, err := os.Stat(metaPath); err == nil { + // Destination exists + dest, err = OpenKmerSetGroup(destDir) + if err != nil { + return nil, fmt.Errorf("obikmer: open destination: %w", err) + } + if !ksg.IsCompatibleWith(dest) { + return nil, fmt.Errorf("obikmer: incompatible groups: source (k=%d, m=%d, P=%d) vs dest (k=%d, m=%d, P=%d)", + ksg.k, ksg.m, ksg.partitions, dest.k, dest.m, dest.partitions) + } + } else { + // Create new destination + var err error + dest, err = ksg.NewEmptyCompatible(destDir) + if err != nil { + return nil, err + } + } + + // Copy each set + for i, srcIdx := range srcIndices { + srcID := ids[i] + + // Check for ID conflict in destination + existingIdx := dest.IndexOfSetID(srcID) + if existingIdx >= 0 { + if !force { + return nil, fmt.Errorf("obikmer: set ID %q already exists in destination (use force to replace)", srcID) + } + // Force: remove existing set in destination + if err := dest.RemoveSetByID(srcID); err != nil { + return nil, fmt.Errorf("obikmer: remove existing set %q in destination: %w", srcID, err) + } + } + + // Destination set index = current dest size + destIdx := dest.n + + // Create destination set directory + destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", destIdx)) + if err := os.MkdirAll(destSetDir, 0755); err != nil { + return nil, fmt.Errorf("obikmer: create dest set dir: %w", err) + } + + // Copy all partition files + for p := 0; p < ksg.partitions; p++ { + srcPath := ksg.partitionPath(srcIdx, p) + destPath := dest.partitionPath(destIdx, p) + if err := copyFile(srcPath, destPath); err != nil { + return nil, fmt.Errorf("obikmer: copy partition %d of set %q: %w", p, srcID, err) + } + } + + // Update destination metadata + dest.setsIDs = append(dest.setsIDs, srcID) + dest.counts = append(dest.counts, ksg.counts[srcIdx]) + + // Copy per-set metadata + srcMeta := ksg.AllSetMetadata(srcIdx) + if srcMeta == nil { + srcMeta = make(map[string]interface{}) + } + dest.setsMetadata = append(dest.setsMetadata, srcMeta) + dest.n++ + } + + if err := dest.saveMetadata(); err != nil { + return nil, fmt.Errorf("obikmer: save destination metadata: %w", err) + } + + return dest, nil +} + +// copyFile copies a file from src to dst. +func copyFile(src, dst string) error { + in, err := os.Open(src) + if err != nil { + return err + } + defer in.Close() + + out, err := os.Create(dst) + if err != nil { + return err + } + defer out.Close() + + if _, err := io.Copy(out, in); err != nil { + return err + } + + return out.Close() +} diff --git a/pkg/obioptions/options.go b/pkg/obioptions/options.go index aaf45f9..5109ac3 100644 --- a/pkg/obioptions/options.go +++ b/pkg/obioptions/options.go @@ -26,16 +26,11 @@ var __defaut_taxonomy_mutex__ sync.Mutex type ArgumentParser func([]string) (*getoptions.GetOpt, []string) -func GenerateOptionParser(program string, - documentation string, - optionset ...func(*getoptions.GetOpt)) ArgumentParser { - - options := getoptions.New() - options.Self(program, documentation) - options.SetMode(getoptions.Bundling) - options.SetUnknownMode(getoptions.Fail) - options.Bool("help", false, options.Alias("h", "?")) - +// RegisterGlobalOptions registers the global options shared by all obitools +// commands onto the given GetOpt instance. It does NOT register --help, +// which must be handled by the caller (either as a Bool option or via +// HelpCommand for subcommand-based parsers). +func RegisterGlobalOptions(options *getoptions.GetOpt) { options.Bool("version", false, options.Description("Prints the version and exits.")) @@ -46,17 +41,10 @@ func GenerateOptionParser(program string, options.BoolVar(&_Pprof, "pprof", false, options.Description("Enable pprof server. Look at the log for details.")) - // options.IntVar(&_ParallelWorkers, "workers", _ParallelWorkers, - // options.Alias("w"), - // options.Description("Number of parallele threads computing the result")) - options.IntVar(obidefault.MaxCPUPtr(), "max-cpu", obidefault.MaxCPU(), options.GetEnv("OBIMAXCPU"), options.Description("Number of parallele threads computing the result")) - // options.BoolVar(&_Pprof, "force-one-cpu", false, - // options.Description("Force to use only one cpu core for parallel processing")) - options.IntVar(&_PprofMudex, "pprof-mutex", _PprofMudex, options.GetEnv("OBIPPROFMUTEX"), options.Description("Enable profiling of mutex lock.")) @@ -77,119 +65,119 @@ func GenerateOptionParser(program string, options.GetEnv("OBIWARNING"), options.Description("Stop printing of the warning message"), ) +} + +// ProcessParsedOptions handles the post-parse logic common to all obitools +// commands: help, version, debug, pprof, taxonomy, cpu configuration, etc. +// It receives the GetOpt instance and the parse error (if any). +func ProcessParsedOptions(options *getoptions.GetOpt, parseErr error) { + // Note: "help" may not be registered as a Bool (e.g. when using HelpCommand + // for subcommand-based parsers). Only check if it won't panic. + // We use a recover guard to be safe. + func() { + defer func() { recover() }() + if options.Called("help") { + fmt.Fprint(os.Stderr, options.Help()) + os.Exit(0) + } + }() + + if options.Called("version") { + fmt.Fprintf(os.Stderr, "OBITools %s\n", VersionString()) + os.Exit(0) + } + + if options.Called("taxonomy") { + __defaut_taxonomy_mutex__.Lock() + defer __defaut_taxonomy_mutex__.Unlock() + taxonomy, err := obiformats.LoadTaxonomy( + obidefault.SelectedTaxonomy(), + !obidefault.AreAlternativeNamesSelected(), + SeqAsTaxa(), + ) + + if err != nil { + log.Fatalf("Cannot load default taxonomy: %v", err) + } + + taxonomy.SetAsDefault() + } + + log.SetLevel(log.InfoLevel) + if options.Called("debug") { + log.SetLevel(log.DebugLevel) + log.Debugln("Switch to debug level logging") + } + + if options.Called("pprof") { + url := "localhost:6060" + go http.ListenAndServe(url, nil) + log.Infof("Start a pprof server at address %s/debug/pprof", url) + log.Info("Profil can be followed running concurrently the command :") + log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/profile?seconds=30'") + } + + if options.Called("pprof-mutex") { + url := "localhost:6060" + go http.ListenAndServe(url, nil) + runtime.SetMutexProfileFraction(_PprofMudex) + log.Infof("Start a pprof server at address %s/debug/pprof", url) + log.Info("Profil can be followed running concurrently the command :") + log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/mutex'") + } + + if options.Called("pprof-goroutine") { + url := "localhost:6060" + go http.ListenAndServe(url, nil) + runtime.SetBlockProfileRate(_PprofGoroutine) + log.Infof("Start a pprof server at address %s/debug/pprof", url) + log.Info("Profil can be followed running concurrently the command :") + log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'") + } + + // Handle user errors + if parseErr != nil { + fmt.Fprintf(os.Stderr, "ERROR: %s\n\n", parseErr) + fmt.Fprint(os.Stderr, options.Help(getoptions.HelpSynopsis)) + os.Exit(1) + } + + runtime.GOMAXPROCS(obidefault.MaxCPU()) + + if options.Called("max-cpu") { + log.Printf("CPU number limited to %d", obidefault.MaxCPU()) + } + + if options.Called("no-singleton") { + log.Printf("No singleton option set") + } + + log.Printf("Number of workers set %d", obidefault.ParallelWorkers()) + + if options.Called("solexa") { + obidefault.SetReadQualitiesShift(64) + } +} + +func GenerateOptionParser(program string, + documentation string, + optionset ...func(*getoptions.GetOpt)) ArgumentParser { + + options := getoptions.New() + options.Self(program, documentation) + options.SetMode(getoptions.Bundling) + options.SetUnknownMode(getoptions.Fail) + options.Bool("help", false, options.Alias("h", "?")) + + RegisterGlobalOptions(options) for _, o := range optionset { o(options) } return func(args []string) (*getoptions.GetOpt, []string) { - remaining, err := options.Parse(args[1:]) - - if options.Called("help") { - fmt.Fprint(os.Stderr, options.Help()) - os.Exit(0) - } - - if options.Called("version") { - fmt.Fprintf(os.Stderr, "OBITools %s\n", VersionString()) - os.Exit(0) - } - - if options.Called("taxonomy") { - __defaut_taxonomy_mutex__.Lock() - defer __defaut_taxonomy_mutex__.Unlock() - taxonomy, err := obiformats.LoadTaxonomy( - obidefault.SelectedTaxonomy(), - !obidefault.AreAlternativeNamesSelected(), - SeqAsTaxa(), - ) - - if err != nil { - log.Fatalf("Cannot load default taxonomy: %v", err) - - } - - taxonomy.SetAsDefault() - } - - log.SetLevel(log.InfoLevel) - if options.Called("debug") { - log.SetLevel(log.DebugLevel) - log.Debugln("Switch to debug level logging") - } - - if options.Called("pprof") { - url := "localhost:6060" - go http.ListenAndServe(url, nil) - log.Infof("Start a pprof server at address %s/debug/pprof", url) - log.Info("Profil can be followed running concurrently the command :") - log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/profile?seconds=30'") - } - - if options.Called("pprof-mutex") { - url := "localhost:6060" - go http.ListenAndServe(url, nil) - runtime.SetMutexProfileFraction(_PprofMudex) - log.Infof("Start a pprof server at address %s/debug/pprof", url) - log.Info("Profil can be followed running concurrently the command :") - log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/mutex'") - } - - if options.Called("pprof-goroutine") { - url := "localhost:6060" - go http.ListenAndServe(url, nil) - runtime.SetBlockProfileRate(_PprofGoroutine) - log.Infof("Start a pprof server at address %s/debug/pprof", url) - log.Info("Profil can be followed running concurrently the command :") - log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'") - } - - // Handle user errors - if err != nil { - fmt.Fprintf(os.Stderr, "ERROR: %s\n\n", err) - fmt.Fprint(os.Stderr, options.Help(getoptions.HelpSynopsis)) - os.Exit(1) - } - - // // Setup the maximum number of CPU usable by the program - // if obidefault.MaxCPU() == 1 { - // log.Warn("Limitating the Maximum number of CPU to 1 is not recommanded") - // log.Warn("The number of CPU requested has been set to 2") - // obidefault.SetMaxCPU(2) - // } - - // if options.Called("force-one-cpu") { - // log.Warn("Limitating the Maximum number of CPU to 1 is not recommanded") - // log.Warn("The number of CPU has been forced to 1") - // log.Warn("This can lead to unexpected behavior") - // obidefault.SetMaxCPU(1) - // } - - runtime.GOMAXPROCS(obidefault.MaxCPU()) - - // if options.Called("max-cpu") || options.Called("force-one-cpu") { - // log.Printf("CPU number limited to %d", obidefault.MaxCPU()) - // } - - if options.Called("max-cpu") { - log.Printf("CPU number limited to %d", obidefault.MaxCPU()) - } - - if options.Called("no-singleton") { - log.Printf("No singleton option set") - } - - log.Printf("Number of workers set %d", obidefault.ParallelWorkers()) - - // if options.Called("workers") { - - // } - - if options.Called("solexa") { - obidefault.SetReadQualitiesShift(64) - } - + ProcessParsedOptions(options, err) return options, remaining } } diff --git a/pkg/obioptions/subcommand.go b/pkg/obioptions/subcommand.go new file mode 100644 index 0000000..bd55d3a --- /dev/null +++ b/pkg/obioptions/subcommand.go @@ -0,0 +1,43 @@ +package obioptions + +import ( + "github.com/DavidGamba/go-getoptions" +) + +// GenerateSubcommandParser creates an option parser that supports subcommands +// via go-getoptions' NewCommand/SetCommandFn/Dispatch API. +// +// The setup function receives the root *GetOpt and should register subcommands +// using opt.NewCommand(). Global options (--debug, --max-cpu, etc.) are +// registered before setup is called and are inherited by all subcommands. +// +// Returns the root *GetOpt (needed for Dispatch) and an ArgumentParser +// that handles parsing and post-parse processing. +func GenerateSubcommandParser( + program string, + documentation string, + setup func(opt *getoptions.GetOpt), +) (*getoptions.GetOpt, ArgumentParser) { + + options := getoptions.New() + options.Self(program, documentation) + options.SetMode(getoptions.Bundling) + options.SetUnknownMode(getoptions.Fail) + + // Register global options (inherited by all subcommands) + RegisterGlobalOptions(options) + + // Let the caller register subcommands + setup(options) + + // Add automatic help subcommand (must be after all commands) + options.HelpCommand("help", options.Description("Show help for a command")) + + parser := func(args []string) (*getoptions.GetOpt, []string) { + remaining, err := options.Parse(args[1:]) + ProcessParsedOptions(options, err) + return options, remaining + } + + return options, parser +} diff --git a/pkg/obitools/obik/cp.go b/pkg/obitools/obik/cp.go new file mode 100644 index 0000000..ada0e61 --- /dev/null +++ b/pkg/obitools/obik/cp.go @@ -0,0 +1,55 @@ +package obik + +import ( + "context" + "fmt" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "github.com/DavidGamba/go-getoptions" +) + +func runCp(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + if len(args) < 2 { + return fmt.Errorf("usage: obik cp [--set PATTERN]... [--force] ") + } + + srcDir := args[0] + destDir := args[1] + + ksg, err := obikmer.OpenKmerSetGroup(srcDir) + if err != nil { + return fmt.Errorf("failed to open source kmer index: %w", err) + } + + // Resolve set patterns + patterns := CLISetPatterns() + var ids []string + if len(patterns) > 0 { + indices, err := ksg.MatchSetIDs(patterns) + if err != nil { + return err + } + if len(indices) == 0 { + return fmt.Errorf("no sets match the given patterns") + } + ids = make([]string, len(indices)) + for i, idx := range indices { + ids[i] = ksg.SetIDOf(idx) + } + } else { + // Copy all sets + ids = ksg.SetsIDs() + } + + log.Infof("Copying %d set(s) from %s to %s", len(ids), srcDir, destDir) + + dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce()) + if err != nil { + return err + } + + log.Infof("Destination now has %d set(s)", dest.Size()) + return nil +} diff --git a/pkg/obitools/obik/index.go b/pkg/obitools/obik/index.go new file mode 100644 index 0000000..a1a3e03 --- /dev/null +++ b/pkg/obitools/obik/index.go @@ -0,0 +1,110 @@ +package obik + +import ( + "context" + "fmt" + "os" + "path/filepath" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + outDir := CLIOutputDirectory() + if outDir == "" || outDir == "-" { + return fmt.Errorf("--out option is required and must specify a directory path") + } + + k := CLIKmerSize() + if k < 2 || k > 31 { + return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k) + } + + m := CLIMinimizerSize() + + minOcc := CLIMinOccurrence() + if minOcc < 1 { + return fmt.Errorf("invalid min-occurrence: %d (must be >= 1)", minOcc) + } + + // Build options + var opts []obikmer.BuilderOption + if minOcc > 1 { + opts = append(opts, obikmer.WithMinFrequency(minOcc)) + } + + // Determine whether to append to existing group or create new + var builder *obikmer.KmerSetGroupBuilder + var err error + metaPath := filepath.Join(outDir, "metadata.toml") + if _, statErr := os.Stat(metaPath); statErr == nil { + // Existing group: append + log.Infof("Appending to existing kmer index at %s", outDir) + builder, err = obikmer.AppendKmerSetGroupBuilder(outDir, 1, opts...) + if err != nil { + return fmt.Errorf("failed to open existing kmer index for appending: %w", err) + } + } else { + // New group + log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc) + builder, err = obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...) + if err != nil { + return fmt.Errorf("failed to create kmer index builder: %w", err) + } + } + + // Read and process sequences + sequences, err := obiconvert.CLIReadBioSequences(args...) + if err != nil { + return fmt.Errorf("failed to open sequence files: %w", err) + } + + seqCount := 0 + for sequences.Next() { + batch := sequences.Get() + for _, seq := range batch.Slice() { + builder.AddSequence(0, seq) + seqCount++ + } + } + + log.Infof("Processed %d sequences", seqCount) + + // Finalize + ksg, err := builder.Close() + if err != nil { + return fmt.Errorf("failed to finalize kmer index: %w", err) + } + + // Apply index-id to the new set + newSetIdx := builder.StartIndex() + if id := CLIIndexId(); id != "" { + ksg.SetSetID(newSetIdx, id) + } + + // Apply group-level tags (-S) + for key, value := range CLISetTag() { + ksg.SetAttribute(key, value) + } + + // Apply per-set tags (-T) to the new set + for key, value := range _setMetaTags { + ksg.SetSetMetadata(newSetIdx, key, value) + } + + if minOcc > 1 { + ksg.SetAttribute("min_occurrence", minOcc) + } + + if err := ksg.SaveMetadata(); err != nil { + return fmt.Errorf("failed to save metadata: %w", err) + } + + log.Infof("Index contains %d k-mers for set %d in %s", ksg.Len(newSetIdx), newSetIdx, outDir) + log.Info("Done.") + return nil +} diff --git a/pkg/obitools/obilowmask/obilowmask.go b/pkg/obitools/obik/lowmask.go similarity index 70% rename from pkg/obitools/obilowmask/obilowmask.go rename to pkg/obitools/obik/lowmask.go index 893cab2..014a160 100644 --- a/pkg/obitools/obilowmask/obilowmask.go +++ b/pkg/obitools/obik/lowmask.go @@ -1,39 +1,80 @@ -package obilowmask +package obik import ( + "context" "fmt" "math" + "strings" + + log "github.com/sirupsen/logrus" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" + "github.com/DavidGamba/go-getoptions" ) // MaskingMode defines how to handle low-complexity regions type MaskingMode int const ( - Mask MaskingMode = iota // Mask mode: replace low-complexity regions with masked characters - Split // Split mode: split sequence into high-complexity fragments - Extract + MaskMode MaskingMode = iota // Replace low-complexity regions with masked characters + SplitMode // Split sequence into high-complexity fragments + ExtractMode // Extract low-complexity fragments ) -// LowMaskWorker creates a worker to mask low-complexity regions in DNA sequences. -// -// Algorithm principle: -// Calculate the normalized entropy of each k-mer at different scales (wordSize = 1 to level_max). -// K-mers with entropy below the threshold are masked. -// -// Parameters: -// - kmer_size: size of the sliding window for entropy calculation -// - level_max: maximum word size used for entropy calculation (finest scale) -// - threshold: normalized entropy threshold below which masking occurs (between 0 and 1) -// - mode: Mask (masking) or Split (splitting) -// - maskChar: character used for masking (typically 'n' or 'N') -// -// Returns: a SeqWorker function that can be applied to each sequence -func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker { +// Lowmask-specific option variables (separate from index/super kmer-size). +var _lowmaskKmerSize = 31 +var _lowmaskLevelMax = 6 +var _lowmaskThreshold = 0.5 +var _lowmaskSplitMode = false +var _lowmaskLowMode = false +var _lowmaskMaskChar = "." + +// LowMaskOptionSet registers options specific to low-complexity masking. +func LowMaskOptionSet(options *getoptions.GetOpt) { + options.IntVar(&_lowmaskKmerSize, "kmer-size", _lowmaskKmerSize, + options.Description("Size of the kmer considered to estimate entropy.")) + + options.IntVar(&_lowmaskLevelMax, "entropy-size", _lowmaskLevelMax, + options.Description("Maximum word size considered for entropy estimate.")) + + options.Float64Var(&_lowmaskThreshold, "threshold", _lowmaskThreshold, + options.Description("Entropy threshold below which a kmer is masked (0 to 1).")) + + options.BoolVar(&_lowmaskSplitMode, "split-mode", _lowmaskSplitMode, + options.Description("Split sequences to remove masked regions.")) + + options.BoolVar(&_lowmaskLowMode, "low-mode", _lowmaskLowMode, + options.Description("Extract only low-complexity regions.")) + + options.StringVar(&_lowmaskMaskChar, "masking-char", _lowmaskMaskChar, + options.Description("Character used to mask low complexity regions.")) +} + +func lowmaskMaskingMode() MaskingMode { + switch { + case _lowmaskLowMode: + return ExtractMode + case _lowmaskSplitMode: + return SplitMode + default: + return MaskMode + } +} + +func lowmaskMaskingChar() byte { + mask := strings.TrimSpace(_lowmaskMaskChar) + if len(mask) != 1 { + log.Fatalf("--masking-char option accepts a single character, not %s", mask) + } + return []byte(mask)[0] +} + +// lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences. +func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker { nLogN := make([]float64, kmer_size+1) for i := 1; i <= kmer_size; i++ { @@ -87,6 +128,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking data[i] = deque[0].value } } + emaxValues := make([]float64, level_max+1) logNwords := make([]float64, level_max+1) for ws := 1; ws <= level_max; ws++ { @@ -329,7 +371,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking maskPositions := maskAmbiguities(bseq) - mask := make([]int, len(bseq)) + maskFlags := make([]int, len(bseq)) entropies := make([]float64, len(bseq)) for i := range entropies { entropies[i] = 4.0 @@ -343,7 +385,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking for i := range bseq { v := level_max - mask[i] = v + maskFlags[i] = v } for ws := level_max - 1; ws > 0; ws-- { @@ -351,7 +393,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking for i, e2 := range entropies2 { if e2 < entropies[i] { entropies[i] = e2 - mask[i] = ws + maskFlags[i] = ws } } } @@ -367,39 +409,49 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking remove[i] = e <= threshold } - sequence.SetAttribute("mask", mask) + sequence.SetAttribute("mask", maskFlags) sequence.SetAttribute("Entropies", entropies) switch mode { - case Mask: + case MaskMode: return applyMaskMode(sequence, remove, maskChar) - case Split: + case SplitMode: return selectunmasked(sequence, remove) - case Extract: + case ExtractMode: return selectMasked(sequence, remove) } - return nil, fmt.Errorf("Unknown mode %d", mode) + return nil, fmt.Errorf("unknown mode %d", mode) } return masking } -// CLISequenceEntropyMasker creates an iterator that applies entropy masking -// to all sequences in an input iterator. -// -// Uses command-line parameters to configure the worker. -func CLISequenceEntropyMasker(iterator obiiter.IBioSequence) obiiter.IBioSequence { - var newIter obiiter.IBioSequence +// runLowmask implements the "obik lowmask" subcommand. +// It masks low-complexity regions in DNA sequences using entropy-based detection. +func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + kmerSize := _lowmaskKmerSize + levelMax := _lowmaskLevelMax + threshold := _lowmaskThreshold + mode := lowmaskMaskingMode() + maskChar := lowmaskMaskingChar() - worker := LowMaskWorker( - CLIKmerSize(), - CLILevelMax(), - CLIThreshold(), - CLIMaskingMode(), - CLIMaskingChar(), - ) + log.Printf("Low-complexity masking: kmer-size=%d, entropy-size=%d, threshold=%.4f", kmerSize, levelMax, threshold) - newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers()) + sequences, err := obiconvert.CLIReadBioSequences(args...) + if err != nil { + return fmt.Errorf("failed to open sequence files: %w", err) + } - return newIter.FilterEmpty() + worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar) + + masked := sequences.MakeIWorker( + worker, + false, + obidefault.ParallelWorkers(), + ).FilterEmpty() + + obiconvert.CLIWriteBioSequences(masked, true) + obiutils.WaitForLastPipe() + + return nil } diff --git a/pkg/obitools/obik/ls.go b/pkg/obitools/obik/ls.go new file mode 100644 index 0000000..7a06f43 --- /dev/null +++ b/pkg/obitools/obik/ls.go @@ -0,0 +1,96 @@ +package obik + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "github.com/DavidGamba/go-getoptions" + "gopkg.in/yaml.v3" +) + +type setEntry struct { + Index int `json:"index" yaml:"index"` + ID string `json:"id" yaml:"id"` + Count uint64 `json:"count" yaml:"count"` +} + +func runLs(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + if len(args) < 1 { + return fmt.Errorf("usage: obik ls [options] ") + } + + ksg, err := obikmer.OpenKmerSetGroup(args[0]) + if err != nil { + return fmt.Errorf("failed to open kmer index: %w", err) + } + + // Determine which sets to show + patterns := CLISetPatterns() + var indices []int + if len(patterns) > 0 { + indices, err = ksg.MatchSetIDs(patterns) + if err != nil { + return err + } + } else { + indices = make([]int, ksg.Size()) + for i := range indices { + indices[i] = i + } + } + + entries := make([]setEntry, len(indices)) + for i, idx := range indices { + entries[i] = setEntry{ + Index: idx, + ID: ksg.SetIDOf(idx), + Count: ksg.Len(idx), + } + } + + format := CLIOutFormat() + switch format { + case "json": + return outputLsJSON(entries) + case "yaml": + return outputLsYAML(entries) + case "csv": + return outputLsCSV(entries) + default: + return outputLsCSV(entries) + } +} + +func outputLsCSV(entries []setEntry) error { + fmt.Println("index,id,count") + for _, e := range entries { + // Escape commas in ID if needed + id := e.ID + if strings.ContainsAny(id, ",\"") { + id = "\"" + strings.ReplaceAll(id, "\"", "\"\"") + "\"" + } + fmt.Printf("%d,%s,%d\n", e.Index, id, e.Count) + } + return nil +} + +func outputLsJSON(entries []setEntry) error { + data, err := json.MarshalIndent(entries, "", " ") + if err != nil { + return err + } + fmt.Println(string(data)) + return nil +} + +func outputLsYAML(entries []setEntry) error { + data, err := yaml.Marshal(entries) + if err != nil { + return err + } + fmt.Print(string(data)) + return nil +} diff --git a/pkg/obitools/obik/mv.go b/pkg/obitools/obik/mv.go new file mode 100644 index 0000000..6aa2dfd --- /dev/null +++ b/pkg/obitools/obik/mv.go @@ -0,0 +1,63 @@ +package obik + +import ( + "context" + "fmt" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "github.com/DavidGamba/go-getoptions" +) + +func runMv(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + if len(args) < 2 { + return fmt.Errorf("usage: obik mv [--set PATTERN]... [--force] ") + } + + srcDir := args[0] + destDir := args[1] + + ksg, err := obikmer.OpenKmerSetGroup(srcDir) + if err != nil { + return fmt.Errorf("failed to open source kmer index: %w", err) + } + + // Resolve set patterns + patterns := CLISetPatterns() + var ids []string + if len(patterns) > 0 { + indices, err := ksg.MatchSetIDs(patterns) + if err != nil { + return err + } + if len(indices) == 0 { + return fmt.Errorf("no sets match the given patterns") + } + ids = make([]string, len(indices)) + for i, idx := range indices { + ids[i] = ksg.SetIDOf(idx) + } + } else { + // Move all sets + ids = ksg.SetsIDs() + } + + log.Infof("Moving %d set(s) from %s to %s", len(ids), srcDir, destDir) + + // Copy first + dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce()) + if err != nil { + return err + } + + // Remove from source (in reverse order to avoid renumbering issues) + for i := len(ids) - 1; i >= 0; i-- { + if err := ksg.RemoveSetByID(ids[i]); err != nil { + return fmt.Errorf("failed to remove set %q from source after copy: %w", ids[i], err) + } + } + + log.Infof("Destination now has %d set(s), source has %d set(s)", dest.Size(), ksg.Size()) + return nil +} diff --git a/pkg/obitools/obik/obik.go b/pkg/obitools/obik/obik.go new file mode 100644 index 0000000..cdc792e --- /dev/null +++ b/pkg/obitools/obik/obik.go @@ -0,0 +1,64 @@ +package obik + +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +// OptionSet registers all obik subcommands on the root GetOpt. +func OptionSet(opt *getoptions.GetOpt) { + // index: build or extend a kmer index from sequence files + indexCmd := opt.NewCommand("index", "Build a disk-based kmer index from sequence files") + obiconvert.InputOptionSet(indexCmd) + obiconvert.OutputModeOptionSet(indexCmd, false) + KmerIndexOptionSet(indexCmd) + indexCmd.StringMapVar(&_setMetaTags, "tag", 1, 1, + indexCmd.Alias("T"), + indexCmd.ArgName("KEY=VALUE"), + indexCmd.Description("Per-set metadata tag (repeatable).")) + indexCmd.SetCommandFn(runIndex) + + // ls: list sets in a kmer index + lsCmd := opt.NewCommand("ls", "List sets in a kmer index") + OutputFormatOptionSet(lsCmd) + SetSelectionOptionSet(lsCmd) + lsCmd.SetCommandFn(runLs) + + // summary: detailed statistics + summaryCmd := opt.NewCommand("summary", "Show detailed statistics of a kmer index") + OutputFormatOptionSet(summaryCmd) + summaryCmd.BoolVar(&_jaccard, "jaccard", false, + summaryCmd.Description("Compute and display pairwise Jaccard distance matrix.")) + summaryCmd.SetCommandFn(runSummary) + + // cp: copy sets between indices + cpCmd := opt.NewCommand("cp", "Copy sets between kmer indices") + SetSelectionOptionSet(cpCmd) + ForceOptionSet(cpCmd) + cpCmd.SetCommandFn(runCp) + + // mv: move sets between indices + mvCmd := opt.NewCommand("mv", "Move sets between kmer indices") + SetSelectionOptionSet(mvCmd) + ForceOptionSet(mvCmd) + mvCmd.SetCommandFn(runMv) + + // rm: remove sets from an index + rmCmd := opt.NewCommand("rm", "Remove sets from a kmer index") + SetSelectionOptionSet(rmCmd) + rmCmd.SetCommandFn(runRm) + + // super: extract super k-mers from sequences + superCmd := opt.NewCommand("super", "Extract super k-mers from sequence files") + obiconvert.InputOptionSet(superCmd) + obiconvert.OutputOptionSet(superCmd) + SuperKmerOptionSet(superCmd) + superCmd.SetCommandFn(runSuper) + + // lowmask: mask low-complexity regions + lowmaskCmd := opt.NewCommand("lowmask", "Mask low-complexity regions in sequences using entropy") + obiconvert.InputOptionSet(lowmaskCmd) + obiconvert.OutputOptionSet(lowmaskCmd) + LowMaskOptionSet(lowmaskCmd) + lowmaskCmd.SetCommandFn(runLowmask) +} diff --git a/pkg/obitools/obikindex/options.go b/pkg/obitools/obik/options.go similarity index 62% rename from pkg/obitools/obikindex/options.go rename to pkg/obitools/obik/options.go index fd1f094..a7766dd 100644 --- a/pkg/obitools/obikindex/options.go +++ b/pkg/obitools/obik/options.go @@ -1,4 +1,4 @@ -package obikindex +package obik import ( "strings" @@ -11,7 +11,27 @@ import ( "github.com/DavidGamba/go-getoptions" ) -// Private variables for storing option values +// Output format flags +var _jsonOutput bool +var _csvOutput bool +var _yamlOutput bool + +// Set selection flags +var _setPatterns []string + +// Force flag +var _force bool + +// Jaccard flag +var _jaccard bool + +// Per-set tags for index subcommand +var _setMetaTags = make(map[string]string, 0) + +// ============================== +// Kmer index building options (moved from obikindex) +// ============================== + var _kmerSize = 31 var _minimizerSize = -1 // -1 means auto: ceil(k / 2.5) var _indexId = "" @@ -39,7 +59,7 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) { options.StringMapVar(&_setTag, "set-tag", 1, 1, options.Alias("S"), options.ArgName("KEY=VALUE"), - options.Description("Adds a metadata attribute KEY with value VALUE to the index.")) + options.Description("Adds a group-level metadata attribute KEY with value VALUE.")) options.IntVar(&_minOccurrence, "min-occurrence", _minOccurrence, options.Description("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all).")) @@ -48,23 +68,12 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) { options.Description("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index.")) } -// OptionSet adds to the basic option set every option declared for -// the obikindex command. -func OptionSet(options *getoptions.GetOpt) { - obiconvert.InputOptionSet(options) - obiconvert.OutputModeOptionSet(options, false) - KmerIndexOptionSet(options) -} - // CLIKmerSize returns the k-mer size. func CLIKmerSize() int { return _kmerSize } // CLIMinimizerSize returns the effective minimizer size. -// If -1 (auto), computes ceil(k / 2.5) then applies constraints: -// - minimum: ceil(log(nworkers) / log(4)) -// - maximum: k - 1 func CLIMinimizerSize() int { m := _minimizerSize if m < 0 { @@ -95,7 +104,7 @@ func CLIMetadataFormat() obikmer.MetadataFormat { } } -// CLISetTag returns the metadata key=value pairs. +// CLISetTag returns the group-level metadata key=value pairs. func CLISetTag() map[string]string { return _setTag } @@ -129,3 +138,52 @@ func SetMinimizerSize(m int) { func SetMinOccurrence(n int) { _minOccurrence = n } + +// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output. +func OutputFormatOptionSet(options *getoptions.GetOpt) { + options.BoolVar(&_jsonOutput, "json-output", false, + options.Description("Print results as JSON.")) + options.BoolVar(&_csvOutput, "csv-output", false, + options.Description("Print results as CSV.")) + options.BoolVar(&_yamlOutput, "yaml-output", false, + options.Description("Print results as YAML.")) +} + +// CLIOutFormat returns the selected output format: "json", "csv", "yaml", or "text". +func CLIOutFormat() string { + if _jsonOutput { + return "json" + } + if _csvOutput { + return "csv" + } + if _yamlOutput { + return "yaml" + } + return "text" +} + +// SetSelectionOptionSet registers --set (repeatable). +func SetSelectionOptionSet(options *getoptions.GetOpt) { + options.StringSliceVar(&_setPatterns, "set", 1, 1, + options.Alias("s"), + options.ArgName("PATTERN"), + options.Description("Set ID or glob pattern (repeatable, supports *, ?, [...]).")) +} + +// CLISetPatterns returns the --set patterns provided by the user. +func CLISetPatterns() []string { + return _setPatterns +} + +// ForceOptionSet registers --force / -f. +func ForceOptionSet(options *getoptions.GetOpt) { + options.BoolVar(&_force, "force", false, + options.Alias("f"), + options.Description("Force operation even if set ID already exists in destination.")) +} + +// CLIForce returns whether --force was specified. +func CLIForce() bool { + return _force +} diff --git a/pkg/obitools/obik/rm.go b/pkg/obitools/obik/rm.go new file mode 100644 index 0000000..77f0f85 --- /dev/null +++ b/pkg/obitools/obik/rm.go @@ -0,0 +1,56 @@ +package obik + +import ( + "context" + "fmt" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "github.com/DavidGamba/go-getoptions" +) + +func runRm(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + if len(args) < 1 { + return fmt.Errorf("usage: obik rm --set PATTERN [--set PATTERN]... ") + } + + patterns := CLISetPatterns() + if len(patterns) == 0 { + return fmt.Errorf("--set is required (specify which sets to remove)") + } + + indexDir := args[0] + + ksg, err := obikmer.OpenKmerSetGroup(indexDir) + if err != nil { + return fmt.Errorf("failed to open kmer index: %w", err) + } + + indices, err := ksg.MatchSetIDs(patterns) + if err != nil { + return err + } + if len(indices) == 0 { + return fmt.Errorf("no sets match the given patterns") + } + + // Collect IDs before removal (indices shift as we remove) + ids := make([]string, len(indices)) + for i, idx := range indices { + ids[i] = ksg.SetIDOf(idx) + } + + log.Infof("Removing %d set(s) from %s", len(ids), indexDir) + + // Remove in reverse order to avoid renumbering issues + for i := len(ids) - 1; i >= 0; i-- { + if err := ksg.RemoveSetByID(ids[i]); err != nil { + return fmt.Errorf("failed to remove set %q: %w", ids[i], err) + } + log.Infof("Removed set %q", ids[i]) + } + + log.Infof("Index now has %d set(s)", ksg.Size()) + return nil +} diff --git a/pkg/obitools/obik/summary.go b/pkg/obitools/obik/summary.go new file mode 100644 index 0000000..3cd20de --- /dev/null +++ b/pkg/obitools/obik/summary.go @@ -0,0 +1,148 @@ +package obik + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "github.com/DavidGamba/go-getoptions" + "gopkg.in/yaml.v3" +) + +type setSummary struct { + Index int `json:"index" yaml:"index"` + ID string `json:"id" yaml:"id"` + Count uint64 `json:"count" yaml:"count"` + DiskSize int64 `json:"disk_bytes" yaml:"disk_bytes"` + Metadata map[string]interface{} `json:"metadata,omitempty" yaml:"metadata,omitempty"` +} + +type groupSummary struct { + Path string `json:"path" yaml:"path"` + ID string `json:"id,omitempty" yaml:"id,omitempty"` + K int `json:"k" yaml:"k"` + M int `json:"m" yaml:"m"` + Partitions int `json:"partitions" yaml:"partitions"` + TotalSets int `json:"total_sets" yaml:"total_sets"` + TotalKmers uint64 `json:"total_kmers" yaml:"total_kmers"` + TotalDisk int64 `json:"total_disk_bytes" yaml:"total_disk_bytes"` + Metadata map[string]interface{} `json:"metadata,omitempty" yaml:"metadata,omitempty"` + Sets []setSummary `json:"sets" yaml:"sets"` + Jaccard [][]float64 `json:"jaccard,omitempty" yaml:"jaccard,omitempty"` +} + +func runSummary(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + if len(args) < 1 { + return fmt.Errorf("usage: obik summary [options] ") + } + + ksg, err := obikmer.OpenKmerSetGroup(args[0]) + if err != nil { + return fmt.Errorf("failed to open kmer index: %w", err) + } + + summary := groupSummary{ + Path: ksg.Path(), + ID: ksg.Id(), + K: ksg.K(), + M: ksg.M(), + Partitions: ksg.Partitions(), + TotalSets: ksg.Size(), + TotalKmers: ksg.Len(), + Metadata: ksg.Metadata, + Sets: make([]setSummary, ksg.Size()), + } + + var totalDisk int64 + for i := 0; i < ksg.Size(); i++ { + diskSize := computeSetDiskSize(ksg, i) + totalDisk += diskSize + summary.Sets[i] = setSummary{ + Index: i, + ID: ksg.SetIDOf(i), + Count: ksg.Len(i), + DiskSize: diskSize, + Metadata: ksg.AllSetMetadata(i), + } + } + summary.TotalDisk = totalDisk + + // Jaccard matrix + if _jaccard && ksg.Size() > 1 { + dm := ksg.JaccardDistanceMatrix() + n := ksg.Size() + matrix := make([][]float64, n) + for i := 0; i < n; i++ { + matrix[i] = make([]float64, n) + for j := 0; j < n; j++ { + if i == j { + matrix[i][j] = 0 + } else { + matrix[i][j] = dm.Get(i, j) + } + } + } + summary.Jaccard = matrix + } + + format := CLIOutFormat() + switch format { + case "json": + return outputSummaryJSON(summary) + case "yaml": + return outputSummaryYAML(summary) + case "csv": + return outputSummaryCSV(summary) + default: + return outputSummaryJSON(summary) + } +} + +func computeSetDiskSize(ksg *obikmer.KmerSetGroup, setIndex int) int64 { + var total int64 + for p := 0; p < ksg.Partitions(); p++ { + path := ksg.PartitionPath(setIndex, p) + info, err := os.Stat(path) + if err != nil { + continue + } + total += info.Size() + } + // Also count the set directory entry itself + setDir := filepath.Join(ksg.Path(), fmt.Sprintf("set_%d", setIndex)) + entries, err := os.ReadDir(setDir) + if err == nil { + // We already counted .kdi files above; this is just for completeness + _ = entries + } + return total +} + +func outputSummaryJSON(summary groupSummary) error { + data, err := json.MarshalIndent(summary, "", " ") + if err != nil { + return err + } + fmt.Println(string(data)) + return nil +} + +func outputSummaryYAML(summary groupSummary) error { + data, err := yaml.Marshal(summary) + if err != nil { + return err + } + fmt.Print(string(data)) + return nil +} + +func outputSummaryCSV(summary groupSummary) error { + fmt.Println("index,id,count,disk_bytes") + for _, s := range summary.Sets { + fmt.Printf("%d,%s,%d,%d\n", s.Index, s.ID, s.Count, s.DiskSize) + } + return nil +} diff --git a/pkg/obitools/obik/super.go b/pkg/obitools/obik/super.go new file mode 100644 index 0000000..b6416a4 --- /dev/null +++ b/pkg/obitools/obik/super.go @@ -0,0 +1,64 @@ +package obik + +import ( + "context" + "fmt" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" + "github.com/DavidGamba/go-getoptions" +) + +// Super k-mer specific option variables. +// These reuse _kmerSize and _minimizerSize from options.go since +// only one subcommand runs at a time. + +// SuperKmerOptionSet registers options specific to super k-mer extraction. +func SuperKmerOptionSet(options *getoptions.GetOpt) { + options.IntVar(&_kmerSize, "kmer-size", _kmerSize, + options.Alias("k"), + options.Description("Size of k-mers (must be between m+1 and 31).")) + + options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize, + options.Alias("m"), + options.Description("Size of minimizers (must be between 1 and k-1).")) +} + +// runSuper implements the "obik super" subcommand. +// It extracts super k-mers from DNA sequences. +func runSuper(ctx context.Context, opt *getoptions.GetOpt, args []string) error { + k := _kmerSize + m := _minimizerSize + + if k < 2 || k > 31 { + return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k) + } + + if m < 1 || m >= k { + return fmt.Errorf("invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1) + } + + log.Printf("Extracting super k-mers with k=%d, m=%d", k, m) + + sequences, err := obiconvert.CLIReadBioSequences(args...) + if err != nil { + return fmt.Errorf("failed to open sequence files: %w", err) + } + + worker := obikmer.SuperKmerWorker(k, m) + + superkmers := sequences.MakeIWorker( + worker, + false, + obidefault.ParallelWorkers(), + ) + + obiconvert.CLIWriteBioSequences(superkmers, true) + obiutils.WaitForLastPipe() + + return nil +} diff --git a/pkg/obitools/obikindex/obikindex.go b/pkg/obitools/obikindex/obikindex.go deleted file mode 100644 index 6c504cf..0000000 --- a/pkg/obitools/obikindex/obikindex.go +++ /dev/null @@ -1,91 +0,0 @@ -package obikindex - -import ( - log "github.com/sirupsen/logrus" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" -) - -// CLIBuildKmerIndex reads sequences from the iterator and builds a -// disk-based kmer index using the KmerSetGroupBuilder. -func CLIBuildKmerIndex(iterator obiiter.IBioSequence) { - // Validate output directory - outDir := CLIOutputDirectory() - if outDir == "" || outDir == "-" { - log.Fatalf("Error: --out option is required and must specify a directory path (not stdout)") - } - - // Validate k-mer size - k := CLIKmerSize() - if k < 2 || k > 31 { - log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k) - } - - // Resolve minimizer size - m := CLIMinimizerSize() - - // Validate min-occurrence - minOcc := CLIMinOccurrence() - if minOcc < 1 { - log.Fatalf("Invalid min-occurrence: %d (must be >= 1)", minOcc) - } - - log.Infof("Building kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc) - - // Build options - var opts []obikmer.BuilderOption - if minOcc > 1 { - opts = append(opts, obikmer.WithMinFrequency(minOcc)) - } - - // Create builder (1 set, auto partitions) - builder, err := obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...) - if err != nil { - log.Fatalf("Failed to create kmer index builder: %v", err) - } - - // Feed sequences - seqCount := 0 - for iterator.Next() { - batch := iterator.Get() - for _, seq := range batch.Slice() { - builder.AddSequence(0, seq) - seqCount++ - } - } - - log.Infof("Processed %d sequences", seqCount) - - // Finalize - ksg, err := builder.Close() - if err != nil { - log.Fatalf("Failed to finalize kmer index: %v", err) - } - - // Apply metadata - applyMetadata(ksg) - - if minOcc > 1 { - ksg.SetAttribute("min_occurrence", minOcc) - } - - // Persist metadata updates - if err := ksg.SaveMetadata(); err != nil { - log.Fatalf("Failed to save metadata: %v", err) - } - - log.Infof("Index contains %d k-mers in %s", ksg.Len(0), outDir) - log.Info("Done.") -} - -// applyMetadata sets index-id and --set-tag metadata on a KmerSetGroup. -func applyMetadata(ksg *obikmer.KmerSetGroup) { - if id := CLIIndexId(); id != "" { - ksg.SetId(id) - } - - for key, value := range CLISetTag() { - ksg.SetAttribute(key, value) - } -} diff --git a/pkg/obitools/obilowmask/entropy.qmd b/pkg/obitools/obilowmask/entropy.qmd deleted file mode 100644 index e851a57..0000000 --- a/pkg/obitools/obilowmask/entropy.qmd +++ /dev/null @@ -1,332 +0,0 @@ -```{r} -library(tidyverse) -``` - -```{r} -x <- sample(1:4096, 29, replace=TRUE) -``` - -```{r} -emax <- function(lseq,word_size) { - nword = lseq - word_size + 1 - nalpha = 4^word_size - - if (nalpha < nword) { - cov = nword %/% nalpha - remains = nword %% nalpha - f1 = cov/nword - f2 = (cov+1)/nword - print(c(nalpha - remains,f1,remains,f2)) - e = -(nalpha - remains) * f1 * log(f1) - - remains * f2 * log(f2) - } else { - e = log(nword) - } - - e -} -``` - -```{r} -ec <- function(data,kmer_size) { - table <- table(data) - s <- sum(table) - e <- sum(table * log(table))/s - ed <- log(s) - e - - em <- emax(s+kmer_size-1,kmer_size) - - ed/em -} -``` - -```{r} -ef <- function(data,kmer_size) { - table <- table(data) - s <- sum(table) - f <- table / s - - f <- as.numeric(f) - f <- f[f > 0] - - em <- emax(s+kmer_size-1,kmer_size) - ed <- -sum(f * log(f)) - - print(c(ed,em,ed/em)) - - ed/em -} -``` - -```{r} -okmer <- function(data,kmer_size) { - str_sub(data,1:(nchar(data)-kmer_size+1)) %>% - str_sub(1,kmer_size) -} -``` - -```{r} -# Normalisation circulaire: retourne le plus petit k-mer par rotation circulaire -normalize_circular <- function(kmer) { - if (nchar(kmer) == 0) return(kmer) - - canonical <- kmer - n <- nchar(kmer) - - # Tester toutes les rotations circulaires - for (i in 2:n) { - rotated <- paste0(str_sub(kmer, i, n), str_sub(kmer, 1, i-1)) - if (rotated < canonical) { - canonical <- rotated - } - } - - canonical -} -``` - -```{r} -# Fonction totient d'Euler: compte le nombre d'entiers de 1 à n coprimes avec n -euler_totient <- function(n) { - if (n <= 0) return(0) - - result <- n - p <- 2 - - # Traiter tous les facteurs premiers - while (p * p <= n) { - if (n %% p == 0) { - # Retirer toutes les occurrences de p - while (n %% p == 0) { - n <- n %/% p - } - # Appliquer la formule: φ(n) = n * (1 - 1/p) - result <- result - result %/% p - } - p <- p + 1 - } - - # Si n est toujours > 1, alors c'est un facteur premier - if (n > 1) { - result <- result - result %/% n - } - - result -} -``` - -```{r} -# Retourne tous les diviseurs de n -divisors <- function(n) { - if (n <= 0) return(integer(0)) - - divs <- c() - i <- 1 - while (i * i <= n) { - if (n %% i == 0) { - divs <- c(divs, i) - if (i != n %/% i) { - divs <- c(divs, n %/% i) - } - } - i <- i + 1 - } - - sort(divs) -} -``` - -```{r} -# Compte le nombre de colliers (necklaces) distincts de longueur n -# sur un alphabet de taille a en utilisant la formule de Moreau: -# N(n, a) = (1/n) * Σ φ(d) * a^(n/d) -# où la somme est sur tous les diviseurs d de n, et φ est la fonction totient d'Euler -necklace_count <- function(n, alphabet_size) { - if (n <= 0) return(0) - - divs <- divisors(n) - sum_val <- 0 - - for (d in divs) { - # Calculer alphabet_size^(n/d) - power <- alphabet_size^(n %/% d) - sum_val <- sum_val + euler_totient(d) * power - } - - sum_val %/% n -} -``` - -```{r} -# Nombre de classes d'équivalence pour les k-mers normalisés -# Utilise la formule exacte de Moreau pour compter les colliers (necklaces) -n_normalized_kmers <- function(kmer_size) { - # Valeurs exactes pré-calculées pour k=1 à 6 - if (kmer_size == 1) return(4) - if (kmer_size == 2) return(10) - if (kmer_size == 3) return(24) - if (kmer_size == 4) return(70) - if (kmer_size == 5) return(208) - if (kmer_size == 6) return(700) - - # Pour k > 6, utiliser la formule de Moreau (exacte) - # Alphabet ADN a 4 bases - necklace_count(kmer_size, 4) -} -``` - -```{r} -# Entropie maximale pour k-mers normalisés -enmax <- function(lseq, word_size) { - nword = lseq - word_size + 1 - nalpha = n_normalized_kmers(word_size) - - if (nalpha < nword) { - cov = nword %/% nalpha - remains = nword %% nalpha - f1 = cov/nword - f2 = (cov+1)/nword - e = -(nalpha - remains) * f1 * log(f1) - - remains * f2 * log(f2) - } else { - e = log(nword) - } - - e -} -``` - -```{r} -# Entropie normalisée avec normalisation circulaire des k-mers -ecn <- function(data, kmer_size) { - # Normaliser tous les k-mers - normalized_data <- sapply(data, normalize_circular) - - # Calculer la table des fréquences - table <- table(normalized_data) - s <- sum(table) - e <- sum(table * log(table))/s - ed <- log(s) - e - - # Entropie maximale avec normalisation - em <- enmax(s + kmer_size - 1, kmer_size) - - ed/em -} -``` - -```{r} -k<-'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' -ec(okmer(k,1),1) -ec(okmer(k,2),2) -ec(okmer(k,3),3) -ec(okmer(k,4),4) -``` - -```{r} -k<-'atatatatatatatatatatatatatatata' -ef(okmer(k,1),1) -ef(okmer(k,2),2) -ef(okmer(k,3),3) -ef(okmer(k,4),4) -``` - -```{r} -k<-'aaaaaaaaaaaaaaaattttttttttttttt' -ef(okmer(k,1),1) -ef(okmer(k,2),2) -ef(okmer(k,3),3) -ef(okmer(k,4),4) -``` - -```{r} -k<-'atgatgatgatgatgatgatgatgatgatga' -ef(okmer(k,1),1) -ef(okmer(k,2),2) -ef(okmer(k,3),3) -ef(okmer(k,4),4) -``` - -```{r} -k<-'atcgatcgatcgatcgatcgatcgatcgact' -ecn(okmer(k,1),1) -ecn(okmer(k,2),2) -ecn(okmer(k,3),3) -ecn(okmer(k,4),4) -``` - -```{r} -k<-paste(sample(rep(c("a","c","g","t"),8),31),collapse="") -k <- "actatggcaagtcgtaaccgcgcttatcagg" -ecn(okmer(k,1),1) -ecn(okmer(k,2),2) -ecn(okmer(k,3),3) -ecn(okmer(k,4),4) -``` - -aattaaaaaaacaagataaaataatattttt - -```{r} -k<-'aattaaaaaaacaagataaaataatattttt' -ecn(okmer(k,1),1) -ecn(okmer(k,2),2) -ecn(okmer(k,3),3) -ecn(okmer(k,4),4) -``` - -atg tga gat ,,,, - -cat tca atc - -tgatgatgatgatgatgatgatgatgatg - -## Tests de normalisation circulaire - -```{r} -# Test de la fonction de normalisation -normalize_circular("ca") # devrait donner "ac" -normalize_circular("tgca") # devrait donner "atgc" -normalize_circular("acgt") # devrait donner "acgt" -``` - -```{r} -# Comparaison ec vs ecn sur une séquence répétitive -# Les k-mers "atg", "tga", "gat" sont équivalents par rotation -k <- 'atgatgatgatgatgatgatgatgatgatga' -cat("Séquence:", k, "\n") -cat("ec(k,3) =", ec(okmer(k,3),3), "\n") -cat("ecn(k,3) =", ecn(okmer(k,3),3), "\n") -``` - -```{r} -# Comparaison sur séquence aléatoire -k <- "actatggcaagtcgtaaccgcgcttatcagg" -cat("Séquence:", k, "\n") -cat("Sans normalisation:\n") -cat(" ec(k,2) =", ec(okmer(k,2),2), "\n") -cat(" ec(k,3) =", ec(okmer(k,3),3), "\n") -cat(" ec(k,4) =", ec(okmer(k,4),4), "\n") -cat("Avec normalisation circulaire:\n") -cat(" ecn(k,2) =", ecn(okmer(k,2),2), "\n") -cat(" ecn(k,3) =", ecn(okmer(k,3),3), "\n") -cat(" ecn(k,4) =", ecn(okmer(k,4),4), "\n") -``` - -```{r} - -sequence <- "ttcatcactcagcaatcctgaatgatGAGAGCTTTTTTTTTTTATATATATATATATGTATATGTATGAAATACACTtatgctccgtttgtttcgccgtaa" -re <- rev(c(0.8108602271901116,0.8108602271901116,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.7800272339058549,0.7800272339058549,0.7751610144606091,0.7751610144606091,0.7751610144606091,0.764858185548322,0.7325526601302021,0.7137620699527615,0.6789199521982864,0.6584536373623372,0.634002687184193,0.6075290415873623,0.5785545803330997,0.5785545803330997,0.5503220289212184,0.5315314387437778,0.4966893209893028,0.46077361820145696,0.42388221293245526,0.4009547969713408,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.390029016052137,0.42781461756157363,0.45192285937059073,0.47238917420654,0.47238917420654,0.47238917420654,0.5092805794755417,0.5451962822633876,0.5800384000178626,0.602395141014297,0.6046146614886381,0.6046146614886381,0.6119084258128231,0.6119084258128231,0.6214217106113492,0.6424704346756562,0.6482381543085467,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6828444721058894,0.6950205907027562,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.7208976112999935)) - -di <- c(0.7208976112999935,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6950205907027562,0.6828444721058894,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6482381543085467,0.6424704346756562,0.6214217106113492,0.6119084258128231,0.6119084258128231,0.6046146614886382,0.6046146614886382,0.6023951410142971,0.5800384000178627,0.5451962822633876,0.5092805794755418,0.47238917420654003,0.47238917420654003,0.47238917420654003,0.4519228593705908,0.4278146175615737,0.39002901605213713,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677883,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.40095479697134073,0.42388221293245526,0.46077361820145696,0.4966893209893028,0.5315314387437778,0.5503220289212184,0.5785545803330997,0.5785545803330997,0.6075290415873625,0.6340026871841933,0.6584536373623374,0.6789199521982866,0.7137620699527616,0.7325526601302023,0.7648581855483221,0.7751610144606093,0.7751610144606093,0.7751610144606093,0.7800272339058549,0.7800272339058549,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8108602271901116,0.8108602271901116) - -ebidir <- tibble(direct=di,reverse=re) %>% - mutate(position = 1:length(re), - nucleotide = str_sub(sequence,position,position)) - -ebidir %>% - ggplot(aes(x=position,y=direct)) + - geom_line() + - scale_x_continuous(breaks = ebidir$position, labels = ebidir$nucleotide) + - ylim(0,1)+ - geom_hline(yintercept=0.5, col = "red", linetype = "dashed") -``` \ No newline at end of file diff --git a/pkg/obitools/obilowmask/obilowmask_test.go b/pkg/obitools/obilowmask/obilowmask_test.go deleted file mode 100644 index 4afd764..0000000 --- a/pkg/obitools/obilowmask/obilowmask_test.go +++ /dev/null @@ -1,40 +0,0 @@ -package obilowmask - -import ( - "testing" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" -) - -func TestLowMaskWorker(t *testing.T) { - worker := LowMaskWorker(31, 6, 0.3, Mask, 'n') - - seq := obiseq.NewBioSequence("test", []byte("acgtacgtacgtacgtacgtacgtacgtacgt"), "test") - result, err := worker(seq) - if err != nil { - t.Fatalf("Worker failed: %v", err) - } - - if result.Len() != 1 { - t.Fatalf("Expected 1 sequence, got %d", result.Len()) - } - - resultSeq := result[0] - if resultSeq.Len() != 32 { - t.Fatalf("Expected sequence length 32, got %d", resultSeq.Len()) - } -} - -func TestLowMaskWorkerWithAmbiguity(t *testing.T) { - worker := LowMaskWorker(31, 6, 0.3, Mask, 'n') - - seq := obiseq.NewBioSequence("test", []byte("acgtNcgtacgtacgtacgtacgtacgtacgt"), "test") - result, err := worker(seq) - if err != nil { - t.Fatalf("Worker failed: %v", err) - } - - if result.Len() != 1 { - t.Fatalf("Expected 1 sequence, got %d", result.Len()) - } -} diff --git a/pkg/obitools/obilowmask/options.go b/pkg/obitools/obilowmask/options.go deleted file mode 100644 index 30c1408..0000000 --- a/pkg/obitools/obilowmask/options.go +++ /dev/null @@ -1,81 +0,0 @@ -package obilowmask - -import ( - "strings" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "github.com/DavidGamba/go-getoptions" - - log "github.com/sirupsen/logrus" -) - -var __kmer_size__ = 31 -var __level_max__ = 6 -var __threshold__ = 0.5 -var __split_mode__ = false -var __low_mode__ = false -var __mask__ = "." - -func LowMaskOptionSet(options *getoptions.GetOpt) { - - options.IntVar(&__kmer_size__, "kmer-size", __kmer_size__, - options.Description("Size of the kmer considered to estimate entropy."), - ) - - options.IntVar(&__level_max__, "entropy_size", __level_max__, - options.Description("Maximum word size considered for entropy estimate"), - ) - - options.Float64Var(&__threshold__, "threshold", __threshold__, - options.Description("entropy theshold used to mask a kmer"), - ) - - options.BoolVar(&__split_mode__, "split-mode", __split_mode__, - options.Description("in split mode, input sequences are splitted to remove masked regions"), - ) - - options.BoolVar(&__low_mode__, "low-mode", __low_mode__, - options.Description("in split mode, input sequences are splitted to remove masked regions"), - ) - - options.StringVar(&__mask__, "masking-char", __mask__, - options.Description("Character used to mask low complexity region"), - ) -} - -func OptionSet(options *getoptions.GetOpt) { - LowMaskOptionSet(options) - obiconvert.InputOptionSet(options) - obiconvert.OutputOptionSet(options) -} - -func CLIKmerSize() int { - return __kmer_size__ -} - -func CLILevelMax() int { - return __level_max__ -} - -func CLIThreshold() float64 { - return __threshold__ -} - -func CLIMaskingMode() MaskingMode { - switch { - case __low_mode__: - return Extract - case __split_mode__: - return Split - default: - return Mask - } -} - -func CLIMaskingChar() byte { - mask := strings.TrimSpace(__mask__) - if len(mask) != 1 { - log.Fatalf("--masking-char option accept a single character, not %s", mask) - } - return []byte(mask)[0] -} diff --git a/pkg/obitools/obisuperkmer/obisuperkmer.go b/pkg/obitools/obisuperkmer/obisuperkmer.go deleted file mode 100644 index c332564..0000000 --- a/pkg/obitools/obisuperkmer/obisuperkmer.go +++ /dev/null @@ -1,10 +0,0 @@ -// obisuperkmer function utility package. -// -// The obitools/obisuperkmer package contains every -// function specifically required by the obisuperkmer utility. -// -// The obisuperkmer command extracts super k-mers from DNA sequences. -// A super k-mer is a maximal subsequence where all consecutive k-mers -// share the same minimizer. This decomposition is useful for efficient -// k-mer indexing and analysis in bioinformatics applications. -package obisuperkmer diff --git a/pkg/obitools/obisuperkmer/options.go b/pkg/obitools/obisuperkmer/options.go deleted file mode 100644 index 2f25a8e..0000000 --- a/pkg/obitools/obisuperkmer/options.go +++ /dev/null @@ -1,69 +0,0 @@ -package obisuperkmer - -import ( - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "github.com/DavidGamba/go-getoptions" -) - -// Private variables for storing option values -var _KmerSize = 31 -var _MinimizerSize = 13 - -// SuperKmerOptionSet defines every option related to super k-mer extraction. -// -// The function adds to a CLI every option proposed to the user -// to tune the parameters of the super k-mer extraction algorithm. -// -// Parameters: -// - options: is a pointer to a getoptions.GetOpt instance normally -// produced by the obioptions.GenerateOptionParser function. -func SuperKmerOptionSet(options *getoptions.GetOpt) { - options.IntVar(&_KmerSize, "kmer-size", _KmerSize, - options.Alias("k"), - options.Description("Size of k-mers (must be between m+1 and 31).")) - - options.IntVar(&_MinimizerSize, "minimizer-size", _MinimizerSize, - options.Alias("m"), - options.Description("Size of minimizers (must be between 1 and k-1).")) -} - -// OptionSet adds to the basic option set every option declared for -// the obisuperkmer command. -// -// It takes a pointer to a GetOpt struct as its parameter and does not return anything. -func OptionSet(options *getoptions.GetOpt) { - obiconvert.OptionSet(false)(options) - SuperKmerOptionSet(options) -} - -// CLIKmerSize returns the k-mer size to use for super k-mer extraction. -// -// It does not take any parameters. -// It returns an integer representing the k-mer size. -func CLIKmerSize() int { - return _KmerSize -} - -// SetKmerSize sets the k-mer size for super k-mer extraction. -// -// Parameters: -// - k: the k-mer size (must be between m+1 and 31). -func SetKmerSize(k int) { - _KmerSize = k -} - -// CLIMinimizerSize returns the minimizer size to use for super k-mer extraction. -// -// It does not take any parameters. -// It returns an integer representing the minimizer size. -func CLIMinimizerSize() int { - return _MinimizerSize -} - -// SetMinimizerSize sets the minimizer size for super k-mer extraction. -// -// Parameters: -// - m: the minimizer size (must be between 1 and k-1). -func SetMinimizerSize(m int) { - _MinimizerSize = m -} diff --git a/pkg/obitools/obisuperkmer/superkmer.go b/pkg/obitools/obisuperkmer/superkmer.go deleted file mode 100644 index d4bcf2e..0000000 --- a/pkg/obitools/obisuperkmer/superkmer.go +++ /dev/null @@ -1,59 +0,0 @@ -package obisuperkmer - -import ( - log "github.com/sirupsen/logrus" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" -) - -// CLIExtractSuperKmers extracts super k-mers from an iterator of BioSequences. -// -// This function takes an iterator of BioSequence objects, extracts super k-mers -// from each sequence using the k-mer and minimizer sizes specified by CLI options, -// and returns a new iterator yielding the extracted super k-mers as BioSequence objects. -// -// Each super k-mer is a maximal subsequence where all consecutive k-mers share -// the same minimizer. The resulting BioSequences contain metadata including: -// - minimizer_value: the canonical minimizer value -// - minimizer_seq: the DNA sequence of the minimizer -// - k: the k-mer size used -// - m: the minimizer size used -// - start: starting position in the original sequence -// - end: ending position in the original sequence -// - parent_id: ID of the parent sequence -// -// Parameters: -// - iterator: an iterator yielding BioSequence objects to process. -// -// Returns: -// - An iterator yielding BioSequence objects representing super k-mers. -func CLIExtractSuperKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence { - // Get k-mer and minimizer sizes from CLI options - k := CLIKmerSize() - m := CLIMinimizerSize() - - // Validate parameters - if m < 1 || m >= k { - log.Fatalf("Invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1) - } - - if k < 2 || k > 31 { - log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k) - } - - log.Printf("Extracting super k-mers with k=%d, m=%d", k, m) - - // Create the worker for super k-mer extraction - worker := obikmer.SuperKmerWorker(k, m) - - // Apply the worker to the iterator with parallel processing - newIter := iterator.MakeIWorker( - worker, - false, // don't merge results - obidefault.ParallelWorkers(), - ) - - return newIter -} diff --git a/pkg/obitools/obisuperkmer/superkmer_test.go b/pkg/obitools/obisuperkmer/superkmer_test.go deleted file mode 100644 index 2d94019..0000000 --- a/pkg/obitools/obisuperkmer/superkmer_test.go +++ /dev/null @@ -1,149 +0,0 @@ -package obisuperkmer - -import ( - "testing" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" -) - -func TestCLIExtractSuperKmers(t *testing.T) { - // Create a test sequence - testSeq := obiseq.NewBioSequence( - "test_seq", - []byte("ACGTACGTACGTACGTACGTACGTACGTACGT"), - "", - ) - - // Create a batch with the test sequence - batch := obiseq.NewBioSequenceBatch() - batch.Add(testSeq) - - // Create an iterator from the batch - iterator := obiiter.MakeBioSequenceBatchChannel(1) - go func() { - iterator.Push(batch) - iterator.Close() - }() - - // Set test parameters - SetKmerSize(15) - SetMinimizerSize(7) - - // Extract super k-mers - result := CLIExtractSuperKmers(iterator) - - // Count the number of super k-mers - count := 0 - for result.Next() { - batch := result.Get() - for _, sk := range batch.Slice() { - count++ - - // Verify that the super k-mer has the expected attributes - if !sk.HasAttribute("minimizer_value") { - t.Error("Super k-mer missing 'minimizer_value' attribute") - } - if !sk.HasAttribute("minimizer_seq") { - t.Error("Super k-mer missing 'minimizer_seq' attribute") - } - if !sk.HasAttribute("k") { - t.Error("Super k-mer missing 'k' attribute") - } - if !sk.HasAttribute("m") { - t.Error("Super k-mer missing 'm' attribute") - } - if !sk.HasAttribute("start") { - t.Error("Super k-mer missing 'start' attribute") - } - if !sk.HasAttribute("end") { - t.Error("Super k-mer missing 'end' attribute") - } - if !sk.HasAttribute("parent_id") { - t.Error("Super k-mer missing 'parent_id' attribute") - } - - // Verify attribute values - k, _ := sk.GetIntAttribute("k") - m, _ := sk.GetIntAttribute("m") - - if k != 15 { - t.Errorf("Expected k=15, got k=%d", k) - } - if m != 7 { - t.Errorf("Expected m=7, got m=%d", m) - } - - parentID, _ := sk.GetStringAttribute("parent_id") - if parentID != "test_seq" { - t.Errorf("Expected parent_id='test_seq', got '%s'", parentID) - } - } - } - - if count == 0 { - t.Error("No super k-mers were extracted") - } - - t.Logf("Extracted %d super k-mers from test sequence", count) -} - -func TestOptionGettersAndSetters(t *testing.T) { - // Test initial values - if CLIKmerSize() != 21 { - t.Errorf("Expected default k-mer size 21, got %d", CLIKmerSize()) - } - if CLIMinimizerSize() != 11 { - t.Errorf("Expected default minimizer size 11, got %d", CLIMinimizerSize()) - } - - // Test setters - SetKmerSize(25) - SetMinimizerSize(13) - - if CLIKmerSize() != 25 { - t.Errorf("SetKmerSize failed: expected 25, got %d", CLIKmerSize()) - } - if CLIMinimizerSize() != 13 { - t.Errorf("SetMinimizerSize failed: expected 13, got %d", CLIMinimizerSize()) - } - - // Reset to defaults - SetKmerSize(21) - SetMinimizerSize(11) -} - -func BenchmarkCLIExtractSuperKmers(b *testing.B) { - // Create a longer test sequence - longSeq := make([]byte, 1000) - bases := []byte{'A', 'C', 'G', 'T'} - for i := range longSeq { - longSeq[i] = bases[i%4] - } - - testSeq := obiseq.NewBioSequence("bench_seq", longSeq, "") - - // Set parameters - SetKmerSize(21) - SetMinimizerSize(11) - - b.ResetTimer() - - for i := 0; i < b.N; i++ { - batch := obiseq.NewBioSequenceBatch() - batch.Add(testSeq) - - iterator := obiiter.MakeBioSequenceBatchChannel(1) - go func() { - iterator.Push(batch) - iterator.Close() - }() - - result := CLIExtractSuperKmers(iterator) - - // Consume the iterator - for result.Next() { - result.Get() - } - } -}