Refactor k-mer index management with subcommands and enhanced metadata support

This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system.

Key changes:
- Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in kmer set groups
- Implement ability to append to existing kmer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve kmer index summary with Jaccard distance matrix support
- Remove deprecated obikindex and obisuperkmer commands
- Update build process to use the new subcommand structure
This commit is contained in:
Eric Coissac
2026-02-09 23:10:30 +01:00
parent f78543ee75
commit 56c1f4180c
26 changed files with 1482 additions and 1175 deletions

55
pkg/obitools/obik/cp.go Normal file
View File

@@ -0,0 +1,55 @@
package obik
import (
"context"
"fmt"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"github.com/DavidGamba/go-getoptions"
)
// runCp implements the "obik cp" subcommand: it copies kmer sets from a
// source index directory (args[0]) into a destination index directory
// (args[1]).
//
// When --set patterns were given, only the sets whose IDs match are copied
// and it is an error if nothing matches; otherwise every set of the source
// index is copied. The --force flag is forwarded to the copy operation.
func runCp(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
	if len(args) < 2 {
		return fmt.Errorf("usage: obik cp [--set PATTERN]... [--force] <source_index> <dest_index>")
	}
	source, target := args[0], args[1]

	group, err := obikmer.OpenKmerSetGroup(source)
	if err != nil {
		return fmt.Errorf("failed to open source kmer index: %w", err)
	}

	// Translate the optional --set patterns into the list of set IDs to copy.
	var selected []string
	if patterns := CLISetPatterns(); len(patterns) == 0 {
		// No selection: copy everything.
		selected = group.SetsIDs()
	} else {
		matches, err := group.MatchSetIDs(patterns)
		if err != nil {
			return err
		}
		if len(matches) == 0 {
			return fmt.Errorf("no sets match the given patterns")
		}
		selected = make([]string, 0, len(matches))
		for _, setIdx := range matches {
			selected = append(selected, group.SetIDOf(setIdx))
		}
	}

	log.Infof("Copying %d set(s) from %s to %s", len(selected), source, target)

	destination, err := group.CopySetsByIDTo(selected, target, CLIForce())
	if err != nil {
		return err
	}

	log.Infof("Destination now has %d set(s)", destination.Size())
	return nil
}

110
pkg/obitools/obik/index.go Normal file
View File

@@ -0,0 +1,110 @@
package obik
import (
"context"
"fmt"
"os"
"path/filepath"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// runIndex implements the "obik index" subcommand: it builds a kmer set from
// the sequence files given in args and stores it in the index directory
// selected with --out.
//
// If the output directory already holds a "metadata.toml" file the new set is
// appended to that existing index; if the file does not exist a new index is
// created with the k-mer and minimizer sizes taken from the command line. Any
// other failure while probing for the metadata file (permission denied, path
// component that is not a directory, ...) is reported as an error instead of
// silently being treated as "no index yet".
//
// After the builder is closed, the optional --index-id, the group-level tags
// (-S), the per-set tags (-T) and the min_occurrence attribute are written to
// the index metadata.
func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
	outDir := CLIOutputDirectory()
	if outDir == "" || outDir == "-" {
		return fmt.Errorf("--out option is required and must specify a directory path")
	}

	k := CLIKmerSize()
	if k < 2 || k > 31 {
		return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)
	}

	m := CLIMinimizerSize()

	minOcc := CLIMinOccurrence()
	if minOcc < 1 {
		return fmt.Errorf("invalid min-occurrence: %d (must be >= 1)", minOcc)
	}

	// Build options
	var opts []obikmer.BuilderOption
	if minOcc > 1 {
		opts = append(opts, obikmer.WithMinFrequency(minOcc))
	}

	// Determine whether to append to an existing group or create a new one.
	var builder *obikmer.KmerSetGroupBuilder
	var err error
	metaPath := filepath.Join(outDir, "metadata.toml")

	_, statErr := os.Stat(metaPath)
	switch {
	case statErr == nil:
		// Existing group: append
		log.Infof("Appending to existing kmer index at %s", outDir)
		builder, err = obikmer.AppendKmerSetGroupBuilder(outDir, 1, opts...)
		if err != nil {
			return fmt.Errorf("failed to open existing kmer index for appending: %w", err)
		}
	case os.IsNotExist(statErr):
		// New group
		log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
		builder, err = obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...)
		if err != nil {
			return fmt.Errorf("failed to create kmer index builder: %w", err)
		}
	default:
		// The metadata file may exist but cannot be inspected: refuse to
		// guess, so that an existing index is never mistaken for a new one.
		return fmt.Errorf("failed to check for an existing kmer index at %s: %w", outDir, statErr)
	}

	// Read and process sequences
	sequences, err := obiconvert.CLIReadBioSequences(args...)
	if err != nil {
		return fmt.Errorf("failed to open sequence files: %w", err)
	}

	seqCount := 0
	for sequences.Next() {
		batch := sequences.Get()
		for _, seq := range batch.Slice() {
			builder.AddSequence(0, seq)
			seqCount++
		}
	}

	log.Infof("Processed %d sequences", seqCount)

	// Finalize
	ksg, err := builder.Close()
	if err != nil {
		return fmt.Errorf("failed to finalize kmer index: %w", err)
	}

	// Apply index-id to the new set
	newSetIdx := builder.StartIndex()
	if id := CLIIndexId(); id != "" {
		ksg.SetSetID(newSetIdx, id)
	}

	// Apply group-level tags (-S)
	for key, value := range CLISetTag() {
		ksg.SetAttribute(key, value)
	}

	// Apply per-set tags (-T) to the new set
	for key, value := range _setMetaTags {
		ksg.SetSetMetadata(newSetIdx, key, value)
	}

	if minOcc > 1 {
		ksg.SetAttribute("min_occurrence", minOcc)
	}

	if err := ksg.SaveMetadata(); err != nil {
		return fmt.Errorf("failed to save metadata: %w", err)
	}

	log.Infof("Index contains %d k-mers for set %d in %s", ksg.Len(newSetIdx), newSetIdx, outDir)
	log.Info("Done.")
	return nil
}

View File

@@ -0,0 +1,457 @@
package obik
import (
"context"
"fmt"
"math"
"strings"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/DavidGamba/go-getoptions"
)
// MaskingMode selects what the low-complexity worker does with the regions
// it detects.
type MaskingMode int

const (
	// MaskMode replaces low-complexity regions with masked characters.
	MaskMode MaskingMode = iota
	// SplitMode splits the sequence into high-complexity fragments.
	SplitMode
	// ExtractMode extracts the low-complexity fragments.
	ExtractMode
)
// Option storage for the lowmask subcommand. These variables are kept
// separate from the kmer-size used by the index/super subcommands.
var (
	_lowmaskKmerSize  = 31    // --kmer-size
	_lowmaskLevelMax  = 6     // --entropy-size
	_lowmaskThreshold = 0.5   // --threshold
	_lowmaskSplitMode = false // --split-mode
	_lowmaskLowMode   = false // --low-mode
	_lowmaskMaskChar  = "."   // --masking-char
)
// LowMaskOptionSet declares, on the given option parser, the command-line
// options that drive low-complexity masking. Each option is bound to its
// package-level _lowmask* variable, whose current value is used as default.
func LowMaskOptionSet(options *getoptions.GetOpt) {
	describe := options.Description

	options.IntVar(&_lowmaskKmerSize, "kmer-size", _lowmaskKmerSize,
		describe("Size of the kmer considered to estimate entropy."))

	options.IntVar(&_lowmaskLevelMax, "entropy-size", _lowmaskLevelMax,
		describe("Maximum word size considered for entropy estimate."))

	options.Float64Var(&_lowmaskThreshold, "threshold", _lowmaskThreshold,
		describe("Entropy threshold below which a kmer is masked (0 to 1)."))

	options.BoolVar(&_lowmaskSplitMode, "split-mode", _lowmaskSplitMode,
		describe("Split sequences to remove masked regions."))

	options.BoolVar(&_lowmaskLowMode, "low-mode", _lowmaskLowMode,
		describe("Extract only low-complexity regions."))

	options.StringVar(&_lowmaskMaskChar, "masking-char", _lowmaskMaskChar,
		describe("Character used to mask low complexity regions."))
}
// lowmaskMaskingMode translates the --low-mode / --split-mode flags into a
// MaskingMode. --low-mode takes precedence over --split-mode; with neither
// flag set the plain masking mode is used.
func lowmaskMaskingMode() MaskingMode {
	if _lowmaskLowMode {
		return ExtractMode
	}
	if _lowmaskSplitMode {
		return SplitMode
	}
	return MaskMode
}
// lowmaskMaskingChar returns the byte used to overwrite masked positions.
// The --masking-char value is trimmed of surrounding white space and must
// then be exactly one byte long; otherwise the program is terminated through
// log.Fatalf.
func lowmaskMaskingChar() byte {
	trimmed := strings.TrimSpace(_lowmaskMaskChar)
	if len(trimmed) == 1 {
		return trimmed[0]
	}
	log.Fatalf("--masking-char option accepts a single character, not %s", trimmed)
	return []byte(trimmed)[0]
}
// lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
//
// For every position of a sequence the worker estimates a normalized entropy
// from the words of size 1..level_max found in the k-mers (windows of
// kmer_size nucleotides) covering that position, keeps the smallest value
// over all word sizes, and flags the position as low-complexity when that
// value is <= threshold.
//
// Parameters:
//   - kmer_size: length of the sliding window over which word counts are taken.
//   - level_max: largest word size considered; word sizes level_max down to 1
//     are all evaluated.
//   - threshold: positions whose minimal entropy is <= threshold are flagged.
//   - mode: MaskMode overwrites flagged positions with maskChar, SplitMode
//     returns the unflagged fragments, ExtractMode returns the flagged ones.
//   - maskChar: replacement byte used in MaskMode (and for sequences shorter
//     than kmer_size, which are returned fully masked whatever the mode).
//
// The returned worker also stores two attributes on the input sequence:
// "mask" (word size that produced the minimal entropy at each position) and
// "Entropies" (the minimal entropy at each position).
//
// NOTE(review): only lowercase 'a', 'c', 'g', 't' are treated as unambiguous
// nucleotides; this presumably relies on sequences being stored in lowercase —
// confirm against obiseq.
func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
	// nLogN[i] caches i*ln(i) for every count a word can reach inside one
	// k-mer; index 0 keeps the zero value.
	nLogN := make([]float64, kmer_size+1)
	for i := 1; i <= kmer_size; i++ {
		nLogN[i] = float64(i) * math.Log(float64(i))
	}
	// normTables[ws][code] is the result of obikmer.NormalizeCircular for the
	// 2-bit encoded word `code` of size ws, precomputed for all 4^ws codes.
	normTables := make([][]int, level_max+1)
	for ws := 1; ws <= level_max; ws++ {
		size := 1 << (ws * 2)
		normTables[ws] = make([]int, size)
		for code := 0; code < size; code++ {
			normTables[ws][code] = int(obikmer.NormalizeCircular(uint64(code), ws))
		}
	}
	// pair is a (position, value) entry of the deque used by slidingMin.
	type pair struct {
		index int
		value float64
	}
	// slidingMin replaces, in place, each data[i] by the minimum of the
	// `window` values ending at i (fewer at the start of the slice). When the
	// window covers the whole slice every element becomes the global minimum.
	slidingMin := func(data []float64, window int) {
		if len(data) == 0 || window <= 0 {
			return
		}
		if window >= len(data) {
			minVal := data[0]
			for i := 1; i < len(data); i++ {
				if data[i] < minVal {
					minVal = data[i]
				}
			}
			for i := range data {
				data[i] = minVal
			}
			return
		}
		// The deque holds candidates with increasing values; its front is the
		// minimum of the current window.
		deque := make([]pair, 0, window)
		for i, v := range data {
			// Drop candidates that left the window.
			for len(deque) > 0 && deque[0].index <= i-window {
				deque = deque[1:]
			}
			// Drop candidates that can no longer be a minimum.
			for len(deque) > 0 && deque[len(deque)-1].value >= v {
				deque = deque[:len(deque)-1]
			}
			deque = append(deque, pair{index: i, value: v})
			data[i] = deque[0].value
		}
	}
	// For each word size: logNwords is ln(number of words in a k-mer) and
	// emaxValues the entropy of the most even distribution of those words
	// over the na classes reported by obikmer.CanonicalCircularKmerCount.
	emaxValues := make([]float64, level_max+1)
	logNwords := make([]float64, level_max+1)
	for ws := 1; ws <= level_max; ws++ {
		nw := kmer_size - ws + 1
		na := obikmer.CanonicalCircularKmerCount(ws)
		if nw < na {
			// Fewer words than classes: the maximum is reached when every
			// word is different.
			logNwords[ws] = math.Log(float64(nw))
			emaxValues[ws] = math.Log(float64(nw))
		} else {
			// Words spread as evenly as possible: (na-remains) classes seen
			// cov times and `remains` classes seen cov+1 times.
			cov := nw / na
			remains := nw - (na * cov)
			f1 := float64(cov) / float64(nw)
			f2 := float64(cov+1) / float64(nw)
			logNwords[ws] = math.Log(float64(nw))
			emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
				float64(remains)*f2*math.Log(f2))
		}
	}
	// maskAmbiguities returns a slice as long as the sequence in which every
	// k-mer start position whose window contains a byte other than
	// 'a', 'c', 'g', 't' is set to -1; all other entries stay 0.
	maskAmbiguities := func(sequence []byte) []int {
		maskPositions := make([]int, len(sequence))
		for i, nuc := range sequence {
			if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
				end := max(0, i-kmer_size+1)
				for j := i; j >= end; j-- {
					maskPositions[j] = -1
				}
			}
		}
		return maskPositions
	}
	// cleanTable resets the first `over` counters of table to zero.
	cleanTable := func(table []int, over int) {
		for i := 0; i < over; i++ {
			table[i] = 0
		}
	}
	// computeEntropies fills entropies[] with, for each position, the minimal
	// normalized entropy (for words of size wordSize) of the k-mers covering
	// it. table and words are scratch buffers supplied by the caller;
	// normTable maps an encoded word to its normalized code.
	computeEntropies := func(sequence []byte,
		maskPositions []int,
		entropies []float64,
		table []int,
		words []int,
		wordSize int,
		normTable []int) {
		lseq := len(sequence)
		tableSize := 1 << (wordSize * 2)
		nwords := kmer_size - wordSize + 1
		float_nwords := float64(nwords)
		log_nwords := logNwords[wordSize]
		entropyMax := emaxValues[wordSize]
		cleanTable(table, tableSize)
		// 6 is a sentinel for "no entropy computed here". Index 0 is not
		// reset by this loop; it is written by the main loop below as soon as
		// the first full window is reached.
		for i := 1; i < lseq; i++ {
			entropies[i] = 6
		}
		end := lseq - wordSize + 1
		mask := (1 << (wordSize * 2)) - 1
		// Rolling 2-bit encoding: preload the first wordSize-1 nucleotides,
		// then shift one nucleotide in per word.
		word_index := 0
		for i := 0; i < wordSize-1; i++ {
			word_index = (word_index << 2) + int(obikmer.EncodeNucleotide(sequence[i]))
		}
		for i, j := 0, wordSize-1; i < end; i, j = i+1, j+1 {
			word_index = ((word_index << 2) & mask) + int(obikmer.EncodeNucleotide(sequence[j]))
			words[i] = normTable[word_index]
		}
		// s counts the words accumulated in table since the last reset;
		// sum_n_logn is the running sum of n*ln(n) over the word counts.
		s := 0
		sum_n_logn := 0.0
		entropy := 1.0
		cleaned := true
		for i := range end {
			s++
			switch {
			case s < nwords:
				// Window not full yet: just count the word.
				cleaned = false
				table[words[i]]++
			case i >= (nwords-1) && maskPositions[i-nwords+1] < 0:
				// The k-mer starting at i-nwords+1 covers an ambiguous
				// nucleotide: mark it with 4.0 and restart accumulation.
				// After this reset the next entropy is only produced once
				// nwords further words have been counted.
				entropies[i-nwords+1] = 4.0
				if !cleaned {
					cleanTable(table, tableSize)
				}
				cleaned = true
				s = 0
				sum_n_logn = 0.0
			case s == nwords:
				// First full window: compute the sum from scratch.
				cleaned = false
				table[words[i]]++
				sum_n_logn = 0
				for j := range tableSize {
					n := float64(table[j])
					if n > 0 {
						sum_n_logn += nLogN[int(n)]
					}
				}
				// H = ln(N) - (1/N) * sum(n ln n), normalized by its maximum.
				entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
			case s > nwords:
				// Slide the window by one word and update the sum
				// incrementally; nothing changes when the entering and
				// leaving words are identical.
				cleaned = false
				new_word := words[i]
				old_word := words[i-nwords]
				if old_word != new_word {
					table[new_word]++
					table[old_word]--
					n_old := float64(table[old_word])
					n_new := float64(table[new_word])
					// Remove the previous contribution of old_word (count
					// n_old+1) and add its new one.
					sum_n_logn -= nLogN[int(n_old+1)]
					if n_old > 0 {
						sum_n_logn += nLogN[int(n_old)]
					}
					// Add the new contribution of new_word and remove its
					// previous one (count n_new-1).
					if n_new > 0 {
						sum_n_logn += nLogN[int(n_new)]
					}
					if n_new > 1 {
						sum_n_logn -= nLogN[int(n_new-1)]
					}
				}
				entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
			}
			// Store the entropy at the start position of the current k-mer,
			// clamped at 0 and rounded to 4 decimals.
			if s >= nwords && maskPositions[i-nwords+1] >= 0 {
				if entropy < 0 {
					entropy = 0
				}
				entropy = math.Round(entropy*10000) / 10000
				entropies[i-nwords+1] = entropy
			}
		}
		// Spread each k-mer's entropy over the positions it covers by taking,
		// at each position, the minimum over the last kmer_size start
		// positions.
		slidingMin(entropies, kmer_size)
	}
	// applyMaskMode returns a one-element slice holding a copy of sequence in
	// which every flagged position is overwritten with mask.
	// NOTE(review): relies on seqCopy.Sequence() exposing the copy's own
	// backing bytes — confirm against obiseq.
	applyMaskMode := func(sequence *obiseq.BioSequence, maskPositions []bool, mask byte) (obiseq.BioSequenceSlice, error) {
		seqCopy := sequence.Copy()
		sequenceBytes := seqCopy.Sequence()
		for i := range sequenceBytes {
			if maskPositions[i] {
				sequenceBytes[i] = mask
			}
		}
		return obiseq.BioSequenceSlice{seqCopy}, nil
	}
	// selectMasked returns one subsequence per maximal run of flagged
	// positions (the low-complexity fragments).
	selectMasked := func(sequence *obiseq.BioSequence, maskPosition []bool) (obiseq.BioSequenceSlice, error) {
		rep := obiseq.NewBioSequenceSlice()
		inlow := false
		fromlow := -1
		for i, masked := range maskPosition {
			if masked && !inlow {
				fromlow = i
				inlow = true
			}
			if inlow && !masked {
				if fromlow >= 0 {
					frg, err := sequence.Subsequence(fromlow, i, false)
					if err != nil {
						return nil, err
					}
					rep.Push(frg)
				}
				inlow = false
				fromlow = -1
			}
		}
		// Flush a run that extends to the end of the sequence.
		if inlow && fromlow >= 0 {
			frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
			if err != nil {
				return nil, err
			}
			rep.Push(frg)
		}
		return *rep, nil
	}
	// selectunmasked returns one subsequence per maximal run of unflagged
	// positions (the high-complexity fragments).
	selectunmasked := func(sequence *obiseq.BioSequence, maskPosition []bool) (obiseq.BioSequenceSlice, error) {
		rep := obiseq.NewBioSequenceSlice()
		inhigh := false
		fromhigh := -1
		for i, masked := range maskPosition {
			if !masked && !inhigh {
				fromhigh = i
				inhigh = true
			}
			if inhigh && masked {
				if fromhigh >= 0 {
					frg, err := sequence.Subsequence(fromhigh, i, false)
					if err != nil {
						return nil, err
					}
					rep.Push(frg)
				}
				inhigh = false
				fromhigh = -1
			}
		}
		// Flush a run that extends to the end of the sequence.
		if inhigh && fromhigh >= 0 {
			frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
			if err != nil {
				return nil, err
			}
			rep.Push(frg)
		}
		return *rep, nil
	}
	// masking is the worker returned to the caller.
	masking := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		// A sequence shorter than one k-mer cannot be evaluated: tag it and
		// return it fully masked, whatever the requested mode.
		if sequence.Len() < kmer_size {
			sequence.SetAttribute("obilowmask_error", "Sequence too short")
			remove := make([]bool, sequence.Len())
			for i := range remove {
				remove[i] = true
			}
			return applyMaskMode(sequence, remove, maskChar)
		}
		bseq := sequence.Sequence()
		maskPositions := maskAmbiguities(bseq)
		// maskFlags[i] records the word size that gave the minimal entropy
		// at position i.
		maskFlags := make([]int, len(bseq))
		entropies := make([]float64, len(bseq))
		for i := range entropies {
			entropies[i] = 4.0
		}
		// Scratch buffers shared by all word sizes; freqs is sized for the
		// largest one.
		freqs := make([]int, 1<<(2*level_max))
		words := make([]int, len(bseq))
		entropies2 := make([]float64, len(bseq))
		// Start with the largest word size ...
		computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max, normTables[level_max])
		for i := range bseq {
			v := level_max
			maskFlags[i] = v
		}
		// ... then keep, position by position, the smallest entropy found
		// with any smaller word size.
		for ws := level_max - 1; ws > 0; ws-- {
			computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws, normTables[ws])
			for i, e2 := range entropies2 {
				if e2 < entropies[i] {
					entropies[i] = e2
					maskFlags[i] = ws
				}
			}
		}
		// Ambiguous nucleotides themselves are always flagged.
		for i, nuc := range bseq {
			if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
				entropies[i] = 0
			}
		}
		remove := make([]bool, len(entropies))
		for i, e := range entropies {
			remove[i] = e <= threshold
		}
		sequence.SetAttribute("mask", maskFlags)
		sequence.SetAttribute("Entropies", entropies)
		switch mode {
		case MaskMode:
			return applyMaskMode(sequence, remove, maskChar)
		case SplitMode:
			return selectunmasked(sequence, remove)
		case ExtractMode:
			return selectMasked(sequence, remove)
		}
		return nil, fmt.Errorf("unknown mode %d", mode)
	}
	return masking
}
// runLowmask implements the "obik lowmask" subcommand.
// It masks low-complexity regions in DNA sequences using entropy-based detection.
//
// The sequences named in args are read, passed through the worker built by
// lowMaskWorker with the --kmer-size, --entropy-size, --threshold, mode and
// --masking-char settings, and the non-empty results are written to the
// configured output.
//
// The numeric settings are validated first: the entropy estimate counts the
// kmer-size - w + 1 words of size w found in each k-mer, so the largest word
// size (--entropy-size) must be at least 1 and strictly smaller than the
// k-mer size; otherwise a single word (or none) would be available per k-mer
// and the normalized entropy would be undefined.
func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
	kmerSize := _lowmaskKmerSize
	levelMax := _lowmaskLevelMax
	threshold := _lowmaskThreshold

	if kmerSize < 2 {
		return fmt.Errorf("invalid kmer-size: %d (must be >= 2)", kmerSize)
	}
	if levelMax < 1 || levelMax >= kmerSize {
		return fmt.Errorf("invalid entropy-size: %d (must be between 1 and kmer-size-1 = %d)",
			levelMax, kmerSize-1)
	}

	mode := lowmaskMaskingMode()
	maskChar := lowmaskMaskingChar()

	log.Printf("Low-complexity masking: kmer-size=%d, entropy-size=%d, threshold=%.4f", kmerSize, levelMax, threshold)

	sequences, err := obiconvert.CLIReadBioSequences(args...)
	if err != nil {
		return fmt.Errorf("failed to open sequence files: %w", err)
	}

	worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar)

	masked := sequences.MakeIWorker(
		worker,
		false,
		obidefault.ParallelWorkers(),
	).FilterEmpty()

	obiconvert.CLIWriteBioSequences(masked, true)
	obiutils.WaitForLastPipe()

	return nil
}

96
pkg/obitools/obik/ls.go Normal file
View File

@@ -0,0 +1,96 @@
package obik
import (
"context"
"encoding/json"
"fmt"
"strings"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"github.com/DavidGamba/go-getoptions"
"gopkg.in/yaml.v3"
)
// setEntry is one row of the "obik ls" listing. The same structure is used
// for the CSV, JSON and YAML outputs.
type setEntry struct {
	// Index is the position of the set inside the kmer set group.
	Index int `json:"index" yaml:"index"`
	// ID is the identifier of the set as returned by SetIDOf.
	ID string `json:"id" yaml:"id"`
	// Count is the value returned by the group's Len for this set.
	Count uint64 `json:"count" yaml:"count"`
}
// runLs implements the "obik ls" subcommand: it lists the sets stored in the
// kmer index directory given as args[0].
//
// With --set patterns only the matching sets are listed, otherwise all of
// them. The listing is printed as JSON or YAML when the corresponding output
// flag is set, and as CSV in every other case.
func runLs(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
	if len(args) == 0 {
		return fmt.Errorf("usage: obik ls [options] <index_directory>")
	}

	group, err := obikmer.OpenKmerSetGroup(args[0])
	if err != nil {
		return fmt.Errorf("failed to open kmer index: %w", err)
	}

	// Work out the positions of the sets to display.
	var selected []int
	if patterns := CLISetPatterns(); len(patterns) > 0 {
		if selected, err = group.MatchSetIDs(patterns); err != nil {
			return err
		}
	} else {
		selected = make([]int, group.Size())
		for pos := range selected {
			selected[pos] = pos
		}
	}

	rows := make([]setEntry, 0, len(selected))
	for _, setIdx := range selected {
		rows = append(rows, setEntry{
			Index: setIdx,
			ID:    group.SetIDOf(setIdx),
			Count: group.Len(setIdx),
		})
	}

	switch CLIOutFormat() {
	case "json":
		return outputLsJSON(rows)
	case "yaml":
		return outputLsYAML(rows)
	default:
		// "csv" as well as the "text" fallback are rendered as CSV.
		return outputLsCSV(rows)
	}
}
// outputLsCSV prints the set listing on standard output as CSV, with a
// header line followed by one "index,id,count" record per entry.
//
// An ID is wrapped in double quotes (with embedded quotes doubled) whenever
// it contains a comma, a double quote, a carriage return or a line feed, so
// that every entry always yields exactly one well-formed CSV record.
// It always returns nil.
func outputLsCSV(entries []setEntry) error {
	fmt.Println("index,id,count")
	for _, e := range entries {
		id := e.ID
		// Quote any field holding a CSV special character, line breaks
		// included: an unquoted line break would split the record in two.
		if strings.ContainsAny(id, ",\"\r\n") {
			id = "\"" + strings.ReplaceAll(id, "\"", "\"\"") + "\""
		}
		fmt.Printf("%d,%s,%d\n", e.Index, id, e.Count)
	}
	return nil
}
// outputLsJSON prints the set listing on standard output as an indented JSON
// array followed by a newline. A marshalling failure is returned unchanged
// and nothing is printed.
func outputLsJSON(entries []setEntry) error {
	encoded, err := json.MarshalIndent(entries, "", " ")
	if err == nil {
		fmt.Println(string(encoded))
	}
	return err
}
// outputLsYAML prints the set listing on standard output as YAML, exactly as
// produced by yaml.Marshal (no extra newline is added). A marshalling failure
// is returned unchanged and nothing is printed.
func outputLsYAML(entries []setEntry) error {
	encoded, err := yaml.Marshal(entries)
	if err == nil {
		fmt.Print(string(encoded))
	}
	return err
}

63
pkg/obitools/obik/mv.go Normal file
View File

@@ -0,0 +1,63 @@
package obik
import (
"context"
"fmt"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"github.com/DavidGamba/go-getoptions"
)
// runMv implements the "obik mv" subcommand: it moves kmer sets from a source
// index directory (args[0]) to a destination index directory (args[1]).
//
// The move is a copy followed by a removal from the source. With --set
// patterns only the matching sets are moved and it is an error if nothing
// matches; otherwise every set of the source is moved. The --force flag is
// forwarded to the copy step. Sets are removed from the source only after the
// whole copy succeeded.
func runMv(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
	if len(args) < 2 {
		return fmt.Errorf("usage: obik mv [--set PATTERN]... [--force] <source_index> <dest_index>")
	}
	source, target := args[0], args[1]

	group, err := obikmer.OpenKmerSetGroup(source)
	if err != nil {
		return fmt.Errorf("failed to open source kmer index: %w", err)
	}

	// Translate the optional --set patterns into the list of set IDs to move.
	var moved []string
	if patterns := CLISetPatterns(); len(patterns) == 0 {
		// No selection: move everything.
		moved = group.SetsIDs()
	} else {
		matches, err := group.MatchSetIDs(patterns)
		if err != nil {
			return err
		}
		if len(matches) == 0 {
			return fmt.Errorf("no sets match the given patterns")
		}
		moved = make([]string, 0, len(matches))
		for _, setIdx := range matches {
			moved = append(moved, group.SetIDOf(setIdx))
		}
	}

	log.Infof("Moving %d set(s) from %s to %s", len(moved), source, target)

	// Step 1: copy to the destination.
	destination, err := group.CopySetsByIDTo(moved, target, CLIForce())
	if err != nil {
		return err
	}

	// Step 2: drop the copied sets from the source, last one first.
	for pos := len(moved) - 1; pos >= 0; pos-- {
		id := moved[pos]
		if err := group.RemoveSetByID(id); err != nil {
			return fmt.Errorf("failed to remove set %q from source after copy: %w", id, err)
		}
	}

	log.Infof("Destination now has %d set(s), source has %d set(s)", destination.Size(), group.Size())
	return nil
}

64
pkg/obitools/obik/obik.go Normal file
View File

@@ -0,0 +1,64 @@
package obik
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// OptionSet declares every obik subcommand (index, ls, summary, cp, mv, rm,
// super, lowmask) on the root option parser, together with the options each
// of them accepts and the function that runs it.
func OptionSet(opt *getoptions.GetOpt) {
	// index: build or extend a kmer index from sequence files
	{
		cmd := opt.NewCommand("index", "Build a disk-based kmer index from sequence files")
		obiconvert.InputOptionSet(cmd)
		obiconvert.OutputModeOptionSet(cmd, false)
		KmerIndexOptionSet(cmd)
		cmd.StringMapVar(&_setMetaTags, "tag", 1, 1,
			cmd.Alias("T"),
			cmd.ArgName("KEY=VALUE"),
			cmd.Description("Per-set metadata tag (repeatable)."))
		cmd.SetCommandFn(runIndex)
	}

	// ls: list sets in a kmer index
	{
		cmd := opt.NewCommand("ls", "List sets in a kmer index")
		OutputFormatOptionSet(cmd)
		SetSelectionOptionSet(cmd)
		cmd.SetCommandFn(runLs)
	}

	// summary: detailed statistics
	{
		cmd := opt.NewCommand("summary", "Show detailed statistics of a kmer index")
		OutputFormatOptionSet(cmd)
		cmd.BoolVar(&_jaccard, "jaccard", false,
			cmd.Description("Compute and display pairwise Jaccard distance matrix."))
		cmd.SetCommandFn(runSummary)
	}

	// cp: copy sets between indices
	{
		cmd := opt.NewCommand("cp", "Copy sets between kmer indices")
		SetSelectionOptionSet(cmd)
		ForceOptionSet(cmd)
		cmd.SetCommandFn(runCp)
	}

	// mv: move sets between indices
	{
		cmd := opt.NewCommand("mv", "Move sets between kmer indices")
		SetSelectionOptionSet(cmd)
		ForceOptionSet(cmd)
		cmd.SetCommandFn(runMv)
	}

	// rm: remove sets from an index
	{
		cmd := opt.NewCommand("rm", "Remove sets from a kmer index")
		SetSelectionOptionSet(cmd)
		cmd.SetCommandFn(runRm)
	}

	// super: extract super k-mers from sequences
	{
		cmd := opt.NewCommand("super", "Extract super k-mers from sequence files")
		obiconvert.InputOptionSet(cmd)
		obiconvert.OutputOptionSet(cmd)
		SuperKmerOptionSet(cmd)
		cmd.SetCommandFn(runSuper)
	}

	// lowmask: mask low-complexity regions
	{
		cmd := opt.NewCommand("lowmask", "Mask low-complexity regions in sequences using entropy")
		obiconvert.InputOptionSet(cmd)
		obiconvert.OutputOptionSet(cmd)
		LowMaskOptionSet(cmd)
		cmd.SetCommandFn(runLowmask)
	}
}

View File

@@ -0,0 +1,189 @@
package obik
import (
"strings"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Storage for the options shared by the ls/summary/cp/mv/rm subcommands and
// for the per-set tags of the index subcommand.
var (
	// Output format flags
	_jsonOutput bool
	_csvOutput  bool
	_yamlOutput bool

	// Set selection flags
	_setPatterns []string

	// Force flag
	_force bool

	// Jaccard flag
	_jaccard bool

	// Per-set tags for index subcommand
	_setMetaTags = map[string]string{}
)

// ==============================
// Kmer index building options (moved from obikindex)
// ==============================
var (
	_kmerSize       = 31
	_minimizerSize  = -1 // -1 means auto: ceil(k / 2.5)
	_indexId        = ""
	_metadataFormat = "toml"
	_setTag         = map[string]string{}
	_minOccurrence  = 1
	_saveFullFilter = false
)
// KmerIndexOptionSet declares, on the given option parser, all the options
// that control kmer index building. Each option is bound to its package-level
// variable, whose current value is used as default.
func KmerIndexOptionSet(options *getoptions.GetOpt) {
	describe := options.Description
	alias := options.Alias

	options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
		alias("k"),
		describe("Size of k-mers (must be between 2 and 31)."))

	options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize,
		alias("m"),
		describe("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5))."))

	options.StringVar(&_indexId, "index-id", _indexId,
		describe("Identifier for the kmer index."))

	options.StringVar(&_metadataFormat, "metadata-format", _metadataFormat,
		describe("Format for metadata file (toml, yaml, json)."))

	options.StringMapVar(&_setTag, "set-tag", 1, 1,
		alias("S"),
		options.ArgName("KEY=VALUE"),
		describe("Adds a group-level metadata attribute KEY with value VALUE."))

	options.IntVar(&_minOccurrence, "min-occurrence", _minOccurrence,
		describe("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all)."))

	options.BoolVar(&_saveFullFilter, "save-full-filter", _saveFullFilter,
		describe("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index."))
}
// CLIKmerSize returns the k-mer size held in the package-level _kmerSize
// variable, i.e. the value of --kmer-size/-k (31 unless changed). No range
// check is done here; callers validate the value themselves.
func CLIKmerSize() int {
	return _kmerSize
}
// CLIMinimizerSize returns the effective minimizer size.
//
// A negative --minimizer-size means "auto" and is replaced by
// obikmer.DefaultMinimizerSize for the current k-mer size. The resulting
// value is then passed through obikmer.ValidateMinimizerSize together with
// the k-mer size and the number of parallel workers, and whatever that
// function returns is the result.
func CLIMinimizerSize() int {
	size := _minimizerSize
	if size < 0 {
		size = obikmer.DefaultMinimizerSize(_kmerSize)
	}
	return obikmer.ValidateMinimizerSize(size, _kmerSize, obidefault.ParallelWorkers())
}
// CLIIndexId returns the value of the --index-id option, or the empty string
// when the option was not given.
func CLIIndexId() string {
	return _indexId
}
// CLIMetadataFormat converts the --metadata-format option into an
// obikmer.MetadataFormat. The comparison is case-insensitive; "toml", "yaml"
// and "json" are recognized. Any other value triggers a warning and falls
// back to TOML.
func CLIMetadataFormat() obikmer.MetadataFormat {
	switch strings.ToLower(_metadataFormat) {
	case "json":
		return obikmer.FormatJSON
	case "yaml":
		return obikmer.FormatYAML
	case "toml":
		return obikmer.FormatTOML
	}

	log.Warnf("Unknown metadata format %q, defaulting to TOML", _metadataFormat)
	return obikmer.FormatTOML
}
// CLISetTag returns the group-level metadata key=value pairs collected from
// the --set-tag/-S option. The package-level map itself is returned, not a
// copy.
func CLISetTag() map[string]string {
	return _setTag
}
// CLIMinOccurrence returns the minimum occurrence threshold given with
// --min-occurrence (1 unless changed). The value is returned as is, without
// validation.
func CLIMinOccurrence() int {
	return _minOccurrence
}
// CLISaveFullFilter returns whether to save the full frequency filter, i.e.
// the state of the --save-full-filter flag (false unless set).
func CLISaveFullFilter() bool {
	return _saveFullFilter
}
// CLIOutputDirectory returns the output directory path. It is simply the
// output file name reported by obiconvert.CLIOutPutFileName, which the index
// subcommand interprets as a directory.
func CLIOutputDirectory() string {
	return obiconvert.CLIOutPutFileName()
}
// SetKmerSize sets the k-mer size (for testing). It overwrites the
// package-level value later returned by CLIKmerSize; k is not validated.
func SetKmerSize(k int) {
	_kmerSize = k
}
// SetMinimizerSize sets the minimizer size (for testing). It overwrites the
// package-level value read by CLIMinimizerSize; a negative m selects the
// automatic size there.
func SetMinimizerSize(m int) {
	_minimizerSize = m
}
// SetMinOccurrence sets the minimum occurrence (for testing). It overwrites
// the package-level value later returned by CLIMinOccurrence; n is not
// validated.
func SetMinOccurrence(n int) {
	_minOccurrence = n
}
// OutputFormatOptionSet declares the three output format switches
// --json-output, --csv-output and --yaml-output on the given option parser.
// All of them default to false.
func OutputFormatOptionSet(options *getoptions.GetOpt) {
	describe := options.Description

	options.BoolVar(&_jsonOutput, "json-output", false, describe("Print results as JSON."))
	options.BoolVar(&_csvOutput, "csv-output", false, describe("Print results as CSV."))
	options.BoolVar(&_yamlOutput, "yaml-output", false, describe("Print results as YAML."))
}
// CLIOutFormat names the output format selected on the command line:
// "json", "csv" or "yaml", or "text" when no format flag is set. When several
// flags are set, JSON wins over CSV, which wins over YAML.
func CLIOutFormat() string {
	switch {
	case _jsonOutput:
		return "json"
	case _csvOutput:
		return "csv"
	case _yamlOutput:
		return "yaml"
	default:
		return "text"
	}
}
// SetSelectionOptionSet declares the repeatable --set/-s option on the given
// option parser. Each occurrence takes exactly one set ID or glob pattern and
// is appended to the list returned by CLISetPatterns.
func SetSelectionOptionSet(options *getoptions.GetOpt) {
	shortName := options.Alias("s")
	argument := options.ArgName("PATTERN")
	help := options.Description("Set ID or glob pattern (repeatable, supports *, ?, [...]).")

	options.StringSliceVar(&_setPatterns, "set", 1, 1, shortName, argument, help)
}
// CLISetPatterns returns the --set patterns provided by the user. The
// package-level slice itself is returned; it is nil (length 0) when no
// pattern has been stored in it.
func CLISetPatterns() []string {
	return _setPatterns
}
// ForceOptionSet declares the --force/-f switch (false by default) on the
// given option parser.
func ForceOptionSet(options *getoptions.GetOpt) {
	shortName := options.Alias("f")
	help := options.Description("Force operation even if set ID already exists in destination.")

	options.BoolVar(&_force, "force", false, shortName, help)
}
// CLIForce returns whether --force was specified (false by default).
func CLIForce() bool {
	return _force
}

56
pkg/obitools/obik/rm.go Normal file
View File

@@ -0,0 +1,56 @@
package obik
import (
"context"
"fmt"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"github.com/DavidGamba/go-getoptions"
)
// runRm implements the "obik rm" subcommand: it removes from the kmer index
// directory args[0] every set whose ID matches one of the --set patterns.
//
// At least one --set pattern is mandatory, and it is an error if no set
// matches. Sets are removed one by one, by ID; the first failure aborts the
// command and leaves the sets removed so far deleted.
func runRm(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
	if len(args) == 0 {
		return fmt.Errorf("usage: obik rm --set PATTERN [--set PATTERN]... <index_directory>")
	}

	patterns := CLISetPatterns()
	if len(patterns) == 0 {
		return fmt.Errorf("--set is required (specify which sets to remove)")
	}

	location := args[0]
	group, err := obikmer.OpenKmerSetGroup(location)
	if err != nil {
		return fmt.Errorf("failed to open kmer index: %w", err)
	}

	matches, err := group.MatchSetIDs(patterns)
	if err != nil {
		return err
	}
	if len(matches) == 0 {
		return fmt.Errorf("no sets match the given patterns")
	}

	// Resolve positions to IDs up front: positions shift as sets disappear,
	// IDs do not.
	doomed := make([]string, 0, len(matches))
	for _, setIdx := range matches {
		doomed = append(doomed, group.SetIDOf(setIdx))
	}

	log.Infof("Removing %d set(s) from %s", len(doomed), location)

	// Walk the list backwards, last matched set first.
	for pos := len(doomed) - 1; pos >= 0; pos-- {
		id := doomed[pos]
		if err := group.RemoveSetByID(id); err != nil {
			return fmt.Errorf("failed to remove set %q: %w", id, err)
		}
		log.Infof("Removed set %q", id)
	}

	log.Infof("Index now has %d set(s)", group.Size())
	return nil
}

View File

@@ -0,0 +1,148 @@
package obik
import (
"context"
"encoding/json"
"fmt"
"os"
"path/filepath"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"github.com/DavidGamba/go-getoptions"
"gopkg.in/yaml.v3"
)
// setSummary describes one set of a kmer index in the "obik summary" report.
type setSummary struct {
	// Index is the position of the set inside the group.
	Index int `json:"index" yaml:"index"`
	// ID is the identifier of the set.
	ID string `json:"id" yaml:"id"`
	// Count is the value reported by the group's Len for this set.
	Count uint64 `json:"count" yaml:"count"`
	// DiskSize is the cumulated size, in bytes, of the set's partition files.
	DiskSize int64 `json:"disk_bytes" yaml:"disk_bytes"`
	// Metadata holds the per-set metadata; it is omitted from the output
	// when empty.
	Metadata map[string]any `json:"metadata,omitempty" yaml:"metadata,omitempty"`
}
// groupSummary is the full "obik summary" report for one kmer index.
type groupSummary struct {
	// Path is the location of the index as reported by the group.
	Path string `json:"path" yaml:"path"`
	// ID is the group identifier; omitted from the output when empty.
	ID string `json:"id,omitempty" yaml:"id,omitempty"`
	// K and M are the k-mer and minimizer sizes of the index.
	K int `json:"k" yaml:"k"`
	M int `json:"m" yaml:"m"`
	// Partitions is the number of partitions reported by the group.
	Partitions int `json:"partitions" yaml:"partitions"`
	// TotalSets is the number of sets in the group.
	TotalSets int `json:"total_sets" yaml:"total_sets"`
	// TotalKmers is the value of the group's Len called without argument.
	TotalKmers uint64 `json:"total_kmers" yaml:"total_kmers"`
	// TotalDisk is the sum of the DiskSize of every set, in bytes.
	TotalDisk int64 `json:"total_disk_bytes" yaml:"total_disk_bytes"`
	// Metadata holds the group-level metadata; omitted when empty.
	Metadata map[string]any `json:"metadata,omitempty" yaml:"metadata,omitempty"`
	// Sets holds one entry per set, in index order.
	Sets []setSummary `json:"sets" yaml:"sets"`
	// Jaccard is the square pairwise Jaccard distance matrix; it is only
	// filled when requested and omitted from the output otherwise.
	Jaccard [][]float64 `json:"jaccard,omitempty" yaml:"jaccard,omitempty"`
}
// runSummary implements the "obik summary" subcommand: it prints detailed
// statistics about the kmer index directory given as args[0].
//
// The report contains the group parameters, one entry per set (ID, count,
// disk usage, metadata) and, when --jaccard is set and the index holds more
// than one set, the pairwise Jaccard distance matrix. It is printed as YAML
// or CSV when the corresponding output flag is set, and as JSON in every
// other case.
func runSummary(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
	if len(args) == 0 {
		return fmt.Errorf("usage: obik summary [options] <index_directory>")
	}

	group, err := obikmer.OpenKmerSetGroup(args[0])
	if err != nil {
		return fmt.Errorf("failed to open kmer index: %w", err)
	}

	report := groupSummary{
		Path:       group.Path(),
		ID:         group.Id(),
		K:          group.K(),
		M:          group.M(),
		Partitions: group.Partitions(),
		TotalSets:  group.Size(),
		TotalKmers: group.Len(),
		Metadata:   group.Metadata,
		Sets:       make([]setSummary, 0, group.Size()),
	}

	// Per-set statistics, accumulating the overall disk usage on the way.
	for setIdx := 0; setIdx < group.Size(); setIdx++ {
		bytesOnDisk := computeSetDiskSize(group, setIdx)
		report.TotalDisk += bytesOnDisk
		report.Sets = append(report.Sets, setSummary{
			Index:    setIdx,
			ID:       group.SetIDOf(setIdx),
			Count:    group.Len(setIdx),
			DiskSize: bytesOnDisk,
			Metadata: group.AllSetMetadata(setIdx),
		})
	}

	// Jaccard matrix
	if _jaccard && group.Size() > 1 {
		distances := group.JaccardDistanceMatrix()
		rows := make([][]float64, group.Size())
		for i := range rows {
			rows[i] = make([]float64, len(rows))
			for j := range rows[i] {
				// The diagonal keeps its zero value.
				if i != j {
					rows[i][j] = distances.Get(i, j)
				}
			}
		}
		report.Jaccard = rows
	}

	switch CLIOutFormat() {
	case "yaml":
		return outputSummaryYAML(report)
	case "csv":
		return outputSummaryCSV(report)
	default:
		// "json" as well as the "text" fallback are rendered as JSON.
		return outputSummaryJSON(report)
	}
}
// computeSetDiskSize returns the cumulated size, in bytes, of the partition
// files of set setIndex, as located by ksg.PartitionPath. Partition files
// that cannot be stat'ed are silently skipped.
func computeSetDiskSize(ksg *obikmer.KmerSetGroup, setIndex int) int64 {
	var bytesOnDisk int64

	for part := 0; part < ksg.Partitions(); part++ {
		if info, err := os.Stat(ksg.PartitionPath(setIndex, part)); err == nil {
			bytesOnDisk += info.Size()
		}
	}

	// The set directory is also listed, but only for completeness: the
	// partition files were already counted above, so neither the listing nor
	// a failure to read it changes the result.
	_, _ = os.ReadDir(filepath.Join(ksg.Path(), fmt.Sprintf("set_%d", setIndex)))

	return bytesOnDisk
}
// outputSummaryJSON prints the summary on standard output as an indented
// JSON document followed by a newline. A marshalling failure is returned
// unchanged and nothing is printed.
func outputSummaryJSON(summary groupSummary) error {
	encoded, err := json.MarshalIndent(summary, "", " ")
	if err == nil {
		fmt.Println(string(encoded))
	}
	return err
}
// outputSummaryYAML prints the summary on standard output as YAML, exactly
// as produced by yaml.Marshal (no extra newline is added). A marshalling
// failure is returned unchanged and nothing is printed.
func outputSummaryYAML(summary groupSummary) error {
	encoded, err := yaml.Marshal(summary)
	if err == nil {
		fmt.Print(string(encoded))
	}
	return err
}
// outputSummaryCSV prints the per-set part of the summary on standard output
// as CSV: a header line followed by one "index,id,count,disk_bytes" record
// per set. Group-level fields and the Jaccard matrix are not part of the CSV
// output.
//
// Set IDs are quoted with summaryCSVField so that an ID containing a comma,
// a double quote or a line break still yields one well-formed record.
// It always returns nil.
func outputSummaryCSV(summary groupSummary) error {
	fmt.Println("index,id,count,disk_bytes")
	for _, s := range summary.Sets {
		fmt.Printf("%d,%s,%d,%d\n", s.Index, summaryCSVField(s.ID), s.Count, s.DiskSize)
	}
	return nil
}

// summaryCSVField returns field ready to be written as one CSV value: it is
// returned unchanged unless it contains a comma, a double quote, a carriage
// return or a line feed, in which case it is wrapped in double quotes and
// every embedded double quote is doubled.
func summaryCSVField(field string) string {
	special := false
	for i := 0; i < len(field) && !special; i++ {
		switch field[i] {
		case ',', '"', '\r', '\n':
			special = true
		}
	}
	if !special {
		return field
	}

	quoted := make([]byte, 0, len(field)+2)
	quoted = append(quoted, '"')
	for i := 0; i < len(field); i++ {
		if field[i] == '"' {
			quoted = append(quoted, '"')
		}
		quoted = append(quoted, field[i])
	}
	quoted = append(quoted, '"')
	return string(quoted)
}

View File

@@ -0,0 +1,64 @@
package obik
import (
"context"
"fmt"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/DavidGamba/go-getoptions"
)
// The super subcommand has no option variables of its own: it shares
// _kmerSize and _minimizerSize with the index subcommand (see options.go),
// which is safe because only one subcommand runs per invocation.

// SuperKmerOptionSet declares the --kmer-size/-k and --minimizer-size/-m
// options of the super k-mer extraction on the given option parser. Both are
// bound to the shared package-level variables, whose current values are used
// as defaults.
func SuperKmerOptionSet(options *getoptions.GetOpt) {
	describe := options.Description
	alias := options.Alias

	options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
		alias("k"),
		describe("Size of k-mers (must be between m+1 and 31)."))

	options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize,
		alias("m"),
		describe("Size of minimizers (must be between 1 and k-1)."))
}
// runSuper implements the "obik super" subcommand.
// It extracts super k-mers from DNA sequences.
//
// The k-mer size must lie between 2 and 31. The minimizer size is shared
// with the index subcommand and defaults to -1 ("auto"): a negative value is
// therefore resolved with obikmer.DefaultMinimizerSize for the chosen k
// instead of being rejected, so that the subcommand works without an
// explicit -m. An explicitly given minimizer size is used unchanged and must
// lie between 1 and k-1.
//
// The sequences named in args are read, passed through
// obikmer.SuperKmerWorker and written to the configured output.
func runSuper(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
	k := _kmerSize
	if k < 2 || k > 31 {
		return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)
	}

	m := _minimizerSize
	if m < 0 {
		// Option left at its "auto" default: derive the size from k.
		m = obikmer.DefaultMinimizerSize(k)
	}
	if m < 1 || m >= k {
		return fmt.Errorf("invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1)
	}

	log.Printf("Extracting super k-mers with k=%d, m=%d", k, m)

	sequences, err := obiconvert.CLIReadBioSequences(args...)
	if err != nil {
		return fmt.Errorf("failed to open sequence files: %w", err)
	}

	worker := obikmer.SuperKmerWorker(k, m)

	superkmers := sequences.MakeIWorker(
		worker,
		false,
		obidefault.ParallelWorkers(),
	)

	obiconvert.CLIWriteBioSequences(superkmers, true)
	obiutils.WaitForLastPipe()

	return nil
}