Refactor k-mer index management with subcommands and enhanced metadata support

This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system.

Key changes:
- Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in kmer set groups
- Implement ability to append to existing kmer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve kmer index summary with Jaccard distance matrix support
- Remove deprecated obikindex and obisuperkmer commands
- Update build process to use the new subcommand structure
This commit is contained in:
Eric Coissac
2026-02-09 23:10:30 +01:00
parent f78543ee75
commit 56c1f4180c
26 changed files with 1482 additions and 1175 deletions

View File

@@ -32,15 +32,17 @@ func WithMinFrequency(minFreq int) BuilderOption {
// partitioned by minimizer. On Close(), each partition is finalized
// (sort, dedup, optional frequency filter) into .kdi files.
type KmerSetGroupBuilder struct {
dir string
k int
m int
n int // number of sets
P int // number of partitions
config builderConfig
writers [][]*SkmWriter // [setIndex][partIndex]
mu [][]sync.Mutex // per-writer mutex for concurrent access
closed bool
dir string
k int
m int
n int // number of NEW sets being built
P int // number of partitions
startIndex int // first set index (0 for new groups, existingN for appends)
config builderConfig
existing *KmerSetGroup // non-nil when appending to existing group
writers [][]*SkmWriter // [setIndex][partIndex] (local index 0..n-1)
mu [][]sync.Mutex // per-writer mutex for concurrent access
closed bool
}
// NewKmerSetGroupBuilder creates a builder for a new KmerSetGroup.
@@ -127,17 +129,94 @@ func NewKmerSetGroupBuilder(directory string, k, m, n, P int,
}
return &KmerSetGroupBuilder{
dir: directory,
k: k,
m: m,
n: n,
P: P,
config: config,
writers: writers,
mu: mutexes,
dir: directory,
k: k,
m: m,
n: n,
P: P,
startIndex: 0,
config: config,
writers: writers,
mu: mutexes,
}, nil
}
// AppendKmerSetGroupBuilder opens an existing KmerSetGroup and creates
// a builder that adds n new sets starting from the existing set count.
// The k, m, and partitions are inherited from the existing group.
func AppendKmerSetGroupBuilder(directory string, n int, options ...BuilderOption) (*KmerSetGroupBuilder, error) {
existing, err := OpenKmerSetGroup(directory)
if err != nil {
return nil, fmt.Errorf("obikmer: open existing group: %w", err)
}
if n < 1 {
return nil, fmt.Errorf("obikmer: n must be >= 1, got %d", n)
}
k := existing.K()
m := existing.M()
P := existing.Partitions()
startIndex := existing.Size()
var config builderConfig
for _, opt := range options {
opt(&config)
}
// Create build directory structure for new sets
buildDir := filepath.Join(directory, ".build")
for s := 0; s < n; s++ {
setDir := filepath.Join(buildDir, fmt.Sprintf("set_%d", s))
if err := os.MkdirAll(setDir, 0755); err != nil {
return nil, fmt.Errorf("obikmer: create build dir: %w", err)
}
}
// Create SKM writers for new sets
writers := make([][]*SkmWriter, n)
mutexes := make([][]sync.Mutex, n)
for s := 0; s < n; s++ {
writers[s] = make([]*SkmWriter, P)
mutexes[s] = make([]sync.Mutex, P)
for p := 0; p < P; p++ {
path := filepath.Join(buildDir, fmt.Sprintf("set_%d", s),
fmt.Sprintf("part_%04d.skm", p))
w, err := NewSkmWriter(path)
if err != nil {
for ss := 0; ss <= s; ss++ {
for pp := 0; pp < P; pp++ {
if writers[ss][pp] != nil {
writers[ss][pp].Close()
}
}
}
return nil, fmt.Errorf("obikmer: create skm writer: %w", err)
}
writers[s][p] = w
}
}
return &KmerSetGroupBuilder{
dir: directory,
k: k,
m: m,
n: n,
P: P,
startIndex: startIndex,
config: config,
existing: existing,
writers: writers,
mu: mutexes,
}, nil
}
// StartIndex returns the first global set index for the new sets being built.
// For new groups this is 0; for appends it is the existing group's Size().
func (b *KmerSetGroupBuilder) StartIndex() int {
return b.startIndex
}
// AddSequence extracts super-kmers from a sequence and writes them
// to the appropriate partition files for the given set.
func (b *KmerSetGroupBuilder) AddSequence(setIndex int, seq *obiseq.BioSequence) {
@@ -193,9 +272,10 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
}
}
// 2. Create output directory structure
// 2. Create output directory structure for new sets
for s := 0; s < b.n; s++ {
setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", s))
globalIdx := b.startIndex + s
setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx))
if err := os.MkdirAll(setDir, 0755); err != nil {
return nil, fmt.Errorf("obikmer: create set dir: %w", err)
}
@@ -251,24 +331,44 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
}
// 3. Build KmerSetGroup and write metadata
totalCounts := make([]uint64, b.n)
newCounts := make([]uint64, b.n)
for s := 0; s < b.n; s++ {
for p := 0; p < b.P; p++ {
totalCounts[s] += counts[s][p]
newCounts[s] += counts[s][p]
}
}
setsIDs := make([]string, b.n)
var ksg *KmerSetGroup
ksg := &KmerSetGroup{
path: b.dir,
k: b.k,
m: b.m,
partitions: b.P,
n: b.n,
setsIDs: setsIDs,
counts: totalCounts,
Metadata: make(map[string]interface{}),
if b.existing != nil {
// Append mode: extend existing group
ksg = b.existing
ksg.n += b.n
ksg.setsIDs = append(ksg.setsIDs, make([]string, b.n)...)
ksg.counts = append(ksg.counts, newCounts...)
newMeta := make([]map[string]interface{}, b.n)
for i := range newMeta {
newMeta[i] = make(map[string]interface{})
}
ksg.setsMetadata = append(ksg.setsMetadata, newMeta...)
} else {
// New group
setsIDs := make([]string, b.n)
setsMetadata := make([]map[string]interface{}, b.n)
for i := range setsMetadata {
setsMetadata[i] = make(map[string]interface{})
}
ksg = &KmerSetGroup{
path: b.dir,
k: b.k,
m: b.m,
partitions: b.P,
n: b.n,
setsIDs: setsIDs,
counts: newCounts,
setsMetadata: setsMetadata,
Metadata: make(map[string]interface{}),
}
}
if err := ksg.saveMetadata(); err != nil {
@@ -285,12 +385,14 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
// finalizePartition processes a single partition: load SKM, extract k-mers,
// sort, dedup/count, write KDI.
func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint64) error {
// setIdx is local (0..n-1); build dirs use local index, output dirs use global
skmPath := filepath.Join(b.dir, ".build",
fmt.Sprintf("set_%d", setIdx),
fmt.Sprintf("part_%04d.skm", partIdx))
globalIdx := b.startIndex + setIdx
kdiPath := filepath.Join(b.dir,
fmt.Sprintf("set_%d", setIdx),
fmt.Sprintf("set_%d", globalIdx),
fmt.Sprintf("part_%04d.kdi", partIdx))
// Load super-kmers and extract canonical k-mers

View File

@@ -2,9 +2,12 @@ package obikmer
import (
"fmt"
"io"
"iter"
"os"
"path"
"path/filepath"
"sort"
"sync"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
@@ -42,28 +45,30 @@ func (f MetadataFormat) String() string {
//
// A KmerSetGroup with Size()==1 is effectively a KmerSet (singleton).
type KmerSetGroup struct {
path string // root directory
id string // user-assigned identifier
k int // k-mer size
m int // minimizer size
partitions int // number of partitions P
n int // number of sets N
setsIDs []string // IDs of individual sets
counts []uint64 // total k-mer count per set (sum over partitions)
Metadata map[string]interface{} // group-level user metadata
path string // root directory
id string // user-assigned identifier
k int // k-mer size
m int // minimizer size
partitions int // number of partitions P
n int // number of sets N
setsIDs []string // IDs of individual sets
counts []uint64 // total k-mer count per set (sum over partitions)
setsMetadata []map[string]interface{} // per-set user metadata
Metadata map[string]interface{} // group-level user metadata
}
// diskMetadata is the TOML-serializable structure for metadata.toml.
type diskMetadata struct {
ID string `toml:"id,omitempty"`
K int `toml:"k"`
M int `toml:"m"`
Partitions int `toml:"partitions"`
Type string `toml:"type"`
Size int `toml:"size"`
SetsIDs []string `toml:"sets_ids,omitempty"`
Counts []uint64 `toml:"counts,omitempty"`
UserMetadata map[string]interface{} `toml:"user_metadata,omitempty"`
ID string `toml:"id,omitempty"`
K int `toml:"k"`
M int `toml:"m"`
Partitions int `toml:"partitions"`
Type string `toml:"type"`
Size int `toml:"size"`
SetsIDs []string `toml:"sets_ids,omitempty"`
Counts []uint64 `toml:"counts,omitempty"`
SetsMetadata []map[string]interface{} `toml:"sets_metadata,omitempty"`
UserMetadata map[string]interface{} `toml:"user_metadata,omitempty"`
}
// OpenKmerSetGroup opens a finalized index directory in read-only mode.
@@ -81,15 +86,16 @@ func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
}
ksg := &KmerSetGroup{
path: directory,
id: meta.ID,
k: meta.K,
m: meta.M,
partitions: meta.Partitions,
n: meta.Size,
setsIDs: meta.SetsIDs,
counts: meta.Counts,
Metadata: meta.UserMetadata,
path: directory,
id: meta.ID,
k: meta.K,
m: meta.M,
partitions: meta.Partitions,
n: meta.Size,
setsIDs: meta.SetsIDs,
counts: meta.Counts,
setsMetadata: meta.SetsMetadata,
Metadata: meta.UserMetadata,
}
if ksg.Metadata == nil {
ksg.Metadata = make(map[string]interface{})
@@ -97,6 +103,12 @@ func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
if ksg.setsIDs == nil {
ksg.setsIDs = make([]string, ksg.n)
}
if ksg.setsMetadata == nil {
ksg.setsMetadata = make([]map[string]interface{}, ksg.n)
for i := range ksg.setsMetadata {
ksg.setsMetadata[i] = make(map[string]interface{})
}
}
if ksg.counts == nil {
// Compute counts by scanning partitions
ksg.counts = make([]uint64, ksg.n)
@@ -133,6 +145,7 @@ func (ksg *KmerSetGroup) saveMetadata() error {
Size: ksg.n,
SetsIDs: ksg.setsIDs,
Counts: ksg.counts,
SetsMetadata: ksg.setsMetadata,
UserMetadata: ksg.Metadata,
}
@@ -578,3 +591,299 @@ func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
return sm
}
// ==============================
// Set ID accessors
// ==============================
// SetsIDs returns a copy of the per-set string identifiers.
func (ksg *KmerSetGroup) SetsIDs() []string {
out := make([]string, len(ksg.setsIDs))
copy(out, ksg.setsIDs)
return out
}
// SetIDOf returns the string ID of the set at the given index.
// Returns "" if index is out of range.
func (ksg *KmerSetGroup) SetIDOf(index int) string {
if index < 0 || index >= ksg.n {
return ""
}
return ksg.setsIDs[index]
}
// SetSetID sets the string ID of the set at the given index.
func (ksg *KmerSetGroup) SetSetID(index int, id string) {
if index >= 0 && index < ksg.n {
ksg.setsIDs[index] = id
}
}
// IndexOfSetID returns the numeric index for a set ID, or -1 if not found.
func (ksg *KmerSetGroup) IndexOfSetID(id string) int {
for i, sid := range ksg.setsIDs {
if sid == id {
return i
}
}
return -1
}
// MatchSetIDs resolves glob patterns against set IDs and returns matching
// indices sorted in ascending order. Uses path.Match for pattern matching
// (supports *, ?, [...] patterns). Returns error if a pattern is malformed.
func (ksg *KmerSetGroup) MatchSetIDs(patterns []string) ([]int, error) {
seen := make(map[int]bool)
for _, pattern := range patterns {
for i, sid := range ksg.setsIDs {
matched, err := path.Match(pattern, sid)
if err != nil {
return nil, fmt.Errorf("obikmer: invalid glob pattern %q: %w", pattern, err)
}
if matched {
seen[i] = true
}
}
}
result := make([]int, 0, len(seen))
for idx := range seen {
result = append(result, idx)
}
sort.Ints(result)
return result, nil
}
// ==============================
// Per-set metadata accessors
// ==============================
// GetSetMetadata returns the value of a per-set metadata key.
func (ksg *KmerSetGroup) GetSetMetadata(setIndex int, key string) (interface{}, bool) {
if setIndex < 0 || setIndex >= ksg.n {
return nil, false
}
v, ok := ksg.setsMetadata[setIndex][key]
return v, ok
}
// SetSetMetadata sets a per-set metadata attribute.
func (ksg *KmerSetGroup) SetSetMetadata(setIndex int, key string, value interface{}) {
if setIndex < 0 || setIndex >= ksg.n {
return
}
if ksg.setsMetadata[setIndex] == nil {
ksg.setsMetadata[setIndex] = make(map[string]interface{})
}
ksg.setsMetadata[setIndex][key] = value
}
// DeleteSetMetadata removes a per-set metadata attribute.
func (ksg *KmerSetGroup) DeleteSetMetadata(setIndex int, key string) {
if setIndex < 0 || setIndex >= ksg.n {
return
}
delete(ksg.setsMetadata[setIndex], key)
}
// AllSetMetadata returns a copy of all metadata for a given set.
func (ksg *KmerSetGroup) AllSetMetadata(setIndex int) map[string]interface{} {
if setIndex < 0 || setIndex >= ksg.n {
return nil
}
out := make(map[string]interface{}, len(ksg.setsMetadata[setIndex]))
for k, v := range ksg.setsMetadata[setIndex] {
out[k] = v
}
return out
}
// ==============================
// Exported partition path and compatibility
// ==============================
// PartitionPath returns the file path for partition partIndex of set setIndex.
func (ksg *KmerSetGroup) PartitionPath(setIndex, partIndex int) string {
return ksg.partitionPath(setIndex, partIndex)
}
// IsCompatibleWith returns true if the other group has the same k, m, and partitions.
func (ksg *KmerSetGroup) IsCompatibleWith(other *KmerSetGroup) bool {
return ksg.k == other.k && ksg.m == other.m && ksg.partitions == other.partitions
}
// ==============================
// Set management operations
// ==============================
// NewEmptyCompatible creates an empty KmerSetGroup at destDir with the same
// k, m, and partitions as this group. The destination must not already exist.
func (ksg *KmerSetGroup) NewEmptyCompatible(destDir string) (*KmerSetGroup, error) {
if err := os.MkdirAll(destDir, 0755); err != nil {
return nil, fmt.Errorf("obikmer: create directory: %w", err)
}
dest := &KmerSetGroup{
path: destDir,
k: ksg.k,
m: ksg.m,
partitions: ksg.partitions,
n: 0,
setsIDs: []string{},
counts: []uint64{},
setsMetadata: []map[string]interface{}{},
Metadata: make(map[string]interface{}),
}
if err := dest.saveMetadata(); err != nil {
return nil, fmt.Errorf("obikmer: write metadata: %w", err)
}
return dest, nil
}
// RemoveSetByID removes the set with the given ID from the group.
// It deletes the set directory, renumbers all subsequent sets, and
// updates the metadata on disk.
func (ksg *KmerSetGroup) RemoveSetByID(id string) error {
idx := ksg.IndexOfSetID(id)
if idx < 0 {
return fmt.Errorf("obikmer: set ID %q not found", id)
}
// Delete the set directory
setDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", idx))
if err := os.RemoveAll(setDir); err != nil {
return fmt.Errorf("obikmer: remove set directory: %w", err)
}
// Renumber subsequent sets
for i := idx + 1; i < ksg.n; i++ {
oldDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i))
newDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i-1))
if err := os.Rename(oldDir, newDir); err != nil {
return fmt.Errorf("obikmer: rename set_%d to set_%d: %w", i, i-1, err)
}
}
// Update slices
ksg.setsIDs = append(ksg.setsIDs[:idx], ksg.setsIDs[idx+1:]...)
ksg.counts = append(ksg.counts[:idx], ksg.counts[idx+1:]...)
ksg.setsMetadata = append(ksg.setsMetadata[:idx], ksg.setsMetadata[idx+1:]...)
ksg.n--
return ksg.saveMetadata()
}
// CopySetsByIDTo copies sets identified by their IDs into a KmerSetGroup
// at destDir. If destDir does not exist, a new compatible empty group is
// created. If it exists, compatibility (k, m, partitions) is checked.
// If a set ID already exists in the destination, an error is returned
// unless force is true (in which case the existing set is replaced).
// Per-set metadata travels with the set.
func (ksg *KmerSetGroup) CopySetsByIDTo(ids []string, destDir string, force bool) (*KmerSetGroup, error) {
// Resolve source IDs to indices
srcIndices := make([]int, len(ids))
for i, id := range ids {
idx := ksg.IndexOfSetID(id)
if idx < 0 {
return nil, fmt.Errorf("obikmer: source set ID %q not found", id)
}
srcIndices[i] = idx
}
// Open or create destination
var dest *KmerSetGroup
metaPath := filepath.Join(destDir, "metadata.toml")
if _, err := os.Stat(metaPath); err == nil {
// Destination exists
dest, err = OpenKmerSetGroup(destDir)
if err != nil {
return nil, fmt.Errorf("obikmer: open destination: %w", err)
}
if !ksg.IsCompatibleWith(dest) {
return nil, fmt.Errorf("obikmer: incompatible groups: source (k=%d, m=%d, P=%d) vs dest (k=%d, m=%d, P=%d)",
ksg.k, ksg.m, ksg.partitions, dest.k, dest.m, dest.partitions)
}
} else {
// Create new destination
var err error
dest, err = ksg.NewEmptyCompatible(destDir)
if err != nil {
return nil, err
}
}
// Copy each set
for i, srcIdx := range srcIndices {
srcID := ids[i]
// Check for ID conflict in destination
existingIdx := dest.IndexOfSetID(srcID)
if existingIdx >= 0 {
if !force {
return nil, fmt.Errorf("obikmer: set ID %q already exists in destination (use force to replace)", srcID)
}
// Force: remove existing set in destination
if err := dest.RemoveSetByID(srcID); err != nil {
return nil, fmt.Errorf("obikmer: remove existing set %q in destination: %w", srcID, err)
}
}
// Destination set index = current dest size
destIdx := dest.n
// Create destination set directory
destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", destIdx))
if err := os.MkdirAll(destSetDir, 0755); err != nil {
return nil, fmt.Errorf("obikmer: create dest set dir: %w", err)
}
// Copy all partition files
for p := 0; p < ksg.partitions; p++ {
srcPath := ksg.partitionPath(srcIdx, p)
destPath := dest.partitionPath(destIdx, p)
if err := copyFile(srcPath, destPath); err != nil {
return nil, fmt.Errorf("obikmer: copy partition %d of set %q: %w", p, srcID, err)
}
}
// Update destination metadata
dest.setsIDs = append(dest.setsIDs, srcID)
dest.counts = append(dest.counts, ksg.counts[srcIdx])
// Copy per-set metadata
srcMeta := ksg.AllSetMetadata(srcIdx)
if srcMeta == nil {
srcMeta = make(map[string]interface{})
}
dest.setsMetadata = append(dest.setsMetadata, srcMeta)
dest.n++
}
if err := dest.saveMetadata(); err != nil {
return nil, fmt.Errorf("obikmer: save destination metadata: %w", err)
}
return dest, nil
}
// copyFile copies a file from src to dst.
func copyFile(src, dst string) error {
in, err := os.Open(src)
if err != nil {
return err
}
defer in.Close()
out, err := os.Create(dst)
if err != nil {
return err
}
defer out.Close()
if _, err := io.Copy(out, in); err != nil {
return err
}
return out.Close()
}

View File

@@ -26,16 +26,11 @@ var __defaut_taxonomy_mutex__ sync.Mutex
type ArgumentParser func([]string) (*getoptions.GetOpt, []string)
func GenerateOptionParser(program string,
documentation string,
optionset ...func(*getoptions.GetOpt)) ArgumentParser {
options := getoptions.New()
options.Self(program, documentation)
options.SetMode(getoptions.Bundling)
options.SetUnknownMode(getoptions.Fail)
options.Bool("help", false, options.Alias("h", "?"))
// RegisterGlobalOptions registers the global options shared by all obitools
// commands onto the given GetOpt instance. It does NOT register --help,
// which must be handled by the caller (either as a Bool option or via
// HelpCommand for subcommand-based parsers).
func RegisterGlobalOptions(options *getoptions.GetOpt) {
options.Bool("version", false,
options.Description("Prints the version and exits."))
@@ -46,17 +41,10 @@ func GenerateOptionParser(program string,
options.BoolVar(&_Pprof, "pprof", false,
options.Description("Enable pprof server. Look at the log for details."))
// options.IntVar(&_ParallelWorkers, "workers", _ParallelWorkers,
// options.Alias("w"),
// options.Description("Number of parallele threads computing the result"))
options.IntVar(obidefault.MaxCPUPtr(), "max-cpu", obidefault.MaxCPU(),
options.GetEnv("OBIMAXCPU"),
options.Description("Number of parallele threads computing the result"))
// options.BoolVar(&_Pprof, "force-one-cpu", false,
// options.Description("Force to use only one cpu core for parallel processing"))
options.IntVar(&_PprofMudex, "pprof-mutex", _PprofMudex,
options.GetEnv("OBIPPROFMUTEX"),
options.Description("Enable profiling of mutex lock."))
@@ -77,119 +65,119 @@ func GenerateOptionParser(program string,
options.GetEnv("OBIWARNING"),
options.Description("Stop printing of the warning message"),
)
}
// ProcessParsedOptions handles the post-parse logic common to all obitools
// commands: help, version, debug, pprof, taxonomy, cpu configuration, etc.
// It receives the GetOpt instance and the parse error (if any).
func ProcessParsedOptions(options *getoptions.GetOpt, parseErr error) {
// Note: "help" may not be registered as a Bool (e.g. when using HelpCommand
// for subcommand-based parsers). Only check if it won't panic.
// We use a recover guard to be safe.
func() {
defer func() { recover() }()
if options.Called("help") {
fmt.Fprint(os.Stderr, options.Help())
os.Exit(0)
}
}()
if options.Called("version") {
fmt.Fprintf(os.Stderr, "OBITools %s\n", VersionString())
os.Exit(0)
}
if options.Called("taxonomy") {
__defaut_taxonomy_mutex__.Lock()
defer __defaut_taxonomy_mutex__.Unlock()
taxonomy, err := obiformats.LoadTaxonomy(
obidefault.SelectedTaxonomy(),
!obidefault.AreAlternativeNamesSelected(),
SeqAsTaxa(),
)
if err != nil {
log.Fatalf("Cannot load default taxonomy: %v", err)
}
taxonomy.SetAsDefault()
}
log.SetLevel(log.InfoLevel)
if options.Called("debug") {
log.SetLevel(log.DebugLevel)
log.Debugln("Switch to debug level logging")
}
if options.Called("pprof") {
url := "localhost:6060"
go http.ListenAndServe(url, nil)
log.Infof("Start a pprof server at address %s/debug/pprof", url)
log.Info("Profil can be followed running concurrently the command :")
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/profile?seconds=30'")
}
if options.Called("pprof-mutex") {
url := "localhost:6060"
go http.ListenAndServe(url, nil)
runtime.SetMutexProfileFraction(_PprofMudex)
log.Infof("Start a pprof server at address %s/debug/pprof", url)
log.Info("Profil can be followed running concurrently the command :")
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/mutex'")
}
if options.Called("pprof-goroutine") {
url := "localhost:6060"
go http.ListenAndServe(url, nil)
runtime.SetBlockProfileRate(_PprofGoroutine)
log.Infof("Start a pprof server at address %s/debug/pprof", url)
log.Info("Profil can be followed running concurrently the command :")
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'")
}
// Handle user errors
if parseErr != nil {
fmt.Fprintf(os.Stderr, "ERROR: %s\n\n", parseErr)
fmt.Fprint(os.Stderr, options.Help(getoptions.HelpSynopsis))
os.Exit(1)
}
runtime.GOMAXPROCS(obidefault.MaxCPU())
if options.Called("max-cpu") {
log.Printf("CPU number limited to %d", obidefault.MaxCPU())
}
if options.Called("no-singleton") {
log.Printf("No singleton option set")
}
log.Printf("Number of workers set %d", obidefault.ParallelWorkers())
if options.Called("solexa") {
obidefault.SetReadQualitiesShift(64)
}
}
func GenerateOptionParser(program string,
documentation string,
optionset ...func(*getoptions.GetOpt)) ArgumentParser {
options := getoptions.New()
options.Self(program, documentation)
options.SetMode(getoptions.Bundling)
options.SetUnknownMode(getoptions.Fail)
options.Bool("help", false, options.Alias("h", "?"))
RegisterGlobalOptions(options)
for _, o := range optionset {
o(options)
}
return func(args []string) (*getoptions.GetOpt, []string) {
remaining, err := options.Parse(args[1:])
if options.Called("help") {
fmt.Fprint(os.Stderr, options.Help())
os.Exit(0)
}
if options.Called("version") {
fmt.Fprintf(os.Stderr, "OBITools %s\n", VersionString())
os.Exit(0)
}
if options.Called("taxonomy") {
__defaut_taxonomy_mutex__.Lock()
defer __defaut_taxonomy_mutex__.Unlock()
taxonomy, err := obiformats.LoadTaxonomy(
obidefault.SelectedTaxonomy(),
!obidefault.AreAlternativeNamesSelected(),
SeqAsTaxa(),
)
if err != nil {
log.Fatalf("Cannot load default taxonomy: %v", err)
}
taxonomy.SetAsDefault()
}
log.SetLevel(log.InfoLevel)
if options.Called("debug") {
log.SetLevel(log.DebugLevel)
log.Debugln("Switch to debug level logging")
}
if options.Called("pprof") {
url := "localhost:6060"
go http.ListenAndServe(url, nil)
log.Infof("Start a pprof server at address %s/debug/pprof", url)
log.Info("Profil can be followed running concurrently the command :")
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/profile?seconds=30'")
}
if options.Called("pprof-mutex") {
url := "localhost:6060"
go http.ListenAndServe(url, nil)
runtime.SetMutexProfileFraction(_PprofMudex)
log.Infof("Start a pprof server at address %s/debug/pprof", url)
log.Info("Profil can be followed running concurrently the command :")
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/mutex'")
}
if options.Called("pprof-goroutine") {
url := "localhost:6060"
go http.ListenAndServe(url, nil)
runtime.SetBlockProfileRate(_PprofGoroutine)
log.Infof("Start a pprof server at address %s/debug/pprof", url)
log.Info("Profil can be followed running concurrently the command :")
log.Info(" go tool pprof -http=127.0.0.1:8080 'http://localhost:6060/debug/pprof/block'")
}
// Handle user errors
if err != nil {
fmt.Fprintf(os.Stderr, "ERROR: %s\n\n", err)
fmt.Fprint(os.Stderr, options.Help(getoptions.HelpSynopsis))
os.Exit(1)
}
// // Setup the maximum number of CPU usable by the program
// if obidefault.MaxCPU() == 1 {
// log.Warn("Limitating the Maximum number of CPU to 1 is not recommanded")
// log.Warn("The number of CPU requested has been set to 2")
// obidefault.SetMaxCPU(2)
// }
// if options.Called("force-one-cpu") {
// log.Warn("Limitating the Maximum number of CPU to 1 is not recommanded")
// log.Warn("The number of CPU has been forced to 1")
// log.Warn("This can lead to unexpected behavior")
// obidefault.SetMaxCPU(1)
// }
runtime.GOMAXPROCS(obidefault.MaxCPU())
// if options.Called("max-cpu") || options.Called("force-one-cpu") {
// log.Printf("CPU number limited to %d", obidefault.MaxCPU())
// }
if options.Called("max-cpu") {
log.Printf("CPU number limited to %d", obidefault.MaxCPU())
}
if options.Called("no-singleton") {
log.Printf("No singleton option set")
}
log.Printf("Number of workers set %d", obidefault.ParallelWorkers())
// if options.Called("workers") {
// }
if options.Called("solexa") {
obidefault.SetReadQualitiesShift(64)
}
ProcessParsedOptions(options, err)
return options, remaining
}
}

View File

@@ -0,0 +1,43 @@
package obioptions
import (
"github.com/DavidGamba/go-getoptions"
)
// GenerateSubcommandParser creates an option parser that supports subcommands
// via go-getoptions' NewCommand/SetCommandFn/Dispatch API.
//
// The setup function receives the root *GetOpt and should register subcommands
// using opt.NewCommand(). Global options (--debug, --max-cpu, etc.) are
// registered before setup is called and are inherited by all subcommands.
//
// Returns the root *GetOpt (needed for Dispatch) and an ArgumentParser
// that handles parsing and post-parse processing.
func GenerateSubcommandParser(
program string,
documentation string,
setup func(opt *getoptions.GetOpt),
) (*getoptions.GetOpt, ArgumentParser) {
options := getoptions.New()
options.Self(program, documentation)
options.SetMode(getoptions.Bundling)
options.SetUnknownMode(getoptions.Fail)
// Register global options (inherited by all subcommands)
RegisterGlobalOptions(options)
// Let the caller register subcommands
setup(options)
// Add automatic help subcommand (must be after all commands)
options.HelpCommand("help", options.Description("Show help for a command"))
parser := func(args []string) (*getoptions.GetOpt, []string) {
remaining, err := options.Parse(args[1:])
ProcessParsedOptions(options, err)
return options, remaining
}
return options, parser
}

55
pkg/obitools/obik/cp.go Normal file
View File

@@ -0,0 +1,55 @@
package obik
import (
"context"
"fmt"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"github.com/DavidGamba/go-getoptions"
)
func runCp(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
if len(args) < 2 {
return fmt.Errorf("usage: obik cp [--set PATTERN]... [--force] <source_index> <dest_index>")
}
srcDir := args[0]
destDir := args[1]
ksg, err := obikmer.OpenKmerSetGroup(srcDir)
if err != nil {
return fmt.Errorf("failed to open source kmer index: %w", err)
}
// Resolve set patterns
patterns := CLISetPatterns()
var ids []string
if len(patterns) > 0 {
indices, err := ksg.MatchSetIDs(patterns)
if err != nil {
return err
}
if len(indices) == 0 {
return fmt.Errorf("no sets match the given patterns")
}
ids = make([]string, len(indices))
for i, idx := range indices {
ids[i] = ksg.SetIDOf(idx)
}
} else {
// Copy all sets
ids = ksg.SetsIDs()
}
log.Infof("Copying %d set(s) from %s to %s", len(ids), srcDir, destDir)
dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce())
if err != nil {
return err
}
log.Infof("Destination now has %d set(s)", dest.Size())
return nil
}

110
pkg/obitools/obik/index.go Normal file
View File

@@ -0,0 +1,110 @@
package obik
import (
"context"
"fmt"
"os"
"path/filepath"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
outDir := CLIOutputDirectory()
if outDir == "" || outDir == "-" {
return fmt.Errorf("--out option is required and must specify a directory path")
}
k := CLIKmerSize()
if k < 2 || k > 31 {
return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)
}
m := CLIMinimizerSize()
minOcc := CLIMinOccurrence()
if minOcc < 1 {
return fmt.Errorf("invalid min-occurrence: %d (must be >= 1)", minOcc)
}
// Build options
var opts []obikmer.BuilderOption
if minOcc > 1 {
opts = append(opts, obikmer.WithMinFrequency(minOcc))
}
// Determine whether to append to existing group or create new
var builder *obikmer.KmerSetGroupBuilder
var err error
metaPath := filepath.Join(outDir, "metadata.toml")
if _, statErr := os.Stat(metaPath); statErr == nil {
// Existing group: append
log.Infof("Appending to existing kmer index at %s", outDir)
builder, err = obikmer.AppendKmerSetGroupBuilder(outDir, 1, opts...)
if err != nil {
return fmt.Errorf("failed to open existing kmer index for appending: %w", err)
}
} else {
// New group
log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
builder, err = obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...)
if err != nil {
return fmt.Errorf("failed to create kmer index builder: %w", err)
}
}
// Read and process sequences
sequences, err := obiconvert.CLIReadBioSequences(args...)
if err != nil {
return fmt.Errorf("failed to open sequence files: %w", err)
}
seqCount := 0
for sequences.Next() {
batch := sequences.Get()
for _, seq := range batch.Slice() {
builder.AddSequence(0, seq)
seqCount++
}
}
log.Infof("Processed %d sequences", seqCount)
// Finalize
ksg, err := builder.Close()
if err != nil {
return fmt.Errorf("failed to finalize kmer index: %w", err)
}
// Apply index-id to the new set
newSetIdx := builder.StartIndex()
if id := CLIIndexId(); id != "" {
ksg.SetSetID(newSetIdx, id)
}
// Apply group-level tags (-S)
for key, value := range CLISetTag() {
ksg.SetAttribute(key, value)
}
// Apply per-set tags (-T) to the new set
for key, value := range _setMetaTags {
ksg.SetSetMetadata(newSetIdx, key, value)
}
if minOcc > 1 {
ksg.SetAttribute("min_occurrence", minOcc)
}
if err := ksg.SaveMetadata(); err != nil {
return fmt.Errorf("failed to save metadata: %w", err)
}
log.Infof("Index contains %d k-mers for set %d in %s", ksg.Len(newSetIdx), newSetIdx, outDir)
log.Info("Done.")
return nil
}

View File

@@ -1,39 +1,80 @@
package obilowmask
package obik
import (
"context"
"fmt"
"math"
"strings"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/DavidGamba/go-getoptions"
)
// MaskingMode defines how to handle low-complexity regions
type MaskingMode int
const (
Mask MaskingMode = iota // Mask mode: replace low-complexity regions with masked characters
Split // Split mode: split sequence into high-complexity fragments
Extract
MaskMode MaskingMode = iota // Replace low-complexity regions with masked characters
SplitMode // Split sequence into high-complexity fragments
ExtractMode // Extract low-complexity fragments
)
// LowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
//
// Algorithm principle:
// Calculate the normalized entropy of each k-mer at different scales (wordSize = 1 to level_max).
// K-mers with entropy below the threshold are masked.
//
// Parameters:
// - kmer_size: size of the sliding window for entropy calculation
// - level_max: maximum word size used for entropy calculation (finest scale)
// - threshold: normalized entropy threshold below which masking occurs (between 0 and 1)
// - mode: Mask (masking) or Split (splitting)
// - maskChar: character used for masking (typically 'n' or 'N')
//
// Returns: a SeqWorker function that can be applied to each sequence
func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
// Lowmask-specific option variables (separate from index/super kmer-size).
var _lowmaskKmerSize = 31
var _lowmaskLevelMax = 6
var _lowmaskThreshold = 0.5
var _lowmaskSplitMode = false
var _lowmaskLowMode = false
var _lowmaskMaskChar = "."
// LowMaskOptionSet registers options specific to low-complexity masking.
func LowMaskOptionSet(options *getoptions.GetOpt) {
options.IntVar(&_lowmaskKmerSize, "kmer-size", _lowmaskKmerSize,
options.Description("Size of the kmer considered to estimate entropy."))
options.IntVar(&_lowmaskLevelMax, "entropy-size", _lowmaskLevelMax,
options.Description("Maximum word size considered for entropy estimate."))
options.Float64Var(&_lowmaskThreshold, "threshold", _lowmaskThreshold,
options.Description("Entropy threshold below which a kmer is masked (0 to 1)."))
options.BoolVar(&_lowmaskSplitMode, "split-mode", _lowmaskSplitMode,
options.Description("Split sequences to remove masked regions."))
options.BoolVar(&_lowmaskLowMode, "low-mode", _lowmaskLowMode,
options.Description("Extract only low-complexity regions."))
options.StringVar(&_lowmaskMaskChar, "masking-char", _lowmaskMaskChar,
options.Description("Character used to mask low complexity regions."))
}
func lowmaskMaskingMode() MaskingMode {
switch {
case _lowmaskLowMode:
return ExtractMode
case _lowmaskSplitMode:
return SplitMode
default:
return MaskMode
}
}
func lowmaskMaskingChar() byte {
mask := strings.TrimSpace(_lowmaskMaskChar)
if len(mask) != 1 {
log.Fatalf("--masking-char option accepts a single character, not %s", mask)
}
return []byte(mask)[0]
}
// lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
nLogN := make([]float64, kmer_size+1)
for i := 1; i <= kmer_size; i++ {
@@ -87,6 +128,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
data[i] = deque[0].value
}
}
emaxValues := make([]float64, level_max+1)
logNwords := make([]float64, level_max+1)
for ws := 1; ws <= level_max; ws++ {
@@ -329,7 +371,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
maskPositions := maskAmbiguities(bseq)
mask := make([]int, len(bseq))
maskFlags := make([]int, len(bseq))
entropies := make([]float64, len(bseq))
for i := range entropies {
entropies[i] = 4.0
@@ -343,7 +385,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
for i := range bseq {
v := level_max
mask[i] = v
maskFlags[i] = v
}
for ws := level_max - 1; ws > 0; ws-- {
@@ -351,7 +393,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
for i, e2 := range entropies2 {
if e2 < entropies[i] {
entropies[i] = e2
mask[i] = ws
maskFlags[i] = ws
}
}
}
@@ -367,39 +409,49 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
remove[i] = e <= threshold
}
sequence.SetAttribute("mask", mask)
sequence.SetAttribute("mask", maskFlags)
sequence.SetAttribute("Entropies", entropies)
switch mode {
case Mask:
case MaskMode:
return applyMaskMode(sequence, remove, maskChar)
case Split:
case SplitMode:
return selectunmasked(sequence, remove)
case Extract:
case ExtractMode:
return selectMasked(sequence, remove)
}
return nil, fmt.Errorf("Unknown mode %d", mode)
return nil, fmt.Errorf("unknown mode %d", mode)
}
return masking
}
// CLISequenceEntropyMasker creates an iterator that applies entropy masking
// to all sequences in an input iterator.
//
// Uses command-line parameters to configure the worker.
func CLISequenceEntropyMasker(iterator obiiter.IBioSequence) obiiter.IBioSequence {
var newIter obiiter.IBioSequence
// runLowmask implements the "obik lowmask" subcommand.
// It masks low-complexity regions in DNA sequences using entropy-based detection.
func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
kmerSize := _lowmaskKmerSize
levelMax := _lowmaskLevelMax
threshold := _lowmaskThreshold
mode := lowmaskMaskingMode()
maskChar := lowmaskMaskingChar()
worker := LowMaskWorker(
CLIKmerSize(),
CLILevelMax(),
CLIThreshold(),
CLIMaskingMode(),
CLIMaskingChar(),
)
log.Printf("Low-complexity masking: kmer-size=%d, entropy-size=%d, threshold=%.4f", kmerSize, levelMax, threshold)
newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers())
sequences, err := obiconvert.CLIReadBioSequences(args...)
if err != nil {
return fmt.Errorf("failed to open sequence files: %w", err)
}
return newIter.FilterEmpty()
worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar)
masked := sequences.MakeIWorker(
worker,
false,
obidefault.ParallelWorkers(),
).FilterEmpty()
obiconvert.CLIWriteBioSequences(masked, true)
obiutils.WaitForLastPipe()
return nil
}

96
pkg/obitools/obik/ls.go Normal file
View File

@@ -0,0 +1,96 @@
package obik
import (
"context"
"encoding/json"
"fmt"
"strings"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"github.com/DavidGamba/go-getoptions"
"gopkg.in/yaml.v3"
)
type setEntry struct {
Index int `json:"index" yaml:"index"`
ID string `json:"id" yaml:"id"`
Count uint64 `json:"count" yaml:"count"`
}
func runLs(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
if len(args) < 1 {
return fmt.Errorf("usage: obik ls [options] <index_directory>")
}
ksg, err := obikmer.OpenKmerSetGroup(args[0])
if err != nil {
return fmt.Errorf("failed to open kmer index: %w", err)
}
// Determine which sets to show
patterns := CLISetPatterns()
var indices []int
if len(patterns) > 0 {
indices, err = ksg.MatchSetIDs(patterns)
if err != nil {
return err
}
} else {
indices = make([]int, ksg.Size())
for i := range indices {
indices[i] = i
}
}
entries := make([]setEntry, len(indices))
for i, idx := range indices {
entries[i] = setEntry{
Index: idx,
ID: ksg.SetIDOf(idx),
Count: ksg.Len(idx),
}
}
format := CLIOutFormat()
switch format {
case "json":
return outputLsJSON(entries)
case "yaml":
return outputLsYAML(entries)
case "csv":
return outputLsCSV(entries)
default:
return outputLsCSV(entries)
}
}
func outputLsCSV(entries []setEntry) error {
fmt.Println("index,id,count")
for _, e := range entries {
// Escape commas in ID if needed
id := e.ID
if strings.ContainsAny(id, ",\"") {
id = "\"" + strings.ReplaceAll(id, "\"", "\"\"") + "\""
}
fmt.Printf("%d,%s,%d\n", e.Index, id, e.Count)
}
return nil
}
func outputLsJSON(entries []setEntry) error {
data, err := json.MarshalIndent(entries, "", " ")
if err != nil {
return err
}
fmt.Println(string(data))
return nil
}
func outputLsYAML(entries []setEntry) error {
data, err := yaml.Marshal(entries)
if err != nil {
return err
}
fmt.Print(string(data))
return nil
}

63
pkg/obitools/obik/mv.go Normal file
View File

@@ -0,0 +1,63 @@
package obik
import (
"context"
"fmt"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"github.com/DavidGamba/go-getoptions"
)
func runMv(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
if len(args) < 2 {
return fmt.Errorf("usage: obik mv [--set PATTERN]... [--force] <source_index> <dest_index>")
}
srcDir := args[0]
destDir := args[1]
ksg, err := obikmer.OpenKmerSetGroup(srcDir)
if err != nil {
return fmt.Errorf("failed to open source kmer index: %w", err)
}
// Resolve set patterns
patterns := CLISetPatterns()
var ids []string
if len(patterns) > 0 {
indices, err := ksg.MatchSetIDs(patterns)
if err != nil {
return err
}
if len(indices) == 0 {
return fmt.Errorf("no sets match the given patterns")
}
ids = make([]string, len(indices))
for i, idx := range indices {
ids[i] = ksg.SetIDOf(idx)
}
} else {
// Move all sets
ids = ksg.SetsIDs()
}
log.Infof("Moving %d set(s) from %s to %s", len(ids), srcDir, destDir)
// Copy first
dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce())
if err != nil {
return err
}
// Remove from source (in reverse order to avoid renumbering issues)
for i := len(ids) - 1; i >= 0; i-- {
if err := ksg.RemoveSetByID(ids[i]); err != nil {
return fmt.Errorf("failed to remove set %q from source after copy: %w", ids[i], err)
}
}
log.Infof("Destination now has %d set(s), source has %d set(s)", dest.Size(), ksg.Size())
return nil
}

64
pkg/obitools/obik/obik.go Normal file
View File

@@ -0,0 +1,64 @@
package obik
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// OptionSet registers all obik subcommands on the root GetOpt.
func OptionSet(opt *getoptions.GetOpt) {
// index: build or extend a kmer index from sequence files
indexCmd := opt.NewCommand("index", "Build a disk-based kmer index from sequence files")
obiconvert.InputOptionSet(indexCmd)
obiconvert.OutputModeOptionSet(indexCmd, false)
KmerIndexOptionSet(indexCmd)
indexCmd.StringMapVar(&_setMetaTags, "tag", 1, 1,
indexCmd.Alias("T"),
indexCmd.ArgName("KEY=VALUE"),
indexCmd.Description("Per-set metadata tag (repeatable)."))
indexCmd.SetCommandFn(runIndex)
// ls: list sets in a kmer index
lsCmd := opt.NewCommand("ls", "List sets in a kmer index")
OutputFormatOptionSet(lsCmd)
SetSelectionOptionSet(lsCmd)
lsCmd.SetCommandFn(runLs)
// summary: detailed statistics
summaryCmd := opt.NewCommand("summary", "Show detailed statistics of a kmer index")
OutputFormatOptionSet(summaryCmd)
summaryCmd.BoolVar(&_jaccard, "jaccard", false,
summaryCmd.Description("Compute and display pairwise Jaccard distance matrix."))
summaryCmd.SetCommandFn(runSummary)
// cp: copy sets between indices
cpCmd := opt.NewCommand("cp", "Copy sets between kmer indices")
SetSelectionOptionSet(cpCmd)
ForceOptionSet(cpCmd)
cpCmd.SetCommandFn(runCp)
// mv: move sets between indices
mvCmd := opt.NewCommand("mv", "Move sets between kmer indices")
SetSelectionOptionSet(mvCmd)
ForceOptionSet(mvCmd)
mvCmd.SetCommandFn(runMv)
// rm: remove sets from an index
rmCmd := opt.NewCommand("rm", "Remove sets from a kmer index")
SetSelectionOptionSet(rmCmd)
rmCmd.SetCommandFn(runRm)
// super: extract super k-mers from sequences
superCmd := opt.NewCommand("super", "Extract super k-mers from sequence files")
obiconvert.InputOptionSet(superCmd)
obiconvert.OutputOptionSet(superCmd)
SuperKmerOptionSet(superCmd)
superCmd.SetCommandFn(runSuper)
// lowmask: mask low-complexity regions
lowmaskCmd := opt.NewCommand("lowmask", "Mask low-complexity regions in sequences using entropy")
obiconvert.InputOptionSet(lowmaskCmd)
obiconvert.OutputOptionSet(lowmaskCmd)
LowMaskOptionSet(lowmaskCmd)
lowmaskCmd.SetCommandFn(runLowmask)
}

View File

@@ -1,4 +1,4 @@
package obikindex
package obik
import (
"strings"
@@ -11,7 +11,27 @@ import (
"github.com/DavidGamba/go-getoptions"
)
// Private variables for storing option values
// Output format flags
var _jsonOutput bool
var _csvOutput bool
var _yamlOutput bool
// Set selection flags
var _setPatterns []string
// Force flag
var _force bool
// Jaccard flag
var _jaccard bool
// Per-set tags for index subcommand
var _setMetaTags = make(map[string]string, 0)
// ==============================
// Kmer index building options (moved from obikindex)
// ==============================
var _kmerSize = 31
var _minimizerSize = -1 // -1 means auto: ceil(k / 2.5)
var _indexId = ""
@@ -39,7 +59,7 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) {
options.StringMapVar(&_setTag, "set-tag", 1, 1,
options.Alias("S"),
options.ArgName("KEY=VALUE"),
options.Description("Adds a metadata attribute KEY with value VALUE to the index."))
options.Description("Adds a group-level metadata attribute KEY with value VALUE."))
options.IntVar(&_minOccurrence, "min-occurrence", _minOccurrence,
options.Description("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all)."))
@@ -48,23 +68,12 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) {
options.Description("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index."))
}
// OptionSet adds to the basic option set every option declared for
// the obikindex command.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.InputOptionSet(options)
obiconvert.OutputModeOptionSet(options, false)
KmerIndexOptionSet(options)
}
// CLIKmerSize returns the k-mer size.
func CLIKmerSize() int {
return _kmerSize
}
// CLIMinimizerSize returns the effective minimizer size.
// If -1 (auto), computes ceil(k / 2.5) then applies constraints:
// - minimum: ceil(log(nworkers) / log(4))
// - maximum: k - 1
func CLIMinimizerSize() int {
m := _minimizerSize
if m < 0 {
@@ -95,7 +104,7 @@ func CLIMetadataFormat() obikmer.MetadataFormat {
}
}
// CLISetTag returns the metadata key=value pairs.
// CLISetTag returns the group-level metadata key=value pairs.
func CLISetTag() map[string]string {
return _setTag
}
@@ -129,3 +138,52 @@ func SetMinimizerSize(m int) {
func SetMinOccurrence(n int) {
_minOccurrence = n
}
// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output.
func OutputFormatOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_jsonOutput, "json-output", false,
options.Description("Print results as JSON."))
options.BoolVar(&_csvOutput, "csv-output", false,
options.Description("Print results as CSV."))
options.BoolVar(&_yamlOutput, "yaml-output", false,
options.Description("Print results as YAML."))
}
// CLIOutFormat returns the selected output format: "json", "csv", "yaml", or "text".
func CLIOutFormat() string {
if _jsonOutput {
return "json"
}
if _csvOutput {
return "csv"
}
if _yamlOutput {
return "yaml"
}
return "text"
}
// SetSelectionOptionSet registers --set <glob_pattern> (repeatable).
func SetSelectionOptionSet(options *getoptions.GetOpt) {
options.StringSliceVar(&_setPatterns, "set", 1, 1,
options.Alias("s"),
options.ArgName("PATTERN"),
options.Description("Set ID or glob pattern (repeatable, supports *, ?, [...])."))
}
// CLISetPatterns returns the --set patterns provided by the user.
func CLISetPatterns() []string {
return _setPatterns
}
// ForceOptionSet registers --force / -f.
func ForceOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_force, "force", false,
options.Alias("f"),
options.Description("Force operation even if set ID already exists in destination."))
}
// CLIForce returns whether --force was specified.
func CLIForce() bool {
return _force
}

56
pkg/obitools/obik/rm.go Normal file
View File

@@ -0,0 +1,56 @@
package obik
import (
"context"
"fmt"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"github.com/DavidGamba/go-getoptions"
)
func runRm(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
if len(args) < 1 {
return fmt.Errorf("usage: obik rm --set PATTERN [--set PATTERN]... <index_directory>")
}
patterns := CLISetPatterns()
if len(patterns) == 0 {
return fmt.Errorf("--set is required (specify which sets to remove)")
}
indexDir := args[0]
ksg, err := obikmer.OpenKmerSetGroup(indexDir)
if err != nil {
return fmt.Errorf("failed to open kmer index: %w", err)
}
indices, err := ksg.MatchSetIDs(patterns)
if err != nil {
return err
}
if len(indices) == 0 {
return fmt.Errorf("no sets match the given patterns")
}
// Collect IDs before removal (indices shift as we remove)
ids := make([]string, len(indices))
for i, idx := range indices {
ids[i] = ksg.SetIDOf(idx)
}
log.Infof("Removing %d set(s) from %s", len(ids), indexDir)
// Remove in reverse order to avoid renumbering issues
for i := len(ids) - 1; i >= 0; i-- {
if err := ksg.RemoveSetByID(ids[i]); err != nil {
return fmt.Errorf("failed to remove set %q: %w", ids[i], err)
}
log.Infof("Removed set %q", ids[i])
}
log.Infof("Index now has %d set(s)", ksg.Size())
return nil
}

View File

@@ -0,0 +1,148 @@
package obik
import (
"context"
"encoding/json"
"fmt"
"os"
"path/filepath"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"github.com/DavidGamba/go-getoptions"
"gopkg.in/yaml.v3"
)
type setSummary struct {
Index int `json:"index" yaml:"index"`
ID string `json:"id" yaml:"id"`
Count uint64 `json:"count" yaml:"count"`
DiskSize int64 `json:"disk_bytes" yaml:"disk_bytes"`
Metadata map[string]interface{} `json:"metadata,omitempty" yaml:"metadata,omitempty"`
}
type groupSummary struct {
Path string `json:"path" yaml:"path"`
ID string `json:"id,omitempty" yaml:"id,omitempty"`
K int `json:"k" yaml:"k"`
M int `json:"m" yaml:"m"`
Partitions int `json:"partitions" yaml:"partitions"`
TotalSets int `json:"total_sets" yaml:"total_sets"`
TotalKmers uint64 `json:"total_kmers" yaml:"total_kmers"`
TotalDisk int64 `json:"total_disk_bytes" yaml:"total_disk_bytes"`
Metadata map[string]interface{} `json:"metadata,omitempty" yaml:"metadata,omitempty"`
Sets []setSummary `json:"sets" yaml:"sets"`
Jaccard [][]float64 `json:"jaccard,omitempty" yaml:"jaccard,omitempty"`
}
func runSummary(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
if len(args) < 1 {
return fmt.Errorf("usage: obik summary [options] <index_directory>")
}
ksg, err := obikmer.OpenKmerSetGroup(args[0])
if err != nil {
return fmt.Errorf("failed to open kmer index: %w", err)
}
summary := groupSummary{
Path: ksg.Path(),
ID: ksg.Id(),
K: ksg.K(),
M: ksg.M(),
Partitions: ksg.Partitions(),
TotalSets: ksg.Size(),
TotalKmers: ksg.Len(),
Metadata: ksg.Metadata,
Sets: make([]setSummary, ksg.Size()),
}
var totalDisk int64
for i := 0; i < ksg.Size(); i++ {
diskSize := computeSetDiskSize(ksg, i)
totalDisk += diskSize
summary.Sets[i] = setSummary{
Index: i,
ID: ksg.SetIDOf(i),
Count: ksg.Len(i),
DiskSize: diskSize,
Metadata: ksg.AllSetMetadata(i),
}
}
summary.TotalDisk = totalDisk
// Jaccard matrix
if _jaccard && ksg.Size() > 1 {
dm := ksg.JaccardDistanceMatrix()
n := ksg.Size()
matrix := make([][]float64, n)
for i := 0; i < n; i++ {
matrix[i] = make([]float64, n)
for j := 0; j < n; j++ {
if i == j {
matrix[i][j] = 0
} else {
matrix[i][j] = dm.Get(i, j)
}
}
}
summary.Jaccard = matrix
}
format := CLIOutFormat()
switch format {
case "json":
return outputSummaryJSON(summary)
case "yaml":
return outputSummaryYAML(summary)
case "csv":
return outputSummaryCSV(summary)
default:
return outputSummaryJSON(summary)
}
}
func computeSetDiskSize(ksg *obikmer.KmerSetGroup, setIndex int) int64 {
var total int64
for p := 0; p < ksg.Partitions(); p++ {
path := ksg.PartitionPath(setIndex, p)
info, err := os.Stat(path)
if err != nil {
continue
}
total += info.Size()
}
// Also count the set directory entry itself
setDir := filepath.Join(ksg.Path(), fmt.Sprintf("set_%d", setIndex))
entries, err := os.ReadDir(setDir)
if err == nil {
// We already counted .kdi files above; this is just for completeness
_ = entries
}
return total
}
func outputSummaryJSON(summary groupSummary) error {
data, err := json.MarshalIndent(summary, "", " ")
if err != nil {
return err
}
fmt.Println(string(data))
return nil
}
func outputSummaryYAML(summary groupSummary) error {
data, err := yaml.Marshal(summary)
if err != nil {
return err
}
fmt.Print(string(data))
return nil
}
func outputSummaryCSV(summary groupSummary) error {
fmt.Println("index,id,count,disk_bytes")
for _, s := range summary.Sets {
fmt.Printf("%d,%s,%d,%d\n", s.Index, s.ID, s.Count, s.DiskSize)
}
return nil
}

View File

@@ -0,0 +1,64 @@
package obik
import (
"context"
"fmt"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/DavidGamba/go-getoptions"
)
// Super k-mer specific option variables.
// These reuse _kmerSize and _minimizerSize from options.go since
// only one subcommand runs at a time.
// SuperKmerOptionSet registers options specific to super k-mer extraction.
func SuperKmerOptionSet(options *getoptions.GetOpt) {
options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
options.Alias("k"),
options.Description("Size of k-mers (must be between m+1 and 31)."))
options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize,
options.Alias("m"),
options.Description("Size of minimizers (must be between 1 and k-1)."))
}
// runSuper implements the "obik super" subcommand.
// It extracts super k-mers from DNA sequences.
func runSuper(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
k := _kmerSize
m := _minimizerSize
if k < 2 || k > 31 {
return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)
}
if m < 1 || m >= k {
return fmt.Errorf("invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1)
}
log.Printf("Extracting super k-mers with k=%d, m=%d", k, m)
sequences, err := obiconvert.CLIReadBioSequences(args...)
if err != nil {
return fmt.Errorf("failed to open sequence files: %w", err)
}
worker := obikmer.SuperKmerWorker(k, m)
superkmers := sequences.MakeIWorker(
worker,
false,
obidefault.ParallelWorkers(),
)
obiconvert.CLIWriteBioSequences(superkmers, true)
obiutils.WaitForLastPipe()
return nil
}

View File

@@ -1,91 +0,0 @@
package obikindex
import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
)
// CLIBuildKmerIndex reads sequences from the iterator and builds a
// disk-based kmer index using the KmerSetGroupBuilder.
func CLIBuildKmerIndex(iterator obiiter.IBioSequence) {
// Validate output directory
outDir := CLIOutputDirectory()
if outDir == "" || outDir == "-" {
log.Fatalf("Error: --out option is required and must specify a directory path (not stdout)")
}
// Validate k-mer size
k := CLIKmerSize()
if k < 2 || k > 31 {
log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k)
}
// Resolve minimizer size
m := CLIMinimizerSize()
// Validate min-occurrence
minOcc := CLIMinOccurrence()
if minOcc < 1 {
log.Fatalf("Invalid min-occurrence: %d (must be >= 1)", minOcc)
}
log.Infof("Building kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
// Build options
var opts []obikmer.BuilderOption
if minOcc > 1 {
opts = append(opts, obikmer.WithMinFrequency(minOcc))
}
// Create builder (1 set, auto partitions)
builder, err := obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...)
if err != nil {
log.Fatalf("Failed to create kmer index builder: %v", err)
}
// Feed sequences
seqCount := 0
for iterator.Next() {
batch := iterator.Get()
for _, seq := range batch.Slice() {
builder.AddSequence(0, seq)
seqCount++
}
}
log.Infof("Processed %d sequences", seqCount)
// Finalize
ksg, err := builder.Close()
if err != nil {
log.Fatalf("Failed to finalize kmer index: %v", err)
}
// Apply metadata
applyMetadata(ksg)
if minOcc > 1 {
ksg.SetAttribute("min_occurrence", minOcc)
}
// Persist metadata updates
if err := ksg.SaveMetadata(); err != nil {
log.Fatalf("Failed to save metadata: %v", err)
}
log.Infof("Index contains %d k-mers in %s", ksg.Len(0), outDir)
log.Info("Done.")
}
// applyMetadata sets index-id and --set-tag metadata on a KmerSetGroup.
func applyMetadata(ksg *obikmer.KmerSetGroup) {
if id := CLIIndexId(); id != "" {
ksg.SetId(id)
}
for key, value := range CLISetTag() {
ksg.SetAttribute(key, value)
}
}

View File

@@ -1,332 +0,0 @@
```{r}
library(tidyverse)
```
```{r}
x <- sample(1:4096, 29, replace=TRUE)
```
```{r}
emax <- function(lseq,word_size) {
nword = lseq - word_size + 1
nalpha = 4^word_size
if (nalpha < nword) {
cov = nword %/% nalpha
remains = nword %% nalpha
f1 = cov/nword
f2 = (cov+1)/nword
print(c(nalpha - remains,f1,remains,f2))
e = -(nalpha - remains) * f1 * log(f1) -
remains * f2 * log(f2)
} else {
e = log(nword)
}
e
}
```
```{r}
ec <- function(data,kmer_size) {
table <- table(data)
s <- sum(table)
e <- sum(table * log(table))/s
ed <- log(s) - e
em <- emax(s+kmer_size-1,kmer_size)
ed/em
}
```
```{r}
ef <- function(data,kmer_size) {
table <- table(data)
s <- sum(table)
f <- table / s
f <- as.numeric(f)
f <- f[f > 0]
em <- emax(s+kmer_size-1,kmer_size)
ed <- -sum(f * log(f))
print(c(ed,em,ed/em))
ed/em
}
```
```{r}
okmer <- function(data,kmer_size) {
str_sub(data,1:(nchar(data)-kmer_size+1)) %>%
str_sub(1,kmer_size)
}
```
```{r}
# Normalisation circulaire: retourne le plus petit k-mer par rotation circulaire
normalize_circular <- function(kmer) {
if (nchar(kmer) == 0) return(kmer)
canonical <- kmer
n <- nchar(kmer)
# Tester toutes les rotations circulaires
for (i in 2:n) {
rotated <- paste0(str_sub(kmer, i, n), str_sub(kmer, 1, i-1))
if (rotated < canonical) {
canonical <- rotated
}
}
canonical
}
```
```{r}
# Fonction totient d'Euler: compte le nombre d'entiers de 1 à n coprimes avec n
euler_totient <- function(n) {
if (n <= 0) return(0)
result <- n
p <- 2
# Traiter tous les facteurs premiers
while (p * p <= n) {
if (n %% p == 0) {
# Retirer toutes les occurrences de p
while (n %% p == 0) {
n <- n %/% p
}
# Appliquer la formule: φ(n) = n * (1 - 1/p)
result <- result - result %/% p
}
p <- p + 1
}
# Si n est toujours > 1, alors c'est un facteur premier
if (n > 1) {
result <- result - result %/% n
}
result
}
```
```{r}
# Retourne tous les diviseurs de n
divisors <- function(n) {
if (n <= 0) return(integer(0))
divs <- c()
i <- 1
while (i * i <= n) {
if (n %% i == 0) {
divs <- c(divs, i)
if (i != n %/% i) {
divs <- c(divs, n %/% i)
}
}
i <- i + 1
}
sort(divs)
}
```
```{r}
# Compte le nombre de colliers (necklaces) distincts de longueur n
# sur un alphabet de taille a en utilisant la formule de Moreau:
# N(n, a) = (1/n) * Σ φ(d) * a^(n/d)
# où la somme est sur tous les diviseurs d de n, et φ est la fonction totient d'Euler
necklace_count <- function(n, alphabet_size) {
if (n <= 0) return(0)
divs <- divisors(n)
sum_val <- 0
for (d in divs) {
# Calculer alphabet_size^(n/d)
power <- alphabet_size^(n %/% d)
sum_val <- sum_val + euler_totient(d) * power
}
sum_val %/% n
}
```
```{r}
# Nombre de classes d'équivalence pour les k-mers normalisés
# Utilise la formule exacte de Moreau pour compter les colliers (necklaces)
n_normalized_kmers <- function(kmer_size) {
# Valeurs exactes pré-calculées pour k=1 à 6
if (kmer_size == 1) return(4)
if (kmer_size == 2) return(10)
if (kmer_size == 3) return(24)
if (kmer_size == 4) return(70)
if (kmer_size == 5) return(208)
if (kmer_size == 6) return(700)
# Pour k > 6, utiliser la formule de Moreau (exacte)
# Alphabet ADN a 4 bases
necklace_count(kmer_size, 4)
}
```
```{r}
# Entropie maximale pour k-mers normalisés
enmax <- function(lseq, word_size) {
nword = lseq - word_size + 1
nalpha = n_normalized_kmers(word_size)
if (nalpha < nword) {
cov = nword %/% nalpha
remains = nword %% nalpha
f1 = cov/nword
f2 = (cov+1)/nword
e = -(nalpha - remains) * f1 * log(f1) -
remains * f2 * log(f2)
} else {
e = log(nword)
}
e
}
```
```{r}
# Entropie normalisée avec normalisation circulaire des k-mers
ecn <- function(data, kmer_size) {
# Normaliser tous les k-mers
normalized_data <- sapply(data, normalize_circular)
# Calculer la table des fréquences
table <- table(normalized_data)
s <- sum(table)
e <- sum(table * log(table))/s
ed <- log(s) - e
# Entropie maximale avec normalisation
em <- enmax(s + kmer_size - 1, kmer_size)
ed/em
}
```
```{r}
k<-'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
ec(okmer(k,1),1)
ec(okmer(k,2),2)
ec(okmer(k,3),3)
ec(okmer(k,4),4)
```
```{r}
k<-'atatatatatatatatatatatatatatata'
ef(okmer(k,1),1)
ef(okmer(k,2),2)
ef(okmer(k,3),3)
ef(okmer(k,4),4)
```
```{r}
k<-'aaaaaaaaaaaaaaaattttttttttttttt'
ef(okmer(k,1),1)
ef(okmer(k,2),2)
ef(okmer(k,3),3)
ef(okmer(k,4),4)
```
```{r}
k<-'atgatgatgatgatgatgatgatgatgatga'
ef(okmer(k,1),1)
ef(okmer(k,2),2)
ef(okmer(k,3),3)
ef(okmer(k,4),4)
```
```{r}
k<-'atcgatcgatcgatcgatcgatcgatcgact'
ecn(okmer(k,1),1)
ecn(okmer(k,2),2)
ecn(okmer(k,3),3)
ecn(okmer(k,4),4)
```
```{r}
k<-paste(sample(rep(c("a","c","g","t"),8),31),collapse="")
k <- "actatggcaagtcgtaaccgcgcttatcagg"
ecn(okmer(k,1),1)
ecn(okmer(k,2),2)
ecn(okmer(k,3),3)
ecn(okmer(k,4),4)
```
aattaaaaaaacaagataaaataatattttt
```{r}
k<-'aattaaaaaaacaagataaaataatattttt'
ecn(okmer(k,1),1)
ecn(okmer(k,2),2)
ecn(okmer(k,3),3)
ecn(okmer(k,4),4)
```
atg tga gat ,,,,
cat tca atc
tgatgatgatgatgatgatgatgatgatg
## Tests de normalisation circulaire
```{r}
# Test de la fonction de normalisation
normalize_circular("ca") # devrait donner "ac"
normalize_circular("tgca") # devrait donner "atgc"
normalize_circular("acgt") # devrait donner "acgt"
```
```{r}
# Comparaison ec vs ecn sur une séquence répétitive
# Les k-mers "atg", "tga", "gat" sont équivalents par rotation
k <- 'atgatgatgatgatgatgatgatgatgatga'
cat("Séquence:", k, "\n")
cat("ec(k,3) =", ec(okmer(k,3),3), "\n")
cat("ecn(k,3) =", ecn(okmer(k,3),3), "\n")
```
```{r}
# Comparaison sur séquence aléatoire
k <- "actatggcaagtcgtaaccgcgcttatcagg"
cat("Séquence:", k, "\n")
cat("Sans normalisation:\n")
cat(" ec(k,2) =", ec(okmer(k,2),2), "\n")
cat(" ec(k,3) =", ec(okmer(k,3),3), "\n")
cat(" ec(k,4) =", ec(okmer(k,4),4), "\n")
cat("Avec normalisation circulaire:\n")
cat(" ecn(k,2) =", ecn(okmer(k,2),2), "\n")
cat(" ecn(k,3) =", ecn(okmer(k,3),3), "\n")
cat(" ecn(k,4) =", ecn(okmer(k,4),4), "\n")
```
```{r}
sequence <- "ttcatcactcagcaatcctgaatgatGAGAGCTTTTTTTTTTTATATATATATATATGTATATGTATGAAATACACTtatgctccgtttgtttcgccgtaa"
re <- rev(c(0.8108602271901116,0.8108602271901116,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.7800272339058549,0.7800272339058549,0.7751610144606091,0.7751610144606091,0.7751610144606091,0.764858185548322,0.7325526601302021,0.7137620699527615,0.6789199521982864,0.6584536373623372,0.634002687184193,0.6075290415873623,0.5785545803330997,0.5785545803330997,0.5503220289212184,0.5315314387437778,0.4966893209893028,0.46077361820145696,0.42388221293245526,0.4009547969713408,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.390029016052137,0.42781461756157363,0.45192285937059073,0.47238917420654,0.47238917420654,0.47238917420654,0.5092805794755417,0.5451962822633876,0.5800384000178626,0.602395141014297,0.6046146614886381,0.6046146614886381,0.6119084258128231,0.6119084258128231,0.6214217106113492,0.6424704346756562,0.6482381543085467,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6828444721058894,0.6950205907027562,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.7208976112999935))
di <- c(0.7208976112999935,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6950205907027562,0.6828444721058894,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6482381543085467,0.6424704346756562,0.6214217106113492,0.6119084258128231,0.6119084258128231,0.6046146614886382,0.6046146614886382,0.6023951410142971,0.5800384000178627,0.5451962822633876,0.5092805794755418,0.47238917420654003,0.47238917420654003,0.47238917420654003,0.4519228593705908,0.4278146175615737,0.39002901605213713,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677883,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.40095479697134073,0.42388221293245526,0.46077361820145696,0.4966893209893028,0.5315314387437778,0.5503220289212184,0.5785545803330997,0.5785545803330997,0.6075290415873625,0.6340026871841933,0.6584536373623374,0.6789199521982866,0.7137620699527616,0.7325526601302023,0.7648581855483221,0.7751610144606093,0.7751610144606093,0.7751610144606093,0.7800272339058549,0.7800272339058549,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8108602271901116,0.8108602271901116)
ebidir <- tibble(direct=di,reverse=re) %>%
mutate(position = 1:length(re),
nucleotide = str_sub(sequence,position,position))
ebidir %>%
ggplot(aes(x=position,y=direct)) +
geom_line() +
scale_x_continuous(breaks = ebidir$position, labels = ebidir$nucleotide) +
ylim(0,1)+
geom_hline(yintercept=0.5, col = "red", linetype = "dashed")
```

View File

@@ -1,40 +0,0 @@
package obilowmask
import (
"testing"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
func TestLowMaskWorker(t *testing.T) {
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
seq := obiseq.NewBioSequence("test", []byte("acgtacgtacgtacgtacgtacgtacgtacgt"), "test")
result, err := worker(seq)
if err != nil {
t.Fatalf("Worker failed: %v", err)
}
if result.Len() != 1 {
t.Fatalf("Expected 1 sequence, got %d", result.Len())
}
resultSeq := result[0]
if resultSeq.Len() != 32 {
t.Fatalf("Expected sequence length 32, got %d", resultSeq.Len())
}
}
func TestLowMaskWorkerWithAmbiguity(t *testing.T) {
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
seq := obiseq.NewBioSequence("test", []byte("acgtNcgtacgtacgtacgtacgtacgtacgt"), "test")
result, err := worker(seq)
if err != nil {
t.Fatalf("Worker failed: %v", err)
}
if result.Len() != 1 {
t.Fatalf("Expected 1 sequence, got %d", result.Len())
}
}

View File

@@ -1,81 +0,0 @@
package obilowmask
import (
"strings"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
log "github.com/sirupsen/logrus"
)
var __kmer_size__ = 31
var __level_max__ = 6
var __threshold__ = 0.5
var __split_mode__ = false
var __low_mode__ = false
var __mask__ = "."
func LowMaskOptionSet(options *getoptions.GetOpt) {
options.IntVar(&__kmer_size__, "kmer-size", __kmer_size__,
options.Description("Size of the kmer considered to estimate entropy."),
)
options.IntVar(&__level_max__, "entropy_size", __level_max__,
options.Description("Maximum word size considered for entropy estimate"),
)
options.Float64Var(&__threshold__, "threshold", __threshold__,
options.Description("entropy theshold used to mask a kmer"),
)
options.BoolVar(&__split_mode__, "split-mode", __split_mode__,
options.Description("in split mode, input sequences are splitted to remove masked regions"),
)
options.BoolVar(&__low_mode__, "low-mode", __low_mode__,
options.Description("in split mode, input sequences are splitted to remove masked regions"),
)
options.StringVar(&__mask__, "masking-char", __mask__,
options.Description("Character used to mask low complexity region"),
)
}
func OptionSet(options *getoptions.GetOpt) {
LowMaskOptionSet(options)
obiconvert.InputOptionSet(options)
obiconvert.OutputOptionSet(options)
}
func CLIKmerSize() int {
return __kmer_size__
}
func CLILevelMax() int {
return __level_max__
}
func CLIThreshold() float64 {
return __threshold__
}
func CLIMaskingMode() MaskingMode {
switch {
case __low_mode__:
return Extract
case __split_mode__:
return Split
default:
return Mask
}
}
func CLIMaskingChar() byte {
mask := strings.TrimSpace(__mask__)
if len(mask) != 1 {
log.Fatalf("--masking-char option accept a single character, not %s", mask)
}
return []byte(mask)[0]
}

View File

@@ -1,10 +0,0 @@
// obisuperkmer function utility package.
//
// The obitools/obisuperkmer package contains every
// function specifically required by the obisuperkmer utility.
//
// The obisuperkmer command extracts super k-mers from DNA sequences.
// A super k-mer is a maximal subsequence where all consecutive k-mers
// share the same minimizer. This decomposition is useful for efficient
// k-mer indexing and analysis in bioinformatics applications.
package obisuperkmer

View File

@@ -1,69 +0,0 @@
package obisuperkmer
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Private variables for storing option values
var _KmerSize = 31
var _MinimizerSize = 13
// SuperKmerOptionSet defines every option related to super k-mer extraction.
//
// The function adds to a CLI every option proposed to the user
// to tune the parameters of the super k-mer extraction algorithm.
//
// Parameters:
// - options: is a pointer to a getoptions.GetOpt instance normally
// produced by the obioptions.GenerateOptionParser function.
func SuperKmerOptionSet(options *getoptions.GetOpt) {
options.IntVar(&_KmerSize, "kmer-size", _KmerSize,
options.Alias("k"),
options.Description("Size of k-mers (must be between m+1 and 31)."))
options.IntVar(&_MinimizerSize, "minimizer-size", _MinimizerSize,
options.Alias("m"),
options.Description("Size of minimizers (must be between 1 and k-1)."))
}
// OptionSet adds to the basic option set every option declared for
// the obisuperkmer command.
//
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(false)(options)
SuperKmerOptionSet(options)
}
// CLIKmerSize returns the k-mer size to use for super k-mer extraction.
//
// It does not take any parameters.
// It returns an integer representing the k-mer size.
func CLIKmerSize() int {
return _KmerSize
}
// SetKmerSize sets the k-mer size for super k-mer extraction.
//
// Parameters:
// - k: the k-mer size (must be between m+1 and 31).
func SetKmerSize(k int) {
_KmerSize = k
}
// CLIMinimizerSize returns the minimizer size to use for super k-mer extraction.
//
// It does not take any parameters.
// It returns an integer representing the minimizer size.
func CLIMinimizerSize() int {
return _MinimizerSize
}
// SetMinimizerSize sets the minimizer size for super k-mer extraction.
//
// Parameters:
// - m: the minimizer size (must be between 1 and k-1).
func SetMinimizerSize(m int) {
_MinimizerSize = m
}

View File

@@ -1,59 +0,0 @@
package obisuperkmer
import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
)
// CLIExtractSuperKmers extracts super k-mers from an iterator of BioSequences.
//
// This function takes an iterator of BioSequence objects, extracts super k-mers
// from each sequence using the k-mer and minimizer sizes specified by CLI options,
// and returns a new iterator yielding the extracted super k-mers as BioSequence objects.
//
// Each super k-mer is a maximal subsequence where all consecutive k-mers share
// the same minimizer. The resulting BioSequences contain metadata including:
// - minimizer_value: the canonical minimizer value
// - minimizer_seq: the DNA sequence of the minimizer
// - k: the k-mer size used
// - m: the minimizer size used
// - start: starting position in the original sequence
// - end: ending position in the original sequence
// - parent_id: ID of the parent sequence
//
// Parameters:
// - iterator: an iterator yielding BioSequence objects to process.
//
// Returns:
// - An iterator yielding BioSequence objects representing super k-mers.
func CLIExtractSuperKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence {
// Get k-mer and minimizer sizes from CLI options
k := CLIKmerSize()
m := CLIMinimizerSize()
// Validate parameters
if m < 1 || m >= k {
log.Fatalf("Invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1)
}
if k < 2 || k > 31 {
log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k)
}
log.Printf("Extracting super k-mers with k=%d, m=%d", k, m)
// Create the worker for super k-mer extraction
worker := obikmer.SuperKmerWorker(k, m)
// Apply the worker to the iterator with parallel processing
newIter := iterator.MakeIWorker(
worker,
false, // don't merge results
obidefault.ParallelWorkers(),
)
return newIter
}

View File

@@ -1,149 +0,0 @@
package obisuperkmer
import (
"testing"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
func TestCLIExtractSuperKmers(t *testing.T) {
// Create a test sequence
testSeq := obiseq.NewBioSequence(
"test_seq",
[]byte("ACGTACGTACGTACGTACGTACGTACGTACGT"),
"",
)
// Create a batch with the test sequence
batch := obiseq.NewBioSequenceBatch()
batch.Add(testSeq)
// Create an iterator from the batch
iterator := obiiter.MakeBioSequenceBatchChannel(1)
go func() {
iterator.Push(batch)
iterator.Close()
}()
// Set test parameters
SetKmerSize(15)
SetMinimizerSize(7)
// Extract super k-mers
result := CLIExtractSuperKmers(iterator)
// Count the number of super k-mers
count := 0
for result.Next() {
batch := result.Get()
for _, sk := range batch.Slice() {
count++
// Verify that the super k-mer has the expected attributes
if !sk.HasAttribute("minimizer_value") {
t.Error("Super k-mer missing 'minimizer_value' attribute")
}
if !sk.HasAttribute("minimizer_seq") {
t.Error("Super k-mer missing 'minimizer_seq' attribute")
}
if !sk.HasAttribute("k") {
t.Error("Super k-mer missing 'k' attribute")
}
if !sk.HasAttribute("m") {
t.Error("Super k-mer missing 'm' attribute")
}
if !sk.HasAttribute("start") {
t.Error("Super k-mer missing 'start' attribute")
}
if !sk.HasAttribute("end") {
t.Error("Super k-mer missing 'end' attribute")
}
if !sk.HasAttribute("parent_id") {
t.Error("Super k-mer missing 'parent_id' attribute")
}
// Verify attribute values
k, _ := sk.GetIntAttribute("k")
m, _ := sk.GetIntAttribute("m")
if k != 15 {
t.Errorf("Expected k=15, got k=%d", k)
}
if m != 7 {
t.Errorf("Expected m=7, got m=%d", m)
}
parentID, _ := sk.GetStringAttribute("parent_id")
if parentID != "test_seq" {
t.Errorf("Expected parent_id='test_seq', got '%s'", parentID)
}
}
}
if count == 0 {
t.Error("No super k-mers were extracted")
}
t.Logf("Extracted %d super k-mers from test sequence", count)
}
func TestOptionGettersAndSetters(t *testing.T) {
// Test initial values
if CLIKmerSize() != 21 {
t.Errorf("Expected default k-mer size 21, got %d", CLIKmerSize())
}
if CLIMinimizerSize() != 11 {
t.Errorf("Expected default minimizer size 11, got %d", CLIMinimizerSize())
}
// Test setters
SetKmerSize(25)
SetMinimizerSize(13)
if CLIKmerSize() != 25 {
t.Errorf("SetKmerSize failed: expected 25, got %d", CLIKmerSize())
}
if CLIMinimizerSize() != 13 {
t.Errorf("SetMinimizerSize failed: expected 13, got %d", CLIMinimizerSize())
}
// Reset to defaults
SetKmerSize(21)
SetMinimizerSize(11)
}
func BenchmarkCLIExtractSuperKmers(b *testing.B) {
// Create a longer test sequence
longSeq := make([]byte, 1000)
bases := []byte{'A', 'C', 'G', 'T'}
for i := range longSeq {
longSeq[i] = bases[i%4]
}
testSeq := obiseq.NewBioSequence("bench_seq", longSeq, "")
// Set parameters
SetKmerSize(21)
SetMinimizerSize(11)
b.ResetTimer()
for i := 0; i < b.N; i++ {
batch := obiseq.NewBioSequenceBatch()
batch.Add(testSeq)
iterator := obiiter.MakeBioSequenceBatchChannel(1)
go func() {
iterator.Push(batch)
iterator.Close()
}()
result := CLIExtractSuperKmers(iterator)
// Consume the iterator
for result.Next() {
result.Get()
}
}
}